Index: include/Makefile =================================================================== --- include/Makefile +++ include/Makefile @@ -54,6 +54,7 @@ geom/mirror geom/mountver geom/multipath geom/nop \ geom/raid geom/raid3 geom/shsec geom/stripe geom/virstor \ net/altq \ + net/route \ netgraph/atm netgraph/netflow \ netinet/cc \ netinet/netdump \ Index: lib/libc/gen/sysctl.3 =================================================================== --- lib/libc/gen/sysctl.3 +++ lib/libc/gen/sysctl.3 @@ -563,6 +563,8 @@ .It Dv NET_RT_IFLIST Ta 0 or if_index Ta None .It Dv NET_RT_IFMALIST Ta 0 or if_index Ta None .It Dv NET_RT_IFLISTL Ta 0 or if_index Ta None +.It Dv NET_RT_NHOPS Ta None Ta fib number +.It Dv NET_RT_NHGROUPS Ta None Ta fib number .El .Pp The @@ -583,6 +585,12 @@ .Va struct if_msghdrl and .Va struct ifa_msghdrl . +.Pp +.Dv NET_RT_NHOPS +returns all nexthops for specified address family in given fib. +.Pp +.Dv NET_RT_NHGROUPS +returns all multipath groups for specified address family in given fib. .It Li PF_INET Get or set various global information about the IPv4 (Internet Protocol version 4). Index: sys/amd64/conf/GENERIC =================================================================== --- sys/amd64/conf/GENERIC +++ sys/amd64/conf/GENERIC @@ -31,6 +31,7 @@ options INET # InterNETworking options INET6 # IPv6 communications protocols options IPSEC_SUPPORT # Allow kldload of ipsec and tcpmd5 +options ROUTE_MPATH # Enable multipath routing options TCP_OFFLOAD # TCP offload options TCP_BLACKBOX # Enhanced TCP event logging options TCP_HHOOK # hhook(9) framework for TCP Index: sys/conf/NOTES =================================================================== --- sys/conf/NOTES +++ sys/conf/NOTES @@ -986,7 +986,7 @@ # # TCP_HHOOK enables the hhook(9) framework hooks for the TCP stack. # -# RADIX_MPATH provides support for equal-cost multi-path routing. +# ROUTE_MPATH provides support for multi-path routing. 
# options MROUTING # Multicast routing options IPFIREWALL #firewall Index: sys/conf/files =================================================================== --- sys/conf/files +++ sys/conf/files @@ -4089,16 +4089,23 @@ net/debugnet_inet.c optional inet debugnet net/pfil.c optional ether | inet net/radix.c standard -net/radix_mpath.c standard net/raw_cb.c standard net/raw_usrreq.c standard net/route.c standard net/route_temporal.c standard +net/route/mpath_ctl.c optional route_mpath +net/route/route_ctl.c standard +net/route/route_helpers.c standard +net/route/nhop.c standard +net/route/nhop_ctl.c standard +net/route/nhop_utils.c standard +net/route/nhgrp.c optional route_mpath +net/route/nhgrp_ctl.c optional route_mpath net/rss_config.c optional inet rss | inet6 rss net/rtsock.c standard net/slcompress.c optional netgraph_vjc | sppp | \ netgraph_sppp -net/toeplitz.c optional inet rss | inet6 rss +net/toeplitz.c optional inet rss | inet6 rss | route_mpath net/vnet.c optional vimage net80211/ieee80211.c optional wlan net80211/ieee80211_acl.c optional wlan wlan_acl Index: sys/conf/options =================================================================== --- sys/conf/options +++ sys/conf/options @@ -453,6 +453,7 @@ PCBGROUP opt_pcbgroup.h PF_DEFAULT_TO_DROP opt_pf.h RADIX_MPATH opt_mpath.h +ROUTE_MPATH opt_route_mpath.h ROUTETABLES opt_route.h RSS opt_rss.h SLIP_IFF_OPTS opt_slip.h Index: sys/dev/cxgbe/tom/t4_connect.c =================================================================== --- sys/dev/cxgbe/tom/t4_connect.c +++ sys/dev/cxgbe/tom/t4_connect.c @@ -49,6 +49,7 @@ #include #include #include +#include #include #include #include @@ -224,13 +225,13 @@ * rtalloc1, RT_UNLOCK on rt. 
*/ int -t4_connect(struct toedev *tod, struct socket *so, struct rtentry *rt, +t4_connect(struct toedev *tod, struct socket *so, struct nhop_object *nh, struct sockaddr *nam) { struct adapter *sc = tod->tod_softc; struct toepcb *toep = NULL; struct wrqe *wr = NULL; - struct ifnet *rt_ifp = rt->rt_ifp; + struct ifnet *rt_ifp = nh->nh_ifp; struct vi_info *vi; int qid_atid, rc, isipv6; struct inpcb *inp = sotoinpcb(so); @@ -275,7 +276,7 @@ DONT_OFFLOAD_ACTIVE_OPEN(ENOMEM); toep->l2te = t4_l2t_get(vi->pi, rt_ifp, - rt->rt_flags & RTF_GATEWAY ? rt->rt_gateway : nam); + nh->nh_flags & NHF_GATEWAY ? &nh->gw_sa : nam); if (toep->l2te == NULL) DONT_OFFLOAD_ACTIVE_OPEN(ENOMEM); Index: sys/dev/cxgbe/tom/t4_tom.h =================================================================== --- sys/dev/cxgbe/tom/t4_tom.h +++ sys/dev/cxgbe/tom/t4_tom.h @@ -369,7 +369,7 @@ /* t4_connect.c */ void t4_init_connect_cpl_handlers(void); void t4_uninit_connect_cpl_handlers(void); -int t4_connect(struct toedev *, struct socket *, struct rtentry *, +int t4_connect(struct toedev *, struct socket *, struct nhop_object *, struct sockaddr *); void act_open_failure_cleanup(struct adapter *, u_int, u_int); Index: sys/fs/nfsclient/nfs_clvfsops.c =================================================================== --- sys/fs/nfsclient/nfs_clvfsops.c +++ sys/fs/nfsclient/nfs_clvfsops.c @@ -473,10 +473,9 @@ sin.sin_len = sizeof(sin); /* XXX MRT use table 0 for this sort of thing */ CURVNET_SET(TD_TO_VNET(td)); - error = rtrequest_fib(RTM_ADD, (struct sockaddr *)&sin, - (struct sockaddr *)&nd->mygateway, - (struct sockaddr *)&mask, - RTF_UP | RTF_GATEWAY, NULL, RT_DEFAULT_FIB); + error = rib_request_simple(RIB_ADD, RT_DEFAULT_FIB, + (struct sockaddr *)&sin, (struct sockaddr *)&mask, + (struct sockaddr *)&nd->mygateway, RTF_UP | RTF_GATEWAY); CURVNET_RESTORE(); if (error) panic("nfs_mountroot: RTM_ADD: %d", error); Index: sys/modules/tests/Makefile 
=================================================================== --- sys/modules/tests/Makefile +++ sys/modules/tests/Makefile @@ -3,6 +3,7 @@ SUBDIR+= framework SUBDIR+= .WAIT SUBDIR+= callout_test +SUBDIR+= routing SUBDIR_PARALLEL= Index: sys/modules/tests/routing/Makefile =================================================================== --- /dev/null +++ sys/modules/tests/routing/Makefile @@ -0,0 +1,15 @@ +# +# $FreeBSD$ +# + +.PATH: ${SRCTOP}/sys/tests/routing + +KMOD= routing_test +SRCS= module.c test_route_ctl.c + +# +# Enable full debugging +# +CFLAGS += -g -O0 + +.include Index: sys/net/debugnet.c =================================================================== --- sys/net/debugnet.c +++ sys/net/debugnet.c @@ -57,6 +57,8 @@ #include #include #include +#include +#include #include #include #include @@ -644,7 +646,8 @@ if (pcb->dp_client == INADDR_ANY || pcb->dp_gateway == INADDR_ANY || pcb->dp_ifp == NULL) { struct sockaddr_in dest_sin, *gw_sin, *local_sin; - struct rtentry *dest_rt; + struct nhop_object *dest_nh; + struct in_addr dest_addr; struct ifnet *rt_ifp; memset(&dest_sin, 0, sizeof(dest_sin)); @@ -653,31 +656,32 @@ .sin_family = AF_INET, .sin_addr.s_addr = pcb->dp_server, }; + dest_addr.s_addr = pcb->dp_server; CURVNET_SET(vnet0); - dest_rt = rtalloc1((struct sockaddr *)&dest_sin, 0, - RTF_RNH_LOCKED); + dest_nh = fib4_lookup_nh_ptr(RT_DEFAULT_FIB, dest_addr, 0, + NHR_REF, 0); CURVNET_RESTORE(); - if (dest_rt == NULL) { + if (dest_nh == NULL) { printf("%s: Could not get route for that server.\n", __func__); error = ENOENT; goto cleanup; } - if (dest_rt->rt_gateway->sa_family == AF_INET) - gw_sin = (struct sockaddr_in *)dest_rt->rt_gateway; + if (dest_nh->gw4_sa.sin_family == AF_INET) + gw_sin = &dest_nh->gw4_sa; else { - if (dest_rt->rt_gateway->sa_family == AF_LINK) + if (dest_nh->gw4_sa.sin_family == AF_LINK) DNETDEBUG("Destination address is on link.\n"); gw_sin = NULL; } - MPASS(dest_rt->rt_ifa->ifa_addr->sa_family == AF_INET); - 
local_sin = (struct sockaddr_in *)dest_rt->rt_ifa->ifa_addr; + MPASS(dest_nh->nh_ifa->ifa_addr->sa_family == AF_INET); + local_sin = (struct sockaddr_in *)dest_nh->nh_ifa->ifa_addr; - rt_ifp = dest_rt->rt_ifp; + rt_ifp = dest_nh->nh_ifp; if (pcb->dp_client == INADDR_ANY) pcb->dp_client = local_sin->sin_addr.s_addr; @@ -686,7 +690,7 @@ if (pcb->dp_ifp == NULL) pcb->dp_ifp = rt_ifp; - RTFREE_LOCKED(dest_rt); + NH_FREE(dest_nh); } ifp = pcb->dp_ifp; Index: sys/net/if.c =================================================================== --- sys/net/if.c +++ sys/net/if.c @@ -1851,6 +1851,7 @@ struct sockaddr_dl null_sdl; struct ifnet *ifp; struct ifaddr *rti_ifa = NULL; + struct rib_cmd_info rc; ifp = ifa->ifa_ifp; @@ -1873,7 +1874,9 @@ info.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&null_sdl; link_init_sdl(ifp, (struct sockaddr *)&null_sdl, ifp->if_type); - error = rtrequest1_fib(cmd, &info, NULL, ifp->if_fib); + NET_EPOCH_ENTER(et); + error = rib_request(cmd, ifp->if_fib, &info, &rc); + NET_EPOCH_EXIT(et); if (rti_ifa != NULL) ifa_free(rti_ifa); Index: sys/net/if_var.h =================================================================== --- sys/net/if_var.h +++ sys/net/if_var.h @@ -61,6 +61,7 @@ */ struct rtentry; /* ifa_rtrequest */ +struct nhop_object; /* ifa_rtrequest */ struct rt_addrinfo; /* ifa_rtrequest */ struct socket; struct carp_if; @@ -551,7 +552,7 @@ struct carp_softc *ifa_carp; /* pointer to CARP data */ CK_STAILQ_ENTRY(ifaddr) ifa_link; /* queue macro glue */ void (*ifa_rtrequest) /* check or clean routes (+ or -)'d */ - (int, struct rtentry *, struct rt_addrinfo *); + (int, struct rtentry *, struct nhop_object *, struct rt_addrinfo *); u_short ifa_flags; /* mostly rt_flags for cloning */ #define IFA_ROUTE RTF_UP /* route installed */ #define IFA_RTSELF RTF_HOST /* loopback route to self installed */ Index: sys/net/route.h =================================================================== --- sys/net/route.h +++ sys/net/route.h @@ -35,7 +35,6 @@ 
#ifndef _NET_ROUTE_H_ #define _NET_ROUTE_H_ -#include #include /* @@ -45,13 +44,14 @@ * are set by making entries for all directly connected interfaces. */ +struct nhop_object; /* * Struct route consiste of a destination address, * a route entry pointer, link-layer prepend data pointer along * with its length. */ struct route { - struct rtentry *ro_rt; + struct nhop_object *ro_nh; struct llentry *ro_lle; /* * ro_prepend and ro_plen are only used for bpf to pass in a @@ -118,6 +118,10 @@ extern u_int rt_numfibs; /* number of usable routing tables */ VNET_DECLARE(u_int, rt_add_addr_allfibs); /* Announce interfaces to all fibs */ #define V_rt_add_addr_allfibs VNET(rt_add_addr_allfibs) + +/* Calculate flowid for locally-originated packets */ +#define V_fib_hash_outbound VNET(fib_hash_outbound) +VNET_DECLARE(u_int, fib_hash_outbound); #endif /* @@ -128,41 +132,7 @@ * gateways are marked so that the output routines know to address the * gateway rather than the ultimate destination. */ -#ifndef RNF_NORMAL -#include -#ifdef RADIX_MPATH -#include -#endif -#endif -#if defined(_KERNEL) -struct rtentry { - struct radix_node rt_nodes[2]; /* tree glue, and other values */ - /* - * XXX struct rtentry must begin with a struct radix_node (or two!) 
- * because the code does some casts of a 'struct radix_node *' - * to a 'struct rtentry *' - */ -#define rt_key(r) (*((struct sockaddr **)(&(r)->rt_nodes->rn_key))) -#define rt_mask(r) (*((struct sockaddr **)(&(r)->rt_nodes->rn_mask))) -#define rt_key_const(r) (*((const struct sockaddr * const *)(&(r)->rt_nodes->rn_key))) -#define rt_mask_const(r) (*((const struct sockaddr * const *)(&(r)->rt_nodes->rn_mask))) - struct sockaddr *rt_gateway; /* value */ - struct ifnet *rt_ifp; /* the answer: interface to use */ - struct ifaddr *rt_ifa; /* the answer: interface address to use */ - int rt_flags; /* up/down?, host/net */ - int rt_refcnt; /* # held references */ - u_int rt_fibnum; /* which FIB */ - u_long rt_mtu; /* MTU for this path */ - u_long rt_weight; /* absolute weight */ - u_long rt_expire; /* lifetime for route, e.g. redirect */ -#define rt_endzero rt_pksent - counter_u64_t rt_pksent; /* packets sent using this route */ - struct mtx rt_mtx; /* mutex for routing entry */ - struct rtentry *rt_chain; /* pointer to next rtentry to delete */ -}; -#endif /* _KERNEL */ - #define RTF_UP 0x1 /* route usable */ #define RTF_GATEWAY 0x2 /* destination is a gateway */ #define RTF_HOST 0x4 /* host entry (net otherwise) */ @@ -197,15 +167,17 @@ with existing routing apps */ /* Mask of RTF flags that are allowed to be modified by RTM_CHANGE. */ -#define RTF_FMASK \ - (RTF_PROTO1 | RTF_PROTO2 | RTF_PROTO3 | RTF_BLACKHOLE | \ - RTF_REJECT | RTF_STATIC | RTF_STICKY) +#define RIB_RTE_CHANGE_MASK (RTF_GATEWAY | RTF_REJECT | RTF_DYNAMIC | \ + RTF_STATIC | RTF_BLACKHOLE | RTF_PROTO1 | RTF_PROTO2 | RTF_PROTO3 | \ + RTF_FIXEDMTU) /* * fib_ nexthop API flags. 
*/ /* Consumer-visible nexthop info flags */ +#define NHF_INVALID 0x0001 /* Nexthop is not usable */ +#define NHF_MULTIPATH 0x0008 /* Nexthop is a multipath group */ #define NHF_REJECT 0x0010 /* RTF_REJECT */ #define NHF_BLACKHOLE 0x0020 /* RTF_BLACKHOLE */ #define NHF_REDIRECT 0x0040 /* RTF_DYNAMIC|RTF_MODIFIED */ @@ -215,27 +187,16 @@ #define NHF_HOST 0x0400 /* RTF_HOST */ /* Nexthop request flags */ +#define NHR_NONE 0x00 /* empty flags field */ #define NHR_IFAIF 0x01 /* Return ifa_ifp interface */ #define NHR_REF 0x02 /* For future use */ +/* uRPF */ +#define NHR_NODEFAULT 0x04 /* do not consider default route */ + /* Control plane route request flags */ #define NHR_COPY 0x100 /* Copy rte data */ -#ifdef _KERNEL -/* rte<>ro_flags translation */ -static inline void -rt_update_ro_flags(struct route *ro) -{ - int rt_flags = ro->ro_rt->rt_flags; - - ro->ro_flags &= ~ (RT_REJECT|RT_BLACKHOLE|RT_HAS_GW); - - ro->ro_flags |= (rt_flags & RTF_REJECT) ? RT_REJECT : 0; - ro->ro_flags |= (rt_flags & RTF_BLACKHOLE) ? RT_BLACKHOLE : 0; - ro->ro_flags |= (rt_flags & RTF_GATEWAY) ? RT_HAS_GW : 0; -} -#endif - /* * Routing statistics. 
*/ @@ -245,6 +206,15 @@ uint64_t rts_newgateway; /* routes modified by redirects */ uint64_t rts_unreach; /* lookups which failed */ uint64_t rts_wildcard; /* lookups satisfied by a wildcard */ + uint64_t rts_add_success; /* number of routes added */ + uint64_t rts_add_algo_fail; /* failures to add a routing entry */ + uint64_t rts_add_pinned; /* number of pinned routes added */ + uint64_t rts_add_retry; /* number of add retries */ + uint64_t rts_mpath_ineligible; /* number of ineligible mpath add failures */ + uint64_t rts_del_fail_priority; /* # of delete failures due to priority */ + uint64_t rts_del_algo_fail; /* # of algorithm failures to delete an entry */ + uint64_t rts_del_success; /* number of successful deletes */ + uint64_t rts_del_retry; /* number of delete retries */ }; /* @@ -338,7 +308,8 @@ #define RTAX_BRD 7 /* for NEWADDR, broadcast or p-p dest addr */ #define RTAX_MAX 8 /* size of array to allocate */ -typedef int rt_filter_f_t(const struct rtentry *, void *); +struct rtentry; +typedef int rt_filter_f_t(const struct rtentry *, const struct nhop_object *, void *); struct rt_addrinfo { int rti_addrs; /* Route RTF_ flags */ @@ -373,58 +344,22 @@ #define RT_LINK_IS_UP(ifp) (!((ifp)->if_capabilities & IFCAP_LINKSTATE) \ || (ifp)->if_link_state == LINK_STATE_UP) -#define RT_LOCK_INIT(_rt) \ - mtx_init(&(_rt)->rt_mtx, "rtentry", NULL, MTX_DEF | MTX_DUPOK | MTX_NEW) -#define RT_LOCK(_rt) mtx_lock(&(_rt)->rt_mtx) -#define RT_UNLOCK(_rt) mtx_unlock(&(_rt)->rt_mtx) -#define RT_LOCK_DESTROY(_rt) mtx_destroy(&(_rt)->rt_mtx) -#define RT_LOCK_ASSERT(_rt) mtx_assert(&(_rt)->rt_mtx, MA_OWNED) -#define RT_UNLOCK_COND(_rt) do { \ - if (mtx_owned(&(_rt)->rt_mtx)) \ - mtx_unlock(&(_rt)->rt_mtx); \ -} while (0) - -#define RT_ADDREF(_rt) do { \ - RT_LOCK_ASSERT(_rt); \ - KASSERT((_rt)->rt_refcnt >= 0, \ - ("negative refcnt %d", (_rt)->rt_refcnt)); \ - (_rt)->rt_refcnt++; \ -} while (0) - -#define RT_REMREF(_rt) do { \ - RT_LOCK_ASSERT(_rt); \ - KASSERT((_rt)->rt_refcnt > 
0, \ - ("bogus refcnt %d", (_rt)->rt_refcnt)); \ - (_rt)->rt_refcnt--; \ -} while (0) - -#define RTFREE_LOCKED(_rt) do { \ - if ((_rt)->rt_refcnt <= 1) \ - rtfree(_rt); \ - else { \ - RT_REMREF(_rt); \ - RT_UNLOCK(_rt); \ +#define RO_NHFREE(_ro) do { \ + if ((_ro)->ro_nh) { \ + NH_FREE((_ro)->ro_nh); \ + (_ro)->ro_nh = NULL; \ } \ - /* guard against invalid refs */ \ - _rt = 0; \ } while (0) -#define RTFREE(_rt) do { \ - RT_LOCK(_rt); \ - RTFREE_LOCKED(_rt); \ -} while (0) - -#define RO_RTFREE(_ro) do { \ - if ((_ro)->ro_rt) \ - RTFREE((_ro)->ro_rt); \ -} while (0) - #define RO_INVALIDATE_CACHE(ro) do { \ - RO_RTFREE(ro); \ if ((ro)->ro_lle != NULL) { \ LLE_FREE((ro)->ro_lle); \ (ro)->ro_lle = NULL; \ } \ + if ((ro)->ro_nh != NULL) { \ + NH_FREE((ro)->ro_nh); \ + (ro)->ro_nh = NULL; \ + } \ } while (0) /* @@ -432,7 +367,7 @@ * out-of-date cache, simply free it. Update the generation number * for the new allocation */ -#define RT_VALIDATE(ro, cookiep, fibnum) do { \ +#define NH_VALIDATE(ro, cookiep, fibnum) do { \ rt_gen_t cookie = RT_GEN(fibnum, (ro)->ro_dst.sa_family); \ if (*(cookiep) != cookie) { \ RO_INVALIDATE_CACHE(ro); \ @@ -440,6 +375,25 @@ } \ } while (0) +/* Keep values consistent with RTM_ ones for now */ +enum rib_cmd_type { + RIB_ADD = 1, /* Add route to the RIB */ + RIB_DEL = 2, /* Delete route from the RIB */ + RIB_CHANGE = 3, /* Change route properties */ +}; + +struct rib_cmd_info { + uint8_t cmd; /* RTM_ADD|RTM_DEL|RTM_CHANGE */ + uint8_t num_changed; /* Number of changed nhops */ + uint8_t idx_changed; /* Index of the first changed nhop */ + uint8_t spare; + uint32_t rt_weight; /* new weight */ + struct rtentry *rt; /* Target entry */ + struct nhop_object *nh_old; /* Target nhop OR mpath */ + struct nhop_object *nh_new; /* Target nhop OR mpath */ + uint64_t mask_changed; /* Bitmask of changed nhops */ +}; + struct ifmultiaddr; struct rib_head; @@ -450,18 +404,18 @@ void rt_missmsg_fib(int, struct rt_addrinfo *, int, int, int); void 
rt_newaddrmsg_fib(int, struct ifaddr *, struct rtentry *, int); int rt_addrmsg(int, struct ifaddr *, int); -int rt_routemsg(int, struct rtentry *, struct ifnet *ifp, int, int); +int rt_routemsg(int, struct rtentry *, struct nhop_object *, int); int rt_routemsg_info(int, struct rt_addrinfo *, int); void rt_newmaddrmsg(int, struct ifmultiaddr *); -int rt_setgate(struct rtentry *, struct sockaddr *, struct sockaddr *); void rt_maskedcopy(struct sockaddr *, struct sockaddr *, struct sockaddr *); struct rib_head *rt_table_init(int, int, u_int); void rt_table_destroy(struct rib_head *); u_int rt_tables_get_gen(int table, int fam); +int rt_getifa_fib(struct rt_addrinfo *info, u_int fibnum); int rtsock_addrmsg(int, struct ifaddr *, int); -int rtsock_routemsg(int, struct rtentry *, struct ifnet *ifp, int, int); -int rtsock_routemsg_info(int, struct rt_addrinfo *, int); +int rtsock_routemsg(int, struct rtentry *, struct nhop_object *, int); +int rtsock_routemsg_info(int cmd, struct rt_addrinfo *info, int fibnum); /* * Note the following locking behavior: @@ -487,25 +441,67 @@ /* XXX MRT COMPAT VERSIONS THAT SET UNIVERSE to 0 */ /* Thes are used by old code not yet converted to use multiple FIBS */ -struct rtentry *rtalloc1(struct sockaddr *, int, u_long); int rtinit(struct ifaddr *, int, int); /* XXX MRT NEW VERSIONS THAT USE FIBs * For now the protocol indepedent versions are the same as the AF_INET ones * but this will change.. 
*/ -void rtalloc_ign_fib(struct route *ro, u_long ignflags, u_int fibnum); -struct rtentry *rtalloc1_fib(struct sockaddr *, int, u_long, u_int); int rtioctl_fib(u_long, caddr_t, u_int); -int rtrequest_fib(int, struct sockaddr *, - struct sockaddr *, struct sockaddr *, int, struct rtentry **, u_int); -int rtrequest1_fib(int, struct rt_addrinfo *, struct rtentry **, u_int); + int rib_lookup_info(uint32_t, const struct sockaddr *, uint32_t, uint32_t, struct rt_addrinfo *); void rib_free_info(struct rt_addrinfo *info); int rib_add_redirect(u_int fibnum, struct sockaddr *dst, struct sockaddr *gateway, struct sockaddr *author, struct ifnet *ifp, int flags, int expire_sec); + +/* New API */ +#define ROUTE_DEFAULT_WEIGHT 100 +#define ROUTE_MAX_WEIGHT 16777215 /* Limit weight to 3 bytes */ + +int rib_add_route(u_int fibnum, struct rt_addrinfo *info, + struct rib_cmd_info *rc); +int rib_del_route(u_int fibnum, struct rt_addrinfo *info, + struct rib_cmd_info *rc); +int rib_change_route(u_int fibnum, struct rt_addrinfo *info, + struct rib_cmd_info *rc); +int rib_request(enum rib_cmd_type cmd, u_int fibnum, + struct rt_addrinfo *info, struct rib_cmd_info *rc); +int rib_request_simple(enum rib_cmd_type cmd, u_int fibnum, + struct sockaddr *dst, struct sockaddr *mask, struct sockaddr *gw, + int rt_flags); + +int rib_lookup_route_netmask(u_int fibnum, const struct sockaddr *dst, + const struct sockaddr *mask, struct rtentry **ret); + +int rib_get_entry_prefix(const struct rtentry *rt, struct sockaddr *dst, + struct sockaddr *netmask, int *plen); +struct sockaddr *rib_get_entry_netmask_sa(const struct rtentry *rt, + struct sockaddr *netmask, size_t sa_len, int *error); +struct sockaddr *rib_get_entry_dst_sa(const struct rtentry *rt, + struct sockaddr *dst, size_t sa_len, int *error); +int rib_get_entry_weight(const struct rtentry *rt); +unsigned long rib_get_entry_expire_time(const struct rtentry *rt); +int rib_get_entry_rtflags(const struct rtentry *rt, + const struct nhop_object 
*nh); +const struct nhop_object *rib_get_entry_nhop(const struct rtentry *rt); +sa_family_t rib_get_entry_family(const struct rtentry *rt); +unsigned int rib_get_entry_fibnum(const struct rtentry *rt); +int rib_get_entry_plen(const struct rtentry *rt); +int rib_can_export_rte(struct ucred *td_ucred, const struct rtentry *rt); + +/* Helper functions */ +typedef void(route_notification_t)(int cmd, struct rib_head *rnh, + struct rt_addrinfo *info, struct rtentry *rt, struct nhop_object *nh_old, + struct nhop_object *nh_new, uint32_t weight, void *cbdata); +int rib_decompose_notification(struct rib_head *rnh, struct rt_addrinfo *info, + struct rib_cmd_info *rc, route_notification_t *cb, void *cbdata); +void rib_notify_subscribers(struct rib_head *rnh, struct rt_addrinfo *info, + struct rib_cmd_info *rc); + +int rib_print_sockaddr(char *buf, int buflen, const struct sockaddr *s); +void rib_walk(int af, u_int fibnum, rt_walktree_f_t *wa_f, void *arg); #endif Index: sys/net/route.c =================================================================== --- sys/net/route.c +++ sys/net/route.c @@ -61,7 +61,11 @@ #include #include #include +#include +#include +#define NEED_RTZONE #include +#include #include #ifdef RADIX_MPATH @@ -108,7 +112,7 @@ SYSCTL_UINT(_net, OID_AUTO, add_addr_allfibs, CTLFLAG_RWTUN | CTLFLAG_VNET, &VNET_NAME(rt_add_addr_allfibs), 0, ""); -VNET_PCPUSTAT_DEFINE_STATIC(struct rtstat, rtstat); +VNET_PCPUSTAT_DEFINE(struct rtstat, rtstat); #define RTSTAT_ADD(name, val) \ VNET_PCPUSTAT_ADD(struct rtstat, rtstat, name, (val)) #define RTSTAT_INC(name) RTSTAT_ADD(name, 1) @@ -124,47 +128,30 @@ VNET_DEFINE(int, rttrash); /* routes not in table but not freed */ #define V_rttrash VNET(rttrash) +#if 0 +VNET_DEFINE(uma_zone_t, rtzone); /* Routing table UMA zone. */ +#endif +uma_zone_t rtzone; /* Routing table UMA zone. */ -/* - * Convert a 'struct radix_node *' to a 'struct rtentry *'. 
- * The operation can be done safely (in this code) because a - * 'struct rtentry' starts with two 'struct radix_node''s, the first - * one representing leaf nodes in the routing tree, which is - * what the code in radix.c passes us as a 'struct radix_node'. - * - * But because there are a lot of assumptions in this conversion, - * do not cast explicitly, but always use the macro below. - */ -#define RNTORT(p) ((struct rtentry *)(p)) - -VNET_DEFINE_STATIC(uma_zone_t, rtzone); /* Routing table UMA zone. */ -#define V_rtzone VNET(rtzone) - EVENTHANDLER_LIST_DEFINE(rt_addrmsg); -static int rt_getifa_fib(struct rt_addrinfo *, u_int); -static int rtrequest1_fib_change(struct rib_head *, struct rt_addrinfo *, - struct rtentry **, u_int); -static void rt_setmetrics(const struct rt_addrinfo *, struct rtentry *); -static int rt_ifdelroute(const struct rtentry *rt, void *arg); -static struct rtentry *rt_unlinkrte(struct rib_head *rnh, - struct rt_addrinfo *info, int *perror); -static void rt_notifydelete(struct rtentry *rt, struct rt_addrinfo *info); +static void destroy_rtentry(struct rtentry *rt); +static void destroy_rtentry_epoch(epoch_context_t ctx); + +static int rt_ifdelroute(const struct rtentry *rt, const struct nhop_object *, + void *arg); +static void rt_notifydelete(struct rtentry *rt, struct nhop_object *nh, + struct rt_addrinfo *info); #ifdef RADIX_MPATH static struct radix_node *rt_mpath_unlink(struct rib_head *rnh, struct rt_addrinfo *info, struct rtentry *rto, int *perror); #endif -static int rt_exportinfo(struct rtentry *rt, struct rt_addrinfo *info, - int flags); +static int rt_exportinfo(struct rtentry *rt, struct nhop_object *nh, + struct rt_addrinfo *info, int flags); -struct if_mtuinfo -{ - struct ifnet *ifp; - int mtu; -}; +int p_sockaddr(char *buf, int buflen, const struct sockaddr *s, int family); +int rt_print(char *buf, int buflen, const struct rtentry *rt); -static int if_updatemtu_cb(struct radix_node *, void *); - /* * handler for net.my_fibnum 
*/ @@ -222,31 +209,11 @@ } -/* - * route initialization must occur before ip6_init2(), which happenas at - * SI_ORDER_MIDDLE. - */ -static void -route_init(void) -{ - - /* whack the tunable ints into line. */ - if (rt_numfibs > RT_MAXFIBS) - rt_numfibs = RT_MAXFIBS; - if (rt_numfibs == 0) - rt_numfibs = 1; -} -SYSINIT(route_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, route_init, NULL); - static int rtentry_zinit(void *mem, int size, int how) { struct rtentry *rt = mem; - rt->rt_pksent = counter_u64_alloc(how); - if (rt->rt_pksent == NULL) - return (ENOMEM); - RT_LOCK_INIT(rt); return (0); @@ -258,7 +225,6 @@ struct rtentry *rt = mem; RT_LOCK_DESTROY(rt); - counter_u64_free(rt->rt_pksent); } static int @@ -267,7 +233,6 @@ struct rtentry *rt = mem; bzero(rt, offsetof(struct rtentry, rt_endzero)); - counter_u64_zero(rt->rt_pksent); rt->rt_chain = NULL; return (0); @@ -281,7 +246,26 @@ RT_UNLOCK_COND(rt); } +/* + * route initialization must occur before ip6_init2(), which happenas at + * SI_ORDER_MIDDLE. + */ static void +route_init(void) +{ + + /* whack the tunable ints into line. 
*/ + if (rt_numfibs > RT_MAXFIBS) + rt_numfibs = RT_MAXFIBS; + if (rt_numfibs == 0) + rt_numfibs = 1; + V_rtzone = uma_zcreate("rtentry", sizeof(struct rtentry), + rtentry_ctor, rtentry_dtor, + rtentry_zinit, rtentry_zfini, UMA_ALIGN_PTR, 0); +} +SYSINIT(route_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, route_init, NULL); + +static void vnet_route_init(const void *unused __unused) { struct domain *dom; @@ -292,9 +276,11 @@ V_rt_tables = malloc(rt_numfibs * (AF_MAX+1) * sizeof(struct rib_head *), M_RTABLE, M_WAITOK|M_ZERO); +/* V_rtzone = uma_zcreate("rtentry", sizeof(struct rtentry), rtentry_ctor, rtentry_dtor, rtentry_zinit, rtentry_zfini, UMA_ALIGN_PTR, 0); +*/ for (dom = domains; dom; dom = dom->dom_next) { if (dom->dom_rtattach == NULL) continue; @@ -323,6 +309,9 @@ struct domain *dom; struct rib_head **rnh; + rnh = (struct rib_head **)V_rt_tables; + printf("--VNET V_rt_tables=%p\n", rnh); + for (dom = domains; dom; dom = dom->dom_next) { if (dom->dom_rtdetach == NULL) continue; @@ -341,7 +330,7 @@ } free(V_rt_tables, M_RTABLE); - uma_zdestroy(V_rtzone); + //uma_zdestroy(V_rtzone); } VNET_SYSUNINIT(vnet_route_uninit, SI_SUB_PROTO_DOMAIN, SI_ORDER_FIRST, vnet_route_uninit, 0); @@ -372,6 +361,8 @@ /* Init locks */ RIB_LOCK_INIT(rh); + nhops_init(rh); + /* Finally, set base callbacks */ rh->rnh_addaddr = rn_addroute; rh->rnh_deladdr = rn_delete; @@ -403,6 +394,8 @@ rn_walktree(&rh->rmhead.head, rt_freeentry, &rh->rmhead.head); + nhops_destroy(rh); + /* Assume table is already empty */ RIB_LOCK_DESTROY(rh); free(rh, M_RTABLE); @@ -423,39 +416,7 @@ return (0); } -/* - * Packet routing routines. 
- */ -void -rtalloc_ign_fib(struct route *ro, u_long ignore, u_int fibnum) -{ - struct rtentry *rt; - - if ((rt = ro->ro_rt) != NULL) { - if (rt->rt_ifp != NULL && rt->rt_flags & RTF_UP) - return; - RTFREE(rt); - ro->ro_rt = NULL; - } - ro->ro_rt = rtalloc1_fib(&ro->ro_dst, 1, ignore, fibnum); - if (ro->ro_rt) - RT_UNLOCK(ro->ro_rt); -} - -/* - * Look up the route that matches the address given - * Or, at least try.. Create a cloned route if needed. - * - * The returned route, if any, is locked. - */ struct rtentry * -rtalloc1(struct sockaddr *dst, int report, u_long ignflags) -{ - - return (rtalloc1_fib(dst, report, ignflags, RT_DEFAULT_FIB)); -} - -struct rtentry * rtalloc1_fib(struct sockaddr *dst, int report, u_long ignflags, u_int fibnum) { @@ -534,9 +495,34 @@ RT_REMREF(rt); if (rt->rt_refcnt > 0) { log(LOG_DEBUG, "%s: %p has %d refs\n", __func__, rt, rt->rt_refcnt); - goto done; + RT_UNLOCK(rt); + return; } + RT_UNLOCK(rt); + /* Save curvnet */ +#ifdef VIMAGE + rt->rt_chain = (struct rtentry *)curvnet; +#else + rt->rt_chain = NULL; +#endif + + epoch_call(net_epoch_preempt, destroy_rtentry_epoch, &rt->rt_epoch_ctx); +} + + +__noinline static void +destroy_rtentry(struct rtentry *rt) +{ +#if 0 + struct rib_head *rnh; + +#ifdef VIMAGE + CURVNET_SET((struct vnet *)rt->rt_chain); +#endif + rnh = rt_tables_get_rnh(rt->rt_fibnum, rt_key(rt)->sa_family); + KASSERT(rnh != NULL,("%s: NULL rnh", __func__)); + /* * On last reference give the "close method" a chance * to cleanup private state. This also permits (for @@ -548,50 +534,59 @@ */ if (rt->rt_refcnt == 0 && rnh->rnh_close) rnh->rnh_close((struct radix_node *)rt, &rnh->head); +#endif /* * If we are no longer "up" (and ref == 0) * then we can free the resources associated * with the route. 
*/ - if ((rt->rt_flags & RTF_UP) == 0) { + if (!RT_IS_UP(rt)) { if (rt->rt_nodes->rn_flags & (RNF_ACTIVE | RNF_ROOT)) panic("rtfree 2"); /* * the rtentry must have been removed from the routing table * so it is represented in rttrash.. remove that now. */ - V_rttrash--; + //V_rttrash--; #ifdef DIAGNOSTIC if (rt->rt_refcnt < 0) { printf("rtfree: %p not freed (neg refs)\n", rt); - goto done; + CURVNET_RESTORE(); + return; } #endif /* - * release references on items we hold them on.. - * e.g other routes and ifaddrs. - */ - if (rt->rt_ifa) - ifa_free(rt->rt_ifa); - /* * The key is separatly alloc'd so free it (see rt_setgate()). * This also frees the gateway, as they are always malloc'd * together. */ - R_Free(rt_key(rt)); + if (rt_key(rt) != &rt->rt_dst) + free(rt_key(rt), M_RTABLE); /* * and the rtentry itself of course */ uma_zfree(V_rtzone, rt); - return; } -done: - RT_UNLOCK(rt); + + //CURVNET_RESTORE(); } /* + * Epoch callback indicating rtentry is safe to destroy + */ +static void +destroy_rtentry_epoch(epoch_context_t ctx) +{ + struct rtentry *rt; + + rt = __containerof(ctx, struct rtentry, rt_epoch_ctx); + + destroy_rtentry(rt); +} + +/* * Adds a temporal redirect entry to the routing table. 
* @fibnum: fib number * @dst: destination to install redirect to @@ -607,9 +602,9 @@ rib_add_redirect(u_int fibnum, struct sockaddr *dst, struct sockaddr *gateway, struct sockaddr *author, struct ifnet *ifp, int flags, int lifetime_sec) { - struct rtentry *rt; int error; struct rt_addrinfo info; + struct rib_cmd_info rc; struct rt_metrics rti_rmx; struct ifaddr *ifa; @@ -641,7 +636,7 @@ info.rti_mflags |= RTV_EXPIRE; info.rti_rmx = &rti_rmx; - error = rtrequest1_fib(RTM_ADD, &info, &rt, fibnum); + error = rib_add_route(fibnum, &info, &rc); ifa_free(ifa); if (error != 0) { @@ -649,9 +644,7 @@ return (error); } - RT_LOCK(rt); - flags = rt->rt_flags; - RTFREE_LOCKED(rt); + flags = rib_get_entry_rtflags(rc.rt, rc.nh_new); RTSTAT_INC(rts_dynamic); @@ -719,6 +712,7 @@ ifa = ifa_ifwithnet(gateway, 0, fibnum); if (ifa == NULL) { struct rtentry *rt; + struct nhop_object *nh; rt = rtalloc1_fib(gateway, 0, flags, fibnum); if (rt == NULL) @@ -739,8 +733,9 @@ default: break; } - if (!not_found && rt->rt_ifa != NULL) { - ifa = rt->rt_ifa; + nh = RT_SELECT_NHOP(rt, 0); + if (!not_found && nh->nh_ifa != NULL) { + ifa = nh->nh_ifa; } RT_REMREF(rt); RT_UNLOCK(rt); @@ -758,33 +753,6 @@ } /* - * Do appropriate manipulations of a routing tree given - * all the bits of info needed - */ -int -rtrequest_fib(int req, - struct sockaddr *dst, - struct sockaddr *gateway, - struct sockaddr *netmask, - int flags, - struct rtentry **ret_nrt, - u_int fibnum) -{ - struct rt_addrinfo info; - - if (dst->sa_len == 0) - return(EINVAL); - - bzero((caddr_t)&info, sizeof(info)); - info.rti_flags = flags; - info.rti_info[RTAX_DST] = dst; - info.rti_info[RTAX_GATEWAY] = gateway; - info.rti_info[RTAX_NETMASK] = netmask; - return rtrequest1_fib(req, &info, ret_nrt, fibnum); -} - - -/* * Copy most of @rt data into @info. * * If @flags contains NHR_COPY, copies dst,netmask and gw to the @@ -798,7 +766,8 @@ * Returns 0 on success. 
*/ int -rt_exportinfo(struct rtentry *rt, struct rt_addrinfo *info, int flags) +rt_exportinfo(struct rtentry *rt, struct nhop_object *nh, + struct rt_addrinfo *info, int flags) { struct rt_metrics *rmx; struct sockaddr *src, *dst; @@ -833,9 +802,9 @@ } /* Copy gateway is set && dst is non-zero */ - src = rt->rt_gateway; + src = &nh->gw_sa; dst = info->rti_info[RTAX_GATEWAY]; - if ((rt->rt_flags & RTF_GATEWAY) && src != NULL && dst != NULL){ + if ((nh->nh_flags & NHF_GATEWAY) && src != NULL && dst != NULL){ if (src->sa_len > dst->sa_len) return (ENOMEM); memcpy(dst, src, src->sa_len); @@ -848,8 +817,8 @@ info->rti_info[RTAX_NETMASK] = rt_mask(rt); info->rti_addrs |= RTA_NETMASK; } - if (rt->rt_flags & RTF_GATEWAY) { - info->rti_info[RTAX_GATEWAY] = rt->rt_gateway; + if (nh->nh_flags & NHF_GATEWAY) { + info->rti_info[RTAX_GATEWAY] = &nh->gw_sa; info->rti_addrs |= RTA_GATEWAY; } } @@ -857,12 +826,12 @@ rmx = info->rti_rmx; if (rmx != NULL) { info->rti_mflags |= RTV_MTU; - rmx->rmx_mtu = rt->rt_mtu; + rmx->rmx_mtu = nh->nh_mtu; } - info->rti_flags = rt->rt_flags; - info->rti_ifp = rt->rt_ifp; - info->rti_ifa = rt->rt_ifa; + info->rti_flags = rib_get_entry_rtflags(rt, nh); + info->rti_ifp = nh->nh_ifp; + info->rti_ifa = nh->nh_ifa; if (flags & NHR_REF) { if_ref(info->rti_ifp); ifa_ref(info->rti_ifa); @@ -889,6 +858,7 @@ struct rib_head *rh; struct radix_node *rn; struct rtentry *rt; + struct nhop_object *nh; int error; KASSERT((fibnum < rt_numfibs), ("rib_lookup_rte: bad fibnum")); @@ -900,10 +870,11 @@ rn = rh->rnh_matchaddr(__DECONST(void *, dst), &rh->head); if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) { rt = RNTORT(rn); + nh = RT_SELECT_NHOP((RNTORT(rn)), flowid); /* Ensure route & ifp is UP */ - if (RT_LINK_IS_UP(rt->rt_ifp)) { + if (RT_LINK_IS_UP(nh->nh_ifp)) { flags = (flags & NHR_REF) | NHR_COPY; - error = rt_exportinfo(rt, info, flags); + error = rt_exportinfo(rt, nh, info, flags); RIB_RUNLOCK(rh); return (error); @@ -927,10 +898,10 @@ } /* - * Iterates 
over all existing fibs in system calling - * @setwa_f function prior to traversing each fib. + * Iterates over all existing fibs in system and deletes each element + * for which @filter_f function returns non-zero value. * Calls @wa_f function for each element in current fib. - * If af is not AF_UNSPEC, iterates over fibs in particular + * If @family is not AF_UNSPEC, iterates over fibs in particular * address family. */ void @@ -938,11 +909,11 @@ void *arg) { struct rib_head *rnh; - uint32_t fibnum; + u_int fibnum; int i; for (fibnum = 0; fibnum < rt_numfibs; fibnum++) { - /* Do we want some specific family? */ + /* Do we want some specific af? */ if (af != AF_UNSPEC) { rnh = rt_tables_get_rnh(fibnum, af); if (rnh == NULL) @@ -975,36 +946,148 @@ struct rt_addrinfo info; struct rib_head *rnh; struct rtentry *head; + int error_count; }; + +#ifdef ROUTE_MPATH /* - * Conditionally unlinks @rn from radix tree based - * on info data passed in @arg. + * Helper function to remove matching paths from multipath route. 
+ * @rt: prefix rtentry + * @di: filter function and data */ +static void +rt_checkdelroute_mpath(struct rtentry *rt, struct rt_delinfo *di) +{ + struct nhop_object *nh_new; + struct nhop_mpath *mp, *mp_new; + struct rt_addrinfo *info; + struct weightened_nhop *wn; + struct radix_node *rn; + uint64_t del_mask; + uint32_t num_nhops, weight_new; + int count; + + count = 0; + del_mask = 0; + info = &di->info; + nh_new = NULL; + weight_new = 0; + + mp = (struct nhop_mpath *)rt->rt_nhop; + wn = mpath_get_nhops(mp, &num_nhops); + for (uint32_t i = 0; i < num_nhops; i++) { + if (info->rti_filter.func(rt, wn[i].nh, info->rti_filter.data) != 0) { + del_mask |= (1 << i); + count++; + nh_new = wn[i].nh; + weight_new = wn[i].weight; + } + } + + if (count == 0) { + /* No matches, just return */ + return; + } else if (count == num_nhops) { + /* Eliminated all paths */ + rn = di->rnh->rnh_deladdr(rt_key(rt), rt_mask(rt), &di->rnh->head); + if (rn == NULL) { + di->error_count++; + return; + } + /* Entry was unlinked. Lock, add to the list and return */ + RT_LOCK(rt); + rt->rte_flags &= ~RTF_UP; + rt->rt_chain = di->head; + di->head = rt; + + return; + } else if (count + 1 == num_nhops) { + /* + * Eliminated all but one path, so it's not multipath + * group anymore. + */ + KASSERT((nh_new != NULL), ("nh_new == NULL")); + + /* Switch mpath group to a single nexthop */ + nhop_ref_object(nh_new); + + RT_LOCK(rt); + rt->rt_nhop = nh_new; + /* XXX: violates rte fields immutability */ + rt->rt_weight = weight_new; + RT_UNLOCK(rt); + + mpath_free_group(mp); + + return; + } + + /* + * The worst case: new nhop group needs to be created, while radix + * WLOCK is held + */ + mp_new = mpath_get_del_nhops(di->rnh->nh_control, mp, &del_mask); + if (mp_new == NULL) { + /* + * Failed to create new nexthop group, thus + * route deletion fails. + * + * Given that nexthops reference all necessary pieces + * it may not be fatal for the system, however the stale route + * has to be evicted somehow.
Assume the routing daemon will + * do the housekeeping. + */ + di->error_count++; + return; + } else { + /* Switch to a new mpath group, freeing the old one. */ + RT_LOCK(rt); + rt->rt_nhop = (struct nhop_object *)mp_new; + RT_UNLOCK(rt); + mpath_free_group(mp); + } + + return; +} +#endif + static int rt_checkdelroute(struct radix_node *rn, void *arg) { struct rt_delinfo *di; struct rt_addrinfo *info; struct rtentry *rt; - int error; + struct nhop_object *nh; + struct rib_head *rnh; di = (struct rt_delinfo *)arg; rt = (struct rtentry *)rn; + rnh = di->rnh; info = &di->info; - error = 0; - info->rti_info[RTAX_DST] = rt_key(rt); - info->rti_info[RTAX_NETMASK] = rt_mask(rt); - info->rti_info[RTAX_GATEWAY] = rt->rt_gateway; + nh = rt->rt_nhop; - rt = rt_unlinkrte(di->rnh, info, &error); - if (rt == NULL) { - /* Either not allowed or not matched. Skip entry */ +#ifdef ROUTE_MPATH + if (NH_IS_MULTIPATH(nh)) { + rt_checkdelroute_mpath(rt, di); return (0); } +#endif + if (info->rti_filter(rt, nh, info->rti_filterdata) == 0) { + /* Not matched */ + return (0); + } + rn = rnh->rnh_deladdr(rt_key(rt), rt_mask(rt), &rnh->head); + if (rn == NULL) { + di->error_count++; + return (0); + } /* Entry was unlinked. Add to the list and return */ + RT_LOCK(rt); + RT_ADDREF(rt); + rt->rte_flags &= ~RTF_UP; rt->rt_chain = di->head; di->head = rt; @@ -1019,6 +1102,10 @@ * @filter_f: function returning non-zero value for items to delete * @arg: data to pass to the @filter_f function * @report: true if rtsock notification is needed. + * + * Note: currently reporting is NOT supported for multipath routes. + * The only current customer requiring reporting is temporal routes, + * which are not multipath. 
*/ void rib_walk_del(u_int fibnum, int family, rt_filter_f_t *filter_f, void *arg, bool report) @@ -1026,6 +1113,7 @@ struct rib_head *rnh; struct rt_delinfo di; struct rtentry *rt; + struct nhop_object *nh; rnh = rt_tables_get_rnh(fibnum, family); if (rnh == NULL) @@ -1053,12 +1141,37 @@ di.info.rti_info[RTAX_DST] = rt_key(rt); di.info.rti_info[RTAX_NETMASK] = rt_mask(rt); - rt_notifydelete(rt, &di.info); + nh = rt->rt_nhop; +#ifdef ROUTE_MPATH + if (NH_IS_MULTIPATH(nh)) { + struct weightened_nhop *wn; + uint32_t num_nhops; + + wn = nhgrp_get_nhops((struct nhgrp_object *)rt->rt_nhop, + &num_nhops); + for (uint32_t i = 0; i < num_nhops; i++) { + nh = wn[i].nh; + rt_notifydelete(rt, nh, &di.info); + if (report) + rt_routemsg(RTM_DELETE, rt, nh, fibnum); + } + nhgrp_free_group((struct nhgrp_object *)rt->rt_nhop); + } else +#endif + { + rt_notifydelete(rt, nh, &di.info); - if (report) - rt_routemsg(RTM_DELETE, rt, rt->rt_ifp, 0, fibnum); + if (report) + rt_routemsg(RTM_DELETE, rt, nh, fibnum); + NH_FREE(nh); + } + RTFREE_LOCKED(rt); } + + if (di.error_count > 0) + log(LOG_ERR, "Unable to delete %u route(s) for fib %u in family %d\n", + di.error_count, fibnum, family); } /* @@ -1107,18 +1220,18 @@ * errno failed - reason indicated */ static int -rt_ifdelroute(const struct rtentry *rt, void *arg) +rt_ifdelroute(const struct rtentry *rt, const struct nhop_object *nh, void *arg) { struct ifnet *ifp = arg; - if (rt->rt_ifp != ifp) + if (nh->nh_ifp != ifp) return (0); /* * Protect (sorta) against walktree recursion problems * with cloned routes */ - if ((rt->rt_flags & RTF_UP) == 0) + if (!RT_IS_UP(rt)) return (0); return (1); @@ -1146,97 +1259,24 @@ rt_foreach_fib_walk_del(AF_UNSPEC, rt_ifdelroute, ifp); } -/* - * Conditionally unlinks rtentry matching data inside @info from @rnh. 
- * Returns unlinked, locked and referenced @rtentry on success, - * Returns NULL and sets @perror to: - * ESRCH - if prefix was not found, - * EADDRINUSE - if trying to delete PINNED route without appropriate flag. - * ENOENT - if supplied filter function returned 0 (not matched). - */ -static struct rtentry * -rt_unlinkrte(struct rib_head *rnh, struct rt_addrinfo *info, int *perror) -{ - struct sockaddr *dst, *netmask; - struct rtentry *rt; - struct radix_node *rn; - - dst = info->rti_info[RTAX_DST]; - netmask = info->rti_info[RTAX_NETMASK]; - - rt = (struct rtentry *)rnh->rnh_lookup(dst, netmask, &rnh->head); - if (rt == NULL) { - *perror = ESRCH; - return (NULL); - } - - if ((info->rti_flags & RTF_PINNED) == 0) { - /* Check if target route can be deleted */ - if (rt->rt_flags & RTF_PINNED) { - *perror = EADDRINUSE; - return (NULL); - } - } - - if (info->rti_filter != NULL) { - if (info->rti_filter(rt, info->rti_filterdata) == 0) { - /* Not matched */ - *perror = ENOENT; - return (NULL); - } - - /* - * Filter function requested rte deletion. - * Ease the caller work by filling in remaining info - * from that particular entry. - */ - info->rti_info[RTAX_GATEWAY] = rt->rt_gateway; - } - - /* - * Remove the item from the tree and return it. - * Complain if it is not there and do no more processing. - */ - *perror = ESRCH; -#ifdef RADIX_MPATH - if (rt_mpath_capable(rnh)) - rn = rt_mpath_unlink(rnh, info, rt, perror); - else -#endif - rn = rnh->rnh_deladdr(dst, netmask, &rnh->head); - if (rn == NULL) - return (NULL); - - if (rn->rn_flags & (RNF_ACTIVE | RNF_ROOT)) - panic ("rtrequest delete"); - - rt = RNTORT(rn); - RT_LOCK(rt); - RT_ADDREF(rt); - rt->rt_flags &= ~RTF_UP; - - *perror = 0; - - return (rt); -} - static void -rt_notifydelete(struct rtentry *rt, struct rt_addrinfo *info) +rt_notifydelete(struct rtentry *rt, struct nhop_object *nh, struct rt_addrinfo *info) { struct ifaddr *ifa; /* * give the protocol a chance to keep things in sync. 
*/ - ifa = rt->rt_ifa; + ifa = nh->nh_ifa; if (ifa != NULL && ifa->ifa_rtrequest != NULL) - ifa->ifa_rtrequest(RTM_DELETE, rt, info); + ifa->ifa_rtrequest(RTM_DELETE, rt, nh, info); /* * One more rtentry floating around that is not * linked to the routing table. rttrash will be decremented * when RTFREE(rt) is eventually called. */ + /* XXX: pcpu? */ V_rttrash++; } @@ -1336,120 +1376,81 @@ return (error); } -static int -if_updatemtu_cb(struct radix_node *rn, void *arg) -{ - struct rtentry *rt; - struct if_mtuinfo *ifmtu; - rt = (struct rtentry *)rn; - ifmtu = (struct if_mtuinfo *)arg; - - if (rt->rt_ifp != ifmtu->ifp) - return (0); - - if (rt->rt_mtu >= ifmtu->mtu) { - /* We have to decrease mtu regardless of flags */ - rt->rt_mtu = ifmtu->mtu; - return (0); - } - - /* - * New MTU is bigger. Check if are allowed to alter it - */ - if ((rt->rt_flags & (RTF_FIXEDMTU | RTF_GATEWAY | RTF_HOST)) != 0) { - - /* - * Skip routes with user-supplied MTU and - * non-interface routes - */ - return (0); - } - - /* We are safe to update route MTU */ - rt->rt_mtu = ifmtu->mtu; - - return (0); -} - +/* + * Updates transmit mtu for all routes using interface @ifp. + */ void rt_updatemtu(struct ifnet *ifp) { - struct if_mtuinfo ifmtu; struct rib_head *rnh; + uint32_t mtu; int i, j; - ifmtu.ifp = ifp; - /* * Try to update rt_mtu for all routes using this interface * Unfortunately the only way to do this is to traverse all * routing tables in all fibs/domains. 
*/ for (i = 1; i <= AF_MAX; i++) { - ifmtu.mtu = if_getmtu_family(ifp, i); + mtu = if_getmtu_family(ifp, i); for (j = 0; j < rt_numfibs; j++) { rnh = rt_tables_get_rnh(j, i); if (rnh == NULL) continue; - RIB_WLOCK(rnh); - rnh->rnh_walktree(&rnh->head, if_updatemtu_cb, &ifmtu); - RIB_WUNLOCK(rnh); + nhops_update_ifmtu(rnh, ifp, mtu); } } } - -#if 0 -int p_sockaddr(char *buf, int buflen, struct sockaddr *s); -int rt_print(char *buf, int buflen, struct rtentry *rt); - int -p_sockaddr(char *buf, int buflen, struct sockaddr *s) +p_sockaddr(char *buf, int buflen, const struct sockaddr *s, int family) { - void *paddr = NULL; + const void *paddr = NULL; - switch (s->sa_family) { + switch (family) { case AF_INET: - paddr = &((struct sockaddr_in *)s)->sin_addr; + paddr = &((const struct sockaddr_in *)s)->sin_addr; break; case AF_INET6: - paddr = &((struct sockaddr_in6 *)s)->sin6_addr; + paddr = &((const struct sockaddr_in6 *)s)->sin6_addr; break; } if (paddr == NULL) return (0); - if (inet_ntop(s->sa_family, paddr, buf, buflen) == NULL) + if (inet_ntop(family, paddr, buf, buflen) == NULL) return (0); return (strlen(buf)); } int -rt_print(char *buf, int buflen, struct rtentry *rt) +rt_print(char *buf, int buflen, const struct rtentry *rt) { - struct sockaddr *addr, *mask; + const struct sockaddr *addr, *mask; int i = 0; + int family; - addr = rt_key(rt); - mask = rt_mask(rt); + addr = rt_key_const(rt); + mask = rt_mask_const(rt); + family = addr->sa_family; - i = p_sockaddr(buf, buflen, addr); - if (!(rt->rt_flags & RTF_HOST)) { + i = p_sockaddr(buf, buflen, addr, family); + if (!(rt->rte_flags & RTF_HOST)) { buf[i++] = '/'; - i += p_sockaddr(buf + i, buflen - i, mask); + i += p_sockaddr(buf + i, buflen - i, mask, family); } +#if 0 if (rt->rt_flags & RTF_GATEWAY) { buf[i++] = '>'; - i += p_sockaddr(buf + i, buflen - i, rt->rt_gateway); + i += p_sockaddr(buf + i, buflen - i, rt->rt_gateway, family); } - +#endif return (i); } -#endif #ifdef RADIX_MPATH /* @@ -1528,223 +1529,6 
@@ } #endif -int -rtrequest1_fib(int req, struct rt_addrinfo *info, struct rtentry **ret_nrt, - u_int fibnum) -{ - int error = 0; - struct rtentry *rt, *rt_old; - struct radix_node *rn; - struct rib_head *rnh; - struct ifaddr *ifa; - struct sockaddr *ndst; - struct sockaddr_storage mdst; - - KASSERT((fibnum < rt_numfibs), ("rtrequest1_fib: bad fibnum")); - KASSERT((flags & RTF_RNH_LOCKED) == 0, ("rtrequest1_fib: locked")); - switch (dst->sa_family) { - case AF_INET6: - case AF_INET: - /* We support multiple FIBs. */ - break; - default: - fibnum = RT_DEFAULT_FIB; - break; - } - - /* - * Find the correct routing tree to use for this Address Family - */ - rnh = rt_tables_get_rnh(fibnum, dst->sa_family); - if (rnh == NULL) - return (EAFNOSUPPORT); - - /* - * If we are adding a host route then we don't want to put - * a netmask in the tree, nor do we want to clone it. - */ - if (flags & RTF_HOST) - netmask = NULL; - - switch (req) { - case RTM_DELETE: - if (netmask) { - if (dst->sa_len > sizeof(mdst)) - return (EINVAL); - rt_maskedcopy(dst, (struct sockaddr *)&mdst, netmask); - dst = (struct sockaddr *)&mdst; - } - - RIB_WLOCK(rnh); - rt = rt_unlinkrte(rnh, info, &error); - RIB_WUNLOCK(rnh); - if (error != 0) - return (error); - - rt_notifydelete(rt, info); - - /* - * If the caller wants it, then it can have it, - * but it's up to it to free the rtentry as we won't be - * doing it. 
- */ - if (ret_nrt) { - *ret_nrt = rt; - RT_UNLOCK(rt); - } else - RTFREE_LOCKED(rt); - break; - case RTM_RESOLVE: - /* - * resolve was only used for route cloning - * here for compat - */ - break; - case RTM_ADD: - if ((flags & RTF_GATEWAY) && !gateway) - return (EINVAL); - if (dst && gateway && (dst->sa_family != gateway->sa_family) && - (gateway->sa_family != AF_UNSPEC) && (gateway->sa_family != AF_LINK)) - return (EINVAL); - - if (info->rti_ifa == NULL) { - error = rt_getifa_fib(info, fibnum); - if (error) - return (error); - } else { - ifa_ref(info->rti_ifa); - } - rt = uma_zalloc(V_rtzone, M_NOWAIT); - if (rt == NULL) { - ifa_free(info->rti_ifa); - return (ENOBUFS); - } - rt->rt_flags = RTF_UP | flags; - rt->rt_fibnum = fibnum; - /* - * Add the gateway. Possibly re-malloc-ing the storage for it. - */ - if ((error = rt_setgate(rt, dst, gateway)) != 0) { - ifa_free(info->rti_ifa); - uma_zfree(V_rtzone, rt); - return (error); - } - - /* - * point to the (possibly newly malloc'd) dest address. - */ - ndst = (struct sockaddr *)rt_key(rt); - - /* - * make sure it contains the value we want (masked if needed). - */ - if (netmask) { - rt_maskedcopy(dst, ndst, netmask); - } else - bcopy(dst, ndst, dst->sa_len); - - /* - * We use the ifa reference returned by rt_getifa_fib(). - * This moved from below so that rnh->rnh_addaddr() can - * examine the ifa and ifa->ifa_ifp if it so desires. 
- */ - ifa = info->rti_ifa; - rt->rt_ifa = ifa; - rt->rt_ifp = ifa->ifa_ifp; - rt->rt_weight = 1; - - rt_setmetrics(info, rt); - - RIB_WLOCK(rnh); - RT_LOCK(rt); -#ifdef RADIX_MPATH - /* do not permit exactly the same dst/mask/gw pair */ - if (rt_mpath_capable(rnh) && - rt_mpath_conflict(rnh, rt, netmask)) { - RIB_WUNLOCK(rnh); - - ifa_free(rt->rt_ifa); - R_Free(rt_key(rt)); - uma_zfree(V_rtzone, rt); - return (EEXIST); - } -#endif - - /* XXX mtu manipulation will be done in rnh_addaddr -- itojun */ - rn = rnh->rnh_addaddr(ndst, netmask, &rnh->head, rt->rt_nodes); - - if (rn != NULL && rt->rt_expire > 0) - tmproutes_update(rnh, rt); - - rt_old = NULL; - if (rn == NULL && (info->rti_flags & RTF_PINNED) != 0) { - - /* - * Force removal and re-try addition - * TODO: better multipath&pinned support - */ - struct sockaddr *info_dst = info->rti_info[RTAX_DST]; - info->rti_info[RTAX_DST] = ndst; - /* Do not delete existing PINNED(interface) routes */ - info->rti_flags &= ~RTF_PINNED; - rt_old = rt_unlinkrte(rnh, info, &error); - info->rti_flags |= RTF_PINNED; - info->rti_info[RTAX_DST] = info_dst; - if (rt_old != NULL) - rn = rnh->rnh_addaddr(ndst, netmask, &rnh->head, - rt->rt_nodes); - } - RIB_WUNLOCK(rnh); - - if (rt_old != NULL) - RT_UNLOCK(rt_old); - - /* - * If it still failed to go into the tree, - * then un-make it (this should be a function) - */ - if (rn == NULL) { - ifa_free(rt->rt_ifa); - R_Free(rt_key(rt)); - uma_zfree(V_rtzone, rt); - return (EEXIST); - } - - if (rt_old != NULL) { - rt_notifydelete(rt_old, info); - RTFREE(rt_old); - } - - /* - * If this protocol has something to add to this then - * allow it to do that as well. - */ - if (ifa->ifa_rtrequest) - ifa->ifa_rtrequest(req, rt, info); - - /* - * actually return a resultant rtentry and - * give the caller a single reference. 
- */ - if (ret_nrt) { - *ret_nrt = rt; - RT_ADDREF(rt); - } - rnh->rnh_gen++; /* Routing table updated */ - RT_UNLOCK(rt); - break; - case RTM_CHANGE: - RIB_WLOCK(rnh); - error = rtrequest1_fib_change(rnh, info, ret_nrt, fibnum); - RIB_WUNLOCK(rnh); - break; - default: - error = EOPNOTSUPP; - } - - return (error); -} - #undef dst #undef gateway #undef netmask @@ -1752,199 +1536,6 @@ #undef ifpaddr #undef flags -static int -rtrequest1_fib_change(struct rib_head *rnh, struct rt_addrinfo *info, - struct rtentry **ret_nrt, u_int fibnum) -{ - struct rtentry *rt = NULL; - int error = 0; - int free_ifa = 0; - int family, mtu; - struct if_mtuinfo ifmtu; - - RIB_WLOCK_ASSERT(rnh); - - rt = (struct rtentry *)rnh->rnh_lookup(info->rti_info[RTAX_DST], - info->rti_info[RTAX_NETMASK], &rnh->head); - - if (rt == NULL) - return (ESRCH); - -#ifdef RADIX_MPATH - /* - * If we got multipath routes, - * we require users to specify a matching RTAX_GATEWAY. - */ - if (rt_mpath_capable(rnh)) { - rt = rt_mpath_matchgate(rt, info->rti_info[RTAX_GATEWAY]); - if (rt == NULL) - return (ESRCH); - } -#endif - - RT_LOCK(rt); - - rt_setmetrics(info, rt); - - /* - * New gateway could require new ifaddr, ifp; - * flags may also be different; ifp may be specified - * by ll sockaddr when protocol address is ambiguous - */ - if (((rt->rt_flags & RTF_GATEWAY) && - info->rti_info[RTAX_GATEWAY] != NULL) || - info->rti_info[RTAX_IFP] != NULL || - (info->rti_info[RTAX_IFA] != NULL && - !sa_equal(info->rti_info[RTAX_IFA], rt->rt_ifa->ifa_addr))) { - /* - * XXX: Temporarily set RTF_RNH_LOCKED flag in the rti_flags - * to avoid rlock in the ifa_ifwithroute(). 
- */ - info->rti_flags |= RTF_RNH_LOCKED; - error = rt_getifa_fib(info, fibnum); - info->rti_flags &= ~RTF_RNH_LOCKED; - if (info->rti_ifa != NULL) - free_ifa = 1; - - if (error != 0) - goto bad; - } - - /* Check if outgoing interface has changed */ - if (info->rti_ifa != NULL && info->rti_ifa != rt->rt_ifa && - rt->rt_ifa != NULL) { - if (rt->rt_ifa->ifa_rtrequest != NULL) - rt->rt_ifa->ifa_rtrequest(RTM_DELETE, rt, info); - ifa_free(rt->rt_ifa); - rt->rt_ifa = NULL; - } - /* Update gateway address */ - if (info->rti_info[RTAX_GATEWAY] != NULL) { - error = rt_setgate(rt, rt_key(rt), info->rti_info[RTAX_GATEWAY]); - if (error != 0) - goto bad; - - rt->rt_flags &= ~RTF_GATEWAY; - rt->rt_flags |= (RTF_GATEWAY & info->rti_flags); - } - - if (info->rti_ifa != NULL && info->rti_ifa != rt->rt_ifa) { - ifa_ref(info->rti_ifa); - rt->rt_ifa = info->rti_ifa; - rt->rt_ifp = info->rti_ifp; - } - /* Allow some flags to be toggled on change. */ - rt->rt_flags &= ~RTF_FMASK; - rt->rt_flags |= info->rti_flags & RTF_FMASK; - - if (rt->rt_ifa && rt->rt_ifa->ifa_rtrequest != NULL) - rt->rt_ifa->ifa_rtrequest(RTM_ADD, rt, info); - - /* Alter route MTU if necessary */ - if (rt->rt_ifp != NULL) { - family = info->rti_info[RTAX_DST]->sa_family; - mtu = if_getmtu_family(rt->rt_ifp, family); - /* Set default MTU */ - if (rt->rt_mtu == 0) - rt->rt_mtu = mtu; - if (rt->rt_mtu != mtu) { - /* Check if we really need to update */ - ifmtu.ifp = rt->rt_ifp; - ifmtu.mtu = mtu; - if_updatemtu_cb(rt->rt_nodes, &ifmtu); - } - } - - /* - * This route change may have modified the route's gateway. In that - * case, any inpcbs that have cached this route need to invalidate their - * llentry cache. 
- */ - rnh->rnh_gen++; - - if (ret_nrt) { - *ret_nrt = rt; - RT_ADDREF(rt); - } -bad: - RT_UNLOCK(rt); - if (free_ifa != 0) { - ifa_free(info->rti_ifa); - info->rti_ifa = NULL; - } - return (error); -} - -static void -rt_setmetrics(const struct rt_addrinfo *info, struct rtentry *rt) -{ - - if (info->rti_mflags & RTV_MTU) { - if (info->rti_rmx->rmx_mtu != 0) { - - /* - * MTU was explicitly provided by user. - * Keep it. - */ - rt->rt_flags |= RTF_FIXEDMTU; - } else { - - /* - * User explicitly sets MTU to 0. - * Assume rollback to default. - */ - rt->rt_flags &= ~RTF_FIXEDMTU; - } - rt->rt_mtu = info->rti_rmx->rmx_mtu; - } - if (info->rti_mflags & RTV_WEIGHT) - rt->rt_weight = info->rti_rmx->rmx_weight; - /* Kernel -> userland timebase conversion. */ - if (info->rti_mflags & RTV_EXPIRE) - rt->rt_expire = info->rti_rmx->rmx_expire ? - info->rti_rmx->rmx_expire - time_second + time_uptime : 0; -} - -int -rt_setgate(struct rtentry *rt, struct sockaddr *dst, struct sockaddr *gate) -{ - /* XXX dst may be overwritten, can we move this to below */ - int dlen = SA_SIZE(dst), glen = SA_SIZE(gate); - - /* - * Prepare to store the gateway in rt->rt_gateway. - * Both dst and gateway are stored one after the other in the same - * malloc'd chunk. If we have room, we can reuse the old buffer, - * rt_gateway already points to the right place. - * Otherwise, malloc a new block and update the 'dst' address. - */ - if (rt->rt_gateway == NULL || glen > SA_SIZE(rt->rt_gateway)) { - caddr_t new; - - R_Malloc(new, caddr_t, dlen + glen); - if (new == NULL) - return ENOBUFS; - /* - * XXX note, we copy from *dst and not *rt_key(rt) because - * rt_setgate() can be called to initialize a newly - * allocated route entry, in which case rt_key(rt) == NULL - * (and also rt->rt_gateway == NULL). - * Free()/free() handle a NULL argument just fine. 
- */ - bcopy(dst, new, dlen); - R_Free(rt_key(rt)); /* free old block, if any */ - rt_key(rt) = (struct sockaddr *)new; - rt->rt_gateway = (struct sockaddr *)(new + dlen); - } - - /* - * Copy the new gateway value into the memory chunk. - */ - bcopy(gate, rt->rt_gateway, glen); - - return (0); -} - void rt_maskedcopy(struct sockaddr *src, struct sockaddr *dst, struct sockaddr *netmask) { @@ -1975,7 +1566,6 @@ RIB_RLOCK_TRACKER; struct sockaddr *dst; struct sockaddr *netmask; - struct rtentry *rt = NULL; struct rt_addrinfo info; int error = 0; int startfib, endfib; @@ -1984,6 +1574,7 @@ int a_failure = 0; struct sockaddr_dl *sdl = NULL; struct rib_head *rnh; + struct epoch_tracker et; if (flags & RTF_HOST) { dst = ifa->ifa_dstaddr; @@ -2081,9 +1672,15 @@ } } #endif - error = (rn == NULL || - (rn->rn_flags & RNF_ROOT) || - RNTORT(rn)->rt_ifa != ifa); + error = 0; + struct nhop_object *nh; + if (rn == NULL || (rn->rn_flags & RNF_ROOT)) + error = 1; + else { + nh = RNTORT(rn)->rt_nhop; + if (NH_IS_MULTIPATH(nh) || nh->nh_ifa != ifa) + error = 1; + } RIB_RUNLOCK(rnh); if (error) { /* this is only an error if bad on ALL tables */ @@ -2098,61 +1695,32 @@ info.rti_flags = flags | (ifa->ifa_flags & ~IFA_RTSELF) | RTF_PINNED; info.rti_info[RTAX_DST] = dst; + info.rti_info[RTAX_NETMASK] = netmask; /* * doing this for compatibility reasons */ - if (cmd == RTM_ADD) + struct rib_cmd_info rc; + bzero(&rc, sizeof(rc)); + NET_EPOCH_ENTER(et); + if (cmd == RTM_ADD) { info.rti_info[RTAX_GATEWAY] = (struct sockaddr *)sdl; - else + error = rib_add_route(fibnum, &info, &rc); + if (error == 0) { + rt_addrmsg(cmd, ifa, fibnum); + rt_routemsg(cmd, rc.rt, rc.nh_new, fibnum); + } + } else { info.rti_info[RTAX_GATEWAY] = ifa->ifa_addr; - info.rti_info[RTAX_NETMASK] = netmask; - error = rtrequest1_fib(cmd, &info, &rt, fibnum); - if (error == 0 && rt != NULL) { - /* - * notify any listening routing agents of the change - */ - RT_LOCK(rt); -#ifdef RADIX_MPATH - /* - * in case address alias finds 
the first address - * e.g. ifconfig bge0 192.0.2.246/24 - * e.g. ifconfig bge0 192.0.2.247/24 - * the address set in the route is 192.0.2.246 - * so we need to replace it with 192.0.2.247 - */ - if (memcmp(rt->rt_ifa->ifa_addr, - ifa->ifa_addr, ifa->ifa_addr->sa_len)) { - ifa_free(rt->rt_ifa); - ifa_ref(ifa); - rt->rt_ifp = ifa->ifa_ifp; - rt->rt_ifa = ifa; + error = rib_del_route(fibnum, &info, &rc); + if (error == 0) { + rt_routemsg(cmd, rc.rt, rc.nh_old, fibnum); + rt_addrmsg(cmd, ifa, fibnum); } -#endif - RT_ADDREF(rt); - RT_UNLOCK(rt); - rt_newaddrmsg_fib(cmd, ifa, rt, fibnum); - RT_LOCK(rt); - RT_REMREF(rt); - if (cmd == RTM_DELETE) { - /* - * If we are deleting, and we found an entry, - * then it's been removed from the tree.. - * now throw it away. - */ - RTFREE_LOCKED(rt); - } else { - if (cmd == RTM_ADD) { - /* - * We just wanted to add it.. - * we don't actually need a reference. - */ - RT_REMREF(rt); - } - RT_UNLOCK(rt); - } - didwork = 1; } - if (error) + NET_EPOCH_EXIT(et); + if (error == 0) + didwork = 1; + else a_failure = error; } if (cmd == RTM_DELETE) { @@ -2219,16 +1787,14 @@ * Announce kernel-originated route addition/removal to rtsock based on @rt data. * cmd: RTM_ cmd * @rt: valid rtentry - * @ifp: target route interface + * @nh: nexthop of the route * @fibnum: fib id or RT_ALL_FIBS * * Returns 0 on success. 
*/ int -rt_routemsg(int cmd, struct rtentry *rt, struct ifnet *ifp, int rti_addrs, - int fibnum) +rt_routemsg(int cmd, struct rtentry *rt, struct nhop_object *nh, int fibnum) { - KASSERT(cmd == RTM_ADD || cmd == RTM_DELETE, ("unexpected cmd %d", cmd)); @@ -2237,7 +1803,7 @@ KASSERT(rt_key(rt) != NULL, (":%s: rt_key must be supplied", __func__)); - return (rtsock_routemsg(cmd, rt, ifp, 0, fibnum)); + return (rtsock_routemsg(cmd, rt, nh, fibnum)); } /* @@ -2280,10 +1846,10 @@ if (cmd == RTM_ADD) { rt_addrmsg(cmd, ifa, fibnum); if (rt != NULL) - rt_routemsg(cmd, rt, ifa->ifa_ifp, 0, fibnum); + rt_routemsg(cmd, rt, rt->rt_nhop, fibnum); } else { if (rt != NULL) - rt_routemsg(cmd, rt, ifa->ifa_ifp, 0, fibnum); + rt_routemsg(cmd, rt, rt->rt_nhop, fibnum); rt_addrmsg(cmd, ifa, fibnum); } } Index: sys/net/route/mpath_ctl.c =================================================================== --- /dev/null +++ sys/net/route/mpath_ctl.c @@ -0,0 +1,558 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2020 Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include "opt_inet.h" +#include "opt_route.h" +#include "opt_route_mpath.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include + +/* + * This file contains the supporting functions for adding/deleting/updating + * multipath routes to the routing table. Terms "Nexthop group" and "multipath + * groups" are used in this file interchangeably. 
+ */ + +VNET_DEFINE(u_int, fib_hash_outbound) = 0; +SYSCTL_UINT(_net_route, OID_AUTO, hash_outbound, CTLFLAG_RD | CTLFLAG_VNET, + &VNET_NAME(fib_hash_outbound), 0, + "Compute flowid for locally-originated packets"); + +static int try_add_route_mpath(struct rib_head *rnh, struct rtentry *rt, + struct weightened_nhop *wn_orig, struct rt_addrinfo *info, + struct rib_cmd_info *rc); +static int try_del_route_mpath(struct rib_head *rnh, struct rtentry *rt, + struct nhgrp_object **pmp_orig, struct rt_addrinfo *info, + struct rib_cmd_info *rc); + + +/* Default entropy to add to the hash calculation for the outbound connections*/ +uint8_t mpath_entropy_key[MPATH_ENTROPY_KEY_LEN] = { + 0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2, + 0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0, + 0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4, + 0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c, + 0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa, +}; + +/* + * Tries to add route denoted by @rt and nhop @rt->rt_nhop to the (potentially mpath) + * nhop denoted by @wn_orig. @rt and @rt->rt_nhop are referenced. + * + * Returns 0 on success. @rt and @rt->rt_nhop references are consumed. + * EAGAIN if the original condition has changed. Updates nhop ptr in @wn_orig. + * errno otherwise. + * It is caller responsibility to free references for @rt and @rt->rt_nhop + * in case of an error. + */ +static int +try_add_route_mpath(struct rib_head *rnh, struct rtentry *rt, + struct weightened_nhop *wn_orig, struct rt_addrinfo *info, + struct rib_cmd_info *rc) +{ + struct nhgrp_object *mp; + struct rtentry *rt_new; + struct sockaddr *ndst, *netmask; + struct weightened_nhop wn[2], *wn_tmp; + void *rn = NULL; + uint64_t addmask; + uint32_t num_nhops_tmp; + int error; + + ndst = (struct sockaddr *)rt_key(rt); + netmask = info->rti_info[RTAX_NETMASK]; + + /* + * @rt->rt_nhop represents new non-mpath nhop to be added. 
+ * nhop in @wn_orig can be multipath (which is confusing) + * + * Try to create multipath group based on the joined nexthops + * above. + */ + + wn[0].nh = rt->rt_nhop; + wn[0].weight = rt->rt_weight; + + if (!NH_IS_MULTIPATH(wn_orig->nh)) { + /* Simple merge of 2 non-multipath nexthops */ + if (wn_orig->nh == rt->rt_nhop) { + /* + * This nexthop already exists. If the goal + * was to change weight, a change request should + * have been executed for this nexthop. + */ + return (EEXIST); + } + wn[1].nh = wn_orig->nh; + wn[1].weight = wn_orig->weight; + + mp = nhgrp_get_group(rnh, wn, 2, &error); + if (mp != NULL) { + /* Calculate addition mask */ + wn_tmp = nhgrp_get_nhops(mp, &num_nhops_tmp); + if (wn_tmp[0].nh == rt->rt_nhop) + addmask = 1 << 0; + else + addmask = 1 << 1; + } + } else { + /* Get new nhop group with @rt->rt_nhop as an additional nhop */ + mp = nhgrp_append_nhops(rnh, (struct nhgrp_object *)wn_orig->nh, + wn, 1, &addmask, &error); + } + + /* + * As we haven't referenced multipath groups/nhops it is possible that + * object in @wn_orig got scheduled for deletion. In that case, we need + * to re-fetch latest data from the RIB and retry. + * + * It is also possible that allocation simply fails. In that case, return + * immediately. + */ + if (mp == NULL) { + if (error != EAGAIN) { + /* + * Some fatal allocation problem, most likely + * memory-related. + */ + return (error); + } + /* + * Rare case when the @wn_orig data got scheduled for deletion. + * Zero the original data to indicate the need to refill it for + * the code below. + */ + wn_orig->nh = NULL; + wn_orig->weight = 0; + } + + RIB_WLOCK(rnh); + + rt_new = (struct rtentry *)rnh->rnh_lookup(ndst, netmask, &rnh->head); + if (rt_new == NULL) { + + /* + * Our prefix got deleted, let's add proposed single route path + * and return.
+ */ + RT_LOCK(rt); + rn = rnh->rnh_addaddr(ndst, netmask, &rnh->head, rt->rt_nodes); + if (rn != NULL) + rnh->rnh_gen++; + RIB_WUNLOCK(rnh); + + /* In any case, dereference created multipath group */ + if (mp != NULL) + nhgrp_free_group(mp); + + if (rn == NULL) { + /* + * Route addition failed, while there is no existing prefix. + * Most probably, we ran out of memory. + */ + RT_UNLOCK(rt); + RTSTAT_INC(rts_add_algo_fail); + return (ENOMEM); + } + + /* + * Success! As a result, single-path route has been added. + * By convention, references for original @rt and @rt->rt_nhop + * gets consumed. + */ + + /* Finalise notification */ + rc->rt = rt; + rc->rt_weight = rt->rt_weight; + rc->nh_new = rt->rt_nhop; + RT_UNLOCK(rt); + + return (0); + } + + /* Prefix exists, try to update */ + RT_LOCK(rt_new); + if ((rt_new->rt_nhop == wn_orig->nh) && + (rt_new->rt_weight == wn_orig->weight)) { + /* + * Most common case: nhop/mpath group hasn't changed. + * Flip to the new precalculated one and return. + */ + DPRINTF("mpath replace: %p -> %p", wn_orig->nh, mp); + rt_new->rt_nhop = (struct nhop_object *)mp; + + rnh->rnh_gen++; + RIB_WUNLOCK(rnh); + RT_UNLOCK(rt_new); + + rc->rt = rt_new; + rc->nh_old = wn_orig->nh; + rc->nh_new = (struct nhop_object *)mp; + rc->mask_changed = addmask; + + /* + * As original @rt and @rt->rt_nhop was not used directly, + * unref them. + */ + nhop_free_object(rt->rt_nhop); + RTFREE(rt); + + return (0); + } else { + /* + * Nhop/mpath group has been updated. + * Need to backout the work done in this cycle and + * return EAGAIN indicating the caller to retry. + * + * Check the new nexthop before returning is it + * may be ineligible for multipath. + */ + RIB_WUNLOCK(rnh); + + /* unref our newly-created group */ + if (mp != NULL) + nhgrp_free_group(mp); + + /* + * Check if current in-tree nhop is eligible + * for multipath and update @wn_orig with + * its data. 
+ */ + if (can_nh_multipath(rt_new->rt_nhop)) { + wn_orig->nh = rt_new->rt_nhop; + wn_orig->weight = rt_new->rt_weight; + error = EAGAIN; + } else + error = EEXIST; + RT_UNLOCK(rt_new); + + return (error); + } + + return (0); +} + +/* + * Tries to add @rt->rt_nhop to the existing set of nhops (@nh_orig) in the @rt prefix. + * @rt and @nh_orig are referenced and unlocked. + * + * On success: returns 0. Function consumes rt and rt->rt_nhop references. + * @rc gets populated with referenced objects. + * Otherwise: errno is returned, caller responsibility is to unlock/free rt and + * rt->rt_nhop. + */ +int +add_route_mpath(struct rib_head *rnh, struct rtentry *rt, struct nhop_object *nh_orig, + u_long weight_orig, struct rt_addrinfo *info, struct rib_cmd_info *rc) +{ + int error; + struct weightened_nhop wn; + + /* + * 1) In the presense of multiple rtsock speakers such as some + * loadbalancer-like automation there can be some contention present. + * As multiple adds/changes should not (from user standpoint) change + * the result of operation, retry the request multiple times. + * 2) In more common situation, most of the large-fib updates + * are done by the routing daemon via the single route socket, thus the + * contention should be minimal. + * With the above statements, optimize for the simplest case while still + * retain the possibility of retrying. + */ + wn.nh = nh_orig; + wn.weight = weight_orig; + for (int i = 0; i < RIB_MAX_RETRIES; i++) { + error = try_add_route_mpath(rnh, rt, &wn, info, rc); + if (error != EAGAIN) + break; + RTSTAT_INC(rts_add_retry); + } + + if (V_fib_hash_outbound == 0 && error == 0 && + NH_IS_MULTIPATH(rc->nh_new)) { + /* + * First multipath route got installed. Enable local + * outbound connections hashing. 
+ */ + if (bootverbose) + printf("FIB: enabled flowid calculation for locally-originated packets\n"); + V_fib_hash_outbound = 1; + } + + return (error); +} + +/* + * Deletes paths matching gw from @info, from the route defined + * by @rt and mpath group defined by @mp_orig + * + */ +static int +try_del_route_mpath(struct rib_head *rnh, struct rtentry *rt, + struct nhgrp_object **pmp_orig, struct rt_addrinfo *info, + struct rib_cmd_info *rc) +{ + struct sockaddr *gw, *ndst, *netmask; + struct nhgrp_object *mp, *mp_orig; + struct nhop_object *nh_new; + struct weightened_nhop *wn; + unsigned long weight_new; + uint32_t num_nhops; + uint64_t del_mask; + int count, error; + + ndst = (struct sockaddr *)rt_key(rt); + netmask = info->rti_info[RTAX_NETMASK]; + gw = info->rti_info[RTAX_GATEWAY]; + mp_orig = *pmp_orig; + + KASSERT((mp_orig->mp_flags & MPF_MULTIPATH), ("mp_orig not mpath")); + + del_mask = 0; + count = 0; + + wn = nhgrp_get_nhops(mp_orig, &num_nhops); + for (uint32_t i = 0; i < num_nhops; i++) { + if (rib_match_nhop_gw(wn[i].nh, gw)) { + del_mask |= (1 << i); + count++; + } + } + + if (count == 0) { + /* + * Unable to find any matching nexthop to delete. + */ + return (ESRCH); + } + + weight_new = 0; + + if (num_nhops > count + 1) { + /* + * The result will still be a multipath group. + * mp is returned unlocked&referenced + */ + mp = nhgrp_get_del_nhops(rnh, mp_orig, &del_mask, &error); + if (mp == NULL) + return (error); + nh_new = (struct nhop_object *)mp; + } else if (num_nhops == count) { + /* All nexthops has been deleted, request prefix deletion */ + nh_new = NULL; + } else { + /* + * Not multipath group anymore. Set nh_new to the last remaining + * nexthop. 
+ */ + nh_new = NULL; + for (uint32_t i = 0; i < num_nhops; i++) { + if ((del_mask & (1 << i)) == 0) { + nh_new = wn[i].nh; + weight_new = wn[i].weight; + break; + } + } + KASSERT((nh_new != NULL), ("nh_new == NULL")); + nhop_ref_object(nh_new); + } + + /* New nexhop or nexthop group is stored in @nh_new and referenced. */ + + RIB_WLOCK(rnh); + + rt = (struct rtentry *)rnh->rnh_lookup(ndst, netmask, &rnh->head); + + if (rt == NULL) { + /* + * Our prefix got deleted. + * Free resources and return. + */ + RIB_WUNLOCK(rnh); + + if (nh_new != NULL) + nhop_free_any(nh_new); + + return (ESRCH); + } + + /* Prefix still exists, try to update */ + if (rt->rt_nhop == (struct nhop_object *)mp_orig) { + + error = 0; + /* + * Nhop/mpath group hasn't changed. Flip + * to the new precalculated one and return + */ + if (nh_new == NULL) { + /* + * Delete all of the routes for + * the multipath prefix. + */ + struct radix_node *rn; + rn = rnh->rnh_deladdr(ndst, netmask, &rnh->head); + if (rn == NULL) { + RIB_WUNLOCK(rnh); + RTSTAT_INC(rts_del_algo_fail); + return (ESRCH); + } + } else { + /* Changing nexthop to a new one */ + RT_LOCK(rt); + rt->rt_nhop = nh_new; + if (weight_new != 0) + rt->rt_weight = weight_new; + RT_UNLOCK(rt); + } + + if (error == 0) + rnh->rnh_gen++; + RIB_WUNLOCK(rnh); + + if (error != 0) { + nhop_free_object(nh_new); + return (error); + } + + /* Prepare notification */ + + rc->rt = rt; + rc->nh_old = (struct nhop_object *)mp_orig; + rc->mask_changed = del_mask; + rc->nh_new = nh_new; + + /* Unref mp_orig, as it was referenced when attached to rte */ + nhgrp_free_group(mp_orig); + + return (0); + } + + /* + * Nexthop has changed. Check if it is not multipath anymore + */ + if (!NH_IS_MULTIPATH(rt->rt_nhop)) { + int error = del_route_one(rnh, rt, info); + + RIB_WUNLOCK(rnh); + /* + * Regardless of operation result, created multipath + * group is not needed anymore, hence free it. 
+ */ + if (nh_new != NULL) + nhop_free_any(nh_new); + + if (error != 0) + return (error); + + /* Successfully deleted, prepare operation result */ + rc->rt = rt; + rc->nh_old = rt->rt_nhop; + rc->rt_weight = rt->rt_weight; + + return (0); + } + + /* + * The updated nexthop is a new multipath group. + * Need to restart the operation. + */ + mp_orig = (struct nhgrp_object *)rt->rt_nhop; + RIB_WUNLOCK(rnh); + + if (nh_new != NULL) + nhop_free_any(nh_new); + + *pmp_orig = mp_orig; + + return (EAGAIN); +} + +/* + * Deletes paths specified in @info gateway from multipath route @rt + * with multupath group @mp_orig. + * + * Returns 0 on success, with filling @rc with operation results. + */ +int +del_route_mpath(struct rib_head *rnh, struct rtentry *rt, + struct nhgrp_object *mp_orig, struct rt_addrinfo *info, + struct rib_cmd_info *rc) +{ + int error; + + /* + * 1) In the presense of multiple rtsock speakers such as some + * loadbalancer-like automation there can be some contention present. + * As multiple adds/changes should not (from user standpoint) change + * the result of operation, retry the request multiple times. + * 2) In more common situation, most of the large-fib updates + * are done by the routing daemon via the single route socket, thus the + * contention should be minimal. + * With the above statements, optimize for the simplest case while still + * retain the possibility of retrying. + */ + + for (int i = 0; i < RIB_MAX_RETRIES; i++) { + error = try_del_route_mpath(rnh, rt, &mp_orig, info, rc); + if (error != EAGAIN) + break; + RTSTAT_INC(rts_del_retry); + } + + return (error); +} + + Index: sys/net/route/nhgrp.h =================================================================== --- /dev/null +++ sys/net/route/nhgrp.h @@ -0,0 +1,50 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2020 Alexander V. 
Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _NET_ROUTE_NHGRP_H_ +#define _NET_ROUTE_NHGRP_H_ + +#define MPF_MULTIPATH 0x08 /* need to be consistent with NHF_MULTIPATH */ + +struct nhgrp_object { + uint16_t mp_flags; /* mpath flags */ + uint8_t mp_size; /* size of mpath group used in selection */ + uint8_t spare; + struct nhop_object *nhops[0]; /* nhops */ +}; + +struct weightened_nhop { + struct nhop_object *nh; + uint32_t weight; +}; + +struct nhop_mpath; +struct weightened_nhop *mpath_get_nhops(struct nhop_mpath *mp, uint32_t *pnum_nhops); + +#endif Index: sys/net/route/nhgrp.c =================================================================== --- /dev/null +++ sys/net/route/nhgrp.c @@ -0,0 +1,321 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2020 Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include "opt_inet.h" +#include "opt_route.h" +#include "opt_route_mpath.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include + +/* + * This file contains data structures management logic for the nexthop + * groups ("nhgrp") route subsystem. + * + * Nexthop groups are used to store multiple routes available for the specific + * prefix. Nexthop groups are immutable and can be shared across multiple + * prefixes. + * + * Each group consists of a control plane part and a dataplane part. + * Control plane is basically a collection of nexthop objects with + * weights and refcount. + * + * Datapath consists of a array of nexthop pointers, compiled from control + * plane data to support O(1) nexthop selection. 
+ * + * For example, consider the following group: + * [(nh1, weight=100), (nh2, weight=200)] + * It will compile to the following array: + * [nh1, nh2, nh2] + * + */ + +static void consider_resize(struct nh_control *ctl, uint32_t new_nh_buckets, uint32_t new_idx_items); + +static int cmp_nhgrp(const struct nhgrp_priv *a, const struct nhgrp_priv *b); +static unsigned int hash_nhgrp(const struct nhgrp_priv *obj); + +static int +cmp_nhgrp(const struct nhgrp_priv *a, const struct nhgrp_priv *b) +{ + /* + * In case of consistent hashing, there can be multiple multipath groups + * with the same "control plane" list of nexthops with weights and a + * different set of "data plane" nexthops. + * For now, ignore the data plane and focus on the control plane list. + */ + if (a->gr_nh_count != b->gr_nh_count) + return (0); + return !memcmp(a->gr_nh_weights, b->gr_nh_weights, + sizeof(struct weightened_nhop) * a->gr_nh_count); +} + +/* + * Hash callback: calculate hash of an object + */ +static unsigned int +hash_nhgrp(const struct nhgrp_priv *obj) +{ + const unsigned char *key; + + key = (const unsigned char *)obj->gr_nh_weights; + + return (djb_hash(key, sizeof(struct weightened_nhop) * obj->gr_nh_count)); +} + +/* + * Returns object referenced and unlocked + */ +struct nhgrp_priv * +find_nhgrp(struct nh_control *ctl, const struct nhgrp_priv *key) +{ + struct nhgrp_priv *priv_ret; + + NHOPS_RLOCK(ctl); + CHT_SLIST_FIND_BYOBJ(&ctl->gr_head, mpath, key, priv_ret); + if (priv_ret != NULL) { + if (refcount_acquire_if_not_zero(&priv_ret->gr_refcnt) == 0) { + /* refcount was 0 -> group os being deleted */ + priv_ret = NULL; + } + } + NHOPS_RUNLOCK(ctl); + + return (priv_ret); +} + +int +link_nhgrp(struct nh_control *ctl, struct nhgrp_priv *grp_priv) +{ + uint16_t idx; + uint32_t new_num_buckets, new_num_items; + + NHOPS_WLOCK(ctl); + /* Check if we need to resize hash and index */ + new_num_buckets = CHT_SLIST_GET_RESIZE_BUCKETS(&ctl->gr_head); + new_num_items = 
bitmask_get_resize_items(&ctl->gr_idx_head); + + if (bitmask_alloc_idx(&ctl->gr_idx_head, &idx) != 0) { + NHOPS_WUNLOCK(ctl); + DPRINTF("Unable to allocate mpath index"); + consider_resize(ctl, new_num_buckets, new_num_items); + return (0); + } + + MP_PRIV_LOCK(grp_priv); + grp_priv->gr_idx = idx; + grp_priv->nh_control = ctl; + CHT_SLIST_INSERT_HEAD(&ctl->gr_head, mpath, grp_priv); + MP_PRIV_UNLOCK(grp_priv); + + NHOPS_WUNLOCK(ctl); + + consider_resize(ctl, new_num_buckets, new_num_items); + + return (1); +} + +struct nhgrp_priv * +unlink_nhgrp(struct nh_control *ctl, struct nhgrp_priv *key) +{ + struct nhgrp_priv *mp_ret; + int ret, idx; + + NHOPS_WLOCK(ctl); + + CHT_SLIST_REMOVE_BYOBJ(&ctl->gr_head, mpath, key, mp_ret); + + if (mp_ret == NULL) { + DPRINTF("Unable to find nhop group!"); + NHOPS_WUNLOCK(ctl); + return (NULL); + } + + idx = mp_ret->gr_idx; + ret = bitmask_free_idx(&ctl->gr_idx_head, idx); + MP_PRIV_LOCK(mp_ret); + mp_ret->gr_idx = 0; + mp_ret->nh_control = NULL; + MP_PRIV_UNLOCK(mp_ret); + + NHOPS_WUNLOCK(ctl); + + return (mp_ret); +} + +/* + * Checks if hash needs resizing and performs this resize if necessary + * + */ +__noinline static void +consider_resize(struct nh_control *ctl, uint32_t new_nh_buckets, uint32_t new_idx_items) +{ + void *nh_ptr, *nh_idx_ptr; + void *old_idx_ptr; + size_t alloc_size; + + nh_ptr = NULL ; + if (new_nh_buckets != 0) { + alloc_size = CHT_SLIST_GET_RESIZE_SIZE(new_nh_buckets); + nh_ptr = malloc(alloc_size, M_NHOP, M_NOWAIT | M_ZERO); + } + + nh_idx_ptr = NULL; + if (new_idx_items != 0) { + alloc_size = bitmask_get_size(new_idx_items); + nh_idx_ptr = malloc(alloc_size, M_NHOP, M_NOWAIT | M_ZERO); + } + + if (nh_ptr == NULL && nh_idx_ptr == NULL) + return; + + DPRINTF("mp: going to resize: nh:[ptr:%p sz:%u] idx:[ptr:%p sz:%u]", + nh_ptr, new_nh_buckets, nh_idx_ptr, new_idx_items); + + old_idx_ptr = NULL; + + NHOPS_WLOCK(ctl); + if (nh_ptr != NULL) { + CHT_SLIST_RESIZE(&ctl->gr_head, mpath, nh_ptr, new_nh_buckets); + } 
+ if (nh_idx_ptr != NULL) { + if (bitmask_copy(&ctl->gr_idx_head, nh_idx_ptr, new_idx_items)) + bitmask_swap(&ctl->nh_idx_head, nh_idx_ptr, new_idx_items, &old_idx_ptr); + } + NHOPS_WUNLOCK(ctl); + + if (nh_ptr != NULL) + free(nh_ptr, M_NHOP); + if (old_idx_ptr != NULL) + free(old_idx_ptr, M_NHOP); +} + +/* + * Function allocating the necessary group data structures. + */ +int +nhgrp_ctl_alloc_default(struct nh_control *ctl, int malloc_flags) +{ + size_t alloc_size; + uint32_t num_buckets, num_items; + void *cht_ptr, *mask_ptr; + + + num_buckets = 8; + alloc_size = CHT_SLIST_GET_RESIZE_SIZE(num_buckets); + cht_ptr = malloc(alloc_size, M_NHOP, malloc_flags | M_ZERO); + + if (cht_ptr == NULL) { + DPRINTF("mpath init failed"); + return (0); + } + + /* + * Allocate nexthop index bitmask. + */ + num_items = 128; + mask_ptr = malloc(bitmask_get_size(num_items), M_NHOP, malloc_flags | M_ZERO); + if (mask_ptr == NULL) { + DPRINTF("mpath bitmask init failed"); + free(cht_ptr, M_NHOP); + return (0); + } + + /* + * Reinit hash, as the previous instance contained 0 items. + */ + NHOPS_WLOCK(ctl); + + if (ctl->gr_head.hash_size == 0) { + CHT_SLIST_INIT(&ctl->gr_head, cht_ptr, num_buckets); + bitmask_init(&ctl->gr_idx_head, mask_ptr, num_items); + NHOPS_WUNLOCK(ctl); + } else { + /* Other thread has already initiliazed hash/bitmask */ + NHOPS_WUNLOCK(ctl); + free(cht_ptr, M_NHOP); + free(mask_ptr, M_NHOP); + } + + DPRINTF("mpath init done for fib/af %d/%d", ctl->rh->rib_fibnum, + ctl->rh->rib_family); + + return (1); +} + +int +nhgrp_ctl_init(struct nh_control *ctl) +{ + /* + * By default, do not allocate datastructures as multipath + * routes will not be necessarily used. 
+ */ + CHT_SLIST_INIT(&ctl->gr_head, NULL, 0); + bitmask_init(&ctl->gr_idx_head, NULL, 0); + + return (0); +} + +void +nhgrp_ctl_free(struct nh_control *ctl) +{ + + if (ctl->gr_head.ptr != NULL) + free(ctl->gr_head.ptr, M_NHOP); + if (ctl->gr_idx_head.idx != NULL) + free(ctl->gr_idx_head.idx, M_NHOP); +} + Index: sys/net/route/nhgrp_ctl.c =================================================================== --- /dev/null +++ sys/net/route/nhgrp_ctl.c @@ -0,0 +1,823 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2020 Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include "opt_inet.h" +#include "opt_route.h" +#include "opt_route_mpath.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +
/*
 * This file contains the supporting functions for creating multipath groups
 * and compiling their dataplane parts.
 */

/* MPF_MULTIPATH must be the same as NHF_MULTIPATH for nhop selection to work */
CTASSERT(MPF_MULTIPATH == NHF_MULTIPATH);
/* Offset and size of flags field has to be the same for nhop/nhop groups */
CHK_STRUCT_FIELD_GENERIC(struct nhop_object, nh_flags, struct nhgrp_object, mp_flags);
/* Cap multipath to 64, as the larger values would break rib_cmd_info bmasks */
CTASSERT(RIB_MAX_MPATH_WIDTH <= 64);

static int wn_cmp(const void *a, const void *b);
static void sort_weightened_nhops(struct weightened_nhop *wn, int num_nhops);

static struct nhgrp_priv *get_nhgrp(struct nh_control *ctl,
    struct weightened_nhop *wn, int num_nhops, int *perror);
static void destroy_nhgrp(struct nhgrp_priv *gr_priv);
static void destroy_nhgrp_epoch(epoch_context_t ctx);
static void free_nhgrp_nhops(struct nhgrp_priv *gr_priv);

static int dump_nhgrp_entry(struct rib_head *rh, struct nhgrp_priv *grp_priv,
    char *buffer, struct sysctl_req *w);


/*
 * qsort(3)-style comparator for 'struct weightened_nhop':
 * orders by weight ascending, ties broken by nexthop pointer value.
 */
static int
wn_cmp(const void *a, const void *b)
{
	const struct weightened_nhop *lhs = a;
	const struct weightened_nhop *rhs = b;

	/* Primary key: weight */
	if (lhs->weight != rhs->weight)
		return ((lhs->weight > rhs->weight) ? 1 : -1);

	/* Secondary key: compare nexthops by pointer */
	if (lhs->nh != rhs->nh)
		return ((lhs->nh > rhs->nh) ? 1 : -1);

	return (0);
}

/*
 * Perform in-place sorting for array of nexthops in @wn.
+ * + * To avoid nh groups duplication, nexthops/weights in the + * @wn need to be ordered deterministically. + * As this sorting is needed only for the control plane functionality, + * there are no specific external requirements. + * + * Sort by weight first, to ease calculation of the slot sizes. + */ +static void +sort_weightened_nhops(struct weightened_nhop *wn, int num_nhops) +{ + + qsort(wn, num_nhops, sizeof(struct weightened_nhop), wn_cmp); +} + +/* + * Calculate minimum number of slots required to fit the existing + * set of weights while maintaining weight coefficints. + * + * Assume @wn is sorted by weight ascending and each weight is > 0. + * + * Some examples: + * nh=1,weight=1 nh=2,weight=2 -> 3 slots [1, 2, 2] + * nh=1,weight=1000 nh=2,weight=2000 -> 3 slots: [1, 2, 2] + * nh=1,weight=17 nh=2,weight=37 -> 3 slots: [1, 2, 2] + * nh=1,weight=1 nh=2,weight=70 -> 64 slots: [1, 2, 2, ..] + */ +static uint32_t +calc_min_mpath_slots(const struct weightened_nhop *wn, size_t num_items) +{ + uint32_t i, last, xmin; + uint64_t v, total = 0; + + last = 0; + xmin = wn[0].weight; + for (i = 0; i < num_items; i++) { + total += wn[i].weight; + if ((wn[i].weight - last < xmin) && (wn[i].weight != last)) + xmin = wn[i].weight - last; + last = wn[i].weight; + } + // got minimum unit of desired accuracy + v = total / xmin; + if (v > RIB_MAX_MPATH_WIDTH) { + /* + * TODO: round to the MAX_MPATH and + * see if this reduces the group size. + */ + v = RIB_MAX_MPATH_WIDTH; + } + + return (uint32_t)v; +} + +/* + * Nexthop group data consists of + * 1) dataplane part, with nhgrp_object as a header followed by an + * arbitrary number of nexthop pointers. + * 2) control plane part, with nhgrp_priv as a header, followed by + * an arbirtrary number of 'struct weightened_nhop' object. + * + * Given nexthop groups are (mostly) immutable, allocate all data + * in one go. 
+ * + */ +__noinline static size_t +get_nhgrp_alloc_size(uint32_t mp_size, uint32_t num_nhops) +{ + size_t sz; + + sz = sizeof(struct nhgrp_object); + sz += mp_size * sizeof(struct nhop_object *); + sz += sizeof(struct nhgrp_priv); + sz += num_nhops * sizeof(struct weightened_nhop); + return (sz); +} + + +/* + * Compile actual list of nexthops to be used by datapath from + * the nexthop group @dst. + * + * For example, compiling control plane list of 2 nexthops + * [(200, A), (100, B)] would result in the datapath array + * [A, A, B] + */ +static void +compile_nhgrp(struct nhgrp_priv *dst_priv, const struct weightened_nhop *x, + uint32_t num_slots) +{ + struct nhgrp_object *dst; + int i, slot_idx, remaining_slots; + uint64_t remaining_sum, nh_weight, nh_slots; + + slot_idx = 0; + dst = dst_priv->gr; + /* Calculate sum of all weights */ + remaining_sum = 0; + for (i = 0; i < dst_priv->gr_nh_count; i++) + remaining_sum += x[i].weight; + remaining_slots = num_slots; + DPRINTF("O: %u/%u", (uint32_t)remaining_sum, remaining_slots); + for (i = 0; i < dst_priv->gr_nh_count; i++) { + /* Calculate number of slots for the current nexthop */ + if (remaining_sum > 0) { + nh_weight = (uint64_t)x[i].weight; + nh_slots = (nh_weight * remaining_slots / remaining_sum); + } else + nh_slots = 0; + + remaining_sum -= x[i].weight; + remaining_slots -= nh_slots; + + DPRINTF(" OO[%d]: %u/%u curr=%d slot_idx=%d", i, + (uint32_t)remaining_sum, remaining_slots, + (int)nh_slots, slot_idx); + + while (nh_slots-- > 0) + dst->nhops[slot_idx++] = x[i].nh; + } +} + +/* + * Allocates new multipath group for the list of weightened nexthops. + * Does NOT reference any nexthops in the group. + * Returns group with refcount=1 or NULL. 
+ */ +static struct nhgrp_priv * +alloc_nhgrp(struct weightened_nhop *wn, int num_nhops) +{ + uint32_t mpath_size; + int flags = M_NOWAIT; + struct nhgrp_object *mp; + struct nhgrp_priv *grp_priv; + + sort_weightened_nhops(wn, num_nhops); + + mpath_size = calc_min_mpath_slots(wn, num_nhops); + if (mpath_size == 0) { + /* Zero weights, abort */ + return (NULL); + } + + size_t sz = get_nhgrp_alloc_size(mpath_size, num_nhops); + mp = malloc(sz, M_NHOP, flags | M_ZERO); + if (mp == NULL) { + return (NULL); + } + + /* Has to be the first to make NHGRP_PRIV() work */ + mp->mp_size = mpath_size; + DPRINTF("new mpath group: num_nhops: %u", (uint32_t)mpath_size); + mp->mp_flags = MPF_MULTIPATH; + + grp_priv = NHGRP_PRIV(mp); + grp_priv->gr_nh_count = num_nhops; + refcount_init(&grp_priv->gr_refcnt, 1); + grp_priv->gr = mp; + memcpy(&grp_priv->gr_nh_weights[0], wn, + num_nhops * sizeof(struct weightened_nhop)); + + compile_nhgrp(grp_priv, wn, mp->mp_size); + + MP_PRIV_LOCK_INIT(grp_priv); + + return (grp_priv); +} + +void +nhgrp_free_group(struct nhgrp_object *gr) +{ + struct nhgrp_priv *gr_priv; + struct nh_control *ctl; + + gr_priv = NHGRP_PRIV(gr); + + if (!refcount_release(&gr_priv->gr_refcnt)) + return; + + MP_PRIV_LOCK(gr_priv); + ctl = gr_priv->nh_control; + MP_PRIV_UNLOCK(gr_priv); + + if (ctl != NULL) { + if (unlink_nhgrp(ctl, gr_priv) == NULL) { + /* Failed to unlink. Do not try to reclaim */ + /* XXX: error reporting */ + return; + } + } + + epoch_call(net_epoch_preempt, destroy_nhgrp_epoch, + &gr_priv->gr_epoch_ctx); +} + +/* + * Destroys all local resources belonging to @gr_priv. 
+ */ +__noinline static void +destroy_nhgrp_int(struct nhgrp_priv *gr_priv) +{ + + MP_PRIV_LOCK(gr_priv); + MP_PRIV_LOCK_DESTROY(gr_priv); + + free(gr_priv->gr, M_NHOP); +} + +__noinline static void +destroy_nhgrp(struct nhgrp_priv *gr_priv) +{ + + KASSERT((gr_priv->gr_refcnt == 0), ("gr_refcnt != 0")); + + DPRINTF("DEL MPATH %p", gr_priv); + + KASSERT((gr_priv->gr_idx == 0), ("gr_idx != 0")); + + free_nhgrp_nhops(gr_priv); + + destroy_nhgrp_int(gr_priv); +} + +/* + * Epoch callback indicating group is safe to destroy + */ +static void +destroy_nhgrp_epoch(epoch_context_t ctx) +{ + struct nhgrp_priv *gr_priv; + + gr_priv = __containerof(ctx, struct nhgrp_priv, gr_epoch_ctx); + + destroy_nhgrp(gr_priv); +} + + +static int +ref_nhgrp(struct nhgrp_priv *gr_priv) +{ + + return (refcount_acquire_if_not_zero(&gr_priv->gr_refcnt)); +} + +int +nhgrp_ref_group(struct nhgrp_object *gr) +{ + + return (ref_nhgrp(NHGRP_PRIV(gr))); +} + +static int +ref_nhgrp_nhops(struct nhgrp_priv *gr_priv) +{ + + for (int i = 0; i < gr_priv->gr_nh_count; i++) { + if (nhop_ref_object(gr_priv->gr_nh_weights[i].nh) != 0) + continue; + + /* + * Failed to ref the nexthop, b/c it's deleted. + * Need to rollback references back. + */ + for (int j = 0; j < i; j++) + nhop_free_object(gr_priv->gr_nh_weights[j].nh); + return (0); + } + + return (1); +} + +static void +free_nhgrp_nhops(struct nhgrp_priv *gr_priv) +{ + + for (int i = 0; i < gr_priv->gr_nh_count; i++) + NH_FREE(gr_priv->gr_nh_weights[i].nh); +} + + +/* + * Creates or looks up an existing nexthop group based on @wn and @num_nhops. + * + * Returns referenced nhop group or NULL, passing error code in @perror. + */ +struct nhgrp_priv * +get_nhgrp(struct nh_control *ctl, struct weightened_nhop *wn, int num_nhops, + int *perror) +{ + struct nhgrp_priv *key, *grp_priv; + + if (ctl->gr_head.hash_size == 0) { + /* First multipath request. Bootstrap mpath datastructures. 
*/ + if (nhgrp_ctl_alloc_default(ctl, M_NOWAIT) == 0) { + *perror = ENOMEM; + return (NULL); + } + } + + if ((key = alloc_nhgrp(wn, num_nhops)) == NULL) { + *perror = ENOMEM; + return (NULL); + } + + grp_priv = find_nhgrp(ctl, key); + if (grp_priv != NULL) { + /* + * Free originally-created group. As it hasn't been linked + * and the dependent nexhops haven't been referenced, just free + * the group. + */ + destroy_nhgrp_int(key); + *perror = 0; + return (grp_priv); + } else { + /* No existing group, try to link the new one */ + if (ref_nhgrp_nhops(key) == 0) { + /* + * Some of the nexthops have been sheduled for deletion. + * As the group hasn't been linked / no nexhops have been + * referenced, call the final destructor immediately. + */ + destroy_nhgrp_int(key); + *perror = EAGAIN; + return (NULL); + } + if (link_nhgrp(ctl, key) == 0) { + /* Unable to allocate index? */ + *perror = EAGAIN; + destroy_nhgrp(key); + } + *perror = 0; + return (key); + } + + /* NOTREACHED */ +} + +/* + * Creates/finds nexthop group based on @wn and @num_nhops. + * Returns referenced group or NULL, with an error in @perror. + * + * If the error is EAGAIN, then the operation can be retried. + */ +struct nhgrp_object * +nhgrp_get_group(struct rib_head *rh, struct weightened_nhop *wn, int num_nhops, + int *perror) +{ + struct nh_control *ctl = rh->nh_control; + struct nhgrp_priv *gr_priv; + + gr_priv = get_nhgrp(ctl, wn, num_nhops, perror); + + if (gr_priv != NULL) + return (gr_priv->gr); + + return (NULL); +} + +/* + * Creates new nexthop group based on @src group with the nexthops defined in bitmask + * @nhop_mask removed. + * Returns referenced nexthop group or NULL on failure. 
+ */ +struct nhgrp_object * +nhgrp_get_del_nhops(struct rib_head *rh, const struct nhgrp_object *src, + uint64_t *nhop_mask, int *perror) +{ + char storage[64]; + struct nh_control *ctl = rh->nh_control; + struct weightened_nhop *pnhops; + const struct nhgrp_priv *mp_priv, *src_priv; + size_t sz; + int i, num_nhops; + + src_priv = NHGRP_PRIV_CONST(src); + + sz = src_priv->gr_nh_count * (sizeof(struct weightened_nhop)); + /* optimize for <= 4 paths, each path=16 bytes */ + if (sz <= sizeof(storage)) + pnhops = (struct weightened_nhop *)&storage[0]; + else { + pnhops = malloc(sz, M_TEMP, M_NOWAIT); + if (pnhops == NULL) { + *perror = ENOMEM; + return (NULL); + } + } + + /* Copy nhops first */ + num_nhops = 0; + for (i = 0; i < src_priv->gr_nh_count; i++) { + /* Do not copy deleted nexthops */ + if (nhop_mask[i / 64] & (1 << (i % 64))) + continue; + memcpy(&pnhops[num_nhops++], &src_priv->gr_nh_weights[i], + sizeof(struct weightened_nhop)); + } + + KASSERT((num_nhops >= 2), ("num_nhops < 2 after deletion")); + + + mp_priv = get_nhgrp(ctl, pnhops, num_nhops, perror); + + if (pnhops != (struct weightened_nhop *)&storage[0]) + free(pnhops, M_TEMP); + + if (mp_priv == NULL) + return (NULL); + + return (mp_priv->gr); +} + + +#if 0 +/* + * Adds new nexthop to existing multipath group or a single nexthop. + * + */ +struct nhgrp_object * +nhgrp_append_nhop(struct rib_head *rh, struct nhop_object *nh_orig, + u_long weight_orig, struct nhop_object *nh_new, u_long weight_new, + uint64_t *addmask, uint64_t *changemask) +{ + struct nh_control *ctl = rh->nh_control; + struct weightened_nhop wn[2]; + struct nhop_mpath *mp; + + /* + * Add our nexthop we try to add as a first one. + */ + wn[0].nh = nh_new; + wn[0].weight = weight_new; + if (!NH_IS_MULTIPATH(nh_orig)) { + /* + * Original nexthop is not multipath. + * Request new multipath group consisting of 2 nexthops. 
+ */ + wn[1].nh = nh_orig; + wn[1].weight = weight_orig; + mp = get_nhgrp(ctl, wn, 2); + *addmask = 1 << 0; + *changemask = 0; + } else { + /* + * Original nexthop is already multipath. + * Create a new multipath group from existing group + * and the new nexthop. + */ + struct nhop_mpath *mp_orig = (struct nhop_mpath *)nh_orig; + *addmask = 0; + *changemask = 0; + mp = nhgrp_append_nhops(ctl, mp_orig, wn, 1, addmask, + changemask); + DPRINTF("mpath append returned %p from %p\n", mp, mp_orig); + } + /* mp has now referenced all nexthops */ + + return (mp); +} +#endif + +/* + * Appends one or more nexthops denoted by @wn to the nexthop group @gr_orig. + * @num_nhops is the number of entries in @wn. + * + * Returns referenced nexthop group or NULL. In the latter case, @perror is + * filled with an error code. + * On success, @paddmask has a bit set for each position of the new group + * that was matched to one of the appended nexthops. + * Note that the function does NOT check whether the new nexthops already + * exist in the @gr_orig. As a result, they will be added, resulting in the + * same nexthop being present multiple times in the new group. + */ +struct nhgrp_object * +nhgrp_append_nhops(struct rib_head *rh, const struct nhgrp_object *gr_orig, + struct weightened_nhop *wn, int num_nhops, uint64_t *paddmask, int *perror) +{ + char storage[64]; + struct weightened_nhop *pnhops; + const struct weightened_nhop *c_wn; + const struct nhgrp_priv *src_priv, *mp_priv; + size_t sz; + int curr_nhops; + + src_priv = NHGRP_PRIV_CONST(gr_orig); + curr_nhops = src_priv->gr_nh_count; + + *perror = 0; + + sz = (src_priv->gr_nh_count + num_nhops) * (sizeof(struct weightened_nhop)); + /* optimize for <= 4 paths, each path=16 bytes */ + if (sz <= sizeof(storage)) + pnhops = (struct weightened_nhop *)&storage[0]; + else { + pnhops = malloc(sz, M_TEMP, M_NOWAIT); + if (pnhops == NULL) { + *perror = ENOMEM; + return (NULL); + } + } + + /* Copy nhops from original group first, then the appended ones */ + memcpy(pnhops, src_priv->gr_nh_weights, + curr_nhops * sizeof(struct weightened_nhop)); + memcpy(&pnhops[curr_nhops], wn, num_nhops * sizeof(struct weightened_nhop)); + curr_nhops +=
num_nhops; + + mp_priv = get_nhgrp(rh->nh_control, pnhops, curr_nhops, perror); + + if (pnhops != (struct weightened_nhop *)&storage[0]) + free(pnhops, M_TEMP); + + if (mp_priv == NULL) + return (NULL); + + /* + * Nhops are reordered in the new nexthop group. + * Walk through old&new groups to calculate an addition mask. + * TODO: optimize for large multipath groups. + */ + *paddmask = 0; + for (int i = 0; i < num_nhops; i++) { + for (int j = 0; j < curr_nhops; j++) { + c_wn = &mp_priv->gr_nh_weights[j]; + if ((wn[i].nh == c_wn->nh) && (wn[i].weight == c_wn->weight)) { + /* Found; 64-bit shift matches the uint64_t mask */ + *paddmask |= 1ULL << j; + break; + } + } + } + + return (mp_priv->gr); +} + +/* + * Replaces nexthop with index @replace_idx in @gr_orig with the new one in @wn. + * + * Returns new referenced nhop group or NULL. + */ +struct nhgrp_object * +nhgrp_get_replace_nhop(struct rib_head *rh, const struct nhgrp_object *gr_orig, + struct weightened_nhop *wn, uint8_t replace_idx, uint64_t *pmodmask, + int *perror) +{ + char storage[64]; + struct weightened_nhop *pnhops; + const struct weightened_nhop *c_wn; + const struct nhgrp_priv *src_priv, *mp_priv; + size_t sz; + int curr_nhops; + + src_priv = NHGRP_PRIV_CONST(gr_orig); + curr_nhops = src_priv->gr_nh_count; + + if (replace_idx >= src_priv->gr_nh_count) { + *perror = EINVAL; + return (NULL); + } + + *perror = 0; + + sz = src_priv->gr_nh_count * (sizeof(struct weightened_nhop)); + /* optimize for <= 4 paths, each path=16 bytes */ + if (sz <= sizeof(storage)) + pnhops = (struct weightened_nhop *)&storage[0]; + else { + pnhops = malloc(sz, M_TEMP, M_NOWAIT); + if (pnhops == NULL) { + *perror = ENOMEM; + return (NULL); + } + } + + /* Copy nhops from original group & update the relevant nhop */ + memcpy(pnhops, src_priv->gr_nh_weights, + src_priv->gr_nh_count * sizeof(struct weightened_nhop)); + pnhops[replace_idx] = *wn; + + mp_priv = get_nhgrp(rh->nh_control, pnhops, src_priv->gr_nh_count, perror); + + if (pnhops != (struct weightened_nhop
*)&storage[0]) + free(pnhops, M_TEMP); + + if (mp_priv == NULL) + return (NULL); + + /* + * In the resulting group, nhop can be reordered. + * Re-iterave over the group to calculate the addition mask. + * TODO: optimize for large multipath groups. + */ + for (int i = 0; i < mp_priv->gr_nh_count; i++) { + c_wn = &mp_priv->gr_nh_weights[i]; + if ((wn[i].nh == c_wn->nh) && (wn[i].weight == c_wn->weight)) { + /* Found */ + *pmodmask |= 1 << i; + break; + } + } + + return (mp_priv->gr); +} + +/* + * Returns pointer to array of nexthops with weights for + * given @mp. Stores number of items in the array into @pnum_nhops. + * XXX: const! + */ +struct weightened_nhop * +nhgrp_get_nhops(struct nhgrp_object *mp, uint32_t *pnum_nhops) +{ + struct nhgrp_priv *grp_priv; + + KASSERT(((mp->mp_flags & MPF_MULTIPATH) != 0), ("nhop is not mpath")); + + grp_priv = NHGRP_PRIV(mp); + *pnum_nhops = grp_priv->gr_nh_count; + + return (grp_priv->gr_nh_weights); +} + +__noinline static int +dump_nhgrp_entry(struct rib_head *rh, struct nhgrp_priv *grp_priv, + char *buffer, struct sysctl_req *w) +{ + + struct rt_msghdr *rtm; + struct mpath_external *mpe; + struct nhgrp_object *mp; + struct mpath_nhop_external *ext; + uint32_t *pidx; + int error; + size_t sz; + + //DPRINTF("Dumping: head %p nh %p flags %X req %p\n", rh, nh, nh->nh_flags, w); + + mp = grp_priv->gr; + + sz = sizeof(struct rt_msghdr) + sizeof(struct mpath_external); + sz += sizeof(struct mpath_nhop_external) * grp_priv->gr_nh_count; + sz += sizeof(uint32_t) * mp->mp_size; + + bzero(buffer, sz); + + rtm = (struct rt_msghdr *)buffer; + rtm->rtm_msglen = sz; + rtm->rtm_version = RTM_VERSION; + rtm->rtm_type = RTM_GET; + + mpe = (struct mpath_external *)(rtm + 1); + + mpe->mp_idx = grp_priv->gr_idx; + mpe->mp_refcount = grp_priv->gr_refcnt; + mpe->mp_nh_count = grp_priv->gr_nh_count; + mpe->mp_group_size = mp->mp_size; + + ext = (struct mpath_nhop_external *)(mpe + 1); + for (int i = 0; i < grp_priv->gr_nh_count; i++) { + 
ext[i].nh_idx = grp_priv->gr_nh_weights[i].nh->nh_priv->nh_idx; + ext[i].nh_weight = grp_priv->gr_nh_weights[i].weight; + } + + pidx = (uint32_t *)&ext[grp_priv->gr_nh_count]; + for (int i = 0; i < mp->mp_size; i++) + pidx[i] = mp->nhops[i]->nh_priv->nh_idx; + + error = SYSCTL_OUT(w, buffer, sz); + + /* + DPRINTF("Exported %d ifindex %d family %d type %d error %d\n", nh->nh_priv->nh_idx, pnhe->ifindex, + pnhe->nh_family, pnhe->nh_type, error); + */ + + return (error); +} + +int +nhgrp_dump_sysctl(struct rib_head *rh, struct sysctl_req *w) +{ + struct nh_control *ctl = rh->nh_control; + struct epoch_tracker et; + struct nhgrp_priv *grp_priv; + char *buffer; + size_t sz; + int error; + + if (ctl->gr_head.items_count == 0) + return (0); + + sz = sizeof(struct mpath_external); + sz += (sizeof(struct mpath_nhop_external) + sizeof(uint32_t)) * + RIB_MAX_MPATH_WIDTH; + buffer = malloc(sz, M_TEMP, M_WAITOK); + + DPRINTF("NHGRP DUMP: count=%u", ctl->gr_head.items_count); + NET_EPOCH_ENTER(et); + NHOPS_RLOCK(ctl); + CHT_SLIST_FOREACH(&ctl->gr_head, mpath, grp_priv) { + error = dump_nhgrp_entry(rh, grp_priv, buffer, w); + if (error != 0) { + NHOPS_RUNLOCK(ctl); + NET_EPOCH_EXIT(et); + free(buffer, M_TEMP); + return (error); + } + } CHT_SLIST_FOREACH_END; + NHOPS_RUNLOCK(ctl); + NET_EPOCH_EXIT(et); + + free(buffer, M_TEMP); + + return (0); +} Index: sys/net/route/nhgrp_var.h =================================================================== --- /dev/null +++ sys/net/route/nhgrp_var.h @@ -0,0 +1,85 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2020 Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * This header file contains private definitions for the nexthop groups. + * + * Header is not intended to be included by the code external to the + * routing subsystem. 
+ */ + +#ifndef _NET_ROUTE_NHGRP_VAR_H_ +#define _NET_ROUTE_NHGRP_VAR_H_ + +/* define mpath hash table */ +struct nhgrp_priv; +//CHT_SLIST_DEFINE(mpath, struct nhgrp_priv); + +/* Hash definition */ +/* produce hash value for an object */ +#define mpath_hash_obj(_obj) (hash_nhgrp(_obj)) +/* compare two objects */ +#define mpath_cmp(_one, _two) (cmp_nhgrp(_one, _two)) +/* next object accessor */ +#define mpath_next(_obj) (_obj)->gr_priv_next + +struct nhgrp_priv { + uint32_t gr_idx; /* group index */ + uint8_t gr_nh_count; /* number of items in gr_nh_weights */ + uint8_t gr_spare[3]; + u_int gr_refcnt; /* use refcount */ + struct mtx gr_mtx; /* mutex */ + struct nh_control *nh_control; /* parent control structure */ + struct nhgrp_priv *gr_priv_next; /* next object, see mpath_next() */ + struct nhgrp_object *gr; /* group object this private data belongs to */ + struct epoch_context gr_epoch_ctx; /* epoch data for nhop */ + struct weightened_nhop gr_nh_weights[0]; /* gr_nh_count nexthops with weights */ +}; + +/* The private data is located right behind the nhops[mp_size] array of the group */ +#define _NHGRP_PRIV(_src) (&(_src)->nhops[(_src)->mp_size]) +#define NHGRP_PRIV(_src) ((struct nhgrp_priv *)_NHGRP_PRIV(_src)) +#define NHGRP_PRIV_CONST(_src) ((const struct nhgrp_priv *)_NHGRP_PRIV(_src)) + +/* Wrappers around the per-group mutex gr_mtx */ +#define MP_PRIV_LOCK_INIT(_priv) mtx_init(&(_priv)->gr_mtx, "nhgrp", NULL, MTX_DEF) +#define MP_PRIV_LOCK(_priv) mtx_lock(&(_priv)->gr_mtx) +#define MP_PRIV_UNLOCK(_priv) mtx_unlock(&(_priv)->gr_mtx) +#define MP_PRIV_LOCK_DESTROY(_priv) mtx_destroy(&(_priv)->gr_mtx) +#define MP_PRIV_LOCK_ASSERT(_priv) mtx_assert(&(_priv)->gr_mtx, MA_OWNED) + +/* mpath */ +struct weightened_nhop; + +/* nhgrp.c */ +int nhgrp_ctl_alloc_default(struct nh_control *ctl, int malloc_flags); +struct nhgrp_priv *find_nhgrp(struct nh_control *ctl, const struct nhgrp_priv *key); +int link_nhgrp(struct nh_control *ctl, struct nhgrp_priv *grp_priv); +struct nhgrp_priv *unlink_nhgrp(struct nh_control *ctl, struct nhgrp_priv *key); + +#endif + + Index: sys/net/route/nhop.h =================================================================== --- /dev/null +++ sys/net/route/nhop.h @@ -0,0 +1,228 @@ +/*- + * SPDX-License-Identifier:
BSD-2-Clause-FreeBSD + * + * Copyright (c) 2020 Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * This header file contains public definitions for the nexthop routing subsystem. 
+ */ + +#ifndef _NET_ROUTE_NHOP_H_ +#define _NET_ROUTE_NHOP_H_ + +#include /* sockaddr_in && sockaddr_in6 */ + +#include + +enum nhop_type { + NH_TYPE_IPV4_ETHER_RSLV = 1, /* IPv4 ethernet without GW */ + NH_TYPE_IPV4_ETHER_NHOP = 2, /* IPv4 with pre-calculated ethernet encap */ + NH_TYPE_IPV6_ETHER_RSLV = 3, /* IPv6 ethernet, without GW */ + NH_TYPE_IPV6_ETHER_NHOP = 4 /* IPv6 with pre-calculated ethernet encap */ +}; + +#ifdef _KERNEL + +/* + * Currently the only use case of AF_LINK gateway is storing + * interface index of the interface of the source IPv6 address. + * This is used by the IPv6 code for the connections over loopback + * interface. + * + * The structure below copies 'struct sockaddr_dl', reducing the + * size of sdl_data buffer, as it is not used. This change + * allows storing the AF_LINK gateways in the nhop gateway itself, + * simplifying control plane handling. + */ +struct sockaddr_dl_short { + u_char sdl_len; /* Total length of sockaddr */ + u_char sdl_family; /* AF_LINK */ + u_short sdl_index; /* if != 0, system given index for interface */ + u_char sdl_type; /* interface type */ + u_char sdl_nlen; /* interface name length, no trailing 0 reqd. */ + u_char sdl_alen; /* link level address length */ + u_char sdl_slen; /* link layer selector length */ + char sdl_data[8]; /* unused */ +}; + +/* Mask of the RTF_ route flags covered by this macro */ +#define NHOP_RELATED_FLAGS \ + (RTF_GATEWAY | RTF_HOST | RTF_REJECT | RTF_BLACKHOLE | \ + RTF_FIXEDMTU | RTF_LOCAL | RTF_BROADCAST | RTF_MULTICAST) + +/* Nexthop request: input of the alloc_nhop()/fill_nhop() helpers in nhop_ctl.c */ +struct nhop_request { + struct ifnet *ifp; + struct ifaddr *ifa; + struct sockaddr *gw; /* gateway; AF_LINK is handled specially by get_aifp() */ + int family; + int mtu; + int rt_flags; /* gets converted to nh_flags later */ + uint16_t nh_type; + uint16_t nh_flags_additional; /* Additional flags to set to the nh_flags */ +}; + +struct nh_control; +struct nhop_priv; + +/* + * Struct 'nhop_object' field description: + * + * nh_flags: NHF_ flags used in the dataplane code. NHF_GATEWAY or NHF_BLACKHOLE + * can be examples of such flags.
+ * nh_mtu: ready-to-use nexthop mtu. Already accounts for the link-level header, + * interface MTU and protocol-specific limitations. + * nh_prepend_len: link-level prepend length. Currently unused. + * nh_ifp: logical transmit interface. The one from which if_transmit() will be + * called. Guaranteed to be non-NULL. + * nh_aifp: ifnet of the source address. Same as nh_ifp except IPv6 loopback + * routes. See the example below. + * nh_ifa: interface address to use. Guaranteed to be non-NULL. + * nh_pksent: counter(9) reflecting the number of packets transmitted. + * + * gw_: storage suitable to hold AF_INET, AF_INET6 or AF_LINK gateway. More + * details are available in the examples below. + * + * + * Direct routes (routes w/o gateway): + * NHF_GATEWAY is NOT set. + * nh_ifp denotes the logical transmit interface (). + * nh_aifp is the same as nh_ifp + * gw_sa contains AF_LINK sa with nh_aifp ifindex (compat) + * Loopback routes: + * NHF_GATEWAY is NOT set. + * nh_ifp points to the loopback interface (lo0). + * nh_aifp points to the interface where the destination address belongs to. + * This is useful in IPv6 link-local-over-loopback communications. + * gw_sa contains AF_LINK sa with nh_aifp ifindex (compat) + * GW routes: + * NHF_GATEWAY is set. + * nh_ifp denotes the logical transmit interface. + * nh_aifp is the same as nh_ifp + * gw_sa contains L3 address (either AF_INET or AF_INET6). + * + * + * Note: struct nhop_object fields are ordered in a way that + * supports memcmp-based comparisons: cmp_priv() compares the first + * NHOP_END_CMP bytes, i.e. every field located before nh_pksent. + * + */ +#define NHOP_END_CMP (__offsetof(struct nhop_object, nh_pksent)) + +struct nhop_object { + uint16_t nh_flags; /* nhop flags */ + uint16_t nh_mtu; /* nexthop mtu */ + union { + struct sockaddr_in gw4_sa; /* GW accessor as IPv4 */ + struct sockaddr_in6 gw6_sa; /* GW accessor as IPv6 */ + struct sockaddr gw_sa; + struct sockaddr_dl_short gwl_sa; /* AF_LINK gw (compat) */ + char gw_buf[28]; + }; + struct ifnet *nh_ifp; /* Logical egress interface.
Always != NULL */ + struct ifaddr *nh_ifa; /* interface address to use. Always != NULL */ + struct ifnet *nh_aifp; /* ifnet of the source address. Always != NULL */ + counter_u64_t nh_pksent; /* packets sent using this nhop */ + /* 32 bytes + 4xPTR == 64(amd64) / 48(i386) */ + uint8_t nh_prepend_len; /* length of prepend data */ + uint8_t spare[3]; + uint32_t spare1; /* alignment */ + char nh_prepend[56]; /* L2 prepend */ + /* -- 128 bytes -- */ + struct nhop_priv *nh_priv; /* control plane data */ + uint8_t spare2[16]; +}; + +/* + * Nhop validity. + * + * Currently we verify whether link is up or not on every packet, which can be + * quite costly. + * TODO: subscribe for the interface notifications and update the nexthops + * with NHF_INVALID flag. + */ + +//#define NH_IS_VALID(_nh) (((_nh)->nh_flags & NHF_INVALID) == 0) +#define NH_IS_VALID(_nh) RT_LINK_IS_UP((_nh)->nh_ifp) +#define NH_IS_MULTIPATH(_nh) ((_nh)->nh_flags & NHF_MULTIPATH) + +/* Gateway of the route's nexthop, viewed as a generic sockaddr */ +#define RT_GATEWAY(_rt) ((struct sockaddr *)&(_rt)->rt_nhop->gw4_sa) +#define RT_GATEWAY_CONST(_rt) ((const struct sockaddr *)&(_rt)->rt_nhop->gw4_sa) + +/* Calls nhop_free_object() on @_nh and then resets the caller's pointer to NULL */ +#define NH_FREE(_nh) do { \ + nhop_free_object(_nh); \ + /* guard against invalid refs */ \ + _nh = NULL; \ +} while (0) + + +void nhop_free_object(struct nhop_object *nh); + +struct sysctl_req; +struct sockaddr_dl; +struct rib_head; + +uint32_t nhop_get_idx(const struct nhop_object *nh); +void nhop_free(struct nhop_object *nh); + +#endif + +/* Kernel <> userland structures */ + +struct nhop_external { + uint32_t nh_idx; + uint32_t nh_fib; + uint32_t ifindex; /* transmit interface ifindex */ + uint32_t aifindex; /* address ifindex */ + uint8_t nh_family; /* address family */ + uint16_t nh_type; /* nexthop type */ + uint16_t nh_mtu; /* nexthop mtu */ + + uint16_t nh_flags; /* nhop flags */ + struct in_addr nh_addr; /* GW/DST IPv4 address */ + struct in_addr nh_src; /* default source IPv4 address */ + uint64_t nh_pksent; + /* control plane */ + /* lookup key: address, family,
type */ + char nh_prepend[64]; /* L2 prepend */ + uint8_t prepend_len; /* length of the prepend */ + uint64_t nh_refcount; /* number of references */ +}; + +/* + * One weighted member of an exported multipath group, as filled in by + * dump_nhgrp_entry(): the index of the nexthop and its weight. + */ +struct mpath_nhop_external { + uint32_t nh_idx; + uint32_t nh_weight; +}; + +/* + * Header of an exported multipath group. dump_nhgrp_entry() emits it right + * after struct rt_msghdr, followed by mp_nh_count struct mpath_nhop_external + * records and then by mp_group_size uint32_t nexthop indexes. + */ +struct mpath_external { + uint32_t mp_idx; /* group index */ + uint32_t mp_refcount; /* group reference count */ + uint32_t mp_nh_count; /* number of weighted nexthops */ + uint32_t mp_group_size; /* number of entries in the group's nhops[] array */ +}; + + +#endif + + Index: sys/net/route/nhop.c =================================================================== --- /dev/null +++ sys/net/route/nhop.c @@ -0,0 +1,346 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2020 Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE.
+ * + * $FreeBSD$ + */ + + +#include +#include "opt_inet.h" +#include "opt_route.h" +#include "opt_route_mpath.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include + +/* + * This file contains data structures management logic for the nexthop ("nhop") + * route subsystem. + * + * Nexthops in the original sense are the objects containing all the necessary + * information to forward the packet to the selected destination. + * In particular, nexthop is defined by a combination of + * ifp, ifa, aifp, mtu, gw addr(if set), nh_type, nh_family, mask of rt_flags and + * NHF_DEFAULT + * + * All nexthops are stored in the resizable hash table. + * Additionally, each nexthop gets assigned its unique index (nexthop index) + * so userland programs can interact with the nexthops more easily. Index + * allocation is backed by the bitmask array. + */ + +MALLOC_DEFINE(M_NHOP, "nhops", "nexthops data"); + + +/* + * Hash management functions + * + * nhops_init() allocates the nexthop control structure for @rh together with + * its hash table and index bitmask and attaches it to @rh->nh_control. + * All allocations are done with M_WAITOK; the function always returns 0. + */ + +int +nhops_init(struct rib_head *rh) +{ + struct nh_control *ctl; + size_t alloc_size; + uint32_t num_buckets, num_items; + void *ptr; + + ctl = malloc(sizeof(struct nh_control), M_NHOP, M_WAITOK | M_ZERO); + + /* + * Allocate nexthop hash. Start with 16 items by default (128 bytes). + * This will be enough for most of the cases. + */ + num_buckets = 16; + alloc_size = CHT_SLIST_GET_RESIZE_SIZE(num_buckets); + ptr = malloc(alloc_size, M_NHOP, M_WAITOK | M_ZERO); + CHT_SLIST_INIT(&ctl->nh_head, ptr, num_buckets); + + /* + * Allocate nexthop index bitmask.
+ */ + num_items = 128 * 8; /* 1024 indexes, i.e. 128 bytes of bits */ + ptr = malloc(bitmask_get_size(num_items), M_NHOP, M_WAITOK | M_ZERO); + bitmask_init(&ctl->nh_idx_head, ptr, num_items); + + NHOPS_LOCK_INIT(ctl); + + /* Cross-link the rib head and its nexthop control structure */ + rh->nh_control = ctl; + ctl->rh = rh; + +#ifdef ROUTE_MPATH + nhgrp_ctl_init(ctl); +#endif + + return (0); +} + +/* + * Releases the nexthop control structure of @rh: detaches the nexthops + * still present in the hash, then frees the hash storage, the index + * bitmask and the control structure itself. + */ +void +nhops_destroy(struct rib_head *rh) +{ + struct nh_control *ctl; + struct nhop_priv *nh_priv; + + ctl = rh->nh_control; + + /* + * All routes should have been deleted in rt_table_destroy(). + * However, TCP stack or other consumers may store referenced + * nexthop pointers. When these references go to zero, + * nhop_free_object() will try to unlink these records from the + * data structures, most likely leading to panic. + * + * Avoid that by explicitly marking all of the remaining + * nexthops as unlinked. + */ + + NHOPS_RLOCK(ctl); + CHT_SLIST_FOREACH(&ctl->nh_head, nhops, nh_priv) { + DPRINTF("Unlinking referenced nhop %u", nh_priv->nh_idx); + NH_PRIV_LOCK(nh_priv); + nh_priv->nh_control = NULL; + nh_priv->nh_idx = 0; + NH_PRIV_UNLOCK(nh_priv); + } CHT_SLIST_FOREACH_END; + NHOPS_RUNLOCK(ctl); + +#ifdef ROUTE_MPATH + nhgrp_ctl_free(ctl); +#endif + free(ctl->nh_head.ptr, M_NHOP); + free(ctl->nh_idx_head.idx, M_NHOP); + free(ctl, M_NHOP); +} + +/* + * Nexthops distribution: + * + * 2 "mandatory" nexthops per interface ("interface route", "loopback"). + * For direct peering: 1 nexthop for the peering router per ifp/af. + * For Ix-like peering: tens to hundreds nexthops of neighbors per ifp/af. + * IGP control plane & broadcast segment: tens of nexthops per ifp/af. + * + * With that in mind, hash nexthops by the combination of the interface + * and GW IP address.
+ */ +struct _hash_data { + uint16_t ifindex; + uint8_t family; + uint8_t nh_type; + uint32_t gw_addr; +}; + +/* + * Calculates the hash of the nexthop @priv from the low 16 bits of the + * transmit interface index, the gateway address family, the low 8 bits + * of the nexthop type and 4 bytes of the gateway address (the IPv4 + * address or the last 32 bits of the IPv6 address). + */ +static uint32_t +hash_priv(const struct nhop_priv *priv) +{ + struct nhop_object *nh; + uint16_t ifindex; + struct _hash_data key; + + nh = priv->nh; + ifindex = nh->nh_ifp->if_index & 0xFFFF; + /* Zero the key first so that gw_addr is defined for other families */ + memset(&key, 0, sizeof(key)); + + key.ifindex = ifindex; + key.family = nh->gw_sa.sa_family; + key.nh_type = priv->nh_type & 0xFF; + if (nh->gw_sa.sa_family == AF_INET6) + memcpy(&key.gw_addr, &nh->gw6_sa.sin6_addr.s6_addr32[3], 4); + else if (nh->gw_sa.sa_family == AF_INET) + memcpy(&key.gw_addr, &nh->gw4_sa.sin_addr, 4); + + return (uint32_t)(djb_hash((const unsigned char *)&key, sizeof(key))); +} + +/* + * Resizes the nexthop hash and/or the index bitmask of @ctl. + * + * @new_nh_buckets and @new_idx_items are the requested new sizes; zero + * means that the corresponding structure does not need resizing. + * The new storage is allocated with M_NOWAIT before the write lock is + * taken. A failed allocation only skips the corresponding resize. + */ +__noinline static void +consider_resize(struct nh_control *ctl, uint32_t new_nh_buckets, uint32_t new_idx_items) +{ + void *nh_ptr, *nh_idx_ptr; + void *old_idx_ptr; + size_t alloc_size; + + nh_ptr = NULL ; + if (new_nh_buckets != 0) { + alloc_size = CHT_SLIST_GET_RESIZE_SIZE(new_nh_buckets); + nh_ptr = malloc(alloc_size, M_NHOP, M_NOWAIT | M_ZERO); + } + + nh_idx_ptr = NULL; + if (new_idx_items != 0) { + alloc_size = bitmask_get_size(new_idx_items); + nh_idx_ptr = malloc(alloc_size, M_NHOP, M_NOWAIT | M_ZERO); + } + + if (nh_ptr == NULL && nh_idx_ptr == NULL) { + /* No resize requested or both allocations failed */ + return; + } + + DPRINTF("going to resize: nh:[ptr:%p sz:%u] idx:[ptr:%p sz:%u]", nh_ptr, + new_nh_buckets, nh_idx_ptr, new_idx_items); + + old_idx_ptr = NULL; + + NHOPS_WLOCK(ctl); + if (nh_ptr != NULL) { + CHT_SLIST_RESIZE(&ctl->nh_head, nhops, nh_ptr, new_nh_buckets); + } + if (nh_idx_ptr != NULL) { + if (bitmask_copy(&ctl->nh_idx_head, nh_idx_ptr, new_idx_items)) { + bitmask_swap(&ctl->nh_idx_head, nh_idx_ptr, new_idx_items, &old_idx_ptr); + } else { + /* + * The new bitmask was not installed: schedule it for + * release below instead of leaking it. + */ + old_idx_ptr = nh_idx_ptr; + } + } + NHOPS_WUNLOCK(ctl); + + /* Release the no longer used storage outside of the lock */ + if (nh_ptr != NULL) + free(nh_ptr, M_NHOP); + if (old_idx_ptr != NULL) + free(old_idx_ptr, M_NHOP); +} + +/* + * Allocates an index for @nh_priv and inserts it into the nexthop hash + * of @ctl. Returns the allocated index or 0 if no index is available. + */ +__noinline int
+link_nhop(struct nh_control *ctl, struct nhop_priv *nh_priv) +{ + uint16_t idx; + uint32_t new_num_buckets, new_num_items; + + //old_idx = nh->nh_priv->nh_idx; + + NHOPS_WLOCK(ctl); + /* + * Check if we need to resize hash and index. The sizes are sampled + * under the lock; the resize itself is done by consider_resize() + * after the lock is dropped. + */ + new_num_buckets = CHT_SLIST_GET_RESIZE_BUCKETS(&ctl->nh_head); + new_num_items = bitmask_get_resize_items(&ctl->nh_idx_head); + + if (bitmask_alloc_idx(&ctl->nh_idx_head, &idx) != 0) { + NHOPS_WUNLOCK(ctl); + DPRINTF("Unable to allocate nhop index"); + consider_resize(ctl, new_num_buckets, new_num_items); + return (0); + } + + nh_priv->nh_idx = idx; + nh_priv->nh_control = ctl; + CHT_SLIST_INSERT_HEAD(&ctl->nh_head, nhops, nh_priv); + + NHOPS_WUNLOCK(ctl); + + DPRINTF("Linked nhop priv %p to %d, hash %u, ctl %p", nh_priv, idx, + hash_priv(nh_priv), ctl); + consider_resize(ctl, new_num_buckets, new_num_items); + + return (idx); +} + +/* + * Removes @nh_priv from the nexthop hash of @ctl and releases its index. + * The removed nexthop gets its nh_idx and nh_control cleared. + * Returns the nexthop removed from the hash. + */ +__noinline struct nhop_priv * +unlink_nhop(struct nh_control *ctl, struct nhop_priv *nh_priv) +{ + struct nhop_priv *priv_ret; + int idx, ret; + uint32_t new_num_buckets, new_num_items; + + idx = 0; + ret = 0; + + NHOPS_WLOCK(ctl); + CHT_SLIST_REMOVE_BYOBJ(&ctl->nh_head, nhops, nh_priv, priv_ret); + + if (priv_ret != NULL) { + NH_PRIV_LOCK(priv_ret); + idx = priv_ret->nh_idx; + priv_ret->nh_idx = 0; + priv_ret->nh_control = NULL; + NH_PRIV_UNLOCK(priv_ret); + + KASSERT((idx != 0), ("bogus nhop index 0")); + ret = bitmask_free_idx(&ctl->nh_idx_head, idx); + } + + /* Check if we need to resize hash and index */ + new_num_buckets = CHT_SLIST_GET_RESIZE_BUCKETS(&ctl->nh_head); + new_num_items = bitmask_get_resize_items(&ctl->nh_idx_head); + + NHOPS_WUNLOCK(ctl); + + if (priv_ret == NULL) + DPRINTF("Unable to unlink nhop priv %p from hash, hash %u ctl %p", + nh_priv,hash_priv(nh_priv), ctl); + /* Unlinking a nexthop that is not in the hash is a caller bug */ + KASSERT(priv_ret != NULL, ("nhop priv %p is not linked to ctl %p", + nh_priv, ctl)); + if (ret != 0) + DPRINTF("Unable to unlink index %d from nhop %p", idx, priv_ret->nh); + + DPRINTF("Unlinked nhop %p priv idx %d", nh_priv, idx); + + consider_resize(ctl,
new_num_buckets, new_num_items); + + return (priv_ret); +} + +/* + * Looks up the nexthop matching @nh_priv in the hash of @ctl under the + * read lock. + * Returns the nexthop found with its nh_refcnt incremented, or NULL if + * there is no such nexthop or its reference count is already zero. + */ +__noinline struct nhop_priv * +find_nhop(struct nh_control *ctl, struct nhop_priv *nh_priv) +{ + struct nhop_priv *nh_priv_ret; + + //DPRINTF("--- start search ---"); + NHOPS_RLOCK(ctl); + CHT_SLIST_FIND_BYOBJ(&ctl->nh_head, nhops, nh_priv, nh_priv_ret); + if (nh_priv_ret != NULL) { + if (refcount_acquire_if_not_zero(&nh_priv_ret->nh_refcnt) == 0){ + /* refcount was 0 -> nhop is being deleted */ + nh_priv_ret = NULL; + } + } + NHOPS_RUNLOCK(ctl); + + //if (nh_priv_ret == NULL) + // DPRINTF("--- end search (not found) ---"); + return (nh_priv_ret); +} + + Index: sys/net/route/nhop_ctl.c =================================================================== --- /dev/null +++ sys/net/route/nhop_ctl.c @@ -0,0 +1,740 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2020 Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + + +#include +#include "opt_inet.h" +#include "opt_route.h" +#include "opt_route_mpath.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include + +/* + * This file contains core functionality for the nexthop ("nhop") route subsystem. + * The business logic needed to create nexhop objects is implemented here. + * + * Nexthops in the original sense are the objects containing all the necessary + * information to forward the packet to the selected destination. + * In particular, nexthop is defined by a combination of + * ifp, ifa, aifp, mtu, gw addr(if set), nh_type, nh_family, mask of rt_flags and + * NHF_DEFAULT + * + * Additionally, each nexthop gets assigned its unique index (nexthop index) + * so userland programs can interact with the nexthops easier. Index allocation + * is backed by the bitmask array. + * All nexthops are stored in the resizable hash table. + * + * Basically, this file revolves around supproring 2 functions: + * 1) fill_nhop(), which contains all business logic on filling the nexthop fields + * based on the provided request + * 2) nhop_get(), which gets a nexthop based on the provided request. 
+ * + * + * Conventions: + * 1) non-exported functions start with a verb + * 2) exported functions start with the subsystem prefix: "nhop" + * + */ + +static struct nhop_object *alloc_nhop(const struct nhop_request *req); +static int fill_nhop(const struct nhop_request *req, struct nhop_object *nh); +static int dump_nhop_entry(struct rib_head *rh, struct nhop_object *nh, struct sysctl_req *w); + +static struct nhop_object *alloc_nhop_tmp(const struct nhop_request *req); +static void free_nhop_tmp(struct nhop_object *nh); + +static struct ifnet *get_aifp(const struct nhop_request *req); +static void fill_sdl_from_ifp(struct sockaddr_dl_short *sdl, const struct ifnet *ifp); + +static void destroy_nhop_epoch(epoch_context_t ctx); +static void destroy_nhop(struct nhop_priv *nh_priv); + +/* The layout of the fields preceding nh_ifp is fixed: they occupy 32 bytes */ +_Static_assert(__offsetof(struct nhop_object, nh_ifp) == 32, + "nhop_object size mistmatch"); + +/* + * Fetches the interface of the route source address. + * In all cases except interface-addresses it would be the + * same as the transmit interface. + * However, for the interface address this function will return + * this interface ifp instead of loopback. This is needed to support + * link-local IPv6 loopback communications. + * + * Returns referenced ifp. + */ +static struct ifnet * +get_aifp(const struct nhop_request *req) +{ + struct ifnet *aifp = NULL; + struct sockaddr_dl *sdl; + struct epoch_tracker et; + + /* + * Adjust the "outgoing" interface. If we're going to loop + * the packet back to ourselves, the ifp would be the loopback + * interface. However, we'd rather know the interface associated + * to the destination address (which should probably be one of + * our own addresses.)
+ */ + if ((req->ifp->if_flags & IFF_LOOPBACK) && + req->gw->sa_family == AF_LINK) { + sdl = (struct sockaddr_dl *)req->gw; + NET_EPOCH_ENTER(et); + aifp = ifnet_byindex_ref(sdl->sdl_index); + NET_EPOCH_EXIT(et); + if (aifp == NULL) { + DPRINTF("unable to get aifp for %s index %d", + if_name(req->ifp), sdl->sdl_index); + } + } + + if (aifp == NULL) { + aifp = req->ifp; + if_ref(aifp); + } + + return (aifp); +} + +#if 0 +static int compare_sa(const struct sockaddr *first, const struct sockaddr *second); +static int compare_nhop_addr(const struct nhop_object *nh, const struct nhop_object *nh_src); + +static int +compare_sa(const struct sockaddr *first, const struct sockaddr *second) +{ + if (first == second) + return (1); + if (first->sa_family != second->sa_family) { + DPRINTF("family different: %d %d", (int)first->sa_family, + (int)second->sa_family); + return (0); + } + if (first->sa_len != second->sa_len) { + DPRINTF("size different: %d %d", (int)first->sa_len, + (int)second->sa_len); + return (0); + } + if (memcmp(first, second, first->sa_len)) { + DPRINTF("data different"); + return (0); + } + return (1); +} + + +__noinline static int +compare_nhop_addr(const struct nhop_object *nh, const struct nhop_object *nh_src) +{ + const struct sockaddr *nh_sa, *nh_src_sa; + //struct sockaddr_in6 gw6; + //struct sockaddr *sa; + //struct in_ifaddr *ia; + + switch (nh->nh_priv->nh_type) { +#if 0 + case NH_TYPE_IPV4_ETHER_RSLV: + pgw = (struct sockaddr *)&gw6; + fill_empty_sa(pgw, AF_INET); + if (compare_sa(pgw, sa) != 0) + return (1); + break; +#endif + case NH_TYPE_IPV4_ETHER_NHOP: + nh_sa = (const struct sockaddr *)&nh->gw4_sa; + nh_src_sa = (const struct sockaddr *)&nh_src->gw4_sa; + if (compare_sa(nh_sa, nh_src_sa) != 0) + return (1); + break; +#if 0 + case NH_TYPE_IPV6_ETHER_RSLV: + //nh_sa = (const struct sockaddr *)&ifatoia6(nh->nh_ifa)->ia_addr; + pgw = (struct sockaddr *)&gw6; + fill_empty_sa(pgw, AF_INET6); + if (compare_sa(pgw, sa) != 0) + return (1); + break; 
+#endif + case NH_TYPE_IPV6_ETHER_NHOP: + nh_sa = (const struct sockaddr *)&nh->gw6_sa; + nh_src_sa = (const struct sockaddr *)&nh_src->gw6_sa; + if (compare_sa(nh_sa, nh_src_sa) != 0) + return (1); + break; + default: + DPRINTF("unknown nh_type: %d", (int)nh->nh_priv->nh_type); + } + + return (0); +} + +int +cmp_priv_debug(const struct nhop_priv *_one, const struct nhop_priv *_two) +{ + const struct nhop_object *nh, *nh_src; + + DPRINTF("Q: cmp %p and %p", _one, _two); + + nh = _one->nh; + nh_src = _two->nh; + + if (nh->nh_ifp != nh_src->nh_ifp) + return (0); + + if ((_one->nh_type != _two->nh_type) || (_one->nh_family != _two->nh_family)) { + DPRINTF("MISS: type: %d %d family %d %d", (int)_one->nh_type, + (int)_two->nh_type, (int)_one->nh_family, + (int)_two->nh_family); + return (0); + } + if (nh->nh_priv->rt_flags != nh_src->nh_priv->rt_flags) { + DPRINTF("MISS: rt_flags 0x%X 0x%X", + (unsigned int)nh->nh_priv->rt_flags, + (unsigned int)nh_src->nh_priv->rt_flags); + return (0); + } + if (nh->nh_mtu != nh_src->nh_mtu) { + DPRINTF("MISS: mtu %d %d", (int)nh->nh_mtu, (int)nh_src->nh_mtu); + return (0); + } + if (nh->nh_flags != nh_src->nh_flags) { + DPRINTF("MISS NHF_DEFAULT: nh_flags 0x%X 0x%X", + (unsigned int)nh->nh_flags, (unsigned int)nh_src->nh_flags); + return (0); + } + if (nh->nh_ifa != nh_src->nh_ifa) { + DPRINTF("MISS: ifa %p %p", nh->nh_ifa, nh_src->nh_ifa); + return (0); + } + if (nh->nh_aifp != nh_src->nh_aifp) { + DPRINTF("MISS: aifp %s %s", if_name(nh->nh_aifp), + if_name(nh_src->nh_aifp)); + return (0); + } + if ((nh->nh_flags & NHF_GATEWAY) && (compare_nhop_addr(nh, nh_src) == 0)) { + DPRINTF("MISS: SA"); + return (0); + } + /* Finally, loopback IPv6 nexthops */ + + return (1); +} +#endif + +int +cmp_priv(const struct nhop_priv *_one, const struct nhop_priv *_two) +{ + + if (memcmp(_one->nh, _two->nh, NHOP_END_CMP) != 0) + return (0); + + if ((_one->nh_type != _two->nh_type) || + (_one->nh_family != _two->nh_family)) + return (0); + + return (1); 
+} + +/* + * Finds or creates new nhop_object based on @req. + * Returns referenced and linked nhop_object or NULL. + */ +__noinline struct nhop_object * +nhop_get(struct rib_head *rh, const struct nhop_request *req) +{ + struct nh_control *ctl = rh->nh_control; + struct nhop_object *nh, *nh_tmp; + struct nhop_priv *nh_priv; + + /* + * + * TODO: performance optimizations. + * In order to find the nexthop, we first need + * to construct most of it to make hash lookup + * work correctly. + * The assumption is that for _most_ routes nexthops + * will be shared, so it would make sense to optimize + * the lookup process. Current implementation refcounts + * all dependent objects even in "temporary" nexthop + * usecase, which is an overkill. + */ + + nh_tmp = alloc_nhop_tmp(req); + if (nh_tmp == NULL) + return (NULL); + nh_priv = find_nhop(ctl, nh_tmp->nh_priv); + free_nhop_tmp(nh_tmp); + + if (nh_priv != NULL) + return (nh_priv->nh); + + nh = alloc_nhop(req); + if (nh == NULL) { + DPRINTF("nh_alloc failed!"); + return (NULL); + } + if (link_nhop(ctl, nh->nh_priv) == 0) { + /* + * Adding nexthop to the datastructures + * failed. Call destructor w/o waiting for + * the epoch end, as nexthop is not used + * and return. 
+ */ + DPRINTF("link_nhop failed!"); + destroy_nhop(nh->nh_priv); + + return (NULL); + } + + return (nh); +} + +__noinline static void +fill_sdl_from_ifp(struct sockaddr_dl_short *sdl, const struct ifnet *ifp) +{ + + sdl->sdl_family = AF_LINK; + sdl->sdl_len = sizeof(struct sockaddr_dl_short); + sdl->sdl_index = ifp->if_index; + sdl->sdl_type = ifp->if_type; +} + +__noinline static struct nhop_object * +alloc_nhop_tmp(const struct nhop_request *req) +{ + struct nhop_object *nh; + struct nhop_priv *nh_priv; + /* IPv6 ND is unhappy */ + int flags = M_NOWAIT; + + nh_priv = malloc(sizeof(struct nhop_priv) + sizeof(struct nhop_object), M_TEMP, flags | M_ZERO); + if (nh_priv == NULL) + return (NULL); + + nh = (struct nhop_object *)(nh_priv + 1); + + /* + * consists of + * ifp, ifa, aifp, mtu, gw addr(if set), nh_type, nh_family, mask of rt_flags and + */ + + nh->nh_priv = nh_priv; + nh_priv->nh = nh; + + if (fill_nhop(req, nh) != 0) { + free(nh_priv, M_TEMP); + return (NULL); + } + + return (nh); +} + +__noinline static void +free_nhop_tmp(struct nhop_object *nh) +{ + /* TODO: rewrite */ + if_rele(nh->nh_ifp); + if_rele(nh->nh_aifp); + ifa_free(nh->nh_ifa); + + free(nh->nh_priv, M_TEMP); +} + +__noinline static void +print_nhop(const char *prefix, const struct nhop_object *nh) +{ + char src_buf[INET6_ADDRSTRLEN], addr_buf[INET6_ADDRSTRLEN]; + int af = nh->nh_priv->nh_family; + + if (af == AF_INET) { + const struct sockaddr_in *gw, *src; + gw = &nh->gw4_sa; + src = IA_SIN(ifatoia(nh->nh_ifa)); + inet_ntop(af, &src->sin_addr, src_buf, sizeof(src_buf)); + inet_ntop(af, &gw->sin_addr, addr_buf, sizeof(addr_buf)); + } else if (af == AF_INET6) { + const struct sockaddr_in6 *gw, *src; + gw = &nh->gw6_sa; + src = &(ifatoia6(nh->nh_ifa)->ia_addr); + inet_ntop(af, &src->sin6_addr, src_buf, sizeof(src_buf)); + inet_ntop(af, &gw->sin6_addr, addr_buf, sizeof(addr_buf)); + } + + DPRINTF("%s nhop: AF %d ifp %p %s addr %s src %p %s aifp %p %s mtu %d nh_flags %X", + prefix, af, 
nh->nh_ifp, if_name(nh->nh_ifp), addr_buf, nh->nh_ifa, + src_buf, nh->nh_aifp, if_name(nh->nh_aifp), nh->nh_mtu, nh->nh_flags); +} + +__noinline static struct nhop_object * +alloc_nhop(const struct nhop_request *req) +{ + struct nhop_object *nh; + struct nhop_priv *nh_priv; + /* IPv6 ND is unhappy */ + int flags = M_NOWAIT; + + KASSERT((req->mtu > 0), ("nh requested mtu is zero")); + + nh_priv = malloc(sizeof(struct nhop_priv), M_NHOP, flags | M_ZERO); + if (nh_priv == NULL) + return (NULL); + nh = malloc(sizeof(struct nhop_object), M_NHOP, flags | M_ZERO); + if (nh == NULL) { + free(nh_priv, M_NHOP); + return (NULL); + } + + /* Allocate per-cpu packet counter */ + nh->nh_pksent = counter_u64_alloc(flags); + if (nh->nh_pksent == NULL) { + free(nh_priv, M_NHOP); + free(nh, M_NHOP); + return (NULL); + } + + nh->nh_priv = nh_priv; + nh_priv->nh = nh; + + /* Refcounting for all of the necessary resources done in fill_nhop() */ + if (fill_nhop(req, nh) != 0) { + counter_u64_free(nh->nh_pksent); + free(nh_priv, M_NHOP); + free(nh, M_NHOP); + return (NULL); + } + + NH_PRIV_LOCK_INIT(nh_priv); + refcount_init(&nh_priv->nh_refcnt, 1); + + print_nhop("ALLOC", nh); + + return (nh); +} + +static void +destroy_nhop(struct nhop_priv *nh_priv) +{ + struct nhop_object *nh = nh_priv->nh; + + NH_PRIV_LOCK(nh_priv); + DPRINTF("DEL nhop: AF %d ifp %p %s src %p mtu %d nh_flags %X", + nh_priv->nh_family, nh->nh_ifp, nh->nh_ifp->if_xname, + nh->nh_ifa, nh->nh_mtu, nh->nh_flags); + + NH_PRIV_UNLOCK(nh_priv); + + free(nh_priv, M_NHOP); + + if_rele(nh->nh_ifp); + if_rele(nh->nh_aifp); + ifa_free(nh->nh_ifa); + counter_u64_free(nh->nh_pksent); + + free(nh, M_NHOP); +} + +/* + * Epoch callback indicating nhop is safe to destroy + */ +static void +destroy_nhop_epoch(epoch_context_t ctx) +{ + struct nhop_priv *nh_priv; + + nh_priv = __containerof(ctx, struct nhop_priv, nh_epoch_ctx); + + destroy_nhop(nh_priv); +} + +/* + * Fills @nh fields with the data supplied in the @req. 
+ * Returns 0 on success. References nh_aifp, nh_ifp and nh_ifa. + */ +__noinline static int +fill_nhop(const struct nhop_request *req, struct nhop_object *nh) +{ + int rt_flags; + + rt_flags = req->rt_flags & NHOP_RT_FLAG_MASK; + + nh->nh_ifp = req->ifp; + nh->nh_mtu = req->mtu; + nh->nh_flags = fib_rte_to_nh_flags(rt_flags); + nh->nh_flags |= (req->nh_flags_additional & NHF_DEFAULT); + nh->nh_priv->rt_flags = rt_flags; + nh->nh_ifa = req->ifa; + + if (req->rt_flags & RTF_GATEWAY) { + if (req->gw->sa_len > sizeof(struct sockaddr_in6)) { + DPRINTF("nhop SA size too big: AF %d len %u", + req->gw->sa_family, req->gw->sa_len); + return (ENOMEM); + } + memcpy(&nh->gw_sa, req->gw, req->gw->sa_len); + } else { + /* + * Interface route. Currently the route.c code adds + * empty sa of type AF_LINK, which is 56 bytes long. + * The only place where this data is used is the IPv6 + * loopback output, where we need to preserve the original + * interface to maintain proper scoping. + * Current code stores original interface in the separate field + * (nh_aifp, see below). Given that, write fake empty SA + * with the request AF. 
+ */ + fill_sdl_from_ifp(&nh->gwl_sa, req->ifp); + } + + /* IPv6 (mostly) helper */ + nh->nh_aifp = get_aifp(req); + DPRINTF("AIFP: %p req->ifp %p nh_ifp %p", nh->nh_aifp, req->ifp, nh->nh_ifp); + + /* Reference the needed objects (note nh_aifp is already referenced) */ + if_ref(nh->nh_ifp); + ifa_ref(nh->nh_ifa); + + /* TODO: verify blackhole/reject behavior | NHF_HOST */ + + nh->nh_priv->nh_family = req->family; + nh->nh_priv->nh_type = req->nh_type; + + return (0); +} + +int +nhop_ref_object(struct nhop_object *nh) +{ + + return (refcount_acquire_if_not_zero(&nh->nh_priv->nh_refcnt)); +} + +void +nhop_free_object(struct nhop_object *nh) +{ + struct nh_control *ctl; + struct nhop_priv *nh_priv = nh->nh_priv; + + if (!refcount_release(&nh_priv->nh_refcnt)) + return; + + NH_PRIV_LOCK(nh_priv); + ctl = nh_priv->nh_control; + /* Use nh_control as an indicator of linked/unlinked entry */ + nh_priv->nh_control = NULL; + NH_PRIV_UNLOCK(nh_priv); + + if (ctl != NULL) { + if (unlink_nhop(ctl, nh_priv) == NULL) { + /* Do not try to reclaim */ + return; + } + } + + epoch_call(net_epoch_preempt, destroy_nhop_epoch, + &nh_priv->nh_epoch_ctx); +} + +int +nhop_ref_any(struct nhop_object *nh) +{ +#ifdef ROUTE_MPATH + if (!NH_IS_MULTIPATH(nh)) + return (nhop_ref_object(nh)); + else + return (nhgrp_ref_group((struct nhgrp_object *)nh)); +#else + return (nhop_ref_object(nh)); +#endif +} + +void +nhop_free_any(struct nhop_object *nh) +{ + if (!NH_IS_MULTIPATH(nh)) + nhop_free_object(nh); +#ifdef ROUTE_MPATH + else + nhgrp_free_group((struct nhgrp_object *)nh); +#endif +} + + +/* Helper functions */ + +uint32_t +nhop_get_idx(const struct nhop_object *nh) +{ + + return (nh->nh_priv->nh_idx); +} + +__noinline void +nhops_update_ifmtu(struct rib_head *rh, struct ifnet *ifp, uint32_t mtu) +{ + struct nh_control *ctl = rh->nh_control; + struct nhop_priv *nh_priv; + struct nhop_object *nh; + + NHOPS_RLOCK(ctl); + CHT_SLIST_FOREACH(&ctl->nh_head, nhops, nh_priv) { + nh = nh_priv->nh; + if 
(nh->nh_ifp == ifp) { + if ((nh_priv->rt_flags & RTF_FIXEDMTU) == 0 || + nh->nh_mtu > mtu) { + /* Update */ + NH_PRIV_LOCK(nh_priv); + nh->nh_mtu = mtu; + NH_PRIV_UNLOCK(nh_priv); + } + } + } CHT_SLIST_FOREACH_END; + NHOPS_RUNLOCK(ctl); + +} + +__noinline static int +dump_nhop_entry(struct rib_head *rh, struct nhop_object *nh, struct sysctl_req *w) +{ + struct { + struct rt_msghdr rtm; + struct nhop_external nhe; + } arpc; + struct nhop_external *pnhe; + struct sockaddr *gw_sa, *src_sa; + struct sockaddr_storage ss; + int error; + + //DPRINTF("Dumping: head %p nh %p flags %X req %p\n", rh, nh, nh->nh_flags, w); + + memset(&arpc, 0, sizeof(arpc)); + + arpc.rtm.rtm_msglen = sizeof(arpc); + arpc.rtm.rtm_version = RTM_VERSION; + arpc.rtm.rtm_type = RTM_GET; + //arpc.rtm.rtm_flags = RTF_UP; + arpc.rtm.rtm_flags = nh->nh_priv->rt_flags; + + pnhe = &arpc.nhe; + + pnhe->nh_idx = nh->nh_priv->nh_idx; + pnhe->nh_fib = rh->rib_fibnum; + pnhe->ifindex = nh->nh_ifp->if_index; + pnhe->aifindex = nh->nh_aifp->if_index; + pnhe->nh_family = nh->nh_priv->nh_family; + pnhe->nh_type = nh->nh_priv->nh_type; + pnhe->nh_mtu = nh->nh_mtu; + pnhe->nh_flags = nh->nh_flags; + + size_t len = 0; + gw_sa = (struct sockaddr *)&nh->gw4_sa; + // KASSRT sin6_len > 0 + len += gw_sa->sa_len; + //DPRINTF("ADDING gw_sa %lu len, af %d nh_sa: %p\n", len, nh->gw6_sa.sin6_family, &nh->gw6_sa); + + int af = nh->nh_priv->nh_family; + if (af == AF_INET) { + src_sa = (struct sockaddr *)IA_SIN(ifatoia(nh->nh_ifa)); + } else if (af == AF_INET6) { + src_sa = (struct sockaddr *)&ifatoia6(nh->nh_ifa)->ia_addr; + } else { + return (1); + } + if (src_sa->sa_family != af) { + /* ifa can be link address. XXX: AF_NULL ? 
*/ + memset(&ss, 0, sizeof(struct sockaddr_storage)); + fill_sdl_from_ifp((struct sockaddr_dl_short *)&ss, nh->nh_ifp); + src_sa = (struct sockaddr *)&ss; + } + + //memcpy(sa, nh_sa, nh_sa->sa_len); + len += src_sa->sa_len; + arpc.rtm.rtm_msglen += len; + //DPRINTF("ADDING %lu len, af %d nh_sa: %p\n", len, src_sa->sa_len, src_sa); + + memcpy(pnhe->nh_prepend, nh->nh_prepend, sizeof(nh->nh_prepend)); + pnhe->prepend_len = nh->nh_prepend_len; + pnhe->nh_refcount = nh->nh_priv->nh_refcnt; + + pnhe->nh_pksent = counter_u64_fetch(nh->nh_pksent); + + error = SYSCTL_OUT(w, &arpc, sizeof(arpc)); + if (error == 0) + error = SYSCTL_OUT(w, gw_sa, gw_sa->sa_len); + if (error == 0) + error = SYSCTL_OUT(w, src_sa, src_sa->sa_len); + + /* + DPRINTF("Exported %d ifindex %d family %d type %d error %d\n", nh->nh_priv->nh_idx, pnhe->ifindex, + pnhe->nh_family, pnhe->nh_type, error); + */ + + return (error); +} + +int +nhops_dump_sysctl(struct rib_head *rh, struct sysctl_req *w) +{ + struct nh_control *ctl = rh->nh_control; + struct nhop_priv *nh_priv; + int error; + + DPRINTF("NHDUMP: count=%u", ctl->nh_head.items_count); + CHT_SLIST_FOREACH(&ctl->nh_head, nhops, nh_priv) { + error = dump_nhop_entry(rh, nh_priv->nh, w); + if (error != 0) + return (error); + } CHT_SLIST_FOREACH_END; + + return (0); +} + Index: sys/net/route/nhop_utils.h =================================================================== --- /dev/null +++ sys/net/route/nhop_utils.h @@ -0,0 +1,200 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2020 Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
/* Chained hash table */
struct _cht_head {
	uint32_t	hash_size;	/* number of buckets in @ptr */
	uint32_t	items_count;	/* number of linked items */
	void		**ptr;		/* bucket array */
};

/*
 * Suggests a new bucket count for @head, or 0 when no resize is wanted.
 *
 * Grow (double) when the items outnumber half of the buckets and the
 * table is still below 65536 buckets; shrink (halve) when the items fill
 * less than a quarter of the buckets and the table is above 16 buckets.
 */
static inline uint32_t
_cht_get_resize_size(const struct _cht_head *head)
{
	const uint32_t buckets = head->hash_size;
	const uint32_t items = head->items_count;

	if (items * 2 > buckets && buckets < 65536)
		return (buckets * 2);
	if (items * 4 < buckets && buckets > 16)
		return (buckets / 2);

	return (0);
}

/*
 * Returns non-zero when _cht_get_resize_size() proposes a new size.
 */
static inline int
_cht_need_resize(const struct _cht_head *head)
{

	return (_cht_get_resize_size(head) != 0);
}
/* Default hash method for constant-size keys */

/*
 * Map a key (CHT_GET_BUCK) or an object (CHT_GET_BUCK_OBJ) to its bucket
 * index in @_head, using the <_PX>_hash_key() / <_PX>_hash_obj() hooks
 * supplied by the table user.
 *
 * The hash is masked with hash_size - 1, so this assumes hash_size is a
 * power of two.  The whole expansion is parenthesized so the macros can
 * be used inside larger expressions (comparisons, arithmetic) without
 * the '&' binding to the wrong operand.
 */
#define	CHT_GET_BUCK(_head, _PX, _key)					\
	(_PX##_hash_key(_key) & ((_head)->hash_size - 1))
#define	CHT_GET_BUCK_OBJ(_head, _PX, _obj)				\
	(_PX##_hash_obj(_obj) & ((_head)->hash_size - 1))
/*
 * Rehashes every item of @_head into the bucket array @_new_void_ptr,
 * which has @_new_hsize buckets, then installs it as the table's array.
 *
 * On completion @_new_void_ptr (which must be an lvalue) is overwritten
 * with the OLD bucket array so the caller can free it.
 *
 * Bucket selection masks the object hash with (_new_hsize - 1); both
 * macro arguments are parenthesized so that expression arguments
 * (e.g. "n << 1") yield the intended mask.
 *
 * NOTE: this expands to declarations plus statements in the caller's
 * scope (it is not wrapped in a block), so it can appear only once per
 * scope.
 */
#define	CHT_SLIST_RESIZE(_head, _PX, _new_void_ptr, _new_hsize)		\
	uint32_t _new_idx;						\
	typeof((_head)->ptr) _new_ptr = (void *)(_new_void_ptr);	\
	typeof(*(_head)->ptr) _x, _y;					\
	for (uint32_t _old_idx = 0; _old_idx < (_head)->hash_size; _old_idx++) {\
		_x = CHT_FIRST(_head, _old_idx);			\
		_y = _x;						\
		while (_y != NULL) {					\
			/* Save the successor before relinking _x. */	\
			_y = _PX##_next(_x);				\
			_new_idx = _PX##_hash_obj(_x) & ((_new_hsize) - 1);\
			_PX##_next(_x) = _CHT_FIRST(_new_ptr, _new_idx);\
			_CHT_FIRST(_new_ptr, _new_idx) = _x;		\
			_x = _y;					\
		}							\
	}								\
	(_head)->hash_size = (_new_hsize);				\
	(_new_void_ptr) = (void *)(_head)->ptr;				\
	(_head)->ptr = _new_ptr;
bitmask_alloc_idx(struct bitmask_head *bi, uint16_t *pidx); +int bitmask_free_idx(struct bitmask_head *bi, uint16_t idx); + +#endif + Index: sys/net/route/nhop_utils.c =================================================================== --- /dev/null +++ sys/net/route/nhop_utils.c @@ -0,0 +1,220 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2020 Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include +#include "opt_inet.h" +#include "opt_route.h" +#include "opt_mpath.h" + +#include +#include +#include +#include +#include +#include + +#include + +#define BLOCK_ITEMS (8 * sizeof(u_long)) /* Number of items for ffsl() */ + +#define _BLOCKS_TO_SZ(_blocks) ((size_t)(_blocks) * sizeof(u_long)) +#define _BLOCKS_TO_ITEMS(_blocks) ((uint32_t)(_blocks) * BLOCK_ITEMS) +#define _ITEMS_TO_BLOCKS(_items) ((_items) / BLOCK_ITEMS) + + +static void _bitmask_init_idx(void *index, uint32_t items); + +void +bitmask_init(struct bitmask_head *bh, void *idx, uint32_t num_items) +{ + + if (idx != NULL) + _bitmask_init_idx(idx, num_items); + + memset(bh, 0, sizeof(struct bitmask_head)); + bh->blocks = _ITEMS_TO_BLOCKS(num_items); + bh->idx = (u_long *)idx; +} + +uint32_t +bitmask_get_resize_items(const struct bitmask_head *bh) +{ + if ((bh->items_count * 2 > _BLOCKS_TO_ITEMS(bh->blocks)) && bh->items_count < 65536) + return (_BLOCKS_TO_ITEMS(bh->blocks) * 2); + + return (0); +} + +int +bitmask_should_resize(const struct bitmask_head *bh) +{ + + return (bitmask_get_resize_items(bh) != 0); +} + +#if 0 +uint32_t +_bitmask_get_blocks(uint32_t items) +{ + + return (items / BLOCK_ITEMS); +} +#endif + +size_t +bitmask_get_size(uint32_t items) +{ +#if _KERNEL + KASSERT((items % BLOCK_ITEMS) == 0, + ("bitmask size needs to power of 2 and greater or equal to %zu", + BLOCK_ITEMS)); +#else + assert((items % BLOCK_ITEMS) == 0); +#endif + + return (items / 8); +} + +static void +_bitmask_init_idx(void *_idx, uint32_t items) +{ + size_t size = bitmask_get_size(items); + u_long *idx = (u_long *)_idx; + + /* Mark all as free */ + memset(idx, 0xFF, size); + *idx &= ~(u_long)1; /* Always skip index 0 */ +} + + +/* + * _try_merge api to allow shrinking? 
+ */ +int +bitmask_copy(const struct bitmask_head *bi, void *new_idx, uint32_t new_items) +{ + uint32_t new_blocks = _BLOCKS_TO_ITEMS(new_items); + + _bitmask_init_idx(new_idx, new_items); + + if (bi->blocks < new_blocks) { + /* extend current blocks */ + if (bi->blocks > 0) + memcpy(new_idx, bi->idx, _BLOCKS_TO_SZ(bi->blocks)); + return (0); + } else { + /* XXX: ensure all other blocks are non-zero */ + for (int i = new_blocks; i < bi->blocks; i++) { + } + + return (1); + } +} + +void +bitmask_swap(struct bitmask_head *bh, void *new_idx, uint32_t new_items, void **pidx) +{ + void *old_ptr; + + old_ptr = bh->idx; + + bh->idx = (u_long *)new_idx; + bh->blocks = _ITEMS_TO_BLOCKS(new_items); + + if (pidx != NULL) + *pidx = old_ptr; +} + +/* + * Allocate new index in given instance and stores in in @pidx. + * Returns 0 on success. + */ +int +bitmask_alloc_idx(struct bitmask_head *bi, uint16_t *pidx) +{ + u_long *mask; + int i, off, v; + + off = bi->free_off; + mask = &bi->idx[off]; + + for (i = off; i < bi->blocks; i++, mask++) { + if ((v = ffsl(*mask)) == 0) + continue; + + /* Mark as busy */ + *mask &= ~ ((u_long)1 << (v - 1)); + + bi->free_off = i; + + v = BLOCK_ITEMS * i + v - 1; + + *pidx = v; + bi->items_count++; + return (0); + } + + return (1); +} + +/* + * Removes index from given set. + * Returns 0 on success. 
+ */ +int +bitmask_free_idx(struct bitmask_head *bi, uint16_t idx) +{ + u_long *mask; + int i, v; + + if (idx == 0) + return (1); + + i = idx / BLOCK_ITEMS; + v = idx % BLOCK_ITEMS; + + if (i >= bi->blocks) + return (1); + + mask = &bi->idx[i]; + + if ((*mask & ((u_long)1 << v)) != 0) + return (1); + + /* Mark as free */ + *mask |= (u_long)1 << v; + bi->items_count--; + + /* Update free offset */ + if (bi->free_off > i) + bi->free_off = i; + + return (0); +} + Index: sys/net/route/nhop_var.h =================================================================== --- /dev/null +++ sys/net/route/nhop_var.h @@ -0,0 +1,127 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2020 Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
/*
 * Multiply-by-33-then-xor hash over the first @len bytes of @h,
 * starting from 0.  Returns 0 when @len is not positive.
 */
static unsigned
djb_hash(const unsigned char *h, const int len)
{
	const unsigned char *cur = h;
	unsigned int acc = 0;
	int remaining;

	for (remaining = len; remaining > 0; remaining--)
		acc = (33 * acc) ^ *cur++;

	return (acc);
}
/*
 * Control plane-only nhop data.
 *
 * Each nhop_priv is paired with one dataplane struct nhop_object; the two
 * point at each other through ->nh and nhop_object's ->nh_priv.
 */
struct nhop_object;
struct nhop_priv {
	uint32_t		nh_idx;		/* nexthop index (returned by nhop_get_idx()) */
	uint8_t			nh_family;	/* address family */
	uint16_t		nh_type;	/* nexthop type */
	void			*cb_func;	/* function handling additional rewrite caps */
	u_int			nh_refcnt;	/* number of references; starts at 1 in alloc_nhop() */
	int			rt_flags;	/* routing flags for the control plane */
	struct nhop_object	*nh;		/* backreference to the dataplane nhop */
	struct nh_control	*nh_control;	/* backreference to the rnh; NULL marks an unlinked entry */
	struct nhop_priv	*nh_next;	/* hash table membership (see nhops_next()) */
	struct mtx		nh_mtx;		/* mutex, taken via NH_PRIV_LOCK() */
	struct epoch_context	nh_epoch_ctx;	/* epoch data for nhop (deferred destroy callback) */
};
NH_IS_PINNED(_nh) ((_nh)->nh_priv->rt_flags & RTF_PINNED) + +/* nhop.c */ +struct nhop_priv *find_nhop(struct nh_control *ctl, struct nhop_priv *nh_priv); +int link_nhop(struct nh_control *ctl, struct nhop_priv *nh_priv); +struct nhop_priv *unlink_nhop(struct nh_control *ctl, struct nhop_priv *nh_priv); + +/* nhop_ctl.c */ +void free_nhop(struct nhop_priv *nh_priv); +int cmp_priv(const struct nhop_priv *_one, const struct nhop_priv *_two); + +/* mpath */ +struct weightened_nhop; + + +#endif + Index: sys/net/route/route_ctl.c =================================================================== --- /dev/null +++ sys/net/route/route_ctl.c @@ -0,0 +1,1601 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2020 Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include "opt_inet.h" +#include "opt_inet6.h" +#include "opt_route_mpath.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#define NEED_RTZONE +#include +#include +#include +#include +#include +#include +#include + +#include + +/* + * This file contains control plane routing tables functions. + * + * All functions assumes they are called in net epoch. 
+ */
+
+/*
+ * net.route.multipath ("Enable route multipath"), kept per-VNET.
+ * The knob is tunable/writable only when the kernel is built with
+ * ROUTE_MPATH; otherwise it is a read-only 0.
+ */
+#define	V_rib_route_multipath	VNET(rib_route_multipath)
+#ifdef ROUTE_MPATH
+VNET_DEFINE(u_int, rib_route_multipath) = 1;
+#define	MP_FLAGS	CTLFLAG_RWTUN
+#else
+VNET_DEFINE(u_int, rib_route_multipath) = 0;
+#define	MP_FLAGS	CTLFLAG_RD
+#endif
+SYSCTL_UINT(_net_route, OID_AUTO, multipath, MP_FLAGS | CTLFLAG_VNET,
+    &VNET_NAME(rib_route_multipath), 0, "Enable route multipath");
+
+
+static void set_req_mtu(const struct rt_addrinfo *info, struct nhop_request *req);
+static int create_rte_from_info(struct rib_head *rnh, struct rt_addrinfo *info,
+    struct rtentry **ret_rt);
+static int can_rib_multipath(struct rib_head *rh);
+
+static int add_route(struct rib_head *rnh, struct rtentry *rt,
+    struct rt_addrinfo *info, struct rib_cmd_info *rc);
+static int del_route(struct rib_head *rnh, struct rt_addrinfo *info,
+    struct rib_cmd_info *rc);
+
+static int replace_rte(struct rib_head *rnh, struct sockaddr *dst,
+    struct sockaddr *mask, struct rtentry *rt_new);
+static int update_gateway_metadata(struct rt_addrinfo *info, int fibnum);
+static void fill_nh_request(struct rib_head *rnh, struct rt_addrinfo *info,
+    struct nhop_request *nh_req);
+static void fill_nh_request_from_nhop(const struct nhop_object *nh,
+    struct sockaddr_storage *gw_storage, struct nhop_request *nh_req);
+
+
+/*
+ * Returns the address family of the RTAX_DST sockaddr in @info.
+ * RTAX_DST must be present: it is dereferenced unconditionally.
+ */
+static sa_family_t
+get_family_from_info(const struct rt_addrinfo *info)
+{
+	const struct sockaddr *dst_sa = info->rti_info[RTAX_DST];
+
+	return (dst_sa->sa_family);
+}
+
+
+/*
+ * Copies the MTU from @info into @nh_req when the request carries RTV_MTU.
+ *
+ * A non-zero MTU was explicitly provided by the user and is pinned with
+ * RTF_FIXEDMTU.  An explicit 0 clears RTF_FIXEDMTU, which is assumed to
+ * mean "roll back to the default".  Without RTV_MTU nothing is touched.
+ */
+static void
+set_req_mtu(const struct rt_addrinfo *info, struct nhop_request *nh_req)
+{
+
+	if ((info->rti_mflags & RTV_MTU) == 0)
+		return;
+
+	if (info->rti_rmx->rmx_mtu != 0)
+		nh_req->rt_flags |= RTF_FIXEDMTU;
+	else
+		nh_req->rt_flags &= ~RTF_FIXEDMTU;
+
+	nh_req->mtu = info->rti_rmx->rmx_mtu;
+}
+
+
+/*
+ * Initialises @nh_req from the request data in @info.
+ *
+ * @info->rti_ifa must already be resolved: it is dereferenced here.
+ * @rnh is accepted for symmetry with the callers and is not used.
+ */
+static void
+fill_nh_request(struct rib_head *rnh, struct rt_addrinfo *info,
+    struct nhop_request *nh_req)
+{
+
+	memset(nh_req, 0, sizeof(*nh_req));
+
+	nh_req->ifa = info->rti_ifa;
+	nh_req->ifp = info->rti_ifa->ifa_ifp;
+	nh_req->gw = info->rti_info[RTAX_GATEWAY];
+	nh_req->family = info->rti_info[RTAX_DST]->sa_family;
+	/* Start from the original route flags. */
+	nh_req->rt_flags = info->rti_flags;
+	/* Setting the nexthop type is the responsibility of the hook. */
+	nh_req->nh_type = 0;
+
+	set_req_mtu(info, nh_req);
+}
+
+/*
+ * Initialises @nh_req as a copy of the existing nexthop @nh.
+ *
+ * @gw_storage receives the gateway sockaddr and @nh_req->gw is pointed
+ * at it, so the storage has to outlive the request.
+ */
+static void
+fill_nh_request_from_nhop(const struct nhop_object *nh,
+    struct sockaddr_storage *gw_storage, struct nhop_request *nh_req)
+{
+
+	memset(nh_req, 0, sizeof(*nh_req));
+
+	nh_req->ifp = nh->nh_ifp;
+	nh_req->ifa = nh->nh_ifa;
+	nh_req->mtu = nh->nh_mtu;
+	nh_req->family = nh->nh_priv->nh_family;
+	nh_req->nh_type = nh->nh_priv->nh_type;
+	nh_req->rt_flags = nh->nh_priv->rt_flags;
+
+	if ((nh_req->rt_flags & RTF_GATEWAY) != 0) {
+		/* The stored gateway length is assumed to be validated. */
+		memcpy(gw_storage, &nh->gw4_sa, nh->gw4_sa.sin_len);
+	} else {
+		/* The value is largely ignored; just mark it as empty. */
+		gw_storage->ss_len = 0;
+	}
+	nh_req->gw = (struct sockaddr *)gw_storage;
+}
+
+/*
+ * Update @nh_req request data based on the parameters supplied in @info.
+ * This is a helper function to support route changes.
+ *
+ * It limits the changes that can be done to the route to the following:
+ * 1) all combination of gateway changes (gw, interface, blackhole/reject)
+ * 2) route flags (FLAG[123],STATIC,BLACKHOLE,REJECT)
+ * 3) route MTU
+ * 4) route weight (handled by the caller)
+ * 5) route lifetime (setting rte expiration time is handled by the caller)
+ *
+ * Assumes nh_req gw pointer has sockaddr_storage-sized pointer supplied
+ *
+ * Returns:
+ * 0 on success, nh_req->ifa and nh_req->ifp referenced
+ * error code otherwise
+ */
+static int
+alter_nh_request(struct rt_addrinfo *info, u_int fibnum, struct nhop_request *nh_req)
+{
+	struct sockaddr *new_gw;
+	int error;
+
+	/* Pick up the MTU if the request carries one. */
+	set_req_mtu(info, nh_req);
+
+	/* XXX: allow only one of BLACKHOLE,REJECT,GATEWAY */
+
+	/* Flags inside RIB_RTE_CHANGE_MASK follow the request verbatim. */
+	nh_req->rt_flags &= ~RIB_RTE_CHANGE_MASK;
+	nh_req->rt_flags |= info->rti_flags & RIB_RTE_CHANGE_MASK;
+
+	new_gw = info->rti_info[RTAX_GATEWAY];
+	if (new_gw == NULL) {
+		/*
+		 * No gateway change: the ifa/ifp copied from the original
+		 * nexthop have not been referenced yet, do it now.
+		 */
+		ifa_ref(nh_req->ifa);
+		if_ref(nh_req->ifp);
+		return (0);
+	}
+
+	/* On success update_gateway_metadata() returns ifa/ifp referenced. */
+	error = update_gateway_metadata(info, fibnum);
+	if (error != 0)
+		return (error);
+
+	nh_req->ifa = info->rti_ifa;
+	nh_req->ifp = info->rti_ifp;
+	/*
+	 * NOTE(review): nh_req->gw is left pointing at the copy of the old
+	 * gateway; confirm that the new address is applied elsewhere.
+	 */
+
+	/* RTF_GATEWAY mirrors the request. */
+	nh_req->rt_flags &= ~RTF_GATEWAY;
+	nh_req->rt_flags |= (RTF_GATEWAY & info->rti_flags);
+
+	return (0);
+}
+
+/*
+ * Creates a new nexthop based on the information in @info.
+ *
+ * Returns:
+ * 0 on success, filling @nh_ret with the desired nexthop object ptr
+ * errno otherwise
+ */
+static int
+create_nhop_from_info(struct rib_head *rnh, struct rt_addrinfo *info,
+    struct nhop_object **nh_ret)
+{
+	struct nhop_request nh_req;
+	struct nhop_object *nh;
+	int error;
+
+	fill_nh_request(rnh, info, &nh_req);
+
+	/* Let the protocol hook, if any, augment the request. */
+	if (rnh->rnh_preadd != NULL) {
+		error = rnh->rnh_preadd(rnh->rib_fibnum,
+		    info->rti_info[RTAX_DST], info->rti_info[RTAX_NETMASK],
+		    &nh_req);
+		if (error != 0)
+			return (error);
+	}
+
+	nh = nhop_get(rnh, &nh_req);
+	/* @nh_ret is written even on failure (as NULL). */
+	*nh_ret = nh;
+	if (nh == NULL) {
+		DPRINTF("failed to get the nexthop from req");
+		return (EAGAIN);
+	}
+
+	return (0);
+}
+
+/*
+ * Creates new nexthop based on @nh_old and augmentation data from @info.
+ * Helper function used in the route changes, please see
+ * alter_nh_request() comments for more details.
+ *
+ * Returns:
+ * 0 on success, filling @nh_ret with the desired nexthop object
+ * errno otherwise
+ */
+static int
+create_nhop_from_nhop(struct rib_head *rnh, const struct nhop_object *nh_old,
+    struct rt_addrinfo *info, struct nhop_object **nh_ret)
+{
+	struct sockaddr_storage gw_storage;
+	struct nhop_request nh_req;
+	int error;
+
+	/* Seed the request with the data of the original nexthop. */
+	fill_nh_request_from_nhop(nh_old, &gw_storage, &nh_req);
+
+	/* On success nh_req.ifa / nh_req.ifp come back referenced. */
+	error = alter_nh_request(info, rnh->rib_fibnum, &nh_req);
+	if (error != 0)
+		return (error);
+
+	/* Let the protocol hook, if any, alter the request. */
+	if (rnh->rnh_preadd != NULL) {
+		error = rnh->rnh_preadd(rnh->rib_fibnum, info->rti_info[RTAX_DST],
+		    info->rti_info[RTAX_NETMASK], &nh_req);
+		if (error != 0) {
+			DPRINTF("failed to create nhop: prehook returned %d",
+			    error);
+			goto drop_refs;
+		}
+	}
+
+	*nh_ret = nhop_get(rnh, &nh_req);
+	if (*nh_ret == NULL) {
+		DPRINTF("failed to create nhop: nhop_get() failed");
+		/* XXX: verify */
+		error = EAGAIN;
+		goto drop_refs;
+	}
+
+	return (0);
+
+drop_refs:
+	/* Undo the references taken by alter_nh_request(). */
+	ifa_free(nh_req.ifa);
+	if_rele(nh_req.ifp);
+	return (error);
+}
+
+/*
+ * Gets kernel-usable time of the route expiration from @info.
+ * Userland provides absolute expiration timestamp (UTC), this function
+ * converts it to the kernel uptime-based interval.
+ *
+ * Returns: kernel uptime-based timestamp of the route expiration or 0.
+ */
+static u_long
+get_expire_from_info(const struct rt_addrinfo *info)
+{
+
+	if ((info->rti_mflags & RTV_EXPIRE) == 0)
+		return (0);
+	if (info->rti_rmx->rmx_expire <= 0)
+		return (0);
+
+	/* Userland (wall clock) -> kernel (uptime) timebase conversion. */
+	return (info->rti_rmx->rmx_expire - time_second + time_uptime);
+}
+
+/*
+ * Gets route weight from @info.
+ * If weight is not set (true in most cases, 2020-01), returns
+ * ROUTE_DEFAULT_WEIGHT (100). If the weight is too high,
+ * caps it to ROUTE_MAX_WEIGHT (2^24 -1).
+ */
+static uint32_t
+get_weight_from_info(const struct rt_addrinfo *info)
+{
+	uint32_t weight = ROUTE_DEFAULT_WEIGHT;
+
+	if ((info->rti_mflags & RTV_WEIGHT) && (info->rti_rmx->rmx_weight > 0))
+		weight = info->rti_rmx->rmx_weight;
+
+	return ((weight > ROUTE_MAX_WEIGHT) ? ROUTE_MAX_WEIGHT : weight);
+}
+
+/*
+ * Creates rtentry based on dst, mask and other metadata in @info.
+ *
+ * Returns 0 on success, filling @ret_rt with referenced & unlocked
+ * rtentry.
+ */
+static int
+create_rte_from_info(struct rib_head *rnh, struct rt_addrinfo *info,
+    struct rtentry **ret_rt)
+{
+	struct sockaddr *dst, *netmask, *key;
+	struct rtentry *rt;
+
+	dst = info->rti_info[RTAX_DST];
+	netmask = info->rti_info[RTAX_NETMASK];
+
+	rt = uma_zalloc(V_rtzone, M_NOWAIT);
+	if (rt == NULL)
+		return (ENOBUFS);
+
+	/* The entry is handed back holding one reference. */
+	rt->rt_refcnt = 1;
+	rt->rt_fibnum = rnh->rib_fibnum;
+
+	if (dst->sa_len > sizeof(struct sockaddr_in6)) {
+		/* Key does not fit the embedded storage, allocate it. */
+		rt_key(rt) = malloc(dst->sa_len, M_RTABLE, M_NOWAIT);
+		if (rt_key(rt) == NULL) {
+			uma_zfree(V_rtzone, rt);
+			return (ENOBUFS);
+		}
+	} else {
+		memcpy(&rt->rt_dst, dst, dst->sa_len);
+		rt_key(rt) = &rt->rt_dst;
+	}
+
+	/* Embedded or freshly allocated key storage. */
+	key = (struct sockaddr *)rt_key(rt);
+
+	/* Store the destination, masked when a netmask is supplied. */
+	if (netmask == NULL) {
+		bcopy(dst, key, dst->sa_len);
+	} else {
+		/* TODO: verify instead of masked copy */
+		rt_maskedcopy(dst, key, netmask);
+		if (!sa_equal(dst, key)) {
+			/* contract violation: host bits set in @dst */
+			char abuf[INET6_ADDRSTRLEN];
+
+			rib_print_sockaddr(abuf, INET6_ADDRSTRLEN, dst);
+			DPRINTF("warn: masked dst != dst (%s)", abuf);
+			/* XXX: fix callers! */
+#if 0
+			uma_zfree(V_rtzone, rt);
+			return (EINVAL);
+#endif
+		}
+	}
+
+	rt->rt_weight = get_weight_from_info(info);
+	rt->rt_expire = get_expire_from_info(info);
+	rt->rte_flags = info->rti_flags & RTE_RT_FLAG_MASK;
+
+	*ret_rt = rt;
+
+	return (0);
+}
+
+/*
+ * Creates a new rtentry in @rnh's fib duplicating the key, weight,
+ * expiration, flags and nexthop pointer of @rt_orig.
+ *
+ * Returns 0 on success, storing the entry (refcount 1) in @ret_rt,
+ * ENOBUFS on allocation failure.
+ */
+int
+create_rte_from_rte(struct rib_head *rnh, struct rtentry *rt_orig,
+    struct rtentry **ret_rt)
+{
+	struct sockaddr *key_orig = rt_key(rt_orig);
+	struct rtentry *rt;
+
+	rt = uma_zalloc(V_rtzone, M_NOWAIT);
+	if (rt == NULL)
+		return (ENOBUFS);
+
+	/* The entry is handed back holding one reference. */
+	rt->rt_refcnt = 1;
+	rt->rt_fibnum = rnh->rib_fibnum;
+
+	if (key_orig->sa_len > sizeof(struct sockaddr_in6)) {
+		/* Key does not fit the embedded storage, allocate it. */
+		rt_key(rt) = malloc(key_orig->sa_len, M_RTABLE, M_NOWAIT);
+		if (rt_key(rt) == NULL) {
+			uma_zfree(V_rtzone, rt);
+			return (ENOBUFS);
+		}
+		memcpy(rt_key(rt), key_orig, key_orig->sa_len);
+	} else {
+		memcpy(&rt->rt_dst, key_orig, key_orig->sa_len);
+		rt_key(rt) = &rt->rt_dst;
+	}
+
+	rt->rt_nhop = rt_orig->rt_nhop;
+	rt->rte_flags = rt_orig->rte_flags;
+	rt->rt_expire = rt_orig->rt_expire;
+	rt->rt_weight = rt_orig->rt_weight;
+
+	*ret_rt = rt;
+
+	return (0);
+}
+
+/*
+ * Swaps the entry stored for @dst/@mask with @rt_new.
+ * Must be called with the RIB write lock held.
+ *
+ * Returns ESRCH if there was nothing to remove and ENOBUFS if the
+ * insertion failed; in the latter case the old entry is already out
+ * of the tree.
+ */
+int
+replace_rte(struct rib_head *rnh, struct sockaddr *dst, struct sockaddr *mask,
+    struct rtentry *rt_new)
+{
+
+	RIB_WLOCK_ASSERT(rnh);
+
+	if (rnh->rnh_deladdr(dst, mask, &rnh->head) == NULL)
+		return (ESRCH);
+
+	if (rnh->rnh_addaddr(dst, mask, &rnh->head, rt_new->rt_nodes) == NULL)
+		return (ENOBUFS);
+
+	return (0);
+}
+
+/*
+ * Verify that the combination of dst and gateway address families is supported.
+ *
+ * Currently accepted options:
+ * gw_af == dst_af: default option for the routes with RTF_GATEWAY
+ * gw_af == AF_LINK: IPv4/IPv6 interface routes, storing interface index in sdl.
+ * gw_af == AF_UNSPEC: was used to provide raw ethernet header. Currently not supported.
+ *
+ * Return 0 on success, errno otherwise.
+ */
+static int
+verify_gateway_family(const struct rt_addrinfo *info)
+{
+	const struct sockaddr *dst = info->rti_info[RTAX_DST];
+	const struct sockaddr *gateway = info->rti_info[RTAX_GATEWAY];
+
+	/* Nothing to compare unless both addresses are present. */
+	if (dst == NULL || gateway == NULL)
+		return (0);
+
+	if (gateway->sa_family == dst->sa_family)
+		return (0);
+	if (gateway->sa_family == AF_UNSPEC || gateway->sa_family == AF_LINK)
+		return (0);
+
+	return (EINVAL);
+}
+
+/*
+ * Checks if nhop is multipath-eligible.
+ * Avoid nhops without gateways and redirects.
+ *
+ * Returns 1 for multipath-eligible nexthop,
+ * 0 otherwise.
+ */
+int
+can_nh_multipath(const struct nhop_object *nh)
+{
+
+	/* NHF_MULTIPATH objects are always eligible. */
+	if (nh->nh_flags & NHF_MULTIPATH)
+		return (1);
+
+	/* A plain nexthop needs a gateway and must not be a redirect. */
+	if ((nh->nh_flags & NHF_GATEWAY) && !(nh->nh_flags & NHF_REDIRECT))
+		return (1);
+
+	return (0);
+}
+
+/*
+ * Get relative route priority across other routes.
+ * Interface routes (RTF_PINNED) are the highest,
+ * Normal routes go next.
+ * Redirected routes have the least priority.
+ *
+ * Returns preference as a number, higher is better.
+ *
+ */
+static uint16_t
+get_rt_preference(int rt_flags)
+{
+	uint16_t result = 0;
+
+	if (rt_flags & RTF_PINNED)
+		result |= 1 << 2;
+	/* Non-dynamic routes rank above dynamic (redirect-created) ones. */
+	if (rt_flags & RTF_DYNAMIC)
+		result |= 1;
+	else
+		result |= 1 << 1;
+
+	return (result);
+}
+
+/*
+ * Tries to add route to the RIB.
+ * Assumes @rt_new and @rt_new->rt_nhop are referenced and unlocked
+ *
+ * Return values:
+ * 0 for success. @rt and rt->rt_nhop is consumed.
+ * If @rc is supplied, unlocked operation result is saved there.
+ * != 0: Error code is returned. It is caller responsibility to free rt / rt->rt_nhop.
+ */
+static int
+add_route(struct rib_head *rnh, struct rtentry *rt_new, struct rt_addrinfo *info,
+    struct rib_cmd_info *rc)
+{
+	struct nhop_object *nh_orig, *nh_new;
+	struct sockaddr *ndst, *netmask;
+	struct rtentry *rt_orig;
+	int error;
+
+	nh_new = rt_new->rt_nhop;
+	netmask = info->rti_info[RTAX_NETMASK];
+	ndst = (struct sockaddr *)rt_key(rt_new);
+
+	rc->cmd = RTM_ADD;
+
+	RIB_WLOCK(rnh);
+	RT_LOCK(rt_new);
+
+	rt_orig = (struct rtentry *)rnh->rnh_addaddr(ndst, netmask, &rnh->head,
+	    rt_new->rt_nodes);
+
+	if (rt_orig != NULL) {
+		/* Inserted: bump the generation id. */
+		rnh->rnh_gen++;
+		/* Let the temporal routes code know about expiring entries. */
+		if (rt_new->rt_expire != 0)
+			tmproutes_update(rnh, rt_new);
+		RIB_WUNLOCK(rnh);
+
+		/* Notification: RTM_ADD, nh_old stays NULL. */
+		rc->nh_new = nh_new;
+		rc->rt = rt_new;
+		RT_UNLOCK(rt_new);
+
+		return (0);
+	}
+
+	/* Insertion failed; look the prefix up to find out why. */
+	rt_orig = (struct rtentry *)rnh->rnh_lookup(ndst, netmask, &rnh->head);
+	if (rt_orig == NULL) {
+		/*
+		 * The prefix is not there, so the failure was not
+		 * "prefix exists": rnh_addaddr could not allocate memory.
+		 */
+		RIB_WUNLOCK(rnh);
+		RT_UNLOCK(rt_new);
+		RTSTAT_INC(rts_add_algo_fail);
+		return (ENOMEM);
+	}
+
+	/* A route for this prefix is already installed. */
+	nh_orig = rt_orig->rt_nhop;
+	/* TODO: generalise to the protocol preferences */
+	if ((info->rti_flags & RTF_PINNED) && !NH_IS_PINNED(nh_orig)) {
+		/*
+		 * The new route is an interface (pinned) route and the
+		 * installed one is not: the new rte/nexthop pair wins.
+		 */
+		error = replace_rte(rnh, ndst, netmask, rt_new);
+		RT_UNLOCK(rt_new);
+		if (error == 0)
+			rnh->rnh_gen++;
+		RIB_WUNLOCK(rnh);
+
+		if (error != 0)
+			return (error);
+
+		RTSTAT_INC(rts_add_pinned);
+
+		/* Reported to subscribers as a change. */
+		rc->cmd = RTM_CHANGE;
+		rc->rt = rt_new;
+		rc->nh_new = nh_new;
+		rc->nh_old = nh_orig;
+
+		RTFREE(rt_orig);
+		nhop_free_any(nh_orig);
+
+		return (0);
+	}
+
+#ifdef ROUTE_MPATH
+	/*
+	 * Multipath is refused when it is disabled for the rib, or when
+	 * either the new or the installed nexthop is not eligible.
+	 */
+	if (!can_rib_multipath(rnh) || (can_nh_multipath(rt_new->rt_nhop) == 0) ||
+	    (can_nh_multipath(rt_orig->rt_nhop) == 0)) {
+		RIB_WUNLOCK(rnh);
+		RT_UNLOCK(rt_new);
+		RTSTAT_INC(rts_mpath_ineligible);
+		return (EEXIST);
+	}
+
+	/*
+	 * One or more routes is already in the RIB and we need to add
+	 * another one, which requires getting a new nexthop group.
+	 * Snapshot what is needed before dropping the locks.
+	 */
+	unsigned int weight_orig = rt_orig->rt_weight;
+	nh_orig = rt_orig->rt_nhop;
+	RIB_WUNLOCK(rnh);
+	RT_UNLOCK(rt_new);
+
+	error = add_route_mpath(rnh, rt_new, nh_orig, weight_orig, info, rc);
+#else
+	RIB_WUNLOCK(rnh);
+	RT_UNLOCK(rt_new);
+	error = EEXIST;
+#endif
+	return (error);
+}
+
+/*
+ * Check if specified @gw matches gw data in the nexthop @nh.
+ *
+ * Returns 1 if matches, 0 otherwise.
+ */
+int
+rib_match_nhop_gw(const struct nhop_object *nh, const struct sockaddr *gw)
+{
+	const struct sockaddr_in6 *gw6;
+	const struct sockaddr_in *gw4;
+	int match;
+
+	if (nh->gw_sa.sa_family != gw->sa_family)
+		return (0);
+
+	switch (gw->sa_family) {
+	case AF_INET:
+		gw4 = (const struct sockaddr_in *)gw;
+		match = (nh->gw4_sa.sin_addr.s_addr == gw4->sin_addr.s_addr);
+		break;
+	case AF_INET6:
+		/*
+		 * Currently (2020-01) IPv6 gws in kernel have their
+		 * scope embedded. Once this becomes false, this code
+		 * has to be revisited.
+		 */
+		gw6 = (const struct sockaddr_in6 *)gw;
+		match = IN6_ARE_ADDR_EQUAL(&nh->gw6_sa.sin6_addr,
+		    &gw6->sin6_addr) ? 1 : 0;
+		break;
+	default:
+		/* Other families: compare the raw sockaddr bytes. */
+		match = (memcmp(&nh->gw_sa, gw, nh->gw_sa.sa_len) == 0);
+		break;
+	}
+
+	return (match);
+}
+
+/*
+ * Removes the single-path route @rt from @rnh, subject to the
+ * constraints carried in @info (pinned flag, gateway, filter callback).
+ * Must be called with the RIB write lock held; @rt must not reference
+ * a multipath group.
+ *
+ * Returns 0 on success (RTF_UP is cleared on @rt), EADDRINUSE when a
+ * pinned route is targeted without RTF_PINNED, ESRCH when the gateway
+ * or the filter does not match or the entry could not be removed.
+ */
+int
+del_route_one(struct rib_head *rnh, struct rtentry *rt, struct rt_addrinfo *info)
+{
+	struct radix_node *rn;
+	struct sockaddr *gw;
+
+	RIB_WLOCK_ASSERT(rnh);
+	KASSERT((!NH_IS_MULTIPATH(rt->rt_nhop)), ("called with mpath route"));
+
+	/* TODO: generalise priorities */
+	if (NH_IS_PINNED(rt->rt_nhop) && ((info->rti_flags & RTF_PINNED) == 0)){
+		/* Pinned routes can only be removed by pinned requests. */
+		RTSTAT_INC(rts_del_fail_priority);
+		return (EADDRINUSE);
+	}
+
+	/* A gateway in the request must match the installed one. */
+	gw = info->rti_info[RTAX_GATEWAY];
+	if ((info->rti_flags & RTF_GATEWAY) && (gw != NULL) &&
+	    rib_match_nhop_gw(rt->rt_nhop, gw) == 0)
+		return (ESRCH);
+
+	/* Give the caller-supplied filter, if any, the final word. */
+	if (info->rti_filter != NULL &&
+	    info->rti_filter(rt, rt->rt_nhop, info->rti_filterdata) == 0)
+		return (ESRCH);
+
+	/* All checks passed: take the record out of the tree. */
+	rn = rnh->rnh_deladdr(info->rti_info[RTAX_DST],
+	    info->rti_info[RTAX_NETMASK], &rnh->head);
+	if (rn == NULL) {
+		/* Should not happen */
+		RTSTAT_INC(rts_del_algo_fail);
+		return (ESRCH);
+	}
+	KASSERT((struct rtentry *)rn == rt,
+	    ("rnh_deladdr returned wrong rte: expected %p got %p", rt, rn));
+
+	/* Mark rte as deleted */
+	rt->rte_flags &= ~RTF_UP;
+
+	return (0);
+}
+
+/*
+ * Tries to delete route specified by @info.
+ * Returns 0 on success.
+ * If successful, references rt, nhop and
+ * returns them unlocked.
+ */
+static int
+del_route(struct rib_head *rnh, struct rt_addrinfo *info, struct rib_cmd_info *rc)
+{
+	struct sockaddr *dst, *netmask;
+	struct rtentry *rt;
+#ifdef ROUTE_MPATH
+	struct nhgrp_object *mp;
+#endif
+	int error;
+
+	dst = info->rti_info[RTAX_DST];
+	netmask = info->rti_info[RTAX_NETMASK];
+
+	rc->cmd = RTM_DELETE;
+
+	RIB_WLOCK(rnh);
+	rt = (struct rtentry *)rnh->rnh_lookup(dst, netmask, &rnh->head);
+	if (rt == NULL) {
+		RIB_WUNLOCK(rnh);
+		return (ESRCH);
+	}
+
+	if (NH_IS_MULTIPATH(rt->rt_nhop)) {
+		/* The prefix is served by a multipath group. */
+#ifdef ROUTE_MPATH
+		if (info->rti_info[RTAX_GATEWAY] == NULL) {
+			/*
+			 * Without a gateway there is no way to tell
+			 * which path of the group has to go.
+			 */
+			RIB_WUNLOCK(rnh);
+			return (ESRCH);
+		}
+
+		mp = (struct nhgrp_object *)rt->rt_nhop;
+
+		RIB_WUNLOCK(rnh);
+
+		return (del_route_mpath(rnh, rt, mp, info, rc));
+#else
+		RIB_WUNLOCK(rnh);
+		return (ENOTSUP);
+#endif
+	}
+
+	error = del_route_one(rnh, rt, info);
+	RIB_WUNLOCK(rnh);
+	if (error != 0)
+		return (error);
+
+	RTSTAT_INC(rts_del_success);
+
+	/* Finalise notification data */
+	rc->rt = rt;
+	rc->nh_old = rt->rt_nhop;
+	rc->rt_weight = rt->rt_weight;
+
+	/*
+	 * rt was removed from the tree as well as rt_nhop.
+	 * Decrease their reference counts.
+	 */
+	NH_FREE(rt->rt_nhop);
+	RTFREE(rt);
+
+	return (0);
+}
+
+/*
+ * Clones @rt_orig when the change request in @info alters data that is
+ * stored in the rtentry itself (expiration time, weight), and stores
+ * the requested values in the clone.
+ *
+ * Returns 0 on success.  @ret receives the clone, or NULL when no
+ * rtentry-level change is needed.  Returns ENOTSUP for an expiration
+ * change on a multipath route, or the create_rte_from_rte() error.
+ */
+static int
+clone_rte_conditional(struct rib_head *rnh, struct rtentry *rt_orig,
+    struct nhop_object *nh_orig, struct rt_addrinfo *info, struct rtentry **ret)
+{
+	struct rtentry *rt_new;
+	u_long expire;
+	uint32_t weight;
+	int error;
+
+	rt_new = NULL;
+	if (info->rti_mflags & RTV_EXPIRE) {
+		if (NH_IS_MULTIPATH(nh_orig))
+			return (ENOTSUP);
+
+		expire = get_expire_from_info(info);
+		if (expire != rt_orig->rt_expire) {
+			error = create_rte_from_rte(rnh, rt_orig, &rt_new);
+			if (error != 0)
+				return (error);
+			/*
+			 * NOTE(review): add_route() calls tmproutes_update()
+			 * for expiring entries; confirm whether the change
+			 * path needs the same once the clone is installed.
+			 */
+			rt_new->rt_expire = expire;
+		}
+	}
+
+	if ((info->rti_mflags & RTV_WEIGHT) && !NH_IS_MULTIPATH(nh_orig)) {
+		weight = get_weight_from_info(info);
+		if (weight != rt_orig->rt_weight) {
+			if (rt_new == NULL) {
+				error = create_rte_from_rte(rnh, rt_orig,
+				    &rt_new);
+				if (error != 0)
+					return (error);
+			}
+			rt_new->rt_weight = weight;
+		}
+	}
+
+	*ret = rt_new;
+	return (0);
+}
+
+/*
+ * Applies the change described by @info to the route matching
+ * RTAX_DST/RTAX_NETMASK in @rnh.
+ *
+ * The route is looked up under the read lock, the replacement nexthop
+ * (and nexthop group / rtentry clone when needed) is built unlocked,
+ * and the result is installed under the write lock after checking that
+ * neither the rtentry nor its nexthop changed in between.
+ *
+ * Returns:
+ * 0 on success, with @rc nh_old/nh_new/rt filled in
+ * ESRCH if the route (or the path selected by the gateway) is not found
+ * ENOTSUP for a multipath route in a kernel without ROUTE_MPATH
+ * EAGAIN if the route changed concurrently; the caller may retry
+ * other errno values from the helpers
+ */
+static int
+change_route(struct rib_head *rnh, struct rt_addrinfo *info,
+    struct rib_cmd_info *rc)
+{
+	struct nhop_object *nh_orig, *nh_new, *nh_src, *nh_insert;
+	struct sockaddr *gw;
+	struct rtentry *rt_orig, *rt_curr, *rt_new;
+	int error;
+#ifdef ROUTE_MPATH
+	struct weightened_nhop *wn_orig, wn_new;
+	struct nhgrp_object *mp_new;
+	uint32_t changed_idx, num_nhops;
+	uint64_t modmask;
+#endif
+	RIB_RLOCK_TRACKER;
+
+	gw = info->rti_info[RTAX_GATEWAY];
+#ifdef ROUTE_MPATH
+	wn_orig = NULL;
+	changed_idx = 0;
+#endif
+
+	RIB_RLOCK(rnh);
+	rt_orig = (struct rtentry *)rnh->rnh_lookup(info->rti_info[RTAX_DST],
+	    info->rti_info[RTAX_NETMASK], &rnh->head);
+
+	if (rt_orig == NULL) {
+		RIB_RUNLOCK(rnh);
+		return (ESRCH);
+	}
+
+	nh_orig = rt_orig->rt_nhop;
+	if (NH_IS_MULTIPATH(nh_orig)) {
+#ifdef ROUTE_MPATH
+		if (gw == NULL) {
+			/* Unable to choose the proper nexthop */
+			RIB_RUNLOCK(rnh);
+			return (ESRCH);
+		}
+
+		/* Find the group member the request refers to. */
+		nh_src = NULL;
+		wn_orig = nhgrp_get_nhops((struct nhgrp_object *)nh_orig,
+		    &num_nhops);
+		for (uint32_t i = 0; i < num_nhops; i++) {
+			if (rib_match_nhop_gw(wn_orig[i].nh, gw)) {
+				nh_src = wn_orig[i].nh;
+				changed_idx = i;
+				break;
+			}
+		}
+
+		if (nh_src == NULL) {
+			RIB_RUNLOCK(rnh);
+			return (ESRCH);
+		}
+#else
+		RIB_RUNLOCK(rnh);
+		return (ENOTSUP);
+#endif
+	} else {
+		if (gw != NULL && !rib_match_nhop_gw(nh_orig, gw)) {
+			RIB_RUNLOCK(rnh);
+			return (ESRCH);
+		}
+		nh_src = nh_orig;
+	}
+
+	/*
+	 * Chosen nexthop is nh_src, original rt is rt_orig, original
+	 * nhop/nhop group is nh_orig.
+	 * Drop the lock and try to create a new nexthop and a new
+	 * nhop group if needed.
+	 */
+	RIB_RUNLOCK(rnh);
+
+	/*
+	 * Route change may request weight / expire time change.
+	 * As these changes have to be stored in rtentry and we need
+	 * to maintain immutability of most fields, we clone&insert
+	 * cloned rtentry in the rib in such cases.
+	 *
+	 * Note: rt_new CAN be NULL and is NULL for all common cases.
+	 */
+	error = clone_rte_conditional(rnh, rt_orig, nh_orig, info, &rt_new);
+	if (error != 0)
+		return (error);
+
+	error = create_nhop_from_nhop(rnh, nh_src, info, &nh_new);
+	if (error != 0) {
+		if (rt_new != NULL)
+			RTFREE(rt_new);
+		return (error);
+	}
+	/*
+	 * Print the index of the nexthop being replaced: nh_orig may be
+	 * a nexthop group here, nh_src is always a plain nexthop.
+	 */
+	DPRINTF("Update nhop: %d -> %d", nh_src->nh_priv->nh_idx,
+	    nh_new->nh_priv->nh_idx);
+
+#ifdef ROUTE_MPATH
+	mp_new = NULL;
+	if (NH_IS_MULTIPATH(nh_orig)) {
+		/*
+		 * Create mpath group with an updated nhop/weight: the
+		 * slot of nh_src is taken over by the new nexthop.
+		 */
+		wn_new.nh = nh_new;
+		if (info->rti_mflags & RTV_WEIGHT)
+			wn_new.weight = get_weight_from_info(info);
+		else
+			wn_new.weight = wn_orig[changed_idx].weight;
+
+		mp_new = nhgrp_get_replace_nhop(rnh,
+		    (struct nhgrp_object *)nh_orig, &wn_new,
+		    changed_idx, &modmask, &error);
+
+		if (mp_new == NULL) {
+			/* Release the nexthop created above. */
+			NH_FREE(nh_new);
+			if (rt_new != NULL)
+				RTFREE(rt_new);
+			return (error);
+		}
+		nh_insert = (struct nhop_object *)mp_new;
+	} else
+#endif
+		nh_insert = nh_new;
+
+	if (rt_new != NULL)
+		rt_new->rt_nhop = nh_insert;
+
+	/* Update notification metadata */
+	rc->nh_old = nh_src;
+	rc->nh_new = nh_insert;
+
+	RIB_WLOCK(rnh);
+
+	/*
+	 * Lookup route once again as it may have been changed or deleted.
+	 */
+	rt_curr = (struct rtentry *)rnh->rnh_lookup(info->rti_info[RTAX_DST],
+	    info->rti_info[RTAX_NETMASK], &rnh->head);
+
+	/* Check if anything has changed */
+	if ((rt_curr != rt_orig) || (rt_curr->rt_nhop != nh_orig)) {
+		/*
+		 * The original nexthop has changed. Free the resources
+		 * and return EAGAIN, so the caller can retry.
+		 */
+		RIB_WUNLOCK(rnh);
+		NH_FREE(nh_new);
+#ifdef ROUTE_MPATH
+		if (mp_new != NULL)
+			nhgrp_free_group(mp_new);
+#endif
+		if (rt_new != NULL)
+			RTFREE(rt_new);
+		return (EAGAIN);
+	}
+
+	if (rt_new != NULL) {
+		/* rtentry-level change: swap the whole entry. */
+		error = replace_rte(rnh, info->rti_info[RTAX_DST],
+		    info->rti_info[RTAX_NETMASK], rt_new);
+	} else {
+		/* Nexthop-only change: repoint the existing entry. */
+		RT_LOCK(rt_orig);
+		rt_orig->rt_nhop = nh_insert;
+		RT_UNLOCK(rt_orig);
+		error = 0;
+	}
+
+	if (error == 0)
+		rnh->rnh_gen++;
+	RIB_WUNLOCK(rnh);
+
+	if (error != 0) {
+		/*
+		 * Failed to install new rte with new nexthop.
+		 * Free resources.
+		 */
+		NH_FREE(nh_new);
+#ifdef ROUTE_MPATH
+		if (mp_new != NULL)
+			nhgrp_free_group(mp_new);
+#endif
+		if (rt_new != NULL)
+			RTFREE(rt_new);
+
+		return (error);
+	}
+
+	/* Remove refcount from the old nhop */
+	nhop_free_any(nh_orig);
+
+	if (rt_new != NULL) {
+		RTFREE(rt_orig);
+		rc->rt = rt_new;
+	} else
+		rc->rt = rt_orig;
+
+	return (0);
+}
+
+/*
+ * Assumes RTAX_GATEWAY is set
+ * Returns 0 on success, references ifa/ifp
+ * XXX: verify freeing refcount
+ */
+static int
+update_gateway_metadata(struct rt_addrinfo *info, int fibnum)
+{
+	int error;
+
+	KASSERT((info->rti_info[RTAX_GATEWAY] != NULL), ("gateway is NULL"));
+
+	/*
+	 * Allow the same set of rules as with route creation
+	 */
+	error = verify_gateway_family(info);
+	if (error != 0)
+		return (error);
+
+	if (info->rti_ifa == NULL) {
+		/* Resolve the ifa for the request. */
+		error = rt_getifa_fib(info, fibnum);
+		if (error != 0)
+			return (error);
+	} else {
+		ifa_ref(info->rti_ifa);
+	}
+
+	/* Default the interface to the one owning the ifa. */
+	if (info->rti_ifp == NULL)
+		info->rti_ifp = info->rti_ifa->ifa_ifp;
+	if_ref(info->rti_ifp);
+
+	return (0);
+}
+
+
+
+static void
+refine_info(struct rt_addrinfo *info)
+{
+
+	/*
+	 * If we are adding a host route then we don't want to put
+	 * a netmask in the tree, nor do we want to clone it.
+	 */
+	if (info->rti_flags & RTF_HOST)
+		info->rti_info[RTAX_NETMASK] = NULL;
+}
+
+/*
+ * Allocates rtentry and gets referenced&linked nhop.
+ *
+ * Returns 0 on success, storing rtentry with the valid nhop into @ret_rt.
+ * On failure nothing is stored and the partially built rtentry,
+ * including a separately allocated key, is released.
+ */
+static int
+create_rt_nh_pair_from_info(struct rib_head *rnh, struct rt_addrinfo *info,
+    struct rtentry **ret_rt)
+{
+	struct rtentry *rt;
+	struct nhop_object *nh;
+	int error;
+
+	error = create_rte_from_info(rnh, info, &rt);
+	if (error != 0) {
+		DPRINTF("failed to create rte: %d", error);
+		return (error);
+	}
+
+	DPRINTF("new rte %p af %d", rt, (int)(info->rti_info[RTAX_DST])->sa_family);
+
+	error = create_nhop_from_info(rnh, info, &nh);
+	if (error != 0) {
+		DPRINTF("failed to create nhop: %d", error);
+		/*
+		 * Keys that do not fit the embedded rt_dst storage are
+		 * allocated separately and have to be freed with the rte.
+		 */
+		if (rt_key(rt) != &rt->rt_dst)
+			R_Free(rt_key(rt));
+		uma_zfree(V_rtzone, rt);
+		return (error);
+	}
+
+	rt->rt_nhop = nh;
+
+	*ret_rt = rt;
+
+	return (0);
+}
+
+
+/*
+ * Adds route defined by @info into the kernel table specified by @fibnum and
+ * sa_family in @info->rti_info[RTAX_DST].
+ *
+ * Returns 0 on success and fills in operation metadata into @rc.
+ * Returns EINVAL for an RTF_GATEWAY request without a gateway or with an
+ * unsupported gateway family, EAFNOSUPPORT when rt_tables_get_rnh()
+ * yields no table for the fib/family pair, or the error reported by the
+ * helpers.
+ */
+int
+rib_add_route(u_int fibnum, struct rt_addrinfo *info, struct rib_cmd_info *rc)
+{
+	struct sockaddr *gateway, *dst;
+	struct rib_head *rnh;
+	struct rtentry *rt;
+	int ifa_referenced = 0;
+	int error;
+
+	NET_EPOCH_ASSERT_INVARIANTS();
+
+	refine_info(info);
+
+	dst = info->rti_info[RTAX_DST];
+	gateway = info->rti_info[RTAX_GATEWAY];
+
+	if ((info->rti_flags & RTF_GATEWAY) && gateway == NULL)
+		return (EINVAL);
+	error = verify_gateway_family(info);
+	if (error != 0)
+		return (error);
+
+	/* ensure route is UP */
+	info->rti_flags |= RTF_UP;
+
+	if (info->rti_ifa == NULL) {
+		/* rt_getifa_fib() references ifa upon successful completion */
+		error = rt_getifa_fib(info, fibnum);
+		if (error != 0)
+			return (error);
+		ifa_referenced = 1;
+	}
+
+	rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
+	if (rnh == NULL) {
+		/* No table for this fib/family; same errno as the lookup path. */
+		error = EAFNOSUPPORT;
+		goto drop_ifa;
+	}
+
+	error = create_rt_nh_pair_from_info(rnh, info, &rt);
+	if (error != 0)
+		goto drop_ifa;
+
+	bzero(rc, sizeof(struct rib_cmd_info));
+
+	error = add_route(rnh, rt, info, rc);
+
+	/*
+	 * If it still failed to go into the tree,
+	 * then un-make it (this should be a function)
+	 */
+	if (error != 0) {
+		NH_FREE(rt->rt_nhop);
+		if (rt_key(rt) != &rt->rt_dst)
+			R_Free(rt_key(rt));
+		uma_zfree(V_rtzone, rt);
+		goto drop_ifa;
+	}
+	RTSTAT_INC(rts_add_success);
+
+	rib_notify_subscribers(rnh, info, rc);
+
+	return (0);
+
+drop_ifa:
+	/* Only release the ifa reference taken by this function. */
+	if (ifa_referenced != 0)
+		ifa_free(info->rti_ifa);
+	return (error);
+}
+
+
+/*
+ * Removes route defined by @info from the kernel table specified by @fibnum and
+ * sa_family in @info->rti_info[RTAX_DST].
+ *
+ * Returns 0 on success and fills in operation metadata into @rc.
+ * Returns EAFNOSUPPORT when rt_tables_get_rnh() yields no table for the
+ * fib/family pair, or the error reported by del_route().
+ */
+int
+rib_del_route(u_int fibnum, struct rt_addrinfo *info, struct rib_cmd_info *rc)
+{
+	struct rib_head *rnh;
+	int error;
+
+	NET_EPOCH_ASSERT_INVARIANTS();
+
+	refine_info(info);
+
+	bzero(rc, sizeof(struct rib_cmd_info));
+
+	rnh = rt_tables_get_rnh(fibnum, get_family_from_info(info));
+	if (rnh == NULL)
+		return (EAFNOSUPPORT);
+
+	error = del_route(rnh, info, rc);
+
+	/* Subscribers only hear about successful deletions. */
+	if (error == 0)
+		rib_notify_subscribers(rnh, info, rc);
+
+	return (error);
+}
+
+
+/*
+ * Changes route properties defined by @info in the kernel table specified by
+ * @fibnum and sa_family in @info->rti_info[RTAX_DST].
+ *
+ * The change is retried up to RIB_MAX_RETRIES times while change_route()
+ * reports EAGAIN.
+ *
+ * Returns 0 on success and fills in operation metadata into @rc.
+ * Returns EAFNOSUPPORT when rt_tables_get_rnh() yields no table for the
+ * fib/family pair, or the last change_route() error.
+ */
+int
+rib_change_route(u_int fibnum, struct rt_addrinfo *info, struct rib_cmd_info *rc)
+{
+	struct rib_head *rnh;
+	int error = 0;
+
+	NET_EPOCH_ASSERT_INVARIANTS();
+
+	refine_info(info);
+
+	bzero(rc, sizeof(struct rib_cmd_info));
+
+	rnh = rt_tables_get_rnh(fibnum, get_family_from_info(info));
+	if (rnh == NULL)
+		return (EAFNOSUPPORT);
+
+	for (int i = 0; i < RIB_MAX_RETRIES; i++) {
+		error = change_route(rnh, info, rc);
+		if (error != EAGAIN)
+			break;
+	}
+
+	if (error == 0)
+		rib_notify_subscribers(rnh, info, rc);
+
+	return (error);
+}
+
+/*
+ * Returns 1 if the multipath knob is set in the vnet owning @rh,
+ * 0 otherwise.
+ */
+static int
+can_rib_multipath(struct rib_head *rh)
+{
+	int result;
+
+	CURVNET_SET(rh->rib_vnet);
+	result = !!V_rib_route_multipath;
+	CURVNET_RESTORE();
+
+	return (result);
+}
+
+/*
+ * Looks up route based on @dst and @mask.
+ *
+ * @dst: destination to lookup.
+ * @mask: route netmask for exact prefix match, can be NULL.
+ *
+ * Returns 0 on success, filling @ret with found rtentry.
+ * rtentry is returned locked.
+ */
+int
+rib_lookup_route_netmask(u_int fibnum, const struct sockaddr *dst,
+    const struct sockaddr *mask, struct rtentry **ret)
+{
+	RIB_RLOCK_TRACKER;
+	struct rib_head *rnh;
+	struct radix_node *rn;
+	struct rtentry *rt;
+
+	rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
+	if (rnh == NULL)
+		return (EAFNOSUPPORT);
+
+	RIB_RLOCK(rnh);
+
+	/* Exact match when a mask is given, longest prefix match otherwise. */
+	if (mask != NULL)
+		rn = rnh->rnh_lookup(__DECONST(void *, dst),
+		    __DECONST(void *, mask), &rnh->head);
+	else
+		rn = rnh->rnh_matchaddr(__DECONST(void *, dst), &rnh->head);
+
+	/* Nodes flagged RNF_ROOT are not reported as routes. */
+	if (rn == NULL || (rn->rn_flags & RNF_ROOT) != 0) {
+		RIB_RUNLOCK(rnh);
+		return (ESRCH);
+	}
+
+	/* Lock the entry before letting go of the table lock. */
+	rt = RNTORT(rn);
+	RT_LOCK(rt);
+	RIB_RUNLOCK(rnh);
+
+	*ret = rt;
+	return (0);
+}
+
+/*
+ * Counts the leading one bits (most significant bit first) in the first
+ * @len bits of @p and verifies that no bit is set after the first zero.
+ *
+ * Returns the number of leading one bits, or -1 if the mask is not
+ * contiguous.
+ */
+static int
+contigmask(const uint8_t *p, int len)
+{
+	int ones, pos;
+
+	/* Length of the run of set bits at the start. */
+	ones = 0;
+	while (ones < len && (p[ones / 8] & (1 << (7 - (ones % 8)))) != 0)
+		ones++;
+
+	/* Every bit past the first unset one has to be clear as well. */
+	for (pos = ones + 1; pos < len; pos++) {
+		if ((p[pos / 8] & (1 << (7 - (pos % 8)))) != 0)
+			return (-1); /* mask not contiguous */
+	}
+
+	return (ones);
+}
+
+
+/*
+ * Retrieves address and prefix from @rt.
+ * @dst: prefix dst storage. Can be NULL, if not NULL, buffer size in sa_len.
+ * @netmask: prefix mask storage. Can be NULL, if not NULL, buffer size in sa_len.
+ * @plen: CIDR len, can be NULL. -1 on failure (non-contig mask).
+ *
+ * Returns 0 on success.
+ *
+ */
+int
+rib_get_entry_prefix(const struct rtentry *rt, struct sockaddr *dst,
+    struct sockaddr *netmask, int *plen)
+{
+	const struct sockaddr *key, *mask;
+
+	key = rt_key_const(rt);
+	/* NULL when the entry carries no mask; see the checks below. */
+	mask = rt_mask_const(rt);
+
+	if (dst != NULL) {
+		if (key->sa_len > dst->sa_len)
+			return (ENOBUFS);
+		memcpy(dst, key, key->sa_len);
+	}
+
+	if (netmask != NULL) {
+		size_t copy_len;
+
+		/*
+		 * The sockaddr produced below is always key-sized, so the
+		 * caller's buffer has to be checked against the key length,
+		 * not only against the (possibly shorter) stored mask.
+		 */
+		if (key->sa_len > netmask->sa_len ||
+		    (mask != NULL && mask->sa_len > netmask->sa_len))
+			return (ENOBUFS);
+
+		bzero(netmask, key->sa_len);
+		if (mask != NULL) {
+			/*
+			 * Currently in-tree netmasks
+			 * a) do not have address family attached
+			 * b) have different notion of sa_len,
+			 *  limiting it to the amount of
+			 *  non-zero bytes in netmask to
+			 *  speedup lookup.
+			 * Copy only the bytes the mask claims to have; the
+			 * remainder of the key-sized result stays zeroed.
+			 */
+			copy_len = (mask->sa_len < key->sa_len) ?
+			    mask->sa_len : key->sa_len;
+			memcpy(netmask, mask, copy_len);
+		} else {
+			/*
+			 * No mask stored.  @plen reports the full prefix
+			 * length (32 / 128) for this case, so return the
+			 * matching all-ones mask.
+			 * NOTE(review): other address families are left
+			 * zeroed, their address layout is not known here.
+			 */
+			switch (key->sa_family) {
+			case AF_INET:
+				memset(&((struct sockaddr_in *)netmask)->sin_addr,
+				    0xff, sizeof(struct in_addr));
+				break;
+			case AF_INET6:
+				memset(&((struct sockaddr_in6 *)netmask)->sin6_addr,
+				    0xff, sizeof(struct in6_addr));
+				break;
+			default:
+				break;
+			}
+		}
+		netmask->sa_family = key->sa_family;
+		netmask->sa_len = key->sa_len;
+	}
+
+	if (plen != NULL) {
+		const uint8_t *bits = NULL;
+		int max_prefix = -1;
+
+		/* Only look into the mask when there is one. */
+		switch (key->sa_family) {
+		case AF_INET:
+			max_prefix = 32;
+			if (mask != NULL)
+				bits = (const uint8_t *)
+				    &((const struct sockaddr_in *)mask)->sin_addr;
+			break;
+		case AF_INET6:
+			max_prefix = 128;
+			if (mask != NULL)
+				bits = (const uint8_t *)
+				    &((const struct sockaddr_in6 *)mask)->sin6_addr;
+			break;
+		default:
+			break;
+		}
+
+		/* No mask: full-length prefix (-1 for unknown families). */
+		if (bits != NULL)
+			*plen = contigmask(bits, max_prefix);
+		else
+			*plen = max_prefix;
+	}
+
+	return (0);
+}
+
+/*
+ * Copies the destination (key) sockaddr of @rt into @dst, a buffer of
+ * @sa_len bytes.
+ *
+ * Returns @dst with *@error set to 0 on success, or NULL with *@error set
+ * to ENOBUFS if the buffer is too small.
+ */
+struct sockaddr *
+rib_get_entry_dst_sa(const struct rtentry *rt, struct sockaddr *dst,
+    size_t sa_len, int *error)
+{
+	const struct sockaddr *src = rt_key_const(rt);
+
+	if (src->sa_len > sa_len) {
+		*error = ENOBUFS;
+		return (NULL);
+	}
+	memcpy(dst, src, src->sa_len);
+	*error = 0;
+
+	return (dst);
+}
+
+/*
+ * Builds a key-sized netmask sockaddr for @rt in @netmask, a buffer of
+ * @sa_len bytes.
+ *
+ * Returns @netmask with *@error set to 0 on success.
+ * Returns NULL with *@error set to 0 if @rt has no mask.
+ * Returns NULL with *@error set to ENOBUFS if the buffer is too small.
+ */
+struct sockaddr *
+rib_get_entry_netmask_sa(const struct rtentry *rt, struct sockaddr *netmask,
+    size_t sa_len, int *error)
+{
+	const struct sockaddr *src = rt_mask_const(rt);
+	const struct sockaddr *dst = rt_key_const(rt);
+	size_t copy_len;
+
+	if (src == NULL) {
+		*error = 0;
+		return (NULL);
+	}
+
+	/*
+	 * dst->sa_len bytes are written below, so that is the length the
+	 * buffer must be able to hold; the original mask-length check is
+	 * kept as well.
+	 */
+	if (src->sa_len > sa_len || dst->sa_len > sa_len) {
+		*error = ENOBUFS;
+		return (NULL);
+	}
+
+	*error = 0;
+
+	/*
+	 * Currently in-tree netmasks
+	 * a) do not have address family attached
+	 * b) have different notion of sa_len,
+	 *  limiting it to the amount of
+	 *  non-zero bytes in netmask to
+	 *  speedup lookup.
+	 * Normalize by emitting a zero-padded, key-sized sockaddr carrying
+	 * the family and length of the key.
+	 */
+	bzero(netmask, dst->sa_len);
+	netmask->sa_len = dst->sa_len;
+	netmask->sa_family = dst->sa_family;
+
+	switch (dst->sa_family) {
+	case AF_INET:
+		((struct sockaddr_in *)netmask)->sin_addr =
+		    ((const struct sockaddr_in *)src)->sin_addr;
+		break;
+	case AF_INET6:
+		((struct sockaddr_in6 *)netmask)->sin6_addr =
+		    ((const struct sockaddr_in6 *)src)->sin6_addr;
+		break;
+	default:
+		/* Never read past what the stored mask claims to contain. */
+		copy_len = (src->sa_len < dst->sa_len) ?
+		    src->sa_len : dst->sa_len;
+		memcpy(netmask, src, copy_len);
+		netmask->sa_family = dst->sa_family;
+		netmask->sa_len = dst->sa_len;
+		break;
+	}
+
+	return (netmask);
+}
+
+/*
+ * Returns the prefix length of @rt.
+ *
+ * AF_INET / AF_INET6: number of leading one bits in the mask, -1 if the
+ * mask is not contiguous, 32 / 128 if the entry has no mask.
+ * Any other address family: -1.
+ */
+int
+rib_get_entry_plen(const struct rtentry *rt)
+{
+	const struct sockaddr *mask = rt_mask_const(rt);
+	const uint8_t *bits = NULL;
+	int max_prefix = -1;
+
+	/* Only look into the mask when there is one. */
+	switch ((rt_key_const(rt))->sa_family) {
+	case AF_INET:
+		max_prefix = 32;
+		if (mask != NULL)
+			bits = (const uint8_t *)
+			    &((const struct sockaddr_in *)mask)->sin_addr;
+		break;
+	case AF_INET6:
+		max_prefix = 128;
+		if (mask != NULL)
+			bits = (const uint8_t *)
+			    &((const struct sockaddr_in6 *)mask)->sin6_addr;
+		break;
+	default:
+		break;
+	}
+
+	if (bits != NULL)
+		return (contigmask(bits, max_prefix));
+
+	return (max_prefix);
+}
+
+/* Returns rt->rt_weight (converted to int). */
+int
+rib_get_entry_weight(const struct rtentry *rt)
+{
+
+	return (rt->rt_weight);
+}
+
+/* Returns the flags of @rt OR-ed with the rt_flags stored for nexthop @nh. */
+int
+rib_get_entry_rtflags(const struct rtentry *rt, const struct nhop_object *nh)
+{
+
+	return (rt->rte_flags | nh->nh_priv->rt_flags);
+}
+
+/* Returns the nexthop pointer stored in @rt. */
+const struct nhop_object *
+rib_get_entry_nhop(const struct rtentry *rt)
+{
+
+	return (rt->rt_nhop);
+}
+
+/* Returns the address family of the key (destination) of @rt. */
+sa_family_t
+rib_get_entry_family(const struct rtentry *rt)
+{
+
+	return ((rt_key_const(rt))->sa_family);
+}
+
+/* Returns the fib number stored in @rt. */
+unsigned int
+rib_get_entry_fibnum(const struct rtentry *rt)
+{
+
+	return (rt->rt_fibnum);
+}
+
+unsigned long +rib_get_entry_expire_time(const struct rtentry *rt) +{ + + return (rt->rt_expire); +} + +/* +int +rib_is_host_entry(const struct rtentry *rt) +{ + + return (rt->rt_flags & RTF_HOST); +} +*/ + + + + + + +#include "tests/routing/test_route_ctl.h" + Index: sys/net/route/route_helpers.c =================================================================== --- /dev/null +++ sys/net/route/route_helpers.c @@ -0,0 +1,387 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2020 Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ *
+ * $FreeBSD$
+ */
+
+#include "opt_inet.h"
+#include "opt_inet6.h"
+#include "opt_route.h"
+#include "opt_route_mpath.h"
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+#include
+
+/*
+ * RIB helper functions.
+ */
+
+/*
+ * Operation results generated by rib_{add,del,change}_route()
+ * can represent fairly complex operations on multiple paths, making
+ * the notification handlers much more complex than they need to be.
+ * This function serves as a helper, decomposing such notifications into
+ * a list of simple operations and calling the provided callback @cb
+ * (with @cbdata) on each primitive operation.
+ *
+ * Single-path results are passed to @cb as is.  For multipath results
+ * (handled only when built with ROUTE_MPATH):
+ *  RTM_ADD / RTM_DELETE: @cb is called once per group member whose bit is
+ *   set in rc->mask_changed.
+ *  RTM_CHANGE: old and new path lists are compared by nexthop pointer;
+ *   @cb gets RTM_CHANGE for paths whose weight differs, RTM_DELETE for
+ *   paths present only in the old list and RTM_ADD for paths present
+ *   only in the new list.
+ *
+ * Always returns 0.
+ */
+int
+rib_decompose_notification(struct rib_head *rnh, struct rt_addrinfo *info,
+    struct rib_cmd_info *rc, route_notification_t *cb, void *cbdata)
+{
+#ifdef ROUTE_MPATH
+	struct weightened_nhop *wn;
+	uint32_t num_nhops;
+#endif
+
+	DPRINTF("rnh=%p cb=%p info=%p, cmd=%d nh_old=%p nh_new=%p change_mask=%X",
+	    rnh, cb, info, rc->cmd, rc->nh_old, rc->nh_new,
+	    (uint32_t)rc->mask_changed);
+	switch (rc->cmd) {
+	case RTM_ADD:
+		if (!NH_IS_MULTIPATH(rc->nh_new)) {
+			cb(RTM_ADD, rnh, info, rc->rt, NULL, rc->nh_new,
+			    rc->rt->rt_weight, cbdata);
+			break;
+		}
+#ifdef ROUTE_MPATH
+		wn = nhgrp_get_nhops((struct nhgrp_object *)rc->nh_new,
+		    &num_nhops);
+		for (uint32_t i = 0; i < num_nhops; i++) {
+			/*
+			 * 64-bit shift: a plain int "1 << i" is undefined
+			 * for i >= 31.
+			 * NOTE(review): assumes at most 64 members per
+			 * group (RIB_MAX_MPATH_WIDTH) - confirm.
+			 */
+			if (rc->mask_changed & (1ULL << i))
+				cb(RTM_ADD, rnh, info, rc->rt, NULL, wn[i].nh,
+				    wn[i].weight, cbdata);
+		}
+#endif
+		break;
+	case RTM_DELETE:
+		if (!NH_IS_MULTIPATH(rc->nh_old)) {
+			cb(RTM_DELETE, rnh, info, rc->rt, rc->nh_old, NULL,
+			    rc->rt->rt_weight, cbdata);
+			break;
+		}
+#ifdef ROUTE_MPATH
+		wn = nhgrp_get_nhops((struct nhgrp_object *)rc->nh_old,
+		    &num_nhops);
+		for (uint32_t i = 0; i < num_nhops; i++) {
+			if (rc->mask_changed & (1ULL << i)) {
+				DPRINTF("RTM_DELETE: rnh=%p info=%p i=%d wn=%p",
+				    rnh, info, i, wn);
+				cb(RTM_DELETE, rnh, info, rc->rt, wn[i].nh,
+				    NULL, wn[i].weight, cbdata);
+			}
+		}
+#endif
+		break;
+	case RTM_CHANGE:
+		/*
+		 * Current rtsock API does not allow changing more than one
+		 * path at once. This will change in the future, as the most
+		 * efficient way of dealing with large number of multipath
+		 * routes is to allow routing daemon to have direct control
+		 * over nexthops and multipath objects.
+		 *
+		 * Additionally, there is a case with a force switch from
+		 * multipath route to the interface route. This is a corner
+		 * case, which should be infrequent.
+		 *
+		 * Given that, implement mpath <> mpath support in the easiest
+		 * way, postponing more performant implementation till other
+		 * rtsock / netlink changes.
+		 */
+#ifdef ROUTE_MPATH
+		if (NH_IS_MULTIPATH(rc->nh_old) || NH_IS_MULTIPATH(rc->nh_new)) {
+			uint32_t num_old, num_new;
+			struct weightened_nhop *wn_old, *wno, *wn_new, *wnn;
+			/*
+			 * One-element list standing in for the single-path
+			 * side; at most one side is single-path here.
+			 */
+			struct weightened_nhop tmp = { NULL, 0 };
+			struct nhgrp_object *mp;
+			uint32_t idx_old, idx_new;
+			uint64_t bmask;
+			int found;
+
+			/*
+			 * NOTE(review): the single-path weight is read from
+			 * rc->rt_weight here while the other cases use
+			 * rc->rt->rt_weight - confirm which one is intended.
+			 */
+			if (NH_IS_MULTIPATH(rc->nh_old)) {
+				mp = (struct nhgrp_object *)rc->nh_old;
+				wn_old = nhgrp_get_nhops(mp, &num_old);
+			} else {
+				tmp.nh = rc->nh_old;
+				tmp.weight = rc->rt_weight;
+				wn_old = &tmp;
+				num_old = 1;
+			}
+			if (NH_IS_MULTIPATH(rc->nh_new)) {
+				mp = (struct nhgrp_object *)rc->nh_new;
+				wn_new = nhgrp_get_nhops(mp, &num_new);
+			} else {
+				tmp.nh = rc->nh_new;
+				tmp.weight = rc->rt_weight;
+				wn_new = &tmp;
+				num_new = 1;
+			}
+
+			/*
+			 * Pass 1: walk the old paths.  bmask collects the
+			 * indexes of new paths that have a counterpart in
+			 * the old list.
+			 */
+			bmask = 0;
+			for (idx_old = 0; idx_old < num_old; idx_old++) {
+				wno = &wn_old[idx_old];
+				found = 0;
+				for (idx_new = 0; idx_new < num_new; idx_new++) {
+					wnn = &wn_new[idx_new];
+					if (wno->nh != wnn->nh)
+						continue;
+					bmask |= (1ULL << idx_new);
+					found = 1;
+					if (wno->weight != wnn->weight) {
+						cb(RTM_CHANGE, rnh, info, rc->rt,
+						    wno->nh, wnn->nh, wnn->weight,
+						    cbdata);
+					}
+					break;
+				}
+				if (found == 0) {
+					DPRINTF("RTM_DELETE: rnh=%p info=%p rt=%p wn=%p nh[%d]=%p nh_old=%p nh_new=%p",
+					    rnh, info, rc->rt, wn_old,
+					    idx_old, wno->nh, rc->nh_old,
+					    rc->nh_new);
+					cb(RTM_DELETE, rnh, info, rc->rt, wno->nh,
+					    NULL, wno->weight, cbdata);
+				}
+			}
+
+			/*
+			 * Pass 2: every new path that was NOT matched above
+			 * is a genuine addition.  Matched paths have already
+			 * been handled (or were unchanged) and are skipped.
+			 */
+			for (idx_new = 0; idx_new < num_new; idx_new++) {
+				if ((bmask & (1ULL << idx_new)) != 0)
+					continue;
+				wnn = &wn_new[idx_new];
+				cb(RTM_ADD, rnh, info, rc->rt, NULL, wnn->nh,
+				    wnn->weight, cbdata);
+			}
+
+			break;
+		}
+#endif
+
+		/* Weight changes ? */
+		cb(RTM_CHANGE, rnh, info, rc->rt, rc->nh_old, rc->nh_new,
+		    rc->rt->rt_weight, cbdata);
+		break;
+	}
+
+	return (0);
+}
+
+/*
+ * Notification callback invoking the ifa_rtrequest hook (when set) of the
+ * interface address attached to the affected nexthop(s):
+ *  RTM_ADD: RTM_ADD on the ifa of @nh_new.
+ *  RTM_DELETE: RTM_DELETE on the ifa of @nh_old.
+ *  RTM_CHANGE: nothing if both nexthops share the same ifa, otherwise
+ *   RTM_DELETE on the old ifa followed by RTM_ADD on the new one.
+ *
+ * @rnh, @weight and @cbdata are not used.
+ */
+static void
+rt_notify_ifa_handler(int cmd, struct rib_head *rnh, struct rt_addrinfo *info,
+    struct rtentry *rt, struct nhop_object *nh_old, struct nhop_object *nh_new,
+    uint32_t weight, void *cbdata)
+{
+	struct ifaddr *ifa;
+
+	switch (cmd) {
+	case RTM_ADD:
+		ifa = nh_new->nh_ifa;
+		if (ifa->ifa_rtrequest != NULL)
+			ifa->ifa_rtrequest(RTM_ADD, rt, nh_new, info);
+		break;
+	case RTM_DELETE:
+		ifa = nh_old->nh_ifa;
+		if (ifa->ifa_rtrequest != NULL)
+			ifa->ifa_rtrequest(RTM_DELETE, rt, nh_old, info);
+		break;
+	case RTM_CHANGE:
+		if (nh_old->nh_ifa == nh_new->nh_ifa)
+			break;
+		ifa = nh_old->nh_ifa;
+		if (ifa->ifa_rtrequest != NULL)
+			ifa->ifa_rtrequest(RTM_DELETE, rt, nh_old, info);
+		ifa = nh_new->nh_ifa;
+		if (ifa->ifa_rtrequest != NULL)
+			ifa->ifa_rtrequest(RTM_ADD, rt, nh_new, info);
+		break;
+	}
+}
+
+/*
+ * old_nh, new_nh, bmask
+ *
+ * ADD [] -> [1] + (NULL, new, 1)
+ * CHANGE [1] -> [2] + (old, new, 1)
+ * CHANGE [1:w1] -> [1:w2] -> ?
+ * DEL [1] (old, NULL, ?)
+ * -
+ * ADD [1] -> [1, 2] + (old, new, 2)
+ * ADD [1] -> [1, 2, 3] + (old, new, 2,3)
+ * CHANGE [1, 2] -> [3, 4] ? old_bmask?
+ * CHANGE [1, 2:w1] -> [1, 2:w2]
+ * CHANGE [1, 2:w1, 3] -> [2:w2]
+ * DEL [1, 2, 3] -> [1]
+ *
+ */
+void
+rib_notify_subscribers(struct rib_head *rnh, struct rt_addrinfo *info,
+    struct rib_cmd_info *rc)
+{
+
+	/* The helper's return value carries no information and is dropped. */
+	(void)rib_decompose_notification(rnh, info, rc, rt_notify_ifa_handler,
+	    NULL);
+}
+
+
+/*
+ * Formats the address stored in sockaddr @s into @buf (@buflen bytes).
+ * AF_INET and AF_INET6 addresses are printed with inet_ntop(); any other
+ * family is rendered as a placeholder holding the family and length.
+ *
+ * Returns the length of the produced string, not counting the final '\0'
+ * (0 if inet_ntop() fails).
+ */
+int
+rib_print_sockaddr(char *buf, int buflen, const struct sockaddr *s)
+{
+	const void *addr;
+
+	if (s->sa_family == AF_INET) {
+		addr = &((const struct sockaddr_in *)s)->sin_addr;
+	} else if (s->sa_family == AF_INET6) {
+		addr = &((const struct sockaddr_in6 *)s)->sin6_addr;
+	} else {
+		return (snprintf(buf, buflen, "unknown_af:%d:len:%d",
+		    s->sa_family, s->sa_len));
+	}
+
+	if (inet_ntop(s->sa_family, addr, buf, buflen) == NULL)
+		return (0);
+
+	return (strlen(buf));
+}
+
+/*
+ * Runs @wa_f with @arg over every entry of the routing table selected by
+ * @af and @fibnum.  Does nothing if there is no such table.
+ *
+ * The table read lock is held for the whole traversal.
+ */
+void
+rib_walk(int af, u_int fibnum, rt_walktree_f_t *wa_f, void *arg)
+{
+	RIB_RLOCK_TRACKER;
+	struct rib_head *rh;
+
+	rh = rt_tables_get_rnh(fibnum, af);
+	if (rh == NULL)
+		return;
+
+	RIB_RLOCK(rh);
+	rh->rnh_walktree(&rh->head, (walktree_f_t *)wa_f, arg);
+	RIB_RUNLOCK(rh);
+}
+
+/*
+ * Dispatches @cmd to rib_add_route(), rib_del_route() or
+ * rib_change_route() and returns the result; ENOTSUP for any other command.
+ */
+int
+rib_request(enum rib_cmd_type cmd, u_int fibnum, struct rt_addrinfo *info,
+    struct rib_cmd_info *rc)
+{
+
+	switch (cmd) {
+	case RIB_ADD:
+		return (rib_add_route(fibnum, info, rc));
+	case RIB_DEL:
+		return (rib_del_route(fibnum, info, rc));
+	case RIB_CHANGE:
+		return (rib_change_route(fibnum, info, rc));
+	default:
+		return (ENOTSUP);
+	}
+}
+
+/*
+ * Adds/removes route denoted by @dst, @mask and @gw to/from the RIB.
+ * Return 0 on success.
+ */
+int
+rib_request_simple(enum rib_cmd_type cmd, u_int fibnum, struct sockaddr *dst,
+    struct sockaddr *mask, struct sockaddr *gw, int rt_flags)
+{
+	/* Every rt_addrinfo field not named here starts out zeroed. */
+	struct rt_addrinfo info = {
+		.rti_flags = rt_flags,
+		.rti_info = {
+			[RTAX_DST] = dst,
+			[RTAX_NETMASK] = mask,
+			[RTAX_GATEWAY] = gw,
+		},
+	};
+	struct rib_cmd_info rc;
+	struct epoch_tracker et;
+	int error;
+
+	/* The request itself runs inside a net epoch section. */
+	NET_EPOCH_ENTER(et);
+	error = rib_request(cmd, fibnum, &info, &rc);
+	NET_EPOCH_EXIT(et);
+
+	return (error);
+}
+
+/*
+ * Checks if rte can be exported w.r.t. jails/vnets.
+ *
+ * Host routes are checked with prison_if() against the route key, all
+ * other routes with jailed_without_vnet().
+ *
+ * Returns 1 if it can, 0 otherwise.
+ */
+int
+rib_can_export_rte(struct ucred *td_ucred, const struct rtentry *rt)
+{
+
+	if (RT_IS_HOST_ROUTE(rt)) {
+		if (prison_if(td_ucred, rt_key_const(rt)) != 0)
+			return (0);
+	} else if (jailed_without_vnet(td_ucred)) {
+		return (0);
+	}
+
+	return (1);
+}
+
Index: sys/net/route/rtentry_var.h
===================================================================
--- /dev/null
+++ sys/net/route/rtentry_var.h
@@ -0,0 +1,146 @@
+/*-
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Copyright (c) 1980, 1986, 1993
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)route.h 8.4 (Berkeley) 1/9/95 + * $FreeBSD$ + */ + +/* + * This header contains struct rtentry definition and supporting macro. + * + * Header is not intended to be included by the code external to the + * routing subsystem. + */ + +#ifndef _NET_RTENTRY_VAR_H_ +#define _NET_RTENTRY_VAR_H_ + +#if defined(_KERNEL) + +#include + +#define rt_key_const(r) (*((const struct sockaddr * const *)(&(r)->rt_nodes->rn_key))) +#define rt_mask_const(r) (*((const struct sockaddr * const *)(&(r)->rt_nodes->rn_mask))) +struct rtentry { + struct radix_node rt_nodes[2]; /* tree glue, and other values */ + /* + * XXX struct rtentry must begin with a struct radix_node (or two!) 
+ * because the code does some casts of a 'struct radix_node *' + * to a 'struct rtentry *' + */ +#define rt_key(r) (*((struct sockaddr **)(&(r)->rt_nodes->rn_key))) +#define rt_mask(r) (*((struct sockaddr **)(&(r)->rt_nodes->rn_mask))) + /* + * 2 structures above consists of 2x6 pointers, leaving + * 4 pointers / 32 bytes in the cacheline on amd64 + */ + struct nhop_object *rt_nhop; /* nexthop data */ + union { + struct sockaddr_in rt_dst4; + struct sockaddr_in6 rt_dst6; + struct sockaddr rt_dst; + }; + + /* + * sizeof(struct sockaddr_in6) == 28 on amd64, + * however, the dataplane-relevant part (e.g. address) + * lies at offset 8..24, making it into the end of the cache line. + */ + + int rte_flags; /* up/down?, host/net */ + int rt_refcnt; /* # held references */ + u_int rt_fibnum; /* which FIB */ + u_long rt_weight; /* absolute weight */ + u_long rt_expire; /* lifetime for route, e.g. redirect */ +#define rt_endzero rt_mtx + struct mtx rt_mtx; /* mutex for routing entry */ + struct rtentry *rt_chain; /* pointer to next rtentry to delete */ + struct epoch_context rt_epoch_ctx; /* net epoch tracker */ +}; + +#define RT_LOCK_INIT(_rt) \ + mtx_init(&(_rt)->rt_mtx, "rtentry", NULL, MTX_DEF | MTX_DUPOK | MTX_NEW) +#define RT_LOCK(_rt) mtx_lock(&(_rt)->rt_mtx) +#define RT_UNLOCK(_rt) mtx_unlock(&(_rt)->rt_mtx) +#define RT_LOCK_DESTROY(_rt) mtx_destroy(&(_rt)->rt_mtx) +#define RT_LOCK_ASSERT(_rt) mtx_assert(&(_rt)->rt_mtx, MA_OWNED) +#define RT_UNLOCK_COND(_rt) do { \ + if (mtx_owned(&(_rt)->rt_mtx)) \ + mtx_unlock(&(_rt)->rt_mtx); \ +} while (0) + +#define RT_ADDREF(_rt) do { \ + RT_LOCK_ASSERT(_rt); \ + KASSERT((_rt)->rt_refcnt >= 0, \ + ("negative refcnt %d", (_rt)->rt_refcnt)); \ + (_rt)->rt_refcnt++; \ +} while (0) + +#define RT_REMREF(_rt) do { \ + RT_LOCK_ASSERT(_rt); \ + KASSERT((_rt)->rt_refcnt > 0, \ + ("bogus refcnt %d", (_rt)->rt_refcnt)); \ + (_rt)->rt_refcnt--; \ +} while (0) + +#define RTFREE_LOCKED(_rt) do { \ + if ((_rt)->rt_refcnt <= 1) \ + 
rtfree(_rt); \ + else { \ + RT_REMREF(_rt); \ + RT_UNLOCK(_rt); \ + } \ + /* guard against invalid refs */ \ + _rt = 0; \ +} while (0) + +#define RTFREE(_rt) do { \ + RT_LOCK(_rt); \ + RTFREE_LOCKED(_rt); \ +} while (0) + +#define RT_IS_UP(_rt) ((_rt)->rte_flags & RTF_UP) +#define RT_IS_HOST_ROUTE(_rt) ((_rt)->rte_flags & RTF_HOST) + +/* + * Convert a 'struct radix_node *' to a 'struct rtentry *'. + * The operation can be done safely (in this code) because a + * 'struct rtentry' starts with two 'struct radix_node''s, the first + * one representing leaf nodes in the routing tree, which is + * what the code in radix.c passes us as a 'struct radix_node'. + * + * But because there are a lot of assumptions in this conversion, + * do not cast explicitly, but always use the macro below. + */ +#define RNTORT(p) ((struct rtentry *)(p)) + +#endif /* _KERNEL */ + +#endif Index: sys/net/route/shared.h =================================================================== --- /dev/null +++ sys/net/route/shared.h @@ -0,0 +1,126 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2020 Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * Contains various definitions shared between the parts of a routing subsystem. + * + * Header is not intended to be included by the code external to the + * routing subsystem. + */ + +#ifndef _NET_ROUTE_SHARED_H_ +#define _NET_ROUTE_SHARED_H_ + +#ifdef INVARIANTS +#define NET_EPOCH_ASSERT_INVARIANTS() NET_EPOCH_ASSERT() +#else +#define NET_EPOCH_ASSERT_INVARIANTS() +#endif + +#ifdef RTDEBUG +#define DPRINTF(_fmt, ...) printf("%s: " _fmt "\n", __func__ , ## __VA_ARGS__) +#else +#define DPRINTF(_fmt, ...) 
+#endif + +struct rib_head; + +/* Shared across nexthops and nexthop groups */ +MALLOC_DECLARE(M_NHOP); + +/* Nexhops */ +int nhops_init(struct rib_head *rh); +void nhops_destroy(struct rib_head *rh); +struct nhop_object *nhop_get(struct rib_head *rh, const struct nhop_request *req); +int nhop_ref_object(struct nhop_object *nh); +int nhop_ref_any(struct nhop_object *nh); +void nhop_free_any(struct nhop_object *nh); + +void nhops_update_ifmtu(struct rib_head *rh, struct ifnet *ifp, uint32_t mtu); +int nhops_dump_sysctl(struct rib_head *rh, struct sysctl_req *w); + + +/* multipath */ +#define MPF_MULTIPATH 0x08 /* need to be consistent with NHF_MULTIPATH */ +#define MPF_LINKED 0x10 /* mpath group is linked */ + +struct nhgrp_object { + uint16_t mp_flags; /* mpath flags */ + uint8_t mp_size; /* size of mpath group used in selection */ + uint8_t spare; + struct nhop_object *nhops[0]; /* nhops */ +}; + +struct weightened_nhop { + struct nhop_object *nh; + uint32_t weight; +}; + + +/* nhgrp.c */ +int nhgrp_ctl_init(struct nh_control *ctl); +void nhgrp_ctl_free(struct nh_control *ctl); + +struct nhgrp_object; + +/* nhgrp_ctl.c */ +struct weightened_nhop *nhgrp_get_nhops(struct nhgrp_object *mp, + uint32_t *pnum_nhops); +int nhgrp_dump_sysctl(struct rib_head *rh, struct sysctl_req *w); + +struct nhgrp_object *nhgrp_get_group(struct rib_head *rh, + struct weightened_nhop *wn, int num_nhops, int *perror); +struct nhgrp_object *nhgrp_append_nhops(struct rib_head *rh, + const struct nhgrp_object *gr_orig, struct weightened_nhop *wn, + int num_nhops, uint64_t *paddmask, int *perror); +struct nhgrp_object *nhgrp_get_del_nhops(struct rib_head *rh, + const struct nhgrp_object *src, uint64_t *nhop_mask, int *perror); +struct nhgrp_object *nhgrp_get_replace_nhop(struct rib_head *rh, + const struct nhgrp_object *gr_orig, struct weightened_nhop *wn, + uint8_t replace_idx, uint64_t *pmodmask, int *perror); + +void nhgrp_free_group(struct nhgrp_object *gr); +int nhgrp_ref_group(struct 
nhgrp_object *gr); + +/* nhgrp*/ + +/* route_ctl.c */ +int can_nh_multipath(const struct nhop_object *nh); +int create_rte_from_rte(struct rib_head *rnh, struct rtentry *rt_orig, + struct rtentry **ret_rt); +int del_route_one(struct rib_head *rnh, struct rtentry *rt, + struct rt_addrinfo *info); + +int rib_match_nhop_gw(const struct nhop_object *nh, + const struct sockaddr *gw); + +#endif + + + Index: sys/net/route_temporal.c =================================================================== --- sys/net/route_temporal.c +++ sys/net/route_temporal.c @@ -40,7 +40,12 @@ #include #include #include +#include +#include +#include +#include + #include #include #include @@ -51,14 +56,17 @@ * Updates time of the next nearest route expiration as a side effect. */ static int -expire_route(const struct rtentry *rt, void *arg) +expire_route(const struct rtentry *rt, const struct nhop_object *nh, void *arg) { time_t *next_callout; + unsigned long rt_expire; - if (rt->rt_expire == 0) + rt_expire = rib_get_entry_expire_time(rt); + + if (rt_expire == 0) return (0); - if (rt->rt_expire <= time_uptime) + if (rt_expire <= time_uptime) return (1); next_callout = (time_t *)arg; @@ -67,8 +75,8 @@ * Update next_callout to determine the next ts to * run the callback at. */ - if (*next_callout == 0 || *next_callout > rt->rt_expire) - *next_callout = rt->rt_expire; + if (*next_callout == 0 || *next_callout > rt_expire) + *next_callout = rt_expire; return (0); } @@ -124,23 +132,26 @@ tmproutes_update(struct rib_head *rnh, struct rtentry *rt) { int seconds; + unsigned long rt_expire; RIB_WLOCK_ASSERT(rnh); - if (rnh->next_expire == 0 || rnh->next_expire > rt->rt_expire) { + rt_expire = rib_get_entry_expire_time(rt); + + if (rnh->next_expire == 0 || rnh->next_expire > rt_expire) { /* * Callback is not scheduled, is executing, * or is scheduled for a later time than we need. * * Schedule the one for the current @rt expiration time. 
*/ - seconds = (rt->rt_expire - time_uptime); + seconds = (rt_expire - time_uptime); if (seconds < 0) seconds = 0; callout_reset_sbt(&rnh->expire_callout, SBT_1S * seconds, SBT_1MS * 500, expire_callout, rnh, 0); - rnh->next_expire = rt->rt_expire; + rnh->next_expire = rt_expire; } } Index: sys/net/route_var.h =================================================================== --- sys/net/route_var.h +++ sys/net/route_var.h @@ -32,6 +32,15 @@ #ifndef _NET_ROUTE_VAR_H_ #define _NET_ROUTE_VAR_H_ +#ifndef RNF_NORMAL +#include +#endif + +struct nh_control; +struct nhop_request; +typedef int rnh_preadd_entry_f_t(u_int fibnum, const struct sockaddr *addr, + const struct sockaddr *mask, struct nhop_request *req); + struct rib_head { struct radix_head head; rn_matchaddr_f_t *rnh_matchaddr; /* longest match for sockaddr */ @@ -41,6 +50,7 @@ rn_walktree_t *rnh_walktree; /* traverse tree */ rn_walktree_from_t *rnh_walktree_from; /* traverse tree below a */ rn_close_t *rnh_close; /*do something when the last ref drops*/ + rnh_preadd_entry_f_t *rnh_preadd; /* hook to alter record prior to insertion */ rt_gen_t rnh_gen; /* generation counter */ int rnh_multipath; /* multipath capable ? 
*/ struct radix_node rnh_nodes[3]; /* empty tree for common case */ @@ -51,6 +61,7 @@ u_int rib_fibnum; /* fib number */ struct callout expire_callout; /* Callout for expiring dynamic routes */ time_t next_expire; /* Next expire run ts */ + struct nh_control *nh_control; /* nexthop subsystem data */ }; #define RIB_RLOCK_TRACKER struct rm_priotracker _rib_tracker @@ -74,7 +85,7 @@ CHK_STRUCT_FIELD_GENERIC(struct route, _field, _route_new, _field) #define CHK_STRUCT_ROUTE_FIELDS(_route_new) \ - _CHK_ROUTE_FIELD(_route_new, ro_rt) \ + _CHK_ROUTE_FIELD(_route_new, ro_nh) \ _CHK_ROUTE_FIELD(_route_new, ro_lle) \ _CHK_ROUTE_FIELD(_route_new, ro_prepend)\ _CHK_ROUTE_FIELD(_route_new, ro_plen) \ @@ -89,6 +100,74 @@ struct rib_head *rt_tables_get_rnh(int fib, int family); +#ifdef NEED_RTZONE +#if 0 +VNET_DECLARE(uma_zone_t, rtzone); /* Routing table UMA zone. */ +#define V_rtzone VNET(rtzone) +#endif +extern uma_zone_t rtzone; /* Routing table UMA zone. */ +#define V_rtzone rtzone +#endif + +VNET_PCPUSTAT_DECLARE(struct rtstat, rtstat); +#define RTSTAT_ADD(name, val) \ + VNET_PCPUSTAT_ADD(struct rtstat, rtstat, name, (val)) +#define RTSTAT_INC(name) RTSTAT_ADD(name, 1) + +SYSCTL_DECL(_net_route); + +/* Constants */ + +/* + * Number of times to retry the operation such as RTM_CHANGE + * on error caused by concurrent rtable changes before returning + * to userland with an error. + */ +#define RIB_MAX_RETRIES 3 + +/* + * Maximum width of the multipath group. + */ +#define RIB_MAX_MPATH_WIDTH 64 + + +/* + * With the split between the routing entry and the nexthop, + * rt_flags has to be split between these 2 entries. As rtentry + * mostly contains prefix data and is thought to be generic enough + * so one can transparently change the nexthop pointer w/o requiring + * any other rtentry changes, most of rt_flags shifts to the particular nexthop. + * / + * + * RTF_UP: rtentry, as an indication that it is linked. + * RTF_HOST: rtentry, nhop. 
The latter indication is needed for the datapath + * RTF_DYNAMIC: nhop, to make rtentry generic. + * RTF_MODIFIED: nhop, to make rtentry generic. (legacy) + * -- "native" path (nhop) properties: + * RTF_GATEWAY, RTF_STATIC, RTF_PROTO1, RTF_PROTO2, RTF_PROTO3, RTF_FIXEDMTU, + * RTF_PINNED, RTF_REJECT, RTF_BLACKHOLE, RTF_BROADCAST + */ + +/* Nexthop rt flags mask */ +#define NHOP_RT_FLAG_MASK (RTF_GATEWAY | RTF_HOST | RTF_REJECT | RTF_DYNAMIC | \ + RTF_MODIFIED | RTF_STATIC | RTF_BLACKHOLE | RTF_PROTO1 | RTF_PROTO2 | \ + RTF_PROTO3 | RTF_FIXEDMTU | RTF_PINNED | RTF_BROADCAST) + +/* rtentry rt flag mask */ +#define RTE_RT_FLAG_MASK (RTF_UP | RTF_HOST) + +/* Nexthop selection */ +#define _NH2MP(_nh) ((struct nhgrp_object *)(_nh)) +#define _SELECT_NHOP(_nh, _flowid) \ + (_NH2MP(_nh))->nhops[(_flowid) % (_NH2MP(_nh))->mp_size] +#define _RT_SELECT_NHOP(_nh, _flowid) \ + ((!NH_IS_MULTIPATH(_nh)) ? (_nh) : _SELECT_NHOP(_nh, _flowid)) +#define RT_SELECT_NHOP(_rt, _flowid) _RT_SELECT_NHOP((_rt)->rt_nhop, _flowid) + +/* Entropy data used for outbound hashing */ +#define MPATH_ENTROPY_KEY_LEN 40 +extern uint8_t mpath_entropy_key[MPATH_ENTROPY_KEY_LEN]; + /* rte<>nhop translation */ static inline uint16_t fib_rte_to_nh_flags(int rt_flags) @@ -105,8 +184,24 @@ return (res); } +/* route.c */ +struct rtentry *rtalloc1_fib(struct sockaddr *dst, int report, + u_long ignflags, u_int fibnum); void tmproutes_update(struct rib_head *rnh, struct rtentry *rt); void tmproutes_init(struct rib_head *rh); void tmproutes_destroy(struct rib_head *rh); + +/* route_ctl.c */ +int match_nhop_gw(const struct nhop_object *nh, const struct sockaddr *gw); + +/* mpath_ctl.c */ +struct nhgrp_object; + +int add_route_mpath(struct rib_head *rnh, struct rtentry *rt, + struct nhop_object *nh_orig, u_long weight_orig, struct rt_addrinfo *info, + struct rib_cmd_info *rc); +int del_route_mpath(struct rib_head *rnh, struct rtentry *rt, + struct nhgrp_object *mp_orig, struct rt_addrinfo *info, + struct 
rib_cmd_info *rc); #endif Index: sys/net/rtsock.c =================================================================== --- sys/net/rtsock.c +++ sys/net/rtsock.c @@ -32,7 +32,7 @@ * $FreeBSD$ */ #include "opt_ddb.h" -#include "opt_mpath.h" +#include "opt_route_mpath.h" #include "opt_inet.h" #include "opt_inet6.h" @@ -68,6 +68,8 @@ #include #include #include +#include +#include #include #include @@ -77,6 +79,7 @@ #include #include #endif +#include #ifdef COMPAT_FREEBSD32 #include @@ -158,8 +161,7 @@ #define RTSOCK_UNLOCK() mtx_unlock(&rtsock_mtx) #define RTSOCK_LOCK_ASSERT() mtx_assert(&rtsock_mtx, MA_OWNED) -static SYSCTL_NODE(_net, OID_AUTO, route, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, - ""); +SYSCTL_NODE(_net, OID_AUTO, route, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); struct walkarg { int w_tmemsize; @@ -168,25 +170,30 @@ struct sysctl_req *w_req; }; +struct nh_walkarg; + static void rts_input(struct mbuf *m); static struct mbuf *rtsock_msg_mbuf(int type, struct rt_addrinfo *rtinfo); static int rtsock_msg_buffer(int type, struct rt_addrinfo *rtinfo, struct walkarg *w, int *plen); static int rt_xaddrs(caddr_t cp, caddr_t cplim, struct rt_addrinfo *rtinfo); +static int sysctl_dump_rt_nhop(struct nhop_object *nh, uint32_t rt_weight, + struct nh_walkarg *nw); static int sysctl_dumpentry(struct radix_node *rn, void *vw); static int sysctl_iflist(int af, struct walkarg *w); static int sysctl_ifmalist(int af, struct walkarg *w); static int route_output(struct mbuf *m, struct socket *so, ...); -static void rt_getmetrics(const struct rtentry *rt, struct rt_metrics *out); +static void rt_getmetrics(const struct rtentry *rt, const struct nhop_object *nh, + uint32_t weight, struct rt_metrics *out); static void rt_dispatch(struct mbuf *, sa_family_t); static struct sockaddr *rtsock_fix_netmask(struct sockaddr *dst, - struct sockaddr *smask, struct sockaddr_storage *dmask); -static int handle_rtm_get(struct rt_addrinfo *info, u_int fibnum, - struct rt_msghdr *rtm, struct rtentry 
**ret_nrt); -static int update_rtm_from_rte(struct rt_addrinfo *info, - struct rt_msghdr **prtm, int alloc_len, - struct rtentry *rt); + struct sockaddr *smask, struct sockaddr_in6 *dmask); +static int handle_rtm_get(u_int fibnum, struct rt_addrinfo *info, + struct rt_msghdr **prtm, int alloc_len); +static int +update_rtm_from_rte(struct rt_addrinfo *info, struct rt_msghdr **prtm, + int alloc_len, const struct rtentry *rt, struct nhop_object *nh, uint32_t rt_weight); static void send_rtm_reply(struct socket *so, struct rt_msghdr *rtm, struct mbuf *m, sa_family_t saf, u_int fibnum, int rtm_errno); @@ -455,15 +462,15 @@ static int rtm_get_jailed(struct rt_addrinfo *info, struct ifnet *ifp, - struct rtentry *rt, union sockaddr_union *saun, struct ucred *cred) + const struct nhop_object *nh, union sockaddr_union *saun, struct ucred *cred) { #if defined(INET) || defined(INET6) struct epoch_tracker et; #endif /* First, see if the returned address is part of the jail. */ - if (prison_if(cred, rt->rt_ifa->ifa_addr) == 0) { - info->rti_info[RTAX_IFA] = rt->rt_ifa->ifa_addr; + if (prison_if(cred, nh->nh_ifa->ifa_addr) == 0) { + info->rti_info[RTAX_IFA] = nh->nh_ifa->ifa_addr; return (0); } @@ -497,7 +504,7 @@ /* * As a last resort return the 'default' jail address. */ - ia = ((struct sockaddr_in *)rt->rt_ifa->ifa_addr)-> + ia = ((struct sockaddr_in *)nh->nh_ifa->ifa_addr)-> sin_addr; if (prison_get_ip4(cred, &ia) != 0) return (ESRCH); @@ -540,7 +547,7 @@ /* * As a last resort return the 'default' jail address. */ - ia6 = ((struct sockaddr_in6 *)rt->rt_ifa->ifa_addr)-> + ia6 = ((struct sockaddr_in6 *)nh->nh_ifa->ifa_addr)-> sin6_addr; if (prison_get_ip6(cred, &ia6) != 0) return (ESRCH); @@ -570,8 +577,13 @@ fill_addrinfo(struct rt_msghdr *rtm, int len, u_int fibnum, struct rt_addrinfo *info) { int error; - sa_family_t saf; + /* + * Starting from here, it is possible + * to alter original message and insert + * caller PID and error value. 
+ */ + rtm->rtm_pid = curproc->p_pid; info->rti_addrs = rtm->rtm_addrs; @@ -594,7 +606,6 @@ (info->rti_info[RTAX_GATEWAY] != NULL && info->rti_info[RTAX_GATEWAY]->sa_family >= AF_MAX)) return (EINVAL); - saf = info->rti_info[RTAX_DST]->sa_family; /* * Verify that the caller has the appropriate privilege; RTM_GET * is the only operation the non-superuser is allowed. @@ -653,43 +664,26 @@ * Handles RTM_GET message from routing socket, returning matching rt. * * Returns: - * 0 on success, with locked and referenced matching rt in @rt_nrt + * 0 on success, with locked matching rt, nh in @ret_nrt and @ret_nh * errno of failure */ static int -handle_rtm_get(struct rt_addrinfo *info, u_int fibnum, - struct rt_msghdr *rtm, struct rtentry **ret_nrt) +handle_rtm_get(u_int fibnum, struct rt_addrinfo *info, + struct rt_msghdr **prtm, int alloc_len) { - RIB_RLOCK_TRACKER; struct rtentry *rt; - struct rib_head *rnh; - sa_family_t saf; + struct nhop_object *nh; + uint32_t rt_weight; + int error; - saf = info->rti_info[RTAX_DST]->sa_family; + error = rib_lookup_route_netmask(fibnum, info->rti_info[RTAX_DST], + info->rti_info[RTAX_NETMASK], &rt); - rnh = rt_tables_get_rnh(fibnum, saf); - if (rnh == NULL) - return (EAFNOSUPPORT); + if (error != 0) + return (error); - RIB_RLOCK(rnh); + /* rt is locked and unreferenced. */ - if (info->rti_info[RTAX_NETMASK] == NULL) { - /* - * Provide longest prefix match for - * address lookup (no mask). - * 'route -n get addr' - */ - rt = (struct rtentry *) rnh->rnh_matchaddr( - info->rti_info[RTAX_DST], &rnh->head); - } else - rt = (struct rtentry *) rnh->rnh_lookup( - info->rti_info[RTAX_DST], - info->rti_info[RTAX_NETMASK], &rnh->head); - - if (rt == NULL) { - RIB_RUNLOCK(rnh); - return (ESRCH); - } #ifdef RADIX_MPATH /* * for RTM_GET, gate is optional even with multipath. 
@@ -704,12 +698,46 @@ } } #endif + rt_weight = rt->rt_weight; + nh = rt->rt_nhop; +#ifdef ROUTE_MPATH + if (NH_IS_MULTIPATH(nh)) { + struct weightened_nhop *wn; + uint32_t num_nhops; + struct sockaddr *gw; + + nh = NULL; + gw = info->rti_info[RTAX_GATEWAY]; + wn = nhgrp_get_nhops((struct nhgrp_object *)rt->rt_nhop, + &num_nhops); + if (gw != NULL) { + for (uint32_t i = 0; i < num_nhops; i++) { + if (rib_match_nhop_gw(wn[i].nh, gw)) { + nh = wn[i].nh; + rt_weight = wn[i].weight; + break; + } + } + if (nh == NULL) { + RT_UNLOCK(rt); + return (ESRCH); + } + } else { + /* By default, use the first control plane nexthop */ + nh = wn[0].nh; + rt_weight = wn[0].weight; + } + } +#endif + /* * If performing proxied L2 entry insertion, and * the actual PPP host entry is found, perform * another search to retrieve the prefix route of * the local end point of the PPP link. */ + /* XXX: fix RTF_ANNOUNCE */ +#if 0 if (rtm->rtm_flags & RTF_ANNOUNCE) { struct sockaddr laddr; @@ -740,19 +768,24 @@ return (ESRCH); } } - RT_LOCK(rt); - RT_ADDREF(rt); - RIB_RUNLOCK(rnh); +#endif - *ret_nrt = rt; + if (!can_export_rte(curthread->td_ucred, rt)) { + RT_UNLOCK(rt); + return (ESRCH); + } - return (0); + error = update_rtm_from_rte(info, prtm, alloc_len, rt, nh, rt_weight); + RT_UNLOCK(rt); + if (error != 0) + printf("%s: ret %d\n", __func__, error); + + return (error); } /* * Update sockaddrs, flags, etc in @prtm based on @rt data. * Assumes @rt is locked. - * rtm can be reallocated. * * Returns 0 on success, along with pointer to (potentially reallocated) * rtm. 
@@ -760,36 +793,41 @@ */ static int update_rtm_from_rte(struct rt_addrinfo *info, struct rt_msghdr **prtm, - int alloc_len, struct rtentry *rt) + int alloc_len, const struct rtentry *rt, struct nhop_object *nh, uint32_t rt_weight) { - struct sockaddr_storage netmask_ss; - struct walkarg w; - union sockaddr_union saun; - struct rt_msghdr *rtm, *orig_rtm = NULL; + struct sockaddr_in6 dst, mask; struct ifnet *ifp; + struct rt_msghdr *rtm, *orig_rtm = NULL; + struct walkarg w; int error, len; + union sockaddr_union saun; RT_LOCK_ASSERT(rt); - rtm = *prtm; - - info->rti_info[RTAX_DST] = rt_key(rt); - info->rti_info[RTAX_GATEWAY] = rt->rt_gateway; - info->rti_info[RTAX_NETMASK] = rtsock_fix_netmask(rt_key(rt), - rt_mask(rt), &netmask_ss); + info->rti_info[RTAX_DST] = rib_get_entry_dst_sa(rt, + (struct sockaddr *)&dst, sizeof(dst), &error); + if (error != 0) + return (error); + info->rti_info[RTAX_NETMASK] = rib_get_entry_netmask_sa(rt, + (struct sockaddr *)&mask, sizeof(mask), &error); + if (error != 0) + return (error); info->rti_info[RTAX_GENMASK] = 0; - ifp = rt->rt_ifp; + info->rti_info[RTAX_GATEWAY] = &nh->gw_sa; + + ifp = nh->nh_ifp; + rtm = *prtm; if (rtm->rtm_addrs & (RTA_IFP | RTA_IFA)) { if (ifp) { info->rti_info[RTAX_IFP] = ifp->if_addr->ifa_addr; - error = rtm_get_jailed(info, ifp, rt, + error = rtm_get_jailed(info, ifp, nh, &saun, curthread->td_ucred); if (error != 0) return (error); if (ifp->if_flags & IFF_POINTOPOINT) info->rti_info[RTAX_BRD] = - rt->rt_ifa->ifa_dstaddr; + nh->nh_ifa->ifa_dstaddr; rtm->rtm_index = ifp->if_index; } else { info->rti_info[RTAX_IFP] = NULL; @@ -821,12 +859,14 @@ w.w_tmemsize = alloc_len; rtsock_msg_buffer(rtm->rtm_type, info, &w, &len); - if (rt->rt_flags & RTF_GWFLAG_COMPAT) + int rt_flags = rib_get_entry_rtflags(rt, nh); + /* XXX: Eliminate RTF_GWFLAG_COMPAT */ + if (rt->rte_flags & RTF_GWFLAG_COMPAT) rtm->rtm_flags = RTF_GATEWAY | - (rt->rt_flags & ~RTF_GWFLAG_COMPAT); + (rt_flags & ~RTF_GWFLAG_COMPAT); else - 
rtm->rtm_flags = rt->rt_flags; - rt_getmetrics(rt, &rtm->rtm_rmx); + rtm->rtm_flags = rt_flags; + rt_getmetrics(rt, nh, rt_weight, &rtm->rtm_rmx); rtm->rtm_addrs = info->rti_addrs; if (orig_rtm != NULL) @@ -841,11 +881,10 @@ route_output(struct mbuf *m, struct socket *so, ...) { struct rt_msghdr *rtm = NULL; - struct rtentry *rt = NULL; struct rt_addrinfo info; + struct sockaddr_storage ss; struct epoch_tracker et; #ifdef INET6 - struct sockaddr_storage ss; struct sockaddr_in6 *sin6; int i, rti_need_deembed = 0; #endif @@ -909,35 +948,50 @@ goto flush; } - switch (rtm->rtm_type) { - struct rtentry *saved_nrt; + struct rib_cmd_info rc; + bzero(&rc, sizeof(rc)); + switch (rtm->rtm_type) { case RTM_ADD: case RTM_CHANGE: if (rtm->rtm_type == RTM_ADD) { if (info.rti_info[RTAX_GATEWAY] == NULL) senderr(EINVAL); - } - saved_nrt = NULL; - error = rtrequest1_fib(rtm->rtm_type, &info, &saved_nrt, - fibnum); - if (error == 0 && saved_nrt != NULL) { + error = rib_add_route(fibnum, &info, &rc); + } else + error = rib_change_route(fibnum, &info, &rc); + if (error == 0) { #ifdef INET6 rti_need_deembed = (V_deembed_scopeid) ? 1 : 0; #endif - RT_LOCK(saved_nrt); - rtm->rtm_index = saved_nrt->rt_ifp->if_index; - RT_REMREF(saved_nrt); - RT_UNLOCK(saved_nrt); +#ifdef ROUTE_MPATH + if (NH_IS_MULTIPATH(rc.nh_new) && rc.mask_changed) { + uint32_t num_nhops, idx; + struct weightened_nhop *wn; + + /* Find the index of the added nhop. 
*/ + idx = ffsll(rc.mask_changed) - 1; + wn = nhgrp_get_nhops((struct nhgrp_object *)rc.nh_new, + &num_nhops); + rtm->rtm_index = wn[idx].nh->nh_ifp->if_index; + } else +#endif + rtm->rtm_index = rc.nh_new->nh_ifp->if_index; } break; case RTM_DELETE: - saved_nrt = NULL; - error = rtrequest1_fib(RTM_DELETE, &info, &saved_nrt, fibnum); + error = rib_del_route(fibnum, &info, &rc); if (error == 0) { - RT_LOCK(saved_nrt); - rt = saved_nrt; + /* XXX: mpath */ + if (can_export_rte(curthread->td_ucred, rc.rt)) { + RT_LOCK(rc.rt); + error = update_rtm_from_rte(&info, &rtm, + alloc_len, rc.rt, rc.nh_old, + rc.rt->rt_weight); + RT_UNLOCK(rc.rt); + } else + error = ESRCH; goto report; } #ifdef INET6 @@ -947,17 +1001,12 @@ break; case RTM_GET: - error = handle_rtm_get(&info, fibnum, rtm, &rt); + /* XXX: verify deembed on errors */ + error = handle_rtm_get(fibnum, &info, &rtm, alloc_len); if (error != 0) senderr(error); report: - RT_LOCK_ASSERT(rt); - if (!can_export_rte(curthread->td_ucred, rt)) { - RT_UNLOCK(rt); - senderr(ESRCH); - } - error = update_rtm_from_rte(&info, &rtm, alloc_len, rt); /* * Note that some sockaddr pointers may have changed to * point to memory outsize @rtm. Some may be pointing @@ -973,7 +1022,6 @@ #ifdef INET6 rti_need_deembed = 0; #endif - RT_UNLOCK(rt); if (error != 0) senderr(error); break; @@ -984,8 +1032,6 @@ flush: NET_EPOCH_EXIT(et); - if (rt != NULL) - RTFREE(rt); #ifdef INET6 if (rtm != NULL) { @@ -1069,13 +1115,14 @@ static void -rt_getmetrics(const struct rtentry *rt, struct rt_metrics *out) +rt_getmetrics(const struct rtentry *rt, const struct nhop_object *nh, + uint32_t rt_weight, struct rt_metrics *out) { bzero(out, sizeof(*out)); - out->rmx_mtu = rt->rt_mtu; - out->rmx_weight = rt->rt_weight; - out->rmx_pksent = counter_u64_fetch(rt->rt_pksent); + out->rmx_mtu = nh->nh_mtu; + out->rmx_weight = rt_weight; + out->rmx_pksent = nhop_get_idx(nh); /* Kernel -> userland timebase conversion. */ out->rmx_expire = rt->rt_expire ? 
rt->rt_expire - time_uptime + time_second : 0; @@ -1126,23 +1173,84 @@ /* * Fill in @dmask with valid netmask leaving original @smask - * intact. Mostly used with radix netmasks. + * intact. Used with radix-originated netmasks. */ static struct sockaddr * rtsock_fix_netmask(struct sockaddr *dst, struct sockaddr *smask, - struct sockaddr_storage *dmask) + struct sockaddr_in6 *dmask) { if (dst == NULL || smask == NULL) return (NULL); - memset(dmask, 0, dst->sa_len); - memcpy(dmask, smask, smask->sa_len); - dmask->ss_len = dst->sa_len; - dmask->ss_family = dst->sa_family; + if (dst->sa_len > sizeof(struct sockaddr_in6)) { + printf("NETMASK SA_LEN: %d\n", dst->sa_len); + return (NULL); + } + bzero(dmask, dst->sa_len); + dmask->sin6_len = dst->sa_len; + dmask->sin6_family = dst->sa_family; - return ((struct sockaddr *)dmask); + switch (dst->sa_family) { + case AF_INET: + ((struct sockaddr_in *)dmask)->sin_addr = + ((struct sockaddr_in *)smask)->sin_addr; + return ((struct sockaddr *)dmask); + case AF_INET6: + ((struct sockaddr_in6 *)dmask)->sin6_addr = + ((struct sockaddr_in6 *)smask)->sin6_addr; + return ((struct sockaddr *)dmask); + } + + return (smask); } +#ifdef COMPAT_FREEBSD32 +#define SA_SIZE_COMPAT(_sa, _compat32) { \ + if (_compat32) \ + SA_SIZE32(sa); \ + else \ + SA_SIZE(_sa); \ +} +#else +#define SA_SIZE_COMPAT(_sa, _compat32) SA_SIZE(_sa) +#endif + + +#if 0 +inline static struct sockaddr * +prepare_sockaddr(struct rt_addrinfo *info, int addr, struct sockaddr *buf, + int buflen, int *sa_len, int compat32, int deembed) +{ + struct sockaddr *sa = info->rti_info[addr]; + + if (addr = RTAX_NETMASK) { + struct sockaddr *dst = info->rti_info[RTAX_DST]; + *sa_len = SA_SIZE_COMPAT(dst, compat32); + + if (buf == NULL) + return (NULL); + return (rtsock_fix_netmask(dst, sa, buf)); + } + + *sa_len = SA_SIZE_COMPAT(sa, compat32); + if (buf == NULL) + return (NULL); +#ifdef INET6 + if ((sa->sa_family == AF_INET6) && deembed) { + struct sockaddr_in6 *sin6 = (struct 
sockaddr_in6 *)sa; + if ((IN6_IS_SCOPE_LINKLOCAL(&sin6->sin6_addr) || + IN6_IS_ADDR_MC_INTFACELOCAL(&sin6->sin6_addr))){ + memcpy(buf, sin6, sin6->sin6_len); + } + + } + +#else + return (sa); +#endif +} +#endif + /* * Writes information related to @rtinfo object to newly-allocated mbuf. * Assumes MCLBYTES is enough to construct any message. @@ -1220,9 +1328,15 @@ m_freem(m); return (NULL); } + + /* + * The following 3 fields are the only fields shared + * by the rtsock messages. + */ rtm->rtm_msglen = len; rtm->rtm_version = RTM_VERSION; rtm->rtm_type = type; + return (m); } @@ -1441,7 +1555,7 @@ struct mbuf *m; struct ifa_msghdr *ifam; struct ifnet *ifp = ifa->ifa_ifp; - struct sockaddr_storage ss; + struct sockaddr_in6 ss; if (V_route_cb.any_count == 0) return (0); @@ -1481,10 +1595,9 @@ * Returns 0 on success. */ int -rtsock_routemsg(int cmd, struct rtentry *rt, struct ifnet *ifp, int rti_addrs, - int fibnum) +rtsock_routemsg(int cmd, struct rtentry *rt, struct nhop_object *nh, int fibnum) { - struct sockaddr_storage ss; + struct sockaddr_in6 ss; struct rt_addrinfo info; if (V_route_cb.any_count == 0) @@ -1493,9 +1606,9 @@ bzero((caddr_t)&info, sizeof(info)); info.rti_info[RTAX_DST] = rt_key(rt); info.rti_info[RTAX_NETMASK] = rtsock_fix_netmask(rt_key(rt), rt_mask(rt), &ss); - info.rti_info[RTAX_GATEWAY] = rt->rt_gateway; - info.rti_flags = rt->rt_flags; - info.rti_ifp = ifp; + info.rti_info[RTAX_GATEWAY] = &nh->gw_sa; + info.rti_flags = rib_get_entry_rtflags(rt, nh); + info.rti_ifp = nh->nh_aifp; return (rtsock_routemsg_info(cmd, &info, fibnum)); } @@ -1695,7 +1808,7 @@ can_export_rte(struct ucred *td_ucred, const struct rtentry *rt) { - if ((rt->rt_flags & RTF_HOST) == 0 + if (!RT_IS_HOST_ROUTE(rt) ? jailed_without_vnet(td_ucred) : prison_if(td_ucred, rt_key_const(rt)) != 0) return (0); @@ -1705,32 +1818,83 @@ /* * This is used in dumping the kernel table via sysctl(). 
*/ +struct nh_walkarg { + struct walkarg *w; + struct rtentry *rt; +}; + static int sysctl_dumpentry(struct radix_node *rn, void *vw) { struct walkarg *w = vw; struct rtentry *rt = (struct rtentry *)rn; - int error = 0, size; - struct rt_addrinfo info; - struct sockaddr_storage ss; + struct nhop_object *nh; + struct nh_walkarg nw; + int rt_flags; NET_EPOCH_ASSERT(); - if (w->w_op == NET_RT_FLAGS && !(rt->rt_flags & w->w_arg)) + nh = rt->rt_nhop; + rt_flags = rib_get_entry_rtflags(rt, nh); + + if (w->w_op == NET_RT_FLAGS && !(rt_flags & w->w_arg)) return 0; if (!can_export_rte(w->w_req->td->td_ucred, rt)) return (0); + + nw.w = w; + nw.rt = rt; + + if (!NH_IS_MULTIPATH(nh)) + return (sysctl_dump_rt_nhop(nh, rib_get_entry_weight(rt), &nw)); + +#ifdef ROUTE_MPATH + struct weightened_nhop *wn; + uint32_t num_nhops; + int error; + + wn = nhgrp_get_nhops((struct nhgrp_object *)nh, &num_nhops); + for (uint32_t i = 0; i < num_nhops; i++) { + error = sysctl_dump_rt_nhop(wn[i].nh, wn[i].weight, &nw); + if (error != 0) + return (error); + } +#endif + + return (0); +} + + +__noinline static int +sysctl_dump_rt_nhop(struct nhop_object *nh, uint32_t rt_weight, + struct nh_walkarg *nw) +{ + int error = 0, size; + struct walkarg *w = nw->w; + struct rtentry *rt = nw->rt; + struct rt_addrinfo info; + struct sockaddr_in6 dst, netmask; + int rt_flags; + + rt_flags = rib_get_entry_rtflags(rt, nh); + bzero((caddr_t)&info, sizeof(info)); - info.rti_info[RTAX_DST] = rt_key(rt); - info.rti_info[RTAX_GATEWAY] = rt->rt_gateway; - info.rti_info[RTAX_NETMASK] = rtsock_fix_netmask(rt_key(rt), - rt_mask(rt), &ss); + rib_get_entry_dst_sa(rt, (struct sockaddr *)&dst, sizeof(dst), &error); + info.rti_info[RTAX_DST] = rib_get_entry_dst_sa(rt, + (struct sockaddr *)&dst, sizeof(dst), &error); + if (error != 0) + return (error); + info.rti_info[RTAX_NETMASK] = rib_get_entry_netmask_sa(rt, + (struct sockaddr *)&netmask, sizeof(netmask), &error); + if (error != 0) + return (error); 
info.rti_info[RTAX_GENMASK] = 0; - if (rt->rt_ifp && !(rt->rt_ifp->if_flags & IFF_DYING)) { - info.rti_info[RTAX_IFP] = rt->rt_ifp->if_addr->ifa_addr; - info.rti_info[RTAX_IFA] = rt->rt_ifa->ifa_addr; - if (rt->rt_ifp->if_flags & IFF_POINTOPOINT) - info.rti_info[RTAX_BRD] = rt->rt_ifa->ifa_dstaddr; + info.rti_info[RTAX_GATEWAY] = &nh->gw_sa; + if (nh->nh_ifp && !(nh->nh_ifp->if_flags & IFF_DYING)) { + info.rti_info[RTAX_IFP] = nh->nh_ifp->if_addr->ifa_addr; + info.rti_info[RTAX_IFA] = nh->nh_ifa->ifa_addr; + if (nh->nh_ifp->if_flags & IFF_POINTOPOINT) + info.rti_info[RTAX_BRD] = nh->nh_ifa->ifa_dstaddr; } if ((error = rtsock_msg_buffer(RTM_GET, &info, w, &size)) != 0) return (error); @@ -1739,13 +1903,13 @@ bzero(&rtm->rtm_index, sizeof(*rtm) - offsetof(struct rt_msghdr, rtm_index)); - if (rt->rt_flags & RTF_GWFLAG_COMPAT) + if (rt_flags & RTF_GWFLAG_COMPAT) rtm->rtm_flags = RTF_GATEWAY | - (rt->rt_flags & ~RTF_GWFLAG_COMPAT); + (rt_flags & ~RTF_GWFLAG_COMPAT); else - rtm->rtm_flags = rt->rt_flags; - rt_getmetrics(rt, &rtm->rtm_rmx); - rtm->rtm_index = rt->rt_ifp->if_index; + rtm->rtm_flags = rt_flags; + rt_getmetrics(rt, nh, rt_weight, &rtm->rtm_rmx); + rtm->rtm_index = nh->nh_ifp->if_index; rtm->rtm_addrs = info.rti_addrs; error = SYSCTL_OUT(w->w_req, (caddr_t)rtm, size); return (error); @@ -1901,7 +2065,7 @@ struct if_data ifd; struct rt_addrinfo info; int len, error = 0; - struct sockaddr_storage ss; + struct sockaddr_in6 ss; bzero((caddr_t)&info, sizeof(info)); bzero(&ifd, sizeof(ifd)); @@ -2025,7 +2189,7 @@ namelen--; if (req->newptr) return (EPERM); - if (name[1] == NET_RT_DUMP) { + if (name[1] == NET_RT_DUMP || name[1] == NET_RT_NHOP || name[1] == NET_RT_NHGROUPS) { if (namelen == 3) fib = req->td->td_proc->p_fibnum; else if (namelen == 4) @@ -2092,7 +2256,30 @@ error = EAFNOSUPPORT; } break; - + case NET_RT_NHOP: + case NET_RT_NHGROUPS: + /* Allow dumping one specific af/fib at a time */ + if (namelen < 4) { + error = EINVAL; + break; + } + fib = name[3]; 
+ if (fib < 0 || fib > rt_numfibs) { + error = EINVAL; + break; + } + rnh = rt_tables_get_rnh(fib, af); + if (rnh == NULL) { + error = EAFNOSUPPORT; + break; + } + if (w.w_op == NET_RT_NHOP) + error = nhops_dump_sysctl(rnh, w.w_req); +#ifdef ROUTE_MPATH + else + error = nhgrp_dump_sysctl(rnh, w.w_req); +#endif + break; case NET_RT_IFLIST: case NET_RT_IFLISTL: error = sysctl_iflist(af, &w); @@ -2215,31 +2402,34 @@ static int rt_dumpentry_ddb(struct radix_node *rn, void *arg __unused) { - struct sockaddr_storage ss; + struct sockaddr_in6 ss; struct rtentry *rt; + struct nhop_object *nh; int flags, idx; /* If RNTORT is important, put it in a header. */ rt = (void *)rn; + /* XXX: mpath */ + nh = rt->rt_nhop; rt_dumpaddr_ddb("dst", rt_key(rt)); - rt_dumpaddr_ddb("gateway", rt->rt_gateway); + rt_dumpaddr_ddb("gateway", &nh->gw_sa); rt_dumpaddr_ddb("netmask", rtsock_fix_netmask(rt_key(rt), rt_mask(rt), &ss)); - if (rt->rt_ifp != NULL && (rt->rt_ifp->if_flags & IFF_DYING) == 0) { - rt_dumpaddr_ddb("ifp", rt->rt_ifp->if_addr->ifa_addr); - rt_dumpaddr_ddb("ifa", rt->rt_ifa->ifa_addr); + if (nh->nh_ifp != NULL && (nh->nh_ifp->if_flags & IFF_DYING) == 0) { + rt_dumpaddr_ddb("ifp", nh->nh_ifp->if_addr->ifa_addr); + rt_dumpaddr_ddb("ifa", nh->nh_ifa->ifa_addr); } db_printf("flags "); - flags = rt->rt_flags; + flags = rt->rte_flags; if (flags == 0) db_printf("none"); while ((idx = ffs(flags)) > 0) { idx--; - if (flags != rt->rt_flags) + if (flags != rt->rte_flags) db_printf(","); db_printf("%s", rt_flag_name(idx)); @@ -2522,7 +2712,7 @@ db_printf("Looking up route to destination '%s'\n", bp); CURVNET_SET(vnet0); - rt = rtalloc1(dstp, 0, RTF_RNH_LOCKED); + rt = rtalloc1_fib(dstp, 0, RTF_RNH_LOCKED, 0); CURVNET_RESTORE(); if (rt == NULL) { Index: sys/netinet/icmp6.h =================================================================== --- sys/netinet/icmp6.h +++ sys/netinet/icmp6.h @@ -693,7 +693,7 @@ #ifdef _KERNEL # ifdef __STDC__ -struct rtentry; +struct nhop_object; struct 
rttimer; struct in6_multi; # endif @@ -705,7 +705,7 @@ void icmp6_slowtimo(void); void icmp6_prepare(struct mbuf *); void icmp6_redirect_input(struct mbuf *, int); -void icmp6_redirect_output(struct mbuf *, struct rtentry *); +void icmp6_redirect_output(struct mbuf *, struct nhop_object *); struct ip6ctlparam; void icmp6_mtudisc_update(struct ip6ctlparam *, int); Index: sys/netinet/in_fib.h =================================================================== --- sys/netinet/in_fib.h +++ sys/netinet/in_fib.h @@ -32,6 +32,19 @@ #ifndef _NETINET_IN_FIB_H_ #define _NETINET_IN_FIB_H_ +struct route_in { + /* common fields shared among all 'struct route' */ + struct nhop_object *ro_nh; + struct llentry *ro_lle; + char *ro_prepend; + uint16_t ro_plen; + uint16_t ro_flags; + uint16_t ro_mtu; /* saved ro_rt mtu */ + uint16_t spare; + /* custom sockaddr */ + struct sockaddr_in ro_dst4; +}; + /* Basic nexthop info used for uRPF/mtu checks */ struct nhop4_basic { struct ifnet *nh_ifp; /* Logical egress interface */ @@ -57,6 +70,17 @@ int fib4_lookup_nh_ext(uint32_t fibnum, struct in_addr dst, uint32_t flags, uint32_t flowid, struct nhop4_extended *pnh4); void fib4_free_nh_ext(uint32_t fibnum, struct nhop4_extended *pnh4); + +struct nhop_object *fib4_lookup_nh_ptr(uint32_t fibnum, struct in_addr dst, + uint32_t scopeid, uint32_t flags, uint32_t flowid); +int fib4_lookup_nh_route(uint32_t fibnum, struct route_in *ro4, uint32_t flags, + uint32_t flowid); +int fib4_lookup_urpf(uint32_t fibnum, struct in_addr dst, uint32_t scopeid, + uint32_t flags, const struct ifnet *src_if); + +uint32_t fib4_calc_software_hash(struct in_addr src, struct in_addr dst, + unsigned short src_port, unsigned short dst_port, char proto, + uint32_t *phashtype); #endif Index: sys/netinet/in_fib.c =================================================================== --- sys/netinet/in_fib.c +++ sys/netinet/in_fib.c @@ -33,6 +33,7 @@ #include "opt_inet.h" #include "opt_route.h" #include "opt_mpath.h" +#include 
"opt_route_mpath.h" #include #include @@ -49,70 +50,98 @@ #include #include #include +#include +#include +#include #include -#ifdef RADIX_MPATH -#include -#endif - #include #include #include +#include #ifdef INET -static void fib4_rte_to_nh_basic(struct rtentry *rte, struct in_addr dst, + +/* Verify struct route compatiblity */ +/* Assert 'struct route_in' is compatible with 'struct route' */ +CHK_STRUCT_ROUTE_COMPAT(struct route_in, ro_dst4); + +static void fib4_rte_to_nh_basic(struct nhop_object *nh, struct in_addr dst, uint32_t flags, struct nhop4_basic *pnh4); -static void fib4_rte_to_nh_extended(struct rtentry *rte, struct in_addr dst, +static void fib4_rte_to_nh_extended(struct nhop_object *nh, struct in_addr dst, uint32_t flags, struct nhop4_extended *pnh4); -#define RNTORT(p) ((struct rtentry *)(p)) +#ifdef ROUTE_MPATH +struct _hash_5tuple_ipv4 { + struct in_addr src; + struct in_addr dst; + unsigned short src_port; + unsigned short dst_port; + char proto; + char spare[3]; +}; +_Static_assert(sizeof(struct _hash_5tuple_ipv4) == 16, + "_hash_5tuple_ipv4 size is wrong"); + + +uint32_t +fib4_calc_software_hash(struct in_addr src, struct in_addr dst, + unsigned short src_port, unsigned short dst_port, char proto, + uint32_t *phashtype) +{ + struct _hash_5tuple_ipv4 data; + + data.src = src; + data.dst = dst; + data.src_port = src_port; + data.dst_port = dst_port; + data.proto = proto; + data.spare[0] = data.spare[1] = data.spare[2] = 0; + + *phashtype = M_HASHTYPE_OPAQUE; + + return (toeplitz_hash(MPATH_ENTROPY_KEY_LEN, mpath_entropy_key, + sizeof(data), (uint8_t *)&data)); +} +#endif + static void -fib4_rte_to_nh_basic(struct rtentry *rte, struct in_addr dst, +fib4_rte_to_nh_basic(struct nhop_object *nh, struct in_addr dst, uint32_t flags, struct nhop4_basic *pnh4) { - struct sockaddr_in *gw; if ((flags & NHR_IFAIF) != 0) - pnh4->nh_ifp = rte->rt_ifa->ifa_ifp; + pnh4->nh_ifp = nh->nh_ifa->ifa_ifp; else - pnh4->nh_ifp = rte->rt_ifp; - pnh4->nh_mtu = 
min(rte->rt_mtu, rte->rt_ifp->if_mtu); - if (rte->rt_flags & RTF_GATEWAY) { - gw = (struct sockaddr_in *)rte->rt_gateway; - pnh4->nh_addr = gw->sin_addr; - } else + pnh4->nh_ifp = nh->nh_ifp; + pnh4->nh_mtu = nh->nh_mtu; + if (nh->nh_flags & NHF_GATEWAY) + pnh4->nh_addr = nh->gw4_sa.sin_addr; + else pnh4->nh_addr = dst; /* Set flags */ - pnh4->nh_flags = fib_rte_to_nh_flags(rte->rt_flags); - gw = (struct sockaddr_in *)rt_key(rte); - if (gw->sin_addr.s_addr == 0) - pnh4->nh_flags |= NHF_DEFAULT; + pnh4->nh_flags = nh->nh_flags; /* TODO: Handle RTF_BROADCAST here */ } static void -fib4_rte_to_nh_extended(struct rtentry *rte, struct in_addr dst, +fib4_rte_to_nh_extended(struct nhop_object *nh, struct in_addr dst, uint32_t flags, struct nhop4_extended *pnh4) { - struct sockaddr_in *gw; if ((flags & NHR_IFAIF) != 0) - pnh4->nh_ifp = rte->rt_ifa->ifa_ifp; + pnh4->nh_ifp = nh->nh_ifa->ifa_ifp; else - pnh4->nh_ifp = rte->rt_ifp; - pnh4->nh_mtu = min(rte->rt_mtu, rte->rt_ifp->if_mtu); - if (rte->rt_flags & RTF_GATEWAY) { - gw = (struct sockaddr_in *)rte->rt_gateway; - pnh4->nh_addr = gw->sin_addr; - } else + pnh4->nh_ifp = nh->nh_ifp; + pnh4->nh_mtu = nh->nh_mtu; + if (nh->nh_flags & NHF_GATEWAY) + pnh4->nh_addr = nh->gw4_sa.sin_addr; + else pnh4->nh_addr = dst; /* Set flags */ - pnh4->nh_flags = fib_rte_to_nh_flags(rte->rt_flags); - gw = (struct sockaddr_in *)rt_key(rte); - if (gw->sin_addr.s_addr == 0) - pnh4->nh_flags |= NHF_DEFAULT; - pnh4->nh_ia = ifatoia(rte->rt_ifa); + pnh4->nh_flags = nh->nh_flags; + pnh4->nh_ia = ifatoia(nh->nh_ifa); pnh4->nh_src = IA_SIN(pnh4->nh_ia)->sin_addr; } @@ -135,7 +164,7 @@ struct rib_head *rh; struct radix_node *rn; struct sockaddr_in sin; - struct rtentry *rte; + struct nhop_object *nh; KASSERT((fibnum < rt_numfibs), ("fib4_lookup_nh_basic: bad fibnum")); rh = rt_tables_get_rnh(fibnum, AF_INET); @@ -150,10 +179,10 @@ RIB_RLOCK(rh); rn = rh->rnh_matchaddr((void *)&sin, &rh->head); if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) { - 
rte = RNTORT(rn); + nh = RT_SELECT_NHOP((RNTORT(rn)), flowid); /* Ensure route & ifp is UP */ - if (RT_LINK_IS_UP(rte->rt_ifp)) { - fib4_rte_to_nh_basic(rte, dst, flags, pnh4); + if (RT_LINK_IS_UP(nh->nh_ifp)) { + fib4_rte_to_nh_basic(nh, dst, flags, pnh4); RIB_RUNLOCK(rh); return (0); @@ -183,8 +212,8 @@ RIB_RLOCK_TRACKER; struct rib_head *rh; struct radix_node *rn; + struct nhop_object *nh; struct sockaddr_in sin; - struct rtentry *rte; KASSERT((fibnum < rt_numfibs), ("fib4_lookup_nh_ext: bad fibnum")); rh = rt_tables_get_rnh(fibnum, AF_INET); @@ -193,23 +222,18 @@ /* Prepare lookup key */ memset(&sin, 0, sizeof(sin)); + sin.sin_family = AF_INET; sin.sin_len = sizeof(struct sockaddr_in); sin.sin_addr = dst; + nh = NULL; RIB_RLOCK(rh); rn = rh->rnh_matchaddr((void *)&sin, &rh->head); if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) { - rte = RNTORT(rn); -#ifdef RADIX_MPATH - rte = rt_mpath_select(rte, flowid); - if (rte == NULL) { - RIB_RUNLOCK(rh); - return (ENOENT); - } -#endif + nh = RT_SELECT_NHOP((RNTORT(rn)), flowid); /* Ensure route & ifp is UP */ - if (RT_LINK_IS_UP(rte->rt_ifp)) { - fib4_rte_to_nh_extended(rte, dst, flags, pnh4); + if (RT_LINK_IS_UP(nh->nh_ifp)) { + fib4_rte_to_nh_extended(nh, dst, flags, pnh4); if ((flags & NHR_REF) != 0) { /* TODO: lwref on egress ifp's ? 
*/ } @@ -229,4 +253,197 @@ } +struct nhop_object * +fib4_lookup_nh_ptr(uint32_t fibnum, struct in_addr dst, uint32_t scopeid, + uint32_t flags, uint32_t flowid) +{ + RIB_RLOCK_TRACKER; + struct rib_head *rh; + struct radix_node *rn; + struct nhop_object *nh; + + KASSERT((fibnum < rt_numfibs), ("fib4_lookup_nh_route: bad fibnum")); + rh = rt_tables_get_rnh(fibnum, AF_INET); + if (rh == NULL) + return (NULL); + + /* Prepare lookup key */ + struct sockaddr_in sin4; + memset(&sin4, 0, sizeof(sin4)); + sin4.sin_family = AF_INET; + sin4.sin_len = sizeof(struct sockaddr_in); + sin4.sin_addr = dst; + + nh = NULL; + RIB_RLOCK(rh); + rn = rh->rnh_matchaddr((void *)&sin4, &rh->head); + if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) { + nh = RT_SELECT_NHOP((RNTORT(rn)), flowid); + /* Ensure route & ifp is UP */ + if (RT_LINK_IS_UP(nh->nh_ifp)) { + if (flags & NHR_REF) + nhop_ref_object(nh); + RIB_RUNLOCK(rh); + return (nh); + } + } + RIB_RUNLOCK(rh); + + RTSTAT_INC(rts_unreach); + return (NULL); +} + +inline static int +check_urpf(const struct nhop_object *nh, uint32_t flags, + const struct ifnet *src_if) +{ +#ifdef ROUTE_MPATH + const struct nhgrp_object *nhgrp; + + if (NH_IS_MULTIPATH(nh)) { + nhgrp = (const struct nhgrp_object *)nh; + + if (src_if == NULL) { + if ((flags & NHR_NODEFAULT) == 0) + return (1); + else if ((nhgrp->nhops[0]->nh_flags & NHF_DEFAULT) == 0) + return (1); + return (0); + } + + /* src_if != NULL, need to iterate over nhops */ + /* TODO: consider iterating control plane nhop list */ + for (int i = 0; i < nhgrp->mp_size; i++) { + if (nhgrp->nhops[i]->nh_aifp == src_if) + return (1); + } + return (0); + } +#endif + + if (src_if != NULL && nh->nh_aifp == src_if) { + return (1); + } + if (src_if == NULL) { + if ((flags & NHR_NODEFAULT) == 0) + return (1); + else if ((nh->nh_flags & NHF_DEFAULT) == 0) + return (1); + } + + return (0); +} + +/* + * Performs reverse path forwarding lookup. 
+ * If @src_if is non-zero, verifies that at least 1 path goes via + * this interface. + * If @src_if is zero, verifies that route exist. + * if @flags contains NHR_NOTDEFAULT, do not consider default route. + * + * Returns 1 if route matching conditions is found, 0 otherwise. + */ +int +fib4_lookup_urpf(uint32_t fibnum, struct in_addr dst, uint32_t scopeid, + uint32_t flags, const struct ifnet *src_if) +{ + RIB_RLOCK_TRACKER; + struct rib_head *rh; + struct radix_node *rn; + struct nhop_object *nh; + int ret; + + KASSERT((fibnum < rt_numfibs), ("fib4_lookup_nh_route: bad fibnum")); + rh = rt_tables_get_rnh(fibnum, AF_INET); + if (rh == NULL) + return (0); + + /* Prepare lookup key */ + struct sockaddr_in sin4; + memset(&sin4, 0, sizeof(sin4)); + sin4.sin_len = sizeof(struct sockaddr_in); + sin4.sin_addr = dst; + + RIB_RLOCK(rh); + rn = rh->rnh_matchaddr((void *)&sin4, &rh->head); + if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) { + nh = (RNTORT(rn))->rt_nhop; + ret = check_urpf(nh, flags, src_if); + RIB_RUNLOCK(rh); + return (ret); + } + RIB_RUNLOCK(rh); + + return (0); +} + +/* + * Lookups route for the destination specified in the @ro4. + * + * If the lookup resulting nhop is the same as saved in @ro4->ro_nh, returns 1 + * If the lookup result is different from the @ro4->ro_nh, under old & ref new + * IFF NHR_REF is set in flags. + * + * + * Returns: + * 0 if the the lookup was successful, with the referenced&unlocked nexthop stored. + * errno otherwise, with ro_nh freed and set to 0. 
+ * + * If ro4->ro_nh is NOT null, returns + * If search WAS done AND entry WAS found, returns 1 + * flags supported: NHR_LOCK -> if the entry WAS found, lock it prior returning + */ +int +fib4_lookup_nh_route(uint32_t fibnum, struct route_in *ro4, + uint32_t flags, uint32_t flowid) +{ + RIB_RLOCK_TRACKER; + struct rib_head *rh; + struct radix_node *rn; + struct nhop_object *nh, *nh_old; + + KASSERT((fibnum < rt_numfibs), ("fib4_lookup_nh_route: bad fibnum")); + rh = rt_tables_get_rnh(fibnum, AF_INET); + if (rh == NULL) + return (EAFNOSUPPORT); + + /* Prepare lookup key */ + struct sockaddr_in sin4; + memset(&sin4, 0, sizeof(sin4)); + sin4.sin_family = AF_INET; + sin4.sin_len = sizeof(struct sockaddr_in); + sin4.sin_addr = ro4->ro_dst4.sin_addr; + + nh_old = NULL; + flags |= NHR_REF; + + RIB_RLOCK(rh); + rn = rh->rnh_matchaddr((void *)&sin4, &rh->head); + if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) { + nh = RT_SELECT_NHOP((RNTORT(rn)), flowid); + /* Ensure route & ifp is UP */ + if (RT_LINK_IS_UP(nh->nh_ifp)) { + /* Valid nhop is found */ + if (ro4->ro_nh != nh) { + nh_old = ro4->ro_nh; + if (flags & NHR_REF) + nhop_ref_object(nh); + ro4->ro_nh = nh; + } + RIB_RUNLOCK(rh); + if ((nh_old != NULL) && (flags & NHR_REF)) + nhop_free_object(nh_old); + return (0); + } + } + RIB_RUNLOCK(rh); + + /* Not found */ + if ((ro4->ro_nh != NULL) && (flags & NHR_REF)) { + nhop_free_object(ro4->ro_nh); + ro4->ro_nh = NULL; + } + + return (ESRCH); +} #endif Index: sys/netinet/in_pcb.c =================================================================== --- sys/netinet/in_pcb.c +++ sys/netinet/in_pcb.c @@ -46,7 +46,7 @@ #include "opt_inet6.h" #include "opt_ratelimit.h" #include "opt_pcbgroup.h" -#include "opt_rss.h" +#include "opt_route_mpath.h" #include #include @@ -86,8 +86,10 @@ #if defined(INET) || defined(INET6) #include #include +#include #ifdef INET #include +#include #endif #include #include @@ -101,7 +103,9 @@ #include #include #include +#include #endif /* INET6 */ 
+#include #endif #include @@ -111,6 +115,7 @@ #define INPCBLBGROUP_SIZMIN 8 #define INPCBLBGROUP_SIZMAX 256 + static struct callout ipport_tick_callout; /* @@ -1033,8 +1038,8 @@ { struct ifaddr *ifa; struct sockaddr *sa; - struct sockaddr_in *sin; - struct route sro; + struct sockaddr_in *sin, dst; + struct nhop_object *nh = NULL; int error; NET_EPOCH_ASSERT(); @@ -1047,9 +1052,9 @@ return (0); error = 0; - bzero(&sro, sizeof(sro)); - sin = (struct sockaddr_in *)&sro.ro_dst; + bzero(&dst, sizeof(dst)); + sin = &dst; sin->sin_family = AF_INET; sin->sin_len = sizeof(struct sockaddr_in); sin->sin_addr.s_addr = faddr->s_addr; @@ -1061,7 +1066,8 @@ * Find out route to destination. */ if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0) - in_rtalloc_ign(&sro, 0, inp->inp_inc.inc_fibnum); + nh = fib4_lookup_nh_ptr(inp->inp_inc.inc_fibnum, *faddr, + 0, NHR_NONE, 0); /* * If we found a route, use the address corresponding to @@ -1071,7 +1077,7 @@ * network and try to find a corresponding interface to take * the source address from. */ - if (sro.ro_rt == NULL || sro.ro_rt->rt_ifp == NULL) { + if (nh == NULL || nh->nh_ifp == NULL) { struct in_ifaddr *ia; struct ifnet *ifp; @@ -1124,22 +1130,22 @@ * belonging to this jail. If so use it. * 3. as a last resort return the 'default' jail address. */ - if ((sro.ro_rt->rt_ifp->if_flags & IFF_LOOPBACK) == 0) { + if ((nh->nh_ifp->if_flags & IFF_LOOPBACK) == 0) { struct in_ifaddr *ia; struct ifnet *ifp; /* If not jailed, use the default returned. */ if (cred == NULL || !prison_flag(cred, PR_IP4)) { - ia = (struct in_ifaddr *)sro.ro_rt->rt_ifa; + ia = (struct in_ifaddr *)nh->nh_ifa; laddr->s_addr = ia->ia_addr.sin_addr.s_addr; goto done; } /* Jailed. */ /* 1. Check if the iface address belongs to the jail. 
*/ - sin = (struct sockaddr_in *)sro.ro_rt->rt_ifa->ifa_addr; + sin = (struct sockaddr_in *)nh->nh_ifa->ifa_addr; if (prison_check_ip4(cred, &sin->sin_addr) == 0) { - ia = (struct in_ifaddr *)sro.ro_rt->rt_ifa; + ia = (struct in_ifaddr *)nh->nh_ifa; laddr->s_addr = ia->ia_addr.sin_addr.s_addr; goto done; } @@ -1149,7 +1155,7 @@ * belonging to this jail. */ ia = NULL; - ifp = sro.ro_rt->rt_ifp; + ifp = nh->nh_ifp; CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { sa = ifa->ifa_addr; if (sa->sa_family != AF_INET) @@ -1179,7 +1185,7 @@ * In case of jails, check that it is an address of the jail * and if we cannot find, fall back to the 'default' jail address. */ - if ((sro.ro_rt->rt_ifp->if_flags & IFF_LOOPBACK) != 0) { + if ((nh->nh_ifp->if_flags & IFF_LOOPBACK) != 0) { struct sockaddr_in sain; struct in_ifaddr *ia; @@ -1234,8 +1240,6 @@ } done: - if (sro.ro_rt != NULL) - RTFREE(sro.ro_rt); return (error); } @@ -1266,6 +1270,9 @@ struct in_addr laddr, faddr; u_short lport, fport; int error; +#ifdef ROUTE_MPATH + uint32_t hash_val, hash_type; +#endif /* * Because a global state change doesn't actually occur here, a read @@ -1288,6 +1295,15 @@ faddr = sin->sin_addr; fport = sin->sin_port; +#ifdef ROUTE_MPATH + if (V_fib_hash_outbound) { + hash_val = fib4_calc_software_hash(laddr, faddr, 0, fport, + inp->inp_socket->so_proto->pr_protocol, &hash_type); + + inp->inp_flowid = hash_val; + inp->inp_flowtype = hash_type; + } +#endif if (!CK_STAILQ_EMPTY(&V_in_ifaddrhead)) { /* * If the destination address is INADDR_ANY, @@ -3367,22 +3383,6 @@ mb->m_pkthdr.flowid, max_pacing_rate, &inp->inp_snd_tag); } } else { - error = in_pcbmodify_txrtlmt(inp, max_pacing_rate); - } - if (error == 0 || error == EOPNOTSUPP) - inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED; - - return (error); -} - -/* - * This function should be called when the INP_RATE_LIMIT_CHANGED flag - * is set in the fast path and will attach/detach/modify the TX rate - * limit send tag based on the socket's 
so_max_pacing_rate value. - */ -void -in_pcboutput_txrtlmt(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb) -{ struct socket *socket; uint32_t max_pacing_rate; bool did_upgrade; Index: sys/netinet/in_rmx.c =================================================================== --- sys/netinet/in_rmx.c +++ sys/netinet/in_rmx.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include @@ -48,67 +49,63 @@ #include #include #include +#include extern int in_inithead(void **head, int off, u_int fibnum); #ifdef VIMAGE extern int in_detachhead(void **head, int off); #endif -/* - * Do what we need to do when inserting a route. - */ -static struct radix_node * -in_addroute(void *v_arg, void *n_arg, struct radix_head *head, - struct radix_node *treenodes) +static int +rib4_preadd(u_int fibnum, const struct sockaddr *addr, const struct sockaddr *mask, + struct nhop_request *req) { - struct rtentry *rt = (struct rtentry *)treenodes; - struct sockaddr_in *sin = (struct sockaddr_in *)rt_key(rt); + const struct sockaddr_in *addr4 = (const struct sockaddr_in *)addr; - /* - * A little bit of help for both IP output and input: - * For host routes, we make sure that RTF_BROADCAST - * is set for anything that looks like a broadcast address. - * This way, we can avoid an expensive call to in_broadcast() - * in ip_output() most of the time (because the route passed - * to ip_output() is almost always a host route). - * - * We also do the same for local addresses, with the thought - * that this might one day be used to speed up ip_input(). - * - * We also mark routes to multicast addresses as such, because - * it's easy to do and might be useful (but this is much more - * dubious since it's so easy to inspect the address). 
- */ - if (rt->rt_flags & RTF_HOST) { - struct epoch_tracker et; - bool bcast; + /* XXX: RTF_LOCAL && RTF_MULTICAST */ - NET_EPOCH_ENTER(et); - bcast = in_broadcast(sin->sin_addr, rt->rt_ifp); - NET_EPOCH_EXIT(et); - if (bcast) - rt->rt_flags |= RTF_BROADCAST; - else if (satosin(rt->rt_ifa->ifa_addr)->sin_addr.s_addr == - sin->sin_addr.s_addr) - rt->rt_flags |= RTF_LOCAL; + if (req->rt_flags & RTF_HOST) { + /* + * Backward compatibility: + * if the destination is broadcast, + * mark route as broadcast. + * This behavior was useful when route cloning + * was in place, so there was an explicit cloned + * route for every broadcasted address. + * Currently (2019-12) there are no kernel machinery + * to do route cloning, though someone might explicitly + * add these routes to support some cases with active-active + * load balancing. Given that, retain this support. + */ + if (in_broadcast(addr4->sin_addr, req->ifp)) + req->rt_flags |= RTF_BROADCAST; } - if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) - rt->rt_flags |= RTF_MULTICAST; - if (rt->rt_ifp != NULL) { - /* - * Check route MTU: - * inherit interface MTU if not set or - * check if MTU is too large. - */ - if (rt->rt_mtu == 0) { - rt->rt_mtu = rt->rt_ifp->if_mtu; - } else if (rt->rt_mtu > rt->rt_ifp->if_mtu) - rt->rt_mtu = rt->rt_ifp->if_mtu; + /* + * Check route MTU: + * inherit interface MTU if not set or + * check if MTU is too large. 
+ */ + if (req->mtu == 0) { + req->mtu = req->ifp->if_mtu; + } else if (req->mtu > req->ifp->if_mtu) + req->mtu = req->ifp->if_mtu; + + /* Ensure that default route nhop has special flag */ + const struct sockaddr_in *mask4 = (const struct sockaddr_in *)mask; + if ((req->rt_flags & RTF_HOST) == 0 && mask4->sin_addr.s_addr == 0) + req->nh_flags_additional |= NHF_DEFAULT; + + /* Set nhop type to basic per-AF nhop */ + if (req->nh_type == 0) { + if (req->rt_flags & RTF_GATEWAY) + req->nh_type = NH_TYPE_IPV4_ETHER_NHOP; + else + req->nh_type = NH_TYPE_IPV4_ETHER_RSLV; } - return (rn_addroute(v_arg, n_arg, head, treenodes)); + return (0); } static int _in_rt_was_here; @@ -124,7 +121,7 @@ if (rh == NULL) return (0); - rh->rnh_addaddr = in_addroute; + rh->rnh_preadd = rib4_preadd; *head = (void *)rh; if (_in_rt_was_here == 0 ) { @@ -158,14 +155,15 @@ }; static int -in_ifadownkill(const struct rtentry *rt, void *xap) +in_ifadownkill(const struct rtentry *rt, const struct nhop_object *nh, void *xap) { struct in_ifadown_arg *ap = xap; - if (rt->rt_ifa != ap->ifa) + if (nh->nh_ifa != ap->ifa) return (0); - if ((rt->rt_flags & RTF_STATIC) != 0 && ap->del == 0) + int rt_flags = rib_get_entry_rtflags(rt, nh); + if ((rt_flags & RTF_STATIC) != 0 && ap->del == 0) return (0); return (1); @@ -184,16 +182,5 @@ rt_foreach_fib_walk_del(AF_INET, in_ifadownkill, &arg); ifa->ifa_flags &= ~IFA_ROUTE; /* XXXlocking? */ -} - -/* - * inet versions of rt functions. These have fib extensions and - * for now will just reference the _fib variants. 
- * eventually this order will be reversed, - */ -void -in_rtalloc_ign(struct route *ro, u_long ignflags, u_int fibnum) -{ - rtalloc_ign_fib(ro, ignflags, fibnum); } Index: sys/netinet/in_var.h =================================================================== --- sys/netinet/in_var.h +++ sys/netinet/in_var.h @@ -473,7 +473,6 @@ /* XXX */ -void in_rtalloc_ign(struct route *ro, u_long ignflags, u_int fibnum); #endif /* _KERNEL */ /* INET6 stuff */ Index: sys/netinet/ip_fastfwd.c =================================================================== --- sys/netinet/ip_fastfwd.c +++ sys/netinet/ip_fastfwd.c @@ -96,6 +96,7 @@ #include #include #include +#include #include #include @@ -111,11 +112,13 @@ #include static int -ip_findroute(struct nhop4_basic *pnh, struct in_addr dest, struct mbuf *m) +ip_findroute(struct nhop_object **pnh, struct in_addr dest, struct mbuf *m) { + struct nhop_object *nh; - bzero(pnh, sizeof(*pnh)); - if (fib4_lookup_nh_basic(M_GETFIB(m), dest, 0, 0, pnh) != 0) { + nh = fib4_lookup_nh_ptr(M_GETFIB(m), dest, 0, NHR_NONE, + m->m_pkthdr.flowid); + if (nh == NULL) { IPSTAT_INC(ips_noroute); IPSTAT_INC(ips_cantforward); icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, 0, 0); @@ -124,18 +127,20 @@ /* * Drop blackholed traffic and directed broadcasts. 
*/ - if ((pnh->nh_flags & (NHF_BLACKHOLE | NHF_BROADCAST)) != 0) { + if ((nh->nh_flags & (NHF_BLACKHOLE | NHF_BROADCAST)) != 0) { IPSTAT_INC(ips_cantforward); m_freem(m); return (EHOSTUNREACH); } - if (pnh->nh_flags & NHF_REJECT) { + if (nh->nh_flags & NHF_REJECT) { IPSTAT_INC(ips_cantforward); icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, 0, 0); return (EHOSTUNREACH); } + *pnh = nh; + return (0); } @@ -151,7 +156,7 @@ { struct ip *ip; struct mbuf *m0 = NULL; - struct nhop4_basic nh; + struct nhop_object *nh; struct sockaddr_in dst; struct in_addr dest, odest, rtdest; uint16_t ip_len, ip_off; @@ -323,7 +328,7 @@ if (!PFIL_HOOKED_OUT(V_inet_pfil_head)) goto passout; - if (pfil_run_hooks(V_inet_pfil_head, &m, nh.nh_ifp, + if (pfil_run_hooks(V_inet_pfil_head, &m, nh->nh_ifp, PFIL_OUT | PFIL_FWD, NULL) != PFIL_PASS) goto drop; @@ -376,12 +381,15 @@ bzero(&dst, sizeof(dst)); dst.sin_family = AF_INET; dst.sin_len = sizeof(dst); - dst.sin_addr = nh.nh_addr; + if (nh->nh_flags & NHF_GATEWAY) + dst.sin_addr = nh->gw4_sa.sin_addr; + else + dst.sin_addr = dest; /* * Check if packet fits MTU or if hardware will fragment for us */ - if (ip_len <= nh.nh_mtu) { + if (ip_len <= nh->nh_mtu) { /* * Avoid confusing lower layers. 
*/ @@ -389,8 +397,8 @@ /* * Send off the packet via outgoing interface */ - IP_PROBE(send, NULL, NULL, ip, nh.nh_ifp, ip, NULL); - error = (*nh.nh_ifp->if_output)(nh.nh_ifp, m, + IP_PROBE(send, NULL, NULL, ip, nh->nh_ifp, ip, NULL); + error = (*nh->nh_ifp->if_output)(nh->nh_ifp, m, (struct sockaddr *)&dst, NULL); } else { /* @@ -399,15 +407,15 @@ if (ip_off & IP_DF) { IPSTAT_INC(ips_cantfrag); icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_NEEDFRAG, - 0, nh.nh_mtu); + 0, nh->nh_mtu); goto consumed; } else { /* * We have to fragment the packet */ m->m_pkthdr.csum_flags |= CSUM_IP; - if (ip_fragment(ip, &m, nh.nh_mtu, - nh.nh_ifp->if_hwassist) != 0) + if (ip_fragment(ip, &m, nh->nh_mtu, + nh->nh_ifp->if_hwassist) != 0) goto drop; KASSERT(m != NULL, ("null mbuf and no error")); /* @@ -423,10 +431,10 @@ m_clrprotoflags(m); IP_PROBE(send, NULL, NULL, - mtod(m, struct ip *), nh.nh_ifp, + mtod(m, struct ip *), nh->nh_ifp, mtod(m, struct ip *), NULL); /* XXX: we can use cached route here */ - error = (*nh.nh_ifp->if_output)(nh.nh_ifp, m, + error = (*nh->nh_ifp->if_output)(nh->nh_ifp, m, (struct sockaddr *)&dst, NULL); if (error) break; Index: sys/netinet/ip_icmp.c =================================================================== --- sys/netinet/ip_icmp.c +++ sys/netinet/ip_icmp.c @@ -52,6 +52,7 @@ #include #include #include +#include #include #include @@ -945,7 +946,7 @@ icmp_verify_redirect_gateway(struct sockaddr_in *src, struct sockaddr_in *dst, struct sockaddr_in *gateway, u_int fibnum) { - struct rtentry *rt; + struct nhop_object *nh; struct ifaddr *ifa; NET_EPOCH_ASSERT(); @@ -958,8 +959,8 @@ if (ifa_ifwithaddr_check((struct sockaddr *)gateway)) return (EHOSTUNREACH); - rt = rtalloc1_fib((struct sockaddr *)dst, 0, 0UL, fibnum); /* NB: rt is locked */ - if (rt == NULL) + nh = fib4_lookup_nh_ptr(fibnum, dst->sin_addr, 0, NHR_NONE, 0); + if (nh == NULL) return (EINVAL); /* @@ -968,28 +969,19 @@ * we have a routing loop, perhaps as a result of an interface * going down 
recently. */ - if (!sa_equal((struct sockaddr *)src, rt->rt_gateway)) { - RTFREE_LOCKED(rt); + if (!sa_equal((struct sockaddr *)src, &nh->gw_sa)) return (EINVAL); - } - if (rt->rt_ifa != ifa && ifa->ifa_addr->sa_family != AF_LINK) { - RTFREE_LOCKED(rt); + if (nh->nh_ifa != ifa && ifa->ifa_addr->sa_family != AF_LINK) return (EINVAL); - } /* If host route already exists, ignore redirect. */ - if (rt->rt_flags & RTF_HOST) { - RTFREE_LOCKED(rt); + if (nh->nh_flags & NHF_HOST) return (EEXIST); - } /* If the prefix is directly reachable, ignore redirect. */ - if (!(rt->rt_flags & RTF_GATEWAY)) { - RTFREE_LOCKED(rt); + if (!(nh->nh_flags & NHF_GATEWAY)) return (EEXIST); - } - RTFREE_LOCKED(rt); return (0); } Index: sys/netinet/ip_input.c =================================================================== --- sys/netinet/ip_input.c +++ sys/netinet/ip_input.c @@ -63,6 +63,7 @@ #include #include #include +#include #include #include #include @@ -72,6 +73,7 @@ #include #include #include +#include #include #include #include @@ -980,10 +982,11 @@ ntohl(ip->ip_src.s_addr ^ ip->ip_dst.s_addr), M_GETFIB(m)); #else - in_rtalloc_ign(&ro, 0, M_GETFIB(m)); + ro.ro_nh = fib4_lookup_nh_ptr(M_GETFIB(m), ip->ip_dst, 0, NHR_REF, + m->m_pkthdr.flowid); #endif - if (ro.ro_rt != NULL) { - ia = ifatoia(ro.ro_rt->rt_ifa); + if (ro.ro_nh != NULL) { + ia = ifatoia(ro.ro_nh->nh_ifa); } else ia = NULL; /* @@ -1045,19 +1048,18 @@ dest.s_addr = 0; if (!srcrt && V_ipsendredirects && ia != NULL && ia->ia_ifp == m->m_pkthdr.rcvif) { - struct rtentry *rt; + struct nhop_object *nh; - rt = ro.ro_rt; + nh = ro.ro_nh; - if (rt && (rt->rt_flags & (RTF_DYNAMIC|RTF_MODIFIED)) == 0 && - satosin(rt_key(rt))->sin_addr.s_addr != 0) { -#define RTA(rt) ((struct in_ifaddr *)(rt->rt_ifa)) + if (nh != NULL && ((nh->nh_flags & (NHF_REDIRECT|NHF_DEFAULT)) == 0)) { + struct in_ifaddr *nh_ia = (struct in_ifaddr *)(nh->nh_ifa); u_long src = ntohl(ip->ip_src.s_addr); - if (RTA(rt) && - (src & RTA(rt)->ia_subnetmask) == 
RTA(rt)->ia_subnet) { - if (rt->rt_flags & RTF_GATEWAY) - dest.s_addr = satosin(rt->rt_gateway)->sin_addr.s_addr; + if (nh_ia && + (src & nh_ia->ia_subnetmask) == nh_ia->ia_subnet) { + if (nh->nh_flags & NHF_GATEWAY) + dest.s_addr = nh->gw4_sa.sin_addr.s_addr; else dest.s_addr = ip->ip_dst.s_addr; /* Router requirements says to only send host redirects */ @@ -1069,9 +1071,9 @@ error = ip_output(m, NULL, &ro, IP_FORWARDING, NULL, NULL); - if (error == EMSGSIZE && ro.ro_rt) - mtu = ro.ro_rt->rt_mtu; - RO_RTFREE(&ro); + if (error == EMSGSIZE && ro.ro_nh) + mtu = ro.ro_nh->nh_mtu; + RO_NHFREE(&ro); if (error) IPSTAT_INC(ips_cantforward); Index: sys/netinet/ip_output.c =================================================================== --- sys/netinet/ip_output.c +++ sys/netinet/ip_output.c @@ -67,6 +67,7 @@ #include #include #include +#include #ifdef RADIX_MPATH #include #endif @@ -78,6 +79,7 @@ #include #include #include +#include #include #include #include @@ -287,6 +289,18 @@ return (error); } +static inline void +rt_update_ro_flags(struct route *ro) +{ + int nh_flags = ro->ro_nh->nh_flags; + + ro->ro_flags &= ~ (RT_REJECT|RT_BLACKHOLE|RT_HAS_GW); + + ro->ro_flags |= (nh_flags & NHF_REJECT) ? RT_REJECT : 0; + ro->ro_flags |= (nh_flags & NHF_BLACKHOLE) ? RT_BLACKHOLE : 0; + ro->ro_flags |= (nh_flags & NHF_GATEWAY) ? RT_HAS_GW : 0; +} + /* * IP output. The packet in mbuf chain m contains a skeletal IP * header (with len, off, ttl, proto, tos, src, dst). @@ -368,7 +382,7 @@ dst = (struct sockaddr_in *)&ro->ro_dst; else dst = &sin; - if (ro == NULL || ro->ro_rt == NULL) { + if (ro == NULL || ro->ro_nh == NULL) { bzero(dst, sizeof(*dst)); dst->sin_family = AF_INET; dst->sin_len = sizeof(*dst); @@ -380,8 +394,8 @@ * Validate route against routing table additions; * a better/more specific route might have been added. 
*/ - if (inp != NULL && ro != NULL && ro->ro_rt != NULL) - RT_VALIDATE(ro, &inp->inp_rt_cookie, fibnum); + if (inp != NULL && ro != NULL && ro->ro_nh != NULL) + NH_VALIDATE(ro, &inp->inp_rt_cookie, fibnum); /* * If there is a cached route, * check that it is to the same destination @@ -390,9 +404,9 @@ * cache with IPv6. * Also check whether routing cache needs invalidation. */ - if (ro != NULL && ro->ro_rt != NULL && - ((ro->ro_rt->rt_flags & RTF_UP) == 0 || - ro->ro_rt->rt_ifp == NULL || !RT_LINK_IS_UP(ro->ro_rt->rt_ifp) || + if (ro != NULL && ro->ro_nh != NULL && + ((!NH_IS_VALID(ro->ro_nh)) || + ro->ro_nh->nh_ifp == NULL || !RT_LINK_IS_UP(ro->ro_nh->nh_ifp) || dst->sin_family != AF_INET || dst->sin_addr.s_addr != ip->ip_dst.s_addr)) RO_INVALIDATE_CACHE(ro); @@ -450,25 +464,23 @@ else src.s_addr = INADDR_ANY; } else if (ro != NULL) { - if (ro->ro_rt == NULL) { + if (ro->ro_nh == NULL) { /* * We want to do any cloning requested by the link * layer, as this is probably required in all cases * for correct operation (as it is for ARP). */ -#ifdef RADIX_MPATH - rtalloc_mpath_fib(ro, - ntohl(ip->ip_src.s_addr ^ ip->ip_dst.s_addr), - fibnum); -#else - in_rtalloc_ign(ro, 0, fibnum); + fib4_lookup_nh_route(fibnum, (struct route_in *)ro, + NHR_NONE, m->m_pkthdr.flowid); +#if 0 + char xbuf[20]; + inet_ntop(AF_INET, &((struct sockaddr_in *)&ro->ro_dst)->sin_addr, xbuf, sizeof(xbuf)); + printf("lookup for %s in fib %u returned ro_nh=%p\n", xbuf, fibnum, ro->ro_nh); #endif - if (ro->ro_rt == NULL || - (ro->ro_rt->rt_flags & RTF_UP) == 0 || - ro->ro_rt->rt_ifp == NULL || - !RT_LINK_IS_UP(ro->ro_rt->rt_ifp)) { + if (ro->ro_nh == NULL || (!NH_IS_VALID(ro->ro_nh)) || + !RT_LINK_IS_UP(ro->ro_nh->nh_ifp)) { #if defined(IPSEC) || defined(IPSEC_SUPPORT) - /* + /* * There is no route for this packet, but it is * possible that a matching SPD entry exists. 
*/ @@ -481,29 +493,29 @@ goto bad; } } - ia = ifatoia(ro->ro_rt->rt_ifa); - ifp = ro->ro_rt->rt_ifp; - counter_u64_add(ro->ro_rt->rt_pksent, 1); + ia = ifatoia(ro->ro_nh->nh_ifa); + ifp = ro->ro_nh->nh_ifp; + counter_u64_add(ro->ro_nh->nh_pksent, 1); rt_update_ro_flags(ro); - if (ro->ro_rt->rt_flags & RTF_GATEWAY) - gw = (struct sockaddr_in *)ro->ro_rt->rt_gateway; - if (ro->ro_rt->rt_flags & RTF_HOST) - isbroadcast = (ro->ro_rt->rt_flags & RTF_BROADCAST); + if (ro->ro_nh->nh_flags & NHF_GATEWAY) + gw = &ro->ro_nh->gw4_sa; + if (ro->ro_nh->nh_flags & NHF_HOST) + isbroadcast = (ro->ro_nh->nh_flags & NHF_BROADCAST); else if (ifp->if_flags & IFF_BROADCAST) isbroadcast = in_ifaddr_broadcast(gw->sin_addr, ia); else isbroadcast = 0; - if (ro->ro_rt->rt_flags & RTF_HOST) - mtu = ro->ro_rt->rt_mtu; + if (ro->ro_nh->nh_flags & NHF_HOST) + mtu = ro->ro_nh->nh_mtu; else mtu = ifp->if_mtu; src = IA_SIN(ia)->sin_addr; } else { - struct nhop4_extended nh; + struct nhop_object *nh; - bzero(&nh, sizeof(nh)); - if (fib4_lookup_nh_ext(M_GETFIB(m), ip->ip_dst, 0, 0, &nh) != - 0) { + nh = fib4_lookup_nh_ptr(M_GETFIB(m), ip->ip_dst, 0, NHR_NONE, + m->m_pkthdr.flowid); + if (nh == NULL) { #if defined(IPSEC) || defined(IPSEC_SUPPORT) /* * There is no route for this packet, but it is @@ -514,11 +526,12 @@ goto sendit; #endif IPSTAT_INC(ips_noroute); + printf("NOTOUTE: extended\n"); error = EHOSTUNREACH; goto bad; } - ifp = nh.nh_ifp; - mtu = nh.nh_mtu; + ifp = nh->nh_ifp; + mtu = nh->nh_mtu; /* * We are rewriting here dst to be gw actually, contradicting * comment at the beginning of the function. However, in this @@ -527,19 +540,20 @@ * function, the dst would be rewritten by ip_output_pfil(). 
*/ MPASS(dst == &sin); - dst->sin_addr = nh.nh_addr; - ia = nh.nh_ia; - src = nh.nh_src; - isbroadcast = (((nh.nh_flags & (NHF_HOST | NHF_BROADCAST)) == + if (nh->nh_flags & NHF_GATEWAY) + dst->sin_addr = nh->gw4_sa.sin_addr; + ia = (struct in_ifaddr *)nh->nh_ifa; + src = ia->ia_addr.sin_addr; + isbroadcast = (((nh->nh_flags & (NHF_HOST | NHF_BROADCAST)) == (NHF_HOST | NHF_BROADCAST)) || ((ifp->if_flags & IFF_BROADCAST) && in_ifaddr_broadcast(dst->sin_addr, ia))); } /* Catch a possible divide by zero later. */ - KASSERT(mtu > 0, ("%s: mtu %d <= 0, ro=%p (rt_flags=0x%08x) ifp=%p", + KASSERT(mtu > 0, ("%s: mtu %d <= 0, ro=%p (nh_flags=0x%08x) ifp=%p", __func__, mtu, ro, - (ro != NULL && ro->ro_rt != NULL) ? ro->ro_rt->rt_flags : 0, ifp)); + (ro != NULL && ro->ro_nh != NULL) ? ro->ro_nh->nh_flags : 0, ifp)); if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { m->m_flags |= M_MCAST; @@ -702,7 +716,7 @@ case -1: /* Need to try again */ /* Reset everything for a new round */ if (ro != NULL) { - RO_RTFREE(ro); + RO_NHFREE(ro); ro->ro_prepend = NULL; } gw = dst; Index: sys/netinet/raw_ip.c =================================================================== --- sys/netinet/raw_ip.c +++ sys/netinet/raw_ip.c @@ -38,6 +38,7 @@ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" +#include "opt_route_mpath.h" #include #include @@ -67,6 +68,7 @@ #include #include +#include #include #include #include @@ -455,6 +457,9 @@ IP_ALLOWBROADCAST; int cnt, hlen; u_char opttype, optlen, *cp; +#ifdef ROUTE_MPATH + uint32_t hash_val, hash_type; +#endif va_start(ap, so); dst = va_arg(ap, u_long); @@ -484,6 +489,15 @@ ip->ip_len = htons(m->m_pkthdr.len); ip->ip_src = inp->inp_laddr; ip->ip_dst.s_addr = dst; +#ifdef ROUTE_MPATH + if (V_fib_hash_outbound) { + hash_val = fib4_calc_software_hash(ip->ip_src, + ip->ip_dst, 0, 0, ip->ip_p, &hash_type); + m->m_pkthdr.flowid = hash_val; + M_HASHTYPE_SET(m, hash_type); + flags |= IP_NODEFAULTFLOWID; + } +#endif if (jailed(inp->inp_cred)) { /* * 
prison_local_ip4() would be good enough but would @@ -519,6 +533,15 @@ return (EINVAL); ip = mtod(m, struct ip *); } +#ifdef ROUTE_MPATH + if (V_fib_hash_outbound) { + hash_val = fib4_calc_software_hash(ip->ip_dst, + ip->ip_src, 0, 0, ip->ip_p, &hash_type); + m->m_pkthdr.flowid = hash_val; + M_HASHTYPE_SET(m, hash_type); + flags |= IP_NODEFAULTFLOWID; + } +#endif INP_RLOCK(inp); /* Index: sys/netinet/sctp_asconf.c =================================================================== --- sys/netinet/sctp_asconf.c +++ sys/netinet/sctp_asconf.c @@ -981,8 +981,7 @@ ((ifn == NULL) || (SCTP_GET_IF_INDEX_FROM_ROUTE(&net->ro) != ifn->ifn_index))) { /* clear any cached route */ - RTFREE(net->ro.ro_rt); - net->ro.ro_rt = NULL; + RO_NHFREE(&net->ro); } /* clear any cached source address */ if (net->src_addr_selected) { @@ -1091,10 +1090,7 @@ if (addrnum == 1) { TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { /* clear any cached route and source address */ - if (net->ro.ro_rt) { - RTFREE(net->ro.ro_rt); - net->ro.ro_rt = NULL; - } + RO_NHFREE(&net->ro); if (net->src_addr_selected) { sctp_free_ifa(net->ro._s_addr); net->ro._s_addr = NULL; @@ -1113,9 +1109,9 @@ /* Multiple local addresses exsist in the association. 
*/ TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { /* clear any cached route and source address */ - if (net->ro.ro_rt) { - RTFREE(net->ro.ro_rt); - net->ro.ro_rt = NULL; + if (net->ro.ro_nh) { + NH_FREE(net->ro.ro_nh); + net->ro.ro_nh = NULL; } if (net->src_addr_selected) { sctp_free_ifa(net->ro._s_addr); @@ -1132,7 +1128,7 @@ SCTP_RTALLOC((sctp_route_t *)&net->ro, stcb->sctp_ep->def_vrf_id, stcb->sctp_ep->fibnum); - if (net->ro.ro_rt == NULL) + if (net->ro.ro_nh == NULL) continue; changed = 0; @@ -2215,18 +2211,13 @@ struct sctp_nets *net; TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { - sctp_rtentry_t *rt; /* delete this address if cached */ if (net->ro._s_addr == ifa) { sctp_free_ifa(net->ro._s_addr); net->ro._s_addr = NULL; net->src_addr_selected = 0; - rt = net->ro.ro_rt; - if (rt) { - RTFREE(rt); - net->ro.ro_rt = NULL; - } + RO_NHFREE(&net->ro); /* * Now we deleted our src address, * should we not also now reset the Index: sys/netinet/sctp_os_bsd.h =================================================================== --- sys/netinet/sctp_os_bsd.h +++ sys/netinet/sctp_os_bsd.h @@ -71,11 +71,13 @@ #include #include #include +#include #include #include #include #include +#include #include #include #include @@ -85,6 +87,7 @@ #ifdef INET6 #include #include +#include #include #include #include @@ -199,15 +202,15 @@ #define SCTP_INIT_VRF_TABLEID(vrf) #define SCTP_IFN_IS_IFT_LOOP(ifn) ((ifn)->ifn_type == IFT_LOOP) -#define SCTP_ROUTE_IS_REAL_LOOP(ro) ((ro)->ro_rt && (ro)->ro_rt->rt_ifa && (ro)->ro_rt->rt_ifa->ifa_ifp && (ro)->ro_rt->rt_ifa->ifa_ifp->if_type == IFT_LOOP) +#define SCTP_ROUTE_IS_REAL_LOOP(ro) ((ro)->ro_nh && (ro)->ro_nh->nh_ifa && (ro)->ro_nh->nh_ifa->ifa_ifp && (ro)->ro_nh->nh_ifa->ifa_ifp->if_type == IFT_LOOP) /* * Access to IFN's to help with src-addr-selection */ /* This could return VOID if the index works but for BSD we provide both. 
*/ -#define SCTP_GET_IFN_VOID_FROM_ROUTE(ro) (void *)ro->ro_rt->rt_ifp -#define SCTP_GET_IF_INDEX_FROM_ROUTE(ro) (ro)->ro_rt->rt_ifp->if_index -#define SCTP_ROUTE_HAS_VALID_IFN(ro) ((ro)->ro_rt && (ro)->ro_rt->rt_ifp) +#define SCTP_GET_IFN_VOID_FROM_ROUTE(ro) (void *)ro->ro_nh->nh_ifp +#define SCTP_GET_IF_INDEX_FROM_ROUTE(ro) (ro)->ro_nh->nh_ifp->if_index +#define SCTP_ROUTE_HAS_VALID_IFN(ro) ((ro)->ro_nh && (ro)->ro_nh->nh_ifp) /* * general memory allocation @@ -304,12 +307,10 @@ /* MTU */ /*************************/ #define SCTP_GATHER_MTU_FROM_IFN_INFO(ifn, ifn_index, af) ((struct ifnet *)ifn)->if_mtu -#define SCTP_GATHER_MTU_FROM_ROUTE(sctp_ifa, sa, rt) ((uint32_t)((rt != NULL) ? rt->rt_mtu : 0)) +#define SCTP_GATHER_MTU_FROM_ROUTE(sctp_ifa, sa, nh) ((uint32_t)((nh != NULL) ? nh->nh_mtu : 0)) #define SCTP_GATHER_MTU_FROM_INTFC(sctp_ifn) ((sctp_ifn->ifn_p != NULL) ? ((struct ifnet *)(sctp_ifn->ifn_p))->if_mtu : 0) -#define SCTP_SET_MTU_OF_ROUTE(sa, rt, mtu) do { \ - if (rt != NULL) \ - rt->rt_mtu = mtu; \ - } while(0) +/* XXX: Setting MTU from the protocol in this way is simply incorrect */ +#define SCTP_SET_MTU_OF_ROUTE(sa, rt, mtu) /* (de-)register interface event notifications */ #define SCTP_REGISTER_INTERFACE(ifhandle, af) @@ -365,7 +366,7 @@ */ /* get the v6 hop limit */ -#define SCTP_GET_HLIM(inp, ro) in6_selecthlim(&inp->ip_inp.inp, (ro ? (ro->ro_rt ? (ro->ro_rt->rt_ifp) : (NULL)) : (NULL))); +#define SCTP_GET_HLIM(inp, ro) in6_selecthlim(&inp->ip_inp.inp, (ro ? (ro->ro_nh ? (ro->ro_nh->nh_ifp) : (NULL)) : (NULL))); /* is the endpoint v6only? 
*/ #define SCTP_IPV6_V6ONLY(sctp_inpcb) ((sctp_inpcb)->ip_inp.inp.inp_flags & IN6P_IPV6_V6ONLY) @@ -400,7 +401,14 @@ typedef struct rtentry sctp_rtentry_t; #define SCTP_RTALLOC(ro, vrf_id, fibnum) \ - rtalloc_ign_fib((struct route *)ro, 0UL, fibnum) +do { \ + if ((ro)->ro_nh == NULL) { \ + if ((ro)->ro_dst.sa_family == AF_INET) \ + (ro)->ro_nh = fib4_lookup_nh_ptr(fibnum, ((struct sockaddr_in *)&(ro)->ro_dst)->sin_addr, 0, NHR_REF, 0); \ + if ((ro)->ro_dst.sa_family == AF_INET6) \ + (ro)->ro_nh = fib6_lookup_nh_ptr(fibnum, &((struct sockaddr_in6 *)&(ro)->ro_dst)->sin6_addr, 0, NHR_REF, 0); \ + } \ +} while (0) /* * SCTP protocol specific mbuf flags. Index: sys/netinet/sctp_output.c =================================================================== --- sys/netinet/sctp_output.c +++ sys/netinet/sctp_output.c @@ -3387,13 +3387,13 @@ * addresses. If the bound set is NOT assigned to the interface then * we must use rotation amongst the bound addresses.. */ - if (ro->ro_rt == NULL) { + if (ro->ro_nh == NULL) { /* * Need a route to cache. */ SCTP_RTALLOC(ro, vrf_id, inp->fibnum); } - if (ro->ro_rt == NULL) { + if (ro->ro_nh == NULL) { return (NULL); } fam = ro->ro_dst.sa_family; @@ -4131,10 +4131,7 @@ sctp_free_ifa(net->ro._s_addr); net->ro._s_addr = NULL; net->src_addr_selected = 0; - if (ro->ro_rt) { - RTFREE(ro->ro_rt); - ro->ro_rt = NULL; - } + RO_NHFREE(ro); } if (net->src_addr_selected == 0) { /* Cache the source address */ @@ -4206,7 +4203,7 @@ * catch that somewhere and abort the association * right away (assuming this is an INIT being sent).
*/ - if (ro->ro_rt == NULL) { + if (ro->ro_nh == NULL) { /* * src addr selection failed to find a route * (or valid source addr), so we can't get @@ -4225,7 +4222,7 @@ SCTPDBG(SCTP_DEBUG_OUTPUT3, "Destination is %x\n", (uint32_t)(ntohl(ip->ip_dst.s_addr))); SCTPDBG(SCTP_DEBUG_OUTPUT3, "RTP route is %p through\n", - (void *)ro->ro_rt); + (void *)ro->ro_nh); if (SCTP_GET_HEADER_FOR_OUTPUT(o_pak)) { /* failed to prepend data, give up */ @@ -4278,13 +4275,13 @@ SCTPDBG(SCTP_DEBUG_OUTPUT3, "IP output returns %d\n", ret); if (net == NULL) { /* free tempy routes */ - RO_RTFREE(ro); + RO_NHFREE(ro); } else { - if ((ro->ro_rt != NULL) && (net->ro._s_addr) && + if ((ro->ro_nh != NULL) && (net->ro._s_addr) && ((net->dest_state & SCTP_ADDR_NO_PMTUD) == 0)) { uint32_t mtu; - mtu = SCTP_GATHER_MTU_FROM_ROUTE(net->ro._s_addr, &net->ro._l_addr.sa, ro->ro_rt); + mtu = SCTP_GATHER_MTU_FROM_ROUTE(net->ro._s_addr, &net->ro._l_addr.sa, ro->ro_nh); if (mtu > 0) { if (net->port) { mtu -= sizeof(struct udphdr); @@ -4296,7 +4293,7 @@ net->mtu = mtu; } } - } else if (ro->ro_rt == NULL) { + } else if (ro->ro_nh == NULL) { /* route was freed */ if (net->ro._s_addr && net->src_addr_selected) { @@ -4426,10 +4423,7 @@ sctp_free_ifa(net->ro._s_addr); net->ro._s_addr = NULL; net->src_addr_selected = 0; - if (ro->ro_rt) { - RTFREE(ro->ro_rt); - ro->ro_rt = NULL; - } + RO_NHFREE(ro); } if (net->src_addr_selected == 0) { sin6 = (struct sockaddr_in6 *)&net->ro._l_addr; @@ -4489,7 +4483,7 @@ } lsa6->sin6_port = inp->sctp_lport; - if (ro->ro_rt == NULL) { + if (ro->ro_nh == NULL) { /* * src addr selection failed to find a route * (or valid source addr), so we can't get @@ -4625,13 +4619,13 @@ } if (net == NULL) { /* Now if we had a temp route free it */ - RO_RTFREE(ro); + RO_NHFREE(ro); } else { /* * PMTU check versus smallest asoc MTU goes * here */ - if (ro->ro_rt == NULL) { + if (ro->ro_nh == NULL) { /* Route was freed */ if (net->ro._s_addr && net->src_addr_selected) { @@ -4640,11 +4634,11 @@ } 
net->src_addr_selected = 0; } - if ((ro->ro_rt != NULL) && (net->ro._s_addr) && + if ((ro->ro_nh != NULL) && (net->ro._s_addr) && ((net->dest_state & SCTP_ADDR_NO_PMTUD) == 0)) { uint32_t mtu; - mtu = SCTP_GATHER_MTU_FROM_ROUTE(net->ro._s_addr, &net->ro._l_addr.sa, ro->ro_rt); + mtu = SCTP_GATHER_MTU_FROM_ROUTE(net->ro._s_addr, &net->ro._l_addr.sa, ro->ro_nh); if (mtu > 0) { if (net->port) { mtu -= sizeof(struct udphdr); @@ -13836,7 +13830,7 @@ struct nd_pfxrouter *pfxrtr = NULL; struct sockaddr_in6 gw6; - if (ro == NULL || ro->ro_rt == NULL || src6->sin6_family != AF_INET6) + if (ro == NULL || ro->ro_nh == NULL || src6->sin6_family != AF_INET6) return (0); /* get prefix entry of address */ @@ -13869,8 +13863,8 @@ SCTPDBG(SCTP_DEBUG_OUTPUT2, "prefix router is "); SCTPDBG_ADDR(SCTP_DEBUG_OUTPUT2, (struct sockaddr *)&gw6); SCTPDBG(SCTP_DEBUG_OUTPUT2, "installed router is "); - SCTPDBG_ADDR(SCTP_DEBUG_OUTPUT2, ro->ro_rt->rt_gateway); - if (sctp_cmpaddr((struct sockaddr *)&gw6, ro->ro_rt->rt_gateway)) { + SCTPDBG_ADDR(SCTP_DEBUG_OUTPUT2, &ro->ro_nh->gw_sa); + if (sctp_cmpaddr((struct sockaddr *)&gw6, &ro->ro_nh->gw_sa)) { ND6_RUNLOCK(); SCTPDBG(SCTP_DEBUG_OUTPUT2, "pfxrouter is installed\n"); return (1); @@ -13890,7 +13884,7 @@ struct ifaddr *ifa; struct in_addr srcnetaddr, gwnetaddr; - if (ro == NULL || ro->ro_rt == NULL || + if (ro == NULL || ro->ro_nh == NULL || sifa->address.sa.sa_family != AF_INET) { return (0); } @@ -13902,10 +13896,10 @@ SCTPDBG_ADDR(SCTP_DEBUG_OUTPUT2, &sifa->address.sa); SCTPDBG(SCTP_DEBUG_OUTPUT1, "network address is %x\n", srcnetaddr.s_addr); - sin = (struct sockaddr_in *)ro->ro_rt->rt_gateway; + sin = &ro->ro_nh->gw4_sa; gwnetaddr.s_addr = (sin->sin_addr.s_addr & mask->sin_addr.s_addr); SCTPDBG(SCTP_DEBUG_OUTPUT1, "match_nexthop4: nexthop is "); - SCTPDBG_ADDR(SCTP_DEBUG_OUTPUT2, ro->ro_rt->rt_gateway); + SCTPDBG_ADDR(SCTP_DEBUG_OUTPUT2, &ro->ro_nh->gw4_sa); SCTPDBG(SCTP_DEBUG_OUTPUT1, "network address is %x\n", gwnetaddr.s_addr); if 
(srcnetaddr.s_addr == gwnetaddr.s_addr) { return (1); Index: sys/netinet/sctp_pcb.c =================================================================== --- sys/netinet/sctp_pcb.c +++ sys/netinet/sctp_pcb.c @@ -3978,9 +3978,11 @@ } else { imtu = 0; } - rmtu = SCTP_GATHER_MTU_FROM_ROUTE(net->ro._s_addr, &net->ro._l_addr.sa, net->ro.ro_rt); + rmtu = SCTP_GATHER_MTU_FROM_ROUTE(net->ro._s_addr, &net->ro._l_addr.sa, net->ro.ro_nh); hcmtu = sctp_hc_get_mtu(&net->ro._l_addr, stcb->sctp_ep->fibnum); net->mtu = sctp_min_mtu(hcmtu, rmtu, imtu); + /* XXXME: not possible */ +#if 0 if (rmtu == 0) { /* * Start things off to match mtu of @@ -3989,6 +3991,7 @@ SCTP_SET_MTU_OF_ROUTE(&net->ro._l_addr.sa, net->ro.ro_rt, net->mtu); } +#endif } } if (net->mtu == 0) { @@ -4069,19 +4072,19 @@ *netp = net; } netfirst = TAILQ_FIRST(&stcb->asoc.nets); - if (net->ro.ro_rt == NULL) { + if (net->ro.ro_nh == NULL) { /* Since we have no route put it at the back */ TAILQ_INSERT_TAIL(&stcb->asoc.nets, net, sctp_next); } else if (netfirst == NULL) { /* We are the first one in the pool. */ TAILQ_INSERT_HEAD(&stcb->asoc.nets, net, sctp_next); - } else if (netfirst->ro.ro_rt == NULL) { + } else if (netfirst->ro.ro_nh == NULL) { /* * First one has NO route. Place this one ahead of the first * one. */ TAILQ_INSERT_HEAD(&stcb->asoc.nets, net, sctp_next); - } else if (net->ro.ro_rt->rt_ifp != netfirst->ro.ro_rt->rt_ifp) { + } else if (net->ro.ro_nh->nh_ifp != netfirst->ro.ro_nh->nh_ifp) { /* * This one has a different interface than the one at the * top of the list. Place it ahead. 
@@ -4102,11 +4105,11 @@ /* End of the list */ TAILQ_INSERT_TAIL(&stcb->asoc.nets, net, sctp_next); break; - } else if (netlook->ro.ro_rt == NULL) { + } else if (netlook->ro.ro_nh == NULL) { /* next one has NO route */ TAILQ_INSERT_BEFORE(netfirst, net, sctp_next); break; - } else if (netlook->ro.ro_rt->rt_ifp != net->ro.ro_rt->rt_ifp) { + } else if (netlook->ro.ro_nh->nh_ifp != net->ro.ro_nh->nh_ifp) { TAILQ_INSERT_AFTER(&stcb->asoc.nets, netlook, net, sctp_next); break; @@ -4119,8 +4122,8 @@ /* got to have a primary set */ if (stcb->asoc.primary_destination == 0) { stcb->asoc.primary_destination = net; - } else if ((stcb->asoc.primary_destination->ro.ro_rt == NULL) && - (net->ro.ro_rt) && + } else if ((stcb->asoc.primary_destination->ro.ro_nh == NULL) && + (net->ro.ro_nh) && ((net->dest_state & SCTP_ADDR_UNCONFIRMED) == 0)) { /* No route to current primary adopt new primary */ stcb->asoc.primary_destination = net; @@ -5461,14 +5464,9 @@ TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { if (net->ro._s_addr == laddr->ifa) { /* Yep, purge src address selected */ - sctp_rtentry_t *rt; /* delete this address if cached */ - rt = net->ro.ro_rt; - if (rt != NULL) { - RTFREE(rt); - net->ro.ro_rt = NULL; - } + RO_NHFREE(&net->ro); sctp_free_ifa(net->ro._s_addr); net->ro._s_addr = NULL; net->src_addr_selected = 0; Index: sys/netinet/sctp_structs.h =================================================================== --- sys/netinet/sctp_structs.h +++ sys/netinet/sctp_structs.h @@ -189,7 +189,7 @@ #define SCTP_ITERATOR_STOP_CUR_INP 0x00000008 struct sctp_net_route { - sctp_rtentry_t *ro_rt; + struct nhop_object *ro_nh; struct llentry *ro_lle; char *ro_prepend; uint16_t ro_plen; Index: sys/netinet/sctp_timer.c =================================================================== --- sys/netinet/sctp_timer.c +++ sys/netinet/sctp_timer.c @@ -350,7 +350,7 @@ return (NULL); } } - if (alt->ro.ro_rt == NULL) { + if (alt->ro.ro_nh == NULL) { if (alt->ro._s_addr) { 
sctp_free_ifa(alt->ro._s_addr); alt->ro._s_addr = NULL; @@ -358,7 +358,7 @@ alt->src_addr_selected = 0; } if (((alt->dest_state & SCTP_ADDR_REACHABLE) == SCTP_ADDR_REACHABLE) && - (alt->ro.ro_rt != NULL) && + (alt->ro.ro_nh != NULL) && (!(alt->dest_state & SCTP_ADDR_UNCONFIRMED))) { /* Found a reachable address */ break; @@ -937,10 +937,7 @@ net->src_addr_selected = 0; /* Force a route allocation too */ - if (net->ro.ro_rt) { - RTFREE(net->ro.ro_rt); - net->ro.ro_rt = NULL; - } + RO_NHFREE(&net->ro); /* Was it our primary? */ if ((stcb->asoc.primary_destination == net) && (alt != net)) { @@ -1501,7 +1498,7 @@ net->src_addr_selected = 1; } if (net->ro._s_addr) { - mtu = SCTP_GATHER_MTU_FROM_ROUTE(net->ro._s_addr, &net->ro._s_addr.sa, net->ro.ro_rt); + mtu = SCTP_GATHER_MTU_FROM_ROUTE(net->ro._s_addr, &net->ro._s_addr.sa, net->ro.ro_nh); #if defined(INET) || defined(INET6) if (net->port) { mtu -= sizeof(struct udphdr); Index: sys/netinet/sctp_var.h =================================================================== --- sys/netinet/sctp_var.h +++ sys/netinet/sctp_var.h @@ -187,9 +187,9 @@ if ((__net)) { \ if (SCTP_DECREMENT_AND_CHECK_REFCOUNT(&(__net)->ref_count)) { \ (void)SCTP_OS_TIMER_STOP(&(__net)->rxt_timer.timer); \ - if ((__net)->ro.ro_rt) { \ - RTFREE((__net)->ro.ro_rt); \ - (__net)->ro.ro_rt = NULL; \ + if ((__net)->ro.ro_nh) { \ + NH_FREE((__net)->ro.ro_nh); \ + (__net)->ro.ro_nh = NULL; \ } \ if ((__net)->src_addr_selected) { \ sctp_free_ifa((__net)->ro._s_addr); \ Index: sys/netinet/tcp_offload.c =================================================================== --- sys/netinet/tcp_offload.c +++ sys/netinet/tcp_offload.c @@ -41,8 +41,11 @@ #include #include #include +#include #include #include +#include +#include #include #include #define TCPOUTFLAGS @@ -60,7 +63,8 @@ { struct ifnet *ifp; struct toedev *tod; - struct rtentry *rt; + struct nhop_object *nh; + struct epoch_tracker et; int error = EOPNOTSUPP; INP_WLOCK_ASSERT(sotoinpcb(so)); @@ -70,13 +74,20 
@@ if (registered_toedevs == 0) return (error); - rt = rtalloc1(nam, 0, 0); - if (rt) - RT_UNLOCK(rt); - else + NET_EPOCH_ENTER(et); + nh = NULL; + if (nam->sa_family == AF_INET) + nh = fib4_lookup_nh_ptr(0, ((struct sockaddr_in *)nam)->sin_addr, + NHR_NONE, 0, 0); + else if (nam->sa_family == AF_INET6) + nh = fib6_lookup_nh_ptr(0, &((struct sockaddr_in6 *)nam)->sin6_addr, + NHR_NONE, 0, 0); + if (nh == NULL) { + NET_EPOCH_EXIT(et); return (EHOSTUNREACH); + } - ifp = rt->rt_ifp; + ifp = nh->nh_ifp; if (nam->sa_family == AF_INET && !(ifp->if_capenable & IFCAP_TOE4)) goto done; @@ -85,9 +96,9 @@ tod = TOEDEV(ifp); if (tod != NULL) - error = tod->tod_connect(tod, so, rt, nam); + error = tod->tod_connect(tod, so, nh, nam); done: - RTFREE(rt); + NET_EPOCH_EXIT(et); return (error); } Index: sys/netinet/tcp_output.c =================================================================== --- sys/netinet/tcp_output.c +++ sys/netinet/tcp_output.c @@ -64,6 +64,7 @@ #include #include +#include #include #include @@ -1411,8 +1412,8 @@ ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0), NULL, NULL, tp->t_inpcb); - if (error == EMSGSIZE && tp->t_inpcb->inp_route6.ro_rt != NULL) - mtu = tp->t_inpcb->inp_route6.ro_rt->rt_mtu; + if (error == EMSGSIZE && tp->t_inpcb->inp_route6.ro_nh != NULL) + mtu = tp->t_inpcb->inp_route6.ro_nh->nh_mtu; } #endif /* INET6 */ #if defined(INET) && defined(INET6) @@ -1454,8 +1455,8 @@ ((so->so_options & SO_DONTROUTE) ? 
IP_ROUTETOIF : 0), 0, tp->t_inpcb); - if (error == EMSGSIZE && tp->t_inpcb->inp_route.ro_rt != NULL) - mtu = tp->t_inpcb->inp_route.ro_rt->rt_mtu; + if (error == EMSGSIZE && tp->t_inpcb->inp_route.ro_nh != NULL) + mtu = tp->t_inpcb->inp_route.ro_nh->nh_mtu; } #endif /* INET */ Index: sys/netinet/tcp_subr.c =================================================================== --- sys/netinet/tcp_subr.c +++ sys/netinet/tcp_subr.c @@ -76,6 +76,7 @@ #include #include +#include #include #include #include @@ -2199,9 +2200,9 @@ if (tp->t_state == TCPS_ESTABLISHED && (error == EHOSTUNREACH || error == ENETUNREACH || error == EHOSTDOWN)) { - if (inp->inp_route.ro_rt) { - RTFREE(inp->inp_route.ro_rt); - inp->inp_route.ro_rt = (struct rtentry *)NULL; + if (inp->inp_route.ro_nh) { + NH_FREE(inp->inp_route.ro_nh); + inp->inp_route.ro_nh = (struct nhop_object *)NULL; } return (inp); } else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 && Index: sys/netinet/toecore.h =================================================================== --- sys/netinet/toecore.h +++ sys/netinet/toecore.h @@ -41,6 +41,7 @@ struct tcphdr; struct in_conninfo; struct tcp_info; +struct nhop_object; struct ktls_session; struct toedev { @@ -51,7 +52,7 @@ * Active open. If a failure occurs, it is reported back by the driver * via toe_connect_failed. */ - int (*tod_connect)(struct toedev *, struct socket *, struct rtentry *, + int (*tod_connect)(struct toedev *, struct socket *, struct nhop_object *, struct sockaddr *); /* Passive open. 
*/ Index: sys/netinet/toecore.c =================================================================== --- sys/netinet/toecore.c +++ sys/netinet/toecore.c @@ -77,7 +77,7 @@ static int toedev_connect(struct toedev *tod __unused, struct socket *so __unused, - struct rtentry *rt __unused, struct sockaddr *nam __unused) + struct nhop_object *nh __unused, struct sockaddr *nam __unused) { return (ENOTSUP); Index: sys/netinet/udp_usrreq.c =================================================================== --- sys/netinet/udp_usrreq.c +++ sys/netinet/udp_usrreq.c @@ -71,6 +71,7 @@ #include #include #include +#include #include #include @@ -761,9 +762,9 @@ INP_WLOCK_ASSERT(inp); if ((errno == EHOSTUNREACH || errno == ENETUNREACH || - errno == EHOSTDOWN) && inp->inp_route.ro_rt) { - RTFREE(inp->inp_route.ro_rt); - inp->inp_route.ro_rt = (struct rtentry *)NULL; + errno == EHOSTDOWN) && inp->inp_route.ro_nh) { + NH_FREE(inp->inp_route.ro_nh); + inp->inp_route.ro_nh = (struct nhop_object *)NULL; } inp->inp_socket->so_error = errno; Index: sys/netinet6/icmp6.c =================================================================== --- sys/netinet6/icmp6.c +++ sys/netinet6/icmp6.c @@ -93,6 +93,7 @@ #include #include #include +#include #include #include @@ -2412,7 +2413,7 @@ } void -icmp6_redirect_output(struct mbuf *m0, struct rtentry *rt) +icmp6_redirect_output(struct mbuf *m0, struct nhop_object *nh) { struct ifnet *ifp; /* my outgoing interface */ struct in6_addr *ifp_ll6; @@ -2435,7 +2436,7 @@ goto fail; /* sanity check */ - if (!m0 || !rt || !(rt->rt_flags & RTF_UP) || !(ifp = rt->rt_ifp)) + if (!m0 || !nh || !(NH_IS_VALID(nh)) || !(ifp = nh->nh_ifp)) goto fail; /* @@ -2469,7 +2470,7 @@ m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); if (m == NULL) goto fail; - M_SETFIB(m, rt->rt_fibnum); + M_SETFIB(m, M_GETFIB(m0)); maxlen = M_TRAILINGSPACE(m); maxlen = min(IPV6_MMTU, maxlen); /* just for safety */ @@ -2491,9 +2492,9 @@ } /* get ip6 linklocal address for the router. 
*/ - if (rt->rt_gateway && (rt->rt_flags & RTF_GATEWAY)) { + if (nh->nh_flags & NHF_GATEWAY) { struct sockaddr_in6 *sin6; - sin6 = (struct sockaddr_in6 *)rt->rt_gateway; + sin6 = &nh->gw6_sa; router_ll6 = &sin6->sin6_addr; if (!IN6_IS_ADDR_LINKLOCAL(router_ll6)) router_ll6 = (struct in6_addr *)NULL; @@ -2517,7 +2518,7 @@ nd_rd->nd_rd_type = ND_REDIRECT; nd_rd->nd_rd_code = 0; nd_rd->nd_rd_reserved = 0; - if (rt->rt_flags & RTF_GATEWAY) { + if (nh->nh_flags & NHF_GATEWAY) { /* * nd_rd->nd_rd_target must be a link-local address in * better router cases. Index: sys/netinet6/in6.h =================================================================== --- sys/netinet6/in6.h +++ sys/netinet6/in6.h @@ -375,8 +375,9 @@ * IP6 route structure */ #if __BSD_VISIBLE +struct nhop_object; struct route_in6 { - struct rtentry *ro_rt; + struct nhop_object *ro_nh; struct llentry *ro_lle; /* * ro_prepend and ro_plen are only used for bpf to pass in a Index: sys/netinet6/in6_fib.h =================================================================== --- sys/netinet6/in6_fib.h +++ sys/netinet6/in6_fib.h @@ -58,5 +58,14 @@ uint32_t scopeid, uint32_t flags, uint32_t flowid, struct nhop6_extended *pnh6); void fib6_free_nh_ext(uint32_t fibnum, struct nhop6_extended *pnh6); +struct nhop_object *fib6_lookup_nh_ptr(uint32_t fibnum, + const struct in6_addr *dst6, uint32_t scopeid, uint32_t flags, + uint32_t flowid); +int fib6_lookup_urpf(uint32_t fibnum, const struct in6_addr *dst6, + uint32_t scopeid, uint32_t flags, const struct ifnet *src_if); + +uint32_t fib6_calc_software_hash(const struct in6_addr *src, + const struct in6_addr *dst, unsigned short src_port, unsigned short dst_port, + char proto, uint32_t *phashtype); #endif Index: sys/netinet6/in6_fib.c =================================================================== --- sys/netinet6/in6_fib.c +++ sys/netinet6/in6_fib.c @@ -34,6 +34,7 @@ #include "opt_inet6.h" #include "opt_route.h" #include "opt_mpath.h" +#include "opt_route_mpath.h" 
#include #include @@ -50,12 +51,11 @@ #include #include #include +#include +#include +#include #include -#ifdef RADIX_MPATH -#include -#endif - #include #include #include @@ -64,98 +64,99 @@ #include #include #include +#include #include #ifdef INET6 -static void fib6_rte_to_nh_extended(struct rtentry *rte, +static void fib6_rte_to_nh_extended(const struct nhop_object *nh, const struct in6_addr *dst, uint32_t flags, struct nhop6_extended *pnh6); -static void fib6_rte_to_nh_basic(struct rtentry *rte, const struct in6_addr *dst, +static void fib6_rte_to_nh_basic(const struct nhop_object *nh, const struct in6_addr *dst, uint32_t flags, struct nhop6_basic *pnh6); -static struct ifnet *fib6_get_ifaifp(struct rtentry *rte); #define RNTORT(p) ((struct rtentry *)(p)) #define ifatoia6(ifa) ((struct in6_ifaddr *)(ifa)) CHK_STRUCT_ROUTE_COMPAT(struct route_in6, ro_dst); -/* - * Gets real interface for the @rte. - * Returns rt_ifp for !IFF_LOOPBACK routers. - * Extracts "real" address interface from interface address - * loopback routes. 
- */ -static struct ifnet * -fib6_get_ifaifp(struct rtentry *rte) +#ifdef ROUTE_MPATH +struct _hash_5tuple_ipv6 { + struct in6_addr src; + struct in6_addr dst; + unsigned short src_port; + unsigned short dst_port; + char proto; + char spare[3]; +}; +_Static_assert(sizeof(struct _hash_5tuple_ipv6) == 40, + "_hash_5tuple_ipv6 size is wrong"); + +uint32_t +fib6_calc_software_hash(const struct in6_addr *src, const struct in6_addr *dst, + unsigned short src_port, unsigned short dst_port, char proto, + uint32_t *phashtype) { - struct ifnet *ifp; - struct sockaddr_dl *sdl; + struct _hash_5tuple_ipv6 data; - ifp = rte->rt_ifp; - if ((ifp->if_flags & IFF_LOOPBACK) && - rte->rt_gateway->sa_family == AF_LINK) { - sdl = (struct sockaddr_dl *)rte->rt_gateway; - return (ifnet_byindex(sdl->sdl_index)); - } + data.src = *src; + data.dst = *dst; + data.src_port = src_port; + data.dst_port = dst_port; + data.proto = proto; + data.spare[0] = data.spare[1] = data.spare[2] = 0; - return (ifp); + *phashtype = M_HASHTYPE_OPAQUE_HASH; + + return (toeplitz_hash(MPATH_ENTROPY_KEY_LEN, mpath_entropy_key, + sizeof(data), (uint8_t *)&data)); } +#endif static void -fib6_rte_to_nh_basic(struct rtentry *rte, const struct in6_addr *dst, +fib6_rte_to_nh_basic(const struct nhop_object *nh, const struct in6_addr *dst, uint32_t flags, struct nhop6_basic *pnh6) { - struct sockaddr_in6 *gw; /* Do explicit nexthop zero unless we're copying it */ memset(pnh6, 0, sizeof(*pnh6)); if ((flags & NHR_IFAIF) != 0) - pnh6->nh_ifp = fib6_get_ifaifp(rte); + pnh6->nh_ifp = nh->nh_aifp; else - pnh6->nh_ifp = rte->rt_ifp; + pnh6->nh_ifp = nh->nh_ifp; - pnh6->nh_mtu = min(rte->rt_mtu, IN6_LINKMTU(rte->rt_ifp)); - if (rte->rt_flags & RTF_GATEWAY) { + pnh6->nh_mtu = nh->nh_mtu; + if (nh->nh_flags & NHF_GATEWAY) { /* Return address with embedded scope. 
*/ - gw = (struct sockaddr_in6 *)rte->rt_gateway; - pnh6->nh_addr = gw->sin6_addr; + pnh6->nh_addr = nh->gw6_sa.sin6_addr; } else pnh6->nh_addr = *dst; /* Set flags */ - pnh6->nh_flags = fib_rte_to_nh_flags(rte->rt_flags); - gw = (struct sockaddr_in6 *)rt_key(rte); - if (IN6_IS_ADDR_UNSPECIFIED(&gw->sin6_addr)) - pnh6->nh_flags |= NHF_DEFAULT; + pnh6->nh_flags = nh->nh_flags; } static void -fib6_rte_to_nh_extended(struct rtentry *rte, const struct in6_addr *dst, +fib6_rte_to_nh_extended(const struct nhop_object *nh, const struct in6_addr *dst, uint32_t flags, struct nhop6_extended *pnh6) { - struct sockaddr_in6 *gw; /* Do explicit nexthop zero unless we're copying it */ memset(pnh6, 0, sizeof(*pnh6)); if ((flags & NHR_IFAIF) != 0) - pnh6->nh_ifp = fib6_get_ifaifp(rte); + pnh6->nh_ifp = nh->nh_aifp; else - pnh6->nh_ifp = rte->rt_ifp; + pnh6->nh_ifp = nh->nh_ifp; - pnh6->nh_mtu = min(rte->rt_mtu, IN6_LINKMTU(rte->rt_ifp)); - if (rte->rt_flags & RTF_GATEWAY) { + pnh6->nh_mtu = nh->nh_mtu; + if (nh->nh_flags & NHF_GATEWAY) { /* Return address with embedded scope. 
*/ - gw = (struct sockaddr_in6 *)rte->rt_gateway; - pnh6->nh_addr = gw->sin6_addr; + pnh6->nh_addr = nh->gw6_sa.sin6_addr; } else pnh6->nh_addr = *dst; /* Set flags */ - pnh6->nh_flags = fib_rte_to_nh_flags(rte->rt_flags); - gw = (struct sockaddr_in6 *)rt_key(rte); - if (IN6_IS_ADDR_UNSPECIFIED(&gw->sin6_addr)) - pnh6->nh_flags |= NHF_DEFAULT; - pnh6->nh_ia = ifatoia6(rte->rt_ifa); + pnh6->nh_flags = nh->nh_flags; + pnh6->nh_ia = ifatoia6(nh->nh_ifa); } /* @@ -180,7 +181,7 @@ struct rib_head *rh; struct radix_node *rn; struct sockaddr_in6 sin6; - struct rtentry *rte; + struct nhop_object *nh; KASSERT((fibnum < rt_numfibs), ("fib6_lookup_nh_basic: bad fibnum")); rh = rt_tables_get_rnh(fibnum, AF_INET6); @@ -198,10 +199,10 @@ RIB_RLOCK(rh); rn = rh->rnh_matchaddr((void *)&sin6, &rh->head); if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) { - rte = RNTORT(rn); + nh = RT_SELECT_NHOP((RNTORT(rn)), flowid); /* Ensure route & ifp is UP */ - if (RT_LINK_IS_UP(rte->rt_ifp)) { - fib6_rte_to_nh_basic(rte, &sin6.sin6_addr, flags, pnh6); + if (RT_LINK_IS_UP(nh->nh_ifp)) { + fib6_rte_to_nh_basic(nh, &sin6.sin6_addr, flags, pnh6); RIB_RUNLOCK(rh); return (0); } @@ -230,7 +231,7 @@ struct rib_head *rh; struct radix_node *rn; struct sockaddr_in6 sin6; - struct rtentry *rte; + struct nhop_object *nh; KASSERT((fibnum < rt_numfibs), ("fib6_lookup_nh_ext: bad fibnum")); rh = rt_tables_get_rnh(fibnum, AF_INET6); @@ -248,17 +249,10 @@ RIB_RLOCK(rh); rn = rh->rnh_matchaddr((void *)&sin6, &rh->head); if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) { - rte = RNTORT(rn); -#ifdef RADIX_MPATH - rte = rt_mpath_select(rte, flowid); - if (rte == NULL) { - RIB_RUNLOCK(rh); - return (ENOENT); - } -#endif + nh = RT_SELECT_NHOP((RNTORT(rn)), flowid); /* Ensure route & ifp is UP */ - if (RT_LINK_IS_UP(rte->rt_ifp)) { - fib6_rte_to_nh_extended(rte, &sin6.sin6_addr, flags, + if (RT_LINK_IS_UP(nh->nh_ifp)) { + fib6_rte_to_nh_extended(nh, &sin6.sin6_addr, flags, pnh6); if ((flags & NHR_REF) != 0) { 
/* TODO: Do lwref on egress ifp's */ @@ -277,6 +271,138 @@ fib6_free_nh_ext(uint32_t fibnum, struct nhop6_extended *pnh6) { +} + +/* + * Looks up @dst6 in @fibnum; returns nexthop (referenced if NHR_REF) or NULL. + * Assumes scope is deembedded and provided in @scopeid + */ +struct nhop_object * +fib6_lookup_nh_ptr(uint32_t fibnum, const struct in6_addr *dst6, + uint32_t scopeid, uint32_t flags, uint32_t flowid) +{ + RIB_RLOCK_TRACKER; + struct rib_head *rh; + struct radix_node *rn; + struct nhop_object *nh; + struct sockaddr_in6 sin6; + + KASSERT((fibnum < rt_numfibs), ("fib6_lookup_nh_ptr: bad fibnum")); + rh = rt_tables_get_rnh(fibnum, AF_INET6); + if (rh == NULL) + return (NULL); + + /* TODO: radix changes */ + //addr = *dst6; + /* Prepare lookup key */ + memset(&sin6, 0, sizeof(sin6)); + sin6.sin6_len = sizeof(struct sockaddr_in6); + sin6.sin6_addr = *dst6; + + /* Assume scopeid is valid and embed it directly */ + if (IN6_IS_SCOPE_LINKLOCAL(dst6)) + sin6.sin6_addr.s6_addr16[1] = htons(scopeid & 0xffff); + + RIB_RLOCK(rh); + rn = rh->rnh_matchaddr((void *)&sin6, &rh->head); + if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) { + nh = RT_SELECT_NHOP((RNTORT(rn)), flowid); + /* Ensure route & ifp is UP */ + if (RT_LINK_IS_UP(nh->nh_ifp)) { + if (flags & NHR_REF) + nhop_ref_object(nh); + RIB_RUNLOCK(rh); + return (nh); + } + } + RIB_RUNLOCK(rh); + + RTSTAT_INC(rts_unreach); + return (NULL); +} + +inline static int +check_urpf(const struct nhop_object *nh, uint32_t flags, + const struct ifnet *src_if) +{ +#ifdef ROUTE_MPATH + const struct nhgrp_object *nhgrp; + + if (NH_IS_MULTIPATH(nh)) { + nhgrp = (const struct nhgrp_object *)nh; + + if (src_if == NULL) { + if ((flags & NHR_NODEFAULT) == 0) + return (1); + else if ((nhgrp->nhops[0]->nh_flags & NHF_DEFAULT) == 0) + return (1); + return (0); + } + + /* src_if != NULL, need to iterate over nhops */ + /* TODO: consider iterating control plane nhop list */ + for (int i = 0; i < nhgrp->mp_size; i++) { + if (nhgrp->nhops[i]->nh_aifp == src_if) + return (1); + } + return (0); + } +#endif + +
if (src_if != NULL && nh->nh_aifp == src_if) { + return (1); + } + if (src_if == NULL) { + if ((flags & NHR_NODEFAULT) == 0) + return (1); + else if ((nh->nh_flags & NHF_DEFAULT) == 0) + return (1); + } + + return (0); +} + +/* + * Performs reverse path forwarding lookup. + * If @src_if is non-NULL, verifies that at least 1 path goes via + * this interface. + * If @src_if is NULL, verifies that a route exists. + * If @flags contains NHR_NODEFAULT, do not consider the default route. + * + * Returns 1 if a route matching the conditions is found, 0 otherwise. + */ +int +fib6_lookup_urpf(uint32_t fibnum, const struct in6_addr *dst6, + uint32_t scopeid, uint32_t flags, const struct ifnet *src_if) +{ + RIB_RLOCK_TRACKER; + struct rib_head *rh; + struct radix_node *rn; + struct nhop_object *nh; + struct sockaddr_in6 sin6; + int ret; + + KASSERT((fibnum < rt_numfibs), ("fib6_lookup_urpf: bad fibnum")); + rh = rt_tables_get_rnh(fibnum, AF_INET6); + if (rh == NULL) + return (0); + + sin6 = (struct sockaddr_in6){ .sin6_len = sizeof(sin6), .sin6_addr = *dst6 }; + /* Assume scopeid is valid and embed it directly */ + if (IN6_IS_SCOPE_LINKLOCAL(dst6)) + sin6.sin6_addr.s6_addr16[1] = htons(scopeid & 0xffff); + + RIB_RLOCK(rh); + rn = rh->rnh_matchaddr((void *)&sin6, &rh->head); + if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) { + nh = (RNTORT(rn))->rt_nhop; + ret = check_urpf(nh, flags, src_if); + RIB_RUNLOCK(rh); + return (ret); + } + RIB_RUNLOCK(rh); + + return (0); } #endif Index: sys/netinet6/in6_pcb.c =================================================================== --- sys/netinet6/in6_pcb.c +++ sys/netinet6/in6_pcb.c @@ -74,6 +74,7 @@ #include "opt_ipsec.h" #include "opt_pcbgroup.h" #include "opt_rss.h" +#include "opt_route_mpath.h" #include #include @@ -97,6 +98,7 @@ #include #include #include +#include #include #include @@ -109,6 +111,7 @@ #include #include #include +#include #include static struct inpcb *in6_pcblookup_hash_locked(struct inpcbinfo *, @@ -417,10 +420,22 @@ struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)nam; struct in6_addr addr6;
int error; +#ifdef ROUTE_MPATH + uint32_t hash_val, hash_type; +#endif INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(pcbinfo); +#ifdef ROUTE_MPATH + if (V_fib_hash_outbound) { + hash_val = fib6_calc_software_hash(&inp->in6p_laddr, + &sin6->sin6_addr, 0, sin6->sin6_port, + inp->inp_socket->so_proto->pr_protocol, &hash_type); + inp->inp_flowid = hash_val; + inp->inp_flowtype = hash_type; + } +#endif /* * Call inner routine, to assign local interface address. * in6_pcbladdr() may automatically fill in sin6_scope_id. Index: sys/netinet6/in6_rmx.c =================================================================== --- sys/netinet6/in6_rmx.c +++ sys/netinet6/in6_rmx.c @@ -73,6 +73,7 @@ #include #include #include +#include #include #include @@ -80,6 +81,8 @@ #include #include #include +#include +#include #include #include @@ -101,6 +104,40 @@ extern int in6_detachhead(void **head, int off); #endif +static int +rib6_preadd(u_int fibnum, const struct sockaddr *addr, const struct sockaddr *mask, + struct nhop_request *req) +{ + + /* XXX: RTF_LOCAL */ + + /* + * Check route MTU: + * inherit interface MTU if not set or + * check if MTU is too large. + */ + if (req->mtu == 0) { + req->mtu = IN6_LINKMTU(req->ifp); + } else if (req->mtu > IN6_LINKMTU(req->ifp)) + req->mtu = IN6_LINKMTU(req->ifp); + + /* Flag the default route nhop: non-host route with an all-zero mask (mask may be NULL) */ + const struct sockaddr_in6 *mask6 = (const struct sockaddr_in6 *)mask; + if ((req->rt_flags & RTF_HOST) == 0 && mask6 != NULL && IN6_IS_ADDR_UNSPECIFIED(&mask6->sin6_addr)) + req->nh_flags_additional |= NHF_DEFAULT; + + /* Set nexthop type */ + if (req->nh_type == 0) { + if (req->rt_flags & RTF_GATEWAY) + req->nh_type = NH_TYPE_IPV6_ETHER_NHOP; + else + req->nh_type = NH_TYPE_IPV6_ETHER_RSLV; + } + + return (0); +} + +#if 0 /* * Do what we need to do when inserting a route. */ @@ -151,6 +188,7 @@ return (rn_addroute(v_arg, n_arg, head, treenodes)); } +#endif /* * Initialize our routing tree.
@@ -166,7 +204,7 @@ if (rh == NULL) return (0); - rh->rnh_addaddr = in6_addroute; + rh->rnh_preadd = rib6_preadd; *head = (void *)rh; return (1); @@ -186,31 +224,4 @@ /* * Extended API for IPv6 FIB support. */ -int -in6_rtrequest(int req, struct sockaddr *dst, struct sockaddr *gw, - struct sockaddr *mask, int flags, struct rtentry **ret_nrt, u_int fibnum) -{ - return (rtrequest_fib(req, dst, gw, mask, flags, ret_nrt, fibnum)); -} - -void -in6_rtalloc(struct route_in6 *ro, u_int fibnum) -{ - - rtalloc_ign_fib((struct route *)ro, 0ul, fibnum); -} - -void -in6_rtalloc_ign(struct route_in6 *ro, u_long ignflags, u_int fibnum) -{ - - rtalloc_ign_fib((struct route *)ro, ignflags, fibnum); -} - -struct rtentry * -in6_rtalloc1(struct sockaddr *dst, int report, u_long ignflags, u_int fibnum) -{ - - return (rtalloc1_fib(dst, report, ignflags, fibnum)); -} Index: sys/netinet6/in6_src.c =================================================================== --- sys/netinet6/in6_src.c +++ sys/netinet6/in6_src.c @@ -91,6 +91,7 @@ #include #include #include +#include #include #ifdef RADIX_MPATH #include @@ -134,7 +135,7 @@ static int selectroute(struct sockaddr_in6 *, struct ip6_pktopts *, struct ip6_moptions *, struct route_in6 *, struct ifnet **, - struct rtentry **, int, u_int); + struct nhop_object **, int, u_int, uint32_t); static int in6_selectif(struct sockaddr_in6 *, struct ip6_pktopts *, struct ip6_moptions *, struct ifnet **, struct ifnet *, u_int); @@ -625,11 +626,12 @@ static int selectroute(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts, struct ip6_moptions *mopts, struct route_in6 *ro, - struct ifnet **retifp, struct rtentry **retrt, int norouteok, u_int fibnum) + struct ifnet **retifp, struct nhop_object **retnh, int norouteok, + u_int fibnum, uint32_t flowid) { int error = 0; struct ifnet *ifp = NULL; - struct rtentry *rt = NULL; + struct nhop_object *nh = NULL; struct sockaddr_in6 *sin6_next; struct in6_pktinfo *pi = NULL; struct in6_addr *dst = 
&dstsock->sin6_addr; @@ -654,7 +656,7 @@ /* XXX boundary check is assumed to be already done. */ ifp = ifnet_byindex(pi->ipi6_ifindex); if (ifp != NULL && - (norouteok || retrt == NULL || + (norouteok || retnh == NULL || IN6_IS_ADDR_MULTICAST(dst))) { /* * we do not have to check or get the route for @@ -707,26 +709,31 @@ } ron = &opts->ip6po_nextroute; /* Use a cached route if it exists and is valid. */ - if (ron->ro_rt != NULL && ( - (ron->ro_rt->rt_flags & RTF_UP) == 0 || + if (ron->ro_nh != NULL && ( + !NH_IS_VALID(ron->ro_nh) || ron->ro_dst.sin6_family != AF_INET6 || !IN6_ARE_ADDR_EQUAL(&ron->ro_dst.sin6_addr, &sin6_next->sin6_addr))) - RO_RTFREE(ron); - if (ron->ro_rt == NULL) { + RO_NHFREE(ron); + if (ron->ro_nh == NULL) { ron->ro_dst = *sin6_next; - in6_rtalloc(ron, fibnum); /* multi path case? */ + /* + * sin6_next is not link-local OR scopeid is 0, + * no need to clear scope + */ + ron->ro_nh = fib6_lookup_nh_ptr(fibnum, + &sin6_next->sin6_addr, 0, NHR_REF, flowid); } /* * The node identified by that address must be a * neighbor of the sending host. */ - if (ron->ro_rt == NULL || - (ron->ro_rt->rt_flags & RTF_GATEWAY) != 0) + if (ron->ro_nh == NULL || + (ron->ro_nh->nh_flags & NHF_GATEWAY) != 0) error = EHOSTUNREACH; else { - rt = ron->ro_rt; - ifp = rt->rt_ifp; + nh = ron->ro_nh; + ifp = nh->nh_ifp; } goto done; } @@ -737,15 +744,14 @@ * cached destination, in case of sharing the cache with IPv4. 
*/ if (ro) { - if (ro->ro_rt && - (!(ro->ro_rt->rt_flags & RTF_UP) || + if (ro->ro_nh && + (!NH_IS_VALID(ro->ro_nh) || ((struct sockaddr *)(&ro->ro_dst))->sa_family != AF_INET6 || !IN6_ARE_ADDR_EQUAL(&satosin6(&ro->ro_dst)->sin6_addr, dst))) { - RTFREE(ro->ro_rt); - ro->ro_rt = (struct rtentry *)NULL; + RO_NHFREE(ro); } - if (ro->ro_rt == (struct rtentry *)NULL) { + if (ro->ro_nh == (struct nhop_object *)NULL) { struct sockaddr_in6 *sa6; /* No route yet, so try to acquire one */ @@ -754,15 +760,28 @@ *sa6 = *dstsock; sa6->sin6_scope_id = 0; + /* + * Currently dst has scopeid embedded. + * New routing API accepts scopeid as a separate argument. + * Convert dst before/after doing lookup + */ + uint32_t scopeid = 0; + if (IN6_IS_SCOPE_LINKLOCAL(&sa6->sin6_addr)) { + /* Unwrap in6_getscope() and in6_clearscope() */ + scopeid = ntohs(sa6->sin6_addr.s6_addr16[1]); + sa6->sin6_addr.s6_addr16[1] = 0; + + } + #ifdef RADIX_MPATH rtalloc_mpath_fib((struct route *)ro, ntohl(sa6->sin6_addr.s6_addr32[3]), fibnum); #else - ro->ro_rt = in6_rtalloc1((struct sockaddr *) - &ro->ro_dst, 0, 0UL, fibnum); - if (ro->ro_rt) - RT_UNLOCK(ro->ro_rt); + ro->ro_nh = fib6_lookup_nh_ptr(fibnum, + &sa6->sin6_addr, scopeid, NHR_REF, flowid); #endif + if (IN6_IS_SCOPE_LINKLOCAL(&sa6->sin6_addr)) + sa6->sin6_addr.s6_addr16[1] = htons(scopeid); } /* @@ -772,17 +791,11 @@ if (opts && opts->ip6po_nexthop) goto done; - if (ro->ro_rt) { - ifp = ro->ro_rt->rt_ifp; - - if (ifp == NULL) { /* can this really happen? */ - RTFREE(ro->ro_rt); - ro->ro_rt = NULL; - } - } - if (ro->ro_rt == NULL) + if (ro->ro_nh) + ifp = ro->ro_nh->nh_ifp; + else error = EHOSTUNREACH; - rt = ro->ro_rt; + nh = ro->ro_nh; /* * Check if the outgoing interface conflicts with @@ -803,7 +816,7 @@ } done: - if (ifp == NULL && rt == NULL) { + if (ifp == NULL && nh == NULL) { /* * This can happen if the caller did not pass a cached route * nor any other hints. We treat this case an error. 
@@ -814,26 +827,14 @@ IP6STAT_INC(ip6s_noroute); if (retifp != NULL) { - *retifp = ifp; - - /* - * Adjust the "outgoing" interface. If we're going to loop - * the packet back to ourselves, the ifp would be the loopback - * interface. However, we'd rather know the interface associated - * to the destination address (which should probably be one of - * our own addresses.) - */ - if (rt) { - if ((rt->rt_ifp->if_flags & IFF_LOOPBACK) && - (rt->rt_gateway->sa_family == AF_LINK)) - *retifp = - ifnet_byindex(((struct sockaddr_dl *) - rt->rt_gateway)->sdl_index); - } + if (nh != NULL) + *retifp = nh->nh_aifp; + else + *retifp = ifp; } - if (retrt != NULL) - *retrt = rt; /* rt may be NULL */ + if (retnh != NULL) + *retnh = nh; /* nh may be NULL */ return (error); } @@ -845,20 +846,20 @@ { int error; struct route_in6 sro; - struct rtentry *rt = NULL; - int rt_flags; + struct nhop_object *nh = NULL; + uint16_t nh_flags; KASSERT(retifp != NULL, ("%s: retifp is NULL", __func__)); bzero(&sro, sizeof(sro)); - rt_flags = 0; + nh_flags = 0; - error = selectroute(dstsock, opts, mopts, &sro, retifp, &rt, 1, fibnum); + error = selectroute(dstsock, opts, mopts, &sro, retifp, &nh, 1, fibnum, 0); - if (rt) - rt_flags = rt->rt_flags; - if (rt && rt == sro.ro_rt) - RTFREE(rt); + if (nh != NULL) + nh_flags = nh->nh_flags; + if (nh != NULL && nh == sro.ro_nh) + NH_FREE(nh); if (error != 0) { /* Help ND. See oifp comment in in6_selectsrc(). */ @@ -887,8 +888,8 @@ * We thus reject the case here. */ - if (rt_flags & (RTF_REJECT | RTF_BLACKHOLE)) { - error = (rt_flags & RTF_HOST ? EHOSTUNREACH : ENETUNREACH); + if (nh_flags & (NHF_REJECT | NHF_BLACKHOLE)) { + error = (nh_flags & NHF_HOST ? 
EHOSTUNREACH : ENETUNREACH); return (error); } @@ -899,11 +900,11 @@ int in6_selectroute(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts, struct ip6_moptions *mopts, struct route_in6 *ro, - struct ifnet **retifp, struct rtentry **retrt, u_int fibnum) + struct ifnet **retifp, struct nhop_object **retnh, u_int fibnum, uint32_t flowid) { return (selectroute(dstsock, opts, mopts, ro, retifp, - retrt, 0, fibnum)); + retnh, 0, fibnum, flowid)); } /* Index: sys/netinet6/in6_var.h =================================================================== --- sys/netinet6/in6_var.h +++ sys/netinet6/in6_var.h @@ -915,11 +915,6 @@ * Extended API for IPv6 FIB support. */ struct mbuf *ip6_tryforward(struct mbuf *); -int in6_rtrequest(int, struct sockaddr *, struct sockaddr *, - struct sockaddr *, int, struct rtentry **, u_int); -void in6_rtalloc(struct route_in6 *, u_int); -void in6_rtalloc_ign(struct route_in6 *, u_long, u_int); -struct rtentry *in6_rtalloc1(struct sockaddr *, int, u_long, u_int); #endif /* _KERNEL */ #endif /* _NETINET6_IN6_VAR_H_ */ Index: sys/netinet6/ip6_fastfwd.c =================================================================== --- sys/netinet6/ip6_fastfwd.c +++ sys/netinet6/ip6_fastfwd.c @@ -40,6 +40,7 @@ #include #include #include +#include #include #include @@ -55,30 +56,35 @@ #include static int -ip6_findroute(struct nhop6_basic *pnh, const struct sockaddr_in6 *dst, +ip6_findroute(struct nhop_object **pnh, const struct sockaddr_in6 *dst, struct mbuf *m) { + struct nhop_object *nh; - if (fib6_lookup_nh_basic(M_GETFIB(m), &dst->sin6_addr, - dst->sin6_scope_id, 0, dst->sin6_flowinfo, pnh) != 0) { + nh = fib6_lookup_nh_ptr(M_GETFIB(m), &dst->sin6_addr, + dst->sin6_scope_id, NHR_NONE, m->m_pkthdr.flowid); + if (nh == NULL) { IP6STAT_INC(ip6s_noroute); IP6STAT_INC(ip6s_cantforward); icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_NOROUTE, 0); return (EHOSTUNREACH); } - if (pnh->nh_flags & NHF_BLACKHOLE) { + if (nh->nh_flags & NHF_BLACKHOLE) { 
IP6STAT_INC(ip6s_cantforward); m_freem(m); return (EHOSTUNREACH); } - if (pnh->nh_flags & NHF_REJECT) { + if (nh->nh_flags & NHF_REJECT) { IP6STAT_INC(ip6s_cantforward); icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_REJECT, 0); return (EHOSTUNREACH); } + + *pnh = nh; + return (0); } @@ -86,7 +92,7 @@ ip6_tryforward(struct mbuf *m) { struct sockaddr_in6 dst; - struct nhop6_basic nh; + struct nhop_object *nh; struct m_tag *fwd_tag; struct ip6_hdr *ip6; struct ifnet *rcvif; @@ -196,9 +202,9 @@ goto dropin; } if (!PFIL_HOOKED_OUT(V_inet6_pfil_head)) { - if (m->m_pkthdr.len > nh.nh_mtu) { - in6_ifstat_inc(nh.nh_ifp, ifs6_in_toobig); - icmp6_error(m, ICMP6_PACKET_TOO_BIG, 0, nh.nh_mtu); + if (m->m_pkthdr.len > nh->nh_mtu) { + in6_ifstat_inc(nh->nh_ifp, ifs6_in_toobig); + icmp6_error(m, ICMP6_PACKET_TOO_BIG, 0, nh->nh_mtu); m = NULL; goto dropout; } @@ -208,7 +214,7 @@ /* * Outgoing packet firewall processing. */ - if (pfil_run_hooks(V_inet6_pfil_head, &m, nh.nh_ifp, PFIL_OUT | + if (pfil_run_hooks(V_inet6_pfil_head, &m, nh->nh_ifp, PFIL_OUT | PFIL_FWD, NULL) != PFIL_PASS) goto dropout; @@ -216,9 +222,9 @@ * We used slow path processing for packets with scoped addresses. * So, scope checks aren't needed here. */ - if (m->m_pkthdr.len > nh.nh_mtu) { - in6_ifstat_inc(nh.nh_ifp, ifs6_in_toobig); - icmp6_error(m, ICMP6_PACKET_TOO_BIG, 0, nh.nh_mtu); + if (m->m_pkthdr.len > nh->nh_mtu) { + in6_ifstat_inc(nh->nh_ifp, ifs6_in_toobig); + icmp6_error(m, ICMP6_PACKET_TOO_BIG, 0, nh->nh_mtu); m = NULL; goto dropout; } @@ -272,16 +278,17 @@ } m_clrprotoflags(m); /* Avoid confusing lower layers. 
*/ - IP_PROBE(send, NULL, NULL, ip6, nh.nh_ifp, NULL, ip6); + IP_PROBE(send, NULL, NULL, ip6, nh->nh_ifp, NULL, ip6); - dst.sin6_addr = nh.nh_addr; - error = (*nh.nh_ifp->if_output)(nh.nh_ifp, m, + if (nh->nh_flags & NHF_GATEWAY) + dst.sin6_addr = nh->gw6_sa.sin6_addr; + error = (*nh->nh_ifp->if_output)(nh->nh_ifp, m, (struct sockaddr *)&dst, NULL); if (error != 0) { - in6_ifstat_inc(nh.nh_ifp, ifs6_out_discard); + in6_ifstat_inc(nh->nh_ifp, ifs6_out_discard); IP6STAT_INC(ip6s_cantforward); } else { - in6_ifstat_inc(nh.nh_ifp, ifs6_out_forward); + in6_ifstat_inc(nh->nh_ifp, ifs6_out_forward); IP6STAT_INC(ip6s_forward); } return (NULL); @@ -289,7 +296,7 @@ in6_ifstat_inc(rcvif, ifs6_in_discard); goto drop; dropout: - in6_ifstat_inc(nh.nh_ifp, ifs6_out_discard); + in6_ifstat_inc(nh->nh_ifp, ifs6_out_discard); drop: if (m != NULL) m_freem(m); Index: sys/netinet6/ip6_forward.c =================================================================== --- sys/netinet6/ip6_forward.c +++ sys/netinet6/ip6_forward.c @@ -56,6 +56,7 @@ #include #include #include +#include #include #include @@ -65,6 +66,7 @@ #include #include #include +#include #include #include #include @@ -90,14 +92,13 @@ ip6_forward(struct mbuf *m, int srcrt) { struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); - struct sockaddr_in6 *dst = NULL; - struct rtentry *rt = NULL; - struct route_in6 rin6; + struct sockaddr_in6 dst; + struct nhop_object *nh = NULL; int error, type = 0, code = 0; struct mbuf *mcopy = NULL; struct ifnet *origifp; /* maybe unnecessary */ u_int32_t inzone, outzone; - struct in6_addr src_in6, dst_in6, odst; + struct in6_addr odst; struct m_tag *fwd_tag; char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN]; @@ -165,18 +166,27 @@ /* No IPsec processing required */ } #endif + /* + * ip6_forward() operates with IPv6 addresses with deembedded scope. + * + * There are 3 sources of IPv6 destination address: + * + * 1) ip6_input(), where ip6_dst contains deembedded address. 
+ * In order to deal with forwarding of link-local packets, + * calculate the scope based on input interface (RFC 4007, clause 9). + * 2) packet filters changing ip6_dst directly. It would embed scope + * for LL addresses, so in6_localip() performs properly. + * 3) packet filters attaching PACKET_TAG_IPFORWARD would embed + * scope for the nexthop. + */ + bzero(&dst, sizeof(struct sockaddr_in6)); + dst.sin6_family = AF_INET6; + dst.sin6_addr = ip6->ip6_dst; + dst.sin6_scope_id = in6_get_unicast_scopeid(&ip6->ip6_dst, m->m_pkthdr.rcvif); again: - bzero(&rin6, sizeof(struct route_in6)); - dst = (struct sockaddr_in6 *)&rin6.ro_dst; - dst->sin6_len = sizeof(struct sockaddr_in6); - dst->sin6_family = AF_INET6; - dst->sin6_addr = ip6->ip6_dst; -again2: - rin6.ro_rt = in6_rtalloc1((struct sockaddr *)dst, 0, 0, M_GETFIB(m)); - rt = rin6.ro_rt; - if (rin6.ro_rt != NULL) - RT_UNLOCK(rin6.ro_rt); - else { + nh = fib6_lookup_nh_ptr(M_GETFIB(m), &dst.sin6_addr, dst.sin6_scope_id, + NHR_REF, m->m_pkthdr.flowid); + if (nh == NULL) { IP6STAT_INC(ip6s_noroute); in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_noroute); if (mcopy) { @@ -195,8 +205,10 @@ * will possibly modify its first argument. 
* [draft-ietf-ipngwg-icmp-v3-04.txt, Section 3.1] */ - src_in6 = ip6->ip6_src; - if (in6_setscope(&src_in6, rt->rt_ifp, &outzone)) { + outzone = in6_get_unicast_scopeid(&ip6->ip6_src, nh->nh_ifp); + inzone = in6_get_unicast_scopeid(&ip6->ip6_src, m->m_pkthdr.rcvif); +#if 0 + if (in6_setscope(&src_in6, nh->nh_ifp, &outzone)) { /* XXX: this should not happen */ IP6STAT_INC(ip6s_cantforward); IP6STAT_INC(ip6s_badscope); @@ -207,10 +219,11 @@ IP6STAT_INC(ip6s_badscope); goto bad; } +#endif if (inzone != outzone) { IP6STAT_INC(ip6s_cantforward); IP6STAT_INC(ip6s_badscope); - in6_ifstat_inc(rt->rt_ifp, ifs6_in_discard); + in6_ifstat_inc(nh->nh_ifp, ifs6_in_discard); if (V_ip6_log_time + V_ip6_log_interval < time_uptime) { V_ip6_log_time = time_uptime; @@ -220,7 +233,7 @@ ip6_sprintf(ip6bufs, &ip6->ip6_src), ip6_sprintf(ip6bufd, &ip6->ip6_dst), ip6->ip6_nxt, - if_name(m->m_pkthdr.rcvif), if_name(rt->rt_ifp)); + if_name(m->m_pkthdr.rcvif), if_name(nh->nh_ifp)); } if (mcopy) icmp6_error(mcopy, ICMP6_DST_UNREACH, @@ -235,17 +248,21 @@ * we need an explicit check because we may mistakenly forward the * packet to a different zone by (e.g.) a default route. 
*/ - dst_in6 = ip6->ip6_dst; - if (in6_setscope(&dst_in6, m->m_pkthdr.rcvif, &inzone) != 0 || - in6_setscope(&dst_in6, rt->rt_ifp, &outzone) != 0 || - inzone != outzone) { + inzone = in6_get_unicast_scopeid(&ip6->ip6_dst, m->m_pkthdr.rcvif); + outzone = in6_get_unicast_scopeid(&ip6->ip6_dst, nh->nh_ifp); + + if (inzone != outzone) { IP6STAT_INC(ip6s_cantforward); IP6STAT_INC(ip6s_badscope); goto bad; } - if (rt->rt_flags & RTF_GATEWAY) - dst = (struct sockaddr_in6 *)rt->rt_gateway; + if (nh->nh_flags & NHF_GATEWAY) { + /* Store gateway address in deembedded form */ + dst.sin6_addr = nh->gw6_sa.sin6_addr; + dst.sin6_scope_id = ntohs(in6_getscope(&dst.sin6_addr)); + in6_clearscope(&dst.sin6_addr); + } /* * If we are to forward the packet using the same interface @@ -256,9 +273,9 @@ * Also, don't send redirect if forwarding using a route * modified by a redirect. */ - if (V_ip6_sendredirects && rt->rt_ifp == m->m_pkthdr.rcvif && !srcrt && - (rt->rt_flags & (RTF_DYNAMIC|RTF_MODIFIED)) == 0) { - if ((rt->rt_ifp->if_flags & IFF_POINTOPOINT) != 0) { + if (V_ip6_sendredirects && nh->nh_ifp == m->m_pkthdr.rcvif && !srcrt && + (nh->nh_flags & NHF_REDIRECT) == 0) { + if ((nh->nh_ifp->if_flags & IFF_POINTOPOINT) != 0) { /* * If the incoming interface is equal to the outgoing * one, and the link attached to the interface is @@ -284,7 +301,7 @@ * link identifiers, we can do this stuff after making a copy for * returning an error. */ - if ((rt->rt_ifp->if_flags & IFF_LOOPBACK) != 0) { + if ((nh->nh_ifp->if_flags & IFF_LOOPBACK) != 0) { /* * See corresponding comments in ip6_output. * XXX: but is it possible that ip6_forward() sends a packet @@ -305,14 +322,14 @@ ip6_sprintf(ip6bufs, &ip6->ip6_src), ip6_sprintf(ip6bufd, &ip6->ip6_dst), ip6->ip6_nxt, if_name(m->m_pkthdr.rcvif), - if_name(rt->rt_ifp)); + if_name(nh->nh_ifp)); } /* we can just use rcvif in forwarding. 
*/ origifp = m->m_pkthdr.rcvif; } else - origifp = rt->rt_ifp; + origifp = nh->nh_ifp; /* * clear embedded scope identifiers if necessary. * in6_clearscope will touch the addresses only when necessary. @@ -326,7 +343,7 @@ odst = ip6->ip6_dst; /* Run through list of hooks for forwarded packets. */ - if (pfil_run_hooks(V_inet6_pfil_head, &m, rt->rt_ifp, PFIL_OUT | + if (pfil_run_hooks(V_inet6_pfil_head, &m, nh->nh_ifp, PFIL_OUT | PFIL_FWD, NULL) != PFIL_PASS) goto freecopy; ip6 = mtod(m, struct ip6_hdr *); @@ -338,7 +355,12 @@ if (in6_localip(&ip6->ip6_dst)) m->m_flags |= M_FASTFWD_OURS; else { - RTFREE(rt); + NH_FREE(nh); + + /* Update address and scopeid. Assume scope is embedded */ + dst.sin6_scope_id = ntohs(in6_getscope(&ip6->ip6_dst)); + dst.sin6_addr = ip6->ip6_dst; + in6_clearscope(&dst.sin6_addr); goto again; /* Redo the routing table lookup. */ } } @@ -362,32 +384,43 @@ /* Or forward to some other address? */ if ((m->m_flags & M_IP6_NEXTHOP) && (fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL)) != NULL) { - dst = (struct sockaddr_in6 *)&rin6.ro_dst; - bcopy((fwd_tag+1), dst, sizeof(struct sockaddr_in6)); + struct sockaddr_in6 *gw6 = (struct sockaddr_in6 *)(fwd_tag + 1); + + /* Update address and scopeid. Assume scope is embedded */ + dst.sin6_scope_id = ntohs(in6_getscope(&gw6->sin6_addr)); + dst.sin6_addr = gw6->sin6_addr; + in6_clearscope(&dst.sin6_addr); + m->m_flags |= M_SKIP_FIREWALL; m->m_flags &= ~M_IP6_NEXTHOP; m_tag_delete(m, fwd_tag); - RTFREE(rt); - goto again2; + NH_FREE(nh); + goto again; } pass: /* See if the size was changed by the packet filter. 
*/ - if (m->m_pkthdr.len > IN6_LINKMTU(rt->rt_ifp)) { - in6_ifstat_inc(rt->rt_ifp, ifs6_in_toobig); + /* TODO: change to nh->nh_mtu */ + if (m->m_pkthdr.len > IN6_LINKMTU(nh->nh_ifp)) { + in6_ifstat_inc(nh->nh_ifp, ifs6_in_toobig); if (mcopy) icmp6_error(mcopy, ICMP6_PACKET_TOO_BIG, 0, - IN6_LINKMTU(rt->rt_ifp)); + IN6_LINKMTU(nh->nh_ifp)); goto bad; } - error = nd6_output_ifp(rt->rt_ifp, origifp, m, dst, NULL); + /* Currently LLE layer stores embedded IPv6 addresses */ + if (IN6_IS_SCOPE_LINKLOCAL(&dst.sin6_addr)) { + in6_set_unicast_scopeid(&dst.sin6_addr, dst.sin6_scope_id); + dst.sin6_scope_id = 0; + } + error = nd6_output_ifp(nh->nh_ifp, origifp, m, &dst, NULL); if (error) { - in6_ifstat_inc(rt->rt_ifp, ifs6_out_discard); + in6_ifstat_inc(nh->nh_ifp, ifs6_out_discard); IP6STAT_INC(ip6s_cantforward); } else { IP6STAT_INC(ip6s_forward); - in6_ifstat_inc(rt->rt_ifp, ifs6_out_forward); + in6_ifstat_inc(nh->nh_ifp, ifs6_out_forward); if (type) IP6STAT_INC(ip6s_redirectsent); else { @@ -401,7 +434,7 @@ switch (error) { case 0: if (type == ND_REDIRECT) { - icmp6_redirect_output(mcopy, rt); + icmp6_redirect_output(mcopy, nh); goto out; } goto freecopy; @@ -432,6 +465,6 @@ bad: m_freem(m); out: - if (rt != NULL) - RTFREE(rt); + if (nh != NULL) + NH_FREE(nh); } Index: sys/netinet6/ip6_output.c =================================================================== --- sys/netinet6/ip6_output.c +++ sys/netinet6/ip6_output.c @@ -95,6 +95,7 @@ #include #include #include +#include #include #include #include @@ -403,18 +404,15 @@ * This function may modify ver and hlim only. * The mbuf chain containing the packet will be freed. * The mbuf opt, if present, will not be freed. - * If route_in6 ro is present and has ro_rt initialized, route lookup would be - * skipped and ro->ro_rt would be used. If ro is present but ro->ro_rt is NULL, - * then result of route lookup is stored in ro->ro_rt. 
+ * If route_in6 ro is present and has ro_nh initialized, route lookup would be + * skipped and ro->ro_nh would be used. If ro is present but ro->ro_nh is NULL, + * then result of route lookup is stored in ro->ro_nh. * * Type of "mtu": rt_mtu is u_long, ifnet.ifr_mtu is int, and nd_ifinfo.linkmtu * is uint32_t. So we use u_long to hold largest one, which is rt_mtu. * * ifpp - XXX: just for statistics */ -/* - * XXX TODO: no flowid is assigned for outbound flows? - */ int ip6_output(struct mbuf *m0, struct ip6_pktopts *opt, struct route_in6 *ro, int flags, struct ip6_moptions *im6o, @@ -425,7 +423,7 @@ struct mbuf *m = m0; struct mbuf *mprev; struct route_in6 *ro_pmtu; - struct rtentry *rt; + struct nhop_object *nh; struct sockaddr_in6 *dst, sin6, src_sa, dst_sa; struct in6_addr odst; u_char *nexthdrp; @@ -666,7 +664,7 @@ ip6->ip6_hlim = V_ip6_defmcasthlim; } - if (ro == NULL || ro->ro_rt == NULL) { + if (ro == NULL || ro->ro_nh == NULL) { bzero(dst, sizeof(*dst)); dst->sin6_family = AF_INET6; dst->sin6_len = sizeof(*dst); @@ -676,29 +674,26 @@ * Validate route against routing table changes. * Make sure that the address family is set in route. */ - rt = NULL; + nh = NULL; ifp = NULL; mtu = 0; if (ro != NULL) { - if (ro->ro_rt != NULL && inp != NULL) { + if (ro->ro_nh != NULL && inp != NULL) { ro->ro_dst.sin6_family = AF_INET6; /* XXX KASSERT? 
*/ - RT_VALIDATE((struct route *)ro, &inp->inp_rt_cookie, + NH_VALIDATE((struct route *)ro, &inp->inp_rt_cookie, fibnum); } - if (ro->ro_rt != NULL && fwd_tag == NULL && - ((ro->ro_rt->rt_flags & RTF_UP) == 0 || - ro->ro_rt->rt_ifp == NULL || - !RT_LINK_IS_UP(ro->ro_rt->rt_ifp) || + if (ro->ro_nh != NULL && fwd_tag == NULL && + (!NH_IS_VALID(ro->ro_nh) || ro->ro_dst.sin6_family != AF_INET6 || !IN6_ARE_ADDR_EQUAL(&ro->ro_dst.sin6_addr, &ip6->ip6_dst))) RO_INVALIDATE_CACHE(ro); - if (ro->ro_rt != NULL && fwd_tag == NULL && - (ro->ro_rt->rt_flags & RTF_UP) && + if (ro->ro_nh != NULL && fwd_tag == NULL && ro->ro_dst.sin6_family == AF_INET6 && IN6_ARE_ADDR_EQUAL(&ro->ro_dst.sin6_addr, &ip6->ip6_dst)) { - rt = ro->ro_rt; - ifp = ro->ro_rt->rt_ifp; + nh = ro->ro_nh; + ifp = nh->nh_ifp; } else { if (ro->ro_lle) LLE_FREE(ro->ro_lle); /* zeros ro_lle */ @@ -710,7 +705,7 @@ dst_sa.sin6_addr = ip6->ip6_dst; } error = in6_selectroute(&dst_sa, opt, im6o, ro, &ifp, - &rt, fibnum); + &nh, fibnum, m->m_pkthdr.flowid); if (error != 0) { IP6STAT_INC(ip6s_noroute); if (ifp != NULL) @@ -720,17 +715,17 @@ if (ifp != NULL) mtu = ifp->if_mtu; } - if (rt == NULL) { + if (nh == NULL) { /* - * If in6_selectroute() does not return a route entry + * If in6_selectroute() does not return nexthop * dst may not have been updated. 
*/ *dst = dst_sa; /* XXX */ } else { - if (rt->rt_flags & RTF_HOST) - mtu = rt->rt_mtu; - ia = (struct in6_ifaddr *)(rt->rt_ifa); - counter_u64_add(rt->rt_pksent, 1); + if (nh->nh_flags & NHF_HOST) + mtu = nh->nh_mtu; + ia = (struct in6_ifaddr *)(nh->nh_ifa); + counter_u64_add(nh->nh_pksent, 1); } } else { struct nhop6_extended nh6; @@ -763,8 +758,8 @@ } } - error = fib6_lookup_nh_ext(fibnum, &kdst, scopeid, NHR_REF, 0, - &nh6); + error = fib6_lookup_nh_ext(fibnum, &kdst, scopeid, NHR_REF, + m->m_pkthdr.flowid, &nh6); if (error != 0) { IP6STAT_INC(ip6s_noroute); /* No ifp in6_ifstat_inc(ifp, ifs6_out_discard); */ @@ -781,7 +776,7 @@ ; } - /* Then rt (for unicast) and ifp must be non-NULL valid values. */ + /* Then nh (for unicast) and ifp must be non-NULL valid values. */ if ((flags & IPV6_FORWARDING) == 0) { /* XXX: the FORWARDING flag can be set for mrouting. */ in6_ifstat_inc(ifp, ifs6_out_request); @@ -852,8 +847,8 @@ } /* All scope ID checks are successful. */ - if (rt && !IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { - if (opt && opt->ip6po_nextroute.ro_rt) { + if (nh && !IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { + if (opt && opt->ip6po_nextroute.ro_nh) { /* * The nexthop is explicitly specified by the * application. 
We assume the next hop is an IPv6 @@ -861,8 +856,8 @@ */ dst = (struct sockaddr_in6 *)opt->ip6po_nexthop; } - else if ((rt->rt_flags & RTF_GATEWAY)) - dst = (struct sockaddr_in6 *)rt->rt_gateway; + else if ((nh->nh_flags & NHF_GATEWAY)) + dst = &nh->gw6_sa; } if (!IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { @@ -1517,8 +1512,8 @@ mtu = ro_pmtu->ro_mtu; } - if (ro_pmtu != NULL && ro_pmtu->ro_rt != NULL) - mtu = ro_pmtu->ro_rt->rt_mtu; + if (ro_pmtu != NULL && ro_pmtu->ro_nh != NULL) + mtu = ro_pmtu->ro_nh->nh_mtu; return (ip6_calcmtu(ifp, dst, mtu, mtup, alwaysfragp, proto)); } @@ -2646,9 +2641,9 @@ if (optname == -1 || optname == IPV6_TCLASS) pktopt->ip6po_tclass = -1; if (optname == -1 || optname == IPV6_NEXTHOP) { - if (pktopt->ip6po_nextroute.ro_rt) { - RTFREE(pktopt->ip6po_nextroute.ro_rt); - pktopt->ip6po_nextroute.ro_rt = NULL; + if (pktopt->ip6po_nextroute.ro_nh) { + NH_FREE(pktopt->ip6po_nextroute.ro_nh); + pktopt->ip6po_nextroute.ro_nh = NULL; } if (pktopt->ip6po_nexthop) free(pktopt->ip6po_nexthop, M_IP6OPT); @@ -2668,9 +2663,9 @@ if (pktopt->ip6po_rhinfo.ip6po_rhi_rthdr) free(pktopt->ip6po_rhinfo.ip6po_rhi_rthdr, M_IP6OPT); pktopt->ip6po_rhinfo.ip6po_rhi_rthdr = NULL; - if (pktopt->ip6po_route.ro_rt) { - RTFREE(pktopt->ip6po_route.ro_rt); - pktopt->ip6po_route.ro_rt = NULL; + if (pktopt->ip6po_route.ro_nh) { + NH_FREE(pktopt->ip6po_route.ro_nh); + pktopt->ip6po_route.ro_nh = NULL; } } if (optname == -1 || optname == IPV6_DSTOPTS) { Index: sys/netinet6/ip6_var.h =================================================================== --- sys/netinet6/ip6_var.h +++ sys/netinet6/ip6_var.h @@ -416,7 +416,7 @@ uint32_t, struct ifnet *, struct in6_addr *, int *); int in6_selectroute(struct sockaddr_in6 *, struct ip6_pktopts *, struct ip6_moptions *, struct route_in6 *, struct ifnet **, - struct rtentry **, u_int); + struct nhop_object **, u_int, uint32_t); u_int32_t ip6_randomid(void); u_int32_t ip6_randomflowlabel(void); void in6_delayed_cksum(struct mbuf *m, uint32_t 
plen, u_short offset); Index: sys/netinet6/nd6.c =================================================================== --- sys/netinet6/nd6.c +++ sys/netinet6/nd6.c @@ -62,6 +62,7 @@ #include #include #include +#include #include #include @@ -136,7 +137,8 @@ static void nd6_llinfo_timer(void *); static void nd6_llinfo_settimer_locked(struct llentry *, long); static void clear_llinfo_pqueue(struct llentry *); -static void nd6_rtrequest(int, struct rtentry *, struct rt_addrinfo *); +static void nd6_rtrequest(int, struct rtentry *, struct nhop_object *, + struct rt_addrinfo *); static int nd6_resolve_slow(struct ifnet *, int, struct mbuf *, const struct sockaddr_in6 *, u_char *, uint32_t *, struct llentry **); static int nd6_need_cache(struct ifnet *); @@ -1526,14 +1528,17 @@ } static int -nd6_isdynrte(const struct rtentry *rt, void *xap) +nd6_isdynrte(const struct rtentry *rt, const struct nhop_object *nh, void *xap) { - if (rt->rt_flags == (RTF_UP | RTF_HOST | RTF_DYNAMIC)) + int rt_flags = rib_get_entry_rtflags(rt, nh); + + if (rt_flags == (RTF_UP | RTF_HOST | RTF_DYNAMIC)) return (1); return (0); } + /* * Remove the rtentry for the given llentry, * both of which were installed by a redirect. @@ -1544,6 +1549,7 @@ int fibnum; struct sockaddr_in6 sin6; struct rt_addrinfo info; + struct rib_cmd_info rc; lltable_fill_sa_entry(ln, (struct sockaddr *)&sin6); memset(&info, 0, sizeof(info)); @@ -1551,7 +1557,7 @@ info.rti_filter = nd6_isdynrte; for (fibnum = 0; fibnum < rt_numfibs; fibnum++) - rtrequest1_fib(RTM_DELETE, &info, NULL, fibnum); + rib_del_route(fibnum, &info, &rc); } /* @@ -1559,14 +1565,15 @@ * processing. 
*/ void -nd6_rtrequest(int req, struct rtentry *rt, struct rt_addrinfo *info) +nd6_rtrequest(int req, struct rtentry *rt, struct nhop_object *nh, + struct rt_addrinfo *info) { struct sockaddr_in6 *gateway; struct nd_defrouter *dr; struct ifnet *ifp; - gateway = (struct sockaddr_in6 *)rt->rt_gateway; - ifp = rt->rt_ifp; + gateway = &nh->gw6_sa; + ifp = nh->nh_ifp; switch (req) { case RTM_ADD: @@ -1578,14 +1585,13 @@ /* * Only indirect routes are interesting. */ - if ((rt->rt_flags & RTF_GATEWAY) == 0) + if ((nh->nh_flags & NHF_GATEWAY) == 0) return; /* * check for default route */ - if (IN6_ARE_ADDR_EQUAL(&in6addr_any, - &SIN6(rt_key(rt))->sin6_addr)) { - dr = defrouter_lookup(&gateway->sin6_addr, ifp); + if (nh->nh_flags & NHF_DEFAULT) { + dr = defrouter_lookup(&nh->gw6_sa.sin6_addr, ifp); if (dr != NULL) { dr->installed = 0; defrouter_rele(dr); Index: sys/netinet6/nd6_rtr.c =================================================================== --- sys/netinet6/nd6_rtr.c +++ sys/netinet6/nd6_rtr.c @@ -59,8 +59,7 @@ #include #include #include -#include -#include +#include #include #include @@ -603,14 +602,6 @@ m_freem(m); } -/* tell the change to user processes watching the routing socket. 
*/ -static void -nd6_rtmsg(int cmd, struct rtentry *rt) -{ - - rt_routemsg(cmd, rt, rt->rt_ifp, 0, rt->rt_fibnum); -} - /* PFXRTR */ static struct nd_pfxrouter * pfxrtr_lookup(struct nd_prefix *pr, struct nd_defrouter *dr) @@ -680,7 +671,8 @@ defrouter_addreq(struct nd_defrouter *new) { struct sockaddr_in6 def, mask, gate; - struct rtentry *newrt = NULL; + struct epoch_tracker et; + u_int fibnum; int error; bzero(&def, sizeof(def)); @@ -692,15 +684,25 @@ def.sin6_family = gate.sin6_family = AF_INET6; gate.sin6_addr = new->rtaddr; - error = in6_rtrequest(RTM_ADD, (struct sockaddr *)&def, - (struct sockaddr *)&gate, (struct sockaddr *)&mask, - RTF_GATEWAY, &newrt, new->ifp->if_fib); - if (newrt) { - nd6_rtmsg(RTM_ADD, newrt); /* tell user process */ - RTFREE(newrt); - } - if (error == 0) + struct rt_addrinfo info; + + bzero(&info, sizeof(info)); + info.rti_info[RTAX_DST] = (struct sockaddr *)&def; + info.rti_info[RTAX_NETMASK] = (struct sockaddr *)&mask; + info.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&gate; + info.rti_flags = RTF_GATEWAY; + + struct rib_cmd_info rc; + + NET_EPOCH_ENTER(et); + fibnum = new->ifp->if_fib; + error = rib_add_route(fibnum, &info, &rc); + + if (error == 0) { new->installed = 1; + rt_routemsg(RTM_ADD, rc.rt, rc.nh_new, fibnum); + } + NET_EPOCH_EXIT(et); } /* @@ -712,7 +714,9 @@ defrouter_delreq(struct nd_defrouter *dr) { struct sockaddr_in6 def, mask, gate; - struct rtentry *oldrt = NULL; + struct epoch_tracker et; + u_int fibnum; + int error; bzero(&def, sizeof(def)); bzero(&mask, sizeof(mask)); @@ -723,15 +727,25 @@ def.sin6_family = gate.sin6_family = AF_INET6; gate.sin6_addr = dr->rtaddr; - in6_rtrequest(RTM_DELETE, (struct sockaddr *)&def, - (struct sockaddr *)&gate, - (struct sockaddr *)&mask, RTF_GATEWAY, &oldrt, dr->ifp->if_fib); - if (oldrt) { - nd6_rtmsg(RTM_DELETE, oldrt); - RTFREE(oldrt); - } + struct rt_addrinfo info; - dr->installed = 0; + bzero(&info, sizeof(info)); + info.rti_info[RTAX_DST] = (struct sockaddr *)&def; + 
info.rti_info[RTAX_NETMASK] = (struct sockaddr *)&mask; + info.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&gate; + info.rti_flags = RTF_GATEWAY; + + struct rib_cmd_info rc; + + NET_EPOCH_ENTER(et); + fibnum = dr->ifp->if_fib; + error = rib_del_route(fibnum, &info, &rc); + + if (error == 0) { + dr->installed = 0; + rt_routemsg(RTM_DELETE, rc.rt, rc.nh_old, fibnum); + } + NET_EPOCH_EXIT(et); } static void @@ -2010,11 +2024,13 @@ nd6_prefix_onlink_rtrequest(struct nd_prefix *pr, struct ifaddr *ifa) { struct sockaddr_dl sdl; - struct rtentry *rt; struct sockaddr_in6 mask6; u_long rtflags; int error, a_failure, fibnum, maxfib; + struct rt_addrinfo info; + struct epoch_tracker et; + /* * in6_ifinit() sets nd6_rtrequest to ifa_rtrequest for all ifaddrs. * ifa->ifa_rtrequest = nd6_rtrequest; @@ -2030,6 +2046,14 @@ sdl.sdl_type = ifa->ifa_ifp->if_type; sdl.sdl_index = ifa->ifa_ifp->if_index; + bzero(&info, sizeof(struct rt_addrinfo)); + info.rti_info[RTAX_DST] = (struct sockaddr *)&pr->ndpr_prefix; + info.rti_info[RTAX_NETMASK] = (struct sockaddr *)&mask6; + info.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&sdl; + info.rti_flags = rtflags; + + struct rib_cmd_info rc; + if(V_rt_add_addr_allfibs) { fibnum = 0; maxfib = rt_numfibs; @@ -2040,17 +2064,10 @@ a_failure = 0; for (; fibnum < maxfib; fibnum++) { - rt = NULL; - error = in6_rtrequest(RTM_ADD, - (struct sockaddr *)&pr->ndpr_prefix, (struct sockaddr *)&sdl, - (struct sockaddr *)&mask6, rtflags, &rt, fibnum); + NET_EPOCH_ENTER(et); + error = rib_add_route(fibnum, &info, &rc); if (error == 0) { - KASSERT(rt != NULL, ("%s: in6_rtrequest return no " - "error(%d) but rt is NULL, pr=%p, ifa=%p", __func__, - error, pr, ifa)); - RT_LOCK(rt); - nd6_rtmsg(RTM_ADD, rt); - RT_UNLOCK(rt); + rt_routemsg(RTM_ADD, rc.rt, rc.nh_new, fibnum); pr->ndpr_stateflags |= NDPRF_ONLINK; } else { char ip6buf[INET6_ADDRSTRLEN]; @@ -2071,12 +2088,7 @@ /* Save last error to return, see rtinit(). 
*/ a_failure = error; } - - if (rt != NULL) { - RT_LOCK(rt); - RT_REMREF(rt); - RT_UNLOCK(rt); - } + NET_EPOCH_EXIT(et); } /* Return the last error we got. */ @@ -2175,7 +2187,6 @@ struct ifnet *ifp = pr->ndpr_ifp; struct nd_prefix *opr; struct sockaddr_in6 sa6, mask6; - struct rtentry *rt; char ip6buf[INET6_ADDRSTRLEN]; uint64_t genid; int fibnum, maxfib, a_failure; @@ -2204,22 +2215,27 @@ maxfib = fibnum + 1; } + struct rt_addrinfo info; + + bzero(&info, sizeof(info)); + info.rti_info[RTAX_DST] = (struct sockaddr *)&sa6; + info.rti_info[RTAX_NETMASK] = (struct sockaddr *)&mask6; + + struct rib_cmd_info rc; + struct epoch_tracker et; + a_failure = 0; for (; fibnum < maxfib; fibnum++) { - rt = NULL; - error = in6_rtrequest(RTM_DELETE, (struct sockaddr *)&sa6, NULL, - (struct sockaddr *)&mask6, 0, &rt, fibnum); + NET_EPOCH_ENTER(et); + error = rib_del_route(fibnum, &info, &rc); if (error == 0) { /* report the route deletion to the routing socket. */ - if (rt != NULL) - nd6_rtmsg(RTM_DELETE, rt); + rt_routemsg(RTM_DELETE, rc.rt, rc.nh_old, fibnum); } else { /* Save last error to return, see rtinit(). 
*/ a_failure = error; } - if (rt != NULL) { - RTFREE(rt); - } + NET_EPOCH_EXIT(et); } error = a_failure; a_failure = 1; @@ -2406,16 +2422,21 @@ return (0); } +struct rt6_args { + struct ifnet *ifp; + struct in6_addr *gateway; +}; + static int -rt6_deleteroute(const struct rtentry *rt, void *arg) +rt6_deleteroute(const struct rtentry *rt, const struct nhop_object *nh, void *arg) { -#define SIN6(s) ((struct sockaddr_in6 *)s) - struct in6_addr *gate = (struct in6_addr *)arg; + struct rt6_args *args = (struct rt6_args *)arg; + int rt_flags = rib_get_entry_rtflags(rt, nh); - if (rt->rt_gateway == NULL || rt->rt_gateway->sa_family != AF_INET6) + if (nh->gw6_sa.sin6_family != AF_INET6) return (0); - if (!IN6_ARE_ADDR_EQUAL(gate, &SIN6(rt->rt_gateway)->sin6_addr)) { + if (!IN6_ARE_ADDR_EQUAL(args->gateway, &nh->gw6_sa.sin6_addr)) { return (0); } @@ -2424,18 +2445,17 @@ * XXX: this seems to be a bit ad-hoc. Should we consider the * 'cloned' bit instead? */ - if ((rt->rt_flags & RTF_STATIC) != 0) + if ((rt_flags & RTF_STATIC) != 0) return (0); /* * We delete only host route. This means, in particular, we don't * delete default route. */ - if ((rt->rt_flags & RTF_HOST) == 0) + if ((rt_flags & RTF_HOST) == 0) return (0); return (1); -#undef SIN6 } /* @@ -2446,13 +2466,17 @@ void rt6_flush(struct in6_addr *gateway, struct ifnet *ifp) { + struct rt6_args args; /* We'll care only link-local addresses */ if (!IN6_IS_ADDR_LINKLOCAL(gateway)) return; + args.ifp = ifp; + args.gateway = gateway; + /* XXX Do we really need to walk any but the default FIB? 
*/ - rt_foreach_fib_walk_del(AF_INET6, rt6_deleteroute, (void *)gateway); + rt_foreach_fib_walk_del(AF_INET6, rt6_deleteroute, (void *)&args); } int Index: sys/netinet6/raw_ip6.c =================================================================== --- sys/netinet6/raw_ip6.c +++ sys/netinet6/raw_ip6.c @@ -66,6 +66,7 @@ #include "opt_ipsec.h" #include "opt_inet6.h" +#include "opt_route_mpath.h" #include #include @@ -99,6 +100,7 @@ #include #include #include +#include #include #include #include @@ -404,6 +406,9 @@ int use_defzone = 0; int hlim = 0; struct in6_addr in6a; +#ifdef ROUTE_MPATH + uint32_t hash_val, hash_type; +#endif va_list ap; va_start(ap, so); @@ -462,6 +467,15 @@ } ip6 = mtod(m, struct ip6_hdr *); +#ifdef ROUTE_MPATH + if (V_fib_hash_outbound) { + hash_val = fib6_calc_software_hash(&inp->in6p_laddr, + &dstsock->sin6_addr, 0, 0, so->so_proto->pr_protocol, + &hash_type); + inp->inp_flowid = hash_val; + inp->inp_flowtype = hash_type; + } +#endif /* * Source address selection. */ Index: sys/netinet6/scope6.c =================================================================== --- sys/netinet6/scope6.c +++ sys/netinet6/scope6.c @@ -466,6 +466,28 @@ } /* + * Returns scope zone id for the unicast address @in6. + * + * Returns 0 for global unicast and loopback addresses. + * Returns interface index for the link-local addresses. + */ +uint32_t +in6_get_unicast_scopeid(const struct in6_addr *in6, const struct ifnet *ifp) +{ + + if (IN6_IS_SCOPE_LINKLOCAL(in6)) + return (ifp->if_index); + return (0); +} + +void +in6_set_unicast_scopeid(struct in6_addr *in6, uint32_t scopeid) +{ + + in6->s6_addr16[1] = htons(scopeid & 0xffff); +} + +/* * Return pointer to ifnet structure, corresponding to the zone id of * link-local scope. 
*/ Index: sys/netinet6/scope6_var.h =================================================================== --- sys/netinet6/scope6_var.h +++ sys/netinet6/scope6_var.h @@ -67,6 +67,9 @@ uint32_t in6_getscopezone(const struct ifnet *, int); void in6_splitscope(const struct in6_addr *, struct in6_addr *, uint32_t *); struct ifnet* in6_getlinkifnet(uint32_t); +uint32_t in6_get_unicast_scopeid(const struct in6_addr *, const struct ifnet *); +void in6_set_unicast_scopeid(struct in6_addr *, uint32_t); + #endif /* _KERNEL */ #endif /* _NETINET6_SCOPE6_VAR_H_ */ Index: sys/netinet6/udp6_usrreq.c =================================================================== --- sys/netinet6/udp6_usrreq.c +++ sys/netinet6/udp6_usrreq.c @@ -1048,6 +1048,7 @@ static int udp6_attach(struct socket *so, int proto, struct thread *td) { + static uint32_t udp_flowid; struct inpcb *inp; struct inpcbinfo *pcbinfo; int error; @@ -1071,6 +1072,8 @@ inp->inp_vflag |= INP_IPV6; if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) inp->inp_vflag |= INP_IPV4; + inp->inp_flowid = atomic_fetchadd_int(&udp_flowid, 1); + inp->inp_flowtype = M_HASHTYPE_OPAQUE; inp->in6p_hops = -1; /* use kernel default */ inp->in6p_cksum = -1; /* just to be sure */ /* Index: sys/netpfil/ipfw/ip_fw_table_algo.c =================================================================== --- sys/netpfil/ipfw/ip_fw_table_algo.c +++ sys/netpfil/ipfw/ip_fw_table_algo.c @@ -51,7 +51,6 @@ #include /* ip_fw.h requires IFNAMSIZ */ #include #include -#include #include #include @@ -3918,10 +3917,10 @@ tinfo->flags = IPFW_TATFLAGS_AFDATA; tinfo->taclass4 = IPFW_TACLASS_RADIX; tinfo->count4 = 0; - tinfo->itemsize4 = sizeof(struct rtentry); + tinfo->itemsize4 = 0; tinfo->taclass6 = IPFW_TACLASS_RADIX; tinfo->count6 = 0; - tinfo->itemsize6 = sizeof(struct rtentry); + tinfo->itemsize6 = 0; } static int @@ -3943,11 +3942,17 @@ ta_dump_kfib_tentry(void *ta_state, struct table_info *ti, void *e, ipfw_obj_tentry *tent) { - struct rtentry *rte; + struct rtentry 
*rt; + struct sockaddr_in6 dst, mask; - rte = (struct rtentry *)e; + rt = (struct rtentry *)e; - return ta_dump_kfib_tentry_int(rt_key(rte), rt_mask(rte), tent); + dst.sin6_len = sizeof(struct sockaddr_in6); + mask.sin6_len = sizeof(struct sockaddr_in6); + + rib_get_entry_prefix(rt, (struct sockaddr *)&dst, (struct sockaddr *)&mask, NULL); + + return ta_dump_kfib_tentry_int((struct sockaddr *)&dst, (struct sockaddr *)&mask, tent); } static int @@ -4047,23 +4052,9 @@ ta_foreach_kfib(void *ta_state, struct table_info *ti, ta_foreach_f *f, void *arg) { - RIB_RLOCK_TRACKER; - struct rib_head *rh; - int error; - rh = rt_tables_get_rnh(ti->data, AF_INET); - if (rh != NULL) { - RIB_RLOCK(rh); - error = rh->rnh_walktree(&rh->head, (walktree_f_t *)f, arg); - RIB_RUNLOCK(rh); - } - - rh = rt_tables_get_rnh(ti->data, AF_INET6); - if (rh != NULL) { - RIB_RLOCK(rh); - error = rh->rnh_walktree(&rh->head, (walktree_f_t *)f, arg); - RIB_RUNLOCK(rh); - } + rib_walk(AF_INET, ti->data, (rt_walktree_f_t *)f, arg); + rib_walk(AF_INET6, ti->data, (rt_walktree_f_t *)f, arg); } struct table_algo addr_kfib = { Index: sys/netpfil/pf/pf.c =================================================================== --- sys/netpfil/pf/pf.c +++ sys/netpfil/pf/pf.c @@ -69,7 +69,6 @@ #include #include #include -#include #include #include @@ -5338,122 +5337,12 @@ return (p); } -#ifdef RADIX_MPATH -static int -pf_routable_oldmpath(struct pf_addr *addr, sa_family_t af, struct pfi_kif *kif, - int rtableid) -{ - struct radix_node_head *rnh; - struct sockaddr_in *dst; - int ret = 1; - int check_mpath; -#ifdef INET6 - struct sockaddr_in6 *dst6; - struct route_in6 ro; -#else - struct route ro; -#endif - struct radix_node *rn; - struct rtentry *rt; - struct ifnet *ifp; - - check_mpath = 0; - /* XXX: stick to table 0 for now */ - rnh = rt_tables_get_rnh(0, af); - if (rnh != NULL && rn_mpath_capable(rnh)) - check_mpath = 1; - bzero(&ro, sizeof(ro)); - switch (af) { - case AF_INET: - dst = satosin(&ro.ro_dst); - 
dst->sin_family = AF_INET; - dst->sin_len = sizeof(*dst); - dst->sin_addr = addr->v4; - break; -#ifdef INET6 - case AF_INET6: - /* - * Skip check for addresses with embedded interface scope, - * as they would always match anyway. - */ - if (IN6_IS_SCOPE_EMBED(&addr->v6)) - goto out; - dst6 = (struct sockaddr_in6 *)&ro.ro_dst; - dst6->sin6_family = AF_INET6; - dst6->sin6_len = sizeof(*dst6); - dst6->sin6_addr = addr->v6; - break; -#endif /* INET6 */ - default: - return (0); - } - - /* Skip checks for ipsec interfaces */ - if (kif != NULL && kif->pfik_ifp->if_type == IFT_ENC) - goto out; - - switch (af) { -#ifdef INET6 - case AF_INET6: - in6_rtalloc_ign(&ro, 0, rtableid); - break; -#endif -#ifdef INET - case AF_INET: - in_rtalloc_ign((struct route *)&ro, 0, rtableid); - break; -#endif - } - - if (ro.ro_rt != NULL) { - /* No interface given, this is a no-route check */ - if (kif == NULL) - goto out; - - if (kif->pfik_ifp == NULL) { - ret = 0; - goto out; - } - - /* Perform uRPF check if passed input interface */ - ret = 0; - rn = (struct radix_node *)ro.ro_rt; - do { - rt = (struct rtentry *)rn; - ifp = rt->rt_ifp; - - if (kif->pfik_ifp == ifp) - ret = 1; - rn = rn_mpath_next(rn); - } while (check_mpath == 1 && rn != NULL && ret == 0); - } else - ret = 0; -out: - if (ro.ro_rt != NULL) - RTFREE(ro.ro_rt); - return (ret); -} -#endif - int pf_routable(struct pf_addr *addr, sa_family_t af, struct pfi_kif *kif, int rtableid) { -#ifdef INET - struct nhop4_basic nh4; -#endif -#ifdef INET6 - struct nhop6_basic nh6; -#endif struct ifnet *ifp; -#ifdef RADIX_MPATH - struct radix_node_head *rnh; - /* XXX: stick to table 0 for now */ - rnh = rt_tables_get_rnh(0, af); - if (rnh != NULL && rn_mpath_capable(rnh)) - return (pf_routable_oldmpath(addr, af, kif, rtableid)); -#endif /* * Skip check for addresses with embedded interface scope, * as they would always match anyway. 
@@ -5468,35 +5357,21 @@ if (kif != NULL && kif->pfik_ifp->if_type == IFT_ENC) return (1); - ifp = NULL; + ifp = (kif != NULL) ? kif->pfik_ifp : NULL; switch (af) { #ifdef INET6 case AF_INET6: - if (fib6_lookup_nh_basic(rtableid, &addr->v6, 0, 0, 0, &nh6)!=0) - return (0); - ifp = nh6.nh_ifp; - break; + return (fib6_lookup_urpf(rtableid, &addr->v6, 0, NHR_NONE, + ifp)); #endif #ifdef INET case AF_INET: - if (fib4_lookup_nh_basic(rtableid, addr->v4, 0, 0, &nh4) != 0) - return (0); - ifp = nh4.nh_ifp; - break; + return (fib4_lookup_urpf(rtableid, addr->v4, 0, NHR_NONE, + ifp)); #endif } - /* No interface given, this is a no-route check */ - if (kif == NULL) - return (1); - - if (kif->pfik_ifp == NULL) - return (0); - - /* Perform uRPF check if passed input interface */ - if (kif->pfik_ifp == ifp) - return (1); return (0); } Index: sys/nfs/bootp_subr.c =================================================================== --- sys/nfs/bootp_subr.c +++ sys/nfs/bootp_subr.c @@ -347,6 +347,7 @@ bootpboot_p_sa(rt_key(rt), rt_mask(rt)); printf(" "); + /* XXX: fix this */ bootpboot_p_sa(rt->rt_gateway, NULL); printf(" "); printf("flags %x", (unsigned short) rt->rt_flags); @@ -1082,11 +1083,12 @@ clear_sinaddr(&defdst); clear_sinaddr(&defmask); - error = rtrequest_fib(RTM_ADD, (struct sockaddr *)&defdst, - (struct sockaddr *) &ifctx->gw, (struct sockaddr *)&defmask, - (RTF_UP | RTF_GATEWAY | RTF_STATIC), NULL, RT_DEFAULT_FIB); + error = rib_request_simple(RIB_ADD, RT_DEFAULT_FIB, + (struct sockaddr *)&defdst, (struct sockaddr *)&defmask, + (struct sockaddr *) &ifctx->gw, RTF_UP | RTF_GATEWAY | RTF_STATIC); + if (error != 0) { - printf("%s: RTM_ADD, error=%d\n", __func__, error); + printf("%s: RIB_ADD, error=%d\n", __func__, error); } } @@ -1103,11 +1105,11 @@ clear_sinaddr(&defdst); clear_sinaddr(&defmask); - error = rtrequest_fib(RTM_DELETE, (struct sockaddr *)&defdst, - (struct sockaddr *) &ifctx->gw, (struct sockaddr *)&defmask, - (RTF_UP | RTF_GATEWAY | RTF_STATIC), NULL, 
RT_DEFAULT_FIB); + error = rib_request_simple(RIB_DEL, RT_DEFAULT_FIB, + (struct sockaddr *)&defdst, (struct sockaddr *)&defmask, + (struct sockaddr *) &ifctx->gw, 0); if (error != 0) { - printf("%s: RTM_DELETE, error=%d\n", __func__, error); + printf("%s: RIB_DEL, error=%d\n", __func__, error); } } Index: sys/ofed/drivers/infiniband/core/ib_addr.c =================================================================== --- sys/ofed/drivers/infiniband/core/ib_addr.c +++ sys/ofed/drivers/infiniband/core/ib_addr.c @@ -44,14 +44,17 @@ #include #include #include +#include #include #include #include +#include #include #include #include #include +#include #include "core_priv.h" @@ -275,7 +278,7 @@ struct sockaddr_in dst_tmp = *dst_in; in_port_t src_port; struct sockaddr *saddr = NULL; - struct rtentry *rte; + struct nhop_object *nh; struct ifnet *ifp; int error; int type; @@ -293,8 +296,7 @@ type |= ADDR_DST_ANY; /* - * Make sure the socket address length field - * is set, else rtalloc1() will fail. + * Make sure the socket address length field is set. */ dst_tmp.sin_len = sizeof(dst_tmp); @@ -303,16 +305,12 @@ case ADDR_VALID: case ADDR_SRC_ANY: /* regular destination route lookup */ - rte = rtalloc1((struct sockaddr *)&dst_tmp, 1, 0); - if (rte == NULL) { + nh = fib4_lookup_nh_ptr(RT_DEFAULT_FIB, dst_tmp.sin_addr, + 0, NHR_NONE, 0); + if (nh == NULL) { error = EHOSTUNREACH; goto done; - } else if (rte->rt_ifp == NULL || RT_LINK_IS_UP(rte->rt_ifp) == 0) { - RTFREE_LOCKED(rte); - error = EHOSTUNREACH; - goto done; } - RT_UNLOCK(rte); break; default: error = ENETUNREACH; @@ -332,14 +330,14 @@ /* check source interface */ if (ifp == NULL) { error = ENETUNREACH; - goto error_rt_free; + goto done; } else if (ifp->if_flags & IFF_LOOPBACK) { /* * Source address cannot be a loopback device. 
*/ error = EHOSTUNREACH; goto error_put_ifp; - } else if (rte->rt_ifp->if_flags & IFF_LOOPBACK) { + } else if (nh->nh_ifp->if_flags & IFF_LOOPBACK) { if (memcmp(&src_in->sin_addr, &dst_in->sin_addr, sizeof(src_in->sin_addr))) { /* @@ -352,9 +350,9 @@ } /* get destination network interface from route */ dev_put(ifp); - ifp = rte->rt_ifp; + ifp = nh->nh_ifp; dev_hold(ifp); - } else if (ifp != rte->rt_ifp) { + } else if (ifp != nh->nh_ifp) { /* * Source and destination interfaces are * different. @@ -365,13 +363,13 @@ break; case ADDR_SRC_ANY: /* check for loopback device */ - if (rte->rt_ifp->if_flags & IFF_LOOPBACK) + if (nh->nh_ifp->if_flags & IFF_LOOPBACK) saddr = (struct sockaddr *)&dst_tmp; else - saddr = rte->rt_ifa->ifa_addr; + saddr = nh->nh_ifa->ifa_addr; /* get destination network interface from route */ - ifp = rte->rt_ifp; + ifp = nh->nh_ifp; dev_hold(ifp); break; default: @@ -386,7 +384,7 @@ ifp->if_addrlen, MAX_ADDR_LEN); error = 0; } else if (IN_MULTICAST(ntohl(dst_tmp.sin_addr.s_addr))) { - bool is_gw = (rte->rt_flags & RTF_GATEWAY) != 0; + bool is_gw = (nh->nh_flags & NHF_GATEWAY) != 0; error = addr_resolve_multi(edst, ifp, (struct sockaddr *)&dst_tmp); if (error != 0) goto error_put_ifp; @@ -396,10 +394,10 @@ memset(edst, 0, MAX_ADDR_LEN); error = 0; } else { - bool is_gw = (rte->rt_flags & RTF_GATEWAY) != 0; + bool is_gw = (nh->nh_flags & NHF_GATEWAY) != 0; memset(edst, 0, MAX_ADDR_LEN); error = arpresolve(ifp, is_gw, NULL, is_gw ? 
- rte->rt_gateway : (const struct sockaddr *)&dst_tmp, + &nh->gw_sa : (const struct sockaddr *)&dst_tmp, edst, NULL, NULL); if (error != 0) goto error_put_ifp; @@ -416,17 +414,12 @@ src_in->sin_port = src_port; /* preserve port number */ } - if (rte != NULL) - RTFREE(rte); - *ifpp = ifp; goto done; error_put_ifp: dev_put(ifp); -error_rt_free: - RTFREE(rte); done: CURVNET_RESTORE(); @@ -460,7 +453,7 @@ struct sockaddr_in6 dst_tmp = *dst_in; in_port_t src_port; struct sockaddr *saddr = NULL; - struct rtentry *rte; + struct nhop_object *nh; struct ifnet *ifp; int error; int type; @@ -478,8 +471,7 @@ type |= ADDR_DST_ANY; /* - * Make sure the socket address length field - * is set, else rtalloc1() will fail. + * Make sure the socket address length field is set. */ dst_tmp.sin6_len = sizeof(dst_tmp); @@ -502,16 +494,12 @@ /* FALLTHROUGH */ case ADDR_SRC_ANY: /* regular destination route lookup */ - rte = rtalloc1((struct sockaddr *)&dst_tmp, 1, 0); - if (rte == NULL) { + nh = fib6_lookup_nh_ptr(RT_DEFAULT_FIB, &dst_in->sin6_addr, + addr->bound_dev_if, NHR_NONE, 0); + if (nh == NULL) { error = EHOSTUNREACH; goto done; - } else if (rte->rt_ifp == NULL || RT_LINK_IS_UP(rte->rt_ifp) == 0) { - RTFREE_LOCKED(rte); - error = EHOSTUNREACH; - goto done; } - RT_UNLOCK(rte); break; default: error = ENETUNREACH; @@ -531,14 +519,14 @@ /* check source interface */ if (ifp == NULL) { error = ENETUNREACH; - goto error_rt_free; + goto done; } else if (ifp->if_flags & IFF_LOOPBACK) { /* * Source address cannot be a loopback device. 
*/ error = EHOSTUNREACH; goto error_put_ifp; - } else if (rte->rt_ifp->if_flags & IFF_LOOPBACK) { + } else if (nh->nh_ifp->if_flags & IFF_LOOPBACK) { if (memcmp(&src_in->sin6_addr, &dst_in->sin6_addr, sizeof(src_in->sin6_addr))) { /* @@ -551,9 +539,9 @@ } /* get destination network interface from route */ dev_put(ifp); - ifp = rte->rt_ifp; + ifp = nh->nh_ifp; dev_hold(ifp); - } else if (ifp != rte->rt_ifp) { + } else if (ifp != nh->nh_ifp) { /* * Source and destination interfaces are * different. @@ -564,13 +552,13 @@ break; case ADDR_SRC_ANY: /* check for loopback device */ - if (rte->rt_ifp->if_flags & IFF_LOOPBACK) + if (nh->nh_ifp->if_flags & IFF_LOOPBACK) saddr = (struct sockaddr *)&dst_tmp; else - saddr = rte->rt_ifa->ifa_addr; + saddr = nh->nh_ifa->ifa_addr; /* get destination network interface from route */ - ifp = rte->rt_ifp; + ifp = nh->nh_ifp; dev_hold(ifp); break; default: @@ -581,21 +569,21 @@ * Step 3 - resolve destination MAC address */ if (IN6_IS_ADDR_MULTICAST(&dst_tmp.sin6_addr)) { - bool is_gw = (rte->rt_flags & RTF_GATEWAY) != 0; + bool is_gw = (nh->nh_flags & NHF_GATEWAY) != 0; error = addr_resolve_multi(edst, ifp, (struct sockaddr *)&dst_tmp); if (error != 0) goto error_put_ifp; else if (is_gw) addr->network = RDMA_NETWORK_IPV6; - } else if (rte->rt_ifp->if_flags & IFF_LOOPBACK) { + } else if (nh->nh_ifp->if_flags & IFF_LOOPBACK) { memset(edst, 0, MAX_ADDR_LEN); error = 0; } else { - bool is_gw = (rte->rt_flags & RTF_GATEWAY) != 0; + bool is_gw = (nh->nh_flags & NHF_GATEWAY) != 0; memset(edst, 0, MAX_ADDR_LEN); error = nd6_resolve(ifp, is_gw, NULL, is_gw ? 
- rte->rt_gateway : (const struct sockaddr *)&dst_tmp, + &nh->gw_sa : (const struct sockaddr *)&dst_tmp, edst, NULL, NULL); if (error != 0) goto error_put_ifp; @@ -612,17 +600,12 @@ src_in->sin6_port = src_port; /* preserve port number */ } - if (rte != NULL) - RTFREE(rte); - *ifpp = ifp; goto done; error_put_ifp: dev_put(ifp); -error_rt_free: - RTFREE(rte); done: CURVNET_RESTORE(); Index: sys/ofed/drivers/infiniband/core/ib_cma.c =================================================================== --- sys/ofed/drivers/infiniband/core/ib_cma.c +++ sys/ofed/drivers/infiniband/core/ib_cma.c @@ -50,10 +50,14 @@ #include #include #include +#include #include #include +#include + +#include #include #include @@ -1355,7 +1359,7 @@ __be32 daddr = dst_addr->sin_addr.s_addr, saddr = src_addr->sin_addr.s_addr; struct net_device *dst_dev; - struct rtentry *rte; + struct nhop_object *nh; bool ret; if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) || @@ -1385,13 +1389,12 @@ src_tmp.sin_len = sizeof(src_tmp); CURVNET_SET(net_dev->if_vnet); - rte = rtalloc1((struct sockaddr *)&src_tmp, 1, 0); - if (rte != NULL) { - ret = (rte->rt_ifp == net_dev); - RTFREE_LOCKED(rte); - } else { + nh = fib4_lookup_nh_ptr(RT_DEFAULT_FIB, src_addr->sin_addr, + 0, NHR_NONE, 0); + if (nh != NULL) + ret = (nh->nh_ifp == net_dev); + else ret = false; - } CURVNET_RESTORE(); return ret; #else @@ -1407,7 +1410,7 @@ struct sockaddr_in6 src_tmp = *src_addr; struct sockaddr_in6 dst_tmp = *dst_addr; struct net_device *dst_dev; - struct rtentry *rte; + struct nhop_object *nh; bool ret; dst_dev = ip6_dev_find(net_dev->if_vnet, dst_tmp.sin6_addr, @@ -1446,13 +1449,12 @@ ret = true; } else { /* non-loopback case */ - rte = rtalloc1((struct sockaddr *)&src_tmp, 1, 0); - if (rte != NULL) { - ret = (rte->rt_ifp == net_dev); - RTFREE_LOCKED(rte); - } else { + nh = fib6_lookup_nh_ptr(RT_DEFAULT_FIB, &src_addr->sin6_addr, + net_dev->if_index, NHR_NONE, 0); + if (nh != NULL) + ret = (nh->nh_ifp == net_dev); + else ret 
= false; - } } CURVNET_RESTORE(); return ret; @@ -1512,6 +1514,7 @@ *src_addr = (struct sockaddr *)&src_addr_storage; struct net_device *net_dev; const union ib_gid *gid = req->has_gid ? &req->local_gid : NULL; + struct epoch_tracker et; int err; err = cma_save_ip_info(listen_addr, src_addr, ib_event, @@ -1530,10 +1533,13 @@ if (!net_dev) return ERR_PTR(-ENODEV); + NET_EPOCH_ENTER(et); if (!validate_net_dev(net_dev, listen_addr, src_addr)) { + NET_EPOCH_EXIT(et); dev_put(net_dev); return ERR_PTR(-EHOSTUNREACH); } + NET_EPOCH_EXIT(et); return net_dev; } Index: sys/sys/socket.h =================================================================== --- sys/sys/socket.h +++ sys/sys/socket.h @@ -416,6 +416,8 @@ #define NET_RT_IFMALIST 4 /* return multicast address list */ #define NET_RT_IFLISTL 5 /* Survey interface list, using 'l'en * versions of msghdr structs. */ +#define NET_RT_NHOP 6 /* dump routing nexthops */ +#define NET_RT_NHGROUPS 7 /* dump routing multipath groups */ #endif /* __BSD_VISIBLE */ /* Index: sys/tests/routing/module.h =================================================================== --- /dev/null +++ sys/tests/routing/module.h @@ -0,0 +1,39 @@ +#ifndef SYS_TESTS_ROUTING_MODULE_H_ +#define SYS_TESTS_ROUTING_MODULE_H_ + +typedef int (ktest_f_t)(void); + +struct ktest_item { + char *name; + ktest_f_t *fn; + char *descr; +}; +#define DECLARE_KTEST(_f) {#_f, &(_f), ""} +#define DECLARE_KTEST_DESC(_f, _d) {#_f, &(_f), _d} + +struct ktests { + char *name; + char *descr; + struct ktest_item *tests; + int num_tests; +}; + +#define ARRAYLEN(_a) (sizeof(_a) / sizeof(_a[0])) +#define DEFINE_KTESTS(_name, _descr, _tests) \ + struct ktests kt_##_name = {#_name, #_descr, _tests, ARRAYLEN(_tests)} + +#define DECLARE_KTESTS(_name) extern struct ktests kt_##_name + +#define TPRINTF(_arg, ...) printf("KTEST:%s:%d " _arg "\n", __func__, __LINE__, ##__VA_ARGS__) + +#define TASSERT(_cond, _fmt, ...)
do { \ + if (!(_cond)) { \ + TPRINTF(_fmt, ##__VA_ARGS__); \ + error = EINVAL; \ + } \ +} while (0); + + +DECLARE_KTESTS(route_ctl); + +#endif Index: sys/tests/routing/module.c =================================================================== --- /dev/null +++ sys/tests/routing/module.c @@ -0,0 +1,163 @@ +/*- + * Copyright (c) 2019, Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Neither the name of Alexander V. Chernikov nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "tests/routing/module.h" + +static int inited; +#define ET_EXITING 0x1 +static volatile int state_flags; + +struct sysctl_ctx_list ctx; +static struct ktest_item *ki = NULL; +static int ki_size = 0, ki_count = 0; + +static int +invoke_test_handler(SYSCTL_HANDLER_ARGS) +{ + int error, v; + struct ktest_item *item; + + if (inited == 0) + return (ENOENT); + + v = 0; + error = sysctl_handle_int(oidp, &v, 0, req); + if (error) + return (error); + if (req->newptr == NULL) + return (error); + if (v == 0) + return (0); + + item = &ki[arg2]; + printf("running item %d: %s\n", (int)arg2, item->name); + error = item->fn(); + printf("done running item %d: %s - ret %d\n", (int)arg2, item->name, error); + + return (error); +} + +SYSCTL_NODE(_kern, OID_AUTO, test, CTLFLAG_RW, 0, "Test framework"); +SYSCTL_NODE(_kern_test, OID_AUTO, routing, CTLFLAG_RW, 0, "Routing test framework"); + +static int +test_init_one(struct ktests *kt) +{ + struct sysctl_oid *tree; + + if (kt->num_tests + ki_count > ki_size) { + size_t new_size = roundup2(kt->num_tests + ki_count, 32); + void *new_ptr; + new_ptr = malloc(new_size * sizeof(struct ktest_item), M_TEMP, M_WAITOK | M_ZERO); + if (ki_count > 0) + memcpy(new_ptr, ki, ki_count * sizeof(struct ktest_item)); + free(ki, M_TEMP); + ki = new_ptr; + ki_size = new_size; + } + + tree = SYSCTL_ADD_NODE(&ctx, SYSCTL_STATIC_CHILDREN(_kern_test_routing), + OID_AUTO, kt->name, CTLFLAG_RW, 0, "routing tests"); + + memcpy(&ki[ki_count], kt->tests, kt->num_tests * sizeof(struct ktest_item)); + + for (int i = 0; i < kt->num_tests; i++) { + SYSCTL_ADD_PROC(&ctx, SYSCTL_CHILDREN(tree), OID_AUTO, + kt->tests[i].name, (CTLTYPE_INT | CTLFLAG_RW), NULL, ki_count + i, + invoke_test_handler, "I", kt->tests[i].descr); + } + ki_count += kt->num_tests; + + 
return (0); +} + +static int +test_modinit(void) +{ + sysctl_ctx_init(&ctx); + + test_init_one(&kt_route_ctl); + + inited = 1; + return (0); +} + + +static int +routing_test_module_event_handler(module_t mod, int what, void *arg __unused) +{ + int err; + + switch (what) { + case MOD_LOAD: + if ((err = test_modinit()) != 0) + return (err); + break; + case MOD_UNLOAD: + //mtx_lock(&state_mtx); + state_flags = ET_EXITING; + sysctl_ctx_free(&ctx); + free(ki, M_TEMP); + //wakeup(&state_mtx); + //mtx_unlock(&state_mtx); + /* yes --- gross */ + pause("epoch unload", 2 * hz); + break; + default: + return (EOPNOTSUPP); + } + + return (0); +} + +static moduledata_t routing_test_moduledata = { + "routing_test", + routing_test_module_event_handler, + NULL +}; + +MODULE_VERSION(routing_test, 1); +DECLARE_MODULE(routing_test, routing_test_moduledata, SI_SUB_PSEUDO, SI_ORDER_ANY); Index: sys/tests/routing/test_route_ctl.h =================================================================== --- /dev/null +++ sys/tests/routing/test_route_ctl.h @@ -0,0 +1,73 @@ +#ifndef _SYS_TESTS_ROUTING_TEST_ROUTE_CTL_H_ +#define _SYS_TESTS_ROUTING_TEST_ROUTE_CTL_H_ + +int create_rte_from_info_wrapper(struct rib_head *rnh, struct rt_addrinfo *info, + struct rtentry **ret_rt); + +int create_nhop_from_info_wrapper(struct rib_head *rnh, struct rt_addrinfo *info, + struct nhop_object **nh_ret); + +int create_rt_nh_pair_from_info_wrapper(struct rib_head *rnh, + struct rt_addrinfo *info, struct rtentry **ret_rt); + +int add_route_wrapper(struct rib_head *rnh, struct rtentry *rt_new, + struct rt_addrinfo *info, struct rib_cmd_info *rc); + +int del_route_one_wrapper(struct rib_head *rnh, struct rtentry *rt, + struct rt_addrinfo *info); + +int change_route_wrapper(struct rib_head *rnh, struct rt_addrinfo *info, + struct rib_cmd_info *rc); + +#ifndef _TEST_CALLER +int +create_nhop_from_info_wrapper(struct rib_head *rnh, struct rt_addrinfo *info, + struct nhop_object **nh_ret) +{ + + return 
(create_nhop_from_info(rnh, info, nh_ret)); +} + +int +create_rte_from_info_wrapper(struct rib_head *rnh, struct rt_addrinfo *info, + struct rtentry **ret_rt) +{ + + return (create_rte_from_info(rnh, info, ret_rt)); +} + +int +create_rt_nh_pair_from_info_wrapper(struct rib_head *rnh, + struct rt_addrinfo *info, struct rtentry **ret_rt) +{ + + return (create_rt_nh_pair_from_info(rnh, info, ret_rt)); +} + +int +add_route_wrapper(struct rib_head *rnh, struct rtentry *rt_new, + struct rt_addrinfo *info, struct rib_cmd_info *rc) +{ + + return (add_route(rnh, rt_new, info, rc)); +} + +int +del_route_one_wrapper(struct rib_head *rnh, struct rtentry *rt, + struct rt_addrinfo *info) +{ + + return (del_route_one(rnh, rt, info)); +} + +int +change_route_wrapper(struct rib_head *rnh, struct rt_addrinfo *info, + struct rib_cmd_info *rc) +{ + + return (change_route(rnh, info, rc)); +} +#endif + +#endif + Index: sys/tests/routing/test_route_ctl.c =================================================================== --- /dev/null +++ sys/tests/routing/test_route_ctl.c @@ -0,0 +1,451 @@ +/*- + * Copyright (c) 2020, Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Neither the name of Alexander V. Chernikov nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include + +#include + +#include "tests/routing/module.h" +#define _TEST_CALLER +#include "tests/routing/test_route_ctl.h" + +static struct rib_head * +create_rnh(int family, u_long fibnum) +{ + struct domain *dom; + struct rib_head *rnh; + + for (dom = domains; dom; dom = dom->dom_next) { + if (dom->dom_family != family) + continue; + dom->dom_rtattach((void **)&rnh, 0, fibnum); + return (rnh); + } + + return (NULL); +} + +static void +free_rnh(struct rib_head *rnh) +{ + struct domain *dom; + + if (rnh == NULL) + return; + + for (dom = domains; dom; dom = dom->dom_next) { + if (dom->dom_family != rnh->rib_family) + continue; + dom->dom_rtdetach((void **)&rnh, 0); + break; + } + +} + +static size_t +fill_sa(struct sockaddr *sa, const char *addr) +{ + size_t sz; + + if (strchr(addr, ':')) { + struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *)sa; + + sz = sizeof(struct sockaddr_in6); + bzero(sa6, sz); + sa6->sin6_family = AF_INET6; + sa6->sin6_len = sz; + inet_pton(AF_INET6, addr, &sa6->sin6_addr); + } else { + struct sockaddr_in *sa4 = (struct sockaddr_in *)sa; + + sz = sizeof(struct sockaddr_in); + bzero(sa4, sz); + sa4->sin_family = AF_INET; + sa4->sin_len = sz; + 
inet_pton(AF_INET, addr, &sa4->sin_addr); + } + + return (sz); +} + +static void +sa_fill_mask4(struct sockaddr_in *sin, int plen) +{ + + memset(sin, 0, sizeof(struct sockaddr_in)); + sin->sin_family = AF_INET; + sin->sin_len = sizeof(struct sockaddr_in); + sin->sin_addr.s_addr = htonl(plen ? ~((1 << (32 - plen)) - 1) : 0); +} + +static void +sa_fill_mask6(struct sockaddr_in6 *sin6, uint8_t mask) +{ + uint32_t *cp; + + memset(sin6, 0, sizeof(struct sockaddr_in6)); + sin6->sin6_family = AF_INET6; + sin6->sin6_len = sizeof(struct sockaddr_in6); + + for (cp = (uint32_t *)&sin6->sin6_addr; mask >= 32; mask -= 32) + *cp++ = 0xFFFFFFFF; + if (mask > 0) + *cp = htonl(mask ? ~((1 << (32 - mask)) - 1) : 0); +} + + +static struct rt_addrinfo * +get_info(char *buf, char *_prefix, char *gw_s) +{ + struct rt_addrinfo *info; + struct sockaddr *dst, *gw; + struct sockaddr_in6 *sa6; + struct sockaddr_in *sa4; + char prefix[128], *d; + struct epoch_tracker et; + size_t sz; + + info = (struct rt_addrinfo *)buf; + buf += sizeof(struct rt_addrinfo); + + bzero(info, sizeof(struct rt_addrinfo)); + + strlcpy(prefix, _prefix, sizeof(prefix)); + d = strchr(prefix, '/'); + if (d != NULL) { + *d++ = '\0'; + if (strchr(prefix, ':')) { + sa6 = (struct sockaddr_in6 *)buf; + sa_fill_mask6(sa6, strtol(d, NULL, 10)); + sz = sa6->sin6_len; + } else { + sa4 = (struct sockaddr_in *)buf; + sa_fill_mask4(sa4, strtol(d, NULL, 10)); + sz = sa4->sin_len; + } + info->rti_info[RTAX_NETMASK] = (struct sockaddr *)buf; + buf += sz; + } + + dst = (struct sockaddr *)buf; + buf += fill_sa(dst, prefix); + info->rti_info[RTAX_DST] = dst; + + if (gw_s != NULL) { + gw = (struct sockaddr *)buf; + buf += fill_sa(gw, gw_s); + info->rti_info[RTAX_GATEWAY] = gw; + + NET_EPOCH_ENTER(et); + info->rti_ifa = ifa_ifwithnet(gw, 0, 0); + NET_EPOCH_EXIT(et); + if (info->rti_ifa != NULL) + info->rti_ifp = info->rti_ifa->ifa_ifp; + } + + return (info); +} + +static int +test_add_route_plain_add_success() +{ + int error; + struct 
rib_cmd_info rc; + struct rt_addrinfo *info; + struct rib_head *rnh; + struct rtentry *rt; + struct radix_node *rn; + struct epoch_tracker et; + + char *buf = malloc(1024, M_TEMP, M_WAITOK | M_ZERO); + + rnh = create_rnh(AF_INET6, 0); + info = get_info(buf, "2001:db8:1::/64", "::1"); + + if (info == NULL || info->rti_ifp == NULL || info->rti_ifa == NULL) { + TPRINTF("failed to create info"); + free_rnh(rnh); + free(buf, M_TEMP); + return (EINVAL); + } + + /* done by rib_add_route() */ + bzero(&rc, sizeof(struct rib_cmd_info)); + + NET_EPOCH_ENTER(et); + + error = create_rt_nh_pair_from_info_wrapper(rnh, info, &rt); + if (error != 0) { + TPRINTF("create_rt_nh_pair_from_info() failed: %d", error); + } else { + error = add_route_wrapper(rnh, rt, info, &rc); + if (error == 0) { + rn = rnh->rnh_lookup(rt_key(rt), rt_mask(rt), &rnh->head); + TASSERT((struct rtentry *)rn == rt, "inserted rt not found"); + /* verify rc */ + TASSERT(rc.cmd == RTM_ADD, "cmd!=RTM_ADD:%d", rc.cmd); + TASSERT(rc.rt == rt, "rc.rt!=rt"); + TASSERT(rc.nh_old == NULL, "rc.nh_old!=NULL"); + TASSERT(rc.nh_new == rt->rt_nhop, "rc.nh_new!=rt.rt_nhop"); + } else { + TPRINTF("add_route() returned %d", error); + } + } + NET_EPOCH_EXIT(et); + free(buf, M_TEMP); + free_rnh(rnh); + + return (error); +} + +static int +test_add_route_exist_fail() +{ + int error; + struct rib_cmd_info rc; + struct rt_addrinfo *info; + struct rib_head *rnh; + struct rtentry *rt, *rt2; + struct epoch_tracker et; + + char *buf = malloc(1024, M_TEMP, M_WAITOK | M_ZERO); + + rnh = create_rnh(AF_INET6, 0); + info = get_info(buf, "2001:db8:1::/64", "::1"); + + if (info == NULL || info->rti_ifp == NULL || info->rti_ifa == NULL) { + TPRINTF("failed to create info"); + free_rnh(rnh); + free(buf, M_TEMP); + return (EINVAL); + } + /* Do not set RTF_GATEWAY so the first route is multipath ineligible */ + + /* done by rib_add_route() */ + bzero(&rc, sizeof(struct rib_cmd_info)); + + NET_EPOCH_ENTER(et); + + error = 
create_rt_nh_pair_from_info_wrapper(rnh, info, &rt); + if (error != 0) { + TPRINTF("create_rt_nh_pair_from_info() failed: %d", error); + goto cleanup; + } + /* Set RTF_GATEWAY so the new nexthop is different */ + info->rti_flags |= RTF_GATEWAY; + error = create_rt_nh_pair_from_info_wrapper(rnh, info, &rt2); + if (error != 0) { + TPRINTF("second create_rt_nh_pair_from_info() failed: %d", error); + goto cleanup; + } + + error = add_route_wrapper(rnh, rt, info, &rc); + if (error != 0) { + TPRINTF("add_route() returned %d", error); + goto cleanup; + } + + if (rnh->rnh_lookup(rt_key(rt), rt_mask(rt), &rnh->head) == NULL) { + TPRINTF("added route not found"); + error = EINVAL; + goto cleanup; + } + + error = add_route_wrapper(rnh, rt2, info, &rc); + if (error != EEXIST) { + TPRINTF("add_route() returned %d instead of EEXIST", error); + goto cleanup; + } + + error = 0; +cleanup: + NET_EPOCH_EXIT(et); + free(buf, M_TEMP); + free_rnh(rnh); + + return (error); +} + +static int +test_add_route_pinned_success() +{ + int error; + struct rib_cmd_info rc; + struct rt_addrinfo *info; + struct rib_head *rnh; + struct rtentry *rt, *rt2; + struct epoch_tracker et; + + char *buf = malloc(1024, M_TEMP, M_WAITOK | M_ZERO); + + rnh = create_rnh(AF_INET6, 0); + info = get_info(buf, "2001:db8:1::/64", "::1"); + + if (info == NULL || info->rti_ifp == NULL || info->rti_ifa == NULL) { + TPRINTF("failed to create info"); + free_rnh(rnh); + free(buf, M_TEMP); + return (EINVAL); + } + + /* done by rib_add_route() */ + bzero(&rc, sizeof(struct rib_cmd_info)); + + NET_EPOCH_ENTER(et); + + error = create_rt_nh_pair_from_info_wrapper(rnh, info, &rt); + if (error != 0) { + TPRINTF("create_rt_nh_pair_from_info() failed: %d", error); + goto cleanup; + } + + info->rti_flags |= RTF_PINNED; + error = create_rt_nh_pair_from_info_wrapper(rnh, info, &rt2); + if (error != 0) { + TPRINTF("second create_rt_nh_pair_from_info() failed: %d", error); + goto cleanup; + } + + error = add_route_wrapper(rnh, rt, info, 
&rc); + if (error != 0) { + TPRINTF("add_route() returned %d", error); + goto cleanup; + } + + if (rnh->rnh_lookup(rt_key(rt), rt_mask(rt), &rnh->head) == NULL) { + TPRINTF("added route not found"); + error = EINVAL; + goto cleanup; + } + + error = add_route_wrapper(rnh, rt2, info, &rc); + if (error != 0) { + TPRINTF("second add_route() returned %d", error); + goto cleanup; + } +cleanup: + NET_EPOCH_EXIT(et); + free(buf, M_TEMP); + free_rnh(rnh); + + return (error); +} + +static int +test_del_route_plain_del_success() +{ + int error; + struct rib_cmd_info rc; + struct rt_addrinfo *info; + struct rib_head *rnh; + struct rtentry *rt, *rt2; + struct epoch_tracker et; + + char *buf = malloc(1024, M_TEMP, M_WAITOK | M_ZERO); + + rnh = create_rnh(AF_INET6, 0); + info = get_info(buf, "2001:db8:1::/64", "::1"); + + if (info == NULL || info->rti_ifp == NULL || info->rti_ifa == NULL) { + TPRINTF("failed to create info"); + free_rnh(rnh); + free(buf, M_TEMP); + return (EINVAL); + } + + /* done by rib_del_route() */ + bzero(&rc, sizeof(struct rib_cmd_info)); + + NET_EPOCH_ENTER(et); + + error = create_rt_nh_pair_from_info_wrapper(rnh, info, &rt); + if (error != 0) { + TPRINTF("create_rt_nh_pair_from_info() failed: %d", error); + goto cleanup; + } + + error = add_route_wrapper(rnh, rt, info, &rc); + if (error != 0) { + TPRINTF("add_route() returned %d", error); + goto cleanup; + } + + RIB_WLOCK(rnh); + error = del_route_one_wrapper(rnh, &rt, info); + RIB_WUNLOCK(rnh); + + if (error != 0) { + TPRINTF("del_route_one() returned %d", error); + goto cleanup; + } + + if (rnh->rnh_lookup(rt_key(rt), rt_mask(rt), &rnh->head) != NULL) { + TPRINTF("deleted route still in tree"); + error = EINVAL; + goto cleanup; + } +cleanup: + NET_EPOCH_EXIT(et); + free(buf, M_TEMP); + free_rnh(rnh); + + return (error); +} + +struct ktest_item tests[] = { + DECLARE_KTEST(test_add_route_plain_add_success), + DECLARE_KTEST(test_add_route_exist_fail), + DECLARE_KTEST(test_add_route_pinned_success), + 
DECLARE_KTEST(test_del_route_plain_del_success), +}; +DEFINE_KTESTS(route_ctl, "routing control plane tests", tests); + Index: usr.bin/netstat/Makefile =================================================================== --- usr.bin/netstat/Makefile +++ usr.bin/netstat/Makefile @@ -5,7 +5,7 @@ PROG= netstat SRCS= if.c inet.c main.c mbuf.c mroute.c netisr.c nl_symbols.c route.c \ - unix.c mroute6.c ipsec.c bpf.c pfkey.c sctp.c \ + unix.c mroute6.c ipsec.c bpf.c pfkey.c sctp.c common.c nhops.c \ nl_defs.h nl_symbols.c: nlist_symbols Index: usr.bin/netstat/common.h =================================================================== --- /dev/null +++ usr.bin/netstat/common.h @@ -0,0 +1,24 @@ +#ifndef _NETSTAT_COMMON_H_ +#define _NETSTAT_COMMON_H_ + +struct bits { + u_long b_mask; + char b_val; + const char *b_name; +}; +extern struct bits rt_bits[]; + +const char *fmt_flags(const struct bits *p, int f); +void print_flags_generic(int flags, const struct bits *pbits, + const char *format, const char *tag_name); +int print_sockaddr(const char *name, struct sockaddr *sa, + struct sockaddr *mask, int flags, int width); + +struct ifmap_entry { + char ifname[IFNAMSIZ]; +}; + +struct ifmap_entry *prepare_ifmap(size_t *ifmap_size); + +#endif + Index: usr.bin/netstat/common.c =================================================================== --- /dev/null +++ usr.bin/netstat/common.c @@ -0,0 +1,140 @@ +/*- + * SPDX-License-Identifier: BSD-3-Clause + * + * Copyright (c) 1983, 1988, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */
+
+#include
+__FBSDID("$FreeBSD$");
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "netstat.h"
+#include "common.h"
+
+/*
+ * Build the compact flag string for the bits of 'f' that appear in the
+ * zero-terminated table 'p': one b_val character per matching entry, in
+ * table order.
+ *
+ * The result lives in a static buffer, so it is overwritten by the next
+ * call.  Output is capped at sizeof(name) - 1 characters so that a table
+ * with more matching entries than the buffer holds cannot overrun it.
+ */
+const char *
+fmt_flags(const struct bits *p, int f)
+{
+	static char name[33];
+	char *flags = name;
+	char *const last = name + sizeof(name) - 1;
+
+	for (; p->b_mask != 0 && flags < last; p++) {
+		if (p->b_mask & f)
+			*flags++ = p->b_val;
+	}
+	*flags = '\0';
+	return (name);
+}
+
+/*
+ * Emit a flags column twice: first the compact string from fmt_flags()
+ * using the caller-supplied libxo 'format', then a list named 'tag_name'
+ * holding the b_name of every set flag as a leaf-list element.
+ */
+void
+print_flags_generic(int flags, const struct bits *pbits, const char *format,
+    const char *tag_name)
+{
+	const struct bits *p;
+	char tag_fmt[64];
+
+	xo_emit(format, fmt_flags(pbits, flags));
+
+	/* "%%s" survives snprintf() as "%s" for the xo_emit() calls below. */
+	snprintf(tag_fmt, sizeof(tag_fmt), "{le:%s/%%s}", tag_name);
+	xo_open_list(tag_name);
+	for (p = pbits; p->b_mask != 0; p++) {
+		if (p->b_mask & flags)
+			xo_emit(tag_fmt, p->b_name);
+	}
+	xo_close_list(tag_name);
+}
+
+/*
+ * Build an array indexed by interface index that holds interface names,
+ * using the AF_LINK entries returned by getifaddrs().
+ *
+ * The array grows in multiples of 32 entries and new entries are zeroed,
+ * so an index without an interface has an empty name.  The number of
+ * entries is stored in *pifmap_size.  Returns the array, or NULL when no
+ * AF_LINK entry was seen.  Exits via err()/errx() on failure.
+ */
+struct ifmap_entry *
+prepare_ifmap(size_t *pifmap_size)
+{
+	struct ifaddrs *ifap, *ifa;
+	struct sockaddr_dl *sdl;
+	struct ifmap_entry *ifmap = NULL;
+	size_t ifmap_size = 0, new_size, ifindex;
+
+	/*
+	 * Retrieve interface list at first
+	 * since we need #ifindex -> if_xname match
+	 */
+	if (getifaddrs(&ifap) != 0)
+		err(EX_OSERR, "getifaddrs");
+
+	for (ifa = ifap; ifa != NULL; ifa = ifa->ifa_next) {
+		/* getifaddrs() may return entries with no address at all. */
+		if (ifa->ifa_addr == NULL ||
+		    ifa->ifa_addr->sa_family != AF_LINK)
+			continue;
+
+		sdl = (struct sockaddr_dl *)ifa->ifa_addr;
+		ifindex = sdl->sdl_index;
+
+		if (ifindex >= ifmap_size) {
+			new_size = roundup(ifindex + 1, 32);
+			ifmap = realloc(ifmap, new_size * sizeof(*ifmap));
+			if (ifmap == NULL)
+				errx(2, "realloc(%zu) failed",
+				    new_size * sizeof(*ifmap));
+			/* Zero only the newly added tail of the array. */
+			memset(&ifmap[ifmap_size], 0,
+			    (new_size - ifmap_size) * sizeof(*ifmap));
+			ifmap_size = new_size;
+		}
+
+		/* Keep the first name seen for a given index. */
+		if (*ifmap[ifindex].ifname != '\0')
+			continue;
+
+		strlcpy(ifmap[ifindex].ifname, ifa->ifa_name, IFNAMSIZ);
+	}
+
+	freeifaddrs(ifap);
+
+	*pifmap_size = ifmap_size;
+
+	return (ifmap);
+}
+
Index: 
usr.bin/netstat/main.c =================================================================== --- usr.bin/netstat/main.c +++ usr.bin/netstat/main.c @@ -214,6 +214,7 @@ int noutputs = 0; /* how much outputs before we exit */ int numeric_addr; /* show addresses numerically */ int numeric_port; /* show ports numerically */ +int oflag; /* show nexthop objects*/ int Pflag; /* show TCP log ID */ static int pflag; /* show given protocol */ static int Qflag; /* show netisr information */ @@ -248,7 +249,7 @@ if (argc < 0) exit(EXIT_FAILURE); - while ((ch = getopt(argc, argv, "46AaBbdF:f:ghI:iLlM:mN:nPp:Qq:RrSTsuWw:xz")) + while ((ch = getopt(argc, argv, "46AaBbdF:f:ghI:iLlM:mN:noPp:Qq:RrSTsuWw:xz")) != -1) switch(ch) { case '4': @@ -345,6 +346,9 @@ case 'n': numeric_addr = numeric_port = 1; break; + case 'o': + oflag = 1; + break; case 'P': Pflag = 1; break; @@ -494,6 +498,15 @@ xo_finish(); exit(0); } + if (oflag) { + xo_open_container("statistics"); + nhops_print(fib, af); + nhgrp_print(fib, af); + xo_close_container("statistics"); + xo_finish(); + exit(0); + } + if (gflag) { xo_open_container("statistics"); Index: usr.bin/netstat/netstat.h =================================================================== --- usr.bin/netstat/netstat.h +++ usr.bin/netstat/netstat.h @@ -147,6 +147,10 @@ char *routename(struct sockaddr *, int); const char *netname(struct sockaddr *, struct sockaddr *); void routepr(int, int); +int p_sockaddr(const char *name, struct sockaddr *sa, + struct sockaddr *mask, int flags, int width); +const char *fmt_sockaddr(struct sockaddr *sa, struct sockaddr *mask, + int flags); #ifdef NETGRAPH void netgraphprotopr(u_long, const char *, int, int); @@ -157,3 +161,6 @@ void mroutepr(void); void mrt_stats(void); void bpf_stats(char *); +void nhops_print(int fibnum, int af); +void nhgrp_print(int fibnum, int af); + Index: usr.bin/netstat/nhops.c =================================================================== --- /dev/null +++ usr.bin/netstat/nhops.c @@ -0,0 
+1,687 @@ +/*- + * SPDX-License-Identifier: BSD-3-Clause + * + * Copyright (c) 1983, 1988, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "netstat.h" +#include "common.h" + +/* column widths; each followed by one space */ +#ifndef INET6 +#define WID_DST_DEFAULT(af) 18 /* width of destination column */ +#define WID_GW_DEFAULT(af) 18 /* width of gateway column */ +#define WID_IF_DEFAULT(af) (Wflag ? 10 : 8) /* width of netif column */ +#else +#define WID_DST_DEFAULT(af) \ + ((af) == AF_INET6 ? (numeric_addr ? 33: 18) : 18) +#define WID_GW_DEFAULT(af) \ + ((af) == AF_INET6 ? (numeric_addr ? 29 : 18) : 18) +#define WID_IF_DEFAULT(af) ((af) == AF_INET6 ? 8 : (Wflag ? 10 : 8)) +#endif /*INET6*/ +static int wid_dst; +static int wid_gw; +static int wid_flags; +static int wid_pksent; +static int wid_mtu; +static int wid_if; +static int wid_nhidx; +static int wid_nhtype; +static int wid_refcnt; +static int wid_prepend; + +static struct bits nh_bits[] = { + { NHF_REJECT, 'R', "reject" }, + { NHF_BLACKHOLE,'B', "blackhole" }, + { NHF_REDIRECT, 'r', "redirect" }, + { NHF_GATEWAY, 'G', "gateway" }, + { NHF_DEFAULT, 'd', "default" }, + { NHF_BROADCAST,'b', "broadcast" }, + { 0 , 0, NULL } +}; + +static char *nh_types[] = { + "empty", /* 0 */ + "v4/resolve", /* 1 */ + "v4/gw", + "v6/resolve", + "v6/gw" +}; + +struct nhop_entry { + char gw[64]; + char ifname[IFNAMSIZ]; +}; + +struct nhop_map { + struct nhop_entry *ptr; + size_t size; +}; +static struct nhop_map global_nhop_map; + +static void nhop_map_update(struct nhop_map *map, uint32_t idx, + char *gw, char *ifname); +static struct nhop_entry *nhop_get(struct nhop_map *map, uint32_t idx); + + +static struct ifmap_entry *ifmap; +static size_t ifmap_size; + +static void +print_sockaddr_buf(char *buf, size_t bufsize, const struct sockaddr 
*sa) +{ + + switch (sa->sa_family) { + case AF_INET: + inet_ntop(AF_INET, &((struct sockaddr_in *)sa)->sin_addr, + buf, bufsize); + break; + case AF_INET6: + inet_ntop(AF_INET6, &((struct sockaddr_in6 *)sa)->sin6_addr, + buf, bufsize); + break; + default: + snprintf(buf, bufsize, "unknown:%d", sa->sa_family); + break; + } +} + +static int +print_addr(const char *name, const char *addr, int width) +{ + char buf[128]; + int protrusion; + + if (width < 0) { + snprintf(buf, sizeof(buf), "{:%s/%%s} ", name); + xo_emit(buf, addr); + protrusion = 0; + } else { + if (Wflag != 0 || numeric_addr) { + snprintf(buf, sizeof(buf), "{[:%d}{:%s/%%s}{]:} ", + -width, name); + xo_emit(buf, addr); + protrusion = strlen(addr) - width; + if (protrusion < 0) + protrusion = 0; + } else { + snprintf(buf, sizeof(buf), "{[:%d}{:%s/%%-.*s}{]:} ", + -width, name); + xo_emit(buf, width, addr); + protrusion = 0; + } + } + return (protrusion); +} + + +static void +print_nhop_header(int af1 __unused) +{ + + if (Wflag) { + xo_emit("{T:/%-*.*s} {T:/%-*.*s} {T:/%-*.*s} {T:/%-*.*s} {T:/%*.*s} " + "{T:/%*.*s} {T:/%-*.*s} {T:/%*.*s} {T:/%*.*s} {T:/%*.*s} {T:/%*s}\n", + wid_nhidx, wid_nhidx, "Idx", + wid_nhtype, wid_nhtype, "Type", + wid_dst, wid_dst, "IFA", + wid_gw, wid_gw, "Gateway", + wid_flags, wid_flags, "Flags", + wid_pksent, wid_pksent, "Use", + wid_mtu, wid_mtu, "Mtu", + wid_if, wid_if, "Netif", + wid_if, wid_if, "Addrif", + wid_refcnt, wid_refcnt, "Refcnt", + wid_prepend, "Prepend"); + } else { + xo_emit("{T:/%-*.*s} {T:/%-*.*s} {T:/%-*.*s} {T:/%-*.*s} {T:/%*.*s} " + " {T:/%*s}\n", + wid_nhidx, wid_nhidx, "Idx", + wid_dst, wid_dst, "IFA", + wid_gw, wid_gw, "Gateway", + wid_flags, wid_flags, "Flags", + wid_if, wid_if, "Netif", + wid_prepend, "Refcnt"); + } +} + +static void +print_nhgroup_header(int af1 __unused) +{ + + xo_emit("{T:/%-*.*s} {T:/%-*.*s} {T:/%-*.*s} {T:/%-*.*s} {T:/%*.*s}" + " {T:/%-*.*s} {T:/%*s}\n", + wid_nhidx, wid_nhidx, "MpIdx", + wid_nhidx, wid_nhidx, "NHIdx", + wid_nhidx, 
wid_nhidx, "Weight",
+	    wid_nhidx, wid_nhidx, "Slots",
+	    wid_gw, wid_gw, "Gateway",
+	    wid_if, wid_if, "Netif",
+	    wid_nhidx, "Refcnt");
+}
+
+/*
+ * Print one nexthop group: a summary row with the group index and
+ * refcount, followed by one row per member nexthop showing its index,
+ * weight, number of forwarding slots and, when the nexthop is known to
+ * global_nhop_map, its gateway and interface.
+ *
+ * The message layout read here is: struct mpath_external, then
+ * mp_nh_count struct mpath_nhop_external records, then mp_group_size
+ * uint32_t nexthop indexes (one per forwarding slot).
+ */
+static void
+print_nhgroup_entry_sysctl(const char *name, struct rt_msghdr *rtm __unused,
+    struct mpath_external *mpe)
+{
+	char buffer[128];
+	struct nhop_entry *ne;
+	struct mpath_nhop_external *ext;
+	uint32_t *fwd_c, *pidx;
+	uint32_t i, nh_idx, slots;
+	size_t nslots;
+
+	xo_open_instance(name);
+
+	/*
+	 * NOTE(review): the "%lu" conversions below are fed struct fields
+	 * whose types are not visible here; confirm they are unsigned long
+	 * or add casts.
+	 */
+	snprintf(buffer, sizeof(buffer), "{[:-%d}{:mp_index/%%lu}{]:} ", wid_nhidx);
+	xo_emit(buffer, mpe->mp_idx);
+
+	xo_emit("{t:dummy-1/%*.*s}", wid_nhidx, wid_nhidx, "----");
+	xo_emit("{t:dummy-2/%*.*s}", wid_nhidx, wid_nhidx, "----");
+	xo_emit("{t:dummy-2/%*.*s}", wid_nhidx, wid_nhidx, "----");
+	xo_emit("{t:dummy-3/%*.*s}", wid_gw, wid_gw, "----");
+	xo_emit("{t:dummy-4/%*.*s}", wid_if, wid_if, "----");
+	xo_emit("{t:mp-refcnt/%*lu}", wid_nhidx, mpe->mp_refcount);
+	xo_emit("\n");
+
+	ext = (struct mpath_nhop_external *)(mpe + 1);
+	pidx = (uint32_t *)&ext[mpe->mp_nh_count];
+
+	/*
+	 * Count how many forwarding slots point at each nexthop index.
+	 * The counters are sized by the nexthop map, so indexes the map
+	 * has never seen are skipped instead of written out of bounds.
+	 */
+	nslots = global_nhop_map.size;
+	fwd_c = NULL;
+	if (nslots > 0 && (fwd_c = calloc(nslots, sizeof(*fwd_c))) == NULL)
+		errx(2, "calloc(%zu) failed", nslots * sizeof(*fwd_c));
+	for (i = 0; i < mpe->mp_group_size; i++) {
+		if (pidx[i] < nslots)
+			fwd_c[pidx[i]]++;
+	}
+
+	xo_open_list("nhop_weights");
+	for (i = 0; i < mpe->mp_nh_count; i++) {
+		nh_idx = ext[i].nh_idx;
+		slots = (nh_idx < nslots) ? fwd_c[nh_idx] : 0;
+
+		xo_open_instance("nhop-weight");
+		/* Empty cell under the group index column. */
+		snprintf(buffer, sizeof(buffer), "{[:-%d}{:d-0/%%s}{]:} ", wid_nhidx);
+		xo_emit(buffer, "");
+		xo_emit("{t:nh-index/%*lu} ", wid_nhidx, ext[i].nh_idx);
+		xo_emit("{t:nh-weight/%*lu} ", wid_nhidx, ext[i].nh_weight);
+		xo_emit("{t:nh-slots/%*lu} ", wid_nhidx, (unsigned long)slots);
+		ne = nhop_get(&global_nhop_map, nh_idx);
+		if (ne != NULL) {
+			xo_emit("{t:nh-gw/%*.*s}", wid_gw, wid_gw, ne->gw);
+			xo_emit("{t:nh-interface/%*.*s}", wid_if, wid_if, ne->ifname);
+		}
+		xo_emit("\n");
+		xo_close_instance("nhop-weight");
+	}
+	xo_close_list("nhop_weights");
+
+	free(fwd_c);
+	xo_close_instance(name);
+}
+
+
+/*
+ * Fetch the nexthop group dump for (fibnum, af) through the PF_ROUTE
+ * sysctl and print every group in it.  Exits via err()/errx() when the
+ * sysctl or the allocation fails.
+ */
+static void
+print_nhgrp_sysctl(int fibnum, int af)
+{
+	size_t needed;
+	int mib[7];
+	char *buf, *next, *lim;
+	struct rt_msghdr *rtm;
+	struct mpath_external *mp;
+	int fam = AF_UNSPEC;
+	int need_table_close = false;
+
+	mib[0] = CTL_NET;
+	mib[1] = PF_ROUTE;
+	mib[2] = 0;
+	mib[3] = af;
+	mib[4] = NET_RT_NHGROUPS;
+	mib[5] = 0;
+	mib[6] = fibnum;
+	/* First call only sizes the buffer, second call fills it. */
+	if (sysctl(mib, nitems(mib), NULL, &needed, NULL, 0) < 0)
+		err(EX_OSERR, "sysctl: net.route.0.%d.nhgrpdump.%d estimate",
+		    af, fibnum);
+	if ((buf = malloc(needed)) == NULL)
+		errx(2, "malloc(%lu)", (unsigned long)needed);
+	if (sysctl(mib, nitems(mib), buf, &needed, NULL, 0) < 0)
+		err(1, "sysctl: net.route.0.%d.nhgrpdump.%d", af, fibnum);
+	lim = buf + needed;
+	xo_open_container("nhgrp-table");
+	xo_open_list("rt-family");
+	for (next = buf; next < lim; next += rtm->rtm_msglen) {
+		rtm = (struct rt_msghdr *)next;
+		/* A zero-length message would never advance 'next'. */
+		if (rtm->rtm_msglen == 0)
+			break;
+		if (rtm->rtm_version != RTM_VERSION)
+			continue;
+
+		mp = (struct mpath_external *)(rtm + 1);
+		/*
+		 * Print the family banner and column header once, taking
+		 * the family from the request.
+		 * NOTE(review): fam starts as AF_UNSPEC, so when af is
+		 * AF_UNSPEC this branch never runs and neither the header
+		 * nor the column widths are set up here - confirm that
+		 * is intended.
+		 */
+		if (fam != af) {
+			if (need_table_close) {
+				xo_close_list("nhgrp-entry");
+				xo_close_instance("rt-family");
+			}
+			need_table_close = true;
+
+			fam = af;
+			wid_dst = WID_GW_DEFAULT(fam);
+			wid_gw = WID_GW_DEFAULT(fam);
+			wid_nhidx = 5;
+			wid_nhtype = 12;
+			wid_refcnt = 6;
+			wid_flags = 6;
+			wid_pksent = 8;
+			wid_mtu = 6;
+			wid_if = WID_IF_DEFAULT(fam);
+			xo_open_instance("rt-family");
+			pr_family(fam);
+			xo_open_list("nhgrp-entry");
+
+			print_nhgroup_header(fam);
+		}
+		print_nhgroup_entry_sysctl("nhgrp-entry", rtm, mp);
+	}
+	if (need_table_close) {
+		xo_close_list("nhgrp-entry");
+		xo_close_instance("rt-family");
+	}
+	xo_close_list("rt-family");
+	xo_close_container("nhgrp-table");
+	free(buf);
+}
+
+/*
+ * Record the gateway and interface strings for nexthop index 'idx',
+ * growing the map as needed.  The map doubles in size (starting at 32
+ * entries), or jumps to idx + 1 rounded up to a multiple of 32 when
+ * doubling is not enough; new entries are zeroed.  Exits via errx() when
+ * the allocation fails.
+ */
+static void
+nhop_map_update(struct nhop_map *map, uint32_t idx, char *gw, char *ifname)
+{
+	size_t new_size, sz;
+
+	if (idx >= map->size) {
+		new_size = (map->size == 0) ? 32 : map->size * 2;
+		if (new_size <= idx)
+			new_size = roundup((size_t)idx + 1, 32);
+
+		sz = new_size * sizeof(struct nhop_entry);
+		if ((map->ptr = realloc(map->ptr, sz)) == NULL)
+			errx(2, "realloc(%zu) failed", sz);
+
+		/* Zero only the newly added tail of the array. */
+		memset(&map->ptr[map->size], 0,
+		    (new_size - map->size) * sizeof(struct nhop_entry));
+		map->size = new_size;
+	}
+
+	strlcpy(map->ptr[idx].ifname, ifname, sizeof(map->ptr[idx].ifname));
+	strlcpy(map->ptr[idx].gw, gw, sizeof(map->ptr[idx].gw));
+}
+
+/*
+ * Look up nexthop index 'idx'.  Returns NULL when the index lies outside
+ * the map or the entry has an empty interface name.
+ */
+static struct nhop_entry *
+nhop_get(struct nhop_map *map, uint32_t idx)
+{
+
+	if (idx >= map->size)
+		return (NULL);
+	if (*map->ptr[idx].ifname == '\0')
+		return (NULL);
+	return (&map->ptr[idx]);
+}
+
+static void
+print_nhop_entry_sysctl(const char *name, struct rt_msghdr *rtm, struct nhop_external *nh)
+{
+	char buffer[128];
+	char iface_name[128];
+	int protrusion;
+	char gw_addr[64];
+	struct sockaddr *sa_gw, *sa_ifa;
+
+	xo_open_instance(name);
+
+	snprintf(buffer, sizeof(buffer), "{[:-%d}{:index/%%lu}{]:} ", wid_nhidx);
+	//xo_emit("{t:index/%-lu} ", wid_nhidx, nh->nh_idx);
+	xo_emit(buffer,
nh->nh_idx); + + if (Wflag) { + char *cp = nh_types[nh->nh_type]; + xo_emit("{t:type_str/%*s} ", wid_nhtype, cp); + } + memset(iface_name, 0, sizeof(iface_name)); + if (nh->ifindex < (uint32_t)ifmap_size) { + strlcpy(iface_name, ifmap[nh->ifindex].ifname, + sizeof(iface_name)); + if (*iface_name == '\0') + strlcpy(iface_name, "---", sizeof(iface_name)); + } + + //inet_ntop(nh->nh_family, &nh->nh_src, src_addr, sizeof(src_addr)); + //protrusion = p_addr("ifa", src_addr, wid_dst); + sa_gw = (struct sockaddr *)(nh + 1); + sa_ifa = (struct sockaddr *)((char *)sa_gw + sa_gw->sa_len); + protrusion = p_sockaddr("ifa", sa_ifa, NULL, RTF_HOST, wid_dst); + + if (nh->nh_flags & NHF_GATEWAY) { + const char *cp; + cp = fmt_sockaddr(sa_gw, NULL, RTF_HOST); + strlcpy(gw_addr, cp, sizeof(gw_addr)); + } else + snprintf(gw_addr, sizeof(gw_addr), "%s/resolve", iface_name); + protrusion = print_addr("gateway", gw_addr, wid_dst - protrusion); + + nhop_map_update(&global_nhop_map, nh->nh_idx, gw_addr, iface_name); + + snprintf(buffer, sizeof(buffer), "{[:-%d}{:flags/%%s}{]:} ", + wid_flags - protrusion); + + //p_nhflags(nh->nh_flags, buffer); + print_flags_generic(rtm->rtm_flags, rt_bits, buffer, "rt_flags_pretty"); + + if (Wflag) { + xo_emit("{t:use/%*lu} ", wid_pksent, nh->nh_pksent); + xo_emit("{t:mtu/%*lu} ", wid_mtu, nh->nh_mtu); + } + //printf("IDX: %d IFACE: %s FAMILY: %d TYPE: %d FLAGS: %X GW \n"); + + if (Wflag) + xo_emit("{t:interface-name/%*s}", wid_if, iface_name); + else + xo_emit("{t:interface-name/%*.*s}", wid_if, wid_if, iface_name); + + memset(iface_name, 0, sizeof(iface_name)); + if (nh->aifindex < (uint32_t)ifmap_size && nh->ifindex != nh->aifindex) { + strlcpy(iface_name, ifmap[nh->aifindex].ifname, + sizeof(iface_name)); + if (*iface_name == '\0') + strlcpy(iface_name, "---", sizeof(iface_name)); + } + if (Wflag) + xo_emit("{t:address-interface-name/%*s}", wid_if, iface_name); + + xo_emit("{t:refcount/%*lu} ", wid_refcnt, nh->nh_refcount); + if (Wflag && 
nh->prepend_len) { + char *prepend_hex = "AABBCCDDEE"; + xo_emit(" {:nhop-prepend/%*s}", wid_prepend, prepend_hex); + } + + xo_emit("\n"); + xo_close_instance(name); +} + +static void +print_nhops_sysctl(int fibnum, int af) +{ + size_t needed; + int mib[7]; + char *buf, *next, *lim; + struct rt_msghdr *rtm; + struct nhop_external *nh; + int fam = AF_UNSPEC; + int need_table_close = false; + + mib[0] = CTL_NET; + mib[1] = PF_ROUTE; + mib[2] = 0; + mib[3] = af; + mib[4] = NET_RT_NHOP; + mib[5] = 0; + mib[6] = fibnum; + if (sysctl(mib, nitems(mib), NULL, &needed, NULL, 0) < 0) + err(EX_OSERR, "sysctl: net.route.0.%d.nhdump.%d estimate", af, + fibnum); + if ((buf = malloc(needed)) == NULL) + errx(2, "malloc(%lu)", (unsigned long)needed); + if (sysctl(mib, nitems(mib), buf, &needed, NULL, 0) < 0) + err(1, "sysctl: net.route.0.%d.nhdump.%d", af, fibnum); + lim = buf + needed; + xo_open_container("nhop-table"); + xo_open_list("rt-family"); + for (next = buf; next < lim; next += rtm->rtm_msglen) { + rtm = (struct rt_msghdr *)next; + if (rtm->rtm_version != RTM_VERSION) + continue; + + nh = (struct nhop_external *)(rtm + 1); + /* + * Peek inside header to determine AF + */ + /* Only print family first time. 
*/ + if (fam != nh->nh_family) { + if (need_table_close) { + xo_close_list("nh-entry"); + xo_close_instance("rt-family"); + } + need_table_close = true; + + fam = nh->nh_family; + wid_dst = WID_GW_DEFAULT(fam); + wid_gw = WID_GW_DEFAULT(fam); + wid_nhidx = 5; + wid_nhtype = 12; + wid_refcnt = 6; + wid_flags = 6; + wid_pksent = 8; + wid_mtu = 6; + wid_if = WID_IF_DEFAULT(fam); + xo_open_instance("rt-family"); + pr_family(fam); + xo_open_list("nh-entry"); + + print_nhop_header(fam); + } + print_nhop_entry_sysctl("nh-entry", rtm, nh); + } + if (need_table_close) { + xo_close_list("nh-entry"); + xo_close_instance("rt-family"); + } + xo_close_list("rt-family"); + xo_close_container("nhop-table"); + free(buf); +} + +static void +p_nhflags(int f, const char *format) +{ + struct bits *p; + char *pretty_name = "nh_flags_pretty"; + + xo_emit(format, fmt_flags(nh_bits, f)); + + xo_open_list(pretty_name); + for (p = nh_bits; p->b_mask; p++) + if (p->b_mask & f) + xo_emit("{le:nh_flags_pretty/%s}", p->b_name); + xo_close_list(pretty_name); +} + +void +nhops_print(int fibnum, int af) +{ + size_t intsize; + int numfibs; + + intsize = sizeof(int); + if (fibnum == -1 && + sysctlbyname("net.my_fibnum", &fibnum, &intsize, NULL, 0) == -1) + fibnum = 0; + if (sysctlbyname("net.fibs", &numfibs, &intsize, NULL, 0) == -1) + numfibs = 1; + if (fibnum < 0 || fibnum > numfibs - 1) + errx(EX_USAGE, "%d: invalid fib", fibnum); + + ifmap = prepare_ifmap(&ifmap_size); + + xo_open_container("route-nhop-information"); + xo_emit("{T:Nexthop data}"); + if (fibnum) + xo_emit(" ({L:fib}: {:fib/%d})", fibnum); + xo_emit("\n"); + print_nhops_sysctl(fibnum, af); + xo_close_container("route-nhop-information"); +} + +void +nhgrp_print(int fibnum, int af) +{ + size_t intsize; + int numfibs; + + intsize = sizeof(int); + if (fibnum == -1 && + sysctlbyname("net.my_fibnum", &fibnum, &intsize, NULL, 0) == -1) + fibnum = 0; + if (sysctlbyname("net.fibs", &numfibs, &intsize, NULL, 0) == -1) + numfibs = 1; + if 
(fibnum < 0 || fibnum > numfibs - 1) + errx(EX_USAGE, "%d: invalid fib", fibnum); + + ifmap = prepare_ifmap(&ifmap_size); + + xo_open_container("route-nhgrp-information"); + xo_emit("{T:Nexthop groups data}"); + if (fibnum) + xo_emit(" ({L:fib}: {:fib/%d})", fibnum); + xo_emit("\n"); + print_nhgrp_sysctl(fibnum, af); + xo_close_container("route-nhgrp-information"); +} Index: usr.bin/netstat/route.c =================================================================== --- usr.bin/netstat/route.c +++ usr.bin/netstat/route.c @@ -69,16 +69,13 @@ #include #include #include "netstat.h" +#include "common.h" #include "nl_defs.h" /* * Definitions for showing gateway flags. */ -static struct bits { - u_long b_mask; - char b_val; - const char *b_name; -} bits[] = { +struct bits rt_bits[] = { { RTF_UP, 'U', "up" }, { RTF_GATEWAY, 'G', "gateway" }, { RTF_HOST, 'H', "host" }, @@ -99,11 +96,8 @@ { 0 , 0, NULL } }; -struct ifmap_entry { - char ifname[IFNAMSIZ]; -}; static struct ifmap_entry *ifmap; -static int ifmap_size; +static size_t ifmap_size; static struct timespec uptime; static const char *netname4(in_addr_t, in_addr_t); @@ -112,12 +106,7 @@ #endif static void p_rtable_sysctl(int, int); static void p_rtentry_sysctl(const char *name, struct rt_msghdr *); -static int p_sockaddr(const char *name, struct sockaddr *, struct sockaddr *, - int, int); -static const char *fmt_sockaddr(struct sockaddr *sa, struct sockaddr *mask, - int flags); static void p_flags(int, const char *); -static const char *fmt_flags(int f); static void domask(char *, size_t, u_long); @@ -229,7 +218,7 @@ wid_dst, wid_dst, "Destination", wid_gw, wid_gw, "Gateway", wid_flags, wid_flags, "Flags", - wid_pksent, wid_pksent, "Use", + wid_mtu, wid_mtu, "Nhop#", wid_mtu, wid_mtu, "Mtu", wid_if, wid_if, "Netif", wid_expire, "Expire"); @@ -252,47 +241,11 @@ char *buf, *next, *lim; struct rt_msghdr *rtm; struct sockaddr *sa; - int fam = AF_UNSPEC, ifindex = 0, size; + int fam = AF_UNSPEC; int need_table_close = false; 
- struct ifaddrs *ifap, *ifa; - struct sockaddr_dl *sdl; + ifmap = prepare_ifmap(&ifmap_size); - /* - * Retrieve interface list at first - * since we need #ifindex -> if_xname match - */ - if (getifaddrs(&ifap) != 0) - err(EX_OSERR, "getifaddrs"); - - for (ifa = ifap; ifa; ifa = ifa->ifa_next) { - - if (ifa->ifa_addr->sa_family != AF_LINK) - continue; - - sdl = (struct sockaddr_dl *)ifa->ifa_addr; - ifindex = sdl->sdl_index; - - if (ifindex >= ifmap_size) { - size = roundup(ifindex + 1, 32) * - sizeof(struct ifmap_entry); - if ((ifmap = realloc(ifmap, size)) == NULL) - errx(2, "realloc(%d) failed", size); - memset(&ifmap[ifmap_size], 0, - size - ifmap_size * - sizeof(struct ifmap_entry)); - - ifmap_size = roundup(ifindex + 1, 32); - } - - if (*ifmap[ifindex].ifname != '\0') - continue; - - strlcpy(ifmap[ifindex].ifname, ifa->ifa_name, IFNAMSIZ); - } - - freeifaddrs(ifap); - mib[0] = CTL_NET; mib[1] = PF_ROUTE; mib[2] = 0; @@ -377,7 +330,8 @@ wid_flags - protrusion); p_flags(rtm->rtm_flags, buffer); if (Wflag) { - xo_emit("{t:use/%*lu} ", wid_pksent, rtm->rtm_rmx.rmx_pksent); + /* XXX: use=0? 
*/ + xo_emit("{t:nhop/%*lu} ", wid_mtu, rtm->rtm_rmx.rmx_pksent); if (rtm->rtm_rmx.rmx_mtu != 0) xo_emit("{t:mtu/%*lu} ", wid_mtu, rtm->rtm_rmx.rmx_mtu); @@ -410,7 +364,7 @@ xo_close_instance(name); } -static int +int p_sockaddr(const char *name, struct sockaddr *sa, struct sockaddr *mask, int flags, int width) { @@ -442,7 +396,7 @@ return (protrusion); } -static const char * +const char * fmt_sockaddr(struct sockaddr *sa, struct sockaddr *mask, int flags) { static char buf[128]; @@ -519,30 +473,10 @@ static void p_flags(int f, const char *format) { - struct bits *p; - xo_emit(format, fmt_flags(f)); - - xo_open_list("flags_pretty"); - for (p = bits; p->b_mask; p++) - if (p->b_mask & f) - xo_emit("{le:flags_pretty/%s}", p->b_name); - xo_close_list("flags_pretty"); + print_flags_generic(f, rt_bits, format, "flags_pretty"); } -static const char * -fmt_flags(int f) -{ - static char name[33]; - char *flags; - struct bits *p = bits; - - for (flags = name; p->b_mask; p++) - if (p->b_mask & f) - *flags++ = p->b_val; - *flags = '\0'; - return (name); -} char * routename(struct sockaddr *sa, int flags)