Index: include/Makefile =================================================================== --- include/Makefile +++ include/Makefile @@ -1,5 +1,5 @@ # @(#)Makefile 8.2 (Berkeley) 1/4/94 -# $FreeBSD$ +# $FreeBSD: head/include/Makefile 358500 2020-03-01 20:37:42Z imp $ # # Doing a "make install" builds /usr/include. @@ -54,6 +54,7 @@ geom/mirror geom/mountver geom/multipath geom/nop \ geom/raid geom/raid3 geom/shsec geom/stripe geom/virstor \ net/altq \ + net/route \ netgraph/atm netgraph/netflow \ netinet/cc \ netinet/netdump \ Index: lib/libc/gen/sysctl.3 =================================================================== --- lib/libc/gen/sysctl.3 +++ lib/libc/gen/sysctl.3 @@ -563,6 +563,7 @@ .It Dv NET_RT_IFLIST Ta 0 or if_index Ta None .It Dv NET_RT_IFMALIST Ta 0 or if_index Ta None .It Dv NET_RT_IFLISTL Ta 0 or if_index Ta None +.It Dv NET_RT_NHOPS Ta None Ta fib number .El .Pp The @@ -583,6 +584,9 @@ .Va struct if_msghdrl and .Va struct ifa_msghdrl . +.Pp +.Dv NET_RT_NHOPS +returns all nexthops for specified address family in given fib. .It Li PF_INET Get or set various global information about the IPv4 (Internet Protocol version 4). Index: sys/conf/files =================================================================== --- sys/conf/files +++ sys/conf/files @@ -4094,6 +4094,11 @@ net/raw_usrreq.c standard net/route.c standard net/route_temporal.c standard +net/route/nhop.c standard +net/route/nhop_ctl.c standard +net/route/nhop_utils.c standard +net/route/route_ctl.c standard +net/route/route_helpers.c standard net/rss_config.c optional inet rss | inet6 rss net/rtsock.c standard net/slcompress.c optional netgraph_vjc | sppp | \ Index: sys/net/route.h =================================================================== --- sys/net/route.h +++ sys/net/route.h @@ -90,7 +90,8 @@ u_long rmx_rttvar; /* estimated rtt variance */ u_long rmx_pksent; /* packets sent using this route */ u_long rmx_weight; /* route weight */ - u_long rmx_filler[3]; /* will be used for T/TCP later */ + u_long rmx_nhidx; /* route nexhop index */ + u_long rmx_filler[2]; /* will be used for T/TCP later */ }; /* @@ -150,6 +151,7 @@ struct sockaddr *rt_gateway; /* value */ struct ifnet *rt_ifp; /* the answer: interface to use */ struct ifaddr *rt_ifa; /* the answer: interface address to use */ + struct nhop_object *rt_nhop; /* nexthop data */ int rt_flags; /* up/down?, host/net */ int rt_refcnt; /* # held references */ u_int rt_fibnum; /* which FIB */ @@ -215,9 +217,13 @@ #define NHF_HOST 0x0400 /* RTF_HOST */ /* Nexthop request flags */ +#define NHR_NONE 0x00 /* empty flags field */ #define NHR_IFAIF 0x01 /* Return ifa_ifp interface */ #define NHR_REF 0x02 /* For future use */ +/* uRPF */ +#define NHR_NODEFAULT 0x04 /* do not consider default route */ + /* Control plane route request flags */ #define NHR_COPY 0x100 /* Copy rte data */ @@ -245,6 +251,8 @@ uint64_t rts_newgateway; /* routes modified by redirects */ uint64_t rts_unreach; /* lookups which failed */ uint64_t rts_wildcard; /* lookups satisfied by a wildcard */ + uint64_t rts_nh_idx_alloc_failure; /* nexthop index alloc failure*/ + uint64_t rts_nh_alloc_failure; /* nexthop allocation failure*/ }; /* @@ -507,6 +515,8 @@ struct sockaddr *gateway, struct sockaddr *author, struct ifnet *ifp, int flags, int expire_sec); +/* New API */ +void rib_walk(int af, u_int fibnum, rt_walktree_f_t *wa_f, void *arg); #endif #endif Index: sys/net/route.c =================================================================== --- sys/net/route.c +++ sys/net/route.c @@ -62,6 +62,8 @@ #include #include #include +#include +#include #include #ifdef RADIX_MPATH @@ -108,10 +110,7 @@ SYSCTL_UINT(_net, OID_AUTO, add_addr_allfibs, CTLFLAG_RWTUN | CTLFLAG_VNET, &VNET_NAME(rt_add_addr_allfibs), 0, ""); -VNET_PCPUSTAT_DEFINE_STATIC(struct rtstat, rtstat); -#define RTSTAT_ADD(name, val) \ - VNET_PCPUSTAT_ADD(struct rtstat, rtstat, name, (val)) -#define RTSTAT_INC(name) RTSTAT_ADD(name, 1) +VNET_PCPUSTAT_DEFINE(struct rtstat, rtstat); VNET_PCPUSTAT_SYSINIT(rtstat); #ifdef VIMAGE @@ -142,7 +141,6 @@ EVENTHANDLER_LIST_DEFINE(rt_addrmsg); -static int rt_getifa_fib(struct rt_addrinfo *, u_int); static int rtrequest1_fib_change(struct rib_head *, struct rt_addrinfo *, struct rtentry **, u_int); static void rt_setmetrics(const struct rt_addrinfo *, struct rtentry *); @@ -235,6 +233,7 @@ rt_numfibs = RT_MAXFIBS; if (rt_numfibs == 0) rt_numfibs = 1; + nhops_init(); } SYSINIT(route_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, route_init, NULL); @@ -372,6 +371,8 @@ /* Init locks */ RIB_LOCK_INIT(rh); + nhops_init_rib(rh); + /* Finally, set base callbacks */ rh->rnh_addaddr = rn_addroute; rh->rnh_deladdr = rn_delete; @@ -403,6 +404,8 @@ rn_walktree(&rh->rmhead.head, rt_freeentry, &rh->rmhead.head); + nhops_destroy_rib(rh); + /* Assume table is already empty */ RIB_LOCK_DESTROY(rh); free(rh, M_RTABLE); @@ -581,6 +584,9 @@ */ R_Free(rt_key(rt)); + /* Unreference nexthop */ + nhop_free_object(rt->rt_nhop); + /* * and the rtentry itself of course */ @@ -1395,6 +1401,7 @@ RIB_WLOCK(rnh); rnh->rnh_walktree(&rnh->head, if_updatemtu_cb, &ifmtu); RIB_WUNLOCK(rnh); + nhops_update_ifmtu(rnh, ifp, ifmtu.mtu); } } } @@ -1538,7 +1545,9 @@ struct rib_head *rnh; struct ifaddr *ifa; struct sockaddr *ndst; + struct nhop_object *nh; struct sockaddr_storage mdst; + struct epoch_tracker et; KASSERT((fibnum < rt_numfibs), ("rtrequest1_fib: bad fibnum")); KASSERT((flags & RTF_RNH_LOCKED) == 0, ("rtrequest1_fib: locked")); @@ -1614,11 +1623,22 @@ } else { ifa_ref(info->rti_ifa); } + + NET_EPOCH_ENTER(et); + nh = nhop_create_from_info_wrapper(rnh, info); + NET_EPOCH_EXIT(et); + if (nh == NULL) { + ifa_free(info->rti_ifa); + return (ENOBUFS); + } + rt = uma_zalloc(V_rtzone, M_NOWAIT); if (rt == NULL) { ifa_free(info->rti_ifa); + nhop_free_object(nh); return (ENOBUFS); } + rt->rt_nhop = nh; rt->rt_flags = RTF_UP | flags; rt->rt_fibnum = fibnum; /* @@ -1626,6 +1646,7 @@ */ if ((error = rt_setgate(rt, dst, gateway)) != 0) { ifa_free(info->rti_ifa); + nhop_free_object(nh); uma_zfree(V_rtzone, rt); return (error); } @@ -1664,6 +1685,7 @@ RIB_WUNLOCK(rnh); ifa_free(rt->rt_ifa); + nhop_free_object(nh); R_Free(rt_key(rt)); uma_zfree(V_rtzone, rt); return (EEXIST); @@ -1705,6 +1727,7 @@ */ if (rn == NULL) { ifa_free(rt->rt_ifa); + nhop_free_object(nh); R_Free(rt_key(rt)); uma_zfree(V_rtzone, rt); return (EEXIST); @@ -1734,9 +1757,11 @@ RT_UNLOCK(rt); break; case RTM_CHANGE: + NET_EPOCH_ENTER(et); RIB_WLOCK(rnh); error = rtrequest1_fib_change(rnh, info, ret_nrt, fibnum); RIB_WUNLOCK(rnh); + NET_EPOCH_EXIT(et); break; default: error = EOPNOTSUPP; @@ -1760,6 +1785,7 @@ int error = 0; int free_ifa = 0; int family, mtu; + struct nhop_object *nh; struct if_mtuinfo ifmtu; RIB_WLOCK_ASSERT(rnh); @@ -1784,6 +1810,12 @@ RT_LOCK(rt); + nh = nhop_create_from_nhop_wrapper(rnh, rt->rt_nhop, info); + if (nh == NULL) { + RT_UNLOCK(rt); + return (ENOBUFS); + } + rt_setmetrics(info, rt); /* @@ -1855,6 +1887,11 @@ } } + /* Update nexthop */ + nhop_free_object(rt->rt_nhop); + rt->rt_nhop = nh; + nh = NULL; + /* * This route change may have modified the route's gateway. In that * case, any inpcbs that have cached this route need to invalidate their @@ -1868,6 +1905,8 @@ } bad: RT_UNLOCK(rt); + if (nh != NULL) + nhop_free_object(nh); if (free_ifa != 0) { ifa_free(info->rti_ifa); info->rti_ifa = NULL; Index: sys/net/route/nhop.h =================================================================== --- /dev/null +++ sys/net/route/nhop.h @@ -0,0 +1,236 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2020 Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * This header file contains public definitions for the nexthop routing subsystem. + */ + +#ifndef _NET_ROUTE_NHOP_H_ +#define _NET_ROUTE_NHOP_H_ + +#include /* sockaddr_in && sockaddr_in6 */ + +#include + +enum nhop_type { + NH_TYPE_IPV4_ETHER_RSLV = 1, /* IPv4 ethernet without GW */ + NH_TYPE_IPV4_ETHER_NHOP = 2, /* IPv4 with pre-calculated ethernet encap */ + NH_TYPE_IPV6_ETHER_RSLV = 3, /* IPv6 ethernet, without GW */ + NH_TYPE_IPV6_ETHER_NHOP = 4 /* IPv6 with pre-calculated ethernet encap*/ +}; + +#ifdef _KERNEL + +/* + * Currently the only use case of AF_LINK gateway is storing + * interface index of the interface of the source IPv6 address. + * This is used by the IPv6 code for the connections over loopback + * interface. + * + * The structure below copies 'struct sockaddr_dl', reducing the + * size of sdl_data buffer, as it is not used. This change + * allows to store the AF_LINK gateways in the nhop gateway itself, + * simplifying control plane handling. + */ +struct sockaddr_dl_short { + u_char sdl_len; /* Total length of sockaddr */ + u_char sdl_family; /* AF_LINK */ + u_short sdl_index; /* if != 0, system given index for interface */ + u_char sdl_type; /* interface type */ + u_char sdl_nlen; /* interface name length, no trailing 0 reqd. */ + u_char sdl_alen; /* link level address length */ + u_char sdl_slen; /* link layer selector length */ + char sdl_data[8]; /* unused */ +}; + +#define NHOP_RELATED_FLAGS \ + (RTF_GATEWAY | RTF_HOST | RTF_REJECT | RTF_BLACKHOLE | \ + RTF_FIXEDMTU | RTF_LOCAL | RTF_BROADCAST | RTF_MULTICAST) + +struct nhop_request { + struct ifnet *ifp; + struct ifaddr *ifa; + struct sockaddr *gw; + int family; + int mtu; + int rt_flags; /* gets converted to nh_flags later */ + uint16_t nh_type; + uint16_t nh_flags_additional; /* Additional flags to set to the nh_flags */ +}; + +struct nh_control; +struct nhop_priv; + +/* + * Struct 'nhop_object' field description: + * + * nh_flags: NHF_ flags used in the dataplane code. NHF_GATEWAY or NHF_BLACKHOLE + * can be examples of such flags. + * nh_mtu: ready-to-use nexthop mtu. Already accounts for the link-level header, + * interface MTU and protocol-specific limitations. + * nh_prepend_len: link-level prepend length. Currently unused. + * nh_ifp: logical transmit interface. The one from which if_transmit() will be + * called. Guaranteed to be non-NULL. + * nh_aifp: ifnet of the source address. Same as nh_ifp except IPv6 loopback + * routes. See the example below. + * nh_ifa: interface address to use. Guaranteed to be non-NULL. + * nh_pksent: counter(9) reflecting the number of packets transmitted. + * + * gw_: storage suitable to hold AF_INET, AF_INET6 or AF_LINK gateway. More + * details ara available in the examples below. + * + * + * Direct routes (routes w/o gateway): + * NHF_GATEWAY is NOT set. + * nh_ifp denotes the logical transmit interface (). + * nh_aifp is the same as nh_ifp + * gw_sa contains AF_LINK sa with nh_aifp ifindex (compat) + * Loopback routes: + * NHF_GATEWAY is NOT set. + * nh_ifp points to the loopback interface (lo0). + * nh_aifp points to the interface where the destination address belongs to. + * This is useful in IPv6 link-local-over-loopback communications. + * gw_sa contains AF_LINK sa with nh_aifp ifindex (compat) + * GW routes: + * NHF_GATEWAY is set. + * nh_ifp denotes the logical transmit interface. + * nh_aifp is the same as nh_ifp + * gw_sa contains L3 address (either AF_INET or AF_INET6). + * + * + * Note: struct nhop_object fields are ordered in a way that + * supports memcmp-based comparisons. + * + */ +#define NHOP_END_CMP (__offsetof(struct nhop_object, nh_pksent)) + +struct nhop_object { + uint16_t nh_flags; /* nhop flags */ + uint16_t nh_mtu; /* nexthop mtu */ + union { + struct sockaddr_in gw4_sa; /* GW accessor as IPv4 */ + struct sockaddr_in6 gw6_sa; /* GW accessor as IPv6 */ + struct sockaddr gw_sa; + struct sockaddr_dl_short gwl_sa; /* AF_LINK gw (compat) */ + char gw_buf[28]; + }; + struct ifnet *nh_ifp; /* Logical egress interface. Always != NULL */ + struct ifaddr *nh_ifa; /* interface address to use. Always != NULL */ + struct ifnet *nh_aifp; /* ifnet of the source address. Always != NULL */ + counter_u64_t nh_pksent; /* packets sent using this nhop */ + /* 32 bytes + 4xPTR == 64(amd64) / 48(i386) */ + uint8_t nh_prepend_len; /* length of prepend data */ + uint8_t spare[3]; + uint32_t spare1; /* alignment */ + char nh_prepend[48]; /* L2 prepend */ + struct nhop_priv *nh_priv; /* control plane data */ + /* -- 128 bytes -- */ +}; + +/* + * Nhop validness. + * + * Currently we verify whether link is up or not on every packet, which can be + * quite costy. + * TODO: subscribe for the interface notifications and update the nexthops + * with NHF_INVALID flag. + */ + +//#define NH_IS_VALID(_nh) (((_nh)->nh_flags & NHF_INVALID) == 0) +#define NH_IS_VALID(_nh) RT_LINK_IS_UP((_nh)->nh_ifp) +#define NH_IS_MULTIPATH(_nh) ((_nh)->nh_flags & NHF_MULTIPATH) + +#define RT_GATEWAY(_rt) ((struct sockaddr *)&(_rt)->rt_nhop->gw4_sa) +#define RT_GATEWAY_CONST(_rt) ((const struct sockaddr *)&(_rt)->rt_nhop->gw4_sa) + +#define NH_FREE(_nh) do { \ + nhop_free_object(_nh); \ + /* guard against invalid refs */ \ + _nh = NULL; \ +} while (0) + + +void nhop_free_object(struct nhop_object *nh); + +struct sysctl_req; +struct sockaddr_dl; +struct rib_head; + +uint32_t nhop_get_idx(const struct nhop_object *nh); +void nhop_free(struct nhop_object *nh); + +int nhops_dump_sysctl(struct rib_head *rh, struct sysctl_req *w); + +#endif + +/* Kernel <> userland structures */ + +struct nhop_external { + uint32_t nh_len; /* length of the datastructure */ + uint32_t nh_idx; /* Nexthop index */ + uint32_t nh_fib; /* Fib nexhop is attached to */ + uint32_t ifindex; /* transmit interface ifindex */ + uint32_t aifindex; /* address ifindex */ + uint8_t prepend_len; /* length of the prepend */ + uint8_t nh_family; /* address family */ + uint16_t nh_type; /* nexthop type */ + uint16_t nh_mtu; /* nexthop mtu */ + + uint16_t nh_flags; /* nhop flags */ + struct in_addr nh_addr; /* GW/DST IPv4 address */ + struct in_addr nh_src; /* default source IPv4 address */ + uint64_t nh_pksent; + /* control plane */ + /* lookup key: address, family, type */ + char nh_prepend[64]; /* L2 prepend */ + uint64_t nh_refcount; /* number of references */ +}; + +struct nhop_addrs { + uint32_t na_len; /* length of the datastructure */ + uint16_t gw_sa_off; /* offset of gateway SA */ + uint16_t src_sa_off; /* offset of src address SA */ +}; + +struct mpath_nhop_external { + uint32_t nh_idx; + uint32_t nh_weight; +}; + +struct mpath_external { + uint32_t mp_idx; + uint32_t mp_refcount; + uint32_t mp_nh_count; + uint32_t mp_group_size; +}; + + +#endif + + Index: sys/net/route/nhop.c =================================================================== --- /dev/null +++ sys/net/route/nhop.c @@ -0,0 +1,349 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2020 Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +#include "opt_inet.h" +#include "opt_route.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * This file contains data structures management logic for the nexthop ("nhop") + * route subsystem. + * + * Nexthops in the original sense are the objects containing all the necessary + * information to forward the packet to the selected destination. + * In particular, nexthop is defined by a combination of + * ifp, ifa, aifp, mtu, gw addr(if set), nh_type, nh_family, mask of rt_flags and + * NHF_DEFAULT + * + * All nexthops are stored in the resizable hash table. + * Additionally, each nexthop gets assigned its unique index (nexthop index) + * so userland programs can interact with the nexthops easier. Index allocation + * is backed by the bitmask array. + */ + +static MALLOC_DEFINE(M_NHOP, "nhops", "nexthops data"); + + +/* Hash management functions */ + +int +nhops_init_rib(struct rib_head *rh) +{ + struct nh_control *ctl; + size_t alloc_size; + uint32_t num_buckets, num_items; + void *ptr; + + ctl = malloc(sizeof(struct nh_control), M_NHOP, M_WAITOK | M_ZERO); + + /* + * Allocate nexthop hash. Start with 16 items by default (128 bytes). + * This will be enough for most of the cases. + */ + num_buckets = 16; + alloc_size = CHT_SLIST_GET_RESIZE_SIZE(num_buckets); + ptr = malloc(alloc_size, M_NHOP, M_WAITOK | M_ZERO); + CHT_SLIST_INIT(&ctl->nh_head, ptr, num_buckets); + + /* + * Allocate nexthop index bitmask. + */ + num_items = 128 * 8; /* 128 bytes */ + ptr = malloc(bitmask_get_size(num_items), M_NHOP, M_WAITOK | M_ZERO); + bitmask_init(&ctl->nh_idx_head, ptr, num_items); + + NHOPS_LOCK_INIT(ctl); + + rh->nh_control = ctl; + ctl->rh = rh; + + DPRINTF("NHOPS init for fib %u af %u: ctl %p rh %p", rh->rib_fibnum, + rh->rib_family, ctl, rh); + + return (0); +} + +void +nhops_destroy_rib(struct rib_head *rh) +{ + struct nh_control *ctl; + struct nhop_priv *nh_priv; + + ctl = rh->nh_control; + + /* + * All routes should have been deleted in rt_table_destroy(). + * However, TCP stack or other consumers may store referenced + * nexthop pointers. When these references go to zero, + * nhop_free_object() will try to unlink these records from the + * datastructures, most likely leading to panic. + * + * Avoid that by explicitly marking all of the remaining + * nexthops as unlinked. + */ + + NHOPS_RLOCK(ctl); + CHT_SLIST_FOREACH(&ctl->nh_head, nhops, nh_priv) { + DPRINTF("Unlinking referenced nhop %u", nh_priv->nh_idx); + NH_PRIV_LOCK(nh_priv); + nh_priv->nh_control = NULL; + nh_priv->nh_idx = 0; + NH_PRIV_UNLOCK(nh_priv); + } CHT_SLIST_FOREACH_END; + NHOPS_RUNLOCK(ctl); + + free(ctl->nh_head.ptr, M_NHOP); + free(ctl->nh_idx_head.idx, M_NHOP); + free(ctl, M_NHOP); +} + +/* + * Nexthops distribution: + * + * 2 "mandatory" nexthops per interface ("interface route", "loopback"). + * For direct peering: 1 nexthop for the peering router per ifp/af. + * For Ix-like peering: tens to hundreds nexthops of neghbors per ifp/af. + * IGP control plane & broadcast segment: tens of nexthops per ifp/af. + * + * With that in mind, hash nexthops by the combination of the interface + * and GW IP address. + */ +struct _hash_data { + uint16_t ifindex; + uint8_t family; + uint8_t nh_type; + uint32_t gw_addr; +}; + +static uint32_t +hash_priv(const struct nhop_priv *priv) +{ + struct nhop_object *nh; + uint16_t ifindex; + struct _hash_data key; + + nh = priv->nh; + ifindex = nh->nh_ifp->if_index & 0xFFFF; + memset(&key, 0, sizeof(key)); + + key.ifindex = ifindex; + key.family = nh->gw_sa.sa_family; + key.nh_type = priv->nh_type & 0xFF; + if (nh->gw_sa.sa_family == AF_INET6) + memcpy(&key.gw_addr, &nh->gw6_sa.sin6_addr.s6_addr32[3], 4); + else if (nh->gw_sa.sa_family == AF_INET) + memcpy(&key.gw_addr, &nh->gw4_sa.sin_addr, 4); + + return (uint32_t)(djb_hash((const unsigned char *)&key, sizeof(key))); +} + +/* + * Checks if hash needs resizing and performs this resize if necessary + * + */ +static void +consider_resize(struct nh_control *ctl, uint32_t new_nh_buckets, uint32_t new_idx_items) +{ + void *nh_ptr, *nh_idx_ptr; + void *old_idx_ptr; + size_t alloc_size; + + nh_ptr = NULL; + if (new_nh_buckets != 0) { + alloc_size = CHT_SLIST_GET_RESIZE_SIZE(new_nh_buckets); + nh_ptr = malloc(alloc_size, M_NHOP, M_NOWAIT | M_ZERO); + } + + nh_idx_ptr = NULL; + if (new_idx_items != 0) { + alloc_size = bitmask_get_size(new_idx_items); + nh_idx_ptr = malloc(alloc_size, M_NHOP, M_NOWAIT | M_ZERO); + } + + if (nh_ptr == NULL && nh_idx_ptr == NULL) { + /* Either resize is not required or allocations have failed. */ + return; + } + + DPRINTF("going to resize: nh:[ptr:%p sz:%u] idx:[ptr:%p sz:%u]", nh_ptr, + new_nh_buckets, nh_idx_ptr, new_idx_items); + + old_idx_ptr = NULL; + + NHOPS_WLOCK(ctl); + if (nh_ptr != NULL) { + CHT_SLIST_RESIZE(&ctl->nh_head, nhops, nh_ptr, new_nh_buckets); + } + if (nh_idx_ptr != NULL) { + if (bitmask_copy(&ctl->nh_idx_head, nh_idx_ptr, new_idx_items)) + bitmask_swap(&ctl->nh_idx_head, nh_idx_ptr, new_idx_items, &old_idx_ptr); + } + NHOPS_WUNLOCK(ctl); + + if (nh_ptr != NULL) + free(nh_ptr, M_NHOP); + if (old_idx_ptr != NULL) + free(old_idx_ptr, M_NHOP); +} + +/* + * Links nextop @nh_priv to the nexhop hash table and allocates + * nexhop index. + * Returns allocated index or 0 on failure. + */ +int +link_nhop(struct nh_control *ctl, struct nhop_priv *nh_priv) +{ + uint16_t idx; + uint32_t new_num_buckets, new_num_items; + + KASSERT((nh_priv->nh_idx == 0), ("nhop index is already allocated")); + NHOPS_WLOCK(ctl); + + /* + * Check if we need to resize hash and index. + * The following 2 functions returns either new size or 0 + * if resize is not required. + */ + new_num_buckets = CHT_SLIST_GET_RESIZE_BUCKETS(&ctl->nh_head); + new_num_items = bitmask_get_resize_items(&ctl->nh_idx_head); + + if (bitmask_alloc_idx(&ctl->nh_idx_head, &idx) != 0) { + NHOPS_WUNLOCK(ctl); + DPRINTF("Unable to allocate nhop index"); + RTSTAT_INC(rts_nh_idx_alloc_failure); + consider_resize(ctl, new_num_buckets, new_num_items); + return (0); + } + + nh_priv->nh_idx = idx; + nh_priv->nh_control = ctl; + CHT_SLIST_INSERT_HEAD(&ctl->nh_head, nhops, nh_priv); + + NHOPS_WUNLOCK(ctl); + + DPRINTF("Linked nhop priv %p to %d, hash %u, ctl %p", nh_priv, idx, + hash_priv(nh_priv), ctl); + consider_resize(ctl, new_num_buckets, new_num_items); + + return (idx); +} + +/* + * Unlinks nexthop specified by @nh_priv data from the hash. + * + * Returns found nexthop or NULL. + */ +struct nhop_priv * +unlink_nhop(struct nh_control *ctl, struct nhop_priv *nh_priv_del) +{ + struct nhop_priv *priv_ret; + int idx; + uint32_t new_num_buckets, new_num_items; + + idx = 0; + + NHOPS_WLOCK(ctl); + CHT_SLIST_REMOVE_BYOBJ(&ctl->nh_head, nhops, nh_priv_del, priv_ret); + + if (priv_ret != NULL) { + NH_PRIV_LOCK(priv_ret); + idx = priv_ret->nh_idx; + priv_ret->nh_idx = 0; + priv_ret->nh_control = NULL; + NH_PRIV_UNLOCK(priv_ret); + + KASSERT((idx != 0), ("bogus nhop index 0")); + if ((bitmask_free_idx(&ctl->nh_idx_head, idx)) != 0) { + DPRINTF("Unable to remove index %d from fib %u af %d", + idx, ctl->rh->rib_fibnum, ctl->rh->rib_family); + } + } + + /* Check if hash or index needs to be resized */ + new_num_buckets = CHT_SLIST_GET_RESIZE_BUCKETS(&ctl->nh_head); + new_num_items = bitmask_get_resize_items(&ctl->nh_idx_head); + + NHOPS_WUNLOCK(ctl); + + if (priv_ret == NULL) + DPRINTF("Unable to unlink nhop priv %p from hash, hash %u ctl %p", + nh_priv_del, hash_priv(nh_priv_del), ctl); + else + DPRINTF("Unlinked nhop %p priv idx %d", priv_ret, idx); + + consider_resize(ctl, new_num_buckets, new_num_items); + + return (priv_ret); +} + +/* + * Searches for the nexthop by data specifcied in @nh_priv. + * Returns referenced nexthop or NULL. + */ +__noinline struct nhop_priv * +find_nhop(struct nh_control *ctl, struct nhop_priv *nh_priv) +{ + struct nhop_priv *nh_priv_ret; + + //DPRINTF("--- start search ---"); + NHOPS_RLOCK(ctl); + CHT_SLIST_FIND_BYOBJ(&ctl->nh_head, nhops, nh_priv, nh_priv_ret); + if (nh_priv_ret != NULL) { + if (refcount_acquire_if_not_zero(&nh_priv_ret->nh_refcnt) == 0){ + /* refcount was 0 -> nhop is being deleted */ + nh_priv_ret = NULL; + } + } + NHOPS_RUNLOCK(ctl); + + //if (nh_priv_ret == NULL) + // DPRINTF("--- end search (not found) ---"); + return (nh_priv_ret); +} + Index: sys/net/route/nhop_ctl.c =================================================================== --- /dev/null +++ sys/net/route/nhop_ctl.c @@ -0,0 +1,621 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2020 Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +#include "opt_inet.h" +#include "opt_route.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * This file contains core functionality for the nexthop ("nhop") route subsystem. + * The business logic needed to create nexhop objects is implemented here. + * + * Nexthops in the original sense are the objects containing all the necessary + * information to forward the packet to the selected destination. + * In particular, nexthop is defined by a combination of + * ifp, ifa, aifp, mtu, gw addr(if set), nh_type, nh_family, mask of rt_flags and + * NHF_DEFAULT + * + * Additionally, each nexthop gets assigned its unique index (nexthop index). + * It serves two purposes: first one is to ease the ability of userland programs to + * reference nexthops by their index. The second one allows lookup algorithms to + * to store index instead of pointer (2 bytes vs 8) as a lookup result. + * All nexthops are stored in the resizable hash table. + * + * Basically, this file revolves around supporting 2 functions: + * 1) fill_nhop(), which contains all business logic on filling the nexthop fields + * based on the provided request + * 2) nhop_get(), which gets a nexthop based on the provided request. + * + * Conventions: + * 1) non-exported functions start with verb + * 2) exported function starts with the subsystem prefix: "nhop" + * + */ + +static int fill_nhop(const struct nhop_request *req, struct nhop_object *nh); +static int dump_nhop_entry(struct rib_head *rh, struct nhop_object *nh, struct sysctl_req *w); + +static struct nhop_priv *alloc_nhop_partial(const struct nhop_request *req); +static int finalize_nhop(struct nhop_priv *nh_priv, const struct nhop_request *req); +static struct ifnet *get_aifp(const struct nhop_request *req, int reference); +static void fill_sdl_from_ifp(struct sockaddr_dl_short *sdl, const struct ifnet *ifp); + +static void destroy_nhop_epoch(epoch_context_t ctx); +static void destroy_nhop(struct nhop_priv *nh_priv); + +static void print_nhop(const char *prefix, const struct nhop_object *nh); + +_Static_assert(__offsetof(struct nhop_object, nh_ifp) == 32, + "nhop_object: wrong nh_ifp offset"); +_Static_assert(sizeof(struct nhop_object) <= 128, + "nhop_object: size exceeds 128 bytes"); + +static uma_zone_t nhops_zone; /* Global zone for each and every nexthop */ + + +#define NHOP_OBJECT_ALIGNED_SIZE roundup2(sizeof(struct nhop_object), \ + 2 * CACHE_LINE_SIZE) +#define NHOP_PRIV_ALIGNED_SIZE roundup2(sizeof(struct nhop_priv), \ + 2 * CACHE_LINE_SIZE) +void +nhops_init(void) +{ + + nhops_zone = uma_zcreate("routing nhops", + NHOP_OBJECT_ALIGNED_SIZE + NHOP_PRIV_ALIGNED_SIZE, + NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); +} + +/* + * Fetches the interface of source address used by the route. + * In all cases except interface-address-route it would be the + * same as the transmit interfaces. + * However, for the interface address this function will return + * this interface ifp instead of loopback. This is needed to support + * link-local IPv6 loopback communications. + * + * If @reference is non-zero, found ifp is referenced. + * + * Returns found ifp. + */ +static struct ifnet * +get_aifp(const struct nhop_request *req, int reference) +{ + struct ifnet *aifp = NULL; + struct sockaddr_dl *sdl; + struct epoch_tracker et; + + /* + * Adjust the "outgoing" interface. If we're going to loop + * the packet back to ourselves, the ifp would be the loopback + * interface. However, we'd rather know the interface associated + * to the destination address (which should probably be one of + * our own addresses.) + */ + if ((req->ifp->if_flags & IFF_LOOPBACK) && + req->gw->sa_family == AF_LINK) { + sdl = (struct sockaddr_dl *)req->gw; + NET_EPOCH_ENTER(et); + if (reference) + aifp = ifnet_byindex_ref(sdl->sdl_index); + else + aifp = ifnet_byindex(sdl->sdl_index); + NET_EPOCH_EXIT(et); + if (aifp == NULL) { + DPRINTF("unable to get aifp for %s index %d", + if_name(req->ifp), sdl->sdl_index); + } + } + + if (aifp == NULL) { + aifp = req->ifp; + if (reference) + if_ref(aifp); + } + + return (aifp); +} + +int +cmp_priv(const struct nhop_priv *_one, const struct nhop_priv *_two) +{ + + if (memcmp(_one->nh, _two->nh, NHOP_END_CMP) != 0) + return (0); + + if ((_one->nh_type != _two->nh_type) || + (_one->nh_family != _two->nh_family)) + return (0); + + return (1); +} + +/* + * Finds or creates new nhop_object based on @req. + * Returns referenced and linked nhop_object or NULL. + */ +__noinline struct nhop_object * +nhop_get(struct rib_head *rh, const struct nhop_request *req) +{ + struct nh_control *ctl = rh->nh_control; + struct nhop_priv *nh_priv, *nh_tmp_priv; + + /* + * In the cases with large amount of routes, most routes will + * share nexthops, making alloc-to-lookup ratio low. + * + * With that in mind, split nexthop allocation in two stages: + * first does the bare minimum to support the lookup for the + * existing nexthop. + * the second does the heavy-lifting with counters allocations + * and external objects refcounting. + */ + nh_tmp_priv = alloc_nhop_partial(req); + if (nh_tmp_priv == NULL) { + RTSTAT_INC(rts_nh_alloc_failure); + DPRINTF("nh_alloc failed"); + return (NULL); + } + + nh_priv = find_nhop(ctl, nh_tmp_priv); + if (nh_priv != NULL) { + uma_zfree(nhops_zone, nh_tmp_priv->nh); + return (nh_priv->nh); + } + + nh_priv = nh_tmp_priv; + + if (finalize_nhop(nh_priv, req) != 0) { + RTSTAT_INC(rts_nh_alloc_failure); + DPRINTF("nh_alloc_finalize failed"); + return (NULL); + } + if (link_nhop(ctl, nh_priv) == 0) { + + /* + * Adding nexthop to the datastructures + * failed. Call destructor w/o waiting for + * the epoch end, as nexthop is not used + * and return. + */ + DPRINTF("link_nhop failed!"); + destroy_nhop(nh_priv); + + return (NULL); + } + + return (nh_priv->nh); +} + +/* + * Fills in shorted link-level sockadd version suitable to be stored inside the + * nexthop gateway buffer. + */ +static void +fill_sdl_from_ifp(struct sockaddr_dl_short *sdl, const struct ifnet *ifp) +{ + + sdl->sdl_family = AF_LINK; + sdl->sdl_len = sizeof(struct sockaddr_dl_short); + sdl->sdl_index = ifp->if_index; + sdl->sdl_type = ifp->if_type; +} + +/* + * Allocates nexthops and fills in the minimum amount of + * data to perform hash lookup. + * No external objects are referenced. Packet counters + * not allocated. + * + * Returns nh_priv pointer or NULL. + */ +static struct nhop_priv * +alloc_nhop_partial(const struct nhop_request *req) +{ + struct nhop_object *nh; + struct nhop_priv *nh_priv; + + KASSERT((req->mtu > 0), ("nh requested mtu is zero")); + + nh = (struct nhop_object *)uma_zalloc(nhops_zone, M_NOWAIT | M_ZERO); + if (nh == NULL) + return (NULL); + nh_priv = (struct nhop_priv *)((char *)nh + NHOP_OBJECT_ALIGNED_SIZE); + + nh->nh_priv = nh_priv; + nh_priv->nh = nh; + + if (fill_nhop(req, nh) != 0) { + uma_zfree(nhops_zone, nh); + return (NULL); + } + + /* calculate aifp, but don't reference it */ + nh->nh_aifp = get_aifp(req, 0); + + return (nh_priv); +} + +/* + * Finalizes nexthop data to make nhop suitable for linking into the + * hash table. + * Returns 0 if successful, + * errno otherwise. @nh_priv is freed in case of error. + */ +static int +finalize_nhop(struct nhop_priv *nh_priv, const struct nhop_request *req) +{ + struct nhop_object *nh; + + nh = nh_priv->nh; + + /* Allocate per-cpu packet counter */ + nh->nh_pksent = counter_u64_alloc(M_NOWAIT); + if (nh->nh_pksent == NULL) { + uma_zfree(nhops_zone, nh); + return (ENOMEM); + } + + /* Reference external objects and calculate (referenced) ifa */ + if_ref(nh->nh_ifp); + ifa_ref(nh->nh_ifa); + nh->nh_aifp = get_aifp(req, 1); + DPRINTF("AIFP: %p req->ifp %p nh_ifp %p", nh->nh_aifp, req->ifp, nh->nh_ifp); + + NH_PRIV_LOCK_INIT(nh_priv); + refcount_init(&nh_priv->nh_refcnt, 1); + + print_nhop("FINALIZE", nh); + + return (0); +} + +static void +print_nhop_sa(char *buf, size_t buflen, const struct sockaddr *sa) +{ + + if (sa->sa_family == AF_INET) { + const struct sockaddr_in *sin4; + sin4 = (const struct sockaddr_in *)sa; + inet_ntop(AF_INET, &sin4->sin_addr, buf, buflen); + } else if (sa->sa_family == AF_INET6) { + const struct sockaddr_in6 *sin6; + sin6 = (const struct sockaddr_in6 *)sa; + inet_ntop(AF_INET6, &sin6->sin6_addr, buf, buflen); + } else if (sa->sa_family == AF_LINK) { + const struct sockaddr_dl *sdl; + sdl = (const struct sockaddr_dl *)sa; + snprintf(buf, buflen, "if#%d", sdl->sdl_index); + } else + snprintf(buf, buflen, "af:%d", sa->sa_family); +} + +static void +print_nhop(const char *prefix, const struct nhop_object *nh) +{ + char src_buf[INET6_ADDRSTRLEN], addr_buf[INET6_ADDRSTRLEN]; + + print_nhop_sa(src_buf, sizeof(src_buf), nh->nh_ifa->ifa_addr); + print_nhop_sa(addr_buf, sizeof(addr_buf), &nh->gw_sa); + + DPRINTF("%s nhop priv %p: AF %d ifp %p %s addr %s src %p %s aifp %p %s mtu %d nh_flags %X", + prefix, nh->nh_priv, af, nh->nh_ifp, if_name(nh->nh_ifp), addr_buf, + nh->nh_ifa, src_buf, nh->nh_aifp, if_name(nh->nh_aifp), nh->nh_mtu, + nh->nh_flags); +} + +static void +destroy_nhop(struct nhop_priv *nh_priv) +{ + struct nhop_object *nh; + + nh = nh_priv->nh; + + NH_PRIV_LOCK(nh_priv); + print_nhop("DEL", nh); + NH_PRIV_UNLOCK(nh_priv); + + if_rele(nh->nh_ifp); + if_rele(nh->nh_aifp); + ifa_free(nh->nh_ifa); + counter_u64_free(nh->nh_pksent); + + uma_zfree(nhops_zone, nh); +} + +/* + * Epoch callback indicating nhop is safe to destroy + */ +static void +destroy_nhop_epoch(epoch_context_t ctx) +{ + struct nhop_priv *nh_priv; + + nh_priv = __containerof(ctx, struct nhop_priv, nh_epoch_ctx); + + destroy_nhop(nh_priv); +} + +/* + * Fills @nh fields with the data supplied in the @req. + * Function does NOT fill in nh_aifp and does not take any reference. + * Returns 0 on success. + */ +static int +fill_nhop(const struct nhop_request *req, struct nhop_object *nh) +{ + int rt_flags; + + rt_flags = req->rt_flags & NHOP_RT_FLAG_MASK; + + nh->nh_ifp = req->ifp; + nh->nh_mtu = req->mtu; + nh->nh_flags = fib_rte_to_nh_flags(rt_flags); + nh->nh_flags |= (req->nh_flags_additional & NHF_DEFAULT); + nh->nh_priv->rt_flags = rt_flags; + nh->nh_ifa = req->ifa; + + if (req->rt_flags & RTF_GATEWAY) { + if (req->gw->sa_len > sizeof(struct sockaddr_in6)) { + DPRINTF("nhop SA size too big: AF %d len %u", + req->gw->sa_family, req->gw->sa_len); + return (ENOMEM); + } + memcpy(&nh->gw_sa, req->gw, req->gw->sa_len); + } else { + /* + * Interface route. Currently the route.c code adds + * empty sa of type AF_LINK, which is 56 bytes long. + * The only place where this data is used is the IPv6 + * loopback output, where we need to preserve the original + * interface to maintain proper scoping. + * Current code stores original interface in the separate field + * (nh_aifp, see below). Given that, write fake empty SA + * with the request AF. + */ + fill_sdl_from_ifp(&nh->gwl_sa, req->ifp); + } + + nh->nh_priv->nh_family = req->family; + nh->nh_priv->nh_type = req->nh_type; + + return (0); +} + +int +nhop_ref_object(struct nhop_object *nh) +{ + + return (refcount_acquire_if_not_zero(&nh->nh_priv->nh_refcnt)); +} + +void +nhop_free_object(struct nhop_object *nh) +{ + struct nh_control *ctl; + struct nhop_priv *nh_priv = nh->nh_priv; + + if (!refcount_release(&nh_priv->nh_refcnt)) + return; + + NH_PRIV_LOCK(nh_priv); + ctl = nh_priv->nh_control; + /* Use nh_control as an indicator of linked/unlinked entry */ + nh_priv->nh_control = NULL; + NH_PRIV_UNLOCK(nh_priv); + + if (ctl != NULL) { + if (unlink_nhop(ctl, nh_priv) == NULL) { + /* Do not try to reclaim */ + DPRINTF("Failed to find nexhop %p", nh_priv); + return; + } + } + + epoch_call(net_epoch_preempt, destroy_nhop_epoch, + &nh_priv->nh_epoch_ctx); +} + +int +nhop_ref_any(struct nhop_object *nh) +{ + + return (nhop_ref_object(nh)); +} + +void +nhop_free_any(struct nhop_object *nh) +{ + + nhop_free_object(nh); +} + + +/* Helper functions */ + +uint32_t +nhop_get_idx(const struct nhop_object *nh) +{ + + return (nh->nh_priv->nh_idx); +} + +void +nhops_update_ifmtu(struct rib_head *rh, struct ifnet *ifp, uint32_t mtu) +{ + struct nh_control *ctl; + struct nhop_priv *nh_priv; + struct nhop_object *nh; + + ctl = rh->nh_control; + + NHOPS_RLOCK(ctl); + CHT_SLIST_FOREACH(&ctl->nh_head, nhops, nh_priv) { + nh = nh_priv->nh; + if (nh->nh_ifp == ifp) { + if ((nh_priv->rt_flags & RTF_FIXEDMTU) == 0 || + nh->nh_mtu > mtu) { + /* Update */ + NH_PRIV_LOCK(nh_priv); + nh->nh_mtu = mtu; + NH_PRIV_UNLOCK(nh_priv); + } + } + } CHT_SLIST_FOREACH_END; + NHOPS_RUNLOCK(ctl); + +} + +/* + * Dumps a single entry to sysctl buffer. + * + * Layout: + * rt_msghdr - generic RTM header to allow users to skip non-understood messages + * nhop_external - nexhop description structure (with length) + * nhop_addrs - structure encapsulating GW/SRC sockaddrs + */ +static int +dump_nhop_entry(struct rib_head *rh, struct nhop_object *nh, struct sysctl_req *w) +{ + struct { + struct rt_msghdr rtm; + struct nhop_external nhe; + struct nhop_addrs na; + } arpc; + struct nhop_external *pnhe; + struct sockaddr *gw_sa, *src_sa; + struct sockaddr_storage ss; + size_t addrs_len; + int error; + + //DPRINTF("Dumping: head %p nh %p flags %X req %p\n", rh, nh, nh->nh_flags, w); + + memset(&arpc, 0, sizeof(arpc)); + + arpc.rtm.rtm_msglen = sizeof(arpc); + arpc.rtm.rtm_version = RTM_VERSION; + arpc.rtm.rtm_type = RTM_GET; + //arpc.rtm.rtm_flags = RTF_UP; + arpc.rtm.rtm_flags = nh->nh_priv->rt_flags; + + /* nhop_external */ + pnhe = &arpc.nhe; + pnhe->nh_len = sizeof(struct nhop_external); + pnhe->nh_idx = nh->nh_priv->nh_idx; + pnhe->nh_fib = rh->rib_fibnum; + pnhe->ifindex = nh->nh_ifp->if_index; + pnhe->aifindex = nh->nh_aifp->if_index; + pnhe->nh_family = nh->nh_priv->nh_family; + pnhe->nh_type = nh->nh_priv->nh_type; + pnhe->nh_mtu = nh->nh_mtu; + pnhe->nh_flags = nh->nh_flags; + + memcpy(pnhe->nh_prepend, nh->nh_prepend, sizeof(nh->nh_prepend)); + pnhe->prepend_len = nh->nh_prepend_len; + pnhe->nh_refcount = nh->nh_priv->nh_refcnt; + pnhe->nh_pksent = counter_u64_fetch(nh->nh_pksent); + + /* sockaddr container */ + addrs_len = sizeof(struct nhop_addrs); + arpc.na.gw_sa_off = addrs_len; + gw_sa = (struct sockaddr *)&nh->gw4_sa; + addrs_len += gw_sa->sa_len; + + src_sa = nh->nh_ifa->ifa_addr; + if (src_sa->sa_family == AF_LINK) { + /* Shorten structure */ + memset(&ss, 0, sizeof(struct sockaddr_storage)); + fill_sdl_from_ifp((struct sockaddr_dl_short *)&ss, + nh->nh_ifa->ifa_ifp); + src_sa = (struct sockaddr *)&ss; + } + arpc.na.src_sa_off = addrs_len; + addrs_len += src_sa->sa_len; + + /* Write total length */ + arpc.na.na_len = addrs_len; + + arpc.rtm.rtm_msglen += arpc.na.na_len - sizeof(struct nhop_addrs); + + error = SYSCTL_OUT(w, &arpc, sizeof(arpc)); + if (error == 0) + error = SYSCTL_OUT(w, gw_sa, gw_sa->sa_len); + if (error == 0) + error = SYSCTL_OUT(w, src_sa, src_sa->sa_len); + + /* + DPRINTF("Exported %d ifindex %d family %d type %d error %d\n", nh->nh_priv->nh_idx, pnhe->ifindex, + pnhe->nh_family, pnhe->nh_type, error); + */ + + return (error); +} + +int +nhops_dump_sysctl(struct rib_head *rh, struct sysctl_req *w) +{ + struct nh_control *ctl; + struct nhop_priv *nh_priv; + int error; + + ctl = rh->nh_control; + + NHOPS_RLOCK(ctl); + DPRINTF("NHDUMP: count=%u", ctl->nh_head.items_count); + CHT_SLIST_FOREACH(&ctl->nh_head, nhops, nh_priv) { + error = dump_nhop_entry(rh, nh_priv->nh, w); + if (error != 0) + return (error); + } CHT_SLIST_FOREACH_END; + NHOPS_RUNLOCK(ctl); + + return (0); +} + Index: sys/net/route/nhop_utils.h =================================================================== --- /dev/null +++ sys/net/route/nhop_utils.h @@ -0,0 +1,200 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2020 Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _NET_ROUTE_NHOP_UTILS_H_ +#define _NET_ROUTE_NHOP_UTILS_H_ + +/* Chained hash table */ +struct _cht_head { + uint32_t hash_size; + uint32_t items_count; + void **ptr; +}; + +static inline uint32_t +_cht_get_resize_size(const struct _cht_head *head) +{ + uint32_t new_size = 0; + + if ((head->items_count * 2 > head->hash_size) && (head->hash_size < 65536)) + new_size = head->hash_size * 2; + else if ((head->items_count * 4 < head->hash_size) && head->hash_size > 16) + new_size = head->hash_size / 2; + + return (new_size); +} + +static inline int +_cht_need_resize(const struct _cht_head *head) +{ + + return (_cht_get_resize_size(head) > 0); +} + + +#ifndef typeof +#define typeof __typeof +#endif + +#define CHT_SLIST_NEED_RESIZE(_head) \ + _cht_need_resize((const struct _cht_head *)(_head)) +#define CHT_SLIST_GET_RESIZE_BUCKETS(_head) \ + _cht_get_resize_size((const struct _cht_head *)(_head)) +#define CHT_SLIST_GET_RESIZE_SIZE(_buckets) ((_buckets) * sizeof(void *)) + +#define CHT_SLIST_DEFINE(_HNAME, _ITEM_TYPE) \ +struct _HNAME##_head { \ + uint32_t hash_size; \ + uint32_t items_count; \ + _ITEM_TYPE **ptr; \ +} + +#define CHT_SLIST_INIT(_head, _ptr, _num_buckets) \ + (_head)->hash_size = _num_buckets; \ + (_head)->items_count = 0; \ + (_head)->ptr = _ptr; + +/* Default hash method for constant-size keys */ + +#define CHT_GET_BUCK(_head, _PX, _key) _PX##_hash_key(_key) & ((_head)->hash_size - 1) +#define CHT_GET_BUCK_OBJ(_head, _PX, _obj) _PX##_hash_obj(_obj) & ((_head)->hash_size - 1) + +#define CHT_FIRST(_head, idx) _CHT_FIRST((_head)->ptr, idx) +#define _CHT_FIRST(_ptr, idx) (_ptr)[idx] + +#define CHT_SLIST_FIND(_head, _PX, _key, _ret) do { \ + uint32_t _buck = CHT_GET_BUCK(_head, _PX, _key); \ + _ret = CHT_FIRST(_head, _buck); \ + for ( ; _ret != NULL; _ret = _PX##_next(_ret)) { \ + if (_PX##_cmp(_key, (_ret))) \ + break; \ + } \ +} while(0) + +/* + * hash_obj, nhop_cmp + */ +#define CHT_SLIST_FIND_BYOBJ(_head, _PX, _obj, _ret) do { \ + uint32_t _buck = CHT_GET_BUCK_OBJ(_head, _PX, _obj); \ + _ret = CHT_FIRST(_head, _buck); \ + for ( ; _ret != NULL; _ret = _PX##_next(_ret)) { \ + if (_PX##_cmp(_obj, _ret)) \ + break; \ + } \ +} while(0) + +#define CHT_SLIST_INSERT_HEAD(_head, _PX, _obj) do { \ + uint32_t _buck = CHT_GET_BUCK_OBJ(_head, _PX, _obj); \ + _PX##_next(_obj) = CHT_FIRST(_head, _buck); \ + CHT_FIRST(_head, _buck) = _obj; \ + (_head)->items_count++; \ +} while(0) + +#define CHT_SLIST_REMOVE(_head, _PX, _key, _ret) do { \ + typeof(*(_head)->ptr) _tmp; \ + uint32_t _buck = CHT_GET_BUCK(_head, _PX, _key); \ + _ret = CHT_FIRST(_head, _buck); \ + _tmp = NULL; \ + for ( ; _ret != NULL; _tmp = _ret, _ret = _PX##_next(_ret)) { \ + if (_PX##_cmp(_key, _ret)) \ + break; \ + } \ + if (_ret != NULL) { \ + if (_tmp == NULL) \ + CHT_FIRST(_head, _buck) = _PX##_next(_ret); \ + else \ + _PX##_next(_tmp) = _PX##_next(_ret); \ + (_head)->items_count--; \ + } \ +} while(0) + +#define CHT_SLIST_REMOVE_BYOBJ(_head, _PX, _obj, _ret) do { \ + typeof(*(_head)->ptr) _tmp; \ + uint32_t _buck = CHT_GET_BUCK_OBJ(_head, _PX, _obj); \ + _ret = CHT_FIRST(_head, _buck); \ + _tmp = NULL; \ + for ( ; _ret != NULL; _tmp = _ret, _ret = _PX##_next(_ret)) { \ + if (_PX##_cmp(_obj, _ret)) \ + break; \ + } \ + if (_ret != NULL) { \ + if (_tmp == NULL) \ + CHT_FIRST(_head, _buck) = _PX##_next(_ret); \ + else \ + _PX##_next(_tmp) = _PX##_next(_ret); \ + (_head)->items_count--; \ + } \ +} while(0) + + +#define CHT_SLIST_FOREACH(_head, _PX, _x) \ + for (uint32_t _i = 0; _i < (_head)->hash_size; _i++) { \ + for (_x = CHT_FIRST(_head, _i); _x; _x = _PX##_next(_x)) + +#define CHT_SLIST_FOREACH_END } + +#define CHT_SLIST_RESIZE(_head, _PX, _new_void_ptr, _new_hsize) \ + uint32_t _new_idx; \ + typeof((_head)->ptr) _new_ptr = (void *)_new_void_ptr; \ + typeof(*(_head)->ptr) _x, _y; \ + for (uint32_t _old_idx = 0; _old_idx < (_head)->hash_size; _old_idx++) {\ + _x = CHT_FIRST(_head, _old_idx); \ + _y = _x; \ + while (_y != NULL) { \ + _y = _PX##_next(_x); \ + _new_idx = _PX##_hash_obj(_x) & (_new_hsize - 1);\ + _PX##_next(_x) = _CHT_FIRST(_new_ptr, _new_idx);\ + _CHT_FIRST(_new_ptr, _new_idx) = _x; \ + _x = _y; \ + } \ + } \ + (_head)->hash_size = _new_hsize; \ + _new_void_ptr = (void *)(_head)->ptr; \ + (_head)->ptr = _new_ptr; + +/* bitmasks */ + +struct bitmask_head { + uint16_t free_off; /* index of the first potentially free block */ + uint16_t blocks; /* number of 4/8-byte blocks in the index */ + uint32_t items_count; /* total number of items */ + u_long *idx; +}; + +size_t bitmask_get_size(uint32_t items); +uint32_t bitmask_get_resize_items(const struct bitmask_head *nh); +int bitmask_should_resize(const struct bitmask_head *bh); +void bitmask_swap(struct bitmask_head *bh, void *new_idx, uint32_t new_items, void **pidx); +void bitmask_init(struct bitmask_head *bh, void *idx, uint32_t num_items); +int bitmask_copy(const struct bitmask_head *bi, void *new_idx, uint32_t new_items); +int bitmask_alloc_idx(struct bitmask_head *bi, uint16_t *pidx); +int bitmask_free_idx(struct bitmask_head *bi, uint16_t idx); + +#endif + Index: sys/net/route/nhop_utils.c =================================================================== --- /dev/null +++ sys/net/route/nhop_utils.c @@ -0,0 +1,220 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2020 Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +#include "opt_inet.h" +#include "opt_route.h" +#include "opt_mpath.h" + +#include +#include +#include +#include +#include +#include + +#include + +#define BLOCK_ITEMS (8 * sizeof(u_long)) /* Number of items for ffsl() */ + +#define _BLOCKS_TO_SZ(_blocks) ((size_t)(_blocks) * sizeof(u_long)) +#define _BLOCKS_TO_ITEMS(_blocks) ((uint32_t)(_blocks) * BLOCK_ITEMS) +#define _ITEMS_TO_BLOCKS(_items) ((_items) / BLOCK_ITEMS) + + +static void _bitmask_init_idx(void *index, uint32_t items); + +void +bitmask_init(struct bitmask_head *bh, void *idx, uint32_t num_items) +{ + + if (idx != NULL) + _bitmask_init_idx(idx, num_items); + + memset(bh, 0, sizeof(struct bitmask_head)); + bh->blocks = _ITEMS_TO_BLOCKS(num_items); + bh->idx = (u_long *)idx; +} + +uint32_t +bitmask_get_resize_items(const struct bitmask_head *bh) +{ + if ((bh->items_count * 2 > _BLOCKS_TO_ITEMS(bh->blocks)) && bh->items_count < 65536) + return (_BLOCKS_TO_ITEMS(bh->blocks) * 2); + + return (0); +} + +int +bitmask_should_resize(const struct bitmask_head *bh) +{ + + return (bitmask_get_resize_items(bh) != 0); +} + +#if 0 +uint32_t +_bitmask_get_blocks(uint32_t items) +{ + + return (items / BLOCK_ITEMS); +} +#endif + +size_t +bitmask_get_size(uint32_t items) +{ +#if _KERNEL + KASSERT((items % BLOCK_ITEMS) == 0, + ("bitmask size needs to power of 2 and greater or equal to %zu", + BLOCK_ITEMS)); +#else + assert((items % BLOCK_ITEMS) == 0); +#endif + + return (items / 8); +} + +static void +_bitmask_init_idx(void *_idx, uint32_t items) +{ + size_t size = bitmask_get_size(items); + u_long *idx = (u_long *)_idx; + + /* Mark all as free */ + memset(idx, 0xFF, size); + *idx &= ~(u_long)1; /* Always skip index 0 */ +} + + +/* + * _try_merge api to allow shrinking? + */ +int +bitmask_copy(const struct bitmask_head *bi, void *new_idx, uint32_t new_items) +{ + uint32_t new_blocks = _BLOCKS_TO_ITEMS(new_items); + + _bitmask_init_idx(new_idx, new_items); + + if (bi->blocks < new_blocks) { + /* extend current blocks */ + if (bi->blocks > 0) + memcpy(new_idx, bi->idx, _BLOCKS_TO_SZ(bi->blocks)); + return (0); + } else { + /* XXX: ensure all other blocks are non-zero */ + for (int i = new_blocks; i < bi->blocks; i++) { + } + + return (1); + } +} + +void +bitmask_swap(struct bitmask_head *bh, void *new_idx, uint32_t new_items, void **pidx) +{ + void *old_ptr; + + old_ptr = bh->idx; + + bh->idx = (u_long *)new_idx; + bh->blocks = _ITEMS_TO_BLOCKS(new_items); + + if (pidx != NULL) + *pidx = old_ptr; +} + +/* + * Allocate new index in given instance and stores in in @pidx. + * Returns 0 on success. + */ +int +bitmask_alloc_idx(struct bitmask_head *bi, uint16_t *pidx) +{ + u_long *mask; + int i, off, v; + + off = bi->free_off; + mask = &bi->idx[off]; + + for (i = off; i < bi->blocks; i++, mask++) { + if ((v = ffsl(*mask)) == 0) + continue; + + /* Mark as busy */ + *mask &= ~ ((u_long)1 << (v - 1)); + + bi->free_off = i; + + v = BLOCK_ITEMS * i + v - 1; + + *pidx = v; + bi->items_count++; + return (0); + } + + return (1); +} + +/* + * Removes index from given set. + * Returns 0 on success. + */ +int +bitmask_free_idx(struct bitmask_head *bi, uint16_t idx) +{ + u_long *mask; + int i, v; + + if (idx == 0) + return (1); + + i = idx / BLOCK_ITEMS; + v = idx % BLOCK_ITEMS; + + if (i >= bi->blocks) + return (1); + + mask = &bi->idx[i]; + + if ((*mask & ((u_long)1 << v)) != 0) + return (1); + + /* Mark as free */ + *mask |= (u_long)1 << v; + bi->items_count--; + + /* Update free offset */ + if (bi->free_off > i) + bi->free_off = i; + + return (0); +} + Index: sys/net/route/nhop_var.h =================================================================== --- /dev/null +++ sys/net/route/nhop_var.h @@ -0,0 +1,127 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2020 Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * This header file contains private definitions for nexthop routing. + * + * Header is not intended to be included by the code external to the + * routing subsystem. + */ + +#ifndef _NET_ROUTE_NHOP_VAR_H_ +#define _NET_ROUTE_NHOP_VAR_H_ + +static unsigned +djb_hash(const unsigned char *h, const int len) +{ + unsigned int result = 0; + int i; + + for (i = 0; i < len; i++) + result = 33 * result ^ h[i]; + + return (result); +} + +/* define nhop hash table */ +struct nhop_priv; +CHT_SLIST_DEFINE(nhops, struct nhop_priv); +/* produce hash value for an object */ +#define nhops_hash_obj(_obj) hash_priv(_obj) +/* compare two objects */ +#define nhops_cmp(_one, _two) cmp_priv(_one, _two) +/* next object accessor */ +#define nhops_next(_obj) (_obj)->nh_next + +/* XXX: declare! */ +/* define mpath hash table */ +struct nhgrp_priv; +CHT_SLIST_DEFINE(mpath, struct nhgrp_priv); + + +struct nh_control { + struct nhops_head nh_head; /* hash table head */ + struct bitmask_head nh_idx_head; /* nhop index head */ + struct mpath_head gr_head; /* nhgrp hash table head */ + struct bitmask_head gr_idx_head; /* nhgrp index head */ + struct rwlock nhop_lock; /* overall ctl lock */ + struct rib_head *rh; /* pointer back to rnh */ +}; + +#define NHOPS_WLOCK(ctl) rw_wlock(&(ctl)->nhop_lock) +#define NHOPS_RLOCK(ctl) rw_rlock(&(ctl)->nhop_lock) +#define NHOPS_WUNLOCK(ctl) rw_wunlock(&(ctl)->nhop_lock) +#define NHOPS_RUNLOCK(ctl) rw_runlock(&(ctl)->nhop_lock) +#define NHOPS_LOCK_INIT(ctl) rw_init(&(ctl)->nhop_lock, "ctl") +#define NHOPS_LOCK_DESTROY(ctl) rw_destroy(&(ctl)->nhop_lock) +#define NHOPS_WLOCK_ASSERT(ctl) rw_assert(&(ctl)->nhop_lock, RA_WLOCKED) + + +/* Control plane-only nhop data */ +struct nhop_object; +struct nhop_priv { + uint32_t nh_idx; /* nexthop index */ + uint8_t nh_family; /* address family of the lookup */ + uint16_t nh_type; /* nexthop type */ + void *cb_func; /* function handling additional rewrite caps */ + u_int nh_refcnt; /* number of references */ + int rt_flags; /* routing flags for the control plane */ + struct nhop_object *nh; /* backreference to the dataplane nhop */ + struct nh_control *nh_control; /* backreference to the rnh */ + struct nhop_priv *nh_next; /* hash table membership */ + struct mtx nh_mtx; /* mutex */ + struct epoch_context nh_epoch_ctx; /* epoch data for nhop */ +}; + +#define NH_PRIV_LOCK_INIT(_priv) mtx_init(&(_priv)->nh_mtx, "nhop", NULL, MTX_DEF) +#define NH_PRIV_LOCK(_priv) mtx_lock(&(_priv)->nh_mtx) +#define NH_PRIV_UNLOCK(_priv) mtx_unlock(&(_priv)->nh_mtx) +#define NH_PRIV_LOCK_DESTROY(_priv) mtx_destroy(&(_priv)->nh_mtx) +#define NH_PRIV_LOCK_ASSERT(_priv) mtx_assert(&(_priv)->nh_mtx, MA_OWNED) + +#define NH_LOCK(_nh) NH_PRIV_LOCK((_nh)->nh_priv) +#define NH_UNLOCK(_nh) NH_PRIV_UNLOCK((_nh)->nh_priv) + +#define NH_IS_PINNED(_nh) ((_nh)->nh_priv->rt_flags & RTF_PINNED) + +/* nhop.c */ +struct nhop_priv *find_nhop(struct nh_control *ctl, struct nhop_priv *nh_priv); +int link_nhop(struct nh_control *ctl, struct nhop_priv *nh_priv); +struct nhop_priv *unlink_nhop(struct nh_control *ctl, struct nhop_priv *nh_priv); + +/* nhop_ctl.c */ +void free_nhop(struct nhop_priv *nh_priv); +int cmp_priv(const struct nhop_priv *_one, const struct nhop_priv *_two); + +/* mpath */ +struct weightened_nhop; + + +#endif + Index: sys/net/route/route_ctl.c =================================================================== --- /dev/null +++ sys/net/route/route_ctl.c @@ -0,0 +1,290 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2020 Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include "opt_inet.h" +#include "opt_inet6.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + + +/* + * This file contains control plane routing tables functions. + * + * All functions assumes they are called in net epoch. + */ + +static void set_req_mtu(const struct rt_addrinfo *info, struct nhop_request *req); + +static void fill_nh_request(struct rib_head *rnh, struct rt_addrinfo *info, + struct nhop_request *nh_req); +static void fill_nh_request_from_nhop(const struct nhop_object *nh, + struct sockaddr_storage *gw_storage, struct nhop_request *nh_req); + + +/* + * Sets @nh_req mtu data based on the @info data. + */ +static void +set_req_mtu(const struct rt_addrinfo *info, struct nhop_request *nh_req) +{ + + if (info->rti_mflags & RTV_MTU) { + if (info->rti_rmx->rmx_mtu != 0) { + + /* + * MTU was explicitly provided by user. + * Keep it. + */ + nh_req->rt_flags |= RTF_FIXEDMTU; + } else { + + /* + * User explicitly sets MTU to 0. + * Assume rollback to default. + */ + nh_req->rt_flags &= ~RTF_FIXEDMTU; + } + nh_req->mtu = info->rti_rmx->rmx_mtu; + } +} + + +/* + * Fills @nh_req based on the data provided in @info. + */ +static void +fill_nh_request(struct rib_head *rnh, struct rt_addrinfo *info, + struct nhop_request *nh_req) +{ + + bzero(nh_req, sizeof(struct nhop_request)); + nh_req->ifp = info->rti_ifa->ifa_ifp; + nh_req->ifa = info->rti_ifa; + nh_req->gw = info->rti_info[RTAX_GATEWAY]; + nh_req->family = info->rti_info[RTAX_DST]->sa_family; + nh_req->rt_flags = info->rti_flags; // fill original rt flags + nh_req->nh_type = 0; // hook responsibility to set nhop type + set_req_mtu(info, nh_req); +} + +/* + * Fill @nh_req based on the real @nh. + */ +static void +fill_nh_request_from_nhop(const struct nhop_object *nh, + struct sockaddr_storage *gw_storage, struct nhop_request *nh_req) +{ + + memset(nh_req, 0, sizeof(struct nhop_request)); + nh_req->ifp = nh->nh_ifp; + nh_req->ifa = nh->nh_ifa; + nh_req->family = nh->nh_priv->nh_family; + nh_req->mtu = nh->nh_mtu; + nh_req->rt_flags = nh->nh_priv->rt_flags; + nh_req->nh_type = nh->nh_priv->nh_type; + + if (nh_req->rt_flags & RTF_GATEWAY) { + /* Assume size is already validated */ + memcpy(gw_storage, &nh->gw4_sa, nh->gw4_sa.sin_len); + } else { + /* Nhop value is largerly ignored, set some random bits */ + gw_storage->ss_len = 0; + } + nh_req->gw = (struct sockaddr *)gw_storage; +} + +/* + * Update @nh_req request data based on the parameters supplied in @info. + * This is a helper function to support route changes. + * + * It limits the changes that can be done to the route to the following: + * 1) all combination of gateway changes (gw, interface, blackhole/reject) + * 2) route flags (FLAG[123],STATIC,BLACKHOLE,REJECT) + * 3) route MTU + * + * Assumes nh_req gw pointer has sockaddr_storage-sized pointer supplied + * + * Returns: + * 0 on success + */ +static int +alter_nh_request(struct rt_addrinfo *info, u_int fibnum, struct nhop_request *nh_req) +{ + + /* Update MTU if set in the request*/ + set_req_mtu(info, nh_req); + + /* XXX: allow only one of BLACKHOLE,REJECT,GATEWAY */ + + /* Allow some flags (FLAG1,STATIC,BLACKHOLE,REJECT) to be toggled on change. */ + nh_req->rt_flags &= ~RTF_FMASK; + nh_req->rt_flags |= info->rti_flags & RTF_FMASK; + + /* Consider gateway change */ + struct sockaddr *info_gw = info->rti_info[RTAX_GATEWAY]; + + if (info_gw != NULL) { + nh_req->ifa = info->rti_ifa; + nh_req->ifp = info->rti_ifp; + /* Update RTF_GATEWAY flag status */ + nh_req->rt_flags &= ~RTF_GATEWAY; + nh_req->rt_flags |= (RTF_GATEWAY & info->rti_flags); + } + + return (0); +} + +/* + * Creates a new nexthop based on the information in @info. + * + * Returns: + * 0 on success, filling @nh_ret with the desired nexthop object ptr + * errno otherwise + */ +static int +create_nhop_from_info(struct rib_head *rnh, struct rt_addrinfo *info, + struct nhop_object **nh_ret) +{ + struct sockaddr *gateway, *dst, *netmask; + struct nhop_request nh_req; + int error; + + fill_nh_request(rnh, info, &nh_req); + + /* Give the protocols chance to augment the request data */ + dst = info->rti_info[RTAX_DST]; + netmask = info->rti_info[RTAX_NETMASK]; + gateway = info->rti_info[RTAX_GATEWAY]; + + if (rnh->rnh_preadd != NULL) { + error = rnh->rnh_preadd(rnh->rib_fibnum, dst, netmask, &nh_req); + if (error != 0) + return (error); + } + + *nh_ret = nhop_get(rnh, &nh_req); + if (*nh_ret == NULL) { + DPRINTF("failed to get the nexthop from req"); + return (EAGAIN); + } + + return (0); +} + +struct nhop_object * +nhop_create_from_info_wrapper(struct rib_head *rnh, struct rt_addrinfo *info) +{ + struct nhop_object *nh; + + if (create_nhop_from_info(rnh, info, &nh) != 0) + return (NULL); + + return (nh); +} + +/* + * Creates new nexthop based on @nh_old and augmentation data from @info. + * Helper function used in the route changes, please see + * alter_nh_request() comments for more details. + * + * Returns: + * 0 on success, filling @nh_ret with the desired nexthop object + * errno otherwise + */ +static int +create_nhop_from_nhop(struct rib_head *rnh, const struct nhop_object *nh_old, + struct rt_addrinfo *info, struct nhop_object **nh_ret) +{ + struct nhop_request nh_req; + struct sockaddr_storage gw_storage; + int error; + + /* Start with copying data from original nexthop */ + fill_nh_request_from_nhop(nh_old, &gw_storage, &nh_req); + + /* return ifa/ifp referenced */ + error = alter_nh_request(info, rnh->rib_fibnum, &nh_req); + if (error != 0) + return (error); + + /* Give protocol chance to alter the nexthop request */ + if (rnh->rnh_preadd != NULL) { + error = rnh->rnh_preadd(rnh->rib_fibnum, info->rti_info[RTAX_DST], + info->rti_info[RTAX_NETMASK], &nh_req); + if (error != 0) { + DPRINTF("failed to create nhop: prehook returned %d", + error); + return (error); + } + } + + *nh_ret = nhop_get(rnh, &nh_req); + if (*nh_ret == NULL) { + DPRINTF("failed to create nhop: nhop_get() failed"); + return (EAGAIN); + } + + return (0); +} + +struct nhop_object * +nhop_create_from_nhop_wrapper(struct rib_head *rnh, + const struct nhop_object *nh_old, struct rt_addrinfo *info) +{ + struct nhop_object *nh; + + if (create_nhop_from_nhop(rnh, nh_old, info, &nh) != 0) + return (NULL); + + return (nh); +} + Index: sys/net/route/route_helpers.c =================================================================== --- /dev/null +++ sys/net/route/route_helpers.c @@ -0,0 +1,83 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2020 Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include "opt_inet.h" +#include "opt_inet6.h" +#include "opt_route.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * RIB helper functions. + */ + +/* + * Calls @wa_f with @arg for each entry in the table specified by + * @af and @fibnum. + * + * Table is traversed under read lock. + */ +void +rib_walk(int af, u_int fibnum, rt_walktree_f_t *wa_f, void *arg) +{ + RIB_RLOCK_TRACKER; + struct rib_head *rnh; + + if ((rnh = rt_tables_get_rnh(fibnum, af)) == NULL) + return; + + RIB_RLOCK(rnh); + rnh->rnh_walktree(&rnh->head, (walktree_f_t *)wa_f, arg); + RIB_RUNLOCK(rnh); +} + Index: sys/net/route/shared.h =================================================================== --- /dev/null +++ sys/net/route/shared.h @@ -0,0 +1,131 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2020 Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * Contains various definitions shared between the parts of a routing subsystem. + * + * Header is not intended to be included by the code external to the + * routing subsystem. + */ + +#ifndef _NET_ROUTE_SHARED_H_ +#define _NET_ROUTE_SHARED_H_ + +#ifdef INVARIANTS +#define NET_EPOCH_ASSERT_INVARIANTS() NET_EPOCH_ASSERT() +#else +#define NET_EPOCH_ASSERT_INVARIANTS() +#endif + +#ifdef RTDEBUG +#define DPRINTF(_fmt, ...) printf("%s: " _fmt "\n", __func__ , ## __VA_ARGS__) +#else +#define DPRINTF(_fmt, ...) +#endif + +struct rib_head; + +/* Nexhops */ +void nhops_init(void); +int nhops_init_rib(struct rib_head *rh); +void nhops_destroy_rib(struct rib_head *rh); +struct nhop_object *nhop_get(struct rib_head *rh, const struct nhop_request *req); +int nhop_ref_object(struct nhop_object *nh); +int nhop_ref_any(struct nhop_object *nh); +void nhop_free_any(struct nhop_object *nh); + +void nhops_update_ifmtu(struct rib_head *rh, struct ifnet *ifp, uint32_t mtu); +int nhops_dump_sysctl(struct rib_head *rh, struct sysctl_req *w); + + +/* multipath */ +#define MPF_MULTIPATH 0x08 /* need to be consistent with NHF_MULTIPATH */ +#define MPF_LINKED 0x10 /* mpath group is linked */ + +struct nhgrp_object { + uint16_t mp_flags; /* mpath flags */ + uint8_t mp_size; /* size of mpath group used in selection */ + uint8_t spare; + struct nhop_object *nhops[0]; /* nhops */ +}; + +struct weightened_nhop { + struct nhop_object *nh; + uint32_t weight; +}; + +/* */ +int rt_getifa_fib(struct rt_addrinfo *, u_int); + +/* nhgrp.c */ +int nhgrp_ctl_init(struct nh_control *ctl); +void nhgrp_ctl_free(struct nh_control *ctl); + +struct nhgrp_object; + +/* nhgrp_ctl.c */ +struct weightened_nhop *nhgrp_get_nhops(struct nhgrp_object *mp, + uint32_t *pnum_nhops); +int nhgrp_dump_sysctl(struct rib_head *rh, struct sysctl_req *w); + +struct nhgrp_object *nhgrp_get_group(struct rib_head *rh, + struct weightened_nhop *wn, int num_nhops, int *perror); +struct nhgrp_object *nhgrp_append_nhops(struct rib_head *rh, + const struct nhgrp_object *gr_orig, struct weightened_nhop *wn, + int num_nhops, uint64_t *paddmask, int *perror); +struct nhgrp_object *nhgrp_get_del_nhops(struct rib_head *rh, + const struct nhgrp_object *src, uint64_t *nhop_mask, int *perror); +struct nhgrp_object *nhgrp_get_replace_nhop(struct rib_head *rh, + const struct nhgrp_object *gr_orig, struct weightened_nhop *wn, + uint8_t replace_idx, uint64_t *pmodmask, int *perror); + +void nhgrp_free_group(struct nhgrp_object *gr); +int nhgrp_ref_group(struct nhgrp_object *gr); + +/* nhgrp*/ + +/* route_ctl.c */ +int can_nh_multipath(const struct nhop_object *nh); +int create_rte_from_rte(struct rib_head *rnh, struct rtentry *rt_orig, + struct rtentry **ret_rt); +int del_route_one(struct rib_head *rnh, struct rtentry *rt, + struct rt_addrinfo *info); + +int rib_match_nhop_gw(const struct nhop_object *nh, + const struct sockaddr *gw); + +struct nhop_object *nhop_create_from_info_wrapper(struct rib_head *rnh, + struct rt_addrinfo *info); +struct nhop_object *nhop_create_from_nhop_wrapper(struct rib_head *rnh, + const struct nhop_object *nh_old, struct rt_addrinfo *info); + +#endif + + + Index: sys/net/route_var.h =================================================================== --- sys/net/route_var.h +++ sys/net/route_var.h @@ -32,6 +32,11 @@ #ifndef _NET_ROUTE_VAR_H_ #define _NET_ROUTE_VAR_H_ +struct nh_control; +struct nhop_request; +typedef int rnh_preadd_entry_f_t(u_int fibnum, const struct sockaddr *addr, + const struct sockaddr *mask, struct nhop_request *req); + struct rib_head { struct radix_head head; rn_matchaddr_f_t *rnh_matchaddr; /* longest match for sockaddr */ @@ -41,6 +46,7 @@ rn_walktree_t *rnh_walktree; /* traverse tree */ rn_walktree_from_t *rnh_walktree_from; /* traverse tree below a */ rn_close_t *rnh_close; /*do something when the last ref drops*/ + rnh_preadd_entry_f_t *rnh_preadd; /* hook to alter record prior to insertion */ rt_gen_t rnh_gen; /* generation counter */ int rnh_multipath; /* multipath capable ? */ struct radix_node rnh_nodes[3]; /* empty tree for common case */ @@ -51,6 +57,7 @@ u_int rib_fibnum; /* fib number */ struct callout expire_callout; /* Callout for expiring dynamic routes */ time_t next_expire; /* Next expire run ts */ + struct nh_control *nh_control; /* nexthop subsystem data */ }; #define RIB_RLOCK_TRACKER struct rm_priotracker _rib_tracker @@ -89,6 +96,44 @@ struct rib_head *rt_tables_get_rnh(int fib, int family); +VNET_PCPUSTAT_DECLARE(struct rtstat, rtstat); +#define RTSTAT_ADD(name, val) \ + VNET_PCPUSTAT_ADD(struct rtstat, rtstat, name, (val)) +#define RTSTAT_INC(name) RTSTAT_ADD(name, 1) + +/* + * With the split between the routing entry and the nexthop, + * rt_flags has to be split between these 2 entries. As rtentry + * mostly contains prefix data and is thought to be generic enough + * so one can transparently change the nexthop pointer w/o requiring + * any other rtentry changes, most of rt_flags shifts to the particular nexthop. + * / + * + * RTF_UP: rtentry, as an indication that it is linked. + * RTF_HOST: rtentry, nhop. The latter indication is needed for the datapath + * RTF_DYNAMIC: nhop, to make rtentry generic. + * RTF_MODIFIED: nhop, to make rtentry generic. (legacy) + * -- "native" path (nhop) properties: + * RTF_GATEWAY, RTF_STATIC, RTF_PROTO1, RTF_PROTO2, RTF_PROTO3, RTF_FIXEDMTU, + * RTF_PINNED, RTF_REJECT, RTF_BLACKHOLE, RTF_BROADCAST + */ + +/* Nexthop rt flags mask */ +#define NHOP_RT_FLAG_MASK (RTF_GATEWAY | RTF_HOST | RTF_REJECT | RTF_DYNAMIC | \ + RTF_MODIFIED | RTF_STATIC | RTF_BLACKHOLE | RTF_PROTO1 | RTF_PROTO2 | \ + RTF_PROTO3 | RTF_FIXEDMTU | RTF_PINNED | RTF_BROADCAST) + +/* rtentry rt flag mask */ +#define RTE_RT_FLAG_MASK (RTF_UP | RTF_HOST) + +/* Nexthop selection */ +#define _NH2MP(_nh) ((struct nhgrp_object *)(_nh)) +#define _SELECT_NHOP(_nh, _flowid) \ + (_NH2MP(_nh))->nhops[(_flowid) % (_NH2MP(_nh))->mp_size] +#define _RT_SELECT_NHOP(_nh, _flowid) \ + ((!NH_IS_MULTIPATH(_nh)) ? (_nh) : _SELECT_NHOP(_nh, _flowid)) +#define RT_SELECT_NHOP(_rt, _flowid) _RT_SELECT_NHOP((_rt)->rt_nhop, _flowid) + /* rte<>nhop translation */ static inline uint16_t fib_rte_to_nh_flags(int rt_flags) Index: sys/net/rtsock.c =================================================================== --- sys/net/rtsock.c +++ sys/net/rtsock.c @@ -77,6 +77,7 @@ #include #include #endif +#include #ifdef COMPAT_FREEBSD32 #include @@ -1076,6 +1077,7 @@ out->rmx_mtu = rt->rt_mtu; out->rmx_weight = rt->rt_weight; out->rmx_pksent = counter_u64_fetch(rt->rt_pksent); + out->rmx_nhidx = nhop_get_idx(rt->rt_nhop); /* Kernel -> userland timebase conversion. */ out->rmx_expire = rt->rt_expire ? rt->rt_expire - time_uptime + time_second : 0; @@ -2025,7 +2027,7 @@ namelen--; if (req->newptr) return (EPERM); - if (name[1] == NET_RT_DUMP) { + if (name[1] == NET_RT_DUMP || name[1] == NET_RT_NHOP) { if (namelen == 3) fib = req->td->td_proc->p_fibnum; else if (namelen == 4) @@ -2092,7 +2094,25 @@ error = EAFNOSUPPORT; } break; - + case NET_RT_NHOP: + /* Allow dumping one specific af/fib at a time */ + if (namelen < 4) { + error = EINVAL; + break; + } + fib = name[3]; + if (fib < 0 || fib > rt_numfibs) { + error = EINVAL; + break; + } + rnh = rt_tables_get_rnh(fib, af); + if (rnh == NULL) { + error = EAFNOSUPPORT; + break; + } + if (w.w_op == NET_RT_NHOP) + error = nhops_dump_sysctl(rnh, w.w_req); + break; case NET_RT_IFLIST: case NET_RT_IFLISTL: error = sysctl_iflist(af, &w); Index: sys/netinet/in_fib.h =================================================================== --- sys/netinet/in_fib.h +++ sys/netinet/in_fib.h @@ -58,5 +58,9 @@ uint32_t flowid, struct nhop4_extended *pnh4); void fib4_free_nh_ext(uint32_t fibnum, struct nhop4_extended *pnh4); +struct nhop_object *fib4_lookup_nh_ptr(uint32_t fibnum, struct in_addr dst, + uint32_t scopeid, uint32_t flags, uint32_t flowid); +int fib4_lookup_urpf(uint32_t fibnum, struct in_addr dst, uint32_t scopeid, + uint32_t flags, const struct ifnet *src_if); #endif Index: sys/netinet/in_fib.c =================================================================== --- sys/netinet/in_fib.c +++ sys/netinet/in_fib.c @@ -49,6 +49,8 @@ #include #include #include +#include +#include #include #ifdef RADIX_MPATH @@ -60,59 +62,49 @@ #include #ifdef INET -static void fib4_rte_to_nh_basic(struct rtentry *rte, struct in_addr dst, +static void fib4_rte_to_nh_basic(struct nhop_object *nh, struct in_addr dst, uint32_t flags, struct nhop4_basic *pnh4); -static void fib4_rte_to_nh_extended(struct rtentry *rte, struct in_addr dst, +static void fib4_rte_to_nh_extended(struct nhop_object *nh, struct in_addr dst, uint32_t flags, struct nhop4_extended *pnh4); #define RNTORT(p) ((struct rtentry *)(p)) static void -fib4_rte_to_nh_basic(struct rtentry *rte, struct in_addr dst, +fib4_rte_to_nh_basic(struct nhop_object *nh, struct in_addr dst, uint32_t flags, struct nhop4_basic *pnh4) { - struct sockaddr_in *gw; if ((flags & NHR_IFAIF) != 0) - pnh4->nh_ifp = rte->rt_ifa->ifa_ifp; + pnh4->nh_ifp = nh->nh_ifa->ifa_ifp; + else + pnh4->nh_ifp = nh->nh_ifp; + pnh4->nh_mtu = nh->nh_mtu; + if (nh->nh_flags & NHF_GATEWAY) + pnh4->nh_addr = nh->gw4_sa.sin_addr; else - pnh4->nh_ifp = rte->rt_ifp; - pnh4->nh_mtu = min(rte->rt_mtu, rte->rt_ifp->if_mtu); - if (rte->rt_flags & RTF_GATEWAY) { - gw = (struct sockaddr_in *)rte->rt_gateway; - pnh4->nh_addr = gw->sin_addr; - } else pnh4->nh_addr = dst; /* Set flags */ - pnh4->nh_flags = fib_rte_to_nh_flags(rte->rt_flags); - gw = (struct sockaddr_in *)rt_key(rte); - if (gw->sin_addr.s_addr == 0) - pnh4->nh_flags |= NHF_DEFAULT; + pnh4->nh_flags = nh->nh_flags; /* TODO: Handle RTF_BROADCAST here */ } static void -fib4_rte_to_nh_extended(struct rtentry *rte, struct in_addr dst, +fib4_rte_to_nh_extended(struct nhop_object *nh, struct in_addr dst, uint32_t flags, struct nhop4_extended *pnh4) { - struct sockaddr_in *gw; if ((flags & NHR_IFAIF) != 0) - pnh4->nh_ifp = rte->rt_ifa->ifa_ifp; + pnh4->nh_ifp = nh->nh_ifa->ifa_ifp; + else + pnh4->nh_ifp = nh->nh_ifp; + pnh4->nh_mtu = nh->nh_mtu; + if (nh->nh_flags & NHF_GATEWAY) + pnh4->nh_addr = nh->gw4_sa.sin_addr; else - pnh4->nh_ifp = rte->rt_ifp; - pnh4->nh_mtu = min(rte->rt_mtu, rte->rt_ifp->if_mtu); - if (rte->rt_flags & RTF_GATEWAY) { - gw = (struct sockaddr_in *)rte->rt_gateway; - pnh4->nh_addr = gw->sin_addr; - } else pnh4->nh_addr = dst; /* Set flags */ - pnh4->nh_flags = fib_rte_to_nh_flags(rte->rt_flags); - gw = (struct sockaddr_in *)rt_key(rte); - if (gw->sin_addr.s_addr == 0) - pnh4->nh_flags |= NHF_DEFAULT; - pnh4->nh_ia = ifatoia(rte->rt_ifa); + pnh4->nh_flags = nh->nh_flags; + pnh4->nh_ia = ifatoia(nh->nh_ifa); pnh4->nh_src = IA_SIN(pnh4->nh_ia)->sin_addr; } @@ -135,7 +127,7 @@ struct rib_head *rh; struct radix_node *rn; struct sockaddr_in sin; - struct rtentry *rte; + struct nhop_object *nh; KASSERT((fibnum < rt_numfibs), ("fib4_lookup_nh_basic: bad fibnum")); rh = rt_tables_get_rnh(fibnum, AF_INET); @@ -150,10 +142,10 @@ RIB_RLOCK(rh); rn = rh->rnh_matchaddr((void *)&sin, &rh->head); if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) { - rte = RNTORT(rn); + nh = RNTORT(rn)->rt_nhop; /* Ensure route & ifp is UP */ - if (RT_LINK_IS_UP(rte->rt_ifp)) { - fib4_rte_to_nh_basic(rte, dst, flags, pnh4); + if (RT_LINK_IS_UP(nh->nh_ifp)) { + fib4_rte_to_nh_basic(nh, dst, flags, pnh4); RIB_RUNLOCK(rh); return (0); @@ -183,8 +175,8 @@ RIB_RLOCK_TRACKER; struct rib_head *rh; struct radix_node *rn; + struct nhop_object *nh; struct sockaddr_in sin; - struct rtentry *rte; KASSERT((fibnum < rt_numfibs), ("fib4_lookup_nh_ext: bad fibnum")); rh = rt_tables_get_rnh(fibnum, AF_INET); @@ -193,23 +185,18 @@ /* Prepare lookup key */ memset(&sin, 0, sizeof(sin)); + sin.sin_family = AF_INET; sin.sin_len = sizeof(struct sockaddr_in); sin.sin_addr = dst; + nh = NULL; RIB_RLOCK(rh); rn = rh->rnh_matchaddr((void *)&sin, &rh->head); if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) { - rte = RNTORT(rn); -#ifdef RADIX_MPATH - rte = rt_mpath_select(rte, flowid); - if (rte == NULL) { - RIB_RUNLOCK(rh); - return (ENOENT); - } -#endif + nh = RNTORT(rn)->rt_nhop; /* Ensure route & ifp is UP */ - if (RT_LINK_IS_UP(rte->rt_ifp)) { - fib4_rte_to_nh_extended(rte, dst, flags, pnh4); + if (RT_LINK_IS_UP(nh->nh_ifp)) { + fib4_rte_to_nh_extended(nh, dst, flags, pnh4); if ((flags & NHR_REF) != 0) { /* TODO: lwref on egress ifp's ? */ } @@ -229,4 +216,105 @@ } +struct nhop_object * +fib4_lookup_nh_ptr(uint32_t fibnum, struct in_addr dst, uint32_t scopeid, + uint32_t flags, uint32_t flowid) +{ + RIB_RLOCK_TRACKER; + struct rib_head *rh; + struct radix_node *rn; + struct nhop_object *nh; + + KASSERT((fibnum < rt_numfibs), ("fib4_lookup_nh_route: bad fibnum")); + rh = rt_tables_get_rnh(fibnum, AF_INET); + if (rh == NULL) + return (NULL); + + /* Prepare lookup key */ + struct sockaddr_in sin4; + memset(&sin4, 0, sizeof(sin4)); + sin4.sin_family = AF_INET; + sin4.sin_len = sizeof(struct sockaddr_in); + sin4.sin_addr = dst; + + nh = NULL; + RIB_RLOCK(rh); + rn = rh->rnh_matchaddr((void *)&sin4, &rh->head); + if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) { + nh = (RNTORT(rn))->rt_nhop; + /* Ensure route & ifp is UP */ + if (RT_LINK_IS_UP(nh->nh_ifp)) { + if (flags & NHR_REF) + nhop_ref_object(nh); + RIB_RUNLOCK(rh); + return (nh); + } + } + RIB_RUNLOCK(rh); + + RTSTAT_INC(rts_unreach); + return (NULL); +} + +inline static int +check_urpf(const struct nhop_object *nh, uint32_t flags, + const struct ifnet *src_if) +{ + + if (src_if != NULL && nh->nh_aifp == src_if) { + return (1); + } + if (src_if == NULL) { + if ((flags & NHR_NODEFAULT) == 0) + return (1); + else if ((nh->nh_flags & NHF_DEFAULT) == 0) + return (1); + } + + return (0); +} + +/* + * Performs reverse path forwarding lookup. + * If @src_if is non-zero, verifies that at least 1 path goes via + * this interface. + * If @src_if is zero, verifies that route exist. + * if @flags contains NHR_NOTDEFAULT, do not consider default route. + * + * Returns 1 if route matching conditions is found, 0 otherwise. + */ +int +fib4_lookup_urpf(uint32_t fibnum, struct in_addr dst, uint32_t scopeid, + uint32_t flags, const struct ifnet *src_if) +{ + RIB_RLOCK_TRACKER; + struct rib_head *rh; + struct radix_node *rn; + struct nhop_object *nh; + int ret; + + KASSERT((fibnum < rt_numfibs), ("fib4_lookup_nh_route: bad fibnum")); + rh = rt_tables_get_rnh(fibnum, AF_INET); + if (rh == NULL) + return (0); + + /* Prepare lookup key */ + struct sockaddr_in sin4; + memset(&sin4, 0, sizeof(sin4)); + sin4.sin_len = sizeof(struct sockaddr_in); + sin4.sin_addr = dst; + + RIB_RLOCK(rh); + rn = rh->rnh_matchaddr((void *)&sin4, &rh->head); + if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) { + nh = (RNTORT(rn))->rt_nhop; + ret = check_urpf(nh, flags, src_if); + RIB_RUNLOCK(rh); + return (ret); + } + RIB_RUNLOCK(rh); + + return (0); +} + #endif Index: sys/netinet/in_rmx.c =================================================================== --- sys/netinet/in_rmx.c +++ sys/netinet/in_rmx.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include @@ -54,6 +55,58 @@ extern int in_detachhead(void **head, int off); #endif +static int +rib4_preadd(u_int fibnum, const struct sockaddr *addr, const struct sockaddr *mask, + struct nhop_request *req) +{ + const struct sockaddr_in *addr4 = (const struct sockaddr_in *)addr; + + /* XXX: RTF_LOCAL && RTF_MULTICAST */ + + if (req->rt_flags & RTF_HOST) { + /* + * Backward compatibility: + * if the destination is broadcast, + * mark route as broadcast. + * This behavior was useful when route cloning + * was in place, so there was an explicit cloned + * route for every broadcasted address. + * Currently (2019-12) there are no kernel machinery + * to do route cloning, though someone might explicitly + * add these routes to support some cases with active-active + * load balancing. Given that, retain this support. + */ + if (in_broadcast(addr4->sin_addr, req->ifp)) + req->rt_flags |= RTF_BROADCAST; + } + + + /* + * Check route MTU: + * inherit interface MTU if not set or + * check if MTU is too large. + */ + if (req->mtu == 0) { + req->mtu = req->ifp->if_mtu; + } else if (req->mtu > req->ifp->if_mtu) + req->mtu = req->ifp->if_mtu; + + /* Ensure that default route nhop has special flag */ + const struct sockaddr_in *mask4 = (const struct sockaddr_in *)mask; + if ((req->rt_flags & RTF_HOST) == 0 && mask4->sin_addr.s_addr == 0) + req->nh_flags_additional |= NHF_DEFAULT; + + /* Set nhop type to basic per-AF nhop */ + if (req->nh_type == 0) { + if (req->rt_flags & RTF_GATEWAY) + req->nh_type = NH_TYPE_IPV4_ETHER_NHOP; + else + req->nh_type = NH_TYPE_IPV4_ETHER_RSLV; + } + + return (0); +} + /* * Do what we need to do when inserting a route. */ @@ -124,6 +177,7 @@ if (rh == NULL) return (0); + rh->rnh_preadd = rib4_preadd; rh->rnh_addaddr = in_addroute; *head = (void *)rh; Index: sys/netinet6/in6_fib.h =================================================================== --- sys/netinet6/in6_fib.h +++ sys/netinet6/in6_fib.h @@ -58,5 +58,10 @@ uint32_t scopeid, uint32_t flags, uint32_t flowid, struct nhop6_extended *pnh6); void fib6_free_nh_ext(uint32_t fibnum, struct nhop6_extended *pnh6); +struct nhop_object *fib6_lookup_nh_ptr(uint32_t fibnum, + const struct in6_addr *dst6, uint32_t scopeid, uint32_t flags, + uint32_t flowid); +int fib6_lookup_urpf(uint32_t fibnum, const struct in6_addr *dst6, + uint32_t scopeid, uint32_t flags, const struct ifnet *src_if); #endif Index: sys/netinet6/in6_fib.c =================================================================== --- sys/netinet6/in6_fib.c +++ sys/netinet6/in6_fib.c @@ -50,6 +50,8 @@ #include #include #include +#include +#include #include #ifdef RADIX_MPATH @@ -68,94 +70,63 @@ #include #ifdef INET6 -static void fib6_rte_to_nh_extended(struct rtentry *rte, +static void fib6_rte_to_nh_extended(const struct nhop_object *nh, const struct in6_addr *dst, uint32_t flags, struct nhop6_extended *pnh6); -static void fib6_rte_to_nh_basic(struct rtentry *rte, const struct in6_addr *dst, +static void fib6_rte_to_nh_basic(const struct nhop_object *nh, const struct in6_addr *dst, uint32_t flags, struct nhop6_basic *pnh6); -static struct ifnet *fib6_get_ifaifp(struct rtentry *rte); #define RNTORT(p) ((struct rtentry *)(p)) #define ifatoia6(ifa) ((struct in6_ifaddr *)(ifa)) CHK_STRUCT_ROUTE_COMPAT(struct route_in6, ro_dst); -/* - * Gets real interface for the @rte. - * Returns rt_ifp for !IFF_LOOPBACK routers. - * Extracts "real" address interface from interface address - * loopback routes. - */ -static struct ifnet * -fib6_get_ifaifp(struct rtentry *rte) -{ - struct ifnet *ifp; - struct sockaddr_dl *sdl; - - ifp = rte->rt_ifp; - if ((ifp->if_flags & IFF_LOOPBACK) && - rte->rt_gateway->sa_family == AF_LINK) { - sdl = (struct sockaddr_dl *)rte->rt_gateway; - return (ifnet_byindex(sdl->sdl_index)); - } - return (ifp); -} static void -fib6_rte_to_nh_basic(struct rtentry *rte, const struct in6_addr *dst, +fib6_rte_to_nh_basic(const struct nhop_object *nh, const struct in6_addr *dst, uint32_t flags, struct nhop6_basic *pnh6) { - struct sockaddr_in6 *gw; /* Do explicit nexthop zero unless we're copying it */ memset(pnh6, 0, sizeof(*pnh6)); if ((flags & NHR_IFAIF) != 0) - pnh6->nh_ifp = fib6_get_ifaifp(rte); + pnh6->nh_ifp = nh->nh_aifp; else - pnh6->nh_ifp = rte->rt_ifp; + pnh6->nh_ifp = nh->nh_ifp; - pnh6->nh_mtu = min(rte->rt_mtu, IN6_LINKMTU(rte->rt_ifp)); - if (rte->rt_flags & RTF_GATEWAY) { + pnh6->nh_mtu = nh->nh_mtu; + if (nh->nh_flags & NHF_GATEWAY) { /* Return address with embedded scope. */ - gw = (struct sockaddr_in6 *)rte->rt_gateway; - pnh6->nh_addr = gw->sin6_addr; + pnh6->nh_addr = nh->gw6_sa.sin6_addr; } else pnh6->nh_addr = *dst; /* Set flags */ - pnh6->nh_flags = fib_rte_to_nh_flags(rte->rt_flags); - gw = (struct sockaddr_in6 *)rt_key(rte); - if (IN6_IS_ADDR_UNSPECIFIED(&gw->sin6_addr)) - pnh6->nh_flags |= NHF_DEFAULT; + pnh6->nh_flags = nh->nh_flags; } static void -fib6_rte_to_nh_extended(struct rtentry *rte, const struct in6_addr *dst, +fib6_rte_to_nh_extended(const struct nhop_object *nh, const struct in6_addr *dst, uint32_t flags, struct nhop6_extended *pnh6) { - struct sockaddr_in6 *gw; /* Do explicit nexthop zero unless we're copying it */ memset(pnh6, 0, sizeof(*pnh6)); if ((flags & NHR_IFAIF) != 0) - pnh6->nh_ifp = fib6_get_ifaifp(rte); + pnh6->nh_ifp = nh->nh_aifp; else - pnh6->nh_ifp = rte->rt_ifp; + pnh6->nh_ifp = nh->nh_ifp; - pnh6->nh_mtu = min(rte->rt_mtu, IN6_LINKMTU(rte->rt_ifp)); - if (rte->rt_flags & RTF_GATEWAY) { + pnh6->nh_mtu = nh->nh_mtu; + if (nh->nh_flags & NHF_GATEWAY) { /* Return address with embedded scope. */ - gw = (struct sockaddr_in6 *)rte->rt_gateway; - pnh6->nh_addr = gw->sin6_addr; + pnh6->nh_addr = nh->gw6_sa.sin6_addr; } else pnh6->nh_addr = *dst; /* Set flags */ - pnh6->nh_flags = fib_rte_to_nh_flags(rte->rt_flags); - gw = (struct sockaddr_in6 *)rt_key(rte); - if (IN6_IS_ADDR_UNSPECIFIED(&gw->sin6_addr)) - pnh6->nh_flags |= NHF_DEFAULT; - pnh6->nh_ia = ifatoia6(rte->rt_ifa); + pnh6->nh_flags = nh->nh_flags; + pnh6->nh_ia = ifatoia6(nh->nh_ifa); } /* @@ -180,7 +151,7 @@ struct rib_head *rh; struct radix_node *rn; struct sockaddr_in6 sin6; - struct rtentry *rte; + struct nhop_object *nh; KASSERT((fibnum < rt_numfibs), ("fib6_lookup_nh_basic: bad fibnum")); rh = rt_tables_get_rnh(fibnum, AF_INET6); @@ -198,10 +169,10 @@ RIB_RLOCK(rh); rn = rh->rnh_matchaddr((void *)&sin6, &rh->head); if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) { - rte = RNTORT(rn); + nh = RNTORT(rn)->rt_nhop; /* Ensure route & ifp is UP */ - if (RT_LINK_IS_UP(rte->rt_ifp)) { - fib6_rte_to_nh_basic(rte, &sin6.sin6_addr, flags, pnh6); + if (RT_LINK_IS_UP(nh->nh_ifp)) { + fib6_rte_to_nh_basic(nh, &sin6.sin6_addr, flags, pnh6); RIB_RUNLOCK(rh); return (0); } @@ -230,7 +201,7 @@ struct rib_head *rh; struct radix_node *rn; struct sockaddr_in6 sin6; - struct rtentry *rte; + struct nhop_object *nh; KASSERT((fibnum < rt_numfibs), ("fib6_lookup_nh_ext: bad fibnum")); rh = rt_tables_get_rnh(fibnum, AF_INET6); @@ -248,17 +219,10 @@ RIB_RLOCK(rh); rn = rh->rnh_matchaddr((void *)&sin6, &rh->head); if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) { - rte = RNTORT(rn); -#ifdef RADIX_MPATH - rte = rt_mpath_select(rte, flowid); - if (rte == NULL) { - RIB_RUNLOCK(rh); - return (ENOENT); - } -#endif + nh = RNTORT(rn)->rt_nhop; /* Ensure route & ifp is UP */ - if (RT_LINK_IS_UP(rte->rt_ifp)) { - fib6_rte_to_nh_extended(rte, &sin6.sin6_addr, flags, + if (RT_LINK_IS_UP(nh->nh_ifp)) { + fib6_rte_to_nh_extended(nh, &sin6.sin6_addr, flags, pnh6); if ((flags & NHR_REF) != 0) { /* TODO: Do lwref on egress ifp's */ @@ -279,5 +243,114 @@ } +/* + * + * Assumes scope is deembedded and provided in @scopeid + */ +struct nhop_object * +fib6_lookup_nh_ptr(uint32_t fibnum, const struct in6_addr *dst6, + uint32_t scopeid, uint32_t flags, uint32_t flowid) +{ + RIB_RLOCK_TRACKER; + struct rib_head *rh; + struct radix_node *rn; + struct nhop_object *nh; + struct sockaddr_in6 sin6; + + KASSERT((fibnum < rt_numfibs), ("fib6_lookup_nh_ext: bad fibnum")); + rh = rt_tables_get_rnh(fibnum, AF_INET6); + if (rh == NULL) + return (NULL); + + /* TODO: radix changes */ + //addr = *dst6; + /* Prepare lookup key */ + memset(&sin6, 0, sizeof(sin6)); + sin6.sin6_len = sizeof(struct sockaddr_in6); + sin6.sin6_addr = *dst6; + + /* Assume scopeid is valid and embed it directly */ + if (IN6_IS_SCOPE_LINKLOCAL(dst6)) + sin6.sin6_addr.s6_addr16[1] = htons(scopeid & 0xffff); + + RIB_RLOCK(rh); + rn = rh->rnh_matchaddr((void *)&sin6, &rh->head); + if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) { + nh = (RNTORT(rn))->rt_nhop; + /* Ensure route & ifp is UP */ + if (RT_LINK_IS_UP(nh->nh_ifp)) { + if (flags & NHR_REF) + nhop_ref_object(nh); + RIB_RUNLOCK(rh); + return (nh); + } + } + RIB_RUNLOCK(rh); + + RTSTAT_INC(rts_unreach); + return (NULL); +} + +inline static int +check_urpf(const struct nhop_object *nh, uint32_t flags, + const struct ifnet *src_if) +{ + + if (src_if != NULL && nh->nh_aifp == src_if) { + return (1); + } + if (src_if == NULL) { + if ((flags & NHR_NODEFAULT) == 0) + return (1); + else if ((nh->nh_flags & NHF_DEFAULT) == 0) + return (1); + } + + return (0); +} + +/* + * Performs reverse path forwarding lookup. + * If @src_if is non-zero, verifies that at least 1 path goes via + * this interface. + * If @src_if is zero, verifies that route exist. + * if @flags contains NHR_NOTDEFAULT, do not consider default route. + * + * Returns 1 if route matching conditions is found, 0 otherwise. + */ +int +fib6_lookup_urpf(uint32_t fibnum, const struct in6_addr *dst6, + uint32_t scopeid, uint32_t flags, const struct ifnet *src_if) +{ + RIB_RLOCK_TRACKER; + struct rib_head *rh; + struct radix_node *rn; + struct nhop_object *nh; + struct in6_addr addr; + int ret; + + KASSERT((fibnum < rt_numfibs), ("fib6_lookup_nh_ext: bad fibnum")); + rh = rt_tables_get_rnh(fibnum, AF_INET6); + if (rh == NULL) + return (0); + + addr = *dst6; + /* Assume scopeid is valid and embed it directly */ + if (IN6_IS_SCOPE_LINKLOCAL(dst6)) + addr.s6_addr16[1] = htons(scopeid & 0xffff); + + RIB_RLOCK(rh); + rn = rh->rnh_matchaddr((void *)&addr, &rh->head); + if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) { + nh = (RNTORT(rn))->rt_nhop; + ret = check_urpf(nh, flags, src_if); + RIB_RUNLOCK(rh); + return (ret); + } + RIB_RUNLOCK(rh); + + return (0); +} + #endif Index: sys/netinet6/in6_rmx.c =================================================================== --- sys/netinet6/in6_rmx.c +++ sys/netinet6/in6_rmx.c @@ -80,6 +80,7 @@ #include #include #include +#include #include #include @@ -101,6 +102,39 @@ extern int in6_detachhead(void **head, int off); #endif +static int +rib6_preadd(u_int fibnum, const struct sockaddr *addr, const struct sockaddr *mask, + struct nhop_request *req) +{ + + /* XXX: RTF_LOCAL */ + + /* + * Check route MTU: + * inherit interface MTU if not set or + * check if MTU is too large. + */ + if (req->mtu == 0) { + req->mtu = IN6_LINKMTU(req->ifp); + } else if (req->mtu > IN6_LINKMTU(req->ifp)) + req->mtu = IN6_LINKMTU(req->ifp); + + /* Ensure that default route nhop has special flag */ + const struct sockaddr_in6 *mask6 = (const struct sockaddr_in6 *)mask; + if ((req->rt_flags & RTF_HOST) == 0 && IN6_IS_ADDR_UNSPECIFIED(&mask6->sin6_addr)) + req->nh_flags_additional |= NHF_DEFAULT; + + /* Set nexthop type */ + if (req->nh_type == 0) { + if (req->rt_flags & RTF_GATEWAY) + req->nh_type = NH_TYPE_IPV6_ETHER_NHOP; + else + req->nh_type = NH_TYPE_IPV6_ETHER_RSLV; + } + + return (0); +} + /* * Do what we need to do when inserting a route. */ @@ -167,6 +201,7 @@ return (0); rh->rnh_addaddr = in6_addroute; + rh->rnh_preadd = rib6_preadd; *head = (void *)rh; return (1); Index: sys/sys/socket.h =================================================================== --- sys/sys/socket.h +++ sys/sys/socket.h @@ -416,6 +416,7 @@ #define NET_RT_IFMALIST 4 /* return multicast address list */ #define NET_RT_IFLISTL 5 /* Survey interface list, using 'l'en * versions of msghdr structs. */ +#define NET_RT_NHOP 6 /* dump routing nexthops */ #endif /* __BSD_VISIBLE */ /* Index: usr.bin/netstat/Makefile =================================================================== --- usr.bin/netstat/Makefile +++ usr.bin/netstat/Makefile @@ -5,7 +5,7 @@ PROG= netstat SRCS= if.c inet.c main.c mbuf.c mroute.c netisr.c nl_symbols.c route.c \ - unix.c mroute6.c ipsec.c bpf.c pfkey.c sctp.c \ + unix.c mroute6.c ipsec.c bpf.c pfkey.c sctp.c common.c nhops.c \ nl_defs.h nl_symbols.c: nlist_symbols Index: usr.bin/netstat/common.h =================================================================== --- /dev/null +++ usr.bin/netstat/common.h @@ -0,0 +1,24 @@ +#ifndef _NETSTAT_COMMON_H_ +#define _NETSTAT_COMMON_H_ + +struct bits { + u_long b_mask; + char b_val; + const char *b_name; +}; +extern struct bits rt_bits[]; + +const char *fmt_flags(const struct bits *p, int f); +void print_flags_generic(int flags, const struct bits *pbits, + const char *format, const char *tag_name); +int print_sockaddr(const char *name, struct sockaddr *sa, + struct sockaddr *mask, int flags, int width); + +struct ifmap_entry { + char ifname[IFNAMSIZ]; +}; + +struct ifmap_entry *prepare_ifmap(size_t *ifmap_size); + +#endif + Index: usr.bin/netstat/common.c =================================================================== --- /dev/null +++ usr.bin/netstat/common.c @@ -0,0 +1,140 @@ +/*- + * SPDX-License-Identifier: BSD-3-Clause + * + * Copyright (c) 1983, 1988, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "netstat.h" +#include "common.h" + +const char * +fmt_flags(const struct bits *p, int f) +{ + static char name[33]; + char *flags; + + for (flags = name; p->b_mask; p++) + if (p->b_mask & f) + *flags++ = p->b_val; + *flags = '\0'; + return (name); +} + +void +print_flags_generic(int flags, const struct bits *pbits, const char *format, + const char *tag_name) +{ + const struct bits *p; + char tag_fmt[64]; + + xo_emit(format, fmt_flags(pbits, flags)); + + snprintf(tag_fmt, sizeof(tag_fmt), "{le:%s/%%s}", tag_name); + xo_open_list(tag_name); + for (p = pbits; p->b_mask; p++) + if (p->b_mask & flags) + xo_emit(tag_fmt, p->b_name); + xo_close_list(tag_name); +} + +struct ifmap_entry * +prepare_ifmap(size_t *pifmap_size) +{ + int ifindex = 0, size; + struct ifaddrs *ifap, *ifa; + struct sockaddr_dl *sdl; + + struct ifmap_entry *ifmap = NULL; + int ifmap_size = 0; + + /* + * Retrieve interface list at first + * since we need #ifindex -> if_xname match + */ + if (getifaddrs(&ifap) != 0) + err(EX_OSERR, "getifaddrs"); + + for (ifa = ifap; ifa; ifa = ifa->ifa_next) { + + if (ifa->ifa_addr->sa_family != AF_LINK) + continue; + + sdl = (struct sockaddr_dl *)ifa->ifa_addr; + ifindex = sdl->sdl_index; + + if (ifindex >= ifmap_size) { + size = roundup(ifindex + 1, 32) * + sizeof(struct ifmap_entry); + if ((ifmap = realloc(ifmap, size)) == NULL) + errx(2, "realloc(%d) failed", size); + memset(&ifmap[ifmap_size], 0, + size - ifmap_size * + sizeof(struct ifmap_entry)); + + ifmap_size = roundup(ifindex + 1, 32); + } + + if (*ifmap[ifindex].ifname != '\0') + continue; + + strlcpy(ifmap[ifindex].ifname, ifa->ifa_name, IFNAMSIZ); + } + + freeifaddrs(ifap); + + *pifmap_size = ifmap_size; + + return (ifmap); +} + Index: usr.bin/netstat/main.c =================================================================== --- usr.bin/netstat/main.c +++ usr.bin/netstat/main.c @@ -214,6 +214,7 @@ int noutputs = 0; /* how much outputs before we exit */ int numeric_addr; /* show addresses numerically */ int numeric_port; /* show ports numerically */ +int oflag; /* show nexthop objects*/ int Pflag; /* show TCP log ID */ static int pflag; /* show given protocol */ static int Qflag; /* show netisr information */ @@ -248,7 +249,7 @@ if (argc < 0) exit(EXIT_FAILURE); - while ((ch = getopt(argc, argv, "46AaBbdF:f:ghI:iLlM:mN:nPp:Qq:RrSTsuWw:xz")) + while ((ch = getopt(argc, argv, "46AaBbdF:f:ghI:iLlM:mN:noPp:Qq:RrSTsuWw:xz")) != -1) switch(ch) { case '4': @@ -345,6 +346,9 @@ case 'n': numeric_addr = numeric_port = 1; break; + case 'o': + oflag = 1; + break; case 'P': Pflag = 1; break; @@ -494,6 +498,15 @@ xo_finish(); exit(0); } + if (oflag) { + xo_open_container("statistics"); + nhops_print(fib, af); + //nhgrp_print(fib, af); + xo_close_container("statistics"); + xo_finish(); + exit(0); + } + if (gflag) { xo_open_container("statistics"); Index: usr.bin/netstat/netstat.h =================================================================== --- usr.bin/netstat/netstat.h +++ usr.bin/netstat/netstat.h @@ -147,6 +147,10 @@ char *routename(struct sockaddr *, int); const char *netname(struct sockaddr *, struct sockaddr *); void routepr(int, int); +int p_sockaddr(const char *name, struct sockaddr *sa, + struct sockaddr *mask, int flags, int width); +const char *fmt_sockaddr(struct sockaddr *sa, struct sockaddr *mask, + int flags); #ifdef NETGRAPH void netgraphprotopr(u_long, const char *, int, int); @@ -157,3 +161,5 @@ void mroutepr(void); void mrt_stats(void); void bpf_stats(char *); +void nhops_print(int fibnum, int af); +void nhgrp_print(int fibnum, int af); Index: usr.bin/netstat/nhops.c =================================================================== --- /dev/null +++ usr.bin/netstat/nhops.c @@ -0,0 +1,724 @@ +/*- + * SPDX-License-Identifier: BSD-3-Clause + * + * Copyright (c) 1983, 1988, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "netstat.h" +#include "common.h" + +/* column widths; each followed by one space */ +#ifndef INET6 +#define WID_DST_DEFAULT(af) 18 /* width of destination column */ +#define WID_GW_DEFAULT(af) 18 /* width of gateway column */ +#define WID_IF_DEFAULT(af) (Wflag ? 10 : 8) /* width of netif column */ +#else +#define WID_DST_DEFAULT(af) \ + ((af) == AF_INET6 ? (numeric_addr ? 33: 18) : 18) +#define WID_GW_DEFAULT(af) \ + ((af) == AF_INET6 ? (numeric_addr ? 29 : 18) : 18) +#define WID_IF_DEFAULT(af) ((af) == AF_INET6 ? 8 : (Wflag ? 10 : 8)) +#endif /*INET6*/ +static int wid_dst; +static int wid_gw; +static int wid_flags; +static int wid_pksent; +static int wid_mtu; +static int wid_if; +static int wid_nhidx; +static int wid_nhtype; +static int wid_refcnt; +static int wid_prepend; + +static struct bits nh_bits[] = { + { NHF_REJECT, 'R', "reject" }, + { NHF_BLACKHOLE,'B', "blackhole" }, + { NHF_REDIRECT, 'r', "redirect" }, + { NHF_GATEWAY, 'G', "gateway" }, + { NHF_DEFAULT, 'd', "default" }, + { NHF_BROADCAST,'b', "broadcast" }, + { 0 , 0, NULL } +}; + +static char *nh_types[] = { + "empty", /* 0 */ + "v4/resolve", /* 1 */ + "v4/gw", + "v6/resolve", + "v6/gw" +}; + +struct nhop_entry { + char gw[64]; + char ifname[IFNAMSIZ]; +}; + +struct nhop_map { + struct nhop_entry *ptr; + size_t size; +}; +static struct nhop_map global_nhop_map; + +static void nhop_map_update(struct nhop_map *map, uint32_t idx, + char *gw, char *ifname); +static struct nhop_entry *nhop_get(struct nhop_map *map, uint32_t idx); + + +static struct ifmap_entry *ifmap; +static size_t ifmap_size; + +static void +print_sockaddr_buf(char *buf, size_t bufsize, const struct sockaddr *sa) +{ + + switch (sa->sa_family) { + case AF_INET: + inet_ntop(AF_INET, &((struct sockaddr_in *)sa)->sin_addr, + buf, bufsize); + break; + case AF_INET6: + inet_ntop(AF_INET6, &((struct sockaddr_in6 *)sa)->sin6_addr, + buf, bufsize); + break; + default: + snprintf(buf, bufsize, "unknown:%d", sa->sa_family); + break; + } +} + +static int +print_addr(const char *name, const char *addr, int width) +{ + char buf[128]; + int protrusion; + + if (width < 0) { + snprintf(buf, sizeof(buf), "{:%s/%%s} ", name); + xo_emit(buf, addr); + protrusion = 0; + } else { + if (Wflag != 0 || numeric_addr) { + snprintf(buf, sizeof(buf), "{[:%d}{:%s/%%s}{]:} ", + -width, name); + xo_emit(buf, addr); + protrusion = strlen(addr) - width; + if (protrusion < 0) + protrusion = 0; + } else { + snprintf(buf, sizeof(buf), "{[:%d}{:%s/%%-.*s}{]:} ", + -width, name); + xo_emit(buf, width, addr); + protrusion = 0; + } + } + return (protrusion); +} + + +static void +print_nhop_header(int af1 __unused) +{ + + if (Wflag) { + xo_emit("{T:/%-*.*s} {T:/%-*.*s} {T:/%-*.*s} {T:/%-*.*s} {T:/%*.*s} " + "{T:/%*.*s} {T:/%-*.*s} {T:/%*.*s} {T:/%*.*s} {T:/%*.*s} {T:/%*s}\n", + wid_nhidx, wid_nhidx, "Idx", + wid_nhtype, wid_nhtype, "Type", + wid_dst, wid_dst, "IFA", + wid_gw, wid_gw, "Gateway", + wid_flags, wid_flags, "Flags", + wid_pksent, wid_pksent, "Use", + wid_mtu, wid_mtu, "Mtu", + wid_if, wid_if, "Netif", + wid_if, wid_if, "Addrif", + wid_refcnt, wid_refcnt, "Refcnt", + wid_prepend, "Prepend"); + } else { + xo_emit("{T:/%-*.*s} {T:/%-*.*s} {T:/%-*.*s} {T:/%-*.*s} {T:/%*.*s} " + " {T:/%*s}\n", + wid_nhidx, wid_nhidx, "Idx", + wid_dst, wid_dst, "IFA", + wid_gw, wid_gw, "Gateway", + wid_flags, wid_flags, "Flags", + wid_if, wid_if, "Netif", + wid_prepend, "Refcnt"); + } +} + +static void +print_nhgroup_header(int af1 __unused) +{ + + xo_emit("{T:/%-*.*s} {T:/%-*.*s} {T:/%-*.*s} {T:/%-*.*s} {T:/%*.*s}" + " {T:/%-*.*s} {T:/%*s}\n", + wid_nhidx, wid_nhidx, "MpIdx", + wid_nhidx, wid_nhidx, "NHIdx", + wid_nhidx, wid_nhidx, "Weight", + wid_nhidx, wid_nhidx, "Slots", + wid_gw, wid_gw, "Gateway", + wid_if, wid_if, "Netif", + wid_nhidx, "Refcnt"); +} + +static void +print_nhgroup_entry_sysctl(const char *name, struct rt_msghdr *rtm, + struct mpath_external *mpe) +{ + char buffer[128]; + struct nhop_entry *ne; + + xo_open_instance(name); + + snprintf(buffer, sizeof(buffer), "{[:-%d}{:mp_index/%%lu}{]:} ", wid_nhidx); + xo_emit(buffer, mpe->mp_idx); + + xo_emit("{t:dummy-1/%*.*s}", wid_nhidx, wid_nhidx, "----"); + xo_emit("{t:dummy-2/%*.*s}", wid_nhidx, wid_nhidx, "----"); + xo_emit("{t:dummy-2/%*.*s}", wid_nhidx, wid_nhidx, "----"); + xo_emit("{t:dummy-3/%*.*s}", wid_gw, wid_gw, "----"); + xo_emit("{t:dummy-4/%*.*s}", wid_if, wid_if, "----"); + xo_emit("{t:mp-refcnt/%*lu}", wid_nhidx, mpe->mp_refcount); + xo_emit("\n"); + + struct mpath_nhop_external *ext; + ext = (struct mpath_nhop_external *)(mpe + 1); + + uint32_t *fwd_c = calloc(sizeof(uint32_t), global_nhop_map.size); + uint32_t *pidx; + pidx = (uint32_t *)&ext[mpe->mp_nh_count]; + for (uint32_t i = 0; i < mpe->mp_group_size; i++) { + fwd_c[pidx[i]]++; + } + + xo_open_list("nhop_weights"); + for (uint32_t i = 0; i < mpe->mp_nh_count; i++) { + xo_open_instance("nhop-weight"); + snprintf(buffer, sizeof(buffer), "{[:-%d}{:d-0/%%s}{]:} ", wid_nhidx); + xo_emit(buffer, ""); + // nh index + xo_emit("{t:nh-index/%*lu} ", wid_nhidx, ext[i].nh_idx); + xo_emit("{t:nh-weight/%*lu} ", wid_nhidx, ext[i].nh_weight); + xo_emit("{t:nh-slots/%*lu} ", wid_nhidx, fwd_c[ext[i].nh_idx]); + ne = nhop_get(&global_nhop_map, ext[i].nh_idx); + if (ne != NULL) { + xo_emit("{t:nh-gw/%*.*s}", wid_gw, wid_gw, ne->gw); + xo_emit("{t:nh-interface/%*.*s}", wid_if, wid_if, ne->ifname); + } + xo_emit("\n"); + xo_close_instance("nhop-weight"); + } + xo_close_list("nhop_weights"); + +#if 0 + xo_emit("{t:dummy-2/%*.*s}", wid_nhidx, wid_nhidx, ""); + xo_emit("{t:dummy-1/%*.*s}", wid_nhidx, wid_nhidx, "----"); + xo_emit("\n"); + + uint32_t *pidx; + pidx = (uint32_t *)&ext[mpe->mp_nh_count]; + xo_open_list("fwd-nhops"); + for (uint32_t i = 0; i < mpe->mp_group_size; i++) { + xo_open_instance("fwd-nhop"); + snprintf(buffer, sizeof(buffer), "{[:-%d}{:d-0/%%s}{]:} ", wid_nhidx); + xo_emit(buffer, ""); + // nh index + xo_emit("{t:nh-index/%*lu} ", wid_nhidx, pidx[i]); + ne = nhop_get(&global_nhop_map, pidx[i]); + if (ne != NULL) { + xo_emit("{t:dummy-2/%*.*s}", wid_nhidx, wid_nhidx, ""); + xo_emit("{t:nh-gw/%*.*s}", wid_gw, wid_gw, ne->gw); + xo_emit("{t:nh-interface/%*.*s}", wid_if, wid_if, ne->ifname); + } + xo_emit("\n"); + xo_close_instance("fwd-nhop"); + } + xo_close_list("fwd-nhops"); +#endif +#if 0 + if (Wflag) { + char *cp = nh_types[nh->nh_type]; + xo_emit("{t:type_str/%*s} ", wid_nhtype, cp); + } + memset(iface_name, 0, sizeof(iface_name)); + if (nh->ifindex < (uint32_t)ifmap_size) { + strlcpy(iface_name, ifmap[nh->ifindex].ifname, + sizeof(iface_name)); + if (*iface_name == '\0') + strlcpy(iface_name, "---", sizeof(iface_name)); + } + + //inet_ntop(nh->nh_family, &nh->nh_src, src_addr, sizeof(src_addr)); + //protrusion = p_addr("ifa", src_addr, wid_dst); + sa_gw = (struct sockaddr *)(nh + 1); + sa_ifa = (struct sockaddr *)((char *)sa_gw + sa_gw->sa_len); + protrusion = p_sockaddr("ifa", sa_ifa, NULL, RTF_HOST, wid_dst); + + if (nh->nh_flags & NHF_GATEWAY) { + const char *cp; + cp = fmt_sockaddr(sa_gw, NULL, RTF_HOST); + strlcpy(gw_addr, cp, sizeof(gw_addr)); + } else + snprintf(gw_addr, sizeof(gw_addr), "%s/resolve", iface_name); + protrusion = p_addr("gateway", gw_addr, wid_dst - protrusion); + + snprintf(buffer, sizeof(buffer), "{[:-%d}{:flags/%%s}{]:} ", + wid_flags - protrusion); + + //p_nhflags(nh->nh_flags, buffer); + p_flags(rtm->rtm_flags, buffer); + + if (Wflag) { + xo_emit("{t:use/%*lu} ", wid_pksent, nh->nh_pksent); + xo_emit("{t:mtu/%*lu} ", wid_mtu, nh->nh_mtu); + } + //printf("IDX: %d IFACE: %s FAMILY: %d TYPE: %d FLAGS: %X GW \n"); + + if (Wflag) + xo_emit("{t:interface-name/%*s}", wid_if, iface_name); + else + xo_emit("{t:interface-name/%*.*s}", wid_if, wid_if, iface_name); + + memset(iface_name, 0, sizeof(iface_name)); + if (nh->aifindex < (uint32_t)ifmap_size && nh->ifindex != nh->aifindex) { + strlcpy(iface_name, ifmap[nh->aifindex].ifname, + sizeof(iface_name)); + if (*iface_name == '\0') + strlcpy(iface_name, "---", sizeof(iface_name)); + } + if (Wflag) + xo_emit("{t:address-interface-name/%*s}", wid_if, iface_name); + + xo_emit("{t:refcount/%*lu} ", wid_refcnt, nh->nh_refcount); + if (Wflag && nh->prepend_len) { + char *prepend_hex = "AABBCCDDEE"; + xo_emit(" {:nhop-prepend/%*s}", wid_prepend, prepend_hex); + } +#endif + //xo_emit("\n"); + xo_close_instance(name); +} + + +static void +print_nhgrp_sysctl(int fibnum, int af) +{ + size_t needed; + int mib[7]; + char *buf, *next, *lim; + struct rt_msghdr *rtm; + struct mpath_external *mp; + int fam = AF_UNSPEC; + int need_table_close = false; + + mib[0] = CTL_NET; + mib[1] = PF_ROUTE; + mib[2] = 0; + mib[3] = af; + mib[4] = NET_RT_NHGROUPS; + mib[5] = 0; + mib[6] = fibnum; + if (sysctl(mib, nitems(mib), NULL, &needed, NULL, 0) < 0) + err(EX_OSERR, "sysctl: net.route.0.%d.nhgrpdump.%d estimate", + af, fibnum); + if ((buf = malloc(needed)) == NULL) + errx(2, "malloc(%lu)", (unsigned long)needed); + if (sysctl(mib, nitems(mib), buf, &needed, NULL, 0) < 0) + err(1, "sysctl: net.route.0.%d.nhgrpdump.%d", af, fibnum); + printf("BUF: %zu\n", needed); + lim = buf + needed; + xo_open_container("nhgrp-table"); + xo_open_list("rt-family"); + for (next = buf; next < lim; next += rtm->rtm_msglen) { + rtm = (struct rt_msghdr *)next; + if (rtm->rtm_version != RTM_VERSION) + continue; + + mp = (struct mpath_external *)(rtm + 1); + /* + * Peek inside header to determine AF + */ + /* Only print family first time. */ + if (fam != af) { + if (need_table_close) { + xo_close_list("nhgrp-entry"); + xo_close_instance("rt-family"); + } + need_table_close = true; + + fam = af; + wid_dst = WID_GW_DEFAULT(fam); + wid_gw = WID_GW_DEFAULT(fam); + wid_nhidx = 5; + wid_nhtype = 12; + wid_refcnt = 6; + wid_flags = 6; + wid_pksent = 8; + wid_mtu = 6; + wid_if = WID_IF_DEFAULT(fam); + xo_open_instance("rt-family"); + pr_family(fam); + xo_open_list("nhgrp-entry"); + + print_nhgroup_header(fam); + } + print_nhgroup_entry_sysctl("nhgrp-entry", rtm, mp); + } + if (need_table_close) { + xo_close_list("nhgrp-entry"); + xo_close_instance("rt-family"); + } + xo_close_list("rt-family"); + xo_close_container("nhgrp-table"); + free(buf); +} + +static void +nhop_map_update(struct nhop_map *map, uint32_t idx, char *gw, char *ifname) +{ + if (idx >= map->size) { + uint32_t new_size; + size_t sz; + if (map->size == 0) + new_size = 32; + else + new_size = map->size * 2; + if (new_size <= idx) + new_size = roundup(idx + 1, 32); + + sz = new_size * (sizeof(struct nhop_entry)); + if ((map->ptr = realloc(map->ptr, sz)) == NULL) + errx(2, "realloc(%lu) failed", sz); + + memset(&map->ptr[map->size], 0, (new_size - map->size) * sizeof(struct nhop_entry)); + map->size = new_size; + } + + strlcpy(map->ptr[idx].ifname, ifname, sizeof(map->ptr[idx].ifname)); + strlcpy(map->ptr[idx].gw, gw, sizeof(map->ptr[idx].gw)); +} + +static struct nhop_entry * +nhop_get(struct nhop_map *map, uint32_t idx) +{ + + if (idx >= map->size) + return (NULL); + if (*map->ptr[idx].ifname == '\0') + return (NULL); + return &map->ptr[idx]; +} + +static void +print_nhop_entry_sysctl(const char *name, struct rt_msghdr *rtm, struct nhop_external *nh) +{ + char buffer[128]; + char iface_name[128]; + int protrusion; + char gw_addr[64]; + struct nhop_addrs *na; + struct sockaddr *sa_gw, *sa_ifa; + + xo_open_instance(name); + + snprintf(buffer, sizeof(buffer), "{[:-%d}{:index/%%lu}{]:} ", wid_nhidx); + //xo_emit("{t:index/%-lu} ", wid_nhidx, nh->nh_idx); + xo_emit(buffer, nh->nh_idx); + + if (Wflag) { + char *cp = nh_types[nh->nh_type]; + xo_emit("{t:type_str/%*s} ", wid_nhtype, cp); + } + memset(iface_name, 0, sizeof(iface_name)); + if (nh->ifindex < (uint32_t)ifmap_size) { + strlcpy(iface_name, ifmap[nh->ifindex].ifname, + sizeof(iface_name)); + if (*iface_name == '\0') + strlcpy(iface_name, "---", sizeof(iface_name)); + } + + na = (struct nhop_addrs *)((char *)nh + nh->nh_len); + //inet_ntop(nh->nh_family, &nh->nh_src, src_addr, sizeof(src_addr)); + //protrusion = p_addr("ifa", src_addr, wid_dst); + sa_gw = (struct sockaddr *)((char *)na + na->gw_sa_off); + sa_ifa = (struct sockaddr *)((char *)na + na->src_sa_off); + protrusion = p_sockaddr("ifa", sa_ifa, NULL, RTF_HOST, wid_dst); + + if (nh->nh_flags & NHF_GATEWAY) { + const char *cp; + cp = fmt_sockaddr(sa_gw, NULL, RTF_HOST); + strlcpy(gw_addr, cp, sizeof(gw_addr)); + } else + snprintf(gw_addr, sizeof(gw_addr), "%s/resolve", iface_name); + protrusion = print_addr("gateway", gw_addr, wid_dst - protrusion); + + nhop_map_update(&global_nhop_map, nh->nh_idx, gw_addr, iface_name); + + snprintf(buffer, sizeof(buffer), "{[:-%d}{:flags/%%s}{]:} ", + wid_flags - protrusion); + + //p_nhflags(nh->nh_flags, buffer); + print_flags_generic(rtm->rtm_flags, rt_bits, buffer, "rt_flags_pretty"); + + if (Wflag) { + xo_emit("{t:use/%*lu} ", wid_pksent, nh->nh_pksent); + xo_emit("{t:mtu/%*lu} ", wid_mtu, nh->nh_mtu); + } + //printf("IDX: %d IFACE: %s FAMILY: %d TYPE: %d FLAGS: %X GW \n"); + + if (Wflag) + xo_emit("{t:interface-name/%*s}", wid_if, iface_name); + else + xo_emit("{t:interface-name/%*.*s}", wid_if, wid_if, iface_name); + + memset(iface_name, 0, sizeof(iface_name)); + if (nh->aifindex < (uint32_t)ifmap_size && nh->ifindex != nh->aifindex) { + strlcpy(iface_name, ifmap[nh->aifindex].ifname, + sizeof(iface_name)); + if (*iface_name == '\0') + strlcpy(iface_name, "---", sizeof(iface_name)); + } + if (Wflag) + xo_emit("{t:address-interface-name/%*s}", wid_if, iface_name); + + xo_emit("{t:refcount/%*lu} ", wid_refcnt, nh->nh_refcount); + if (Wflag && nh->prepend_len) { + char *prepend_hex = "AABBCCDDEE"; + xo_emit(" {:nhop-prepend/%*s}", wid_prepend, prepend_hex); + } + + xo_emit("\n"); + xo_close_instance(name); +} + +struct nhops_map { + uint32_t idx; + struct rt_msghdr *rtm; +}; + +static int +cmp_nh_idx(const void *_a, const void *_b) +{ + const struct nhops_map *a, *b; + + a = _a; + b = _b; + + if (a->idx > b->idx) + return (1); + else if (a->idx < b->idx) + return (-1); + return (0); +} + +static void +print_nhops_sysctl(int fibnum, int af) +{ + size_t needed; + int mib[7]; + char *buf, *next, *lim; + struct rt_msghdr *rtm; + struct nhop_external *nh; + int fam; + struct nhops_map *nh_map; + size_t nh_count, nh_size; + + mib[0] = CTL_NET; + mib[1] = PF_ROUTE; + mib[2] = 0; + mib[3] = af; + mib[4] = NET_RT_NHOP; + mib[5] = 0; + mib[6] = fibnum; + if (sysctl(mib, nitems(mib), NULL, &needed, NULL, 0) < 0) + err(EX_OSERR, "sysctl: net.route.0.%d.nhdump.%d estimate", af, + fibnum); + if ((buf = malloc(needed)) == NULL) + errx(2, "malloc(%lu)", (unsigned long)needed); + if (sysctl(mib, nitems(mib), buf, &needed, NULL, 0) < 0) + err(1, "sysctl: net.route.0.%d.nhdump.%d", af, fibnum); + lim = buf + needed; + xo_open_container("nhop-table"); + xo_open_list("rt-family"); + + /* + * nexhops are received unsorted. Collect everything first, sort and then display + * sorted. + */ + nh_count = 0; + nh_size = 16; + nh_map = calloc(nh_size, sizeof(struct nhops_map)); + for (next = buf; next < lim; next += rtm->rtm_msglen) { + rtm = (struct rt_msghdr *)next; + if (rtm->rtm_version != RTM_VERSION) + continue; + + if (nh_count >= nh_size) { + nh_size *= 2; + nh_map = realloc(nh_map, nh_size * sizeof(struct nhops_map)); + } + + nh = (struct nhop_external *)(rtm + 1); + nh_map[nh_count].idx = nh->nh_idx; + nh_map[nh_count].rtm = rtm; + nh_count++; + } + + if (nh_count > 0) { + qsort(nh_map, nh_count, sizeof(struct nhops_map), cmp_nh_idx); + nh = (struct nhop_external *)(nh_map[0].rtm + 1); + fam = nh->nh_family; + + wid_dst = WID_GW_DEFAULT(fam); + wid_gw = WID_GW_DEFAULT(fam); + wid_nhidx = 5; + wid_nhtype = 12; + wid_refcnt = 6; + wid_flags = 6; + wid_pksent = 8; + wid_mtu = 6; + wid_if = WID_IF_DEFAULT(fam); + xo_open_instance("rt-family"); + pr_family(fam); + xo_open_list("nh-entry"); + + print_nhop_header(fam); + + for (size_t i = 0; i < nh_count; i++) { + rtm = nh_map[i].rtm; + nh = (struct nhop_external *)(rtm + 1); + print_nhop_entry_sysctl("nh-entry", rtm, nh); + } + + xo_close_list("nh-entry"); + xo_close_instance("rt-family"); + } + xo_close_list("rt-family"); + xo_close_container("nhop-table"); + free(buf); +} + +static void +p_nhflags(int f, const char *format) +{ + struct bits *p; + char *pretty_name = "nh_flags_pretty"; + + xo_emit(format, fmt_flags(nh_bits, f)); + + xo_open_list(pretty_name); + for (p = nh_bits; p->b_mask; p++) + if (p->b_mask & f) + xo_emit("{le:nh_flags_pretty/%s}", p->b_name); + xo_close_list(pretty_name); +} + +void +nhops_print(int fibnum, int af) +{ + size_t intsize; + int numfibs; + + intsize = sizeof(int); + if (fibnum == -1 && + sysctlbyname("net.my_fibnum", &fibnum, &intsize, NULL, 0) == -1) + fibnum = 0; + if (sysctlbyname("net.fibs", &numfibs, &intsize, NULL, 0) == -1) + numfibs = 1; + if (fibnum < 0 || fibnum > numfibs - 1) + errx(EX_USAGE, "%d: invalid fib", fibnum); + + ifmap = prepare_ifmap(&ifmap_size); + + xo_open_container("route-nhop-information"); + xo_emit("{T:Nexthop data}"); + if (fibnum) + xo_emit(" ({L:fib}: {:fib/%d})", fibnum); + xo_emit("\n"); + print_nhops_sysctl(fibnum, af); + xo_close_container("route-nhop-information"); +} + +void +nhgrp_print(int fibnum, int af) +{ + size_t intsize; + int numfibs; + + intsize = sizeof(int); + if (fibnum == -1 && + sysctlbyname("net.my_fibnum", &fibnum, &intsize, NULL, 0) == -1) + fibnum = 0; + if (sysctlbyname("net.fibs", &numfibs, &intsize, NULL, 0) == -1) + numfibs = 1; + if (fibnum < 0 || fibnum > numfibs - 1) + errx(EX_USAGE, "%d: invalid fib", fibnum); + + ifmap = prepare_ifmap(&ifmap_size); + + xo_open_container("route-nhgrp-information"); + xo_emit("{T:Nexthop groups data}"); + if (fibnum) + xo_emit(" ({L:fib}: {:fib/%d})", fibnum); + xo_emit("\n"); + print_nhgrp_sysctl(fibnum, af); + xo_close_container("route-nhgrp-information"); +} Index: usr.bin/netstat/route.c =================================================================== --- usr.bin/netstat/route.c +++ usr.bin/netstat/route.c @@ -36,7 +36,7 @@ #endif #include -__FBSDID("$FreeBSD$"); +__FBSDID("$FreeBSD: head/usr.bin/netstat/route.c 355840 2019-12-17 02:02:26Z glebius $"); #include #include @@ -69,16 +69,13 @@ #include #include #include "netstat.h" +#include "common.h" #include "nl_defs.h" /* * Definitions for showing gateway flags. */ -static struct bits { - u_long b_mask; - char b_val; - const char *b_name; -} bits[] = { +struct bits rt_bits[] = { { RTF_UP, 'U', "up" }, { RTF_GATEWAY, 'G', "gateway" }, { RTF_HOST, 'H', "host" }, @@ -99,11 +96,8 @@ { 0 , 0, NULL } }; -struct ifmap_entry { - char ifname[IFNAMSIZ]; -}; static struct ifmap_entry *ifmap; -static int ifmap_size; +static size_t ifmap_size; static struct timespec uptime; static const char *netname4(in_addr_t, in_addr_t); @@ -112,12 +106,7 @@ #endif static void p_rtable_sysctl(int, int); static void p_rtentry_sysctl(const char *name, struct rt_msghdr *); -static int p_sockaddr(const char *name, struct sockaddr *, struct sockaddr *, - int, int); -static const char *fmt_sockaddr(struct sockaddr *sa, struct sockaddr *mask, - int flags); static void p_flags(int, const char *); -static const char *fmt_flags(int f); static void domask(char *, size_t, u_long); @@ -229,7 +218,7 @@ wid_dst, wid_dst, "Destination", wid_gw, wid_gw, "Gateway", wid_flags, wid_flags, "Flags", - wid_pksent, wid_pksent, "Use", + wid_mtu, wid_mtu, "Nhop#", wid_mtu, wid_mtu, "Mtu", wid_if, wid_if, "Netif", wid_expire, "Expire"); @@ -252,46 +241,10 @@ char *buf, *next, *lim; struct rt_msghdr *rtm; struct sockaddr *sa; - int fam = AF_UNSPEC, ifindex = 0, size; + int fam = AF_UNSPEC; int need_table_close = false; - struct ifaddrs *ifap, *ifa; - struct sockaddr_dl *sdl; - - /* - * Retrieve interface list at first - * since we need #ifindex -> if_xname match - */ - if (getifaddrs(&ifap) != 0) - err(EX_OSERR, "getifaddrs"); - - for (ifa = ifap; ifa; ifa = ifa->ifa_next) { - - if (ifa->ifa_addr->sa_family != AF_LINK) - continue; - - sdl = (struct sockaddr_dl *)ifa->ifa_addr; - ifindex = sdl->sdl_index; - - if (ifindex >= ifmap_size) { - size = roundup(ifindex + 1, 32) * - sizeof(struct ifmap_entry); - if ((ifmap = realloc(ifmap, size)) == NULL) - errx(2, "realloc(%d) failed", size); - memset(&ifmap[ifmap_size], 0, - size - ifmap_size * - sizeof(struct ifmap_entry)); - - ifmap_size = roundup(ifindex + 1, 32); - } - - if (*ifmap[ifindex].ifname != '\0') - continue; - - strlcpy(ifmap[ifindex].ifname, ifa->ifa_name, IFNAMSIZ); - } - - freeifaddrs(ifap); + ifmap = prepare_ifmap(&ifmap_size); mib[0] = CTL_NET; mib[1] = PF_ROUTE; @@ -377,7 +330,8 @@ wid_flags - protrusion); p_flags(rtm->rtm_flags, buffer); if (Wflag) { - xo_emit("{t:use/%*lu} ", wid_pksent, rtm->rtm_rmx.rmx_pksent); + /* XXX: use=0? */ + xo_emit("{t:nhop/%*lu} ", wid_mtu, rtm->rtm_rmx.rmx_nhidx); if (rtm->rtm_rmx.rmx_mtu != 0) xo_emit("{t:mtu/%*lu} ", wid_mtu, rtm->rtm_rmx.rmx_mtu); @@ -410,7 +364,7 @@ xo_close_instance(name); } -static int +int p_sockaddr(const char *name, struct sockaddr *sa, struct sockaddr *mask, int flags, int width) { @@ -442,7 +396,7 @@ return (protrusion); } -static const char * +const char * fmt_sockaddr(struct sockaddr *sa, struct sockaddr *mask, int flags) { static char buf[128]; @@ -519,30 +473,10 @@ static void p_flags(int f, const char *format) { - struct bits *p; - - xo_emit(format, fmt_flags(f)); - xo_open_list("flags_pretty"); - for (p = bits; p->b_mask; p++) - if (p->b_mask & f) - xo_emit("{le:flags_pretty/%s}", p->b_name); - xo_close_list("flags_pretty"); + print_flags_generic(f, rt_bits, format, "flags_pretty"); } -static const char * -fmt_flags(int f) -{ - static char name[33]; - char *flags; - struct bits *p = bits; - - for (flags = name; p->b_mask; p++) - if (p->b_mask & f) - *flags++ = p->b_val; - *flags = '\0'; - return (name); -} char * routename(struct sockaddr *sa, int flags)