diff --git a/sys/net/route/nhop.c b/sys/net/route/nhop.c index 51b4e5bfb3e5..b7154fe69a3d 100644 --- a/sys/net/route/nhop.c +++ b/sys/net/route/nhop.c @@ -1,392 +1,403 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2020 Alexander V. Chernikov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_route.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#define DEBUG_MOD_NAME nhop +#define DEBUG_MAX_LEVEL LOG_DEBUG +#include <net/route/route_debug.h> +_DECLARE_DEBUG(LOG_INFO); + /* * This file contains data structures management logic for the nexthop ("nhop") * route subsystem.
* * Nexthops in the original sense are the objects containing all the necessary * information to forward the packet to the selected destination. * In particular, nexthop is defined by a combination of * ifp, ifa, aifp, mtu, gw addr(if set), nh_type, nh_family, mask of rt_flags and * NHF_DEFAULT * * All nexthops are stored in the resizable hash table. * Additionally, each nexthop gets assigned its unique index (nexthop index) * so userland programs can interact with the nexthops easier. Index allocation * is backed by the bitmask array. */ MALLOC_DEFINE(M_NHOP, "nhops", "nexthops data"); /* Hash management functions */ int nhops_init_rib(struct rib_head *rh) { struct nh_control *ctl; size_t alloc_size; uint32_t num_buckets, num_items; void *ptr; ctl = malloc(sizeof(struct nh_control), M_NHOP, M_WAITOK | M_ZERO); /* * Allocate nexthop hash. Start with 16 items by default (128 bytes). * This will be enough for most of the cases. */ num_buckets = 16; alloc_size = CHT_SLIST_GET_RESIZE_SIZE(num_buckets); ptr = malloc(alloc_size, M_NHOP, M_WAITOK | M_ZERO); CHT_SLIST_INIT(&ctl->nh_head, ptr, num_buckets); /* * Allocate nexthop index bitmask. 
*/ num_items = 128 * 8; /* 128 bytes */ ptr = malloc(bitmask_get_size(num_items), M_NHOP, M_WAITOK | M_ZERO); bitmask_init(&ctl->nh_idx_head, ptr, num_items); NHOPS_LOCK_INIT(ctl); rh->nh_control = ctl; ctl->ctl_rh = rh; - DPRINTF("NHOPS init for fib %u af %u: ctl %p rh %p", rh->rib_fibnum, - rh->rib_family, ctl, rh); + FIB_CTL_LOG(LOG_DEBUG2, ctl, "nhops init: ctl %p rh %p", ctl, rh); return (0); } static void destroy_ctl(struct nh_control *ctl) { NHOPS_LOCK_DESTROY(ctl); free(ctl->nh_head.ptr, M_NHOP); free(ctl->nh_idx_head.idx, M_NHOP); #ifdef ROUTE_MPATH nhgrp_ctl_free(ctl); #endif free(ctl, M_NHOP); } /* * Epoch callback indicating ctl is safe to destroy */ static void destroy_ctl_epoch(epoch_context_t ctx) { struct nh_control *ctl; ctl = __containerof(ctx, struct nh_control, ctl_epoch_ctx); destroy_ctl(ctl); } void nhops_destroy_rib(struct rib_head *rh) { struct nh_control *ctl; struct nhop_priv *nh_priv; ctl = rh->nh_control; /* * All routes should have been deleted in rt_table_destroy(). * However, TCP stack or other consumers may store referenced * nexthop pointers. When these references go to zero, * nhop_free() will try to unlink these records from the * datastructures, most likely leading to panic. * * Avoid that by explicitly marking all of the remaining * nexthops as unlinked by removing a reference from a special * counter. Please see nhop_free() comments for more * details. */ NHOPS_WLOCK(ctl); CHT_SLIST_FOREACH(&ctl->nh_head, nhops, nh_priv) { - DPRINTF("Marking nhop %u unlinked", nh_priv->nh_idx); + FIB_RH_LOG(LOG_DEBUG3, rh, "marking nhop %u unlinked", nh_priv->nh_idx); refcount_release(&nh_priv->nh_linked); } CHT_SLIST_FOREACH_END; #ifdef ROUTE_MPATH nhgrp_ctl_unlink_all(ctl); #endif NHOPS_WUNLOCK(ctl); /* * Postpone destruction till the end of current epoch * so nhop_free() can safely use nh_control pointer. 
*/ epoch_call(net_epoch_preempt, destroy_ctl_epoch, &ctl->ctl_epoch_ctx); } /* * Nexhop hash calculation: * * Nexthops distribution: * 2 "mandatory" nexthops per interface ("interface route", "loopback"). * For direct peering: 1 nexthop for the peering router per ifp/af. * For Ix-like peering: tens to hundreds nexthops of neghbors per ifp/af. * IGP control plane & broadcast segment: tens of nexthops per ifp/af. * * Each fib/af combination has its own hash table. * With that in mind, hash nexthops by the combination of the interface * and GW IP address. * * To optimize hash calculation, ignore higher bytes of ifindex, as they * give very little entropy. * Similarly, use lower 4 bytes of IPv6 address to distinguish between the * neighbors. */ struct _hash_data { uint16_t ifindex; uint8_t family; uint8_t nh_type; uint32_t gw_addr; }; static unsigned djb_hash(const unsigned char *h, const int len) { unsigned int result = 0; int i; for (i = 0; i < len; i++) result = 33 * result ^ h[i]; return (result); } static uint32_t hash_priv(const struct nhop_priv *priv) { struct nhop_object *nh; uint16_t ifindex; struct _hash_data key; nh = priv->nh; ifindex = nh->nh_ifp->if_index & 0xFFFF; memset(&key, 0, sizeof(key)); key.ifindex = ifindex; key.family = nh->gw_sa.sa_family; key.nh_type = priv->nh_type & 0xFF; if (nh->gw_sa.sa_family == AF_INET6) memcpy(&key.gw_addr, &nh->gw6_sa.sin6_addr.s6_addr32[3], 4); else if (nh->gw_sa.sa_family == AF_INET) memcpy(&key.gw_addr, &nh->gw4_sa.sin_addr, 4); return (uint32_t)(djb_hash((const unsigned char *)&key, sizeof(key))); } /* * Checks if hash needs resizing and performs this resize if necessary * */ static void consider_resize(struct nh_control *ctl, uint32_t new_nh_buckets, uint32_t new_idx_items) { void *nh_ptr, *nh_idx_ptr; void *old_idx_ptr; size_t alloc_size; nh_ptr = NULL; if (new_nh_buckets != 0) { alloc_size = CHT_SLIST_GET_RESIZE_SIZE(new_nh_buckets); nh_ptr = malloc(alloc_size, M_NHOP, M_NOWAIT | M_ZERO); } nh_idx_ptr = NULL; if 
(new_idx_items != 0) { alloc_size = bitmask_get_size(new_idx_items); nh_idx_ptr = malloc(alloc_size, M_NHOP, M_NOWAIT | M_ZERO); } if (nh_ptr == NULL && nh_idx_ptr == NULL) { /* Either resize is not required or allocations have failed. */ return; } - DPRINTF("going to resize: nh:[ptr:%p sz:%u] idx:[ptr:%p sz:%u]", nh_ptr, - new_nh_buckets, nh_idx_ptr, new_idx_items); + FIB_CTL_LOG(LOG_DEBUG, ctl, + "going to resize: nh:[ptr:%p sz:%u] idx:[ptr:%p sz:%u]", + nh_ptr, new_nh_buckets, nh_idx_ptr, new_idx_items); old_idx_ptr = NULL; NHOPS_WLOCK(ctl); if (nh_ptr != NULL) { CHT_SLIST_RESIZE(&ctl->nh_head, nhops, nh_ptr, new_nh_buckets); } if (nh_idx_ptr != NULL) { if (bitmask_copy(&ctl->nh_idx_head, nh_idx_ptr, new_idx_items) == 0) bitmask_swap(&ctl->nh_idx_head, nh_idx_ptr, new_idx_items, &old_idx_ptr); } NHOPS_WUNLOCK(ctl); if (nh_ptr != NULL) free(nh_ptr, M_NHOP); if (old_idx_ptr != NULL) free(old_idx_ptr, M_NHOP); } /* * Links nextop @nh_priv to the nexhop hash table and allocates * nexhop index. * Returns allocated index or 0 on failure. */ int link_nhop(struct nh_control *ctl, struct nhop_priv *nh_priv) { uint16_t idx; uint32_t num_buckets_new, num_items_new; KASSERT((nh_priv->nh_idx == 0), ("nhop index is already allocated")); NHOPS_WLOCK(ctl); /* * Check if we need to resize hash and index. * The following 2 functions returns either new size or 0 * if resize is not required. 
*/ num_buckets_new = CHT_SLIST_GET_RESIZE_BUCKETS(&ctl->nh_head); num_items_new = bitmask_get_resize_items(&ctl->nh_idx_head); if (bitmask_alloc_idx(&ctl->nh_idx_head, &idx) != 0) { NHOPS_WUNLOCK(ctl); - DPRINTF("Unable to allocate nhop index"); + FIB_CTL_LOG(LOG_INFO, ctl, "Unable to allocate nhop index"); RTSTAT_INC(rts_nh_idx_alloc_failure); consider_resize(ctl, num_buckets_new, num_items_new); return (0); } nh_priv->nh_idx = idx; nh_priv->nh_control = ctl; nh_priv->nh_finalized = 1; CHT_SLIST_INSERT_HEAD(&ctl->nh_head, nhops, nh_priv); NHOPS_WUNLOCK(ctl); - DPRINTF("Linked nhop priv %p to %d, hash %u, ctl %p", nh_priv, idx, - hash_priv(nh_priv), ctl); + FIB_RH_LOG(LOG_DEBUG2, ctl->ctl_rh, + "Linked nhop priv %p to %d, hash %u, ctl %p", + nh_priv, idx, hash_priv(nh_priv), ctl); consider_resize(ctl, num_buckets_new, num_items_new); return (idx); } /* * Unlinks nexthop specified by @nh_priv data from the hash. * * Returns found nexthop or NULL. */ struct nhop_priv * unlink_nhop(struct nh_control *ctl, struct nhop_priv *nh_priv_del) { struct nhop_priv *priv_ret; int idx; uint32_t num_buckets_new, num_items_new; idx = 0; NHOPS_WLOCK(ctl); CHT_SLIST_REMOVE(&ctl->nh_head, nhops, nh_priv_del, priv_ret); if (priv_ret != NULL) { idx = priv_ret->nh_idx; priv_ret->nh_idx = 0; KASSERT((idx != 0), ("bogus nhop index 0")); if ((bitmask_free_idx(&ctl->nh_idx_head, idx)) != 0) { - DPRINTF("Unable to remove index %d from fib %u af %d", - idx, ctl->ctl_rh->rib_fibnum, - ctl->ctl_rh->rib_family); + FIB_CTL_LOG(LOG_DEBUG, ctl, + "Unable to remove index %d from fib %u af %d", + idx, ctl->ctl_rh->rib_fibnum, ctl->ctl_rh->rib_family); } } /* Check if hash or index needs to be resized */ num_buckets_new = CHT_SLIST_GET_RESIZE_BUCKETS(&ctl->nh_head); num_items_new = bitmask_get_resize_items(&ctl->nh_idx_head); NHOPS_WUNLOCK(ctl); + /* Report the unlink result below: LOG_INFO on failure, + * LOG_DEBUG2 on success. */ - if (priv_ret == NULL) - 
DPRINTF("Unable to unlink nhop priv %p from hash, hash %u ctl %p", + if (priv_ret == NULL) { + FIB_CTL_LOG(LOG_INFO, ctl, + "Unable to unlink nhop priv %p from hash, hash %u ctl %p", nh_priv_del, hash_priv(nh_priv_del), ctl); - else - DPRINTF("Unlinked nhop %p priv idx %d", priv_ret, idx); + } else { + FIB_CTL_LOG(LOG_DEBUG2, ctl, "Unlinked nhop %p priv idx %d", + priv_ret, idx); + } consider_resize(ctl, num_buckets_new, num_items_new); return (priv_ret); } /* * Searches for the nexthop by data specifcied in @nh_priv. * Returns referenced nexthop or NULL. */ struct nhop_priv * find_nhop(struct nh_control *ctl, const struct nhop_priv *nh_priv) { struct nhop_priv *nh_priv_ret; NHOPS_RLOCK(ctl); CHT_SLIST_FIND_BYOBJ(&ctl->nh_head, nhops, nh_priv, nh_priv_ret); if (nh_priv_ret != NULL) { if (refcount_acquire_if_not_zero(&nh_priv_ret->nh_refcnt) == 0){ /* refcount was 0 -> nhop is being deleted */ nh_priv_ret = NULL; } } NHOPS_RUNLOCK(ctl); return (nh_priv_ret); } diff --git a/sys/net/route/route_debug.h b/sys/net/route/route_debug.h index 30d2a1c9a99f..de80bd1ea78b 100644 --- a/sys/net/route/route_debug.h +++ b/sys/net/route/route_debug.h @@ -1,168 +1,170 @@ /*- * Copyright (c) 2021 * Alexander V. Chernikov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _NET_ROUTE_DEBUG_H_ #define _NET_ROUTE_DEBUG_H_ #include #include /* DEBUG logic */ #if defined(DEBUG_MOD_NAME) && defined(DEBUG_MAX_LEVEL) #define DEBUG_VAR_NAME _DEBUG_VAR_NAME(DEBUG_MOD_NAME) #define _DEBUG_VAR_NAME(a) _DEBUG_VAR_NAME_INDIRECT(a) #define _DEBUG_VAR_NAME_INDIRECT(prefix) prefix##_debug_level #define DEBUG_PREFIX_NAME _DEBUG_PREFIX_NAME(DEBUG_MOD_NAME) #define _DEBUG_PREFIX_NAME(n) __DEBUG_PREFIX_NAME(n) #define __DEBUG_PREFIX_NAME(n) #n #define _DECLARE_DEBUG(_default_level) \ SYSCTL_DECL(_net_route_debug); \ static int DEBUG_VAR_NAME = _default_level; \ SYSCTL_INT(_net_route_debug, OID_AUTO, DEBUG_VAR_NAME, \ CTLFLAG_RW | CTLFLAG_RWTUN, \ &(DEBUG_VAR_NAME), 0, "debuglevel") /* Additional tracing levels not defined by log.h */ #ifndef LOG_DEBUG2 #define LOG_DEBUG2 8 #endif #ifndef LOG_DEBUG3 #define LOG_DEBUG3 9 #endif /* * Severity usage guidelines: * * LOG_WARNING - subsystem-global errors ("multipath init failed") * * LOG_INFO - subsystem non-transient errors ("Failed to unlink nexhop"). * All logging <= LOG_INFO by default will be written to syslog. * * LOG_DEBUG - subsystem debug. Not-too often events (hash resizes, recoverable failures). 
* These are compiled in by default on production. Turning it it should NOT notable affect * performance * LOG_DEBUG2 - more debug. Per-item level (nhg,nh,route) debug, up to multiple lines per item. * This is NOT compiled in by default. Turning it on should NOT seriously impact performance * LOG_DEBUG3 - last debug level. Per-item large debug outputs. * This is NOT compiled in by default. All performance bets are off. * */ #define _output printf #define _DEBUG_PASS_MSG(_l) (DEBUG_VAR_NAME >= (_l)) /* * Logging for events specific for particular family and fib * Example: [nhop_neigh] inet.0 find_lle: nhop nh#4/inet/vtnet0/10.0.0.1: mapped to lle NULL */ #define FIB_LOG(_l, _fib, _fam, _fmt, ...) FIB_LOG_##_l(_l, _fib, _fam, _fmt, ## __VA_ARGS__) #define _FIB_LOG(_l, _fib, _fam, _fmt, ...) if (_DEBUG_PASS_MSG(_l)) { \ _output("[" DEBUG_PREFIX_NAME "] %s.%u %s: " _fmt "\n", rib_print_family(_fam), _fib, __func__, ##__VA_ARGS__); \ } /* Same as FIB_LOG, but uses nhop to get fib and family */ #define FIB_NH_LOG(_l, _nh, _fmt, ...) FIB_LOG_##_l(_l, nhop_get_fibnum(_nh), nhop_get_upper_family(_nh), _fmt, ## __VA_ARGS__) /* Same as FIB_LOG, but uses rib_head to get fib and family */ #define FIB_RH_LOG(_l, _rh, _fmt, ...) FIB_LOG_##_l(_l, (_rh)->rib_fibnum, (_rh)->rib_family, _fmt, ## __VA_ARGS__) +/* Same as FIB_LOG, but uses nh_control to get fib and family from linked rib */ +#define FIB_CTL_LOG(_l, _ctl, _fmt, ...) FIB_LOG_##_l(_l, (_ctl)->ctl_rh->rib_fibnum, (_ctl)->ctl_rh->rib_family, _fmt, ## __VA_ARGS__) /* * Generic logging for routing subsystem * Example: [nhop_neigh] nhops_update_neigh: L2 prepend update from lle/inet/valid/vtnet0/10.0.0.157 */ #define RT_LOG(_l, _fmt, ...) RT_LOG_##_l(_l, _fmt, ## __VA_ARGS__) #define _RT_LOG(_l, _fmt, ...) if (_DEBUG_PASS_MSG(_l)) { \ _output("[" DEBUG_PREFIX_NAME "] %s: " _fmt "\n", __func__, ##__VA_ARGS__); \ } /* * Wrapper logic to avoid compiling high levels of debugging messages for production systems. 
*/ #if DEBUG_MAX_LEVEL>=LOG_DEBUG3 #define FIB_LOG_LOG_DEBUG3 _FIB_LOG #define RT_LOG_LOG_DEBUG3 _RT_LOG #else #define FIB_LOG_LOG_DEBUG3(_l, _fib, _fam, _fmt, ...) #define RT_LOG_LOG_DEBUG3(_l, _fmt, ...) #endif #if DEBUG_MAX_LEVEL>=LOG_DEBUG2 #define FIB_LOG_LOG_DEBUG2 _FIB_LOG #define RT_LOG_LOG_DEBUG2 _RT_LOG #else #define FIB_LOG_LOG_DEBUG2(_l, _fib, _fam, _fmt, ...) #define RT_LOG_LOG_DEBUG2(_l, _fmt, ...) #endif #if DEBUG_MAX_LEVEL>=LOG_DEBUG #define FIB_LOG_LOG_DEBUG _FIB_LOG #define RT_LOG_LOG_DEBUG _RT_LOG #else #define FIB_LOG_LOG_DEBUG(_l, _fib, _fam, _fmt, ...) #define RT_LOG_LOG_DEBUG(_l, _fmt, ...) #endif #if DEBUG_MAX_LEVEL>=LOG_INFO #define FIB_LOG_LOG_INFO _FIB_LOG #define RT_LOG_LOG_INFO _RT_LOG #else #define FIB_LOG_LOG_INFO(_l, _fib, _fam, _fmt, ...) #define RT_LOG_LOG_INFO(_l, _fmt, ...) #endif #define FIB_LOG_LOG_NOTICE _FIB_LOG #define FIB_LOG_LOG_ERR _FIB_LOG #define FIB_LOG_LOG_WARNING _FIB_LOG #define RT_LOG_LOG_NOTICE _RT_LOG #define RT_LOG_LOG_ERR _RT_LOG #define RT_LOG_LOG_WARNING _RT_LOG #endif /* Helpers for fancy-printing various objects */ struct nhop_object; struct nhgrp_object; struct llentry; struct nhop_neigh; struct rtentry; #define NHOP_PRINT_BUFSIZE 48 char *nhop_print_buf(const struct nhop_object *nh, char *buf, size_t bufsize); char *nhop_print_buf_any(const struct nhop_object *nh, char *buf, size_t bufsize); char *nhgrp_print_buf(const struct nhgrp_object *nhg, char *buf, size_t bufsize); char *llentry_print_buf(const struct llentry *lle, struct ifnet *ifp, int family, char *buf, size_t bufsize); char *llentry_print_buf_lltable(const struct llentry *lle, char *buf, size_t bufsize); char *neigh_print_buf(const struct nhop_neigh *nn, char *buf, size_t bufsize); char *rt_print_buf(const struct rtentry *rt, char *buf, size_t bufsize); const char *rib_print_cmd(int rib_cmd); #endif diff --git a/sys/net/route/route_helpers.c b/sys/net/route/route_helpers.c index 001ebc5e388b..6c7f16eb047e 100644 --- 
a/sys/net/route/route_helpers.c +++ b/sys/net/route/route_helpers.c @@ -1,624 +1,622 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2020 Alexander V. Chernikov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_route.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET #include #endif #ifdef INET6 #include #include #endif #include #define DEBUG_MOD_NAME rt_helpers #define DEBUG_MAX_LEVEL LOG_DEBUG2 #include _DECLARE_DEBUG(LOG_INFO); /* * RIB helper functions. 
*/ void rib_walk_ext_locked(struct rib_head *rnh, rib_walktree_f_t *wa_f, rib_walk_hook_f_t *hook_f, void *arg) { if (hook_f != NULL) hook_f(rnh, RIB_WALK_HOOK_PRE, arg); rnh->rnh_walktree(&rnh->head, (walktree_f_t *)wa_f, arg); if (hook_f != NULL) hook_f(rnh, RIB_WALK_HOOK_POST, arg); } /* * Calls @wa_f with @arg for each entry in the table specified by * @af and @fibnum. * * @ss_t callback is called before and after the tree traversal * while holding table lock. * * Table is traversed under read lock unless @wlock is set. */ void rib_walk_ext_internal(struct rib_head *rnh, bool wlock, rib_walktree_f_t *wa_f, rib_walk_hook_f_t *hook_f, void *arg) { RIB_RLOCK_TRACKER; if (wlock) RIB_WLOCK(rnh); else RIB_RLOCK(rnh); rib_walk_ext_locked(rnh, wa_f, hook_f, arg); if (wlock) RIB_WUNLOCK(rnh); else RIB_RUNLOCK(rnh); } void rib_walk_ext(uint32_t fibnum, int family, bool wlock, rib_walktree_f_t *wa_f, rib_walk_hook_f_t *hook_f, void *arg) { struct rib_head *rnh; if ((rnh = rt_tables_get_rnh(fibnum, family)) != NULL) rib_walk_ext_internal(rnh, wlock, wa_f, hook_f, arg); } /* * Calls @wa_f with @arg for each entry in the table specified by * @af and @fibnum. * * Table is traversed under read lock unless @wlock is set. */ void rib_walk(uint32_t fibnum, int family, bool wlock, rib_walktree_f_t *wa_f, void *arg) { rib_walk_ext(fibnum, family, wlock, wa_f, NULL, arg); } /* * Calls @wa_f with @arg for each entry in the table matching @prefix/@mask. * * The following flags are supported: * RIB_FLAG_WLOCK: acquire exclusive lock * RIB_FLAG_LOCKED: Assumes the table is already locked & skip locking * * By default, table is traversed under read lock. 
*/ void rib_walk_from(uint32_t fibnum, int family, uint32_t flags, struct sockaddr *prefix, struct sockaddr *mask, rib_walktree_f_t *wa_f, void *arg) { RIB_RLOCK_TRACKER; struct rib_head *rnh = rt_tables_get_rnh(fibnum, family); if (rnh == NULL) return; if (flags & RIB_FLAG_WLOCK) RIB_WLOCK(rnh); else if (!(flags & RIB_FLAG_LOCKED)) RIB_RLOCK(rnh); rnh->rnh_walktree_from(&rnh->head, prefix, mask, (walktree_f_t *)wa_f, arg); if (flags & RIB_FLAG_WLOCK) RIB_WUNLOCK(rnh); else if (!(flags & RIB_FLAG_LOCKED)) RIB_RUNLOCK(rnh); } /* * Iterates over all existing fibs in system calling * @hook_f function before/after traversing each fib. * Calls @wa_f function for each element in current fib. * If af is not AF_UNSPEC, iterates over fibs in particular * address family. */ void rib_foreach_table_walk(int family, bool wlock, rib_walktree_f_t *wa_f, rib_walk_hook_f_t *hook_f, void *arg) { for (uint32_t fibnum = 0; fibnum < rt_numfibs; fibnum++) { /* Do we want some specific family? */ if (family != AF_UNSPEC) { rib_walk_ext(fibnum, family, wlock, wa_f, hook_f, arg); continue; } for (int i = 1; i <= AF_MAX; i++) rib_walk_ext(fibnum, i, wlock, wa_f, hook_f, arg); } } /* * Iterates over all existing fibs in system and deletes each element * for which @filter_f function returns non-zero value. * If @family is not AF_UNSPEC, iterates over fibs in particular * address family. */ void rib_foreach_table_walk_del(int family, rib_filter_f_t *filter_f, void *arg) { for (uint32_t fibnum = 0; fibnum < rt_numfibs; fibnum++) { /* Do we want some specific family? */ if (family != AF_UNSPEC) { rib_walk_del(fibnum, family, filter_f, arg, 0); continue; } for (int i = 1; i <= AF_MAX; i++) rib_walk_del(fibnum, i, filter_f, arg, 0); } } /* * Wrapper for the control plane functions for performing af-agnostic * lookups. * @fibnum: fib to perform the lookup. * @dst: sockaddr with family and addr filled in. IPv6 addresses needs to be in * deembedded from. * @flags: fib(9) flags. 
* @flowid: flow id for path selection in multipath use case. * * Returns nhop_object or NULL. * * Requires NET_EPOCH. * */ struct nhop_object * rib_lookup(uint32_t fibnum, const struct sockaddr *dst, uint32_t flags, uint32_t flowid) { struct nhop_object *nh; nh = NULL; switch (dst->sa_family) { #ifdef INET case AF_INET: { const struct sockaddr_in *a = (const struct sockaddr_in *)dst; nh = fib4_lookup(fibnum, a->sin_addr, 0, flags, flowid); break; } #endif #ifdef INET6 case AF_INET6: { const struct sockaddr_in6 *a = (const struct sockaddr_in6*)dst; nh = fib6_lookup(fibnum, &a->sin6_addr, a->sin6_scope_id, flags, flowid); break; } #endif } return (nh); } #ifdef ROUTE_MPATH static void notify_add(struct rib_cmd_info *rc, const struct weightened_nhop *wn_src, route_notification_t *cb, void *cbdata) { rc->rc_nh_new = wn_src->nh; rc->rc_nh_weight = wn_src->weight; #if DEBUG_MAX_LEVEL >= LOG_DEBUG2 char nhbuf[NHOP_PRINT_BUFSIZE]; FIB_NH_LOG(LOG_DEBUG2, wn_src->nh, "RTM_ADD for %s @ w=%u", nhop_print_buf(wn_src->nh, nhbuf, sizeof(nhbuf)), wn_src->weight); #endif cb(rc, cbdata); } static void notify_del(struct rib_cmd_info *rc, const struct weightened_nhop *wn_src, route_notification_t *cb, void *cbdata) { rc->rc_nh_old = wn_src->nh; rc->rc_nh_weight = wn_src->weight; #if DEBUG_MAX_LEVEL >= LOG_DEBUG2 char nhbuf[NHOP_PRINT_BUFSIZE]; FIB_NH_LOG(LOG_DEBUG2, wn_src->nh, "RTM_DEL for %s @ w=%u", nhop_print_buf(wn_src->nh, nhbuf, sizeof(nhbuf)), wn_src->weight); #endif cb(rc, cbdata); } static void decompose_change_notification(struct rib_cmd_info *rc, route_notification_t *cb, void *cbdata) { uint32_t num_old, num_new; struct weightened_nhop *wn_old, *wn_new; struct weightened_nhop tmp = { NULL, 0 }; uint32_t idx_old = 0, idx_new = 0; struct rib_cmd_info rc_del = { .rc_cmd = RTM_DELETE, .rc_rt = rc->rc_rt }; struct rib_cmd_info rc_add = { .rc_cmd = RTM_ADD, .rc_rt = rc->rc_rt }; if (NH_IS_NHGRP(rc->rc_nh_old)) { wn_old = nhgrp_get_nhops((struct nhgrp_object *)rc->rc_nh_old, 
&num_old); } else { tmp.nh = rc->rc_nh_old; tmp.weight = rc->rc_nh_weight; wn_old = &tmp; num_old = 1; } if (NH_IS_NHGRP(rc->rc_nh_new)) { wn_new = nhgrp_get_nhops((struct nhgrp_object *)rc->rc_nh_new, &num_new); } else { tmp.nh = rc->rc_nh_new; tmp.weight = rc->rc_nh_weight; wn_new = &tmp; num_new = 1; } #if DEBUG_MAX_LEVEL >= LOG_DEBUG { char buf_old[NHOP_PRINT_BUFSIZE], buf_new[NHOP_PRINT_BUFSIZE]; nhop_print_buf_any(rc->rc_nh_old, buf_old, NHOP_PRINT_BUFSIZE); nhop_print_buf_any(rc->rc_nh_new, buf_new, NHOP_PRINT_BUFSIZE); FIB_NH_LOG(LOG_DEBUG, wn_old[0].nh, "change %s -> %s", buf_old, buf_new); } #endif /* Use the fact that each @wn array is sorted */ /* * Here we have one (or two) multipath groups and transition * between them needs to be reported to the caller, using series * of primitive (RTM_DEL, RTM_ADD) operations. * * Leverage the fact that each nexthop group has its nexthops sorted * by their indices. * [1] -> [1, 2] = A{2} * [1, 2] -> [1] = D{2} * [1, 2, 4] -> [1, 3, 4] = D{2}, A{3} * [1, 2] -> [3, 4] = D{1}, D{2}, A{3}, A{4] */ while ((idx_old < num_old) && (idx_new < num_new)) { uint32_t nh_idx_old = wn_old[idx_old].nh->nh_priv->nh_idx; uint32_t nh_idx_new = wn_new[idx_new].nh->nh_priv->nh_idx; if (nh_idx_old == nh_idx_new) { if (wn_old[idx_old].weight != wn_new[idx_new].weight) { /* Update weight by providing del/add notifications */ notify_del(&rc_del, &wn_old[idx_old], cb, cbdata); notify_add(&rc_add, &wn_new[idx_new], cb, cbdata); } idx_old++; idx_new++; } else if (nh_idx_old < nh_idx_new) { /* [1, ~2~, 4], [1, ~3~, 4] */ notify_del(&rc_del, &wn_old[idx_old], cb, cbdata); idx_old++; } else { /* nh_idx_old > nh_idx_new. 
*/ notify_add(&rc_add, &wn_new[idx_new], cb, cbdata); idx_new++; } } while (idx_old < num_old) { notify_del(&rc_del, &wn_old[idx_old], cb, cbdata); idx_old++; } while (idx_new < num_new) { notify_add(&rc_add, &wn_new[idx_new], cb, cbdata); idx_new++; } } /* * Decompose multipath cmd info @rc into a list of add/del/change * single-path operations, calling @cb callback for each operation. * Assumes at least one of the nexthops in @rc is multipath. */ void rib_decompose_notification(struct rib_cmd_info *rc, route_notification_t *cb, void *cbdata) { struct weightened_nhop *wn; uint32_t num_nhops; struct rib_cmd_info rc_new; rc_new = *rc; - DPRINTF("cb=%p cmd=%d nh_old=%p nh_new=%p", - cb, rc->cmd, rc->nh_old, rc->nh_new); switch (rc->rc_cmd) { case RTM_ADD: if (!NH_IS_NHGRP(rc->rc_nh_new)) return; wn = nhgrp_get_nhops((struct nhgrp_object *)rc->rc_nh_new, &num_nhops); for (uint32_t i = 0; i < num_nhops; i++) { notify_add(&rc_new, &wn[i], cb, cbdata); } break; case RTM_DELETE: if (!NH_IS_NHGRP(rc->rc_nh_old)) return; wn = nhgrp_get_nhops((struct nhgrp_object *)rc->rc_nh_old, &num_nhops); for (uint32_t i = 0; i < num_nhops; i++) { notify_del(&rc_new, &wn[i], cb, cbdata); } break; case RTM_CHANGE: if (!NH_IS_NHGRP(rc->rc_nh_old) && !NH_IS_NHGRP(rc->rc_nh_new)) return; decompose_change_notification(rc, cb, cbdata); break; } } #endif #ifdef INET /* * Checks if the found key in the trie contains (<=) a prefix covering * @paddr/@plen. * Returns the most specific rtentry matching the condition or NULL. 
*/ static struct rtentry * get_inet_parent_prefix(uint32_t fibnum, struct in_addr addr, int plen) { struct route_nhop_data rnd; struct rtentry *rt; struct in_addr addr4; uint32_t scopeid; int parent_plen; struct radix_node *rn; rt = fib4_lookup_rt(fibnum, addr, 0, NHR_UNLOCKED, &rnd); if (rt == NULL) return (NULL); rt_get_inet_prefix_plen(rt, &addr4, &parent_plen, &scopeid); if (parent_plen <= plen) return (rt); /* * There can be multiple prefixes associated with the found key: * 10.0.0.0 -> 10.0.0.0/24, 10.0.0.0/23, 10.0.0.0/22, etc. * All such prefixes are linked via rn_dupedkey, from most specific * to least specific. Iterate over them to check if any of these * prefixes are wider than desired plen. */ rn = (struct radix_node *)rt; while ((rn = rn_nextprefix(rn)) != NULL) { rt = RNTORT(rn); rt_get_inet_prefix_plen(rt, &addr4, &parent_plen, &scopeid); if (parent_plen <= plen) return (rt); } return (NULL); } /* * Returns the most specific prefix containing (>) @paddr/plen. */ struct rtentry * rt_get_inet_parent(uint32_t fibnum, struct in_addr addr, int plen) { struct in_addr lookup_addr = { .s_addr = INADDR_BROADCAST }; struct in_addr addr4 = addr; struct in_addr mask4; struct rtentry *rt; while (plen-- > 0) { /* Calculate wider mask & new key to lookup */ mask4.s_addr = htonl(plen ? ~((1 << (32 - plen)) - 1) : 0); addr4.s_addr = htonl(ntohl(addr4.s_addr) & ntohl(mask4.s_addr)); if (addr4.s_addr == lookup_addr.s_addr) { /* Skip lookup if the key is the same */ continue; } lookup_addr = addr4; rt = get_inet_parent_prefix(fibnum, lookup_addr, plen); if (rt != NULL) return (rt); } return (NULL); } #endif #ifdef INET6 /* * Checks if the found key in the trie contains (<=) a prefix covering * @paddr/@plen. * Returns the most specific rtentry matching the condition or NULL. 
*/ static struct rtentry * get_inet6_parent_prefix(uint32_t fibnum, const struct in6_addr *paddr, int plen) { struct route_nhop_data rnd; struct rtentry *rt; struct in6_addr addr6; uint32_t scopeid; int parent_plen; struct radix_node *rn; rt = fib6_lookup_rt(fibnum, paddr, 0, NHR_UNLOCKED, &rnd); if (rt == NULL) return (NULL); rt_get_inet6_prefix_plen(rt, &addr6, &parent_plen, &scopeid); if (parent_plen <= plen) return (rt); /* * There can be multiple prefixes associated with the found key: * 2001:db8:1::/64 -> 2001:db8:1::/56, 2001:db8:1::/48, etc. * All such prefixes are linked via rn_dupedkey, from most specific * to least specific. Iterate over them to check if any of these * prefixes are wider than desired plen. */ rn = (struct radix_node *)rt; while ((rn = rn_nextprefix(rn)) != NULL) { rt = RNTORT(rn); rt_get_inet6_prefix_plen(rt, &addr6, &parent_plen, &scopeid); if (parent_plen <= plen) return (rt); } return (NULL); } static void ipv6_writemask(struct in6_addr *addr6, uint8_t mask) { uint32_t *cp; for (cp = (uint32_t *)addr6; mask >= 32; mask -= 32) *cp++ = 0xFFFFFFFF; if (mask > 0) *cp = htonl(mask ? ~((1 << (32 - mask)) - 1) : 0); } /* * Returns the most specific prefix containing (>) @paddr/plen. */ struct rtentry * rt_get_inet6_parent(uint32_t fibnum, const struct in6_addr *paddr, int plen) { struct in6_addr lookup_addr = in6mask128; struct in6_addr addr6 = *paddr; struct in6_addr mask6; struct rtentry *rt; while (plen-- > 0) { /* Calculate wider mask & new key to lookup */ ipv6_writemask(&mask6, plen); IN6_MASK_ADDR(&addr6, &mask6); if (IN6_ARE_ADDR_EQUAL(&addr6, &lookup_addr)) { /* Skip lookup if the key is the same */ continue; } lookup_addr = addr6; rt = get_inet6_parent_prefix(fibnum, &lookup_addr, plen); if (rt != NULL) return (rt); } return (NULL); } #endif /* * Prints rtentry @rt data in the provided @buf. 
 * Example: rt/192.168.0.0/24
 */
char *
rt_print_buf(const struct rtentry *rt, char *buf, size_t bufsize)
{
	/* INET6_ADDRSTRLEN is large enough for both AF_INET and AF_INET6 */
	char abuf[INET6_ADDRSTRLEN];
	uint32_t scopeid;
	int plen;

	switch (rt_get_family(rt)) {
#ifdef INET
	case AF_INET:
		{
			struct in_addr addr4;
			rt_get_inet_prefix_plen(rt, &addr4, &plen, &scopeid);
			inet_ntop(AF_INET, &addr4, abuf, sizeof(abuf));
			snprintf(buf, bufsize, "rt/%s/%d", abuf, plen);
		}
		break;
#endif
#ifdef INET6
	case AF_INET6:
		{
			struct in6_addr addr6;
			rt_get_inet6_prefix_plen(rt, &addr6, &plen, &scopeid);
			inet_ntop(AF_INET6, &addr6, abuf, sizeof(abuf));
			snprintf(buf, bufsize, "rt/%s/%d", abuf, plen);
		}
		break;
#endif
	default:
		/* Address family this kernel was not built with support for */
		snprintf(buf, bufsize, "rt/unknown_af#%d", rt_get_family(rt));
		break;
	}

	return (buf);
}

/*
 * Returns a static string name for rtsock command @rib_cmd,
 * "UNKNOWN" for any value other than RTM_ADD/RTM_CHANGE/RTM_DELETE/RTM_GET.
 */
const char *
rib_print_cmd(int rib_cmd)
{
	switch (rib_cmd) {
	case RTM_ADD:
		return ("RTM_ADD");
	case RTM_CHANGE:
		return ("RTM_CHANGE");
	case RTM_DELETE:
		return ("RTM_DELETE");
	case RTM_GET:
		return ("RTM_GET");
	}

	return ("UNKNOWN");
}
diff --git a/sys/net/route/route_var.h b/sys/net/route/route_var.h
index 740729ecb415..e54ea08f4e80 100644
--- a/sys/net/route/route_var.h
+++ b/sys/net/route/route_var.h
@@ -1,334 +1,328 @@
/*-
 * Copyright (c) 2015-2016
 *	Alexander V. Chernikov
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _NET_ROUTE_VAR_H_ #define _NET_ROUTE_VAR_H_ #ifndef RNF_NORMAL #include #endif #include #include #include /* struct sockaddr_in */ #include #include -#ifdef RTDEBUG -#define DPRINTF(_fmt, ...) printf("%s: " _fmt "\n", __func__ , ## __VA_ARGS__) -#else -#define DPRINTF(_fmt, ...) -#endif - struct nh_control; /* Sets prefix-specific nexthop flags (NHF_DEFAULT, RTF/NHF_HOST, RTF_BROADCAST,..) */ typedef int rnh_set_nh_pfxflags_f_t(u_int fibnum, const struct sockaddr *addr, const struct sockaddr *mask, struct nhop_object *nh); /* Fills in family-specific details that are not yet set up (mtu, nhop type, ..) 
 */
typedef int rnh_augment_nh_f_t(u_int fibnum, struct nhop_object *nh);

/*
 * Per-family, per-fib routing table ("RIB") head: embeds the radix trie
 * head and its callbacks, locking, generation counters and the nexthop
 * subsystem state.
 */
struct rib_head {
	struct radix_head	head;
	rn_matchaddr_f_t	*rnh_matchaddr;	/* longest match for sockaddr */
	rn_addaddr_f_t		*rnh_addaddr;	/* add based on sockaddr*/
	rn_deladdr_f_t		*rnh_deladdr;	/* remove based on sockaddr */
	rn_lookup_f_t		*rnh_lookup;	/* exact match for sockaddr */
	rn_walktree_t		*rnh_walktree;	/* traverse tree */
	rn_walktree_from_t	*rnh_walktree_from;	/* traverse tree below a */
	rnh_set_nh_pfxflags_f_t	*rnh_set_nh_pfxflags;	/* hook to alter record prior to insertion */
	rt_gen_t		rnh_gen;	/* datapath generation counter */
	int			rnh_multipath;	/* multipath capable ? */
	struct radix_node	rnh_nodes[3];	/* empty tree for common case */
	struct rmlock		rib_lock;	/* config/data path lock */
	struct radix_mask_head	rmhead;		/* masks radix head */
	struct vnet		*rib_vnet;	/* vnet pointer */
	int			rib_family;	/* AF of the rtable */
	u_int			rib_fibnum;	/* fib number */
	struct callout		expire_callout;	/* Callout for expiring dynamic routes */
	time_t			next_expire;	/* Next expire run ts */
	uint32_t		rnh_prefixes;	/* Number of prefixes */
	rt_gen_t		rnh_gen_rib;	/* fib algo: rib generation counter */
	uint32_t		rib_dying:1;	/* rib is detaching */
	uint32_t		rib_algo_fixed:1;/* fixed algorithm */
	uint32_t		rib_algo_init:1;/* algo init done */
	struct nh_control	*nh_control;	/* nexthop subsystem data */
	rnh_augment_nh_f_t	*rnh_augment_nh;/* hook to alter nexthop prior to insertion */
	CK_STAILQ_HEAD(, rib_subscription)	rnh_subscribers;/* notification subscribers */
};

/* Wrappers around rib_lock (struct rmlock above) */
#define	RIB_RLOCK_TRACKER	struct rm_priotracker _rib_tracker
#define	RIB_LOCK_INIT(rh)	rm_init(&(rh)->rib_lock, "rib head lock")
#define	RIB_LOCK_DESTROY(rh)	rm_destroy(&(rh)->rib_lock)
#define	RIB_RLOCK(rh)		rm_rlock(&(rh)->rib_lock, &_rib_tracker)
#define	RIB_RUNLOCK(rh)		rm_runlock(&(rh)->rib_lock, &_rib_tracker)
#define	RIB_WLOCK(rh)		rm_wlock(&(rh)->rib_lock)
#define	RIB_WUNLOCK(rh)		rm_wunlock(&(rh)->rib_lock)
#define	RIB_LOCK_ASSERT(rh)	rm_assert(&(rh)->rib_lock, RA_LOCKED)
#define	RIB_WLOCK_ASSERT(rh)	rm_assert(&(rh)->rib_lock, RA_WLOCKED)

/* Constants */
#define	RIB_MAX_RETRIES	3
#define	RT_MAXFIBS	UINT16_MAX
#define	RIB_MAX_MPATH_WIDTH	64

/* Macro for verifying fields in af-specific 'struct route' structures */
#define	CHK_STRUCT_FIELD_GENERIC(_s1, _f1, _s2, _f2)			\
_Static_assert(sizeof(((_s1 *)0)->_f1) == sizeof(((_s2 *)0)->_f2),	\
		"Fields " #_f1 " and " #_f2 " size differs");		\
_Static_assert(__offsetof(_s1, _f1) == __offsetof(_s2, _f2),		\
		"Fields " #_f1 " and " #_f2 " offset differs");

#define	_CHK_ROUTE_FIELD(_route_new, _field) \
	CHK_STRUCT_FIELD_GENERIC(struct route, _field, _route_new, _field)

#define	CHK_STRUCT_ROUTE_FIELDS(_route_new)	\
	_CHK_ROUTE_FIELD(_route_new, ro_nh)	\
	_CHK_ROUTE_FIELD(_route_new, ro_lle)	\
	_CHK_ROUTE_FIELD(_route_new, ro_prepend)\
	_CHK_ROUTE_FIELD(_route_new, ro_plen)	\
	_CHK_ROUTE_FIELD(_route_new, ro_flags)	\
	_CHK_ROUTE_FIELD(_route_new, ro_mtu)	\
	_CHK_ROUTE_FIELD(_route_new, spare)

#define	CHK_STRUCT_ROUTE_COMPAT(_ro_new, _dst_new)	\
CHK_STRUCT_ROUTE_FIELDS(_ro_new);			\
_Static_assert(__offsetof(struct route, ro_dst) == __offsetof(_ro_new, _dst_new),\
		"ro_dst and " #_dst_new " are at different offset")

/*
 * Bumps the generation counter consumers check to detect rib changes:
 * the fib-algo counter when FIB_ALGO is compiled in, the datapath one
 * otherwise.
 */
static inline void
rib_bump_gen(struct rib_head *rnh)
{
#ifdef FIB_ALGO
	rnh->rnh_gen_rib++;
#else
	rnh->rnh_gen++;
#endif
}

struct rib_head *rt_tables_get_rnh(uint32_t table, sa_family_t family);
int rt_getifa_fib(struct rt_addrinfo *info, u_int fibnum);
struct rib_cmd_info;

/* Per-vnet, per-cpu routing statistics */
VNET_PCPUSTAT_DECLARE(struct rtstat, rtstat);
#define	RTSTAT_ADD(name, val)	\
	VNET_PCPUSTAT_ADD(struct rtstat, rtstat, name, (val))
#define	RTSTAT_INC(name)	RTSTAT_ADD(name, 1)

/*
 * Convert a 'struct radix_node *' to a 'struct rtentry *'.
 * The operation can be done safely (in this code) because a
 * 'struct rtentry' starts with two 'struct radix_node''s, the first
 * one representing leaf nodes in the routing tree, which is
 * what the code in radix.c passes us as a 'struct radix_node'.
 *
 * But because there are a lot of assumptions in this conversion,
 * do not cast explicitly, but always use the macro below.
 */
#define	RNTORT(p)	((struct rtentry *)(p))

struct rtentry {
	struct radix_node	rt_nodes[2];	/* tree glue, and other values */
	/*
	 * XXX struct rtentry must begin with a struct radix_node (or two!)
	 * because the code does some casts of a 'struct radix_node *'
	 * to a 'struct rtentry *'
	 */
#define	rt_key(r)	(*((struct sockaddr **)(&(r)->rt_nodes->rn_key)))
#define	rt_mask(r)	(*((struct sockaddr **)(&(r)->rt_nodes->rn_mask)))
#define	rt_key_const(r)		(*((const struct sockaddr * const *)(&(r)->rt_nodes->rn_key)))
#define	rt_mask_const(r)	(*((const struct sockaddr * const *)(&(r)->rt_nodes->rn_mask)))

	/*
	 * 2 radix_node structures above consist of 2x6 pointers, leaving
	 * 4 pointers (32 bytes) of the second cache line on amd64.
	 */
	struct nhop_object	*rt_nhop;	/* nexthop data */
	union {
		/*
		 * Destination address storage.
		 * sizeof(struct sockaddr_in6) == 28, however
		 * the dataplane-relevant part (e.g. address) lies
		 * at offset 8..24, making the address not crossing
		 * cacheline boundary.
		 */
		struct sockaddr_in	rt_dst4;
		struct sockaddr_in6	rt_dst6;
		struct sockaddr		rt_dst;
		char			rt_dstb[28];
	};

	int		rte_flags;	/* up/down?, host/net */
	u_long		rt_weight;	/* absolute weight */
	struct rtentry	*rt_chain;	/* pointer to next rtentry to delete */
	struct epoch_context	rt_epoch_ctx;	/* net epoch tracker */
};

/*
 * With the split between the routing entry and the nexthop,
 * rt_flags has to be split between these 2 entries. As rtentry
 * mostly contains prefix data and is thought to be generic enough
 * so one can transparently change the nexthop pointer w/o requiring
 * any other rtentry changes, most of rt_flags shifts to the particular nexthop.
 *
 * RTF_UP: rtentry, as an indication that it is linked.
 * RTF_HOST: rtentry, nhop. The latter indication is needed for the datapath
 * RTF_DYNAMIC: nhop, to make rtentry generic.
 * RTF_MODIFIED: nhop, to make rtentry generic.
(legacy)
 * -- "native" path (nhop) properties:
 * RTF_GATEWAY, RTF_STATIC, RTF_PROTO1, RTF_PROTO2, RTF_PROTO3, RTF_FIXEDMTU,
 * RTF_PINNED, RTF_REJECT, RTF_BLACKHOLE, RTF_BROADCAST
 */

/* rtentry rt flag mask */
#define	RTE_RT_FLAG_MASK	(RTF_UP | RTF_HOST)

/* route_temporal.c */
void tmproutes_update(struct rib_head *rnh, struct rtentry *rt,
    struct nhop_object *nh);
void tmproutes_init(struct rib_head *rh);
void tmproutes_destroy(struct rib_head *rh);

/* route_ctl.c */
struct route_nhop_data;
int change_route_nhop(struct rib_head *rnh, struct rtentry *rt,
    struct route_nhop_data *rnd, struct rib_cmd_info *rc);
int change_route_conditional(struct rib_head *rnh, struct rtentry *rt,
    struct rt_addrinfo *info, struct route_nhop_data *nhd_orig,
    struct route_nhop_data *nhd_new, struct rib_cmd_info *rc);
struct rtentry *lookup_prefix(struct rib_head *rnh,
    const struct rt_addrinfo *info, struct route_nhop_data *rnd);

bool nhop_can_multipath(const struct nhop_object *nh);
bool match_nhop_gw(const struct nhop_object *nh, const struct sockaddr *gw);
int check_info_match_nhop(const struct rt_addrinfo *info,
    const struct rtentry *rt, const struct nhop_object *nh);
int can_override_nhop(const struct rt_addrinfo *info,
    const struct nhop_object *nh);

void vnet_rtzone_init(void);
void vnet_rtzone_destroy(void);

/* subscriptions */
void rib_init_subscriptions(struct rib_head *rnh);
void rib_destroy_subscriptions(struct rib_head *rnh);

/* Nexthops */
void nhops_init(void);
int nhops_init_rib(struct rib_head *rh);
void nhops_destroy_rib(struct rib_head *rh);
void nhop_ref_object(struct nhop_object *nh);
int nhop_try_ref_object(struct nhop_object *nh);
void nhop_ref_any(struct nhop_object *nh);
void nhop_free_any(struct nhop_object *nh);
int nhop_create_from_info(struct rib_head *rnh, struct rt_addrinfo *info,
    struct nhop_object **nh_ret);
int nhop_create_from_nhop(struct rib_head *rnh, const struct nhop_object *nh_orig,
    struct rt_addrinfo *info, struct nhop_object **pnh_priv);
void nhops_update_ifmtu(struct rib_head *rh, struct ifnet *ifp, uint32_t mtu);
int nhops_dump_sysctl(struct rib_head *rh, struct sysctl_req *w);

/* MULTIPATH */
#define	MPF_MULTIPATH	0x08	/* need to be consistent with NHF_MULTIPATH */

/*
 * Nexthop group: ordered collection of nexthops used by multipath routes;
 * the dataplane picks a member via nhop_select() below.
 */
struct nhgrp_object {
	uint16_t		nhg_flags;	/* nexthop group flags */
	uint8_t			nhg_size;	/* dataplane group size */
	uint8_t			spare;
	struct nhop_object	*nhops[0];	/* nhops */
};

/*
 * Returns the nexthop to forward with.  With ROUTE_MPATH, @nh may actually
 * be a nexthop group; in that case the member indexed by
 * flowid % nhg_size is returned.  Otherwise @nh is returned unchanged.
 */
static inline struct nhop_object *
nhop_select(struct nhop_object *nh, uint32_t flowid)
{
#ifdef ROUTE_MPATH
	if (NH_IS_NHGRP(nh)) {
		struct nhgrp_object *nhg = (struct nhgrp_object *)nh;
		nh = nhg->nhops[flowid % nhg->nhg_size];
	}
#endif
	return (nh);
}

struct weightened_nhop;

/* mpath_ctl.c */
int add_route_mpath(struct rib_head *rnh, struct rt_addrinfo *info,
    struct rtentry *rt, struct route_nhop_data *rnd_add,
    struct route_nhop_data *rnd_orig, struct rib_cmd_info *rc);
int del_route_mpath(struct rib_head *rh, struct rt_addrinfo *info,
    struct rtentry *rt, struct nhgrp_object *nhg, struct rib_cmd_info *rc);

/* nhgrp.c */
int nhgrp_ctl_init(struct nh_control *ctl);
void nhgrp_ctl_free(struct nh_control *ctl);
void nhgrp_ctl_unlink_all(struct nh_control *ctl);

/* nhgrp_ctl.c */
int nhgrp_dump_sysctl(struct rib_head *rh, struct sysctl_req *w);
int nhgrp_get_group(struct rib_head *rh, struct weightened_nhop *wn,
    int num_nhops, struct route_nhop_data *rnd);
typedef bool nhgrp_filter_cb_t(const struct nhop_object *nh, void *data);
int nhgrp_get_filtered_group(struct rib_head *rh, const struct nhgrp_object *src,
    nhgrp_filter_cb_t flt_func, void *flt_data, struct route_nhop_data *rnd);
int nhgrp_get_addition_group(struct rib_head *rnh,
    struct route_nhop_data *rnd_orig, struct route_nhop_data *rnd_add,
    struct route_nhop_data *rnd_new);

void nhgrp_ref_object(struct nhgrp_object *nhg);
uint32_t nhgrp_get_idx(const struct nhgrp_object *nhg);
void nhgrp_free(struct nhgrp_object *nhg);

/* rtsock */
int rtsock_routemsg(int cmd, struct rtentry *rt, struct nhop_object *nh,
    int fibnum);
int rtsock_routemsg_info(int cmd, struct rt_addrinfo *info, int fibnum);
int rtsock_addrmsg(int cmd, struct ifaddr *ifa, int fibnum);

/* lookup_framework.c */
void fib_grow_rtables(uint32_t new_num_tables);
void fib_setup_family(int family, uint32_t num_tables);
void fib_destroy_rib(struct rib_head *rh);
void vnet_fib_init(void);
void vnet_fib_destroy(void);

/* Entropy data used for outbound hashing */
#define	MPATH_ENTROPY_KEY_LEN	40
extern uint8_t mpath_entropy_key[MPATH_ENTROPY_KEY_LEN];

#endif /* _NET_ROUTE_VAR_H_ */