diff --git a/sys/net/route/fib_algo.c b/sys/net/route/fib_algo.c index 1274e5e90bd4..3c2ffa45afbb 100644 --- a/sys/net/route/fib_algo.c +++ b/sys/net/route/fib_algo.c @@ -1,2079 +1,2079 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2020 Alexander V. Chernikov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_route.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #endif #include #include #include #include #include #include /* * Fib lookup framework. * * This framework enables accelerated longest-prefix-match lookups for the * routing tables by adding the ability to dynamically attach/detach lookup * algorithms implementation to/from the datapath. * * flm - fib lookup modules - implementation of particular lookup algorithm * fd - fib data - instance of an flm bound to specific routing table * * This file provides main framework functionality. * * The following are the features provided by the framework * * 1) nexhops abstraction -> provides transparent referencing, indexing * and efficient idx->ptr mappings for nexthop and nexthop groups. * 2) Routing table synchronisation * 3) dataplane attachment points * 4) automatic algorithm selection based on the provided preference. * * * DATAPATH * For each supported address family, there is a an allocated array of fib_dp * structures, indexed by fib number. Each array entry contains callback function * and its argument. This function will be called with a family-specific lookup key, * scope and provided argument. This array gets re-created every time when new algo * instance gets created. Please take a look at the replace_rtables_family() function * for more details. 
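 *
 * As an illustration only (simplified, with hypothetical parameter names and
 * field usage), a per-family lookup wrapper is expected to consume this array
 * roughly as follows:
 *
 *	const struct fib_dp *dp = &V_inet_dp[fibnum];
 *	const struct flm_lookup_key key = { .addr4 = dst };
 *	struct nhop_object *nh = dp->f(dp->arg, key, scopeid);
 *
 * i.e. the framework only ever swaps (f, arg) pairs; callers never need to
 * know which algorithm is currently attached.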
* */ SYSCTL_DECL(_net_route); SYSCTL_NODE(_net_route, OID_AUTO, algo, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Fib algorithm lookups"); /* Algorithm sync policy */ /* Time interval to bucket updates */ VNET_DEFINE_STATIC(unsigned int, update_bucket_time_ms) = 50; #define V_update_bucket_time_ms VNET(update_bucket_time_ms) SYSCTL_UINT(_net_route_algo, OID_AUTO, bucket_time_ms, CTLFLAG_RW | CTLFLAG_VNET, &VNET_NAME(update_bucket_time_ms), 0, "Time interval to calculate update rate"); /* Minimum update rate to delay sync */ VNET_DEFINE_STATIC(unsigned int, bucket_change_threshold_rate) = 500; #define V_bucket_change_threshold_rate VNET(bucket_change_threshold_rate) SYSCTL_UINT(_net_route_algo, OID_AUTO, bucket_change_threshold_rate, CTLFLAG_RW | CTLFLAG_VNET, &VNET_NAME(bucket_change_threshold_rate), 0, "Minimum update rate to delay sync"); /* Max allowed delay to sync */ VNET_DEFINE_STATIC(unsigned int, fib_max_sync_delay_ms) = 1000; #define V_fib_max_sync_delay_ms VNET(fib_max_sync_delay_ms) SYSCTL_UINT(_net_route_algo, OID_AUTO, fib_max_sync_delay_ms, CTLFLAG_RW | CTLFLAG_VNET, &VNET_NAME(fib_max_sync_delay_ms), 0, "Maximum time to delay sync (ms)"); #ifdef INET6 VNET_DEFINE_STATIC(bool, algo_fixed_inet6) = false; #define V_algo_fixed_inet6 VNET(algo_fixed_inet6) SYSCTL_NODE(_net_route_algo, OID_AUTO, inet6, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "IPv6 longest prefix match lookups"); #endif #ifdef INET VNET_DEFINE_STATIC(bool, algo_fixed_inet) = false; #define V_algo_fixed_inet VNET(algo_fixed_inet) SYSCTL_NODE(_net_route_algo, OID_AUTO, inet, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "IPv4 longest prefix match lookups"); #endif /* Fib instance counter */ static uint32_t fib_gen = 0; struct nhop_ref_table { uint32_t count; int32_t refcnt[0]; }; enum fib_callout_action { FDA_NONE, /* No callout scheduled */ FDA_REBUILD, /* Asks to rebuild algo instance */ FDA_EVAL, /* Asks to evaluate if the current algo is still be best */ FDA_BATCH, /* Asks to submit batch of updates to the algo */ }; struct fib_sync_status { struct timeval diverge_time; /* ts when diverged */ uint32_t num_changes; /* number of changes since sync */ uint32_t bucket_changes; /* num changes within the current bucket */ uint64_t bucket_id; /* 50ms bucket # */ struct fib_change_queue fd_change_queue;/* list of scheduled entries */ }; /* * Data structure for the fib lookup instance tied to the particular rib. 
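 *
 * Note on the sync-policy knobs defined above (worked example with defaults):
 * changes are counted per V_update_bucket_time_ms (50 ms) bucket measured from
 * fd_ss.diverge_time; the per-bucket threshold is
 * V_bucket_change_threshold_rate * 50 / 1000 = 500 * 50 / 1000 = 25 changes.
 * Once a bucket exceeds that, the pending sync is pushed out to the end of the
 * following bucket, but never further than V_fib_max_sync_delay_ms (1000 ms)
 * after the divergence point.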
*/ struct fib_data { uint32_t number_nhops; /* current # of nhops */ uint8_t hit_nhops; /* true if out of nhop limit */ uint8_t init_done; /* true if init is competed */ uint32_t fd_dead:1; /* Scheduled for deletion */ uint32_t fd_linked:1; /* true if linked */ uint32_t fd_need_rebuild:1; /* true if rebuild scheduled */ uint32_t fd_batch:1; /* true if batched notification scheduled */ uint8_t fd_family; /* family */ uint32_t fd_fibnum; /* fibnum */ uint32_t fd_failed_rebuilds; /* stat: failed rebuilds */ uint32_t fd_gen; /* instance gen# */ struct callout fd_callout; /* rebuild callout */ enum fib_callout_action fd_callout_action; /* Callout action to take */ void *fd_algo_data; /* algorithm data */ struct nhop_object **nh_idx; /* nhop idx->ptr array */ struct nhop_ref_table *nh_ref_table; /* array with # of nhop references */ struct rib_head *fd_rh; /* RIB table we're attached to */ struct rib_subscription *fd_rs; /* storing table subscription */ struct fib_dp fd_dp; /* fib datapath data */ struct vnet *fd_vnet; /* vnet fib belongs to */ struct epoch_context fd_epoch_ctx; /* epoch context for deletion */ struct fib_lookup_module *fd_flm;/* pointer to the lookup module */ struct fib_sync_status fd_ss; /* State relevant to the rib sync */ uint32_t fd_num_changes; /* number of changes since last callout */ TAILQ_ENTRY(fib_data) entries; /* list of all fds in vnet */ }; static bool rebuild_fd(struct fib_data *fd, const char *reason); static bool rebuild_fd_flm(struct fib_data *fd, struct fib_lookup_module *flm_new); static void handle_fd_callout(void *_data); static void destroy_fd_instance_epoch(epoch_context_t ctx); static bool is_idx_free(struct fib_data *fd, uint32_t index); static void set_algo_fixed(struct rib_head *rh); static bool is_algo_fixed(struct rib_head *rh); static uint32_t fib_ref_nhop(struct fib_data *fd, struct nhop_object *nh); static void fib_unref_nhop(struct fib_data *fd, struct nhop_object *nh); static struct fib_lookup_module *fib_check_best_algo(struct rib_head *rh, struct fib_lookup_module *orig_flm); static void fib_unref_algo(struct fib_lookup_module *flm); static bool flm_error_check(const struct fib_lookup_module *flm, uint32_t fibnum); struct mtx fib_mtx; #define FIB_MOD_LOCK() mtx_lock(&fib_mtx) #define FIB_MOD_UNLOCK() mtx_unlock(&fib_mtx) #define FIB_MOD_LOCK_ASSERT() mtx_assert(&fib_mtx, MA_OWNED) MTX_SYSINIT(fib_mtx, &fib_mtx, "algo list mutex", MTX_DEF); /* Algorithm has to be this percent better than the current to switch */ #define BEST_DIFF_PERCENT (5 * 256 / 100) /* Schedule algo re-evaluation X seconds after a change */ #define ALGO_EVAL_DELAY_MS 30000 /* Force algo re-evaluation after X changes */ #define ALGO_EVAL_NUM_ROUTES 100 /* Try to setup algorithm X times */ #define FIB_MAX_TRIES 32 /* Max amount of supported nexthops */ #define FIB_MAX_NHOPS 262144 #define FIB_CALLOUT_DELAY_MS 50 /* Debug */ static int flm_debug_level = LOG_NOTICE; SYSCTL_INT(_net_route_algo, OID_AUTO, debug_level, CTLFLAG_RW | CTLFLAG_RWTUN, &flm_debug_level, 0, "debuglevel"); #define FLM_MAX_DEBUG_LEVEL LOG_DEBUG #ifndef LOG_DEBUG2 #define LOG_DEBUG2 8 #endif #define _PASS_MSG(_l) (flm_debug_level >= (_l)) #define ALGO_PRINTF(_l, _fmt, ...) if (_PASS_MSG(_l)) { \ printf("[fib_algo] %s: " _fmt "\n", __func__, ##__VA_ARGS__); \ } #define _ALGO_PRINTF(_fib, _fam, _aname, _gen, _func, _fmt, ...) \ printf("[fib_algo] %s.%u (%s#%u) %s: " _fmt "\n",\ print_family(_fam), _fib, _aname, _gen, _func, ## __VA_ARGS__) #define _RH_PRINTF(_fib, _fam, _func, _fmt, ...) 
\ printf("[fib_algo] %s.%u %s: " _fmt "\n", print_family(_fam), _fib, _func, ## __VA_ARGS__) #define RH_PRINTF(_l, _rh, _fmt, ...) if (_PASS_MSG(_l)) { \ _RH_PRINTF(_rh->rib_fibnum, _rh->rib_family, __func__, _fmt, ## __VA_ARGS__);\ } #define FD_PRINTF(_l, _fd, _fmt, ...) FD_PRINTF_##_l(_l, _fd, _fmt, ## __VA_ARGS__) #define _FD_PRINTF(_l, _fd, _fmt, ...) if (_PASS_MSG(_l)) { \ _ALGO_PRINTF(_fd->fd_fibnum, _fd->fd_family, _fd->fd_flm->flm_name, \ _fd->fd_gen, __func__, _fmt, ## __VA_ARGS__); \ } #if FLM_MAX_DEBUG_LEVEL>=LOG_DEBUG2 #define FD_PRINTF_LOG_DEBUG2 _FD_PRINTF #else #define FD_PRINTF_LOG_DEBUG2(_l, _fd, _fmt, ...) #endif #if FLM_MAX_DEBUG_LEVEL>=LOG_DEBUG #define FD_PRINTF_LOG_DEBUG _FD_PRINTF #else #define FD_PRINTF_LOG_DEBUG() #endif #if FLM_MAX_DEBUG_LEVEL>=LOG_INFO #define FD_PRINTF_LOG_INFO _FD_PRINTF #else #define FD_PRINTF_LOG_INFO() #endif #define FD_PRINTF_LOG_NOTICE _FD_PRINTF #define FD_PRINTF_LOG_ERR _FD_PRINTF #define FD_PRINTF_LOG_WARNING _FD_PRINTF /* List of all registered lookup algorithms */ static TAILQ_HEAD(, fib_lookup_module) all_algo_list = TAILQ_HEAD_INITIALIZER(all_algo_list); /* List of all fib lookup instances in the vnet */ VNET_DEFINE_STATIC(TAILQ_HEAD(fib_data_head, fib_data), fib_data_list); #define V_fib_data_list VNET(fib_data_list) /* Datastructure for storing non-transient fib lookup module failures */ struct fib_error { int fe_family; uint32_t fe_fibnum; /* failed rtable */ struct fib_lookup_module *fe_flm; /* failed module */ TAILQ_ENTRY(fib_error) entries;/* list of all errored entries */ }; VNET_DEFINE_STATIC(TAILQ_HEAD(fib_error_head, fib_error), fib_error_list); #define V_fib_error_list VNET(fib_error_list) /* Per-family array of fibnum -> {func, arg} mappings used in datapath */ struct fib_dp_header { struct epoch_context fdh_epoch_ctx; uint32_t fdh_num_tables; struct fib_dp fdh_idx[0]; }; /* * Tries to add new non-transient algorithm error to the list of * errors. * Returns true on success. */ static bool flm_error_add(struct fib_lookup_module *flm, uint32_t fibnum) { struct fib_error *fe; fe = malloc(sizeof(struct fib_error), M_TEMP, M_NOWAIT | M_ZERO); if (fe == NULL) return (false); fe->fe_flm = flm; fe->fe_family = flm->flm_family; fe->fe_fibnum = fibnum; FIB_MOD_LOCK(); /* Avoid duplicates by checking if error already exists first */ if (flm_error_check(flm, fibnum)) { FIB_MOD_UNLOCK(); free(fe, M_TEMP); return (true); } TAILQ_INSERT_HEAD(&V_fib_error_list, fe, entries); FIB_MOD_UNLOCK(); return (true); } /* * True if non-transient error has been registered for @flm in @fibnum. */ static bool flm_error_check(const struct fib_lookup_module *flm, uint32_t fibnum) { const struct fib_error *fe; TAILQ_FOREACH(fe, &V_fib_error_list, entries) { if ((fe->fe_flm == flm) && (fe->fe_fibnum == fibnum)) return (true); } return (false); } /* * Clear all errors of algo specified by @flm. */ static void fib_error_clear_flm(struct fib_lookup_module *flm) { struct fib_error *fe, *fe_tmp; FIB_MOD_LOCK_ASSERT(); TAILQ_FOREACH_SAFE(fe, &V_fib_error_list, entries, fe_tmp) { if (fe->fe_flm == flm) { TAILQ_REMOVE(&V_fib_error_list, fe, entries); free(fe, M_TEMP); } } } /* * Clears all errors in current VNET. 
*/ static void fib_error_clear(void) { struct fib_error *fe, *fe_tmp; FIB_MOD_LOCK_ASSERT(); TAILQ_FOREACH_SAFE(fe, &V_fib_error_list, entries, fe_tmp) { TAILQ_REMOVE(&V_fib_error_list, fe, entries); free(fe, M_TEMP); } } static const char * print_op_result(enum flm_op_result result) { switch (result) { case FLM_SUCCESS: return "success"; case FLM_REBUILD: return "rebuild"; case FLM_BATCH: return "batch"; case FLM_ERROR: return "error"; } return "unknown"; } static const char * print_family(int family) { if (family == AF_INET) return ("inet"); else if (family == AF_INET6) return ("inet6"); else return ("unknown"); } /* * Debug function used by lookup algorithms. * Outputs message denoted by @fmt, prepended by "[fib_algo] inetX.Y (algo) " */ void fib_printf(int level, struct fib_data *fd, const char *func, char *fmt, ...) { char buf[128]; va_list ap; if (level > flm_debug_level) return; va_start(ap, fmt); vsnprintf(buf, sizeof(buf), fmt, ap); va_end(ap); _ALGO_PRINTF(fd->fd_fibnum, fd->fd_family, fd->fd_flm->flm_name, fd->fd_gen, func, "%s", buf); } /* * Outputs list of algorithms supported by the provided address family. */ static int print_algos_sysctl(struct sysctl_req *req, int family) { struct fib_lookup_module *flm; struct sbuf sbuf; int error, count = 0; error = sysctl_wire_old_buffer(req, 0); if (error == 0) { sbuf_new_for_sysctl(&sbuf, NULL, 512, req); TAILQ_FOREACH(flm, &all_algo_list, entries) { if (flm->flm_family == family) { if (count++ > 0) sbuf_cat(&sbuf, ", "); sbuf_cat(&sbuf, flm->flm_name); } } error = sbuf_finish(&sbuf); sbuf_delete(&sbuf); } return (error); } #ifdef INET6 static int print_algos_sysctl_inet6(SYSCTL_HANDLER_ARGS) { return (print_algos_sysctl(req, AF_INET6)); } SYSCTL_PROC(_net_route_algo_inet6, OID_AUTO, algo_list, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, print_algos_sysctl_inet6, "A", "List of IPv6 lookup algorithms"); #endif #ifdef INET static int print_algos_sysctl_inet(SYSCTL_HANDLER_ARGS) { return (print_algos_sysctl(req, AF_INET)); } SYSCTL_PROC(_net_route_algo_inet, OID_AUTO, algo_list, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, print_algos_sysctl_inet, "A", "List of IPv4 lookup algorithms"); #endif /* * Calculate delay between repeated failures. * Returns current delay in milliseconds. */ static uint32_t callout_calc_delay_ms(struct fib_data *fd) { uint32_t shift; if (fd->fd_failed_rebuilds > 10) shift = 10; else shift = fd->fd_failed_rebuilds; return ((1 << shift) * FIB_CALLOUT_DELAY_MS); } static void schedule_callout(struct fib_data *fd, enum fib_callout_action action, int delay_ms) { FD_PRINTF(LOG_DEBUG, fd, "delay=%d action=%d", delay_ms, action); fd->fd_callout_action = action; callout_reset_sbt(&fd->fd_callout, SBT_1MS * delay_ms, 0, handle_fd_callout, fd, 0); } static void schedule_fd_rebuild(struct fib_data *fd, const char *reason) { RIB_WLOCK_ASSERT(fd->fd_rh); if (!fd->fd_need_rebuild) { fd->fd_need_rebuild = true; /* Stop batch updates */ fd->fd_batch = false; /* * Potentially re-schedules pending callout * initiated by schedule_algo_eval. 
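 *
 * The delay itself comes from callout_calc_delay_ms() above and backs off
 * exponentially with the number of failed rebuilds:
 * (1 << min(fd_failed_rebuilds, 10)) * FIB_CALLOUT_DELAY_MS,
 * i.e. 50ms, 100ms, 200ms, ... capped at 1024 * 50ms = 51.2s.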
*/ FD_PRINTF(LOG_INFO, fd, "Scheduling rebuild: %s (failures=%d)", reason, fd->fd_failed_rebuilds); schedule_callout(fd, FDA_REBUILD, callout_calc_delay_ms(fd)); } } static void sync_rib_gen(struct fib_data *fd) { FD_PRINTF(LOG_DEBUG, fd, "Sync gen %u -> %u", fd->fd_rh->rnh_gen, fd->fd_rh->rnh_gen_rib); fd->fd_rh->rnh_gen = fd->fd_rh->rnh_gen_rib; } static int64_t get_tv_diff_ms(const struct timeval *old_tv, const struct timeval *new_tv) { int64_t diff = 0; diff = ((int64_t)(new_tv->tv_sec - old_tv->tv_sec)) * 1000; diff += (new_tv->tv_usec - old_tv->tv_usec) / 1000; return (diff); } static void add_tv_diff_ms(struct timeval *tv, int ms) { tv->tv_sec += ms / 1000; ms = ms % 1000; if (ms * 1000 + tv->tv_usec < 1000000) tv->tv_usec += ms * 1000; else { tv->tv_sec += 1; tv->tv_usec = ms * 1000 + tv->tv_usec - 1000000; } } /* * Marks the time when algo state diverges from the rib state. */ static void mark_diverge_time(struct fib_data *fd) { struct fib_sync_status *fd_ss = &fd->fd_ss; getmicrouptime(&fd_ss->diverge_time); fd_ss->bucket_id = 0; fd_ss->bucket_changes = 0; } /* * Calculates and updates the next algorithm sync time, based on the current activity. * * The intent is to provide reasonable balance between the update * latency and efficient batching when changing large amount of routes. * * High-level algorithm looks the following: * 1) all changes are bucketed in 50ms intervals * 2) If amount of changes within the bucket is greater than the threshold, * the update gets delayed, up to maximum delay threshold. */ static void update_rebuild_delay(struct fib_data *fd, enum fib_callout_action action) { uint32_t bucket_id, new_delay = 0; struct timeval tv; /* Fetch all variables at once to ensure consistent reads */ uint32_t bucket_time_ms = V_update_bucket_time_ms; uint32_t threshold_rate = V_bucket_change_threshold_rate; uint32_t max_delay_ms = V_fib_max_sync_delay_ms; if (bucket_time_ms == 0) bucket_time_ms = 50; /* calculate per-bucket threshold rate */ threshold_rate = threshold_rate * bucket_time_ms / 1000; getmicrouptime(&tv); struct fib_sync_status *fd_ss = &fd->fd_ss; bucket_id = get_tv_diff_ms(&fd_ss->diverge_time, &tv) / bucket_time_ms; if (fd_ss->bucket_id == bucket_id) { fd_ss->bucket_changes++; if (fd_ss->bucket_changes == threshold_rate) { new_delay = (bucket_id + 2) * bucket_time_ms; if (new_delay <= max_delay_ms) { FD_PRINTF(LOG_DEBUG, fd, "hit threshold of %u routes, delay update," "bucket: %u, total delay: %u", threshold_rate, bucket_id + 1, new_delay); } else { new_delay = 0; FD_PRINTF(LOG_DEBUG, fd, "maximum sync delay (%u ms) reached", max_delay_ms); } } else if ((bucket_id == 0) && (fd_ss->bucket_changes == 1)) new_delay = bucket_time_ms; } else { fd_ss->bucket_id = bucket_id; fd_ss->bucket_changes = 1; } if (new_delay > 0) { /* Calculated time has been updated */ struct timeval new_tv = fd_ss->diverge_time; add_tv_diff_ms(&new_tv, new_delay); int32_t delay_ms = get_tv_diff_ms(&tv, &new_tv); schedule_callout(fd, action, delay_ms); } } static void update_algo_state(struct fib_data *fd) { RIB_WLOCK_ASSERT(fd->fd_rh); if (fd->fd_batch || fd->fd_need_rebuild) { enum fib_callout_action action = fd->fd_need_rebuild ? 
FDA_REBUILD : FDA_BATCH; update_rebuild_delay(fd, action); return; } if (fd->fd_num_changes++ == 0) { /* Start callout to consider switch */ if (!callout_pending(&fd->fd_callout)) schedule_callout(fd, FDA_EVAL, ALGO_EVAL_DELAY_MS); } else if (fd->fd_num_changes == ALGO_EVAL_NUM_ROUTES) { /* Reset callout to exec immediately */ if (fd->fd_callout_action == FDA_EVAL) schedule_callout(fd, FDA_EVAL, 1); } } static bool need_immediate_sync(struct fib_data *fd, struct rib_cmd_info *rc) { struct nhop_object *nh; /* Sync addition/removal of interface routes */ switch (rc->rc_cmd) { case RTM_ADD: nh = rc->rc_nh_new; if (!NH_IS_NHGRP(nh)) { if (!(nh->nh_flags & NHF_GATEWAY)) return (true); if (nhop_get_rtflags(nh) & RTF_STATIC) return (true); } break; case RTM_DELETE: nh = rc->rc_nh_old; if (!NH_IS_NHGRP(nh)) { if (!(nh->nh_flags & NHF_GATEWAY)) return (true); if (nhop_get_rtflags(nh) & RTF_STATIC) return (true); } break; } return (false); } static bool apply_rtable_changes(struct fib_data *fd) { enum flm_op_result result; struct fib_change_queue *q = &fd->fd_ss.fd_change_queue; result = fd->fd_flm->flm_change_rib_items_cb(fd->fd_rh, q, fd->fd_algo_data); if (result == FLM_SUCCESS) { sync_rib_gen(fd); for (int i = 0; i < q->count; i++) if (q->entries[i].nh_old) fib_unref_nhop(fd, q->entries[i].nh_old); q->count = 0; } fd->fd_batch = false; return (result == FLM_SUCCESS); } static bool fill_change_entry(struct fib_data *fd, struct fib_change_entry *ce, struct rib_cmd_info *rc) { int plen = 0; switch (fd->fd_family) { #ifdef INET case AF_INET: rt_get_inet_prefix_plen(rc->rc_rt, &ce->addr4, &plen, &ce->scopeid); break; #endif #ifdef INET6 case AF_INET6: rt_get_inet6_prefix_plen(rc->rc_rt, &ce->addr6, &plen, &ce->scopeid); break; #endif } ce->plen = plen; ce->nh_old = rc->rc_nh_old; ce->nh_new = rc->rc_nh_new; if (ce->nh_new != NULL) { if (fib_ref_nhop(fd, ce->nh_new) == 0) return (false); } return (true); } static bool queue_rtable_change(struct fib_data *fd, struct rib_cmd_info *rc) { struct fib_change_queue *q = &fd->fd_ss.fd_change_queue; if (q->count >= q->size) { uint32_t q_size; if (q->size == 0) q_size = 256; /* ~18k memory */ else q_size = q->size * 2; size_t size = q_size * sizeof(struct fib_change_entry); void *a = realloc(q->entries, size, M_TEMP, M_NOWAIT | M_ZERO); if (a == NULL) { FD_PRINTF(LOG_INFO, fd, "Unable to realloc queue for %u elements", q_size); return (false); } q->entries = a; q->size = q_size; } return (fill_change_entry(fd, &q->entries[q->count++], rc)); } /* * Rib subscription handler. Checks if the algorithm is ready to * receive updates, handles nexthop refcounting and passes change * data to the algorithm callback. */ static void handle_rtable_change_cb(struct rib_head *rnh, struct rib_cmd_info *rc, void *_data) { struct fib_data *fd = (struct fib_data *)_data; enum flm_op_result result; RIB_WLOCK_ASSERT(rnh); /* * There is a small gap between subscribing for route changes * and initiating rtable dump. Avoid receiving route changes * prior to finishing rtable dump by checking `init_done`. */ if (!fd->init_done) return; bool immediate_sync = need_immediate_sync(fd, rc); /* Consider scheduling algorithm re-evaluation */ update_algo_state(fd); /* * If algo requested rebuild, stop sending updates by default. * This simplifies nexthop refcount handling logic. */ if (fd->fd_need_rebuild) { if (immediate_sync) rebuild_fd(fd, "rtable change type enforced sync"); return; } /* * Algo requested updates to be delivered in batches. * Add the current change to the queue and return. 
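 *
 * For reference, the result of the per-change algo callback further below is
 * handled as follows:
 *	FLM_SUCCESS - sync the rib generation, unref the old nexthop;
 *	FLM_BATCH   - queue the change and delay the sync (or apply the queue
 *	              right away when the change requires an immediate sync);
 *	FLM_REBUILD - schedule (or run) a full algo rebuild;
 *	FLM_ERROR   - record a non-transient error for this algo/fib and
 *	              schedule a rebuild, triggering best-algo re-selection.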
*/ if (fd->fd_batch) { if (immediate_sync) { if (!queue_rtable_change(fd, rc) || !apply_rtable_changes(fd)) rebuild_fd(fd, "batch sync failed"); } else { if (!queue_rtable_change(fd, rc)) schedule_fd_rebuild(fd, "batch queue failed"); } return; } /* * Maintain guarantee that every nexthop returned by the dataplane * lookup has > 0 refcount, so can be safely referenced within current * epoch. */ if (rc->rc_nh_new != NULL) { if (fib_ref_nhop(fd, rc->rc_nh_new) == 0) { /* ran out of indexes */ schedule_fd_rebuild(fd, "ran out of nhop indexes"); return; } } result = fd->fd_flm->flm_change_rib_item_cb(rnh, rc, fd->fd_algo_data); switch (result) { case FLM_SUCCESS: sync_rib_gen(fd); /* Unref old nexthop on success */ if (rc->rc_nh_old != NULL) fib_unref_nhop(fd, rc->rc_nh_old); break; case FLM_BATCH: /* * Algo asks to batch the changes. */ if (queue_rtable_change(fd, rc)) { if (!immediate_sync) { fd->fd_batch = true; mark_diverge_time(fd); update_rebuild_delay(fd, FDA_BATCH); break; } if (apply_rtable_changes(fd)) break; } FD_PRINTF(LOG_ERR, fd, "batched sync failed, force the rebuild"); case FLM_REBUILD: /* * Algo is not able to apply the update. * Schedule algo rebuild. */ if (!immediate_sync) { mark_diverge_time(fd); schedule_fd_rebuild(fd, "algo requested rebuild"); break; } FD_PRINTF(LOG_INFO, fd, "running sync rebuild"); rebuild_fd(fd, "rtable change type enforced sync"); break; case FLM_ERROR: /* * Algo reported a non-recoverable error. * Record the error and schedule rebuild, which will * trigger best algo selection. */ FD_PRINTF(LOG_ERR, fd, "algo reported non-recoverable error"); if (!flm_error_add(fd->fd_flm, fd->fd_fibnum)) FD_PRINTF(LOG_ERR, fd, "failed to ban algo"); schedule_fd_rebuild(fd, "algo reported non-recoverable error"); } } static void estimate_nhop_scale(const struct fib_data *old_fd, struct fib_data *fd) { if (old_fd == NULL) { // TODO: read from rtable fd->number_nhops = 16; return; } if (old_fd->hit_nhops && old_fd->number_nhops < FIB_MAX_NHOPS) fd->number_nhops = 2 * old_fd->number_nhops; else fd->number_nhops = old_fd->number_nhops; } struct walk_cbdata { struct fib_data *fd; flm_dump_t *func; enum flm_op_result result; }; /* * Handler called after all rtenties have been dumped. * Performs post-dump framework checks and calls * algo:flm_dump_end_cb(). * * Updates walk_cbdata result. */ static void sync_algo_end_cb(struct rib_head *rnh, enum rib_walk_hook stage, void *_data) { struct walk_cbdata *w = (struct walk_cbdata *)_data; struct fib_data *fd = w->fd; RIB_WLOCK_ASSERT(w->fd->fd_rh); if (rnh->rib_dying) { w->result = FLM_ERROR; return; } if (fd->hit_nhops) { FD_PRINTF(LOG_INFO, fd, "ran out of nexthops at %u nhops", fd->nh_ref_table->count); if (w->result == FLM_SUCCESS) w->result = FLM_REBUILD; return; } if (stage != RIB_WALK_HOOK_POST || w->result != FLM_SUCCESS) return; /* Post-dump hook, dump successful */ w->result = fd->fd_flm->flm_dump_end_cb(fd->fd_algo_data, &fd->fd_dp); if (w->result == FLM_SUCCESS) { /* Mark init as done to allow routing updates */ fd->init_done = 1; } } /* * Callback for each entry in rib. * Calls algo:flm_dump_rib_item_cb func as a part of initial * route table synchronisation. */ static int sync_algo_cb(struct rtentry *rt, void *_data) { struct walk_cbdata *w = (struct walk_cbdata *)_data; RIB_WLOCK_ASSERT(w->fd->fd_rh); if (w->result == FLM_SUCCESS && w->func) { /* * Reference nexthops to maintain guarantee that * each nexthop returned by datapath has > 0 references * and can be safely referenced within current epoch. 
*/ struct nhop_object *nh = rt_get_raw_nhop(rt); if (fib_ref_nhop(w->fd, nh) != 0) w->result = w->func(rt, w->fd->fd_algo_data); else w->result = FLM_REBUILD; } return (0); } /* * Dump all routing table state to the algo instance. */ static enum flm_op_result sync_algo(struct fib_data *fd) { struct walk_cbdata w = { .fd = fd, .func = fd->fd_flm->flm_dump_rib_item_cb, .result = FLM_SUCCESS, }; rib_walk_ext_locked(fd->fd_rh, sync_algo_cb, sync_algo_end_cb, &w); FD_PRINTF(LOG_INFO, fd, "initial dump completed (rtable version: %d), result: %s", fd->fd_rh->rnh_gen, print_op_result(w.result)); return (w.result); } /* * Schedules epoch-backed @fd instance deletion. * * Unlinks @fd from the list of active algo instances. * * Removes rib subscription. * * Stops callout. * * Schedules actual deletion. * * Assume @fd is already unlinked from the datapath. */ static int schedule_destroy_fd_instance(struct fib_data *fd, bool in_callout) { bool is_dead; NET_EPOCH_ASSERT(); RIB_WLOCK_ASSERT(fd->fd_rh); FIB_MOD_LOCK(); is_dead = fd->fd_dead; if (!is_dead) fd->fd_dead = true; if (fd->fd_linked) { TAILQ_REMOVE(&V_fib_data_list, fd, entries); fd->fd_linked = false; } FIB_MOD_UNLOCK(); if (is_dead) return (0); FD_PRINTF(LOG_INFO, fd, "DETACH"); if (fd->fd_rs != NULL) rib_unsubscribe_locked(fd->fd_rs); /* * After rib_unsubscribe() no _new_ handle_rtable_change_cb() calls * will be executed, hence no _new_ callout schedules will happen. */ callout_stop(&fd->fd_callout); fib_epoch_call(destroy_fd_instance_epoch, &fd->fd_epoch_ctx); return (0); } /* * Wipe all fd instances from the list matching rib specified by @rh. * If @keep_first is set, remove all but the first record. */ static void fib_cleanup_algo(struct rib_head *rh, bool keep_first, bool in_callout) { struct fib_data_head tmp_head = TAILQ_HEAD_INITIALIZER(tmp_head); struct fib_data *fd, *fd_tmp; struct epoch_tracker et; FIB_MOD_LOCK(); TAILQ_FOREACH_SAFE(fd, &V_fib_data_list, entries, fd_tmp) { if (fd->fd_rh == rh) { if (keep_first) { keep_first = false; continue; } TAILQ_REMOVE(&V_fib_data_list, fd, entries); fd->fd_linked = false; TAILQ_INSERT_TAIL(&tmp_head, fd, entries); } } FIB_MOD_UNLOCK(); /* Pass 2: remove each entry */ NET_EPOCH_ENTER(et); TAILQ_FOREACH_SAFE(fd, &tmp_head, entries, fd_tmp) { if (!in_callout) RIB_WLOCK(fd->fd_rh); schedule_destroy_fd_instance(fd, in_callout); if (!in_callout) RIB_WUNLOCK(fd->fd_rh); } NET_EPOCH_EXIT(et); } void fib_destroy_rib(struct rib_head *rh) { /* * rnh has `is_dying` flag set, so setup of new fd's will fail at * sync_algo() stage, preventing new entries to be added to the list * of active algos. Remove all existing entries for the particular rib. */ fib_cleanup_algo(rh, false, false); } /* * Finalises fd destruction by freeing all fd resources. 
*/ static void destroy_fd_instance(struct fib_data *fd) { FD_PRINTF(LOG_INFO, fd, "destroy fd %p", fd); /* Call destroy callback first */ if (fd->fd_algo_data != NULL) fd->fd_flm->flm_destroy_cb(fd->fd_algo_data); /* Nhop table */ if ((fd->nh_idx != NULL) && (fd->nh_ref_table != NULL)) { for (int i = 0; i < fd->number_nhops; i++) { if (!is_idx_free(fd, i)) { FD_PRINTF(LOG_DEBUG2, fd, " FREE nhop %d %p", i, fd->nh_idx[i]); nhop_free_any(fd->nh_idx[i]); } } free(fd->nh_idx, M_RTABLE); } if (fd->nh_ref_table != NULL) free(fd->nh_ref_table, M_RTABLE); if (fd->fd_ss.fd_change_queue.entries != NULL) free(fd->fd_ss.fd_change_queue.entries, M_TEMP); fib_unref_algo(fd->fd_flm); free(fd, M_RTABLE); } /* * Epoch callback indicating fd is safe to destroy */ static void destroy_fd_instance_epoch(epoch_context_t ctx) { struct fib_data *fd; fd = __containerof(ctx, struct fib_data, fd_epoch_ctx); CURVNET_SET(fd->fd_vnet); destroy_fd_instance(fd); CURVNET_RESTORE(); } /* * Tries to setup fd instance. * - Allocates fd/nhop table * - Runs algo:flm_init_cb algo init * - Subscribes fd to the rib * - Runs rtable dump * - Adds instance to the list of active instances. * * Returns: operation result. Fills in @pfd with resulting fd on success. * */ static enum flm_op_result try_setup_fd_instance(struct fib_lookup_module *flm, struct rib_head *rh, struct fib_data *old_fd, struct fib_data **pfd) { struct fib_data *fd; size_t size; enum flm_op_result result; /* Allocate */ fd = malloc(sizeof(struct fib_data), M_RTABLE, M_NOWAIT | M_ZERO); if (fd == NULL) { *pfd = NULL; RH_PRINTF(LOG_INFO, rh, "Unable to allocate fib_data structure"); return (FLM_REBUILD); } *pfd = fd; estimate_nhop_scale(old_fd, fd); fd->fd_rh = rh; fd->fd_family = rh->rib_family; fd->fd_fibnum = rh->rib_fibnum; callout_init_rm(&fd->fd_callout, &rh->rib_lock, 0); fd->fd_vnet = curvnet; fd->fd_flm = flm; FIB_MOD_LOCK(); flm->flm_refcount++; fd->fd_gen = ++fib_gen; FIB_MOD_UNLOCK(); FD_PRINTF(LOG_DEBUG, fd, "allocated fd %p", fd); /* Allocate nhidx -> nhop_ptr table */ size = fd->number_nhops * sizeof(void *); fd->nh_idx = malloc(size, M_RTABLE, M_NOWAIT | M_ZERO); if (fd->nh_idx == NULL) { FD_PRINTF(LOG_INFO, fd, "Unable to allocate nhop table idx (sz:%zu)", size); return (FLM_REBUILD); } /* Allocate nhop index refcount table */ size = sizeof(struct nhop_ref_table); size += fd->number_nhops * sizeof(uint32_t); fd->nh_ref_table = malloc(size, M_RTABLE, M_NOWAIT | M_ZERO); if (fd->nh_ref_table == NULL) { FD_PRINTF(LOG_INFO, fd, "Unable to allocate nhop refcount table (sz:%zu)", size); return (FLM_REBUILD); } FD_PRINTF(LOG_DEBUG, fd, "Allocated %u nhop indexes", fd->number_nhops); /* Okay, we're ready for algo init */ void *old_algo_data = (old_fd != NULL) ? 
old_fd->fd_algo_data : NULL; result = flm->flm_init_cb(fd->fd_fibnum, fd, old_algo_data, &fd->fd_algo_data); if (result != FLM_SUCCESS) { FD_PRINTF(LOG_INFO, fd, "%s algo init failed", flm->flm_name); return (result); } /* Try to subscribe */ if (flm->flm_change_rib_item_cb != NULL) { fd->fd_rs = rib_subscribe_locked(fd->fd_rh, handle_rtable_change_cb, fd, RIB_NOTIFY_IMMEDIATE); if (fd->fd_rs == NULL) { FD_PRINTF(LOG_INFO, fd, "failed to subscribe to the rib changes"); return (FLM_REBUILD); } } /* Dump */ result = sync_algo(fd); if (result != FLM_SUCCESS) { FD_PRINTF(LOG_INFO, fd, "rib sync failed"); return (result); } FD_PRINTF(LOG_INFO, fd, "DUMP completed successfully."); FIB_MOD_LOCK(); /* * Insert fd in the beginning of a list, to maintain invariant * that first matching entry for the AF/fib is always the active * one. */ TAILQ_INSERT_HEAD(&V_fib_data_list, fd, entries); fd->fd_linked = true; FIB_MOD_UNLOCK(); return (FLM_SUCCESS); } /* * Sets up algo @flm for table @rh and links it to the datapath. * */ static enum flm_op_result setup_fd_instance(struct fib_lookup_module *flm, struct rib_head *rh, struct fib_data *orig_fd, struct fib_data **pfd, bool attach) { struct fib_data *prev_fd, *new_fd; enum flm_op_result result; NET_EPOCH_ASSERT(); RIB_WLOCK_ASSERT(rh); prev_fd = orig_fd; new_fd = NULL; for (int i = 0; i < FIB_MAX_TRIES; i++) { result = try_setup_fd_instance(flm, rh, prev_fd, &new_fd); if ((result == FLM_SUCCESS) && attach) { if (fib_set_datapath_ptr(new_fd, &new_fd->fd_dp)) sync_rib_gen(new_fd); else result = FLM_REBUILD; } if ((prev_fd != NULL) && (prev_fd != orig_fd)) { schedule_destroy_fd_instance(prev_fd, false); prev_fd = NULL; } RH_PRINTF(LOG_INFO, rh, "try %d: fib algo result: %s", i, print_op_result(result)); if (result == FLM_REBUILD) { prev_fd = new_fd; new_fd = NULL; continue; } break; } if (result != FLM_SUCCESS) { RH_PRINTF(LOG_WARNING, rh, "%s algo instance setup failed, failures=%d", flm->flm_name, orig_fd ? orig_fd->fd_failed_rebuilds + 1 : 0); /* update failure count */ FIB_MOD_LOCK(); if (orig_fd != NULL) orig_fd->fd_failed_rebuilds++; FIB_MOD_UNLOCK(); /* Ban algo on non-recoverable error */ if (result == FLM_ERROR) flm_error_add(flm, rh->rib_fibnum); if ((prev_fd != NULL) && (prev_fd != orig_fd)) schedule_destroy_fd_instance(prev_fd, false); if (new_fd != NULL) { schedule_destroy_fd_instance(new_fd, false); new_fd = NULL; } } *pfd = new_fd; return (result); } /* * Tries to sync algo with the current rtable state, either * by executing batch update or rebuilding. * Returns true on success. */ static bool execute_callout_action(struct fib_data *fd) { enum fib_callout_action action = fd->fd_callout_action; struct fib_lookup_module *flm_new = NULL; bool result = true; NET_EPOCH_ASSERT(); RIB_WLOCK_ASSERT(fd->fd_rh); fd->fd_need_rebuild = false; fd->fd_batch = false; fd->fd_num_changes = 0; /* First, check if we're still OK to use this algo */ if (!is_algo_fixed(fd->fd_rh)) flm_new = fib_check_best_algo(fd->fd_rh, fd->fd_flm); if (flm_new != NULL) action = FDA_REBUILD; if (action == FDA_BATCH) { /* Try to sync */ if (!apply_rtable_changes(fd)) action = FDA_REBUILD; } if (action == FDA_REBUILD) result = rebuild_fd_flm(fd, flm_new != NULL ? flm_new : fd->fd_flm); if (flm_new != NULL) fib_unref_algo(flm_new); return (result); } /* * Callout for all scheduled fd-related work. * - Checks if the current algo is still the best algo * - Synchronises algo instance to the rtable (batch usecase) * - Creates a new instance of an algo for af/fib if desired. 
*/ static void handle_fd_callout(void *_data) { struct fib_data *fd = (struct fib_data *)_data; struct epoch_tracker et; FD_PRINTF(LOG_INFO, fd, "running callout type=%d", fd->fd_callout_action); NET_EPOCH_ENTER(et); CURVNET_SET(fd->fd_vnet); execute_callout_action(fd); CURVNET_RESTORE(); NET_EPOCH_EXIT(et); } /* * Tries to create new algo instance based on @fd data. * Returns true on success. */ static bool rebuild_fd_flm(struct fib_data *fd, struct fib_lookup_module *flm_new) { struct fib_data *fd_new, *fd_tmp = NULL; bool result; if (flm_new == fd->fd_flm) fd_tmp = fd; else FD_PRINTF(LOG_NOTICE, fd, "switching algo to %s", flm_new->flm_name); result = setup_fd_instance(flm_new, fd->fd_rh, fd_tmp, &fd_new, true); if (result != FLM_SUCCESS) { FD_PRINTF(LOG_NOTICE, fd, "table rebuild failed"); return (false); } FD_PRINTF(LOG_INFO, fd_new, "switched to new instance"); /* Remove old instance */ schedule_destroy_fd_instance(fd, true); return (true); } static bool rebuild_fd(struct fib_data *fd, const char *reason) { struct fib_lookup_module *flm_new = NULL; bool result; if (!is_algo_fixed(fd->fd_rh)) flm_new = fib_check_best_algo(fd->fd_rh, fd->fd_flm); FD_PRINTF(LOG_INFO, fd, "running sync rebuild: %s", reason); result = rebuild_fd_flm(fd, flm_new != NULL ? flm_new : fd->fd_flm); if (flm_new != NULL) fib_unref_algo(flm_new); if (!result) { FD_PRINTF(LOG_ERR, fd, "sync rebuild failed"); schedule_fd_rebuild(fd, "sync rebuild failed"); } return (result); } /* * Finds algo by name/family. * Returns referenced algo or NULL. */ static struct fib_lookup_module * fib_find_algo(const char *algo_name, int family) { struct fib_lookup_module *flm; FIB_MOD_LOCK(); TAILQ_FOREACH(flm, &all_algo_list, entries) { if ((strcmp(flm->flm_name, algo_name) == 0) && (family == flm->flm_family)) { flm->flm_refcount++; FIB_MOD_UNLOCK(); return (flm); } } FIB_MOD_UNLOCK(); return (NULL); } static void fib_unref_algo(struct fib_lookup_module *flm) { FIB_MOD_LOCK(); flm->flm_refcount--; FIB_MOD_UNLOCK(); } static int set_fib_algo(uint32_t fibnum, int family, struct sysctl_oid *oidp, struct sysctl_req *req) { struct fib_lookup_module *flm = NULL; struct fib_data *fd = NULL; char old_algo_name[32], algo_name[32]; struct rib_head *rh = NULL; enum flm_op_result result; struct epoch_tracker et; int error; /* Fetch current algo/rib for af/family */ FIB_MOD_LOCK(); TAILQ_FOREACH(fd, &V_fib_data_list, entries) { if ((fd->fd_family == family) && (fd->fd_fibnum == fibnum)) break; } if (fd == NULL) { FIB_MOD_UNLOCK(); return (ENOENT); } rh = fd->fd_rh; strlcpy(old_algo_name, fd->fd_flm->flm_name, sizeof(old_algo_name)); FIB_MOD_UNLOCK(); strlcpy(algo_name, old_algo_name, sizeof(algo_name)); error = sysctl_handle_string(oidp, algo_name, sizeof(algo_name), req); if (error != 0 || req->newptr == NULL) return (error); if (strcmp(algo_name, old_algo_name) == 0) return (0); /* New algorithm name is different */ flm = fib_find_algo(algo_name, family); if (flm == NULL) { RH_PRINTF(LOG_INFO, rh, "unable to find algo %s", algo_name); return (ESRCH); } fd = NULL; NET_EPOCH_ENTER(et); RIB_WLOCK(rh); result = setup_fd_instance(flm, rh, NULL, &fd, true); RIB_WUNLOCK(rh); NET_EPOCH_EXIT(et); fib_unref_algo(flm); if (result != FLM_SUCCESS) return (EINVAL); /* Disable automated jumping between algos */ FIB_MOD_LOCK(); set_algo_fixed(rh); FIB_MOD_UNLOCK(); /* Remove old instance(s) */ fib_cleanup_algo(rh, true, false); /* Drain cb so user can unload the module after userret if so desired */ NET_EPOCH_DRAIN_CALLBACKS(); return (0); } #ifdef INET 
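/*
 * Userland interface to the above: the per-family "algo" sysctl below switches
 * the active lookup algorithm and pins it (disabling automatic re-selection),
 * while "algo_list" enumerates the registered modules. Example usage (the
 * module name is illustrative and depends on what is compiled in or loaded):
 *
 *	# sysctl net.route.algo.inet.algo_list
 *	# sysctl net.route.algo.inet.algo=radix4_lockless
 */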
static int set_algo_inet_sysctl_handler(SYSCTL_HANDLER_ARGS) { return (set_fib_algo(curthread->td_proc->p_fibnum, AF_INET, oidp, req)); } SYSCTL_PROC(_net_route_algo_inet, OID_AUTO, algo, CTLFLAG_VNET | CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0, set_algo_inet_sysctl_handler, "A", "Set IPv4 lookup algo"); #endif #ifdef INET6 static int set_algo_inet6_sysctl_handler(SYSCTL_HANDLER_ARGS) { return (set_fib_algo(curthread->td_proc->p_fibnum, AF_INET6, oidp, req)); } SYSCTL_PROC(_net_route_algo_inet6, OID_AUTO, algo, CTLFLAG_VNET | CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0, set_algo_inet6_sysctl_handler, "A", "Set IPv6 lookup algo"); #endif static struct nhop_object * dummy_lookup(void *algo_data, const struct flm_lookup_key key, uint32_t scopeid) { return (NULL); } static void destroy_fdh_epoch(epoch_context_t ctx) { struct fib_dp_header *fdh; fdh = __containerof(ctx, struct fib_dp_header, fdh_epoch_ctx); free(fdh, M_RTABLE); } static struct fib_dp_header * alloc_fib_dp_array(uint32_t num_tables, bool waitok) { size_t sz; struct fib_dp_header *fdh; sz = sizeof(struct fib_dp_header); sz += sizeof(struct fib_dp) * num_tables; fdh = malloc(sz, M_RTABLE, (waitok ? M_WAITOK : M_NOWAIT) | M_ZERO); if (fdh != NULL) { fdh->fdh_num_tables = num_tables; /* * Set dummy lookup function ptr always returning NULL, so * we can delay algo init. */ for (uint32_t i = 0; i < num_tables; i++) fdh->fdh_idx[i].f = dummy_lookup; } return (fdh); } static struct fib_dp_header * get_fib_dp_header(struct fib_dp *dp) { return (__containerof((void *)dp, struct fib_dp_header, fdh_idx)); } /* * Replace per-family index pool @pdp with a new one which * contains updated callback/algo data from @fd. * Returns true on success. */ static bool replace_rtables_family(struct fib_dp **pdp, struct fib_data *fd, struct fib_dp *dp) { struct fib_dp_header *new_fdh, *old_fdh; NET_EPOCH_ASSERT(); FD_PRINTF(LOG_DEBUG, fd, "[vnet %p] replace with f:%p arg:%p", curvnet, dp->f, dp->arg); FIB_MOD_LOCK(); old_fdh = get_fib_dp_header(*pdp); if (old_fdh->fdh_idx[fd->fd_fibnum].f == dp->f) { /* * Function is the same, data pointer needs update. * Perform in-line replace without reallocation. */ old_fdh->fdh_idx[fd->fd_fibnum].arg = dp->arg; FD_PRINTF(LOG_DEBUG, fd, "FDH %p inline update", old_fdh); FIB_MOD_UNLOCK(); return (true); } new_fdh = alloc_fib_dp_array(old_fdh->fdh_num_tables, false); FD_PRINTF(LOG_DEBUG, fd, "OLD FDH: %p NEW FDH: %p", old_fdh, new_fdh); if (new_fdh == NULL) { FIB_MOD_UNLOCK(); FD_PRINTF(LOG_WARNING, fd, "error attaching datapath"); return (false); } memcpy(&new_fdh->fdh_idx[0], &old_fdh->fdh_idx[0], old_fdh->fdh_num_tables * sizeof(struct fib_dp)); /* Update relevant data structure for @fd */ new_fdh->fdh_idx[fd->fd_fibnum] = *dp; /* Ensure memcpy() writes have completed */ atomic_thread_fence_rel(); /* Set new datapath pointer */ *pdp = &new_fdh->fdh_idx[0]; FIB_MOD_UNLOCK(); FD_PRINTF(LOG_DEBUG, fd, "update %p -> %p", old_fdh, new_fdh); fib_epoch_call(destroy_fdh_epoch, &old_fdh->fdh_epoch_ctx); return (true); } static struct fib_dp ** get_family_dp_ptr(int family) { switch (family) { #ifdef INET case AF_INET: return (&V_inet_dp); #endif #ifdef INET6 case AF_INET6: return (&V_inet6_dp); #endif } return (NULL); } /* * Make datapath use fib instance @fd */ bool fib_set_datapath_ptr(struct fib_data *fd, struct fib_dp *dp) { struct fib_dp **pdp; pdp = get_family_dp_ptr(fd->fd_family); return (replace_rtables_family(pdp, fd, dp)); } /* * Grow datapath pointers array. 
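 * Uses the same copy-then-publish sequence as replace_rtables_family() above:
 * allocate a new fib_dp_header, copy the existing entries, issue a release
 * fence, swap the per-family pointer and free the old header via an epoch
 * callback, so datapath readers never observe a partially updated array.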
* Called from sysctl handler on growing number of routing tables. */ static void grow_rtables_family(struct fib_dp **pdp, uint32_t new_num_tables) { struct fib_dp_header *new_fdh, *old_fdh = NULL; new_fdh = alloc_fib_dp_array(new_num_tables, true); FIB_MOD_LOCK(); if (*pdp != NULL) { old_fdh = get_fib_dp_header(*pdp); memcpy(&new_fdh->fdh_idx[0], &old_fdh->fdh_idx[0], old_fdh->fdh_num_tables * sizeof(struct fib_dp)); } /* Wait till all writes completed */ atomic_thread_fence_rel(); *pdp = &new_fdh->fdh_idx[0]; FIB_MOD_UNLOCK(); if (old_fdh != NULL) fib_epoch_call(destroy_fdh_epoch, &old_fdh->fdh_epoch_ctx); } /* * Grows per-AF arrays of datapath pointers for each supported family. * Called from fibs resize sysctl handler. */ void fib_grow_rtables(uint32_t new_num_tables) { #ifdef INET grow_rtables_family(get_family_dp_ptr(AF_INET), new_num_tables); #endif #ifdef INET6 grow_rtables_family(get_family_dp_ptr(AF_INET6), new_num_tables); #endif } void fib_get_rtable_info(struct rib_head *rh, struct rib_rtable_info *rinfo) { bzero(rinfo, sizeof(struct rib_rtable_info)); rinfo->num_prefixes = rh->rnh_prefixes; rinfo->num_nhops = nhops_get_count(rh); #ifdef ROUTE_MPATH rinfo->num_nhgrp = nhgrp_get_count(rh); #endif } /* * Updates pointer to the algo data for the @fd. */ void fib_set_algo_ptr(struct fib_data *fd, void *algo_data) { RIB_WLOCK_ASSERT(fd->fd_rh); fd->fd_algo_data = algo_data; } /* * Calls @callback with @ctx after the end of a current epoch. */ void fib_epoch_call(epoch_callback_t callback, epoch_context_t ctx) { - epoch_call(net_epoch_preempt, callback, ctx); + NET_EPOCH_CALL(callback, ctx); } /* * Accessor to get rib instance @fd is attached to. */ struct rib_head * fib_get_rh(struct fib_data *fd) { return (fd->fd_rh); } /* * Accessor to export idx->nhop array */ struct nhop_object ** fib_get_nhop_array(struct fib_data *fd) { return (fd->nh_idx); } static uint32_t get_nhop_idx(struct nhop_object *nh) { #ifdef ROUTE_MPATH if (NH_IS_NHGRP(nh)) return (nhgrp_get_idx((struct nhgrp_object *)nh)); else #endif return (nhop_get_idx(nh)); } uint32_t fib_get_nhop_idx(struct fib_data *fd, struct nhop_object *nh) { return (get_nhop_idx(nh)); } static bool is_idx_free(struct fib_data *fd, uint32_t index) { return (fd->nh_ref_table->refcnt[index] == 0); } static uint32_t fib_ref_nhop(struct fib_data *fd, struct nhop_object *nh) { uint32_t idx = get_nhop_idx(nh); if (idx >= fd->number_nhops) { fd->hit_nhops = 1; return (0); } if (is_idx_free(fd, idx)) { nhop_ref_any(nh); fd->nh_idx[idx] = nh; fd->nh_ref_table->count++; FD_PRINTF(LOG_DEBUG2, fd, " REF nhop %u %p", idx, fd->nh_idx[idx]); } fd->nh_ref_table->refcnt[idx]++; return (idx); } struct nhop_release_data { struct nhop_object *nh; struct epoch_context ctx; }; static void release_nhop_epoch(epoch_context_t ctx) { struct nhop_release_data *nrd; nrd = __containerof(ctx, struct nhop_release_data, ctx); nhop_free_any(nrd->nh); free(nrd, M_TEMP); } /* * Delays nexthop refcount release. * Datapath may have the datastructures not updated yet, so the old * nexthop may still be returned till the end of current epoch. Delay * refcount removal, as we may be removing the last instance, which will * trigger nexthop deletion, rendering returned nexthop invalid. 
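 *
 * To illustrate why this matters: an algo module typically stores only the
 * compact index obtained via fib_get_nhop_idx() and resolves it at lookup
 * time through the array exported by fib_get_nhop_array(), along the lines
 * of (hypothetical module code):
 *
 *	struct nhop_object **idx_map = fib_get_nhop_array(fd);
 *	...
 *	return (idx_map[result_idx]);	// datapath side, inside the net epoch
 *
 * so the nexthop behind an index has to stay valid until the epoch ends.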
*/ static void fib_schedule_release_nhop(struct fib_data *fd, struct nhop_object *nh) { struct nhop_release_data *nrd; nrd = malloc(sizeof(struct nhop_release_data), M_TEMP, M_NOWAIT | M_ZERO); if (nrd != NULL) { nrd->nh = nh; fib_epoch_call(release_nhop_epoch, &nrd->ctx); } else { /* * Unable to allocate memory. Leak nexthop to maintain guarantee * that each nhop can be referenced. */ FD_PRINTF(LOG_ERR, fd, "unable to schedule nhop %p deletion", nh); } } static void fib_unref_nhop(struct fib_data *fd, struct nhop_object *nh) { uint32_t idx = get_nhop_idx(nh); KASSERT((idx < fd->number_nhops), ("invalid nhop index")); KASSERT((nh == fd->nh_idx[idx]), ("index table contains whong nh")); fd->nh_ref_table->refcnt[idx]--; if (fd->nh_ref_table->refcnt[idx] == 0) { FD_PRINTF(LOG_DEBUG, fd, " FREE nhop %d %p", idx, fd->nh_idx[idx]); fib_schedule_release_nhop(fd, fd->nh_idx[idx]); } } static void set_algo_fixed(struct rib_head *rh) { switch (rh->rib_family) { #ifdef INET case AF_INET: V_algo_fixed_inet = true; break; #endif #ifdef INET6 case AF_INET6: V_algo_fixed_inet6 = true; break; #endif } } static bool is_algo_fixed(struct rib_head *rh) { switch (rh->rib_family) { #ifdef INET case AF_INET: return (V_algo_fixed_inet); #endif #ifdef INET6 case AF_INET6: return (V_algo_fixed_inet6); #endif } return (false); } /* * Runs the check on what would be the best algo for rib @rh, assuming * that the current algo is the one specified by @orig_flm. Note that * it can be NULL for initial selection. * * Returns referenced new algo or NULL if the current one is the best. */ static struct fib_lookup_module * fib_check_best_algo(struct rib_head *rh, struct fib_lookup_module *orig_flm) { uint8_t preference, curr_preference = 0, best_preference = 0; struct fib_lookup_module *flm, *best_flm = NULL; struct rib_rtable_info rinfo; int candidate_algos = 0; fib_get_rtable_info(rh, &rinfo); FIB_MOD_LOCK(); TAILQ_FOREACH(flm, &all_algo_list, entries) { if (flm->flm_family != rh->rib_family) continue; candidate_algos++; preference = flm->flm_get_pref(&rinfo); if (preference > best_preference) { if (!flm_error_check(flm, rh->rib_fibnum)) { best_preference = preference; best_flm = flm; } } if (flm == orig_flm) curr_preference = preference; } if ((best_flm != NULL) && (curr_preference + BEST_DIFF_PERCENT < best_preference)) best_flm->flm_refcount++; else best_flm = NULL; FIB_MOD_UNLOCK(); RH_PRINTF(LOG_DEBUG, rh, "candidate_algos: %d, curr: %s(%d) result: %s(%d)", candidate_algos, orig_flm ? orig_flm->flm_name : "NULL", curr_preference, best_flm ? best_flm->flm_name : (orig_flm ? orig_flm->flm_name : "NULL"), best_preference); return (best_flm); } /* * Called when new route table is created. * Selects, allocates and attaches fib algo for the table. 
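 *
 * Selection reuses fib_check_best_algo() above: each module reports its
 * preference for the table via flm_get_pref(), treated as a 0..255 scale,
 * and a candidate wins over the current algo only if it scores more than
 * BEST_DIFF_PERCENT = 5 * 256 / 100 = 12 points (roughly 5%) higher.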
*/ static bool fib_select_algo_initial(struct rib_head *rh, struct fib_dp *dp) { struct fib_lookup_module *flm; struct fib_data *fd = NULL; enum flm_op_result result; struct epoch_tracker et; flm = fib_check_best_algo(rh, NULL); if (flm == NULL) { RH_PRINTF(LOG_CRIT, rh, "no algo selected"); return (false); } RH_PRINTF(LOG_INFO, rh, "selected algo %s", flm->flm_name); NET_EPOCH_ENTER(et); RIB_WLOCK(rh); result = setup_fd_instance(flm, rh, NULL, &fd, false); RIB_WUNLOCK(rh); NET_EPOCH_EXIT(et); RH_PRINTF(LOG_DEBUG, rh, "result=%d fd=%p", result, fd); if (result == FLM_SUCCESS) *dp = fd->fd_dp; else RH_PRINTF(LOG_CRIT, rh, "unable to setup algo %s", flm->flm_name); fib_unref_algo(flm); return (result == FLM_SUCCESS); } /* * Sets up fib algo instances for the non-initialized RIBs in the @family. * Allocates temporary datapath index to amortize datapaint index updates * with large @num_tables. */ void fib_setup_family(int family, uint32_t num_tables) { struct fib_dp_header *new_fdh = alloc_fib_dp_array(num_tables, false); if (new_fdh == NULL) { ALGO_PRINTF(LOG_CRIT, "Unable to setup framework for %s", print_family(family)); return; } for (int i = 0; i < num_tables; i++) { struct rib_head *rh = rt_tables_get_rnh(i, family); if (rh->rib_algo_init) continue; if (!fib_select_algo_initial(rh, &new_fdh->fdh_idx[i])) continue; rh->rib_algo_init = true; } FIB_MOD_LOCK(); struct fib_dp **pdp = get_family_dp_ptr(family); struct fib_dp_header *old_fdh = get_fib_dp_header(*pdp); /* Update the items not touched by the new init, from the old data pointer */ for (int i = 0; i < num_tables; i++) { if (new_fdh->fdh_idx[i].f == dummy_lookup) new_fdh->fdh_idx[i] = old_fdh->fdh_idx[i]; } /* Ensure all index writes have completed */ atomic_thread_fence_rel(); /* Set new datapath pointer */ *pdp = &new_fdh->fdh_idx[0]; FIB_MOD_UNLOCK(); fib_epoch_call(destroy_fdh_epoch, &old_fdh->fdh_epoch_ctx); } /* * Registers fib lookup module within the subsystem. */ int fib_module_register(struct fib_lookup_module *flm) { FIB_MOD_LOCK(); ALGO_PRINTF(LOG_INFO, "attaching %s to %s", flm->flm_name, print_family(flm->flm_family)); TAILQ_INSERT_TAIL(&all_algo_list, flm, entries); FIB_MOD_UNLOCK(); return (0); } /* * Tries to unregister fib lookup module. * * Returns 0 on success, EBUSY if module is still used * by some of the tables. */ int fib_module_unregister(struct fib_lookup_module *flm) { FIB_MOD_LOCK(); if (flm->flm_refcount > 0) { FIB_MOD_UNLOCK(); return (EBUSY); } fib_error_clear_flm(flm); ALGO_PRINTF(LOG_INFO, "detaching %s from %s", flm->flm_name, print_family(flm->flm_family)); TAILQ_REMOVE(&all_algo_list, flm, entries); FIB_MOD_UNLOCK(); return (0); } void vnet_fib_init(void) { TAILQ_INIT(&V_fib_data_list); } void vnet_fib_destroy(void) { FIB_MOD_LOCK(); fib_error_clear(); FIB_MOD_UNLOCK(); } diff --git a/sys/net/route/nhgrp_ctl.c b/sys/net/route/nhgrp_ctl.c index 5d6fb219f877..b829b1125597 100644 --- a/sys/net/route/nhgrp_ctl.c +++ b/sys/net/route/nhgrp_ctl.c @@ -1,978 +1,977 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2020 Alexander V. Chernikov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include "opt_inet.h" #include "opt_route.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define DEBUG_MOD_NAME nhgrp_ctl #define DEBUG_MAX_LEVEL LOG_DEBUG #include _DECLARE_DEBUG(LOG_INFO); /* * This file contains the supporting functions for creating multipath groups * and compiling their dataplane parts. */ /* MPF_MULTIPATH must be the same as NHF_MULTIPATH for nhop selection to work */ _Static_assert(MPF_MULTIPATH == NHF_MULTIPATH, "MPF_MULTIPATH must be the same as NHF_MULTIPATH"); /* Offset and size of flags field has to be the same for nhop/nhop groups */ CHK_STRUCT_FIELD_GENERIC(struct nhop_object, nh_flags, struct nhgrp_object, nhg_flags); /* Cap multipath to 64, as the larger values would break rib_cmd_info bmasks */ CTASSERT(RIB_MAX_MPATH_WIDTH <= 64); static int wn_cmp_idx(const void *a, const void *b); static void sort_weightened_nhops(struct weightened_nhop *wn, int num_nhops); static struct nhgrp_priv *get_nhgrp(struct nh_control *ctl, struct weightened_nhop *wn, int num_nhops, uint32_t uidx, int *perror); static void destroy_nhgrp(struct nhgrp_priv *nhg_priv); static void destroy_nhgrp_epoch(epoch_context_t ctx); static void free_nhgrp_nhops(struct nhgrp_priv *nhg_priv); static int wn_cmp_idx(const void *a, const void *b) { const struct weightened_nhop *w_a = a; const struct weightened_nhop *w_b = b; uint32_t a_idx = w_a->nh->nh_priv->nh_idx; uint32_t b_idx = w_b->nh->nh_priv->nh_idx; if (a_idx < b_idx) return (-1); else if (a_idx > b_idx) return (1); else return (0); } /* * Perform in-place sorting for array of nexthops in @wn. * Sort by nexthop index ascending. */ static void sort_weightened_nhops(struct weightened_nhop *wn, int num_nhops) { qsort(wn, num_nhops, sizeof(struct weightened_nhop), wn_cmp_idx); } /* * In order to determine the minimum weight difference in the array * of weights, create a sorted array of weights, using spare "storage" * field in the `struct weightened_nhop`. * Assume weights to be (mostly) the same and use insertion sort to * make it sorted. 
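 *
 * Worked example: weights [20, 10, 30] are mirrored into .storage as the
 * sorted sequence [10, 20, 30]; calc_min_mpath_slots_fast() below then uses
 * the smallest non-zero step in the sequence 0, 10, 20, 30 (here 10) as the
 * weight unit, yielding (10 + 20 + 30) / 10 = 6 datapath slots.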
*/ static void sort_weightened_nhops_weights(struct weightened_nhop *wn, int num_items) { wn[0].storage = wn[0].weight; for (int i = 1, j = 0; i < num_items; i++) { uint32_t weight = wn[i].weight; // read from 'weight' as it's not reordered /* Move all weights > weight 1 position right */ for (j = i - 1; j >= 0 && wn[j].storage > weight; j--) wn[j + 1].storage = wn[j].storage; wn[j + 1].storage = weight; } } /* * Calculate minimum number of slots required to fit the existing * set of weights in the common use case where weights are "easily" * comparable. * Assumes @wn is sorted by weight ascending and each weight is > 0. * Returns number of slots or 0 if precise calculation failed. * * Some examples: * note: (i, X) pair means (nhop=i, weight=X): * (1, 1) (2, 2) -> 3 slots [1, 2, 2] * (1, 100), (2, 200) -> 3 slots [1, 2, 2] * (1, 100), (2, 200), (3, 400) -> 7 slots [1, 2, 2, 3, 3, 3] */ static uint32_t calc_min_mpath_slots_fast(struct weightened_nhop *wn, size_t num_items, uint64_t *ptotal) { uint32_t i, last, xmin; uint64_t total = 0; // Get sorted array of weights in .storage field sort_weightened_nhops_weights(wn, num_items); last = 0; xmin = wn[0].storage; for (i = 0; i < num_items; i++) { total += wn[i].storage; if ((wn[i].storage != last) && ((wn[i].storage - last < xmin) || xmin == 0)) { xmin = wn[i].storage - last; } last = wn[i].storage; } *ptotal = total; /* xmin is the minimum unit of desired capacity */ if ((total % xmin) != 0) return (0); for (i = 0; i < num_items; i++) { if ((wn[i].weight % xmin) != 0) return (0); } return ((uint32_t)(total / xmin)); } /* * Calculate minimum number of slots required to fit the existing * set of weights while maintaining weight coefficients. * * Assume @wn is sorted by weight ascending and each weight is > 0. * * Tries to find simple precise solution first and falls back to * RIB_MAX_MPATH_WIDTH in case of any failure. */ static uint32_t calc_min_mpath_slots(struct weightened_nhop *wn, size_t num_items) { uint32_t v; uint64_t total; v = calc_min_mpath_slots_fast(wn, num_items, &total); if (total == 0) return (0); if ((v == 0) || (v > RIB_MAX_MPATH_WIDTH)) v = RIB_MAX_MPATH_WIDTH; return (v); } /* * Nexthop group data consists of * 1) dataplane part, with nhgrp_object as a header followed by an * arbitrary number of nexthop pointers. * 2) control plane part, with nhgrp_priv as a header, followed by * an arbirtrary number of 'struct weightened_nhop' object. * * Given nexthop groups are (mostly) immutable, allocate all data * in one go. * */ __noinline static size_t get_nhgrp_alloc_size(uint32_t nhg_size, uint32_t num_nhops) { size_t sz; sz = sizeof(struct nhgrp_object); sz += nhg_size * sizeof(struct nhop_object *); sz += sizeof(struct nhgrp_priv); sz += num_nhops * sizeof(struct weightened_nhop); return (sz); } /* * Compile actual list of nexthops to be used by datapath from * the nexthop group @dst. 
* * For example, compiling control plane list of 2 nexthops * [(200, A), (100, B)] would result in the datapath array * [A, A, B] */ static void compile_nhgrp(struct nhgrp_priv *dst_priv, const struct weightened_nhop *x, uint32_t num_slots) { struct nhgrp_object *dst; int i, slot_idx, remaining_slots; uint64_t remaining_sum, nh_weight, nh_slots; slot_idx = 0; dst = dst_priv->nhg; /* Calculate sum of all weights */ remaining_sum = 0; for (i = 0; i < dst_priv->nhg_nh_count; i++) remaining_sum += x[i].weight; remaining_slots = num_slots; FIB_NH_LOG(LOG_DEBUG3, x[0].nh, "sum: %lu, slots: %d", remaining_sum, remaining_slots); for (i = 0; i < dst_priv->nhg_nh_count; i++) { /* Calculate number of slots for the current nexthop */ if (remaining_sum > 0) { nh_weight = (uint64_t)x[i].weight; nh_slots = (nh_weight * remaining_slots / remaining_sum); } else nh_slots = 0; remaining_sum -= x[i].weight; remaining_slots -= nh_slots; FIB_NH_LOG(LOG_DEBUG3, x[0].nh, " rem_sum: %lu, rem_slots: %d nh_slots: %d, slot_idx: %d", remaining_sum, remaining_slots, (int)nh_slots, slot_idx); KASSERT((slot_idx + nh_slots <= num_slots), ("index overflow during nhg compilation")); while (nh_slots-- > 0) dst->nhops[slot_idx++] = x[i].nh; } } /* * Allocates new nexthop group for the list of weightened nexthops. * Assume sorted list. * Does NOT reference any nexthops in the group. * Returns group with refcount=1 or NULL. */ static struct nhgrp_priv * alloc_nhgrp(struct weightened_nhop *wn, int num_nhops) { uint32_t nhgrp_size; struct nhgrp_object *nhg; struct nhgrp_priv *nhg_priv; nhgrp_size = calc_min_mpath_slots(wn, num_nhops); if (nhgrp_size == 0) { /* Zero weights, abort */ return (NULL); } size_t sz = get_nhgrp_alloc_size(nhgrp_size, num_nhops); nhg = malloc(sz, M_NHOP, M_NOWAIT | M_ZERO); if (nhg == NULL) { FIB_NH_LOG(LOG_INFO, wn[0].nh, "unable to allocate group with num_nhops %d (compiled %u)", num_nhops, nhgrp_size); return (NULL); } /* Has to be the first to make NHGRP_PRIV() work */ nhg->nhg_size = nhgrp_size; nhg->nhg_flags = MPF_MULTIPATH; nhg_priv = NHGRP_PRIV(nhg); nhg_priv->nhg_nh_count = num_nhops; refcount_init(&nhg_priv->nhg_refcount, 1); /* Please see nhgrp_free() comments on the initial value */ refcount_init(&nhg_priv->nhg_linked, 2); nhg_priv->nhg = nhg; memcpy(&nhg_priv->nhg_nh_weights[0], wn, num_nhops * sizeof(struct weightened_nhop)); FIB_NH_LOG(LOG_DEBUG, wn[0].nh, "num_nhops: %d, compiled_nhop: %u", num_nhops, nhgrp_size); compile_nhgrp(nhg_priv, wn, nhg->nhg_size); return (nhg_priv); } void nhgrp_ref_object(struct nhgrp_object *nhg) { struct nhgrp_priv *nhg_priv; u_int old __diagused; nhg_priv = NHGRP_PRIV(nhg); old = refcount_acquire(&nhg_priv->nhg_refcount); KASSERT(old > 0, ("%s: nhgrp object %p has 0 refs", __func__, nhg)); } void nhgrp_free(struct nhgrp_object *nhg) { struct nhgrp_priv *nhg_priv; struct nh_control *ctl; struct epoch_tracker et; nhg_priv = NHGRP_PRIV(nhg); if (!refcount_release(&nhg_priv->nhg_refcount)) return; /* * group objects don't have an explicit lock attached to it. * As groups are reclaimed based on reference count, it is possible * that some groups will persist after vnet destruction callback * called. Given that, handle scenario with nhgrp_free_group() being * called either after or simultaneously with nhgrp_ctl_unlink_all() * by using another reference counter: nhg_linked. * * There are only 2 places, where nhg_linked can be decreased: * rib destroy (nhgrp_ctl_unlink_all) and this function. * nhg_link can never be increased. 
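/*
 * Illustrative sketch (standalone userland C, not kernel code): the
 * proportional slot distribution performed by compile_nhgrp() above.  Each
 * nexthop receives weight * remaining_slots / remaining_sum slots, and both
 * remainders shrink as the walk progresses, so every slot is handed out.
 * Running it with weights {200, 100} over 3 slots reproduces the documented
 * [A, A, B] layout.
 */
#include <stdint.h>
#include <stdio.h>

static void
fill_slots(const uint32_t *weights, int num_nhops, int *dst, int num_slots)
{
	uint64_t remaining_sum = 0;
	int remaining_slots = num_slots, slot_idx = 0;

	for (int i = 0; i < num_nhops; i++)
		remaining_sum += weights[i];

	for (int i = 0; i < num_nhops; i++) {
		uint64_t nh_slots = 0;

		if (remaining_sum > 0)
			nh_slots = (uint64_t)weights[i] * remaining_slots /
			    remaining_sum;
		remaining_sum -= weights[i];
		remaining_slots -= nh_slots;
		while (nh_slots-- > 0)
			dst[slot_idx++] = i;	/* nexthop #i owns this slot */
	}
}

int
main(void)
{
	uint32_t weights[] = { 200, 100 };
	int slots[3];

	fill_slots(weights, 2, slots, 3);
	printf("%d %d %d\n", slots[0], slots[1], slots[2]);	/* 0 0 1 */
	return (0);
}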
* * Hence, use initial value of 2 to make use of * refcount_release_if_not_last(). * * There can be two scenarious when calling this function: * * 1) nhg_linked value is 2. This means that either * nhgrp_ctl_unlink_all() has not been called OR it is running, * but we are guaranteed that nh_control won't be freed in * this epoch. Hence, nexthop can be safely unlinked. * * 2) nh_linked value is 1. In that case, nhgrp_ctl_unlink_all() * has been called and nhgrp unlink can be skipped. */ NET_EPOCH_ENTER(et); if (refcount_release_if_not_last(&nhg_priv->nhg_linked)) { ctl = nhg_priv->nh_control; if (unlink_nhgrp(ctl, nhg_priv) == NULL) { /* Do not try to reclaim */ RT_LOG(LOG_INFO, "Failed to unlink nexhop group %p", nhg_priv); NET_EPOCH_EXIT(et); return; } } NET_EPOCH_EXIT(et); KASSERT((nhg_priv->nhg_idx == 0), ("gr_idx != 0")); - epoch_call(net_epoch_preempt, destroy_nhgrp_epoch, - &nhg_priv->nhg_epoch_ctx); + NET_EPOCH_CALL(destroy_nhgrp_epoch, &nhg_priv->nhg_epoch_ctx); } /* * Destroys all local resources belonging to @nhg_priv. */ __noinline static void destroy_nhgrp_int(struct nhgrp_priv *nhg_priv) { free(nhg_priv->nhg, M_NHOP); } __noinline static void destroy_nhgrp(struct nhgrp_priv *nhg_priv) { KASSERT((nhg_priv->nhg_refcount == 0), ("nhg_refcount != 0")); KASSERT((nhg_priv->nhg_idx == 0), ("gr_idx != 0")); IF_DEBUG_LEVEL(LOG_DEBUG2) { char nhgbuf[NHOP_PRINT_BUFSIZE] __unused; FIB_NH_LOG(LOG_DEBUG2, nhg_priv->nhg_nh_weights[0].nh, "destroying %s", nhgrp_print_buf(nhg_priv->nhg, nhgbuf, sizeof(nhgbuf))); } free_nhgrp_nhops(nhg_priv); destroy_nhgrp_int(nhg_priv); } /* * Epoch callback indicating group is safe to destroy */ static void destroy_nhgrp_epoch(epoch_context_t ctx) { struct nhgrp_priv *nhg_priv; nhg_priv = __containerof(ctx, struct nhgrp_priv, nhg_epoch_ctx); destroy_nhgrp(nhg_priv); } static bool ref_nhgrp_nhops(struct nhgrp_priv *nhg_priv) { for (int i = 0; i < nhg_priv->nhg_nh_count; i++) { if (nhop_try_ref_object(nhg_priv->nhg_nh_weights[i].nh) != 0) continue; /* * Failed to ref the nexthop, b/c it's deleted. * Need to rollback references back. */ for (int j = 0; j < i; j++) nhop_free(nhg_priv->nhg_nh_weights[j].nh); return (false); } return (true); } static void free_nhgrp_nhops(struct nhgrp_priv *nhg_priv) { for (int i = 0; i < nhg_priv->nhg_nh_count; i++) nhop_free(nhg_priv->nhg_nh_weights[i].nh); } /* * Allocate nexthop group of size @num_nhops with nexthops specified by * @wn. Nexthops have to be unique and match the fibnum/family of the group. * Returns unlinked nhgrp object on success or NULL and non-zero perror. */ struct nhgrp_object * nhgrp_alloc(uint32_t fibnum, int family, struct weightened_nhop *wn, int num_nhops, int *perror) { struct rib_head *rh = rt_tables_get_rnh(fibnum, family); struct nhgrp_priv *nhg_priv; struct nh_control *ctl; if (rh == NULL) { *perror = E2BIG; return (NULL); } ctl = rh->nh_control; if (num_nhops > RIB_MAX_MPATH_WIDTH) { *perror = E2BIG; return (NULL); } if (ctl->gr_head.hash_size == 0) { /* First multipath request. Bootstrap mpath datastructures. 
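/*
 * Illustrative sketch (C11 atomics, not the kernel refcount(9) API): an
 * approximation of refcount_release_if_not_last() showing why nhg_linked
 * starts at 2.  The free path and the rib-teardown path each consume one
 * unit; whichever side finds the counter already at 1 knows the other side
 * owns (or already performed) the unlink and simply skips it, which is the
 * pair of scenarios described above.
 */
#include <stdatomic.h>
#include <stdbool.h>

static bool
release_if_not_last(atomic_uint *cnt)
{
	unsigned int old = atomic_load_explicit(cnt, memory_order_acquire);

	for (;;) {
		if (old == 1)
			return (false);	/* the other side got here first */
		if (atomic_compare_exchange_weak_explicit(cnt, &old, old - 1,
		    memory_order_acq_rel, memory_order_acquire))
			return (true);	/* we own the unlink */
		/* CAS failure reloaded 'old'; retry */
	}
}

/*
 * Usage shape: the counter is initialized to 2 at link time; the object free
 * path unlinks only when release_if_not_last() returns true, while the table
 * teardown does a plain decrement for every object it unlinks itself.
 */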
*/ if (nhgrp_ctl_alloc_default(ctl, M_NOWAIT) == 0) { *perror = ENOMEM; return (NULL); } } /* Sort nexthops & check there are no duplicates */ sort_weightened_nhops(wn, num_nhops); uint32_t last_id = 0; for (int i = 0; i < num_nhops; i++) { if (wn[i].nh->nh_priv->nh_control != ctl) { *perror = EINVAL; return (NULL); } if (wn[i].nh->nh_priv->nh_idx == last_id) { *perror = EEXIST; return (NULL); } last_id = wn[i].nh->nh_priv->nh_idx; } if ((nhg_priv = alloc_nhgrp(wn, num_nhops)) == NULL) { *perror = ENOMEM; return (NULL); } nhg_priv->nh_control = ctl; *perror = 0; return (nhg_priv->nhg); } /* * Finds an existing group matching @nhg or links @nhg to the tree. * Returns the referenced group or NULL and non-zero @perror. */ struct nhgrp_object * nhgrp_get_nhgrp(struct nhgrp_object *nhg, int *perror) { struct nhgrp_priv *nhg_priv, *key = NHGRP_PRIV(nhg); struct nh_control *ctl = key->nh_control; nhg_priv = find_nhgrp(ctl, key); if (nhg_priv != NULL) { /* * Free originally-created group. As it hasn't been linked * and the dependent nexhops haven't been referenced, just free * the group. */ destroy_nhgrp_int(key); *perror = 0; return (nhg_priv->nhg); } else { /* No existing group, try to link the new one */ if (!ref_nhgrp_nhops(key)) { /* * Some of the nexthops have been scheduled for deletion. * As the group hasn't been linked / no nexhops have been * referenced, call the final destructor immediately. */ destroy_nhgrp_int(key); *perror = EAGAIN; return (NULL); } if (link_nhgrp(ctl, key) == 0) { /* Unable to allocate index? */ *perror = EAGAIN; free_nhgrp_nhops(key); destroy_nhgrp_int(key); return (NULL); } *perror = 0; return (nhg); } /* NOTREACHED */ } /* * Creates or looks up an existing nexthop group based on @wn and @num_nhops. * * Returns referenced nhop group or NULL, passing error code in @perror. */ struct nhgrp_priv * get_nhgrp(struct nh_control *ctl, struct weightened_nhop *wn, int num_nhops, uint32_t uidx, int *perror) { struct nhgrp_object *nhg; nhg = nhgrp_alloc(ctl->ctl_rh->rib_fibnum, ctl->ctl_rh->rib_family, wn, num_nhops, perror); if (nhg == NULL) return (NULL); nhgrp_set_uidx(nhg, uidx); nhg = nhgrp_get_nhgrp(nhg, perror); if (nhg != NULL) return (NHGRP_PRIV(nhg)); return (NULL); } /* * Appends one or more nexthops denoted by @wm to the nexthop group @gr_orig. * * Returns referenced nexthop group or NULL. In the latter case, @perror is * filled with an error code. * Note that function does NOT care if the next nexthops already exists * in the @gr_orig. As a result, they will be added, resulting in the * same nexthop being present multiple times in the new group. 
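/*
 * Illustrative sketch (standalone userland C, not kernel code): the
 * duplicate check used right above.  Because the array has just been sorted
 * by nexthop index and a linked nexthop never has index 0, comparing each
 * index against its predecessor is enough to reject duplicates.
 */
#include <stdbool.h>
#include <stdint.h>

static bool
has_duplicate_idx(const uint32_t *sorted_idx, int num)
{
	uint32_t last_id = 0;

	for (int i = 0; i < num; i++) {
		if (sorted_idx[i] == last_id)
			return (true);	/* kernel code returns EEXIST here */
		last_id = sorted_idx[i];
	}
	return (false);
}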
*/ static struct nhgrp_priv * append_nhops(struct nh_control *ctl, const struct nhgrp_object *gr_orig, struct weightened_nhop *wn, int num_nhops, int *perror) { char storage[64]; struct weightened_nhop *pnhops; struct nhgrp_priv *nhg_priv; const struct nhgrp_priv *src_priv; size_t sz; int curr_nhops; src_priv = NHGRP_PRIV_CONST(gr_orig); curr_nhops = src_priv->nhg_nh_count; *perror = 0; sz = (src_priv->nhg_nh_count + num_nhops) * (sizeof(struct weightened_nhop)); /* optimize for <= 4 paths, each path=16 bytes */ if (sz <= sizeof(storage)) pnhops = (struct weightened_nhop *)&storage[0]; else { pnhops = malloc(sz, M_TEMP, M_NOWAIT); if (pnhops == NULL) { *perror = ENOMEM; return (NULL); } } /* Copy nhops from original group first */ memcpy(pnhops, src_priv->nhg_nh_weights, curr_nhops * sizeof(struct weightened_nhop)); memcpy(&pnhops[curr_nhops], wn, num_nhops * sizeof(struct weightened_nhop)); curr_nhops += num_nhops; nhg_priv = get_nhgrp(ctl, pnhops, curr_nhops, 0, perror); if (pnhops != (struct weightened_nhop *)&storage[0]) free(pnhops, M_TEMP); if (nhg_priv == NULL) return (NULL); return (nhg_priv); } /* * Creates/finds nexthop group based on @wn and @num_nhops. * Returns 0 on success with referenced group in @rnd, or * errno. * * If the error is EAGAIN, then the operation can be retried. */ int nhgrp_get_group(struct rib_head *rh, struct weightened_nhop *wn, int num_nhops, uint32_t uidx, struct nhgrp_object **pnhg) { struct nh_control *ctl = rh->nh_control; struct nhgrp_priv *nhg_priv; int error; nhg_priv = get_nhgrp(ctl, wn, num_nhops, uidx, &error); if (nhg_priv != NULL) *pnhg = nhg_priv->nhg; return (error); } /* * Creates new nexthop group based on @src group without the nexthops * chosen by @flt_func. * Returns 0 on success, storring the reference nhop group/object in @rnd. */ int nhgrp_get_filtered_group(struct rib_head *rh, const struct rtentry *rt, const struct nhgrp_object *src, rib_filter_f_t flt_func, void *flt_data, struct route_nhop_data *rnd) { char storage[64]; struct nh_control *ctl = rh->nh_control; struct weightened_nhop *pnhops; const struct nhgrp_priv *mp_priv, *src_priv; size_t sz; int error, i, num_nhops; src_priv = NHGRP_PRIV_CONST(src); sz = src_priv->nhg_nh_count * (sizeof(struct weightened_nhop)); /* optimize for <= 4 paths, each path=16 bytes */ if (sz <= sizeof(storage)) pnhops = (struct weightened_nhop *)&storage[0]; else { if ((pnhops = malloc(sz, M_TEMP, M_NOWAIT)) == NULL) return (ENOMEM); } /* Filter nexthops */ error = 0; num_nhops = 0; for (i = 0; i < src_priv->nhg_nh_count; i++) { if (flt_func(rt, src_priv->nhg_nh_weights[i].nh, flt_data)) continue; memcpy(&pnhops[num_nhops++], &src_priv->nhg_nh_weights[i], sizeof(struct weightened_nhop)); } if (num_nhops == 0) { rnd->rnd_nhgrp = NULL; rnd->rnd_weight = 0; } else if (num_nhops == 1) { rnd->rnd_nhop = pnhops[0].nh; rnd->rnd_weight = pnhops[0].weight; if (nhop_try_ref_object(rnd->rnd_nhop) == 0) error = EAGAIN; } else { mp_priv = get_nhgrp(ctl, pnhops, num_nhops, 0, &error); if (mp_priv != NULL) rnd->rnd_nhgrp = mp_priv->nhg; rnd->rnd_weight = 0; } if (pnhops != (struct weightened_nhop *)&storage[0]) free(pnhops, M_TEMP); return (error); } /* * Creates new multipath group based on existing group/nhop in @rnd_orig and * to-be-added nhop @wn_add. * Returns 0 on success and stores result in @rnd_new. 
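/*
 * Illustrative sketch (standalone userland C, not kernel code): the
 * scratch-buffer idiom shared by append_nhops() and
 * nhgrp_get_filtered_group() above.  A 64-byte on-stack array covers the
 * common case of at most 4 paths at 16 bytes each, the heap is used only
 * for larger groups, and the free is conditional on the heap path having
 * been taken.
 */
#include <stdlib.h>
#include <string.h>

static int
with_scratch(size_t nitems, size_t item_sz)
{
	char storage[64];
	void *p;

	if (nitems * item_sz <= sizeof(storage))
		p = storage;		/* fast path, no allocation */
	else {
		p = malloc(nitems * item_sz);
		if (p == NULL)
			return (-1);	/* ENOMEM in the kernel version */
	}

	memset(p, 0, nitems * item_sz);	/* ...build the nexthop array here... */

	if (p != (void *)storage)
		free(p);
	return (0);
}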
*/ int nhgrp_get_addition_group(struct rib_head *rh, struct route_nhop_data *rnd_orig, struct route_nhop_data *rnd_add, struct route_nhop_data *rnd_new) { struct nh_control *ctl = rh->nh_control; struct nhgrp_priv *nhg_priv; struct weightened_nhop wn[2] = {}; int error; if (rnd_orig->rnd_nhop == NULL) { /* No paths to add to, just reference current nhop */ *rnd_new = *rnd_add; if (nhop_try_ref_object(rnd_new->rnd_nhop) == 0) return (EAGAIN); return (0); } wn[0].nh = rnd_add->rnd_nhop; wn[0].weight = rnd_add->rnd_weight; if (!NH_IS_NHGRP(rnd_orig->rnd_nhop)) { /* Simple merge of 2 non-multipath nexthops */ wn[1].nh = rnd_orig->rnd_nhop; wn[1].weight = rnd_orig->rnd_weight; nhg_priv = get_nhgrp(ctl, wn, 2, 0, &error); } else { /* Get new nhop group with @rt->rt_nhop as an additional nhop */ nhg_priv = append_nhops(ctl, rnd_orig->rnd_nhgrp, &wn[0], 1, &error); } if (nhg_priv == NULL) return (error); rnd_new->rnd_nhgrp = nhg_priv->nhg; rnd_new->rnd_weight = 0; return (0); } /* * Returns pointer to array of nexthops with weights for * given @nhg. Stores number of items in the array into @pnum_nhops. */ const struct weightened_nhop * nhgrp_get_nhops(const struct nhgrp_object *nhg, uint32_t *pnum_nhops) { const struct nhgrp_priv *nhg_priv; KASSERT(((nhg->nhg_flags & MPF_MULTIPATH) != 0), ("nhop is not mpath")); nhg_priv = NHGRP_PRIV_CONST(nhg); *pnum_nhops = nhg_priv->nhg_nh_count; return (nhg_priv->nhg_nh_weights); } void nhgrp_set_uidx(struct nhgrp_object *nhg, uint32_t uidx) { struct nhgrp_priv *nhg_priv; KASSERT(((nhg->nhg_flags & MPF_MULTIPATH) != 0), ("nhop is not mpath")); nhg_priv = NHGRP_PRIV(nhg); nhg_priv->nhg_uidx = uidx; } uint32_t nhgrp_get_uidx(const struct nhgrp_object *nhg) { const struct nhgrp_priv *nhg_priv; KASSERT(((nhg->nhg_flags & MPF_MULTIPATH) != 0), ("nhop is not mpath")); nhg_priv = NHGRP_PRIV_CONST(nhg); return (nhg_priv->nhg_uidx); } /* * Prints nexhop group @nhg data in the provided @buf. * Example: nhg#33/sz=3:[#1:100,#2:100,#3:100] * Example: nhg#33/sz=5:[#1:100,#2:100,..] 
*/ char * nhgrp_print_buf(const struct nhgrp_object *nhg, char *buf, size_t bufsize) { const struct nhgrp_priv *nhg_priv = NHGRP_PRIV_CONST(nhg); int off = snprintf(buf, bufsize, "nhg#%u/sz=%u:[", nhg_priv->nhg_idx, nhg_priv->nhg_nh_count); for (int i = 0; i < nhg_priv->nhg_nh_count; i++) { const struct weightened_nhop *wn = &nhg_priv->nhg_nh_weights[i]; int len = snprintf(&buf[off], bufsize - off, "#%u:%u,", wn->nh->nh_priv->nh_idx, wn->weight); if (len + off + 3 >= bufsize) { int len = snprintf(&buf[off], bufsize - off, "..."); off += len; break; } off += len; } if (off > 0) off--; // remove last "," if (off + 1 < bufsize) snprintf(&buf[off], bufsize - off, "]"); return buf; } __noinline static int dump_nhgrp_entry(struct rib_head *rh, const struct nhgrp_priv *nhg_priv, char *buffer, size_t buffer_size, struct sysctl_req *w) { struct rt_msghdr *rtm; struct nhgrp_external *nhge; struct nhgrp_container *nhgc; const struct nhgrp_object *nhg; struct nhgrp_nhop_external *ext; int error; size_t sz; nhg = nhg_priv->nhg; sz = sizeof(struct rt_msghdr) + sizeof(struct nhgrp_external); /* controlplane nexthops */ sz += sizeof(struct nhgrp_container); sz += sizeof(struct nhgrp_nhop_external) * nhg_priv->nhg_nh_count; /* dataplane nexthops */ sz += sizeof(struct nhgrp_container); sz += sizeof(struct nhgrp_nhop_external) * nhg->nhg_size; KASSERT(sz <= buffer_size, ("increase nhgrp buffer size")); bzero(buffer, sz); rtm = (struct rt_msghdr *)buffer; rtm->rtm_msglen = sz; rtm->rtm_version = RTM_VERSION; rtm->rtm_type = RTM_GET; nhge = (struct nhgrp_external *)(rtm + 1); nhge->nhg_idx = nhg_priv->nhg_idx; nhge->nhg_refcount = nhg_priv->nhg_refcount; /* fill in control plane nexthops firs */ nhgc = (struct nhgrp_container *)(nhge + 1); nhgc->nhgc_type = NHG_C_TYPE_CNHOPS; nhgc->nhgc_subtype = 0; nhgc->nhgc_len = sizeof(struct nhgrp_container); nhgc->nhgc_len += sizeof(struct nhgrp_nhop_external) * nhg_priv->nhg_nh_count; nhgc->nhgc_count = nhg_priv->nhg_nh_count; ext = (struct nhgrp_nhop_external *)(nhgc + 1); for (int i = 0; i < nhg_priv->nhg_nh_count; i++) { ext[i].nh_idx = nhg_priv->nhg_nh_weights[i].nh->nh_priv->nh_idx; ext[i].nh_weight = nhg_priv->nhg_nh_weights[i].weight; } /* fill in dataplane nexthops */ nhgc = (struct nhgrp_container *)(&ext[nhg_priv->nhg_nh_count]); nhgc->nhgc_type = NHG_C_TYPE_DNHOPS; nhgc->nhgc_subtype = 0; nhgc->nhgc_len = sizeof(struct nhgrp_container); nhgc->nhgc_len += sizeof(struct nhgrp_nhop_external) * nhg->nhg_size; nhgc->nhgc_count = nhg->nhg_size; ext = (struct nhgrp_nhop_external *)(nhgc + 1); for (int i = 0; i < nhg->nhg_size; i++) { ext[i].nh_idx = nhg->nhops[i]->nh_priv->nh_idx; ext[i].nh_weight = 0; } error = SYSCTL_OUT(w, buffer, sz); return (error); } uint32_t nhgrp_get_idx(const struct nhgrp_object *nhg) { const struct nhgrp_priv *nhg_priv; nhg_priv = NHGRP_PRIV_CONST(nhg); return (nhg_priv->nhg_idx); } uint8_t nhgrp_get_origin(const struct nhgrp_object *nhg) { return (NHGRP_PRIV_CONST(nhg)->nhg_origin); } void nhgrp_set_origin(struct nhgrp_object *nhg, uint8_t origin) { NHGRP_PRIV(nhg)->nhg_origin = origin; } uint32_t nhgrp_get_count(struct rib_head *rh) { struct nh_control *ctl; uint32_t count; ctl = rh->nh_control; NHOPS_RLOCK(ctl); count = ctl->gr_head.items_count; NHOPS_RUNLOCK(ctl); return (count); } int nhgrp_dump_sysctl(struct rib_head *rh, struct sysctl_req *w) { struct nh_control *ctl = rh->nh_control; struct epoch_tracker et; struct nhgrp_priv *nhg_priv; char *buffer; size_t sz; int error = 0; if (ctl->gr_head.items_count == 0) return (0); /* 
Calculate the maximum nhop group size in bytes */ sz = sizeof(struct rt_msghdr) + sizeof(struct nhgrp_external); sz += 2 * sizeof(struct nhgrp_container); sz += 2 * sizeof(struct nhgrp_nhop_external) * RIB_MAX_MPATH_WIDTH; buffer = malloc(sz, M_TEMP, M_NOWAIT); if (buffer == NULL) return (ENOMEM); NET_EPOCH_ENTER(et); NHOPS_RLOCK(ctl); CHT_SLIST_FOREACH(&ctl->gr_head, mpath, nhg_priv) { error = dump_nhgrp_entry(rh, nhg_priv, buffer, sz, w); if (error != 0) break; } CHT_SLIST_FOREACH_END; NHOPS_RUNLOCK(ctl); NET_EPOCH_EXIT(et); free(buffer, M_TEMP); return (error); } diff --git a/sys/net/route/nhop.c b/sys/net/route/nhop.c index 9fac76105598..07bced3e8b11 100644 --- a/sys/net/route/nhop.c +++ b/sys/net/route/nhop.c @@ -1,395 +1,394 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2020 Alexander V. Chernikov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_route.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define DEBUG_MOD_NAME nhop #define DEBUG_MAX_LEVEL LOG_DEBUG #include _DECLARE_DEBUG(LOG_INFO); /* * This file contains data structures management logic for the nexthop ("nhop") * route subsystem. * * Nexthops in the original sense are the objects containing all the necessary * information to forward the packet to the selected destination. * In particular, nexthop is defined by a combination of * ifp, ifa, aifp, mtu, gw addr(if set), nh_type, nh_family, mask of rt_flags and * NHF_DEFAULT * * All nexthops are stored in the resizable hash table. * Additionally, each nexthop gets assigned its unique index (nexthop index) * so userland programs can interact with the nexthops easier. Index allocation * is backed by the bitmask array. */ MALLOC_DEFINE(M_NHOP, "nhops", "nexthops data"); /* Hash management functions */ int nhops_init_rib(struct rib_head *rh) { struct nh_control *ctl; size_t alloc_size; uint32_t num_buckets, num_items; void *ptr; ctl = malloc(sizeof(struct nh_control), M_NHOP, M_WAITOK | M_ZERO); /* * Allocate nexthop hash. Start with 16 items by default (128 bytes). * This will be enough for most of the cases. 
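/*
 * Illustrative sketch (standalone userland C, not the kernel bitmask API):
 * the idea behind the index bitmask mentioned above.  Every nexthop gets a
 * small integer index allocated from a bitmap; index 0 doubles as the
 * "not allocated" / failure value, matching link_nhop() returning 0 on
 * failure.  The real bitmask_alloc_idx()/bitmask_free_idx() helpers have
 * different signatures and support resizing; this is only the core
 * allocate/free logic.
 */
#include <stdint.h>

#define	IDX_MAX	1024
static uint32_t idx_mask[IDX_MAX / 32];

static uint16_t
idx_alloc(void)
{
	for (uint32_t i = 1; i < IDX_MAX; i++) {
		if ((idx_mask[i / 32] & (1u << (i % 32))) == 0) {
			idx_mask[i / 32] |= 1u << (i % 32);
			return ((uint16_t)i);
		}
	}
	return (0);			/* exhausted; 0 means failure */
}

static int
idx_free(uint16_t idx)
{
	if (idx == 0 || idx >= IDX_MAX ||
	    (idx_mask[idx / 32] & (1u << (idx % 32))) == 0)
		return (-1);		/* not currently allocated */
	idx_mask[idx / 32] &= ~(1u << (idx % 32));
	return (0);
}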
*/ num_buckets = 16; alloc_size = CHT_SLIST_GET_RESIZE_SIZE(num_buckets); ptr = malloc(alloc_size, M_NHOP, M_WAITOK | M_ZERO); CHT_SLIST_INIT(&ctl->nh_head, ptr, num_buckets); /* * Allocate nexthop index bitmask. */ num_items = 128 * 8; /* 128 bytes */ ptr = malloc(bitmask_get_size(num_items), M_NHOP, M_WAITOK | M_ZERO); bitmask_init(&ctl->nh_idx_head, ptr, num_items); NHOPS_LOCK_INIT(ctl); rh->nh_control = ctl; ctl->ctl_rh = rh; FIB_CTL_LOG(LOG_DEBUG2, ctl, "nhops init: ctl %p rh %p", ctl, rh); return (0); } static void destroy_ctl(struct nh_control *ctl) { NHOPS_LOCK_DESTROY(ctl); free(ctl->nh_head.ptr, M_NHOP); free(ctl->nh_idx_head.idx, M_NHOP); #ifdef ROUTE_MPATH nhgrp_ctl_free(ctl); #endif free(ctl, M_NHOP); } /* * Epoch callback indicating ctl is safe to destroy */ static void destroy_ctl_epoch(epoch_context_t ctx) { struct nh_control *ctl; ctl = __containerof(ctx, struct nh_control, ctl_epoch_ctx); destroy_ctl(ctl); } void nhops_destroy_rib(struct rib_head *rh) { struct nh_control *ctl; struct nhop_priv *nh_priv; ctl = rh->nh_control; /* * All routes should have been deleted in rt_table_destroy(). * However, TCP stack or other consumers may store referenced * nexthop pointers. When these references go to zero, * nhop_free() will try to unlink these records from the * datastructures, most likely leading to panic. * * Avoid that by explicitly marking all of the remaining * nexthops as unlinked by removing a reference from a special * counter. Please see nhop_free() comments for more * details. */ NHOPS_WLOCK(ctl); CHT_SLIST_FOREACH(&ctl->nh_head, nhops, nh_priv) { FIB_RH_LOG(LOG_DEBUG3, rh, "marking nhop %u unlinked", nh_priv->nh_idx); refcount_release(&nh_priv->nh_linked); } CHT_SLIST_FOREACH_END; #ifdef ROUTE_MPATH nhgrp_ctl_unlink_all(ctl); #endif NHOPS_WUNLOCK(ctl); /* * Postpone destruction till the end of current epoch * so nhop_free() can safely use nh_control pointer. */ - epoch_call(net_epoch_preempt, destroy_ctl_epoch, - &ctl->ctl_epoch_ctx); + NET_EPOCH_CALL(destroy_ctl_epoch, &ctl->ctl_epoch_ctx); } /* * Nexhop hash calculation: * * Nexthops distribution: * 2 "mandatory" nexthops per interface ("interface route", "loopback"). * For direct peering: 1 nexthop for the peering router per ifp/af. * For Ix-like peering: tens to hundreds nexthops of neghbors per ifp/af. * IGP control plane & broadcast segment: tens of nexthops per ifp/af. * * Each fib/af combination has its own hash table. * With that in mind, hash nexthops by the combination of the interface * and GW IP address. * * To optimize hash calculation, ignore lower bits of ifnet pointer, * as they give very little entropy. * Similarly, use lower 4 bytes of IPv6 address to distinguish between the * neighbors. */ struct _hash_data { uint16_t ifentropy; uint8_t family; uint8_t nh_type; uint32_t gw_addr; }; static unsigned djb_hash(const unsigned char *h, const int len) { unsigned int result = 0; int i; for (i = 0; i < len; i++) result = 33 * result ^ h[i]; return (result); } static uint32_t hash_priv(const struct nhop_priv *priv) { struct nhop_object *nh = priv->nh; struct _hash_data key = { .ifentropy = (uint16_t)((((uintptr_t)nh->nh_ifp) >> 6) & 0xFFFF), .family = nh->gw_sa.sa_family, .nh_type = priv->nh_type & 0xFF, .gw_addr = (nh->gw_sa.sa_family == AF_INET6) ? 
nh->gw6_sa.sin6_addr.s6_addr32[3] : nh->gw4_sa.sin_addr.s_addr }; return (uint32_t)(djb_hash((const unsigned char *)&key, sizeof(key))); } /* * Checks if hash needs resizing and performs this resize if necessary * */ static void consider_resize(struct nh_control *ctl, uint32_t new_nh_buckets, uint32_t new_idx_items) { void *nh_ptr, *nh_idx_ptr; void *old_idx_ptr; size_t alloc_size; nh_ptr = NULL; if (new_nh_buckets != 0) { alloc_size = CHT_SLIST_GET_RESIZE_SIZE(new_nh_buckets); nh_ptr = malloc(alloc_size, M_NHOP, M_NOWAIT | M_ZERO); } nh_idx_ptr = NULL; if (new_idx_items != 0) { alloc_size = bitmask_get_size(new_idx_items); nh_idx_ptr = malloc(alloc_size, M_NHOP, M_NOWAIT | M_ZERO); } if (nh_ptr == NULL && nh_idx_ptr == NULL) { /* Either resize is not required or allocations have failed. */ return; } FIB_CTL_LOG(LOG_DEBUG, ctl, "going to resize: nh:[ptr:%p sz:%u] idx:[ptr:%p sz:%u]", nh_ptr, new_nh_buckets, nh_idx_ptr, new_idx_items); old_idx_ptr = NULL; NHOPS_WLOCK(ctl); if (nh_ptr != NULL) { CHT_SLIST_RESIZE(&ctl->nh_head, nhops, nh_ptr, new_nh_buckets); } if (nh_idx_ptr != NULL) { if (bitmask_copy(&ctl->nh_idx_head, nh_idx_ptr, new_idx_items) == 0) bitmask_swap(&ctl->nh_idx_head, nh_idx_ptr, new_idx_items, &old_idx_ptr); } NHOPS_WUNLOCK(ctl); if (nh_ptr != NULL) free(nh_ptr, M_NHOP); if (old_idx_ptr != NULL) free(old_idx_ptr, M_NHOP); } /* * Links nextop @nh_priv to the nexhop hash table and allocates * nexhop index. * Returns allocated index or 0 on failure. */ int link_nhop(struct nh_control *ctl, struct nhop_priv *nh_priv) { uint16_t idx; uint32_t num_buckets_new, num_items_new; KASSERT((nh_priv->nh_idx == 0), ("nhop index is already allocated")); NHOPS_WLOCK(ctl); /* * Check if we need to resize hash and index. * The following 2 functions returns either new size or 0 * if resize is not required. */ num_buckets_new = CHT_SLIST_GET_RESIZE_BUCKETS(&ctl->nh_head); num_items_new = bitmask_get_resize_items(&ctl->nh_idx_head); if (bitmask_alloc_idx(&ctl->nh_idx_head, &idx) != 0) { NHOPS_WUNLOCK(ctl); FIB_CTL_LOG(LOG_INFO, ctl, "Unable to allocate nhop index"); RTSTAT_INC(rts_nh_idx_alloc_failure); consider_resize(ctl, num_buckets_new, num_items_new); return (0); } nh_priv->nh_idx = idx; nh_priv->nh_control = ctl; nh_priv->nh_finalized = 1; CHT_SLIST_INSERT_HEAD(&ctl->nh_head, nhops, nh_priv); NHOPS_WUNLOCK(ctl); FIB_RH_LOG(LOG_DEBUG2, ctl->ctl_rh, "Linked nhop priv %p to %d, hash %u, ctl %p", nh_priv, idx, hash_priv(nh_priv), ctl); consider_resize(ctl, num_buckets_new, num_items_new); return (idx); } /* * Unlinks nexthop specified by @nh_priv data from the hash. * * Returns found nexthop or NULL. 
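/*
 * Illustrative sketch (standalone userland C with pthreads, not kernel
 * code): the resize discipline used by consider_resize() above.  The
 * replacement storage is allocated (and may fail harmlessly) without the
 * table lock held; only the pointer swap happens under the write lock; the
 * stale storage is freed after the lock is dropped.  A flat array and a
 * pthread mutex stand in for the hash/bitmask structures and NHOPS_WLOCK().
 */
#include <pthread.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

struct table {
	pthread_mutex_t	lock;
	uint32_t	*slots;
	size_t		nslots;
};

static int
table_grow(struct table *t, size_t new_nslots)
{
	uint32_t *new_slots, *old_slots;

	new_slots = calloc(new_nslots, sizeof(uint32_t));
	if (new_slots == NULL)
		return (-1);	/* skip the resize; retry on a later insert */

	pthread_mutex_lock(&t->lock);
	/* assumes new_nslots >= t->nslots */
	memcpy(new_slots, t->slots, t->nslots * sizeof(uint32_t));
	old_slots = t->slots;
	t->slots = new_slots;
	t->nslots = new_nslots;
	pthread_mutex_unlock(&t->lock);

	free(old_slots);	/* old storage released outside the lock */
	return (0);
}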
*/ struct nhop_priv * unlink_nhop(struct nh_control *ctl, struct nhop_priv *nh_priv_del) { struct nhop_priv *priv_ret; int idx; uint32_t num_buckets_new, num_items_new; idx = 0; NHOPS_WLOCK(ctl); CHT_SLIST_REMOVE(&ctl->nh_head, nhops, nh_priv_del, priv_ret); if (priv_ret != NULL) { idx = priv_ret->nh_idx; priv_ret->nh_idx = 0; KASSERT((idx != 0), ("bogus nhop index 0")); if ((bitmask_free_idx(&ctl->nh_idx_head, idx)) != 0) { FIB_CTL_LOG(LOG_DEBUG, ctl, "Unable to remove index %d from fib %u af %d", idx, ctl->ctl_rh->rib_fibnum, ctl->ctl_rh->rib_family); } } /* Check if hash or index needs to be resized */ num_buckets_new = CHT_SLIST_GET_RESIZE_BUCKETS(&ctl->nh_head); num_items_new = bitmask_get_resize_items(&ctl->nh_idx_head); NHOPS_WUNLOCK(ctl); if (priv_ret == NULL) { FIB_CTL_LOG(LOG_INFO, ctl, "Unable to unlink nhop priv %p from hash, hash %u ctl %p", nh_priv_del, hash_priv(nh_priv_del), ctl); } else { FIB_CTL_LOG(LOG_DEBUG2, ctl, "Unlinked nhop %p priv idx %d", priv_ret, idx); } consider_resize(ctl, num_buckets_new, num_items_new); return (priv_ret); } /* * Searches for the nexthop by data specifcied in @nh_priv. * Returns referenced nexthop or NULL. */ struct nhop_priv * find_nhop(struct nh_control *ctl, const struct nhop_priv *nh_priv) { struct nhop_priv *nh_priv_ret; NHOPS_RLOCK(ctl); CHT_SLIST_FIND_BYOBJ(&ctl->nh_head, nhops, nh_priv, nh_priv_ret); if (nh_priv_ret != NULL) { if (refcount_acquire_if_not_zero(&nh_priv_ret->nh_refcnt) == 0){ /* refcount was 0 -> nhop is being deleted */ nh_priv_ret = NULL; } } NHOPS_RUNLOCK(ctl); return (nh_priv_ret); } diff --git a/sys/net/route/nhop_ctl.c b/sys/net/route/nhop_ctl.c index 201a1ed0a094..d042d9519f6b 100644 --- a/sys/net/route/nhop_ctl.c +++ b/sys/net/route/nhop_ctl.c @@ -1,1218 +1,1217 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2020 Alexander V. Chernikov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
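/*
 * Illustrative sketch (C11 atomics, not the kernel refcount(9) API): the
 * "acquire only if still alive" step used by find_nhop() above.  The lookup
 * can race with the final nhop_free(), so a reference is taken only while
 * the count is non-zero; once it has hit zero the object is being deleted
 * and the lookup reports it as absent.
 */
#include <stdatomic.h>
#include <stdbool.h>

static bool
try_ref(atomic_uint *refcnt)
{
	unsigned int old = atomic_load_explicit(refcnt, memory_order_relaxed);

	while (old != 0) {
		if (atomic_compare_exchange_weak_explicit(refcnt, &old,
		    old + 1, memory_order_acquire, memory_order_relaxed))
			return (true);	/* reference taken */
		/* CAS failure reloaded 'old'; loop re-checks for zero */
	}
	return (false);			/* object is being destroyed */
}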
*/ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_route.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define DEBUG_MOD_NAME nhop_ctl #define DEBUG_MAX_LEVEL LOG_DEBUG #include _DECLARE_DEBUG(LOG_INFO); /* * This file contains core functionality for the nexthop ("nhop") route subsystem. * The business logic needed to create nexhop objects is implemented here. * * Nexthops in the original sense are the objects containing all the necessary * information to forward the packet to the selected destination. * In particular, nexthop is defined by a combination of * ifp, ifa, aifp, mtu, gw addr(if set), nh_type, nh_upper_family, mask of rt_flags and * NHF_DEFAULT * * Additionally, each nexthop gets assigned its unique index (nexthop index). * It serves two purposes: first one is to ease the ability of userland programs to * reference nexthops by their index. The second one allows lookup algorithms to * to store index instead of pointer (2 bytes vs 8) as a lookup result. * All nexthops are stored in the resizable hash table. * * Basically, this file revolves around supporting 3 functions: * 1) nhop_create_from_info / nhop_create_from_nhop, which contains all * business logic on filling the nexthop fields based on the provided request. * 2) nhop_get(), which gets a usable referenced nexthops. * * Conventions: * 1) non-exported functions start with verb * 2) exported function starts with the subsystem prefix: "nhop" */ static int dump_nhop_entry(struct rib_head *rh, struct nhop_object *nh, struct sysctl_req *w); static int finalize_nhop(struct nh_control *ctl, struct nhop_object *nh, bool link); static struct ifnet *get_aifp(const struct nhop_object *nh); static void fill_sdl_from_ifp(struct sockaddr_dl_short *sdl, const struct ifnet *ifp); static void destroy_nhop_epoch(epoch_context_t ctx); static void destroy_nhop(struct nhop_object *nh); _Static_assert(__offsetof(struct nhop_object, nh_ifp) == 32, "nhop_object: wrong nh_ifp offset"); _Static_assert(sizeof(struct nhop_object) <= 128, "nhop_object: size exceeds 128 bytes"); static uma_zone_t nhops_zone; /* Global zone for each and every nexthop */ #define NHOP_OBJECT_ALIGNED_SIZE roundup2(sizeof(struct nhop_object), \ 2 * CACHE_LINE_SIZE) #define NHOP_PRIV_ALIGNED_SIZE roundup2(sizeof(struct nhop_priv), \ 2 * CACHE_LINE_SIZE) void nhops_init(void) { nhops_zone = uma_zcreate("routing nhops", NHOP_OBJECT_ALIGNED_SIZE + NHOP_PRIV_ALIGNED_SIZE, NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); } /* * Fetches the interface of source address used by the route. * In all cases except interface-address-route it would be the * same as the transmit interfaces. * However, for the interface address this function will return * this interface ifp instead of loopback. This is needed to support * link-local IPv6 loopback communications. * * Returns found ifp. */ static struct ifnet * get_aifp(const struct nhop_object *nh) { struct ifnet *aifp = NULL; /* * Adjust the "outgoing" interface. If we're going to loop * the packet back to ourselves, the ifp would be the loopback * interface. However, we'd rather know the interface associated * to the destination address (which should probably be one of * our own addresses). 
*/ if ((nh->nh_ifp->if_flags & IFF_LOOPBACK) && nh->gw_sa.sa_family == AF_LINK) { aifp = ifnet_byindex(nh->gwl_sa.sdl_index); if (aifp == NULL) { FIB_NH_LOG(LOG_WARNING, nh, "unable to get aifp for %s index %d", if_name(nh->nh_ifp), nh->gwl_sa.sdl_index); } } if (aifp == NULL) aifp = nh->nh_ifp; return (aifp); } int cmp_priv(const struct nhop_priv *_one, const struct nhop_priv *_two) { if (memcmp(_one->nh, _two->nh, NHOP_END_CMP) != 0) return (0); if (memcmp(_one, _two, NH_PRIV_END_CMP) != 0) return (0); return (1); } /* * Conditionally sets @nh mtu data based on the @info data. */ static void set_nhop_mtu_from_info(struct nhop_object *nh, const struct rt_addrinfo *info) { if (info->rti_mflags & RTV_MTU) nhop_set_mtu(nh, info->rti_rmx->rmx_mtu, true); } /* * Fills in shorted link-level sockadd version suitable to be stored inside the * nexthop gateway buffer. */ static void fill_sdl_from_ifp(struct sockaddr_dl_short *sdl, const struct ifnet *ifp) { bzero(sdl, sizeof(struct sockaddr_dl_short)); sdl->sdl_family = AF_LINK; sdl->sdl_len = sizeof(struct sockaddr_dl_short); sdl->sdl_index = ifp->if_index; sdl->sdl_type = ifp->if_type; } static int set_nhop_gw_from_info(struct nhop_object *nh, struct rt_addrinfo *info) { struct sockaddr *gw; gw = info->rti_info[RTAX_GATEWAY]; MPASS(gw != NULL); bool is_gw = info->rti_flags & RTF_GATEWAY; if ((gw->sa_family == AF_LINK) && !is_gw) { /* * Interface route with interface specified by the interface * index in sockadd_dl structure. It is used in the IPv6 loopback * output code, where we need to preserve the original interface * to maintain proper scoping. * Despite the fact that nexthop code stores original interface * in the separate field (nh_aifp, see below), write AF_LINK * compatible sa with shorter total length. */ struct sockaddr_dl *sdl = (struct sockaddr_dl *)gw; struct ifnet *ifp = ifnet_byindex(sdl->sdl_index); if (ifp == NULL) { FIB_NH_LOG(LOG_DEBUG, nh, "error: invalid ifindex %d", sdl->sdl_index); return (EINVAL); } nhop_set_direct_gw(nh, ifp); } else { /* * Multiple options here: * * 1) RTF_GATEWAY with IPv4/IPv6 gateway data * 2) Interface route with IPv4/IPv6 address of the * matching interface. Some routing daemons do that * instead of specifying ifindex in AF_LINK. * * In both cases, save the original nexthop to make the callers * happy. */ if (!nhop_set_gw(nh, gw, is_gw)) return (EINVAL); } return (0); } static void set_nhop_expire_from_info(struct nhop_object *nh, const struct rt_addrinfo *info) { uint32_t nh_expire = 0; /* Kernel -> userland timebase conversion. */ if ((info->rti_mflags & RTV_EXPIRE) && (info->rti_rmx->rmx_expire > 0)) nh_expire = info->rti_rmx->rmx_expire - time_second + time_uptime; nhop_set_expire(nh, nh_expire); } /* * Creates a new nexthop based on the information in @info. 
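/*
 * Illustrative sketch (standalone userland C, not kernel code): the
 * timebase conversion performed by set_nhop_expire_from_info() above.
 * Userland passes an absolute wall-clock expiry (RTV_EXPIRE); the kernel
 * rebases it onto its uptime clock so the deadline survives wall-clock
 * steps.  CLOCK_REALTIME and CLOCK_MONOTONIC are used here as rough
 * stand-ins for time_second and time_uptime.
 */
#include <stdint.h>
#include <time.h>

static uint32_t
expire_to_uptime(time_t wall_expire)
{
	struct timespec real, mono;

	if (wall_expire <= 0)
		return (0);		/* no expiry requested */
	clock_gettime(CLOCK_REALTIME, &real);
	clock_gettime(CLOCK_MONOTONIC, &mono);
	return ((uint32_t)(wall_expire - real.tv_sec + mono.tv_sec));
}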
* * Returns: * 0 on success, filling @nh_ret with the desired nexthop object ptr * errno otherwise */ int nhop_create_from_info(struct rib_head *rnh, struct rt_addrinfo *info, struct nhop_object **nh_ret) { int error; NET_EPOCH_ASSERT(); MPASS(info->rti_ifa != NULL); MPASS(info->rti_ifp != NULL); if (info->rti_info[RTAX_GATEWAY] == NULL) { FIB_RH_LOG(LOG_DEBUG, rnh, "error: empty gateway"); return (EINVAL); } struct nhop_object *nh = nhop_alloc(rnh->rib_fibnum, rnh->rib_family); if (nh == NULL) return (ENOMEM); if ((error = set_nhop_gw_from_info(nh, info)) != 0) { nhop_free(nh); return (error); } nhop_set_transmit_ifp(nh, info->rti_ifp); nhop_set_blackhole(nh, info->rti_flags & (RTF_BLACKHOLE | RTF_REJECT)); error = rnh->rnh_set_nh_pfxflags(rnh->rib_fibnum, info->rti_info[RTAX_DST], info->rti_info[RTAX_NETMASK], nh); nhop_set_redirect(nh, info->rti_flags & RTF_DYNAMIC); nhop_set_pinned(nh, info->rti_flags & RTF_PINNED); set_nhop_expire_from_info(nh, info); nhop_set_rtflags(nh, info->rti_flags); set_nhop_mtu_from_info(nh, info); nhop_set_src(nh, info->rti_ifa); /* * The remaining fields are either set from nh_preadd hook * or are computed from the provided data */ *nh_ret = nhop_get_nhop(nh, &error); return (error); } /* * Gets linked nhop using the provided @nh nexhop data. * If linked nhop is found, returns it, freeing the provided one. * If there is no such nexthop, attaches the remaining data to the * provided nexthop and links it. * * Returns 0 on success, storing referenced nexthop in @pnh. * Otherwise, errno is returned. */ struct nhop_object * nhop_get_nhop(struct nhop_object *nh, int *perror) { struct rib_head *rnh = nhop_get_rh(nh); if (__predict_false(rnh == NULL)) { *perror = EAFNOSUPPORT; nhop_free(nh); return (NULL); } return (nhop_get_nhop_internal(rnh, nh, perror)); } struct nhop_object * nhop_get_nhop_internal(struct rib_head *rnh, struct nhop_object *nh, int *perror) { struct nhop_priv *tmp_priv; int error; nh->nh_aifp = get_aifp(nh); /* Give the protocols chance to augment nexthop properties */ error = rnh->rnh_augment_nh(rnh->rib_fibnum, nh); if (error != 0) { nhop_free(nh); *perror = error; return (NULL); } tmp_priv = find_nhop(rnh->nh_control, nh->nh_priv); if (tmp_priv != NULL) { nhop_free(nh); *perror = 0; return (tmp_priv->nh); } /* * Existing nexthop not found, need to create new one. * Note: multiple simultaneous requests * can result in multiple equal nexhops existing in the * nexthop table. This is not a not a problem until the * relative number of such nexthops is significant, which * is extremely unlikely. */ *perror = finalize_nhop(rnh->nh_control, nh, true); return (*perror == 0 ? nh : NULL); } /* * Gets referenced but unlinked nhop. * Alocates/references the remaining bits of the nexthop data, so * it can be safely linked later or used as a clone source. * * Returns 0 on success. */ int nhop_get_unlinked(struct nhop_object *nh) { struct rib_head *rnh = nhop_get_rh(nh); if (__predict_false(rnh == NULL)) { nhop_free(nh); return (EAFNOSUPPORT); } nh->nh_aifp = get_aifp(nh); return (finalize_nhop(rnh->nh_control, nh, false)); } /* * Update @nh with data supplied in @info. * This is a helper function to support route changes. 
* * It limits the changes that can be done to the route to the following: * 1) all combination of gateway changes * 2) route flags (FLAG[123],STATIC) * 3) route MTU * * Returns: * 0 on success, errno otherwise */ static int alter_nhop_from_info(struct nhop_object *nh, struct rt_addrinfo *info) { struct sockaddr *info_gw; int error; /* Update MTU if set in the request*/ set_nhop_mtu_from_info(nh, info); /* Only RTF_FLAG[123] and RTF_STATIC */ uint32_t rt_flags = nhop_get_rtflags(nh) & ~RT_CHANGE_RTFLAGS_MASK; rt_flags |= info->rti_flags & RT_CHANGE_RTFLAGS_MASK; nhop_set_rtflags(nh, rt_flags); /* Consider gateway change */ info_gw = info->rti_info[RTAX_GATEWAY]; if (info_gw != NULL) { error = set_nhop_gw_from_info(nh, info); if (error != 0) return (error); } if (info->rti_ifa != NULL) nhop_set_src(nh, info->rti_ifa); if (info->rti_ifp != NULL) nhop_set_transmit_ifp(nh, info->rti_ifp); return (0); } /* * Creates new nexthop based on @nh_orig and augmentation data from @info. * Helper function used in the route changes, please see * alter_nhop_from_info() comments for more details. * * Returns: * 0 on success, filling @nh_ret with the desired nexthop object * errno otherwise */ int nhop_create_from_nhop(struct rib_head *rnh, const struct nhop_object *nh_orig, struct rt_addrinfo *info, struct nhop_object **pnh) { struct nhop_object *nh; int error; NET_EPOCH_ASSERT(); nh = nhop_alloc(rnh->rib_fibnum, rnh->rib_family); if (nh == NULL) return (ENOMEM); nhop_copy(nh, nh_orig); error = alter_nhop_from_info(nh, info); if (error != 0) { nhop_free(nh); return (error); } *pnh = nhop_get_nhop(nh, &error); return (error); } static bool reference_nhop_deps(struct nhop_object *nh) { if (!ifa_try_ref(nh->nh_ifa)) return (false); nh->nh_aifp = get_aifp(nh); if (!if_try_ref(nh->nh_aifp)) { ifa_free(nh->nh_ifa); return (false); } FIB_NH_LOG(LOG_DEBUG2, nh, "nh_aifp: %s nh_ifp %s", if_name(nh->nh_aifp), if_name(nh->nh_ifp)); if (!if_try_ref(nh->nh_ifp)) { ifa_free(nh->nh_ifa); if_rele(nh->nh_aifp); return (false); } return (true); } /* * Alocates/references the remaining bits of nexthop data and links * it to the hash table. * Returns 0 if successful, * errno otherwise. @nh_priv is freed in case of error. */ static int finalize_nhop(struct nh_control *ctl, struct nhop_object *nh, bool link) { /* Allocate per-cpu packet counter */ nh->nh_pksent = counter_u64_alloc(M_NOWAIT); if (nh->nh_pksent == NULL) { nhop_free(nh); RTSTAT_INC(rts_nh_alloc_failure); FIB_NH_LOG(LOG_WARNING, nh, "counter_u64_alloc() failed"); return (ENOMEM); } if (!reference_nhop_deps(nh)) { counter_u64_free(nh->nh_pksent); nhop_free(nh); RTSTAT_INC(rts_nh_alloc_failure); FIB_NH_LOG(LOG_WARNING, nh, "interface reference failed"); return (EAGAIN); } /* Save vnet to ease destruction */ nh->nh_priv->nh_vnet = curvnet; /* Please see nhop_free() comments on the initial value */ refcount_init(&nh->nh_priv->nh_linked, 2); MPASS(nh->nh_priv->nh_fibnum == ctl->ctl_rh->rib_fibnum); if (!link) { refcount_release(&nh->nh_priv->nh_linked); NHOPS_WLOCK(ctl); nh->nh_priv->nh_finalized = 1; NHOPS_WUNLOCK(ctl); } else if (link_nhop(ctl, nh->nh_priv) == 0) { /* * Adding nexthop to the datastructures * failed. Call destructor w/o waiting for * the epoch end, as nexthop is not used * and return. 
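/*
 * Illustrative sketch (standalone userland C, not kernel code): the masked
 * flag merge used by alter_nhop_from_info() above.  Only the bits covered
 * by the change mask may be rewritten by a route-change request; every
 * other flag keeps its current value.  The flag names and mask value below
 * are made up for the example; they are not the kernel's
 * RT_CHANGE_RTFLAGS_MASK definition.
 */
#include <stdint.h>
#include <stdio.h>

#define	F_GATEWAY	0x0001		/* not changeable through this path */
#define	F_FLAG1		0x0002
#define	F_STATIC	0x0800
#define	CHANGE_MASK	(F_FLAG1 | F_STATIC)

static uint32_t
merge_flags(uint32_t current, uint32_t requested)
{
	return ((current & ~CHANGE_MASK) | (requested & CHANGE_MASK));
}

int
main(void)
{
	/* The gateway bit survives; STATIC is picked up from the request. */
	printf("%#x\n", merge_flags(F_GATEWAY, F_STATIC));	/* 0x801 */
	return (0);
}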
*/ char nhbuf[NHOP_PRINT_BUFSIZE]; FIB_NH_LOG(LOG_WARNING, nh, "failed to link %s", nhop_print_buf(nh, nhbuf, sizeof(nhbuf))); destroy_nhop(nh); return (ENOBUFS); } IF_DEBUG_LEVEL(LOG_DEBUG) { char nhbuf[NHOP_PRINT_BUFSIZE] __unused; FIB_NH_LOG(LOG_DEBUG, nh, "finalized: %s", nhop_print_buf(nh, nhbuf, sizeof(nhbuf))); } return (0); } static void destroy_nhop(struct nhop_object *nh) { if_rele(nh->nh_ifp); if_rele(nh->nh_aifp); ifa_free(nh->nh_ifa); counter_u64_free(nh->nh_pksent); uma_zfree(nhops_zone, nh); } /* * Epoch callback indicating nhop is safe to destroy */ static void destroy_nhop_epoch(epoch_context_t ctx) { struct nhop_priv *nh_priv; nh_priv = __containerof(ctx, struct nhop_priv, nh_epoch_ctx); destroy_nhop(nh_priv->nh); } void nhop_ref_object(struct nhop_object *nh) { u_int old __diagused; old = refcount_acquire(&nh->nh_priv->nh_refcnt); KASSERT(old > 0, ("%s: nhop object %p has 0 refs", __func__, nh)); } int nhop_try_ref_object(struct nhop_object *nh) { return (refcount_acquire_if_not_zero(&nh->nh_priv->nh_refcnt)); } void nhop_free(struct nhop_object *nh) { struct nh_control *ctl; struct nhop_priv *nh_priv = nh->nh_priv; struct epoch_tracker et; if (!refcount_release(&nh_priv->nh_refcnt)) return; /* allows to use nhop_free() during nhop init */ if (__predict_false(nh_priv->nh_finalized == 0)) { uma_zfree(nhops_zone, nh); return; } IF_DEBUG_LEVEL(LOG_DEBUG) { char nhbuf[NHOP_PRINT_BUFSIZE] __unused; FIB_NH_LOG(LOG_DEBUG, nh, "deleting %s", nhop_print_buf(nh, nhbuf, sizeof(nhbuf))); } /* * There are only 2 places, where nh_linked can be decreased: * rib destroy (nhops_destroy_rib) and this function. * nh_link can never be increased. * * Hence, use initial value of 2 to make use of * refcount_release_if_not_last(). * * There can be two scenarious when calling this function: * * 1) nh_linked value is 2. This means that either * nhops_destroy_rib() has not been called OR it is running, * but we are guaranteed that nh_control won't be freed in * this epoch. Hence, nexthop can be safely unlinked. * * 2) nh_linked value is 1. In that case, nhops_destroy_rib() * has been called and nhop unlink can be skipped. */ NET_EPOCH_ENTER(et); if (refcount_release_if_not_last(&nh_priv->nh_linked)) { ctl = nh_priv->nh_control; if (unlink_nhop(ctl, nh_priv) == NULL) { /* Do not try to reclaim */ char nhbuf[NHOP_PRINT_BUFSIZE]; FIB_NH_LOG(LOG_WARNING, nh, "failed to unlink %s", nhop_print_buf(nh, nhbuf, sizeof(nhbuf))); NET_EPOCH_EXIT(et); return; } } NET_EPOCH_EXIT(et); - epoch_call(net_epoch_preempt, destroy_nhop_epoch, - &nh_priv->nh_epoch_ctx); + NET_EPOCH_CALL(destroy_nhop_epoch, &nh_priv->nh_epoch_ctx); } void nhop_ref_any(struct nhop_object *nh) { #ifdef ROUTE_MPATH if (!NH_IS_NHGRP(nh)) nhop_ref_object(nh); else nhgrp_ref_object((struct nhgrp_object *)nh); #else nhop_ref_object(nh); #endif } void nhop_free_any(struct nhop_object *nh) { #ifdef ROUTE_MPATH if (!NH_IS_NHGRP(nh)) nhop_free(nh); else nhgrp_free((struct nhgrp_object *)nh); #else nhop_free(nh); #endif } /* Nhop-related methods */ /* * Allocates an empty unlinked nhop object. 
* Returns object pointer or NULL on failure */ struct nhop_object * nhop_alloc(uint32_t fibnum, int family) { struct nhop_object *nh; struct nhop_priv *nh_priv; nh = (struct nhop_object *)uma_zalloc(nhops_zone, M_NOWAIT | M_ZERO); if (__predict_false(nh == NULL)) return (NULL); nh_priv = (struct nhop_priv *)((char *)nh + NHOP_OBJECT_ALIGNED_SIZE); nh->nh_priv = nh_priv; nh_priv->nh = nh; nh_priv->nh_upper_family = family; nh_priv->nh_fibnum = fibnum; /* Setup refcount early to allow nhop_free() to work */ refcount_init(&nh_priv->nh_refcnt, 1); return (nh); } void nhop_copy(struct nhop_object *nh, const struct nhop_object *nh_orig) { struct nhop_priv *nh_priv = nh->nh_priv; nh->nh_flags = nh_orig->nh_flags; nh->nh_mtu = nh_orig->nh_mtu; memcpy(&nh->gw_sa, &nh_orig->gw_sa, nh_orig->gw_sa.sa_len); nh->nh_ifp = nh_orig->nh_ifp; nh->nh_ifa = nh_orig->nh_ifa; nh->nh_aifp = nh_orig->nh_aifp; nh_priv->nh_upper_family = nh_orig->nh_priv->nh_upper_family; nh_priv->nh_neigh_family = nh_orig->nh_priv->nh_neigh_family; nh_priv->nh_type = nh_orig->nh_priv->nh_type; nh_priv->rt_flags = nh_orig->nh_priv->rt_flags; nh_priv->nh_fibnum = nh_orig->nh_priv->nh_fibnum; nh_priv->nh_origin = nh_orig->nh_priv->nh_origin; } void nhop_set_direct_gw(struct nhop_object *nh, struct ifnet *ifp) { nh->nh_flags &= ~NHF_GATEWAY; nh->nh_priv->rt_flags &= ~RTF_GATEWAY; nh->nh_priv->nh_neigh_family = nh->nh_priv->nh_upper_family; fill_sdl_from_ifp(&nh->gwl_sa, ifp); memset(&nh->gw_buf[nh->gw_sa.sa_len], 0, sizeof(nh->gw_buf) - nh->gw_sa.sa_len); } bool nhop_check_gateway(int upper_family, int neigh_family) { if (upper_family == neigh_family) return (true); else if (neigh_family == AF_UNSPEC || neigh_family == AF_LINK) return (true); #if defined(INET) && defined(INET6) else if (upper_family == AF_INET && neigh_family == AF_INET6 && rib_can_4o6_nhop()) return (true); #endif else return (false); } /* * Sets gateway for the nexthop. * It can be "normal" gateway with is_gw set or a special form of * adding interface route, refering to it by specifying local interface * address. In that case is_gw is set to false. 
*/ bool nhop_set_gw(struct nhop_object *nh, const struct sockaddr *gw, bool is_gw) { if (gw->sa_len > sizeof(nh->gw_buf)) { FIB_NH_LOG(LOG_DEBUG, nh, "nhop SA size too big: AF %d len %u", gw->sa_family, gw->sa_len); return (false); } if (!nhop_check_gateway(nh->nh_priv->nh_upper_family, gw->sa_family)) { FIB_NH_LOG(LOG_DEBUG, nh, "error: invalid dst/gateway family combination (%d, %d)", nh->nh_priv->nh_upper_family, gw->sa_family); return (false); } memcpy(&nh->gw_sa, gw, gw->sa_len); memset(&nh->gw_buf[gw->sa_len], 0, sizeof(nh->gw_buf) - gw->sa_len); if (is_gw) { nh->nh_flags |= NHF_GATEWAY; nh->nh_priv->rt_flags |= RTF_GATEWAY; nh->nh_priv->nh_neigh_family = gw->sa_family; } else { nh->nh_flags &= ~NHF_GATEWAY; nh->nh_priv->rt_flags &= ~RTF_GATEWAY; nh->nh_priv->nh_neigh_family = nh->nh_priv->nh_upper_family; } return (true); } bool nhop_set_upper_family(struct nhop_object *nh, int family) { if (!nhop_check_gateway(nh->nh_priv->nh_upper_family, family)) { FIB_NH_LOG(LOG_DEBUG, nh, "error: invalid upper/neigh family combination (%d, %d)", nh->nh_priv->nh_upper_family, family); return (false); } nh->nh_priv->nh_upper_family = family; return (true); } void nhop_set_broadcast(struct nhop_object *nh, bool is_broadcast) { if (is_broadcast) { nh->nh_flags |= NHF_BROADCAST; nh->nh_priv->rt_flags |= RTF_BROADCAST; } else { nh->nh_flags &= ~NHF_BROADCAST; nh->nh_priv->rt_flags &= ~RTF_BROADCAST; } } void nhop_set_blackhole(struct nhop_object *nh, int blackhole_rt_flag) { nh->nh_flags &= ~(NHF_BLACKHOLE | NHF_REJECT); nh->nh_priv->rt_flags &= ~(RTF_BLACKHOLE | RTF_REJECT); switch (blackhole_rt_flag) { case RTF_BLACKHOLE: nh->nh_flags |= NHF_BLACKHOLE; nh->nh_priv->rt_flags |= RTF_BLACKHOLE; break; case RTF_REJECT: nh->nh_flags |= NHF_REJECT; nh->nh_priv->rt_flags |= RTF_REJECT; break; } } void nhop_set_redirect(struct nhop_object *nh, bool is_redirect) { if (is_redirect) { nh->nh_priv->rt_flags |= RTF_DYNAMIC; nh->nh_flags |= NHF_REDIRECT; } else { nh->nh_priv->rt_flags &= ~RTF_DYNAMIC; nh->nh_flags &= ~NHF_REDIRECT; } } void nhop_set_pinned(struct nhop_object *nh, bool is_pinned) { if (is_pinned) nh->nh_priv->rt_flags |= RTF_PINNED; else nh->nh_priv->rt_flags &= ~RTF_PINNED; } uint32_t nhop_get_idx(const struct nhop_object *nh) { return (nh->nh_priv->nh_idx); } uint32_t nhop_get_uidx(const struct nhop_object *nh) { return (nh->nh_priv->nh_uidx); } void nhop_set_uidx(struct nhop_object *nh, uint32_t uidx) { nh->nh_priv->nh_uidx = uidx; } enum nhop_type nhop_get_type(const struct nhop_object *nh) { return (nh->nh_priv->nh_type); } void nhop_set_type(struct nhop_object *nh, enum nhop_type nh_type) { nh->nh_priv->nh_type = nh_type; } int nhop_get_rtflags(const struct nhop_object *nh) { return (nh->nh_priv->rt_flags); } /* * Sets generic rtflags that are not covered by other functions. */ void nhop_set_rtflags(struct nhop_object *nh, int rt_flags) { nh->nh_priv->rt_flags &= ~RT_SET_RTFLAGS_MASK; nh->nh_priv->rt_flags |= (rt_flags & RT_SET_RTFLAGS_MASK); } /* * Sets flags that are specific to the prefix (NHF_HOST or NHF_DEFAULT). */ void nhop_set_pxtype_flag(struct nhop_object *nh, int nh_flag) { if (nh_flag == NHF_HOST) { nh->nh_flags |= NHF_HOST; nh->nh_flags &= ~NHF_DEFAULT; nh->nh_priv->rt_flags |= RTF_HOST; } else if (nh_flag == NHF_DEFAULT) { nh->nh_flags |= NHF_DEFAULT; nh->nh_flags &= ~NHF_HOST; nh->nh_priv->rt_flags &= ~RTF_HOST; } else { nh->nh_flags &= ~(NHF_HOST | NHF_DEFAULT); nh->nh_priv->rt_flags &= ~RTF_HOST; } } /* * Sets nhop MTU. 
Sets RTF_FIXEDMTU if mtu is explicitly * specified by userland. */ void nhop_set_mtu(struct nhop_object *nh, uint32_t mtu, bool from_user) { if (from_user) { if (mtu != 0) nh->nh_priv->rt_flags |= RTF_FIXEDMTU; else nh->nh_priv->rt_flags &= ~RTF_FIXEDMTU; } nh->nh_mtu = mtu; } void nhop_set_src(struct nhop_object *nh, struct ifaddr *ifa) { nh->nh_ifa = ifa; } void nhop_set_transmit_ifp(struct nhop_object *nh, struct ifnet *ifp) { nh->nh_ifp = ifp; } struct vnet * nhop_get_vnet(const struct nhop_object *nh) { return (nh->nh_priv->nh_vnet); } struct nhop_object * nhop_select_func(struct nhop_object *nh, uint32_t flowid) { return (nhop_select(nh, flowid)); } /* * Returns address family of the traffic uses the nexthop. */ int nhop_get_upper_family(const struct nhop_object *nh) { return (nh->nh_priv->nh_upper_family); } /* * Returns address family of the LLE or gateway that is used * to forward the traffic to. */ int nhop_get_neigh_family(const struct nhop_object *nh) { return (nh->nh_priv->nh_neigh_family); } uint32_t nhop_get_fibnum(const struct nhop_object *nh) { return (nh->nh_priv->nh_fibnum); } void nhop_set_fibnum(struct nhop_object *nh, uint32_t fibnum) { nh->nh_priv->nh_fibnum = fibnum; } uint32_t nhop_get_expire(const struct nhop_object *nh) { return (nh->nh_priv->nh_expire); } void nhop_set_expire(struct nhop_object *nh, uint32_t expire) { MPASS(!NH_IS_LINKED(nh)); nh->nh_priv->nh_expire = expire; } struct rib_head * nhop_get_rh(const struct nhop_object *nh) { uint32_t fibnum = nhop_get_fibnum(nh); int family = nhop_get_neigh_family(nh); return (rt_tables_get_rnh(fibnum, family)); } uint8_t nhop_get_origin(const struct nhop_object *nh) { return (nh->nh_priv->nh_origin); } void nhop_set_origin(struct nhop_object *nh, uint8_t origin) { nh->nh_priv->nh_origin = origin; } void nhops_update_ifmtu(struct rib_head *rh, struct ifnet *ifp, uint32_t mtu) { struct nh_control *ctl; struct nhop_priv *nh_priv; struct nhop_object *nh; ctl = rh->nh_control; NHOPS_WLOCK(ctl); CHT_SLIST_FOREACH(&ctl->nh_head, nhops, nh_priv) { nh = nh_priv->nh; if (nh->nh_ifp == ifp) { if ((nh_priv->rt_flags & RTF_FIXEDMTU) == 0 || nh->nh_mtu > mtu) { /* Update MTU directly */ nh->nh_mtu = mtu; } } } CHT_SLIST_FOREACH_END; NHOPS_WUNLOCK(ctl); } /* * Prints nexthop @nh data in the provided @buf. 
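/*
 * Illustrative sketch (standalone userland C, not kernel code): the MTU
 * propagation rule from nhops_update_ifmtu() above.  When an interface MTU
 * changes, a nexthop on that interface follows it unless userland pinned
 * the nexthop MTU (RTF_FIXEDMTU); even a pinned MTU is clamped down so it
 * never exceeds the new interface MTU.
 */
#include <stdbool.h>
#include <stdint.h>

static uint32_t
updated_nh_mtu(uint32_t nh_mtu, bool fixed_mtu, uint32_t new_if_mtu)
{
	if (!fixed_mtu || nh_mtu > new_if_mtu)
		return (new_if_mtu);
	return (nh_mtu);
}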
* Example: nh#33/inet/em0/192.168.0.1 */ char * nhop_print_buf(const struct nhop_object *nh, char *buf, size_t bufsize) { #if defined(INET) || defined(INET6) char abuf[INET6_ADDRSTRLEN]; #endif struct nhop_priv *nh_priv = nh->nh_priv; const char *upper_str = rib_print_family(nh->nh_priv->nh_upper_family); switch (nh->gw_sa.sa_family) { #ifdef INET case AF_INET: inet_ntop(AF_INET, &nh->gw4_sa.sin_addr, abuf, sizeof(abuf)); snprintf(buf, bufsize, "nh#%d/%s/%s/%s", nh_priv->nh_idx, upper_str, if_name(nh->nh_ifp), abuf); break; #endif #ifdef INET6 case AF_INET6: inet_ntop(AF_INET6, &nh->gw6_sa.sin6_addr, abuf, sizeof(abuf)); snprintf(buf, bufsize, "nh#%d/%s/%s/%s", nh_priv->nh_idx, upper_str, if_name(nh->nh_ifp), abuf); break; #endif case AF_LINK: snprintf(buf, bufsize, "nh#%d/%s/%s/resolve", nh_priv->nh_idx, upper_str, if_name(nh->nh_ifp)); break; default: snprintf(buf, bufsize, "nh#%d/%s/%s/????", nh_priv->nh_idx, upper_str, if_name(nh->nh_ifp)); break; } return (buf); } char * nhop_print_buf_any(const struct nhop_object *nh, char *buf, size_t bufsize) { #ifdef ROUTE_MPATH if (NH_IS_NHGRP(nh)) return (nhgrp_print_buf((const struct nhgrp_object *)nh, buf, bufsize)); else #endif return (nhop_print_buf(nh, buf, bufsize)); } /* * Dumps a single entry to sysctl buffer. * * Layout: * rt_msghdr - generic RTM header to allow users to skip non-understood messages * nhop_external - nexhop description structure (with length) * nhop_addrs - structure encapsulating GW/SRC sockaddrs */ static int dump_nhop_entry(struct rib_head *rh, struct nhop_object *nh, struct sysctl_req *w) { struct { struct rt_msghdr rtm; struct nhop_external nhe; struct nhop_addrs na; } arpc; struct nhop_external *pnhe; struct sockaddr *gw_sa, *src_sa; struct sockaddr_storage ss; size_t addrs_len; int error; memset(&arpc, 0, sizeof(arpc)); arpc.rtm.rtm_msglen = sizeof(arpc); arpc.rtm.rtm_version = RTM_VERSION; arpc.rtm.rtm_type = RTM_GET; //arpc.rtm.rtm_flags = RTF_UP; arpc.rtm.rtm_flags = nh->nh_priv->rt_flags; /* nhop_external */ pnhe = &arpc.nhe; pnhe->nh_len = sizeof(struct nhop_external); pnhe->nh_idx = nh->nh_priv->nh_idx; pnhe->nh_fib = rh->rib_fibnum; pnhe->ifindex = nh->nh_ifp->if_index; pnhe->aifindex = nh->nh_aifp->if_index; pnhe->nh_family = nh->nh_priv->nh_upper_family; pnhe->nh_type = nh->nh_priv->nh_type; pnhe->nh_mtu = nh->nh_mtu; pnhe->nh_flags = nh->nh_flags; memcpy(pnhe->nh_prepend, nh->nh_prepend, sizeof(nh->nh_prepend)); pnhe->prepend_len = nh->nh_prepend_len; pnhe->nh_refcount = nh->nh_priv->nh_refcnt; pnhe->nh_pksent = counter_u64_fetch(nh->nh_pksent); /* sockaddr container */ addrs_len = sizeof(struct nhop_addrs); arpc.na.gw_sa_off = addrs_len; gw_sa = (struct sockaddr *)&nh->gw4_sa; addrs_len += gw_sa->sa_len; src_sa = nh->nh_ifa->ifa_addr; if (src_sa->sa_family == AF_LINK) { /* Shorten structure */ memset(&ss, 0, sizeof(struct sockaddr_storage)); fill_sdl_from_ifp((struct sockaddr_dl_short *)&ss, nh->nh_ifa->ifa_ifp); src_sa = (struct sockaddr *)&ss; } arpc.na.src_sa_off = addrs_len; addrs_len += src_sa->sa_len; /* Write total container length */ arpc.na.na_len = addrs_len; arpc.rtm.rtm_msglen += arpc.na.na_len - sizeof(struct nhop_addrs); error = SYSCTL_OUT(w, &arpc, sizeof(arpc)); if (error == 0) error = SYSCTL_OUT(w, gw_sa, gw_sa->sa_len); if (error == 0) error = SYSCTL_OUT(w, src_sa, src_sa->sa_len); return (error); } uint32_t nhops_get_count(struct rib_head *rh) { struct nh_control *ctl; uint32_t count; ctl = rh->nh_control; NHOPS_RLOCK(ctl); count = ctl->nh_head.items_count; NHOPS_RUNLOCK(ctl); return 
(count); } int nhops_dump_sysctl(struct rib_head *rh, struct sysctl_req *w) { struct nh_control *ctl; struct nhop_priv *nh_priv; int error; ctl = rh->nh_control; NHOPS_RLOCK(ctl); FIB_RH_LOG(LOG_DEBUG, rh, "dump %u items", ctl->nh_head.items_count); CHT_SLIST_FOREACH(&ctl->nh_head, nhops, nh_priv) { error = dump_nhop_entry(rh, nh_priv->nh, w); if (error != 0) { NHOPS_RUNLOCK(ctl); return (error); } } CHT_SLIST_FOREACH_END; NHOPS_RUNLOCK(ctl); return (0); } diff --git a/sys/net/route/route_rtentry.c b/sys/net/route/route_rtentry.c index 41e4ff8ac49f..64900ae3ae39 100644 --- a/sys/net/route/route_rtentry.c +++ b/sys/net/route/route_rtentry.c @@ -1,310 +1,309 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2021-2022 Alexander V. Chernikov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_route.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* Routing table UMA zone */ VNET_DEFINE_STATIC(uma_zone_t, rtzone); #define V_rtzone VNET(rtzone) void vnet_rtzone_init(void) { V_rtzone = uma_zcreate("rtentry", sizeof(struct rtentry), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); } #ifdef VIMAGE void vnet_rtzone_destroy(void) { uma_zdestroy(V_rtzone); } #endif /* * Creates rtentry and based on @dst/@netmask data. * Return 0 and fills in rtentry into @prt on success, * Note: rtentry mask ptr will be set to @netmask , thus its pointer is required * to be stable till the end of the operation (radix rt insertion/change/removal). */ struct rtentry * rt_alloc(struct rib_head *rnh, const struct sockaddr *dst, struct sockaddr *netmask) { MPASS(dst->sa_len <= sizeof(((struct rtentry *)NULL)->rt_dstb)); struct rtentry *rt = uma_zalloc(V_rtzone, M_NOWAIT | M_ZERO); if (rt == NULL) return (NULL); rt->rte_flags = RTF_UP | (netmask == NULL ? RTF_HOST : 0); /* Fill in dst, ensuring it's masked if needed. */ if (netmask != NULL) { rt_maskedcopy(dst, &rt->rt_dst, netmask); } else bcopy(dst, &rt->rt_dst, dst->sa_len); rt_key(rt) = &rt->rt_dst; /* Set netmask to the storage from info. 
	   It will be updated upon insertion */
	rt_mask(rt) = netmask;

	return (rt);
}

static void
destroy_rtentry(struct rtentry *rt)
{
#ifdef VIMAGE
	struct nhop_object *nh = rt->rt_nhop;

	/*
	 * At this point rnh and nh_control may already be freed, and the
	 * nexthop interface may have been migrated to a different vnet.
	 * Use the vnet stored in the nexthop to delete the entry.
	 */
#ifdef ROUTE_MPATH
	if (NH_IS_NHGRP(nh)) {
		const struct weightened_nhop *wn;
		uint32_t num_nhops;
		wn = nhgrp_get_nhops((struct nhgrp_object *)nh, &num_nhops);
		nh = wn[0].nh;
	}
#endif
	CURVNET_SET(nhop_get_vnet(nh));
#endif

	/* Unreference nexthop */
	nhop_free_any(rt->rt_nhop);
	rt_free_immediate(rt);

	CURVNET_RESTORE();
}

/*
 * Epoch callback indicating rtentry is safe to destroy
 */
static void
destroy_rtentry_epoch(epoch_context_t ctx)
{
	struct rtentry *rt;

	rt = __containerof(ctx, struct rtentry, rt_epoch_ctx);

	destroy_rtentry(rt);
}

/*
 * Schedule rtentry deletion
 */
void
rt_free(struct rtentry *rt)
{
	KASSERT(rt != NULL, ("%s: NULL rt", __func__));

-	epoch_call(net_epoch_preempt, destroy_rtentry_epoch,
-	    &rt->rt_epoch_ctx);
+	NET_EPOCH_CALL(destroy_rtentry_epoch, &rt->rt_epoch_ctx);
}

void
rt_free_immediate(struct rtentry *rt)
{
	uma_zfree(V_rtzone, rt);
}

bool
rt_is_host(const struct rtentry *rt)
{
	return (rt->rte_flags & RTF_HOST);
}

sa_family_t
rt_get_family(const struct rtentry *rt)
{
	const struct sockaddr *dst;

	dst = (const struct sockaddr *)rt_key_const(rt);

	return (dst->sa_family);
}

/*
 * Returns pointer to nexthop or nexthop group
 * associated with @rt
 */
struct nhop_object *
rt_get_raw_nhop(const struct rtentry *rt)
{
	return (rt->rt_nhop);
}

void
rt_get_rnd(const struct rtentry *rt, struct route_nhop_data *rnd)
{
	rnd->rnd_nhop = rt->rt_nhop;
	rnd->rnd_weight = rt->rt_weight;
}

#ifdef INET
/*
 * Stores IPv4 address and prefix length of @rt inside
 * @paddr and @plen.
 * @pscopeid is currently always set to 0.
 */
void
rt_get_inet_prefix_plen(const struct rtentry *rt, struct in_addr *paddr,
    int *plen, uint32_t *pscopeid)
{
	const struct sockaddr_in *dst;

	dst = (const struct sockaddr_in *)rt_key_const(rt);
	KASSERT((dst->sin_family == AF_INET),
	    ("rt family is %d, not inet", dst->sin_family));
	*paddr = dst->sin_addr;
	dst = (const struct sockaddr_in *)rt_mask_const(rt);
	if (dst == NULL)
		*plen = 32;
	else
		*plen = bitcount32(dst->sin_addr.s_addr);
	*pscopeid = 0;
}

/*
 * Stores IPv4 address and prefix mask of @rt inside
 * @paddr and @pmask. Sets mask to INADDR_BROADCAST for host routes.
 * @pscopeid is currently always set to 0.
 */
void
rt_get_inet_prefix_pmask(const struct rtentry *rt, struct in_addr *paddr,
    struct in_addr *pmask, uint32_t *pscopeid)
{
	const struct sockaddr_in *dst;

	dst = (const struct sockaddr_in *)rt_key_const(rt);
	KASSERT((dst->sin_family == AF_INET),
	    ("rt family is %d, not inet", dst->sin_family));
	*paddr = dst->sin_addr;
	dst = (const struct sockaddr_in *)rt_mask_const(rt);
	if (dst == NULL)
		pmask->s_addr = INADDR_BROADCAST;
	else
		*pmask = dst->sin_addr;
	*pscopeid = 0;
}
#endif

#ifdef INET6
static int
inet6_get_plen(const struct in6_addr *addr)
{
	return (bitcount32(addr->s6_addr32[0]) + bitcount32(addr->s6_addr32[1]) +
	    bitcount32(addr->s6_addr32[2]) + bitcount32(addr->s6_addr32[3]));
}

/*
 * Stores IPv6 address and prefix length of @rt inside
 * @paddr and @plen. Addresses are returned in de-embedded form.
 * Scopeid is set to 0 for non-LL addresses.
*/ void rt_get_inet6_prefix_plen(const struct rtentry *rt, struct in6_addr *paddr, int *plen, uint32_t *pscopeid) { const struct sockaddr_in6 *dst; dst = (const struct sockaddr_in6 *)rt_key_const(rt); KASSERT((dst->sin6_family == AF_INET6), ("rt family is %d, not inet6", dst->sin6_family)); if (IN6_IS_SCOPE_LINKLOCAL(&dst->sin6_addr)) in6_splitscope(&dst->sin6_addr, paddr, pscopeid); else *paddr = dst->sin6_addr; dst = (const struct sockaddr_in6 *)rt_mask_const(rt); if (dst == NULL) *plen = 128; else *plen = inet6_get_plen(&dst->sin6_addr); } /* * Stores IPv6 address and prefix mask of @rt inside * @paddr and @pmask. Addresses are returned in de-embedded form. * Scopeid is set to 0 for non-LL addresses. */ void rt_get_inet6_prefix_pmask(const struct rtentry *rt, struct in6_addr *paddr, struct in6_addr *pmask, uint32_t *pscopeid) { const struct sockaddr_in6 *dst; dst = (const struct sockaddr_in6 *)rt_key_const(rt); KASSERT((dst->sin6_family == AF_INET6), ("rt family is %d, not inet", dst->sin6_family)); if (IN6_IS_SCOPE_LINKLOCAL(&dst->sin6_addr)) in6_splitscope(&dst->sin6_addr, paddr, pscopeid); else *paddr = dst->sin6_addr; dst = (const struct sockaddr_in6 *)rt_mask_const(rt); if (dst == NULL) memset(pmask, 0xFF, sizeof(struct in6_addr)); else *pmask = dst->sin6_addr; } #endif diff --git a/sys/net/route/route_subscription.c b/sys/net/route/route_subscription.c index 65b1bdae9900..2c2b3e94e6b4 100644 --- a/sys/net/route/route_subscription.c +++ b/sys/net/route/route_subscription.c @@ -1,213 +1,210 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2021-2022 Alexander V. Chernikov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include "opt_route.h" #include #include #include #include #include #include #include #include #include #include #include struct rib_subscription { CK_STAILQ_ENTRY(rib_subscription) next; rib_subscription_cb_t *func; void *arg; struct rib_head *rnh; enum rib_subscription_type type; struct epoch_context epoch_ctx; }; static void destroy_subscription_epoch(epoch_context_t ctx); void rib_notify(struct rib_head *rnh, enum rib_subscription_type type, struct rib_cmd_info *rc) { struct rib_subscription *rs; CK_STAILQ_FOREACH(rs, &rnh->rnh_subscribers, next) { if (rs->type == type) rs->func(rnh, rc, rs->arg); } } static struct rib_subscription * allocate_subscription(rib_subscription_cb_t *f, void *arg, enum rib_subscription_type type, bool waitok) { struct rib_subscription *rs; int flags = M_ZERO | (waitok ? M_WAITOK : M_NOWAIT); rs = malloc(sizeof(struct rib_subscription), M_RTABLE, flags); if (rs == NULL) return (NULL); rs->func = f; rs->arg = arg; rs->type = type; return (rs); } /* * Subscribe for the changes in the routing table specified by @fibnum and * @family. * * Returns pointer to the subscription structure on success. */ struct rib_subscription * rib_subscribe(uint32_t fibnum, int family, rib_subscription_cb_t *f, void *arg, enum rib_subscription_type type, bool waitok) { struct rib_head *rnh; struct epoch_tracker et; NET_EPOCH_ENTER(et); KASSERT((fibnum < rt_numfibs), ("%s: bad fibnum", __func__)); rnh = rt_tables_get_rnh(fibnum, family); NET_EPOCH_EXIT(et); return (rib_subscribe_internal(rnh, f, arg, type, waitok)); } struct rib_subscription * rib_subscribe_internal(struct rib_head *rnh, rib_subscription_cb_t *f, void *arg, enum rib_subscription_type type, bool waitok) { struct rib_subscription *rs; struct epoch_tracker et; if ((rs = allocate_subscription(f, arg, type, waitok)) == NULL) return (NULL); rs->rnh = rnh; NET_EPOCH_ENTER(et); RIB_WLOCK(rnh); CK_STAILQ_INSERT_HEAD(&rnh->rnh_subscribers, rs, next); RIB_WUNLOCK(rnh); NET_EPOCH_EXIT(et); return (rs); } struct rib_subscription * rib_subscribe_locked(struct rib_head *rnh, rib_subscription_cb_t *f, void *arg, enum rib_subscription_type type) { struct rib_subscription *rs; NET_EPOCH_ASSERT(); RIB_WLOCK_ASSERT(rnh); if ((rs = allocate_subscription(f, arg, type, false)) == NULL) return (NULL); rs->rnh = rnh; CK_STAILQ_INSERT_HEAD(&rnh->rnh_subscribers, rs, next); return (rs); } /* * Remove rtable subscription @rs from the routing table. * Needs to be run in network epoch. 
 */
void
rib_unsubscribe(struct rib_subscription *rs)
{
	struct rib_head *rnh = rs->rnh;

	NET_EPOCH_ASSERT();

	RIB_WLOCK(rnh);
	CK_STAILQ_REMOVE(&rnh->rnh_subscribers, rs, rib_subscription, next);
	RIB_WUNLOCK(rnh);

-	epoch_call(net_epoch_preempt, destroy_subscription_epoch,
-	    &rs->epoch_ctx);
+	NET_EPOCH_CALL(destroy_subscription_epoch, &rs->epoch_ctx);
}

void
rib_unsubscribe_locked(struct rib_subscription *rs)
{
	struct rib_head *rnh = rs->rnh;

	NET_EPOCH_ASSERT();
	RIB_WLOCK_ASSERT(rnh);

	CK_STAILQ_REMOVE(&rnh->rnh_subscribers, rs, rib_subscription, next);

-	epoch_call(net_epoch_preempt, destroy_subscription_epoch,
-	    &rs->epoch_ctx);
+	NET_EPOCH_CALL(destroy_subscription_epoch, &rs->epoch_ctx);
}

/*
 * Epoch callback indicating subscription is safe to destroy
 */
static void
destroy_subscription_epoch(epoch_context_t ctx)
{
	struct rib_subscription *rs;

	rs = __containerof(ctx, struct rib_subscription, epoch_ctx);

	free(rs, M_RTABLE);
}

void
rib_init_subscriptions(struct rib_head *rnh)
{
	CK_STAILQ_INIT(&rnh->rnh_subscribers);
}

void
rib_destroy_subscriptions(struct rib_head *rnh)
{
	struct rib_subscription *rs;
	struct epoch_tracker et;

	NET_EPOCH_ENTER(et);
	RIB_WLOCK(rnh);
	while ((rs = CK_STAILQ_FIRST(&rnh->rnh_subscribers)) != NULL) {
		CK_STAILQ_REMOVE_HEAD(&rnh->rnh_subscribers, next);
-		epoch_call(net_epoch_preempt, destroy_subscription_epoch,
-		    &rs->epoch_ctx);
+		NET_EPOCH_CALL(destroy_subscription_epoch, &rs->epoch_ctx);
	}
	RIB_WUNLOCK(rnh);
	NET_EPOCH_EXIT(et);
}
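/*
 * Illustrative sketch (not part of the change above): how a kernel consumer
 * might use the subscription API from this file.  The names example_rs,
 * example_route_cb and example_subscribe are hypothetical; the callback
 * signature and the rib_subscribe()/rib_unsubscribe() calls come from the
 * code above, and RIB_NOTIFY_DELAYED / rc_cmd / rib_fibnum are assumed from
 * the corresponding route headers.
 */
static struct rib_subscription *example_rs;

static void
example_route_cb(struct rib_head *rnh, struct rib_cmd_info *rc, void *arg)
{
	/* Called from within the network epoch for every matching change. */
	printf("fib %u: routing table change, cmd %d\n", rnh->rib_fibnum,
	    rc->rc_cmd);
}

static void
example_subscribe(void)
{
	/* Subscribe to IPv4 routing table changes in fib 0; may sleep. */
	example_rs = rib_subscribe(0, AF_INET, example_route_cb, NULL,
	    RIB_NOTIFY_DELAYED, true);

	/* Teardown must run in the network epoch: rib_unsubscribe(example_rs). */
}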
diff --git a/sys/net/route/route_tables.c b/sys/net/route/route_tables.c
index 614c28ded0cf..637a86573320 100644
--- a/sys/net/route/route_tables.c
+++ b/sys/net/route/route_tables.c
@@ -1,365 +1,365 @@
/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1980, 1986, 1991, 1993
 *	The Regents of the University of California. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/************************************************************************
 * Note: In this file a 'fib' is a "forwarding information base",	*
 * which is the newer name for an in-kernel routing (next hop) table.	*
 ***********************************************************************/

#include
__FBSDID("$FreeBSD$");

#include "opt_route.h"

#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

/* Kernel config default option. */
#ifdef ROUTETABLES
#if ROUTETABLES <= 0
#error "ROUTETABLES defined too low"
#endif
#if ROUTETABLES > RT_MAXFIBS
#error "ROUTETABLES defined too big"
#endif
#define	RT_NUMFIBS	ROUTETABLES
#endif /* ROUTETABLES */
/* Initialize to default if not otherwise set. */
#ifndef	RT_NUMFIBS
#define	RT_NUMFIBS	1
#endif

static void grow_rtables(uint32_t num_fibs);

VNET_DEFINE_STATIC(struct sx, rtables_lock);
#define	V_rtables_lock		VNET(rtables_lock)
#define	RTABLES_LOCK()		sx_xlock(&V_rtables_lock)
#define	RTABLES_UNLOCK()	sx_xunlock(&V_rtables_lock)
#define	RTABLES_LOCK_INIT()	sx_init(&V_rtables_lock, "rtables lock")
#define	RTABLES_LOCK_ASSERT()	sx_assert(&V_rtables_lock, SA_LOCKED)

VNET_DEFINE_STATIC(struct rib_head **, rt_tables);
#define	V_rt_tables	VNET(rt_tables)

VNET_DEFINE(uint32_t, _rt_numfibs) = RT_NUMFIBS;

/*
 * Handler for the net.my_fibnum sysctl.
 * Returns the current fib of the calling process.
 */
static int
sysctl_my_fibnum(SYSCTL_HANDLER_ARGS)
{
	int fibnum;
	int error;

	fibnum = curthread->td_proc->p_fibnum;
	error = sysctl_handle_int(oidp, &fibnum, 0, req);
	return (error);
}

SYSCTL_PROC(_net, OID_AUTO, my_fibnum,
    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    &sysctl_my_fibnum, "I", "default FIB of caller");

static uint32_t
normalize_num_rtables(uint32_t num_rtables)
{
	if (num_rtables > RT_MAXFIBS)
		num_rtables = RT_MAXFIBS;
	else if (num_rtables == 0)
		num_rtables = 1;
	return (num_rtables);
}

/*
 * Sets the number of fibs in the current vnet.
 * The number of rtables can only grow; shrinking is not allowed.
 */
static int
sysctl_fibs(SYSCTL_HANDLER_ARGS)
{
	uint32_t new_fibs;
	int error;

	RTABLES_LOCK();
	new_fibs = V_rt_numfibs;
	error = sysctl_handle_32(oidp, &new_fibs, 0, req);
	if (error == 0) {
		new_fibs = normalize_num_rtables(new_fibs);

		if (new_fibs < V_rt_numfibs)
			error = ENOTCAPABLE;
		if (new_fibs > V_rt_numfibs)
			grow_rtables(new_fibs);
	}
	RTABLES_UNLOCK();

	return (error);
}

SYSCTL_PROC(_net, OID_AUTO, fibs,
    CTLFLAG_VNET | CTLTYPE_U32 | CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE,
    NULL, 0, &sysctl_fibs, "IU", "set number of fibs");

/*
 * Sets the fib of the current process.
 */
int
sys_setfib(struct thread *td, struct setfib_args *uap)
{
	int error = 0;

	CURVNET_SET(TD_TO_VNET(td));
	if (uap->fibnum >= 0 && uap->fibnum < V_rt_numfibs)
		td->td_proc->p_fibnum = uap->fibnum;
	else
		error = EINVAL;
	CURVNET_RESTORE();

	return (error);
}

/*
 * If required, copy interface routes from existing tables to the
 * newly-created routing table.
 */
static void
populate_kernel_routes(struct rib_head **new_rt_tables, struct rib_head *rh)
{
	for (int i = 0; i < V_rt_numfibs; i++) {
		struct rib_head *rh_src =
		    new_rt_tables[i * (AF_MAX + 1) + rh->rib_family];

		if ((rh_src != NULL) && (rh_src != rh))
			rib_copy_kernel_routes(rh_src, rh);
	}
}
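/*
 * Illustrative sketch (not part of the change above): the userland view of
 * the two knobs handled earlier in this file.  net.fibs may only be raised
 * at runtime (sysctl_fibs() returns ENOTCAPABLE on shrink attempts), and
 * setfib(2) selects the fib used by the calling process.  Error handling is
 * elided and the setfib(2) header is assumed per its manual page.
 */
#include <sys/param.h>
#include <sys/socket.h>
#include <sys/sysctl.h>

int
main(void)
{
	uint32_t fibs = 4;

	/* Raise the number of fibs; lowering it would fail. */
	sysctlbyname("net.fibs", NULL, NULL, &fibs, sizeof(fibs));

	/* Bind this process (and future children) to fib 1. */
	setfib(1);

	return (0);
}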
/*
 * Grows the number of routing tables in the current vnet.
 * Creates a new index array for all rtables and allocates
 * the remaining routing tables.
 */
static void
grow_rtables(uint32_t num_tables)
{
	struct domain *dom;
	struct rib_head **prnh, *rh;
	struct rib_head **new_rt_tables, **old_rt_tables;
	int family;

	RTABLES_LOCK_ASSERT();

	KASSERT(num_tables >= V_rt_numfibs, ("num_tables(%u) < rt_numfibs(%u)\n",
	    num_tables, V_rt_numfibs));

	new_rt_tables = mallocarray(num_tables * (AF_MAX + 1), sizeof(void *),
	    M_RTABLE, M_WAITOK | M_ZERO);

	if ((num_tables > 1) && (V_rt_add_addr_allfibs == 0))
		printf("WARNING: Adding ifaddrs to all fibs has been turned off "
		    "by default. Consider tuning %s if needed\n",
		    "net.add_addr_allfibs");

#ifdef FIB_ALGO
	fib_grow_rtables(num_tables);
#endif

	/*
	 * Current rt_tables layout:
	 * fib0[af0, af1, af2, .., AF_MAX]fib1[af0, af1, af2, .., AF_MAX]..
	 * this allows existing table data to be copied with a single memcpy()
	 */
	if (V_rt_tables != NULL)
		memcpy(new_rt_tables, V_rt_tables,
		    V_rt_numfibs * (AF_MAX + 1) * sizeof(void *));

	/* Populate the remaining tables */
	SLIST_FOREACH(dom, &domains, dom_next) {
		if (dom->dom_rtattach == NULL)
			continue;

		family = dom->dom_family;
		for (int i = 0; i < num_tables; i++) {
			prnh = &new_rt_tables[i * (AF_MAX + 1) + family];
			if (*prnh != NULL)
				continue;
			rh = dom->dom_rtattach(i);
			if (rh == NULL)
				log(LOG_ERR,
				    "unable to create routing table for %d.%d\n",
				    dom->dom_family, i);
			else
				populate_kernel_routes(new_rt_tables, rh);
			*prnh = rh;
		}
	}

	/*
	 * Update the rtables pointer.
	 * Ensure all writes to new_rt_tables have completed before
	 * switching the pointer.
	 */
	atomic_thread_fence_rel();
	old_rt_tables = V_rt_tables;
	V_rt_tables = new_rt_tables;

	/* Wait until all CPUs see the new pointer */
	atomic_thread_fence_rel();
-	epoch_wait_preempt(net_epoch_preempt);
+	NET_EPOCH_WAIT();

	/* Set the number of fibs to the new value */
	V_rt_numfibs = num_tables;

#ifdef FIB_ALGO
	/* Attach the fib algo to the new rtables */
	SLIST_FOREACH(dom, &domains, dom_next) {
		if (dom->dom_rtattach != NULL)
			fib_setup_family(dom->dom_family, num_tables);
	}
#endif

	if (old_rt_tables != NULL)
		free(old_rt_tables, M_RTABLE);
}

static void
vnet_rtables_init(const void *unused __unused)
{
	int num_rtables_base;

	if (IS_DEFAULT_VNET(curvnet)) {
		num_rtables_base = RT_NUMFIBS;
		TUNABLE_INT_FETCH("net.fibs", &num_rtables_base);
		V_rt_numfibs = normalize_num_rtables(num_rtables_base);
	} else
		V_rt_numfibs = 1;

	vnet_rtzone_init();
#ifdef FIB_ALGO
	vnet_fib_init();
#endif
	RTABLES_LOCK_INIT();

	RTABLES_LOCK();
	grow_rtables(V_rt_numfibs);
	RTABLES_UNLOCK();
}
VNET_SYSINIT(vnet_rtables_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH,
    vnet_rtables_init, 0);
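/*
 * Illustrative sketch (not part of the change above): the flat rt_tables
 * layout described in grow_rtables() keeps one slot per (fib, family) pair,
 * fib-major, so a slot index is fib * (AF_MAX + 1) + family.  The helper
 * name rt_tables_slot is hypothetical; rt_tables_get_rnh_ptr() below does
 * the same arithmetic on the live array.
 */
static inline uint32_t
rt_tables_slot(uint32_t fib, sa_family_t family)
{
	return (fib * (AF_MAX + 1) + family);
}
/* Example: the IPv6 table of fib 2 lives at index rt_tables_slot(2, AF_INET6). */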
#ifdef VIMAGE
static void
rtables_destroy(const void *unused __unused)
{
	struct rib_head *rnh;
	struct domain *dom;
	int family;

	RTABLES_LOCK();
	SLIST_FOREACH(dom, &domains, dom_next) {
		if (dom->dom_rtdetach == NULL)
			continue;

		family = dom->dom_family;
		for (int i = 0; i < V_rt_numfibs; i++) {
			rnh = rt_tables_get_rnh(i, family);
			dom->dom_rtdetach(rnh);
		}
	}
	RTABLES_UNLOCK();

	/*
	 * dom_rtdetach calls rt_table_destroy(), which
	 * schedules the deletion of all rtentries, nexthops and control
	 * structures. Wait for the destruction callbacks to fire.
	 * Note that this should free all rtentries, but nexthop deletions
	 * will be scheduled for the next epoch run and will complete
	 * after vnet teardown.
	 */
	NET_EPOCH_DRAIN_CALLBACKS();

	free(V_rt_tables, M_RTABLE);
	vnet_rtzone_destroy();
#ifdef FIB_ALGO
	vnet_fib_destroy();
#endif
}
VNET_SYSUNINIT(rtables_destroy, SI_SUB_PROTO_DOMAIN, SI_ORDER_FIRST,
    rtables_destroy, 0);
#endif

static inline struct rib_head *
rt_tables_get_rnh_ptr(uint32_t table, sa_family_t family)
{
	struct rib_head **prnh;

	KASSERT(table < V_rt_numfibs, ("%s: table out of bounds (%d < %d)",
	    __func__, table, V_rt_numfibs));
	KASSERT(family < (AF_MAX + 1), ("%s: fam out of bounds (%d < %d)",
	    __func__, family, AF_MAX + 1));

	/* rnh is [fib=0][af=0]. */
	prnh = V_rt_tables;
	/* Get the offset to the requested table and fam. */
	prnh += table * (AF_MAX + 1) + family;

	return (*prnh);
}

struct rib_head *
rt_tables_get_rnh(uint32_t table, sa_family_t family)
{
	return (rt_tables_get_rnh_ptr(table, family));
}

u_int
rt_tables_get_gen(uint32_t table, sa_family_t family)
{
	struct rib_head *rnh;

	rnh = rt_tables_get_rnh_ptr(table, family);
	KASSERT(rnh != NULL, ("%s: NULL rib_head pointer table %d family %d",
	    __func__, table, family));

	return (rnh->rnh_gen);
}
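/*
 * Illustrative sketch (not part of the change above): the reader side of the
 * publish/wait scheme used by grow_rtables().  Lookups of the rt_tables
 * array run inside the network epoch (as rib_subscribe() does earlier in
 * this diff), which is why the old pointer array may be freed once
 * NET_EPOCH_WAIT() returns.  The function name example_get_rnh is
 * hypothetical.
 */
static struct rib_head *
example_get_rnh(uint32_t fibnum, int family)
{
	struct epoch_tracker et;
	struct rib_head *rnh;

	NET_EPOCH_ENTER(et);
	rnh = rt_tables_get_rnh(fibnum, family);
	NET_EPOCH_EXIT(et);

	/* The rib_head itself stays valid for the lifetime of the vnet. */
	return (rnh);
}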