diff --git a/sys/net/route/fib_algo.c b/sys/net/route/fib_algo.c index 30e715e1e1ef..91565d727a9c 100644 --- a/sys/net/route/fib_algo.c +++ b/sys/net/route/fib_algo.c @@ -1,2009 +1,2020 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2020 Alexander V. Chernikov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_route.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #endif #include #include #include #include #include #include /* * Fib lookup framework. 
* * This framework enables accelerated longest-prefix-match lookups for the * routing tables by adding the ability to dynamically attach/detach lookup * algorithm implementations to/from the datapath. * * flm - fib lookup modules - implementation of particular lookup algorithm * fd - fib data - instance of an flm bound to specific routing table * * This file provides main framework functionality. * * The following are the features provided by the framework: * * 1) nexthops abstraction -> provides transparent referencing, indexing * and efficient idx->ptr mappings for nexthop and nexthop groups. * 2) Routing table synchronisation * 3) dataplane attachment points * 4) automatic algorithm selection based on the provided preference. * * * DATAPATH * For each supported address family, there is an allocated array of fib_dp * structures, indexed by fib number. Each array entry contains a callback function * and its argument. This function will be called with a family-specific lookup key, * scope and provided argument. This array gets re-created every time a new algo * instance gets created. Please take a look at the replace_rtables_family() function * for more details. 
* */ SYSCTL_DECL(_net_route); SYSCTL_NODE(_net_route, OID_AUTO, algo, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Fib algorithm lookups"); /* Algorithm sync policy */ /* Time interval to bucket updates */ VNET_DEFINE(unsigned int, bucket_time_ms) = 50; #define V_bucket_time_ms VNET(bucket_time_ms) SYSCTL_UINT(_net_route_algo, OID_AUTO, bucket_time_ms, CTLFLAG_RW | CTLFLAG_VNET, &VNET_NAME(bucket_time_ms), 0, "Time interval to calculate update rate"); /* Minimum update rate to delay sync */ VNET_DEFINE(unsigned int, bucket_change_threshold_rate) = 500; #define V_bucket_change_threshold_rate VNET(bucket_change_threshold_rate) SYSCTL_UINT(_net_route_algo, OID_AUTO, bucket_change_threshold_rate, CTLFLAG_RW | CTLFLAG_VNET, &VNET_NAME(bucket_change_threshold_rate), 0, "Minimum update rate to delay sync"); /* Max allowed delay to sync */ VNET_DEFINE(unsigned int, fib_max_sync_delay_ms) = 1000; #define V_fib_max_sync_delay_ms VNET(fib_max_sync_delay_ms) SYSCTL_UINT(_net_route_algo, OID_AUTO, fib_max_sync_delay_ms, CTLFLAG_RW | CTLFLAG_VNET, &VNET_NAME(fib_max_sync_delay_ms), 0, "Maximum time to delay sync (ms)"); #ifdef INET6 VNET_DEFINE_STATIC(bool, algo_fixed_inet6) = false; #define V_algo_fixed_inet6 VNET(algo_fixed_inet6) SYSCTL_NODE(_net_route_algo, OID_AUTO, inet6, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "IPv6 longest prefix match lookups"); #endif #ifdef INET VNET_DEFINE_STATIC(bool, algo_fixed_inet) = false; #define V_algo_fixed_inet VNET(algo_fixed_inet) SYSCTL_NODE(_net_route_algo, OID_AUTO, inet, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "IPv4 longest prefix match lookups"); #endif /* Fib instance counter */ static uint32_t fib_gen = 0; struct nhop_ref_table { uint32_t count; int32_t refcnt[0]; }; enum fib_callout_action { FDA_NONE, /* No callout scheduled */ FDA_REBUILD, /* Asks to rebuild algo instance */ FDA_EVAL, /* Asks to evaluate if the current algo is still be best */ FDA_BATCH, /* Asks to submit batch of updates to the algo */ }; struct fib_sync_status { struct timeval 
diverge_time; /* ts when diverged */ uint32_t num_changes; /* number of changes since sync */ uint32_t bucket_changes; /* num changes within the current bucket */ uint64_t bucket_id; /* 50ms bucket # */ struct fib_change_queue fd_change_queue;/* list of scheduled entries */ }; /* * Data structure for the fib lookup instance tied to the particular rib. */ struct fib_data { uint32_t number_nhops; /* current # of nhops */ uint8_t hit_nhops; /* true if out of nhop limit */ uint8_t init_done; /* true if init is competed */ uint32_t fd_dead:1; /* Scheduled for deletion */ uint32_t fd_linked:1; /* true if linked */ uint32_t fd_need_rebuild:1; /* true if rebuild scheduled */ uint32_t fd_batch:1; /* true if batched notification scheduled */ uint8_t fd_family; /* family */ uint32_t fd_fibnum; /* fibnum */ uint32_t fd_failed_rebuilds; /* stat: failed rebuilds */ uint32_t fd_gen; /* instance gen# */ struct callout fd_callout; /* rebuild callout */ enum fib_callout_action fd_callout_action; /* Callout action to take */ void *fd_algo_data; /* algorithm data */ struct nhop_object **nh_idx; /* nhop idx->ptr array */ struct nhop_ref_table *nh_ref_table; /* array with # of nhop references */ struct rib_head *fd_rh; /* RIB table we're attached to */ struct rib_subscription *fd_rs; /* storing table subscription */ struct fib_dp fd_dp; /* fib datapath data */ struct vnet *fd_vnet; /* vnet fib belongs to */ struct epoch_context fd_epoch_ctx; /* epoch context for deletion */ struct fib_lookup_module *fd_flm;/* pointer to the lookup module */ struct fib_sync_status fd_ss; /* State relevant to the rib sync */ uint32_t fd_num_changes; /* number of changes since last callout */ TAILQ_ENTRY(fib_data) entries; /* list of all fds in vnet */ }; static bool rebuild_fd(struct fib_data *fd, const char *reason); static bool rebuild_fd_flm(struct fib_data *fd, struct fib_lookup_module *flm_new); static void handle_fd_callout(void *_data); static void destroy_fd_instance_epoch(epoch_context_t ctx); 
static bool is_idx_free(struct fib_data *fd, uint32_t index); static void set_algo_fixed(struct rib_head *rh); static bool is_algo_fixed(struct rib_head *rh); static uint32_t fib_ref_nhop(struct fib_data *fd, struct nhop_object *nh); static void fib_unref_nhop(struct fib_data *fd, struct nhop_object *nh); static struct fib_lookup_module *fib_check_best_algo(struct rib_head *rh, struct fib_lookup_module *orig_flm); static void fib_unref_algo(struct fib_lookup_module *flm); static bool flm_error_check(const struct fib_lookup_module *flm, uint32_t fibnum); struct mtx fib_mtx; #define FIB_MOD_LOCK() mtx_lock(&fib_mtx) #define FIB_MOD_UNLOCK() mtx_unlock(&fib_mtx) #define FIB_MOD_LOCK_ASSERT() mtx_assert(&fib_mtx, MA_OWNED) MTX_SYSINIT(fib_mtx, &fib_mtx, "algo list mutex", MTX_DEF); /* Algorithm has to be this percent better than the current to switch */ #define BEST_DIFF_PERCENT (5 * 256 / 100) /* Schedule algo re-evaluation X seconds after a change */ #define ALGO_EVAL_DELAY_MS 30000 /* Force algo re-evaluation after X changes */ #define ALGO_EVAL_NUM_ROUTES 100 /* Try to setup algorithm X times */ #define FIB_MAX_TRIES 32 /* Max amount of supported nexthops */ #define FIB_MAX_NHOPS 262144 #define FIB_CALLOUT_DELAY_MS 50 /* Debug */ static int flm_debug_level = LOG_NOTICE; SYSCTL_INT(_net_route_algo, OID_AUTO, debug_level, CTLFLAG_RW | CTLFLAG_RWTUN, &flm_debug_level, 0, "debuglevel"); #define FLM_MAX_DEBUG_LEVEL LOG_DEBUG #ifndef LOG_DEBUG2 #define LOG_DEBUG2 8 #endif #define _PASS_MSG(_l) (flm_debug_level >= (_l)) #define ALGO_PRINTF(_fmt, ...) printf("[fib_algo] %s: " _fmt "\n", __func__, ##__VA_ARGS__) #define _ALGO_PRINTF(_fib, _fam, _aname, _gen, _func, _fmt, ...) \ printf("[fib_algo] %s.%u (%s#%u) %s: " _fmt "\n",\ print_family(_fam), _fib, _aname, _gen, _func, ## __VA_ARGS__) #define _RH_PRINTF(_fib, _fam, _func, _fmt, ...) \ printf("[fib_algo] %s.%u %s: " _fmt "\n", print_family(_fam), _fib, _func, ## __VA_ARGS__) #define RH_PRINTF(_l, _rh, _fmt, ...) 
if (_PASS_MSG(_l)) { \ _RH_PRINTF(_rh->rib_fibnum, _rh->rib_family, __func__, _fmt, ## __VA_ARGS__);\ } #define FD_PRINTF(_l, _fd, _fmt, ...) FD_PRINTF_##_l(_l, _fd, _fmt, ## __VA_ARGS__) #define _FD_PRINTF(_l, _fd, _fmt, ...) if (_PASS_MSG(_l)) { \ _ALGO_PRINTF(_fd->fd_fibnum, _fd->fd_family, _fd->fd_flm->flm_name, \ _fd->fd_gen, __func__, _fmt, ## __VA_ARGS__); \ } #if FLM_MAX_DEBUG_LEVEL>=LOG_DEBUG2 #define FD_PRINTF_LOG_DEBUG2 _FD_PRINTF #else #define FD_PRINTF_LOG_DEBUG2(_l, _fd, _fmt, ...) #endif #if FLM_MAX_DEBUG_LEVEL>=LOG_DEBUG #define FD_PRINTF_LOG_DEBUG _FD_PRINTF #else #define FD_PRINTF_LOG_DEBUG() #endif #if FLM_MAX_DEBUG_LEVEL>=LOG_INFO #define FD_PRINTF_LOG_INFO _FD_PRINTF #else #define FD_PRINTF_LOG_INFO() #endif #define FD_PRINTF_LOG_NOTICE _FD_PRINTF #define FD_PRINTF_LOG_ERR _FD_PRINTF #define FD_PRINTF_LOG_WARNING _FD_PRINTF /* List of all registered lookup algorithms */ static TAILQ_HEAD(, fib_lookup_module) all_algo_list = TAILQ_HEAD_INITIALIZER(all_algo_list); /* List of all fib lookup instances in the vnet */ VNET_DEFINE_STATIC(TAILQ_HEAD(fib_data_head, fib_data), fib_data_list); #define V_fib_data_list VNET(fib_data_list) /* Datastructure for storing non-transient fib lookup module failures */ struct fib_error { int fe_family; uint32_t fe_fibnum; /* failed rtable */ struct fib_lookup_module *fe_flm; /* failed module */ TAILQ_ENTRY(fib_error) entries;/* list of all errored entries */ }; VNET_DEFINE_STATIC(TAILQ_HEAD(fib_error_head, fib_error), fib_error_list); #define V_fib_error_list VNET(fib_error_list) /* Per-family array of fibnum -> {func, arg} mappings used in datapath */ struct fib_dp_header { struct epoch_context fdh_epoch_ctx; uint32_t fdh_num_tables; struct fib_dp fdh_idx[0]; }; /* * Tries to add new non-transient algorithm error to the list of * errors. * Returns true on success. 
*/ static bool flm_error_add(struct fib_lookup_module *flm, uint32_t fibnum) { struct fib_error *fe; fe = malloc(sizeof(struct fib_error), M_TEMP, M_NOWAIT | M_ZERO); if (fe == NULL) return (false); fe->fe_flm = flm; fe->fe_family = flm->flm_family; fe->fe_fibnum = fibnum; FIB_MOD_LOCK(); /* Avoid duplicates by checking if error already exists first */ if (flm_error_check(flm, fibnum)) { FIB_MOD_UNLOCK(); free(fe, M_TEMP); return (true); } TAILQ_INSERT_HEAD(&V_fib_error_list, fe, entries); FIB_MOD_UNLOCK(); return (true); } /* * True if non-transient error has been registered for @flm in @fibnum. */ static bool flm_error_check(const struct fib_lookup_module *flm, uint32_t fibnum) { const struct fib_error *fe; TAILQ_FOREACH(fe, &V_fib_error_list, entries) { if ((fe->fe_flm == flm) && (fe->fe_fibnum == fibnum)) return (true); } return (false); } /* * Clear all errors of algo specified by @flm. */ static void fib_error_clear_flm(struct fib_lookup_module *flm) { struct fib_error *fe, *fe_tmp; FIB_MOD_LOCK_ASSERT(); TAILQ_FOREACH_SAFE(fe, &V_fib_error_list, entries, fe_tmp) { if (fe->fe_flm == flm) { TAILQ_REMOVE(&V_fib_error_list, fe, entries); free(fe, M_TEMP); } } } /* * Clears all errors in current VNET. */ static void fib_error_clear() { struct fib_error *fe, *fe_tmp; FIB_MOD_LOCK_ASSERT(); TAILQ_FOREACH_SAFE(fe, &V_fib_error_list, entries, fe_tmp) { TAILQ_REMOVE(&V_fib_error_list, fe, entries); free(fe, M_TEMP); } } static const char * print_op_result(enum flm_op_result result) { switch (result) { case FLM_SUCCESS: return "success"; case FLM_REBUILD: return "rebuild"; case FLM_BATCH: return "batch"; case FLM_ERROR: return "error"; } return "unknown"; } static const char * print_family(int family) { if (family == AF_INET) return ("inet"); else if (family == AF_INET6) return ("inet6"); else return ("unknown"); } /* * Debug function used by lookup algorithms. 
* Outputs message denoted by @fmt, prepended by "[fib_algo] inetX.Y (algo) " */ void fib_printf(int level, struct fib_data *fd, const char *func, char *fmt, ...) { char buf[128]; va_list ap; if (level > flm_debug_level) return; va_start(ap, fmt); vsnprintf(buf, sizeof(buf), fmt, ap); va_end(ap); _ALGO_PRINTF(fd->fd_fibnum, fd->fd_family, fd->fd_flm->flm_name, fd->fd_gen, func, "%s", buf); } /* * Outputs list of algorithms supported by the provided address family. */ static int print_algos_sysctl(struct sysctl_req *req, int family) { struct fib_lookup_module *flm; struct sbuf sbuf; int error, count = 0; error = sysctl_wire_old_buffer(req, 0); if (error == 0) { sbuf_new_for_sysctl(&sbuf, NULL, 512, req); TAILQ_FOREACH(flm, &all_algo_list, entries) { if (flm->flm_family == family) { if (count++ > 0) sbuf_cat(&sbuf, ", "); sbuf_cat(&sbuf, flm->flm_name); } } error = sbuf_finish(&sbuf); sbuf_delete(&sbuf); } return (error); } #ifdef INET6 static int print_algos_sysctl_inet6(SYSCTL_HANDLER_ARGS) { return (print_algos_sysctl(req, AF_INET6)); } SYSCTL_PROC(_net_route_algo_inet6, OID_AUTO, algo_list, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, print_algos_sysctl_inet6, "A", "List of IPv6 lookup algorithms"); #endif #ifdef INET static int print_algos_sysctl_inet(SYSCTL_HANDLER_ARGS) { return (print_algos_sysctl(req, AF_INET)); } SYSCTL_PROC(_net_route_algo_inet, OID_AUTO, algo_list, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, print_algos_sysctl_inet, "A", "List of IPv4 lookup algorithms"); #endif /* * Calculate delay between repeated failures. * Returns current delay in milliseconds. 
*/
static uint32_t
callout_calc_delay_ms(struct fib_data *fd)
{
	uint32_t shift;

	/*
	 * Exponential backoff: the delay doubles with every failed rebuild
	 * and is capped at 2^10 * FIB_CALLOUT_DELAY_MS.
	 */
	if (fd->fd_failed_rebuilds > 10)
		shift = 10;
	else
		shift = fd->fd_failed_rebuilds;

	return ((1 << shift) * FIB_CALLOUT_DELAY_MS);
}

/*
 * Records @action as the callout action of @fd and arms fd_callout to
 * call handle_fd_callout(fd) after @delay_ms milliseconds.
 */
static void
schedule_callout(struct fib_data *fd, enum fib_callout_action action, int delay_ms)
{

	FD_PRINTF(LOG_DEBUG, fd, "delay=%d action=%d", delay_ms, action);
	fd->fd_callout_action = action;
	callout_reset_sbt(&fd->fd_callout, SBT_1MS * delay_ms, 0,
	    handle_fd_callout, fd, 0);
}

/*
 * Marks @fd as needing a rebuild and schedules the FDA_REBUILD callout,
 * using the failure-based backoff delay. Does nothing if a rebuild is
 * already marked as needed. @reason is used for logging only.
 * Requires the rib write lock.
 */
static void
schedule_fd_rebuild(struct fib_data *fd, const char *reason)
{

	RIB_WLOCK_ASSERT(fd->fd_rh);

	if (!fd->fd_need_rebuild) {
		fd->fd_need_rebuild = true;
		/* Stop batch updates */
		fd->fd_batch = false;

		/*
		 * Potentially re-schedules pending callout
		 * initiated by schedule_algo_eval.
		 */
		FD_PRINTF(LOG_INFO, fd, "Scheduling rebuild: %s (failures=%d)",
		    reason, fd->fd_failed_rebuilds);
		schedule_callout(fd, FDA_REBUILD, callout_calc_delay_ms(fd));
	}
}

/*
 * Added by this change: copies rnh_gen_rib into rnh_gen on the rib head
 * @fd is attached to, logging both values at LOG_DEBUG level first.
 */
+static void
+sync_rib_gen(struct fib_data *fd)
+{
+	FD_PRINTF(LOG_DEBUG, fd, "Sync gen %u -> %u", fd->fd_rh->rnh_gen, fd->fd_rh->rnh_gen_rib);
+	fd->fd_rh->rnh_gen = fd->fd_rh->rnh_gen_rib;
+}
+
/*
 * Returns @new_tv - @old_tv in milliseconds. The microsecond part is
 * converted with integer division, so sub-millisecond remainders are
 * truncated.
 */
static int64_t
get_tv_diff_ms(const struct timeval *old_tv, const struct timeval *new_tv)
{
	int64_t diff = 0;

	diff = ((int64_t)(new_tv->tv_sec - old_tv->tv_sec)) * 1000;
	diff += (new_tv->tv_usec - old_tv->tv_usec) / 1000;

	return (diff);
}

/*
 * Advances @tv by @ms milliseconds, carrying into tv_sec when the
 * microsecond part reaches one second.
 * NOTE(review): looks like @ms is expected to be non-negative -- confirm.
 */
static void
add_tv_diff_ms(struct timeval *tv, int ms)
{
	tv->tv_sec += ms / 1000;
	ms = ms % 1000;
	if (ms * 1000 + tv->tv_usec < 1000000)
		tv->tv_usec += ms * 1000;
	else {
		/* Microseconds overflowed: carry one second */
		tv->tv_sec += 1;
		tv->tv_usec = ms * 1000 + tv->tv_usec - 1000000;
	}
}

/*
 * Marks the time when algo state diverges from the rib state.
 * Also resets the update-rate bucket accounting.
 */
static void
mark_diverge_time(struct fib_data *fd)
{
	struct fib_sync_status *fd_ss = &fd->fd_ss;

	getmicrouptime(&fd_ss->diverge_time);
	fd_ss->bucket_id = 0;
	fd_ss->bucket_changes = 0;
}

/*
 * Calculates and updates the next algorithm sync time, based on the current activity.
*
 * The intent is to provide reasonable balance between the update
 * latency and efficient batching when changing large amount of routes.
 *
 * High-level algorithm looks the following:
 * 1) all changes are bucketed in 50ms intervals
 * 2) If amount of changes within the bucket is greater than the threshold,
 *   the update gets delayed, up to maximum delay threshold.
 *
 * The bucket width, the threshold rate and the maximum delay are read from
 * the bucket_time_ms, bucket_change_threshold_rate and fib_max_sync_delay_ms
 * sysctl variables. When a new delay is computed, the callout is scheduled
 * with @action.
 */
static void
update_rebuild_delay(struct fib_data *fd, enum fib_callout_action action)
{
	uint32_t bucket_id, new_delay = 0;
	struct timeval tv;

	/* Fetch all variables at once to ensure consistent reads */
	uint32_t bucket_time_ms = V_bucket_time_ms;
	uint32_t threshold_rate = V_bucket_change_threshold_rate;
	uint32_t max_delay_ms = V_fib_max_sync_delay_ms;

	/* bucket_time_ms is used as a divisor below: fall back to 50 if zero */
	if (bucket_time_ms == 0)
		bucket_time_ms = 50;
	/* calculate per-bucket threshold rate */
	threshold_rate = threshold_rate * bucket_time_ms / 1000;

	getmicrouptime(&tv);

	struct fib_sync_status *fd_ss = &fd->fd_ss;

	/* Bucket number of the current change, counted from diverge_time */
	bucket_id = get_tv_diff_ms(&fd_ss->diverge_time, &tv) / bucket_time_ms;

	if (fd_ss->bucket_id == bucket_id) {
		fd_ss->bucket_changes++;
		if (fd_ss->bucket_changes == threshold_rate) {
			/* Busy bucket: push the sync to the end of the next bucket */
			new_delay = (bucket_id + 2) * bucket_time_ms;
			if (new_delay <= max_delay_ms) {
				/*
				 * NOTE(review): the two adjacent literals are
				 * concatenated without a space after the comma.
				 */
				FD_PRINTF(LOG_DEBUG, fd,
				    "hit threshold of %u routes, delay update,"
				    "bucket: %u, total delay: %u",
				    threshold_rate, bucket_id + 1, new_delay);
			} else {
				/* Keep the previously scheduled time */
				new_delay = 0;
				FD_PRINTF(LOG_DEBUG, fd,
				    "maximum sync delay (%u ms) reached", max_delay_ms);
			}
		} else if ((bucket_id == 0) && (fd_ss->bucket_changes == 1))
			new_delay = bucket_time_ms;	/* first change after divergence */
	} else {
		/* Entered a new bucket: restart per-bucket accounting */
		fd_ss->bucket_id = bucket_id;
		fd_ss->bucket_changes = 1;
	}

	if (new_delay > 0) {
		/* Calculated time has been updated */
		struct timeval new_tv = fd_ss->diverge_time;
		/* new_delay is relative to diverge_time; convert to "from now" */
		add_tv_diff_ms(&new_tv, new_delay);
		int32_t delay_ms = get_tv_diff_ms(&tv, &new_tv);
		schedule_callout(fd, action, delay_ms);
	}
}

/*
 * Per-change bookkeeping. If a batch or a rebuild is pending, updates
 * the sync delay; otherwise counts the change and schedules the algo
 * re-evaluation callout (FDA_EVAL) on the first change, or makes it fire
 * in 1 ms once ALGO_EVAL_NUM_ROUTES changes have accumulated.
 * Requires the rib write lock.
 */
static void
update_algo_state(struct fib_data *fd)
{

	RIB_WLOCK_ASSERT(fd->fd_rh);

	if (fd->fd_batch || fd->fd_need_rebuild) {
		enum fib_callout_action action = fd->fd_need_rebuild ? FDA_REBUILD : FDA_BATCH;
		update_rebuild_delay(fd, action);
		return;
	}

	if (fd->fd_num_changes++ == 0) {
		/* Start callout to consider switch */
		if (!callout_pending(&fd->fd_callout))
			schedule_callout(fd, FDA_EVAL, ALGO_EVAL_DELAY_MS);
	} else if (fd->fd_num_changes == ALGO_EVAL_NUM_ROUTES) {
		/* Reset callout to exec immediately */
		if (fd->fd_callout_action == FDA_EVAL)
			schedule_callout(fd, FDA_EVAL, 1);
	}
}

/*
 * Returns true if the change described by @rc has to be applied to the
 * algo right away: addition or deletion of a route whose nexthop is not
 * a nexthop group and does not have NHF_GATEWAY set. Any other command
 * returns false.
 */
static bool
need_immediate_sync(struct fib_data *fd, struct rib_cmd_info *rc)
{
	struct nhop_object *nh;

	/* Sync addition/removal of interface routes */
	switch (rc->rc_cmd) {
	case RTM_ADD:
		nh = rc->rc_nh_new;
		if (!NH_IS_NHGRP(nh) && (!(nh->nh_flags & NHF_GATEWAY)))
			return (true);
		break;
	case RTM_DELETE:
		nh = rc->rc_nh_old;
		if (!NH_IS_NHGRP(nh) && (!(nh->nh_flags & NHF_GATEWAY)))
			return (true);
		break;
	}

	return (false);
}

/*
 * Submits the queued changes to the algo batch callback.
 * On success, releases the references on the old nexthops of the queued
 * entries and empties the queue. Batch mode is cleared in either case.
 * Returns true on success.
 */
static bool
apply_rtable_changes(struct fib_data *fd)
{
	enum flm_op_result result;
	struct fib_change_queue *q = &fd->fd_ss.fd_change_queue;

	result = fd->fd_flm->flm_change_rib_items_cb(fd->fd_rh, q, fd->fd_algo_data);

	if (result == FLM_SUCCESS) {
+		sync_rib_gen(fd);
		for (int i = 0; i < q->count; i++)
			if (q->entries[i].nh_old)
				fib_unref_nhop(fd, q->entries[i].nh_old);
		q->count = 0;
	}
	fd->fd_batch = false;

	return (result == FLM_SUCCESS);
}

/*
 * Fills change entry @ce from @rc: prefix, prefix length and scope of the
 * route, plus the old and new nexthops. References the new nexthop, if any.
 * Returns false if that reference could not be obtained.
 */
static bool
fill_change_entry(struct fib_data *fd, struct fib_change_entry *ce, struct rib_cmd_info *rc)
{
	int plen = 0;

	switch (fd->fd_family) {
	case AF_INET:
		rt_get_inet_prefix_plen(rc->rc_rt, &ce->addr4, &plen, &ce->scopeid);
		break;
	case AF_INET6:
		rt_get_inet6_prefix_plen(rc->rc_rt, &ce->addr6, &plen, &ce->scopeid);
		break;
	}

	ce->plen = plen;
	ce->nh_old = rc->rc_nh_old;
	ce->nh_new = rc->rc_nh_new;
	if (ce->nh_new != NULL) {
		/* fib_ref_nhop() returning 0 is treated as failure */
		if (fib_ref_nhop(fd, ce->nh_new) == 0)
			return (false);
	}

	return (true);
}

/*
 * Appends the change described by @rc to the batch queue of @fd, growing
 * the queue when it is full (256 entries initially, doubled afterwards).
 * Returns false if the queue cannot be grown or the entry cannot be filled.
 * Note that the queue count is advanced even when filling the entry fails.
 */
static bool
queue_rtable_change(struct fib_data *fd, struct rib_cmd_info *rc)
{
	struct fib_change_queue *q = &fd->fd_ss.fd_change_queue;

	if (q->count >= q->size) {
		uint32_t q_size;

		if (q->size == 0)
			q_size = 256; /* ~18k memory */
		else
			q_size = q->size * 2;

		size_t size = q_size * sizeof(struct fib_change_entry);
		void *a = realloc(q->entries, size, M_TEMP, M_NOWAIT | M_ZERO);
		if (a == NULL) {
			/* The existing queue is left untouched */
			FD_PRINTF(LOG_INFO, fd, "Unable to realloc queue for %u elements",
			    q_size);
			return (false);
		}
		q->entries = a;
		q->size = q_size;
	}

	return (fill_change_entry(fd, &q->entries[q->count++], rc));
}

/*
 * Rib subscription handler. Checks if the algorithm is ready to
 * receive updates, handles nexthop refcounting and passes change
 * data to the algorithm callback.
 */
static void
handle_rtable_change_cb(struct rib_head *rnh, struct rib_cmd_info *rc,
    void *_data)
{
	struct fib_data *fd = (struct fib_data *)_data;
	enum flm_op_result result;

	RIB_WLOCK_ASSERT(rnh);

	/*
	 * There is a small gap between subscribing for route changes
	 *  and initiating rtable dump. Avoid receiving route changes
	 *  prior to finishing rtable dump by checking `init_done`.
	 */
	if (!fd->init_done)
		return;

	bool immediate_sync = need_immediate_sync(fd, rc);

	/* Consider scheduling algorithm re-evaluation */
	update_algo_state(fd);

	/*
	 * If algo requested rebuild, stop sending updates by default.
	 * This simplifies nexthop refcount handling logic.
	 */
	if (fd->fd_need_rebuild) {
		if (immediate_sync)
			rebuild_fd(fd, "rtable change type enforced sync");
		return;
	}

	/*
	 * Algo requested updates to be delivered in batches.
	 * Add the current change to the queue and return.
	 */
	if (fd->fd_batch) {
		if (immediate_sync) {
			if (!queue_rtable_change(fd, rc) || !apply_rtable_changes(fd))
				rebuild_fd(fd, "batch sync failed");
		} else {
			if (!queue_rtable_change(fd, rc))
				schedule_fd_rebuild(fd, "batch queue failed");
		}
		return;
	}

	/*
	 * Maintain guarantee that every nexthop returned by the dataplane
	 *  lookup has > 0 refcount, so can be safely referenced within current
	 *  epoch.
	 */
	if (rc->rc_nh_new != NULL) {
		if (fib_ref_nhop(fd, rc->rc_nh_new) == 0) {
			/* ran out of indexes */
			schedule_fd_rebuild(fd, "ran out of nhop indexes");
			return;
		}
	}

	result = fd->fd_flm->flm_change_rib_item_cb(rnh, rc, fd->fd_algo_data);

	switch (result) {
	case FLM_SUCCESS:
+		sync_rib_gen(fd);
		/* Unref old nexthop on success */
		if (rc->rc_nh_old != NULL)
			fib_unref_nhop(fd, rc->rc_nh_old);
		break;
	case FLM_BATCH:

		/*
		 * Algo asks to batch the changes.
		 */
		if (queue_rtable_change(fd, rc)) {
			if (!immediate_sync) {
				fd->fd_batch = true;
				mark_diverge_time(fd);
				update_rebuild_delay(fd, FDA_BATCH);
				break;
			}
			if (apply_rtable_changes(fd))
				break;
		}
		FD_PRINTF(LOG_ERR, fd, "batched sync failed, force the rebuild");
		/* FALLTHROUGH */
	case FLM_REBUILD:

		/*
		 * Algo is not able to apply the update.
		 * Schedule algo rebuild.
		 */
		if (!immediate_sync) {
			mark_diverge_time(fd);
			schedule_fd_rebuild(fd, "algo requested rebuild");
			break;
		}

		FD_PRINTF(LOG_INFO, fd, "running sync rebuild");
		rebuild_fd(fd, "rtable change type enforced sync");
		break;
	case FLM_ERROR:

		/*
		 * Algo reported a non-recoverable error.
		 * Record the error and schedule rebuild, which will
		 *  trigger best algo selection.
		 */
		FD_PRINTF(LOG_ERR, fd, "algo reported non-recoverable error");
		if (!flm_error_add(fd->fd_flm, fd->fd_fibnum))
			FD_PRINTF(LOG_ERR, fd, "failed to ban algo");
		schedule_fd_rebuild(fd, "algo reported non-recoverable error");
	}
}

/*
 * Picks the nexthop table size for the new instance @fd: 16 when there is
 * no previous instance, twice the previous size if @old_fd ran out of
 * nexthop indexes while below FIB_MAX_NHOPS, the previous size otherwise.
 */
static void
estimate_nhop_scale(const struct fib_data *old_fd, struct fib_data *fd)
{

	if (old_fd == NULL) {
		// TODO: read from rtable
		fd->number_nhops = 16;
		return;
	}

	if (old_fd->hit_nhops && old_fd->number_nhops < FIB_MAX_NHOPS)
		fd->number_nhops = 2 * old_fd->number_nhops;
	else
		fd->number_nhops = old_fd->number_nhops;
}

/* State shared between the rib walk callbacks during the initial dump */
struct walk_cbdata {
	struct fib_data		*fd;	/* instance being synchronised */
	flm_dump_t		*func;	/* per-route algo dump callback */
	enum flm_op_result	result;	/* running result of the dump */
};

/*
 * Handler called after all rtenties have been dumped.
 * Performs post-dump framework checks and calls
 * algo:flm_dump_end_cb().
 *
 * Updates walk_cbdata result.
*/ static void sync_algo_end_cb(struct rib_head *rnh, enum rib_walk_hook stage, void *_data) { struct walk_cbdata *w = (struct walk_cbdata *)_data; struct fib_data *fd = w->fd; RIB_WLOCK_ASSERT(w->fd->fd_rh); if (rnh->rib_dying) { w->result = FLM_ERROR; return; } if (fd->hit_nhops) { FD_PRINTF(LOG_INFO, fd, "ran out of nexthops at %u nhops", fd->nh_ref_table->count); if (w->result == FLM_SUCCESS) w->result = FLM_REBUILD; return; } if (stage != RIB_WALK_HOOK_POST || w->result != FLM_SUCCESS) return; /* Post-dump hook, dump successful */ w->result = fd->fd_flm->flm_dump_end_cb(fd->fd_algo_data, &fd->fd_dp); if (w->result == FLM_SUCCESS) { /* Mark init as done to allow routing updates */ fd->init_done = 1; } } /* * Callback for each entry in rib. * Calls algo:flm_dump_rib_item_cb func as a part of initial * route table synchronisation. */ static int sync_algo_cb(struct rtentry *rt, void *_data) { struct walk_cbdata *w = (struct walk_cbdata *)_data; RIB_WLOCK_ASSERT(w->fd->fd_rh); if (w->result == FLM_SUCCESS && w->func) { /* * Reference nexthops to maintain guarantee that * each nexthop returned by datapath has > 0 references * and can be safely referenced within current epoch. */ struct nhop_object *nh = rt_get_raw_nhop(rt); if (fib_ref_nhop(w->fd, nh) != 0) w->result = w->func(rt, w->fd->fd_algo_data); else w->result = FLM_REBUILD; } return (0); } /* * Dump all routing table state to the algo instance. */ static enum flm_op_result sync_algo(struct fib_data *fd) { struct walk_cbdata w = { .fd = fd, .func = fd->fd_flm->flm_dump_rib_item_cb, .result = FLM_SUCCESS, }; rib_walk_ext_locked(fd->fd_rh, sync_algo_cb, sync_algo_end_cb, &w); FD_PRINTF(LOG_INFO, fd, "initial dump completed (rtable version: %d), result: %s", fd->fd_rh->rnh_gen, print_op_result(w.result)); return (w.result); } /* * Schedules epoch-backed @fd instance deletion. * * Unlinks @fd from the list of active algo instances. * * Removes rib subscription. * * Stops callout. * * Schedules actual deletion. 
*
 * Assume @fd is already unlinked from the datapath.
 *
 * Calling it again for an instance already marked dead is a no-op.
 * Requires the net epoch and the rib write lock. Always returns 0.
 * NOTE(review): @in_callout is not referenced in the body.
 */
static int
schedule_destroy_fd_instance(struct fib_data *fd, bool in_callout)
{
	bool is_dead;

	NET_EPOCH_ASSERT();
	RIB_WLOCK_ASSERT(fd->fd_rh);

	/* Mark dead and unlink under the module lock */
	FIB_MOD_LOCK();
	is_dead = fd->fd_dead;
	if (!is_dead)
		fd->fd_dead = true;
	if (fd->fd_linked) {
		TAILQ_REMOVE(&V_fib_data_list, fd, entries);
		fd->fd_linked = false;
	}
	FIB_MOD_UNLOCK();
	/* Destruction has already been scheduled by an earlier call */
	if (is_dead)
		return (0);

	FD_PRINTF(LOG_INFO, fd, "DETACH");

	if (fd->fd_rs != NULL)
		rib_unsibscribe_locked(fd->fd_rs);

	/*
	 * After rib_unsubscribe() no _new_ handle_rtable_change_cb() calls
	 * will be executed, hence no _new_ callout schedules will happen.
	 */
	callout_stop(&fd->fd_callout);

	/* Actual freeing happens in destroy_fd_instance_epoch() */
	fib_epoch_call(destroy_fd_instance_epoch, &fd->fd_epoch_ctx);

	return (0);
}

/*
 * Wipe all fd instances from the list matching rib specified by @rh.
 * If @keep_first is set, remove all but the first record.
 * The rib write lock is taken around each destruction unless
 * @in_callout is set.
 */
static void
fib_cleanup_algo(struct rib_head *rh, bool keep_first, bool in_callout)
{
	struct fib_data_head tmp_head = TAILQ_HEAD_INITIALIZER(tmp_head);
	struct fib_data *fd, *fd_tmp;
	struct epoch_tracker et;

	/* Pass 1: move matching entries to a private list under the lock */
	FIB_MOD_LOCK();
	TAILQ_FOREACH_SAFE(fd, &V_fib_data_list, entries, fd_tmp) {
		if (fd->fd_rh == rh) {
			if (keep_first) {
				keep_first = false;
				continue;
			}
			TAILQ_REMOVE(&V_fib_data_list, fd, entries);
			fd->fd_linked = false;
			TAILQ_INSERT_TAIL(&tmp_head, fd, entries);
		}
	}
	FIB_MOD_UNLOCK();

	/* Pass 2: remove each entry */
	NET_EPOCH_ENTER(et);
	TAILQ_FOREACH_SAFE(fd, &tmp_head, entries, fd_tmp) {
		if (!in_callout)
			RIB_WLOCK(fd->fd_rh);
		schedule_destroy_fd_instance(fd, in_callout);
		if (!in_callout)
			RIB_WUNLOCK(fd->fd_rh);
	}
	NET_EPOCH_EXIT(et);
}

/*
 * Destroys all algo instances attached to the rib @rh.
 */
void
fib_destroy_rib(struct rib_head *rh)
{

	/*
	 * rnh has `is_dying` flag set, so setup of new fd's will fail at
	 *  sync_algo() stage, preventing new entries to be added to the list
	 *  of active algos. Remove all existing entries for the particular rib.
	 */
	fib_cleanup_algo(rh, false, false);
}

/*
 * Finalises fd destruction by freeing all fd resources.
*/
static void
destroy_fd_instance(struct fib_data *fd)
{

	FD_PRINTF(LOG_INFO, fd, "destroy fd %p", fd);

	/* Call destroy callback first */
	if (fd->fd_algo_data != NULL)
		fd->fd_flm->flm_destroy_cb(fd->fd_algo_data);

	/*
	 * Nhop table.
	 * Releasing the referenced nexthops needs both tables to be present.
	 */
	if ((fd->nh_idx != NULL) && (fd->nh_ref_table != NULL)) {
		for (int i = 0; i < fd->number_nhops; i++) {
			if (!is_idx_free(fd, i)) {
				FD_PRINTF(LOG_DEBUG2, fd, " FREE nhop %d %p",
				    i, fd->nh_idx[i]);
				nhop_free_any(fd->nh_idx[i]);
			}
		}
	}

	/*
	 * Free each table on its own: try_setup_fd_instance() may fail after
	 * allocating nh_idx but before allocating nh_ref_table, and nh_idx
	 * must not be leaked in that case. free(9) accepts NULL.
	 */
	free(fd->nh_idx, M_RTABLE);
	free(fd->nh_ref_table, M_RTABLE);
	free(fd->fd_ss.fd_change_queue.entries, M_TEMP);

	/* Drop the algo module reference taken at instance setup */
	fib_unref_algo(fd->fd_flm);

	free(fd, M_RTABLE);
}

/*
 * Epoch callback indicating fd is safe to destroy
 */
static void
destroy_fd_instance_epoch(epoch_context_t ctx)
{
	struct fib_data *fd;

	/* @ctx is the fd_epoch_ctx member embedded in the instance */
	fd = __containerof(ctx, struct fib_data, fd_epoch_ctx);

	destroy_fd_instance(fd);
}

/*
 * Tries to setup fd instance.
 * - Allocates fd/nhop table
 * - Runs algo:flm_init_cb algo init
 * - Subscribes fd to the rib
 * - Runs rtable dump
 * - Adds instance to the list of active instances.
 *
 * Returns: operation result. Fills in @pfd with resulting fd on success.
*
 * On every failure path after the fd allocation, @pfd still points to the
 * partially set up instance; it is not freed here. Only when the fd
 * allocation itself fails is @pfd set to NULL.
 */
static enum flm_op_result
try_setup_fd_instance(struct fib_lookup_module *flm, struct rib_head *rh,
    struct fib_data *old_fd, struct fib_data **pfd)
{
	struct fib_data *fd;
	size_t size;
	enum flm_op_result result;

	/* Allocate */
	fd = malloc(sizeof(struct fib_data), M_RTABLE, M_NOWAIT | M_ZERO);
	if (fd == NULL) {
		*pfd = NULL;
		RH_PRINTF(LOG_INFO, rh, "Unable to allocate fib_data structure");
		return (FLM_REBUILD);
	}
	*pfd = fd;

	/* Size the nexthop tables based on the previous instance, if any */
	estimate_nhop_scale(old_fd, fd);

	fd->fd_rh = rh;
	fd->fd_gen = ++fib_gen;
	fd->fd_family = rh->rib_family;
	fd->fd_fibnum = rh->rib_fibnum;
	/* The callout is associated with the rib lock */
	callout_init_rm(&fd->fd_callout, &rh->rib_lock, 0);
	fd->fd_vnet = curvnet;
	fd->fd_flm = flm;

	FD_PRINTF(LOG_DEBUG, fd, "allocated fd %p", fd);

	/* The instance holds a reference on the algo module */
	FIB_MOD_LOCK();
	flm->flm_refcount++;
	FIB_MOD_UNLOCK();

	/* Allocate nhidx -> nhop_ptr table */
	size = fd->number_nhops * sizeof(void *);
	fd->nh_idx = malloc(size, M_RTABLE, M_NOWAIT | M_ZERO);
	if (fd->nh_idx == NULL) {
		FD_PRINTF(LOG_INFO, fd, "Unable to allocate nhop table idx (sz:%zu)", size);
		return (FLM_REBUILD);
	}

	/* Allocate nhop index refcount table */
	size = sizeof(struct nhop_ref_table);
	size += fd->number_nhops * sizeof(uint32_t);
	fd->nh_ref_table = malloc(size, M_RTABLE, M_NOWAIT | M_ZERO);
	if (fd->nh_ref_table == NULL) {
		FD_PRINTF(LOG_INFO, fd, "Unable to allocate nhop refcount table (sz:%zu)", size);
		return (FLM_REBUILD);
	}
	FD_PRINTF(LOG_DEBUG, fd, "Allocated %u nhop indexes", fd->number_nhops);

	/* Okay, we're ready for algo init */
	void *old_algo_data = (old_fd != NULL) ? old_fd->fd_algo_data : NULL;
	result = flm->flm_init_cb(fd->fd_fibnum, fd, old_algo_data, &fd->fd_algo_data);
	if (result != FLM_SUCCESS) {
		FD_PRINTF(LOG_INFO, fd, "%s algo init failed", flm->flm_name);
		return (result);
	}

	/* Try to subscribe */
	if (flm->flm_change_rib_item_cb != NULL) {
		fd->fd_rs = rib_subscribe_locked(fd->fd_rh,
		    handle_rtable_change_cb, fd, RIB_NOTIFY_IMMEDIATE);
		if (fd->fd_rs == NULL) {
			FD_PRINTF(LOG_INFO, fd, "failed to subscribe to the rib changes");
			return (FLM_REBUILD);
		}
	}

	/* Dump */
	result = sync_algo(fd);
	if (result != FLM_SUCCESS) {
		FD_PRINTF(LOG_INFO, fd, "rib sync failed");
		return (result);
	}
	FD_PRINTF(LOG_INFO, fd, "DUMP completed successfully.");

	FIB_MOD_LOCK();
	/*
	 * Insert fd in the beginning of a list, to maintain invariant
	 *  that first matching entry for the AF/fib is always the active
	 *  one.
	 */
	TAILQ_INSERT_HEAD(&V_fib_data_list, fd, entries);
	fd->fd_linked = true;
	FIB_MOD_UNLOCK();

	return (FLM_SUCCESS);
}

/*
 * Sets up algo @flm for table @rh and links it to the datapath.
*
 * Retries the setup up to FIB_MAX_TRIES times for as long as the attempt
 * asks for a rebuild, passing the previous attempt as the base for the
 * next one. On failure, bumps the failure counter of @orig_fd, bans the
 * algo for this fib on FLM_ERROR and destroys the instances created here.
 * @pfd receives the new instance on success and NULL on failure.
 * Requires the net epoch and the rib write lock.
 */
static enum flm_op_result
setup_fd_instance(struct fib_lookup_module *flm, struct rib_head *rh,
    struct fib_data *orig_fd, struct fib_data **pfd, bool attach)
{
	struct fib_data *prev_fd, *new_fd;
	enum flm_op_result result;

	NET_EPOCH_ASSERT();
	RIB_WLOCK_ASSERT(rh);

	prev_fd = orig_fd;
	new_fd = NULL;
	for (int i = 0; i < FIB_MAX_TRIES; i++) {
		result = try_setup_fd_instance(flm, rh, prev_fd, &new_fd);

		/* Link the new instance to the datapath if requested */
		if ((result == FLM_SUCCESS) && attach) {
-			if (!fib_set_datapath_ptr(new_fd, &new_fd->fd_dp))
+			if (fib_set_datapath_ptr(new_fd, &new_fd->fd_dp))
+				sync_rib_gen(new_fd);
+			else
				result = FLM_REBUILD;
		}

		/* Intermediate attempts are disposable; @orig_fd is not ours */
		if ((prev_fd != NULL) && (prev_fd != orig_fd)) {
			schedule_destroy_fd_instance(prev_fd, false);
			prev_fd = NULL;
		}

		RH_PRINTF(LOG_INFO, rh, "try %d: fib algo result: %s", i,
		    print_op_result(result));

		if (result == FLM_REBUILD) {
			/* Retry, using the failed attempt as the base */
			prev_fd = new_fd;
			new_fd = NULL;
			continue;
		}

		break;
	}

	if (result != FLM_SUCCESS) {
		RH_PRINTF(LOG_WARNING, rh,
		    "%s algo instance setup failed, failures=%d", flm->flm_name,
		    orig_fd ? orig_fd->fd_failed_rebuilds + 1 : 0);
		/* update failure count */
		FIB_MOD_LOCK();
		if (orig_fd != NULL)
			orig_fd->fd_failed_rebuilds++;
		FIB_MOD_UNLOCK();

		/* Ban algo on non-recoverable error */
		if (result == FLM_ERROR)
			flm_error_add(flm, rh->rib_fibnum);

		/* Dispose of whatever was created by this call */
		if ((prev_fd != NULL) && (prev_fd != orig_fd))
			schedule_destroy_fd_instance(prev_fd, false);
		if (new_fd != NULL) {
			schedule_destroy_fd_instance(new_fd, false);
			new_fd = NULL;
		}
	}

	*pfd = new_fd;
	return (result);
}

/*
 * Tries to sync algo with the current rtable state, either
 * by executing batch update or rebuilding.
 * Returns true on success.
*/ static bool execute_callout_action(struct fib_data *fd) { enum fib_callout_action action = fd->fd_callout_action; struct fib_lookup_module *flm_new = NULL; bool result = true; NET_EPOCH_ASSERT(); RIB_WLOCK_ASSERT(fd->fd_rh); fd->fd_need_rebuild = false; fd->fd_batch = false; fd->fd_num_changes = 0; /* First, check if we're still OK to use this algo */ if (!is_algo_fixed(fd->fd_rh)) flm_new = fib_check_best_algo(fd->fd_rh, fd->fd_flm); if (flm_new != NULL) action = FDA_REBUILD; if (action == FDA_BATCH) { /* Try to sync */ if (!apply_rtable_changes(fd)) action = FDA_REBUILD; } if (action == FDA_REBUILD) result = rebuild_fd_flm(fd, flm_new != NULL ? flm_new : fd->fd_flm); if (flm_new != NULL) fib_unref_algo(flm_new); return (result); } /* * Callout for all scheduled fd-related work. * - Checks if the current algo is still the best algo * - Synchronises algo instance to the rtable (batch usecase) * - Creates a new instance of an algo for af/fib if desired. */ static void handle_fd_callout(void *_data) { struct fib_data *fd = (struct fib_data *)_data; struct epoch_tracker et; FD_PRINTF(LOG_INFO, fd, "running callout type=%d", fd->fd_callout_action); NET_EPOCH_ENTER(et); CURVNET_SET(fd->fd_vnet); execute_callout_action(fd); CURVNET_RESTORE(); NET_EPOCH_EXIT(et); } /* * Tries to create new algo instance based on @fd data. * Returns true on success. 
*/ static bool rebuild_fd_flm(struct fib_data *fd, struct fib_lookup_module *flm_new) { struct fib_data *fd_new, *fd_tmp = NULL; bool result; if (flm_new == fd->fd_flm) fd_tmp = fd; else FD_PRINTF(LOG_NOTICE, fd, "switching algo to %s", flm_new->flm_name); result = setup_fd_instance(flm_new, fd->fd_rh, fd_tmp, &fd_new, true); if (result != FLM_SUCCESS) { FD_PRINTF(LOG_NOTICE, fd, "table rebuild failed"); return (false); } FD_PRINTF(LOG_INFO, fd_new, "switched to new instance"); /* Remove old instance */ schedule_destroy_fd_instance(fd, true); return (true); } static bool rebuild_fd(struct fib_data *fd, const char *reason) { struct fib_lookup_module *flm_new = NULL; bool result; if (!is_algo_fixed(fd->fd_rh)) flm_new = fib_check_best_algo(fd->fd_rh, fd->fd_flm); FD_PRINTF(LOG_INFO, fd, "running sync rebuild: %s", reason); result = rebuild_fd_flm(fd, flm_new != NULL ? flm_new : fd->fd_flm); if (flm_new != NULL) fib_unref_algo(flm_new); if (!result) { FD_PRINTF(LOG_ERR, fd, "sync rebuild failed"); schedule_fd_rebuild(fd, "sync rebuild failed"); } return (result); } /* * Finds algo by name/family. * Returns referenced algo or NULL. 
*/ static struct fib_lookup_module * fib_find_algo(const char *algo_name, int family) { struct fib_lookup_module *flm; FIB_MOD_LOCK(); TAILQ_FOREACH(flm, &all_algo_list, entries) { if ((strcmp(flm->flm_name, algo_name) == 0) && (family == flm->flm_family)) { flm->flm_refcount++; FIB_MOD_UNLOCK(); return (flm); } } FIB_MOD_UNLOCK(); return (NULL); } static void fib_unref_algo(struct fib_lookup_module *flm) { FIB_MOD_LOCK(); flm->flm_refcount--; FIB_MOD_UNLOCK(); } static int set_fib_algo(uint32_t fibnum, int family, struct sysctl_oid *oidp, struct sysctl_req *req) { struct fib_lookup_module *flm = NULL; struct fib_data *fd = NULL; char old_algo_name[32], algo_name[32]; struct rib_head *rh = NULL; enum flm_op_result result; struct epoch_tracker et; int error; /* Fetch current algo/rib for af/family */ FIB_MOD_LOCK(); TAILQ_FOREACH(fd, &V_fib_data_list, entries) { if ((fd->fd_family == family) && (fd->fd_fibnum == fibnum)) break; } if (fd == NULL) { FIB_MOD_UNLOCK(); return (ENOENT); } rh = fd->fd_rh; strlcpy(old_algo_name, fd->fd_flm->flm_name, sizeof(old_algo_name)); FIB_MOD_UNLOCK(); strlcpy(algo_name, old_algo_name, sizeof(algo_name)); error = sysctl_handle_string(oidp, algo_name, sizeof(algo_name), req); if (error != 0 || req->newptr == NULL) return (error); if (strcmp(algo_name, old_algo_name) == 0) return (0); /* New algorithm name is different */ flm = fib_find_algo(algo_name, family); if (flm == NULL) { RH_PRINTF(LOG_INFO, rh, "unable to find algo %s", algo_name); return (ESRCH); } fd = NULL; NET_EPOCH_ENTER(et); RIB_WLOCK(rh); result = setup_fd_instance(flm, rh, NULL, &fd, true); RIB_WUNLOCK(rh); NET_EPOCH_EXIT(et); fib_unref_algo(flm); if (result != FLM_SUCCESS) return (EINVAL); /* Disable automated jumping between algos */ FIB_MOD_LOCK(); set_algo_fixed(rh); FIB_MOD_UNLOCK(); /* Remove old instance(s) */ fib_cleanup_algo(rh, true, false); /* Drain cb so user can unload the module after userret if so desired */ epoch_drain_callbacks(net_epoch_preempt); 
return (0); } #ifdef INET static int set_algo_inet_sysctl_handler(SYSCTL_HANDLER_ARGS) { return (set_fib_algo(curthread->td_proc->p_fibnum, AF_INET, oidp, req)); } SYSCTL_PROC(_net_route_algo_inet, OID_AUTO, algo, CTLFLAG_VNET | CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0, set_algo_inet_sysctl_handler, "A", "Set IPv4 lookup algo"); #endif #ifdef INET6 static int set_algo_inet6_sysctl_handler(SYSCTL_HANDLER_ARGS) { return (set_fib_algo(curthread->td_proc->p_fibnum, AF_INET6, oidp, req)); } SYSCTL_PROC(_net_route_algo_inet6, OID_AUTO, algo, CTLFLAG_VNET | CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0, set_algo_inet6_sysctl_handler, "A", "Set IPv6 lookup algo"); #endif static void destroy_fdh_epoch(epoch_context_t ctx) { struct fib_dp_header *fdh; fdh = __containerof(ctx, struct fib_dp_header, fdh_epoch_ctx); free(fdh, M_RTABLE); } static struct fib_dp_header * alloc_fib_dp_array(uint32_t num_tables, bool waitok) { size_t sz; struct fib_dp_header *fdh; sz = sizeof(struct fib_dp_header); sz += sizeof(struct fib_dp) * num_tables; fdh = malloc(sz, M_RTABLE, (waitok ? M_WAITOK : M_NOWAIT) | M_ZERO); if (fdh != NULL) fdh->fdh_num_tables = num_tables; return (fdh); } static struct fib_dp_header * get_fib_dp_header(struct fib_dp *dp) { return (__containerof((void *)dp, struct fib_dp_header, fdh_idx)); } /* * Replace per-family index pool @pdp with a new one which * contains updated callback/algo data from @fd. * Returns true on success. */ static bool replace_rtables_family(struct fib_dp **pdp, struct fib_data *fd, struct fib_dp *dp) { struct fib_dp_header *new_fdh, *old_fdh; NET_EPOCH_ASSERT(); FD_PRINTF(LOG_DEBUG, fd, "[vnet %p] replace with f:%p arg:%p", curvnet, dp->f, dp->arg); FIB_MOD_LOCK(); old_fdh = get_fib_dp_header(*pdp); if (old_fdh->fdh_idx[fd->fd_fibnum].f == dp->f) { /* * Function is the same, data pointer needs update. * Perform in-line replace without reallocation. 
*/ old_fdh->fdh_idx[fd->fd_fibnum].arg = dp->arg; FD_PRINTF(LOG_DEBUG, fd, "FDH %p inline update", old_fdh); FIB_MOD_UNLOCK(); return (true); } new_fdh = alloc_fib_dp_array(old_fdh->fdh_num_tables, false); FD_PRINTF(LOG_DEBUG, fd, "OLD FDH: %p NEW FDH: %p", old_fdh, new_fdh); if (new_fdh == NULL) { FIB_MOD_UNLOCK(); FD_PRINTF(LOG_WARNING, fd, "error attaching datapath"); return (false); } memcpy(&new_fdh->fdh_idx[0], &old_fdh->fdh_idx[0], old_fdh->fdh_num_tables * sizeof(struct fib_dp)); /* Update relevant data structure for @fd */ new_fdh->fdh_idx[fd->fd_fibnum] = *dp; /* Ensure memcpy() writes have completed */ atomic_thread_fence_rel(); /* Set new datapath pointer */ *pdp = &new_fdh->fdh_idx[0]; FIB_MOD_UNLOCK(); FD_PRINTF(LOG_DEBUG, fd, "update %p -> %p", old_fdh, new_fdh); fib_epoch_call(destroy_fdh_epoch, &old_fdh->fdh_epoch_ctx); return (true); } static struct fib_dp ** get_family_dp_ptr(int family) { switch (family) { case AF_INET: return (&V_inet_dp); case AF_INET6: return (&V_inet6_dp); } return (NULL); } /* * Make datapath use fib instance @fd */ bool fib_set_datapath_ptr(struct fib_data *fd, struct fib_dp *dp) { struct fib_dp **pdp; pdp = get_family_dp_ptr(fd->fd_family); return (replace_rtables_family(pdp, fd, dp)); } /* * Grow datapath pointers array. * Called from sysctl handler on growing number of routing tables. */ static void grow_rtables_family(struct fib_dp **pdp, uint32_t new_num_tables) { struct fib_dp_header *new_fdh, *old_fdh = NULL; new_fdh = alloc_fib_dp_array(new_num_tables, true); FIB_MOD_LOCK(); if (*pdp != NULL) { old_fdh = get_fib_dp_header(*pdp); memcpy(&new_fdh->fdh_idx[0], &old_fdh->fdh_idx[0], old_fdh->fdh_num_tables * sizeof(struct fib_dp)); } /* Wait till all writes completed */ atomic_thread_fence_rel(); *pdp = &new_fdh->fdh_idx[0]; FIB_MOD_UNLOCK(); if (old_fdh != NULL) fib_epoch_call(destroy_fdh_epoch, &old_fdh->fdh_epoch_ctx); } /* * Grows per-AF arrays of datapath pointers for each supported family. 
* Called from fibs resize sysctl handler. */ void fib_grow_rtables(uint32_t new_num_tables) { #ifdef INET grow_rtables_family(get_family_dp_ptr(AF_INET), new_num_tables); #endif #ifdef INET6 grow_rtables_family(get_family_dp_ptr(AF_INET6), new_num_tables); #endif } void fib_get_rtable_info(struct rib_head *rh, struct rib_rtable_info *rinfo) { bzero(rinfo, sizeof(struct rib_rtable_info)); rinfo->num_prefixes = rh->rnh_prefixes; rinfo->num_nhops = nhops_get_count(rh); #ifdef ROUTE_MPATH rinfo->num_nhgrp = nhgrp_get_count(rh); #endif } /* * Updates pointer to the algo data for the @fd. */ void fib_set_algo_ptr(struct fib_data *fd, void *algo_data) { RIB_WLOCK_ASSERT(fd->fd_rh); fd->fd_algo_data = algo_data; } /* * Calls @callback with @ctx after the end of a current epoch. */ void fib_epoch_call(epoch_callback_t callback, epoch_context_t ctx) { epoch_call(net_epoch_preempt, callback, ctx); } /* * Accessor to get rib instance @fd is attached to. */ struct rib_head * fib_get_rh(struct fib_data *fd) { return (fd->fd_rh); } /* * Accessor to export idx->nhop array */ struct nhop_object ** fib_get_nhop_array(struct fib_data *fd) { return (fd->nh_idx); } static uint32_t get_nhop_idx(struct nhop_object *nh) { #ifdef ROUTE_MPATH if (NH_IS_NHGRP(nh)) return (nhgrp_get_idx((struct nhgrp_object *)nh) * 2 - 1); else return (nhop_get_idx(nh) * 2); #else return (nhop_get_idx(nh)); #endif } uint32_t fib_get_nhop_idx(struct fib_data *fd, struct nhop_object *nh) { return (get_nhop_idx(nh)); } static bool is_idx_free(struct fib_data *fd, uint32_t index) { return (fd->nh_ref_table->refcnt[index] == 0); } static uint32_t fib_ref_nhop(struct fib_data *fd, struct nhop_object *nh) { uint32_t idx = get_nhop_idx(nh); if (idx >= fd->number_nhops) { fd->hit_nhops = 1; return (0); } if (is_idx_free(fd, idx)) { nhop_ref_any(nh); fd->nh_idx[idx] = nh; fd->nh_ref_table->count++; FD_PRINTF(LOG_DEBUG2, fd, " REF nhop %u %p", idx, fd->nh_idx[idx]); } fd->nh_ref_table->refcnt[idx]++; return (idx); } 
struct nhop_release_data { struct nhop_object *nh; struct epoch_context ctx; }; static void release_nhop_epoch(epoch_context_t ctx) { struct nhop_release_data *nrd; nrd = __containerof(ctx, struct nhop_release_data, ctx); nhop_free_any(nrd->nh); free(nrd, M_TEMP); } /* * Delays nexthop refcount release. * Datapath may have the datastructures not updated yet, so the old * nexthop may still be returned till the end of current epoch. Delay * refcount removal, as we may be removing the last instance, which will * trigger nexthop deletion, rendering returned nexthop invalid. */ static void fib_schedule_release_nhop(struct fib_data *fd, struct nhop_object *nh) { struct nhop_release_data *nrd; nrd = malloc(sizeof(struct nhop_release_data), M_TEMP, M_NOWAIT | M_ZERO); if (nrd != NULL) { nrd->nh = nh; fib_epoch_call(release_nhop_epoch, &nrd->ctx); } else { /* * Unable to allocate memory. Leak nexthop to maintain guarantee * that each nhop can be referenced. */ FD_PRINTF(LOG_ERR, fd, "unable to schedule nhop %p deletion", nh); } } static void fib_unref_nhop(struct fib_data *fd, struct nhop_object *nh) { uint32_t idx = get_nhop_idx(nh); KASSERT((idx < fd->number_nhops), ("invalid nhop index")); KASSERT((nh == fd->nh_idx[idx]), ("index table contains whong nh")); fd->nh_ref_table->refcnt[idx]--; if (fd->nh_ref_table->refcnt[idx] == 0) { FD_PRINTF(LOG_DEBUG, fd, " FREE nhop %d %p", idx, fd->nh_idx[idx]); fib_schedule_release_nhop(fd, fd->nh_idx[idx]); } } static void set_algo_fixed(struct rib_head *rh) { switch (rh->rib_family) { #ifdef INET case AF_INET: V_algo_fixed_inet = true; break; #endif #ifdef INET6 case AF_INET6: V_algo_fixed_inet6 = true; break; #endif } } static bool is_algo_fixed(struct rib_head *rh) { switch (rh->rib_family) { #ifdef INET case AF_INET: return (V_algo_fixed_inet); #endif #ifdef INET6 case AF_INET6: return (V_algo_fixed_inet6); #endif } return (false); } /* * Runs the check on what would be the best algo for rib @rh, assuming * that the current algo 
is the one specified by @orig_flm. Note that
 * it can be NULL for initial selection.
 *
 * Returns referenced new algo or NULL if the current one is the best.
 */
static struct fib_lookup_module *
fib_check_best_algo(struct rib_head *rh, struct fib_lookup_module *orig_flm)
{
	struct fib_lookup_module *cand, *winner = NULL;
	uint8_t pref, cur_pref = 0, top_pref = 0;
	struct rib_rtable_info rinfo;
	const char *cur_name, *res_name;
	int n_candidates = 0;

	fib_get_rtable_info(rh, &rinfo);

	FIB_MOD_LOCK();
	TAILQ_FOREACH(cand, &all_algo_list, entries) {
		/* Only algos of the table's address family compete */
		if (cand->flm_family != rh->rib_family)
			continue;
		n_candidates++;
		pref = cand->flm_get_pref(&rinfo);
		/* Algos with a recorded error for this fib are not eligible */
		if ((pref > top_pref) &&
		    !flm_error_check(cand, rh->rib_fibnum)) {
			top_pref = pref;
			winner = cand;
		}
		if (cand == orig_flm)
			cur_pref = pref;
	}
	/* Switch only when the gain exceeds BEST_DIFF_PERCENT */
	if ((winner != NULL) && (cur_pref + BEST_DIFF_PERCENT < top_pref))
		winner->flm_refcount++;
	else
		winner = NULL;
	FIB_MOD_UNLOCK();

	cur_name = (orig_flm != NULL) ? orig_flm->flm_name : "NULL";
	res_name = (winner != NULL) ? winner->flm_name : cur_name;
	RH_PRINTF(LOG_DEBUG, rh, "candidate_algos: %d, curr: %s(%d) result: %s(%d)",
	    n_candidates, cur_name, cur_pref, res_name, top_pref);

	return (winner);
}

/*
 * Called when new route table is created.
 * Selects, allocates and attaches fib algo for the table.
*/ int fib_select_algo_initial(struct rib_head *rh) { struct fib_lookup_module *flm; struct fib_data *fd = NULL; enum flm_op_result result; struct epoch_tracker et; int error = 0; flm = fib_check_best_algo(rh, NULL); if (flm == NULL) { RH_PRINTF(LOG_CRIT, rh, "no algo selected"); return (ENOENT); } RH_PRINTF(LOG_INFO, rh, "selected algo %s", flm->flm_name); NET_EPOCH_ENTER(et); RIB_WLOCK(rh); result = setup_fd_instance(flm, rh, NULL, &fd, false); RIB_WUNLOCK(rh); NET_EPOCH_EXIT(et); RH_PRINTF(LOG_DEBUG, rh, "result=%d fd=%p", result, fd); if (result == FLM_SUCCESS) { /* * Attach datapath directly to avoid multiple reallocations * during fib growth */ struct fib_dp_header *fdp; struct fib_dp **pdp; pdp = get_family_dp_ptr(rh->rib_family); if (pdp != NULL) { fdp = get_fib_dp_header(*pdp); fdp->fdh_idx[fd->fd_fibnum] = fd->fd_dp; FD_PRINTF(LOG_INFO, fd, "datapath attached"); } } else { error = EINVAL; RH_PRINTF(LOG_CRIT, rh, "unable to setup algo %s", flm->flm_name); } fib_unref_algo(flm); return (error); } /* * Registers fib lookup module within the subsystem. */ int fib_module_register(struct fib_lookup_module *flm) { FIB_MOD_LOCK(); ALGO_PRINTF("attaching %s to %s", flm->flm_name, print_family(flm->flm_family)); TAILQ_INSERT_TAIL(&all_algo_list, flm, entries); FIB_MOD_UNLOCK(); return (0); } /* * Tries to unregister fib lookup module. * * Returns 0 on success, EBUSY if module is still used * by some of the tables. 
*/ int fib_module_unregister(struct fib_lookup_module *flm) { FIB_MOD_LOCK(); if (flm->flm_refcount > 0) { FIB_MOD_UNLOCK(); return (EBUSY); } fib_error_clear_flm(flm); ALGO_PRINTF("detaching %s from %s", flm->flm_name, print_family(flm->flm_family)); TAILQ_REMOVE(&all_algo_list, flm, entries); FIB_MOD_UNLOCK(); return (0); } void vnet_fib_init(void) { TAILQ_INIT(&V_fib_data_list); } void vnet_fib_destroy(void) { FIB_MOD_LOCK(); fib_error_clear(); FIB_MOD_UNLOCK(); } diff --git a/sys/net/route/route_ctl.c b/sys/net/route/route_ctl.c index fb3dfff87019..a0c4a2283a00 100644 --- a/sys/net/route/route_ctl.c +++ b/sys/net/route/route_ctl.c @@ -1,1543 +1,1543 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2020 Alexander V. Chernikov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_route.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * This file contains control plane routing tables functions. * * All functions assumes they are called in net epoch. */ struct rib_subscription { CK_STAILQ_ENTRY(rib_subscription) next; rib_subscription_cb_t *func; void *arg; struct rib_head *rnh; enum rib_subscription_type type; struct epoch_context epoch_ctx; }; static int add_route(struct rib_head *rnh, struct rt_addrinfo *info, struct rib_cmd_info *rc); static int add_route_nhop(struct rib_head *rnh, struct rtentry *rt, struct rt_addrinfo *info, struct route_nhop_data *rnd, struct rib_cmd_info *rc); static int del_route(struct rib_head *rnh, struct rt_addrinfo *info, struct rib_cmd_info *rc); static int change_route(struct rib_head *rnh, struct rt_addrinfo *info, struct route_nhop_data *nhd_orig, struct rib_cmd_info *rc); static int rt_unlinkrte(struct rib_head *rnh, struct rt_addrinfo *info, struct rib_cmd_info *rc); static void rib_notify(struct rib_head *rnh, enum rib_subscription_type type, struct rib_cmd_info *rc); static void destroy_subscription_epoch(epoch_context_t ctx); #ifdef ROUTE_MPATH static bool rib_can_multipath(struct rib_head *rh); #endif /* Per-vnet multipath routing 
configuration */ SYSCTL_DECL(_net_route); #define V_rib_route_multipath VNET(rib_route_multipath) #ifdef ROUTE_MPATH #define _MP_FLAGS CTLFLAG_RW #else #define _MP_FLAGS CTLFLAG_RD #endif VNET_DEFINE(u_int, rib_route_multipath) = 1; SYSCTL_UINT(_net_route, OID_AUTO, multipath, _MP_FLAGS | CTLFLAG_VNET, &VNET_NAME(rib_route_multipath), 0, "Enable route multipath"); #undef _MP_FLAGS /* Routing table UMA zone */ VNET_DEFINE_STATIC(uma_zone_t, rtzone); #define V_rtzone VNET(rtzone) void vnet_rtzone_init() { V_rtzone = uma_zcreate("rtentry", sizeof(struct rtentry), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); } #ifdef VIMAGE void vnet_rtzone_destroy() { uma_zdestroy(V_rtzone); } #endif static void destroy_rtentry(struct rtentry *rt) { #ifdef VIMAGE struct nhop_object *nh = rt->rt_nhop; /* * At this moment rnh, nh_control may be already freed. * nhop interface may have been migrated to a different vnet. * Use vnet stored in the nexthop to delete the entry. */ #ifdef ROUTE_MPATH if (NH_IS_NHGRP(nh)) { struct weightened_nhop *wn; uint32_t num_nhops; wn = nhgrp_get_nhops((struct nhgrp_object *)nh, &num_nhops); nh = wn[0].nh; } #endif CURVNET_SET(nhop_get_vnet(nh)); #endif /* Unreference nexthop */ nhop_free_any(rt->rt_nhop); uma_zfree(V_rtzone, rt); CURVNET_RESTORE(); } /* * Epoch callback indicating rtentry is safe to destroy */ static void destroy_rtentry_epoch(epoch_context_t ctx) { struct rtentry *rt; rt = __containerof(ctx, struct rtentry, rt_epoch_ctx); destroy_rtentry(rt); } /* * Schedule rtentry deletion */ static void rtfree(struct rtentry *rt) { KASSERT(rt != NULL, ("%s: NULL rt", __func__)); epoch_call(net_epoch_preempt, destroy_rtentry_epoch, &rt->rt_epoch_ctx); } static struct rib_head * get_rnh(uint32_t fibnum, const struct rt_addrinfo *info) { struct rib_head *rnh; struct sockaddr *dst; KASSERT((fibnum < rt_numfibs), ("rib_add_route: bad fibnum")); dst = info->rti_info[RTAX_DST]; rnh = rt_tables_get_rnh(fibnum, dst->sa_family); return (rnh); } #ifdef 
ROUTE_MPATH static bool rib_can_multipath(struct rib_head *rh) { int result; CURVNET_SET(rh->rib_vnet); result = !!V_rib_route_multipath; CURVNET_RESTORE(); return (result); } /* * Check is nhop is multipath-eligible. * Avoid nhops without gateways and redirects. * * Returns 1 for multipath-eligible nexthop, * 0 otherwise. */ bool nhop_can_multipath(const struct nhop_object *nh) { if ((nh->nh_flags & NHF_MULTIPATH) != 0) return (1); if ((nh->nh_flags & NHF_GATEWAY) == 0) return (0); if ((nh->nh_flags & NHF_REDIRECT) != 0) return (0); return (1); } #endif static int get_info_weight(const struct rt_addrinfo *info, uint32_t default_weight) { uint32_t weight; if (info->rti_mflags & RTV_WEIGHT) weight = info->rti_rmx->rmx_weight; else weight = default_weight; /* Keep upper 1 byte for adm distance purposes */ if (weight > RT_MAX_WEIGHT) weight = RT_MAX_WEIGHT; return (weight); } bool rt_is_host(const struct rtentry *rt) { return (rt->rte_flags & RTF_HOST); } sa_family_t rt_get_family(const struct rtentry *rt) { const struct sockaddr *dst; dst = (const struct sockaddr *)rt_key_const(rt); return (dst->sa_family); } /* * Returns pointer to nexthop or nexthop group * associated with @rt */ struct nhop_object * rt_get_raw_nhop(const struct rtentry *rt) { return (rt->rt_nhop); } #ifdef INET /* * Stores IPv4 address and prefix length of @rt inside * @paddr and @plen. * @pscopeid is currently always set to 0. */ void rt_get_inet_prefix_plen(const struct rtentry *rt, struct in_addr *paddr, int *plen, uint32_t *pscopeid) { const struct sockaddr_in *dst; dst = (const struct sockaddr_in *)rt_key_const(rt); KASSERT((dst->sin_family == AF_INET), ("rt family is %d, not inet", dst->sin_family)); *paddr = dst->sin_addr; dst = (const struct sockaddr_in *)rt_mask_const(rt); if (dst == NULL) *plen = 32; else *plen = bitcount32(dst->sin_addr.s_addr); *pscopeid = 0; } /* * Stores IPv4 address and prefix mask of @rt inside * @paddr and @pmask. Sets mask to INADDR_ANY for host routes. 
* @pscopeid is currently always set to 0. */ void rt_get_inet_prefix_pmask(const struct rtentry *rt, struct in_addr *paddr, struct in_addr *pmask, uint32_t *pscopeid) { const struct sockaddr_in *dst; dst = (const struct sockaddr_in *)rt_key_const(rt); KASSERT((dst->sin_family == AF_INET), ("rt family is %d, not inet", dst->sin_family)); *paddr = dst->sin_addr; dst = (const struct sockaddr_in *)rt_mask_const(rt); if (dst == NULL) pmask->s_addr = INADDR_BROADCAST; else *pmask = dst->sin_addr; *pscopeid = 0; } #endif #ifdef INET6 static int inet6_get_plen(const struct in6_addr *addr) { return (bitcount32(addr->s6_addr32[0]) + bitcount32(addr->s6_addr32[1]) + bitcount32(addr->s6_addr32[2]) + bitcount32(addr->s6_addr32[3])); } /* * Stores IPv6 address and prefix length of @rt inside * @paddr and @plen. Addresses are returned in de-embedded form. * Scopeid is set to 0 for non-LL addresses. */ void rt_get_inet6_prefix_plen(const struct rtentry *rt, struct in6_addr *paddr, int *plen, uint32_t *pscopeid) { const struct sockaddr_in6 *dst; dst = (const struct sockaddr_in6 *)rt_key_const(rt); KASSERT((dst->sin6_family == AF_INET6), ("rt family is %d, not inet6", dst->sin6_family)); if (IN6_IS_SCOPE_LINKLOCAL(&dst->sin6_addr)) in6_splitscope(&dst->sin6_addr, paddr, pscopeid); else *paddr = dst->sin6_addr; dst = (const struct sockaddr_in6 *)rt_mask_const(rt); if (dst == NULL) *plen = 128; else *plen = inet6_get_plen(&dst->sin6_addr); } /* * Stores IPv6 address and prefix mask of @rt inside * @paddr and @pmask. Addresses are returned in de-embedded form. * Scopeid is set to 0 for non-LL addresses. 
*/ void rt_get_inet6_prefix_pmask(const struct rtentry *rt, struct in6_addr *paddr, struct in6_addr *pmask, uint32_t *pscopeid) { const struct sockaddr_in6 *dst; dst = (const struct sockaddr_in6 *)rt_key_const(rt); KASSERT((dst->sin6_family == AF_INET6), ("rt family is %d, not inet", dst->sin6_family)); if (IN6_IS_SCOPE_LINKLOCAL(&dst->sin6_addr)) in6_splitscope(&dst->sin6_addr, paddr, pscopeid); else *paddr = dst->sin6_addr; dst = (const struct sockaddr_in6 *)rt_mask_const(rt); if (dst == NULL) memset(pmask, 0xFF, sizeof(struct in6_addr)); else *pmask = dst->sin6_addr; } #endif static void rt_set_expire_info(struct rtentry *rt, const struct rt_addrinfo *info) { /* Kernel -> userland timebase conversion. */ if (info->rti_mflags & RTV_EXPIRE) rt->rt_expire = info->rti_rmx->rmx_expire ? info->rti_rmx->rmx_expire - time_second + time_uptime : 0; } /* * Check if specified @gw matches gw data in the nexthop @nh. * * Returns true if matches, false otherwise. */ bool match_nhop_gw(const struct nhop_object *nh, const struct sockaddr *gw) { if (nh->gw_sa.sa_family != gw->sa_family) return (false); switch (gw->sa_family) { case AF_INET: return (nh->gw4_sa.sin_addr.s_addr == ((const struct sockaddr_in *)gw)->sin_addr.s_addr); case AF_INET6: { const struct sockaddr_in6 *gw6; gw6 = (const struct sockaddr_in6 *)gw; /* * Currently (2020-09) IPv6 gws in kernel have their * scope embedded. Once this becomes false, this code * has to be revisited. */ if (IN6_ARE_ADDR_EQUAL(&nh->gw6_sa.sin6_addr, &gw6->sin6_addr)) return (true); return (false); } case AF_LINK: { const struct sockaddr_dl *sdl; sdl = (const struct sockaddr_dl *)gw; return (nh->gwl_sa.sdl_index == sdl->sdl_index); } default: return (memcmp(&nh->gw_sa, gw, nh->gw_sa.sa_len) == 0); } /* NOTREACHED */ return (false); } /* * Checks if data in @info matches nexhop @nh. 
* * Returns 0 on success, * ESRCH if not matched, * ENOENT if filter function returned false */ int check_info_match_nhop(const struct rt_addrinfo *info, const struct rtentry *rt, const struct nhop_object *nh) { const struct sockaddr *gw = info->rti_info[RTAX_GATEWAY]; if (info->rti_filter != NULL) { if (info->rti_filter(rt, nh, info->rti_filterdata) == 0) return (ENOENT); else return (0); } if ((gw != NULL) && !match_nhop_gw(nh, gw)) return (ESRCH); return (0); } /* * Checks if nexhop @nh can be rewritten by data in @info because * of higher "priority". Currently the only case for such scenario * is kernel installing interface routes, marked by RTF_PINNED flag. * * Returns: * 1 if @info data has higher priority * 0 if priority is the same * -1 if priority is lower */ int can_override_nhop(const struct rt_addrinfo *info, const struct nhop_object *nh) { if (info->rti_flags & RTF_PINNED) { return (NH_IS_PINNED(nh)) ? 0 : 1; } else { return (NH_IS_PINNED(nh)) ? -1 : 0; } } /* * Runs exact prefix match based on @dst and @netmask. * Returns matched @rtentry if found or NULL. * If rtentry was found, saves nexthop / weight value into @rnd. */ static struct rtentry * lookup_prefix_bysa(struct rib_head *rnh, const struct sockaddr *dst, const struct sockaddr *netmask, struct route_nhop_data *rnd) { struct rtentry *rt; RIB_LOCK_ASSERT(rnh); rt = (struct rtentry *)rnh->rnh_lookup(__DECONST(void *, dst), __DECONST(void *, netmask), &rnh->head); if (rt != NULL) { rnd->rnd_nhop = rt->rt_nhop; rnd->rnd_weight = rt->rt_weight; } else { rnd->rnd_nhop = NULL; rnd->rnd_weight = 0; } return (rt); } /* * Runs exact prefix match based on dst/netmask from @info. * Assumes RIB lock is held. * Returns matched @rtentry if found or NULL. * If rtentry was found, saves nexthop / weight value into @rnd. 
*/
struct rtentry *
lookup_prefix(struct rib_head *rnh, const struct rt_addrinfo *info,
    struct route_nhop_data *rnd)
{

	return (lookup_prefix_bysa(rnh, info->rti_info[RTAX_DST],
	    info->rti_info[RTAX_NETMASK], rnd));
}

/*
 * Adds the route described by @info to the kernel table selected by
 * @fibnum and the sa_family of @info->rti_info[RTAX_DST].
 *
 * Returns 0 on success, with operation metadata stored in @rc.
 */
int
rib_add_route(uint32_t fibnum, struct rt_addrinfo *info,
    struct rib_cmd_info *rc)
{
	struct rib_head *rnh;
	int error;

	NET_EPOCH_ASSERT();

	rnh = get_rnh(fibnum, info);
	if (rnh == NULL)
		return (EAFNOSUPPORT);

	/*
	 * Keep RTF_HOST and the netmask consistent: host routes carry
	 * no netmask, everything else must supply one.
	 */
	if ((info->rti_flags & RTF_HOST) != 0) {
		info->rti_info[RTAX_NETMASK] = NULL;
	} else {
		if (info->rti_info[RTAX_NETMASK] == NULL)
			return (EINVAL);
	}

	bzero(rc, sizeof(struct rib_cmd_info));
	rc->rc_cmd = RTM_ADD;

	error = add_route(rnh, info, rc);
	if (error != 0)
		return (error);

	rib_notify(rnh, RIB_NOTIFY_DELAYED, rc);

	return (0);
}

/*
 * Creates rtentry and nexthop based on @info data.
 * Return 0 and fills in rtentry into @prt on success,
 * return errno otherwise.
*/ static int create_rtentry(struct rib_head *rnh, struct rt_addrinfo *info, struct rtentry **prt) { struct sockaddr *dst, *ndst, *gateway, *netmask; struct rtentry *rt; struct nhop_object *nh; struct ifaddr *ifa; int error, flags; dst = info->rti_info[RTAX_DST]; gateway = info->rti_info[RTAX_GATEWAY]; netmask = info->rti_info[RTAX_NETMASK]; flags = info->rti_flags; if ((flags & RTF_GATEWAY) && !gateway) return (EINVAL); if (dst && gateway && (dst->sa_family != gateway->sa_family) && (gateway->sa_family != AF_UNSPEC) && (gateway->sa_family != AF_LINK)) return (EINVAL); if (dst->sa_len > sizeof(((struct rtentry *)NULL)->rt_dstb)) return (EINVAL); if (info->rti_ifa == NULL) { error = rt_getifa_fib(info, rnh->rib_fibnum); if (error) return (error); } error = nhop_create_from_info(rnh, info, &nh); if (error != 0) return (error); rt = uma_zalloc(V_rtzone, M_NOWAIT | M_ZERO); if (rt == NULL) { nhop_free(nh); return (ENOBUFS); } rt->rte_flags = (RTF_UP | flags) & RTE_RT_FLAG_MASK; rt->rt_nhop = nh; /* Fill in dst */ memcpy(&rt->rt_dst, dst, dst->sa_len); rt_key(rt) = &rt->rt_dst; /* * point to the (possibly newly malloc'd) dest address. */ ndst = (struct sockaddr *)rt_key(rt); /* * make sure it contains the value we want (masked if needed). */ if (netmask) { rt_maskedcopy(dst, ndst, netmask); } else bcopy(dst, ndst, dst->sa_len); /* * We use the ifa reference returned by rt_getifa_fib(). * This moved from below so that rnh->rnh_addaddr() can * examine the ifa and ifa->ifa_ifp if it so desires. 
*/ ifa = info->rti_ifa; rt->rt_weight = get_info_weight(info, RT_DEFAULT_WEIGHT); rt_set_expire_info(rt, info); *prt = rt; return (0); } static int add_route(struct rib_head *rnh, struct rt_addrinfo *info, struct rib_cmd_info *rc) { struct nhop_object *nh_orig; struct route_nhop_data rnd_orig, rnd_add; struct nhop_object *nh; struct rtentry *rt, *rt_orig; int error; error = create_rtentry(rnh, info, &rt); if (error != 0) return (error); rnd_add.rnd_nhop = rt->rt_nhop; rnd_add.rnd_weight = rt->rt_weight; nh = rt->rt_nhop; RIB_WLOCK(rnh); error = add_route_nhop(rnh, rt, info, &rnd_add, rc); if (error == 0) { RIB_WUNLOCK(rnh); return (0); } /* addition failed. Lookup prefix in the rib to determine the cause */ rt_orig = lookup_prefix(rnh, info, &rnd_orig); if (rt_orig == NULL) { /* No prefix -> rnh_addaddr() failed to allocate memory */ RIB_WUNLOCK(rnh); nhop_free(nh); uma_zfree(V_rtzone, rt); return (ENOMEM); } /* We have existing route in the RIB. */ nh_orig = rnd_orig.rnd_nhop; /* Check if new route has higher preference */ if (can_override_nhop(info, nh_orig) > 0) { /* Update nexthop to the new route */ change_route_nhop(rnh, rt_orig, info, &rnd_add, rc); RIB_WUNLOCK(rnh); uma_zfree(V_rtzone, rt); nhop_free(nh_orig); return (0); } RIB_WUNLOCK(rnh); #ifdef ROUTE_MPATH if (rib_can_multipath(rnh) && nhop_can_multipath(rnd_add.rnd_nhop) && nhop_can_multipath(rnd_orig.rnd_nhop)) error = add_route_mpath(rnh, info, rt, &rnd_add, &rnd_orig, rc); else #endif /* Unable to add - another route with the same preference exists */ error = EEXIST; /* * ROUTE_MPATH disabled: failed to add route, free both nhop and rt. * ROUTE_MPATH enabled: original nhop reference is unused in any case, * free rt only if not _adding_ new route to rib (e.g. the case * when initial lookup returned existing route, but then it got * deleted prior to multipath group insertion, leading to a simple * non-multipath add as a result). 
*/ nhop_free(nh); if ((error != 0) || rc->rc_cmd != RTM_ADD) uma_zfree(V_rtzone, rt); return (error); } /* * Removes route defined by @info from the kernel table specified by @fibnum and * sa_family in @info->rti_info[RTAX_DST]. * * Returns 0 on success and fills in operation metadata into @rc. */ int rib_del_route(uint32_t fibnum, struct rt_addrinfo *info, struct rib_cmd_info *rc) { struct rib_head *rnh; struct sockaddr *dst_orig, *netmask; struct sockaddr_storage mdst; int error; NET_EPOCH_ASSERT(); rnh = get_rnh(fibnum, info); if (rnh == NULL) return (EAFNOSUPPORT); bzero(rc, sizeof(struct rib_cmd_info)); rc->rc_cmd = RTM_DELETE; dst_orig = info->rti_info[RTAX_DST]; netmask = info->rti_info[RTAX_NETMASK]; if (netmask != NULL) { /* Ensure @dst is always properly masked */ if (dst_orig->sa_len > sizeof(mdst)) return (EINVAL); rt_maskedcopy(dst_orig, (struct sockaddr *)&mdst, netmask); info->rti_info[RTAX_DST] = (struct sockaddr *)&mdst; } error = del_route(rnh, info, rc); info->rti_info[RTAX_DST] = dst_orig; return (error); } /* * Conditionally unlinks rtentry matching data inside @info from @rnh. * Returns 0 on success with operation result stored in @rc. * On error, returns: * ESRCH - if prefix was not found, * EADDRINUSE - if trying to delete higher priority route. * ENOENT - if supplied filter function returned 0 (not matched). */ static int rt_unlinkrte(struct rib_head *rnh, struct rt_addrinfo *info, struct rib_cmd_info *rc) { struct rtentry *rt; struct nhop_object *nh; struct radix_node *rn; struct route_nhop_data rnd; int error; rt = lookup_prefix(rnh, info, &rnd); if (rt == NULL) return (ESRCH); nh = rt->rt_nhop; #ifdef ROUTE_MPATH if (NH_IS_NHGRP(nh)) { error = del_route_mpath(rnh, info, rt, (struct nhgrp_object *)nh, rc); return (error); } #endif error = check_info_match_nhop(info, rt, nh); if (error != 0) return (error); if (can_override_nhop(info, nh) < 0) return (EADDRINUSE); /* * Remove the item from the tree and return it. 
* Complain if it is not there and do no more processing. */ rn = rnh->rnh_deladdr(info->rti_info[RTAX_DST], info->rti_info[RTAX_NETMASK], &rnh->head); if (rn == NULL) return (ESRCH); if (rn->rn_flags & (RNF_ACTIVE | RNF_ROOT)) panic ("rtrequest delete"); rt = RNTORT(rn); rt->rte_flags &= ~RTF_UP; /* Finalize notification */ - rnh->rnh_gen++; + rib_bump_gen(rnh); rnh->rnh_prefixes--; rc->rc_cmd = RTM_DELETE; rc->rc_rt = rt; rc->rc_nh_old = rt->rt_nhop; rc->rc_nh_weight = rt->rt_weight; rib_notify(rnh, RIB_NOTIFY_IMMEDIATE, rc); return (0); } static int del_route(struct rib_head *rnh, struct rt_addrinfo *info, struct rib_cmd_info *rc) { int error; RIB_WLOCK(rnh); error = rt_unlinkrte(rnh, info, rc); RIB_WUNLOCK(rnh); if (error != 0) return (error); rib_notify(rnh, RIB_NOTIFY_DELAYED, rc); /* * If the caller wants it, then it can have it, * the entry will be deleted after the end of the current epoch. */ if (rc->rc_cmd == RTM_DELETE) rtfree(rc->rc_rt); #ifdef ROUTE_MPATH else { /* * Deleting 1 path may result in RTM_CHANGE to * a different mpath group/nhop. * Free old mpath group. */ nhop_free_any(rc->rc_nh_old); } #endif return (0); } int rib_change_route(uint32_t fibnum, struct rt_addrinfo *info, struct rib_cmd_info *rc) { RIB_RLOCK_TRACKER; struct route_nhop_data rnd_orig; struct rib_head *rnh; struct rtentry *rt; int error; NET_EPOCH_ASSERT(); rnh = get_rnh(fibnum, info); if (rnh == NULL) return (EAFNOSUPPORT); bzero(rc, sizeof(struct rib_cmd_info)); rc->rc_cmd = RTM_CHANGE; /* Check if updated gateway exists */ if ((info->rti_flags & RTF_GATEWAY) && (info->rti_info[RTAX_GATEWAY] == NULL)) { /* * route(8) adds RTF_GATEWAY flag if -interface is not set. * Remove RTF_GATEWAY to enforce consistency and maintain * compatibility.. */ info->rti_flags &= ~RTF_GATEWAY; } /* * route change is done in multiple steps, with dropping and * reacquiring lock. 
In the situations with multiple processes * changes the same route in can lead to the case when route * is changed between the steps. Address it by retrying the operation * multiple times before failing. */ RIB_RLOCK(rnh); rt = (struct rtentry *)rnh->rnh_lookup(info->rti_info[RTAX_DST], info->rti_info[RTAX_NETMASK], &rnh->head); if (rt == NULL) { RIB_RUNLOCK(rnh); return (ESRCH); } rnd_orig.rnd_nhop = rt->rt_nhop; rnd_orig.rnd_weight = rt->rt_weight; RIB_RUNLOCK(rnh); for (int i = 0; i < RIB_MAX_RETRIES; i++) { error = change_route(rnh, info, &rnd_orig, rc); if (error != EAGAIN) break; } return (error); } static int change_nhop(struct rib_head *rnh, struct rt_addrinfo *info, struct nhop_object *nh_orig, struct nhop_object **nh_new) { int error; /* * New gateway could require new ifaddr, ifp; * flags may also be different; ifp may be specified * by ll sockaddr when protocol address is ambiguous */ if (((nh_orig->nh_flags & NHF_GATEWAY) && info->rti_info[RTAX_GATEWAY] != NULL) || info->rti_info[RTAX_IFP] != NULL || (info->rti_info[RTAX_IFA] != NULL && !sa_equal(info->rti_info[RTAX_IFA], nh_orig->nh_ifa->ifa_addr))) { error = rt_getifa_fib(info, rnh->rib_fibnum); if (error != 0) { info->rti_ifa = NULL; return (error); } } error = nhop_create_from_nhop(rnh, nh_orig, info, nh_new); info->rti_ifa = NULL; return (error); } #ifdef ROUTE_MPATH static int change_mpath_route(struct rib_head *rnh, struct rt_addrinfo *info, struct route_nhop_data *rnd_orig, struct rib_cmd_info *rc) { int error = 0; struct nhop_object *nh, *nh_orig, *nh_new; struct route_nhop_data rnd_new; nh = NULL; nh_orig = rnd_orig->rnd_nhop; struct weightened_nhop *wn = NULL, *wn_new; uint32_t num_nhops; wn = nhgrp_get_nhops((struct nhgrp_object *)nh_orig, &num_nhops); nh_orig = NULL; for (int i = 0; i < num_nhops; i++) { if (check_info_match_nhop(info, NULL, wn[i].nh)) { nh_orig = wn[i].nh; break; } } if (nh_orig == NULL) return (ESRCH); error = change_nhop(rnh, info, nh_orig, &nh_new); if (error != 0) 
return (error); wn_new = mallocarray(num_nhops, sizeof(struct weightened_nhop), M_TEMP, M_NOWAIT | M_ZERO); if (wn_new == NULL) { nhop_free(nh_new); return (EAGAIN); } memcpy(wn_new, wn, num_nhops * sizeof(struct weightened_nhop)); for (int i = 0; i < num_nhops; i++) { if (wn[i].nh == nh_orig) { wn[i].nh = nh_new; wn[i].weight = get_info_weight(info, rnd_orig->rnd_weight); break; } } error = nhgrp_get_group(rnh, wn_new, num_nhops, &rnd_new); nhop_free(nh_new); free(wn_new, M_TEMP); if (error != 0) return (error); error = change_route_conditional(rnh, NULL, info, rnd_orig, &rnd_new, rc); return (error); } #endif static int change_route(struct rib_head *rnh, struct rt_addrinfo *info, struct route_nhop_data *rnd_orig, struct rib_cmd_info *rc) { int error = 0; struct nhop_object *nh, *nh_orig; struct route_nhop_data rnd_new; nh = NULL; nh_orig = rnd_orig->rnd_nhop; if (nh_orig == NULL) return (ESRCH); #ifdef ROUTE_MPATH if (NH_IS_NHGRP(nh_orig)) return (change_mpath_route(rnh, info, rnd_orig, rc)); #endif rnd_new.rnd_weight = get_info_weight(info, rnd_orig->rnd_weight); error = change_nhop(rnh, info, nh_orig, &rnd_new.rnd_nhop); if (error != 0) return (error); error = change_route_conditional(rnh, NULL, info, rnd_orig, &rnd_new, rc); return (error); } /* * Insert @rt with nhop data from @rnd_new to @rnh. * Returns 0 on success and stores operation results in @rc. 
*/
static int
add_route_nhop(struct rib_head *rnh, struct rtentry *rt,
    struct rt_addrinfo *info, struct route_nhop_data *rnd,
    struct rib_cmd_info *rc)
{
	struct sockaddr *ndst, *netmask;
	struct radix_node *rn;
	int error = 0;

	/* Caller must hold the rib write lock. */
	RIB_WLOCK_ASSERT(rnh);

	ndst = (struct sockaddr *)rt_key(rt);
	netmask = info->rti_info[RTAX_NETMASK];

	rt->rt_nhop = rnd->rnd_nhop;
	rt->rt_weight = rnd->rnd_weight;
	rn = rnh->rnh_addaddr(ndst, netmask, &rnh->head, rt->rt_nodes);

	if (rn != NULL) {
		if (rt->rt_expire > 0)
			tmproutes_update(rnh, rt);

		/* Finalize notification */
-		rnh->rnh_gen++;
+		rib_bump_gen(rnh);
		rnh->rnh_prefixes++;

		rc->rc_cmd = RTM_ADD;
		rc->rc_rt = rt;
		rc->rc_nh_old = NULL;
		rc->rc_nh_new = rnd->rnd_nhop;
		rc->rc_nh_weight = rnd->rnd_weight;

		rib_notify(rnh, RIB_NOTIFY_IMMEDIATE, rc);
	} else {
		/* Existing route or memory allocation failure */
		error = EEXIST;
	}

	return (error);
}

/*
 * Switch @rt nhop/weight to the ones specified in @rnd.
 * A NULL @rnd->rnd_nhop requests removal of the prefix from @rnh instead.
 * Conditionally set rt_expire if set in @info.
 * Returns 0 on success, ESRCH if the prefix to delete is not in the tree.
 */
int
change_route_nhop(struct rib_head *rnh, struct rtentry *rt,
    struct rt_addrinfo *info, struct route_nhop_data *rnd,
    struct rib_cmd_info *rc)
{
	struct nhop_object *nh_orig;

	/* Caller must hold the rib write lock. */
	RIB_WLOCK_ASSERT(rnh);

	/* Remembered so it can be reported as rc_nh_old below. */
	nh_orig = rt->rt_nhop;

	if (rnd->rnd_nhop != NULL) {
		/* Changing expiration & nexthop & weight to a new one */
		rt_set_expire_info(rt, info);
		rt->rt_nhop = rnd->rnd_nhop;
		rt->rt_weight = rnd->rnd_weight;
		if (rt->rt_expire > 0)
			tmproutes_update(rnh, rt);
	} else {
		/* Route deletion requested. */
		struct sockaddr *ndst, *netmask;
		struct radix_node *rn;

		ndst = (struct sockaddr *)rt_key(rt);
		netmask = info->rti_info[RTAX_NETMASK];
		rn = rnh->rnh_deladdr(ndst, netmask, &rnh->head);
		if (rn == NULL)
			return (ESRCH);
		rt = RNTORT(rn);
		rt->rte_flags &= ~RTF_UP;
	}

	/* Finalize notification */
-	rnh->rnh_gen++;
+	rib_bump_gen(rnh);
	if (rnd->rnd_nhop == NULL)
		rnh->rnh_prefixes--;

	rc->rc_cmd = (rnd->rnd_nhop != NULL) ? RTM_CHANGE : RTM_DELETE;
	rc->rc_rt = rt;
	rc->rc_nh_old = nh_orig;
	rc->rc_nh_new = rnd->rnd_nhop;
	rc->rc_nh_weight = rnd->rnd_weight;

	rib_notify(rnh, RIB_NOTIFY_IMMEDIATE, rc);

	return (0);
}

/*
 * Conditionally update route nhop/weight IFF data in @rnd_orig is
 * consistent with the current route data.
 * Nexthop in @rnd_new is consumed.
 *
 * On mismatch, @rnd_orig is refreshed with the current route data and
 * EAGAIN is returned so that the caller can retry.
 * Takes and releases the rib write lock itself.
 */
int
change_route_conditional(struct rib_head *rnh, struct rtentry *rt,
    struct rt_addrinfo *info, struct route_nhop_data *rnd_orig,
    struct route_nhop_data *rnd_new, struct rib_cmd_info *rc)
{
	struct rtentry *rt_new;
	int error = 0;

	RIB_WLOCK(rnh);

	rt_new = (struct rtentry *)rnh->rnh_lookup(info->rti_info[RTAX_DST],
	    info->rti_info[RTAX_NETMASK], &rnh->head);

	if (rt_new == NULL) {
		if (rnd_orig->rnd_nhop == NULL)
			error = add_route_nhop(rnh, rt, info, rnd_new, rc);
		else {
			/*
			 * Prefix does not exist, which was not our assumption.
			 * Update @rnd_orig with the new data and return
			 */
			rnd_orig->rnd_nhop = NULL;
			rnd_orig->rnd_weight = 0;
			error = EAGAIN;
		}
	} else {
		/* Prefix exists, try to update */
		if (rnd_orig->rnd_nhop == rt_new->rt_nhop) {
			/*
			 * Nhop/mpath group hasn't changed. Flip
			 * to the new precalculated one and return
			 */
			error = change_route_nhop(rnh, rt_new, info, rnd_new, rc);
		} else {
			/* Update and retry */
			rnd_orig->rnd_nhop = rt_new->rt_nhop;
			rnd_orig->rnd_weight = rt_new->rt_weight;
			error = EAGAIN;
		}
	}

	RIB_WUNLOCK(rnh);

	if (error == 0) {
		rib_notify(rnh, RIB_NOTIFY_DELAYED, rc);

		/* Success: the replaced nexthop is released. */
		if (rnd_orig->rnd_nhop != NULL)
			nhop_free_any(rnd_orig->rnd_nhop);

	} else {
		/* Failure: the new nexthop was not installed, release it. */
		if (rnd_new->rnd_nhop != NULL)
			nhop_free_any(rnd_new->rnd_nhop);
	}

	return (error);
}

/*
 * Performs modification of routing table specified by @action.
 * Table is specified by @fibnum and sa_family in @info->rti_info[RTAX_DST].
 * Needs to be run in network epoch.
 *
 * Returns 0 on success and fills in @rc with action result.
*/ int rib_action(uint32_t fibnum, int action, struct rt_addrinfo *info, struct rib_cmd_info *rc) { int error; switch (action) { case RTM_ADD: error = rib_add_route(fibnum, info, rc); break; case RTM_DELETE: error = rib_del_route(fibnum, info, rc); break; case RTM_CHANGE: error = rib_change_route(fibnum, info, rc); break; default: error = ENOTSUP; } return (error); } struct rt_delinfo { struct rt_addrinfo info; struct rib_head *rnh; struct rtentry *head; struct rib_cmd_info rc; }; /* * Conditionally unlinks @rn from radix tree based * on info data passed in @arg. */ static int rt_checkdelroute(struct radix_node *rn, void *arg) { struct rt_delinfo *di; struct rt_addrinfo *info; struct rtentry *rt; di = (struct rt_delinfo *)arg; rt = (struct rtentry *)rn; info = &di->info; info->rti_info[RTAX_DST] = rt_key(rt); info->rti_info[RTAX_NETMASK] = rt_mask(rt); if (rt_unlinkrte(di->rnh, info, &di->rc) != 0) return (0); /* * Add deleted rtentries to the list to GC them * after dropping the lock. * * XXX: Delayed notifications not implemented * for nexthop updates. */ if (di->rc.rc_cmd == RTM_DELETE) { /* Add to the list and return */ rt->rt_chain = di->head; di->head = rt; #ifdef ROUTE_MPATH } else { /* * RTM_CHANGE to a diferent nexthop or nexthop group. * Free old multipath group. */ nhop_free_any(di->rc.rc_nh_old); #endif } return (0); } /* * Iterates over a routing table specified by @fibnum and @family and * deletes elements marked by @filter_f. * @fibnum: rtable id * @family: AF_ address family * @filter_f: function returning non-zero value for items to delete * @arg: data to pass to the @filter_f function * @report: true if rtsock notification is needed. 
*/
void
rib_walk_del(u_int fibnum, int family, rib_filter_f_t *filter_f, void *arg, bool report)
{
	struct rib_head *rnh;
	struct rt_delinfo di;
	struct rtentry *rt;
	struct nhop_object *nh;
	struct epoch_tracker et;

	rnh = rt_tables_get_rnh(fibnum, family);
	if (rnh == NULL)
		return;

	bzero(&di, sizeof(di));
	di.info.rti_filter = filter_f;
	di.info.rti_filterdata = arg;
	di.rnh = rnh;
	di.rc.rc_cmd = RTM_DELETE;

	NET_EPOCH_ENTER(et);

	/* Phase 1: unlink matching entries and chain them on di.head. */
	RIB_WLOCK(rnh);
	rnh->rnh_walktree(&rnh->head, rt_checkdelroute, &di);
	RIB_WUNLOCK(rnh);

	/* We might have something to reclaim. */
	bzero(&di.rc, sizeof(di.rc));
	di.rc.rc_cmd = RTM_DELETE;
	/* Phase 2: with the lock dropped, notify about and free each entry. */
	while (di.head != NULL) {
		rt = di.head;
		di.head = rt->rt_chain;
		rt->rt_chain = NULL;
		nh = rt->rt_nhop;

		di.rc.rc_rt = rt;
		di.rc.rc_nh_old = nh;
		rib_notify(rnh, RIB_NOTIFY_DELAYED, &di.rc);

		/* TODO std rt -> rt_addrinfo export */
		di.info.rti_info[RTAX_DST] = rt_key(rt);
		di.info.rti_info[RTAX_NETMASK] = rt_mask(rt);

		if (report) {
#ifdef ROUTE_MPATH
			struct nhgrp_object *nhg;
			struct weightened_nhop *wn;
			uint32_t num_nhops;
			if (NH_IS_NHGRP(nh)) {
				/* One RTM_DELETE message per path of the group. */
				nhg = (struct nhgrp_object *)nh;
				wn = nhgrp_get_nhops(nhg, &num_nhops);
				for (int i = 0; i < num_nhops; i++)
					rt_routemsg(RTM_DELETE, rt, wn[i].nh, fibnum);
			} else
#endif
			rt_routemsg(RTM_DELETE, rt, nh, fibnum);
		}
		rtfree(rt);
	}

	NET_EPOCH_EXIT(et);
}

/*
 * Tree-walk callback: removes the entry behind @rn from the rib passed
 * as @arg and frees it, without generating any notification.
 * Always returns 0.
 */
static int
rt_delete_unconditional(struct radix_node *rn, void *arg)
{
	struct rtentry *rt = RNTORT(rn);
	struct rib_head *rnh = (struct rib_head *)arg;

	rn = rnh->rnh_deladdr(rt_key(rt), rt_mask(rt), &rnh->head);
	/* Free only if the node that was removed is the one being visited. */
	if (RNTORT(rn) == rt)
		rtfree(rt);

	return (0);
}

/*
 * Removes all routes from the routing table without executing notifications.
 * rtentries will be removed after the end of a current epoch.
*/ static void rib_flush_routes(struct rib_head *rnh) { RIB_WLOCK(rnh); rnh->rnh_walktree(&rnh->head, rt_delete_unconditional, rnh); RIB_WUNLOCK(rnh); } void rib_flush_routes_family(int family) { struct rib_head *rnh; for (uint32_t fibnum = 0; fibnum < rt_numfibs; fibnum++) { if ((rnh = rt_tables_get_rnh(fibnum, family)) != NULL) rib_flush_routes(rnh); } } static void rib_notify(struct rib_head *rnh, enum rib_subscription_type type, struct rib_cmd_info *rc) { struct rib_subscription *rs; CK_STAILQ_FOREACH(rs, &rnh->rnh_subscribers, next) { if (rs->type == type) rs->func(rnh, rc, rs->arg); } } static struct rib_subscription * allocate_subscription(rib_subscription_cb_t *f, void *arg, enum rib_subscription_type type, bool waitok) { struct rib_subscription *rs; int flags = M_ZERO | (waitok ? M_WAITOK : M_NOWAIT); rs = malloc(sizeof(struct rib_subscription), M_RTABLE, flags); if (rs == NULL) return (NULL); rs->func = f; rs->arg = arg; rs->type = type; return (rs); } /* * Subscribe for the changes in the routing table specified by @fibnum and * @family. * * Returns pointer to the subscription structure on success. 
*/ struct rib_subscription * rib_subscribe(uint32_t fibnum, int family, rib_subscription_cb_t *f, void *arg, enum rib_subscription_type type, bool waitok) { struct rib_head *rnh; struct epoch_tracker et; NET_EPOCH_ENTER(et); KASSERT((fibnum < rt_numfibs), ("%s: bad fibnum", __func__)); rnh = rt_tables_get_rnh(fibnum, family); NET_EPOCH_EXIT(et); return (rib_subscribe_internal(rnh, f, arg, type, waitok)); } struct rib_subscription * rib_subscribe_internal(struct rib_head *rnh, rib_subscription_cb_t *f, void *arg, enum rib_subscription_type type, bool waitok) { struct rib_subscription *rs; struct epoch_tracker et; if ((rs = allocate_subscription(f, arg, type, waitok)) == NULL) return (NULL); rs->rnh = rnh; NET_EPOCH_ENTER(et); RIB_WLOCK(rnh); CK_STAILQ_INSERT_HEAD(&rnh->rnh_subscribers, rs, next); RIB_WUNLOCK(rnh); NET_EPOCH_EXIT(et); return (rs); } struct rib_subscription * rib_subscribe_locked(struct rib_head *rnh, rib_subscription_cb_t *f, void *arg, enum rib_subscription_type type) { struct rib_subscription *rs; NET_EPOCH_ASSERT(); RIB_WLOCK_ASSERT(rnh); if ((rs = allocate_subscription(f, arg, type, false)) == NULL) return (NULL); rs->rnh = rnh; CK_STAILQ_INSERT_HEAD(&rnh->rnh_subscribers, rs, next); return (rs); } /* * Remove rtable subscription @rs from the routing table. * Needs to be run in network epoch. 
*/ void rib_unsibscribe(struct rib_subscription *rs) { struct rib_head *rnh = rs->rnh; NET_EPOCH_ASSERT(); RIB_WLOCK(rnh); CK_STAILQ_REMOVE(&rnh->rnh_subscribers, rs, rib_subscription, next); RIB_WUNLOCK(rnh); epoch_call(net_epoch_preempt, destroy_subscription_epoch, &rs->epoch_ctx); } void rib_unsibscribe_locked(struct rib_subscription *rs) { struct rib_head *rnh = rs->rnh; NET_EPOCH_ASSERT(); RIB_WLOCK_ASSERT(rnh); CK_STAILQ_REMOVE(&rnh->rnh_subscribers, rs, rib_subscription, next); epoch_call(net_epoch_preempt, destroy_subscription_epoch, &rs->epoch_ctx); } /* * Epoch callback indicating subscription is safe to destroy */ static void destroy_subscription_epoch(epoch_context_t ctx) { struct rib_subscription *rs; rs = __containerof(ctx, struct rib_subscription, epoch_ctx); free(rs, M_RTABLE); } void rib_init_subscriptions(struct rib_head *rnh) { CK_STAILQ_INIT(&rnh->rnh_subscribers); } void rib_destroy_subscriptions(struct rib_head *rnh) { struct rib_subscription *rs; struct epoch_tracker et; NET_EPOCH_ENTER(et); RIB_WLOCK(rnh); while ((rs = CK_STAILQ_FIRST(&rnh->rnh_subscribers)) != NULL) { CK_STAILQ_REMOVE_HEAD(&rnh->rnh_subscribers, next); epoch_call(net_epoch_preempt, destroy_subscription_epoch, &rs->epoch_ctx); } RIB_WUNLOCK(rnh); NET_EPOCH_EXIT(et); } diff --git a/sys/net/route/route_var.h b/sys/net/route/route_var.h index 427c286a5090..f12931476fd3 100644 --- a/sys/net/route/route_var.h +++ b/sys/net/route/route_var.h @@ -1,327 +1,340 @@ /*- * Copyright (c) 2015-2016 * Alexander V. Chernikov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#ifndef _NET_ROUTE_VAR_H_
#define _NET_ROUTE_VAR_H_

#ifndef RNF_NORMAL
#include
#endif
#include
#include
#include		/* struct sockaddr_in */
#include
#include

/* Debug printf prefixed with the calling function; no-op unless RTDEBUG. */
#ifdef	RTDEBUG
#define	DPRINTF(_fmt, ...)	printf("%s: " _fmt "\n", __func__ , ## __VA_ARGS__)
#else
#define	DPRINTF(_fmt, ...)
#endif

struct nh_control;
typedef int rnh_preadd_entry_f_t(u_int fibnum, const struct sockaddr *addr,
    const struct sockaddr *mask, struct nhop_object *nh);

/* Per-table routing information base head: radix tree, lock and metadata. */
struct rib_head {
	struct radix_head	head;
	rn_matchaddr_f_t	*rnh_matchaddr;	/* longest match for sockaddr */
	rn_addaddr_f_t		*rnh_addaddr;	/* add based on sockaddr*/
	rn_deladdr_f_t		*rnh_deladdr;	/* remove based on sockaddr */
	rn_lookup_f_t		*rnh_lookup;	/* exact match for sockaddr */
	rn_walktree_t		*rnh_walktree;	/* traverse tree */
	rn_walktree_from_t	*rnh_walktree_from; /* traverse tree below a */
	rnh_preadd_entry_f_t	*rnh_preadd;	/* hook to alter record prior to insertion */
-	rt_gen_t		rnh_gen;	/* generation counter */
+	rt_gen_t		rnh_gen;	/* datapath generation counter */
	int			rnh_multipath;	/* multipath capable ? */
	struct radix_node	rnh_nodes[3];	/* empty tree for common case */
	struct rmlock		rib_lock;	/* config/data path lock */
	struct radix_mask_head	rmhead;		/* masks radix head */
	struct vnet		*rib_vnet;	/* vnet pointer */
	int			rib_family;	/* AF of the rtable */
	u_int			rib_fibnum;	/* fib number */
	struct callout		expire_callout;	/* Callout for expiring dynamic routes */
	time_t			next_expire;	/* Next expire run ts */
	uint32_t		rnh_prefixes;	/* Number of prefixes */
+#ifdef FIB_ALGO
+	rt_gen_t		rnh_gen_rib;	/* rib generation counter */
+#endif
	uint32_t		rib_dying:1;	/* rib is detaching */
	uint32_t		rib_algo_fixed:1;/* fixed algorithm */
	struct nh_control	*nh_control;	/* nexthop subsystem data */
	CK_STAILQ_HEAD(, rib_subscription)	rnh_subscribers;/* notification subscribers */
};

/*
 * Wrappers around the rmlock protecting the rib.
 * The read-side macros use the tracker declared by RIB_RLOCK_TRACKER.
 */
#define	RIB_RLOCK_TRACKER	struct rm_priotracker _rib_tracker
#define	RIB_LOCK_INIT(rh)	rm_init(&(rh)->rib_lock, "rib head lock")
#define	RIB_LOCK_DESTROY(rh)	rm_destroy(&(rh)->rib_lock)
#define	RIB_RLOCK(rh)		rm_rlock(&(rh)->rib_lock, &_rib_tracker)
#define	RIB_RUNLOCK(rh)		rm_runlock(&(rh)->rib_lock, &_rib_tracker)
#define	RIB_WLOCK(rh)		rm_wlock(&(rh)->rib_lock)
#define	RIB_WUNLOCK(rh)		rm_wunlock(&(rh)->rib_lock)
#define	RIB_LOCK_ASSERT(rh)	rm_assert(&(rh)->rib_lock, RA_LOCKED)
#define	RIB_WLOCK_ASSERT(rh)	rm_assert(&(rh)->rib_lock, RA_WLOCKED)

/* Constants */
#define	RIB_MAX_RETRIES	3
#define	RT_MAXFIBS	UINT16_MAX
#define	RIB_MAX_MPATH_WIDTH	64

/* Macro for verifying fields in af-specific 'struct route' structures */
#define CHK_STRUCT_FIELD_GENERIC(_s1, _f1, _s2, _f2)			\
_Static_assert(sizeof(((_s1 *)0)->_f1) == sizeof(((_s2 *)0)->_f2),		\
		"Fields " #_f1 " and " #_f2 " size differs");		\
_Static_assert(__offsetof(_s1, _f1) == __offsetof(_s2, _f2),		\
		"Fields " #_f1 " and " #_f2 " offset differs");

#define _CHK_ROUTE_FIELD(_route_new, _field) \
	CHK_STRUCT_FIELD_GENERIC(struct route, _field, _route_new, _field)

#define CHK_STRUCT_ROUTE_FIELDS(_route_new)	\
	_CHK_ROUTE_FIELD(_route_new, ro_nh)	\
	_CHK_ROUTE_FIELD(_route_new, ro_lle)	\
	_CHK_ROUTE_FIELD(_route_new, ro_prepend)\
	_CHK_ROUTE_FIELD(_route_new, ro_plen)	\
	_CHK_ROUTE_FIELD(_route_new, ro_flags)	\
	_CHK_ROUTE_FIELD(_route_new, ro_mtu)	\
	_CHK_ROUTE_FIELD(_route_new, spare)

#define CHK_STRUCT_ROUTE_COMPAT(_ro_new, _dst_new)				\
CHK_STRUCT_ROUTE_FIELDS(_ro_new);						\
_Static_assert(__offsetof(struct route, ro_dst) == __offsetof(_ro_new, _dst_new),\
		"ro_dst and " #_dst_new " are at different offset")

/*
 * Bumps the rib generation counter: rnh_gen_rib when FIB_ALGO is
 * defined, rnh_gen otherwise.
 */
+static inline void
+rib_bump_gen(struct rib_head *rnh)
+{
+#ifdef FIB_ALGO
+	rnh->rnh_gen_rib++;
+#else
+	rnh->rnh_gen++;
+#endif
+}
+
struct rib_head *rt_tables_get_rnh(uint32_t table, sa_family_t family);
int rt_getifa_fib(struct rt_addrinfo *info, u_int fibnum);
struct rib_cmd_info;

VNET_PCPUSTAT_DECLARE(struct rtstat, rtstat);
#define	RTSTAT_ADD(name, val)	\
	VNET_PCPUSTAT_ADD(struct rtstat, rtstat, name, (val))
#define	RTSTAT_INC(name)	RTSTAT_ADD(name, 1)

/*
 * Convert a 'struct radix_node *' to a 'struct rtentry *'.
* The operation can be done safely (in this code) because a
 * 'struct rtentry' starts with two 'struct radix_node''s, the first
 * one representing leaf nodes in the routing tree, which is
 * what the code in radix.c passes us as a 'struct radix_node'.
 *
 * But because there are a lot of assumptions in this conversion,
 * do not cast explicitly, but always use the macro below.
 */
#define RNTORT(p)	((struct rtentry *)(p))

/* Routing table entry: prefix data plus a pointer to its nexthop. */
struct rtentry {
	struct	radix_node rt_nodes[2];	/* tree glue, and other values */
	/*
	 * XXX struct rtentry must begin with a struct radix_node (or two!)
	 * because the code does some casts of a 'struct radix_node *'
	 * to a 'struct rtentry *'
	 */
#define	rt_key(r)	(*((struct sockaddr **)(&(r)->rt_nodes->rn_key)))
#define	rt_mask(r)	(*((struct sockaddr **)(&(r)->rt_nodes->rn_mask)))
#define	rt_key_const(r)		(*((const struct sockaddr * const *)(&(r)->rt_nodes->rn_key)))
#define	rt_mask_const(r)	(*((const struct sockaddr * const *)(&(r)->rt_nodes->rn_mask)))

	/*
	 * The 2 radix_node structures above consist of 2x6 pointers, leaving
	 * 4 pointers (32 bytes) of the second cache line on amd64.
	 *
	 */
	struct nhop_object	*rt_nhop;	/* nexthop data */
	union {
		/*
		 * Destination address storage.
		 * sizeof(struct sockaddr_in6) == 28, however
		 * the dataplane-relevant part (e.g. address) lies
		 * at offset 8..24, making the address not crossing
		 * cacheline boundary.
		 */
		struct sockaddr_in	rt_dst4;
		struct sockaddr_in6	rt_dst6;
		struct sockaddr		rt_dst;
		char			rt_dstb[28];
	};

	int		rte_flags;	/* up/down?, host/net */
	u_long		rt_weight;	/* absolute weight */
	u_long		rt_expire;	/* lifetime for route, e.g. redirect */
	struct rtentry	*rt_chain;	/* pointer to next rtentry to delete */
	struct epoch_context	rt_epoch_ctx;	/* net epoch tracker */
};

/*
 * With the split between the routing entry and the nexthop,
 *  rt_flags has to be split between these 2 entries.
As rtentry
 *  mostly contains prefix data and is thought to be generic enough
 *  so one can transparently change the nexthop pointer w/o requiring
 *  any other rtentry changes, most of rt_flags shifts to the particular nexthop.
 *
 *
 * RTF_UP: rtentry, as an indication that it is linked.
 * RTF_HOST: rtentry, nhop. The latter indication is needed for the datapath
 * RTF_DYNAMIC: nhop, to make rtentry generic.
 * RTF_MODIFIED: nhop, to make rtentry generic. (legacy)
 * -- "native" path (nhop) properties:
 * RTF_GATEWAY, RTF_STATIC, RTF_PROTO1, RTF_PROTO2, RTF_PROTO3, RTF_FIXEDMTU,
 *  RTF_PINNED, RTF_REJECT, RTF_BLACKHOLE, RTF_BROADCAST
 */

/* Nexthop rt flags mask */
#define	NHOP_RT_FLAG_MASK	(RTF_GATEWAY | RTF_HOST | RTF_REJECT | RTF_DYNAMIC | \
    RTF_MODIFIED | RTF_STATIC | RTF_BLACKHOLE | RTF_PROTO1 | RTF_PROTO2 | \
    RTF_PROTO3 | RTF_FIXEDMTU | RTF_PINNED | RTF_BROADCAST)

/* rtentry rt flag mask */
#define	RTE_RT_FLAG_MASK	(RTF_UP | RTF_HOST)

/* route_temporal.c */
void tmproutes_update(struct rib_head *rnh, struct rtentry *rt);
void tmproutes_init(struct rib_head *rh);
void tmproutes_destroy(struct rib_head *rh);

/* route_ctl.c */
struct route_nhop_data;
int change_route_nhop(struct rib_head *rnh, struct rtentry *rt,
    struct rt_addrinfo *info, struct route_nhop_data *rnd,
    struct rib_cmd_info *rc);
int change_route_conditional(struct rib_head *rnh, struct rtentry *rt,
    struct rt_addrinfo *info, struct route_nhop_data *nhd_orig,
    struct route_nhop_data *nhd_new, struct rib_cmd_info *rc);
struct rtentry *lookup_prefix(struct rib_head *rnh,
    const struct rt_addrinfo *info, struct route_nhop_data *rnd);

bool nhop_can_multipath(const struct nhop_object *nh);
bool match_nhop_gw(const struct nhop_object *nh, const struct sockaddr *gw);
int check_info_match_nhop(const struct rt_addrinfo *info,
    const struct rtentry *rt, const struct nhop_object *nh);
int can_override_nhop(const struct rt_addrinfo *info,
    const struct nhop_object *nh);

void vnet_rtzone_init(void);
void vnet_rtzone_destroy(void);

/* subscriptions */
void rib_init_subscriptions(struct rib_head *rnh);
void rib_destroy_subscriptions(struct rib_head *rnh);

/* Nexthops */
void nhops_init(void);
int nhops_init_rib(struct rib_head *rh);
void nhops_destroy_rib(struct rib_head *rh);
void nhop_ref_object(struct nhop_object *nh);
int nhop_try_ref_object(struct nhop_object *nh);
void nhop_ref_any(struct nhop_object *nh);
void nhop_free_any(struct nhop_object *nh);

void nhop_set_type(struct nhop_object *nh, enum nhop_type nh_type);
void nhop_set_rtflags(struct nhop_object *nh, int rt_flags);

int nhop_create_from_info(struct rib_head *rnh, struct rt_addrinfo *info,
    struct nhop_object **nh_ret);
int nhop_create_from_nhop(struct rib_head *rnh, const struct nhop_object *nh_orig,
    struct rt_addrinfo *info, struct nhop_object **pnh_priv);

void nhops_update_ifmtu(struct rib_head *rh, struct ifnet *ifp, uint32_t mtu);
int nhops_dump_sysctl(struct rib_head *rh, struct sysctl_req *w);

/* MULTIPATH */
#define	MPF_MULTIPATH	0x08	/* need to be consistent with NHF_MULTIPATH */

/* Nexthop group: header followed by the array of member nexthops. */
struct nhgrp_object {
	uint16_t		nhg_flags;	/* nexthop group flags */
	uint8_t			nhg_size;	/* dataplane group size */
	uint8_t			spare;
	struct nhop_object	*nhops[0];	/* nhops */
};

/*
 * Returns the nexthop to use for @flowid. With ROUTE_MPATH, a nexthop
 * group is resolved to member number (@flowid % nhg_size); in every
 * other case @nh is returned unchanged.
 */
static inline struct nhop_object *
nhop_select(struct nhop_object *nh, uint32_t flowid)
{

#ifdef ROUTE_MPATH
	if (NH_IS_NHGRP(nh)) {
		struct nhgrp_object *nhg = (struct nhgrp_object *)nh;
		nh = nhg->nhops[flowid % nhg->nhg_size];
	}
#endif
	return (nh);
}


struct weightened_nhop;

/* mpath_ctl.c */
int add_route_mpath(struct rib_head *rnh, struct rt_addrinfo *info,
    struct rtentry *rt, struct route_nhop_data *rnd_add,
    struct route_nhop_data *rnd_orig, struct rib_cmd_info *rc);
int del_route_mpath(struct rib_head *rh, struct rt_addrinfo *info,
    struct rtentry *rt, struct nhgrp_object *nhg, struct rib_cmd_info *rc);

/* nhgrp.c */
int nhgrp_ctl_init(struct nh_control *ctl);
void nhgrp_ctl_free(struct nh_control *ctl);
void nhgrp_ctl_unlink_all(struct nh_control *ctl);


/* nhgrp_ctl.c */
int nhgrp_dump_sysctl(struct rib_head *rh, struct sysctl_req *w);

int nhgrp_get_group(struct rib_head *rh, struct weightened_nhop *wn,
    int num_nhops, struct route_nhop_data *rnd);
typedef bool nhgrp_filter_cb_t(const struct nhop_object *nh, void *data);
int nhgrp_get_filtered_group(struct rib_head *rh, const struct nhgrp_object *src,
    nhgrp_filter_cb_t flt_func, void *flt_data, struct route_nhop_data *rnd);
int nhgrp_get_addition_group(struct rib_head *rnh,
    struct route_nhop_data *rnd_orig, struct route_nhop_data *rnd_add,
    struct route_nhop_data *rnd_new);

void nhgrp_ref_object(struct nhgrp_object *nhg);
uint32_t nhgrp_get_idx(const struct nhgrp_object *nhg);
void nhgrp_free(struct nhgrp_object *nhg);

/* rtsock */
int rtsock_routemsg(int cmd, struct rtentry *rt, struct nhop_object *nh,
    int fibnum);
int rtsock_routemsg_info(int cmd, struct rt_addrinfo *info, int fibnum);
int rtsock_addrmsg(int cmd, struct ifaddr *ifa, int fibnum);

/* lookup_framework.c */
void fib_grow_rtables(uint32_t new_num_tables);
int fib_select_algo_initial(struct rib_head *rh);
void fib_destroy_rib(struct rib_head *rh);
void vnet_fib_init(void);
void vnet_fib_destroy(void);
/* Entropy data used for outbound hashing */
#define MPATH_ENTROPY_KEY_LEN	40
extern uint8_t mpath_entropy_key[MPATH_ENTROPY_KEY_LEN];

#endif