Index: sys/conf/files =================================================================== --- sys/conf/files +++ sys/conf/files @@ -4178,6 +4178,7 @@ net/route/nhop.c standard net/route/nhop_ctl.c standard net/route/nhop_utils.c standard +net/route/fib_algo.c optional route_algo net/route/route_ctl.c standard net/route/route_ddb.c optional ddb net/route/route_helpers.c standard @@ -4329,6 +4330,7 @@ netinet/in_kdtrace.c optional inet | inet6 netinet/ip_carp.c optional inet carp | inet6 carp netinet/in_fib.c optional inet +netinet/in_fib_algo.c optional inet route_algo netinet/in_gif.c optional gif inet | netgraph_gif inet netinet/ip_gre.c optional gre inet netinet/ip_id.c optional inet @@ -4405,6 +4407,7 @@ netinet6/in6.c optional inet6 netinet6/in6_cksum.c optional inet6 netinet6/in6_fib.c optional inet6 +netinet6/in6_fib_algo.c optional inet6 route_algo netinet6/in6_gif.c optional gif inet6 | netgraph_gif inet6 netinet6/in6_ifattach.c optional inet6 netinet6/in6_jail.c optional inet6 Index: sys/conf/options =================================================================== --- sys/conf/options +++ sys/conf/options @@ -453,6 +453,7 @@ PCBGROUP opt_pcbgroup.h PF_DEFAULT_TO_DROP opt_pf.h ROUTE_MPATH opt_route.h +ROUTE_ALGO opt_route.h ROUTETABLES opt_route.h RSS opt_rss.h SLIP_IFF_OPTS opt_slip.h Index: sys/modules/test_lookup/Makefile =================================================================== --- /dev/null +++ sys/modules/test_lookup/Makefile @@ -0,0 +1,11 @@ +# $FreeBSD$ + +SYSDIR?=${SRCTOP}/sys +.include "${SYSDIR}/conf/kern.opts.mk" + +.PATH: ${SYSDIR}/net/route + +KMOD= test_lookup +SRCS= opt_inet.h opt_inet6.h test_lookup.c + +.include Index: sys/net/route.h =================================================================== --- sys/net/route.h +++ sys/net/route.h @@ -453,6 +453,8 @@ /* New API */ struct nhop_object *rib_lookup(uint32_t fibnum, const struct sockaddr *dst, uint32_t flags, uint32_t flowid); +struct rib_rtable_info; +bool rib_get_rtable_info(uint32_t fibnum, int family, struct rib_rtable_info *info); #endif #endif Index: sys/net/route.c =================================================================== --- sys/net/route.c +++ sys/net/route.c @@ -151,6 +151,12 @@ rt_table_destroy(struct rib_head *rh) { + RIB_WLOCK(rh); + rh->rib_dying = true; + RIB_WUNLOCK(rh); + + fib_destroy_rib(rh); + tmproutes_destroy(rh); rn_walktree(&rh->rmhead.head, rt_freeentry, &rh->rmhead.head); Index: sys/net/route/route_algo.h =================================================================== --- /dev/null +++ sys/net/route/route_algo.h @@ -0,0 +1,109 @@ +/*- + * Copyright (c) 2020 + * Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + + +struct fib_data; +struct fib_dp; +enum flm_op_result { + FLM_SUCCESS, /* No errors, operation successful */ + FLM_REBUILD, /* Operation cannot be completed, schedule algorithm rebuild */ + FLM_ERROR, /* Operation failed, this algo cannot be used */ +}; + +struct rib_rtable_info { + uint32_t num_prefixes; + uint32_t num_nhops; + uint32_t num_nhgrp; +}; + +struct flm_lookup_key { + union { + const struct in6_addr *addr6; + struct in_addr addr4; + }; +}; + +typedef struct nhop_object *flm_lookup_t(void *algo_data, + const struct flm_lookup_key key, uint32_t scopeid); +typedef enum flm_op_result flm_init_t (uint32_t fibnum, struct fib_data *fd, + void *_old_data, void **new_data); +typedef void flm_destroy_t(void *data); +typedef enum flm_op_result flm_dump_t(struct rtentry *rt, void *data); +typedef enum flm_op_result flm_dump_end_t(void *data, struct fib_dp *dp); +typedef enum flm_op_result flm_change_t(struct rib_head *rnh, + struct rib_cmd_info *rc, void *data); +typedef uint8_t flm_get_pref_t(const struct rib_rtable_info *rinfo); + +struct fib_lookup_module { + char *flm_name; /* algo name */ + int flm_family; /* address family this module supports */ + int flm_refcount; /* # of references */ + uint32_t flm_flags; /* flags */ + uint8_t flm_index; /* internal algo index */ + flm_init_t *flm_init_cb; /* instance init */ + flm_destroy_t *flm_destroy_cb; /* destroy instance */ + flm_change_t *flm_change_rib_item_cb;/* routing table change hook */ + flm_dump_t *flm_dump_rib_item_cb; /* routing table dump cb */ + flm_dump_end_t *flm_dump_end_cb; /* end of dump */ + flm_lookup_t *flm_lookup; /* lookup function */ + flm_get_pref_t *flm_get_pref; /* get algo preference */ + TAILQ_ENTRY(fib_lookup_module) entries; +}; + +/* Datapath lookup data */ +struct fib_dp { + flm_lookup_t *f; + void *arg; +}; + +VNET_DECLARE(struct fib_dp *, inet_dp); +#define V_inet_dp VNET(inet_dp) +VNET_DECLARE(struct fib_dp *, inet6_dp); +#define V_inet6_dp VNET(inet6_dp) + +#define FIB_PRINTF(_l, _fd, _fmt, ...) 
+	fib_printf(_l, _fd, __func__, _fmt, ##__VA_ARGS__)
+
+void fib_printf(int level, struct fib_data *fd, const char *func, char *fmt, ...);
+int fib_module_init(struct fib_lookup_module *flm, uint32_t fibnum,
+    int family);
+int fib_module_clone(const struct fib_lookup_module *flm_orig,
+    struct fib_lookup_module *flm, bool waitok);
+int fib_module_dumptree(struct fib_lookup_module *flm,
+    enum rib_subscription_type subscription_type);
+int fib_module_register(struct fib_lookup_module *flm);
+int fib_module_unregister(struct fib_lookup_module *flm);
+
+uint32_t fib_get_nhop_idx(struct fib_data *fd, struct nhop_object *nh);
+struct nhop_object **fib_get_nhop_array(struct fib_data *fd);
+void fib_get_rtable_info(struct rib_head *rh, struct rib_rtable_info *rinfo);
+struct rib_head *fib_get_rh(struct fib_data *fd);
+
+
Index: sys/net/route/route_algo.c
===================================================================
--- /dev/null
+++ sys/net/route/route_algo.c
@@ -0,0 +1,1472 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2020 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include
+__FBSDID("$FreeBSD$");
+#include "opt_inet.h"
+#include "opt_inet6.h"
+#include "opt_route.h"
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include
+#include
+#include
+#include
+#ifdef INET6
+#include
+#include
+#endif
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+/*
+ * Fib lookup framework.
+ *
+ * This framework enables dynamic loading of lookup modules providing
+ * accelerated longest-prefix-match lookups for the routing tables.
+ *
+ * flm - fib lookup modules - kernel modules implementing particular algo
+ * fd - fib data - instance of an flm bound to specific routing table
+ *
+ * For each supported address family, there is an allocated array of fib_dp
+ * structures, indexed by fib number. Each array entry contains a callback
+ * function and its argument. This function will be called with a
+ * family-specific lookup key, scope and the provided argument. This array
+ * gets re-created every time a new algo instance gets created. Please take
+ * a look at the replace_rtables_family() function for more details.
+ *
+ * The control plane sets up and updates the necessary dataplane structures:
+ * 1) nexthop abstraction: the module deals with nexthop indexes,
+ *  refcounting, nexthop groups etc
+ * 2) sync with route tables
+ * 3) dataplane attachment points
+ * 4) fail early. Some algorithms are immutable, so any change leads to
+ *  rebuild. Some are mutable to some extent, so the module is built over
+ *  common setup/teardown instances, making error handling easier.
+ * 5) preference. Lookup modules contain callbacks to determine the
+ *  preference of a particular lookup algorithm given the current route
+ *  table scale.
+ */
+
+SYSCTL_DECL(_net_route);
+SYSCTL_NODE(_net_route, OID_AUTO, algo, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
+    "Route algorithm lookups");
+
+#ifdef INET6
+bool algo_fixed_inet6 = false;
+SYSCTL_NODE(_net_route_algo, OID_AUTO, inet6, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
+    "IPv6 algorithm lookups");
+#endif
+#ifdef INET
+bool algo_fixed_inet = false;
+SYSCTL_NODE(_net_route_algo, OID_AUTO, inet, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
+    "IPv4 algorithm lookups");
+#endif
+
+struct nhop_ref_table {
+	uint32_t count;
+	int32_t refcnt[0];
+};
+
+struct fib_data {
+	uint32_t number_nhops;	/* current # of nhops */
+	uint32_t number_records;	/* current # of routes */
+	uint8_t hit_nhops;	/* true if out of nhop limit */
+	uint8_t init_done;	/* true if init is completed */
+	uint32_t fd_dead:1;	/* Scheduled for deletion */
+	uint32_t fd_linked:1;	/* true if linked */
+	uint32_t fd_need_rebuild:1;	/* true if rebuild scheduled */
+	uint32_t fd_force_eval:1;	/* true if forced re-evaluation scheduled */
+	uint8_t fd_family;	/* family */
+	uint32_t fd_fibnum;	/* fibnum */
+	uint32_t fd_failed_rebuilds;	/* stat: failed rebuilds */
+	uint32_t fd_algo_mask;	/* bitmask for algo data */
+	struct callout fd_callout;	/* rebuild callout */
+	void *fd_algo_data;	/* algorithm data */
+	struct nhop_object **nh_idx;	/* nhop idx->ptr array */
+	struct nhop_ref_table *nh_ref_table;	/* array with # of nhop references */
+	struct rib_head *fd_rh;	/* RIB table we're attached to */
+	struct rib_subscription *fd_rs;	/* storing table subscription */
+	struct fib_dp fd_dp;	/* fib datapath data */
+	struct vnet *fd_vnet;	/* vnet fib belongs to */
+	struct epoch_context fd_epoch_ctx;	/* epoch context for deletion */
+	uint64_t gencnt;
+	struct fib_lookup_module *fd_flm;	/* pointer to the lookup module */
+	uint32_t fd_num_changes;	/* number of changes since last callout */
+	TAILQ_ENTRY(fib_data) entries;	/* list of all fds in vnet */
+};
+
+static void rebuild_callout(void *_data);
+static void destroy_fd_instance_epoch(epoch_context_t ctx);
+static enum flm_op_result attach_datapath(struct fib_data *fd);
+static bool is_idx_free(struct fib_data *fd, uint32_t index);
+static void set_algo_fixed(struct rib_head *rh);
+
+static uint32_t fib_ref_nhop(struct fib_data *fd, struct nhop_object *nh);
+static void fib_unref_nhop(struct fib_data *fd, struct nhop_object *nh);
+
+static struct fib_lookup_module *fib_check_best_algo(struct rib_head *rh,
+    struct fib_lookup_module *orig_flm);
+static void fib_unref_algo(struct fib_lookup_module *flm);
+
+struct mtx fib_mtx;
+#define	MOD_LOCK()	mtx_lock(&fib_mtx)
+#define	MOD_UNLOCK()	mtx_unlock(&fib_mtx)
+
+#if 0
+uint32_t algo_bitmask_idx = 0;
+static uint32_t algo_inet_mask = 0;
+static uint32_t algo_inet6_mask = 0;
+#endif
+
+
+/* Algorithm has to be this percent better than the current to switch */
+#define	BEST_DIFF_PERCENT	(5 * 256 / 100)
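+/*
+ * Preference illustration: flm_get_pref() returns a uint8_t, so the
+ * scale is 0..255 and BEST_DIFF_PERCENT works out to 5 * 256 / 100 = 12.
+ * For example, an active algo with preference 200 only gets replaced by
+ * a candidate scoring at least 213, as fib_check_best_algo() requires
+ * curr_preference + BEST_DIFF_PERCENT < best_preference.
+ */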
+/* Schedule algo re-evaluation X seconds after a change */
+#define	ALGO_EVAL_DELAY_MS	30000
+/* Force algo re-evaluation after X changes */
+#define	ALGO_EVAL_NUM_ROUTES	100
+/* Try to setup algorithm X times */
+#define	FIB_MAX_TRIES	32
+/* Max amount of supported nexthops */
+#define	FIB_MAX_NHOPS	262144
+#define	FIB_CALLOUT_DELAY_MS	50
+
+/* Debug */
+static int flm_debug_level = LOG_NOTICE;
+SYSCTL_INT(_net_route_algo, OID_AUTO, debug_level, CTLFLAG_RW,
+    &flm_debug_level, 0, "Debug level");
+
+#define	RTDEBUG
+#ifdef RTDEBUG
+#define	_PASS_MSG(_l)	(flm_debug_level >= (_l))
+#define	ALGO_PRINTF(_fmt, ...)	printf("[rt_algo] %s: " _fmt "\n", __func__, ##__VA_ARGS__)
+#define	_ALGO_PRINTF(_fib, _fam, _aname, _func, _fmt, ...) \
+	printf("[rt_algo] %s.%u (%s) %s: " _fmt "\n",\
+	    print_family(_fam), _fib, _aname, _func, ## __VA_ARGS__)
+#define	_RH_PRINTF(_fib, _fam, _func, _fmt, ...) \
+	printf("[rt_algo] %s.%u %s: " _fmt "\n", print_family(_fam), _fib, _func, ## __VA_ARGS__)
+#define	RH_PRINTF(_l, _rh, _fmt, ...)	if (_PASS_MSG(_l)) {	\
+	_RH_PRINTF(_rh->rib_fibnum, _rh->rib_family, __func__, _fmt, ## __VA_ARGS__);\
+}
+#define	FD_PRINTF(_l, _fd, _fmt, ...)	if (_PASS_MSG(_l)) {	\
+	_ALGO_PRINTF(_fd->fd_fibnum, _fd->fd_family, _fd->fd_flm->flm_name, \
+	    __func__, _fmt, ## __VA_ARGS__); \
+}
+#else
+#define	FD_PRINTF(_l, _fd, _fmt, ...)
+#define	ALGO_PRINTF(_fmt, ...)
+#define	RH_PRINTF(_l, _rh, _fmt, ...)
+#endif
+
+/* List of all fib lookup instances */
+VNET_DEFINE_STATIC(TAILQ_HEAD(fib_data_head, fib_data), fib_data_list);
+#define	V_fib_data_list	VNET(fib_data_list)
+
+struct fib_error {
+	int fe_family;
+	uint32_t fe_fibnum;
+	struct fib_lookup_module *fe_flm;
+	TAILQ_ENTRY(fib_error) entries;	/* list of all errored entries */
+};
+TAILQ_HEAD(fib_error_head, fib_error) fib_error_list;
+
+struct fib_dp_header {
+	struct epoch_context ffi_epoch_ctx;
+	uint32_t ffi_algo_mask;
+	uint32_t ffi_num_tables;
+	struct fib_dp ffi_idx[0];
+};
+
+static TAILQ_HEAD(, fib_lookup_module) all_algo_list;
+
+
+static bool
+flm_error_add(struct fib_lookup_module *flm, uint32_t fibnum)
+{
+	struct fib_error *fe, *fe_tmp;
+
+	fe = malloc(sizeof(struct fib_error), M_TEMP, M_NOWAIT | M_ZERO);
+	if (fe == NULL)
+		return (false);
+	fe->fe_flm = flm;
+	fe->fe_family = flm->flm_family;
+	fe->fe_fibnum = fibnum;
+
+	MOD_LOCK();
+	TAILQ_FOREACH(fe_tmp, &fib_error_list, entries) {
+		if ((fe_tmp->fe_flm == flm) && (fe_tmp->fe_fibnum == fibnum)) {
+			MOD_UNLOCK();
+			free(fe, M_TEMP);
+			return (true);
+		}
+	}
+	TAILQ_INSERT_HEAD(&fib_error_list, fe, entries);
+	MOD_UNLOCK();
+
+	return (true);
+}
+
+static bool
+flm_error_check(struct fib_lookup_module *flm, uint32_t fibnum)
+{
+	struct fib_error *fe;
+
+	TAILQ_FOREACH(fe, &fib_error_list, entries) {
+		if ((fe->fe_flm == flm) && (fe->fe_fibnum == fibnum))
+			return (true);
+	}
+
+	return (false);
+}
+
+static void
+fib_error_clear_flm(struct fib_lookup_module *flm)
+{
+	struct fib_error *fe, *fe_tmp;
+
+	TAILQ_FOREACH_SAFE(fe, &fib_error_list, entries, fe_tmp) {
+		if (fe->fe_flm == flm) {
+			TAILQ_REMOVE(&fib_error_list, fe, entries);
+			free(fe, M_TEMP);
+		}
+	}
+}
+
+
+static const char *
+print_family(int family)
+{
+
+	if (family == AF_INET)
+		return ("inet");
+	else if (family == AF_INET6)
+		return ("inet6");
+	else
+		return ("unknown");
+}
+
+/*
+ * Debug function used by lookup modules.
+ * Outputs message denoted by @fmt, prepended by "[rt_algo] inetX.Y (algo) "
+ */
+void
+fib_printf(int level, struct fib_data *fd, const char *func, char *fmt, ...)
+{ + char buf[128]; + va_list ap; + + if (level > flm_debug_level) + return; + + va_start(ap, fmt); + vsnprintf(buf, sizeof(buf), fmt, ap); + va_end(ap); + + _ALGO_PRINTF(fd->fd_fibnum, fd->fd_family, fd->fd_flm->flm_name, + func, "%s", buf); +} + +static int +print_algos(struct sysctl_req *req, int family) +{ + struct fib_lookup_module *flm; + struct sbuf sbuf; + int error, count = 0; + + error = sysctl_wire_old_buffer(req, 0); + if (error == 0) { + sbuf_new_for_sysctl(&sbuf, NULL, 512, req); + TAILQ_FOREACH(flm, &all_algo_list, entries) { + if (flm->flm_family == family) { + if (count++ > 0) + sbuf_cat(&sbuf, ", "); + sbuf_cat(&sbuf, flm->flm_name); + } + } + error = sbuf_finish(&sbuf); + sbuf_delete(&sbuf); + } + return (error); +} + +#ifdef INET6 +static int +print_algos_inet6(SYSCTL_HANDLER_ARGS) +{ + + return (print_algos(req, AF_INET6)); +} +SYSCTL_PROC(_net_route_algo_inet6, OID_AUTO, algo_list, + CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, + print_algos_inet6, "A", "List of IPv6 lookup algorithms"); +#endif + +#ifdef INET +static int +print_algos_inet(SYSCTL_HANDLER_ARGS) +{ + + return (print_algos(req, AF_INET)); +} +SYSCTL_PROC(_net_route_algo_inet, OID_AUTO, algo_list, + CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, + print_algos_inet, "A", "List of IPv4 lookup algorithms"); +#endif + +static uint32_t +callout_calc_delay(struct fib_data *fd) +{ + uint32_t shift; + + if (fd->fd_failed_rebuilds > 10) + shift = 10; + else + shift = fd->fd_failed_rebuilds; + + return ((1 << shift) * FIB_CALLOUT_DELAY_MS); +} + +static void +schedule_callout(struct fib_data *fd, int delay_ms) +{ + + callout_reset_sbt(&fd->fd_callout, 0, SBT_1MS * delay_ms, + rebuild_callout, fd, 0); +} + +static void +schedule_fd_rebuild(struct fib_data *fd) +{ + + MOD_LOCK(); + if (!fd->fd_need_rebuild) { + fd->fd_need_rebuild = true; + + /* + * Potentially re-schedules pending callout + * initiated by schedule_algo_eval. + */ + FD_PRINTF(LOG_INFO, fd, "Scheduling rebuilt"); + schedule_callout(fd, callout_calc_delay(fd)); + } + MOD_UNLOCK(); +} + +static void +schedule_algo_eval(struct fib_data *fd) +{ + + if (fd->fd_num_changes++ == 0) { + /* Start callout to consider switch */ + MOD_LOCK(); + if (!callout_pending(&fd->fd_callout)) + schedule_callout(fd, ALGO_EVAL_DELAY_MS); + MOD_UNLOCK(); + } else if (fd->fd_num_changes > ALGO_EVAL_NUM_ROUTES && !fd->fd_force_eval) { + /* Reset callout to exec immediately */ + MOD_LOCK(); + if (!fd->fd_need_rebuild) { + fd->fd_force_eval = true; + schedule_callout(fd, 1); + } + MOD_UNLOCK(); + } +} + +/* + * rib subscription handler + */ +static void +handle_rtable_change_cb(struct rib_head *rnh, struct rib_cmd_info *rc, + void *_data) +{ + struct fib_data *fd = (struct fib_data *)_data; + enum flm_op_result result; + + RIB_WLOCK_ASSERT(rnh); + + /* + * There is a small gap between subscribing for route changes + * and initiating rtable dump. Avoid receiving route changes + * prior to finishing rtable dump by checking `init_done`. + */ + if (!fd->init_done) + return; + /* + * If algo requested rebuild, stop sending updates by default. + * This simplifies nexthop refcount handling logic. + */ + if (fd->fd_need_rebuild) + return; + + /* Consider scheduling algorithm re-evaluation */ + schedule_algo_eval(fd); + + + /* + * Maintain guarantee that every nexthop returned by the dataplane + * lookup has > 0 refcount, so can be safely referenced within current + * epoch. 
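+	 * For example, fib4_lookup() returns the nexthop to its caller
+	 * without taking an extra reference: the caller may dereference
+	 * nh->nh_ifp for as long as it stays within the epoch section,
+	 * precisely because this instance keeps a reference on every
+	 * nexthop it is able to return.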
+ */ + if (rc->rc_nh_new != NULL) { + if (fib_ref_nhop(fd, rc->rc_nh_new) == 0) { + /* ran out of indexes */ + schedule_fd_rebuild(fd); + return; + } + } + + result = fd->fd_flm->flm_change_rib_item_cb(rnh, rc, fd->fd_algo_data); + + switch (result) { + case FLM_SUCCESS: + /* Unref old nexthop on success */ + if (rc->rc_nh_old != NULL) + fib_unref_nhop(fd, rc->rc_nh_old); + break; + case FLM_REBUILD: + /* + * Algo reported inability to handle, + * schedule algo rebuild. + */ + schedule_fd_rebuild(fd); + break; + case FLM_ERROR: + /* + * Algo reported a non-recoverable error. + */ + FD_PRINTF(LOG_ERR, fd, "algo reported non-recoverable error"); + if (!flm_error_add(fd->fd_flm, fd->fd_fibnum)) + FD_PRINTF(LOG_ERR, fd, "failed to ban algo"); + schedule_fd_rebuild(fd); + } +} + +static void +estimate_nhop_scale(const struct fib_data *old_fd, struct fib_data *fd) +{ + + if (old_fd == NULL) { + // TODO: read from rtable + fd->number_nhops = 16; + return; + } + + if (old_fd->hit_nhops && old_fd->number_nhops < FIB_MAX_NHOPS) + fd->number_nhops = 2 * old_fd->number_nhops; + else + fd->number_nhops = old_fd->number_nhops; +} + +struct walk_cbdata { + struct fib_data *fd; + flm_dump_t *func; + enum flm_op_result result; +}; + +static void +sync_algo_end_cb(struct rib_head *rnh, enum rib_walk_hook stage, void *_data) +{ + struct walk_cbdata *w = (struct walk_cbdata *)_data; + struct fib_data *fd = w->fd; + + RIB_WLOCK_ASSERT(w->fd->fd_rh); + + if (rnh->rib_dying) { + w->result = FLM_ERROR; + return; + } + + if (stage != RIB_WALK_HOOK_POST || w->result != FLM_SUCCESS) + return; + + /* Post-dump hook, dump successful */ + + if (fd->hit_nhops) { + FD_PRINTF(LOG_INFO, fd, "ran out of nexthops at %u nhops", + fd->nh_ref_table->count); + w->result = FLM_REBUILD; + return; + } + + w->result = fd->fd_flm->flm_dump_end_cb(fd->fd_algo_data, &fd->fd_dp); + + if (w->result == FLM_SUCCESS) { + /* Mark init as done to allow routing updates */ + fd->init_done = 1; + } +} + +static int +sync_algo_cb(struct rtentry *rt, void *_data) +{ + struct walk_cbdata *w = (struct walk_cbdata *)_data; + + RIB_WLOCK_ASSERT(w->fd->fd_rh); + + if (w->result == FLM_SUCCESS && w->func) { + + /* + * Reference nexthops to maintain guarantee that + * each nexthop returned by datapath has > 0 references + * and can be safely referenced within current epoch. + */ + struct nhop_object *nh = rt_get_raw_nhop(rt); + if (fib_ref_nhop(w->fd, nh) != 0) + w->result = w->func(rt, w->fd->fd_algo_data); + else + w->result = FLM_REBUILD; + } + + return (0); +} + +/* + * Dump all routing table state to the algo instance. 
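+ * The walk runs under the RIB write lock: sync_algo_cb() feeds every
+ * rtentry to the algo's flm_dump_rib_item_cb, and sync_algo_end_cb()
+ * finalizes the dump via flm_dump_end_cb, which publishes the datapath
+ * pointers.  A minimal dump callback could look like this (illustrative
+ * sketch only; xxx_dump_cb and xxx_store are placeholder names):
+ *
+ *	static enum flm_op_result
+ *	xxx_dump_cb(struct rtentry *rt, void *_data)
+ *	{
+ *		xxx_store(_data, rt, rt_get_raw_nhop(rt));
+ *		return (FLM_SUCCESS);
+ *	}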
+ */
+static enum flm_op_result
+sync_algo(struct fib_data *fd)
+{
+	struct walk_cbdata w;
+
+	w.fd = fd;
+	w.func = fd->fd_flm->flm_dump_rib_item_cb;
+	w.result = FLM_SUCCESS;
+
+	rib_walk_ext_internal(fd->fd_rh, true, sync_algo_cb, sync_algo_end_cb, &w);
+
+	FD_PRINTF(LOG_INFO, fd, "initial dump completed.");
+
+	return (w.result);
+}
+
+/*
+ * Assumes the instance is already unlinked from the datapath.
+ */
+static int
+schedule_destroy_fd_instance(struct fib_data *fd, bool in_callout)
+{
+	bool is_dead;
+
+	NET_EPOCH_ASSERT();
+
+	MOD_LOCK();
+	is_dead = fd->fd_dead;
+	if (!is_dead)
+		fd->fd_dead = true;
+	if (fd->fd_linked) {
+		TAILQ_REMOVE(&V_fib_data_list, fd, entries);
+		fd->fd_linked = false;
+	}
+	MOD_UNLOCK();
+	if (is_dead)
+		return (0);
+
+	FD_PRINTF(LOG_INFO, fd, "DETACH");
+
+	if (fd->fd_rs != NULL)
+		rib_unsibscribe(fd->fd_rs);
+
+	/*
+	 * After rib_unsubscribe() no _new_ handle_rtable_change_cb() calls
+	 * will be executed, hence no _new_ callout schedules will happen.
+	 *
+	 * There can be 2 possible scenarios here:
+	 * 1) we're running inside a callout when we're deleting ourselves
+	 *  due to migration to a newer fd
+	 * 2) we're running from rt_table_destroy() and callout is scheduled
+	 *  for execution OR is executing
+	 *
+	 * For (2) we need to wait for the callout termination, as the routing table
+	 * will be destroyed after this function returns.
+	 * For (1) we cannot call drain, but can ensure that this is the last invocation.
+	 */
+
+	if (in_callout)
+		callout_stop(&fd->fd_callout);
+	else
+		callout_drain(&fd->fd_callout);
+
+	/*
+	 * At this point no other pending work is scheduled.
+	 */
+	FD_PRINTF(LOG_INFO, fd, "destroying old instance");
+	epoch_call(net_epoch_preempt, destroy_fd_instance_epoch,
+	    &fd->fd_epoch_ctx);
+
+	return (0);
+}
+
+/*
+ * Wipe all instances from the list matching rib specified by @rh.
+ */
+static void
+fib_cleanup_algo(struct rib_head *rh, bool keep_first, bool in_callout)
+{
+	struct fib_data_head tmp_head = TAILQ_HEAD_INITIALIZER(tmp_head);
+	struct fib_data *fd, *fd_tmp;
+	struct epoch_tracker et;
+
+	MOD_LOCK();
+	TAILQ_FOREACH_SAFE(fd, &V_fib_data_list, entries, fd_tmp) {
+		if (fd->fd_rh == rh) {
+			if (keep_first) {
+				keep_first = false;
+				continue;
+			}
+			TAILQ_REMOVE(&V_fib_data_list, fd, entries);
+			fd->fd_linked = false;
+			TAILQ_INSERT_TAIL(&tmp_head, fd, entries);
+		}
+	}
+	MOD_UNLOCK();
+
+	/* Pass 2: remove each entry */
+	NET_EPOCH_ENTER(et);
+	TAILQ_FOREACH_SAFE(fd, &tmp_head, entries, fd_tmp) {
+		schedule_destroy_fd_instance(fd, in_callout);
+	}
+	NET_EPOCH_EXIT(et);
+}
+
+void
+fib_destroy_rib(struct rib_head *rh)
+{
+
+	/*
+	 * At this point the rib_dying flag is set on @rh, so any new fd
+	 * will fail at the sync_algo() stage and nothing new will be
+	 * added to the list.
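+	 * The caller (rt_table_destroy()) sets rib_dying under RIB_WLOCK
+	 * before invoking this function; that is what makes an in-flight
+	 * sync_algo() walk observe the flag and fail the setup with
+	 * FLM_ERROR (see sync_algo_end_cb()).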
+ */ + fib_cleanup_algo(rh, false, false); +} + +static void +destroy_instance(struct fib_data *fd) +{ + + FD_PRINTF(LOG_INFO, fd, "destroy fd %p", fd); + + /* Call destroy callback first */ + if (fd->fd_algo_data != NULL) + fd->fd_flm->flm_destroy_cb(fd->fd_algo_data); + + /* Nhop table */ + if ((fd->nh_idx != NULL) && (fd->nh_ref_table != NULL)) { + for (int i = 0; i < fd->number_nhops; i++) { + if (!is_idx_free(fd, i)) { + FD_PRINTF(LOG_DEBUG, fd, " FREE nhop %d %p", + i, fd->nh_idx[i]); + nhop_free_any(fd->nh_idx[i]); + } + } + free(fd->nh_idx, M_RTABLE); + } + if (fd->nh_ref_table != NULL) + free(fd->nh_ref_table, M_RTABLE); + + fib_unref_algo(fd->fd_flm); + + free(fd, M_RTABLE); +} + +/* + * Epoch callback indicating fd is safe to destroy + */ +static void +destroy_fd_instance_epoch(epoch_context_t ctx) +{ + struct fib_data *fd; + + fd = __containerof(ctx, struct fib_data, fd_epoch_ctx); + + destroy_instance(fd); +} + +static enum flm_op_result +try_setup_instance(struct fib_lookup_module *flm, struct rib_head *rh, + struct fib_data *old_fd, struct fib_data **pfd) +{ + struct fib_data *fd; + size_t size; + enum flm_op_result result; + + /* Allocate */ + fd = malloc(sizeof(struct fib_data), M_RTABLE, M_NOWAIT | M_ZERO); + if (fd == NULL) { + *pfd = NULL; + return (FLM_REBUILD); + } + *pfd = fd; + + estimate_nhop_scale(old_fd, fd); + + fd->fd_rh = rh; + fd->fd_family = rh->rib_family; + fd->fd_fibnum = rh->rib_fibnum; + callout_init(&fd->fd_callout, 1); + fd->fd_vnet = curvnet; + fd->fd_flm = flm; + + MOD_LOCK(); + flm->flm_refcount++; + MOD_UNLOCK(); + + /* Allocate nhidx -> nhop_ptr table */ + size = fd->number_nhops * sizeof(void *); + //FD_PRINTF(fd, "malloc(%lu)", size); + fd->nh_idx = malloc(size, M_RTABLE, M_NOWAIT | M_ZERO); + if (fd->nh_idx == NULL) { + FD_PRINTF(LOG_INFO, fd, "Unable to allocate nhop table idx (sz:%zu)", size); + return (FLM_REBUILD); + } + + /* Allocate nhop index refcount table */ + size = sizeof(struct nhop_ref_table); + size += fd->number_nhops * sizeof(uint32_t); + //FD_PRINTF(fd, "malloc(%lu)", size); + fd->nh_ref_table = malloc(size, M_RTABLE, M_NOWAIT | M_ZERO); + if (fd->nh_ref_table == NULL) { + FD_PRINTF(LOG_INFO, fd, "Unable to allocate nhop refcount table (sz:%zu)", size); + return (FLM_REBUILD); + } + + /* Okay, we're ready for algo init */ + void *old_algo_data = (old_fd != NULL) ? old_fd->fd_algo_data : NULL; + result = flm->flm_init_cb(fd->fd_fibnum, fd, old_algo_data, &fd->fd_algo_data); + if (result != FLM_SUCCESS) + return (result); + + /* Try to subscribe */ + if (flm->flm_change_rib_item_cb != NULL) { + fd->fd_rs = rib_subscribe_internal(fd->fd_rh, + handle_rtable_change_cb, fd, RIB_NOTIFY_IMMEDIATE, 0); + if (fd->fd_rs == NULL) + return (FLM_REBUILD); + } + + /* Dump */ + result = sync_algo(fd); + if (result != FLM_SUCCESS) + return (result); + FD_PRINTF(LOG_INFO, fd, "DUMP completed successfully."); + + MOD_LOCK(); + /* + * Insert in the beginning of a list, to simplify search + * first matching entry is the one. + */ + TAILQ_INSERT_HEAD(&V_fib_data_list, fd, entries); + fd->fd_linked = true; + MOD_UNLOCK(); + + return (FLM_SUCCESS); +} + +/* + * Sets up algo @flm for table @rh and links it to the datapath. 
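+ * The setup is retried up to FIB_MAX_TRIES times: on each FLM_REBUILD
+ * result the freshly-built instance is passed back as a hint for the
+ * next attempt, so that e.g. estimate_nhop_scale() can double the
+ * nexthop array after an overflow.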
+ * + */ +static enum flm_op_result +setup_instance(struct fib_lookup_module *flm, struct rib_head *rh, + struct fib_data *orig_fd, struct fib_data **pfd, bool attach) +{ + struct fib_data *prev_fd, *new_fd; + struct epoch_tracker et; + enum flm_op_result result; + + prev_fd = orig_fd; + new_fd = NULL; + for (int i = 0; i < FIB_MAX_TRIES; i++) { + NET_EPOCH_ENTER(et); + result = try_setup_instance(flm, rh, prev_fd, &new_fd); + + if ((result == FLM_SUCCESS) && attach) + result = attach_datapath(new_fd); + + if ((prev_fd != NULL) && (prev_fd != orig_fd)) { + schedule_destroy_fd_instance(prev_fd, false); + prev_fd = NULL; + } + NET_EPOCH_EXIT(et); + + RH_PRINTF(LOG_INFO, rh, "try %d: fib algo result: %d", i, result); + + if (result == FLM_REBUILD) { + prev_fd = new_fd; + new_fd = NULL; + continue; + } + + break; + } + + if (result != FLM_SUCCESS) { + /* update failure count */ + MOD_LOCK(); + if (orig_fd != NULL) + orig_fd->fd_failed_rebuilds++; + MOD_UNLOCK(); + + /* Ban algo on non-recoverable error */ + if (result == FLM_ERROR) + flm_error_add(flm, rh->rib_fibnum); + + NET_EPOCH_ENTER(et); + if ((prev_fd != NULL) && (prev_fd != orig_fd)) + schedule_destroy_fd_instance(prev_fd, false); + if (new_fd != NULL) { + schedule_destroy_fd_instance(new_fd, false); + new_fd = NULL; + } + NET_EPOCH_EXIT(et); + } + + *pfd = new_fd; + return (result); +} + +static void +rebuild_callout(void *_data) +{ + struct fib_data *fd, *fd_new, *fd_tmp; + struct fib_lookup_module *flm_new; + struct epoch_tracker et; + enum flm_op_result result; + bool need_rebuild = false; + + fd = (struct fib_data *)_data; + + MOD_LOCK(); + need_rebuild = fd->fd_need_rebuild; + fd->fd_need_rebuild = false; + fd->fd_force_eval = false; + fd->fd_num_changes = 0; + MOD_UNLOCK(); + + CURVNET_SET(fd->fd_vnet); + + /* First, check if we're still OK to use this algo */ + flm_new = fib_check_best_algo(fd->fd_rh, fd->fd_flm); + if ((flm_new == NULL) && (!need_rebuild)) { + /* Keep existing algo, no need to rebuild. 
*/ + CURVNET_RESTORE(); + return; + } + + if (flm_new == NULL) { + flm_new = fd->fd_flm; + fd_tmp = fd; + } else { + fd_tmp = NULL; + FD_PRINTF(LOG_NOTICE, fd, "switching algo to %s", flm_new->flm_name); + } + result = setup_instance(flm_new, fd->fd_rh, fd_tmp, &fd_new, true); + if (fd_tmp == NULL) { + /* fd_new represents new algo */ + fib_unref_algo(flm_new); + } + if (result != FLM_SUCCESS) { + FD_PRINTF(LOG_NOTICE, fd, "table rebuild failed"); + CURVNET_RESTORE(); + return; + } + FD_PRINTF(LOG_INFO, fd_new, "switched to new instance"); + + /* Remove old */ + if (fd != NULL) { + NET_EPOCH_ENTER(et); + schedule_destroy_fd_instance(fd, true); + NET_EPOCH_EXIT(et); + } + + CURVNET_RESTORE(); +} + +static struct fib_lookup_module * +fib_find_algo(const char *algo_name, int family) +{ + struct fib_lookup_module *flm; + + MOD_LOCK(); + TAILQ_FOREACH(flm, &all_algo_list, entries) { + if ((strcmp(flm->flm_name, algo_name) == 0) && + (family == flm->flm_family)) { + flm->flm_refcount++; + MOD_UNLOCK(); + return (flm); + } + } + MOD_UNLOCK(); + + return (NULL); +} + +static void +fib_unref_algo(struct fib_lookup_module *flm) +{ + + MOD_LOCK(); + flm->flm_refcount--; + MOD_UNLOCK(); +} + +static int +set_fib_algo(uint32_t fibnum, int family, struct sysctl_oid *oidp, struct sysctl_req *req) +{ + struct fib_lookup_module *flm = NULL; + struct fib_data *fd = NULL; + char old_algo_name[32], algo_name[32]; + struct rib_head *rh = NULL; + enum flm_op_result result; + int error; + + MOD_LOCK(); + TAILQ_FOREACH(fd, &V_fib_data_list, entries) { + if ((fd->fd_family == family) && (fd->fd_fibnum == fibnum)) + break; + } + if (fd == NULL) { + MOD_UNLOCK(); + return (ENOENT); + } + rh = fd->fd_rh; + strlcpy(old_algo_name, fd->fd_flm->flm_name, + sizeof(old_algo_name)); + MOD_UNLOCK(); + + strlcpy(algo_name, old_algo_name, sizeof(algo_name)); + error = sysctl_handle_string(oidp, algo_name, sizeof(algo_name), req); + if (error != 0 || req->newptr == NULL) + return (error); + + if (strcmp(algo_name, old_algo_name) == 0) + return (0); + + flm = fib_find_algo(algo_name, family); + if (flm == NULL) { + RH_PRINTF(LOG_INFO, rh, "unable to find algo %s", algo_name); + return (ESRCH); + } + + fd = NULL; + result = setup_instance(flm, rh, NULL, &fd, true); + fib_unref_algo(flm); + if (result != FLM_SUCCESS) + return (EINVAL); + + /* Disable jumping between algos */ + MOD_LOCK(); + set_algo_fixed(rh); + MOD_UNLOCK(); + /* Remove old instance(s) */ + fib_cleanup_algo(rh, true, false); + + /* Drain cb so user can unload the module after userret if so desired */ + epoch_drain_callbacks(net_epoch_preempt); + + return (0); +} + +#ifdef INET +static int +set_algo4_sysctl_handler(SYSCTL_HANDLER_ARGS) +{ + + return (set_fib_algo(RT_DEFAULT_FIB, AF_INET, oidp, req)); +} +SYSCTL_PROC(_net_route_algo_inet, OID_AUTO, algo, + CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0, + set_algo4_sysctl_handler, "A", "Set IPv4 lookup algo"); +#endif + +#ifdef INET6 +static int +set_algo6_sysctl_handler(SYSCTL_HANDLER_ARGS) +{ + + return (set_fib_algo(RT_DEFAULT_FIB, AF_INET6, oidp, req)); +} +SYSCTL_PROC(_net_route_algo_inet6, OID_AUTO, algo, + CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0, + set_algo6_sysctl_handler, "A", "Set IPv6 lookup algo"); +#endif + +static void +destroy_fdh_epoch(epoch_context_t ctx) +{ + struct fib_dp_header *ffi; + + ffi = __containerof(ctx, struct fib_dp_header, ffi_epoch_ctx); + free(ffi, M_RTABLE); +} + +static struct fib_dp_header * +alloc_fib_dp_array(uint32_t num_tables, bool waitok) +{ + size_t 
sz; + struct fib_dp_header *ffi; + + sz = sizeof(struct fib_dp_header); + sz += sizeof(struct fib_dp) * num_tables; + ffi = malloc(sz, M_RTABLE, (waitok ? M_WAITOK : M_NOWAIT) | M_ZERO); + if (ffi != NULL) + ffi->ffi_num_tables = num_tables; + return (ffi); +} + +static struct fib_dp_header * +get_fib_dp_header(struct fib_dp *dp) +{ + + return (__containerof((void *)dp, struct fib_dp_header, ffi_idx)); +} + +/* + * Replace per-family index pool @pdp with a new one which + * contains updated callback/algo data from @fd. + * Returns 0 on success. + */ +static enum flm_op_result +replace_rtables_family(struct fib_dp **pdp, struct fib_data *fd) +{ + struct fib_dp_header *new_ffi, *old_ffi; + + NET_EPOCH_ASSERT(); + + FD_PRINTF(LOG_DEBUG, fd, "[vnet %p] replace with f:%p arg:%p", + curvnet, fd->fd_dp.f, fd->fd_dp.arg); + + MOD_LOCK(); + old_ffi = get_fib_dp_header(*pdp); + new_ffi = alloc_fib_dp_array(old_ffi->ffi_num_tables, false); + FD_PRINTF(LOG_DEBUG, fd, "OLD FDH: %p NEW FDH: %p", old_ffi, new_ffi); + if (new_ffi == NULL) { + MOD_UNLOCK(); + FD_PRINTF(LOG_WARNING, fd, "error attaching datapath"); + return (FLM_REBUILD); + } + + memcpy(&new_ffi->ffi_idx[0], &old_ffi->ffi_idx[0], + old_ffi->ffi_num_tables * sizeof(struct fib_dp)); + /* Update relevant data structure for @fd */ + new_ffi->ffi_idx[fd->fd_fibnum] = fd->fd_dp; + + /* Ensure memcpy() writes have completed */ + atomic_thread_fence_rel(); + /* Set new datapath pointer */ + *pdp = &new_ffi->ffi_idx[0]; + MOD_UNLOCK(); + FD_PRINTF(LOG_DEBUG, fd, "update %p -> %p", old_ffi, new_ffi); + + epoch_call(net_epoch_preempt, destroy_fdh_epoch, + &old_ffi->ffi_epoch_ctx); + + return (FLM_SUCCESS); +} + +static struct fib_dp ** +get_family_dp_ptr(int family) +{ + switch (family) { + case AF_INET: + return (&V_inet_dp); + case AF_INET6: + return (&V_inet6_dp); + } + return (NULL); +} + +/* + * Make datapath use fib instance @fd + */ +static enum flm_op_result +attach_datapath(struct fib_data *fd) +{ + struct fib_dp **pdp; + + pdp = get_family_dp_ptr(fd->fd_family); + return (replace_rtables_family(pdp, fd)); +} + +/* + * Grow datapath pointers array. + * Called from sysctl handler on growing number of routing tables. + */ +static void +grow_rtables_family(struct fib_dp **pdp, uint32_t new_num_tables) +{ + struct fib_dp_header *new_fdh, *old_fdh = NULL; + + new_fdh = alloc_fib_dp_array(new_num_tables, true); + + MOD_LOCK(); + if (*pdp != NULL) { + old_fdh = get_fib_dp_header(*pdp); + memcpy(&new_fdh->ffi_idx[0], &old_fdh->ffi_idx[0], + old_fdh->ffi_num_tables * sizeof(struct fib_dp)); + } + + /* Wait till all writes completed */ + atomic_thread_fence_rel(); + + *pdp = &new_fdh->ffi_idx[0]; + MOD_UNLOCK(); + + if (old_fdh != NULL) + epoch_call(net_epoch_preempt, destroy_fdh_epoch, + &old_fdh->ffi_epoch_ctx); +} + +/* + * Grows per-AF arrays of datapath pointers for each supported family. + * Called from fibs resize sysctl handler. + */ +void +fib_grow_rtables(uint32_t new_num_tables) +{ + +#ifdef INET + grow_rtables_family(get_family_dp_ptr(AF_INET), new_num_tables); +#endif +#ifdef INET6 + grow_rtables_family(get_family_dp_ptr(AF_INET6), new_num_tables); +#endif +} + +void +fib_get_rtable_info(struct rib_head *rh, struct rib_rtable_info *rinfo) +{ + + bzero(rinfo, sizeof(struct rib_rtable_info)); + rinfo->num_prefixes = rh->rnh_prefixes; + rinfo->num_nhops = nhops_get_count(rh); +#ifdef ROUTE_MPATH + rinfo->num_nhgrp = nhgrp_get_count(rh); +#endif +} + +/* + * Accessor to get rib instance @fd is attached to. 
+ */ +struct rib_head * +fib_get_rh(struct fib_data *fd) +{ + + return (fd->fd_rh); +} + +/* + * Accessor to export idx->nhop array + */ +struct nhop_object ** +fib_get_nhop_array(struct fib_data *fd) +{ + + return (fd->nh_idx); +} + +static uint32_t +get_nhop_idx(struct nhop_object *nh) +{ +#ifdef ROUTE_MPATH + if (NH_IS_NHGRP(nh)) + return (nhgrp_get_idx((struct nhgrp_object *)nh) * 2 - 1); + else + return (nhop_get_idx(nh) * 2); +#else + return (nhop_get_idx(nh)); +#endif +} + +uint32_t +fib_get_nhop_idx(struct fib_data *fd, struct nhop_object *nh) +{ + + return (get_nhop_idx(nh)); +} + +static bool +is_idx_free(struct fib_data *fd, uint32_t index) +{ + + return (fd->nh_ref_table->refcnt[index] == 0); +} + +static uint32_t +fib_ref_nhop(struct fib_data *fd, struct nhop_object *nh) +{ + uint32_t idx = get_nhop_idx(nh); + + if (idx >= fd->number_nhops) { + fd->hit_nhops = 1; + return (0); + } + + if (is_idx_free(fd, idx)) { + nhop_ref_any(nh); + fd->nh_idx[idx] = nh; + fd->nh_ref_table->count++; + FD_PRINTF(LOG_DEBUG, fd, " REF nhop %u %p", idx, fd->nh_idx[idx]); + } + fd->nh_ref_table->refcnt[idx]++; + + return (idx); +} + +struct nhop_release_data { + struct nhop_object *nh; + struct epoch_context ctx; +}; + +static void +release_nhop_epoch(epoch_context_t ctx) +{ + struct nhop_release_data *nrd; + + nrd = __containerof(ctx, struct nhop_release_data, ctx); + nhop_free_any(nrd->nh); + free(nrd, M_RTABLE); +} + +static void +fib_schedule_release_nhop(struct fib_data *fd, struct nhop_object *nh) +{ + struct nhop_release_data *nrd; + + nrd = malloc(sizeof(struct nhop_release_data), M_RTABLE, M_NOWAIT | M_ZERO); + if (nrd != NULL) { + nrd->nh = nh; + epoch_call(net_epoch_preempt, release_nhop_epoch, &nrd->ctx); + } else { + /* + * Unable to allocate memory. Leak nexthop to maintain guarantee + * that each nhop can be referenced. 
+		 */
+		FD_PRINTF(LOG_ERR, fd, "unable to schedule nhop %p deletion", nh);
+	}
+}
+
+static void
+fib_unref_nhop(struct fib_data *fd, struct nhop_object *nh)
+{
+	uint32_t idx = get_nhop_idx(nh);
+
+	KASSERT((idx < fd->number_nhops), ("invalid nhop index"));
+	KASSERT((nh == fd->nh_idx[idx]), ("index table contains wrong nh"));
+
+	fd->nh_ref_table->refcnt[idx]--;
+	if (fd->nh_ref_table->refcnt[idx] == 0) {
+		FD_PRINTF(LOG_DEBUG, fd, " FREE nhop %u %p", idx, fd->nh_idx[idx]);
+		fib_schedule_release_nhop(fd, fd->nh_idx[idx]);
+	}
+}
+
+static void
+set_algo_fixed(struct rib_head *rh)
+{
+	switch (rh->rib_family) {
+#ifdef INET
+	case AF_INET:
+		algo_fixed_inet = true;
+		break;
+#endif
+#ifdef INET6
+	case AF_INET6:
+		algo_fixed_inet6 = true;
+		break;
+#endif
+	}
+}
+
+static bool
+is_algo_fixed(struct rib_head *rh)
+{
+
+	switch (rh->rib_family) {
+#ifdef INET
+	case AF_INET:
+		return (algo_fixed_inet);
+#endif
+#ifdef INET6
+	case AF_INET6:
+		return (algo_fixed_inet6);
+#endif
+	}
+	return (false);
+}
+
+static struct fib_lookup_module *
+fib_check_best_algo(struct rib_head *rh, struct fib_lookup_module *orig_flm)
+{
+	uint8_t preference, curr_preference = 0, best_preference = 0;
+	struct fib_lookup_module *flm, *best_flm = NULL;
+	struct rib_rtable_info rinfo;
+	int candidate_algos = 0;
+
+	fib_get_rtable_info(rh, &rinfo);
+
+	MOD_LOCK();
+	if (is_algo_fixed(rh)) {
+		MOD_UNLOCK();
+		return (NULL);
+	}
+
+	TAILQ_FOREACH(flm, &all_algo_list, entries) {
+		if (flm->flm_family != rh->rib_family)
+			continue;
+		candidate_algos++;
+		preference = flm->flm_get_pref(&rinfo);
+		if (preference > best_preference) {
+			if (!flm_error_check(flm, rh->rib_fibnum)) {
+				best_preference = preference;
+				best_flm = flm;
+			}
+		}
+		if (flm == orig_flm)
+			curr_preference = preference;
+	}
+	if ((best_flm != NULL) && (curr_preference + BEST_DIFF_PERCENT < best_preference))
+		best_flm->flm_refcount++;
+	else
+		best_flm = NULL;
+	MOD_UNLOCK();
+
+	RH_PRINTF(LOG_INFO, rh, "candidate_algos: %d, curr: %s(%d) result: %s(%d)",
+	    candidate_algos, orig_flm ? orig_flm->flm_name : "NULL", curr_preference,
+	    best_flm ? best_flm->flm_name : (orig_flm ? orig_flm->flm_name : "NULL"),
+	    best_preference);
+
+	return (best_flm);
+}
+
+/*
+ * Called when a new route table is created.
+ * Selects, allocates and attaches fib algo for the table.
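+ * For example, on a freshly-created (empty) IPv4 table the bsearch4
+ * module added by this patch reports preference 253 via flm_get_pref(),
+ * so it would win the initial selection unless a higher-preference
+ * module is registered.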
+ */ +int +fib_select_algo_initial(struct rib_head *rh) +{ + struct fib_lookup_module *flm; + struct fib_data *fd = NULL; + enum flm_op_result result; + + flm = fib_check_best_algo(rh, NULL); + if (flm == NULL) { + RH_PRINTF(LOG_CRIT, rh, "no algo selected"); + return (ENOENT); + } + RH_PRINTF(LOG_INFO, rh, "selected algo %s", flm->flm_name); + + result = setup_instance(flm, rh, NULL, &fd, false); + fib_unref_algo(flm); + RH_PRINTF(LOG_DEBUG, rh, "result=%d fd=%p", result, fd); + if (result == FLM_SUCCESS) { + + /* + * Attach datapath directly to avoid multiple reallocations + * during fib growth + */ + struct fib_dp_header *fdp; + struct fib_dp **pdp; + + pdp = get_family_dp_ptr(rh->rib_family); + if (pdp != NULL) { + fdp = get_fib_dp_header(*pdp); + fdp->ffi_idx[fd->fd_fibnum] = fd->fd_dp; + FD_PRINTF(LOG_INFO, fd, "datapath attached"); + } + } + + return (0); +} + +int +fib_module_register(struct fib_lookup_module *flm) +{ + + MOD_LOCK(); + ALGO_PRINTF("attaching %s to %s", flm->flm_name, + print_family(flm->flm_family)); + TAILQ_INSERT_TAIL(&all_algo_list, flm, entries); + MOD_UNLOCK(); + + return (0); +} + +int +fib_module_unregister(struct fib_lookup_module *flm) +{ + + MOD_LOCK(); + if (flm->flm_refcount > 0) { + MOD_UNLOCK(); + return (EBUSY); + } + fib_error_clear_flm(flm); + ALGO_PRINTF("detaching %s from %s", flm->flm_name, + print_family(flm->flm_family)); + TAILQ_REMOVE(&all_algo_list, flm, entries); + MOD_UNLOCK(); + + return (0); +} + +void +vnet_fib_init(void) +{ + + TAILQ_INIT(&V_fib_data_list); +} + +static void +fib_init(void) +{ + + mtx_init(&fib_mtx, "algo list mutex", NULL, MTX_DEF); + TAILQ_INIT(&all_algo_list); +} +SYSINIT(fib_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_SECOND, fib_init, NULL); + Index: sys/net/route/route_tables.c =================================================================== --- sys/net/route/route_tables.c +++ sys/net/route/route_tables.c @@ -171,7 +171,7 @@ grow_rtables(uint32_t num_tables) { struct domain *dom; - struct rib_head **prnh; + struct rib_head **prnh, *rh; struct rib_head **new_rt_tables, **old_rt_tables; int family; @@ -188,6 +188,8 @@ "by default. Consider tuning %s if needed\n", "net.add_addr_allfibs"); + fib_grow_rtables(num_tables); + /* * Current rt_tables layout: * fib0[af0, af1, af2, .., AF_MAX]fib1[af0, af1, af2, .., Af_MAX].. 
@@ -206,10 +208,16 @@ prnh = &new_rt_tables[i * (AF_MAX + 1) + family]; if (*prnh != NULL) continue; - *prnh = dom->dom_rtattach(i); - if (*prnh == NULL) - log(LOG_ERR, "unable to create routing tables for domain %d\n", - dom->dom_family); + rh = dom->dom_rtattach(i); + if (rh == NULL) + log(LOG_ERR, "unable to create routing table for %d.%d\n", + dom->dom_family, i); + if (fib_select_algo_initial(rh) != 0) { + log(LOG_ERR, "unable to select algo for table %d.%d\n", + dom->dom_family, i); + // TODO: detach table + } + *prnh = rh; } } @@ -246,6 +254,7 @@ V_rt_numfibs = 1; vnet_rtzone_init(); + vnet_fib_init(); RTABLES_LOCK_INIT(); Index: sys/net/route/route_var.h =================================================================== --- sys/net/route/route_var.h +++ sys/net/route/route_var.h @@ -68,6 +68,7 @@ struct vnet *rib_vnet; /* vnet pointer */ int rib_family; /* AF of the rtable */ u_int rib_fibnum; /* fib number */ + bool rib_dying; /* rib is detaching */ struct callout expire_callout; /* Callout for expiring dynamic routes */ time_t next_expire; /* Next expire run ts */ uint32_t rnh_prefixes; /* Number of prefixes */ @@ -303,6 +304,13 @@ void nhgrp_ref_object(struct nhgrp_object *nhg); uint32_t nhgrp_get_idx(const struct nhgrp_object *nhg); void nhgrp_free(struct nhgrp_object *nhg); +uint32_t nhgrp_get_idx(const struct nhgrp_object *nhg); + +/* lookup_framework.c */ +void fib_grow_rtables(uint32_t new_num_tables); +int fib_select_algo_initial(struct rib_head *rh); +void fib_destroy_rib(struct rib_head *rh); +void vnet_fib_init(void); /* Entropy data used for outbound hashing */ #define MPATH_ENTROPY_KEY_LEN 40 Index: sys/net/route/test_lookup.c =================================================================== --- /dev/null +++ sys/net/route/test_lookup.c @@ -0,0 +1,322 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2020 Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */
+
+#include
+__FBSDID("$FreeBSD$");
+#include "opt_inet.h"
+#include "opt_inet6.h"
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include
+#include
+#include
+
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+#define	CHUNK_SIZE	10000
+
+VNET_DEFINE_STATIC(struct in_addr *, inet_addr_list);
+#define	V_inet_addr_list	VNET(inet_addr_list)
+VNET_DEFINE_STATIC(int, inet_list_size);
+#define	V_inet_list_size	VNET(inet_list_size)
+
+VNET_DEFINE_STATIC(struct in6_addr *, inet6_addr_list);
+#define	V_inet6_addr_list	VNET(inet6_addr_list)
+VNET_DEFINE_STATIC(int, inet6_list_size);
+#define	V_inet6_list_size	VNET(inet6_list_size)
+
+SYSCTL_DECL(_net_route);
+SYSCTL_NODE(_net_route, OID_AUTO, test, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
+    "Route lookup testing");
+
+static int
+add_addr(int family, char *addr_str)
+{
+
+	if (family == AF_INET) {
+		struct in_addr *paddr_old = V_inet_addr_list;
+		int size_old = V_inet_list_size;
+		struct in_addr addr;
+
+		if (inet_pton(AF_INET, addr_str, &addr) != 1)
+			return (EINVAL);
+
+		struct in_addr *paddr = mallocarray(size_old + 1,
+		    sizeof(struct in_addr), M_TEMP, M_ZERO | M_WAITOK);
+
+		if (paddr_old != NULL) {
+			memcpy(paddr, paddr_old, size_old * sizeof(struct in_addr));
+			free(paddr_old, M_TEMP);
+		}
+		paddr[size_old] = addr;
+
+		V_inet_addr_list = paddr;
+		V_inet_list_size = size_old + 1;
+		/* addr_str points at the caller's INET6_ADDRSTRLEN-sized buffer */
+		inet_ntop(AF_INET, &addr, addr_str, INET6_ADDRSTRLEN);
+		printf("Added %s, total size: %d\n", addr_str, size_old + 1);
+	} else if (family == AF_INET6) {
+		struct in6_addr *paddr_old = V_inet6_addr_list;
+		int size_old = V_inet6_list_size;
+		struct in6_addr addr6;
+
+		if (inet_pton(AF_INET6, addr_str, &addr6) != 1)
+			return (EINVAL);
+
+		struct in6_addr *paddr = mallocarray(size_old + 1,
+		    sizeof(struct in6_addr), M_TEMP, M_ZERO | M_WAITOK);
+
+		if (paddr_old != NULL) {
+			memcpy(paddr, paddr_old, size_old * sizeof(struct in6_addr));
+			free(paddr_old, M_TEMP);
+		}
+		paddr[size_old] = addr6;
+
+		V_inet6_addr_list = paddr;
+		V_inet6_list_size = size_old + 1;
+		/* addr_str points at the caller's INET6_ADDRSTRLEN-sized buffer */
+		inet_ntop(AF_INET6, &addr6, addr_str, INET6_ADDRSTRLEN);
+		printf("Added %s, total size: %d\n", addr_str, size_old + 1);
+	}
+
+	return (0);
+}
+
+static int
+add_addr_sysctl_handler(struct sysctl_oid *oidp, struct sysctl_req *req, int family)
+{
+	char addr_str[INET6_ADDRSTRLEN];
+	int error;
+
+	bzero(addr_str, sizeof(addr_str));
+
+	error = sysctl_handle_string(oidp, addr_str, sizeof(addr_str), req);
+	if (error != 0 || req->newptr == NULL)
+		return (error);
+
+	error = add_addr(family, addr_str);
+
+	return (error);
+}
+
+static int
+add_inet_addr_sysctl_handler(SYSCTL_HANDLER_ARGS)
+{
+
+	return (add_addr_sysctl_handler(oidp, req, AF_INET));
+}
+SYSCTL_PROC(_net_route_test, OID_AUTO, add_inet_addr,
+    CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0,
+    add_inet_addr_sysctl_handler, "A", "Add an IPv4 test address");
+
+static int
+add_inet6_addr_sysctl_handler(SYSCTL_HANDLER_ARGS)
+{
+
+	return (add_addr_sysctl_handler(oidp, req, AF_INET6));
+}
+SYSCTL_PROC(_net_route_test, OID_AUTO, add_inet6_addr,
+    CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0,
+    add_inet6_addr_sysctl_handler, "A", "Add an IPv6 test address");
+
+static uint64_t
+run_test_inet_one_pass(void)
+{
+	/* Assumes the caller has entered the network epoch */
+	int sz = V_inet_list_size;
+	int tries = CHUNK_SIZE / sz;
+	const struct in_addr *a = V_inet_addr_list;
+	uint64_t count = 0;
+
+	for (int pass = 0; pass < tries; pass++) {
+		for (int i = 0; i < sz; i++) {
+			fib4_lookup(RT_DEFAULT_FIB, a[i], 0, NHR_NONE, 0);
+			count++;
+		}
+ } + return (count); +} + +static int +run_test_inet(SYSCTL_HANDLER_ARGS) +{ + struct epoch_tracker et; + + int count = 0; + int error = sysctl_handle_int(oidp, &count, 0, req); + if (error != 0) + return (error); + + if (count == 0) + return (0); + + if (V_inet_list_size <= 0) + return (ENOENT); + + printf("run: %d packets vnet %p\n", count, curvnet); + if (count < CHUNK_SIZE) + count = CHUNK_SIZE; + + + struct timespec ts_pre, ts_post; + int64_t pass_diff, total_diff = 0; + uint64_t pass_packets, total_packets = 0; + + for (int pass = 0; pass < count / CHUNK_SIZE; pass++) { + NET_EPOCH_ENTER(et); + nanouptime(&ts_pre); + pass_packets = run_test_inet_one_pass(); + nanouptime(&ts_post); + NET_EPOCH_EXIT(et); + + pass_diff = (ts_post.tv_sec - ts_pre.tv_sec) * 1000000000 + + (ts_post.tv_nsec - ts_pre.tv_nsec); + total_diff += pass_diff; + total_packets += pass_packets; + } + + printf("%zu packets in %zu nanoseconds, %zu pps\n", + total_packets, total_diff, total_packets * 1000000000 / total_diff); + + return (0); +} +SYSCTL_PROC(_net_route_test, OID_AUTO, run_inet, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, + 0, 0, run_test_inet, "I", "Execute fib4_lookup test"); + +static uint64_t +run_test_inet6_one_pass() +{ + /* Assume epoch */ + int sz = V_inet6_list_size; + int tries = CHUNK_SIZE / sz; + const struct in6_addr *a = V_inet6_addr_list; + uint64_t count = 0; + + for (int pass = 0; pass < tries; pass++) { + for (int i = 0; i < sz; i++) { + fib6_lookup(RT_DEFAULT_FIB, &a[i], 0, NHR_NONE, 0); + count++; + } + } + return (count); +} + +static int +run_test_inet6(SYSCTL_HANDLER_ARGS) +{ + struct epoch_tracker et; + + int count = 0; + int error = sysctl_handle_int(oidp, &count, 0, req); + if (error != 0) + return (error); + + if (count == 0) + return (0); + + if (V_inet6_list_size <= 0) + return (ENOENT); + + printf("run: %d packets vnet %p\n", count, curvnet); + if (count < CHUNK_SIZE) + count = CHUNK_SIZE; + + + struct timespec ts_pre, ts_post; + int64_t pass_diff, total_diff = 0; + uint64_t pass_packets, total_packets = 0; + + for (int pass = 0; pass < count / CHUNK_SIZE; pass++) { + NET_EPOCH_ENTER(et); + nanouptime(&ts_pre); + pass_packets = run_test_inet6_one_pass(); + nanouptime(&ts_post); + NET_EPOCH_EXIT(et); + + pass_diff = (ts_post.tv_sec - ts_pre.tv_sec) * 1000000000 + + (ts_post.tv_nsec - ts_pre.tv_nsec); + total_diff += pass_diff; + total_packets += pass_packets; + } + + printf("%zu packets in %zu nanoseconds, %zu pps\n", + total_packets, total_diff, total_packets * 1000000000 / total_diff); + + return (0); +} +SYSCTL_PROC(_net_route_test, OID_AUTO, run_inet6, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, + 0, 0, run_test_inet6, "I", "Execute fib6_lookup test"); + + +static int +test_fib_lookup_modevent(module_t mod, int type, void *unused) +{ + int error = 0; + + switch (type) { + case MOD_LOAD: + break; + case MOD_UNLOAD: + if (V_inet_addr_list != NULL) + free(V_inet_addr_list, M_TEMP); + if (V_inet6_addr_list != NULL) + free(V_inet6_addr_list, M_TEMP); + break; + default: + error = EOPNOTSUPP; + break; + } + return (error); +} + +static moduledata_t testfiblookupmod = { + "test_fib_lookup", + test_fib_lookup_modevent, + 0 +}; + +DECLARE_MODULE(testfiblookupmod, testfiblookupmod, SI_SUB_PSEUDO, SI_ORDER_ANY); +MODULE_VERSION(testfiblookup, 1); Index: sys/netinet/in_fib.c =================================================================== --- sys/netinet/in_fib.c +++ sys/netinet/in_fib.c @@ -49,6 +49,7 @@ #include #include #include +#include #include #include #include @@ -63,6 
+64,10 @@ /* Assert 'struct route_in' is compatible with 'struct route' */ CHK_STRUCT_ROUTE_COMPAT(struct route_in, ro_dst4); +#ifdef ROUTE_ALGO +VNET_DEFINE(struct fib_dp *, inet_dp); +#endif + #ifdef ROUTE_MPATH struct _hash_5tuple_ipv4 { struct in_addr src; @@ -103,6 +108,29 @@ * one needs to pass NHR_REF as a flag. This will return referenced * nexthop. */ +#ifdef ROUTE_ALGO +struct nhop_object * +fib4_lookup(uint32_t fibnum, struct in_addr dst, uint32_t scopeid, + uint32_t flags, uint32_t flowid) +{ + struct nhop_object *nh; + struct fib_dp *dp = &V_inet_dp[fibnum]; + struct flm_lookup_key key = {.addr4 = dst }; + + nh = dp->f(dp->arg, key, scopeid); + if (nh != NULL) { + nh = nhop_select(nh, flowid); + /* Ensure route & ifp is UP */ + if (RT_LINK_IS_UP(nh->nh_ifp)) { + if (flags & NHR_REF) + nhop_ref_object(nh); + return (nh); + } + } + RTSTAT_INC(rts_unreach); + return (NULL); +} +#else struct nhop_object * fib4_lookup(uint32_t fibnum, struct in_addr dst, uint32_t scopeid, uint32_t flags, uint32_t flowid) @@ -142,6 +170,7 @@ RTSTAT_INC(rts_unreach); return (NULL); } +#endif inline static int check_urpf_nhop(const struct nhop_object *nh, uint32_t flags, @@ -180,6 +209,7 @@ return (check_urpf_nhop(nh, flags, src_if)); } +#ifndef ROUTE_ALGO static struct nhop_object * lookup_nhop(uint32_t fibnum, struct in_addr dst, uint32_t scopeid) { @@ -208,6 +238,7 @@ return (nh); } +#endif /* * Performs reverse path forwarding lookup. @@ -223,8 +254,14 @@ uint32_t flags, const struct ifnet *src_if) { struct nhop_object *nh; +#ifdef ROUTE_ALGO + struct fib_dp *dp = &V_inet_dp[fibnum]; + struct flm_lookup_key key = {.addr4 = dst }; + nh = dp->f(dp->arg, key, scopeid); +#else nh = lookup_nhop(fibnum, dst, scopeid); +#endif if (nh != NULL) return (check_urpf(nh, flags, src_if)); Index: sys/netinet/in_fib_algo.c =================================================================== --- /dev/null +++ sys/netinet/in_fib_algo.c @@ -0,0 +1,601 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2020 Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */
+
+#include
+__FBSDID("$FreeBSD$");
+#include "opt_inet.h"
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+
+struct bsearch4_record {
+	uint32_t addr4;
+	uint32_t mask4;
+	struct nhop_object *nh;
+};
+
+struct bsearch4_data {
+	struct fib_data *fd;
+	uint32_t alloc_items;
+	uint32_t num_items;
+	void *mem;
+	struct bsearch4_record *rr;
+	struct bsearch4_record br[0];
+};
+
+static struct nhop_object *
+bsearch4_lookup(void *algo_data, const struct flm_lookup_key key, uint32_t scopeid)
+{
+	const struct bsearch4_data *bd = (const struct bsearch4_data *)algo_data;
+	const struct bsearch4_record *br;
+	uint32_t addr4 = ntohl(key.addr4.s_addr);
+
+	int start = 0;
+	int end = bd->num_items;
+
+	while (start + 1 < end) {
+		int i = (start + end) / 2;
+		br = &bd->br[i];
+		if (addr4 < br->addr4) {
+			/* key < average, reduce right boundary */
+			end = i;
+			continue;
+		} else if (addr4 > br->addr4) {
+			/* key > average, increase left boundary */
+			start = i;
+			continue;
+		} else {
+			/* direct match */
+			return (br->nh);
+		}
+	}
+	/* start + 1 == end */
+	return (bd->br[start].nh);
+}
+
+static uint8_t
+bsearch4_get_pref(const struct rib_rtable_info *rinfo)
+{
+
+	if (rinfo->num_prefixes < 10)
+		return (253);
+	else if (rinfo->num_prefixes < 1000)
+		return (255 - rinfo->num_prefixes / 4);
+	else
+		return (1);
+}
+
+static enum flm_op_result
+bsearch4_init(uint32_t fibnum, struct fib_data *fd, void *_old_data, void **_data)
+{
+	struct bsearch4_data *bd;
+	struct rib_rtable_info rinfo;
+	uint32_t count;
+	size_t sz;
+	void *mem;
+
+	fib_get_rtable_info(fib_get_rh(fd), &rinfo);
+	count = rinfo.num_prefixes * 11 / 10 + 64;
+
+	sz = sizeof(struct bsearch4_data) + sizeof(struct bsearch4_record) * count;
+	/* add cache line sz to ease alignment */
+	sz += CACHE_LINE_SIZE;
+	mem = malloc(sz, M_RTABLE, M_NOWAIT | M_ZERO);
+	if (mem == NULL)
+		return (FLM_REBUILD);
+	/* Align the datapath-visible structure to a cache line boundary */
+	bd = (struct bsearch4_data *)roundup2((uintptr_t)mem, CACHE_LINE_SIZE);
+	bd->mem = mem;
+	bd->alloc_items = count;
+	bd->fd = fd;
+
+	*_data = bd;
+
+	/* Temporary array to collect the raw rtable prefixes */
+	bd->rr = malloc(sizeof(struct bsearch4_record) * count, M_TEMP, M_NOWAIT | M_ZERO);
+	if (bd->rr == NULL)
+		return (FLM_REBUILD);
+
+	return (FLM_SUCCESS);
+}
+
+static void
+bsearch4_destroy(void *_data)
+{
+	struct bsearch4_data *bd = (struct bsearch4_data *)_data;
+
+	if (bd->rr != NULL)
+		free(bd->rr, M_TEMP);
+	free(bd->mem, M_RTABLE);
+}
+
+static enum flm_op_result
+bsearch4_add_route_cb(struct rtentry *rt, void *_data)
+{
+	struct bsearch4_data *bd = (struct bsearch4_data *)_data;
+	struct bsearch4_record *rr;
+	struct nhop_object *nh;
+	struct in_addr addr4, mask4;
+	uint32_t scopeid;
+
+	nh = rt_get_raw_nhop(rt);
+
+	if (bd->num_items >= bd->alloc_items)
+		return (FLM_REBUILD);
+
+	rr = &bd->rr[bd->num_items++];
+	rt_get_inet_prefix_pmask(rt, &addr4, &mask4, &scopeid);
+	rr->addr4 = ntohl(addr4.s_addr);
+	rr->mask4 = ntohl(mask4.s_addr);
+	rr->nh = nh;
+
+	return (FLM_SUCCESS);
+}
+
+static int
+rr_cmp(const void *_rec1, const void *_rec2)
+{
+	const struct bsearch4_record *rec1, *rec2;
+	rec1 = _rec1;
+	rec2 = _rec2;
+
+	if (rec1->addr4 < rec2->addr4)
+		return (-1);
+	else if (rec1->addr4 > rec2->addr4)
+		return (1);
+
+	/*
+	 * Wider mask value is the lesser mask:
+	 * we want less specific prefixes to come first, e.g. /24 < /25
+	 */
+	if (rec1->mask4 < rec2->mask4)
+		return (-1);
+	else if (rec1->mask4 > rec2->mask4)
+		return (1);
+	return (0);
+}
+
+/*
+ * Emit a range for the prefix being popped off the stack (@pst):
+ * it starts right past the last address already covered and runs
+ * to the last address of @pst.
+ */
+static bool
+close_stack(struct bsearch4_data *bd, struct bsearch4_record *pst)
+{
+
+	if (bd->num_items >= bd->alloc_items)
+		return (false);
+	struct bsearch4_record *br = &bd->br[bd->num_items];
+	struct bsearch4_record *br_prev = &bd->br[bd->num_items - 1];
+	uint32_t last_prev = (br_prev->addr4 | ~br_prev->mask4);
+	uint32_t last_rec = (pst->addr4 | ~pst->mask4);
+
+	if (last_rec > last_prev) {
+		br->addr4 = last_prev + 1;
+		br->mask4 = pst->mask4;
+		br->nh = pst->nh;
+		bd->num_items++;
+	}
+
+	return (true);
+}
+
+static enum flm_op_result
+bsearch4_build_array(struct bsearch4_data *bd, int num_items)
+{
+	struct bsearch4_record *br;
+	struct bsearch4_record *rr_stack;
+	int stack_off = 0;
+
+	/* alloc stack of size 32: one slot per nesting level */
+	rr_stack = malloc(32 * sizeof(struct bsearch4_record), M_TEMP, M_NOWAIT | M_ZERO);
+	if (rr_stack == NULL)
+		return (FLM_REBUILD);
+
+	for (int i = 0; i < num_items; i++) {
+		struct bsearch4_record *rib_entry = &bd->rr[i];
+
+		bool diverged = false;
+		while (stack_off > 0) {
+			struct bsearch4_record *pst = &rr_stack[stack_off - 1];
+
+			/*
+			 * Check if we need to pop the stack.
+			 * Rely on the ordering: less specific prefixes come first.
+			 */
+			bool match = pst->addr4 == (rib_entry->addr4 & pst->mask4);
+
+			if (match && !diverged)
+				break;
+
+			if (!close_stack(bd, pst)) {
+				free(rr_stack, M_TEMP);
+				return (FLM_REBUILD);
+			}
+
+			if (!match) {
+				stack_off--;
+				diverged = true;
+			} else
+				break;
+		}
+
+		/* Guard both the result array and the 32-deep parent stack */
+		if (bd->num_items >= bd->alloc_items || stack_off >= 32) {
+			free(rr_stack, M_TEMP);
+			return (FLM_REBUILD);
+		}
+		br = &bd->br[bd->num_items++];
+		br->addr4 = rib_entry->addr4;
+		br->mask4 = rib_entry->mask4;
+		br->nh = rib_entry->nh;
+		rr_stack[stack_off++] = *rib_entry;
+	}
+
+	while (stack_off > 0) {
+		close_stack(bd, &rr_stack[stack_off - 1]);
+		stack_off--;
+	}
+	free(rr_stack, M_TEMP);
+
+	return (FLM_SUCCESS);
+}
+
+static enum flm_op_result
+bsearch4_build(struct bsearch4_data *bd)
+{
+	enum flm_op_result ret;
+	int num_items;
+
+	/* Add a default route record if the RIB does not have one */
+	bool default_found = false;
+	for (int i = 0; i < bd->num_items; i++) {
+		if (bd->rr[i].mask4 == 0) {
+			default_found = true;
+			break;
+		}
+	}
+	if (!default_found) {
+		if (bd->num_items >= bd->alloc_items)
+			return (FLM_REBUILD);
+		/* rr is zeroed, so this record is 0.0.0.0/0 with a NULL nhop */
+		bd->num_items++;
+	}
+
+	/* Sort prefixes: by address, less specific first */
+	qsort(bd->rr, bd->num_items, sizeof(struct bsearch4_record), rr_cmp);
+	num_items = bd->num_items;
+	bd->num_items = 0;
+
+	ret = bsearch4_build_array(bd, num_items);
+
+	free(bd->rr, M_TEMP);
+	bd->rr = NULL;
+	return (ret);
+}
+
+static enum flm_op_result
+bsearch4_end_dump(void *_data, struct fib_dp *dp)
+{
+	struct bsearch4_data *bd = (struct bsearch4_data *)_data;
+	enum flm_op_result ret;
+
+	ret = bsearch4_build(bd);
+	if (ret == FLM_SUCCESS) {
+		dp->f = bsearch4_lookup;
+		dp->arg = bd;
+	}
+
+	return (ret);
+}
+
+static enum flm_op_result
+bsearch4_change_cb(struct rib_head *rnh, struct rib_cmd_info *rc,
+    void *_data)
+{
+
+	return (FLM_REBUILD);
+}
+
+struct fib_lookup_module flm_bsearch4 = {
+	.flm_name = "bsearch4",
+	.flm_family = AF_INET,
+	.flm_init_cb = bsearch4_init,
+	.flm_destroy_cb = bsearch4_destroy,
+	.flm_dump_rib_item_cb = bsearch4_add_route_cb,
+	.flm_dump_end_cb = bsearch4_end_dump,
+	.flm_change_rib_item_cb = bsearch4_change_cb,
+	.flm_get_pref = bsearch4_get_pref,
+};
+
+#define KEY_LEN_INET (offsetof(struct sockaddr_in, sin_addr) + sizeof(in_addr_t))
+#define OFF_LEN_INET (8 * offsetof(struct sockaddr_in, sin_addr))
+struct
radix4_addr_entry { + struct radix_node rn[2]; + struct sockaddr_in addr; + struct nhop_object *nhop; +}; +#define LRADIX4_ITEM_SZ roundup2(sizeof(struct radix4_addr_entry), 64) + +struct lradix4_data { + struct radix_node_head *rnh; + struct fib_data *fd; + void *mem; + uint32_t alloc_items; + uint32_t num_items; +}; + +static struct nhop_object * +lradix4_lookup(void *algo_data, const struct flm_lookup_key key, uint32_t scopeid) +{ + struct radix_node_head *rnh = (struct radix_node_head *)algo_data; + struct radix4_addr_entry *ent; + struct sockaddr_in addr4 = { + .sin_len = KEY_LEN_INET, + .sin_addr = key.addr4, + }; + ent = (struct radix4_addr_entry *)(rnh->rnh_matchaddr(&addr4, &rnh->rh)); + if (ent != NULL) + return (ent->nhop); + return (NULL); +} + +static uint8_t +lradix4_get_pref(const struct rib_rtable_info *rinfo) +{ + + if (rinfo->num_prefixes < 10) + return (250); + else if (rinfo->num_prefixes < 1000) + return (254 - rinfo->num_prefixes / 4); + else + return (1); +} + +static enum flm_op_result +lradix4_init(uint32_t fibnum, struct fib_data *fd, void *_old_data, void **_data) +{ + struct lradix4_data *lr; + struct rib_rtable_info rinfo; + uint32_t count; + + lr = malloc(sizeof(struct lradix4_data), M_RTABLE, M_NOWAIT | M_ZERO); + if (lr == NULL || !rn_inithead((void **)&lr->rnh, OFF_LEN_INET)) + return (FLM_REBUILD); + fib_get_rtable_info(fib_get_rh(fd), &rinfo); + + count = rinfo.num_prefixes * 11 / 10; + // XXX: alignment! + lr->mem = malloc(count * LRADIX4_ITEM_SZ, M_RTABLE, M_NOWAIT | M_ZERO); + if (lr->mem == NULL) + return (FLM_REBUILD); + lr->alloc_items = count; + lr->fd = fd; + + *_data = lr; + + return (FLM_SUCCESS); +} + +static void +lradix4_destroy(void *_data) +{ + struct lradix4_data *lr = (struct lradix4_data *)_data; + + if (lr->rnh != NULL) + rn_detachhead((void **)&lr->rnh); + if (lr->mem != NULL) + free(lr->mem, M_RTABLE); + free(lr, M_RTABLE); +} + +static enum flm_op_result +lradix4_add_route_cb(struct rtentry *rt, void *_data) +{ + struct lradix4_data *lr = (struct lradix4_data *)_data; + struct radix4_addr_entry *ae; + struct sockaddr_in mask; + struct sockaddr *rt_mask = NULL; + struct radix_node *rn; + struct in_addr addr4, mask4; + uint32_t scopeid; + + if (lr->num_items >= lr->alloc_items) + return (FLM_REBUILD); + + ae = (struct radix4_addr_entry *)((char *)lr->mem + lr->num_items * LRADIX4_ITEM_SZ); + lr->num_items++; + + ae->nhop = rt_get_raw_nhop(rt); + + rt_get_inet_prefix_pmask(rt, &addr4, &mask4, &scopeid); + ae->addr.sin_len = KEY_LEN_INET; + ae->addr.sin_addr = addr4; + + if (mask4.s_addr != INADDR_ANY) { + bzero(&mask, sizeof(mask)); + mask.sin_len = KEY_LEN_INET; + mask.sin_addr = mask4; + rt_mask = (struct sockaddr *)&mask; + } + + rn = lr->rnh->rnh_addaddr((struct sockaddr *)&ae->addr, rt_mask, + &lr->rnh->rh, ae->rn); + if (rn == NULL) + return (FLM_REBUILD); + + return (FLM_SUCCESS); +} + +static enum flm_op_result +lradix4_end_dump(void *_data, struct fib_dp *dp) +{ + struct lradix4_data *lr = (struct lradix4_data *)_data; + + dp->f = lradix4_lookup; + dp->arg = lr->rnh; + + return (FLM_SUCCESS); +} + +static enum flm_op_result +lradix4_change_cb(struct rib_head *rnh, struct rib_cmd_info *rc, + void *_data) +{ + + return (FLM_REBUILD); +} + +struct fib_lookup_module flm_radix4_lockless = { + .flm_name = "radix4_lockless", + .flm_family = AF_INET, + .flm_init_cb = lradix4_init, + .flm_destroy_cb = lradix4_destroy, + .flm_dump_rib_item_cb = lradix4_add_route_cb, + .flm_dump_end_cb = lradix4_end_dump, + .flm_change_rib_item_cb = 
lradix4_change_cb, + .flm_get_pref = lradix4_get_pref, +}; + + +struct radix4_data { + struct fib_data *fd; + struct rib_head *rh; +}; + +static struct nhop_object * +radix4_lookup(void *algo_data, const struct flm_lookup_key key, uint32_t scopeid) +{ + RIB_RLOCK_TRACKER; + struct rib_head *rh = (struct rib_head *)algo_data; + struct radix_node *rn; + struct nhop_object *nh; + + /* Prepare lookup key */ + struct sockaddr_in sin4 = { + .sin_family = AF_INET, + .sin_len = sizeof(struct sockaddr_in), + .sin_addr = key.addr4, + }; + + nh = NULL; + RIB_RLOCK(rh); + rn = rh->rnh_matchaddr((void *)&sin4, &rh->head); + if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) + nh = (RNTORT(rn))->rt_nhop; + RIB_RUNLOCK(rh); + + return (nh); +} + +static uint8_t +radix4_get_pref(const struct rib_rtable_info *rinfo) +{ + + return (50); +} + +static enum flm_op_result +radix4_init(uint32_t fibnum, struct fib_data *fd, void *_old_data, void **_data) +{ + struct radix4_data *r4; + + r4 = malloc(sizeof(struct radix4_data), M_RTABLE, M_NOWAIT | M_ZERO); + if (r4 == NULL) + return (FLM_REBUILD); + r4->fd = fd; + r4->rh = fib_get_rh(fd); + + *_data = r4; + + return (FLM_SUCCESS); +} + +static void +radix4_destroy(void *_data) +{ + + free(_data, M_RTABLE); +} + +static enum flm_op_result +radix4_add_route_cb(struct rtentry *rt, void *_data) +{ + + return (FLM_SUCCESS); +} + +static enum flm_op_result +radix4_end_dump(void *_data, struct fib_dp *dp) +{ + struct radix4_data *r4 = (struct radix4_data *)_data; + + dp->f = radix4_lookup; + dp->arg = r4->rh; + + return (FLM_SUCCESS); +} + +static enum flm_op_result +radix4_change_cb(struct rib_head *rnh, struct rib_cmd_info *rc, + void *_data) +{ + + return (FLM_SUCCESS); +} + +struct fib_lookup_module flm_radix4 = { + .flm_name = "radix4", + .flm_family = AF_INET, + .flm_init_cb = radix4_init, + .flm_destroy_cb = radix4_destroy, + .flm_dump_rib_item_cb = radix4_add_route_cb, + .flm_dump_end_cb = radix4_end_dump, + .flm_change_rib_item_cb = radix4_change_cb, + .flm_get_pref = radix4_get_pref, +}; + +static void +fib4_algo_init(void) +{ + + fib_module_register(&flm_bsearch4); + fib_module_register(&flm_radix4_lockless); + fib_module_register(&flm_radix4); +} +SYSINIT(fib4_algo_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, fib4_algo_init, NULL); Index: sys/netinet6/in6_fib.h =================================================================== --- sys/netinet6/in6_fib.h +++ sys/netinet6/in6_fib.h @@ -44,6 +44,8 @@ uint32_t scopeid, uint32_t flags, struct route_nhop_data *rnd); struct nhop_object *fib6_lookup_debugnet(uint32_t fibnum, const struct in6_addr *dst6, uint32_t scopeid, uint32_t flags); +struct nhop_object *fib6_radix_lookup_nh(uint32_t fibnum, + const struct in6_addr *dst6, uint32_t scopeid); uint32_t fib6_calc_software_hash(const struct in6_addr *src, const struct in6_addr *dst, unsigned short src_port, unsigned short dst_port, char proto, uint32_t *phashtype); Index: sys/netinet6/in6_fib.c =================================================================== --- sys/netinet6/in6_fib.c +++ sys/netinet6/in6_fib.c @@ -50,6 +50,7 @@ #include #include #include +#include #include #include #include @@ -69,6 +70,10 @@ CHK_STRUCT_ROUTE_COMPAT(struct route_in6, ro_dst); +#ifdef ROUTE_ALGO +VNET_DEFINE(struct fib_dp *, inet6_dp); +#endif + #ifdef ROUTE_MPATH struct _hash_5tuple_ipv6 { struct in6_addr src; @@ -111,6 +116,29 @@ * one needs to pass NHR_REF as a flag. This will return referenced * nexthop. 
 */
+#ifdef ROUTE_ALGO
+struct nhop_object *
+fib6_lookup(uint32_t fibnum, const struct in6_addr *dst6,
+    uint32_t scopeid, uint32_t flags, uint32_t flowid)
+{
+	struct nhop_object *nh;
+	struct fib_dp *dp = &V_inet6_dp[fibnum];
+	struct flm_lookup_key key = {.addr6 = dst6 };
+
+	nh = dp->f(dp->arg, key, scopeid);
+	if (nh != NULL) {
+		nh = nhop_select(nh, flowid);
+		/* Ensure route & ifp is UP */
+		if (RT_LINK_IS_UP(nh->nh_ifp)) {
+			if (flags & NHR_REF)
+				nhop_ref_object(nh);
+			return (nh);
+		}
+	}
+	RTSTAT_INC(rts_unreach);
+	return (NULL);
+}
+#else
 struct nhop_object *
 fib6_lookup(uint32_t fibnum, const struct in6_addr *dst6,
     uint32_t scopeid, uint32_t flags, uint32_t flowid)
@@ -151,6 +179,7 @@
 	RTSTAT_INC(rts_unreach);
 	return (NULL);
 }
+#endif
 
 inline static int
 check_urpf_nhop(const struct nhop_object *nh, uint32_t flags,
@@ -237,8 +266,14 @@
 	uint32_t scopeid, uint32_t flags, const struct ifnet *src_if)
 {
 	struct nhop_object *nh;
+#ifdef ROUTE_ALGO
+	struct fib_dp *dp = &V_inet6_dp[fibnum];
+	struct flm_lookup_key key = {.addr6 = dst6 };
 
+	nh = dp->f(dp->arg, key, scopeid);
+#else
 	nh = lookup_nhop(fibnum, dst6, scopeid);
+#endif
 	if (nh != NULL)
 		return (check_urpf(nh, flags, src_if));
 	return (0);
Index: sys/netinet6/in6_fib_algo.c
===================================================================
--- /dev/null
+++ sys/netinet6/in6_fib_algo.c
@@ -0,0 +1,349 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2020 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */ + +#include +__FBSDID("$FreeBSD$"); +#include "opt_inet6.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#define RTDEBUG + +#define KEY_LEN_INET6 (offsetof(struct sa_in6, sin6_addr) + sizeof(struct in6_addr)) +#define OFF_LEN_INET6 (8 * offsetof(struct sa_in6, sin6_addr)) +struct sa_in6 { + uint8_t sin6_len; + uint8_t sin6_family; + uint8_t pad[2]; + struct in6_addr sin6_addr; +}; +struct radix6_addr_entry { + struct radix_node rn[2]; + struct sa_in6 addr; + struct nhop_object *nhop; +}; +#define LRADIX6_ITEM_SZ roundup2(sizeof(struct radix6_addr_entry), CACHE_LINE_SIZE) + +struct lradix6_data { + struct radix_node_head *rnh; + struct fib_data *fd; + void *mem; // raw radix_mem pointer to free + void *radix_mem; + uint32_t alloc_items; + uint32_t num_items; +}; + +static struct nhop_object * +lradix6_lookup(void *algo_data, const struct flm_lookup_key key, uint32_t scopeid) +{ + struct radix_node_head *rnh = (struct radix_node_head *)algo_data; + struct radix6_addr_entry *ent; + struct sa_in6 addr6 = { + .sin6_len = KEY_LEN_INET6, + .sin6_addr = *key.addr6, + }; + if (IN6_IS_SCOPE_LINKLOCAL(key.addr6)) + addr6.sin6_addr.s6_addr16[1] = htons(scopeid & 0xffff); + ent = (struct radix6_addr_entry *)(rnh->rnh_matchaddr(&addr6, &rnh->rh)); + if (ent != NULL) + return (ent->nhop); + return (NULL); +} + +static uint8_t +lradix6_get_pref(const struct rib_rtable_info *rinfo) +{ + + if (rinfo->num_prefixes < 10) + return (255); + else if (rinfo->num_prefixes < 100000) + return (255 - rinfo->num_prefixes / 394); + else + return (1); +} + +static enum flm_op_result +lradix6_init(uint32_t fibnum, struct fib_data *fd, void *_old_data, void **_data) +{ + struct lradix6_data *lr; + struct rib_rtable_info rinfo; + uint32_t count; + void *mem; + + lr = malloc(sizeof(struct lradix6_data), M_RTABLE, M_NOWAIT | M_ZERO); + if (lr == NULL || !rn_inithead((void **)&lr->rnh, OFF_LEN_INET6)) + return (FLM_REBUILD); + fib_get_rtable_info(fib_get_rh(fd), &rinfo); + + count = rinfo.num_prefixes * 11 / 10; + // count+1 adds at least 1 cache line + mem = malloc((count + 1) * LRADIX6_ITEM_SZ, M_RTABLE, M_NOWAIT | M_ZERO); + if (mem == NULL) + return (FLM_REBUILD); + lr->mem = mem; + lr->radix_mem = (void *)roundup2((uintptr_t)mem, CACHE_LINE_SIZE); + lr->alloc_items = count; + lr->fd = fd; + + *_data = lr; + + return (FLM_SUCCESS); +} + +static void +lradix6_destroy(void *_data) +{ + struct lradix6_data *lr = (struct lradix6_data *)_data; + + if (lr->rnh != NULL) + rn_detachhead((void **)&lr->rnh); + if (lr->mem != NULL) + free(lr->mem, M_RTABLE); + free(lr, M_RTABLE); +} + +static enum flm_op_result +lradix6_add_route_cb(struct rtentry *rt, void *_data) +{ + struct lradix6_data *lr = (struct lradix6_data *)_data; + struct radix6_addr_entry *ae; + struct sockaddr_in6 *rt_dst, *rt_mask; + struct sa_in6 mask; + struct radix_node *rn; + struct nhop_object *nh; + + nh = rt_get_raw_nhop(rt); + + if (lr->num_items >= lr->alloc_items) + return (FLM_REBUILD); + + ae = (struct radix6_addr_entry *)((char *)lr->radix_mem + lr->num_items * LRADIX6_ITEM_SZ); + lr->num_items++; + + ae->nhop = nh; + + rt_dst = (struct sockaddr_in6 *)rt_key(rt); + rt_mask = (struct sockaddr_in6 *)rt_mask(rt); + + ae->addr.sin6_len = KEY_LEN_INET6; + ae->addr.sin6_addr = rt_dst->sin6_addr; + + if 
(rt_mask != NULL) {
+		bzero(&mask, sizeof(mask));
+		mask.sin6_len = KEY_LEN_INET6;
+		mask.sin6_addr = rt_mask->sin6_addr;
+		rt_mask = (struct sockaddr_in6 *)&mask;
+	}
+
+	rn = lr->rnh->rnh_addaddr((struct sockaddr *)&ae->addr,
+	    (struct sockaddr *)rt_mask, &lr->rnh->rh, ae->rn);
+	if (rn == NULL)
+		return (FLM_REBUILD);
+
+	return (FLM_SUCCESS);
+}
+
+static enum flm_op_result
+lradix6_end_dump(void *_data, struct fib_dp *dp)
+{
+	struct lradix6_data *lr = (struct lradix6_data *)_data;
+
+	dp->f = lradix6_lookup;
+	dp->arg = lr->rnh;
+
+	return (FLM_SUCCESS);
+}
+
+static enum flm_op_result
+lradix6_change_cb(struct rib_head *rnh, struct rib_cmd_info *rc,
+    void *_data)
+{
+
+	return (FLM_REBUILD);
+}
+
+struct fib_lookup_module flm_radix6_lockless = {
+	.flm_name = "radix6_lockless",
+	.flm_family = AF_INET6,
+	.flm_init_cb = lradix6_init,
+	.flm_destroy_cb = lradix6_destroy,
+	.flm_dump_rib_item_cb = lradix6_add_route_cb,
+	.flm_dump_end_cb = lradix6_end_dump,
+	.flm_change_rib_item_cb = lradix6_change_cb,
+	.flm_get_pref = lradix6_get_pref,
+};
+
+
+struct radix6_data {
+	struct fib_data *fd;
+	struct rib_head *rh;
+};
+
+static struct nhop_object *
+radix6_lookup(void *algo_data, const struct flm_lookup_key key, uint32_t scopeid)
+{
+	RIB_RLOCK_TRACKER;
+	struct rib_head *rh = (struct rib_head *)algo_data;
+	struct radix_node *rn;
+	struct nhop_object *nh;
+
+	/* Prepare lookup key */
+	struct sockaddr_in6 sin6 = {
+		.sin6_family = AF_INET6,
+		.sin6_len = sizeof(struct sockaddr_in6),
+		.sin6_addr = *key.addr6,
+	};
+	if (IN6_IS_SCOPE_LINKLOCAL(key.addr6))
+		sin6.sin6_addr.s6_addr16[1] = htons(scopeid & 0xffff);
+
+	nh = NULL;
+	RIB_RLOCK(rh);
+	rn = rh->rnh_matchaddr((void *)&sin6, &rh->head);
+	if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0))
+		nh = (RNTORT(rn))->rt_nhop;
+	RIB_RUNLOCK(rh);
+
+	return (nh);
+}
+
+struct nhop_object *
+fib6_radix_lookup_nh(uint32_t fibnum, const struct in6_addr *dst6, uint32_t scopeid)
+{
+	struct rib_head *rh = rt_tables_get_rnh(fibnum, AF_INET6);
+	const struct flm_lookup_key key = { .addr6 = dst6 };
+
+	if (rh == NULL)
+		return (NULL);
+
+	return (radix6_lookup(rh, key, scopeid));
+}
+
+static uint8_t
+radix6_get_pref(const struct rib_rtable_info *rinfo)
+{
+
+	return (50);
+}
+
+static enum flm_op_result
+radix6_init(uint32_t fibnum, struct fib_data *fd, void *_old_data, void **_data)
+{
+	struct radix6_data *r6;
+
+	r6 = malloc(sizeof(struct radix6_data), M_RTABLE, M_NOWAIT | M_ZERO);
+	if (r6 == NULL)
+		return (FLM_REBUILD);
+	r6->fd = fd;
+	r6->rh = fib_get_rh(fd);
+
+	*_data = r6;
+
+	return (FLM_SUCCESS);
+}
+
+static void
+radix6_destroy(void *_data)
+{
+
+	free(_data, M_RTABLE);
+}
+
+static enum flm_op_result
+radix6_add_route_cb(struct rtentry *rt, void *_data)
+{
+
+	return (FLM_SUCCESS);
+}
+
+static enum flm_op_result
+radix6_end_dump(void *_data, struct fib_dp *dp)
+{
+	struct radix6_data *r6 = (struct radix6_data *)_data;
+
+	dp->f = radix6_lookup;
+	dp->arg = r6->rh;
+
+	return (FLM_SUCCESS);
+}
+
+static enum flm_op_result
+radix6_change_cb(struct rib_head *rnh, struct rib_cmd_info *rc,
+    void *_data)
+{
+
+	return (FLM_SUCCESS);
+}
+
+struct fib_lookup_module flm_radix6 = {
+	.flm_name = "radix6",
+	.flm_family = AF_INET6,
+	.flm_init_cb = radix6_init,
+	.flm_destroy_cb = radix6_destroy,
+	.flm_dump_rib_item_cb = radix6_add_route_cb,
+	.flm_dump_end_cb = radix6_end_dump,
+	.flm_change_rib_item_cb = radix6_change_cb,
+	.flm_get_pref = radix6_get_pref,
+};
+
+static void
+fib6_algo_init(void)
+{
+
+	fib_module_register(&flm_radix6_lockless);
+	fib_module_register(&flm_radix6);
+}
+SYSINIT(fib6_algo_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, fib6_algo_init, NULL);
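
--------------------------------------------------------------------------

Reviewer note (not part of the patch). With a kernel built with "options
ROUTE_ALGO" and the module from sys/modules/test_lookup loaded, the benchmark
sysctls added above can be driven with something like the following; this
assumes the net.route.test node is declared earlier in test_lookup.c, as the
SYSCTL_PROC parents suggest:

# kldload test_lookup
# sysctl net.route.test.run_inet=1000000
# sysctl net.route.test.run_inet6=1000000

To make the bsearch4 structure easier to review: bsearch4_build() flattens
the sorted prefix list into an array of non-overlapping address ranges, so
bsearch4_lookup() only has to binary-search for the last range whose start
address is <= the key. Below is a minimal userland sketch of that lookup
invariant on a hand-flattened table; struct range4 and lookup_range4() are
illustrative names only, and plain strings stand in for nhop objects.

#include <stdint.h>
#include <stdio.h>

struct range4 {
	uint32_t start;		/* first address covered, host byte order */
	const char *nh;		/* nexthop placeholder */
};

/* Binary search for the last range with start <= addr, as in bsearch4_lookup(). */
static const char *
lookup_range4(const struct range4 *r, int n, uint32_t addr)
{
	int start = 0, end = n;

	while (start + 1 < end) {
		int i = (start + end) / 2;

		if (addr < r[i].start)
			end = i;	/* key is below the midpoint range */
		else
			start = i;	/* key is at or above the midpoint range */
	}
	return (r[start].nh);
}

int
main(void)
{
	/*
	 * Flattened view of 0.0.0.0/0 -> "default", 10.0.0.0/8 -> "nh1",
	 * 10.1.0.0/16 -> "nh2": the /16 splits the covering /8, and the
	 * tail of the /8 resumes after the /16 ends (cf. close_stack()).
	 */
	struct range4 rngs[] = {
		{ 0x00000000, "default" },	/* 0.0.0.0  - 9.255.255.255 */
		{ 0x0a000000, "nh1" },		/* 10.0.0.0 - 10.0.255.255 */
		{ 0x0a010000, "nh2" },		/* 10.1.0.0 - 10.1.255.255 */
		{ 0x0a020000, "nh1" },		/* 10.2.0.0 - 10.255.255.255 */
		{ 0x0b000000, "default" },	/* 11.0.0.0 and up */
	};
	int n = sizeof(rngs) / sizeof(rngs[0]);

	printf("10.1.2.3   -> %s\n", lookup_range4(rngs, n, 0x0a010203));	/* nh2 */
	printf("10.200.0.1 -> %s\n", lookup_range4(rngs, n, 0x0ac80001));	/* nh1 */
	printf("192.0.2.1  -> %s\n", lookup_range4(rngs, n, 0xc0000201));	/* default */
	return (0);
}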