Changeset View
Changeset View
Standalone View
Standalone View
sys/net/route/route_algo.c
- This file was added.
/*- | |||||
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD | |||||
* | |||||
* Copyright (c) 2020 Alexander V. Chernikov | |||||
* | |||||
* Redistribution and use in source and binary forms, with or without | |||||
* modification, are permitted provided that the following conditions | |||||
* are met: | |||||
* 1. Redistributions of source code must retain the above copyright | |||||
* notice, this list of conditions and the following disclaimer. | |||||
* 2. Redistributions in binary form must reproduce the above copyright | |||||
* notice, this list of conditions and the following disclaimer in the | |||||
* documentation and/or other materials provided with the distribution. | |||||
* | |||||
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND | |||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE | |||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | |||||
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | |||||
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | |||||
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | |||||
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | |||||
* SUCH DAMAGE. | |||||
*/ | |||||
#define RTDEBUG | |||||
#include <sys/cdefs.h> | |||||
__FBSDID("$FreeBSD$"); | |||||
#include "opt_inet.h" | |||||
#include "opt_inet6.h" | |||||
#include "opt_route.h" | |||||
#include <sys/param.h> | |||||
#include <sys/eventhandler.h> | |||||
#include <sys/kernel.h> | |||||
#include <sys/sbuf.h> | |||||
#include <sys/lock.h> | |||||
#include <sys/rmlock.h> | |||||
#include <sys/malloc.h> | |||||
#include <sys/mbuf.h> | |||||
#include <sys/module.h> | |||||
#include <sys/kernel.h> | |||||
#include <sys/priv.h> | |||||
#include <sys/proc.h> | |||||
#include <sys/socket.h> | |||||
#include <sys/socketvar.h> | |||||
#include <sys/sysctl.h> | |||||
#include <sys/queue.h> | |||||
#include <net/vnet.h> | |||||
#include <net/if.h> | |||||
#include <net/if_var.h> | |||||
#include <netinet/in.h> | |||||
#include <netinet/in_var.h> | |||||
#include <netinet/ip.h> | |||||
#include <netinet/ip_var.h> | |||||
#ifdef INET6 | |||||
#include <netinet/ip6.h> | |||||
#include <netinet6/ip6_var.h> | |||||
#endif | |||||
#include <net/route.h> | |||||
#include <net/route/nhop.h> | |||||
#include <net/route/route_ctl.h> | |||||
#include <net/route/route_var.h> | |||||
#include <net/route/route_algo.h> | |||||
/*
 * Route lookup framework.
 *
 * flm - fib lookup modules - kernel modules implementing a particular algo
 * fd - fib data - instance of an flm bound to a specific routing table
 *
 *
 * For each supported address family, there is an allocated array of fib_dp
 * structures, indexed by fib number. Each array entry contains a callback function
 * and its argument. This function will be called with a family-specific lookup key,
 * scope and the provided argument. This array gets re-created every time a new algo
 * instance gets created. Please take a look at the replace_rtables_family() function
 * for more details.
 *
 * Control plane used to set up and update the necessary dataplane structures:
 * 1) nexthop abstraction -> module has to deal with index, refcounting, nexthop groups etc
 * 2) sync with route tables
 * 3) dataplane attachment points
 * 4) fail early. Some algorithms are immutable, so any change leads to a rebuild. Some
 * are mutable to some extent, so the module is built over common setup/teardown
 * instances, making error handling easier.
 * 5) preference.
 *
 */
SYSCTL_DECL(_net_route); | |||||
SYSCTL_NODE(_net_route, OID_AUTO, algo, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, | |||||
"Route algorithm lookups"); | |||||
#ifdef INET6 | |||||
SYSCTL_NODE(_net_route_algo, OID_AUTO, inet6, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, | |||||
"IPv6 algorithm lookups"); | |||||
#endif | |||||
#ifdef INET | |||||
SYSCTL_NODE(_net_route_algo, OID_AUTO, inet, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, | |||||
"IPv4 algorithm lookups"); | |||||
#endif | |||||
/*
 * Per-instance nexthop reference bookkeeping.
 * Allocated with room for one refcnt[] slot per nexthop index.
 */
struct nhop_ref_table {
	uint32_t	count;		/* bumped when an nh_idx[] slot is first filled */
	int32_t		refcnt[];	/* per-index reference counters (flexible array) */
};
struct fib_data { | |||||
uint32_t number_nhops; /* current # of nhops */ | |||||
uint32_t number_records; /* current # of routes */ | |||||
uint8_t hit_nhops; /* true if out of nhop limit */ | |||||
uint8_t init_done; /* true if init is competed */ | |||||
uint32_t fd_dead:1; /* Scheduled for deletion */ | |||||
uint32_t fd_linked:1; /* true if linked */ | |||||
uint32_t fd_need_rebuild:1; /* true if rebuild scheduled */ | |||||
uint32_t fd_force_eval:1;/* true if rebuild scheduled */ | |||||
uint8_t fd_family; /* family */ | |||||
uint32_t fd_fibnum; /* fibnum */ | |||||
uint32_t fd_failed_rebuilds; /* stat: failed rebuilds */ | |||||
struct callout fd_callout; /* rebuild callout */ | |||||
void *fd_algo_data; /* algorithm data */ | |||||
struct nhop_object **nh_idx; /* nhop idx->ptr array */ | |||||
struct nhop_ref_table *nh_ref_table; /* array with # of nhop references */ | |||||
struct rib_head *fd_rh; /* RIB table we're attached to */ | |||||
struct rib_subscription *fd_rs; /* storing table subscription */ | |||||
struct fib_algo_calldata *fa; | |||||
struct fib_dp fd_dp; /* fib datapath data */ | |||||
struct vnet *fd_vnet; /* vnet nhop belongs to */ | |||||
struct epoch_context fd_epoch_ctx; | |||||
uint64_t gencnt; | |||||
struct fib_lookup_module *fd_flm; | |||||
uint32_t fd_num_changes; /* number of changes since last callout */ | |||||
TAILQ_ENTRY(fib_data) entries; /* list of all fds in vnet */ | |||||
}; | |||||
static void rebuild_callout(void *_data); | |||||
static void destroy_instance_epoch(epoch_context_t ctx); | |||||
static enum flm_op_result switch_algo(struct fib_data *fd); | |||||
static struct fib_lookup_module *find_algo(const char *algo_name, int family); | |||||
static struct fib_lookup_module *fib_check_best_algo(struct rib_head *rh, | |||||
struct fib_lookup_module *orig_flm); | |||||
/*
 * Module-wide mutex.  In this file it is held around fib_data_list
 * manipulation, the fd_dead/fd_linked/fd_need_rebuild/fd_force_eval
 * flags, flm_refcount updates and datapath pointer replacement.
 */
struct mtx fib_mtx;
#define	MOD_LOCK()	mtx_lock(&fib_mtx)
#define	MOD_UNLOCK()	mtx_unlock(&fib_mtx)

uint32_t algo_bitmask_idx = 0;
/*
 * A candidate algorithm must beat the current one by this margin to
 * trigger a switch: 5% of a 256-step preference scale (evaluates to 12).
 */
#define	BEST_DIFF_PERCENT	(5 * 256 / 100)

/* Delay, in milliseconds, between the first change and algo re-evaluation */
#define	ALGO_EVAL_DELAY_MS	30000

/* Force an immediate algo re-evaluation after this many changes */
#define	ALGO_EVAL_NUM_ROUTES	100

/* Maximum number of attempts to set up an algorithm instance */
#define	FIB_MAX_TRIES		32

/* Max amount of supported nexthops */
#define	FIB_MAX_NHOPS		262144

/* Base rebuild callout delay, in milliseconds (scaled by failed rebuilds) */
#define	FIB_CALLOUT_DELAY_MS	50
/*
 * List of all linked algorithm instances.
 * TODO: per-VNET
 */
static TAILQ_HEAD(fib_data_head, fib_data) fib_data_list =
    TAILQ_HEAD_INITIALIZER(fib_data_list);
struct fib_dp_header { | |||||
struct epoch_context ffi_epoch_ctx; | |||||
uint32_t ffi_num_tables; | |||||
struct fib_dp ffi_idx[0]; | |||||
}; | |||||
/*
 * List of all registered lookup modules.
 * Statically initialised (like fib_data_list) so that the head is a valid
 * empty tail queue even before any runtime TAILQ_INIT().
 */
static TAILQ_HEAD(, fib_lookup_module) all_algo_list =
    TAILQ_HEAD_INITIALIZER(all_algo_list);
/*
 * Debug printing helpers.
 *
 * RH_PRINTF(rh, fmt, ...)  - message prefixed with family/fibnum of @rh
 * RH_PRINTF_RAW(fmt, ...)  - message prefixed with the function name only
 * FD_PRINTF(fd, fmt, ...)  - message prefixed with family/fibnum/algo of @fd
 *
 * Without RTDEBUG all three expand to nothing; both branches must define
 * the same names with the same parameter lists.
 */
#ifdef RTDEBUG
#define	RH_PRINTF(_rh, _fmt, ...)	printf("[rt_algo] %s.%u %s: " _fmt "\n", \
    print_family((_rh)->rib_family), (_rh)->rib_fibnum, __func__ , ## __VA_ARGS__)
#define	RH_PRINTF_RAW(_fmt, ...)	printf("[rt_algo] %s: " _fmt "\n", __func__ , ## __VA_ARGS__)
#define	FD_PRINTF(fd, _fmt, ...)	printf("[rt_algo] %s.%u (%s) %s: " _fmt "\n",\
    print_family((fd)->fd_family), (fd)->fd_fibnum, (fd)->fd_flm->flm_name, __func__, \
    ##__VA_ARGS__)
#else
#define	FD_PRINTF(fd, _fmt, ...)
#define	RH_PRINTF(_rh, _fmt, ...)
#define	RH_PRINTF_RAW(_fmt, ...)
#endif
/*
 * Returns a printable name for address family @family:
 * "inet", "inet6" or "unknown".
 */
static const char *
print_family(int family)
{

	switch (family) {
	case AF_INET:
		return ("inet");
	case AF_INET6:
		return ("inet6");
	default:
		return ("unknown");
	}
}
static int | |||||
print_algos(struct sysctl_req *req, int family) | |||||
{ | |||||
struct fib_lookup_module *flm; | |||||
struct sbuf sbuf; | |||||
int error, count = 0; | |||||
error = sysctl_wire_old_buffer(req, 0); | |||||
if (error == 0) { | |||||
sbuf_new_for_sysctl(&sbuf, NULL, 128, req); | |||||
TAILQ_FOREACH(flm, &all_algo_list, entries) { | |||||
if (flm->flm_family == family) { | |||||
if (count++ > 0) | |||||
sbuf_cat(&sbuf, ", "); | |||||
sbuf_cat(&sbuf, flm->flm_name); | |||||
} | |||||
} | |||||
error = sbuf_finish(&sbuf); | |||||
sbuf_delete(&sbuf); | |||||
} | |||||
return (error); | |||||
} | |||||
#ifdef INET6
/*
 * net.route.algo.inet6.algo_list: read-only list of IPv6 lookup algorithms.
 * Compiled only with INET6, matching the declaration of the parent
 * net.route.algo.inet6 node.
 */
static int
print_algos_inet6(SYSCTL_HANDLER_ARGS)
{

	return (print_algos(req, AF_INET6));
}
SYSCTL_PROC(_net_route_algo_inet6, OID_AUTO, algo_list,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    print_algos_inet6, "A", "List of algos");
#endif /* INET6 */
#ifdef INET
/*
 * net.route.algo.inet.algo_list: read-only list of IPv4 lookup algorithms.
 * Compiled only with INET, matching the declaration of the parent
 * net.route.algo.inet node.
 */
static int
print_algos_inet(SYSCTL_HANDLER_ARGS)
{

	return (print_algos(req, AF_INET));
}
SYSCTL_PROC(_net_route_algo_inet, OID_AUTO, algo_list,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    print_algos_inet, "A", "List of algos");
#endif /* INET */
static struct fib_lookup_module * | |||||
find_algo(const char *algo_name, int family) | |||||
{ | |||||
struct fib_lookup_module *flm; | |||||
TAILQ_FOREACH(flm, &all_algo_list, entries) { | |||||
if ((strcmp(flm->flm_name, algo_name) == 0) && | |||||
(family == flm->flm_family)) | |||||
return (flm); | |||||
} | |||||
return (NULL); | |||||
} | |||||
static uint32_t | |||||
callout_calc_delay(struct fib_data *fd) | |||||
{ | |||||
uint32_t shift; | |||||
if (fd->fd_failed_rebuilds > 10) | |||||
shift = 10; | |||||
else | |||||
shift = fd->fd_failed_rebuilds; | |||||
return ((1 << shift) * FIB_CALLOUT_DELAY_MS); | |||||
} | |||||
static void | |||||
schedule_callout(struct fib_data *fd, int delay_ms) | |||||
{ | |||||
callout_reset_sbt(&fd->fd_callout, 0, SBT_1MS * delay_ms, | |||||
rebuild_callout, fd, 0); | |||||
} | |||||
static void | |||||
schedule_algo_eval(struct fib_data *fd) | |||||
{ | |||||
if (fd->fd_num_changes++ == 0) { | |||||
/* Start callout to consider switch */ | |||||
MOD_LOCK(); | |||||
if (!callout_pending(&fd->fd_callout)) | |||||
schedule_callout(fd, ALGO_EVAL_DELAY_MS); | |||||
MOD_UNLOCK(); | |||||
} else if (fd->fd_num_changes > ALGO_EVAL_NUM_ROUTES && !fd->fd_force_eval) { | |||||
/* Reset callout to exec immediately */ | |||||
MOD_LOCK(); | |||||
if (!fd->fd_need_rebuild) { | |||||
fd->fd_force_eval = true; | |||||
schedule_callout(fd, 1); | |||||
} | |||||
MOD_UNLOCK(); | |||||
} | |||||
} | |||||
/* | |||||
* rib subscription handler | |||||
*/ | |||||
static void | |||||
handle_rtable_change_cb(struct rib_head *rnh, struct rib_cmd_info *rc, | |||||
void *_data) | |||||
{ | |||||
struct fib_data *fd = (struct fib_data *)_data; | |||||
enum flm_op_result result; | |||||
RIB_WLOCK_ASSERT(rnh); | |||||
if (!fd->init_done) | |||||
return; | |||||
schedule_algo_eval(fd); | |||||
result = fd->fd_flm->flm_change_rib_item_cb(rnh, rc, fd->fd_algo_data); | |||||
switch (result) { | |||||
case FLM_SUCCESS: | |||||
break; | |||||
case FLM_REBUILD: | |||||
/* | |||||
* Algo reported inability to handle, | |||||
* schedule algo rebuild. | |||||
*/ | |||||
MOD_LOCK(); | |||||
if (!fd->fd_need_rebuild) { | |||||
fd->fd_need_rebuild = true; | |||||
/* | |||||
* Potentially rewrites pending callout | |||||
* to re-evaluate algo. | |||||
*/ | |||||
FD_PRINTF(fd, "Scheduling rebuilt"); | |||||
schedule_callout(fd, callout_calc_delay(fd)); | |||||
} | |||||
MOD_UNLOCK(); | |||||
break; | |||||
default: | |||||
/* | |||||
* Algo reported a non-recoverable error. | |||||
* Remove and switch to radix? | |||||
*/ | |||||
FD_PRINTF(fd, "algo reported non-recoverable error"); | |||||
// TODO: switch to radix | |||||
} | |||||
} | |||||
static void | |||||
estimate_scale(const struct fib_data *old_fd, struct fib_data *fd) | |||||
{ | |||||
if (old_fd == NULL) { | |||||
fd->number_nhops = 16; | |||||
return; | |||||
} | |||||
if (old_fd->hit_nhops && old_fd->number_nhops < FIB_MAX_NHOPS) | |||||
fd->number_nhops = 2 * old_fd->number_nhops; | |||||
else | |||||
fd->number_nhops = old_fd->number_nhops; | |||||
} | |||||
struct walk_cbdata { | |||||
struct fib_data *fd; | |||||
flm_dump_t *func; | |||||
enum flm_op_result result; | |||||
}; | |||||
static void | |||||
sync_algo_end_cb(struct rib_head *rnh, enum rib_walk_hook stage, void *_data) | |||||
{ | |||||
struct walk_cbdata *w = (struct walk_cbdata *)_data; | |||||
struct fib_data *fd = w->fd; | |||||
if (rnh->rib_dying) { | |||||
w->result = FLM_ERROR; | |||||
return; | |||||
} | |||||
if (stage != RIB_WALK_HOOK_POST || w->result != FLM_SUCCESS) | |||||
return; | |||||
if (fd->hit_nhops) { | |||||
FD_PRINTF(fd, "ran out of nexthops at %u nhops", | |||||
fd->nh_ref_table->count); | |||||
w->result = FLM_REBUILD; | |||||
return; | |||||
} | |||||
w->result = fd->fd_flm->flm_dump_end_cb(fd->fd_algo_data, &fd->fd_dp); | |||||
if (w->result == FLM_SUCCESS) { | |||||
/* Mark init as done to allow routing updates */ | |||||
fd->init_done = 1; | |||||
} | |||||
} | |||||
static int | |||||
sync_algo_cb(struct rtentry *rt, void *_data) | |||||
{ | |||||
struct walk_cbdata *w = (struct walk_cbdata *)_data; | |||||
enum flm_op_result result; | |||||
if (w->result == FLM_SUCCESS && w->func) { | |||||
result = w->func(rt, w->fd->fd_algo_data); | |||||
if (result != FLM_SUCCESS) | |||||
w->result = result; | |||||
} | |||||
return (0); | |||||
} | |||||
static enum flm_op_result | |||||
sync_algo(struct fib_data *fd) | |||||
{ | |||||
struct walk_cbdata w; | |||||
w.fd = fd; | |||||
w.func = fd->fd_flm->flm_dump_rib_item_cb; | |||||
w.result = FLM_SUCCESS; | |||||
rib_walk_ext_internal(fd->fd_rh, true, sync_algo_cb, sync_algo_end_cb, &w); | |||||
FD_PRINTF(fd, "initial dump completed."); | |||||
return (w.result); | |||||
} | |||||
/* | |||||
* Assume already unlinked from datapath | |||||
*/ | |||||
static int | |||||
schedule_destroy_instance(struct fib_data *fd, bool in_callout) | |||||
{ | |||||
bool is_dead; | |||||
NET_EPOCH_ASSERT(); | |||||
MOD_LOCK(); | |||||
is_dead = fd->fd_dead; | |||||
if (!is_dead) | |||||
fd->fd_dead = true; | |||||
if (fd->fd_linked) { | |||||
TAILQ_REMOVE(&fib_data_list, fd, entries); | |||||
fd->fd_linked = false; | |||||
} | |||||
MOD_UNLOCK(); | |||||
if (is_dead) | |||||
return (0); | |||||
FD_PRINTF(fd, "DETACH"); | |||||
if (fd->fd_rs != NULL) | |||||
rib_unsibscribe(fd->fd_rs); | |||||
/* | |||||
* After rib_unsubscribe() no _new_ handle_rtable_change_cb() calls | |||||
* will be executed, hence no _new_ callout schedules will happen. | |||||
* | |||||
* There can be 3 possible scenarious here: | |||||
* 1) we're running inside a callout when we're deleting ourselves | |||||
* due to migration to a newer fd | |||||
* 2) we're running from rt_table_destroy() and callout is scheduled | |||||
* for execution OR is executing | |||||
* | |||||
* For (2) we need to wait for the callout termination, as the routing table | |||||
* will be destroyed after this function returns. | |||||
* For (1) we cannot call drain, but can ensure that this is the last invocation. | |||||
*/ | |||||
if (in_callout) | |||||
callout_stop(&fd->fd_callout); | |||||
else | |||||
callout_drain(&fd->fd_callout); | |||||
/* | |||||
* At this moment there are no other pending work scheduled. | |||||
*/ | |||||
FD_PRINTF(fd, "destroying old instance"); | |||||
epoch_call(net_epoch_preempt, destroy_instance_epoch, | |||||
&fd->fd_epoch_ctx); | |||||
return (0); | |||||
} | |||||
void | |||||
fib_destroy_rib(struct rib_head *rh) | |||||
{ | |||||
struct fib_data_head tmp_head = TAILQ_HEAD_INITIALIZER(tmp_head); | |||||
struct fib_data *fd, *fd_tmp; | |||||
struct epoch_tracker et; | |||||
/* | |||||
* Atm we have set is_dying flag on rnh, so all new fd's will | |||||
* fail at sync_algo() stage, so nothing new will be added to the list. | |||||
*/ | |||||
MOD_LOCK(); | |||||
TAILQ_FOREACH_SAFE(fd, &fib_data_list, entries, fd_tmp) { | |||||
if (fd->fd_rh == rh) { | |||||
TAILQ_REMOVE(&fib_data_list, fd, entries); | |||||
fd->fd_linked = false; | |||||
TAILQ_INSERT_TAIL(&tmp_head, fd, entries); | |||||
} | |||||
} | |||||
MOD_UNLOCK(); | |||||
/* Pass 2: remove each entry */ | |||||
NET_EPOCH_ENTER(et); | |||||
TAILQ_FOREACH_SAFE(fd, &tmp_head, entries, fd_tmp) { | |||||
schedule_destroy_instance(fd, false); | |||||
} | |||||
NET_EPOCH_EXIT(et); | |||||
} | |||||
static void | |||||
destroy_instance(struct fib_data *fd) | |||||
{ | |||||
FD_PRINTF(fd, "destroy fd %p", fd); | |||||
/* Call destroy callback first */ | |||||
if (fd->fd_algo_data != NULL) | |||||
fd->fd_flm->flm_destroy_cb(fd->fd_algo_data); | |||||
/* Nhop table */ | |||||
if (fd->nh_idx != NULL) { | |||||
for (int i = 0; i < fd->number_nhops; i++) { | |||||
if (fd->nh_idx[i] != NULL) { | |||||
//FD_PRINTF(fd, " FREE nhop %d %p", i, fd->nh_idx[i]); | |||||
nhop_free_any(fd->nh_idx[i]); | |||||
} | |||||
} | |||||
free(fd->nh_idx, M_RTABLE); | |||||
} | |||||
if (fd->nh_ref_table != NULL) | |||||
free(fd->nh_ref_table, M_RTABLE); | |||||
MOD_LOCK(); | |||||
fd->fd_flm->flm_refcount--; | |||||
MOD_UNLOCK(); | |||||
free(fd, M_RTABLE); | |||||
} | |||||
/* | |||||
* Epoch callback indicating fd is safe to destroy | |||||
*/ | |||||
static void | |||||
destroy_instance_epoch(epoch_context_t ctx) | |||||
{ | |||||
struct fib_data *fd; | |||||
fd = __containerof(ctx, struct fib_data, fd_epoch_ctx); | |||||
destroy_instance(fd); | |||||
} | |||||
static enum flm_op_result | |||||
try_setup_instance(struct fib_lookup_module *flm, struct rib_head *rh, | |||||
struct fib_data *old_fd, struct fib_data **pfd) | |||||
{ | |||||
struct fib_data *fd; | |||||
size_t size; | |||||
enum flm_op_result result; | |||||
/* Allocate */ | |||||
fd = malloc(sizeof(struct fib_data), M_RTABLE, M_NOWAIT | M_ZERO); | |||||
if (fd == NULL) { | |||||
*pfd = NULL; | |||||
return (FLM_REBUILD); | |||||
} | |||||
*pfd = fd; | |||||
estimate_scale(old_fd, fd); | |||||
fd->fd_rh = rh; | |||||
fd->fd_family = rh->rib_family; | |||||
fd->fd_fibnum = rh->rib_fibnum; | |||||
callout_init(&fd->fd_callout, 1); | |||||
fd->fd_vnet = curvnet; | |||||
fd->fd_flm = flm; | |||||
/* Allocate nhidx -> nhop_ptr table */ | |||||
size = fd->number_nhops * sizeof(void *); | |||||
//FD_PRINTF(fd, "malloc(%lu)", size); | |||||
fd->nh_idx = malloc(size, M_RTABLE, M_NOWAIT | M_ZERO); | |||||
if (fd->nh_idx == NULL) { | |||||
FD_PRINTF(fd, "Unable to allocate nhop table idx (sz:%zu)", size); | |||||
return (FLM_REBUILD); | |||||
} | |||||
/* Allocate nhop index refcount table */ | |||||
size = sizeof(struct nhop_ref_table); | |||||
size += fd->number_nhops * sizeof(uint32_t); | |||||
//FD_PRINTF(fd, "malloc(%lu)", size); | |||||
fd->nh_ref_table = malloc(size, M_RTABLE, M_NOWAIT | M_ZERO); | |||||
if (fd->nh_ref_table == NULL) { | |||||
FD_PRINTF(fd, "Unable to allocate nhop refcount table (sz:%zu)", size); | |||||
return (FLM_REBUILD); | |||||
} | |||||
/* Okay, we're ready for algo init */ | |||||
void *old_algo_data = (old_fd != NULL) ? old_fd->fd_algo_data : NULL; | |||||
result = flm->flm_init_cb(fd->fd_fibnum, fd, old_algo_data, &fd->fd_algo_data); | |||||
if (result != FLM_SUCCESS) | |||||
return (result); | |||||
/* Try to subscribe */ | |||||
if (flm->flm_change_rib_item_cb != NULL) { | |||||
fd->fd_rs = rib_subscribe_internal(fd->fd_rh, | |||||
handle_rtable_change_cb, fd, RIB_NOTIFY_IMMEDIATE, 0); | |||||
if (fd->fd_rs == NULL) | |||||
return (FLM_REBUILD); | |||||
} | |||||
/* Dump */ | |||||
result = sync_algo(fd); | |||||
if (result != FLM_SUCCESS) | |||||
return (result); | |||||
FD_PRINTF(fd, "DUMP completed successfully."); | |||||
MOD_LOCK(); | |||||
/* | |||||
* Insert in the beginning of a list, to simplify search | |||||
* first matching entry is the one. | |||||
*/ | |||||
TAILQ_INSERT_HEAD(&fib_data_list, fd, entries); | |||||
fd->fd_linked = true; | |||||
MOD_UNLOCK(); | |||||
return (FLM_SUCCESS); | |||||
} | |||||
/* | |||||
* Sets up algo @flm for table @rh and links it to the datapath. | |||||
* | |||||
*/ | |||||
static enum flm_op_result | |||||
setup_instance(struct fib_lookup_module *flm, struct rib_head *rh, | |||||
struct fib_data *orig_fd, struct fib_data **pfd, bool attach) | |||||
{ | |||||
struct fib_data *prev_fd, *new_fd; | |||||
struct epoch_tracker et; | |||||
enum flm_op_result result; | |||||
prev_fd = orig_fd; | |||||
new_fd = NULL; | |||||
for (int i = 0; i < FIB_MAX_TRIES; i++) { | |||||
NET_EPOCH_ENTER(et); | |||||
result = try_setup_instance(flm, rh, prev_fd, &new_fd); | |||||
if ((result == FLM_SUCCESS) && attach) | |||||
result = switch_algo(new_fd); | |||||
if ((prev_fd != NULL) && (prev_fd != orig_fd)) { | |||||
schedule_destroy_instance(prev_fd, false); | |||||
prev_fd = NULL; | |||||
} | |||||
NET_EPOCH_EXIT(et); | |||||
RH_PRINTF(rh, "try %d: fib algo result: %d", i, result); | |||||
if (result == FLM_REBUILD) { | |||||
prev_fd = new_fd; | |||||
new_fd = NULL; | |||||
continue; | |||||
} | |||||
break; | |||||
} | |||||
if (result != FLM_SUCCESS) { | |||||
/* update failure count */ | |||||
MOD_LOCK(); | |||||
if (orig_fd != NULL) | |||||
orig_fd->fd_failed_rebuilds++; | |||||
MOD_UNLOCK(); | |||||
NET_EPOCH_ENTER(et); | |||||
if ((prev_fd != NULL) && (prev_fd != orig_fd)) | |||||
schedule_destroy_instance(prev_fd, false); | |||||
if (new_fd != NULL) { | |||||
schedule_destroy_instance(new_fd, false); | |||||
new_fd = NULL; | |||||
} | |||||
NET_EPOCH_EXIT(et); | |||||
} | |||||
*pfd = new_fd; | |||||
return (result); | |||||
} | |||||
static void | |||||
rebuild_callout(void *_data) | |||||
{ | |||||
struct fib_data *fd, *fd_new; | |||||
struct fib_lookup_module *flm_new; | |||||
struct epoch_tracker et; | |||||
enum flm_op_result result; | |||||
bool need_rebuild = false; | |||||
fd = (struct fib_data *)_data; | |||||
MOD_LOCK(); | |||||
need_rebuild = fd->fd_need_rebuild; | |||||
fd->fd_need_rebuild = false; | |||||
fd->fd_force_eval = false; | |||||
fd->fd_num_changes = 0; | |||||
MOD_UNLOCK(); | |||||
CURVNET_SET(fd->fd_vnet); | |||||
/* First, check if we're still OK to use this algo */ | |||||
flm_new = fib_check_best_algo(fd->fd_rh, fd->fd_flm); | |||||
if ((flm_new == NULL) && (!need_rebuild)) { | |||||
/* Keep existing algo, no need to rebuild. */ | |||||
CURVNET_RESTORE(); | |||||
return; | |||||
} | |||||
struct fib_data *fd_tmp = (flm_new == NULL) ? fd : NULL; | |||||
result = setup_instance(fd->fd_flm, fd->fd_rh, fd_tmp, &fd_new, true); | |||||
if (result != FLM_SUCCESS) { | |||||
FD_PRINTF(fd, "table rebuild failed"); | |||||
CURVNET_RESTORE(); | |||||
return; | |||||
} | |||||
FD_PRINTF(fd_new, "switched to new instance"); | |||||
/* Remove old */ | |||||
if (fd != NULL) { | |||||
NET_EPOCH_ENTER(et); | |||||
schedule_destroy_instance(fd, true); | |||||
NET_EPOCH_EXIT(et); | |||||
} | |||||
CURVNET_RESTORE(); | |||||
} | |||||
#ifdef INET6
/*
 * net.route.algo.inet6.algo sysctl handler.
 *
 * Currently a stub: the algorithm selection logic is disabled (#if 0) and
 * refers to names that do not exist in this file anymore.  The handler only
 * drains pending epoch callbacks and returns 0.
 * Compiled only with INET6, matching the declaration of the parent
 * net.route.algo.inet6 node.
 */
static int
set_algo_sysctl_handler(SYSCTL_HANDLER_ARGS)
{
	int error = 0;
#if 0
	struct epoch_tracker et;
	struct fib_lookup_module *flm;
	struct fib_data *old_fd, *fd;
	char old_algo_name[32], algo_name[32];
	uint32_t fibnum;
	int error;

	fibnum = RT_DEFAULT_FIB;

	if (old_fd == NULL) {
		strlcpy(old_algo_name, "radix", sizeof(old_algo_name));
	} else {
		strlcpy(old_algo_name, fd_ptr->fd_flm->flm_name,
		    sizeof(old_algo_name));
	}
	strlcpy(algo_name, old_algo_name, sizeof(algo_name));
	error = sysctl_handle_string(oidp, algo_name, sizeof(algo_name), req);
	if (error != 0 || req->newptr == NULL)
		return (error);

	if (strcmp(algo_name, old_algo_name) == 0)
		return (0);

	if (strcmp(algo_name, "radix") == 0) {
		/* teardown old one */
		NET_EPOCH_ENTER(et);
		MOD_LOCK();
		old_fd = fd_ptr;
		fd_ptr = NULL;
		MOD_UNLOCK();

		if (old_fd != NULL)
			schedule_destroy_instance(old_fd);
		NET_EPOCH_EXIT(et);
		return (0);
	}

	MOD_LOCK();
	flm = find_algo(algo_name, AF_INET6);
	if (flm != NULL)
		flm->flm_refcount++;
	MOD_UNLOCK();

	if (flm == NULL) {
		DPRINTF("unable to find algo %s", algo_name);
		return (ESRCH);
	}
	DPRINTF("inet6.%u: requested fib algo %s", fibnum, algo_name);

	fd = setup_instance(flm, fibnum, NULL, &error);
	if (error != 0) {
		MOD_LOCK();
		flm->flm_refcount--;
		MOD_UNLOCK();
		return (error);
	}

	MOD_LOCK();
	old_fd = fd_ptr;
	fd_ptr = fd;
	MOD_UNLOCK();

	/* Remove old */
	NET_EPOCH_ENTER(et);
	if (old_fd != NULL) {
		error = schedule_destroy_instance(old_fd);
	}
	NET_EPOCH_EXIT(et);
#endif

	/* Set new */

	/* Drain cb so user can unload the module after userret if so desired */
	epoch_drain_callbacks(net_epoch_preempt);

	return (error);
}
SYSCTL_PROC(_net_route_algo_inet6, OID_AUTO, algo,
    CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0,
    set_algo_sysctl_handler, "A",
    "Set");
#endif /* INET6 */
static void | |||||
destroy_fdh_epoch(epoch_context_t ctx) | |||||
{ | |||||
struct fib_dp_header *ffi; | |||||
ffi = __containerof(ctx, struct fib_dp_header, ffi_epoch_ctx); | |||||
free(ffi, M_RTABLE); | |||||
} | |||||
static struct fib_dp_header * | |||||
alloc_fib_dp_array(uint32_t num_tables, bool waitok) | |||||
{ | |||||
size_t sz; | |||||
struct fib_dp_header *ffi; | |||||
sz = sizeof(struct fib_dp_header); | |||||
sz += sizeof(struct fib_dp) * num_tables; | |||||
ffi = malloc(sz, M_RTABLE, (waitok ? M_WAITOK : M_NOWAIT) | M_ZERO); | |||||
if (ffi != NULL) | |||||
ffi->ffi_num_tables = num_tables; | |||||
return (ffi); | |||||
} | |||||
static struct fib_dp_header * | |||||
get_fib_dp_header(struct fib_dp *dp) | |||||
{ | |||||
return (__containerof((void *)dp, struct fib_dp_header, ffi_idx)); | |||||
} | |||||
/* | |||||
* Replace per-family index pool @pdp with a new one which | |||||
* contains updated callback/algo data from @fd. | |||||
* Returns 0 on success. | |||||
*/ | |||||
static enum flm_op_result | |||||
replace_rtables_family(struct fib_dp **pdp, struct fib_data *fd) | |||||
{ | |||||
struct fib_dp_header *new_ffi, *old_ffi; | |||||
NET_EPOCH_ASSERT(); | |||||
//FD_PRINTF(fd, "[vnet %p] replace with f:%p arg:%p", curvnet, fd->fd_dp.f, fd->fd_dp.arg); | |||||
MOD_LOCK(); | |||||
old_ffi = get_fib_dp_header(*pdp); | |||||
new_ffi = alloc_fib_dp_array(old_ffi->ffi_num_tables, false); | |||||
//FD_PRINTF(fd, "OLD FFI: %p NEW FFI: %p", old_ffi, new_ffi); | |||||
if (new_ffi == NULL) { | |||||
MOD_UNLOCK(); | |||||
FD_PRINTF(fd, "error attaching datapath"); | |||||
return (FLM_REBUILD); | |||||
} | |||||
memcpy(&new_ffi->ffi_idx[0], &old_ffi->ffi_idx[0], | |||||
old_ffi->ffi_num_tables * sizeof(struct fib_dp)); | |||||
/* Update relevant data structure for @fd */ | |||||
new_ffi->ffi_idx[fd->fd_fibnum] = fd->fd_dp; | |||||
/* Ensure memcpy() writes have completed */ | |||||
atomic_thread_fence_rel(); | |||||
/* Set new datapath pointer */ | |||||
*pdp = &new_ffi->ffi_idx[0]; | |||||
MOD_UNLOCK(); | |||||
//FD_PRINTF(fd, "update %p -> %p", old_ffi, new_ffi); | |||||
epoch_call(net_epoch_preempt, destroy_fdh_epoch, | |||||
&old_ffi->ffi_epoch_ctx); | |||||
return (FLM_SUCCESS); | |||||
} | |||||
static struct fib_dp ** | |||||
get_family_ptr(int family) | |||||
{ | |||||
switch (family) { | |||||
case AF_INET: | |||||
return (&V_inet_dp); | |||||
case AF_INET6: | |||||
return (&V_inet6_dp); | |||||
} | |||||
return (NULL); | |||||
} | |||||
static enum flm_op_result | |||||
switch_algo(struct fib_data *fd) | |||||
{ | |||||
struct fib_dp **pdp; | |||||
pdp = get_family_ptr(fd->fd_family); | |||||
return (replace_rtables_family(pdp, fd)); | |||||
} | |||||
/* | |||||
* Grow datapath pointers array. | |||||
* Called from sysctl handler on growing number of routing tables. | |||||
*/ | |||||
static void | |||||
grow_rtables_family(struct fib_dp **pdp, uint32_t new_num_tables) | |||||
{ | |||||
struct fib_dp_header *new_fdh, *old_fdh = NULL; | |||||
new_fdh = alloc_fib_dp_array(new_num_tables, true); | |||||
MOD_LOCK(); | |||||
if (*pdp != NULL) { | |||||
old_fdh = get_fib_dp_header(*pdp); | |||||
memcpy(&new_fdh->ffi_idx[0], &old_fdh->ffi_idx[0], | |||||
old_fdh->ffi_num_tables * sizeof(struct fib_dp)); | |||||
} | |||||
/* Wait till all writes completed */ | |||||
atomic_thread_fence_rel(); | |||||
*pdp = &new_fdh->ffi_idx[0]; | |||||
MOD_UNLOCK(); | |||||
if (old_fdh != NULL) | |||||
epoch_call(net_epoch_preempt, destroy_fdh_epoch, | |||||
&old_fdh->ffi_epoch_ctx); | |||||
} | |||||
/*
 * Grows per-AF arrays of datapath pointers for each supported family
 * (AF_INET first, then AF_INET6).
 * Called from fibs resize sysctl handler.
 */
void
fib_grow_rtables(uint32_t new_num_tables)
{
	static const int families[] = { AF_INET, AF_INET6 };

	for (size_t i = 0; i < sizeof(families) / sizeof(families[0]); i++)
		grow_rtables_family(get_family_ptr(families[i]),
		    new_num_tables);
}
void | |||||
fib_get_rtable_info(struct rib_head *rh, struct rib_rtable_info *rinfo) | |||||
{ | |||||
bzero(rinfo, sizeof(struct rib_rtable_info)); | |||||
rinfo->num_prefixes = rh->rnh_prefixes; | |||||
rinfo->num_nhops = nhops_get_count(rh); | |||||
#ifdef ROUTE_MPATH | |||||
rinfo->num_nhgrp = nhgrp_get_count(rh); | |||||
#endif | |||||
} | |||||
struct rib_head * | |||||
fib_get_rh(struct fib_data *fd) | |||||
{ | |||||
return (fd->fd_rh); | |||||
} | |||||
/*
 * Maps a nexthop (or nexthop group) to its index in the instance tables.
 * With ROUTE_MPATH plain nexthops use even slots (idx * 2) and nexthop
 * groups use odd slots (idx * 2 - 1); otherwise the nexthop index is
 * used directly.
 */
static uint32_t
get_nhop_idx(struct nhop_object *nh)
{
#ifdef ROUTE_MPATH
	if (!NH_IS_NHGRP(nh))
		return (nhop_get_idx(nh) * 2);

	return (nhgrp_get_idx((struct nhgrp_object *)nh) * 2 - 1);
#else
	return (nhop_get_idx(nh));
#endif
}
uint32_t | |||||
fib_get_nhop_idx(struct fib_data *fd, struct nhop_object *nh) | |||||
{ | |||||
uint32_t idx = get_nhop_idx(nh); | |||||
if (idx >= fd->number_nhops) { | |||||
fd->hit_nhops = 1; | |||||
return (0); | |||||
} | |||||
if (fd->nh_idx[idx] == NULL) { | |||||
nhop_ref_any(nh); | |||||
fd->nh_idx[idx] = nh; | |||||
fd->nh_ref_table->count++; | |||||
//FD_PRINTF(fd, " REF nhop %u %p", idx, fd->nh_idx[idx]); | |||||
} | |||||
fd->nh_ref_table->refcnt[idx]++; | |||||
return (idx); | |||||
} | |||||
struct nhop_release_data { | |||||
struct nhop_object *nh; | |||||
struct epoch_context ctx; | |||||
}; | |||||
static void | |||||
release_nhop_epoch(epoch_context_t ctx) | |||||
{ | |||||
struct nhop_release_data *nrd; | |||||
nrd = __containerof(ctx, struct nhop_release_data, ctx); | |||||
nhop_free_any(nrd->nh); | |||||
free(nrd, M_RTABLE); | |||||
} | |||||
static void | |||||
fib_schedule_release_nhop(struct fib_data *fd, struct nhop_object *nh) | |||||
{ | |||||
struct nhop_release_data *nrd; | |||||
nrd = malloc(sizeof(struct nhop_release_data), M_RTABLE, M_NOWAIT | M_ZERO); | |||||
if (nrd != NULL) { | |||||
nrd->nh = nh; | |||||
epoch_call(net_epoch_preempt, release_nhop_epoch, &nrd->ctx); | |||||
} else { | |||||
/* | |||||
* Unable to allocate memory. Leak nexthop to maintain guarantee | |||||
* that each nhop. | |||||
*/ | |||||
FD_PRINTF(fd, "unable to allocate structure for nhop %p deletion", nh); | |||||
} | |||||
} | |||||
void | |||||
fib_free_nhop_idx(struct fib_data *fd, uint32_t idx) | |||||
{ | |||||
KASSERT((idx < fd->number_nhops), ("invalid nhop index")); | |||||
fd->nh_ref_table->refcnt[idx]--; | |||||
if (fd->nh_ref_table->refcnt[idx] == 0) { | |||||
//FD_PRINTF(fd, " FREE nhop %d %p", idx, fd->nh_idx[idx]); | |||||
fib_schedule_release_nhop(fd, fd->nh_idx[idx]); | |||||
} | |||||
} | |||||
/*
 * Drops one user of nexthop @nh in instance @fd: resolves the nexthop to
 * its index-table slot and hands it to fib_free_nhop_idx().
 */
void
fib_free_nhop(struct fib_data *fd, struct nhop_object *nh)
{
	uint32_t idx = get_nhop_idx(nh);

	fib_free_nhop_idx(fd, idx);
}
struct nhop_object ** | |||||
fib_get_nhop_array(struct fib_data *fd) | |||||
{ | |||||
return (fd->nh_idx); | |||||
} | |||||
static struct fib_lookup_module * | |||||
fib_check_best_algo(struct rib_head *rh, struct fib_lookup_module *orig_flm) | |||||
{ | |||||
uint8_t preference, curr_preference = 0, best_preference = 0; | |||||
struct fib_lookup_module *flm, *best_flm = NULL; | |||||
struct rib_rtable_info rinfo; | |||||
int candidate_algos = 0; | |||||
fib_get_rtable_info(rh, &rinfo); | |||||
MOD_LOCK(); | |||||
TAILQ_FOREACH(flm, &all_algo_list, entries) { | |||||
if (flm->flm_family != rh->rib_family) | |||||
continue; | |||||
candidate_algos++; | |||||
preference = flm->flm_get_pref(&rinfo); | |||||
if (preference > best_preference) { | |||||
best_preference = preference; | |||||
best_flm = flm; | |||||
} | |||||
if (flm == orig_flm) | |||||
curr_preference = preference; | |||||
} | |||||
if (best_flm != NULL && best_flm != orig_flm) { | |||||
/* Check */ | |||||
if (curr_preference + BEST_DIFF_PERCENT < best_preference) | |||||
best_flm->flm_refcount++; | |||||
else | |||||
best_flm = NULL; | |||||
} else | |||||
best_flm = NULL; | |||||
MOD_UNLOCK(); | |||||
RH_PRINTF(rh, "candidate_algos: %d, curr: %s(%d) result: %s(%d)", | |||||
candidate_algos, orig_flm ? orig_flm->flm_name : "NULL", curr_preference, | |||||
best_flm ? best_flm->flm_name : "NULL", best_preference); | |||||
return (best_flm); | |||||
} | |||||
/* | |||||
* Called when new route table is created. | |||||
* Selects, allocates and attaches fib algo for the table. | |||||
*/ | |||||
int | |||||
fib_select_algo_initial(struct rib_head *rh) | |||||
{ | |||||
struct fib_lookup_module *flm; | |||||
struct fib_data *fd = NULL; | |||||
enum flm_op_result result; | |||||
flm = fib_check_best_algo(rh, NULL); | |||||
if (flm == NULL) { | |||||
RH_PRINTF(rh, "no algo selected"); | |||||
return (ENOENT); | |||||
} | |||||
RH_PRINTF(rh, "selected algo %s", flm->flm_name); | |||||
result = setup_instance(flm, rh, NULL, &fd, false); | |||||
RH_PRINTF(rh, "result=%d fd=%p", result, fd); | |||||
if (result == FLM_SUCCESS) { | |||||
/* | |||||
* Attach datapath directly to avoid N reallocations | |||||
* during fib growth | |||||
*/ | |||||
struct fib_dp_header *fdp; | |||||
struct fib_dp **pdp; | |||||
pdp = get_family_ptr(rh->rib_family); | |||||
if (pdp != NULL) { | |||||
fdp = get_fib_dp_header(*pdp); | |||||
fdp->ffi_idx[fd->fd_fibnum] = fd->fd_dp; | |||||
FD_PRINTF(fd, "datapath attached"); | |||||
} | |||||
} | |||||
return (0); | |||||
} | |||||
int | |||||
fib_module_register(struct fib_lookup_module *flm) | |||||
{ | |||||
MOD_LOCK(); | |||||
RH_PRINTF_RAW("attaching %s to %s", flm->flm_name, | |||||
print_family(flm->flm_family)); | |||||
TAILQ_INSERT_TAIL(&all_algo_list, flm, entries); | |||||
MOD_UNLOCK(); | |||||
return (0); | |||||
} | |||||
int | |||||
fib_module_unregister(struct fib_lookup_module *flm) | |||||
{ | |||||
MOD_LOCK(); | |||||
if (flm->flm_refcount > 0) { | |||||
MOD_UNLOCK(); | |||||
return (EBUSY); | |||||
} | |||||
RH_PRINTF_RAW("detaching %s from %s", flm->flm_name, | |||||
print_family(flm->flm_family)); | |||||
TAILQ_REMOVE(&all_algo_list, flm, entries); | |||||
MOD_UNLOCK(); | |||||
return (0); | |||||
} | |||||
/*
 * Placeholder: performs no work, ignores all of its arguments and
 * returns 0.
 */
int
fib_module_clone(const struct fib_lookup_module *flm_orig,
    struct fib_lookup_module *flm, bool waitok)
{
	/* Arguments are intentionally unused. */
	(void)flm_orig;
	(void)flm;
	(void)waitok;

	return (0);
}
int | |||||
fib_module_dumptree(struct fib_lookup_module *flm, | |||||
enum rib_subscription_type subscription_type) | |||||
{ | |||||
return (0); | |||||
} | |||||
static void | |||||
fib_algo_init(void) | |||||
{ | |||||
mtx_init(&fib_mtx, "algo list mutex", NULL, MTX_DEF); | |||||
TAILQ_INIT(&all_algo_list); | |||||
} | |||||
SYSINIT(route_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_SECOND, fib_algo_init, NULL); | |||||