Page MenuHomeFreeBSD

D27401.id80345.diff
No OneTemporary

D27401.id80345.diff

Index: sys/conf/files
===================================================================
--- sys/conf/files
+++ sys/conf/files
@@ -4175,6 +4175,7 @@
net/route/nhop.c standard
net/route/nhop_ctl.c standard
net/route/nhop_utils.c standard
+net/route/route_algo.c optional route_algo
net/route/route_ctl.c standard
net/route/route_ddb.c optional ddb
net/route/route_helpers.c standard
@@ -4325,6 +4326,7 @@
netinet/in_kdtrace.c optional inet | inet6
netinet/ip_carp.c optional inet carp | inet6 carp
netinet/in_fib.c optional inet
+netinet/in_fib_algo.c optional inet route_algo
netinet/in_gif.c optional gif inet | netgraph_gif inet
netinet/ip_gre.c optional gre inet
netinet/ip_id.c optional inet
@@ -4401,6 +4403,7 @@
netinet6/in6.c optional inet6
netinet6/in6_cksum.c optional inet6
netinet6/in6_fib.c optional inet6
+netinet6/in6_fib_algo.c optional inet6 route_algo
netinet6/in6_gif.c optional gif inet6 | netgraph_gif inet6
netinet6/in6_ifattach.c optional inet6
netinet6/in6_jail.c optional inet6
Index: sys/conf/options
===================================================================
--- sys/conf/options
+++ sys/conf/options
@@ -453,6 +453,7 @@
PCBGROUP opt_pcbgroup.h
PF_DEFAULT_TO_DROP opt_pf.h
ROUTE_MPATH opt_route.h
+ROUTE_ALGO opt_route.h
ROUTETABLES opt_route.h
RSS opt_rss.h
SLIP_IFF_OPTS opt_slip.h
Index: sys/net/route.h
===================================================================
--- sys/net/route.h
+++ sys/net/route.h
@@ -453,6 +453,8 @@
/* New API */
struct nhop_object *rib_lookup(uint32_t fibnum, const struct sockaddr *dst,
uint32_t flags, uint32_t flowid);
+struct rib_rtable_info;
+bool rib_get_rtable_info(uint32_t fibnum, int family, struct rib_rtable_info *info);
#endif
#endif
Index: sys/net/route.c
===================================================================
--- sys/net/route.c
+++ sys/net/route.c
@@ -151,6 +151,12 @@
rt_table_destroy(struct rib_head *rh)
{
+ RIB_WLOCK(rh);
+ rh->rib_dying = true;
+ RIB_WUNLOCK(rh);
+
+ fib_destroy_rib(rh);
+
tmproutes_destroy(rh);
rn_walktree(&rh->rmhead.head, rt_freeentry, &rh->rmhead.head);
Index: sys/net/route/route_algo.h
===================================================================
--- /dev/null
+++ sys/net/route/route_algo.h
@@ -0,0 +1,111 @@
+/*-
+ * Copyright (c) 2020
+ * Alexander V. Chernikov <melifaro@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+
+struct fib_data;
+struct fib_dp;
+enum flm_op_result {
+ FLM_SUCCESS, /* No errors, operation successful */
+ FLM_REBUILD, /* Operation cannot be completed, schedule algorithm rebuild */
+ FLM_ERROR, /* Operation failed, this algo cannot be used */
+};
+
+struct rib_rtable_info {
+ uint32_t num_prefixes;
+ uint32_t num_nhops;
+ uint32_t num_nhgrp;
+};
+
+struct flm_lookup_key {
+ union {
+ const struct in6_addr *addr6;
+ struct in_addr addr4;
+ };
+};
+
+typedef struct nhop_object *flm_lookup_t(void *algo_data,
+ const struct flm_lookup_key key, uint32_t scopeid);
+typedef enum flm_op_result flm_init_t (uint32_t fibnum, struct fib_data *fd,
+ void *_old_data, void **new_data);
+typedef void flm_destroy_t(void *data);
+typedef enum flm_op_result flm_dump_t(struct rtentry *rt, void *data);
+typedef enum flm_op_result flm_dump_end_t(void *data, struct fib_dp *dp);
+typedef enum flm_op_result flm_change_t(struct rib_head *rnh,
+ struct rib_cmd_info *rc, void *data);
+typedef uint8_t flm_get_pref_t(const struct rib_rtable_info *rinfo);
+
+#define FIB_M_NEED_NHOPS 0x01 /* need nexthop index map allocation */
+#define FIB_M_NO_CALLOUT 0x02 /* does not need callouts */
+
+struct fib_lookup_module {
+ char *flm_name; /* algo name */
+ int flm_family; /* address family this module supports */
+ int flm_refcount; /* # of references */
+ uint32_t flm_flags; /* flags */
+ uint8_t flm_index; /* internal algo index */
+ flm_init_t *flm_init_cb; /* instance init */
+ flm_destroy_t *flm_destroy_cb; /* destroy instance */
+ flm_change_t *flm_change_rib_item_cb;/* routing table change hook */
+ flm_dump_t *flm_dump_rib_item_cb; /* routing table dump cb */
+ flm_dump_end_t *flm_dump_end_cb; /* end of dump */
+ flm_lookup_t *flm_lookup; /* lookup function */
+ flm_get_pref_t *flm_get_pref; /* get algo preference */
+ TAILQ_ENTRY(fib_lookup_module) entries;
+};
+
+/* Datapath lookup data */
+struct fib_dp {
+ flm_lookup_t *f;
+ void *arg;
+};
+
+VNET_DECLARE(struct fib_dp *, inet_dp);
+#define V_inet_dp VNET(inet_dp)
+VNET_DECLARE(struct fib_dp *, inet6_dp);
+#define V_inet6_dp VNET(inet6_dp)
+
+int fib_module_init(struct fib_lookup_module *flm, uint32_t fibnum,
+ int family);
+int fib_module_clone(const struct fib_lookup_module *flm_orig,
+ struct fib_lookup_module *flm, bool waitok);
+int fib_module_dumptree(struct fib_lookup_module *flm,
+ enum rib_subscription_type subscription_type);
+int fib_module_register(struct fib_lookup_module *flm);
+int fib_module_unregister(struct fib_lookup_module *flm);
+
+uint32_t fib_get_nhop_idx(struct fib_data *fd, struct nhop_object *nh);
+void fib_free_nhop_idx(struct fib_data *fd, uint32_t idx);
+void fib_free_nhop(struct fib_data *fd, struct nhop_object *nh);
+struct nhop_object **fib_get_nhop_array(struct fib_data *fd);
+void fib_get_rtable_info(struct rib_head *rh, struct rib_rtable_info *rinfo);
+struct rib_head *fib_get_rh(struct fib_data *fd);
+
+
Index: sys/net/route/route_algo.c
===================================================================
--- /dev/null
+++ sys/net/route/route_algo.c
@@ -0,0 +1,1215 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2020 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+#define RTDEBUG
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+#include "opt_inet.h"
+#include "opt_inet6.h"
+#include "opt_route.h"
+
+#include <sys/param.h>
+#include <sys/eventhandler.h>
+#include <sys/kernel.h>
+#include <sys/sbuf.h>
+#include <sys/lock.h>
+#include <sys/rmlock.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/module.h>
+#include <sys/kernel.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/sysctl.h>
+#include <sys/queue.h>
+#include <net/vnet.h>
+
+#include <net/if.h>
+#include <net/if_var.h>
+
+#include <netinet/in.h>
+#include <netinet/in_var.h>
+#include <netinet/ip.h>
+#include <netinet/ip_var.h>
+#ifdef INET6
+#include <netinet/ip6.h>
+#include <netinet6/ip6_var.h>
+#endif
+
+#include <net/route.h>
+#include <net/route/nhop.h>
+#include <net/route/route_ctl.h>
+#include <net/route/route_var.h>
+#include <net/route/route_algo.h>
+
+/*
+ * Route lookup framework.
+ *
+ * flm - fib lookup modules - kernel modules implementing particular algo
+ * fd - fib data - instance of an flm bound to specific routing table
+ *
+ *
+ * For each supported address family, there is an allocated array of fib_dp
+ * structures, indexed by fib number. Each array entry contains a callback function
+ * and its argument. This function will be called with a family-specific lookup key,
+ * scope and the provided argument. This array gets re-created every time a new algo
+ * instance gets created. Please take a look at the replace_rtables_family() function
+ * for more details.
+ *
+ * Control plane to set up and update the necessary dataplane structures.
+ * 1) nexthops abstraction -> module has to deal with index, refcounting, nexthop groups etc
+ * 2) sync with route tables
+ * 3) dataplane attachment points
+ * 4) fail early. Some algorithms are immutable, so any change leads to rebuild. Some
+ * are mutable to some extent, so the module is built over common setup/teardown
+ * instances, making error handling easier.
+ * 5) preference.
+ *
+ */
+
+SYSCTL_DECL(_net_route);
+SYSCTL_NODE(_net_route, OID_AUTO, algo, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
+ "Route algorithm lookups");
+
+#ifdef INET6
+SYSCTL_NODE(_net_route_algo, OID_AUTO, inet6, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
+ "IPv6 algorithm lookups");
+#endif
+#ifdef INET
+SYSCTL_NODE(_net_route_algo, OID_AUTO, inet, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
+ "IPv4 algorithm lookups");
+#endif
+
+struct nhop_ref_table {
+ uint32_t count;
+ int32_t refcnt[0];
+};
+
+struct fib_data {
+ uint32_t number_nhops; /* current # of nhops */
+ uint32_t number_records; /* current # of routes */
+ uint8_t hit_nhops; /* true if out of nhop limit */
+ uint8_t init_done; /* true if init is completed */
+ uint32_t fd_dead:1; /* Scheduled for deletion */
+ uint32_t fd_linked:1; /* true if linked */
+ uint32_t fd_need_rebuild:1; /* true if rebuild scheduled */
+ uint32_t fd_force_eval:1;/* true if forced algo re-evaluation is scheduled */
+ uint8_t fd_family; /* family */
+ uint32_t fd_fibnum; /* fibnum */
+ uint32_t fd_failed_rebuilds; /* stat: failed rebuilds */
+ struct callout fd_callout; /* rebuild callout */
+ void *fd_algo_data; /* algorithm data */
+ struct nhop_object **nh_idx; /* nhop idx->ptr array */
+ struct nhop_ref_table *nh_ref_table; /* array with # of nhop references */
+ struct rib_head *fd_rh; /* RIB table we're attached to */
+ struct rib_subscription *fd_rs; /* storing table subscription */
+ struct fib_algo_calldata *fa;
+ struct fib_dp fd_dp; /* fib datapath data */
+ struct vnet *fd_vnet; /* vnet nhop belongs to */
+ struct epoch_context fd_epoch_ctx;
+ uint64_t gencnt;
+ struct fib_lookup_module *fd_flm;
+ uint32_t fd_num_changes; /* number of changes since last callout */
+ TAILQ_ENTRY(fib_data) entries; /* list of all fds in vnet */
+};
+
+static void rebuild_callout(void *_data);
+static void destroy_instance_epoch(epoch_context_t ctx);
+static enum flm_op_result switch_algo(struct fib_data *fd);
+static struct fib_lookup_module *find_algo(const char *algo_name, int family);
+
+static struct fib_lookup_module *fib_check_best_algo(struct rib_head *rh,
+ struct fib_lookup_module *orig_flm);
+
+struct mtx fib_mtx;
+#define MOD_LOCK() mtx_lock(&fib_mtx)
+#define MOD_UNLOCK() mtx_unlock(&fib_mtx)
+
+uint32_t algo_bitmask_idx = 0;
+
+
+/* Algorithm has to be this percent better than the current to switch */
+#define BEST_DIFF_PERCENT (5 * 256 / 100)
+/* Schedule algo re-evaluation X seconds after a change */
+#define ALGO_EVAL_DELAY_MS 30000
+/* Force algo re-evaluation after X changes */
+#define ALGO_EVAL_NUM_ROUTES 100
+/* Try to setup algorithm X times */
+#define FIB_MAX_TRIES 32
+/* Max amount of supported nexthops */
+#define FIB_MAX_NHOPS 262144
+#define FIB_CALLOUT_DELAY_MS 50
+
+
+/* TODO: per-VNET */
+static TAILQ_HEAD(fib_data_head, fib_data) fib_data_list = TAILQ_HEAD_INITIALIZER(fib_data_list);
+
+struct fib_dp_header {
+ struct epoch_context ffi_epoch_ctx;
+ uint32_t ffi_num_tables;
+ struct fib_dp ffi_idx[0];
+};
+
+static TAILQ_HEAD(, fib_lookup_module) all_algo_list;
+
+#ifdef RTDEBUG
+#define RH_PRINTF(_rh, _fmt, ...) printf("[rt_algo] %s.%u %s: " _fmt "\n", \
+ print_family(_rh->rib_family), _rh->rib_fibnum, __func__ , ## __VA_ARGS__)
+#define RH_PRINTF_RAW(_fmt, ...) printf("[rt_algo] %s: " _fmt "\n", __func__ , ## __VA_ARGS__)
+#define FD_PRINTF(fd, _fmt, ...) printf("[rt_algo] %s.%u (%s) %s: " _fmt "\n",\
+ print_family(fd->fd_family), fd->fd_fibnum, fd->fd_flm->flm_name, __func__, \
+ ##__VA_ARGS__)
+#else
+#define FD_PRINTF(fd, _fmt, ...)
+#define RH_PRINTF(_rh, _fmt, ...)
+#define RH_PRINTF_RAW(_fmt, ...)
+#endif
+
+static const char *
+print_family(int family)
+{
+ if (family == AF_INET)
+ return ("inet");
+ else if (family == AF_INET6)
+ return ("inet6");
+ else
+ return ("unknown");
+}
+
+static int
+print_algos(struct sysctl_req *req, int family)
+{
+ struct fib_lookup_module *flm;
+ struct sbuf sbuf;
+ int error, count = 0;
+
+ error = sysctl_wire_old_buffer(req, 0);
+ if (error == 0) {
+ sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
+ TAILQ_FOREACH(flm, &all_algo_list, entries) {
+ if (flm->flm_family == family) {
+ if (count++ > 0)
+ sbuf_cat(&sbuf, ", ");
+ sbuf_cat(&sbuf, flm->flm_name);
+ }
+ }
+ error = sbuf_finish(&sbuf);
+ sbuf_delete(&sbuf);
+ }
+ return (error);
+}
+
+static int
+print_algos_inet6(SYSCTL_HANDLER_ARGS)
+{
+
+ return (print_algos(req, AF_INET6));
+}
+SYSCTL_PROC(_net_route_algo_inet6, OID_AUTO, algo_list,
+ CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
+ print_algos_inet6, "A", "List of algos");
+
+static int
+print_algos_inet(SYSCTL_HANDLER_ARGS)
+{
+
+ return (print_algos(req, AF_INET));
+}
+SYSCTL_PROC(_net_route_algo_inet, OID_AUTO, algo_list,
+ CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
+ print_algos_inet, "A", "List of algos");
+
+
+static struct fib_lookup_module *
+find_algo(const char *algo_name, int family)
+{
+ struct fib_lookup_module *flm;
+
+ TAILQ_FOREACH(flm, &all_algo_list, entries) {
+ if ((strcmp(flm->flm_name, algo_name) == 0) &&
+ (family == flm->flm_family))
+ return (flm);
+ }
+
+ return (NULL);
+}
+
+static uint32_t
+callout_calc_delay(struct fib_data *fd)
+{
+ uint32_t shift;
+
+ if (fd->fd_failed_rebuilds > 10)
+ shift = 10;
+ else
+ shift = fd->fd_failed_rebuilds;
+
+ return ((1 << shift) * FIB_CALLOUT_DELAY_MS);
+}
+
+static void
+schedule_callout(struct fib_data *fd, int delay_ms)
+{
+
+ callout_reset_sbt(&fd->fd_callout, 0, SBT_1MS * delay_ms,
+ rebuild_callout, fd, 0);
+}
+
+static void
+schedule_algo_eval(struct fib_data *fd)
+{
+
+ if (fd->fd_num_changes++ == 0) {
+ /* Start callout to consider switch */
+ MOD_LOCK();
+ if (!callout_pending(&fd->fd_callout))
+ schedule_callout(fd, ALGO_EVAL_DELAY_MS);
+ MOD_UNLOCK();
+ } else if (fd->fd_num_changes > ALGO_EVAL_NUM_ROUTES && !fd->fd_force_eval) {
+ /* Reset callout to exec immediately */
+ MOD_LOCK();
+ if (!fd->fd_need_rebuild) {
+ fd->fd_force_eval = true;
+ schedule_callout(fd, 1);
+ }
+ MOD_UNLOCK();
+ }
+}
+
+/*
+ * rib subscription handler
+ */
+static void
+handle_rtable_change_cb(struct rib_head *rnh, struct rib_cmd_info *rc,
+ void *_data)
+{
+ struct fib_data *fd = (struct fib_data *)_data;
+ enum flm_op_result result;
+
+ RIB_WLOCK_ASSERT(rnh);
+
+ if (!fd->init_done)
+ return;
+
+ schedule_algo_eval(fd);
+
+ result = fd->fd_flm->flm_change_rib_item_cb(rnh, rc, fd->fd_algo_data);
+
+ switch (result) {
+ case FLM_SUCCESS:
+ break;
+ case FLM_REBUILD:
+ /*
+ * Algo reported inability to handle,
+ * schedule algo rebuild.
+ */
+ MOD_LOCK();
+ if (!fd->fd_need_rebuild) {
+ fd->fd_need_rebuild = true;
+ /*
+ * Potentially rewrites pending callout
+ * to re-evaluate algo.
+ */
+ FD_PRINTF(fd, "Scheduling rebuilt");
+ schedule_callout(fd, callout_calc_delay(fd));
+ }
+ MOD_UNLOCK();
+ break;
+ default:
+ /*
+ * Algo reported a non-recoverable error.
+ * Remove and switch to radix?
+ */
+ FD_PRINTF(fd, "algo reported non-recoverable error");
+ // TODO: switch to radix
+ }
+}
+
+static void
+estimate_scale(const struct fib_data *old_fd, struct fib_data *fd)
+{
+
+ if (old_fd == NULL) {
+ fd->number_nhops = 16;
+ return;
+ }
+
+ if (old_fd->hit_nhops && old_fd->number_nhops < FIB_MAX_NHOPS)
+ fd->number_nhops = 2 * old_fd->number_nhops;
+ else
+ fd->number_nhops = old_fd->number_nhops;
+}
+
+struct walk_cbdata {
+ struct fib_data *fd;
+ flm_dump_t *func;
+ enum flm_op_result result;
+};
+
+static void
+sync_algo_end_cb(struct rib_head *rnh, enum rib_walk_hook stage, void *_data)
+{
+ struct walk_cbdata *w = (struct walk_cbdata *)_data;
+ struct fib_data *fd = w->fd;
+
+ if (rnh->rib_dying) {
+ w->result = FLM_ERROR;
+ return;
+ }
+
+ if (stage != RIB_WALK_HOOK_POST || w->result != FLM_SUCCESS)
+ return;
+
+ if (fd->hit_nhops) {
+ FD_PRINTF(fd, "ran out of nexthops at %u nhops",
+ fd->nh_ref_table->count);
+ w->result = FLM_REBUILD;
+ return;
+ }
+
+ w->result = fd->fd_flm->flm_dump_end_cb(fd->fd_algo_data, &fd->fd_dp);
+
+ if (w->result == FLM_SUCCESS) {
+ /* Mark init as done to allow routing updates */
+ fd->init_done = 1;
+ }
+}
+
+static int
+sync_algo_cb(struct rtentry *rt, void *_data)
+{
+ struct walk_cbdata *w = (struct walk_cbdata *)_data;
+ enum flm_op_result result;
+
+ if (w->result == FLM_SUCCESS && w->func) {
+ result = w->func(rt, w->fd->fd_algo_data);
+ if (result != FLM_SUCCESS)
+ w->result = result;
+ }
+
+ return (0);
+}
+
+static enum flm_op_result
+sync_algo(struct fib_data *fd)
+{
+ struct walk_cbdata w;
+
+ w.fd = fd;
+ w.func = fd->fd_flm->flm_dump_rib_item_cb;
+ w.result = FLM_SUCCESS;
+
+ rib_walk_ext_internal(fd->fd_rh, true, sync_algo_cb, sync_algo_end_cb, &w);
+
+ FD_PRINTF(fd, "initial dump completed.");
+
+ return (w.result);
+}
+
+/*
+ * Assume already unlinked from datapath
+ */
+static int
+schedule_destroy_instance(struct fib_data *fd, bool in_callout)
+{
+ bool is_dead;
+
+ NET_EPOCH_ASSERT();
+
+ MOD_LOCK();
+ is_dead = fd->fd_dead;
+ if (!is_dead)
+ fd->fd_dead = true;
+ if (fd->fd_linked) {
+ TAILQ_REMOVE(&fib_data_list, fd, entries);
+ fd->fd_linked = false;
+ }
+ MOD_UNLOCK();
+ if (is_dead)
+ return (0);
+
+ FD_PRINTF(fd, "DETACH");
+
+ if (fd->fd_rs != NULL)
+ rib_unsibscribe(fd->fd_rs);
+
+ /*
+ * After rib_unsubscribe() no _new_ handle_rtable_change_cb() calls
+ * will be executed, hence no _new_ callout schedules will happen.
+ *
+ * There can be two possible scenarios here:
+ * 1) we're running inside a callout when we're deleting ourselves
+ * due to migration to a newer fd
+ * 2) we're running from rt_table_destroy() and callout is scheduled
+ * for execution OR is executing
+ *
+ * For (2) we need to wait for the callout termination, as the routing table
+ * will be destroyed after this function returns.
+ * For (1) we cannot call drain, but can ensure that this is the last invocation.
+ */
+
+ if (in_callout)
+ callout_stop(&fd->fd_callout);
+ else
+ callout_drain(&fd->fd_callout);
+
+ /*
+ * At this moment there are no other pending work scheduled.
+ */
+ FD_PRINTF(fd, "destroying old instance");
+ epoch_call(net_epoch_preempt, destroy_instance_epoch,
+ &fd->fd_epoch_ctx);
+
+ return (0);
+}
+
+void
+fib_destroy_rib(struct rib_head *rh)
+{
+ struct fib_data_head tmp_head = TAILQ_HEAD_INITIALIZER(tmp_head);
+ struct fib_data *fd, *fd_tmp;
+ struct epoch_tracker et;
+
+ /*
+ * Atm we have set is_dying flag on rnh, so all new fd's will
+ * fail at sync_algo() stage, so nothing new will be added to the list.
+ */
+ MOD_LOCK();
+ TAILQ_FOREACH_SAFE(fd, &fib_data_list, entries, fd_tmp) {
+ if (fd->fd_rh == rh) {
+ TAILQ_REMOVE(&fib_data_list, fd, entries);
+ fd->fd_linked = false;
+ TAILQ_INSERT_TAIL(&tmp_head, fd, entries);
+ }
+ }
+ MOD_UNLOCK();
+
+ /* Pass 2: remove each entry */
+ NET_EPOCH_ENTER(et);
+ TAILQ_FOREACH_SAFE(fd, &tmp_head, entries, fd_tmp) {
+ schedule_destroy_instance(fd, false);
+ }
+ NET_EPOCH_EXIT(et);
+}
+
+static void
+destroy_instance(struct fib_data *fd)
+{
+
+ FD_PRINTF(fd, "destroy fd %p", fd);
+
+ /* Call destroy callback first */
+ if (fd->fd_algo_data != NULL)
+ fd->fd_flm->flm_destroy_cb(fd->fd_algo_data);
+
+ /* Nhop table */
+ if (fd->nh_idx != NULL) {
+ for (int i = 0; i < fd->number_nhops; i++) {
+ if (fd->nh_idx[i] != NULL) {
+ //FD_PRINTF(fd, " FREE nhop %d %p", i, fd->nh_idx[i]);
+ nhop_free_any(fd->nh_idx[i]);
+ }
+ }
+ free(fd->nh_idx, M_RTABLE);
+ }
+ if (fd->nh_ref_table != NULL)
+ free(fd->nh_ref_table, M_RTABLE);
+
+ MOD_LOCK();
+ fd->fd_flm->flm_refcount--;
+ MOD_UNLOCK();
+
+ free(fd, M_RTABLE);
+}
+
+/*
+ * Epoch callback indicating fd is safe to destroy
+ */
+static void
+destroy_instance_epoch(epoch_context_t ctx)
+{
+ struct fib_data *fd;
+
+ fd = __containerof(ctx, struct fib_data, fd_epoch_ctx);
+
+ destroy_instance(fd);
+}
+
+static enum flm_op_result
+try_setup_instance(struct fib_lookup_module *flm, struct rib_head *rh,
+ struct fib_data *old_fd, struct fib_data **pfd)
+{
+ struct fib_data *fd;
+ size_t size;
+ enum flm_op_result result;
+
+ /* Allocate */
+ fd = malloc(sizeof(struct fib_data), M_RTABLE, M_NOWAIT | M_ZERO);
+ if (fd == NULL) {
+ *pfd = NULL;
+ return (FLM_REBUILD);
+ }
+ *pfd = fd;
+
+ estimate_scale(old_fd, fd);
+
+ fd->fd_rh = rh;
+ fd->fd_family = rh->rib_family;
+ fd->fd_fibnum = rh->rib_fibnum;
+ callout_init(&fd->fd_callout, 1);
+ fd->fd_vnet = curvnet;
+ fd->fd_flm = flm;
+
+ /* Allocate nhidx -> nhop_ptr table */
+ size = fd->number_nhops * sizeof(void *);
+ //FD_PRINTF(fd, "malloc(%lu)", size);
+ fd->nh_idx = malloc(size, M_RTABLE, M_NOWAIT | M_ZERO);
+ if (fd->nh_idx == NULL) {
+ FD_PRINTF(fd, "Unable to allocate nhop table idx (sz:%zu)", size);
+ return (FLM_REBUILD);
+ }
+
+ /* Allocate nhop index refcount table */
+ size = sizeof(struct nhop_ref_table);
+ size += fd->number_nhops * sizeof(uint32_t);
+ //FD_PRINTF(fd, "malloc(%lu)", size);
+ fd->nh_ref_table = malloc(size, M_RTABLE, M_NOWAIT | M_ZERO);
+ if (fd->nh_ref_table == NULL) {
+ FD_PRINTF(fd, "Unable to allocate nhop refcount table (sz:%zu)", size);
+ return (FLM_REBUILD);
+ }
+
+ /* Okay, we're ready for algo init */
+ void *old_algo_data = (old_fd != NULL) ? old_fd->fd_algo_data : NULL;
+ result = flm->flm_init_cb(fd->fd_fibnum, fd, old_algo_data, &fd->fd_algo_data);
+ if (result != FLM_SUCCESS)
+ return (result);
+
+ /* Try to subscribe */
+ if (flm->flm_change_rib_item_cb != NULL) {
+ fd->fd_rs = rib_subscribe_internal(fd->fd_rh,
+ handle_rtable_change_cb, fd, RIB_NOTIFY_IMMEDIATE, 0);
+ if (fd->fd_rs == NULL)
+ return (FLM_REBUILD);
+ }
+
+ /* Dump */
+ result = sync_algo(fd);
+ if (result != FLM_SUCCESS)
+ return (result);
+ FD_PRINTF(fd, "DUMP completed successfully.");
+
+ MOD_LOCK();
+ /*
+ * Insert in the beginning of a list, to simplify search
+ * first matching entry is the one.
+ */
+ TAILQ_INSERT_HEAD(&fib_data_list, fd, entries);
+ fd->fd_linked = true;
+ MOD_UNLOCK();
+
+ return (FLM_SUCCESS);
+}
+
+/*
+ * Sets up algo @flm for table @rh and links it to the datapath.
+ *
+ */
+static enum flm_op_result
+setup_instance(struct fib_lookup_module *flm, struct rib_head *rh,
+ struct fib_data *orig_fd, struct fib_data **pfd, bool attach)
+{
+ struct fib_data *prev_fd, *new_fd;
+ struct epoch_tracker et;
+ enum flm_op_result result;
+
+ prev_fd = orig_fd;
+ new_fd = NULL;
+ for (int i = 0; i < FIB_MAX_TRIES; i++) {
+ NET_EPOCH_ENTER(et);
+ result = try_setup_instance(flm, rh, prev_fd, &new_fd);
+
+ if ((result == FLM_SUCCESS) && attach)
+ result = switch_algo(new_fd);
+
+ if ((prev_fd != NULL) && (prev_fd != orig_fd)) {
+ schedule_destroy_instance(prev_fd, false);
+ prev_fd = NULL;
+ }
+ NET_EPOCH_EXIT(et);
+
+ RH_PRINTF(rh, "try %d: fib algo result: %d", i, result);
+
+ if (result == FLM_REBUILD) {
+ prev_fd = new_fd;
+ new_fd = NULL;
+ continue;
+ }
+
+ break;
+ }
+
+ if (result != FLM_SUCCESS) {
+ /* update failure count */
+ MOD_LOCK();
+ if (orig_fd != NULL)
+ orig_fd->fd_failed_rebuilds++;
+ MOD_UNLOCK();
+
+ NET_EPOCH_ENTER(et);
+ if ((prev_fd != NULL) && (prev_fd != orig_fd))
+ schedule_destroy_instance(prev_fd, false);
+ if (new_fd != NULL) {
+ schedule_destroy_instance(new_fd, false);
+ new_fd = NULL;
+ }
+ NET_EPOCH_EXIT(et);
+ }
+
+ *pfd = new_fd;
+ return (result);
+}
+
+static void
+rebuild_callout(void *_data)
+{
+ struct fib_data *fd, *fd_new;
+ struct fib_lookup_module *flm_new;
+ struct epoch_tracker et;
+ enum flm_op_result result;
+ bool need_rebuild = false;
+
+ fd = (struct fib_data *)_data;
+
+ MOD_LOCK();
+ need_rebuild = fd->fd_need_rebuild;
+ fd->fd_need_rebuild = false;
+ fd->fd_force_eval = false;
+ fd->fd_num_changes = 0;
+ MOD_UNLOCK();
+
+ CURVNET_SET(fd->fd_vnet);
+
+ /* First, check if we're still OK to use this algo */
+ flm_new = fib_check_best_algo(fd->fd_rh, fd->fd_flm);
+ if ((flm_new == NULL) && (!need_rebuild)) {
+ /* Keep existing algo, no need to rebuild. */
+ CURVNET_RESTORE();
+ return;
+ }
+
+ struct fib_data *fd_tmp = (flm_new == NULL) ? fd : NULL;
+ result = setup_instance(fd->fd_flm, fd->fd_rh, fd_tmp, &fd_new, true);
+ if (result != FLM_SUCCESS) {
+ FD_PRINTF(fd, "table rebuild failed");
+ CURVNET_RESTORE();
+ return;
+ }
+ FD_PRINTF(fd_new, "switched to new instance");
+
+ /* Remove old */
+ if (fd != NULL) {
+ NET_EPOCH_ENTER(et);
+ schedule_destroy_instance(fd, true);
+ NET_EPOCH_EXIT(et);
+ }
+
+ CURVNET_RESTORE();
+}
+
+static int
+set_algo_sysctl_handler(SYSCTL_HANDLER_ARGS)
+{
+ int error = 0;
+#if 0
+ struct epoch_tracker et;
+ struct fib_lookup_module *flm;
+ struct fib_data *old_fd, *fd;
+ char old_algo_name[32], algo_name[32];
+ uint32_t fibnum;
+ int error;
+
+ fibnum = RT_DEFAULT_FIB;
+
+ if (old_fd == NULL) {
+ strlcpy(old_algo_name, "radix", sizeof(old_algo_name));
+ } else {
+ strlcpy(old_algo_name, fd_ptr->fd_flm->flm_name,
+ sizeof(old_algo_name));
+ }
+ strlcpy(algo_name, old_algo_name, sizeof(algo_name));
+ error = sysctl_handle_string(oidp, algo_name, sizeof(algo_name), req);
+ if (error != 0 || req->newptr == NULL)
+ return (error);
+
+ if (strcmp(algo_name, old_algo_name) == 0)
+ return (0);
+
+ if (strcmp(algo_name, "radix") == 0) {
+ /* teardown old one */
+ NET_EPOCH_ENTER(et);
+ MOD_LOCK();
+ old_fd = fd_ptr;
+ fd_ptr = NULL;
+ MOD_UNLOCK();
+
+ if (old_fd != NULL)
+ schedule_destroy_instance(old_fd);
+ NET_EPOCH_EXIT(et);
+ return (0);
+ }
+
+ MOD_LOCK();
+ flm = find_algo(algo_name, AF_INET6);
+ if (flm != NULL)
+ flm->flm_refcount++;
+ MOD_UNLOCK();
+
+ if (flm == NULL) {
+ DPRINTF("unable to find algo %s", algo_name);
+ return (ESRCH);
+ }
+ DPRINTF("inet6.%u: requested fib algo %s", fibnum, algo_name);
+
+ fd = setup_instance(flm, fibnum, NULL, &error);
+
+ if (error != 0) {
+ MOD_LOCK();
+ flm->flm_refcount--;
+ MOD_UNLOCK();
+ return (error);
+ }
+
+ MOD_LOCK();
+ old_fd = fd_ptr;
+ fd_ptr = fd;
+ MOD_UNLOCK();
+
+ /* Remove old */
+ NET_EPOCH_ENTER(et);
+ if (old_fd != NULL) {
+ error = schedule_destroy_instance(old_fd);
+ }
+ NET_EPOCH_EXIT(et);
+#endif
+
+ /* Set new */
+
+ /* Drain cb so user can unload the module after userret if so desired */
+ epoch_drain_callbacks(net_epoch_preempt);
+
+ return (error);
+}
+SYSCTL_PROC(_net_route_algo_inet6, OID_AUTO, algo,
+ CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0,
+ set_algo_sysctl_handler, "A",
+ "Set");
+
+static void
+destroy_fdh_epoch(epoch_context_t ctx)
+{
+ struct fib_dp_header *ffi;
+
+ ffi = __containerof(ctx, struct fib_dp_header, ffi_epoch_ctx);
+ free(ffi, M_RTABLE);
+}
+
+static struct fib_dp_header *
+alloc_fib_dp_array(uint32_t num_tables, bool waitok)
+{
+ size_t sz;
+ struct fib_dp_header *ffi;
+
+ sz = sizeof(struct fib_dp_header);
+ sz += sizeof(struct fib_dp) * num_tables;
+ ffi = malloc(sz, M_RTABLE, (waitok ? M_WAITOK : M_NOWAIT) | M_ZERO);
+ if (ffi != NULL)
+ ffi->ffi_num_tables = num_tables;
+ return (ffi);
+}
+
+static struct fib_dp_header *
+get_fib_dp_header(struct fib_dp *dp)
+{
+
+ return (__containerof((void *)dp, struct fib_dp_header, ffi_idx));
+}
+
+/*
+ * Replace per-family index pool @pdp with a new one which
+ * contains updated callback/algo data from @fd.
+ * Returns FLM_SUCCESS on success.
+ */
+static enum flm_op_result
+replace_rtables_family(struct fib_dp **pdp, struct fib_data *fd)
+{
+ struct fib_dp_header *new_ffi, *old_ffi;
+
+ NET_EPOCH_ASSERT();
+
+ //FD_PRINTF(fd, "[vnet %p] replace with f:%p arg:%p", curvnet, fd->fd_dp.f, fd->fd_dp.arg);
+
+ MOD_LOCK();
+ old_ffi = get_fib_dp_header(*pdp);
+ new_ffi = alloc_fib_dp_array(old_ffi->ffi_num_tables, false);
+ //FD_PRINTF(fd, "OLD FFI: %p NEW FFI: %p", old_ffi, new_ffi);
+ if (new_ffi == NULL) {
+ MOD_UNLOCK();
+ FD_PRINTF(fd, "error attaching datapath");
+ return (FLM_REBUILD);
+ }
+
+ memcpy(&new_ffi->ffi_idx[0], &old_ffi->ffi_idx[0],
+ old_ffi->ffi_num_tables * sizeof(struct fib_dp));
+ /* Update relevant data structure for @fd */
+ new_ffi->ffi_idx[fd->fd_fibnum] = fd->fd_dp;
+
+ /* Ensure memcpy() writes have completed */
+ atomic_thread_fence_rel();
+ /* Set new datapath pointer */
+ *pdp = &new_ffi->ffi_idx[0];
+ MOD_UNLOCK();
+ //FD_PRINTF(fd, "update %p -> %p", old_ffi, new_ffi);
+
+ epoch_call(net_epoch_preempt, destroy_fdh_epoch,
+ &old_ffi->ffi_epoch_ctx);
+
+ return (FLM_SUCCESS);
+}
+
+static struct fib_dp **
+get_family_ptr(int family)
+{
+ switch (family) {
+ case AF_INET:
+ return (&V_inet_dp);
+ case AF_INET6:
+ return (&V_inet6_dp);
+ }
+ return (NULL);
+}
+
+static enum flm_op_result
+switch_algo(struct fib_data *fd)
+{
+ struct fib_dp **pdp;
+
+ pdp = get_family_ptr(fd->fd_family);
+ return (replace_rtables_family(pdp, fd));
+}
+
+/*
+ * Grow datapath pointers array.
+ * Called from sysctl handler on growing number of routing tables.
+ */
+static void
+grow_rtables_family(struct fib_dp **pdp, uint32_t new_num_tables)
+{
+ struct fib_dp_header *new_fdh, *old_fdh = NULL;
+
+ new_fdh = alloc_fib_dp_array(new_num_tables, true);
+
+ MOD_LOCK();
+ if (*pdp != NULL) {
+ old_fdh = get_fib_dp_header(*pdp);
+ memcpy(&new_fdh->ffi_idx[0], &old_fdh->ffi_idx[0],
+ old_fdh->ffi_num_tables * sizeof(struct fib_dp));
+ }
+
+ /* Wait till all writes completed */
+ atomic_thread_fence_rel();
+
+ *pdp = &new_fdh->ffi_idx[0];
+ MOD_UNLOCK();
+
+ if (old_fdh != NULL)
+ epoch_call(net_epoch_preempt, destroy_fdh_epoch,
+ &old_fdh->ffi_epoch_ctx);
+}
+
+/*
+ * Grows per-AF arrays of datapath pointers for each supported family.
+ * Called from fibs resize sysctl handler.
+ */
+void
+fib_grow_rtables(uint32_t new_num_tables)
+{
+
+ grow_rtables_family(get_family_ptr(AF_INET), new_num_tables);
+ grow_rtables_family(get_family_ptr(AF_INET6), new_num_tables);
+}
+
+/*
+ * Fills @rinfo with a snapshot of @rh's size: prefix count, nexthop
+ * count and (with ROUTE_MPATH) nexthop-group count. Used by algo
+ * modules to size their structures and compute preference.
+ */
+void
+fib_get_rtable_info(struct rib_head *rh, struct rib_rtable_info *rinfo)
+{
+
+ bzero(rinfo, sizeof(struct rib_rtable_info));
+ rinfo->num_prefixes = rh->rnh_prefixes;
+ rinfo->num_nhops = nhops_get_count(rh);
+#ifdef ROUTE_MPATH
+ rinfo->num_nhgrp = nhgrp_get_count(rh);
+#endif
+}
+
+struct rib_head *
+fib_get_rh(struct fib_data *fd)
+{
+
+ return (fd->fd_rh);
+}
+
+/*
+ * Returns the internal index of nexthop (or nexthop group) @nh.
+ * With ROUTE_MPATH, plain nexthops and groups share one index space:
+ * nexthops occupy even slots (idx * 2), groups odd slots (idx * 2 - 1).
+ */
+static uint32_t
+get_nhop_idx(struct nhop_object *nh)
+{
+#ifdef ROUTE_MPATH
+ if (NH_IS_NHGRP(nh))
+ return (nhgrp_get_idx((struct nhgrp_object *)nh) * 2 - 1);
+ else
+ return (nhop_get_idx(nh) * 2);
+#else
+ return (nhop_get_idx(nh));
+#endif
+}
+
+
+/*
+ * References @nh in @fd's index table and returns its index.
+ * Returns 0 (reserved as an error marker) and records the overflow in
+ * fd->hit_nhops when the table is too small; callers treat 0 as a
+ * request to rebuild (see lradix4_add_route_cb()).
+ */
+uint32_t
+fib_get_nhop_idx(struct fib_data *fd, struct nhop_object *nh)
+{
+ uint32_t idx = get_nhop_idx(nh);
+
+ if (idx >= fd->number_nhops) {
+ fd->hit_nhops = 1;
+ return (0);
+ }
+
+ /* First reference for this slot: take a real nhop reference */
+ if (fd->nh_idx[idx] == NULL) {
+ nhop_ref_any(nh);
+ fd->nh_idx[idx] = nh;
+ fd->nh_ref_table->count++;
+ //FD_PRINTF(fd, " REF nhop %u %p", idx, fd->nh_idx[idx]);
+ }
+ fd->nh_ref_table->refcnt[idx]++;
+
+ return (idx);
+}
+
+struct nhop_release_data {
+ struct nhop_object *nh;
+ struct epoch_context ctx;
+};
+
+static void
+release_nhop_epoch(epoch_context_t ctx)
+{
+ struct nhop_release_data *nrd;
+
+ nrd = __containerof(ctx, struct nhop_release_data, ctx);
+ nhop_free_any(nrd->nh);
+ free(nrd, M_RTABLE);
+}
+
+/*
+ * Schedules an epoch-deferred release of the reference held on @nh,
+ * so in-flight lockless lookups can finish using it.
+ */
+static void
+fib_schedule_release_nhop(struct fib_data *fd, struct nhop_object *nh)
+{
+ struct nhop_release_data *nrd;
+
+ nrd = malloc(sizeof(struct nhop_release_data), M_RTABLE, M_NOWAIT | M_ZERO);
+ if (nrd != NULL) {
+ nrd->nh = nh;
+ epoch_call(net_epoch_preempt, release_nhop_epoch, &nrd->ctx);
+ } else {
+ /*
+ * Unable to allocate memory. Leak the nexthop to keep the
+ * guarantee that every indexed nhop stays valid while the
+ * datapath may still reference it.
+ */
+ FD_PRINTF(fd, "unable to allocate structure for nhop %p deletion", nh);
+ }
+}
+
+/*
+ * Drops one reference from the nexthop at slot @idx; once the slot
+ * refcount reaches zero, schedules an epoch-deferred nhop release.
+ */
+void
+fib_free_nhop_idx(struct fib_data *fd, uint32_t idx)
+{
+
+ KASSERT((idx < fd->number_nhops), ("invalid nhop index"));
+
+ fd->nh_ref_table->refcnt[idx]--;
+ if (fd->nh_ref_table->refcnt[idx] == 0) {
+ //FD_PRINTF(fd, " FREE nhop %d %p", idx, fd->nh_idx[idx]);
+ /* NOTE(review): fd->nh_idx[idx] is not cleared here; verify a
+ * later fib_get_nhop_idx() on the same slot cannot skip
+ * re-referencing a released nexthop. */
+ fib_schedule_release_nhop(fd, fd->nh_idx[idx]);
+ }
+}
+
+void
+fib_free_nhop(struct fib_data *fd, struct nhop_object *nh)
+{
+
+ fib_free_nhop_idx(fd, get_nhop_idx(nh));
+}
+
+struct nhop_object **
+fib_get_nhop_array(struct fib_data *fd)
+{
+
+ return (fd->nh_idx);
+}
+
+/*
+ * Scans registered modules matching @rh's family and picks the one with
+ * the highest preference for the current table size. Returns NULL when
+ * the current module @orig_flm should be kept (no candidate, or the win
+ * margin is below BEST_DIFF_PERCENT). A non-NULL result is returned
+ * with its flm_refcount already bumped under MOD_LOCK.
+ */
+static struct fib_lookup_module *
+fib_check_best_algo(struct rib_head *rh, struct fib_lookup_module *orig_flm)
+{
+ uint8_t preference, curr_preference = 0, best_preference = 0;
+ struct fib_lookup_module *flm, *best_flm = NULL;
+ struct rib_rtable_info rinfo;
+ int candidate_algos = 0;
+
+ fib_get_rtable_info(rh, &rinfo);
+
+ MOD_LOCK();
+ TAILQ_FOREACH(flm, &all_algo_list, entries) {
+ if (flm->flm_family != rh->rib_family)
+ continue;
+ candidate_algos++;
+ preference = flm->flm_get_pref(&rinfo);
+ if (preference > best_preference) {
+ best_preference = preference;
+ best_flm = flm;
+ }
+ if (flm == orig_flm)
+ curr_preference = preference;
+ }
+ if (best_flm != NULL && best_flm != orig_flm) {
+ /* Switch only if the winner beats the current algo by a margin */
+ if (curr_preference + BEST_DIFF_PERCENT < best_preference)
+ best_flm->flm_refcount++;
+ else
+ best_flm = NULL;
+ } else
+ best_flm = NULL;
+ MOD_UNLOCK();
+
+ RH_PRINTF(rh, "candidate_algos: %d, curr: %s(%d) result: %s(%d)",
+ candidate_algos, orig_flm ? orig_flm->flm_name : "NULL", curr_preference,
+ best_flm ? best_flm->flm_name : "NULL", best_preference);
+
+ return (best_flm);
+}
+
+/*
+ * Called when a new route table is created.
+ * Selects, allocates and attaches a fib algo instance for the table.
+ * Returns 0 on success, ENOENT when no algo is available, EINVAL when
+ * instance setup fails. Callers (grow_rtables()) check this result.
+ */
+int
+fib_select_algo_initial(struct rib_head *rh)
+{
+ struct fib_lookup_module *flm;
+ struct fib_data *fd = NULL;
+ enum flm_op_result result;
+
+ flm = fib_check_best_algo(rh, NULL);
+ if (flm == NULL) {
+ RH_PRINTF(rh, "no algo selected");
+ return (ENOENT);
+ }
+ RH_PRINTF(rh, "selected algo %s", flm->flm_name);
+
+ result = setup_instance(flm, rh, NULL, &fd, false);
+ RH_PRINTF(rh, "result=%d fd=%p", result, fd);
+ if (result != FLM_SUCCESS) {
+ /* XXX: the flm_refcount taken by fib_check_best_algo()
+ * is leaked on this path; drop it once an unref helper
+ * exists. */
+ return (EINVAL);
+ }
+
+ /*
+ * Attach datapath directly to avoid N reallocations
+ * during fib growth
+ */
+ struct fib_dp_header *fdp;
+ struct fib_dp **pdp;
+
+ pdp = get_family_ptr(rh->rib_family);
+ if (pdp != NULL) {
+ fdp = get_fib_dp_header(*pdp);
+ fdp->ffi_idx[fd->fd_fibnum] = fd->fd_dp;
+ FD_PRINTF(fd, "datapath attached");
+ }
+
+ return (0);
+}
+
+int
+fib_module_register(struct fib_lookup_module *flm)
+{
+
+ MOD_LOCK();
+ RH_PRINTF_RAW("attaching %s to %s", flm->flm_name,
+ print_family(flm->flm_family));
+ TAILQ_INSERT_TAIL(&all_algo_list, flm, entries);
+ MOD_UNLOCK();
+
+ return (0);
+}
+
+/*
+ * Removes @flm from the list of available algo modules.
+ * Returns EBUSY (without unlinking) while any rib instance still
+ * holds a reference to the module.
+ */
+int
+fib_module_unregister(struct fib_lookup_module *flm)
+{
+ MOD_LOCK();
+ if (flm->flm_refcount > 0) {
+ MOD_UNLOCK();
+ return (EBUSY);
+ }
+ RH_PRINTF_RAW("detaching %s from %s", flm->flm_name,
+ print_family(flm->flm_family));
+ TAILQ_REMOVE(&all_algo_list, flm, entries);
+ MOD_UNLOCK();
+
+ return (0);
+}
+
+int
+fib_module_clone(const struct fib_lookup_module *flm_orig,
+ struct fib_lookup_module *flm, bool waitok)
+{
+
+ return (0);
+}
+
+int
+fib_module_dumptree(struct fib_lookup_module *flm,
+ enum rib_subscription_type subscription_type)
+{
+
+
+ return (0);
+}
+
+static void
+fib_algo_init(void)
+{
+
+ mtx_init(&fib_mtx, "algo list mutex", NULL, MTX_DEF);
+ TAILQ_INIT(&all_algo_list);
+}
+SYSINIT(route_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_SECOND, fib_algo_init, NULL);
+
Index: sys/net/route/route_tables.c
===================================================================
--- sys/net/route/route_tables.c
+++ sys/net/route/route_tables.c
@@ -171,7 +171,7 @@
grow_rtables(uint32_t num_tables)
{
struct domain *dom;
- struct rib_head **prnh;
+ struct rib_head **prnh, *rh;
struct rib_head **new_rt_tables, **old_rt_tables;
int family;
@@ -188,6 +188,8 @@
"by default. Consider tuning %s if needed\n",
"net.add_addr_allfibs");
+ fib_grow_rtables(num_tables);
+
/*
* Current rt_tables layout:
* fib0[af0, af1, af2, .., AF_MAX]fib1[af0, af1, af2, .., Af_MAX]..
@@ -206,10 +208,16 @@
prnh = &new_rt_tables[i * (AF_MAX + 1) + family];
if (*prnh != NULL)
continue;
- *prnh = dom->dom_rtattach(i);
- if (*prnh == NULL)
- log(LOG_ERR, "unable to create routing tables for domain %d\n",
- dom->dom_family);
+ rh = dom->dom_rtattach(i);
+ if (rh == NULL) {
+ log(LOG_ERR, "unable to create routing table for %d.%d\n",
+ dom->dom_family, i);
+ /* Skip algo selection: rh must not be dereferenced */
+ continue;
+ }
+ if (fib_select_algo_initial(rh) != 0) {
+ log(LOG_ERR, "unable to select algo for table %d.%d\n",
+ dom->dom_family, i);
+ // TODO: detach table
+ }
+ *prnh = rh;
}
}
Index: sys/net/route/route_var.h
===================================================================
--- sys/net/route/route_var.h
+++ sys/net/route/route_var.h
@@ -68,6 +68,7 @@
struct vnet *rib_vnet; /* vnet pointer */
int rib_family; /* AF of the rtable */
u_int rib_fibnum; /* fib number */
+ bool rib_dying; /* rib is detaching */
struct callout expire_callout; /* Callout for expiring dynamic routes */
time_t next_expire; /* Next expire run ts */
uint32_t rnh_prefixes; /* Number of prefixes */
@@ -310,6 +311,11 @@
void nhgrp_ref_object(struct nhgrp_object *nhg);
uint32_t nhgrp_get_idx(const struct nhgrp_object *nhg);
void nhgrp_free(struct nhgrp_object *nhg);
+
+/* lookup_framework.c */
+void fib_grow_rtables(uint32_t new_num_tables);
+int fib_select_algo_initial(struct rib_head *rh);
+void fib_destroy_rib(struct rib_head *rh);
/* Entropy data used for outbound hashing */
#define MPATH_ENTROPY_KEY_LEN 40
Index: sys/netinet/in_fib.c
===================================================================
--- sys/netinet/in_fib.c
+++ sys/netinet/in_fib.c
@@ -49,6 +49,7 @@
#include <net/route.h>
#include <net/route/route_ctl.h>
#include <net/route/route_var.h>
+#include <net/route/route_algo.h>
#include <net/route/nhop.h>
#include <net/toeplitz.h>
#include <net/vnet.h>
@@ -63,6 +64,10 @@
/* Assert 'struct route_in' is compatible with 'struct route' */
CHK_STRUCT_ROUTE_COMPAT(struct route_in, ro_dst4);
+#ifdef ROUTE_ALGO
+VNET_DEFINE(struct fib_dp *, inet_dp);
+#endif
+
#ifdef ROUTE_MPATH
struct _hash_5tuple_ipv4 {
struct in_addr src;
@@ -103,6 +108,29 @@
* one needs to pass NHR_REF as a flag. This will return referenced
* nexthop.
*/
+#ifdef ROUTE_ALGO
+struct nhop_object *
+fib4_lookup(uint32_t fibnum, struct in_addr dst, uint32_t scopeid,
+ uint32_t flags, uint32_t flowid)
+{
+ struct nhop_object *nh;
+ struct fib_dp *dp = &V_inet_dp[fibnum];
+ struct flm_lookup_key key = {.addr4 = dst };
+
+ nh = dp->f(dp->arg, key, scopeid);
+ if (nh != NULL) {
+ nh = nhop_select(nh, flowid);
+ /* Ensure route & ifp is UP */
+ if (RT_LINK_IS_UP(nh->nh_ifp)) {
+ if (flags & NHR_REF)
+ nhop_ref_object(nh);
+ return (nh);
+ }
+ }
+ RTSTAT_INC(rts_unreach);
+ return (NULL);
+}
+#else
struct nhop_object *
fib4_lookup(uint32_t fibnum, struct in_addr dst, uint32_t scopeid,
uint32_t flags, uint32_t flowid)
@@ -142,6 +170,7 @@
RTSTAT_INC(rts_unreach);
return (NULL);
}
+#endif
inline static int
check_urpf_nhop(const struct nhop_object *nh, uint32_t flags,
@@ -180,6 +209,7 @@
return (check_urpf_nhop(nh, flags, src_if));
}
+#ifndef ROUTE_ALGO
static struct nhop_object *
lookup_nhop(uint32_t fibnum, struct in_addr dst, uint32_t scopeid)
{
@@ -208,6 +238,7 @@
return (nh);
}
+#endif
/*
* Performs reverse path forwarding lookup.
@@ -223,8 +254,14 @@
uint32_t flags, const struct ifnet *src_if)
{
struct nhop_object *nh;
+#ifdef ROUTE_ALGO
+ struct fib_dp *dp = &V_inet_dp[fibnum];
+ struct flm_lookup_key key = {.addr4 = dst };
+ nh = dp->f(dp->arg, key, scopeid);
+#else
nh = lookup_nhop(fibnum, dst, scopeid);
+#endif
if (nh != NULL)
return (check_urpf(nh, flags, src_if));
Index: sys/netinet/in_fib_algo.c
===================================================================
--- /dev/null
+++ sys/netinet/in_fib_algo.c
@@ -0,0 +1,315 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2020 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/rmlock.h>
+#include <sys/malloc.h>
+#include <sys/kernel.h>
+#include <sys/priv.h>
+#include <sys/socket.h>
+#include <sys/sysctl.h>
+#include <net/vnet.h>
+
+#include <net/if.h>
+#include <netinet/in.h>
+
+#include <net/route.h>
+#include <net/route/nhop.h>
+#include <net/route/route_ctl.h>
+#include <net/route/route_var.h>
+#include <net/route/route_algo.h>
+
+
+#define KEY_LEN_INET (offsetof(struct sockaddr_in, sin_addr) + sizeof(in_addr_t))
+#define OFF_LEN_INET (8 * offsetof(struct sockaddr_in, sin_addr))
+struct radix4_addr_entry {
+ struct radix_node rn[2];
+ struct sockaddr_in addr;
+ struct nhop_object *nhop;
+};
+#define LRADIX4_ITEM_SZ roundup2(sizeof(struct radix4_addr_entry), 64)
+
+struct lradix4_data {
+ struct radix_node_head *rnh;
+ struct fib_data *fd;
+ void *mem;
+ uint32_t alloc_items;
+ uint32_t num_items;
+};
+
+static struct nhop_object *
+lradix4_lookup(void *algo_data, const struct flm_lookup_key key, uint32_t scopeid)
+{
+ struct radix_node_head *rnh = (struct radix_node_head *)algo_data;
+ struct radix4_addr_entry *ent;
+ struct sockaddr_in addr4 = {
+ .sin_len = KEY_LEN_INET,
+ .sin_addr = key.addr4,
+ };
+ ent = (struct radix4_addr_entry *)(rnh->rnh_matchaddr(&addr4, &rnh->rh));
+ if (ent != NULL)
+ return (ent->nhop);
+ return (NULL);
+}
+
+static uint8_t
+lradix4_get_pref(const struct rib_rtable_info *rinfo)
+{
+
+ if (rinfo->num_prefixes < 10)
+ return (255);
+ else if (rinfo->num_prefixes < 100000)
+ return (255 - rinfo->num_prefixes / 394);
+ else
+ return (1);
+}
+
+/*
+ * Allocates the lockless-radix IPv4 instance: a radix head plus a
+ * preallocated slab of entries sized for current prefixes + 10%.
+ * Frees partial allocations itself on failure, since *_data is not
+ * published and the framework cannot reach them.
+ */
+static enum flm_op_result
+lradix4_init(uint32_t fibnum, struct fib_data *fd, void *_old_data, void **_data)
+{
+ struct lradix4_data *lr;
+ struct rib_rtable_info rinfo;
+ uint32_t count;
+
+ lr = malloc(sizeof(struct lradix4_data), M_RTABLE, M_NOWAIT | M_ZERO);
+ if (lr == NULL)
+ return (FLM_REBUILD);
+ if (!rn_inithead((void **)&lr->rnh, OFF_LEN_INET)) {
+ free(lr, M_RTABLE);
+ return (FLM_REBUILD);
+ }
+ fib_get_rtable_info(fib_get_rh(fd), &rinfo);
+
+ /* Size for current prefixes plus 10% headroom */
+ count = rinfo.num_prefixes * 11 / 10;
+ // XXX: alignment!
+ lr->mem = malloc(count * LRADIX4_ITEM_SZ, M_RTABLE, M_NOWAIT | M_ZERO);
+ if (lr->mem == NULL) {
+ rn_detachhead((void **)&lr->rnh);
+ free(lr, M_RTABLE);
+ return (FLM_REBUILD);
+ }
+ lr->alloc_items = count;
+ lr->fd = fd;
+
+ *_data = lr;
+
+ return (FLM_SUCCESS);
+}
+
+static void
+lradix4_destroy(void *_data)
+{
+ struct lradix4_data *lr = (struct lradix4_data *)_data;
+
+ if (lr->rnh != NULL)
+ rn_detachhead((void **)&lr->rnh);
+ if (lr->mem != NULL)
+ free(lr->mem, M_RTABLE);
+ free(lr, M_RTABLE);
+}
+
+static enum flm_op_result
+lradix4_add_route_cb(struct rtentry *rt, void *_data)
+{
+ struct lradix4_data *lr = (struct lradix4_data *)_data;
+ struct radix4_addr_entry *ae;
+ struct sockaddr_in *rt_dst, *rt_mask, mask;
+ struct radix_node *rn;
+
+ if (fib_get_nhop_idx(lr->fd, rt->rt_nhop) == 0)
+ return (FLM_REBUILD);
+
+ if (lr->num_items >= lr->alloc_items)
+ return (FLM_REBUILD);
+
+ ae = (struct radix4_addr_entry *)((char *)lr->mem + lr->num_items * LRADIX4_ITEM_SZ);
+ lr->num_items++;
+
+ ae->nhop = rt->rt_nhop;
+
+ rt_dst = (struct sockaddr_in *)rt_key(rt);
+ rt_mask = (struct sockaddr_in *)rt_mask(rt);
+
+ ae->addr.sin_len = KEY_LEN_INET;
+ ae->addr.sin_addr = rt_dst->sin_addr;
+
+ if (rt_mask != NULL) {
+ bzero(&mask, sizeof(mask));
+ mask.sin_len = KEY_LEN_INET;
+ mask.sin_addr = rt_mask->sin_addr;
+ rt_mask = &mask;
+ }
+
+ rn = lr->rnh->rnh_addaddr((struct sockaddr *)&ae->addr,
+ (struct sockaddr *)rt_mask, &lr->rnh->rh, ae->rn);
+ if (rn == NULL)
+ return (FLM_REBUILD);
+
+ return (FLM_SUCCESS);
+}
+
+static enum flm_op_result
+lradix4_end_dump(void *_data, struct fib_dp *dp)
+{
+ struct lradix4_data *lr = (struct lradix4_data *)_data;
+
+ dp->f = lradix4_lookup;
+ dp->arg = lr->rnh;
+
+ return (FLM_SUCCESS);
+}
+
+static enum flm_op_result
+lradix4_change_cb(struct rib_head *rnh, struct rib_cmd_info *rc,
+ void *_data)
+{
+
+ return (FLM_REBUILD);
+}
+
+struct fib_lookup_module flm_radix4_lockless = {
+ .flm_name = "radix4_lockless",
+ .flm_family = AF_INET,
+ .flm_init_cb = lradix4_init,
+ .flm_destroy_cb = lradix4_destroy,
+ .flm_dump_rib_item_cb = lradix4_add_route_cb,
+ .flm_dump_end_cb = lradix4_end_dump,
+ .flm_change_rib_item_cb = lradix4_change_cb,
+ .flm_get_pref = lradix4_get_pref,
+};
+
+
+struct radix4_data {
+ struct fib_data *fd;
+ struct rib_head *rh;
+};
+
+static struct nhop_object *
+radix4_lookup(void *algo_data, const struct flm_lookup_key key, uint32_t scopeid)
+{
+ RIB_RLOCK_TRACKER;
+ struct rib_head *rh = (struct rib_head *)algo_data;
+ struct radix_node *rn;
+ struct nhop_object *nh;
+
+ /* Prepare lookup key */
+ struct sockaddr_in sin4 = {
+ .sin_family = AF_INET,
+ .sin_len = sizeof(struct sockaddr_in),
+ .sin_addr = key.addr4,
+ };
+
+ nh = NULL;
+ RIB_RLOCK(rh);
+ rn = rh->rnh_matchaddr((void *)&sin4, &rh->head);
+ if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0))
+ nh = (RNTORT(rn))->rt_nhop;
+ RIB_RUNLOCK(rh);
+
+ return (nh);
+}
+
+static uint8_t
+radix4_get_pref(const struct rib_rtable_info *rinfo)
+{
+
+ return (50);
+}
+
+static enum flm_op_result
+radix4_init(uint32_t fibnum, struct fib_data *fd, void *_old_data, void **_data)
+{
+ struct radix4_data *r4;
+
+ r4 = malloc(sizeof(struct radix4_data), M_RTABLE, M_NOWAIT | M_ZERO);
+ if (r4 == NULL)
+ return (FLM_REBUILD);
+ r4->fd = fd;
+ r4->rh = fib_get_rh(fd);
+ if (r4->rh == NULL)
+ return (FLM_ERROR);
+
+ *_data = r4;
+
+ return (FLM_SUCCESS);
+}
+
+static void
+radix4_destroy(void *_data)
+{
+
+ free(_data, M_RTABLE);
+}
+
+static enum flm_op_result
+radix4_end_dump(void *_data, struct fib_dp *dp)
+{
+ struct radix4_data *r4 = (struct radix4_data *)_data;
+
+ dp->f = radix4_lookup;
+ dp->arg = r4->rh;
+
+ return (FLM_SUCCESS);
+}
+
+static enum flm_op_result
+radix4_change_cb(struct rib_head *rnh, struct rib_cmd_info *rc,
+ void *_data)
+{
+ struct radix4_data *r4 = (struct radix4_data *)_data;
+
+ /*
+ * Grab additional reference for each nexthop to maintain guarantee
+ * that we have non-zero # of reference for each nexthop in radix in
+ * the epoch.
+ */
+ if (rc->rc_nh_new != NULL) {
+ if (fib_get_nhop_idx(r4->fd, rc->rc_nh_new) == 0)
+ return (FLM_REBUILD);
+ }
+ if (rc->rc_nh_old != NULL)
+ fib_free_nhop(r4->fd, rc->rc_nh_old);
+
+ return (FLM_SUCCESS);
+}
+
+struct fib_lookup_module flm_radix4 = {
+ .flm_name = "radix4",
+ .flm_family = AF_INET,
+ .flm_init_cb = radix4_init,
+ .flm_destroy_cb = radix4_destroy,
+ .flm_dump_end_cb = radix4_end_dump,
+ .flm_change_rib_item_cb = radix4_change_cb,
+ .flm_get_pref = radix4_get_pref,
+};
+
+static void
+fib4_algo_init(void)
+{
+
+ fib_module_register(&flm_radix4_lockless);
+ fib_module_register(&flm_radix4);
+}
+SYSINIT(fib4_algo_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, fib4_algo_init, NULL);
Index: sys/netinet6/in6_fib.c
===================================================================
--- sys/netinet6/in6_fib.c
+++ sys/netinet6/in6_fib.c
@@ -50,6 +50,7 @@
#include <net/route.h>
#include <net/route/route_ctl.h>
#include <net/route/route_var.h>
+#include <net/route/route_algo.h>
#include <net/route/nhop.h>
#include <net/toeplitz.h>
#include <net/vnet.h>
@@ -69,6 +70,10 @@
CHK_STRUCT_ROUTE_COMPAT(struct route_in6, ro_dst);
+#ifdef ROUTE_ALGO
+VNET_DEFINE(struct fib_dp *, inet6_dp);
+#endif
+
#ifdef ROUTE_MPATH
struct _hash_5tuple_ipv6 {
struct in6_addr src;
@@ -81,6 +86,7 @@
_Static_assert(sizeof(struct _hash_5tuple_ipv6) == 40,
"_hash_5tuple_ipv6 size is wrong");
+
uint32_t
fib6_calc_software_hash(const struct in6_addr *src, const struct in6_addr *dst,
unsigned short src_port, unsigned short dst_port, char proto,
@@ -111,6 +117,29 @@
* one needs to pass NHR_REF as a flag. This will return referenced
* nexthop.
*/
+#ifdef ROUTE_ALGO
+struct nhop_object *
+fib6_lookup(uint32_t fibnum, const struct in6_addr *dst6,
+ uint32_t scopeid, uint32_t flags, uint32_t flowid)
+{
+ struct nhop_object *nh;
+ struct fib_dp *dp = &V_inet6_dp[fibnum];
+ struct flm_lookup_key key = {.addr6 = dst6 };
+
+ nh = dp->f(dp->arg, key, scopeid);
+ if (nh != NULL) {
+ nh = nhop_select(nh, flowid);
+ /* Ensure route & ifp is UP */
+ if (RT_LINK_IS_UP(nh->nh_ifp)) {
+ if (flags & NHR_REF)
+ nhop_ref_object(nh);
+ return (nh);
+ }
+ }
+ RTSTAT_INC(rts_unreach);
+ return (NULL);
+}
+#else
struct nhop_object *
fib6_lookup(uint32_t fibnum, const struct in6_addr *dst6,
uint32_t scopeid, uint32_t flags, uint32_t flowid)
@@ -151,6 +180,7 @@
RTSTAT_INC(rts_unreach);
return (NULL);
}
+#endif
inline static int
check_urpf_nhop(const struct nhop_object *nh, uint32_t flags,
@@ -237,8 +267,14 @@
uint32_t scopeid, uint32_t flags, const struct ifnet *src_if)
{
struct nhop_object *nh;
+#ifdef ROUTE_ALGO
+ struct fib_dp *dp = &V_inet6_dp[fibnum];
+ struct flm_lookup_key key = {.addr6 = dst6 };
+ nh = dp->f(dp->arg, key, scopeid);
+#else
 nh = lookup_nhop(fibnum, dst6, scopeid);
+#endif
if (nh != NULL)
return (check_urpf(nh, flags, src_if));
return (0);
Index: sys/netinet6/in6_fib_algo.c
===================================================================
--- /dev/null
+++ sys/netinet6/in6_fib_algo.c
@@ -0,0 +1,340 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2020 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_inet6.h"
+
+#include <sys/param.h>
+#include <sys/eventhandler.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/rmlock.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/module.h>
+#include <sys/kernel.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/sysctl.h>
+#include <net/vnet.h>
+
+#include <net/if.h>
+#include <net/if_var.h>
+
+#include <netinet/in.h>
+#include <netinet/in_var.h>
+#include <netinet/ip.h>
+#include <netinet/ip_var.h>
+#include <netinet/ip6.h>
+#include <netinet6/ip6_var.h>
+
+#include <net/route.h>
+#include <net/route/nhop.h>
+#include <net/route/route_ctl.h>
+#include <net/route/route_var.h>
+#include <net/route/route_algo.h>
+#define RTDEBUG
+
+#define KEY_LEN_INET6 (offsetof(struct sa_in6, sin6_addr) + sizeof(struct in6_addr))
+#define OFF_LEN_INET6 (8 * offsetof(struct sa_in6, sin6_addr))
+struct sa_in6 {
+ uint8_t sin6_len;
+ uint8_t sin6_family;
+ uint8_t pad[2];
+ struct in6_addr sin6_addr;
+};
+struct radix6_addr_entry {
+ struct radix_node rn[2];
+ struct sa_in6 addr;
+ struct nhop_object *nhop;
+};
+#define LRADIX6_ITEM_SZ roundup2(sizeof(struct radix6_addr_entry), 64)
+
+struct lradix6_data {
+ struct radix_node_head *rnh;
+ struct fib_data *fd;
+ void *mem;
+ uint32_t alloc_items;
+ uint32_t num_items;
+};
+
+static struct nhop_object *
+lradix6_lookup(void *algo_data, const struct flm_lookup_key key, uint32_t scopeid)
+{
+ struct radix_node_head *rnh = (struct radix_node_head *)algo_data;
+ struct radix6_addr_entry *ent;
+ struct sa_in6 addr6 = {
+ .sin6_len = KEY_LEN_INET6,
+ .sin6_addr = *key.addr6,
+ };
+ if (IN6_IS_SCOPE_LINKLOCAL(key.addr6))
+ addr6.sin6_addr.s6_addr16[1] = htons(scopeid & 0xffff);
+ ent = (struct radix6_addr_entry *)(rnh->rnh_matchaddr(&addr6, &rnh->rh));
+ if (ent != NULL)
+ return (ent->nhop);
+ return (NULL);
+}
+
+static uint8_t
+lradix6_get_pref(const struct rib_rtable_info *rinfo)
+{
+
+ if (rinfo->num_prefixes < 10)
+ return (255);
+ else if (rinfo->num_prefixes < 100000)
+ return (255 - rinfo->num_prefixes / 394);
+ else
+ return (1);
+}
+
+/*
+ * Allocates the lockless-radix IPv6 instance: a radix head plus a
+ * preallocated slab of entries sized for current prefixes + 10%.
+ * Frees partial allocations itself on failure, since *_data is not
+ * published and the framework cannot reach them.
+ */
+static enum flm_op_result
+lradix6_init(uint32_t fibnum, struct fib_data *fd, void *_old_data, void **_data)
+{
+ struct lradix6_data *lr;
+ struct rib_rtable_info rinfo;
+ uint32_t count;
+
+ lr = malloc(sizeof(struct lradix6_data), M_RTABLE, M_NOWAIT | M_ZERO);
+ if (lr == NULL)
+ return (FLM_REBUILD);
+ if (!rn_inithead((void **)&lr->rnh, OFF_LEN_INET6)) {
+ free(lr, M_RTABLE);
+ return (FLM_REBUILD);
+ }
+ fib_get_rtable_info(fib_get_rh(fd), &rinfo);
+
+ /* Size for current prefixes plus 10% headroom */
+ count = rinfo.num_prefixes * 11 / 10;
+ // XXX: alignment!
+ lr->mem = malloc(count * LRADIX6_ITEM_SZ, M_RTABLE, M_NOWAIT | M_ZERO);
+ if (lr->mem == NULL) {
+ rn_detachhead((void **)&lr->rnh);
+ free(lr, M_RTABLE);
+ return (FLM_REBUILD);
+ }
+ lr->alloc_items = count;
+ lr->fd = fd;
+
+ *_data = lr;
+
+ return (FLM_SUCCESS);
+}
+
+static void
+lradix6_destroy(void *_data)
+{
+ struct lradix6_data *lr = (struct lradix6_data *)_data;
+
+ if (lr->rnh != NULL)
+ rn_detachhead((void **)&lr->rnh);
+ if (lr->mem != NULL)
+ free(lr->mem, M_RTABLE);
+ free(lr, M_RTABLE);
+}
+
+static enum flm_op_result
+lradix6_add_route_cb(struct rtentry *rt, void *_data)
+{
+ struct lradix6_data *lr = (struct lradix6_data *)_data;
+ struct radix6_addr_entry *ae;
+ struct sockaddr_in6 *rt_dst, *rt_mask;
+ struct sa_in6 mask;
+ struct radix_node *rn;
+
+ if (fib_get_nhop_idx(lr->fd, rt->rt_nhop) == 0)
+ return (FLM_REBUILD);
+
+ if (lr->num_items >= lr->alloc_items)
+ return (FLM_REBUILD);
+
+ ae = (struct radix6_addr_entry *)((char *)lr->mem + lr->num_items * LRADIX6_ITEM_SZ);
+ lr->num_items++;
+
+ ae->nhop = rt->rt_nhop;
+
+ rt_dst = (struct sockaddr_in6 *)rt_key(rt);
+ rt_mask = (struct sockaddr_in6 *)rt_mask(rt);
+
+ ae->addr.sin6_len = KEY_LEN_INET6;
+ ae->addr.sin6_addr = rt_dst->sin6_addr;
+
+ if (rt_mask != NULL) {
+ bzero(&mask, sizeof(mask));
+ mask.sin6_len = KEY_LEN_INET6;
+ mask.sin6_addr = rt_mask->sin6_addr;
+ rt_mask = (struct sockaddr_in6 *)&mask;
+ }
+
+ rn = lr->rnh->rnh_addaddr((struct sockaddr *)&ae->addr,
+ (struct sockaddr *)rt_mask, &lr->rnh->rh, ae->rn);
+ if (rn == NULL)
+ return (FLM_REBUILD);
+
+ return (FLM_SUCCESS);
+}
+
+static enum flm_op_result
+lradix6_end_dump(void *_data, struct fib_dp *dp)
+{
+ struct lradix6_data *lr = (struct lradix6_data *)_data;
+
+ dp->f = lradix6_lookup;
+ dp->arg = lr->rnh;
+
+ return (FLM_SUCCESS);
+}
+
+static enum flm_op_result
+lradix6_change_cb(struct rib_head *rnh, struct rib_cmd_info *rc,
+ void *_data)
+{
+
+ return (FLM_REBUILD);
+}
+
+struct fib_lookup_module flm_radix6_lockless = {
+ .flm_name = "radix6_lockless",
+ .flm_family = AF_INET6,
+ .flm_init_cb = lradix6_init,
+ .flm_destroy_cb = lradix6_destroy,
+ .flm_dump_rib_item_cb = lradix6_add_route_cb,
+ .flm_dump_end_cb = lradix6_end_dump,
+ .flm_change_rib_item_cb = lradix6_change_cb,
+ .flm_get_pref = lradix6_get_pref,
+};
+
+
+struct radix6_data {
+ struct fib_data *fd;
+ struct rib_head *rh;
+};
+
+static struct nhop_object *
+radix6_lookup(void *algo_data, const struct flm_lookup_key key, uint32_t scopeid)
+{
+ RIB_RLOCK_TRACKER;
+ struct rib_head *rh = (struct rib_head *)algo_data;
+ struct radix_node *rn;
+ struct nhop_object *nh;
+
+ /* Prepare lookup key */
+ struct sockaddr_in6 sin6 = {
+ .sin6_family = AF_INET6,
+ .sin6_len = sizeof(struct sockaddr_in6),
+ .sin6_addr = *key.addr6,
+ };
+ if (IN6_IS_SCOPE_LINKLOCAL(key.addr6))
+ sin6.sin6_addr.s6_addr16[1] = htons(scopeid & 0xffff);
+
+ nh = NULL;
+ RIB_RLOCK(rh);
+ rn = rh->rnh_matchaddr((void *)&sin6, &rh->head);
+ if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0))
+ nh = (RNTORT(rn))->rt_nhop;
+ RIB_RUNLOCK(rh);
+
+ return (nh);
+}
+
+static uint8_t
+radix6_get_pref(const struct rib_rtable_info *rinfo)
+{
+
+ return (50);
+}
+
+static enum flm_op_result
+radix6_init(uint32_t fibnum, struct fib_data *fd, void *_old_data, void **_data)
+{
+ struct radix6_data *r6;
+
+ r6 = malloc(sizeof(struct radix6_data), M_RTABLE, M_NOWAIT | M_ZERO);
+ if (r6 == NULL)
+ return (FLM_REBUILD);
+ r6->fd = fd;
+ r6->rh = fib_get_rh(fd);
+ if (r6->rh == NULL)
+ return (FLM_ERROR);
+
+ *_data = r6;
+
+ return (FLM_SUCCESS);
+}
+
+static void
+radix6_destroy(void *_data)
+{
+
+ free(_data, M_RTABLE);
+}
+
+static enum flm_op_result
+radix6_end_dump(void *_data, struct fib_dp *dp)
+{
+ struct radix6_data *r6 = (struct radix6_data *)_data;
+
+ dp->f = radix6_lookup;
+ dp->arg = r6->rh;
+
+ return (FLM_SUCCESS);
+}
+
+static enum flm_op_result
+radix6_change_cb(struct rib_head *rnh, struct rib_cmd_info *rc,
+ void *_data)
+{
+ struct radix6_data *r6 = (struct radix6_data *)_data;
+
+ /*
+ * Grab additional reference for each nexthop to maintain guarantee
+ * that we have non-zero # of reference for each nexthop in radix in
+ * the epoch.
+ */
+ if (rc->rc_nh_new != NULL) {
+ if (fib_get_nhop_idx(r6->fd, rc->rc_nh_new) == 0)
+ return (FLM_REBUILD);
+ }
+ if (rc->rc_nh_old != NULL)
+ fib_free_nhop(r6->fd, rc->rc_nh_old);
+
+ return (FLM_SUCCESS);
+}
+
+struct fib_lookup_module flm_radix6 = {
+ .flm_name = "radix6",
+ .flm_family = AF_INET6,
+ .flm_init_cb = radix6_init,
+ .flm_destroy_cb = radix6_destroy,
+ .flm_dump_end_cb = radix6_end_dump,
+ .flm_change_rib_item_cb = radix6_change_cb,
+ .flm_get_pref = radix6_get_pref,
+};
+
+static void
+fib6_algo_init(void)
+{
+
+ fib_module_register(&flm_radix6_lockless);
+ fib_module_register(&flm_radix6);
+}
+SYSINIT(fib6_algo_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, fib6_algo_init, NULL);

File Metadata

Mime Type
text/plain
Expires
Mon, Oct 27, 4:13 PM (34 m, 2 s)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
24302510
Default Alt Text
D27401.id80345.diff (60 KB)

Event Timeline