diff --git a/sys/net/route/fib_algo.c b/sys/net/route/fib_algo.c
index 171fa2cfb2dd..43db482d73da 100644
--- a/sys/net/route/fib_algo.c
+++ b/sys/net/route/fib_algo.c
@@ -1,2011 +1,2022 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2020 Alexander V. Chernikov
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_route.h"
 
 #include <sys/param.h>
 #include <sys/eventhandler.h>
 #include <sys/kernel.h>
 #include <sys/sbuf.h>
 #include <sys/lock.h>
 #include <sys/rmlock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/module.h>
 #include <sys/kernel.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/queue.h>
 #include <net/vnet.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 
 #include <netinet/in.h>
 #include <netinet/in_var.h>
 #include <netinet/ip.h>
 #include <netinet/ip_var.h>
 #ifdef INET6
 #include <netinet/ip6.h>
 #include <netinet6/ip6_var.h>
 #endif
 
 #include <net/route.h>
 #include <net/route/nhop.h>
 #include <net/route/route_ctl.h>
 #include <net/route/route_var.h>
 #include <net/route/fib_algo.h>
 
 #include <machine/stdarg.h>
 
 /*
  * Fib lookup framework.
  *
  * This framework enables accelerated longest-prefix-match lookups for the
  *  routing tables by adding the ability to dynamically attach/detach lookup
  *  algorithms implementation to/from the datapath.
  *
  * flm - fib lookup modules - implementation of particular lookup algorithm
  * fd - fib data - instance of an flm bound to specific routing table
  *
  * This file provides main framework functionality.
  *
  * The following are the features provided by the framework
  *
  * 1) nexthops abstraction -> provides transparent referencing, indexing
  *   and efficient idx->ptr mappings for nexthop and nexthop groups.
  * 2) Routing table synchronisation
  * 3) dataplane attachment points
  * 4) automatic algorithm selection based on the provided preference.
  *
  *
  * DATAPATH
  * For each supported address family, there is an allocated array of fib_dp
  *  structures, indexed by fib number. Each array entry contains callback function
  *  and its argument. This function will be called with a family-specific lookup key,
  *  scope and provided argument. This array gets re-created every time when new algo
  *  instance gets created. Please take a look at the replace_rtables_family() function
  *  for more details.
  *
  */
 
 SYSCTL_DECL(_net_route);
 SYSCTL_NODE(_net_route, OID_AUTO, algo, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "Fib algorithm lookups");
 
 /*
  * Algorithm sync policy.
  * Per-vnet knobs read by update_rebuild_delay() to decide how long a
  *  pending sync/rebuild may be postponed.
  */
 
 /*
  * Time interval to bucket updates, in milliseconds.
  * A value of 0 is replaced by 50 in update_rebuild_delay().
  */
 VNET_DEFINE(unsigned int, bucket_time_ms) = 50;
 #define	V_bucket_time_ms	VNET(bucket_time_ms)
 SYSCTL_UINT(_net_route_algo, OID_AUTO, bucket_time_ms, CTLFLAG_RW | CTLFLAG_VNET,
     &VNET_NAME(bucket_time_ms), 0, "Time interval to calculate update rate");
 
 /*
  * Minimum update rate to delay sync.
  * Expressed per 1000 ms; update_rebuild_delay() scales it to the bucket length.
  */
 VNET_DEFINE(unsigned int, bucket_change_threshold_rate) = 500;
 #define	V_bucket_change_threshold_rate	VNET(bucket_change_threshold_rate)
 SYSCTL_UINT(_net_route_algo, OID_AUTO, bucket_change_threshold_rate, CTLFLAG_RW | CTLFLAG_VNET,
     &VNET_NAME(bucket_change_threshold_rate), 0, "Minimum update rate to delay sync");
 
 /* Max allowed delay to sync, in milliseconds, counted from the diverge time */
 VNET_DEFINE(unsigned int, fib_max_sync_delay_ms) = 1000;
 #define	V_fib_max_sync_delay_ms	VNET(fib_max_sync_delay_ms)
 SYSCTL_UINT(_net_route_algo, OID_AUTO, fib_max_sync_delay_ms, CTLFLAG_RW | CTLFLAG_VNET,
     &VNET_NAME(fib_max_sync_delay_ms), 0, "Maximum time to delay sync (ms)");
 
 
 #ifdef INET6
 VNET_DEFINE_STATIC(bool, algo_fixed_inet6) = false;
 #define	V_algo_fixed_inet6	VNET(algo_fixed_inet6)
 SYSCTL_NODE(_net_route_algo, OID_AUTO, inet6, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "IPv6 longest prefix match lookups");
 #endif
 #ifdef INET
 VNET_DEFINE_STATIC(bool, algo_fixed_inet) = false;
 #define	V_algo_fixed_inet	VNET(algo_fixed_inet)
 SYSCTL_NODE(_net_route_algo, OID_AUTO, inet, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "IPv4 longest prefix match lookups");
 #endif
 
 /* Fib instance counter */
 static uint32_t fib_gen = 0;
 
 /*
  * Table of per-index nexthop reference counters.
  * The counters are stored right after the header; the table is allocated
  *  as sizeof(struct nhop_ref_table) plus one 32-bit slot per nexthop index
  *  (see try_setup_fd_instance()).
  */
 struct nhop_ref_table {
 	uint32_t		count;		/* reported as the nhop count when indexes run out */
 	int32_t			refcnt[];	/* C99 flexible array member */
 };
 
 /* Action stored in fd_callout_action whenever the instance callout is armed */
 enum fib_callout_action {
 	FDA_NONE,	/* No callout scheduled */
 	FDA_REBUILD,	/* Asks to rebuild algo instance */
 	FDA_EVAL,	/* Asks to evaluate if the current algo is still the best */
 	FDA_BATCH,	/* Asks to submit batch of updates to the algo */
 };
 
 /*
  * Bookkeeping used to pace syncs while the algo state lags behind the rib.
  * Time is split into buckets counted from diverge_time; the bucket length
  *  is taken from V_bucket_time_ms (50 ms by default).
  */
 struct fib_sync_status {
 	struct timeval		diverge_time;	/* ts when diverged */
 	uint32_t		num_changes;	/* number of changes since sync */
 	uint32_t		bucket_changes;	/* num changes within the current bucket */
 	uint64_t		bucket_id;	/* bucket # since diverge_time */
 	struct fib_change_queue	fd_change_queue;/* list of scheduled entries */
 };
 
 /*
  * Data structure for the fib lookup instance tied to the particular rib.
  */
 struct fib_data {
 	uint32_t		number_nhops;	/* current # of nhops */
 	uint8_t			hit_nhops;	/* true if out of nhop limit */
 	uint8_t			init_done;	/* true if init is completed */
 	uint32_t		fd_dead:1;	/* Scheduled for deletion */
 	uint32_t		fd_linked:1;	/* true if linked */
 	uint32_t		fd_need_rebuild:1;	/* true if rebuild scheduled */
 	uint32_t		fd_batch:1;	/* true if batched notification scheduled */
 	uint8_t			fd_family;	/* family */
 	uint32_t		fd_fibnum;	/* fibnum */
 	uint32_t		fd_failed_rebuilds;	/* stat: failed rebuilds */
 	uint32_t		fd_gen;		/* instance gen# */
 	struct callout		fd_callout;	/* rebuild callout */
 	enum fib_callout_action	fd_callout_action;	/* Callout action to take */
 	void			*fd_algo_data;	/* algorithm data */
 	struct nhop_object	**nh_idx;	/* nhop idx->ptr array */
 	struct nhop_ref_table	*nh_ref_table;	/* array with # of nhop references */
 	struct rib_head		*fd_rh;		/* RIB table we're attached to */
 	struct rib_subscription	*fd_rs;		/* storing table subscription */
 	struct fib_dp		fd_dp;		/* fib datapath data */
 	struct vnet		*fd_vnet;	/* vnet fib belongs to */
 	struct epoch_context	fd_epoch_ctx;	/* epoch context for deletion */
 	struct fib_lookup_module	*fd_flm;/* pointer to the lookup module */
 	struct fib_sync_status	fd_ss;		/* State relevant to the rib sync  */
 	uint32_t		fd_num_changes;	/* number of changes since last callout */
 	TAILQ_ENTRY(fib_data)	entries;	/* list of all fds in vnet */
 };
 
 static bool rebuild_fd(struct fib_data *fd, const char *reason);
 static bool rebuild_fd_flm(struct fib_data *fd, struct fib_lookup_module *flm_new);
 static void handle_fd_callout(void *_data);
 static void destroy_fd_instance_epoch(epoch_context_t ctx);
 static bool is_idx_free(struct fib_data *fd, uint32_t index);
 static void set_algo_fixed(struct rib_head *rh);
 static bool is_algo_fixed(struct rib_head *rh);
 
 static uint32_t fib_ref_nhop(struct fib_data *fd, struct nhop_object *nh);
 static void fib_unref_nhop(struct fib_data *fd, struct nhop_object *nh);
 
 static struct fib_lookup_module *fib_check_best_algo(struct rib_head *rh,
     struct fib_lookup_module *orig_flm);
 static void fib_unref_algo(struct fib_lookup_module *flm);
 static bool flm_error_check(const struct fib_lookup_module *flm, uint32_t fibnum);
 
 struct mtx fib_mtx;
 #define	FIB_MOD_LOCK()		mtx_lock(&fib_mtx)
 #define	FIB_MOD_UNLOCK()	mtx_unlock(&fib_mtx)
 #define	FIB_MOD_LOCK_ASSERT()	mtx_assert(&fib_mtx, MA_OWNED)
 
 MTX_SYSINIT(fib_mtx, &fib_mtx, "algo list mutex", MTX_DEF);
 
 /* Algorithm has to be this percent better than the current to switch */
 #define	BEST_DIFF_PERCENT	(5 * 256 / 100)
 /* Schedule algo re-evaluation X seconds after a change */
 #define	ALGO_EVAL_DELAY_MS	30000
 /* Force algo re-evaluation after X changes */
 #define	ALGO_EVAL_NUM_ROUTES	100
 /* Try to setup algorithm X times */
 #define	FIB_MAX_TRIES		32
 /* Max amount of supported nexthops */
 #define	FIB_MAX_NHOPS		262144
 #define	FIB_CALLOUT_DELAY_MS	50
 
 
 /* Debug */
 static int flm_debug_level = LOG_NOTICE;
 SYSCTL_INT(_net_route_algo, OID_AUTO, debug_level, CTLFLAG_RW | CTLFLAG_RWTUN,
     &flm_debug_level, 0, "debuglevel");
 #define	FLM_MAX_DEBUG_LEVEL	LOG_DEBUG
 #ifndef	LOG_DEBUG2
 #define	LOG_DEBUG2	8
 #endif
 
 #define	_PASS_MSG(_l)	(flm_debug_level >= (_l))
 #define	ALGO_PRINTF(_l, _fmt, ...)	if (_PASS_MSG(_l)) {		\
 	printf("[fib_algo] %s: " _fmt "\n", __func__, ##__VA_ARGS__);	\
 }
 #define	_ALGO_PRINTF(_fib, _fam, _aname, _gen, _func, _fmt, ...) \
     printf("[fib_algo] %s.%u (%s#%u) %s: " _fmt "\n",\
     print_family(_fam), _fib, _aname, _gen, _func, ## __VA_ARGS__)
 #define	_RH_PRINTF(_fib, _fam, _func, _fmt, ...) \
     printf("[fib_algo] %s.%u %s: " _fmt "\n", print_family(_fam), _fib, _func, ## __VA_ARGS__)
 #define	RH_PRINTF(_l, _rh, _fmt, ...)	if (_PASS_MSG(_l)) {	\
     _RH_PRINTF(_rh->rib_fibnum, _rh->rib_family, __func__, _fmt, ## __VA_ARGS__);\
 }
 #define	FD_PRINTF(_l, _fd, _fmt, ...)	FD_PRINTF_##_l(_l, _fd, _fmt, ## __VA_ARGS__)
 #define	_FD_PRINTF(_l, _fd, _fmt, ...)	if (_PASS_MSG(_l)) {		\
     _ALGO_PRINTF(_fd->fd_fibnum, _fd->fd_family, _fd->fd_flm->flm_name,	\
     _fd->fd_gen, __func__, _fmt, ## __VA_ARGS__);			\
 }
 /*
  * Per-level back-ends selected by FD_PRINTF() via token pasting.
  * Levels above FLM_MAX_DEBUG_LEVEL compile to nothing. The empty variants
  *  must accept the same argument list as _FD_PRINTF, otherwise every
  *  FD_PRINTF() call at a disabled level fails to compile.
  */
 #if FLM_MAX_DEBUG_LEVEL>=LOG_DEBUG2
 #define	FD_PRINTF_LOG_DEBUG2	_FD_PRINTF
 #else
 #define	FD_PRINTF_LOG_DEBUG2(_l, _fd, _fmt, ...)
 #endif
 #if FLM_MAX_DEBUG_LEVEL>=LOG_DEBUG
 #define	FD_PRINTF_LOG_DEBUG	_FD_PRINTF
 #else
 #define	FD_PRINTF_LOG_DEBUG(_l, _fd, _fmt, ...)
 #endif
 #if FLM_MAX_DEBUG_LEVEL>=LOG_INFO
 #define	FD_PRINTF_LOG_INFO	_FD_PRINTF
 #else
 #define	FD_PRINTF_LOG_INFO(_l, _fd, _fmt, ...)
 #endif
 /* The levels below are always compiled in */
 #define	FD_PRINTF_LOG_NOTICE	_FD_PRINTF
 #define	FD_PRINTF_LOG_ERR	_FD_PRINTF
 #define	FD_PRINTF_LOG_WARNING	_FD_PRINTF
 
 
 /* List of all registered lookup algorithms */
 static TAILQ_HEAD(, fib_lookup_module) all_algo_list = TAILQ_HEAD_INITIALIZER(all_algo_list);
 
 /* List of all fib lookup instances in the vnet */
 VNET_DEFINE_STATIC(TAILQ_HEAD(fib_data_head, fib_data), fib_data_list);
 #define	V_fib_data_list	VNET(fib_data_list)
 
 /* Datastructure for storing non-transient fib lookup module failures */
 struct fib_error {
 	int				fe_family;
 	uint32_t			fe_fibnum;	/* failed rtable */
 	struct fib_lookup_module	*fe_flm;	/* failed module */
 	TAILQ_ENTRY(fib_error)		entries;/* list of all errored entries */
 };
 VNET_DEFINE_STATIC(TAILQ_HEAD(fib_error_head, fib_error), fib_error_list);
 #define	V_fib_error_list VNET(fib_error_list)
 
 /*
  * Per-family array of fibnum -> {func, arg} mappings used in datapath.
  * The fib_dp entries follow the header in the same allocation.
  */
 struct fib_dp_header {
 	struct epoch_context	fdh_epoch_ctx;
 	uint32_t		fdh_num_tables;
 	struct fib_dp		fdh_idx[];	/* C99 flexible array member */
 };
 
 /*
  * Tries to add new non-transient algorithm error to the list of
  *  errors.
  * Returns true on success.
  */
 static bool
 flm_error_add(struct fib_lookup_module *flm, uint32_t fibnum)
 {
 	struct fib_error *new_fe;
 	bool is_dup;
 
 	/* Prepare the record before taking the module lock */
 	new_fe = malloc(sizeof(struct fib_error), M_TEMP, M_NOWAIT | M_ZERO);
 	if (new_fe == NULL)
 		return (false);
 	new_fe->fe_family = flm->flm_family;
 	new_fe->fe_fibnum = fibnum;
 	new_fe->fe_flm = flm;
 
 	/* Link the record only if no equal one is listed yet */
 	FIB_MOD_LOCK();
 	is_dup = flm_error_check(flm, fibnum);
 	if (!is_dup)
 		TAILQ_INSERT_HEAD(&V_fib_error_list, new_fe, entries);
 	FIB_MOD_UNLOCK();
 
 	/* Duplicate: the freshly built record is not needed */
 	if (is_dup)
 		free(new_fe, M_TEMP);
 
 	return (true);
 }
 
 /*
  * True if non-transient error has been registered for @flm in @fibnum.
  */
 static bool
 flm_error_check(const struct fib_lookup_module *flm, uint32_t fibnum)
 {
 	const struct fib_error *cur;
 	bool found = false;
 
 	/* A record matches on both the module pointer and the fib number */
 	TAILQ_FOREACH(cur, &V_fib_error_list, entries) {
 		if (cur->fe_flm != flm || cur->fe_fibnum != fibnum)
 			continue;
 		found = true;
 		break;
 	}
 
 	return (found);
 }
 
 /*
  * Clear all errors of algo specified by @flm.
  */
 static void
 fib_error_clear_flm(struct fib_lookup_module *flm)
 {
 	struct fib_error *cur, *next;
 
 	FIB_MOD_LOCK_ASSERT();
 
 	/* Drop only the records that belong to @flm */
 	TAILQ_FOREACH_SAFE(cur, &V_fib_error_list, entries, next) {
 		if (cur->fe_flm != flm)
 			continue;
 		TAILQ_REMOVE(&V_fib_error_list, cur, entries);
 		free(cur, M_TEMP);
 	}
 }
 
 /*
  * Clears all errors in current VNET.
  */
 static void
 fib_error_clear(void)
 {
 	struct fib_error *cur, *next;
 
 	FIB_MOD_LOCK_ASSERT();
 
 	/* Unlink and release every record on the per-vnet error list */
 	TAILQ_FOREACH_SAFE(cur, &V_fib_error_list, entries, next) {
 		TAILQ_REMOVE(&V_fib_error_list, cur, entries);
 		free(cur, M_TEMP);
 	}
 }
 
 /*
  * Returns a printable name for the algo callback result @result.
  * Values without a dedicated name are reported as "unknown".
  */
 static const char *
 print_op_result(enum flm_op_result result)
 {
 	const char *text = "unknown";
 
 	switch (result) {
 	case FLM_SUCCESS:
 		text = "success";
 		break;
 	case FLM_REBUILD:
 		text = "rebuild";
 		break;
 	case FLM_BATCH:
 		text = "batch";
 		break;
 	case FLM_ERROR:
 		text = "error";
 		break;
 	}
 
 	return (text);
 }
 
 /*
  * Returns a printable name for address family @family:
  *  "inet" for AF_INET, "inet6" for AF_INET6, "unknown" otherwise.
  */
 static const char *
 print_family(int family)
 {
 
 	switch (family) {
 	case AF_INET:
 		return ("inet");
 	case AF_INET6:
 		return ("inet6");
 	default:
 		return ("unknown");
 	}
 }
 
 /*
  * Debug function used by lookup algorithms.
  * Outputs message denoted by @fmt, prepended by "[fib_algo] inetX.Y (algo) "
  */
 void
 fib_printf(int level, struct fib_data *fd, const char *func, char *fmt, ...)
 {
 	/* Formatted message is limited to this buffer; longer output is truncated */
 	char buf[128];
 	va_list ap;
 
 	/* Messages above the configured debug level are dropped */
 	if (level > flm_debug_level)
 		return;
 
 	va_start(ap, fmt);
 	vsnprintf(buf, sizeof(buf), fmt, ap);
 	va_end(ap);
 
 	/* Pass the text as an argument so it is never used as a format string */
 	_ALGO_PRINTF(fd->fd_fibnum, fd->fd_family, fd->fd_flm->flm_name,
 	    fd->fd_gen, func, "%s", buf);
 }
 
 /*
  * Outputs list of algorithms supported by the provided address family.
  */
 static int
 print_algos_sysctl(struct sysctl_req *req, int family)
 {
 	struct fib_lookup_module *flm;
 	struct sbuf sbuf;
 	int error, count = 0;
 
 	error = sysctl_wire_old_buffer(req, 0);
 	if (error == 0) {
 		sbuf_new_for_sysctl(&sbuf, NULL, 512, req);
 		/*
 		 * Emit the names of all registered modules of @family,
 		 *  separated by ", ".
 		 * NOTE(review): the list is walked without FIB_MOD_LOCK here;
 		 *  confirm this is safe against concurrent (un)registration.
 		 */
 		TAILQ_FOREACH(flm, &all_algo_list, entries) {
 			if (flm->flm_family == family) {
 				if (count++ > 0)
 					sbuf_cat(&sbuf, ", ");
 				sbuf_cat(&sbuf, flm->flm_name);
 			}
 		}
 		error = sbuf_finish(&sbuf);
 		sbuf_delete(&sbuf);
 	}
 	/* Either the wiring error or the result of sbuf_finish() */
 	return (error);
 }
 
 #ifdef INET6
 /* Sysctl handler: reports the registered AF_INET6 lookup algorithms */
 static int
 print_algos_sysctl_inet6(SYSCTL_HANDLER_ARGS)
 {
 
 	return (print_algos_sysctl(req, AF_INET6));
 }
 SYSCTL_PROC(_net_route_algo_inet6, OID_AUTO, algo_list,
     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
     print_algos_sysctl_inet6, "A", "List of IPv6 lookup algorithms");
 #endif
 
 #ifdef INET
 /* Sysctl handler: reports the registered AF_INET lookup algorithms */
 static int
 print_algos_sysctl_inet(SYSCTL_HANDLER_ARGS)
 {
 
 	return (print_algos_sysctl(req, AF_INET));
 }
 SYSCTL_PROC(_net_route_algo_inet, OID_AUTO, algo_list,
     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
     print_algos_sysctl_inet, "A", "List of IPv4 lookup algorithms");
 #endif
 
 /*
  * Calculate delay between repeated failures.
  * Returns current delay in milliseconds.
  */
 static uint32_t
 callout_calc_delay_ms(struct fib_data *fd)
 {
 	/* The delay doubles with every failed rebuild, up to 2^10 times the base */
 	uint32_t nshift = (fd->fd_failed_rebuilds > 10) ? 10 : fd->fd_failed_rebuilds;
 
 	return ((1 << nshift) * FIB_CALLOUT_DELAY_MS);
 }
 
 /*
  * Arms the instance callout to run handle_fd_callout() in @delay_ms
  *  milliseconds and records @action as the action to take.
  * Uses callout_reset_sbt(), so an already pending callout is re-armed.
  */
 static void
 schedule_callout(struct fib_data *fd, enum fib_callout_action action, int delay_ms)
 {
 
 	FD_PRINTF(LOG_DEBUG, fd, "delay=%d action=%d", delay_ms, action);
 	fd->fd_callout_action = action;
 	callout_reset_sbt(&fd->fd_callout, SBT_1MS * delay_ms, 0,
 	    handle_fd_callout, fd, 0);
 }
 
 /*
  * Marks @fd as needing a rebuild and arms the rebuild callout.
  * Does nothing if a rebuild has already been requested.
  */
 static void
 schedule_fd_rebuild(struct fib_data *fd, const char *reason)
 {
 
 	RIB_WLOCK_ASSERT(fd->fd_rh);
 
 	/* Rebuild already requested: keep the existing schedule */
 	if (fd->fd_need_rebuild)
 		return;
 
 	fd->fd_need_rebuild = true;
 	/* Stop batch updates */
 	fd->fd_batch = false;
 
 	/*
 	 * The callout may already be armed (e.g. by schedule_algo_eval);
 	 *  in that case it gets re-scheduled for the rebuild.
 	 */
 	FD_PRINTF(LOG_INFO, fd, "Scheduling rebuild: %s (failures=%d)",
 	    reason, fd->fd_failed_rebuilds);
 	schedule_callout(fd, FDA_REBUILD, callout_calc_delay_ms(fd));
 }
 
+static void
+sync_rib_gen(struct fib_data *fd)
+{
+	FD_PRINTF(LOG_DEBUG, fd, "Sync gen %u -> %u", fd->fd_rh->rnh_gen, fd->fd_rh->rnh_gen_rib);
+	fd->fd_rh->rnh_gen = fd->fd_rh->rnh_gen_rib;	/* rnh_gen catches up with rnh_gen_rib; called after the algo accepted a change */
+}
 /*
  * Returns the time elapsed from @old_tv to @new_tv in milliseconds.
  * The result is negative when @new_tv precedes @old_tv; the sub-millisecond
  *  part of the microsecond difference is discarded.
  */
 static int64_t
 get_tv_diff_ms(const struct timeval *old_tv, const struct timeval *new_tv)
 {
 	int64_t sec_part_ms, usec_part_ms;
 
 	sec_part_ms = ((int64_t)(new_tv->tv_sec - old_tv->tv_sec)) * 1000;
 	usec_part_ms = (new_tv->tv_usec - old_tv->tv_usec) / 1000;
 
 	return (sec_part_ms + usec_part_ms);
 }
 
 /*
  * Advances @tv by @ms milliseconds, carrying microsecond overflow
  *  into the seconds field.
  */
 static void
 add_tv_diff_ms(struct timeval *tv, int ms)
 {
 	tv->tv_sec += ms / 1000;
 	tv->tv_usec += (ms % 1000) * 1000;
 
 	/* At most one second can spill over from the sub-second part */
 	if (tv->tv_usec >= 1000000) {
 		tv->tv_sec += 1;
 		tv->tv_usec -= 1000000;
 	}
 }
 
 /*
  * Marks the time when algo state diverges from the rib state.
  */
 static void
 mark_diverge_time(struct fib_data *fd)
 {
 
 	/* Record "now" (uptime) and restart bucket accounting from bucket 0 */
 	getmicrouptime(&fd->fd_ss.diverge_time);
 	fd->fd_ss.bucket_changes = 0;
 	fd->fd_ss.bucket_id = 0;
 }
 
 /*
  * Calculates and updates the next algorithm sync time, based on the current activity.
  *
  * The intent is to provide reasonable balance between the update
  *  latency and efficient batching when changing large amount of routes.
  *
  * High-level algorithm looks the following:
  * 1) all changes are bucketed in 50ms intervals
  * 2) If amount of changes within the bucket is greater than the threshold,
  *   the update gets delayed, up to maximum delay threshold.
  */
 static void
 update_rebuild_delay(struct fib_data *fd, enum fib_callout_action action)
 {
 	uint32_t bucket_id, new_delay = 0;
 	struct timeval tv;
 
 	/* Fetch all variables at once to ensure consistent reads */
 	uint32_t bucket_time_ms = V_bucket_time_ms;
 	uint32_t threshold_rate = V_bucket_change_threshold_rate;
 	uint32_t max_delay_ms = V_fib_max_sync_delay_ms;
 
 	/* Bucket length is used as a divisor below; fall back to 50 ms on 0 */
 	if (bucket_time_ms == 0)
 		bucket_time_ms = 50;
 	/*
 	 * calculate per-bucket threshold rate
 	 * NOTE(review): the multiplication is done in 32 bits, so very large
 	 *  sysctl values can wrap before the division.
 	 */
 	threshold_rate = threshold_rate * bucket_time_ms / 1000;
 
 	getmicrouptime(&tv);
 
 	struct fib_sync_status *fd_ss = &fd->fd_ss;
 
 	/* Index of the bucket "now" falls into, counted from diverge_time */
 	bucket_id = get_tv_diff_ms(&fd_ss->diverge_time, &tv) / bucket_time_ms;
 
 	if (fd_ss->bucket_id == bucket_id) {
 		/* Another change within the bucket of the previous change */
 		fd_ss->bucket_changes++;
 		if (fd_ss->bucket_changes == threshold_rate) {
 			/*
 			 * Threshold hit: move the deadline to the end of the
 			 *  following bucket (offset from diverge_time), unless
 			 *  that exceeds the maximum allowed delay.
 			 */
 			new_delay = (bucket_id + 2) * bucket_time_ms;
 			if (new_delay <= max_delay_ms) {
 				FD_PRINTF(LOG_DEBUG, fd,
 				    "hit threshold of %u routes, delay update,"
 				    "bucket: %u, total delay: %u",
 				    threshold_rate, bucket_id + 1, new_delay);
 			} else {
 				/* Leave the already scheduled callout untouched */
 				new_delay = 0;
 				FD_PRINTF(LOG_DEBUG, fd,
 				    "maximum sync delay (%u ms) reached", max_delay_ms);
 			}
 		} else if ((bucket_id == 0) && (fd_ss->bucket_changes == 1))
 			/* First change after divergence: sync at the end of bucket 0 */
 			new_delay = bucket_time_ms;
 	} else {
 		/* Entered a new bucket: restart the per-bucket counter */
 		fd_ss->bucket_id = bucket_id;
 		fd_ss->bucket_changes = 1;
 	}
 
 	if (new_delay > 0) {
 		/* Calculated time has been updated */
 		struct timeval new_tv = fd_ss->diverge_time;
 		add_tv_diff_ms(&new_tv, new_delay);
 
 		/* Convert the absolute deadline into a delay relative to now */
 		int32_t delay_ms = get_tv_diff_ms(&tv, &new_tv);
 		schedule_callout(fd, action, delay_ms);
 	}
 }
 
 /*
  * Called for every rtable change delivered to @fd.
  * Either refreshes the delay of a pending batch/rebuild callout or
  *  drives the scheduling of the algo re-evaluation callout.
  */
 static void
 update_algo_state(struct fib_data *fd)
 {
 
 	RIB_WLOCK_ASSERT(fd->fd_rh);
 
 	/* A batch or rebuild is pending: only its delay may need an update */
 	if (fd->fd_batch || fd->fd_need_rebuild) {
 		enum fib_callout_action action = fd->fd_need_rebuild ? FDA_REBUILD : FDA_BATCH;
 		update_rebuild_delay(fd, action);
 		return;
 	}
 
 	/*
 	 * The counter is post-incremented: the first branch sees the value
 	 *  before this change, the second one the value including it.
 	 */
 	if (fd->fd_num_changes++ == 0) {
 		/* Start callout to consider switch */
 		if (!callout_pending(&fd->fd_callout))
 			schedule_callout(fd, FDA_EVAL, ALGO_EVAL_DELAY_MS);
 	} else if (fd->fd_num_changes == ALGO_EVAL_NUM_ROUTES) {
 		/* Reset callout to exec immediately */
 		if (fd->fd_callout_action == FDA_EVAL)
 			schedule_callout(fd, FDA_EVAL, 1);
 	}
 }
 
 /*
  * Returns true if the change described by @rc has to be applied right away:
  *  an RTM_ADD or RTM_DELETE whose nexthop is neither a nexthop group
  *  nor a gateway nexthop. All other changes return false.
  */
 static bool
 need_immediate_sync(struct fib_data *fd, struct rib_cmd_info *rc)
 {
 	struct nhop_object *nh;
 
 	/* Sync addition/removal of interface routes */
 	switch (rc->rc_cmd) {
 	case RTM_ADD:
 		nh = rc->rc_nh_new;
 		break;
 	case RTM_DELETE:
 		nh = rc->rc_nh_old;
 		break;
 	default:
 		return (false);
 	}
 
 	return (!NH_IS_NHGRP(nh) && (nh->nh_flags & NHF_GATEWAY) == 0);
 }
 
 /*
  * Submits all queued changes to the algo in one call.
  * On success the old nexthops of the queued entries are unreferenced and
  *  the queue is emptied; on failure the queue is left as is.
  * Batch mode is switched off in both cases.
  *
  * Returns true if the algo reported FLM_SUCCESS.
  */
 static bool
 apply_rtable_changes(struct fib_data *fd)
 {
 	enum flm_op_result result;
 	struct fib_change_queue *q = &fd->fd_ss.fd_change_queue;
 
 	result = fd->fd_flm->flm_change_rib_items_cb(fd->fd_rh, q, fd->fd_algo_data);
 
 	if (result == FLM_SUCCESS) {
+		sync_rib_gen(fd);
 		/* The replaced nexthops are no longer needed by the algo */
 		for (int i = 0; i < q->count; i++)
 			if (q->entries[i].nh_old)
 				fib_unref_nhop(fd, q->entries[i].nh_old);
 		q->count = 0;
 	}
 	fd->fd_batch = false;
 
 	return (result == FLM_SUCCESS);
 }
 
 /*
  * Fills queue entry @ce from the rtable change @rc: prefix, prefix length,
  *  scope id and old/new nexthops. The new nexthop, if any, gets referenced.
  *
  * Returns false if the new nexthop could not be referenced.
  */
 static bool
 fill_change_entry(struct fib_data *fd, struct fib_change_entry *ce, struct rib_cmd_info *rc)
 {
 	int prefix_len = 0;
 
 	/* Other families leave the address untouched and the length at 0 */
 	if (fd->fd_family == AF_INET)
 		rt_get_inet_prefix_plen(rc->rc_rt, &ce->addr4, &prefix_len, &ce->scopeid);
 	else if (fd->fd_family == AF_INET6)
 		rt_get_inet6_prefix_plen(rc->rc_rt, &ce->addr6, &prefix_len, &ce->scopeid);
 
 	ce->plen = prefix_len;
 	ce->nh_old = rc->rc_nh_old;
 	ce->nh_new = rc->rc_nh_new;
 
 	/* fib_ref_nhop() returning 0 means no reference was obtained */
 	if (ce->nh_new != NULL && fib_ref_nhop(fd, ce->nh_new) == 0)
 		return (false);
 
 	return (true);
 }
 
 /*
  * Appends the rtable change @rc to the change queue of @fd, growing the
  *  queue when it is full (256 entries initially, doubled afterwards).
  *
  * Returns false if the queue could not be grown or the entry could
  *  not be filled.
  */
 static bool
 queue_rtable_change(struct fib_data *fd, struct rib_cmd_info *rc)
 {
 	struct fib_change_queue *cq = &fd->fd_ss.fd_change_queue;
 	struct fib_change_entry *slot;
 
 	if (cq->count >= cq->size) {
 		uint32_t new_size = (cq->size == 0) ? 256 : cq->size * 2;
 		size_t bytes = new_size * sizeof(struct fib_change_entry);
 		void *grown = realloc(cq->entries, bytes, M_TEMP, M_NOWAIT | M_ZERO);
 
 		/* On failure the existing queue stays valid and unchanged */
 		if (grown == NULL) {
 			FD_PRINTF(LOG_INFO, fd, "Unable to realloc queue for %u elements",
 			    new_size);
 			return (false);
 		}
 		cq->entries = grown;
 		cq->size = new_size;
 	}
 
 	/* The slot is counted as used even if filling it fails */
 	slot = &cq->entries[cq->count];
 	cq->count++;
 
 	return (fill_change_entry(fd, slot, rc));
 }
 
 /*
  * Rib subscription handler. Checks if the algorithm is ready to
  *  receive updates, handles nexthop refcounting and passes change
  *  data to the algorithm callback.
  */
 static void
 handle_rtable_change_cb(struct rib_head *rnh, struct rib_cmd_info *rc,
     void *_data)
 {
 	struct fib_data *fd = (struct fib_data *)_data;
 	enum flm_op_result result;
 
 	RIB_WLOCK_ASSERT(rnh);
 
 	/*
 	 * There is a small gap between subscribing for route changes
 	 *  and initiating rtable dump. Avoid receiving route changes
 	 *  prior to finishing rtable dump by checking `init_done`.
 	 */
 	if (!fd->init_done)
 		return;
 
 	/* Changes of this kind bypass delayed batch/rebuild handling below */
 	bool immediate_sync = need_immediate_sync(fd, rc);
 
 	/* Consider scheduling algorithm re-evaluation */
 	update_algo_state(fd);
 
 	/*
 	 * If algo requested rebuild, stop sending updates by default.
 	 * This simplifies nexthop refcount handling logic.
 	 */
 	if (fd->fd_need_rebuild) {
 		if (immediate_sync)
 			rebuild_fd(fd, "rtable change type enforced sync");
 		return;
 	}
 
 	/*
 	 * Algo requested updates to be delivered in batches.
 	 * Add the current change to the queue and return.
 	 */
 	if (fd->fd_batch) {
 		if (immediate_sync) {
 			/* Flush the whole queue now; rebuild right away on failure */
 			if (!queue_rtable_change(fd, rc) || !apply_rtable_changes(fd))
 				rebuild_fd(fd, "batch sync failed");
 		} else {
 			if (!queue_rtable_change(fd, rc))
 				schedule_fd_rebuild(fd, "batch queue failed");
 		}
 		return;
 	}
 
 	/*
 	 * Maintain guarantee that every nexthop returned by the dataplane
 	 *  lookup has > 0 refcount, so can be safely referenced within current
 	 *  epoch.
 	 */
 	if (rc->rc_nh_new != NULL) {
 		if (fib_ref_nhop(fd, rc->rc_nh_new) == 0) {
 			/* ran out of indexes */
 			schedule_fd_rebuild(fd, "ran out of nhop indexes");
 			return;
 		}
 	}
 
 	/* Deliver the single change to the algo */
 	result = fd->fd_flm->flm_change_rib_item_cb(rnh, rc, fd->fd_algo_data);
 
 	switch (result) {
 	case FLM_SUCCESS:
+		sync_rib_gen(fd);
 		/* Unref old nexthop on success */
 		if (rc->rc_nh_old != NULL)
 			fib_unref_nhop(fd, rc->rc_nh_old);
 		break;
 	case FLM_BATCH:
 
 		/*
 		 * Algo asks to batch the changes.
 		 */
 		if (queue_rtable_change(fd, rc)) {
 			if (!immediate_sync) {
 				/* Enter batch mode and arm the batch callout */
 				fd->fd_batch = true;
 				mark_diverge_time(fd);
 				update_rebuild_delay(fd, FDA_BATCH);
 				break;
 			}
 			if (apply_rtable_changes(fd))
 				break;
 		}
 		FD_PRINTF(LOG_ERR, fd, "batched sync failed, force the rebuild");
 
 		/* FALLTHROUGH */
 	case FLM_REBUILD:
 
 		/*
 		 * Algo is not able to apply the update.
 		 * Schedule algo rebuild.
 		 */
 		if (!immediate_sync) {
 			mark_diverge_time(fd);
 			schedule_fd_rebuild(fd, "algo requested rebuild");
 			break;
 		}
 
 		FD_PRINTF(LOG_INFO, fd, "running sync rebuild");
 		rebuild_fd(fd, "rtable change type enforced sync");
 		break;
 	case FLM_ERROR:
 
 		/*
 		 * Algo reported a non-recoverable error.
 		 * Record the error and schedule rebuild, which will
 		 *  trigger best algo selection.
 		 */
 		FD_PRINTF(LOG_ERR, fd, "algo reported non-recoverable error");
 		if (!flm_error_add(fd->fd_flm, fd->fd_fibnum))
 			FD_PRINTF(LOG_ERR, fd, "failed to ban algo");
 		schedule_fd_rebuild(fd, "algo reported non-recoverable error");
 	}
 }
 
 /*
  * Picks the size of the nexthop index table for the new instance @fd.
  * Without a previous instance the size is 16. Otherwise the previous size
  *  is inherited, doubled if the previous instance ran out of indexes
  *  while being below FIB_MAX_NHOPS.
  */
 static void
 estimate_nhop_scale(const struct fib_data *old_fd, struct fib_data *fd)
 {
 	uint32_t nhops;
 
 	if (old_fd == NULL) {
 		// TODO: read from rtable
 		fd->number_nhops = 16;
 		return;
 	}
 
 	nhops = old_fd->number_nhops;
 	if (old_fd->hit_nhops && nhops < FIB_MAX_NHOPS)
 		nhops = 2 * nhops;
 	fd->number_nhops = nhops;
 }
 
 /* State shared by the rtable dump callbacks (sync_algo_cb/sync_algo_end_cb) */
 struct walk_cbdata {
 	struct fib_data		*fd;		/* instance being populated */
 	flm_dump_t		*func;		/* per-route algo dump callback */
 	enum flm_op_result	result;		/* running result of the dump */
 };
 
 /*
  * Handler called after all rtentries have been dumped.
  * Performs post-dump framework checks and calls
  * algo:flm_dump_end_cb().
  *
  * Updates walk_cbdata result.
  */
 static void
 sync_algo_end_cb(struct rib_head *rnh, enum rib_walk_hook stage, void *_data)
 {
 	struct walk_cbdata *w = (struct walk_cbdata *)_data;
 	struct fib_data *fd = w->fd;
 
 	RIB_WLOCK_ASSERT(w->fd->fd_rh);
 
 	/* The rib is going away: fail the dump */
 	if (rnh->rib_dying) {
 		w->result = FLM_ERROR;
 		return;
 	}
 
 	/*
 	 * The nexthop index table overflowed during the dump.
 	 * Turn success into a rebuild request; other results are kept.
 	 */
 	if (fd->hit_nhops) {
 		FD_PRINTF(LOG_INFO, fd, "ran out of nexthops at %u nhops",
 		    fd->nh_ref_table->count);
 		if (w->result == FLM_SUCCESS)
 			w->result = FLM_REBUILD;
 		return;
 	}
 
 	/* Only a successful dump in the post-walk stage gets finalised */
 	if (stage != RIB_WALK_HOOK_POST || w->result != FLM_SUCCESS)
 		return;
 
 	/* Post-dump hook, dump successful */
 	w->result = fd->fd_flm->flm_dump_end_cb(fd->fd_algo_data, &fd->fd_dp);
 
 	if (w->result == FLM_SUCCESS) {
 		/* Mark init as done to allow routing updates */
 		fd->init_done = 1;
 	}
 }
 
 /*
  * Callback for each entry in rib.
  * Calls algo:flm_dump_rib_item_cb func as a part of initial
  *  route table synchronisation.
  */
 static int
 sync_algo_cb(struct rtentry *rt, void *_data)
 {
 	struct walk_cbdata *w = (struct walk_cbdata *)_data;
 	struct nhop_object *nh;
 
 	RIB_WLOCK_ASSERT(w->fd->fd_rh);
 
 	/* Nothing to do after an earlier failure or without a dump callback */
 	if (w->result != FLM_SUCCESS || !w->func)
 		return (0);
 
 	/*
 	 * Take a reference on the route nexthop first, so that every
 	 *  nexthop the datapath can return has > 0 references and can be
 	 *  safely referenced within current epoch.
 	 */
 	nh = rt_get_raw_nhop(rt);
 	if (fib_ref_nhop(w->fd, nh) == 0)
 		w->result = FLM_REBUILD;
 	else
 		w->result = w->func(rt, w->fd->fd_algo_data);
 
 	return (0);
 }
 
 /*
  * Dump all routing table state to the algo instance.
  */
 static enum flm_op_result
 sync_algo(struct fib_data *fd)
 {
 	/* Walk state: starts as success, updated by the dump callbacks */
 	struct walk_cbdata w = {
 		.fd = fd,
 		.func = fd->fd_flm->flm_dump_rib_item_cb,
 		.result = FLM_SUCCESS,
 	};
 
 	rib_walk_ext_locked(fd->fd_rh, sync_algo_cb, sync_algo_end_cb, &w);
 
 	/* rnh_gen is printed as unsigned, matching sync_rib_gen() */
 	FD_PRINTF(LOG_INFO, fd,
 	    "initial dump completed (rtable version: %u), result: %s",
 	    fd->fd_rh->rnh_gen, print_op_result(w.result));
 
 	return (w.result);
 }
 
 /*
  * Schedules epoch-backed @fd instance deletion.
  * * Unlinks @fd from the list of active algo instances.
  * * Removes rib subscription.
  * * Stops callout.
  * * Schedules actual deletion.
  *
  * Assume @fd is already unlinked from the datapath.
  */
 /*
  * NOTE(review): @in_callout is not used by this function body.
  * Always returns 0.
  */
 static int
 schedule_destroy_fd_instance(struct fib_data *fd, bool in_callout)
 {
 	bool is_dead;
 
 	NET_EPOCH_ASSERT();
 	RIB_WLOCK_ASSERT(fd->fd_rh);
 
 	/* Mark dead and unlink under the module lock */
 	FIB_MOD_LOCK();
 	is_dead = fd->fd_dead;
 	if (!is_dead)
 		fd->fd_dead = true;
 	if (fd->fd_linked) {
 		TAILQ_REMOVE(&V_fib_data_list, fd, entries);
 		fd->fd_linked = false;
 	}
 	FIB_MOD_UNLOCK();
 	/* Destruction has already been scheduled by an earlier call */
 	if (is_dead)
 		return (0);
 
 	FD_PRINTF(LOG_INFO, fd, "DETACH");
 
 	if (fd->fd_rs != NULL)
 		rib_unsibscribe_locked(fd->fd_rs);
 
 	/*
 	 * After rib_unsubscribe() no _new_ handle_rtable_change_cb() calls
 	 * will be executed, hence no _new_ callout schedules will happen.
 	 */
 	callout_stop(&fd->fd_callout);
 
 	/* The actual free is deferred to destroy_fd_instance_epoch() */
 	fib_epoch_call(destroy_fd_instance_epoch, &fd->fd_epoch_ctx);
 
 	return (0);
 }
 
 /*
  * Wipe all fd instances from the list matching rib specified by @rh.
  * If @keep_first is set, remove all but the first record.
  */
 static void
 fib_cleanup_algo(struct rib_head *rh, bool keep_first, bool in_callout)
 {
 	struct fib_data_head tmp_head = TAILQ_HEAD_INITIALIZER(tmp_head);
 	struct fib_data *fd, *fd_tmp;
 	struct epoch_tracker et;
 
 	/* Pass 1: move matching entries to a private list under the module lock */
 	FIB_MOD_LOCK();
 	TAILQ_FOREACH_SAFE(fd, &V_fib_data_list, entries, fd_tmp) {
 		if (fd->fd_rh == rh) {
 			if (keep_first) {
 				/* Spare the first match only */
 				keep_first = false;
 				continue;
 			}
 			TAILQ_REMOVE(&V_fib_data_list, fd, entries);
 			fd->fd_linked = false;
 			TAILQ_INSERT_TAIL(&tmp_head, fd, entries);
 		}
 	}
 	FIB_MOD_UNLOCK();
 
 	/* Pass 2: remove each entry */
 	NET_EPOCH_ENTER(et);
 	TAILQ_FOREACH_SAFE(fd, &tmp_head, entries, fd_tmp) {
 		/*
 		 * The rib write lock is taken here unless @in_callout is set
 		 *  (presumably the callout path already holds it -- confirm).
 		 */
 		if (!in_callout)
 			RIB_WLOCK(fd->fd_rh);
 		schedule_destroy_fd_instance(fd, in_callout);
 		if (!in_callout)
 			RIB_WUNLOCK(fd->fd_rh);
 	}
 	NET_EPOCH_EXIT(et);
 }
 
 /*
  * Removes every fib lookup instance attached to the rib @rh.
  */
 void
 fib_destroy_rib(struct rib_head *rh)
 {
 
 	/*
 	 * rnh has `rib_dying` flag set, so setup of new fd's will fail at
 	 *  sync_algo() stage, preventing new entries to be added to the list
 	 *  of active algos. Remove all existing entries for the particular rib.
 	 */
 	fib_cleanup_algo(rh, false, false);
 }
 
 /*
  * Finalises fd destruction by freeing all fd resources.
  */
 static void
 destroy_fd_instance(struct fib_data *fd)
 {
 
 	FD_PRINTF(LOG_INFO, fd, "destroy fd %p", fd);
 
 	/* Call destroy callback first */
 	if (fd->fd_algo_data != NULL)
 		fd->fd_flm->flm_destroy_cb(fd->fd_algo_data);
 
 	/* Nhop table */
 	if (fd->nh_idx != NULL) {
 		/*
 		 * Nexthop references are only walked when the refcount table
 		 *  exists. The index array itself is released in either case:
 		 *  try_setup_fd_instance() can fail after allocating nh_idx
 		 *  but before nh_ref_table is allocated.
 		 */
 		if (fd->nh_ref_table != NULL) {
 			for (int i = 0; i < fd->number_nhops; i++) {
 				if (!is_idx_free(fd, i)) {
 					FD_PRINTF(LOG_DEBUG2, fd, " FREE nhop %d %p",
 					    i, fd->nh_idx[i]);
 					nhop_free_any(fd->nh_idx[i]);
 				}
 			}
 		}
 		free(fd->nh_idx, M_RTABLE);
 	}
 	if (fd->nh_ref_table != NULL)
 		free(fd->nh_ref_table, M_RTABLE);
 
 	/* Pending batched changes, if any were ever queued */
 	if (fd->fd_ss.fd_change_queue.entries != NULL)
 		free(fd->fd_ss.fd_change_queue.entries, M_TEMP);
 
 	/* Drop the module reference taken when the instance was set up */
 	fib_unref_algo(fd->fd_flm);
 
 	free(fd, M_RTABLE);
 }
 
 /*
  * Epoch callback indicating fd is safe to destroy
  */
 static void
 destroy_fd_instance_epoch(epoch_context_t ctx)
 {
 
 	/* @ctx is the fd_epoch_ctx member embedded in the fib_data to free */
 	destroy_fd_instance(__containerof(ctx, struct fib_data, fd_epoch_ctx));
 }
 
 /*
  * Tries to setup fd instance.
  * - Allocates fd/nhop table
  * - Runs algo:flm_init_cb algo init
  * - Subscribes fd to the rib
  * - Runs rtable dump
  * - Adds instance to the list of active instances.
  *
  * @old_fd may be NULL; when set, its algo data is passed to the init callback.
  *
  * Returns: operation result. Fills in @pfd with resulting fd on success.
  *
  * On every failure after the initial allocation @pfd still points to the
  *  partially constructed fd: nothing is released here, so the caller has
  *  to dispose of it.
  */
 static enum flm_op_result
 try_setup_fd_instance(struct fib_lookup_module *flm, struct rib_head *rh,
     struct fib_data *old_fd, struct fib_data **pfd)
 {
 	struct fib_data *fd;
 	size_t size;
 	enum flm_op_result result;
 
 	/* Allocate */
 	fd = malloc(sizeof(struct fib_data), M_RTABLE, M_NOWAIT | M_ZERO);
 	if (fd == NULL)  {
 		*pfd = NULL;
 		RH_PRINTF(LOG_INFO, rh, "Unable to allocate fib_data structure");
 		return (FLM_REBUILD);
 	}
 	/* Publish fd early so the failure paths below leave it with the caller */
 	*pfd = fd;
 
 	/* NOTE(review): presumably fills in fd->number_nhops used below - confirm */
 	estimate_nhop_scale(old_fd, fd);
 
 	fd->fd_rh = rh;
 	fd->fd_gen = ++fib_gen;
 	fd->fd_family = rh->rib_family;
 	fd->fd_fibnum = rh->rib_fibnum;
 	callout_init_rm(&fd->fd_callout, &rh->rib_lock, 0);
 	fd->fd_vnet = curvnet;
 	fd->fd_flm = flm;
 
 	FD_PRINTF(LOG_DEBUG, fd, "allocated fd %p", fd);
 
 	/* Reference the algo on behalf of fd; destroy_fd_instance() drops it */
 	FIB_MOD_LOCK();
 	flm->flm_refcount++;
 	FIB_MOD_UNLOCK();
 
 	/* Allocate nhidx -> nhop_ptr table */
 	size = fd->number_nhops * sizeof(void *);
 	fd->nh_idx = malloc(size, M_RTABLE, M_NOWAIT | M_ZERO);
 	if (fd->nh_idx == NULL) {
 		FD_PRINTF(LOG_INFO, fd, "Unable to allocate nhop table idx (sz:%zu)", size);
 		return (FLM_REBUILD);
 	}
 
 	/* Allocate nhop index refcount table */
 	size = sizeof(struct nhop_ref_table);
 	size += fd->number_nhops * sizeof(uint32_t);
 	fd->nh_ref_table = malloc(size, M_RTABLE, M_NOWAIT | M_ZERO);
 	if (fd->nh_ref_table == NULL) {
 		FD_PRINTF(LOG_INFO, fd, "Unable to allocate nhop refcount table (sz:%zu)", size);
 		return (FLM_REBUILD);
 	}
 	FD_PRINTF(LOG_DEBUG, fd, "Allocated %u nhop indexes", fd->number_nhops);
 
 	/* Okay, we're ready for algo init */
 	void *old_algo_data = (old_fd != NULL) ? old_fd->fd_algo_data : NULL;
 	result = flm->flm_init_cb(fd->fd_fibnum, fd, old_algo_data, &fd->fd_algo_data);
 	if (result != FLM_SUCCESS) {
 		FD_PRINTF(LOG_INFO, fd, "%s algo init failed", flm->flm_name);
 		return (result);
 	}
 
 	/* Try to subscribe; only algos providing a per-item change callback need it */
 	if (flm->flm_change_rib_item_cb != NULL) {
 		fd->fd_rs = rib_subscribe_locked(fd->fd_rh,
 		    handle_rtable_change_cb, fd, RIB_NOTIFY_IMMEDIATE);
 		if (fd->fd_rs == NULL) {
 			FD_PRINTF(LOG_INFO, fd, "failed to subscribe to the rib changes");
 			return (FLM_REBUILD);
 		}
 	}
 
 	/* Dump */
 	result = sync_algo(fd);
 	if (result != FLM_SUCCESS) {
 		FD_PRINTF(LOG_INFO, fd, "rib sync failed");
 		return (result);
 	}
 	FD_PRINTF(LOG_INFO, fd, "DUMP completed successfully.");
 
 	FIB_MOD_LOCK();
 	/*
 	 * Insert fd in the beginning of a list, to maintain invariant
 	 *  that first matching entry for the AF/fib is always the active
 	 *  one.
 	 */
 	TAILQ_INSERT_HEAD(&V_fib_data_list, fd, entries);
 	fd->fd_linked = true;
 	FIB_MOD_UNLOCK();
 
 	return (FLM_SUCCESS);
 }
 
 /*
  * Sets up algo @flm for table @rh and links it to the datapath.
  *
  * Makes up to FIB_MAX_TRIES attempts. An attempt ending in FLM_REBUILD is
  *  retried, with the instance it produced handed to the next attempt as the
  *  previous fd; any other result ends the loop. Intermediate instances are
  *  scheduled for destruction here, @orig_fd never is.
  *
  * @orig_fd may be NULL. The datapath is touched only when @attach is true.
  * Must be called in the net epoch with the rib write lock held.
  *
  * Returns the last operation result. @pfd receives the new instance on
  *  success and NULL otherwise.
  */
 static enum flm_op_result
 setup_fd_instance(struct fib_lookup_module *flm, struct rib_head *rh,
     struct fib_data *orig_fd, struct fib_data **pfd, bool attach)
 {
 	struct fib_data *prev_fd, *new_fd;
 	enum flm_op_result result;
 
 	NET_EPOCH_ASSERT();
 	RIB_WLOCK_ASSERT(rh);
 
 	prev_fd = orig_fd;
 	new_fd = NULL;
 	for (int i = 0; i < FIB_MAX_TRIES; i++) {
 		result = try_setup_fd_instance(flm, rh, prev_fd, &new_fd);
 
 		/* A failed datapath attach is handled like a rebuild request */
 		if ((result == FLM_SUCCESS) && attach) {
-			if (!fib_set_datapath_ptr(new_fd, &new_fd->fd_dp))
+			if (fib_set_datapath_ptr(new_fd, &new_fd->fd_dp))
+				sync_rib_gen(new_fd);
+			else
 				result = FLM_REBUILD;
 		}
 
 		/* The instance left over from the previous attempt is no longer needed */
 		if ((prev_fd != NULL) && (prev_fd != orig_fd)) {
 			schedule_destroy_fd_instance(prev_fd, false);
 			prev_fd = NULL;
 		}
 
 		RH_PRINTF(LOG_INFO, rh, "try %d: fib algo result: %s", i,
 		    print_op_result(result));
 
 		if (result == FLM_REBUILD) {
 			/* Keep the failed instance as input for the next attempt */
 			prev_fd = new_fd;
 			new_fd = NULL;
 			continue;
 		}
 
 		break;
 	}
 
 	if (result != FLM_SUCCESS) {
 		RH_PRINTF(LOG_WARNING, rh,
 		    "%s algo instance setup failed, failures=%d", flm->flm_name,
 		    orig_fd ? orig_fd->fd_failed_rebuilds + 1 : 0);
 		/* update failure count */
 		FIB_MOD_LOCK();
 		if (orig_fd != NULL)
 			orig_fd->fd_failed_rebuilds++;
 		FIB_MOD_UNLOCK();
 
 		/* Ban algo on non-recoverable error */
 		if (result == FLM_ERROR)
 			flm_error_add(flm, rh->rib_fibnum);
 
 		/* Dispose of whatever the last attempt left behind */
 		if ((prev_fd != NULL) && (prev_fd != orig_fd))
 			schedule_destroy_fd_instance(prev_fd, false);
 		if (new_fd != NULL) {
 			schedule_destroy_fd_instance(new_fd, false);
 			new_fd = NULL;
 		}
 	}
 
 	*pfd = new_fd;
 	return (result);
 }
 
 /*
  * Tries to sync algo with the current rtable state, either
  * by executing batch update or rebuilding.
  * Returns true on success.
  */
 static bool
 execute_callout_action(struct fib_data *fd)
 {
 	enum fib_callout_action action = fd->fd_callout_action;
 	struct fib_lookup_module *flm_new = NULL;
 	bool result = true;
 
 	NET_EPOCH_ASSERT();
 	RIB_WLOCK_ASSERT(fd->fd_rh);
 
 	fd->fd_need_rebuild = false;
 	fd->fd_batch = false;
 	fd->fd_num_changes = 0;
 
 	/* First, check if we're still OK to use this algo */
 	if (!is_algo_fixed(fd->fd_rh))
 		flm_new = fib_check_best_algo(fd->fd_rh, fd->fd_flm);
 	if (flm_new != NULL)
 		action = FDA_REBUILD;
 
 	if (action == FDA_BATCH) {
 		/* Try to sync */
 		if (!apply_rtable_changes(fd))
 			action = FDA_REBUILD;
 	}
 
 	if (action == FDA_REBUILD)
 		result = rebuild_fd_flm(fd, flm_new != NULL ? flm_new : fd->fd_flm);
 	if (flm_new != NULL)
 		fib_unref_algo(flm_new);
 
 	return (result);
 }
 
 /*
  * Callout for all scheduled fd-related work.
  * - Checks if the current algo is still the best algo
  * - Synchronises algo instance to the rtable (batch usecase)
  * - Creates a new instance of an algo for af/fib if desired.
  *
  * @_data is the struct fib_data the callout belongs to. The work runs in
  *  the net epoch with the vnet stored in the fd set as current; the result
  *  of execute_callout_action() is not examined here.
  */
 static void
 handle_fd_callout(void *_data)
 {
 	struct fib_data *fd = (struct fib_data *)_data;
 	struct epoch_tracker et;
 
 	FD_PRINTF(LOG_INFO, fd, "running callout type=%d", fd->fd_callout_action);
 
 	NET_EPOCH_ENTER(et);
 	CURVNET_SET(fd->fd_vnet);
 	execute_callout_action(fd);
 	CURVNET_RESTORE();
 	NET_EPOCH_EXIT(et);
 }
 
 /*
  * Tries to create new algo instance based on @fd data.
  * Returns true on success.
  */
 static bool
 rebuild_fd_flm(struct fib_data *fd, struct fib_lookup_module *flm_new)
 {
 	struct fib_data *fd_new, *fd_tmp = NULL;
 	bool result;
 
 	if (flm_new == fd->fd_flm)
 		fd_tmp = fd;
 	else
 		FD_PRINTF(LOG_NOTICE, fd, "switching algo to %s", flm_new->flm_name);
 
 	result = setup_fd_instance(flm_new, fd->fd_rh, fd_tmp, &fd_new, true);
 	if (result != FLM_SUCCESS) {
 		FD_PRINTF(LOG_NOTICE, fd, "table rebuild failed");
 		return (false);
 	}
 	FD_PRINTF(LOG_INFO, fd_new, "switched to new instance");
 
 	/* Remove old instance */
 	schedule_destroy_fd_instance(fd, true);
 
 	return (true);
 }
 
 static bool
 rebuild_fd(struct fib_data *fd, const char *reason)
 {
 	struct fib_lookup_module *flm_new = NULL;
 	bool result;
 
 	if (!is_algo_fixed(fd->fd_rh))
 		flm_new = fib_check_best_algo(fd->fd_rh, fd->fd_flm);
 
 	FD_PRINTF(LOG_INFO, fd, "running sync rebuild: %s", reason);
 	result = rebuild_fd_flm(fd, flm_new != NULL ? flm_new : fd->fd_flm);
 	if (flm_new != NULL)
 		fib_unref_algo(flm_new);
 
 	if (!result) {
 		FD_PRINTF(LOG_ERR, fd, "sync rebuild failed");
 		schedule_fd_rebuild(fd, "sync rebuild failed");
 	}
 
 	return (result);
 }
 
 /*
  * Looks up a registered algo by its name and address family.
  * Returns the algo with its refcount bumped, or NULL if there is no match.
  */
 static struct fib_lookup_module *
 fib_find_algo(const char *algo_name, int family)
 {
 	struct fib_lookup_module *cur, *found = NULL;
 
 	FIB_MOD_LOCK();
 	TAILQ_FOREACH(cur, &all_algo_list, entries) {
 		if (family != cur->flm_family)
 			continue;
 		if (strcmp(cur->flm_name, algo_name) != 0)
 			continue;
 		/* Take the reference while still holding the module lock */
 		cur->flm_refcount++;
 		found = cur;
 		break;
 	}
 	FIB_MOD_UNLOCK();
 
 	return (found);
 }
 
 /*
  * Drops one reference on algo @flm.
  * The counter is decremented under the module lock; nothing else happens
  *  here when it reaches zero.
  */
 static void
 fib_unref_algo(struct fib_lookup_module *flm)
 {
 
 	FIB_MOD_LOCK();
 	flm->flm_refcount--;
 	FIB_MOD_UNLOCK();
 }
 
 /*
  * Sysctl backend for reading/setting the lookup algo of table
  *  @family/@fibnum.
  * Reports the name of the current algo; when a different name is written,
  *  sets up an instance of that algo, attaches it to the datapath, marks the
  *  algo as fixed for the family and removes the older instance(s).
  *
  * Returns 0 on success (including read-only access and an unchanged name),
  *  ENOENT if there is no instance for the af/fib, ESRCH if the requested
  *  algo is not registered, EINVAL if the new instance could not be set up,
  *  or the error returned by sysctl_handle_string().
  */
 static int
 set_fib_algo(uint32_t fibnum, int family, struct sysctl_oid *oidp, struct sysctl_req *req)
 {
 	struct fib_lookup_module *flm = NULL;
 	struct fib_data *fd = NULL;
 	char old_algo_name[32], algo_name[32];
 	struct rib_head *rh = NULL;
 	enum flm_op_result result;
 	struct epoch_tracker et;
 	int error;
 
 	/* Fetch current algo/rib for af/family */
 	FIB_MOD_LOCK();
 	TAILQ_FOREACH(fd, &V_fib_data_list, entries) {
 		if ((fd->fd_family == family) && (fd->fd_fibnum == fibnum))
 			break;
 	}
 	if (fd == NULL) {
 		FIB_MOD_UNLOCK();
 		return (ENOENT);
 	}
 	/* Copy what is needed from fd while the list is still locked */
 	rh = fd->fd_rh;
 	strlcpy(old_algo_name, fd->fd_flm->flm_name,
 	    sizeof(old_algo_name));
 	FIB_MOD_UNLOCK();
 
 	strlcpy(algo_name, old_algo_name, sizeof(algo_name));
 	error = sysctl_handle_string(oidp, algo_name, sizeof(algo_name), req);
 	/* Stop here on error or when no new value was supplied */
 	if (error != 0 || req->newptr == NULL)
 		return (error);
 
 	if (strcmp(algo_name, old_algo_name) == 0)
 		return (0);
 
 	/* New algorithm name is different */
 	flm = fib_find_algo(algo_name, family);
 	if (flm == NULL) {
 		RH_PRINTF(LOG_INFO, rh, "unable to find algo %s", algo_name);
 		return (ESRCH);
 	}
 
 	fd = NULL;
 	NET_EPOCH_ENTER(et);
 	RIB_WLOCK(rh);
 	result = setup_fd_instance(flm, rh, NULL, &fd, true);
 	RIB_WUNLOCK(rh);
 	NET_EPOCH_EXIT(et);
 	/* Drop the lookup reference taken by fib_find_algo() */
 	fib_unref_algo(flm);
 	if (result != FLM_SUCCESS)
 		return (EINVAL);
 
 	/* Disable automated jumping between algos */
 	FIB_MOD_LOCK();
 	set_algo_fixed(rh);
 	FIB_MOD_UNLOCK();
 	/* Remove old instance(s) */
 	fib_cleanup_algo(rh, true, false);
 
 	/* Drain cb so user can unload the module after userret if so desired */
 	epoch_drain_callbacks(net_epoch_preempt);
 
 	return (0);
 }
 
 #ifdef INET
 /*
  * Sysctl handler for the IPv4 "algo" node: gets/sets the lookup algo of the
  * AF_INET table selected by the fib number of the calling process.
  */
 static int
 set_algo_inet_sysctl_handler(SYSCTL_HANDLER_ARGS)
 {
 
 	return (set_fib_algo(curthread->td_proc->p_fibnum, AF_INET, oidp, req));
 }
 SYSCTL_PROC(_net_route_algo_inet, OID_AUTO, algo,
     CTLFLAG_VNET | CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0,
     set_algo_inet_sysctl_handler, "A", "Set IPv4 lookup algo");
 #endif
 
 #ifdef INET6
 /*
  * Sysctl handler for the IPv6 "algo" node: gets/sets the lookup algo of the
  * AF_INET6 table selected by the fib number of the calling process.
  */
 static int
 set_algo_inet6_sysctl_handler(SYSCTL_HANDLER_ARGS)
 {
 
 	return (set_fib_algo(curthread->td_proc->p_fibnum, AF_INET6, oidp, req));
 }
 SYSCTL_PROC(_net_route_algo_inet6, OID_AUTO, algo,
     CTLFLAG_VNET | CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0,
     set_algo_inet6_sysctl_handler, "A", "Set IPv6 lookup algo");
 #endif
 
 /*
  * Epoch callback: frees the datapath header that embeds @ctx.
  */
 static void
 destroy_fdh_epoch(epoch_context_t ctx)
 {
 	struct fib_dp_header *fdh;
 
 	fdh = __containerof(ctx, struct fib_dp_header, fdh_epoch_ctx);
 	free(fdh, M_RTABLE);
 }
 
 /*
  * Allocates a zeroed datapath header followed by @num_tables fib_dp slots.
  * Sleeps for memory when @waitok is set, otherwise may fail.
  * Returns the header with fdh_num_tables filled in, or NULL.
  */
 static struct fib_dp_header *
 alloc_fib_dp_array(uint32_t num_tables, bool waitok)
 {
 	struct fib_dp_header *hdr;
 	size_t total;
 	int mflags;
 
 	mflags = M_ZERO;
 	if (waitok)
 		mflags |= M_WAITOK;
 	else
 		mflags |= M_NOWAIT;
 
 	total = sizeof(struct fib_dp_header) +
 	    sizeof(struct fib_dp) * num_tables;
 	hdr = malloc(total, M_RTABLE, mflags);
 	if (hdr == NULL)
 		return (NULL);
 
 	hdr->fdh_num_tables = num_tables;
 	return (hdr);
 }
 
 /*
  * Returns the header of the datapath array @dp points into.
  * @dp has to be the address of the array's first element (fdh_idx), as the
  *  header is derived from it with __containerof().
  */
 static struct fib_dp_header *
 get_fib_dp_header(struct fib_dp *dp)
 {
 
 	return (__containerof((void *)dp, struct fib_dp_header, fdh_idx));
 }
 
 /*
  * Replace per-family index pool @pdp with a new one which
  * contains updated callback/algo data from @fd.
  *
  * If the lookup function for the fib stays the same, only the argument is
  *  updated in the existing array. Otherwise a copy of the array with the
  *  slot for @fd's fib replaced by @dp is published, and the old array is
  *  freed via an epoch callback.
  *
  * Returns true on success, false if the new array cannot be allocated.
  */
 static bool
 replace_rtables_family(struct fib_dp **pdp, struct fib_data *fd, struct fib_dp *dp)
 {
 	struct fib_dp_header *new_fdh, *old_fdh;
 
 	NET_EPOCH_ASSERT();
 
 	FD_PRINTF(LOG_DEBUG, fd, "[vnet %p] replace with f:%p arg:%p",
 	    curvnet, dp->f, dp->arg);
 
 	FIB_MOD_LOCK();
 	old_fdh = get_fib_dp_header(*pdp);
 
 	if (old_fdh->fdh_idx[fd->fd_fibnum].f == dp->f) {
 		/*
 		 * Function is the same, data pointer needs update.
 		 * Perform in-line replace without reallocation.
 		 */
 		old_fdh->fdh_idx[fd->fd_fibnum].arg = dp->arg;
 		FD_PRINTF(LOG_DEBUG, fd, "FDH %p inline update", old_fdh);
 		FIB_MOD_UNLOCK();
 		return (true);
 	}
 
 	/* Non-sleeping allocation: the module lock is held */
 	new_fdh = alloc_fib_dp_array(old_fdh->fdh_num_tables, false);
 	FD_PRINTF(LOG_DEBUG, fd, "OLD FDH: %p NEW FDH: %p", old_fdh, new_fdh);
 	if (new_fdh == NULL) {
 		FIB_MOD_UNLOCK();
 		FD_PRINTF(LOG_WARNING, fd, "error attaching datapath");
 		return (false);
 	}
 
 	memcpy(&new_fdh->fdh_idx[0], &old_fdh->fdh_idx[0],
 	    old_fdh->fdh_num_tables * sizeof(struct fib_dp));
 	/* Update relevant data structure for @fd */
 	new_fdh->fdh_idx[fd->fd_fibnum] = *dp;
 
 	/* Ensure memcpy() writes have completed */
 	atomic_thread_fence_rel();
 	/* Set new datapath pointer */
 	*pdp = &new_fdh->fdh_idx[0];
 	FIB_MOD_UNLOCK();
 	FD_PRINTF(LOG_DEBUG, fd, "update %p -> %p", old_fdh, new_fdh);
 
 	/* Free the old array only after the current epoch has ended */
 	fib_epoch_call(destroy_fdh_epoch, &old_fdh->fdh_epoch_ctx);
 
 	return (true);
 }
 
 /*
  * Maps an address family to the location of its datapath array pointer.
  * Returns NULL for families other than AF_INET and AF_INET6.
  */
 static struct fib_dp **
 get_family_dp_ptr(int family)
 {
 
 	if (family == AF_INET)
 		return (&V_inet_dp);
 	if (family == AF_INET6)
 		return (&V_inet6_dp);
 	return (NULL);
 }
 
 /*
  * Points the datapath of @fd's family/fib at the lookup data in @dp.
  * Returns true on success.
  */
 bool
 fib_set_datapath_ptr(struct fib_data *fd, struct fib_dp *dp)
 {
 
 	return (replace_rtables_family(get_family_dp_ptr(fd->fd_family),
 	    fd, dp));
 }
 
 /*
  * Grow datapath pointers array.
  * Called from sysctl handler on growing number of routing tables.
  *
  * Allocates an array of @new_num_tables entries, copies the entries of the
  *  current array (if there is one) into it, publishes it through @pdp and
  *  frees the old array via an epoch callback.
  */
 static void
 grow_rtables_family(struct fib_dp **pdp, uint32_t new_num_tables)
 {
 	struct fib_dp_header *new_fdh, *old_fdh = NULL;
 
 	/* Allocation is allowed to sleep; the result is used without a NULL check */
 	new_fdh = alloc_fib_dp_array(new_num_tables, true);
 
 	FIB_MOD_LOCK();
 	if (*pdp != NULL) {
 		old_fdh = get_fib_dp_header(*pdp);
 		/* Copies the old table count: relies on the array only growing */
 		memcpy(&new_fdh->fdh_idx[0], &old_fdh->fdh_idx[0],
 		    old_fdh->fdh_num_tables * sizeof(struct fib_dp));
 	}
 
 	/* Wait till all writes completed */
 	atomic_thread_fence_rel();
 
 	*pdp = &new_fdh->fdh_idx[0];
 	FIB_MOD_UNLOCK();
 
 	if (old_fdh != NULL)
 		fib_epoch_call(destroy_fdh_epoch, &old_fdh->fdh_epoch_ctx);
 }
 
 /*
  * Grows per-AF arrays of datapath pointers for each supported family.
  * Called from fibs resize sysctl handler.
  *
  * A family is handled only when the kernel is built with it (INET/INET6).
  */
 void
 fib_grow_rtables(uint32_t new_num_tables)
 {
 
 #ifdef INET
 	grow_rtables_family(get_family_dp_ptr(AF_INET), new_num_tables);
 #endif
 #ifdef INET6
 	grow_rtables_family(get_family_dp_ptr(AF_INET6), new_num_tables);
 #endif
 }
 
 /*
  * Fills @rinfo with the current counters of rib @rh: number of prefixes,
  * number of nexthops and, with ROUTE_MPATH, number of nexthop groups.
  * @rinfo is zeroed first, so fields not set here read as 0.
  */
 void
 fib_get_rtable_info(struct rib_head *rh, struct rib_rtable_info *rinfo)
 {
 
 	bzero(rinfo, sizeof(struct rib_rtable_info));
 	rinfo->num_prefixes = rh->rnh_prefixes;
 	rinfo->num_nhops = nhops_get_count(rh);
 #ifdef ROUTE_MPATH
 	rinfo->num_nhgrp = nhgrp_get_count(rh);
 #endif
 }
 
 /*
  * Updates pointer to the algo data for the @fd.
  * The caller has to hold the write lock of the rib @fd is attached to.
  */
 void
 fib_set_algo_ptr(struct fib_data *fd, void *algo_data)
 {
 	RIB_WLOCK_ASSERT(fd->fd_rh);
 
 	fd->fd_algo_data = algo_data;
 }
 
 /*
  * Calls @callback with @ctx after the end of a current epoch.
  * The callback is queued on net_epoch_preempt.
  */
 void
 fib_epoch_call(epoch_callback_t callback, epoch_context_t ctx)
 {
 	epoch_call(net_epoch_preempt, callback, ctx);
 }
 
 /*
  * Accessor to get rib instance @fd is attached to.
  * Returns the stored pointer as is.
  */
 struct rib_head *
 fib_get_rh(struct fib_data *fd)
 {
 
 	return (fd->fd_rh);
 }
 
 /*
  * Accessor to export idx->nhop array
  * Returns @fd's own table, not a copy.
  */
 struct nhop_object **
 fib_get_nhop_array(struct fib_data *fd)
 {
 
 	return (fd->nh_idx);
 }
 
 /*
  * Computes the index @nh occupies in the per-fd nexthop tables.
  * With ROUTE_MPATH, nexthops map to even values (idx * 2) and nexthop
  * groups to odd ones (idx * 2 - 1); otherwise the nexthop index is used
  * as is.
  */
 static uint32_t
 get_nhop_idx(struct nhop_object *nh)
 {
 #ifdef ROUTE_MPATH
 	if (!NH_IS_NHGRP(nh))
 		return (nhop_get_idx(nh) * 2);
 	return (nhgrp_get_idx((struct nhgrp_object *)nh) * 2 - 1);
 #else
 	return (nhop_get_idx(nh));
 #endif
 }
 
 /*
  * Public wrapper returning the nexthop table index of @nh.
  * @fd is not used by the computation.
  */
 uint32_t
 fib_get_nhop_idx(struct fib_data *fd, struct nhop_object *nh)
 {
 
 	return (get_nhop_idx(nh));
 }
 
 /*
  * Returns true if nexthop index @index has no references in @fd.
  * No bounds check is made on @index; fd->nh_ref_table must be allocated.
  */
 static bool
 is_idx_free(struct fib_data *fd, uint32_t index)
 {
 
 	return (fd->nh_ref_table->refcnt[index] == 0);
 }
 
 /*
  * Takes an index reference on @nh within @fd.
  * The first reference to an index also references the nexthop itself and
  *  records it in the idx->nhop table.
  *
  * Returns the index of @nh, or 0 with fd->hit_nhops set when the index
  *  does not fit into the tables of @fd.
  */
 static uint32_t
 fib_ref_nhop(struct fib_data *fd, struct nhop_object *nh)
 {
 	uint32_t idx = get_nhop_idx(nh);
 
 	if (idx >= fd->number_nhops) {
 		/* Table too small: flag it instead of referencing */
 		fd->hit_nhops = 1;
 		return (0);
 	}
 
 	if (is_idx_free(fd, idx)) {
 		nhop_ref_any(nh);
 		fd->nh_idx[idx] = nh;
 		fd->nh_ref_table->count++;
 		FD_PRINTF(LOG_DEBUG2, fd, " REF nhop %u %p", idx, fd->nh_idx[idx]);
 	}
 	fd->nh_ref_table->refcnt[idx]++;
 
 	return (idx);
 }
 
 /*
  * Request for a delayed nexthop release, see fib_schedule_release_nhop().
  */
 struct nhop_release_data {
 	struct nhop_object	*nh;	/* nexthop to release */
 	struct epoch_context	ctx;	/* context passed to the epoch callback */
 };
 
 /*
  * Epoch callback: releases the nexthop recorded in the request that embeds
  * @ctx, then frees the request.
  */
 static void
 release_nhop_epoch(epoch_context_t ctx)
 {
 	struct nhop_release_data *nrd;
 
 	nrd = __containerof(ctx, struct nhop_release_data, ctx);
 	nhop_free_any(nrd->nh);
 	free(nrd, M_TEMP);
 }
 
 /*
  * Releases the reference on @nh only after the current epoch has ended.
  * Lookups running against not-yet-updated datastructures may still hand
  *  out the old nexthop until then; dropping what may be the last reference
  *  right away would trigger nexthop deletion and invalidate it under them.
  */
 static void
 fib_schedule_release_nhop(struct fib_data *fd, struct nhop_object *nh)
 {
 	struct nhop_release_data *req;
 
 	req = malloc(sizeof(struct nhop_release_data), M_TEMP,
 	    M_NOWAIT | M_ZERO);
 	if (req == NULL) {
 		/*
 		 * No memory for the request. The nexthop is deliberately
 		 *  leaked, keeping the guarantee that each nhop can be
 		 *  referenced.
 		 */
 		FD_PRINTF(LOG_ERR, fd, "unable to schedule nhop %p deletion", nh);
 		return;
 	}
 
 	req->nh = nh;
 	fib_epoch_call(release_nhop_epoch, &req->ctx);
 }
 
 /*
  * Drops one index reference on @nh held by @fd.
  * When the last reference to the index goes away, the nexthop stored for it
  *  is handed to fib_schedule_release_nhop() for a delayed release.
  */
 static void
 fib_unref_nhop(struct fib_data *fd, struct nhop_object *nh)
 {
 	uint32_t idx = get_nhop_idx(nh);
 
 	KASSERT((idx < fd->number_nhops), ("invalid nhop index"));
 	KASSERT((nh == fd->nh_idx[idx]), ("index table contains wrong nh"));
 
 	fd->nh_ref_table->refcnt[idx]--;
 	if (fd->nh_ref_table->refcnt[idx] == 0) {
 		/* idx is uint32_t: print it with %u, as fib_ref_nhop() does */
 		FD_PRINTF(LOG_DEBUG, fd, " FREE nhop %u %p", idx, fd->nh_idx[idx]);
 		fib_schedule_release_nhop(fd, fd->nh_idx[idx]);
 	}
 }
 
 /*
  * Marks the lookup algo as fixed for the address family of @rh.
  * Families other than the compiled-in AF_INET/AF_INET6 are ignored.
  */
 static void
 set_algo_fixed(struct rib_head *rh)
 {
 	switch (rh->rib_family) {
 #ifdef INET
 	case AF_INET:
 		V_algo_fixed_inet = true;
 		break;
 #endif
 #ifdef INET6
 	case AF_INET6:
 		V_algo_fixed_inet6 = true;
 		break;
 #endif
 	}
 }
 
 /*
  * Returns true if the lookup algo has been marked as fixed for the address
  * family of @rh, false otherwise (including for unhandled families).
  */
 static bool
 is_algo_fixed(struct rib_head *rh)
 {
 
 	switch (rh->rib_family) {
 #ifdef INET
 	case AF_INET:
 		return (V_algo_fixed_inet);
 #endif
 #ifdef INET6
 	case AF_INET6:
 		return (V_algo_fixed_inet6);
 #endif
 	}
 	return (false);
 }
 
 /*
  * Runs the check on what would be the best algo for rib @rh, assuming
  *  that the current algo is the one specified by @orig_flm. Note that
  *  it can be NULL for initial selection.
  *
  * Every registered algo of the rib's family reports a preference for the
  *  current table counters. The highest-rated algo without a recorded error
  *  for this fib wins, provided it beats the current algo's preference by
  *  more than BEST_DIFF_PERCENT.
  *
  * Returns referenced new algo or NULL if the current one is the best.
  */
 static struct fib_lookup_module *
 fib_check_best_algo(struct rib_head *rh, struct fib_lookup_module *orig_flm)
 {
 	uint8_t preference, curr_preference = 0, best_preference = 0;
 	struct fib_lookup_module *flm, *best_flm = NULL;
 	struct rib_rtable_info rinfo;
 	int candidate_algos = 0;
 
 	fib_get_rtable_info(rh, &rinfo);
 
 	FIB_MOD_LOCK();
 	TAILQ_FOREACH(flm, &all_algo_list, entries) {
 		if (flm->flm_family != rh->rib_family)
 			continue;
 		candidate_algos++;
 		preference = flm->flm_get_pref(&rinfo);
 		if (preference > best_preference) {
 			/* Skip algos that have an error recorded for this fib */
 			if (!flm_error_check(flm, rh->rib_fibnum)) {
 				best_preference = preference;
 				best_flm = flm;
 			}
 		}
 		/* Remember how the current algo rates itself */
 		if (flm == orig_flm)
 			curr_preference = preference;
 	}
 	/* Reference the winner under the lock, or report "no change" */
 	if ((best_flm != NULL) && (curr_preference + BEST_DIFF_PERCENT < best_preference))
 		best_flm->flm_refcount++;
 	else
 		best_flm = NULL;
 	FIB_MOD_UNLOCK();
 
 	RH_PRINTF(LOG_DEBUG, rh, "candidate_algos: %d, curr: %s(%d) result: %s(%d)",
 	    candidate_algos, orig_flm ? orig_flm->flm_name : "NULL", curr_preference,
 	    best_flm ? best_flm->flm_name : (orig_flm ? orig_flm->flm_name : "NULL"),
 	    best_preference);
 
 	return (best_flm);
 }
 
 /*
  * Called when new route table is created.
  * Selects, allocates and attaches fib algo for the table.
  *
  * Returns 0 on success, ENOENT if no algo could be selected for @rh and
  *  EINVAL if the instance setup failed.
  */
 int
 fib_select_algo_initial(struct rib_head *rh)
 {
 	struct fib_lookup_module *flm;
 	struct fib_data *fd = NULL;
 	enum flm_op_result result;
 	struct epoch_tracker et;
 	int error = 0;
 
 	/* No current algo yet, hence NULL */
 	flm = fib_check_best_algo(rh, NULL);
 	if (flm == NULL) {
 		RH_PRINTF(LOG_CRIT, rh, "no algo selected");
 		return (ENOENT);
 	}
 	RH_PRINTF(LOG_INFO, rh, "selected algo %s", flm->flm_name);
 
 	/* attach=false: the datapath slot is filled in directly below */
 	NET_EPOCH_ENTER(et);
 	RIB_WLOCK(rh);
 	result = setup_fd_instance(flm, rh, NULL, &fd, false);
 	RIB_WUNLOCK(rh);
 	NET_EPOCH_EXIT(et);
 
 	RH_PRINTF(LOG_DEBUG, rh, "result=%d fd=%p", result, fd);
 	if (result == FLM_SUCCESS) {
 
 		/*
 		 * Attach datapath directly to avoid multiple reallocations
 		 * during fib growth
 		 */
 		struct fib_dp_header *fdp;
 		struct fib_dp **pdp;
 
 		pdp = get_family_dp_ptr(rh->rib_family);
 		if (pdp != NULL) {
 			fdp = get_fib_dp_header(*pdp);
 			fdp->fdh_idx[fd->fd_fibnum] = fd->fd_dp;
 			FD_PRINTF(LOG_INFO, fd, "datapath attached");
 		}
 	} else {
 		error = EINVAL;
 		RH_PRINTF(LOG_CRIT, rh, "unable to setup algo %s", flm->flm_name);
 	}
 
 	/* Drop the reference returned by fib_check_best_algo() */
 	fib_unref_algo(flm);
 
 	return (error);
 }
 
 /*
  * Registers fib lookup module within the subsystem.
  * The module is appended to the list of all algos under the module lock.
  * Always returns 0.
  */
 int
 fib_module_register(struct fib_lookup_module *flm)
 {
 
 	FIB_MOD_LOCK();
 	ALGO_PRINTF(LOG_INFO, "attaching %s to %s", flm->flm_name,
 	    print_family(flm->flm_family));
 	TAILQ_INSERT_TAIL(&all_algo_list, flm, entries);
 	FIB_MOD_UNLOCK();
 
 	return (0);
 }
 
 /*
  * Tries to unregister fib lookup module.
  *
  * Returns 0 on success, EBUSY if module is still used
  *  by some of the tables.
  */
 int
 fib_module_unregister(struct fib_lookup_module *flm)
 {
 
 	FIB_MOD_LOCK();
 	if (flm->flm_refcount > 0) {
 		FIB_MOD_UNLOCK();
 		return (EBUSY);
 	}
 	fib_error_clear_flm(flm);
 	ALGO_PRINTF(LOG_INFO, "detaching %s from %s", flm->flm_name,
 	    print_family(flm->flm_family));
 	TAILQ_REMOVE(&all_algo_list, flm, entries);
 	FIB_MOD_UNLOCK();
 
 	return (0);
 }
 
 /*
  * Initialises the list of algo instances (V_fib_data_list) to empty.
  */
 void
 vnet_fib_init(void)
 {
 
 	TAILQ_INIT(&V_fib_data_list);
 }
 
 /*
  * Clears the recorded algo errors, under the module lock.
  */
 void
 vnet_fib_destroy(void)
 {
 
 	FIB_MOD_LOCK();
 	fib_error_clear();
 	FIB_MOD_UNLOCK();
 }
diff --git a/sys/net/route/route_ctl.c b/sys/net/route/route_ctl.c
index af3853041ac6..2ec25c94299d 100644
--- a/sys/net/route/route_ctl.c
+++ b/sys/net/route/route_ctl.c
@@ -1,1556 +1,1556 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2020 Alexander V. Chernikov
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_route.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/socket.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/rmlock.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_dl.h>
 #include <net/vnet.h>
 #include <net/route.h>
 #include <net/route/route_ctl.h>
 #include <net/route/route_var.h>
 #include <net/route/nhop_utils.h>
 #include <net/route/nhop.h>
 #include <net/route/nhop_var.h>
 #include <netinet/in.h>
 #include <netinet6/scope6_var.h>
 
 #include <vm/uma.h>
 
 /*
  * This file contains control plane routing tables functions.
  *
  * All functions assume they are called in net epoch.
  */
 
 /*
  * A single subscription to rib change notifications.
  */
 struct rib_subscription {
 	CK_STAILQ_ENTRY(rib_subscription)	next;	/* list linkage */
 	rib_subscription_cb_t			*func;	/* notification callback */
 	void					*arg;	/* NOTE(review): presumably the opaque argument handed to func - confirm */
 	struct rib_head				*rnh;	/* NOTE(review): presumably the rib subscribed to - confirm */
 	enum rib_subscription_type		type;	/* subscription type */
 	struct epoch_context			epoch_ctx;	/* for epoch-deferred destruction, see destroy_subscription_epoch() */
 };
 
 static int add_route(struct rib_head *rnh, struct rt_addrinfo *info,
     struct rib_cmd_info *rc);
 static int add_route_nhop(struct rib_head *rnh, struct rtentry *rt,
     struct rt_addrinfo *info, struct route_nhop_data *rnd,
     struct rib_cmd_info *rc);
 static int del_route(struct rib_head *rnh, struct rt_addrinfo *info,
     struct rib_cmd_info *rc);
 static int change_route(struct rib_head *rnh, struct rt_addrinfo *info,
     struct route_nhop_data *nhd_orig, struct rib_cmd_info *rc);
 
 static int rt_unlinkrte(struct rib_head *rnh, struct rt_addrinfo *info,
     struct rib_cmd_info *rc);
 
 static void rib_notify(struct rib_head *rnh, enum rib_subscription_type type,
     struct rib_cmd_info *rc);
 
 static void destroy_subscription_epoch(epoch_context_t ctx);
 #ifdef ROUTE_MPATH
 static bool rib_can_multipath(struct rib_head *rh);
 #endif
 
 /* Per-vnet multipath routing configuration */
 SYSCTL_DECL(_net_route);
 #define	V_rib_route_multipath	VNET(rib_route_multipath)
 /* The sysctl is writable only in kernels built with ROUTE_MPATH */
 #ifdef ROUTE_MPATH
 #define _MP_FLAGS	CTLFLAG_RW
 #else
 #define _MP_FLAGS	CTLFLAG_RD
 #endif
 /* Initial value is 1 */
 VNET_DEFINE(u_int, rib_route_multipath) = 1;
 SYSCTL_UINT(_net_route, OID_AUTO, multipath, _MP_FLAGS | CTLFLAG_VNET,
     &VNET_NAME(rib_route_multipath), 0, "Enable route multipath");
 #undef _MP_FLAGS
 
 /*
  * Routing table UMA zone.
  * Created in vnet_rtzone_init(); rtentries are returned to it in
  * destroy_rtentry().
  */
 VNET_DEFINE_STATIC(uma_zone_t, rtzone);
 #define	V_rtzone	VNET(rtzone)
 
 /*
  * Creates the "rtentry" UMA zone, sized for struct rtentry, and stores it
  * in V_rtzone.
  */
 void
 vnet_rtzone_init(void)
 {
 
 	V_rtzone = uma_zcreate("rtentry", sizeof(struct rtentry),
 		NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 }
 
 #ifdef VIMAGE
 /*
  * Destroys the rtentry UMA zone stored in V_rtzone.
  * Only built for VIMAGE kernels.
  */
 void
 vnet_rtzone_destroy(void)
 {
 
 	uma_zdestroy(V_rtzone);
 }
 #endif
 
 /*
  * Releases @rt: drops its nexthop (or nexthop group) reference and returns
  * the entry to the rtentry zone.
  */
 static void
 destroy_rtentry(struct rtentry *rt)
 {
 #ifdef VIMAGE
 	struct nhop_object *nh = rt->rt_nhop;
 
 	/*
 	 * At this moment rnh, nh_control may be already freed.
 	 * nhop interface may have been migrated to a different vnet.
 	 * Use vnet stored in the nexthop to delete the entry.
 	 */
 #ifdef ROUTE_MPATH
 	if (NH_IS_NHGRP(nh)) {
 		/* For a group, take the vnet from its first nexthop */
 		struct weightened_nhop *wn;
 		uint32_t num_nhops;
 		wn = nhgrp_get_nhops((struct nhgrp_object *)nh, &num_nhops);
 		nh = wn[0].nh;
 	}
 #endif
 	CURVNET_SET(nhop_get_vnet(nh));
 #endif
 
 	/* Unreference nexthop */
 	nhop_free_any(rt->rt_nhop);
 
 	uma_zfree(V_rtzone, rt);
 
 	/*
 	 * NOTE(review): the matching CURVNET_SET() is under VIMAGE only;
 	 * presumably this expands to nothing otherwise - confirm.
 	 */
 	CURVNET_RESTORE();
 }
 
 /*
  * Epoch callback indicating rtentry is safe to destroy
  * Recovers the rtentry that embeds @ctx and releases it.
  */
 static void
 destroy_rtentry_epoch(epoch_context_t ctx)
 {
 	struct rtentry *rt;
 
 	rt = __containerof(ctx, struct rtentry, rt_epoch_ctx);
 
 	destroy_rtentry(rt);
 }
 
 /*
  * Schedule rtentry deletion
  * @rt must not be NULL. The entry is not freed here: destruction is queued
  *  as a net_epoch_preempt callback.
  */
 static void
 rtfree(struct rtentry *rt)
 {
 
 	KASSERT(rt != NULL, ("%s: NULL rt", __func__));
 
 	epoch_call(net_epoch_preempt, destroy_rtentry_epoch,
 	    &rt->rt_epoch_ctx);
 }
 
 /*
  * Looks up the rib for fib @fibnum and the address family of the
  * destination (RTAX_DST) given in @info.
  */
 static struct rib_head *
 get_rnh(uint32_t fibnum, const struct rt_addrinfo *info)
 {
 
 	KASSERT((fibnum < rt_numfibs), ("rib_add_route: bad fibnum"));
 
 	return (rt_tables_get_rnh(fibnum,
 	    info->rti_info[RTAX_DST]->sa_family));
 }
 
 #ifdef ROUTE_MPATH
 /*
  * Returns true if multipath routing is enabled (V_rib_route_multipath is
  * non-zero) in the vnet of rib @rh.
  */
 static bool
 rib_can_multipath(struct rib_head *rh)
 {
 	int result;
 
 	/* The setting is read with the rib's vnet set as current */
 	CURVNET_SET(rh->rib_vnet);
 	result = !!V_rib_route_multipath;
 	CURVNET_RESTORE();
 
 	return (result);
 }
 
 /*
  * Check is nhop is multipath-eligible.
  * Avoid nhops without gateways and redirects.
  *
  * Returns 1 for multipath-eligible nexthop,
  * 0 otherwise.
  */
 bool
 nhop_can_multipath(const struct nhop_object *nh)
 {
 
 	if ((nh->nh_flags & NHF_MULTIPATH) != 0)
 		return (1);
 	if ((nh->nh_flags & NHF_GATEWAY) == 0)
 		return (0);
 	if ((nh->nh_flags & NHF_REDIRECT) != 0)
 		return (0);
 
 	return (1);
 }
 #endif
 
 /*
  * Picks the route weight for @info: the metric supplied with RTV_WEIGHT
  * if present, @default_weight otherwise, capped at RT_MAX_WEIGHT.
  */
 static int
 get_info_weight(const struct rt_addrinfo *info, uint32_t default_weight)
 {
 	uint32_t w = default_weight;
 
 	if (info->rti_mflags & RTV_WEIGHT)
 		w = info->rti_rmx->rmx_weight;
 
 	/* Keep upper 1 byte for adm distance purposes */
 	if (w > RT_MAX_WEIGHT)
 		w = RT_MAX_WEIGHT;
 
 	return (w);
 }
 
 /*
  * Returns true if @rt has the RTF_HOST flag set.
  */
 bool
 rt_is_host(const struct rtentry *rt)
 {
 
 	return (rt->rte_flags & RTF_HOST);
 }
 
 /*
  * Returns the address family stored in the key (destination) of @rt.
  */
 sa_family_t
 rt_get_family(const struct rtentry *rt)
 {
 
 	return (((const struct sockaddr *)rt_key_const(rt))->sa_family);
 }
 
 /*
  * Returns pointer to nexthop or nexthop group
  * associated with @rt
  * The stored pointer is returned as is; no reference is taken here.
  */
 struct nhop_object *
 rt_get_raw_nhop(const struct rtentry *rt)
 {
 
 	return (rt->rt_nhop);
 }
 
 #ifdef INET
 /*
  * Stores IPv4 address and prefix length of @rt inside
  *  @paddr and @plen.
  * The length is the number of bits set in the route mask; a route without
  *  a mask is reported as /32.
  * @pscopeid is currently always set to 0.
  */
 void
 rt_get_inet_prefix_plen(const struct rtentry *rt, struct in_addr *paddr,
     int *plen, uint32_t *pscopeid)
 {
 	const struct sockaddr_in *dst;
 
 	dst = (const struct sockaddr_in *)rt_key_const(rt);
 	KASSERT((dst->sin_family == AF_INET),
 	    ("rt family is %d, not inet", dst->sin_family));
 	*paddr = dst->sin_addr;
 	/* dst is reused for the mask from here on */
 	dst = (const struct sockaddr_in *)rt_mask_const(rt);
 	if (dst == NULL)
 		*plen = 32;
 	else
 		*plen = bitcount32(dst->sin_addr.s_addr);
 	*pscopeid = 0;
 }
 
 /*
  * Stores IPv4 address and prefix mask of @rt inside
  *  @paddr and @pmask. Sets mask to INADDR_BROADCAST (all ones) for
  *  routes without a mask.
  * @pscopeid is currently always set to 0.
  */
 void
 rt_get_inet_prefix_pmask(const struct rtentry *rt, struct in_addr *paddr,
     struct in_addr *pmask, uint32_t *pscopeid)
 {
 	const struct sockaddr_in *dst;
 
 	dst = (const struct sockaddr_in *)rt_key_const(rt);
 	KASSERT((dst->sin_family == AF_INET),
 	    ("rt family is %d, not inet", dst->sin_family));
 	*paddr = dst->sin_addr;
 	/* dst is reused for the mask from here on */
 	dst = (const struct sockaddr_in *)rt_mask_const(rt);
 	if (dst == NULL)
 		pmask->s_addr = INADDR_BROADCAST;
 	else
 		*pmask = dst->sin_addr;
 	*pscopeid = 0;
 }
 #endif
 
 #ifdef INET6
 /*
  * Returns the number of bits set in IPv6 mask @addr, summed over its
  * four 32-bit words.
  */
 static int
 inet6_get_plen(const struct in6_addr *addr)
 {
 	int plen = 0;
 
 	for (int i = 0; i < 4; i++)
 		plen += bitcount32(addr->s6_addr32[i]);
 
 	return (plen);
 }
 
 /*
  * Stores IPv6 address and prefix length of @rt inside
  *  @paddr and @plen. Addresses are returned in de-embedded form.
  * The length is the number of bits set in the route mask; a route without
  *  a mask is reported as /128.
  * Scopeid is set to 0 for non-LL addresses.
  */
 void
 rt_get_inet6_prefix_plen(const struct rtentry *rt, struct in6_addr *paddr,
     int *plen, uint32_t *pscopeid)
 {
 	const struct sockaddr_in6 *dst;
 
 	dst = (const struct sockaddr_in6 *)rt_key_const(rt);
 	KASSERT((dst->sin6_family == AF_INET6),
 	    ("rt family is %d, not inet6", dst->sin6_family));
 	if (IN6_IS_SCOPE_LINKLOCAL(&dst->sin6_addr))
 		in6_splitscope(&dst->sin6_addr, paddr, pscopeid);
 	else {
 		*paddr = dst->sin6_addr;
 		/* Honour the documented contract instead of leaving it unset */
 		*pscopeid = 0;
 	}
 	/* dst is reused for the mask from here on */
 	dst = (const struct sockaddr_in6 *)rt_mask_const(rt);
 	if (dst == NULL)
 		*plen = 128;
 	else
 		*plen = inet6_get_plen(&dst->sin6_addr);
 }
 
 /*
  * Stores IPv6 address and prefix mask of @rt inside
  *  @paddr and @pmask. Addresses are returned in de-embedded form.
  * The mask is set to all ones for routes without a mask.
  * Scopeid is set to 0 for non-LL addresses.
  */
 void
 rt_get_inet6_prefix_pmask(const struct rtentry *rt, struct in6_addr *paddr,
     struct in6_addr *pmask, uint32_t *pscopeid)
 {
 	const struct sockaddr_in6 *dst;
 
 	dst = (const struct sockaddr_in6 *)rt_key_const(rt);
 	KASSERT((dst->sin6_family == AF_INET6),
 	    ("rt family is %d, not inet6", dst->sin6_family));
 	if (IN6_IS_SCOPE_LINKLOCAL(&dst->sin6_addr))
 		in6_splitscope(&dst->sin6_addr, paddr, pscopeid);
 	else {
 		*paddr = dst->sin6_addr;
 		/* Honour the documented contract instead of leaving it unset */
 		*pscopeid = 0;
 	}
 	/* dst is reused for the mask from here on */
 	dst = (const struct sockaddr_in6 *)rt_mask_const(rt);
 	if (dst == NULL)
 		memset(pmask, 0xFF, sizeof(struct in6_addr));
 	else
 		*pmask = dst->sin6_addr;
 }
 #endif
 
 /*
  * Updates @rt expiration time from @info when RTV_EXPIRE is set in
  *  @info->rti_mflags; does nothing otherwise.
  * A non-zero rmx_expire is rebased from time_second to time_uptime,
  *  zero is stored as is.
  */
 static void
 rt_set_expire_info(struct rtentry *rt, const struct rt_addrinfo *info)
 {
 
 	if ((info->rti_mflags & RTV_EXPIRE) == 0)
 		return;
 
 	if (info->rti_rmx->rmx_expire)
 		rt->rt_expire =
 		    info->rti_rmx->rmx_expire - time_second + time_uptime;
 	else
 		rt->rt_expire = 0;
 }
 
 /*
  * Compares gateway @gw with the gateway stored in the nexthop @nh.
  *
  * Address families must be equal. AF_INET and AF_INET6 gateways are
  *  compared by address, AF_LINK ones by interface index, anything
  *  else byte-by-byte over the nexthop gateway length.
  *
  * Returns true if matches, false otherwise.
  */
 bool
 match_nhop_gw(const struct nhop_object *nh, const struct sockaddr *gw)
 {
 	const struct sockaddr_in *gw4;
 	const struct sockaddr_in6 *gw6;
 	const struct sockaddr_dl *gwl;
 	bool matched;
 
 	if (nh->gw_sa.sa_family != gw->sa_family)
 		return (false);
 
 	switch (gw->sa_family) {
 	case AF_INET:
 		gw4 = (const struct sockaddr_in *)gw;
 		matched = (nh->gw4_sa.sin_addr.s_addr == gw4->sin_addr.s_addr);
 		break;
 	case AF_INET6:
 		/*
 		 * Currently (2020-09) IPv6 gws in kernel have their
 		 * scope embedded. Once this becomes false, this code
 		 * has to be revisited.
 		 */
 		gw6 = (const struct sockaddr_in6 *)gw;
 		if (IN6_ARE_ADDR_EQUAL(&nh->gw6_sa.sin6_addr, &gw6->sin6_addr))
 			matched = true;
 		else
 			matched = false;
 		break;
 	case AF_LINK:
 		gwl = (const struct sockaddr_dl *)gw;
 		matched = (nh->gwl_sa.sdl_index == gwl->sdl_index);
 		break;
 	default:
 		matched = (memcmp(&nh->gw_sa, gw, nh->gw_sa.sa_len) == 0);
 		break;
 	}
 
 	return (matched);
 }
 
 /*
  * Checks if data in @info matches nexthop @nh.
  * When @info carries a filter function, only the filter decides;
  *  otherwise the gateway from @info (if any) is compared with @nh.
  *
  * Returns 0 on success,
  * ESRCH if the gateway does not match,
  * ENOENT if the filter function returned 0
  */
 int
 check_info_match_nhop(const struct rt_addrinfo *info, const struct rtentry *rt,
     const struct nhop_object *nh)
 {
 	const struct sockaddr *gw;
 
 	if (info->rti_filter != NULL) {
 		if (info->rti_filter(rt, nh, info->rti_filterdata) != 0)
 			return (0);
 		return (ENOENT);
 	}
 
 	gw = info->rti_info[RTAX_GATEWAY];
 	if (gw == NULL || match_nhop_gw(nh, gw))
 		return (0);
 
 	return (ESRCH);
 }
 
 /*
  * Checks if nexhop @nh can be rewritten by data in @info because
  *  of higher "priority". Currently the only case for such scenario
  *  is kernel installing interface routes, marked by RTF_PINNED flag.
  *
  * Returns:
  * 1 if @info data has higher priority
  * 0 if priority is the same
  * -1 if priority is lower
  */
 int
 can_override_nhop(const struct rt_addrinfo *info, const struct nhop_object *nh)
 {
 
 	if (info->rti_flags & RTF_PINNED) {
 		return (NH_IS_PINNED(nh)) ? 0 : 1;
 	} else {
 		return (NH_IS_PINNED(nh)) ? -1 : 0;
 	}
 }
 
 /*
  * Looks up the prefix @dst/@netmask in @rnh via rnh_lookup().
  * Asserts that the RIB lock is held.
  * Returns matched @rtentry if found or NULL.
  * @rnd receives the nexthop / weight of the found rtentry, or
  *  NULL / 0 when nothing was found.
  */
 static struct rtentry *
 lookup_prefix_bysa(struct rib_head *rnh, const struct sockaddr *dst,
     const struct sockaddr *netmask, struct route_nhop_data *rnd)
 {
 	struct rtentry *found;
 
 	RIB_LOCK_ASSERT(rnh);
 
 	found = (struct rtentry *)rnh->rnh_lookup(__DECONST(void *, dst),
 	    __DECONST(void *, netmask), &rnh->head);
 
 	if (found == NULL) {
 		rnd->rnd_nhop = NULL;
 		rnd->rnd_weight = 0;
 		return (NULL);
 	}
 
 	rnd->rnd_nhop = found->rt_nhop;
 	rnd->rnd_weight = found->rt_weight;
 	return (found);
 }
 
 /*
  * Looks up the prefix described by RTAX_DST / RTAX_NETMASK of @info.
  * Assumes RIB lock is held.
  * Returns matched @rtentry if found or NULL.
  * @rnd is filled in by lookup_prefix_bysa().
  */
 struct rtentry *
 lookup_prefix(struct rib_head *rnh, const struct rt_addrinfo *info,
     struct route_nhop_data *rnd)
 {
 
 	return (lookup_prefix_bysa(rnh, info->rti_info[RTAX_DST],
 	    info->rti_info[RTAX_NETMASK], rnd));
 }
 
 /*
  * Adds route defined by @info into the kernel table specified by @fibnum and
  * sa_family in @info->rti_info[RTAX_DST].
  * RTF_HOST requests get their netmask dropped; requests without RTF_HOST
  * and without a netmask fail with EINVAL.
  *
  * Returns 0 on success and fills in operation metadata into @rc,
  * EAFNOSUPPORT if no table was found for the request.
  */
 int
 rib_add_route(uint32_t fibnum, struct rt_addrinfo *info,
     struct rib_cmd_info *rc)
 {
 	struct rib_head *rnh;
 	int error;
 
 	NET_EPOCH_ASSERT();
 
 	if ((rnh = get_rnh(fibnum, info)) == NULL)
 		return (EAFNOSUPPORT);
 
 	/* Keep RTF_HOST and the netmask consistent with each other. */
 	if ((info->rti_flags & RTF_HOST) != 0) {
 		info->rti_info[RTAX_NETMASK] = NULL;
 	} else if (info->rti_info[RTAX_NETMASK] == NULL) {
 		return (EINVAL);
 	}
 
 	bzero(rc, sizeof(*rc));
 	rc->rc_cmd = RTM_ADD;
 
 	error = add_route(rnh, info, rc);
 	if (error != 0)
 		return (error);
 
 	rib_notify(rnh, RIB_NOTIFY_DELAYED, rc);
 	return (0);
 }
 
 /*
  * Creates rtentry and nexthop based on @info data.
  * Return 0 and fills in rtentry into @prt on success,
  * return errno otherwise.
  */
 static int
 create_rtentry(struct rib_head *rnh, struct rt_addrinfo *info,
     struct rtentry **prt)
 {
 	struct sockaddr *dst, *ndst, *gateway, *netmask;
 	struct rtentry *rt;
 	struct nhop_object *nh;
 	struct ifaddr *ifa;
 	int error, flags;
 
 	dst = info->rti_info[RTAX_DST];
 	gateway = info->rti_info[RTAX_GATEWAY];
 	netmask = info->rti_info[RTAX_NETMASK];
 	flags = info->rti_flags;
 
 	if ((flags & RTF_GATEWAY) && !gateway)
 		return (EINVAL);
 	if (dst && gateway && (dst->sa_family != gateway->sa_family) && 
 	    (gateway->sa_family != AF_UNSPEC) && (gateway->sa_family != AF_LINK))
 		return (EINVAL);
 
 	if (dst->sa_len > sizeof(((struct rtentry *)NULL)->rt_dstb))
 		return (EINVAL);
 
 	if (info->rti_ifa == NULL) {
 		error = rt_getifa_fib(info, rnh->rib_fibnum);
 		if (error)
 			return (error);
 	} else {
 		ifa_ref(info->rti_ifa);
 	}
 
 	error = nhop_create_from_info(rnh, info, &nh);
 	ifa_free(info->rti_ifa);
 	if (error != 0)
 		return (error);
 
 	rt = uma_zalloc(V_rtzone, M_NOWAIT | M_ZERO);
 	if (rt == NULL) {
 		nhop_free(nh);
 		return (ENOBUFS);
 	}
 	rt->rte_flags = (RTF_UP | flags) & RTE_RT_FLAG_MASK;
 	rt->rt_nhop = nh;
 
 	/* Fill in dst */
 	memcpy(&rt->rt_dst, dst, dst->sa_len);
 	rt_key(rt) = &rt->rt_dst;
 
 	/*
 	 * point to the (possibly newly malloc'd) dest address.
 	 */
 	ndst = (struct sockaddr *)rt_key(rt);
 
 	/*
 	 * make sure it contains the value we want (masked if needed).
 	 */
 	if (netmask) {
 		rt_maskedcopy(dst, ndst, netmask);
 	} else
 		bcopy(dst, ndst, dst->sa_len);
 
 	/*
 	 * We use the ifa reference returned by rt_getifa_fib().
 	 * This moved from below so that rnh->rnh_addaddr() can
 	 * examine the ifa and  ifa->ifa_ifp if it so desires.
 	 */
 	ifa = info->rti_ifa;
 	rt->rt_weight = get_info_weight(info, RT_DEFAULT_WEIGHT);
 	rt_set_expire_info(rt, info);
 
 	*prt = rt;
 	return (0);
 }
 
 /*
  * Creates an rtentry from @info and tries to link it into @rnh.
  * If the prefix is already present, one of the following happens:
  *  - @info has higher priority (see can_override_nhop()): the existing
  *    rtentry is switched to the new nexthop and the new rtentry is freed;
  *  - ROUTE_MPATH is compiled in and both nexthops can multipath:
  *    the request is handed over to add_route_mpath();
  *  - otherwise EEXIST is returned.
  *
  * Returns 0 on success with operation result stored in @rc,
  * errno otherwise.
  */
 static int
 add_route(struct rib_head *rnh, struct rt_addrinfo *info,
     struct rib_cmd_info *rc)
 {
 	struct nhop_object *nh_orig;
 	struct route_nhop_data rnd_orig, rnd_add;
 	struct nhop_object *nh;
 	struct rtentry *rt, *rt_orig;
 	int error;
 
 	error = create_rtentry(rnh, info, &rt);
 	if (error != 0)
 		return (error);
 
 	/* Remember the new nexthop: @rt may be freed on the paths below. */
 	rnd_add.rnd_nhop = rt->rt_nhop;
 	rnd_add.rnd_weight = rt->rt_weight;
 	nh = rt->rt_nhop;
 
 	RIB_WLOCK(rnh);
 	error = add_route_nhop(rnh, rt, info, &rnd_add, rc);
 	if (error == 0) {
 		/* Plain insertion succeeded, @rt is now owned by the rib. */
 		RIB_WUNLOCK(rnh);
 		return (0);
 	}
 
 	/* addition failed. Lookup prefix in the rib to determine the cause */
 	rt_orig = lookup_prefix(rnh, info, &rnd_orig);
 	if (rt_orig == NULL) {
 		/* No prefix -> rnh_addaddr() failed to allocate memory */
 		RIB_WUNLOCK(rnh);
 		nhop_free(nh);
 		uma_zfree(V_rtzone, rt);
 		return (ENOMEM);
 	}
 
 	/* We have existing route in the RIB. */
 	nh_orig = rnd_orig.rnd_nhop;
 	/* Check if new route has higher preference */
 	if (can_override_nhop(info, nh_orig) > 0) {
 		/* Update nexthop to the new route */
 		change_route_nhop(rnh, rt_orig, info, &rnd_add, rc);
 		RIB_WUNLOCK(rnh);
 		/*
 		 * The existing rtentry stays in the rib with the new
 		 * nexthop: drop the unused new rtentry and the nexthop
 		 * that was just replaced.
 		 */
 		uma_zfree(V_rtzone, rt);
 		nhop_free(nh_orig);
 		return (0);
 	}
 
 	RIB_WUNLOCK(rnh);
 
 	/*
 	 * NOTE(review): the lock is dropped here while rnd_orig still
 	 * refers to the nexthop of the existing route.
 	 */
 #ifdef ROUTE_MPATH
 	if (rib_can_multipath(rnh) && nhop_can_multipath(rnd_add.rnd_nhop) &&
 	    nhop_can_multipath(rnd_orig.rnd_nhop))
 		error = add_route_mpath(rnh, info, rt, &rnd_add, &rnd_orig, rc);
 	else
 #endif
 	/* Unable to add - another route with the same preference exists */
 	error = EEXIST;
 
 	/*
 	 * ROUTE_MPATH disabled: failed to add route, free both nhop and rt.
 	 * ROUTE_MPATH enabled: original nhop reference is unused in any case,
 	 *  free rt only if not _adding_ new route to rib (e.g. the case
 	 *  when initial lookup returned existing route, but then it got
 	 *  deleted prior to multipath group insertion, leading to a simple
 	 *  non-multipath add as a result).
 	 */
 	nhop_free(nh);
 	if ((error != 0) || rc->rc_cmd != RTM_ADD)
 		uma_zfree(V_rtzone, rt);
 
 	return (error);
 }
 
 /*
  * Removes route defined by @info from the kernel table specified by @fibnum and
  * sa_family in @info->rti_info[RTAX_DST].
  * When a netmask is supplied, the lookup is done on a masked copy of the
  * destination; the caller's RTAX_DST pointer is restored before returning
  * from the deletion itself.
  *
  * Returns 0 on success and fills in operation metadata into @rc.
  */
 int
 rib_del_route(uint32_t fibnum, struct rt_addrinfo *info, struct rib_cmd_info *rc)
 {
 	struct sockaddr_storage masked_dst;
 	struct sockaddr *saved_dst, *mask;
 	struct rib_head *rnh;
 	int error;
 
 	NET_EPOCH_ASSERT();
 
 	if ((rnh = get_rnh(fibnum, info)) == NULL)
 		return (EAFNOSUPPORT);
 
 	bzero(rc, sizeof(*rc));
 	rc->rc_cmd = RTM_DELETE;
 
 	saved_dst = info->rti_info[RTAX_DST];
 	mask = info->rti_info[RTAX_NETMASK];
 
 	if (mask != NULL) {
 		/* Ensure @dst is always properly masked */
 		if (saved_dst->sa_len > sizeof(masked_dst))
 			return (EINVAL);
 		rt_maskedcopy(saved_dst, (struct sockaddr *)&masked_dst, mask);
 		info->rti_info[RTAX_DST] = (struct sockaddr *)&masked_dst;
 	}
 
 	error = del_route(rnh, info, rc);
 
 	/* Do not leak the on-stack copy to the caller. */
 	info->rti_info[RTAX_DST] = saved_dst;
 
 	return (error);
 }
 
 /*
  * Conditionally unlinks rtentry matching data inside @info from @rnh.
  * The prefix is taken from RTAX_DST / RTAX_NETMASK of @info; the nexthop
  * has to pass check_info_match_nhop() and can_override_nhop().
  * With ROUTE_MPATH, routes pointing to a nexthop group are handed over
  * to del_route_mpath().
  *
  * Returns 0 on success with operation result stored in @rc.
  * On error, returns:
  * ESRCH - if prefix was not found,
  * EADDRINUSE - if trying to delete higher priority route.
  * ENOENT - if supplied filter function returned 0 (not matched).
  */
 static int
 rt_unlinkrte(struct rib_head *rnh, struct rt_addrinfo *info, struct rib_cmd_info *rc)
 {
 	struct rtentry *rt;
 	struct nhop_object *nh;
 	struct radix_node *rn;
 	struct route_nhop_data rnd;
 	int error;
 
 	/* @rnd is filled in by the lookup but not used any further here. */
 	rt = lookup_prefix(rnh, info, &rnd);
 	if (rt == NULL)
 		return (ESRCH);
 
 	nh = rt->rt_nhop;
 #ifdef ROUTE_MPATH
 	if (NH_IS_NHGRP(nh)) {
 		error = del_route_mpath(rnh, info, rt,
 		    (struct nhgrp_object *)nh, rc);
 		return (error);
 	}
 #endif
 	error = check_info_match_nhop(info, rt, nh);
 	if (error != 0)
 		return (error);
 
 	/* The request has lower priority than the installed route. */
 	if (can_override_nhop(info, nh) < 0)
 		return (EADDRINUSE);
 
 	/*
 	 * Remove the item from the tree and return it.
 	 * Complain if it is not there and do no more processing.
 	 */
 	rn = rnh->rnh_deladdr(info->rti_info[RTAX_DST],
 	    info->rti_info[RTAX_NETMASK], &rnh->head);
 	if (rn == NULL)
 		return (ESRCH);
 
 	/* A removed node must be neither active nor a tree root. */
 	if (rn->rn_flags & (RNF_ACTIVE | RNF_ROOT))
 		panic ("rtrequest delete");
 
 	rt = RNTORT(rn);
 	rt->rte_flags &= ~RTF_UP;
 
 	/* Finalize notification */
-	rnh->rnh_gen++;
+	rib_bump_gen(rnh);
 	rnh->rnh_prefixes--;
 
 	rc->rc_cmd = RTM_DELETE;
 	rc->rc_rt = rt;
 	rc->rc_nh_old = rt->rt_nhop;
 	rc->rc_nh_weight = rt->rt_weight;
 	rib_notify(rnh, RIB_NOTIFY_IMMEDIATE, rc);
 
 	return (0);
 }
 
 /*
  * Unlinks the route described by @info from @rnh under the RIB write
  *  lock, then runs delayed notifications and releases what the
  *  operation left behind: the rtentry for RTM_DELETE, or (ROUTE_MPATH
  *  only) the old nexthop / nexthop group for any other result.
  *
  * Returns 0 on success, error from rt_unlinkrte() otherwise.
  */
 static int
 del_route(struct rib_head *rnh, struct rt_addrinfo *info,
     struct rib_cmd_info *rc)
 {
 	int error;
 
 	RIB_WLOCK(rnh);
 	error = rt_unlinkrte(rnh, info, rc);
 	RIB_WUNLOCK(rnh);
 
 	if (error == 0) {
 		rib_notify(rnh, RIB_NOTIFY_DELAYED, rc);
 
 		/*
 		 * If the caller wants it, then it can have it,
 		 * the entry will be deleted after the end of the
 		 * current epoch.
 		 */
 		if (rc->rc_cmd == RTM_DELETE)
 			rtfree(rc->rc_rt);
 #ifdef ROUTE_MPATH
 		else {
 			/*
 			 * Deleting 1 path may result in RTM_CHANGE to
 			 * a different mpath group/nhop.
 			 * Free old mpath group.
 			 */
 			nhop_free_any(rc->rc_nh_old);
 		}
 #endif
 	}
 
 	return (error);
 }
 
 /*
  * Changes the route described by @info in the kernel table specified by
  * @fibnum and sa_family in @info->rti_info[RTAX_DST].
  *
  * Returns 0 on success and fills in operation metadata into @rc,
  * EAFNOSUPPORT if no table was found, ESRCH if the prefix is absent,
  * or the last error reported by change_route().
  */
 int
 rib_change_route(uint32_t fibnum, struct rt_addrinfo *info,
     struct rib_cmd_info *rc)
 {
 	RIB_RLOCK_TRACKER;
 	struct route_nhop_data rnd_cur;
 	struct rib_head *rnh;
 	struct rtentry *rt;
 	int attempt, error;
 
 	NET_EPOCH_ASSERT();
 
 	if ((rnh = get_rnh(fibnum, info)) == NULL)
 		return (EAFNOSUPPORT);
 
 	bzero(rc, sizeof(*rc));
 	rc->rc_cmd = RTM_CHANGE;
 
 	/*
 	 * route(8) adds RTF_GATEWAY flag if -interface is not set.
 	 * When no gateway was actually supplied, remove RTF_GATEWAY
 	 * to enforce consistency and maintain compatibility.
 	 */
 	if ((info->rti_flags & RTF_GATEWAY) &&
 	    (info->rti_info[RTAX_GATEWAY] == NULL))
 		info->rti_flags &= ~RTF_GATEWAY;
 
 	/* Snapshot the current nexthop / weight under the read lock. */
 	RIB_RLOCK(rnh);
 	rt = (struct rtentry *)rnh->rnh_lookup(info->rti_info[RTAX_DST],
 	    info->rti_info[RTAX_NETMASK], &rnh->head);
 	if (rt != NULL) {
 		rnd_cur.rnd_nhop = rt->rt_nhop;
 		rnd_cur.rnd_weight = rt->rt_weight;
 	}
 	RIB_RUNLOCK(rnh);
 
 	if (rt == NULL)
 		return (ESRCH);
 
 	/*
 	 * The change itself is done in multiple steps, dropping and
 	 * reacquiring the lock in between. Another process may change
 	 * the same route between the steps, which is reported as EAGAIN.
 	 * Retry a limited number of times before giving up.
 	 */
 	for (attempt = 0; attempt < RIB_MAX_RETRIES; attempt++) {
 		error = change_route(rnh, info, &rnd_cur, rc);
 		if (error != EAGAIN)
 			break;
 	}
 
 	return (error);
 }
 
 static int
 change_nhop(struct rib_head *rnh, struct rt_addrinfo *info,
     struct nhop_object *nh_orig, struct nhop_object **nh_new)
 {
 	int free_ifa = 0;
 	int error;
 
 	/*
 	 * New gateway could require new ifaddr, ifp;
 	 * flags may also be different; ifp may be specified
 	 * by ll sockaddr when protocol address is ambiguous
 	 */
 	if (((nh_orig->nh_flags & NHF_GATEWAY) &&
 	    info->rti_info[RTAX_GATEWAY] != NULL) ||
 	    info->rti_info[RTAX_IFP] != NULL ||
 	    (info->rti_info[RTAX_IFA] != NULL &&
 	     !sa_equal(info->rti_info[RTAX_IFA], nh_orig->nh_ifa->ifa_addr))) {
 		error = rt_getifa_fib(info, rnh->rib_fibnum);
 		if (info->rti_ifa != NULL)
 			free_ifa = 1;
 
 		if (error != 0) {
 			if (free_ifa) {
 				ifa_free(info->rti_ifa);
 				info->rti_ifa = NULL;
 			}
 
 			return (error);
 		}
 	}
 
 	error = nhop_create_from_nhop(rnh, nh_orig, info, nh_new);
 	if (free_ifa) {
 		ifa_free(info->rti_ifa);
 		info->rti_ifa = NULL;
 	}
 
 	return (error);
 }
 
 #ifdef ROUTE_MPATH
 static int
 change_mpath_route(struct rib_head *rnh, struct rt_addrinfo *info,
     struct route_nhop_data *rnd_orig, struct rib_cmd_info *rc)
 {
 	int error = 0;
 	struct nhop_object *nh, *nh_orig, *nh_new;
 	struct route_nhop_data rnd_new;
 
 	nh = NULL;
 	nh_orig = rnd_orig->rnd_nhop;
 
 	struct weightened_nhop *wn = NULL, *wn_new;
 	uint32_t num_nhops;
 
 	wn = nhgrp_get_nhops((struct nhgrp_object *)nh_orig, &num_nhops);
 	nh_orig = NULL;
 	for (int i = 0; i < num_nhops; i++) {
 		if (check_info_match_nhop(info, NULL, wn[i].nh)) {
 			nh_orig = wn[i].nh;
 			break;
 		}
 	}
 
 	if (nh_orig == NULL)
 		return (ESRCH);
 
 	error = change_nhop(rnh, info, nh_orig, &nh_new);
 	if (error != 0)
 		return (error);
 
 	wn_new = mallocarray(num_nhops, sizeof(struct weightened_nhop),
 	    M_TEMP, M_NOWAIT | M_ZERO);
 	if (wn_new == NULL) {
 		nhop_free(nh_new);
 		return (EAGAIN);
 	}
 
 	memcpy(wn_new, wn, num_nhops * sizeof(struct weightened_nhop));
 	for (int i = 0; i < num_nhops; i++) {
 		if (wn[i].nh == nh_orig) {
 			wn[i].nh = nh_new;
 			wn[i].weight = get_info_weight(info, rnd_orig->rnd_weight);
 			break;
 		}
 	}
 
 	error = nhgrp_get_group(rnh, wn_new, num_nhops, &rnd_new);
 	nhop_free(nh_new);
 	free(wn_new, M_TEMP);
 
 	if (error != 0)
 		return (error);
 
 	error = change_route_conditional(rnh, NULL, info, rnd_orig, &rnd_new, rc);
 
 	return (error);
 }
 #endif
 
 static int
 change_route(struct rib_head *rnh, struct rt_addrinfo *info,
     struct route_nhop_data *rnd_orig, struct rib_cmd_info *rc)
 {
 	int error = 0;
 	struct nhop_object *nh, *nh_orig;
 	struct route_nhop_data rnd_new;
 
 	nh = NULL;
 	nh_orig = rnd_orig->rnd_nhop;
 	if (nh_orig == NULL)
 		return (ESRCH);
 
 #ifdef ROUTE_MPATH
 	if (NH_IS_NHGRP(nh_orig))
 		return (change_mpath_route(rnh, info, rnd_orig, rc));
 #endif
 
 	rnd_new.rnd_weight = get_info_weight(info, rnd_orig->rnd_weight);
 	error = change_nhop(rnh, info, nh_orig, &rnd_new.rnd_nhop);
 	if (error != 0)
 		return (error);
 	error = change_route_conditional(rnh, NULL, info, rnd_orig, &rnd_new, rc);
 
 	return (error);
 }
 
 /*
  * Insert @rt with nhop data from @rnd to @rnh.
  * The netmask is taken from @info, the key from @rt itself.
  * Requires the RIB write lock to be held.
  *
  * Returns 0 on success and stores operation results in @rc,
  * EEXIST if rnh_addaddr() did not insert the node.
  */
 static int
 add_route_nhop(struct rib_head *rnh, struct rtentry *rt,
     struct rt_addrinfo *info, struct route_nhop_data *rnd,
     struct rib_cmd_info *rc)
 {
 	struct sockaddr *ndst, *netmask;
 	struct radix_node *rn;
 	int error = 0;
 
 	RIB_WLOCK_ASSERT(rnh);
 
 	ndst = (struct sockaddr *)rt_key(rt);
 	netmask = info->rti_info[RTAX_NETMASK];
 
 	/* Nexthop and weight are set before the entry is linked in. */
 	rt->rt_nhop = rnd->rnd_nhop;
 	rt->rt_weight = rnd->rnd_weight;
 	rn = rnh->rnh_addaddr(ndst, netmask, &rnh->head, rt->rt_nodes);
 
 	if (rn != NULL) {
 		if (rt->rt_expire > 0)
 			tmproutes_update(rnh, rt);
 
 		/* Finalize notification */
-		rnh->rnh_gen++;
+		rib_bump_gen(rnh);
 		rnh->rnh_prefixes++;
 
 		rc->rc_cmd = RTM_ADD;
 		rc->rc_rt = rt;
 		rc->rc_nh_old = NULL;
 		rc->rc_nh_new = rnd->rnd_nhop;
 		rc->rc_nh_weight = rnd->rnd_weight;
 
 		rib_notify(rnh, RIB_NOTIFY_IMMEDIATE, rc);
 	} else {
 		/* Existing route or memory allocation failure */
 		error = EEXIST;
 	}
 
 	return (error);
 }
 
 /*
  * Switch @rt nhop/weight to the ones specified in @rnd.
  *  Conditionally set rt_expire if set in @info.
  * A NULL nexthop in @rnd requests deletion of @rt from @rnh instead.
  * Requires the RIB write lock to be held.
  *
  * Returns 0 on success and stores operation results in @rc,
  * ESRCH if the deletion was requested but rnh_deladdr() found nothing.
  */
 int
 change_route_nhop(struct rib_head *rnh, struct rtentry *rt,
     struct rt_addrinfo *info, struct route_nhop_data *rnd,
     struct rib_cmd_info *rc)
 {
 	struct nhop_object *nh_orig;
 
 	RIB_WLOCK_ASSERT(rnh);
 
 	/* Saved for the notification: rt_nhop is overwritten below. */
 	nh_orig = rt->rt_nhop;
 
 	if (rnd->rnd_nhop != NULL) {
 		/* Changing expiration & nexthop & weight to a new one */
 		rt_set_expire_info(rt, info);
 		rt->rt_nhop = rnd->rnd_nhop;
 		rt->rt_weight = rnd->rnd_weight;
 		if (rt->rt_expire > 0)
 			tmproutes_update(rnh, rt);
 	} else {
 		/* Route deletion requested. */
 		struct sockaddr *ndst, *netmask;
 		struct radix_node *rn;
 
 		ndst = (struct sockaddr *)rt_key(rt);
 		netmask = info->rti_info[RTAX_NETMASK];
 		rn = rnh->rnh_deladdr(ndst, netmask, &rnh->head);
 		if (rn == NULL)
 			return (ESRCH);
 		rt = RNTORT(rn);
 		rt->rte_flags &= ~RTF_UP;
 	}
 
 	/* Finalize notification */
-	rnh->rnh_gen++;
+	rib_bump_gen(rnh);
 	if (rnd->rnd_nhop == NULL)
 		rnh->rnh_prefixes--;
 
 	rc->rc_cmd = (rnd->rnd_nhop != NULL) ? RTM_CHANGE : RTM_DELETE;
 	rc->rc_rt = rt;
 	rc->rc_nh_old = nh_orig;
 	rc->rc_nh_new = rnd->rnd_nhop;
 	rc->rc_nh_weight = rnd->rnd_weight;
 
 	rib_notify(rnh, RIB_NOTIFY_IMMEDIATE, rc);
 
 	return (0);
 }
 
 /*
  * Conditionally update route nhop/weight IFF data in @rnd_orig is
  *  consistent with the current route data.
  * The prefix is looked up by RTAX_DST / RTAX_NETMASK of @info under the
  *  RIB write lock, which is taken and released here.
  * @rt is only used when the prefix is absent and @rnd_orig has no
  *  nexthop: it is then inserted via add_route_nhop().
  *
  * Nexthop in @rnd_new is consumed: it is freed here on failure.
  * On success the nexthop in @rnd_orig (if any) is freed instead.
  *
  * Returns 0 on success and stores operation results in @rc.
  * Returns EAGAIN if the route does not match @rnd_orig; @rnd_orig is
  *  then updated with the current nexthop / weight (NULL / 0 if the
  *  prefix is gone) so the caller can retry.
  */
 int
 change_route_conditional(struct rib_head *rnh, struct rtentry *rt,
     struct rt_addrinfo *info, struct route_nhop_data *rnd_orig,
     struct route_nhop_data *rnd_new, struct rib_cmd_info *rc)
 {
 	struct rtentry *rt_new;
 	int error = 0;
 
 	RIB_WLOCK(rnh);
 
 	rt_new = (struct rtentry *)rnh->rnh_lookup(info->rti_info[RTAX_DST],
 	    info->rti_info[RTAX_NETMASK], &rnh->head);
 
 	if (rt_new == NULL) {
 		/* No prefix, as expected by the caller: plain insertion. */
 		if (rnd_orig->rnd_nhop == NULL)
 			error = add_route_nhop(rnh, rt, info, rnd_new, rc);
 		else {
 			/*
 			 * Prefix does not exist, which was not our assumption.
 			 * Update @rnd_orig with the new data and return
 			 */
 			rnd_orig->rnd_nhop = NULL;
 			rnd_orig->rnd_weight = 0;
 			error = EAGAIN;
 		}
 	} else {
 		/* Prefix exists, try to update */
 		if (rnd_orig->rnd_nhop == rt_new->rt_nhop) {
 			/*
 			 * Nhop/mpath group hasn't changed. Flip
 			 * to the new precalculated one and return
 			 */
 			error = change_route_nhop(rnh, rt_new, info, rnd_new, rc);
 		} else {
 			/* Update and retry */
 			rnd_orig->rnd_nhop = rt_new->rt_nhop;
 			rnd_orig->rnd_weight = rt_new->rt_weight;
 			error = EAGAIN;
 		}
 	}
 
 	RIB_WUNLOCK(rnh);
 
 	if (error == 0) {
 		rib_notify(rnh, RIB_NOTIFY_DELAYED, rc);
 
 		/* The route no longer points to the old nexthop. */
 		if (rnd_orig->rnd_nhop != NULL)
 			nhop_free_any(rnd_orig->rnd_nhop);
 
 	} else {
 		/* The new nexthop was not installed. */
 		if (rnd_new->rnd_nhop != NULL)
 			nhop_free_any(rnd_new->rnd_nhop);
 	}
 
 	return (error);
 }
 
 /*
  * Performs modification of routing table specified by @action.
  * Table is specified by @fibnum and sa_family in @info->rti_info[RTAX_DST].
  * Needs to be run in network epoch.
  *
  * Supported actions are RTM_ADD, RTM_DELETE and RTM_CHANGE.
  *
  * Returns 0 on success and fills in @rc with action result,
  * ENOTSUP for any other @action.
  */
 int
 rib_action(uint32_t fibnum, int action, struct rt_addrinfo *info,
     struct rib_cmd_info *rc)
 {
 
 	switch (action) {
 	case RTM_ADD:
 		return (rib_add_route(fibnum, info, rc));
 	case RTM_DELETE:
 		return (rib_del_route(fibnum, info, rc));
 	case RTM_CHANGE:
 		return (rib_change_route(fibnum, info, rc));
 	default:
 		return (ENOTSUP);
 	}
 }
 
 /*
  * State shared between rib_walk_del() and its per-node callback
  *  rt_checkdelroute().
  */
 struct rt_delinfo
 {
 	/* Request data; DST / NETMASK are rewritten for every visited node */
 	struct rt_addrinfo info;
 	/* Table being walked */
 	struct rib_head *rnh;
 	/* Unlinked rtentries, chained via rt_chain, to free after the walk */
 	struct rtentry *head;
 	/* Result of the most recent rt_unlinkrte() call */
 	struct rib_cmd_info rc;
 };
 
 /*
  * Conditionally unlinks @rn from radix tree based
  * on info data passed in @arg.
  * @arg is a struct rt_delinfo; unlinked rtentries are queued on its
  * @head list.
  *
  * Always returns 0, whether or not the node was unlinked.
  */
 static int
 rt_checkdelroute(struct radix_node *rn, void *arg)
 {
 	struct rt_delinfo *di;
 	struct rt_addrinfo *info;
 	struct rtentry *rt;
 
 	di = (struct rt_delinfo *)arg;
 	rt = (struct rtentry *)rn;
 	info = &di->info;
 
 	/* Point the request at the prefix of the node being visited. */
 	info->rti_info[RTAX_DST] = rt_key(rt);
 	info->rti_info[RTAX_NETMASK] = rt_mask(rt);
 
 	/* Not matched or not removable: leave the node alone. */
 	if (rt_unlinkrte(di->rnh, info, &di->rc) != 0)
 		return (0);
 
 	/*
 	 * Add deleted rtentries to the list to GC them
 	 *  after dropping the lock.
 	 *
 	 * XXX: Delayed notifications not implemented
 	 *  for nexthop updates.
 	 */
 	if (di->rc.rc_cmd == RTM_DELETE) {
 		/* Add to the list and return */
 		rt->rt_chain = di->head;
 		di->head = rt;
 #ifdef ROUTE_MPATH
 	} else {
 		/*
 		 * RTM_CHANGE to a different nexthop or nexthop group.
 		 * Free old multipath group.
 		 */
 		nhop_free_any(di->rc.rc_nh_old);
 #endif
 	}
 
 	return (0);
 }
 
 /*
  * Iterates over a routing table specified by @fibnum and @family and
  *  deletes elements marked by @filter_f.
  * @fibnum: rtable id
  * @family: AF_ address family
  * @filter_f: function returning non-zero value for items to delete
  * @arg: data to pass to the @filter_f function
  * @report: true if rtsock notification is needed.
  *
  * Does nothing if the table does not exist.
  * Enters the network epoch on its own.
  */
 void
 rib_walk_del(u_int fibnum, int family, rib_filter_f_t *filter_f, void *arg, bool report)
 {
 	struct rib_head *rnh;
 	struct rt_delinfo di;
 	struct rtentry *rt;
 	struct nhop_object *nh;
 	struct epoch_tracker et;
 
 	rnh = rt_tables_get_rnh(fibnum, family);
 	if (rnh == NULL)
 		return;
 
 	bzero(&di, sizeof(di));
 	di.info.rti_filter = filter_f;
 	di.info.rti_filterdata = arg;
 	di.rnh = rnh;
 	di.rc.rc_cmd = RTM_DELETE;
 
 	NET_EPOCH_ENTER(et);
 
 	/* Unlink matching entries; they are collected on di.head. */
 	RIB_WLOCK(rnh);
 	rnh->rnh_walktree(&rnh->head, rt_checkdelroute, &di);
 	RIB_WUNLOCK(rnh);
 
 	/* We might have something to reclaim. */
 	bzero(&di.rc, sizeof(di.rc));
 	di.rc.rc_cmd = RTM_DELETE;
 	while (di.head != NULL) {
 		/* Pop the next unlinked rtentry off the list. */
 		rt = di.head;
 		di.head = rt->rt_chain;
 		rt->rt_chain = NULL;
 		nh = rt->rt_nhop;
 
 		/* Delayed subscribers are notified outside of the lock. */
 		di.rc.rc_rt = rt;
 		di.rc.rc_nh_old = nh;
 		rib_notify(rnh, RIB_NOTIFY_DELAYED, &di.rc);
 
 		/* TODO std rt -> rt_addrinfo export */
 		di.info.rti_info[RTAX_DST] = rt_key(rt);
 		di.info.rti_info[RTAX_NETMASK] = rt_mask(rt);
 
 		if (report) {
 #ifdef ROUTE_MPATH
 			struct nhgrp_object *nhg;
 			struct weightened_nhop *wn;
 			uint32_t num_nhops;
 			/* One RTM_DELETE message per member of a nexthop group. */
 			if (NH_IS_NHGRP(nh)) {
 				nhg = (struct nhgrp_object *)nh;
 				wn = nhgrp_get_nhops(nhg, &num_nhops);
 				for (int i = 0; i < num_nhops; i++)
 					rt_routemsg(RTM_DELETE, rt, wn[i].nh, fibnum);
 			} else
 #endif
 			rt_routemsg(RTM_DELETE, rt, nh, fibnum);
 		}
 		rtfree(rt);
 	}
 
 	NET_EPOCH_EXIT(et);
 }
 
 /*
  * Tree walker callback: removes the prefix of @rn from the table
  *  passed in @arg and frees the rtentry if the removed node is the
  *  visited one.
  * Always returns 0.
  */
 static int
 rt_delete_unconditional(struct radix_node *rn, void *arg)
 {
 	struct rib_head *rnh = (struct rib_head *)arg;
 	struct rtentry *rt = RNTORT(rn);
 	struct radix_node *removed;
 
 	removed = rnh->rnh_deladdr(rt_key(rt), rt_mask(rt), &rnh->head);
 	if (RNTORT(removed) == rt)
 		rtfree(rt);
 
 	return (0);
 }
 
 /*
  * Removes all routes from the routing table without executing notifications.
  * The whole walk is done under the RIB write lock.
  * rtentries will be removed after the end of a current epoch.
  */
 static void
 rib_flush_routes(struct rib_head *rnh)
 {
 
 	RIB_WLOCK(rnh);
 	rnh->rnh_walktree(&rnh->head, rt_delete_unconditional, rnh);
 	RIB_WUNLOCK(rnh);
 }
 
 /*
  * Flushes routes of address family @family from every fib that has
  *  a table for it.
  */
 void
 rib_flush_routes_family(int family)
 {
 	struct rib_head *rnh;
 	uint32_t fibnum;
 
 	for (fibnum = 0; fibnum < rt_numfibs; fibnum++) {
 		rnh = rt_tables_get_rnh(fibnum, family);
 		if (rnh == NULL)
 			continue;
 		rib_flush_routes(rnh);
 	}
 }
 
 /*
  * Calls every subscriber of @rnh whose subscription type equals @type,
  *  passing it @rc and the argument supplied at subscription time.
  */
 static void
 rib_notify(struct rib_head *rnh, enum rib_subscription_type type,
     struct rib_cmd_info *rc)
 {
 	struct rib_subscription *rs;
 
 	CK_STAILQ_FOREACH(rs, &rnh->rnh_subscribers, next) {
 		if (rs->type != type)
 			continue;
 		rs->func(rnh, rc, rs->arg);
 	}
 }
 
 /*
  * Allocates a zeroed subscription and fills in callback @f, its
  *  argument @arg and the subscription @type.
  * The allocation uses M_WAITOK if @waitok is set, M_NOWAIT otherwise.
  *
  * Returns the new subscription or NULL if the allocation failed.
  */
 static struct rib_subscription *
 allocate_subscription(rib_subscription_cb_t *f, void *arg,
     enum rib_subscription_type type, bool waitok)
 {
 	struct rib_subscription *rs;
 	int mflags;
 
 	mflags = M_ZERO | (waitok ? M_WAITOK : M_NOWAIT);
 	rs = malloc(sizeof(struct rib_subscription), M_RTABLE, mflags);
 	if (rs != NULL) {
 		rs->func = f;
 		rs->arg = arg;
 		rs->type = type;
 	}
 
 	return (rs);
 }
 
 /*
  * Subscribe for the changes in the routing table specified by @fibnum and
  *  @family.
  * @f is called with @arg for events of the given @type; @waitok selects
  *  a sleeping allocation of the subscription.
  *
  * Returns pointer to the subscription structure on success,
  * NULL if the table does not exist or the allocation failed.
  */
 struct rib_subscription *
 rib_subscribe(uint32_t fibnum, int family, rib_subscription_cb_t *f, void *arg,
     enum rib_subscription_type type, bool waitok)
 {
 	struct rib_head *rnh;
 	struct epoch_tracker et;
 
 	NET_EPOCH_ENTER(et);
 	KASSERT((fibnum < rt_numfibs), ("%s: bad fibnum", __func__));
 	rnh = rt_tables_get_rnh(fibnum, family);
 	NET_EPOCH_EXIT(et);
 
 	/* No table for this family: nothing to subscribe to. */
 	if (rnh == NULL)
 		return (NULL);
 
 	return (rib_subscribe_internal(rnh, f, arg, type, waitok));
 }
 
 /*
  * Subscribes callback @f with argument @arg for @type events of @rnh.
  * Takes the RIB write lock, inside the network epoch, to link the
  *  subscription in.
  *
  * Returns pointer to the subscription structure on success,
  * NULL if the allocation failed.
  */
 struct rib_subscription *
 rib_subscribe_internal(struct rib_head *rnh, rib_subscription_cb_t *f, void *arg,
     enum rib_subscription_type type, bool waitok)
 {
 	struct rib_subscription *rs;
 	struct epoch_tracker et;
 
 	rs = allocate_subscription(f, arg, type, waitok);
 	if (rs == NULL)
 		return (NULL);
 
 	rs->rnh = rnh;
 
 	NET_EPOCH_ENTER(et);
 	RIB_WLOCK(rnh);
 	CK_STAILQ_INSERT_HEAD(&rnh->rnh_subscribers, rs, next);
 	RIB_WUNLOCK(rnh);
 	NET_EPOCH_EXIT(et);
 
 	return (rs);
 }
 
 /*
  * Variant of rib_subscribe_internal() for callers that already run in
  *  the network epoch and hold the RIB write lock of @rnh.
  * The subscription is allocated without sleeping.
  *
  * Returns pointer to the subscription structure on success,
  * NULL if the allocation failed.
  */
 struct rib_subscription *
 rib_subscribe_locked(struct rib_head *rnh, rib_subscription_cb_t *f, void *arg,
     enum rib_subscription_type type)
 {
 	struct rib_subscription *rs;
 
 	NET_EPOCH_ASSERT();
 	RIB_WLOCK_ASSERT(rnh);
 
 	rs = allocate_subscription(f, arg, type, false);
 	if (rs == NULL)
 		return (NULL);
 
 	rs->rnh = rnh;
 	CK_STAILQ_INSERT_HEAD(&rnh->rnh_subscribers, rs, next);
 
 	return (rs);
 }
 
 /*
  * Remove rtable subscription @rs from the routing table.
  * The subscription is unlinked under the RIB write lock and its
  *  destruction is scheduled via epoch_call().
  * Needs to be run in network epoch.
  */
 void
 rib_unsibscribe(struct rib_subscription *rs)
 {
 	struct rib_head *rnh;
 
 	rnh = rs->rnh;
 
 	NET_EPOCH_ASSERT();
 
 	RIB_WLOCK(rnh);
 	CK_STAILQ_REMOVE(&rnh->rnh_subscribers, rs, rib_subscription, next);
 	RIB_WUNLOCK(rnh);
 
 	epoch_call(net_epoch_preempt, destroy_subscription_epoch,
 	    &rs->epoch_ctx);
 }
 
 /*
  * Variant of rib_unsibscribe() for callers that already hold the RIB
  *  write lock of the table @rs belongs to.
  * Needs to be run in network epoch.
  */
 void
 rib_unsibscribe_locked(struct rib_subscription *rs)
 {
 	struct rib_head *rnh;
 
 	rnh = rs->rnh;
 
 	NET_EPOCH_ASSERT();
 	RIB_WLOCK_ASSERT(rnh);
 
 	CK_STAILQ_REMOVE(&rnh->rnh_subscribers, rs, rib_subscription, next);
 
 	epoch_call(net_epoch_preempt, destroy_subscription_epoch,
 	    &rs->epoch_ctx);
 }
 
 /*
  * Epoch callback indicating subscription is safe to destroy.
  * @ctx is the epoch_ctx member embedded in the subscription.
  */
 static void
 destroy_subscription_epoch(epoch_context_t ctx)
 {
 
 	free(__containerof(ctx, struct rib_subscription, epoch_ctx),
 	    M_RTABLE);
 }
 
 /*
  * Initializes the (empty) subscriber list of @rnh.
  */
 void
 rib_init_subscriptions(struct rib_head *rnh)
 {
 
 	CK_STAILQ_INIT(&rnh->rnh_subscribers);
 }
 
 /*
  * Unlinks every subscription of @rnh and schedules its destruction
  *  via epoch_call().
  * Enters the network epoch and takes the RIB write lock on its own.
  */
 void
 rib_destroy_subscriptions(struct rib_head *rnh)
 {
 	struct rib_subscription *rs;
 	struct epoch_tracker et;
 
 	NET_EPOCH_ENTER(et);
 	RIB_WLOCK(rnh);
 	for (;;) {
 		rs = CK_STAILQ_FIRST(&rnh->rnh_subscribers);
 		if (rs == NULL)
 			break;
 		CK_STAILQ_REMOVE_HEAD(&rnh->rnh_subscribers, next);
 		epoch_call(net_epoch_preempt, destroy_subscription_epoch,
 		    &rs->epoch_ctx);
 	}
 	RIB_WUNLOCK(rnh);
 	NET_EPOCH_EXIT(et);
 }
diff --git a/sys/net/route/route_var.h b/sys/net/route/route_var.h
index 427c286a5090..f12931476fd3 100644
--- a/sys/net/route/route_var.h
+++ b/sys/net/route/route_var.h
@@ -1,327 +1,340 @@
 /*-
  * Copyright (c) 2015-2016
  * 	Alexander V. Chernikov <melifaro@FreeBSD.org>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #ifndef _NET_ROUTE_VAR_H_
 #define _NET_ROUTE_VAR_H_
 
 #ifndef RNF_NORMAL
 #include <net/radix.h>
 #endif
 #include <sys/ck.h>
 #include <sys/epoch.h>
 #include <netinet/in.h>		/* struct sockaddr_in */
 #include <sys/counter.h>
 #include <net/route/nhop.h>
 
 #ifdef	RTDEBUG
 #define	DPRINTF(_fmt, ...)	printf("%s: " _fmt "\n", __func__ , ## __VA_ARGS__)
 #else
 #define	DPRINTF(_fmt, ...)
 #endif
 
 struct nh_control;
 typedef int rnh_preadd_entry_f_t(u_int fibnum, const struct sockaddr *addr,
 	const struct sockaddr *mask, struct nhop_object *nh);
 
 /*
  * Per-table routing state: the radix tree head with its method pointers,
  * the table lock, the table identity (vnet, address family, fib number),
  * generation counters and the list of change-notification subscribers.
  */
 struct rib_head {
 	struct radix_head	head;
 	rn_matchaddr_f_t	*rnh_matchaddr;	/* longest match for sockaddr */
 	rn_addaddr_f_t		*rnh_addaddr;	/* add based on sockaddr */
 	rn_deladdr_f_t		*rnh_deladdr;	/* remove based on sockaddr */
 	rn_lookup_f_t		*rnh_lookup;	/* exact match for sockaddr */
 	rn_walktree_t		*rnh_walktree;	/* traverse tree */
 	rn_walktree_from_t	*rnh_walktree_from; /* traverse tree below a */
 	rnh_preadd_entry_f_t	*rnh_preadd;	/* hook to alter record prior to insertion */
-	rt_gen_t		rnh_gen;	/* generation counter */
+	rt_gen_t		rnh_gen;	/* datapath generation counter */
 	int			rnh_multipath;	/* multipath capable ? */
 	struct radix_node	rnh_nodes[3];	/* empty tree for common case */
 	struct rmlock		rib_lock;	/* config/data path lock */
 	struct radix_mask_head	rmhead;		/* masks radix head */
 	struct vnet		*rib_vnet;	/* vnet pointer */
 	int			rib_family;	/* AF of the rtable */
 	u_int			rib_fibnum;	/* fib number */
 	struct callout		expire_callout;	/* Callout for expiring dynamic routes */
 	time_t			next_expire;	/* Next expire run ts */
 	uint32_t		rnh_prefixes;	/* Number of prefixes */
+#ifdef FIB_ALGO
+	rt_gen_t		rnh_gen_rib;	/* rib generation counter, see rib_bump_gen() */
+#endif
 	uint32_t		rib_dying:1;	/* rib is detaching */
 	uint32_t		rib_algo_fixed:1;/* fixed algorithm */
 	struct nh_control	*nh_control;	/* nexthop subsystem data */
 	CK_STAILQ_HEAD(, rib_subscription)	rnh_subscribers;/* notification subscribers */
 };
 
 /*
  * Wrappers around the rmlock embedded in struct rib_head (rib_lock).
  *
  * RIB_RLOCK() and RIB_RUNLOCK() reference a variable named _rib_tracker,
  * so the calling scope has to declare it with RIB_RLOCK_TRACKER first.
  * The write-side and assertion macros need no tracker.
  */
 #define	RIB_RLOCK_TRACKER	struct rm_priotracker _rib_tracker
 #define	RIB_LOCK_INIT(rh)	rm_init(&(rh)->rib_lock, "rib head lock")
 #define	RIB_LOCK_DESTROY(rh)	rm_destroy(&(rh)->rib_lock)
 #define	RIB_RLOCK(rh)		rm_rlock(&(rh)->rib_lock, &_rib_tracker)
 #define	RIB_RUNLOCK(rh)		rm_runlock(&(rh)->rib_lock, &_rib_tracker)
 #define	RIB_WLOCK(rh)		rm_wlock(&(rh)->rib_lock)
 #define	RIB_WUNLOCK(rh)		rm_wunlock(&(rh)->rib_lock)
 #define	RIB_LOCK_ASSERT(rh)	rm_assert(&(rh)->rib_lock, RA_LOCKED)
 #define	RIB_WLOCK_ASSERT(rh)	rm_assert(&(rh)->rib_lock, RA_WLOCKED)
 
 /* Constants */
 #define	RIB_MAX_RETRIES	3
 #define	RT_MAXFIBS	UINT16_MAX
 #define	RIB_MAX_MPATH_WIDTH	64
 
 /*
  * Macros for verifying fields in af-specific 'struct route' structures.
  *
  * CHK_STRUCT_FIELD_GENERIC statically asserts that field _f1 of type _s1
  * and field _f2 of type _s2 have both the same size and the same offset.
  */
 #define CHK_STRUCT_FIELD_GENERIC(_s1, _f1, _s2, _f2)			\
 _Static_assert(sizeof(((_s1 *)0)->_f1) == sizeof(((_s2 *)0)->_f2),	\
 		"Fields " #_f1 " and " #_f2 " size differs");		\
 _Static_assert(__offsetof(_s1, _f1) == __offsetof(_s2, _f2),		\
 		"Fields " #_f1 " and " #_f2 " offset differs");
 
 /* Compare the same-named _field of 'struct route' and _route_new. */
 #define _CHK_ROUTE_FIELD(_route_new, _field) \
 	CHK_STRUCT_FIELD_GENERIC(struct route, _field, _route_new, _field)
 
 /* Apply _CHK_ROUTE_FIELD to each of the 'struct route' fields listed. */
 #define CHK_STRUCT_ROUTE_FIELDS(_route_new)	\
 	_CHK_ROUTE_FIELD(_route_new, ro_nh)	\
 	_CHK_ROUTE_FIELD(_route_new, ro_lle)	\
 	_CHK_ROUTE_FIELD(_route_new, ro_prepend)\
 	_CHK_ROUTE_FIELD(_route_new, ro_plen)	\
 	_CHK_ROUTE_FIELD(_route_new, ro_flags)	\
 	_CHK_ROUTE_FIELD(_route_new, ro_mtu)	\
 	_CHK_ROUTE_FIELD(_route_new, spare)
 
 /*
  * Full check: the fields above, plus ro_dst of 'struct route' sitting at
  * the same offset as _dst_new of _ro_new.  The final assertion carries no
  * trailing semicolon, so the invocation supplies it.
  */
 #define CHK_STRUCT_ROUTE_COMPAT(_ro_new, _dst_new)				\
 CHK_STRUCT_ROUTE_FIELDS(_ro_new);						\
 _Static_assert(__offsetof(struct route, ro_dst) == __offsetof(_ro_new, _dst_new),\
 		"ro_dst and " #_dst_new " are at different offset")
 
+static inline void
+rib_bump_gen(struct rib_head *rnh)
+{
+#ifdef FIB_ALGO
+	rnh->rnh_gen_rib++;	/* FIB_ALGO: bump only the rib generation counter */
+#else
+	rnh->rnh_gen++;		/* no rnh_gen_rib field exists: bump rnh_gen */
+#endif
+}
+
 struct rib_head *rt_tables_get_rnh(uint32_t table, sa_family_t family);
 int rt_getifa_fib(struct rt_addrinfo *info, u_int fibnum);
 struct rib_cmd_info;
 
 VNET_PCPUSTAT_DECLARE(struct rtstat, rtstat);
 #define	RTSTAT_ADD(name, val)	\
 	VNET_PCPUSTAT_ADD(struct rtstat, rtstat, name, (val))
 #define	RTSTAT_INC(name)	RTSTAT_ADD(name, 1)
 
 /*
  * Convert a 'struct radix_node *' to a 'struct rtentry *'.
  * The operation can be done safely (in this code) because a
  * 'struct rtentry' starts with two 'struct radix_node''s, the first
  * one representing leaf nodes in the routing tree, which is
  * what the code in radix.c passes us as a 'struct radix_node'.
  *
  * But because there are a lot of assumptions in this conversion,
  * do not cast explicitly, but always use the macro below.
  */
 #define RNTORT(p)	((struct rtentry *)(p))
 
 struct rtentry {
 	struct	radix_node rt_nodes[2];	/* tree glue, and other values */
 	/*
 	 * XXX struct rtentry must begin with a struct radix_node (or two!)
 	 * because the code does some casts of a 'struct radix_node *'
 	 * to a 'struct rtentry *'
 	 */
 #define	rt_key(r)	(*((struct sockaddr **)(&(r)->rt_nodes->rn_key)))
 #define	rt_mask(r)	(*((struct sockaddr **)(&(r)->rt_nodes->rn_mask)))
 #define	rt_key_const(r)		(*((const struct sockaddr * const *)(&(r)->rt_nodes->rn_key)))
 #define	rt_mask_const(r)	(*((const struct sockaddr * const *)(&(r)->rt_nodes->rn_mask)))
 
 	/*
 	 * 2 radix_node structures above consist of 2x6 pointers, leaving
 	 * 4 pointers (32 bytes) of the second cache line on amd64.
 	 *
 	 */
 	struct nhop_object	*rt_nhop;	/* nexthop data */
 	union {
 		/*
 		 * Destination address storage.
 		 * sizeof(struct sockaddr_in6) == 28, however
 		 * the dataplane-relevant part (e.g. address) lies
 		 * at offset 8..24, making the address not crossing
 		 * cacheline boundary.
 		 */
 		struct sockaddr_in	rt_dst4;
 		struct sockaddr_in6	rt_dst6;
 		struct sockaddr		rt_dst;
 		char			rt_dstb[28];
 	};
 
 	int		rte_flags;	/* up/down?, host/net */
 	u_long		rt_weight;	/* absolute weight */ 
 	u_long		rt_expire;	/* lifetime for route, e.g. redirect */
 	struct rtentry	*rt_chain;	/* pointer to next rtentry to delete */
 	struct epoch_context	rt_epoch_ctx;	/* net epoch tracker */
 };
 
 /*
  * With the split between the routing entry and the nexthop,
  *  rt_flags has to be split between these 2 entries. As rtentry
  *  mostly contains prefix data and is thought to be generic enough
  *  so one can transparently change the nexthop pointer w/o requiring
  *  any other rtentry changes, most of rt_flags shifts to the particular nexthop.
  *
  *
  * RTF_UP: rtentry, as an indication that it is linked.
  * RTF_HOST: rtentry, nhop. The latter indication is needed for the datapath
  * RTF_DYNAMIC: nhop, to make rtentry generic.
  * RTF_MODIFIED: nhop, to make rtentry generic. (legacy)
  * -- "native" path (nhop) properties:
  * RTF_GATEWAY, RTF_STATIC, RTF_PROTO1, RTF_PROTO2, RTF_PROTO3, RTF_FIXEDMTU,
  *  RTF_PINNED, RTF_REJECT, RTF_BLACKHOLE, RTF_BROADCAST
  */
 
 /* Nexthop rt flags mask */
 #define	NHOP_RT_FLAG_MASK	(RTF_GATEWAY | RTF_HOST | RTF_REJECT | RTF_DYNAMIC | \
     RTF_MODIFIED | RTF_STATIC | RTF_BLACKHOLE | RTF_PROTO1 | RTF_PROTO2 | \
     RTF_PROTO3 | RTF_FIXEDMTU | RTF_PINNED | RTF_BROADCAST)
 
 /* rtentry rt flag mask */
 #define	RTE_RT_FLAG_MASK	(RTF_UP | RTF_HOST)
 
 /* route_temporal.c */
 void tmproutes_update(struct rib_head *rnh, struct rtentry *rt);
 void tmproutes_init(struct rib_head *rh);
 void tmproutes_destroy(struct rib_head *rh);
 
 /* route_ctl.c */
 struct route_nhop_data;
 int change_route_nhop(struct rib_head *rnh, struct rtentry *rt,
     struct rt_addrinfo *info, struct route_nhop_data *rnd,
     struct rib_cmd_info *rc);
 int change_route_conditional(struct rib_head *rnh, struct rtentry *rt,
     struct rt_addrinfo *info, struct route_nhop_data *nhd_orig,
     struct route_nhop_data *nhd_new, struct rib_cmd_info *rc);
 struct rtentry *lookup_prefix(struct rib_head *rnh,
     const struct rt_addrinfo *info, struct route_nhop_data *rnd);
 
 bool nhop_can_multipath(const struct nhop_object *nh);
 bool match_nhop_gw(const struct nhop_object *nh, const struct sockaddr *gw);
 int check_info_match_nhop(const struct rt_addrinfo *info,
     const struct rtentry *rt, const struct nhop_object *nh);
 int can_override_nhop(const struct rt_addrinfo *info,
     const struct nhop_object *nh);
 
 void vnet_rtzone_init(void);
 void vnet_rtzone_destroy(void);
 
 /* subscriptions */
 void rib_init_subscriptions(struct rib_head *rnh);
 void rib_destroy_subscriptions(struct rib_head *rnh);
 
 /* Nexthops */
 void nhops_init(void);
 int nhops_init_rib(struct rib_head *rh);
 void nhops_destroy_rib(struct rib_head *rh);
 void nhop_ref_object(struct nhop_object *nh);
 int nhop_try_ref_object(struct nhop_object *nh);
 void nhop_ref_any(struct nhop_object *nh);
 void nhop_free_any(struct nhop_object *nh);
 
 void nhop_set_type(struct nhop_object *nh, enum nhop_type nh_type);
 void nhop_set_rtflags(struct nhop_object *nh, int rt_flags);
 
 int nhop_create_from_info(struct rib_head *rnh, struct rt_addrinfo *info,
     struct nhop_object **nh_ret);
 int nhop_create_from_nhop(struct rib_head *rnh, const struct nhop_object *nh_orig,
     struct rt_addrinfo *info, struct nhop_object **pnh_priv);
 
 void nhops_update_ifmtu(struct rib_head *rh, struct ifnet *ifp, uint32_t mtu);
 int nhops_dump_sysctl(struct rib_head *rh, struct sysctl_req *w);
 
 /* MULTIPATH */
 #define	MPF_MULTIPATH	0x08	/* need to be consistent with NHF_MULTIPATH */
 
 /*
  * Nexthop group header followed by a trailing array of member nexthops.
  * nhop_select() indexes nhops[] modulo nhg_size.
  */
 struct nhgrp_object {
 	uint16_t		nhg_flags;	/* nexthop group flags */
 	uint8_t			nhg_size;	/* dataplane group size */
 	uint8_t			spare;
 	struct nhop_object	*nhops[0];	/* nhops */
 };
 
 /*
  * Pick the nexthop to use for @flowid.
  *
  * When built with ROUTE_MPATH and @nh is a nexthop group (NH_IS_NHGRP),
  * returns the group member at index (flowid % nhg_size); otherwise
  * returns @nh unchanged.
  * NOTE(review): the modulo assumes nhg_size is non-zero for any group
  * reaching this point -- confirm against the group construction code.
  */
 static inline struct nhop_object *
 nhop_select(struct nhop_object *nh, uint32_t flowid)
 {
 
 #ifdef ROUTE_MPATH
 	if (NH_IS_NHGRP(nh)) {
 		/* A group is accessed through the same pointer as a nexthop. */
 		struct nhgrp_object *nhg = (struct nhgrp_object *)nh;
 		nh = nhg->nhops[flowid % nhg->nhg_size];
 	}
 #endif
 	return (nh);
 }
 
 
 struct weightened_nhop;
 
 /* mpath_ctl.c */
 int add_route_mpath(struct rib_head *rnh, struct rt_addrinfo *info,
     struct rtentry *rt, struct route_nhop_data *rnd_add,
     struct route_nhop_data *rnd_orig, struct rib_cmd_info *rc);
 int del_route_mpath(struct rib_head *rh, struct rt_addrinfo *info,
     struct rtentry *rt, struct nhgrp_object *nhg, struct rib_cmd_info *rc);
 
 /* nhgrp.c */
 int nhgrp_ctl_init(struct nh_control *ctl);
 void nhgrp_ctl_free(struct nh_control *ctl);
 void nhgrp_ctl_unlink_all(struct nh_control *ctl);
 
 
 /* nhgrp_ctl.c */
 int nhgrp_dump_sysctl(struct rib_head *rh, struct sysctl_req *w);
 
 int nhgrp_get_group(struct rib_head *rh, struct weightened_nhop *wn,
     int num_nhops, struct route_nhop_data *rnd);
 typedef bool nhgrp_filter_cb_t(const struct nhop_object *nh, void *data);
 int nhgrp_get_filtered_group(struct rib_head *rh, const struct nhgrp_object *src,
     nhgrp_filter_cb_t flt_func, void *flt_data, struct route_nhop_data *rnd);
 int nhgrp_get_addition_group(struct rib_head *rnh,
     struct route_nhop_data *rnd_orig, struct route_nhop_data *rnd_add,
     struct route_nhop_data *rnd_new);
 
 void nhgrp_ref_object(struct nhgrp_object *nhg);
 uint32_t nhgrp_get_idx(const struct nhgrp_object *nhg);
 void nhgrp_free(struct nhgrp_object *nhg);
 
 /* rtsock */
 int rtsock_routemsg(int cmd, struct rtentry *rt, struct nhop_object *nh,
     int fibnum);
 int rtsock_routemsg_info(int cmd, struct rt_addrinfo *info, int fibnum);
 int rtsock_addrmsg(int cmd, struct ifaddr *ifa, int fibnum);
 
 
 /* lookup_framework.c */
 void fib_grow_rtables(uint32_t new_num_tables);
 int fib_select_algo_initial(struct rib_head *rh);
 void fib_destroy_rib(struct rib_head *rh);
 void vnet_fib_init(void);
 void vnet_fib_destroy(void);
 
 /* Entropy data used for outbound hashing */
 #define MPATH_ENTROPY_KEY_LEN	40
 extern uint8_t mpath_entropy_key[MPATH_ENTROPY_KEY_LEN];
 
 #endif