Page MenuHomeFreeBSD

D26449.diff
No OneTemporary

D26449.diff

Index: head/sys/conf/NOTES
===================================================================
--- head/sys/conf/NOTES
+++ head/sys/conf/NOTES
@@ -1002,7 +1002,7 @@
#
# TCP_HHOOK enables the hhook(9) framework hooks for the TCP stack.
#
-# RADIX_MPATH provides support for equal-cost multi-path routing.
+# ROUTE_MPATH provides support for multipath routing.
#
options MROUTING # Multicast routing
options IPFIREWALL #firewall
@@ -1023,7 +1023,7 @@
options TCPPCAP
options TCP_BLACKBOX
options TCP_HHOOK
-options RADIX_MPATH
+options ROUTE_MPATH
# The MBUF_STRESS_TEST option enables options which create
# various random failures / extreme cases related to mbuf
Index: head/sys/conf/files
===================================================================
--- head/sys/conf/files
+++ head/sys/conf/files
@@ -4143,10 +4143,12 @@
net/debugnet_inet.c optional inet debugnet
net/pfil.c optional ether | inet
net/radix.c standard
-net/radix_mpath.c standard
net/raw_cb.c standard
net/raw_usrreq.c standard
net/route.c standard
+net/route/mpath_ctl.c optional route_mpath
+net/route/nhgrp.c optional route_mpath
+net/route/nhgrp_ctl.c optional route_mpath
net/route/nhop.c standard
net/route/nhop_ctl.c standard
net/route/nhop_utils.c standard
Index: head/sys/conf/options
===================================================================
--- head/sys/conf/options
+++ head/sys/conf/options
@@ -454,6 +454,7 @@
PCBGROUP opt_pcbgroup.h
PF_DEFAULT_TO_DROP opt_pf.h
RADIX_MPATH opt_mpath.h
+ROUTE_MPATH opt_route.h
ROUTETABLES opt_route.h
RSS opt_rss.h
SLIP_IFF_OPTS opt_slip.h
Index: head/sys/net/radix.c
===================================================================
--- head/sys/net/radix.c
+++ head/sys/net/radix.c
@@ -44,10 +44,6 @@
#include <sys/malloc.h>
#include <sys/syslog.h>
#include <net/radix.h>
-#include "opt_mpath.h"
-#ifdef RADIX_MPATH
-#include <net/radix_mpath.h>
-#endif
#else /* !_KERNEL */
#include <stdio.h>
#include <strings.h>
Index: head/sys/net/route.h
===================================================================
--- head/sys/net/route.h
+++ head/sys/net/route.h
@@ -178,6 +178,7 @@
*/
/* Consumer-visible nexthop info flags */
+#define NHF_MULTIPATH 0x0008 /* Nexthop is a nexthop group */
#define NHF_REJECT 0x0010 /* RTF_REJECT */
#define NHF_BLACKHOLE 0x0020 /* RTF_BLACKHOLE */
#define NHF_REDIRECT 0x0040 /* RTF_DYNAMIC|RTF_MODIFIED */
@@ -208,6 +209,10 @@
uint64_t rts_wildcard; /* lookups satisfied by a wildcard */
uint64_t rts_nh_idx_alloc_failure; /* nexthop index alloc failure*/
uint64_t rts_nh_alloc_failure; /* nexthop allocation failure*/
+ uint64_t rts_add_failure; /* # of route addition failures */
+ uint64_t rts_add_retry; /* # of route addition retries */
+ uint64_t rts_del_failure; /* # of route deletion failures */
+ uint64_t rts_del_retry; /* # of route deletion retries */
};
/*
Index: head/sys/net/route.c
===================================================================
--- head/sys/net/route.c
+++ head/sys/net/route.c
@@ -39,7 +39,6 @@
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_mrouting.h"
-#include "opt_mpath.h"
#include "opt_route.h"
#include <sys/param.h>
Index: head/sys/net/route/mpath_ctl.c
===================================================================
--- head/sys/net/route/mpath_ctl.c
+++ head/sys/net/route/mpath_ctl.c
@@ -0,0 +1,165 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2020 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include "opt_inet.h"
+#include "opt_route.h"
+
+#include <sys/cdefs.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/lock.h>
+#include <sys/rmlock.h>
+#include <sys/rwlock.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/socket.h>
+#include <sys/sysctl.h>
+#include <sys/kernel.h>
+
+#include <net/if.h>
+#include <net/if_var.h>
+#include <net/if_dl.h>
+#include <net/route.h>
+#include <net/route/route_ctl.h>
+#include <net/route/route_var.h>
+#include <net/vnet.h>
+
+#include <netinet/in.h>
+#include <netinet/in_var.h>
+#include <netinet/in_fib.h>
+
+#include <net/route/nhop_utils.h>
+#include <net/route/nhop.h>
+#include <net/route/nhop_var.h>
+
+/*
+ * This file contains the supporting functions for adding/deleting/updating
+ * multipath routes to the routing table.
+ */
+
+SYSCTL_DECL(_net_route);
+
+/*
+ * Tries to add @rnd_add nhop to the existing set of nhops (@rnd_orig) for the
+ * prefix specified by @rt.
+ *
+ * The operation is attempted up to RIB_MAX_RETRIES times: each iteration
+ * builds a new group via nhgrp_get_addition_group() and then tries to
+ * install it with change_route_conditional(). EAGAIN from either step
+ * triggers another iteration; any other result ends the loop.
+ *
+ * Returns 0 and consumes rt / rnd_add nhop references. @rc gets populated
+ * with the operation result.
+ * Otherwise errno is returned (EAGAIN if all retries were exhausted).
+ *
+ * caller responsibility is to unlock/free rt and
+ * rt->rt_nhop.
+ */
+int
+add_route_mpath(struct rib_head *rnh, struct rt_addrinfo *info,
+ struct rtentry *rt, struct route_nhop_data *rnd_add,
+ struct route_nhop_data *rnd_orig, struct rib_cmd_info *rc)
+{
+ RIB_RLOCK_TRACKER;
+ struct route_nhop_data rnd_new;
+ int error = 0;
+
+ /*
+ * It is possible that multiple rtsock speakers will try to update
+ * the same route simultaneously. Reduce the chance of failing the
+ * request by retrying the cycle multiple times.
+ */
+ for (int i = 0; i < RIB_MAX_RETRIES; i++) {
+ /* Build the group containing both the original and the new paths */
+ error = nhgrp_get_addition_group(rnh, rnd_orig, rnd_add,
+ &rnd_new);
+ if (error != 0) {
+ if (error != EAGAIN)
+ break;
+
+ /*
+ * Group creation failed, most probably because
+ * @rnd_orig data got scheduled for deletion.
+ * Refresh @rnd_orig data and retry.
+ * Note: this retry path does not bump rts_add_retry.
+ */
+ RIB_RLOCK(rnh);
+ lookup_prefix(rnh, info, rnd_orig);
+ RIB_RUNLOCK(rnh);
+ continue;
+ }
+
+ /*
+ * Try to switch the route from @rnd_orig to @rnd_new.
+ * NOTE(review): on a non-EAGAIN failure the reference held by
+ * @rnd_new is not released here -- confirm the callee owns it.
+ */
+ error = change_route_conditional(rnh, rt, info, rnd_orig,
+ &rnd_new, rc);
+ if (error != EAGAIN)
+ break;
+ RTSTAT_INC(rts_add_retry);
+ }
+
+ return (error);
+}
+
+/*
+ * Context handed to gw_filter_func() through its opaque data pointer:
+ * the request (@info) and the route entry (@rt) a nexthop is matched against.
+ */
+struct rt_match_info {
+ struct rt_addrinfo *info;
+ struct rtentry *rt;
+};
+
+/*
+ * Nexthop filter callback used by del_route_mpath().
+ * @_data points to a struct rt_match_info; returns true when
+ * check_info_match_nhop() reports 0 for @nh, false otherwise.
+ */
+static bool
+gw_filter_func(const struct nhop_object *nh, void *_data)
+{
+	struct rt_match_info *match = _data;
+
+	if (check_info_match_nhop(match->info, match->rt, nh) != 0)
+		return (false);
+	return (true);
+}
+
+/*
+ * Removes the paths of @nhg that match the request in @info from the
+ * multipath route @rt. Must be called with the rib write lock held.
+ *
+ * Returns 0 on success, with the operation result stored in @rc.
+ * Returns ESRCH when neither a gateway nor a filter callback is supplied,
+ * or the error reported by group filtering / route change otherwise.
+ */
+int
+del_route_mpath(struct rib_head *rh, struct rt_addrinfo *info,
+    struct rtentry *rt, struct nhgrp_object *nhg,
+    struct rib_cmd_info *rc)
+{
+	struct rt_match_info match;
+	struct route_nhop_data rnd_filtered;
+	int error;
+
+	RIB_WLOCK_ASSERT(rh);
+
+	/*
+	 * A gateway is mandatory for deleting multipath routes, so that
+	 * a single request cannot wipe all paths at once.
+	 * The exception is a caller-provided filter function: in that
+	 * case the gateway check is skipped, letting rib_walk_del()
+	 * delete routes by whatever criteria its callback implements.
+	 */
+	if (info->rti_info[RTAX_GATEWAY] == NULL && info->rti_filter == NULL)
+		return (ESRCH);
+
+	match.info = info;
+	match.rt = rt;
+
+	error = nhgrp_get_filtered_group(rh, nhg, gw_filter_func,
+	    (void *)&match, &rnd_filtered);
+	if (error != 0)
+		return (error);
+
+	return (change_route_nhop(rh, rt, info, &rnd_filtered, rc));
+}
+
Index: head/sys/net/route/nhgrp.c
===================================================================
--- head/sys/net/route/nhgrp.c
+++ head/sys/net/route/nhgrp.c
@@ -0,0 +1,344 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2020 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include "opt_inet.h"
+#include "opt_route.h"
+
+#include <sys/cdefs.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/lock.h>
+#include <sys/rmlock.h>
+#include <sys/rwlock.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/refcount.h>
+#include <sys/socket.h>
+#include <sys/sysctl.h>
+#include <sys/kernel.h>
+
+#include <net/if.h>
+#include <net/if_var.h>
+#include <net/if_dl.h>
+#include <net/route.h>
+#include <net/route/route_ctl.h>
+#include <net/route/route_var.h>
+#include <net/vnet.h>
+
+#include <netinet/in.h>
+#include <netinet/in_var.h>
+#include <netinet/in_fib.h>
+
+#include <net/route/nhop_utils.h>
+#include <net/route/nhop.h>
+#include <net/route/nhop_var.h>
+#include <net/route/nhgrp_var.h>
+
+/*
+ * This file contains data structures management logic for the nexthop
+ * groups ("nhgrp") route subsystem.
+ *
+ * Nexthop groups are used to store multiple routes available for the specific
+ * prefix. Nexthop groups are immutable and can be shared across multiple
+ * prefixes.
+ *
+ * Each group consists of a control plane part and a dataplane part.
+ * Control plane is basically a collection of nexthop objects with
+ * weights and refcount.
+ *
+ * Datapath consists of an array of nexthop pointers, compiled from control
+ * plane data to support O(1) nexthop selection.
+ *
+ * For example, consider the following group:
+ * [(nh1, weight=100), (nh2, weight=200)]
+ * It will compile to the following array:
+ * [nh1, nh2, nh2]
+ *
+ */
+
+static void consider_resize(struct nh_control *ctl, uint32_t new_nh_buckets,
+ uint32_t new_idx_items);
+
+static int cmp_nhgrp(const struct nhgrp_priv *a, const struct nhgrp_priv *b);
+static unsigned int hash_nhgrp(const struct nhgrp_priv *obj);
+
+/*
+ * Hashes @len bytes starting at @h: for every byte the accumulator is
+ * multiplied by 33 and XORed with the byte. Returns 0 for len <= 0.
+ */
+static unsigned
+djb_hash(const unsigned char *h, const int len)
+{
+	const unsigned char *cursor = h;
+	unsigned int acc = 0;
+	int remaining;
+
+	for (remaining = len; remaining > 0; remaining--)
+		acc = (33 * acc) ^ *cursor++;
+
+	return (acc);
+}
+
+/*
+ * Hash callback: returns 1 if groups @a and @b carry the same control
+ * plane list (nexthop count and nexthop/weight array), 0 otherwise.
+ */
+static int
+cmp_nhgrp(const struct nhgrp_priv *a, const struct nhgrp_priv *b)
+{
+	size_t nbytes;
+
+	/*
+	 * With consistent hashing, several groups may share one
+	 * "control plane" list of weighted nexthops while differing in
+	 * their "data plane" arrays.
+	 * For now only the control plane list takes part in comparison.
+	 */
+	if (a->nhg_nh_count != b->nhg_nh_count)
+		return (0);
+
+	/*
+	 * NOTE(review): this is a raw byte comparison, so any padding
+	 * inside struct weightened_nhop is compared as well -- confirm
+	 * callers never leave such bytes uninitialized.
+	 */
+	nbytes = sizeof(struct weightened_nhop) * a->nhg_nh_count;
+	return (memcmp(a->nhg_nh_weights, b->nhg_nh_weights, nbytes) == 0);
+}
+
+/*
+ * Hash callback: hashes the raw bytes of the group's nexthop/weight
+ * array (nhg_nh_count entries) with djb_hash().
+ */
+static unsigned int
+hash_nhgrp(const struct nhgrp_priv *obj)
+{
+
+	return (djb_hash((const unsigned char *)obj->nhg_nh_weights,
+	    sizeof(struct weightened_nhop) * obj->nhg_nh_count));
+}
+
+/*
+ * Looks up a group equal to @key in the group hash of @ctl.
+ *
+ * Returns the matching group with an extra reference taken and the
+ * nhops lock released, or NULL if no group matches or the match is
+ * already on its way to destruction.
+ */
+struct nhgrp_priv *
+find_nhgrp(struct nh_control *ctl, const struct nhgrp_priv *key)
+{
+	struct nhgrp_priv *nhg_priv;
+
+	NHOPS_RLOCK(ctl);
+	CHT_SLIST_FIND_BYOBJ(&ctl->gr_head, mpath, key, nhg_priv);
+	/* A zero refcount means the group is being deleted: ignore it */
+	if (nhg_priv != NULL &&
+	    !refcount_acquire_if_not_zero(&nhg_priv->nhg_refcount))
+		nhg_priv = NULL;
+	NHOPS_RUNLOCK(ctl);
+
+	return (nhg_priv);
+}
+
+/*
+ * Allocates a group index for @grp_priv and inserts the group into the
+ * group hash of @ctl. After dropping the lock, gives consider_resize()
+ * a chance to grow the hash / index bitmask.
+ *
+ * Returns 1 on success, 0 if no index could be allocated.
+ */
+int
+link_nhgrp(struct nh_control *ctl, struct nhgrp_priv *grp_priv)
+{
+	uint32_t want_buckets, want_items;
+	uint16_t idx;
+	int linked;
+
+	NHOPS_WLOCK(ctl);
+	/* Sample resize requirements while the lock is held */
+	want_buckets = CHT_SLIST_GET_RESIZE_BUCKETS(&ctl->gr_head);
+	want_items = bitmask_get_resize_items(&ctl->gr_idx_head);
+
+	linked = (bitmask_alloc_idx(&ctl->gr_idx_head, &idx) == 0);
+	if (linked) {
+		grp_priv->nhg_idx = idx;
+		grp_priv->nh_control = ctl;
+		CHT_SLIST_INSERT_HEAD(&ctl->gr_head, mpath, grp_priv);
+	}
+	NHOPS_WUNLOCK(ctl);
+
+	if (!linked) {
+		DPRINTF("Unable to allocate mpath index");
+	}
+
+	/* Resize is attempted on both paths, always outside the lock */
+	consider_resize(ctl, want_buckets, want_items);
+
+	return (linked ? 1 : 0);
+}
+
+/*
+ * Removes the group matching @key from the group hash of @ctl and
+ * releases its index. The removed group gets nhg_idx reset to 0 and
+ * its nh_control pointer cleared.
+ *
+ * Returns the unlinked group or NULL if it was not found.
+ */
+struct nhgrp_priv *
+unlink_nhgrp(struct nh_control *ctl, struct nhgrp_priv *key)
+{
+	struct nhgrp_priv *nhg_priv_ret;
+	int ret, idx;
+
+	NHOPS_WLOCK(ctl);
+
+	CHT_SLIST_REMOVE_BYOBJ(&ctl->gr_head, mpath, key, nhg_priv_ret);
+
+	if (nhg_priv_ret != NULL) {
+		idx = nhg_priv_ret->nhg_idx;
+		ret = bitmask_free_idx(&ctl->gr_idx_head, idx);
+		nhg_priv_ret->nhg_idx = 0;
+		nhg_priv_ret->nh_control = NULL;
+	} else {
+		DPRINTF("Unable to find nhop group!");
+	}
+
+	NHOPS_WUNLOCK(ctl);
+
+	return (nhg_priv_ret);
+}
+
+/*
+ * Checks if the group hash and/or the group index bitmask need resizing
+ * and performs the resize if necessary.
+ *
+ * @new_nh_buckets: desired number of hash buckets, 0 if no resize needed.
+ * @new_idx_items: desired number of index bitmask items, 0 if no resize
+ *  needed.
+ *
+ * Memory is allocated with M_NOWAIT before taking the nhops write lock;
+ * allocation failures silently skip the corresponding resize.
+ */
+__noinline static void
+consider_resize(struct nh_control *ctl, uint32_t new_nh_buckets, uint32_t new_idx_items)
+{
+	void *nh_ptr, *nh_idx_ptr;
+	void *old_idx_ptr;
+	size_t alloc_size;
+
+	nh_ptr = NULL;
+	if (new_nh_buckets != 0) {
+		alloc_size = CHT_SLIST_GET_RESIZE_SIZE(new_nh_buckets);
+		nh_ptr = malloc(alloc_size, M_NHOP, M_NOWAIT | M_ZERO);
+	}
+
+	nh_idx_ptr = NULL;
+	if (new_idx_items != 0) {
+		alloc_size = bitmask_get_size(new_idx_items);
+		nh_idx_ptr = malloc(alloc_size, M_NHOP, M_NOWAIT | M_ZERO);
+	}
+
+	if (nh_ptr == NULL && nh_idx_ptr == NULL) {
+		/* Either resize is not required or allocations have failed. */
+		return;
+	}
+
+	DPRINTF("mp: going to resize: nh:[ptr:%p sz:%u] idx:[ptr:%p sz:%u]",
+	    nh_ptr, new_nh_buckets, nh_idx_ptr, new_idx_items);
+
+	old_idx_ptr = NULL;
+
+	NHOPS_WLOCK(ctl);
+	if (nh_ptr != NULL) {
+		CHT_SLIST_RESIZE(&ctl->gr_head, mpath, nh_ptr, new_nh_buckets);
+	}
+	if (nh_idx_ptr != NULL) {
+		/*
+		 * The copy source and the swap target must be the same
+		 * bitmask: the group index (gr_idx_head). Swapping into
+		 * nh_idx_head would replace the nexthop index with group
+		 * index data.
+		 *
+		 * NOTE(review): the swap is gated on a non-zero
+		 * bitmask_copy() result, as in the original code -- confirm
+		 * this matches the helper's return convention.
+		 */
+		if (bitmask_copy(&ctl->gr_idx_head, nh_idx_ptr, new_idx_items))
+			bitmask_swap(&ctl->gr_idx_head, nh_idx_ptr,
+			    new_idx_items, &old_idx_ptr);
+		else {
+			/* New bitmask was not installed: release it below */
+			old_idx_ptr = nh_idx_ptr;
+		}
+	}
+	NHOPS_WUNLOCK(ctl);
+
+	/* Free the leftovers outside of the lock */
+	if (nh_ptr != NULL)
+		free(nh_ptr, M_NHOP);
+	if (old_idx_ptr != NULL)
+		free(old_idx_ptr, M_NHOP);
+}
+
+/*
+ * Function allocating the necessary group data structures:
+ * an 8-bucket group hash and a 128-item group index bitmask.
+ *
+ * Only the M_NOWAIT / M_WAITOK bits of @malloc_flags are honoured;
+ * M_ZERO is always added.
+ *
+ * Returns false if either allocation fails, true otherwise. True is also
+ * returned when another thread has initialized the structures in the
+ * meantime; in that case the freshly allocated memory is released.
+ */
+bool
+nhgrp_ctl_alloc_default(struct nh_control *ctl, int malloc_flags)
+{
+ size_t alloc_size;
+ uint32_t num_buckets, num_items;
+ void *cht_ptr, *mask_ptr;
+
+ malloc_flags = (malloc_flags & (M_NOWAIT | M_WAITOK)) | M_ZERO;
+
+ num_buckets = 8;
+ alloc_size = CHT_SLIST_GET_RESIZE_SIZE(num_buckets);
+ cht_ptr = malloc(alloc_size, M_NHOP, malloc_flags);
+
+ if (cht_ptr == NULL) {
+ DPRINTF("mpath init failed");
+ return (false);
+ }
+
+ /*
+ * Allocate nexthop index bitmask.
+ */
+ num_items = 128;
+ mask_ptr = malloc(bitmask_get_size(num_items), M_NHOP, malloc_flags);
+ if (mask_ptr == NULL) {
+ DPRINTF("mpath bitmask init failed");
+ free(cht_ptr, M_NHOP);
+ return (false);
+ }
+
+ NHOPS_WLOCK(ctl);
+
+ /* hash_size == 0 means nobody has set up the group hash yet */
+ if (ctl->gr_head.hash_size == 0) {
+ /* Init hash and bitmask */
+ CHT_SLIST_INIT(&ctl->gr_head, cht_ptr, num_buckets);
+ bitmask_init(&ctl->gr_idx_head, mask_ptr, num_items);
+ NHOPS_WUNLOCK(ctl);
+ } else {
+ /* Other thread has already initialized hash/bitmask */
+ NHOPS_WUNLOCK(ctl);
+ free(cht_ptr, M_NHOP);
+ free(mask_ptr, M_NHOP);
+ }
+
+ DPRINTF("mpath init done for fib/af %d/%d", ctl->rh->rib_fibnum,
+ ctl->rh->rib_family);
+
+ return (true);
+}
+
+/*
+ * Initializes the group hash and the group index bitmask of @ctl to an
+ * empty state (NULL storage, zero size). The real storage is allocated
+ * later by nhgrp_ctl_alloc_default(). Always returns 0.
+ */
+int
+nhgrp_ctl_init(struct nh_control *ctl)
+{
+
+ /*
+ * By default, do not allocate datastructures as multipath
+ * routes will not be necessarily used.
+ */
+ CHT_SLIST_INIT(&ctl->gr_head, NULL, 0);
+ bitmask_init(&ctl->gr_idx_head, NULL, 0);
+ return (0);
+}
+
+/*
+ * Releases the storage backing the group hash and the group index
+ * bitmask of @ctl, if any was allocated.
+ */
+void
+nhgrp_ctl_free(struct nh_control *ctl)
+{
+
+	if (ctl->gr_head.ptr != NULL) {
+		free(ctl->gr_head.ptr, M_NHOP);
+	}
+
+	if (ctl->gr_idx_head.idx != NULL) {
+		free(ctl->gr_idx_head.idx, M_NHOP);
+	}
+}
+
+/*
+ * Marks every group in the group hash of @ctl as unlinked by dropping one
+ * nhg_linked reference on each of them. The groups themselves are not
+ * removed from the hash or freed here.
+ * The caller must hold the nhops write lock.
+ */
+void
+nhgrp_ctl_unlink_all(struct nh_control *ctl)
+{
+ struct nhgrp_priv *nhg_priv;
+
+ NHOPS_WLOCK_ASSERT(ctl);
+
+ CHT_SLIST_FOREACH(&ctl->gr_head, mpath, nhg_priv) {
+ DPRINTF("Marking nhgrp %u unlinked", nhg_priv->nhg_idx);
+ /* nhg_linked starts at 2 (see alloc_nhgrp()); the result is ignored */
+ refcount_release(&nhg_priv->nhg_linked);
+ } CHT_SLIST_FOREACH_END;
+}
+
Index: head/sys/net/route/nhgrp_ctl.c
===================================================================
--- head/sys/net/route/nhgrp_ctl.c
+++ head/sys/net/route/nhgrp_ctl.c
@@ -0,0 +1,788 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2020 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+#define RTDEBUG
+#include "opt_inet.h"
+#include "opt_route.h"
+
+#include <sys/cdefs.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/lock.h>
+#include <sys/rmlock.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/refcount.h>
+#include <sys/socket.h>
+#include <sys/sysctl.h>
+#include <sys/kernel.h>
+#include <sys/epoch.h>
+
+#include <net/if.h>
+#include <net/if_var.h>
+#include <net/route.h>
+#include <net/route/route_ctl.h>
+#include <net/route/route_var.h>
+#include <net/vnet.h>
+
+#include <netinet/in.h>
+#include <netinet/in_var.h>
+#include <netinet/in_fib.h>
+
+#include <net/route/nhop_utils.h>
+#include <net/route/nhop.h>
+#include <net/route/nhop_var.h>
+#include <net/route/nhgrp_var.h>
+
+/*
+ * This file contains the supporting functions for creating multipath groups
+ * and compiling their dataplane parts.
+ */
+
+/* MPF_MULTIPATH must be the same as NHF_MULTIPATH for nhop selection to work */
+_Static_assert(MPF_MULTIPATH == NHF_MULTIPATH,
+ "MPF_MULTIPATH must be the same as NHF_MULTIPATH");
+/* Offset and size of flags field has to be the same for nhop/nhop groups */
+CHK_STRUCT_FIELD_GENERIC(struct nhop_object, nh_flags, struct nhgrp_object, nhg_flags);
+/* Cap multipath to 64, as the larger values would break rib_cmd_info bmasks */
+CTASSERT(RIB_MAX_MPATH_WIDTH <= 64);
+
+static int wn_cmp(const void *a, const void *b);
+static void sort_weightened_nhops(struct weightened_nhop *wn, int num_nhops);
+
+static struct nhgrp_priv *get_nhgrp(struct nh_control *ctl,
+ struct weightened_nhop *wn, int num_nhops, int *perror);
+static void destroy_nhgrp(struct nhgrp_priv *nhg_priv);
+static void destroy_nhgrp_epoch(epoch_context_t ctx);
+static void free_nhgrp_nhops(struct nhgrp_priv *nhg_priv);
+
+/*
+ * qsort() comparator for struct weightened_nhop: orders by weight
+ * ascending, breaking ties by the nexthop pointer value.
+ */
+static int
+wn_cmp(const void *a, const void *b)
+{
+	const struct weightened_nhop *lhs = a;
+	const struct weightened_nhop *rhs = b;
+
+	if (lhs->weight != rhs->weight)
+		return ((lhs->weight > rhs->weight) ? 1 : -1);
+
+	/* Equal weights: fall back to comparing nexthops by pointer */
+	if (lhs->nh != rhs->nh)
+		return ((lhs->nh > rhs->nh) ? 1 : -1);
+
+	return (0);
+}
+
+/*
+ * Perform in-place sorting for array of @num_nhops nexthops in @wn.
+ *
+ * To avoid nh groups duplication, nexthops/weights in the
+ * @wn need to be ordered deterministically.
+ * As this sorting is needed only for the control plane functionality,
+ * there are no specific external requirements.
+ *
+ * Sort by weight first, to ease calculation of the slot sizes.
+ * Ties are broken by nexthop pointer value (see wn_cmp()).
+ */
+static void
+sort_weightened_nhops(struct weightened_nhop *wn, int num_nhops)
+{
+
+ qsort(wn, num_nhops, sizeof(struct weightened_nhop), wn_cmp);
+}
+
+/*
+ * Calculate minimum number of slots required to fit the existing
+ * set of weights in the common use case where weights are "easily"
+ * comparable.
+ * Assumes @wn is sorted by weight ascending and each weight is > 0.
+ * Returns number of slots or 0 if precise calculation failed. An empty
+ * list or a zero first weight is reported as a failure (0) as well,
+ * instead of dividing by zero.
+ *
+ * Some examples:
+ * note: (i, X) pair means (nhop=i, weight=X):
+ * (1, 1) (2, 2) -> 3 slots [1, 2, 2]
+ * (1, 100), (2, 200) -> 3 slots [1, 2, 2]
+ * (1, 100), (2, 200), (3, 400) -> 7 slots [1, 2, 2, 3, 3, 3, 3]
+ */
+static uint32_t
+calc_min_mpath_slots_fast(const struct weightened_nhop *wn, size_t num_items)
+{
+	uint32_t i, last, xmin;
+	uint64_t total = 0;
+
+	/*
+	 * xmin is seeded from the first weight and is later only replaced
+	 * by non-zero differences, so this is the single place where it
+	 * could become a zero divisor.
+	 */
+	if (num_items == 0 || wn[0].weight == 0)
+		return (0);
+
+	last = 0;
+	xmin = wn[0].weight;
+	for (i = 0; i < num_items; i++) {
+		total += wn[i].weight;
+		/* Track the smallest non-zero step between adjacent weights */
+		if ((wn[i].weight - last < xmin) && (wn[i].weight != last))
+			xmin = wn[i].weight - last;
+		last = wn[i].weight;
+	}
+	/* xmin is the minimum unit of desired capacity */
+	if ((total % xmin) != 0)
+		return (0);
+	for (i = 0; i < num_items; i++) {
+		if ((wn[i].weight % xmin) != 0)
+			return (0);
+	}
+
+	return ((uint32_t)(total / xmin));
+}
+
+/*
+ * Calculate the number of dataplane slots needed to represent the
+ * weights in @wn while keeping their ratios.
+ *
+ * Assume @wn is sorted by weight ascending and each weight is > 0.
+ *
+ * The precise answer from calc_min_mpath_slots_fast() is used when it
+ * exists and fits; in every other case (no precise solution, or one
+ * exceeding the limit) RIB_MAX_MPATH_WIDTH is returned.
+ */
+static uint32_t
+calc_min_mpath_slots(const struct weightened_nhop *wn, size_t num_items)
+{
+	uint32_t slots = calc_min_mpath_slots_fast(wn, num_items);
+
+	if (slots != 0 && slots <= RIB_MAX_MPATH_WIDTH)
+		return (slots);
+
+	slots = RIB_MAX_MPATH_WIDTH;
+	return (slots);
+}
+
+/*
+ * Nexthop group data consists of
+ * 1) dataplane part, with nhgrp_object as a header followed by an
+ * arbitrary number of nexthop pointers.
+ * 2) control plane part, with nhgrp_priv as a header, followed by
+ * an arbitrary number of 'struct weightened_nhop' objects.
+ *
+ * Given nexthop groups are (mostly) immutable, allocate all data
+ * in one go.
+ *
+ * Returns the number of bytes needed for a group with @nhg_size
+ * dataplane slots and @num_nhops control plane entries.
+ */
+__noinline static size_t
+get_nhgrp_alloc_size(uint32_t nhg_size, uint32_t num_nhops)
+{
+	size_t dataplane_sz, control_sz;
+
+	dataplane_sz = sizeof(struct nhgrp_object) +
+	    nhg_size * sizeof(struct nhop_object *);
+	control_sz = sizeof(struct nhgrp_priv) +
+	    num_nhops * sizeof(struct weightened_nhop);
+
+	return (dataplane_sz + control_sz);
+}
+
+/*
+ * Compile actual list of nexthops to be used by datapath from
+ * the nexthop group @dst_priv, filling dst_priv->nhg->nhops[].
+ *
+ * @x is the list of weighted nexthops (dst_priv->nhg_nh_count entries),
+ * @num_slots is the number of dataplane slots to distribute.
+ *
+ * Nexthops are processed in the order given by @x; each one receives
+ * a share of the still-unassigned slots proportional to its weight
+ * relative to the still-unprocessed weights (integer division).
+ *
+ * For example, compiling control plane list of 2 nexthops
+ * [(200, A), (100, B)] would result in the datapath array
+ * [A, A, B]
+ */
+static void
+compile_nhgrp(struct nhgrp_priv *dst_priv, const struct weightened_nhop *x,
+ uint32_t num_slots)
+{
+ struct nhgrp_object *dst;
+ int i, slot_idx, remaining_slots;
+ uint64_t remaining_sum, nh_weight, nh_slots;
+
+ slot_idx = 0;
+ dst = dst_priv->nhg;
+ /* Calculate sum of all weights */
+ remaining_sum = 0;
+ for (i = 0; i < dst_priv->nhg_nh_count; i++)
+ remaining_sum += x[i].weight;
+ remaining_slots = num_slots;
+ DPRINTF("O: %u/%u", (uint32_t)remaining_sum, remaining_slots);
+ for (i = 0; i < dst_priv->nhg_nh_count; i++) {
+ /* Calculate number of slots for the current nexthop */
+ if (remaining_sum > 0) {
+ nh_weight = (uint64_t)x[i].weight;
+ nh_slots = (nh_weight * remaining_slots / remaining_sum);
+ } else
+ nh_slots = 0;
+
+ /* Account for this nexthop before moving to the next one */
+ remaining_sum -= x[i].weight;
+ remaining_slots -= nh_slots;
+
+ DPRINTF(" OO[%d]: %u/%u curr=%d slot_idx=%d", i,
+ (uint32_t)remaining_sum, remaining_slots,
+ (int)nh_slots, slot_idx);
+
+ KASSERT((slot_idx + nh_slots <= num_slots),
+ ("index overflow during nhg compilation"));
+ /* Fill the next nh_slots consecutive slots with this nexthop */
+ while (nh_slots-- > 0)
+ dst->nhops[slot_idx++] = x[i].nh;
+ }
+}
+
+/*
+ * Allocates new nexthop group for the list of weightened nexthops.
+ * Assume sorted list.
+ * Does NOT reference any nexthops in the group.
+ * Returns group with refcount=1 or NULL.
+ *
+ * The dataplane and control plane parts are allocated as one M_NOWAIT
+ * zeroed chunk; @wn is copied into the control plane part and the
+ * dataplane array is compiled from it.
+ */
+static struct nhgrp_priv *
+alloc_nhgrp(struct weightened_nhop *wn, int num_nhops)
+{
+ uint32_t nhgrp_size;
+ int flags = M_NOWAIT;
+ struct nhgrp_object *nhg;
+ struct nhgrp_priv *nhg_priv;
+
+ nhgrp_size = calc_min_mpath_slots(wn, num_nhops);
+ /*
+ * NOTE(review): calc_min_mpath_slots() falls back to
+ * RIB_MAX_MPATH_WIDTH rather than returning 0, so this branch looks
+ * unreachable unless that constant is 0 -- confirm.
+ */
+ if (nhgrp_size == 0) {
+ /* Zero weights, abort */
+ return (NULL);
+ }
+
+ size_t sz = get_nhgrp_alloc_size(nhgrp_size, num_nhops);
+ nhg = malloc(sz, M_NHOP, flags | M_ZERO);
+ if (nhg == NULL) {
+ return (NULL);
+ }
+
+ /* Has to be the first to make NHGRP_PRIV() work */
+ nhg->nhg_size = nhgrp_size;
+ /* Note: the value logged below is the slot count, not the nhop count */
+ DPRINTF("new mpath group: num_nhops: %u", (uint32_t)nhgrp_size);
+ nhg->nhg_flags = MPF_MULTIPATH;
+
+ nhg_priv = NHGRP_PRIV(nhg);
+ nhg_priv->nhg_nh_count = num_nhops;
+ refcount_init(&nhg_priv->nhg_refcount, 1);
+
+ /* Please see nhgrp_free() comments on the initial value */
+ refcount_init(&nhg_priv->nhg_linked, 2);
+
+ nhg_priv->nhg = nhg;
+ memcpy(&nhg_priv->nhg_nh_weights[0], wn,
+ num_nhops * sizeof(struct weightened_nhop));
+
+ compile_nhgrp(nhg_priv, wn, nhg->nhg_size);
+
+ return (nhg_priv);
+}
+
+/*
+ * Drops one reference on the nexthop group @nhg.
+ *
+ * When the last reference goes away, the group is unlinked from its
+ * nh_control (unless the rib has already marked it unlinked) and its
+ * destruction is scheduled via an epoch callback.
+ */
+void
+nhgrp_free(struct nhgrp_object *nhg)
+{
+ struct nhgrp_priv *nhg_priv;
+ struct nh_control *ctl;
+ struct epoch_tracker et;
+
+ nhg_priv = NHGRP_PRIV(nhg);
+
+ /* Nothing more to do unless this was the last reference */
+ if (!refcount_release(&nhg_priv->nhg_refcount))
+ return;
+
+ /*
+ * group objects don't have an explicit lock attached to it.
+ * As groups are reclaimed based on reference count, it is possible
+ * that some groups will persist after vnet destruction callback
+ * called. Given that, handle scenario with nhgrp_free() being
+ * called either after or simultaneously with nhgrp_ctl_unlink_all()
+ * by using another reference counter: nhg_linked.
+ *
+ * There are only 2 places, where nhg_linked can be decreased:
+ * rib destroy (nhgrp_ctl_unlink_all) and this function.
+ * nhg_linked can never be increased.
+ *
+ * Hence, use initial value of 2 to make use of
+ * refcount_release_if_not_last().
+ *
+ * There can be two scenarios when calling this function:
+ *
+ * 1) nhg_linked value is 2. This means that either
+ * nhgrp_ctl_unlink_all() has not been called OR it is running,
+ * but we are guaranteed that nh_control won't be freed in
+ * this epoch. Hence, nexthop can be safely unlinked.
+ *
+ * 2) nhg_linked value is 1. In that case, nhgrp_ctl_unlink_all()
+ * has been called and nhgrp unlink can be skipped.
+ */
+
+ NET_EPOCH_ENTER(et);
+ if (refcount_release_if_not_last(&nhg_priv->nhg_linked)) {
+ ctl = nhg_priv->nh_control;
+ if (unlink_nhgrp(ctl, nhg_priv) == NULL) {
+ /* Do not try to reclaim */
+ /* The group is left allocated on this path */
+ DPRINTF("Failed to unlink nexhop group %p", nhg_priv);
+ NET_EPOCH_EXIT(et);
+ return;
+ }
+ }
+ NET_EPOCH_EXIT(et);
+
+ /* Defer the actual destruction to destroy_nhgrp_epoch() */
+ epoch_call(net_epoch_preempt, destroy_nhgrp_epoch,
+ &nhg_priv->nhg_epoch_ctx);
+}
+
+/*
+ * Destroys all local resources belonging to @nhg_priv.
+ *
+ * Frees the memory pointed to by nhg_priv->nhg. Nexthop references
+ * held by the group are NOT released here.
+ */
+__noinline static void
+destroy_nhgrp_int(struct nhgrp_priv *nhg_priv)
+{
+
+ free(nhg_priv->nhg, M_NHOP);
+}
+
+/*
+ * Final destructor for a group that is no longer referenced or linked:
+ * releases the nexthop references held by the group and frees its memory.
+ *
+ * Asserts that the group refcount and the group index are both 0.
+ */
+__noinline static void
+destroy_nhgrp(struct nhgrp_priv *nhg_priv)
+{
+
+ KASSERT((nhg_priv->nhg_refcount == 0), ("nhg_refcount != 0"));
+
+ DPRINTF("DEL MPATH %p", nhg_priv);
+
+ KASSERT((nhg_priv->nhg_idx == 0), ("gr_idx != 0"));
+
+ free_nhgrp_nhops(nhg_priv);
+
+ destroy_nhgrp_int(nhg_priv);
+}
+
+/*
+ * Epoch callback scheduled by nhgrp_free(): recovers the group from its
+ * embedded epoch context @ctx and destroys it.
+ */
+static void
+destroy_nhgrp_epoch(epoch_context_t ctx)
+{
+
+	destroy_nhgrp(__containerof(ctx, struct nhgrp_priv, nhg_epoch_ctx));
+}
+
+/*
+ * Takes a reference on every nexthop listed in @nhg_priv.
+ *
+ * Returns true if all nexthops were referenced. If any nexthop cannot be
+ * referenced, the references taken so far are dropped and false is
+ * returned.
+ */
+static bool
+ref_nhgrp_nhops(struct nhgrp_priv *nhg_priv)
+{
+	int failed_at = -1;
+
+	for (int i = 0; i < nhg_priv->nhg_nh_count; i++) {
+		if (nhop_try_ref_object(nhg_priv->nhg_nh_weights[i].nh) == 0) {
+			/* Nexthop could not be referenced, b/c it's deleted */
+			failed_at = i;
+			break;
+		}
+	}
+
+	if (failed_at < 0)
+		return (true);
+
+	/* Roll back the references acquired before the failure */
+	for (int j = 0; j < failed_at; j++)
+		nhop_free(nhg_priv->nhg_nh_weights[j].nh);
+
+	return (false);
+}
+
+/*
+ * Drops one reference on every nexthop listed in @nhg_priv.
+ */
+static void
+free_nhgrp_nhops(struct nhgrp_priv *nhg_priv)
+{
+	int idx = 0;
+
+	while (idx < nhg_priv->nhg_nh_count) {
+		nhop_free(nhg_priv->nhg_nh_weights[idx].nh);
+		idx++;
+	}
+}
+
+/*
+ * Creates or looks up an existing nexthop group based on @wn and @num_nhops.
+ * @wn is sorted in place.
+ *
+ * Returns referenced nhop group or NULL, passing error code in @perror:
+ *  E2BIG  - more than RIB_MAX_MPATH_WIDTH nexthops requested
+ *  ENOMEM - group datastructures or the group itself cannot be allocated
+ *  EEXIST - duplicate nexthop detected in @wn
+ *  EAGAIN - a nexthop is being deleted or no group index is available;
+ *           the operation can be retried
+ * On success @perror is set to 0.
+ */
+struct nhgrp_priv *
+get_nhgrp(struct nh_control *ctl, struct weightened_nhop *wn, int num_nhops,
+    int *perror)
+{
+	struct nhgrp_priv *key, *nhg_priv;
+
+	if (num_nhops > RIB_MAX_MPATH_WIDTH) {
+		*perror = E2BIG;
+		return (NULL);
+	}
+
+	if (ctl->gr_head.hash_size == 0) {
+		/* First multipath request. Bootstrap mpath datastructures. */
+		if (nhgrp_ctl_alloc_default(ctl, M_NOWAIT) == 0) {
+			*perror = ENOMEM;
+			return (NULL);
+		}
+	}
+
+	/*
+	 * Sort nexthops & check there are no duplicates.
+	 * NOTE(review): only neighbouring entries are compared and the
+	 * sort key is (weight, pointer), so the same nexthop listed with
+	 * two different weights may go undetected -- confirm intended.
+	 */
+	sort_weightened_nhops(wn, num_nhops);
+	uint32_t last_id = 0;
+	for (int i = 0; i < num_nhops; i++) {
+		if (wn[i].nh->nh_priv->nh_idx == last_id) {
+			*perror = EEXIST;
+			return (NULL);
+		}
+		last_id = wn[i].nh->nh_priv->nh_idx;
+	}
+
+	if ((key = alloc_nhgrp(wn, num_nhops)) == NULL) {
+		*perror = ENOMEM;
+		return (NULL);
+	}
+
+	nhg_priv = find_nhgrp(ctl, key);
+	if (nhg_priv != NULL) {
+		/*
+		 * Free originally-created group. As it hasn't been linked
+		 * and the dependent nexhops haven't been referenced, just free
+		 * the group.
+		 */
+		destroy_nhgrp_int(key);
+		*perror = 0;
+		return (nhg_priv);
+	}
+
+	/* No existing group, try to link the new one */
+	if (!ref_nhgrp_nhops(key)) {
+		/*
+		 * Some of the nexthops have been scheduled for deletion.
+		 * As the group hasn't been linked / no nexhops have been
+		 * referenced, call the final destructor immediately.
+		 */
+		destroy_nhgrp_int(key);
+		*perror = EAGAIN;
+		return (NULL);
+	}
+
+	if (link_nhgrp(ctl, key) == 0) {
+		/*
+		 * Unable to allocate index.
+		 * The group was never linked, but its nexthops have been
+		 * referenced above: drop those references and free the
+		 * memory. destroy_nhgrp() cannot be used here since the
+		 * group still carries its initial reference, which would
+		 * trip the refcount assertion there.
+		 * Report the failure instead of handing out a freed group.
+		 */
+		free_nhgrp_nhops(key);
+		destroy_nhgrp_int(key);
+		*perror = EAGAIN;
+		return (NULL);
+	}
+
+	*perror = 0;
+	return (key);
+}
+
+/*
+ * Appends one or more nexthops denoted by @wn to the nexthop group @gr_orig.
+ *
+ * Returns referenced nexthop group or NULL. In the latter case, @perror is
+ * filled with an error code (ENOMEM if the temporary list cannot be
+ * allocated, otherwise whatever get_nhgrp() reports).
+ * Note that function does NOT care if the next nexthops already exists
+ * in the @gr_orig. As a result, they will be added, resulting in the
+ * same nexthop being present multiple times in the new group.
+ */
+static struct nhgrp_priv *
+append_nhops(struct nh_control *ctl, const struct nhgrp_object *gr_orig,
+    struct weightened_nhop *wn, int num_nhops, int *perror)
+{
+	/*
+	 * On-stack buffer for the common case of <= 4 paths. Declared with
+	 * the element type (rather than as a char array) so that it is
+	 * guaranteed to be suitably aligned for struct weightened_nhop.
+	 */
+	struct weightened_nhop storage[4];
+	struct weightened_nhop *pnhops;
+	struct nhgrp_priv *nhg_priv;
+	const struct nhgrp_priv *src_priv;
+	size_t sz;
+	int curr_nhops;
+
+	src_priv = NHGRP_PRIV_CONST(gr_orig);
+	curr_nhops = src_priv->nhg_nh_count;
+
+	*perror = 0;
+
+	sz = (src_priv->nhg_nh_count + num_nhops) * (sizeof(struct weightened_nhop));
+	if (sz <= sizeof(storage))
+		pnhops = storage;
+	else {
+		pnhops = malloc(sz, M_TEMP, M_NOWAIT);
+		if (pnhops == NULL) {
+			*perror = ENOMEM;
+			return (NULL);
+		}
+	}
+
+	/* Copy nhops from original group first */
+	memcpy(pnhops, src_priv->nhg_nh_weights,
+	    curr_nhops * sizeof(struct weightened_nhop));
+	memcpy(&pnhops[curr_nhops], wn, num_nhops * sizeof(struct weightened_nhop));
+	curr_nhops += num_nhops;
+
+	/* NULL result is passed through, with the error already in @perror */
+	nhg_priv = get_nhgrp(ctl, pnhops, curr_nhops, perror);
+
+	/* Release the temporary list if it did not fit on the stack */
+	if (pnhops != storage)
+		free(pnhops, M_TEMP);
+
+	return (nhg_priv);
+}
+
+
+/*
+ * Looks up or creates the nexthop group for the @num_nhops nexthops in @wn.
+ *
+ * On success returns 0 and stores the group in @rnd. On failure returns
+ * the errno reported by get_nhgrp() and leaves the group pointer in @rnd
+ * untouched. The weight in @rnd is set to 0 in both cases.
+ *
+ * If the error is EAGAIN, then the operation can be retried.
+ */
+int
+nhgrp_get_group(struct rib_head *rh, struct weightened_nhop *wn, int num_nhops,
+    struct route_nhop_data *rnd)
+{
+	struct nhgrp_priv *grp;
+	int error;
+
+	grp = get_nhgrp(rh->nh_control, wn, num_nhops, &error);
+	if (grp != NULL) {
+		rnd->rnd_nhgrp = grp->nhg;
+	}
+	rnd->rnd_weight = 0;
+
+	return (error);
+}
+
+/*
+ * Builds nexthop data based on group @src with every nexthop for which
+ * @flt_func(nhop, @flt_data) returns non-zero left out.
+ *
+ * Depending on the number of nexthops left, @rnd receives:
+ *  0  - NULL group and zero weight
+ *  1  - that single nexthop and its weight; the nexthop is referenced
+ *       with nhop_try_ref_object(), EAGAIN is returned if that fails
+ *  2+ - the group obtained from get_nhgrp() and zero weight
+ *
+ * Returns 0 on success or errno.
+ */
+int
+nhgrp_get_filtered_group(struct rib_head *rh, const struct nhgrp_object *src,
+    nhgrp_filter_cb_t flt_func, void *flt_data, struct route_nhop_data *rnd)
+{
+	char storage[64];
+	struct nh_control *ctl = rh->nh_control;
+	const struct nhgrp_priv *src_priv = NHGRP_PRIV_CONST(src);
+	const struct nhgrp_priv *new_priv;
+	struct weightened_nhop *kept;
+	size_t bytes;
+	int error = 0, num_kept = 0;
+
+	bytes = src_priv->nhg_nh_count * (sizeof(struct weightened_nhop));
+	/* optimize for <= 4 paths, each path=16 bytes */
+	kept = (struct weightened_nhop *)&storage[0];
+	if (bytes > sizeof(storage)) {
+		kept = malloc(bytes, M_TEMP, M_NOWAIT);
+		if (kept == NULL)
+			return (ENOMEM);
+	}
+
+	/* Copy over the nexthops which are not rejected by the filter */
+	for (int i = 0; i < src_priv->nhg_nh_count; i++) {
+		if (!flt_func(src_priv->nhg_nh_weights[i].nh, flt_data)) {
+			memcpy(&kept[num_kept], &src_priv->nhg_nh_weights[i],
+			    sizeof(struct weightened_nhop));
+			num_kept++;
+		}
+	}
+
+	switch (num_kept) {
+	case 0:
+		/* Everything has been filtered out */
+		rnd->rnd_nhgrp = NULL;
+		rnd->rnd_weight = 0;
+		break;
+	case 1:
+		/* Single nexthop left, no group is needed */
+		rnd->rnd_nhop = kept[0].nh;
+		rnd->rnd_weight = kept[0].weight;
+		if (nhop_try_ref_object(rnd->rnd_nhop) == 0)
+			error = EAGAIN;
+		break;
+	default:
+		new_priv = get_nhgrp(ctl, kept, num_kept, &error);
+		if (new_priv != NULL)
+			rnd->rnd_nhgrp = new_priv->nhg;
+		rnd->rnd_weight = 0;
+		break;
+	}
+
+	if (kept != (struct weightened_nhop *)&storage[0])
+		free(kept, M_TEMP);
+
+	return (error);
+}
+
+/*
+ * Combines the existing group/nhop in @rnd_orig with the to-be-added
+ * nhop from @rnd_add.
+ *
+ * If @rnd_orig holds no nexthop, @rnd_add is copied to @rnd_new and its
+ * nexthop is referenced. Otherwise, a nexthop group containing both is
+ * requested and stored in @rnd_new with zero weight.
+ *
+ * Returns 0 on success or errno, leaving the group in @rnd_new unset.
+ */
+int
+nhgrp_get_addition_group(struct rib_head *rh, struct route_nhop_data *rnd_orig,
+    struct route_nhop_data *rnd_add, struct route_nhop_data *rnd_new)
+{
+	struct nh_control *ctl = rh->nh_control;
+	struct nhgrp_priv *grp;
+	struct weightened_nhop wn[2];
+	int error;
+
+	if (rnd_orig->rnd_nhop == NULL) {
+		/* No paths to add to, just reference current nhop */
+		*rnd_new = *rnd_add;
+		if (nhop_try_ref_object(rnd_new->rnd_nhop) != 0)
+			return (0);
+		return (EAGAIN);
+	}
+
+	wn[0].nh = rnd_add->rnd_nhop;
+	wn[0].weight = rnd_add->rnd_weight;
+
+	if (NH_IS_NHGRP(rnd_orig->rnd_nhop)) {
+		/* Existing group: request a group with one more nhop */
+		grp = append_nhops(ctl, rnd_orig->rnd_nhgrp, &wn[0], 1,
+		    &error);
+	} else {
+		/* Simple merge of 2 non-multipath nexthops */
+		wn[1].nh = rnd_orig->rnd_nhop;
+		wn[1].weight = rnd_orig->rnd_weight;
+		grp = get_nhgrp(ctl, wn, 2, &error);
+	}
+
+	if (grp != NULL) {
+		rnd_new->rnd_nhgrp = grp->nhg;
+		rnd_new->rnd_weight = 0;
+		return (0);
+	}
+
+	return (error);
+}
+
+/*
+ * Provides access to the weighted nexthops of group @nhg.
+ *
+ * Returns pointer to the first item of the array kept in the group
+ * private data and stores the number of items into @pnum_nhops.
+ */
+struct weightened_nhop *
+nhgrp_get_nhops(struct nhgrp_object *nhg, uint32_t *pnum_nhops)
+{
+	struct nhgrp_priv *priv;
+
+	KASSERT(((nhg->nhg_flags & MPF_MULTIPATH) != 0), ("nhop is not mpath"));
+
+	priv = NHGRP_PRIV(nhg);
+	*pnum_nhops = priv->nhg_nh_count;
+
+	return (&priv->nhg_nh_weights[0]);
+}
+
+/*
+ * Serializes a single nexthop group @nhg_priv into @buffer and passes
+ * the result to SYSCTL_OUT() for request @w.
+ *
+ * The record consists of rt_msghdr, nhgrp_external, a container with
+ * the control plane nexthops (index and weight) and a container with
+ * the dataplane nexthops (index only, weight is reported as 0).
+ * @buffer_size is only used to assert that the record fits; @rh is not
+ * used by this function.
+ *
+ * Returns the value returned by SYSCTL_OUT().
+ */
+__noinline static int
+dump_nhgrp_entry(struct rib_head *rh, const struct nhgrp_priv *nhg_priv,
+ char *buffer, size_t buffer_size, struct sysctl_req *w)
+{
+ struct rt_msghdr *rtm;
+ struct nhgrp_external *nhge;
+ struct nhgrp_container *nhgc;
+ const struct nhgrp_object *nhg;
+ struct nhgrp_nhop_external *ext;
+ int error;
+ size_t sz;
+
+ nhg = nhg_priv->nhg;
+
+ /* Calculate the size of the record for this particular group */
+ sz = sizeof(struct rt_msghdr) + sizeof(struct nhgrp_external);
+ /* controlplane nexthops */
+ sz += sizeof(struct nhgrp_container);
+ sz += sizeof(struct nhgrp_nhop_external) * nhg_priv->nhg_nh_count;
+ /* dataplane nexthops */
+ sz += sizeof(struct nhgrp_container);
+ sz += sizeof(struct nhgrp_nhop_external) * nhg->nhg_size;
+
+ /*
+ * NOTE(review): the only caller visible here sizes the buffer for
+ * RIB_MAX_MPATH_WIDTH items per container. Whether nhg_size is bounded
+ * by that value is not visible here - confirm, as KASSERT is the only
+ * guard against overrunning @buffer.
+ */
+ KASSERT(sz <= buffer_size, ("increase nhgrp buffer size"));
+
+ bzero(buffer, sz);
+
+ rtm = (struct rt_msghdr *)buffer;
+ rtm->rtm_msglen = sz;
+ rtm->rtm_version = RTM_VERSION;
+ rtm->rtm_type = RTM_GET;
+
+ /* Group header immediately follows the message header */
+ nhge = (struct nhgrp_external *)(rtm + 1);
+
+ nhge->nhg_idx = nhg_priv->nhg_idx;
+ nhge->nhg_refcount = nhg_priv->nhg_refcount;
+
+ /* fill in control plane nexthops first */
+ nhgc = (struct nhgrp_container *)(nhge + 1);
+ nhgc->nhgc_type = NHG_C_TYPE_CNHOPS;
+ nhgc->nhgc_subtype = 0;
+ nhgc->nhgc_len = sizeof(struct nhgrp_container);
+ nhgc->nhgc_len += sizeof(struct nhgrp_nhop_external) * nhg_priv->nhg_nh_count;
+ nhgc->nhgc_count = nhg_priv->nhg_nh_count;
+
+ ext = (struct nhgrp_nhop_external *)(nhgc + 1);
+ for (int i = 0; i < nhg_priv->nhg_nh_count; i++) {
+ ext[i].nh_idx = nhg_priv->nhg_nh_weights[i].nh->nh_priv->nh_idx;
+ ext[i].nh_weight = nhg_priv->nhg_nh_weights[i].weight;
+ }
+
+ /* fill in dataplane nexthops */
+ /* The second container starts right after the last control plane item */
+ nhgc = (struct nhgrp_container *)(&ext[nhg_priv->nhg_nh_count]);
+ nhgc->nhgc_type = NHG_C_TYPE_DNHOPS;
+ nhgc->nhgc_subtype = 0;
+ nhgc->nhgc_len = sizeof(struct nhgrp_container);
+ nhgc->nhgc_len += sizeof(struct nhgrp_nhop_external) * nhg->nhg_size;
+ nhgc->nhgc_count = nhg->nhg_size;
+
+ ext = (struct nhgrp_nhop_external *)(nhgc + 1);
+ for (int i = 0; i < nhg->nhg_size; i++) {
+ ext[i].nh_idx = nhg->nhops[i]->nh_priv->nh_idx;
+ ext[i].nh_weight = 0;
+ }
+
+ error = SYSCTL_OUT(w, buffer, sz);
+
+ return (error);
+}
+
+/*
+ * Dumps all nexthop groups linked to the nexthop control structure of @rh
+ * into sysctl request @w, one record per group.
+ *
+ * Returns 0 on success (including the case when there are no groups)
+ * or the first error returned by dump_nhgrp_entry().
+ */
+int
+nhgrp_dump_sysctl(struct rib_head *rh, struct sysctl_req *w)
+{
+	struct nh_control *ctl = rh->nh_control;
+	struct epoch_tracker et;
+	struct nhgrp_priv *nhg_priv;
+	char *record;
+	size_t record_size;
+	int error;
+
+	if (ctl->gr_head.items_count == 0)
+		return (0);
+
+	/* Calculate the maximum nhop group size in bytes */
+	record_size = sizeof(struct rt_msghdr) + sizeof(struct nhgrp_external) +
+	    2 * sizeof(struct nhgrp_container) +
+	    2 * sizeof(struct nhgrp_nhop_external) * RIB_MAX_MPATH_WIDTH;
+	record = malloc(record_size, M_TEMP, M_WAITOK);
+
+	error = 0;
+	NET_EPOCH_ENTER(et);
+	NHOPS_RLOCK(ctl);
+	CHT_SLIST_FOREACH(&ctl->gr_head, mpath, nhg_priv) {
+		/* The same buffer is reused for every group */
+		error = dump_nhgrp_entry(rh, nhg_priv, record, record_size, w);
+		if (error != 0)
+			break;
+	} CHT_SLIST_FOREACH_END;
+	NHOPS_RUNLOCK(ctl);
+	NET_EPOCH_EXIT(et);
+
+	free(record, M_TEMP);
+
+	return (error);
+}
Index: head/sys/net/route/nhgrp_var.h
===================================================================
--- head/sys/net/route/nhgrp_var.h
+++ head/sys/net/route/nhgrp_var.h
@@ -0,0 +1,72 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2020 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * This header file contains private definitions for the nexthop groups.
+ *
+ * Header is not intended to be included by the code external to the
+ * routing subsystem.
+ */
+
+#ifndef _NET_ROUTE_NHGRP_VAR_H_
+#define _NET_ROUTE_NHGRP_VAR_H_
+
+/* nhgrp hash definition */
+/* produce hash value for an object */
+#define mpath_hash_obj(_obj) (hash_nhgrp(_obj))
+/* compare two objects */
+#define mpath_cmp(_one, _two) (cmp_nhgrp(_one, _two))
+/* next object accessor */
+#define mpath_next(_obj) (_obj)->nhg_priv_next
+
+/*
+ * Private (control plane) part of a nexthop group.
+ * NHGRP_PRIV() locates it right after the nhops[] array of the
+ * corresponding struct nhgrp_object.
+ */
+struct nhgrp_priv {
+ /* group index, reported as nhg_idx in the nexthop group sysctl dump */
+ uint32_t nhg_idx;
+ uint8_t nhg_nh_count; /* number of items in nh_weights */
+ /* presumably reserved space / explicit padding - TODO confirm */
+ uint8_t nhg_spare[3];
+ u_int nhg_refcount; /* use refcount */
+ u_int nhg_linked; /* refcount(9), == 2 if linked to the list */
+ struct nh_control *nh_control; /* parent control structure */
+ /* next object in the hash chain, accessed via mpath_next() */
+ struct nhgrp_priv *nhg_priv_next;
+ /* the nhgrp_object this private data belongs to */
+ struct nhgrp_object *nhg;
+ struct epoch_context nhg_epoch_ctx; /* epoch data for nhop */
+ /* nexthops with their weights, nhg_nh_count items */
+ struct weightened_nhop nhg_nh_weights[0];
+};
+
+/*
+ * Accessors for the private part of nexthop group object @_src.
+ * _NHGRP_PRIV() yields the address right past the last nhops[] slot
+ * (&nhops[nhg_size]); NHGRP_PRIV() / NHGRP_PRIV_CONST() interpret that
+ * address as (const) struct nhgrp_priv.
+ * Note that @_src is evaluated twice.
+ */
+#define _NHGRP_PRIV(_src) (&(_src)->nhops[(_src)->nhg_size])
+#define NHGRP_PRIV(_src) ((struct nhgrp_priv *)_NHGRP_PRIV(_src))
+#define NHGRP_PRIV_CONST(_src) ((const struct nhgrp_priv *)_NHGRP_PRIV(_src))
+
+/* nhgrp.c */
+bool nhgrp_ctl_alloc_default(struct nh_control *ctl, int malloc_flags);
+struct nhgrp_priv *find_nhgrp(struct nh_control *ctl, const struct nhgrp_priv *key);
+int link_nhgrp(struct nh_control *ctl, struct nhgrp_priv *grp_priv);
+struct nhgrp_priv *unlink_nhgrp(struct nh_control *ctl, struct nhgrp_priv *key);
+
+#endif
+
Index: head/sys/net/route/nhop.h
===================================================================
--- head/sys/net/route/nhop.h
+++ head/sys/net/route/nhop.h
@@ -155,7 +155,7 @@
*/
#define NH_IS_VALID(_nh) RT_LINK_IS_UP((_nh)->nh_ifp)
-#define NH_IS_MULTIPATH(_nh) ((_nh)->nh_flags & NHF_MULTIPATH)
+#define NH_IS_NHGRP(_nh) ((_nh)->nh_flags & NHF_MULTIPATH)
#define RT_GATEWAY(_rt) ((struct sockaddr *)&(_rt)->rt_nhop->gw4_sa)
#define RT_GATEWAY_CONST(_rt) ((const struct sockaddr *)&(_rt)->rt_nhop->gw4_sa)
@@ -166,6 +166,11 @@
_nh = NULL; \
} while (0)
+/* Nexthop paired with its weight; nexthop groups are described by arrays of these */
+struct weightened_nhop {
+ struct nhop_object *nh; /* nexthop */
+ uint32_t weight; /* weight of @nh */
+};
+
void nhop_free(struct nhop_object *nh);
struct sysctl_req;
@@ -209,16 +214,34 @@
uint16_t src_sa_off; /* offset of src address SA */
};
-struct mpath_nhop_external {
+#define NHG_C_TYPE_CNHOPS 0x1 /* Control plane nhops list */
+#define NHG_C_TYPE_DNHOPS 0x2 /* Dataplane nhops list */
+struct nhgrp_container {
+ uint32_t nhgc_len; /* container length */
+ uint16_t nhgc_count; /* number of items */
+ uint8_t nhgc_type; /* container type */
+ uint8_t nhgc_subtype; /* container subtype */
+};
+
+struct nhgrp_nhop_external {
uint32_t nh_idx;
uint32_t nh_weight;
};
-struct mpath_external {
- uint32_t mp_idx;
- uint32_t mp_refcount;
- uint32_t mp_nh_count;
- uint32_t mp_group_size;
+/*
+ * Layout:
+ * - nhgrp_external
+ * - nhgrp_container (control plane nhops list)
+ * - nhgrp_nhop_external
+ * - nhgrp_nhop_external
+ * ..
+ * - nhgrp_container (dataplane nhops list)
+ * - nhgrp_nhop_external
+ * - nhgrp_nhop_external
+ */
+struct nhgrp_external {
+ uint32_t nhg_idx; /* Nexthop group index */
+ uint32_t nhg_refcount; /* number of references */
};
#endif
Index: head/sys/net/route/nhop.c
===================================================================
--- head/sys/net/route/nhop.c
+++ head/sys/net/route/nhop.c
@@ -64,7 +64,7 @@
* is backed by the bitmask array.
*/
-static MALLOC_DEFINE(M_NHOP, "nhops", "nexthops data");
+MALLOC_DEFINE(M_NHOP, "nhops", "nexthops data");
/* Hash management functions */
@@ -112,6 +112,9 @@
NHOPS_LOCK_DESTROY(ctl);
free(ctl->nh_head.ptr, M_NHOP);
free(ctl->nh_idx_head.idx, M_NHOP);
+#ifdef ROUTE_MPATH
+ nhgrp_ctl_free(ctl);
+#endif
free(ctl, M_NHOP);
}
@@ -154,6 +157,9 @@
DPRINTF("Marking nhop %u unlinked", nh_priv->nh_idx);
refcount_release(&nh_priv->nh_linked);
} CHT_SLIST_FOREACH_END;
+#ifdef ROUTE_MPATH
+ nhgrp_ctl_unlink_all(ctl);
+#endif
NHOPS_WUNLOCK(ctl);
/*
Index: head/sys/net/route/nhop_ctl.c
===================================================================
--- head/sys/net/route/nhop_ctl.c
+++ head/sys/net/route/nhop_ctl.c
@@ -695,7 +695,14 @@
nhop_free_any(struct nhop_object *nh)
{
+#ifdef ROUTE_MPATH
+ if (!NH_IS_NHGRP(nh))
+ nhop_free(nh);
+ else
+ nhgrp_free((struct nhgrp_object *)nh);
+#else
nhop_free(nh);
+#endif
}
/* Helper functions */
Index: head/sys/net/route/nhop_var.h
===================================================================
--- head/sys/net/route/nhop_var.h
+++ head/sys/net/route/nhop_var.h
@@ -37,6 +37,8 @@
#ifndef _NET_ROUTE_NHOP_VAR_H_
#define _NET_ROUTE_NHOP_VAR_H_
+MALLOC_DECLARE(M_NHOP);
+
/* define nhop hash table */
struct nhop_priv;
CHT_SLIST_DEFINE(nhops, struct nhop_priv);
@@ -47,9 +49,15 @@
/* next object accessor */
#define nhops_next(_obj) (_obj)->nh_next
+/* define multipath hash table */
+struct nhgrp_priv;
+CHT_SLIST_DEFINE(nhgroups, struct nhgrp_priv);
+
struct nh_control {
struct nhops_head nh_head; /* hash table head */
struct bitmask_head nh_idx_head; /* nhop index head */
+ struct nhgroups_head gr_head; /* nhgrp hash table head */
+ struct bitmask_head gr_idx_head; /* nhgrp index head */
struct rwlock ctl_lock; /* overall ctl lock */
struct rib_head *ctl_rh; /* pointer back to rnh */
struct epoch_context ctl_epoch_ctx; /* epoch ctl helper */
@@ -80,7 +88,8 @@
struct epoch_context nh_epoch_ctx; /* epoch data for nhop */
};
-#define NH_IS_PINNED(_nh) ((_nh)->nh_priv->rt_flags & RTF_PINNED)
+#define NH_IS_PINNED(_nh) ((!NH_IS_NHGRP(_nh)) && \
+ ((_nh)->nh_priv->rt_flags & RTF_PINNED))
/* nhop.c */
struct nhop_priv *find_nhop(struct nh_control *ctl,
Index: head/sys/net/route/route_ctl.h
===================================================================
--- head/sys/net/route/route_ctl.h
+++ head/sys/net/route/route_ctl.h
@@ -53,6 +53,10 @@
int rib_action(uint32_t fibnum, int action, struct rt_addrinfo *info,
struct rib_cmd_info *rc);
+typedef void route_notification_t(struct rib_cmd_info *rc, void *);
+void rib_decompose_notification(struct rib_cmd_info *rc,
+ route_notification_t *cb, void *cbdata);
+
int rib_add_redirect(u_int fibnum, struct sockaddr *dst,
struct sockaddr *gateway, struct sockaddr *author, struct ifnet *ifp,
int flags, int expire_sec);
@@ -65,6 +69,20 @@
typedef void rt_setwarg_t(struct rib_head *, uint32_t, int, void *);
void rt_foreach_fib_walk(int af, rt_setwarg_t *, rt_walktree_f_t *, void *);
void rt_foreach_fib_walk_del(int af, rt_filter_f_t *filter_f, void *arg);
+
+struct route_nhop_data;
+const struct rtentry *rib_lookup_prefix(uint32_t fibnum, int family,
+ const struct sockaddr *dst, const struct sockaddr *netmask,
+ struct route_nhop_data *rnd);
+const struct rtentry *rib_lookup_lpm(uint32_t fibnum, int family,
+ const struct sockaddr *dst, struct route_nhop_data *rnd);
+
+/* Multipath */
+struct nhgrp_object;
+struct weightened_nhop;
+
+struct weightened_nhop *nhgrp_get_nhops(struct nhgrp_object *nhg,
+ uint32_t *pnum_nhops);
enum rib_subscription_type {
RIB_NOTIFY_IMMEDIATE,
Index: head/sys/net/route/route_ctl.c
===================================================================
--- head/sys/net/route/route_ctl.c
+++ head/sys/net/route/route_ctl.c
@@ -29,7 +29,7 @@
__FBSDID("$FreeBSD$");
#include "opt_inet.h"
#include "opt_inet6.h"
-#include "opt_mpath.h"
+#include "opt_route.h"
#include <sys/param.h>
#include <sys/systm.h>
@@ -83,9 +83,6 @@
struct rib_cmd_info *rc);
static int change_route(struct rib_head *rnh, struct rt_addrinfo *info,
struct route_nhop_data *nhd_orig, struct rib_cmd_info *rc);
-static int change_route_nhop(struct rib_head *rnh, struct rtentry *rt,
- struct rt_addrinfo *info, struct route_nhop_data *rnd,
- struct rib_cmd_info *rc);
static int rt_unlinkrte(struct rib_head *rnh, struct rt_addrinfo *info,
struct rib_cmd_info *rc);
@@ -94,7 +91,21 @@
struct rib_cmd_info *rc);
static void destroy_subscription_epoch(epoch_context_t ctx);
+#ifdef ROUTE_MPATH
+/*
+ * The function is defined and used only when ROUTE_MPATH is enabled.
+ * Keep the prototype under the same guard to avoid "declared static but
+ * never defined" diagnostics in non-multipath kernels.
+ */
+static bool rib_can_multipath(struct rib_head *rh);
+#endif
+
+/*
+ * Per-vnet multipath routing configuration.
+ * The net.route.multipath knob defaults to 0. It is writable only when the
+ * kernel is built with ROUTE_MPATH and read-only otherwise.
+ */
+SYSCTL_DECL(_net_route);
+#define V_rib_route_multipath VNET(rib_route_multipath)
+#ifdef ROUTE_MPATH
+#define _MP_FLAGS CTLFLAG_RW
+#else
+#define _MP_FLAGS CTLFLAG_RD
+#endif
+VNET_DEFINE(u_int, rib_route_multipath) = 0;
+SYSCTL_UINT(_net_route, OID_AUTO, multipath, _MP_FLAGS | CTLFLAG_VNET,
+    &VNET_NAME(rib_route_multipath), 0, "Enable route multipath");
+#undef _MP_FLAGS
+
/* Routing table UMA zone */
VNET_DEFINE_STATIC(uma_zone_t, rtzone);
#define V_rtzone VNET(rtzone)
@@ -128,7 +139,7 @@
CURVNET_SET(nhop_get_vnet(rt->rt_nhop));
/* Unreference nexthop */
- nhop_free(rt->rt_nhop);
+ nhop_free_any(rt->rt_nhop);
uma_zfree(V_rtzone, rt);
@@ -175,6 +186,41 @@
return (rnh);
}
+#ifdef ROUTE_MPATH
+/*
+ * Reports whether multipath routing is enabled (V_rib_route_multipath is
+ * non-zero) for the vnet referenced by @rh.
+ */
+static bool
+rib_can_multipath(struct rib_head *rh)
+{
+	bool mpath_enabled;
+
+	CURVNET_SET(rh->rib_vnet);
+	mpath_enabled = (V_rib_route_multipath != 0);
+	CURVNET_RESTORE();
+
+	return (mpath_enabled);
+}
+
+/*
+ * Checks if nhop @nh is multipath-eligible.
+ *
+ * Nexthops flagged NHF_MULTIPATH are always eligible. Other nexthops
+ * are eligible only if they have a gateway (NHF_GATEWAY) and are not
+ * redirects (NHF_REDIRECT).
+ *
+ * Returns true for multipath-eligible nexthop, false otherwise.
+ */
+bool
+nhop_can_multipath(const struct nhop_object *nh)
+{
+
+	return ((nh->nh_flags & NHF_MULTIPATH) != 0 ||
+	    ((nh->nh_flags & NHF_GATEWAY) != 0 &&
+	    (nh->nh_flags & NHF_REDIRECT) == 0));
+}
+#endif
+
static int
get_info_weight(const struct rt_addrinfo *info, uint32_t default_weight)
{
@@ -206,7 +252,7 @@
*
* Returns true if matches, false otherwise.
*/
-static bool
+bool
match_nhop_gw(const struct nhop_object *nh, const struct sockaddr *gw)
{
@@ -461,7 +507,7 @@
struct rib_cmd_info *rc)
{
struct nhop_object *nh_orig;
- struct route_nhop_data rnd;
+ struct route_nhop_data rnd_orig, rnd_add;
struct nhop_object *nh;
struct rtentry *rt, *rt_orig;
int error;
@@ -470,32 +516,19 @@
if (error != 0)
return (error);
- rnd.rnd_nhop = rt->rt_nhop;
- rnd.rnd_weight = rt->rt_weight;
+ rnd_add.rnd_nhop = rt->rt_nhop;
+ rnd_add.rnd_weight = rt->rt_weight;
nh = rt->rt_nhop;
RIB_WLOCK(rnh);
-#ifdef RADIX_MPATH
- struct sockaddr *netmask;
- netmask = info->rti_info[RTAX_NETMASK];
- /* do not permit exactly the same dst/mask/gw pair */
- if (rt_mpath_capable(rnh) &&
- rt_mpath_conflict(rnh, rt, netmask)) {
- RIB_WUNLOCK(rnh);
-
- nhop_free(nh);
- uma_zfree(V_rtzone, rt);
- return (EEXIST);
- }
-#endif
- error = add_route_nhop(rnh, rt, info, &rnd, rc);
+ error = add_route_nhop(rnh, rt, info, &rnd_add, rc);
if (error == 0) {
RIB_WUNLOCK(rnh);
return (0);
}
/* addition failed. Lookup prefix in the rib to determine the cause */
- rt_orig = lookup_prefix(rnh, info, &rnd);
+ rt_orig = lookup_prefix(rnh, info, &rnd_orig);
if (rt_orig == NULL) {
/* No prefix -> rnh_addaddr() failed to allocate memory */
RIB_WUNLOCK(rnh);
@@ -505,11 +538,11 @@
}
/* We have existing route in the RIB. */
- nh_orig = rnd.rnd_nhop;
+ nh_orig = rnd_orig.rnd_nhop;
/* Check if new route has higher preference */
if (can_override_nhop(info, nh_orig) > 0) {
/* Update nexthop to the new route */
- change_route_nhop(rnh, rt_orig, info, &rnd, rc);
+ change_route_nhop(rnh, rt_orig, info, &rnd_add, rc);
RIB_WUNLOCK(rnh);
uma_zfree(V_rtzone, rt);
nhop_free(nh_orig);
@@ -518,11 +551,26 @@
RIB_WUNLOCK(rnh);
+#ifdef ROUTE_MPATH
+ if (rib_can_multipath(rnh) && nhop_can_multipath(rnd_add.rnd_nhop) &&
+ nhop_can_multipath(rnd_orig.rnd_nhop))
+ error = add_route_mpath(rnh, info, rt, &rnd_add, &rnd_orig, rc);
+ else
+#endif
/* Unable to add - another route with the same preference exists */
error = EEXIST;
+ /*
+ * ROUTE_MPATH disabled: failed to add route, free both nhop and rt.
+ * ROUTE_MPATH enabled: original nhop reference is unused in any case,
+ * free rt only if not _adding_ new route to rib (e.g. the case
+ * when initial lookup returned existing route, but then it got
+ * deleted prior to multipath group insertion, leading to a simple
+ * non-multipath add as a result).
+ */
nhop_free(nh);
- uma_zfree(V_rtzone, rt);
+ if ((error != 0) || rc->rc_cmd != RTM_ADD)
+ uma_zfree(V_rtzone, rt);
return (error);
}
@@ -588,7 +636,13 @@
return (ESRCH);
nh = rt->rt_nhop;
-
+#ifdef ROUTE_MPATH
+ if (NH_IS_NHGRP(nh)) {
+ error = del_route_mpath(rnh, info, rt,
+ (struct nhgrp_object *)nh, rc);
+ return (error);
+ }
+#endif
error = check_info_match_nhop(info, rt, nh);
if (error != 0)
return (error);
@@ -600,14 +654,6 @@
* Remove the item from the tree and return it.
* Complain if it is not there and do no more processing.
*/
-#ifdef RADIX_MPATH
- info->rti_info[RTAX_GATEWAY] = &nh->gw_sa;
- if (rt_mpath_capable(rnh)) {
- rn = rt_mpath_unlink(rnh, info, rt, &error);
- if (error != 0)
- return (error);
- } else
-#endif
rn = rnh->rnh_deladdr(info->rti_info[RTAX_DST],
info->rti_info[RTAX_NETMASK], &rnh->head);
if (rn == NULL)
@@ -648,7 +694,18 @@
* If the caller wants it, then it can have it,
* the entry will be deleted after the end of the current epoch.
*/
- rtfree(rc->rc_rt);
+ if (rc->rc_cmd == RTM_DELETE)
+ rtfree(rc->rc_rt);
+#ifdef ROUTE_MPATH
+ else {
+ /*
+ * Deleting 1 path may result in RTM_CHANGE to
+ * a different mpath group/nhop.
+ * Free old mpath group.
+ */
+ nhop_free_any(rc->rc_nh_old);
+ }
+#endif
return (0);
}
@@ -694,19 +751,6 @@
return (ESRCH);
}
-#ifdef RADIX_MPATH
- /*
- * If we got multipath routes,
- * we require users to specify a matching RTAX_GATEWAY.
- */
- if (rt_mpath_capable(rnh)) {
- rt = rt_mpath_matchgate(rt, info->rti_info[RTAX_GATEWAY]);
- if (rt == NULL) {
- RIB_RUNLOCK(rnh);
- return (ESRCH);
- }
- }
-#endif
rnd_orig.rnd_nhop = rt->rt_nhop;
rnd_orig.rnd_weight = rt->rt_weight;
@@ -722,19 +766,12 @@
}
static int
-change_route(struct rib_head *rnh, struct rt_addrinfo *info,
- struct route_nhop_data *rnd_orig, struct rib_cmd_info *rc)
+change_nhop(struct rib_head *rnh, struct rt_addrinfo *info,
+ struct nhop_object *nh_orig, struct nhop_object **nh_new)
{
- int error = 0;
int free_ifa = 0;
- struct nhop_object *nh, *nh_orig;
- struct route_nhop_data rnd_new;
+ int error;
- nh = NULL;
- nh_orig = rnd_orig->rnd_nhop;
- if (nh_orig == NULL)
- return (ESRCH);
-
/*
* New gateway could require new ifaddr, ifp;
* flags may also be different; ifp may be specified
@@ -759,25 +796,102 @@
}
}
- error = nhop_create_from_nhop(rnh, nh_orig, info, &nh);
+ error = nhop_create_from_nhop(rnh, nh_orig, info, nh_new);
if (free_ifa) {
ifa_free(info->rti_ifa);
info->rti_ifa = NULL;
}
+
+ return (error);
+}
+
+#ifdef ROUTE_MPATH
+/*
+ * Changes a single path of the multipath route described by @rnd_orig.
+ *
+ * Picks the first nexthop of the group matching @info, creates its changed
+ * copy with change_nhop() and requests a nexthop group with that nexthop
+ * substituted. The resulting group is then passed to
+ * change_route_conditional().
+ *
+ * Returns 0 on success, ESRCH if none of the group nexthops matches @info,
+ * EAGAIN if the temporary nexthop array cannot be allocated, or the error
+ * returned by one of the called functions.
+ */
+static int
+change_mpath_route(struct rib_head *rnh, struct rt_addrinfo *info,
+    struct route_nhop_data *rnd_orig, struct rib_cmd_info *rc)
+{
+	int error = 0;
+	struct nhop_object *nh_orig, *nh_new;
+	struct route_nhop_data rnd_new;
+	struct weightened_nhop *wn, *wn_new;
+	uint32_t num_nhops;
+
+	nh_orig = rnd_orig->rnd_nhop;
+	wn = nhgrp_get_nhops((struct nhgrp_object *)nh_orig, &num_nhops);
+
+	/*
+	 * Find the nexthop to change.
+	 * check_info_match_nhop() returns 0 when the nexthop matches @info
+	 * and an error code otherwise.
+	 */
+	nh_orig = NULL;
+	for (int i = 0; i < num_nhops; i++) {
+		if (check_info_match_nhop(info, NULL, wn[i].nh) == 0) {
+			nh_orig = wn[i].nh;
+			break;
+		}
+	}
+
+	if (nh_orig == NULL)
+		return (ESRCH);
+
+	error = change_nhop(rnh, info, nh_orig, &nh_new);
if (error != 0)
return (error);
- rnd_new.rnd_nhop = nh;
- if (info->rti_mflags & RTV_WEIGHT)
- rnd_new.rnd_weight = info->rti_rmx->rmx_weight;
- else
- rnd_new.rnd_weight = rnd_orig->rnd_weight;
+	wn_new = mallocarray(num_nhops, sizeof(struct weightened_nhop),
+	    M_TEMP, M_NOWAIT | M_ZERO);
+	if (wn_new == NULL) {
+		nhop_free(nh_new);
+		return (EAGAIN);
+	}
+
+	/*
+	 * Substitute the nexthop in the private copy only: @wn is the array
+	 * of the existing group and has to stay intact.
+	 */
+	memcpy(wn_new, wn, num_nhops * sizeof(struct weightened_nhop));
+	for (int i = 0; i < num_nhops; i++) {
+		if (wn_new[i].nh == nh_orig) {
+			wn_new[i].nh = nh_new;
+			wn_new[i].weight = get_info_weight(info,
+			    rnd_orig->rnd_weight);
+			break;
+		}
+	}
+
+	error = nhgrp_get_group(rnh, wn_new, num_nhops, &rnd_new);
+	/* Drop the reference obtained from change_nhop() and the temp array */
+	nhop_free(nh_new);
+	free(wn_new, M_TEMP);
+
+	if (error != 0)
+		return (error);
+
error = change_route_conditional(rnh, NULL, info, rnd_orig, &rnd_new, rc);
return (error);
}
+#endif
+/*
+ * Changes the nexthop of the route described by @rnd_orig according to @info.
+ *
+ * Multipath routes are handed over to change_mpath_route() (ROUTE_MPATH
+ * kernels only). For an ordinary nexthop, a changed copy is created with
+ * change_nhop() and switched to by change_route_conditional().
+ *
+ * Returns 0 on success, ESRCH if @rnd_orig has no nexthop, or the error
+ * returned by one of the called functions.
+ */
+static int
+change_route(struct rib_head *rnh, struct rt_addrinfo *info,
+    struct route_nhop_data *rnd_orig, struct rib_cmd_info *rc)
+{
+	struct nhop_object *nh_orig = rnd_orig->rnd_nhop;
+	struct route_nhop_data rnd_new;
+	int error;
+
+	if (nh_orig == NULL)
+		return (ESRCH);
+
+#ifdef ROUTE_MPATH
+	if (NH_IS_NHGRP(nh_orig))
+		return (change_mpath_route(rnh, info, rnd_orig, rc));
+#endif
+
+	/* Weight from @info if specified, original weight otherwise */
+	rnd_new.rnd_weight = get_info_weight(info, rnd_orig->rnd_weight);
+	error = change_nhop(rnh, info, nh_orig, &rnd_new.rnd_nhop);
+	if (error != 0)
+		return (error);
+
+	return (change_route_conditional(rnh, NULL, info, rnd_orig, &rnd_new,
+	    rc));
+}
+
/*
* Insert @rt with nhop data from @rnd_new to @rnh.
* Returns 0 on success and stores operation results in @rc.
@@ -827,7 +941,7 @@
* Conditionally set rt_expire if set in @info.
* Returns 0 on success.
*/
-static int
+int
change_route_nhop(struct rib_head *rnh, struct rtentry *rt,
struct rt_addrinfo *info, struct route_nhop_data *rnd,
struct rib_cmd_info *rc)
@@ -855,6 +969,8 @@
rn = rnh->rnh_deladdr(ndst, netmask, &rnh->head);
if (rn == NULL)
return (ESRCH);
+ rt = RNTORT(rn);
+ rt->rte_flags &= ~RTF_UP;
}
/* Finalize notification */
@@ -989,7 +1105,6 @@
info->rti_info[RTAX_DST] = rt_key(rt);
info->rti_info[RTAX_NETMASK] = rt_mask(rt);
- info->rti_info[RTAX_GATEWAY] = &rt->rt_nhop->gw_sa;
error = rt_unlinkrte(di->rnh, info, &di->rc);
@@ -1000,7 +1115,7 @@
* XXX: Delayed notifications not implemented
* for nexthop updates.
*/
- if (error == 0) {
+ if ((error == 0) && (di->rc.rc_cmd == RTM_DELETE)) {
/* Add to the list and return */
rt->rt_chain = di->head;
di->head = rt;
@@ -1024,6 +1139,7 @@
struct rib_head *rnh;
struct rt_delinfo di;
struct rtentry *rt;
+ struct nhop_object *nh;
struct epoch_tracker et;
rnh = rt_tables_get_rnh(fibnum, family);
@@ -1049,18 +1165,31 @@
rt = di.head;
di.head = rt->rt_chain;
rt->rt_chain = NULL;
+ nh = rt->rt_nhop;
di.rc.rc_rt = rt;
- di.rc.rc_nh_old = rt->rt_nhop;
+ di.rc.rc_nh_old = nh;
rib_notify(rnh, RIB_NOTIFY_DELAYED, &di.rc);
/* TODO std rt -> rt_addrinfo export */
di.info.rti_info[RTAX_DST] = rt_key(rt);
di.info.rti_info[RTAX_NETMASK] = rt_mask(rt);
- if (report)
- rt_routemsg(RTM_DELETE, rt, rt->rt_nhop->nh_ifp, 0,
- fibnum);
+ if (report) {
+#ifdef ROUTE_MPATH
+ struct nhgrp_object *nhg;
+ struct weightened_nhop *wn;
+ uint32_t num_nhops;
+ if (NH_IS_NHGRP(nh)) {
+ nhg = (struct nhgrp_object *)nh;
+ wn = nhgrp_get_nhops(nhg, &num_nhops);
+ for (int i = 0; i < num_nhops; i++)
+ rt_routemsg(RTM_DELETE, rt,
+ wn[i].nh->nh_ifp, 0, fibnum);
+ } else
+#endif
+ rt_routemsg(RTM_DELETE, rt, nh->nh_ifp, 0, fibnum);
+ }
rtfree(rt);
}
Index: head/sys/net/route/route_helpers.c
===================================================================
--- head/sys/net/route/route_helpers.c
+++ head/sys/net/route/route_helpers.c
@@ -131,3 +131,167 @@
return (nh);
}
+
+#ifdef ROUTE_MPATH
+/*
+ * Splits RTM_CHANGE notification @rc into RTM_DELETE / RTM_ADD
+ * notifications on individual nexthops and calls @cb with @cbdata
+ * for each of them.
+ *
+ * Old and new nexthop sets are taken either from the nexthop group or
+ * from the single nexthop in @rc. Both are walked in parallel, comparing
+ * nexthop indexes. Nexthops present in both sets with the same weight
+ * generate no notification.
+ *
+ * The single on-stack item (tmp) is shared between the old and the new
+ * set, so at most one of them may be a non-group nexthop; the caller
+ * visible in this file returns early when neither is a group.
+ */
+static void
+decompose_change_notification(struct rib_cmd_info *rc, route_notification_t *cb,
+ void *cbdata)
+{
+ uint32_t num_old, num_new;
+ uint32_t nh_idx_old, nh_idx_new;
+ struct weightened_nhop *wn_old, *wn_new;
+ struct weightened_nhop tmp = { NULL, 0 };
+ uint32_t idx_old = 0, idx_new = 0;
+
+ /* Templates for the generated notifications, sharing the rtentry of @rc */
+ struct rib_cmd_info rc_del = { .rc_cmd = RTM_DELETE, .rc_rt = rc->rc_rt };
+ struct rib_cmd_info rc_add = { .rc_cmd = RTM_ADD, .rc_rt = rc->rc_rt };
+
+ if (NH_IS_NHGRP(rc->rc_nh_old)) {
+ wn_old = nhgrp_get_nhops((struct nhgrp_object *)rc->rc_nh_old, &num_old);
+ } else {
+ tmp.nh = rc->rc_nh_old;
+ tmp.weight = rc->rc_nh_weight;
+ wn_old = &tmp;
+ num_old = 1;
+ }
+ if (NH_IS_NHGRP(rc->rc_nh_new)) {
+ wn_new = nhgrp_get_nhops((struct nhgrp_object *)rc->rc_nh_new, &num_new);
+ } else {
+ tmp.nh = rc->rc_nh_new;
+ tmp.weight = rc->rc_nh_weight;
+ wn_new = &tmp;
+ num_new = 1;
+ }
+
+ /* Use the fact that each @wn array is sorted */
+ /*
+ * Want to convert into set of add and delete operations
+ * [1] -> [1, 2] = A{2}
+ * [2] -> [1, 2] = A{1}
+ * [1, 2, 4]->[1, 3, 4] = D{2}, A{3}
+ * [1, 2, 4]->[1, 4] = D{2}
+ * [1, 2, 4] -> [3, 4] = D{1}, C{2,3} OR C{1,3}, D{2} OR D{1},D{2},A{3}
+ * [1, 2] -> [3, 4] =
+ *
+ */
+ idx_old = 0;
+ while ((idx_old < num_old) && (idx_new < num_new)) {
+ nh_idx_old = wn_old[idx_old].nh->nh_priv->nh_idx;
+ nh_idx_new = wn_new[idx_new].nh->nh_priv->nh_idx;
+
+ if (nh_idx_old == nh_idx_new) {
+ if (wn_old[idx_old].weight != wn_new[idx_new].weight) {
+ /* Update weight by providing del/add notifications */
+ rc_del.rc_nh_old = wn_old[idx_old].nh;
+ rc_del.rc_nh_weight = wn_old[idx_old].weight;
+ cb(&rc_del, cbdata);
+
+ rc_add.rc_nh_new = wn_new[idx_new].nh;
+ rc_add.rc_nh_weight = wn_new[idx_new].weight;
+ cb(&rc_add, cbdata);
+ }
+ idx_old++;
+ idx_new++;
+ } else if (nh_idx_old < nh_idx_new) {
+ /*
+ * [1, ~2~, 4], [1, ~3~, 4]
+ * [1, ~2~, 5], [1, ~3~, 4]
+ * [1, ~2~], [1, ~3~, 4]
+ */
+ if ((idx_old + 1 >= num_old) ||
+ (wn_old[idx_old + 1].nh->nh_priv->nh_idx > nh_idx_new)) {
+ /* Add new unless the next old item is still <= new */
+ rc_add.rc_nh_new = wn_new[idx_new].nh;
+ rc_add.rc_nh_weight = wn_new[idx_new].weight;
+ cb(&rc_add, cbdata);
+ idx_new++;
+ }
+ /* In any case, delete current old */
+ rc_del.rc_nh_old = wn_old[idx_old].nh;
+ rc_del.rc_nh_weight = wn_old[idx_old].weight;
+ cb(&rc_del, cbdata);
+ idx_old++;
+ } else {
+ /*
+ * nh_idx_old > nh_idx_new
+ *
+ * [1, ~3~, 4], [1, ~2~, 4]
+ * [1, ~3~, 5], [1, ~2~, 4]
+ * [1, ~3~, 4], [1, ~2~]
+ */
+ /*
+ * NOTE(review): this branch has the same shape as the previous
+ * one (conditional add, unconditional delete of the old item)
+ * rather than its mirror image. As written, [2] -> [1, 2] yields
+ * D{2}, A{1}, A{2} instead of the single A{1} listed in the
+ * examples above - confirm whether this is intended.
+ */
+ if ((idx_new + 1 >= num_new) ||
+ (wn_new[idx_new + 1].nh->nh_priv->nh_idx > nh_idx_old)) {
+ /* No next item or next item is > current one */
+ rc_add.rc_nh_new = wn_new[idx_new].nh;
+ rc_add.rc_nh_weight = wn_new[idx_new].weight;
+ cb(&rc_add, cbdata);
+ idx_new++;
+ }
+ /* In any case, delete current old */
+ rc_del.rc_nh_old = wn_old[idx_old].nh;
+ rc_del.rc_nh_weight = wn_old[idx_old].weight;
+ cb(&rc_del, cbdata);
+ idx_old++;
+ }
+ }
+
+ /* Old nexthops left over after the new set is exhausted are deleted */
+ while (idx_old < num_old) {
+ rc_del.rc_nh_old = wn_old[idx_old].nh;
+ rc_del.rc_nh_weight = wn_old[idx_old].weight;
+ cb(&rc_del, cbdata);
+ idx_old++;
+ }
+
+ /* New nexthops left over after the old set is exhausted are added */
+ while (idx_new < num_new) {
+ rc_add.rc_nh_new = wn_new[idx_new].nh;
+ rc_add.rc_nh_weight = wn_new[idx_new].weight;
+ cb(&rc_add, cbdata);
+ idx_new++;
+ }
+}
+
+/*
+ * Decompose multipath cmd info @rc into a list of add/del
+ * single-path operations, calling @cb callback with @cbdata for each
+ * operation.
+ *
+ * RTM_ADD / RTM_DELETE: @cb is called once per nexthop of the new / old
+ * nexthop group with a copy of @rc carrying that nexthop and its weight.
+ * RTM_CHANGE: handled by decompose_change_notification().
+ *
+ * Nothing is reported if the relevant nexthop(s) in @rc are not nexthop
+ * groups or if the command is none of the above.
+ */
+void
+rib_decompose_notification(struct rib_cmd_info *rc, route_notification_t *cb,
+    void *cbdata)
+{
+	struct weightened_nhop *wn;
+	uint32_t num_nhops;
+	struct rib_cmd_info rc_new;
+
+	/* Per-nexthop notifications are built on top of the original one */
+	rc_new = *rc;
+	DPRINTF("cb=%p cmd=%d nh_old=%p nh_new=%p",
+	    cb, rc->rc_cmd, rc->rc_nh_old, rc->rc_nh_new);
+	switch (rc->rc_cmd) {
+	case RTM_ADD:
+		if (!NH_IS_NHGRP(rc->rc_nh_new))
+			return;
+		wn = nhgrp_get_nhops((struct nhgrp_object *)rc->rc_nh_new, &num_nhops);
+		for (uint32_t i = 0; i < num_nhops; i++) {
+			rc_new.rc_nh_new = wn[i].nh;
+			rc_new.rc_nh_weight = wn[i].weight;
+			cb(&rc_new, cbdata);
+		}
+		break;
+	case RTM_DELETE:
+		if (!NH_IS_NHGRP(rc->rc_nh_old))
+			return;
+		wn = nhgrp_get_nhops((struct nhgrp_object *)rc->rc_nh_old, &num_nhops);
+		for (uint32_t i = 0; i < num_nhops; i++) {
+			rc_new.rc_nh_old = wn[i].nh;
+			rc_new.rc_nh_weight = wn[i].weight;
+			cb(&rc_new, cbdata);
+		}
+		break;
+	case RTM_CHANGE:
+		if (!NH_IS_NHGRP(rc->rc_nh_old) && !NH_IS_NHGRP(rc->rc_nh_new))
+			return;
+		decompose_change_notification(rc, cb, cbdata);
+		break;
+	default:
+		/* Other commands carry nothing to decompose */
+		break;
+	}
+}
+#endif
Index: head/sys/net/route/route_var.h
===================================================================
--- head/sys/net/route/route_var.h
+++ head/sys/net/route/route_var.h
@@ -87,6 +87,7 @@
/* Constants */
#define RIB_MAX_RETRIES 3
#define RT_MAXFIBS UINT16_MAX
+#define RIB_MAX_MPATH_WIDTH 64
/* Macro for verifying fields in af-specific 'struct route' structures */
#define CHK_STRUCT_FIELD_GENERIC(_s1, _f1, _s2, _f2) \
@@ -113,12 +114,7 @@
"ro_dst and " #_dst_new " are at different offset")
struct rib_head *rt_tables_get_rnh(uint32_t table, sa_family_t family);
-void rt_mpath_init_rnh(struct rib_head *rnh);
int rt_getifa_fib(struct rt_addrinfo *info, u_int fibnum);
-#ifdef RADIX_MPATH
-struct radix_node *rt_mpath_unlink(struct rib_head *rnh,
- struct rt_addrinfo *info, struct rtentry *rto, int *perror);
-#endif
struct rib_cmd_info;
VNET_PCPUSTAT_DECLARE(struct rtstat, rtstat);
@@ -202,14 +198,6 @@
/* rtentry rt flag mask */
#define RTE_RT_FLAG_MASK (RTF_UP | RTF_HOST)
-/* Nexthop selection */
-#define _NH2MP(_nh) ((struct nhgrp_object *)(_nh))
-#define _SELECT_NHOP(_nh, _flowid) \
- (_NH2MP(_nh))->nhops[(_flowid) % (_NH2MP(_nh))->mp_size]
-#define _RT_SELECT_NHOP(_nh, _flowid) \
- ((!NH_IS_MULTIPATH(_nh)) ? (_nh) : _SELECT_NHOP(_nh, _flowid))
-#define RT_SELECT_NHOP(_rt, _flowid) _RT_SELECT_NHOP((_rt)->rt_nhop, _flowid)
-
/* route_temporal.c */
void tmproutes_update(struct rib_head *rnh, struct rtentry *rt);
void tmproutes_init(struct rib_head *rh);
@@ -217,14 +205,24 @@
/* route_ctl.c */
struct route_nhop_data {
- struct nhop_object *rnd_nhop;
- uint32_t rnd_weight;
+ union {
+ struct nhop_object *rnd_nhop;
+ struct nhgrp_object *rnd_nhgrp;
+ };
+ uint32_t rnd_weight;
};
+
+int change_route_nhop(struct rib_head *rnh, struct rtentry *rt,
+ struct rt_addrinfo *info, struct route_nhop_data *rnd,
+ struct rib_cmd_info *rc);
int change_route_conditional(struct rib_head *rnh, struct rtentry *rt,
struct rt_addrinfo *info, struct route_nhop_data *nhd_orig,
struct route_nhop_data *nhd_new, struct rib_cmd_info *rc);
struct rtentry *lookup_prefix(struct rib_head *rnh,
const struct rt_addrinfo *info, struct route_nhop_data *rnd);
+
+bool nhop_can_multipath(const struct nhop_object *nh);
+bool match_nhop_gw(const struct nhop_object *nh, const struct sockaddr *gw);
int check_info_match_nhop(const struct rt_addrinfo *info,
const struct rtentry *rt, const struct nhop_object *nh);
int can_override_nhop(const struct rt_addrinfo *info,
@@ -256,5 +254,57 @@
void nhops_update_ifmtu(struct rib_head *rh, struct ifnet *ifp, uint32_t mtu);
int nhops_dump_sysctl(struct rib_head *rh, struct sysctl_req *w);
+/* MULTIPATH */
+#define MPF_MULTIPATH 0x08 /* need to be consistent with NHF_MULTIPATH */
+
+/*
+ * Nexthop group header as seen by the datapath.
+ * nhop_select() indexes nhops[] with (flowid % nhg_size).
+ */
+struct nhgrp_object {
+ uint16_t nhg_flags; /* nexthop group flags */
+ uint8_t nhg_size; /* dataplane group size */
+ uint8_t spare;
+ struct nhop_object *nhops[0]; /* nhops */
+};
+
+/*
+ * Returns the nexthop to use for the flow identified by @flowid.
+ * When ROUTE_MPATH is compiled in and @nh is a nexthop group, the member
+ * at slot (flowid % nhg_size) is returned; otherwise @nh is returned as is.
+ */
+static inline struct nhop_object *
+nhop_select(struct nhop_object *nh, uint32_t flowid)
+{
+#ifdef ROUTE_MPATH
+	struct nhgrp_object *grp;
+
+	if (!NH_IS_NHGRP(nh))
+		return (nh);
+	grp = (struct nhgrp_object *)nh;
+	return (grp->nhops[flowid % grp->nhg_size]);
+#else
+	return (nh);
+#endif
+}
+
+
+struct weightened_nhop;
+
+/* mpath_ctl.c */
+int add_route_mpath(struct rib_head *rnh, struct rt_addrinfo *info,
+ struct rtentry *rt, struct route_nhop_data *rnd_add,
+ struct route_nhop_data *rnd_orig, struct rib_cmd_info *rc);
+int del_route_mpath(struct rib_head *rh, struct rt_addrinfo *info,
+ struct rtentry *rt, struct nhgrp_object *nhg, struct rib_cmd_info *rc);
+
+/* nhgrp.c */
+int nhgrp_ctl_init(struct nh_control *ctl);
+void nhgrp_ctl_free(struct nh_control *ctl);
+void nhgrp_ctl_unlink_all(struct nh_control *ctl);
+
+
+/* nhgrp_ctl.c */
+int nhgrp_dump_sysctl(struct rib_head *rh, struct sysctl_req *w);
+
+int nhgrp_get_group(struct rib_head *rh, struct weightened_nhop *wn,
+ int num_nhops, struct route_nhop_data *rnd);
+typedef bool nhgrp_filter_cb_t(const struct nhop_object *nh, void *data);
+int nhgrp_get_filtered_group(struct rib_head *rh, const struct nhgrp_object *src,
+ nhgrp_filter_cb_t flt_func, void *flt_data, struct route_nhop_data *rnd);
+int nhgrp_get_addition_group(struct rib_head *rnh,
+ struct route_nhop_data *rnd_orig, struct route_nhop_data *rnd_add,
+ struct route_nhop_data *rnd_new);
+
+void nhgrp_free(struct nhgrp_object *nhg);
#endif
Index: head/sys/net/rtsock.c
===================================================================
--- head/sys/net/rtsock.c
+++ head/sys/net/rtsock.c
@@ -32,7 +32,7 @@
* $FreeBSD$
*/
#include "opt_ddb.h"
-#include "opt_mpath.h"
+#include "opt_route.h"
#include "opt_inet.h"
#include "opt_inet6.h"
@@ -158,8 +158,7 @@
#define RTSOCK_UNLOCK() mtx_unlock(&rtsock_mtx)
#define RTSOCK_LOCK_ASSERT() mtx_assert(&rtsock_mtx, MA_OWNED)
-static SYSCTL_NODE(_net, OID_AUTO, route, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
- "");
+SYSCTL_NODE(_net, OID_AUTO, route, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
struct walkarg {
int w_tmemsize;
@@ -650,6 +649,25 @@
return (0);
}
+/*
+ * Picks the single nexthop to report for @nh.
+ *
+ * A plain nexthop is returned unchanged.  For a nexthop group (ROUTE_MPATH
+ * kernels only) the first member is returned when @gw is NULL, otherwise
+ * the first member for which match_nhop_gw() accepts @gw.
+ * Returns NULL when no member matches, or when @nh is a group but
+ * ROUTE_MPATH is not compiled in.
+ */
+static struct nhop_object *
+select_nhop(struct nhop_object *nh, const struct sockaddr *gw)
+{
+	if (!NH_IS_NHGRP(nh))
+		return (nh);
+#ifdef ROUTE_MPATH
+	struct weightened_nhop *wn;
+	uint32_t num_nhops;
+	wn = nhgrp_get_nhops((struct nhgrp_object *)nh, &num_nhops);
+	if (gw == NULL)
+		return (wn[0].nh);
+	/* Index type matches num_nhops to avoid a signed/unsigned compare. */
+	for (uint32_t i = 0; i < num_nhops; i++) {
+		if (match_nhop_gw(wn[i].nh, gw))
+			return (wn[i].nh);
+	}
+#endif
+	return (NULL);
+}
+
/*
* Handles RTM_GET message from routing socket, returning matching rt.
*
@@ -663,6 +681,7 @@
{
RIB_RLOCK_TRACKER;
struct rib_head *rnh;
+ struct nhop_object *nh;
sa_family_t saf;
saf = info->rti_info[RTAX_DST]->sa_family;
@@ -690,21 +709,12 @@
RIB_RUNLOCK(rnh);
return (ESRCH);
}
-#ifdef RADIX_MPATH
- /*
- * for RTM_GET, gate is optional even with multipath.
- * if gate == NULL the first match is returned.
- * (no need to call rt_mpath_matchgate if gate == NULL)
- */
- if (rt_mpath_capable(rnh) && info->rti_info[RTAX_GATEWAY]) {
- rc->rc_rt = rt_mpath_matchgate(rc->rc_rt,
- info->rti_info[RTAX_GATEWAY]);
- if (rc->rc_rt == NULL) {
- RIB_RUNLOCK(rnh);
- return (ESRCH);
- }
+
+ nh = select_nhop(rc->rc_rt->rt_nhop, info->rti_info[RTAX_GATEWAY]);
+ if (nh == NULL) {
+ RIB_RUNLOCK(rnh);
+ return (ESRCH);
}
-#endif
/*
* If performing proxied L2 entry insertion, and
* the actual PPP host entry is found, perform
@@ -740,8 +750,13 @@
RIB_RUNLOCK(rnh);
return (ESRCH);
}
+ nh = select_nhop(rc->rc_rt->rt_nhop, info->rti_info[RTAX_GATEWAY]);
+ if (nh == NULL) {
+ RIB_RUNLOCK(rnh);
+ return (ESRCH);
+ }
}
- rc->rc_nh_new = rc->rc_rt->rt_nhop;
+ rc->rc_nh_new = nh;
rc->rc_nh_weight = rc->rc_rt->rt_weight;
RIB_RUNLOCK(rnh);
@@ -832,6 +847,24 @@
return (0);
}
+/*
+ * Notification callback: copies @rc into the rib_cmd_info pointed to by
+ * @_cbdata when the command is RTM_DELETE; other commands are ignored.
+ */
+static void
+save_del_notification(struct rib_cmd_info *rc, void *_cbdata)
+{
+	struct rib_cmd_info *saved;
+
+	if (rc->rc_cmd != RTM_DELETE)
+		return;
+	saved = (struct rib_cmd_info *)_cbdata;
+	*saved = *rc;
+}
+
+/*
+ * Notification callback: copies @rc into the rib_cmd_info pointed to by
+ * @_cbdata when the command is RTM_ADD; other commands are ignored.
+ */
+static void
+save_add_notification(struct rib_cmd_info *rc, void *_cbdata)
+{
+	struct rib_cmd_info *saved;
+
+	if (rc->rc_cmd != RTM_ADD)
+		return;
+	saved = (struct rib_cmd_info *)_cbdata;
+	*saved = *rc;
+}
+
/*ARGSUSED*/
static int
route_output(struct mbuf *m, struct socket *so, ...)
@@ -919,6 +952,15 @@
#ifdef INET6
rti_need_deembed = 1;
#endif
+#ifdef ROUTE_MPATH
+ if (NH_IS_NHGRP(rc.rc_nh_new) ||
+ (rc.rc_nh_old && NH_IS_NHGRP(rc.rc_nh_old))) {
+ struct rib_cmd_info rc_simple = {};
+ rib_decompose_notification(&rc,
+ save_add_notification, (void *)&rc_simple);
+ rc = rc_simple;
+ }
+#endif
nh = rc.rc_nh_new;
rtm->rtm_index = nh->nh_ifp->if_index;
}
@@ -927,6 +969,15 @@
case RTM_DELETE:
error = rib_action(fibnum, RTM_DELETE, &info, &rc);
if (error == 0) {
+#ifdef ROUTE_MPATH
+ if (NH_IS_NHGRP(rc.rc_nh_old) ||
+ (rc.rc_nh_new && NH_IS_NHGRP(rc.rc_nh_new))) {
+ struct rib_cmd_info rc_simple = {};
+ rib_decompose_notification(&rc,
+ save_del_notification, (void *)&rc_simple);
+ rc = rc_simple;
+ }
+#endif
nh = rc.rc_nh_old;
goto report;
}
@@ -1708,7 +1759,19 @@
if (!can_export_rte(w->w_req->td->td_ucred, rt))
return (0);
nh = rt->rt_nhop;
- error = sysctl_dumpnhop(rt, nh, rt->rt_weight, w);
+#ifdef ROUTE_MPATH
+ if (NH_IS_NHGRP(nh)) {
+ struct weightened_nhop *wn;
+ uint32_t num_nhops;
+ wn = nhgrp_get_nhops((struct nhgrp_object *)nh, &num_nhops);
+ for (int i = 0; i < num_nhops; i++) {
+ error = sysctl_dumpnhop(rt, wn[i].nh, wn[i].weight, w);
+ if (error != 0)
+ return (error);
+ }
+ } else
+#endif
+ error = sysctl_dumpnhop(rt, nh, rt->rt_weight, w);
return (0);
}
@@ -1748,6 +1811,7 @@
rtm->rtm_flags = rt->rte_flags;
rtm->rtm_flags |= nhop_get_rtflags(nh);
rt_getmetrics(rt, nh, &rtm->rtm_rmx);
+ rtm->rtm_rmx.rmx_weight = weight;
rtm->rtm_index = nh->nh_ifp->if_index;
rtm->rtm_addrs = info.rti_addrs;
error = SYSCTL_OUT(w->w_req, (caddr_t)rtm, size);
@@ -2028,7 +2092,7 @@
namelen--;
if (req->newptr)
return (EPERM);
- if (name[1] == NET_RT_DUMP || name[1] == NET_RT_NHOP) {
+ if (name[1] == NET_RT_DUMP || name[1] == NET_RT_NHOP || name[1] == NET_RT_NHGRP) {
if (namelen == 3)
fib = req->td->td_proc->p_fibnum;
else if (namelen == 4)
@@ -2096,6 +2160,7 @@
}
break;
case NET_RT_NHOP:
+ case NET_RT_NHGRP:
/* Allow dumping one specific af/fib at a time */
if (namelen < 4) {
error = EINVAL;
@@ -2113,6 +2178,12 @@
}
if (w.w_op == NET_RT_NHOP)
error = nhops_dump_sysctl(rnh, w.w_req);
+ else
+#ifdef ROUTE_MPATH
+ error = nhgrp_dump_sysctl(rnh, w.w_req);
+#else
+ error = ENOTSUP;
+#endif
break;
case NET_RT_IFLIST:
case NET_RT_IFLISTL:
Index: head/sys/netinet/in.c
===================================================================
--- head/sys/netinet/in.c
+++ head/sys/netinet/in.c
@@ -35,8 +35,6 @@
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
-#include "opt_mpath.h"
-
#include <sys/param.h>
#include <sys/eventhandler.h>
#include <sys/systm.h>
@@ -699,14 +697,6 @@
* interface address, we are done here.
*/
if (ia->ia_flags & IFA_ROUTE) {
-#ifdef RADIX_MPATH
- if (ia->ia_addr.sin_addr.s_addr ==
- target->ia_addr.sin_addr.s_addr) {
- IN_IFADDR_RUNLOCK(&in_ifa_tracker);
- return (EEXIST);
- } else
- break;
-#endif
if (V_nosameprefix) {
IN_IFADDR_RUNLOCK(&in_ifa_tracker);
return (EEXIST);
Index: head/sys/netinet/in_fib.c
===================================================================
--- head/sys/netinet/in_fib.c
+++ head/sys/netinet/in_fib.c
@@ -32,7 +32,6 @@
#include "opt_inet.h"
#include "opt_route.h"
-#include "opt_mpath.h"
#include <sys/param.h>
#include <sys/systm.h>
@@ -48,14 +47,11 @@
#include <net/if_var.h>
#include <net/if_dl.h>
#include <net/route.h>
+#include <net/route/route_ctl.h>
#include <net/route/route_var.h>
#include <net/route/nhop.h>
#include <net/vnet.h>
-#ifdef RADIX_MPATH
-#include <net/radix_mpath.h>
-#endif
-
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/in_fib.h>
@@ -80,7 +76,6 @@
RIB_RLOCK_TRACKER;
struct rib_head *rh;
struct radix_node *rn;
- struct rtentry *rt;
struct nhop_object *nh;
KASSERT((fibnum < rt_numfibs), ("fib4_lookup: bad fibnum"));
@@ -99,12 +94,7 @@
RIB_RLOCK(rh);
rn = rh->rnh_matchaddr((void *)&sin4, &rh->head);
if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) {
- rt = RNTORT(rn);
-#ifdef RADIX_MPATH
- if (rt_mpath_next(rt) != NULL)
- rt = rt_mpath_selectrte(rt, flowid);
-#endif
- nh = rt->rt_nhop;
+ nh = nhop_select((RNTORT(rn))->rt_nhop, flowid);
/* Ensure route & ifp is UP */
if (RT_LINK_IS_UP(nh->nh_ifp)) {
if (flags & NHR_REF)
@@ -120,7 +110,7 @@
}
inline static int
-check_urpf(const struct nhop_object *nh, uint32_t flags,
+check_urpf_nhop(const struct nhop_object *nh, uint32_t flags,
const struct ifnet *src_if)
{
@@ -137,21 +127,24 @@
return (0);
}
-#ifdef RADIX_MPATH
-inline static int
-check_urpf_mpath(struct rtentry *rt, uint32_t flags,
+static int
+check_urpf(struct nhop_object *nh, uint32_t flags,
const struct ifnet *src_if)
{
-
- while (rt != NULL) {
- if (check_urpf(rt->rt_nhop, flags, src_if) != 0)
- return (1);
- rt = rt_mpath_next(rt);
- }
-
- return (0);
-}
+#ifdef ROUTE_MPATH
+ if (NH_IS_NHGRP(nh)) {
+ struct weightened_nhop *wn;
+ uint32_t num_nhops;
+ wn = nhgrp_get_nhops((struct nhgrp_object *)nh, &num_nhops);
+ for (int i = 0; i < num_nhops; i++) {
+ if (check_urpf_nhop(wn[i].nh, flags, src_if) != 0)
+ return (1);
+ }
+ return (0);
+ } else
#endif
+ return (check_urpf_nhop(nh, flags, src_if));
+}
/*
* Performs reverse path forwarding lookup.
@@ -169,7 +162,6 @@
RIB_RLOCK_TRACKER;
struct rib_head *rh;
struct radix_node *rn;
- struct rtentry *rt;
int ret;
KASSERT((fibnum < rt_numfibs), ("fib4_check_urpf: bad fibnum"));
@@ -186,12 +178,7 @@
RIB_RLOCK(rh);
rn = rh->rnh_matchaddr((void *)&sin4, &rh->head);
if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) {
- rt = RNTORT(rn);
-#ifdef RADIX_MPATH
- ret = check_urpf_mpath(rt, flags, src_if);
-#else
- ret = check_urpf(rt->rt_nhop, flags, src_if);
-#endif
+ ret = check_urpf(RNTORT(rn)->rt_nhop, flags, src_if);
RIB_RUNLOCK(rh);
return (ret);
}
@@ -206,7 +193,6 @@
{
struct rib_head *rh;
struct radix_node *rn;
- struct rtentry *rt;
struct nhop_object *nh;
KASSERT((fibnum < rt_numfibs), ("fib4_lookup_debugnet: bad fibnum"));
@@ -225,12 +211,7 @@
/* unlocked lookup */
rn = rh->rnh_matchaddr((void *)&sin4, &rh->head);
if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) {
- rt = RNTORT(rn);
-#ifdef RADIX_MPATH
- if (rt_mpath_next(rt) != NULL)
- rt = rt_mpath_selectrte(rt, 0);
-#endif
- nh = rt->rt_nhop;
+ nh = nhop_select((RNTORT(rn))->rt_nhop, 0);
/* Ensure route & ifp is UP */
if (RT_LINK_IS_UP(nh->nh_ifp)) {
if (flags & NHR_REF)
Index: head/sys/netinet/in_rmx.c
===================================================================
--- head/sys/netinet/in_rmx.c
+++ head/sys/netinet/in_rmx.c
@@ -30,8 +30,6 @@
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
-#include "opt_mpath.h"
-
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
@@ -127,9 +125,6 @@
return (NULL);
rh->rnh_preadd = rib4_preadd;
-#ifdef RADIX_MPATH
- rt_mpath_init_rnh(rh);
-#endif
return (rh);
}
Index: head/sys/netinet/ip_output.c
===================================================================
--- head/sys/netinet/ip_output.c
+++ head/sys/netinet/ip_output.c
@@ -38,7 +38,6 @@
#include "opt_ipsec.h"
#include "opt_kern_tls.h"
#include "opt_mbuf_stress_test.h"
-#include "opt_mpath.h"
#include "opt_ratelimit.h"
#include "opt_route.h"
#include "opt_rss.h"
@@ -470,11 +469,7 @@
* for correct operation (as it is for ARP).
*/
uint32_t flowid;
-#ifdef RADIX_MPATH
- flowid = ntohl(ip->ip_src.s_addr ^ ip->ip_dst.s_addr);
-#else
flowid = m->m_pkthdr.flowid;
-#endif
ro->ro_nh = fib4_lookup(fibnum, dst->sin_addr, 0,
NHR_REF, flowid);
Index: head/sys/netinet6/in6_fib.c
===================================================================
--- head/sys/netinet6/in6_fib.c
+++ head/sys/netinet6/in6_fib.c
@@ -33,7 +33,6 @@
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_route.h"
-#include "opt_mpath.h"
#include <sys/param.h>
#include <sys/systm.h>
@@ -49,14 +48,11 @@
#include <net/if_var.h>
#include <net/if_dl.h>
#include <net/route.h>
+#include <net/route/route_ctl.h>
#include <net/route/route_var.h>
#include <net/route/nhop.h>
#include <net/vnet.h>
-#ifdef RADIX_MPATH
-#include <net/radix_mpath.h>
-#endif
-
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/ip_mroute.h>
@@ -88,7 +84,6 @@
RIB_RLOCK_TRACKER;
struct rib_head *rh;
struct radix_node *rn;
- struct rtentry *rt;
struct nhop_object *nh;
struct sockaddr_in6 sin6;
@@ -111,12 +106,7 @@
RIB_RLOCK(rh);
rn = rh->rnh_matchaddr((void *)&sin6, &rh->head);
if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) {
- rt = RNTORT(rn);
-#ifdef RADIX_MPATH
- if (rt_mpath_next(rt) != NULL)
- rt = rt_mpath_selectrte(rt, flowid);
-#endif
- nh = rt->rt_nhop;
+ nh = nhop_select((RNTORT(rn))->rt_nhop, flowid);
/* Ensure route & ifp is UP */
if (RT_LINK_IS_UP(nh->nh_ifp)) {
if (flags & NHR_REF)
@@ -132,7 +122,7 @@
}
inline static int
-check_urpf(const struct nhop_object *nh, uint32_t flags,
+check_urpf_nhop(const struct nhop_object *nh, uint32_t flags,
const struct ifnet *src_if)
{
@@ -149,21 +139,24 @@
return (0);
}
-#ifdef RADIX_MPATH
-inline static int
-check_urpf_mpath(struct rtentry *rt, uint32_t flags,
+static int
+check_urpf(struct nhop_object *nh, uint32_t flags,
const struct ifnet *src_if)
{
-
- while (rt != NULL) {
- if (check_urpf(rt->rt_nhop, flags, src_if) != 0)
- return (1);
- rt = rt_mpath_next(rt);
- }
-
- return (0);
-}
+#ifdef ROUTE_MPATH
+ if (NH_IS_NHGRP(nh)) {
+ struct weightened_nhop *wn;
+ uint32_t num_nhops;
+ wn = nhgrp_get_nhops((struct nhgrp_object *)nh, &num_nhops);
+ for (int i = 0; i < num_nhops; i++) {
+ if (check_urpf_nhop(wn[i].nh, flags, src_if) != 0)
+ return (1);
+ }
+ return (0);
+ } else
#endif
+ return (check_urpf_nhop(nh, flags, src_if));
+}
/*
* Performs reverse path forwarding lookup.
@@ -181,7 +174,6 @@
RIB_RLOCK_TRACKER;
struct rib_head *rh;
struct radix_node *rn;
- struct rtentry *rt;
struct sockaddr_in6 sin6;
int ret;
@@ -203,12 +195,7 @@
RIB_RLOCK(rh);
rn = rh->rnh_matchaddr((void *)&sin6, &rh->head);
if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) {
- rt = RNTORT(rn);
-#ifdef RADIX_MPATH
- ret = check_urpf_mpath(rt, flags, src_if);
-#else
- ret = check_urpf(rt->rt_nhop, flags, src_if);
-#endif
+ ret = check_urpf(RNTORT(rn)->rt_nhop, flags, src_if);
RIB_RUNLOCK(rh);
return (ret);
}
@@ -223,7 +210,6 @@
{
struct rib_head *rh;
struct radix_node *rn;
- struct rtentry *rt;
struct nhop_object *nh;
struct sockaddr_in6 sin6;
@@ -245,8 +231,7 @@
rn = rh->rnh_matchaddr((void *)&sin6, &rh->head);
if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) {
- rt = RNTORT(rn);
- nh = rt->rt_nhop;
+ nh = nhop_select((RNTORT(rn))->rt_nhop, 0);
/* Ensure route & ifp is UP */
if (RT_LINK_IS_UP(nh->nh_ifp)) {
if (flags & NHR_REF)
Index: head/sys/netinet6/in6_rmx.c
===================================================================
--- head/sys/netinet6/in6_rmx.c
+++ head/sys/netinet6/in6_rmx.c
@@ -64,8 +64,6 @@
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
-#include "opt_mpath.h"
-
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
@@ -153,9 +151,6 @@
return (NULL);
rh->rnh_preadd = rib6_preadd;
-#ifdef RADIX_MPATH
- rt_mpath_init_rnh(rh);
-#endif
rs = rib_subscribe_internal(rh, nd6_subscription_cb, NULL,
RIB_NOTIFY_IMMEDIATE, true);
Index: head/sys/netinet6/nd6.c
===================================================================
--- head/sys/netinet6/nd6.c
+++ head/sys/netinet6/nd6.c
@@ -36,6 +36,7 @@
#include "opt_inet.h"
#include "opt_inet6.h"
+#include "opt_route.h"
#include <sys/param.h>
#include <sys/systm.h>
@@ -1591,7 +1592,11 @@
nd6_subscription_cb(struct rib_head *rnh, struct rib_cmd_info *rc, void *arg)
{
+#ifdef ROUTE_MPATH
+ rib_decompose_notification(rc, check_release_defrouter, NULL);
+#else
check_release_defrouter(rc, NULL);
+#endif
}
int
Index: head/sys/sys/socket.h
===================================================================
--- head/sys/sys/socket.h
+++ head/sys/sys/socket.h
@@ -417,6 +417,7 @@
#define NET_RT_IFLISTL 5 /* Survey interface list, using 'l'en
* versions of msghdr structs. */
#define NET_RT_NHOP 6 /* dump routing nexthops */
+#define NET_RT_NHGRP 7 /* dump routing nexthop groups */
#endif /* __BSD_VISIBLE */
/*
Index: head/usr.bin/netstat/Makefile
===================================================================
--- head/usr.bin/netstat/Makefile
+++ head/usr.bin/netstat/Makefile
@@ -5,7 +5,7 @@
PROG= netstat
SRCS= if.c inet.c main.c mbuf.c mroute.c netisr.c nl_symbols.c route.c \
- unix.c mroute6.c ipsec.c bpf.c pfkey.c sctp.c common.c nhops.c \
+ unix.c mroute6.c ipsec.c bpf.c pfkey.c sctp.c common.c nhops.c nhgrp.c \
nl_defs.h
nl_symbols.c: nlist_symbols
Index: head/usr.bin/netstat/common.h
===================================================================
--- head/usr.bin/netstat/common.h
+++ head/usr.bin/netstat/common.h
@@ -54,5 +54,22 @@
struct ifmap_entry *prepare_ifmap(size_t *ifmap_size);
+struct rt_msghdr;
+/* One dumped object: its index (used as the sort key) and its message. */
+struct nhops_map {
+ uint32_t idx;
+ struct rt_msghdr *rtm;
+};
+
+/*
+ * Result of a nexthop / nexthop-group sysctl dump.
+ * nh_buf is the raw sysctl buffer, nh_map holds nh_count entries sorted
+ * by idx; callers release nh_buf with free().
+ */
+struct nhops_dump {
+ void *nh_buf;
+ struct nhops_map *nh_map;
+ size_t nh_count;
+};
+
+void dump_nhops_sysctl(int fibnum, int af, struct nhops_dump *nd);
+struct nhop_map;
+void nhop_map_update(struct nhop_map *map, uint32_t idx, char *gw, char *ifname);
+
+
#endif
Index: head/usr.bin/netstat/main.c
===================================================================
--- head/usr.bin/netstat/main.c
+++ head/usr.bin/netstat/main.c
@@ -215,6 +215,7 @@
int noutputs = 0; /* how much outputs before we exit */
int numeric_addr; /* show addresses numerically */
int numeric_port; /* show ports numerically */
+int Oflag; /* show nhgrp objects*/
int oflag; /* show nexthop objects*/
int Pflag; /* show TCP log ID */
static int pflag; /* show given protocol */
@@ -250,7 +251,7 @@
if (argc < 0)
exit(EXIT_FAILURE);
- while ((ch = getopt(argc, argv, "46AaBbCdF:f:ghI:iLlM:mN:noPp:Qq:RrSTsuWw:xz"))
+ while ((ch = getopt(argc, argv, "46AaBbCdF:f:ghI:iLlM:mN:nOoPp:Qq:RrSTsuWw:xz"))
!= -1)
switch(ch) {
case '4':
@@ -353,6 +354,9 @@
case 'o':
oflag = 1;
break;
+ case 'O':
+ Oflag = 1;
+ break;
case 'P':
Pflag = 1;
break;
@@ -509,6 +513,14 @@
xo_finish();
exit(0);
}
+ if (Oflag) {
+ xo_open_container("statistics");
+ nhgrp_print(fib, af);
+ xo_close_container("statistics");
+ xo_finish();
+ exit(0);
+ }
+
if (gflag) {
Index: head/usr.bin/netstat/netstat.h
===================================================================
--- head/usr.bin/netstat/netstat.h
+++ head/usr.bin/netstat/netstat.h
@@ -163,3 +163,4 @@
void mrt_stats(void);
void bpf_stats(char *);
void nhops_print(int fibnum, int af);
+void nhgrp_print(int fibnum, int af);
Index: head/usr.bin/netstat/nhgrp.c
===================================================================
--- head/usr.bin/netstat/nhgrp.c
+++ head/usr.bin/netstat/nhgrp.c
@@ -0,0 +1,355 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2020 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+#include <sys/param.h>
+#include <sys/socket.h>
+#include <sys/sysctl.h>
+
+#include <net/if.h>
+#include <net/if_dl.h>
+#include <net/if_types.h>
+#include <net/route.h>
+#include <net/route/nhop.h>
+
+#include <netinet/in.h>
+
+#include <arpa/inet.h>
+#include <libutil.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+#include <sysexits.h>
+#include <unistd.h>
+#include <err.h>
+#include <libxo/xo.h>
+#include "netstat.h"
+#include "common.h"
+
+/* Default width of the gateway column for the given address family. */
+#define WID_GW_DEFAULT(af) (((af) == AF_INET6) ? 40 : 18)
+
+/* Column widths used by the header and entry printers below. */
+static int wid_gw;
+static int wid_if = 10;
+static int wid_nhidx = 8;
+static int wid_refcnt = 8;
+
+/* Printable gateway and interface name of one nexthop. */
+struct nhop_entry {
+ char gw[64];
+ char ifname[IFNAMSIZ];
+};
+
+/* Array of nhop_entry indexed by nexthop index; size is the slot count. */
+struct nhop_map {
+ struct nhop_entry *ptr;
+ size_t size;
+};
+static struct nhop_map global_nhop_map;
+
+/* Interface map obtained from prepare_ifmap() in nhgrp_print(). */
+static struct ifmap_entry *ifmap;
+static size_t ifmap_size;
+
+/*
+ * Looks up nexthop @idx in @map.
+ * Returns NULL when @idx is out of range or the slot has an empty
+ * interface name.
+ */
+static struct nhop_entry *
+nhop_get(struct nhop_map *map, uint32_t idx)
+{
+	struct nhop_entry *ent;
+
+	if (idx < map->size) {
+		ent = &map->ptr[idx];
+		if (ent->ifname[0] != '\0')
+			return (ent);
+	}
+	return (NULL);
+}
+
+/*
+ * Emits the column titles of the nexthop group table.
+ * The first two titles are left-aligned, the rest right-aligned; the four
+ * index-like columns all use wid_nhidx as their width.
+ */
+static void
+print_nhgroup_header(int af1 __unused)
+{
+
+ xo_emit("{T:/%-*.*s}{T:/%-*.*s}{T:/%*.*s}{T:/%*.*s}{T:/%*.*s}"
+ "{T:/%*.*s}{T:/%*s}\n",
+ wid_nhidx, wid_nhidx, "GrpIdx",
+ wid_nhidx, wid_nhidx, "NhIdx",
+ wid_nhidx, wid_nhidx, "Weight",
+ wid_nhidx, wid_nhidx, "Slots",
+ wid_gw, wid_gw, "Gateway",
+ wid_if, wid_if, "Netif",
+ wid_refcnt, "Refcnt");
+}
+
+/*
+ * Emits a libxo padding field that is @len characters wide: one space
+ * followed by (@len - 1) copies of @sym.
+ *
+ * @len is clamped to what fits in the local buffer so that an unexpected
+ * column width cannot write outside of it.
+ * NOTE: the buffer is passed to xo_emit() as a format string, so @sym must
+ * not be a character that is special to libxo or printf.
+ */
+static void
+print_padding(char sym, int len)
+{
+	char buffer[56];
+	/* "{P:" prefix (3) + closing '}' (1) + NUL (1) */
+	const int max_len = (int)sizeof(buffer) - 5;
+
+	if (len < 0)
+		len = 0;
+	else if (len > max_len)
+		len = max_len;
+
+	memset(buffer, sym, sizeof(buffer));
+	buffer[0] = '{';
+	buffer[1] = 'P';
+	buffer[2] = ':';
+	buffer[3] = ' ';
+	buffer[len + 3] = '}';
+	buffer[len + 4] = '\0';
+	xo_emit(buffer);
+}
+
+
+/*
+ * Prints one nexthop group: a summary row with the group index and
+ * refcount, followed by one row per member nexthop showing its index,
+ * weight, number of slots and, when known, gateway and interface.
+ *
+ * @nhge is expected to be followed by a NHG_C_TYPE_CNHOPS container and,
+ * nhgc_len bytes later, a NHG_C_TYPE_DNHOPS container; if either container
+ * has a different type or a non-zero subtype, nothing is printed.
+ * The slot count of a member is the number of entries in the second
+ * container that carry the same nexthop index.
+ */
+static void
+print_nhgroup_entry_sysctl(const char *name, struct rt_msghdr *rtm,
+    struct nhgrp_external *nhge)
+{
+	char buffer[128];
+	struct nhop_entry *ne;
+	struct nhgrp_nhop_external *ext_cp, *ext_dp;
+	struct nhgrp_container *nhg_cp, *nhg_dp;
+
+	nhg_cp = (struct nhgrp_container *)(nhge + 1);
+	if (nhg_cp->nhgc_type != NHG_C_TYPE_CNHOPS || nhg_cp->nhgc_subtype != 0)
+		return;
+	ext_cp = (struct nhgrp_nhop_external *)(nhg_cp + 1);
+
+	nhg_dp = (struct nhgrp_container *)((char *)nhg_cp + nhg_cp->nhgc_len);
+	if (nhg_dp->nhgc_type != NHG_C_TYPE_DNHOPS || nhg_dp->nhgc_subtype != 0)
+		return;
+	ext_dp = (struct nhgrp_nhop_external *)(nhg_dp + 1);
+
+	xo_open_instance(name);
+
+	snprintf(buffer, sizeof(buffer), "{[:-%d}{:nhgrp-index/%%lu}{]:} ", wid_nhidx);
+
+	/*
+	 * All numeric fields below are printed with %lu; cast explicitly so
+	 * the variadic argument really is an unsigned long.
+	 */
+	xo_emit(buffer, (unsigned long)nhge->nhg_idx);
+
+	/* nhidx */
+	print_padding('-', wid_nhidx);
+	/* weight */
+	print_padding('-', wid_nhidx);
+	/* slots */
+	print_padding('-', wid_nhidx);
+	print_padding('-', wid_gw);
+	print_padding('-', wid_if);
+	xo_emit("{t:nhg-refcnt/%*lu}", wid_refcnt,
+	    (unsigned long)nhge->nhg_refcount);
+	xo_emit("\n");
+
+	xo_open_list("nhop-weights");
+	for (uint32_t i = 0; i < nhg_cp->nhgc_count; i++) {
+		/* TODO: optimize slots calculations */
+		uint32_t slots = 0;
+		for (uint32_t sidx = 0; sidx < nhg_dp->nhgc_count; sidx++) {
+			if (ext_dp[sidx].nh_idx == ext_cp[i].nh_idx)
+				slots++;
+		}
+		xo_open_instance("nhop-weight");
+		print_padding(' ', wid_nhidx);
+		/* nh index */
+		xo_emit("{t:nh-index/%*lu}", wid_nhidx,
+		    (unsigned long)ext_cp[i].nh_idx);
+		xo_emit("{t:nh-weight/%*lu}", wid_nhidx,
+		    (unsigned long)ext_cp[i].nh_weight);
+		xo_emit("{t:nh-slots/%*lu}", wid_nhidx, (unsigned long)slots);
+		ne = nhop_get(&global_nhop_map, ext_cp[i].nh_idx);
+		if (ne != NULL) {
+			xo_emit("{t:nh-gw/%*.*s}", wid_gw, wid_gw, ne->gw);
+			xo_emit("{t:nh-interface/%*.*s}", wid_if, wid_if, ne->ifname);
+		}
+		xo_emit("\n");
+		xo_close_instance("nhop-weight");
+	}
+	xo_close_list("nhop-weights");
+	xo_close_instance(name);
+}
+
+/*
+ * qsort(3) comparator ordering struct nhops_map entries by ascending idx.
+ */
+static int
+cmp_nhg_idx(const void *_a, const void *_b)
+{
+	const struct nhops_map *lhs = _a;
+	const struct nhops_map *rhs = _b;
+
+	/* Yields 1, -1 or 0 without risking subtraction wrap-around. */
+	return ((lhs->idx > rhs->idx) - (lhs->idx < rhs->idx));
+}
+
+/*
+ * Fetches the nexthop group dump for @fibnum / @af via sysctl and fills
+ * @nd with the raw buffer (nh_buf), an index-sorted array of the messages
+ * found in it (nh_map) and the number of those messages (nh_count).
+ * Messages whose rtm_version is not RTM_VERSION are skipped.
+ * Exits the program on sysctl or allocation failure.
+ */
+static void
+dump_nhgrp_sysctl(int fibnum, int af, struct nhops_dump *nd)
+{
+	size_t needed;
+	int mib[7];
+	char *buf, *next, *lim;
+	struct rt_msghdr *rtm;
+	struct nhgrp_external *nhg;
+	struct nhops_map *nhg_map, *tmp;
+	size_t nhg_count, nhg_size;
+
+	mib[0] = CTL_NET;
+	mib[1] = PF_ROUTE;
+	mib[2] = 0;
+	mib[3] = af;
+	mib[4] = NET_RT_NHGRP;
+	mib[5] = 0;
+	mib[6] = fibnum;
+	if (sysctl(mib, nitems(mib), NULL, &needed, NULL, 0) < 0)
+		err(EX_OSERR, "sysctl: net.route.0.%d.nhgrpdump.%d estimate",
+		    af, fibnum);
+	if ((buf = malloc(needed)) == NULL)
+		errx(2, "malloc(%lu)", (unsigned long)needed);
+	if (sysctl(mib, nitems(mib), buf, &needed, NULL, 0) < 0)
+		err(1, "sysctl: net.route.0.%d.nhgrpdump.%d", af, fibnum);
+	lim = buf + needed;
+
+	/*
+	 * Nexthop groups are received unsorted. Collect everything first,
+	 * and sort prior to displaying.
+	 */
+	nhg_count = 0;
+	nhg_size = 16;
+	nhg_map = calloc(nhg_size, sizeof(struct nhops_map));
+	if (nhg_map == NULL)
+		errx(2, "calloc(%lu)",
+		    (unsigned long)(nhg_size * sizeof(struct nhops_map)));
+	for (next = buf; next < lim; next += rtm->rtm_msglen) {
+		rtm = (struct rt_msghdr *)next;
+		/* A zero-length message would never advance 'next'. */
+		if (rtm->rtm_msglen == 0)
+			break;
+		if (rtm->rtm_version != RTM_VERSION)
+			continue;
+
+		if (nhg_count >= nhg_size) {
+			nhg_size *= 2;
+			/* Keep the old pointer until realloc() has succeeded. */
+			tmp = realloc(nhg_map, nhg_size * sizeof(struct nhops_map));
+			if (tmp == NULL)
+				errx(2, "realloc(%lu)",
+				    (unsigned long)(nhg_size * sizeof(struct nhops_map)));
+			nhg_map = tmp;
+		}
+
+		nhg = (struct nhgrp_external *)(rtm + 1);
+		nhg_map[nhg_count].idx = nhg->nhg_idx;
+		nhg_map[nhg_count].rtm = rtm;
+		nhg_count++;
+	}
+
+	if (nhg_count > 0)
+		qsort(nhg_map, nhg_count, sizeof(struct nhops_map), cmp_nhg_idx);
+	nd->nh_buf = buf;
+	nd->nh_count = nhg_count;
+	nd->nh_map = nhg_map;
+}
+
+/*
+ * Dumps the nexthop groups of @fibnum / @af and prints them as a table
+ * inside the "nhgrp-table" container.  Nothing but the empty container and
+ * list is emitted when the dump holds no groups.
+ */
+static void
+print_nhgrp_sysctl(int fibnum, int af)
+{
+	struct nhops_dump nd;
+	struct nhgrp_external *nhg;
+	struct rt_msghdr *rtm;
+
+	dump_nhgrp_sysctl(fibnum, af, &nd);
+
+	xo_open_container("nhgrp-table");
+	xo_open_list("rt-family");
+	if (nd.nh_count > 0) {
+		wid_gw = WID_GW_DEFAULT(af);
+		xo_open_instance("rt-family");
+		pr_family(af);
+		xo_open_list("nhgrp-entry");
+
+		print_nhgroup_header(af);
+
+		for (size_t i = 0; i < nd.nh_count; i++) {
+			rtm = nd.nh_map[i].rtm;
+			nhg = (struct nhgrp_external *)(rtm + 1);
+			print_nhgroup_entry_sysctl("nhgrp-entry", rtm, nhg);
+		}
+		/* Close what was opened above, innermost first. */
+		xo_close_list("nhgrp-entry");
+		xo_close_instance("rt-family");
+	}
+	xo_close_list("rt-family");
+	xo_close_container("nhgrp-table");
+	/* nh_map entries point into nh_buf; release both once printing is done. */
+	free(nd.nh_map);
+	free(nd.nh_buf);
+}
+
+/*
+ * Records the printable gateway and interface name of nexthop @nh in
+ * global_nhop_map.
+ *
+ * The interface name comes from ifmap ("---" if the ifmap slot is empty,
+ * empty string if the ifindex is outside ifmap).  The gateway text is the
+ * formatted gateway address for NHF_GATEWAY nexthops and
+ * "<ifname>/resolve" otherwise.
+ */
+static void
+update_global_map(struct nhop_external *nh)
+{
+	char ifn[128] = "";
+	char gwbuf[64];
+
+	if (nh->ifindex < (uint32_t)ifmap_size) {
+		strlcpy(ifn, ifmap[nh->ifindex].ifname, sizeof(ifn));
+		if (ifn[0] == '\0')
+			strlcpy(ifn, "---", sizeof(ifn));
+	}
+
+	if ((nh->nh_flags & NHF_GATEWAY) == 0) {
+		snprintf(gwbuf, sizeof(gwbuf), "%s/resolve", ifn);
+	} else {
+		/* The address block follows the nexthop, nh_len bytes in. */
+		struct nhop_addrs *addrs =
+		    (struct nhop_addrs *)((char *)nh + nh->nh_len);
+		struct sockaddr *gw =
+		    (struct sockaddr *)((char *)addrs + addrs->gw_sa_off);
+
+		strlcpy(gwbuf, fmt_sockaddr(gw, NULL, RTF_HOST), sizeof(gwbuf));
+	}
+
+	nhop_map_update(&global_nhop_map, nh->nh_idx, gwbuf, ifn);
+}
+
+/*
+ * Dumps all nexthops of @fibnum / @af and feeds each of them to
+ * update_global_map(), then releases the dump buffer.
+ */
+static void
+prepare_nh_map(int fibnum, int af)
+{
+	struct nhops_dump dump;
+	size_t pos = 0;
+
+	dump_nhops_sysctl(fibnum, af, &dump);
+
+	while (pos < dump.nh_count) {
+		/* The nexthop payload immediately follows the message header. */
+		update_global_map(
+		    (struct nhop_external *)(dump.nh_map[pos].rtm + 1));
+		pos++;
+	}
+
+	free(dump.nh_buf);
+}
+
+/*
+ * Entry point for printing nexthop groups of fib @fibnum and family @af.
+ *
+ * A @fibnum of -1 selects the value of the net.my_fibnum sysctl (0 if it
+ * cannot be read).  The fib is validated against net.fibs (assumed 1 if it
+ * cannot be read); an out-of-range fib terminates with EX_USAGE.
+ */
+void
+nhgrp_print(int fibnum, int af)
+{
+	size_t len = sizeof(int);
+	int nfibs;
+
+	if (fibnum == -1) {
+		if (sysctlbyname("net.my_fibnum", &fibnum, &len, NULL, 0) == -1)
+			fibnum = 0;
+	}
+	if (sysctlbyname("net.fibs", &nfibs, &len, NULL, 0) == -1)
+		nfibs = 1;
+	if (fibnum < 0 || fibnum >= nfibs)
+		errx(EX_USAGE, "%d: invalid fib", fibnum);
+
+	/* Build the interface and nexthop lookup maps used while printing. */
+	ifmap = prepare_ifmap(&ifmap_size);
+	prepare_nh_map(fibnum, af);
+
+	xo_open_container("route-nhgrp-information");
+	xo_emit("{T:Nexthop groups data}");
+	if (fibnum != 0)
+		xo_emit(" ({L:fib}: {:fib/%d})", fibnum);
+	xo_emit("\n");
+	print_nhgrp_sysctl(fibnum, af);
+	xo_close_container("route-nhgrp-information");
+}
+
Index: head/usr.bin/netstat/nhops.c
===================================================================
--- head/usr.bin/netstat/nhops.c
+++ head/usr.bin/netstat/nhops.c
@@ -118,8 +118,6 @@
};
static struct nhop_map global_nhop_map;
-static void nhop_map_update(struct nhop_map *map, uint32_t idx,
- char *gw, char *ifname);
static struct nhop_entry *nhop_get(struct nhop_map *map, uint32_t idx);
@@ -204,7 +202,7 @@
}
}
-static void
+void
nhop_map_update(struct nhop_map *map, uint32_t idx, char *gw, char *ifname)
{
if (idx >= map->size) {
@@ -322,11 +320,6 @@
xo_close_instance(name);
}
-struct nhops_map {
- uint32_t idx;
- struct rt_msghdr *rtm;
-};
-
static int
cmp_nh_idx(const void *_a, const void *_b)
{
@@ -342,15 +335,14 @@
return (0);
}
-static void
-print_nhops_sysctl(int fibnum, int af)
+void
+dump_nhops_sysctl(int fibnum, int af, struct nhops_dump *nd)
{
size_t needed;
int mib[7];
char *buf, *next, *lim;
struct rt_msghdr *rtm;
struct nhop_external *nh;
- int fam;
struct nhops_map *nh_map;
size_t nh_count, nh_size;
@@ -369,8 +361,6 @@
if (sysctl(mib, nitems(mib), buf, &needed, NULL, 0) < 0)
err(1, "sysctl: net.route.0.%d.nhdump.%d", af, fibnum);
lim = buf + needed;
- xo_open_container("nhop-table");
- xo_open_list("rt-family");
/*
* nexhops are received unsorted. Collect everything first, sort and then display
@@ -395,9 +385,27 @@
nh_count++;
}
- if (nh_count > 0) {
+ if (nh_count > 0)
qsort(nh_map, nh_count, sizeof(struct nhops_map), cmp_nh_idx);
- nh = (struct nhop_external *)(nh_map[0].rtm + 1);
+ nd->nh_buf = buf;
+ nd->nh_count = nh_count;
+ nd->nh_map = nh_map;
+}
+
+static void
+print_nhops_sysctl(int fibnum, int af)
+{
+ struct nhops_dump nd;
+ struct nhop_external *nh;
+ int fam;
+ struct rt_msghdr *rtm;
+
+ dump_nhops_sysctl(fibnum, af, &nd);
+
+ xo_open_container("nhop-table");
+ xo_open_list("rt-family");
+ if (nd.nh_count > 0) {
+ nh = (struct nhop_external *)(nd.nh_map[0].rtm + 1);
fam = nh->nh_family;
wid_dst = WID_GW_DEFAULT(fam);
@@ -415,8 +423,8 @@
print_nhop_header(fam);
- for (size_t i = 0; i < nh_count; i++) {
- rtm = nh_map[i].rtm;
+ for (size_t i = 0; i < nd.nh_count; i++) {
+ rtm = nd.nh_map[i].rtm;
nh = (struct nhop_external *)(rtm + 1);
print_nhop_entry_sysctl("nh-entry", rtm, nh);
}
@@ -426,7 +434,7 @@
}
xo_close_list("rt-family");
xo_close_container("nhop-table");
- free(buf);
+ free(nd.nh_buf);
}
static void

File Metadata

Mime Type
text/plain
Expires
Wed, Jan 22, 6:00 AM (1 h, 23 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
16025780
Default Alt Text
D26449.diff (99 KB)

Event Timeline