Page Menu
Home
FreeBSD
Search
Configure Global Search
Log In
Files
F108083706
D26449.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Flag For Later
Award Token
Size
99 KB
Referenced Files
None
Subscribers
None
D26449.diff
View Options
Index: head/sys/conf/NOTES
===================================================================
--- head/sys/conf/NOTES
+++ head/sys/conf/NOTES
@@ -1002,7 +1002,7 @@
#
# TCP_HHOOK enables the hhook(9) framework hooks for the TCP stack.
#
-# RADIX_MPATH provides support for equal-cost multi-path routing.
+# ROUTE_MPATH provides support for multipath routing.
#
options MROUTING # Multicast routing
options IPFIREWALL #firewall
@@ -1023,7 +1023,7 @@
options TCPPCAP
options TCP_BLACKBOX
options TCP_HHOOK
-options RADIX_MPATH
+options ROUTE_MPATH
# The MBUF_STRESS_TEST option enables options which create
# various random failures / extreme cases related to mbuf
Index: head/sys/conf/files
===================================================================
--- head/sys/conf/files
+++ head/sys/conf/files
@@ -4143,10 +4143,12 @@
net/debugnet_inet.c optional inet debugnet
net/pfil.c optional ether | inet
net/radix.c standard
-net/radix_mpath.c standard
net/raw_cb.c standard
net/raw_usrreq.c standard
net/route.c standard
+net/route/mpath_ctl.c optional route_mpath
+net/route/nhgrp.c optional route_mpath
+net/route/nhgrp_ctl.c optional route_mpath
net/route/nhop.c standard
net/route/nhop_ctl.c standard
net/route/nhop_utils.c standard
Index: head/sys/conf/options
===================================================================
--- head/sys/conf/options
+++ head/sys/conf/options
@@ -454,6 +454,7 @@
PCBGROUP opt_pcbgroup.h
PF_DEFAULT_TO_DROP opt_pf.h
RADIX_MPATH opt_mpath.h
+ROUTE_MPATH opt_route.h
ROUTETABLES opt_route.h
RSS opt_rss.h
SLIP_IFF_OPTS opt_slip.h
Index: head/sys/net/radix.c
===================================================================
--- head/sys/net/radix.c
+++ head/sys/net/radix.c
@@ -44,10 +44,6 @@
#include <sys/malloc.h>
#include <sys/syslog.h>
#include <net/radix.h>
-#include "opt_mpath.h"
-#ifdef RADIX_MPATH
-#include <net/radix_mpath.h>
-#endif
#else /* !_KERNEL */
#include <stdio.h>
#include <strings.h>
Index: head/sys/net/route.h
===================================================================
--- head/sys/net/route.h
+++ head/sys/net/route.h
@@ -178,6 +178,7 @@
*/
/* Consumer-visible nexthop info flags */
+#define NHF_MULTIPATH 0x0008 /* Nexthop is a nexthop group */
#define NHF_REJECT 0x0010 /* RTF_REJECT */
#define NHF_BLACKHOLE 0x0020 /* RTF_BLACKHOLE */
#define NHF_REDIRECT 0x0040 /* RTF_DYNAMIC|RTF_MODIFIED */
@@ -208,6 +209,10 @@
uint64_t rts_wildcard; /* lookups satisfied by a wildcard */
uint64_t rts_nh_idx_alloc_failure; /* nexthop index alloc failure*/
uint64_t rts_nh_alloc_failure; /* nexthop allocation failure*/
+ uint64_t rts_add_failure; /* # of route addition failures */
+ uint64_t rts_add_retry; /* # of route addition retries */
+ uint64_t rts_del_failure; /* # of route deletion failures */
+ uint64_t rts_del_retry; /* # of route deletion retries */
};
/*
Index: head/sys/net/route.c
===================================================================
--- head/sys/net/route.c
+++ head/sys/net/route.c
@@ -39,7 +39,6 @@
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_mrouting.h"
-#include "opt_mpath.h"
#include "opt_route.h"
#include <sys/param.h>
Index: head/sys/net/route/mpath_ctl.c
===================================================================
--- head/sys/net/route/mpath_ctl.c
+++ head/sys/net/route/mpath_ctl.c
@@ -0,0 +1,165 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2020 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include "opt_inet.h"
+#include "opt_route.h"
+
+#include <sys/cdefs.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/lock.h>
+#include <sys/rmlock.h>
+#include <sys/rwlock.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/socket.h>
+#include <sys/sysctl.h>
+#include <sys/kernel.h>
+
+#include <net/if.h>
+#include <net/if_var.h>
+#include <net/if_dl.h>
+#include <net/route.h>
+#include <net/route/route_ctl.h>
+#include <net/route/route_var.h>
+#include <net/vnet.h>
+
+#include <netinet/in.h>
+#include <netinet/in_var.h>
+#include <netinet/in_fib.h>
+
+#include <net/route/nhop_utils.h>
+#include <net/route/nhop.h>
+#include <net/route/nhop_var.h>
+
+/*
+ * This file contains the supporting functions for adding/deleting/updating
+ * multipath routes to the routing table.
+ */
+
+SYSCTL_DECL(_net_route);
+
+/*
+ * Tries to add the @rnd_add nhop to the existing set of nhops (@rnd_orig)
+ * for the prefix specified by @rt.
+ *
+ * Returns 0 and consumes rt / rnd_add nhop references. @rc gets populated
+ * with the operation result.
+ * Otherwise errno is returned. EAGAIN is returned when all of the
+ * RIB_MAX_RETRIES attempts ended with EAGAIN.
+ *
+ * It is the caller's responsibility to unlock/free rt and
+ * rt->rt_nhop.
+ */
+int
+add_route_mpath(struct rib_head *rnh, struct rt_addrinfo *info,
+ struct rtentry *rt, struct route_nhop_data *rnd_add,
+ struct route_nhop_data *rnd_orig, struct rib_cmd_info *rc)
+{
+ RIB_RLOCK_TRACKER;
+ struct route_nhop_data rnd_new;
+ int error = 0;
+
+ /*
+ * It is possible that multiple rtsock speakers will try to update
+ * the same route simultaneously. Reduce the chance of failing the
+ * request by retrying the cycle multiple times.
+ */
+ for (int i = 0; i < RIB_MAX_RETRIES; i++) {
+ /* Build (or find) the group holding @rnd_orig plus @rnd_add */
+ error = nhgrp_get_addition_group(rnh, rnd_orig, rnd_add,
+ &rnd_new);
+ if (error != 0) {
+ /* Only EAGAIN is worth another attempt */
+ if (error != EAGAIN)
+ break;
+
+ /*
+ * Group creation failed, most probably because
+ * @rnd_orig data got scheduled for deletion.
+ * Refresh @rnd_orig data and retry.
+ */
+ RIB_RLOCK(rnh);
+ lookup_prefix(rnh, info, rnd_orig);
+ RIB_RUNLOCK(rnh);
+ continue;
+ }
+
+ /*
+ * Try to switch the prefix from @rnd_orig to @rnd_new.
+ * Any result other than EAGAIN (including success) is final.
+ */
+ error = change_route_conditional(rnh, rt, info, rnd_orig,
+ &rnd_new, rc);
+ if (error != EAGAIN)
+ break;
+ RTSTAT_INC(rts_add_retry);
+ }
+
+ return (error);
+}
+
+/* Match context handed to gw_filter_func() through its opaque argument. */
+struct rt_match_info {
+ struct rt_addrinfo *info; /* request passed to check_info_match_nhop() */
+ struct rtentry *rt; /* route entry passed to check_info_match_nhop() */
+};
+
+/*
+ * Nexthop filter callback.
+ * @_data points to a struct rt_match_info; returns true when
+ * check_info_match_nhop() reports 0 for @nh against that context.
+ */
+static bool
+gw_filter_func(const struct nhop_object *nh, void *_data)
+{
+	struct rt_match_info *match = _data;
+
+	if (check_info_match_nhop(match->info, match->rt, nh) != 0)
+		return (false);
+	return (true);
+}
+
+/*
+ * Tries to delete matching paths from @nhg.
+ * Must be called with the rib write lock held (asserted).
+ *
+ * Returns 0 on success and updates operation result in @rc.
+ * Returns ESRCH if @info carries neither a gateway nor a filter function;
+ * other errors come from nhgrp_get_filtered_group() or
+ * change_route_nhop().
+ */
+int
+del_route_mpath(struct rib_head *rh, struct rt_addrinfo *info,
+ struct rtentry *rt, struct nhgrp_object *nhg,
+ struct rib_cmd_info *rc)
+{
+ struct route_nhop_data rnd;
+ struct rt_match_info ri = { .info = info, .rt = rt };
+ int error;
+
+ RIB_WLOCK_ASSERT(rh);
+
+ /*
+ * Require gateway to delete multipath routes, to forbid
+ * deleting all paths at once.
+ * If the filter function is provided, skip gateway check to
+ * allow rib_walk_del() delete routes for any criteria based
+ * on provided callback.
+ */
+ if ((info->rti_info[RTAX_GATEWAY] == NULL) && (info->rti_filter == NULL))
+ return (ESRCH);
+
+ /* Compute the filtered nexthop data using gw_filter_func as predicate */
+ error = nhgrp_get_filtered_group(rh, nhg, gw_filter_func, (void *)&ri,
+ &rnd);
+ /* Install the filtered data only if it was obtained successfully */
+ if (error == 0)
+ error = change_route_nhop(rh, rt, info, &rnd, rc);
+ return (error);
+}
+
Index: head/sys/net/route/nhgrp.c
===================================================================
--- head/sys/net/route/nhgrp.c
+++ head/sys/net/route/nhgrp.c
@@ -0,0 +1,344 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2020 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include "opt_inet.h"
+#include "opt_route.h"
+
+#include <sys/cdefs.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/lock.h>
+#include <sys/rmlock.h>
+#include <sys/rwlock.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/refcount.h>
+#include <sys/socket.h>
+#include <sys/sysctl.h>
+#include <sys/kernel.h>
+
+#include <net/if.h>
+#include <net/if_var.h>
+#include <net/if_dl.h>
+#include <net/route.h>
+#include <net/route/route_ctl.h>
+#include <net/route/route_var.h>
+#include <net/vnet.h>
+
+#include <netinet/in.h>
+#include <netinet/in_var.h>
+#include <netinet/in_fib.h>
+
+#include <net/route/nhop_utils.h>
+#include <net/route/nhop.h>
+#include <net/route/nhop_var.h>
+#include <net/route/nhgrp_var.h>
+
+/*
+ * This file contains data structures management logic for the nexthop
+ * groups ("nhgrp") route subsystem.
+ *
+ * Nexthop groups are used to store multiple routes available for the specific
+ * prefix. Nexthop groups are immutable and can be shared across multiple
+ * prefixes.
+ *
+ * Each group consists of a control plane part and a dataplane part.
+ * Control plane is basically a collection of nexthop objects with
+ * weights and refcount.
+ *
+ * Datapath consists of an array of nexthop pointers, compiled from control
+ * plane data to support O(1) nexthop selection.
+ *
+ * For example, consider the following group:
+ * [(nh1, weight=100), (nh2, weight=200)]
+ * It will compile to the following array:
+ * [nh1, nh2, nh2]
+ *
+ */
+
+static void consider_resize(struct nh_control *ctl, uint32_t new_nh_buckets,
+ uint32_t new_idx_items);
+
+static int cmp_nhgrp(const struct nhgrp_priv *a, const struct nhgrp_priv *b);
+static unsigned int hash_nhgrp(const struct nhgrp_priv *obj);
+
+/*
+ * Folds @len bytes starting at @h into an unsigned hash value:
+ * each step multiplies the accumulator by 33 and xors in the next byte.
+ * The accumulator starts at 0, so an empty input hashes to 0.
+ */
+static unsigned
+djb_hash(const unsigned char *h, const int len)
+{
+	const unsigned char *cursor = h;
+	unsigned int acc = 0;
+
+	for (int left = len; left > 0; left--)
+		acc = (acc * 33) ^ *cursor++;
+
+	return (acc);
+}
+
+/*
+ * Equality predicate for nexthop groups: returns 1 when @a and @b hold
+ * the same number of nexthops and byte-identical weightened nexthop
+ * arrays, 0 otherwise.
+ */
+static int
+cmp_nhgrp(const struct nhgrp_priv *a, const struct nhgrp_priv *b)
+{
+	size_t nbytes;
+
+	/*
+	 * In case of consistent hashing, there can be multiple nexthop groups
+	 * with the same "control plane" list of nexthops with weights and a
+	 * different set of "data plane" nexthops.
+	 * For now, ignore the data plane and focus on the control plane list.
+	 */
+	if (a->nhg_nh_count != b->nhg_nh_count)
+		return (0);
+
+	nbytes = sizeof(struct weightened_nhop) * a->nhg_nh_count;
+	return (memcmp(a->nhg_nh_weights, b->nhg_nh_weights, nbytes) == 0);
+}
+
+/*
+ * Hash callback: hashes the raw bytes of the group's weightened
+ * nexthop array (nhg_nh_count entries) with djb_hash().
+ */
+static unsigned int
+hash_nhgrp(const struct nhgrp_priv *obj)
+{
+
+	return (djb_hash((const unsigned char *)obj->nhg_nh_weights,
+	    sizeof(struct weightened_nhop) * obj->nhg_nh_count));
+}
+
+/*
+ * Looks up a group equal to @key in the group hash of @ctl.
+ * The lookup runs under the NHOPS read lock, which is dropped before
+ * returning.
+ *
+ * Returns object referenced and unlocked, or NULL if no group matched
+ * or the matching group's refcount has already dropped to zero.
+ */
+struct nhgrp_priv *
+find_nhgrp(struct nh_control *ctl, const struct nhgrp_priv *key)
+{
+ struct nhgrp_priv *priv_ret;
+
+ NHOPS_RLOCK(ctl);
+ CHT_SLIST_FIND_BYOBJ(&ctl->gr_head, mpath, key, priv_ret);
+ if (priv_ret != NULL) {
+ /* Take a reference only if the group is still alive */
+ if (refcount_acquire_if_not_zero(&priv_ret->nhg_refcount) == 0) {
+ /* refcount is 0 -> group is being deleted */
+ priv_ret = NULL;
+ }
+ }
+ NHOPS_RUNLOCK(ctl);
+
+ return (priv_ret);
+}
+
+/*
+ * Allocates a group index for @grp_priv and inserts the group into the
+ * hash of @ctl under the NHOPS write lock.
+ *
+ * Returns 1 on success and 0 if no index could be allocated.
+ * In both cases consider_resize() is called after the lock is dropped,
+ * using the resize hints sampled while the lock was held.
+ */
+int
+link_nhgrp(struct nh_control *ctl, struct nhgrp_priv *grp_priv)
+{
+ uint16_t idx;
+ uint32_t new_num_buckets, new_num_items;
+
+ NHOPS_WLOCK(ctl);
+ /* Check if we need to resize hash and index */
+ new_num_buckets = CHT_SLIST_GET_RESIZE_BUCKETS(&ctl->gr_head);
+ new_num_items = bitmask_get_resize_items(&ctl->gr_idx_head);
+
+ if (bitmask_alloc_idx(&ctl->gr_idx_head, &idx) != 0) {
+ NHOPS_WUNLOCK(ctl);
+ DPRINTF("Unable to allocate mpath index");
+ /* Give the index a chance to grow before the caller retries */
+ consider_resize(ctl, new_num_buckets, new_num_items);
+ return (0);
+ }
+
+ grp_priv->nhg_idx = idx;
+ grp_priv->nh_control = ctl;
+ CHT_SLIST_INSERT_HEAD(&ctl->gr_head, mpath, grp_priv);
+
+ NHOPS_WUNLOCK(ctl);
+
+ consider_resize(ctl, new_num_buckets, new_num_items);
+
+ return (1);
+}
+
+/*
+ * Removes the group matching @key from the hash of @ctl and releases
+ * its index, all under the NHOPS write lock.
+ *
+ * Returns the unlinked group with nhg_idx and nh_control cleared,
+ * or NULL if no matching group was found in the hash.
+ */
+struct nhgrp_priv *
+unlink_nhgrp(struct nh_control *ctl, struct nhgrp_priv *key)
+{
+	struct nhgrp_priv *nhg_priv_ret;
+	int idx;
+
+	NHOPS_WLOCK(ctl);
+
+	CHT_SLIST_REMOVE_BYOBJ(&ctl->gr_head, mpath, key, nhg_priv_ret);
+
+	if (nhg_priv_ret == NULL) {
+		DPRINTF("Unable to find nhop group!");
+		NHOPS_WUNLOCK(ctl);
+		return (NULL);
+	}
+
+	idx = nhg_priv_ret->nhg_idx;
+	/* The result was stored in a never-read local; discard it explicitly */
+	(void)bitmask_free_idx(&ctl->gr_idx_head, idx);
+	nhg_priv_ret->nhg_idx = 0;
+	nhg_priv_ret->nh_control = NULL;
+
+	NHOPS_WUNLOCK(ctl);
+
+	return (nhg_priv_ret);
+}
+
+/*
+ * Checks if the group hash and/or the group index bitmask need resizing
+ * and performs the resize if necessary.
+ *
+ * @new_nh_buckets: desired number of hash buckets, 0 if no resize needed.
+ * @new_idx_items: desired number of index items, 0 if no resize needed.
+ *
+ * Memory is allocated with M_NOWAIT before the NHOPS write lock is taken;
+ * an allocation failure simply skips the corresponding resize.
+ */
+__noinline static void
+consider_resize(struct nh_control *ctl, uint32_t new_nh_buckets, uint32_t new_idx_items)
+{
+	void *nh_ptr, *nh_idx_ptr;
+	void *old_idx_ptr;
+	size_t alloc_size;
+
+	nh_ptr = NULL;
+	if (new_nh_buckets != 0) {
+		alloc_size = CHT_SLIST_GET_RESIZE_SIZE(new_nh_buckets);
+		nh_ptr = malloc(alloc_size, M_NHOP, M_NOWAIT | M_ZERO);
+	}
+
+	nh_idx_ptr = NULL;
+	if (new_idx_items != 0) {
+		alloc_size = bitmask_get_size(new_idx_items);
+		nh_idx_ptr = malloc(alloc_size, M_NHOP, M_NOWAIT | M_ZERO);
+	}
+
+	if (nh_ptr == NULL && nh_idx_ptr == NULL) {
+		/* Either resize is not required or allocations have failed. */
+		return;
+	}
+
+	DPRINTF("mp: going to resize: nh:[ptr:%p sz:%u] idx:[ptr:%p sz:%u]",
+	    nh_ptr, new_nh_buckets, nh_idx_ptr, new_idx_items);
+
+	old_idx_ptr = NULL;
+
+	NHOPS_WLOCK(ctl);
+	if (nh_ptr != NULL) {
+		CHT_SLIST_RESIZE(&ctl->gr_head, mpath, nh_ptr, new_nh_buckets);
+	}
+	if (nh_idx_ptr != NULL) {
+		/*
+		 * Copy the current group bitmask into the new storage and
+		 * install it into the same head (gr_idx_head), which is
+		 * the one used by link_nhgrp()/unlink_nhgrp().
+		 *
+		 * NOTE(review): bitmask_copy() is assumed to return 0 on
+		 * success, like bitmask_alloc_idx() in link_nhgrp() --
+		 * confirm against nhop_utils.
+		 */
+		if (bitmask_copy(&ctl->gr_idx_head, nh_idx_ptr, new_idx_items) == 0)
+			bitmask_swap(&ctl->gr_idx_head, nh_idx_ptr, new_idx_items, &old_idx_ptr);
+		else {
+			/* New storage was not installed; release it below */
+			old_idx_ptr = nh_idx_ptr;
+		}
+	}
+	NHOPS_WUNLOCK(ctl);
+
+	/*
+	 * Release memory outside of the lock. nh_ptr is presumably
+	 * updated by CHT_SLIST_RESIZE() to reference the previous bucket
+	 * array -- TODO confirm.
+	 */
+	if (nh_ptr != NULL)
+		free(nh_ptr, M_NHOP);
+	if (old_idx_ptr != NULL)
+		free(old_idx_ptr, M_NHOP);
+}
+
+/*
+ * Function allocating the necessary group data structures:
+ * an 8-bucket group hash and a 128-item group index bitmask.
+ *
+ * Only the M_NOWAIT / M_WAITOK bits of @malloc_flags are honoured;
+ * M_ZERO is always added.
+ *
+ * Returns false if either allocation failed and true otherwise,
+ * including the case when the structures were already initialized
+ * (the freshly allocated memory is released in that case).
+ */
+bool
+nhgrp_ctl_alloc_default(struct nh_control *ctl, int malloc_flags)
+{
+ size_t alloc_size;
+ uint32_t num_buckets, num_items;
+ void *cht_ptr, *mask_ptr;
+
+ malloc_flags = (malloc_flags & (M_NOWAIT | M_WAITOK)) | M_ZERO;
+
+ num_buckets = 8;
+ alloc_size = CHT_SLIST_GET_RESIZE_SIZE(num_buckets);
+ cht_ptr = malloc(alloc_size, M_NHOP, malloc_flags);
+
+ if (cht_ptr == NULL) {
+ DPRINTF("mpath init failed");
+ return (false);
+ }
+
+ /*
+ * Allocate nexthop index bitmask.
+ */
+ num_items = 128;
+ mask_ptr = malloc(bitmask_get_size(num_items), M_NHOP, malloc_flags);
+ if (mask_ptr == NULL) {
+ DPRINTF("mpath bitmask init failed");
+ free(cht_ptr, M_NHOP);
+ return (false);
+ }
+
+ NHOPS_WLOCK(ctl);
+
+ /* Re-check under the lock: hash_size == 0 means "not initialized yet" */
+ if (ctl->gr_head.hash_size == 0) {
+ /* Init hash and bitmask */
+ CHT_SLIST_INIT(&ctl->gr_head, cht_ptr, num_buckets);
+ bitmask_init(&ctl->gr_idx_head, mask_ptr, num_items);
+ NHOPS_WUNLOCK(ctl);
+ } else {
+ /* Other thread has already initialized hash/bitmask */
+ NHOPS_WUNLOCK(ctl);
+ free(cht_ptr, M_NHOP);
+ free(mask_ptr, M_NHOP);
+ }
+
+ DPRINTF("mpath init done for fib/af %d/%d", ctl->rh->rib_fibnum,
+ ctl->rh->rib_family);
+
+ return (true);
+}
+
+/*
+ * Initializes the group hash and the group index bitmask of @ctl as
+ * empty, without any backing storage. Always returns 0.
+ */
+int
+nhgrp_ctl_init(struct nh_control *ctl)
+{
+
+ /*
+ * By default, do not allocate datastructures as multipath
+ * routes will not be necessarily used.
+ */
+ CHT_SLIST_INIT(&ctl->gr_head, NULL, 0);
+ bitmask_init(&ctl->gr_idx_head, NULL, 0);
+ return (0);
+}
+
+/*
+ * Releases the storage backing the group hash and the group index
+ * bitmask of @ctl, if any was allocated.
+ */
+void
+nhgrp_ctl_free(struct nh_control *ctl)
+{
+
+	if (ctl->gr_head.ptr != NULL) {
+		free(ctl->gr_head.ptr, M_NHOP);
+	}
+
+	if (ctl->gr_idx_head.idx != NULL) {
+		free(ctl->gr_idx_head.idx, M_NHOP);
+	}
+}
+
+/*
+ * Drops one nhg_linked reference from every group present in the hash
+ * of @ctl. The caller must hold the NHOPS write lock (asserted).
+ */
+void
+nhgrp_ctl_unlink_all(struct nh_control *ctl)
+{
+ struct nhgrp_priv *nhg_priv;
+
+ NHOPS_WLOCK_ASSERT(ctl);
+
+ CHT_SLIST_FOREACH(&ctl->gr_head, mpath, nhg_priv) {
+ DPRINTF("Marking nhgrp %u unlinked", nhg_priv->nhg_idx);
+ /* nhg_linked starts at 2 (see alloc_nhgrp()); this drops one of them */
+ refcount_release(&nhg_priv->nhg_linked);
+ } CHT_SLIST_FOREACH_END;
+}
+
Index: head/sys/net/route/nhgrp_ctl.c
===================================================================
--- head/sys/net/route/nhgrp_ctl.c
+++ head/sys/net/route/nhgrp_ctl.c
@@ -0,0 +1,788 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2020 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+#define RTDEBUG
+#include "opt_inet.h"
+#include "opt_route.h"
+
+#include <sys/cdefs.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/lock.h>
+#include <sys/rmlock.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/refcount.h>
+#include <sys/socket.h>
+#include <sys/sysctl.h>
+#include <sys/kernel.h>
+#include <sys/epoch.h>
+
+#include <net/if.h>
+#include <net/if_var.h>
+#include <net/route.h>
+#include <net/route/route_ctl.h>
+#include <net/route/route_var.h>
+#include <net/vnet.h>
+
+#include <netinet/in.h>
+#include <netinet/in_var.h>
+#include <netinet/in_fib.h>
+
+#include <net/route/nhop_utils.h>
+#include <net/route/nhop.h>
+#include <net/route/nhop_var.h>
+#include <net/route/nhgrp_var.h>
+
+/*
+ * This file contains the supporting functions for creating multipath groups
+ * and compiling their dataplane parts.
+ */
+
+/* MPF_MULTIPATH must be the same as NHF_MULTIPATH for nhop selection to work */
+_Static_assert(MPF_MULTIPATH == NHF_MULTIPATH,
+ "MPF_MULTIPATH must be the same as NHF_MULTIPATH");
+/* Offset and size of flags field has to be the same for nhop/nhop groups */
+CHK_STRUCT_FIELD_GENERIC(struct nhop_object, nh_flags, struct nhgrp_object, nhg_flags);
+/* Cap multipath to 64, as the larger values would break rib_cmd_info bmasks */
+CTASSERT(RIB_MAX_MPATH_WIDTH <= 64);
+
+static int wn_cmp(const void *a, const void *b);
+static void sort_weightened_nhops(struct weightened_nhop *wn, int num_nhops);
+
+static struct nhgrp_priv *get_nhgrp(struct nh_control *ctl,
+ struct weightened_nhop *wn, int num_nhops, int *perror);
+static void destroy_nhgrp(struct nhgrp_priv *nhg_priv);
+static void destroy_nhgrp_epoch(epoch_context_t ctx);
+static void free_nhgrp_nhops(struct nhgrp_priv *nhg_priv);
+
+/*
+ * qsort() comparator for struct weightened_nhop: orders by weight
+ * ascending, breaking ties by nexthop pointer value.
+ */
+static int
+wn_cmp(const void *a, const void *b)
+{
+	const struct weightened_nhop *lhs = a;
+	const struct weightened_nhop *rhs = b;
+
+	if (lhs->weight != rhs->weight)
+		return ((lhs->weight > rhs->weight) ? 1 : -1);
+
+	/* Equal weights: compare nexthops by pointer */
+	if (lhs->nh != rhs->nh)
+		return ((lhs->nh > rhs->nh) ? 1 : -1);
+
+	return (0);
+}
+
+/*
+ * Sorts the @num_nhops entries of @wn in place, using wn_cmp().
+ *
+ * Nexthops/weights need a deterministic order to avoid creating
+ * duplicate nexthop groups for the same set of paths. The order is
+ * used by the control plane only, so there are no external
+ * requirements on it.
+ *
+ * Weight is the primary key, which eases calculation of slot sizes.
+ */
+static void
+sort_weightened_nhops(struct weightened_nhop *wn, int num_nhops)
+{
+	const size_t item_size = sizeof(*wn);
+
+	qsort(wn, num_nhops, item_size, wn_cmp);
+}
+
+/*
+ * Calculate minimum number of slots required to fit the existing
+ * set of weights in the common use case where weights are "easily"
+ * comparable.
+ * Assumes @wn is sorted by weight ascending and each weight is > 0.
+ * Returns number of slots or 0 if precise calculation failed. An empty
+ * @wn or a zero smallest weight is reported as failure (0) as well.
+ *
+ * Some examples:
+ * note: (i, X) pair means (nhop=i, weight=X):
+ * (1, 1) (2, 2) -> 3 slots [1, 2, 2]
+ * (1, 100), (2, 200) -> 3 slots [1, 2, 2]
+ * (1, 100), (2, 200), (3, 400) -> 7 slots [1, 2, 2, 3, 3, 3, 3]
+ */
+static uint32_t
+calc_min_mpath_slots_fast(const struct weightened_nhop *wn, size_t num_items)
+{
+	uint32_t i, last, xmin;
+	uint64_t total = 0;
+
+	/* Nothing to size; also keeps the wn[0] access below in bounds */
+	if (num_items == 0)
+		return (0);
+
+	/*
+	 * Candidate unit: the smallest weight or the smallest non-zero
+	 * difference between consecutive weights, whichever is lower.
+	 */
+	last = 0;
+	xmin = wn[0].weight;
+	for (i = 0; i < num_items; i++) {
+		total += wn[i].weight;
+		if ((wn[i].weight - last < xmin) && (wn[i].weight != last))
+			xmin = wn[i].weight - last;
+		last = wn[i].weight;
+	}
+
+	/*
+	 * xmin is the minimum unit of desired capacity.
+	 * It can only be 0 here if the smallest weight is 0, which
+	 * violates the precondition; fail instead of dividing by zero.
+	 */
+	if (xmin == 0)
+		return (0);
+	if ((total % xmin) != 0)
+		return (0);
+	/* Every weight has to be a whole number of units */
+	for (i = 0; i < num_items; i++) {
+		if ((wn[i].weight % xmin) != 0)
+			return (0);
+	}
+
+	return ((uint32_t)(total / xmin));
+}
+
+/*
+ * Calculate minimum number of slots required to fit the existing
+ * set of weights while maintaining weight coefficients.
+ *
+ * Assume @wn is sorted by weight ascending and each weight is > 0.
+ *
+ * The precise answer of calc_min_mpath_slots_fast() is used when it
+ * exists and does not exceed RIB_MAX_MPATH_WIDTH; in every other case
+ * RIB_MAX_MPATH_WIDTH is returned.
+ */
+static uint32_t
+calc_min_mpath_slots(const struct weightened_nhop *wn, size_t num_items)
+{
+	uint32_t slots = calc_min_mpath_slots_fast(wn, num_items);
+
+	if ((slots != 0) && (slots <= RIB_MAX_MPATH_WIDTH))
+		return (slots);
+
+	slots = RIB_MAX_MPATH_WIDTH;
+	return (slots);
+}
+
+/*
+ * Nexthop group data consists of
+ * 1) dataplane part, with nhgrp_object as a header followed by an
+ * arbitrary number of nexthop pointers.
+ * 2) control plane part, with nhgrp_priv as a header, followed by
+ * an arbitrary number of 'struct weightened_nhop' objects.
+ *
+ * Given nexthop groups are (mostly) immutable, allocate all data
+ * in one go.
+ *
+ * Returns the number of bytes needed for a group with @nhg_size
+ * dataplane slots and @num_nhops control plane nexthops.
+ */
+__noinline static size_t
+get_nhgrp_alloc_size(uint32_t nhg_size, uint32_t num_nhops)
+{
+	size_t dataplane_sz, ctlplane_sz;
+
+	dataplane_sz = sizeof(struct nhgrp_object);
+	dataplane_sz += nhg_size * sizeof(struct nhop_object *);
+
+	ctlplane_sz = sizeof(struct nhgrp_priv);
+	ctlplane_sz += num_nhops * sizeof(struct weightened_nhop);
+
+	return (dataplane_sz + ctlplane_sz);
+}
+
+/*
+ * Compile actual list of nexthops to be used by datapath from
+ * the nexthop group @dst_priv.
+ *
+ * @x: weightened nexthops; dst_priv->nhg_nh_count entries are read.
+ * @num_slots: number of entries to distribute in dst_priv->nhg->nhops[].
+ *
+ * Each nexthop receives a share of the slots still unassigned that is
+ * proportional to its weight relative to the weights not yet processed
+ * (integer division, rounded down).
+ *
+ * For example, compiling control plane list of 2 nexthops
+ * [(200, A), (100, B)] would result in the datapath array
+ * [A, A, B]
+ */
+static void
+compile_nhgrp(struct nhgrp_priv *dst_priv, const struct weightened_nhop *x,
+ uint32_t num_slots)
+{
+ struct nhgrp_object *dst;
+ int i, slot_idx, remaining_slots;
+ uint64_t remaining_sum, nh_weight, nh_slots;
+
+ slot_idx = 0;
+ dst = dst_priv->nhg;
+ /* Calculate sum of all weights */
+ remaining_sum = 0;
+ for (i = 0; i < dst_priv->nhg_nh_count; i++)
+ remaining_sum += x[i].weight;
+ remaining_slots = num_slots;
+ DPRINTF("O: %u/%u", (uint32_t)remaining_sum, remaining_slots);
+ for (i = 0; i < dst_priv->nhg_nh_count; i++) {
+ /* Calculate number of slots for the current nexthop */
+ if (remaining_sum > 0) {
+ nh_weight = (uint64_t)x[i].weight;
+ nh_slots = (nh_weight * remaining_slots / remaining_sum);
+ } else
+ nh_slots = 0;
+
+ /* Account for the weight and the slots consumed by this nexthop */
+ remaining_sum -= x[i].weight;
+ remaining_slots -= nh_slots;
+
+ DPRINTF(" OO[%d]: %u/%u curr=%d slot_idx=%d", i,
+ (uint32_t)remaining_sum, remaining_slots,
+ (int)nh_slots, slot_idx);
+
+ KASSERT((slot_idx + nh_slots <= num_slots),
+ ("index overflow during nhg compilation"));
+ /* Fill the next nh_slots consecutive entries with this nexthop */
+ while (nh_slots-- > 0)
+ dst->nhops[slot_idx++] = x[i].nh;
+ }
+}
+
+/*
+ * Allocates new nexthop group for the list of weightened nexthops.
+ * Assume sorted list.
+ * Does NOT reference any nexthops in the group.
+ * Returns group with refcount=1 or NULL.
+ *
+ * Dataplane and control plane parts are allocated as a single
+ * M_NOWAIT | M_ZERO chunk sized by get_nhgrp_alloc_size().
+ */
+static struct nhgrp_priv *
+alloc_nhgrp(struct weightened_nhop *wn, int num_nhops)
+{
+ uint32_t nhgrp_size;
+ int flags = M_NOWAIT;
+ struct nhgrp_object *nhg;
+ struct nhgrp_priv *nhg_priv;
+
+ nhgrp_size = calc_min_mpath_slots(wn, num_nhops);
+ /*
+ * NOTE(review): calc_min_mpath_slots() replaces a zero result with
+ * RIB_MAX_MPATH_WIDTH, so this check looks unreachable as written.
+ */
+ if (nhgrp_size == 0) {
+ /* Zero weights, abort */
+ return (NULL);
+ }
+
+ size_t sz = get_nhgrp_alloc_size(nhgrp_size, num_nhops);
+ nhg = malloc(sz, M_NHOP, flags | M_ZERO);
+ if (nhg == NULL) {
+ return (NULL);
+ }
+
+ /* Has to be the first to make NHGRP_PRIV() work */
+ nhg->nhg_size = nhgrp_size;
+ DPRINTF("new mpath group: num_nhops: %u", (uint32_t)nhgrp_size);
+ nhg->nhg_flags = MPF_MULTIPATH;
+
+ nhg_priv = NHGRP_PRIV(nhg);
+ nhg_priv->nhg_nh_count = num_nhops;
+ refcount_init(&nhg_priv->nhg_refcount, 1);
+
+ /* Please see nhgrp_free() comments on the initial value */
+ refcount_init(&nhg_priv->nhg_linked, 2);
+
+ nhg_priv->nhg = nhg;
+ /* Keep a private copy of the control plane list */
+ memcpy(&nhg_priv->nhg_nh_weights[0], wn,
+ num_nhops * sizeof(struct weightened_nhop));
+
+ /* Fill the dataplane nexthop array */
+ compile_nhgrp(nhg_priv, wn, nhg->nhg_size);
+
+ return (nhg_priv);
+}
+
+/*
+ * Releases one reference to @nhg. When the last reference is dropped,
+ * the group is unlinked from its nh_control (unless rib destruction
+ * already did so) and its destruction is scheduled via epoch_call().
+ */
+void
+nhgrp_free(struct nhgrp_object *nhg)
+{
+ struct nhgrp_priv *nhg_priv;
+ struct nh_control *ctl;
+ struct epoch_tracker et;
+
+ nhg_priv = NHGRP_PRIV(nhg);
+
+ /* Not the last reference: nothing else to do */
+ if (!refcount_release(&nhg_priv->nhg_refcount))
+ return;
+
+ /*
+ * Group objects don't have an explicit lock attached to them.
+ * As groups are reclaimed based on reference count, it is possible
+ * that some groups will persist after the vnet destruction callback
+ * is called. Given that, handle the scenario with nhgrp_free() being
+ * called either after or simultaneously with nhgrp_ctl_unlink_all()
+ * by using another reference counter: nhg_linked.
+ *
+ * There are only 2 places where nhg_linked can be decreased:
+ * rib destroy (nhgrp_ctl_unlink_all) and this function.
+ * nhg_linked can never be increased.
+ *
+ * Hence, use initial value of 2 to make use of
+ * refcount_release_if_not_last().
+ *
+ * There can be two scenarios when calling this function:
+ *
+ * 1) nhg_linked value is 2. This means that either
+ * nhgrp_ctl_unlink_all() has not been called OR it is running,
+ * but we are guaranteed that nh_control won't be freed in
+ * this epoch. Hence, the group can be safely unlinked.
+ *
+ * 2) nhg_linked value is 1. In that case, nhgrp_ctl_unlink_all()
+ * has been called and nhgrp unlink can be skipped.
+ */
+
+ NET_EPOCH_ENTER(et);
+ if (refcount_release_if_not_last(&nhg_priv->nhg_linked)) {
+ ctl = nhg_priv->nh_control;
+ if (unlink_nhgrp(ctl, nhg_priv) == NULL) {
+ /* Do not try to reclaim */
+ DPRINTF("Failed to unlink nexhop group %p", nhg_priv);
+ NET_EPOCH_EXIT(et);
+ return;
+ }
+ }
+ NET_EPOCH_EXIT(et);
+
+ /* Defer the actual destruction to destroy_nhgrp_epoch() */
+ epoch_call(net_epoch_preempt, destroy_nhgrp_epoch,
+ &nhg_priv->nhg_epoch_ctx);
+}
+
+/*
+ * Destroys all local resources belonging to @nhg_priv.
+ *
+ * Only the memory chunk starting at nhg_priv->nhg is freed; nexthop
+ * references held by the group are NOT released here.
+ */
+__noinline static void
+destroy_nhgrp_int(struct nhgrp_priv *nhg_priv)
+{
+
+ free(nhg_priv->nhg, M_NHOP);
+}
+
+/*
+ * Final destructor for a group: releases the references on all
+ * nexthops of @nhg_priv and frees the group memory.
+ * Expects nhg_refcount and nhg_idx to be 0 (asserted).
+ */
+__noinline static void
+destroy_nhgrp(struct nhgrp_priv *nhg_priv)
+{
+
+ KASSERT((nhg_priv->nhg_refcount == 0), ("nhg_refcount != 0"));
+
+ DPRINTF("DEL MPATH %p", nhg_priv);
+
+ KASSERT((nhg_priv->nhg_idx == 0), ("gr_idx != 0"));
+
+ free_nhgrp_nhops(nhg_priv);
+
+ destroy_nhgrp_int(nhg_priv);
+}
+
+/*
+ * Epoch callback indicating group is safe to destroy.
+ * @ctx is the nhg_epoch_ctx member embedded in the group being destroyed.
+ */
+static void
+destroy_nhgrp_epoch(epoch_context_t ctx)
+{
+
+	destroy_nhgrp(__containerof(ctx, struct nhgrp_priv, nhg_epoch_ctx));
+}
+
+/*
+ * Tries to reference every nexthop listed in @nhg_priv.
+ *
+ * Returns true if all references were obtained. Otherwise the
+ * references obtained so far are released and false is returned.
+ */
+static bool
+ref_nhgrp_nhops(struct nhgrp_priv *nhg_priv)
+{
+	int acquired, i;
+
+	for (acquired = 0; acquired < nhg_priv->nhg_nh_count; acquired++) {
+		if (nhop_try_ref_object(nhg_priv->nhg_nh_weights[acquired].nh) == 0)
+			break;
+	}
+
+	/* The loop ran to completion: every nexthop is referenced */
+	if (!(acquired < nhg_priv->nhg_nh_count))
+		return (true);
+
+	/*
+	 * Failed to ref the nexthop, b/c it's deleted.
+	 * Need to rollback references back.
+	 */
+	for (i = 0; i < acquired; i++)
+		nhop_free(nhg_priv->nhg_nh_weights[i].nh);
+
+	return (false);
+}
+
+/*
+ * Releases one reference on each nexthop listed in @nhg_priv.
+ */
+static void
+free_nhgrp_nhops(struct nhgrp_priv *nhg_priv)
+{
+	int i = 0;
+
+	while (i < nhg_priv->nhg_nh_count) {
+		nhop_free(nhg_priv->nhg_nh_weights[i].nh);
+		i++;
+	}
+}
+
+/*
+ * Creates or looks up an existing nexthop group based on @wn and @num_nhops.
+ * @wn is sorted in place.
+ *
+ * Returns referenced nhop group or NULL, passing error code in @perror:
+ *  E2BIG  - @num_nhops exceeds RIB_MAX_MPATH_WIDTH
+ *  ENOMEM - group data structures could not be allocated
+ *  EEXIST - two neighbouring (after sorting) nexthops share an index
+ *  EAGAIN - a nexthop could not be referenced or the group could not
+ *           be linked; the operation can be retried
+ * On success @perror is set to 0.
+ */
+struct nhgrp_priv *
+get_nhgrp(struct nh_control *ctl, struct weightened_nhop *wn, int num_nhops,
+    int *perror)
+{
+	struct nhgrp_priv *key, *nhg_priv;
+	uint32_t last_id;
+
+	if (num_nhops > RIB_MAX_MPATH_WIDTH) {
+		*perror = E2BIG;
+		return (NULL);
+	}
+
+	if (ctl->gr_head.hash_size == 0) {
+		/* First multipath request. Bootstrap mpath datastructures. */
+		if (nhgrp_ctl_alloc_default(ctl, M_NOWAIT) == 0) {
+			*perror = ENOMEM;
+			return (NULL);
+		}
+	}
+
+	/* Sort nexthops & check there are no duplicates */
+	sort_weightened_nhops(wn, num_nhops);
+	last_id = 0;
+	for (int i = 0; i < num_nhops; i++) {
+		if (wn[i].nh->nh_priv->nh_idx == last_id) {
+			*perror = EEXIST;
+			return (NULL);
+		}
+		last_id = wn[i].nh->nh_priv->nh_idx;
+	}
+
+	/* Build a candidate group; it doubles as the lookup key */
+	if ((key = alloc_nhgrp(wn, num_nhops)) == NULL) {
+		*perror = ENOMEM;
+		return (NULL);
+	}
+
+	nhg_priv = find_nhgrp(ctl, key);
+	if (nhg_priv != NULL) {
+		/*
+		 * Free originally-created group. As it hasn't been linked
+		 * and the dependent nexhops haven't been referenced, just free
+		 * the group.
+		 */
+		destroy_nhgrp_int(key);
+		*perror = 0;
+		return (nhg_priv);
+	}
+
+	/* No existing group, try to link the new one */
+	if (!ref_nhgrp_nhops(key)) {
+		/*
+		 * Some of the nexthops have been scheduled for deletion.
+		 * As the group hasn't been linked / no nexhops have been
+		 * referenced, call the final destructor immediately.
+		 */
+		destroy_nhgrp_int(key);
+		*perror = EAGAIN;
+		return (NULL);
+	}
+
+	if (link_nhgrp(ctl, key) == 0) {
+		/*
+		 * Unable to allocate index.
+		 * The group was never linked, so drop the nexthop references
+		 * taken above and free it directly. destroy_nhgrp() cannot
+		 * be used here: it asserts nhg_refcount == 0, while a fresh
+		 * group has refcount 1. Return NULL so the caller never sees
+		 * the freed group.
+		 */
+		free_nhgrp_nhops(key);
+		destroy_nhgrp_int(key);
+		*perror = EAGAIN;
+		return (NULL);
+	}
+
+	*perror = 0;
+	return (key);
+}
+
+/*
+ * Appends one or more nexthops denoted by @wn to the nexthop group @gr_orig.
+ *
+ * Returns referenced nexthop group or NULL. In the latter case, @perror is
+ * filled with an error code.
+ * Note that function does NOT care if the new nexthops already exist
+ * in the @gr_orig. As a result, they will be added, resulting in the
+ * same nexthop being present multiple times in the new group.
+ */
+static struct nhgrp_priv *
+append_nhops(struct nh_control *ctl, const struct nhgrp_object *gr_orig,
+ struct weightened_nhop *wn, int num_nhops, int *perror)
+{
+ /*
+ * NOTE(review): storage is a char array that is accessed as
+ * struct weightened_nhop below; this relies on the array being
+ * suitably aligned for that type -- confirm.
+ */
+ char storage[64];
+ struct weightened_nhop *pnhops;
+ struct nhgrp_priv *nhg_priv;
+ const struct nhgrp_priv *src_priv;
+ size_t sz;
+ int curr_nhops;
+
+ src_priv = NHGRP_PRIV_CONST(gr_orig);
+ curr_nhops = src_priv->nhg_nh_count;
+
+ *perror = 0;
+
+ /* Bytes needed for the merged (original + appended) nexthop list */
+ sz = (src_priv->nhg_nh_count + num_nhops) * (sizeof(struct weightened_nhop));
+ /* optimize for <= 4 paths, each path=16 bytes */
+ if (sz <= sizeof(storage))
+ pnhops = (struct weightened_nhop *)&storage[0];
+ else {
+ pnhops = malloc(sz, M_TEMP, M_NOWAIT);
+ if (pnhops == NULL) {
+ *perror = ENOMEM;
+ return (NULL);
+ }
+ }
+
+ /* Copy nhops from original group first */
+ memcpy(pnhops, src_priv->nhg_nh_weights,
+ curr_nhops * sizeof(struct weightened_nhop));
+ memcpy(&pnhops[curr_nhops], wn, num_nhops * sizeof(struct weightened_nhop));
+ curr_nhops += num_nhops;
+
+ /* get_nhgrp() sorts pnhops in place and reports failures via @perror */
+ nhg_priv = get_nhgrp(ctl, pnhops, curr_nhops, perror);
+
+ /* Release the temporary list unless it lives on the stack */
+ if (pnhops != (struct weightened_nhop *)&storage[0])
+ free(pnhops, M_TEMP);
+
+ if (nhg_priv == NULL)
+ return (NULL);
+
+ return (nhg_priv);
+}
+
+
+/*
+ * Finds or creates the nexthop group described by the @num_nhops entries
+ * of @wn, using the nexthop control structure of @rh.
+ *
+ * Returns 0 on success or errno. @rnd->rnd_nhgrp is filled in only when
+ * a group has been obtained; @rnd->rnd_weight is always reset to 0.
+ *
+ * If the error is EAGAIN, then the operation can be retried.
+ */
+int
+nhgrp_get_group(struct rib_head *rh, struct weightened_nhop *wn, int num_nhops,
+    struct route_nhop_data *rnd)
+{
+	struct nhgrp_priv *grp;
+	int error;
+
+	grp = get_nhgrp(rh->nh_control, wn, num_nhops, &error);
+	if (grp != NULL) {
+		rnd->rnd_nhgrp = grp->nhg;
+	}
+	rnd->rnd_weight = 0;
+
+	return (error);
+}
+
+/*
+ * Creates new nexthop group based on @src group with the nexthops for which
+ * @flt_func(nh, @flt_data) returns non-zero removed.
+ *
+ * The result is stored in @rnd:
+ *  - no nexthops left: rnd_nhgrp is set to NULL, weight to 0
+ *  - one nexthop left: rnd_nhop/rnd_weight describe that nexthop, which
+ *    gets referenced (EAGAIN is returned if the reference cannot be taken)
+ *  - otherwise: rnd_nhgrp is the group returned by get_nhgrp(), weight 0
+ *
+ * Returns 0 on success or errno (ENOMEM, EAGAIN or an error passed from
+ * get_nhgrp()).
+ */
+int
+nhgrp_get_filtered_group(struct rib_head *rh, const struct nhgrp_object *src,
+    nhgrp_filter_cb_t flt_func, void *flt_data, struct route_nhop_data *rnd)
+{
+	/*
+	 * 64 bytes of on-stack scratch space. Declared with the element
+	 * type so that the buffer is suitably aligned for the structure.
+	 */
+	struct weightened_nhop storage[64 / sizeof(struct weightened_nhop)];
+	struct nh_control *ctl = rh->nh_control;
+	struct weightened_nhop *pnhops;
+	const struct nhgrp_priv *mp_priv, *src_priv;
+	size_t sz;
+	int error, i, num_nhops;
+
+	src_priv = NHGRP_PRIV_CONST(src);
+
+	sz = src_priv->nhg_nh_count * (sizeof(struct weightened_nhop));
+	/* Small groups fit into the stack buffer, larger ones use malloc */
+	if (sz <= sizeof(storage))
+		pnhops = storage;
+	else {
+		if ((pnhops = malloc(sz, M_TEMP, M_NOWAIT)) == NULL)
+			return (ENOMEM);
+	}
+
+	/* Filter nexthops: keep those the callback does not select */
+	error = 0;
+	num_nhops = 0;
+	for (i = 0; i < src_priv->nhg_nh_count; i++) {
+		if (flt_func(src_priv->nhg_nh_weights[i].nh, flt_data))
+			continue;
+		memcpy(&pnhops[num_nhops++], &src_priv->nhg_nh_weights[i],
+		    sizeof(struct weightened_nhop));
+	}
+
+	if (num_nhops == 0) {
+		rnd->rnd_nhgrp = NULL;
+		rnd->rnd_weight = 0;
+	} else if (num_nhops == 1) {
+		/* A single path does not need a group, use the nhop itself */
+		rnd->rnd_nhop = pnhops[0].nh;
+		rnd->rnd_weight = pnhops[0].weight;
+		if (nhop_try_ref_object(rnd->rnd_nhop) == 0)
+			error = EAGAIN;
+	} else {
+		mp_priv = get_nhgrp(ctl, pnhops, num_nhops, &error);
+		if (mp_priv != NULL)
+			rnd->rnd_nhgrp = mp_priv->nhg;
+		rnd->rnd_weight = 0;
+	}
+
+	if (pnhops != storage)
+		free(pnhops, M_TEMP);
+
+	return (error);
+}
+
+/*
+ * Builds the nexthop data resulting from adding the nexthop in @rnd_add
+ * to the existing nexthop or nexthop group in @rnd_orig.
+ *
+ * When @rnd_orig holds no nexthop, @rnd_add is copied to @rnd_new and its
+ * nexthop is referenced. Otherwise a group containing the original path(s)
+ * plus the added one is looked up or created.
+ *
+ * Returns 0 on success and stores result in @rnd_new, errno otherwise.
+ */
+int
+nhgrp_get_addition_group(struct rib_head *rh, struct route_nhop_data *rnd_orig,
+    struct route_nhop_data *rnd_add, struct route_nhop_data *rnd_new)
+{
+	struct weightened_nhop wn[2];
+	struct nhgrp_priv *grp;
+	struct nh_control *ctl;
+	int error;
+
+	ctl = rh->nh_control;
+
+	if (rnd_orig->rnd_nhop == NULL) {
+		/* Nothing to merge with: hand out a reference to the new nhop */
+		*rnd_new = *rnd_add;
+		return ((nhop_try_ref_object(rnd_new->rnd_nhop) == 0) ?
+		    EAGAIN : 0);
+	}
+
+	wn[0].nh = rnd_add->rnd_nhop;
+	wn[0].weight = rnd_add->rnd_weight;
+
+	if (NH_IS_NHGRP(rnd_orig->rnd_nhop)) {
+		/* Existing group: extend it with the additional nexthop */
+		grp = append_nhops(ctl, rnd_orig->rnd_nhgrp, wn, 1, &error);
+	} else {
+		/* Two plain nexthops: merge them into a 2-path group */
+		wn[1].nh = rnd_orig->rnd_nhop;
+		wn[1].weight = rnd_orig->rnd_weight;
+		grp = get_nhgrp(ctl, wn, 2, &error);
+	}
+
+	if (grp == NULL)
+		return (error);
+
+	rnd_new->rnd_nhgrp = grp->nhg;
+	rnd_new->rnd_weight = 0;
+	return (0);
+}
+
+/*
+ * Gives access to the weighted nexthop array of the group @nhg.
+ *
+ * Returns pointer to the array stored in the group's private data and
+ * writes the number of its items into @pnum_nhops.
+ * Asserts that @nhg carries the MPF_MULTIPATH flag.
+ */
+struct weightened_nhop *
+nhgrp_get_nhops(struct nhgrp_object *nhg, uint32_t *pnum_nhops)
+{
+	struct nhgrp_priv *priv;
+
+	KASSERT(((nhg->nhg_flags & MPF_MULTIPATH) != 0), ("nhop is not mpath"));
+
+	priv = NHGRP_PRIV(nhg);
+	*pnum_nhops = priv->nhg_nh_count;
+	return (priv->nhg_nh_weights);
+}
+
+/*
+ * Serializes the nexthop group @nhg_priv into @buffer and copies it out
+ * to the sysctl request @w.
+ *
+ * The record consists of rt_msghdr, nhgrp_external and two containers:
+ * the control plane nexthops (index + weight) followed by the dataplane
+ * nexthops (index, weight reported as 0).
+ * @buffer_size is only used to assert that the record fits into @buffer.
+ * @rh is not used by this function.
+ *
+ * Returns the result of SYSCTL_OUT().
+ */
+__noinline static int
+dump_nhgrp_entry(struct rib_head *rh, const struct nhgrp_priv *nhg_priv,
+ char *buffer, size_t buffer_size, struct sysctl_req *w)
+{
+ struct rt_msghdr *rtm;
+ struct nhgrp_external *nhge;
+ struct nhgrp_container *nhgc;
+ const struct nhgrp_object *nhg;
+ struct nhgrp_nhop_external *ext;
+ int error;
+ size_t sz;
+
+ nhg = nhg_priv->nhg;
+
+ /* Total record size: headers plus both nexthop lists */
+ sz = sizeof(struct rt_msghdr) + sizeof(struct nhgrp_external);
+ /* controlplane nexthops */
+ sz += sizeof(struct nhgrp_container);
+ sz += sizeof(struct nhgrp_nhop_external) * nhg_priv->nhg_nh_count;
+ /* dataplane nexthops */
+ sz += sizeof(struct nhgrp_container);
+ sz += sizeof(struct nhgrp_nhop_external) * nhg->nhg_size;
+
+ KASSERT(sz <= buffer_size, ("increase nhgrp buffer size"));
+
+ bzero(buffer, sz);
+
+ rtm = (struct rt_msghdr *)buffer;
+ rtm->rtm_msglen = sz;
+ rtm->rtm_version = RTM_VERSION;
+ rtm->rtm_type = RTM_GET;
+
+ /* Group header immediately follows the routing message header */
+ nhge = (struct nhgrp_external *)(rtm + 1);
+
+ nhge->nhg_idx = nhg_priv->nhg_idx;
+ nhge->nhg_refcount = nhg_priv->nhg_refcount;
+
+ /* fill in control plane nexthops first */
+ nhgc = (struct nhgrp_container *)(nhge + 1);
+ nhgc->nhgc_type = NHG_C_TYPE_CNHOPS;
+ nhgc->nhgc_subtype = 0;
+ nhgc->nhgc_len = sizeof(struct nhgrp_container);
+ nhgc->nhgc_len += sizeof(struct nhgrp_nhop_external) * nhg_priv->nhg_nh_count;
+ nhgc->nhgc_count = nhg_priv->nhg_nh_count;
+
+ ext = (struct nhgrp_nhop_external *)(nhgc + 1);
+ for (int i = 0; i < nhg_priv->nhg_nh_count; i++) {
+ ext[i].nh_idx = nhg_priv->nhg_nh_weights[i].nh->nh_priv->nh_idx;
+ ext[i].nh_weight = nhg_priv->nhg_nh_weights[i].weight;
+ }
+
+ /* fill in dataplane nexthops */
+ /* The second container starts right after the last control plane item */
+ nhgc = (struct nhgrp_container *)(&ext[nhg_priv->nhg_nh_count]);
+ nhgc->nhgc_type = NHG_C_TYPE_DNHOPS;
+ nhgc->nhgc_subtype = 0;
+ nhgc->nhgc_len = sizeof(struct nhgrp_container);
+ nhgc->nhgc_len += sizeof(struct nhgrp_nhop_external) * nhg->nhg_size;
+ nhgc->nhgc_count = nhg->nhg_size;
+
+ ext = (struct nhgrp_nhop_external *)(nhgc + 1);
+ for (int i = 0; i < nhg->nhg_size; i++) {
+ ext[i].nh_idx = nhg->nhops[i]->nh_priv->nh_idx;
+ ext[i].nh_weight = 0;
+ }
+
+ error = SYSCTL_OUT(w, buffer, sz);
+
+ return (error);
+}
+
+/*
+ * Exports every nexthop group linked to the control structure of @rh
+ * to the sysctl request @w, one record per group.
+ *
+ * Returns 0 when there are no groups or all of them were written,
+ * otherwise the first error reported by dump_nhgrp_entry().
+ */
+int
+nhgrp_dump_sysctl(struct rib_head *rh, struct sysctl_req *w)
+{
+	struct epoch_tracker et;
+	struct nh_control *ctl;
+	struct nhgrp_priv *nhg_priv;
+	char *buf;
+	size_t bufsz;
+	int error;
+
+	ctl = rh->nh_control;
+	if (ctl->gr_head.items_count == 0)
+		return (0);
+
+	/* Size the scratch buffer for the largest possible group record */
+	bufsz = sizeof(struct rt_msghdr) + sizeof(struct nhgrp_external) +
+	    2 * sizeof(struct nhgrp_container) +
+	    2 * sizeof(struct nhgrp_nhop_external) * RIB_MAX_MPATH_WIDTH;
+	buf = malloc(bufsz, M_TEMP, M_WAITOK);
+
+	error = 0;
+	NET_EPOCH_ENTER(et);
+	NHOPS_RLOCK(ctl);
+	CHT_SLIST_FOREACH(&ctl->gr_head, mpath, nhg_priv) {
+		error = dump_nhgrp_entry(rh, nhg_priv, buf, bufsz, w);
+		if (error != 0)
+			break;
+	} CHT_SLIST_FOREACH_END;
+	NHOPS_RUNLOCK(ctl);
+	NET_EPOCH_EXIT(et);
+
+	free(buf, M_TEMP);
+
+	return (error);
+}
Index: head/sys/net/route/nhgrp_var.h
===================================================================
--- head/sys/net/route/nhgrp_var.h
+++ head/sys/net/route/nhgrp_var.h
@@ -0,0 +1,72 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2020 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * This header file contains private definitions for the nexthop groups.
+ *
+ * Header is not intended to be included by the code external to the
+ * routing subsystem.
+ */
+
+#ifndef _NET_ROUTE_NHGRP_VAR_H_
+#define _NET_ROUTE_NHGRP_VAR_H_
+
+/* nhgrp hash definition */
+/* produce hash value for an object */
+#define mpath_hash_obj(_obj) (hash_nhgrp(_obj))
+/* compare two objects */
+#define mpath_cmp(_one, _two) (cmp_nhgrp(_one, _two))
+/* next object accessor */
+#define mpath_next(_obj) (_obj)->nhg_priv_next
+
+/*
+ * Private (control plane) part of a nexthop group.
+ *
+ * The structure ends with a variable-length array of weighted nexthops,
+ * so it has to be allocated with room for nhg_nh_count trailing items.
+ */
+struct nhgrp_priv {
+	uint32_t		nhg_idx;	/* nexthop group index */
+	uint8_t			nhg_nh_count;	/* number of items in nh_weights */
+	uint8_t			nhg_spare[3];
+	u_int			nhg_refcount;	/* use refcount */
+	u_int			nhg_linked;	/* refcount(9), == 2 if linked to the list */
+	struct nh_control	*nh_control;	/* parent control structure */
+	struct nhgrp_priv	*nhg_priv_next;	/* next item, see mpath_next() */
+	struct nhgrp_object	*nhg;		/* associated nhgrp_object */
+	struct epoch_context	nhg_epoch_ctx;	/* epoch data for nhop */
+	/* C99 flexible array member; must stay the last field */
+	struct weightened_nhop	nhg_nh_weights[];
+};
+
+#define _NHGRP_PRIV(_src) (&(_src)->nhops[(_src)->nhg_size])
+#define NHGRP_PRIV(_src) ((struct nhgrp_priv *)_NHGRP_PRIV(_src))
+#define NHGRP_PRIV_CONST(_src) ((const struct nhgrp_priv *)_NHGRP_PRIV(_src))
+
+/* nhgrp.c */
+bool nhgrp_ctl_alloc_default(struct nh_control *ctl, int malloc_flags);
+struct nhgrp_priv *find_nhgrp(struct nh_control *ctl, const struct nhgrp_priv *key);
+int link_nhgrp(struct nh_control *ctl, struct nhgrp_priv *grp_priv);
+struct nhgrp_priv *unlink_nhgrp(struct nh_control *ctl, struct nhgrp_priv *key);
+
+#endif
+
Index: head/sys/net/route/nhop.h
===================================================================
--- head/sys/net/route/nhop.h
+++ head/sys/net/route/nhop.h
@@ -155,7 +155,7 @@
*/
#define NH_IS_VALID(_nh) RT_LINK_IS_UP((_nh)->nh_ifp)
-#define NH_IS_MULTIPATH(_nh) ((_nh)->nh_flags & NHF_MULTIPATH)
+#define NH_IS_NHGRP(_nh) ((_nh)->nh_flags & NHF_MULTIPATH)
#define RT_GATEWAY(_rt) ((struct sockaddr *)&(_rt)->rt_nhop->gw4_sa)
#define RT_GATEWAY_CONST(_rt) ((const struct sockaddr *)&(_rt)->rt_nhop->gw4_sa)
@@ -166,6 +166,11 @@
_nh = NULL; \
} while (0)
+/* A nexthop paired with the weight assigned to it */
+struct weightened_nhop {
+ struct nhop_object *nh;
+ uint32_t weight;
+};
+
void nhop_free(struct nhop_object *nh);
struct sysctl_req;
@@ -209,16 +214,34 @@
uint16_t src_sa_off; /* offset of src address SA */
};
-struct mpath_nhop_external {
+#define NHG_C_TYPE_CNHOPS 0x1 /* Control plane nhops list */
+#define NHG_C_TYPE_DNHOPS 0x2 /* Dataplane nhops list */
+/*
+ * Header of one nexthop list inside an exported nexthop group record.
+ * nhgc_type is one of the NHG_C_TYPE_* values.
+ */
+struct nhgrp_container {
+ uint32_t nhgc_len; /* container length */
+ uint16_t nhgc_count; /* number of items */
+ uint8_t nhgc_type; /* container type */
+ uint8_t nhgc_subtype; /* container subtype */
+};
+
+struct nhgrp_nhop_external {
uint32_t nh_idx;
uint32_t nh_weight;
};
-struct mpath_external {
- uint32_t mp_idx;
- uint32_t mp_refcount;
- uint32_t mp_nh_count;
- uint32_t mp_group_size;
+/*
+ * Layout:
+ * - nhgrp_external
+ * - nhgrp_container (control plane nhops list)
+ * - nhgrp_nhop_external
+ * - nhgrp_nhop_external
+ * ..
+ * - nhgrp_container (dataplane nhops list)
+ * - nhgrp_nhop_external
+ * - nhgrp_nhop_external
+ */
+struct nhgrp_external {
+ uint32_t nhg_idx; /* Nexthop group index */
+ uint32_t nhg_refcount; /* number of references */
};
#endif
Index: head/sys/net/route/nhop.c
===================================================================
--- head/sys/net/route/nhop.c
+++ head/sys/net/route/nhop.c
@@ -64,7 +64,7 @@
* is backed by the bitmask array.
*/
-static MALLOC_DEFINE(M_NHOP, "nhops", "nexthops data");
+MALLOC_DEFINE(M_NHOP, "nhops", "nexthops data");
/* Hash management functions */
@@ -112,6 +112,9 @@
NHOPS_LOCK_DESTROY(ctl);
free(ctl->nh_head.ptr, M_NHOP);
free(ctl->nh_idx_head.idx, M_NHOP);
+#ifdef ROUTE_MPATH
+ nhgrp_ctl_free(ctl);
+#endif
free(ctl, M_NHOP);
}
@@ -154,6 +157,9 @@
DPRINTF("Marking nhop %u unlinked", nh_priv->nh_idx);
refcount_release(&nh_priv->nh_linked);
} CHT_SLIST_FOREACH_END;
+#ifdef ROUTE_MPATH
+ nhgrp_ctl_unlink_all(ctl);
+#endif
NHOPS_WUNLOCK(ctl);
/*
Index: head/sys/net/route/nhop_ctl.c
===================================================================
--- head/sys/net/route/nhop_ctl.c
+++ head/sys/net/route/nhop_ctl.c
@@ -695,7 +695,14 @@
nhop_free_any(struct nhop_object *nh)
{
+#ifdef ROUTE_MPATH
+ if (!NH_IS_NHGRP(nh))
+ nhop_free(nh);
+ else
+ nhgrp_free((struct nhgrp_object *)nh);
+#else
nhop_free(nh);
+#endif
}
/* Helper functions */
Index: head/sys/net/route/nhop_var.h
===================================================================
--- head/sys/net/route/nhop_var.h
+++ head/sys/net/route/nhop_var.h
@@ -37,6 +37,8 @@
#ifndef _NET_ROUTE_NHOP_VAR_H_
#define _NET_ROUTE_NHOP_VAR_H_
+MALLOC_DECLARE(M_NHOP);
+
/* define nhop hash table */
struct nhop_priv;
CHT_SLIST_DEFINE(nhops, struct nhop_priv);
@@ -47,9 +49,15 @@
/* next object accessor */
#define nhops_next(_obj) (_obj)->nh_next
+/* define multipath hash table */
+struct nhgrp_priv;
+CHT_SLIST_DEFINE(nhgroups, struct nhgrp_priv);
+
struct nh_control {
struct nhops_head nh_head; /* hash table head */
struct bitmask_head nh_idx_head; /* nhop index head */
+ struct nhgroups_head gr_head; /* nhgrp hash table head */
+ struct bitmask_head gr_idx_head; /* nhgrp index head */
struct rwlock ctl_lock; /* overall ctl lock */
struct rib_head *ctl_rh; /* pointer back to rnh */
struct epoch_context ctl_epoch_ctx; /* epoch ctl helper */
@@ -80,7 +88,8 @@
struct epoch_context nh_epoch_ctx; /* epoch data for nhop */
};
-#define NH_IS_PINNED(_nh) ((_nh)->nh_priv->rt_flags & RTF_PINNED)
+#define NH_IS_PINNED(_nh) ((!NH_IS_NHGRP(_nh)) && \
+ ((_nh)->nh_priv->rt_flags & RTF_PINNED))
/* nhop.c */
struct nhop_priv *find_nhop(struct nh_control *ctl,
Index: head/sys/net/route/route_ctl.h
===================================================================
--- head/sys/net/route/route_ctl.h
+++ head/sys/net/route/route_ctl.h
@@ -53,6 +53,10 @@
int rib_action(uint32_t fibnum, int action, struct rt_addrinfo *info,
struct rib_cmd_info *rc);
+typedef void route_notification_t(struct rib_cmd_info *rc, void *);
+void rib_decompose_notification(struct rib_cmd_info *rc,
+ route_notification_t *cb, void *cbdata);
+
int rib_add_redirect(u_int fibnum, struct sockaddr *dst,
struct sockaddr *gateway, struct sockaddr *author, struct ifnet *ifp,
int flags, int expire_sec);
@@ -65,6 +69,20 @@
typedef void rt_setwarg_t(struct rib_head *, uint32_t, int, void *);
void rt_foreach_fib_walk(int af, rt_setwarg_t *, rt_walktree_f_t *, void *);
void rt_foreach_fib_walk_del(int af, rt_filter_f_t *filter_f, void *arg);
+
+struct route_nhop_data;
+const struct rtentry *rib_lookup_prefix(uint32_t fibnum, int family,
+ const struct sockaddr *dst, const struct sockaddr *netmask,
+ struct route_nhop_data *rnd);
+const struct rtentry *rib_lookup_lpm(uint32_t fibnum, int family,
+ const struct sockaddr *dst, struct route_nhop_data *rnd);
+
+/* Multipath */
+struct nhgrp_object;
+struct weightened_nhop;
+
+struct weightened_nhop *nhgrp_get_nhops(struct nhgrp_object *nhg,
+ uint32_t *pnum_nhops);
enum rib_subscription_type {
RIB_NOTIFY_IMMEDIATE,
Index: head/sys/net/route/route_ctl.c
===================================================================
--- head/sys/net/route/route_ctl.c
+++ head/sys/net/route/route_ctl.c
@@ -29,7 +29,7 @@
__FBSDID("$FreeBSD$");
#include "opt_inet.h"
#include "opt_inet6.h"
-#include "opt_mpath.h"
+#include "opt_route.h"
#include <sys/param.h>
#include <sys/systm.h>
@@ -83,9 +83,6 @@
struct rib_cmd_info *rc);
static int change_route(struct rib_head *rnh, struct rt_addrinfo *info,
struct route_nhop_data *nhd_orig, struct rib_cmd_info *rc);
-static int change_route_nhop(struct rib_head *rnh, struct rtentry *rt,
- struct rt_addrinfo *info, struct route_nhop_data *rnd,
- struct rib_cmd_info *rc);
static int rt_unlinkrte(struct rib_head *rnh, struct rt_addrinfo *info,
struct rib_cmd_info *rc);
@@ -94,7 +91,21 @@
struct rib_cmd_info *rc);
static void destroy_subscription_epoch(epoch_context_t ctx);
+static bool rib_can_multipath(struct rib_head *rh);
+/* Per-vnet multipath routing configuration */
+SYSCTL_DECL(_net_route);
+#define V_rib_route_multipath VNET(rib_route_multipath)
+#ifdef ROUTE_MPATH
+#define _MP_FLAGS CTLFLAG_RW
+#else
+#define _MP_FLAGS CTLFLAG_RD
+#endif
+VNET_DEFINE(u_int, rib_route_multipath) = 0;
+SYSCTL_UINT(_net_route, OID_AUTO, multipath, _MP_FLAGS | CTLFLAG_VNET,
+ &VNET_NAME(rib_route_multipath), 0, "Enable route multipath");
+#undef _MP_FLAGS
+
/* Routing table UMA zone */
VNET_DEFINE_STATIC(uma_zone_t, rtzone);
#define V_rtzone VNET(rtzone)
@@ -128,7 +139,7 @@
CURVNET_SET(nhop_get_vnet(rt->rt_nhop));
/* Unreference nexthop */
- nhop_free(rt->rt_nhop);
+ nhop_free_any(rt->rt_nhop);
uma_zfree(V_rtzone, rt);
@@ -175,6 +186,41 @@
return (rnh);
}
+#ifdef ROUTE_MPATH
+/*
+ * Tells whether multipath routing is enabled in the vnet @rh belongs to,
+ * i.e. whether that vnet's rib_route_multipath value is non-zero.
+ */
+static bool
+rib_can_multipath(struct rib_head *rh)
+{
+	bool enabled;
+
+	CURVNET_SET(rh->rib_vnet);
+	enabled = (V_rib_route_multipath != 0);
+	CURVNET_RESTORE();
+
+	return (enabled);
+}
+
+/*
+ * Checks if nhop is multipath-eligible.
+ * Nexthops flagged NHF_MULTIPATH are always eligible. Other nexthops
+ * are eligible only if they have a gateway (NHF_GATEWAY) and are not
+ * redirects (NHF_REDIRECT).
+ *
+ * Returns true for multipath-eligible nexthop,
+ * false otherwise.
+ */
+bool
+nhop_can_multipath(const struct nhop_object *nh)
+{
+
+	if ((nh->nh_flags & NHF_MULTIPATH) != 0)
+		return (true);
+
+	if ((nh->nh_flags & NHF_GATEWAY) == 0 ||
+	    (nh->nh_flags & NHF_REDIRECT) != 0)
+		return (false);
+
+	return (true);
+}
+#endif
+
static int
get_info_weight(const struct rt_addrinfo *info, uint32_t default_weight)
{
@@ -206,7 +252,7 @@
*
* Returns true if matches, false otherwise.
*/
-static bool
+bool
match_nhop_gw(const struct nhop_object *nh, const struct sockaddr *gw)
{
@@ -461,7 +507,7 @@
struct rib_cmd_info *rc)
{
struct nhop_object *nh_orig;
- struct route_nhop_data rnd;
+ struct route_nhop_data rnd_orig, rnd_add;
struct nhop_object *nh;
struct rtentry *rt, *rt_orig;
int error;
@@ -470,32 +516,19 @@
if (error != 0)
return (error);
- rnd.rnd_nhop = rt->rt_nhop;
- rnd.rnd_weight = rt->rt_weight;
+ rnd_add.rnd_nhop = rt->rt_nhop;
+ rnd_add.rnd_weight = rt->rt_weight;
nh = rt->rt_nhop;
RIB_WLOCK(rnh);
-#ifdef RADIX_MPATH
- struct sockaddr *netmask;
- netmask = info->rti_info[RTAX_NETMASK];
- /* do not permit exactly the same dst/mask/gw pair */
- if (rt_mpath_capable(rnh) &&
- rt_mpath_conflict(rnh, rt, netmask)) {
- RIB_WUNLOCK(rnh);
-
- nhop_free(nh);
- uma_zfree(V_rtzone, rt);
- return (EEXIST);
- }
-#endif
- error = add_route_nhop(rnh, rt, info, &rnd, rc);
+ error = add_route_nhop(rnh, rt, info, &rnd_add, rc);
if (error == 0) {
RIB_WUNLOCK(rnh);
return (0);
}
/* addition failed. Lookup prefix in the rib to determine the cause */
- rt_orig = lookup_prefix(rnh, info, &rnd);
+ rt_orig = lookup_prefix(rnh, info, &rnd_orig);
if (rt_orig == NULL) {
/* No prefix -> rnh_addaddr() failed to allocate memory */
RIB_WUNLOCK(rnh);
@@ -505,11 +538,11 @@
}
/* We have existing route in the RIB. */
- nh_orig = rnd.rnd_nhop;
+ nh_orig = rnd_orig.rnd_nhop;
/* Check if new route has higher preference */
if (can_override_nhop(info, nh_orig) > 0) {
/* Update nexthop to the new route */
- change_route_nhop(rnh, rt_orig, info, &rnd, rc);
+ change_route_nhop(rnh, rt_orig, info, &rnd_add, rc);
RIB_WUNLOCK(rnh);
uma_zfree(V_rtzone, rt);
nhop_free(nh_orig);
@@ -518,11 +551,26 @@
RIB_WUNLOCK(rnh);
+#ifdef ROUTE_MPATH
+ if (rib_can_multipath(rnh) && nhop_can_multipath(rnd_add.rnd_nhop) &&
+ nhop_can_multipath(rnd_orig.rnd_nhop))
+ error = add_route_mpath(rnh, info, rt, &rnd_add, &rnd_orig, rc);
+ else
+#endif
/* Unable to add - another route with the same preference exists */
error = EEXIST;
+ /*
+ * ROUTE_MPATH disabled: failed to add route, free both nhop and rt.
+ * ROUTE_MPATH enabled: original nhop reference is unused in any case,
+ * free rt only if not _adding_ new route to rib (e.g. the case
+ * when initial lookup returned existing route, but then it got
+ * deleted prior to multipath group insertion, leading to a simple
+ * non-multipath add as a result).
+ */
nhop_free(nh);
- uma_zfree(V_rtzone, rt);
+ if ((error != 0) || rc->rc_cmd != RTM_ADD)
+ uma_zfree(V_rtzone, rt);
return (error);
}
@@ -588,7 +636,13 @@
return (ESRCH);
nh = rt->rt_nhop;
-
+#ifdef ROUTE_MPATH
+ if (NH_IS_NHGRP(nh)) {
+ error = del_route_mpath(rnh, info, rt,
+ (struct nhgrp_object *)nh, rc);
+ return (error);
+ }
+#endif
error = check_info_match_nhop(info, rt, nh);
if (error != 0)
return (error);
@@ -600,14 +654,6 @@
* Remove the item from the tree and return it.
* Complain if it is not there and do no more processing.
*/
-#ifdef RADIX_MPATH
- info->rti_info[RTAX_GATEWAY] = &nh->gw_sa;
- if (rt_mpath_capable(rnh)) {
- rn = rt_mpath_unlink(rnh, info, rt, &error);
- if (error != 0)
- return (error);
- } else
-#endif
rn = rnh->rnh_deladdr(info->rti_info[RTAX_DST],
info->rti_info[RTAX_NETMASK], &rnh->head);
if (rn == NULL)
@@ -648,7 +694,18 @@
* If the caller wants it, then it can have it,
* the entry will be deleted after the end of the current epoch.
*/
- rtfree(rc->rc_rt);
+ if (rc->rc_cmd == RTM_DELETE)
+ rtfree(rc->rc_rt);
+#ifdef ROUTE_MPATH
+ else {
+ /*
+ * Deleting 1 path may result in RTM_CHANGE to
+ * a different mpath group/nhop.
+ * Free old mpath group.
+ */
+ nhop_free_any(rc->rc_nh_old);
+ }
+#endif
return (0);
}
@@ -694,19 +751,6 @@
return (ESRCH);
}
-#ifdef RADIX_MPATH
- /*
- * If we got multipath routes,
- * we require users to specify a matching RTAX_GATEWAY.
- */
- if (rt_mpath_capable(rnh)) {
- rt = rt_mpath_matchgate(rt, info->rti_info[RTAX_GATEWAY]);
- if (rt == NULL) {
- RIB_RUNLOCK(rnh);
- return (ESRCH);
- }
- }
-#endif
rnd_orig.rnd_nhop = rt->rt_nhop;
rnd_orig.rnd_weight = rt->rt_weight;
@@ -722,19 +766,12 @@
}
static int
-change_route(struct rib_head *rnh, struct rt_addrinfo *info,
- struct route_nhop_data *rnd_orig, struct rib_cmd_info *rc)
+change_nhop(struct rib_head *rnh, struct rt_addrinfo *info,
+ struct nhop_object *nh_orig, struct nhop_object **nh_new)
{
- int error = 0;
int free_ifa = 0;
- struct nhop_object *nh, *nh_orig;
- struct route_nhop_data rnd_new;
+ int error;
- nh = NULL;
- nh_orig = rnd_orig->rnd_nhop;
- if (nh_orig == NULL)
- return (ESRCH);
-
/*
* New gateway could require new ifaddr, ifp;
* flags may also be different; ifp may be specified
@@ -759,25 +796,102 @@
}
}
- error = nhop_create_from_nhop(rnh, nh_orig, info, &nh);
+ error = nhop_create_from_nhop(rnh, nh_orig, info, nh_new);
if (free_ifa) {
ifa_free(info->rti_ifa);
info->rti_ifa = NULL;
}
+
+ return (error);
+}
+
+#ifdef ROUTE_MPATH
+/*
+ * Changes one path of the multipath route whose nexthop group is stored
+ * in @rnd_orig.
+ *
+ * The path to change is the first nexthop of the group matching @info.
+ * A replacement nexthop is created with change_nhop(), substituted into
+ * a private copy of the group's nexthop array and a group for the new
+ * array is obtained with nhgrp_get_group(). The route is then switched
+ * with change_route_conditional().
+ *
+ * Returns 0 on success, ESRCH if no nexthop of the group matches @info,
+ * EAGAIN if the temporary array cannot be allocated, or an error from
+ * the helpers above.
+ */
+static int
+change_mpath_route(struct rib_head *rnh, struct rt_addrinfo *info,
+ struct route_nhop_data *rnd_orig, struct rib_cmd_info *rc)
+{
+ int error = 0;
+ struct nhop_object *nh_orig, *nh_new;
+ struct route_nhop_data rnd_new;
+
+ nh_orig = rnd_orig->rnd_nhop;
+
+ struct weightened_nhop *wn = NULL, *wn_new;
+ uint32_t num_nhops;
+
+ wn = nhgrp_get_nhops((struct nhgrp_object *)nh_orig, &num_nhops);
+ nh_orig = NULL;
+ for (int i = 0; i < num_nhops; i++) {
+ /* check_info_match_nhop() returns 0 when the nexthop matches */
+ if (check_info_match_nhop(info, NULL, wn[i].nh) == 0) {
+ nh_orig = wn[i].nh;
+ break;
+ }
+ }
+
+ if (nh_orig == NULL)
+ return (ESRCH);
+
+ error = change_nhop(rnh, info, nh_orig, &nh_new);
if (error != 0)
return (error);
- rnd_new.rnd_nhop = nh;
- if (info->rti_mflags & RTV_WEIGHT)
- rnd_new.rnd_weight = info->rti_rmx->rmx_weight;
- else
- rnd_new.rnd_weight = rnd_orig->rnd_weight;
+ wn_new = mallocarray(num_nhops, sizeof(struct weightened_nhop),
+ M_TEMP, M_NOWAIT | M_ZERO);
+ if (wn_new == NULL) {
+ nhop_free(nh_new);
+ return (EAGAIN);
+ }
+ memcpy(wn_new, wn, num_nhops * sizeof(struct weightened_nhop));
+ /*
+ * Substitute the changed nexthop in the private copy only:
+ * @wn points into the existing group and must stay untouched.
+ */
+ for (int i = 0; i < num_nhops; i++) {
+ if (wn[i].nh == nh_orig) {
+ wn_new[i].nh = nh_new;
+ wn_new[i].weight = get_info_weight(info, rnd_orig->rnd_weight);
+ break;
+ }
+ }
+
+ error = nhgrp_get_group(rnh, wn_new, num_nhops, &rnd_new);
+ nhop_free(nh_new);
+ free(wn_new, M_TEMP);
+
+ if (error != 0)
+ return (error);
+
error = change_route_conditional(rnh, NULL, info, rnd_orig, &rnd_new, rc);
return (error);
}
+#endif
+/*
+ * Changes the nexthop of the route described by @rnd_orig according
+ * to @info.
+ *
+ * Nexthop groups are handed over to change_mpath_route() when ROUTE_MPATH
+ * is compiled in. For a plain nexthop a new one is derived with
+ * change_nhop() and installed with change_route_conditional().
+ *
+ * Returns 0 on success, ESRCH if @rnd_orig carries no nexthop, or an
+ * error from the helpers.
+ */
+static int
+change_route(struct rib_head *rnh, struct rt_addrinfo *info,
+    struct route_nhop_data *rnd_orig, struct rib_cmd_info *rc)
+{
+	struct route_nhop_data rnd_new;
+	struct nhop_object *nh_orig;
+	int error;
+
+	nh_orig = rnd_orig->rnd_nhop;
+	if (nh_orig == NULL)
+		return (ESRCH);
+
+#ifdef ROUTE_MPATH
+	if (NH_IS_NHGRP(nh_orig))
+		return (change_mpath_route(rnh, info, rnd_orig, rc));
+#endif
+
+	/* Weight comes from @info if set there, else from the old route */
+	rnd_new.rnd_weight = get_info_weight(info, rnd_orig->rnd_weight);
+	error = change_nhop(rnh, info, nh_orig, &rnd_new.rnd_nhop);
+	if (error != 0)
+		return (error);
+
+	return (change_route_conditional(rnh, NULL, info, rnd_orig, &rnd_new,
+	    rc));
+}
+
+
/*
* Insert @rt with nhop data from @rnd_new to @rnh.
* Returns 0 on success and stores operation results in @rc.
@@ -827,7 +941,7 @@
* Conditionally set rt_expire if set in @info.
* Returns 0 on success.
*/
-static int
+int
change_route_nhop(struct rib_head *rnh, struct rtentry *rt,
struct rt_addrinfo *info, struct route_nhop_data *rnd,
struct rib_cmd_info *rc)
@@ -855,6 +969,8 @@
rn = rnh->rnh_deladdr(ndst, netmask, &rnh->head);
if (rn == NULL)
return (ESRCH);
+ rt = RNTORT(rn);
+ rt->rte_flags &= ~RTF_UP;
}
/* Finalize notification */
@@ -989,7 +1105,6 @@
info->rti_info[RTAX_DST] = rt_key(rt);
info->rti_info[RTAX_NETMASK] = rt_mask(rt);
- info->rti_info[RTAX_GATEWAY] = &rt->rt_nhop->gw_sa;
error = rt_unlinkrte(di->rnh, info, &di->rc);
@@ -1000,7 +1115,7 @@
* XXX: Delayed notifications not implemented
* for nexthop updates.
*/
- if (error == 0) {
+ if ((error == 0) && (di->rc.rc_cmd == RTM_DELETE)) {
/* Add to the list and return */
rt->rt_chain = di->head;
di->head = rt;
@@ -1024,6 +1139,7 @@
struct rib_head *rnh;
struct rt_delinfo di;
struct rtentry *rt;
+ struct nhop_object *nh;
struct epoch_tracker et;
rnh = rt_tables_get_rnh(fibnum, family);
@@ -1049,18 +1165,31 @@
rt = di.head;
di.head = rt->rt_chain;
rt->rt_chain = NULL;
+ nh = rt->rt_nhop;
di.rc.rc_rt = rt;
- di.rc.rc_nh_old = rt->rt_nhop;
+ di.rc.rc_nh_old = nh;
rib_notify(rnh, RIB_NOTIFY_DELAYED, &di.rc);
/* TODO std rt -> rt_addrinfo export */
di.info.rti_info[RTAX_DST] = rt_key(rt);
di.info.rti_info[RTAX_NETMASK] = rt_mask(rt);
- if (report)
- rt_routemsg(RTM_DELETE, rt, rt->rt_nhop->nh_ifp, 0,
- fibnum);
+ if (report) {
+#ifdef ROUTE_MPATH
+ struct nhgrp_object *nhg;
+ struct weightened_nhop *wn;
+ uint32_t num_nhops;
+ if (NH_IS_NHGRP(nh)) {
+ nhg = (struct nhgrp_object *)nh;
+ wn = nhgrp_get_nhops(nhg, &num_nhops);
+ for (int i = 0; i < num_nhops; i++)
+ rt_routemsg(RTM_DELETE, rt,
+ wn[i].nh->nh_ifp, 0, fibnum);
+ } else
+#endif
+ rt_routemsg(RTM_DELETE, rt, nh->nh_ifp, 0, fibnum);
+ }
rtfree(rt);
}
Index: head/sys/net/route/route_helpers.c
===================================================================
--- head/sys/net/route/route_helpers.c
+++ head/sys/net/route/route_helpers.c
@@ -131,3 +131,167 @@
return (nh);
}
+
+#ifdef ROUTE_MPATH
+/*
+ * Decomposes the RTM_CHANGE notification @rc into per-nexthop RTM_DELETE
+ * and RTM_ADD notifications, calling @cb with @cbdata for each of them.
+ *
+ * Each side (old/new) is treated as an array of weighted nexthops: either
+ * the array of a nexthop group or a single-item array built in @tmp.
+ * Note that @tmp is shared by both sides, so at most one of them can be
+ * a plain nexthop.
+ * The arrays are walked in parallel, comparing nexthop indexes.
+ */
+static void
+decompose_change_notification(struct rib_cmd_info *rc, route_notification_t *cb,
+ void *cbdata)
+{
+ uint32_t num_old, num_new;
+ uint32_t nh_idx_old, nh_idx_new;
+ struct weightened_nhop *wn_old, *wn_new;
+ struct weightened_nhop tmp = { NULL, 0 };
+ uint32_t idx_old = 0, idx_new = 0;
+
+ struct rib_cmd_info rc_del = { .rc_cmd = RTM_DELETE, .rc_rt = rc->rc_rt };
+ struct rib_cmd_info rc_add = { .rc_cmd = RTM_ADD, .rc_rt = rc->rc_rt };
+
+ if (NH_IS_NHGRP(rc->rc_nh_old)) {
+ wn_old = nhgrp_get_nhops((struct nhgrp_object *)rc->rc_nh_old, &num_old);
+ } else {
+ tmp.nh = rc->rc_nh_old;
+ tmp.weight = rc->rc_nh_weight;
+ wn_old = &tmp;
+ num_old = 1;
+ }
+ if (NH_IS_NHGRP(rc->rc_nh_new)) {
+ wn_new = nhgrp_get_nhops((struct nhgrp_object *)rc->rc_nh_new, &num_new);
+ } else {
+ tmp.nh = rc->rc_nh_new;
+ tmp.weight = rc->rc_nh_weight;
+ wn_new = &tmp;
+ num_new = 1;
+ }
+
+ /* Use the fact that each @wn array is sorted */
+ /*
+ * Want to convert into set of add (A) and delete (D) operations,
+ * listing old -> new nexthop indexes:
+ * [1] -> [1, 2] = A{2}
+ * [2] -> [1, 2] = A{1}
+ * [1, 2, 4] -> [1, 3, 4] = D{2}, A{3}
+ * [1, 2, 4] -> [1, 4] = D{2}
+ * [1, 2, 4] -> [3, 4] = D{1}, C{2,3} OR C{1,3}, D{2} OR D{1},D{2},A{3}
+ * [1, 2] -> [3, 4] = D{1}, D{2}, A{3}, A{4} (in some order)
+ *
+ * NOTE(review): the loop below always deletes the current old item
+ * when the indexes differ, so it may emit a delete followed by an
+ * add for a nexthop present on both sides - confirm this is intended.
+ */
+ idx_old = 0;
+ while ((idx_old < num_old) && (idx_new < num_new)) {
+ nh_idx_old = wn_old[idx_old].nh->nh_priv->nh_idx;
+ nh_idx_new = wn_new[idx_new].nh->nh_priv->nh_idx;
+
+ if (nh_idx_old == nh_idx_new) {
+ if (wn_old[idx_old].weight != wn_new[idx_new].weight) {
+ /* Update weight by providing del/add notifications */
+ rc_del.rc_nh_old = wn_old[idx_old].nh;
+ rc_del.rc_nh_weight = wn_old[idx_old].weight;
+ cb(&rc_del, cbdata);
+
+ rc_add.rc_nh_new = wn_new[idx_new].nh;
+ rc_add.rc_nh_weight = wn_new[idx_new].weight;
+ cb(&rc_add, cbdata);
+ }
+ idx_old++;
+ idx_new++;
+ } else if (nh_idx_old < nh_idx_new) {
+ /*
+ * [1, ~2~, 4], [1, ~3~, 4]
+ * [1, ~2~, 5], [1, ~3~, 4]
+ * [1, ~2~], [1, ~3~, 4]
+ */
+ if ((idx_old + 1 >= num_old) ||
+ (wn_old[idx_old + 1].nh->nh_priv->nh_idx > nh_idx_new)) {
+ /* Add new unless the next old item is still <= new */
+ rc_add.rc_nh_new = wn_new[idx_new].nh;
+ rc_add.rc_nh_weight = wn_new[idx_new].weight;
+ cb(&rc_add, cbdata);
+ idx_new++;
+ }
+ /* In any case, delete current old */
+ rc_del.rc_nh_old = wn_old[idx_old].nh;
+ rc_del.rc_nh_weight = wn_old[idx_old].weight;
+ cb(&rc_del, cbdata);
+ idx_old++;
+ } else {
+ /*
+ * nh_idx_old > nh_idx_new
+ *
+ * [1, ~3~, 4], [1, ~2~, 4]
+ * [1, ~3~, 5], [1, ~2~, 4]
+ * [1, ~3~, 4], [1, ~2~]
+ */
+ if ((idx_new + 1 >= num_new) ||
+ (wn_new[idx_new + 1].nh->nh_priv->nh_idx > nh_idx_old)) {
+ /* No next item or next item is > current one */
+ rc_add.rc_nh_new = wn_new[idx_new].nh;
+ rc_add.rc_nh_weight = wn_new[idx_new].weight;
+ cb(&rc_add, cbdata);
+ idx_new++;
+ }
+ /* In any case, delete current old */
+ rc_del.rc_nh_old = wn_old[idx_old].nh;
+ rc_del.rc_nh_weight = wn_old[idx_old].weight;
+ cb(&rc_del, cbdata);
+ idx_old++;
+ }
+ }
+
+ /* Old nexthops left over after the new array is exhausted: delete */
+ while (idx_old < num_old) {
+ rc_del.rc_nh_old = wn_old[idx_old].nh;
+ rc_del.rc_nh_weight = wn_old[idx_old].weight;
+ cb(&rc_del, cbdata);
+ idx_old++;
+ }
+
+ /* New nexthops left over after the old array is exhausted: add */
+ while (idx_new < num_new) {
+ rc_add.rc_nh_new = wn_new[idx_new].nh;
+ rc_add.rc_nh_weight = wn_new[idx_new].weight;
+ cb(&rc_add, cbdata);
+ idx_new++;
+ }
+}
+
+/*
+ * Decompose multipath cmd info @rc into a list of add/del/change
+ * single-path operations, calling @cb callback for each operation.
+ * Assumes at least one of the nexthops in @rc is multipath.
+ *
+ * RTM_ADD / RTM_DELETE: @cb is called once per nexthop of the new / old
+ * group with a copy of @rc carrying that nexthop and its weight.
+ * RTM_CHANGE: handled by decompose_change_notification().
+ * Nothing is reported when the relevant nexthop(s) are not groups or for
+ * any other command.
+ */
+void
+rib_decompose_notification(struct rib_cmd_info *rc, route_notification_t *cb,
+    void *cbdata)
+{
+	struct weightened_nhop *wn;
+	uint32_t num_nhops;
+	struct rib_cmd_info rc_new;
+
+	rc_new = *rc;
+	/* Use the rc_-prefixed field names so debug builds compile */
+	DPRINTF("cb=%p cmd=%d nh_old=%p nh_new=%p",
+	    cb, rc->rc_cmd, rc->rc_nh_old, rc->rc_nh_new);
+	switch (rc->rc_cmd) {
+	case RTM_ADD:
+		if (!NH_IS_NHGRP(rc->rc_nh_new))
+			return;
+		wn = nhgrp_get_nhops((struct nhgrp_object *)rc->rc_nh_new, &num_nhops);
+		for (uint32_t i = 0; i < num_nhops; i++) {
+			rc_new.rc_nh_new = wn[i].nh;
+			rc_new.rc_nh_weight = wn[i].weight;
+			cb(&rc_new, cbdata);
+		}
+		break;
+	case RTM_DELETE:
+		if (!NH_IS_NHGRP(rc->rc_nh_old))
+			return;
+		wn = nhgrp_get_nhops((struct nhgrp_object *)rc->rc_nh_old, &num_nhops);
+		for (uint32_t i = 0; i < num_nhops; i++) {
+			rc_new.rc_nh_old = wn[i].nh;
+			rc_new.rc_nh_weight = wn[i].weight;
+			cb(&rc_new, cbdata);
+		}
+		break;
+	case RTM_CHANGE:
+		if (!NH_IS_NHGRP(rc->rc_nh_old) && !NH_IS_NHGRP(rc->rc_nh_new))
+			return;
+		decompose_change_notification(rc, cb, cbdata);
+		break;
+	default:
+		/* Other commands carry nothing to decompose */
+		break;
+	}
+}
+#endif
Index: head/sys/net/route/route_var.h
===================================================================
--- head/sys/net/route/route_var.h
+++ head/sys/net/route/route_var.h
@@ -87,6 +87,7 @@
/* Constants */
#define RIB_MAX_RETRIES 3
#define RT_MAXFIBS UINT16_MAX
+#define RIB_MAX_MPATH_WIDTH 64
/* Macro for verifying fields in af-specific 'struct route' structures */
#define CHK_STRUCT_FIELD_GENERIC(_s1, _f1, _s2, _f2) \
@@ -113,12 +114,7 @@
"ro_dst and " #_dst_new " are at different offset")
struct rib_head *rt_tables_get_rnh(uint32_t table, sa_family_t family);
-void rt_mpath_init_rnh(struct rib_head *rnh);
int rt_getifa_fib(struct rt_addrinfo *info, u_int fibnum);
-#ifdef RADIX_MPATH
-struct radix_node *rt_mpath_unlink(struct rib_head *rnh,
- struct rt_addrinfo *info, struct rtentry *rto, int *perror);
-#endif
struct rib_cmd_info;
VNET_PCPUSTAT_DECLARE(struct rtstat, rtstat);
@@ -202,14 +198,6 @@
/* rtentry rt flag mask */
#define RTE_RT_FLAG_MASK (RTF_UP | RTF_HOST)
-/* Nexthop selection */
-#define _NH2MP(_nh) ((struct nhgrp_object *)(_nh))
-#define _SELECT_NHOP(_nh, _flowid) \
- (_NH2MP(_nh))->nhops[(_flowid) % (_NH2MP(_nh))->mp_size]
-#define _RT_SELECT_NHOP(_nh, _flowid) \
- ((!NH_IS_MULTIPATH(_nh)) ? (_nh) : _SELECT_NHOP(_nh, _flowid))
-#define RT_SELECT_NHOP(_rt, _flowid) _RT_SELECT_NHOP((_rt)->rt_nhop, _flowid)
-
/* route_temporal.c */
void tmproutes_update(struct rib_head *rnh, struct rtentry *rt);
void tmproutes_init(struct rib_head *rh);
@@ -217,14 +205,24 @@
/* route_ctl.c */
struct route_nhop_data {
- struct nhop_object *rnd_nhop;
- uint32_t rnd_weight;
+ union {
+ struct nhop_object *rnd_nhop;
+ struct nhgrp_object *rnd_nhgrp;
+ };
+ uint32_t rnd_weight;
};
+
+int change_route_nhop(struct rib_head *rnh, struct rtentry *rt,
+ struct rt_addrinfo *info, struct route_nhop_data *rnd,
+ struct rib_cmd_info *rc);
int change_route_conditional(struct rib_head *rnh, struct rtentry *rt,
struct rt_addrinfo *info, struct route_nhop_data *nhd_orig,
struct route_nhop_data *nhd_new, struct rib_cmd_info *rc);
struct rtentry *lookup_prefix(struct rib_head *rnh,
const struct rt_addrinfo *info, struct route_nhop_data *rnd);
+
+bool nhop_can_multipath(const struct nhop_object *nh);
+bool match_nhop_gw(const struct nhop_object *nh, const struct sockaddr *gw);
int check_info_match_nhop(const struct rt_addrinfo *info,
const struct rtentry *rt, const struct nhop_object *nh);
int can_override_nhop(const struct rt_addrinfo *info,
@@ -256,5 +254,57 @@
void nhops_update_ifmtu(struct rib_head *rh, struct ifnet *ifp, uint32_t mtu);
int nhops_dump_sysctl(struct rib_head *rh, struct sysctl_req *w);
+/* MULTIPATH */
+#define MPF_MULTIPATH 0x08 /* need to be consistent with NHF_MULTIPATH */
+
+struct nhgrp_object {
+ uint16_t nhg_flags; /* nexthop group flags */
+ uint8_t nhg_size; /* dataplane group size; nhop_select() uses it as modulus */
+ uint8_t spare;
+ struct nhop_object *nhops[0]; /* nexthops, picked by flowid % nhg_size */
+};
+
+/* Returns @nh itself, or the group slot picked by @flowid if @nh is a group. */
+static inline struct nhop_object *
+nhop_select(struct nhop_object *nh, uint32_t flowid)
+{
+#ifdef ROUTE_MPATH
+	struct nhgrp_object *grp = (struct nhgrp_object *)nh;
+
+	if (NH_IS_NHGRP(nh))
+		return (grp->nhops[flowid % grp->nhg_size]);
+#endif
+	return (nh);
+}
+
+
+struct weightened_nhop;
+
+/* mpath_ctl.c */
+int add_route_mpath(struct rib_head *rnh, struct rt_addrinfo *info,
+ struct rtentry *rt, struct route_nhop_data *rnd_add,
+ struct route_nhop_data *rnd_orig, struct rib_cmd_info *rc);
+int del_route_mpath(struct rib_head *rh, struct rt_addrinfo *info,
+ struct rtentry *rt, struct nhgrp_object *nhg, struct rib_cmd_info *rc);
+
+/* nhgrp.c */
+int nhgrp_ctl_init(struct nh_control *ctl);
+void nhgrp_ctl_free(struct nh_control *ctl);
+void nhgrp_ctl_unlink_all(struct nh_control *ctl);
+
+
+/* nhgrp_ctl.c */
+int nhgrp_dump_sysctl(struct rib_head *rh, struct sysctl_req *w);
+
+int nhgrp_get_group(struct rib_head *rh, struct weightened_nhop *wn,
+ int num_nhops, struct route_nhop_data *rnd);
+typedef bool nhgrp_filter_cb_t(const struct nhop_object *nh, void *data);
+int nhgrp_get_filtered_group(struct rib_head *rh, const struct nhgrp_object *src,
+ nhgrp_filter_cb_t flt_func, void *flt_data, struct route_nhop_data *rnd);
+int nhgrp_get_addition_group(struct rib_head *rnh,
+ struct route_nhop_data *rnd_orig, struct route_nhop_data *rnd_add,
+ struct route_nhop_data *rnd_new);
+
+void nhgrp_free(struct nhgrp_object *nhg);
#endif
Index: head/sys/net/rtsock.c
===================================================================
--- head/sys/net/rtsock.c
+++ head/sys/net/rtsock.c
@@ -32,7 +32,7 @@
* $FreeBSD$
*/
#include "opt_ddb.h"
-#include "opt_mpath.h"
+#include "opt_route.h"
#include "opt_inet.h"
#include "opt_inet6.h"
@@ -158,8 +158,7 @@
#define RTSOCK_UNLOCK() mtx_unlock(&rtsock_mtx)
#define RTSOCK_LOCK_ASSERT() mtx_assert(&rtsock_mtx, MA_OWNED)
-static SYSCTL_NODE(_net, OID_AUTO, route, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
- "");
+SYSCTL_NODE(_net, OID_AUTO, route, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
struct walkarg {
int w_tmemsize;
@@ -650,6 +649,25 @@
return (0);
}
+/* Returns @nh, or for a group the member matching @gw (first if @gw is NULL). */
+static struct nhop_object *
+select_nhop(struct nhop_object *nh, const struct sockaddr *gw)
+{
+	if (!NH_IS_NHGRP(nh))
+		return (nh);
+#ifdef ROUTE_MPATH
+	struct weightened_nhop *wn;
+	uint32_t num_nhops;
+	wn = nhgrp_get_nhops((struct nhgrp_object *)nh, &num_nhops);
+	if (gw == NULL)
+		return (wn[0].nh);
+	for (uint32_t i = 0; i < num_nhops; i++)	/* unsigned, like num_nhops */
+		if (match_nhop_gw(wn[i].nh, gw))
+			return (wn[i].nh);
+#endif
+	return (NULL);	/* no member matched, or built without ROUTE_MPATH */
+}
+
/*
* Handles RTM_GET message from routing socket, returning matching rt.
*
@@ -663,6 +681,7 @@
{
RIB_RLOCK_TRACKER;
struct rib_head *rnh;
+ struct nhop_object *nh;
sa_family_t saf;
saf = info->rti_info[RTAX_DST]->sa_family;
@@ -690,21 +709,12 @@
RIB_RUNLOCK(rnh);
return (ESRCH);
}
-#ifdef RADIX_MPATH
- /*
- * for RTM_GET, gate is optional even with multipath.
- * if gate == NULL the first match is returned.
- * (no need to call rt_mpath_matchgate if gate == NULL)
- */
- if (rt_mpath_capable(rnh) && info->rti_info[RTAX_GATEWAY]) {
- rc->rc_rt = rt_mpath_matchgate(rc->rc_rt,
- info->rti_info[RTAX_GATEWAY]);
- if (rc->rc_rt == NULL) {
- RIB_RUNLOCK(rnh);
- return (ESRCH);
- }
+
+ nh = select_nhop(rc->rc_rt->rt_nhop, info->rti_info[RTAX_GATEWAY]);
+ if (nh == NULL) {
+ RIB_RUNLOCK(rnh);
+ return (ESRCH);
}
-#endif
/*
* If performing proxied L2 entry insertion, and
* the actual PPP host entry is found, perform
@@ -740,8 +750,13 @@
RIB_RUNLOCK(rnh);
return (ESRCH);
}
+ nh = select_nhop(rc->rc_rt->rt_nhop, info->rti_info[RTAX_GATEWAY]);
+ if (nh == NULL) {
+ RIB_RUNLOCK(rnh);
+ return (ESRCH);
+ }
}
- rc->rc_nh_new = rc->rc_rt->rt_nhop;
+ rc->rc_nh_new = nh;
rc->rc_nh_weight = rc->rc_rt->rt_weight;
RIB_RUNLOCK(rnh);
@@ -832,6 +847,24 @@
return (0);
}
+/* Copies @rc into the rib_cmd_info at @_cbdata if it is an RTM_DELETE. */
+static void
+save_del_notification(struct rib_cmd_info *rc, void *_cbdata)
+{
+	if (rc->rc_cmd != RTM_DELETE)
+		return;
+	*(struct rib_cmd_info *)_cbdata = *rc;
+}
+
+/* Copies @rc into the rib_cmd_info at @_cbdata if it is an RTM_ADD. */
+static void
+save_add_notification(struct rib_cmd_info *rc, void *_cbdata)
+{
+	if (rc->rc_cmd != RTM_ADD)
+		return;
+	*(struct rib_cmd_info *)_cbdata = *rc;
+}
+
/*ARGSUSED*/
static int
route_output(struct mbuf *m, struct socket *so, ...)
@@ -919,6 +952,15 @@
#ifdef INET6
rti_need_deembed = 1;
#endif
+#ifdef ROUTE_MPATH
+ if (NH_IS_NHGRP(rc.rc_nh_new) ||
+ (rc.rc_nh_old && NH_IS_NHGRP(rc.rc_nh_old))) {
+ struct rib_cmd_info rc_simple = {};
+ rib_decompose_notification(&rc,
+ save_add_notification, (void *)&rc_simple);
+ rc = rc_simple;
+ }
+#endif
nh = rc.rc_nh_new;
rtm->rtm_index = nh->nh_ifp->if_index;
}
@@ -927,6 +969,15 @@
case RTM_DELETE:
error = rib_action(fibnum, RTM_DELETE, &info, &rc);
if (error == 0) {
+#ifdef ROUTE_MPATH
+ if (NH_IS_NHGRP(rc.rc_nh_old) ||
+ (rc.rc_nh_new && NH_IS_NHGRP(rc.rc_nh_new))) {
+ struct rib_cmd_info rc_simple = {};
+ rib_decompose_notification(&rc,
+ save_del_notification, (void *)&rc_simple);
+ rc = rc_simple;
+ }
+#endif
nh = rc.rc_nh_old;
goto report;
}
@@ -1708,7 +1759,19 @@
if (!can_export_rte(w->w_req->td->td_ucred, rt))
return (0);
nh = rt->rt_nhop;
- error = sysctl_dumpnhop(rt, nh, rt->rt_weight, w);
+#ifdef ROUTE_MPATH
+ if (NH_IS_NHGRP(nh)) {
+ struct weightened_nhop *wn;
+ uint32_t num_nhops;
+ wn = nhgrp_get_nhops((struct nhgrp_object *)nh, &num_nhops);
+ for (int i = 0; i < num_nhops; i++) {
+ error = sysctl_dumpnhop(rt, wn[i].nh, wn[i].weight, w);
+ if (error != 0)
+ return (error);
+ }
+ } else
+#endif
+ error = sysctl_dumpnhop(rt, nh, rt->rt_weight, w);
return (0);
}
@@ -1748,6 +1811,7 @@
rtm->rtm_flags = rt->rte_flags;
rtm->rtm_flags |= nhop_get_rtflags(nh);
rt_getmetrics(rt, nh, &rtm->rtm_rmx);
+ rtm->rtm_rmx.rmx_weight = weight;
rtm->rtm_index = nh->nh_ifp->if_index;
rtm->rtm_addrs = info.rti_addrs;
error = SYSCTL_OUT(w->w_req, (caddr_t)rtm, size);
@@ -2028,7 +2092,7 @@
namelen--;
if (req->newptr)
return (EPERM);
- if (name[1] == NET_RT_DUMP || name[1] == NET_RT_NHOP) {
+ if (name[1] == NET_RT_DUMP || name[1] == NET_RT_NHOP || name[1] == NET_RT_NHGRP) {
if (namelen == 3)
fib = req->td->td_proc->p_fibnum;
else if (namelen == 4)
@@ -2096,6 +2160,7 @@
}
break;
case NET_RT_NHOP:
+ case NET_RT_NHGRP:
/* Allow dumping one specific af/fib at a time */
if (namelen < 4) {
error = EINVAL;
@@ -2113,6 +2178,12 @@
}
if (w.w_op == NET_RT_NHOP)
error = nhops_dump_sysctl(rnh, w.w_req);
+ else
+#ifdef ROUTE_MPATH
+ error = nhgrp_dump_sysctl(rnh, w.w_req);
+#else
+ error = ENOTSUP;
+#endif
break;
case NET_RT_IFLIST:
case NET_RT_IFLISTL:
Index: head/sys/netinet/in.c
===================================================================
--- head/sys/netinet/in.c
+++ head/sys/netinet/in.c
@@ -35,8 +35,6 @@
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
-#include "opt_mpath.h"
-
#include <sys/param.h>
#include <sys/eventhandler.h>
#include <sys/systm.h>
@@ -699,14 +697,6 @@
* interface address, we are done here.
*/
if (ia->ia_flags & IFA_ROUTE) {
-#ifdef RADIX_MPATH
- if (ia->ia_addr.sin_addr.s_addr ==
- target->ia_addr.sin_addr.s_addr) {
- IN_IFADDR_RUNLOCK(&in_ifa_tracker);
- return (EEXIST);
- } else
- break;
-#endif
if (V_nosameprefix) {
IN_IFADDR_RUNLOCK(&in_ifa_tracker);
return (EEXIST);
Index: head/sys/netinet/in_fib.c
===================================================================
--- head/sys/netinet/in_fib.c
+++ head/sys/netinet/in_fib.c
@@ -32,7 +32,6 @@
#include "opt_inet.h"
#include "opt_route.h"
-#include "opt_mpath.h"
#include <sys/param.h>
#include <sys/systm.h>
@@ -48,14 +47,11 @@
#include <net/if_var.h>
#include <net/if_dl.h>
#include <net/route.h>
+#include <net/route/route_ctl.h>
#include <net/route/route_var.h>
#include <net/route/nhop.h>
#include <net/vnet.h>
-#ifdef RADIX_MPATH
-#include <net/radix_mpath.h>
-#endif
-
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/in_fib.h>
@@ -80,7 +76,6 @@
RIB_RLOCK_TRACKER;
struct rib_head *rh;
struct radix_node *rn;
- struct rtentry *rt;
struct nhop_object *nh;
KASSERT((fibnum < rt_numfibs), ("fib4_lookup: bad fibnum"));
@@ -99,12 +94,7 @@
RIB_RLOCK(rh);
rn = rh->rnh_matchaddr((void *)&sin4, &rh->head);
if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) {
- rt = RNTORT(rn);
-#ifdef RADIX_MPATH
- if (rt_mpath_next(rt) != NULL)
- rt = rt_mpath_selectrte(rt, flowid);
-#endif
- nh = rt->rt_nhop;
+ nh = nhop_select((RNTORT(rn))->rt_nhop, flowid);
/* Ensure route & ifp is UP */
if (RT_LINK_IS_UP(nh->nh_ifp)) {
if (flags & NHR_REF)
@@ -120,7 +110,7 @@
}
inline static int
-check_urpf(const struct nhop_object *nh, uint32_t flags,
+check_urpf_nhop(const struct nhop_object *nh, uint32_t flags,
const struct ifnet *src_if)
{
@@ -137,21 +127,24 @@
return (0);
}
-#ifdef RADIX_MPATH
-inline static int
-check_urpf_mpath(struct rtentry *rt, uint32_t flags,
+static int
+check_urpf(struct nhop_object *nh, uint32_t flags,
const struct ifnet *src_if)
{
-
- while (rt != NULL) {
- if (check_urpf(rt->rt_nhop, flags, src_if) != 0)
- return (1);
- rt = rt_mpath_next(rt);
- }
-
- return (0);
-}
+#ifdef ROUTE_MPATH
+ if (NH_IS_NHGRP(nh)) {
+ struct weightened_nhop *wn;
+ uint32_t num_nhops;
+ wn = nhgrp_get_nhops((struct nhgrp_object *)nh, &num_nhops);
+ for (int i = 0; i < num_nhops; i++) {
+ if (check_urpf_nhop(wn[i].nh, flags, src_if) != 0)
+ return (1);
+ }
+ return (0);
+ } else
#endif
+ return (check_urpf_nhop(nh, flags, src_if));
+}
/*
* Performs reverse path forwarding lookup.
@@ -169,7 +162,6 @@
RIB_RLOCK_TRACKER;
struct rib_head *rh;
struct radix_node *rn;
- struct rtentry *rt;
int ret;
KASSERT((fibnum < rt_numfibs), ("fib4_check_urpf: bad fibnum"));
@@ -186,12 +178,7 @@
RIB_RLOCK(rh);
rn = rh->rnh_matchaddr((void *)&sin4, &rh->head);
if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) {
- rt = RNTORT(rn);
-#ifdef RADIX_MPATH
- ret = check_urpf_mpath(rt, flags, src_if);
-#else
- ret = check_urpf(rt->rt_nhop, flags, src_if);
-#endif
+ ret = check_urpf(RNTORT(rn)->rt_nhop, flags, src_if);
RIB_RUNLOCK(rh);
return (ret);
}
@@ -206,7 +193,6 @@
{
struct rib_head *rh;
struct radix_node *rn;
- struct rtentry *rt;
struct nhop_object *nh;
KASSERT((fibnum < rt_numfibs), ("fib4_lookup_debugnet: bad fibnum"));
@@ -225,12 +211,7 @@
/* unlocked lookup */
rn = rh->rnh_matchaddr((void *)&sin4, &rh->head);
if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) {
- rt = RNTORT(rn);
-#ifdef RADIX_MPATH
- if (rt_mpath_next(rt) != NULL)
- rt = rt_mpath_selectrte(rt, 0);
-#endif
- nh = rt->rt_nhop;
+ nh = nhop_select((RNTORT(rn))->rt_nhop, 0);
/* Ensure route & ifp is UP */
if (RT_LINK_IS_UP(nh->nh_ifp)) {
if (flags & NHR_REF)
Index: head/sys/netinet/in_rmx.c
===================================================================
--- head/sys/netinet/in_rmx.c
+++ head/sys/netinet/in_rmx.c
@@ -30,8 +30,6 @@
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
-#include "opt_mpath.h"
-
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
@@ -127,9 +125,6 @@
return (NULL);
rh->rnh_preadd = rib4_preadd;
-#ifdef RADIX_MPATH
- rt_mpath_init_rnh(rh);
-#endif
return (rh);
}
Index: head/sys/netinet/ip_output.c
===================================================================
--- head/sys/netinet/ip_output.c
+++ head/sys/netinet/ip_output.c
@@ -38,7 +38,6 @@
#include "opt_ipsec.h"
#include "opt_kern_tls.h"
#include "opt_mbuf_stress_test.h"
-#include "opt_mpath.h"
#include "opt_ratelimit.h"
#include "opt_route.h"
#include "opt_rss.h"
@@ -470,11 +469,7 @@
* for correct operation (as it is for ARP).
*/
uint32_t flowid;
-#ifdef RADIX_MPATH
- flowid = ntohl(ip->ip_src.s_addr ^ ip->ip_dst.s_addr);
-#else
flowid = m->m_pkthdr.flowid;
-#endif
ro->ro_nh = fib4_lookup(fibnum, dst->sin_addr, 0,
NHR_REF, flowid);
Index: head/sys/netinet6/in6_fib.c
===================================================================
--- head/sys/netinet6/in6_fib.c
+++ head/sys/netinet6/in6_fib.c
@@ -33,7 +33,6 @@
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_route.h"
-#include "opt_mpath.h"
#include <sys/param.h>
#include <sys/systm.h>
@@ -49,14 +48,11 @@
#include <net/if_var.h>
#include <net/if_dl.h>
#include <net/route.h>
+#include <net/route/route_ctl.h>
#include <net/route/route_var.h>
#include <net/route/nhop.h>
#include <net/vnet.h>
-#ifdef RADIX_MPATH
-#include <net/radix_mpath.h>
-#endif
-
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/ip_mroute.h>
@@ -88,7 +84,6 @@
RIB_RLOCK_TRACKER;
struct rib_head *rh;
struct radix_node *rn;
- struct rtentry *rt;
struct nhop_object *nh;
struct sockaddr_in6 sin6;
@@ -111,12 +106,7 @@
RIB_RLOCK(rh);
rn = rh->rnh_matchaddr((void *)&sin6, &rh->head);
if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) {
- rt = RNTORT(rn);
-#ifdef RADIX_MPATH
- if (rt_mpath_next(rt) != NULL)
- rt = rt_mpath_selectrte(rt, flowid);
-#endif
- nh = rt->rt_nhop;
+ nh = nhop_select((RNTORT(rn))->rt_nhop, flowid);
/* Ensure route & ifp is UP */
if (RT_LINK_IS_UP(nh->nh_ifp)) {
if (flags & NHR_REF)
@@ -132,7 +122,7 @@
}
inline static int
-check_urpf(const struct nhop_object *nh, uint32_t flags,
+check_urpf_nhop(const struct nhop_object *nh, uint32_t flags,
const struct ifnet *src_if)
{
@@ -149,21 +139,24 @@
return (0);
}
-#ifdef RADIX_MPATH
-inline static int
-check_urpf_mpath(struct rtentry *rt, uint32_t flags,
+static int
+check_urpf(struct nhop_object *nh, uint32_t flags,
const struct ifnet *src_if)
{
-
- while (rt != NULL) {
- if (check_urpf(rt->rt_nhop, flags, src_if) != 0)
- return (1);
- rt = rt_mpath_next(rt);
- }
-
- return (0);
-}
+#ifdef ROUTE_MPATH
+ if (NH_IS_NHGRP(nh)) {
+ struct weightened_nhop *wn;
+ uint32_t num_nhops;
+ wn = nhgrp_get_nhops((struct nhgrp_object *)nh, &num_nhops);
+ for (int i = 0; i < num_nhops; i++) {
+ if (check_urpf_nhop(wn[i].nh, flags, src_if) != 0)
+ return (1);
+ }
+ return (0);
+ } else
#endif
+ return (check_urpf_nhop(nh, flags, src_if));
+}
/*
* Performs reverse path forwarding lookup.
@@ -181,7 +174,6 @@
RIB_RLOCK_TRACKER;
struct rib_head *rh;
struct radix_node *rn;
- struct rtentry *rt;
struct sockaddr_in6 sin6;
int ret;
@@ -203,12 +195,7 @@
RIB_RLOCK(rh);
rn = rh->rnh_matchaddr((void *)&sin6, &rh->head);
if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) {
- rt = RNTORT(rn);
-#ifdef RADIX_MPATH
- ret = check_urpf_mpath(rt, flags, src_if);
-#else
- ret = check_urpf(rt->rt_nhop, flags, src_if);
-#endif
+ ret = check_urpf(RNTORT(rn)->rt_nhop, flags, src_if);
RIB_RUNLOCK(rh);
return (ret);
}
@@ -223,7 +210,6 @@
{
struct rib_head *rh;
struct radix_node *rn;
- struct rtentry *rt;
struct nhop_object *nh;
struct sockaddr_in6 sin6;
@@ -245,8 +231,7 @@
rn = rh->rnh_matchaddr((void *)&sin6, &rh->head);
if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) {
- rt = RNTORT(rn);
- nh = rt->rt_nhop;
+ nh = nhop_select((RNTORT(rn))->rt_nhop, 0);
/* Ensure route & ifp is UP */
if (RT_LINK_IS_UP(nh->nh_ifp)) {
if (flags & NHR_REF)
Index: head/sys/netinet6/in6_rmx.c
===================================================================
--- head/sys/netinet6/in6_rmx.c
+++ head/sys/netinet6/in6_rmx.c
@@ -64,8 +64,6 @@
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
-#include "opt_mpath.h"
-
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
@@ -153,9 +151,6 @@
return (NULL);
rh->rnh_preadd = rib6_preadd;
-#ifdef RADIX_MPATH
- rt_mpath_init_rnh(rh);
-#endif
rs = rib_subscribe_internal(rh, nd6_subscription_cb, NULL,
RIB_NOTIFY_IMMEDIATE, true);
Index: head/sys/netinet6/nd6.c
===================================================================
--- head/sys/netinet6/nd6.c
+++ head/sys/netinet6/nd6.c
@@ -36,6 +36,7 @@
#include "opt_inet.h"
#include "opt_inet6.h"
+#include "opt_route.h"
#include <sys/param.h>
#include <sys/systm.h>
@@ -1591,7 +1592,11 @@
nd6_subscription_cb(struct rib_head *rnh, struct rib_cmd_info *rc, void *arg)
{
+#ifdef ROUTE_MPATH
+ rib_decompose_notification(rc, check_release_defrouter, NULL);
+#else
check_release_defrouter(rc, NULL);
+#endif
}
int
Index: head/sys/sys/socket.h
===================================================================
--- head/sys/sys/socket.h
+++ head/sys/sys/socket.h
@@ -417,6 +417,7 @@
#define NET_RT_IFLISTL 5 /* Survey interface list, using 'l'en
* versions of msghdr structs. */
#define NET_RT_NHOP 6 /* dump routing nexthops */
+#define NET_RT_NHGRP 7 /* dump routing nexthop groups */
#endif /* __BSD_VISIBLE */
/*
Index: head/usr.bin/netstat/Makefile
===================================================================
--- head/usr.bin/netstat/Makefile
+++ head/usr.bin/netstat/Makefile
@@ -5,7 +5,7 @@
PROG= netstat
SRCS= if.c inet.c main.c mbuf.c mroute.c netisr.c nl_symbols.c route.c \
- unix.c mroute6.c ipsec.c bpf.c pfkey.c sctp.c common.c nhops.c \
+ unix.c mroute6.c ipsec.c bpf.c pfkey.c sctp.c common.c nhops.c nhgrp.c \
nl_defs.h
nl_symbols.c: nlist_symbols
Index: head/usr.bin/netstat/common.h
===================================================================
--- head/usr.bin/netstat/common.h
+++ head/usr.bin/netstat/common.h
@@ -54,5 +54,22 @@
struct ifmap_entry *prepare_ifmap(size_t *ifmap_size);
+struct rt_msghdr;
+struct nhops_map {
+ uint32_t idx;
+ struct rt_msghdr *rtm;
+};
+
+struct nhops_dump {
+ void *nh_buf;
+ struct nhops_map *nh_map;
+ size_t nh_count;
+};
+
+void dump_nhops_sysctl(int fibnum, int af, struct nhops_dump *nd);
+struct nhop_map;
+void nhop_map_update(struct nhop_map *map, uint32_t idx, char *gw, char *ifname);
+
+
#endif
Index: head/usr.bin/netstat/main.c
===================================================================
--- head/usr.bin/netstat/main.c
+++ head/usr.bin/netstat/main.c
@@ -215,6 +215,7 @@
int noutputs = 0; /* how much outputs before we exit */
int numeric_addr; /* show addresses numerically */
int numeric_port; /* show ports numerically */
+int Oflag; /* show nhgrp objects*/
int oflag; /* show nexthop objects*/
int Pflag; /* show TCP log ID */
static int pflag; /* show given protocol */
@@ -250,7 +251,7 @@
if (argc < 0)
exit(EXIT_FAILURE);
- while ((ch = getopt(argc, argv, "46AaBbCdF:f:ghI:iLlM:mN:noPp:Qq:RrSTsuWw:xz"))
+ while ((ch = getopt(argc, argv, "46AaBbCdF:f:ghI:iLlM:mN:nOoPp:Qq:RrSTsuWw:xz"))
!= -1)
switch(ch) {
case '4':
@@ -353,6 +354,9 @@
case 'o':
oflag = 1;
break;
+ case 'O':
+ Oflag = 1;
+ break;
case 'P':
Pflag = 1;
break;
@@ -509,6 +513,14 @@
xo_finish();
exit(0);
}
+ if (Oflag) {
+ xo_open_container("statistics");
+ nhgrp_print(fib, af);
+ xo_close_container("statistics");
+ xo_finish();
+ exit(0);
+ }
+
if (gflag) {
Index: head/usr.bin/netstat/netstat.h
===================================================================
--- head/usr.bin/netstat/netstat.h
+++ head/usr.bin/netstat/netstat.h
@@ -163,3 +163,4 @@
void mrt_stats(void);
void bpf_stats(char *);
void nhops_print(int fibnum, int af);
+void nhgrp_print(int fibnum, int af);
Index: head/usr.bin/netstat/nhgrp.c
===================================================================
--- head/usr.bin/netstat/nhgrp.c
+++ head/usr.bin/netstat/nhgrp.c
@@ -0,0 +1,355 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2020 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+#include <sys/param.h>
+#include <sys/socket.h>
+#include <sys/sysctl.h>
+
+#include <net/if.h>
+#include <net/if_dl.h>
+#include <net/if_types.h>
+#include <net/route.h>
+#include <net/route/nhop.h>
+
+#include <netinet/in.h>
+
+#include <arpa/inet.h>
+#include <libutil.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+#include <sysexits.h>
+#include <unistd.h>
+#include <err.h>
+#include <libxo/xo.h>
+#include "netstat.h"
+#include "common.h"
+
+#define WID_GW_DEFAULT(af) (((af) == AF_INET6) ? 40 : 18)
+
+static int wid_gw;
+static int wid_if = 10;
+static int wid_nhidx = 8;
+static int wid_refcnt = 8;
+
+struct nhop_entry {
+ char gw[64];
+ char ifname[IFNAMSIZ];
+};
+
+struct nhop_map {
+ struct nhop_entry *ptr;
+ size_t size;
+};
+static struct nhop_map global_nhop_map;
+
+static struct ifmap_entry *ifmap;
+static size_t ifmap_size;
+
+/* Returns the entry for @idx, or NULL if out of range or its ifname is empty. */
+static struct nhop_entry *
+nhop_get(struct nhop_map *map, uint32_t idx)
+{
+	struct nhop_entry *entry = (idx < map->size) ? &map->ptr[idx] : NULL;
+
+	if (entry == NULL || entry->ifname[0] == '\0')
+		return (NULL);
+	return (entry);
+}
+
+static void
+print_nhgroup_header(int af1 __unused)
+{
+ /* Emit the table's column titles, sized by the file-level wid_* widths. */
+ xo_emit("{T:/%-*.*s}{T:/%-*.*s}{T:/%*.*s}{T:/%*.*s}{T:/%*.*s}"
+ "{T:/%*.*s}{T:/%*s}\n",
+ wid_nhidx, wid_nhidx, "GrpIdx",
+ wid_nhidx, wid_nhidx, "NhIdx",
+ wid_nhidx, wid_nhidx, "Weight",
+ wid_nhidx, wid_nhidx, "Slots",
+ wid_gw, wid_gw, "Gateway",
+ wid_if, wid_if, "Netif",
+ wid_refcnt, "Refcnt");
+}
+
+/* Emits a "{P:...}" field: for len >= 1, a space plus len-1 copies of @sym. */
+static void
+print_padding(char sym, int len)
+{
+	char buffer[56];
+
+	/* Clamp len so the closing brace and NUL always land inside buffer. */
+	len = MAX(0, MIN(len, (int)sizeof(buffer) - 5));
+	memset(buffer, sym, sizeof(buffer));
+	memcpy(buffer, "{P: ", 4);
+	buffer[len + 3] = '}';
+	buffer[len + 4] = '\0';
+	xo_emit(buffer);
+}
+
+
+/*
+ * Prints one nexthop group: a summary row (index, refcount), then one row per
+ * CNHOPS entry. Returns without output if a container after @nhge is unexpected.
+ */
+static void
+print_nhgroup_entry_sysctl(const char *name, struct rt_msghdr *rtm,
+    struct nhgrp_external *nhge)
+{
+	char buffer[128];
+	struct nhop_entry *ne;
+	struct nhgrp_nhop_external *ext_cp, *ext_dp;
+	struct nhgrp_container *nhg_cp, *nhg_dp;
+
+	nhg_cp = (struct nhgrp_container *)(nhge + 1);
+	if (nhg_cp->nhgc_type != NHG_C_TYPE_CNHOPS || nhg_cp->nhgc_subtype != 0)
+		return;
+	ext_cp = (struct nhgrp_nhop_external *)(nhg_cp + 1);
+
+	nhg_dp = (struct nhgrp_container *)((char *)nhg_cp + nhg_cp->nhgc_len);
+	if (nhg_dp->nhgc_type != NHG_C_TYPE_DNHOPS || nhg_dp->nhgc_subtype != 0)
+		return;
+	ext_dp = (struct nhgrp_nhop_external *)(nhg_dp + 1);
+
+	xo_open_instance(name);
+	snprintf(buffer, sizeof(buffer), "{[:-%d}{:nhgrp-index/%%lu}{]:} ", wid_nhidx);
+	/* Cast so each argument really is the unsigned long that "%lu" reads. */
+	xo_emit(buffer, (unsigned long)nhge->nhg_idx);
+	/* Dash-fill the nhidx, weight, slots, gateway and netif columns. */
+	print_padding('-', wid_nhidx);
+	print_padding('-', wid_nhidx);
+	print_padding('-', wid_nhidx);
+	print_padding('-', wid_gw);
+	print_padding('-', wid_if);
+	xo_emit("{t:nhg-refcnt/%*lu}", wid_refcnt, (unsigned long)nhge->nhg_refcount);
+	xo_emit("\n");
+	xo_open_list("nhop-weights");
+	for (uint32_t i = 0; i < nhg_cp->nhgc_count; i++) {
+		/* TODO: optimize slots calculations */
+		uint32_t slots = 0;
+		for (uint32_t sidx = 0; sidx < nhg_dp->nhgc_count; sidx++) {
+			if (ext_dp[sidx].nh_idx == ext_cp[i].nh_idx)
+				slots++;
+		}
+		xo_open_instance("nhop-weight");
+		print_padding(' ', wid_nhidx);
+		xo_emit("{t:nh-index/%*lu}", wid_nhidx,
+		    (unsigned long)ext_cp[i].nh_idx);
+		xo_emit("{t:nh-weight/%*lu}", wid_nhidx,
+		    (unsigned long)ext_cp[i].nh_weight);
+		xo_emit("{t:nh-slots/%*lu}", wid_nhidx, (unsigned long)slots);
+		ne = nhop_get(&global_nhop_map, ext_cp[i].nh_idx);
+		if (ne != NULL) {
+			xo_emit("{t:nh-gw/%*.*s}", wid_gw, wid_gw, ne->gw);
+			xo_emit("{t:nh-interface/%*.*s}", wid_if, wid_if, ne->ifname);
+		}
+		xo_emit("\n");
+		xo_close_instance("nhop-weight");
+	}
+	xo_close_list("nhop-weights");
+	xo_close_instance(name);
+}
+
+/*
+ * qsort(3) comparator ordering struct nhops_map entries by ascending idx.
+ * Yields 1, -1 or 0 when @_a's idx is greater than, less than or equal
+ * to @_b's idx respectively.
+ */
+static int
+cmp_nhg_idx(const void *_a, const void *_b)
+{
+	const struct nhops_map *lhs = _a;
+	const struct nhops_map *rhs = _b;
+
+	/* Difference of two 0/1 comparisons; nothing is subtracted from idx. */
+	return ((lhs->idx > rhs->idx) - (lhs->idx < rhs->idx));
+}
+
+/*
+ * Fetches the nexthop-group dump for @af in @fibnum via the NET_RT_NHGRP
+ * sysctl and fills @nd with the raw buffer plus a map of its messages sorted
+ * by group index. Exits the process on sysctl or allocation failure.
+ */
+static void
+dump_nhgrp_sysctl(int fibnum, int af, struct nhops_dump *nd)
+{
+	size_t needed;
+	int mib[7] = { CTL_NET, PF_ROUTE, 0, af, NET_RT_NHGRP, 0, fibnum };
+	char *buf, *next, *lim;
+	struct rt_msghdr *rtm;
+	struct nhgrp_external *nhg;
+	struct nhops_map *nhg_map;
+	size_t nhg_count, nhg_size;
+
+	if (sysctl(mib, nitems(mib), NULL, &needed, NULL, 0) < 0)
+		err(EX_OSERR, "sysctl: net.route.0.%d.nhgrpdump.%d estimate",
+		    af, fibnum);
+	if ((buf = malloc(needed)) == NULL)
+		errx(2, "malloc(%lu)", (unsigned long)needed);
+	if (sysctl(mib, nitems(mib), buf, &needed, NULL, 0) < 0)
+		err(1, "sysctl: net.route.0.%d.nhgrpdump.%d", af, fibnum);
+	lim = buf + needed;
+
+	/*
+	 * Nexthop groups are received unsorted. Collect everything first,
+	 * and sort prior to displaying.
+	 */
+	nhg_count = 0;
+	nhg_size = 16;
+	nhg_map = calloc(nhg_size, sizeof(*nhg_map));
+	if (nhg_map == NULL)
+		err(EX_OSERR, "calloc");
+	for (next = buf; next < lim; next += rtm->rtm_msglen) {
+		rtm = (struct rt_msghdr *)next;
+		if (rtm->rtm_version != RTM_VERSION)
+			continue;
+		if (nhg_count >= nhg_size) {
+			nhg_size *= 2;
+			nhg_map = realloc(nhg_map, nhg_size * sizeof(*nhg_map));
+			if (nhg_map == NULL)
+				err(EX_OSERR, "realloc");
+		}
+		nhg = (struct nhgrp_external *)(rtm + 1);
+		nhg_map[nhg_count].idx = nhg->nhg_idx;
+		nhg_map[nhg_count].rtm = rtm;
+		nhg_count++;
+	}
+
+	if (nhg_count > 0)
+		qsort(nhg_map, nhg_count, sizeof(struct nhops_map), cmp_nhg_idx);
+	nd->nh_buf = buf;
+	nd->nh_count = nhg_count;
+	nd->nh_map = nhg_map;
+}
+
+/* Dumps the nexthop groups of @af in @fibnum and prints them as one table. */
+static void
+print_nhgrp_sysctl(int fibnum, int af)
+{
+	struct nhops_dump nd;
+	struct rt_msghdr *rtm;
+
+	dump_nhgrp_sysctl(fibnum, af, &nd);
+	xo_open_container("nhgrp-table");
+	xo_open_list("rt-family");
+	if (nd.nh_count > 0) {
+		wid_gw = WID_GW_DEFAULT(af);
+		xo_open_instance("rt-family");
+		pr_family(af);
+		xo_open_list("nhgrp-entry");
+		print_nhgroup_header(af);
+		for (size_t i = 0; i < nd.nh_count; i++) {
+			rtm = nd.nh_map[i].rtm;
+			print_nhgroup_entry_sysctl("nhgrp-entry", rtm,
+			    (struct nhgrp_external *)(rtm + 1));
+		}
+		xo_close_list("nhgrp-entry");	/* balance the opens above */
+		xo_close_instance("rt-family");
+	}
+	xo_close_list("rt-family");
+	xo_close_container("nhgrp-table");
+	free(nd.nh_map);	/* the sorted index and the raw dump are both ours */
+	free(nd.nh_buf);
+}
+
+/*
+ * Records @nh in global_nhop_map: its interface name (empty if ifindex is
+ * outside ifmap, "---" if the ifmap name is empty) and its gateway text.
+ */
+static void
+update_global_map(struct nhop_external *nh)
+{
+	char iface_name[128] = "";
+	char gw_addr[64];
+	struct nhop_addrs *addrs;
+	struct sockaddr *gateway;
+
+	/* nhop_addrs sits nh_len bytes past @nh; the gateway is at gw_sa_off. */
+	addrs = (struct nhop_addrs *)((char *)nh + nh->nh_len);
+	gateway = (struct sockaddr *)((char *)addrs + addrs->gw_sa_off);
+	if (nh->ifindex < (uint32_t)ifmap_size) {
+		strlcpy(iface_name, ifmap[nh->ifindex].ifname,
+		    sizeof(iface_name));
+		if (iface_name[0] == '\0')
+			strlcpy(iface_name, "---", sizeof(iface_name));
+	}
+	if ((nh->nh_flags & NHF_GATEWAY) == 0)
+		snprintf(gw_addr, sizeof(gw_addr), "%s/resolve", iface_name);
+	else
+		strlcpy(gw_addr, fmt_sockaddr(gateway, NULL, RTF_HOST),
+		    sizeof(gw_addr));
+	nhop_map_update(&global_nhop_map, nh->nh_idx, gw_addr, iface_name);
+}
+
+/*
+ * Fills global_nhop_map (via update_global_map()) from the nexthop dump of
+ * @af in @fibnum; group rows later look nexthops up there by index.
+ */
+static void
+prepare_nh_map(int fibnum, int af)
+{
+	struct nhops_dump dump;
+
+	dump_nhops_sysctl(fibnum, af, &dump);
+	for (size_t pos = 0; pos < dump.nh_count; pos++) {
+		/* The nhop_external payload directly follows the header. */
+		update_global_map(
+		    (struct nhop_external *)(dump.nh_map[pos].rtm + 1));
+	}
+	free(dump.nh_buf);
+}
+
+/* Prints the nexthop groups of @af for fib @fibnum (-1: net.my_fibnum or 0). */
+void
+nhgrp_print(int fibnum, int af)
+{
+	size_t intsize = sizeof(int);
+	int numfibs;
+
+	if (fibnum == -1 &&
+	    sysctlbyname("net.my_fibnum", &fibnum, &intsize, NULL, 0) == -1)
+		fibnum = 0;
+	if (sysctlbyname("net.fibs", &numfibs, &intsize, NULL, 0) == -1)
+		numfibs = 1;
+	if (fibnum < 0 || fibnum >= numfibs)
+		errx(EX_USAGE, "%d: invalid fib", fibnum);
+
+	ifmap = prepare_ifmap(&ifmap_size);
+	prepare_nh_map(fibnum, af);
+
+	xo_open_container("route-nhgrp-information");
+	xo_emit("{T:Nexthop groups data}");
+	if (fibnum != 0)
+		xo_emit(" ({L:fib}: {:fib/%d})", fibnum);
+	xo_emit("\n");
+	print_nhgrp_sysctl(fibnum, af);
+	xo_close_container("route-nhgrp-information");
+}
+
Index: head/usr.bin/netstat/nhops.c
===================================================================
--- head/usr.bin/netstat/nhops.c
+++ head/usr.bin/netstat/nhops.c
@@ -118,8 +118,6 @@
};
static struct nhop_map global_nhop_map;
-static void nhop_map_update(struct nhop_map *map, uint32_t idx,
- char *gw, char *ifname);
static struct nhop_entry *nhop_get(struct nhop_map *map, uint32_t idx);
@@ -204,7 +202,7 @@
}
}
-static void
+void
nhop_map_update(struct nhop_map *map, uint32_t idx, char *gw, char *ifname)
{
if (idx >= map->size) {
@@ -322,11 +320,6 @@
xo_close_instance(name);
}
-struct nhops_map {
- uint32_t idx;
- struct rt_msghdr *rtm;
-};
-
static int
cmp_nh_idx(const void *_a, const void *_b)
{
@@ -342,15 +335,14 @@
return (0);
}
-static void
-print_nhops_sysctl(int fibnum, int af)
+void
+dump_nhops_sysctl(int fibnum, int af, struct nhops_dump *nd)
{
size_t needed;
int mib[7];
char *buf, *next, *lim;
struct rt_msghdr *rtm;
struct nhop_external *nh;
- int fam;
struct nhops_map *nh_map;
size_t nh_count, nh_size;
@@ -369,8 +361,6 @@
if (sysctl(mib, nitems(mib), buf, &needed, NULL, 0) < 0)
err(1, "sysctl: net.route.0.%d.nhdump.%d", af, fibnum);
lim = buf + needed;
- xo_open_container("nhop-table");
- xo_open_list("rt-family");
/*
* nexhops are received unsorted. Collect everything first, sort and then display
@@ -395,9 +385,27 @@
nh_count++;
}
- if (nh_count > 0) {
+ if (nh_count > 0)
qsort(nh_map, nh_count, sizeof(struct nhops_map), cmp_nh_idx);
- nh = (struct nhop_external *)(nh_map[0].rtm + 1);
+ nd->nh_buf = buf;
+ nd->nh_count = nh_count;
+ nd->nh_map = nh_map;
+}
+
+static void
+print_nhops_sysctl(int fibnum, int af)
+{
+ struct nhops_dump nd;
+ struct nhop_external *nh;
+ int fam;
+ struct rt_msghdr *rtm;
+
+ dump_nhops_sysctl(fibnum, af, &nd);
+
+ xo_open_container("nhop-table");
+ xo_open_list("rt-family");
+ if (nd.nh_count > 0) {
+ nh = (struct nhop_external *)(nd.nh_map[0].rtm + 1);
fam = nh->nh_family;
wid_dst = WID_GW_DEFAULT(fam);
@@ -415,8 +423,8 @@
print_nhop_header(fam);
- for (size_t i = 0; i < nh_count; i++) {
- rtm = nh_map[i].rtm;
+ for (size_t i = 0; i < nd.nh_count; i++) {
+ rtm = nd.nh_map[i].rtm;
nh = (struct nhop_external *)(rtm + 1);
print_nhop_entry_sysctl("nh-entry", rtm, nh);
}
@@ -426,7 +434,7 @@
}
xo_close_list("rt-family");
xo_close_container("nhop-table");
- free(buf);
+ free(nd.nh_buf);
}
static void
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Wed, Jan 22, 6:00 AM (1 h, 23 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
16025780
Default Alt Text
D26449.diff (99 KB)
Attached To
Mode
D26449: Stage 2: Introduce scalable route multipath
Attached
Detach File
Event Timeline
Log In to Comment