Page MenuHomeFreeBSD

D26449.id77161.diff
No OneTemporary

D26449.id77161.diff

Index: sys/conf/NOTES
===================================================================
--- sys/conf/NOTES
+++ sys/conf/NOTES
@@ -1002,7 +1002,7 @@
#
# TCP_HHOOK enables the hhook(9) framework hooks for the TCP stack.
#
-# RADIX_MPATH provides support for equal-cost multi-path routing.
+# ROUTE_MPATH provides support for multipath routing.
#
options MROUTING # Multicast routing
options IPFIREWALL #firewall
Index: sys/conf/files
===================================================================
--- sys/conf/files
+++ sys/conf/files
@@ -4140,10 +4140,12 @@
net/debugnet_inet.c optional inet debugnet
net/pfil.c optional ether | inet
net/radix.c standard
-net/radix_mpath.c standard
net/raw_cb.c standard
net/raw_usrreq.c standard
net/route.c standard
+net/route/mpath_ctl.c optional route_mpath
+net/route/nhgrp.c optional route_mpath
+net/route/nhgrp_ctl.c optional route_mpath
net/route/nhop.c standard
net/route/nhop_ctl.c standard
net/route/nhop_utils.c standard
@@ -4156,7 +4158,7 @@
net/rtsock.c standard
net/slcompress.c optional netgraph_vjc | sppp | \
netgraph_sppp
-net/toeplitz.c optional inet rss | inet6 rss
+net/toeplitz.c optional inet rss | inet6 rss | route_mpath
net/vnet.c optional vimage
net80211/ieee80211.c optional wlan
net80211/ieee80211_acl.c optional wlan wlan_acl
Index: sys/conf/options
===================================================================
--- sys/conf/options
+++ sys/conf/options
@@ -454,6 +454,7 @@
PCBGROUP opt_pcbgroup.h
PF_DEFAULT_TO_DROP opt_pf.h
RADIX_MPATH opt_mpath.h
+ROUTE_MPATH opt_route.h
ROUTETABLES opt_route.h
RSS opt_rss.h
SLIP_IFF_OPTS opt_slip.h
Index: sys/net/radix.c
===================================================================
--- sys/net/radix.c
+++ sys/net/radix.c
@@ -44,10 +44,6 @@
#include <sys/malloc.h>
#include <sys/syslog.h>
#include <net/radix.h>
-#include "opt_mpath.h"
-#ifdef RADIX_MPATH
-#include <net/radix_mpath.h>
-#endif
#else /* !_KERNEL */
#include <stdio.h>
#include <strings.h>
@@ -628,21 +624,6 @@
saved_tt = tt = rn_insert(v, head, &keyduplicated, treenodes);
if (keyduplicated) {
for (t = tt; tt; t = tt, tt = tt->rn_dupedkey) {
-#ifdef RADIX_MPATH
- /* permit multipath, if enabled for the family */
- if (rn_mpath_capable(head) && netmask == tt->rn_mask) {
- /*
- * go down to the end of multipaths, so that
- * new entry goes into the end of rn_dupedkey
- * chain.
- */
- do {
- t = tt;
- tt = tt->rn_dupedkey;
- } while (tt && t->rn_mask == tt->rn_mask);
- break;
- }
-#endif
if (tt->rn_mask == netmask)
return (0);
if (netmask == 0 ||
@@ -748,10 +729,8 @@
if (m->rm_flags & RNF_NORMAL) {
mmask = m->rm_leaf->rn_mask;
if (tt->rn_flags & RNF_NORMAL) {
-#if !defined(RADIX_MPATH)
log(LOG_ERR,
"Non-unique normal route, mask not entered\n");
-#endif
return (tt);
}
} else
Index: sys/net/route.h
===================================================================
--- sys/net/route.h
+++ sys/net/route.h
@@ -104,6 +104,10 @@
/* lle state is exported in rmx_state rt_metrics field */
#define rmx_state rmx_weight
+/* default route weight */
+#define RT_DEFAULT_WEIGHT 1
+#define RT_MAX_WEIGHT 16777215 /* 3 bytes */
+
/*
* Keep a generation count of routing table, incremented on route addition,
* so we can invalidate caches. This is accessed without a lock, as precision
@@ -121,6 +125,10 @@
#define rt_numfibs V_rt_numfibs
VNET_DECLARE(u_int, rt_add_addr_allfibs); /* Announce interfaces to all fibs */
#define V_rt_add_addr_allfibs VNET(rt_add_addr_allfibs)
+
+/* Calculate flowid for locally-originated packets */
+#define V_fib_hash_outbound VNET(fib_hash_outbound)
+VNET_DECLARE(u_int, fib_hash_outbound);
#endif
/*
@@ -174,6 +182,7 @@
*/
/* Consumer-visible nexthop info flags */
+#define NHF_MULTIPATH 0x0008 /* Nexthop is a nexthop group */
#define NHF_REJECT 0x0010 /* RTF_REJECT */
#define NHF_BLACKHOLE 0x0020 /* RTF_BLACKHOLE */
#define NHF_REDIRECT 0x0040 /* RTF_DYNAMIC|RTF_MODIFIED */
@@ -204,6 +213,10 @@
uint64_t rts_wildcard; /* lookups satisfied by a wildcard */
uint64_t rts_nh_idx_alloc_failure; /* nexthop index alloc failure*/
uint64_t rts_nh_alloc_failure; /* nexthop allocation failure*/
+ uint64_t rts_add_failure; /* route addition failure */
+ uint64_t rts_add_retry; /* route addition retry */
+ uint64_t rts_del_failure; /* route deletion failure */
+ uint64_t rts_del_retry; /* route deletion retry */
};
/*
Index: sys/net/route.c
===================================================================
--- sys/net/route.c
+++ sys/net/route.c
@@ -39,7 +39,6 @@
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_mrouting.h"
-#include "opt_mpath.h"
#include "opt_route.h"
#include <sys/param.h>
@@ -848,18 +847,6 @@
}
#endif
-void
-rt_setmetrics(const struct rt_addrinfo *info, struct rtentry *rt)
-{
-
- if (info->rti_mflags & RTV_WEIGHT)
- rt->rt_weight = info->rti_rmx->rmx_weight;
- /* Kernel -> userland timebase conversion. */
- if (info->rti_mflags & RTV_EXPIRE)
- rt->rt_expire = info->rti_rmx->rmx_expire ?
- info->rti_rmx->rmx_expire - time_second + time_uptime : 0;
-}
-
void
rt_maskedcopy(struct sockaddr *src, struct sockaddr *dst, struct sockaddr *netmask)
{
Index: sys/net/route/mpath_ctl.c
===================================================================
--- /dev/null
+++ sys/net/route/mpath_ctl.c
@@ -0,0 +1,179 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2020 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include "opt_inet.h"
+#include "opt_route.h"
+
+#include <sys/cdefs.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/lock.h>
+#include <sys/rmlock.h>
+#include <sys/rwlock.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/socket.h>
+#include <sys/sysctl.h>
+#include <sys/kernel.h>
+
+#include <net/if.h>
+#include <net/if_var.h>
+#include <net/if_dl.h>
+#include <net/route.h>
+#include <net/route/route_ctl.h>
+#include <net/route/route_var.h>
+#include <net/vnet.h>
+
+#include <netinet/in.h>
+#include <netinet/in_var.h>
+#include <netinet/in_fib.h>
+
+#include <net/route/nhop_utils.h>
+#include <net/route/nhop.h>
+#include <net/route/nhop_var.h>
+
+/*
+ * This file contains the supporting functions for adding/deleting/updating
+ * multipath routes to the routing table.
+ */
+
+/*
+ * Per-VNET flag, initially 0, exported read-only as the "hash_outbound"
+ * node under _net_route. add_route_mpath() in this file sets it to 1 once
+ * a nexthop group has been installed.
+ */
+SYSCTL_DECL(_net_route);
+VNET_DEFINE(u_int, fib_hash_outbound) = 0;
+SYSCTL_UINT(_net_route, OID_AUTO, hash_outbound, CTLFLAG_RD | CTLFLAG_VNET,
+ &VNET_NAME(fib_hash_outbound), 0,
+ "Compute flowid for locally-originated packets");
+
+/*
+ * Default entropy to add to the hash calculation for the outbound
+ * connections. The table below provides 40 initializer bytes.
+ */
+uint8_t mpath_entropy_key[MPATH_ENTROPY_KEY_LEN] = {
+ 0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
+ 0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
+ 0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
+ 0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
+ 0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
+};
+
+
+/*
+ * Tries to add @rnd_add nhop to the existing set of nhops (@rnd_orig) for the
+ * prefix specified by @rt.
+ *
+ * Returns 0 and consumes rt / rnd_add nhop references. @rc gets populated
+ * with the operation result.
+ * Otherwise errno is returned.
+ *
+ * Caller responsibility is to unlock/free rt and
+ * rt->rt_nhop.
+ *
+ * The build-group / conditional-change cycle is attempted at most
+ * RIB_MAX_RETRIES times; only EAGAIN results trigger another attempt.
+ */
+int
+add_route_mpath(struct rib_head *rnh, struct rt_addrinfo *info,
+ struct rtentry *rt, struct route_nhop_data *rnd_add,
+ struct route_nhop_data *rnd_orig, struct rib_cmd_info *rc)
+{
+ RIB_RLOCK_TRACKER;
+ struct route_nhop_data rnd_new;
+ int error = 0;
+
+ /*
+ * It is possible that multiple rtsock speakers will try to update
+ * the same route simultaneously. Reduce the chance of failing the
+ * request by retrying the cycle multiple times.
+ */
+ for (int i = 0; i < RIB_MAX_RETRIES; i++) {
+ /* Build the candidate data (@rnd_orig plus @rnd_add) in rnd_new */
+ error = nhgrp_get_addition_group(rnh, rnd_orig, rnd_add,
+ &rnd_new);
+ if (error != 0) {
+ /* Any error other than EAGAIN is final */
+ if (error != EAGAIN)
+ break;
+
+ /*
+ * Group creation failed, most probably because
+ * @rnd_orig data got scheduled for deletion.
+ * Refresh @rnd_orig data and retry.
+ */
+ RIB_RLOCK(rnh);
+ lookup_prefix(rnh, info, rnd_orig);
+ RIB_RUNLOCK(rnh);
+ continue;
+ }
+
+ /* Try to switch the route from @rnd_orig to rnd_new */
+ error = change_route_conditional(rnh, rt, info, rnd_orig,
+ &rnd_new, rc);
+ if (error != EAGAIN)
+ break;
+ /* Only EAGAIN from the conditional change is counted as a retry */
+ RTSTAT_INC(rts_add_retry);
+ }
+
+ /* @rc is only inspected when the operation succeeded */
+ if (V_fib_hash_outbound == 0 && error == 0 &&
+ NH_IS_NHGRP(rc->rc_nh_new)) {
+ /*
+ * First multipath route got installed. Enable local
+ * outbound connections hashing.
+ */
+ if (bootverbose)
+ printf("FIB: enabled flowid calculation for locally-originated packets\n");
+ V_fib_hash_outbound = 1;
+ }
+
+ return (error);
+}
+
+/*
+ * Filter callback handed to nhgrp_get_filtered_group() by del_route_mpath().
+ * @_data carries the gateway sockaddr; the result is whatever
+ * match_nhop_gw() reports for @nh against that gateway.
+ */
+static bool
+gw_filter_func(const struct nhop_object *nh, void *_data)
+{
+ struct sockaddr *gw = (struct sockaddr *)_data;
+
+ return (match_nhop_gw(nh, gw));
+}
+
+/*
+ * Removes the paths of @nhg selected by the gateway supplied in @info
+ * (RTAX_GATEWAY) and installs the resulting nexthop data for @rt.
+ *
+ * Must be called with the rib write lock held.
+ * Returns 0 on success, with the operation result stored in @rc.
+ * Returns ESRCH when @info carries no gateway; any other error comes from
+ * the group filtering or route change step.
+ */
+int
+del_route_mpath(struct rib_head *rh, struct rt_addrinfo *info,
+    struct rtentry *rt, struct nhgrp_object *nhg,
+    struct rib_cmd_info *rc)
+{
+	struct route_nhop_data filtered;
+	struct sockaddr *gateway;
+	int error;
+
+	RIB_WLOCK_ASSERT(rh);
+
+	gateway = info->rti_info[RTAX_GATEWAY];
+	if (gateway == NULL)
+		return (ESRCH);
+
+	/* Compute the nexthop data with the gateway filter applied */
+	error = nhgrp_get_filtered_group(rh, nhg, gw_filter_func,
+	    (void *)gateway, &filtered);
+	if (error != 0)
+		return (error);
+
+	return (change_route_nhop(rh, rt, info, &filtered, rc));
+}
+
Index: sys/net/route/nhgrp.c
===================================================================
--- /dev/null
+++ sys/net/route/nhgrp.c
@@ -0,0 +1,344 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2020 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include "opt_inet.h"
+#include "opt_route.h"
+
+#include <sys/cdefs.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/lock.h>
+#include <sys/rmlock.h>
+#include <sys/rwlock.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/refcount.h>
+#include <sys/socket.h>
+#include <sys/sysctl.h>
+#include <sys/kernel.h>
+
+#include <net/if.h>
+#include <net/if_var.h>
+#include <net/if_dl.h>
+#include <net/route.h>
+#include <net/route/route_ctl.h>
+#include <net/route/route_var.h>
+#include <net/vnet.h>
+
+#include <netinet/in.h>
+#include <netinet/in_var.h>
+#include <netinet/in_fib.h>
+
+#include <net/route/nhop_utils.h>
+#include <net/route/nhop.h>
+#include <net/route/nhop_var.h>
+#include <net/route/nhgrp_var.h>
+
+/*
+ * This file contains data structures management logic for the nexthop
+ * groups ("nhgrp") route subsystem.
+ *
+ * Nexthop groups are used to store multiple routes available for the specific
+ * prefix. Nexthop groups are immutable and can be shared across multiple
+ * prefixes.
+ *
+ * Each group consists of a control plane part and a dataplane part.
+ * Control plane is basically a collection of nexthop objects with
+ * weights and refcount.
+ *
+ * Datapath consists of an array of nexthop pointers, compiled from control
+ * plane data to support O(1) nexthop selection.
+ *
+ * For example, consider the following group:
+ * [(nh1, weight=100), (nh2, weight=200)]
+ * It will compile to the following array:
+ * [nh1, nh2, nh2]
+ *
+ */
+
+/* Forward declarations of helpers defined further down in this file. */
+static void consider_resize(struct nh_control *ctl, uint32_t new_nh_buckets,
+ uint32_t new_idx_items);
+
+/* Comparison and hashing callbacks for nexthop group objects. */
+static int cmp_nhgrp(const struct nhgrp_priv *a, const struct nhgrp_priv *b);
+static unsigned int hash_nhgrp(const struct nhgrp_priv *obj);
+
+/*
+ * Folds the first @len bytes at @h into an unsigned value: every step
+ * multiplies the running value by 33 and XORs in the next byte.
+ * A non-positive @len yields 0.
+ */
+static unsigned
+djb_hash(const unsigned char *h, const int len)
+{
+	unsigned int acc = 0;
+	int pos = 0;
+
+	while (pos < len) {
+		acc = (acc * 33) ^ h[pos];
+		pos++;
+	}
+
+	return (acc);
+}
+
+/*
+ * Equality callback: returns non-zero when @a and @b describe the same
+ * control plane list (same nexthop count and byte-identical
+ * weightened_nhop arrays), 0 otherwise.
+ */
+static int
+cmp_nhgrp(const struct nhgrp_priv *a, const struct nhgrp_priv *b)
+{
+	size_t nbytes;
+
+	/*
+	 * In case of consistent hashing, there can be multiple nexthop groups
+	 * with the same "control plane" list of nexthops with weights and a
+	 * different set of "data plane" nexthops.
+	 * For now, ignore the data plane and focus on the control plane list.
+	 */
+	if (a->nhg_nh_count != b->nhg_nh_count)
+		return (0);
+
+	nbytes = sizeof(struct weightened_nhop) * a->nhg_nh_count;
+	return (memcmp(a->nhg_nh_weights, b->nhg_nh_weights, nbytes) == 0);
+}
+
+/*
+ * Hash callback: hashes the raw bytes of the object's weightened nexthop
+ * array (nhg_nh_count entries) with djb_hash().
+ */
+static unsigned int
+hash_nhgrp(const struct nhgrp_priv *obj)
+{
+	const unsigned char *bytes = (const unsigned char *)obj->nhg_nh_weights;
+	const int nbytes = sizeof(struct weightened_nhop) * obj->nhg_nh_count;
+
+	return (djb_hash(bytes, nbytes));
+}
+
+/*
+ * Looks up a group equal to @key in the group hash of @ctl.
+ *
+ * Returns object referenced and unlocked, or NULL when no matching group
+ * exists or the match already has a zero reference count.
+ */
+struct nhgrp_priv *
+find_nhgrp(struct nh_control *ctl, const struct nhgrp_priv *key)
+{
+ struct nhgrp_priv *priv_ret;
+
+ /* The lookup and the reference grab both happen under the read lock */
+ NHOPS_RLOCK(ctl);
+ CHT_SLIST_FIND_BYOBJ(&ctl->gr_head, mpath, key, priv_ret);
+ if (priv_ret != NULL) {
+ if (refcount_acquire_if_not_zero(&priv_ret->nhg_refcount) == 0) {
+ /* refcount is 0 -> group is being deleted */
+ priv_ret = NULL;
+ }
+ }
+ NHOPS_RUNLOCK(ctl);
+
+ return (priv_ret);
+}
+
+/*
+ * Allocates an index for @grp_priv and inserts the group into the hash of
+ * @ctl.
+ *
+ * Returns 1 on success and 0 when no index could be allocated.
+ * In both cases consider_resize() is invoked after the lock is dropped,
+ * using the resize hints sampled while the lock was held.
+ */
+int
+link_nhgrp(struct nh_control *ctl, struct nhgrp_priv *grp_priv)
+{
+ uint16_t idx;
+ uint32_t new_num_buckets, new_num_items;
+
+ NHOPS_WLOCK(ctl);
+ /* Check if we need to resize hash and index */
+ new_num_buckets = CHT_SLIST_GET_RESIZE_BUCKETS(&ctl->gr_head);
+ new_num_items = bitmask_get_resize_items(&ctl->gr_idx_head);
+
+ if (bitmask_alloc_idx(&ctl->gr_idx_head, &idx) != 0) {
+ NHOPS_WUNLOCK(ctl);
+ DPRINTF("Unable to allocate mpath index");
+ /* The group is left unlinked; the resize attempt still runs */
+ consider_resize(ctl, new_num_buckets, new_num_items);
+ return (0);
+ }
+
+ /* Record the index and owning control block before publishing */
+ grp_priv->nhg_idx = idx;
+ grp_priv->nh_control = ctl;
+ CHT_SLIST_INSERT_HEAD(&ctl->gr_head, mpath, grp_priv);
+
+ NHOPS_WUNLOCK(ctl);
+
+ consider_resize(ctl, new_num_buckets, new_num_items);
+
+ return (1);
+}
+
+/*
+ * Removes the group matching @key from the hash of @ctl and releases its
+ * index.
+ *
+ * Returns the unlinked group with nhg_idx and nh_control cleared, or NULL
+ * when no matching group is present in the hash.
+ */
+struct nhgrp_priv *
+unlink_nhgrp(struct nh_control *ctl, struct nhgrp_priv *key)
+{
+	struct nhgrp_priv *nhg_priv_ret;
+
+	NHOPS_WLOCK(ctl);
+
+	CHT_SLIST_REMOVE_BYOBJ(&ctl->gr_head, mpath, key, nhg_priv_ret);
+
+	if (nhg_priv_ret == NULL) {
+		DPRINTF("Unable to find nhop group!");
+		NHOPS_WUNLOCK(ctl);
+		return (NULL);
+	}
+
+	/*
+	 * Give the index back. The result is deliberately ignored, as no
+	 * recovery is attempted here.
+	 */
+	(void)bitmask_free_idx(&ctl->gr_idx_head, nhg_priv_ret->nhg_idx);
+	nhg_priv_ret->nhg_idx = 0;
+	nhg_priv_ret->nh_control = NULL;
+
+	NHOPS_WUNLOCK(ctl);
+
+	return (nhg_priv_ret);
+}
+
+/*
+ * Checks if hash needs resizing and performs this resize if necessary.
+ *
+ * @new_nh_buckets and @new_idx_items are the desired new sizes of the
+ * group hash and of the group index bitmask; 0 means "no resize needed".
+ * Memory is allocated with M_NOWAIT before taking the lock, so a failed
+ * allocation simply skips the corresponding resize.
+ */
+__noinline static void
+consider_resize(struct nh_control *ctl, uint32_t new_nh_buckets, uint32_t new_idx_items)
+{
+	void *nh_ptr, *nh_idx_ptr;
+	void *old_idx_ptr;
+	size_t alloc_size;
+
+	nh_ptr = NULL;
+	if (new_nh_buckets != 0) {
+		alloc_size = CHT_SLIST_GET_RESIZE_SIZE(new_nh_buckets);
+		nh_ptr = malloc(alloc_size, M_NHOP, M_NOWAIT | M_ZERO);
+	}
+
+	nh_idx_ptr = NULL;
+	if (new_idx_items != 0) {
+		alloc_size = bitmask_get_size(new_idx_items);
+		nh_idx_ptr = malloc(alloc_size, M_NHOP, M_NOWAIT | M_ZERO);
+	}
+
+	if (nh_ptr == NULL && nh_idx_ptr == NULL) {
+		/* Either resize is not required or allocations have failed. */
+		return;
+	}
+
+	DPRINTF("mp: going to resize: nh:[ptr:%p sz:%u] idx:[ptr:%p sz:%u]",
+	    nh_ptr, new_nh_buckets, nh_idx_ptr, new_idx_items);
+
+	old_idx_ptr = NULL;
+
+	NHOPS_WLOCK(ctl);
+	if (nh_ptr != NULL) {
+		CHT_SLIST_RESIZE(&ctl->gr_head, mpath, nh_ptr, new_nh_buckets);
+	}
+	if (nh_idx_ptr != NULL) {
+		/*
+		 * Copy and swap must both operate on the *group* index
+		 * (gr_idx_head); swapping into nh_idx_head would clobber the
+		 * nexthop index instead.
+		 * NOTE(review): bitmask_copy() is taken to return 0 on success,
+		 * as in the nexthop resize path - confirm against nhop_utils.
+		 */
+		if (bitmask_copy(&ctl->gr_idx_head, nh_idx_ptr,
+		    new_idx_items) == 0) {
+			bitmask_swap(&ctl->gr_idx_head, nh_idx_ptr,
+			    new_idx_items, &old_idx_ptr);
+			/* New bitmap is now owned by the control block */
+			nh_idx_ptr = NULL;
+		}
+	}
+	NHOPS_WUNLOCK(ctl);
+
+	if (nh_ptr != NULL)
+		free(nh_ptr, M_NHOP);
+	/* The new bitmap was not installed: release it instead of leaking */
+	if (nh_idx_ptr != NULL)
+		free(nh_idx_ptr, M_NHOP);
+	if (old_idx_ptr != NULL)
+		free(old_idx_ptr, M_NHOP);
+}
+
+/*
+ * Function allocating the necessary group data structures.
+ *
+ * Allocates an 8-bucket hash and a 128-item index bitmask and installs
+ * them into @ctl unless the hash has already been set up.
+ * Only the M_NOWAIT / M_WAITOK bits of @malloc_flags are honoured; M_ZERO
+ * is always added.
+ *
+ * Returns false when either allocation fails, true otherwise.
+ */
+bool
+nhgrp_ctl_alloc_default(struct nh_control *ctl, int malloc_flags)
+{
+ size_t alloc_size;
+ uint32_t num_buckets, num_items;
+ void *cht_ptr, *mask_ptr;
+
+ malloc_flags = (malloc_flags & (M_NOWAIT | M_WAITOK)) | M_ZERO;
+
+ num_buckets = 8;
+ alloc_size = CHT_SLIST_GET_RESIZE_SIZE(num_buckets);
+ cht_ptr = malloc(alloc_size, M_NHOP, malloc_flags);
+
+ if (cht_ptr == NULL) {
+ DPRINTF("mpath init failed");
+ return (false);
+ }
+
+ /*
+ * Allocate nexthop index bitmask.
+ */
+ num_items = 128;
+ mask_ptr = malloc(bitmask_get_size(num_items), M_NHOP, malloc_flags);
+ if (mask_ptr == NULL) {
+ DPRINTF("mpath bitmask init failed");
+ free(cht_ptr, M_NHOP);
+ return (false);
+ }
+
+ NHOPS_WLOCK(ctl);
+
+ /* Re-check under the write lock: a zero hash size means "not set up" */
+ if (ctl->gr_head.hash_size == 0) {
+ /* Init hash and bitmask */
+ CHT_SLIST_INIT(&ctl->gr_head, cht_ptr, num_buckets);
+ bitmask_init(&ctl->gr_idx_head, mask_ptr, num_items);
+ NHOPS_WUNLOCK(ctl);
+ } else {
+ /* Other thread has already initialized hash/bitmask */
+ NHOPS_WUNLOCK(ctl);
+ free(cht_ptr, M_NHOP);
+ free(mask_ptr, M_NHOP);
+ }
+
+ DPRINTF("mpath init done for fib/af %d/%d", ctl->rh->rib_fibnum,
+ ctl->rh->rib_family);
+
+ return (true);
+}
+
+/*
+ * Initializes the group hash and index bitmask of @ctl to an empty state
+ * (NULL storage, zero size). Always returns 0.
+ */
+int
+nhgrp_ctl_init(struct nh_control *ctl)
+{
+
+ /*
+ * By default, do not allocate datastructures as multipath
+ * routes will not be necessarily used.
+ */
+ CHT_SLIST_INIT(&ctl->gr_head, NULL, 0);
+ bitmask_init(&ctl->gr_idx_head, NULL, 0);
+ return (0);
+}
+
+/*
+ * Releases the hash bucket storage and the index bitmask storage of @ctl,
+ * if they were ever allocated. The pointers inside @ctl are not reset.
+ */
+void
+nhgrp_ctl_free(struct nh_control *ctl)
+{
+
+ if (ctl->gr_head.ptr != NULL)
+ free(ctl->gr_head.ptr, M_NHOP);
+ if (ctl->gr_idx_head.idx != NULL)
+ free(ctl->gr_idx_head.idx, M_NHOP);
+}
+
+/*
+ * Drops one nhg_linked reference on every group present in the hash of
+ * @ctl. The groups themselves stay in the hash.
+ * Must be called with the nexthop write lock held.
+ */
+void
+nhgrp_ctl_unlink_all(struct nh_control *ctl)
+{
+ struct nhgrp_priv *nhg_priv;
+
+ NHOPS_WLOCK_ASSERT(ctl);
+
+ CHT_SLIST_FOREACH(&ctl->gr_head, mpath, nhg_priv) {
+ DPRINTF("Marking nhgrp %u unlinked", nhg_priv->nhg_idx);
+ refcount_release(&nhg_priv->nhg_linked);
+ } CHT_SLIST_FOREACH_END;
+}
+
Index: sys/net/route/nhgrp_ctl.c
===================================================================
--- /dev/null
+++ sys/net/route/nhgrp_ctl.c
@@ -0,0 +1,750 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2020 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include "opt_inet.h"
+#include "opt_route.h"
+
+#include <sys/cdefs.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/lock.h>
+#include <sys/rmlock.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/refcount.h>
+#include <sys/socket.h>
+#include <sys/sysctl.h>
+#include <sys/kernel.h>
+#include <sys/epoch.h>
+
+#include <net/if.h>
+#include <net/if_var.h>
+#include <net/route.h>
+#include <net/route/route_ctl.h>
+#include <net/route/route_var.h>
+#include <net/vnet.h>
+
+#include <netinet/in.h>
+#include <netinet/in_var.h>
+#include <netinet/in_fib.h>
+
+#include <net/route/nhop_utils.h>
+#include <net/route/nhop.h>
+#include <net/route/nhop_var.h>
+#include <net/route/nhgrp_var.h>
+
+/*
+ * This file contains the supporting functions for creating multipath groups
+ * and compiling their dataplane parts.
+ */
+
+/* MPF_MULTIPATH must be the same as NHF_MULTIPATH for nhop selection to work */
+CTASSERT(MPF_MULTIPATH == NHF_MULTIPATH);
+/* Offset and size of flags field has to be the same for nhop/nhop groups */
+CHK_STRUCT_FIELD_GENERIC(struct nhop_object, nh_flags, struct nhgrp_object, mp_flags);
+/* Cap multipath to 64, as the larger values would break rib_cmd_info bmasks */
+CTASSERT(RIB_MAX_MPATH_WIDTH <= 64);
+
+/* Forward declarations of helpers defined further down in this file. */
+static int wn_cmp(const void *a, const void *b);
+static void sort_weightened_nhops(struct weightened_nhop *wn, int num_nhops);
+
+/* Group lookup/creation and teardown helpers. */
+static struct nhgrp_priv *get_nhgrp(struct nh_control *ctl,
+ struct weightened_nhop *wn, int num_nhops, int *perror);
+static void destroy_nhgrp(struct nhgrp_priv *nhg_priv);
+static void destroy_nhgrp_epoch(epoch_context_t ctx);
+static void free_nhgrp_nhops(struct nhgrp_priv *nhg_priv);
+
+
+/*
+ * qsort() comparator for struct weightened_nhop: orders by weight
+ * ascending, breaking ties by nexthop pointer value.
+ */
+static int
+wn_cmp(const void *a, const void *b)
+{
+	const struct weightened_nhop *lhs = a;
+	const struct weightened_nhop *rhs = b;
+
+	if (lhs->weight != rhs->weight)
+		return (lhs->weight > rhs->weight ? 1 : -1);
+
+	/* Compare nexthops by pointer */
+	if (lhs->nh != rhs->nh)
+		return (lhs->nh > rhs->nh ? 1 : -1);
+
+	return (0);
+}
+
+/*
+ * Perform in-place sorting for array of nexthops in @wn.
+ *
+ * To avoid nh groups duplication, nexthops/weights in the
+ * @wn need to be ordered deterministically.
+ * As this sorting is needed only for the control plane functionality,
+ * there are no specific external requirements.
+ *
+ * Sort by weight first, to ease calculation of the slot sizes.
+ * Ordering is defined by wn_cmp(): weight ascending, then nexthop pointer.
+ */
+static void
+sort_weightened_nhops(struct weightened_nhop *wn, int num_nhops)
+{
+
+ qsort(wn, num_nhops, sizeof(struct weightened_nhop), wn_cmp);
+}
+
+/*
+ * Calculate minimum number of slots required to fit the existing
+ * set of weights while maintaining weight coefficients.
+ *
+ * Assume @wn is sorted by weight ascending and each weight is > 0.
+ * Returns 0 when @num_items is 0 or the smallest weight is 0, so that
+ * the caller can reject the request instead of dividing by zero.
+ * The result never exceeds RIB_MAX_MPATH_WIDTH.
+ *
+ * Some examples:
+ * nh=1,weight=1 nh=2,weight=2 -> 3 slots [1, 2, 2]
+ * nh=1,weight=1000 nh=2,weight=2000 -> 3 slots: [1, 2, 2]
+ * nh=1,weight=17 nh=2,weight=37 -> 3 slots: [1, 2, 2]
+ * nh=1,weight=1 nh=2,weight=70 -> 64 slots: [1, 2, 2, ..]
+ */
+static uint32_t
+calc_min_mpath_slots(const struct weightened_nhop *wn, size_t num_items)
+{
+	uint32_t i, last, xmin;
+	uint64_t v, total = 0;
+
+	/* Nothing to size, or a zero weight that would zero the divisor */
+	if (num_items == 0 || wn[0].weight == 0)
+		return (0);
+
+	last = 0;
+	xmin = wn[0].weight;
+	for (i = 0; i < num_items; i++) {
+		total += wn[i].weight;
+		/* Track the smallest non-zero step between adjacent weights */
+		if ((wn[i].weight - last < xmin) && (wn[i].weight != last))
+			xmin = wn[i].weight - last;
+		last = wn[i].weight;
+	}
+	/* Got minimum unit of desired accuracy */
+	v = total / xmin;
+	if (v > RIB_MAX_MPATH_WIDTH) {
+		/*
+		 * TODO: round to the MAX_MPATH and
+		 * see if this reduces the group size.
+		 */
+		v = RIB_MAX_MPATH_WIDTH;
+	}
+
+	return (uint32_t)v;
+}
+
+/*
+ * Nexthop group data consists of
+ * 1) dataplane part, with nhgrp_object as a header followed by an
+ * arbitrary number of nexthop pointers.
+ * 2) control plane part, with nhgrp_priv as a header, followed by
+ * an arbitrary number of 'struct weightened_nhop' objects.
+ *
+ * Given nexthop groups are (mostly) immutable, allocate all data
+ * in one go.
+ *
+ * Returns the number of bytes needed for a group with @mp_size dataplane
+ * slots and @num_nhops control plane entries.
+ */
+__noinline static size_t
+get_nhgrp_alloc_size(uint32_t mp_size, uint32_t num_nhops)
+{
+
+	return (sizeof(struct nhgrp_object) +
+	    mp_size * sizeof(struct nhop_object *) +
+	    sizeof(struct nhgrp_priv) +
+	    num_nhops * sizeof(struct weightened_nhop));
+}
+
+/*
+ * Compile actual list of nexthops to be used by datapath from
+ * the nexthop group @dst_priv, filling @num_slots entries of its
+ * dataplane array from the weighted list @x.
+ *
+ * For example, compiling control plane list of 2 nexthops
+ * [(200, A), (100, B)] would result in the datapath array
+ * [A, A, B]
+ *
+ * Each nexthop receives a share of the slots still unassigned that is
+ * proportional to its weight within the weight sum still unassigned
+ * (integer division), so the last nexthop takes whatever remains.
+ */
+static void
+compile_nhgrp(struct nhgrp_priv *dst_priv, const struct weightened_nhop *x,
+ uint32_t num_slots)
+{
+ struct nhgrp_object *dst;
+ int i, slot_idx, remaining_slots;
+ uint64_t remaining_sum, nh_weight, nh_slots;
+
+ slot_idx = 0;
+ dst = dst_priv->nhg;
+ /* Calculate sum of all weights */
+ remaining_sum = 0;
+ for (i = 0; i < dst_priv->nhg_nh_count; i++)
+ remaining_sum += x[i].weight;
+ remaining_slots = num_slots;
+ DPRINTF("O: %u/%u", (uint32_t)remaining_sum, remaining_slots);
+ for (i = 0; i < dst_priv->nhg_nh_count; i++) {
+ /* Calculate number of slots for the current nexthop */
+ if (remaining_sum > 0) {
+ nh_weight = (uint64_t)x[i].weight;
+ nh_slots = (nh_weight * remaining_slots / remaining_sum);
+ } else
+ nh_slots = 0;
+
+ /* Account for this nexthop before handing out its slots */
+ remaining_sum -= x[i].weight;
+ remaining_slots -= nh_slots;
+
+ DPRINTF(" OO[%d]: %u/%u curr=%d slot_idx=%d", i,
+ (uint32_t)remaining_sum, remaining_slots,
+ (int)nh_slots, slot_idx);
+
+ /* Emit nh_slots consecutive copies of the nexthop pointer */
+ while (nh_slots-- > 0)
+ dst->nhops[slot_idx++] = x[i].nh;
+ }
+}
+
+/*
+ * Allocates new nexthop group for the list of weightened nexthops.
+ * Assume sorted list.
+ * Does NOT reference any nexthops in the group.
+ * Returns group with refcount=1 or NULL (no slots could be computed for
+ * the weights, or the M_NOWAIT allocation failed).
+ */
+static struct nhgrp_priv *
+alloc_nhgrp(struct weightened_nhop *wn, int num_nhops)
+{
+	struct nhgrp_object *grp;
+	struct nhgrp_priv *priv;
+	uint32_t num_slots;
+	size_t alloc_sz;
+
+	num_slots = calc_min_mpath_slots(wn, num_nhops);
+	if (num_slots == 0) {
+		/* Zero weights, abort */
+		return (NULL);
+	}
+
+	/* Dataplane and control plane parts live in a single allocation */
+	alloc_sz = get_nhgrp_alloc_size(num_slots, num_nhops);
+	grp = malloc(alloc_sz, M_NHOP, M_NOWAIT | M_ZERO);
+	if (grp == NULL)
+		return (NULL);
+
+	/* Has to be the first to make NHGRP_PRIV() work */
+	grp->mp_size = num_slots;
+	DPRINTF("new mpath group: num_nhops: %u", (uint32_t)num_slots);
+	grp->mp_flags = MPF_MULTIPATH;
+
+	priv = NHGRP_PRIV(grp);
+	priv->nhg_nh_count = num_nhops;
+	refcount_init(&priv->nhg_refcount, 1);
+
+	/* Please see nhgrp_free() comments on the initial value */
+	refcount_init(&priv->nhg_linked, 2);
+
+	priv->nhg = grp;
+	memcpy(&priv->nhg_nh_weights[0], wn,
+	    num_nhops * sizeof(struct weightened_nhop));
+
+	/* Build the dataplane slot array from the control plane list */
+	compile_nhgrp(priv, wn, grp->mp_size);
+
+	return (priv);
+}
+
+/*
+ * Drops one reference on @nhg. When that was the last reference, the
+ * group is unlinked from its control block (unless the rib teardown has
+ * already done so) and its destruction is scheduled via epoch_call().
+ */
+void
+nhgrp_free(struct nhgrp_object *nhg)
+{
+ struct nhgrp_priv *nhg_priv;
+ struct nh_control *ctl;
+ struct epoch_tracker et;
+
+ nhg_priv = NHGRP_PRIV(nhg);
+
+ /* Not the last reference: nothing more to do */
+ if (!refcount_release(&nhg_priv->nhg_refcount))
+ return;
+
+ /*
+ * group objects don't have an explicit lock attached to it.
+ * As groups are reclaimed based on reference count, it is possible
+ * that some groups will persist after vnet destruction callback
+ * called. Given that, handle scenario with nhgrp_free() being
+ * called either after or simultaneously with nhgrp_ctl_unlink_all()
+ * by using another reference counter: nhg_linked.
+ *
+ * There are only 2 places, where nhg_linked can be decreased:
+ * rib destroy (nhgrp_ctl_unlink_all) and this function.
+ * nhg_linked can never be increased.
+ *
+ * Hence, use initial value of 2 to make use of
+ * refcount_release_if_not_last().
+ *
+ * There can be two scenarios when calling this function:
+ *
+ * 1) nhg_linked value is 2. This means that either
+ * nhgrp_ctl_unlink_all() has not been called OR it is running,
+ * but we are guaranteed that nh_control won't be freed in
+ * this epoch. Hence, nexthop can be safely unlinked.
+ *
+ * 2) nhg_linked value is 1. In that case, nhgrp_ctl_unlink_all()
+ * has been called and nhgrp unlink can be skipped.
+ */
+
+ NET_EPOCH_ENTER(et);
+ if (refcount_release_if_not_last(&nhg_priv->nhg_linked)) {
+ ctl = nhg_priv->nh_control;
+ if (unlink_nhgrp(ctl, nhg_priv) == NULL) {
+ /* Do not try to reclaim */
+ DPRINTF("Failed to unlink nexhop group %p", nhg_priv);
+ NET_EPOCH_EXIT(et);
+ return;
+ }
+ }
+ NET_EPOCH_EXIT(et);
+
+ /* Defer the actual destruction to destroy_nhgrp_epoch() */
+ epoch_call(net_epoch_preempt, destroy_nhgrp_epoch,
+ &nhg_priv->nhg_epoch_ctx);
+}
+
+/*
+ * Destroys all local resources belonging to @nhg_priv.
+ *
+ * Frees the allocation starting at nhg_priv->nhg. Nexthop references are
+ * not touched here.
+ */
+__noinline static void
+destroy_nhgrp_int(struct nhgrp_priv *nhg_priv)
+{
+
+ free(nhg_priv->nhg, M_NHOP);
+}
+
+/*
+ * Final destructor for a group: releases the references held on its
+ * nexthops and then frees the group memory.
+ * Asserts that the group has no references left (nhg_refcount == 0) and
+ * that its index has been cleared (nhg_idx == 0).
+ */
+__noinline static void
+destroy_nhgrp(struct nhgrp_priv *nhg_priv)
+{
+
+ KASSERT((nhg_priv->nhg_refcount == 0), ("nhg_refcount != 0"));
+
+ DPRINTF("DEL MPATH %p", nhg_priv);
+
+ KASSERT((nhg_priv->nhg_idx == 0), ("gr_idx != 0"));
+
+ free_nhgrp_nhops(nhg_priv);
+
+ destroy_nhgrp_int(nhg_priv);
+}
+
+/*
+ * Epoch callback indicating group is safe to destroy.
+ * Recovers the nhgrp_priv that embeds @ctx (nhg_epoch_ctx member) and
+ * hands it to destroy_nhgrp().
+ */
+static void
+destroy_nhgrp_epoch(epoch_context_t ctx)
+{
+ struct nhgrp_priv *nhg_priv;
+
+ nhg_priv = __containerof(ctx, struct nhgrp_priv, nhg_epoch_ctx);
+
+ destroy_nhgrp(nhg_priv);
+}
+
+/*
+ * Takes a reference on every nexthop listed in @nhg_priv.
+ *
+ * Returns true when all references were obtained. If any nexthop cannot
+ * be referenced, the references taken so far are released and false is
+ * returned.
+ */
+static bool
+ref_nhgrp_nhops(struct nhgrp_priv *nhg_priv)
+{
+	int taken, undo;
+
+	for (taken = 0; taken < nhg_priv->nhg_nh_count; taken++) {
+		if (nhop_try_ref_object(nhg_priv->nhg_nh_weights[taken].nh) == 0)
+			break;
+	}
+
+	if (taken == nhg_priv->nhg_nh_count)
+		return (true);
+
+	/*
+	 * Failed to ref the nexthop, b/c it's deleted.
+	 * Need to rollback references back.
+	 */
+	for (undo = 0; undo < taken; undo++)
+		nhop_free(nhg_priv->nhg_nh_weights[undo].nh);
+
+	return (false);
+}
+
+/*
+ * Calls nhop_free() on every nexthop listed in the control plane array of
+ * @nhg_priv (nhg_nh_count entries).
+ */
+static void
+free_nhgrp_nhops(struct nhgrp_priv *nhg_priv)
+{
+
+ for (int i = 0; i < nhg_priv->nhg_nh_count; i++)
+ nhop_free(nhg_priv->nhg_nh_weights[i].nh);
+}
+
+/*
+ * Creates or looks up an existing nexthop group based on @wn and @num_nhops.
+ * @wn is sorted in place.
+ *
+ * Returns referenced nhop group or NULL, passing error code in @perror:
+ *  0      - success
+ *  ENOMEM - group data structures or the group itself can't be allocated
+ *  EEXIST - adjacent entries of the sorted list share a nexthop index
+ *  EAGAIN - a nexthop is being deleted or no group index is available;
+ *           the caller may retry
+ */
+struct nhgrp_priv *
+get_nhgrp(struct nh_control *ctl, struct weightened_nhop *wn, int num_nhops,
+    int *perror)
+{
+	struct nhgrp_priv *key, *nhg_priv;
+
+	if (ctl->gr_head.hash_size == 0) {
+		/* First multipath request. Bootstrap mpath datastructures. */
+		if (nhgrp_ctl_alloc_default(ctl, M_NOWAIT) == 0) {
+			*perror = ENOMEM;
+			return (NULL);
+		}
+	}
+
+	/* Sort nexthops & check there are no duplicates */
+	sort_weightened_nhops(wn, num_nhops);
+	uint32_t last_id = 0;
+	for (int i = 0; i < num_nhops; i++) {
+		if (wn[i].nh->nh_priv->nh_idx == last_id) {
+			*perror = EEXIST;
+			return (NULL);
+		}
+		last_id = wn[i].nh->nh_priv->nh_idx;
+	}
+
+	if ((key = alloc_nhgrp(wn, num_nhops)) == NULL) {
+		*perror = ENOMEM;
+		return (NULL);
+	}
+
+	nhg_priv = find_nhgrp(ctl, key);
+	if (nhg_priv != NULL) {
+		/*
+		 * Free originally-created group. As it hasn't been linked
+		 * and the dependent nexthops haven't been referenced, just free
+		 * the group.
+		 */
+		destroy_nhgrp_int(key);
+		*perror = 0;
+		return (nhg_priv);
+	}
+
+	/* No existing group, try to link the new one */
+	if (!ref_nhgrp_nhops(key)) {
+		/*
+		 * Some of the nexthops have been scheduled for deletion.
+		 * As the group hasn't been linked / no nexthops have been
+		 * referenced, call the final destructor immediately.
+		 */
+		destroy_nhgrp_int(key);
+		*perror = EAGAIN;
+		return (NULL);
+	}
+
+	if (link_nhgrp(ctl, key) == 0) {
+		/*
+		 * Unable to allocate index. The group was never linked, so
+		 * nobody else can see it: drop the nexthop references taken
+		 * above and free it directly. destroy_nhgrp() can't be used
+		 * here, as the group still carries its initial reference.
+		 * Return NULL rather than the freed pointer.
+		 */
+		free_nhgrp_nhops(key);
+		destroy_nhgrp_int(key);
+		*perror = EAGAIN;
+		return (NULL);
+	}
+
+	*perror = 0;
+	return (key);
+}
+
+/*
+ * Appends one or more nexthops denoted by @wn to the nexthop group @gr_orig.
+ *
+ * Returns referenced nexthop group or NULL. In the latter case, @perror is
+ * filled with an error code.
+ * Note that this function itself does not check whether the new nexthops
+ * are already present in @gr_orig: the combined array is handed to
+ * get_nhgrp(), which sorts it and rejects repeated nexthop indexes with
+ * EEXIST.
+ */
+static struct nhgrp_priv *
+append_nhops(struct nh_control *ctl, const struct nhgrp_object *gr_orig,
+    struct weightened_nhop *wn, int num_nhops, int *perror)
+{
+	/*
+	 * On-stack scratch space for the common small case. Declared with
+	 * the element type (not as a char array) so it is properly aligned.
+	 */
+	struct weightened_nhop storage[4];
+	struct weightened_nhop *pnhops;
+	struct nhgrp_priv *nhg_priv;
+	const struct nhgrp_priv *src_priv;
+	size_t sz;
+	int curr_nhops;
+
+	src_priv = NHGRP_PRIV_CONST(gr_orig);
+	curr_nhops = src_priv->nhg_nh_count;
+
+	*perror = 0;
+
+	sz = (curr_nhops + num_nhops) * sizeof(struct weightened_nhop);
+	/* optimize for <= 4 paths */
+	if (sz <= sizeof(storage))
+		pnhops = storage;
+	else {
+		pnhops = malloc(sz, M_TEMP, M_NOWAIT);
+		if (pnhops == NULL) {
+			*perror = ENOMEM;
+			return (NULL);
+		}
+	}
+
+	/* Copy nhops from original group first, then the additions */
+	memcpy(pnhops, src_priv->nhg_nh_weights,
+	    curr_nhops * sizeof(struct weightened_nhop));
+	memcpy(&pnhops[curr_nhops], wn,
+	    num_nhops * sizeof(struct weightened_nhop));
+	curr_nhops += num_nhops;
+
+	/* get_nhgrp() sets @perror on both success and failure. */
+	nhg_priv = get_nhgrp(ctl, pnhops, curr_nhops, perror);
+
+	if (pnhops != storage)
+		free(pnhops, M_TEMP);
+
+	return (nhg_priv);
+}
+
+
+/*
+ * Creates/finds nexthop group based on @wn and @num_nhops.
+ *
+ * Returns 0 on success, storing the referenced group in @rnd->rnd_nhgrp.
+ * On failure returns the error reported by get_nhgrp() and sets
+ * @rnd->rnd_nhgrp to NULL. @rnd->rnd_weight is set to 0 in both cases.
+ *
+ * If the error is EAGAIN, then the operation can be retried.
+ */
+int
+nhgrp_get_group(struct rib_head *rh, struct weightened_nhop *wn, int num_nhops,
+    struct route_nhop_data *rnd)
+{
+	struct nh_control *ctl = rh->nh_control;
+	struct nhgrp_priv *nhg_priv;
+	int error;
+
+	nhg_priv = get_nhgrp(ctl, wn, num_nhops, &error);
+	/* Never leave a stale/indeterminate pointer in @rnd on failure. */
+	rnd->rnd_nhgrp = (nhg_priv != NULL) ? nhg_priv->nhg : NULL;
+	rnd->rnd_weight = 0;
+
+	return (error);
+}
+
+/*
+ * Builds the result of removing nexthops from group @src.
+ *
+ * @flt_func is called with each nexthop of @src and @flt_data; nexthops
+ * for which it returns non-zero are dropped. The outcome is stored in @rnd:
+ *  - no nexthops left:  rnd_nhgrp = NULL, rnd_weight = 0
+ *  - one nexthop left:  that nexthop and its weight; a reference is taken
+ *    with nhop_try_ref_object(). If that fails, EAGAIN is returned and
+ *    @rnd still names the nexthop although no reference is held.
+ *  - two or more left:  group obtained from get_nhgrp(), rnd_weight = 0
+ *
+ * Returns 0 on success or an error code (ENOMEM, EAGAIN, or whatever
+ * get_nhgrp() reports).
+ */
+int
+nhgrp_get_filtered_group(struct rib_head *rh, const struct nhgrp_object *src,
+    nhgrp_filter_cb_t flt_func, void *flt_data, struct route_nhop_data *rnd)
+{
+	/*
+	 * On-stack scratch space for the common small case. Declared with
+	 * the element type (not as a char array) so it is properly aligned.
+	 */
+	struct weightened_nhop storage[4];
+	struct nh_control *ctl = rh->nh_control;
+	struct weightened_nhop *pnhops;
+	const struct nhgrp_priv *mp_priv, *src_priv;
+	size_t sz;
+	int error, i, num_nhops;
+
+	src_priv = NHGRP_PRIV_CONST(src);
+
+	sz = src_priv->nhg_nh_count * (sizeof(struct weightened_nhop));
+	/* optimize for <= 4 paths */
+	if (sz <= sizeof(storage))
+		pnhops = storage;
+	else {
+		if ((pnhops = malloc(sz, M_TEMP, M_NOWAIT)) == NULL)
+			return (ENOMEM);
+	}
+
+	/* Filter nexthops: keep the ones @flt_func does not select */
+	error = 0;
+	num_nhops = 0;
+	for (i = 0; i < src_priv->nhg_nh_count; i++) {
+		if (flt_func(src_priv->nhg_nh_weights[i].nh, flt_data))
+			continue;
+		memcpy(&pnhops[num_nhops++], &src_priv->nhg_nh_weights[i],
+		    sizeof(struct weightened_nhop));
+	}
+
+	if (num_nhops == 0) {
+		rnd->rnd_nhgrp = NULL;
+		rnd->rnd_weight = 0;
+	} else if (num_nhops == 1) {
+		/* A single survivor is returned as a plain nexthop. */
+		rnd->rnd_nhop = pnhops[0].nh;
+		rnd->rnd_weight = pnhops[0].weight;
+		if (nhop_try_ref_object(rnd->rnd_nhop) == 0)
+			error = EAGAIN;
+	} else {
+		mp_priv = get_nhgrp(ctl, pnhops, num_nhops, &error);
+		if (mp_priv != NULL)
+			rnd->rnd_nhgrp = mp_priv->nhg;
+		rnd->rnd_weight = 0;
+	}
+
+	if (pnhops != storage)
+		free(pnhops, M_TEMP);
+
+	return (error);
+}
+
+/*
+ * Produces the nexthop data for "@rnd_orig plus the path in @rnd_add".
+ *
+ * - @rnd_orig empty (NULL nexthop): @rnd_add is copied to @rnd_new and
+ *   its nexthop is referenced; EAGAIN if the reference cannot be taken.
+ * - @rnd_orig is a plain nexthop: a two-member group is requested.
+ * - @rnd_orig is a group: the new path is appended to it.
+ *
+ * Returns 0 on success and stores result in @rnd_new; otherwise returns
+ * the error reported while obtaining the group.
+ */
+int
+nhgrp_get_addition_group(struct rib_head *rh, struct route_nhop_data *rnd_orig,
+    struct route_nhop_data *rnd_add, struct route_nhop_data *rnd_new)
+{
+	struct nh_control *ctl = rh->nh_control;
+	struct weightened_nhop paths[2];
+	struct nhgrp_priv *grp;
+	int error;
+
+	if (rnd_orig->rnd_nhop == NULL) {
+		/* Nothing to merge with: hand back the added nexthop itself */
+		*rnd_new = *rnd_add;
+		return (nhop_try_ref_object(rnd_new->rnd_nhop) == 0 ?
+		    EAGAIN : 0);
+	}
+
+	/* The to-be-added path always goes first */
+	paths[0].nh = rnd_add->rnd_nhop;
+	paths[0].weight = rnd_add->rnd_weight;
+
+	if (NH_IS_NHGRP(rnd_orig->rnd_nhop)) {
+		/* Existing group: new group is the old members + paths[0] */
+		grp = append_nhops(ctl, rnd_orig->rnd_nhgrp, &paths[0], 1,
+		    &error);
+	} else {
+		/* Two plain nexthops: build a group out of both */
+		paths[1].nh = rnd_orig->rnd_nhop;
+		paths[1].weight = rnd_orig->rnd_weight;
+		grp = get_nhgrp(ctl, paths, 2, &error);
+	}
+
+	if (grp == NULL)
+		return (error);
+
+	rnd_new->rnd_nhgrp = grp->nhg;
+	rnd_new->rnd_weight = 0;
+	return (0);
+}
+
+/*
+ * Gives access to the weighted nexthop array of group @nhg.
+ * The number of array items is written to @pnum_nhops; the returned
+ * pointer refers to the array stored inside the group itself.
+ */
+struct weightened_nhop *
+nhgrp_get_nhops(struct nhgrp_object *nhg, uint32_t *pnum_nhops)
+{
+	struct nhgrp_priv *priv;
+
+	KASSERT(((nhg->mp_flags & MPF_MULTIPATH) != 0), ("nhop is not mpath"));
+
+	priv = NHGRP_PRIV(nhg);
+	*pnum_nhops = priv->nhg_nh_count;
+	return (&priv->nhg_nh_weights[0]);
+}
+
+/*
+ * Serializes one nexthop group into @buffer and passes it to SYSCTL_OUT().
+ *
+ * Layout written to @buffer, in order:
+ *   struct rt_msghdr
+ *   struct mpath_external
+ *   struct mpath_nhop_external[nhg_nh_count]  (nexthop index + weight)
+ *   uint32_t[mp_size]                         (nexthop index per slot)
+ *
+ * No bounds check is done here: the caller has to supply a @buffer large
+ * enough for the layout above. @rh is not used.
+ * Returns the SYSCTL_OUT() result.
+ */
+__noinline static int
+dump_nhgrp_entry(struct rib_head *rh, const struct nhgrp_priv *nhg_priv,
+    char *buffer, struct sysctl_req *w)
+{
+	struct nhgrp_object *grp = nhg_priv->nhg;
+	struct rt_msghdr *hdr;
+	struct mpath_external *grp_out;
+	struct mpath_nhop_external *nh_out;
+	uint32_t *slot_out;
+	size_t len;
+
+	len = sizeof(struct rt_msghdr) + sizeof(struct mpath_external);
+	len += sizeof(struct mpath_nhop_external) * nhg_priv->nhg_nh_count;
+	len += sizeof(uint32_t) * grp->mp_size;
+
+	bzero(buffer, len);
+
+	/* Message header */
+	hdr = (struct rt_msghdr *)buffer;
+	hdr->rtm_msglen = len;
+	hdr->rtm_version = RTM_VERSION;
+	hdr->rtm_type = RTM_GET;
+
+	/* Group summary follows the header */
+	grp_out = (struct mpath_external *)(hdr + 1);
+	grp_out->mp_idx = nhg_priv->nhg_idx;
+	grp_out->mp_refcount = nhg_priv->nhg_refcount;
+	grp_out->mp_nh_count = nhg_priv->nhg_nh_count;
+	grp_out->mp_group_size = grp->mp_size;
+
+	/* One record per configured nexthop */
+	nh_out = (struct mpath_nhop_external *)(grp_out + 1);
+	for (int i = 0; i < nhg_priv->nhg_nh_count; i++) {
+		const struct weightened_nhop *wn = &nhg_priv->nhg_nh_weights[i];
+
+		nh_out[i].nh_idx = wn->nh->nh_priv->nh_idx;
+		nh_out[i].nh_weight = wn->weight;
+	}
+
+	/* One nexthop index per slot of the group's nhops[] array */
+	slot_out = (uint32_t *)&nh_out[nhg_priv->nhg_nh_count];
+	for (int i = 0; i < grp->mp_size; i++)
+		slot_out[i] = grp->nhops[i]->nh_priv->nh_idx;
+
+	return (SYSCTL_OUT(w, buffer, len));
+}
+
+/*
+ * Exports all nexthop groups of @rh via sysctl request @w, one
+ * dump_nhgrp_entry() record per group.
+ *
+ * Returns 0 on success (including "no groups"), or the first error
+ * reported by dump_nhgrp_entry().
+ */
+int
+nhgrp_dump_sysctl(struct rib_head *rh, struct sysctl_req *w)
+{
+	struct nh_control *ctl = rh->nh_control;
+	struct epoch_tracker et;
+	struct nhgrp_priv *nhg_priv;
+	char *buffer;
+	size_t sz;
+	int error;
+
+	if (ctl->gr_head.items_count == 0)
+		return (0);
+
+	/*
+	 * Scratch buffer for a single record. It has to cover everything
+	 * dump_nhgrp_entry() writes, including the leading rt_msghdr, which
+	 * was previously not accounted for.
+	 * NOTE(review): sizing assumes nhg_nh_count and mp_size never exceed
+	 * RIB_MAX_MPATH_WIDTH - confirm against the group allocation code.
+	 */
+	sz = sizeof(struct rt_msghdr) + sizeof(struct mpath_external);
+	sz += (sizeof(struct mpath_nhop_external) + sizeof(uint32_t)) *
+	    RIB_MAX_MPATH_WIDTH;
+	buffer = malloc(sz, M_TEMP, M_WAITOK);
+
+	DPRINTF("NHGRP DUMP: count=%u", ctl->gr_head.items_count);
+	NET_EPOCH_ENTER(et);
+	NHOPS_RLOCK(ctl);
+	CHT_SLIST_FOREACH(&ctl->gr_head, mpath, nhg_priv) {
+		error = dump_nhgrp_entry(rh, nhg_priv, buffer, w);
+		if (error != 0) {
+			/* Unwind in reverse order of acquisition. */
+			NHOPS_RUNLOCK(ctl);
+			NET_EPOCH_EXIT(et);
+			free(buffer, M_TEMP);
+			return (error);
+		}
+	} CHT_SLIST_FOREACH_END;
+	NHOPS_RUNLOCK(ctl);
+	NET_EPOCH_EXIT(et);
+
+	free(buffer, M_TEMP);
+
+	return (0);
+}
Index: sys/net/route/nhgrp_var.h
===================================================================
--- /dev/null
+++ sys/net/route/nhgrp_var.h
@@ -0,0 +1,72 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2020 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * This header file contains private definitions for the nexthop groups.
+ *
+ * Header is not intended to be included by the code external to the
+ * routing subsystem.
+ */
+
+#ifndef _NET_ROUTE_NHGRP_VAR_H_
+#define _NET_ROUTE_NHGRP_VAR_H_
+
+/* nhgrp hash definition */
+/* produce hash value for an object */
+#define mpath_hash_obj(_obj) (hash_nhgrp(_obj))
+/* compare two objects */
+#define mpath_cmp(_one, _two) (cmp_nhgrp(_one, _two))
+/* next object accessor */
+#define mpath_next(_obj) (_obj)->nhg_priv_next
+
+struct nhgrp_priv {
+ uint32_t nhg_idx; /* group index; exported as mp_idx by the sysctl dump */
+ uint8_t nhg_nh_count; /* number of items in nhg_nh_weights */
+ uint8_t nhg_spare[3]; /* unused */
+ u_int nhg_refcount; /* use refcount */
+ u_int nhg_linked; /* refcount(9), == 2 if linked to the list */
+ struct nh_control *nh_control; /* parent control structure */
+ struct nhgrp_priv *nhg_priv_next; /* next-object link used by mpath_next() */
+ struct nhgrp_object *nhg; /* group object this private data belongs to */
+ struct epoch_context nhg_epoch_ctx; /* epoch data for deferred group destruction */
+ struct weightened_nhop nhg_nh_weights[0]; /* nhg_nh_count nexthop/weight pairs, stored inline */
+};
+
+#define _NHGRP_PRIV(_src) (&(_src)->nhops[(_src)->mp_size])
+#define NHGRP_PRIV(_src) ((struct nhgrp_priv *)_NHGRP_PRIV(_src))
+#define NHGRP_PRIV_CONST(_src) ((const struct nhgrp_priv *)_NHGRP_PRIV(_src))
+
+/* nhgrp.c */
+bool nhgrp_ctl_alloc_default(struct nh_control *ctl, int malloc_flags);
+struct nhgrp_priv *find_nhgrp(struct nh_control *ctl, const struct nhgrp_priv *key);
+int link_nhgrp(struct nh_control *ctl, struct nhgrp_priv *grp_priv);
+struct nhgrp_priv *unlink_nhgrp(struct nh_control *ctl, struct nhgrp_priv *key);
+
+#endif
+
Index: sys/net/route/nhop.h
===================================================================
--- sys/net/route/nhop.h
+++ sys/net/route/nhop.h
@@ -155,7 +155,7 @@
*/
#define NH_IS_VALID(_nh) RT_LINK_IS_UP((_nh)->nh_ifp)
-#define NH_IS_MULTIPATH(_nh) ((_nh)->nh_flags & NHF_MULTIPATH)
+#define NH_IS_NHGRP(_nh) ((_nh)->nh_flags & NHF_MULTIPATH)
#define RT_GATEWAY(_rt) ((struct sockaddr *)&(_rt)->rt_nhop->gw4_sa)
#define RT_GATEWAY_CONST(_rt) ((const struct sockaddr *)&(_rt)->rt_nhop->gw4_sa)
@@ -166,6 +166,11 @@
_nh = NULL; \
} while (0)
+struct weightened_nhop {
+ struct nhop_object *nh; /* nexthop this entry refers to */
+ uint32_t weight; /* weight assigned to @nh */
+};
+
void nhop_free(struct nhop_object *nh);
struct sysctl_req;
Index: sys/net/route/nhop.c
===================================================================
--- sys/net/route/nhop.c
+++ sys/net/route/nhop.c
@@ -64,7 +64,7 @@
* is backed by the bitmask array.
*/
-static MALLOC_DEFINE(M_NHOP, "nhops", "nexthops data");
+MALLOC_DEFINE(M_NHOP, "nhops", "nexthops data");
/* Hash management functions */
@@ -112,6 +112,9 @@
NHOPS_LOCK_DESTROY(ctl);
free(ctl->nh_head.ptr, M_NHOP);
free(ctl->nh_idx_head.idx, M_NHOP);
+#ifdef ROUTE_MPATH
+ nhgrp_ctl_free(ctl);
+#endif
free(ctl, M_NHOP);
}
@@ -154,6 +157,9 @@
DPRINTF("Marking nhop %u unlinked", nh_priv->nh_idx);
refcount_release(&nh_priv->nh_linked);
} CHT_SLIST_FOREACH_END;
+#ifdef ROUTE_MPATH
+ nhgrp_ctl_unlink_all(ctl);
+#endif
NHOPS_WUNLOCK(ctl);
/*
Index: sys/net/route/nhop_ctl.c
===================================================================
--- sys/net/route/nhop_ctl.c
+++ sys/net/route/nhop_ctl.c
@@ -685,18 +685,18 @@
&nh_priv->nh_epoch_ctx);
}
-int
-nhop_ref_any(struct nhop_object *nh)
-{
-
- return (nhop_try_ref_object(nh));
-}
-
void
nhop_free_any(struct nhop_object *nh)
{
+#ifdef ROUTE_MPATH
+ if (!NH_IS_NHGRP(nh))
+ nhop_free(nh);
+ else
+ nhgrp_free((struct nhgrp_object *)nh);
+#else
nhop_free(nh);
+#endif
}
/* Helper functions */
Index: sys/net/route/nhop_utils.c
===================================================================
--- sys/net/route/nhop_utils.c
+++ sys/net/route/nhop_utils.c
@@ -29,7 +29,6 @@
__FBSDID("$FreeBSD$");
#include "opt_inet.h"
#include "opt_route.h"
-#include "opt_mpath.h"
#include <sys/param.h>
#include <sys/systm.h>
Index: sys/net/route/nhop_var.h
===================================================================
--- sys/net/route/nhop_var.h
+++ sys/net/route/nhop_var.h
@@ -37,6 +37,8 @@
#ifndef _NET_ROUTE_NHOP_VAR_H_
#define _NET_ROUTE_NHOP_VAR_H_
+MALLOC_DECLARE(M_NHOP);
+
/* define nhop hash table */
struct nhop_priv;
CHT_SLIST_DEFINE(nhops, struct nhop_priv);
@@ -47,9 +49,15 @@
/* next object accessor */
#define nhops_next(_obj) (_obj)->nh_next
+/* define multipath hash table */
+struct nhgrp_priv;
+CHT_SLIST_DEFINE(nhgroups, struct nhgrp_priv);
+
struct nh_control {
struct nhops_head nh_head; /* hash table head */
struct bitmask_head nh_idx_head; /* nhop index head */
+ struct nhgroups_head gr_head; /* nhgrp hash table head */
+ struct bitmask_head gr_idx_head; /* nhgrp index head */
struct rwlock ctl_lock; /* overall ctl lock */
struct rib_head *ctl_rh; /* pointer back to rnh */
struct epoch_context ctl_epoch_ctx; /* epoch ctl helper */
@@ -80,7 +88,8 @@
struct epoch_context nh_epoch_ctx; /* epoch data for nhop */
};
-#define NH_IS_PINNED(_nh) ((_nh)->nh_priv->rt_flags & RTF_PINNED)
+#define NH_IS_PINNED(_nh) ((!NH_IS_NHGRP(_nh)) && \
+ ((_nh)->nh_priv->rt_flags & RTF_PINNED))
/* nhop.c */
struct nhop_priv *find_nhop(struct nh_control *ctl,
Index: sys/net/route/route_ctl.h
===================================================================
--- sys/net/route/route_ctl.h
+++ sys/net/route/route_ctl.h
@@ -53,6 +53,10 @@
int rib_action(uint32_t fibnum, int action, struct rt_addrinfo *info,
struct rib_cmd_info *rc);
+typedef void route_notification_t(struct rib_cmd_info *rc, void *);
+void rib_decompose_notification(struct rib_cmd_info *rc,
+ route_notification_t *cb, void *cbdata);
+
int rib_add_redirect(u_int fibnum, struct sockaddr *dst,
struct sockaddr *gateway, struct sockaddr *author, struct ifnet *ifp,
int flags, int expire_sec);
@@ -66,6 +70,20 @@
void rt_foreach_fib_walk(int af, rt_setwarg_t *, rt_walktree_f_t *, void *);
void rt_foreach_fib_walk_del(int af, rt_filter_f_t *filter_f, void *arg);
+struct route_nhop_data;
+const struct rtentry *rib_lookup_prefix(uint32_t fibnum, int family,
+ const struct sockaddr *dst, const struct sockaddr *netmask,
+ struct route_nhop_data *rnd);
+const struct rtentry *rib_lookup_lpm(uint32_t fibnum, int family,
+ const struct sockaddr *dst, struct route_nhop_data *rnd);
+
+/* Multipath */
+struct nhgrp_object;
+struct weightened_nhop;
+
+struct weightened_nhop *nhgrp_get_nhops(struct nhgrp_object *nhg,
+ uint32_t *pnum_nhops);
+
enum rib_subscription_type {
RIB_NOTIFY_IMMEDIATE,
RIB_NOTIFY_DELAYED
Index: sys/net/route/route_ctl.c
===================================================================
--- sys/net/route/route_ctl.c
+++ sys/net/route/route_ctl.c
@@ -29,7 +29,7 @@
__FBSDID("$FreeBSD$");
#include "opt_inet.h"
#include "opt_inet6.h"
-#include "opt_mpath.h"
+#include "opt_route.h"
#include <sys/param.h>
#include <sys/systm.h>
@@ -83,9 +83,6 @@
struct rib_cmd_info *rc);
static int change_route(struct rib_head *rnh, struct rt_addrinfo *info,
struct route_nhop_data *nhd_orig, struct rib_cmd_info *rc);
-static int change_route_nhop(struct rib_head *rnh, struct rtentry *rt,
- struct rt_addrinfo *info, struct route_nhop_data *rnd,
- struct rib_cmd_info *rc);
static int rt_unlinkrte(struct rib_head *rnh, struct rt_addrinfo *info,
struct rib_cmd_info *rc);
@@ -94,6 +91,21 @@
struct rib_cmd_info *rc);
static void destroy_subscription_epoch(epoch_context_t ctx);
+static bool rib_can_multipath(struct rib_head *rh);
+
+/* Per-vnet multipath routing configuration */
+SYSCTL_DECL(_net_route);
+#define V_rib_route_multipath VNET(rib_route_multipath)
+#ifdef ROUTE_MPATH
+VNET_DEFINE(u_int, rib_route_multipath) = 1;
+#define _MP_FLAGS CTLFLAG_RWTUN
+#else
+VNET_DEFINE(u_int, rib_route_multipath) = 0;
+#define _MP_FLAGS CTLFLAG_RD
+#endif
+SYSCTL_UINT(_net_route, OID_AUTO, multipath, _MP_FLAGS | CTLFLAG_VNET,
+ &VNET_NAME(rib_route_multipath), 0, "Enable route multipath");
+#undef _MP_FLAGS
/* Routing table UMA zone */
VNET_DEFINE_STATIC(uma_zone_t, rtzone);
@@ -128,7 +140,7 @@
CURVNET_SET(nhop_get_vnet(rt->rt_nhop));
/* Unreference nexthop */
- nhop_free(rt->rt_nhop);
+ nhop_free_any(rt->rt_nhop);
uma_zfree(V_rtzone, rt);
@@ -175,12 +187,73 @@
return (rnh);
}
+#ifdef ROUTE_MPATH
+/*
+ * Returns true if multipath routing is enabled (the per-vnet
+ * net.route.multipath knob is non-zero) in the vnet @rh belongs to.
+ */
+static bool
+rib_can_multipath(struct rib_head *rh)
+{
+	bool enabled;
+
+	CURVNET_SET(rh->rib_vnet);
+	enabled = (V_rib_route_multipath != 0);
+	CURVNET_RESTORE();
+
+	return (enabled);
+}
+
+/*
+ * Checks whether @nh may take part in a multipath route.
+ *
+ * Objects flagged NHF_MULTIPATH always qualify. A plain nexthop
+ * qualifies only if it has a gateway (NHF_GATEWAY) and is not a
+ * redirect (NHF_REDIRECT).
+ *
+ * Returns true for multipath-eligible nexthop, false otherwise.
+ */
+bool
+nhop_can_multipath(const struct nhop_object *nh)
+{
+	bool has_gw, is_redirect;
+
+	if ((nh->nh_flags & NHF_MULTIPATH) != 0)
+		return (true);
+
+	has_gw = ((nh->nh_flags & NHF_GATEWAY) != 0);
+	is_redirect = ((nh->nh_flags & NHF_REDIRECT) != 0);
+
+	return (has_gw && !is_redirect);
+}
+#endif
+
+/*
+ * Picks the route weight for a request.
+ *
+ * Uses rmx_weight from @info when RTV_WEIGHT is set in rti_mflags,
+ * @default_weight otherwise. The result is clamped to RT_MAX_WEIGHT.
+ *
+ * The value is unsigned throughout, so it is returned as uint32_t
+ * instead of being squeezed through a signed int.
+ */
+static uint32_t
+get_info_weight(const struct rt_addrinfo *info, uint32_t default_weight)
+{
+	uint32_t weight;
+
+	if (info->rti_mflags & RTV_WEIGHT)
+		weight = info->rti_rmx->rmx_weight;
+	else
+		weight = default_weight;
+	/* Keep upper 1 byte for adm distance purposes */
+	if (weight > RT_MAX_WEIGHT)
+		weight = RT_MAX_WEIGHT;
+
+	return (weight);
+}
+
+/*
+ * Applies the expiration time from @info to @rt when RTV_EXPIRE is set
+ * in rti_mflags; does nothing otherwise.
+ * A zero rmx_expire clears the expiration. Any other value is rebased
+ * from the time_second clock to the time_uptime clock.
+ */
+static void
+rt_set_expire_info(struct rtentry *rt, const struct rt_addrinfo *info)
+{
+
+	if ((info->rti_mflags & RTV_EXPIRE) == 0)
+		return;
+
+	if (info->rti_rmx->rmx_expire == 0)
+		rt->rt_expire = 0;
+	else
+		rt->rt_expire = info->rti_rmx->rmx_expire - time_second +
+		    time_uptime;
+}
+
/*
* Check if specified @gw matches gw data in the nexthop @nh.
*
* Returns true if matches, false otherwise.
*/
-static bool
+bool
match_nhop_gw(const struct nhop_object *nh, const struct sockaddr *gw)
{
@@ -423,9 +496,8 @@
* examine the ifa and ifa->ifa_ifp if it so desires.
*/
ifa = info->rti_ifa;
- rt->rt_weight = 1;
-
- rt_setmetrics(info, rt);
+ rt->rt_weight = get_info_weight(info, RT_DEFAULT_WEIGHT);
+ rt_set_expire_info(rt, info);
*prt = rt;
return (0);
@@ -436,7 +508,7 @@
struct rib_cmd_info *rc)
{
struct nhop_object *nh_orig;
- struct route_nhop_data rnd;
+ struct route_nhop_data rnd_orig, rnd_add;
struct nhop_object *nh;
struct rtentry *rt, *rt_orig;
int error;
@@ -445,32 +517,19 @@
if (error != 0)
return (error);
- rnd.rnd_nhop = rt->rt_nhop;
- rnd.rnd_weight = rt->rt_weight;
+ rnd_add.rnd_nhop = rt->rt_nhop;
+ rnd_add.rnd_weight = rt->rt_weight;
nh = rt->rt_nhop;
RIB_WLOCK(rnh);
-#ifdef RADIX_MPATH
- struct sockaddr *netmask;
- netmask = info->rti_info[RTAX_NETMASK];
- /* do not permit exactly the same dst/mask/gw pair */
- if (rt_mpath_capable(rnh) &&
- rt_mpath_conflict(rnh, rt, netmask)) {
- RIB_WUNLOCK(rnh);
-
- nhop_free(nh);
- uma_zfree(V_rtzone, rt);
- return (EEXIST);
- }
-#endif
- error = add_route_nhop(rnh, rt, info, &rnd, rc);
+ error = add_route_nhop(rnh, rt, info, &rnd_add, rc);
if (error == 0) {
RIB_WUNLOCK(rnh);
return (0);
}
/* addition failed. Lookup prefix in the rib to determine the cause */
- rt_orig = lookup_prefix(rnh, info, &rnd);
+ rt_orig = lookup_prefix(rnh, info, &rnd_orig);
if (rt_orig == NULL) {
/* No prefix -> rnh_addaddr() failed to allocate memory */
RIB_WUNLOCK(rnh);
@@ -480,11 +539,11 @@
}
/* We have existing route in the RIB. */
- nh_orig = rnd.rnd_nhop;
+ nh_orig = rnd_orig.rnd_nhop;
/* Check if new route has higher preference */
if (can_override_nhop(info, nh_orig) > 0) {
/* Update nexthop to the new route */
- change_route_nhop(rnh, rt_orig, info, &rnd, rc);
+ change_route_nhop(rnh, rt_orig, info, &rnd_add, rc);
RIB_WUNLOCK(rnh);
uma_zfree(V_rtzone, rt);
nhop_free(nh_orig);
@@ -493,11 +552,26 @@
RIB_WUNLOCK(rnh);
+#ifdef ROUTE_MPATH
+ if (rib_can_multipath(rnh) && nhop_can_multipath(rnd_add.rnd_nhop) &&
+ nhop_can_multipath(rnd_orig.rnd_nhop))
+ error = add_route_mpath(rnh, info, rt, &rnd_add, &rnd_orig, rc);
+ else
+#endif
/* Unable to add - another route with the same preference exists */
error = EEXIST;
+ /*
+ * no multipath: failed to add, free both nhop and rc
+ * multipath: original nhop reference is unused in any case,
+ * rt can be used only if _adding_ new route (e.g. the case
+ * when initial lookup returned existing route, but then it get
+ * deleted prior to multipath group insertion, leading to a simple
+ * non-multipath add as a result).
+ */
nhop_free(nh);
- uma_zfree(V_rtzone, rt);
+ if ((error != 0) || rc->rc_cmd != RTM_ADD)
+ uma_zfree(V_rtzone, rt);
return (error);
}
@@ -549,7 +623,7 @@
* EADDRINUSE - if trying to delete higher priority route.
* ENOENT - if supplied filter function returned 0 (not matched).
*/
-static int
+int
rt_unlinkrte(struct rib_head *rnh, struct rt_addrinfo *info, struct rib_cmd_info *rc)
{
struct rtentry *rt;
@@ -563,7 +637,13 @@
return (ESRCH);
nh = rt->rt_nhop;
-
+#ifdef ROUTE_MPATH
+ if (NH_IS_NHGRP(nh)) {
+ error = del_route_mpath(rnh, info, rt,
+ (struct nhgrp_object *)nh, rc);
+ return (error);
+ }
+#endif
error = check_info_match_nhop(info, rt, nh);
if (error != 0)
return (error);
@@ -575,14 +655,6 @@
* Remove the item from the tree and return it.
* Complain if it is not there and do no more processing.
*/
-#ifdef RADIX_MPATH
- info->rti_info[RTAX_GATEWAY] = &nh->gw_sa;
- if (rt_mpath_capable(rnh)) {
- rn = rt_mpath_unlink(rnh, info, rt, &error);
- if (error != 0)
- return (error);
- } else
-#endif
rn = rnh->rnh_deladdr(info->rti_info[RTAX_DST],
info->rti_info[RTAX_NETMASK], &rnh->head);
if (rn == NULL)
@@ -623,7 +695,18 @@
* If the caller wants it, then it can have it,
* the entry will be deleted after the end of the current epoch.
*/
- rtfree(rc->rc_rt);
+ if (rc->rc_cmd == RTM_DELETE)
+ rtfree(rc->rc_rt);
+#ifdef ROUTE_MPATH
+ else {
+ /*
+ * Deleting 1 path may result in RTM_CHANGE to
+ * a different mpath group/nhop.
+ * Free old mpath group.
+ */
+ nhop_free_any(rc->rc_nh_old);
+ }
+#endif
return (0);
}
@@ -669,19 +752,6 @@
return (ESRCH);
}
-#ifdef RADIX_MPATH
- /*
- * If we got multipath routes,
- * we require users to specify a matching RTAX_GATEWAY.
- */
- if (rt_mpath_capable(rnh)) {
- rt = rt_mpath_matchgate(rt, info->rti_info[RTAX_GATEWAY]);
- if (rt == NULL) {
- RIB_RUNLOCK(rnh);
- return (ESRCH);
- }
- }
-#endif
rnd_orig.rnd_nhop = rt->rt_nhop;
rnd_orig.rnd_weight = rt->rt_weight;
@@ -697,18 +767,11 @@
}
static int
-change_route(struct rib_head *rnh, struct rt_addrinfo *info,
- struct route_nhop_data *rnd_orig, struct rib_cmd_info *rc)
+change_nhop(struct rib_head *rnh, struct rt_addrinfo *info,
+ struct nhop_object *nh_orig, struct nhop_object **nh_new)
{
- int error = 0;
int free_ifa = 0;
- struct nhop_object *nh, *nh_orig;
- struct route_nhop_data rnd_new;
-
- nh = NULL;
- nh_orig = rnd_orig->rnd_nhop;
- if (nh_orig == NULL)
- return (ESRCH);
+ int error;
/*
* New gateway could require new ifaddr, ifp;
@@ -734,20 +797,95 @@
}
}
- error = nhop_create_from_nhop(rnh, nh_orig, info, &nh);
+ error = nhop_create_from_nhop(rnh, nh_orig, info, nh_new);
if (free_ifa) {
ifa_free(info->rti_ifa);
info->rti_ifa = NULL;
}
+
+ return (error);
+}
+
+#ifdef ROUTE_MPATH
+/*
+ * Changes one path of the multipath route described by @rnd_orig.
+ *
+ * The path to change is the first nexthop of the group that matches
+ * @info. A replacement nexthop is created from it, a group with that
+ * nexthop swapped in is obtained, and the route is switched to the new
+ * group via change_route_conditional().
+ *
+ * Returns 0 on success, ESRCH if no nexthop of the group matches @info,
+ * EAGAIN if the temporary array cannot be allocated, or the error from
+ * the nexthop/group creation or the conditional change.
+ */
+static int
+change_mpath_route(struct rib_head *rnh, struct rt_addrinfo *info,
+    struct route_nhop_data *rnd_orig, struct rib_cmd_info *rc)
+{
+	struct nhop_object *nh_orig, *nh_new;
+	struct route_nhop_data rnd_new;
+	struct weightened_nhop *wn, *wn_new;
+	uint32_t num_nhops;
+	int error;
+
+	wn = nhgrp_get_nhops((struct nhgrp_object *)rnd_orig->rnd_nhop,
+	    &num_nhops);
+
+	/* check_info_match_nhop() returns 0 when @info matches the nexthop */
+	nh_orig = NULL;
+	for (uint32_t i = 0; i < num_nhops; i++) {
+		if (check_info_match_nhop(info, NULL, wn[i].nh) == 0) {
+			nh_orig = wn[i].nh;
+			break;
+		}
+	}
+
+	if (nh_orig == NULL)
+		return (ESRCH);
+
+	error = change_nhop(rnh, info, nh_orig, &nh_new);
if (error != 0)
return (error);
- rnd_new.rnd_nhop = nh;
- if (info->rti_mflags & RTV_WEIGHT)
- rnd_new.rnd_weight = info->rti_rmx->rmx_weight;
- else
- rnd_new.rnd_weight = rnd_orig->rnd_weight;
+	wn_new = mallocarray(num_nhops, sizeof(struct weightened_nhop),
+	    M_TEMP, M_NOWAIT | M_ZERO);
+	if (wn_new == NULL) {
+		nhop_free(nh_new);
+		return (EAGAIN);
+	}
+
+	/*
+	 * Build the new nexthop list in the private copy. @wn points into
+	 * the existing group and must stay untouched.
+	 */
+	memcpy(wn_new, wn, num_nhops * sizeof(struct weightened_nhop));
+	for (uint32_t i = 0; i < num_nhops; i++) {
+		if (wn_new[i].nh == nh_orig) {
+			wn_new[i].nh = nh_new;
+			/* Keep the path's own weight unless @info sets one */
+			wn_new[i].weight = get_info_weight(info,
+			    wn_new[i].weight);
+			break;
+		}
+	}
+
+	error = nhgrp_get_group(rnh, wn_new, num_nhops, &rnd_new);
+	/* Drop the reference obtained from change_nhop() in either case */
+	nhop_free(nh_new);
+	free(wn_new, M_TEMP);
+
+	if (error != 0)
+		return (error);
+
+	error = change_route_conditional(rnh, NULL, info, rnd_orig, &rnd_new, rc);
+
+	return (error);
+}
+#endif
+
+static int
+change_route(struct rib_head *rnh, struct rt_addrinfo *info,
+ struct route_nhop_data *rnd_orig, struct rib_cmd_info *rc)
+{
+ int error = 0;
+ struct nhop_object *nh, *nh_orig;
+ struct route_nhop_data rnd_new;
+ nh = NULL;
+ nh_orig = rnd_orig->rnd_nhop;
+ if (nh_orig == NULL)
+ return (ESRCH);
+
+ if (NH_IS_NHGRP(nh_orig))
+ return (change_mpath_route(rnh, info, rnd_orig, rc));
+
+ rnd_new.rnd_weight = get_info_weight(info, rnd_orig->rnd_weight);
+ error = change_nhop(rnh, info, nh_orig, &rnd_new.rnd_nhop);
+ if (error != 0)
+ return (error);
error = change_route_conditional(rnh, NULL, info, rnd_orig, &rnd_new, rc);
return (error);
@@ -802,7 +940,7 @@
* Conditionally set rt_expire if set in @info.
* Returns 0 on success.
*/
-static int
+int
change_route_nhop(struct rib_head *rnh, struct rtentry *rt,
struct rt_addrinfo *info, struct route_nhop_data *rnd,
struct rib_cmd_info *rc)
@@ -815,7 +953,7 @@
if (rnd->rnd_nhop != NULL) {
/* Changing expiration & nexthop & weight to a new one */
- rt_setmetrics(info, rt);
+ rt_set_expire_info(rt, info);
rt->rt_nhop = rnd->rnd_nhop;
rt->rt_weight = rnd->rnd_weight;
if (rt->rt_expire > 0)
@@ -830,6 +968,8 @@
rn = rnh->rnh_deladdr(ndst, netmask, &rnh->head);
if (rn == NULL)
return (ESRCH);
+ rt = RNTORT(rn);
+ rt->rte_flags &= ~RTF_UP;
}
/* Finalize notification */
@@ -975,7 +1115,7 @@
* XXX: Delayed notifications not implemented
* for nexthop updates.
*/
- if (error == 0) {
+ if ((error == 0) && (di->rc.rc_cmd == RTM_DELETE)) {
/* Add to the list and return */
rt->rt_chain = di->head;
di->head = rt;
Index: sys/net/route/route_helpers.c
===================================================================
--- sys/net/route/route_helpers.c
+++ sys/net/route/route_helpers.c
@@ -131,3 +131,167 @@
return (nh);
}
+
+#ifdef ROUTE_MPATH
+/*
+ * Turns an RTM_CHANGE notification @rc, where at least one side is a
+ * nexthop group, into a sequence of single-nexthop RTM_ADD / RTM_DELETE
+ * notifications and calls @cb(notification, @cbdata) for each of them.
+ *
+ * Both nexthop lists are walked in increasing nh_idx order (the arrays
+ * are expected to be sorted by it):
+ *  - nexthop present on both sides: nothing, unless the weight differs,
+ *    in which case a delete + add pair is sent
+ *  - nexthop only in the old list: delete
+ *  - nexthop only in the new list: add
+ * When a removed and an added nexthop sit at the same position, the add
+ * is sent before the delete.
+ *
+ * The caller must make sure that old and new are not both plain nexthops:
+ * a single temporary is used to wrap the non-group side.
+ */
+static void
+decompose_change_notification(struct rib_cmd_info *rc, route_notification_t *cb,
+    void *cbdata)
+{
+	uint32_t num_old, num_new;
+	uint32_t nh_idx_old, nh_idx_new;
+	struct weightened_nhop *wn_old, *wn_new;
+	struct weightened_nhop tmp = { NULL, 0 };
+	uint32_t idx_old = 0, idx_new = 0;
+
+	struct rib_cmd_info rc_del = { .rc_cmd = RTM_DELETE, .rc_rt = rc->rc_rt };
+	struct rib_cmd_info rc_add = { .rc_cmd = RTM_ADD, .rc_rt = rc->rc_rt };
+
+	if (NH_IS_NHGRP(rc->rc_nh_old)) {
+		wn_old = nhgrp_get_nhops((struct nhgrp_object *)rc->rc_nh_old, &num_old);
+	} else {
+		tmp.nh = rc->rc_nh_old;
+		tmp.weight = rc->rc_nh_weight;
+		wn_old = &tmp;
+		num_old = 1;
+	}
+	if (NH_IS_NHGRP(rc->rc_nh_new)) {
+		wn_new = nhgrp_get_nhops((struct nhgrp_object *)rc->rc_nh_new, &num_new);
+	} else {
+		tmp.nh = rc->rc_nh_new;
+		tmp.weight = rc->rc_nh_weight;
+		wn_new = &tmp;
+		num_new = 1;
+	}
+
+	/* Use the fact that each @wn array is sorted */
+	/*
+	 * Want to convert into set of add and delete operations
+	 * (A = add, D = delete, numbers are nexthop indexes):
+	 * [1] -> [1, 2] = A{2}
+	 * [2] -> [1, 2] = A{1}
+	 * [1, 2, 4] -> [1, 3, 4] = A{3}, D{2}
+	 * [1, 2, 4] -> [1, 4] = D{2}
+	 * [1, 2, 4] -> [3, 4] = D{1}, A{3}, D{2}
+	 * [1, 2] -> [3, 4] = D{1}, A{3}, D{2}, A{4}
+	 */
+	while ((idx_old < num_old) && (idx_new < num_new)) {
+		nh_idx_old = wn_old[idx_old].nh->nh_priv->nh_idx;
+		nh_idx_new = wn_new[idx_new].nh->nh_priv->nh_idx;
+
+		if (nh_idx_old == nh_idx_new) {
+			if (wn_old[idx_old].weight != wn_new[idx_new].weight) {
+				/* Update weight by providing del/add notifications */
+				rc_del.rc_nh_old = wn_old[idx_old].nh;
+				rc_del.rc_nh_weight = wn_old[idx_old].weight;
+				cb(&rc_del, cbdata);
+
+				rc_add.rc_nh_new = wn_new[idx_new].nh;
+				rc_add.rc_nh_weight = wn_new[idx_new].weight;
+				cb(&rc_add, cbdata);
+			}
+			idx_old++;
+			idx_new++;
+		} else if (nh_idx_old < nh_idx_new) {
+			/*
+			 * Current old nexthop is not in the new list.
+			 *
+			 * [1, ~2~, 4], [1, ~3~, 4]
+			 * [1, ~2~, 5], [1, ~3~, 4]
+			 * [1, ~2~], [1, ~3~, 4]
+			 */
+			if ((idx_old + 1 >= num_old) ||
+			    (wn_old[idx_old + 1].nh->nh_priv->nh_idx > nh_idx_new)) {
+				/*
+				 * The new nexthop is not in the old list
+				 * either: add it before deleting the old one.
+				 */
+				rc_add.rc_nh_new = wn_new[idx_new].nh;
+				rc_add.rc_nh_weight = wn_new[idx_new].weight;
+				cb(&rc_add, cbdata);
+				idx_new++;
+			}
+			/* In any case, delete current old */
+			rc_del.rc_nh_old = wn_old[idx_old].nh;
+			rc_del.rc_nh_weight = wn_old[idx_old].weight;
+			cb(&rc_del, cbdata);
+			idx_old++;
+		} else {
+			/*
+			 * nh_idx_old > nh_idx_new
+			 * Current new nexthop is not in the old list.
+			 *
+			 * [1, ~3~, 4], [1, ~2~, 4]
+			 * [1, ~3~, 5], [1, ~2~, 4]
+			 * [1, ~3~, 4], [1, ~2~]
+			 * [~2~], [~1~, 2]
+			 */
+			int old_is_gone;
+
+			/*
+			 * The current old nexthop is known to be missing from
+			 * the new list only if there is no next new item or
+			 * the next new item is already past it. Otherwise it
+			 * may still show up later (e.g. [2] -> [1, 2]) and
+			 * must not be reported as deleted.
+			 */
+			old_is_gone = ((idx_new + 1 >= num_new) ||
+			    (wn_new[idx_new + 1].nh->nh_priv->nh_idx > nh_idx_old));
+
+			/* In any case, add current new */
+			rc_add.rc_nh_new = wn_new[idx_new].nh;
+			rc_add.rc_nh_weight = wn_new[idx_new].weight;
+			cb(&rc_add, cbdata);
+			idx_new++;
+
+			if (old_is_gone) {
+				rc_del.rc_nh_old = wn_old[idx_old].nh;
+				rc_del.rc_nh_weight = wn_old[idx_old].weight;
+				cb(&rc_del, cbdata);
+				idx_old++;
+			}
+		}
+	}
+
+	/* Leftover old nexthops have no counterpart: delete them */
+	while (idx_old < num_old) {
+		rc_del.rc_nh_old = wn_old[idx_old].nh;
+		rc_del.rc_nh_weight = wn_old[idx_old].weight;
+		cb(&rc_del, cbdata);
+		idx_old++;
+	}
+
+	/* Leftover new nexthops have no counterpart: add them */
+	while (idx_new < num_new) {
+		rc_add.rc_nh_new = wn_new[idx_new].nh;
+		rc_add.rc_nh_weight = wn_new[idx_new].weight;
+		cb(&rc_add, cbdata);
+		idx_new++;
+	}
+}
+
+/*
+ * Decompose multipath cmd info @rc into a list of single-path
+ * operations, calling @cb callback (with @cbdata) for each operation.
+ *
+ *  RTM_ADD    - one RTM_ADD per nexthop of the new group
+ *  RTM_DELETE - one RTM_DELETE per nexthop of the old group
+ *  RTM_CHANGE - adds/deletes computed by decompose_change_notification()
+ *
+ * Nothing is reported if the relevant nexthop(s) in @rc are not nexthop
+ * groups, or for any other command.
+ */
+void
+rib_decompose_notification(struct rib_cmd_info *rc, route_notification_t *cb,
+    void *cbdata)
+{
+	struct weightened_nhop *wn;
+	uint32_t num_nhops;
+	struct rib_cmd_info rc_new;
+
+	/* Per-nexthop notifications are patched copies of @rc */
+	rc_new = *rc;
+	DPRINTF("cb=%p cmd=%d nh_old=%p nh_new=%p",
+	    cb, rc->rc_cmd, rc->rc_nh_old, rc->rc_nh_new);
+	switch (rc->rc_cmd) {
+	case RTM_ADD:
+		if (!NH_IS_NHGRP(rc->rc_nh_new))
+			return;
+		wn = nhgrp_get_nhops((struct nhgrp_object *)rc->rc_nh_new, &num_nhops);
+		for (uint32_t i = 0; i < num_nhops; i++) {
+			rc_new.rc_nh_new = wn[i].nh;
+			rc_new.rc_nh_weight = wn[i].weight;
+			cb(&rc_new, cbdata);
+		}
+		break;
+	case RTM_DELETE:
+		if (!NH_IS_NHGRP(rc->rc_nh_old))
+			return;
+		wn = nhgrp_get_nhops((struct nhgrp_object *)rc->rc_nh_old, &num_nhops);
+		for (uint32_t i = 0; i < num_nhops; i++) {
+			rc_new.rc_nh_old = wn[i].nh;
+			rc_new.rc_nh_weight = wn[i].weight;
+			cb(&rc_new, cbdata);
+		}
+		break;
+	case RTM_CHANGE:
+		/* Needs at least one group on either side */
+		if (!NH_IS_NHGRP(rc->rc_nh_old) && !NH_IS_NHGRP(rc->rc_nh_new))
+			return;
+		decompose_change_notification(rc, cb, cbdata);
+		break;
+	}
+}
+#endif
Index: sys/net/route/route_var.h
===================================================================
--- sys/net/route/route_var.h
+++ sys/net/route/route_var.h
@@ -87,6 +87,7 @@
/* Constants */
#define RIB_MAX_RETRIES 3
#define RT_MAXFIBS UINT16_MAX
+#define RIB_MAX_MPATH_WIDTH 64
/* Macro for verifying fields in af-specific 'struct route' structures */
#define CHK_STRUCT_FIELD_GENERIC(_s1, _f1, _s2, _f2) \
@@ -115,11 +116,6 @@
struct rib_head *rt_tables_get_rnh(uint32_t table, sa_family_t family);
void rt_mpath_init_rnh(struct rib_head *rnh);
int rt_getifa_fib(struct rt_addrinfo *info, u_int fibnum);
-void rt_setmetrics(const struct rt_addrinfo *info, struct rtentry *rt);
-#ifdef RADIX_MPATH
-struct radix_node *rt_mpath_unlink(struct rib_head *rnh,
- struct rt_addrinfo *info, struct rtentry *rto, int *perror);
-#endif
struct rib_cmd_info;
VNET_PCPUSTAT_DECLARE(struct rtstat, rtstat);
@@ -203,14 +199,6 @@
/* rtentry rt flag mask */
#define RTE_RT_FLAG_MASK (RTF_UP | RTF_HOST)
-/* Nexthop selection */
-#define _NH2MP(_nh) ((struct nhgrp_object *)(_nh))
-#define _SELECT_NHOP(_nh, _flowid) \
- (_NH2MP(_nh))->nhops[(_flowid) % (_NH2MP(_nh))->mp_size]
-#define _RT_SELECT_NHOP(_nh, _flowid) \
- ((!NH_IS_MULTIPATH(_nh)) ? (_nh) : _SELECT_NHOP(_nh, _flowid))
-#define RT_SELECT_NHOP(_rt, _flowid) _RT_SELECT_NHOP((_rt)->rt_nhop, _flowid)
-
/* route_temporal.c */
void tmproutes_update(struct rib_head *rnh, struct rtentry *rt);
void tmproutes_init(struct rib_head *rh);
@@ -218,14 +206,26 @@
/* route_ctl.c */
struct route_nhop_data {
- struct nhop_object *rnd_nhop;
- uint32_t rnd_weight;
+ union {
+ struct nhop_object *rnd_nhop;
+ struct nhgrp_object *rnd_nhgrp;
+ };
+ uint32_t rnd_weight;
};
+
+int change_route_nhop(struct rib_head *rnh, struct rtentry *rt,
+ struct rt_addrinfo *info, struct route_nhop_data *rnd,
+ struct rib_cmd_info *rc);
int change_route_conditional(struct rib_head *rnh, struct rtentry *rt,
struct rt_addrinfo *info, struct route_nhop_data *nhd_orig,
struct route_nhop_data *nhd_new, struct rib_cmd_info *rc);
struct rtentry *lookup_prefix(struct rib_head *rnh,
const struct rt_addrinfo *info, struct route_nhop_data *rnd);
+struct rtentry *lookup_lpm(struct rib_head *rnh, const struct sockaddr *dst,
+ struct route_nhop_data *rnd);
+
+bool nhop_can_multipath(const struct nhop_object *nh);
+bool match_nhop_gw(const struct nhop_object *nh, const struct sockaddr *gw);
int check_info_match_nhop(const struct rt_addrinfo *info,
const struct rtentry *rt, const struct nhop_object *nh);
int can_override_nhop(const struct rt_addrinfo *info,
@@ -244,7 +244,6 @@
void nhops_destroy_rib(struct rib_head *rh);
void nhop_ref_object(struct nhop_object *nh);
int nhop_try_ref_object(struct nhop_object *nh);
-int nhop_ref_any(struct nhop_object *nh);
void nhop_free_any(struct nhop_object *nh);
void nhop_set_type(struct nhop_object *nh, enum nhop_type nh_type);
@@ -258,5 +257,61 @@
void nhops_update_ifmtu(struct rib_head *rh, struct ifnet *ifp, uint32_t mtu);
int nhops_dump_sysctl(struct rib_head *rh, struct sysctl_req *w);
+/* MULTIPATH */
+#define MPF_MULTIPATH 0x08 /* need to be consistent with NHF_MULTIPATH */
+
+/*
+ * Nexthop group as seen by the datapath: for a group, nhop_select()
+ * returns nhops[flowid % mp_size].
+ */
+struct nhgrp_object {
+ uint16_t mp_flags; /* multipath flags */
+ uint8_t mp_size; /* size of datapath mpath group */
+ uint8_t spare;
+ struct nhop_object *nhops[0]; /* nhops */
+};
+
+/*
+ * Returns the nexthop to use for @flowid.
+ *
+ * With ROUTE_MPATH, a nexthop group is resolved to the member at index
+ * (@flowid modulo the group's mp_size); any other nexthop, and every
+ * nexthop when ROUTE_MPATH is not compiled in, is returned unchanged.
+ */
+static inline struct nhop_object *
+nhop_select(struct nhop_object *nh, uint32_t flowid)
+{
+#ifdef ROUTE_MPATH
+	struct nhgrp_object *grp;
+
+	if (!NH_IS_NHGRP(nh))
+		return (nh);
+	grp = (struct nhgrp_object *)nh;
+	return (grp->nhops[flowid % grp->mp_size]);
+#else
+	return (nh);
+#endif
+}
+
+
+struct weightened_nhop;
+
+/* mpath_ctl.c */
+int add_route_mpath(struct rib_head *rnh, struct rt_addrinfo *info,
+ struct rtentry *rt, struct route_nhop_data *rnd_add,
+ struct route_nhop_data *rnd_orig, struct rib_cmd_info *rc);
+int del_route_mpath(struct rib_head *rh, struct rt_addrinfo *info,
+ struct rtentry *rt, struct nhgrp_object *nhg, struct rib_cmd_info *rc);
+
+/* nhgrp.c */
+int nhgrp_ctl_init(struct nh_control *ctl);
+void nhgrp_ctl_free(struct nh_control *ctl);
+void nhgrp_ctl_unlink_all(struct nh_control *ctl);
+
+
+/* nhgrp_ctl.c */
+int nhgrp_dump_sysctl(struct rib_head *rh, struct sysctl_req *w);
+
+int nhgrp_get_group(struct rib_head *rh, struct weightened_nhop *wn,
+ int num_nhops, struct route_nhop_data *rnd);
+typedef bool nhgrp_filter_cb_t(const struct nhop_object *nh, void *data);
+int nhgrp_get_filtered_group(struct rib_head *rh, const struct nhgrp_object *src,
+ nhgrp_filter_cb_t flt_func, void *flt_data, struct route_nhop_data *rnd);
+int nhgrp_get_addition_group(struct rib_head *rnh,
+ struct route_nhop_data *rnd_orig, struct route_nhop_data *rnd_add,
+ struct route_nhop_data *rnd_new);
+
+void nhgrp_free(struct nhgrp_object *nhg);
+
+/* Entropy data used for outbound hashing */
+#define MPATH_ENTROPY_KEY_LEN 40
+extern uint8_t mpath_entropy_key[MPATH_ENTROPY_KEY_LEN];
#endif
Index: sys/net/rtsock.c
===================================================================
--- sys/net/rtsock.c
+++ sys/net/rtsock.c
@@ -32,7 +32,7 @@
* $FreeBSD$
*/
#include "opt_ddb.h"
-#include "opt_mpath.h"
+#include "opt_route.h"
#include "opt_inet.h"
#include "opt_inet6.h"
@@ -158,8 +158,7 @@
#define RTSOCK_UNLOCK() mtx_unlock(&rtsock_mtx)
#define RTSOCK_LOCK_ASSERT() mtx_assert(&rtsock_mtx, MA_OWNED)
-static SYSCTL_NODE(_net, OID_AUTO, route, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
- "");
+SYSCTL_NODE(_net, OID_AUTO, route, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
struct walkarg {
int w_tmemsize;
@@ -175,6 +174,8 @@
static int rt_xaddrs(caddr_t cp, caddr_t cplim,
struct rt_addrinfo *rtinfo);
static int sysctl_dumpentry(struct radix_node *rn, void *vw);
+static int sysctl_dumpnhop(struct rtentry *rt, struct nhop_object *nh,
+ uint32_t weight, struct walkarg *w);
static int sysctl_iflist(int af, struct walkarg *w);
static int sysctl_ifmalist(int af, struct walkarg *w);
static int route_output(struct mbuf *m, struct socket *so, ...);
@@ -648,6 +649,25 @@
return (0);
}
+/*
+ * Picks the single nexthop of @nh to report for a routing socket request.
+ *
+ * If @nh is not a nexthop group it is returned as is.  For a group, the
+ * first member is returned when @gw is NULL, otherwise the first member
+ * for which match_nhop_gw() succeeds on @gw.
+ *
+ * Returns NULL if no group member matches @gw, or if @nh is a group and
+ * the kernel is built without ROUTE_MPATH.
+ */
+static struct nhop_object *
+select_nhop(struct nhop_object *nh, const struct sockaddr *gw)
+{
+	if (!NH_IS_NHGRP(nh))
+		return (nh);
+#ifdef ROUTE_MPATH
+	struct weightened_nhop *wn;
+	uint32_t num_nhops;
+
+	wn = nhgrp_get_nhops((struct nhgrp_object *)nh, &num_nhops);
+	if (gw == NULL)
+		return (wn[0].nh);
+	/* Index type matches num_nhops to avoid a signed/unsigned compare. */
+	for (uint32_t i = 0; i < num_nhops; i++) {
+		if (match_nhop_gw(wn[i].nh, gw))
+			return (wn[i].nh);
+	}
+#endif
+	return (NULL);
+}
+
+
/*
* Handles RTM_GET message from routing socket, returning matching rt.
*
@@ -661,6 +681,7 @@
{
RIB_RLOCK_TRACKER;
struct rib_head *rnh;
+ struct nhop_object *nh;
sa_family_t saf;
saf = info->rti_info[RTAX_DST]->sa_family;
@@ -688,21 +709,12 @@
RIB_RUNLOCK(rnh);
return (ESRCH);
}
-#ifdef RADIX_MPATH
- /*
- * for RTM_GET, gate is optional even with multipath.
- * if gate == NULL the first match is returned.
- * (no need to call rt_mpath_matchgate if gate == NULL)
- */
- if (rt_mpath_capable(rnh) && info->rti_info[RTAX_GATEWAY]) {
- rc->rc_rt = rt_mpath_matchgate(rc->rc_rt,
- info->rti_info[RTAX_GATEWAY]);
- if (rc->rc_rt == NULL) {
- RIB_RUNLOCK(rnh);
- return (ESRCH);
- }
+
+ nh = select_nhop(rc->rc_rt->rt_nhop, info->rti_info[RTAX_GATEWAY]);
+ if (nh == NULL) {
+ RIB_RUNLOCK(rnh);
+ return (ESRCH);
}
-#endif
/*
* If performing proxied L2 entry insertion, and
* the actual PPP host entry is found, perform
@@ -738,8 +750,14 @@
RIB_RUNLOCK(rnh);
return (ESRCH);
}
+ nh = select_nhop(rc->rc_rt->rt_nhop, info->rti_info[RTAX_GATEWAY]);
+ if (nh == NULL) {
+ RIB_RUNLOCK(rnh);
+ return (ESRCH);
+ }
}
- rc->rc_nh_new = rc->rc_rt->rt_nhop;
+ rc->rc_nh_new = nh;
+ rc->rc_nh_weight = rc->rc_rt->rt_weight;
RIB_RUNLOCK(rnh);
return (0);
@@ -829,6 +847,24 @@
return (0);
}
+/*
+ * Notification callback: copies @rc into the rib_cmd_info pointed to by
+ * @_cbdata when @rc is an RTM_DELETE; other commands are ignored.
+ */
+static void
+save_del_notification(struct rib_cmd_info *rc, void *_cbdata)
+{
+	struct rib_cmd_info *saved = (struct rib_cmd_info *)_cbdata;
+
+	if (rc->rc_cmd != RTM_DELETE)
+		return;
+	*saved = *rc;
+}
+
+/*
+ * Notification callback: copies @rc into the rib_cmd_info pointed to by
+ * @_cbdata when @rc is an RTM_ADD; other commands are ignored.
+ */
+static void
+save_add_notification(struct rib_cmd_info *rc, void *_cbdata)
+{
+	struct rib_cmd_info *saved = (struct rib_cmd_info *)_cbdata;
+
+	if (rc->rc_cmd != RTM_ADD)
+		return;
+	*saved = *rc;
+}
+
/*ARGSUSED*/
static int
route_output(struct mbuf *m, struct socket *so, ...)
@@ -915,6 +951,15 @@
if (error == 0) {
#ifdef INET6
rti_need_deembed = 1;
+#endif
+#ifdef ROUTE_MPATH
+ if (NH_IS_NHGRP(rc.rc_nh_new) ||
+ (rc.rc_nh_old && NH_IS_NHGRP(rc.rc_nh_old))) {
+ struct rib_cmd_info rc_simple = {};
+ rib_decompose_notification(&rc,
+ save_add_notification, (void *)&rc_simple);
+ rc = rc_simple;
+ }
#endif
nh = rc.rc_nh_new;
rtm->rtm_index = nh->nh_ifp->if_index;
@@ -924,6 +969,15 @@
case RTM_DELETE:
error = rib_action(fibnum, RTM_DELETE, &info, &rc);
if (error == 0) {
+#ifdef ROUTE_MPATH
+ if (NH_IS_NHGRP(rc.rc_nh_old) ||
+ (rc.rc_nh_new && NH_IS_NHGRP(rc.rc_nh_new))) {
+ struct rib_cmd_info rc_simple = {};
+ rib_decompose_notification(&rc,
+ save_del_notification, (void *)&rc_simple);
+ rc = rc_simple;
+ }
+#endif
nh = rc.rc_nh_old;
goto report;
}
@@ -1696,9 +1750,7 @@
struct walkarg *w = vw;
struct rtentry *rt = (struct rtentry *)rn;
struct nhop_object *nh;
- int error = 0, size;
- struct rt_addrinfo info;
- struct sockaddr_storage ss;
+ int error = 0;
NET_EPOCH_ASSERT();
@@ -1707,6 +1759,32 @@
if (!can_export_rte(w->w_req->td->td_ucred, rt))
return (0);
nh = rt->rt_nhop;
+#ifdef ROUTE_MPATH
+	if (NH_IS_NHGRP(nh)) {
+		struct weightened_nhop *wn;
+		uint32_t num_nhops;
+
+		/* Dump each member of the group with its own weight. */
+		wn = nhgrp_get_nhops((struct nhgrp_object *)nh, &num_nhops);
+		for (int i = 0; i < num_nhops; i++) {
+			error = sysctl_dumpnhop(rt, wn[i].nh, wn[i].weight, w);
+			if (error != 0)
+				return (error);
+		}
+	} else
+#endif
+		error = sysctl_dumpnhop(rt, nh, rt->rt_weight, w);
+
+	/* Propagate a single-path dump failure instead of dropping it. */
+	return (error);
+}
+
+
+static int
+sysctl_dumpnhop(struct rtentry *rt, struct nhop_object *nh, uint32_t weight,
+ struct walkarg *w)
+{
+ struct rt_addrinfo info;
+ int error = 0, size;
+ struct sockaddr_storage ss;
+
bzero((caddr_t)&info, sizeof(info));
info.rti_info[RTAX_DST] = rt_key(rt);
info.rti_info[RTAX_GATEWAY] = &nh->gw_sa;
@@ -1733,6 +1811,7 @@
rtm->rtm_flags = rt->rte_flags;
rtm->rtm_flags |= nhop_get_rtflags(nh);
rt_getmetrics(rt, nh, &rtm->rtm_rmx);
+ rtm->rtm_rmx.rmx_weight = weight;
rtm->rtm_index = nh->nh_ifp->if_index;
rtm->rtm_addrs = info.rti_addrs;
error = SYSCTL_OUT(w->w_req, (caddr_t)rtm, size);
@@ -2013,7 +2092,7 @@
namelen--;
if (req->newptr)
return (EPERM);
- if (name[1] == NET_RT_DUMP || name[1] == NET_RT_NHOP) {
+ if (name[1] == NET_RT_DUMP || name[1] == NET_RT_NHOP || name[1] == NET_RT_NHGRP) {
if (namelen == 3)
fib = req->td->td_proc->p_fibnum;
else if (namelen == 4)
@@ -2081,6 +2160,7 @@
}
break;
case NET_RT_NHOP:
+ case NET_RT_NHGRP:
/* Allow dumping one specific af/fib at a time */
if (namelen < 4) {
error = EINVAL;
@@ -2098,6 +2178,12 @@
}
if (w.w_op == NET_RT_NHOP)
error = nhops_dump_sysctl(rnh, w.w_req);
+ else
+#ifdef ROUTE_MPATH
+ error = nhgrp_dump_sysctl(rnh, w.w_req);
+#else
+ error = ENOTSUP;
+#endif
break;
case NET_RT_IFLIST:
case NET_RT_IFLISTL:
Index: sys/netinet/in.c
===================================================================
--- sys/netinet/in.c
+++ sys/netinet/in.c
@@ -35,8 +35,6 @@
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
-#include "opt_mpath.h"
-
#include <sys/param.h>
#include <sys/eventhandler.h>
#include <sys/systm.h>
@@ -699,14 +697,6 @@
* interface address, we are done here.
*/
if (ia->ia_flags & IFA_ROUTE) {
-#ifdef RADIX_MPATH
- if (ia->ia_addr.sin_addr.s_addr ==
- target->ia_addr.sin_addr.s_addr) {
- IN_IFADDR_RUNLOCK(&in_ifa_tracker);
- return (EEXIST);
- } else
- break;
-#endif
if (V_nosameprefix) {
IN_IFADDR_RUNLOCK(&in_ifa_tracker);
return (EEXIST);
Index: sys/netinet/in_fib.h
===================================================================
--- sys/netinet/in_fib.h
+++ sys/netinet/in_fib.h
@@ -51,4 +51,7 @@
uint32_t flags, const struct ifnet *src_if);
struct nhop_object *fib4_lookup_debugnet(uint32_t fibnum, struct in_addr dst,
uint32_t scopeid, uint32_t flags);
+uint32_t fib4_calc_software_hash(struct in_addr src, struct in_addr dst,
+ unsigned short src_port, unsigned short dst_port, char proto,
+ uint32_t *phashtype);
#endif
Index: sys/netinet/in_fib.c
===================================================================
--- sys/netinet/in_fib.c
+++ sys/netinet/in_fib.c
@@ -32,7 +32,6 @@
#include "opt_inet.h"
#include "opt_route.h"
-#include "opt_mpath.h"
#include <sys/param.h>
#include <sys/systm.h>
@@ -48,14 +47,12 @@
#include <net/if_var.h>
#include <net/if_dl.h>
#include <net/route.h>
+#include <net/route/route_ctl.h>
#include <net/route/route_var.h>
#include <net/route/nhop.h>
+#include <net/toeplitz.h>
#include <net/vnet.h>
-#ifdef RADIX_MPATH
-#include <net/radix_mpath.h>
-#endif
-
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/in_fib.h>
@@ -66,6 +63,40 @@
/* Assert 'struct route_in' is compatible with 'struct route' */
CHK_STRUCT_ROUTE_COMPAT(struct route_in, ro_dst4);
+#ifdef ROUTE_MPATH
+/* Input of the IPv4 software flow hash; 'spare' keeps the layout explicit. */
+struct _hash_5tuple_ipv4 {
+	struct in_addr src;
+	struct in_addr dst;
+	unsigned short src_port;
+	unsigned short dst_port;
+	char proto;
+	char spare[3];
+};
+_Static_assert(sizeof(struct _hash_5tuple_ipv4) == 16,
+    "_hash_5tuple_ipv4 size is wrong");
+
+/*
+ * Calculates a Toeplitz hash over the IPv4 5-tuple, keyed with
+ * mpath_entropy_key.
+ *
+ * The hash type to attach to the flow is stored in @phashtype.  It is
+ * M_HASHTYPE_OPAQUE_HASH, the same type fib6_calc_software_hash()
+ * reports for the equivalent IPv6 computation.
+ *
+ * Returns the computed hash value.
+ */
+uint32_t
+fib4_calc_software_hash(struct in_addr src, struct in_addr dst,
+    unsigned short src_port, unsigned short dst_port, char proto,
+    uint32_t *phashtype)
+{
+	struct _hash_5tuple_ipv4 data;
+
+	data.src = src;
+	data.dst = dst;
+	data.src_port = src_port;
+	data.dst_port = dst_port;
+	data.proto = proto;
+	/* Zero the tail so the hash does not depend on stack garbage. */
+	data.spare[0] = data.spare[1] = data.spare[2] = 0;
+
+	*phashtype = M_HASHTYPE_OPAQUE_HASH;
+
+	return (toeplitz_hash(MPATH_ENTROPY_KEY_LEN, mpath_entropy_key,
+	    sizeof(data), (uint8_t *)&data));
+}
+#endif
+
/*
* Looks up path in fib @fibnum specified by @dst.
* Returns path nexthop on success. Nexthop is safe to use
@@ -80,7 +111,6 @@
RIB_RLOCK_TRACKER;
struct rib_head *rh;
struct radix_node *rn;
- struct rtentry *rt;
struct nhop_object *nh;
KASSERT((fibnum < rt_numfibs), ("fib4_lookup: bad fibnum"));
@@ -99,12 +129,7 @@
RIB_RLOCK(rh);
rn = rh->rnh_matchaddr((void *)&sin4, &rh->head);
if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) {
- rt = RNTORT(rn);
-#ifdef RADIX_MPATH
- if (rt_mpath_next(rt) != NULL)
- rt = rt_mpath_selectrte(rt, flowid);
-#endif
- nh = rt->rt_nhop;
+ nh = nhop_select((RNTORT(rn))->rt_nhop, flowid);
/* Ensure route & ifp is UP */
if (RT_LINK_IS_UP(nh->nh_ifp)) {
if (flags & NHR_REF)
@@ -120,7 +145,7 @@
}
inline static int
-check_urpf(const struct nhop_object *nh, uint32_t flags,
+check_urpf_nhop(const struct nhop_object *nh, uint32_t flags,
const struct ifnet *src_if)
{
@@ -137,21 +162,24 @@
return (0);
}
-#ifdef RADIX_MPATH
-inline static int
-check_urpf_mpath(struct rtentry *rt, uint32_t flags,
+static int
+check_urpf(struct nhop_object *nh, uint32_t flags,
const struct ifnet *src_if)
{
-
- while (rt != NULL) {
- if (check_urpf(rt->rt_nhop, flags, src_if) != 0)
- return (1);
- rt = rt_mpath_next(rt);
- }
-
- return (0);
-}
+#ifdef ROUTE_MPATH
+ if (NH_IS_NHGRP(nh)) {
+ struct weightened_nhop *wn;
+ uint32_t num_nhops;
+ wn = nhgrp_get_nhops((struct nhgrp_object *)nh, &num_nhops);
+ for (int i = 0; i < num_nhops; i++) {
+ if (check_urpf_nhop(wn[i].nh, flags, src_if) != 0)
+ return (1);
+ }
+ return (0);
+ } else
#endif
+ return (check_urpf_nhop(nh, flags, src_if));
+}
/*
* Performs reverse path forwarding lookup.
@@ -169,7 +197,6 @@
RIB_RLOCK_TRACKER;
struct rib_head *rh;
struct radix_node *rn;
- struct rtentry *rt;
int ret;
KASSERT((fibnum < rt_numfibs), ("fib4_check_urpf: bad fibnum"));
@@ -186,12 +213,7 @@
RIB_RLOCK(rh);
rn = rh->rnh_matchaddr((void *)&sin4, &rh->head);
if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) {
- rt = RNTORT(rn);
-#ifdef RADIX_MPATH
- ret = check_urpf_mpath(rt, flags, src_if);
-#else
- ret = check_urpf(rt->rt_nhop, flags, src_if);
-#endif
+ ret = check_urpf(RNTORT(rn)->rt_nhop, flags, src_if);
RIB_RUNLOCK(rh);
return (ret);
}
@@ -206,7 +228,6 @@
{
struct rib_head *rh;
struct radix_node *rn;
- struct rtentry *rt;
struct nhop_object *nh;
KASSERT((fibnum < rt_numfibs), ("fib4_lookup_debugnet: bad fibnum"));
@@ -225,12 +246,7 @@
/* unlocked lookup */
rn = rh->rnh_matchaddr((void *)&sin4, &rh->head);
if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) {
- rt = RNTORT(rn);
-#ifdef RADIX_MPATH
- if (rt_mpath_next(rt) != NULL)
- rt = rt_mpath_selectrte(rt, 0);
-#endif
- nh = rt->rt_nhop;
+ nh = nhop_select((RNTORT(rn))->rt_nhop, 0);
/* Ensure route & ifp is UP */
if (RT_LINK_IS_UP(nh->nh_ifp)) {
if (flags & NHR_REF)
Index: sys/netinet/in_pcb.c
===================================================================
--- sys/netinet/in_pcb.c
+++ sys/netinet/in_pcb.c
@@ -46,6 +46,7 @@
#include "opt_inet6.h"
#include "opt_ratelimit.h"
#include "opt_pcbgroup.h"
+#include "opt_route.h"
#include "opt_rss.h"
#include <sys/param.h>
@@ -1327,7 +1328,17 @@
lport = *lportp;
faddr = sin->sin_addr;
fport = sin->sin_port;
+#ifdef ROUTE_MPATH
+ if (V_fib_hash_outbound) {
+ uint32_t hash_val, hash_type;
+ hash_val = fib4_calc_software_hash(laddr, faddr, 0, fport,
+ inp->inp_socket->so_proto->pr_protocol, &hash_type);
+
+ inp->inp_flowid = hash_val;
+ inp->inp_flowtype = hash_type;
+ }
+#endif
if (!CK_STAILQ_EMPTY(&V_in_ifaddrhead)) {
/*
* If the destination address is INADDR_ANY,
Index: sys/netinet/in_proto.c
===================================================================
--- sys/netinet/in_proto.c
+++ sys/netinet/in_proto.c
@@ -39,7 +39,6 @@
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_sctp.h"
-#include "opt_mpath.h"
#include <sys/param.h>
#include <sys/systm.h>
Index: sys/netinet/in_rmx.c
===================================================================
--- sys/netinet/in_rmx.c
+++ sys/netinet/in_rmx.c
@@ -30,8 +30,6 @@
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
-#include "opt_mpath.h"
-
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
@@ -127,9 +125,6 @@
return (NULL);
rh->rnh_preadd = rib4_preadd;
-#ifdef RADIX_MPATH
- rt_mpath_init_rnh(rh);
-#endif
return (rh);
}
Index: sys/netinet/ip_output.c
===================================================================
--- sys/netinet/ip_output.c
+++ sys/netinet/ip_output.c
@@ -38,7 +38,6 @@
#include "opt_ipsec.h"
#include "opt_kern_tls.h"
#include "opt_mbuf_stress_test.h"
-#include "opt_mpath.h"
#include "opt_ratelimit.h"
#include "opt_route.h"
#include "opt_rss.h"
@@ -470,11 +469,7 @@
* for correct operation (as it is for ARP).
*/
uint32_t flowid;
-#ifdef RADIX_MPATH
- flowid = ntohl(ip->ip_src.s_addr ^ ip->ip_dst.s_addr);
-#else
flowid = m->m_pkthdr.flowid;
-#endif
ro->ro_nh = fib4_lookup(fibnum, dst->sin_addr, 0,
NHR_REF, flowid);
@@ -513,7 +509,8 @@
} else {
struct nhop_object *nh;
- nh = fib4_lookup(M_GETFIB(m), ip->ip_dst, 0, NHR_NONE, 0);
+ nh = fib4_lookup(M_GETFIB(m), ip->ip_dst, 0, NHR_NONE,
+ m->m_pkthdr.flowid);
if (nh == NULL) {
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
/*
Index: sys/netinet/raw_ip.c
===================================================================
--- sys/netinet/raw_ip.c
+++ sys/netinet/raw_ip.c
@@ -38,6 +38,7 @@
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ipsec.h"
+#include "opt_route.h"
#include <sys/param.h>
#include <sys/jail.h>
@@ -67,6 +68,7 @@
#include <netinet/in.h>
#include <netinet/in_systm.h>
+#include <netinet/in_fib.h>
#include <netinet/in_pcb.h>
#include <netinet/in_var.h>
#include <netinet/if_ether.h>
@@ -484,6 +486,17 @@
ip->ip_len = htons(m->m_pkthdr.len);
ip->ip_src = inp->inp_laddr;
ip->ip_dst.s_addr = dst;
+#ifdef ROUTE_MPATH
+ if (V_fib_hash_outbound) {
+ uint32_t hash_type, hash_val;
+
+ hash_val = fib4_calc_software_hash(ip->ip_src,
+ ip->ip_dst, 0, 0, ip->ip_p, &hash_type);
+ m->m_pkthdr.flowid = hash_val;
+ M_HASHTYPE_SET(m, hash_type);
+ flags |= IP_NODEFAULTFLOWID;
+ }
+#endif
if (jailed(inp->inp_cred)) {
/*
* prison_local_ip4() would be good enough but would
@@ -519,7 +532,17 @@
return (EINVAL);
ip = mtod(m, struct ip *);
}
-
+#ifdef ROUTE_MPATH
+		if (V_fib_hash_outbound) {
+			uint32_t hash_type, hash_val;
+
+			/*
+			 * Hash (src, dst) in the order the function expects,
+			 * matching the other call in this file.
+			 */
+			hash_val = fib4_calc_software_hash(ip->ip_src,
+			    ip->ip_dst, 0, 0, ip->ip_p, &hash_type);
+			m->m_pkthdr.flowid = hash_val;
+			M_HASHTYPE_SET(m, hash_type);
+			flags |= IP_NODEFAULTFLOWID;
+		}
+#endif
INP_RLOCK(inp);
/*
* Don't allow both user specified and setsockopt options,
Index: sys/netinet6/in6_fib.h
===================================================================
--- sys/netinet6/in6_fib.h
+++ sys/netinet6/in6_fib.h
@@ -39,4 +39,7 @@
uint32_t scopeid, uint32_t flags, const struct ifnet *src_if);
struct nhop_object *fib6_lookup_debugnet(uint32_t fibnum,
const struct in6_addr *dst6, uint32_t scopeid, uint32_t flags);
+uint32_t fib6_calc_software_hash(const struct in6_addr *src,
+ const struct in6_addr *dst, unsigned short src_port, unsigned short dst_port,
+ char proto, uint32_t *phashtype);
#endif
Index: sys/netinet6/in6_fib.c
===================================================================
--- sys/netinet6/in6_fib.c
+++ sys/netinet6/in6_fib.c
@@ -33,7 +33,6 @@
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_route.h"
-#include "opt_mpath.h"
#include <sys/param.h>
#include <sys/systm.h>
@@ -49,14 +48,12 @@
#include <net/if_var.h>
#include <net/if_dl.h>
#include <net/route.h>
+#include <net/route/route_ctl.h>
#include <net/route/route_var.h>
#include <net/route/nhop.h>
+#include <net/toeplitz.h>
#include <net/vnet.h>
-#ifdef RADIX_MPATH
-#include <net/radix_mpath.h>
-#endif
-
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/ip_mroute.h>
@@ -72,6 +69,39 @@
CHK_STRUCT_ROUTE_COMPAT(struct route_in6, ro_dst);
+#ifdef ROUTE_MPATH
+/* Input of the IPv6 software flow hash; 'spare' keeps the layout explicit. */
+struct _hash_5tuple_ipv6 {
+	struct in6_addr src;
+	struct in6_addr dst;
+	unsigned short src_port;
+	unsigned short dst_port;
+	char proto;
+	char spare[3];
+};
+_Static_assert(sizeof(struct _hash_5tuple_ipv6) == 40,
+    "_hash_5tuple_ipv6 size is wrong");
+
+/*
+ * Calculates a Toeplitz hash over the IPv6 5-tuple, keyed with
+ * mpath_entropy_key.
+ *
+ * Stores M_HASHTYPE_OPAQUE_HASH in @phashtype and returns the hash.
+ */
+uint32_t
+fib6_calc_software_hash(const struct in6_addr *src, const struct in6_addr *dst,
+    unsigned short src_port, unsigned short dst_port, char proto,
+    uint32_t *phashtype)
+{
+	/* Members not named here (spare[]) are zero-initialized. */
+	struct _hash_5tuple_ipv6 tuple = {
+		.src = *src,
+		.dst = *dst,
+		.src_port = src_port,
+		.dst_port = dst_port,
+		.proto = proto,
+	};
+	uint32_t hash;
+
+	*phashtype = M_HASHTYPE_OPAQUE_HASH;
+
+	hash = toeplitz_hash(MPATH_ENTROPY_KEY_LEN, mpath_entropy_key,
+	    sizeof(tuple), (uint8_t *)&tuple);
+	return (hash);
+}
+#endif
+
/*
* Looks up path in fib @fibnum specified by @dst.
* Assumes scope is deembedded and provided in @scopeid.
@@ -88,7 +118,6 @@
RIB_RLOCK_TRACKER;
struct rib_head *rh;
struct radix_node *rn;
- struct rtentry *rt;
struct nhop_object *nh;
struct sockaddr_in6 sin6;
@@ -111,12 +140,7 @@
RIB_RLOCK(rh);
rn = rh->rnh_matchaddr((void *)&sin6, &rh->head);
if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) {
- rt = RNTORT(rn);
-#ifdef RADIX_MPATH
- if (rt_mpath_next(rt) != NULL)
- rt = rt_mpath_selectrte(rt, flowid);
-#endif
- nh = rt->rt_nhop;
+ nh = nhop_select((RNTORT(rn))->rt_nhop, flowid);
/* Ensure route & ifp is UP */
if (RT_LINK_IS_UP(nh->nh_ifp)) {
if (flags & NHR_REF)
@@ -132,7 +156,7 @@
}
inline static int
-check_urpf(const struct nhop_object *nh, uint32_t flags,
+check_urpf_nhop(const struct nhop_object *nh, uint32_t flags,
const struct ifnet *src_if)
{
@@ -149,21 +173,24 @@
return (0);
}
-#ifdef RADIX_MPATH
-inline static int
-check_urpf_mpath(struct rtentry *rt, uint32_t flags,
+static int
+check_urpf(struct nhop_object *nh, uint32_t flags,
const struct ifnet *src_if)
{
-
- while (rt != NULL) {
- if (check_urpf(rt->rt_nhop, flags, src_if) != 0)
- return (1);
- rt = rt_mpath_next(rt);
- }
-
- return (0);
-}
+#ifdef ROUTE_MPATH
+ if (NH_IS_NHGRP(nh)) {
+ struct weightened_nhop *wn;
+ uint32_t num_nhops;
+ wn = nhgrp_get_nhops((struct nhgrp_object *)nh, &num_nhops);
+ for (int i = 0; i < num_nhops; i++) {
+ if (check_urpf_nhop(wn[i].nh, flags, src_if) != 0)
+ return (1);
+ }
+ return (0);
+ } else
#endif
+ return (check_urpf_nhop(nh, flags, src_if));
+}
/*
* Performs reverse path forwarding lookup.
@@ -181,7 +208,6 @@
RIB_RLOCK_TRACKER;
struct rib_head *rh;
struct radix_node *rn;
- struct rtentry *rt;
struct sockaddr_in6 sin6;
int ret;
@@ -203,12 +229,7 @@
RIB_RLOCK(rh);
rn = rh->rnh_matchaddr((void *)&sin6, &rh->head);
if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) {
- rt = RNTORT(rn);
-#ifdef RADIX_MPATH
- ret = check_urpf_mpath(rt, flags, src_if);
-#else
- ret = check_urpf(rt->rt_nhop, flags, src_if);
-#endif
+ ret = check_urpf(RNTORT(rn)->rt_nhop, flags, src_if);
RIB_RUNLOCK(rh);
return (ret);
}
@@ -223,7 +244,6 @@
{
struct rib_head *rh;
struct radix_node *rn;
- struct rtentry *rt;
struct nhop_object *nh;
struct sockaddr_in6 sin6;
@@ -245,8 +265,7 @@
rn = rh->rnh_matchaddr((void *)&sin6, &rh->head);
if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) {
- rt = RNTORT(rn);
- nh = rt->rt_nhop;
+ nh = nhop_select((RNTORT(rn))->rt_nhop, 0);
/* Ensure route & ifp is UP */
if (RT_LINK_IS_UP(nh->nh_ifp)) {
if (flags & NHR_REF)
Index: sys/netinet6/in6_pcb.c
===================================================================
--- sys/netinet6/in6_pcb.c
+++ sys/netinet6/in6_pcb.c
@@ -73,6 +73,7 @@
#include "opt_inet6.h"
#include "opt_ipsec.h"
#include "opt_pcbgroup.h"
+#include "opt_route.h"
#include "opt_rss.h"
#include <sys/param.h>
@@ -423,6 +424,17 @@
INP_WLOCK_ASSERT(inp);
INP_HASH_WLOCK_ASSERT(pcbinfo);
+#ifdef ROUTE_MPATH
+ if (V_fib_hash_outbound) {
+ uint32_t hash_type, hash_val;
+
+ hash_val = fib6_calc_software_hash(&inp->in6p_laddr,
+ &sin6->sin6_addr, 0, sin6->sin6_port,
+ inp->inp_socket->so_proto->pr_protocol, &hash_type);
+ inp->inp_flowid = hash_val;
+ inp->inp_flowtype = hash_type;
+ }
+#endif
/*
* Call inner routine, to assign local interface address.
* in6_pcbladdr() may automatically fill in sin6_scope_id.
Index: sys/netinet6/in6_proto.c
===================================================================
--- sys/netinet6/in6_proto.c
+++ sys/netinet6/in6_proto.c
@@ -70,7 +70,6 @@
#include "opt_ipsec.h"
#include "opt_ipstealth.h"
#include "opt_sctp.h"
-#include "opt_mpath.h"
#include "opt_route.h"
#include <sys/param.h>
Index: sys/netinet6/in6_rmx.c
===================================================================
--- sys/netinet6/in6_rmx.c
+++ sys/netinet6/in6_rmx.c
@@ -64,8 +64,6 @@
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
-#include "opt_mpath.h"
-
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
@@ -153,9 +151,6 @@
return (NULL);
rh->rnh_preadd = rib6_preadd;
-#ifdef RADIX_MPATH
- rt_mpath_init_rnh(rh);
-#endif
rs = rib_subscribe_internal(rh, nd6_subscription_cb, NULL,
RIB_NOTIFY_IMMEDIATE, true);
Index: sys/netinet6/in6_src.c
===================================================================
--- sys/netinet6/in6_src.c
+++ sys/netinet6/in6_src.c
@@ -67,7 +67,6 @@
#include "opt_inet.h"
#include "opt_inet6.h"
-#include "opt_mpath.h"
#include <sys/param.h>
#include <sys/systm.h>
Index: sys/netinet6/nd6.c
===================================================================
--- sys/netinet6/nd6.c
+++ sys/netinet6/nd6.c
@@ -36,6 +36,7 @@
#include "opt_inet.h"
#include "opt_inet6.h"
+#include "opt_route.h"
#include <sys/param.h>
#include <sys/systm.h>
@@ -1570,25 +1571,34 @@
/*
* Updates status of the default router route.
*/
-void
-nd6_subscription_cb(struct rib_head *rnh, struct rib_cmd_info *rc, void *arg)
+static void
+check_release_defrouter(struct rib_cmd_info *rc, void *_cbdata)
{
struct nd_defrouter *dr;
struct nhop_object *nh;
- if (rc->rc_cmd == RTM_DELETE) {
- nh = rc->rc_nh_old;
+ nh = rc->rc_nh_old;
- if (nh->nh_flags & NHF_DEFAULT) {
- dr = defrouter_lookup(&nh->gw6_sa.sin6_addr, nh->nh_ifp);
- if (dr != NULL) {
- dr->installed = 0;
- defrouter_rele(dr);
- }
+ if ((nh != NULL) && (nh->nh_flags & NHF_DEFAULT)) {
+ dr = defrouter_lookup(&nh->gw6_sa.sin6_addr, nh->nh_ifp);
+ if (dr != NULL) {
+ dr->installed = 0;
+ defrouter_rele(dr);
}
}
}
+/*
+ * Routing table subscription callback: feeds route change notifications
+ * to check_release_defrouter().
+ *
+ * With ROUTE_MPATH, notifications involving a nexthop group are split
+ * into single-path notifications first.  rib_decompose_notification()
+ * does not invoke its callback for plain nexthops, so those are passed
+ * to check_release_defrouter() directly.
+ */
+void
+nd6_subscription_cb(struct rib_head *rnh, struct rib_cmd_info *rc, void *arg)
+{
+
+#ifdef ROUTE_MPATH
+	if ((rc->rc_nh_old != NULL && NH_IS_NHGRP(rc->rc_nh_old)) ||
+	    (rc->rc_nh_new != NULL && NH_IS_NHGRP(rc->rc_nh_new))) {
+		rib_decompose_notification(rc, check_release_defrouter, NULL);
+		return;
+	}
+#endif
+	check_release_defrouter(rc, NULL);
+}
+
int
nd6_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp)
{
Index: sys/netinet6/raw_ip6.c
===================================================================
--- sys/netinet6/raw_ip6.c
+++ sys/netinet6/raw_ip6.c
@@ -66,6 +66,7 @@
#include "opt_ipsec.h"
#include "opt_inet6.h"
+#include "opt_route.h"
#include <sys/param.h>
#include <sys/errno.h>
@@ -103,6 +104,7 @@
#include <netinet6/ip6_var.h>
#include <netinet6/nd6.h>
#include <netinet6/raw_ip6.h>
+#include <netinet6/in6_fib.h>
#include <netinet6/scope6_var.h>
#include <netinet6/send.h>
@@ -462,6 +464,17 @@
}
ip6 = mtod(m, struct ip6_hdr *);
+#ifdef ROUTE_MPATH
+ if (V_fib_hash_outbound) {
+ uint32_t hash_type, hash_val;
+
+ hash_val = fib6_calc_software_hash(&inp->in6p_laddr,
+ &dstsock->sin6_addr, 0, 0, so->so_proto->pr_protocol,
+ &hash_type);
+ inp->inp_flowid = hash_val;
+ inp->inp_flowtype = hash_type;
+ }
+#endif
/*
* Source address selection.
*/
Index: sys/sys/socket.h
===================================================================
--- sys/sys/socket.h
+++ sys/sys/socket.h
@@ -417,6 +417,7 @@
#define NET_RT_IFLISTL 5 /* Survey interface list, using 'l'en
* versions of msghdr structs. */
#define NET_RT_NHOP 6 /* dump routing nexthops */
+#define NET_RT_NHGRP 7 /* dump routing nexthop groups */
#endif /* __BSD_VISIBLE */
/*
Index: tests/sys/netinet/output.sh
===================================================================
--- tests/sys/netinet/output.sh
+++ tests/sys/netinet/output.sh
@@ -339,11 +339,9 @@
pkt_0=`jexec ${jname}a netstat -Wf link -I ${epair0}a | head | awk '$1!~/^Name/{print$8}'`
pkt_1=`jexec ${jname}a netstat -Wf link -I ${epair1}a | head | awk '$1!~/^Name/{print$8}'`
if [ ${pkt_0} -le 10 ]; then
- echo "Balancing failure: 1: ${pkt_0} 2: ${pkt_1}"
- exit 1
+ atf_fail "Balancing failure: 1: ${pkt_0} 2: ${pkt_1}"
fi
if [ ${pkt_1} -le 10 ]; then
- echo "Balancing failure: 1: ${pkt_0} 2: ${pkt_1}"
- exit 1
+ atf_fail "Balancing failure: 1: ${pkt_0} 2: ${pkt_1}"
fi
echo "TCP Balancing: 1: ${pkt_0} 2: ${pkt_1}"
@@ -468,14 +467,13 @@
pkt_0=`jexec ${jname}a netstat -Wf link -I ${epair0}a | head | awk '$1!~/^Name/{print$8}'`
pkt_1=`jexec ${jname}a netstat -Wf link -I ${epair1}a | head | awk '$1!~/^Name/{print$8}'`
if [ ${pkt_0} -le 10 ]; then
- echo "Balancing failure: 1: ${pkt_0} 2: ${pkt_1}"
- exit 1
+ atf_fail "Balancing failure: 1: ${pkt_0} 2: ${pkt_1}"
fi
if [ ${pkt_1} -le 10 ]; then
- echo "Balancing failure: 1: ${pkt_0} 2: ${pkt_1}"
- exit 1
+ atf_fail "Balancing failure: 1: ${pkt_0} 2: ${pkt_1}"
fi
echo "UDP BALANCING: 1: ${pkt_0} 2: ${pkt_1}"
+ jexec ${jname}a netstat -4rnW
}
output_udp_flowid_mpath_success_cleanup()
@@ -561,12 +559,10 @@
jexec ${jname}a netstat -bWf link -I ${epair0}a
jexec ${jname}a netstat -bWf link -I ${epair1}a
if [ ${pkt_0} -le 10 ]; then
- echo "Balancing failure: 1: ${pkt_0} 2: ${pkt_1}"
- exit 1
+ atf_fail "Balancing failure: 1: ${pkt_0} 2: ${pkt_1}"
fi
if [ ${pkt_1} -le 10 ]; then
- echo "Balancing failure: 1: ${pkt_0} 2: ${pkt_1}"
- exit 1
+ atf_fail "Balancing failure: 1: ${pkt_0} 2: ${pkt_1}"
fi
echo "RAW BALANCING: 1: ${pkt_0} 2: ${pkt_1}"
}
Index: tests/sys/netinet6/output6.sh
===================================================================
--- tests/sys/netinet6/output6.sh
+++ tests/sys/netinet6/output6.sh
@@ -376,12 +376,10 @@
pkt_0=`jexec ${jname}a netstat -Wf link -I ${epair0}a | head | awk '$1!~/^Name/{print$8}'`
pkt_1=`jexec ${jname}a netstat -Wf link -I ${epair1}a | head | awk '$1!~/^Name/{print$8}'`
if [ ${pkt_0} -le 10 ]; then
- echo "Balancing failure: 1: ${pkt_0} 2: ${pkt_1}"
- exit 1
+ atf_fail "Balancing failure: 1: ${pkt_0} 2: ${pkt_1}"
fi
if [ ${pkt_1} -le 10 ]; then
- echo "Balancing failure: 1: ${pkt_0} 2: ${pkt_1}"
- exit 1
+ atf_fail "Balancing failure: 1: ${pkt_0} 2: ${pkt_1}"
fi
echo "TCP Balancing: 1: ${pkt_0} 2: ${pkt_1}"
}
@@ -519,12 +517,10 @@
pkt_0=`jexec ${jname}a netstat -Wf link -I ${epair0}a | head | awk '$1!~/^Name/{print$8}'`
pkt_1=`jexec ${jname}a netstat -Wf link -I ${epair1}a | head | awk '$1!~/^Name/{print$8}'`
if [ ${pkt_0} -le 10 ]; then
- echo "Balancing failure: 1: ${pkt_0} 2: ${pkt_1}"
- exit 1
+ atf_fail "Balancing failure: 1: ${pkt_0} 2: ${pkt_1}"
fi
if [ ${pkt_1} -le 10 ]; then
- echo "Balancing failure: 1: ${pkt_0} 2: ${pkt_1}"
- exit 1
+ atf_fail "Balancing failure: 1: ${pkt_0} 2: ${pkt_1}"
fi
echo "UDP BALANCING: 1: ${pkt_0} 2: ${pkt_1}"
}
@@ -628,12 +624,10 @@
jexec ${jname}a netstat -bWf link -I ${epair0}a
jexec ${jname}a netstat -bWf link -I ${epair1}a
if [ ${pkt_0} -le 10 ]; then
- echo "Balancing failure: 1: ${pkt_0} 2: ${pkt_1}"
- exit 1
+ atf_fail "Balancing failure: 1: ${pkt_0} 2: ${pkt_1}"
fi
if [ ${pkt_1} -le 10 ]; then
- echo "Balancing failure: 1: ${pkt_0} 2: ${pkt_1}"
- exit 1
+ atf_fail "Balancing failure: 1: ${pkt_0} 2: ${pkt_1}"
fi
echo "RAW BALANCING: 1: ${pkt_0} 2: ${pkt_1}"
}
Index: usr.bin/netstat/Makefile
===================================================================
--- usr.bin/netstat/Makefile
+++ usr.bin/netstat/Makefile
@@ -5,7 +5,7 @@
PROG= netstat
SRCS= if.c inet.c main.c mbuf.c mroute.c netisr.c nl_symbols.c route.c \
- unix.c mroute6.c ipsec.c bpf.c pfkey.c sctp.c common.c nhops.c \
+ unix.c mroute6.c ipsec.c bpf.c pfkey.c sctp.c common.c nhops.c nhgrp.c \
nl_defs.h
nl_symbols.c: nlist_symbols
Index: usr.bin/netstat/common.h
===================================================================
--- usr.bin/netstat/common.h
+++ usr.bin/netstat/common.h
@@ -54,5 +54,22 @@
struct ifmap_entry *prepare_ifmap(size_t *ifmap_size);
+struct rt_msghdr;
+struct nhops_map {
+ uint32_t idx;
+ struct rt_msghdr *rtm;
+};
+
+struct nhops_dump {
+ void *nh_buf;
+ struct nhops_map *nh_map;
+ size_t nh_count;
+};
+
+void dump_nhops_sysctl(int fibnum, int af, struct nhops_dump *nd);
+struct nhop_map;
+void nhop_map_update(struct nhop_map *map, uint32_t idx, char *gw, char *ifname);
+
+
#endif
Index: usr.bin/netstat/main.c
===================================================================
--- usr.bin/netstat/main.c
+++ usr.bin/netstat/main.c
@@ -215,6 +215,7 @@
int noutputs = 0; /* how much outputs before we exit */
int numeric_addr; /* show addresses numerically */
int numeric_port; /* show ports numerically */
+int Oflag; /* show nexthop group objects */
int oflag; /* show nexthop objects*/
int Pflag; /* show TCP log ID */
static int pflag; /* show given protocol */
@@ -250,7 +251,7 @@
if (argc < 0)
exit(EXIT_FAILURE);
- while ((ch = getopt(argc, argv, "46AaBbCdF:f:ghI:iLlM:mN:noPp:Qq:RrSTsuWw:xz"))
+ while ((ch = getopt(argc, argv, "46AaBbCdF:f:ghI:iLlM:mN:nOoPp:Qq:RrSTsuWw:xz"))
!= -1)
switch(ch) {
case '4':
@@ -353,6 +354,9 @@
case 'o':
oflag = 1;
break;
+ case 'O':
+ Oflag = 1;
+ break;
case 'P':
Pflag = 1;
break;
@@ -509,6 +513,14 @@
xo_finish();
exit(0);
}
+ if (Oflag) {
+ xo_open_container("statistics");
+ nhgrp_print(fib, af);
+ xo_close_container("statistics");
+ xo_finish();
+ exit(0);
+ }
+
if (gflag) {
Index: usr.bin/netstat/netstat.h
===================================================================
--- usr.bin/netstat/netstat.h
+++ usr.bin/netstat/netstat.h
@@ -163,3 +163,4 @@
void mrt_stats(void);
void bpf_stats(char *);
void nhops_print(int fibnum, int af);
+void nhgrp_print(int fibnum, int af);
Index: usr.bin/netstat/nhgrp.c
===================================================================
--- usr.bin/netstat/nhgrp.c
+++ usr.bin/netstat/nhgrp.c
@@ -87,25 +87,6 @@
static int wid_nhidx;
static int wid_nhtype;
static int wid_refcnt;
-static int wid_prepend;
-
-static struct bits nh_bits[] = {
- { NHF_REJECT, 'R', "reject" },
- { NHF_BLACKHOLE,'B', "blackhole" },
- { NHF_REDIRECT, 'r', "redirect" },
- { NHF_GATEWAY, 'G', "gateway" },
- { NHF_DEFAULT, 'd', "default" },
- { NHF_BROADCAST,'b', "broadcast" },
- { 0 , 0, NULL }
-};
-
-static char *nh_types[] = {
- "empty", /* 0 */
- "v4/resolve", /* 1 */
- "v4/gw",
- "v6/resolve",
- "v6/gw"
-};
struct nhop_entry {
char gw[64];
@@ -118,117 +99,9 @@
};
static struct nhop_map global_nhop_map;
-static void nhop_map_update(struct nhop_map *map, uint32_t idx,
- char *gw, char *ifname);
-static struct nhop_entry *nhop_get(struct nhop_map *map, uint32_t idx);
-
-
static struct ifmap_entry *ifmap;
static size_t ifmap_size;
-static void
-print_sockaddr_buf(char *buf, size_t bufsize, const struct sockaddr *sa)
-{
-
- switch (sa->sa_family) {
- case AF_INET:
- inet_ntop(AF_INET, &((struct sockaddr_in *)sa)->sin_addr,
- buf, bufsize);
- break;
- case AF_INET6:
- inet_ntop(AF_INET6, &((struct sockaddr_in6 *)sa)->sin6_addr,
- buf, bufsize);
- break;
- default:
- snprintf(buf, bufsize, "unknown:%d", sa->sa_family);
- break;
- }
-}
-
-static int
-print_addr(const char *name, const char *addr, int width)
-{
- char buf[128];
- int protrusion;
-
- if (width < 0) {
- snprintf(buf, sizeof(buf), "{:%s/%%s} ", name);
- xo_emit(buf, addr);
- protrusion = 0;
- } else {
- if (Wflag != 0 || numeric_addr) {
- snprintf(buf, sizeof(buf), "{[:%d}{:%s/%%s}{]:} ",
- -width, name);
- xo_emit(buf, addr);
- protrusion = strlen(addr) - width;
- if (protrusion < 0)
- protrusion = 0;
- } else {
- snprintf(buf, sizeof(buf), "{[:%d}{:%s/%%-.*s}{]:} ",
- -width, name);
- xo_emit(buf, width, addr);
- protrusion = 0;
- }
- }
- return (protrusion);
-}
-
-
-static void
-print_nhop_header(int af1 __unused)
-{
-
- if (Wflag) {
- xo_emit("{T:/%-*.*s} {T:/%-*.*s} {T:/%-*.*s} {T:/%-*.*s} {T:/%*.*s} "
- "{T:/%*.*s} {T:/%-*.*s} {T:/%*.*s} {T:/%*.*s} {T:/%*.*s} {T:/%*s}\n",
- wid_nhidx, wid_nhidx, "Idx",
- wid_nhtype, wid_nhtype, "Type",
- wid_dst, wid_dst, "IFA",
- wid_gw, wid_gw, "Gateway",
- wid_flags, wid_flags, "Flags",
- wid_pksent, wid_pksent, "Use",
- wid_mtu, wid_mtu, "Mtu",
- wid_if, wid_if, "Netif",
- wid_if, wid_if, "Addrif",
- wid_refcnt, wid_refcnt, "Refcnt",
- wid_prepend, "Prepend");
- } else {
- xo_emit("{T:/%-*.*s} {T:/%-*.*s} {T:/%-*.*s} {T:/%-*.*s} {T:/%*.*s} "
- " {T:/%*s}\n",
- wid_nhidx, wid_nhidx, "Idx",
- wid_dst, wid_dst, "IFA",
- wid_gw, wid_gw, "Gateway",
- wid_flags, wid_flags, "Flags",
- wid_if, wid_if, "Netif",
- wid_prepend, "Refcnt");
- }
-}
-
-static void
-nhop_map_update(struct nhop_map *map, uint32_t idx, char *gw, char *ifname)
-{
- if (idx >= map->size) {
- uint32_t new_size;
- size_t sz;
- if (map->size == 0)
- new_size = 32;
- else
- new_size = map->size * 2;
- if (new_size <= idx)
- new_size = roundup(idx + 1, 32);
-
- sz = new_size * (sizeof(struct nhop_entry));
- if ((map->ptr = realloc(map->ptr, sz)) == NULL)
- errx(2, "realloc(%zu) failed", sz);
-
- memset(&map->ptr[map->size], 0, (new_size - map->size) * sizeof(struct nhop_entry));
- map->size = new_size;
- }
-
- strlcpy(map->ptr[idx].ifname, ifname, sizeof(map->ptr[idx].ifname));
- strlcpy(map->ptr[idx].gw, gw, sizeof(map->ptr[idx].gw));
-}
-
static struct nhop_entry *
nhop_get(struct nhop_map *map, uint32_t idx)
{
@@ -241,21 +114,96 @@
}
static void
-print_nhop_entry_sysctl(const char *name, struct rt_msghdr *rtm, struct nhop_external *nh)
+print_nhgroup_header(int af1 __unused)
+{
+
+ xo_emit("{T:/%-*.*s} {T:/%-*.*s} {T:/%-*.*s} {T:/%-*.*s} {T:/%*.*s}"
+ " {T:/%-*.*s} {T:/%*s}\n",
+ wid_nhidx, wid_nhidx, "MpIdx",
+ wid_nhidx, wid_nhidx, "NHIdx",
+ wid_nhidx, wid_nhidx, "Weight",
+ wid_nhidx, wid_nhidx, "Slots",
+ wid_gw, wid_gw, "Gateway",
+ wid_if, wid_if, "Netif",
+ wid_nhidx, "Refcnt");
+}
+
+
+static void
+print_nhgroup_entry_sysctl(const char *name, struct rt_msghdr *rtm,
+ struct mpath_external *mpe)
{
char buffer[128];
- char iface_name[128];
- int protrusion;
- char gw_addr[64];
- struct nhop_addrs *na;
- struct sockaddr *sa_gw, *sa_ifa;
+ struct nhop_entry *ne;
xo_open_instance(name);
- snprintf(buffer, sizeof(buffer), "{[:-%d}{:index/%%lu}{]:} ", wid_nhidx);
- //xo_emit("{t:index/%-lu} ", wid_nhidx, nh->nh_idx);
- xo_emit(buffer, nh->nh_idx);
+ snprintf(buffer, sizeof(buffer), "{[:-%d}{:mp_index/%%lu}{]:} ", wid_nhidx);
+ xo_emit(buffer, mpe->mp_idx);
+ xo_emit("{t:dummy-1/%*.*s}", wid_nhidx, wid_nhidx, "----");
+ xo_emit("{t:dummy-2/%*.*s}", wid_nhidx, wid_nhidx, "----");
+ xo_emit("{t:dummy-2/%*.*s}", wid_nhidx, wid_nhidx, "----");
+ xo_emit("{t:dummy-3/%*.*s}", wid_gw, wid_gw, "----");
+ xo_emit("{t:dummy-4/%*.*s}", wid_if, wid_if, "----");
+ xo_emit("{t:mp-refcnt/%*lu}", wid_nhidx, mpe->mp_refcount);
+ xo_emit("\n");
+
+ struct mpath_nhop_external *ext;
+ ext = (struct mpath_nhop_external *)(mpe + 1);
+
+ uint32_t *fwd_c = calloc(sizeof(uint32_t), global_nhop_map.size);
+ uint32_t *pidx;
+ pidx = (uint32_t *)&ext[mpe->mp_nh_count];
+ for (uint32_t i = 0; i < mpe->mp_group_size; i++) {
+ fwd_c[pidx[i]]++;
+ }
+
+ xo_open_list("nhop_weights");
+ for (uint32_t i = 0; i < mpe->mp_nh_count; i++) {
+ xo_open_instance("nhop-weight");
+ snprintf(buffer, sizeof(buffer), "{[:-%d}{:d-0/%%s}{]:} ", wid_nhidx);
+ xo_emit(buffer, "");
+ // nh index
+ xo_emit("{t:nh-index/%*lu} ", wid_nhidx, ext[i].nh_idx);
+ xo_emit("{t:nh-weight/%*lu} ", wid_nhidx, ext[i].nh_weight);
+ xo_emit("{t:nh-slots/%*lu} ", wid_nhidx, fwd_c[ext[i].nh_idx]);
+ ne = nhop_get(&global_nhop_map, ext[i].nh_idx);
+ if (ne != NULL) {
+ xo_emit("{t:nh-gw/%*.*s}", wid_gw, wid_gw, ne->gw);
+ xo_emit("{t:nh-interface/%*.*s}", wid_if, wid_if, ne->ifname);
+ }
+ xo_emit("\n");
+ xo_close_instance("nhop-weight");
+ }
+ xo_close_list("nhop_weights");
+
+#if 0
+ xo_emit("{t:dummy-2/%*.*s}", wid_nhidx, wid_nhidx, "");
+ xo_emit("{t:dummy-1/%*.*s}", wid_nhidx, wid_nhidx, "----");
+ xo_emit("\n");
+
+ uint32_t *pidx;
+ pidx = (uint32_t *)&ext[mpe->mp_nh_count];
+ xo_open_list("fwd-nhops");
+ for (uint32_t i = 0; i < mpe->mp_group_size; i++) {
+ xo_open_instance("fwd-nhop");
+ snprintf(buffer, sizeof(buffer), "{[:-%d}{:d-0/%%s}{]:} ", wid_nhidx);
+ xo_emit(buffer, "");
+ // nh index
+ xo_emit("{t:nh-index/%*lu} ", wid_nhidx, pidx[i]);
+ ne = nhop_get(&global_nhop_map, pidx[i]);
+ if (ne != NULL) {
+ xo_emit("{t:dummy-2/%*.*s}", wid_nhidx, wid_nhidx, "");
+ xo_emit("{t:nh-gw/%*.*s}", wid_gw, wid_gw, ne->gw);
+ xo_emit("{t:nh-interface/%*.*s}", wid_if, wid_if, ne->ifname);
+ }
+ xo_emit("\n");
+ xo_close_instance("fwd-nhop");
+ }
+ xo_close_list("fwd-nhops");
+#endif
+#if 0
if (Wflag) {
char *cp = nh_types[nh->nh_type];
xo_emit("{t:type_str/%*s} ", wid_nhtype, cp);
@@ -268,11 +216,10 @@
strlcpy(iface_name, "---", sizeof(iface_name));
}
- na = (struct nhop_addrs *)((char *)nh + nh->nh_len);
//inet_ntop(nh->nh_family, &nh->nh_src, src_addr, sizeof(src_addr));
//protrusion = p_addr("ifa", src_addr, wid_dst);
- sa_gw = (struct sockaddr *)((char *)na + na->gw_sa_off);
- sa_ifa = (struct sockaddr *)((char *)na + na->src_sa_off);
+ sa_gw = (struct sockaddr *)(nh + 1);
+ sa_ifa = (struct sockaddr *)((char *)sa_gw + sa_gw->sa_len);
protrusion = p_sockaddr("ifa", sa_ifa, NULL, RTF_HOST, wid_dst);
if (nh->nh_flags & NHF_GATEWAY) {
@@ -281,15 +228,13 @@
strlcpy(gw_addr, cp, sizeof(gw_addr));
} else
snprintf(gw_addr, sizeof(gw_addr), "%s/resolve", iface_name);
- protrusion = print_addr("gateway", gw_addr, wid_dst - protrusion);
-
- nhop_map_update(&global_nhop_map, nh->nh_idx, gw_addr, iface_name);
+ protrusion = p_addr("gateway", gw_addr, wid_dst - protrusion);
snprintf(buffer, sizeof(buffer), "{[:-%d}{:flags/%%s}{]:} ",
wid_flags - protrusion);
//p_nhflags(nh->nh_flags, buffer);
- print_flags_generic(rtm->rtm_flags, rt_bits, buffer, "rt_flags_pretty");
+ p_flags(rtm->rtm_flags, buffer);
if (Wflag) {
xo_emit("{t:use/%*lu} ", wid_pksent, nh->nh_pksent);
@@ -317,135 +262,135 @@
char *prepend_hex = "AABBCCDDEE";
xo_emit(" {:nhop-prepend/%*s}", wid_prepend, prepend_hex);
}
-
- xo_emit("\n");
+#endif
+ //xo_emit("\n");
xo_close_instance(name);
}
-struct nhops_map {
- uint32_t idx;
- struct rt_msghdr *rtm;
-};
-
-static int
-cmp_nh_idx(const void *_a, const void *_b)
-{
- const struct nhops_map *a, *b;
-
- a = _a;
- b = _b;
-
- if (a->idx > b->idx)
- return (1);
- else if (a->idx < b->idx)
- return (-1);
- return (0);
-}
static void
-print_nhops_sysctl(int fibnum, int af)
+print_nhgrp_sysctl(int fibnum, int af)
{
size_t needed;
int mib[7];
char *buf, *next, *lim;
struct rt_msghdr *rtm;
- struct nhop_external *nh;
- int fam;
- struct nhops_map *nh_map;
- size_t nh_count, nh_size;
+ struct mpath_external *mp;
+ int fam = AF_UNSPEC;
+ int need_table_close = false;
mib[0] = CTL_NET;
mib[1] = PF_ROUTE;
mib[2] = 0;
mib[3] = af;
- mib[4] = NET_RT_NHOP;
+ mib[4] = NET_RT_NHGROUPS;
mib[5] = 0;
mib[6] = fibnum;
if (sysctl(mib, nitems(mib), NULL, &needed, NULL, 0) < 0)
- err(EX_OSERR, "sysctl: net.route.0.%d.nhdump.%d estimate", af,
- fibnum);
+ err(EX_OSERR, "sysctl: net.route.0.%d.nhgrpdump.%d estimate",
+ af, fibnum);
if ((buf = malloc(needed)) == NULL)
errx(2, "malloc(%lu)", (unsigned long)needed);
if (sysctl(mib, nitems(mib), buf, &needed, NULL, 0) < 0)
- err(1, "sysctl: net.route.0.%d.nhdump.%d", af, fibnum);
+ err(1, "sysctl: net.route.0.%d.nhgrpdump.%d", af, fibnum);
+ printf("BUF: %zu\n", needed);
lim = buf + needed;
- xo_open_container("nhop-table");
+ xo_open_container("nhgrp-table");
xo_open_list("rt-family");
-
- /*
- * nexhops are received unsorted. Collect everything first, sort and then display
- * sorted.
- */
- nh_count = 0;
- nh_size = 16;
- nh_map = calloc(nh_size, sizeof(struct nhops_map));
for (next = buf; next < lim; next += rtm->rtm_msglen) {
rtm = (struct rt_msghdr *)next;
if (rtm->rtm_version != RTM_VERSION)
continue;
- if (nh_count >= nh_size) {
- nh_size *= 2;
- nh_map = realloc(nh_map, nh_size * sizeof(struct nhops_map));
+ mp = (struct mpath_external *)(rtm + 1);
+ /*
+ * TODO: peek inside the header to determine the AF; the requested 'af' is used for now.
+ */
+ /* Only print family first time. */
+ if (fam != af) {
+ if (need_table_close) {
+ xo_close_list("nhgrp-entry");
+ xo_close_instance("rt-family");
+ }
+ need_table_close = true;
+
+ fam = af;
+ wid_dst = WID_GW_DEFAULT(fam);
+ wid_gw = WID_GW_DEFAULT(fam);
+ wid_nhidx = 5;
+ wid_nhtype = 12;
+ wid_refcnt = 6;
+ wid_flags = 6;
+ wid_pksent = 8;
+ wid_mtu = 6;
+ wid_if = WID_IF_DEFAULT(fam);
+ xo_open_instance("rt-family");
+ pr_family(fam);
+ xo_open_list("nhgrp-entry");
+
+ print_nhgroup_header(fam);
}
-
- nh = (struct nhop_external *)(rtm + 1);
- nh_map[nh_count].idx = nh->nh_idx;
- nh_map[nh_count].rtm = rtm;
- nh_count++;
+ print_nhgroup_entry_sysctl("nhgrp-entry", rtm, mp);
}
-
- if (nh_count > 0) {
- qsort(nh_map, nh_count, sizeof(struct nhops_map), cmp_nh_idx);
- nh = (struct nhop_external *)(nh_map[0].rtm + 1);
- fam = nh->nh_family;
-
- wid_dst = WID_GW_DEFAULT(fam);
- wid_gw = WID_GW_DEFAULT(fam);
- wid_nhidx = 5;
- wid_nhtype = 12;
- wid_refcnt = 6;
- wid_flags = 6;
- wid_pksent = 8;
- wid_mtu = 6;
- wid_if = WID_IF_DEFAULT(fam);
- xo_open_instance("rt-family");
- pr_family(fam);
- xo_open_list("nh-entry");
-
- print_nhop_header(fam);
-
- for (size_t i = 0; i < nh_count; i++) {
- rtm = nh_map[i].rtm;
- nh = (struct nhop_external *)(rtm + 1);
- print_nhop_entry_sysctl("nh-entry", rtm, nh);
- }
-
- xo_close_list("nh-entry");
+ if (need_table_close) {
+ xo_close_list("nhgrp-entry");
xo_close_instance("rt-family");
}
xo_close_list("rt-family");
- xo_close_container("nhop-table");
+ xo_close_container("nhgrp-table");
free(buf);
}
static void
-p_nhflags(int f, const char *format)
+update_global_map(struct nhop_external *nh)
+{
+ char iface_name[128];
+ char gw_addr[64];
+ struct nhop_addrs *na;
+ struct sockaddr *sa_gw;
+
+ na = (struct nhop_addrs *)((char *)nh + nh->nh_len);
+ sa_gw = (struct sockaddr *)((char *)na + na->gw_sa_off);
+
+ memset(iface_name, 0, sizeof(iface_name));
+ if (nh->ifindex < (uint32_t)ifmap_size) {
+ strlcpy(iface_name, ifmap[nh->ifindex].ifname,
+ sizeof(iface_name));
+ if (*iface_name == '\0')
+ strlcpy(iface_name, "---", sizeof(iface_name));
+ }
+
+
+ if (nh->nh_flags & NHF_GATEWAY) {
+ const char *cp;
+ cp = fmt_sockaddr(sa_gw, NULL, RTF_HOST);
+ strlcpy(gw_addr, cp, sizeof(gw_addr));
+ } else
+ snprintf(gw_addr, sizeof(gw_addr), "%s/resolve", iface_name);
+
+ nhop_map_update(&global_nhop_map, nh->nh_idx, gw_addr, iface_name);
+}
+
+static void
+prepare_nh_map(int fibnum, int af)
{
- struct bits *p;
- char *pretty_name = "nh_flags_pretty";
+ struct nhops_dump nd;
+ struct nhop_external *nh;
+ struct rt_msghdr *rtm;
+
+ dump_nhops_sysctl(fibnum, af, &nd);
- xo_emit(format, fmt_flags(nh_bits, f));
+ for (size_t i = 0; i < nd.nh_count; i++) {
+ rtm = nd.nh_map[i].rtm;
+ nh = (struct nhop_external *)(rtm + 1);
+ update_global_map(nh);
+ }
- xo_open_list(pretty_name);
- for (p = nh_bits; p->b_mask; p++)
- if (p->b_mask & f)
- xo_emit("{le:nh_flags_pretty/%s}", p->b_name);
- xo_close_list(pretty_name);
+ free(nd.nh_buf);
}
void
-nhops_print(int fibnum, int af)
+nhgrp_print(int fibnum, int af)
{
size_t intsize;
int numfibs;
@@ -460,13 +405,14 @@
errx(EX_USAGE, "%d: invalid fib", fibnum);
ifmap = prepare_ifmap(&ifmap_size);
+ prepare_nh_map(fibnum, af);
- xo_open_container("route-nhop-information");
- xo_emit("{T:Nexthop data}");
+ xo_open_container("route-nhgrp-information");
+ xo_emit("{T:Nexthop groups data}");
if (fibnum)
xo_emit(" ({L:fib}: {:fib/%d})", fibnum);
xo_emit("\n");
- print_nhops_sysctl(fibnum, af);
- xo_close_container("route-nhop-information");
+ print_nhgrp_sysctl(fibnum, af);
+ xo_close_container("route-nhgrp-information");
}
Index: usr.bin/netstat/nhops.c
===================================================================
--- usr.bin/netstat/nhops.c
+++ usr.bin/netstat/nhops.c
@@ -118,8 +118,6 @@
};
static struct nhop_map global_nhop_map;
-static void nhop_map_update(struct nhop_map *map, uint32_t idx,
- char *gw, char *ifname);
static struct nhop_entry *nhop_get(struct nhop_map *map, uint32_t idx);
@@ -204,7 +202,7 @@
}
}
-static void
+void
nhop_map_update(struct nhop_map *map, uint32_t idx, char *gw, char *ifname)
{
if (idx >= map->size) {
@@ -322,11 +320,6 @@
xo_close_instance(name);
}
-struct nhops_map {
- uint32_t idx;
- struct rt_msghdr *rtm;
-};
-
static int
cmp_nh_idx(const void *_a, const void *_b)
{
@@ -342,15 +335,14 @@
return (0);
}
-static void
-print_nhops_sysctl(int fibnum, int af)
+void
+dump_nhops_sysctl(int fibnum, int af, struct nhops_dump *nd)
{
size_t needed;
int mib[7];
char *buf, *next, *lim;
struct rt_msghdr *rtm;
struct nhop_external *nh;
- int fam;
struct nhops_map *nh_map;
size_t nh_count, nh_size;
@@ -369,8 +361,6 @@
if (sysctl(mib, nitems(mib), buf, &needed, NULL, 0) < 0)
err(1, "sysctl: net.route.0.%d.nhdump.%d", af, fibnum);
lim = buf + needed;
- xo_open_container("nhop-table");
- xo_open_list("rt-family");
/*
* nexhops are received unsorted. Collect everything first, sort and then display
@@ -395,9 +385,27 @@
nh_count++;
}
- if (nh_count > 0) {
+ if (nh_count > 0)
qsort(nh_map, nh_count, sizeof(struct nhops_map), cmp_nh_idx);
- nh = (struct nhop_external *)(nh_map[0].rtm + 1);
+ nd->nh_buf = buf;
+ nd->nh_count = nh_count;
+ nd->nh_map = nh_map;
+}
+
+static void
+print_nhops_sysctl(int fibnum, int af)
+{
+ struct nhops_dump nd;
+ struct nhop_external *nh;
+ int fam;
+ struct rt_msghdr *rtm;
+
+ dump_nhops_sysctl(fibnum, af, &nd);
+
+ xo_open_container("nhop-table");
+ xo_open_list("rt-family");
+ if (nd.nh_count > 0) {
+ nh = (struct nhop_external *)(nd.nh_map[0].rtm + 1);
fam = nh->nh_family;
wid_dst = WID_GW_DEFAULT(fam);
@@ -415,8 +423,8 @@
print_nhop_header(fam);
- for (size_t i = 0; i < nh_count; i++) {
- rtm = nh_map[i].rtm;
+ for (size_t i = 0; i < nd.nh_count; i++) {
+ rtm = nd.nh_map[i].rtm;
nh = (struct nhop_external *)(rtm + 1);
print_nhop_entry_sysctl("nh-entry", rtm, nh);
}
@@ -426,7 +434,7 @@
}
xo_close_list("rt-family");
xo_close_container("nhop-table");
- free(buf);
+ free(nd.nh_buf);
}
static void

File Metadata

Mime Type
text/plain
Expires
Fri, Dec 26, 12:27 PM (4 h, 8 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
27282249
Default Alt Text
D26449.id77161.diff (117 KB)

Event Timeline