Index: sys/conf/NOTES =================================================================== --- sys/conf/NOTES +++ sys/conf/NOTES @@ -1002,7 +1002,7 @@ # # TCP_HHOOK enables the hhook(9) framework hooks for the TCP stack. # -# RADIX_MPATH provides support for equal-cost multi-path routing. +# ROUTE_MPATH provides support for multipath routing. # options MROUTING # Multicast routing options IPFIREWALL #firewall Index: sys/conf/files =================================================================== --- sys/conf/files +++ sys/conf/files @@ -4140,10 +4140,12 @@ net/debugnet_inet.c optional inet debugnet net/pfil.c optional ether | inet net/radix.c standard -net/radix_mpath.c standard net/raw_cb.c standard net/raw_usrreq.c standard net/route.c standard +net/route/mpath_ctl.c optional route_mpath +net/route/nhgrp.c optional route_mpath +net/route/nhgrp_ctl.c optional route_mpath net/route/nhop.c standard net/route/nhop_ctl.c standard net/route/nhop_utils.c standard Index: sys/conf/options =================================================================== --- sys/conf/options +++ sys/conf/options @@ -454,6 +454,7 @@ PCBGROUP opt_pcbgroup.h PF_DEFAULT_TO_DROP opt_pf.h RADIX_MPATH opt_mpath.h +ROUTE_MPATH opt_route.h ROUTETABLES opt_route.h RSS opt_rss.h SLIP_IFF_OPTS opt_slip.h Index: sys/net/radix.c =================================================================== --- sys/net/radix.c +++ sys/net/radix.c @@ -44,10 +44,6 @@ #include #include #include -#include "opt_mpath.h" -#ifdef RADIX_MPATH -#include -#endif #else /* !_KERNEL */ #include #include Index: sys/net/route.h =================================================================== --- sys/net/route.h +++ sys/net/route.h @@ -178,6 +178,7 @@ */ /* Consumer-visible nexthop info flags */ +#define NHF_MULTIPATH 0x0008 /* Nexhop is a nexthop group */ #define NHF_REJECT 0x0010 /* RTF_REJECT */ #define NHF_BLACKHOLE 0x0020 /* RTF_BLACKHOLE */ #define NHF_REDIRECT 0x0040 /* RTF_DYNAMIC|RTF_MODIFIED */ @@ -208,6 +209,10 @@ 
uint64_t rts_wildcard; /* lookups satisfied by a wildcard */ uint64_t rts_nh_idx_alloc_failure; /* nexthop index alloc failure*/ uint64_t rts_nh_alloc_failure; /* nexthop allocation failure*/ + uint64_t rts_add_failure; /* route addition failure */ + uint64_t rts_add_retry; /* route addition retry */ + uint64_t rts_del_failure; /* route deletion failure */ + uint64_t rts_del_retry; /* route deletion retry */ }; /* Index: sys/net/route.c =================================================================== --- sys/net/route.c +++ sys/net/route.c @@ -39,7 +39,6 @@ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_mrouting.h" -#include "opt_mpath.h" #include "opt_route.h" #include Index: sys/net/route/mpath_ctl.c =================================================================== --- /dev/null +++ sys/net/route/mpath_ctl.c @@ -0,0 +1,165 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2020 Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include "opt_inet.h" +#include "opt_route.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include + +/* + * This file contains the supporting functions for adding/deleting/updating + * multipath routes to the routing table. + */ + +SYSCTL_DECL(_net_route); + +/* + * Tries to add @rnd_add nhop to the existing set of nhops (@nh_orig) for the + * prefix specified by @rt. + * + * Return 0 ans consumes rt / rnd_add nhop references. @rc gets populated + * with the operation result. + * Otherwise errno is returned. + * + * caller responsibility is to unlock/free rt and + * rt->rt_nhop. + */ +int +add_route_mpath(struct rib_head *rnh, struct rt_addrinfo *info, + struct rtentry *rt, struct route_nhop_data *rnd_add, + struct route_nhop_data *rnd_orig, struct rib_cmd_info *rc) +{ + RIB_RLOCK_TRACKER; + struct route_nhop_data rnd_new; + int error = 0; + + /* + * It is possible that multiple rtsock speakers will try to update + * the same route simultaneously. Reduce the chance of failing the + * request by retrying the cycle multiple times. 
+ */ + for (int i = 0; i < RIB_MAX_RETRIES; i++) { + error = nhgrp_get_addition_group(rnh, rnd_orig, rnd_add, + &rnd_new); + if (error != 0) { + if (error != EAGAIN) + break; + + /* + * Group creation failed, most probably because + * @rnd_orig data got scheduled for deletion. + * Refresh @rnd_orig data and retry. + */ + RIB_RLOCK(rnh); + lookup_prefix(rnh, info, rnd_orig); + RIB_RUNLOCK(rnh); + continue; + } + + error = change_route_conditional(rnh, rt, info, rnd_orig, + &rnd_new, rc); + if (error != EAGAIN) + break; + RTSTAT_INC(rts_add_retry); + } + + return (error); +} + +struct rt_match_info { + struct rt_addrinfo *info; + struct rtentry *rt; +}; + +static bool +gw_filter_func(const struct nhop_object *nh, void *_data) +{ + struct rt_match_info *ri = (struct rt_match_info *)_data; + + return (check_info_match_nhop(ri->info, ri->rt, nh) == 0); +} + +/* + * Tries to delete matching paths from @nhg. + * Returns 0 on success and updates operation result in @rc. + */ +int +del_route_mpath(struct rib_head *rh, struct rt_addrinfo *info, + struct rtentry *rt, struct nhgrp_object *nhg, + struct rib_cmd_info *rc) +{ + struct route_nhop_data rnd; + struct rt_match_info ri = { .info = info, .rt = rt }; + int error; + + RIB_WLOCK_ASSERT(rh); + + /* + * Require gateway to delete multipath routes, to forbid + * deleting all paths at once. + * If the filter function is provided, skip gateway check to + * allow rib_walk_del() delete routes for any criteria based + * on provided callback. 
+ */ + if ((info->rti_info[RTAX_GATEWAY] == NULL) && (info->rti_filter == NULL)) + return (ESRCH); + + error = nhgrp_get_filtered_group(rh, nhg, gw_filter_func, (void *)&ri, + &rnd); + if (error == 0) + error = change_route_nhop(rh, rt, info, &rnd, rc); + return (error); +} + Index: sys/net/route/nhgrp.c =================================================================== --- /dev/null +++ sys/net/route/nhgrp.c @@ -0,0 +1,344 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2020 Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include "opt_inet.h" +#include "opt_route.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include + +/* + * This file contains data structures management logic for the nexthop + * groups ("nhgrp") route subsystem. + * + * Nexthop groups are used to store multiple routes available for the specific + * prefix. Nexthop groups are immutable and can be shared across multiple + * prefixes. + * + * Each group consists of a control plane part and a dataplane part. + * Control plane is basically a collection of nexthop objects with + * weights and refcount. + * + * Datapath consists of a array of nexthop pointers, compiled from control + * plane data to support O(1) nexthop selection. + * + * For example, consider the following group: + * [(nh1, weight=100), (nh2, weight=200)] + * It will compile to the following array: + * [nh1, nh2, nh2] + * + */ + +static void consider_resize(struct nh_control *ctl, uint32_t new_nh_buckets, + uint32_t new_idx_items); + +static int cmp_nhgrp(const struct nhgrp_priv *a, const struct nhgrp_priv *b); +static unsigned int hash_nhgrp(const struct nhgrp_priv *obj); + +static unsigned +djb_hash(const unsigned char *h, const int len) +{ + unsigned int result = 0; + int i; + + for (i = 0; i < len; i++) + result = 33 * result ^ h[i]; + + return (result); +} + +static int +cmp_nhgrp(const struct nhgrp_priv *a, const struct nhgrp_priv *b) +{ + + /* + * In case of consistent hashing, there can be multiple nexthop groups + * with the same "control plane" list of nexthops with weights and a + * different set of "data plane" nexthops. + * For now, ignore the data plane and focus on the control plane list. 
+ */ + if (a->nhg_nh_count != b->nhg_nh_count) + return (0); + return !memcmp(a->nhg_nh_weights, b->nhg_nh_weights, + sizeof(struct weightened_nhop) * a->nhg_nh_count); +} + +/* + * Hash callback: calculate hash of an object + */ +static unsigned int +hash_nhgrp(const struct nhgrp_priv *obj) +{ + const unsigned char *key; + + key = (const unsigned char *)obj->nhg_nh_weights; + + return (djb_hash(key, sizeof(struct weightened_nhop) * obj->nhg_nh_count)); +} + +/* + * Returns object referenced and unlocked + */ +struct nhgrp_priv * +find_nhgrp(struct nh_control *ctl, const struct nhgrp_priv *key) +{ + struct nhgrp_priv *priv_ret; + + NHOPS_RLOCK(ctl); + CHT_SLIST_FIND_BYOBJ(&ctl->gr_head, mpath, key, priv_ret); + if (priv_ret != NULL) { + if (refcount_acquire_if_not_zero(&priv_ret->nhg_refcount) == 0) { + /* refcount is 0 -> group is being deleted */ + priv_ret = NULL; + } + } + NHOPS_RUNLOCK(ctl); + + return (priv_ret); +} + +int +link_nhgrp(struct nh_control *ctl, struct nhgrp_priv *grp_priv) +{ + uint16_t idx; + uint32_t new_num_buckets, new_num_items; + + NHOPS_WLOCK(ctl); + /* Check if we need to resize hash and index */ + new_num_buckets = CHT_SLIST_GET_RESIZE_BUCKETS(&ctl->gr_head); + new_num_items = bitmask_get_resize_items(&ctl->gr_idx_head); + + if (bitmask_alloc_idx(&ctl->gr_idx_head, &idx) != 0) { + NHOPS_WUNLOCK(ctl); + DPRINTF("Unable to allocate mpath index"); + consider_resize(ctl, new_num_buckets, new_num_items); + return (0); + } + + grp_priv->nhg_idx = idx; + grp_priv->nh_control = ctl; + CHT_SLIST_INSERT_HEAD(&ctl->gr_head, mpath, grp_priv); + + NHOPS_WUNLOCK(ctl); + + consider_resize(ctl, new_num_buckets, new_num_items); + + return (1); +} + +struct nhgrp_priv * +unlink_nhgrp(struct nh_control *ctl, struct nhgrp_priv *key) +{ + struct nhgrp_priv *nhg_priv_ret; + int ret, idx; + + NHOPS_WLOCK(ctl); + + CHT_SLIST_REMOVE_BYOBJ(&ctl->gr_head, mpath, key, nhg_priv_ret); + + if (nhg_priv_ret == NULL) { + DPRINTF("Unable to find nhop group!"); + 
NHOPS_WUNLOCK(ctl); + return (NULL); + } + + idx = nhg_priv_ret->nhg_idx; + ret = bitmask_free_idx(&ctl->gr_idx_head, idx); + nhg_priv_ret->nhg_idx = 0; + nhg_priv_ret->nh_control = NULL; + + NHOPS_WUNLOCK(ctl); + + return (nhg_priv_ret); +} + +/* + * Resizes the nexthop group hash table and/or the group index bitmask. + * + * @new_nh_buckets and @new_idx_items are the requested new sizes; 0 means + * the corresponding structure is left untouched. + * New storage is allocated with M_NOWAIT before the nhops write lock is + * taken; if nothing could be allocated the function returns silently. + */ +__noinline static void +consider_resize(struct nh_control *ctl, uint32_t new_nh_buckets, uint32_t new_idx_items) +{ + void *nh_ptr, *nh_idx_ptr; + void *old_idx_ptr; + size_t alloc_size; + + nh_ptr = NULL ; + if (new_nh_buckets != 0) { + alloc_size = CHT_SLIST_GET_RESIZE_SIZE(new_nh_buckets); + nh_ptr = malloc(alloc_size, M_NHOP, M_NOWAIT | M_ZERO); + } + + nh_idx_ptr = NULL; + if (new_idx_items != 0) { + alloc_size = bitmask_get_size(new_idx_items); + nh_idx_ptr = malloc(alloc_size, M_NHOP, M_NOWAIT | M_ZERO); + } + + if (nh_ptr == NULL && nh_idx_ptr == NULL) { + /* Either resize is not required or allocations have failed. */ + return; + } + + DPRINTF("mp: going to resize: nh:[ptr:%p sz:%u] idx:[ptr:%p sz:%u]", + nh_ptr, new_nh_buckets, nh_idx_ptr, new_idx_items); + + old_idx_ptr = NULL; + + NHOPS_WLOCK(ctl); + if (nh_ptr != NULL) { + CHT_SLIST_RESIZE(&ctl->gr_head, mpath, nh_ptr, new_nh_buckets); + } + if (nh_idx_ptr != NULL) { + /* + * Install the new bitmask into the group index it was copied + * from (gr_idx_head), not into the nexthop index. + * NOTE(review): confirm the bitmask_copy() return convention. + */ + if (bitmask_copy(&ctl->gr_idx_head, nh_idx_ptr, new_idx_items)) { + bitmask_swap(&ctl->gr_idx_head, nh_idx_ptr, new_idx_items, &old_idx_ptr); + /* bitmask_swap() is expected to take over the new buffer */ + nh_idx_ptr = NULL; + } + } + NHOPS_WUNLOCK(ctl); + + /* nh_ptr presumably holds the old hash storage after the resize - TODO confirm */ + if (nh_ptr != NULL) + free(nh_ptr, M_NHOP); + if (old_idx_ptr != NULL) + free(old_idx_ptr, M_NHOP); + /* Release the new bitmask if it has not been installed */ + if (nh_idx_ptr != NULL) + free(nh_idx_ptr, M_NHOP); +} + +/* + * Function allocating the necessary group data structures. 
+ */ +bool +nhgrp_ctl_alloc_default(struct nh_control *ctl, int malloc_flags) +{ + size_t alloc_size; + uint32_t num_buckets, num_items; + void *cht_ptr, *mask_ptr; + + malloc_flags = (malloc_flags & (M_NOWAIT | M_WAITOK)) | M_ZERO; + + num_buckets = 8; + alloc_size = CHT_SLIST_GET_RESIZE_SIZE(num_buckets); + cht_ptr = malloc(alloc_size, M_NHOP, malloc_flags); + + if (cht_ptr == NULL) { + DPRINTF("mpath init failed"); + return (false); + } + + /* + * Allocate nexthop index bitmask. + */ + num_items = 128; + mask_ptr = malloc(bitmask_get_size(num_items), M_NHOP, malloc_flags); + if (mask_ptr == NULL) { + DPRINTF("mpath bitmask init failed"); + free(cht_ptr, M_NHOP); + return (false); + } + + NHOPS_WLOCK(ctl); + + if (ctl->gr_head.hash_size == 0) { + /* Init hash and bitmask */ + CHT_SLIST_INIT(&ctl->gr_head, cht_ptr, num_buckets); + bitmask_init(&ctl->gr_idx_head, mask_ptr, num_items); + NHOPS_WUNLOCK(ctl); + } else { + /* Other thread has already initiliazed hash/bitmask */ + NHOPS_WUNLOCK(ctl); + free(cht_ptr, M_NHOP); + free(mask_ptr, M_NHOP); + } + + DPRINTF("mpath init done for fib/af %d/%d", ctl->rh->rib_fibnum, + ctl->rh->rib_family); + + return (true); +} + +int +nhgrp_ctl_init(struct nh_control *ctl) +{ + + /* + * By default, do not allocate datastructures as multipath + * routes will not be necessarily used. 
+ */ + CHT_SLIST_INIT(&ctl->gr_head, NULL, 0); + bitmask_init(&ctl->gr_idx_head, NULL, 0); + return (0); +} + +void +nhgrp_ctl_free(struct nh_control *ctl) +{ + + if (ctl->gr_head.ptr != NULL) + free(ctl->gr_head.ptr, M_NHOP); + if (ctl->gr_idx_head.idx != NULL) + free(ctl->gr_idx_head.idx, M_NHOP); +} + +void +nhgrp_ctl_unlink_all(struct nh_control *ctl) +{ + struct nhgrp_priv *nhg_priv; + + NHOPS_WLOCK_ASSERT(ctl); + + CHT_SLIST_FOREACH(&ctl->gr_head, mpath, nhg_priv) { + DPRINTF("Marking nhgrp %u unlinked", nhg_priv->nhg_idx); + refcount_release(&nhg_priv->nhg_linked); + } CHT_SLIST_FOREACH_END; +} + Index: sys/net/route/nhgrp_ctl.c =================================================================== --- /dev/null +++ sys/net/route/nhgrp_ctl.c @@ -0,0 +1,780 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2020 Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include "opt_inet.h" +#include "opt_route.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include + +/* + * This file contains the supporting functions for creating multipath groups + * and compiling their dataplane parts. + */ + +/* MPF_MULTIPATH must be the same as NHF_MULTIPATH for nhop selection to work */ +CTASSERT(MPF_MULTIPATH == NHF_MULTIPATH); +/* Offset and size of flags field has to be the same for nhop/nhop groups */ +CHK_STRUCT_FIELD_GENERIC(struct nhop_object, nh_flags, struct nhgrp_object, nhg_flags); +/* Cap multipath to 64, as the larger values would break rib_cmd_info bmasks */ +CTASSERT(RIB_MAX_MPATH_WIDTH <= 64); + +static int wn_cmp(const void *a, const void *b); +static void sort_weightened_nhops(struct weightened_nhop *wn, int num_nhops); + +static struct nhgrp_priv *get_nhgrp(struct nh_control *ctl, + struct weightened_nhop *wn, int num_nhops, int *perror); +static void destroy_nhgrp(struct nhgrp_priv *nhg_priv); +static void destroy_nhgrp_epoch(epoch_context_t ctx); +static void free_nhgrp_nhops(struct nhgrp_priv *nhg_priv); + + +static int +wn_cmp(const void *a, const void *b) +{ + const struct weightened_nhop *wa = a; + const struct weightened_nhop *wb = b; + + if (wa->weight > 
wb->weight) + return (1); + else if (wa->weight < wb->weight) + return (-1); + + /* Compare nexthops by pointer */ + if (wa->nh > wb->nh) + return (1); + else if (wa->nh < wb->nh) + return (-1); + else + return (0); +} + +/* + * Perform in-place sorting for array of nexthops in @wn. + * + * To avoid nh groups duplication, nexthops/weights in the + * @wn need to be ordered deterministically. + * As this sorting is needed only for the control plane functionality, + * there are no specific external requirements. + * + * Sort by weight first, to ease calculation of the slot sizes. + */ +static void +sort_weightened_nhops(struct weightened_nhop *wn, int num_nhops) +{ + + qsort(wn, num_nhops, sizeof(struct weightened_nhop), wn_cmp); +} + +/* + * Calculate minimum number of slots required to fit the existing + * set of weights in the common use case where weights are "easily" + * comparable. + * Assumes @wn is sorted by weight ascending and each weight is > 0. + * Returns number of slots or 0 if precise calculation failed. + * + * Some examples: + * note: (i, X) pair means (nhop=i, weight=X): + * (1, 1) (2, 2) -> 3 slots [1, 2, 2] + * (1, 100), (2, 200) -> 3 slots [1, 2, 2] + * (1, 100), (2, 200), (3, 400) -> 7 slots [1, 2, 2, 3, 3, 3, 3] + */ +static uint32_t +calc_min_mpath_slots_fast(const struct weightened_nhop *wn, size_t num_items) +{ + uint32_t i, last, xmin; + uint64_t total = 0; + + last = 0; + xmin = wn[0].weight; + for (i = 0; i < num_items; i++) { + total += wn[i].weight; + if ((wn[i].weight - last < xmin) && (wn[i].weight != last)) + xmin = wn[i].weight - last; + last = wn[i].weight; + } + /* xmin is the minimum unit of desired capacity */ + if ((total % xmin) != 0) + return (0); + for (i = 0; i < num_items; i++) { + if ((wn[i].weight % xmin) != 0) + return (0); + } + + return ((uint32_t)(total / xmin)); +} + +/* + * Calculate minimum number of slots required to fit the existing + * set of weights while maintaining weight coefficients. 
+ * + * Assume @wn is sorted by weight ascending and each weight is > 0. + * + * Tries to find simple precise solution first and falls back to + * RIB_MAX_MPATH_WIDTH in case of any failure. + */ +static uint32_t +calc_min_mpath_slots(const struct weightened_nhop *wn, size_t num_items) +{ + uint32_t v; + + v = calc_min_mpath_slots_fast(wn, num_items); + if ((v == 0) || (v > RIB_MAX_MPATH_WIDTH)) + v = RIB_MAX_MPATH_WIDTH; + + return (v); +} + +/* + * Nexthop group data consists of + * 1) dataplane part, with nhgrp_object as a header followed by an + * arbitrary number of nexthop pointers. + * 2) control plane part, with nhgrp_priv as a header, followed by + * an arbirtrary number of 'struct weightened_nhop' object. + * + * Given nexthop groups are (mostly) immutable, allocate all data + * in one go. + * + */ +__noinline static size_t +get_nhgrp_alloc_size(uint32_t nhg_size, uint32_t num_nhops) +{ + size_t sz; + + sz = sizeof(struct nhgrp_object); + sz += nhg_size * sizeof(struct nhop_object *); + sz += sizeof(struct nhgrp_priv); + sz += num_nhops * sizeof(struct weightened_nhop); + return (sz); +} + +/* + * Compile actual list of nexthops to be used by datapath from + * the nexthop group @dst. 
+ * + * For example, compiling control plane list of 2 nexthops + * [(200, A), (100, B)] would result in the datapath array + * [A, A, B] + */ +static void +compile_nhgrp(struct nhgrp_priv *dst_priv, const struct weightened_nhop *x, + uint32_t num_slots) +{ + struct nhgrp_object *dst; + int i, slot_idx, remaining_slots; + uint64_t remaining_sum, nh_weight, nh_slots; + + slot_idx = 0; + dst = dst_priv->nhg; + /* Calculate sum of all weights */ + remaining_sum = 0; + for (i = 0; i < dst_priv->nhg_nh_count; i++) + remaining_sum += x[i].weight; + remaining_slots = num_slots; + DPRINTF("O: %u/%u", (uint32_t)remaining_sum, remaining_slots); + for (i = 0; i < dst_priv->nhg_nh_count; i++) { + /* Calculate number of slots for the current nexthop */ + if (remaining_sum > 0) { + nh_weight = (uint64_t)x[i].weight; + nh_slots = (nh_weight * remaining_slots / remaining_sum); + } else + nh_slots = 0; + + remaining_sum -= x[i].weight; + remaining_slots -= nh_slots; + + DPRINTF(" OO[%d]: %u/%u curr=%d slot_idx=%d", i, + (uint32_t)remaining_sum, remaining_slots, + (int)nh_slots, slot_idx); + + while (nh_slots-- > 0) + dst->nhops[slot_idx++] = x[i].nh; + } +} + +/* + * Allocates new nexthop group for the list of weightened nexthops. + * Assume sorted list. + * Does NOT reference any nexthops in the group. + * Returns group with refcount=1 or NULL. 
+ */ +static struct nhgrp_priv * +alloc_nhgrp(struct weightened_nhop *wn, int num_nhops) +{ + uint32_t nhgrp_size; + int flags = M_NOWAIT; + struct nhgrp_object *nhg; + struct nhgrp_priv *nhg_priv; + + nhgrp_size = calc_min_mpath_slots(wn, num_nhops); + if (nhgrp_size == 0) { + /* Zero weights, abort */ + return (NULL); + } + + size_t sz = get_nhgrp_alloc_size(nhgrp_size, num_nhops); + nhg = malloc(sz, M_NHOP, flags | M_ZERO); + if (nhg == NULL) { + return (NULL); + } + + /* Has to be the first to make NHGRP_PRIV() work */ + nhg->nhg_size = nhgrp_size; + DPRINTF("new mpath group: num_nhops: %u", (uint32_t)nhgrp_size); + nhg->nhg_flags = MPF_MULTIPATH; + + nhg_priv = NHGRP_PRIV(nhg); + nhg_priv->nhg_nh_count = num_nhops; + refcount_init(&nhg_priv->nhg_refcount, 1); + + /* Please see nhgrp_free() comments on the initial value */ + refcount_init(&nhg_priv->nhg_linked, 2); + + nhg_priv->nhg = nhg; + memcpy(&nhg_priv->nhg_nh_weights[0], wn, + num_nhops * sizeof(struct weightened_nhop)); + + compile_nhgrp(nhg_priv, wn, nhg->nhg_size); + + return (nhg_priv); +} + +void +nhgrp_free(struct nhgrp_object *nhg) +{ + struct nhgrp_priv *nhg_priv; + struct nh_control *ctl; + struct epoch_tracker et; + + nhg_priv = NHGRP_PRIV(nhg); + + if (!refcount_release(&nhg_priv->nhg_refcount)) + return; + + /* + * group objects don't have an explicit lock attached to it. + * As groups are reclaimed based on reference count, it is possible + * that some groups will persist after vnet destruction callback + * called. Given that, handle scenario with nhgrp_free_group() being + * called either after or simultaneously with nhgrp_ctl_unlink_all() + * by using another reference counter: nhg_linked. + * + * There are only 2 places, where nhg_linked can be decreased: + * rib destroy (nhgrp_ctl_unlink_all) and this function. + * nhg_link can never be increased. + * + * Hence, use initial value of 2 to make use of + * refcount_release_if_not_last(). 
+ * + * There can be two scenarious when calling this function: + * + * 1) nhg_linked value is 2. This means that either + * nhgrp_ctl_unlink_all() has not been called OR it is running, + * but we are guaranteed that nh_control won't be freed in + * this epoch. Hence, nexthop can be safely unlinked. + * + * 2) nh_linked value is 1. In that case, nhgrp_ctl_unlink_all() + * has been called and nhgrp unlink can be skipped. + */ + + NET_EPOCH_ENTER(et); + if (refcount_release_if_not_last(&nhg_priv->nhg_linked)) { + ctl = nhg_priv->nh_control; + if (unlink_nhgrp(ctl, nhg_priv) == NULL) { + /* Do not try to reclaim */ + DPRINTF("Failed to unlink nexhop group %p", nhg_priv); + NET_EPOCH_EXIT(et); + return; + } + } + NET_EPOCH_EXIT(et); + + epoch_call(net_epoch_preempt, destroy_nhgrp_epoch, + &nhg_priv->nhg_epoch_ctx); +} + +/* + * Destroys all local resources belonging to @nhg_priv. + */ +__noinline static void +destroy_nhgrp_int(struct nhgrp_priv *nhg_priv) +{ + + free(nhg_priv->nhg, M_NHOP); +} + +__noinline static void +destroy_nhgrp(struct nhgrp_priv *nhg_priv) +{ + + KASSERT((nhg_priv->nhg_refcount == 0), ("nhg_refcount != 0")); + + DPRINTF("DEL MPATH %p", nhg_priv); + + KASSERT((nhg_priv->nhg_idx == 0), ("gr_idx != 0")); + + free_nhgrp_nhops(nhg_priv); + + destroy_nhgrp_int(nhg_priv); +} + +/* + * Epoch callback indicating group is safe to destroy + */ +static void +destroy_nhgrp_epoch(epoch_context_t ctx) +{ + struct nhgrp_priv *nhg_priv; + + nhg_priv = __containerof(ctx, struct nhgrp_priv, nhg_epoch_ctx); + + destroy_nhgrp(nhg_priv); +} + +static bool +ref_nhgrp_nhops(struct nhgrp_priv *nhg_priv) +{ + + for (int i = 0; i < nhg_priv->nhg_nh_count; i++) { + if (nhop_try_ref_object(nhg_priv->nhg_nh_weights[i].nh) != 0) + continue; + + /* + * Failed to ref the nexthop, b/c it's deleted. + * Need to rollback references back. 
+ */ + for (int j = 0; j < i; j++) + nhop_free(nhg_priv->nhg_nh_weights[j].nh); + return (false); + } + + return (true); +} + +static void +free_nhgrp_nhops(struct nhgrp_priv *nhg_priv) +{ + + for (int i = 0; i < nhg_priv->nhg_nh_count; i++) + nhop_free(nhg_priv->nhg_nh_weights[i].nh); +} + +/* + * Creates or looks up an existing nexthop group based on @wn and @num_nhops. + * + * Returns referenced nhop group or NULL, passing error code in @perror. + */ +struct nhgrp_priv * +get_nhgrp(struct nh_control *ctl, struct weightened_nhop *wn, int num_nhops, + int *perror) +{ + struct nhgrp_priv *key, *nhg_priv; + + if (ctl->gr_head.hash_size == 0) { + /* First multipath request. Bootstrap mpath datastructures. */ + if (nhgrp_ctl_alloc_default(ctl, M_NOWAIT) == 0) { + *perror = ENOMEM; + return (NULL); + } + } + + /* Sort nexthops & check there are no duplicates */ + sort_weightened_nhops(wn, num_nhops); + uint32_t last_id = 0; + for (int i = 0; i < num_nhops; i++) { + if (wn[i].nh->nh_priv->nh_idx == last_id) { + *perror = EEXIST; + return (NULL); + } + last_id = wn[i].nh->nh_priv->nh_idx; + } + + if ((key = alloc_nhgrp(wn, num_nhops)) == NULL) { + *perror = ENOMEM; + return (NULL); + } + + nhg_priv = find_nhgrp(ctl, key); + if (nhg_priv != NULL) { + /* + * Free originally-created group. As it hasn't been linked + * and the dependent nexhops haven't been referenced, just free + * the group. + */ + destroy_nhgrp_int(key); + *perror = 0; + return (nhg_priv); + } else { + /* No existing group, try to link the new one */ + if (!ref_nhgrp_nhops(key)) { + /* + * Some of the nexthops have been scheduled for deletion. + * As the group hasn't been linked / no nexhops have been + * referenced, call the final destructor immediately. + */ + destroy_nhgrp_int(key); + *perror = EAGAIN; + return (NULL); + } + if (link_nhgrp(ctl, key) == 0) { + /* Unable to allocate index? 
*/ + *perror = EAGAIN; + /* + * The group has not been linked and still holds its + * initial reference, so destroy_nhgrp() assertions would + * not hold. Drop the nexthop references taken above, + * free the memory directly and report the failure + * instead of returning a pointer to the freed group. + */ + free_nhgrp_nhops(key); + destroy_nhgrp_int(key); + return (NULL); + } + *perror = 0; + return (key); + } + + /* NOTREACHED */ +} + +/* + * Appends one or more nexthops denoted by @wn to the nexthop group @gr_orig. + * + * Returns referenced nexthop group or NULL. In the latter case, @perror is + * filled with an error code. + * Note that function does NOT care if the new nexthops already exist + * in the @gr_orig. As a result, they will be added, resulting in the + * same nexthop being present multiple times in the new group. + */ +static struct nhgrp_priv * +append_nhops(struct nh_control *ctl, const struct nhgrp_object *gr_orig, + struct weightened_nhop *wn, int num_nhops, int *perror) +{ + char storage[64]; + struct weightened_nhop *pnhops; + struct nhgrp_priv *nhg_priv; + const struct nhgrp_priv *src_priv; + size_t sz; + int curr_nhops; + + src_priv = NHGRP_PRIV_CONST(gr_orig); + curr_nhops = src_priv->nhg_nh_count; + + *perror = 0; + + sz = (src_priv->nhg_nh_count + num_nhops) * (sizeof(struct weightened_nhop)); + /* optimize for <= 4 paths, each path=16 bytes */ + if (sz <= sizeof(storage)) + pnhops = (struct weightened_nhop *)&storage[0]; + else { + pnhops = malloc(sz, M_TEMP, M_NOWAIT); + if (pnhops == NULL) { + *perror = ENOMEM; + return (NULL); + } + } + + /* Copy nhops from original group first */ + memcpy(pnhops, src_priv->nhg_nh_weights, + curr_nhops * sizeof(struct weightened_nhop)); + memcpy(&pnhops[curr_nhops], wn, num_nhops * sizeof(struct weightened_nhop)); + curr_nhops += num_nhops; + + nhg_priv = get_nhgrp(ctl, pnhops, curr_nhops, perror); + + if (pnhops != (struct weightened_nhop *)&storage[0]) + free(pnhops, M_TEMP); + + if (nhg_priv == NULL) + return (NULL); + + return (nhg_priv); +} + + +/* + * Creates/finds nexthop group based on @wn and @num_nhops. + * Returns referenced group or NULL, with an error in @perror. + * + * If the error is EAGAIN, then the operation can be retried. 
+ */ +int +nhgrp_get_group(struct rib_head *rh, struct weightened_nhop *wn, int num_nhops, + struct route_nhop_data *rnd) +{ + struct nh_control *ctl = rh->nh_control; + struct nhgrp_priv *nhg_priv; + int error; + + nhg_priv = get_nhgrp(ctl, wn, num_nhops, &error); + if (nhg_priv != NULL) + rnd->rnd_nhgrp = nhg_priv->nhg; + rnd->rnd_weight = 0; + + return (error); +} + +/* + * Creates new nexthop group based on @src group with the nexthops defined in bitmask + * @nhop_mask removed. + * Returns referenced nexthop group or NULL on failure. + */ +int +nhgrp_get_filtered_group(struct rib_head *rh, const struct nhgrp_object *src, + nhgrp_filter_cb_t flt_func, void *flt_data, struct route_nhop_data *rnd) +{ + char storage[64]; + struct nh_control *ctl = rh->nh_control; + struct weightened_nhop *pnhops; + const struct nhgrp_priv *mp_priv, *src_priv; + size_t sz; + int error, i, num_nhops; + + src_priv = NHGRP_PRIV_CONST(src); + + sz = src_priv->nhg_nh_count * (sizeof(struct weightened_nhop)); + /* optimize for <= 4 paths, each path=16 bytes */ + if (sz <= sizeof(storage)) + pnhops = (struct weightened_nhop *)&storage[0]; + else { + if ((pnhops = malloc(sz, M_TEMP, M_NOWAIT)) == NULL) + return (ENOMEM); + } + + /* Filter nexthops */ + error = 0; + num_nhops = 0; + for (i = 0; i < src_priv->nhg_nh_count; i++) { + if (flt_func(src_priv->nhg_nh_weights[i].nh, flt_data)) + continue; + memcpy(&pnhops[num_nhops++], &src_priv->nhg_nh_weights[i], + sizeof(struct weightened_nhop)); + } + + if (num_nhops == 0) { + rnd->rnd_nhgrp = NULL; + rnd->rnd_weight = 0; + } else if (num_nhops == 1) { + rnd->rnd_nhop = pnhops[0].nh; + rnd->rnd_weight = pnhops[0].weight; + if (nhop_try_ref_object(rnd->rnd_nhop) == 0) + error = EAGAIN; + } else { + mp_priv = get_nhgrp(ctl, pnhops, num_nhops, &error); + if (mp_priv != NULL) + rnd->rnd_nhgrp = mp_priv->nhg; + rnd->rnd_weight = 0; + } + + if (pnhops != (struct weightened_nhop *)&storage[0]) + free(pnhops, M_TEMP); + + return (error); +} + +/* + * 
Creates new multipath group based on existing group/nhop in @rnd_orig and + * to-be-added nhop @wn_add. + * Returns 0 on success and stores result in @rnd_new. + */ +int +nhgrp_get_addition_group(struct rib_head *rh, struct route_nhop_data *rnd_orig, + struct route_nhop_data *rnd_add, struct route_nhop_data *rnd_new) +{ + struct nh_control *ctl = rh->nh_control; + struct nhgrp_priv *nhg_priv; + struct weightened_nhop wn[2]; + int error; + + if (rnd_orig->rnd_nhop == NULL) { + /* No paths to add to, just reference current nhop */ + *rnd_new = *rnd_add; + if (nhop_try_ref_object(rnd_new->rnd_nhop) == 0) + return (EAGAIN); + return (0); + } + + wn[0].nh = rnd_add->rnd_nhop; + wn[0].weight = rnd_add->rnd_weight; + + if (!NH_IS_NHGRP(rnd_orig->rnd_nhop)) { + /* Simple merge of 2 non-multipath nexthops */ + wn[1].nh = rnd_orig->rnd_nhop; + wn[1].weight = rnd_orig->rnd_weight; + nhg_priv = get_nhgrp(ctl, wn, 2, &error); + } else { + /* Get new nhop group with @rt->rt_nhop as an additional nhop */ + nhg_priv = append_nhops(ctl, rnd_orig->rnd_nhgrp, &wn[0], 1, + &error); + } + + if (nhg_priv == NULL) + return (error); + rnd_new->rnd_nhgrp = nhg_priv->nhg; + rnd_new->rnd_weight = 0; + + return (0); +} + +/* + * Returns pointer to array of nexthops with weights for + * given @nhg. Stores number of items in the array into @pnum_nhops. 
+ */
+struct weightened_nhop *
+nhgrp_get_nhops(struct nhgrp_object *nhg, uint32_t *pnum_nhops)
+{
+	struct nhgrp_priv *nhg_priv;
+
+	KASSERT(((nhg->nhg_flags & MPF_MULTIPATH) != 0), ("nhop is not mpath"));
+
+	nhg_priv = NHGRP_PRIV(nhg);
+	*pnum_nhops = nhg_priv->nhg_nh_count;
+
+	return (nhg_priv->nhg_nh_weights);
+}
+
+__noinline static int
+dump_nhgrp_entry(struct rib_head *rh, const struct nhgrp_priv *nhg_priv,
+    char *buffer, size_t buffer_size, struct sysctl_req *w)
+{
+	struct rt_msghdr *rtm;
+	struct nhgrp_external *nhge;
+	struct nhgrp_container *nhgc;
+	const struct nhgrp_object *nhg;
+	struct nhgrp_nhop_external *ext;
+	int error;
+	size_t sz;
+
+	nhg = nhg_priv->nhg;
+
+	sz = sizeof(struct rt_msghdr) + sizeof(struct nhgrp_external);
+	/* control plane container: one item per weighted nexthop */
+	sz += sizeof(struct nhgrp_container);
+	sz += sizeof(struct nhgrp_nhop_external) * nhg_priv->nhg_nh_count;
+	/* dataplane container: one item per nhops[] slot */
+	sz += sizeof(struct nhgrp_container);
+	sz += sizeof(struct nhgrp_nhop_external) * nhg->nhg_size;
+
+	KASSERT(sz <= buffer_size, ("increase nhgrp buffer size"));
+
+	bzero(buffer, sz);
+
+	rtm = (struct rt_msghdr *)buffer;
+	rtm->rtm_msglen = sz;
+	rtm->rtm_version = RTM_VERSION;
+	rtm->rtm_type = RTM_GET;
+
+	nhge = (struct nhgrp_external *)(rtm + 1);
+
+	nhge->nhg_idx = nhg_priv->nhg_idx;
+	nhge->nhg_refcount = nhg_priv->nhg_refcount;
+
+	/* fill in control plane nexthops first */
+	nhgc = (struct nhgrp_container *)(nhge + 1);
+	nhgc->nhgc_type = NHG_C_TYPE_CNHOPS;
+	nhgc->nhgc_subtype = 0;
+	nhgc->nhgc_len = sizeof(struct nhgrp_container);
+	nhgc->nhgc_len += sizeof(struct nhgrp_nhop_external) * nhg_priv->nhg_nh_count;
+	nhgc->nhgc_count = nhg_priv->nhg_nh_count;
+
+	ext = (struct nhgrp_nhop_external *)(nhgc + 1);
+	for (int i = 0; i < nhg_priv->nhg_nh_count; i++) {
+		ext[i].nh_idx = nhg_priv->nhg_nh_weights[i].nh->nh_priv->nh_idx;
+		ext[i].nh_weight = nhg_priv->nhg_nh_weights[i].weight;
+	}
+
+	/* fill in dataplane nexthops; their weight is reported as 0 */
+	nhgc = (struct nhgrp_container *)(&ext[nhg_priv->nhg_nh_count]);
+	nhgc->nhgc_type = NHG_C_TYPE_DNHOPS;
+	nhgc->nhgc_subtype = 0;
+	nhgc->nhgc_len = sizeof(struct nhgrp_container);
+	nhgc->nhgc_len += sizeof(struct nhgrp_nhop_external) * nhg->nhg_size;
+	nhgc->nhgc_count = nhg->nhg_size;
+
+	ext = (struct nhgrp_nhop_external *)(nhgc + 1);
+	for (int i = 0; i < nhg->nhg_size; i++) {
+		ext[i].nh_idx = nhg->nhops[i]->nh_priv->nh_idx;
+		ext[i].nh_weight = 0;
+	}
+
+	error = SYSCTL_OUT(w, buffer, sz);
+
+	return (error);
+}
+
+int
+nhgrp_dump_sysctl(struct rib_head *rh, struct sysctl_req *w)
+{
+	struct nh_control *ctl = rh->nh_control;
+	struct epoch_tracker et;
+	struct nhgrp_priv *nhg_priv;
+	char *buffer;
+	size_t sz;
+	int error = 0;
+
+	if (ctl->gr_head.items_count == 0)
+		return (0);
+
+	/* Size one buffer for the largest entry: RIB_MAX_MPATH_WIDTH per list */
+	sz = sizeof(struct rt_msghdr) + sizeof(struct nhgrp_external);
+	sz += 2 * sizeof(struct nhgrp_container);
+	sz += 2 * sizeof(struct nhgrp_nhop_external) * RIB_MAX_MPATH_WIDTH;
+	buffer = malloc(sz, M_TEMP, M_WAITOK);
+
+	NET_EPOCH_ENTER(et);
+	NHOPS_RLOCK(ctl);
+	CHT_SLIST_FOREACH(&ctl->gr_head, mpath, nhg_priv) {
+		error = dump_nhgrp_entry(rh, nhg_priv, buffer, sz, w);
+		if (error != 0)
+			break;
+	} CHT_SLIST_FOREACH_END;
+	NHOPS_RUNLOCK(ctl);
+	NET_EPOCH_EXIT(et);
+
+	free(buffer, M_TEMP);
+
+	return (error);
+}
Index: sys/net/route/nhgrp_var.h
===================================================================
--- /dev/null
+++ sys/net/route/nhgrp_var.h
@@ -0,0 +1,72 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2020 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2.
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * This header file contains private definitions for the nexthop groups. + * + * Header is not intended to be included by the code external to the + * routing subsystem. 
+ */ + +#ifndef _NET_ROUTE_NHGRP_VAR_H_ +#define _NET_ROUTE_NHGRP_VAR_H_ + +/* nhgrp hash definition */ +/* produce hash value for an object */ +#define mpath_hash_obj(_obj) (hash_nhgrp(_obj)) +/* compare two objects */ +#define mpath_cmp(_one, _two) (cmp_nhgrp(_one, _two)) +/* next object accessor */ +#define mpath_next(_obj) (_obj)->nhg_priv_next + +struct nhgrp_priv { + uint32_t nhg_idx; + uint8_t nhg_nh_count; /* number of items in nh_weights */ + uint8_t nhg_spare[3]; + u_int nhg_refcount; /* use refcount */ + u_int nhg_linked; /* refcount(9), == 2 if linked to the list */ + struct nh_control *nh_control; /* parent control structure */ + struct nhgrp_priv *nhg_priv_next; + struct nhgrp_object *nhg; + struct epoch_context nhg_epoch_ctx; /* epoch data for nhop */ + struct weightened_nhop nhg_nh_weights[0]; +}; + +#define _NHGRP_PRIV(_src) (&(_src)->nhops[(_src)->nhg_size]) +#define NHGRP_PRIV(_src) ((struct nhgrp_priv *)_NHGRP_PRIV(_src)) +#define NHGRP_PRIV_CONST(_src) ((const struct nhgrp_priv *)_NHGRP_PRIV(_src)) + +/* nhgrp.c */ +bool nhgrp_ctl_alloc_default(struct nh_control *ctl, int malloc_flags); +struct nhgrp_priv *find_nhgrp(struct nh_control *ctl, const struct nhgrp_priv *key); +int link_nhgrp(struct nh_control *ctl, struct nhgrp_priv *grp_priv); +struct nhgrp_priv *unlink_nhgrp(struct nh_control *ctl, struct nhgrp_priv *key); + +#endif + Index: sys/net/route/nhop.h =================================================================== --- sys/net/route/nhop.h +++ sys/net/route/nhop.h @@ -155,7 +155,7 @@ */ #define NH_IS_VALID(_nh) RT_LINK_IS_UP((_nh)->nh_ifp) -#define NH_IS_MULTIPATH(_nh) ((_nh)->nh_flags & NHF_MULTIPATH) +#define NH_IS_NHGRP(_nh) ((_nh)->nh_flags & NHF_MULTIPATH) #define RT_GATEWAY(_rt) ((struct sockaddr *)&(_rt)->rt_nhop->gw4_sa) #define RT_GATEWAY_CONST(_rt) ((const struct sockaddr *)&(_rt)->rt_nhop->gw4_sa) @@ -166,6 +166,11 @@ _nh = NULL; \ } while (0) +struct weightened_nhop { + struct nhop_object *nh; + uint32_t weight; +}; + 
void nhop_free(struct nhop_object *nh); struct sysctl_req; @@ -209,16 +214,34 @@ uint16_t src_sa_off; /* offset of src address SA */ }; -struct mpath_nhop_external { +#define NHG_C_TYPE_CNHOPS 0x1 /* Control plane nhops list */ +#define NHG_C_TYPE_DNHOPS 0x2 /* Dataplane nhops list */ +struct nhgrp_container { + uint32_t nhgc_len; /* container length */ + uint16_t nhgc_count; /* number of items */ + uint8_t nhgc_type; /* container type */ + uint8_t nhgc_subtype; /* container subtype */ +}; + +struct nhgrp_nhop_external { uint32_t nh_idx; uint32_t nh_weight; }; -struct mpath_external { - uint32_t mp_idx; - uint32_t mp_refcount; - uint32_t mp_nh_count; - uint32_t mp_group_size; +/* + * Layout: + * - nhgrp_external + * - nhgrp_container (control plane nhops list) + * - nhgrp_nhop_external + * - nhgrp_nhop_external + * .. + * - nhgrp_container (dataplane nhops list) + * - nhgrp_nhop_external + * - nhgrp_nhop_external + */ +struct nhgrp_external { + uint32_t nhg_idx; /* Nexthop group index */ + uint32_t nhg_refcount; /* number of references */ }; #endif Index: sys/net/route/nhop.c =================================================================== --- sys/net/route/nhop.c +++ sys/net/route/nhop.c @@ -64,7 +64,7 @@ * is backed by the bitmask array. 
*/ -static MALLOC_DEFINE(M_NHOP, "nhops", "nexthops data"); +MALLOC_DEFINE(M_NHOP, "nhops", "nexthops data"); /* Hash management functions */ @@ -112,6 +112,9 @@ NHOPS_LOCK_DESTROY(ctl); free(ctl->nh_head.ptr, M_NHOP); free(ctl->nh_idx_head.idx, M_NHOP); +#ifdef ROUTE_MPATH + nhgrp_ctl_free(ctl); +#endif free(ctl, M_NHOP); } @@ -154,6 +157,9 @@ DPRINTF("Marking nhop %u unlinked", nh_priv->nh_idx); refcount_release(&nh_priv->nh_linked); } CHT_SLIST_FOREACH_END; +#ifdef ROUTE_MPATH + nhgrp_ctl_unlink_all(ctl); +#endif NHOPS_WUNLOCK(ctl); /* Index: sys/net/route/nhop_ctl.c =================================================================== --- sys/net/route/nhop_ctl.c +++ sys/net/route/nhop_ctl.c @@ -695,7 +695,14 @@ nhop_free_any(struct nhop_object *nh) { +#ifdef ROUTE_MPATH + if (!NH_IS_NHGRP(nh)) + nhop_free(nh); + else + nhgrp_free((struct nhgrp_object *)nh); +#else nhop_free(nh); +#endif } /* Helper functions */ Index: sys/net/route/nhop_var.h =================================================================== --- sys/net/route/nhop_var.h +++ sys/net/route/nhop_var.h @@ -37,6 +37,8 @@ #ifndef _NET_ROUTE_NHOP_VAR_H_ #define _NET_ROUTE_NHOP_VAR_H_ +MALLOC_DECLARE(M_NHOP); + /* define nhop hash table */ struct nhop_priv; CHT_SLIST_DEFINE(nhops, struct nhop_priv); @@ -47,9 +49,15 @@ /* next object accessor */ #define nhops_next(_obj) (_obj)->nh_next +/* define multipath hash table */ +struct nhgrp_priv; +CHT_SLIST_DEFINE(nhgroups, struct nhgrp_priv); + struct nh_control { struct nhops_head nh_head; /* hash table head */ struct bitmask_head nh_idx_head; /* nhop index head */ + struct nhgroups_head gr_head; /* nhgrp hash table head */ + struct bitmask_head gr_idx_head; /* nhgrp index head */ struct rwlock ctl_lock; /* overall ctl lock */ struct rib_head *ctl_rh; /* pointer back to rnh */ struct epoch_context ctl_epoch_ctx; /* epoch ctl helper */ @@ -80,7 +88,8 @@ struct epoch_context nh_epoch_ctx; /* epoch data for nhop */ }; -#define NH_IS_PINNED(_nh) 
((_nh)->nh_priv->rt_flags & RTF_PINNED) +#define NH_IS_PINNED(_nh) ((!NH_IS_NHGRP(_nh)) && \ + ((_nh)->nh_priv->rt_flags & RTF_PINNED)) /* nhop.c */ struct nhop_priv *find_nhop(struct nh_control *ctl, Index: sys/net/route/route_ctl.h =================================================================== --- sys/net/route/route_ctl.h +++ sys/net/route/route_ctl.h @@ -53,6 +53,10 @@ int rib_action(uint32_t fibnum, int action, struct rt_addrinfo *info, struct rib_cmd_info *rc); +typedef void route_notification_t(struct rib_cmd_info *rc, void *); +void rib_decompose_notification(struct rib_cmd_info *rc, + route_notification_t *cb, void *cbdata); + int rib_add_redirect(u_int fibnum, struct sockaddr *dst, struct sockaddr *gateway, struct sockaddr *author, struct ifnet *ifp, int flags, int expire_sec); @@ -66,6 +70,20 @@ void rt_foreach_fib_walk(int af, rt_setwarg_t *, rt_walktree_f_t *, void *); void rt_foreach_fib_walk_del(int af, rt_filter_f_t *filter_f, void *arg); +struct route_nhop_data; +const struct rtentry *rib_lookup_prefix(uint32_t fibnum, int family, + const struct sockaddr *dst, const struct sockaddr *netmask, + struct route_nhop_data *rnd); +const struct rtentry *rib_lookup_lpm(uint32_t fibnum, int family, + const struct sockaddr *dst, struct route_nhop_data *rnd); + +/* Multipath */ +struct nhgrp_object; +struct weightened_nhop; + +struct weightened_nhop *nhgrp_get_nhops(struct nhgrp_object *nhg, + uint32_t *pnum_nhops); + enum rib_subscription_type { RIB_NOTIFY_IMMEDIATE, RIB_NOTIFY_DELAYED Index: sys/net/route/route_ctl.c =================================================================== --- sys/net/route/route_ctl.c +++ sys/net/route/route_ctl.c @@ -29,7 +29,7 @@ __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" -#include "opt_mpath.h" +#include "opt_route.h" #include #include @@ -83,9 +83,6 @@ struct rib_cmd_info *rc); static int change_route(struct rib_head *rnh, struct rt_addrinfo *info, struct route_nhop_data *nhd_orig, struct 
rib_cmd_info *rc); -static int change_route_nhop(struct rib_head *rnh, struct rtentry *rt, - struct rt_addrinfo *info, struct route_nhop_data *rnd, - struct rib_cmd_info *rc); static int rt_unlinkrte(struct rib_head *rnh, struct rt_addrinfo *info, struct rib_cmd_info *rc); @@ -94,6 +91,21 @@ struct rib_cmd_info *rc); static void destroy_subscription_epoch(epoch_context_t ctx); +static bool rib_can_multipath(struct rib_head *rh); + +/* Per-vnet multipath routing configuration */ +SYSCTL_DECL(_net_route); +#define V_rib_route_multipath VNET(rib_route_multipath) +#ifdef ROUTE_MPATH +VNET_DEFINE(u_int, rib_route_multipath) = 1; +#define _MP_FLAGS CTLFLAG_RW +#else +VNET_DEFINE(u_int, rib_route_multipath) = 0; +#define _MP_FLAGS CTLFLAG_RD +#endif +SYSCTL_UINT(_net_route, OID_AUTO, multipath, _MP_FLAGS | CTLFLAG_VNET, + &VNET_NAME(rib_route_multipath), 0, "Enable route multipath"); +#undef _MP_FLAGS /* Routing table UMA zone */ VNET_DEFINE_STATIC(uma_zone_t, rtzone); @@ -128,7 +140,7 @@ CURVNET_SET(nhop_get_vnet(rt->rt_nhop)); /* Unreference nexthop */ - nhop_free(rt->rt_nhop); + nhop_free_any(rt->rt_nhop); uma_zfree(V_rtzone, rt); @@ -175,6 +187,41 @@ return (rnh); } +#ifdef ROUTE_MPATH +static bool +rib_can_multipath(struct rib_head *rh) +{ + int result; + + CURVNET_SET(rh->rib_vnet); + result = !!V_rib_route_multipath; + CURVNET_RESTORE(); + + return (result); +} + +/* + * Check is nhop is multipath-eligible. + * Avoid nhops without gateways and redirects. + * + * Returns 1 for multipath-eligible nexthop, + * 0 otherwise. + */ +bool +nhop_can_multipath(const struct nhop_object *nh) +{ + + if ((nh->nh_flags & NHF_MULTIPATH) != 0) + return (1); + if ((nh->nh_flags & NHF_GATEWAY) == 0) + return (0); + if ((nh->nh_flags & NHF_REDIRECT) != 0) + return (0); + + return (1); +} +#endif + static int get_info_weight(const struct rt_addrinfo *info, uint32_t default_weight) { @@ -206,7 +253,7 @@ * * Returns true if matches, false otherwise. 
*/ -static bool +bool match_nhop_gw(const struct nhop_object *nh, const struct sockaddr *gw) { @@ -461,7 +508,7 @@ struct rib_cmd_info *rc) { struct nhop_object *nh_orig; - struct route_nhop_data rnd; + struct route_nhop_data rnd_orig, rnd_add; struct nhop_object *nh; struct rtentry *rt, *rt_orig; int error; @@ -470,32 +517,19 @@ if (error != 0) return (error); - rnd.rnd_nhop = rt->rt_nhop; - rnd.rnd_weight = rt->rt_weight; + rnd_add.rnd_nhop = rt->rt_nhop; + rnd_add.rnd_weight = rt->rt_weight; nh = rt->rt_nhop; RIB_WLOCK(rnh); -#ifdef RADIX_MPATH - struct sockaddr *netmask; - netmask = info->rti_info[RTAX_NETMASK]; - /* do not permit exactly the same dst/mask/gw pair */ - if (rt_mpath_capable(rnh) && - rt_mpath_conflict(rnh, rt, netmask)) { - RIB_WUNLOCK(rnh); - - nhop_free(nh); - uma_zfree(V_rtzone, rt); - return (EEXIST); - } -#endif - error = add_route_nhop(rnh, rt, info, &rnd, rc); + error = add_route_nhop(rnh, rt, info, &rnd_add, rc); if (error == 0) { RIB_WUNLOCK(rnh); return (0); } /* addition failed. Lookup prefix in the rib to determine the cause */ - rt_orig = lookup_prefix(rnh, info, &rnd); + rt_orig = lookup_prefix(rnh, info, &rnd_orig); if (rt_orig == NULL) { /* No prefix -> rnh_addaddr() failed to allocate memory */ RIB_WUNLOCK(rnh); @@ -505,11 +539,11 @@ } /* We have existing route in the RIB. 
*/ - nh_orig = rnd.rnd_nhop; + nh_orig = rnd_orig.rnd_nhop; /* Check if new route has higher preference */ if (can_override_nhop(info, nh_orig) > 0) { /* Update nexthop to the new route */ - change_route_nhop(rnh, rt_orig, info, &rnd, rc); + change_route_nhop(rnh, rt_orig, info, &rnd_add, rc); RIB_WUNLOCK(rnh); uma_zfree(V_rtzone, rt); nhop_free(nh_orig); @@ -518,11 +552,26 @@ RIB_WUNLOCK(rnh); +#ifdef ROUTE_MPATH + if (rib_can_multipath(rnh) && nhop_can_multipath(rnd_add.rnd_nhop) && + nhop_can_multipath(rnd_orig.rnd_nhop)) + error = add_route_mpath(rnh, info, rt, &rnd_add, &rnd_orig, rc); + else +#endif /* Unable to add - another route with the same preference exists */ error = EEXIST; + /* + * ROUTE_MPATH disabled: failed to add route, free both nhop and rt. + * ROUTE_MPATH enabled: original nhop reference is unused in any case, + * free rt only if not _adding_ new route to rib (e.g. the case + * when initial lookup returned existing route, but then it got + * deleted prior to multipath group insertion, leading to a simple + * non-multipath add as a result). + */ nhop_free(nh); - uma_zfree(V_rtzone, rt); + if ((error != 0) || rc->rc_cmd != RTM_ADD) + uma_zfree(V_rtzone, rt); return (error); } @@ -588,7 +637,13 @@ return (ESRCH); nh = rt->rt_nhop; - +#ifdef ROUTE_MPATH + if (NH_IS_NHGRP(nh)) { + error = del_route_mpath(rnh, info, rt, + (struct nhgrp_object *)nh, rc); + return (error); + } +#endif error = check_info_match_nhop(info, rt, nh); if (error != 0) return (error); @@ -600,14 +655,6 @@ * Remove the item from the tree and return it. * Complain if it is not there and do no more processing. 
*/ -#ifdef RADIX_MPATH - info->rti_info[RTAX_GATEWAY] = &nh->gw_sa; - if (rt_mpath_capable(rnh)) { - rn = rt_mpath_unlink(rnh, info, rt, &error); - if (error != 0) - return (error); - } else -#endif rn = rnh->rnh_deladdr(info->rti_info[RTAX_DST], info->rti_info[RTAX_NETMASK], &rnh->head); if (rn == NULL) @@ -648,7 +695,18 @@ * If the caller wants it, then it can have it, * the entry will be deleted after the end of the current epoch. */ - rtfree(rc->rc_rt); + if (rc->rc_cmd == RTM_DELETE) + rtfree(rc->rc_rt); +#ifdef ROUTE_MPATH + else { + /* + * Deleting 1 path may result in RTM_CHANGE to + * a different mpath group/nhop. + * Free old mpath group. + */ + nhop_free_any(rc->rc_nh_old); + } +#endif return (0); } @@ -694,19 +752,6 @@ return (ESRCH); } -#ifdef RADIX_MPATH - /* - * If we got multipath routes, - * we require users to specify a matching RTAX_GATEWAY. - */ - if (rt_mpath_capable(rnh)) { - rt = rt_mpath_matchgate(rt, info->rti_info[RTAX_GATEWAY]); - if (rt == NULL) { - RIB_RUNLOCK(rnh); - return (ESRCH); - } - } -#endif rnd_orig.rnd_nhop = rt->rt_nhop; rnd_orig.rnd_weight = rt->rt_weight; @@ -722,18 +767,11 @@ } static int -change_route(struct rib_head *rnh, struct rt_addrinfo *info, - struct route_nhop_data *rnd_orig, struct rib_cmd_info *rc) +change_nhop(struct rib_head *rnh, struct rt_addrinfo *info, + struct nhop_object *nh_orig, struct nhop_object **nh_new) { - int error = 0; int free_ifa = 0; - struct nhop_object *nh, *nh_orig; - struct route_nhop_data rnd_new; - - nh = NULL; - nh_orig = rnd_orig->rnd_nhop; - if (nh_orig == NULL) - return (ESRCH); + int error; /* * New gateway could require new ifaddr, ifp; @@ -759,24 +797,99 @@ } } - error = nhop_create_from_nhop(rnh, nh_orig, info, &nh); + error = nhop_create_from_nhop(rnh, nh_orig, info, nh_new); if (free_ifa) { ifa_free(info->rti_ifa); info->rti_ifa = NULL; } + + return (error); +} + +#ifdef ROUTE_MPATH +static int +change_mpath_route(struct rib_head *rnh, struct rt_addrinfo *info, + struct 
route_nhop_data *rnd_orig, struct rib_cmd_info *rc)
+{
+	int error = 0;
+	struct nhop_object *nh_orig, *nh_new;
+	struct route_nhop_data rnd_new;
+	struct weightened_nhop *wn, *wn_new;
+	uint32_t num_nhops;
+
+	/*
+	 * Locate the path to change; check_info_match_nhop() returns 0
+	 * (an errno, not a boolean) when the nexthop matches the request.
+	 */
+	wn = nhgrp_get_nhops((struct nhgrp_object *)rnd_orig->rnd_nhop,
+	    &num_nhops);
+	nh_orig = NULL;
+	for (uint32_t i = 0; i < num_nhops; i++) {
+		if (check_info_match_nhop(info, NULL, wn[i].nh) == 0) {
+			nh_orig = wn[i].nh;
+			break;
+		}
+	}
+	if (nh_orig == NULL)
+		return (ESRCH);
+
+	error = change_nhop(rnh, info, nh_orig, &nh_new);
 	if (error != 0)
 		return (error);
-	rnd_new.rnd_nhop = nh;
-	if (info->rti_mflags & RTV_WEIGHT)
-		rnd_new.rnd_weight = info->rti_rmx->rmx_weight;
-	else
-		rnd_new.rnd_weight = rnd_orig->rnd_weight;
+	wn_new = mallocarray(num_nhops, sizeof(struct weightened_nhop),
+	    M_TEMP, M_NOWAIT | M_ZERO);
+	if (wn_new == NULL) {
+		nhop_free(nh_new);
+		return (EAGAIN);
+	}
+	/* Patch the private copy only: @wn is the old group's live array */
+	memcpy(wn_new, wn, num_nhops * sizeof(struct weightened_nhop));
+	for (uint32_t i = 0; i < num_nhops; i++) {
+		if (wn[i].nh == nh_orig) {
+			wn_new[i].nh = nh_new;
+			wn_new[i].weight = get_info_weight(info, wn[i].weight);
+			break;
+		}
+	}
+
+	error = nhgrp_get_group(rnh, wn_new, num_nhops, &rnd_new);
+	nhop_free(nh_new);
+	free(wn_new, M_TEMP);
+
+	if (error != 0)
+		return (error);
 	error = change_route_conditional(rnh, NULL, info, rnd_orig, &rnd_new, rc);
 	return (error);
 }
+#endif
+
+static int
+change_route(struct rib_head *rnh, struct rt_addrinfo *info,
+    struct route_nhop_data *rnd_orig, struct rib_cmd_info *rc)
+{
+	int error = 0;
+	struct nhop_object *nh_orig;
+	struct route_nhop_data rnd_new;
+
+	nh_orig = rnd_orig->rnd_nhop;
+	if (nh_orig == NULL)
+		return (ESRCH);
+#ifdef ROUTE_MPATH
+	/* change_mpath_route() is only compiled into ROUTE_MPATH kernels */
+	if (NH_IS_NHGRP(nh_orig))
+		return (change_mpath_route(rnh, info, rnd_orig, rc));
+#endif
+	rnd_new.rnd_weight = get_info_weight(info, rnd_orig->rnd_weight);
+	error = change_nhop(rnh, info, nh_orig, &rnd_new.rnd_nhop);
+	if (error != 0)
+		return (error);
+
error = change_route_conditional(rnh, NULL, info, rnd_orig, &rnd_new, rc); + + return (error); +} /* * Insert @rt with nhop data from @rnd_new to @rnh. @@ -827,7 +940,7 @@ * Conditionally set rt_expire if set in @info. * Returns 0 on success. */ -static int +int change_route_nhop(struct rib_head *rnh, struct rtentry *rt, struct rt_addrinfo *info, struct route_nhop_data *rnd, struct rib_cmd_info *rc) @@ -855,6 +968,8 @@ rn = rnh->rnh_deladdr(ndst, netmask, &rnh->head); if (rn == NULL) return (ESRCH); + rt = RNTORT(rn); + rt->rte_flags &= ~RTF_UP; } /* Finalize notification */ @@ -989,7 +1104,6 @@ info->rti_info[RTAX_DST] = rt_key(rt); info->rti_info[RTAX_NETMASK] = rt_mask(rt); - info->rti_info[RTAX_GATEWAY] = &rt->rt_nhop->gw_sa; error = rt_unlinkrte(di->rnh, info, &di->rc); @@ -1000,7 +1114,7 @@ * XXX: Delayed notifications not implemented * for nexthop updates. */ - if (error == 0) { + if ((error == 0) && (di->rc.rc_cmd == RTM_DELETE)) { /* Add to the list and return */ rt->rt_chain = di->head; di->head = rt; @@ -1024,6 +1138,7 @@ struct rib_head *rnh; struct rt_delinfo di; struct rtentry *rt; + struct nhop_object *nh; struct epoch_tracker et; rnh = rt_tables_get_rnh(fibnum, family); @@ -1049,18 +1164,31 @@ rt = di.head; di.head = rt->rt_chain; rt->rt_chain = NULL; + nh = rt->rt_nhop; di.rc.rc_rt = rt; - di.rc.rc_nh_old = rt->rt_nhop; + di.rc.rc_nh_old = nh; rib_notify(rnh, RIB_NOTIFY_DELAYED, &di.rc); /* TODO std rt -> rt_addrinfo export */ di.info.rti_info[RTAX_DST] = rt_key(rt); di.info.rti_info[RTAX_NETMASK] = rt_mask(rt); - if (report) - rt_routemsg(RTM_DELETE, rt, rt->rt_nhop->nh_ifp, 0, - fibnum); + if (report) { +#ifdef ROUTE_MPATH + struct nhgrp_object *nhg; + struct weightened_nhop *wn; + uint32_t num_nhops; + if (NH_IS_NHGRP(nh)) { + nhg = (struct nhgrp_object *)nh; + wn = nhgrp_get_nhops(nhg, &num_nhops); + for (int i = 0; i < num_nhops; i++) + rt_routemsg(RTM_DELETE, rt, + wn[i].nh->nh_ifp, 0, fibnum); + } else +#endif + rt_routemsg(RTM_DELETE, rt, 
nh->nh_ifp, 0, fibnum);
+		}
 		rtfree(rt);
 	}
Index: sys/net/route/route_helpers.c
===================================================================
--- sys/net/route/route_helpers.c
+++ sys/net/route/route_helpers.c
@@ -131,3 +131,167 @@
 	return (nh);
 }
+
+#ifdef ROUTE_MPATH
+static void
+decompose_change_notification(struct rib_cmd_info *rc, route_notification_t *cb,
+    void *cbdata)
+{
+	uint32_t num_old, num_new;
+	uint32_t nh_idx_old, nh_idx_new;
+	struct weightened_nhop *wn_old, *wn_new;
+	struct weightened_nhop tmp = { NULL, 0 };
+	uint32_t idx_old = 0, idx_new = 0;
+
+	struct rib_cmd_info rc_del = { .rc_cmd = RTM_DELETE, .rc_rt = rc->rc_rt };
+	struct rib_cmd_info rc_add = { .rc_cmd = RTM_ADD, .rc_rt = rc->rc_rt };
+
+	if (NH_IS_NHGRP(rc->rc_nh_old)) {
+		wn_old = nhgrp_get_nhops((struct nhgrp_object *)rc->rc_nh_old, &num_old);
+	} else {
+		tmp.nh = rc->rc_nh_old;
+		tmp.weight = rc->rc_nh_weight;
+		wn_old = &tmp;
+		num_old = 1;
+	}
+	if (NH_IS_NHGRP(rc->rc_nh_new)) {
+		wn_new = nhgrp_get_nhops((struct nhgrp_object *)rc->rc_nh_new, &num_new);
+	} else {
+		tmp.nh = rc->rc_nh_new;
+		tmp.weight = rc->rc_nh_weight;
+		wn_new = &tmp;
+		num_new = 1;
+	}
+
+	/* Relies on each @wn array being sorted by nexthop index (nh_idx) */
+	/*
+	 * Want to convert into set of add (A) and delete (D) operations
+	 * [1] -> [1, 2] = A{2}
+	 * [2] -> [1, 2] = A{1}
+	 * [1, 2, 4]->[1, 3, 4] = D{2}, A{3}
+	 * [1, 2, 4]->[1, 4] = D{2}
+	 * [1, 2, 4] -> [3, 4] = D{1}, C{2,3} OR C{1,3}, D{2} OR D{1},D{2},A{3}
+	 * [1, 2] -> [3, 4] = D{1}, D{2}, A{3}, A{4}
+	 * NOTE(review): [2] -> [1, 2] is currently emitted as D{2}, A{1}, A{2}.
+	 */
+	idx_old = 0;
+	while ((idx_old < num_old) && (idx_new < num_new)) {
+		nh_idx_old = wn_old[idx_old].nh->nh_priv->nh_idx;
+		nh_idx_new = wn_new[idx_new].nh->nh_priv->nh_idx;
+
+		if (nh_idx_old == nh_idx_new) {
+			if (wn_old[idx_old].weight != wn_new[idx_new].weight) {
+				/* Update weight by providing del/add notifications */
+				rc_del.rc_nh_old = wn_old[idx_old].nh;
+				rc_del.rc_nh_weight = wn_old[idx_old].weight;
+				cb(&rc_del, cbdata);
+
+				rc_add.rc_nh_new = wn_new[idx_new].nh;
+				rc_add.rc_nh_weight = wn_new[idx_new].weight;
+				cb(&rc_add, cbdata);
+			}
+			idx_old++;
+			idx_new++;
+		} else if (nh_idx_old < nh_idx_new) {
+			/*
+			 * [1, ~2~, 4], [1, ~3~, 4]
+			 * [1, ~2~, 5], [1, ~3~, 4]
+			 * [1, ~2~], [1, ~3~, 4]
+			 */
+			if ((idx_old + 1 >= num_old) ||
+			    (wn_old[idx_old + 1].nh->nh_priv->nh_idx > nh_idx_new)) {
+				/* Add new unless the next old item is still <= new */
+				rc_add.rc_nh_new = wn_new[idx_new].nh;
+				rc_add.rc_nh_weight = wn_new[idx_new].weight;
+				cb(&rc_add, cbdata);
+				idx_new++;
+			}
+			/* In any case, delete current old */
+			rc_del.rc_nh_old = wn_old[idx_old].nh;
+			rc_del.rc_nh_weight = wn_old[idx_old].weight;
+			cb(&rc_del, cbdata);
+			idx_old++;
+		} else {
+			/*
+			 * nh_idx_old > nh_idx_new
+			 *
+			 * [1, ~3~, 4], [1, ~2~, 4]
+			 * [1, ~3~, 5], [1, ~2~, 4]
+			 * [1, ~3~, 4], [1, ~2~]
+			 */
+			if ((idx_new + 1 >= num_new) ||
+			    (wn_new[idx_new + 1].nh->nh_priv->nh_idx > nh_idx_old)) {
+				/* No next item or next item is > current one */
+				rc_add.rc_nh_new = wn_new[idx_new].nh;
+				rc_add.rc_nh_weight = wn_new[idx_new].weight;
+				cb(&rc_add, cbdata);
+				idx_new++;
+			}
+			/* In any case, delete current old */
+			rc_del.rc_nh_old = wn_old[idx_old].nh;
+			rc_del.rc_nh_weight = wn_old[idx_old].weight;
+			cb(&rc_del, cbdata);
+			idx_old++;
+		}
+	}
+
+	while (idx_old < num_old) {
+		rc_del.rc_nh_old = wn_old[idx_old].nh;
+		rc_del.rc_nh_weight = wn_old[idx_old].weight;
+		cb(&rc_del, cbdata);
+		idx_old++;
+	}
+
+	while (idx_new < num_new) {
+		rc_add.rc_nh_new = wn_new[idx_new].nh;
+		rc_add.rc_nh_weight = wn_new[idx_new].weight;
+		cb(&rc_add, cbdata);
+		idx_new++;
+	}
+}
+
+/*
+ * Decompose multipath cmd info @rc into a list of add/del/change
+ * single-path operations, calling @cb callback for each operation.
+ * Assumes at least one of the nexthops in @rc is multipath.
+ */ +void +rib_decompose_notification(struct rib_cmd_info *rc, route_notification_t *cb, + void *cbdata) +{ + struct weightened_nhop *wn; + uint32_t num_nhops; + struct rib_cmd_info rc_new; + + rc_new = *rc; + DPRINTF("cb=%p cmd=%d nh_old=%p nh_new=%p", + cb, rc->cmd, rc->nh_old, rc->nh_new); + switch (rc->rc_cmd) { + case RTM_ADD: + if (!NH_IS_NHGRP(rc->rc_nh_new)) + return; + wn = nhgrp_get_nhops((struct nhgrp_object *)rc->rc_nh_new, &num_nhops); + for (uint32_t i = 0; i < num_nhops; i++) { + rc_new.rc_nh_new = wn[i].nh; + rc_new.rc_nh_weight = wn[i].weight; + cb(&rc_new, cbdata); + } + break; + case RTM_DELETE: + if (!NH_IS_NHGRP(rc->rc_nh_old)) + return; + wn = nhgrp_get_nhops((struct nhgrp_object *)rc->rc_nh_old, &num_nhops); + for (uint32_t i = 0; i < num_nhops; i++) { + rc_new.rc_nh_old = wn[i].nh; + rc_new.rc_nh_weight = wn[i].weight; + cb(&rc_new, cbdata); + } + break; + case RTM_CHANGE: + if (!NH_IS_NHGRP(rc->rc_nh_old) && !NH_IS_NHGRP(rc->rc_nh_new)) + return; + decompose_change_notification(rc, cb, cbdata); + break; + } +} +#endif Index: sys/net/route/route_var.h =================================================================== --- sys/net/route/route_var.h +++ sys/net/route/route_var.h @@ -87,6 +87,7 @@ /* Constants */ #define RIB_MAX_RETRIES 3 #define RT_MAXFIBS UINT16_MAX +#define RIB_MAX_MPATH_WIDTH 64 /* Macro for verifying fields in af-specific 'struct route' structures */ #define CHK_STRUCT_FIELD_GENERIC(_s1, _f1, _s2, _f2) \ @@ -113,12 +114,7 @@ "ro_dst and " #_dst_new " are at different offset") struct rib_head *rt_tables_get_rnh(uint32_t table, sa_family_t family); -void rt_mpath_init_rnh(struct rib_head *rnh); int rt_getifa_fib(struct rt_addrinfo *info, u_int fibnum); -#ifdef RADIX_MPATH -struct radix_node *rt_mpath_unlink(struct rib_head *rnh, - struct rt_addrinfo *info, struct rtentry *rto, int *perror); -#endif struct rib_cmd_info; VNET_PCPUSTAT_DECLARE(struct rtstat, rtstat); @@ -202,14 +198,6 @@ /* rtentry rt flag mask */ #define 
RTE_RT_FLAG_MASK (RTF_UP | RTF_HOST) -/* Nexthop selection */ -#define _NH2MP(_nh) ((struct nhgrp_object *)(_nh)) -#define _SELECT_NHOP(_nh, _flowid) \ - (_NH2MP(_nh))->nhops[(_flowid) % (_NH2MP(_nh))->mp_size] -#define _RT_SELECT_NHOP(_nh, _flowid) \ - ((!NH_IS_MULTIPATH(_nh)) ? (_nh) : _SELECT_NHOP(_nh, _flowid)) -#define RT_SELECT_NHOP(_rt, _flowid) _RT_SELECT_NHOP((_rt)->rt_nhop, _flowid) - /* route_temporal.c */ void tmproutes_update(struct rib_head *rnh, struct rtentry *rt); void tmproutes_init(struct rib_head *rh); @@ -217,14 +205,24 @@ /* route_ctl.c */ struct route_nhop_data { - struct nhop_object *rnd_nhop; - uint32_t rnd_weight; + union { + struct nhop_object *rnd_nhop; + struct nhgrp_object *rnd_nhgrp; + }; + uint32_t rnd_weight; }; + +int change_route_nhop(struct rib_head *rnh, struct rtentry *rt, + struct rt_addrinfo *info, struct route_nhop_data *rnd, + struct rib_cmd_info *rc); int change_route_conditional(struct rib_head *rnh, struct rtentry *rt, struct rt_addrinfo *info, struct route_nhop_data *nhd_orig, struct route_nhop_data *nhd_new, struct rib_cmd_info *rc); struct rtentry *lookup_prefix(struct rib_head *rnh, const struct rt_addrinfo *info, struct route_nhop_data *rnd); + +bool nhop_can_multipath(const struct nhop_object *nh); +bool match_nhop_gw(const struct nhop_object *nh, const struct sockaddr *gw); int check_info_match_nhop(const struct rt_addrinfo *info, const struct rtentry *rt, const struct nhop_object *nh); int can_override_nhop(const struct rt_addrinfo *info, @@ -256,5 +254,57 @@ void nhops_update_ifmtu(struct rib_head *rh, struct ifnet *ifp, uint32_t mtu); int nhops_dump_sysctl(struct rib_head *rh, struct sysctl_req *w); +/* MULTIPATH */ +#define MPF_MULTIPATH 0x08 /* need to be consistent with NHF_MULTIPATH */ + +struct nhgrp_object { + uint16_t nhg_flags; /* nexthop group flags */ + uint8_t nhg_size; /* dataplain group size */ + uint8_t spare; + struct nhop_object *nhops[0]; /* nhops */ +}; + +static inline struct nhop_object * 
+nhop_select(struct nhop_object *nh, uint32_t flowid) +{ + +#ifdef ROUTE_MPATH + if (NH_IS_NHGRP(nh)) { + struct nhgrp_object *nhg = (struct nhgrp_object *)nh; + nh = nhg->nhops[flowid % nhg->nhg_size]; + } +#endif + return (nh); +} + + +struct weightened_nhop; + +/* mpath_ctl.c */ +int add_route_mpath(struct rib_head *rnh, struct rt_addrinfo *info, + struct rtentry *rt, struct route_nhop_data *rnd_add, + struct route_nhop_data *rnd_orig, struct rib_cmd_info *rc); +int del_route_mpath(struct rib_head *rh, struct rt_addrinfo *info, + struct rtentry *rt, struct nhgrp_object *nhg, struct rib_cmd_info *rc); + +/* nhgrp.c */ +int nhgrp_ctl_init(struct nh_control *ctl); +void nhgrp_ctl_free(struct nh_control *ctl); +void nhgrp_ctl_unlink_all(struct nh_control *ctl); + + +/* nhgrp_ctl.c */ +int nhgrp_dump_sysctl(struct rib_head *rh, struct sysctl_req *w); + +int nhgrp_get_group(struct rib_head *rh, struct weightened_nhop *wn, + int num_nhops, struct route_nhop_data *rnd); +typedef bool nhgrp_filter_cb_t(const struct nhop_object *nh, void *data); +int nhgrp_get_filtered_group(struct rib_head *rh, const struct nhgrp_object *src, + nhgrp_filter_cb_t flt_func, void *flt_data, struct route_nhop_data *rnd); +int nhgrp_get_addition_group(struct rib_head *rnh, + struct route_nhop_data *rnd_orig, struct route_nhop_data *rnd_add, + struct route_nhop_data *rnd_new); + +void nhgrp_free(struct nhgrp_object *nhg); #endif Index: sys/net/rtsock.c =================================================================== --- sys/net/rtsock.c +++ sys/net/rtsock.c @@ -32,7 +32,7 @@ * $FreeBSD$ */ #include "opt_ddb.h" -#include "opt_mpath.h" +#include "opt_route.h" #include "opt_inet.h" #include "opt_inet6.h" @@ -158,8 +158,7 @@ #define RTSOCK_UNLOCK() mtx_unlock(&rtsock_mtx) #define RTSOCK_LOCK_ASSERT() mtx_assert(&rtsock_mtx, MA_OWNED) -static SYSCTL_NODE(_net, OID_AUTO, route, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, - ""); +SYSCTL_NODE(_net, OID_AUTO, route, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); struct 
walkarg { int w_tmemsize; @@ -650,6 +649,25 @@ return (0); } +static struct nhop_object * +select_nhop(struct nhop_object *nh, const struct sockaddr *gw) +{ + if (!NH_IS_NHGRP(nh)) + return (nh); +#ifdef ROUTE_MPATH + struct weightened_nhop *wn; + uint32_t num_nhops; + wn = nhgrp_get_nhops((struct nhgrp_object *)nh, &num_nhops); + if (gw == NULL) + return (wn[0].nh); + for (int i = 0; i < num_nhops; i++) { + if (match_nhop_gw(wn[i].nh, gw)) + return (wn[i].nh); + } +#endif + return (NULL); +} + /* * Handles RTM_GET message from routing socket, returning matching rt. * @@ -663,6 +681,7 @@ { RIB_RLOCK_TRACKER; struct rib_head *rnh; + struct nhop_object *nh; sa_family_t saf; saf = info->rti_info[RTAX_DST]->sa_family; @@ -690,21 +709,12 @@ RIB_RUNLOCK(rnh); return (ESRCH); } -#ifdef RADIX_MPATH - /* - * for RTM_GET, gate is optional even with multipath. - * if gate == NULL the first match is returned. - * (no need to call rt_mpath_matchgate if gate == NULL) - */ - if (rt_mpath_capable(rnh) && info->rti_info[RTAX_GATEWAY]) { - rc->rc_rt = rt_mpath_matchgate(rc->rc_rt, - info->rti_info[RTAX_GATEWAY]); - if (rc->rc_rt == NULL) { - RIB_RUNLOCK(rnh); - return (ESRCH); - } + + nh = select_nhop(rc->rc_rt->rt_nhop, info->rti_info[RTAX_GATEWAY]); + if (nh == NULL) { + RIB_RUNLOCK(rnh); + return (ESRCH); } -#endif /* * If performing proxied L2 entry insertion, and * the actual PPP host entry is found, perform @@ -740,8 +750,13 @@ RIB_RUNLOCK(rnh); return (ESRCH); } + nh = select_nhop(rc->rc_rt->rt_nhop, info->rti_info[RTAX_GATEWAY]); + if (nh == NULL) { + RIB_RUNLOCK(rnh); + return (ESRCH); + } } - rc->rc_nh_new = rc->rc_rt->rt_nhop; + rc->rc_nh_new = nh; rc->rc_nh_weight = rc->rc_rt->rt_weight; RIB_RUNLOCK(rnh); @@ -832,6 +847,24 @@ return (0); } +static void +save_del_notification(struct rib_cmd_info *rc, void *_cbdata) +{ + struct rib_cmd_info *rc_new = (struct rib_cmd_info *)_cbdata; + + if (rc->rc_cmd == RTM_DELETE) + *rc_new = *rc; +} + +static void 
+save_add_notification(struct rib_cmd_info *rc, void *_cbdata) +{ + struct rib_cmd_info *rc_new = (struct rib_cmd_info *)_cbdata; + + if (rc->rc_cmd == RTM_ADD) + *rc_new = *rc; +} + /*ARGSUSED*/ static int route_output(struct mbuf *m, struct socket *so, ...) @@ -918,6 +951,15 @@ if (error == 0) { #ifdef INET6 rti_need_deembed = 1; +#endif +#ifdef ROUTE_MPATH + if (NH_IS_NHGRP(rc.rc_nh_new) || + (rc.rc_nh_old && NH_IS_NHGRP(rc.rc_nh_old))) { + struct rib_cmd_info rc_simple = {}; + rib_decompose_notification(&rc, + save_add_notification, (void *)&rc_simple); + rc = rc_simple; + } #endif nh = rc.rc_nh_new; rtm->rtm_index = nh->nh_ifp->if_index; @@ -927,6 +969,15 @@ case RTM_DELETE: error = rib_action(fibnum, RTM_DELETE, &info, &rc); if (error == 0) { +#ifdef ROUTE_MPATH + if (NH_IS_NHGRP(rc.rc_nh_old) || + (rc.rc_nh_new && NH_IS_NHGRP(rc.rc_nh_new))) { + struct rib_cmd_info rc_simple = {}; + rib_decompose_notification(&rc, + save_del_notification, (void *)&rc_simple); + rc = rc_simple; + } +#endif nh = rc.rc_nh_old; goto report; } @@ -1708,7 +1759,19 @@ if (!can_export_rte(w->w_req->td->td_ucred, rt)) return (0); nh = rt->rt_nhop; - error = sysctl_dumpnhop(rt, nh, rt->rt_weight, w); +#ifdef ROUTE_MPATH + if (NH_IS_NHGRP(nh)) { + struct weightened_nhop *wn; + uint32_t num_nhops; + wn = nhgrp_get_nhops((struct nhgrp_object *)nh, &num_nhops); + for (int i = 0; i < num_nhops; i++) { + error = sysctl_dumpnhop(rt, wn[i].nh, wn[i].weight, w); + if (error != 0) + return (error); + } + } else +#endif + error = sysctl_dumpnhop(rt, nh, rt->rt_weight, w); return (0); } @@ -1748,6 +1811,7 @@ rtm->rtm_flags = rt->rte_flags; rtm->rtm_flags |= nhop_get_rtflags(nh); rt_getmetrics(rt, nh, &rtm->rtm_rmx); + rtm->rtm_rmx.rmx_weight = weight; rtm->rtm_index = nh->nh_ifp->if_index; rtm->rtm_addrs = info.rti_addrs; error = SYSCTL_OUT(w->w_req, (caddr_t)rtm, size); @@ -2028,7 +2092,7 @@ namelen--; if (req->newptr) return (EPERM); - if (name[1] == NET_RT_DUMP || name[1] == NET_RT_NHOP) { + 
if (name[1] == NET_RT_DUMP || name[1] == NET_RT_NHOP || name[1] == NET_RT_NHGRP) { if (namelen == 3) fib = req->td->td_proc->p_fibnum; else if (namelen == 4) @@ -2096,6 +2160,7 @@ } break; case NET_RT_NHOP: + case NET_RT_NHGRP: /* Allow dumping one specific af/fib at a time */ if (namelen < 4) { error = EINVAL; @@ -2113,6 +2178,12 @@ } if (w.w_op == NET_RT_NHOP) error = nhops_dump_sysctl(rnh, w.w_req); + else +#ifdef ROUTE_MPATH + error = nhgrp_dump_sysctl(rnh, w.w_req); +#else + error = ENOTSUP; +#endif break; case NET_RT_IFLIST: case NET_RT_IFLISTL: Index: sys/netinet/in.c =================================================================== --- sys/netinet/in.c +++ sys/netinet/in.c @@ -35,8 +35,6 @@ #include __FBSDID("$FreeBSD$"); -#include "opt_mpath.h" - #include #include #include @@ -699,14 +697,6 @@ * interface address, we are done here. */ if (ia->ia_flags & IFA_ROUTE) { -#ifdef RADIX_MPATH - if (ia->ia_addr.sin_addr.s_addr == - target->ia_addr.sin_addr.s_addr) { - IN_IFADDR_RUNLOCK(&in_ifa_tracker); - return (EEXIST); - } else - break; -#endif if (V_nosameprefix) { IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (EEXIST); Index: sys/netinet/in_fib.c =================================================================== --- sys/netinet/in_fib.c +++ sys/netinet/in_fib.c @@ -32,7 +32,6 @@ #include "opt_inet.h" #include "opt_route.h" -#include "opt_mpath.h" #include #include @@ -48,14 +47,11 @@ #include #include #include +#include #include #include #include -#ifdef RADIX_MPATH -#include -#endif - #include #include #include @@ -80,7 +76,6 @@ RIB_RLOCK_TRACKER; struct rib_head *rh; struct radix_node *rn; - struct rtentry *rt; struct nhop_object *nh; KASSERT((fibnum < rt_numfibs), ("fib4_lookup: bad fibnum")); @@ -99,12 +94,7 @@ RIB_RLOCK(rh); rn = rh->rnh_matchaddr((void *)&sin4, &rh->head); if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) { - rt = RNTORT(rn); -#ifdef RADIX_MPATH - if (rt_mpath_next(rt) != NULL) - rt = rt_mpath_selectrte(rt, flowid); -#endif - nh = 
rt->rt_nhop; + nh = nhop_select((RNTORT(rn))->rt_nhop, flowid); /* Ensure route & ifp is UP */ if (RT_LINK_IS_UP(nh->nh_ifp)) { if (flags & NHR_REF) @@ -120,7 +110,7 @@ } inline static int -check_urpf(const struct nhop_object *nh, uint32_t flags, +check_urpf_nhop(const struct nhop_object *nh, uint32_t flags, const struct ifnet *src_if) { @@ -137,21 +127,24 @@ return (0); } -#ifdef RADIX_MPATH -inline static int -check_urpf_mpath(struct rtentry *rt, uint32_t flags, +static int +check_urpf(struct nhop_object *nh, uint32_t flags, const struct ifnet *src_if) { - - while (rt != NULL) { - if (check_urpf(rt->rt_nhop, flags, src_if) != 0) - return (1); - rt = rt_mpath_next(rt); - } - - return (0); -} +#ifdef ROUTE_MPATH + if (NH_IS_NHGRP(nh)) { + struct weightened_nhop *wn; + uint32_t num_nhops; + wn = nhgrp_get_nhops((struct nhgrp_object *)nh, &num_nhops); + for (int i = 0; i < num_nhops; i++) { + if (check_urpf_nhop(wn[i].nh, flags, src_if) != 0) + return (1); + } + return (0); + } else #endif + return (check_urpf_nhop(nh, flags, src_if)); +} /* * Performs reverse path forwarding lookup. 
@@ -169,7 +162,6 @@ RIB_RLOCK_TRACKER; struct rib_head *rh; struct radix_node *rn; - struct rtentry *rt; int ret; KASSERT((fibnum < rt_numfibs), ("fib4_check_urpf: bad fibnum")); @@ -186,12 +178,7 @@ RIB_RLOCK(rh); rn = rh->rnh_matchaddr((void *)&sin4, &rh->head); if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) { - rt = RNTORT(rn); -#ifdef RADIX_MPATH - ret = check_urpf_mpath(rt, flags, src_if); -#else - ret = check_urpf(rt->rt_nhop, flags, src_if); -#endif + ret = check_urpf(RNTORT(rn)->rt_nhop, flags, src_if); RIB_RUNLOCK(rh); return (ret); } @@ -206,7 +193,6 @@ { struct rib_head *rh; struct radix_node *rn; - struct rtentry *rt; struct nhop_object *nh; KASSERT((fibnum < rt_numfibs), ("fib4_lookup_debugnet: bad fibnum")); @@ -225,12 +211,7 @@ /* unlocked lookup */ rn = rh->rnh_matchaddr((void *)&sin4, &rh->head); if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) { - rt = RNTORT(rn); -#ifdef RADIX_MPATH - if (rt_mpath_next(rt) != NULL) - rt = rt_mpath_selectrte(rt, 0); -#endif - nh = rt->rt_nhop; + nh = nhop_select((RNTORT(rn))->rt_nhop, 0); /* Ensure route & ifp is UP */ if (RT_LINK_IS_UP(nh->nh_ifp)) { if (flags & NHR_REF) Index: sys/netinet/in_rmx.c =================================================================== --- sys/netinet/in_rmx.c +++ sys/netinet/in_rmx.c @@ -30,8 +30,6 @@ #include __FBSDID("$FreeBSD$"); -#include "opt_mpath.h" - #include #include #include @@ -127,9 +125,6 @@ return (NULL); rh->rnh_preadd = rib4_preadd; -#ifdef RADIX_MPATH - rt_mpath_init_rnh(rh); -#endif return (rh); } Index: sys/netinet/ip_output.c =================================================================== --- sys/netinet/ip_output.c +++ sys/netinet/ip_output.c @@ -38,7 +38,6 @@ #include "opt_ipsec.h" #include "opt_kern_tls.h" #include "opt_mbuf_stress_test.h" -#include "opt_mpath.h" #include "opt_ratelimit.h" #include "opt_route.h" #include "opt_rss.h" @@ -470,11 +469,7 @@ * for correct operation (as it is for ARP). 
*/ uint32_t flowid; -#ifdef RADIX_MPATH - flowid = ntohl(ip->ip_src.s_addr ^ ip->ip_dst.s_addr); -#else flowid = m->m_pkthdr.flowid; -#endif ro->ro_nh = fib4_lookup(fibnum, dst->sin_addr, 0, NHR_REF, flowid); Index: sys/netinet6/in6_fib.c =================================================================== --- sys/netinet6/in6_fib.c +++ sys/netinet6/in6_fib.c @@ -33,7 +33,6 @@ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_route.h" -#include "opt_mpath.h" #include #include @@ -49,14 +48,11 @@ #include #include #include +#include #include #include #include -#ifdef RADIX_MPATH -#include -#endif - #include #include #include @@ -88,7 +84,6 @@ RIB_RLOCK_TRACKER; struct rib_head *rh; struct radix_node *rn; - struct rtentry *rt; struct nhop_object *nh; struct sockaddr_in6 sin6; @@ -111,12 +106,7 @@ RIB_RLOCK(rh); rn = rh->rnh_matchaddr((void *)&sin6, &rh->head); if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) { - rt = RNTORT(rn); -#ifdef RADIX_MPATH - if (rt_mpath_next(rt) != NULL) - rt = rt_mpath_selectrte(rt, flowid); -#endif - nh = rt->rt_nhop; + nh = nhop_select((RNTORT(rn))->rt_nhop, flowid); /* Ensure route & ifp is UP */ if (RT_LINK_IS_UP(nh->nh_ifp)) { if (flags & NHR_REF) @@ -132,7 +122,7 @@ } inline static int -check_urpf(const struct nhop_object *nh, uint32_t flags, +check_urpf_nhop(const struct nhop_object *nh, uint32_t flags, const struct ifnet *src_if) { @@ -149,21 +139,24 @@ return (0); } -#ifdef RADIX_MPATH -inline static int -check_urpf_mpath(struct rtentry *rt, uint32_t flags, +static int +check_urpf(struct nhop_object *nh, uint32_t flags, const struct ifnet *src_if) { - - while (rt != NULL) { - if (check_urpf(rt->rt_nhop, flags, src_if) != 0) - return (1); - rt = rt_mpath_next(rt); - } - - return (0); -} +#ifdef ROUTE_MPATH + if (NH_IS_NHGRP(nh)) { + struct weightened_nhop *wn; + uint32_t num_nhops; + wn = nhgrp_get_nhops((struct nhgrp_object *)nh, &num_nhops); + for (int i = 0; i < num_nhops; i++) { + if (check_urpf_nhop(wn[i].nh, flags, 
src_if) != 0) + return (1); + } + return (0); + } else #endif + return (check_urpf_nhop(nh, flags, src_if)); +} /* * Performs reverse path forwarding lookup. @@ -181,7 +174,6 @@ RIB_RLOCK_TRACKER; struct rib_head *rh; struct radix_node *rn; - struct rtentry *rt; struct sockaddr_in6 sin6; int ret; @@ -203,12 +195,7 @@ RIB_RLOCK(rh); rn = rh->rnh_matchaddr((void *)&sin6, &rh->head); if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) { - rt = RNTORT(rn); -#ifdef RADIX_MPATH - ret = check_urpf_mpath(rt, flags, src_if); -#else - ret = check_urpf(rt->rt_nhop, flags, src_if); -#endif + ret = check_urpf(RNTORT(rn)->rt_nhop, flags, src_if); RIB_RUNLOCK(rh); return (ret); } @@ -223,7 +210,6 @@ { struct rib_head *rh; struct radix_node *rn; - struct rtentry *rt; struct nhop_object *nh; struct sockaddr_in6 sin6; @@ -245,8 +231,7 @@ rn = rh->rnh_matchaddr((void *)&sin6, &rh->head); if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) { - rt = RNTORT(rn); - nh = rt->rt_nhop; + nh = nhop_select((RNTORT(rn))->rt_nhop, 0); /* Ensure route & ifp is UP */ if (RT_LINK_IS_UP(nh->nh_ifp)) { if (flags & NHR_REF) Index: sys/netinet6/in6_rmx.c =================================================================== --- sys/netinet6/in6_rmx.c +++ sys/netinet6/in6_rmx.c @@ -64,8 +64,6 @@ #include __FBSDID("$FreeBSD$"); -#include "opt_mpath.h" - #include #include #include @@ -153,9 +151,6 @@ return (NULL); rh->rnh_preadd = rib6_preadd; -#ifdef RADIX_MPATH - rt_mpath_init_rnh(rh); -#endif rs = rib_subscribe_internal(rh, nd6_subscription_cb, NULL, RIB_NOTIFY_IMMEDIATE, true); Index: sys/netinet6/nd6.c =================================================================== --- sys/netinet6/nd6.c +++ sys/netinet6/nd6.c @@ -36,6 +36,7 @@ #include "opt_inet.h" #include "opt_inet6.h" +#include "opt_route.h" #include #include @@ -1591,7 +1592,11 @@ nd6_subscription_cb(struct rib_head *rnh, struct rib_cmd_info *rc, void *arg) { +#ifdef ROUTE_MPATH + rib_decompose_notification(rc, check_release_defrouter, NULL); 
+#else check_release_defrouter(rc, NULL); +#endif } int Index: sys/sys/socket.h =================================================================== --- sys/sys/socket.h +++ sys/sys/socket.h @@ -417,6 +417,7 @@ #define NET_RT_IFLISTL 5 /* Survey interface list, using 'l'en * versions of msghdr structs. */ #define NET_RT_NHOP 6 /* dump routing nexthops */ +#define NET_RT_NHGRP 7 /* dump routing nexthop groups */ #endif /* __BSD_VISIBLE */ /* Index: usr.bin/netstat/Makefile =================================================================== --- usr.bin/netstat/Makefile +++ usr.bin/netstat/Makefile @@ -5,7 +5,7 @@ PROG= netstat SRCS= if.c inet.c main.c mbuf.c mroute.c netisr.c nl_symbols.c route.c \ - unix.c mroute6.c ipsec.c bpf.c pfkey.c sctp.c common.c nhops.c \ + unix.c mroute6.c ipsec.c bpf.c pfkey.c sctp.c common.c nhops.c nhgrp.c \ nl_defs.h nl_symbols.c: nlist_symbols Index: usr.bin/netstat/common.h =================================================================== --- usr.bin/netstat/common.h +++ usr.bin/netstat/common.h @@ -54,5 +54,22 @@ struct ifmap_entry *prepare_ifmap(size_t *ifmap_size); +struct rt_msghdr; +struct nhops_map { + uint32_t idx; + struct rt_msghdr *rtm; +}; + +struct nhops_dump { + void *nh_buf; + struct nhops_map *nh_map; + size_t nh_count; +}; + +void dump_nhops_sysctl(int fibnum, int af, struct nhops_dump *nd); +struct nhop_map; +void nhop_map_update(struct nhop_map *map, uint32_t idx, char *gw, char *ifname); + + #endif Index: usr.bin/netstat/main.c =================================================================== --- usr.bin/netstat/main.c +++ usr.bin/netstat/main.c @@ -215,6 +215,7 @@ int noutputs = 0; /* how much outputs before we exit */ int numeric_addr; /* show addresses numerically */ int numeric_port; /* show ports numerically */ +int Oflag; /* show nhgrp objects*/ int oflag; /* show nexthop objects*/ int Pflag; /* show TCP log ID */ static int pflag; /* show given protocol */ @@ -250,7 +251,7 @@ if (argc < 0) 
exit(EXIT_FAILURE); - while ((ch = getopt(argc, argv, "46AaBbCdF:f:ghI:iLlM:mN:noPp:Qq:RrSTsuWw:xz")) + while ((ch = getopt(argc, argv, "46AaBbCdF:f:ghI:iLlM:mN:nOoPp:Qq:RrSTsuWw:xz")) != -1) switch(ch) { case '4': @@ -353,6 +354,9 @@ case 'o': oflag = 1; break; + case 'O': + Oflag = 1; + break; case 'P': Pflag = 1; break; @@ -509,6 +513,14 @@ xo_finish(); exit(0); } + if (Oflag) { + xo_open_container("statistics"); + nhgrp_print(fib, af); + xo_close_container("statistics"); + xo_finish(); + exit(0); + } + if (gflag) { Index: usr.bin/netstat/netstat.h =================================================================== --- usr.bin/netstat/netstat.h +++ usr.bin/netstat/netstat.h @@ -163,3 +163,4 @@ void mrt_stats(void); void bpf_stats(char *); void nhops_print(int fibnum, int af); +void nhgrp_print(int fibnum, int af); Index: usr.bin/netstat/nhgrp.c =================================================================== --- /dev/null +++ usr.bin/netstat/nhgrp.c @@ -0,0 +1,355 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2020 Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); +#include +#include +#include + +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "netstat.h" +#include "common.h" + +#define WID_GW_DEFAULT(af) (((af) == AF_INET6) ? 40 : 18) + +static int wid_gw; +static int wid_if = 10; +static int wid_nhidx = 8; +static int wid_refcnt = 8; + +struct nhop_entry { + char gw[64]; + char ifname[IFNAMSIZ]; +}; + +struct nhop_map { + struct nhop_entry *ptr; + size_t size; +}; +static struct nhop_map global_nhop_map; + +static struct ifmap_entry *ifmap; +static size_t ifmap_size; + +static struct nhop_entry * +nhop_get(struct nhop_map *map, uint32_t idx) +{ + + if (idx >= map->size) + return (NULL); + if (*map->ptr[idx].ifname == '\0') + return (NULL); + return &map->ptr[idx]; +} + +static void +print_nhgroup_header(int af1 __unused) +{ + + xo_emit("{T:/%-*.*s}{T:/%-*.*s}{T:/%*.*s}{T:/%*.*s}{T:/%*.*s}" + "{T:/%*.*s}{T:/%*s}\n", + wid_nhidx, wid_nhidx, "GrpIdx", + wid_nhidx, wid_nhidx, "NhIdx", + wid_nhidx, wid_nhidx, "Weight", + wid_nhidx, wid_nhidx, "Slots", + wid_gw, wid_gw, "Gateway", + wid_if, wid_if, "Netif", + wid_refcnt, "Refcnt"); +} + +static void +print_padding(char sym, int len) +{ + char buffer[56]; + + memset(buffer, sym, sizeof(buffer)); + buffer[0] = '{'; + buffer[1] = 'P'; + buffer[2] = ':'; + buffer[3] = ' '; + buffer[len + 3] = '}'; 
+	buffer[len + 4] = '\0';
+	xo_emit(buffer);
+}
+
+/* Emits the group summary row, then one row per NHG_C_TYPE_CNHOPS member. */
+static void
+print_nhgroup_entry_sysctl(const char *name, struct rt_msghdr *rtm,
+    struct nhgrp_external *nhge)
+{
+	char buffer[128];
+	struct nhop_entry *ne;
+	struct nhgrp_nhop_external *ext_cp, *ext_dp;
+	struct nhgrp_container *nhg_cp, *nhg_dp;
+
+	nhg_cp = (struct nhgrp_container *)(nhge + 1);
+	if (nhg_cp->nhgc_type != NHG_C_TYPE_CNHOPS || nhg_cp->nhgc_subtype != 0)
+		return;
+	ext_cp = (struct nhgrp_nhop_external *)(nhg_cp + 1);
+
+	nhg_dp = (struct nhgrp_container *)((char *)nhg_cp + nhg_cp->nhgc_len);
+	if (nhg_dp->nhgc_type != NHG_C_TYPE_DNHOPS || nhg_dp->nhgc_subtype != 0)
+		return;
+	ext_dp = (struct nhgrp_nhop_external *)(nhg_dp + 1);
+
+	xo_open_instance(name);
+
+	snprintf(buffer, sizeof(buffer), "{[:-%d}{:nhgrp-index/%%lu}{]:} ", wid_nhidx);
+
+	xo_emit(buffer, (unsigned long)nhge->nhg_idx);
+
+	/* nhidx */
+	print_padding('-', wid_nhidx);
+	/* weight */
+	print_padding('-', wid_nhidx);
+	/* slots */
+	print_padding('-', wid_nhidx);
+	print_padding('-', wid_gw);
+	print_padding('-', wid_if);
+	xo_emit("{t:nhg-refcnt/%*lu}", wid_refcnt, (unsigned long)nhge->nhg_refcount);
+	xo_emit("\n");
+
+	xo_open_list("nhop-weights");
+	for (uint32_t i = 0; i < nhg_cp->nhgc_count; i++) {
+		/* TODO: optimize slots calculations */
+		uint32_t slots = 0;
+		for (uint32_t sidx = 0; sidx < nhg_dp->nhgc_count; sidx++) {
+			if (ext_dp[sidx].nh_idx == ext_cp[i].nh_idx)
+				slots++;
+		}
+		xo_open_instance("nhop-weight");
+		print_padding(' ', wid_nhidx);
+		/* Casts: "%lu" reads an unsigned long, but e.g. slots is uint32_t. */
+		xo_emit("{t:nh-index/%*lu}", wid_nhidx, (unsigned long)ext_cp[i].nh_idx);
+		xo_emit("{t:nh-weight/%*lu}", wid_nhidx, (unsigned long)ext_cp[i].nh_weight);
+		xo_emit("{t:nh-slots/%*lu}", wid_nhidx, (unsigned long)slots);
+		ne = nhop_get(&global_nhop_map, ext_cp[i].nh_idx);
+		if (ne != NULL) {
+			xo_emit("{t:nh-gw/%*.*s}", wid_gw, wid_gw, ne->gw);
+			xo_emit("{t:nh-interface/%*.*s}", wid_if, wid_if, ne->ifname);
+		}
+		xo_emit("\n");
+		xo_close_instance("nhop-weight");
+	}
+	xo_close_list("nhop-weights");
+	xo_close_instance(name);
+}
+
+static int
+cmp_nhg_idx(const void *_a, const void *_b)
+{
+	const struct nhops_map *a = _a, *b = _b;
+
+	return ((a->idx > b->idx) - (a->idx < b->idx));	/* -1, 0 or 1 */
+}
+
+/*
+ * Loads the NET_RT_NHGRP dump for (fibnum, af) into *nd, sorted by group
+ * index. Both nd->nh_buf and nd->nh_map are allocated here; the caller frees.
+ */
+static void
+dump_nhgrp_sysctl(int fibnum, int af, struct nhops_dump *nd)
+{
+	size_t needed;
+	int mib[7] = { CTL_NET, PF_ROUTE, 0, af, NET_RT_NHGRP, 0, fibnum };
+	char *buf, *next, *lim;
+	struct rt_msghdr *rtm;
+	struct nhgrp_external *nhg;
+	struct nhops_map *nhg_map;
+	size_t nhg_count, nhg_size;
+
+	if (sysctl(mib, nitems(mib), NULL, &needed, NULL, 0) < 0)
+		err(EX_OSERR, "sysctl: net.route.0.%d.nhgrpdump.%d estimate",
+		    af, fibnum);
+	if ((buf = malloc(needed)) == NULL)
+		errx(2, "malloc(%lu)", (unsigned long)needed);
+	if (sysctl(mib, nitems(mib), buf, &needed, NULL, 0) < 0)
+		err(1, "sysctl: net.route.0.%d.nhgrpdump.%d", af, fibnum);
+	lim = buf + needed;
+
+	/*
+	 * Nexthop groups are received unsorted. Collect everything first,
+	 * and sort prior to displaying.
+	 */
+	nhg_count = 0;
+	nhg_size = 16;
+	nhg_map = calloc(nhg_size, sizeof(struct nhops_map));
+	if (nhg_map == NULL)
+		err(2, "calloc");
+	for (next = buf; next < lim; next += rtm->rtm_msglen) {
+		rtm = (struct rt_msghdr *)next;
+		if (rtm->rtm_msglen == 0)	/* the loop would never advance */
+			break;
+		if (rtm->rtm_version != RTM_VERSION)
+			continue;
+
+		if (nhg_count >= nhg_size) {
+			nhg_size *= 2;
+			nhg_map = realloc(nhg_map, nhg_size * sizeof(struct nhops_map));
+			if (nhg_map == NULL)	/* fatal, so the old block need not be kept */
+				err(2, "realloc");
+		}
+
+		nhg = (struct nhgrp_external *)(rtm + 1);
+		nhg_map[nhg_count].idx = nhg->nhg_idx;
+		nhg_map[nhg_count].rtm = rtm;
+		nhg_count++;
+	}
+
+	if (nhg_count > 0)
+		qsort(nhg_map, nhg_count, sizeof(struct nhops_map), cmp_nhg_idx);
+	nd->nh_buf = buf;
+	nd->nh_count = nhg_count;
+	nd->nh_map = nhg_map;
+}
+
+/* Prints the nexthop group table of (fibnum, af), one entry per group. */
+static void
+print_nhgrp_sysctl(int fibnum, int af)
+{
+	struct nhops_dump nd;
+	struct nhgrp_external *nhg;
+	struct rt_msghdr *rtm;
+
+	dump_nhgrp_sysctl(fibnum, af, &nd);
+
+	xo_open_container("nhgrp-table");
+	xo_open_list("rt-family");
+	if (nd.nh_count > 0) {
+		wid_gw = WID_GW_DEFAULT(af);
+		xo_open_instance("rt-family");
+		pr_family(af);
+		xo_open_list("nhgrp-entry");
+
+		print_nhgroup_header(af);
+
+		for (size_t i = 0; i < nd.nh_count; i++) {
+			rtm = nd.nh_map[i].rtm;
+			nhg = (struct nhgrp_external *)(rtm + 1);
+			print_nhgroup_entry_sysctl("nhgrp-entry", rtm, nhg);
+		}
+		xo_close_list("nhgrp-entry");
+		xo_close_instance("rt-family");
+	}
+	xo_close_list("rt-family");
+	xo_close_container("nhgrp-table");
+	free(nd.nh_map);
+	free(nd.nh_buf);
+}
+
+static void
+update_global_map(struct nhop_external *nh)
+{
+	char iface_name[128];
+	char gw_addr[64];
+	struct nhop_addrs *na;
+	struct sockaddr *sa_gw;
+
+	na = (struct nhop_addrs *)((char *)nh + nh->nh_len);
+	sa_gw = (struct sockaddr *)((char *)na + na->gw_sa_off);
+
+	memset(iface_name, 0, sizeof(iface_name));
+	if (nh->ifindex < (uint32_t)ifmap_size) {
+		strlcpy(iface_name, ifmap[nh->ifindex].ifname,
+		    sizeof(iface_name));
+		if (*iface_name == '\0')
+			strlcpy(iface_name, "---", sizeof(iface_name));
+	}
+
+	if (nh->nh_flags & NHF_GATEWAY) {
+		const char *cp;
+		cp = fmt_sockaddr(sa_gw, NULL, RTF_HOST);
+		strlcpy(gw_addr, cp,
sizeof(gw_addr)); + } else + snprintf(gw_addr, sizeof(gw_addr), "%s/resolve", iface_name); + + nhop_map_update(&global_nhop_map, nh->nh_idx, gw_addr, iface_name); +} + +static void +prepare_nh_map(int fibnum, int af) +{ + struct nhops_dump nd; + struct nhop_external *nh; + struct rt_msghdr *rtm; + + dump_nhops_sysctl(fibnum, af, &nd); + + for (size_t i = 0; i < nd.nh_count; i++) { + rtm = nd.nh_map[i].rtm; + nh = (struct nhop_external *)(rtm + 1); + update_global_map(nh); + } + + free(nd.nh_buf); +} + +void +nhgrp_print(int fibnum, int af) +{ + size_t intsize; + int numfibs; + + intsize = sizeof(int); + if (fibnum == -1 && + sysctlbyname("net.my_fibnum", &fibnum, &intsize, NULL, 0) == -1) + fibnum = 0; + if (sysctlbyname("net.fibs", &numfibs, &intsize, NULL, 0) == -1) + numfibs = 1; + if (fibnum < 0 || fibnum > numfibs - 1) + errx(EX_USAGE, "%d: invalid fib", fibnum); + + ifmap = prepare_ifmap(&ifmap_size); + prepare_nh_map(fibnum, af); + + xo_open_container("route-nhgrp-information"); + xo_emit("{T:Nexthop groups data}"); + if (fibnum) + xo_emit(" ({L:fib}: {:fib/%d})", fibnum); + xo_emit("\n"); + print_nhgrp_sysctl(fibnum, af); + xo_close_container("route-nhgrp-information"); +} + Index: usr.bin/netstat/nhops.c =================================================================== --- usr.bin/netstat/nhops.c +++ usr.bin/netstat/nhops.c @@ -118,8 +118,6 @@ }; static struct nhop_map global_nhop_map; -static void nhop_map_update(struct nhop_map *map, uint32_t idx, - char *gw, char *ifname); static struct nhop_entry *nhop_get(struct nhop_map *map, uint32_t idx); @@ -204,7 +202,7 @@ } } -static void +void nhop_map_update(struct nhop_map *map, uint32_t idx, char *gw, char *ifname) { if (idx >= map->size) { @@ -322,11 +320,6 @@ xo_close_instance(name); } -struct nhops_map { - uint32_t idx; - struct rt_msghdr *rtm; -}; - static int cmp_nh_idx(const void *_a, const void *_b) { @@ -342,15 +335,14 @@ return (0); } -static void -print_nhops_sysctl(int fibnum, int af) +void 
+dump_nhops_sysctl(int fibnum, int af, struct nhops_dump *nd) { size_t needed; int mib[7]; char *buf, *next, *lim; struct rt_msghdr *rtm; struct nhop_external *nh; - int fam; struct nhops_map *nh_map; size_t nh_count, nh_size; @@ -369,8 +361,6 @@ if (sysctl(mib, nitems(mib), buf, &needed, NULL, 0) < 0) err(1, "sysctl: net.route.0.%d.nhdump.%d", af, fibnum); lim = buf + needed; - xo_open_container("nhop-table"); - xo_open_list("rt-family"); /* * nexhops are received unsorted. Collect everything first, sort and then display @@ -395,9 +385,27 @@ nh_count++; } - if (nh_count > 0) { + if (nh_count > 0) qsort(nh_map, nh_count, sizeof(struct nhops_map), cmp_nh_idx); - nh = (struct nhop_external *)(nh_map[0].rtm + 1); + nd->nh_buf = buf; + nd->nh_count = nh_count; + nd->nh_map = nh_map; +} + +static void +print_nhops_sysctl(int fibnum, int af) +{ + struct nhops_dump nd; + struct nhop_external *nh; + int fam; + struct rt_msghdr *rtm; + + dump_nhops_sysctl(fibnum, af, &nd); + + xo_open_container("nhop-table"); + xo_open_list("rt-family"); + if (nd.nh_count > 0) { + nh = (struct nhop_external *)(nd.nh_map[0].rtm + 1); fam = nh->nh_family; wid_dst = WID_GW_DEFAULT(fam); @@ -415,8 +423,8 @@ print_nhop_header(fam); - for (size_t i = 0; i < nh_count; i++) { - rtm = nh_map[i].rtm; + for (size_t i = 0; i < nd.nh_count; i++) { + rtm = nd.nh_map[i].rtm; nh = (struct nhop_external *)(rtm + 1); print_nhop_entry_sysctl("nh-entry", rtm, nh); } @@ -426,7 +434,7 @@ } xo_close_list("rt-family"); xo_close_container("nhop-table"); - free(buf); + free(nd.nh_buf); } static void