Index: sys/conf/files =================================================================== --- sys/conf/files +++ sys/conf/files @@ -4170,6 +4170,7 @@ net/route/nhgrp_ctl.c optional route_mpath net/route/nhop.c standard net/route/nhop_ctl.c standard +net/route/nhop_neigh.c standard net/route/nhop_utils.c standard net/route/fib_algo.c optional fib_algo net/route/route_ctl.c standard Index: sys/net/if_llatbl.c =================================================================== --- sys/net/if_llatbl.c +++ sys/net/if_llatbl.c @@ -60,6 +60,7 @@ #include #include #include +#include #include #include #include @@ -525,16 +526,26 @@ llentry_request_feedback(struct llentry *lle) { struct llentry *child_lle; + struct ifnet *ifp = lle->lle_tbl->llt_ifp; + int family = lle->lle_tbl->llt_af; + struct epoch_tracker et; + + NET_EPOCH_ENTER(et); /* nhops_request_feedback() asserts the net epoch */ LLE_REQ_LOCK(lle); lle->r_skip_req = 1; LLE_REQ_UNLOCK(lle); + nhops_request_feedback(ifp, family, lle); /* also watch the nexthops linked to this entry */ + CK_SLIST_FOREACH(child_lle, &lle->lle_children, lle_child_next) { LLE_REQ_LOCK(child_lle); child_lle->r_skip_req = 1; LLE_REQ_UNLOCK(child_lle); + nhops_request_feedback(ifp, family, child_lle); } + + NET_EPOCH_EXIT(et); } /* @@ -559,13 +570,18 @@ static time_t llentry_get_hittime_raw(struct llentry *lle) { - time_t lle_hittime = 0; + time_t lle_hittime = 0, nhops_hittime = 0; LLE_REQ_LOCK(lle); if ((lle->r_skip_req == 0) && (lle_hittime < lle->lle_hittime)) lle_hittime = lle->lle_hittime; LLE_REQ_UNLOCK(lle); + struct lltable *llt = lle->lle_tbl; + nhops_hittime = nhops_get_hittime(llt->llt_ifp, llt->llt_af, lle); /* 0 when no nexthop hit was recorded */ + if (nhops_hittime > lle_hittime) /* use the most recent hit; also covers lle_hittime == 0 */ + lle_hittime = nhops_hittime; + return (lle_hittime); } @@ -574,7 +590,9 @@ { time_t lle_hittime = 0; struct llentry *child_lle; + struct epoch_tracker et; + NET_EPOCH_ENTER(et); lle_hittime = llentry_get_hittime_raw(lle); CK_SLIST_FOREACH(child_lle, &lle->lle_children, lle_child_next) { @@ -582,6 +600,7 @@ if (hittime > lle_hittime) lle_hittime = hittime; } +
NET_EPOCH_EXIT(et); return (lle_hittime); } @@ -724,6 +743,7 @@ { struct llentry *lle; struct ifnet *ifp; + struct epoch_tracker et; ifp = llt->llt_ifp; IF_AFDATA_WLOCK(ifp); @@ -740,6 +760,14 @@ } lltable_unlink_entry(llt, lle); + + /* Mark as invalid so the update below drops the cached L2 prepends */ + lle->r_flags &= ~RLLE_VALID; + lle->la_flags &= ~LLE_VALID; + + NET_EPOCH_ENTER(et); + nhops_update_neigh(ifp, llt->llt_af, lle); + NET_EPOCH_EXIT(et); IF_AFDATA_WUNLOCK(ifp); llt->llt_delete_entry(llt, lle); @@ -974,6 +1002,8 @@ u_int laflags = 0; int error; + NET_EPOCH_ASSERT(); + if (dl == NULL || dl->sdl_family != AF_LINK) return (EINVAL); @@ -1033,6 +1063,7 @@ lltable_unlink_entry(llt, lle_tmp); } lltable_link_entry(llt, lle); + nhops_update_neigh(ifp, dst->sa_family, lle); if ((lle->la_flags & LLE_PUB) != 0 && (llt->llt_flags & LLT_ADDEDPROXY) == 0) llt->llt_flags |= LLT_ADDEDPROXY; Index: sys/net/route/nhop.h =================================================================== --- sys/net/route/nhop.h +++ sys/net/route/nhop.h @@ -134,13 +134,11 @@ }; struct ifnet *nh_ifp; /* Logical egress interface. Always != NULL */ struct ifaddr *nh_ifa; /* interface address to use. Always != NULL */ - struct ifnet *nh_aifp; /* ifnet of the source address. Always != NULL */ + void *nh_prepend_raw;/* PTR+len for nexthop prepend */ counter_u64_t nh_pksent; /* packets sent using this nhop */ /* 32 bytes + 4xPTR == 64(amd64) / 48(i386) */ - uint8_t nh_prepend_len; /* length of prepend data */ - uint8_t spare[3]; - uint32_t spare1; /* alignment */ - char nh_prepend[48]; /* L2 prepend */ + struct ifnet *nh_aifp; /* ifnet of the source address. Always != NULL */ + char nh_buffer[48]; /* Space for custom nhop data */ struct nhop_priv *nh_priv; /* control plane data */ /* -- 128 bytes -- */ }; @@ -163,6 +161,32 @@ _nh = NULL; \ } while (0) +/* + * L2 prepend infrastructure definitions + * Nexthop L2 rewrites may change during nexthop lifetime when the neighbor + * changes its MAC.
For the most common encapsulations - ethernet & IB, + * the maximum encap length is 24 (IPoIB) = LLE_MAX_LINKHDR. + * Leverage the fact that this value is < cache line size by embedding + * prepend length to the pointer data itself & requiring UMA to return + * CACHE_LINE_SIZE-aligned pointers. + */ +#define L2_PREPEND_LEN_BITS CACHE_LINE_SHIFT /* low pointer bits that carry the length */ +#define L2_PREPEND_LEN_MAX ((1 << L2_PREPEND_LEN_BITS) - sizeof(struct epoch_context)) /* cache line minus room for the epoch context */ + +#define _NH_L2_PREPEND_MASK_PTR(_p) ((uintptr_t)(_p) & ~((1 << L2_PREPEND_LEN_BITS) - 1)) /* clears the length bits */ +#define NH_L2_PREPEND_GET_PTR(_p) ((void *)_NH_L2_PREPEND_MASK_PTR(_p)) /* buffer address */ +#define NH_L2_PREPEND_GET_LEN(_p) ((uintptr_t)(_p) & ((1 << L2_PREPEND_LEN_BITS) - 1)) /* prepend length */ + +#define NH_L2_COMPILE_PREPEND_PTR(_p, _l) ((void *)((uintptr_t)(_p) | (_l))) /* assumes aligned _p and _l < (1 << L2_PREPEND_LEN_BITS); neither is checked here */ +/* Fills the @ro prepend pointer and length from the tagged pointer stored in @nh. */ +static inline void +route_set_prepend_nh(struct route *ro, const struct nhop_object *nh) +{ + void *ptr = atomic_load_ptr(&nh->nh_prepend_raw); /* single load: address and length come from the same value */ + ro->ro_prepend = (char *)NH_L2_PREPEND_GET_PTR(ptr); + ro->ro_plen = NH_L2_PREPEND_GET_LEN(ptr); +} + struct weightened_nhop { struct nhop_object *nh; uint32_t weight; @@ -209,6 +233,15 @@ uint32_t nhop_get_expire(const struct nhop_object *nh); void nhop_set_expire(struct nhop_object *nh, uint32_t expire); +void *nhop_alloc_prepend(size_t size); +void nhop_free_prepend(void *prepend); +bool nhop_update_prepend(struct nhop_object *nh, void *prepend, size_t len); +/* Hooks for the LLE code, implemented in nhop_neigh.c */ +void nhops_update_neigh(struct ifnet *ifp, int family, const struct llentry *lle); +void nhops_request_feedback(struct ifnet *ifp, int family, const struct llentry *lle); +void nhops_stop_feedback(struct ifnet *ifp, int family, const struct llentry *lle); +time_t nhops_get_hittime(struct ifnet *ifp, int family, const struct llentry *lle); + #endif /* _KERNEL */ /* Kernel <> userland structures */ Index: sys/net/route/nhop.c =================================================================== --- sys/net/route/nhop.c +++ sys/net/route/nhop.c @@ -103,6 +103,7 @@ rh->nh_control = ctl; ctl->ctl_rh = rh; +
ctl->ctl_nn = nhops_get_neigh_ptr(); FIB_CTL_LOG(LOG_DEBUG2, ctl, "nhops init: ctl %p rh %p", ctl, rh); Index: sys/net/route/nhop_ctl.c =================================================================== --- sys/net/route/nhop_ctl.c +++ sys/net/route/nhop_ctl.c @@ -104,6 +104,21 @@ 2 * CACHE_LINE_SIZE) #define NHOP_PRIV_ALIGNED_SIZE roundup2(sizeof(struct nhop_priv), \ 2 * CACHE_LINE_SIZE) + +static uma_zone_t nh_prepend_zone; /* Global zone for all nhop prepend data */ +/* One prepend buffer; nh_prepend_raw points at its start, tagged with the length */ +struct nhop_prepend { + char prepend[L2_PREPEND_LEN_MAX]; /* L2 header bytes */ + struct epoch_context epoch_ctx; /* used by epoch_call() for the deferred free */ +}; + +#define NHOP_PREPEND_ALIGNED_SIZE roundup2(sizeof(struct nhop_prepend), \ + CACHE_LINE_SIZE) + +static bool nhop_update_prepend_locked(struct nhop_priv *nh_priv, void *prepend, + size_t len); + + void nhops_init(void) { @@ -111,6 +126,8 @@ nhops_zone = uma_zcreate("routing nhops", NHOP_OBJECT_ALIGNED_SIZE + NHOP_PRIV_ALIGNED_SIZE, NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); + nh_prepend_zone = uma_zcreate("nhop prepend", NHOP_PREPEND_ALIGNED_SIZE, + NULL, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0); /* cache-line alignment keeps the low pointer bits free for the length */ } /* @@ -495,6 +512,9 @@ return (ENOBUFS); } + if (nh->nh_priv->nh_need_neigh) + nhop_link_neigh(ctl->ctl_nn, nh); /* NOTE(review): a false return (allocation failure) is not handled here */ + #if DEBUG_MAX_LEVEL >= LOG_DEBUG char nhbuf[NHOP_PRINT_BUFSIZE]; FIB_NH_LOG(LOG_DEBUG, nh, "finalized: %s", nhop_print_buf(nh, nhbuf, sizeof(nhbuf))); @@ -511,6 +531,12 @@ ifa_free(nh->nh_ifa); counter_u64_free(nh->nh_pksent); + if (nh->nh_prepend_raw != NULL) { + struct nhop_prepend *np; + np = (struct nhop_prepend *)NH_L2_PREPEND_GET_PTR(nh->nh_prepend_raw); /* strip the length bits before freeing */ + nhop_free_prepend(np); + } + uma_zfree(nhops_zone, nh); } @@ -586,6 +612,11 @@ NET_EPOCH_ENTER(et); if (refcount_release_if_not_last(&nh_priv->nh_linked)) { ctl = nh_priv->nh_control; + /* nhop neigh ctl lifetime is the same as ctl lifetime */ + /* Stop receiving updates for neighbor prepends */ + if (nh_priv->nh_need_neigh) + nhop_unlink_neighbor(ctl->ctl_nn, nh); + if (unlink_nhop(ctl, nh_priv) == NULL) { /* Do not try to reclaim */
char nhbuf[NHOP_PRINT_BUFSIZE]; @@ -851,6 +882,7 @@ nhop_set_transmit_ifp(struct nhop_object *nh, struct ifnet *ifp) { nh->nh_ifp = ifp; + nh->nh_priv->nh_need_neigh = nhop_need_neigh(nh); /* re-evaluated whenever the transmit ifp changes */ } @@ -1033,8 +1065,11 @@ pnhe->nh_mtu = nh->nh_mtu; pnhe->nh_flags = nh->nh_flags; - memcpy(pnhe->nh_prepend, nh->nh_prepend, sizeof(nh->nh_prepend)); - pnhe->prepend_len = nh->nh_prepend_len; + if (nh->nh_prepend_raw != NULL) { + void *ptr = nh->nh_prepend_raw; /* NOTE(review): plain second read, may differ from the value tested above */ + pnhe->prepend_len = NH_L2_PREPEND_GET_LEN(ptr); + memcpy(pnhe->nh_prepend, NH_L2_PREPEND_GET_PTR(ptr), pnhe->prepend_len); + } pnhe->nh_refcount = nh->nh_priv->nh_refcnt; pnhe->nh_pksent = counter_u64_fetch(nh->nh_pksent); @@ -1109,3 +1144,80 @@ return (0); } +/* Returns a zeroed buffer from nh_prepend_zone, or NULL if @size exceeds L2_PREPEND_LEN_MAX or the M_NOWAIT allocation fails. */ +void * +nhop_alloc_prepend(size_t size) +{ + if (size > L2_PREPEND_LEN_MAX) + return (NULL); + void *prepend = uma_zalloc(nh_prepend_zone, M_NOWAIT | M_ZERO); + return (prepend); +} +/* Returns @prepend (an untagged buffer address) to nh_prepend_zone immediately. */ +void +nhop_free_prepend(void *prepend) +{ + uma_zfree(nh_prepend_zone, prepend); +} +/* epoch_call() callback: frees the nhop_prepend that embeds @ctx. */ +static void +destroy_nhop_prepend_epoch(epoch_context_t ctx) +{ + struct nhop_prepend *prepend; + + prepend = __containerof(ctx, struct nhop_prepend, epoch_ctx); + nhop_free_prepend(prepend); +} +/* A nexthop counts as linked once it has a non-zero index. */ +static bool +nhop_is_linked(const struct nhop_object *nh) +{ + return (nh->nh_priv->nh_idx != 0); +} +/* Installs @prepend (+@len) as the nexthop prepend and schedules the old one for a deferred free. Returns false, without storing or freeing @prepend, when the nexthop is not linked. */ +static bool +nhop_update_prepend_locked(struct nhop_priv *nh_priv, void *prepend, size_t len) +{ + void *ptr = NULL, *old_ptr = NULL; + bool result = false; + + if (prepend != NH_L2_PREPEND_GET_PTR(prepend)) { + /* TODO: KASSERT() that @prepend is aligned */ + /* XXX: check alignment; a misaligned buffer is dropped here, not freed */ + + prepend = NULL; + } + if (prepend != NULL) + ptr = NH_L2_COMPILE_PREPEND_PTR(prepend, len); /* @len is not range-checked here */ + + if (nhop_is_linked(nh_priv->nh)) { + old_ptr = nh_priv->nh->nh_prepend_raw; + nh_priv->nh->nh_prepend_raw = ptr; /* NOTE(review): plain store, readers use atomic_load_ptr() */ + result = true; + } + + if (old_ptr != NULL) { + struct nhop_prepend *np = NH_L2_PREPEND_GET_PTR(old_ptr); + epoch_call(net_epoch_preempt, destroy_nhop_prepend_epoch, + &np->epoch_ctx); /* deferred, since readers may still hold the old pointer */ + } + + return (result); +} +/* Wrapper that runs nhop_update_prepend_locked() under NHOPS_WLOCK; on a false return @prepend still belongs to the caller. */ +bool
+nhop_update_prepend(struct nhop_object *nh, void *prepend, size_t len) +{ + struct nhop_priv *nh_priv = nh->nh_priv; + struct nh_control *ctl; + bool result; + + ctl = nh_priv->nh_control; + + NHOPS_WLOCK(ctl); + result = nhop_update_prepend_locked(nh_priv, prepend, len); + NHOPS_WUNLOCK(ctl); + + return (result); +} + Index: sys/net/route/nhop_neigh.c =================================================================== --- /dev/null +++ sys/net/route/nhop_neigh.c @@ -0,0 +1,810 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2021 Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE.
+ */ + +#include +__FBSDID("$FreeBSD$"); +#include "opt_inet.h" +#include "opt_inet6.h" +#include "opt_route.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * This file contains the nexthop neighbor ("neigh") tracking logic of the + * nexthop ("nhop") route subsystem. + * + * A neigh entry groups the nexthops that share the same L2 neighbor. + * It is identified by a combination of + * ifp, neighbor address family, upper-layer family and the neighbor + * address (see struct nhop_neigh). + * + * All neigh entries are stored in the resizable hash table. + * The table and the feedback list are protected by nn_lock. + * The LLE code calls into this file to propagate L2 prepend changes to the + * linked nexthops and to request datapath feedback, which is derived from + * the per-nexthop packet counters. + */ + +#define DEBUG_MOD_NAME nhop_neigh +#define DEBUG_MAX_LEVEL LOG_DEBUG +#include +_DECLARE_DEBUG(LOG_DEBUG2); + +CHT_SLIST_DEFINE(nhop_neighs, struct nhop_neigh); +/* produce hash value for an object */ +#define nhop_neighs_hash_obj(_obj) hash_neigh(_obj) +/* compare two objects */ +#define nhop_neighs_cmp(_one, _two) cmp_neigh(_one, _two) +/* next object accessor */ +#define nhop_neighs_next(_obj) (_obj)->nn_next + +/* Per-VNET neigh control block */ +struct nn_control { + struct nhop_neighs_head nn_head; /* neigh hash table head */ + struct rmlock nn_lock; /* protects the hash and the feedback list */ + struct callout nn_feedback_callout; /* runs pktsent_callout() */ + int nn_dying; /* set on destroy, stops callout re-arming */ + TAILQ_HEAD(,nhop_neigh) nn_feedback_list; /* neighs waiting for datapath feedback */ + struct epoch_context nn_epoch_ctx; /* epoch nnctl helper */ +}; + +VNET_DEFINE_STATIC(struct nn_control *, nn_control) = NULL; +#define V_nn_control VNET(nn_control) + +#define CTL_WLOCK(nnctl) rm_wlock(&nnctl->nn_lock) +#define CTL_WUNLOCK(nnctl) rm_wunlock(&nnctl->nn_lock) +#define CTL_TRACKER struct rm_priotracker tracker +#define CTL_RLOCK(nnctl) rm_rlock(&nnctl->nn_lock, &tracker) +#define
CTL_RUNLOCK(nnctl) rm_runlock(&nnctl->nn_lock, &tracker) +/* Neighbor entry: the lookup key followed by per-entry state */ +struct nhop_neigh { + struct ifnet *nn_ifp; /* key: interface */ + uint8_t nn_neigh_family; /* key: family of the neighbor address */ + uint8_t nn_upper_family; /* key: upper-layer family */ + uint16_t nn_flags; /* NN_FLAG_*, state and not part of the key */ + union { + struct in_addr nn_addr4; /* key: neighbor address (AF_INET) */ + struct in6_addr nn_addr6; /* key: neighbor address (otherwise) */ + }; + uint64_t nn_packets; /* packet count snapshot taken on feedback request */ + time_t nn_hittime; /* time_uptime of the observed counter change, 0 if none */ + struct nhop_neigh *nn_next; /* hash table membership */ + TAILQ_HEAD(, nhop_priv) nn_nhops; /* nexthops linked to this neigh */ + TAILQ_ENTRY(nhop_neigh) nn_feedback_entry; /* nn_feedback_list membership */ +}; +#define NEIGH_END_CMP (__offsetof(struct nhop_neigh, nn_packets)) /* offset of the first field past the key; the range below it also holds nn_flags */ + +#define NN_FLAG_FB_LINKED 0x01 /* Linked to the feedback list */ + + +_Static_assert(L2_PREPEND_LEN_MAX >= LLE_MAX_LINKHDR, + "CACHE_LINE_SIZE has to be at least LLE_MAX_LINKHDR"); + +static void free_neigh(struct nhop_neigh *nn); +static void update_prepend_ptr(struct nhop_object *nh, const struct llentry *lle); +static void schedule_callout(struct nn_control *nnctl); + +/* + * Prints the relevant data for neigh object @nn into provided buffer. + * Example: nn/inet/em0/192.168.0.1 + * + * Returns @buf. + */ +char * +neigh_print_buf(const struct nhop_neigh *nn, char *buf, size_t bufsize) +{ + char abuf[INET6_ADDRSTRLEN]; + + if (nn == NULL) { + snprintf(buf, bufsize, "nn/NULL"); + return (buf); + } + + switch (nn->nn_neigh_family) { +#ifdef INET + case AF_INET: + inet_ntop(AF_INET, &nn->nn_addr4, abuf, sizeof(abuf)); + snprintf(buf, bufsize, "nn/%s/%s/%s", + rib_print_family(nn->nn_upper_family), if_name(nn->nn_ifp), abuf); + break; +#endif +#ifdef INET6 + case AF_INET6: + inet_ntop(AF_INET6, &nn->nn_addr6, abuf, sizeof(abuf)); + snprintf(buf, bufsize, "nn/%s/%s/%s", + rib_print_family(nn->nn_upper_family), if_name(nn->nn_ifp), abuf); + break; +#endif + default: + snprintf(buf, bufsize, "nn/%s/%s/unknown(%s)", + rib_print_family(nn->nn_upper_family), if_name(nn->nn_ifp), + rib_print_family(nn->nn_neigh_family)); + } + + return (buf); +} +/* Allocates and publishes the per-VNET neigh control block. */ +void +vnet_nhops_init_neigh(void) +{ + struct nn_control *nnctl; + + nnctl = malloc(sizeof(struct nn_control), M_NHOP, M_WAITOK | M_ZERO); + + /* + * Allocate
neigh hash. Start with 16 items by default (128 bytes). + * This will be enough for most of the cases. + */ + int num_buckets = 16; + size_t alloc_size = CHT_SLIST_GET_RESIZE_SIZE(num_buckets); + + void *ptr = malloc(alloc_size, M_NHOP, M_WAITOK | M_ZERO); + CHT_SLIST_INIT(&nnctl->nn_head, ptr, num_buckets); + rm_init(&nnctl->nn_lock, "nexthop neigh lock"); + TAILQ_INIT(&nnctl->nn_feedback_list); + + callout_init_rm(&nnctl->nn_feedback_callout, &nnctl->nn_lock, 0); /* ties the callout to nn_lock */ + + V_nn_control = nnctl; +} +/* Returns the neigh control block of the current VNET (NULL before init / after destroy). */ +struct nn_control * +nhops_get_neigh_ptr(void) +{ + return (V_nn_control); +} +/* Frees all neigh entries, the hash array and the control block itself. */ +static void +destroy_nnctl(struct nn_control *nnctl) +{ + struct nhop_neigh *nn, *nn_tmp; + + CHT_SLIST_FOREACH_SAFE(&nnctl->nn_head, nhop_neighs, nn, nn_tmp) { + free_neigh(nn); + } CHT_SLIST_FOREACH_SAFE_END; + + rm_destroy(&nnctl->nn_lock); + free(nnctl->nn_head.ptr, M_NHOP); + free(nnctl, M_NHOP); +} +/* epoch_call() callback: destroys the nn_control that embeds @ctx. */ +static void +destroy_nnctl_epoch(epoch_context_t ctx) +{ + struct nn_control *nnctl; + + nnctl = __containerof(ctx, struct nn_control, nn_epoch_ctx); + destroy_nnctl(nnctl); +} +/* Unpublishes the per-VNET control block, stops the feedback callout and schedules the deferred destruction. */ +void +vnet_nhop_destroy_neigh(void) +{ + struct nn_control *nnctl = atomic_load_ptr(&V_nn_control); + + V_nn_control = NULL; /* new LLE hooks see NULL and return early */ + + CTL_WLOCK(nnctl); + nnctl->nn_dying = 1; /* schedule_callout() refuses to re-arm once set */ + CTL_WUNLOCK(nnctl); + + callout_drain(&nnctl->nn_feedback_callout); + + epoch_call(net_epoch_preempt, destroy_nnctl_epoch, + &nnctl->nn_epoch_ctx); +} + +/* + * Neigh hash calculation: + */ +struct _hash_data { + uint16_t ifentropy; + uint8_t neigh_family; + uint8_t upper_family; + uint32_t addr; +}; +/* Byte-wise hash over @len bytes of @h: result = (33 * result) ^ byte. */ +static unsigned +djb_hash(const unsigned char *h, const int len) +{ + unsigned int result = 0; + int i; + + for (i = 0; i < len; i++) + result = 33 * result ^ h[i]; + + return (result); +} +/* Hashes the key fields of @nn; nn_flags is not part of the hash. */ +static uint32_t +hash_neigh(const struct nhop_neigh *nn) +{ + struct _hash_data key = { + .ifentropy = (uint16_t)((((uintptr_t)nn->nn_ifp) >> 6) & 0xFFFF), + .neigh_family = nn->nn_neigh_family, + .upper_family = nn->nn_upper_family, + .addr =
(nn->nn_neigh_family == AF_INET6) ? + nn->nn_addr6.s6_addr32[3] : nn->nn_addr4.s_addr + }; + + return (uint32_t)(djb_hash((const unsigned char *)&key, sizeof(key))); +} +/* Returns 1 when @_one and @_two carry the same lookup key, 0 otherwise. */ +static int +cmp_neigh(const struct nhop_neigh *_one, const struct nhop_neigh *_two) +{ + /* + * Compare the key fields one by one: a raw memcmp() up to + * NEIGH_END_CMP would also cover nn_flags (which changes when the + * entry joins the feedback list) and structure padding. + */ + if (_one->nn_ifp != _two->nn_ifp || + _one->nn_neigh_family != _two->nn_neigh_family || + _one->nn_upper_family != _two->nn_upper_family) + return (0); + if (_one->nn_neigh_family == AF_INET) + return (_one->nn_addr4.s_addr == _two->nn_addr4.s_addr); + return (memcmp(&_one->nn_addr6, &_two->nn_addr6, + sizeof(_one->nn_addr6)) == 0); +} + +/* + * Searches for the neigh entry matching the key fields of @nn. + * Returns the matching entry or NULL. Callers in this file hold nn_lock. + */ +static struct nhop_neigh * +find_neigh(struct nn_control *nnctl, const struct nhop_neigh *nn) +{ + struct nhop_neigh *nn_ret; + + CHT_SLIST_FIND_BYOBJ(&nnctl->nn_head, nhop_neighs, nn, nn_ret); + return (nn_ret); +} +/* Returns true if an entry matching @nn_base exists; takes the read lock itself. */ +static bool +has_neigh(struct nn_control *nnctl, const struct nhop_neigh *nn_base) +{ + CTL_TRACKER; + bool result; + + CTL_RLOCK(nnctl); + result = find_neigh(nnctl, nn_base) != NULL; + CTL_RUNLOCK(nnctl); + + return (result); +} + +/* + * Tries to resize neighbor hash to the value specified by @new_num_buckets. + */ +static void +resize_neigh_hash(struct nn_control *nnctl, uint32_t new_num_buckets) +{ + size_t alloc_size = CHT_SLIST_GET_RESIZE_SIZE(new_num_buckets); + void *nn_ptr = malloc(alloc_size, M_NHOP, M_NOWAIT | M_ZERO); + if (nn_ptr == NULL) { + /* allocation has failed, keep the current table */ + RT_LOG(LOG_NOTICE, "neigh hash resize to %u has failed", new_num_buckets); + return; + } + + CTL_WLOCK(nnctl); + RT_LOG(LOG_DEBUG, "going to resize neigh hash: %u -> %u", + nnctl->nn_head.hash_size, new_num_buckets); + CHT_SLIST_RESIZE(&nnctl->nn_head, nhop_neighs, nn_ptr, new_num_buckets); + CTL_WUNLOCK(nnctl); + + if (nn_ptr != NULL) /* NOTE(review): presumably CHT_SLIST_RESIZE leaves the old bucket array in nn_ptr - confirm */ + free(nn_ptr, M_NHOP); +} + +/* + * Checks if nexthop @nh can be attached to the LLE/NDP neighbor. + * Function verifies that target interface has L2 and nexthop contains + * gateway (or is a host route). + * Returns true on success.
+ */ +bool +nhop_need_neigh(const struct nhop_object *nh) +{ + bool match = false; + + switch (nh->nh_ifp->if_type) { /* only these interface types are considered */ + case IFT_BRIDGE: + case IFT_ETHER: + case IFT_INFINIBAND: + case IFT_L2VLAN: + match = true; + break; + } + + if (match) { + if (nh->nh_flags & (NHF_GATEWAY|NHF_HOST)) /* gateway or host route */ + return (true); + } + + return (false); +} + +/* + * Fills in the nhop_neigh lookup key based on the nexthop specified by @nh_priv + * and initializes the list of linked nexthops. + */ +static void +init_neigh(struct nhop_neigh *nn, const struct nhop_priv *nh_priv) +{ + const struct nhop_object *nh = nh_priv->nh; + + nn->nn_ifp = nh->nh_ifp; + nn->nn_neigh_family = nh_priv->nh_neigh_family; + nn->nn_upper_family = nh_priv->nh_upper_family; + switch (nn->nn_neigh_family) { /* other families leave the address as passed in */ + case AF_INET: + nn->nn_addr4 = nh->gw4_sa.sin_addr; + break; + case AF_INET6: + nn->nn_addr6 = nh->gw6_sa.sin6_addr; + break; + } + TAILQ_INIT(&nn->nn_nhops); +} +/* Releases a neigh entry allocated from M_NHOP. */ +static void +free_neigh(struct nhop_neigh *nn) +{ + free(nn, M_NHOP); +} +/* Looks up the llentry for the gateway of @nh_priv; returns NULL if there is no table or entry. */ +static struct llentry * +find_lle(struct nhop_priv *nh_priv) +{ + struct nhop_object *nh = nh_priv->nh; + struct llentry *lle = NULL; + struct lltable *llt; + + llt = lltable_get(nh->nh_ifp, nh_priv->nh_neigh_family); + if (llt != NULL) + lle = lla_lookup(llt, LLE_UNLOCKED, &nh->gw_sa); + if (lle != NULL) { + if (nh_priv->nh_upper_family != nh_priv->nh_neigh_family) /* families differ: use the per-family entry instead */ + lle = llentry_lookup_family(lle, nh_priv->nh_upper_family); + } + +#if DEBUG_MAX_LEVEL >= LOG_DEBUG2 + char nhbuf[48], lbuf[48]; + FIB_NH_LOG(LOG_DEBUG2, nh, "nhop %s: mapped to lle %s", + nhop_print_buf(nh, nhbuf, sizeof(nhbuf)), + lle ? llentry_print_buf(lle, nh->nh_ifp, nh_priv->nh_neigh_family, lbuf, sizeof(lbuf)) : "NULL"); +#endif + return (lle); +} + +/* + * Links nexthop @nh to the nexthop neighbor hash table and tries + * to fill in L2 nexthop prepend. + * Returns true on successful linkage.
+ */ +bool +nhop_link_neigh(struct nn_control *nnctl, struct nhop_object *nh) +{ + uint32_t num_buckets_new; + struct nhop_neigh *nn = NULL, *nn_new; + struct nhop_priv *nh_priv = nh->nh_priv; + + NET_EPOCH_ASSERT(); + + /* + * Most llentries have at most one nexthop attached. + * Thus, assume we'll be inserting a new record. + */ + nn_new = malloc(sizeof(struct nhop_neigh), M_NHOP, M_NOWAIT | M_ZERO); + if (nn_new == NULL) + return (false); + init_neigh(nn_new, nh_priv); + + /* Try to calculate the prepend */ + struct llentry *lle = find_lle(nh_priv); + if (lle != NULL) + update_prepend_ptr(nh, lle); + + CTL_WLOCK(nnctl); + + /* + * Check if we need to resize the hash. + * The following call returns either the new size or 0 + * if resize is not required. + */ + num_buckets_new = CHT_SLIST_GET_RESIZE_BUCKETS(&nnctl->nn_head); + + /* Check if record already exists */ + CHT_SLIST_FIND_BYOBJ(&nnctl->nn_head, nhop_neighs, nn_new, nn); + + if (nn == NULL) { + nn = nn_new; + nn_new = NULL; /* ownership moved to the hash table */ + CHT_SLIST_INSERT_HEAD(&nnctl->nn_head, nhop_neighs, nn); + + /* + * XXXME: There can be a race when lle gets deleted after lookup + */ + } + TAILQ_INSERT_TAIL(&nn->nn_nhops, nh_priv, nh_neigh_entry); + + CTL_WUNLOCK(nnctl); + +#if DEBUG_MAX_LEVEL >= LOG_DEBUG + char nhbuf[48], nnbuf[48]; + FIB_NH_LOG(LOG_DEBUG, nh_priv->nh, "nhop %s linked to %s %s", + nhop_print_buf(nh_priv->nh, nhbuf, sizeof(nhbuf)), + nn_new == NULL ? "new" : "existing", + neigh_print_buf(nn, nnbuf, sizeof(nnbuf))); +#endif + + if (nn_new != NULL) /* an entry already existed, drop the spare one */ + free_neigh(nn_new); + + if (num_buckets_new > 0) /* resize outside of the lock section above */ + resize_neigh_hash(nnctl, num_buckets_new); + + return (true); +} + +/* + * Unlinks nexthop @nh from its neigh entry and frees the entry once it is empty.
+ */ +void +nhop_unlink_neighbor(struct nn_control *nnctl, struct nhop_object *nh) +{ + uint32_t num_buckets_new; + struct nhop_neigh *nn, *nn_del = NULL, nn_base = {}; + struct nhop_priv *nh_priv = nh->nh_priv; + + init_neigh(&nn_base, nh_priv); /* build the lookup key from the nexthop */ + + CTL_WLOCK(nnctl); + + nn = find_neigh(nnctl, &nn_base); + if (nn != NULL) { + TAILQ_REMOVE(&nn->nn_nhops, nh_priv, nh_neigh_entry); + if (TAILQ_EMPTY(&nn->nn_nhops)) { + /* + * The entry is about to be freed: take it off the + * feedback list so pktsent_callout() never sees it. + */ + if (nn->nn_flags & NN_FLAG_FB_LINKED) { + nn->nn_flags &= ~NN_FLAG_FB_LINKED; + TAILQ_REMOVE(&nnctl->nn_feedback_list, nn, + nn_feedback_entry); + } + CHT_SLIST_REMOVE(&nnctl->nn_head, nhop_neighs, nn, nn_del); + } + } + + /* Check if the hash needs to be resized */ + num_buckets_new = CHT_SLIST_GET_RESIZE_BUCKETS(&nnctl->nn_head); + + CTL_WUNLOCK(nnctl); + +#if DEBUG_MAX_LEVEL >= LOG_DEBUG + char nhbuf[48], nnbuf[48]; + FIB_NH_LOG(LOG_DEBUG, nh_priv->nh, "nhop %s unlinked from the neigh %s%s", + nhop_print_buf(nh_priv->nh, nhbuf, sizeof(nhbuf)), + neigh_print_buf(nn, nnbuf, sizeof(nnbuf)), + nn_del == NULL ? "" : " (last entry)"); +#endif + + if (nn_del != NULL) + free_neigh(nn_del); + + if (num_buckets_new > 0) + resize_neigh_hash(nnctl, num_buckets_new); +} + +/* + * Updates nhop @nh L2 prepend data with the pre-calculated prepend + * in @lle. If @lle contains no valid data, removes an existing L2 prepend. + * The freshly allocated buffer is released if the nexthop does not take it. + */ +static void +update_prepend_ptr(struct nhop_object *nh, const struct llentry *lle) +{ + void *prepend = NULL; + int prepend_len = 0; + + if (lle->r_flags & RLLE_VALID) { + prepend_len = lle->r_hdrlen; + prepend = nhop_alloc_prepend(prepend_len); /* NULL if too long or out of memory */ + if (prepend != NULL) + memcpy(prepend, lle->r_linkdata, prepend_len); + } + +#if DEBUG_MAX_LEVEL >= LOG_DEBUG + char nhbuf[48], lbuf[48]; + FIB_NH_LOG(LOG_DEBUG, nh, "nhop %s sync L2 from %s", + nhop_print_buf(nh, nhbuf, sizeof(nhbuf)), + llentry_print_buf(lle, nh->nh_ifp, nh->nh_priv->nh_neigh_family, lbuf, sizeof(lbuf))); +#endif + + /* + * nhop_update_prepend() returns false without storing the buffer + * when the nexthop is not linked; free it here to avoid a leak. + */ + if (!nhop_update_prepend(nh, prepend, prepend_len) && prepend != NULL) + nhop_free_prepend(prepend); +} + +/* + * Hook called by the LLE subsystem notifying of the changed L2 prepend + * for the @lle entry.
+ * Function searches the matching neigh entry and updates NH L2 prepend + * for all of the registered nexthops. + */ +void +nhops_update_neigh(struct ifnet *ifp, int family, const struct llentry *lle) +{ + struct nn_control *nnctl = atomic_load_ptr(&V_nn_control); + CTL_TRACKER; + + NET_EPOCH_ASSERT(); + + if (nnctl == NULL) /* no control block published for this VNET */ + return; + + struct nhop_neigh nn_base = { /* lookup key built from the llentry */ + .nn_ifp = ifp, + .nn_upper_family = llentry_get_upper_family(lle, family), + .nn_neigh_family = family, + .nn_addr6 = lle->r_l3addr.addr6, + }; + + bool matched = has_neigh(nnctl, &nn_base); /* pre-check; the entry is looked up again under the lock below */ + +#if DEBUG_MAX_LEVEL >= LOG_DEBUG2 + char lbuf[48]; + RT_LOG(LOG_DEBUG2, "L2 prepend update from %s (matched: %s)", + llentry_print_buf(lle, ifp, family, lbuf, sizeof(lbuf)), + matched ? "true" : "false"); +#endif + + if (!matched) + return; + + CTL_RLOCK(nnctl); + struct nhop_neigh *nn = find_neigh(nnctl, &nn_base); + if (nn != NULL) { /* may be gone since the pre-check */ + struct nhop_priv *nh_priv; + + TAILQ_FOREACH(nh_priv, &nn->nn_nhops, nh_neigh_entry) + update_prepend_ptr(nh_priv->nh, lle); + } + CTL_RUNLOCK(nnctl); +} + + +/* + * LLE validity. + * Both ARP and ND state machines require datapath-liveness checking + * as a step of expiring an lle entry. Additionally, ND state machine + * requires exact timestamp of the first packet traversing LLE after the + * liveness checking request, so it can execute check callouts less often + * (STALE -> DELAY -> PROBE). + * + * Thus, upon receiving the request to check dataplane liveness from LLE layers, + * the code below adds matching neigh entry to the feedback list and fires + * per-VNET callout on per-second basis, recording the first time when the + * packet is traversed. + * + * Neighs are removed from the list in 2 ways: the first is done by the callout + * upon recording the timestamp, the second is LLE code removing the matching + * LLE. + */ + +/* + * Returns total count of all packets that traversed the nexthops + * registered in the @nn.
+ */ +static uint64_t +calc_pktsent(struct nhop_neigh *nn) +{ + uint64_t nn_packets = 0; + struct nhop_priv *nh_priv; + + TAILQ_FOREACH(nh_priv, &nn->nn_nhops, nh_neigh_entry) + nn_packets += counter_u64_fetch(nh_priv->nh->nh_pksent); + return (nn_packets); +} + +/* + * Callout that is called every second to check if the cumulative amount + * of packets traversing relevant neigh entries has changed. If the change + * is observed, record the change time and removes entry from the list. + * + * Note: removing nexthops from the neigh entry results in false positive. + * However, as the value is used to check if the underlying lle is still used, + * the worst that can happen, is that the entry will be kept slightly longer + * before the deletion. + */ +static void +pktsent_callout(void *_arg) +{ + struct nn_control *nnctl = (struct nn_control *)_arg; + struct nhop_neigh *nn, *nn_tmp; + + TAILQ_FOREACH_SAFE(nn, &nnctl->nn_feedback_list, nn_feedback_entry, nn_tmp) { + if (nn->nn_packets != calc_pktsent(nn)) { + nn->nn_packets = 0; + nn->nn_hittime = time_uptime; + nn->nn_flags &= ~NN_FLAG_FB_LINKED; + TAILQ_REMOVE(&nnctl->nn_feedback_list, nn, nn_feedback_entry); +#if DEBUG_MAX_LEVEL >= LOG_DEBUG2 + char nnbuf[48]; + RT_LOG(LOG_DEBUG2, "L2 neigh %s got datapath feedback at %lu", + neigh_print_buf(nn, nnbuf, sizeof(nnbuf)), + nn->nn_hittime); +#endif + } + } + if (!TAILQ_EMPTY(&nnctl->nn_feedback_list)) + schedule_callout(nnctl); +#if DEBUG_MAX_LEVEL >= LOG_DEBUG2 + else + RT_LOG(LOG_DEBUG2, "datapath callout stopped"); +#endif +} + +static void +schedule_callout(struct nn_control *nnctl) +{ + if (callout_pending(&nnctl->nn_feedback_callout) || nnctl->nn_dying) + return; + callout_reset_sbt(&nnctl->nn_feedback_callout, SBT_1S * 1, 0, + pktsent_callout, nnctl, 0); + RT_LOG(LOG_DEBUG2, "datapath callout started"); +} + +static void +update_feedback_membership(struct ifnet *ifp, int family, const struct llentry *lle, + bool add) +{ + struct nn_control *nnctl = 
atomic_load_ptr(&V_nn_control); + struct nhop_neigh *nn; + bool need_callout = false; + + NET_EPOCH_ASSERT(); + + if (__predict_false(nnctl == NULL)) + return; + + struct nhop_neigh nn_base = { + .nn_ifp = ifp, + .nn_upper_family = llentry_get_upper_family(lle, family), + .nn_neigh_family = family, + .nn_addr6 = lle->r_l3addr.addr6, + }; + + /* Most of LLEs do not have mapped nhops, so fail early */ + bool matched = has_neigh(nnctl, &nn_base); + +#if DEBUG_MAX_LEVEL >= LOG_DEBUG2 + char lbuf[48]; + llentry_print_buf(lle, ifp, family, lbuf, sizeof(lbuf)); + if (matched) { + RT_LOG(LOG_DEBUG2, "%s datapath feedback for %s", add ? "request" : "abort", lbuf); + } else { + RT_LOG(LOG_DEBUG3, "%s datapath feedback for %s (nomatch)", add ? "request" : "abort", lbuf); + } +#endif + + if (!matched) + return; + + CTL_WLOCK(nnctl); + nn = find_neigh(nnctl, &nn_base); + if (nn != NULL) { + if (add) { + nn->nn_packets = calc_pktsent(nn); + nn->nn_hittime = 0; + + if (!(nn->nn_flags & NN_FLAG_FB_LINKED)) { + nn->nn_flags |= NN_FLAG_FB_LINKED; + need_callout = TAILQ_EMPTY(&nnctl->nn_feedback_list); + TAILQ_INSERT_TAIL(&nnctl->nn_feedback_list, nn, nn_feedback_entry); +#if DEBUG_MAX_LEVEL >= LOG_DEBUG2 + char llbuf[48], nnbuf[48]; + RT_LOG(LOG_DEBUG2, "added %s to datapath feedback for %s", + neigh_print_buf(nn, nnbuf, sizeof(nnbuf)), + llentry_print_buf(lle, ifp, family, llbuf, sizeof(llbuf))); +#endif + } + } else { + /* Remove from the list */ + if (nn->nn_flags & NN_FLAG_FB_LINKED) { + nn->nn_flags &= ~NN_FLAG_FB_LINKED; + TAILQ_REMOVE(&nnctl->nn_feedback_list, nn, nn_feedback_entry); +#if DEBUG_MAX_LEVEL >= LOG_DEBUG2 + char llbuf[48], + nnbuf[48]; + RT_LOG(LOG_DEBUG2, "removed %s from datapath feedback for %s", + neigh_print_buf(nn, nnbuf, sizeof(nnbuf)), + llentry_print_buf(lle, ifp, family, llbuf, sizeof(llbuf))); +#endif + } + } + } + if (need_callout) + schedule_callout(nnctl); + CTL_WUNLOCK(nnctl); +} + +void +nhops_request_feedback(struct ifnet *ifp, int family, const 
struct llentry *lle) +{ + update_feedback_membership(ifp, family, lle, true); +} + +void +nhops_stop_feedback(struct ifnet *ifp, int family, const struct llentry *lle) +{ + update_feedback_membership(ifp, family, lle, false); +} + +/* + * Returns the timestamp of the first packet traversing the nexthops matching @lle + * after nhops_request_feedback() call, or 0 when no matching entry is found. + */ +time_t +nhops_get_hittime(struct ifnet *ifp, int family, const struct llentry *lle) +{ + struct nn_control *nnctl = atomic_load_ptr(&V_nn_control); + struct nhop_neigh *nn; + time_t hittime = 0; + CTL_TRACKER; + + NET_EPOCH_ASSERT(); + + if (__predict_false(nnctl == NULL)) + return (0); + + struct nhop_neigh nn_base = { + .nn_ifp = ifp, + .nn_upper_family = llentry_get_upper_family(lle, family), + .nn_neigh_family = family, + .nn_addr6 = lle->r_l3addr.addr6, + }; + + CTL_RLOCK(nnctl); + nn = find_neigh(nnctl, &nn_base); + if (nn != NULL) + hittime = nn->nn_hittime; + CTL_RUNLOCK(nnctl); + +#if DEBUG_MAX_LEVEL >= LOG_DEBUG2 + if (nn != NULL) { + char lbuf[48], nnbuf[48]; + RT_LOG(LOG_DEBUG2, "%s datapath feedback returned %lu from %s", + llentry_print_buf(lle, ifp, family, lbuf, sizeof(lbuf)), + (unsigned long)hittime, neigh_print_buf(nn, nnbuf, sizeof(nnbuf))); + } +#endif + + return (hittime); +} + Index: sys/net/route/nhop_utils.h =================================================================== --- sys/net/route/nhop_utils.h +++ sys/net/route/nhop_utils.h @@ -139,6 +139,11 @@ for (_x = CHT_FIRST(_head, _i); _x; _x = _PX##_next(_x)) #define CHT_SLIST_FOREACH_END } +#define CHT_SLIST_FOREACH_SAFE(_head, _PX, _x, _t) /* _t is loaded with the successor of _x before the body runs */ \ + for (uint32_t _i = 0; _i < (_head)->hash_size; _i++) { \ + for (_x = CHT_FIRST(_head, _i); (_x) && ((_t) = _PX##_next(_x), 1); _x = _t) +#define CHT_SLIST_FOREACH_SAFE_END } + #define CHT_SLIST_RESIZE(_head, _PX, _new_void_ptr, _new_hsize) \ uint32_t _new_idx; \ typeof((_head)->ptr) _new_ptr = (void *)_new_void_ptr; \ Index: sys/net/route/nhop_var.h
=================================================================== --- sys/net/route/nhop_var.h +++ sys/net/route/nhop_var.h @@ -51,6 +51,7 @@ /* define multipath hash table */ struct nhgrp_priv; +struct nn_control; CHT_SLIST_DEFINE(nhgroups, struct nhgrp_priv); struct nh_control { @@ -59,6 +60,7 @@ struct nhgroups_head gr_head; /* nhgrp hash table head */ struct rwlock ctl_lock; /* overall ctl lock */ struct rib_head *ctl_rh; /* pointer back to rnh */ + struct nn_control *ctl_nn; /* pointer to neigh ctl */ struct epoch_context ctl_epoch_ctx; /* epoch ctl helper */ }; @@ -86,9 +88,11 @@ u_int nh_refcnt; /* number of references, refcount(9) */ u_int nh_linked; /* refcount(9), == 2 if linked to the list */ int nh_finalized; /* non-zero if finalized() was called */ + bool nh_need_neigh; /* true if L2 resolution is needed */ struct nhop_object *nh; /* backreference to the dataplane nhop */ struct nh_control *nh_control; /* backreference to the rnh */ struct nhop_priv *nh_next; /* hash table membership */ + TAILQ_ENTRY(nhop_priv) nh_neigh_entry; /* neigh membership */ struct vnet *nh_vnet; /* vnet nhop belongs to */ struct epoch_context nh_epoch_ctx; /* epoch data for nhop */ }; @@ -108,4 +112,11 @@ /* nhop_ctl.c */ int cmp_priv(const struct nhop_priv *_one, const struct nhop_priv *_two); +/* nhop_neigh.c */ + +struct nn_control *nhops_get_neigh_ptr(void); +bool nhop_need_neigh(const struct nhop_object *nh); +bool nhop_link_neigh(struct nn_control *nnctl, struct nhop_object *nh); +void nhop_unlink_neighbor(struct nn_control *nnctl, struct nhop_object *nh); + #endif Index: sys/net/route/route_tables.c =================================================================== --- sys/net/route/route_tables.c +++ sys/net/route/route_tables.c @@ -262,6 +262,8 @@ #ifdef FIB_ALGO vnet_fib_init(); #endif + vnet_nhops_init_neigh(); + RTABLES_LOCK_INIT(); RTABLES_LOCK(); @@ -291,6 +293,7 @@ } RTABLES_UNLOCK(); + vnet_nhop_destroy_neigh(); /* * dom_rtdetach calls rt_table_destroy(), 
which * schedules deletion for all rtentries, nexthops and control Index: sys/net/route/route_var.h =================================================================== --- sys/net/route/route_var.h +++ sys/net/route/route_var.h @@ -238,6 +238,8 @@ void nhops_init(void); int nhops_init_rib(struct rib_head *rh); void nhops_destroy_rib(struct rib_head *rh); +void vnet_nhops_init_neigh(void); +void vnet_nhop_destroy_neigh(void); void nhop_ref_object(struct nhop_object *nh); int nhop_try_ref_object(struct nhop_object *nh); void nhop_ref_any(struct nhop_object *nh); Index: sys/netinet/if_ether.c =================================================================== --- sys/netinet/if_ether.c +++ sys/netinet/if_ether.c @@ -986,6 +986,7 @@ if (la_tmp == NULL) { arp_mark_lle_reachable(la, ifp); LLE_WUNLOCK(la); + nhops_update_neigh(ifp, AF_INET, la); } else { /* Free newly-create entry and handle packet */ lltable_free_entry(LLTABLE(ifp), la); @@ -1228,8 +1229,11 @@ return; } + nhops_update_neigh(ifp, AF_INET, la); + /* Clear fast path feedback request if set */ llentry_mark_used(la); + nhops_stop_feedback(ifp, AF_INET, la); } arp_mark_lle_reachable(la, ifp); Index: sys/netinet/ip_fastfwd.c =================================================================== --- sys/netinet/ip_fastfwd.c +++ sys/netinet/ip_fastfwd.c @@ -470,6 +470,7 @@ ro.ro_flags |= RT_HAS_GW; } else gw = (const struct sockaddr *)dst; + route_set_prepend_nh(&ro, nh); /* Handle redirect case. */ redest.s_addr = 0; Index: sys/netinet/ip_output.c =================================================================== --- sys/netinet/ip_output.c +++ sys/netinet/ip_output.c @@ -302,6 +302,8 @@ ro->ro_flags |= (nh_flags & NHF_REJECT) ? RT_REJECT : 0; ro->ro_flags |= (nh_flags & NHF_BLACKHOLE) ? RT_BLACKHOLE : 0; ro->ro_flags |= (nh_flags & NHF_GATEWAY) ? 
RT_HAS_GW : 0; + + route_set_prepend_nh(ro, nh); } /* Index: sys/netinet6/nd6.c =================================================================== --- sys/netinet6/nd6.c +++ sys/netinet6/nd6.c @@ -697,10 +697,10 @@ delay = (long)ND_IFINFO(ifp)->retrans * hz / 1000; break; case ND6_LLINFO_REACHABLE: - if (!ND6_LLINFO_PERMANENT(lle)) { - ifp = lle->lle_tbl->llt_ifp; + ifp = lle->lle_tbl->llt_ifp; + if (!ND6_LLINFO_PERMANENT(lle)) delay = (long)ND_IFINFO(ifp)->reachable * hz; - } + nhops_stop_feedback(ifp, AF_INET6, lle); break; case ND6_LLINFO_STALE: @@ -1420,6 +1420,7 @@ /* Update data */ lltable_set_entry_addr(ifp, lle, buf, sz, off); + nhops_update_neigh(ifp, AF_INET6, lle); struct llentry *child_lle; CK_SLIST_FOREACH(child_lle, &lle->lle_children, lle_child_next) { @@ -1429,6 +1430,7 @@ if (lltable_calc_llheader(ifp, fam, lladdr, buf, &sz, &off) == 0) { /* success */ lltable_set_entry_addr(ifp, child_lle, buf, sz, off); + nhops_update_neigh(ifp, AF_INET6, child_lle); child_lle->ln_state = ND6_LLINFO_REACHABLE; } LLE_WUNLOCK(child_lle); @@ -2054,6 +2056,7 @@ if (ln_tmp == NULL) { /* No existing lle, mark as new entry (6,7) */ is_newentry = 1; + nhops_update_neigh(ifp, AF_INET6, ln); if (lladdr != NULL) { /* (7) */ nd6_llinfo_setstate(ln, ND6_LLINFO_STALE); EVENTHANDLER_INVOKE(lle_event, ln,