Index: sys/conf/files =================================================================== --- sys/conf/files +++ sys/conf/files @@ -4156,6 +4156,7 @@ net/route/nhgrp_ctl.c optional route_mpath net/route/nhop.c standard net/route/nhop_ctl.c standard +net/route/nhop_neigh.c standard net/route/nhop_utils.c standard net/route/fib_algo.c optional fib_algo net/route/route_ctl.c standard Index: sys/net/if.c =================================================================== --- sys/net/if.c +++ sys/net/if.c @@ -337,11 +337,6 @@ SX_SYSINIT_FLAGS(ifnet_detach, &ifnet_detach_sxlock, "ifnet_detach_sx", SX_RECURSE); -#ifdef VIMAGE -#define VNET_IS_SHUTTING_DOWN(_vnet) \ - ((_vnet)->vnet_shutdown && (_vnet)->vnet_state < SI_SUB_VNET_DONE) -#endif - static if_com_alloc_t *if_com_alloc[256]; static if_com_free_t *if_com_free[256]; @@ -1122,7 +1117,7 @@ #ifdef VIMAGE bool shutdown; - shutdown = VNET_IS_SHUTTING_DOWN(ifp->if_vnet); + shutdown = VNET_IS_DYING(ifp->if_vnet); #endif /* @@ -1367,7 +1362,7 @@ } /* Make sure the VNET is stable. */ - shutdown = VNET_IS_SHUTTING_DOWN(ifp->if_vnet); + shutdown = VNET_IS_DYING(ifp->if_vnet); if (shutdown) { CURVNET_RESTORE(); prison_free(pr); @@ -1425,7 +1420,7 @@ } /* Make sure the VNET is stable. */ - shutdown = VNET_IS_SHUTTING_DOWN(ifp->if_vnet); + shutdown = VNET_IS_DYING(ifp->if_vnet); if (shutdown) { CURVNET_RESTORE(); prison_free(pr); @@ -2868,7 +2863,7 @@ CURVNET_SET(so->so_vnet); #ifdef VIMAGE /* Make sure the VNET is stable. 
*/ - shutdown = VNET_IS_SHUTTING_DOWN(so->so_vnet); + shutdown = VNET_IS_DYING(so->so_vnet); if (shutdown) { CURVNET_RESTORE(); return (EBUSY); Index: sys/net/if_llatbl.c =================================================================== --- sys/net/if_llatbl.c +++ sys/net/if_llatbl.c @@ -59,6 +59,7 @@ #include #include #include +#include #include #include #include @@ -428,15 +429,19 @@ llentry_request_feedback(struct llentry *lle) { struct llentry *child_lle; + struct ifnet *ifp = lle->lle_tbl->llt_ifp; + int family = lle->lle_tbl->llt_af; LLE_REQ_LOCK(lle); lle->r_skip_req = 1; LLE_REQ_UNLOCK(lle); + nhops_request_feedback(ifp, family, lle); CK_SLIST_FOREACH(child_lle, &lle->lle_children, lle_child_next) { LLE_REQ_LOCK(child_lle); child_lle->r_skip_req = 1; LLE_REQ_UNLOCK(child_lle); + nhops_request_feedback(ifp, family, child_lle); } } @@ -462,13 +467,18 @@ static time_t llentry_get_hittime_raw(struct llentry *lle) { - time_t lle_hittime = 0; + time_t lle_hittime = 0, nhops_hittime = 0; LLE_REQ_LOCK(lle); if ((lle->r_skip_req == 0) && (lle_hittime < lle->lle_hittime)) lle_hittime = lle->lle_hittime; LLE_REQ_UNLOCK(lle); + struct lltable *llt = lle->lle_tbl; + nhops_hittime = nhops_get_hittime(llt->llt_ifp, llt->llt_af, lle); + if ((nhops_hittime != 0) && ((lle_hittime == 0) || (nhops_hittime < lle_hittime))) + lle_hittime = nhops_hittime; /* 0 means "no LLE hit": take the nexthop feedback, else keep the earlier stamp */ + return (lle_hittime); } @@ -643,6 +653,12 @@ } lltable_unlink_entry(llt, lle); + + /* Mark as invalid to invalidate the caches */ + lle->r_flags &= ~RLLE_VALID; + lle->la_flags &= ~LLE_VALID; + + nhops_update_neigh(ifp, llt->llt_af, lle); IF_AFDATA_WUNLOCK(ifp); llt->llt_delete_entry(llt, lle); @@ -875,6 +891,7 @@ lltable_unlink_entry(llt, lle_tmp); } lltable_link_entry(llt, lle); + nhops_update_neigh(ifp, dst->sa_family, lle); IF_AFDATA_WUNLOCK(ifp); if (lle_tmp != NULL) { Index: sys/net/route/nhop.h =================================================================== --- sys/net/route/nhop.h +++ sys/net/route/nhop.h @@ -134,13 +134,10 @@ }; struct 
ifnet *nh_ifp; /* Logical egress interface. Always != NULL */ struct ifaddr *nh_ifa; /* interface address to use. Always != NULL */ - struct ifnet *nh_aifp; /* ifnet of the source address. Always != NULL */ + void *nh_prepend_raw;/* PTR+len for nexthop prepend */ counter_u64_t nh_pksent; /* packets sent using this nhop */ /* 32 bytes + 4xPTR == 64(amd64) / 48(i386) */ - uint8_t nh_prepend_len; /* length of prepend data */ - uint8_t spare[3]; - uint32_t spare1; /* alignment */ - char nh_prepend[48]; /* L2 prepend */ + struct ifnet *nh_aifp; /* ifnet of the source address. Always != NULL */ struct nhop_priv *nh_priv; /* control plane data */ /* -- 128 bytes -- */ }; @@ -163,6 +160,29 @@ _nh = NULL; \ } while (0) +/* + * L2 prepend infrastructure definitions + * Nexthop L2 rewrites may change during nextop lifetime when the neighbor + * changes its MAC. For the most common encapsulations - ethernet & IB, + * the maximum encap length is 24 (IPoIB) = LLE_MAX_LINKHDR. + */ +#define L2_PREPEND_LEN_BITS CACHE_LINE_SHIFT +#define L2_PREPEND_LEN_MAX ((1 << L2_PREPEND_LEN_BITS) - sizeof(struct epoch_context)) + +#define _NH_L2_PREPEND_MASK_PTR(_p) ((uintptr_t)(_p) & ~((1 << L2_PREPEND_LEN_BITS) - 1)) +#define NH_L2_PREPEND_GET_PTR(_p) ((void *)_NH_L2_PREPEND_MASK_PTR(_p)) +#define NH_L2_PREPEND_GET_LEN(_p) ((uintptr_t)(_p) & ((1 << L2_PREPEND_LEN_BITS) - 1)) + +#define NH_L2_COMPILE_PREPEND_PTR(_p, _l) ((void *)((uintptr_t)(_p) | (_l))) + +static inline void +route_set_prepend_nh(struct route *ro, const struct nhop_object *nh) +{ + void *ptr = nh->nh_prepend_raw; + ro->ro_prepend = (char *)NH_L2_PREPEND_GET_PTR(ptr); + ro->ro_plen = NH_L2_PREPEND_GET_LEN(ptr); +} + struct weightened_nhop { struct nhop_object *nh; uint32_t weight; @@ -180,6 +200,15 @@ struct vnet *nhop_get_vnet(const struct nhop_object *nh); struct nhop_object *nhop_select_func(struct nhop_object *nh, uint32_t flowid); +void *nhop_alloc_prepend(size_t size); +void nhop_free_prepend(void *prepend); +bool 
nhop_update_prepend(struct nhop_object *nh, void *prepend, size_t len); + +void nhops_update_neigh(struct ifnet *ifp, int family, const struct llentry *lle); +void nhops_request_feedback(struct ifnet *ifp, int family, const struct llentry *lle); +void nhops_stop_feedback(struct ifnet *ifp, int family, const struct llentry *lle); +time_t nhops_get_hittime(struct ifnet *ifp, int family, const struct llentry *lle); + #endif /* _KERNEL */ /* Kernel <> userland structures */ Index: sys/net/route/nhop.c =================================================================== --- sys/net/route/nhop.c +++ sys/net/route/nhop.c @@ -362,6 +362,12 @@ return (priv_ret); } +bool +is_nhop_linked(struct nhop_priv *nh_priv) +{ + return (nh_priv->nh_idx != 0); +} + /* * Searches for the nexthop by data specifcied in @nh_priv. * Returns referenced nexthop or NULL. Index: sys/net/route/nhop_ctl.c =================================================================== --- sys/net/route/nhop_ctl.c +++ sys/net/route/nhop_ctl.c @@ -103,6 +103,27 @@ 2 * CACHE_LINE_SIZE) #define NHOP_PRIV_ALIGNED_SIZE roundup2(sizeof(struct nhop_priv), \ 2 * CACHE_LINE_SIZE) + +static uma_zone_t nh_prepend_zone; /* Global zone for all nhop prepend data */ + +struct nhop_prepend { + char prepend[L2_PREPEND_LEN_MAX]; + struct epoch_context epoch_ctx; +}; + +#define NHOP_PREPEND_ALIGNED_SIZE roundup2(sizeof(struct nhop_prepend), \ + CACHE_LINE_SIZE) +/* + * Nexthop L2 rewrites may change during nextop lifetime when the neighbor + * changes its MAC. For the most common encapsulations - ethernet & IB, + * the maximum encap length is 24 (IPoIB) = LLE_MAX_LINKHDR. 
+ * + */ + +static bool nhop_update_prepend_locked(struct nhop_priv *nh_priv, void *prepend, + size_t len); + + void nhops_init(void) { @@ -110,6 +131,8 @@ nhops_zone = uma_zcreate("routing nhops", NHOP_OBJECT_ALIGNED_SIZE + NHOP_PRIV_ALIGNED_SIZE, NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); + nh_prepend_zone = uma_zcreate("nhop prepend", NHOP_PREPEND_ALIGNED_SIZE, + NULL, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0); } /* @@ -571,11 +594,16 @@ * and return. */ DPRINTF("link_nhop failed!"); + if (nh_priv->nh_priv_flags & NHF_PRIV_NEIGH) + nhop_unlink_neighbor(nh_priv); destroy_nhop(nh_priv); return (ENOBUFS); } + if (nhop_need_neigh(nh) && nhop_link_neigh(nh_priv)) + nh_priv->nh_priv_flags |= NHF_PRIV_NEIGH; + return (0); } @@ -627,6 +655,12 @@ ifa_free(nh->nh_ifa); counter_u64_free(nh->nh_pksent); + if (nh->nh_prepend_raw != NULL) { + struct nhop_prepend *np; + np = (struct nhop_prepend *)NH_L2_PREPEND_GET_PTR(nh->nh_prepend_raw); + nhop_free_prepend(np); + } + uma_zfree(nhops_zone, nh); } @@ -690,6 +724,10 @@ NET_EPOCH_ENTER(et); if (refcount_release_if_not_last(&nh_priv->nh_linked)) { + /* Stop receiving updates for neighbor prepends */ + if (nh_priv->nh_priv_flags & NHF_PRIV_NEIGH) + nhop_unlink_neighbor(nh_priv); + ctl = nh_priv->nh_control; if (unlink_nhop(ctl, nh_priv) == NULL) { /* Do not try to reclaim */ @@ -850,8 +888,13 @@ pnhe->nh_mtu = nh->nh_mtu; pnhe->nh_flags = nh->nh_flags; - memcpy(pnhe->nh_prepend, nh->nh_prepend, sizeof(nh->nh_prepend)); - pnhe->prepend_len = nh->nh_prepend_len; + if (nh->nh_prepend_raw != NULL) { + void *ptr = nh->nh_prepend_raw; + pnhe->prepend_len = NH_L2_PREPEND_GET_LEN(ptr); + memcpy(pnhe->nh_prepend, + NH_L2_PREPEND_GET_PTR(ptr), pnhe->prepend_len); + } else + pnhe->prepend_len = 0; pnhe->nh_refcount = nh->nh_priv->nh_refcnt; pnhe->nh_pksent = counter_u64_fetch(nh->nh_pksent); @@ -923,3 +966,74 @@ return (0); } + +void * +nhop_alloc_prepend(size_t size) +{ + if (size > L2_PREPEND_LEN_MAX) + return (NULL); + void *prepend = 
uma_zalloc(nh_prepend_zone, M_NOWAIT | M_ZERO); + return (prepend); +} + +void +nhop_free_prepend(void *prepend) +{ + uma_zfree(nh_prepend_zone, prepend); +} + +static void +destroy_nhop_prepend_epoch(epoch_context_t ctx) +{ + struct nhop_prepend *prepend; + + prepend = __containerof(ctx, struct nhop_prepend, epoch_ctx); + nhop_free_prepend(prepend); +} + +static bool +nhop_update_prepend_locked(struct nhop_priv *nh_priv, void *prepend, size_t len) +{ + void *ptr = NULL, *old_ptr = NULL; + bool result = false; + + if (prepend != NH_L2_PREPEND_GET_PTR(prepend)) { + //KASSERT(); + /* XXX: check alignment */ + + prepend = NULL; + } + if (prepend != NULL) + ptr = NH_L2_COMPILE_PREPEND_PTR(prepend, len); + + if (is_nhop_linked(nh_priv)) { + old_ptr = nh_priv->nh->nh_prepend_raw; + nh_priv->nh->nh_prepend_raw = ptr; + result = true; + } + + if (old_ptr != NULL) { + struct nhop_prepend *np = NH_L2_PREPEND_GET_PTR(old_ptr); + epoch_call(net_epoch_preempt, destroy_nhop_prepend_epoch, + &np->epoch_ctx); + } + + return (result); +} + +bool +nhop_update_prepend(struct nhop_object *nh, void *prepend, size_t len) +{ + struct nhop_priv *nh_priv = nh->nh_priv; + struct nh_control *ctl; + bool result; + + ctl = nh_priv->nh_control; + + NHOPS_WLOCK(ctl); + result = nhop_update_prepend_locked(nh_priv, prepend, len); + NHOPS_WUNLOCK(ctl); + + return (result); +} + Index: sys/net/route/nhop_neigh.c =================================================================== --- /dev/null +++ sys/net/route/nhop_neigh.c @@ -0,0 +1,914 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2021 Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); +#include "opt_inet.h" +#include "opt_route.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +/* + * This file contains data structures management logic for the nexthop ("nhop") + * route subsystem. + * + * Nexthops in the original sense are the objects containing all the necessary + * information to forward the packet to the selected destination. + * In particular, nexthop is defined by a combination of + * ifp, ifa, aifp, mtu, gw addr(if set), nh_type, nh_family, mask of rt_flags and + * NHF_DEFAULT + * + * All nexthops are stored in the resizable hash table. + * Additionally, each nexthop gets assigned its unique index (nexthop index) + * so userland programs can interact with the nexthops easier. Index allocation + * is backed by the bitmask array. 
+ */ + +#define DEBUG_MOD_NAME nhop_neigh +#define DEBUG_MAX_LEVEL LOG_DEBUG +#include +_DECLARE_DEBUG(LOG_DEBUG2); + +CHT_SLIST_DEFINE(nhop_neighs, struct nhop_neigh); +/* produce hash value for an object */ +#define nhop_neighs_hash_obj(_obj) hash_neigh(_obj) +/* compare two objects */ +#define nhop_neighs_cmp(_one, _two) cmp_neigh(_one, _two) +/* next object accessor */ +#define nhop_neighs_next(_obj) (_obj)->nn_next + + +struct nn_control { + struct nhop_neighs_head nn_head; + struct rmlock nn_lock; + struct callout nn_feedback_callout; + TAILQ_HEAD(,nhop_neigh) nn_feedback_list; +}; + +VNET_DEFINE_STATIC(struct nn_control, nn_control); +#define V_nn_control VNET(nn_control) + +#define CTL_WLOCK(ctl) rm_wlock(&ctl->nn_lock) +#define CTL_WUNLOCK(ctl) rm_wunlock(&ctl->nn_lock) +#define CTL_TRACKER struct rm_priotracker tracker +#define CTL_RLOCK(ctl) rm_rlock(&ctl->nn_lock, &tracker) +#define CTL_RUNLOCK(ctl) rm_runlock(&ctl->nn_lock, &tracker) + +struct nhop_neigh { + struct ifnet *nn_ifp; + uint8_t nn_neigh_family; + uint8_t nn_upper_family; + uint16_t nn_flags; + union { + struct in_addr nn_addr4; + struct in6_addr nn_addr6; + }; + uint64_t nn_packets; + time_t nn_hittime; + struct mtx nn_lock; + struct nhop_neigh *nn_next; + TAILQ_HEAD(, nhop_priv) nn_nhops; + TAILQ_ENTRY(nhop_neigh) nn_feedback_entry; +}; +#define NEIGH_END_CMP (__offsetof(struct nhop_neigh, nn_packets)) + +#define NN_FLAG_FB_LINKED 0x01 /* Linked to the feedback list */ + +#define NN_LOCK_INIT(nn) mtx_init(&(nn)->nn_lock, "nhop_neigh lock", NULL, MTX_DEF) +#define NN_LOCK_DESTROY(nn) mtx_destroy(&(nn)->nn_lock) +#define NN_LOCK(nn) mtx_lock(&(nn)->nn_lock) +#define NN_UNLOCK(nn) mtx_unlock(&(nn)->nn_lock) + +_Static_assert(L2_PREPEND_LEN_MAX >= LLE_MAX_LINKHDR, + "CACHE_LINE_SIZE has to be at least LLE_MAX_LINKHDR"); + +static void free_neigh(struct nhop_neigh *nn); +static void update_prepend_ptr(struct nhop_object *nh, const struct llentry *lle); +static void schedule_callout(struct 
nn_control *ctl); + +char *nhop_print_buf(const struct nhop_object *nh, char *buf, size_t bufsize); +char *lle_print_buf(const struct llentry *lle, struct ifnet *ifp, int family, char *buf, size_t bufsize); +char *lle_print_buf_lltable(const struct llentry *lle, char *buf, size_t bufsize); +char *neigh_print_buf(const struct nhop_neigh *nn, char *buf, size_t bufsize); +const char *rib_print_family(int family); + +#if 0 +static char +af_to_char(int family) +{ + switch (family) { + case AF_INET: + return '4'; + case AF_INET6: + return '6'; + case AF_LINK: + return '*'; + } + return 'X'; +} +#endif + +static int +lle_get_upper_family(const struct llentry *lle, int family) +{ + return (lle->r_family == 0 ? family : lle->r_family); +} + +__noinline char * +neigh_print_buf(const struct nhop_neigh *nn, char *buf, size_t bufsize) +{ + /* nn/inet/em0/192.168.0.1 */ + char abuf[INET6_ADDRSTRLEN]; + + if (nn == NULL) { + snprintf(buf, bufsize, "nn/NULL"); + return (buf); + } + + switch (nn->nn_neigh_family) { + case AF_INET6: + inet_ntop(AF_INET6, &nn->nn_addr6, abuf, sizeof(abuf)); + snprintf(buf, bufsize, "nn/%s/%s/%s", + rib_print_family(nn->nn_upper_family), if_name(nn->nn_ifp), abuf); + break; + case AF_INET: + inet_ntop(AF_INET, &nn->nn_addr4, abuf, sizeof(abuf)); + snprintf(buf, bufsize, "nn/%s/%s/%s", + rib_print_family(nn->nn_upper_family), if_name(nn->nn_ifp), abuf); + break; + default: + snprintf(buf, bufsize, "nn/%s/%s/unknown(%s)", + rib_print_family(nn->nn_upper_family), if_name(nn->nn_ifp), + rib_print_family(nn->nn_neigh_family)); + } + + return (buf); +} + +__noinline char * +nhop_print_buf(const struct nhop_object *nh, char *buf, size_t bufsize) +{ + /* nh#33/inet/em0/192.168.0.1 */ + char abuf[INET6_ADDRSTRLEN]; + struct nhop_priv *nh_priv = nh->nh_priv; + + const char *upper_str = rib_print_family(nh->nh_priv->nh_family); + + switch (nh->gw_sa.sa_family) { + case AF_INET6: + inet_ntop(AF_INET6, &nh->gw6_sa.sin6_addr, abuf, sizeof(abuf)); + snprintf(buf, 
bufsize, "nh#%d/%s/%s/%s", nh_priv->nh_idx, upper_str, + if_name(nh->nh_ifp), abuf); + break; + case AF_INET: + inet_ntop(AF_INET, &nh->gw4_sa.sin_addr, abuf, sizeof(abuf)); + snprintf(buf, bufsize, "nh#%d/%s/%s/%s", nh_priv->nh_idx, upper_str, + if_name(nh->nh_ifp), abuf); + break; + case AF_LINK: + snprintf(buf, bufsize, "nh#%d/%s/%s/resolve", nh_priv->nh_idx, upper_str, + if_name(nh->nh_ifp)); + break; + default: + snprintf(buf, bufsize, "nh#%d/%s/%s/????", nh_priv->nh_idx, upper_str, + if_name(nh->nh_ifp)); + break; + } + + return (buf); +} + +__noinline char * +lle_print_buf(const struct llentry *lle, struct ifnet *ifp, int family, char *buf, size_t bufsize) +{ + /* lle/4/V/em0/1.2.3.4 */ + char abuf[INET6_ADDRSTRLEN]; + + const char *valid = (lle->r_flags & RLLE_VALID) ? "valid" : "no_l2"; + const char *upper_str = rib_print_family(lle_get_upper_family(lle, family)); + + switch (family) { + case AF_INET: + inet_ntop(AF_INET, &lle->r_l3addr.addr4, abuf, sizeof(abuf)); + snprintf(buf, bufsize, "lle/%s/%s/%s/%s", upper_str, + valid, if_name(ifp), abuf); + break; + case AF_INET6: + inet_ntop(AF_INET6, &lle->r_l3addr.addr6, abuf, sizeof(abuf)); + snprintf(buf, bufsize, "lle/%s/%s/%s/%s", upper_str, + valid, if_name(ifp), abuf); + break; + default: + snprintf(buf, bufsize, "lle/%s/%s/%s/????", upper_str, + valid, if_name(ifp)); + break; + } + + return (buf); +} + +__noinline char * +lle_print_buf_lltable(const struct llentry *lle, char *buf, size_t bufsize) +{ + struct lltable *tbl = lle->lle_tbl; + + return (lle_print_buf(lle, lltable_get_ifp(tbl), lltable_get_af(tbl), buf, bufsize)); +} + +const char * +rib_print_family(int family) +{ + + if (family == AF_INET) + return ("inet"); + else if (family == AF_INET6) + return ("inet6"); + else + return ("unknown"); +} + + +void +vnet_nhops_init_neigh(void) +{ + struct nn_control *ctl = &V_nn_control; + /* + * Allocate nexthop hash. Start with 16 items by default (128 bytes). 
+ * This will be enough for most of the cases. + */ + int num_buckets = 16; + size_t alloc_size = CHT_SLIST_GET_RESIZE_SIZE(num_buckets); + + void *ptr = malloc(alloc_size, M_NHOP, M_WAITOK | M_ZERO); + CHT_SLIST_INIT(&ctl->nn_head, ptr, num_buckets); + rm_init(&ctl->nn_lock, "nexthop neigh lock"); + TAILQ_INIT(&ctl->nn_feedback_list); + + callout_init(&ctl->nn_feedback_callout, 1); +} + +void +vnet_nhop_destroy_neigh(void) +{ + struct nn_control *ctl = &V_nn_control; + struct nhop_neigh *nn, *nn_tmp; + + callout_drain(&ctl->nn_feedback_callout); + + /* + * + * Close to the end. + * All relevant interfaces are set to if_down (~IFF_UP) so + * there shouldn't be any incoming traffic that can trigger + * ARP/ND updates. + * XXX: static records triggered by ndp? + * Should not be any routes - as it's called in the end of + * rtables_destroy() + */ + + CHT_SLIST_FOREACH_SAFE(&ctl->nn_head, nhop_neighs, nn, nn_tmp) { + free_neigh(nn); + } CHT_SLIST_FOREACH_END; + + rm_destroy(&ctl->nn_lock); +} + +/* + * Nexhop hash calculation: + */ +struct _hash_data { + uint16_t ifentropy; + uint8_t neigh_family; + uint8_t upper_family; + uint32_t addr; +}; + +static unsigned +djb_hash(const unsigned char *h, const int len) +{ + unsigned int result = 0; + int i; + + for (i = 0; i < len; i++) + result = 33 * result ^ h[i]; + + return (result); +} + +static uint32_t +hash_neigh(const struct nhop_neigh *nn) +{ + struct _hash_data key = { + .ifentropy = (uint16_t)((((uintptr_t)nn->nn_ifp) >> 6) & 0xFFFF), + .neigh_family = nn->nn_neigh_family, + .upper_family = nn->nn_upper_family, + .addr = (nn->nn_neigh_family == AF_INET6) ? 
+ nn->nn_addr6.s6_addr32[3] : nn->nn_addr4.s_addr + }; + + return (uint32_t)(djb_hash((const unsigned char *)&key, sizeof(key))); +} + +static int +cmp_neigh(const struct nhop_neigh *_one, const struct nhop_neigh *_two) +{ + + if (memcmp(_one, _two, NEIGH_END_CMP) != 0) + return (0); + return (1); +} + +/* + * Searches for the nexthop neigh by data specified in @nh_priv. + * Returns referenced nexthop or NULL. + */ +static struct nhop_neigh * +find_neigh(struct nn_control *ctl, const struct nhop_neigh *nn) +{ + struct nhop_neigh *nn_ret; + + CHT_SLIST_FIND_BYOBJ(&ctl->nn_head, nhop_neighs, nn, nn_ret); + return (nn_ret); +} + +static bool +has_neigh(struct nn_control *ctl, const struct nhop_neigh *nn_base) +{ + CTL_TRACKER; + bool result; + + CTL_RLOCK(ctl); + result = find_neigh(ctl, nn_base) != NULL; + CTL_RUNLOCK(ctl); + + return (result); +} + +/* + * Tries to resize neighbor hash to the value specified by @new_num_buckets. + */ +static void +resize_neigh_hash(struct nn_control *ctl, uint32_t new_num_buckets) +{ + size_t alloc_size = CHT_SLIST_GET_RESIZE_SIZE(new_num_buckets); + void *nn_ptr = malloc(alloc_size, M_NHOP, M_NOWAIT | M_ZERO); + if (nn_ptr == NULL) { + /* allocations has failed. */ + RT_LOG(LOG_NOTICE, "neigh hash resize to %u has failed", new_num_buckets); + return; + } + + CTL_WLOCK(ctl); + RT_LOG(LOG_DEBUG, "going to resize neigh hash: %u -> %u", + ctl->nn_head.hash_size, new_num_buckets); + CHT_SLIST_RESIZE(&ctl->nn_head, nhop_neighs, nn_ptr, new_num_buckets); + CTL_WUNLOCK(ctl); + + if (nn_ptr != NULL) + free(nn_ptr, M_NHOP); +} + +/* + * Checks if nexthop @nh can be attached to the LLE/NDP neighbor. + * Function verifies that target interface has L2 and nexthop contains + * gateway (or is a host route). + * Returns true on success. 
+ */ +bool +nhop_need_neigh(const struct nhop_object *nh) +{ + bool match = false; + + switch (nh->nh_ifp->if_type) { + case IFT_BRIDGE: + case IFT_ETHER: + case IFT_INFINIBAND: + case IFT_L2VLAN: + match = true; + break; + } + + if (match) { + if (nh->nh_flags & (NHF_GATEWAY|NHF_HOST)) + return (true); + } + + return (false); +} + +/* + * Fills in nhop_neigh data based on the nexthop specified by @nh_priv. + */ +static void +init_neigh(struct nhop_neigh *nn, const struct nhop_priv *nh_priv) +{ + const struct nhop_object *nh = nh_priv->nh; + + nn->nn_ifp = nh->nh_ifp; + nn->nn_neigh_family = nh->gw_sa.sa_family; + nn->nn_upper_family = nh_priv->nh_family; + switch (nn->nn_neigh_family) { + case AF_INET: + nn->nn_addr4 = nh->gw4_sa.sin_addr; + break; + case AF_INET6: + nn->nn_addr6 = nh->gw6_sa.sin6_addr; + break; + } + TAILQ_INIT(&nn->nn_nhops); + NN_LOCK_INIT(nn); +} + +static void +free_neigh(struct nhop_neigh *nn) +{ + NN_LOCK_DESTROY(nn); + free(nn, M_NHOP); +} + +static struct llentry * +find_lle(struct nhop_priv *nh_priv) +{ + void *afdata_ptr; + struct llentry *lle = NULL; + struct lltable *llt = NULL; + struct nhop_object *nh = nh_priv->nh; + + switch (nh->gw_sa.sa_family) { + case AF_INET: + afdata_ptr = nh->nh_ifp->if_afdata[AF_INET]; + if (afdata_ptr != NULL) + llt = ((struct in_ifinfo *)afdata_ptr)->ii_llt; + break; + case AF_INET6: + afdata_ptr = nh->nh_ifp->if_afdata[AF_INET6]; + if (afdata_ptr != NULL) + llt = ((struct in6_ifextra *)afdata_ptr)->lltable; + break; + } + + if (llt != NULL) + lle = lla_lookup(llt, LLE_UNLOCKED, &nh->gw_sa); + if (lle != NULL) { + if (nh_priv->nh_family != nh->gw_sa.sa_family) + lle = llentry_lookup_family(lle, nh_priv->nh_family); + } +#if DEBUG_MAX_LEVEL >= LOG_DEBUG + char nhbuf[48], lbuf[48]; + FIB_NH_LOG(LOG_DEBUG, nh, "nhop %s: mapped to lle %s", + nhop_print_buf(nh, nhbuf, sizeof(nhbuf)), + lle ? 
lle_print_buf(lle, nh->nh_ifp, nh->gw_sa.sa_family, lbuf, sizeof(lbuf)) : "NULL"); +#endif + return (lle); +} + +/* + * Links nextop @nh_priv to the nexhop neighbor hash table and tries + * to fill in L2 nexthop prepend. + * Returns true on successful linkage. + */ +bool +nhop_link_neigh(struct nhop_priv *nh_priv) +{ + uint32_t num_buckets_new; + struct nn_control *ctl = &V_nn_control; + struct nhop_neigh *nn = NULL, *nn_new; + + NET_EPOCH_ASSERT(); + + /* + * Most llentries have at most one nexthop attached. + * Thus, assume we'll be inserting a new record. + */ + + nn_new = malloc(sizeof(struct nhop_neigh), M_NHOP, M_NOWAIT | M_ZERO); + if (nn_new == NULL) + return (false); + init_neigh(nn_new, nh_priv); + + /* Try to calculate the prepend */ + struct llentry *lle = find_lle(nh_priv); + if (lle != NULL) + update_prepend_ptr(nh_priv->nh, lle); + + CTL_WLOCK(ctl); + + /* + * Check if we need to resize hash and index. + * The following 2 functions returns either new size or 0 + * if resize is not required. + */ + num_buckets_new = CHT_SLIST_GET_RESIZE_BUCKETS(&ctl->nn_head); + + /* Check if already exists */ + CHT_SLIST_FIND_BYOBJ(&ctl->nn_head, nhop_neighs, nn_new, nn); + + if (nn == NULL) { + nn = nn_new; + nn_new = NULL; + CHT_SLIST_INSERT_HEAD(&ctl->nn_head, nhop_neighs, nn); + + /* + * XXXME: There can be a race when lle gets deleted after lookup + */ + } + TAILQ_INSERT_TAIL(&nn->nn_nhops, nh_priv, nh_neigh_entry); + + CTL_WUNLOCK(ctl); + +#if DEBUG_MAX_LEVEL >= LOG_DEBUG + char nhbuf[48], nnbuf[48]; + FIB_NH_LOG(LOG_DEBUG, nh_priv->nh, "nhop %s linked to %s %s", + nhop_print_buf(nh_priv->nh, nhbuf, sizeof(nhbuf)), + nn_new == NULL ? "new" : "existing", + neigh_print_buf(nn, nnbuf, sizeof(nnbuf))); +#endif + + if (nn_new != NULL) + free_neigh(nn_new); + + if (num_buckets_new > 0) + resize_neigh_hash(ctl, num_buckets_new); + + return (true); +} + +/* + * Unlinks nexthop specified by @nh_priv data. 
+ */ +void +nhop_unlink_neighbor(struct nhop_priv *nh_priv) +{ + struct nn_control *ctl = &V_nn_control; + uint32_t num_buckets_new; + struct nhop_neigh *nn, *nn_del = NULL, nn_base = {}; + + init_neigh(&nn_base, nh_priv); + + CTL_WLOCK(ctl); + + nn = find_neigh(ctl, &nn_base); + if (nn != NULL) { + TAILQ_REMOVE(&nn->nn_nhops, nh_priv, nh_neigh_entry); + if (TAILQ_EMPTY(&nn->nn_nhops)) { + CHT_SLIST_REMOVE(&ctl->nn_head, nhop_neighs, nn, nn_del); + } + } + + /* Check if hash or index needs to be resized */ + num_buckets_new = CHT_SLIST_GET_RESIZE_BUCKETS(&ctl->nn_head); + + CTL_WUNLOCK(ctl); + +#if DEBUG_MAX_LEVEL >= LOG_DEBUG + char nhbuf[48], nnbuf[48]; + FIB_NH_LOG(LOG_DEBUG, nh_priv->nh, "nhop %s unlinked from the neigh %s%s", + nhop_print_buf(nh_priv->nh, nhbuf, sizeof(nhbuf)), + neigh_print_buf(nn, nnbuf, sizeof(nnbuf)), + nn_del == NULL ? "" : " (last entry)"); +#endif + + if (nn_del != NULL) + free_neigh(nn_del); + + if (num_buckets_new > 0) + resize_neigh_hash(ctl, num_buckets_new); +} + +/* + * Updates nhop @nh L2 prepend data with the pre-calculated prepend + * in @lle. If @lle contains no valid data, removes an existing L2 prepend. + */ +static void +update_prepend_ptr(struct nhop_object *nh, const struct llentry *lle) +{ + void *prepend = NULL; + int prepend_len = 0; + + if (lle->r_flags & RLLE_VALID) { + prepend_len = lle->r_hdrlen; + prepend = nhop_alloc_prepend(prepend_len); + if (prepend != NULL) + memcpy(prepend, lle->r_linkdata, prepend_len); + } + +#if DEBUG_MAX_LEVEL >= LOG_DEBUG + char nhbuf[48], lbuf[48]; + FIB_NH_LOG(LOG_DEBUG, nh, "nhop %s sync L2 from %s", + nhop_print_buf(nh, nhbuf, sizeof(nhbuf)), + lle_print_buf(lle, nh->nh_ifp, nh->gw_sa.sa_family, lbuf, sizeof(lbuf))); +#endif + + nhop_update_prepend(nh, prepend, prepend_len); +} + +/* + * Hook called by the LLE subsystem notifying of the changed L2 prepend + * for the @lle entry. 
+ * Function searches the matching neigh entry and updates NH L2 prepend + * for all of the registered nexthops. + */ +void +nhops_update_neigh(struct ifnet *ifp, int family, const struct llentry *lle) +{ + CTL_TRACKER; + struct nn_control *ctl = &V_nn_control; + + NET_EPOCH_ASSERT(); + + if (VNET_IS_DYING(curvnet)) + return; + + struct nhop_neigh nn_base = { + .nn_ifp = ifp, + .nn_upper_family = lle_get_upper_family(lle, family), + .nn_neigh_family = family, + .nn_addr6 = lle->r_l3addr.addr6, + }; + + bool matched = has_neigh(ctl, &nn_base); + +#if DEBUG_MAX_LEVEL >= LOG_DEBUG2 + char lbuf[48]; + RT_LOG(LOG_DEBUG2, "L2 prepend update from %s (matched: %s)", + lle_print_buf(lle, ifp, family, lbuf, sizeof(lbuf)), + matched ? "true" : "false"); +#endif + + if (!matched) + return; + + CTL_RLOCK(ctl); + struct nhop_neigh *nn = find_neigh(ctl, &nn_base); + if (nn != NULL) { + struct nhop_priv *nh_priv; + + TAILQ_FOREACH(nh_priv, &nn->nn_nhops, nh_neigh_entry) + update_prepend_ptr(nh_priv->nh, lle); + } + CTL_RUNLOCK(ctl); +} + + +/* + * LLE validity. + * Both ARP and ND state machines requires datapath-liveness checking + * as a step of expiring an lle entry. Additionally, ND state machine + * requires exact timestamp of the first packet traversing LLE after the + * liveness checking request, so it can execute check callouts less often + * (STALE -> DELAY -> PROBE). + * + * Thus, upon receiving the request to check dataplane liveness from LLE layers, + * the code below adds matching neigh entry to the feedback list and fires + * per-VNET callout on per-second basis, recording the first time when the + * packet is traversed. + * + * Neighs are removed from the list in 2 ways: the first is done by the callout + * upon recording the timestamp, the second is LLE code removing the matching + * LLE. + * + */ + +/* + * Returns total count of all packets that traversed the nexthops + * registered in the @nn. 
+ */ +static uint64_t +calc_pktsent(struct nhop_neigh *nn) +{ + uint64_t nn_packets = 0; + struct nhop_priv *nh_priv; + + TAILQ_FOREACH(nh_priv, &nn->nn_nhops, nh_neigh_entry) + nn_packets += counter_u64_fetch(nh_priv->nh->nh_pksent); + return (nn_packets); +} + +/* + * Callout that is called every second to check if the cumulative amount + * of packets traversing relevant neigh entries has changed. If the change + * is observed, record the change time and removes entry from the list. + * + * Note: removing nexthops from the neigh entry results in false positive. + * However, as the value is used to check if the underlying lle is still used, + * the worst that can happen, is that the entry will be kept slightly longer + * before the deletion. + */ +static void +pktsent_callout(void *_arg) +{ + struct nn_control *ctl = (struct nn_control *)_arg; + struct nhop_neigh *nn, *nn_tmp; + bool empty; + + CTL_WLOCK(ctl); + + TAILQ_FOREACH_SAFE(nn, &ctl->nn_feedback_list, nn_feedback_entry, nn_tmp) { + if (nn->nn_packets != calc_pktsent(nn)) { + nn->nn_packets = 0; + nn->nn_hittime = time_uptime; + nn->nn_flags &= ~NN_FLAG_FB_LINKED; + TAILQ_REMOVE(&ctl->nn_feedback_list, nn, nn_feedback_entry); +#if DEBUG_MAX_LEVEL >= LOG_DEBUG2 + char nnbuf[48]; + RT_LOG(LOG_DEBUG2, "L2 neigh %s got datapath feedback at %lu", + neigh_print_buf(nn, nnbuf, sizeof(nnbuf)), + nn->nn_hittime); +#endif + } + } + empty = TAILQ_EMPTY(&ctl->nn_feedback_list); + CTL_WUNLOCK(ctl); + if (!empty) + schedule_callout(ctl); +#if DEBUG_MAX_LEVEL >= LOG_DEBUG2 + else + RT_LOG(LOG_DEBUG2, "datapath callout stopped"); +#endif +} + +static void +schedule_callout(struct nn_control *ctl) +{ + if (callout_pending(&ctl->nn_feedback_callout)) + return; + callout_reset_sbt(&ctl->nn_feedback_callout, SBT_1S * 1, 0, + pktsent_callout, ctl, 0); +} + +static void +update_feedback_membership(struct ifnet *ifp, int family, const struct llentry *lle, + bool add) +{ + struct nn_control *ctl = &V_nn_control; + struct 
nhop_neigh *nn; + bool need_callout = false; + + NET_EPOCH_ASSERT(); + + if (VNET_IS_DYING(curvnet)) + return; + + struct nhop_neigh nn_base = { + .nn_ifp = ifp, + .nn_upper_family = lle_get_upper_family(lle, family), + .nn_neigh_family = family, + .nn_addr6 = lle->r_l3addr.addr6, + }; + + /* Most of LLEs do not have mapped nhops, so fail early */ + bool matched = has_neigh(ctl, &nn_base); + +#if DEBUG_MAX_LEVEL >= LOG_DEBUG2 + char lbuf[48]; + lle_print_buf(lle, ifp, family, lbuf, sizeof(lbuf)); + if (matched) { + RT_LOG(LOG_DEBUG2, "%s datapath feedback for %s", add ? "request" : "abort", lbuf); + } else { + RT_LOG(LOG_DEBUG3, "%s datapath feedback for %s (nomatch)", add ? "request" : "abort", lbuf); + } +#endif + + if (!matched) + return; + + CTL_WLOCK(ctl); + nn = find_neigh(ctl, &nn_base); + if (nn != NULL) { + if (add) { + nn->nn_packets = calc_pktsent(nn); + nn->nn_hittime = 0; + + if (!(nn->nn_flags & NN_FLAG_FB_LINKED)) { + nn->nn_flags |= NN_FLAG_FB_LINKED; + need_callout = TAILQ_EMPTY(&ctl->nn_feedback_list); + TAILQ_INSERT_TAIL(&ctl->nn_feedback_list, nn, nn_feedback_entry); +#if DEBUG_MAX_LEVEL >= LOG_DEBUG2 + char llbuf[48], nnbuf[48]; + RT_LOG(LOG_DEBUG2, "added %s to datapath feedback for %s", + neigh_print_buf(nn, nnbuf, sizeof(nnbuf)), + lle_print_buf(lle, ifp, family, llbuf, sizeof(llbuf))); +#endif + } + } else { + /* Remove from the list */ + if (nn->nn_flags & NN_FLAG_FB_LINKED) { + nn->nn_flags &= ~NN_FLAG_FB_LINKED; + TAILQ_REMOVE(&ctl->nn_feedback_list, nn, nn_feedback_entry); +#if DEBUG_MAX_LEVEL >= LOG_DEBUG2 + char llbuf[48], + nnbuf[48]; + RT_LOG(LOG_DEBUG2, "removed %s from datapath feedback for %s", + neigh_print_buf(nn, nnbuf, sizeof(nnbuf)), + lle_print_buf(lle, ifp, family, llbuf, sizeof(llbuf))); +#endif + } + } + } + CTL_WUNLOCK(ctl); + if (need_callout) { + schedule_callout(ctl); +#if DEBUG_MAX_LEVEL >= LOG_DEBUG2 + RT_LOG(LOG_DEBUG2, "datapath callout started"); +#endif + } +} + +void +nhops_request_feedback(struct ifnet *ifp, 
int family, const struct llentry *lle) +{ + update_feedback_membership(ifp, family, lle, true); +} + +void +nhops_stop_feedback(struct ifnet *ifp, int family, const struct llentry *lle) +{ + update_feedback_membership(ifp, family, lle, false); +} + +/* + * Returns the timestamp of the first packet traversing the nexthops matching @lle + * after nhops_request_feedback() call. + */ +time_t +nhops_get_hittime(struct ifnet *ifp, int family, const struct llentry *lle) +{ + struct nn_control *ctl = &V_nn_control; + struct nhop_neigh *nn; + time_t hittime = 0; + CTL_TRACKER; + + NET_EPOCH_ASSERT(); + + if (VNET_IS_DYING(curvnet)) + return (0); + + struct nhop_neigh nn_base = { + .nn_ifp = ifp, + .nn_upper_family = lle_get_upper_family(lle, family), + .nn_neigh_family = family, + .nn_addr6 = lle->r_l3addr.addr6, + }; + + CTL_RLOCK(ctl); + nn = find_neigh(ctl, &nn_base); + if (nn != NULL) + hittime = nn->nn_hittime; + CTL_RUNLOCK(ctl); + +#if DEBUG_MAX_LEVEL >= LOG_DEBUG2 + if (nn != NULL) { + char lbuf[48], nnbuf[48]; + RT_LOG(LOG_DEBUG2, "%s datapath feedback returned %jd from %s", + lle_print_buf(lle, ifp, family, lbuf, sizeof(lbuf)), + (intmax_t)hittime, neigh_print_buf(nn, nnbuf, sizeof(nnbuf))); + } +#endif + + return (hittime); +} Index: sys/net/route/nhop_utils.h =================================================================== --- sys/net/route/nhop_utils.h +++ sys/net/route/nhop_utils.h @@ -139,6 +139,12 @@ for (_x = CHT_FIRST(_head, _i); _x; _x = _PX##_next(_x)) #define CHT_SLIST_FOREACH_END } +/* Walks every bucket; @_t holds the successor so @_x may be unlinked. */ +#define CHT_SLIST_FOREACH_SAFE(_head, _PX, _x, _t) \ + for (uint32_t _i = 0; _i < (_head)->hash_size; _i++) { \ + for (_x = CHT_FIRST(_head, _i); (_x) && ((_t) = _PX##_next(_x), 1); _x = _t) +#define CHT_SLIST_FOREACH_SAFE_END } + #define CHT_SLIST_RESIZE(_head, _PX, _new_void_ptr, _new_hsize) \ uint32_t _new_idx; \ typeof((_head)->ptr) _new_ptr = (void *)_new_void_ptr; \ Index: sys/net/route/nhop_var.h =================================================================== --- 
sys/net/route/nhop_var.h +++ sys/net/route/nhop_var.h @@ -80,12 +80,14 @@ uint32_t rt_flags; /* routing flags for the control plane */ /* nhop lookup comparison end */ uint32_t nh_idx; /* nexthop index */ + uint32_t nh_priv_flags; /* non user-visible flags */ void *cb_func; /* function handling additional rewrite caps */ u_int nh_refcnt; /* number of references, refcount(9) */ u_int nh_linked; /* refcount(9), == 2 if linked to the list */ struct nhop_object *nh; /* backreference to the dataplane nhop */ struct nh_control *nh_control; /* backreference to the rnh */ struct nhop_priv *nh_next; /* hash table membership */ + TAILQ_ENTRY(nhop_priv) nh_neigh_entry; /* neigh membership */ struct vnet *nh_vnet; /* vnet nhop belongs to */ struct epoch_context nh_epoch_ctx; /* epoch data for nhop */ }; @@ -95,13 +97,22 @@ #define NH_IS_PINNED(_nh) ((!NH_IS_NHGRP(_nh)) && \ ((_nh)->nh_priv->rt_flags & RTF_PINNED)) +#define NHF_PRIV_NEIGH 0x01 /* linked to a neighbor record */ + /* nhop.c */ struct nhop_priv *find_nhop(struct nh_control *ctl, const struct nhop_priv *nh_priv); int link_nhop(struct nh_control *ctl, struct nhop_priv *nh_priv); struct nhop_priv *unlink_nhop(struct nh_control *ctl, struct nhop_priv *nh_priv); +bool is_nhop_linked(struct nhop_priv *nh_priv); /* nhop_ctl.c */ int cmp_priv(const struct nhop_priv *_one, const struct nhop_priv *_two); +/* nhop_neigh.c */ + +bool nhop_need_neigh(const struct nhop_object *nh); +bool nhop_link_neigh(struct nhop_priv *nh_priv); +void nhop_unlink_neighbor(struct nhop_priv *nh_priv); + #endif Index: sys/net/route/route_ctl.c =================================================================== --- sys/net/route/route_ctl.c +++ sys/net/route/route_ctl.c @@ -118,6 +118,9 @@ VNET_DEFINE_STATIC(uma_zone_t, rtzone); #define V_rtzone VNET(rtzone) +/* Debug bits */ +SYSCTL_NODE(_net_route, OID_AUTO, debug, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); + void vnet_rtzone_init() { Index: sys/net/route/route_debug.h 
=================================================================== --- /dev/null +++ sys/net/route/route_debug.h @@ -0,0 +1,126 @@ +/*- + * Copyright (c) 2021 + * Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _NET_ROUTE_DEBUG_H_ +#define _NET_ROUTE_DEBUG_H_ + +#include +#include + +const char *rib_print_family(int family); + +static inline uint32_t +nhop_get_fibnum(const struct nhop_object *nh) +{ + return (0); +} + +/* DEBUG logic */ +#if defined(DEBUG_MOD_NAME) && defined(DEBUG_MAX_LEVEL) +#define _DEBUG_PASS_MSG(_l) (DEBUG_MOD_NAME##_debug_level >= (_l)) + +#define OID_NAME _OID_NAME(DEBUG_MOD_NAME) +#define _OID_NAME(a) _OID_NAME_INDIRECT(a) +#define _OID_NAME_INDIRECT(prefix) prefix##_debug_level + +#define SPREFIX _SPREFIX(DEBUG_MOD_NAME) +#define _SPREFIX(a) __SPREFIX(a) +#define __SPREFIX(a) #a + + +#define _DECLARE_DEBUG(_default_level) \ + SYSCTL_DECL(_net_route_debug); \ + static int DEBUG_MOD_NAME##_debug_level = _default_level; \ + SYSCTL_INT(_net_route_debug, OID_AUTO, OID_NAME,\ + CTLFLAG_RW | CTLFLAG_RWTUN, \ + &(DEBUG_MOD_NAME##_debug_level), 0, "debuglevel") + +#ifndef LOG_DEBUG2 +#define LOG_DEBUG2 8 +#endif +#ifndef LOG_DEBUG3 +#define LOG_DEBUG3 9 +#endif + +#define _output printf + +#define _FIB_LOG(_l, _fib, _fam, _fmt, ...) if (_DEBUG_PASS_MSG(_l)) { \ + _output("[" SPREFIX "] %s.%u %s: " _fmt "\n", rib_print_family(_fam), _fib, __func__, ##__VA_ARGS__); \ +} +#define FIB_LOG(_l, _fib, _fam, _fmt, ...) FIB_LOG_##_l(_l, _fib, _fam, _fmt, ## __VA_ARGS__) + +#define FIB_NH_LOG(_l, _nh, _fmt, ...) FIB_LOG_##_l(_l, nhop_get_fibnum(_nh), (_nh)->gw_sa.sa_family, _fmt, ## __VA_ARGS__) + +#define _RT_LOG(_l, _fmt, ...) if (_DEBUG_PASS_MSG(_l)) { \ + _output("[" SPREFIX "] %s: " _fmt "\n", __func__, ##__VA_ARGS__); \ +} +#define RT_LOG(_l, _fmt, ...) RT_LOG_##_l(_l, _fmt, ## __VA_ARGS__) + + +#if DEBUG_MAX_LEVEL>=LOG_DEBUG3 /* most verbose tier */ +#define FIB_LOG_LOG_DEBUG3 _FIB_LOG +#define RT_LOG_LOG_DEBUG3 _RT_LOG +#else +#define FIB_LOG_LOG_DEBUG3(_l, _fib, _fam, _fmt, ...) +#define RT_LOG_LOG_DEBUG3(_l, _fmt, ...) 
+#endif +#if DEBUG_MAX_LEVEL>=LOG_DEBUG2 +#define FIB_LOG_LOG_DEBUG2 _FIB_LOG +#define RT_LOG_LOG_DEBUG2 _RT_LOG +#else +#define FIB_LOG_LOG_DEBUG2(_l, _fib, _fam, _fmt, ...) +#define RT_LOG_LOG_DEBUG2(_l, _fmt, ...) +#endif +#if DEBUG_MAX_LEVEL>=LOG_DEBUG +#define FIB_LOG_LOG_DEBUG _FIB_LOG +#define RT_LOG_LOG_DEBUG _RT_LOG +#else +#define FIB_LOG_LOG_DEBUG(_l, _fib, _fam, _fmt, ...) +#define RT_LOG_LOG_DEBUG(_l, _fmt, ...) +#endif +#if DEBUG_MAX_LEVEL>=LOG_INFO +#define FIB_LOG_LOG_INFO _FIB_LOG +#define RT_LOG_LOG_INFO _RT_LOG +#else +#define FIB_LOG_LOG_INFO(_l, _fib, _fam, _fmt, ...) +#define RT_LOG_LOG_INFO(_l, _fmt, ...) +#endif +#define FIB_LOG_LOG_NOTICE _FIB_LOG +#define FIB_LOG_LOG_ERR _FIB_LOG +#define FIB_LOG_LOG_WARNING _FIB_LOG +#define RT_LOG_LOG_NOTICE _RT_LOG +#define RT_LOG_LOG_ERR _RT_LOG +#define RT_LOG_LOG_WARNING _RT_LOG + + +#endif + +#endif \ No newline at end of file Index: sys/net/route/route_helpers.c =================================================================== --- sys/net/route/route_helpers.c +++ sys/net/route/route_helpers.c @@ -571,3 +571,4 @@ return (NULL); } #endif + Index: sys/net/route/route_tables.c =================================================================== --- sys/net/route/route_tables.c +++ sys/net/route/route_tables.c @@ -262,6 +262,8 @@ #ifdef FIB_ALGO vnet_fib_init(); #endif + vnet_nhops_init_neigh(); + RTABLES_LOCK_INIT(); RTABLES_LOCK(); @@ -306,6 +308,7 @@ #ifdef FIB_ALGO vnet_fib_destroy(); #endif + vnet_nhop_destroy_neigh(); } VNET_SYSUNINIT(rtables_destroy, SI_SUB_PROTO_DOMAIN, SI_ORDER_FIRST, rtables_destroy, 0); Index: sys/net/route/route_var.h =================================================================== --- sys/net/route/route_var.h +++ sys/net/route/route_var.h @@ -247,6 +247,8 @@ void nhops_init(void); int nhops_init_rib(struct rib_head *rh); void nhops_destroy_rib(struct rib_head *rh); +void vnet_nhops_init_neigh(void); +void vnet_nhop_destroy_neigh(void); void nhop_ref_object(struct 
nhop_object *nh); int nhop_try_ref_object(struct nhop_object *nh); void nhop_ref_any(struct nhop_object *nh); Index: sys/net/vnet.h =================================================================== --- sys/net/vnet.h +++ sys/net/vnet.h @@ -240,6 +240,8 @@ extern struct vnet *vnet0; #define IS_DEFAULT_VNET(arg) ((arg) == vnet0) +#define VNET_IS_DYING(_vnet) \ + ((_vnet)->vnet_shutdown && (_vnet)->vnet_state < SI_SUB_VNET_DONE) #define CRED_TO_VNET(cr) (cr)->cr_prison->pr_vnet #define TD_TO_VNET(td) CRED_TO_VNET((td)->td_ucred) Index: sys/netinet/if_ether.c =================================================================== --- sys/netinet/if_ether.c +++ sys/netinet/if_ether.c @@ -1001,6 +1001,7 @@ if (la_tmp == NULL) { arp_mark_lle_reachable(la); LLE_WUNLOCK(la); + nhops_update_neigh(ifp, AF_INET, la); } else { /* Free newly-create entry and handle packet */ lltable_free_entry(LLTABLE(ifp), la); @@ -1239,8 +1240,11 @@ lladdr_off) == 0) return; + nhops_update_neigh(ifp, AF_INET, la); + /* Clear fast path feedback request if set */ llentry_mark_used(la); + nhops_stop_feedback(ifp, AF_INET, la); } arp_mark_lle_reachable(la); Index: sys/netinet/ip_fastfwd.c =================================================================== --- sys/netinet/ip_fastfwd.c +++ sys/netinet/ip_fastfwd.c @@ -433,6 +433,7 @@ ro.ro_flags |= RT_HAS_GW; } else gw = (const struct sockaddr *)dst; + route_set_prepend_nh(&ro, nh); /* * Handle redirect case. Index: sys/netinet/ip_output.c =================================================================== --- sys/netinet/ip_output.c +++ sys/netinet/ip_output.c @@ -301,6 +301,8 @@ ro->ro_flags |= (nh_flags & NHF_REJECT) ? RT_REJECT : 0; ro->ro_flags |= (nh_flags & NHF_BLACKHOLE) ? RT_BLACKHOLE : 0; ro->ro_flags |= (nh_flags & NHF_GATEWAY) ? 
RT_HAS_GW : 0; + + route_set_prepend_nh(ro, nh); } /* Index: sys/netinet6/nd6.c =================================================================== --- sys/netinet6/nd6.c +++ sys/netinet6/nd6.c @@ -698,10 +698,10 @@ delay = (long)ND_IFINFO(ifp)->retrans * hz / 1000; break; case ND6_LLINFO_REACHABLE: - if (!ND6_LLINFO_PERMANENT(lle)) { - ifp = lle->lle_tbl->llt_ifp; + ifp = lle->lle_tbl->llt_ifp; + if (!ND6_LLINFO_PERMANENT(lle)) delay = (long)ND_IFINFO(ifp)->reachable * hz; - } + nhops_stop_feedback(ifp, AF_INET6, lle); break; case ND6_LLINFO_STALE: @@ -1420,6 +1420,7 @@ /* Update data */ lltable_set_entry_addr(ifp, lle, buf, sz, off); + nhops_update_neigh(ifp, AF_INET6, lle); struct llentry *child_lle; CK_SLIST_FOREACH(child_lle, &lle->lle_children, lle_child_next) { @@ -1429,6 +1430,7 @@ if (lltable_calc_llheader(ifp, fam, lladdr, buf, &sz, &off) == 0) { /* success */ lltable_set_entry_addr(ifp, child_lle, buf, sz, off); + nhops_update_neigh(ifp, AF_INET6, child_lle); child_lle->ln_state = ND6_LLINFO_REACHABLE; } LLE_WUNLOCK(child_lle); @@ -2052,6 +2054,7 @@ if (ln_tmp == NULL) { /* No existing lle, mark as new entry (6,7) */ is_newentry = 1; + nhops_update_neigh(ifp, AF_INET6, ln); if (lladdr != NULL) { /* (7) */ nd6_llinfo_setstate(ln, ND6_LLINFO_STALE); EVENTHANDLER_INVOKE(lle_event, ln, Index: usr.bin/netstat/nhops.c =================================================================== --- usr.bin/netstat/nhops.c +++ usr.bin/netstat/nhops.c @@ -312,8 +312,9 @@ xo_emit("{t:refcount/%*lu} ", wid_refcnt, nh->nh_refcount); if (Wflag && nh->prepend_len) { - char *prepend_hex = "AABBCCDDEE"; - xo_emit(" {:nhop-prepend/%*s}", wid_prepend, prepend_hex); + for (int i = 0; i < nh->prepend_len; i++) + snprintf(&buffer[i * 2], 3, "%02X", nh->nh_prepend[i]); + xo_emit(" {:nhop-prepend/%*s}", wid_prepend, buffer); } xo_emit("\n");