Index: sys/conf/files
===================================================================
--- sys/conf/files
+++ sys/conf/files
@@ -4174,6 +4174,7 @@
 net/route/nhop.c	standard
 net/route/nhop_ctl.c	standard
 net/route/nhop_utils.c	standard
+net/route/route_algo.c	optional route_algo
 net/route/route_ctl.c	standard
 net/route/route_ddb.c	optional ddb
 net/route/route_helpers.c	standard
@@ -4324,6 +4325,7 @@
 netinet/in_kdtrace.c	optional inet | inet6
 netinet/ip_carp.c	optional inet carp | inet6 carp
 netinet/in_fib.c	optional inet
+netinet/in_fib_algo.c	optional inet route_algo
 netinet/in_gif.c	optional gif inet | netgraph_gif inet
 netinet/ip_gre.c	optional gre inet
 netinet/ip_id.c	optional inet
@@ -4400,6 +4402,7 @@
 netinet6/in6.c	optional inet6
 netinet6/in6_cksum.c	optional inet6
 netinet6/in6_fib.c	optional inet6
+netinet6/in6_fib_algo.c	optional inet6 route_algo
 netinet6/in6_gif.c	optional gif inet6 | netgraph_gif inet6
 netinet6/in6_ifattach.c	optional inet6
 netinet6/in6_jail.c	optional inet6
Index: sys/conf/options
===================================================================
--- sys/conf/options
+++ sys/conf/options
@@ -454,6 +454,7 @@
 PF_DEFAULT_TO_DROP	opt_pf.h
 RADIX_MPATH	opt_mpath.h
 ROUTE_MPATH	opt_route.h
+ROUTE_ALGO	opt_route.h
 ROUTETABLES	opt_route.h
 RSS	opt_rss.h
 SLIP_IFF_OPTS	opt_slip.h
Index: sys/contrib/dpdk_rte_lpm/dpdk_lpm.c
===================================================================
--- /dev/null
+++ sys/contrib/dpdk_rte_lpm/dpdk_lpm.c
@@ -0,0 +1,480 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2020 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#define RTDEBUG + +#include "rte_shim.h" +#include "rte_lpm.h" + +#define LPM_MIN_TBL8 8 /* 2 pages of memory */ +#define LPM_MAX_TBL8 65536 * 16 /* 256M */ + +struct fib_algo_calldata { + void *lookup; + void *arg; +}; + +struct dpdk_lpm_data { + struct rte_lpm *lpm; + uint32_t number_tbl8s; + uint64_t routes_added; + uint64_t routes_failed; + uint32_t fibnum; + uint8_t hit_tables; + uint8_t hit_records; + struct fib_algo_calldata fa; + struct fib_data *fd; +}; + +/* + * Main datapath routing + */ +static struct nhop_object * +lookup_ptr(void *algo_data, const struct flm_lookup_key key, uint32_t scopeid) +{ + struct rte_lpm *lpm; + const struct rte_lpm_external *rte_ext; + uint32_t nhidx = 0; + int ret; + + lpm = (struct rte_lpm *)algo_data; + rte_ext = (const struct rte_lpm_external *)lpm; + + ret = rte_lpm_lookup(lpm, key.addr4.s_addr, &nhidx); + if (ret == 0) { + /* Success! */ + return (rte_ext->nh_idx[nhidx]); + } else { + /* Not found. Check default route */ + return (rte_ext->nh_idx[rte_ext->default_idx]); + } + + return (NULL); +} + +static uint8_t +rte_get_pref(const struct rib_rtable_info *rinfo) +{ + + if (rinfo->num_prefixes < 10) + return (1); + else if (rinfo->num_prefixes < 1000) + return (rinfo->num_prefixes / 10); + else if (rinfo->num_prefixes < 500000) + return (100 + rinfo->num_prefixes / 3334); + else + return (250); +} + +static int +contigmask(const uint8_t *p, int len) +{ + int i, n; + + for (i = 0; i < len ; i++) + if ( (p[i/8] & (1 << (7 - (i%8)))) == 0) /* first bit unset */ + break; + for (n= i + 1; n < len; n++) + if ( (p[n/8] & (1 << (7 - (n % 8)))) != 0) + return (-1); /* mask not contiguous */ + return (i); +} + +static uint8_t +rt_get_plen(const struct rtentry *rt) +{ + const struct sockaddr *sa; + int family; + int plen; + + sa = rt_key_const(rt); + family = sa->sa_family; + sa = rt_mask_const(rt); + switch (family) { + case AF_INET: + if (sa != NULL) { + const struct in_addr *addr4; + addr4 = &((const struct sockaddr_in *)sa)->sin_addr; + plen = contigmask((const uint8_t *)addr4, 32); + if (plen == -1) + plen = 0; + } else + plen = 32; + break; + case AF_INET6: + if (sa != NULL) { + const struct in6_addr *addr6; + addr6 = &((const struct sockaddr_in6 *)sa)->sin6_addr; + plen = contigmask((const uint8_t *)addr6, 128); + if (plen == -1) + plen = 0; + } else + plen = 128; + break; + default: + plen = 0; + } + + return (plen); +} + +static enum flm_op_result +handle_default_change(struct dpdk_lpm_data *dd, struct rib_cmd_info *rc) +{ + struct rte_lpm_external *rte_ext; + rte_ext = (struct rte_lpm_external *)dd->lpm; + uint32_t old_nhidx = rte_ext->default_idx; + + if (rc->rc_cmd != RTM_DELETE) { + /* Reference new */ + uint32_t nhidx = fib_get_nhop_idx(dd->fd, rc->rc_nh_new); + + if (nhidx == 0) + return (FLM_REBUILD); + rte_ext->default_idx = nhidx; + } else { + /* No default route */ + rte_ext->default_idx = 0; + } + + if (old_nhidx != 0) + fib_free_nhop_idx(dd->fd, old_nhidx); + + return (FLM_SUCCESS); +} + +static void +get_parent_rule(struct dpdk_lpm_data *dd, struct in_addr addr, uint8_t *plen, uint32_t *nhop_idx) +{ + struct route_nhop_data rnd; + struct rtentry *rt; + + rt = fib4_lookup_rt(dd->fibnum, addr, 0, NHR_UNLOCKED, &rnd); + if ((rt != NULL) && (*plen = rt_get_plen(rt)) > 0) { + *nhop_idx = 
fib_get_nhop_idx(dd->fd, rnd.rnd_nhop); + } else { + *nhop_idx = 0; + *plen = 0; + } +} + +static enum flm_op_result +handle_gu_change(struct dpdk_lpm_data *dd, const struct rib_cmd_info *rc, + const struct in_addr addr, int plen) +{ + uint32_t nhidx = 0; + int ret; + char abuf[INET_ADDRSTRLEN]; + inet_ntop(AF_INET, &addr, abuf, sizeof(abuf)); + + /* So we get sin, plen and nhidx */ + if (rc->rc_cmd != RTM_DELETE) { + /* + * Addition or change. Save nhop in the internal table + * and get index. + */ + nhidx = fib_get_nhop_idx(dd->fd, rc->rc_nh_new); + if (nhidx == 0) { + DPRINTF("nhop limit reached, need rebuild"); + return (FLM_REBUILD); + } + + ret = rte_lpm_add(dd->lpm, addr.s_addr, plen, nhidx); + DPRINTF("DPDK GU: %s %s/%d nhop %u = %d", (rc->rc_cmd == RTM_ADD) ? "ADD" : "UPDATE", + abuf, plen, nhidx, ret); + } else { + /* + * Need to lookup parent. Assume deletion happened already + */ + const struct sockaddr_in *dst; + dst = (const struct sockaddr_in *)rt_key_const(rc->rc_rt); + + uint8_t parent_plen; + uint32_t parent_nhop_idx; + get_parent_rule(dd, dst->sin_addr, &parent_plen, &parent_nhop_idx); + + ret = rte_lpm_delete(dd->lpm, addr.s_addr, plen, parent_plen, parent_nhop_idx); + DPRINTF("DPDK: %s %s/%d nhop %u = %d", "DEL", abuf, plen, nhidx, ret); + } + + if (rc->rc_nh_old != NULL) + fib_free_nhop(dd->fd, rc->rc_nh_old); + + if (ret != 0) { + DPRINTF("error: %d", ret); + if (ret == -EOVERFLOW) + return (FLM_REBUILD); + return (FLM_ERROR); + } + return (FLM_SUCCESS); +} + +static enum flm_op_result +handle_rtable_change_cb(struct rib_head *rnh, struct rib_cmd_info *rc, + void *_data) +{ + const struct sockaddr_in *sin; + int plen = rt_get_plen(rc->rc_rt); + enum flm_op_result ret; + struct dpdk_lpm_data *dd; + + dd = (struct dpdk_lpm_data *)_data; + sin = (const struct sockaddr_in *)rt_key_const(rc->rc_rt); + + if (plen != 0) + ret = handle_gu_change(dd, rc, sin->sin_addr, plen); + else + ret = handle_default_change(dd, rc); + + if (ret != 0) + DPRINTF("error handling route"); + return (ret); +} + +static void +destroy_table(void *_data) +{ + struct dpdk_lpm_data *dd = (struct dpdk_lpm_data *)_data; + + DPRINTF("destroy dd %p", dd); + if (dd->lpm != NULL) + rte_lpm_free(dd->lpm); + free(dd, M_TEMP); +} + +static enum flm_op_result +add_route_cb(struct rtentry *rt, void *_data) +{ + struct dpdk_lpm_data *dd = (struct dpdk_lpm_data *)_data; + const struct sockaddr_in *sin; + int plen = rt_get_plen(rt); + int ret; + + sin = (const struct sockaddr_in *)rt_key_const(rt); + + char abuf[INET_ADDRSTRLEN]; + char mbuf[INET_ADDRSTRLEN]; + inet_ntop(AF_INET, &sin->sin_addr, abuf, sizeof(abuf)); + + const struct sockaddr_in *mask; + mask = (const struct sockaddr_in *)rt_mask_const(rt); + if (mask != NULL) { + inet_ntop(AF_INET, &mask->sin_addr, mbuf, sizeof(mbuf)); + } else + mbuf[0] = '\0'; + + DPRINTF("Operating on %s/%d [%s]", abuf, plen, mbuf); + + if (plen == 0) { + struct rib_cmd_info rc; + + bzero(&rc, sizeof(rc)); + rc.rc_cmd = RTM_ADD; + rc.rc_nh_new = rt->rt_nhop; + DPRINTF("Adding default route"); + return (handle_default_change(dd, &rc)); + } + + uint32_t nhidx = fib_get_nhop_idx(dd->fd, rt->rt_nhop); + if (nhidx == 0) { + DPRINTF("unable to get nhop index"); + return (FLM_REBUILD); + } + ret = rte_lpm_add(dd->lpm, sin->sin_addr.s_addr, plen, nhidx); + DPRINTF("ADD %p %s/%d nh %u = %d", dd->lpm, abuf, plen, nhidx, ret); + + if (ret != 0) { + DPRINTF("rte_lpm_add() returned %d", ret); + if (ret == -ENOSPC) { + dd->hit_tables = 1; + return (FLM_REBUILD); + } + dd->routes_failed++; 
+ } else + dd->routes_added++; + + return (FLM_SUCCESS); +} + +static enum flm_op_result +check_dump_success(void *_data, struct fib_dp *dp) +{ + struct dpdk_lpm_data *dd; + + dd = (struct dpdk_lpm_data *)_data; + + DPRINTF("scan completed. added: %zu failed: %zu", + dd->routes_added, dd->routes_failed); + if (dd->hit_tables || dd->routes_failed > 0) + return (FLM_REBUILD); + + DPRINTF("DPDK lookup engine synced with IPv4 RIB id %u, %zu routes", + dd->fibnum, dd->routes_added); + + dp->f = lookup_ptr; + dp->arg = dd->lpm; + + return (FLM_SUCCESS); +} + +static void +estimate_scale(const struct dpdk_lpm_data *dd_src, struct dpdk_lpm_data *dd) +{ + + /* XXX: update at 75% capacity */ + if (dd_src->hit_tables) + dd->number_tbl8s = dd_src->number_tbl8s * 2; + else + dd->number_tbl8s = dd_src->number_tbl8s; + + /* TODO: look into the appropriate RIB to adjust */ +} + +static struct dpdk_lpm_data * +build_table(struct dpdk_lpm_data *dd_prev) +{ + struct dpdk_lpm_data *dd; + struct rte_lpm *lpm; + + dd = malloc(sizeof(struct dpdk_lpm_data), M_TEMP, M_NOWAIT | M_ZERO); + if (dd == NULL) { + DPRINTF("Unable to allocate base datastructure"); + return (NULL); + } + dd->fibnum = dd_prev->fibnum; + dd->fd = dd_prev->fd; + + estimate_scale(dd_prev, dd); + + struct rte_lpm_config cfg = {.number_tbl8s = dd->number_tbl8s}; + lpm = rte_lpm_create("test", 0, &cfg); + if (lpm == NULL) { + DPRINTF("unable to create lpm"); + free(dd, M_TEMP); + return (NULL); + } + dd->lpm = lpm; + struct rte_lpm_external *ext = (struct rte_lpm_external *)lpm; + ext->nh_idx = fib_get_nhop_array(dd->fd); + + DPRINTF("allocated %u tbl8s", dd->number_tbl8s); + + return (dd); +} + +static enum flm_op_result +init_table(uint32_t fibnum, struct fib_data *fd, void *_old_data, void **data) +{ + struct dpdk_lpm_data *dd, dd_base; + + if (_old_data == NULL) { + bzero(&dd_base, sizeof(struct dpdk_lpm_data)); + dd_base.fibnum = fibnum; + dd_base.fd = fd; + /* TODO: get rib statistics */ + dd_base.number_tbl8s = LPM_MIN_TBL8; + dd = &dd_base; + } else { + DPRINTF("Starting with old data"); + dd = (struct dpdk_lpm_data *)_old_data; + } + + /* Guaranteed to be in epoch */ + dd = build_table(dd); + if (dd == NULL) { + DPRINTF("table creation failed"); + return (FLM_REBUILD); + } + + *data = dd; + return (FLM_SUCCESS); +} + +static struct fib_lookup_module dpdk_lpm4 = { + .flm_name = "dpdk_lpm4", + .flm_family = AF_INET, + .flm_init_cb = init_table, + .flm_destroy_cb = destroy_table, + .flm_dump_rib_item_cb = add_route_cb, + .flm_dump_end_cb = check_dump_success, + .flm_change_rib_item_cb = handle_rtable_change_cb, + .flm_get_pref = rte_get_pref, +}; + +static int +lpm4_modevent(module_t mod, int type, void *unused) +{ + int error = 0; + + switch (type) { + case MOD_LOAD: + fib_module_register(&dpdk_lpm4); + break; + case MOD_UNLOAD: + error = fib_module_unregister(&dpdk_lpm4); + break; + default: + error = EOPNOTSUPP; + break; + } + return (error); +} + +static moduledata_t lpm4mod = { + "dpdk_lpm4", + lpm4_modevent, + 0 +}; + +DECLARE_MODULE(lpm4mod, lpm4mod, SI_SUB_PSEUDO, SI_ORDER_ANY); +MODULE_VERSION(lpm4mod, 1); Index: sys/contrib/dpdk_rte_lpm/dpdk_lpm6.h =================================================================== --- sys/contrib/dpdk_rte_lpm/dpdk_lpm6.h +++ sys/contrib/dpdk_rte_lpm/dpdk_lpm6.h @@ -1,6 +1,7 @@ /*- - * Copyright (c) 2015 - * Alexander V. Chernikov + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2020 Alexander V.
Chernikov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -10,14 +11,11 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) @@ -29,17 +27,31 @@ * $FreeBSD$ */ -#ifndef _NETINET6_IN6_FIB_H_ -#define _NETINET6_IN6_FIB_H_ +/* + * Contains various definitions shared between the parts of a routing subsystem. + * + */ + +#ifndef _NETINET6_DPDK_LPM6_H_ +#define _NETINET6_DPDK_LPM6_H_ + +/** LPM structure. */ +struct rte_lpm6; + +/** LPM configuration structure. */ +struct rte_lpm6_config { + uint32_t max_rules; /**< Max number of rules. */ + uint32_t number_tbl8s; /**< Number of tbl8s to allocate. */ + int flags; /**< This field is currently unused. */ +}; + +struct rte_lpm6 * +rte_lpm6_create(const char *name, int socket_id, + const struct rte_lpm6_config *config); +void +rte_lpm6_free(struct rte_lpm6 *lpm); +int +rte_lpm6_add(struct rte_lpm6 *lpm, const uint8_t *ip, uint8_t depth, + uint32_t next_hop, int is_new_rule); -struct nhop_object *fib6_lookup(uint32_t fibnum, - const struct in6_addr *dst6, uint32_t scopeid, uint32_t flags, - uint32_t flowid); -int fib6_check_urpf(uint32_t fibnum, const struct in6_addr *dst6, - uint32_t scopeid, uint32_t flags, const struct ifnet *src_if); -struct nhop_object *fib6_lookup_debugnet(uint32_t fibnum, - const struct in6_addr *dst6, uint32_t scopeid, uint32_t flags); -uint32_t fib6_calc_software_hash(const struct in6_addr *src, - const struct in6_addr *dst, unsigned short src_port, unsigned short dst_port, - char proto, uint32_t *phashtype); #endif Index: sys/contrib/dpdk_rte_lpm/dpdk_lpm6.c =================================================================== --- /dev/null +++ sys/contrib/dpdk_rte_lpm/dpdk_lpm6.c @@ -0,0 +1,560 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2020 Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#define RTDEBUG + +#include "rte_lpm6.h" + +#define LPM6_MIN_TBL8 8 /* 2 pages of memory */ +#define LPM6_MAX_TBL8 65536 * 16 /* 256M */ + +struct fib_algo_calldata { + void *lookup; + void *arg; +}; + +struct dpdk_lpm6_data { + struct rte_lpm6 *lpm6; + uint32_t number_tbl8s; + uint64_t routes_added; + uint64_t routes_failed; + uint32_t fibnum; + uint8_t hit_tables; + struct fib_data *fd; +}; + +static struct nhop_object * +lookup_ptr_ll(const struct rte_lpm6 *lpm6, const struct in6_addr *dst6, + uint32_t scopeid) +{ + const struct rte_lpm6_external *rte_ext; + struct nhop_object *nh = NULL; + struct sockaddr_in6 sin6; + struct rib_head *rh; + struct radix_node *rn; + RIB_RLOCK_TRACKER; + + memset(&sin6, 0, sizeof(sin6)); + sin6.sin6_len = sizeof(struct sockaddr_in6); + sin6.sin6_addr = *dst6; + /* Assume scopeid is valid and embed it directly */ + sin6.sin6_addr.s6_addr16[1] = htons(scopeid & 0xffff); + + rte_ext = (const struct rte_lpm6_external *)lpm6; + rh = rt_tables_get_rnh(rte_ext->fibnum, AF_INET6); + if (rh == NULL) + return (NULL); + + RIB_RLOCK(rh); + rn = rh->rnh_matchaddr((void *)&sin6, &rh->head); + if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) + nh = RNTORT(rn)->rt_nhop; + RIB_RUNLOCK(rh); + //TODO: check LL nhops refcounting + + return (nh); +} + +/* + * Main datapath routing + */ +static struct nhop_object * +lookup_ptr(void *algo_data, const struct flm_lookup_key key, uint32_t scopeid) +{ + const struct rte_lpm6 *lpm6; + const struct rte_lpm6_external *rte_ext; + const struct in6_addr *addr6; + uint32_t nhidx = 0; + int ret; + + lpm6 = (const struct rte_lpm6 *)algo_data; + addr6 = key.addr6; + rte_ext = (const struct rte_lpm6_external *)lpm6; + + if (!IN6_IS_SCOPE_LINKLOCAL(addr6)) { + ret = rte_lpm6_lookup(lpm6, (const uint8_t *)addr6, &nhidx); + if (ret == 0) { + /* Success! */ + return (rte_ext->nh_idx[nhidx]); + } else { + /* Not found. 
Check default route */ + if (rte_ext->default_idx > 0) + return (rte_ext->nh_idx[rte_ext->default_idx]); + else + return (NULL); + } + } else { + /* LL */ + return (lookup_ptr_ll(lpm6, addr6, scopeid)); + } +} + +static uint8_t +rte6_get_pref(const struct rib_rtable_info *rinfo) +{ + + if (rinfo->num_prefixes < 10) + return (1); + else if (rinfo->num_prefixes < 1000) + return (rinfo->num_prefixes / 10); + else if (rinfo->num_prefixes < 500000) + return (100 + rinfo->num_prefixes / 3334); + else + return (250); +} + +static int +contigmask(const uint8_t *p, int len) +{ + int i, n; + + for (i = 0; i < len ; i++) + if ( (p[i/8] & (1 << (7 - (i%8)))) == 0) /* first bit unset */ + break; + for (n= i + 1; n < len; n++) + if ( (p[n/8] & (1 << (7 - (n % 8)))) != 0) + return (-1); /* mask not contiguous */ + return (i); +} + +static uint8_t +rt_get_plen(const struct rtentry *rt) +{ + const struct sockaddr *sa; + int family; + int plen; + + sa = rt_key_const(rt); + family = sa->sa_family; + sa = rt_mask_const(rt); + switch (family) { + case AF_INET: + if (sa != NULL) { + const struct in_addr *addr4; + addr4 = &((const struct sockaddr_in *)sa)->sin_addr; + plen = contigmask((const uint8_t *)addr4, 32); + if (plen == -1) + plen = 0; + } else + plen = 32; + break; + case AF_INET6: + if (sa != NULL) { + const struct in6_addr *addr6; + addr6 = &((const struct sockaddr_in6 *)sa)->sin6_addr; + plen = contigmask((const uint8_t *)addr6, 128); + if (plen == -1) + plen = 0; + } else + plen = 128; + break; + default: + plen = 0; + } + + return (plen); +} + +static enum flm_op_result +handle_default_change(struct dpdk_lpm6_data *dd, struct rib_cmd_info *rc) +{ + struct rte_lpm6_external *rte_ext; + rte_ext = (struct rte_lpm6_external *)dd->lpm6; + uint32_t old_nhidx = rte_ext->default_idx; + + if (rc->rc_cmd != RTM_DELETE) { + /* Reference new */ + uint32_t nhidx = fib_get_nhop_idx(dd->fd, rc->rc_nh_new); + + if (nhidx == 0) + return (FLM_REBUILD); + rte_ext->default_idx = nhidx; + } else { + /* No default route */ + rte_ext->default_idx = 0; + } + + if (old_nhidx != 0) + fib_free_nhop_idx(dd->fd, old_nhidx); + + return (FLM_SUCCESS); +} + +static enum flm_op_result +handle_ll_change(struct dpdk_lpm6_data *dd, struct rib_cmd_info *rc, + const struct sockaddr_in6 *sin6, int plen) +{ + + return (FLM_SUCCESS); +} + +static struct rte_lpm6_rule * +pack_parent_rule(struct dpdk_lpm6_data *dd, const struct in6_addr *addr6, + char *buffer) +{ + struct rte_lpm6_rule *lsp_rule = NULL; + struct route_nhop_data rnd; + struct rtentry *rt; + int plen; + + rt = fib6_lookup_rt(dd->fibnum, addr6, 0, NHR_UNLOCKED, &rnd); + /* plen = 0 means default route and it's out of scope */ + if ((rt != NULL) && (plen = rt_get_plen(rt)) > 0) { + uint32_t nhidx = fib_get_nhop_idx(dd->fd, rnd.rnd_nhop); + if (nhidx == 0) { + /* + * shouldn't happen as we already have parent route. + * It will trigger rebuild automatically. 
+ */ + return (NULL); + } + const struct sockaddr_in6 *psin6; + const uint8_t *uaddr6; + psin6 = (const struct sockaddr_in6 *)rt_key_const(rt); + uaddr6 = (const uint8_t *)&psin6->sin6_addr; + lsp_rule = fill_rule6(buffer, uaddr6, plen, nhidx); + } + + return (lsp_rule); +} + +static enum flm_op_result +handle_gu_change(struct dpdk_lpm6_data *dd, const struct rib_cmd_info *rc, + const struct in6_addr *addr6, int plen) +{ + uint32_t nhidx = 0; + int ret; + char abuf[INET6_ADDRSTRLEN]; + inet_ntop(AF_INET6, addr6, abuf, sizeof(abuf)); + + /* So we get sin6, plen and nhidx */ + if (rc->rc_cmd != RTM_DELETE) { + /* + * Addition or change. Save nhop in the internal table + * and get index. + */ + nhidx = fib_get_nhop_idx(dd->fd, rc->rc_nh_new); + if (nhidx == 0) { + DPRINTF("FCK: nhop limit reached, need rebuild"); + return (FLM_REBUILD); + } + + ret = rte_lpm6_add(dd->lpm6, (const uint8_t *)addr6, + plen, nhidx, (rc->rc_cmd == RTM_ADD) ? 1 : 0); + DPRINTF("DPDK GU: %s %s/%d nhop %u = %d", (rc->rc_cmd == RTM_ADD) ? "ADD" : "UPDATE", + abuf, plen, nhidx, ret); + } else { + /* + * Need to lookup parent. Assume deletion happened already + */ + char buffer[RTE_LPM6_RULE_SIZE]; + struct rte_lpm6_rule *lsp_rule = NULL; + const struct sockaddr_in6 *dst6; + dst6 = (const struct sockaddr_in6 *)rt_key_const(rc->rc_rt); + + lsp_rule = pack_parent_rule(dd, &dst6->sin6_addr, buffer); + + ret = rte_lpm6_delete(dd->lpm6, (const uint8_t *)addr6, plen, lsp_rule); + DPRINTF("DPDK GU: %s %s/%d nhop %u = %d", "DEL", abuf, plen, nhidx, ret); + } + + if (rc->rc_nh_old != NULL) + fib_free_nhop(dd->fd, rc->rc_nh_old); + + if (ret != 0) { + DPRINTF("error: %d", ret); + if (ret == -EOVERFLOW) + return (FLM_REBUILD); + return (FLM_ERROR); + } + return (FLM_SUCCESS); +} + +static enum flm_op_result +handle_any_change(struct dpdk_lpm6_data *dd, struct rib_cmd_info *rc) +{ + const struct sockaddr_in6 *sin6; + int plen = rt_get_plen(rc->rc_rt); + enum flm_op_result ret; + + sin6 = (const struct sockaddr_in6 *)rt_key_const(rc->rc_rt); + + if (IN6_IS_SCOPE_LINKLOCAL(&sin6->sin6_addr)) + ret = handle_ll_change(dd, rc, sin6, plen); + else if (plen == 0) + ret = handle_default_change(dd, rc); + else + ret = handle_gu_change(dd, rc, &sin6->sin6_addr, plen); + + if (ret != 0) + DPRINTF("error handling route"); + return (ret); +} + +static enum flm_op_result +handle_rtable_change_cb(struct rib_head *rnh, struct rib_cmd_info *rc, + void *_data) +{ + struct dpdk_lpm6_data *dd; + + dd = (struct dpdk_lpm6_data *)_data; + + return (handle_any_change(dd, rc)); +} + +static void +destroy_dd(struct dpdk_lpm6_data *dd) +{ + + DPRINTF("destroy dd %p", dd); + if (dd->lpm6 != NULL) + rte_lpm6_free(dd->lpm6); + free(dd, M_TEMP); +} + +static void +destroy_table(void *_data) +{ + + destroy_dd((struct dpdk_lpm6_data *)_data); +} + +static enum flm_op_result +add_route_cb(struct rtentry *rt, void *_data) +{ + struct dpdk_lpm6_data *dd = (struct dpdk_lpm6_data *)_data; + const struct sockaddr_in6 *sin6; + int plen = rt_get_plen(rt); + int ret; + + sin6 = (const struct sockaddr_in6 *)rt_key_const(rt); + + char abuf[INET6_ADDRSTRLEN]; + char mbuf[INET6_ADDRSTRLEN]; + inet_ntop(AF_INET6, &sin6->sin6_addr, abuf, sizeof(abuf)); + + const struct sockaddr_in6 *mask6; + mask6 = (const struct sockaddr_in6 *)rt_mask_const(rt); + if (mask6 != NULL) { + inet_ntop(AF_INET6, &mask6->sin6_addr, mbuf, sizeof(abuf)); + } else + mbuf[0] = '\0'; + + DPRINTF("Operating on %s/%d [%s]", abuf, plen, mbuf); + + if (plen == 0) { + struct rib_cmd_info rc; + + bzero(&rc, 
sizeof(rc)); + rc.rc_cmd = RTM_ADD; + rc.rc_nh_new = rt->rt_nhop; + DPRINTF("Adding default route"); + return (handle_default_change(dd, &rc)); + } + + uint32_t nhidx = fib_get_nhop_idx(dd->fd, rt->rt_nhop); + if (nhidx == 0) { + DPRINTF("unable to get nhop index"); + return (FLM_REBUILD); + } + ret = rte_lpm6_add(dd->lpm6, (const uint8_t *)&sin6->sin6_addr, plen, + nhidx, 1); + DPRINTF("ADD %p %s/%d nh %u = %d", dd->lpm6, abuf, plen, nhidx, ret); + + if (ret != 0) { + DPRINTF("rte_lpm6_add() returned %d", ret); + if (ret == -ENOSPC) { + dd->hit_tables = 1; + return (FLM_REBUILD); + } + dd->routes_failed++; + } else + dd->routes_added++; + + return (FLM_SUCCESS); +} + +static enum flm_op_result +check_dump_success(void *_data, struct fib_dp *dp) +{ + struct dpdk_lpm6_data *dd; + + dd = (struct dpdk_lpm6_data *)_data; + + DPRINTF("scan completed. added: %zu failed: %zu", + dd->routes_added, dd->routes_failed); + if (dd->hit_tables || dd->routes_failed > 0) + return (FLM_REBUILD); + + DPRINTF("DPDK lookup engine synced with IPv6 RIB id %u, %zu routes", + dd->fibnum, dd->routes_added); + + dp->f = lookup_ptr; + dp->arg = dd->lpm6; + + return (FLM_SUCCESS); +} + +static void +estimate_scale(const struct dpdk_lpm6_data *dd_src, struct dpdk_lpm6_data *dd) +{ + + /* XXX: update at 75% capacity */ + if (dd_src->hit_tables) + dd->number_tbl8s = dd_src->number_tbl8s * 2; + else + dd->number_tbl8s = dd_src->number_tbl8s; + + /* TODO: look into the appropriate RIB to adjust */ +} + +static struct dpdk_lpm6_data * +build_table(struct dpdk_lpm6_data *dd_prev) +{ + struct dpdk_lpm6_data *dd; + struct rte_lpm6 *lpm6; + + dd = malloc(sizeof(struct dpdk_lpm6_data), M_TEMP, M_NOWAIT | M_ZERO); + if (dd == NULL) { + DPRINTF("Unable to allocate base datastructure"); + return (NULL); + } + dd->fibnum = dd_prev->fibnum; + dd->fd = dd_prev->fd; + + estimate_scale(dd_prev, dd); + + struct rte_lpm6_config cfg = {.number_tbl8s = dd->number_tbl8s}; + lpm6 = rte_lpm6_create("test", 0, &cfg); + if (lpm6 == NULL) { + DPRINTF("unable to create lpm6"); + free(dd, M_TEMP); + return (NULL); + } + dd->lpm6 = lpm6; + struct rte_lpm6_external *ext = (struct rte_lpm6_external *)lpm6; + ext->nh_idx = fib_get_nhop_array(dd->fd); + + DPRINTF("allocated %u tbl8s", dd->number_tbl8s); + + return (dd); +} + +static enum flm_op_result +init_table(uint32_t fibnum, struct fib_data *fd, void *_old_data, void **data) +{ + struct dpdk_lpm6_data *dd, dd_base; + + if (_old_data == NULL) { + bzero(&dd_base, sizeof(struct dpdk_lpm6_data)); + dd_base.fibnum = fibnum; + dd_base.fd = fd; + /* TODO: get rib statistics */ + dd_base.number_tbl8s = LPM6_MIN_TBL8; + dd = &dd_base; + } else { + DPRINTF("Starting with old data"); + dd = (struct dpdk_lpm6_data *)_old_data; + } + + /* Guaranteed to be in epoch */ + dd = build_table(dd); + if (dd == NULL) { + DPRINTF("table creation failed"); + return (FLM_REBUILD); + } + + *data = dd; + return (FLM_SUCCESS); +} + +static struct fib_lookup_module dpdk_lpm6 = { + .flm_name = "dpdk_lpm6", + .flm_family = AF_INET6, + .flm_init_cb = init_table, + .flm_destroy_cb = destroy_table, + .flm_dump_rib_item_cb = add_route_cb, + .flm_dump_end_cb = check_dump_success, + .flm_change_rib_item_cb = handle_rtable_change_cb, + .flm_get_pref = rte6_get_pref, +}; + +static int +lpm6_modevent(module_t mod, int type, void *unused) +{ + int error = 0; + + switch (type) { + case MOD_LOAD: + fib_module_register(&dpdk_lpm6); + break; + case MOD_UNLOAD: + error = fib_module_unregister(&dpdk_lpm6); + break; + default: + error = 
EOPNOTSUPP; + break; + } + return (error); +} + +static moduledata_t lpm6mod = { + "dpdk_lpm6", + lpm6_modevent, + 0 +}; + +DECLARE_MODULE(lpm6mod, lpm6mod, SI_SUB_PSEUDO, SI_ORDER_ANY); +MODULE_VERSION(lpm6mod, 1); Index: sys/contrib/dpdk_rte_lpm/rte_branch_prediction.h =================================================================== --- /dev/null +++ sys/contrib/dpdk_rte_lpm/rte_branch_prediction.h @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +/** + * @file + * Branch Prediction Helpers in RTE + */ + +#ifndef _RTE_BRANCH_PREDICTION_H_ +#define _RTE_BRANCH_PREDICTION_H_ + +/** + * Check if a branch is likely to be taken. + * + * This compiler builtin allows the developer to indicate if a branch is + * likely to be taken. Example: + * + * if (likely(x > 1)) + * do_stuff(); + * + */ +#ifndef likely +#define likely(x) __builtin_expect(!!(x), 1) +#endif /* likely */ + +/** + * Check if a branch is unlikely to be taken. + * + * This compiler builtin allows the developer to indicate if a branch is + * unlikely to be taken. Example: + * + * if (unlikely(x < 1)) + * do_stuff(); + * + */ +#ifndef unlikely +#define unlikely(x) __builtin_expect(!!(x), 0) +#endif /* unlikely */ + +#endif /* _RTE_BRANCH_PREDICTION_H_ */ Index: sys/contrib/dpdk_rte_lpm/rte_common.h =================================================================== --- /dev/null +++ sys/contrib/dpdk_rte_lpm/rte_common.h @@ -0,0 +1,838 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2019 Intel Corporation + */ + +#ifndef _RTE_COMMON_H_ +#define _RTE_COMMON_H_ + +/** + * @file + * + * Generic, commonly-used macro and inline function definitions + * for DPDK. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +//#include + +/* OS specific include */ +//#include + +#ifndef typeof +#define typeof __typeof__ +#endif + +#ifndef asm +#define asm __asm__ +#endif + +/** C extension macro for environments lacking C11 features. */ +#if !defined(__STDC_VERSION__) || __STDC_VERSION__ < 201112L +#define RTE_STD_C11 __extension__ +#else +#define RTE_STD_C11 +#endif + +/* + * RTE_TOOLCHAIN_GCC is defined if the target is built with GCC, + * while a host application (like pmdinfogen) may have another compiler. + * RTE_CC_IS_GNU is true if the file is compiled with GCC, + * no matter it is a target or host application. + */ +#define RTE_CC_IS_GNU 0 +#if defined __clang__ +#define RTE_CC_CLANG +#elif defined __INTEL_COMPILER +#define RTE_CC_ICC +#elif defined __GNUC__ +#define RTE_CC_GCC +#undef RTE_CC_IS_GNU +#define RTE_CC_IS_GNU 1 +#endif +#if RTE_CC_IS_GNU +#define GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + \ + __GNUC_PATCHLEVEL__) +#endif + +/** + * Force alignment + */ +#define __rte_aligned(a) __attribute__((__aligned__(a))) + +#ifdef RTE_ARCH_STRICT_ALIGN +typedef uint64_t unaligned_uint64_t __rte_aligned(1); +typedef uint32_t unaligned_uint32_t __rte_aligned(1); +typedef uint16_t unaligned_uint16_t __rte_aligned(1); +#else +typedef uint64_t unaligned_uint64_t; +typedef uint32_t unaligned_uint32_t; +typedef uint16_t unaligned_uint16_t; +#endif + +/** + * Force a structure to be packed + */ +#define __rte_packed __attribute__((__packed__)) + +/******* Macro to mark functions and fields scheduled for removal *****/ +#define __rte_deprecated __attribute__((__deprecated__)) + +/** + * Mark a function or variable to a weak reference. 
+ */ +#define __rte_weak __attribute__((__weak__)) + +/** + * Force symbol to be generated even if it appears to be unused. + */ +#define __rte_used __attribute__((used)) + +/*********** Macros to eliminate unused variable warnings ********/ + +/** + * short definition to mark a function parameter unused + */ +#define __rte_unused __attribute__((__unused__)) + +/** + * definition to mark a variable or function parameter as used so + * as to avoid a compiler warning + */ +#define RTE_SET_USED(x) (void)(x) + +/** + * Check format string and its arguments at compile-time. + * + * GCC on Windows assumes MS-specific format string by default, + * even if the underlying stdio implementation is ANSI-compliant, + * so this must be overridden. + */ +#if RTE_CC_IS_GNU +#define __rte_format_printf(format_index, first_arg) \ + __attribute__((format(gnu_printf, format_index, first_arg))) +#else +#define __rte_format_printf(format_index, first_arg) \ + __attribute__((format(printf, format_index, first_arg))) +#endif + +#define RTE_PRIORITY_LOG 101 +#define RTE_PRIORITY_BUS 110 +#define RTE_PRIORITY_CLASS 120 +#define RTE_PRIORITY_LAST 65535 + +#define RTE_PRIO(prio) \ + RTE_PRIORITY_ ## prio + +/** + * Run function before main() with high priority. + * + * @param func + * Constructor function. + * @param prio + * Priority number must be above 100. + * Lowest number is the first to run. + */ +#ifndef RTE_INIT_PRIO /* Allow to override from EAL */ +#define RTE_INIT_PRIO(func, prio) \ +static void __attribute__((constructor(RTE_PRIO(prio)), used)) func(void) +#endif + +/** + * Run function before main() with low priority. + * + * The constructor will be run after prioritized constructors. + * + * @param func + * Constructor function. + */ +#define RTE_INIT(func) \ + RTE_INIT_PRIO(func, LAST) + +/** + * Run after main() with low priority. + * + * @param func + * Destructor function name. + * @param prio + * Priority number must be above 100. + * Lowest number is the last to run. + */ +#ifndef RTE_FINI_PRIO /* Allow to override from EAL */ +#define RTE_FINI_PRIO(func, prio) \ +static void __attribute__((destructor(RTE_PRIO(prio)), used)) func(void) +#endif + +/** + * Run after main() with high priority. + * + * The destructor will be run *before* prioritized destructors. + * + * @param func + * Destructor function name. + */ +#define RTE_FINI(func) \ + RTE_FINI_PRIO(func, LAST) + +/** + * Hint never returning function + */ +#define __rte_noreturn __attribute__((noreturn)) + +/** + * Force a function to be inlined + */ +#define __rte_always_inline inline __attribute__((always_inline)) + +/** + * Force a function to be noinlined + */ +#define __rte_noinline __attribute__((noinline)) + +/** + * Hint function in the hot path + */ +#define __rte_hot __attribute__((hot)) + +/** + * Hint function in the cold path + */ +#define __rte_cold __attribute__((cold)) + +/*********** Macros for pointer arithmetic ********/ + +/** + * add a byte-value offset to a pointer + */ +#define RTE_PTR_ADD(ptr, x) ((void*)((uintptr_t)(ptr) + (x))) + +/** + * subtract a byte-value offset from a pointer + */ +#define RTE_PTR_SUB(ptr, x) ((void*)((uintptr_t)ptr - (x))) + +/** + * get the difference between two pointer values, i.e. how far apart + * in bytes are the locations they point two. It is assumed that + * ptr1 is greater than ptr2. + */ +#define RTE_PTR_DIFF(ptr1, ptr2) ((uintptr_t)(ptr1) - (uintptr_t)(ptr2)) + +/** + * Workaround to cast a const field of a structure to non-const type. 
+ */ +#define RTE_CAST_FIELD(var, field, type) \ + (*(type *)((uintptr_t)(var) + offsetof(typeof(*(var)), field))) + +/*********** Macros/static functions for doing alignment ********/ + + +/** + * Macro to align a pointer to a given power-of-two. The resultant + * pointer will be a pointer of the same type as the first parameter, and + * point to an address no higher than the first parameter. Second parameter + * must be a power-of-two value. + */ +#define RTE_PTR_ALIGN_FLOOR(ptr, align) \ + ((typeof(ptr))RTE_ALIGN_FLOOR((uintptr_t)ptr, align)) + +/** + * Macro to align a value to a given power-of-two. The resultant value + * will be of the same type as the first parameter, and will be no + * bigger than the first parameter. Second parameter must be a + * power-of-two value. + */ +#define RTE_ALIGN_FLOOR(val, align) \ + (typeof(val))((val) & (~((typeof(val))((align) - 1)))) + +/** + * Macro to align a pointer to a given power-of-two. The resultant + * pointer will be a pointer of the same type as the first parameter, and + * point to an address no lower than the first parameter. Second parameter + * must be a power-of-two value. + */ +#define RTE_PTR_ALIGN_CEIL(ptr, align) \ + RTE_PTR_ALIGN_FLOOR((typeof(ptr))RTE_PTR_ADD(ptr, (align) - 1), align) + +/** + * Macro to align a value to a given power-of-two. The resultant value + * will be of the same type as the first parameter, and will be no lower + * than the first parameter. Second parameter must be a power-of-two + * value. + */ +#define RTE_ALIGN_CEIL(val, align) \ + RTE_ALIGN_FLOOR(((val) + ((typeof(val)) (align) - 1)), align) + +/** + * Macro to align a pointer to a given power-of-two. The resultant + * pointer will be a pointer of the same type as the first parameter, and + * point to an address no lower than the first parameter. Second parameter + * must be a power-of-two value. + * This function is the same as RTE_PTR_ALIGN_CEIL + */ +#define RTE_PTR_ALIGN(ptr, align) RTE_PTR_ALIGN_CEIL(ptr, align) + +/** + * Macro to align a value to a given power-of-two. The resultant + * value will be of the same type as the first parameter, and + * will be no lower than the first parameter. Second parameter + * must be a power-of-two value. + * This function is the same as RTE_ALIGN_CEIL + */ +#define RTE_ALIGN(val, align) RTE_ALIGN_CEIL(val, align) + +/** + * Macro to align a value to the multiple of given value. The resultant + * value will be of the same type as the first parameter and will be no lower + * than the first parameter. + */ +#define RTE_ALIGN_MUL_CEIL(v, mul) \ + (((v + (typeof(v))(mul) - 1) / ((typeof(v))(mul))) * (typeof(v))(mul)) + +/** + * Macro to align a value to the multiple of given value. The resultant + * value will be of the same type as the first parameter and will be no higher + * than the first parameter. + */ +#define RTE_ALIGN_MUL_FLOOR(v, mul) \ + ((v / ((typeof(v))(mul))) * (typeof(v))(mul)) + +/** + * Macro to align value to the nearest multiple of the given value. + * The resultant value might be greater than or less than the first parameter + * whichever difference is the lowest. + */ +#define RTE_ALIGN_MUL_NEAR(v, mul) \ + ({ \ + typeof(v) ceil = RTE_ALIGN_MUL_CEIL(v, mul); \ + typeof(v) floor = RTE_ALIGN_MUL_FLOOR(v, mul); \ + (ceil - v) > (v - floor) ? 
floor : ceil; \ + }) + +/** + * Checks if a pointer is aligned to a given power-of-two value + * + * @param ptr + * The pointer whose alignment is to be checked + * @param align + * The power-of-two value to which the ptr should be aligned + * + * @return + * True(1) where the pointer is correctly aligned, false(0) otherwise + */ +static inline int +rte_is_aligned(void *ptr, unsigned align) +{ + return RTE_PTR_ALIGN(ptr, align) == ptr; +} + +/*********** Macros for compile type checks ********/ + +/** + * Triggers an error at compilation time if the condition is true. + */ +#define RTE_BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)])) + +/*********** Cache line related macros ********/ + +/** Cache line mask. */ +#define RTE_CACHE_LINE_MASK (RTE_CACHE_LINE_SIZE-1) + +/** Return the first cache-aligned value greater or equal to size. */ +#define RTE_CACHE_LINE_ROUNDUP(size) \ + (RTE_CACHE_LINE_SIZE * ((size + RTE_CACHE_LINE_SIZE - 1) / \ + RTE_CACHE_LINE_SIZE)) + +/** Cache line size in terms of log2 */ +#if RTE_CACHE_LINE_SIZE == 64 +#define RTE_CACHE_LINE_SIZE_LOG2 6 +#elif RTE_CACHE_LINE_SIZE == 128 +#define RTE_CACHE_LINE_SIZE_LOG2 7 +#else +#error "Unsupported cache line size" +#endif + +/** Minimum Cache line size. */ +#define RTE_CACHE_LINE_MIN_SIZE 64 + +/** Force alignment to cache line. */ +#define __rte_cache_aligned __rte_aligned(RTE_CACHE_LINE_SIZE) + +/** Force minimum cache line alignment. */ +#define __rte_cache_min_aligned __rte_aligned(RTE_CACHE_LINE_MIN_SIZE) + +/*********** PA/IOVA type definitions ********/ + +/** Physical address */ +typedef uint64_t phys_addr_t; +#define RTE_BAD_PHYS_ADDR ((phys_addr_t)-1) + +/** + * IO virtual address type. + * When the physical addressing mode (IOVA as PA) is in use, + * the translation from an IO virtual address (IOVA) to a physical address + * is a direct mapping, i.e. the same value. + * Otherwise, in virtual mode (IOVA as VA), an IOMMU may do the translation. + */ +typedef uint64_t rte_iova_t; +#define RTE_BAD_IOVA ((rte_iova_t)-1) + +/*********** Structure alignment markers ********/ + +/** Generic marker for any place in a structure. */ +__extension__ typedef void *RTE_MARKER[0]; +/** Marker for 1B alignment in a structure. */ +__extension__ typedef uint8_t RTE_MARKER8[0]; +/** Marker for 2B alignment in a structure. */ +__extension__ typedef uint16_t RTE_MARKER16[0]; +/** Marker for 4B alignment in a structure. */ +__extension__ typedef uint32_t RTE_MARKER32[0]; +/** Marker for 8B alignment in a structure. */ +__extension__ typedef uint64_t RTE_MARKER64[0]; + +/** + * Combines 32b inputs most significant set bits into the least + * significant bits to construct a value with the same MSBs as x + * but all 1's under it. + * + * @param x + * The integer whose MSBs need to be combined with its LSBs + * @return + * The combined value. + */ +static inline uint32_t +rte_combine32ms1b(register uint32_t x) +{ + x |= x >> 1; + x |= x >> 2; + x |= x >> 4; + x |= x >> 8; + x |= x >> 16; + + return x; +} + +/** + * Combines 64b inputs most significant set bits into the least + * significant bits to construct a value with the same MSBs as x + * but all 1's under it. + * + * @param v + * The integer whose MSBs need to be combined with its LSBs + * @return + * The combined value. 
+ */ +static inline uint64_t +rte_combine64ms1b(register uint64_t v) +{ + v |= v >> 1; + v |= v >> 2; + v |= v >> 4; + v |= v >> 8; + v |= v >> 16; + v |= v >> 32; + + return v; +} + +/*********** Macros to work with powers of 2 ********/ + +/** + * Macro to return 1 if n is a power of 2, 0 otherwise + */ +#define RTE_IS_POWER_OF_2(n) ((n) && !(((n) - 1) & (n))) + +/** + * Returns true if n is a power of 2 + * @param n + * Number to check + * @return 1 if true, 0 otherwise + */ +static inline int +rte_is_power_of_2(uint32_t n) +{ + return n && !(n & (n - 1)); +} + +/** + * Aligns input parameter to the next power of 2 + * + * @param x + * The integer value to align + * + * @return + * Input parameter aligned to the next power of 2 + */ +static inline uint32_t +rte_align32pow2(uint32_t x) +{ + x--; + x = rte_combine32ms1b(x); + + return x + 1; +} + +/** + * Aligns input parameter to the previous power of 2 + * + * @param x + * The integer value to align + * + * @return + * Input parameter aligned to the previous power of 2 + */ +static inline uint32_t +rte_align32prevpow2(uint32_t x) +{ + x = rte_combine32ms1b(x); + + return x - (x >> 1); +} + +/** + * Aligns 64b input parameter to the next power of 2 + * + * @param v + * The 64b value to align + * + * @return + * Input parameter aligned to the next power of 2 + */ +static inline uint64_t +rte_align64pow2(uint64_t v) +{ + v--; + v = rte_combine64ms1b(v); + + return v + 1; +} + +/** + * Aligns 64b input parameter to the previous power of 2 + * + * @param v + * The 64b value to align + * + * @return + * Input parameter aligned to the previous power of 2 + */ +static inline uint64_t +rte_align64prevpow2(uint64_t v) +{ + v = rte_combine64ms1b(v); + + return v - (v >> 1); +} + +/*********** Macros for calculating min and max **********/ + +/** + * Macro to return the minimum of two numbers + */ +#define RTE_MIN(a, b) \ + __extension__ ({ \ + typeof (a) _a = (a); \ + typeof (b) _b = (b); \ + _a < _b ? _a : _b; \ + }) + +/** + * Macro to return the maximum of two numbers + */ +#define RTE_MAX(a, b) \ + __extension__ ({ \ + typeof (a) _a = (a); \ + typeof (b) _b = (b); \ + _a > _b ? _a : _b; \ + }) + +/*********** Other general functions / macros ********/ + +/** + * Searches the input parameter for the least significant set bit + * (starting from zero). + * If a least significant 1 bit is found, its bit index is returned. + * If the content of the input parameter is zero, then the content of the return + * value is undefined. + * @param v + * input parameter, should not be zero. + * @return + * least significant set bit in the input parameter. + */ +static inline uint32_t +rte_bsf32(uint32_t v) +{ + return (uint32_t)__builtin_ctz(v); +} + +/** + * Searches the input parameter for the least significant set bit + * (starting from zero). Safe version (checks for input parameter being zero). + * + * @warning ``pos`` must be a valid pointer. It is not checked! + * + * @param v + * The input parameter. + * @param pos + * If ``v`` was not 0, this value will contain position of least significant + * bit within the input parameter. + * @return + * Returns 0 if ``v`` was 0, otherwise returns 1. + */ +static inline int +rte_bsf32_safe(uint64_t v, uint32_t *pos) +{ + if (v == 0) + return 0; + + *pos = rte_bsf32(v); + return 1; +} + +/** + * Return the rounded-up log2 of a integer. + * + * @note Contrary to the logarithm mathematical operation, + * rte_log2_u32(0) == 0 and not -inf. + * + * @param v + * The input parameter. 
+ * @return + * The rounded-up log2 of the input, or 0 if the input is 0. + */ +static inline uint32_t +rte_log2_u32(uint32_t v) +{ + if (v == 0) + return 0; + v = rte_align32pow2(v); + return rte_bsf32(v); +} + + +/** + * Return the last (most-significant) bit set. + * + * @note The last (most significant) bit is at position 32. + * @note rte_fls_u32(0) = 0, rte_fls_u32(1) = 1, rte_fls_u32(0x80000000) = 32 + * + * @param x + * The input parameter. + * @return + * The last (most-significant) bit set, or 0 if the input is 0. + */ +static inline int +rte_fls_u32(uint32_t x) +{ + return (x == 0) ? 0 : 32 - __builtin_clz(x); +} + +/** + * Searches the input parameter for the least significant set bit + * (starting from zero). + * If a least significant 1 bit is found, its bit index is returned. + * If the content of the input parameter is zero, then the content of the return + * value is undefined. + * @param v + * input parameter, should not be zero. + * @return + * least significant set bit in the input parameter. + */ +static inline int +rte_bsf64(uint64_t v) +{ + return (uint32_t)__builtin_ctzll(v); +} + +/** + * Searches the input parameter for the least significant set bit + * (starting from zero). Safe version (checks for input parameter being zero). + * + * @warning ``pos`` must be a valid pointer. It is not checked! + * + * @param v + * The input parameter. + * @param pos + * If ``v`` was not 0, this value will contain position of least significant + * bit within the input parameter. + * @return + * Returns 0 if ``v`` was 0, otherwise returns 1. + */ +static inline int +rte_bsf64_safe(uint64_t v, uint32_t *pos) +{ + if (v == 0) + return 0; + + *pos = rte_bsf64(v); + return 1; +} + +/** + * Return the last (most-significant) bit set. + * + * @note The last (most significant) bit is at position 64. + * @note rte_fls_u64(0) = 0, rte_fls_u64(1) = 1, + * rte_fls_u64(0x8000000000000000) = 64 + * + * @param x + * The input parameter. + * @return + * The last (most-significant) bit set, or 0 if the input is 0. + */ +static inline int +rte_fls_u64(uint64_t x) +{ + return (x == 0) ? 0 : 64 - __builtin_clzll(x); +} + +/** + * Return the rounded-up log2 of a 64-bit integer. + * + * @note Contrary to the logarithm mathematical operation, + * rte_log2_u64(0) == 0 and not -inf. + * + * @param v + * The input parameter. + * @return + * The rounded-up log2 of the input, or 0 if the input is 0. + */ +static inline uint32_t +rte_log2_u64(uint64_t v) +{ + if (v == 0) + return 0; + v = rte_align64pow2(v); + /* we checked for v being 0 already, so no undefined behavior */ + return rte_bsf64(v); +} + +#ifndef offsetof +/** Return the offset of a field in a structure. */ +#define offsetof(TYPE, MEMBER) __builtin_offsetof (TYPE, MEMBER) +#endif + +/** + * Return pointer to the wrapping struct instance. + * + * Example: + * + * struct wrapper { + * ... + * struct child c; + * ... + * }; + * + * struct child *x = obtain(...); + * struct wrapper *w = container_of(x, struct wrapper, c); + */ +#ifndef container_of +#define container_of(ptr, type, member) __extension__ ({ \ + const typeof(((type *)0)->member) *_ptr = (ptr); \ + __rte_unused type *_target_ptr = \ + (type *)(ptr); \ + (type *)(((uintptr_t)_ptr) - offsetof(type, member)); \ + }) +#endif + +/** + * Get the size of a field in a structure. + * + * @param type + * The type of the structure. + * @param field + * The field in the structure. + * @return + * The size of the field in the structure, in bytes. 
+ */ +#define RTE_SIZEOF_FIELD(type, field) (sizeof(((type *)0)->field)) + +#define _RTE_STR(x) #x +/** Take a macro value and get a string version of it */ +#define RTE_STR(x) _RTE_STR(x) + +/** + * ISO C helpers to modify format strings using variadic macros. + * This is a replacement for the ", ## __VA_ARGS__" GNU extension. + * An empty %s argument is appended to avoid a dangling comma. + */ +#define RTE_FMT(fmt, ...) fmt "%.0s", __VA_ARGS__ "" +#define RTE_FMT_HEAD(fmt, ...) fmt +#define RTE_FMT_TAIL(fmt, ...) __VA_ARGS__ + +/** Mask value of type "tp" for the first "ln" bit set. */ +#define RTE_LEN2MASK(ln, tp) \ + ((tp)((uint64_t)-1 >> (sizeof(uint64_t) * CHAR_BIT - (ln)))) + +/** Number of elements in the array. */ +#define RTE_DIM(a) (sizeof (a) / sizeof ((a)[0])) + +/** + * Converts a numeric string to the equivalent uint64_t value. + * As well as straight number conversion, also recognises the suffixes + * k, m and g for kilobytes, megabytes and gigabytes respectively. + * + * If a negative number is passed in i.e. a string with the first non-black + * character being "-", zero is returned. Zero is also returned in the case of + * an error with the strtoull call in the function. + * + * @param str + * String containing number to convert. + * @return + * Number. + */ +#if 0 +static inline uint64_t +rte_str_to_size(const char *str) +{ + char *endptr; + unsigned long long size; + + while (isspace((int)*str)) + str++; + if (*str == '-') + return 0; + + errno = 0; + size = strtoull(str, &endptr, 0); + if (errno) + return 0; + + if (*endptr == ' ') + endptr++; /* allow 1 space gap */ + + switch (*endptr){ + case 'G': case 'g': size *= 1024; /* fall-through */ + case 'M': case 'm': size *= 1024; /* fall-through */ + case 'K': case 'k': size *= 1024; /* fall-through */ + default: + break; + } + return size; +} +#endif + +/** + * Function to terminate the application immediately, printing an error + * message and returning the exit_code back to the shell. + * + * This function never returns + * + * @param exit_code + * The exit code to be returned by the application + * @param format + * The format string to be used for printing the message. This can include + * printf format characters which will be expanded using any further parameters + * to the function. + */ +__rte_noreturn void +rte_exit(int exit_code, const char *format, ...) + __rte_format_printf(2, 3); + +#ifdef __cplusplus +} +#endif + +#endif Index: sys/contrib/dpdk_rte_lpm/rte_debug.h =================================================================== --- /dev/null +++ sys/contrib/dpdk_rte_lpm/rte_debug.h @@ -0,0 +1,83 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef _RTE_DEBUG_H_ +#define _RTE_DEBUG_H_ + +/** + * @file + * + * Debug Functions in RTE + * + * This file defines a generic API for debug operations. Part of + * the implementation is architecture-specific. + */ + +//#include "rte_log.h" +#include "rte_branch_prediction.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Dump the stack of the calling core to the console. + */ +void rte_dump_stack(void); + +/** + * Dump the registers of the calling core to the console. + * + * Note: Not implemented in a userapp environment; use gdb instead. + */ +void rte_dump_registers(void); + +/** + * Provide notification of a critical non-recoverable error and terminate + * execution abnormally. + * + * Display the format string and its expanded arguments (printf-like). 
+ * + * In a linux environment, this function dumps the stack and calls + * abort() resulting in a core dump if enabled. + * + * The function never returns. + * + * @param ... + * The format string, followed by the variable list of arguments. + */ +#define rte_panic(...) rte_panic_(__func__, __VA_ARGS__, "dummy") +#define rte_panic_(func, format, ...) __rte_panic(func, format "%.0s", __VA_ARGS__) + +#ifdef RTE_ENABLE_ASSERT +#define RTE_ASSERT(exp) RTE_VERIFY(exp) +#else +#define RTE_ASSERT(exp) do {} while (0) +#endif +#define RTE_VERIFY(exp) do { \ + if (unlikely(!(exp))) \ + rte_panic("line %d\tassert \"%s\" failed\n", __LINE__, #exp); \ +} while (0) + +/* + * Provide notification of a critical non-recoverable error and stop. + * + * This function should not be called directly. Refer to rte_panic() macro + * documentation. + */ +void __rte_panic(const char *funcname , const char *format, ...) +{ +#ifdef __GNUC__ +#if (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ > 2)) + __rte_cold +#endif +#endif + //__rte_noreturn + //__rte_format_printf(2, 3); + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_DEBUG_H_ */ Index: sys/contrib/dpdk_rte_lpm/rte_jhash.h =================================================================== --- /dev/null +++ sys/contrib/dpdk_rte_lpm/rte_jhash.h @@ -0,0 +1,379 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2015 Intel Corporation. + */ + +#ifndef _RTE_JHASH_H +#define _RTE_JHASH_H + +/** + * @file + * + * jhash functions. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +//#include + +/* jhash.h: Jenkins hash support. + * + * Copyright (C) 2006 Bob Jenkins (bob_jenkins@burtleburtle.net) + * + * http://burtleburtle.net/bob/hash/ + * + * These are the credits from Bob's sources: + * + * lookup3.c, by Bob Jenkins, May 2006, Public Domain. + * + * These are functions for producing 32-bit hashes for hash table lookup. + * hashword(), hashlittle(), hashlittle2(), hashbig(), mix(), and final() + * are externally useful functions. Routines to test the hash are included + * if SELF_TEST is defined. You can use this free for any purpose. It's in + * the public domain. It has no warranty. + * + * $FreeBSD$ + */ + +#define rot(x, k) (((x) << (k)) | ((x) >> (32-(k)))) + +/** @internal Internal function. NOTE: Arguments are modified. */ +#define __rte_jhash_mix(a, b, c) do { \ + a -= c; a ^= rot(c, 4); c += b; \ + b -= a; b ^= rot(a, 6); a += c; \ + c -= b; c ^= rot(b, 8); b += a; \ + a -= c; a ^= rot(c, 16); c += b; \ + b -= a; b ^= rot(a, 19); a += c; \ + c -= b; c ^= rot(b, 4); b += a; \ +} while (0) + +#define __rte_jhash_final(a, b, c) do { \ + c ^= b; c -= rot(b, 14); \ + a ^= c; a -= rot(c, 11); \ + b ^= a; b -= rot(a, 25); \ + c ^= b; c -= rot(b, 16); \ + a ^= c; a -= rot(c, 4); \ + b ^= a; b -= rot(a, 14); \ + c ^= b; c -= rot(b, 24); \ +} while (0) + +/** The golden ratio: an arbitrary value. 
*/ +#define RTE_JHASH_GOLDEN_RATIO 0xdeadbeef + +#if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN +#define BIT_SHIFT(x, y, k) (((x) >> (k)) | ((uint64_t)(y) << (32-(k)))) +#else +#define BIT_SHIFT(x, y, k) (((uint64_t)(x) << (k)) | ((y) >> (32-(k)))) +#endif + +#define LOWER8b_MASK rte_le_to_cpu_32(0xff) +#define LOWER16b_MASK rte_le_to_cpu_32(0xffff) +#define LOWER24b_MASK rte_le_to_cpu_32(0xffffff) + +static inline void +__rte_jhash_2hashes(const void *key, uint32_t length, uint32_t *pc, + uint32_t *pb, unsigned check_align) +{ + uint32_t a, b, c; + + /* Set up the internal state */ + a = b = c = RTE_JHASH_GOLDEN_RATIO + ((uint32_t)length) + *pc; + c += *pb; + + /* + * Check key alignment. For x86 architecture, first case is always optimal + * If check_align is not set, first case will be used + */ +#if defined(RTE_ARCH_X86_64) || defined(RTE_ARCH_I686) || defined(RTE_ARCH_X86_X32) + const uint32_t *k = (const uint32_t *)key; + const uint32_t s = 0; +#else + const uint32_t *k = (uint32_t *)((uintptr_t)key & (uintptr_t)~3); + const uint32_t s = ((uintptr_t)key & 3) * CHAR_BIT; +#endif + if (!check_align || s == 0) { + while (length > 12) { + a += k[0]; + b += k[1]; + c += k[2]; + + __rte_jhash_mix(a, b, c); + + k += 3; + length -= 12; + } + + switch (length) { + case 12: + c += k[2]; b += k[1]; a += k[0]; break; + case 11: + c += k[2] & LOWER24b_MASK; b += k[1]; a += k[0]; break; + case 10: + c += k[2] & LOWER16b_MASK; b += k[1]; a += k[0]; break; + case 9: + c += k[2] & LOWER8b_MASK; b += k[1]; a += k[0]; break; + case 8: + b += k[1]; a += k[0]; break; + case 7: + b += k[1] & LOWER24b_MASK; a += k[0]; break; + case 6: + b += k[1] & LOWER16b_MASK; a += k[0]; break; + case 5: + b += k[1] & LOWER8b_MASK; a += k[0]; break; + case 4: + a += k[0]; break; + case 3: + a += k[0] & LOWER24b_MASK; break; + case 2: + a += k[0] & LOWER16b_MASK; break; + case 1: + a += k[0] & LOWER8b_MASK; break; + /* zero length strings require no mixing */ + case 0: + *pc = c; + *pb = b; + return; + }; + } else { + /* all but the last block: affect some 32 bits of (a, b, c) */ + while (length > 12) { + a += BIT_SHIFT(k[0], k[1], s); + b += BIT_SHIFT(k[1], k[2], s); + c += BIT_SHIFT(k[2], k[3], s); + __rte_jhash_mix(a, b, c); + + k += 3; + length -= 12; + } + + /* last block: affect all 32 bits of (c) */ + switch (length) { + case 12: + a += BIT_SHIFT(k[0], k[1], s); + b += BIT_SHIFT(k[1], k[2], s); + c += BIT_SHIFT(k[2], k[3], s); + break; + case 11: + a += BIT_SHIFT(k[0], k[1], s); + b += BIT_SHIFT(k[1], k[2], s); + c += BIT_SHIFT(k[2], k[3], s) & LOWER24b_MASK; + break; + case 10: + a += BIT_SHIFT(k[0], k[1], s); + b += BIT_SHIFT(k[1], k[2], s); + c += BIT_SHIFT(k[2], k[3], s) & LOWER16b_MASK; + break; + case 9: + a += BIT_SHIFT(k[0], k[1], s); + b += BIT_SHIFT(k[1], k[2], s); + c += BIT_SHIFT(k[2], k[3], s) & LOWER8b_MASK; + break; + case 8: + a += BIT_SHIFT(k[0], k[1], s); + b += BIT_SHIFT(k[1], k[2], s); + break; + case 7: + a += BIT_SHIFT(k[0], k[1], s); + b += BIT_SHIFT(k[1], k[2], s) & LOWER24b_MASK; + break; + case 6: + a += BIT_SHIFT(k[0], k[1], s); + b += BIT_SHIFT(k[1], k[2], s) & LOWER16b_MASK; + break; + case 5: + a += BIT_SHIFT(k[0], k[1], s); + b += BIT_SHIFT(k[1], k[2], s) & LOWER8b_MASK; + break; + case 4: + a += BIT_SHIFT(k[0], k[1], s); + break; + case 3: + a += BIT_SHIFT(k[0], k[1], s) & LOWER24b_MASK; + break; + case 2: + a += BIT_SHIFT(k[0], k[1], s) & LOWER16b_MASK; + break; + case 1: + a += BIT_SHIFT(k[0], k[1], s) & LOWER8b_MASK; + break; + /* zero length strings require no mixing */ + case 0: + *pc 
= c; + *pb = b; + return; + } + } + + __rte_jhash_final(a, b, c); + + *pc = c; + *pb = b; +} + +/** + * Same as rte_jhash, but takes two seeds and returns two uint32_ts. + * pc and pb must be non-null, and *pc and *pb must both be initialized + * with seeds. If you pass in (*pb)=0, the output (*pc) will be + * the same as the return value from rte_jhash. + * + * @param key + * Key to calculate hash of. + * @param length + * Length of key in bytes. + * @param pc + * IN: seed OUT: primary hash value. + * @param pb + * IN: second seed OUT: secondary hash value. + */ +static inline void +rte_jhash_2hashes(const void *key, uint32_t length, uint32_t *pc, uint32_t *pb) +{ + __rte_jhash_2hashes(key, length, pc, pb, 1); +} + +/** + * Same as rte_jhash_32b, but takes two seeds and returns two uint32_ts. + * pc and pb must be non-null, and *pc and *pb must both be initialized + * with seeds. If you pass in (*pb)=0, the output (*pc) will be + * the same as the return value from rte_jhash_32b. + * + * @param k + * Key to calculate hash of. + * @param length + * Length of key in units of 4 bytes. + * @param pc + * IN: seed OUT: primary hash value. + * @param pb + * IN: second seed OUT: secondary hash value. + */ +static inline void +rte_jhash_32b_2hashes(const uint32_t *k, uint32_t length, uint32_t *pc, uint32_t *pb) +{ + __rte_jhash_2hashes((const void *) k, (length << 2), pc, pb, 0); +} + +/** + * The most generic version, hashes an arbitrary sequence + * of bytes. No alignment or length assumptions are made about + * the input key. For keys not aligned to four byte boundaries + * or a multiple of four bytes in length, the memory region + * just after may be read (but not used in the computation). + * This may cross a page boundary. + * + * @param key + * Key to calculate hash of. + * @param length + * Length of key in bytes. + * @param initval + * Initialising value of hash. + * @return + * Calculated hash value. + */ +static inline uint32_t +rte_jhash(const void *key, uint32_t length, uint32_t initval) +{ + uint32_t initval2 = 0; + + rte_jhash_2hashes(key, length, &initval, &initval2); + + return initval; +} + +/** + * A special optimized version that handles one or more uint32_ts. + * The length parameter here is the number of uint32_ts in the key. + * + * @param k + * Key to calculate hash of. + * @param length + * Length of key in units of 4 bytes. + * @param initval + * Initialising value of hash. + * @return + * Calculated hash value. + */ +static inline uint32_t +rte_jhash_32b(const uint32_t *k, uint32_t length, uint32_t initval) +{ + uint32_t initval2 = 0; + + rte_jhash_32b_2hashes(k, length, &initval, &initval2); + + return initval; +} + +static inline uint32_t +__rte_jhash_3words(uint32_t a, uint32_t b, uint32_t c, uint32_t initval) +{ + a += RTE_JHASH_GOLDEN_RATIO + initval; + b += RTE_JHASH_GOLDEN_RATIO + initval; + c += RTE_JHASH_GOLDEN_RATIO + initval; + + __rte_jhash_final(a, b, c); + + return c; +} + +/** + * A special ultra-optimized version that knows it is hashing exactly + * 3 words. + * + * @param a + * First word to calculate hash of. + * @param b + * Second word to calculate hash of. + * @param c + * Third word to calculate hash of. + * @param initval + * Initialising value of hash. + * @return + * Calculated hash value. + */ +static inline uint32_t +rte_jhash_3words(uint32_t a, uint32_t b, uint32_t c, uint32_t initval) +{ + return __rte_jhash_3words(a + 12, b + 12, c + 12, initval); +} + +/** + * A special ultra-optimized version that knows it is hashing exactly + * 2 words. 
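The byte-oriented and word-oriented entry points above differ only in how the length argument is expressed: rte_jhash() takes it in bytes, rte_jhash_32b() takes it in 32-bit words, and for a 4-byte-aligned key both produce the same value. A minimal sketch with a hypothetical four-word key (illustrative only, not taken from the imported sources):

	uint32_t key[4] = { src_ip, dst_ip, ports, proto };	/* assumed caller inputs */
	uint32_t h1 = rte_jhash(key, sizeof(key), 0);		/* length in bytes */
	uint32_t h2 = rte_jhash_32b(key, 4, 0);			/* length in words; h2 == h1 */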
+ * + * @param a + * First word to calculate hash of. + * @param b + * Second word to calculate hash of. + * @param initval + * Initialising value of hash. + * @return + * Calculated hash value. + */ +static inline uint32_t +rte_jhash_2words(uint32_t a, uint32_t b, uint32_t initval) +{ + return __rte_jhash_3words(a + 8, b + 8, 8, initval); +} + +/** + * A special ultra-optimized versions that knows it is hashing exactly + * 1 word. + * + * @param a + * Word to calculate hash of. + * @param initval + * Initialising value of hash. + * @return + * Calculated hash value. + */ +static inline uint32_t +rte_jhash_1word(uint32_t a, uint32_t initval) +{ + return __rte_jhash_3words(a + 4, 4, 4, initval); +} + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_JHASH_H */ Index: sys/contrib/dpdk_rte_lpm/rte_log.h =================================================================== --- /dev/null +++ sys/contrib/dpdk_rte_lpm/rte_log.h @@ -0,0 +1,383 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2017 Intel Corporation + */ + +#ifndef _RTE_LOG_H_ +#define _RTE_LOG_H_ + +/** + * @file + * + * RTE Logs API + * + * This file provides a log API to RTE applications. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include +#include +#include + +#include +#include +#include + +struct rte_log_dynamic_type; + +/** The rte_log structure. */ +struct rte_logs { + uint32_t type; /**< Bitfield with enabled logs. */ + uint32_t level; /**< Log level. */ + FILE *file; /**< Output file set by rte_openlog_stream, or NULL. */ + size_t dynamic_types_len; + struct rte_log_dynamic_type *dynamic_types; +}; + +/** Global log information */ +extern struct rte_logs rte_logs; + +/* SDK log type */ +#define RTE_LOGTYPE_EAL 0 /**< Log related to eal. */ +#define RTE_LOGTYPE_MALLOC 1 /**< Log related to malloc. */ +#define RTE_LOGTYPE_RING 2 /**< Log related to ring. */ +#define RTE_LOGTYPE_MEMPOOL 3 /**< Log related to mempool. */ +#define RTE_LOGTYPE_TIMER 4 /**< Log related to timers. */ +#define RTE_LOGTYPE_PMD 5 /**< Log related to poll mode driver. */ +#define RTE_LOGTYPE_HASH 6 /**< Log related to hash table. */ +#define RTE_LOGTYPE_LPM 7 /**< Log related to LPM. */ +#define RTE_LOGTYPE_KNI 8 /**< Log related to KNI. */ +#define RTE_LOGTYPE_ACL 9 /**< Log related to ACL. */ +#define RTE_LOGTYPE_POWER 10 /**< Log related to power. */ +#define RTE_LOGTYPE_METER 11 /**< Log related to QoS meter. */ +#define RTE_LOGTYPE_SCHED 12 /**< Log related to QoS port scheduler. */ +#define RTE_LOGTYPE_PORT 13 /**< Log related to port. */ +#define RTE_LOGTYPE_TABLE 14 /**< Log related to table. */ +#define RTE_LOGTYPE_PIPELINE 15 /**< Log related to pipeline. */ +#define RTE_LOGTYPE_MBUF 16 /**< Log related to mbuf. */ +#define RTE_LOGTYPE_CRYPTODEV 17 /**< Log related to cryptodev. */ +#define RTE_LOGTYPE_EFD 18 /**< Log related to EFD. */ +#define RTE_LOGTYPE_EVENTDEV 19 /**< Log related to eventdev. */ +#define RTE_LOGTYPE_GSO 20 /**< Log related to GSO. */ + +/* these log types can be used in an application */ +#define RTE_LOGTYPE_USER1 24 /**< User-defined log type 1. */ +#define RTE_LOGTYPE_USER2 25 /**< User-defined log type 2. */ +#define RTE_LOGTYPE_USER3 26 /**< User-defined log type 3. */ +#define RTE_LOGTYPE_USER4 27 /**< User-defined log type 4. */ +#define RTE_LOGTYPE_USER5 28 /**< User-defined log type 5. */ +#define RTE_LOGTYPE_USER6 29 /**< User-defined log type 6. */ +#define RTE_LOGTYPE_USER7 30 /**< User-defined log type 7. 
*/ +#define RTE_LOGTYPE_USER8 31 /**< User-defined log type 8. */ + +/** First identifier for extended logs */ +#define RTE_LOGTYPE_FIRST_EXT_ID 32 + +/* Can't use 0, as it gives compiler warnings */ +#define RTE_LOG_EMERG 1U /**< System is unusable. */ +#define RTE_LOG_ALERT 2U /**< Action must be taken immediately. */ +#define RTE_LOG_CRIT 3U /**< Critical conditions. */ +#define RTE_LOG_ERR 4U /**< Error conditions. */ +#define RTE_LOG_WARNING 5U /**< Warning conditions. */ +#define RTE_LOG_NOTICE 6U /**< Normal but significant condition. */ +#define RTE_LOG_INFO 7U /**< Informational. */ +#define RTE_LOG_DEBUG 8U /**< Debug-level messages. */ + +/** + * Change the stream that will be used by the logging system. + * + * This can be done at any time. The f argument represents the stream + * to be used to send the logs. If f is NULL, the default output is + * used (stderr). + * + * @param f + * Pointer to the stream. + * @return + * - 0 on success. + * - Negative on error. + */ +int rte_openlog_stream(FILE *f); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Retrieve the stream used by the logging system (see rte_openlog_stream() + * to change it). + * + * @return + * Pointer to the stream. + */ +__rte_experimental +FILE *rte_log_get_stream(void); + +/** + * Set the global log level. + * + * After this call, logs with a level lower or equal than the level + * passed as argument will be displayed. + * + * @param level + * Log level. A value between RTE_LOG_EMERG (1) and RTE_LOG_DEBUG (8). + */ +void rte_log_set_global_level(uint32_t level); + +/** + * Get the global log level. + * + * @return + * The current global log level. + */ +uint32_t rte_log_get_global_level(void); + +/** + * Get the log level for a given type. + * + * @param logtype + * The log type identifier. + * @return + * 0 on success, a negative value if logtype is invalid. + */ +int rte_log_get_level(uint32_t logtype); + +/** + * For a given `logtype`, check if a log with `loglevel` can be printed. + * + * @param logtype + * The log type identifier + * @param loglevel + * Log level. A value between RTE_LOG_EMERG (1) and RTE_LOG_DEBUG (8). + * @return + * Returns 'true' if log can be printed and 'false' if it can't. + */ +__rte_experimental +bool rte_log_can_log(uint32_t logtype, uint32_t loglevel); + +/** + * Set the log level for a given type based on shell pattern. + * + * @param pattern + * The match pattern identifying the log type. + * @param level + * The level to be set. + * @return + * 0 on success, a negative value if level is invalid. + */ +int rte_log_set_level_pattern(const char *pattern, uint32_t level); + +/** + * Set the log level for a given type based on regular expression. + * + * @param regex + * The regular expression identifying the log type. + * @param level + * The level to be set. + * @return + * 0 on success, a negative value if level is invalid. + */ +int rte_log_set_level_regexp(const char *regex, uint32_t level); + +/** + * Set the log level for a given type. + * + * @param logtype + * The log type identifier. + * @param level + * The level to be set. + * @return + * 0 on success, a negative value if logtype or level is invalid. + */ +int rte_log_set_level(uint32_t logtype, uint32_t level); + +/** + * Get the current loglevel for the message being processed. + * + * Before calling the user-defined stream for logging, the log + * subsystem sets a per-lcore variable containing the loglevel and the + * logtype of the message being processed. 
This information can be + * accessed by the user-defined log output function through this + * function. + * + * @return + * The loglevel of the message being processed. + */ +int rte_log_cur_msg_loglevel(void); + +/** + * Get the current logtype for the message being processed. + * + * Before calling the user-defined stream for logging, the log + * subsystem sets a per-lcore variable containing the loglevel and the + * logtype of the message being processed. This information can be + * accessed by the user-defined log output function through this + * function. + * + * @return + * The logtype of the message being processed. + */ +int rte_log_cur_msg_logtype(void); + +/** + * Register a dynamic log type + * + * If a log is already registered with the same type, the returned value + * is the same than the previous one. + * + * @param name + * The string identifying the log type. + * @return + * - >0: success, the returned value is the log type identifier. + * - (-ENOMEM): cannot allocate memory. + */ +int rte_log_register(const char *name); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Register a dynamic log type and try to pick its level from EAL options + * + * rte_log_register() is called inside. If successful, the function tries + * to search for matching regexp in the list of EAL log level options and + * pick the level from the last matching entry. If nothing can be applied + * from the list, the level will be set to the user-defined default value. + * + * @param name + * Name for the log type to be registered + * @param level_def + * Fallback level to be set if the global list has no matching options + * @return + * - >=0: the newly registered log type + * - <0: rte_log_register() error value + */ +__rte_experimental +int rte_log_register_type_and_pick_level(const char *name, uint32_t level_def); + +/** + * Dump log information. + * + * Dump the global level and the registered log types. + * + * @param f + * The output stream where the dump should be sent. + */ +void rte_log_dump(FILE *f); + +/** + * Generates a log message. + * + * The message will be sent in the stream defined by the previous call + * to rte_openlog_stream(). + * + * The level argument determines if the log should be displayed or + * not, depending on the global rte_logs variable. + * + * The preferred alternative is the RTE_LOG() because it adds the + * level and type in the logged string. + * + * @param level + * Log level. A value between RTE_LOG_EMERG (1) and RTE_LOG_DEBUG (8). + * @param logtype + * The log type, for example, RTE_LOGTYPE_EAL. + * @param format + * The format string, as in printf(3), followed by the variable arguments + * required by the format. + * @return + * - 0: Success. + * - Negative on error. + */ +int rte_log(uint32_t level, uint32_t logtype, const char *format, ...) +#ifdef __GNUC__ +#if (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ > 2)) + __rte_cold +#endif +#endif + __rte_format_printf(3, 4); + +/** + * Generates a log message. + * + * The message will be sent in the stream defined by the previous call + * to rte_openlog_stream(). + * + * The level argument determines if the log should be displayed or + * not, depending on the global rte_logs variable. A trailing + * newline may be added if needed. + * + * The preferred alternative is the RTE_LOG() because it adds the + * level and type in the logged string. + * + * @param level + * Log level. A value between RTE_LOG_EMERG (1) and RTE_LOG_DEBUG (8). 
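Levels and types compose: a message is printed only if its level passes both the global level and the per-type level. A small sketch of turning on LPM debug output and emitting one message through the declarations above (illustrative values; 'used' is a hypothetical counter, not from this change):

	rte_log_set_global_level(RTE_LOG_DEBUG);
	rte_log_set_level(RTE_LOGTYPE_LPM, RTE_LOG_DEBUG);
	if (rte_log_can_log(RTE_LOGTYPE_LPM, RTE_LOG_DEBUG))
		rte_log(RTE_LOG_DEBUG, RTE_LOGTYPE_LPM,
		    "LPM: %u tbl8 groups in use\n", used);	/* 'used' is hypothetical */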
+ * @param logtype + * The log type, for example, RTE_LOGTYPE_EAL. + * @param format + * The format string, as in printf(3), followed by the variable arguments + * required by the format. + * @param ap + * The va_list of the variable arguments required by the format. + * @return + * - 0: Success. + * - Negative on error. + */ +int rte_vlog(uint32_t level, uint32_t logtype, const char *format, va_list ap) + __rte_format_printf(3, 0); + +/** + * Generates a log message. + * + * The RTE_LOG() is a helper that prefixes the string with the log level + * and type, and call rte_log(). + * + * @param l + * Log level. A value between EMERG (1) and DEBUG (8). The short name is + * expanded by the macro, so it cannot be an integer value. + * @param t + * The log type, for example, EAL. The short name is expanded by the + * macro, so it cannot be an integer value. + * @param ... + * The fmt string, as in printf(3), followed by the variable arguments + * required by the format. + * @return + * - 0: Success. + * - Negative on error. + */ +#define RTE_LOG(l, t, ...) \ + rte_log(RTE_LOG_ ## l, \ + RTE_LOGTYPE_ ## t, # t ": " __VA_ARGS__) + +/** + * Generates a log message for data path. + * + * Similar to RTE_LOG(), except that it is removed at compilation time + * if the RTE_LOG_DP_LEVEL configuration option is lower than the log + * level argument. + * + * @param l + * Log level. A value between EMERG (1) and DEBUG (8). The short name is + * expanded by the macro, so it cannot be an integer value. + * @param t + * The log type, for example, EAL. The short name is expanded by the + * macro, so it cannot be an integer value. + * @param ... + * The fmt string, as in printf(3), followed by the variable arguments + * required by the format. + * @return + * - 0: Success. + * - Negative on error. + */ +#define RTE_LOG_DP(l, t, ...) \ + (void)((RTE_LOG_ ## l <= RTE_LOG_DP_LEVEL) ? \ + rte_log(RTE_LOG_ ## l, \ + RTE_LOGTYPE_ ## t, # t ": " __VA_ARGS__) : \ + 0) + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_LOG_H_ */ Index: sys/contrib/dpdk_rte_lpm/rte_lpm.h =================================================================== --- /dev/null +++ sys/contrib/dpdk_rte_lpm/rte_lpm.h @@ -0,0 +1,403 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef _RTE_LPM_H_ +#define _RTE_LPM_H_ + +/** + * @file + * RTE Longest Prefix Match (LPM) + */ + +/* +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +*/ +#include "rte_branch_prediction.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** Max number of characters in LPM name. */ +#define RTE_LPM_NAMESIZE 32 + +/** Maximum depth value possible for IPv4 LPM. */ +#define RTE_LPM_MAX_DEPTH 32 + +/** @internal Total number of tbl24 entries. */ +#define RTE_LPM_TBL24_NUM_ENTRIES (1 << 24) + +/** @internal Number of entries in a tbl8 group. */ +#define RTE_LPM_TBL8_GROUP_NUM_ENTRIES 256 + +/** @internal Max number of tbl8 groups in the tbl8. */ +#define RTE_LPM_MAX_TBL8_NUM_GROUPS (1 << 24) + +/** @internal Total number of tbl8 groups in the tbl8. */ +#define RTE_LPM_TBL8_NUM_GROUPS 256 + +/** @internal Total number of tbl8 entries. */ +#define RTE_LPM_TBL8_NUM_ENTRIES (RTE_LPM_TBL8_NUM_GROUPS * \ + RTE_LPM_TBL8_GROUP_NUM_ENTRIES) + +/** @internal Macro to enable/disable run-time checks. 
*/ +#if defined(RTE_LIBRTE_LPM_DEBUG) +#define RTE_LPM_RETURN_IF_TRUE(cond, retval) do { \ + if (cond) return (retval); \ +} while (0) +#else +#define RTE_LPM_RETURN_IF_TRUE(cond, retval) +#endif + +/** @internal bitmask with valid and valid_group fields set */ +#define RTE_LPM_VALID_EXT_ENTRY_BITMASK 0x03000000 + +/** Bitmask used to indicate successful lookup */ +#define RTE_LPM_LOOKUP_SUCCESS 0x01000000 + +#if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN +/** @internal Tbl24 entry structure. */ +__extension__ +struct rte_lpm_tbl_entry { + /** + * Stores Next hop (tbl8 or tbl24 when valid_group is not set) or + * a group index pointing to a tbl8 structure (tbl24 only, when + * valid_group is set) + */ + uint32_t next_hop :24; + /* Using single uint8_t to store 3 values. */ + uint32_t valid :1; /**< Validation flag. */ + /** + * For tbl24: + * - valid_group == 0: entry stores a next hop + * - valid_group == 1: entry stores a group_index pointing to a tbl8 + * For tbl8: + * - valid_group indicates whether the current tbl8 is in use or not + */ + uint32_t valid_group :1; + uint32_t depth :6; /**< Rule depth. */ +}; + +#else + +__extension__ +struct rte_lpm_tbl_entry { + uint32_t depth :6; + uint32_t valid_group :1; + uint32_t valid :1; + uint32_t next_hop :24; + +}; + +#endif + +/** LPM configuration structure. */ +struct rte_lpm_config { + uint32_t max_rules; /**< Max number of rules. */ + uint32_t number_tbl8s; /**< Number of tbl8s to allocate. */ + int flags; /**< This field is currently unused. */ +}; + +/** @internal Rule structure. */ +struct rte_lpm_rule { + uint32_t ip; /**< Rule IP address. */ + uint32_t next_hop; /**< Rule next hop. */ +}; + +/** @internal Contains metadata about the rules table. */ +struct rte_lpm_rule_info { + uint32_t used_rules; /**< Used rules so far. */ + uint32_t first_rule; /**< Indexes the first rule of a given depth. */ +}; + +struct nhop_object; +struct rte_lpm_external { + struct nhop_object **nh_idx; /**< # -> idx mappings */ + uint32_t default_idx; /* nhop index of default route */ + uint32_t fibnum; /* fib index */ +}; + +/** @internal LPM structure. */ +struct rte_lpm { + /* LPM metadata. */ + struct rte_lpm_external ext; + char name[RTE_LPM_NAMESIZE]; /**< Name of the lpm. */ + uint32_t max_rules; /**< Max. balanced rules per lpm. */ + uint32_t number_tbl8s; /**< Number of tbl8s. */ + struct rte_lpm_rule_info rule_info[RTE_LPM_MAX_DEPTH]; /**< Rule info table. */ + + /* LPM Tables. */ + struct rte_lpm_tbl_entry tbl24[RTE_LPM_TBL24_NUM_ENTRIES] + __rte_cache_aligned; /**< LPM tbl24 table. */ + struct rte_lpm_tbl_entry *tbl8; /**< LPM tbl8 table. */ + struct rte_lpm_rule *rules_tbl; /**< LPM rules. */ +}; + +/** + * Create an LPM object. + * + * @param name + * LPM object name + * @param socket_id + * NUMA socket ID for LPM table memory allocation + * @param config + * Structure containing the configuration + * @return + * Handle to LPM object on success, NULL otherwise with rte_errno set + * to an appropriate values. 
Possible rte_errno values include: + * - E_RTE_NO_CONFIG - function could not get pointer to rte_config structure + * - E_RTE_SECONDARY - function was called from a secondary process instance + * - EINVAL - invalid parameter passed to function + * - ENOSPC - the maximum number of memzones has already been allocated + * - EEXIST - a memzone with the same name already exists + * - ENOMEM - no appropriate memory area found in which to create memzone + */ +struct rte_lpm * +rte_lpm_create(const char *name, int socket_id, + const struct rte_lpm_config *config); + +/** + * Find an existing LPM object and return a pointer to it. + * + * @param name + * Name of the lpm object as passed to rte_lpm_create() + * @return + * Pointer to lpm object or NULL if object not found with rte_errno + * set appropriately. Possible rte_errno values include: + * - ENOENT - required entry not available to return. + */ +struct rte_lpm * +rte_lpm_find_existing(const char *name); + +/** + * Free an LPM object. + * + * @param lpm + * LPM object handle + * @return + * None + */ +void +rte_lpm_free(struct rte_lpm *lpm); + +/** + * Add a rule to the LPM table. + * + * @param lpm + * LPM object handle + * @param ip + * IP of the rule to be added to the LPM table + * @param depth + * Depth of the rule to be added to the LPM table + * @param next_hop + * Next hop of the rule to be added to the LPM table + * @return + * 0 on success, negative value otherwise + */ +int +rte_lpm_add(struct rte_lpm *lpm, uint32_t ip, uint8_t depth, uint32_t next_hop); + +/** + * Check if a rule is present in the LPM table, + * and provide its next hop if it is. + * + * @param lpm + * LPM object handle + * @param ip + * IP of the rule to be searched + * @param depth + * Depth of the rule to be searched + * @param next_hop + * Next hop of the rule (valid only if it is found) + * @return + * 1 if the rule exists, 0 if it does not, a negative value on failure + */ +int +rte_lpm_is_rule_present(struct rte_lpm *lpm, uint32_t ip, uint8_t depth, +uint32_t *next_hop); + +/** + * Delete a rule from the LPM table. + * + * @param lpm + * LPM object handle + * @param ip + * IP of the rule to be deleted from the LPM table + * @param depth + * Depth of the rule to be deleted from the LPM table + * @param sub_rule_depth + * Depth of the parent rule + * @param sub_rule_nhop + * Nexthop index of the parent rule + * @return + * 0 on success, negative value otherwise + */ +int +rte_lpm_delete(struct rte_lpm *lpm, uint32_t ip, uint8_t depth, + uint8_t sub_rule_depth, uint32_t sub_rule_nhop); + +/** + * Delete all rules from the LPM table. + * + * @param lpm + * LPM object handle + */ +void +rte_lpm_delete_all(struct rte_lpm *lpm); + +/** + * Lookup an IP in the LPM table. + * + * @param lpm + * LPM object handle + * @param ip + * IP to be looked up in the LPM table + * @param next_hop + * Next hop of the most specific rule found for IP (valid on lookup hit only) + * @return + * -EINVAL for incorrect arguments, -ENOENT on lookup miss, 0 on lookup hit + */ +static inline int +rte_lpm_lookup(struct rte_lpm *lpm, uint32_t ip, uint32_t *next_hop) +{ + unsigned tbl24_index = (ip >> 8); + uint32_t tbl_entry; + const uint32_t *ptbl; + + /* DEBUG: Check user input arguments. */ + RTE_LPM_RETURN_IF_TRUE(((lpm == NULL) || (next_hop == NULL)), -EINVAL); + + /* Copy tbl24 entry */ + ptbl = (const uint32_t *)(&lpm->tbl24[tbl24_index]); + tbl_entry = *ptbl; + + /* Memory ordering is not required in lookup. 
Because a dataflow + * dependency exists, the compiler or HW won't be able to re-order + * the operations. + */ + /* Copy tbl8 entry (only if needed) */ + if (unlikely((tbl_entry & RTE_LPM_VALID_EXT_ENTRY_BITMASK) == + RTE_LPM_VALID_EXT_ENTRY_BITMASK)) { + + unsigned tbl8_index = (uint8_t)ip + + (((uint32_t)tbl_entry & 0x00FFFFFF) * + RTE_LPM_TBL8_GROUP_NUM_ENTRIES); + + ptbl = (const uint32_t *)&lpm->tbl8[tbl8_index]; + tbl_entry = *ptbl; + } + + *next_hop = ((uint32_t)tbl_entry & 0x00FFFFFF); + return (tbl_entry & RTE_LPM_LOOKUP_SUCCESS) ? 0 : -ENOENT; +} + +/** + * Lookup multiple IP addresses in an LPM table. This may be implemented as a + * macro, so the address of the function should not be used. + * + * @param lpm + * LPM object handle + * @param ips + * Array of IPs to be looked up in the LPM table + * @param next_hops + * Next hop of the most specific rule found for IP (valid on lookup hit only). + * This is an array of two byte values. The most significant byte in each + * value says whether the lookup was successful (bitmask + * RTE_LPM_LOOKUP_SUCCESS is set). The least significant byte is the + * actual next hop. + * @param n + * Number of elements in ips (and next_hops) array to lookup. This should be a + * compile time constant, and divisible by 8 for best performance. + * @return + * -EINVAL for incorrect arguments, otherwise 0 + */ +#define rte_lpm_lookup_bulk(lpm, ips, next_hops, n) \ + rte_lpm_lookup_bulk_func(lpm, ips, next_hops, n) + +static inline int +rte_lpm_lookup_bulk_func(const struct rte_lpm *lpm, const uint32_t *ips, + uint32_t *next_hops, const unsigned n) +{ + unsigned i; + unsigned tbl24_indexes[n]; + const uint32_t *ptbl; + + /* DEBUG: Check user input arguments. */ + RTE_LPM_RETURN_IF_TRUE(((lpm == NULL) || (ips == NULL) || + (next_hops == NULL)), -EINVAL); + + for (i = 0; i < n; i++) { + tbl24_indexes[i] = ips[i] >> 8; + } + + for (i = 0; i < n; i++) { + /* Simply copy tbl24 entry to output */ + ptbl = (const uint32_t *)&lpm->tbl24[tbl24_indexes[i]]; + next_hops[i] = *ptbl; + + /* Overwrite output with tbl8 entry if needed */ + if (unlikely((next_hops[i] & RTE_LPM_VALID_EXT_ENTRY_BITMASK) == + RTE_LPM_VALID_EXT_ENTRY_BITMASK)) { + + unsigned tbl8_index = (uint8_t)ips[i] + + (((uint32_t)next_hops[i] & 0x00FFFFFF) * + RTE_LPM_TBL8_GROUP_NUM_ENTRIES); + + ptbl = (const uint32_t *)&lpm->tbl8[tbl8_index]; + next_hops[i] = *ptbl; + } + } + return 0; +} + +/* Mask four results. */ +#define RTE_LPM_MASKX4_RES UINT64_C(0x00ffffff00ffffff) + +/** + * Lookup four IP addresses in an LPM table. + * + * @param lpm + * LPM object handle + * @param ip + * Four IPs to be looked up in the LPM table + * @param hop + * Next hop of the most specific rule found for IP (valid on lookup hit only). + * This is a 4-element array of two byte values. + * If the lookup was successful for the given IP, then the least significant byte + * of the corresponding element is the actual next hop and the most + * significant byte is zero. + * If the lookup for the given IP failed, then the corresponding element will + * contain the default value, see the description of the next parameter. + * @param defv + * Default value to populate into corresponding element of hop[] array, + * if lookup would fail. 
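Taken together, rte_lpm_create(), rte_lpm_add() and rte_lpm_lookup() are the whole IPv4 fast-path API. A minimal sketch of one /24 route and a matching lookup, assuming host-byte-order addresses and caller-chosen next-hop indices (illustrative only; error handling elided):

	struct rte_lpm_config cfg = { .max_rules = 65536, .number_tbl8s = 256 };
	struct rte_lpm *lpm = rte_lpm_create("example", -1, &cfg);	/* name is illustrative */
	uint32_t nh;

	rte_lpm_add(lpm, 0xc0000200, 24, 7);		/* 192.0.2.0/24 -> next-hop index 7 */
	if (rte_lpm_lookup(lpm, 0xc0000237, &nh) == 0)	/* 192.0.2.55 */
		;					/* hit: nh == 7 */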
+ */ +#if 0 +static inline void +rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4], + uint32_t defv); + +#if defined(RTE_ARCH_ARM) || defined(RTE_ARCH_ARM64) +#include "rte_lpm_neon.h" +#elif defined(RTE_ARCH_PPC_64) +#include "rte_lpm_altivec.h" +#else +#include "rte_lpm_sse.h" +#endif +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_LPM_H_ */ Index: sys/contrib/dpdk_rte_lpm/rte_lpm.c =================================================================== --- /dev/null +++ sys/contrib/dpdk_rte_lpm/rte_lpm.c @@ -0,0 +1,1107 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int errno = 0, rte_errno = 0; + +#if 0 +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include /* for definition of RTE_CACHE_LINE_SIZE */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#endif + +#include "rte_shim.h" +#include "rte_lpm.h" + +#if 0 +TAILQ_HEAD(rte_lpm_list, rte_tailq_entry); + +static struct rte_tailq_elem rte_lpm_tailq = { + .name = "RTE_LPM", +}; +EAL_REGISTER_TAILQ(rte_lpm_tailq) +#endif + +#define MAX_DEPTH_TBL24 24 + +enum valid_flag { + INVALID = 0, + VALID +}; + +/* Macro to enable/disable run-time checks. */ +#if defined(RTE_LIBRTE_LPM_DEBUG) +#include +#define VERIFY_DEPTH(depth) do { \ + if ((depth == 0) || (depth > RTE_LPM_MAX_DEPTH)) \ + rte_panic("LPM: Invalid depth (%u) at line %d", \ + (unsigned)(depth), __LINE__); \ +} while (0) +#else +#define VERIFY_DEPTH(depth) +#endif + +/* + * Converts a given depth value to its corresponding mask value. + * + * depth (IN) : range = 1 - 32 + * mask (OUT) : 32bit mask + */ +static uint32_t __attribute__((pure)) +depth_to_mask(uint8_t depth) +{ + VERIFY_DEPTH(depth); + + /* To calculate a mask start with a 1 on the left hand side and right + * shift while populating the left hand side with 1's + */ + return (int)0x80000000 >> (depth - 1); +} + +/* + * Converts given depth value to its corresponding range value. + */ +static uint32_t __attribute__((pure)) +depth_to_range(uint8_t depth) +{ + VERIFY_DEPTH(depth); + + /* + * Calculate tbl24 range. (Note: 2^depth = 1 << depth) + */ + if (depth <= MAX_DEPTH_TBL24) + return 1 << (MAX_DEPTH_TBL24 - depth); + + /* Else if depth is greater than 24 */ + return 1 << (RTE_LPM_MAX_DEPTH - depth); +} + +#if 0 +/* + * Find an existing lpm table and return a pointer to it. + */ +struct rte_lpm * +rte_lpm_find_existing(const char *name) +{ + struct rte_lpm *l = NULL; + struct rte_tailq_entry *te; + struct rte_lpm_list *lpm_list; + + lpm_list = RTE_TAILQ_CAST(rte_lpm_tailq.head, rte_lpm_list); + + rte_mcfg_tailq_read_lock(); + TAILQ_FOREACH(te, lpm_list, next) { + l = te->data; + if (strncmp(name, l->name, RTE_LPM_NAMESIZE) == 0) + break; + } + rte_mcfg_tailq_read_unlock(); + + if (te == NULL) { + rte_errno = ENOENT; + return NULL; + } + + return l; +} +#endif + +/* + * Allocates memory for LPM object + */ +struct rte_lpm * +rte_lpm_create(const char *name, int socket_id, + const struct rte_lpm_config *config) +{ + char mem_name[RTE_LPM_NAMESIZE]; + struct rte_lpm *lpm = NULL; + //struct rte_tailq_entry *te; + uint32_t mem_size, rules_size, tbl8s_size; + //struct rte_lpm_list *lpm_list; + + //lpm_list = RTE_TAILQ_CAST(rte_lpm_tailq.head, rte_lpm_list); + + RTE_BUILD_BUG_ON(sizeof(struct rte_lpm_tbl_entry) != 4); + + /* Check user arguments. 
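A few concrete values make the 24/8 split behind these helpers visible; the numbers below follow directly from the definitions above:

	depth_to_mask(8)   == 0xff000000
	depth_to_mask(24)  == 0xffffff00
	depth_to_range(16) == 1 << 8	/* a /16 covers 256 tbl24 entries */
	depth_to_range(28) == 1 << 4	/* a /28 covers 16 entries inside one tbl8 group */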
*/ + if ((name == NULL) || (socket_id < -1) || (config->max_rules == 0) + || config->number_tbl8s > RTE_LPM_MAX_TBL8_NUM_GROUPS) { + rte_errno = EINVAL; + return NULL; + } + + snprintf(mem_name, sizeof(mem_name), "LPM_%s", name); + + /* Determine the amount of memory to allocate. */ + mem_size = sizeof(*lpm); + rules_size = sizeof(struct rte_lpm_rule) * config->max_rules; + tbl8s_size = (sizeof(struct rte_lpm_tbl_entry) * + RTE_LPM_TBL8_GROUP_NUM_ENTRIES * config->number_tbl8s); + +#if 0 + rte_mcfg_tailq_write_lock(); + + /* guarantee there's no existing */ + TAILQ_FOREACH(te, lpm_list, next) { + lpm = te->data; + if (strncmp(name, lpm->name, RTE_LPM_NAMESIZE) == 0) + break; + } + + if (te != NULL) { + lpm = NULL; + rte_errno = EEXIST; + goto exit; + } + + /* allocate tailq entry */ + te = rte_zmalloc("LPM_TAILQ_ENTRY", sizeof(*te), 0); + if (te == NULL) { + RTE_LOG(ERR, LPM, "Failed to allocate tailq entry\n"); + rte_errno = ENOMEM; + goto exit; + } +#endif + + /* Allocate memory to store the LPM data structures. */ + lpm = rte_zmalloc_socket(mem_name, mem_size, + RTE_CACHE_LINE_SIZE, socket_id); + if (lpm == NULL) { + RTE_LOG(ERR, LPM, "LPM memory allocation failed\n"); + //rte_free(te); + rte_errno = ENOMEM; + goto exit; + } + + lpm->rules_tbl = rte_zmalloc_socket(NULL, + (size_t)rules_size, RTE_CACHE_LINE_SIZE, socket_id); + + if (lpm->rules_tbl == NULL) { + RTE_LOG(ERR, LPM, "LPM rules_tbl memory allocation failed\n"); + rte_free(lpm); + lpm = NULL; + //rte_free(te); + rte_errno = ENOMEM; + goto exit; + } + + lpm->tbl8 = rte_zmalloc_socket(NULL, + (size_t)tbl8s_size, RTE_CACHE_LINE_SIZE, socket_id); + + if (lpm->tbl8 == NULL) { + RTE_LOG(ERR, LPM, "LPM tbl8 memory allocation failed\n"); + rte_free(lpm->rules_tbl); + rte_free(lpm); + lpm = NULL; + //rte_free(te); + rte_errno = ENOMEM; + goto exit; + } + + /* Save user arguments. */ + lpm->max_rules = config->max_rules; + lpm->number_tbl8s = config->number_tbl8s; + strlcpy(lpm->name, name, sizeof(lpm->name)); + + //te->data = lpm; + + //TAILQ_INSERT_TAIL(lpm_list, te, next); + +exit: + rte_mcfg_tailq_write_unlock(); + + return lpm; +} + +/* + * Deallocates memory for given LPM table. + */ +void +rte_lpm_free(struct rte_lpm *lpm) +{ +#if 0 + struct rte_lpm_list *lpm_list; + struct rte_tailq_entry *te; + + /* Check user arguments. */ + if (lpm == NULL) + return; + + lpm_list = RTE_TAILQ_CAST(rte_lpm_tailq.head, rte_lpm_list); + + rte_mcfg_tailq_write_lock(); + + /* find our tailq entry */ + TAILQ_FOREACH(te, lpm_list, next) { + if (te->data == (void *) lpm) + break; + } + if (te != NULL) + TAILQ_REMOVE(lpm_list, te, next); + + rte_mcfg_tailq_write_unlock(); +#endif + + rte_free(lpm->tbl8); + rte_free(lpm->rules_tbl); + rte_free(lpm); + //rte_free(te); +} + +#if 0 +/* + * Adds a rule to the rule table. + * + * NOTE: The rule table is split into 32 groups. Each group contains rules that + * apply to a specific prefix depth (i.e. group 1 contains rules that apply to + * prefixes with a depth of 1 etc.). In the following code (depth - 1) is used + * to refer to depth 1 because even though the depth range is 1 - 32, depths + * are stored in the rule table from 0 - 31. + * NOTE: Valid range for depth parameter is 1 .. 32 inclusive. + */ +static int32_t +rule_add(struct rte_lpm *lpm, uint32_t ip_masked, uint8_t depth, + uint32_t next_hop) +{ + uint32_t rule_gindex, rule_index, last_rule; + int i; + + VERIFY_DEPTH(depth); + + /* Scan through rule group to see if rule already exists. 
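The allocation sizes computed above are dominated by the embedded tbl24: 2^24 entries of 4 bytes each come to 64 MB per LPM instance before any rules or tbl8 groups are counted. A rough sizing sketch, assuming max_rules = 65536 and number_tbl8s = 256 (assumed configuration values, not from this change):

	mem_size   ~= 64 MB			/* sizeof(*lpm): tbl24 is embedded in the struct */
	rules_size  = 8 * 65536   = 512 KB	/* struct rte_lpm_rule is 8 bytes */
	tbl8s_size  = 4 * 256 * 256 = 256 KB	/* 4-byte entries, 256-entry groups */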
*/ + if (lpm->rule_info[depth - 1].used_rules > 0) { + + /* rule_gindex stands for rule group index. */ + rule_gindex = lpm->rule_info[depth - 1].first_rule; + /* Initialise rule_index to point to start of rule group. */ + rule_index = rule_gindex; + /* Last rule = Last used rule in this rule group. */ + last_rule = rule_gindex + lpm->rule_info[depth - 1].used_rules; + + for (; rule_index < last_rule; rule_index++) { + + /* If rule already exists update next hop and return. */ + if (lpm->rules_tbl[rule_index].ip == ip_masked) { + + if (lpm->rules_tbl[rule_index].next_hop + == next_hop) + return -EEXIST; + lpm->rules_tbl[rule_index].next_hop = next_hop; + + return rule_index; + } + } + + if (rule_index == lpm->max_rules) + return -ENOSPC; + } else { + /* Calculate the position in which the rule will be stored. */ + rule_index = 0; + + for (i = depth - 1; i > 0; i--) { + if (lpm->rule_info[i - 1].used_rules > 0) { + rule_index = lpm->rule_info[i - 1].first_rule + + lpm->rule_info[i - 1].used_rules; + break; + } + } + if (rule_index == lpm->max_rules) + return -ENOSPC; + + lpm->rule_info[depth - 1].first_rule = rule_index; + } + + /* Make room for the new rule in the array. */ + for (i = RTE_LPM_MAX_DEPTH; i > depth; i--) { + if (lpm->rule_info[i - 1].first_rule + + lpm->rule_info[i - 1].used_rules == lpm->max_rules) + return -ENOSPC; + + if (lpm->rule_info[i - 1].used_rules > 0) { + lpm->rules_tbl[lpm->rule_info[i - 1].first_rule + + lpm->rule_info[i - 1].used_rules] + = lpm->rules_tbl[lpm->rule_info[i - 1].first_rule]; + lpm->rule_info[i - 1].first_rule++; + } + } + + /* Add the new rule. */ + lpm->rules_tbl[rule_index].ip = ip_masked; + lpm->rules_tbl[rule_index].next_hop = next_hop; + + /* Increment the used rules counter for this rule group. */ + lpm->rule_info[depth - 1].used_rules++; + + return rule_index; +} + +/* + * Delete a rule from the rule table. + * NOTE: Valid range for depth parameter is 1 .. 32 inclusive. + */ +static void +rule_delete(struct rte_lpm *lpm, int32_t rule_index, uint8_t depth) +{ + int i; + + VERIFY_DEPTH(depth); + + lpm->rules_tbl[rule_index] = + lpm->rules_tbl[lpm->rule_info[depth - 1].first_rule + + lpm->rule_info[depth - 1].used_rules - 1]; + + for (i = depth; i < RTE_LPM_MAX_DEPTH; i++) { + if (lpm->rule_info[i].used_rules > 0) { + lpm->rules_tbl[lpm->rule_info[i].first_rule - 1] = + lpm->rules_tbl[lpm->rule_info[i].first_rule + + lpm->rule_info[i].used_rules - 1]; + lpm->rule_info[i].first_rule--; + } + } + + lpm->rule_info[depth - 1].used_rules--; +} + +/* + * Finds a rule in rule table. + * NOTE: Valid range for depth parameter is 1 .. 32 inclusive. + */ +static int32_t +rule_find(struct rte_lpm *lpm, uint32_t ip_masked, uint8_t depth) +{ + uint32_t rule_gindex, last_rule, rule_index; + + VERIFY_DEPTH(depth); + + rule_gindex = lpm->rule_info[depth - 1].first_rule; + last_rule = rule_gindex + lpm->rule_info[depth - 1].used_rules; + + /* Scan used rules at given depth to find rule. */ + for (rule_index = rule_gindex; rule_index < last_rule; rule_index++) { + /* If rule is found return the rule index. */ + if (lpm->rules_tbl[rule_index].ip == ip_masked) + return rule_index; + } + + /* If rule is not found return -EINVAL. */ + return -EINVAL; +} +#endif + +/* + * Find, clean and allocate a tbl8. + */ +static int32_t +tbl8_alloc(struct rte_lpm_tbl_entry *tbl8, uint32_t number_tbl8s) +{ + uint32_t group_idx; /* tbl8 group index. */ + struct rte_lpm_tbl_entry *tbl8_entry; + + /* Scan through tbl8 to find a free (i.e. INVALID) tbl8 group. 
*/ + for (group_idx = 0; group_idx < number_tbl8s; group_idx++) { + tbl8_entry = &tbl8[group_idx * RTE_LPM_TBL8_GROUP_NUM_ENTRIES]; + /* If a free tbl8 group is found clean it and set as VALID. */ + if (!tbl8_entry->valid_group) { + struct rte_lpm_tbl_entry new_tbl8_entry = { + .next_hop = 0, + .valid = INVALID, + .depth = 0, + .valid_group = VALID, + }; + + memset(&tbl8_entry[0], 0, + RTE_LPM_TBL8_GROUP_NUM_ENTRIES * + sizeof(tbl8_entry[0])); + + __atomic_store(tbl8_entry, &new_tbl8_entry, + __ATOMIC_RELAXED); + + /* Return group index for allocated tbl8 group. */ + return group_idx; + } + } + + /* If there are no tbl8 groups free then return error. */ + return -ENOSPC; +} + +static void +tbl8_free(struct rte_lpm_tbl_entry *tbl8, uint32_t tbl8_group_start) +{ + /* Set tbl8 group invalid*/ + struct rte_lpm_tbl_entry zero_tbl8_entry = {0}; + + __atomic_store(&tbl8[tbl8_group_start], &zero_tbl8_entry, + __ATOMIC_RELAXED); +} + +static __rte_noinline int32_t +add_depth_small(struct rte_lpm *lpm, uint32_t ip, uint8_t depth, + uint32_t next_hop) +{ +#define group_idx next_hop + uint32_t tbl24_index, tbl24_range, tbl8_index, tbl8_group_end, i, j; + + /* Calculate the index into Table24. */ + tbl24_index = ip >> 8; + tbl24_range = depth_to_range(depth); + + for (i = tbl24_index; i < (tbl24_index + tbl24_range); i++) { + /* + * For invalid OR valid and non-extended tbl 24 entries set + * entry. + */ + if (!lpm->tbl24[i].valid || (lpm->tbl24[i].valid_group == 0 && + lpm->tbl24[i].depth <= depth)) { + + struct rte_lpm_tbl_entry new_tbl24_entry = { + .next_hop = next_hop, + .valid = VALID, + .valid_group = 0, + .depth = depth, + }; + + /* Setting tbl24 entry in one go to avoid race + * conditions + */ + __atomic_store(&lpm->tbl24[i], &new_tbl24_entry, + __ATOMIC_RELEASE); + + continue; + } + + if (lpm->tbl24[i].valid_group == 1) { + /* If tbl24 entry is valid and extended calculate the + * index into tbl8. + */ + tbl8_index = lpm->tbl24[i].group_idx * + RTE_LPM_TBL8_GROUP_NUM_ENTRIES; + tbl8_group_end = tbl8_index + + RTE_LPM_TBL8_GROUP_NUM_ENTRIES; + + for (j = tbl8_index; j < tbl8_group_end; j++) { + if (!lpm->tbl8[j].valid || + lpm->tbl8[j].depth <= depth) { + struct rte_lpm_tbl_entry + new_tbl8_entry = { + .valid = VALID, + .valid_group = VALID, + .depth = depth, + .next_hop = next_hop, + }; + + /* + * Setting tbl8 entry in one go to avoid + * race conditions + */ + __atomic_store(&lpm->tbl8[j], + &new_tbl8_entry, + __ATOMIC_RELAXED); + + continue; + } + } + } + } +#undef group_idx + return 0; +} + +static __rte_noinline int32_t +add_depth_big(struct rte_lpm *lpm, uint32_t ip_masked, uint8_t depth, + uint32_t next_hop) +{ +#define group_idx next_hop + uint32_t tbl24_index; + int32_t tbl8_group_index, tbl8_group_start, tbl8_group_end, tbl8_index, + tbl8_range, i; + + tbl24_index = (ip_masked >> 8); + tbl8_range = depth_to_range(depth); + + if (!lpm->tbl24[tbl24_index].valid) { + /* Search for a free tbl8 group. */ + tbl8_group_index = tbl8_alloc(lpm->tbl8, lpm->number_tbl8s); + + /* Check tbl8 allocation was successful. */ + if (tbl8_group_index < 0) { + return tbl8_group_index; + } + + /* Find index into tbl8 and range. */ + tbl8_index = (tbl8_group_index * + RTE_LPM_TBL8_GROUP_NUM_ENTRIES) + + (ip_masked & 0xFF); + + /* Set tbl8 entry. 
*/ + for (i = tbl8_index; i < (tbl8_index + tbl8_range); i++) { + struct rte_lpm_tbl_entry new_tbl8_entry = { + .valid = VALID, + .depth = depth, + .valid_group = lpm->tbl8[i].valid_group, + .next_hop = next_hop, + }; + __atomic_store(&lpm->tbl8[i], &new_tbl8_entry, + __ATOMIC_RELAXED); + } + + /* + * Update tbl24 entry to point to new tbl8 entry. Note: The + * ext_flag and tbl8_index need to be updated simultaneously, + * so assign whole structure in one go + */ + + struct rte_lpm_tbl_entry new_tbl24_entry = { + .group_idx = tbl8_group_index, + .valid = VALID, + .valid_group = 1, + .depth = 0, + }; + + /* The tbl24 entry must be written only after the + * tbl8 entries are written. + */ + __atomic_store(&lpm->tbl24[tbl24_index], &new_tbl24_entry, + __ATOMIC_RELEASE); + + } /* If valid entry but not extended calculate the index into Table8. */ + else if (lpm->tbl24[tbl24_index].valid_group == 0) { + /* Search for free tbl8 group. */ + tbl8_group_index = tbl8_alloc(lpm->tbl8, lpm->number_tbl8s); + + if (tbl8_group_index < 0) { + return tbl8_group_index; + } + + tbl8_group_start = tbl8_group_index * + RTE_LPM_TBL8_GROUP_NUM_ENTRIES; + tbl8_group_end = tbl8_group_start + + RTE_LPM_TBL8_GROUP_NUM_ENTRIES; + + /* Populate new tbl8 with tbl24 value. */ + for (i = tbl8_group_start; i < tbl8_group_end; i++) { + struct rte_lpm_tbl_entry new_tbl8_entry = { + .valid = VALID, + .depth = lpm->tbl24[tbl24_index].depth, + .valid_group = lpm->tbl8[i].valid_group, + .next_hop = lpm->tbl24[tbl24_index].next_hop, + }; + __atomic_store(&lpm->tbl8[i], &new_tbl8_entry, + __ATOMIC_RELAXED); + } + + tbl8_index = tbl8_group_start + (ip_masked & 0xFF); + + /* Insert new rule into the tbl8 entry. */ + for (i = tbl8_index; i < tbl8_index + tbl8_range; i++) { + struct rte_lpm_tbl_entry new_tbl8_entry = { + .valid = VALID, + .depth = depth, + .valid_group = lpm->tbl8[i].valid_group, + .next_hop = next_hop, + }; + __atomic_store(&lpm->tbl8[i], &new_tbl8_entry, + __ATOMIC_RELAXED); + } + + /* + * Update tbl24 entry to point to new tbl8 entry. Note: The + * ext_flag and tbl8_index need to be updated simultaneously, + * so assign whole structure in one go. + */ + + struct rte_lpm_tbl_entry new_tbl24_entry = { + .group_idx = tbl8_group_index, + .valid = VALID, + .valid_group = 1, + .depth = 0, + }; + + /* The tbl24 entry must be written only after the + * tbl8 entries are written. + */ + __atomic_store(&lpm->tbl24[tbl24_index], &new_tbl24_entry, + __ATOMIC_RELEASE); + + } else { /* + * If it is valid, extended entry calculate the index into tbl8. + */ + tbl8_group_index = lpm->tbl24[tbl24_index].group_idx; + tbl8_group_start = tbl8_group_index * + RTE_LPM_TBL8_GROUP_NUM_ENTRIES; + tbl8_index = tbl8_group_start + (ip_masked & 0xFF); + + for (i = tbl8_index; i < (tbl8_index + tbl8_range); i++) { + + if (!lpm->tbl8[i].valid || + lpm->tbl8[i].depth <= depth) { + struct rte_lpm_tbl_entry new_tbl8_entry = { + .valid = VALID, + .depth = depth, + .next_hop = next_hop, + .valid_group = lpm->tbl8[i].valid_group, + }; + + /* + * Setting tbl8 entry in one go to avoid race + * condition + */ + __atomic_store(&lpm->tbl8[i], &new_tbl8_entry, + __ATOMIC_RELAXED); + + continue; + } + } + } +#undef group_idx + return 0; +} + +/* + * Add a route + */ +int +rte_lpm_add(struct rte_lpm *lpm, uint32_t ip, uint8_t depth, + uint32_t next_hop) +{ + int32_t status = 0; + uint32_t ip_masked; + + /* Check user arguments. 
*/ + if ((lpm == NULL) || (depth < 1) || (depth > RTE_LPM_MAX_DEPTH)) + return -EINVAL; + + ip_masked = ip & depth_to_mask(depth); + +#if 0 + /* Add the rule to the rule table. */ + rule_index = rule_add(lpm, ip_masked, depth, next_hop); + + /* Skip table entries update if The rule is the same as + * the rule in the rules table. + */ + if (rule_index == -EEXIST) + return 0; + + /* If the is no space available for new rule return error. */ + if (rule_index < 0) { + return rule_index; + } +#endif + + if (depth <= MAX_DEPTH_TBL24) { + status = add_depth_small(lpm, ip_masked, depth, next_hop); + } else { /* If depth > RTE_LPM_MAX_DEPTH_TBL24 */ + status = add_depth_big(lpm, ip_masked, depth, next_hop); + + /* + * If add fails due to exhaustion of tbl8 extensions delete + * rule that was added to rule table. + */ + if (status < 0) { + //rule_delete(lpm, rule_index, depth); + + return status; + } + } + + return 0; +} + +#if 0 +/* + * Look for a rule in the high-level rules table + */ +int +rte_lpm_is_rule_present(struct rte_lpm *lpm, uint32_t ip, uint8_t depth, +uint32_t *next_hop) +{ + uint32_t ip_masked; + int32_t rule_index; + + /* Check user arguments. */ + if ((lpm == NULL) || + (next_hop == NULL) || + (depth < 1) || (depth > RTE_LPM_MAX_DEPTH)) + return -EINVAL; + + /* Look for the rule using rule_find. */ + ip_masked = ip & depth_to_mask(depth); + rule_index = rule_find(lpm, ip_masked, depth); + + if (rule_index >= 0) { + *next_hop = lpm->rules_tbl[rule_index].next_hop; + return 1; + } + + /* If rule is not found return 0. */ + return 0; +} + +static int32_t +find_previous_rule(struct rte_lpm *lpm, uint32_t ip, uint8_t depth, + uint8_t *sub_rule_depth) +{ + int32_t rule_index; + uint32_t ip_masked; + uint8_t prev_depth; + + for (prev_depth = (uint8_t)(depth - 1); prev_depth > 0; prev_depth--) { + ip_masked = ip & depth_to_mask(prev_depth); + + rule_index = rule_find(lpm, ip_masked, prev_depth); + + if (rule_index >= 0) { + *sub_rule_depth = prev_depth; + return rule_index; + } + } + + return -1; +} +#endif + +static int32_t +delete_depth_small(struct rte_lpm *lpm, uint32_t ip_masked, + uint8_t depth, uint32_t sub_rule_nhop, uint8_t sub_rule_depth) +{ +#define group_idx next_hop + uint32_t tbl24_range, tbl24_index, tbl8_group_index, tbl8_index, i, j; + + /* Calculate the range and index into Table24. */ + tbl24_range = depth_to_range(depth); + tbl24_index = (ip_masked >> 8); + struct rte_lpm_tbl_entry zero_tbl24_entry = {0}; + + /* + * Firstly check the sub_rule_index. A -1 indicates no replacement rule + * and a positive number indicates a sub_rule_index. + */ + if (sub_rule_nhop == 0) { + /* + * If no replacement rule exists then invalidate entries + * associated with this rule. + */ + for (i = tbl24_index; i < (tbl24_index + tbl24_range); i++) { + + if (lpm->tbl24[i].valid_group == 0 && + lpm->tbl24[i].depth <= depth) { + __atomic_store(&lpm->tbl24[i], + &zero_tbl24_entry, __ATOMIC_RELEASE); + } else if (lpm->tbl24[i].valid_group == 1) { + /* + * If TBL24 entry is extended, then there has + * to be a rule with depth >= 25 in the + * associated TBL8 group. + */ + + tbl8_group_index = lpm->tbl24[i].group_idx; + tbl8_index = tbl8_group_index * + RTE_LPM_TBL8_GROUP_NUM_ENTRIES; + + for (j = tbl8_index; j < (tbl8_index + + RTE_LPM_TBL8_GROUP_NUM_ENTRIES); j++) { + + if (lpm->tbl8[j].depth <= depth) + lpm->tbl8[j].valid = INVALID; + } + } + } + } else { + /* + * If a replacement rule exists then modify entries + * associated with this rule. 
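The dispatch on MAX_DEPTH_TBL24 above means the cost of an insert depends on the prefix length: short prefixes fan out across tbl24, long ones consume a tbl8 group. A sketch of two inserts that exercise both paths, assuming an already-created table and caller-chosen next-hop indices:

	rte_lpm_add(lpm, 0x0a000000, 16, 1);	/* 10.0.0.0/16: add_depth_small(), 256 tbl24 entries */
	rte_lpm_add(lpm, 0x0a0000f0, 28, 2);	/* 10.0.0.240/28: add_depth_big(), 16 slots of one tbl8 group */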
+ */ + + struct rte_lpm_tbl_entry new_tbl24_entry = { + .next_hop = sub_rule_nhop, + .valid = VALID, + .valid_group = 0, + .depth = sub_rule_depth, + }; + + struct rte_lpm_tbl_entry new_tbl8_entry = { + .valid = VALID, + .valid_group = VALID, + .depth = sub_rule_depth, + .next_hop = sub_rule_nhop, + }; + + for (i = tbl24_index; i < (tbl24_index + tbl24_range); i++) { + + if (lpm->tbl24[i].valid_group == 0 && + lpm->tbl24[i].depth <= depth) { + __atomic_store(&lpm->tbl24[i], &new_tbl24_entry, + __ATOMIC_RELEASE); + } else if (lpm->tbl24[i].valid_group == 1) { + /* + * If TBL24 entry is extended, then there has + * to be a rule with depth >= 25 in the + * associated TBL8 group. + */ + + tbl8_group_index = lpm->tbl24[i].group_idx; + tbl8_index = tbl8_group_index * + RTE_LPM_TBL8_GROUP_NUM_ENTRIES; + + for (j = tbl8_index; j < (tbl8_index + + RTE_LPM_TBL8_GROUP_NUM_ENTRIES); j++) { + + if (lpm->tbl8[j].depth <= depth) + __atomic_store(&lpm->tbl8[j], + &new_tbl8_entry, + __ATOMIC_RELAXED); + } + } + } + } +#undef group_idx + return 0; +} + +/* + * Checks if table 8 group can be recycled. + * + * Return of -EEXIST means tbl8 is in use and thus can not be recycled. + * Return of -EINVAL means tbl8 is empty and thus can be recycled + * Return of value > -1 means tbl8 is in use but has all the same values and + * thus can be recycled + */ +static int32_t +tbl8_recycle_check(struct rte_lpm_tbl_entry *tbl8, + uint32_t tbl8_group_start) +{ + uint32_t tbl8_group_end, i; + tbl8_group_end = tbl8_group_start + RTE_LPM_TBL8_GROUP_NUM_ENTRIES; + + /* + * Check the first entry of the given tbl8. If it is invalid we know + * this tbl8 does not contain any rule with a depth < RTE_LPM_MAX_DEPTH + * (As they would affect all entries in a tbl8) and thus this table + * can not be recycled. + */ + if (tbl8[tbl8_group_start].valid) { + /* + * If first entry is valid check if the depth is less than 24 + * and if so check the rest of the entries to verify that they + * are all of this depth. + */ + if (tbl8[tbl8_group_start].depth <= MAX_DEPTH_TBL24) { + for (i = (tbl8_group_start + 1); i < tbl8_group_end; + i++) { + + if (tbl8[i].depth != + tbl8[tbl8_group_start].depth) { + + return -EEXIST; + } + } + /* If all entries are the same return the tb8 index */ + return tbl8_group_start; + } + + return -EEXIST; + } + /* + * If the first entry is invalid check if the rest of the entries in + * the tbl8 are invalid. + */ + for (i = (tbl8_group_start + 1); i < tbl8_group_end; i++) { + if (tbl8[i].valid) + return -EEXIST; + } + /* If no valid entries are found then return -EINVAL. */ + return -EINVAL; +} + +static int32_t +delete_depth_big(struct rte_lpm *lpm, uint32_t ip_masked, + uint8_t depth, uint32_t sub_rule_nhop, uint8_t sub_rule_depth) +{ +#define group_idx next_hop + uint32_t tbl24_index, tbl8_group_index, tbl8_group_start, tbl8_index, + tbl8_range, i; + int32_t tbl8_recycle_index; + + /* + * Calculate the index into tbl24 and range. Note: All depths larger + * than MAX_DEPTH_TBL24 are associated with only one tbl24 entry. + */ + tbl24_index = ip_masked >> 8; + + /* Calculate the index into tbl8 and range. */ + tbl8_group_index = lpm->tbl24[tbl24_index].group_idx; + tbl8_group_start = tbl8_group_index * RTE_LPM_TBL8_GROUP_NUM_ENTRIES; + tbl8_index = tbl8_group_start + (ip_masked & 0xFF); + tbl8_range = depth_to_range(depth); + + if (sub_rule_nhop == 0) { + /* + * Loop through the range of entries on tbl8 for which the + * rule_to_delete must be removed or modified. 
+ */ + for (i = tbl8_index; i < (tbl8_index + tbl8_range); i++) { + if (lpm->tbl8[i].depth <= depth) + lpm->tbl8[i].valid = INVALID; + } + } else { + /* Set new tbl8 entry. */ + struct rte_lpm_tbl_entry new_tbl8_entry = { + .valid = VALID, + .depth = sub_rule_depth, + .valid_group = lpm->tbl8[tbl8_group_start].valid_group, + .next_hop = sub_rule_nhop, + }; + + /* + * Loop through the range of entries on tbl8 for which the + * rule_to_delete must be modified. + */ + for (i = tbl8_index; i < (tbl8_index + tbl8_range); i++) { + if (lpm->tbl8[i].depth <= depth) + __atomic_store(&lpm->tbl8[i], &new_tbl8_entry, + __ATOMIC_RELAXED); + } + } + + /* + * Check if there are any valid entries in this tbl8 group. If all + * tbl8 entries are invalid we can free the tbl8 and invalidate the + * associated tbl24 entry. + */ + + tbl8_recycle_index = tbl8_recycle_check(lpm->tbl8, tbl8_group_start); + + if (tbl8_recycle_index == -EINVAL) { + /* Set tbl24 before freeing tbl8 to avoid race condition. + * Prevent the free of the tbl8 group from hoisting. + */ + lpm->tbl24[tbl24_index].valid = 0; + __atomic_thread_fence(__ATOMIC_RELEASE); + tbl8_free(lpm->tbl8, tbl8_group_start); + } else if (tbl8_recycle_index > -1) { + /* Update tbl24 entry. */ + struct rte_lpm_tbl_entry new_tbl24_entry = { + .next_hop = lpm->tbl8[tbl8_recycle_index].next_hop, + .valid = VALID, + .valid_group = 0, + .depth = lpm->tbl8[tbl8_recycle_index].depth, + }; + + /* Set tbl24 before freeing tbl8 to avoid race condition. + * Prevent the free of the tbl8 group from hoisting. + */ + __atomic_store(&lpm->tbl24[tbl24_index], &new_tbl24_entry, + __ATOMIC_RELAXED); + __atomic_thread_fence(__ATOMIC_RELEASE); + tbl8_free(lpm->tbl8, tbl8_group_start); + } +#undef group_idx + return 0; +} + +/* + * Deletes a rule + */ +int +rte_lpm_delete(struct rte_lpm *lpm, uint32_t ip, uint8_t depth, + uint8_t sub_rule_depth, uint32_t sub_rule_nhop) +{ + //int32_t rule_to_delete_index; + uint32_t ip_masked; + //uint8_t sub_rule_depth; + /* + * Check input arguments. Note: IP must be a positive integer of 32 + * bits in length therefore it need not be checked. + */ + if ((lpm == NULL) || (depth < 1) || (depth > RTE_LPM_MAX_DEPTH)) { + return -EINVAL; + } + + ip_masked = ip & depth_to_mask(depth); + +#if 0 + /* + * Find the index of the input rule, that needs to be deleted, in the + * rule table. + */ + rule_to_delete_index = rule_find(lpm, ip_masked, depth); + + /* + * Check if rule_to_delete_index was found. If no rule was found the + * function rule_find returns -EINVAL. + */ + if (rule_to_delete_index < 0) + return -EINVAL; + + /* Delete the rule from the rule table. */ + rule_delete(lpm, rule_to_delete_index, depth); +#endif + + /* + * Find rule to replace the rule_to_delete. If there is no rule to + * replace the rule_to_delete we return -1 and invalidate the table + * entries associated with this rule. + */ + //sub_rule_depth = *psub_rule_depth; + //sub_rule_index = find_previous_rule(lpm, ip, depth, &sub_rule_depth); + + /* + * If the input depth value is less than 25 use function + * delete_depth_small otherwise use delete_depth_big. + */ + if (depth <= MAX_DEPTH_TBL24) { + return delete_depth_small(lpm, ip_masked, depth, + sub_rule_nhop, sub_rule_depth); + } else { /* If depth > MAX_DEPTH_TBL24 */ + return delete_depth_big(lpm, ip_masked, depth, sub_rule_nhop, + sub_rule_depth); + } +} + +/* + * Delete all rules from the LPM table. + */ +void +rte_lpm_delete_all(struct rte_lpm *lpm) +{ + /* Zero rule information. 
*/ + memset(lpm->rule_info, 0, sizeof(lpm->rule_info)); + + /* Zero tbl24. */ + memset(lpm->tbl24, 0, sizeof(lpm->tbl24)); + + /* Zero tbl8. */ + memset(lpm->tbl8, 0, sizeof(lpm->tbl8[0]) + * RTE_LPM_TBL8_GROUP_NUM_ENTRIES * lpm->number_tbl8s); + + /* Delete all rules form the rules table. */ + memset(lpm->rules_tbl, 0, sizeof(lpm->rules_tbl[0]) * lpm->max_rules); +} Index: sys/contrib/dpdk_rte_lpm/rte_lpm6.h =================================================================== --- /dev/null +++ sys/contrib/dpdk_rte_lpm/rte_lpm6.h @@ -0,0 +1,209 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ +#ifndef _RTE_LPM6_H_ +#define _RTE_LPM6_H_ + +/** + * @file + * RTE Longest Prefix Match for IPv6 (LPM6) + */ + +#ifdef __cplusplus +extern "C" { +#endif + + +#define RTE_LPM6_MAX_DEPTH 128 +#define RTE_LPM6_IPV6_ADDR_SIZE 16 +/** Max number of characters in LPM name. */ +#define RTE_LPM6_NAMESIZE 32 + +/** LPM structure. */ +struct rte_lpm6; + +struct nhop_object; +struct rte_lpm6_external { + struct nhop_object **nh_idx; /**< # -> idx mappings */ + uint32_t default_idx; /* nhop index of default route */ + uint32_t fibnum; /* fib index */ +}; + +/** LPM configuration structure. */ +struct rte_lpm6_config { + uint32_t max_rules; /**< Max number of rules. */ + uint32_t number_tbl8s; /**< Number of tbl8s to allocate. */ + int flags; /**< This field is currently unused. */ +}; + +#define RTE_LPM6_RULE_SIZE 32 +struct rte_lpm6_rule *fill_rule6(char *buffer, const uint8_t *ip, + uint8_t depth, uint32_t next_hop); +/** + * Create an LPM object. + * + * @param name + * LPM object name + * @param socket_id + * NUMA socket ID for LPM table memory allocation + * @param config + * Structure containing the configuration + * @return + * Handle to LPM object on success, NULL otherwise with rte_errno set + * to an appropriate values. Possible rte_errno values include: + * - E_RTE_NO_CONFIG - function could not get pointer to rte_config structure + * - E_RTE_SECONDARY - function was called from a secondary process instance + * - EINVAL - invalid parameter passed to function + * - ENOSPC - the maximum number of memzones has already been allocated + * - EEXIST - a memzone with the same name already exists + * - ENOMEM - no appropriate memory area found in which to create memzone + */ +struct rte_lpm6 * +rte_lpm6_create(const char *name, int socket_id, + const struct rte_lpm6_config *config); + +/** + * Find an existing LPM object and return a pointer to it. + * + * @param name + * Name of the lpm object as passed to rte_lpm6_create() + * @return + * Pointer to lpm object or NULL if object not found with rte_errno + * set appropriately. Possible rte_errno values include: + * - ENOENT - required entry not available to return. + */ +struct rte_lpm6 * +rte_lpm6_find_existing(const char *name); + +/** + * Free an LPM object. + * + * @param lpm + * LPM object handle + * @return + * None + */ +void +rte_lpm6_free(struct rte_lpm6 *lpm); + +/** + * Add a rule to the LPM table. + * + * @param lpm + * LPM object handle + * @param ip + * IP of the rule to be added to the LPM table + * @param depth + * Depth of the rule to be added to the LPM table + * @param next_hop + * Next hop of the rule to be added to the LPM table + * @return + * 0 on success, negative value otherwise + */ +int +rte_lpm6_add(struct rte_lpm6 *lpm, const uint8_t *ip, uint8_t depth, + uint32_t next_hop, int is_new_rule); + +/** + * Check if a rule is present in the LPM table, + * and provide its next hop if it is. 
+ * + * @param lpm + * LPM object handle + * @param ip + * IP of the rule to be searched + * @param depth + * Depth of the rule to searched + * @param next_hop + * Next hop of the rule (valid only if it is found) + * @return + * 1 if the rule exists, 0 if it does not, a negative value on failure + */ +int +rte_lpm6_is_rule_present(struct rte_lpm6 *lpm, const uint8_t *ip, uint8_t depth, + uint32_t *next_hop); + +/** + * Delete a rule from the LPM table. + * + * @param lpm + * LPM object handle + * @param ip + * IP of the rule to be deleted from the LPM table + * @param depth + * Depth of the rule to be deleted from the LPM table + * @return + * 0 on success, negative value otherwise + */ +int +rte_lpm6_delete(struct rte_lpm6 *lpm, const uint8_t *ip, uint8_t depth, + struct rte_lpm6_rule *lsp_rule); + +/** + * Delete a rule from the LPM table. + * + * @param lpm + * LPM object handle + * @param ips + * Array of IPs to be deleted from the LPM table + * @param depths + * Array of depths of the rules to be deleted from the LPM table + * @param n + * Number of rules to be deleted from the LPM table + * @return + * 0 on success, negative value otherwise. + */ +int +rte_lpm6_delete_bulk_func(struct rte_lpm6 *lpm, + uint8_t ips[][RTE_LPM6_IPV6_ADDR_SIZE], uint8_t *depths, unsigned n); + +/** + * Delete all rules from the LPM table. + * + * @param lpm + * LPM object handle + */ +void +rte_lpm6_delete_all(struct rte_lpm6 *lpm); + +/** + * Lookup an IP into the LPM table. + * + * @param lpm + * LPM object handle + * @param ip + * IP to be looked up in the LPM table + * @param next_hop + * Next hop of the most specific rule found for IP (valid on lookup hit only) + * @return + * -EINVAL for incorrect arguments, -ENOENT on lookup miss, 0 on lookup hit + */ +int +rte_lpm6_lookup(const struct rte_lpm6 *lpm, const uint8_t *ip, uint32_t *next_hop); + +/** + * Lookup multiple IP addresses in an LPM table. + * + * @param lpm + * LPM object handle + * @param ips + * Array of IPs to be looked up in the LPM table + * @param next_hops + * Next hop of the most specific rule found for IP (valid on lookup hit only). + * This is an array of two byte values. The next hop will be stored on + * each position on success; otherwise the position will be set to -1. + * @param n + * Number of elements in ips (and next_hops) array to lookup. 
+ * @return + * -EINVAL for incorrect arguments, otherwise 0 + */ +int +rte_lpm6_lookup_bulk_func(const struct rte_lpm6 *lpm, + uint8_t ips[][RTE_LPM6_IPV6_ADDR_SIZE], + int32_t *next_hops, unsigned int n); + +#ifdef __cplusplus +} +#endif + +#endif Index: sys/contrib/dpdk_rte_lpm/rte_lpm6.c =================================================================== --- /dev/null +++ sys/contrib/dpdk_rte_lpm/rte_lpm6.c @@ -0,0 +1,1415 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +//#include +int errno = 0, rte_errno = 0; + +#include "rte_shim.h" +#include "rte_lpm6.h" + +#define RTE_LPM6_TBL24_NUM_ENTRIES (1 << 24) +#define RTE_LPM6_TBL8_GROUP_NUM_ENTRIES 256 +#define RTE_LPM6_TBL8_MAX_NUM_GROUPS (1 << 21) + +#define RTE_LPM6_VALID_EXT_ENTRY_BITMASK 0xA0000000 +#define RTE_LPM6_LOOKUP_SUCCESS 0x20000000 +#define RTE_LPM6_TBL8_BITMASK 0x001FFFFF + +#define ADD_FIRST_BYTE 3 +#define LOOKUP_FIRST_BYTE 4 +#define BYTE_SIZE 8 +#define BYTES2_SIZE 16 + +#define RULE_HASH_TABLE_EXTRA_SPACE 64 +#define TBL24_IND UINT32_MAX + +#define lpm6_tbl8_gindex next_hop + +/** Flags for setting an entry as valid/invalid. */ +enum valid_flag { + INVALID = 0, + VALID +}; + +#if 0 +TAILQ_HEAD(rte_lpm6_list, rte_tailq_entry); + +static struct rte_tailq_elem rte_lpm6_tailq = { + .name = "RTE_LPM6", +}; +EAL_REGISTER_TAILQ(rte_lpm6_tailq) +#endif + +/** Tbl entry structure. It is the same for both tbl24 and tbl8 */ +struct rte_lpm6_tbl_entry { + uint32_t next_hop: 21; /**< Next hop / next table to be checked. */ + uint32_t depth :8; /**< Rule depth. */ + + /* Flags. */ + uint32_t valid :1; /**< Validation flag. */ + uint32_t valid_group :1; /**< Group validation flag. */ + uint32_t ext_entry :1; /**< External entry. */ +}; + +/** Rules tbl entry structure. */ +struct rte_lpm6_rule { + uint8_t ip[RTE_LPM6_IPV6_ADDR_SIZE]; /**< Rule IP address. */ + uint32_t next_hop; /**< Rule next hop. */ + uint8_t depth; /**< Rule depth. */ +}; + +/** Rules tbl entry key. */ +struct rte_lpm6_rule_key { + uint8_t ip[RTE_LPM6_IPV6_ADDR_SIZE]; /**< Rule IP address. */ + uint8_t depth; /**< Rule depth. */ +}; + +/* Header of tbl8 */ +struct rte_lpm_tbl8_hdr { + uint32_t owner_tbl_ind; /**< owner table: TBL24_IND if owner is tbl24, + * otherwise index of tbl8 + */ + uint32_t owner_entry_ind; /**< index of the owner table entry where + * pointer to the tbl8 is stored + */ + uint32_t ref_cnt; /**< table reference counter */ +}; + +/** LPM6 structure. */ +struct rte_lpm6 { + struct rte_lpm6_external ext; /* Storage used by the algo wrapper */ + /* LPM metadata. */ + char name[RTE_LPM6_NAMESIZE]; /**< Name of the lpm. */ + uint32_t max_rules; /**< Max number of rules. */ + uint32_t used_rules; /**< Used rules so far. */ + uint32_t number_tbl8s; /**< Number of tbl8s to allocate. */ + + /* LPM Tables. */ + //struct rte_hash *rules_tbl; /**< LPM rules. */ + struct rte_lpm6_tbl_entry tbl24[RTE_LPM6_TBL24_NUM_ENTRIES] + __rte_cache_aligned; /**< LPM tbl24 table. */ + + uint32_t *tbl8_pool; /**< pool of indexes of free tbl8s */ + uint32_t tbl8_pool_pos; /**< current position in the tbl8 pool */ + + struct rte_lpm_tbl8_hdr *tbl8_hdrs; /* array of tbl8 headers */ + + struct rte_lpm6_tbl_entry tbl8[0] + __rte_cache_aligned; /**< LPM tbl8 table. */ +}; + +/* + * Takes an array of uint8_t (IPv6 address) and masks it using the depth. 
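+ * E.g. a depth of 20 keeps the first two bytes and the top four bits of the
+ * third byte, and zeroes every remaining byte of the address.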
+ * It leaves untouched one bit per unit in the depth variable + * and set the rest to 0. + */ +static inline void +ip6_mask_addr(uint8_t *ip, uint8_t depth) +{ + int16_t part_depth, mask; + int i; + + part_depth = depth; + + for (i = 0; i < RTE_LPM6_IPV6_ADDR_SIZE; i++) { + if (part_depth < BYTE_SIZE && part_depth >= 0) { + mask = (uint16_t)(~(UINT8_MAX >> part_depth)); + ip[i] = (uint8_t)(ip[i] & mask); + } else if (part_depth < 0) + ip[i] = 0; + + part_depth -= BYTE_SIZE; + } +} + +/* copy ipv6 address */ +static inline void +ip6_copy_addr(uint8_t *dst, const uint8_t *src) +{ + rte_memcpy(dst, src, RTE_LPM6_IPV6_ADDR_SIZE); +} + +#if 0 +/* + * LPM6 rule hash function + * + * It's used as a hash function for the rte_hash + * containing rules + */ +static inline uint32_t +rule_hash(const void *data, __rte_unused uint32_t data_len, + uint32_t init_val) +{ + return rte_jhash(data, sizeof(struct rte_lpm6_rule_key), init_val); +} +#endif + +/* + * Init pool of free tbl8 indexes + */ +static void +tbl8_pool_init(struct rte_lpm6 *lpm) +{ + uint32_t i; + + /* put entire range of indexes to the tbl8 pool */ + for (i = 0; i < lpm->number_tbl8s; i++) + lpm->tbl8_pool[i] = i; + + lpm->tbl8_pool_pos = 0; +} + +/* + * Get an index of a free tbl8 from the pool + */ +static inline uint32_t +tbl8_get(struct rte_lpm6 *lpm, uint32_t *tbl8_ind) +{ + if (lpm->tbl8_pool_pos == lpm->number_tbl8s) + /* no more free tbl8 */ + return -ENOSPC; + + /* next index */ + *tbl8_ind = lpm->tbl8_pool[lpm->tbl8_pool_pos++]; + return 0; +} + +/* + * Put an index of a free tbl8 back to the pool + */ +static inline uint32_t +tbl8_put(struct rte_lpm6 *lpm, uint32_t tbl8_ind) +{ + if (lpm->tbl8_pool_pos == 0) + /* pool is full */ + return -ENOSPC; + + lpm->tbl8_pool[--lpm->tbl8_pool_pos] = tbl8_ind; + return 0; +} + +/* + * Returns number of tbl8s available in the pool + */ +static inline uint32_t +tbl8_available(struct rte_lpm6 *lpm) +{ + return lpm->number_tbl8s - lpm->tbl8_pool_pos; +} + +#if 0 +/* + * Init a rule key. + * note that ip must be already masked + */ +static inline void +rule_key_init(struct rte_lpm6_rule_key *key, uint8_t *ip, uint8_t depth) +{ + ip6_copy_addr(key->ip, ip); + key->depth = depth; +} + +/* + * Rebuild the entire LPM tree by reinserting all rules + */ +static void +rebuild_lpm(struct rte_lpm6 *lpm) +{ + uint64_t next_hop; + struct rte_lpm6_rule_key *rule_key; + uint32_t iter = 0; + + while (rte_hash_iterate(lpm->rules_tbl, (void *) &rule_key, + (void **) &next_hop, &iter) >= 0) + rte_lpm6_add(lpm, rule_key->ip, rule_key->depth, + (uint32_t) next_hop); +} +#endif + +/* + * Allocates memory for LPM object + */ +struct rte_lpm6 * +rte_lpm6_create(const char *name, int socket_id, + const struct rte_lpm6_config *config) +{ + char mem_name[RTE_LPM6_NAMESIZE]; + struct rte_lpm6 *lpm = NULL; + //struct rte_tailq_entry *te; + uint64_t mem_size; + //struct rte_lpm6_list *lpm_list; + //struct rte_hash *rules_tbl = NULL; + uint32_t *tbl8_pool = NULL; + struct rte_lpm_tbl8_hdr *tbl8_hdrs = NULL; + + //lpm_list = RTE_TAILQ_CAST(rte_lpm6_tailq.head, rte_lpm6_list); + + RTE_BUILD_BUG_ON(sizeof(struct rte_lpm6_tbl_entry) != sizeof(uint32_t)); + + /* Check user arguments. 
*/ + if ((name == NULL) || (socket_id < -1) || (config == NULL) || + config->number_tbl8s > RTE_LPM6_TBL8_MAX_NUM_GROUPS) { + rte_errno = EINVAL; + return NULL; + } + +#if 0 + /* create rules hash table */ + snprintf(mem_name, sizeof(mem_name), "LRH_%s", name); + struct rte_hash_parameters rule_hash_tbl_params = { + .entries = config->max_rules * 1.2 + + RULE_HASH_TABLE_EXTRA_SPACE, + .key_len = sizeof(struct rte_lpm6_rule_key), + .hash_func = rule_hash, + .hash_func_init_val = 0, + .name = mem_name, + .reserved = 0, + .socket_id = socket_id, + .extra_flag = 0 + }; + + rules_tbl = rte_hash_create(&rule_hash_tbl_params); + if (rules_tbl == NULL) { + RTE_LOG(ERR, LPM, "LPM rules hash table allocation failed: %s (%d)", + rte_strerror(rte_errno), rte_errno); + goto fail_wo_unlock; + } +#endif + + /* allocate tbl8 indexes pool */ + tbl8_pool = rte_malloc(NULL, + sizeof(uint32_t) * config->number_tbl8s, + RTE_CACHE_LINE_SIZE); + if (tbl8_pool == NULL) { + RTE_LOG(ERR, LPM, "LPM tbl8 pool allocation failed: %s (%d)", + rte_strerror(rte_errno), rte_errno); + rte_errno = ENOMEM; + goto fail_wo_unlock; + } + + /* allocate tbl8 headers */ + tbl8_hdrs = rte_malloc(NULL, + sizeof(struct rte_lpm_tbl8_hdr) * config->number_tbl8s, + RTE_CACHE_LINE_SIZE); + if (tbl8_hdrs == NULL) { + RTE_LOG(ERR, LPM, "LPM tbl8 headers allocation failed: %s (%d)", + rte_strerror(rte_errno), rte_errno); + rte_errno = ENOMEM; + goto fail_wo_unlock; + } + + snprintf(mem_name, sizeof(mem_name), "LPM_%s", name); + + /* Determine the amount of memory to allocate. */ + mem_size = sizeof(*lpm) + (sizeof(lpm->tbl8[0]) * + RTE_LPM6_TBL8_GROUP_NUM_ENTRIES * config->number_tbl8s); + +#if 0 + rte_mcfg_tailq_write_lock(); + + /* Guarantee there's no existing */ + TAILQ_FOREACH(te, lpm_list, next) { + lpm = (struct rte_lpm6 *) te->data; + if (strncmp(name, lpm->name, RTE_LPM6_NAMESIZE) == 0) + break; + } + lpm = NULL; + if (te != NULL) { + rte_errno = EEXIST; + goto fail; + } + + /* allocate tailq entry */ + te = rte_zmalloc("LPM6_TAILQ_ENTRY", sizeof(*te), 0); + if (te == NULL) { + RTE_LOG(ERR, LPM, "Failed to allocate tailq entry!\n"); + rte_errno = ENOMEM; + goto fail; + } +#endif + + /* Allocate memory to store the LPM data structures. */ + lpm = rte_zmalloc_socket(mem_name, (size_t)mem_size, + RTE_CACHE_LINE_SIZE, socket_id); + + if (lpm == NULL) { + RTE_LOG(ERR, LPM, "LPM memory allocation failed\n"); + //rte_free(te); + rte_errno = ENOMEM; + goto fail; + } + + /* Save user arguments. */ + //lpm->max_rules = config->max_rules; + lpm->number_tbl8s = config->number_tbl8s; + strlcpy(lpm->name, name, sizeof(lpm->name)); + //lpm->rules_tbl = rules_tbl; + lpm->tbl8_pool = tbl8_pool; + lpm->tbl8_hdrs = tbl8_hdrs; + + /* init the stack */ + tbl8_pool_init(lpm); + + //te->data = (void *) lpm; + + //TAILQ_INSERT_TAIL(lpm_list, te, next); + rte_mcfg_tailq_write_unlock(); + return lpm; + +fail: + rte_mcfg_tailq_write_unlock(); + +fail_wo_unlock: + rte_free(tbl8_hdrs); + rte_free(tbl8_pool); + //rte_hash_free(rules_tbl); + + return NULL; +} + +#if 0 +/* + * Find an existing lpm table and return a pointer to it. 
+ */ +struct rte_lpm6 * +rte_lpm6_find_existing(const char *name) +{ + struct rte_lpm6 *l = NULL; + struct rte_tailq_entry *te; + struct rte_lpm6_list *lpm_list; + + lpm_list = RTE_TAILQ_CAST(rte_lpm6_tailq.head, rte_lpm6_list); + + rte_mcfg_tailq_read_lock(); + TAILQ_FOREACH(te, lpm_list, next) { + l = (struct rte_lpm6 *) te->data; + if (strncmp(name, l->name, RTE_LPM6_NAMESIZE) == 0) + break; + } + rte_mcfg_tailq_read_unlock(); + + if (te == NULL) { + rte_errno = ENOENT; + return NULL; + } + + return l; +} +#endif + +/* + * Deallocates memory for given LPM table. + */ +void +rte_lpm6_free(struct rte_lpm6 *lpm) +{ +#if 0 + struct rte_lpm6_list *lpm_list; + struct rte_tailq_entry *te; + + /* Check user arguments. */ + if (lpm == NULL) + return; + + lpm_list = RTE_TAILQ_CAST(rte_lpm6_tailq.head, rte_lpm6_list); + + rte_mcfg_tailq_write_lock(); + + /* find our tailq entry */ + TAILQ_FOREACH(te, lpm_list, next) { + if (te->data == (void *) lpm) + break; + } + + if (te != NULL) + TAILQ_REMOVE(lpm_list, te, next); + + rte_mcfg_tailq_write_unlock(); +#endif + + rte_free(lpm->tbl8_hdrs); + rte_free(lpm->tbl8_pool); + //rte_hash_free(lpm->rules_tbl); + rte_free(lpm); + //rte_free(te); +} + +#if 0 +/* Find a rule */ +static inline int +rule_find_with_key(struct rte_lpm6 *lpm, + const struct rte_lpm6_rule_key *rule_key, + uint32_t *next_hop) +{ + uint64_t hash_val; + int ret; + + /* lookup for a rule */ + ret = rte_hash_lookup_data(lpm->rules_tbl, (const void *) rule_key, + (void **) &hash_val); + if (ret >= 0) { + *next_hop = (uint32_t) hash_val; + return 1; + } + + return 0; +} + +/* Find a rule */ +static int +rule_find(struct rte_lpm6 *lpm, uint8_t *ip, uint8_t depth, + uint32_t *next_hop) +{ + struct rte_lpm6_rule_key rule_key; + + /* init a rule key */ + rule_key_init(&rule_key, ip, depth); + + return rule_find_with_key(lpm, &rule_key, next_hop); +} + +/* + * Checks if a rule already exists in the rules table and updates + * the nexthop if so. Otherwise it adds a new rule if enough space is available. + * + * Returns: + * 0 - next hop of existed rule is updated + * 1 - new rule successfully added + * <0 - error + */ +static inline int +rule_add(struct rte_lpm6 *lpm, uint8_t *ip, uint8_t depth, uint32_t next_hop) +{ + int ret, rule_exist; + struct rte_lpm6_rule_key rule_key; + uint32_t unused; + + /* init a rule key */ + rule_key_init(&rule_key, ip, depth); + + /* Scan through rule list to see if rule already exists. */ + rule_exist = rule_find_with_key(lpm, &rule_key, &unused); + + /* + * If rule does not exist check if there is space to add a new rule to + * this rule group. If there is no space return error. + */ + if (!rule_exist && lpm->used_rules == lpm->max_rules) + return -ENOSPC; + + /* add the rule or update rules next hop */ + ret = rte_hash_add_key_data(lpm->rules_tbl, &rule_key, + (void *)(uintptr_t) next_hop); + if (ret < 0) + return ret; + + /* Increment the used rules counter for this rule group. */ + if (!rule_exist) { + lpm->used_rules++; + return 1; + } + + return 0; +} +#endif + +/* + * Function that expands a rule across the data structure when a less-generic + * one has been added before. It assures that every possible combination of bits + * in the IP address returns a match. 
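+ * Only entries that are invalid, or that hold a rule no more specific than
+ * old_depth, are overwritten; entries that point at a lower-level tbl8 are
+ * followed recursively so the whole covered range stays consistent.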
+ */ +static void +expand_rule(struct rte_lpm6 *lpm, uint32_t tbl8_gindex, uint8_t old_depth, + uint8_t new_depth, uint32_t next_hop, uint8_t valid) +{ + uint32_t tbl8_group_end, tbl8_gindex_next, j; + + tbl8_group_end = tbl8_gindex + RTE_LPM6_TBL8_GROUP_NUM_ENTRIES; + + struct rte_lpm6_tbl_entry new_tbl8_entry = { + .valid = valid, + .valid_group = valid, + .depth = new_depth, + .next_hop = next_hop, + .ext_entry = 0, + }; + + for (j = tbl8_gindex; j < tbl8_group_end; j++) { + if (!lpm->tbl8[j].valid || (lpm->tbl8[j].ext_entry == 0 + && lpm->tbl8[j].depth <= old_depth)) { + + lpm->tbl8[j] = new_tbl8_entry; + + } else if (lpm->tbl8[j].ext_entry == 1) { + + tbl8_gindex_next = lpm->tbl8[j].lpm6_tbl8_gindex + * RTE_LPM6_TBL8_GROUP_NUM_ENTRIES; + expand_rule(lpm, tbl8_gindex_next, old_depth, new_depth, + next_hop, valid); + } + } +} + +/* + * Init a tbl8 header + */ +static inline void +init_tbl8_header(struct rte_lpm6 *lpm, uint32_t tbl_ind, + uint32_t owner_tbl_ind, uint32_t owner_entry_ind) +{ + struct rte_lpm_tbl8_hdr *tbl_hdr = &lpm->tbl8_hdrs[tbl_ind]; + tbl_hdr->owner_tbl_ind = owner_tbl_ind; + tbl_hdr->owner_entry_ind = owner_entry_ind; + tbl_hdr->ref_cnt = 0; +} + +/* + * Calculate index to the table based on the number and position + * of the bytes being inspected in this step. + */ +static uint32_t +get_bitshift(const uint8_t *ip, uint8_t first_byte, uint8_t bytes) +{ + uint32_t entry_ind, i; + int8_t bitshift; + + entry_ind = 0; + for (i = first_byte; i < (uint32_t)(first_byte + bytes); i++) { + bitshift = (int8_t)((bytes - i)*BYTE_SIZE); + + if (bitshift < 0) + bitshift = 0; + entry_ind = entry_ind | ip[i-1] << bitshift; + } + + return entry_ind; +} + +/* + * Simulate adding a new route to the LPM counting number + * of new tables that will be needed + * + * It returns 0 on success, or 1 if + * the process needs to be continued by calling the function again. + */ +static inline int +simulate_add_step(struct rte_lpm6 *lpm, struct rte_lpm6_tbl_entry *tbl, + struct rte_lpm6_tbl_entry **next_tbl, const uint8_t *ip, + uint8_t bytes, uint8_t first_byte, uint8_t depth, + uint32_t *need_tbl_nb) +{ + uint32_t entry_ind; + uint8_t bits_covered; + uint32_t next_tbl_ind; + + /* + * Calculate index to the table based on the number and position + * of the bytes being inspected in this step. + */ + entry_ind = get_bitshift(ip, first_byte, bytes); + + /* Number of bits covered in this step */ + bits_covered = (uint8_t)((bytes+first_byte-1)*BYTE_SIZE); + + if (depth <= bits_covered) { + *need_tbl_nb = 0; + return 0; + } + + if (tbl[entry_ind].valid == 0 || tbl[entry_ind].ext_entry == 0) { + /* from this point on a new table is needed on each level + * that is not covered yet + */ + depth -= bits_covered; + uint32_t cnt = depth >> 3; /* depth / BYTE_SIZE */ + if (depth & 7) /* 0b00000111 */ + /* if depth % 8 > 0 then one more table is needed + * for those last bits + */ + cnt++; + + *need_tbl_nb = cnt; + return 0; + } + + next_tbl_ind = tbl[entry_ind].lpm6_tbl8_gindex; + *next_tbl = &(lpm->tbl8[next_tbl_ind * + RTE_LPM6_TBL8_GROUP_NUM_ENTRIES]); + *need_tbl_nb = 0; + return 1; +} + +/* + * Partially adds a new route to the data structure (tbl24+tbl8s). + * It returns 0 on success, a negative number on failure, or 1 if + * the process needs to be continued by calling the function again. 
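+ * Each call consumes the next byte(s) of the address: when the remaining
+ * depth is already covered by the current table the rule is expanded in
+ * place and the walk stops; otherwise a tbl8 group is allocated (or an
+ * existing one reused) and *next_tbl is pointed at it for the next step.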
+ */ +static inline int +add_step(struct rte_lpm6 *lpm, struct rte_lpm6_tbl_entry *tbl, + uint32_t tbl_ind, struct rte_lpm6_tbl_entry **next_tbl, + uint32_t *next_tbl_ind, uint8_t *ip, uint8_t bytes, + uint8_t first_byte, uint8_t depth, uint32_t next_hop, + uint8_t is_new_rule) +{ + uint32_t entry_ind, tbl_range, tbl8_group_start, tbl8_group_end, i; + uint32_t tbl8_gindex; + uint8_t bits_covered; + int ret; + + /* + * Calculate index to the table based on the number and position + * of the bytes being inspected in this step. + */ + entry_ind = get_bitshift(ip, first_byte, bytes); + + /* Number of bits covered in this step */ + bits_covered = (uint8_t)((bytes+first_byte-1)*BYTE_SIZE); + + /* + * If depth if smaller than this number (ie this is the last step) + * expand the rule across the relevant positions in the table. + */ + if (depth <= bits_covered) { + tbl_range = 1 << (bits_covered - depth); + + for (i = entry_ind; i < (entry_ind + tbl_range); i++) { + if (!tbl[i].valid || (tbl[i].ext_entry == 0 && + tbl[i].depth <= depth)) { + + struct rte_lpm6_tbl_entry new_tbl_entry = { + .next_hop = next_hop, + .depth = depth, + .valid = VALID, + .valid_group = VALID, + .ext_entry = 0, + }; + + tbl[i] = new_tbl_entry; + + } else if (tbl[i].ext_entry == 1) { + + /* + * If tbl entry is valid and extended calculate the index + * into next tbl8 and expand the rule across the data structure. + */ + tbl8_gindex = tbl[i].lpm6_tbl8_gindex * + RTE_LPM6_TBL8_GROUP_NUM_ENTRIES; + expand_rule(lpm, tbl8_gindex, depth, depth, + next_hop, VALID); + } + } + + /* update tbl8 rule reference counter */ + if (tbl_ind != TBL24_IND && is_new_rule) + lpm->tbl8_hdrs[tbl_ind].ref_cnt++; + + return 0; + } + /* + * If this is not the last step just fill one position + * and calculate the index to the next table. + */ + else { + /* If it's invalid a new tbl8 is needed */ + if (!tbl[entry_ind].valid) { + /* get a new table */ + ret = tbl8_get(lpm, &tbl8_gindex); + if (ret != 0) + return -ENOSPC; + + /* invalidate all new tbl8 entries */ + tbl8_group_start = tbl8_gindex * + RTE_LPM6_TBL8_GROUP_NUM_ENTRIES; + memset(&lpm->tbl8[tbl8_group_start], 0, + RTE_LPM6_TBL8_GROUP_NUM_ENTRIES * + sizeof(struct rte_lpm6_tbl_entry)); + + /* init the new table's header: + * save the reference to the owner table + */ + init_tbl8_header(lpm, tbl8_gindex, tbl_ind, entry_ind); + + /* reference to a new tbl8 */ + struct rte_lpm6_tbl_entry new_tbl_entry = { + .lpm6_tbl8_gindex = tbl8_gindex, + .depth = 0, + .valid = VALID, + .valid_group = VALID, + .ext_entry = 1, + }; + + tbl[entry_ind] = new_tbl_entry; + + /* update the current table's reference counter */ + if (tbl_ind != TBL24_IND) + lpm->tbl8_hdrs[tbl_ind].ref_cnt++; + } + /* + * If it's valid but not extended the rule that was stored + * here needs to be moved to the next table. + */ + else if (tbl[entry_ind].ext_entry == 0) { + /* get a new tbl8 index */ + ret = tbl8_get(lpm, &tbl8_gindex); + if (ret != 0) + return -ENOSPC; + + tbl8_group_start = tbl8_gindex * + RTE_LPM6_TBL8_GROUP_NUM_ENTRIES; + tbl8_group_end = tbl8_group_start + + RTE_LPM6_TBL8_GROUP_NUM_ENTRIES; + + struct rte_lpm6_tbl_entry tbl_entry = { + .next_hop = tbl[entry_ind].next_hop, + .depth = tbl[entry_ind].depth, + .valid = VALID, + .valid_group = VALID, + .ext_entry = 0 + }; + + /* Populate new tbl8 with tbl value. 
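+			 * The previous, less specific rule is copied to all 256
+			 * entries of the new group before the owner entry is
+			 * switched to point at it; the more specific rule is
+			 * written on top in a later step.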
*/ + for (i = tbl8_group_start; i < tbl8_group_end; i++) + lpm->tbl8[i] = tbl_entry; + + /* init the new table's header: + * save the reference to the owner table + */ + init_tbl8_header(lpm, tbl8_gindex, tbl_ind, entry_ind); + + /* + * Update tbl entry to point to new tbl8 entry. Note: The + * ext_flag and tbl8_index need to be updated simultaneously, + * so assign whole structure in one go. + */ + struct rte_lpm6_tbl_entry new_tbl_entry = { + .lpm6_tbl8_gindex = tbl8_gindex, + .depth = 0, + .valid = VALID, + .valid_group = VALID, + .ext_entry = 1, + }; + + tbl[entry_ind] = new_tbl_entry; + + /* update the current table's reference counter */ + if (tbl_ind != TBL24_IND) + lpm->tbl8_hdrs[tbl_ind].ref_cnt++; + } + + *next_tbl_ind = tbl[entry_ind].lpm6_tbl8_gindex; + *next_tbl = &(lpm->tbl8[*next_tbl_ind * + RTE_LPM6_TBL8_GROUP_NUM_ENTRIES]); + } + + return 1; +} + +/* + * Simulate adding a route to LPM + * + * Returns: + * 0 on success + * -ENOSPC not enough tbl8 left + */ +static int +simulate_add(struct rte_lpm6 *lpm, const uint8_t *masked_ip, uint8_t depth) +{ + struct rte_lpm6_tbl_entry *tbl; + struct rte_lpm6_tbl_entry *tbl_next = NULL; + int ret, i; + + /* number of new tables needed for a step */ + uint32_t need_tbl_nb; + /* total number of new tables needed */ + uint32_t total_need_tbl_nb; + + /* Inspect the first three bytes through tbl24 on the first step. */ + ret = simulate_add_step(lpm, lpm->tbl24, &tbl_next, masked_ip, + ADD_FIRST_BYTE, 1, depth, &need_tbl_nb); + total_need_tbl_nb = need_tbl_nb; + /* + * Inspect one by one the rest of the bytes until + * the process is completed. + */ + for (i = ADD_FIRST_BYTE; i < RTE_LPM6_IPV6_ADDR_SIZE && ret == 1; i++) { + tbl = tbl_next; + ret = simulate_add_step(lpm, tbl, &tbl_next, masked_ip, 1, + (uint8_t)(i + 1), depth, &need_tbl_nb); + total_need_tbl_nb += need_tbl_nb; + } + + if (tbl8_available(lpm) < total_need_tbl_nb) + /* not enough tbl8 to add a rule */ + return -ENOSPC; + + return 0; +} + +/* + * Add a route + */ +int +rte_lpm6_add(struct rte_lpm6 *lpm, const uint8_t *ip, uint8_t depth, + uint32_t next_hop, int is_new_rule) +{ + struct rte_lpm6_tbl_entry *tbl; + struct rte_lpm6_tbl_entry *tbl_next = NULL; + /* init to avoid compiler warning */ + uint32_t tbl_next_num = 123456; + int status; + uint8_t masked_ip[RTE_LPM6_IPV6_ADDR_SIZE]; + int i; + + /* Check user arguments. */ + if ((lpm == NULL) || (depth < 1) || (depth > RTE_LPM6_MAX_DEPTH)) + return -EINVAL; + + /* Copy the IP and mask it to avoid modifying user's input data. */ + ip6_copy_addr(masked_ip, ip); + ip6_mask_addr(masked_ip, depth); + + /* Simulate adding a new route */ + int ret = simulate_add(lpm, masked_ip, depth); + if (ret < 0) + return ret; + +#if 0 + /* Add the rule to the rule table. */ + int is_new_rule = rule_add(lpm, masked_ip, depth, next_hop); + /* If there is no space available for new rule return error. */ + if (is_new_rule < 0) + return is_new_rule; +#endif + + /* Inspect the first three bytes through tbl24 on the first step. */ + tbl = lpm->tbl24; + status = add_step(lpm, tbl, TBL24_IND, &tbl_next, &tbl_next_num, + masked_ip, ADD_FIRST_BYTE, 1, depth, next_hop, + is_new_rule); + assert(status >= 0); + + /* + * Inspect one by one the rest of the bytes until + * the process is completed. 
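+	 * Bytes 4..16 of the address are inspected one per iteration;
+	 * add_step() keeps returning 1 until the requested depth has been
+	 * fully written into the tables.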
+ */ + for (i = ADD_FIRST_BYTE; i < RTE_LPM6_IPV6_ADDR_SIZE && status == 1; i++) { + tbl = tbl_next; + status = add_step(lpm, tbl, tbl_next_num, &tbl_next, + &tbl_next_num, masked_ip, 1, (uint8_t)(i + 1), + depth, next_hop, is_new_rule); + assert(status >= 0); + } + + return status; +} + +/* + * Takes a pointer to a table entry and inspect one level. + * The function returns 0 on lookup success, ENOENT if no match was found + * or 1 if the process needs to be continued by calling the function again. + */ +static inline int +lookup_step(const struct rte_lpm6 *lpm, const struct rte_lpm6_tbl_entry *tbl, + const struct rte_lpm6_tbl_entry **tbl_next, const uint8_t *ip, + uint8_t first_byte, uint32_t *next_hop) +{ + uint32_t tbl8_index, tbl_entry; + + /* Take the integer value from the pointer. */ + tbl_entry = *(const uint32_t *)tbl; + + /* If it is valid and extended we calculate the new pointer to return. */ + if ((tbl_entry & RTE_LPM6_VALID_EXT_ENTRY_BITMASK) == + RTE_LPM6_VALID_EXT_ENTRY_BITMASK) { + + tbl8_index = ip[first_byte-1] + + ((tbl_entry & RTE_LPM6_TBL8_BITMASK) * + RTE_LPM6_TBL8_GROUP_NUM_ENTRIES); + + *tbl_next = &lpm->tbl8[tbl8_index]; + + return 1; + } else { + /* If not extended then we can have a match. */ + *next_hop = ((uint32_t)tbl_entry & RTE_LPM6_TBL8_BITMASK); + return (tbl_entry & RTE_LPM6_LOOKUP_SUCCESS) ? 0 : -ENOENT; + } +} + +/* + * Looks up an IP + */ +int +rte_lpm6_lookup(const struct rte_lpm6 *lpm, const uint8_t *ip, + uint32_t *next_hop) +{ + const struct rte_lpm6_tbl_entry *tbl; + const struct rte_lpm6_tbl_entry *tbl_next = NULL; + int status; + uint8_t first_byte; + uint32_t tbl24_index; + + /* DEBUG: Check user input arguments. */ + if ((lpm == NULL) || (ip == NULL) || (next_hop == NULL)) + return -EINVAL; + + first_byte = LOOKUP_FIRST_BYTE; + tbl24_index = (ip[0] << BYTES2_SIZE) | (ip[1] << BYTE_SIZE) | ip[2]; + + /* Calculate pointer to the first entry to be inspected */ + tbl = &lpm->tbl24[tbl24_index]; + + do { + /* Continue inspecting following levels until success or failure */ + status = lookup_step(lpm, tbl, &tbl_next, ip, first_byte++, next_hop); + tbl = tbl_next; + } while (status == 1); + + return status; +} + +/* + * Looks up a group of IP addresses + */ +int +rte_lpm6_lookup_bulk_func(const struct rte_lpm6 *lpm, + uint8_t ips[][RTE_LPM6_IPV6_ADDR_SIZE], + int32_t *next_hops, unsigned int n) +{ + unsigned int i; + const struct rte_lpm6_tbl_entry *tbl; + const struct rte_lpm6_tbl_entry *tbl_next = NULL; + uint32_t tbl24_index, next_hop; + uint8_t first_byte; + int status; + + /* DEBUG: Check user input arguments. 
*/ + if ((lpm == NULL) || (ips == NULL) || (next_hops == NULL)) + return -EINVAL; + + for (i = 0; i < n; i++) { + first_byte = LOOKUP_FIRST_BYTE; + tbl24_index = (ips[i][0] << BYTES2_SIZE) | + (ips[i][1] << BYTE_SIZE) | ips[i][2]; + + /* Calculate pointer to the first entry to be inspected */ + tbl = &lpm->tbl24[tbl24_index]; + + do { + /* Continue inspecting following levels + * until success or failure + */ + status = lookup_step(lpm, tbl, &tbl_next, ips[i], + first_byte++, &next_hop); + tbl = tbl_next; + } while (status == 1); + + if (status < 0) + next_hops[i] = -1; + else + next_hops[i] = (int32_t)next_hop; + } + + return 0; +} + +struct rte_lpm6_rule * +fill_rule6(char *buffer, const uint8_t *ip, uint8_t depth, uint32_t next_hop) +{ + struct rte_lpm6_rule *rule = (struct rte_lpm6_rule *)buffer; + + ip6_copy_addr((uint8_t *)&rule->ip, ip); + rule->depth = depth; + rule->next_hop = next_hop; + + return (rule); +} + +#if 0 +/* + * Look for a rule in the high-level rules table + */ +int +rte_lpm6_is_rule_present(struct rte_lpm6 *lpm, const uint8_t *ip, uint8_t depth, + uint32_t *next_hop) +{ + uint8_t masked_ip[RTE_LPM6_IPV6_ADDR_SIZE]; + + /* Check user arguments. */ + if ((lpm == NULL) || next_hop == NULL || ip == NULL || + (depth < 1) || (depth > RTE_LPM6_MAX_DEPTH)) + return -EINVAL; + + /* Copy the IP and mask it to avoid modifying user's input data. */ + ip6_copy_addr(masked_ip, ip); + ip6_mask_addr(masked_ip, depth); + + return rule_find(lpm, masked_ip, depth, next_hop); +} + +/* + * Delete a rule from the rule table. + * NOTE: Valid range for depth parameter is 1 .. 128 inclusive. + * return + * 0 on success + * <0 on failure + */ +static inline int +rule_delete(struct rte_lpm6 *lpm, uint8_t *ip, uint8_t depth) +{ + int ret; + struct rte_lpm6_rule_key rule_key; + + /* init rule key */ + rule_key_init(&rule_key, ip, depth); + + /* delete the rule */ + ret = rte_hash_del_key(lpm->rules_tbl, (void *) &rule_key); + if (ret >= 0) + lpm->used_rules--; + + return ret; +} + +/* + * Deletes a group of rules + * + * Note that the function rebuilds the lpm table, + * rather than doing incremental updates like + * the regular delete function + */ +int +rte_lpm6_delete_bulk_func(struct rte_lpm6 *lpm, + uint8_t ips[][RTE_LPM6_IPV6_ADDR_SIZE], uint8_t *depths, + unsigned n) +{ + uint8_t masked_ip[RTE_LPM6_IPV6_ADDR_SIZE]; + unsigned i; + + /* Check input arguments. */ + if ((lpm == NULL) || (ips == NULL) || (depths == NULL)) + return -EINVAL; + + for (i = 0; i < n; i++) { + ip6_copy_addr(masked_ip, ips[i]); + ip6_mask_addr(masked_ip, depths[i]); + rule_delete(lpm, masked_ip, depths[i]); + } + + /* + * Set all the table entries to 0 (ie delete every rule + * from the data structure. + */ + memset(lpm->tbl24, 0, sizeof(lpm->tbl24)); + memset(lpm->tbl8, 0, sizeof(lpm->tbl8[0]) + * RTE_LPM6_TBL8_GROUP_NUM_ENTRIES * lpm->number_tbl8s); + tbl8_pool_init(lpm); + + /* + * Add every rule again (except for the ones that were removed from + * the rules table). + */ + rebuild_lpm(lpm); + + return 0; +} + +/* + * Delete all rules from the LPM table. + */ +void +rte_lpm6_delete_all(struct rte_lpm6 *lpm) +{ + /* Zero used rules counter. */ + lpm->used_rules = 0; + + /* Zero tbl24. */ + memset(lpm->tbl24, 0, sizeof(lpm->tbl24)); + + /* Zero tbl8. */ + memset(lpm->tbl8, 0, sizeof(lpm->tbl8[0]) * + RTE_LPM6_TBL8_GROUP_NUM_ENTRIES * lpm->number_tbl8s); + + /* init pool of free tbl8 indexes */ + tbl8_pool_init(lpm); + + /* Delete all rules form the rules table. 
*/ + rte_hash_reset(lpm->rules_tbl); +} +#endif + +/* + * Convert a depth to a one byte long mask + * Example: 4 will be converted to 0xF0 + */ +static uint8_t __attribute__((pure)) +depth_to_mask_1b(uint8_t depth) +{ + /* To calculate a mask start with a 1 on the left hand side and right + * shift while populating the left hand side with 1's + */ + return (signed char)0x80 >> (depth - 1); +} + +#if 0 +/* + * Find a less specific rule + */ +static int +rule_find_less_specific(struct rte_lpm6 *lpm, uint8_t *ip, uint8_t depth, + struct rte_lpm6_rule *rule) +{ + int ret; + uint32_t next_hop; + uint8_t mask; + struct rte_lpm6_rule_key rule_key; + + if (depth == 1) + return 0; + + rule_key_init(&rule_key, ip, depth); + + while (depth > 1) { + depth--; + + /* each iteration zero one more bit of the key */ + mask = depth & 7; /* depth % BYTE_SIZE */ + if (mask > 0) + mask = depth_to_mask_1b(mask); + + rule_key.depth = depth; + rule_key.ip[depth >> 3] &= mask; + + ret = rule_find_with_key(lpm, &rule_key, &next_hop); + if (ret) { + rule->depth = depth; + ip6_copy_addr(rule->ip, rule_key.ip); + rule->next_hop = next_hop; + return 1; + } + } + + return 0; +} +#endif + +/* + * Find range of tbl8 cells occupied by a rule + */ +static void +rule_find_range(struct rte_lpm6 *lpm, const uint8_t *ip, uint8_t depth, + struct rte_lpm6_tbl_entry **from, + struct rte_lpm6_tbl_entry **to, + uint32_t *out_tbl_ind) +{ + uint32_t ind; + uint32_t first_3bytes = (uint32_t)ip[0] << 16 | ip[1] << 8 | ip[2]; + + if (depth <= 24) { + /* rule is within the top level */ + ind = first_3bytes; + *from = &lpm->tbl24[ind]; + ind += (1 << (24 - depth)) - 1; + *to = &lpm->tbl24[ind]; + *out_tbl_ind = TBL24_IND; + } else { + /* top level entry */ + struct rte_lpm6_tbl_entry *tbl = &lpm->tbl24[first_3bytes]; + assert(tbl->ext_entry == 1); + /* first tbl8 */ + uint32_t tbl_ind = tbl->lpm6_tbl8_gindex; + tbl = &lpm->tbl8[tbl_ind * + RTE_LPM6_TBL8_GROUP_NUM_ENTRIES]; + /* current ip byte, the top level is already behind */ + uint8_t byte = 3; + /* minus top level */ + depth -= 24; + + /* iterate through levels (tbl8s) + * until we reach the last one + */ + while (depth > 8) { + tbl += ip[byte]; + assert(tbl->ext_entry == 1); + /* go to the next level/tbl8 */ + tbl_ind = tbl->lpm6_tbl8_gindex; + tbl = &lpm->tbl8[tbl_ind * + RTE_LPM6_TBL8_GROUP_NUM_ENTRIES]; + byte += 1; + depth -= 8; + } + + /* last level/tbl8 */ + ind = ip[byte] & depth_to_mask_1b(depth); + *from = &tbl[ind]; + ind += (1 << (8 - depth)) - 1; + *to = &tbl[ind]; + *out_tbl_ind = tbl_ind; + } +} + +/* + * Remove a table from the LPM tree + */ +static void +remove_tbl(struct rte_lpm6 *lpm, struct rte_lpm_tbl8_hdr *tbl_hdr, + uint32_t tbl_ind, struct rte_lpm6_rule *lsp_rule) +{ + struct rte_lpm6_tbl_entry *owner_entry; + + if (tbl_hdr->owner_tbl_ind == TBL24_IND) + owner_entry = &lpm->tbl24[tbl_hdr->owner_entry_ind]; + else { + uint32_t owner_tbl_ind = tbl_hdr->owner_tbl_ind; + owner_entry = &lpm->tbl8[ + owner_tbl_ind * RTE_LPM6_TBL8_GROUP_NUM_ENTRIES + + tbl_hdr->owner_entry_ind]; + + struct rte_lpm_tbl8_hdr *owner_tbl_hdr = + &lpm->tbl8_hdrs[owner_tbl_ind]; + if (--owner_tbl_hdr->ref_cnt == 0) + remove_tbl(lpm, owner_tbl_hdr, owner_tbl_ind, lsp_rule); + } + + assert(owner_entry->ext_entry == 1); + + /* unlink the table */ + if (lsp_rule != NULL) { + struct rte_lpm6_tbl_entry new_tbl_entry = { + .next_hop = lsp_rule->next_hop, + .depth = lsp_rule->depth, + .valid = VALID, + .valid_group = VALID, + .ext_entry = 0 + }; + + *owner_entry = new_tbl_entry; + } else { + struct 
rte_lpm6_tbl_entry new_tbl_entry = { + .next_hop = 0, + .depth = 0, + .valid = INVALID, + .valid_group = INVALID, + .ext_entry = 0 + }; + + *owner_entry = new_tbl_entry; + } + + /* return the table to the pool */ + tbl8_put(lpm, tbl_ind); +} + +/* + * Deletes a rule + */ +int +rte_lpm6_delete(struct rte_lpm6 *lpm, const uint8_t *ip, uint8_t depth, + struct rte_lpm6_rule *lsp_rule) +{ + uint8_t masked_ip[RTE_LPM6_IPV6_ADDR_SIZE]; + //struct rte_lpm6_rule lsp_rule_obj; + //struct rte_lpm6_rule *lsp_rule; + //int ret; + uint32_t tbl_ind; + struct rte_lpm6_tbl_entry *from, *to; + + /* Check input arguments. */ + if ((lpm == NULL) || (depth < 1) || (depth > RTE_LPM6_MAX_DEPTH)) + return -EINVAL; + + /* Copy the IP and mask it to avoid modifying user's input data. */ + ip6_copy_addr(masked_ip, ip); + ip6_mask_addr(masked_ip, depth); + +#if 0 + /* Delete the rule from the rule table. */ + ret = rule_delete(lpm, masked_ip, depth); + if (ret < 0) + return -ENOENT; +#endif + + /* find rule cells */ + rule_find_range(lpm, masked_ip, depth, &from, &to, &tbl_ind); + +#if 0 + /* find a less specific rule (a rule with smaller depth) + * note: masked_ip will be modified, don't use it anymore + */ + ret = rule_find_less_specific(lpm, masked_ip, depth, + &lsp_rule_obj); + lsp_rule = ret ? &lsp_rule_obj : NULL; +#endif + /* decrement the table rule counter, + * note that tbl24 doesn't have a header + */ + if (tbl_ind != TBL24_IND) { + struct rte_lpm_tbl8_hdr *tbl_hdr = &lpm->tbl8_hdrs[tbl_ind]; + if (--tbl_hdr->ref_cnt == 0) { + /* remove the table */ + remove_tbl(lpm, tbl_hdr, tbl_ind, lsp_rule); + return 0; + } + } + + /* iterate rule cells */ + for (; from <= to; from++) + if (from->ext_entry == 1) { + /* reference to a more specific space + * of the prefix/rule. Entries in a more + * specific space that are not used by + * a more specific prefix must be occupied + * by the prefix + */ + if (lsp_rule != NULL) + expand_rule(lpm, + from->lpm6_tbl8_gindex * + RTE_LPM6_TBL8_GROUP_NUM_ENTRIES, + depth, lsp_rule->depth, + lsp_rule->next_hop, VALID); + else + /* since the prefix has no less specific prefix, + * its more specific space must be invalidated + */ + expand_rule(lpm, + from->lpm6_tbl8_gindex * + RTE_LPM6_TBL8_GROUP_NUM_ENTRIES, + depth, 0, 0, INVALID); + } else if (from->depth == depth) { + /* entry is not a reference and belongs to the prefix */ + if (lsp_rule != NULL) { + struct rte_lpm6_tbl_entry new_tbl_entry = { + .next_hop = lsp_rule->next_hop, + .depth = lsp_rule->depth, + .valid = VALID, + .valid_group = VALID, + .ext_entry = 0 + }; + + *from = new_tbl_entry; + } else { + struct rte_lpm6_tbl_entry new_tbl_entry = { + .next_hop = 0, + .depth = 0, + .valid = INVALID, + .valid_group = INVALID, + .ext_entry = 0 + }; + + *from = new_tbl_entry; + } + } + + return 0; +} Index: sys/contrib/dpdk_rte_lpm/rte_shim.h =================================================================== --- /dev/null +++ sys/contrib/dpdk_rte_lpm/rte_shim.h @@ -0,0 +1,31 @@ +#ifndef _RTE_SHIM_H_ +#define _RTE_SHIM_H_ + +#define rte_malloc(_type, _size, _align) malloc(_size, M_TEMP, M_NOWAIT) +#define rte_free(_ptr) free(_ptr, M_TEMP) +#define rte_zmalloc(_type, _size, _align) malloc(_size, M_TEMP, M_NOWAIT | M_ZERO) +#define rte_zmalloc_socket(_type, _size, _align, _s) malloc(_size, M_TEMP, M_NOWAIT | M_ZERO) + +#define rte_mcfg_tailq_write_unlock() +#define rte_mcfg_tailq_write_lock() + +#define RTE_CACHE_LINE_SIZE CACHE_LINE_SIZE +#define strtoull strtoul +#define assert(_s) KASSERT((_s), ("DPDK: assert failed")) +#define 
rte_memcpy memcpy +#define rte_strerror(_err) "strerror_not_implemented" +#define RTE_LOG(_sev, _sub, _fmt, ...) printf("DPDK::" #_sev "::" #_sub " %s: " _fmt, __func__ , ## __VA_ARGS__) + +#include "sys/endian.h" +#define RTE_BYTE_ORDER BYTE_ORDER +#define RTE_LITTLE_ENDIAN LITTLE_ENDIAN +#define RTE_BIG_ENDIAN BIG_ENDIAN + +#include "sys/limits.h" // CHAR_BIT +#define rte_le_to_cpu_32 le32toh + +#include "rte_jhash.h" +#include "rte_common.h" + + +#endif Index: sys/contrib/dpdk_rte_lpm/rte_tailq.h =================================================================== --- /dev/null +++ sys/contrib/dpdk_rte_lpm/rte_tailq.h @@ -0,0 +1,140 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef _RTE_TAILQ_H_ +#define _RTE_TAILQ_H_ + +/** + * @file + * Here defines rte_tailq APIs for only internal use + * + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +//#include +#include + +/** dummy structure type used by the rte_tailq APIs */ +struct rte_tailq_entry { + TAILQ_ENTRY(rte_tailq_entry) next; /**< Pointer entries for a tailq list */ + void *data; /**< Pointer to the data referenced by this tailq entry */ +}; +/** dummy */ +TAILQ_HEAD(rte_tailq_entry_head, rte_tailq_entry); + +#define RTE_TAILQ_NAMESIZE 32 + +/** + * The structure defining a tailq header entry for storing + * in the rte_config structure in shared memory. Each tailq + * is identified by name. + * Any library storing a set of objects e.g. rings, mempools, hash-tables, + * is recommended to use an entry here, so as to make it easy for + * a multi-process app to find already-created elements in shared memory. + */ +struct rte_tailq_head { + struct rte_tailq_entry_head tailq_head; /**< NOTE: must be first element */ + char name[RTE_TAILQ_NAMESIZE]; +}; + +struct rte_tailq_elem { + /** + * Reference to head in shared mem, updated at init time by + * rte_eal_tailqs_init() + */ + struct rte_tailq_head *head; + TAILQ_ENTRY(rte_tailq_elem) next; + const char name[RTE_TAILQ_NAMESIZE]; +}; + +/** + * Return the first tailq entry cast to the right struct. + */ +#define RTE_TAILQ_CAST(tailq_entry, struct_name) \ + (struct struct_name *)&(tailq_entry)->tailq_head + +/** + * Utility macro to make looking up a tailqueue for a particular struct easier. + * + * @param name + * The name of tailq + * + * @param struct_name + * The name of the list type we are using. (Generally this is the same as the + * first parameter passed to TAILQ_HEAD macro) + * + * @return + * The return value from rte_eal_tailq_lookup, typecast to the appropriate + * structure pointer type. + * NULL on error, since the tailq_head is the first + * element in the rte_tailq_head structure. + */ +#define RTE_TAILQ_LOOKUP(name, struct_name) \ + RTE_TAILQ_CAST(rte_eal_tailq_lookup(name), struct_name) + +/** + * Dump tail queues to a file. + * + * @param f + * A pointer to a file for output + */ +//void rte_dump_tailq(FILE *f); + +/** + * Lookup for a tail queue. + * + * Get a pointer to a tail queue header of a tail + * queue identified by the name given as an argument. + * Note: this function is not multi-thread safe, and should only be called from + * a single thread at a time + * + * @param name + * The name of the queue. + * @return + * A pointer to the tail queue head structure. + */ +struct rte_tailq_head *rte_eal_tailq_lookup(const char *name); + +/** + * Register a tail queue. + * + * Register a tail queue from shared memory. 
+ * This function is mainly used by EAL_REGISTER_TAILQ macro which is used to + * register tailq from the different dpdk libraries. Since this macro is a + * constructor, the function has no access to dpdk shared memory, so the + * registered tailq can not be used before call to rte_eal_init() which calls + * rte_eal_tailqs_init(). + * + * @param t + * The tailq element which contains the name of the tailq you want to + * create (/retrieve when in secondary process). + * @return + * 0 on success or -1 in case of an error. + */ +int rte_eal_tailq_register(struct rte_tailq_elem *t); + +#define EAL_REGISTER_TAILQ(t) \ +RTE_INIT(tailqinitfn_ ##t) \ +{ \ + if (rte_eal_tailq_register(&t) < 0) \ + rte_panic("Cannot initialize tailq: %s\n", t.name); \ +} + +/* This macro permits both remove and free var within the loop safely.*/ +#ifndef TAILQ_FOREACH_SAFE +#define TAILQ_FOREACH_SAFE(var, head, field, tvar) \ + for ((var) = TAILQ_FIRST((head)); \ + (var) && ((tvar) = TAILQ_NEXT((var), field), 1); \ + (var) = (tvar)) +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_TAILQ_H_ */ Index: sys/modules/dpdk_lpm4/Makefile =================================================================== --- /dev/null +++ sys/modules/dpdk_lpm4/Makefile @@ -0,0 +1,12 @@ +# $FreeBSD$ + +SYSDIR?=${SRCTOP}/sys +.include "${SYSDIR}/conf/kern.opts.mk" + +.PATH: ${SYSDIR}/contrib/dpdk_rte_lpm + +KMOD= dpdk_lpm4 +SRCS= opt_inet.h +SRCS.INET=dpdk_lpm.c rte_lpm.c + +.include Index: sys/modules/dpdk_lpm6/Makefile =================================================================== --- /dev/null +++ sys/modules/dpdk_lpm6/Makefile @@ -0,0 +1,12 @@ +# $FreeBSD$ + +SYSDIR?=${SRCTOP}/sys +.include "${SYSDIR}/conf/kern.opts.mk" + +.PATH: ${SYSDIR}/contrib/dpdk_rte_lpm + +KMOD= dpdk_lpm6 +SRCS= opt_inet6.h +SRCS.INET6=dpdk_lpm6.c rte_lpm6.c + +.include Index: sys/net/route.h =================================================================== --- sys/net/route.h +++ sys/net/route.h @@ -230,6 +230,7 @@ /* Control plane route request flags */ #define NHR_COPY 0x100 /* Copy rte data */ +#define NHR_UNLOCKED 0x200 /* Do not lock table */ /* * Routing statistics. 
@@ -454,6 +455,8 @@ /* New API */ struct nhop_object *rib_lookup(uint32_t fibnum, const struct sockaddr *dst, uint32_t flags, uint32_t flowid); +struct rib_rtable_info; +bool rib_get_rtable_info(uint32_t fibnum, int family, struct rib_rtable_info *info); #endif #endif Index: sys/net/route.c =================================================================== --- sys/net/route.c +++ sys/net/route.c @@ -155,6 +155,12 @@ rt_table_destroy(struct rib_head *rh) { + RIB_WLOCK(rh); + rh->rib_dying = true; + RIB_WUNLOCK(rh); + + fib_destroy_rib(rh); + tmproutes_destroy(rh); rn_walktree(&rh->rmhead.head, rt_freeentry, &rh->rmhead.head); Index: sys/net/route/nhgrp_ctl.c =================================================================== --- sys/net/route/nhgrp_ctl.c +++ sys/net/route/nhgrp_ctl.c @@ -293,6 +293,17 @@ return (nhg_priv); } +void +nhgrp_ref_object(struct nhgrp_object *nhg) +{ + struct nhgrp_priv *nhg_priv; + u_int old; + + nhg_priv = NHGRP_PRIV(nhg); + old = refcount_acquire(&nhg_priv->nhg_refcount); + KASSERT(old > 0, ("%s: nhgrp object %p has 0 refs", __func__, nhg)); +} + void nhgrp_free(struct nhgrp_object *nhg) { @@ -753,6 +764,21 @@ return (error); } +uint32_t +nhgrp_get_count(struct rib_head *rh) +{ + struct nh_control *ctl; + uint32_t count; + + ctl = rh->nh_control; + + NHOPS_RLOCK(ctl); + count = ctl->gr_head.items_count; + NHOPS_RUNLOCK(ctl); + + return (count); +} + uint32_t nhgrp_get_idx(const struct nhgrp_object *nhg) { Index: sys/net/route/nhop_ctl.c =================================================================== --- sys/net/route/nhop_ctl.c +++ sys/net/route/nhop_ctl.c @@ -690,6 +690,19 @@ &nh_priv->nh_epoch_ctx); } +void +nhop_ref_any(struct nhop_object *nh) +{ +#ifdef ROUTE_MPATH + if (!NH_IS_NHGRP(nh)) + nhop_ref_object(nh); + else + nhgrp_ref_object((struct nhgrp_object *)nh); +#else + nhop_ref_object(nh); +#endif +} + void nhop_free_any(struct nhop_object *nh) { @@ -852,6 +865,21 @@ return (error); } +uint32_t +nhops_get_count(struct rib_head *rh) +{ + struct nh_control *ctl; + uint32_t count; + + ctl = rh->nh_control; + + NHOPS_RLOCK(ctl); + count = ctl->nh_head.items_count; + NHOPS_RUNLOCK(ctl); + + return (count); +} + int nhops_dump_sysctl(struct rib_head *rh, struct sysctl_req *w) { Index: sys/net/route/route_algo.h =================================================================== --- /dev/null +++ sys/net/route/route_algo.h @@ -0,0 +1,110 @@ +/*- + * Copyright (c) 2020 + * Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + + +struct fib_data; +struct fib_dp; +enum flm_op_result { + FLM_SUCCESS, /* No errors, operation successful */ + FLM_REBUILD, /* Operation cannot be completed, schedule algorithm rebuild */ + FLM_ERROR, /* Operation failed, this algo cannot be used */ +}; + +struct rib_rtable_info { + uint32_t num_prefixes; + uint32_t num_nhops; + uint32_t num_nhgrp; +}; + +struct flm_lookup_key { + union { + const struct in6_addr *addr6; + struct in_addr addr4; + }; +}; + +typedef struct nhop_object *flm_lookup_t(void *algo_data, + const struct flm_lookup_key key, uint32_t scopeid); +typedef enum flm_op_result flm_init_t (uint32_t fibnum, struct fib_data *fd, + void *_old_data, void **new_data); +typedef void flm_destroy_t(void *data); +typedef enum flm_op_result flm_dump_t(struct rtentry *rt, void *data); +typedef enum flm_op_result flm_dump_end_t(void *data, struct fib_dp *dp); +typedef enum flm_op_result flm_change_t(struct rib_head *rnh, + struct rib_cmd_info *rc, void *data); +typedef uint8_t flm_get_pref_t(const struct rib_rtable_info *rinfo); + +#define FIB_M_NEED_NHOPS 0x01 /* need nexthop index map allocation */ +#define FIB_M_NO_CALLOUT 0x02 /* does not need callouts */ + +struct fib_lookup_module { + char *flm_name; /* algo name */ + int flm_family; /* address family this module supports */ + int flm_refcount; /* # of references */ + uint32_t flm_flags; /* flags */ + flm_init_t *flm_init_cb; /* instance init */ + flm_destroy_t *flm_destroy_cb; /* destroy instance */ + flm_change_t *flm_change_rib_item_cb;/* routing table change hook */ + flm_dump_t *flm_dump_rib_item_cb; /* routing table dump cb */ + flm_dump_end_t *flm_dump_end_cb; /* end of dump */ + flm_lookup_t *flm_lookup; /* lookup function */ + flm_get_pref_t *flm_get_pref; /* get algo preference */ + TAILQ_ENTRY(fib_lookup_module) entries; +}; + +/* Datapath lookup data */ +struct fib_dp { + flm_lookup_t *f; + void *arg; +}; + +VNET_DECLARE(struct fib_dp *, inet_dp); +#define V_inet_dp VNET(inet_dp) +VNET_DECLARE(struct fib_dp *, inet6_dp); +#define V_inet6_dp VNET(inet6_dp) + +int fib_module_init(struct fib_lookup_module *flm, uint32_t fibnum, + int family); +int fib_module_clone(const struct fib_lookup_module *flm_orig, + struct fib_lookup_module *flm, bool waitok); +int fib_module_dumptree(struct fib_lookup_module *flm, + enum rib_subscription_type subscription_type); +int fib_module_register(struct fib_lookup_module *flm); +int fib_module_unregister(struct fib_lookup_module *flm); + +uint32_t fib_get_nhop_idx(struct fib_data *fd, struct nhop_object *nh); +void fib_free_nhop_idx(struct fib_data *fd, uint32_t idx); +void fib_free_nhop(struct fib_data *fd, struct nhop_object *nh); +struct nhop_object **fib_get_nhop_array(struct fib_data *fd); +void fib_get_rtable_info(struct rib_head *rh, struct rib_rtable_info *rinfo); +struct rib_head *fib_get_rh(struct fib_data *fd); + + Index: sys/net/route/route_algo.c =================================================================== --- /dev/null 
+++ sys/net/route/route_algo.c @@ -0,0 +1,1198 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2020 Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +#define RTDEBUG + +#include +__FBSDID("$FreeBSD$"); +#include "opt_inet.h" +#include "opt_inet6.h" +#include "opt_route.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#ifdef INET6 +#include +#include +#endif + +#include +#include +#include +#include +#include + +/* + * Route lookup framework. + * + * flm - fib lookup modules - kernel modules implementing particular algo + * fd - fib data - instance of an flm bound to specific routing table + * + * + * For each supported address family, there is a an allocated array of fib_dp + * structures, indexed by fib number. Each array entry contains callback function + * and its argument. This function will be called with a family-specific lookup key, + * scope and provided argument. This array gets re-created every time when new algo + * instance gets created. Please take a look at the replace_rtables_family() function + * for more details. + * + * Control plane for to setup and update the necessary dataplane structures. + * 1) nexhops abstraction -> module has to deal with index, refcounting, nexhtop groups etc + * 2) sync with route tables + * 3) dataplane attachment points + * 3) fail early. Some algorithms are immutable, so any change leads to rebuild. Some + * are mutable till some extent so the module is build over common setup/teardown + * instances, making error handling * easier. + * 4) preference. 
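+ *
+ * A minimal lookup module (all example_* names below are hypothetical and
+ * shown only as a sketch) fills in a struct fib_lookup_module with its
+ * callbacks and registers itself:
+ *
+ *	static struct fib_lookup_module flm_example = {
+ *		.flm_name = "example",
+ *		.flm_family = AF_INET,
+ *		.flm_init_cb = example_init,
+ *		.flm_destroy_cb = example_destroy,
+ *		.flm_dump_rib_item_cb = example_dump_rib_item,
+ *		.flm_dump_end_cb = example_dump_end,
+ *		.flm_change_rib_item_cb = example_change_rib_item,
+ *		.flm_get_pref = example_get_pref,
+ *		.flm_lookup = example_lookup,
+ *	};
+ *	fib_module_register(&flm_example);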
+ * + */ + +SYSCTL_DECL(_net_route); +SYSCTL_NODE(_net_route, OID_AUTO, algo, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, + "Route algorithm lookups"); + +SYSCTL_NODE(_net_route_algo, OID_AUTO, inet6, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, + "IPv6 algorithm lookups"); +SYSCTL_NODE(_net_route_algo, OID_AUTO, inet, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, + "IPv4 algorithm lookups"); + +struct nhop_ref_table { + uint32_t count; + int32_t refcnt[0]; +}; + +struct fib_data { + uint32_t number_nhops; /* current # of nhops */ + uint32_t number_records; /* current # of routes */ + uint8_t hit_nhops; /* true if out of nhop limit */ + uint8_t init_done; /* true if init is competed */ + uint32_t fd_dead:1; /* Scheduled for deletion */ + uint32_t fd_linked:1; /* true if linked */ + uint32_t fd_need_rebuild:1; /* true if rebuild scheduled */ + uint32_t fd_force_eval:1;/* true if rebuild scheduled */ + uint8_t fd_family; /* family */ + uint32_t fd_fibnum; /* fibnum */ + uint32_t fd_failed_rebuilds; /* stat: failed rebuilds */ + struct callout fd_callout; /* rebuild callout */ + void *fd_algo_data; /* algorithm data */ + struct nhop_object **nh_idx; /* nhop idx->ptr array */ + struct nhop_ref_table *nh_ref_table; /* array with # of nhop references */ + struct rib_head *fd_rh; /* RIB table we're attached to */ + struct rib_subscription *fd_rs; /* storing table subscription */ + struct fib_algo_calldata *fa; + struct fib_dp fd_dp; /* fib datapath data */ + struct vnet *fd_vnet; /* vnet nhop belongs to */ + struct epoch_context fd_epoch_ctx; + uint64_t gencnt; + struct fib_lookup_module *fd_flm; + uint32_t fd_num_changes; /* number of changes since last callout */ + TAILQ_ENTRY(fib_data) entries; /* list of all fds in vnet */ +}; + +static void rebuild_callout(void *_data); +static void destroy_instance_epoch(epoch_context_t ctx); +static enum flm_op_result switch_algo(struct fib_data *fd); +static struct fib_lookup_module *find_algo(const char *algo_name, int family); + +static struct fib_lookup_module *fib_check_best_algo(struct rib_head *rh, + struct fib_lookup_module *orig_flm); + +struct mtx fib_mtx; +#define MOD_LOCK() mtx_lock(&fib_mtx) +#define MOD_UNLOCK() mtx_unlock(&fib_mtx) + + +/* Algorithm has to be this percent better than the current to switch */ +#define BEST_DIFF_PERCENT (5 * 256 / 100) +/* Schedule algo re-evaluation X seconds after a change */ +#define ALGO_EVAL_DELAY_MS 30000 +/* Force algo re-evaluation after X changes */ +#define ALGO_EVAL_NUM_ROUTES 100 +/* Try to setup algorithm X times */ +#define FIB_MAX_TRIES 32 +/* Max amount of supported nexthops */ +#define FIB_MAX_NHOPS 262144 +#define FIB_CALLOUT_DELAY_MS 50 + + +/* TODO: per-VNET */ +static TAILQ_HEAD(fib_data_head, fib_data) fib_data_list = TAILQ_HEAD_INITIALIZER(fib_data_list); + +struct fib_dp_header { + struct epoch_context ffi_epoch_ctx; + uint32_t ffi_num_tables; + struct fib_dp ffi_idx[0]; +}; + +static TAILQ_HEAD(, fib_lookup_module) all_algo_list; + +#ifdef RTDEBUG +#define RH_PRINTF(_rh, _fmt, ...) printf("[rt_algo] %s.%u %s: " _fmt "\n", \ + print_family(_rh->rib_family), _rh->rib_fibnum, __func__ , ## __VA_ARGS__) +#define RH_PRINTF_RAW(_fmt, ...) printf("[rt_algo] %s: " _fmt "\n", __func__ , ## __VA_ARGS__) +#define FD_PRINTF(fd, _fmt, ...) printf("[rt_algo] %s.%u (%s) %s: " _fmt "\n",\ + print_family(fd->fd_family), fd->fd_fibnum, fd->fd_flm->flm_name, __func__, \ + ##__VA_ARGS__) +#else +#define FD_RH_PRINTF(fd, _fmt, ...) +#define RH_PRINTF(_fmt, ...) +#define RH_PRINTF_RAW(_fmt, ...) 
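+/*
+ * FD_PRINTF() is used unconditionally below, so keep it defined as a no-op
+ * when RTDEBUG is not set.
+ */
+#define FD_PRINTF(fd, _fmt, ...)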
+#endif + +static const char * +print_family(int family) +{ + if (family == AF_INET) + return ("inet"); + else if (family == AF_INET6) + return ("inet6"); + else + return ("unknown"); +} + +static int +print_algos(struct sysctl_req *req, int family) +{ + struct fib_lookup_module *flm; + struct sbuf sbuf; + int error, count = 0; + + error = sysctl_wire_old_buffer(req, 0); + if (error == 0) { + sbuf_new_for_sysctl(&sbuf, NULL, 128, req); + TAILQ_FOREACH(flm, &all_algo_list, entries) { + if (flm->flm_family == family) { + if (count++ > 0) + sbuf_cat(&sbuf, ", "); + sbuf_cat(&sbuf, flm->flm_name); + } + } + error = sbuf_finish(&sbuf); + sbuf_delete(&sbuf); + } + return (error); +} + +static int +print_algos_inet6(SYSCTL_HANDLER_ARGS) +{ + + return (print_algos(req, AF_INET6)); +} +SYSCTL_PROC(_net_route_algo_inet6, OID_AUTO, algo_list, + CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, + print_algos_inet6, "A", "List of algos"); + +static int +print_algos_inet(SYSCTL_HANDLER_ARGS) +{ + + return (print_algos(req, AF_INET)); +} +SYSCTL_PROC(_net_route_algo_inet, OID_AUTO, algo_list, + CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, + print_algos_inet, "A", "List of algos"); + + +static struct fib_lookup_module * +find_algo(const char *algo_name, int family) +{ + struct fib_lookup_module *flm; + + TAILQ_FOREACH(flm, &all_algo_list, entries) { + if ((strcmp(flm->flm_name, algo_name) == 0) && + (family == flm->flm_family)) + return (flm); + } + + return (NULL); +} + +static uint32_t +callout_calc_delay(struct fib_data *fd) +{ + uint32_t shift; + + if (fd->fd_failed_rebuilds > 10) + shift = 10; + else + shift = fd->fd_failed_rebuilds; + + return ((1 << shift) * FIB_CALLOUT_DELAY_MS); +} + +static void +schedule_callout(struct fib_data *fd, int delay_ms) +{ + + callout_reset_sbt(&fd->fd_callout, 0, SBT_1MS * delay_ms, + rebuild_callout, fd, 0); +} + +static void +schedule_algo_eval(struct fib_data *fd) +{ + + if (fd->fd_num_changes++ == 0) { + /* Start callout to consider switch */ + MOD_LOCK(); + if (!callout_pending(&fd->fd_callout)) + schedule_callout(fd, ALGO_EVAL_DELAY_MS); + MOD_UNLOCK(); + } else if (fd->fd_num_changes > ALGO_EVAL_NUM_ROUTES && !fd->fd_force_eval) { + /* Reset callout to exec immediately */ + MOD_LOCK(); + if (!fd->fd_need_rebuild) { + fd->fd_force_eval = true; + schedule_callout(fd, 1); + } + MOD_UNLOCK(); + } +} + +/* + * rib subscription handler + */ +static void +handle_rtable_change_cb(struct rib_head *rnh, struct rib_cmd_info *rc, + void *_data) +{ + struct fib_data *fd = (struct fib_data *)_data; + enum flm_op_result result; + + RIB_WLOCK_ASSERT(rnh); + + if (!fd->init_done) + return; + + schedule_algo_eval(fd); + + result = fd->fd_flm->flm_change_rib_item_cb(rnh, rc, fd->fd_algo_data); + + switch (result) { + case FLM_SUCCESS: + break; + case FLM_REBUILD: + /* + * Algo reported inability to handle, + * schedule algo rebuild. + */ + MOD_LOCK(); + if (!fd->fd_need_rebuild) { + fd->fd_need_rebuild = true; + /* + * Potentially rewrites pending callout + * to re-evaluate algo. + */ + FD_PRINTF(fd, "Scheduling rebuilt"); + schedule_callout(fd, callout_calc_delay(fd)); + } + MOD_UNLOCK(); + break; + default: + /* + * Algo reported a non-recoverable error. + * Remove and switch to radix? 
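The rebuild path above backs off exponentially: callout_calc_delay() caps the shift derived from fd_failed_rebuilds at 10 and multiplies FIB_CALLOUT_DELAY_MS (50 ms), so repeated rebuild failures settle at roughly 51 seconds between attempts. A minimal userland sketch of that arithmetic; the constant and the cap come from the code above, while main() and the loop are purely illustrative:

#include <stdio.h>

#define FIB_CALLOUT_DELAY_MS	50	/* base delay, as in the patch */

/* Mirror of callout_calc_delay(): exponential backoff capped at 2^10. */
static unsigned
calc_delay_ms(unsigned failed_rebuilds)
{
	unsigned shift = (failed_rebuilds > 10) ? 10 : failed_rebuilds;

	return ((1 << shift) * FIB_CALLOUT_DELAY_MS);
}

int
main(void)
{
	for (unsigned fails = 0; fails <= 12; fails++)
		printf("failed_rebuilds=%2u -> delay %u ms\n",
		    fails, calc_delay_ms(fails));
	return (0);
}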
+ */ + FD_PRINTF(fd, "algo reported non-recoverable error"); + // TODO: switch to radix + } +} + +static void +estimate_scale(const struct fib_data *old_fd, struct fib_data *fd) +{ + + if (old_fd == NULL) { + fd->number_nhops = 16; + return; + } + + if (old_fd->hit_nhops && old_fd->number_nhops < FIB_MAX_NHOPS) + fd->number_nhops = 2 * old_fd->number_nhops; + else + fd->number_nhops = old_fd->number_nhops; +} + +struct walk_cbdata { + struct fib_data *fd; + flm_dump_t *func; + enum flm_op_result result; +}; + +static void +sync_algo_end_cb(struct rib_head *rnh, enum rib_walk_hook stage, void *_data) +{ + struct walk_cbdata *w = (struct walk_cbdata *)_data; + struct fib_data *fd = w->fd; + + if (rnh->rib_dying) { + w->result = FLM_ERROR; + return; + } + + if (stage != RIB_WALK_HOOK_POST || w->result != FLM_SUCCESS) + return; + + if (fd->hit_nhops) { + FD_PRINTF(fd, "ran out of nexthops at %u nhops", + fd->nh_ref_table->count); + w->result = FLM_REBUILD; + return; + } + + w->result = fd->fd_flm->flm_dump_end_cb(fd->fd_algo_data, &fd->fd_dp); + + if (w->result == FLM_SUCCESS) { + /* Mark init as done to allow routing updates */ + fd->init_done = 1; + } +} + +static int +sync_algo_cb(struct rtentry *rt, void *_data) +{ + struct walk_cbdata *w = (struct walk_cbdata *)_data; + enum flm_op_result result; + + if (w->result == FLM_SUCCESS && w->func) { + result = w->func(rt, w->fd->fd_algo_data); + if (result != FLM_SUCCESS) + w->result = result; + } + + return (0); +} + +static enum flm_op_result +sync_algo(struct fib_data *fd) +{ + struct walk_cbdata w; + + w.fd = fd; + w.func = fd->fd_flm->flm_dump_rib_item_cb; + w.result = FLM_SUCCESS; + + rib_walk_ext_internal(fd->fd_rh, true, sync_algo_cb, sync_algo_end_cb, &w); + + FD_PRINTF(fd, "initial dump completed."); + + return (w.result); +} + +/* + * Assume already unlinked from datapath + */ +static int +schedule_destroy_instance(struct fib_data *fd, bool in_callout) +{ + bool is_dead; + + NET_EPOCH_ASSERT(); + + MOD_LOCK(); + is_dead = fd->fd_dead; + if (!is_dead) + fd->fd_dead = true; + if (fd->fd_linked) { + TAILQ_REMOVE(&fib_data_list, fd, entries); + fd->fd_linked = false; + } + MOD_UNLOCK(); + if (is_dead) + return (0); + + FD_PRINTF(fd, "DETACH"); + + if (fd->fd_rs != NULL) + rib_unsibscribe(fd->fd_rs); + + /* + * After rib_unsubscribe() no _new_ handle_rtable_change_cb() calls + * will be executed, hence no _new_ callout schedules will happen. + * + * There can be 3 possible scenarious here: + * 1) we're running inside a callout when we're deleting ourselves + * due to migration to a newer fd + * 2) we're running from rt_table_destroy() and callout is scheduled + * for execution OR is executing + * + * For (2) we need to wait for the callout termination, as the routing table + * will be destroyed after this function returns. + * For (1) we cannot call drain, but can ensure that this is the last invocation. + */ + + if (in_callout) + callout_stop(&fd->fd_callout); + else + callout_drain(&fd->fd_callout); + + /* + * At this moment there are no other pending work scheduled. + */ + FD_PRINTF(fd, "destroying old instance"); + epoch_call(net_epoch_preempt, destroy_instance_epoch, + &fd->fd_epoch_ctx); + + return (0); +} + +void +fib_destroy_rib(struct rib_head *rh) +{ + struct fib_data_head tmp_head = TAILQ_HEAD_INITIALIZER(tmp_head); + struct fib_data *fd, *fd_tmp; + + /* + * Atm we have set is_dying flag on rnh, so all new fd's will + * fail at sync_algo() stage, so nothing new will be added to the list. 
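sync_algo() above drives the initial dump: each rtentry is passed to the module's flm_dump_rib_item_cb, and the RIB_WALK_HOOK_POST hook gives the module one chance to finish or veto the build by returning FLM_SUCCESS, FLM_REBUILD or FLM_ERROR. A self-contained userland model of that walk-plus-post-hook contract; the route array, callback names and early-stop behaviour are simplifications, not the kernel API:

#include <stdio.h>

enum op_result { OP_SUCCESS, OP_REBUILD, OP_ERROR };

struct route { int prefix_len; };

/* Per-item callback: may fail, which aborts the build. */
typedef enum op_result dump_cb_t(const struct route *, void *);
/* Post hook: runs once after the walk and produces the final verdict. */
typedef enum op_result end_cb_t(void *);

static enum op_result
walk_table(const struct route *rts, int n, dump_cb_t *item, end_cb_t *end, void *arg)
{
	enum op_result res = OP_SUCCESS;

	for (int i = 0; i < n && res == OP_SUCCESS; i++)
		res = item(&rts[i], arg);
	return (res == OP_SUCCESS ? end(arg) : res);
}

static enum op_result
count_item(const struct route *r, void *arg)
{
	(void)r;
	(*(int *)arg)++;
	return (OP_SUCCESS);
}

static enum op_result
finish(void *arg)
{
	printf("dumped %d routes\n", *(int *)arg);
	return (OP_SUCCESS);
}

int
main(void)
{
	struct route rts[] = { {24}, {32}, {0} };
	int count = 0;

	return (walk_table(rts, 3, count_item, finish, &count) == OP_SUCCESS ? 0 : 1);
}

On a clean walk the post hook produces the final verdict; any earlier failure is reported as-is, which matches how sync_algo_end_cb() skips flm_dump_end_cb once a problem has been recorded.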
+ */ + MOD_LOCK(); + TAILQ_FOREACH_SAFE(fd, &fib_data_list, entries, fd_tmp) { + if (fd->fd_rh == rh) { + TAILQ_REMOVE(&fib_data_list, fd, entries); + fd->fd_linked = false; + TAILQ_INSERT_TAIL(&tmp_head, fd, entries); + } + } + MOD_UNLOCK(); + + /* Pass 2: remove each entry */ + TAILQ_FOREACH_SAFE(fd, &tmp_head, entries, fd_tmp) { + schedule_destroy_instance(fd, false); + } +} + +static void +destroy_instance(struct fib_data *fd) +{ + + FD_PRINTF(fd, "destroy fd %p", fd); + + /* Call destroy callback first */ + if (fd->fd_algo_data != NULL) + fd->fd_flm->flm_destroy_cb(fd->fd_algo_data); + + /* Nhop table */ + if (fd->nh_idx != NULL) { + for (int i = 0; i < fd->number_nhops; i++) { + if (fd->nh_idx[i] != NULL) { + FD_PRINTF(fd, " FREE nhop %d %p", i, fd->nh_idx[i]); + nhop_free_any(fd->nh_idx[i]); + } + } + free(fd->nh_idx, M_RTABLE); + } + if (fd->nh_ref_table != NULL) + free(fd->nh_ref_table, M_RTABLE); + + MOD_LOCK(); + fd->fd_flm->flm_refcount--; + MOD_UNLOCK(); + + free(fd, M_RTABLE); +} + +/* + * Epoch callback indicating fd is safe to destroy + */ +static void +destroy_instance_epoch(epoch_context_t ctx) +{ + struct fib_data *fd; + + fd = __containerof(ctx, struct fib_data, fd_epoch_ctx); + + destroy_instance(fd); +} + +static enum flm_op_result +try_setup_instance(struct fib_lookup_module *flm, struct rib_head *rh, + struct fib_data *old_fd, struct fib_data **pfd) +{ + struct fib_data *fd; + size_t size; + enum flm_op_result result; + + /* Allocate */ + fd = malloc(sizeof(struct fib_data), M_RTABLE, M_NOWAIT | M_ZERO); + if (fd == NULL) { + *pfd = NULL; + return (FLM_REBUILD); + } + *pfd = fd; + + estimate_scale(old_fd, fd); + + fd->fd_rh = rh; + fd->fd_family = rh->rib_family; + fd->fd_fibnum = rh->rib_fibnum; + callout_init(&fd->fd_callout, 1); + fd->fd_vnet = curvnet; + fd->fd_flm = flm; + + /* Allocate nhidx -> nhop_ptr table */ + size = fd->number_nhops * sizeof(void *); + //FD_PRINTF(fd, "malloc(%lu)", size); + fd->nh_idx = malloc(size, M_RTABLE, M_NOWAIT | M_ZERO); + if (fd->nh_idx == NULL) { + FD_PRINTF(fd, "Unable to allocate nhop table idx (sz:%zu)", size); + return (FLM_REBUILD); + } + + /* Allocate nhop index refcount table */ + size = sizeof(struct nhop_ref_table); + size += fd->number_nhops * sizeof(uint32_t); + //FD_PRINTF(fd, "malloc(%lu)", size); + fd->nh_ref_table = malloc(size, M_RTABLE, M_NOWAIT | M_ZERO); + if (fd->nh_ref_table == NULL) { + FD_PRINTF(fd, "Unable to allocate nhop refcount table (sz:%zu)", size); + return (FLM_REBUILD); + } + + /* Okay, we're ready for algo init */ + void *old_algo_data = (old_fd != NULL) ? old_fd->fd_algo_data : NULL; + result = flm->flm_init_cb(fd->fd_fibnum, fd, old_algo_data, &fd->fd_algo_data); + if (result != FLM_SUCCESS) + return (result); + + /* Try to subscribe */ + if (flm->flm_change_rib_item_cb != NULL) { + fd->fd_rs = rib_subscribe_internal(fd->fd_rh, + handle_rtable_change_cb, fd, RIB_NOTIFY_IMMEDIATE, 0); + if (fd->fd_rs == NULL) + return (FLM_REBUILD); + } + + /* Dump */ + result = sync_algo(fd); + if (result != FLM_SUCCESS) + return (result); + FD_PRINTF(fd, "DUMP completed successfully."); + + MOD_LOCK(); + TAILQ_INSERT_TAIL(&fib_data_list, fd, entries); + fd->fd_linked = true; + MOD_UNLOCK(); + + return (FLM_SUCCESS); +} + +/* + * Sets up algo @flm for table @rh and links it to the datapath. 
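setup_instance(), whose body follows, retries try_setup_instance() up to FIB_MAX_TRIES times and feeds the failed instance back in, so estimate_scale() can grow the nexthop table (16 slots initially, doubling while the previous attempt ran out and stays under FIB_MAX_NHOPS). A simplified single-threaded model of that retry/resize loop; NEEDED_NHOPS and the fake "dump" are invented for illustration, and the epoch and locking machinery is omitted:

#include <stdbool.h>
#include <stdio.h>

#define MAX_TRIES	32
#define MAX_NHOPS	262144
#define NEEDED_NHOPS	100	/* pretend the table needs this many slots */

struct instance {
	unsigned number_nhops;
	bool hit_nhops;		/* ran out of slots during the dump */
};

static void
estimate_scale(const struct instance *old, struct instance *new)
{
	if (old == NULL)
		new->number_nhops = 16;
	else if (old->hit_nhops && old->number_nhops < MAX_NHOPS)
		new->number_nhops = 2 * old->number_nhops;
	else
		new->number_nhops = old->number_nhops;
}

int
main(void)
{
	struct instance prev = { 0 }, cur = { 0 };
	const struct instance *old = NULL;

	for (int attempt = 0; attempt < MAX_TRIES; attempt++) {
		estimate_scale(old, &cur);
		cur.hit_nhops = (cur.number_nhops < NEEDED_NHOPS);
		printf("try %d: %u nhop slots -> %s\n", attempt, cur.number_nhops,
		    cur.hit_nhops ? "rebuild" : "success");
		if (!cur.hit_nhops)
			break;
		prev = cur;	/* failed instance seeds the next attempt */
		old = &prev;
	}
	return (0);
}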
+ * + */ +static enum flm_op_result +setup_instance(struct fib_lookup_module *flm, struct rib_head *rh, + struct fib_data *orig_fd, struct fib_data **pfd, bool attach) +{ + struct fib_data *prev_fd, *new_fd; + struct epoch_tracker et; + enum flm_op_result result; + + prev_fd = orig_fd; + new_fd = NULL; + for (int i = 0; i < FIB_MAX_TRIES; i++) { + NET_EPOCH_ENTER(et); + result = try_setup_instance(flm, rh, prev_fd, &new_fd); + + if ((result == FLM_SUCCESS) && attach) + result = switch_algo(new_fd); + + if ((prev_fd != NULL) && (prev_fd != orig_fd)) { + schedule_destroy_instance(prev_fd, false); + prev_fd = NULL; + } + NET_EPOCH_EXIT(et); + + RH_PRINTF(rh, "try %d: fib algo result: %d", i, result); + + if (result == FLM_REBUILD) { + prev_fd = new_fd; + new_fd = NULL; + continue; + } + + break; + } + + if (result != FLM_SUCCESS) { + /* update failure count */ + MOD_LOCK(); + if (orig_fd != NULL) + orig_fd->fd_failed_rebuilds++; + MOD_UNLOCK(); + + NET_EPOCH_ENTER(et); + if ((prev_fd != NULL) && (prev_fd != orig_fd)) + schedule_destroy_instance(prev_fd, false); + if (new_fd != NULL) { + schedule_destroy_instance(new_fd, false); + new_fd = NULL; + } + NET_EPOCH_EXIT(et); + } + + *pfd = new_fd; + return (result); +} + +static void +rebuild_callout(void *_data) +{ + struct fib_data *fd, *fd_new; + struct fib_lookup_module *flm_new; + struct epoch_tracker et; + enum flm_op_result result; + bool need_rebuild = false; + + fd = (struct fib_data *)_data; + + MOD_LOCK(); + need_rebuild = fd->fd_need_rebuild; + fd->fd_need_rebuild = false; + fd->fd_force_eval = false; + fd->fd_num_changes = 0; + MOD_UNLOCK(); + + CURVNET_SET(fd->fd_vnet); + + /* First, check if we're still OK to use this algo */ + flm_new = fib_check_best_algo(fd->fd_rh, fd->fd_flm); + if ((flm_new == NULL) && (!need_rebuild)) { + /* Keep existing algo, no need to rebuild. */ + CURVNET_RESTORE(); + return; + } + + struct fib_data *fd_tmp = (flm_new == NULL) ? 
fd : NULL; + result = setup_instance(fd->fd_flm, fd->fd_rh, fd_tmp, &fd_new, true); + if (result != FLM_SUCCESS) { + FD_PRINTF(fd, "table rebuild failed"); + CURVNET_RESTORE(); + return; + } + FD_PRINTF(fd_new, "switched to new instance"); + + /* Remove old */ + if (fd != NULL) { + NET_EPOCH_ENTER(et); + schedule_destroy_instance(fd, true); + NET_EPOCH_EXIT(et); + } + + CURVNET_RESTORE(); +} + +static int +set_algo_sysctl_handler(SYSCTL_HANDLER_ARGS) +{ + int error = 0; +#if 0 + struct epoch_tracker et; + struct fib_lookup_module *flm; + struct fib_data *old_fd, *fd; + char old_algo_name[32], algo_name[32]; + uint32_t fibnum; + int error; + + fibnum = RT_DEFAULT_FIB; + + if (old_fd == NULL) { + strlcpy(old_algo_name, "radix", sizeof(old_algo_name)); + } else { + strlcpy(old_algo_name, fd_ptr->fd_flm->flm_name, + sizeof(old_algo_name)); + } + strlcpy(algo_name, old_algo_name, sizeof(algo_name)); + error = sysctl_handle_string(oidp, algo_name, sizeof(algo_name), req); + if (error != 0 || req->newptr == NULL) + return (error); + + if (strcmp(algo_name, old_algo_name) == 0) + return (0); + + if (strcmp(algo_name, "radix") == 0) { + /* teardown old one */ + NET_EPOCH_ENTER(et); + MOD_LOCK(); + old_fd = fd_ptr; + fd_ptr = NULL; + MOD_UNLOCK(); + + if (old_fd != NULL) + schedule_destroy_instance(old_fd); + NET_EPOCH_EXIT(et); + return (0); + } + + MOD_LOCK(); + flm = find_algo(algo_name, AF_INET6); + if (flm != NULL) + flm->flm_refcount++; + MOD_UNLOCK(); + + if (flm == NULL) { + DPRINTF("unable to find algo %s", algo_name); + return (ESRCH); + } + DPRINTF("inet6.%u: requested fib algo %s", fibnum, algo_name); + + fd = setup_instance(flm, fibnum, NULL, &error); + + if (error != 0) { + MOD_LOCK(); + flm->flm_refcount--; + MOD_UNLOCK(); + return (error); + } + + MOD_LOCK(); + old_fd = fd_ptr; + fd_ptr = fd; + MOD_UNLOCK(); + + /* Remove old */ + NET_EPOCH_ENTER(et); + if (old_fd != NULL) { + error = schedule_destroy_instance(old_fd); + } + NET_EPOCH_EXIT(et); +#endif + + /* Set new */ + + /* Drain cb so user can unload the module after userret if so desired */ + epoch_drain_callbacks(net_epoch_preempt); + + return (error); +} +SYSCTL_PROC(_net_route_algo_inet6, OID_AUTO, algo, + CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0, + set_algo_sysctl_handler, "A", + "Set"); + +static void +destroy_fdh_epoch(epoch_context_t ctx) +{ + struct fib_dp_header *ffi; + + ffi = __containerof(ctx, struct fib_dp_header, ffi_epoch_ctx); + free(ffi, M_RTABLE); +} + +static struct fib_dp_header * +alloc_fib_dp_array(uint32_t num_tables, bool waitok) +{ + size_t sz; + struct fib_dp_header *ffi; + + sz = sizeof(struct fib_dp_header); + sz += sizeof(struct fib_dp) * num_tables; + ffi = malloc(sz, M_RTABLE, (waitok ? M_WAITOK : M_NOWAIT) | M_ZERO); + if (ffi != NULL) + ffi->ffi_num_tables = num_tables; + return (ffi); +} + +static struct fib_dp_header * +get_fib_dp_header(struct fib_dp *dp) +{ + + return (__containerof((void *)dp, struct fib_dp_header, ffi_idx)); +} + +/* + * Replace per-family index pool @pdp with a new one which + * contains updated callback/algo data from @fd. + * Returns 0 on success. 
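Consumers only ever see the ffi_idx[] tail of a struct fib_dp_header, and get_fib_dp_header() above recovers the header (and with it ffi_num_tables and the epoch context) from that interior pointer. A small userland illustration of the same offsetof arithmetic; the types are trimmed down and header_of() is spelled out by hand since __containerof() is a kernel macro:

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

struct dp { void *f, *arg; };

struct dp_header {
	unsigned num_tables;
	struct dp idx[];	/* per-fib slots follow the header */
};

/* Equivalent of the kernel's __containerof() for this one use. */
#define header_of(p) \
	((struct dp_header *)((char *)(p) - offsetof(struct dp_header, idx)))

int
main(void)
{
	struct dp_header *h;
	struct dp *slots;

	h = calloc(1, sizeof(*h) + 4 * sizeof(struct dp));
	h->num_tables = 4;
	slots = &h->idx[0];	/* only this pointer is published */

	/* Later, the header (and its size) can be recovered from the slots. */
	printf("recovered header %p (orig %p), %u tables\n",
	    (void *)header_of(slots), (void *)h, header_of(slots)->num_tables);
	free(h);
	return (0);
}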
+ */ +static enum flm_op_result +replace_rtables_family(struct fib_dp **pdp, struct fib_data *fd) +{ + struct fib_dp_header *new_ffi, *old_ffi; + + NET_EPOCH_ASSERT(); + + FD_PRINTF(fd, "[vnet %p] replace with f:%p arg:%p", curvnet, fd->fd_dp.f, fd->fd_dp.arg); + + MOD_LOCK(); + old_ffi = get_fib_dp_header(*pdp); + new_ffi = alloc_fib_dp_array(old_ffi->ffi_num_tables, false); + FD_PRINTF(fd, "OLD FFI: %p NEW FFI: %p", old_ffi, new_ffi); + if (new_ffi == NULL) { + MOD_UNLOCK(); + FD_PRINTF(fd, "error attaching datapath"); + return (FLM_REBUILD); + } + + memcpy(&new_ffi->ffi_idx[0], &old_ffi->ffi_idx[0], + old_ffi->ffi_num_tables * sizeof(struct fib_dp)); + /* Update relevant data structure for @fd */ + new_ffi->ffi_idx[fd->fd_fibnum] = fd->fd_dp; + + /* Ensure memcpy() writes have completed */ + atomic_thread_fence_rel(); + /* Set new datapath pointer */ + *pdp = &new_ffi->ffi_idx[0]; + MOD_UNLOCK(); + FD_PRINTF(fd, "update %p -> %p", old_ffi, new_ffi); + + epoch_call(net_epoch_preempt, destroy_fdh_epoch, + &old_ffi->ffi_epoch_ctx); + + return (FLM_SUCCESS); +} + +static struct fib_dp ** +get_family_ptr(int family) +{ + switch (family) { + case AF_INET: + return (&V_inet_dp); + case AF_INET6: + return (&V_inet6_dp); + } + return (NULL); +} + +static enum flm_op_result +switch_algo(struct fib_data *fd) +{ + struct fib_dp **pdp; + + pdp = get_family_ptr(fd->fd_family); + return (replace_rtables_family(pdp, fd)); +} + +/* + * Grow datapath pointers array. + * Called from sysctl handler on growing number of routing tables. + */ +static void +grow_rtables_family(struct fib_dp **pdp, uint32_t new_num_tables) +{ + struct fib_dp_header *new_fdh, *old_fdh = NULL; + + new_fdh = alloc_fib_dp_array(new_num_tables, true); + + MOD_LOCK(); + if (*pdp != NULL) { + old_fdh = get_fib_dp_header(*pdp); + memcpy(&new_fdh->ffi_idx[0], &old_fdh->ffi_idx[0], + old_fdh->ffi_num_tables * sizeof(struct fib_dp)); + } + + /* Wait till all writes completed */ + atomic_thread_fence_rel(); + + *pdp = &new_fdh->ffi_idx[0]; + MOD_UNLOCK(); + + if (old_fdh != NULL) + epoch_call(net_epoch_preempt, destroy_fdh_epoch, + &old_fdh->ffi_epoch_ctx); +} + +/* + * Grows per-AF arrays of datapath pointers for each supported family. + * Called from fibs resize sysctl handler. 
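replace_rtables_family() above never edits the live array in place: it clones the old one, patches the single fibnum slot, issues a release fence so the copy is visible before the new pointer is published, and defers freeing the old block to an epoch callback. A minimal single-writer model using C11 atomics; a release store replaces the explicit fence, free() stands in for epoch_call(), and readers are not shown:

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define NUM_TABLES 4

struct dp { int algo_id; };

static _Atomic(struct dp *) live_dp;	/* pointer consumed by lookups */

static void
replace_slot(unsigned fibnum, struct dp newval)
{
	struct dp *old = atomic_load_explicit(&live_dp, memory_order_acquire);
	struct dp *new = malloc(NUM_TABLES * sizeof(struct dp));

	memcpy(new, old, NUM_TABLES * sizeof(struct dp));
	new[fibnum] = newval;
	/* Make the copy visible before the pointer swap. */
	atomic_store_explicit(&live_dp, new, memory_order_release);
	free(old);	/* the kernel defers this via epoch_call() instead */
}

int
main(void)
{
	live_dp = calloc(NUM_TABLES, sizeof(struct dp));
	replace_slot(2, (struct dp){ .algo_id = 42 });
	printf("fib 2 now uses algo %d\n",
	    atomic_load_explicit(&live_dp, memory_order_acquire)[2].algo_id);
	free(live_dp);
	return (0);
}

Readers that loaded the old pointer keep using a fully valid array until the deferred free runs, which is what keeps the lookup path lockless.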
+ */ +void +fib_grow_rtables(uint32_t new_num_tables) +{ + + grow_rtables_family(get_family_ptr(AF_INET), new_num_tables); + grow_rtables_family(get_family_ptr(AF_INET6), new_num_tables); +} + +void +fib_get_rtable_info(struct rib_head *rh, struct rib_rtable_info *rinfo) +{ + + bzero(rinfo, sizeof(struct rib_rtable_info)); + rinfo->num_prefixes = rh->rnh_prefixes; + rinfo->num_nhops = nhops_get_count(rh); + rinfo->num_nhgrp = nhgrp_get_count(rh); +} + +struct rib_head * +fib_get_rh(struct fib_data *fd) +{ + + return (fd->fd_rh); +} + +static uint32_t +get_nhop_idx(struct nhop_object *nh) +{ +#ifdef ROUTE_MPATH + if (NH_IS_NHGRP(nh)) + return (nhgrp_get_idx((struct nhgrp_object *)nh) * 2 - 1); + else + return (nhop_get_idx(nh) * 2); +#else + return (nhop_get_idx(nh)); +#endif +} + + +uint32_t +fib_get_nhop_idx(struct fib_data *fd, struct nhop_object *nh) +{ + uint32_t idx = get_nhop_idx(nh); + + if (idx >= fd->number_nhops) { + fd->hit_nhops = 1; + return (0); + } + + if (fd->nh_idx[idx] == NULL) { + nhop_ref_any(nh); + fd->nh_idx[idx] = nh; + fd->nh_ref_table->count++; + FD_PRINTF(fd, " REF nhop %u %p", idx, fd->nh_idx[idx]); + } + fd->nh_ref_table->refcnt[idx]++; + + return (idx); +} + +struct nhop_release_data { + struct nhop_object *nh; + struct epoch_context ctx; +}; + +static void +release_nhop_epoch(epoch_context_t ctx) +{ + struct nhop_release_data *nrd; + + nrd = __containerof(ctx, struct nhop_release_data, ctx); + nhop_free_any(nrd->nh); + free(nrd, M_RTABLE); +} + +static void +fib_schedule_release_nhop(struct fib_data *fd, struct nhop_object *nh) +{ + struct nhop_release_data *nrd; + + nrd = malloc(sizeof(struct nhop_release_data), M_RTABLE, M_NOWAIT | M_ZERO); + if (nrd != NULL) { + nrd->nh = nh; + epoch_call(net_epoch_preempt, release_nhop_epoch, &nrd->ctx); + } else { + /* + * Unable to allocate memory. Leak nexthop to maintain guarantee + * that each nhop. + */ + FD_PRINTF(fd, "unable to allocate structure for nhop %p deletion", nh); + } +} + +void +fib_free_nhop_idx(struct fib_data *fd, uint32_t idx) +{ + + KASSERT((idx < fd->number_nhops), ("invalid nhop index")); + + fd->nh_ref_table->refcnt[idx]--; + if (fd->nh_ref_table->refcnt[idx] == 0) { + FD_PRINTF(fd, " FREE nhop %d %p", idx, fd->nh_idx[idx]); + fib_schedule_release_nhop(fd, fd->nh_idx[idx]); + } +} + +void +fib_free_nhop(struct fib_data *fd, struct nhop_object *nh) +{ + + fib_free_nhop_idx(fd, get_nhop_idx(nh)); +} + +struct nhop_object ** +fib_get_nhop_array(struct fib_data *fd) +{ + + return (fd->nh_idx); +} + +static struct fib_lookup_module * +fib_check_best_algo(struct rib_head *rh, struct fib_lookup_module *orig_flm) +{ + uint8_t preference, curr_preference = 0, best_preference = 0; + struct fib_lookup_module *flm, *best_flm = NULL; + struct rib_rtable_info rinfo; + int candidate_algos = 0; + + fib_get_rtable_info(rh, &rinfo); + + MOD_LOCK(); + TAILQ_FOREACH(flm, &all_algo_list, entries) { + if (flm->flm_family != rh->rib_family) + continue; + candidate_algos++; + preference = flm->flm_get_pref(&rinfo); + if (preference > best_preference) { + best_preference = preference; + best_flm = flm; + } + if (flm == orig_flm) + curr_preference = preference; + } + if (best_flm != NULL && best_flm != orig_flm) { + /* Check */ + if (curr_preference + BEST_DIFF_PERCENT < best_preference) + best_flm->flm_refcount++; + else + best_flm = NULL; + } else + best_flm = NULL; + MOD_UNLOCK(); + + RH_PRINTF(rh, "candidate_algos: %d, curr: %s(%d) result: %s(%d)", + candidate_algos, orig_flm ? 
orig_flm->flm_name : "NULL", curr_preference, + best_flm ? best_flm->flm_name : "NULL", best_preference); + + return (best_flm); +} + +/* + * Called when new route table is created. + * Selects, allocates and attaches fib algo for the table. + */ +int +fib_select_algo_initial(struct rib_head *rh) +{ + struct fib_lookup_module *flm; + struct fib_data *fd = NULL; + enum flm_op_result result; + + flm = fib_check_best_algo(rh, NULL); + if (flm == NULL) { + RH_PRINTF(rh, "no algo selected"); + return (ENOENT); + } + RH_PRINTF(rh, "selected algo %s", flm->flm_name); + + result = setup_instance(flm, rh, NULL, &fd, false); + RH_PRINTF(rh, "result=%d fd=%p", result, fd); + if (result == FLM_SUCCESS) { + /* + * Attach datapath directly to avoid N reallocations + * during fib growth + */ + struct fib_dp_header *fdp; + struct fib_dp **pdp; + + pdp = get_family_ptr(rh->rib_family); + if (pdp != NULL) { + fdp = get_fib_dp_header(*pdp); + fdp->ffi_idx[fd->fd_fibnum] = fd->fd_dp; + FD_PRINTF(fd, "datapath attached"); + } + } + + return (0); +} + +int +fib_module_register(struct fib_lookup_module *flm) +{ + + MOD_LOCK(); + RH_PRINTF_RAW("linking %s (%p)", flm->flm_name, flm); + TAILQ_INSERT_TAIL(&all_algo_list, flm, entries); + MOD_UNLOCK(); + + return (0); +} + +int +fib_module_unregister(struct fib_lookup_module *flm) +{ + MOD_LOCK(); + if (flm->flm_refcount > 0) { + MOD_UNLOCK(); + return (EBUSY); + } + RH_PRINTF_RAW("unlinking %s (%p)", flm->flm_name, flm); + TAILQ_REMOVE(&all_algo_list, flm, entries); + MOD_UNLOCK(); + + return (0); +} + +int +fib_module_clone(const struct fib_lookup_module *flm_orig, + struct fib_lookup_module *flm, bool waitok) +{ + + return (0); +} + +int +fib_module_dumptree(struct fib_lookup_module *flm, + enum rib_subscription_type subscription_type) +{ + + + return (0); +} + +static void +fib_algo_init(void) +{ + + mtx_init(&fib_mtx, "algo list mutex", NULL, MTX_DEF); + TAILQ_INIT(&all_algo_list); +} +SYSINIT(route_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_SECOND, fib_algo_init, NULL); + Index: sys/net/route/route_ctl.h =================================================================== --- sys/net/route/route_ctl.h +++ sys/net/route/route_ctl.h @@ -72,6 +72,8 @@ void *arg); void rib_walk_ext(uint32_t fibnum, int af, bool wlock, rib_walktree_f_t *wa_f, rib_walk_hook_f_t *hook_f, void *arg); +void rib_walk_ext_internal(struct rib_head *rnh, bool wlock, + rib_walktree_f_t *wa_f, rib_walk_hook_f_t *hook_f, void *arg); void rib_walk_del(u_int fibnum, int family, rib_filter_f_t *filter_f, void *arg, bool report); @@ -87,6 +89,10 @@ const struct rtentry *rib_lookup_lpm(uint32_t fibnum, int family, const struct sockaddr *dst, struct route_nhop_data *rnd); +/* Nhops */ +void nhop_ref_any(struct nhop_object *nh); +void nhop_free_any(struct nhop_object *nh); + /* Multipath */ struct nhgrp_object; struct weightened_nhop; @@ -109,6 +115,6 @@ struct rib_subscription *rib_subscribe_internal(struct rib_head *rnh, rib_subscription_cb_t *f, void *arg, enum rib_subscription_type type, bool waitok); -int rib_unsibscribe(uint32_t fibnum, int family, struct rib_subscription *rs); +void rib_unsibscribe(struct rib_subscription *rs); #endif Index: sys/net/route/route_ctl.c =================================================================== --- sys/net/route/route_ctl.c +++ sys/net/route/route_ctl.c @@ -70,6 +70,7 @@ CK_STAILQ_ENTRY(rib_subscription) next; rib_subscription_cb_t *func; void *arg; + struct rib_head *rnh; enum rib_subscription_type type; struct epoch_context epoch_ctx; }; @@ -669,6 +670,8 @@ /* 
Finalize notification */ rnh->rnh_gen++; + rnh->rnh_prefixes--; + rc->rc_cmd = RTM_DELETE; rc->rc_rt = rt; rc->rc_nh_old = rt->rt_nhop; @@ -929,6 +932,7 @@ /* Finalize notification */ rnh->rnh_gen++; + rnh->rnh_prefixes++; rc->rc_cmd = RTM_ADD; rc->rc_rt = rt; @@ -984,6 +988,8 @@ /* Finalize notification */ rnh->rnh_gen++; + if (rnd->rnd_nhop == NULL) + rnh->rnh_prefixes--; rc->rc_cmd = (rnd->rnd_nhop != NULL) ? RTM_CHANGE : RTM_DELETE; rc->rc_rt = rt; @@ -1222,7 +1228,7 @@ enum rib_subscription_type type, bool waitok) { struct rib_subscription *rs; - int flags = M_ZERO | (waitok ? M_WAITOK : 0); + int flags = M_ZERO | (waitok ? M_WAITOK : M_NOWAIT); rs = malloc(sizeof(struct rib_subscription), M_RTABLE, flags); if (rs == NULL) @@ -1246,22 +1252,14 @@ enum rib_subscription_type type, bool waitok) { struct rib_head *rnh; - struct rib_subscription *rs; struct epoch_tracker et; - if ((rs = allocate_subscription(f, arg, type, waitok)) == NULL) - return (NULL); - NET_EPOCH_ENTER(et); KASSERT((fibnum < rt_numfibs), ("%s: bad fibnum", __func__)); rnh = rt_tables_get_rnh(fibnum, family); - - RIB_WLOCK(rnh); - CK_STAILQ_INSERT_TAIL(&rnh->rnh_subscribers, rs, next); - RIB_WUNLOCK(rnh); NET_EPOCH_EXIT(et); - return (rs); + return (rib_subscribe_internal(rnh, f, arg, type, waitok)); } struct rib_subscription * @@ -1273,6 +1271,7 @@ if ((rs = allocate_subscription(f, arg, type, waitok)) == NULL) return (NULL); + rs->rnh = rnh; NET_EPOCH_ENTER(et); RIB_WLOCK(rnh); @@ -1284,23 +1283,15 @@ } /* - * Remove rtable subscription @rs from the table specified by @fibnum - * and @family. + * Remove rtable subscription @rs from the routing table. * Needs to be run in network epoch. - * - * Returns 0 on success. */ -int -rib_unsibscribe(uint32_t fibnum, int family, struct rib_subscription *rs) +void +rib_unsibscribe(struct rib_subscription *rs) { - struct rib_head *rnh; + struct rib_head *rnh = rs->rnh; NET_EPOCH_ASSERT(); - KASSERT((fibnum < rt_numfibs), ("%s: bad fibnum", __func__)); - rnh = rt_tables_get_rnh(fibnum, family); - - if (rnh == NULL) - return (ENOENT); RIB_WLOCK(rnh); CK_STAILQ_REMOVE(&rnh->rnh_subscribers, rs, rib_subscription, next); @@ -1308,8 +1299,6 @@ epoch_call(net_epoch_preempt, destroy_subscription_epoch, &rs->epoch_ctx); - - return (0); } /* Index: sys/net/route/route_helpers.c =================================================================== --- sys/net/route/route_helpers.c +++ sys/net/route/route_helpers.c @@ -77,14 +77,10 @@ * Table is traversed under read lock unless @wlock is set. */ void -rib_walk_ext(uint32_t fibnum, int family, bool wlock, rib_walktree_f_t *wa_f, +rib_walk_ext_internal(struct rib_head *rnh, bool wlock, rib_walktree_f_t *wa_f, rib_walk_hook_f_t *hook_f, void *arg) { RIB_RLOCK_TRACKER; - struct rib_head *rnh; - - if ((rnh = rt_tables_get_rnh(fibnum, family)) == NULL) - return; if (wlock) RIB_WLOCK(rnh); @@ -101,6 +97,16 @@ RIB_RUNLOCK(rnh); } +void +rib_walk_ext(uint32_t fibnum, int family, bool wlock, rib_walktree_f_t *wa_f, + rib_walk_hook_f_t *hook_f, void *arg) +{ + struct rib_head *rnh; + + if ((rnh = rt_tables_get_rnh(fibnum, family)) != NULL) + rib_walk_ext_internal(rnh, wlock, wa_f, hook_f, arg); +} + /* * Calls @wa_f with @arg for each entry in the table specified by * @af and @fibnum. 
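A note on the nexthop index space used by the nh_idx[] array: with ROUTE_MPATH, get_nhop_idx() earlier in route_algo.c maps plain nexthops to even slots (idx * 2) and nexthop groups to odd slots (idx * 2 - 1), so both kinds share one array without colliding. A tiny standalone illustration of that interleaving; the encode_idx() helper is hypothetical, written only to show the arithmetic:

#include <stdbool.h>
#include <stdio.h>

/*
 * Plain nexthops land on even slots, nexthop groups on odd ones,
 * so both index spaces share a single array without colliding.
 */
static unsigned
encode_idx(unsigned idx, bool is_group)
{
	return (is_group ? idx * 2 - 1 : idx * 2);
}

int
main(void)
{
	for (unsigned i = 1; i <= 3; i++)
		printf("nhop %u -> slot %u, nhgrp %u -> slot %u\n",
		    i, encode_idx(i, false), i, encode_idx(i, true));
	return (0);
}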
Index: sys/net/route/route_tables.c =================================================================== --- sys/net/route/route_tables.c +++ sys/net/route/route_tables.c @@ -171,7 +171,7 @@ grow_rtables(uint32_t num_tables) { struct domain *dom; - struct rib_head **prnh; + struct rib_head **prnh, *rh; struct rib_head **new_rt_tables, **old_rt_tables; int family; @@ -188,6 +188,8 @@ "by default. Consider tuning %s if needed\n", "net.add_addr_allfibs"); + fib_grow_rtables(num_tables); + /* * Current rt_tables layout: * fib0[af0, af1, af2, .., AF_MAX]fib1[af0, af1, af2, .., Af_MAX].. @@ -206,10 +208,16 @@ prnh = &new_rt_tables[i * (AF_MAX + 1) + family]; if (*prnh != NULL) continue; - *prnh = dom->dom_rtattach(i); - if (*prnh == NULL) - log(LOG_ERR, "unable to create routing tables for domain %d\n", - dom->dom_family); + rh = dom->dom_rtattach(i); + if (rh == NULL) + log(LOG_ERR, "unable to create routing table for %d.%d\n", + dom->dom_family, i); + if (fib_select_algo_initial(rh) != 0) { + log(LOG_ERR, "unable to select algo for table %d.%d\n", + dom->dom_family, i); + // TODO: detach table + } + *prnh = rh; } } Index: sys/net/route/route_var.h =================================================================== --- sys/net/route/route_var.h +++ sys/net/route/route_var.h @@ -68,8 +68,10 @@ struct vnet *rib_vnet; /* vnet pointer */ int rib_family; /* AF of the rtable */ u_int rib_fibnum; /* fib number */ + bool rib_dying; /* rib is detaching */ struct callout expire_callout; /* Callout for expiring dynamic routes */ time_t next_expire; /* Next expire run ts */ + uint32_t rnh_prefixes; /* Number of prefixes */ struct nh_control *nh_control; /* nexthop subsystem data */ CK_STAILQ_HEAD(, rib_subscription) rnh_subscribers;/* notification subscribers */ }; @@ -241,7 +243,6 @@ void nhops_destroy_rib(struct rib_head *rh); void nhop_ref_object(struct nhop_object *nh); int nhop_try_ref_object(struct nhop_object *nh); -void nhop_free_any(struct nhop_object *nh); void nhop_set_type(struct nhop_object *nh, enum nhop_type nh_type); void nhop_set_rtflags(struct nhop_object *nh, int rt_flags); @@ -253,6 +254,7 @@ void nhops_update_ifmtu(struct rib_head *rh, struct ifnet *ifp, uint32_t mtu); int nhops_dump_sysctl(struct rib_head *rh, struct sysctl_req *w); +uint32_t nhops_get_count(struct rib_head *rh); /* MULTIPATH */ #define MPF_MULTIPATH 0x08 /* need to be consistent with NHF_MULTIPATH */ @@ -295,6 +297,7 @@ /* nhgrp_ctl.c */ int nhgrp_dump_sysctl(struct rib_head *rh, struct sysctl_req *w); +uint32_t nhgrp_get_count(struct rib_head *rh); int nhgrp_get_group(struct rib_head *rh, struct weightened_nhop *wn, int num_nhops, struct route_nhop_data *rnd); @@ -306,7 +309,14 @@ struct route_nhop_data *rnd_new); uint32_t nhgrp_get_idx(const struct nhgrp_object *nhg); +void nhgrp_ref_object(struct nhgrp_object *nhg); void nhgrp_free(struct nhgrp_object *nhg); +uint32_t nhgrp_get_idx(const struct nhgrp_object *nhg); + +/* lookup_framework.c */ +void fib_grow_rtables(uint32_t new_num_tables); +int fib_select_algo_initial(struct rib_head *rh); +void fib_destroy_rib(struct rib_head *rh); /* Entropy data used for outbound hashing */ #define MPATH_ENTROPY_KEY_LEN 40 Index: sys/netinet/in_fib.h =================================================================== --- sys/netinet/in_fib.h +++ sys/netinet/in_fib.h @@ -45,10 +45,15 @@ struct sockaddr_in ro_dst4; }; +struct rtentry; +struct route_nhop_data; + struct nhop_object *fib4_lookup(uint32_t fibnum, struct in_addr dst, uint32_t scopeid, uint32_t flags, uint32_t flowid); 
int fib4_check_urpf(uint32_t fibnum, struct in_addr dst, uint32_t scopeid, uint32_t flags, const struct ifnet *src_if); +struct rtentry *fib4_lookup_rt(uint32_t fibnum, struct in_addr dst, uint32_t scopeid, + uint32_t flags, struct route_nhop_data *nrd); struct nhop_object *fib4_lookup_debugnet(uint32_t fibnum, struct in_addr dst, uint32_t scopeid, uint32_t flags); uint32_t fib4_calc_software_hash(struct in_addr src, struct in_addr dst, Index: sys/netinet/in_fib.c =================================================================== --- sys/netinet/in_fib.c +++ sys/netinet/in_fib.c @@ -49,6 +49,7 @@ #include #include #include +#include #include #include #include @@ -63,6 +64,10 @@ /* Assert 'struct route_in' is compatible with 'struct route' */ CHK_STRUCT_ROUTE_COMPAT(struct route_in, ro_dst4); +#ifdef ROUTE_ALGO +VNET_DEFINE(struct fib_dp *, inet_dp); +#endif + #ifdef ROUTE_MPATH struct _hash_5tuple_ipv4 { struct in_addr src; @@ -75,7 +80,6 @@ _Static_assert(sizeof(struct _hash_5tuple_ipv4) == 16, "_hash_5tuple_ipv4 size is wrong"); - uint32_t fib4_calc_software_hash(struct in_addr src, struct in_addr dst, unsigned short src_port, unsigned short dst_port, char proto, @@ -104,6 +108,29 @@ * one needs to pass NHR_REF as a flag. This will return referenced * nexthop. */ +#ifdef ROUTE_ALGO +struct nhop_object * +fib4_lookup(uint32_t fibnum, struct in_addr dst, uint32_t scopeid, + uint32_t flags, uint32_t flowid) +{ + struct nhop_object *nh; + struct fib_dp *dp = &V_inet_dp[fibnum]; + struct flm_lookup_key key = {.addr4 = dst }; + + nh = dp->f(dp->arg, key, scopeid); + if (nh != NULL) { + nh = nhop_select(nh, flowid); + /* Ensure route & ifp is UP */ + if (RT_LINK_IS_UP(nh->nh_ifp)) { + if (flags & NHR_REF) + nhop_ref_object(nh); + return (nh); + } + } + RTSTAT_INC(rts_unreach); + return (NULL); +} +#else struct nhop_object * fib4_lookup(uint32_t fibnum, struct in_addr dst, uint32_t scopeid, uint32_t flags, uint32_t flowid) @@ -143,6 +170,7 @@ RTSTAT_INC(rts_unreach); return (NULL); } +#endif inline static int check_urpf_nhop(const struct nhop_object *nh, uint32_t flags, @@ -181,28 +209,19 @@ return (check_urpf_nhop(nh, flags, src_if)); } -/* - * Performs reverse path forwarding lookup. - * If @src_if is non-zero, verifies that at least 1 path goes via - * this interface. - * If @src_if is zero, verifies that route exist. - * if @flags contains NHR_NOTDEFAULT, do not consider default route. - * - * Returns 1 if route matching conditions is found, 0 otherwise. - */ -int -fib4_check_urpf(uint32_t fibnum, struct in_addr dst, uint32_t scopeid, - uint32_t flags, const struct ifnet *src_if) +#ifndef ROUTE_ALGO +static struct nhop_object * +lookup_nhop(uint32_t fibnum, struct in_addr dst, uint32_t scopeid) { RIB_RLOCK_TRACKER; struct rib_head *rh; struct radix_node *rn; - int ret; + struct nhop_object *nh; KASSERT((fibnum < rt_numfibs), ("fib4_check_urpf: bad fibnum")); rh = rt_tables_get_rnh(fibnum, AF_INET); if (rh == NULL) - return (0); + return (NULL); /* Prepare lookup key */ struct sockaddr_in sin4; @@ -210,49 +229,94 @@ sin4.sin_len = sizeof(struct sockaddr_in); sin4.sin_addr = dst; + nh = NULL; RIB_RLOCK(rh); rn = rh->rnh_matchaddr((void *)&sin4, &rh->head); - if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) { - ret = check_urpf(RNTORT(rn)->rt_nhop, flags, src_if); - RIB_RUNLOCK(rh); - return (ret); - } + if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) + nh = RNTORT(rn)->rt_nhop; RIB_RUNLOCK(rh); + return (nh); +} +#endif + +/* + * Performs reverse path forwarding lookup. 
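With ROUTE_ALGO defined, the fib4_lookup() variant above no longer walks the radix tree itself: it indexes the per-VNET V_inet_dp array by fibnum and calls whatever lookup function the active module installed, passing the module's opaque argument and a flm_lookup_key. A userland model of that single level of indirection; the struct names are shortened, dummy_lookup() is invented, and an int stands in for the nexthop object:

#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <stdio.h>

struct lookup_key { struct in_addr addr4; };

/* One entry per fib: the module's lookup function plus its private arg. */
struct dp {
	int (*f)(void *arg, struct lookup_key key, unsigned scopeid);
	void *arg;
};

static int
dummy_lookup(void *arg, struct lookup_key key, unsigned scopeid)
{
	(void)key; (void)scopeid;
	return (*(int *)arg);	/* pretend every address resolves to this nhop */
}

int
main(void)
{
	int default_nhop = 7;
	struct dp dptable[2] = { { dummy_lookup, &default_nhop },
				 { dummy_lookup, &default_nhop } };
	struct lookup_key key;

	inet_pton(AF_INET, "192.0.2.1", &key.addr4);
	/* The datapath only ever does: dp = &table[fibnum]; dp->f(dp->arg, ...) */
	printf("fib 0 lookup -> nhop %d\n", dptable[0].f(dptable[0].arg, key, 0));
	return (0);
}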
+ * If @src_if is non-zero, verifies that at least 1 path goes via + * this interface. + * If @src_if is zero, verifies that route exist. + * if @flags contains NHR_NOTDEFAULT, do not consider default route. + * + * Returns 1 if route matching conditions is found, 0 otherwise. + */ +int +fib4_check_urpf(uint32_t fibnum, struct in_addr dst, uint32_t scopeid, + uint32_t flags, const struct ifnet *src_if) +{ + struct nhop_object *nh; +#ifdef ROUTE_ALGO + struct fib_dp *dp = &V_inet_dp[fibnum]; + struct flm_lookup_key key = {.addr4 = dst }; + + nh = dp->f(dp->arg, key, scopeid); +#else + nh = lookup_nhop(fibnum, dst, scopeid); +#endif + if (nh != NULL) + return (check_urpf(nh, flags, src_if)); + return (0); } -struct nhop_object * -fib4_lookup_debugnet(uint32_t fibnum, struct in_addr dst, uint32_t scopeid, - uint32_t flags) +struct rtentry * +fib4_lookup_rt(uint32_t fibnum, struct in_addr dst, uint32_t scopeid, + uint32_t flags, struct route_nhop_data *rnd) { + RIB_RLOCK_TRACKER; struct rib_head *rh; struct radix_node *rn; - struct nhop_object *nh; + struct rtentry *rt; - KASSERT((fibnum < rt_numfibs), ("fib4_lookup_debugnet: bad fibnum")); + KASSERT((fibnum < rt_numfibs), ("fib4_lookup_rt: bad fibnum")); rh = rt_tables_get_rnh(fibnum, AF_INET); if (rh == NULL) return (NULL); /* Prepare lookup key */ - struct sockaddr_in sin4; - memset(&sin4, 0, sizeof(sin4)); - sin4.sin_family = AF_INET; - sin4.sin_len = sizeof(struct sockaddr_in); - sin4.sin_addr = dst; - - nh = NULL; - /* unlocked lookup */ + struct sockaddr_in sin4 = { + .sin_family = AF_INET, + .sin_len = sizeof(struct sockaddr_in), + .sin_addr = dst, + }; + + rt = NULL; + if (!(flags & NHR_UNLOCKED)) + RIB_RLOCK(rh); rn = rh->rnh_matchaddr((void *)&sin4, &rh->head); if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) { - nh = nhop_select((RNTORT(rn))->rt_nhop, 0); + rt = (struct rtentry *)rn; + rnd->rnd_nhop = rt->rt_nhop; + rnd->rnd_weight = rt->rt_weight; + } + if (!(flags & NHR_UNLOCKED)) + RIB_RUNLOCK(rh); + + return (rt); +} + +struct nhop_object * +fib4_lookup_debugnet(uint32_t fibnum, struct in_addr dst, uint32_t scopeid, + uint32_t flags) +{ + struct rtentry *rt; + struct route_nhop_data rnd; + + rt = fib4_lookup_rt(fibnum, dst, scopeid, NHR_UNLOCKED, &rnd); + if (rt != NULL) { + struct nhop_object *nh = nhop_select(rnd.rnd_nhop, 0); /* Ensure route & ifp is UP */ - if (RT_LINK_IS_UP(nh->nh_ifp)) { - if (flags & NHR_REF) - nhop_ref_object(nh); + if (RT_LINK_IS_UP(nh->nh_ifp)) return (nh); - } } return (NULL); Index: sys/netinet/in_fib_algo.c =================================================================== --- /dev/null +++ sys/netinet/in_fib_algo.c @@ -0,0 +1,315 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2020 Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include + + +#define KEY_LEN_INET (offsetof(struct sockaddr_in, sin_addr) + sizeof(in_addr_t)) +#define OFF_LEN_INET (8 * offsetof(struct sockaddr_in, sin_addr)) +struct radix4_addr_entry { + struct radix_node rn[2]; + struct sockaddr_in addr; + struct nhop_object *nhop; +}; +#define LRADIX4_ITEM_SZ roundup2(sizeof(struct radix4_addr_entry), 64) + +struct lradix4_data { + struct radix_node_head *rnh; + struct fib_data *fd; + void *mem; + uint32_t alloc_items; + uint32_t num_items; +}; + +static struct nhop_object * +lradix4_lookup(void *algo_data, const struct flm_lookup_key key, uint32_t scopeid) +{ + struct radix_node_head *rnh = (struct radix_node_head *)algo_data; + struct radix4_addr_entry *ent; + struct sockaddr_in addr4 = { + .sin_len = KEY_LEN_INET, + .sin_addr = key.addr4, + }; + ent = (struct radix4_addr_entry *)(rnh->rnh_matchaddr(&addr4, &rnh->rh)); + if (ent != NULL) + return (ent->nhop); + return (NULL); +} + +static uint8_t +lradix4_get_pref(const struct rib_rtable_info *rinfo) +{ + + if (rinfo->num_prefixes < 10) + return (255); + else if (rinfo->num_prefixes < 100000) + return (255 - rinfo->num_prefixes / 394); + else + return (1); +} + +static enum flm_op_result +lradix4_init(uint32_t fibnum, struct fib_data *fd, void *_old_data, void **_data) +{ + struct lradix4_data *lr; + struct rib_rtable_info rinfo; + uint32_t count; + + lr = malloc(sizeof(struct lradix4_data), M_RTABLE, M_NOWAIT | M_ZERO); + if (lr == NULL || !rn_inithead((void **)&lr->rnh, OFF_LEN_INET)) + return (FLM_REBUILD); + fib_get_rtable_info(fib_get_rh(fd), &rinfo); + + count = rinfo.num_prefixes * 11 / 10; + // XXX: alignment! 
+ lr->mem = malloc(count * LRADIX4_ITEM_SZ, M_RTABLE, M_NOWAIT | M_ZERO); + if (lr->mem == NULL) + return (FLM_REBUILD); + lr->alloc_items = count; + lr->fd = fd; + + *_data = lr; + + return (FLM_SUCCESS); +} + +static void +lradix4_destroy(void *_data) +{ + struct lradix4_data *lr = (struct lradix4_data *)_data; + + if (lr->rnh != NULL) + rn_detachhead((void **)&lr->rnh); + if (lr->mem != NULL) + free(lr->mem, M_RTABLE); + free(lr, M_RTABLE); +} + +static enum flm_op_result +lradix4_add_route_cb(struct rtentry *rt, void *_data) +{ + struct lradix4_data *lr = (struct lradix4_data *)_data; + struct radix4_addr_entry *ae; + struct sockaddr_in *rt_dst, *rt_mask, mask; + struct radix_node *rn; + + if (fib_get_nhop_idx(lr->fd, rt->rt_nhop) == 0) + return (FLM_REBUILD); + + if (lr->num_items >= lr->alloc_items) + return (FLM_REBUILD); + + ae = (struct radix4_addr_entry *)((char *)lr->mem + lr->num_items * LRADIX4_ITEM_SZ); + lr->num_items++; + + ae->nhop = rt->rt_nhop; + + rt_dst = (struct sockaddr_in *)rt_key(rt); + rt_mask = (struct sockaddr_in *)rt_mask(rt); + + ae->addr.sin_len = KEY_LEN_INET; + ae->addr.sin_addr = rt_dst->sin_addr; + + if (rt_mask != NULL) { + bzero(&mask, sizeof(mask)); + mask.sin_len = KEY_LEN_INET; + mask.sin_addr = rt_mask->sin_addr; + rt_mask = &mask; + } + + rn = lr->rnh->rnh_addaddr((struct sockaddr *)&ae->addr, + (struct sockaddr *)rt_mask, &lr->rnh->rh, ae->rn); + if (rn == NULL) + return (FLM_REBUILD); + + return (FLM_SUCCESS); +} + +static enum flm_op_result +lradix4_end_dump(void *_data, struct fib_dp *dp) +{ + struct lradix4_data *lr = (struct lradix4_data *)_data; + + dp->f = lradix4_lookup; + dp->arg = lr->rnh; + + return (FLM_SUCCESS); +} + +static enum flm_op_result +lradix4_change_cb(struct rib_head *rnh, struct rib_cmd_info *rc, + void *_data) +{ + + return (FLM_REBUILD); +} + +struct fib_lookup_module flm_radix4_lockless = { + .flm_name = "radix4_lockless", + .flm_family = AF_INET, + .flm_init_cb = lradix4_init, + .flm_destroy_cb = lradix4_destroy, + .flm_dump_rib_item_cb = lradix4_add_route_cb, + .flm_dump_end_cb = lradix4_end_dump, + .flm_change_rib_item_cb = lradix4_change_cb, + .flm_get_pref = lradix4_get_pref, +}; + + +struct radix4_data { + struct fib_data *fd; + struct rib_head *rh; +}; + +static struct nhop_object * +radix4_lookup(void *algo_data, const struct flm_lookup_key key, uint32_t scopeid) +{ + RIB_RLOCK_TRACKER; + struct rib_head *rh = (struct rib_head *)algo_data; + struct radix_node *rn; + struct nhop_object *nh; + + /* Prepare lookup key */ + struct sockaddr_in sin4 = { + .sin_family = AF_INET, + .sin_len = sizeof(struct sockaddr_in), + .sin_addr = key.addr4, + }; + + nh = NULL; + RIB_RLOCK(rh); + rn = rh->rnh_matchaddr((void *)&sin4, &rh->head); + if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) + nh = (RNTORT(rn))->rt_nhop; + RIB_RUNLOCK(rh); + + return (nh); +} + +static uint8_t +radix4_get_pref(const struct rib_rtable_info *rinfo) +{ + + return (50); +} + +static enum flm_op_result +radix4_init(uint32_t fibnum, struct fib_data *fd, void *_old_data, void **_data) +{ + struct radix4_data *r4; + + r4 = malloc(sizeof(struct radix4_data), M_RTABLE, M_NOWAIT | M_ZERO); + if (r4 == NULL) + return (FLM_REBUILD); + r4->fd = fd; + r4->rh = fib_get_rh(fd); + if (r4->rh == NULL) + return (FLM_ERROR); + + *_data = r4; + + return (FLM_SUCCESS); +} + +static void +radix4_destroy(void *_data) +{ + + free(_data, M_RTABLE); +} + +static enum flm_op_result +radix4_end_dump(void *_data, struct fib_dp *dp) +{ + struct radix4_data *r4 = (struct 
radix4_data *)_data; + + dp->f = radix4_lookup; + dp->arg = r4->rh; + + return (FLM_SUCCESS); +} + +static enum flm_op_result +radix4_change_cb(struct rib_head *rnh, struct rib_cmd_info *rc, + void *_data) +{ + struct radix4_data *r4 = (struct radix4_data *)_data; + + /* + * Grab additional reference for each nexthop to maintain guarantee + * that we have non-zero # of reference for each nexthop in radix in + * the epoch. + */ + if (rc->rc_nh_new != NULL) { + if (fib_get_nhop_idx(r4->fd, rc->rc_nh_new) == 0) + return (FLM_REBUILD); + } + if (rc->rc_nh_old != NULL) + fib_free_nhop(r4->fd, rc->rc_nh_old); + + return (FLM_SUCCESS); +} + +struct fib_lookup_module flm_radix4 = { + .flm_name = "radix4", + .flm_family = AF_INET, + .flm_init_cb = radix4_init, + .flm_destroy_cb = radix4_destroy, + .flm_dump_end_cb = radix4_end_dump, + .flm_change_rib_item_cb = radix4_change_cb, + .flm_get_pref = radix4_get_pref, +}; + +static void +fib4_algo_init(void) +{ + + fib_module_register(&flm_radix4_lockless); + fib_module_register(&flm_radix4); +} +SYSINIT(fib4_algo_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, fib4_algo_init, NULL); Index: sys/netinet6/in6_fib.h =================================================================== --- sys/netinet6/in6_fib.h +++ sys/netinet6/in6_fib.h @@ -32,11 +32,16 @@ #ifndef _NETINET6_IN6_FIB_H_ #define _NETINET6_IN6_FIB_H_ +struct rtentry; +struct route_nhop_data; + struct nhop_object *fib6_lookup(uint32_t fibnum, const struct in6_addr *dst6, uint32_t scopeid, uint32_t flags, uint32_t flowid); int fib6_check_urpf(uint32_t fibnum, const struct in6_addr *dst6, uint32_t scopeid, uint32_t flags, const struct ifnet *src_if); +struct rtentry *fib6_lookup_rt(uint32_t fibnum, const struct in6_addr *dst6, + uint32_t scopeid, uint32_t flags, struct route_nhop_data *rnd); struct nhop_object *fib6_lookup_debugnet(uint32_t fibnum, const struct in6_addr *dst6, uint32_t scopeid, uint32_t flags); uint32_t fib6_calc_software_hash(const struct in6_addr *src, Index: sys/netinet6/in6_fib.c =================================================================== --- sys/netinet6/in6_fib.c +++ sys/netinet6/in6_fib.c @@ -50,6 +50,7 @@ #include #include #include +#include #include #include #include @@ -69,6 +70,10 @@ CHK_STRUCT_ROUTE_COMPAT(struct route_in6, ro_dst); +#ifdef ROUTE_ALGO +VNET_DEFINE(struct fib_dp *, inet6_dp); +#endif + #ifdef ROUTE_MPATH struct _hash_5tuple_ipv6 { struct in6_addr src; @@ -81,6 +86,7 @@ _Static_assert(sizeof(struct _hash_5tuple_ipv6) == 40, "_hash_5tuple_ipv6 size is wrong"); + uint32_t fib6_calc_software_hash(const struct in6_addr *src, const struct in6_addr *dst, unsigned short src_port, unsigned short dst_port, char proto, @@ -111,6 +117,29 @@ * one needs to pass NHR_REF as a flag. This will return referenced * nexthop. 
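Stepping back to algorithm selection: the flm_get_pref callbacks seen earlier (lradix4_get_pref() favouring the lockless copy for small tables and decaying towards 1, radix4_get_pref() reporting a flat 50) are what fib_check_best_algo() compares, and it only switches when the challenger beats the incumbent by BEST_DIFF_PERCENT, i.e. 5% expressed in 1/256 units. A small sketch of how those numbers interact; the prefix counts and the should_switch() helper are illustrative only:

#include <stdbool.h>
#include <stdio.h>

#define BEST_DIFF_PERCENT	(5 * 256 / 100)	/* == 12 */

/* Decaying preference of the lockless radix copy, as in lradix4_get_pref(). */
static unsigned
lockless_pref(unsigned num_prefixes)
{
	if (num_prefixes < 10)
		return (255);
	else if (num_prefixes < 100000)
		return (255 - num_prefixes / 394);
	else
		return (1);
}

/* Switch only if the challenger clears the incumbent by the margin. */
static bool
should_switch(unsigned curr, unsigned best)
{
	return (curr + BEST_DIFF_PERCENT < best);
}

int
main(void)
{
	unsigned counts[] = { 5, 1000, 50000, 200000 };

	for (int i = 0; i < 4; i++) {
		unsigned ll = lockless_pref(counts[i]), locked = 50;
		printf("%6u prefixes: lockless=%3u locked=%3u -> switch to lockless? %s\n",
		    counts[i], ll, locked, should_switch(locked, ll) ? "yes" : "no");
	}
	return (0);
}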
*/ +#ifdef ROUTE_ALGO +struct nhop_object * +fib6_lookup(uint32_t fibnum, const struct in6_addr *dst6, + uint32_t scopeid, uint32_t flags, uint32_t flowid) +{ + struct nhop_object *nh; + struct fib_dp *dp = &V_inet6_dp[fibnum]; + struct flm_lookup_key key = {.addr6 = dst6 }; + + nh = dp->f(dp->arg, key, scopeid); + if (nh != NULL) { + nh = nhop_select(nh, flowid); + /* Ensure route & ifp is UP */ + if (RT_LINK_IS_UP(nh->nh_ifp)) { + if (flags & NHR_REF) + nhop_ref_object(nh); + return (nh); + } + } + RTSTAT_INC(rts_unreach); + return (NULL); +} +#else struct nhop_object * fib6_lookup(uint32_t fibnum, const struct in6_addr *dst6, uint32_t scopeid, uint32_t flags, uint32_t flowid) @@ -119,7 +148,6 @@ struct rib_head *rh; struct radix_node *rn; struct nhop_object *nh; - struct sockaddr_in6 sin6; KASSERT((fibnum < rt_numfibs), ("fib6_lookup: bad fibnum")); rh = rt_tables_get_rnh(fibnum, AF_INET6); @@ -127,11 +155,10 @@ return (NULL); /* TODO: radix changes */ - //addr = *dst6; - /* Prepare lookup key */ - memset(&sin6, 0, sizeof(sin6)); - sin6.sin6_len = sizeof(struct sockaddr_in6); - sin6.sin6_addr = *dst6; + struct sockaddr_in6 sin6 = { + .sin6_len = sizeof(struct sockaddr_in6), + .sin6_addr = *dst6, + }; /* Assume scopeid is valid and embed it directly */ if (IN6_IS_SCOPE_LINKLOCAL(dst6)) @@ -154,6 +181,7 @@ RTSTAT_INC(rts_unreach); return (NULL); } +#endif inline static int check_urpf_nhop(const struct nhop_object *nh, uint32_t flags, @@ -192,60 +220,75 @@ return (check_urpf_nhop(nh, flags, src_if)); } -/* - * Performs reverse path forwarding lookup. - * If @src_if is non-zero, verifies that at least 1 path goes via - * this interface. - * If @src_if is zero, verifies that route exist. - * if @flags contains NHR_NOTDEFAULT, do not consider default route. - * - * Returns 1 if route matching conditions is found, 0 otherwise. - */ -int -fib6_check_urpf(uint32_t fibnum, const struct in6_addr *dst6, - uint32_t scopeid, uint32_t flags, const struct ifnet *src_if) +static struct nhop_object * +lookup_nhop(uint32_t fibnum, const struct in6_addr *dst6, + uint32_t scopeid) { RIB_RLOCK_TRACKER; struct rib_head *rh; struct radix_node *rn; - struct sockaddr_in6 sin6; - int ret; + struct nhop_object *nh; KASSERT((fibnum < rt_numfibs), ("fib6_check_urpf: bad fibnum")); rh = rt_tables_get_rnh(fibnum, AF_INET6); if (rh == NULL) - return (0); + return (NULL); - /* TODO: radix changes */ /* Prepare lookup key */ - memset(&sin6, 0, sizeof(sin6)); - sin6.sin6_len = sizeof(struct sockaddr_in6); - sin6.sin6_addr = *dst6; + struct sockaddr_in6 sin6 = { + .sin6_len = sizeof(struct sockaddr_in6), + .sin6_addr = *dst6, + }; /* Assume scopeid is valid and embed it directly */ if (IN6_IS_SCOPE_LINKLOCAL(dst6)) sin6.sin6_addr.s6_addr16[1] = htons(scopeid & 0xffff); + nh = NULL; RIB_RLOCK(rh); rn = rh->rnh_matchaddr((void *)&sin6, &rh->head); - if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) { - ret = check_urpf(RNTORT(rn)->rt_nhop, flags, src_if); - RIB_RUNLOCK(rh); - return (ret); - } + if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) + nh = RNTORT(rn)->rt_nhop; RIB_RUNLOCK(rh); + return (nh); +} + +/* + * Performs reverse path forwarding lookup. + * If @src_if is non-zero, verifies that at least 1 path goes via + * this interface. + * If @src_if is zero, verifies that route exist. + * if @flags contains NHR_NOTDEFAULT, do not consider default route. + * + * Returns 1 if route matching conditions is found, 0 otherwise. 
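Both the pre-existing lookup path here and the radix6 modules below embed the scope id into the second 16-bit word of a link-local destination before consulting the radix tree, so fe80::/10 prefixes learned on different links do not collide. A tiny userland illustration of that embedding; it uses the portable IN6_IS_ADDR_LINKLOCAL() check instead of the kernel's IN6_IS_SCOPE_LINKLOCAL(), and the scope id value is arbitrary:

#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
	struct in6_addr dst;
	char buf[INET6_ADDRSTRLEN];
	uint16_t scope_be = htons(3);	/* e.g. the link's ifindex */

	inet_pton(AF_INET6, "fe80::1", &dst);
	if (IN6_IS_ADDR_LINKLOCAL(&dst)) {
		/* Embed the scope into the 2nd 16-bit word, as the lookup code does. */
		memcpy(&dst.s6_addr[2], &scope_be, sizeof(scope_be));
	}
	inet_ntop(AF_INET6, &dst, buf, sizeof(buf));
	printf("key used for the radix lookup: %s\n", buf);	/* fe80:3::1 */
	return (0);
}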
+ */ +int +fib6_check_urpf(uint32_t fibnum, const struct in6_addr *dst6, + uint32_t scopeid, uint32_t flags, const struct ifnet *src_if) +{ + struct nhop_object *nh; +#ifndef ROUTE_ALGO + struct fib_dp *dp = &V_inet6_dp[fibnum]; + struct flm_lookup_key key = {.addr6 = dst6 }; + + nh = dp->f(dp->arg, key, scopeid); +#else + nh = lookup_nhop(fibnum, dst6, scopeid); +#endif + if (nh != NULL) + return (check_urpf(nh, flags, src_if)); return (0); } -struct nhop_object * -fib6_lookup_debugnet(uint32_t fibnum, const struct in6_addr *dst6, - uint32_t scopeid, uint32_t flags) +struct rtentry * +fib6_lookup_rt(uint32_t fibnum, const struct in6_addr *dst6, + uint32_t scopeid, uint32_t flags, struct route_nhop_data *rnd) { + RIB_RLOCK_TRACKER; struct rib_head *rh; struct radix_node *rn; - struct nhop_object *nh; - struct sockaddr_in6 sin6; + struct rtentry *rt; KASSERT((fibnum < rt_numfibs), ("fib6_lookup: bad fibnum")); rh = rt_tables_get_rnh(fibnum, AF_INET6); @@ -253,25 +296,43 @@ return (NULL); /* TODO: radix changes */ - //addr = *dst6; - /* Prepare lookup key */ - memset(&sin6, 0, sizeof(sin6)); - sin6.sin6_len = sizeof(struct sockaddr_in6); - sin6.sin6_addr = *dst6; + struct sockaddr_in6 sin6 = { + .sin6_len = sizeof(struct sockaddr_in6), + .sin6_addr = *dst6, + }; /* Assume scopeid is valid and embed it directly */ if (IN6_IS_SCOPE_LINKLOCAL(dst6)) sin6.sin6_addr.s6_addr16[1] = htons(scopeid & 0xffff); + rt = NULL; + if (!(flags & NHR_UNLOCKED)) + RIB_RLOCK(rh); rn = rh->rnh_matchaddr((void *)&sin6, &rh->head); if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) { - nh = nhop_select((RNTORT(rn))->rt_nhop, 0); + rt = (struct rtentry *)rn; + rnd->rnd_nhop = rt->rt_nhop; + rnd->rnd_weight = rt->rt_weight; + } + if (!(flags & NHR_UNLOCKED)) + RIB_RUNLOCK(rh); + + return (rt); +} + +struct nhop_object * +fib6_lookup_debugnet(uint32_t fibnum, const struct in6_addr *dst6, + uint32_t scopeid, uint32_t flags) +{ + struct rtentry *rt; + struct route_nhop_data rnd; + + rt = fib6_lookup_rt(fibnum, dst6, scopeid, NHR_UNLOCKED, &rnd); + if (rt != NULL) { + struct nhop_object *nh = nhop_select(rnd.rnd_nhop, 0); /* Ensure route & ifp is UP */ - if (RT_LINK_IS_UP(nh->nh_ifp)) { - if (flags & NHR_REF) - nhop_ref_object(nh); + if (RT_LINK_IS_UP(nh->nh_ifp)) return (nh); - } } return (NULL); Index: sys/netinet6/in6_fib_algo.c =================================================================== --- /dev/null +++ sys/netinet6/in6_fib_algo.c @@ -0,0 +1,338 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2020 Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include "opt_inet6.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#define RTDEBUG + +#define KEY_LEN_INET6 (offsetof(struct sa_in6, sin6_addr) + sizeof(struct in6_addr)) +#define OFF_LEN_INET6 (8 * offsetof(struct sa_in6, sin6_addr)) +struct sa_in6 { + uint8_t sin6_len; + uint8_t sin6_family; + uint8_t pad[2]; + struct in6_addr sin6_addr; +}; +struct radix6_addr_entry { + struct radix_node rn[2]; + struct sa_in6 addr; + struct nhop_object *nhop; +}; +#define LRADIX6_ITEM_SZ roundup2(sizeof(struct radix6_addr_entry), 64) + +struct lradix6_data { + struct radix_node_head *rnh; + struct fib_data *fd; + void *mem; + uint32_t alloc_items; + uint32_t num_items; +}; + +static struct nhop_object * +lradix6_lookup(void *algo_data, const struct flm_lookup_key key, uint32_t scopeid) +{ + struct radix_node_head *rnh = (struct radix_node_head *)algo_data; + struct radix6_addr_entry *ent; + struct sockaddr_in6 addr6 = { + .sin6_len = KEY_LEN_INET6, + .sin6_addr = *key.addr6, + }; + ent = (struct radix6_addr_entry *)(rnh->rnh_matchaddr(&addr6, &rnh->rh)); + if (ent != NULL) + return (ent->nhop); + return (NULL); +} + +static uint8_t +lradix6_get_pref(const struct rib_rtable_info *rinfo) +{ + + if (rinfo->num_prefixes < 10) + return (255); + else if (rinfo->num_prefixes < 100000) + return (255 - rinfo->num_prefixes / 394); + else + return (1); +} + +static enum flm_op_result +lradix6_init(uint32_t fibnum, struct fib_data *fd, void *_old_data, void **_data) +{ + struct lradix6_data *lr; + struct rib_rtable_info rinfo; + uint32_t count; + + lr = malloc(sizeof(struct lradix6_data), M_RTABLE, M_NOWAIT | M_ZERO); + if (lr == NULL || !rn_inithead((void **)&lr->rnh, OFF_LEN_INET6)) + return (FLM_REBUILD); + fib_get_rtable_info(fib_get_rh(fd), &rinfo); + + count = rinfo.num_prefixes * 11 / 10; + // XXX: alignment! 
+ lr->mem = malloc(count * LRADIX6_ITEM_SZ, M_RTABLE, M_NOWAIT | M_ZERO); + if (lr->mem == NULL) + return (FLM_REBUILD); + lr->alloc_items = count; + lr->fd = fd; + + *_data = lr; + + return (FLM_SUCCESS); +} + +static void +lradix6_destroy(void *_data) +{ + struct lradix6_data *lr = (struct lradix6_data *)_data; + + if (lr->rnh != NULL) + rn_detachhead((void **)&lr->rnh); + if (lr->mem != NULL) + free(lr->mem, M_RTABLE); + free(lr, M_RTABLE); +} + +static enum flm_op_result +lradix6_add_route_cb(struct rtentry *rt, void *_data) +{ + struct lradix6_data *lr = (struct lradix6_data *)_data; + struct radix6_addr_entry *ae; + struct sockaddr_in6 *rt_dst, *rt_mask; + struct sa_in6 mask; + struct radix_node *rn; + + if (fib_get_nhop_idx(lr->fd, rt->rt_nhop) == 0) + return (FLM_REBUILD); + + if (lr->num_items >= lr->alloc_items) + return (FLM_REBUILD); + + ae = (struct radix6_addr_entry *)((char *)lr->mem + lr->num_items * LRADIX6_ITEM_SZ); + lr->num_items++; + + ae->nhop = rt->rt_nhop; + + rt_dst = (struct sockaddr_in6 *)rt_key(rt); + rt_mask = (struct sockaddr_in6 *)rt_mask(rt); + + ae->addr.sin6_len = KEY_LEN_INET6; + ae->addr.sin6_addr = rt_dst->sin6_addr; + + if (rt_mask != NULL) { + bzero(&mask, sizeof(mask)); + mask.sin6_len = KEY_LEN_INET6; + mask.sin6_addr = rt_mask->sin6_addr; + rt_mask = (struct sockaddr_in6 *)&mask; + } + + rn = lr->rnh->rnh_addaddr((struct sockaddr *)&ae->addr, + (struct sockaddr *)rt_mask, &lr->rnh->rh, ae->rn); + if (rn == NULL) + return (FLM_REBUILD); + + return (FLM_SUCCESS); +} + +static enum flm_op_result +lradix6_end_dump(void *_data, struct fib_dp *dp) +{ + struct lradix6_data *lr = (struct lradix6_data *)_data; + + dp->f = lradix6_lookup; + dp->arg = lr->rnh; + + return (FLM_SUCCESS); +} + +static enum flm_op_result +lradix6_change_cb(struct rib_head *rnh, struct rib_cmd_info *rc, + void *_data) +{ + + return (FLM_REBUILD); +} + +struct fib_lookup_module flm_radix6_lockless = { + .flm_name = "radix6_lockless", + .flm_family = AF_INET6, + .flm_init_cb = lradix6_init, + .flm_destroy_cb = lradix6_destroy, + .flm_dump_rib_item_cb = lradix6_add_route_cb, + .flm_dump_end_cb = lradix6_end_dump, + .flm_change_rib_item_cb = lradix6_change_cb, + .flm_get_pref = lradix6_get_pref, +}; + + +struct radix6_data { + struct fib_data *fd; + struct rib_head *rh; +}; + +static struct nhop_object * +radix6_lookup(void *algo_data, const struct flm_lookup_key key, uint32_t scopeid) +{ + RIB_RLOCK_TRACKER; + struct rib_head *rh = (struct rib_head *)algo_data; + struct radix_node *rn; + struct nhop_object *nh; + + /* Prepare lookup key */ + struct sockaddr_in6 sin6 = { + .sin6_family = AF_INET6, + .sin6_len = sizeof(struct sockaddr_in6), + .sin6_addr = *key.addr6, + }; + if (IN6_IS_SCOPE_LINKLOCAL(key.addr6)) + sin6.sin6_addr.s6_addr16[1] = htons(scopeid & 0xffff); + + nh = NULL; + RIB_RLOCK(rh); + rn = rh->rnh_matchaddr((void *)&sin6, &rh->head); + if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) + nh = (RNTORT(rn))->rt_nhop; + RIB_RUNLOCK(rh); + + return (nh); +} + +static uint8_t +radix6_get_pref(const struct rib_rtable_info *rinfo) +{ + + return (50); +} + +static enum flm_op_result +radix6_init(uint32_t fibnum, struct fib_data *fd, void *_old_data, void **_data) +{ + struct radix6_data *r6; + + r6 = malloc(sizeof(struct radix6_data), M_RTABLE, M_NOWAIT | M_ZERO); + if (r6 == NULL) + return (FLM_REBUILD); + r6->fd = fd; + r6->rh = fib_get_rh(fd); + if (r6->rh == NULL) + return (FLM_ERROR); + + *_data = r6; + + return (FLM_SUCCESS); +} + +static void +radix6_destroy(void 
*_data) +{ + + free(_data, M_RTABLE); +} + +static enum flm_op_result +radix6_end_dump(void *_data, struct fib_dp *dp) +{ + struct radix6_data *r6 = (struct radix6_data *)_data; + + dp->f = radix6_lookup; + dp->arg = r6->rh; + + return (FLM_SUCCESS); +} + +static enum flm_op_result +radix6_change_cb(struct rib_head *rnh, struct rib_cmd_info *rc, + void *_data) +{ + struct radix6_data *r6 = (struct radix6_data *)_data; + + /* + * Grab additional reference for each nexthop to maintain guarantee + * that we have non-zero # of reference for each nexthop in radix in + * the epoch. + */ + if (rc->rc_nh_new != NULL) { + if (fib_get_nhop_idx(r6->fd, rc->rc_nh_new) == 0) + return (FLM_REBUILD); + } + if (rc->rc_nh_old != NULL) + fib_free_nhop(r6->fd, rc->rc_nh_old); + + return (FLM_SUCCESS); +} + +struct fib_lookup_module flm_radix6 = { + .flm_name = "radix6", + .flm_family = AF_INET6, + .flm_init_cb = radix6_init, + .flm_destroy_cb = radix6_destroy, + .flm_dump_end_cb = radix6_end_dump, + .flm_change_rib_item_cb = radix6_change_cb, + .flm_get_pref = radix6_get_pref, +}; + +static void +fib6_algo_init(void) +{ + + fib_module_register(&flm_radix6_lockless); + fib_module_register(&flm_radix6); +} +SYSINIT(fib6_algo_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, fib6_algo_init, NULL);
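Finally, on the registry itself: fib_module_register() appends a module to the global list, and fib_module_unregister() refuses to unlink it while flm_refcount is non-zero, which is what lets a loaded module be removed only once no table instance uses it. A compact userland model of that discipline using the same <sys/queue.h> TAILQ idiom; the module struct is heavily trimmed and a plain EBUSY return stands in for the kernel's error handling:

#include <sys/queue.h>
#include <errno.h>
#include <stdio.h>

struct module {
	const char *name;
	int refcount;			/* bumped while an instance uses it */
	TAILQ_ENTRY(module) entries;
};

static TAILQ_HEAD(, module) all_modules = TAILQ_HEAD_INITIALIZER(all_modules);

static void
module_register(struct module *m)
{
	TAILQ_INSERT_TAIL(&all_modules, m, entries);
}

static int
module_unregister(struct module *m)
{
	if (m->refcount > 0)
		return (EBUSY);	/* still bound to a routing table */
	TAILQ_REMOVE(&all_modules, m, entries);
	return (0);
}

int
main(void)
{
	struct module radix = { .name = "radix6", .refcount = 1 };

	module_register(&radix);
	printf("unregister busy module: %d (EBUSY=%d)\n",
	    module_unregister(&radix), EBUSY);
	radix.refcount = 0;	/* last instance torn down */
	printf("unregister idle module: %d\n", module_unregister(&radix));
	return (0);
}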