Index: sys/contrib/dpdk_rte_lpm/dpdk_lpm.c =================================================================== --- /dev/null +++ sys/contrib/dpdk_rte_lpm/dpdk_lpm.c @@ -0,0 +1,423 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2020 Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); +#include "opt_inet.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +#include +#include +#include +#include + +#include "rte_shim.h" +#include "rte_lpm.h" + +#define LPM_MIN_TBL8 8 /* 2 pages of memory */ +#define LPM_MAX_TBL8 65536 * 16 /* 256M */ + +MALLOC_DECLARE(M_RTABLE); + +struct dpdk_lpm_data { + struct rte_lpm *lpm; + uint64_t routes_added; + uint64_t routes_failed; + uint32_t number_tbl8s; + uint32_t fibnum; + uint8_t hit_tables; + uint8_t hit_records; + struct fib_data *fd; +}; + +/* + * Main datapath routing + */ +static struct nhop_object * +lookup_ptr(void *algo_data, const struct flm_lookup_key key, uint32_t scopeid) +{ + struct rte_lpm *lpm; + const struct rte_lpm_external *rte_ext; + uint32_t nhidx = 0; + int ret; + + lpm = (struct rte_lpm *)algo_data; + rte_ext = (const struct rte_lpm_external *)lpm; + + ret = rte_lpm_lookup(lpm, ntohl(key.addr4.s_addr), &nhidx); + if (ret == 0) { + /* Success! */ + return (rte_ext->nh_idx[nhidx]); + } else { + /* Not found. 
Check default route */ + return (rte_ext->nh_idx[rte_ext->default_idx]); + } + + return (NULL); +} + +static uint8_t +rte_get_pref(const struct rib_rtable_info *rinfo) +{ + + if (rinfo->num_prefixes < 10) + return (1); + else if (rinfo->num_prefixes < 1000) + return (rinfo->num_prefixes / 10); + else if (rinfo->num_prefixes < 500000) + return (100 + rinfo->num_prefixes / 3334); + else + return (250); +} + +static enum flm_op_result +handle_default_change(struct dpdk_lpm_data *dd, struct rib_cmd_info *rc) +{ + struct rte_lpm_external *rte_ext; + rte_ext = (struct rte_lpm_external *)dd->lpm; + + if (rc->rc_cmd != RTM_DELETE) { + /* Reference new */ + uint32_t nhidx = fib_get_nhop_idx(dd->fd, rc->rc_nh_new); + + if (nhidx == 0) + return (FLM_REBUILD); + rte_ext->default_idx = nhidx; + } else { + /* No default route */ + rte_ext->default_idx = 0; + } + + return (FLM_SUCCESS); +} + +static void +get_parent_rule(struct dpdk_lpm_data *dd, struct in_addr addr, uint8_t *plen, uint32_t *nhop_idx) +{ + struct route_nhop_data rnd; + struct rtentry *rt; + + rt = fib4_lookup_rt(dd->fibnum, addr, 0, NHR_UNLOCKED, &rnd); + if (rt != NULL) { + struct in_addr addr4; + uint32_t scopeid; + int inet_plen; + rt_get_inet_prefix_plen(rt, &addr4, &inet_plen, &scopeid); + if (inet_plen > 0) { + *plen = inet_plen; + *nhop_idx = fib_get_nhop_idx(dd->fd, rnd.rnd_nhop); + return; + } + } + + *nhop_idx = 0; + *plen = 0; +} + +static enum flm_op_result +handle_gu_change(struct dpdk_lpm_data *dd, const struct rib_cmd_info *rc, + const struct in_addr addr, int plen) +{ + uint32_t nhidx = 0; + int ret; + char abuf[INET_ADDRSTRLEN]; + uint32_t ip; + + ip = ntohl(addr.s_addr); + inet_ntop(AF_INET, &addr, abuf, sizeof(abuf)); + + /* So we get sin, plen and nhidx */ + if (rc->rc_cmd != RTM_DELETE) { + /* + * Addition or change. Save nhop in the internal table + * and get index. + */ + nhidx = fib_get_nhop_idx(dd->fd, rc->rc_nh_new); + if (nhidx == 0) { + FIB_PRINTF(LOG_INFO, dd->fd, "nhop limit reached, need rebuild"); + return (FLM_REBUILD); + } + + ret = rte_lpm_add(dd->lpm, ip, plen, nhidx); + FIB_PRINTF(LOG_DEBUG, dd->fd, "DPDK GU: %s %s/%d nhop %u = %d", + (rc->rc_cmd == RTM_ADD) ? "ADD" : "UPDATE", + abuf, plen, nhidx, ret); + } else { + /* + * Need to lookup parent. 
Assume deletion happened already + */ + uint8_t parent_plen; + uint32_t parent_nhop_idx; + get_parent_rule(dd, addr, &parent_plen, &parent_nhop_idx); + + ret = rte_lpm_delete(dd->lpm, ip, plen, parent_plen, parent_nhop_idx); + FIB_PRINTF(LOG_DEBUG, dd->fd, "DPDK: %s %s/%d nhop %u = %d", + "DEL", abuf, plen, nhidx, ret); + } + + if (ret != 0) { + FIB_PRINTF(LOG_INFO, dd->fd, "error: %d", ret); + if (ret == -ENOSPC) + return (FLM_REBUILD); + return (FLM_ERROR); + } + return (FLM_SUCCESS); +} + +static enum flm_op_result +handle_rtable_change_cb(struct rib_head *rnh, struct rib_cmd_info *rc, + void *_data) +{ + struct dpdk_lpm_data *dd; + enum flm_op_result ret; + struct in_addr addr4; + uint32_t scopeid; + int plen; + + dd = (struct dpdk_lpm_data *)_data; + rt_get_inet_prefix_plen(rc->rc_rt, &addr4, &plen, &scopeid); + + if (plen != 0) + ret = handle_gu_change(dd, rc, addr4, plen); + else + ret = handle_default_change(dd, rc); + + if (ret != 0) + FIB_PRINTF(LOG_INFO, dd->fd, "error handling route"); + return (ret); +} + +static void +destroy_table(void *_data) +{ + struct dpdk_lpm_data *dd = (struct dpdk_lpm_data *)_data; + + if (dd->lpm != NULL) + rte_lpm_free(dd->lpm); + free(dd, M_RTABLE); +} + +static enum flm_op_result +add_route_cb(struct rtentry *rt, void *_data) +{ + struct dpdk_lpm_data *dd = (struct dpdk_lpm_data *)_data; + struct nhop_object *nh; + int plen, ret; + struct in_addr addr4; + uint32_t scopeid; + + nh = rt_get_raw_nhop(rt); + rt_get_inet_prefix_plen(rt, &addr4, &plen, &scopeid); + + char abuf[INET_ADDRSTRLEN]; + inet_ntop(AF_INET, &addr4, abuf, sizeof(abuf)); + + FIB_PRINTF(LOG_DEBUG, dd->fd, "Operating on %s/%d", abuf, plen); + + if (plen == 0) { + struct rib_cmd_info rc = { + .rc_cmd = RTM_ADD, + .rc_nh_new = nh, + }; + + FIB_PRINTF(LOG_DEBUG, dd->fd, "Adding default route"); + return (handle_default_change(dd, &rc)); + } + + uint32_t nhidx = fib_get_nhop_idx(dd->fd, nh); + if (nhidx == 0) { + FIB_PRINTF(LOG_INFO, dd->fd, "unable to get nhop index"); + return (FLM_REBUILD); + } + ret = rte_lpm_add(dd->lpm, ntohl(addr4.s_addr), plen, nhidx); + FIB_PRINTF(LOG_DEBUG, dd->fd, "ADD %p %s/%d nh %u = %d", + dd->lpm, abuf, plen, nhidx, ret); + + if (ret != 0) { + FIB_PRINTF(LOG_INFO, dd->fd, "rte_lpm_add() returned %d", ret); + if (ret == -ENOSPC) { + dd->hit_tables = 1; + return (FLM_REBUILD); + } + dd->routes_failed++; + return (FLM_ERROR); + } else + dd->routes_added++; + + return (FLM_SUCCESS); +} + +static enum flm_op_result +check_dump_success(void *_data, struct fib_dp *dp) +{ + struct dpdk_lpm_data *dd; + + dd = (struct dpdk_lpm_data *)_data; + + FIB_PRINTF(LOG_INFO, dd->fd, "scan completed. 
added: %zu failed: %zu", + dd->routes_added, dd->routes_failed); + if (dd->hit_tables || dd->routes_failed > 0) + return (FLM_REBUILD); + + FIB_PRINTF(LOG_INFO, dd->fd, + "DPDK lookup engine synced with IPv4 RIB id %u, %zu routes", + dd->fibnum, dd->routes_added); + + dp->f = lookup_ptr; + dp->arg = dd->lpm; + + return (FLM_SUCCESS); +} + +static void +estimate_scale(const struct dpdk_lpm_data *dd_src, struct dpdk_lpm_data *dd) +{ + + /* XXX: update at 75% capacity */ + if (dd_src->hit_tables) + dd->number_tbl8s = dd_src->number_tbl8s * 2; + else + dd->number_tbl8s = dd_src->number_tbl8s; + + /* TODO: look into the appropriate RIB to adjust */ +} + +static struct dpdk_lpm_data * +build_table(struct dpdk_lpm_data *dd_prev, struct fib_data *fd) +{ + struct dpdk_lpm_data *dd; + struct rte_lpm *lpm; + + dd = malloc(sizeof(struct dpdk_lpm_data), M_RTABLE, M_NOWAIT | M_ZERO); + if (dd == NULL) { + FIB_PRINTF(LOG_INFO, fd, "Unable to allocate base datastructure"); + return (NULL); + } + dd->fibnum = dd_prev->fibnum; + dd->fd = fd; + + estimate_scale(dd_prev, dd); + + struct rte_lpm_config cfg = {.number_tbl8s = dd->number_tbl8s}; + lpm = rte_lpm_create("test", 0, &cfg); + if (lpm == NULL) { + FIB_PRINTF(LOG_INFO, fd, "unable to create lpm"); + free(dd, M_RTABLE); + return (NULL); + } + dd->lpm = lpm; + struct rte_lpm_external *ext = (struct rte_lpm_external *)lpm; + ext->nh_idx = fib_get_nhop_array(dd->fd); + + FIB_PRINTF(LOG_INFO, fd, "allocated %u tbl8s", dd->number_tbl8s); + + return (dd); +} + +static enum flm_op_result +init_table(uint32_t fibnum, struct fib_data *fd, void *_old_data, void **data) +{ + struct dpdk_lpm_data *dd, dd_base; + + if (_old_data == NULL) { + bzero(&dd_base, sizeof(struct dpdk_lpm_data)); + dd_base.fibnum = fibnum; + /* TODO: get rib statistics */ + dd_base.number_tbl8s = LPM_MIN_TBL8; + dd = &dd_base; + } else { + FIB_PRINTF(LOG_DEBUG, fd, "Starting with old data"); + dd = (struct dpdk_lpm_data *)_old_data; + } + + /* Guaranteed to be in epoch */ + dd = build_table(dd, fd); + if (dd == NULL) { + FIB_PRINTF(LOG_NOTICE, fd, "table creation failed"); + return (FLM_REBUILD); + } + + *data = dd; + return (FLM_SUCCESS); +} + +static struct fib_lookup_module dpdk_lpm4 = { + .flm_name = "dpdk_lpm4", + .flm_family = AF_INET, + .flm_init_cb = init_table, + .flm_destroy_cb = destroy_table, + .flm_dump_rib_item_cb = add_route_cb, + .flm_dump_end_cb = check_dump_success, + .flm_change_rib_item_cb = handle_rtable_change_cb, + .flm_get_pref = rte_get_pref, +}; + +static int +lpm4_modevent(module_t mod, int type, void *unused) +{ + int error = 0; + + switch (type) { + case MOD_LOAD: + fib_module_register(&dpdk_lpm4); + break; + case MOD_UNLOAD: + error = fib_module_unregister(&dpdk_lpm4); + break; + default: + error = EOPNOTSUPP; + break; + } + return (error); +} + +static moduledata_t lpm4mod = { + "dpdk_lpm4", + lpm4_modevent, + 0 +}; + +DECLARE_MODULE(lpm4mod, lpm4mod, SI_SUB_PSEUDO, SI_ORDER_ANY); +MODULE_VERSION(lpm4mod, 1); Index: sys/contrib/dpdk_rte_lpm/dpdk_lpm6.h =================================================================== --- /dev/null +++ sys/contrib/dpdk_rte_lpm/dpdk_lpm6.h @@ -0,0 +1,57 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2020 Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * Contains various definitions shared between the parts of a routing subsystem. + * + */ + +#ifndef _NETINET6_DPDK_LPM6_H_ +#define _NETINET6_DPDK_LPM6_H_ + +/** LPM structure. */ +struct rte_lpm6; + +/** LPM configuration structure. */ +struct rte_lpm6_config { + uint32_t max_rules; /**< Max number of rules. */ + uint32_t number_tbl8s; /**< Number of tbl8s to allocate. */ + int flags; /**< This field is currently unused. */ +}; + +struct rte_lpm6 * +rte_lpm6_create(const char *name, int socket_id, + const struct rte_lpm6_config *config); +void +rte_lpm6_free(struct rte_lpm6 *lpm); +int +rte_lpm6_add(struct rte_lpm6 *lpm, const uint8_t *ip, uint8_t depth, + uint32_t next_hop, int is_new_rule); + +#endif Index: sys/contrib/dpdk_rte_lpm/dpdk_lpm6.c =================================================================== --- /dev/null +++ sys/contrib/dpdk_rte_lpm/dpdk_lpm6.c @@ -0,0 +1,487 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2020 Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include +__FBSDID("$FreeBSD$"); +#include "opt_inet6.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#define RTDEBUG + +#include "rte_lpm6.h" + +#define LPM6_MIN_TBL8 8 /* 2 pages of memory */ +#define LPM6_MAX_TBL8 65536 * 16 /* 256M */ + +struct fib_algo_calldata { + void *lookup; + void *arg; +}; + +struct dpdk_lpm6_data { + struct rte_lpm6 *lpm6; + uint64_t routes_added; + uint64_t routes_failed; + uint32_t number_tbl8s; + uint32_t fibnum; + uint8_t hit_tables; + struct fib_data *fd; +}; + +static struct nhop_object * +lookup_ptr_ll(const struct rte_lpm6 *lpm6, const struct in6_addr *dst6, + uint32_t scopeid) +{ + const struct rte_lpm6_external *rte_ext; + + rte_ext = (const struct rte_lpm6_external *)lpm6; + + return (fib6_radix_lookup_nh(rte_ext->fibnum, dst6, scopeid)); +} + +/* + * Main datapath routing + */ +static struct nhop_object * +lookup_ptr(void *algo_data, const struct flm_lookup_key key, uint32_t scopeid) +{ + const struct rte_lpm6 *lpm6; + const struct rte_lpm6_external *rte_ext; + const struct in6_addr *addr6; + uint32_t nhidx = 0; + int ret; + + lpm6 = (const struct rte_lpm6 *)algo_data; + addr6 = key.addr6; + rte_ext = (const struct rte_lpm6_external *)lpm6; + + if (!IN6_IS_SCOPE_LINKLOCAL(addr6)) { + ret = rte_lpm6_lookup(lpm6, (const uint8_t *)addr6, &nhidx); + if (ret == 0) { + /* Success! */ + return (rte_ext->nh_idx[nhidx]); + } else { + /* Not found. Check default route */ + if (rte_ext->default_idx > 0) + return (rte_ext->nh_idx[rte_ext->default_idx]); + else + return (NULL); + } + } else { + /* LL */ + return (lookup_ptr_ll(lpm6, addr6, scopeid)); + } +} + +static uint8_t +rte6_get_pref(const struct rib_rtable_info *rinfo) +{ + + if (rinfo->num_prefixes < 10) + return (1); + else if (rinfo->num_prefixes < 1000) + return (rinfo->num_prefixes / 10); + else if (rinfo->num_prefixes < 500000) + return (100 + rinfo->num_prefixes / 3334); + else + return (250); +} + +static enum flm_op_result +handle_default_change(struct dpdk_lpm6_data *dd, struct rib_cmd_info *rc) +{ + struct rte_lpm6_external *rte_ext; + rte_ext = (struct rte_lpm6_external *)dd->lpm6; + + if (rc->rc_cmd != RTM_DELETE) { + /* Reference new */ + uint32_t nhidx = fib_get_nhop_idx(dd->fd, rc->rc_nh_new); + + if (nhidx == 0) + return (FLM_REBUILD); + rte_ext->default_idx = nhidx; + } else { + /* No default route */ + rte_ext->default_idx = 0; + } + + return (FLM_SUCCESS); +} + +static enum flm_op_result +handle_ll_change(struct dpdk_lpm6_data *dd, struct rib_cmd_info *rc, + const struct in6_addr addr6, int plen, uint32_t scopeid) +{ + + return (FLM_SUCCESS); +} + +static struct rte_lpm6_rule * +pack_parent_rule(struct dpdk_lpm6_data *dd, const struct in6_addr *addr6, + char *buffer) +{ + struct rte_lpm6_rule *lsp_rule = NULL; + struct route_nhop_data rnd; + struct rtentry *rt; + int plen; + + rt = fib6_lookup_rt(dd->fibnum, addr6, 0, NHR_UNLOCKED, &rnd); + /* plen = 0 means default route and it's out of scope */ + if (rt != NULL) { + uint32_t scopeid; + struct in6_addr new_addr6; + rt_get_inet6_prefix_plen(rt, &new_addr6, &plen, &scopeid); + if (plen > 0) { + uint32_t nhidx = fib_get_nhop_idx(dd->fd, rnd.rnd_nhop); + if (nhidx == 0) { + /* + * shouldn't happen as we already have parent route. + * It will trigger rebuild automatically. 
+ */ + return (NULL); + } + lsp_rule = fill_rule6(buffer, (uint8_t *)&new_addr6, plen, nhidx); + } + } + + return (lsp_rule); +} + +static enum flm_op_result +handle_gu_change(struct dpdk_lpm6_data *dd, const struct rib_cmd_info *rc, + const struct in6_addr *addr6, int plen) +{ + int ret; + char abuf[INET6_ADDRSTRLEN]; + inet_ntop(AF_INET6, addr6, abuf, sizeof(abuf)); + + /* So we get sin6, plen and nhidx */ + if (rc->rc_cmd != RTM_DELETE) { + /* + * Addition or change. Save nhop in the internal table + * and get index. + */ + uint32_t nhidx = fib_get_nhop_idx(dd->fd, rc->rc_nh_new); + if (nhidx == 0) { + FIB_PRINTF(LOG_INFO, dd->fd, "nhop limit reached, need rebuild"); + return (FLM_REBUILD); + } + + ret = rte_lpm6_add(dd->lpm6, (const uint8_t *)addr6, + plen, nhidx, (rc->rc_cmd == RTM_ADD) ? 1 : 0); + FIB_PRINTF(LOG_DEBUG, dd->fd, "DPDK GU: %s %s/%d nhop %u = %d", + (rc->rc_cmd == RTM_ADD) ? "ADD" : "UPDATE", + abuf, plen, nhidx, ret); + } else { + /* + * Need to lookup parent. Assume deletion happened already + */ + char buffer[RTE_LPM6_RULE_SIZE]; + struct rte_lpm6_rule *lsp_rule = NULL; + lsp_rule = pack_parent_rule(dd, addr6, buffer); + + ret = rte_lpm6_delete(dd->lpm6, (const uint8_t *)addr6, plen, lsp_rule); + FIB_PRINTF(LOG_DEBUG, dd->fd, "DPDK GU: %s %s/%d nhop ? = %d", + "DEL", abuf, plen, ret); + } + + if (ret != 0) { + FIB_PRINTF(LOG_INFO, dd->fd, "error: %d", ret); + if (ret == -ENOSPC) + return (FLM_REBUILD); + return (FLM_ERROR); + } + return (FLM_SUCCESS); +} + +static enum flm_op_result +handle_any_change(struct dpdk_lpm6_data *dd, struct rib_cmd_info *rc) +{ + enum flm_op_result ret; + struct in6_addr addr6; + uint32_t scopeid; + int plen; + + rt_get_inet6_prefix_plen(rc->rc_rt, &addr6, &plen, &scopeid); + + if (IN6_IS_SCOPE_LINKLOCAL(&addr6)) + ret = handle_ll_change(dd, rc, addr6, plen, scopeid); + else if (plen == 0) + ret = handle_default_change(dd, rc); + else + ret = handle_gu_change(dd, rc, &addr6, plen); + + if (ret != 0) + FIB_PRINTF(LOG_INFO, dd->fd, "error handling route"); + return (ret); +} + +static enum flm_op_result +handle_rtable_change_cb(struct rib_head *rnh, struct rib_cmd_info *rc, + void *_data) +{ + struct dpdk_lpm6_data *dd; + + dd = (struct dpdk_lpm6_data *)_data; + + return (handle_any_change(dd, rc)); +} + +static void +destroy_dd(struct dpdk_lpm6_data *dd) +{ + + FIB_PRINTF(LOG_INFO, dd->fd, "destroy dd %p", dd); + if (dd->lpm6 != NULL) + rte_lpm6_free(dd->lpm6); + free(dd, M_TEMP); +} + +static void +destroy_table(void *_data) +{ + + destroy_dd((struct dpdk_lpm6_data *)_data); +} + +static enum flm_op_result +add_route_cb(struct rtentry *rt, void *_data) +{ + struct dpdk_lpm6_data *dd = (struct dpdk_lpm6_data *)_data; + struct in6_addr addr6; + struct nhop_object *nh; + uint32_t scopeid; + int plen; + int ret; + + rt_get_inet6_prefix_plen(rt, &addr6, &plen, &scopeid); + nh = rt_get_raw_nhop(rt); + + if (IN6_IS_SCOPE_LINKLOCAL(&addr6)) { + + /* + * We don't operate on LL directly, however + * reference them to maintain guarantee on + * ability to refcount nhops in epoch. 
+ */ + fib_get_nhop_idx(dd->fd, nh); + return (FLM_SUCCESS); + } + + char abuf[INET6_ADDRSTRLEN]; + inet_ntop(AF_INET6, &addr6, abuf, sizeof(abuf)); + FIB_PRINTF(LOG_DEBUG, dd->fd, "Operating on %s/%d", abuf, plen); + + if (plen == 0) { + struct rib_cmd_info rc = { + .rc_cmd = RTM_ADD, + .rc_nh_new = nh, + }; + + FIB_PRINTF(LOG_DEBUG, dd->fd, "Adding default route"); + return (handle_default_change(dd, &rc)); + } + + uint32_t nhidx = fib_get_nhop_idx(dd->fd, nh); + if (nhidx == 0) { + FIB_PRINTF(LOG_INFO, dd->fd, "unable to get nhop index"); + return (FLM_REBUILD); + } + ret = rte_lpm6_add(dd->lpm6, (const uint8_t *)&addr6, plen, nhidx, 1); + FIB_PRINTF(LOG_DEBUG, dd->fd, "ADD %p %s/%d nh %u = %d", + dd->lpm6, abuf, plen, nhidx, ret); + + if (ret != 0) { + FIB_PRINTF(LOG_INFO, dd->fd, "rte_lpm6_add() returned %d", ret); + if (ret == -ENOSPC) { + dd->hit_tables = 1; + return (FLM_REBUILD); + } + dd->routes_failed++; + return (FLM_ERROR); + } else + dd->routes_added++; + + return (FLM_SUCCESS); +} + +static enum flm_op_result +check_dump_success(void *_data, struct fib_dp *dp) +{ + struct dpdk_lpm6_data *dd; + + dd = (struct dpdk_lpm6_data *)_data; + + FIB_PRINTF(LOG_INFO, dd->fd, "scan completed. added: %zu failed: %zu", + dd->routes_added, dd->routes_failed); + if (dd->hit_tables || dd->routes_failed > 0) + return (FLM_REBUILD); + + FIB_PRINTF(LOG_INFO, dd->fd, + "DPDK lookup engine synced with IPv6 RIB id %u, %zu routes", + dd->fibnum, dd->routes_added); + + dp->f = lookup_ptr; + dp->arg = dd->lpm6; + + return (FLM_SUCCESS); +} + +static void +estimate_scale(const struct dpdk_lpm6_data *dd_src, struct dpdk_lpm6_data *dd) +{ + + /* XXX: update at 75% capacity */ + if (dd_src->hit_tables) + dd->number_tbl8s = dd_src->number_tbl8s * 2; + else + dd->number_tbl8s = dd_src->number_tbl8s; + + /* TODO: look into the appropriate RIB to adjust */ +} + +static struct dpdk_lpm6_data * +build_table(struct dpdk_lpm6_data *dd_prev, struct fib_data *fd) +{ + struct dpdk_lpm6_data *dd; + struct rte_lpm6 *lpm6; + + dd = malloc(sizeof(struct dpdk_lpm6_data), M_TEMP, M_NOWAIT | M_ZERO); + if (dd == NULL) { + FIB_PRINTF(LOG_INFO, fd, "Unable to allocate base datastructure"); + return (NULL); + } + dd->fibnum = dd_prev->fibnum; + dd->fd = fd; + + estimate_scale(dd_prev, dd); + + struct rte_lpm6_config cfg = {.number_tbl8s = dd->number_tbl8s}; + lpm6 = rte_lpm6_create("test", 0, &cfg); + if (lpm6 == NULL) { + FIB_PRINTF(LOG_INFO, fd, "unable to create lpm6"); + free(dd, M_TEMP); + return (NULL); + } + dd->lpm6 = lpm6; + struct rte_lpm6_external *ext = (struct rte_lpm6_external *)lpm6; + ext->nh_idx = fib_get_nhop_array(dd->fd); + + FIB_PRINTF(LOG_INFO, fd, "allocated %u tbl8s", dd->number_tbl8s); + + return (dd); +} + +static enum flm_op_result +init_table(uint32_t fibnum, struct fib_data *fd, void *_old_data, void **data) +{ + struct dpdk_lpm6_data *dd, dd_base; + + if (_old_data == NULL) { + bzero(&dd_base, sizeof(struct dpdk_lpm6_data)); + dd_base.fibnum = fibnum; + /* TODO: get rib statistics */ + dd_base.number_tbl8s = LPM6_MIN_TBL8; + dd = &dd_base; + } else { + FIB_PRINTF(LOG_INFO, fd, "Starting with old data"); + dd = (struct dpdk_lpm6_data *)_old_data; + } + + /* Guaranteed to be in epoch */ + dd = build_table(dd, fd); + if (dd == NULL) { + FIB_PRINTF(LOG_INFO, fd, "table creation failed"); + return (FLM_REBUILD); + } + + *data = dd; + return (FLM_SUCCESS); +} + +static struct fib_lookup_module dpdk_lpm6 = { + .flm_name = "dpdk_lpm6", + .flm_family = AF_INET6, + .flm_init_cb = init_table, + 
.flm_destroy_cb = destroy_table, + .flm_dump_rib_item_cb = add_route_cb, + .flm_dump_end_cb = check_dump_success, + .flm_change_rib_item_cb = handle_rtable_change_cb, + .flm_get_pref = rte6_get_pref, +}; + +static int +lpm6_modevent(module_t mod, int type, void *unused) +{ + int error = 0; + + switch (type) { + case MOD_LOAD: + fib_module_register(&dpdk_lpm6); + break; + case MOD_UNLOAD: + error = fib_module_unregister(&dpdk_lpm6); + break; + default: + error = EOPNOTSUPP; + break; + } + return (error); +} + +static moduledata_t lpm6mod = { + "dpdk_lpm6", + lpm6_modevent, + 0 +}; + +DECLARE_MODULE(lpm6mod, lpm6mod, SI_SUB_PSEUDO, SI_ORDER_ANY); +MODULE_VERSION(lpm6mod, 1); Index: sys/contrib/dpdk_rte_lpm/rte_branch_prediction.h =================================================================== --- /dev/null +++ sys/contrib/dpdk_rte_lpm/rte_branch_prediction.h @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +/** + * @file + * Branch Prediction Helpers in RTE + */ + +#ifndef _RTE_BRANCH_PREDICTION_H_ +#define _RTE_BRANCH_PREDICTION_H_ + +/** + * Check if a branch is likely to be taken. + * + * This compiler builtin allows the developer to indicate if a branch is + * likely to be taken. Example: + * + * if (likely(x > 1)) + * do_stuff(); + * + */ +#ifndef likely +#define likely(x) __builtin_expect(!!(x), 1) +#endif /* likely */ + +/** + * Check if a branch is unlikely to be taken. + * + * This compiler builtin allows the developer to indicate if a branch is + * unlikely to be taken. Example: + * + * if (unlikely(x < 1)) + * do_stuff(); + * + */ +#ifndef unlikely +#define unlikely(x) __builtin_expect(!!(x), 0) +#endif /* unlikely */ + +#endif /* _RTE_BRANCH_PREDICTION_H_ */ Index: sys/contrib/dpdk_rte_lpm/rte_common.h =================================================================== --- /dev/null +++ sys/contrib/dpdk_rte_lpm/rte_common.h @@ -0,0 +1,838 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2019 Intel Corporation + */ + +#ifndef _RTE_COMMON_H_ +#define _RTE_COMMON_H_ + +/** + * @file + * + * Generic, commonly-used macro and inline function definitions + * for DPDK. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +//#include + +/* OS specific include */ +//#include + +#ifndef typeof +#define typeof __typeof__ +#endif + +#ifndef asm +#define asm __asm__ +#endif + +/** C extension macro for environments lacking C11 features. */ +#if !defined(__STDC_VERSION__) || __STDC_VERSION__ < 201112L +#define RTE_STD_C11 __extension__ +#else +#define RTE_STD_C11 +#endif + +/* + * RTE_TOOLCHAIN_GCC is defined if the target is built with GCC, + * while a host application (like pmdinfogen) may have another compiler. + * RTE_CC_IS_GNU is true if the file is compiled with GCC, + * no matter it is a target or host application. 
+ */ +#define RTE_CC_IS_GNU 0 +#if defined __clang__ +#define RTE_CC_CLANG +#elif defined __INTEL_COMPILER +#define RTE_CC_ICC +#elif defined __GNUC__ +#define RTE_CC_GCC +#undef RTE_CC_IS_GNU +#define RTE_CC_IS_GNU 1 +#endif +#if RTE_CC_IS_GNU +#define GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + \ + __GNUC_PATCHLEVEL__) +#endif + +/** + * Force alignment + */ +#define __rte_aligned(a) __attribute__((__aligned__(a))) + +#ifdef RTE_ARCH_STRICT_ALIGN +typedef uint64_t unaligned_uint64_t __rte_aligned(1); +typedef uint32_t unaligned_uint32_t __rte_aligned(1); +typedef uint16_t unaligned_uint16_t __rte_aligned(1); +#else +typedef uint64_t unaligned_uint64_t; +typedef uint32_t unaligned_uint32_t; +typedef uint16_t unaligned_uint16_t; +#endif + +/** + * Force a structure to be packed + */ +#define __rte_packed __attribute__((__packed__)) + +/******* Macro to mark functions and fields scheduled for removal *****/ +#define __rte_deprecated __attribute__((__deprecated__)) + +/** + * Mark a function or variable to a weak reference. + */ +#define __rte_weak __attribute__((__weak__)) + +/** + * Force symbol to be generated even if it appears to be unused. + */ +#define __rte_used __attribute__((used)) + +/*********** Macros to eliminate unused variable warnings ********/ + +/** + * short definition to mark a function parameter unused + */ +#define __rte_unused __attribute__((__unused__)) + +/** + * definition to mark a variable or function parameter as used so + * as to avoid a compiler warning + */ +#define RTE_SET_USED(x) (void)(x) + +/** + * Check format string and its arguments at compile-time. + * + * GCC on Windows assumes MS-specific format string by default, + * even if the underlying stdio implementation is ANSI-compliant, + * so this must be overridden. + */ +#if RTE_CC_IS_GNU +#define __rte_format_printf(format_index, first_arg) \ + __attribute__((format(gnu_printf, format_index, first_arg))) +#else +#define __rte_format_printf(format_index, first_arg) \ + __attribute__((format(printf, format_index, first_arg))) +#endif + +#define RTE_PRIORITY_LOG 101 +#define RTE_PRIORITY_BUS 110 +#define RTE_PRIORITY_CLASS 120 +#define RTE_PRIORITY_LAST 65535 + +#define RTE_PRIO(prio) \ + RTE_PRIORITY_ ## prio + +/** + * Run function before main() with high priority. + * + * @param func + * Constructor function. + * @param prio + * Priority number must be above 100. + * Lowest number is the first to run. + */ +#ifndef RTE_INIT_PRIO /* Allow to override from EAL */ +#define RTE_INIT_PRIO(func, prio) \ +static void __attribute__((constructor(RTE_PRIO(prio)), used)) func(void) +#endif + +/** + * Run function before main() with low priority. + * + * The constructor will be run after prioritized constructors. + * + * @param func + * Constructor function. + */ +#define RTE_INIT(func) \ + RTE_INIT_PRIO(func, LAST) + +/** + * Run after main() with low priority. + * + * @param func + * Destructor function name. + * @param prio + * Priority number must be above 100. + * Lowest number is the last to run. + */ +#ifndef RTE_FINI_PRIO /* Allow to override from EAL */ +#define RTE_FINI_PRIO(func, prio) \ +static void __attribute__((destructor(RTE_PRIO(prio)), used)) func(void) +#endif + +/** + * Run after main() with high priority. + * + * The destructor will be run *before* prioritized destructors. + * + * @param func + * Destructor function name. 
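 *
 * Illustrative usage added in review (not part of the original header;
 * the function and callee names below are hypothetical):
 *
 *   RTE_FINI(app_teardown)
 *   {
 *       release_resources();  // registered as a destructor, runs automatically
 *   }
 *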
+ */ +#define RTE_FINI(func) \ + RTE_FINI_PRIO(func, LAST) + +/** + * Hint never returning function + */ +#define __rte_noreturn __attribute__((noreturn)) + +/** + * Force a function to be inlined + */ +#define __rte_always_inline inline __attribute__((always_inline)) + +/** + * Force a function to be noinlined + */ +#define __rte_noinline __attribute__((noinline)) + +/** + * Hint function in the hot path + */ +#define __rte_hot __attribute__((hot)) + +/** + * Hint function in the cold path + */ +#define __rte_cold __attribute__((cold)) + +/*********** Macros for pointer arithmetic ********/ + +/** + * add a byte-value offset to a pointer + */ +#define RTE_PTR_ADD(ptr, x) ((void*)((uintptr_t)(ptr) + (x))) + +/** + * subtract a byte-value offset from a pointer + */ +#define RTE_PTR_SUB(ptr, x) ((void*)((uintptr_t)ptr - (x))) + +/** + * get the difference between two pointer values, i.e. how far apart + * in bytes are the locations they point two. It is assumed that + * ptr1 is greater than ptr2. + */ +#define RTE_PTR_DIFF(ptr1, ptr2) ((uintptr_t)(ptr1) - (uintptr_t)(ptr2)) + +/** + * Workaround to cast a const field of a structure to non-const type. + */ +#define RTE_CAST_FIELD(var, field, type) \ + (*(type *)((uintptr_t)(var) + offsetof(typeof(*(var)), field))) + +/*********** Macros/static functions for doing alignment ********/ + + +/** + * Macro to align a pointer to a given power-of-two. The resultant + * pointer will be a pointer of the same type as the first parameter, and + * point to an address no higher than the first parameter. Second parameter + * must be a power-of-two value. + */ +#define RTE_PTR_ALIGN_FLOOR(ptr, align) \ + ((typeof(ptr))RTE_ALIGN_FLOOR((uintptr_t)ptr, align)) + +/** + * Macro to align a value to a given power-of-two. The resultant value + * will be of the same type as the first parameter, and will be no + * bigger than the first parameter. Second parameter must be a + * power-of-two value. + */ +#define RTE_ALIGN_FLOOR(val, align) \ + (typeof(val))((val) & (~((typeof(val))((align) - 1)))) + +/** + * Macro to align a pointer to a given power-of-two. The resultant + * pointer will be a pointer of the same type as the first parameter, and + * point to an address no lower than the first parameter. Second parameter + * must be a power-of-two value. + */ +#define RTE_PTR_ALIGN_CEIL(ptr, align) \ + RTE_PTR_ALIGN_FLOOR((typeof(ptr))RTE_PTR_ADD(ptr, (align) - 1), align) + +/** + * Macro to align a value to a given power-of-two. The resultant value + * will be of the same type as the first parameter, and will be no lower + * than the first parameter. Second parameter must be a power-of-two + * value. + */ +#define RTE_ALIGN_CEIL(val, align) \ + RTE_ALIGN_FLOOR(((val) + ((typeof(val)) (align) - 1)), align) + +/** + * Macro to align a pointer to a given power-of-two. The resultant + * pointer will be a pointer of the same type as the first parameter, and + * point to an address no lower than the first parameter. Second parameter + * must be a power-of-two value. + * This function is the same as RTE_PTR_ALIGN_CEIL + */ +#define RTE_PTR_ALIGN(ptr, align) RTE_PTR_ALIGN_CEIL(ptr, align) + +/** + * Macro to align a value to a given power-of-two. The resultant + * value will be of the same type as the first parameter, and + * will be no lower than the first parameter. Second parameter + * must be a power-of-two value. 
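 *
 * Worked example added in review (not part of the original header):
 *   RTE_ALIGN(13, 8) evaluates to 16, and RTE_ALIGN(16, 8) stays 16.
 *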
+ * This function is the same as RTE_ALIGN_CEIL + */ +#define RTE_ALIGN(val, align) RTE_ALIGN_CEIL(val, align) + +/** + * Macro to align a value to the multiple of given value. The resultant + * value will be of the same type as the first parameter and will be no lower + * than the first parameter. + */ +#define RTE_ALIGN_MUL_CEIL(v, mul) \ + (((v + (typeof(v))(mul) - 1) / ((typeof(v))(mul))) * (typeof(v))(mul)) + +/** + * Macro to align a value to the multiple of given value. The resultant + * value will be of the same type as the first parameter and will be no higher + * than the first parameter. + */ +#define RTE_ALIGN_MUL_FLOOR(v, mul) \ + ((v / ((typeof(v))(mul))) * (typeof(v))(mul)) + +/** + * Macro to align value to the nearest multiple of the given value. + * The resultant value might be greater than or less than the first parameter + * whichever difference is the lowest. + */ +#define RTE_ALIGN_MUL_NEAR(v, mul) \ + ({ \ + typeof(v) ceil = RTE_ALIGN_MUL_CEIL(v, mul); \ + typeof(v) floor = RTE_ALIGN_MUL_FLOOR(v, mul); \ + (ceil - v) > (v - floor) ? floor : ceil; \ + }) + +/** + * Checks if a pointer is aligned to a given power-of-two value + * + * @param ptr + * The pointer whose alignment is to be checked + * @param align + * The power-of-two value to which the ptr should be aligned + * + * @return + * True(1) where the pointer is correctly aligned, false(0) otherwise + */ +static inline int +rte_is_aligned(void *ptr, unsigned align) +{ + return RTE_PTR_ALIGN(ptr, align) == ptr; +} + +/*********** Macros for compile type checks ********/ + +/** + * Triggers an error at compilation time if the condition is true. + */ +#define RTE_BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)])) + +/*********** Cache line related macros ********/ + +/** Cache line mask. */ +#define RTE_CACHE_LINE_MASK (RTE_CACHE_LINE_SIZE-1) + +/** Return the first cache-aligned value greater or equal to size. */ +#define RTE_CACHE_LINE_ROUNDUP(size) \ + (RTE_CACHE_LINE_SIZE * ((size + RTE_CACHE_LINE_SIZE - 1) / \ + RTE_CACHE_LINE_SIZE)) + +/** Cache line size in terms of log2 */ +#if RTE_CACHE_LINE_SIZE == 64 +#define RTE_CACHE_LINE_SIZE_LOG2 6 +#elif RTE_CACHE_LINE_SIZE == 128 +#define RTE_CACHE_LINE_SIZE_LOG2 7 +#else +#error "Unsupported cache line size" +#endif + +/** Minimum Cache line size. */ +#define RTE_CACHE_LINE_MIN_SIZE 64 + +/** Force alignment to cache line. */ +#define __rte_cache_aligned __rte_aligned(RTE_CACHE_LINE_SIZE) + +/** Force minimum cache line alignment. */ +#define __rte_cache_min_aligned __rte_aligned(RTE_CACHE_LINE_MIN_SIZE) + +/*********** PA/IOVA type definitions ********/ + +/** Physical address */ +typedef uint64_t phys_addr_t; +#define RTE_BAD_PHYS_ADDR ((phys_addr_t)-1) + +/** + * IO virtual address type. + * When the physical addressing mode (IOVA as PA) is in use, + * the translation from an IO virtual address (IOVA) to a physical address + * is a direct mapping, i.e. the same value. + * Otherwise, in virtual mode (IOVA as VA), an IOMMU may do the translation. + */ +typedef uint64_t rte_iova_t; +#define RTE_BAD_IOVA ((rte_iova_t)-1) + +/*********** Structure alignment markers ********/ + +/** Generic marker for any place in a structure. */ +__extension__ typedef void *RTE_MARKER[0]; +/** Marker for 1B alignment in a structure. */ +__extension__ typedef uint8_t RTE_MARKER8[0]; +/** Marker for 2B alignment in a structure. */ +__extension__ typedef uint16_t RTE_MARKER16[0]; +/** Marker for 4B alignment in a structure. 
*/ +__extension__ typedef uint32_t RTE_MARKER32[0]; +/** Marker for 8B alignment in a structure. */ +__extension__ typedef uint64_t RTE_MARKER64[0]; + +/** + * Combines 32b inputs most significant set bits into the least + * significant bits to construct a value with the same MSBs as x + * but all 1's under it. + * + * @param x + * The integer whose MSBs need to be combined with its LSBs + * @return + * The combined value. + */ +static inline uint32_t +rte_combine32ms1b(register uint32_t x) +{ + x |= x >> 1; + x |= x >> 2; + x |= x >> 4; + x |= x >> 8; + x |= x >> 16; + + return x; +} + +/** + * Combines 64b inputs most significant set bits into the least + * significant bits to construct a value with the same MSBs as x + * but all 1's under it. + * + * @param v + * The integer whose MSBs need to be combined with its LSBs + * @return + * The combined value. + */ +static inline uint64_t +rte_combine64ms1b(register uint64_t v) +{ + v |= v >> 1; + v |= v >> 2; + v |= v >> 4; + v |= v >> 8; + v |= v >> 16; + v |= v >> 32; + + return v; +} + +/*********** Macros to work with powers of 2 ********/ + +/** + * Macro to return 1 if n is a power of 2, 0 otherwise + */ +#define RTE_IS_POWER_OF_2(n) ((n) && !(((n) - 1) & (n))) + +/** + * Returns true if n is a power of 2 + * @param n + * Number to check + * @return 1 if true, 0 otherwise + */ +static inline int +rte_is_power_of_2(uint32_t n) +{ + return n && !(n & (n - 1)); +} + +/** + * Aligns input parameter to the next power of 2 + * + * @param x + * The integer value to align + * + * @return + * Input parameter aligned to the next power of 2 + */ +static inline uint32_t +rte_align32pow2(uint32_t x) +{ + x--; + x = rte_combine32ms1b(x); + + return x + 1; +} + +/** + * Aligns input parameter to the previous power of 2 + * + * @param x + * The integer value to align + * + * @return + * Input parameter aligned to the previous power of 2 + */ +static inline uint32_t +rte_align32prevpow2(uint32_t x) +{ + x = rte_combine32ms1b(x); + + return x - (x >> 1); +} + +/** + * Aligns 64b input parameter to the next power of 2 + * + * @param v + * The 64b value to align + * + * @return + * Input parameter aligned to the next power of 2 + */ +static inline uint64_t +rte_align64pow2(uint64_t v) +{ + v--; + v = rte_combine64ms1b(v); + + return v + 1; +} + +/** + * Aligns 64b input parameter to the previous power of 2 + * + * @param v + * The 64b value to align + * + * @return + * Input parameter aligned to the previous power of 2 + */ +static inline uint64_t +rte_align64prevpow2(uint64_t v) +{ + v = rte_combine64ms1b(v); + + return v - (v >> 1); +} + +/*********** Macros for calculating min and max **********/ + +/** + * Macro to return the minimum of two numbers + */ +#define RTE_MIN(a, b) \ + __extension__ ({ \ + typeof (a) _a = (a); \ + typeof (b) _b = (b); \ + _a < _b ? _a : _b; \ + }) + +/** + * Macro to return the maximum of two numbers + */ +#define RTE_MAX(a, b) \ + __extension__ ({ \ + typeof (a) _a = (a); \ + typeof (b) _b = (b); \ + _a > _b ? _a : _b; \ + }) + +/*********** Other general functions / macros ********/ + +/** + * Searches the input parameter for the least significant set bit + * (starting from zero). + * If a least significant 1 bit is found, its bit index is returned. + * If the content of the input parameter is zero, then the content of the return + * value is undefined. + * @param v + * input parameter, should not be zero. + * @return + * least significant set bit in the input parameter. 
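 *
 * Worked example added in review (not part of the original header):
 *   rte_bsf32(0x50) returns 4, since bit 4 is the lowest bit set in 0x50.
 *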
+ */ +static inline uint32_t +rte_bsf32(uint32_t v) +{ + return (uint32_t)__builtin_ctz(v); +} + +/** + * Searches the input parameter for the least significant set bit + * (starting from zero). Safe version (checks for input parameter being zero). + * + * @warning ``pos`` must be a valid pointer. It is not checked! + * + * @param v + * The input parameter. + * @param pos + * If ``v`` was not 0, this value will contain position of least significant + * bit within the input parameter. + * @return + * Returns 0 if ``v`` was 0, otherwise returns 1. + */ +static inline int +rte_bsf32_safe(uint64_t v, uint32_t *pos) +{ + if (v == 0) + return 0; + + *pos = rte_bsf32(v); + return 1; +} + +/** + * Return the rounded-up log2 of a integer. + * + * @note Contrary to the logarithm mathematical operation, + * rte_log2_u32(0) == 0 and not -inf. + * + * @param v + * The input parameter. + * @return + * The rounded-up log2 of the input, or 0 if the input is 0. + */ +static inline uint32_t +rte_log2_u32(uint32_t v) +{ + if (v == 0) + return 0; + v = rte_align32pow2(v); + return rte_bsf32(v); +} + + +/** + * Return the last (most-significant) bit set. + * + * @note The last (most significant) bit is at position 32. + * @note rte_fls_u32(0) = 0, rte_fls_u32(1) = 1, rte_fls_u32(0x80000000) = 32 + * + * @param x + * The input parameter. + * @return + * The last (most-significant) bit set, or 0 if the input is 0. + */ +static inline int +rte_fls_u32(uint32_t x) +{ + return (x == 0) ? 0 : 32 - __builtin_clz(x); +} + +/** + * Searches the input parameter for the least significant set bit + * (starting from zero). + * If a least significant 1 bit is found, its bit index is returned. + * If the content of the input parameter is zero, then the content of the return + * value is undefined. + * @param v + * input parameter, should not be zero. + * @return + * least significant set bit in the input parameter. + */ +static inline int +rte_bsf64(uint64_t v) +{ + return (uint32_t)__builtin_ctzll(v); +} + +/** + * Searches the input parameter for the least significant set bit + * (starting from zero). Safe version (checks for input parameter being zero). + * + * @warning ``pos`` must be a valid pointer. It is not checked! + * + * @param v + * The input parameter. + * @param pos + * If ``v`` was not 0, this value will contain position of least significant + * bit within the input parameter. + * @return + * Returns 0 if ``v`` was 0, otherwise returns 1. + */ +static inline int +rte_bsf64_safe(uint64_t v, uint32_t *pos) +{ + if (v == 0) + return 0; + + *pos = rte_bsf64(v); + return 1; +} + +/** + * Return the last (most-significant) bit set. + * + * @note The last (most significant) bit is at position 64. + * @note rte_fls_u64(0) = 0, rte_fls_u64(1) = 1, + * rte_fls_u64(0x8000000000000000) = 64 + * + * @param x + * The input parameter. + * @return + * The last (most-significant) bit set, or 0 if the input is 0. + */ +static inline int +rte_fls_u64(uint64_t x) +{ + return (x == 0) ? 0 : 64 - __builtin_clzll(x); +} + +/** + * Return the rounded-up log2 of a 64-bit integer. + * + * @note Contrary to the logarithm mathematical operation, + * rte_log2_u64(0) == 0 and not -inf. + * + * @param v + * The input parameter. + * @return + * The rounded-up log2 of the input, or 0 if the input is 0. 
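 *
 * Worked examples added in review (not part of the original header):
 *   rte_log2_u64(1) == 0, rte_log2_u64(5) == 3 (5 rounds up to 8 == 2^3),
 *   and rte_log2_u64(64) == 6.
 *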
+ */ +static inline uint32_t +rte_log2_u64(uint64_t v) +{ + if (v == 0) + return 0; + v = rte_align64pow2(v); + /* we checked for v being 0 already, so no undefined behavior */ + return rte_bsf64(v); +} + +#ifndef offsetof +/** Return the offset of a field in a structure. */ +#define offsetof(TYPE, MEMBER) __builtin_offsetof (TYPE, MEMBER) +#endif + +/** + * Return pointer to the wrapping struct instance. + * + * Example: + * + * struct wrapper { + * ... + * struct child c; + * ... + * }; + * + * struct child *x = obtain(...); + * struct wrapper *w = container_of(x, struct wrapper, c); + */ +#ifndef container_of +#define container_of(ptr, type, member) __extension__ ({ \ + const typeof(((type *)0)->member) *_ptr = (ptr); \ + __rte_unused type *_target_ptr = \ + (type *)(ptr); \ + (type *)(((uintptr_t)_ptr) - offsetof(type, member)); \ + }) +#endif + +/** + * Get the size of a field in a structure. + * + * @param type + * The type of the structure. + * @param field + * The field in the structure. + * @return + * The size of the field in the structure, in bytes. + */ +#define RTE_SIZEOF_FIELD(type, field) (sizeof(((type *)0)->field)) + +#define _RTE_STR(x) #x +/** Take a macro value and get a string version of it */ +#define RTE_STR(x) _RTE_STR(x) + +/** + * ISO C helpers to modify format strings using variadic macros. + * This is a replacement for the ", ## __VA_ARGS__" GNU extension. + * An empty %s argument is appended to avoid a dangling comma. + */ +#define RTE_FMT(fmt, ...) fmt "%.0s", __VA_ARGS__ "" +#define RTE_FMT_HEAD(fmt, ...) fmt +#define RTE_FMT_TAIL(fmt, ...) __VA_ARGS__ + +/** Mask value of type "tp" for the first "ln" bit set. */ +#define RTE_LEN2MASK(ln, tp) \ + ((tp)((uint64_t)-1 >> (sizeof(uint64_t) * CHAR_BIT - (ln)))) + +/** Number of elements in the array. */ +#define RTE_DIM(a) (sizeof (a) / sizeof ((a)[0])) + +/** + * Converts a numeric string to the equivalent uint64_t value. + * As well as straight number conversion, also recognises the suffixes + * k, m and g for kilobytes, megabytes and gigabytes respectively. + * + * If a negative number is passed in i.e. a string with the first non-black + * character being "-", zero is returned. Zero is also returned in the case of + * an error with the strtoull call in the function. + * + * @param str + * String containing number to convert. + * @return + * Number. + */ +#if 0 +static inline uint64_t +rte_str_to_size(const char *str) +{ + char *endptr; + unsigned long long size; + + while (isspace((int)*str)) + str++; + if (*str == '-') + return 0; + + errno = 0; + size = strtoull(str, &endptr, 0); + if (errno) + return 0; + + if (*endptr == ' ') + endptr++; /* allow 1 space gap */ + + switch (*endptr){ + case 'G': case 'g': size *= 1024; /* fall-through */ + case 'M': case 'm': size *= 1024; /* fall-through */ + case 'K': case 'k': size *= 1024; /* fall-through */ + default: + break; + } + return size; +} +#endif + +/** + * Function to terminate the application immediately, printing an error + * message and returning the exit_code back to the shell. + * + * This function never returns + * + * @param exit_code + * The exit code to be returned by the application + * @param format + * The format string to be used for printing the message. This can include + * printf format characters which will be expanded using any further parameters + * to the function. + */ +__rte_noreturn void +rte_exit(int exit_code, const char *format, ...) 
+ __rte_format_printf(2, 3); + +#ifdef __cplusplus +} +#endif + +#endif Index: sys/contrib/dpdk_rte_lpm/rte_debug.h =================================================================== --- /dev/null +++ sys/contrib/dpdk_rte_lpm/rte_debug.h @@ -0,0 +1,83 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef _RTE_DEBUG_H_ +#define _RTE_DEBUG_H_ + +/** + * @file + * + * Debug Functions in RTE + * + * This file defines a generic API for debug operations. Part of + * the implementation is architecture-specific. + */ + +//#include "rte_log.h" +#include "rte_branch_prediction.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Dump the stack of the calling core to the console. + */ +void rte_dump_stack(void); + +/** + * Dump the registers of the calling core to the console. + * + * Note: Not implemented in a userapp environment; use gdb instead. + */ +void rte_dump_registers(void); + +/** + * Provide notification of a critical non-recoverable error and terminate + * execution abnormally. + * + * Display the format string and its expanded arguments (printf-like). + * + * In a linux environment, this function dumps the stack and calls + * abort() resulting in a core dump if enabled. + * + * The function never returns. + * + * @param ... + * The format string, followed by the variable list of arguments. + */ +#define rte_panic(...) rte_panic_(__func__, __VA_ARGS__, "dummy") +#define rte_panic_(func, format, ...) __rte_panic(func, format "%.0s", __VA_ARGS__) + +#ifdef RTE_ENABLE_ASSERT +#define RTE_ASSERT(exp) RTE_VERIFY(exp) +#else +#define RTE_ASSERT(exp) do {} while (0) +#endif +#define RTE_VERIFY(exp) do { \ + if (unlikely(!(exp))) \ + rte_panic("line %d\tassert \"%s\" failed\n", __LINE__, #exp); \ +} while (0) + +/* + * Provide notification of a critical non-recoverable error and stop. + * + * This function should not be called directly. Refer to rte_panic() macro + * documentation. + */ +void __rte_panic(const char *funcname , const char *format, ...) +{ +#ifdef __GNUC__ +#if (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ > 2)) + __rte_cold +#endif +#endif + //__rte_noreturn + //__rte_format_printf(2, 3); + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_DEBUG_H_ */ Index: sys/contrib/dpdk_rte_lpm/rte_jhash.h =================================================================== --- /dev/null +++ sys/contrib/dpdk_rte_lpm/rte_jhash.h @@ -0,0 +1,379 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2015 Intel Corporation. + */ + +#ifndef _RTE_JHASH_H +#define _RTE_JHASH_H + +/** + * @file + * + * jhash functions. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +//#include + +/* jhash.h: Jenkins hash support. + * + * Copyright (C) 2006 Bob Jenkins (bob_jenkins@burtleburtle.net) + * + * http://burtleburtle.net/bob/hash/ + * + * These are the credits from Bob's sources: + * + * lookup3.c, by Bob Jenkins, May 2006, Public Domain. + * + * These are functions for producing 32-bit hashes for hash table lookup. + * hashword(), hashlittle(), hashlittle2(), hashbig(), mix(), and final() + * are externally useful functions. Routines to test the hash are included + * if SELF_TEST is defined. You can use this free for any purpose. It's in + * the public domain. It has no warranty. + * + * $FreeBSD$ + */ + +#define rot(x, k) (((x) << (k)) | ((x) >> (32-(k)))) + +/** @internal Internal function. NOTE: Arguments are modified. 
*/ +#define __rte_jhash_mix(a, b, c) do { \ + a -= c; a ^= rot(c, 4); c += b; \ + b -= a; b ^= rot(a, 6); a += c; \ + c -= b; c ^= rot(b, 8); b += a; \ + a -= c; a ^= rot(c, 16); c += b; \ + b -= a; b ^= rot(a, 19); a += c; \ + c -= b; c ^= rot(b, 4); b += a; \ +} while (0) + +#define __rte_jhash_final(a, b, c) do { \ + c ^= b; c -= rot(b, 14); \ + a ^= c; a -= rot(c, 11); \ + b ^= a; b -= rot(a, 25); \ + c ^= b; c -= rot(b, 16); \ + a ^= c; a -= rot(c, 4); \ + b ^= a; b -= rot(a, 14); \ + c ^= b; c -= rot(b, 24); \ +} while (0) + +/** The golden ratio: an arbitrary value. */ +#define RTE_JHASH_GOLDEN_RATIO 0xdeadbeef + +#if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN +#define BIT_SHIFT(x, y, k) (((x) >> (k)) | ((uint64_t)(y) << (32-(k)))) +#else +#define BIT_SHIFT(x, y, k) (((uint64_t)(x) << (k)) | ((y) >> (32-(k)))) +#endif + +#define LOWER8b_MASK rte_le_to_cpu_32(0xff) +#define LOWER16b_MASK rte_le_to_cpu_32(0xffff) +#define LOWER24b_MASK rte_le_to_cpu_32(0xffffff) + +static inline void +__rte_jhash_2hashes(const void *key, uint32_t length, uint32_t *pc, + uint32_t *pb, unsigned check_align) +{ + uint32_t a, b, c; + + /* Set up the internal state */ + a = b = c = RTE_JHASH_GOLDEN_RATIO + ((uint32_t)length) + *pc; + c += *pb; + + /* + * Check key alignment. For x86 architecture, first case is always optimal + * If check_align is not set, first case will be used + */ +#if defined(RTE_ARCH_X86_64) || defined(RTE_ARCH_I686) || defined(RTE_ARCH_X86_X32) + const uint32_t *k = (const uint32_t *)key; + const uint32_t s = 0; +#else + const uint32_t *k = (uint32_t *)((uintptr_t)key & (uintptr_t)~3); + const uint32_t s = ((uintptr_t)key & 3) * CHAR_BIT; +#endif + if (!check_align || s == 0) { + while (length > 12) { + a += k[0]; + b += k[1]; + c += k[2]; + + __rte_jhash_mix(a, b, c); + + k += 3; + length -= 12; + } + + switch (length) { + case 12: + c += k[2]; b += k[1]; a += k[0]; break; + case 11: + c += k[2] & LOWER24b_MASK; b += k[1]; a += k[0]; break; + case 10: + c += k[2] & LOWER16b_MASK; b += k[1]; a += k[0]; break; + case 9: + c += k[2] & LOWER8b_MASK; b += k[1]; a += k[0]; break; + case 8: + b += k[1]; a += k[0]; break; + case 7: + b += k[1] & LOWER24b_MASK; a += k[0]; break; + case 6: + b += k[1] & LOWER16b_MASK; a += k[0]; break; + case 5: + b += k[1] & LOWER8b_MASK; a += k[0]; break; + case 4: + a += k[0]; break; + case 3: + a += k[0] & LOWER24b_MASK; break; + case 2: + a += k[0] & LOWER16b_MASK; break; + case 1: + a += k[0] & LOWER8b_MASK; break; + /* zero length strings require no mixing */ + case 0: + *pc = c; + *pb = b; + return; + }; + } else { + /* all but the last block: affect some 32 bits of (a, b, c) */ + while (length > 12) { + a += BIT_SHIFT(k[0], k[1], s); + b += BIT_SHIFT(k[1], k[2], s); + c += BIT_SHIFT(k[2], k[3], s); + __rte_jhash_mix(a, b, c); + + k += 3; + length -= 12; + } + + /* last block: affect all 32 bits of (c) */ + switch (length) { + case 12: + a += BIT_SHIFT(k[0], k[1], s); + b += BIT_SHIFT(k[1], k[2], s); + c += BIT_SHIFT(k[2], k[3], s); + break; + case 11: + a += BIT_SHIFT(k[0], k[1], s); + b += BIT_SHIFT(k[1], k[2], s); + c += BIT_SHIFT(k[2], k[3], s) & LOWER24b_MASK; + break; + case 10: + a += BIT_SHIFT(k[0], k[1], s); + b += BIT_SHIFT(k[1], k[2], s); + c += BIT_SHIFT(k[2], k[3], s) & LOWER16b_MASK; + break; + case 9: + a += BIT_SHIFT(k[0], k[1], s); + b += BIT_SHIFT(k[1], k[2], s); + c += BIT_SHIFT(k[2], k[3], s) & LOWER8b_MASK; + break; + case 8: + a += BIT_SHIFT(k[0], k[1], s); + b += BIT_SHIFT(k[1], k[2], s); + break; + case 7: + a += BIT_SHIFT(k[0], k[1], 
s); + b += BIT_SHIFT(k[1], k[2], s) & LOWER24b_MASK; + break; + case 6: + a += BIT_SHIFT(k[0], k[1], s); + b += BIT_SHIFT(k[1], k[2], s) & LOWER16b_MASK; + break; + case 5: + a += BIT_SHIFT(k[0], k[1], s); + b += BIT_SHIFT(k[1], k[2], s) & LOWER8b_MASK; + break; + case 4: + a += BIT_SHIFT(k[0], k[1], s); + break; + case 3: + a += BIT_SHIFT(k[0], k[1], s) & LOWER24b_MASK; + break; + case 2: + a += BIT_SHIFT(k[0], k[1], s) & LOWER16b_MASK; + break; + case 1: + a += BIT_SHIFT(k[0], k[1], s) & LOWER8b_MASK; + break; + /* zero length strings require no mixing */ + case 0: + *pc = c; + *pb = b; + return; + } + } + + __rte_jhash_final(a, b, c); + + *pc = c; + *pb = b; +} + +/** + * Same as rte_jhash, but takes two seeds and return two uint32_ts. + * pc and pb must be non-null, and *pc and *pb must both be initialized + * with seeds. If you pass in (*pb)=0, the output (*pc) will be + * the same as the return value from rte_jhash. + * + * @param key + * Key to calculate hash of. + * @param length + * Length of key in bytes. + * @param pc + * IN: seed OUT: primary hash value. + * @param pb + * IN: second seed OUT: secondary hash value. + */ +static inline void +rte_jhash_2hashes(const void *key, uint32_t length, uint32_t *pc, uint32_t *pb) +{ + __rte_jhash_2hashes(key, length, pc, pb, 1); +} + +/** + * Same as rte_jhash_32b, but takes two seeds and return two uint32_ts. + * pc and pb must be non-null, and *pc and *pb must both be initialized + * with seeds. If you pass in (*pb)=0, the output (*pc) will be + * the same as the return value from rte_jhash_32b. + * + * @param k + * Key to calculate hash of. + * @param length + * Length of key in units of 4 bytes. + * @param pc + * IN: seed OUT: primary hash value. + * @param pb + * IN: second seed OUT: secondary hash value. + */ +static inline void +rte_jhash_32b_2hashes(const uint32_t *k, uint32_t length, uint32_t *pc, uint32_t *pb) +{ + __rte_jhash_2hashes((const void *) k, (length << 2), pc, pb, 0); +} + +/** + * The most generic version, hashes an arbitrary sequence + * of bytes. No alignment or length assumptions are made about + * the input key. For keys not aligned to four byte boundaries + * or a multiple of four bytes in length, the memory region + * just after may be read (but not used in the computation). + * This may cross a page boundary. + * + * @param key + * Key to calculate hash of. + * @param length + * Length of key in bytes. + * @param initval + * Initialising value of hash. + * @return + * Calculated hash value. + */ +static inline uint32_t +rte_jhash(const void *key, uint32_t length, uint32_t initval) +{ + uint32_t initval2 = 0; + + rte_jhash_2hashes(key, length, &initval, &initval2); + + return initval; +} + +/** + * A special optimized version that handles 1 or more of uint32_ts. + * The length parameter here is the number of uint32_ts in the key. + * + * @param k + * Key to calculate hash of. + * @param length + * Length of key in units of 4 bytes. + * @param initval + * Initialising value of hash. + * @return + * Calculated hash value. 
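 *
 * Illustrative usage added in review (not part of the original header;
 * variable names are hypothetical):
 *
 *   uint32_t key[4] = { 1, 2, 3, 4 };
 *   uint32_t hash = rte_jhash_32b(key, 4, 0);  // length in 32-bit words, seed 0
 *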
+ */ +static inline uint32_t +rte_jhash_32b(const uint32_t *k, uint32_t length, uint32_t initval) +{ + uint32_t initval2 = 0; + + rte_jhash_32b_2hashes(k, length, &initval, &initval2); + + return initval; +} + +static inline uint32_t +__rte_jhash_3words(uint32_t a, uint32_t b, uint32_t c, uint32_t initval) +{ + a += RTE_JHASH_GOLDEN_RATIO + initval; + b += RTE_JHASH_GOLDEN_RATIO + initval; + c += RTE_JHASH_GOLDEN_RATIO + initval; + + __rte_jhash_final(a, b, c); + + return c; +} + +/** + * A special ultra-optimized versions that knows it is hashing exactly + * 3 words. + * + * @param a + * First word to calculate hash of. + * @param b + * Second word to calculate hash of. + * @param c + * Third word to calculate hash of. + * @param initval + * Initialising value of hash. + * @return + * Calculated hash value. + */ +static inline uint32_t +rte_jhash_3words(uint32_t a, uint32_t b, uint32_t c, uint32_t initval) +{ + return __rte_jhash_3words(a + 12, b + 12, c + 12, initval); +} + +/** + * A special ultra-optimized versions that knows it is hashing exactly + * 2 words. + * + * @param a + * First word to calculate hash of. + * @param b + * Second word to calculate hash of. + * @param initval + * Initialising value of hash. + * @return + * Calculated hash value. + */ +static inline uint32_t +rte_jhash_2words(uint32_t a, uint32_t b, uint32_t initval) +{ + return __rte_jhash_3words(a + 8, b + 8, 8, initval); +} + +/** + * A special ultra-optimized versions that knows it is hashing exactly + * 1 word. + * + * @param a + * Word to calculate hash of. + * @param initval + * Initialising value of hash. + * @return + * Calculated hash value. + */ +static inline uint32_t +rte_jhash_1word(uint32_t a, uint32_t initval) +{ + return __rte_jhash_3words(a + 4, 4, 4, initval); +} + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_JHASH_H */ Index: sys/contrib/dpdk_rte_lpm/rte_log.h =================================================================== --- /dev/null +++ sys/contrib/dpdk_rte_lpm/rte_log.h @@ -0,0 +1,383 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2017 Intel Corporation + */ + +#ifndef _RTE_LOG_H_ +#define _RTE_LOG_H_ + +/** + * @file + * + * RTE Logs API + * + * This file provides a log API to RTE applications. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include +#include +#include + +#include +#include +#include + +struct rte_log_dynamic_type; + +/** The rte_log structure. */ +struct rte_logs { + uint32_t type; /**< Bitfield with enabled logs. */ + uint32_t level; /**< Log level. */ + FILE *file; /**< Output file set by rte_openlog_stream, or NULL. */ + size_t dynamic_types_len; + struct rte_log_dynamic_type *dynamic_types; +}; + +/** Global log information */ +extern struct rte_logs rte_logs; + +/* SDK log type */ +#define RTE_LOGTYPE_EAL 0 /**< Log related to eal. */ +#define RTE_LOGTYPE_MALLOC 1 /**< Log related to malloc. */ +#define RTE_LOGTYPE_RING 2 /**< Log related to ring. */ +#define RTE_LOGTYPE_MEMPOOL 3 /**< Log related to mempool. */ +#define RTE_LOGTYPE_TIMER 4 /**< Log related to timers. */ +#define RTE_LOGTYPE_PMD 5 /**< Log related to poll mode driver. */ +#define RTE_LOGTYPE_HASH 6 /**< Log related to hash table. */ +#define RTE_LOGTYPE_LPM 7 /**< Log related to LPM. */ +#define RTE_LOGTYPE_KNI 8 /**< Log related to KNI. */ +#define RTE_LOGTYPE_ACL 9 /**< Log related to ACL. */ +#define RTE_LOGTYPE_POWER 10 /**< Log related to power. */ +#define RTE_LOGTYPE_METER 11 /**< Log related to QoS meter. 
*/ +#define RTE_LOGTYPE_SCHED 12 /**< Log related to QoS port scheduler. */ +#define RTE_LOGTYPE_PORT 13 /**< Log related to port. */ +#define RTE_LOGTYPE_TABLE 14 /**< Log related to table. */ +#define RTE_LOGTYPE_PIPELINE 15 /**< Log related to pipeline. */ +#define RTE_LOGTYPE_MBUF 16 /**< Log related to mbuf. */ +#define RTE_LOGTYPE_CRYPTODEV 17 /**< Log related to cryptodev. */ +#define RTE_LOGTYPE_EFD 18 /**< Log related to EFD. */ +#define RTE_LOGTYPE_EVENTDEV 19 /**< Log related to eventdev. */ +#define RTE_LOGTYPE_GSO 20 /**< Log related to GSO. */ + +/* these log types can be used in an application */ +#define RTE_LOGTYPE_USER1 24 /**< User-defined log type 1. */ +#define RTE_LOGTYPE_USER2 25 /**< User-defined log type 2. */ +#define RTE_LOGTYPE_USER3 26 /**< User-defined log type 3. */ +#define RTE_LOGTYPE_USER4 27 /**< User-defined log type 4. */ +#define RTE_LOGTYPE_USER5 28 /**< User-defined log type 5. */ +#define RTE_LOGTYPE_USER6 29 /**< User-defined log type 6. */ +#define RTE_LOGTYPE_USER7 30 /**< User-defined log type 7. */ +#define RTE_LOGTYPE_USER8 31 /**< User-defined log type 8. */ + +/** First identifier for extended logs */ +#define RTE_LOGTYPE_FIRST_EXT_ID 32 + +/* Can't use 0, as it gives compiler warnings */ +#define RTE_LOG_EMERG 1U /**< System is unusable. */ +#define RTE_LOG_ALERT 2U /**< Action must be taken immediately. */ +#define RTE_LOG_CRIT 3U /**< Critical conditions. */ +#define RTE_LOG_ERR 4U /**< Error conditions. */ +#define RTE_LOG_WARNING 5U /**< Warning conditions. */ +#define RTE_LOG_NOTICE 6U /**< Normal but significant condition. */ +#define RTE_LOG_INFO 7U /**< Informational. */ +#define RTE_LOG_DEBUG 8U /**< Debug-level messages. */ + +/** + * Change the stream that will be used by the logging system. + * + * This can be done at any time. The f argument represents the stream + * to be used to send the logs. If f is NULL, the default output is + * used (stderr). + * + * @param f + * Pointer to the stream. + * @return + * - 0 on success. + * - Negative on error. + */ +int rte_openlog_stream(FILE *f); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Retrieve the stream used by the logging system (see rte_openlog_stream() + * to change it). + * + * @return + * Pointer to the stream. + */ +__rte_experimental +FILE *rte_log_get_stream(void); + +/** + * Set the global log level. + * + * After this call, logs with a level lower or equal than the level + * passed as argument will be displayed. + * + * @param level + * Log level. A value between RTE_LOG_EMERG (1) and RTE_LOG_DEBUG (8). + */ +void rte_log_set_global_level(uint32_t level); + +/** + * Get the global log level. + * + * @return + * The current global log level. + */ +uint32_t rte_log_get_global_level(void); + +/** + * Get the log level for a given type. + * + * @param logtype + * The log type identifier. + * @return + * 0 on success, a negative value if logtype is invalid. + */ +int rte_log_get_level(uint32_t logtype); + +/** + * For a given `logtype`, check if a log with `loglevel` can be printed. + * + * @param logtype + * The log type identifier + * @param loglevel + * Log level. A value between RTE_LOG_EMERG (1) and RTE_LOG_DEBUG (8). + * @return + * Returns 'true' if log can be printed and 'false' if it can't. + */ +__rte_experimental +bool rte_log_can_log(uint32_t logtype, uint32_t loglevel); + +/** + * Set the log level for a given type based on shell pattern. + * + * @param pattern + * The match pattern identifying the log type. 
+ * @param level + * The level to be set. + * @return + * 0 on success, a negative value if level is invalid. + */ +int rte_log_set_level_pattern(const char *pattern, uint32_t level); + +/** + * Set the log level for a given type based on regular expression. + * + * @param regex + * The regular expression identifying the log type. + * @param level + * The level to be set. + * @return + * 0 on success, a negative value if level is invalid. + */ +int rte_log_set_level_regexp(const char *regex, uint32_t level); + +/** + * Set the log level for a given type. + * + * @param logtype + * The log type identifier. + * @param level + * The level to be set. + * @return + * 0 on success, a negative value if logtype or level is invalid. + */ +int rte_log_set_level(uint32_t logtype, uint32_t level); + +/** + * Get the current loglevel for the message being processed. + * + * Before calling the user-defined stream for logging, the log + * subsystem sets a per-lcore variable containing the loglevel and the + * logtype of the message being processed. This information can be + * accessed by the user-defined log output function through this + * function. + * + * @return + * The loglevel of the message being processed. + */ +int rte_log_cur_msg_loglevel(void); + +/** + * Get the current logtype for the message being processed. + * + * Before calling the user-defined stream for logging, the log + * subsystem sets a per-lcore variable containing the loglevel and the + * logtype of the message being processed. This information can be + * accessed by the user-defined log output function through this + * function. + * + * @return + * The logtype of the message being processed. + */ +int rte_log_cur_msg_logtype(void); + +/** + * Register a dynamic log type + * + * If a log is already registered with the same type, the returned value + * is the same than the previous one. + * + * @param name + * The string identifying the log type. + * @return + * - >0: success, the returned value is the log type identifier. + * - (-ENOMEM): cannot allocate memory. + */ +int rte_log_register(const char *name); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Register a dynamic log type and try to pick its level from EAL options + * + * rte_log_register() is called inside. If successful, the function tries + * to search for matching regexp in the list of EAL log level options and + * pick the level from the last matching entry. If nothing can be applied + * from the list, the level will be set to the user-defined default value. + * + * @param name + * Name for the log type to be registered + * @param level_def + * Fallback level to be set if the global list has no matching options + * @return + * - >=0: the newly registered log type + * - <0: rte_log_register() error value + */ +__rte_experimental +int rte_log_register_type_and_pick_level(const char *name, uint32_t level_def); + +/** + * Dump log information. + * + * Dump the global level and the registered log types. + * + * @param f + * The output stream where the dump should be sent. + */ +void rte_log_dump(FILE *f); + +/** + * Generates a log message. + * + * The message will be sent in the stream defined by the previous call + * to rte_openlog_stream(). + * + * The level argument determines if the log should be displayed or + * not, depending on the global rte_logs variable. + * + * The preferred alternative is the RTE_LOG() because it adds the + * level and type in the logged string. + * + * @param level + * Log level. 
A value between RTE_LOG_EMERG (1) and RTE_LOG_DEBUG (8). + * @param logtype + * The log type, for example, RTE_LOGTYPE_EAL. + * @param format + * The format string, as in printf(3), followed by the variable arguments + * required by the format. + * @return + * - 0: Success. + * - Negative on error. + */ +int rte_log(uint32_t level, uint32_t logtype, const char *format, ...) +#ifdef __GNUC__ +#if (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ > 2)) + __rte_cold +#endif +#endif + __rte_format_printf(3, 4); + +/** + * Generates a log message. + * + * The message will be sent in the stream defined by the previous call + * to rte_openlog_stream(). + * + * The level argument determines if the log should be displayed or + * not, depending on the global rte_logs variable. A trailing + * newline may be added if needed. + * + * The preferred alternative is the RTE_LOG() because it adds the + * level and type in the logged string. + * + * @param level + * Log level. A value between RTE_LOG_EMERG (1) and RTE_LOG_DEBUG (8). + * @param logtype + * The log type, for example, RTE_LOGTYPE_EAL. + * @param format + * The format string, as in printf(3), followed by the variable arguments + * required by the format. + * @param ap + * The va_list of the variable arguments required by the format. + * @return + * - 0: Success. + * - Negative on error. + */ +int rte_vlog(uint32_t level, uint32_t logtype, const char *format, va_list ap) + __rte_format_printf(3, 0); + +/** + * Generates a log message. + * + * The RTE_LOG() is a helper that prefixes the string with the log level + * and type, and call rte_log(). + * + * @param l + * Log level. A value between EMERG (1) and DEBUG (8). The short name is + * expanded by the macro, so it cannot be an integer value. + * @param t + * The log type, for example, EAL. The short name is expanded by the + * macro, so it cannot be an integer value. + * @param ... + * The fmt string, as in printf(3), followed by the variable arguments + * required by the format. + * @return + * - 0: Success. + * - Negative on error. + */ +#define RTE_LOG(l, t, ...) \ + rte_log(RTE_LOG_ ## l, \ + RTE_LOGTYPE_ ## t, # t ": " __VA_ARGS__) + +/** + * Generates a log message for data path. + * + * Similar to RTE_LOG(), except that it is removed at compilation time + * if the RTE_LOG_DP_LEVEL configuration option is lower than the log + * level argument. + * + * @param l + * Log level. A value between EMERG (1) and DEBUG (8). The short name is + * expanded by the macro, so it cannot be an integer value. + * @param t + * The log type, for example, EAL. The short name is expanded by the + * macro, so it cannot be an integer value. + * @param ... + * The fmt string, as in printf(3), followed by the variable arguments + * required by the format. + * @return + * - 0: Success. + * - Negative on error. + */ +#define RTE_LOG_DP(l, t, ...) \ + (void)((RTE_LOG_ ## l <= RTE_LOG_DP_LEVEL) ? 
\ + rte_log(RTE_LOG_ ## l, \ + RTE_LOGTYPE_ ## t, # t ": " __VA_ARGS__) : \ + 0) + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_LOG_H_ */ Index: sys/contrib/dpdk_rte_lpm/rte_lpm.h =================================================================== --- /dev/null +++ sys/contrib/dpdk_rte_lpm/rte_lpm.h @@ -0,0 +1,403 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef _RTE_LPM_H_ +#define _RTE_LPM_H_ + +/** + * @file + * RTE Longest Prefix Match (LPM) + */ + +/* +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +*/ +#include "rte_branch_prediction.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** Max number of characters in LPM name. */ +#define RTE_LPM_NAMESIZE 16 + +/** Maximum depth value possible for IPv4 LPM. */ +#define RTE_LPM_MAX_DEPTH 32 + +/** @internal Total number of tbl24 entries. */ +#define RTE_LPM_TBL24_NUM_ENTRIES (1 << 24) + +/** @internal Number of entries in a tbl8 group. */ +#define RTE_LPM_TBL8_GROUP_NUM_ENTRIES 256 + +/** @internal Max number of tbl8 groups in the tbl8. */ +#define RTE_LPM_MAX_TBL8_NUM_GROUPS (1 << 24) + +/** @internal Total number of tbl8 groups in the tbl8. */ +#define RTE_LPM_TBL8_NUM_GROUPS 256 + +/** @internal Total number of tbl8 entries. */ +#define RTE_LPM_TBL8_NUM_ENTRIES (RTE_LPM_TBL8_NUM_GROUPS * \ + RTE_LPM_TBL8_GROUP_NUM_ENTRIES) + +/** @internal Macro to enable/disable run-time checks. */ +#if defined(RTE_LIBRTE_LPM_DEBUG) +#define RTE_LPM_RETURN_IF_TRUE(cond, retval) do { \ + if (cond) return (retval); \ +} while (0) +#else +#define RTE_LPM_RETURN_IF_TRUE(cond, retval) +#endif + +/** @internal bitmask with valid and valid_group fields set */ +#define RTE_LPM_VALID_EXT_ENTRY_BITMASK 0x03000000 + +/** Bitmask used to indicate successful lookup */ +#define RTE_LPM_LOOKUP_SUCCESS 0x01000000 + +#if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN +/** @internal Tbl24 entry structure. */ +__extension__ +struct rte_lpm_tbl_entry { + /** + * Stores Next hop (tbl8 or tbl24 when valid_group is not set) or + * a group index pointing to a tbl8 structure (tbl24 only, when + * valid_group is set) + */ + uint32_t next_hop :24; + /* Using single uint8_t to store 3 values. */ + uint32_t valid :1; /**< Validation flag. */ + /** + * For tbl24: + * - valid_group == 0: entry stores a next hop + * - valid_group == 1: entry stores a group_index pointing to a tbl8 + * For tbl8: + * - valid_group indicates whether the current tbl8 is in use or not + */ + uint32_t valid_group :1; + uint32_t depth :6; /**< Rule depth. */ +}; + +#else + +__extension__ +struct rte_lpm_tbl_entry { + uint32_t depth :6; + uint32_t valid_group :1; + uint32_t valid :1; + uint32_t next_hop :24; + +}; + +#endif + +/** LPM configuration structure. */ +struct rte_lpm_config { + uint32_t max_rules; /**< Max number of rules. */ + uint32_t number_tbl8s; /**< Number of tbl8s to allocate. */ + int flags; /**< This field is currently unused. */ +}; + +/** @internal Rule structure. */ +struct rte_lpm_rule { + uint32_t ip; /**< Rule IP address. */ + uint32_t next_hop; /**< Rule next hop. */ +}; + +/** @internal Contains metadata about the rules table. */ +struct rte_lpm_rule_info { + uint32_t used_rules; /**< Used rules so far. */ + uint32_t first_rule; /**< Indexes the first rule of a given depth. 
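+ * (rules_tbl keeps rules grouped by depth, so this is the offset of
+ * that depth group's first rule within the table.)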
*/ +}; + +struct nhop_object; +struct rte_lpm_external { + struct nhop_object **nh_idx; /**< # -> idx mappings */ + uint32_t default_idx; /* nhop index of default route */ + uint32_t fibnum; /* fib index */ +}; + +/** @internal LPM structure. */ +struct rte_lpm { + /* LPM metadata. */ + struct rte_lpm_external ext; + char name[RTE_LPM_NAMESIZE]; /**< Name of the lpm. */ + uint32_t max_rules; /**< Max. balanced rules per lpm. */ + uint32_t number_tbl8s; /**< Number of tbl8s. */ + struct rte_lpm_rule_info rule_info[RTE_LPM_MAX_DEPTH]; /**< Rule info table. */ + + /* LPM Tables. */ + struct rte_lpm_tbl_entry tbl24[RTE_LPM_TBL24_NUM_ENTRIES] + __rte_cache_aligned; /**< LPM tbl24 table. */ + struct rte_lpm_tbl_entry *tbl8; /**< LPM tbl8 table. */ + struct rte_lpm_rule *rules_tbl; /**< LPM rules. */ +}; + +/** + * Create an LPM object. + * + * @param name + * LPM object name + * @param socket_id + * NUMA socket ID for LPM table memory allocation + * @param config + * Structure containing the configuration + * @return + * Handle to LPM object on success, NULL otherwise with rte_errno set + * to an appropriate values. Possible rte_errno values include: + * - E_RTE_NO_CONFIG - function could not get pointer to rte_config structure + * - E_RTE_SECONDARY - function was called from a secondary process instance + * - EINVAL - invalid parameter passed to function + * - ENOSPC - the maximum number of memzones has already been allocated + * - EEXIST - a memzone with the same name already exists + * - ENOMEM - no appropriate memory area found in which to create memzone + */ +struct rte_lpm * +rte_lpm_create(const char *name, int socket_id, + const struct rte_lpm_config *config); + +/** + * Find an existing LPM object and return a pointer to it. + * + * @param name + * Name of the lpm object as passed to rte_lpm_create() + * @return + * Pointer to lpm object or NULL if object not found with rte_errno + * set appropriately. Possible rte_errno values include: + * - ENOENT - required entry not available to return. + */ +struct rte_lpm * +rte_lpm_find_existing(const char *name); + +/** + * Free an LPM object. + * + * @param lpm + * LPM object handle + * @return + * None + */ +void +rte_lpm_free(struct rte_lpm *lpm); + +/** + * Add a rule to the LPM table. + * + * @param lpm + * LPM object handle + * @param ip + * IP of the rule to be added to the LPM table + * @param depth + * Depth of the rule to be added to the LPM table + * @param next_hop + * Next hop of the rule to be added to the LPM table + * @return + * 0 on success, negative value otherwise + */ +int +rte_lpm_add(struct rte_lpm *lpm, uint32_t ip, uint8_t depth, uint32_t next_hop); + +/** + * Check if a rule is present in the LPM table, + * and provide its next hop if it is. + * + * @param lpm + * LPM object handle + * @param ip + * IP of the rule to be searched + * @param depth + * Depth of the rule to searched + * @param next_hop + * Next hop of the rule (valid only if it is found) + * @return + * 1 if the rule exists, 0 if it does not, a negative value on failure + */ +int +rte_lpm_is_rule_present(struct rte_lpm *lpm, uint32_t ip, uint8_t depth, +uint32_t *next_hop); + +/** + * Delete a rule from the LPM table. 
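+ *
+ * (Editor's note: in this contrib copy the internal rules table is
+ * compiled out, so the caller supplies the covering "parent" rule's depth
+ * and nexthop index directly; sub_rule_nhop == 0 means no covering rule
+ * exists and the affected entries are simply invalidated.)
+ *
+ * Illustrative call, deleting 10.0.0.0/24 (IP in host byte order) with no
+ * covering rule:
+ *
+ *   rte_lpm_delete(lpm, 0x0a000000, 24, 0, 0);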
+ * + * @param lpm + * LPM object handle + * @param ip + * IP of the rule to be deleted from the LPM table + * @param depth + * Depth of the rule to be deleted from the LPM table + * @param psub_rule_depth + * Pointer to depth of the parent rule + * @param sub_rule_nhop + * Pinter to the parent rule nexthop index + * @return + * 0 on success, negative value otherwise + */ +int +rte_lpm_delete(struct rte_lpm *lpm, uint32_t ip, uint8_t depth, + uint8_t sub_rule_depth, uint32_t sub_rule_nhop); + +/** + * Delete all rules from the LPM table. + * + * @param lpm + * LPM object handle + */ +void +rte_lpm_delete_all(struct rte_lpm *lpm); + +/** + * Lookup an IP into the LPM table. + * + * @param lpm + * LPM object handle + * @param ip + * IP to be looked up in the LPM table + * @param next_hop + * Next hop of the most specific rule found for IP (valid on lookup hit only) + * @return + * -EINVAL for incorrect arguments, -ENOENT on lookup miss, 0 on lookup hit + */ +static inline int +rte_lpm_lookup(struct rte_lpm *lpm, uint32_t ip, uint32_t *next_hop) +{ + unsigned tbl24_index = (ip >> 8); + uint32_t tbl_entry; + const uint32_t *ptbl; + + /* DEBUG: Check user input arguments. */ + RTE_LPM_RETURN_IF_TRUE(((lpm == NULL) || (next_hop == NULL)), -EINVAL); + + /* Copy tbl24 entry */ + ptbl = (const uint32_t *)(&lpm->tbl24[tbl24_index]); + tbl_entry = *ptbl; + + /* Memory ordering is not required in lookup. Because dataflow + * dependency exists, compiler or HW won't be able to re-order + * the operations. + */ + /* Copy tbl8 entry (only if needed) */ + if (unlikely((tbl_entry & RTE_LPM_VALID_EXT_ENTRY_BITMASK) == + RTE_LPM_VALID_EXT_ENTRY_BITMASK)) { + + unsigned tbl8_index = (uint8_t)ip + + (((uint32_t)tbl_entry & 0x00FFFFFF) * + RTE_LPM_TBL8_GROUP_NUM_ENTRIES); + + ptbl = (const uint32_t *)&lpm->tbl8[tbl8_index]; + tbl_entry = *ptbl; + } + + *next_hop = ((uint32_t)tbl_entry & 0x00FFFFFF); + return (tbl_entry & RTE_LPM_LOOKUP_SUCCESS) ? 0 : -ENOENT; +} + +/** + * Lookup multiple IP addresses in an LPM table. This may be implemented as a + * macro, so the address of the function should not be used. + * + * @param lpm + * LPM object handle + * @param ips + * Array of IPs to be looked up in the LPM table + * @param next_hops + * Next hop of the most specific rule found for IP (valid on lookup hit only). + * This is an array of two byte values. The most significant byte in each + * value says whether the lookup was successful (bitmask + * RTE_LPM_LOOKUP_SUCCESS is set). The least significant byte is the + * actual next hop. + * @param n + * Number of elements in ips (and next_hops) array to lookup. This should be a + * compile time constant, and divisible by 8 for best performance. + * @return + * -EINVAL for incorrect arguments, otherwise 0 + */ +#define rte_lpm_lookup_bulk(lpm, ips, next_hops, n) \ + rte_lpm_lookup_bulk_func(lpm, ips, next_hops, n) + +static inline int +rte_lpm_lookup_bulk_func(const struct rte_lpm *lpm, const uint32_t *ips, + uint32_t *next_hops, const unsigned n) +{ + unsigned i; + unsigned tbl24_indexes[n]; + const uint32_t *ptbl; + + /* DEBUG: Check user input arguments. 
*/ + RTE_LPM_RETURN_IF_TRUE(((lpm == NULL) || (ips == NULL) || + (next_hops == NULL)), -EINVAL); + + for (i = 0; i < n; i++) { + tbl24_indexes[i] = ips[i] >> 8; + } + + for (i = 0; i < n; i++) { + /* Simply copy tbl24 entry to output */ + ptbl = (const uint32_t *)&lpm->tbl24[tbl24_indexes[i]]; + next_hops[i] = *ptbl; + + /* Overwrite output with tbl8 entry if needed */ + if (unlikely((next_hops[i] & RTE_LPM_VALID_EXT_ENTRY_BITMASK) == + RTE_LPM_VALID_EXT_ENTRY_BITMASK)) { + + unsigned tbl8_index = (uint8_t)ips[i] + + (((uint32_t)next_hops[i] & 0x00FFFFFF) * + RTE_LPM_TBL8_GROUP_NUM_ENTRIES); + + ptbl = (const uint32_t *)&lpm->tbl8[tbl8_index]; + next_hops[i] = *ptbl; + } + } + return 0; +} + +/* Mask four results. */ +#define RTE_LPM_MASKX4_RES UINT64_C(0x00ffffff00ffffff) + +/** + * Lookup four IP addresses in an LPM table. + * + * @param lpm + * LPM object handle + * @param ip + * Four IPs to be looked up in the LPM table + * @param hop + * Next hop of the most specific rule found for IP (valid on lookup hit only). + * This is an 4 elements array of two byte values. + * If the lookup was successful for the given IP, then least significant byte + * of the corresponding element is the actual next hop and the most + * significant byte is zero. + * If the lookup for the given IP failed, then corresponding element would + * contain default value, see description of then next parameter. + * @param defv + * Default value to populate into corresponding element of hop[] array, + * if lookup would fail. + */ +#if 0 +static inline void +rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4], + uint32_t defv); + +#if defined(RTE_ARCH_ARM) || defined(RTE_ARCH_ARM64) +#include "rte_lpm_neon.h" +#elif defined(RTE_ARCH_PPC_64) +#include "rte_lpm_altivec.h" +#else +#include "rte_lpm_sse.h" +#endif +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_LPM_H_ */ Index: sys/contrib/dpdk_rte_lpm/rte_lpm.c =================================================================== --- /dev/null +++ sys/contrib/dpdk_rte_lpm/rte_lpm.c @@ -0,0 +1,1107 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int errno = 0, rte_errno = 0; + +#if 0 +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include /* for definition of RTE_CACHE_LINE_SIZE */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#endif + +#include "rte_shim.h" +#include "rte_lpm.h" + +#if 0 +TAILQ_HEAD(rte_lpm_list, rte_tailq_entry); + +static struct rte_tailq_elem rte_lpm_tailq = { + .name = "RTE_LPM", +}; +EAL_REGISTER_TAILQ(rte_lpm_tailq) +#endif + +#define MAX_DEPTH_TBL24 24 + +enum valid_flag { + INVALID = 0, + VALID +}; + +/* Macro to enable/disable run-time checks. */ +#if defined(RTE_LIBRTE_LPM_DEBUG) +#include +#define VERIFY_DEPTH(depth) do { \ + if ((depth == 0) || (depth > RTE_LPM_MAX_DEPTH)) \ + rte_panic("LPM: Invalid depth (%u) at line %d", \ + (unsigned)(depth), __LINE__); \ +} while (0) +#else +#define VERIFY_DEPTH(depth) +#endif + +/* + * Converts a given depth value to its corresponding mask value. 
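+ * For example, a depth of 24 yields the mask 0xFFFFFF00 and a depth of 32
+ * yields 0xFFFFFFFF.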
+ * + * depth (IN) : range = 1 - 32 + * mask (OUT) : 32bit mask + */ +static uint32_t __attribute__((pure)) +depth_to_mask(uint8_t depth) +{ + VERIFY_DEPTH(depth); + + /* To calculate a mask start with a 1 on the left hand side and right + * shift while populating the left hand side with 1's + */ + return (int)0x80000000 >> (depth - 1); +} + +/* + * Converts given depth value to its corresponding range value. + */ +static uint32_t __attribute__((pure)) +depth_to_range(uint8_t depth) +{ + VERIFY_DEPTH(depth); + + /* + * Calculate tbl24 range. (Note: 2^depth = 1 << depth) + */ + if (depth <= MAX_DEPTH_TBL24) + return 1 << (MAX_DEPTH_TBL24 - depth); + + /* Else if depth is greater than 24 */ + return 1 << (RTE_LPM_MAX_DEPTH - depth); +} + +#if 0 +/* + * Find an existing lpm table and return a pointer to it. + */ +struct rte_lpm * +rte_lpm_find_existing(const char *name) +{ + struct rte_lpm *l = NULL; + struct rte_tailq_entry *te; + struct rte_lpm_list *lpm_list; + + lpm_list = RTE_TAILQ_CAST(rte_lpm_tailq.head, rte_lpm_list); + + rte_mcfg_tailq_read_lock(); + TAILQ_FOREACH(te, lpm_list, next) { + l = te->data; + if (strncmp(name, l->name, RTE_LPM_NAMESIZE) == 0) + break; + } + rte_mcfg_tailq_read_unlock(); + + if (te == NULL) { + rte_errno = ENOENT; + return NULL; + } + + return l; +} +#endif + +/* + * Allocates memory for LPM object + */ +struct rte_lpm * +rte_lpm_create(const char *name, int socket_id, + const struct rte_lpm_config *config) +{ + char mem_name[RTE_LPM_NAMESIZE]; + struct rte_lpm *lpm = NULL; + //struct rte_tailq_entry *te; + uint32_t mem_size, rules_size, tbl8s_size; + //struct rte_lpm_list *lpm_list; + + //lpm_list = RTE_TAILQ_CAST(rte_lpm_tailq.head, rte_lpm_list); + + RTE_BUILD_BUG_ON(sizeof(struct rte_lpm_tbl_entry) != 4); + + /* Check user arguments. */ + if ((name == NULL) || (socket_id < -1) + || config->number_tbl8s > RTE_LPM_MAX_TBL8_NUM_GROUPS) { + rte_errno = EINVAL; + return NULL; + } + + snprintf(mem_name, sizeof(mem_name), "LPM_%s", name); + + /* Determine the amount of memory to allocate. */ + mem_size = sizeof(*lpm); + rules_size = sizeof(struct rte_lpm_rule) * config->max_rules; + tbl8s_size = (sizeof(struct rte_lpm_tbl_entry) * + RTE_LPM_TBL8_GROUP_NUM_ENTRIES * config->number_tbl8s); + +#if 0 + rte_mcfg_tailq_write_lock(); + + /* guarantee there's no existing */ + TAILQ_FOREACH(te, lpm_list, next) { + lpm = te->data; + if (strncmp(name, lpm->name, RTE_LPM_NAMESIZE) == 0) + break; + } + + if (te != NULL) { + lpm = NULL; + rte_errno = EEXIST; + goto exit; + } + + /* allocate tailq entry */ + te = rte_zmalloc("LPM_TAILQ_ENTRY", sizeof(*te), 0); + if (te == NULL) { + RTE_LOG(ERR, LPM, "Failed to allocate tailq entry\n"); + rte_errno = ENOMEM; + goto exit; + } +#endif + + /* Allocate memory to store the LPM data structures. 
*/ + lpm = rte_zmalloc_socket(mem_name, mem_size, + RTE_CACHE_LINE_SIZE, socket_id); + if (lpm == NULL) { + RTE_LOG(ERR, LPM, "LPM memory allocation failed\n"); + //rte_free(te); + rte_errno = ENOMEM; + goto exit; + } + + lpm->rules_tbl = rte_zmalloc_socket(NULL, + (size_t)rules_size, RTE_CACHE_LINE_SIZE, socket_id); + + if (lpm->rules_tbl == NULL) { + RTE_LOG(ERR, LPM, "LPM rules_tbl memory allocation failed\n"); + rte_free(lpm); + lpm = NULL; + //rte_free(te); + rte_errno = ENOMEM; + goto exit; + } + + lpm->tbl8 = rte_zmalloc_socket(NULL, + (size_t)tbl8s_size, RTE_CACHE_LINE_SIZE, socket_id); + + if (lpm->tbl8 == NULL) { + RTE_LOG(ERR, LPM, "LPM tbl8 memory allocation failed\n"); + rte_free(lpm->rules_tbl); + rte_free(lpm); + lpm = NULL; + //rte_free(te); + rte_errno = ENOMEM; + goto exit; + } + + /* Save user arguments. */ + lpm->max_rules = config->max_rules; + lpm->number_tbl8s = config->number_tbl8s; + strlcpy(lpm->name, name, sizeof(lpm->name)); + + //te->data = lpm; + + //TAILQ_INSERT_TAIL(lpm_list, te, next); + +exit: + rte_mcfg_tailq_write_unlock(); + + return lpm; +} + +/* + * Deallocates memory for given LPM table. + */ +void +rte_lpm_free(struct rte_lpm *lpm) +{ +#if 0 + struct rte_lpm_list *lpm_list; + struct rte_tailq_entry *te; + + /* Check user arguments. */ + if (lpm == NULL) + return; + + lpm_list = RTE_TAILQ_CAST(rte_lpm_tailq.head, rte_lpm_list); + + rte_mcfg_tailq_write_lock(); + + /* find our tailq entry */ + TAILQ_FOREACH(te, lpm_list, next) { + if (te->data == (void *) lpm) + break; + } + if (te != NULL) + TAILQ_REMOVE(lpm_list, te, next); + + rte_mcfg_tailq_write_unlock(); +#endif + + rte_free(lpm->tbl8); + rte_free(lpm->rules_tbl); + rte_free(lpm); + //rte_free(te); +} + +#if 0 +/* + * Adds a rule to the rule table. + * + * NOTE: The rule table is split into 32 groups. Each group contains rules that + * apply to a specific prefix depth (i.e. group 1 contains rules that apply to + * prefixes with a depth of 1 etc.). In the following code (depth - 1) is used + * to refer to depth 1 because even though the depth range is 1 - 32, depths + * are stored in the rule table from 0 - 31. + * NOTE: Valid range for depth parameter is 1 .. 32 inclusive. + */ +static int32_t +rule_add(struct rte_lpm *lpm, uint32_t ip_masked, uint8_t depth, + uint32_t next_hop) +{ + uint32_t rule_gindex, rule_index, last_rule; + int i; + + VERIFY_DEPTH(depth); + + /* Scan through rule group to see if rule already exists. */ + if (lpm->rule_info[depth - 1].used_rules > 0) { + + /* rule_gindex stands for rule group index. */ + rule_gindex = lpm->rule_info[depth - 1].first_rule; + /* Initialise rule_index to point to start of rule group. */ + rule_index = rule_gindex; + /* Last rule = Last used rule in this rule group. */ + last_rule = rule_gindex + lpm->rule_info[depth - 1].used_rules; + + for (; rule_index < last_rule; rule_index++) { + + /* If rule already exists update next hop and return. */ + if (lpm->rules_tbl[rule_index].ip == ip_masked) { + + if (lpm->rules_tbl[rule_index].next_hop + == next_hop) + return -EEXIST; + lpm->rules_tbl[rule_index].next_hop = next_hop; + + return rule_index; + } + } + + if (rule_index == lpm->max_rules) + return -ENOSPC; + } else { + /* Calculate the position in which the rule will be stored. 
*/ + rule_index = 0; + + for (i = depth - 1; i > 0; i--) { + if (lpm->rule_info[i - 1].used_rules > 0) { + rule_index = lpm->rule_info[i - 1].first_rule + + lpm->rule_info[i - 1].used_rules; + break; + } + } + if (rule_index == lpm->max_rules) + return -ENOSPC; + + lpm->rule_info[depth - 1].first_rule = rule_index; + } + + /* Make room for the new rule in the array. */ + for (i = RTE_LPM_MAX_DEPTH; i > depth; i--) { + if (lpm->rule_info[i - 1].first_rule + + lpm->rule_info[i - 1].used_rules == lpm->max_rules) + return -ENOSPC; + + if (lpm->rule_info[i - 1].used_rules > 0) { + lpm->rules_tbl[lpm->rule_info[i - 1].first_rule + + lpm->rule_info[i - 1].used_rules] + = lpm->rules_tbl[lpm->rule_info[i - 1].first_rule]; + lpm->rule_info[i - 1].first_rule++; + } + } + + /* Add the new rule. */ + lpm->rules_tbl[rule_index].ip = ip_masked; + lpm->rules_tbl[rule_index].next_hop = next_hop; + + /* Increment the used rules counter for this rule group. */ + lpm->rule_info[depth - 1].used_rules++; + + return rule_index; +} + +/* + * Delete a rule from the rule table. + * NOTE: Valid range for depth parameter is 1 .. 32 inclusive. + */ +static void +rule_delete(struct rte_lpm *lpm, int32_t rule_index, uint8_t depth) +{ + int i; + + VERIFY_DEPTH(depth); + + lpm->rules_tbl[rule_index] = + lpm->rules_tbl[lpm->rule_info[depth - 1].first_rule + + lpm->rule_info[depth - 1].used_rules - 1]; + + for (i = depth; i < RTE_LPM_MAX_DEPTH; i++) { + if (lpm->rule_info[i].used_rules > 0) { + lpm->rules_tbl[lpm->rule_info[i].first_rule - 1] = + lpm->rules_tbl[lpm->rule_info[i].first_rule + + lpm->rule_info[i].used_rules - 1]; + lpm->rule_info[i].first_rule--; + } + } + + lpm->rule_info[depth - 1].used_rules--; +} + +/* + * Finds a rule in rule table. + * NOTE: Valid range for depth parameter is 1 .. 32 inclusive. + */ +static int32_t +rule_find(struct rte_lpm *lpm, uint32_t ip_masked, uint8_t depth) +{ + uint32_t rule_gindex, last_rule, rule_index; + + VERIFY_DEPTH(depth); + + rule_gindex = lpm->rule_info[depth - 1].first_rule; + last_rule = rule_gindex + lpm->rule_info[depth - 1].used_rules; + + /* Scan used rules at given depth to find rule. */ + for (rule_index = rule_gindex; rule_index < last_rule; rule_index++) { + /* If rule is found return the rule index. */ + if (lpm->rules_tbl[rule_index].ip == ip_masked) + return rule_index; + } + + /* If rule is not found return -EINVAL. */ + return -EINVAL; +} +#endif + +/* + * Find, clean and allocate a tbl8. + */ +static int32_t +tbl8_alloc(struct rte_lpm_tbl_entry *tbl8, uint32_t number_tbl8s) +{ + uint32_t group_idx; /* tbl8 group index. */ + struct rte_lpm_tbl_entry *tbl8_entry; + + /* Scan through tbl8 to find a free (i.e. INVALID) tbl8 group. */ + for (group_idx = 0; group_idx < number_tbl8s; group_idx++) { + tbl8_entry = &tbl8[group_idx * RTE_LPM_TBL8_GROUP_NUM_ENTRIES]; + /* If a free tbl8 group is found clean it and set as VALID. */ + if (!tbl8_entry->valid_group) { + struct rte_lpm_tbl_entry new_tbl8_entry = { + .next_hop = 0, + .valid = INVALID, + .depth = 0, + .valid_group = VALID, + }; + + memset(&tbl8_entry[0], 0, + RTE_LPM_TBL8_GROUP_NUM_ENTRIES * + sizeof(tbl8_entry[0])); + + __atomic_store(tbl8_entry, &new_tbl8_entry, + __ATOMIC_RELAXED); + + /* Return group index for allocated tbl8 group. */ + return group_idx; + } + } + + /* If there are no tbl8 groups free then return error. 
*/ + return -ENOSPC; +} + +static void +tbl8_free(struct rte_lpm_tbl_entry *tbl8, uint32_t tbl8_group_start) +{ + /* Set tbl8 group invalid*/ + struct rte_lpm_tbl_entry zero_tbl8_entry = {0}; + + __atomic_store(&tbl8[tbl8_group_start], &zero_tbl8_entry, + __ATOMIC_RELAXED); +} + +static __rte_noinline int32_t +add_depth_small(struct rte_lpm *lpm, uint32_t ip, uint8_t depth, + uint32_t next_hop) +{ +#define group_idx next_hop + uint32_t tbl24_index, tbl24_range, tbl8_index, tbl8_group_end, i, j; + + /* Calculate the index into Table24. */ + tbl24_index = ip >> 8; + tbl24_range = depth_to_range(depth); + + for (i = tbl24_index; i < (tbl24_index + tbl24_range); i++) { + /* + * For invalid OR valid and non-extended tbl 24 entries set + * entry. + */ + if (!lpm->tbl24[i].valid || (lpm->tbl24[i].valid_group == 0 && + lpm->tbl24[i].depth <= depth)) { + + struct rte_lpm_tbl_entry new_tbl24_entry = { + .next_hop = next_hop, + .valid = VALID, + .valid_group = 0, + .depth = depth, + }; + + /* Setting tbl24 entry in one go to avoid race + * conditions + */ + __atomic_store(&lpm->tbl24[i], &new_tbl24_entry, + __ATOMIC_RELEASE); + + continue; + } + + if (lpm->tbl24[i].valid_group == 1) { + /* If tbl24 entry is valid and extended calculate the + * index into tbl8. + */ + tbl8_index = lpm->tbl24[i].group_idx * + RTE_LPM_TBL8_GROUP_NUM_ENTRIES; + tbl8_group_end = tbl8_index + + RTE_LPM_TBL8_GROUP_NUM_ENTRIES; + + for (j = tbl8_index; j < tbl8_group_end; j++) { + if (!lpm->tbl8[j].valid || + lpm->tbl8[j].depth <= depth) { + struct rte_lpm_tbl_entry + new_tbl8_entry = { + .valid = VALID, + .valid_group = VALID, + .depth = depth, + .next_hop = next_hop, + }; + + /* + * Setting tbl8 entry in one go to avoid + * race conditions + */ + __atomic_store(&lpm->tbl8[j], + &new_tbl8_entry, + __ATOMIC_RELAXED); + + continue; + } + } + } + } +#undef group_idx + return 0; +} + +static __rte_noinline int32_t +add_depth_big(struct rte_lpm *lpm, uint32_t ip_masked, uint8_t depth, + uint32_t next_hop) +{ +#define group_idx next_hop + uint32_t tbl24_index; + int32_t tbl8_group_index, tbl8_group_start, tbl8_group_end, tbl8_index, + tbl8_range, i; + + tbl24_index = (ip_masked >> 8); + tbl8_range = depth_to_range(depth); + + if (!lpm->tbl24[tbl24_index].valid) { + /* Search for a free tbl8 group. */ + tbl8_group_index = tbl8_alloc(lpm->tbl8, lpm->number_tbl8s); + + /* Check tbl8 allocation was successful. */ + if (tbl8_group_index < 0) { + return tbl8_group_index; + } + + /* Find index into tbl8 and range. */ + tbl8_index = (tbl8_group_index * + RTE_LPM_TBL8_GROUP_NUM_ENTRIES) + + (ip_masked & 0xFF); + + /* Set tbl8 entry. */ + for (i = tbl8_index; i < (tbl8_index + tbl8_range); i++) { + struct rte_lpm_tbl_entry new_tbl8_entry = { + .valid = VALID, + .depth = depth, + .valid_group = lpm->tbl8[i].valid_group, + .next_hop = next_hop, + }; + __atomic_store(&lpm->tbl8[i], &new_tbl8_entry, + __ATOMIC_RELAXED); + } + + /* + * Update tbl24 entry to point to new tbl8 entry. Note: The + * ext_flag and tbl8_index need to be updated simultaneously, + * so assign whole structure in one go + */ + + struct rte_lpm_tbl_entry new_tbl24_entry = { + .group_idx = tbl8_group_index, + .valid = VALID, + .valid_group = 1, + .depth = 0, + }; + + /* The tbl24 entry must be written only after the + * tbl8 entries are written. + */ + __atomic_store(&lpm->tbl24[tbl24_index], &new_tbl24_entry, + __ATOMIC_RELEASE); + + } /* If valid entry but not extended calculate the index into Table8. 
*/ + else if (lpm->tbl24[tbl24_index].valid_group == 0) { + /* Search for free tbl8 group. */ + tbl8_group_index = tbl8_alloc(lpm->tbl8, lpm->number_tbl8s); + + if (tbl8_group_index < 0) { + return tbl8_group_index; + } + + tbl8_group_start = tbl8_group_index * + RTE_LPM_TBL8_GROUP_NUM_ENTRIES; + tbl8_group_end = tbl8_group_start + + RTE_LPM_TBL8_GROUP_NUM_ENTRIES; + + /* Populate new tbl8 with tbl24 value. */ + for (i = tbl8_group_start; i < tbl8_group_end; i++) { + struct rte_lpm_tbl_entry new_tbl8_entry = { + .valid = VALID, + .depth = lpm->tbl24[tbl24_index].depth, + .valid_group = lpm->tbl8[i].valid_group, + .next_hop = lpm->tbl24[tbl24_index].next_hop, + }; + __atomic_store(&lpm->tbl8[i], &new_tbl8_entry, + __ATOMIC_RELAXED); + } + + tbl8_index = tbl8_group_start + (ip_masked & 0xFF); + + /* Insert new rule into the tbl8 entry. */ + for (i = tbl8_index; i < tbl8_index + tbl8_range; i++) { + struct rte_lpm_tbl_entry new_tbl8_entry = { + .valid = VALID, + .depth = depth, + .valid_group = lpm->tbl8[i].valid_group, + .next_hop = next_hop, + }; + __atomic_store(&lpm->tbl8[i], &new_tbl8_entry, + __ATOMIC_RELAXED); + } + + /* + * Update tbl24 entry to point to new tbl8 entry. Note: The + * ext_flag and tbl8_index need to be updated simultaneously, + * so assign whole structure in one go. + */ + + struct rte_lpm_tbl_entry new_tbl24_entry = { + .group_idx = tbl8_group_index, + .valid = VALID, + .valid_group = 1, + .depth = 0, + }; + + /* The tbl24 entry must be written only after the + * tbl8 entries are written. + */ + __atomic_store(&lpm->tbl24[tbl24_index], &new_tbl24_entry, + __ATOMIC_RELEASE); + + } else { /* + * If it is valid, extended entry calculate the index into tbl8. + */ + tbl8_group_index = lpm->tbl24[tbl24_index].group_idx; + tbl8_group_start = tbl8_group_index * + RTE_LPM_TBL8_GROUP_NUM_ENTRIES; + tbl8_index = tbl8_group_start + (ip_masked & 0xFF); + + for (i = tbl8_index; i < (tbl8_index + tbl8_range); i++) { + + if (!lpm->tbl8[i].valid || + lpm->tbl8[i].depth <= depth) { + struct rte_lpm_tbl_entry new_tbl8_entry = { + .valid = VALID, + .depth = depth, + .next_hop = next_hop, + .valid_group = lpm->tbl8[i].valid_group, + }; + + /* + * Setting tbl8 entry in one go to avoid race + * condition + */ + __atomic_store(&lpm->tbl8[i], &new_tbl8_entry, + __ATOMIC_RELAXED); + + continue; + } + } + } +#undef group_idx + return 0; +} + +/* + * Add a route + */ +int +rte_lpm_add(struct rte_lpm *lpm, uint32_t ip, uint8_t depth, + uint32_t next_hop) +{ + int32_t status = 0; + uint32_t ip_masked; + + /* Check user arguments. */ + if ((lpm == NULL) || (depth < 1) || (depth > RTE_LPM_MAX_DEPTH)) + return -EINVAL; + + ip_masked = ip & depth_to_mask(depth); + +#if 0 + /* Add the rule to the rule table. */ + rule_index = rule_add(lpm, ip_masked, depth, next_hop); + + /* Skip table entries update if The rule is the same as + * the rule in the rules table. + */ + if (rule_index == -EEXIST) + return 0; + + /* If the is no space available for new rule return error. */ + if (rule_index < 0) { + return rule_index; + } +#endif + + if (depth <= MAX_DEPTH_TBL24) { + status = add_depth_small(lpm, ip_masked, depth, next_hop); + } else { /* If depth > RTE_LPM_MAX_DEPTH_TBL24 */ + status = add_depth_big(lpm, ip_masked, depth, next_hop); + + /* + * If add fails due to exhaustion of tbl8 extensions delete + * rule that was added to rule table. 
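+ * (In this copy the rules table is compiled out, so the commented-out
+ * rule_delete() below does not run and the error is simply returned.)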
+ */ + if (status < 0) { + //rule_delete(lpm, rule_index, depth); + + return status; + } + } + + return 0; +} + +#if 0 +/* + * Look for a rule in the high-level rules table + */ +int +rte_lpm_is_rule_present(struct rte_lpm *lpm, uint32_t ip, uint8_t depth, +uint32_t *next_hop) +{ + uint32_t ip_masked; + int32_t rule_index; + + /* Check user arguments. */ + if ((lpm == NULL) || + (next_hop == NULL) || + (depth < 1) || (depth > RTE_LPM_MAX_DEPTH)) + return -EINVAL; + + /* Look for the rule using rule_find. */ + ip_masked = ip & depth_to_mask(depth); + rule_index = rule_find(lpm, ip_masked, depth); + + if (rule_index >= 0) { + *next_hop = lpm->rules_tbl[rule_index].next_hop; + return 1; + } + + /* If rule is not found return 0. */ + return 0; +} + +static int32_t +find_previous_rule(struct rte_lpm *lpm, uint32_t ip, uint8_t depth, + uint8_t *sub_rule_depth) +{ + int32_t rule_index; + uint32_t ip_masked; + uint8_t prev_depth; + + for (prev_depth = (uint8_t)(depth - 1); prev_depth > 0; prev_depth--) { + ip_masked = ip & depth_to_mask(prev_depth); + + rule_index = rule_find(lpm, ip_masked, prev_depth); + + if (rule_index >= 0) { + *sub_rule_depth = prev_depth; + return rule_index; + } + } + + return -1; +} +#endif + +static int32_t +delete_depth_small(struct rte_lpm *lpm, uint32_t ip_masked, + uint8_t depth, uint32_t sub_rule_nhop, uint8_t sub_rule_depth) +{ +#define group_idx next_hop + uint32_t tbl24_range, tbl24_index, tbl8_group_index, tbl8_index, i, j; + + /* Calculate the range and index into Table24. */ + tbl24_range = depth_to_range(depth); + tbl24_index = (ip_masked >> 8); + struct rte_lpm_tbl_entry zero_tbl24_entry = {0}; + + /* + * Firstly check the sub_rule_index. A -1 indicates no replacement rule + * and a positive number indicates a sub_rule_index. + */ + if (sub_rule_nhop == 0) { + /* + * If no replacement rule exists then invalidate entries + * associated with this rule. + */ + for (i = tbl24_index; i < (tbl24_index + tbl24_range); i++) { + + if (lpm->tbl24[i].valid_group == 0 && + lpm->tbl24[i].depth <= depth) { + __atomic_store(&lpm->tbl24[i], + &zero_tbl24_entry, __ATOMIC_RELEASE); + } else if (lpm->tbl24[i].valid_group == 1) { + /* + * If TBL24 entry is extended, then there has + * to be a rule with depth >= 25 in the + * associated TBL8 group. + */ + + tbl8_group_index = lpm->tbl24[i].group_idx; + tbl8_index = tbl8_group_index * + RTE_LPM_TBL8_GROUP_NUM_ENTRIES; + + for (j = tbl8_index; j < (tbl8_index + + RTE_LPM_TBL8_GROUP_NUM_ENTRIES); j++) { + + if (lpm->tbl8[j].depth <= depth) + lpm->tbl8[j].valid = INVALID; + } + } + } + } else { + /* + * If a replacement rule exists then modify entries + * associated with this rule. + */ + + struct rte_lpm_tbl_entry new_tbl24_entry = { + .next_hop = sub_rule_nhop, + .valid = VALID, + .valid_group = 0, + .depth = sub_rule_depth, + }; + + struct rte_lpm_tbl_entry new_tbl8_entry = { + .valid = VALID, + .valid_group = VALID, + .depth = sub_rule_depth, + .next_hop = sub_rule_nhop, + }; + + for (i = tbl24_index; i < (tbl24_index + tbl24_range); i++) { + + if (lpm->tbl24[i].valid_group == 0 && + lpm->tbl24[i].depth <= depth) { + __atomic_store(&lpm->tbl24[i], &new_tbl24_entry, + __ATOMIC_RELEASE); + } else if (lpm->tbl24[i].valid_group == 1) { + /* + * If TBL24 entry is extended, then there has + * to be a rule with depth >= 25 in the + * associated TBL8 group. 
+ */ + + tbl8_group_index = lpm->tbl24[i].group_idx; + tbl8_index = tbl8_group_index * + RTE_LPM_TBL8_GROUP_NUM_ENTRIES; + + for (j = tbl8_index; j < (tbl8_index + + RTE_LPM_TBL8_GROUP_NUM_ENTRIES); j++) { + + if (lpm->tbl8[j].depth <= depth) + __atomic_store(&lpm->tbl8[j], + &new_tbl8_entry, + __ATOMIC_RELAXED); + } + } + } + } +#undef group_idx + return 0; +} + +/* + * Checks if table 8 group can be recycled. + * + * Return of -EEXIST means tbl8 is in use and thus can not be recycled. + * Return of -EINVAL means tbl8 is empty and thus can be recycled + * Return of value > -1 means tbl8 is in use but has all the same values and + * thus can be recycled + */ +static int32_t +tbl8_recycle_check(struct rte_lpm_tbl_entry *tbl8, + uint32_t tbl8_group_start) +{ + uint32_t tbl8_group_end, i; + tbl8_group_end = tbl8_group_start + RTE_LPM_TBL8_GROUP_NUM_ENTRIES; + + /* + * Check the first entry of the given tbl8. If it is invalid we know + * this tbl8 does not contain any rule with a depth < RTE_LPM_MAX_DEPTH + * (As they would affect all entries in a tbl8) and thus this table + * can not be recycled. + */ + if (tbl8[tbl8_group_start].valid) { + /* + * If first entry is valid check if the depth is less than 24 + * and if so check the rest of the entries to verify that they + * are all of this depth. + */ + if (tbl8[tbl8_group_start].depth <= MAX_DEPTH_TBL24) { + for (i = (tbl8_group_start + 1); i < tbl8_group_end; + i++) { + + if (tbl8[i].depth != + tbl8[tbl8_group_start].depth) { + + return -EEXIST; + } + } + /* If all entries are the same return the tb8 index */ + return tbl8_group_start; + } + + return -EEXIST; + } + /* + * If the first entry is invalid check if the rest of the entries in + * the tbl8 are invalid. + */ + for (i = (tbl8_group_start + 1); i < tbl8_group_end; i++) { + if (tbl8[i].valid) + return -EEXIST; + } + /* If no valid entries are found then return -EINVAL. */ + return -EINVAL; +} + +static int32_t +delete_depth_big(struct rte_lpm *lpm, uint32_t ip_masked, + uint8_t depth, uint32_t sub_rule_nhop, uint8_t sub_rule_depth) +{ +#define group_idx next_hop + uint32_t tbl24_index, tbl8_group_index, tbl8_group_start, tbl8_index, + tbl8_range, i; + int32_t tbl8_recycle_index; + + /* + * Calculate the index into tbl24 and range. Note: All depths larger + * than MAX_DEPTH_TBL24 are associated with only one tbl24 entry. + */ + tbl24_index = ip_masked >> 8; + + /* Calculate the index into tbl8 and range. */ + tbl8_group_index = lpm->tbl24[tbl24_index].group_idx; + tbl8_group_start = tbl8_group_index * RTE_LPM_TBL8_GROUP_NUM_ENTRIES; + tbl8_index = tbl8_group_start + (ip_masked & 0xFF); + tbl8_range = depth_to_range(depth); + + if (sub_rule_nhop == 0) { + /* + * Loop through the range of entries on tbl8 for which the + * rule_to_delete must be removed or modified. + */ + for (i = tbl8_index; i < (tbl8_index + tbl8_range); i++) { + if (lpm->tbl8[i].depth <= depth) + lpm->tbl8[i].valid = INVALID; + } + } else { + /* Set new tbl8 entry. */ + struct rte_lpm_tbl_entry new_tbl8_entry = { + .valid = VALID, + .depth = sub_rule_depth, + .valid_group = lpm->tbl8[tbl8_group_start].valid_group, + .next_hop = sub_rule_nhop, + }; + + /* + * Loop through the range of entries on tbl8 for which the + * rule_to_delete must be modified. + */ + for (i = tbl8_index; i < (tbl8_index + tbl8_range); i++) { + if (lpm->tbl8[i].depth <= depth) + __atomic_store(&lpm->tbl8[i], &new_tbl8_entry, + __ATOMIC_RELAXED); + } + } + + /* + * Check if there are any valid entries in this tbl8 group. 
If all + * tbl8 entries are invalid we can free the tbl8 and invalidate the + * associated tbl24 entry. + */ + + tbl8_recycle_index = tbl8_recycle_check(lpm->tbl8, tbl8_group_start); + + if (tbl8_recycle_index == -EINVAL) { + /* Set tbl24 before freeing tbl8 to avoid race condition. + * Prevent the free of the tbl8 group from hoisting. + */ + lpm->tbl24[tbl24_index].valid = 0; + __atomic_thread_fence(__ATOMIC_RELEASE); + tbl8_free(lpm->tbl8, tbl8_group_start); + } else if (tbl8_recycle_index > -1) { + /* Update tbl24 entry. */ + struct rte_lpm_tbl_entry new_tbl24_entry = { + .next_hop = lpm->tbl8[tbl8_recycle_index].next_hop, + .valid = VALID, + .valid_group = 0, + .depth = lpm->tbl8[tbl8_recycle_index].depth, + }; + + /* Set tbl24 before freeing tbl8 to avoid race condition. + * Prevent the free of the tbl8 group from hoisting. + */ + __atomic_store(&lpm->tbl24[tbl24_index], &new_tbl24_entry, + __ATOMIC_RELAXED); + __atomic_thread_fence(__ATOMIC_RELEASE); + tbl8_free(lpm->tbl8, tbl8_group_start); + } +#undef group_idx + return 0; +} + +/* + * Deletes a rule + */ +int +rte_lpm_delete(struct rte_lpm *lpm, uint32_t ip, uint8_t depth, + uint8_t sub_rule_depth, uint32_t sub_rule_nhop) +{ + //int32_t rule_to_delete_index; + uint32_t ip_masked; + //uint8_t sub_rule_depth; + /* + * Check input arguments. Note: IP must be a positive integer of 32 + * bits in length therefore it need not be checked. + */ + if ((lpm == NULL) || (depth < 1) || (depth > RTE_LPM_MAX_DEPTH)) { + return -EINVAL; + } + + ip_masked = ip & depth_to_mask(depth); + +#if 0 + /* + * Find the index of the input rule, that needs to be deleted, in the + * rule table. + */ + rule_to_delete_index = rule_find(lpm, ip_masked, depth); + + /* + * Check if rule_to_delete_index was found. If no rule was found the + * function rule_find returns -EINVAL. + */ + if (rule_to_delete_index < 0) + return -EINVAL; + + /* Delete the rule from the rule table. */ + rule_delete(lpm, rule_to_delete_index, depth); +#endif + + /* + * Find rule to replace the rule_to_delete. If there is no rule to + * replace the rule_to_delete we return -1 and invalidate the table + * entries associated with this rule. + */ + //sub_rule_depth = *psub_rule_depth; + //sub_rule_index = find_previous_rule(lpm, ip, depth, &sub_rule_depth); + + /* + * If the input depth value is less than 25 use function + * delete_depth_small otherwise use delete_depth_big. + */ + if (depth <= MAX_DEPTH_TBL24) { + return delete_depth_small(lpm, ip_masked, depth, + sub_rule_nhop, sub_rule_depth); + } else { /* If depth > MAX_DEPTH_TBL24 */ + return delete_depth_big(lpm, ip_masked, depth, sub_rule_nhop, + sub_rule_depth); + } +} + +/* + * Delete all rules from the LPM table. + */ +void +rte_lpm_delete_all(struct rte_lpm *lpm) +{ + /* Zero rule information. */ + memset(lpm->rule_info, 0, sizeof(lpm->rule_info)); + + /* Zero tbl24. */ + memset(lpm->tbl24, 0, sizeof(lpm->tbl24)); + + /* Zero tbl8. */ + memset(lpm->tbl8, 0, sizeof(lpm->tbl8[0]) + * RTE_LPM_TBL8_GROUP_NUM_ENTRIES * lpm->number_tbl8s); + + /* Delete all rules form the rules table. 
*/ + memset(lpm->rules_tbl, 0, sizeof(lpm->rules_tbl[0]) * lpm->max_rules); +} Index: sys/contrib/dpdk_rte_lpm/rte_lpm6.h =================================================================== --- /dev/null +++ sys/contrib/dpdk_rte_lpm/rte_lpm6.h @@ -0,0 +1,209 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ +#ifndef _RTE_LPM6_H_ +#define _RTE_LPM6_H_ + +/** + * @file + * RTE Longest Prefix Match for IPv6 (LPM6) + */ + +#ifdef __cplusplus +extern "C" { +#endif + + +#define RTE_LPM6_MAX_DEPTH 128 +#define RTE_LPM6_IPV6_ADDR_SIZE 16 +/** Max number of characters in LPM name. */ +#define RTE_LPM6_NAMESIZE 32 + +/** LPM structure. */ +struct rte_lpm6; + +struct nhop_object; +struct rte_lpm6_external { + struct nhop_object **nh_idx; /**< # -> idx mappings */ + uint32_t default_idx; /* nhop index of default route */ + uint32_t fibnum; /* fib index */ +}; + +/** LPM configuration structure. */ +struct rte_lpm6_config { + uint32_t max_rules; /**< Max number of rules. */ + uint32_t number_tbl8s; /**< Number of tbl8s to allocate. */ + int flags; /**< This field is currently unused. */ +}; + +#define RTE_LPM6_RULE_SIZE 32 +struct rte_lpm6_rule *fill_rule6(char *buffer, const uint8_t *ip, + uint8_t depth, uint32_t next_hop); +/** + * Create an LPM object. + * + * @param name + * LPM object name + * @param socket_id + * NUMA socket ID for LPM table memory allocation + * @param config + * Structure containing the configuration + * @return + * Handle to LPM object on success, NULL otherwise with rte_errno set + * to an appropriate values. Possible rte_errno values include: + * - E_RTE_NO_CONFIG - function could not get pointer to rte_config structure + * - E_RTE_SECONDARY - function was called from a secondary process instance + * - EINVAL - invalid parameter passed to function + * - ENOSPC - the maximum number of memzones has already been allocated + * - EEXIST - a memzone with the same name already exists + * - ENOMEM - no appropriate memory area found in which to create memzone + */ +struct rte_lpm6 * +rte_lpm6_create(const char *name, int socket_id, + const struct rte_lpm6_config *config); + +/** + * Find an existing LPM object and return a pointer to it. + * + * @param name + * Name of the lpm object as passed to rte_lpm6_create() + * @return + * Pointer to lpm object or NULL if object not found with rte_errno + * set appropriately. Possible rte_errno values include: + * - ENOENT - required entry not available to return. + */ +struct rte_lpm6 * +rte_lpm6_find_existing(const char *name); + +/** + * Free an LPM object. + * + * @param lpm + * LPM object handle + * @return + * None + */ +void +rte_lpm6_free(struct rte_lpm6 *lpm); + +/** + * Add a rule to the LPM table. + * + * @param lpm + * LPM object handle + * @param ip + * IP of the rule to be added to the LPM table + * @param depth + * Depth of the rule to be added to the LPM table + * @param next_hop + * Next hop of the rule to be added to the LPM table + * @return + * 0 on success, negative value otherwise + */ +int +rte_lpm6_add(struct rte_lpm6 *lpm, const uint8_t *ip, uint8_t depth, + uint32_t next_hop, int is_new_rule); + +/** + * Check if a rule is present in the LPM table, + * and provide its next hop if it is. 
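+ *
+ * Illustrative check (editor's sketch) for 2001:db8::/32, assuming an
+ * initialized lpm:
+ *
+ *   uint8_t ip6[RTE_LPM6_IPV6_ADDR_SIZE] = { 0x20, 0x01, 0x0d, 0xb8 };
+ *   uint32_t nh;
+ *   int found = rte_lpm6_is_rule_present(lpm, ip6, 32, &nh);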
+ * + * @param lpm + * LPM object handle + * @param ip + * IP of the rule to be searched + * @param depth + * Depth of the rule to searched + * @param next_hop + * Next hop of the rule (valid only if it is found) + * @return + * 1 if the rule exists, 0 if it does not, a negative value on failure + */ +int +rte_lpm6_is_rule_present(struct rte_lpm6 *lpm, const uint8_t *ip, uint8_t depth, + uint32_t *next_hop); + +/** + * Delete a rule from the LPM table. + * + * @param lpm + * LPM object handle + * @param ip + * IP of the rule to be deleted from the LPM table + * @param depth + * Depth of the rule to be deleted from the LPM table + * @return + * 0 on success, negative value otherwise + */ +int +rte_lpm6_delete(struct rte_lpm6 *lpm, const uint8_t *ip, uint8_t depth, + struct rte_lpm6_rule *lsp_rule); + +/** + * Delete a rule from the LPM table. + * + * @param lpm + * LPM object handle + * @param ips + * Array of IPs to be deleted from the LPM table + * @param depths + * Array of depths of the rules to be deleted from the LPM table + * @param n + * Number of rules to be deleted from the LPM table + * @return + * 0 on success, negative value otherwise. + */ +int +rte_lpm6_delete_bulk_func(struct rte_lpm6 *lpm, + uint8_t ips[][RTE_LPM6_IPV6_ADDR_SIZE], uint8_t *depths, unsigned n); + +/** + * Delete all rules from the LPM table. + * + * @param lpm + * LPM object handle + */ +void +rte_lpm6_delete_all(struct rte_lpm6 *lpm); + +/** + * Lookup an IP into the LPM table. + * + * @param lpm + * LPM object handle + * @param ip + * IP to be looked up in the LPM table + * @param next_hop + * Next hop of the most specific rule found for IP (valid on lookup hit only) + * @return + * -EINVAL for incorrect arguments, -ENOENT on lookup miss, 0 on lookup hit + */ +int +rte_lpm6_lookup(const struct rte_lpm6 *lpm, const uint8_t *ip, uint32_t *next_hop); + +/** + * Lookup multiple IP addresses in an LPM table. + * + * @param lpm + * LPM object handle + * @param ips + * Array of IPs to be looked up in the LPM table + * @param next_hops + * Next hop of the most specific rule found for IP (valid on lookup hit only). + * This is an array of two byte values. The next hop will be stored on + * each position on success; otherwise the position will be set to -1. + * @param n + * Number of elements in ips (and next_hops) array to lookup. 
+ * @return + * -EINVAL for incorrect arguments, otherwise 0 + */ +int +rte_lpm6_lookup_bulk_func(const struct rte_lpm6 *lpm, + uint8_t ips[][RTE_LPM6_IPV6_ADDR_SIZE], + int32_t *next_hops, unsigned int n); + +#ifdef __cplusplus +} +#endif + +#endif Index: sys/contrib/dpdk_rte_lpm/rte_lpm6.c =================================================================== --- /dev/null +++ sys/contrib/dpdk_rte_lpm/rte_lpm6.c @@ -0,0 +1,1415 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +//#include +int errno = 0, rte_errno = 0; + +#include "rte_shim.h" +#include "rte_lpm6.h" + +#define RTE_LPM6_TBL24_NUM_ENTRIES (1 << 24) +#define RTE_LPM6_TBL8_GROUP_NUM_ENTRIES 256 +#define RTE_LPM6_TBL8_MAX_NUM_GROUPS (1 << 21) + +#define RTE_LPM6_VALID_EXT_ENTRY_BITMASK 0xA0000000 +#define RTE_LPM6_LOOKUP_SUCCESS 0x20000000 +#define RTE_LPM6_TBL8_BITMASK 0x001FFFFF + +#define ADD_FIRST_BYTE 3 +#define LOOKUP_FIRST_BYTE 4 +#define BYTE_SIZE 8 +#define BYTES2_SIZE 16 + +#define RULE_HASH_TABLE_EXTRA_SPACE 64 +#define TBL24_IND UINT32_MAX + +#define lpm6_tbl8_gindex next_hop + +/** Flags for setting an entry as valid/invalid. */ +enum valid_flag { + INVALID = 0, + VALID +}; + +#if 0 +TAILQ_HEAD(rte_lpm6_list, rte_tailq_entry); + +static struct rte_tailq_elem rte_lpm6_tailq = { + .name = "RTE_LPM6", +}; +EAL_REGISTER_TAILQ(rte_lpm6_tailq) +#endif + +/** Tbl entry structure. It is the same for both tbl24 and tbl8 */ +struct rte_lpm6_tbl_entry { + uint32_t next_hop: 21; /**< Next hop / next table to be checked. */ + uint32_t depth :8; /**< Rule depth. */ + + /* Flags. */ + uint32_t valid :1; /**< Validation flag. */ + uint32_t valid_group :1; /**< Group validation flag. */ + uint32_t ext_entry :1; /**< External entry. */ +}; + +/** Rules tbl entry structure. */ +struct rte_lpm6_rule { + uint8_t ip[RTE_LPM6_IPV6_ADDR_SIZE]; /**< Rule IP address. */ + uint32_t next_hop; /**< Rule next hop. */ + uint8_t depth; /**< Rule depth. */ +}; + +/** Rules tbl entry key. */ +struct rte_lpm6_rule_key { + uint8_t ip[RTE_LPM6_IPV6_ADDR_SIZE]; /**< Rule IP address. */ + uint8_t depth; /**< Rule depth. */ +}; + +/* Header of tbl8 */ +struct rte_lpm_tbl8_hdr { + uint32_t owner_tbl_ind; /**< owner table: TBL24_IND if owner is tbl24, + * otherwise index of tbl8 + */ + uint32_t owner_entry_ind; /**< index of the owner table entry where + * pointer to the tbl8 is stored + */ + uint32_t ref_cnt; /**< table reference counter */ +}; + +/** LPM6 structure. */ +struct rte_lpm6 { + struct rte_lpm6_external ext; /* Storage used by the algo wrapper */ + /* LPM metadata. */ + char name[RTE_LPM6_NAMESIZE]; /**< Name of the lpm. */ + uint32_t max_rules; /**< Max number of rules. */ + uint32_t used_rules; /**< Used rules so far. */ + uint32_t number_tbl8s; /**< Number of tbl8s to allocate. */ + + /* LPM Tables. */ + //struct rte_hash *rules_tbl; /**< LPM rules. */ + struct rte_lpm6_tbl_entry tbl24[RTE_LPM6_TBL24_NUM_ENTRIES] + __rte_cache_aligned; /**< LPM tbl24 table. */ + + uint32_t *tbl8_pool; /**< pool of indexes of free tbl8s */ + uint32_t tbl8_pool_pos; /**< current position in the tbl8 pool */ + + struct rte_lpm_tbl8_hdr *tbl8_hdrs; /* array of tbl8 headers */ + + struct rte_lpm6_tbl_entry tbl8[0] + __rte_cache_aligned; /**< LPM tbl8 table. */ +}; + +/* + * Takes an array of uint8_t (IPv6 address) and masks it using the depth. 
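+ * (e.g., masking 2001:db8:1:2::1 with depth 32 yields 2001:db8::).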
+ * It leaves untouched one bit per unit in the depth variable + * and set the rest to 0. + */ +static inline void +ip6_mask_addr(uint8_t *ip, uint8_t depth) +{ + int16_t part_depth, mask; + int i; + + part_depth = depth; + + for (i = 0; i < RTE_LPM6_IPV6_ADDR_SIZE; i++) { + if (part_depth < BYTE_SIZE && part_depth >= 0) { + mask = (uint16_t)(~(UINT8_MAX >> part_depth)); + ip[i] = (uint8_t)(ip[i] & mask); + } else if (part_depth < 0) + ip[i] = 0; + + part_depth -= BYTE_SIZE; + } +} + +/* copy ipv6 address */ +static inline void +ip6_copy_addr(uint8_t *dst, const uint8_t *src) +{ + rte_memcpy(dst, src, RTE_LPM6_IPV6_ADDR_SIZE); +} + +#if 0 +/* + * LPM6 rule hash function + * + * It's used as a hash function for the rte_hash + * containing rules + */ +static inline uint32_t +rule_hash(const void *data, __rte_unused uint32_t data_len, + uint32_t init_val) +{ + return rte_jhash(data, sizeof(struct rte_lpm6_rule_key), init_val); +} +#endif + +/* + * Init pool of free tbl8 indexes + */ +static void +tbl8_pool_init(struct rte_lpm6 *lpm) +{ + uint32_t i; + + /* put entire range of indexes to the tbl8 pool */ + for (i = 0; i < lpm->number_tbl8s; i++) + lpm->tbl8_pool[i] = i; + + lpm->tbl8_pool_pos = 0; +} + +/* + * Get an index of a free tbl8 from the pool + */ +static inline uint32_t +tbl8_get(struct rte_lpm6 *lpm, uint32_t *tbl8_ind) +{ + if (lpm->tbl8_pool_pos == lpm->number_tbl8s) + /* no more free tbl8 */ + return -ENOSPC; + + /* next index */ + *tbl8_ind = lpm->tbl8_pool[lpm->tbl8_pool_pos++]; + return 0; +} + +/* + * Put an index of a free tbl8 back to the pool + */ +static inline uint32_t +tbl8_put(struct rte_lpm6 *lpm, uint32_t tbl8_ind) +{ + if (lpm->tbl8_pool_pos == 0) + /* pool is full */ + return -ENOSPC; + + lpm->tbl8_pool[--lpm->tbl8_pool_pos] = tbl8_ind; + return 0; +} + +/* + * Returns number of tbl8s available in the pool + */ +static inline uint32_t +tbl8_available(struct rte_lpm6 *lpm) +{ + return lpm->number_tbl8s - lpm->tbl8_pool_pos; +} + +#if 0 +/* + * Init a rule key. + * note that ip must be already masked + */ +static inline void +rule_key_init(struct rte_lpm6_rule_key *key, uint8_t *ip, uint8_t depth) +{ + ip6_copy_addr(key->ip, ip); + key->depth = depth; +} + +/* + * Rebuild the entire LPM tree by reinserting all rules + */ +static void +rebuild_lpm(struct rte_lpm6 *lpm) +{ + uint64_t next_hop; + struct rte_lpm6_rule_key *rule_key; + uint32_t iter = 0; + + while (rte_hash_iterate(lpm->rules_tbl, (void *) &rule_key, + (void **) &next_hop, &iter) >= 0) + rte_lpm6_add(lpm, rule_key->ip, rule_key->depth, + (uint32_t) next_hop); +} +#endif + +/* + * Allocates memory for LPM object + */ +struct rte_lpm6 * +rte_lpm6_create(const char *name, int socket_id, + const struct rte_lpm6_config *config) +{ + char mem_name[RTE_LPM6_NAMESIZE]; + struct rte_lpm6 *lpm = NULL; + //struct rte_tailq_entry *te; + uint64_t mem_size; + //struct rte_lpm6_list *lpm_list; + //struct rte_hash *rules_tbl = NULL; + uint32_t *tbl8_pool = NULL; + struct rte_lpm_tbl8_hdr *tbl8_hdrs = NULL; + + //lpm_list = RTE_TAILQ_CAST(rte_lpm6_tailq.head, rte_lpm6_list); + + RTE_BUILD_BUG_ON(sizeof(struct rte_lpm6_tbl_entry) != sizeof(uint32_t)); + + /* Check user arguments. 
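+ * socket_id may be -1 (any socket) and number_tbl8s must not exceed + * RTE_LPM6_TBL8_MAX_NUM_GROUPS (1 << 21).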
*/ + if ((name == NULL) || (socket_id < -1) || (config == NULL) || + config->number_tbl8s > RTE_LPM6_TBL8_MAX_NUM_GROUPS) { + rte_errno = EINVAL; + return NULL; + } + +#if 0 + /* create rules hash table */ + snprintf(mem_name, sizeof(mem_name), "LRH_%s", name); + struct rte_hash_parameters rule_hash_tbl_params = { + .entries = config->max_rules * 1.2 + + RULE_HASH_TABLE_EXTRA_SPACE, + .key_len = sizeof(struct rte_lpm6_rule_key), + .hash_func = rule_hash, + .hash_func_init_val = 0, + .name = mem_name, + .reserved = 0, + .socket_id = socket_id, + .extra_flag = 0 + }; + + rules_tbl = rte_hash_create(&rule_hash_tbl_params); + if (rules_tbl == NULL) { + RTE_LOG(ERR, LPM, "LPM rules hash table allocation failed: %s (%d)", + rte_strerror(rte_errno), rte_errno); + goto fail_wo_unlock; + } +#endif + + /* allocate tbl8 indexes pool */ + tbl8_pool = rte_malloc(NULL, + sizeof(uint32_t) * config->number_tbl8s, + RTE_CACHE_LINE_SIZE); + if (tbl8_pool == NULL) { + RTE_LOG(ERR, LPM, "LPM tbl8 pool allocation failed: %s (%d)", + rte_strerror(rte_errno), rte_errno); + rte_errno = ENOMEM; + goto fail_wo_unlock; + } + + /* allocate tbl8 headers */ + tbl8_hdrs = rte_malloc(NULL, + sizeof(struct rte_lpm_tbl8_hdr) * config->number_tbl8s, + RTE_CACHE_LINE_SIZE); + if (tbl8_hdrs == NULL) { + RTE_LOG(ERR, LPM, "LPM tbl8 headers allocation failed: %s (%d)", + rte_strerror(rte_errno), rte_errno); + rte_errno = ENOMEM; + goto fail_wo_unlock; + } + + snprintf(mem_name, sizeof(mem_name), "LPM_%s", name); + + /* Determine the amount of memory to allocate. */ + mem_size = sizeof(*lpm) + (sizeof(lpm->tbl8[0]) * + RTE_LPM6_TBL8_GROUP_NUM_ENTRIES * config->number_tbl8s); + +#if 0 + rte_mcfg_tailq_write_lock(); + + /* Guarantee there's no existing */ + TAILQ_FOREACH(te, lpm_list, next) { + lpm = (struct rte_lpm6 *) te->data; + if (strncmp(name, lpm->name, RTE_LPM6_NAMESIZE) == 0) + break; + } + lpm = NULL; + if (te != NULL) { + rte_errno = EEXIST; + goto fail; + } + + /* allocate tailq entry */ + te = rte_zmalloc("LPM6_TAILQ_ENTRY", sizeof(*te), 0); + if (te == NULL) { + RTE_LOG(ERR, LPM, "Failed to allocate tailq entry!\n"); + rte_errno = ENOMEM; + goto fail; + } +#endif + + /* Allocate memory to store the LPM data structures. */ + lpm = rte_zmalloc_socket(mem_name, (size_t)mem_size, + RTE_CACHE_LINE_SIZE, socket_id); + + if (lpm == NULL) { + RTE_LOG(ERR, LPM, "LPM memory allocation failed\n"); + //rte_free(te); + rte_errno = ENOMEM; + goto fail; + } + + /* Save user arguments. */ + //lpm->max_rules = config->max_rules; + lpm->number_tbl8s = config->number_tbl8s; + strlcpy(lpm->name, name, sizeof(lpm->name)); + //lpm->rules_tbl = rules_tbl; + lpm->tbl8_pool = tbl8_pool; + lpm->tbl8_hdrs = tbl8_hdrs; + + /* init the stack */ + tbl8_pool_init(lpm); + + //te->data = (void *) lpm; + + //TAILQ_INSERT_TAIL(lpm_list, te, next); + rte_mcfg_tailq_write_unlock(); + return lpm; + +fail: + rte_mcfg_tailq_write_unlock(); + +fail_wo_unlock: + rte_free(tbl8_hdrs); + rte_free(tbl8_pool); + //rte_hash_free(rules_tbl); + + return NULL; +} + +#if 0 +/* + * Find an existing lpm table and return a pointer to it. 
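+ * (Compiled out in this port: the kernel build keeps no global tailq of + * LPM objects, so lookup by name is not supported.)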
+ */ +struct rte_lpm6 * +rte_lpm6_find_existing(const char *name) +{ + struct rte_lpm6 *l = NULL; + struct rte_tailq_entry *te; + struct rte_lpm6_list *lpm_list; + + lpm_list = RTE_TAILQ_CAST(rte_lpm6_tailq.head, rte_lpm6_list); + + rte_mcfg_tailq_read_lock(); + TAILQ_FOREACH(te, lpm_list, next) { + l = (struct rte_lpm6 *) te->data; + if (strncmp(name, l->name, RTE_LPM6_NAMESIZE) == 0) + break; + } + rte_mcfg_tailq_read_unlock(); + + if (te == NULL) { + rte_errno = ENOENT; + return NULL; + } + + return l; +} +#endif + +/* + * Deallocates memory for given LPM table. + */ +void +rte_lpm6_free(struct rte_lpm6 *lpm) +{ +#if 0 + struct rte_lpm6_list *lpm_list; + struct rte_tailq_entry *te; + + /* Check user arguments. */ + if (lpm == NULL) + return; + + lpm_list = RTE_TAILQ_CAST(rte_lpm6_tailq.head, rte_lpm6_list); + + rte_mcfg_tailq_write_lock(); + + /* find our tailq entry */ + TAILQ_FOREACH(te, lpm_list, next) { + if (te->data == (void *) lpm) + break; + } + + if (te != NULL) + TAILQ_REMOVE(lpm_list, te, next); + + rte_mcfg_tailq_write_unlock(); +#endif + + rte_free(lpm->tbl8_hdrs); + rte_free(lpm->tbl8_pool); + //rte_hash_free(lpm->rules_tbl); + rte_free(lpm); + //rte_free(te); +} + +#if 0 +/* Find a rule */ +static inline int +rule_find_with_key(struct rte_lpm6 *lpm, + const struct rte_lpm6_rule_key *rule_key, + uint32_t *next_hop) +{ + uint64_t hash_val; + int ret; + + /* lookup for a rule */ + ret = rte_hash_lookup_data(lpm->rules_tbl, (const void *) rule_key, + (void **) &hash_val); + if (ret >= 0) { + *next_hop = (uint32_t) hash_val; + return 1; + } + + return 0; +} + +/* Find a rule */ +static int +rule_find(struct rte_lpm6 *lpm, uint8_t *ip, uint8_t depth, + uint32_t *next_hop) +{ + struct rte_lpm6_rule_key rule_key; + + /* init a rule key */ + rule_key_init(&rule_key, ip, depth); + + return rule_find_with_key(lpm, &rule_key, next_hop); +} + +/* + * Checks if a rule already exists in the rules table and updates + * the nexthop if so. Otherwise it adds a new rule if enough space is available. + * + * Returns: + * 0 - next hop of existed rule is updated + * 1 - new rule successfully added + * <0 - error + */ +static inline int +rule_add(struct rte_lpm6 *lpm, uint8_t *ip, uint8_t depth, uint32_t next_hop) +{ + int ret, rule_exist; + struct rte_lpm6_rule_key rule_key; + uint32_t unused; + + /* init a rule key */ + rule_key_init(&rule_key, ip, depth); + + /* Scan through rule list to see if rule already exists. */ + rule_exist = rule_find_with_key(lpm, &rule_key, &unused); + + /* + * If rule does not exist check if there is space to add a new rule to + * this rule group. If there is no space return error. + */ + if (!rule_exist && lpm->used_rules == lpm->max_rules) + return -ENOSPC; + + /* add the rule or update rules next hop */ + ret = rte_hash_add_key_data(lpm->rules_tbl, &rule_key, + (void *)(uintptr_t) next_hop); + if (ret < 0) + return ret; + + /* Increment the used rules counter for this rule group. */ + if (!rule_exist) { + lpm->used_rules++; + return 1; + } + + return 0; +} +#endif + +/* + * Function that expands a rule across the data structure when a less-generic + * one has been added before. It assures that every possible combination of bits + * in the IP address returns a match. 
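+ * Entries that are invalid, or that hold a rule no more specific than + * old_depth, are overwritten with the new next hop; entries that point to + * a deeper tbl8 group are followed recursively.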
+ */ +static void +expand_rule(struct rte_lpm6 *lpm, uint32_t tbl8_gindex, uint8_t old_depth, + uint8_t new_depth, uint32_t next_hop, uint8_t valid) +{ + uint32_t tbl8_group_end, tbl8_gindex_next, j; + + tbl8_group_end = tbl8_gindex + RTE_LPM6_TBL8_GROUP_NUM_ENTRIES; + + struct rte_lpm6_tbl_entry new_tbl8_entry = { + .valid = valid, + .valid_group = valid, + .depth = new_depth, + .next_hop = next_hop, + .ext_entry = 0, + }; + + for (j = tbl8_gindex; j < tbl8_group_end; j++) { + if (!lpm->tbl8[j].valid || (lpm->tbl8[j].ext_entry == 0 + && lpm->tbl8[j].depth <= old_depth)) { + + lpm->tbl8[j] = new_tbl8_entry; + + } else if (lpm->tbl8[j].ext_entry == 1) { + + tbl8_gindex_next = lpm->tbl8[j].lpm6_tbl8_gindex + * RTE_LPM6_TBL8_GROUP_NUM_ENTRIES; + expand_rule(lpm, tbl8_gindex_next, old_depth, new_depth, + next_hop, valid); + } + } +} + +/* + * Init a tbl8 header + */ +static inline void +init_tbl8_header(struct rte_lpm6 *lpm, uint32_t tbl_ind, + uint32_t owner_tbl_ind, uint32_t owner_entry_ind) +{ + struct rte_lpm_tbl8_hdr *tbl_hdr = &lpm->tbl8_hdrs[tbl_ind]; + tbl_hdr->owner_tbl_ind = owner_tbl_ind; + tbl_hdr->owner_entry_ind = owner_entry_ind; + tbl_hdr->ref_cnt = 0; +} + +/* + * Calculate index to the table based on the number and position + * of the bytes being inspected in this step. + */ +static uint32_t +get_bitshift(const uint8_t *ip, uint8_t first_byte, uint8_t bytes) +{ + uint32_t entry_ind, i; + int8_t bitshift; + + entry_ind = 0; + for (i = first_byte; i < (uint32_t)(first_byte + bytes); i++) { + bitshift = (int8_t)((bytes - i)*BYTE_SIZE); + + if (bitshift < 0) + bitshift = 0; + entry_ind = entry_ind | ip[i-1] << bitshift; + } + + return entry_ind; +} + +/* + * Simulate adding a new route to the LPM counting number + * of new tables that will be needed + * + * It returns 0 on success, or 1 if + * the process needs to be continued by calling the function again. + */ +static inline int +simulate_add_step(struct rte_lpm6 *lpm, struct rte_lpm6_tbl_entry *tbl, + struct rte_lpm6_tbl_entry **next_tbl, const uint8_t *ip, + uint8_t bytes, uint8_t first_byte, uint8_t depth, + uint32_t *need_tbl_nb) +{ + uint32_t entry_ind; + uint8_t bits_covered; + uint32_t next_tbl_ind; + + /* + * Calculate index to the table based on the number and position + * of the bytes being inspected in this step. + */ + entry_ind = get_bitshift(ip, first_byte, bytes); + + /* Number of bits covered in this step */ + bits_covered = (uint8_t)((bytes+first_byte-1)*BYTE_SIZE); + + if (depth <= bits_covered) { + *need_tbl_nb = 0; + return 0; + } + + if (tbl[entry_ind].valid == 0 || tbl[entry_ind].ext_entry == 0) { + /* from this point on a new table is needed on each level + * that is not covered yet + */ + depth -= bits_covered; + uint32_t cnt = depth >> 3; /* depth / BYTE_SIZE */ + if (depth & 7) /* 0b00000111 */ + /* if depth % 8 > 0 then one more table is needed + * for those last bits + */ + cnt++; + + *need_tbl_nb = cnt; + return 0; + } + + next_tbl_ind = tbl[entry_ind].lpm6_tbl8_gindex; + *next_tbl = &(lpm->tbl8[next_tbl_ind * + RTE_LPM6_TBL8_GROUP_NUM_ENTRIES]); + *need_tbl_nb = 0; + return 1; +} + +/* + * Partially adds a new route to the data structure (tbl24+tbl8s). + * It returns 0 on success, a negative number on failure, or 1 if + * the process needs to be continued by calling the function again. 
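+ * The first step resolves the top three address bytes through tbl24; each + * later step resolves one more byte through a 256-entry tbl8 group, so a + * /128 route traverses at most 13 tbl8 levels.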
+ */ +static inline int +add_step(struct rte_lpm6 *lpm, struct rte_lpm6_tbl_entry *tbl, + uint32_t tbl_ind, struct rte_lpm6_tbl_entry **next_tbl, + uint32_t *next_tbl_ind, uint8_t *ip, uint8_t bytes, + uint8_t first_byte, uint8_t depth, uint32_t next_hop, + uint8_t is_new_rule) +{ + uint32_t entry_ind, tbl_range, tbl8_group_start, tbl8_group_end, i; + uint32_t tbl8_gindex; + uint8_t bits_covered; + int ret; + + /* + * Calculate index to the table based on the number and position + * of the bytes being inspected in this step. + */ + entry_ind = get_bitshift(ip, first_byte, bytes); + + /* Number of bits covered in this step */ + bits_covered = (uint8_t)((bytes+first_byte-1)*BYTE_SIZE); + + /* + * If depth if smaller than this number (ie this is the last step) + * expand the rule across the relevant positions in the table. + */ + if (depth <= bits_covered) { + tbl_range = 1 << (bits_covered - depth); + + for (i = entry_ind; i < (entry_ind + tbl_range); i++) { + if (!tbl[i].valid || (tbl[i].ext_entry == 0 && + tbl[i].depth <= depth)) { + + struct rte_lpm6_tbl_entry new_tbl_entry = { + .next_hop = next_hop, + .depth = depth, + .valid = VALID, + .valid_group = VALID, + .ext_entry = 0, + }; + + tbl[i] = new_tbl_entry; + + } else if (tbl[i].ext_entry == 1) { + + /* + * If tbl entry is valid and extended calculate the index + * into next tbl8 and expand the rule across the data structure. + */ + tbl8_gindex = tbl[i].lpm6_tbl8_gindex * + RTE_LPM6_TBL8_GROUP_NUM_ENTRIES; + expand_rule(lpm, tbl8_gindex, depth, depth, + next_hop, VALID); + } + } + + /* update tbl8 rule reference counter */ + if (tbl_ind != TBL24_IND && is_new_rule) + lpm->tbl8_hdrs[tbl_ind].ref_cnt++; + + return 0; + } + /* + * If this is not the last step just fill one position + * and calculate the index to the next table. + */ + else { + /* If it's invalid a new tbl8 is needed */ + if (!tbl[entry_ind].valid) { + /* get a new table */ + ret = tbl8_get(lpm, &tbl8_gindex); + if (ret != 0) + return -ENOSPC; + + /* invalidate all new tbl8 entries */ + tbl8_group_start = tbl8_gindex * + RTE_LPM6_TBL8_GROUP_NUM_ENTRIES; + memset(&lpm->tbl8[tbl8_group_start], 0, + RTE_LPM6_TBL8_GROUP_NUM_ENTRIES * + sizeof(struct rte_lpm6_tbl_entry)); + + /* init the new table's header: + * save the reference to the owner table + */ + init_tbl8_header(lpm, tbl8_gindex, tbl_ind, entry_ind); + + /* reference to a new tbl8 */ + struct rte_lpm6_tbl_entry new_tbl_entry = { + .lpm6_tbl8_gindex = tbl8_gindex, + .depth = 0, + .valid = VALID, + .valid_group = VALID, + .ext_entry = 1, + }; + + tbl[entry_ind] = new_tbl_entry; + + /* update the current table's reference counter */ + if (tbl_ind != TBL24_IND) + lpm->tbl8_hdrs[tbl_ind].ref_cnt++; + } + /* + * If it's valid but not extended the rule that was stored + * here needs to be moved to the next table. + */ + else if (tbl[entry_ind].ext_entry == 0) { + /* get a new tbl8 index */ + ret = tbl8_get(lpm, &tbl8_gindex); + if (ret != 0) + return -ENOSPC; + + tbl8_group_start = tbl8_gindex * + RTE_LPM6_TBL8_GROUP_NUM_ENTRIES; + tbl8_group_end = tbl8_group_start + + RTE_LPM6_TBL8_GROUP_NUM_ENTRIES; + + struct rte_lpm6_tbl_entry tbl_entry = { + .next_hop = tbl[entry_ind].next_hop, + .depth = tbl[entry_ind].depth, + .valid = VALID, + .valid_group = VALID, + .ext_entry = 0 + }; + + /* Populate new tbl8 with tbl value. 
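+ * The displaced entry's next hop and depth are replicated into all 256 + * slots of the new group; the more specific rule then overwrites the + * relevant subset in a later step.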
*/ + for (i = tbl8_group_start; i < tbl8_group_end; i++) + lpm->tbl8[i] = tbl_entry; + + /* init the new table's header: + * save the reference to the owner table + */ + init_tbl8_header(lpm, tbl8_gindex, tbl_ind, entry_ind); + + /* + * Update tbl entry to point to new tbl8 entry. Note: The + * ext_flag and tbl8_index need to be updated simultaneously, + * so assign whole structure in one go. + */ + struct rte_lpm6_tbl_entry new_tbl_entry = { + .lpm6_tbl8_gindex = tbl8_gindex, + .depth = 0, + .valid = VALID, + .valid_group = VALID, + .ext_entry = 1, + }; + + tbl[entry_ind] = new_tbl_entry; + + /* update the current table's reference counter */ + if (tbl_ind != TBL24_IND) + lpm->tbl8_hdrs[tbl_ind].ref_cnt++; + } + + *next_tbl_ind = tbl[entry_ind].lpm6_tbl8_gindex; + *next_tbl = &(lpm->tbl8[*next_tbl_ind * + RTE_LPM6_TBL8_GROUP_NUM_ENTRIES]); + } + + return 1; +} + +/* + * Simulate adding a route to LPM + * + * Returns: + * 0 on success + * -ENOSPC not enough tbl8 left + */ +static int +simulate_add(struct rte_lpm6 *lpm, const uint8_t *masked_ip, uint8_t depth) +{ + struct rte_lpm6_tbl_entry *tbl; + struct rte_lpm6_tbl_entry *tbl_next = NULL; + int ret, i; + + /* number of new tables needed for a step */ + uint32_t need_tbl_nb; + /* total number of new tables needed */ + uint32_t total_need_tbl_nb; + + /* Inspect the first three bytes through tbl24 on the first step. */ + ret = simulate_add_step(lpm, lpm->tbl24, &tbl_next, masked_ip, + ADD_FIRST_BYTE, 1, depth, &need_tbl_nb); + total_need_tbl_nb = need_tbl_nb; + /* + * Inspect one by one the rest of the bytes until + * the process is completed. + */ + for (i = ADD_FIRST_BYTE; i < RTE_LPM6_IPV6_ADDR_SIZE && ret == 1; i++) { + tbl = tbl_next; + ret = simulate_add_step(lpm, tbl, &tbl_next, masked_ip, 1, + (uint8_t)(i + 1), depth, &need_tbl_nb); + total_need_tbl_nb += need_tbl_nb; + } + + if (tbl8_available(lpm) < total_need_tbl_nb) + /* not enough tbl8 to add a rule */ + return -ENOSPC; + + return 0; +} + +/* + * Add a route + */ +int +rte_lpm6_add(struct rte_lpm6 *lpm, const uint8_t *ip, uint8_t depth, + uint32_t next_hop, int is_new_rule) +{ + struct rte_lpm6_tbl_entry *tbl; + struct rte_lpm6_tbl_entry *tbl_next = NULL; + /* init to avoid compiler warning */ + uint32_t tbl_next_num = 123456; + int status; + uint8_t masked_ip[RTE_LPM6_IPV6_ADDR_SIZE]; + int i; + + /* Check user arguments. */ + if ((lpm == NULL) || (depth < 1) || (depth > RTE_LPM6_MAX_DEPTH)) + return -EINVAL; + + /* Copy the IP and mask it to avoid modifying user's input data. */ + ip6_copy_addr(masked_ip, ip); + ip6_mask_addr(masked_ip, depth); + + /* Simulate adding a new route */ + int ret = simulate_add(lpm, masked_ip, depth); + if (ret < 0) + return ret; + +#if 0 + /* Add the rule to the rule table. */ + int is_new_rule = rule_add(lpm, masked_ip, depth, next_hop); + /* If there is no space available for new rule return error. */ + if (is_new_rule < 0) + return is_new_rule; +#endif + + /* Inspect the first three bytes through tbl24 on the first step. */ + tbl = lpm->tbl24; + status = add_step(lpm, tbl, TBL24_IND, &tbl_next, &tbl_next_num, + masked_ip, ADD_FIRST_BYTE, 1, depth, next_hop, + is_new_rule); + assert(status >= 0); + + /* + * Inspect one by one the rest of the bytes until + * the process is completed. 
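+ * add_step() keeps returning 1 while deeper tbl8 levels are required; the + * simulate_add() pass above already verified that enough free tbl8 groups + * are available, so allocation is not expected to fail here (hence the + * asserts).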
+ */ + for (i = ADD_FIRST_BYTE; i < RTE_LPM6_IPV6_ADDR_SIZE && status == 1; i++) { + tbl = tbl_next; + status = add_step(lpm, tbl, tbl_next_num, &tbl_next, + &tbl_next_num, masked_ip, 1, (uint8_t)(i + 1), + depth, next_hop, is_new_rule); + assert(status >= 0); + } + + return status; +} + +/* + * Takes a pointer to a table entry and inspect one level. + * The function returns 0 on lookup success, ENOENT if no match was found + * or 1 if the process needs to be continued by calling the function again. + */ +static inline int +lookup_step(const struct rte_lpm6 *lpm, const struct rte_lpm6_tbl_entry *tbl, + const struct rte_lpm6_tbl_entry **tbl_next, const uint8_t *ip, + uint8_t first_byte, uint32_t *next_hop) +{ + uint32_t tbl8_index, tbl_entry; + + /* Take the integer value from the pointer. */ + tbl_entry = *(const uint32_t *)tbl; + + /* If it is valid and extended we calculate the new pointer to return. */ + if ((tbl_entry & RTE_LPM6_VALID_EXT_ENTRY_BITMASK) == + RTE_LPM6_VALID_EXT_ENTRY_BITMASK) { + + tbl8_index = ip[first_byte-1] + + ((tbl_entry & RTE_LPM6_TBL8_BITMASK) * + RTE_LPM6_TBL8_GROUP_NUM_ENTRIES); + + *tbl_next = &lpm->tbl8[tbl8_index]; + + return 1; + } else { + /* If not extended then we can have a match. */ + *next_hop = ((uint32_t)tbl_entry & RTE_LPM6_TBL8_BITMASK); + return (tbl_entry & RTE_LPM6_LOOKUP_SUCCESS) ? 0 : -ENOENT; + } +} + +/* + * Looks up an IP + */ +int +rte_lpm6_lookup(const struct rte_lpm6 *lpm, const uint8_t *ip, + uint32_t *next_hop) +{ + const struct rte_lpm6_tbl_entry *tbl; + const struct rte_lpm6_tbl_entry *tbl_next = NULL; + int status; + uint8_t first_byte; + uint32_t tbl24_index; + + /* DEBUG: Check user input arguments. */ + if ((lpm == NULL) || (ip == NULL) || (next_hop == NULL)) + return -EINVAL; + + first_byte = LOOKUP_FIRST_BYTE; + tbl24_index = (ip[0] << BYTES2_SIZE) | (ip[1] << BYTE_SIZE) | ip[2]; + + /* Calculate pointer to the first entry to be inspected */ + tbl = &lpm->tbl24[tbl24_index]; + + do { + /* Continue inspecting following levels until success or failure */ + status = lookup_step(lpm, tbl, &tbl_next, ip, first_byte++, next_hop); + tbl = tbl_next; + } while (status == 1); + + return status; +} + +/* + * Looks up a group of IP addresses + */ +int +rte_lpm6_lookup_bulk_func(const struct rte_lpm6 *lpm, + uint8_t ips[][RTE_LPM6_IPV6_ADDR_SIZE], + int32_t *next_hops, unsigned int n) +{ + unsigned int i; + const struct rte_lpm6_tbl_entry *tbl; + const struct rte_lpm6_tbl_entry *tbl_next = NULL; + uint32_t tbl24_index, next_hop; + uint8_t first_byte; + int status; + + /* DEBUG: Check user input arguments. 
*/ + if ((lpm == NULL) || (ips == NULL) || (next_hops == NULL)) + return -EINVAL; + + for (i = 0; i < n; i++) { + first_byte = LOOKUP_FIRST_BYTE; + tbl24_index = (ips[i][0] << BYTES2_SIZE) | + (ips[i][1] << BYTE_SIZE) | ips[i][2]; + + /* Calculate pointer to the first entry to be inspected */ + tbl = &lpm->tbl24[tbl24_index]; + + do { + /* Continue inspecting following levels + * until success or failure + */ + status = lookup_step(lpm, tbl, &tbl_next, ips[i], + first_byte++, &next_hop); + tbl = tbl_next; + } while (status == 1); + + if (status < 0) + next_hops[i] = -1; + else + next_hops[i] = (int32_t)next_hop; + } + + return 0; +} + +struct rte_lpm6_rule * +fill_rule6(char *buffer, const uint8_t *ip, uint8_t depth, uint32_t next_hop) +{ + struct rte_lpm6_rule *rule = (struct rte_lpm6_rule *)buffer; + + ip6_copy_addr((uint8_t *)&rule->ip, ip); + rule->depth = depth; + rule->next_hop = next_hop; + + return (rule); +} + +#if 0 +/* + * Look for a rule in the high-level rules table + */ +int +rte_lpm6_is_rule_present(struct rte_lpm6 *lpm, const uint8_t *ip, uint8_t depth, + uint32_t *next_hop) +{ + uint8_t masked_ip[RTE_LPM6_IPV6_ADDR_SIZE]; + + /* Check user arguments. */ + if ((lpm == NULL) || next_hop == NULL || ip == NULL || + (depth < 1) || (depth > RTE_LPM6_MAX_DEPTH)) + return -EINVAL; + + /* Copy the IP and mask it to avoid modifying user's input data. */ + ip6_copy_addr(masked_ip, ip); + ip6_mask_addr(masked_ip, depth); + + return rule_find(lpm, masked_ip, depth, next_hop); +} + +/* + * Delete a rule from the rule table. + * NOTE: Valid range for depth parameter is 1 .. 128 inclusive. + * return + * 0 on success + * <0 on failure + */ +static inline int +rule_delete(struct rte_lpm6 *lpm, uint8_t *ip, uint8_t depth) +{ + int ret; + struct rte_lpm6_rule_key rule_key; + + /* init rule key */ + rule_key_init(&rule_key, ip, depth); + + /* delete the rule */ + ret = rte_hash_del_key(lpm->rules_tbl, (void *) &rule_key); + if (ret >= 0) + lpm->used_rules--; + + return ret; +} + +/* + * Deletes a group of rules + * + * Note that the function rebuilds the lpm table, + * rather than doing incremental updates like + * the regular delete function + */ +int +rte_lpm6_delete_bulk_func(struct rte_lpm6 *lpm, + uint8_t ips[][RTE_LPM6_IPV6_ADDR_SIZE], uint8_t *depths, + unsigned n) +{ + uint8_t masked_ip[RTE_LPM6_IPV6_ADDR_SIZE]; + unsigned i; + + /* Check input arguments. */ + if ((lpm == NULL) || (ips == NULL) || (depths == NULL)) + return -EINVAL; + + for (i = 0; i < n; i++) { + ip6_copy_addr(masked_ip, ips[i]); + ip6_mask_addr(masked_ip, depths[i]); + rule_delete(lpm, masked_ip, depths[i]); + } + + /* + * Set all the table entries to 0 (ie delete every rule + * from the data structure. + */ + memset(lpm->tbl24, 0, sizeof(lpm->tbl24)); + memset(lpm->tbl8, 0, sizeof(lpm->tbl8[0]) + * RTE_LPM6_TBL8_GROUP_NUM_ENTRIES * lpm->number_tbl8s); + tbl8_pool_init(lpm); + + /* + * Add every rule again (except for the ones that were removed from + * the rules table). + */ + rebuild_lpm(lpm); + + return 0; +} + +/* + * Delete all rules from the LPM table. + */ +void +rte_lpm6_delete_all(struct rte_lpm6 *lpm) +{ + /* Zero used rules counter. */ + lpm->used_rules = 0; + + /* Zero tbl24. */ + memset(lpm->tbl24, 0, sizeof(lpm->tbl24)); + + /* Zero tbl8. */ + memset(lpm->tbl8, 0, sizeof(lpm->tbl8[0]) * + RTE_LPM6_TBL8_GROUP_NUM_ENTRIES * lpm->number_tbl8s); + + /* init pool of free tbl8 indexes */ + tbl8_pool_init(lpm); + + /* Delete all rules form the rules table. 
*/ + rte_hash_reset(lpm->rules_tbl); +} +#endif + +/* + * Convert a depth to a one byte long mask + * Example: 4 will be converted to 0xF0 + */ +static uint8_t __attribute__((pure)) +depth_to_mask_1b(uint8_t depth) +{ + /* To calculate a mask start with a 1 on the left hand side and right + * shift while populating the left hand side with 1's + */ + return (signed char)0x80 >> (depth - 1); +} + +#if 0 +/* + * Find a less specific rule + */ +static int +rule_find_less_specific(struct rte_lpm6 *lpm, uint8_t *ip, uint8_t depth, + struct rte_lpm6_rule *rule) +{ + int ret; + uint32_t next_hop; + uint8_t mask; + struct rte_lpm6_rule_key rule_key; + + if (depth == 1) + return 0; + + rule_key_init(&rule_key, ip, depth); + + while (depth > 1) { + depth--; + + /* each iteration zero one more bit of the key */ + mask = depth & 7; /* depth % BYTE_SIZE */ + if (mask > 0) + mask = depth_to_mask_1b(mask); + + rule_key.depth = depth; + rule_key.ip[depth >> 3] &= mask; + + ret = rule_find_with_key(lpm, &rule_key, &next_hop); + if (ret) { + rule->depth = depth; + ip6_copy_addr(rule->ip, rule_key.ip); + rule->next_hop = next_hop; + return 1; + } + } + + return 0; +} +#endif + +/* + * Find range of tbl8 cells occupied by a rule + */ +static void +rule_find_range(struct rte_lpm6 *lpm, const uint8_t *ip, uint8_t depth, + struct rte_lpm6_tbl_entry **from, + struct rte_lpm6_tbl_entry **to, + uint32_t *out_tbl_ind) +{ + uint32_t ind; + uint32_t first_3bytes = (uint32_t)ip[0] << 16 | ip[1] << 8 | ip[2]; + + if (depth <= 24) { + /* rule is within the top level */ + ind = first_3bytes; + *from = &lpm->tbl24[ind]; + ind += (1 << (24 - depth)) - 1; + *to = &lpm->tbl24[ind]; + *out_tbl_ind = TBL24_IND; + } else { + /* top level entry */ + struct rte_lpm6_tbl_entry *tbl = &lpm->tbl24[first_3bytes]; + assert(tbl->ext_entry == 1); + /* first tbl8 */ + uint32_t tbl_ind = tbl->lpm6_tbl8_gindex; + tbl = &lpm->tbl8[tbl_ind * + RTE_LPM6_TBL8_GROUP_NUM_ENTRIES]; + /* current ip byte, the top level is already behind */ + uint8_t byte = 3; + /* minus top level */ + depth -= 24; + + /* iterate through levels (tbl8s) + * until we reach the last one + */ + while (depth > 8) { + tbl += ip[byte]; + assert(tbl->ext_entry == 1); + /* go to the next level/tbl8 */ + tbl_ind = tbl->lpm6_tbl8_gindex; + tbl = &lpm->tbl8[tbl_ind * + RTE_LPM6_TBL8_GROUP_NUM_ENTRIES]; + byte += 1; + depth -= 8; + } + + /* last level/tbl8 */ + ind = ip[byte] & depth_to_mask_1b(depth); + *from = &tbl[ind]; + ind += (1 << (8 - depth)) - 1; + *to = &tbl[ind]; + *out_tbl_ind = tbl_ind; + } +} + +/* + * Remove a table from the LPM tree + */ +static void +remove_tbl(struct rte_lpm6 *lpm, struct rte_lpm_tbl8_hdr *tbl_hdr, + uint32_t tbl_ind, struct rte_lpm6_rule *lsp_rule) +{ + struct rte_lpm6_tbl_entry *owner_entry; + + if (tbl_hdr->owner_tbl_ind == TBL24_IND) + owner_entry = &lpm->tbl24[tbl_hdr->owner_entry_ind]; + else { + uint32_t owner_tbl_ind = tbl_hdr->owner_tbl_ind; + owner_entry = &lpm->tbl8[ + owner_tbl_ind * RTE_LPM6_TBL8_GROUP_NUM_ENTRIES + + tbl_hdr->owner_entry_ind]; + + struct rte_lpm_tbl8_hdr *owner_tbl_hdr = + &lpm->tbl8_hdrs[owner_tbl_ind]; + if (--owner_tbl_hdr->ref_cnt == 0) + remove_tbl(lpm, owner_tbl_hdr, owner_tbl_ind, lsp_rule); + } + + assert(owner_entry->ext_entry == 1); + + /* unlink the table */ + if (lsp_rule != NULL) { + struct rte_lpm6_tbl_entry new_tbl_entry = { + .next_hop = lsp_rule->next_hop, + .depth = lsp_rule->depth, + .valid = VALID, + .valid_group = VALID, + .ext_entry = 0 + }; + + *owner_entry = new_tbl_entry; + } else { + struct 
rte_lpm6_tbl_entry new_tbl_entry = { + .next_hop = 0, + .depth = 0, + .valid = INVALID, + .valid_group = INVALID, + .ext_entry = 0 + }; + + *owner_entry = new_tbl_entry; + } + + /* return the table to the pool */ + tbl8_put(lpm, tbl_ind); +} + +/* + * Deletes a rule + */ +int +rte_lpm6_delete(struct rte_lpm6 *lpm, const uint8_t *ip, uint8_t depth, + struct rte_lpm6_rule *lsp_rule) +{ + uint8_t masked_ip[RTE_LPM6_IPV6_ADDR_SIZE]; + //struct rte_lpm6_rule lsp_rule_obj; + //struct rte_lpm6_rule *lsp_rule; + //int ret; + uint32_t tbl_ind; + struct rte_lpm6_tbl_entry *from, *to; + + /* Check input arguments. */ + if ((lpm == NULL) || (depth < 1) || (depth > RTE_LPM6_MAX_DEPTH)) + return -EINVAL; + + /* Copy the IP and mask it to avoid modifying user's input data. */ + ip6_copy_addr(masked_ip, ip); + ip6_mask_addr(masked_ip, depth); + +#if 0 + /* Delete the rule from the rule table. */ + ret = rule_delete(lpm, masked_ip, depth); + if (ret < 0) + return -ENOENT; +#endif + + /* find rule cells */ + rule_find_range(lpm, masked_ip, depth, &from, &to, &tbl_ind); + +#if 0 + /* find a less specific rule (a rule with smaller depth) + * note: masked_ip will be modified, don't use it anymore + */ + ret = rule_find_less_specific(lpm, masked_ip, depth, + &lsp_rule_obj); + lsp_rule = ret ? &lsp_rule_obj : NULL; +#endif + /* decrement the table rule counter, + * note that tbl24 doesn't have a header + */ + if (tbl_ind != TBL24_IND) { + struct rte_lpm_tbl8_hdr *tbl_hdr = &lpm->tbl8_hdrs[tbl_ind]; + if (--tbl_hdr->ref_cnt == 0) { + /* remove the table */ + remove_tbl(lpm, tbl_hdr, tbl_ind, lsp_rule); + return 0; + } + } + + /* iterate rule cells */ + for (; from <= to; from++) + if (from->ext_entry == 1) { + /* reference to a more specific space + * of the prefix/rule. Entries in a more + * specific space that are not used by + * a more specific prefix must be occupied + * by the prefix + */ + if (lsp_rule != NULL) + expand_rule(lpm, + from->lpm6_tbl8_gindex * + RTE_LPM6_TBL8_GROUP_NUM_ENTRIES, + depth, lsp_rule->depth, + lsp_rule->next_hop, VALID); + else + /* since the prefix has no less specific prefix, + * its more specific space must be invalidated + */ + expand_rule(lpm, + from->lpm6_tbl8_gindex * + RTE_LPM6_TBL8_GROUP_NUM_ENTRIES, + depth, 0, 0, INVALID); + } else if (from->depth == depth) { + /* entry is not a reference and belongs to the prefix */ + if (lsp_rule != NULL) { + struct rte_lpm6_tbl_entry new_tbl_entry = { + .next_hop = lsp_rule->next_hop, + .depth = lsp_rule->depth, + .valid = VALID, + .valid_group = VALID, + .ext_entry = 0 + }; + + *from = new_tbl_entry; + } else { + struct rte_lpm6_tbl_entry new_tbl_entry = { + .next_hop = 0, + .depth = 0, + .valid = INVALID, + .valid_group = INVALID, + .ext_entry = 0 + }; + + *from = new_tbl_entry; + } + } + + return 0; +} Index: sys/contrib/dpdk_rte_lpm/rte_shim.h =================================================================== --- /dev/null +++ sys/contrib/dpdk_rte_lpm/rte_shim.h @@ -0,0 +1,31 @@ +#ifndef _RTE_SHIM_H_ +#define _RTE_SHIM_H_ + +#define rte_malloc(_type, _size, _align) malloc(_size, M_TEMP, M_NOWAIT) +#define rte_free(_ptr) free(_ptr, M_TEMP) +#define rte_zmalloc(_type, _size, _align) malloc(_size, M_TEMP, M_NOWAIT | M_ZERO) +#define rte_zmalloc_socket(_type, _size, _align, _s) malloc(_size, M_TEMP, M_NOWAIT | M_ZERO) + +#define rte_mcfg_tailq_write_unlock() +#define rte_mcfg_tailq_write_lock() + +#define RTE_CACHE_LINE_SIZE CACHE_LINE_SIZE +#define strtoull strtoul +#define assert(_s) KASSERT((_s), ("DPDK: assert failed")) +#define 
rte_memcpy memcpy +#define rte_strerror(_err) "strerror_not_implemented" +#define RTE_LOG(_sev, _sub, _fmt, ...) printf("DPDK::" #_sev "::" #_sub " %s: " _fmt, __func__ , ## __VA_ARGS__) + +#include "sys/endian.h" +#define RTE_BYTE_ORDER BYTE_ORDER +#define RTE_LITTLE_ENDIAN LITTLE_ENDIAN +#define RTE_BIG_ENDIAN BIG_ENDIAN + +#include "sys/limits.h" // CHAR_BIT +#define rte_le_to_cpu_32 le32toh + +#include "rte_jhash.h" +#include "rte_common.h" + + +#endif Index: sys/contrib/dpdk_rte_lpm/rte_tailq.h =================================================================== --- /dev/null +++ sys/contrib/dpdk_rte_lpm/rte_tailq.h @@ -0,0 +1,140 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef _RTE_TAILQ_H_ +#define _RTE_TAILQ_H_ + +/** + * @file + * Here defines rte_tailq APIs for only internal use + * + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +//#include +#include + +/** dummy structure type used by the rte_tailq APIs */ +struct rte_tailq_entry { + TAILQ_ENTRY(rte_tailq_entry) next; /**< Pointer entries for a tailq list */ + void *data; /**< Pointer to the data referenced by this tailq entry */ +}; +/** dummy */ +TAILQ_HEAD(rte_tailq_entry_head, rte_tailq_entry); + +#define RTE_TAILQ_NAMESIZE 32 + +/** + * The structure defining a tailq header entry for storing + * in the rte_config structure in shared memory. Each tailq + * is identified by name. + * Any library storing a set of objects e.g. rings, mempools, hash-tables, + * is recommended to use an entry here, so as to make it easy for + * a multi-process app to find already-created elements in shared memory. + */ +struct rte_tailq_head { + struct rte_tailq_entry_head tailq_head; /**< NOTE: must be first element */ + char name[RTE_TAILQ_NAMESIZE]; +}; + +struct rte_tailq_elem { + /** + * Reference to head in shared mem, updated at init time by + * rte_eal_tailqs_init() + */ + struct rte_tailq_head *head; + TAILQ_ENTRY(rte_tailq_elem) next; + const char name[RTE_TAILQ_NAMESIZE]; +}; + +/** + * Return the first tailq entry cast to the right struct. + */ +#define RTE_TAILQ_CAST(tailq_entry, struct_name) \ + (struct struct_name *)&(tailq_entry)->tailq_head + +/** + * Utility macro to make looking up a tailqueue for a particular struct easier. + * + * @param name + * The name of tailq + * + * @param struct_name + * The name of the list type we are using. (Generally this is the same as the + * first parameter passed to TAILQ_HEAD macro) + * + * @return + * The return value from rte_eal_tailq_lookup, typecast to the appropriate + * structure pointer type. + * NULL on error, since the tailq_head is the first + * element in the rte_tailq_head structure. + */ +#define RTE_TAILQ_LOOKUP(name, struct_name) \ + RTE_TAILQ_CAST(rte_eal_tailq_lookup(name), struct_name) + +/** + * Dump tail queues to a file. + * + * @param f + * A pointer to a file for output + */ +//void rte_dump_tailq(FILE *f); + +/** + * Lookup for a tail queue. + * + * Get a pointer to a tail queue header of a tail + * queue identified by the name given as an argument. + * Note: this function is not multi-thread safe, and should only be called from + * a single thread at a time + * + * @param name + * The name of the queue. + * @return + * A pointer to the tail queue head structure. + */ +struct rte_tailq_head *rte_eal_tailq_lookup(const char *name); + +/** + * Register a tail queue. + * + * Register a tail queue from shared memory. 
+ * This function is mainly used by EAL_REGISTER_TAILQ macro which is used to + * register tailq from the different dpdk libraries. Since this macro is a + * constructor, the function has no access to dpdk shared memory, so the + * registered tailq can not be used before call to rte_eal_init() which calls + * rte_eal_tailqs_init(). + * + * @param t + * The tailq element which contains the name of the tailq you want to + * create (/retrieve when in secondary process). + * @return + * 0 on success or -1 in case of an error. + */ +int rte_eal_tailq_register(struct rte_tailq_elem *t); + +#define EAL_REGISTER_TAILQ(t) \ +RTE_INIT(tailqinitfn_ ##t) \ +{ \ + if (rte_eal_tailq_register(&t) < 0) \ + rte_panic("Cannot initialize tailq: %s\n", t.name); \ +} + +/* This macro permits both remove and free var within the loop safely.*/ +#ifndef TAILQ_FOREACH_SAFE +#define TAILQ_FOREACH_SAFE(var, head, field, tvar) \ + for ((var) = TAILQ_FIRST((head)); \ + (var) && ((tvar) = TAILQ_NEXT((var), field), 1); \ + (var) = (tvar)) +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_TAILQ_H_ */ Index: sys/modules/dpdk_lpm4/Makefile =================================================================== --- /dev/null +++ sys/modules/dpdk_lpm4/Makefile @@ -0,0 +1,12 @@ +# $FreeBSD$ + +SYSDIR?=${SRCTOP}/sys +.include "${SYSDIR}/conf/kern.opts.mk" + +.PATH: ${SYSDIR}/contrib/dpdk_rte_lpm + +KMOD= dpdk_lpm4 +SRCS= opt_inet.h +SRCS.INET=dpdk_lpm.c rte_lpm.c + +.include Index: sys/modules/dpdk_lpm6/Makefile =================================================================== --- /dev/null +++ sys/modules/dpdk_lpm6/Makefile @@ -0,0 +1,12 @@ +# $FreeBSD$ + +SYSDIR?=${SRCTOP}/sys +.include "${SYSDIR}/conf/kern.opts.mk" + +.PATH: ${SYSDIR}/contrib/dpdk_rte_lpm + +KMOD= dpdk_lpm6 +SRCS= opt_inet6.h +SRCS.INET6=dpdk_lpm6.c rte_lpm6.c + +.include Index: sys/net/route/route_algo.c =================================================================== --- /dev/null +++ sys/net/route/route_algo.c @@ -1,1472 +0,0 @@ -/*- - * SPDX-License-Identifier: BSD-2-Clause-FreeBSD - * - * Copyright (c) 2020 Alexander V. Chernikov - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. 
- */ - -#include -__FBSDID("$FreeBSD$"); -#include "opt_inet.h" -#include "opt_inet6.h" -#include "opt_route.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include -#include -#include -#include -#ifdef INET6 -#include -#include -#endif - -#include -#include -#include -#include -#include - -#include - -/* - * Fib lookup framework. - * - * This framework enables dynamic loading of lookup modules enabling - * accelerated longest-prefix-match lookups for the routing tables. - * - * flm - fib lookup modules - kernel modules implementing particular algo - * fd - fib data - instance of an flm bound to specific routing table - * - * For each supported address family, there is a an allocated array of fib_dp - * structures, indexed by fib number. Each array entry contains callback function - * and its argument. This function will be called with a family-specific lookup key, - * scope and provided argument. This array gets re-created every time when new algo - * instance gets created. Please take a look at the replace_rtables_family() function - * for more details. - * - * Control plane for to setup and update the necessary dataplane structures. - * 1) nexhops abstraction -> module has to deal with index, refcounting, nexhtop groups etc - * 2) sync with route tables - * 3) dataplane attachment points - * 3) fail early. Some algorithms are immutable, so any change leads to rebuild. Some - * are mutable till some extent so the module is build over common setup/teardown - * instances, making error handling * easier. - * 4) preference. Lookup modules contain callbacks to determine the preference of particula - * lookup algorithm given the current route table scale. 
- * - */ - -SYSCTL_DECL(_net_route); -SYSCTL_NODE(_net_route, OID_AUTO, algo, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, - "Route algorithm lookups"); - -#ifdef INET6 -bool algo_fixed_inet6 = false; -SYSCTL_NODE(_net_route_algo, OID_AUTO, inet6, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, - "IPv6 algorithm lookups"); -#endif -#ifdef INET -bool algo_fixed_inet = false; -SYSCTL_NODE(_net_route_algo, OID_AUTO, inet, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, - "IPv4 algorithm lookups"); -#endif - -struct nhop_ref_table { - uint32_t count; - int32_t refcnt[0]; -}; - -struct fib_data { - uint32_t number_nhops; /* current # of nhops */ - uint32_t number_records; /* current # of routes */ - uint8_t hit_nhops; /* true if out of nhop limit */ - uint8_t init_done; /* true if init is competed */ - uint32_t fd_dead:1; /* Scheduled for deletion */ - uint32_t fd_linked:1; /* true if linked */ - uint32_t fd_need_rebuild:1; /* true if rebuild scheduled */ - uint32_t fd_force_eval:1;/* true if rebuild scheduled */ - uint8_t fd_family; /* family */ - uint32_t fd_fibnum; /* fibnum */ - uint32_t fd_failed_rebuilds; /* stat: failed rebuilds */ - uint32_t fd_algo_mask; /* bitmask for algo data */ - struct callout fd_callout; /* rebuild callout */ - void *fd_algo_data; /* algorithm data */ - struct nhop_object **nh_idx; /* nhop idx->ptr array */ - struct nhop_ref_table *nh_ref_table; /* array with # of nhop references */ - struct rib_head *fd_rh; /* RIB table we're attached to */ - struct rib_subscription *fd_rs; /* storing table subscription */ - struct fib_dp fd_dp; /* fib datapath data */ - struct vnet *fd_vnet; /* vnet fib belongs to */ - struct epoch_context fd_epoch_ctx; /* epoch context for deletion */ - uint64_t gencnt; - struct fib_lookup_module *fd_flm;/* pointer to the lookup module */ - uint32_t fd_num_changes; /* number of changes since last callout */ - TAILQ_ENTRY(fib_data) entries; /* list of all fds in vnet */ -}; - -static void rebuild_callout(void *_data); -static void destroy_fd_instance_epoch(epoch_context_t ctx); -static enum flm_op_result attach_datapath(struct fib_data *fd); -static bool is_idx_free(struct fib_data *fd, uint32_t index); -static void set_algo_fixed(struct rib_head *rh); - -static uint32_t fib_ref_nhop(struct fib_data *fd, struct nhop_object *nh); -static void fib_unref_nhop(struct fib_data *fd, struct nhop_object *nh); - -static struct fib_lookup_module *fib_check_best_algo(struct rib_head *rh, - struct fib_lookup_module *orig_flm); -static void fib_unref_algo(struct fib_lookup_module *flm); - -struct mtx fib_mtx; -#define MOD_LOCK() mtx_lock(&fib_mtx) -#define MOD_UNLOCK() mtx_unlock(&fib_mtx) - -#if 0 -uint32_t algo_bitmask_idx = 0; -static uint32_t algo_inet_mask = 0; -static uint32_t algo_inet6_mask = 0; -#endif - - -/* Algorithm has to be this percent better than the current to switch */ -#define BEST_DIFF_PERCENT (5 * 256 / 100) -/* Schedule algo re-evaluation X seconds after a change */ -#define ALGO_EVAL_DELAY_MS 30000 -/* Force algo re-evaluation after X changes */ -#define ALGO_EVAL_NUM_ROUTES 100 -/* Try to setup algorithm X times */ -#define FIB_MAX_TRIES 32 -/* Max amount of supported nexthops */ -#define FIB_MAX_NHOPS 262144 -#define FIB_CALLOUT_DELAY_MS 50 - -/* Debug */ -static int flm_debug_level = LOG_NOTICE; -SYSCTL_INT(_net_route_algo, OID_AUTO, debug_level, CTLFLAG_RW, - &flm_debug_level, 0, "debuglevel"); - -#define RTDEBUG -#ifdef RTDEBUG -#define _PASS_MSG(_l) (flm_debug_level >= (_l)) -#define ALGO_PRINTF(_fmt, ...) 
printf("[rt_algo] %s: " _fmt "\n", __func__, ##__VA_ARGS__) -#define _ALGO_PRINTF(_fib, _fam, _aname, _func, _fmt, ...) \ - printf("[rt_algo] %s.%u (%s) %s: " _fmt "\n",\ - print_family(_fam), _fib, _aname, _func, ## __VA_ARGS__) -#define _RH_PRINTF(_fib, _fam, _func, _fmt, ...) \ - printf("[rt_algo] %s.%u %s: " _fmt "\n", print_family(_fam), _fib, _func, ## __VA_ARGS__) -#define RH_PRINTF(_l, _rh, _fmt, ...) if (_PASS_MSG(_l)) { \ - _RH_PRINTF(_rh->rib_fibnum, _rh->rib_family, __func__, _fmt, ## __VA_ARGS__);\ -} -#define FD_PRINTF(_l, _fd, _fmt, ...) if (_PASS_MSG(_l)) { \ - _ALGO_PRINTF(_fd->fd_fibnum, _fd->fd_family, _fd->fd_flm->flm_name, \ - __func__, _fmt, ## __VA_ARGS__); \ -} -#else -#define FD_PRINTF(fd, _fmt, ...) -#define ALGO_PRINTF(_fmt, ...) -#define RH_PRINTF(_fmt, ...) -#endif - -/* List of all fib lookup instances */ -VNET_DEFINE_STATIC(TAILQ_HEAD(fib_data_head, fib_data), fib_data_list); -#define V_fib_data_list VNET(fib_data_list) - -struct fib_error { - int fe_family; - uint32_t fe_fibnum; - struct fib_lookup_module *fe_flm; - TAILQ_ENTRY(fib_error) entries;/* list of all errored entries */ -}; -TAILQ_HEAD(fib_error_head, fib_error) fib_error_list; - -struct fib_dp_header { - struct epoch_context ffi_epoch_ctx; - uint32_t ffi_algo_mask; - uint32_t ffi_num_tables; - struct fib_dp ffi_idx[0]; -}; - -static TAILQ_HEAD(, fib_lookup_module) all_algo_list; - - -static bool -flm_error_add(struct fib_lookup_module *flm, uint32_t fibnum) -{ - struct fib_error *fe, *fe_tmp; - - fe = malloc(sizeof(struct fib_error), M_TEMP, M_NOWAIT | M_ZERO); - if (fe == NULL) - return (false); - fe->fe_flm = flm; - fe->fe_family = flm->flm_family; - fe->fe_fibnum = fibnum; - - MOD_LOCK(); - TAILQ_FOREACH(fe_tmp, &fib_error_list, entries) { - if ((fe_tmp->fe_flm == flm) && (fe_tmp->fe_fibnum == fibnum)) { - MOD_UNLOCK(); - free(fe, M_TEMP); - return (true); - } - } - TAILQ_INSERT_HEAD(&fib_error_list, fe, entries); - MOD_UNLOCK(); - - return (true); -} - -static bool -flm_error_check(struct fib_lookup_module *flm, uint32_t fibnum) -{ - struct fib_error *fe; - - TAILQ_FOREACH(fe, &fib_error_list, entries) { - if ((fe->fe_flm == flm) && (fe->fe_fibnum == fibnum)) - return (true); - } - - return (false); -} - -static void -fib_error_clear_flm(struct fib_lookup_module *flm) -{ - struct fib_error *fe, *fe_tmp; - - TAILQ_FOREACH_SAFE(fe, &fib_error_list, entries, fe_tmp) { - if (fe->fe_flm == flm) { - TAILQ_REMOVE(&fib_error_list, fe, entries); - free(fe, M_TEMP); - } - } -} - - -static const char * -print_family(int family) -{ - - if (family == AF_INET) - return ("inet"); - else if (family == AF_INET6) - return ("inet6"); - else - return ("unknown"); -} - -/* - * Debug function used by lookup modules. - * Outputs message denoted by @fmt, prepended by "[rt_algo] inetX.Y (algo) " - */ -void -fib_printf(int level, struct fib_data *fd, const char *func, char *fmt, ...) 
-{ - char buf[128]; - va_list ap; - - if (level > flm_debug_level) - return; - - va_start(ap, fmt); - vsnprintf(buf, sizeof(buf), fmt, ap); - va_end(ap); - - _ALGO_PRINTF(fd->fd_fibnum, fd->fd_family, fd->fd_flm->flm_name, - func, "%s", buf); -} - -static int -print_algos(struct sysctl_req *req, int family) -{ - struct fib_lookup_module *flm; - struct sbuf sbuf; - int error, count = 0; - - error = sysctl_wire_old_buffer(req, 0); - if (error == 0) { - sbuf_new_for_sysctl(&sbuf, NULL, 512, req); - TAILQ_FOREACH(flm, &all_algo_list, entries) { - if (flm->flm_family == family) { - if (count++ > 0) - sbuf_cat(&sbuf, ", "); - sbuf_cat(&sbuf, flm->flm_name); - } - } - error = sbuf_finish(&sbuf); - sbuf_delete(&sbuf); - } - return (error); -} - -#ifdef INET6 -static int -print_algos_inet6(SYSCTL_HANDLER_ARGS) -{ - - return (print_algos(req, AF_INET6)); -} -SYSCTL_PROC(_net_route_algo_inet6, OID_AUTO, algo_list, - CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, - print_algos_inet6, "A", "List of IPv6 lookup algorithms"); -#endif - -#ifdef INET -static int -print_algos_inet(SYSCTL_HANDLER_ARGS) -{ - - return (print_algos(req, AF_INET)); -} -SYSCTL_PROC(_net_route_algo_inet, OID_AUTO, algo_list, - CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, - print_algos_inet, "A", "List of IPv4 lookup algorithms"); -#endif - -static uint32_t -callout_calc_delay(struct fib_data *fd) -{ - uint32_t shift; - - if (fd->fd_failed_rebuilds > 10) - shift = 10; - else - shift = fd->fd_failed_rebuilds; - - return ((1 << shift) * FIB_CALLOUT_DELAY_MS); -} - -static void -schedule_callout(struct fib_data *fd, int delay_ms) -{ - - callout_reset_sbt(&fd->fd_callout, 0, SBT_1MS * delay_ms, - rebuild_callout, fd, 0); -} - -static void -schedule_fd_rebuild(struct fib_data *fd) -{ - - MOD_LOCK(); - if (!fd->fd_need_rebuild) { - fd->fd_need_rebuild = true; - - /* - * Potentially re-schedules pending callout - * initiated by schedule_algo_eval. - */ - FD_PRINTF(LOG_INFO, fd, "Scheduling rebuilt"); - schedule_callout(fd, callout_calc_delay(fd)); - } - MOD_UNLOCK(); -} - -static void -schedule_algo_eval(struct fib_data *fd) -{ - - if (fd->fd_num_changes++ == 0) { - /* Start callout to consider switch */ - MOD_LOCK(); - if (!callout_pending(&fd->fd_callout)) - schedule_callout(fd, ALGO_EVAL_DELAY_MS); - MOD_UNLOCK(); - } else if (fd->fd_num_changes > ALGO_EVAL_NUM_ROUTES && !fd->fd_force_eval) { - /* Reset callout to exec immediately */ - MOD_LOCK(); - if (!fd->fd_need_rebuild) { - fd->fd_force_eval = true; - schedule_callout(fd, 1); - } - MOD_UNLOCK(); - } -} - -/* - * rib subscription handler - */ -static void -handle_rtable_change_cb(struct rib_head *rnh, struct rib_cmd_info *rc, - void *_data) -{ - struct fib_data *fd = (struct fib_data *)_data; - enum flm_op_result result; - - RIB_WLOCK_ASSERT(rnh); - - /* - * There is a small gap between subscribing for route changes - * and initiating rtable dump. Avoid receiving route changes - * prior to finishing rtable dump by checking `init_done`. - */ - if (!fd->init_done) - return; - /* - * If algo requested rebuild, stop sending updates by default. - * This simplifies nexthop refcount handling logic. - */ - if (fd->fd_need_rebuild) - return; - - /* Consider scheduling algorithm re-evaluation */ - schedule_algo_eval(fd); - - - /* - * Maintain guarantee that every nexthop returned by the dataplane - * lookup has > 0 refcount, so can be safely referenced within current - * epoch. 
- */ - if (rc->rc_nh_new != NULL) { - if (fib_ref_nhop(fd, rc->rc_nh_new) == 0) { - /* ran out of indexes */ - schedule_fd_rebuild(fd); - return; - } - } - - result = fd->fd_flm->flm_change_rib_item_cb(rnh, rc, fd->fd_algo_data); - - switch (result) { - case FLM_SUCCESS: - /* Unref old nexthop on success */ - if (rc->rc_nh_old != NULL) - fib_unref_nhop(fd, rc->rc_nh_old); - break; - case FLM_REBUILD: - /* - * Algo reported inability to handle, - * schedule algo rebuild. - */ - schedule_fd_rebuild(fd); - break; - case FLM_ERROR: - /* - * Algo reported a non-recoverable error. - */ - FD_PRINTF(LOG_ERR, fd, "algo reported non-recoverable error"); - if (!flm_error_add(fd->fd_flm, fd->fd_fibnum)) - FD_PRINTF(LOG_ERR, fd, "failed to ban algo"); - schedule_fd_rebuild(fd); - } -} - -static void -estimate_nhop_scale(const struct fib_data *old_fd, struct fib_data *fd) -{ - - if (old_fd == NULL) { - // TODO: read from rtable - fd->number_nhops = 16; - return; - } - - if (old_fd->hit_nhops && old_fd->number_nhops < FIB_MAX_NHOPS) - fd->number_nhops = 2 * old_fd->number_nhops; - else - fd->number_nhops = old_fd->number_nhops; -} - -struct walk_cbdata { - struct fib_data *fd; - flm_dump_t *func; - enum flm_op_result result; -}; - -static void -sync_algo_end_cb(struct rib_head *rnh, enum rib_walk_hook stage, void *_data) -{ - struct walk_cbdata *w = (struct walk_cbdata *)_data; - struct fib_data *fd = w->fd; - - RIB_WLOCK_ASSERT(w->fd->fd_rh); - - if (rnh->rib_dying) { - w->result = FLM_ERROR; - return; - } - - if (stage != RIB_WALK_HOOK_POST || w->result != FLM_SUCCESS) - return; - - /* Post-dump hook, dump successful */ - - if (fd->hit_nhops) { - FD_PRINTF(LOG_INFO, fd, "ran out of nexthops at %u nhops", - fd->nh_ref_table->count); - w->result = FLM_REBUILD; - return; - } - - w->result = fd->fd_flm->flm_dump_end_cb(fd->fd_algo_data, &fd->fd_dp); - - if (w->result == FLM_SUCCESS) { - /* Mark init as done to allow routing updates */ - fd->init_done = 1; - } -} - -static int -sync_algo_cb(struct rtentry *rt, void *_data) -{ - struct walk_cbdata *w = (struct walk_cbdata *)_data; - - RIB_WLOCK_ASSERT(w->fd->fd_rh); - - if (w->result == FLM_SUCCESS && w->func) { - - /* - * Reference nexthops to maintain guarantee that - * each nexthop returned by datapath has > 0 references - * and can be safely referenced within current epoch. - */ - struct nhop_object *nh = rt_get_raw_nhop(rt); - if (fib_ref_nhop(w->fd, nh) != 0) - w->result = w->func(rt, w->fd->fd_algo_data); - else - w->result = FLM_REBUILD; - } - - return (0); -} - -/* - * Dump all routing table state to the algo instance. 
- */
-static enum flm_op_result
-sync_algo(struct fib_data *fd)
-{
- struct walk_cbdata w;
-
- w.fd = fd;
- w.func = fd->fd_flm->flm_dump_rib_item_cb;
- w.result = FLM_SUCCESS;
-
- rib_walk_ext_internal(fd->fd_rh, true, sync_algo_cb, sync_algo_end_cb, &w);
-
- FD_PRINTF(LOG_INFO, fd, "initial dump completed.");
-
- return (w.result);
-}
-
-/*
- * Assume already unlinked from datapath
- */
-static int
-schedule_destroy_fd_instance(struct fib_data *fd, bool in_callout)
-{
- bool is_dead;
-
- NET_EPOCH_ASSERT();
-
- MOD_LOCK();
- is_dead = fd->fd_dead;
- if (!is_dead)
- fd->fd_dead = true;
- if (fd->fd_linked) {
- TAILQ_REMOVE(&V_fib_data_list, fd, entries);
- fd->fd_linked = false;
- }
- MOD_UNLOCK();
- if (is_dead)
- return (0);
-
- FD_PRINTF(LOG_INFO, fd, "DETACH");
-
- if (fd->fd_rs != NULL)
- rib_unsibscribe(fd->fd_rs);
-
- /*
- * After rib_unsubscribe() no _new_ handle_rtable_change_cb() calls
- * will be executed, hence no _new_ callout schedules will happen.
- *
- * There can be 2 possible scenarios here:
- * 1) we're running inside a callout when we're deleting ourselves
- * due to migration to a newer fd
- * 2) we're running from rt_table_destroy() and callout is scheduled
- * for execution OR is executing
- *
- * For (2) we need to wait for the callout termination, as the routing table
- * will be destroyed after this function returns.
- * For (1) we cannot call drain, but can ensure that this is the last invocation.
- */
-
- if (in_callout)
- callout_stop(&fd->fd_callout);
- else
- callout_drain(&fd->fd_callout);
-
- /*
- * At this moment there is no other pending work scheduled.
- */
- FD_PRINTF(LOG_INFO, fd, "destroying old instance");
- epoch_call(net_epoch_preempt, destroy_fd_instance_epoch,
- &fd->fd_epoch_ctx);
-
- return (0);
-}
-
-/*
- * Wipe all instances from the list matching rib specified by @rh.
- */
-static void
-fib_cleanup_algo(struct rib_head *rh, bool keep_first, bool in_callout)
-{
- struct fib_data_head tmp_head = TAILQ_HEAD_INITIALIZER(tmp_head);
- struct fib_data *fd, *fd_tmp;
- struct epoch_tracker et;
-
- MOD_LOCK();
- TAILQ_FOREACH_SAFE(fd, &V_fib_data_list, entries, fd_tmp) {
- if (fd->fd_rh == rh) {
- if (keep_first) {
- keep_first = false;
- continue;
- }
- TAILQ_REMOVE(&V_fib_data_list, fd, entries);
- fd->fd_linked = false;
- TAILQ_INSERT_TAIL(&tmp_head, fd, entries);
- }
- }
- MOD_UNLOCK();
-
- /* Pass 2: remove each entry */
- NET_EPOCH_ENTER(et);
- TAILQ_FOREACH_SAFE(fd, &tmp_head, entries, fd_tmp) {
- schedule_destroy_fd_instance(fd, in_callout);
- }
- NET_EPOCH_EXIT(et);
-}
-
-void
-fib_destroy_rib(struct rib_head *rh)
-{
-
- /*
- * At this point the is_dying flag has been set on rnh, so all new fd's will
- * fail at the sync_algo() stage and nothing new will be added to the list.
- */
- fib_cleanup_algo(rh, false, false);
-}
-
-static void
-destroy_instance(struct fib_data *fd)
-{
-
- FD_PRINTF(LOG_INFO, fd, "destroy fd %p", fd);
-
- /* Call destroy callback first */
- if (fd->fd_algo_data != NULL)
- fd->fd_flm->flm_destroy_cb(fd->fd_algo_data);
-
- /* Nhop table */
- if ((fd->nh_idx != NULL) && (fd->nh_ref_table != NULL)) {
- for (int i = 0; i < fd->number_nhops; i++) {
- if (!is_idx_free(fd, i)) {
- FD_PRINTF(LOG_DEBUG, fd, " FREE nhop %d %p",
- i, fd->nh_idx[i]);
- nhop_free_any(fd->nh_idx[i]);
- }
- }
- free(fd->nh_idx, M_RTABLE);
- }
- if (fd->nh_ref_table != NULL)
- free(fd->nh_ref_table, M_RTABLE);
-
- fib_unref_algo(fd->fd_flm);
-
- free(fd, M_RTABLE);
-}
-
-/*
- * Epoch callback indicating fd is safe to destroy
- */
-static void
-destroy_fd_instance_epoch(epoch_context_t ctx)
-{
- struct fib_data *fd;
-
- fd = __containerof(ctx, struct fib_data, fd_epoch_ctx);
-
- destroy_instance(fd);
-}
-
-static enum flm_op_result
-try_setup_instance(struct fib_lookup_module *flm, struct rib_head *rh,
- struct fib_data *old_fd, struct fib_data **pfd)
-{
- struct fib_data *fd;
- size_t size;
- enum flm_op_result result;
-
- /* Allocate */
- fd = malloc(sizeof(struct fib_data), M_RTABLE, M_NOWAIT | M_ZERO);
- if (fd == NULL) {
- *pfd = NULL;
- return (FLM_REBUILD);
- }
- *pfd = fd;
-
- estimate_nhop_scale(old_fd, fd);
-
- fd->fd_rh = rh;
- fd->fd_family = rh->rib_family;
- fd->fd_fibnum = rh->rib_fibnum;
- callout_init(&fd->fd_callout, 1);
- fd->fd_vnet = curvnet;
- fd->fd_flm = flm;
-
- MOD_LOCK();
- flm->flm_refcount++;
- MOD_UNLOCK();
-
- /* Allocate nhidx -> nhop_ptr table */
- size = fd->number_nhops * sizeof(void *);
- //FD_PRINTF(fd, "malloc(%lu)", size);
- fd->nh_idx = malloc(size, M_RTABLE, M_NOWAIT | M_ZERO);
- if (fd->nh_idx == NULL) {
- FD_PRINTF(LOG_INFO, fd, "Unable to allocate nhop table idx (sz:%zu)", size);
- return (FLM_REBUILD);
- }
-
- /* Allocate nhop index refcount table */
- size = sizeof(struct nhop_ref_table);
- size += fd->number_nhops * sizeof(uint32_t);
- //FD_PRINTF(fd, "malloc(%lu)", size);
- fd->nh_ref_table = malloc(size, M_RTABLE, M_NOWAIT | M_ZERO);
- if (fd->nh_ref_table == NULL) {
- FD_PRINTF(LOG_INFO, fd, "Unable to allocate nhop refcount table (sz:%zu)", size);
- return (FLM_REBUILD);
- }
-
- /* Okay, we're ready for algo init */
- void *old_algo_data = (old_fd != NULL) ? old_fd->fd_algo_data : NULL;
- result = flm->flm_init_cb(fd->fd_fibnum, fd, old_algo_data, &fd->fd_algo_data);
- if (result != FLM_SUCCESS)
- return (result);
-
- /* Try to subscribe */
- if (flm->flm_change_rib_item_cb != NULL) {
- fd->fd_rs = rib_subscribe_internal(fd->fd_rh,
- handle_rtable_change_cb, fd, RIB_NOTIFY_IMMEDIATE, 0);
- if (fd->fd_rs == NULL)
- return (FLM_REBUILD);
- }
-
- /* Dump */
- result = sync_algo(fd);
- if (result != FLM_SUCCESS)
- return (result);
- FD_PRINTF(LOG_INFO, fd, "DUMP completed successfully.");
-
- MOD_LOCK();
- /*
- * Insert at the beginning of the list, so that the
- * first matching entry is always the one in use.
- */
- TAILQ_INSERT_HEAD(&V_fib_data_list, fd, entries);
- fd->fd_linked = true;
- MOD_UNLOCK();
-
- return (FLM_SUCCESS);
-}
-
-/*
- * Sets up algo @flm for table @rh and links it to the datapath.
- * - */ -static enum flm_op_result -setup_instance(struct fib_lookup_module *flm, struct rib_head *rh, - struct fib_data *orig_fd, struct fib_data **pfd, bool attach) -{ - struct fib_data *prev_fd, *new_fd; - struct epoch_tracker et; - enum flm_op_result result; - - prev_fd = orig_fd; - new_fd = NULL; - for (int i = 0; i < FIB_MAX_TRIES; i++) { - NET_EPOCH_ENTER(et); - result = try_setup_instance(flm, rh, prev_fd, &new_fd); - - if ((result == FLM_SUCCESS) && attach) - result = attach_datapath(new_fd); - - if ((prev_fd != NULL) && (prev_fd != orig_fd)) { - schedule_destroy_fd_instance(prev_fd, false); - prev_fd = NULL; - } - NET_EPOCH_EXIT(et); - - RH_PRINTF(LOG_INFO, rh, "try %d: fib algo result: %d", i, result); - - if (result == FLM_REBUILD) { - prev_fd = new_fd; - new_fd = NULL; - continue; - } - - break; - } - - if (result != FLM_SUCCESS) { - /* update failure count */ - MOD_LOCK(); - if (orig_fd != NULL) - orig_fd->fd_failed_rebuilds++; - MOD_UNLOCK(); - - /* Ban algo on non-recoverable error */ - if (result == FLM_ERROR) - flm_error_add(flm, rh->rib_fibnum); - - NET_EPOCH_ENTER(et); - if ((prev_fd != NULL) && (prev_fd != orig_fd)) - schedule_destroy_fd_instance(prev_fd, false); - if (new_fd != NULL) { - schedule_destroy_fd_instance(new_fd, false); - new_fd = NULL; - } - NET_EPOCH_EXIT(et); - } - - *pfd = new_fd; - return (result); -} - -static void -rebuild_callout(void *_data) -{ - struct fib_data *fd, *fd_new, *fd_tmp; - struct fib_lookup_module *flm_new; - struct epoch_tracker et; - enum flm_op_result result; - bool need_rebuild = false; - - fd = (struct fib_data *)_data; - - MOD_LOCK(); - need_rebuild = fd->fd_need_rebuild; - fd->fd_need_rebuild = false; - fd->fd_force_eval = false; - fd->fd_num_changes = 0; - MOD_UNLOCK(); - - CURVNET_SET(fd->fd_vnet); - - /* First, check if we're still OK to use this algo */ - flm_new = fib_check_best_algo(fd->fd_rh, fd->fd_flm); - if ((flm_new == NULL) && (!need_rebuild)) { - /* Keep existing algo, no need to rebuild. 
*/ - CURVNET_RESTORE(); - return; - } - - if (flm_new == NULL) { - flm_new = fd->fd_flm; - fd_tmp = fd; - } else { - fd_tmp = NULL; - FD_PRINTF(LOG_NOTICE, fd, "switching algo to %s", flm_new->flm_name); - } - result = setup_instance(flm_new, fd->fd_rh, fd_tmp, &fd_new, true); - if (fd_tmp == NULL) { - /* fd_new represents new algo */ - fib_unref_algo(flm_new); - } - if (result != FLM_SUCCESS) { - FD_PRINTF(LOG_NOTICE, fd, "table rebuild failed"); - CURVNET_RESTORE(); - return; - } - FD_PRINTF(LOG_INFO, fd_new, "switched to new instance"); - - /* Remove old */ - if (fd != NULL) { - NET_EPOCH_ENTER(et); - schedule_destroy_fd_instance(fd, true); - NET_EPOCH_EXIT(et); - } - - CURVNET_RESTORE(); -} - -static struct fib_lookup_module * -fib_find_algo(const char *algo_name, int family) -{ - struct fib_lookup_module *flm; - - MOD_LOCK(); - TAILQ_FOREACH(flm, &all_algo_list, entries) { - if ((strcmp(flm->flm_name, algo_name) == 0) && - (family == flm->flm_family)) { - flm->flm_refcount++; - MOD_UNLOCK(); - return (flm); - } - } - MOD_UNLOCK(); - - return (NULL); -} - -static void -fib_unref_algo(struct fib_lookup_module *flm) -{ - - MOD_LOCK(); - flm->flm_refcount--; - MOD_UNLOCK(); -} - -static int -set_fib_algo(uint32_t fibnum, int family, struct sysctl_oid *oidp, struct sysctl_req *req) -{ - struct fib_lookup_module *flm = NULL; - struct fib_data *fd = NULL; - char old_algo_name[32], algo_name[32]; - struct rib_head *rh = NULL; - enum flm_op_result result; - int error; - - MOD_LOCK(); - TAILQ_FOREACH(fd, &V_fib_data_list, entries) { - if ((fd->fd_family == family) && (fd->fd_fibnum == fibnum)) - break; - } - if (fd == NULL) { - MOD_UNLOCK(); - return (ENOENT); - } - rh = fd->fd_rh; - strlcpy(old_algo_name, fd->fd_flm->flm_name, - sizeof(old_algo_name)); - MOD_UNLOCK(); - - strlcpy(algo_name, old_algo_name, sizeof(algo_name)); - error = sysctl_handle_string(oidp, algo_name, sizeof(algo_name), req); - if (error != 0 || req->newptr == NULL) - return (error); - - if (strcmp(algo_name, old_algo_name) == 0) - return (0); - - flm = fib_find_algo(algo_name, family); - if (flm == NULL) { - RH_PRINTF(LOG_INFO, rh, "unable to find algo %s", algo_name); - return (ESRCH); - } - - fd = NULL; - result = setup_instance(flm, rh, NULL, &fd, true); - fib_unref_algo(flm); - if (result != FLM_SUCCESS) - return (EINVAL); - - /* Disable jumping between algos */ - MOD_LOCK(); - set_algo_fixed(rh); - MOD_UNLOCK(); - /* Remove old instance(s) */ - fib_cleanup_algo(rh, true, false); - - /* Drain cb so user can unload the module after userret if so desired */ - epoch_drain_callbacks(net_epoch_preempt); - - return (0); -} - -#ifdef INET -static int -set_algo4_sysctl_handler(SYSCTL_HANDLER_ARGS) -{ - - return (set_fib_algo(RT_DEFAULT_FIB, AF_INET, oidp, req)); -} -SYSCTL_PROC(_net_route_algo_inet, OID_AUTO, algo, - CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0, - set_algo4_sysctl_handler, "A", "Set IPv4 lookup algo"); -#endif - -#ifdef INET6 -static int -set_algo6_sysctl_handler(SYSCTL_HANDLER_ARGS) -{ - - return (set_fib_algo(RT_DEFAULT_FIB, AF_INET6, oidp, req)); -} -SYSCTL_PROC(_net_route_algo_inet6, OID_AUTO, algo, - CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0, - set_algo6_sysctl_handler, "A", "Set IPv6 lookup algo"); -#endif - -static void -destroy_fdh_epoch(epoch_context_t ctx) -{ - struct fib_dp_header *ffi; - - ffi = __containerof(ctx, struct fib_dp_header, ffi_epoch_ctx); - free(ffi, M_RTABLE); -} - -static struct fib_dp_header * -alloc_fib_dp_array(uint32_t num_tables, bool waitok) -{ - size_t 
sz;
- struct fib_dp_header *ffi;
-
- sz = sizeof(struct fib_dp_header);
- sz += sizeof(struct fib_dp) * num_tables;
- ffi = malloc(sz, M_RTABLE, (waitok ? M_WAITOK : M_NOWAIT) | M_ZERO);
- if (ffi != NULL)
- ffi->ffi_num_tables = num_tables;
- return (ffi);
-}
-
-static struct fib_dp_header *
-get_fib_dp_header(struct fib_dp *dp)
-{
-
- return (__containerof((void *)dp, struct fib_dp_header, ffi_idx));
-}
-
-/*
- * Replace per-family index pool @pdp with a new one which
- * contains updated callback/algo data from @fd.
- * Returns FLM_SUCCESS on success.
- */
-static enum flm_op_result
-replace_rtables_family(struct fib_dp **pdp, struct fib_data *fd)
-{
- struct fib_dp_header *new_ffi, *old_ffi;
-
- NET_EPOCH_ASSERT();
-
- FD_PRINTF(LOG_DEBUG, fd, "[vnet %p] replace with f:%p arg:%p",
- curvnet, fd->fd_dp.f, fd->fd_dp.arg);
-
- MOD_LOCK();
- old_ffi = get_fib_dp_header(*pdp);
- new_ffi = alloc_fib_dp_array(old_ffi->ffi_num_tables, false);
- FD_PRINTF(LOG_DEBUG, fd, "OLD FDH: %p NEW FDH: %p", old_ffi, new_ffi);
- if (new_ffi == NULL) {
- MOD_UNLOCK();
- FD_PRINTF(LOG_WARNING, fd, "error attaching datapath");
- return (FLM_REBUILD);
- }
-
- memcpy(&new_ffi->ffi_idx[0], &old_ffi->ffi_idx[0],
- old_ffi->ffi_num_tables * sizeof(struct fib_dp));
- /* Update relevant data structure for @fd */
- new_ffi->ffi_idx[fd->fd_fibnum] = fd->fd_dp;
-
- /* Ensure memcpy() writes have completed */
- atomic_thread_fence_rel();
- /* Set new datapath pointer */
- *pdp = &new_ffi->ffi_idx[0];
- MOD_UNLOCK();
- FD_PRINTF(LOG_DEBUG, fd, "update %p -> %p", old_ffi, new_ffi);
-
- epoch_call(net_epoch_preempt, destroy_fdh_epoch,
- &old_ffi->ffi_epoch_ctx);
-
- return (FLM_SUCCESS);
-}
-
-static struct fib_dp **
-get_family_dp_ptr(int family)
-{
- switch (family) {
- case AF_INET:
- return (&V_inet_dp);
- case AF_INET6:
- return (&V_inet6_dp);
- }
- return (NULL);
-}
-
-/*
- * Make datapath use fib instance @fd
- */
-static enum flm_op_result
-attach_datapath(struct fib_data *fd)
-{
- struct fib_dp **pdp;
-
- pdp = get_family_dp_ptr(fd->fd_family);
- return (replace_rtables_family(pdp, fd));
-}
-
-/*
- * Grow the datapath pointers array.
- * Called from the sysctl handler on growing number of routing tables.
- */
-static void
-grow_rtables_family(struct fib_dp **pdp, uint32_t new_num_tables)
-{
- struct fib_dp_header *new_fdh, *old_fdh = NULL;
-
- new_fdh = alloc_fib_dp_array(new_num_tables, true);
-
- MOD_LOCK();
- if (*pdp != NULL) {
- old_fdh = get_fib_dp_header(*pdp);
- memcpy(&new_fdh->ffi_idx[0], &old_fdh->ffi_idx[0],
- old_fdh->ffi_num_tables * sizeof(struct fib_dp));
- }
-
- /* Ensure all writes have completed */
- atomic_thread_fence_rel();
-
- *pdp = &new_fdh->ffi_idx[0];
- MOD_UNLOCK();
-
- if (old_fdh != NULL)
- epoch_call(net_epoch_preempt, destroy_fdh_epoch,
- &old_fdh->ffi_epoch_ctx);
-}
-
-/*
- * Grows per-AF arrays of datapath pointers for each supported family.
- * Called from the fibs resize sysctl handler.
- */
-void
-fib_grow_rtables(uint32_t new_num_tables)
-{
-
-#ifdef INET
- grow_rtables_family(get_family_dp_ptr(AF_INET), new_num_tables);
-#endif
-#ifdef INET6
- grow_rtables_family(get_family_dp_ptr(AF_INET6), new_num_tables);
-#endif
-}
-
-void
-fib_get_rtable_info(struct rib_head *rh, struct rib_rtable_info *rinfo)
-{
-
- bzero(rinfo, sizeof(struct rib_rtable_info));
- rinfo->num_prefixes = rh->rnh_prefixes;
- rinfo->num_nhops = nhops_get_count(rh);
-#ifdef ROUTE_MPATH
- rinfo->num_nhgrp = nhgrp_get_count(rh);
-#endif
-}
-
-/*
- * Accessor to get the rib instance @fd is attached to.
- */ -struct rib_head * -fib_get_rh(struct fib_data *fd) -{ - - return (fd->fd_rh); -} - -/* - * Accessor to export idx->nhop array - */ -struct nhop_object ** -fib_get_nhop_array(struct fib_data *fd) -{ - - return (fd->nh_idx); -} - -static uint32_t -get_nhop_idx(struct nhop_object *nh) -{ -#ifdef ROUTE_MPATH - if (NH_IS_NHGRP(nh)) - return (nhgrp_get_idx((struct nhgrp_object *)nh) * 2 - 1); - else - return (nhop_get_idx(nh) * 2); -#else - return (nhop_get_idx(nh)); -#endif -} - -uint32_t -fib_get_nhop_idx(struct fib_data *fd, struct nhop_object *nh) -{ - - return (get_nhop_idx(nh)); -} - -static bool -is_idx_free(struct fib_data *fd, uint32_t index) -{ - - return (fd->nh_ref_table->refcnt[index] == 0); -} - -static uint32_t -fib_ref_nhop(struct fib_data *fd, struct nhop_object *nh) -{ - uint32_t idx = get_nhop_idx(nh); - - if (idx >= fd->number_nhops) { - fd->hit_nhops = 1; - return (0); - } - - if (is_idx_free(fd, idx)) { - nhop_ref_any(nh); - fd->nh_idx[idx] = nh; - fd->nh_ref_table->count++; - FD_PRINTF(LOG_DEBUG, fd, " REF nhop %u %p", idx, fd->nh_idx[idx]); - } - fd->nh_ref_table->refcnt[idx]++; - - return (idx); -} - -struct nhop_release_data { - struct nhop_object *nh; - struct epoch_context ctx; -}; - -static void -release_nhop_epoch(epoch_context_t ctx) -{ - struct nhop_release_data *nrd; - - nrd = __containerof(ctx, struct nhop_release_data, ctx); - nhop_free_any(nrd->nh); - free(nrd, M_RTABLE); -} - -static void -fib_schedule_release_nhop(struct fib_data *fd, struct nhop_object *nh) -{ - struct nhop_release_data *nrd; - - nrd = malloc(sizeof(struct nhop_release_data), M_RTABLE, M_NOWAIT | M_ZERO); - if (nrd != NULL) { - nrd->nh = nh; - epoch_call(net_epoch_preempt, release_nhop_epoch, &nrd->ctx); - } else { - /* - * Unable to allocate memory. Leak nexthop to maintain guarantee - * that each nhop can be referenced. 
- */
- FD_PRINTF(LOG_ERR, fd, "unable to schedule nhop %p deletion", nh);
- }
-}
-
-static void
-fib_unref_nhop(struct fib_data *fd, struct nhop_object *nh)
-{
- uint32_t idx = get_nhop_idx(nh);
-
- KASSERT((idx < fd->number_nhops), ("invalid nhop index"));
- KASSERT((nh == fd->nh_idx[idx]), ("index table contains wrong nh"));
-
- fd->nh_ref_table->refcnt[idx]--;
- if (fd->nh_ref_table->refcnt[idx] == 0) {
- FD_PRINTF(LOG_DEBUG, fd, " FREE nhop %d %p", idx, fd->nh_idx[idx]);
- fib_schedule_release_nhop(fd, fd->nh_idx[idx]);
- }
-}
-
-static void
-set_algo_fixed(struct rib_head *rh)
-{
- switch (rh->rib_family) {
- case AF_INET:
- algo_fixed_inet = true;
- break;
- case AF_INET6:
- algo_fixed_inet6 = true;
- break;
- }
-}
-
-static bool
-is_algo_fixed(struct rib_head *rh)
-{
-
- switch (rh->rib_family) {
- case AF_INET:
- return (algo_fixed_inet);
- case AF_INET6:
- return (algo_fixed_inet6);
- }
- return (false);
-}
-
-static struct fib_lookup_module *
-fib_check_best_algo(struct rib_head *rh, struct fib_lookup_module *orig_flm)
-{
- uint8_t preference, curr_preference = 0, best_preference = 0;
- struct fib_lookup_module *flm, *best_flm = NULL;
- struct rib_rtable_info rinfo;
- int candidate_algos = 0;
-
- fib_get_rtable_info(rh, &rinfo);
-
- MOD_LOCK();
- if (is_algo_fixed(rh)) {
- MOD_UNLOCK();
- return (NULL);
- }
-
- TAILQ_FOREACH(flm, &all_algo_list, entries) {
- if (flm->flm_family != rh->rib_family)
- continue;
- candidate_algos++;
- preference = flm->flm_get_pref(&rinfo);
- if (preference > best_preference) {
- if (!flm_error_check(flm, rh->rib_fibnum)) {
- best_preference = preference;
- best_flm = flm;
- }
- }
- if (flm == orig_flm)
- curr_preference = preference;
- }
- if ((best_flm != NULL) && (curr_preference + BEST_DIFF_PERCENT < best_preference))
- best_flm->flm_refcount++;
- else
- best_flm = NULL;
- MOD_UNLOCK();
-
- RH_PRINTF(LOG_INFO, rh, "candidate_algos: %d, curr: %s(%d) result: %s(%d)",
- candidate_algos, orig_flm ? orig_flm->flm_name : "NULL", curr_preference,
- best_flm ? best_flm->flm_name : (orig_flm ? orig_flm->flm_name : "NULL"),
- best_preference);
-
- return (best_flm);
-}
-
-/*
- * Called when new route table is created.
- * Selects, allocates and attaches fib algo for the table.
- */ -int -fib_select_algo_initial(struct rib_head *rh) -{ - struct fib_lookup_module *flm; - struct fib_data *fd = NULL; - enum flm_op_result result; - - flm = fib_check_best_algo(rh, NULL); - if (flm == NULL) { - RH_PRINTF(LOG_CRIT, rh, "no algo selected"); - return (ENOENT); - } - RH_PRINTF(LOG_INFO, rh, "selected algo %s", flm->flm_name); - - result = setup_instance(flm, rh, NULL, &fd, false); - fib_unref_algo(flm); - RH_PRINTF(LOG_DEBUG, rh, "result=%d fd=%p", result, fd); - if (result == FLM_SUCCESS) { - - /* - * Attach datapath directly to avoid multiple reallocations - * during fib growth - */ - struct fib_dp_header *fdp; - struct fib_dp **pdp; - - pdp = get_family_dp_ptr(rh->rib_family); - if (pdp != NULL) { - fdp = get_fib_dp_header(*pdp); - fdp->ffi_idx[fd->fd_fibnum] = fd->fd_dp; - FD_PRINTF(LOG_INFO, fd, "datapath attached"); - } - } - - return (0); -} - -int -fib_module_register(struct fib_lookup_module *flm) -{ - - MOD_LOCK(); - ALGO_PRINTF("attaching %s to %s", flm->flm_name, - print_family(flm->flm_family)); - TAILQ_INSERT_TAIL(&all_algo_list, flm, entries); - MOD_UNLOCK(); - - return (0); -} - -int -fib_module_unregister(struct fib_lookup_module *flm) -{ - - MOD_LOCK(); - if (flm->flm_refcount > 0) { - MOD_UNLOCK(); - return (EBUSY); - } - fib_error_clear_flm(flm); - ALGO_PRINTF("detaching %s from %s", flm->flm_name, - print_family(flm->flm_family)); - TAILQ_REMOVE(&all_algo_list, flm, entries); - MOD_UNLOCK(); - - return (0); -} - -void -vnet_fib_init(void) -{ - - TAILQ_INIT(&V_fib_data_list); -} - -static void -fib_init(void) -{ - - mtx_init(&fib_mtx, "algo list mutex", NULL, MTX_DEF); - TAILQ_INIT(&all_algo_list); -} -SYSINIT(fib_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_SECOND, fib_init, NULL); -
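
The rebuild scheduling above backs off exponentially: callout_calc_delay() doubles the delay with every failed rebuild and caps the shift at 10. A minimal userland sketch of that calculation; the 50 ms base value stands in for FIB_CALLOUT_DELAY_MS and is an assumption, as is the calc_rebuild_delay_ms() name.

#include <stdint.h>
#include <stdio.h>

#define FIB_CALLOUT_DELAY_MS 50 /* assumed base delay */

/* Mirrors callout_calc_delay(): delay doubles per failed rebuild, shift capped at 10. */
static uint32_t
calc_rebuild_delay_ms(uint32_t failed_rebuilds)
{
    uint32_t shift;

    shift = (failed_rebuilds > 10) ? 10 : failed_rebuilds;
    return ((1U << shift) * FIB_CALLOUT_DELAY_MS);
}

int
main(void)
{
    for (unsigned fails = 0; fails <= 12; fails++)
        printf("failed_rebuilds=%2u -> delay %u ms\n", fails,
            (unsigned)calc_rebuild_delay_ms(fails));
    return (0);
}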
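
fib_ref_nhop() and fib_unref_nhop() keep a per-instance index-to-pointer table plus a refcount array so that any nexthop handed to the datapath stays referenced. The sketch below models that bookkeeping in userland; struct nh_table, nh_table_ref() and nh_table_unref() are invented stand-ins, and real index allocation (get_nhop_idx()) is replaced by caller-supplied indexes.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define NUM_NHOPS 8 /* table capacity; the framework doubles it on rebuild */

/* Simplified stand-in for fd->nh_idx[] plus fd->nh_ref_table. */
struct nh_table {
    void *nh_ptr[NUM_NHOPS];        /* idx -> nexthop pointer */
    uint32_t refcnt[NUM_NHOPS];     /* idx -> reference count */
    uint32_t count;                 /* number of live indexes */
    bool hit_limit;                 /* analogue of fd->hit_nhops */
};

static bool
idx_is_free(const struct nh_table *t, uint32_t idx)
{
    return (t->refcnt[idx] == 0);
}

/* Mirrors fib_ref_nhop(): store the pointer on first use, always bump the refcount. */
static uint32_t
nh_table_ref(struct nh_table *t, uint32_t idx, void *nh)
{
    if (idx >= NUM_NHOPS) {
        t->hit_limit = true;    /* caller schedules a rebuild */
        return (0);
    }
    if (idx_is_free(t, idx)) {
        t->nh_ptr[idx] = nh;
        t->count++;
    }
    t->refcnt[idx]++;
    return (idx);
}

/* Mirrors fib_unref_nhop(): the pointer becomes releasable when the count drops to 0. */
static void
nh_table_unref(struct nh_table *t, uint32_t idx)
{
    t->refcnt[idx]--;
    if (t->refcnt[idx] == 0) {
        printf("idx %u (%p) can be released\n", (unsigned)idx, t->nh_ptr[idx]);
        t->nh_ptr[idx] = NULL;
        t->count--;
    }
}

int
main(void)
{
    static struct nh_table t;
    int nh_a, nh_b;                 /* dummy objects standing in for nexthops */

    nh_table_ref(&t, 2, &nh_a);     /* first route using nexthop 2 */
    nh_table_ref(&t, 2, &nh_a);     /* second route using the same nexthop */
    nh_table_ref(&t, 5, &nh_b);
    nh_table_unref(&t, 2);          /* still referenced, nothing released */
    nh_table_unref(&t, 2);          /* last reference gone */
    printf("live indexes: %u, hit_limit: %d\n", (unsigned)t.count, t.hit_limit);
    return (0);
}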
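
replace_rtables_family() and grow_rtables_family() never modify the array the datapath is reading: they allocate a copy, update it, publish the new pointer behind a release fence, and defer freeing the old array via epoch_call(). A userland sketch of the same copy-and-publish pattern using C11 atomics; struct dp_array and dp_replace_slot() are invented names, and the deferred free is simply omitted since there is no epoch(9) equivalent here.

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Simplified stand-in for struct fib_dp: one lookup pointer plus its argument. */
struct dp_slot {
    void *f;
    void *arg;
};

struct dp_array {
    uint32_t num_tables;
    struct dp_slot idx[];
};

/* Readers dereference this pointer without taking any lock. */
static _Atomic(struct dp_array *) active_dp;

static struct dp_array *
dp_array_alloc(uint32_t num_tables)
{
    struct dp_array *d;

    d = calloc(1, sizeof(*d) + num_tables * sizeof(struct dp_slot));
    if (d != NULL)
        d->num_tables = num_tables;
    return (d);
}

/*
 * Copy-and-publish, as in replace_rtables_family(): copy the old array,
 * update a single slot, then publish the new array with release semantics.
 * The old array may only be freed once no reader can still hold it
 * (epoch_call() in the kernel; intentionally leaked here).
 */
static int
dp_replace_slot(uint32_t fibnum, struct dp_slot new_slot)
{
    struct dp_array *old, *new;

    old = atomic_load(&active_dp);
    new = dp_array_alloc(old->num_tables);
    if (new == NULL)
        return (-1);
    memcpy(new->idx, old->idx, old->num_tables * sizeof(struct dp_slot));
    new->idx[fibnum] = new_slot;
    /* Release store: slot contents are visible before the new pointer is. */
    atomic_store_explicit(&active_dp, new, memory_order_release);
    return (0);
}

int
main(void)
{
    struct dp_slot s = { .f = (void *)0x1, .arg = (void *)0x2 };
    struct dp_array *d = dp_array_alloc(4);

    if (d == NULL)
        return (1);
    atomic_store(&active_dp, d);
    dp_replace_slot(1, s);
    printf("fib 1 lookup ptr: %p\n", atomic_load(&active_dp)->idx[1].f);
    return (0);
}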
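
Under ROUTE_MPATH, get_nhop_idx() folds nexthops and nexthop groups into one index space by parity: plain nexthops land on even slots (idx * 2) and groups on odd slots (idx * 2 - 1), so the two can share nh_idx[] without colliding. A small sketch of that encoding, assuming both underlying index spaces start at 1.

#include <stdio.h>

/*
 * Combined index space as in get_nhop_idx() under ROUTE_MPATH:
 *   nexthop index n (n >= 1)       -> 2n     (even)
 *   nexthop-group index g (g >= 1) -> 2g - 1 (odd)
 */
static unsigned
encode_nhop(unsigned nhop_idx)
{
    return (nhop_idx * 2);
}

static unsigned
encode_nhgrp(unsigned nhgrp_idx)
{
    return (nhgrp_idx * 2 - 1);
}

int
main(void)
{
    for (unsigned i = 1; i <= 4; i++)
        printf("nhop %u -> %u, nhgrp %u -> %u\n",
            i, encode_nhop(i), i, encode_nhgrp(i));
    /* The parity of a combined index tells which kind of object it maps to. */
    return (0);
}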
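
fib_check_best_algo() only reports a replacement when some candidate's preference beats the current algorithm's by more than BEST_DIFF_PERCENT, which keeps tables from flapping between algorithms with similar scores. A sketch of that decision; the value 5 for BEST_DIFF_PERCENT, struct algo and pick_better_algo() are assumptions, and the candidate array stands in for the module list walk.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define BEST_DIFF_PERCENT 5 /* assumed switch threshold */

struct algo {
    const char *name;
    uint8_t pref;   /* 1..255, as returned by flm_get_pref() */
};

/*
 * Pick a replacement only if some candidate beats the current algorithm's
 * preference by more than BEST_DIFF_PERCENT; otherwise stay put (NULL),
 * mirroring the decision in fib_check_best_algo().
 */
static const struct algo *
pick_better_algo(const struct algo *cand, size_t ncand, const struct algo *curr)
{
    const struct algo *best = NULL;
    uint8_t best_pref = 0;

    for (size_t i = 0; i < ncand; i++) {
        if (cand[i].pref > best_pref) {
            best_pref = cand[i].pref;
            best = &cand[i];
        }
    }
    if (best != NULL && curr->pref + BEST_DIFF_PERCENT < best_pref)
        return (best);
    return (NULL);
}

int
main(void)
{
    struct algo cand[] = { { "bsearch", 40 }, { "radix", 50 }, { "dxr", 100 } };
    struct algo curr = { "radix", 50 };
    const struct algo *next;

    next = pick_better_algo(cand, sizeof(cand) / sizeof(cand[0]), &curr);
    printf("switch to: %s\n", next != NULL ? next->name : "(keep current)");
    return (0);
}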
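
setup_instance() retries a failed build up to FIB_MAX_TRIES times, feeding the failed instance back in so estimate_nhop_scale() can double the nexthop table when the previous attempt set hit_nhops. A schematic userland sketch of that control flow; MAX_TRIES, MAX_NHOPS, try_build() and the 1000-nexthop requirement are invented for illustration.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define MAX_TRIES 32            /* stand-in for FIB_MAX_TRIES */
#define MAX_NHOPS (1 << 20)     /* stand-in for FIB_MAX_NHOPS */

struct instance {
    uint32_t number_nhops;
    bool hit_nhops;
};

/* As in estimate_nhop_scale(): start small, double when the last try overflowed. */
static void
estimate_scale(const struct instance *old, struct instance *new)
{
    if (old == NULL) {
        new->number_nhops = 16;
        return;
    }
    if (old->hit_nhops && old->number_nhops < MAX_NHOPS)
        new->number_nhops = 2 * old->number_nhops;
    else
        new->number_nhops = old->number_nhops;
}

/* Fake build step: keeps requesting a rebuild until the table can hold 1000 nexthops. */
static bool
try_build(struct instance *inst, uint32_t required_nhops)
{
    if (inst->number_nhops < required_nhops) {
        inst->hit_nhops = true;
        return (false);         /* FLM_REBUILD analogue */
    }
    return (true);              /* FLM_SUCCESS analogue */
}

int
main(void)
{
    struct instance prev = { 0, false }, cur;
    const struct instance *old = NULL;

    for (int i = 0; i < MAX_TRIES; i++) {
        cur = (struct instance){ 0, false };
        estimate_scale(old, &cur);
        printf("try %d: %u nhop slots\n", i, (unsigned)cur.number_nhops);
        if (try_build(&cur, 1000))
            break;
        prev = cur;             /* the failed attempt becomes the sizing hint */
        old = &prev;
    }
    return (0);
}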