diff --git a/sys/contrib/dpdk_rte_lpm/dpdk_lpm.c b/sys/contrib/dpdk_rte_lpm/dpdk_lpm.c index af145997c4d6..51cd134132f6 100644 --- a/sys/contrib/dpdk_rte_lpm/dpdk_lpm.c +++ b/sys/contrib/dpdk_rte_lpm/dpdk_lpm.c @@ -1,423 +1,427 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2020 Alexander V. Chernikov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "rte_shim.h" #include "rte_lpm.h" #define LPM_MIN_TBL8 8 /* 2 pages of memory */ #define LPM_MAX_TBL8 65536 * 16 /* 256M */ MALLOC_DECLARE(M_RTABLE); struct dpdk_lpm_data { struct rte_lpm *lpm; uint64_t routes_added; uint64_t routes_failed; uint32_t number_tbl8s; uint32_t fibnum; uint8_t hit_tables; uint8_t hit_records; struct fib_data *fd; }; /* * Main datapath routing */ static struct nhop_object * lookup_ptr(void *algo_data, const struct flm_lookup_key key, uint32_t scopeid) { struct rte_lpm *lpm; const struct rte_lpm_external *rte_ext; uint32_t nhidx = 0; int ret; lpm = (struct rte_lpm *)algo_data; rte_ext = (const struct rte_lpm_external *)lpm; ret = rte_lpm_lookup(lpm, ntohl(key.addr4.s_addr), &nhidx); if (ret == 0) { /* Success! */ return (rte_ext->nh_idx[nhidx]); } else { /* Not found. Check default route */ return (rte_ext->nh_idx[rte_ext->default_idx]); } return (NULL); } static uint8_t rte_get_pref(const struct rib_rtable_info *rinfo) { if (rinfo->num_prefixes < 10) return (1); else if (rinfo->num_prefixes < 1000) return (rinfo->num_prefixes / 10); else if (rinfo->num_prefixes < 500000) return (100 + rinfo->num_prefixes / 3334); else return (250); } static enum flm_op_result handle_default_change(struct dpdk_lpm_data *dd, struct rib_cmd_info *rc) { struct rte_lpm_external *rte_ext; rte_ext = (struct rte_lpm_external *)dd->lpm; if (rc->rc_cmd != RTM_DELETE) { /* Reference new */ uint32_t nhidx = fib_get_nhop_idx(dd->fd, rc->rc_nh_new); if (nhidx == 0) return (FLM_REBUILD); rte_ext->default_idx = nhidx; } else { /* No default route */ rte_ext->default_idx = 0; } return (FLM_SUCCESS); } static void -get_parent_rule(struct dpdk_lpm_data *dd, struct in_addr addr, uint8_t *plen, uint32_t *nhop_idx) +get_parent_rule(struct dpdk_lpm_data *dd, struct in_addr addr, int plen, + uint8_t *pplen, uint32_t *nhop_idx) { - struct route_nhop_data rnd; struct rtentry *rt; - rt = fib4_lookup_rt(dd->fibnum, addr, 0, NHR_UNLOCKED, &rnd); + rt = rt_get_inet_parent(dd->fibnum, addr, plen); if (rt != NULL) { struct in_addr addr4; uint32_t scopeid; - int inet_plen; - rt_get_inet_prefix_plen(rt, &addr4, &inet_plen, &scopeid); - if (inet_plen > 0) { - *plen = inet_plen; - *nhop_idx = fib_get_nhop_idx(dd->fd, rnd.rnd_nhop); + int parent_plen; + + rt_get_inet_prefix_plen(rt, &addr4, &parent_plen, &scopeid); + if (parent_plen > 0) { + *pplen = parent_plen; + *nhop_idx = fib_get_nhop_idx(dd->fd, rt_get_raw_nhop(rt)); return; } } *nhop_idx = 0; - *plen = 0; + *pplen = 0; } static enum flm_op_result handle_gu_change(struct dpdk_lpm_data *dd, const struct rib_cmd_info *rc, const struct in_addr addr, int plen) { uint32_t nhidx = 0; int ret; char abuf[INET_ADDRSTRLEN]; uint32_t ip; ip = ntohl(addr.s_addr); inet_ntop(AF_INET, &addr, abuf, sizeof(abuf)); /* So we get sin, plen and nhidx */ if (rc->rc_cmd != RTM_DELETE) { /* * Addition or change. Save nhop in the internal table * and get index. */ nhidx = fib_get_nhop_idx(dd->fd, rc->rc_nh_new); if (nhidx == 0) { FIB_PRINTF(LOG_INFO, dd->fd, "nhop limit reached, need rebuild"); return (FLM_REBUILD); } ret = rte_lpm_add(dd->lpm, ip, plen, nhidx); - FIB_PRINTF(LOG_DEBUG, dd->fd, "DPDK GU: %s %s/%d nhop %u = %d", + FIB_PRINTF(LOG_DEBUG, dd->fd, "DPDK GU: %s %s/%d nhop %u -> %u ret: %d", (rc->rc_cmd == RTM_ADD) ? "ADD" : "UPDATE", - abuf, plen, nhidx, ret); + abuf, plen, + rc->rc_nh_old != NULL ? fib_get_nhop_idx(dd->fd, rc->rc_nh_old) : 0, + nhidx, ret); } else { /* * Need to lookup parent. Assume deletion happened already */ uint8_t parent_plen; uint32_t parent_nhop_idx; - get_parent_rule(dd, addr, &parent_plen, &parent_nhop_idx); + get_parent_rule(dd, addr, plen, &parent_plen, &parent_nhop_idx); ret = rte_lpm_delete(dd->lpm, ip, plen, parent_plen, parent_nhop_idx); - FIB_PRINTF(LOG_DEBUG, dd->fd, "DPDK: %s %s/%d nhop %u = %d", - "DEL", abuf, plen, nhidx, ret); + FIB_PRINTF(LOG_DEBUG, dd->fd, "DPDK: %s %s/%d -> /%d nhop %u -> %u ret: %d", + "DEL", abuf, plen, parent_plen, fib_get_nhop_idx(dd->fd, rc->rc_nh_old), + parent_nhop_idx, ret); } if (ret != 0) { FIB_PRINTF(LOG_INFO, dd->fd, "error: %d", ret); if (ret == -ENOSPC) return (FLM_REBUILD); return (FLM_ERROR); } return (FLM_SUCCESS); } static enum flm_op_result handle_rtable_change_cb(struct rib_head *rnh, struct rib_cmd_info *rc, void *_data) { struct dpdk_lpm_data *dd; enum flm_op_result ret; struct in_addr addr4; uint32_t scopeid; int plen; dd = (struct dpdk_lpm_data *)_data; rt_get_inet_prefix_plen(rc->rc_rt, &addr4, &plen, &scopeid); if (plen != 0) ret = handle_gu_change(dd, rc, addr4, plen); else ret = handle_default_change(dd, rc); if (ret != 0) FIB_PRINTF(LOG_INFO, dd->fd, "error handling route"); return (ret); } static void destroy_table(void *_data) { struct dpdk_lpm_data *dd = (struct dpdk_lpm_data *)_data; if (dd->lpm != NULL) rte_lpm_free(dd->lpm); free(dd, M_RTABLE); } static enum flm_op_result add_route_cb(struct rtentry *rt, void *_data) { struct dpdk_lpm_data *dd = (struct dpdk_lpm_data *)_data; struct nhop_object *nh; int plen, ret; struct in_addr addr4; uint32_t scopeid; nh = rt_get_raw_nhop(rt); rt_get_inet_prefix_plen(rt, &addr4, &plen, &scopeid); char abuf[INET_ADDRSTRLEN]; inet_ntop(AF_INET, &addr4, abuf, sizeof(abuf)); FIB_PRINTF(LOG_DEBUG, dd->fd, "Operating on %s/%d", abuf, plen); if (plen == 0) { struct rib_cmd_info rc = { .rc_cmd = RTM_ADD, .rc_nh_new = nh, }; FIB_PRINTF(LOG_DEBUG, dd->fd, "Adding default route"); return (handle_default_change(dd, &rc)); } uint32_t nhidx = fib_get_nhop_idx(dd->fd, nh); if (nhidx == 0) { FIB_PRINTF(LOG_INFO, dd->fd, "unable to get nhop index"); return (FLM_REBUILD); } ret = rte_lpm_add(dd->lpm, ntohl(addr4.s_addr), plen, nhidx); FIB_PRINTF(LOG_DEBUG, dd->fd, "ADD %p %s/%d nh %u = %d", dd->lpm, abuf, plen, nhidx, ret); if (ret != 0) { FIB_PRINTF(LOG_INFO, dd->fd, "rte_lpm_add() returned %d", ret); if (ret == -ENOSPC) { dd->hit_tables = 1; return (FLM_REBUILD); } dd->routes_failed++; return (FLM_ERROR); } else dd->routes_added++; return (FLM_SUCCESS); } static enum flm_op_result check_dump_success(void *_data, struct fib_dp *dp) { struct dpdk_lpm_data *dd; dd = (struct dpdk_lpm_data *)_data; FIB_PRINTF(LOG_INFO, dd->fd, "scan completed. added: %zu failed: %zu", dd->routes_added, dd->routes_failed); if (dd->hit_tables || dd->routes_failed > 0) return (FLM_REBUILD); FIB_PRINTF(LOG_INFO, dd->fd, "DPDK lookup engine synced with IPv4 RIB id %u, %zu routes", dd->fibnum, dd->routes_added); dp->f = lookup_ptr; dp->arg = dd->lpm; return (FLM_SUCCESS); } static void estimate_scale(const struct dpdk_lpm_data *dd_src, struct dpdk_lpm_data *dd) { /* XXX: update at 75% capacity */ if (dd_src->hit_tables) dd->number_tbl8s = dd_src->number_tbl8s * 2; else dd->number_tbl8s = dd_src->number_tbl8s; /* TODO: look into the appropriate RIB to adjust */ } static struct dpdk_lpm_data * build_table(struct dpdk_lpm_data *dd_prev, struct fib_data *fd) { struct dpdk_lpm_data *dd; struct rte_lpm *lpm; dd = malloc(sizeof(struct dpdk_lpm_data), M_RTABLE, M_NOWAIT | M_ZERO); if (dd == NULL) { FIB_PRINTF(LOG_INFO, fd, "Unable to allocate base datastructure"); return (NULL); } dd->fibnum = dd_prev->fibnum; dd->fd = fd; estimate_scale(dd_prev, dd); struct rte_lpm_config cfg = {.number_tbl8s = dd->number_tbl8s}; lpm = rte_lpm_create("test", 0, &cfg); if (lpm == NULL) { FIB_PRINTF(LOG_INFO, fd, "unable to create lpm"); free(dd, M_RTABLE); return (NULL); } dd->lpm = lpm; struct rte_lpm_external *ext = (struct rte_lpm_external *)lpm; ext->nh_idx = fib_get_nhop_array(dd->fd); FIB_PRINTF(LOG_INFO, fd, "allocated %u tbl8s", dd->number_tbl8s); return (dd); } static enum flm_op_result init_table(uint32_t fibnum, struct fib_data *fd, void *_old_data, void **data) { struct dpdk_lpm_data *dd, dd_base; if (_old_data == NULL) { bzero(&dd_base, sizeof(struct dpdk_lpm_data)); dd_base.fibnum = fibnum; /* TODO: get rib statistics */ dd_base.number_tbl8s = LPM_MIN_TBL8; dd = &dd_base; } else { FIB_PRINTF(LOG_DEBUG, fd, "Starting with old data"); dd = (struct dpdk_lpm_data *)_old_data; } /* Guaranteed to be in epoch */ dd = build_table(dd, fd); if (dd == NULL) { FIB_PRINTF(LOG_NOTICE, fd, "table creation failed"); return (FLM_REBUILD); } *data = dd; return (FLM_SUCCESS); } static struct fib_lookup_module dpdk_lpm4 = { .flm_name = "dpdk_lpm4", .flm_family = AF_INET, .flm_init_cb = init_table, .flm_destroy_cb = destroy_table, .flm_dump_rib_item_cb = add_route_cb, .flm_dump_end_cb = check_dump_success, .flm_change_rib_item_cb = handle_rtable_change_cb, .flm_get_pref = rte_get_pref, }; static int lpm4_modevent(module_t mod, int type, void *unused) { int error = 0; switch (type) { case MOD_LOAD: fib_module_register(&dpdk_lpm4); break; case MOD_UNLOAD: error = fib_module_unregister(&dpdk_lpm4); break; default: error = EOPNOTSUPP; break; } return (error); } static moduledata_t lpm4mod = { "dpdk_lpm4", lpm4_modevent, 0 }; DECLARE_MODULE(lpm4mod, lpm4mod, SI_SUB_PSEUDO, SI_ORDER_ANY); MODULE_VERSION(lpm4mod, 1); diff --git a/sys/contrib/dpdk_rte_lpm/dpdk_lpm6.c b/sys/contrib/dpdk_rte_lpm/dpdk_lpm6.c index 17d35c16346d..ec1a3228d42b 100644 --- a/sys/contrib/dpdk_rte_lpm/dpdk_lpm6.c +++ b/sys/contrib/dpdk_rte_lpm/dpdk_lpm6.c @@ -1,487 +1,489 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2020 Alexander V. Chernikov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define RTDEBUG #include "rte_lpm6.h" #define LPM6_MIN_TBL8 8 /* 2 pages of memory */ #define LPM6_MAX_TBL8 65536 * 16 /* 256M */ struct fib_algo_calldata { void *lookup; void *arg; }; struct dpdk_lpm6_data { struct rte_lpm6 *lpm6; uint64_t routes_added; uint64_t routes_failed; uint32_t number_tbl8s; uint32_t fibnum; uint8_t hit_tables; struct fib_data *fd; }; static struct nhop_object * lookup_ptr_ll(const struct rte_lpm6 *lpm6, const struct in6_addr *dst6, uint32_t scopeid) { const struct rte_lpm6_external *rte_ext; rte_ext = (const struct rte_lpm6_external *)lpm6; return (fib6_radix_lookup_nh(rte_ext->fibnum, dst6, scopeid)); } /* * Main datapath routing */ static struct nhop_object * lookup_ptr(void *algo_data, const struct flm_lookup_key key, uint32_t scopeid) { const struct rte_lpm6 *lpm6; const struct rte_lpm6_external *rte_ext; const struct in6_addr *addr6; uint32_t nhidx = 0; int ret; lpm6 = (const struct rte_lpm6 *)algo_data; addr6 = key.addr6; rte_ext = (const struct rte_lpm6_external *)lpm6; if (!IN6_IS_SCOPE_LINKLOCAL(addr6)) { ret = rte_lpm6_lookup(lpm6, (const uint8_t *)addr6, &nhidx); if (ret == 0) { /* Success! */ return (rte_ext->nh_idx[nhidx]); } else { /* Not found. Check default route */ if (rte_ext->default_idx > 0) return (rte_ext->nh_idx[rte_ext->default_idx]); else return (NULL); } } else { /* LL */ return (lookup_ptr_ll(lpm6, addr6, scopeid)); } } static uint8_t rte6_get_pref(const struct rib_rtable_info *rinfo) { if (rinfo->num_prefixes < 10) return (1); else if (rinfo->num_prefixes < 1000) return (rinfo->num_prefixes / 10); else if (rinfo->num_prefixes < 100000) return (100 + rinfo->num_prefixes / 667); else return (250); } static enum flm_op_result handle_default_change(struct dpdk_lpm6_data *dd, struct rib_cmd_info *rc) { struct rte_lpm6_external *rte_ext; rte_ext = (struct rte_lpm6_external *)dd->lpm6; if (rc->rc_cmd != RTM_DELETE) { /* Reference new */ uint32_t nhidx = fib_get_nhop_idx(dd->fd, rc->rc_nh_new); if (nhidx == 0) return (FLM_REBUILD); rte_ext->default_idx = nhidx; } else { /* No default route */ rte_ext->default_idx = 0; } return (FLM_SUCCESS); } static enum flm_op_result handle_ll_change(struct dpdk_lpm6_data *dd, struct rib_cmd_info *rc, const struct in6_addr addr6, int plen, uint32_t scopeid) { return (FLM_SUCCESS); } static struct rte_lpm6_rule * -pack_parent_rule(struct dpdk_lpm6_data *dd, const struct in6_addr *addr6, - char *buffer) +pack_parent_rule(struct dpdk_lpm6_data *dd, const struct in6_addr *addr6, int plen, + int *pplen, uint32_t *pnhop_idx, char *buffer) { struct rte_lpm6_rule *lsp_rule = NULL; - struct route_nhop_data rnd; struct rtentry *rt; - int plen; - rt = fib6_lookup_rt(dd->fibnum, addr6, 0, NHR_UNLOCKED, &rnd); + *pnhop_idx = 0; + *pplen = 0; + + rt = rt_get_inet6_parent(dd->fibnum, addr6, plen); /* plen = 0 means default route and it's out of scope */ if (rt != NULL) { - uint32_t scopeid; + uint32_t nhop_idx, scopeid; struct in6_addr new_addr6; rt_get_inet6_prefix_plen(rt, &new_addr6, &plen, &scopeid); if (plen > 0) { - uint32_t nhidx = fib_get_nhop_idx(dd->fd, rnd.rnd_nhop); - if (nhidx == 0) { - /* - * shouldn't happen as we already have parent route. - * It will trigger rebuild automatically. - */ - return (NULL); - } - lsp_rule = fill_rule6(buffer, (uint8_t *)&new_addr6, plen, nhidx); + nhop_idx = fib_get_nhop_idx(dd->fd, rt_get_raw_nhop(rt)); + lsp_rule = fill_rule6(buffer, (uint8_t *)&new_addr6, plen, nhop_idx); + *pnhop_idx = nhop_idx; + *pplen = plen; } } return (lsp_rule); } static enum flm_op_result handle_gu_change(struct dpdk_lpm6_data *dd, const struct rib_cmd_info *rc, const struct in6_addr *addr6, int plen) { int ret; char abuf[INET6_ADDRSTRLEN]; inet_ntop(AF_INET6, addr6, abuf, sizeof(abuf)); /* So we get sin6, plen and nhidx */ if (rc->rc_cmd != RTM_DELETE) { /* * Addition or change. Save nhop in the internal table * and get index. */ uint32_t nhidx = fib_get_nhop_idx(dd->fd, rc->rc_nh_new); if (nhidx == 0) { FIB_PRINTF(LOG_INFO, dd->fd, "nhop limit reached, need rebuild"); return (FLM_REBUILD); } ret = rte_lpm6_add(dd->lpm6, (const uint8_t *)addr6, plen, nhidx, (rc->rc_cmd == RTM_ADD) ? 1 : 0); - FIB_PRINTF(LOG_DEBUG, dd->fd, "DPDK GU: %s %s/%d nhop %u = %d", + FIB_PRINTF(LOG_DEBUG, dd->fd, "DPDK GU: %s %s/%d nhop %u -> %u ret: %d", (rc->rc_cmd == RTM_ADD) ? "ADD" : "UPDATE", - abuf, plen, nhidx, ret); + abuf, plen, + rc->rc_nh_old != NULL ? fib_get_nhop_idx(dd->fd, rc->rc_nh_old) : 0, + nhidx, ret); } else { /* * Need to lookup parent. Assume deletion happened already */ char buffer[RTE_LPM6_RULE_SIZE]; struct rte_lpm6_rule *lsp_rule = NULL; - lsp_rule = pack_parent_rule(dd, addr6, buffer); + int parent_plen; + uint32_t parent_nhop_idx; + lsp_rule = pack_parent_rule(dd, addr6, plen, &parent_plen, + &parent_nhop_idx, buffer); ret = rte_lpm6_delete(dd->lpm6, (const uint8_t *)addr6, plen, lsp_rule); - FIB_PRINTF(LOG_DEBUG, dd->fd, "DPDK GU: %s %s/%d nhop ? = %d", - "DEL", abuf, plen, ret); + FIB_PRINTF(LOG_DEBUG, dd->fd, "DPDK GU: %s %s/%d -> /%d nhop %u -> %u ret: %d", + "DEL", abuf, plen, parent_plen, fib_get_nhop_idx(dd->fd, rc->rc_nh_old), + parent_nhop_idx, ret); } if (ret != 0) { FIB_PRINTF(LOG_INFO, dd->fd, "error: %d", ret); if (ret == -ENOSPC) return (FLM_REBUILD); return (FLM_ERROR); } return (FLM_SUCCESS); } static enum flm_op_result handle_any_change(struct dpdk_lpm6_data *dd, struct rib_cmd_info *rc) { enum flm_op_result ret; struct in6_addr addr6; uint32_t scopeid; int plen; rt_get_inet6_prefix_plen(rc->rc_rt, &addr6, &plen, &scopeid); if (IN6_IS_SCOPE_LINKLOCAL(&addr6)) ret = handle_ll_change(dd, rc, addr6, plen, scopeid); else if (plen == 0) ret = handle_default_change(dd, rc); else ret = handle_gu_change(dd, rc, &addr6, plen); if (ret != 0) FIB_PRINTF(LOG_INFO, dd->fd, "error handling route"); return (ret); } static enum flm_op_result handle_rtable_change_cb(struct rib_head *rnh, struct rib_cmd_info *rc, void *_data) { struct dpdk_lpm6_data *dd; dd = (struct dpdk_lpm6_data *)_data; return (handle_any_change(dd, rc)); } static void destroy_dd(struct dpdk_lpm6_data *dd) { FIB_PRINTF(LOG_INFO, dd->fd, "destroy dd %p", dd); if (dd->lpm6 != NULL) rte_lpm6_free(dd->lpm6); free(dd, M_TEMP); } static void destroy_table(void *_data) { destroy_dd((struct dpdk_lpm6_data *)_data); } static enum flm_op_result add_route_cb(struct rtentry *rt, void *_data) { struct dpdk_lpm6_data *dd = (struct dpdk_lpm6_data *)_data; struct in6_addr addr6; struct nhop_object *nh; uint32_t scopeid; int plen; int ret; rt_get_inet6_prefix_plen(rt, &addr6, &plen, &scopeid); nh = rt_get_raw_nhop(rt); if (IN6_IS_SCOPE_LINKLOCAL(&addr6)) { /* * We don't operate on LL directly, however * reference them to maintain guarantee on * ability to refcount nhops in epoch. */ fib_get_nhop_idx(dd->fd, nh); return (FLM_SUCCESS); } char abuf[INET6_ADDRSTRLEN]; inet_ntop(AF_INET6, &addr6, abuf, sizeof(abuf)); FIB_PRINTF(LOG_DEBUG, dd->fd, "Operating on %s/%d", abuf, plen); if (plen == 0) { struct rib_cmd_info rc = { .rc_cmd = RTM_ADD, .rc_nh_new = nh, }; FIB_PRINTF(LOG_DEBUG, dd->fd, "Adding default route"); return (handle_default_change(dd, &rc)); } uint32_t nhidx = fib_get_nhop_idx(dd->fd, nh); if (nhidx == 0) { FIB_PRINTF(LOG_INFO, dd->fd, "unable to get nhop index"); return (FLM_REBUILD); } ret = rte_lpm6_add(dd->lpm6, (const uint8_t *)&addr6, plen, nhidx, 1); FIB_PRINTF(LOG_DEBUG, dd->fd, "ADD %p %s/%d nh %u = %d", dd->lpm6, abuf, plen, nhidx, ret); if (ret != 0) { FIB_PRINTF(LOG_INFO, dd->fd, "rte_lpm6_add() returned %d", ret); if (ret == -ENOSPC) { dd->hit_tables = 1; return (FLM_REBUILD); } dd->routes_failed++; return (FLM_ERROR); } else dd->routes_added++; return (FLM_SUCCESS); } static enum flm_op_result check_dump_success(void *_data, struct fib_dp *dp) { struct dpdk_lpm6_data *dd; dd = (struct dpdk_lpm6_data *)_data; FIB_PRINTF(LOG_INFO, dd->fd, "scan completed. added: %zu failed: %zu", dd->routes_added, dd->routes_failed); if (dd->hit_tables || dd->routes_failed > 0) return (FLM_REBUILD); FIB_PRINTF(LOG_INFO, dd->fd, "DPDK lookup engine synced with IPv6 RIB id %u, %zu routes", dd->fibnum, dd->routes_added); dp->f = lookup_ptr; dp->arg = dd->lpm6; return (FLM_SUCCESS); } static void estimate_scale(const struct dpdk_lpm6_data *dd_src, struct dpdk_lpm6_data *dd) { /* XXX: update at 75% capacity */ if (dd_src->hit_tables) dd->number_tbl8s = dd_src->number_tbl8s * 2; else dd->number_tbl8s = dd_src->number_tbl8s; /* TODO: look into the appropriate RIB to adjust */ } static struct dpdk_lpm6_data * build_table(struct dpdk_lpm6_data *dd_prev, struct fib_data *fd) { struct dpdk_lpm6_data *dd; struct rte_lpm6 *lpm6; dd = malloc(sizeof(struct dpdk_lpm6_data), M_TEMP, M_NOWAIT | M_ZERO); if (dd == NULL) { FIB_PRINTF(LOG_INFO, fd, "Unable to allocate base datastructure"); return (NULL); } dd->fibnum = dd_prev->fibnum; dd->fd = fd; estimate_scale(dd_prev, dd); struct rte_lpm6_config cfg = {.number_tbl8s = dd->number_tbl8s}; lpm6 = rte_lpm6_create("test", 0, &cfg); if (lpm6 == NULL) { FIB_PRINTF(LOG_INFO, fd, "unable to create lpm6"); free(dd, M_TEMP); return (NULL); } dd->lpm6 = lpm6; struct rte_lpm6_external *ext = (struct rte_lpm6_external *)lpm6; ext->nh_idx = fib_get_nhop_array(dd->fd); FIB_PRINTF(LOG_INFO, fd, "allocated %u tbl8s", dd->number_tbl8s); return (dd); } static enum flm_op_result init_table(uint32_t fibnum, struct fib_data *fd, void *_old_data, void **data) { struct dpdk_lpm6_data *dd, dd_base; if (_old_data == NULL) { bzero(&dd_base, sizeof(struct dpdk_lpm6_data)); dd_base.fibnum = fibnum; /* TODO: get rib statistics */ dd_base.number_tbl8s = LPM6_MIN_TBL8; dd = &dd_base; } else { FIB_PRINTF(LOG_INFO, fd, "Starting with old data"); dd = (struct dpdk_lpm6_data *)_old_data; } /* Guaranteed to be in epoch */ dd = build_table(dd, fd); if (dd == NULL) { FIB_PRINTF(LOG_INFO, fd, "table creation failed"); return (FLM_REBUILD); } *data = dd; return (FLM_SUCCESS); } static struct fib_lookup_module dpdk_lpm6 = { .flm_name = "dpdk_lpm6", .flm_family = AF_INET6, .flm_init_cb = init_table, .flm_destroy_cb = destroy_table, .flm_dump_rib_item_cb = add_route_cb, .flm_dump_end_cb = check_dump_success, .flm_change_rib_item_cb = handle_rtable_change_cb, .flm_get_pref = rte6_get_pref, }; static int lpm6_modevent(module_t mod, int type, void *unused) { int error = 0; switch (type) { case MOD_LOAD: fib_module_register(&dpdk_lpm6); break; case MOD_UNLOAD: error = fib_module_unregister(&dpdk_lpm6); break; default: error = EOPNOTSUPP; break; } return (error); } static moduledata_t lpm6mod = { "dpdk_lpm6", lpm6_modevent, 0 }; DECLARE_MODULE(lpm6mod, lpm6mod, SI_SUB_PSEUDO, SI_ORDER_ANY); MODULE_VERSION(lpm6mod, 1); diff --git a/sys/net/radix.c b/sys/net/radix.c index 931bf6db871b..2169361c7229 100644 --- a/sys/net/radix.c +++ b/sys/net/radix.c @@ -1,1189 +1,1203 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1988, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)radix.c 8.5 (Berkeley) 5/19/95 * $FreeBSD$ */ /* * Routines to build and maintain radix trees for routing lookups. */ #include #ifdef _KERNEL #include #include #include #include #include #include #include #else /* !_KERNEL */ #include #include #include #define log(x, arg...) fprintf(stderr, ## arg) #define panic(x) fprintf(stderr, "PANIC: %s", x), exit(1) #define min(a, b) ((a) < (b) ? (a) : (b) ) #include #endif /* !_KERNEL */ static struct radix_node *rn_insert(void *, struct radix_head *, int *, struct radix_node [2]), *rn_newpair(void *, int, struct radix_node[2]), *rn_search(void *, struct radix_node *), *rn_search_m(void *, struct radix_node *, void *); static struct radix_node *rn_addmask(void *, struct radix_mask_head *, int,int); static void rn_detachhead_internal(struct radix_head *); #define RADIX_MAX_KEY_LEN 32 static char rn_zeros[RADIX_MAX_KEY_LEN]; static char rn_ones[RADIX_MAX_KEY_LEN] = { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, }; static int rn_lexobetter(void *m_arg, void *n_arg); static struct radix_mask * rn_new_radix_mask(struct radix_node *tt, struct radix_mask *next); static int rn_satisfies_leaf(char *trial, struct radix_node *leaf, int skip); /* * The data structure for the keys is a radix tree with one way * branching removed. The index rn_bit at an internal node n represents a bit * position to be tested. The tree is arranged so that all descendants * of a node n have keys whose bits all agree up to position rn_bit - 1. * (We say the index of n is rn_bit.) * * There is at least one descendant which has a one bit at position rn_bit, * and at least one with a zero there. * * A route is determined by a pair of key and mask. We require that the * bit-wise logical and of the key and mask to be the key. * We define the index of a route to associated with the mask to be * the first bit number in the mask where 0 occurs (with bit number 0 * representing the highest order bit). * * We say a mask is normal if every bit is 0, past the index of the mask. * If a node n has a descendant (k, m) with index(m) == index(n) == rn_bit, * and m is a normal mask, then the route applies to every descendant of n. * If the index(m) < rn_bit, this implies the trailing last few bits of k * before bit b are all 0, (and hence consequently true of every descendant * of n), so the route applies to all descendants of the node as well. * * Similar logic shows that a non-normal mask m such that * index(m) <= index(n) could potentially apply to many children of n. * Thus, for each non-host route, we attach its mask to a list at an internal * node as high in the tree as we can go. * * The present version of the code makes use of normal routes in short- * circuiting an explict mask and compare operation when testing whether * a key satisfies a normal route, and also in remembering the unique leaf * that governs a subtree. */ /* * Most of the functions in this code assume that the key/mask arguments * are sockaddr-like structures, where the first byte is an u_char * indicating the size of the entire structure. * * To make the assumption more explicit, we use the LEN() macro to access * this field. It is safe to pass an expression with side effects * to LEN() as the argument is evaluated only once. * We cast the result to int as this is the dominant usage. */ #define LEN(x) ( (int) (*(const u_char *)(x)) ) /* * XXX THIS NEEDS TO BE FIXED * In the code, pointers to keys and masks are passed as either * 'void *' (because callers use to pass pointers of various kinds), or * 'caddr_t' (which is fine for pointer arithmetics, but not very * clean when you dereference it to access data). Furthermore, caddr_t * is really 'char *', while the natural type to operate on keys and * masks would be 'u_char'. This mismatch require a lot of casts and * intermediate variables to adapt types that clutter the code. */ /* * Search a node in the tree matching the key. */ static struct radix_node * rn_search(void *v_arg, struct radix_node *head) { struct radix_node *x; caddr_t v; for (x = head, v = v_arg; x->rn_bit >= 0;) { if (x->rn_bmask & v[x->rn_offset]) x = x->rn_right; else x = x->rn_left; } return (x); } /* * Same as above, but with an additional mask. * XXX note this function is used only once. */ static struct radix_node * rn_search_m(void *v_arg, struct radix_node *head, void *m_arg) { struct radix_node *x; caddr_t v = v_arg, m = m_arg; for (x = head; x->rn_bit >= 0;) { if ((x->rn_bmask & m[x->rn_offset]) && (x->rn_bmask & v[x->rn_offset])) x = x->rn_right; else x = x->rn_left; } return (x); } int rn_refines(void *m_arg, void *n_arg) { caddr_t m = m_arg, n = n_arg; caddr_t lim, lim2 = lim = n + LEN(n); int longer = LEN(n++) - LEN(m++); int masks_are_equal = 1; if (longer > 0) lim -= longer; while (n < lim) { if (*n & ~(*m)) return (0); if (*n++ != *m++) masks_are_equal = 0; } while (n < lim2) if (*n++) return (0); if (masks_are_equal && (longer < 0)) for (lim2 = m - longer; m < lim2; ) if (*m++) return (1); return (!masks_are_equal); } /* * Search for exact match in given @head. * Assume host bits are cleared in @v_arg if @m_arg is not NULL * Note that prefixes with /32 or /128 masks are treated differently * from host routes. */ struct radix_node * rn_lookup(void *v_arg, void *m_arg, struct radix_head *head) { struct radix_node *x; caddr_t netmask; if (m_arg != NULL) { /* * Most common case: search exact prefix/mask */ x = rn_addmask(m_arg, head->rnh_masks, 1, head->rnh_treetop->rn_offset); if (x == NULL) return (NULL); netmask = x->rn_key; x = rn_match(v_arg, head); while (x != NULL && x->rn_mask != netmask) x = x->rn_dupedkey; return (x); } /* * Search for host address. */ if ((x = rn_match(v_arg, head)) == NULL) return (NULL); /* Check if found key is the same */ if (LEN(x->rn_key) != LEN(v_arg) || bcmp(x->rn_key, v_arg, LEN(v_arg))) return (NULL); /* Check if this is not host route */ if (x->rn_mask != NULL) return (NULL); return (x); } static int rn_satisfies_leaf(char *trial, struct radix_node *leaf, int skip) { char *cp = trial, *cp2 = leaf->rn_key, *cp3 = leaf->rn_mask; char *cplim; int length = min(LEN(cp), LEN(cp2)); if (cp3 == NULL) cp3 = rn_ones; else length = min(length, LEN(cp3)); cplim = cp + length; cp3 += skip; cp2 += skip; for (cp += skip; cp < cplim; cp++, cp2++, cp3++) if ((*cp ^ *cp2) & *cp3) return (0); return (1); } /* * Search for longest-prefix match in given @head */ struct radix_node * rn_match(void *v_arg, struct radix_head *head) { caddr_t v = v_arg; struct radix_node *t = head->rnh_treetop, *x; caddr_t cp = v, cp2; caddr_t cplim; struct radix_node *saved_t, *top = t; int off = t->rn_offset, vlen = LEN(cp), matched_off; int test, b, rn_bit; /* * Open code rn_search(v, top) to avoid overhead of extra * subroutine call. */ for (; t->rn_bit >= 0; ) { if (t->rn_bmask & cp[t->rn_offset]) t = t->rn_right; else t = t->rn_left; } /* * See if we match exactly as a host destination * or at least learn how many bits match, for normal mask finesse. * * It doesn't hurt us to limit how many bytes to check * to the length of the mask, since if it matches we had a genuine * match and the leaf we have is the most specific one anyway; * if it didn't match with a shorter length it would fail * with a long one. This wins big for class B&C netmasks which * are probably the most common case... */ if (t->rn_mask) vlen = *(u_char *)t->rn_mask; cp += off; cp2 = t->rn_key + off; cplim = v + vlen; for (; cp < cplim; cp++, cp2++) if (*cp != *cp2) goto on1; /* * This extra grot is in case we are explicitly asked * to look up the default. Ugh! * * Never return the root node itself, it seems to cause a * lot of confusion. */ if (t->rn_flags & RNF_ROOT) t = t->rn_dupedkey; return (t); on1: test = (*cp ^ *cp2) & 0xff; /* find first bit that differs */ for (b = 7; (test >>= 1) > 0;) b--; matched_off = cp - v; b += matched_off << 3; rn_bit = -1 - b; /* * If there is a host route in a duped-key chain, it will be first. */ if ((saved_t = t)->rn_mask == 0) t = t->rn_dupedkey; for (; t; t = t->rn_dupedkey) /* * Even if we don't match exactly as a host, * we may match if the leaf we wound up at is * a route to a net. */ if (t->rn_flags & RNF_NORMAL) { if (rn_bit <= t->rn_bit) return (t); } else if (rn_satisfies_leaf(v, t, matched_off)) return (t); t = saved_t; /* start searching up the tree */ do { struct radix_mask *m; t = t->rn_parent; m = t->rn_mklist; /* * If non-contiguous masks ever become important * we can restore the masking and open coding of * the search and satisfaction test and put the * calculation of "off" back before the "do". */ while (m) { if (m->rm_flags & RNF_NORMAL) { if (rn_bit <= m->rm_bit) return (m->rm_leaf); } else { off = min(t->rn_offset, matched_off); x = rn_search_m(v, t, m->rm_mask); while (x && x->rn_mask != m->rm_mask) x = x->rn_dupedkey; if (x && rn_satisfies_leaf(v, x, off)) return (x); } m = m->rm_mklist; } } while (t != top); return (0); } +/* + * Returns the next (wider) prefix for the key defined by @rn + * if exists. + */ +struct radix_node * +rn_nextprefix(struct radix_node *rn) +{ + for (rn = rn->rn_dupedkey; rn != NULL; rn = rn->rn_dupedkey) { + if (!(rn->rn_flags & RNF_ROOT)) + return (rn); + } + return (NULL); +} + #ifdef RN_DEBUG int rn_nodenum; struct radix_node *rn_clist; int rn_saveinfo; int rn_debug = 1; #endif /* * Whenever we add a new leaf to the tree, we also add a parent node, * so we allocate them as an array of two elements: the first one must be * the leaf (see RNTORT() in route.c), the second one is the parent. * This routine initializes the relevant fields of the nodes, so that * the leaf is the left child of the parent node, and both nodes have * (almost) all all fields filled as appropriate. * (XXX some fields are left unset, see the '#if 0' section). * The function returns a pointer to the parent node. */ static struct radix_node * rn_newpair(void *v, int b, struct radix_node nodes[2]) { struct radix_node *tt = nodes, *t = tt + 1; t->rn_bit = b; t->rn_bmask = 0x80 >> (b & 7); t->rn_left = tt; t->rn_offset = b >> 3; #if 0 /* XXX perhaps we should fill these fields as well. */ t->rn_parent = t->rn_right = NULL; tt->rn_mask = NULL; tt->rn_dupedkey = NULL; tt->rn_bmask = 0; #endif tt->rn_bit = -1; tt->rn_key = (caddr_t)v; tt->rn_parent = t; tt->rn_flags = t->rn_flags = RNF_ACTIVE; tt->rn_mklist = t->rn_mklist = 0; #ifdef RN_DEBUG tt->rn_info = rn_nodenum++; t->rn_info = rn_nodenum++; tt->rn_twin = t; tt->rn_ybro = rn_clist; rn_clist = tt; #endif return (t); } static struct radix_node * rn_insert(void *v_arg, struct radix_head *head, int *dupentry, struct radix_node nodes[2]) { caddr_t v = v_arg; struct radix_node *top = head->rnh_treetop; int head_off = top->rn_offset, vlen = LEN(v); struct radix_node *t = rn_search(v_arg, top); caddr_t cp = v + head_off; unsigned b; struct radix_node *p, *tt, *x; /* * Find first bit at which v and t->rn_key differ */ caddr_t cp2 = t->rn_key + head_off; int cmp_res; caddr_t cplim = v + vlen; while (cp < cplim) if (*cp2++ != *cp++) goto on1; *dupentry = 1; return (t); on1: *dupentry = 0; cmp_res = (cp[-1] ^ cp2[-1]) & 0xff; for (b = (cp - v) << 3; cmp_res; b--) cmp_res >>= 1; x = top; cp = v; do { p = x; if (cp[x->rn_offset] & x->rn_bmask) x = x->rn_right; else x = x->rn_left; } while (b > (unsigned) x->rn_bit); /* x->rn_bit < b && x->rn_bit >= 0 */ #ifdef RN_DEBUG if (rn_debug) log(LOG_DEBUG, "rn_insert: Going In:\n"), traverse(p); #endif t = rn_newpair(v_arg, b, nodes); tt = t->rn_left; if ((cp[p->rn_offset] & p->rn_bmask) == 0) p->rn_left = t; else p->rn_right = t; x->rn_parent = t; t->rn_parent = p; /* frees x, p as temp vars below */ if ((cp[t->rn_offset] & t->rn_bmask) == 0) { t->rn_right = x; } else { t->rn_right = tt; t->rn_left = x; } #ifdef RN_DEBUG if (rn_debug) log(LOG_DEBUG, "rn_insert: Coming Out:\n"), traverse(p); #endif return (tt); } static struct radix_node * rn_addmask(void *n_arg, struct radix_mask_head *maskhead, int search, int skip) { unsigned char *netmask = n_arg; unsigned char *cp, *cplim; struct radix_node *x; int b = 0, mlen, j; int maskduplicated, isnormal; struct radix_node *saved_x; unsigned char addmask_key[RADIX_MAX_KEY_LEN]; if ((mlen = LEN(netmask)) > RADIX_MAX_KEY_LEN) mlen = RADIX_MAX_KEY_LEN; if (skip == 0) skip = 1; if (mlen <= skip) return (maskhead->mask_nodes); bzero(addmask_key, RADIX_MAX_KEY_LEN); if (skip > 1) bcopy(rn_ones + 1, addmask_key + 1, skip - 1); bcopy(netmask + skip, addmask_key + skip, mlen - skip); /* * Trim trailing zeroes. */ for (cp = addmask_key + mlen; (cp > addmask_key) && cp[-1] == 0;) cp--; mlen = cp - addmask_key; if (mlen <= skip) return (maskhead->mask_nodes); *addmask_key = mlen; x = rn_search(addmask_key, maskhead->head.rnh_treetop); if (bcmp(addmask_key, x->rn_key, mlen) != 0) x = NULL; if (x || search) return (x); R_Zalloc(x, struct radix_node *, RADIX_MAX_KEY_LEN + 2 * sizeof (*x)); if ((saved_x = x) == NULL) return (0); netmask = cp = (unsigned char *)(x + 2); bcopy(addmask_key, cp, mlen); x = rn_insert(cp, &maskhead->head, &maskduplicated, x); if (maskduplicated) { log(LOG_ERR, "rn_addmask: mask impossibly already in tree"); R_Free(saved_x); return (x); } /* * Calculate index of mask, and check for normalcy. * First find the first byte with a 0 bit, then if there are * more bits left (remember we already trimmed the trailing 0's), * the bits should be contiguous, otherwise we have got * a non-contiguous mask. */ #define CONTIG(_c) (((~(_c) + 1) & (_c)) == (unsigned char)(~(_c) + 1)) cplim = netmask + mlen; isnormal = 1; for (cp = netmask + skip; (cp < cplim) && *(u_char *)cp == 0xff;) cp++; if (cp != cplim) { for (j = 0x80; (j & *cp) != 0; j >>= 1) b++; if (!CONTIG(*cp) || cp != (cplim - 1)) isnormal = 0; } b += (cp - netmask) << 3; x->rn_bit = -1 - b; if (isnormal) x->rn_flags |= RNF_NORMAL; return (x); } static int /* XXX: arbitrary ordering for non-contiguous masks */ rn_lexobetter(void *m_arg, void *n_arg) { u_char *mp = m_arg, *np = n_arg, *lim; if (LEN(mp) > LEN(np)) return (1); /* not really, but need to check longer one first */ if (LEN(mp) == LEN(np)) for (lim = mp + LEN(mp); mp < lim;) if (*mp++ > *np++) return (1); return (0); } static struct radix_mask * rn_new_radix_mask(struct radix_node *tt, struct radix_mask *next) { struct radix_mask *m; R_Malloc(m, struct radix_mask *, sizeof (struct radix_mask)); if (m == NULL) { log(LOG_ERR, "Failed to allocate route mask\n"); return (0); } bzero(m, sizeof(*m)); m->rm_bit = tt->rn_bit; m->rm_flags = tt->rn_flags; if (tt->rn_flags & RNF_NORMAL) m->rm_leaf = tt; else m->rm_mask = tt->rn_mask; m->rm_mklist = next; tt->rn_mklist = m; return (m); } struct radix_node * rn_addroute(void *v_arg, void *n_arg, struct radix_head *head, struct radix_node treenodes[2]) { caddr_t v = (caddr_t)v_arg, netmask = (caddr_t)n_arg; struct radix_node *t, *x = NULL, *tt; struct radix_node *saved_tt, *top = head->rnh_treetop; short b = 0, b_leaf = 0; int keyduplicated; caddr_t mmask; struct radix_mask *m, **mp; /* * In dealing with non-contiguous masks, there may be * many different routes which have the same mask. * We will find it useful to have a unique pointer to * the mask to speed avoiding duplicate references at * nodes and possibly save time in calculating indices. */ if (netmask) { x = rn_addmask(netmask, head->rnh_masks, 0, top->rn_offset); if (x == NULL) return (0); b_leaf = x->rn_bit; b = -1 - x->rn_bit; netmask = x->rn_key; } /* * Deal with duplicated keys: attach node to previous instance */ saved_tt = tt = rn_insert(v, head, &keyduplicated, treenodes); if (keyduplicated) { for (t = tt; tt; t = tt, tt = tt->rn_dupedkey) { if (tt->rn_mask == netmask) return (0); if (netmask == 0 || (tt->rn_mask && ((b_leaf < tt->rn_bit) /* index(netmask) > node */ || rn_refines(netmask, tt->rn_mask) || rn_lexobetter(netmask, tt->rn_mask)))) break; } /* * If the mask is not duplicated, we wouldn't * find it among possible duplicate key entries * anyway, so the above test doesn't hurt. * * We sort the masks for a duplicated key the same way as * in a masklist -- most specific to least specific. * This may require the unfortunate nuisance of relocating * the head of the list. * * We also reverse, or doubly link the list through the * parent pointer. */ if (tt == saved_tt) { struct radix_node *xx = x; /* link in at head of list */ (tt = treenodes)->rn_dupedkey = t; tt->rn_flags = t->rn_flags; tt->rn_parent = x = t->rn_parent; t->rn_parent = tt; /* parent */ if (x->rn_left == t) x->rn_left = tt; else x->rn_right = tt; saved_tt = tt; x = xx; } else { (tt = treenodes)->rn_dupedkey = t->rn_dupedkey; t->rn_dupedkey = tt; tt->rn_parent = t; /* parent */ if (tt->rn_dupedkey) /* parent */ tt->rn_dupedkey->rn_parent = tt; /* parent */ } #ifdef RN_DEBUG t=tt+1; tt->rn_info = rn_nodenum++; t->rn_info = rn_nodenum++; tt->rn_twin = t; tt->rn_ybro = rn_clist; rn_clist = tt; #endif tt->rn_key = (caddr_t) v; tt->rn_bit = -1; tt->rn_flags = RNF_ACTIVE; } /* * Put mask in tree. */ if (netmask) { tt->rn_mask = netmask; tt->rn_bit = x->rn_bit; tt->rn_flags |= x->rn_flags & RNF_NORMAL; } t = saved_tt->rn_parent; if (keyduplicated) goto on2; b_leaf = -1 - t->rn_bit; if (t->rn_right == saved_tt) x = t->rn_left; else x = t->rn_right; /* Promote general routes from below */ if (x->rn_bit < 0) { for (mp = &t->rn_mklist; x; x = x->rn_dupedkey) if (x->rn_mask && (x->rn_bit >= b_leaf) && x->rn_mklist == 0) { *mp = m = rn_new_radix_mask(x, 0); if (m) mp = &m->rm_mklist; } } else if (x->rn_mklist) { /* * Skip over masks whose index is > that of new node */ for (mp = &x->rn_mklist; (m = *mp); mp = &m->rm_mklist) if (m->rm_bit >= b_leaf) break; t->rn_mklist = m; *mp = NULL; } on2: /* Add new route to highest possible ancestor's list */ if ((netmask == 0) || (b > t->rn_bit )) return (tt); /* can't lift at all */ b_leaf = tt->rn_bit; do { x = t; t = t->rn_parent; } while (b <= t->rn_bit && x != top); /* * Search through routes associated with node to * insert new route according to index. * Need same criteria as when sorting dupedkeys to avoid * double loop on deletion. */ for (mp = &x->rn_mklist; (m = *mp); mp = &m->rm_mklist) { if (m->rm_bit < b_leaf) continue; if (m->rm_bit > b_leaf) break; if (m->rm_flags & RNF_NORMAL) { mmask = m->rm_leaf->rn_mask; if (tt->rn_flags & RNF_NORMAL) { log(LOG_ERR, "Non-unique normal route, mask not entered\n"); return (tt); } } else mmask = m->rm_mask; if (mmask == netmask) { m->rm_refs++; tt->rn_mklist = m; return (tt); } if (rn_refines(netmask, mmask) || rn_lexobetter(netmask, mmask)) break; } *mp = rn_new_radix_mask(tt, *mp); return (tt); } struct radix_node * rn_delete(void *v_arg, void *netmask_arg, struct radix_head *head) { struct radix_node *t, *p, *x, *tt; struct radix_mask *m, *saved_m, **mp; struct radix_node *dupedkey, *saved_tt, *top; caddr_t v, netmask; int b, head_off, vlen; v = v_arg; netmask = netmask_arg; x = head->rnh_treetop; tt = rn_search(v, x); head_off = x->rn_offset; vlen = LEN(v); saved_tt = tt; top = x; if (tt == NULL || bcmp(v + head_off, tt->rn_key + head_off, vlen - head_off)) return (0); /* * Delete our route from mask lists. */ if (netmask) { x = rn_addmask(netmask, head->rnh_masks, 1, head_off); if (x == NULL) return (0); netmask = x->rn_key; while (tt->rn_mask != netmask) if ((tt = tt->rn_dupedkey) == NULL) return (0); } if (tt->rn_mask == 0 || (saved_m = m = tt->rn_mklist) == NULL) goto on1; if (tt->rn_flags & RNF_NORMAL) { if (m->rm_leaf != tt || m->rm_refs > 0) { log(LOG_ERR, "rn_delete: inconsistent annotation\n"); return (0); /* dangling ref could cause disaster */ } } else { if (m->rm_mask != tt->rn_mask) { log(LOG_ERR, "rn_delete: inconsistent annotation\n"); goto on1; } if (--m->rm_refs >= 0) goto on1; } b = -1 - tt->rn_bit; t = saved_tt->rn_parent; if (b > t->rn_bit) goto on1; /* Wasn't lifted at all */ do { x = t; t = t->rn_parent; } while (b <= t->rn_bit && x != top); for (mp = &x->rn_mklist; (m = *mp); mp = &m->rm_mklist) if (m == saved_m) { *mp = m->rm_mklist; R_Free(m); break; } if (m == NULL) { log(LOG_ERR, "rn_delete: couldn't find our annotation\n"); if (tt->rn_flags & RNF_NORMAL) return (0); /* Dangling ref to us */ } on1: /* * Eliminate us from tree */ if (tt->rn_flags & RNF_ROOT) return (0); #ifdef RN_DEBUG /* Get us out of the creation list */ for (t = rn_clist; t && t->rn_ybro != tt; t = t->rn_ybro) {} if (t) t->rn_ybro = tt->rn_ybro; #endif t = tt->rn_parent; dupedkey = saved_tt->rn_dupedkey; if (dupedkey) { /* * Here, tt is the deletion target and * saved_tt is the head of the dupekey chain. */ if (tt == saved_tt) { /* remove from head of chain */ x = dupedkey; x->rn_parent = t; if (t->rn_left == tt) t->rn_left = x; else t->rn_right = x; } else { /* find node in front of tt on the chain */ for (x = p = saved_tt; p && p->rn_dupedkey != tt;) p = p->rn_dupedkey; if (p) { p->rn_dupedkey = tt->rn_dupedkey; if (tt->rn_dupedkey) /* parent */ tt->rn_dupedkey->rn_parent = p; /* parent */ } else log(LOG_ERR, "rn_delete: couldn't find us\n"); } t = tt + 1; if (t->rn_flags & RNF_ACTIVE) { #ifndef RN_DEBUG *++x = *t; p = t->rn_parent; #else b = t->rn_info; *++x = *t; t->rn_info = b; p = t->rn_parent; #endif if (p->rn_left == t) p->rn_left = x; else p->rn_right = x; x->rn_left->rn_parent = x; x->rn_right->rn_parent = x; } goto out; } if (t->rn_left == tt) x = t->rn_right; else x = t->rn_left; p = t->rn_parent; if (p->rn_right == t) p->rn_right = x; else p->rn_left = x; x->rn_parent = p; /* * Demote routes attached to us. */ if (t->rn_mklist) { if (x->rn_bit >= 0) { for (mp = &x->rn_mklist; (m = *mp);) mp = &m->rm_mklist; *mp = t->rn_mklist; } else { /* If there are any key,mask pairs in a sibling duped-key chain, some subset will appear sorted in the same order attached to our mklist */ for (m = t->rn_mklist; m && x; x = x->rn_dupedkey) if (m == x->rn_mklist) { struct radix_mask *mm = m->rm_mklist; x->rn_mklist = 0; if (--(m->rm_refs) < 0) R_Free(m); m = mm; } if (m) log(LOG_ERR, "rn_delete: Orphaned Mask %p at %p\n", m, x); } } /* * We may be holding an active internal node in the tree. */ x = tt + 1; if (t != x) { #ifndef RN_DEBUG *t = *x; #else b = t->rn_info; *t = *x; t->rn_info = b; #endif t->rn_left->rn_parent = t; t->rn_right->rn_parent = t; p = x->rn_parent; if (p->rn_left == x) p->rn_left = t; else p->rn_right = t; } out: tt->rn_flags &= ~RNF_ACTIVE; tt[1].rn_flags &= ~RNF_ACTIVE; return (tt); } /* * This is the same as rn_walktree() except for the parameters and the * exit. */ int rn_walktree_from(struct radix_head *h, void *a, void *m, walktree_f_t *f, void *w) { int error; struct radix_node *base, *next; u_char *xa = (u_char *)a; u_char *xm = (u_char *)m; struct radix_node *rn, *last = NULL; /* shut up gcc */ int stopping = 0; int lastb; KASSERT(m != NULL, ("%s: mask needs to be specified", __func__)); /* * rn_search_m is sort-of-open-coded here. We cannot use the * function because we need to keep track of the last node seen. */ /* printf("about to search\n"); */ for (rn = h->rnh_treetop; rn->rn_bit >= 0; ) { last = rn; /* printf("rn_bit %d, rn_bmask %x, xm[rn_offset] %x\n", rn->rn_bit, rn->rn_bmask, xm[rn->rn_offset]); */ if (!(rn->rn_bmask & xm[rn->rn_offset])) { break; } if (rn->rn_bmask & xa[rn->rn_offset]) { rn = rn->rn_right; } else { rn = rn->rn_left; } } /* printf("done searching\n"); */ /* * Two cases: either we stepped off the end of our mask, * in which case last == rn, or we reached a leaf, in which * case we want to start from the leaf. */ if (rn->rn_bit >= 0) rn = last; lastb = last->rn_bit; /* printf("rn %p, lastb %d\n", rn, lastb);*/ /* * This gets complicated because we may delete the node * while applying the function f to it, so we need to calculate * the successor node in advance. */ while (rn->rn_bit >= 0) rn = rn->rn_left; while (!stopping) { /* printf("node %p (%d)\n", rn, rn->rn_bit); */ base = rn; /* If at right child go back up, otherwise, go right */ while (rn->rn_parent->rn_right == rn && !(rn->rn_flags & RNF_ROOT)) { rn = rn->rn_parent; /* if went up beyond last, stop */ if (rn->rn_bit <= lastb) { stopping = 1; /* printf("up too far\n"); */ /* * XXX we should jump to the 'Process leaves' * part, because the values of 'rn' and 'next' * we compute will not be used. Not a big deal * because this loop will terminate, but it is * inefficient and hard to understand! */ } } /* * At the top of the tree, no need to traverse the right * half, prevent the traversal of the entire tree in the * case of default route. */ if (rn->rn_parent->rn_flags & RNF_ROOT) stopping = 1; /* Find the next *leaf* since next node might vanish, too */ for (rn = rn->rn_parent->rn_right; rn->rn_bit >= 0;) rn = rn->rn_left; next = rn; /* Process leaves */ while ((rn = base) != NULL) { base = rn->rn_dupedkey; /* printf("leaf %p\n", rn); */ if (!(rn->rn_flags & RNF_ROOT) && (error = (*f)(rn, w))) return (error); } rn = next; if (rn->rn_flags & RNF_ROOT) { /* printf("root, stopping"); */ stopping = 1; } } return (0); } int rn_walktree(struct radix_head *h, walktree_f_t *f, void *w) { int error; struct radix_node *base, *next; struct radix_node *rn = h->rnh_treetop; /* * This gets complicated because we may delete the node * while applying the function f to it, so we need to calculate * the successor node in advance. */ /* First time through node, go left */ while (rn->rn_bit >= 0) rn = rn->rn_left; for (;;) { base = rn; /* If at right child go back up, otherwise, go right */ while (rn->rn_parent->rn_right == rn && (rn->rn_flags & RNF_ROOT) == 0) rn = rn->rn_parent; /* Find the next *leaf* since next node might vanish, too */ for (rn = rn->rn_parent->rn_right; rn->rn_bit >= 0;) rn = rn->rn_left; next = rn; /* Process leaves */ while ((rn = base)) { base = rn->rn_dupedkey; if (!(rn->rn_flags & RNF_ROOT) && (error = (*f)(rn, w))) return (error); } rn = next; if (rn->rn_flags & RNF_ROOT) return (0); } /* NOTREACHED */ } /* * Initialize an empty tree. This has 3 nodes, which are passed * via base_nodes (in the order ) and are * marked RNF_ROOT so they cannot be freed. * The leaves have all-zero and all-one keys, with significant * bits starting at 'off'. */ void rn_inithead_internal(struct radix_head *rh, struct radix_node *base_nodes, int off) { struct radix_node *t, *tt, *ttt; t = rn_newpair(rn_zeros, off, base_nodes); ttt = base_nodes + 2; t->rn_right = ttt; t->rn_parent = t; tt = t->rn_left; /* ... which in turn is base_nodes */ tt->rn_flags = t->rn_flags = RNF_ROOT | RNF_ACTIVE; tt->rn_bit = -1 - off; *ttt = *tt; ttt->rn_key = rn_ones; rh->rnh_treetop = t; } static void rn_detachhead_internal(struct radix_head *head) { KASSERT((head != NULL), ("%s: head already freed", __func__)); /* Free nodes. */ R_Free(head); } /* Functions used by 'struct radix_node_head' users */ int rn_inithead(void **head, int off) { struct radix_node_head *rnh; struct radix_mask_head *rmh; rnh = *head; rmh = NULL; if (*head != NULL) return (1); R_Zalloc(rnh, struct radix_node_head *, sizeof (*rnh)); R_Zalloc(rmh, struct radix_mask_head *, sizeof (*rmh)); if (rnh == NULL || rmh == NULL) { if (rnh != NULL) R_Free(rnh); if (rmh != NULL) R_Free(rmh); return (0); } /* Init trees */ rn_inithead_internal(&rnh->rh, rnh->rnh_nodes, off); rn_inithead_internal(&rmh->head, rmh->mask_nodes, 0); *head = rnh; rnh->rh.rnh_masks = rmh; /* Finally, set base callbacks */ rnh->rnh_addaddr = rn_addroute; rnh->rnh_deladdr = rn_delete; rnh->rnh_matchaddr = rn_match; rnh->rnh_lookup = rn_lookup; rnh->rnh_walktree = rn_walktree; rnh->rnh_walktree_from = rn_walktree_from; return (1); } static int rn_freeentry(struct radix_node *rn, void *arg) { struct radix_head * const rnh = arg; struct radix_node *x; x = (struct radix_node *)rn_delete(rn + 2, NULL, rnh); if (x != NULL) R_Free(x); return (0); } int rn_detachhead(void **head) { struct radix_node_head *rnh; KASSERT((head != NULL && *head != NULL), ("%s: head already freed", __func__)); rnh = (struct radix_node_head *)(*head); rn_walktree(&rnh->rh.rnh_masks->head, rn_freeentry, rnh->rh.rnh_masks); rn_detachhead_internal(&rnh->rh.rnh_masks->head); rn_detachhead_internal(&rnh->rh); *head = NULL; return (1); } diff --git a/sys/net/radix.h b/sys/net/radix.h index a0e5e5c5aa3f..97555ee9e16d 100644 --- a/sys/net/radix.h +++ b/sys/net/radix.h @@ -1,189 +1,190 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1988, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)radix.h 8.2 (Berkeley) 10/31/94 * $FreeBSD$ */ #ifndef _RADIX_H_ #define _RADIX_H_ #ifdef _KERNEL #include #include #include #endif #ifdef MALLOC_DECLARE MALLOC_DECLARE(M_RTABLE); #endif /* * Radix search tree node layout. */ struct radix_node { struct radix_mask *rn_mklist; /* list of masks contained in subtree */ struct radix_node *rn_parent; /* parent */ short rn_bit; /* bit offset; -1-index(netmask) */ char rn_bmask; /* node: mask for bit test*/ u_char rn_flags; /* enumerated next */ #define RNF_NORMAL 1 /* leaf contains normal route */ #define RNF_ROOT 2 /* leaf is root leaf for tree */ #define RNF_ACTIVE 4 /* This node is alive (for rtfree) */ union { struct { /* leaf only data: */ caddr_t rn_Key; /* object of search */ caddr_t rn_Mask; /* netmask, if present */ struct radix_node *rn_Dupedkey; } rn_leaf; struct { /* node only data: */ int rn_Off; /* where to start compare */ struct radix_node *rn_L;/* progeny */ struct radix_node *rn_R;/* progeny */ } rn_node; } rn_u; #ifdef RN_DEBUG int rn_info; struct radix_node *rn_twin; struct radix_node *rn_ybro; #endif }; #define rn_dupedkey rn_u.rn_leaf.rn_Dupedkey #define rn_key rn_u.rn_leaf.rn_Key #define rn_mask rn_u.rn_leaf.rn_Mask #define rn_offset rn_u.rn_node.rn_Off #define rn_left rn_u.rn_node.rn_L #define rn_right rn_u.rn_node.rn_R /* * Annotations to tree concerning potential routes applying to subtrees. */ struct radix_mask { short rm_bit; /* bit offset; -1-index(netmask) */ char rm_unused; /* cf. rn_bmask */ u_char rm_flags; /* cf. rn_flags */ struct radix_mask *rm_mklist; /* more masks to try */ union { caddr_t rmu_mask; /* the mask */ struct radix_node *rmu_leaf; /* for normal routes */ } rm_rmu; int rm_refs; /* # of references to this struct */ }; #define rm_mask rm_rmu.rmu_mask #define rm_leaf rm_rmu.rmu_leaf /* extra field would make 32 bytes */ struct radix_head; typedef int walktree_f_t(struct radix_node *, void *); typedef struct radix_node *rn_matchaddr_f_t(void *v, struct radix_head *head); typedef struct radix_node *rn_addaddr_f_t(void *v, void *mask, struct radix_head *head, struct radix_node nodes[]); typedef struct radix_node *rn_deladdr_f_t(void *v, void *mask, struct radix_head *head); typedef struct radix_node *rn_lookup_f_t(void *v, void *mask, struct radix_head *head); typedef int rn_walktree_t(struct radix_head *head, walktree_f_t *f, void *w); typedef int rn_walktree_from_t(struct radix_head *head, void *a, void *m, walktree_f_t *f, void *w); typedef void rn_close_t(struct radix_node *rn, struct radix_head *head); +struct radix_node *rn_nextprefix(struct radix_node *rn); struct radix_mask_head; struct radix_head { struct radix_node *rnh_treetop; struct radix_mask_head *rnh_masks; /* Storage for our masks */ }; struct radix_node_head { struct radix_head rh; rn_matchaddr_f_t *rnh_matchaddr; /* longest match for sockaddr */ rn_addaddr_f_t *rnh_addaddr; /* add based on sockaddr*/ rn_deladdr_f_t *rnh_deladdr; /* remove based on sockaddr */ rn_lookup_f_t *rnh_lookup; /* exact match for sockaddr */ rn_walktree_t *rnh_walktree; /* traverse tree */ rn_walktree_from_t *rnh_walktree_from; /* traverse tree below a */ rn_close_t *rnh_close; /*do something when the last ref drops*/ struct radix_node rnh_nodes[3]; /* empty tree for common case */ #ifdef _KERNEL struct rmlock rnh_lock; /* locks entire radix tree */ #endif }; struct radix_mask_head { struct radix_head head; struct radix_node mask_nodes[3]; }; void rn_inithead_internal(struct radix_head *rh, struct radix_node *base_nodes, int off); #ifndef _KERNEL #define R_Malloc(p, t, n) (p = (t) malloc((unsigned int)(n))) #define R_Zalloc(p, t, n) (p = (t) calloc(1,(unsigned int)(n))) #define R_Free(p) free((char *)p); #else #define R_Malloc(p, t, n) (p = (t) malloc((unsigned long)(n), M_RTABLE, M_NOWAIT)) #define R_Zalloc(p, t, n) (p = (t) malloc((unsigned long)(n), M_RTABLE, M_NOWAIT | M_ZERO)) #define R_Free(p) free((caddr_t)p, M_RTABLE); #define RADIX_NODE_HEAD_RLOCK_TRACKER struct rm_priotracker _rnh_tracker #define RADIX_NODE_HEAD_LOCK_INIT(rnh) \ rm_init(&(rnh)->rnh_lock, "radix node head") #define RADIX_NODE_HEAD_LOCK(rnh) rm_wlock(&(rnh)->rnh_lock) #define RADIX_NODE_HEAD_UNLOCK(rnh) rm_wunlock(&(rnh)->rnh_lock) #define RADIX_NODE_HEAD_RLOCK(rnh) rm_rlock(&(rnh)->rnh_lock,\ &_rnh_tracker) #define RADIX_NODE_HEAD_RUNLOCK(rnh) rm_runlock(&(rnh)->rnh_lock,\ &_rnh_tracker) #define RADIX_NODE_HEAD_DESTROY(rnh) rm_destroy(&(rnh)->rnh_lock) #define RADIX_NODE_HEAD_LOCK_ASSERT(rnh) rm_assert(&(rnh)->rnh_lock, RA_LOCKED) #define RADIX_NODE_HEAD_WLOCK_ASSERT(rnh) rm_assert(&(rnh)->rnh_lock, RA_WLOCKED) #endif /* _KERNEL */ int rn_inithead(void **, int); int rn_detachhead(void **); int rn_refines(void *, void *); struct radix_node *rn_addroute(void *, void *, struct radix_head *, struct radix_node[2]); struct radix_node *rn_delete(void *, void *, struct radix_head *); struct radix_node *rn_lookup (void *v_arg, void *m_arg, struct radix_head *head); struct radix_node *rn_match(void *, struct radix_head *); int rn_walktree_from(struct radix_head *h, void *a, void *m, walktree_f_t *f, void *w); int rn_walktree(struct radix_head *, walktree_f_t *, void *); #endif /* _RADIX_H_ */ diff --git a/sys/net/route/route_ctl.h b/sys/net/route/route_ctl.h index 229c762d4a73..a670979df8c9 100644 --- a/sys/net/route/route_ctl.h +++ b/sys/net/route/route_ctl.h @@ -1,160 +1,163 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2020 Alexander V. Chernikov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ /* * This header file contains public functions and structures used for * routing table manipulations. */ #ifndef _NET_ROUTE_ROUTE_CTL_H_ #define _NET_ROUTE_ROUTE_CTL_H_ struct rib_cmd_info { uint8_t rc_cmd; /* RTM_ADD|RTM_DEL|RTM_CHANGE */ uint8_t spare[3]; uint32_t rc_nh_weight; /* new nhop weight */ struct rtentry *rc_rt; /* Target entry */ struct nhop_object *rc_nh_old; /* Target nhop OR mpath */ struct nhop_object *rc_nh_new; /* Target nhop OR mpath */ }; int rib_add_route(uint32_t fibnum, struct rt_addrinfo *info, struct rib_cmd_info *rc); int rib_del_route(uint32_t fibnum, struct rt_addrinfo *info, struct rib_cmd_info *rc); int rib_change_route(uint32_t fibnum, struct rt_addrinfo *info, struct rib_cmd_info *rc); int rib_action(uint32_t fibnum, int action, struct rt_addrinfo *info, struct rib_cmd_info *rc); int rib_handle_ifaddr_info(uint32_t fibnum, int cmd, struct rt_addrinfo *info); typedef void route_notification_t(struct rib_cmd_info *rc, void *); void rib_decompose_notification(struct rib_cmd_info *rc, route_notification_t *cb, void *cbdata); int rib_add_redirect(u_int fibnum, struct sockaddr *dst, struct sockaddr *gateway, struct sockaddr *author, struct ifnet *ifp, int flags, int expire_sec); /* common flags for the functions below */ #define RIB_FLAG_WLOCK 0x01 /* Need exclusive rnh lock */ #define RIB_FLAG_LOCKED 0x02 /* Do not explicitly acquire rnh lock */ enum rib_walk_hook { RIB_WALK_HOOK_PRE, /* Hook is called before iteration */ RIB_WALK_HOOK_POST, /* Hook is called after iteration */ }; typedef int rib_walktree_f_t(struct rtentry *, void *); typedef void rib_walk_hook_f_t(struct rib_head *rnh, enum rib_walk_hook stage, void *arg); void rib_walk(uint32_t fibnum, int af, bool wlock, rib_walktree_f_t *wa_f, void *arg); void rib_walk_ext(uint32_t fibnum, int af, bool wlock, rib_walktree_f_t *wa_f, rib_walk_hook_f_t *hook_f, void *arg); void rib_walk_ext_internal(struct rib_head *rnh, bool wlock, rib_walktree_f_t *wa_f, rib_walk_hook_f_t *hook_f, void *arg); void rib_walk_ext_locked(struct rib_head *rnh, rib_walktree_f_t *wa_f, rib_walk_hook_f_t *hook_f, void *arg); void rib_walk_from(uint32_t fibnum, int family, uint32_t flags, struct sockaddr *prefix, struct sockaddr *mask, rib_walktree_f_t *wa_f, void *arg); void rib_walk_del(u_int fibnum, int family, rib_filter_f_t *filter_f, void *arg, bool report); void rib_foreach_table_walk(int family, bool wlock, rib_walktree_f_t *wa_f, rib_walk_hook_f_t *hook_f, void *arg); void rib_foreach_table_walk_del(int family, rib_filter_f_t *filter_f, void *arg); struct nhop_object; struct nhgrp_object; struct route_nhop_data { union { struct nhop_object *rnd_nhop; struct nhgrp_object *rnd_nhgrp; }; uint32_t rnd_weight; }; const struct rtentry *rib_lookup_prefix(uint32_t fibnum, int family, const struct sockaddr *dst, const struct sockaddr *netmask, struct route_nhop_data *rnd); const struct rtentry *rib_lookup_lpm(uint32_t fibnum, int family, const struct sockaddr *dst, struct route_nhop_data *rnd); /* rtentry accessors */ bool rt_is_host(const struct rtentry *rt); sa_family_t rt_get_family(const struct rtentry *); struct nhop_object *rt_get_raw_nhop(const struct rtentry *rt); #ifdef INET struct in_addr; void rt_get_inet_prefix_plen(const struct rtentry *rt, struct in_addr *paddr, int *plen, uint32_t *pscopeid); void rt_get_inet_prefix_pmask(const struct rtentry *rt, struct in_addr *paddr, struct in_addr *pmask, uint32_t *pscopeid); +struct rtentry *rt_get_inet_parent(uint32_t fibnum, struct in_addr addr, int plen); #endif #ifdef INET6 struct in6_addr; void rt_get_inet6_prefix_plen(const struct rtentry *rt, struct in6_addr *paddr, int *plen, uint32_t *pscopeid); void rt_get_inet6_prefix_pmask(const struct rtentry *rt, struct in6_addr *paddr, struct in6_addr *pmask, uint32_t *pscopeid); +struct rtentry *rt_get_inet6_parent(uint32_t fibnum, const struct in6_addr *paddr, + int plen); #endif /* Nexthops */ uint32_t nhops_get_count(struct rib_head *rh); /* Multipath */ struct weightened_nhop; struct weightened_nhop *nhgrp_get_nhops(struct nhgrp_object *nhg, uint32_t *pnum_nhops); uint32_t nhgrp_get_count(struct rib_head *rh); /* Route subscriptions */ enum rib_subscription_type { RIB_NOTIFY_IMMEDIATE, RIB_NOTIFY_DELAYED }; struct rib_subscription; typedef void rib_subscription_cb_t(struct rib_head *rnh, struct rib_cmd_info *rc, void *arg); struct rib_subscription *rib_subscribe(uint32_t fibnum, int family, rib_subscription_cb_t *f, void *arg, enum rib_subscription_type type, bool waitok); struct rib_subscription *rib_subscribe_internal(struct rib_head *rnh, rib_subscription_cb_t *f, void *arg, enum rib_subscription_type type, bool waitok); struct rib_subscription *rib_subscribe_locked(struct rib_head *rnh, rib_subscription_cb_t *f, void *arg, enum rib_subscription_type type); void rib_unsubscribe(struct rib_subscription *rs); void rib_unsubscribe_locked(struct rib_subscription *rs); #endif diff --git a/sys/net/route/route_helpers.c b/sys/net/route/route_helpers.c index 5d29197cc4fb..569e13a308eb 100644 --- a/sys/net/route/route_helpers.c +++ b/sys/net/route/route_helpers.c @@ -1,417 +1,567 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2020 Alexander V. Chernikov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_route.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET #include #endif #ifdef INET6 #include +#include #endif #include /* * RIB helper functions. */ void rib_walk_ext_locked(struct rib_head *rnh, rib_walktree_f_t *wa_f, rib_walk_hook_f_t *hook_f, void *arg) { if (hook_f != NULL) hook_f(rnh, RIB_WALK_HOOK_PRE, arg); rnh->rnh_walktree(&rnh->head, (walktree_f_t *)wa_f, arg); if (hook_f != NULL) hook_f(rnh, RIB_WALK_HOOK_POST, arg); } /* * Calls @wa_f with @arg for each entry in the table specified by * @af and @fibnum. * * @ss_t callback is called before and after the tree traversal * while holding table lock. * * Table is traversed under read lock unless @wlock is set. */ void rib_walk_ext_internal(struct rib_head *rnh, bool wlock, rib_walktree_f_t *wa_f, rib_walk_hook_f_t *hook_f, void *arg) { RIB_RLOCK_TRACKER; if (wlock) RIB_WLOCK(rnh); else RIB_RLOCK(rnh); rib_walk_ext_locked(rnh, wa_f, hook_f, arg); if (wlock) RIB_WUNLOCK(rnh); else RIB_RUNLOCK(rnh); } void rib_walk_ext(uint32_t fibnum, int family, bool wlock, rib_walktree_f_t *wa_f, rib_walk_hook_f_t *hook_f, void *arg) { struct rib_head *rnh; if ((rnh = rt_tables_get_rnh(fibnum, family)) != NULL) rib_walk_ext_internal(rnh, wlock, wa_f, hook_f, arg); } /* * Calls @wa_f with @arg for each entry in the table specified by * @af and @fibnum. * * Table is traversed under read lock unless @wlock is set. */ void rib_walk(uint32_t fibnum, int family, bool wlock, rib_walktree_f_t *wa_f, void *arg) { rib_walk_ext(fibnum, family, wlock, wa_f, NULL, arg); } /* * Calls @wa_f with @arg for each entry in the table matching @prefix/@mask. * * The following flags are supported: * RIB_FLAG_WLOCK: acquire exclusive lock * RIB_FLAG_LOCKED: Assumes the table is already locked & skip locking * * By default, table is traversed under read lock. */ void rib_walk_from(uint32_t fibnum, int family, uint32_t flags, struct sockaddr *prefix, struct sockaddr *mask, rib_walktree_f_t *wa_f, void *arg) { RIB_RLOCK_TRACKER; struct rib_head *rnh = rt_tables_get_rnh(fibnum, family); if (rnh == NULL) return; if (flags & RIB_FLAG_WLOCK) RIB_WLOCK(rnh); else if (!(flags & RIB_FLAG_LOCKED)) RIB_RLOCK(rnh); rnh->rnh_walktree_from(&rnh->head, prefix, mask, (walktree_f_t *)wa_f, arg); if (flags & RIB_FLAG_WLOCK) RIB_WUNLOCK(rnh); else if (!(flags & RIB_FLAG_LOCKED)) RIB_RUNLOCK(rnh); } /* * Iterates over all existing fibs in system calling * @hook_f function before/after traversing each fib. * Calls @wa_f function for each element in current fib. * If af is not AF_UNSPEC, iterates over fibs in particular * address family. */ void rib_foreach_table_walk(int family, bool wlock, rib_walktree_f_t *wa_f, rib_walk_hook_f_t *hook_f, void *arg) { for (uint32_t fibnum = 0; fibnum < rt_numfibs; fibnum++) { /* Do we want some specific family? */ if (family != AF_UNSPEC) { rib_walk_ext(fibnum, family, wlock, wa_f, hook_f, arg); continue; } for (int i = 1; i <= AF_MAX; i++) rib_walk_ext(fibnum, i, wlock, wa_f, hook_f, arg); } } /* * Iterates over all existing fibs in system and deletes each element * for which @filter_f function returns non-zero value. * If @family is not AF_UNSPEC, iterates over fibs in particular * address family. */ void rib_foreach_table_walk_del(int family, rib_filter_f_t *filter_f, void *arg) { for (uint32_t fibnum = 0; fibnum < rt_numfibs; fibnum++) { /* Do we want some specific family? */ if (family != AF_UNSPEC) { rib_walk_del(fibnum, family, filter_f, arg, 0); continue; } for (int i = 1; i <= AF_MAX; i++) rib_walk_del(fibnum, i, filter_f, arg, 0); } } /* * Wrapper for the control plane functions for performing af-agnostic * lookups. * @fibnum: fib to perform the lookup. * @dst: sockaddr with family and addr filled in. IPv6 addresses needs to be in * deembedded from. * @flags: fib(9) flags. * @flowid: flow id for path selection in multipath use case. * * Returns nhop_object or NULL. * * Requires NET_EPOCH. * */ struct nhop_object * rib_lookup(uint32_t fibnum, const struct sockaddr *dst, uint32_t flags, uint32_t flowid) { struct nhop_object *nh; nh = NULL; switch (dst->sa_family) { #ifdef INET case AF_INET: { const struct sockaddr_in *a = (const struct sockaddr_in *)dst; nh = fib4_lookup(fibnum, a->sin_addr, 0, flags, flowid); break; } #endif #ifdef INET6 case AF_INET6: { const struct sockaddr_in6 *a = (const struct sockaddr_in6*)dst; nh = fib6_lookup(fibnum, &a->sin6_addr, a->sin6_scope_id, flags, flowid); break; } #endif } return (nh); } #ifdef ROUTE_MPATH static void decompose_change_notification(struct rib_cmd_info *rc, route_notification_t *cb, void *cbdata) { uint32_t num_old, num_new; uint32_t nh_idx_old, nh_idx_new; struct weightened_nhop *wn_old, *wn_new; struct weightened_nhop tmp = { NULL, 0 }; uint32_t idx_old = 0, idx_new = 0; struct rib_cmd_info rc_del = { .rc_cmd = RTM_DELETE, .rc_rt = rc->rc_rt }; struct rib_cmd_info rc_add = { .rc_cmd = RTM_ADD, .rc_rt = rc->rc_rt }; if (NH_IS_NHGRP(rc->rc_nh_old)) { wn_old = nhgrp_get_nhops((struct nhgrp_object *)rc->rc_nh_old, &num_old); } else { tmp.nh = rc->rc_nh_old; tmp.weight = rc->rc_nh_weight; wn_old = &tmp; num_old = 1; } if (NH_IS_NHGRP(rc->rc_nh_new)) { wn_new = nhgrp_get_nhops((struct nhgrp_object *)rc->rc_nh_new, &num_new); } else { tmp.nh = rc->rc_nh_new; tmp.weight = rc->rc_nh_weight; wn_new = &tmp; num_new = 1; } /* Use the fact that each @wn array is sorted */ /* * Want to convert into set of add and delete operations * [1] -> [1, 2] = A{2} * [2] -> [1, 2] = A{1} * [1, 2, 4]->[1, 3, 4] = A{2}, D{3} * [1, 2, 4]->[1, 4] = D{2} * [1, 2, 4] -> [3, 4] = D{1}, C{2,3} OR C{1,3}, D{2} OR D{1},D{2},A{3} * [1, 2] -> [3, 4] = * */ idx_old = 0; while ((idx_old < num_old) && (idx_new < num_new)) { nh_idx_old = wn_old[idx_old].nh->nh_priv->nh_idx; nh_idx_new = wn_new[idx_new].nh->nh_priv->nh_idx; if (nh_idx_old == nh_idx_new) { if (wn_old[idx_old].weight != wn_new[idx_new].weight) { /* Update weight by providing del/add notifications */ rc_del.rc_nh_old = wn_old[idx_old].nh; rc_del.rc_nh_weight = wn_old[idx_old].weight; cb(&rc_del, cbdata); rc_add.rc_nh_new = wn_new[idx_new].nh; rc_add.rc_nh_weight = wn_new[idx_new].weight; cb(&rc_add, cbdata); } idx_old++; idx_new++; } else if (nh_idx_old < nh_idx_new) { /* * [1, ~2~, 4], [1, ~3~, 4] * [1, ~2~, 5], [1, ~3~, 4] * [1, ~2~], [1, ~3~, 4] */ if ((idx_old + 1 >= num_old) || (wn_old[idx_old + 1].nh->nh_priv->nh_idx > nh_idx_new)) { /* Add new unless the next old item is still <= new */ rc_add.rc_nh_new = wn_new[idx_new].nh; rc_add.rc_nh_weight = wn_new[idx_new].weight; cb(&rc_add, cbdata); idx_new++; } /* In any case, delete current old */ rc_del.rc_nh_old = wn_old[idx_old].nh; rc_del.rc_nh_weight = wn_old[idx_old].weight; cb(&rc_del, cbdata); idx_old++; } else { /* * nh_idx_old > nh_idx_new * * [1, ~3~, 4], [1, ~2~, 4] * [1, ~3~, 5], [1, ~2~, 4] * [1, ~3~, 4], [1, ~2~] */ if ((idx_new + 1 >= num_new) || (wn_new[idx_new + 1].nh->nh_priv->nh_idx > nh_idx_old)) { /* No next item or next item is > current one */ rc_add.rc_nh_new = wn_new[idx_new].nh; rc_add.rc_nh_weight = wn_new[idx_new].weight; cb(&rc_add, cbdata); idx_new++; } /* In any case, delete current old */ rc_del.rc_nh_old = wn_old[idx_old].nh; rc_del.rc_nh_weight = wn_old[idx_old].weight; cb(&rc_del, cbdata); idx_old++; } } while (idx_old < num_old) { rc_del.rc_nh_old = wn_old[idx_old].nh; rc_del.rc_nh_weight = wn_old[idx_old].weight; cb(&rc_del, cbdata); idx_old++; } while (idx_new < num_new) { rc_add.rc_nh_new = wn_new[idx_new].nh; rc_add.rc_nh_weight = wn_new[idx_new].weight; cb(&rc_add, cbdata); idx_new++; } } /* * Decompose multipath cmd info @rc into a list of add/del/change * single-path operations, calling @cb callback for each operation. * Assumes at least one of the nexthops in @rc is multipath. */ void rib_decompose_notification(struct rib_cmd_info *rc, route_notification_t *cb, void *cbdata) { struct weightened_nhop *wn; uint32_t num_nhops; struct rib_cmd_info rc_new; rc_new = *rc; DPRINTF("cb=%p cmd=%d nh_old=%p nh_new=%p", cb, rc->cmd, rc->nh_old, rc->nh_new); switch (rc->rc_cmd) { case RTM_ADD: if (!NH_IS_NHGRP(rc->rc_nh_new)) return; wn = nhgrp_get_nhops((struct nhgrp_object *)rc->rc_nh_new, &num_nhops); for (uint32_t i = 0; i < num_nhops; i++) { rc_new.rc_nh_new = wn[i].nh; rc_new.rc_nh_weight = wn[i].weight; cb(&rc_new, cbdata); } break; case RTM_DELETE: if (!NH_IS_NHGRP(rc->rc_nh_old)) return; wn = nhgrp_get_nhops((struct nhgrp_object *)rc->rc_nh_old, &num_nhops); for (uint32_t i = 0; i < num_nhops; i++) { rc_new.rc_nh_old = wn[i].nh; rc_new.rc_nh_weight = wn[i].weight; cb(&rc_new, cbdata); } break; case RTM_CHANGE: if (!NH_IS_NHGRP(rc->rc_nh_old) && !NH_IS_NHGRP(rc->rc_nh_new)) return; decompose_change_notification(rc, cb, cbdata); break; } } #endif + +#ifdef INET +/* + * Checks if the found key in the trie contains (<=) a prefix covering + * @paddr/@plen. + * Returns the most specific rtentry matching the condition or NULL. + */ +static struct rtentry * +get_inet_parent_prefix(uint32_t fibnum, struct in_addr addr, int plen) +{ + struct route_nhop_data rnd; + struct rtentry *rt; + struct in_addr addr4; + uint32_t scopeid; + int parent_plen; + struct radix_node *rn; + + rt = fib4_lookup_rt(fibnum, addr, 0, NHR_UNLOCKED, &rnd); + rt_get_inet_prefix_plen(rt, &addr4, &parent_plen, &scopeid); + if (parent_plen <= plen) + return (rt); + + /* + * There can be multiple prefixes associated with the found key: + * 10.0.0.0 -> 10.0.0.0/24, 10.0.0.0/23, 10.0.0.0/22, etc. + * All such prefixes are linked via rn_dupedkey, from most specific + * to least specific. Iterate over them to check if any of these + * prefixes are wider than desired plen. + */ + rn = (struct radix_node *)rt; + while ((rn = rn_nextprefix(rn)) != NULL) { + rt = RNTORT(rn); + rt_get_inet_prefix_plen(rt, &addr4, &parent_plen, &scopeid); + if (parent_plen <= plen) + return (rt); + } + + return (NULL); +} + +/* + * Returns the most specific prefix containing (>) @paddr/plen. + */ +struct rtentry * +rt_get_inet_parent(uint32_t fibnum, struct in_addr addr, int plen) +{ + struct in_addr lookup_addr = { .s_addr = INADDR_BROADCAST }; + struct in_addr addr4 = addr; + struct in_addr mask4; + struct rtentry *rt; + + while (plen-- > 0) { + /* Calculate wider mask & new key to lookup */ + mask4.s_addr = htonl(plen ? ~((1 << (32 - plen)) - 1) : 0); + addr4.s_addr = htonl(ntohl(addr4.s_addr) & ntohl(mask4.s_addr)); + if (addr4.s_addr == lookup_addr.s_addr) { + /* Skip lookup if the key is the same */ + continue; + } + lookup_addr = addr4; + + rt = get_inet_parent_prefix(fibnum, lookup_addr, plen); + if (rt != NULL) + return (rt); + } + + return (NULL); +} +#endif + +#ifdef INET6 +/* + * Checks if the found key in the trie contains (<=) a prefix covering + * @paddr/@plen. + * Returns the most specific rtentry matching the condition or NULL. + */ +static struct rtentry * +get_inet6_parent_prefix(uint32_t fibnum, const struct in6_addr *paddr, int plen) +{ + struct route_nhop_data rnd; + struct rtentry *rt; + struct in6_addr addr6; + uint32_t scopeid; + int parent_plen; + struct radix_node *rn; + + rt = fib6_lookup_rt(fibnum, paddr, 0, NHR_UNLOCKED, &rnd); + rt_get_inet6_prefix_plen(rt, &addr6, &parent_plen, &scopeid); + if (parent_plen <= plen) + return (rt); + + /* + * There can be multiple prefixes associated with the found key: + * 2001:db8:1::/64 -> 2001:db8:1::/56, 2001:db8:1::/48, etc. + * All such prefixes are linked via rn_dupedkey, from most specific + * to least specific. Iterate over them to check if any of these + * prefixes are wider than desired plen. + */ + rn = (struct radix_node *)rt; + while ((rn = rn_nextprefix(rn)) != NULL) { + rt = RNTORT(rn); + rt_get_inet6_prefix_plen(rt, &addr6, &parent_plen, &scopeid); + if (parent_plen <= plen) + return (rt); + } + + return (NULL); +} + +static void +ipv6_writemask(struct in6_addr *addr6, uint8_t mask) +{ + uint32_t *cp; + + for (cp = (uint32_t *)addr6; mask >= 32; mask -= 32) + *cp++ = 0xFFFFFFFF; + if (mask > 0) + *cp = htonl(mask ? ~((1 << (32 - mask)) - 1) : 0); +} + +/* + * Returns the most specific prefix containing (>) @paddr/plen. + */ +struct rtentry * +rt_get_inet6_parent(uint32_t fibnum, const struct in6_addr *paddr, int plen) +{ + struct in6_addr lookup_addr = in6mask128; + struct in6_addr addr6 = *paddr; + struct in6_addr mask6; + struct rtentry *rt; + + while (plen-- > 0) { + /* Calculate wider mask & new key to lookup */ + ipv6_writemask(&mask6, plen); + IN6_MASK_ADDR(&addr6, &mask6); + if (IN6_ARE_ADDR_EQUAL(&addr6, &lookup_addr)) { + /* Skip lookup if the key is the same */ + continue; + } + lookup_addr = addr6; + + rt = get_inet6_parent_prefix(fibnum, &lookup_addr, plen); + if (rt != NULL) + return (rt); + } + + return (NULL); +} +#endif