Index: projects/routing/sys/net/route.c =================================================================== --- projects/routing/sys/net/route.c (revision 274335) +++ projects/routing/sys/net/route.c (revision 274336) @@ -1,1994 +1,2013 @@ /*- * Copyright (c) 1980, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)route.c 8.3.1.1 (Berkeley) 2/23/95 * $FreeBSD$ */ /************************************************************************ * Note: In this file a 'fib' is a "forwarding information base" * * Which is the new name for an in kernel routing (next hop) table. * ***********************************************************************/ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_route.h" #include "opt_sctp.h" #include "opt_mrouting.h" #include "opt_mpath.h" #include #include #include +#include +#include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef RADIX_MPATH #include #endif #include #include #include #define RT_MAXFIBS UINT16_MAX /* Kernel config default option. */ #ifdef ROUTETABLES #if ROUTETABLES <= 0 #error "ROUTETABLES defined too low" #endif #if ROUTETABLES > RT_MAXFIBS #error "ROUTETABLES defined too big" #endif #define RT_NUMFIBS ROUTETABLES #endif /* ROUTETABLES */ /* Initialize to default if not otherwise set. */ #ifndef RT_NUMFIBS #define RT_NUMFIBS 1 #endif #if defined(INET) || defined(INET6) #ifdef SCTP extern void sctp_addr_change(struct ifaddr *ifa, int cmd); #endif /* SCTP */ #endif /* This is read-only.. */ u_int rt_numfibs = RT_NUMFIBS; SYSCTL_UINT(_net, OID_AUTO, fibs, CTLFLAG_RDTUN, &rt_numfibs, 0, ""); /* * By default add routes to all fibs for new interfaces. * Once this is set to 0 then only allocate routes on interface * changes for the FIB of the caller when adding a new set of addresses * to an interface. XXX this is a shotgun aproach to a problem that needs * a more fine grained solution.. that will come. * XXX also has the problems getting the FIB from curthread which will not * always work given the fib can be overridden and prefixes can be added * from the network stack context. */ VNET_DEFINE(u_int, rt_add_addr_allfibs) = 1; SYSCTL_UINT(_net, OID_AUTO, add_addr_allfibs, CTLFLAG_RWTUN | CTLFLAG_VNET, &VNET_NAME(rt_add_addr_allfibs), 0, ""); VNET_DEFINE(struct rtstat, rtstat); #define V_rtstat VNET(rtstat) VNET_DEFINE(struct rib_head *, rt_tables); #define V_rt_tables VNET(rt_tables) VNET_DEFINE(int, rttrash); /* routes not in table but not freed */ #define V_rttrash VNET(rttrash) /* * Convert a 'struct radix_node *' to a 'struct rtentry *'. * The operation can be done safely (in this code) because a * 'struct rtentry' starts with two 'struct radix_node''s, the first * one representing leaf nodes in the routing tree, which is * what the code in radix.c passes us as a 'struct radix_node'. * * But because there are a lot of assumptions in this conversion, * do not cast explicitly, but always use the macro below. */ #define RNTORT(p) ((struct rtentry *)(p)) static VNET_DEFINE(uma_zone_t, rtzone); /* Routing table UMA zone. */ #define V_rtzone VNET(rtzone) static int rtrequest1_fib_change(struct rib_head *, struct rt_addrinfo *, struct rtentry **, u_int); static void rt_setmetrics(const struct rt_addrinfo *, struct rtentry *); static int rt_ifdelroute(struct rtentry *rt, void *arg); /* * handler for net.my_fibnum */ static int sysctl_my_fibnum(SYSCTL_HANDLER_ARGS) { int fibnum; int error; fibnum = curthread->td_proc->p_fibnum; error = sysctl_handle_int(oidp, &fibnum, 0, req); return (error); } SYSCTL_PROC(_net, OID_AUTO, my_fibnum, CTLTYPE_INT|CTLFLAG_RD, NULL, 0, &sysctl_my_fibnum, "I", "default FIB of caller"); static __inline struct rib_head ** rt_tables_get_rnh_ptr(int table, int fam) { struct rib_head **rh; KASSERT(table >= 0 && table < rt_numfibs, ("%s: table out of bounds.", __func__)); KASSERT(fam >= 0 && fam < (AF_MAX+1), ("%s: fam out of bounds.", __func__)); /* rh is [fib=0][af=0]. */ rh = (struct rib_head **)V_rt_tables; /* Get the offset to the requested table and fam. */ rh += table * (AF_MAX+1) + fam; return (rh); } struct rib_head * rt_tables_get_rnh(int table, int fam) { return (*rt_tables_get_rnh_ptr(table, fam)); } /* * route initialization must occur before ip6_init2(), which happenas at * SI_ORDER_MIDDLE. */ static void route_init(void) { /* whack the tunable ints into line. */ if (rt_numfibs > RT_MAXFIBS) rt_numfibs = RT_MAXFIBS; if (rt_numfibs == 0) rt_numfibs = 1; } SYSINIT(route_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, route_init, 0); static void vnet_route_init(const void *unused __unused) { struct domain *dom; struct rib_head **rh; int table; int fam; V_rt_tables = malloc(rt_numfibs * (AF_MAX+1) * sizeof(struct rib_head *), M_RTABLE, M_WAITOK|M_ZERO); V_rtzone = uma_zcreate("rtentry", sizeof(struct rtentry), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); for (dom = domains; dom; dom = dom->dom_next) { if (dom->dom_rtattach == NULL) continue; for (table = 0; table < rt_numfibs; table++) { fam = dom->dom_family; if (table != 0 && fam != AF_INET6 && fam != AF_INET) break; rh = rt_tables_get_rnh_ptr(table, fam); if (rh == NULL) panic("%s: rh NULL", __func__); dom->dom_rtattach((void **)rh, 0); } } } VNET_SYSINIT(vnet_route_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH, vnet_route_init, 0); #ifdef VIMAGE static void vnet_route_uninit(const void *unused __unused) { int table; int fam; struct domain *dom; struct rib_head **rh; for (dom = domains; dom; dom = dom->dom_next) { if (dom->dom_rtdetach == NULL) continue; for (table = 0; table < rt_numfibs; table++) { fam = dom->dom_family; if (table != 0 && fam != AF_INET6 && fam != AF_INET) break; rh = rt_tables_get_rnh_ptr(table, fam); if (rh == NULL) panic("%s: rh NULL", __func__); dom->dom_rtdetach((void **)rh, 0); } } free(V_rt_tables, M_RTABLE); uma_zdestroy(V_rtzone); } VNET_SYSUNINIT(vnet_route_uninit, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, vnet_route_uninit, 0); #endif struct rib_head * rt_table_init(int offset) { struct rib_head *rh; rh = malloc(sizeof(struct rib_head), M_RTABLE, M_WAITOK | M_ZERO); /* XXX: These details should be hidded inside radix.c */ /* Init masks tree */ rn_inithead_internal(&rh->head, rh->rnh_nodes, offset); rn_inithead_internal(&rh->rmhead.head, rh->rmhead.mask_nodes, 0); rh->head.s.rnh_masks = &rh->rmhead.head; rh->rmhead.head.s.mask_nodes = rh->rmhead.mask_nodes; /* Init locks */ - rw_init(&rh->rib_lock, "rib head"); + rm_init(&rh->rib_lock, "rib head run"); + rw_init(&rh->rib_cfglock, "rib head cfg"); /* Finally, set base callbacks */ rh->rnh_addaddr = rn_addroute; rh->rnh_deladdr = rn_delete; rh->rnh_matchaddr = rn_match; rh->rnh_lookup = rn_lookup; rh->rnh_walktree = rn_walktree; rh->rnh_walktree_from = rn_walktree_from; return (rh); } void rt_table_destroy(struct rib_head *rh) { /* Assume table is already empty */ - rw_destroy(&rh->rib_lock); + rw_destroy(&rh->rib_cfglock); + rm_destroy(&rh->rib_lock); free(rh, M_RTABLE); } #ifndef _SYS_SYSPROTO_H_ struct setfib_args { int fibnum; }; #endif int sys_setfib(struct thread *td, struct setfib_args *uap) { if (uap->fibnum < 0 || uap->fibnum >= rt_numfibs) return EINVAL; td->td_proc->p_fibnum = uap->fibnum; return (0); } /* * Packet routing routines. */ /* * Legacy function for SCTP support. */ void rtalloc_ign(struct route *ro, u_long ignore) { struct rtentry *rt; if ((rt = ro->ro_rt) != NULL) { if (rt->rt_ifp != NULL && rt->rt_flags & RTF_UP) return; RTFREE(rt); ro->ro_rt = NULL; } ro->ro_rt = rtalloc1_fib(&ro->ro_dst, 1, ignore, RT_DEFAULT_FIB); if (ro->ro_rt) RT_UNLOCK(ro->ro_rt); } /* * Look up the route that matches the address given * Or, at least try.. Create a cloned route if needed. * * The returned route, if any, is locked. */ struct rtentry * rtalloc1(struct sockaddr *dst, int report, u_long ignflags) { return (rtalloc1_fib(dst, report, ignflags, RT_DEFAULT_FIB)); } struct rtentry * rtalloc1_fib(struct sockaddr *dst, int report, u_long ignflags, u_int fibnum) { struct rib_head *rh; struct radix_node *rn; struct rtentry *newrt; struct rt_addrinfo info; int err = 0, msgtype = RTM_MISS; int needlock; + RIB_LOCK_READER; KASSERT((fibnum < rt_numfibs), ("rtalloc1_fib: bad fibnum")); rh = rt_tables_get_rnh(fibnum, dst->sa_family); newrt = NULL; if (rh == NULL) goto miss; /* * Look up the address in the table for that Address Family */ needlock = !(ignflags & RTF_RNH_LOCKED); if (needlock) RIB_RLOCK(rh); #ifdef INVARIANTS else RIB_LOCK_ASSERT(rh); #endif rn = rh->rnh_matchaddr(dst, &rh->head); if (rn && ((rn->rn_flags & RNF_ROOT) == 0)) { newrt = RNTORT(rn); RT_LOCK(newrt); RT_ADDREF(newrt); if (needlock) RIB_RUNLOCK(rh); goto done; } else if (needlock) RIB_RUNLOCK(rh); /* * Either we hit the root or couldn't find any match, * Which basically means * "caint get there frm here" */ miss: V_rtstat.rts_unreach++; if (report) { /* * If required, report the failure to the supervising * Authorities. * For a delete, this is not an error. (report == 0) */ bzero(&info, sizeof(info)); info.rti_info[RTAX_DST] = dst; rt_missmsg_fib(msgtype, &info, 0, err, fibnum); } done: if (newrt) RT_LOCK_ASSERT(newrt); return (newrt); } /* * Remove a reference count from an rtentry. * If the count gets low enough, take it out of the routing table */ void rtfree(struct rtentry *rt) { struct rib_head *rh; KASSERT(rt != NULL,("%s: NULL rt", __func__)); rh = rt_tables_get_rnh(rt->rt_fibnum, rt_key(rt)->sa_family); KASSERT(rh != NULL,("%s: NULL rh", __func__)); RT_LOCK_ASSERT(rt); /* * The callers should use RTFREE_LOCKED() or RTFREE(), so * we should come here exactly with the last reference. */ RT_REMREF(rt); if (rt->rt_refcnt > 0) { log(LOG_DEBUG, "%s: %p has %d refs\n", __func__, rt, rt->rt_refcnt); goto done; } /* * On last reference give the "close method" a chance * to cleanup private state. This also permits (for * IPv4 and IPv6) a chance to decide if the routing table * entry should be purged immediately or at a later time. * When an immediate purge is to happen the close routine * typically calls rtexpunge which clears the RTF_UP flag * on the entry so that the code below reclaims the storage. */ if (rt->rt_refcnt == 0 && rh->rnh_close) rh->rnh_close((struct radix_node *)rt, &rh->head); /* * If we are no longer "up" (and ref == 0) * then we can free the resources associated * with the route. */ if ((rt->rt_flags & RTF_UP) == 0) { if (rt->rt_nodes->rn_flags & (RNF_ACTIVE | RNF_ROOT)) panic("rtfree 2"); /* * the rtentry must have been removed from the routing table * so it is represented in rttrash.. remove that now. */ V_rttrash--; #ifdef DIAGNOSTIC if (rt->rt_refcnt < 0) { printf("rtfree: %p not freed (neg refs)\n", rt); goto done; } #endif /* * release references on items we hold them on.. * e.g other routes and ifaddrs. */ if (rt->rt_ifa) ifa_free(rt->rt_ifa); /* * The key is separatly alloc'd so free it (see rt_setgate()). * This also frees the gateway, as they are always malloc'd * together. */ Free(rt_key(rt)); /* * and the rtentry itself of course */ uma_zfree(V_rtzone, rt); return; } done: RT_UNLOCK(rt); } /* * Force a routing table entry to the specified * destination to go through the given gateway. * Normally called as a result of a routing redirect * message from the network layer. */ void rtredirect(struct sockaddr *dst, struct sockaddr *gateway, struct sockaddr *netmask, int flags, struct sockaddr *src) { rtredirect_fib(dst, gateway, netmask, flags, src, RT_DEFAULT_FIB); } void rtredirect_fib(struct sockaddr *dst, struct sockaddr *gateway, struct sockaddr *netmask, int flags, struct sockaddr *src, u_int fibnum) { struct rtentry *rt, *rt0 = NULL; int error = 0; short *stat = NULL; struct rt_addrinfo info; struct ifaddr *ifa; struct rib_head *rh; ifa = NULL; rh = rt_tables_get_rnh(fibnum, dst->sa_family); if (rh == NULL) { error = EAFNOSUPPORT; goto out; } /* verify the gateway is directly reachable */ if ((ifa = ifa_ifwithnet(gateway, 0, fibnum)) == NULL) { error = ENETUNREACH; goto out; } rt = rtalloc1_fib(dst, 0, 0UL, fibnum); /* NB: rt is locked */ /* * If the redirect isn't from our current router for this dst, * it's either old or wrong. If it redirects us to ourselves, * we have a routing loop, perhaps as a result of an interface * going down recently. */ if (!(flags & RTF_DONE) && rt && (!sa_equal(src, rt->rt_gateway) || rt->rt_ifa != ifa)) error = EINVAL; else if (ifa_ifwithaddr_check(gateway)) error = EHOSTUNREACH; if (error) goto done; /* * Create a new entry if we just got back a wildcard entry * or the lookup failed. This is necessary for hosts * which use routing redirects generated by smart gateways * to dynamically build the routing tables. */ if (rt == NULL || (rt_mask(rt) && rt_mask(rt)->sa_len < 2)) goto create; /* * Don't listen to the redirect if it's * for a route to an interface. */ if (rt->rt_flags & RTF_GATEWAY) { if (((rt->rt_flags & RTF_HOST) == 0) && (flags & RTF_HOST)) { /* * Changing from route to net => route to host. * Create new route, rather than smashing route to net. */ create: rt0 = rt; rt = NULL; flags |= RTF_GATEWAY | RTF_DYNAMIC; bzero((caddr_t)&info, sizeof(info)); info.rti_info[RTAX_DST] = dst; info.rti_info[RTAX_GATEWAY] = gateway; info.rti_info[RTAX_NETMASK] = netmask; info.rti_ifa = ifa; info.rti_flags = flags; if (rt0 != NULL) RT_UNLOCK(rt0); /* drop lock to avoid LOR with rh */ error = rtrequest1_fib(RTM_ADD, &info, &rt, fibnum); if (rt != NULL) { RT_LOCK(rt); if (rt0 != NULL) EVENTHANDLER_INVOKE(route_redirect_event, rt0, rt, dst); flags = rt->rt_flags; } if (rt0 != NULL) RTFREE(rt0); stat = &V_rtstat.rts_dynamic; } else { struct rtentry *gwrt; /* * Smash the current notion of the gateway to * this destination. Should check about netmask!!! */ rt->rt_flags |= RTF_MODIFIED; flags |= RTF_MODIFIED; stat = &V_rtstat.rts_newgateway; /* * add the key and gateway (in one malloc'd chunk). */ RT_UNLOCK(rt); + RIB_CFG_WLOCK(rh); RIB_WLOCK(rh); RT_LOCK(rt); rt_setgate(rt, rt_key(rt), gateway); gwrt = rtalloc1(gateway, 1, RTF_RNH_LOCKED); RIB_WUNLOCK(rh); + RIB_CFG_WUNLOCK(rh); EVENTHANDLER_INVOKE(route_redirect_event, rt, gwrt, dst); RTFREE_LOCKED(gwrt); } } else error = EHOSTUNREACH; done: if (rt) RTFREE_LOCKED(rt); out: if (error) V_rtstat.rts_badredirect++; else if (stat != NULL) (*stat)++; bzero((caddr_t)&info, sizeof(info)); info.rti_info[RTAX_DST] = dst; info.rti_info[RTAX_GATEWAY] = gateway; info.rti_info[RTAX_NETMASK] = netmask; info.rti_info[RTAX_AUTHOR] = src; rt_missmsg_fib(RTM_REDIRECT, &info, flags, error, fibnum); if (ifa != NULL) ifa_free(ifa); } int rtioctl(u_long req, caddr_t data) { return (rtioctl_fib(req, data, RT_DEFAULT_FIB)); } /* * Routing table ioctl interface. */ int rtioctl_fib(u_long req, caddr_t data, u_int fibnum) { /* * If more ioctl commands are added here, make sure the proper * super-user checks are being performed because it is possible for * prison-root to make it this far if raw sockets have been enabled * in jails. */ #ifdef INET /* Multicast goop, grrr... */ return mrt_ioctl ? mrt_ioctl(req, data, fibnum) : EOPNOTSUPP; #else /* INET */ return ENXIO; #endif /* INET */ } struct ifaddr * ifa_ifwithroute(int flags, struct sockaddr *dst, struct sockaddr *gateway, u_int fibnum) { struct ifaddr *ifa; int not_found = 0; if ((flags & RTF_GATEWAY) == 0) { /* * If we are adding a route to an interface, * and the interface is a pt to pt link * we should search for the destination * as our clue to the interface. Otherwise * we can use the local address. */ ifa = NULL; if (flags & RTF_HOST) ifa = ifa_ifwithdstaddr(dst, fibnum); if (ifa == NULL) ifa = ifa_ifwithaddr(gateway); } else { /* * If we are adding a route to a remote net * or host, the gateway may still be on the * other end of a pt to pt link. */ ifa = ifa_ifwithdstaddr(gateway, fibnum); } if (ifa == NULL) ifa = ifa_ifwithnet(gateway, 0, fibnum); if (ifa == NULL) { struct rtentry *rt = rtalloc1_fib(gateway, 0, RTF_RNH_LOCKED, fibnum); if (rt == NULL) return (NULL); /* * dismiss a gateway that is reachable only * through the default router */ switch (gateway->sa_family) { case AF_INET: if (satosin(rt_key(rt))->sin_addr.s_addr == INADDR_ANY) not_found = 1; break; case AF_INET6: if (IN6_IS_ADDR_UNSPECIFIED(&satosin6(rt_key(rt))->sin6_addr)) not_found = 1; break; default: break; } if (!not_found && rt->rt_ifa != NULL) { ifa = rt->rt_ifa; ifa_ref(ifa); } RT_REMREF(rt); RT_UNLOCK(rt); if (not_found || ifa == NULL) return (NULL); } if (ifa->ifa_addr->sa_family != dst->sa_family) { struct ifaddr *oifa = ifa; ifa = ifaof_ifpforaddr(dst, ifa->ifa_ifp); if (ifa == NULL) ifa = oifa; else ifa_free(oifa); } return (ifa); } /* * Do appropriate manipulations of a routing tree given * all the bits of info needed */ int rtrequest(int req, struct sockaddr *dst, struct sockaddr *gateway, struct sockaddr *netmask, int flags, struct rtentry **ret_nrt) { return (rtrequest_fib(req, dst, gateway, netmask, flags, ret_nrt, RT_DEFAULT_FIB)); } int rtrequest_fib(int req, struct sockaddr *dst, struct sockaddr *gateway, struct sockaddr *netmask, int flags, struct rtentry **ret_nrt, u_int fibnum) { struct rt_addrinfo info; if (dst->sa_len == 0) return(EINVAL); bzero((caddr_t)&info, sizeof(info)); info.rti_flags = flags; info.rti_info[RTAX_DST] = dst; info.rti_info[RTAX_GATEWAY] = gateway; info.rti_info[RTAX_NETMASK] = netmask; return rtrequest1_fib(req, &info, ret_nrt, fibnum); } void rt_foreach_fib(int af, rt_setwarg_t *setwa_f, rt_walktree_f_t *wa_f, void *arg) { struct rib_head *rh; uint32_t fibnum; int i; for (fibnum = 0; fibnum < rt_numfibs; fibnum++) { /* Do we want some specific family? */ if (af != AF_UNSPEC) { rh = rt_tables_get_rnh(fibnum, af); if (rh == NULL) continue; if (setwa_f != NULL) setwa_f(rh, fibnum, i, arg); + RIB_CFG_WLOCK(rh); + /* Do runtime locking for now */ RIB_WLOCK(rh); rh->rnh_walktree(&rh->head, (walktree_f_t *)wa_f, arg); RIB_WUNLOCK(rh); + RIB_CFG_WUNLOCK(rh); continue; } for (i = 1; i <= AF_MAX; i++) { rh = rt_tables_get_rnh(fibnum, i); if (rh == NULL) continue; if (setwa_f != NULL) setwa_f(rh, fibnum, i, arg); + RIB_CFG_WLOCK(rh); RIB_WLOCK(rh); + /* Do runtime locking for now */ rh->rnh_walktree(&rh->head, (walktree_f_t *)wa_f, arg); RIB_WUNLOCK(rh); + RIB_CFG_WUNLOCK(rh); } } } /* * Delete Routes for a Network Interface * * Called for each routing entry via the rh->rnh_walktree() call above * to delete all route entries referencing a detaching network interface. * * Arguments: * rt pointer to rtentry * arg argument passed to rh->rnh_walktree() - detaching interface * * Returns: * 0 successful * errno failed - reason indicated */ static int rt_ifdelroute(struct rtentry *rt, void *arg) { struct ifnet *ifp = arg; int err; if (rt->rt_ifp != ifp) return (0); /* * Protect (sorta) against walktree recursion problems * with cloned routes */ if ((rt->rt_flags & RTF_UP) == 0) return (0); err = rtrequest_fib(RTM_DELETE, rt_key(rt), rt->rt_gateway, rt_mask(rt), rt->rt_flags | RTF_RNH_LOCKED | RTF_PINNED, (struct rtentry **) NULL, rt->rt_fibnum); if (err != 0) log(LOG_WARNING, "rt_ifdelroute: error %d\n", err); return (0); } /* * Delete all remaining routes using this interface * Unfortuneatly the only way to do this is to slog through * the entire routing table looking for routes which point * to this interface...oh well... */ void rt_flushifroutes(struct ifnet *ifp) { rt_foreach_fib(AF_UNSPEC, NULL, rt_ifdelroute, ifp); } /* * These (questionable) definitions of apparent local variables apply * to the next two functions. XXXXXX!!! */ #define dst info->rti_info[RTAX_DST] #define gateway info->rti_info[RTAX_GATEWAY] #define netmask info->rti_info[RTAX_NETMASK] #define ifaaddr info->rti_info[RTAX_IFA] #define ifpaddr info->rti_info[RTAX_IFP] #define flags info->rti_flags int rt_getifa(struct rt_addrinfo *info) { return (rt_getifa_fib(info, RT_DEFAULT_FIB)); } /* * Look up rt_addrinfo for a specific fib. Note that if rti_ifa is defined, * it will be referenced so the caller must free it. */ int rt_getifa_fib(struct rt_addrinfo *info, u_int fibnum) { struct ifaddr *ifa; int error = 0; /* * ifp may be specified by sockaddr_dl * when protocol address is ambiguous. */ if (info->rti_ifp == NULL && ifpaddr != NULL && ifpaddr->sa_family == AF_LINK && (ifa = ifa_ifwithnet(ifpaddr, 0, fibnum)) != NULL) { info->rti_ifp = ifa->ifa_ifp; ifa_free(ifa); } if (info->rti_ifa == NULL && ifaaddr != NULL) info->rti_ifa = ifa_ifwithaddr(ifaaddr); if (info->rti_ifa == NULL) { struct sockaddr *sa; sa = ifaaddr != NULL ? ifaaddr : (gateway != NULL ? gateway : dst); if (sa != NULL && info->rti_ifp != NULL) info->rti_ifa = ifaof_ifpforaddr(sa, info->rti_ifp); else if (dst != NULL && gateway != NULL) info->rti_ifa = ifa_ifwithroute(flags, dst, gateway, fibnum); else if (sa != NULL) info->rti_ifa = ifa_ifwithroute(flags, sa, sa, fibnum); } if ((ifa = info->rti_ifa) != NULL) { if (info->rti_ifp == NULL) info->rti_ifp = ifa->ifa_ifp; } else error = ENETUNREACH; return (error); } /* * Expunges references to a route that's about to be reclaimed. * The route must be locked. */ int rt_expunge(struct rib_head *rh, struct rtentry *rt) { #if !defined(RADIX_MPATH) struct radix_node *rn; #else struct rt_addrinfo info; int fib; struct rtentry *rt0; #endif struct ifaddr *ifa; int error = 0; RT_LOCK_ASSERT(rt); RIB_LOCK_ASSERT(rh); #ifdef RADIX_MPATH fib = rt->rt_fibnum; bzero(&info, sizeof(info)); info.rti_ifp = rt->rt_ifp; info.rti_flags = RTF_RNH_LOCKED; info.rti_info[RTAX_DST] = rt_key(rt); info.rti_info[RTAX_GATEWAY] = rt->rt_ifa->ifa_addr; RT_UNLOCK(rt); error = rtrequest1_fib(RTM_DELETE, &info, &rt0, fib); if (error == 0 && rt0 != NULL) { rt = rt0; RT_LOCK(rt); } else if (error != 0) { RT_LOCK(rt); return (error); } #else /* * Remove the item from the tree; it should be there, * but when callers invoke us blindly it may not (sigh). */ rn = rh->rnh_deladdr(rt_key(rt), rt_mask(rt), &rh->head); if (rn == NULL) { error = ESRCH; goto bad; } KASSERT((rn->rn_flags & (RNF_ACTIVE | RNF_ROOT)) == 0, ("unexpected flags 0x%x", rn->rn_flags)); KASSERT(rt == RNTORT(rn), ("lookup mismatch, rt %p rn %p", rt, rn)); #endif /* RADIX_MPATH */ rt->rt_flags &= ~RTF_UP; /* * Give the protocol a chance to keep things in sync. */ if ((ifa = rt->rt_ifa) && ifa->ifa_rtrequest) { struct rt_addrinfo info; bzero((caddr_t)&info, sizeof(info)); info.rti_flags = rt->rt_flags; info.rti_info[RTAX_DST] = rt_key(rt); info.rti_info[RTAX_GATEWAY] = rt->rt_gateway; info.rti_info[RTAX_NETMASK] = rt_mask(rt); ifa->ifa_rtrequest(RTM_DELETE, rt, &info); } /* * one more rtentry floating around that is not * linked to the routing table. */ V_rttrash++; #if !defined(RADIX_MPATH) bad: #endif return (error); } #if 0 int p_sockaddr(char *buf, int buflen, struct sockaddr *s); int rt_print(char *buf, int buflen, struct rtentry *rt); int p_sockaddr(char *buf, int buflen, struct sockaddr *s) { void *paddr = NULL; switch (s->sa_family) { case AF_INET: paddr = &((struct sockaddr_in *)s)->sin_addr; break; case AF_INET6: paddr = &((struct sockaddr_in6 *)s)->sin6_addr; break; } if (paddr == NULL) return (0); if (inet_ntop(s->sa_family, paddr, buf, buflen) == NULL) return (0); return (strlen(buf)); } int rt_print(char *buf, int buflen, struct rtentry *rt) { struct sockaddr *addr, *mask; int i = 0; addr = rt_key(rt); mask = rt_mask(rt); i = p_sockaddr(buf, buflen, addr); if (!(rt->rt_flags & RTF_HOST)) { buf[i++] = '/'; i += p_sockaddr(buf + i, buflen - i, mask); } if (rt->rt_flags & RTF_GATEWAY) { buf[i++] = '>'; i += p_sockaddr(buf + i, buflen - i, rt->rt_gateway); } return (i); } #endif #ifdef RADIX_MPATH static int rn_mpath_update(int req, struct rt_addrinfo *info, struct rib_head *rh, struct rtentry **ret_nrt) { /* * if we got multipath routes, we require users to specify * a matching RTAX_GATEWAY. */ struct rtentry *rt, *rto = NULL; struct radix_node *rn; int error = 0; rn = rh->rnh_lookup(dst, netmask, rh); if (rn == NULL) return (ESRCH); rto = rt = RNTORT(rn); rt = rt_mpath_matchgate(rt, gateway); if (rt == NULL) return (ESRCH); /* * this is the first entry in the chain */ if (rto == rt) { rn = rn_mpath_next((struct radix_node *)rt); /* * there is another entry, now it's active */ if (rn) { rto = RNTORT(rn); RT_LOCK(rto); rto->rt_flags |= RTF_UP; RT_UNLOCK(rto); } else if (rt->rt_flags & RTF_GATEWAY) { /* * For gateway routes, we need to * make sure that we we are deleting * the correct gateway. * rt_mpath_matchgate() does not * check the case when there is only * one route in the chain. */ if (gateway && (rt->rt_gateway->sa_len != gateway->sa_len || memcmp(rt->rt_gateway, gateway, gateway->sa_len))) error = ESRCH; else { /* * remove from tree before returning it * to the caller */ rn = rh->rnh_deladdr(dst, netmask, rh); KASSERT(rt == RNTORT(rn), ("radix node disappeared")); goto gwdelete; } } /* * use the normal delete code to remove * the first entry */ if (req != RTM_DELETE) goto nondelete; error = ENOENT; goto done; } /* * if the entry is 2nd and on up */ if ((req == RTM_DELETE) && !rt_mpath_deldup(rto, rt)) panic ("rtrequest1: rt_mpath_deldup"); gwdelete: RT_LOCK(rt); RT_ADDREF(rt); if (req == RTM_DELETE) { rt->rt_flags &= ~RTF_UP; /* * One more rtentry floating around that is not * linked to the routing table. rttrash will be decremented * when RTFREE(rt) is eventually called. */ V_rttrash++; } nondelete: if (req != RTM_DELETE) panic("unrecognized request %d", req); /* * If the caller wants it, then it can have it, * but it's up to it to free the rtentry as we won't be * doing it. */ if (ret_nrt) { *ret_nrt = rt; RT_UNLOCK(rt); } else RTFREE_LOCKED(rt); done: return (error); } #endif int rtrequest1_fib(int req, struct rt_addrinfo *info, struct rtentry **ret_nrt, u_int fibnum) { int error = 0, needlock = 0; struct rtentry *rt; #ifdef FLOWTABLE struct rtentry *rt0; #endif struct radix_node *rn; struct rib_head *rh; struct ifaddr *ifa; struct sockaddr *ndst; struct sockaddr_storage mdst; #define senderr(x) { error = x ; goto bad; } KASSERT((fibnum < rt_numfibs), ("rtrequest1_fib: bad fibnum")); switch (dst->sa_family) { case AF_INET6: case AF_INET: /* We support multiple FIBs. */ break; default: fibnum = RT_DEFAULT_FIB; break; } /* * Find the correct routing tree to use for this Address Family */ rh = rt_tables_get_rnh(fibnum, dst->sa_family); if (rh == NULL) return (EAFNOSUPPORT); needlock = ((flags & RTF_RNH_LOCKED) == 0); flags &= ~RTF_RNH_LOCKED; - if (needlock) + if (needlock) { + RIB_CFG_WLOCK(rh); RIB_WLOCK(rh); - else + } else RIB_LOCK_ASSERT(rh); /* * If we are adding a host route then we don't want to put * a netmask in the tree, nor do we want to clone it. */ if (flags & RTF_HOST) netmask = NULL; switch (req) { case RTM_DELETE: if (netmask) { rt_maskedcopy(dst, (struct sockaddr *)&mdst, netmask); dst = (struct sockaddr *)&mdst; } #ifdef RADIX_MPATH if (rn_mpath_capable(rh)) { error = rn_mpath_update(req, info, rh, ret_nrt); /* * "bad" holds true for the success case * as well */ if (error != ENOENT) goto bad; error = 0; } #endif if ((flags & RTF_PINNED) == 0) { /* Check if target route can be deleted */ rt = (struct rtentry *)rh->rnh_lookup(dst, netmask, &rh->head); if ((rt != NULL) && (rt->rt_flags & RTF_PINNED)) senderr(EADDRINUSE); } /* * Remove the item from the tree and return it. * Complain if it is not there and do no more processing. */ rn = rh->rnh_deladdr(dst, netmask, &rh->head); if (rn == NULL) senderr(ESRCH); if (rn->rn_flags & (RNF_ACTIVE | RNF_ROOT)) panic ("rtrequest delete"); rt = RNTORT(rn); RT_LOCK(rt); RT_ADDREF(rt); rt->rt_flags &= ~RTF_UP; /* * give the protocol a chance to keep things in sync. */ if ((ifa = rt->rt_ifa) && ifa->ifa_rtrequest) ifa->ifa_rtrequest(RTM_DELETE, rt, info); /* * One more rtentry floating around that is not * linked to the routing table. rttrash will be decremented * when RTFREE(rt) is eventually called. */ V_rttrash++; /* * If the caller wants it, then it can have it, * but it's up to it to free the rtentry as we won't be * doing it. */ if (ret_nrt) { *ret_nrt = rt; RT_UNLOCK(rt); } else RTFREE_LOCKED(rt); break; case RTM_RESOLVE: /* * resolve was only used for route cloning * here for compat */ break; case RTM_ADD: if ((flags & RTF_GATEWAY) && !gateway) senderr(EINVAL); if (dst && gateway && (dst->sa_family != gateway->sa_family) && (gateway->sa_family != AF_UNSPEC) && (gateway->sa_family != AF_LINK)) senderr(EINVAL); if (info->rti_ifa == NULL) { error = rt_getifa_fib(info, fibnum); if (error) senderr(error); } else ifa_ref(info->rti_ifa); ifa = info->rti_ifa; rt = uma_zalloc(V_rtzone, M_NOWAIT | M_ZERO); if (rt == NULL) { ifa_free(ifa); senderr(ENOBUFS); } RT_LOCK_INIT(rt); rt->rt_flags = RTF_UP | flags; rt->rt_fibnum = fibnum; /* * Add the gateway. Possibly re-malloc-ing the storage for it. */ RT_LOCK(rt); if ((error = rt_setgate(rt, dst, gateway)) != 0) { RT_LOCK_DESTROY(rt); ifa_free(ifa); uma_zfree(V_rtzone, rt); senderr(error); } /* * point to the (possibly newly malloc'd) dest address. */ ndst = (struct sockaddr *)rt_key(rt); /* * make sure it contains the value we want (masked if needed). */ if (netmask) { rt_maskedcopy(dst, ndst, netmask); } else bcopy(dst, ndst, dst->sa_len); /* * We use the ifa reference returned by rt_getifa_fib(). * This moved from below so that rh->rnh_addaddr() can * examine the ifa and ifa->ifa_ifp if it so desires. */ rt->rt_ifa = ifa; rt->rt_ifp = ifa->ifa_ifp; rt->rt_weight = 1; rt_setmetrics(info, rt); #ifdef RADIX_MPATH /* do not permit exactly the same dst/mask/gw pair */ if (rn_mpath_capable(rh) && rt_mpath_conflict(rh, rt, netmask)) { ifa_free(rt->rt_ifa); Free(rt_key(rt)); RT_LOCK_DESTROY(rt); uma_zfree(V_rtzone, rt); senderr(EEXIST); } #endif #ifdef FLOWTABLE rt0 = NULL; /* "flow-table" only supports IPv6 and IPv4 at the moment. */ switch (dst->sa_family) { #ifdef INET6 case AF_INET6: #endif #ifdef INET case AF_INET: #endif #if defined(INET6) || defined(INET) rn = rh->rnh_matchaddr(dst, rh); if (rn && ((rn->rn_flags & RNF_ROOT) == 0)) { struct sockaddr *mask; u_char *m, *n; int len; /* * compare mask to see if the new route is * more specific than the existing one */ rt0 = RNTORT(rn); RT_LOCK(rt0); RT_ADDREF(rt0); RT_UNLOCK(rt0); /* * A host route is already present, so * leave the flow-table entries as is. */ if (rt0->rt_flags & RTF_HOST) { RTFREE(rt0); rt0 = NULL; } else if (!(flags & RTF_HOST) && netmask) { mask = rt_mask(rt0); len = mask->sa_len; m = (u_char *)mask; n = (u_char *)netmask; while (len-- > 0) { if (*n != *m) break; n++; m++; } if (len == 0 || (*n < *m)) { RTFREE(rt0); rt0 = NULL; } } } #endif/* INET6 || INET */ } #endif /* FLOWTABLE */ /* XXX mtu manipulation will be done in rnh_addaddr -- itojun */ rn = rh->rnh_addaddr(ndst, netmask, &rh->head, rt->rt_nodes); /* * If it still failed to go into the tree, * then un-make it (this should be a function) */ if (rn == NULL) { ifa_free(rt->rt_ifa); Free(rt_key(rt)); uma_zfree(V_rtzone, rt); #ifdef FLOWTABLE if (rt0 != NULL) RTFREE(rt0); #endif senderr(EEXIST); } #ifdef FLOWTABLE else if (rt0 != NULL) { flowtable_route_flush(dst->sa_family, rt0); RTFREE(rt0); } #endif /* * If this protocol has something to add to this then * allow it to do that as well. */ if (ifa->ifa_rtrequest) ifa->ifa_rtrequest(req, rt, info); /* * actually return a resultant rtentry and * give the caller a single reference. */ if (ret_nrt) { *ret_nrt = rt; RT_ADDREF(rt); } RT_UNLOCK(rt); break; case RTM_CHANGE: error = rtrequest1_fib_change(rh, info, ret_nrt, fibnum); break; default: error = EOPNOTSUPP; } bad: - if (needlock) + if (needlock) { RIB_WUNLOCK(rh); + RIB_CFG_WUNLOCK(rh); + } return (error); #undef senderr } #undef dst #undef gateway #undef netmask #undef ifaaddr #undef ifpaddr #undef flags static int rtrequest1_fib_change(struct rib_head *rh, struct rt_addrinfo *info, struct rtentry **ret_nrt, u_int fibnum) { struct rtentry *rt = NULL; int error = 0; int free_ifa = 0; int family, mtu; rt = (struct rtentry *)rh->rnh_lookup(info->rti_info[RTAX_DST], info->rti_info[RTAX_NETMASK], &rh->head); if (rt == NULL) return (ESRCH); #ifdef RADIX_MPATH /* * If we got multipath routes, * we require users to specify a matching RTAX_GATEWAY. */ if (rn_mpath_capable(rh)) { rt = rt_mpath_matchgate(rt, info->rti_info[RTAX_GATEWAY]); if (rt == NULL) return (ESRCH); } #endif RT_LOCK(rt); rt_setmetrics(info, rt); /* * New gateway could require new ifaddr, ifp; * flags may also be different; ifp may be specified * by ll sockaddr when protocol address is ambiguous */ if (((rt->rt_flags & RTF_GATEWAY) && info->rti_info[RTAX_GATEWAY] != NULL) || info->rti_info[RTAX_IFP] != NULL || (info->rti_info[RTAX_IFA] != NULL && !sa_equal(info->rti_info[RTAX_IFA], rt->rt_ifa->ifa_addr))) { error = rt_getifa_fib(info, fibnum); if (info->rti_ifa != NULL) free_ifa = 1; if (error != 0) goto bad; } /* Check if outgoing interface has changed */ if (info->rti_ifa != NULL && info->rti_ifa != rt->rt_ifa && rt->rt_ifa != NULL && rt->rt_ifa->ifa_rtrequest != NULL) { rt->rt_ifa->ifa_rtrequest(RTM_DELETE, rt, info); ifa_free(rt->rt_ifa); } /* Update gateway address */ if (info->rti_info[RTAX_GATEWAY] != NULL) { error = rt_setgate(rt, rt_key(rt), info->rti_info[RTAX_GATEWAY]); if (error != 0) goto bad; rt->rt_flags &= ~RTF_GATEWAY; rt->rt_flags |= (RTF_GATEWAY & info->rti_flags); } if (info->rti_ifa != NULL && info->rti_ifa != rt->rt_ifa) { ifa_ref(info->rti_ifa); rt->rt_ifa = info->rti_ifa; rt->rt_ifp = info->rti_ifp; } /* Allow some flags to be toggled on change. */ rt->rt_flags &= ~RTF_FMASK; rt->rt_flags |= info->rti_flags & RTF_FMASK; if (rt->rt_ifa && rt->rt_ifa->ifa_rtrequest != NULL) rt->rt_ifa->ifa_rtrequest(RTM_ADD, rt, info); /* Ensure route MTU is not bigger than interface MTU */ if (rt->rt_ifp != NULL) { family = info->rti_info[RTAX_DST]->sa_family; mtu = if_getmtu_family(rt->rt_ifp, family); if (rt->rt_mtu > mtu) rt->rt_mtu = mtu; } if (ret_nrt) { *ret_nrt = rt; RT_ADDREF(rt); } bad: RT_UNLOCK(rt); if (free_ifa != 0) ifa_free(info->rti_ifa); return (error); } static void rt_setmetrics(const struct rt_addrinfo *info, struct rtentry *rt) { if (info->rti_mflags & RTV_MTU) rt->rt_mtu = info->rti_rmx->rmx_mtu; if (info->rti_mflags & RTV_WEIGHT) rt->rt_weight = info->rti_rmx->rmx_weight; /* Kernel -> userland timebase conversion. */ if (info->rti_mflags & RTV_EXPIRE) rt->rt_expire = info->rti_rmx->rmx_expire ? info->rti_rmx->rmx_expire - time_second + time_uptime : 0; } int rt_setgate(struct rtentry *rt, struct sockaddr *dst, struct sockaddr *gate) { /* XXX dst may be overwritten, can we move this to below */ int dlen = SA_SIZE(dst), glen = SA_SIZE(gate); #ifdef INVARIANTS struct rib_head *rh; rh = rt_tables_get_rnh(rt->rt_fibnum, dst->sa_family); #endif RT_LOCK_ASSERT(rt); RIB_LOCK_ASSERT(rh); /* * Prepare to store the gateway in rt->rt_gateway. * Both dst and gateway are stored one after the other in the same * malloc'd chunk. If we have room, we can reuse the old buffer, * rt_gateway already points to the right place. * Otherwise, malloc a new block and update the 'dst' address. */ if (rt->rt_gateway == NULL || glen > SA_SIZE(rt->rt_gateway)) { caddr_t new; R_Malloc(new, caddr_t, dlen + glen); if (new == NULL) return ENOBUFS; /* * XXX note, we copy from *dst and not *rt_key(rt) because * rt_setgate() can be called to initialize a newly * allocated route entry, in which case rt_key(rt) == NULL * (and also rt->rt_gateway == NULL). * Free()/free() handle a NULL argument just fine. */ bcopy(dst, new, dlen); Free(rt_key(rt)); /* free old block, if any */ rt_key(rt) = (struct sockaddr *)new; rt->rt_gateway = (struct sockaddr *)(new + dlen); } /* * Copy the new gateway value into the memory chunk. */ bcopy(gate, rt->rt_gateway, glen); return (0); } void rt_maskedcopy(struct sockaddr *src, struct sockaddr *dst, struct sockaddr *netmask) { u_char *cp1 = (u_char *)src; u_char *cp2 = (u_char *)dst; u_char *cp3 = (u_char *)netmask; u_char *cplim = cp2 + *cp3; u_char *cplim2 = cp2 + *cp1; *cp2++ = *cp1++; *cp2++ = *cp1++; /* copies sa_len & sa_family */ cp3 += 2; if (cplim > cplim2) cplim = cplim2; while (cp2 < cplim) *cp2++ = *cp1++ & *cp3++; if (cp2 < cplim2) bzero((caddr_t)cp2, (unsigned)(cplim2 - cp2)); } /* * Set up a routing table entry, normally * for an interface. */ #define _SOCKADDR_TMPSIZE 128 /* Not too big.. kernel stack size is limited */ static inline int rtinit1(struct ifaddr *ifa, int cmd, int flags, int fibnum) { struct sockaddr *dst; struct sockaddr *netmask; struct rtentry *rt = NULL; struct rt_addrinfo info; int error = 0; int startfib, endfib; char tempbuf[_SOCKADDR_TMPSIZE]; int didwork = 0; int a_failure = 0; static struct sockaddr_dl null_sdl = {sizeof(null_sdl), AF_LINK}; struct rib_head *rh; if (flags & RTF_HOST) { dst = ifa->ifa_dstaddr; netmask = NULL; } else { dst = ifa->ifa_addr; netmask = ifa->ifa_netmask; } if (dst->sa_len == 0) return(EINVAL); switch (dst->sa_family) { case AF_INET6: case AF_INET: /* We support multiple FIBs. */ break; default: fibnum = RT_DEFAULT_FIB; break; } if (fibnum == RT_ALL_FIBS) { if (V_rt_add_addr_allfibs == 0 && cmd == (int)RTM_ADD) startfib = endfib = ifa->ifa_ifp->if_fib; else { startfib = 0; endfib = rt_numfibs - 1; } } else { KASSERT((fibnum < rt_numfibs), ("rtinit1: bad fibnum")); startfib = fibnum; endfib = fibnum; } /* * If it's a delete, check that if it exists, * it's on the correct interface or we might scrub * a route to another ifa which would * be confusing at best and possibly worse. */ if (cmd == RTM_DELETE) { /* * It's a delete, so it should already exist.. * If it's a net, mask off the host bits * (Assuming we have a mask) * XXX this is kinda inet specific.. */ if (netmask != NULL) { rt_maskedcopy(dst, (struct sockaddr *)tempbuf, netmask); dst = (struct sockaddr *)tempbuf; } } /* * Now go through all the requested tables (fibs) and do the * requested action. Realistically, this will either be fib 0 * for protocols that don't do multiple tables or all the * tables for those that do. */ for ( fibnum = startfib; fibnum <= endfib; fibnum++) { if (cmd == RTM_DELETE) { struct radix_node *rn; /* * Look up an rtentry that is in the routing tree and * contains the correct info. */ rh = rt_tables_get_rnh(fibnum, dst->sa_family); if (rh == NULL) /* this table doesn't exist but others might */ continue; - RIB_RLOCK(rh); + RIB_CFG_RLOCK(rh); rn = rh->rnh_lookup(dst, netmask, &rh->head); #ifdef RADIX_MPATH if (rn_mpath_capable(rh)) { if (rn == NULL) error = ESRCH; else { rt = RNTORT(rn); /* * for interface route the * rt->rt_gateway is sockaddr_intf * for cloning ARP entries, so * rt_mpath_matchgate must use the * interface address */ rt = rt_mpath_matchgate(rt, ifa->ifa_addr); if (rt == NULL) error = ESRCH; } } #endif error = (rn == NULL || (rn->rn_flags & RNF_ROOT) || RNTORT(rn)->rt_ifa != ifa); - RIB_RUNLOCK(rh); + RIB_CFG_RUNLOCK(rh); if (error) { /* this is only an error if bad on ALL tables */ continue; } } /* * Do the actual request */ bzero((caddr_t)&info, sizeof(info)); info.rti_ifa = ifa; info.rti_flags = flags | (ifa->ifa_flags & ~IFA_RTSELF) | RTF_PINNED; info.rti_info[RTAX_DST] = dst; /* * doing this for compatibility reasons */ if (cmd == RTM_ADD) info.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&null_sdl; else info.rti_info[RTAX_GATEWAY] = ifa->ifa_addr; info.rti_info[RTAX_NETMASK] = netmask; error = rtrequest1_fib(cmd, &info, &rt, fibnum); if ((error == EEXIST) && (cmd == RTM_ADD)) { /* * Interface route addition failed. * Atomically delete current prefix generating * RTM_DELETE message, and retry adding * interface prefix. */ rh = rt_tables_get_rnh(fibnum, dst->sa_family); + RIB_CFG_WLOCK(rh); RIB_WLOCK(rh); /* Delete old prefix */ info.rti_ifa = NULL; info.rti_flags = RTF_RNH_LOCKED; error = rtrequest1_fib(RTM_DELETE, &info, NULL, fibnum); if (error == 0) { info.rti_ifa = ifa; info.rti_flags = flags | RTF_RNH_LOCKED | (ifa->ifa_flags & ~IFA_RTSELF) | RTF_PINNED; error = rtrequest1_fib(cmd, &info, &rt, fibnum); } RIB_WUNLOCK(rh); + RIB_CFG_WUNLOCK(rh); } if (error == 0 && rt != NULL) { /* * notify any listening routing agents of the change */ RT_LOCK(rt); #ifdef RADIX_MPATH /* * in case address alias finds the first address * e.g. ifconfig bge0 192.0.2.246/24 * e.g. ifconfig bge0 192.0.2.247/24 * the address set in the route is 192.0.2.246 * so we need to replace it with 192.0.2.247 */ if (memcmp(rt->rt_ifa->ifa_addr, ifa->ifa_addr, ifa->ifa_addr->sa_len)) { ifa_free(rt->rt_ifa); ifa_ref(ifa); rt->rt_ifp = ifa->ifa_ifp; rt->rt_ifa = ifa; } #endif /* * doing this for compatibility reasons */ if (cmd == RTM_ADD) { ((struct sockaddr_dl *)rt->rt_gateway)->sdl_type = rt->rt_ifp->if_type; ((struct sockaddr_dl *)rt->rt_gateway)->sdl_index = rt->rt_ifp->if_index; } RT_ADDREF(rt); RT_UNLOCK(rt); rt_newaddrmsg_fib(cmd, ifa, error, rt, fibnum); RT_LOCK(rt); RT_REMREF(rt); if (cmd == RTM_DELETE) { /* * If we are deleting, and we found an entry, * then it's been removed from the tree.. * now throw it away. */ RTFREE_LOCKED(rt); } else { if (cmd == RTM_ADD) { /* * We just wanted to add it.. * we don't actually need a reference. */ RT_REMREF(rt); } RT_UNLOCK(rt); } didwork = 1; } if (error) a_failure = error; } if (cmd == RTM_DELETE) { if (didwork) { error = 0; } else { /* we only give an error if it wasn't in any table */ error = ((flags & RTF_HOST) ? EHOSTUNREACH : ENETUNREACH); } } else { if (a_failure) { /* return an error if any of them failed */ error = a_failure; } } return (error); } /* * Set up a routing table entry, normally * for an interface. */ int rtinit(struct ifaddr *ifa, int cmd, int flags) { struct sockaddr *dst; int fib = RT_DEFAULT_FIB; if (flags & RTF_HOST) { dst = ifa->ifa_dstaddr; } else { dst = ifa->ifa_addr; } switch (dst->sa_family) { case AF_INET6: case AF_INET: /* We do support multiple FIBs. */ fib = RT_ALL_FIBS; break; } return (rtinit1(ifa, cmd, flags, fib)); } /* * Announce interface address arrival/withdraw * Returns 0 on success. */ int rt_addrmsg(int cmd, struct ifaddr *ifa, int fibnum) { KASSERT(cmd == RTM_ADD || cmd == RTM_DELETE, ("unexpected cmd %d", cmd)); KASSERT(fibnum == RT_ALL_FIBS || (fibnum >= 0 && fibnum < rt_numfibs), ("%s: fib out of range 0 <=%d<%d", __func__, fibnum, rt_numfibs)); #if defined(INET) || defined(INET6) #ifdef SCTP /* * notify the SCTP stack * this will only get called when an address is added/deleted * XXX pass the ifaddr struct instead if ifa->ifa_addr... */ sctp_addr_change(ifa, cmd); #endif /* SCTP */ #endif return (rtsock_addrmsg(cmd, ifa, fibnum)); } /* * Announce route addition/removal. * Users of this function MUST validate input data BEFORE calling. * However we have to be able to handle invalid data: * if some userland app sends us "invalid" route message (invalid mask, * no dst, wrong address families, etc...) we need to pass it back * to app (and any other rtsock consumers) with rtm_errno field set to * non-zero value. * Returns 0 on success. */ int rt_routemsg(int cmd, struct ifnet *ifp, int error, struct rtentry *rt, int fibnum) { KASSERT(cmd == RTM_ADD || cmd == RTM_DELETE, ("unexpected cmd %d", cmd)); KASSERT(fibnum == RT_ALL_FIBS || (fibnum >= 0 && fibnum < rt_numfibs), ("%s: fib out of range 0 <=%d<%d", __func__, fibnum, rt_numfibs)); KASSERT(rt_key(rt) != NULL, (":%s: rt_key must be supplied", __func__)); return (rtsock_routemsg(cmd, ifp, error, rt, fibnum)); } void rt_newaddrmsg(int cmd, struct ifaddr *ifa, int error, struct rtentry *rt) { rt_newaddrmsg_fib(cmd, ifa, error, rt, RT_ALL_FIBS); } /* * This is called to generate messages from the routing socket * indicating a network interface has had addresses associated with it. */ void rt_newaddrmsg_fib(int cmd, struct ifaddr *ifa, int error, struct rtentry *rt, int fibnum) { KASSERT(cmd == RTM_ADD || cmd == RTM_DELETE, ("unexpected cmd %u", cmd)); KASSERT(fibnum == RT_ALL_FIBS || (fibnum >= 0 && fibnum < rt_numfibs), ("%s: fib out of range 0 <=%d<%d", __func__, fibnum, rt_numfibs)); if (cmd == RTM_ADD) { rt_addrmsg(cmd, ifa, fibnum); if (rt != NULL) rt_routemsg(cmd, ifa->ifa_ifp, error, rt, fibnum); } else { if (rt != NULL) rt_routemsg(cmd, ifa->ifa_ifp, error, rt, fibnum); rt_addrmsg(cmd, ifa, fibnum); } } Index: projects/routing/sys/net/route_internal.h =================================================================== --- projects/routing/sys/net/route_internal.h (revision 274335) +++ projects/routing/sys/net/route_internal.h (revision 274336) @@ -1,137 +1,146 @@ /*- * Copyright (c) 2014 * Alexander V. Chernikov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef _NET_ROUTE_INTERNAL_H_ #define _NET_ROUTE_INTERNAL_H_ struct rib_head { struct radix_head head; + struct rmlock rib_lock; /* data path lock */ rn_matchaddr_f_t *rnh_matchaddr; /* longest match for sockaddr */ rn_addaddr_f_t *rnh_addaddr; /* add based on sockaddr*/ rn_deladdr_f_t *rnh_deladdr; /* remove based on sockaddr */ rn_lookup_f_t *rnh_lookup; /* exact match for sockaddr */ rn_walktree_t *rnh_walktree; /* traverse tree */ rn_walktree_from_t *rnh_walktree_from; /* traverse tree below a */ rn_close_t *rnh_close; /*do something when the last ref drops*/ struct radix_node rnh_nodes[3]; /* empty tree for common case */ - struct rwlock rib_lock; /* locks entire radix tree */ + struct rwlock rib_cfglock; /* config lock */ struct radix_mask_head rmhead; /* masks radix head */ }; -#define RIB_RLOCK(rh) rw_rlock(&(rh)->rib_lock) -#define RIB_RUNLOCK(rh) rw_runlock(&(rh)->rib_lock) -#define RIB_WLOCK(rh) rw_wlock(&(rh)->rib_lock) -#define RIB_WUNLOCK(rh) rw_wunlock(&(rh)->rib_lock) -#define RIB_LOCK_ASSERT(rh) rw_assert(&(rh)->rib_lock, RA_LOCKED) -#define RIB_WLOCK_ASSERT(rh) rw_assert(&(rh)->rib_lock, RA_WLOCKED) +#define RIB_RLOCK(rh) rm_rlock(&(rh)->rib_lock, &tracker) +#define RIB_RUNLOCK(rh) rm_runlock(&(rh)->rib_lock, &tracker) +#define RIB_WLOCK(rh) rm_wlock(&(rh)->rib_lock) +#define RIB_WUNLOCK(rh) rm_wunlock(&(rh)->rib_lock) +#define RIB_WLOCK_ASSERT(rh) rm_assert(&(rh)->rib_lock, RA_WLOCKED) +#define RIB_LOCK_READER struct rm_priotracker tracker +#define RIB_LOCK_ASSERT(rh) rm_assert(&(rh)->rib_lock, RA_LOCKED) + +#define RIB_CFG_RLOCK(rh) rw_rlock(&(rh)->rib_cfglock) +#define RIB_CFG_RUNLOCK(rh) rw_runlock(&(rh)->rib_cfglock) +#define RIB_CFG_WLOCK(rh) rw_wlock(&(rh)->rib_cfglock) +#define RIB_CFG_WUNLOCK(rh) rw_wunlock(&(rh)->rib_cfglock) +#define RIB_CFG_LOCK_ASSERT(rh) rw_assert(&(rh)->rib_cfglock, RA_LOCKED) +#define RIB_CFG_WLOCK_ASSERT(rh) rw_assert(&(rh)->rib_cfglock, RA_WLOCKED) struct rib_head *rt_table_init(int offset); void rt_table_destroy(struct rib_head *rh); struct rtentry { struct radix_node rt_nodes[2]; /* tree glue, and other values */ /* * XXX struct rtentry must begin with a struct radix_node (or two!) * because the code does some casts of a 'struct radix_node *' * to a 'struct rtentry *' */ #define rt_key(r) (*((struct sockaddr **)(&(r)->rt_nodes->rn_key))) #define rt_mask(r) (*((struct sockaddr **)(&(r)->rt_nodes->rn_mask))) struct sockaddr *rt_gateway; /* value */ struct ifnet *rt_ifp; /* the answer: interface to use */ struct ifaddr *rt_ifa; /* the answer: interface address to use */ int rt_flags; /* up/down?, host/net */ int rt_refcnt; /* # held references */ u_int rt_fibnum; /* which FIB */ u_long rt_mtu; /* MTU for this path */ u_long rt_weight; /* absolute weight */ u_long rt_expire; /* lifetime for route, e.g. redirect */ #define rt_endzero rt_mtx struct mtx rt_mtx; /* mutex for routing entry */ }; #define RT_LOCK_INIT(_rt) \ mtx_init(&(_rt)->rt_mtx, "rtentry", NULL, MTX_DEF | MTX_DUPOK) #define RT_LOCK(_rt) mtx_lock(&(_rt)->rt_mtx) #define RT_UNLOCK(_rt) mtx_unlock(&(_rt)->rt_mtx) #define RT_LOCK_DESTROY(_rt) mtx_destroy(&(_rt)->rt_mtx) #define RT_LOCK_ASSERT(_rt) mtx_assert(&(_rt)->rt_mtx, MA_OWNED) #define RT_UNLOCK_COND(_rt) do { \ if (mtx_owned(&(_rt)->rt_mtx)) \ mtx_unlock(&(_rt)->rt_mtx); \ } while (0) #define RT_ADDREF(_rt) do { \ RT_LOCK_ASSERT(_rt); \ KASSERT((_rt)->rt_refcnt >= 0, \ ("negative refcnt %d", (_rt)->rt_refcnt)); \ (_rt)->rt_refcnt++; \ } while (0) #define RT_REMREF(_rt) do { \ RT_LOCK_ASSERT(_rt); \ KASSERT((_rt)->rt_refcnt > 0, \ ("bogus refcnt %d", (_rt)->rt_refcnt)); \ (_rt)->rt_refcnt--; \ } while (0) #define RTFREE_LOCKED(_rt) do { \ if ((_rt)->rt_refcnt <= 1) \ rtfree(_rt); \ else { \ RT_REMREF(_rt); \ RT_UNLOCK(_rt); \ } \ /* guard against invalid refs */ \ _rt = 0; \ } while (0) #define RTFREE(_rt) do { \ RT_LOCK(_rt); \ RTFREE_LOCKED(_rt); \ } while (0) #define RO_RTFREE(_ro) do { \ if ((_ro)->ro_rt) { \ if ((_ro)->ro_flags & RT_NORTREF) { \ (_ro)->ro_flags &= ~RT_NORTREF; \ (_ro)->ro_rt = NULL; \ } else { \ RT_LOCK((_ro)->ro_rt); \ RTFREE_LOCKED((_ro)->ro_rt); \ } \ } \ } while (0) #endif Index: projects/routing/sys/net/rt_nhops.c =================================================================== --- projects/routing/sys/net/rt_nhops.c (revision 274335) +++ projects/routing/sys/net/rt_nhops.c (revision 274336) @@ -1,1378 +1,1390 @@ /*- * Copyright (c) 2014 * Alexander V. Chernikov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Temporary file. In future it should be split between net/route.c * and per-AF files like netinet/in_rmx.c | netinet6/in6_rmx.c */ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_route.h" #include "opt_mpath.h" #include #include #include +#include +#include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef RADIX_MPATH #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include struct fwd_info { fib_lookup_t *lookup; void *state; }; #define FWD_FSM_NONE 0 #define FWD_FSM_INIT 1 #define FWD_FSM_FWD 2 struct fwd_control { int fwd_state; /* FSM */ struct fwd_module *fm; }; #if 0 static struct fwd_info *fwd_db[FWD_SIZE]; static struct fwd_control *fwd_ctl[FWD_SIZE]; static TAILQ_HEAD(fwd_module_list, fwd_module) modulehead = TAILQ_HEAD_INITIALIZER(modulehead); static struct fwd_module_list fwd_modules[FWD_SIZE]; static uint8_t fwd_map_af[] = { AF_INET, AF_INET6, }; static struct rwlock fwd_lock; #define FWD_LOCK_INIT() rw_init(&fwd_lock, "fwd_lock") #define FWD_RLOCK() rw_rlock(&fwd_lock) #define FWD_RUNLOCK() rw_runlock(&fwd_lock) #define FWD_WLOCK() rw_wlock(&fwd_lock) #define FWD_WUNLOCK() rw_wunlock(&fwd_lock) int fwd_attach_fib(struct fwd_module *fm, u_int fib); int fwd_destroy_fib(struct fwd_module *fm, u_int fib); #endif static inline uint16_t fib_rte_to_nh_flags(int rt_flags); #ifdef INET static void rib4_rte_to_nh_extended(struct rtentry *rte, struct in_addr dst, struct rt4_extended *prt4); static void fib4_rte_to_nh_extended(struct rtentry *rte, struct in_addr dst, struct nhop4_extended *pnh4); static void fib4_rte_to_nh_basic(struct rtentry *rte, struct in_addr dst, struct nhop4_basic *pnh4); #endif #ifdef INET6 static void fib6_rte_to_nh_extended(struct rtentry *rte, struct in6_addr *dst, struct nhop6_extended *pnh6); static void fib6_rte_to_nh_basic(struct rtentry *rte, struct in6_addr *dst, struct nhop6_basic *pnh6); static int fib6_storelladdr(struct ifnet *ifp, struct in6_addr *dst, int mm_flags, u_char *desten); static uint16_t fib6_get_ifa(struct rtentry *rte); static int fib6_lla_to_nh_basic(struct in6_addr *dst, uint32_t scopeid, struct nhop6_basic *pnh6); static int fib6_lla_to_nh_extended(struct in6_addr *dst, uint32_t scopeid, struct nhop6_extended *pnh6); static int fib6_lla_to_nh(struct in6_addr *dst, uint32_t scopeid, struct nhop_prepend *nh, struct ifnet **lifp); #endif MALLOC_DEFINE(M_RTFIB, "rtfib", "routing fwd"); /* * Per-AF fast routines returning minimal needed info. * It is not safe to dereference any pointers since it * may end up with use-after-free case. * Typically it may be used to check if outgoing * interface matches or to calculate proper MTU. * * Note that returned interface pointer is logical one, * e.g. actual transmit ifp may be different. * Difference may be triggered by * 1) loopback routes installed for interface addresses. * e.g. for address 10.0.0.1 with prefix /24 bound to * interface ix0, "logical" interface will be "ix0", * while "trasmit" interface will be "lo0" since this is * loopback route. You should consider using other * functions if you need "transmit" interface or both. * * * Returns 0 on match, error code overwise. */ //#define NHOP_DIRECT #define RNTORT(p) ((struct rtentry *)(p)) /* * Copies proper nexthop data based on @nh_src nexthop. * * For non-ECMP nexthop function simply copies @nh_src. * For ECMP nexthops flowid is used to select proper * nexthop. * */ static inline void fib_choose_prepend(uint32_t fibnum, struct nhop_prepend *nh_src, uint32_t flowid, struct nhop_prepend *nh, int af) { struct nhop_multi *nh_multi; int idx; if ((nh_src->nh_flags & NHF_RECURSE) != 0) { /* * Recursive nexthop. Choose direct nexthop * based on flowid. */ nh_multi = (struct nhop_multi *)nh_src; idx = nh_multi->nh_nhops[flowid % nh_multi->nh_count]; #if 0 KASSERT((fibnum < rt_numfibs), ("fib4_lookup_prepend§: bad fibnum")); rh = rt_tables_get_rnh(fibnum, AF_INET); //nh_src = &rh->nhops[i]; #endif } *nh = *nh_src; /* TODO: Do some light-weight refcounting on egress ifp's */ } static inline void fib_free_nh_prepend(uint32_t fibnum, struct nhop_prepend *nh, int af) { /* TODO: Do some light-weight refcounting on egress ifp's */ } #ifdef INET void fib4_free_nh_prepend(uint32_t fibnum, struct nhop_prepend *nh) { fib_free_nh_prepend(fibnum, nh, AF_INET); } void fib4_choose_prepend(uint32_t fibnum, struct nhop_prepend *nh_src, uint32_t flowid, struct nhop_prepend *nh, struct nhop4_extended *nh_ext) { fib_choose_prepend(fibnum, nh_src, flowid, nh, AF_INET); if (nh_ext == NULL) return; nh_ext->nh_ifp = NH_LIFP(nh); nh_ext->nh_mtu = nh->nh_mtu; nh_ext->nh_flags = nh->nh_flags; #if 0 /* TODO: copy source/gw address from extended nexthop data */ nh_ext->nh_addr = ; nh_ext->nh_src= ; #endif } /* * Function performs lookup in IPv4 table fib @fibnum. * * In case of successful lookup @nh header is filled with * appropriate interface info and full L2 header to prepend. * * If no valid ARP record is present, NHF_L2_INCOMPLETE flag * is set and gateway address is stored into nh->d.gw4 * * If @nh_ext is not NULL, additional nexthop data is stored there. * * Returns 0 on success. * */ int fib4_lookup_prepend(uint32_t fibnum, struct in_addr dst, struct mbuf *m, struct nhop_prepend *nh, struct nhop4_extended *nh_ext) { struct rib_head *rh; struct radix_node *rn; struct sockaddr_in *gw_sa, sin; struct ifnet *lifp; struct in_addr gw; struct ether_header *eh; int error, flags; - //uint32_t flowid; struct rtentry *rte; + RIB_LOCK_READER; KASSERT((fibnum < rt_numfibs), ("fib4_lookup_prepend: bad fibnum")); rh = rt_tables_get_rnh(fibnum, AF_INET); if (rh == NULL) return (EHOSTUNREACH); /* Prepare lookup key */ memset(&sin, 0, sizeof(sin)); sin.sin_len = sizeof(struct sockaddr_in); sin.sin_addr = dst; RIB_RLOCK(rh); rn = rh->rnh_matchaddr((void *)&sin, &rh->head); rte = RNTORT(rn); if (rn == NULL || ((rn->rn_flags & RNF_ROOT) != 0) || RT_LINK_IS_UP(rte->rt_ifp) == 0) { RIB_RUNLOCK(rh); return (EHOSTUNREACH); } /* * Currently we fill in @nh ourselves. * In near future rte will have nhop index to copy from. */ /* Calculate L3 info */ flags = 0; nh->nh_mtu = min(rte->rt_mtu, rte->rt_ifp->if_mtu); if (rte->rt_flags & RTF_GATEWAY) { gw_sa = (struct sockaddr_in *)rte->rt_gateway; gw = gw_sa->sin_addr; } else gw = dst; /* Set flags */ flags = fib_rte_to_nh_flags(rte->rt_flags); gw_sa = (struct sockaddr_in *)rt_key(rte); if (gw_sa->sin_addr.s_addr == 0) flags |= NHF_DEFAULT; /* * TODO: nh L2/L3 resolve. * Currently all we have is rte ifp. * Simply use it. */ /* Save interface address ifp */ lifp = rte->rt_ifa->ifa_ifp; nh->aifp_idx = lifp->if_index; /* Save both logical and transmit interface indexes */ lifp = rte->rt_ifp; nh->lifp_idx = lifp->if_index; nh->i.ifp_idx = nh->lifp_idx; if (nh_ext != NULL) { /* Fill in extended info */ fib4_rte_to_nh_extended(rte, dst, nh_ext); } RIB_RUNLOCK(rh); nh->nh_flags = flags; /* * Try to lookup L2 info. * Do this using separate LLE locks. * TODO: move this under radix lock. */ if (lifp->if_type == IFT_ETHER) { eh = (struct ether_header *)nh->d.data; /* * Fill in ethernet header. * It should be already presented if we're * sending data via known gateway. */ error = arpresolve_fast(lifp, gw, m ? m->m_flags : 0, eh->ether_dhost); if (error == 0) { memcpy(&eh->ether_shost, IF_LLADDR(lifp), ETHER_ADDR_LEN); eh->ether_type = htons(ETHERTYPE_IP); nh->nh_count = ETHER_HDR_LEN; return (0); } } /* Notify caller that no L2 info is linked */ nh->nh_count = 0; nh->nh_flags |= NHF_L2_INCOMPLETE; /* ..And save gateway address */ nh->d.gw4 = gw; return (0); } int fib4_sendmbuf(struct ifnet *ifp, struct mbuf *m, struct nhop_prepend *nh, struct in_addr dst) { int error; if (nh != NULL && (nh->nh_flags & NHF_L2_INCOMPLETE) == 0) { /* * Fast path case. Most packets should * be sent from here. * TODO: Make special ifnet * 'if_output_frame' handler for that. */ struct nhop_info ni; struct ether_header *eh; bzero(&ni, sizeof(ni)); ni.ni_flags = RT_NHOP; ni.ni_family = AF_INET; ni.ni_nh = nh; M_PREPEND(m, nh->nh_count, M_NOWAIT); if (m == NULL) return (ENOBUFS); eh = mtod(m, struct ether_header *); memcpy(eh, nh->d.data, nh->nh_count); error = (*ifp->if_output)(ifp, m, NULL, &ni); } else { struct sockaddr_in gw_out; memset(&gw_out, 0, sizeof(gw_out)); gw_out.sin_len = sizeof(gw_out); gw_out.sin_family = AF_INET; gw_out.sin_addr = nh ? nh->d.gw4 : dst; error = (*ifp->if_output)(ifp, m, (const struct sockaddr *)&gw_out, NULL); } return (error); } static inline uint16_t fib_rte_to_nh_flags(int rt_flags) { uint16_t res; res = (rt_flags & RTF_REJECT) ? NHF_REJECT : 0; res |= (rt_flags & RTF_BLACKHOLE) ? NHF_BLACKHOLE : 0; res |= (rt_flags & (RTF_DYNAMIC|RTF_MODIFIED)) ? NHF_REDIRECT : 0; res |= (rt_flags & RTF_BROADCAST) ? NHF_BROADCAST : 0; res |= (rt_flags & RTF_GATEWAY) ? NHF_GATEWAY : 0; return (res); } static void fib4_rte_to_nh_basic(struct rtentry *rte, struct in_addr dst, struct nhop4_basic *pnh4) { struct sockaddr_in *gw; pnh4->nh_ifp = rte->rt_ifa->ifa_ifp; pnh4->nh_mtu = min(rte->rt_mtu, rte->rt_ifp->if_mtu); if (rte->rt_flags & RTF_GATEWAY) { gw = (struct sockaddr_in *)rte->rt_gateway; pnh4->nh_addr = gw->sin_addr; } else pnh4->nh_addr = dst; /* Set flags */ pnh4->nh_flags = fib_rte_to_nh_flags(rte->rt_flags); gw = (struct sockaddr_in *)rt_key(rte); if (gw->sin_addr.s_addr == 0) pnh4->nh_flags |= NHF_DEFAULT; /* XXX: Set RTF_BROADCAST if GW address is broadcast */ } static void fib4_rte_to_nh_extended(struct rtentry *rte, struct in_addr dst, struct nhop4_extended *pnh4) { struct sockaddr_in *gw; struct in_ifaddr *ia; pnh4->nh_ifp = rte->rt_ifa->ifa_ifp; pnh4->nh_mtu = min(rte->rt_mtu, rte->rt_ifp->if_mtu); if (rte->rt_flags & RTF_GATEWAY) { gw = (struct sockaddr_in *)rte->rt_gateway; pnh4->nh_addr = gw->sin_addr; } else pnh4->nh_addr = dst; /* Set flags */ pnh4->nh_flags = fib_rte_to_nh_flags(rte->rt_flags); gw = (struct sockaddr_in *)rt_key(rte); if (gw->sin_addr.s_addr == 0) pnh4->nh_flags |= NHF_DEFAULT; /* XXX: Set RTF_BROADCAST if GW address is broadcast */ ia = ifatoia(rte->rt_ifa); pnh4->nh_src = IA_SIN(ia)->sin_addr; } static void rib4_rte_to_nh_extended(struct rtentry *rte, struct in_addr dst, struct rt4_extended *prt4) { struct sockaddr_in *gw; struct in_ifaddr *ia; /* Do explicit nexthop zero unless we're copying it */ memset(prt4, 0, sizeof(*prt4)); gw = ((struct sockaddr_in *)rt_key(rte)); prt4->rt_addr = gw->sin_addr; gw = ((struct sockaddr_in *)rt_mask(rte)); prt4->rt_mask.s_addr = (gw != NULL) ? gw->sin_addr.s_addr : INADDR_BROADCAST; if (rte->rt_flags & RTF_GATEWAY) { gw = (struct sockaddr_in *)rte->rt_gateway; prt4->rt_gateway = gw->sin_addr; } else prt4->rt_gateway = dst; prt4->rt_lifp = rte->rt_ifp; prt4->rt_aifp = rte->rt_ifa->ifa_ifp; prt4->rt_flags = rte->rt_flags; prt4->rt_mtu = min(rte->rt_mtu, rte->rt_ifp->if_mtu); prt4->rt_nhop = 0; /* XXX: fill real nexthop */ ia = ifatoia(rte->rt_ifa); prt4->rt_src = IA_SIN(ia)->sin_addr; } /* * Performs IPv4 route table lookup on @dst. Returns 0 on success. * Stores nexthop info provided @pnh4 structure. * Note that * - nh_ifp cannot be safely dereferenced * - nh_ifp represents ifaddr ifp (e.g. if looking up address on * interface "ix0" pointer to "ix0" interface will be returned instead * of "lo0") * - howewer mtu from "transmit" interface will be returned. */ int fib4_lookup_nh_basic(uint32_t fibnum, struct in_addr dst, uint32_t flowid, struct nhop4_basic *pnh4) { struct rib_head *rh; struct radix_node *rn; struct sockaddr_in sin; struct rtentry *rte; + RIB_LOCK_READER; KASSERT((fibnum < rt_numfibs), ("fib4_lookup_nh_basic: bad fibnum")); rh = rt_tables_get_rnh(fibnum, AF_INET); if (rh == NULL) return (ENOENT); /* Prepare lookup key */ memset(&sin, 0, sizeof(sin)); sin.sin_len = sizeof(struct sockaddr_in); sin.sin_addr = dst; RIB_RLOCK(rh); rn = rh->rnh_matchaddr((void *)&sin, &rh->head); if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) { rte = RNTORT(rn); /* Ensure route & ifp is UP */ if (RT_LINK_IS_UP(rte->rt_ifp)) { fib4_rte_to_nh_basic(rte, dst, pnh4); RIB_RUNLOCK(rh); return (0); } } RIB_RUNLOCK(rh); return (ENOENT); } int fib4_lookup_nh_ifp(uint32_t fibnum, struct in_addr dst, uint32_t flowid, struct nhop4_basic *pnh4) { struct rib_head *rh; struct radix_node *rn; struct sockaddr_in sin; struct rtentry *rte; + RIB_LOCK_READER; KASSERT((fibnum < rt_numfibs), ("fib4_lookup_nh_ifp: bad fibnum")); rh = rt_tables_get_rnh(fibnum, AF_INET); if (rh == NULL) return (ENOENT); /* Prepare lookup key */ memset(&sin, 0, sizeof(sin)); sin.sin_len = sizeof(struct sockaddr_in); sin.sin_addr = dst; RIB_RLOCK(rh); rn = rh->rnh_matchaddr((void *)&sin, &rh->head); if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) { rte = RNTORT(rn); /* Ensure route & ifp is UP */ if (RT_LINK_IS_UP(rte->rt_ifp)) { fib4_rte_to_nh_basic(rte, dst, pnh4); RIB_RUNLOCK(rh); pnh4->nh_ifp = rte->rt_ifp; return (0); } } RIB_RUNLOCK(rh); return (ENOENT); } /* * Performs IPv4 route table lookup on @dst. Returns 0 on success. * Stores extende nexthop info provided @pnh4 structure. * Note that * - nh_ifp cannot be safely dereferenced unless NHOP_LOOKUP_REF is specified. * - in that case you need to call fib4_free_nh_ext() * - nh_ifp represents logical transmit interface (rt_ifp) * - mtu from logical transmit interface will be returned. */ int fib4_lookup_nh_ext(uint32_t fibnum, struct in_addr dst, uint32_t flowid, uint32_t flags, struct nhop4_extended *pnh4) { struct rib_head *rh; struct radix_node *rn; struct sockaddr_in sin; struct rtentry *rte; + RIB_LOCK_READER; KASSERT((fibnum < rt_numfibs), ("fib4_lookup_nh_ext: bad fibnum")); rh = rt_tables_get_rnh(fibnum, AF_INET); if (rh == NULL) return (ENOENT); /* Prepare lookup key */ memset(&sin, 0, sizeof(sin)); sin.sin_len = sizeof(struct sockaddr_in); sin.sin_addr = dst; RIB_RLOCK(rh); rn = rh->rnh_matchaddr((void *)&sin, &rh->head); if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) { rte = RNTORT(rn); /* Ensure route & ifp is UP */ if (RT_LINK_IS_UP(rte->rt_ifp)) { fib4_rte_to_nh_extended(rte, dst, pnh4); if ((flags & NHOP_LOOKUP_REF) != 0) { /* TODO: Do lwref on egress ifp's */ } RIB_RUNLOCK(rh); return (0); } } RIB_RUNLOCK(rh); return (ENOENT); } void fib4_free_nh_ext(uint32_t fibnum, struct nhop4_extended *pnh4) { } int rib4_lookup_nh_ext(uint32_t fibnum, struct in_addr dst, uint32_t flowid, uint32_t flags, struct rt4_extended *prt4) { struct rib_head *rh; struct radix_node *rn; struct sockaddr_in sin; struct rtentry *rte; + RIB_LOCK_READER; KASSERT((fibnum < rt_numfibs), ("rib4_lookup_nh_ext: bad fibnum")); rh = rt_tables_get_rnh(fibnum, AF_INET); if (rh == NULL) return (ENOENT); /* Prepare lookup key */ memset(&sin, 0, sizeof(sin)); sin.sin_len = sizeof(struct sockaddr_in); sin.sin_addr = dst; RIB_RLOCK(rh); rn = rh->rnh_matchaddr((void *)&sin, &rh->head); if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) { rte = RNTORT(rn); /* Ensure route & ifp is UP */ if (RT_LINK_IS_UP(rte->rt_ifp)) { rib4_rte_to_nh_extended(rte, dst, prt4); if ((flags & NHOP_LOOKUP_REF) != 0) { /* TODO: Do lwref on egress ifp's */ } RIB_RUNLOCK(rh); return (0); } } RIB_RUNLOCK(rh); return (ENOENT); } void rib4_free_nh_ext(uint32_t fibnum, struct rt4_extended *prt4) { } #endif #ifdef INET6 void fib6_free_nh_prepend(uint32_t fibnum, struct nhop_prepend *nh) { fib_free_nh_prepend(fibnum, nh, AF_INET6); } void fib6_choose_prepend(uint32_t fibnum, struct nhop_prepend *nh_src, uint32_t flowid, struct nhop_prepend *nh, struct nhop6_extended *nh_ext) { fib_choose_prepend(fibnum, nh_src, flowid, nh, AF_INET6); if (nh_ext == NULL) return; nh_ext->nh_ifp = NH_LIFP(nh); nh_ext->nh_mtu = nh->nh_mtu; nh_ext->nh_flags = nh->nh_flags; /* nh_ext->nh_addr = ; nh_ext->nh_src= ; */ } /* * Temporary function to copy ethernet address from valid lle */ static int fib6_storelladdr(struct ifnet *ifp, struct in6_addr *dst, int mm_flags, u_char *desten) { struct llentry *ln; struct sockaddr_in6 dst_sa; if (mm_flags & M_MCAST) { ETHER_MAP_IPV6_MULTICAST(&dst, desten); return (0); } memset(&dst_sa, 0, sizeof(dst_sa)); dst_sa.sin6_family = AF_INET6; dst_sa.sin6_len = sizeof(dst_sa); dst_sa.sin6_addr = *dst; dst_sa.sin6_scope_id = ifp->if_index; /* * the entry should have been created in nd6_store_lladdr */ IF_AFDATA_RLOCK(ifp); ln = lla_lookup(LLTABLE6(ifp), 0, (struct sockaddr *)&dst_sa); /* * Perform fast path for the following cases: * 1) lle state is REACHABLE * 2) lle state is DELAY (NS message sentNS message sent) * * Every other case involves lle modification, so we handle * them separately. */ if (ln == NULL || (ln->ln_state != ND6_LLINFO_REACHABLE && ln->ln_state != ND6_LLINFO_DELAY)) { if (ln != NULL) LLE_RUNLOCK(ln); IF_AFDATA_RUNLOCK(ifp); return (1); } bcopy(&ln->ll_addr, desten, ifp->if_addrlen); LLE_RUNLOCK(ln); IF_AFDATA_RUNLOCK(ifp); return (0); } int fib6_lookup_prepend(uint32_t fibnum, struct in6_addr *dst, uint32_t scopeid, struct mbuf *m, struct nhop_prepend *nh, struct nhop6_extended *nh_ext) { struct rib_head *rh; struct radix_node *rn; struct sockaddr_in6 sin6, *gw_sa; struct in6_addr gw6; struct rtentry *rte; struct ifnet *lifp; struct ether_header *eh; + RIB_LOCK_READER; uint32_t flags; int error; if (IN6_IS_SCOPE_LINKLOCAL(dst)) { /* Do not lookup link-local addresses in rtable */ error = fib6_lla_to_nh(dst, scopeid, nh, &lifp); if (error != 0) return (error); /* */ gw6 = *dst; goto do_l2; } KASSERT((fibnum < rt_numfibs), ("fib6_lookup_prepend: bad fibnum")); rh = rt_tables_get_rnh(fibnum, AF_INET6); if (rh == NULL) return (ENOENT); /* Prepare lookup key */ memset(&sin6, 0, sizeof(sin6)); sin6.sin6_len = sizeof(struct sockaddr_in6); sin6.sin6_addr = *dst; sin6.sin6_scope_id = scopeid; sa6_embedscope(&sin6, 0); RIB_RLOCK(rh); rn = rh->rnh_matchaddr((void *)&sin6, &rh->head); rte = RNTORT(rn); if (rn == NULL || ((rn->rn_flags & RNF_ROOT) != 0) || RT_LINK_IS_UP(rte->rt_ifp) == 0) { RIB_RUNLOCK(rh); return (EHOSTUNREACH); } /* Explicitly zero nexthop */ memset(nh, 0, sizeof(*nh)); flags = 0; nh->nh_mtu = min(rte->rt_mtu, IN6_LINKMTU(rte->rt_ifp)); if (rte->rt_flags & RTF_GATEWAY) { gw_sa = (struct sockaddr_in6 *)rte->rt_gateway; gw6 = gw_sa->sin6_addr; in6_clearscope(&gw6); } else gw6 = *dst; /* Set flags */ flags = fib_rte_to_nh_flags(rte->rt_flags); gw_sa = (struct sockaddr_in6 *)rt_key(rte); if (IN6_IS_ADDR_UNSPECIFIED(&gw_sa->sin6_addr)) flags |= NHF_DEFAULT; /* * TODO: nh L2/L3 resolve. * Currently all we have is rte ifp. * Simply use it. */ /* Save interface address ifp */ nh->aifp_idx = fib6_get_ifa(rte); /* Save both logical and transmit interface indexes */ lifp = rte->rt_ifp; nh->lifp_idx = lifp->if_index; nh->i.ifp_idx = nh->lifp_idx; RIB_RUNLOCK(rh); nh->nh_flags = flags; do_l2: /* * Try to lookup L2 info. * Do this using separate LLE locks. * TODO: move this under radix lock. */ if (lifp->if_type == IFT_ETHER) { eh = (struct ether_header *)nh->d.data; /* * Fill in ethernet header. * It should be already presented if we're * sending data via known gateway. */ error = fib6_storelladdr(lifp, &gw6, m ? m->m_flags : 0, eh->ether_dhost); if (error == 0) { memcpy(&eh->ether_shost, IF_LLADDR(lifp), ETHER_ADDR_LEN); eh->ether_type = htons(ETHERTYPE_IPV6); nh->nh_count = ETHER_HDR_LEN; return (0); } } /* Notify caller that no L2 info is linked */ nh->nh_count = 0; nh->nh_flags |= NHF_L2_INCOMPLETE; /* ..And save gateway address */ nh->d.gw6 = gw6; return (0); } int fib6_sendmbuf(struct ifnet *ifp, struct ifnet *origifp, struct mbuf *m, struct nhop_prepend *nh) { int error; if (nh != NULL && (nh->nh_flags & NHF_L2_INCOMPLETE) == 0) { /* * Fast path case. Most packets should * be sent from here. * TODO: Make special ifnet * 'if_output_frame' handler for that. */ struct nhop_info ni; struct ether_header *eh; bzero(&ni, sizeof(ni)); ni.ni_family = AF_INET6; ni.ni_flags = RT_NHOP; ni.ni_nh = nh; M_PREPEND(m, nh->nh_count, M_NOWAIT); if (m == NULL) return (ENOBUFS); eh = mtod(m, struct ether_header *); memcpy(eh, nh->d.data, nh->nh_count); error = (*ifp->if_output)(ifp, m, NULL, &ni); } else { /* We need to perform ND lookup */ struct sockaddr_in6 gw_out; memset(&gw_out, 0, sizeof(gw_out)); gw_out.sin6_family = AF_INET6; gw_out.sin6_len = sizeof(gw_out); gw_out.sin6_addr = nh->d.gw6; gw_out.sin6_scope_id = ifp->if_index; sa6_embedscope(&gw_out, 0); error = nd6_output(ifp, origifp, m, &gw_out, NULL); } return (error); } static uint16_t fib6_get_ifa(struct rtentry *rte) { struct ifnet *ifp; struct sockaddr_dl *sdl; ifp = rte->rt_ifp; if ((ifp->if_flags & IFF_LOOPBACK) && rte->rt_gateway->sa_family == AF_LINK) { sdl = (struct sockaddr_dl *)rte->rt_gateway; return (sdl->sdl_index); } return (ifp->if_index); #if 0 /* IPv6 case */ /* Alternative way to get interface address ifp */ /* * Adjust the "outgoing" interface. If we're going to loop * the packet back to ourselves, the ifp would be the loopback * interface. However, we'd rather know the interface associated * to the destination address (which should probably be one of * our own addresses.) */ if (rt) { if ((rt->rt_ifp->if_flags & IFF_LOOPBACK) && (rt->rt_gateway->sa_family == AF_LINK)) *retifp = ifnet_byindex(((struct sockaddr_dl *) rt->rt_gateway)->sdl_index); } /* IPv4 case */ //pnh6->nh_ifp = rte->rt_ifa->ifa_ifp; #endif } static int fib6_lla_to_nh_basic(struct in6_addr *dst, uint32_t scopeid, struct nhop6_basic *pnh6) { struct ifnet *ifp; ifp = ifnet_byindex_locked(scopeid); if (ifp == NULL) return (ENOENT); /* Do explicit nexthop zero unless we're copying it */ memset(pnh6, 0, sizeof(*pnh6)); pnh6->nh_ifp = ifp; pnh6->nh_mtu = IN6_LINKMTU(ifp); /* No flags set */ pnh6->nh_addr = *dst; return (0); } static int fib6_lla_to_nh_extended(struct in6_addr *dst, uint32_t scopeid, struct nhop6_extended *pnh6) { struct ifnet *ifp; ifp = ifnet_byindex_locked(scopeid); if (ifp == NULL) return (ENOENT); /* Do explicit nexthop zero unless we're copying it */ memset(pnh6, 0, sizeof(*pnh6)); pnh6->nh_ifp = ifp; pnh6->nh_mtu = IN6_LINKMTU(ifp); /* No flags set */ pnh6->nh_addr = *dst; return (0); } static int rib6_lla_to_nh_extended(struct in6_addr *dst, uint32_t scopeid, struct rt6_extended *prt6) { struct ifnet *ifp; ifp = ifnet_byindex_locked(scopeid); if (ifp == NULL) return (ENOENT); /* Do explicit nexthop zero unless we're copying it */ memset(prt6, 0, sizeof(*prt6)); prt6->rt_addr.s6_addr16[0] = htons(0xFE80); prt6->rt_mask = 64; /* XXX check RFC */ prt6->rt_aifp = ifp; prt6->rt_lifp = ifp; /* Check id this is for-us address */ if (in6_ifawithifp_lla(ifp, dst)) { if ((ifp = V_loif) != NULL) prt6->rt_lifp = ifp; } prt6->rt_mtu = IN6_LINKMTU(ifp); /* No flags set */ return (0); } static int fib6_lla_to_nh(struct in6_addr *dst, uint32_t scopeid, struct nhop_prepend *nh, struct ifnet **lifp) { struct ifnet *ifp; ifp = ifnet_byindex_locked(scopeid); if (ifp == NULL) return (ENOENT); /* Do explicit nexthop zero unless we're copying it */ memset(nh, 0, sizeof(*nh)); /* No flags set */ nh->nh_mtu = IN6_LINKMTU(ifp); /* Save lifp */ *lifp = ifp; nh->aifp_idx = scopeid; nh->lifp_idx = scopeid; /* Check id this is for-us address */ if (in6_ifawithifp_lla(ifp, dst)) { if ((ifp = V_loif) != NULL) nh->lifp_idx = ifp->if_index; } return (0); } static void fib6_rte_to_nh_basic(struct rtentry *rte, struct in6_addr *dst, struct nhop6_basic *pnh6) { struct sockaddr_in6 *gw; /* Do explicit nexthop zero unless we're copying it */ memset(pnh6, 0, sizeof(*pnh6)); pnh6->nh_ifp = ifnet_byindex(fib6_get_ifa(rte)); pnh6->nh_mtu = min(rte->rt_mtu, IN6_LINKMTU(rte->rt_ifp)); if (rte->rt_flags & RTF_GATEWAY) { gw = (struct sockaddr_in6 *)rte->rt_gateway; pnh6->nh_addr = gw->sin6_addr; in6_clearscope(&pnh6->nh_addr); } else pnh6->nh_addr = *dst; /* Set flags */ pnh6->nh_flags = fib_rte_to_nh_flags(rte->rt_flags); gw = (struct sockaddr_in6 *)rt_key(rte); if (IN6_IS_ADDR_UNSPECIFIED(&gw->sin6_addr)) pnh6->nh_flags |= NHF_DEFAULT; } static void fib6_rte_to_nh_extended(struct rtentry *rte, struct in6_addr *dst, struct nhop6_extended *pnh6) { struct sockaddr_in6 *gw; struct in6_ifaddr *ia; /* Do explicit nexthop zero unless we're copying it */ memset(pnh6, 0, sizeof(*pnh6)); pnh6->nh_ifp = ifnet_byindex(fib6_get_ifa(rte)); pnh6->nh_mtu = min(rte->rt_mtu, IN6_LINKMTU(rte->rt_ifp)); if (rte->rt_flags & RTF_GATEWAY) { gw = (struct sockaddr_in6 *)rte->rt_gateway; pnh6->nh_addr = gw->sin6_addr; in6_clearscope(&pnh6->nh_addr); } else pnh6->nh_addr = *dst; /* Set flags */ pnh6->nh_flags = fib_rte_to_nh_flags(rte->rt_flags); gw = (struct sockaddr_in6 *)rt_key(rte); if (IN6_IS_ADDR_UNSPECIFIED(&gw->sin6_addr)) pnh6->nh_flags |= NHF_DEFAULT; ia = ifatoia6(rte->rt_ifa); } #define ipv6_masklen(x) bitcount32((x).__u6_addr.__u6_addr32[0]) + \ bitcount32((x).__u6_addr.__u6_addr32[1]) + \ bitcount32((x).__u6_addr.__u6_addr32[2]) + \ bitcount32((x).__u6_addr.__u6_addr32[3]) static void rib6_rte_to_nh_extended(struct rtentry *rte, struct in6_addr *dst, struct rt6_extended *prt6) { struct sockaddr_in6 *gw; /* Do explicit nexthop zero unless we're copying it */ memset(prt6, 0, sizeof(*prt6)); gw = ((struct sockaddr_in6 *)rt_key(rte)); prt6->rt_addr = gw->sin6_addr; gw = ((struct sockaddr_in6 *)rt_mask(rte)); prt6->rt_mask = (gw != NULL) ? ipv6_masklen(gw->sin6_addr) : 128; if (rte->rt_flags & RTF_GATEWAY) { gw = (struct sockaddr_in6 *)rte->rt_gateway; prt6->rt_gateway = gw->sin6_addr; in6_clearscope(&prt6->rt_gateway); } else prt6->rt_gateway = *dst; prt6->rt_lifp = rte->rt_ifp; prt6->rt_aifp = ifnet_byindex(fib6_get_ifa(rte)); prt6->rt_flags = fib_rte_to_nh_flags(rte->rt_flags); prt6->rt_mtu = min(rte->rt_mtu, IN6_LINKMTU(rte->rt_ifp)); } int fib6_lookup_nh_ifp(uint32_t fibnum, struct in6_addr *dst, uint32_t scopeid, uint32_t flowid, struct nhop6_basic *pnh6) { struct rib_head *rh; struct radix_node *rn; struct sockaddr_in6 sin6; struct rtentry *rte; + RIB_LOCK_READER; if (IN6_IS_SCOPE_LINKLOCAL(dst)) { /* Do not lookup link-local addresses in rtable */ /* XXX: Check if dst is local */ return (fib6_lla_to_nh_basic(dst, scopeid, pnh6)); } KASSERT((fibnum < rt_numfibs), ("fib6_lookup_nh_basic: bad fibnum")); rh = rt_tables_get_rnh(fibnum, AF_INET6); if (rh == NULL) return (ENOENT); /* Prepare lookup key */ memset(&sin6, 0, sizeof(sin6)); sin6.sin6_addr = *dst; sin6.sin6_scope_id = scopeid; sa6_embedscope(&sin6, 0); RIB_RLOCK(rh); rn = rh->rnh_matchaddr((void *)&sin6, &rh->head); if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) { rte = RNTORT(rn); /* Ensure route & ifp is UP */ if (RT_LINK_IS_UP(rte->rt_ifp)) { fib6_rte_to_nh_basic(rte, dst, pnh6); pnh6->nh_ifp = rte->rt_ifp; RIB_RUNLOCK(rh); return (0); } } RIB_RUNLOCK(rh); return (ENOENT); } int fib6_lookup_nh_basic(uint32_t fibnum, struct in6_addr *dst, uint32_t scopeid, uint32_t flowid, struct nhop6_basic *pnh6) { struct rib_head *rh; struct radix_node *rn; struct sockaddr_in6 sin6; struct rtentry *rte; + RIB_LOCK_READER; if (IN6_IS_SCOPE_LINKLOCAL(dst)) { /* Do not lookup link-local addresses in rtable */ return (fib6_lla_to_nh_basic(dst, scopeid, pnh6)); } KASSERT((fibnum < rt_numfibs), ("fib6_lookup_nh_basic: bad fibnum")); rh = rt_tables_get_rnh(fibnum, AF_INET6); if (rh == NULL) return (ENOENT); /* Prepare lookup key */ memset(&sin6, 0, sizeof(sin6)); sin6.sin6_addr = *dst; sin6.sin6_scope_id = scopeid; sa6_embedscope(&sin6, 0); RIB_RLOCK(rh); rn = rh->rnh_matchaddr((void *)&sin6, &rh->head); if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) { rte = RNTORT(rn); /* Ensure route & ifp is UP */ if (RT_LINK_IS_UP(rte->rt_ifp)) { fib6_rte_to_nh_basic(rte, dst, pnh6); RIB_RUNLOCK(rh); return (0); } } RIB_RUNLOCK(rh); return (ENOENT); } /* * Performs IPv6 route table lookup on @dst. Returns 0 on success. * Stores extende nexthop info provided @pnh4 structure. * Note that * - nh_ifp cannot be safely dereferenced unless NHOP_LOOKUP_REF is specified. * - in that case you need to call fib6_free_nh_ext() * - nh_ifp represents logical transmit interface (rt_ifp) * - mtu from logical transmit interface will be returned. */ int fib6_lookup_nh_ext(uint32_t fibnum, struct in6_addr *dst, uint32_t scopeid, uint32_t flowid, uint32_t flags, struct nhop6_extended *pnh6) { struct rib_head *rh; struct radix_node *rn; struct sockaddr_in6 sin6; struct rtentry *rte; + RIB_LOCK_READER; if (IN6_IS_SCOPE_LINKLOCAL(dst)) { /* Do not lookup link-local addresses in rtable */ /* XXX: Do lwref on egress ifp */ return (fib6_lla_to_nh_extended(dst, scopeid, pnh6)); } KASSERT((fibnum < rt_numfibs), ("fib4_lookup_nh_ext: bad fibnum")); rh = rt_tables_get_rnh(fibnum, AF_INET6); if (rh == NULL) return (ENOENT); /* Prepare lookup key */ memset(&sin6, 0, sizeof(sin6)); sin6.sin6_len = sizeof(struct sockaddr_in6); sin6.sin6_addr = *dst; sin6.sin6_scope_id = scopeid; sa6_embedscope(&sin6, 0); RIB_RLOCK(rh); rn = rh->rnh_matchaddr((void *)&sin6, &rh->head); if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) { rte = RNTORT(rn); /* Ensure route & ifp is UP */ if (RT_LINK_IS_UP(rte->rt_ifp)) { fib6_rte_to_nh_extended(rte, dst, pnh6); if ((flags & NHOP_LOOKUP_REF) != 0) { /* TODO: Do lwref on egress ifp's */ } RIB_RUNLOCK(rh); return (0); } } RIB_RUNLOCK(rh); return (ENOENT); } void fib6_free_nh_ext(uint32_t fibnum, struct nhop6_extended *pnh6) { } int rib6_lookup_nh_ext(uint32_t fibnum, struct in6_addr *dst, uint32_t scopeid, uint32_t flowid, uint32_t flags, struct rt6_extended *prt6) { struct rib_head *rh; struct radix_node *rn; struct sockaddr_in6 sin6; struct rtentry *rte; + RIB_LOCK_READER; if (IN6_IS_SCOPE_LINKLOCAL(dst)) { /* Do not lookup link-local addresses in rtable */ /* XXX: Do lwref on egress ifp */ return (rib6_lla_to_nh_extended(dst, scopeid, prt6)); } KASSERT((fibnum < rt_numfibs), ("rib6_lookup_nh_ext: bad fibnum")); rh = rt_tables_get_rnh(fibnum, AF_INET6); if (rh == NULL) return (ENOENT); /* Prepare lookup key */ memset(&sin6, 0, sizeof(sin6)); sin6.sin6_len = sizeof(struct sockaddr_in6); sin6.sin6_addr = *dst; sin6.sin6_scope_id = scopeid; sa6_embedscope(&sin6, 0); RIB_RLOCK(rh); rn = rh->rnh_matchaddr((void *)&sin6, &rh->head); if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) { rte = RNTORT(rn); /* Ensure route & ifp is UP */ if (RT_LINK_IS_UP(rte->rt_ifp)) { rib6_rte_to_nh_extended(rte, dst, prt6); if ((flags & NHOP_LOOKUP_REF) != 0) { /* TODO: Do lwref on egress ifp's */ } RIB_RUNLOCK(rh); return (0); } } RIB_RUNLOCK(rh); return (ENOENT); } void rib6_free_nh_ext(uint32_t fibnum, struct nhop6_extended *prt6) { } #endif void fib_free_nh_ext(uint32_t fibnum, struct nhopu_extended *pnhu) { } #if 0 typedef void nhop_change_cb_t(void *state); struct nhop_tracker { TAILQ_ENTRY(nhop_tracker) next; nhop_change_cb_t *f; void *state; uint32_t fibnum; struct sockaddr_storage ss; }; struct nhop_tracker * nhop_alloc_tracked(uint32_t fibnum, struct sockaddr *sa, nhop_change_cb_t *f, void *state) { struct nhop_tracker *nt; nt = malloc(sizeof(struct nhop_tracker), M_RTFIB, M_WAITOK | M_ZERO); nt->f = f; nt-state = state; nt->fibnum = fibnum; memcpy(&nt->ss, sa, sa->sa_len); return (nt); } int nhop_bind(struct nhop_tracker *nt) { NHOP_LOCK(nnh); NHOP_UNLOCK(nnh); return (0); } #endif Index: projects/routing/sys/net/rtsock.c =================================================================== --- projects/routing/sys/net/rtsock.c (revision 274335) +++ projects/routing/sys/net/rtsock.c (revision 274336) @@ -1,1921 +1,1924 @@ /*- * Copyright (c) 1988, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)rtsock.c 8.7 (Berkeley) 10/12/95 * $FreeBSD$ */ #include "opt_compat.h" #include "opt_mpath.h" #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include +#include +#include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #endif #ifdef COMPAT_FREEBSD32 #include #include struct if_msghdr32 { uint16_t ifm_msglen; uint8_t ifm_version; uint8_t ifm_type; int32_t ifm_addrs; int32_t ifm_flags; uint16_t ifm_index; struct if_data ifm_data; }; struct if_msghdrl32 { uint16_t ifm_msglen; uint8_t ifm_version; uint8_t ifm_type; int32_t ifm_addrs; int32_t ifm_flags; uint16_t ifm_index; uint16_t _ifm_spare1; uint16_t ifm_len; uint16_t ifm_data_off; struct if_data ifm_data; }; struct ifa_msghdrl32 { uint16_t ifam_msglen; uint8_t ifam_version; uint8_t ifam_type; int32_t ifam_addrs; int32_t ifam_flags; uint16_t ifam_index; uint16_t _ifam_spare1; uint16_t ifam_len; uint16_t ifam_data_off; int32_t ifam_metric; struct if_data ifam_data; }; #endif /* COMPAT_FREEBSD32 */ MALLOC_DEFINE(M_RTABLE, "routetbl", "routing tables"); /* NB: these are not modified */ static struct sockaddr route_src = { 2, PF_ROUTE, }; static struct sockaddr sa_zero = { sizeof(sa_zero), AF_INET, }; /* These are external hooks for CARP. */ int (*carp_get_vhid_p)(struct ifaddr *); /* * Used by rtsock/raw_input callback code to decide whether to filter the update * notification to a socket bound to a particular FIB. */ #define RTS_FILTER_FIB M_PROTO8 typedef struct { int ip_count; /* attached w/ AF_INET */ int ip6_count; /* attached w/ AF_INET6 */ int any_count; /* total attached */ } route_cb_t; static VNET_DEFINE(route_cb_t, route_cb); #define V_route_cb VNET(route_cb) struct mtx rtsock_mtx; MTX_SYSINIT(rtsock, &rtsock_mtx, "rtsock route_cb lock", MTX_DEF); #define RTSOCK_LOCK() mtx_lock(&rtsock_mtx) #define RTSOCK_UNLOCK() mtx_unlock(&rtsock_mtx) #define RTSOCK_LOCK_ASSERT() mtx_assert(&rtsock_mtx, MA_OWNED) static SYSCTL_NODE(_net, OID_AUTO, route, CTLFLAG_RD, 0, ""); struct walkarg { int w_tmemsize; int w_op, w_arg; caddr_t w_tmem; struct sysctl_req *w_req; }; static void rts_input(struct mbuf *m); static struct mbuf *rtsock_msg_mbuf(int type, struct rt_addrinfo *rtinfo); static int rtsock_msg_buffer(int type, struct rt_addrinfo *rtinfo, struct walkarg *w, int *plen); static int rt_xaddrs(caddr_t cp, caddr_t cplim, struct rt_addrinfo *rtinfo); static int sysctl_dumpentry(struct radix_node *rn, void *vw); static int sysctl_iflist(int af, struct walkarg *w); static int sysctl_ifmalist(int af, struct walkarg *w); static int route_output(struct mbuf *m, struct socket *so, ...); static void rt_getmetrics(const struct rtentry *rt, struct rt_metrics *out); static void rt_dispatch(struct mbuf *, sa_family_t); static struct sockaddr *rtsock_fix_netmask(struct sockaddr *dst, struct sockaddr *smask, struct sockaddr_storage *dmask); static struct netisr_handler rtsock_nh = { .nh_name = "rtsock", .nh_handler = rts_input, .nh_proto = NETISR_ROUTE, .nh_policy = NETISR_POLICY_SOURCE, }; static int sysctl_route_netisr_maxqlen(SYSCTL_HANDLER_ARGS) { int error, qlimit; netisr_getqlimit(&rtsock_nh, &qlimit); error = sysctl_handle_int(oidp, &qlimit, 0, req); if (error || !req->newptr) return (error); if (qlimit < 1) return (EINVAL); return (netisr_setqlimit(&rtsock_nh, qlimit)); } SYSCTL_PROC(_net_route, OID_AUTO, netisr_maxqlen, CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_route_netisr_maxqlen, "I", "maximum routing socket dispatch queue length"); static void rts_init(void) { int tmp; if (TUNABLE_INT_FETCH("net.route.netisr_maxqlen", &tmp)) rtsock_nh.nh_qlimit = tmp; netisr_register(&rtsock_nh); } SYSINIT(rtsock, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, rts_init, 0); static int raw_input_rts_cb(struct mbuf *m, struct sockproto *proto, struct sockaddr *src, struct rawcb *rp) { int fibnum; KASSERT(m != NULL, ("%s: m is NULL", __func__)); KASSERT(proto != NULL, ("%s: proto is NULL", __func__)); KASSERT(rp != NULL, ("%s: rp is NULL", __func__)); /* No filtering requested. */ if ((m->m_flags & RTS_FILTER_FIB) == 0) return (0); /* Check if it is a rts and the fib matches the one of the socket. */ fibnum = M_GETFIB(m); if (proto->sp_family != PF_ROUTE || rp->rcb_socket == NULL || rp->rcb_socket->so_fibnum == fibnum) return (0); /* Filtering requested and no match, the socket shall be skipped. */ return (1); } static void rts_input(struct mbuf *m) { struct sockproto route_proto; unsigned short *family; struct m_tag *tag; route_proto.sp_family = PF_ROUTE; tag = m_tag_find(m, PACKET_TAG_RTSOCKFAM, NULL); if (tag != NULL) { family = (unsigned short *)(tag + 1); route_proto.sp_protocol = *family; m_tag_delete(m, tag); } else route_proto.sp_protocol = 0; raw_input_ext(m, &route_proto, &route_src, raw_input_rts_cb); } /* * It really doesn't make any sense at all for this code to share much * with raw_usrreq.c, since its functionality is so restricted. XXX */ static void rts_abort(struct socket *so) { raw_usrreqs.pru_abort(so); } static void rts_close(struct socket *so) { raw_usrreqs.pru_close(so); } /* pru_accept is EOPNOTSUPP */ static int rts_attach(struct socket *so, int proto, struct thread *td) { struct rawcb *rp; int error; KASSERT(so->so_pcb == NULL, ("rts_attach: so_pcb != NULL")); /* XXX */ rp = malloc(sizeof *rp, M_PCB, M_WAITOK | M_ZERO); if (rp == NULL) return ENOBUFS; so->so_pcb = (caddr_t)rp; so->so_fibnum = td->td_proc->p_fibnum; error = raw_attach(so, proto); rp = sotorawcb(so); if (error) { so->so_pcb = NULL; free(rp, M_PCB); return error; } RTSOCK_LOCK(); switch(rp->rcb_proto.sp_protocol) { case AF_INET: V_route_cb.ip_count++; break; case AF_INET6: V_route_cb.ip6_count++; break; } V_route_cb.any_count++; RTSOCK_UNLOCK(); soisconnected(so); so->so_options |= SO_USELOOPBACK; return 0; } static int rts_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { return (raw_usrreqs.pru_bind(so, nam, td)); /* xxx just EINVAL */ } static int rts_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { return (raw_usrreqs.pru_connect(so, nam, td)); /* XXX just EINVAL */ } /* pru_connect2 is EOPNOTSUPP */ /* pru_control is EOPNOTSUPP */ static void rts_detach(struct socket *so) { struct rawcb *rp = sotorawcb(so); KASSERT(rp != NULL, ("rts_detach: rp == NULL")); RTSOCK_LOCK(); switch(rp->rcb_proto.sp_protocol) { case AF_INET: V_route_cb.ip_count--; break; case AF_INET6: V_route_cb.ip6_count--; break; } V_route_cb.any_count--; RTSOCK_UNLOCK(); raw_usrreqs.pru_detach(so); } static int rts_disconnect(struct socket *so) { return (raw_usrreqs.pru_disconnect(so)); } /* pru_listen is EOPNOTSUPP */ static int rts_peeraddr(struct socket *so, struct sockaddr **nam) { return (raw_usrreqs.pru_peeraddr(so, nam)); } /* pru_rcvd is EOPNOTSUPP */ /* pru_rcvoob is EOPNOTSUPP */ static int rts_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, struct thread *td) { return (raw_usrreqs.pru_send(so, flags, m, nam, control, td)); } /* pru_sense is null */ static int rts_shutdown(struct socket *so) { return (raw_usrreqs.pru_shutdown(so)); } static int rts_sockaddr(struct socket *so, struct sockaddr **nam) { return (raw_usrreqs.pru_sockaddr(so, nam)); } static struct pr_usrreqs route_usrreqs = { .pru_abort = rts_abort, .pru_attach = rts_attach, .pru_bind = rts_bind, .pru_connect = rts_connect, .pru_detach = rts_detach, .pru_disconnect = rts_disconnect, .pru_peeraddr = rts_peeraddr, .pru_send = rts_send, .pru_shutdown = rts_shutdown, .pru_sockaddr = rts_sockaddr, .pru_close = rts_close, }; #ifndef _SOCKADDR_UNION_DEFINED #define _SOCKADDR_UNION_DEFINED /* * The union of all possible address formats we handle. */ union sockaddr_union { struct sockaddr sa; struct sockaddr_in sin; struct sockaddr_in6 sin6; }; #endif /* _SOCKADDR_UNION_DEFINED */ static int rtm_get_jailed(struct rt_addrinfo *info, struct ifnet *ifp, struct rtentry *rt, union sockaddr_union *saun, struct ucred *cred) { /* First, see if the returned address is part of the jail. */ if (prison_if(cred, rt->rt_ifa->ifa_addr) == 0) { info->rti_info[RTAX_IFA] = rt->rt_ifa->ifa_addr; return (0); } switch (info->rti_info[RTAX_DST]->sa_family) { #ifdef INET case AF_INET: { struct in_addr ia; struct ifaddr *ifa; int found; found = 0; /* * Try to find an address on the given outgoing interface * that belongs to the jail. */ IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { struct sockaddr *sa; sa = ifa->ifa_addr; if (sa->sa_family != AF_INET) continue; ia = ((struct sockaddr_in *)sa)->sin_addr; if (prison_check_ip4(cred, &ia) == 0) { found = 1; break; } } IF_ADDR_RUNLOCK(ifp); if (!found) { /* * As a last resort return the 'default' jail address. */ ia = ((struct sockaddr_in *)rt->rt_ifa->ifa_addr)-> sin_addr; if (prison_get_ip4(cred, &ia) != 0) return (ESRCH); } bzero(&saun->sin, sizeof(struct sockaddr_in)); saun->sin.sin_len = sizeof(struct sockaddr_in); saun->sin.sin_family = AF_INET; saun->sin.sin_addr.s_addr = ia.s_addr; info->rti_info[RTAX_IFA] = (struct sockaddr *)&saun->sin; break; } #endif #ifdef INET6 case AF_INET6: { struct in6_addr ia6; struct ifaddr *ifa; int found; found = 0; /* * Try to find an address on the given outgoing interface * that belongs to the jail. */ IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { struct sockaddr *sa; sa = ifa->ifa_addr; if (sa->sa_family != AF_INET6) continue; bcopy(&((struct sockaddr_in6 *)sa)->sin6_addr, &ia6, sizeof(struct in6_addr)); if (prison_check_ip6(cred, &ia6) == 0) { found = 1; break; } } IF_ADDR_RUNLOCK(ifp); if (!found) { /* * As a last resort return the 'default' jail address. */ ia6 = ((struct sockaddr_in6 *)rt->rt_ifa->ifa_addr)-> sin6_addr; if (prison_get_ip6(cred, &ia6) != 0) return (ESRCH); } bzero(&saun->sin6, sizeof(struct sockaddr_in6)); saun->sin6.sin6_len = sizeof(struct sockaddr_in6); saun->sin6.sin6_family = AF_INET6; bcopy(&ia6, &saun->sin6.sin6_addr, sizeof(struct in6_addr)); if (sa6_recoverscope(&saun->sin6) != 0) return (ESRCH); info->rti_info[RTAX_IFA] = (struct sockaddr *)&saun->sin6; break; } #endif default: return (ESRCH); } return (0); } /*ARGSUSED*/ static int route_output(struct mbuf *m, struct socket *so, ...) { struct rt_msghdr *rtm = NULL; struct rtentry *rt = NULL; struct rib_head *rh; struct rt_addrinfo info; struct sockaddr_storage ss; #ifdef INET6 struct sockaddr_in6 *sin6; int i, rti_need_deembed = 0; #endif int alloc_len = 0, len, error = 0, fibnum; struct ifnet *ifp = NULL; union sockaddr_union saun; sa_family_t saf = AF_UNSPEC; struct rawcb *rp = NULL; struct walkarg w; fibnum = so->so_fibnum; #define senderr(e) { error = e; goto flush;} if (m == NULL || ((m->m_len < sizeof(long)) && (m = m_pullup(m, sizeof(long))) == NULL)) return (ENOBUFS); if ((m->m_flags & M_PKTHDR) == 0) panic("route_output"); len = m->m_pkthdr.len; if (len < sizeof(*rtm) || len != mtod(m, struct rt_msghdr *)->rtm_msglen) senderr(EINVAL); /* * Most of current messages are in range 200-240 bytes, * minimize possible re-allocation on reply using larger size * buffer aligned on 1k boundaty. */ alloc_len = roundup2(len, 1024); if ((rtm = malloc(alloc_len, M_TEMP, M_NOWAIT)) == NULL) senderr(ENOBUFS); m_copydata(m, 0, len, (caddr_t)rtm); bzero(&info, sizeof(info)); bzero(&w, sizeof(w)); if (rtm->rtm_version != RTM_VERSION) { /* Do not touch message since format is unknown */ free(rtm, M_TEMP); rtm = NULL; senderr(EPROTONOSUPPORT); } /* * Starting from here, it is possible * to alter original message and insert * caller PID and error value. */ rtm->rtm_pid = curproc->p_pid; info.rti_addrs = rtm->rtm_addrs; info.rti_mflags = rtm->rtm_inits; info.rti_rmx = &rtm->rtm_rmx; /* * rt_xaddrs() performs s6_addr[2] := sin6_scope_id for AF_INET6 * link-local address because rtrequest requires addresses with * embedded scope id. */ if (rt_xaddrs((caddr_t)(rtm + 1), len + (caddr_t)rtm, &info)) senderr(EINVAL); info.rti_flags = rtm->rtm_flags; if (info.rti_info[RTAX_DST] == NULL || info.rti_info[RTAX_DST]->sa_family >= AF_MAX || (info.rti_info[RTAX_GATEWAY] != NULL && info.rti_info[RTAX_GATEWAY]->sa_family >= AF_MAX)) senderr(EINVAL); saf = info.rti_info[RTAX_DST]->sa_family; /* * Verify that the caller has the appropriate privilege; RTM_GET * is the only operation the non-superuser is allowed. */ if (rtm->rtm_type != RTM_GET) { error = priv_check(curthread, PRIV_NET_ROUTE); if (error) senderr(error); } /* * The given gateway address may be an interface address. * For example, issuing a "route change" command on a route * entry that was created from a tunnel, and the gateway * address given is the local end point. In this case the * RTF_GATEWAY flag must be cleared or the destination will * not be reachable even though there is no error message. */ if (info.rti_info[RTAX_GATEWAY] != NULL && info.rti_info[RTAX_GATEWAY]->sa_family != AF_LINK) { struct rtentry *rt; /* XXX-ME: Is this enough? */ struct sockaddr dst; bzero(&dst, sizeof(dst)); dst = *info.rti_info[RTAX_GATEWAY]; rt = rtalloc1_fib(&dst, 0, 0, fibnum); /* * A host route through the loopback interface is * installed for each interface adddress. In pre 8.0 * releases the interface address of a PPP link type * is not reachable locally. This behavior is fixed as * part of the new L2/L3 redesign and rewrite work. The * signature of this interface address route is the * AF_LINK sa_family type of the rt_gateway, and the * rt_ifp has the IFF_LOOPBACK flag set. */ if (rt != NULL && rt->rt_gateway->sa_family == AF_LINK && rt->rt_ifp->if_flags & IFF_LOOPBACK) { info.rti_flags &= ~RTF_GATEWAY; info.rti_flags |= RTF_GWFLAG_COMPAT; } if (rt != NULL) RTFREE_LOCKED(rt); } switch (rtm->rtm_type) { struct rtentry *saved_nrt; case RTM_ADD: case RTM_CHANGE: if (info.rti_info[RTAX_GATEWAY] == NULL) senderr(EINVAL); saved_nrt = NULL; /* support for new ARP code */ if (info.rti_info[RTAX_GATEWAY]->sa_family == AF_LINK && (rtm->rtm_flags & RTF_LLDATA) != 0) { error = lla_rt_output(rtm, &info); #ifdef INET6 if (error == 0) rti_need_deembed = (V_deembed_scopeid) ? 1 : 0; #endif break; } error = rtrequest1_fib(rtm->rtm_type, &info, &saved_nrt, fibnum); if (error == 0 && saved_nrt != NULL) { #ifdef INET6 rti_need_deembed = (V_deembed_scopeid) ? 1 : 0; #endif RT_LOCK(saved_nrt); rtm->rtm_index = saved_nrt->rt_ifp->if_index; RT_REMREF(saved_nrt); RT_UNLOCK(saved_nrt); } break; case RTM_DELETE: saved_nrt = NULL; /* support for new ARP code */ if (info.rti_info[RTAX_GATEWAY] && (info.rti_info[RTAX_GATEWAY]->sa_family == AF_LINK) && (rtm->rtm_flags & RTF_LLDATA) != 0) { error = lla_rt_output(rtm, &info); #ifdef INET6 if (error == 0) rti_need_deembed = (V_deembed_scopeid) ? 1 : 0; #endif break; } error = rtrequest1_fib(RTM_DELETE, &info, &saved_nrt, fibnum); if (error == 0) { RT_LOCK(saved_nrt); rt = saved_nrt; goto report; } #ifdef INET6 /* rt_msg2() will not be used when RTM_DELETE fails. */ rti_need_deembed = (V_deembed_scopeid) ? 1 : 0; #endif break; case RTM_GET: rh = rt_tables_get_rnh(fibnum, saf); if (rh == NULL) senderr(EAFNOSUPPORT); - RIB_RLOCK(rh); + RIB_CFG_RLOCK(rh); if (info.rti_info[RTAX_NETMASK] == NULL && rtm->rtm_type == RTM_GET) { /* * Provide logest prefix match for * address lookup (no mask). * 'route -n get addr' */ rt = (struct rtentry *) rh->rnh_matchaddr( info.rti_info[RTAX_DST], &rh->head); } else rt = (struct rtentry *) rh->rnh_lookup( info.rti_info[RTAX_DST], info.rti_info[RTAX_NETMASK], &rh->head); if (rt == NULL) { - RIB_RUNLOCK(rh); + RIB_CFG_RUNLOCK(rh); senderr(ESRCH); } #ifdef RADIX_MPATH /* * for RTM_CHANGE/LOCK, if we got multipath routes, * we require users to specify a matching RTAX_GATEWAY. * * for RTM_GET, gate is optional even with multipath. * if gate == NULL the first match is returned. * (no need to call rt_mpath_matchgate if gate == NULL) */ if (rn_mpath_capable(rh) && (rtm->rtm_type != RTM_GET || info.rti_info[RTAX_GATEWAY])) { rt = rt_mpath_matchgate(rt, info.rti_info[RTAX_GATEWAY]); if (!rt) { - RIB_RUNLOCK(rh); + RIB_CFG_RUNLOCK(rh); senderr(ESRCH); } } #endif /* * If performing proxied L2 entry insertion, and * the actual PPP host entry is found, perform * another search to retrieve the prefix route of * the local end point of the PPP link. */ if (rtm->rtm_flags & RTF_ANNOUNCE) { struct sockaddr laddr; if (rt->rt_ifp != NULL && rt->rt_ifp->if_type == IFT_PROPVIRTUAL) { struct ifaddr *ifa; ifa = ifa_ifwithnet(info.rti_info[RTAX_DST], 1, RT_ALL_FIBS); if (ifa != NULL) rt_maskedcopy(ifa->ifa_addr, &laddr, ifa->ifa_netmask); } else rt_maskedcopy(rt->rt_ifa->ifa_addr, &laddr, rt->rt_ifa->ifa_netmask); /* * refactor rt and no lock operation necessary */ rt = (struct rtentry *)rh->rnh_matchaddr(&laddr, &rh->head); if (rt == NULL) { - RIB_RUNLOCK(rh); + RIB_CFG_RUNLOCK(rh); senderr(ESRCH); } } RT_LOCK(rt); RT_ADDREF(rt); - RIB_RUNLOCK(rh); + RIB_CFG_RUNLOCK(rh); report: RT_LOCK_ASSERT(rt); if ((rt->rt_flags & RTF_HOST) == 0 ? jailed_without_vnet(curthread->td_ucred) : prison_if(curthread->td_ucred, rt_key(rt)) != 0) { RT_UNLOCK(rt); senderr(ESRCH); } info.rti_info[RTAX_DST] = rt_key(rt); info.rti_info[RTAX_GATEWAY] = rt->rt_gateway; info.rti_info[RTAX_NETMASK] = rtsock_fix_netmask(rt_key(rt), rt_mask(rt), &ss); info.rti_info[RTAX_GENMASK] = 0; if (rtm->rtm_addrs & (RTA_IFP | RTA_IFA)) { ifp = rt->rt_ifp; if (ifp) { info.rti_info[RTAX_IFP] = ifp->if_addr->ifa_addr; error = rtm_get_jailed(&info, ifp, rt, &saun, curthread->td_ucred); if (error != 0) { RT_UNLOCK(rt); senderr(error); } if (ifp->if_flags & IFF_POINTOPOINT) info.rti_info[RTAX_BRD] = rt->rt_ifa->ifa_dstaddr; rtm->rtm_index = ifp->if_index; } else { info.rti_info[RTAX_IFP] = NULL; info.rti_info[RTAX_IFA] = NULL; } } else if ((ifp = rt->rt_ifp) != NULL) { rtm->rtm_index = ifp->if_index; } /* Check if we need to realloc storage */ rtsock_msg_buffer(rtm->rtm_type, &info, NULL, &len); if (len > alloc_len) { struct rt_msghdr *new_rtm; new_rtm = malloc(len, M_TEMP, M_NOWAIT); if (new_rtm == NULL) { RT_UNLOCK(rt); senderr(ENOBUFS); } bcopy(rtm, new_rtm, rtm->rtm_msglen); free(rtm, M_TEMP); rtm = new_rtm; alloc_len = len; } w.w_tmem = (caddr_t)rtm; w.w_tmemsize = alloc_len; rtsock_msg_buffer(rtm->rtm_type, &info, &w, &len); if (rt->rt_flags & RTF_GWFLAG_COMPAT) rtm->rtm_flags = RTF_GATEWAY | (rt->rt_flags & ~RTF_GWFLAG_COMPAT); else rtm->rtm_flags = rt->rt_flags; rt_getmetrics(rt, &rtm->rtm_rmx); rtm->rtm_addrs = info.rti_addrs; RT_UNLOCK(rt); break; default: senderr(EOPNOTSUPP); } flush: if (rt != NULL) RTFREE(rt); /* * Check to see if we don't want our own messages. */ if ((so->so_options & SO_USELOOPBACK) == 0) { if (V_route_cb.any_count <= 1) { if (rtm != NULL) free(rtm, M_TEMP); m_freem(m); return (error); } /* There is another listener, so construct message */ rp = sotorawcb(so); } if (rtm != NULL) { #ifdef INET6 if (rti_need_deembed) { /* sin6_scope_id is recovered before sending rtm. */ sin6 = (struct sockaddr_in6 *)&ss; for (i = 0; i < RTAX_MAX; i++) { if (info.rti_info[i] == NULL) continue; if (info.rti_info[i]->sa_family != AF_INET6) continue; bcopy(info.rti_info[i], sin6, sizeof(*sin6)); if (sa6_recoverscope(sin6) == 0) bcopy(sin6, info.rti_info[i], sizeof(*sin6)); } } #endif if (error != 0) rtm->rtm_errno = error; else rtm->rtm_flags |= RTF_DONE; m_copyback(m, 0, rtm->rtm_msglen, (caddr_t)rtm); if (m->m_pkthdr.len < rtm->rtm_msglen) { m_freem(m); m = NULL; } else if (m->m_pkthdr.len > rtm->rtm_msglen) m_adj(m, rtm->rtm_msglen - m->m_pkthdr.len); free(rtm, M_TEMP); } if (m != NULL) { M_SETFIB(m, fibnum); m->m_flags |= RTS_FILTER_FIB; if (rp) { /* * XXX insure we don't get a copy by * invalidating our protocol */ unsigned short family = rp->rcb_proto.sp_family; rp->rcb_proto.sp_family = 0; rt_dispatch(m, saf); rp->rcb_proto.sp_family = family; } else rt_dispatch(m, saf); } return (error); } static void rt_getmetrics(const struct rtentry *rt, struct rt_metrics *out) { bzero(out, sizeof(*out)); out->rmx_mtu = rt->rt_mtu; out->rmx_weight = rt->rt_weight; /* Kernel -> userland timebase conversion. */ out->rmx_expire = rt->rt_expire ? rt->rt_expire - time_uptime + time_second : 0; } /* * Extract the addresses of the passed sockaddrs. * Do a little sanity checking so as to avoid bad memory references. * This data is derived straight from userland. */ static int rt_xaddrs(caddr_t cp, caddr_t cplim, struct rt_addrinfo *rtinfo) { struct sockaddr *sa; int i; for (i = 0; i < RTAX_MAX && cp < cplim; i++) { if ((rtinfo->rti_addrs & (1 << i)) == 0) continue; sa = (struct sockaddr *)cp; /* * It won't fit. */ if (cp + sa->sa_len > cplim) return (EINVAL); /* * there are no more.. quit now * If there are more bits, they are in error. * I've seen this. route(1) can evidently generate these. * This causes kernel to core dump. * for compatibility, If we see this, point to a safe address. */ if (sa->sa_len == 0) { rtinfo->rti_info[i] = &sa_zero; return (0); /* should be EINVAL but for compat */ } /* accept it */ #ifdef INET6 if (sa->sa_family == AF_INET6) sa6_embedscope((struct sockaddr_in6 *)sa, V_ip6_use_defzone); #endif rtinfo->rti_info[i] = sa; cp += SA_SIZE(sa); } return (0); } /* * Fill in @dmask with valid netmask leaving original @smask * intact. Mostly used with radix netmasks. */ static struct sockaddr * rtsock_fix_netmask(struct sockaddr *dst, struct sockaddr *smask, struct sockaddr_storage *dmask) { if (dst == NULL || smask == NULL) return (NULL); memset(dmask, 0, dst->sa_len); memcpy(dmask, smask, smask->sa_len); dmask->ss_len = dst->sa_len; dmask->ss_family = dst->sa_family; return ((struct sockaddr *)dmask); } /* * Writes information related to @rtinfo object to newly-allocated mbuf. * Assumes MCLBYTES is enough to construct any message. * Used for OS notifications of vaious events (if/ifa announces,etc) * * Returns allocated mbuf or NULL on failure. */ static struct mbuf * rtsock_msg_mbuf(int type, struct rt_addrinfo *rtinfo) { struct rt_msghdr *rtm; struct mbuf *m; int i; struct sockaddr *sa; #ifdef INET6 struct sockaddr_storage ss; struct sockaddr_in6 *sin6; #endif int len, dlen; switch (type) { case RTM_DELADDR: case RTM_NEWADDR: len = sizeof(struct ifa_msghdr); break; case RTM_DELMADDR: case RTM_NEWMADDR: len = sizeof(struct ifma_msghdr); break; case RTM_IFINFO: len = sizeof(struct if_msghdr); break; case RTM_IFANNOUNCE: case RTM_IEEE80211: len = sizeof(struct if_announcemsghdr); break; default: len = sizeof(struct rt_msghdr); } /* XXXGL: can we use MJUMPAGESIZE cluster here? */ KASSERT(len <= MCLBYTES, ("%s: message too big", __func__)); if (len > MHLEN) m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); else m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) return (m); m->m_pkthdr.len = m->m_len = len; rtm = mtod(m, struct rt_msghdr *); bzero((caddr_t)rtm, len); for (i = 0; i < RTAX_MAX; i++) { if ((sa = rtinfo->rti_info[i]) == NULL) continue; rtinfo->rti_addrs |= (1 << i); dlen = SA_SIZE(sa); #ifdef INET6 if (V_deembed_scopeid && sa->sa_family == AF_INET6) { sin6 = (struct sockaddr_in6 *)&ss; bcopy(sa, sin6, sizeof(*sin6)); if (sa6_recoverscope(sin6) == 0) sa = (struct sockaddr *)sin6; } #endif m_copyback(m, len, dlen, (caddr_t)sa); len += dlen; } if (m->m_pkthdr.len != len) { m_freem(m); return (NULL); } rtm->rtm_msglen = len; rtm->rtm_version = RTM_VERSION; rtm->rtm_type = type; return (m); } /* * Writes information related to @rtinfo object to preallocated buffer. * Stores needed size in @plen. If @w is NULL, calculates size without * writing. * Used for sysctl dumps and rtsock answers (RTM_DEL/RTM_GET) generation. * * Returns 0 on success. * */ static int rtsock_msg_buffer(int type, struct rt_addrinfo *rtinfo, struct walkarg *w, int *plen) { int i; int len, buflen = 0, dlen; caddr_t cp = NULL; struct rt_msghdr *rtm = NULL; #ifdef INET6 struct sockaddr_storage ss; struct sockaddr_in6 *sin6; #endif switch (type) { case RTM_DELADDR: case RTM_NEWADDR: if (w != NULL && w->w_op == NET_RT_IFLISTL) { #ifdef COMPAT_FREEBSD32 if (w->w_req->flags & SCTL_MASK32) len = sizeof(struct ifa_msghdrl32); else #endif len = sizeof(struct ifa_msghdrl); } else len = sizeof(struct ifa_msghdr); break; case RTM_IFINFO: #ifdef COMPAT_FREEBSD32 if (w != NULL && w->w_req->flags & SCTL_MASK32) { if (w->w_op == NET_RT_IFLISTL) len = sizeof(struct if_msghdrl32); else len = sizeof(struct if_msghdr32); break; } #endif if (w != NULL && w->w_op == NET_RT_IFLISTL) len = sizeof(struct if_msghdrl); else len = sizeof(struct if_msghdr); break; case RTM_NEWMADDR: len = sizeof(struct ifma_msghdr); break; default: len = sizeof(struct rt_msghdr); } if (w != NULL) { rtm = (struct rt_msghdr *)w->w_tmem; buflen = w->w_tmemsize - len; cp = (caddr_t)w->w_tmem + len; } rtinfo->rti_addrs = 0; for (i = 0; i < RTAX_MAX; i++) { struct sockaddr *sa; if ((sa = rtinfo->rti_info[i]) == NULL) continue; rtinfo->rti_addrs |= (1 << i); dlen = SA_SIZE(sa); if (cp != NULL && buflen >= dlen) { #ifdef INET6 if (V_deembed_scopeid && sa->sa_family == AF_INET6) { sin6 = (struct sockaddr_in6 *)&ss; bcopy(sa, sin6, sizeof(*sin6)); if (sa6_recoverscope(sin6) == 0) sa = (struct sockaddr *)sin6; } #endif bcopy((caddr_t)sa, cp, (unsigned)dlen); cp += dlen; buflen -= dlen; } else if (cp != NULL) { /* * Buffer too small. Count needed size * and return with error. */ cp = NULL; } len += dlen; } if (cp != NULL) { dlen = ALIGN(len) - len; if (buflen < dlen) cp = NULL; else buflen -= dlen; } len = ALIGN(len); if (cp != NULL) { /* fill header iff buffer is large enough */ rtm->rtm_version = RTM_VERSION; rtm->rtm_type = type; rtm->rtm_msglen = len; } *plen = len; if (w != NULL && cp == NULL) return (ENOBUFS); return (0); } /* * This routine is called to generate a message from the routing * socket indicating that a redirect has occured, a routing lookup * has failed, or that a protocol has detected timeouts to a particular * destination. */ void rt_missmsg_fib(int type, struct rt_addrinfo *rtinfo, int flags, int error, int fibnum) { struct rt_msghdr *rtm; struct mbuf *m; struct sockaddr *sa = rtinfo->rti_info[RTAX_DST]; if (V_route_cb.any_count == 0) return; m = rtsock_msg_mbuf(type, rtinfo); if (m == NULL) return; if (fibnum != RT_ALL_FIBS) { KASSERT(fibnum >= 0 && fibnum < rt_numfibs, ("%s: fibnum out " "of range 0 <= %d < %d", __func__, fibnum, rt_numfibs)); M_SETFIB(m, fibnum); m->m_flags |= RTS_FILTER_FIB; } rtm = mtod(m, struct rt_msghdr *); rtm->rtm_flags = RTF_DONE | flags; rtm->rtm_errno = error; rtm->rtm_addrs = rtinfo->rti_addrs; rt_dispatch(m, sa ? sa->sa_family : AF_UNSPEC); } void rt_missmsg(int type, struct rt_addrinfo *rtinfo, int flags, int error) { rt_missmsg_fib(type, rtinfo, flags, error, RT_ALL_FIBS); } /* * This routine is called to generate a message from the routing * socket indicating that the status of a network interface has changed. */ void rt_ifmsg(struct ifnet *ifp) { struct if_msghdr *ifm; struct mbuf *m; struct rt_addrinfo info; if (V_route_cb.any_count == 0) return; bzero((caddr_t)&info, sizeof(info)); m = rtsock_msg_mbuf(RTM_IFINFO, &info); if (m == NULL) return; ifm = mtod(m, struct if_msghdr *); ifm->ifm_index = ifp->if_index; ifm->ifm_flags = ifp->if_flags | ifp->if_drv_flags; if_data_copy(ifp, &ifm->ifm_data); ifm->ifm_addrs = 0; rt_dispatch(m, AF_UNSPEC); } /* * Announce interface address arrival/withdraw. * Please do not call directly, use rt_addrmsg(). * Assume input data to be valid. * Returns 0 on success. */ int rtsock_addrmsg(int cmd, struct ifaddr *ifa, int fibnum) { struct rt_addrinfo info; struct sockaddr *sa; int ncmd; struct mbuf *m; struct ifa_msghdr *ifam; struct ifnet *ifp = ifa->ifa_ifp; struct sockaddr_storage ss; if (V_route_cb.any_count == 0) return (0); ncmd = cmd == RTM_ADD ? RTM_NEWADDR : RTM_DELADDR; bzero((caddr_t)&info, sizeof(info)); info.rti_info[RTAX_IFA] = sa = ifa->ifa_addr; info.rti_info[RTAX_IFP] = ifp->if_addr->ifa_addr; info.rti_info[RTAX_NETMASK] = rtsock_fix_netmask( info.rti_info[RTAX_IFP], ifa->ifa_netmask, &ss); info.rti_info[RTAX_BRD] = ifa->ifa_dstaddr; if ((m = rtsock_msg_mbuf(ncmd, &info)) == NULL) return (ENOBUFS); ifam = mtod(m, struct ifa_msghdr *); ifam->ifam_index = ifp->if_index; ifam->ifam_metric = ifa->ifa_ifp->if_metric; ifam->ifam_flags = ifa->ifa_flags; ifam->ifam_addrs = info.rti_addrs; if (fibnum != RT_ALL_FIBS) { M_SETFIB(m, fibnum); m->m_flags |= RTS_FILTER_FIB; } rt_dispatch(m, sa ? sa->sa_family : AF_UNSPEC); return (0); } /* * Announce route addition/removal. * Please do not call directly, use rt_routemsg(). * Note that @rt data MAY be inconsistent/invalid: * if some userland app sends us "invalid" route message (invalid mask, * no dst, wrong address families, etc...) we need to pass it back * to app (and any other rtsock consumers) with rtm_errno field set to * non-zero value. * * Returns 0 on success. */ int rtsock_routemsg(int cmd, struct ifnet *ifp, int error, struct rtentry *rt, int fibnum) { struct rt_addrinfo info; struct sockaddr *sa; struct mbuf *m; struct rt_msghdr *rtm; struct sockaddr_storage ss; if (V_route_cb.any_count == 0) return (0); bzero((caddr_t)&info, sizeof(info)); info.rti_info[RTAX_DST] = sa = rt_key(rt); info.rti_info[RTAX_NETMASK] = rtsock_fix_netmask(sa, rt_mask(rt), &ss); info.rti_info[RTAX_GATEWAY] = rt->rt_gateway; if ((m = rtsock_msg_mbuf(cmd, &info)) == NULL) return (ENOBUFS); rtm = mtod(m, struct rt_msghdr *); rtm->rtm_index = ifp->if_index; rtm->rtm_flags |= rt->rt_flags; rtm->rtm_errno = error; rtm->rtm_addrs = info.rti_addrs; if (fibnum != RT_ALL_FIBS) { M_SETFIB(m, fibnum); m->m_flags |= RTS_FILTER_FIB; } rt_dispatch(m, sa ? sa->sa_family : AF_UNSPEC); return (0); } /* * This is the analogue to the rt_newaddrmsg which performs the same * function but for multicast group memberhips. This is easier since * there is no route state to worry about. */ void rt_newmaddrmsg(int cmd, struct ifmultiaddr *ifma) { struct rt_addrinfo info; struct mbuf *m = NULL; struct ifnet *ifp = ifma->ifma_ifp; struct ifma_msghdr *ifmam; if (V_route_cb.any_count == 0) return; bzero((caddr_t)&info, sizeof(info)); info.rti_info[RTAX_IFA] = ifma->ifma_addr; info.rti_info[RTAX_IFP] = ifp ? ifp->if_addr->ifa_addr : NULL; /* * If a link-layer address is present, present it as a ``gateway'' * (similarly to how ARP entries, e.g., are presented). */ info.rti_info[RTAX_GATEWAY] = ifma->ifma_lladdr; m = rtsock_msg_mbuf(cmd, &info); if (m == NULL) return; ifmam = mtod(m, struct ifma_msghdr *); KASSERT(ifp != NULL, ("%s: link-layer multicast address w/o ifp\n", __func__)); ifmam->ifmam_index = ifp->if_index; ifmam->ifmam_addrs = info.rti_addrs; rt_dispatch(m, ifma->ifma_addr ? ifma->ifma_addr->sa_family : AF_UNSPEC); } static struct mbuf * rt_makeifannouncemsg(struct ifnet *ifp, int type, int what, struct rt_addrinfo *info) { struct if_announcemsghdr *ifan; struct mbuf *m; if (V_route_cb.any_count == 0) return NULL; bzero((caddr_t)info, sizeof(*info)); m = rtsock_msg_mbuf(type, info); if (m != NULL) { ifan = mtod(m, struct if_announcemsghdr *); ifan->ifan_index = ifp->if_index; strlcpy(ifan->ifan_name, ifp->if_xname, sizeof(ifan->ifan_name)); ifan->ifan_what = what; } return m; } /* * This is called to generate routing socket messages indicating * IEEE80211 wireless events. * XXX we piggyback on the RTM_IFANNOUNCE msg format in a clumsy way. */ void rt_ieee80211msg(struct ifnet *ifp, int what, void *data, size_t data_len) { struct mbuf *m; struct rt_addrinfo info; m = rt_makeifannouncemsg(ifp, RTM_IEEE80211, what, &info); if (m != NULL) { /* * Append the ieee80211 data. Try to stick it in the * mbuf containing the ifannounce msg; otherwise allocate * a new mbuf and append. * * NB: we assume m is a single mbuf. */ if (data_len > M_TRAILINGSPACE(m)) { struct mbuf *n = m_get(M_NOWAIT, MT_DATA); if (n == NULL) { m_freem(m); return; } bcopy(data, mtod(n, void *), data_len); n->m_len = data_len; m->m_next = n; } else if (data_len > 0) { bcopy(data, mtod(m, u_int8_t *) + m->m_len, data_len); m->m_len += data_len; } if (m->m_flags & M_PKTHDR) m->m_pkthdr.len += data_len; mtod(m, struct if_announcemsghdr *)->ifan_msglen += data_len; rt_dispatch(m, AF_UNSPEC); } } /* * This is called to generate routing socket messages indicating * network interface arrival and departure. */ void rt_ifannouncemsg(struct ifnet *ifp, int what) { struct mbuf *m; struct rt_addrinfo info; m = rt_makeifannouncemsg(ifp, RTM_IFANNOUNCE, what, &info); if (m != NULL) rt_dispatch(m, AF_UNSPEC); } static void rt_dispatch(struct mbuf *m, sa_family_t saf) { struct m_tag *tag; /* * Preserve the family from the sockaddr, if any, in an m_tag for * use when injecting the mbuf into the routing socket buffer from * the netisr. */ if (saf != AF_UNSPEC) { tag = m_tag_get(PACKET_TAG_RTSOCKFAM, sizeof(unsigned short), M_NOWAIT); if (tag == NULL) { m_freem(m); return; } *(unsigned short *)(tag + 1) = saf; m_tag_prepend(m, tag); } #ifdef VIMAGE if (V_loif) m->m_pkthdr.rcvif = V_loif; else { m_freem(m); return; } #endif netisr_queue(NETISR_ROUTE, m); /* mbuf is free'd on failure. */ } /* * This is used in dumping the kernel table via sysctl(). */ static int sysctl_dumpentry(struct radix_node *rn, void *vw) { struct walkarg *w = vw; struct rtentry *rt = (struct rtentry *)rn; int error = 0, size; struct rt_addrinfo info; struct sockaddr_storage ss; if (w->w_op == NET_RT_FLAGS && !(rt->rt_flags & w->w_arg)) return 0; if ((rt->rt_flags & RTF_HOST) == 0 ? jailed_without_vnet(w->w_req->td->td_ucred) : prison_if(w->w_req->td->td_ucred, rt_key(rt)) != 0) return (0); bzero((caddr_t)&info, sizeof(info)); info.rti_info[RTAX_DST] = rt_key(rt); info.rti_info[RTAX_GATEWAY] = rt->rt_gateway; info.rti_info[RTAX_NETMASK] = rtsock_fix_netmask(rt_key(rt), rt_mask(rt), &ss); info.rti_info[RTAX_GENMASK] = 0; if (rt->rt_ifp) { info.rti_info[RTAX_IFP] = rt->rt_ifp->if_addr->ifa_addr; info.rti_info[RTAX_IFA] = rt->rt_ifa->ifa_addr; if (rt->rt_ifp->if_flags & IFF_POINTOPOINT) info.rti_info[RTAX_BRD] = rt->rt_ifa->ifa_dstaddr; } if ((error = rtsock_msg_buffer(RTM_GET, &info, w, &size)) != 0) return (error); if (w->w_req && w->w_tmem) { struct rt_msghdr *rtm = (struct rt_msghdr *)w->w_tmem; if (rt->rt_flags & RTF_GWFLAG_COMPAT) rtm->rtm_flags = RTF_GATEWAY | (rt->rt_flags & ~RTF_GWFLAG_COMPAT); else rtm->rtm_flags = rt->rt_flags; rt_getmetrics(rt, &rtm->rtm_rmx); rtm->rtm_index = rt->rt_ifp->if_index; rtm->rtm_errno = rtm->rtm_pid = rtm->rtm_seq = 0; rtm->rtm_addrs = info.rti_addrs; error = SYSCTL_OUT(w->w_req, (caddr_t)rtm, size); return (error); } return (error); } static int sysctl_iflist_ifml(struct ifnet *ifp, struct rt_addrinfo *info, struct walkarg *w, int len) { struct if_msghdrl *ifm; struct if_data *ifd; ifm = (struct if_msghdrl *)w->w_tmem; #ifdef COMPAT_FREEBSD32 if (w->w_req->flags & SCTL_MASK32) { struct if_msghdrl32 *ifm32; ifm32 = (struct if_msghdrl32 *)ifm; ifm32->ifm_addrs = info->rti_addrs; ifm32->ifm_flags = ifp->if_flags | ifp->if_drv_flags; ifm32->ifm_index = ifp->if_index; ifm32->_ifm_spare1 = 0; ifm32->ifm_len = sizeof(*ifm32); ifm32->ifm_data_off = offsetof(struct if_msghdrl32, ifm_data); ifd = &ifm32->ifm_data; } else #endif { ifm->ifm_addrs = info->rti_addrs; ifm->ifm_flags = ifp->if_flags | ifp->if_drv_flags; ifm->ifm_index = ifp->if_index; ifm->_ifm_spare1 = 0; ifm->ifm_len = sizeof(*ifm); ifm->ifm_data_off = offsetof(struct if_msghdrl, ifm_data); ifd = &ifm->ifm_data; } if_data_copy(ifp, ifd); return (SYSCTL_OUT(w->w_req, (caddr_t)ifm, len)); } static int sysctl_iflist_ifm(struct ifnet *ifp, struct rt_addrinfo *info, struct walkarg *w, int len) { struct if_msghdr *ifm; struct if_data *ifd; ifm = (struct if_msghdr *)w->w_tmem; #ifdef COMPAT_FREEBSD32 if (w->w_req->flags & SCTL_MASK32) { struct if_msghdr32 *ifm32; ifm32 = (struct if_msghdr32 *)ifm; ifm32->ifm_addrs = info->rti_addrs; ifm32->ifm_flags = ifp->if_flags | ifp->if_drv_flags; ifm32->ifm_index = ifp->if_index; ifd = &ifm32->ifm_data; } else #endif { ifm->ifm_addrs = info->rti_addrs; ifm->ifm_flags = ifp->if_flags | ifp->if_drv_flags; ifm->ifm_index = ifp->if_index; ifd = &ifm->ifm_data; } if_data_copy(ifp, ifd); return (SYSCTL_OUT(w->w_req, (caddr_t)ifm, len)); } static int sysctl_iflist_ifaml(struct ifaddr *ifa, struct rt_addrinfo *info, struct walkarg *w, int len) { struct ifa_msghdrl *ifam; struct if_data *ifd; ifam = (struct ifa_msghdrl *)w->w_tmem; #ifdef COMPAT_FREEBSD32 if (w->w_req->flags & SCTL_MASK32) { struct ifa_msghdrl32 *ifam32; ifam32 = (struct ifa_msghdrl32 *)ifam; ifam32->ifam_addrs = info->rti_addrs; ifam32->ifam_flags = ifa->ifa_flags; ifam32->ifam_index = ifa->ifa_ifp->if_index; ifam32->_ifam_spare1 = 0; ifam32->ifam_len = sizeof(*ifam32); ifam32->ifam_data_off = offsetof(struct ifa_msghdrl32, ifam_data); ifam32->ifam_metric = ifa->ifa_ifp->if_metric; ifd = &ifam32->ifam_data; } else #endif { ifam->ifam_addrs = info->rti_addrs; ifam->ifam_flags = ifa->ifa_flags; ifam->ifam_index = ifa->ifa_ifp->if_index; ifam->_ifam_spare1 = 0; ifam->ifam_len = sizeof(*ifam); ifam->ifam_data_off = offsetof(struct ifa_msghdrl, ifam_data); ifam->ifam_metric = ifa->ifa_ifp->if_metric; ifd = &ifam->ifam_data; } bzero(ifd, sizeof(*ifd)); ifd->ifi_datalen = sizeof(struct if_data); ifd->ifi_ipackets = counter_u64_fetch(ifa->ifa_ipackets); ifd->ifi_opackets = counter_u64_fetch(ifa->ifa_opackets); ifd->ifi_ibytes = counter_u64_fetch(ifa->ifa_ibytes); ifd->ifi_obytes = counter_u64_fetch(ifa->ifa_obytes); /* Fixup if_data carp(4) vhid. */ if (carp_get_vhid_p != NULL) ifd->ifi_vhid = (*carp_get_vhid_p)(ifa); return (SYSCTL_OUT(w->w_req, w->w_tmem, len)); } static int sysctl_iflist_ifam(struct ifaddr *ifa, struct rt_addrinfo *info, struct walkarg *w, int len) { struct ifa_msghdr *ifam; ifam = (struct ifa_msghdr *)w->w_tmem; ifam->ifam_addrs = info->rti_addrs; ifam->ifam_flags = ifa->ifa_flags; ifam->ifam_index = ifa->ifa_ifp->if_index; ifam->ifam_metric = ifa->ifa_ifp->if_metric; return (SYSCTL_OUT(w->w_req, w->w_tmem, len)); } static int sysctl_iflist(int af, struct walkarg *w) { struct ifnet *ifp; struct ifaddr *ifa; struct rt_addrinfo info; int len, error = 0; struct sockaddr_storage ss; bzero((caddr_t)&info, sizeof(info)); IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (w->w_arg && w->w_arg != ifp->if_index) continue; IF_ADDR_RLOCK(ifp); ifa = ifp->if_addr; info.rti_info[RTAX_IFP] = ifa->ifa_addr; error = rtsock_msg_buffer(RTM_IFINFO, &info, w, &len); if (error != 0) goto done; info.rti_info[RTAX_IFP] = NULL; if (w->w_req && w->w_tmem) { if (w->w_op == NET_RT_IFLISTL) error = sysctl_iflist_ifml(ifp, &info, w, len); else error = sysctl_iflist_ifm(ifp, &info, w, len); if (error) goto done; } while ((ifa = TAILQ_NEXT(ifa, ifa_link)) != NULL) { if (af && af != ifa->ifa_addr->sa_family) continue; if (prison_if(w->w_req->td->td_ucred, ifa->ifa_addr) != 0) continue; info.rti_info[RTAX_IFA] = ifa->ifa_addr; info.rti_info[RTAX_NETMASK] = rtsock_fix_netmask( ifa->ifa_addr, ifa->ifa_netmask, &ss); info.rti_info[RTAX_BRD] = ifa->ifa_dstaddr; error = rtsock_msg_buffer(RTM_NEWADDR, &info, w, &len); if (error != 0) goto done; if (w->w_req && w->w_tmem) { if (w->w_op == NET_RT_IFLISTL) error = sysctl_iflist_ifaml(ifa, &info, w, len); else error = sysctl_iflist_ifam(ifa, &info, w, len); if (error) goto done; } } IF_ADDR_RUNLOCK(ifp); info.rti_info[RTAX_IFA] = NULL; info.rti_info[RTAX_NETMASK] = NULL; info.rti_info[RTAX_BRD] = NULL; } done: if (ifp != NULL) IF_ADDR_RUNLOCK(ifp); IFNET_RUNLOCK_NOSLEEP(); return (error); } static int sysctl_ifmalist(int af, struct walkarg *w) { struct ifnet *ifp; struct ifmultiaddr *ifma; struct rt_addrinfo info; int len, error = 0; struct ifaddr *ifa; bzero((caddr_t)&info, sizeof(info)); IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (w->w_arg && w->w_arg != ifp->if_index) continue; ifa = ifp->if_addr; info.rti_info[RTAX_IFP] = ifa ? ifa->ifa_addr : NULL; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (af && af != ifma->ifma_addr->sa_family) continue; if (prison_if(w->w_req->td->td_ucred, ifma->ifma_addr) != 0) continue; info.rti_info[RTAX_IFA] = ifma->ifma_addr; info.rti_info[RTAX_GATEWAY] = (ifma->ifma_addr->sa_family != AF_LINK) ? ifma->ifma_lladdr : NULL; error = rtsock_msg_buffer(RTM_NEWMADDR, &info, w, &len); if (error != 0) goto done; if (w->w_req && w->w_tmem) { struct ifma_msghdr *ifmam; ifmam = (struct ifma_msghdr *)w->w_tmem; ifmam->ifmam_index = ifma->ifma_ifp->if_index; ifmam->ifmam_flags = 0; ifmam->ifmam_addrs = info.rti_addrs; error = SYSCTL_OUT(w->w_req, w->w_tmem, len); if (error) { IF_ADDR_RUNLOCK(ifp); goto done; } } } IF_ADDR_RUNLOCK(ifp); } done: IFNET_RUNLOCK_NOSLEEP(); return (error); } static int sysctl_rtsock(SYSCTL_HANDLER_ARGS) { int *name = (int *)arg1; u_int namelen = arg2; struct rib_head *rh = NULL; /* silence compiler. */ int i, lim, error = EINVAL; int fib = 0; u_char af; struct walkarg w; name ++; namelen--; if (req->newptr) return (EPERM); if (name[1] == NET_RT_DUMP) { if (namelen == 3) fib = req->td->td_proc->p_fibnum; else if (namelen == 4) fib = (name[3] == RT_ALL_FIBS) ? req->td->td_proc->p_fibnum : name[3]; else return ((namelen < 3) ? EISDIR : ENOTDIR); if (fib < 0 || fib >= rt_numfibs) return (EINVAL); } else if (namelen != 3) return ((namelen < 3) ? EISDIR : ENOTDIR); af = name[0]; if (af > AF_MAX) return (EINVAL); bzero(&w, sizeof(w)); w.w_op = name[1]; w.w_arg = name[2]; w.w_req = req; error = sysctl_wire_old_buffer(req, 0); if (error) return (error); /* * Allocate reply buffer in advance. * All rtsock messages has maximum length of u_short. */ w.w_tmemsize = 65536; w.w_tmem = malloc(w.w_tmemsize, M_TEMP, M_WAITOK); switch (w.w_op) { case NET_RT_DUMP: case NET_RT_FLAGS: if (af == 0) { /* dump all tables */ i = 1; lim = AF_MAX; } else /* dump only one table */ i = lim = af; /* * take care of llinfo entries, the caller must * specify an AF */ if (w.w_op == NET_RT_FLAGS && (w.w_arg == 0 || w.w_arg & RTF_LLINFO)) { if (af != 0) error = lltable_sysctl_dumparp(af, w.w_req); else error = EINVAL; break; } /* * take care of routing entries */ for (error = 0; error == 0 && i <= lim; i++) { rh = rt_tables_get_rnh(fib, i); if (rh != NULL) { - RIB_RLOCK(rh); + RIB_CFG_RLOCK(rh); error = rh->rnh_walktree(&rh->head, sysctl_dumpentry, &w); - RIB_RUNLOCK(rh); + RIB_CFG_RUNLOCK(rh); } else if (af != 0) error = EAFNOSUPPORT; } break; case NET_RT_IFLIST: case NET_RT_IFLISTL: error = sysctl_iflist(af, &w); break; case NET_RT_IFMALIST: error = sysctl_ifmalist(af, &w); break; } free(w.w_tmem, M_TEMP); return (error); } static SYSCTL_NODE(_net, PF_ROUTE, routetable, CTLFLAG_RD, sysctl_rtsock, ""); /* * Definitions of protocols supported in the ROUTE domain. */ static struct domain routedomain; /* or at least forward */ static struct protosw routesw[] = { { .pr_type = SOCK_RAW, .pr_domain = &routedomain, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_output = route_output, .pr_ctlinput = raw_ctlinput, .pr_init = raw_init, .pr_usrreqs = &route_usrreqs } }; static struct domain routedomain = { .dom_family = PF_ROUTE, .dom_name = "route", .dom_protosw = routesw, .dom_protoswNPROTOSW = &routesw[sizeof(routesw)/sizeof(routesw[0])] }; VNET_DOMAIN_SET(route); Index: projects/routing/sys/netinet/in_rmx.c =================================================================== --- projects/routing/sys/netinet/in_rmx.c (revision 274335) +++ projects/routing/sys/netinet/in_rmx.c (revision 274336) @@ -1,434 +1,439 @@ /*- * Copyright 1994, 1995 Massachusetts Institute of Technology * * Permission to use, copy, modify, and distribute this software and * its documentation for any purpose and without fee is hereby * granted, provided that both the above copyright notice and this * permission notice appear in all copies, that both the above * copyright notice and this permission notice appear in all * supporting documentation, and that the name of M.I.T. not be used * in advertising or publicity pertaining to distribution of the * software without specific, written prior permission. M.I.T. makes * no representations about the suitability of this software for any * purpose. It is provided "as is" without express or implied * warranty. * * THIS SOFTWARE IS PROVIDED BY M.I.T. ``AS IS''. M.I.T. DISCLAIMS * ALL EXPRESS OR IMPLIED WARRANTIES WITH REGARD TO THIS SOFTWARE, * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT * SHALL M.I.T. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include +#include +#include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include extern int in_inithead(void **head, int off); #ifdef VIMAGE extern int in_detachhead(void **head, int off); #endif static void in_setifarnh(struct rib_head *rh, uint32_t fibnum, int af, void *_arg); static void in_rtqtimo_setrnh(struct rib_head *rh, uint32_t fibnum, int af, void *_arg); #define RTPRF_OURS RTF_PROTO3 /* set on routes we manage */ /* * Do what we need to do when inserting a route. */ static struct radix_node * in_addroute(void *v_arg, void *n_arg, struct radix_head *head, struct radix_node *treenodes) { struct rtentry *rt = (struct rtentry *)treenodes; struct sockaddr_in *sin = (struct sockaddr_in *)rt_key(rt); /* * A little bit of help for both IP output and input: * For host routes, we make sure that RTF_BROADCAST * is set for anything that looks like a broadcast address. * This way, we can avoid an expensive call to in_broadcast() * in ip_output() most of the time (because the route passed * to ip_output() is almost always a host route). * * We also do the same for local addresses, with the thought * that this might one day be used to speed up ip_input(). * * We also mark routes to multicast addresses as such, because * it's easy to do and might be useful (but this is much more * dubious since it's so easy to inspect the address). */ if (rt->rt_flags & RTF_HOST) { if (in_broadcast(sin->sin_addr, rt->rt_ifp)) { rt->rt_flags |= RTF_BROADCAST; } else if (satosin(rt->rt_ifa->ifa_addr)->sin_addr.s_addr == sin->sin_addr.s_addr) { rt->rt_flags |= RTF_LOCAL; } } if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) rt->rt_flags |= RTF_MULTICAST; if (rt->rt_ifp != NULL) { /* * Check route MTU: * inherit interface MTU if not set or * check if MTU is too large. */ if (rt->rt_mtu == 0) { rt->rt_mtu = rt->rt_ifp->if_mtu; } else if (rt->rt_mtu > rt->rt_ifp->if_mtu) rt->rt_mtu = rt->rt_ifp->if_mtu; } return (rn_addroute(v_arg, n_arg, head, treenodes)); } /* * This code is the inverse of in_clsroute: on first reference, if we * were managing the route, stop doing so and set the expiration timer * back off again. */ static struct radix_node * in_matroute(void *v_arg, struct radix_head *head) { struct radix_node *rn = rn_match(v_arg, head); struct rtentry *rt = (struct rtentry *)rn; if (rt) { RT_LOCK(rt); if (rt->rt_flags & RTPRF_OURS) { rt->rt_flags &= ~RTPRF_OURS; rt->rt_expire = 0; } RT_UNLOCK(rt); } return rn; } static VNET_DEFINE(int, rtq_reallyold) = 60*60; /* one hour is "really old" */ #define V_rtq_reallyold VNET(rtq_reallyold) SYSCTL_INT(_net_inet_ip, IPCTL_RTEXPIRE, rtexpire, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(rtq_reallyold), 0, "Default expiration time on dynamically learned routes"); /* * On last reference drop, mark the route as belong to us so that it can be * timed out. */ static void in_clsroute(struct radix_node *rn, struct radix_head *head) { struct rtentry *rt = (struct rtentry *)rn; struct rib_head *rh = (struct rib_head *)head; RT_LOCK_ASSERT(rt); if (!(rt->rt_flags & RTF_UP)) return; /* prophylactic measures */ if (rt->rt_flags & RTPRF_OURS) return; if (!(rt->rt_flags & RTF_DYNAMIC)) return; /* * If rtq_reallyold is 0, just delete the route without * waiting for a timeout cycle to kill it. */ if (V_rtq_reallyold != 0) { rt->rt_flags |= RTPRF_OURS; rt->rt_expire = time_uptime + V_rtq_reallyold; } else rt_expunge(rh, rt); } struct rtqk_arg { struct rib_head *rh; int draining; int killed; int found; }; /* * Get rid of old routes. When draining, this deletes everything, even when * the timeout is not expired yet. */ static int in_rtqkill(struct rtentry *rt, void *rock) { struct rtqk_arg *ap = rock; int err; - RIB_WLOCK_ASSERT(ap->rh); + //RIB_WLOCK_ASSERT(ap->rh); if (rt->rt_flags & RTPRF_OURS) { ap->found++; if (ap->draining || rt->rt_expire <= time_uptime) { if (rt->rt_refcnt > 0) panic("rtqkill route really not free"); err = in_rtrequest(RTM_DELETE, (struct sockaddr *)rt_key(rt), rt->rt_gateway, rt_mask(rt), rt->rt_flags | RTF_RNH_LOCKED, 0, rt->rt_fibnum); if (err != 0) { log(LOG_WARNING, "in_rtqkill: error %d\n", err); } else ap->killed++; } } return 0; } #define RTQ_TIMEOUT 60*10 /* run no less than once every ten minutes */ static VNET_DEFINE(int, rtq_timeout) = RTQ_TIMEOUT; static VNET_DEFINE(struct callout, rtq_timer); #define V_rtq_timeout VNET(rtq_timeout) #define V_rtq_timer VNET(rtq_timer) static void in_rtqtimo_setrnh(struct rib_head *rh, uint32_t fibnum, int af, void *_arg) { struct rtqk_arg *arg; int draining; arg = (struct rtqk_arg *)_arg; draining = arg->draining; memset(arg, 0, sizeof(*arg)); arg->rh = rh; arg->draining = arg->draining; } static void in_rtqtimo(void *rock) { CURVNET_SET((struct vnet *) rock); struct rtqk_arg arg; struct timeval atv; memset(&arg, 0, sizeof(arg)); rt_foreach_fib(AF_INET, in_rtqtimo_setrnh, in_rtqkill, &arg); atv.tv_usec = 0; atv.tv_sec = V_rtq_timeout; callout_reset(&V_rtq_timer, tvtohz(&atv), in_rtqtimo, rock); CURVNET_RESTORE(); } void in_rtqdrain(void) { VNET_ITERATOR_DECL(vnet_iter); struct rtqk_arg arg; memset(&arg, 0, sizeof(arg)); arg.draining = 1; VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); rt_foreach_fib(AF_INET, in_rtqtimo_setrnh, in_rtqkill, &arg); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); } void in_setmatchfunc(struct rib_head *rh, int val) { + RIB_CFG_WLOCK(rh); RIB_WLOCK(rh); rh->rnh_matchaddr = (val != 0) ? rn_match : in_matroute; RIB_WUNLOCK(rh); + RIB_CFG_WUNLOCK(rh); } static int _in_rt_was_here; /* * Initialize our routing tree. */ int in_inithead(void **head, int off) { struct rib_head *rh; rh = rt_table_init(32); if (rh == NULL) return (0); rh->rnh_addaddr = in_addroute; in_setmatchfunc(rh, V_drop_redirect); rh->rnh_close = in_clsroute; *head = (void *)rh; if (_in_rt_was_here == 0 ) { callout_init(&V_rtq_timer, CALLOUT_MPSAFE); callout_reset(&V_rtq_timer, 1, in_rtqtimo, curvnet); _in_rt_was_here = 1; } return 1; } #ifdef VIMAGE int in_detachhead(void **head, int off) { callout_drain(&V_rtq_timer); return (1); } #endif /* * This zaps old routes when the interface goes down or interface * address is deleted. In the latter case, it deletes static routes * that point to this address. If we don't do this, we may end up * using the old address in the future. The ones we always want to * get rid of are things like ARP entries, since the user might down * the interface, walk over to a completely different network, and * plug back in. */ struct in_ifadown_arg { struct rib_head *rh; struct ifaddr *ifa; int del; }; static int in_ifadownkill(struct rtentry *rt, void *xap) { struct in_ifadown_arg *ap = xap; RT_LOCK(rt); if (rt->rt_ifa == ap->ifa && (ap->del || !(rt->rt_flags & RTF_STATIC))) { /* * Aquire a reference so that it can later be freed * as the refcount would be 0 here in case of at least * ap->del. */ RT_ADDREF(rt); /* * Disconnect it from the tree and permit protocols * to cleanup. */ rt_expunge(ap->rh, rt); /* * At this point it is an rttrash node, and in case * the above is the only reference we must free it. * If we do not noone will have a pointer and the * rtentry will be leaked forever. * In case someone else holds a reference, we are * fine as we only decrement the refcount. In that * case if the other entity calls RT_REMREF, we * will still be leaking but at least we tried. */ RTFREE_LOCKED(rt); return (0); } RT_UNLOCK(rt); return 0; } static void in_setifarnh(struct rib_head *rh, uint32_t fibnum, int af, void *_arg) { struct in_ifadown_arg *arg; arg = (struct in_ifadown_arg *)_arg; arg->rh = rh; } void in_ifadown(struct ifaddr *ifa, int delete) { struct in_ifadown_arg arg; KASSERT(ifa->ifa_addr->sa_family == AF_INET, ("%s: wrong family", __func__)); arg.ifa = ifa; arg.del = delete; rt_foreach_fib(AF_INET, in_setifarnh, in_ifadownkill, &arg); ifa->ifa_flags &= ~IFA_ROUTE; /* XXXlocking? */ } /* * inet versions of rt functions. These have fib extensions and * for now will just reference the _fib variants. * eventually this order will be reversed, */ int in_rtrequest( int req, struct sockaddr *dst, struct sockaddr *gateway, struct sockaddr *netmask, int flags, struct rtentry **ret_nrt, u_int fibnum) { return (rtrequest_fib(req, dst, gateway, netmask, flags, ret_nrt, fibnum)); } void in_rtredirect(struct sockaddr *dst, struct sockaddr *gateway, struct sockaddr *netmask, int flags, struct sockaddr *src, u_int fibnum) { rtredirect_fib(dst, gateway, netmask, flags, src, fibnum); } #if 0 int in_rt_getifa(struct rt_addrinfo *, u_int fibnum); int in_rtioctl(u_long, caddr_t, u_int); int in_rtrequest1(int, struct rt_addrinfo *, struct rtentry **, u_int); #endif Index: projects/routing/sys/netinet6/nd6_rtr.c =================================================================== --- projects/routing/sys/netinet6/nd6_rtr.c (revision 274335) +++ projects/routing/sys/netinet6/nd6_rtr.c (revision 274336) @@ -1,2131 +1,2134 @@ /*- * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: nd6_rtr.c,v 1.111 2001/04/27 01:37:15 jinmei Exp $ */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int rtpref(struct nd_defrouter *); static struct nd_defrouter *defrtrlist_update(struct nd_defrouter *); static int prelist_update(struct nd_prefixctl *, struct nd_defrouter *, struct mbuf *, int); static struct in6_ifaddr *in6_ifadd(struct nd_prefixctl *, int); static struct nd_pfxrouter *pfxrtr_lookup(struct nd_prefix *, struct nd_defrouter *); static void pfxrtr_add(struct nd_prefix *, struct nd_defrouter *); static void pfxrtr_del(struct nd_pfxrouter *); static struct nd_pfxrouter *find_pfxlist_reachable_router (struct nd_prefix *); static void defrouter_delreq(struct nd_defrouter *); static void nd6_rtmsg(int, struct rtentry *); static int in6_init_prefix_ltimes(struct nd_prefix *); static void in6_init_address_ltimes(struct nd_prefix *, struct in6_addrlifetime *); static int nd6_prefix_onlink(struct nd_prefix *); static int nd6_prefix_offlink(struct nd_prefix *); static int rt6_deleteroute(struct rtentry *, void *); VNET_DECLARE(int, nd6_recalc_reachtm_interval); #define V_nd6_recalc_reachtm_interval VNET(nd6_recalc_reachtm_interval) static VNET_DEFINE(struct ifnet *, nd6_defifp); VNET_DEFINE(int, nd6_defifindex); #define V_nd6_defifp VNET(nd6_defifp) VNET_DEFINE(int, ip6_use_tempaddr) = 0; VNET_DEFINE(int, ip6_desync_factor); VNET_DEFINE(u_int32_t, ip6_temp_preferred_lifetime) = DEF_TEMP_PREFERRED_LIFETIME; VNET_DEFINE(u_int32_t, ip6_temp_valid_lifetime) = DEF_TEMP_VALID_LIFETIME; VNET_DEFINE(int, ip6_temp_regen_advance) = TEMPADDR_REGEN_ADVANCE; /* RTPREF_MEDIUM has to be 0! */ #define RTPREF_HIGH 1 #define RTPREF_MEDIUM 0 #define RTPREF_LOW (-1) #define RTPREF_RESERVED (-2) #define RTPREF_INVALID (-3) /* internal */ /* * Receive Router Solicitation Message - just for routers. * Router solicitation/advertisement is mostly managed by userland program * (rtadvd) so here we have no function like nd6_ra_output(). * * Based on RFC 2461 */ void nd6_rs_input(struct mbuf *m, int off, int icmp6len) { struct ifnet *ifp = m->m_pkthdr.rcvif; struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); struct nd_router_solicit *nd_rs; struct in6_addr saddr6 = ip6->ip6_src; char *lladdr = NULL; int lladdrlen = 0; union nd_opts ndopts; char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN]; /* * Accept RS only when V_ip6_forwarding=1 and the interface has * no ND6_IFF_ACCEPT_RTADV. */ if (!V_ip6_forwarding || ND_IFINFO(ifp)->flags & ND6_IFF_ACCEPT_RTADV) goto freeit; /* Sanity checks */ if (ip6->ip6_hlim != 255) { nd6log((LOG_ERR, "nd6_rs_input: invalid hlim (%d) from %s to %s on %s\n", ip6->ip6_hlim, ip6_sprintf(ip6bufs, &ip6->ip6_src), ip6_sprintf(ip6bufd, &ip6->ip6_dst), if_name(ifp))); goto bad; } /* * Don't update the neighbor cache, if src = ::. * This indicates that the src has no IP address assigned yet. */ if (IN6_IS_ADDR_UNSPECIFIED(&saddr6)) goto freeit; #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, icmp6len,); nd_rs = (struct nd_router_solicit *)((caddr_t)ip6 + off); #else IP6_EXTHDR_GET(nd_rs, struct nd_router_solicit *, m, off, icmp6len); if (nd_rs == NULL) { ICMP6STAT_INC(icp6s_tooshort); return; } #endif icmp6len -= sizeof(*nd_rs); nd6_option_init(nd_rs + 1, icmp6len, &ndopts); if (nd6_options(&ndopts) < 0) { nd6log((LOG_INFO, "nd6_rs_input: invalid ND option, ignored\n")); /* nd6_options have incremented stats */ goto freeit; } if (ndopts.nd_opts_src_lladdr) { lladdr = (char *)(ndopts.nd_opts_src_lladdr + 1); lladdrlen = ndopts.nd_opts_src_lladdr->nd_opt_len << 3; } if (lladdr && ((ifp->if_addrlen + 2 + 7) & ~7) != lladdrlen) { nd6log((LOG_INFO, "nd6_rs_input: lladdrlen mismatch for %s " "(if %d, RS packet %d)\n", ip6_sprintf(ip6bufs, &saddr6), ifp->if_addrlen, lladdrlen - 2)); goto bad; } nd6_cache_lladdr(ifp, &saddr6, lladdr, lladdrlen, ND_ROUTER_SOLICIT, 0); freeit: m_freem(m); return; bad: ICMP6STAT_INC(icp6s_badrs); m_freem(m); } /* * Receive Router Advertisement Message. * * Based on RFC 2461 * TODO: on-link bit on prefix information * TODO: ND_RA_FLAG_{OTHER,MANAGED} processing */ void nd6_ra_input(struct mbuf *m, int off, int icmp6len) { struct ifnet *ifp = m->m_pkthdr.rcvif; struct nd_ifinfo *ndi = ND_IFINFO(ifp); struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); struct nd_router_advert *nd_ra; struct in6_addr saddr6 = ip6->ip6_src; int mcast = 0; union nd_opts ndopts; struct nd_defrouter *dr; char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN]; /* * We only accept RAs only when the per-interface flag * ND6_IFF_ACCEPT_RTADV is on the receiving interface. */ if (!(ndi->flags & ND6_IFF_ACCEPT_RTADV)) goto freeit; if (ip6->ip6_hlim != 255) { nd6log((LOG_ERR, "nd6_ra_input: invalid hlim (%d) from %s to %s on %s\n", ip6->ip6_hlim, ip6_sprintf(ip6bufs, &ip6->ip6_src), ip6_sprintf(ip6bufd, &ip6->ip6_dst), if_name(ifp))); goto bad; } if (!IN6_IS_ADDR_LINKLOCAL(&saddr6)) { nd6log((LOG_ERR, "nd6_ra_input: src %s is not link-local\n", ip6_sprintf(ip6bufs, &saddr6))); goto bad; } #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, icmp6len,); nd_ra = (struct nd_router_advert *)((caddr_t)ip6 + off); #else IP6_EXTHDR_GET(nd_ra, struct nd_router_advert *, m, off, icmp6len); if (nd_ra == NULL) { ICMP6STAT_INC(icp6s_tooshort); return; } #endif icmp6len -= sizeof(*nd_ra); nd6_option_init(nd_ra + 1, icmp6len, &ndopts); if (nd6_options(&ndopts) < 0) { nd6log((LOG_INFO, "nd6_ra_input: invalid ND option, ignored\n")); /* nd6_options have incremented stats */ goto freeit; } { struct nd_defrouter dr0; u_int32_t advreachable = nd_ra->nd_ra_reachable; /* remember if this is a multicasted advertisement */ if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) mcast = 1; bzero(&dr0, sizeof(dr0)); dr0.rtaddr = saddr6; dr0.flags = nd_ra->nd_ra_flags_reserved; /* * Effectively-disable routes from RA messages when * ND6_IFF_NO_RADR enabled on the receiving interface or * (ip6.forwarding == 1 && ip6.rfc6204w3 != 1). */ if (ndi->flags & ND6_IFF_NO_RADR) dr0.rtlifetime = 0; else if (V_ip6_forwarding && !V_ip6_rfc6204w3) dr0.rtlifetime = 0; else dr0.rtlifetime = ntohs(nd_ra->nd_ra_router_lifetime); dr0.expire = time_uptime + dr0.rtlifetime; dr0.ifp = ifp; /* unspecified or not? (RFC 2461 6.3.4) */ if (advreachable) { advreachable = ntohl(advreachable); if (advreachable <= MAX_REACHABLE_TIME && ndi->basereachable != advreachable) { ndi->basereachable = advreachable; ndi->reachable = ND_COMPUTE_RTIME(ndi->basereachable); ndi->recalctm = V_nd6_recalc_reachtm_interval; /* reset */ } } if (nd_ra->nd_ra_retransmit) ndi->retrans = ntohl(nd_ra->nd_ra_retransmit); if (nd_ra->nd_ra_curhoplimit) ndi->chlim = nd_ra->nd_ra_curhoplimit; dr = defrtrlist_update(&dr0); } /* * prefix */ if (ndopts.nd_opts_pi) { struct nd_opt_hdr *pt; struct nd_opt_prefix_info *pi = NULL; struct nd_prefixctl pr; for (pt = (struct nd_opt_hdr *)ndopts.nd_opts_pi; pt <= (struct nd_opt_hdr *)ndopts.nd_opts_pi_end; pt = (struct nd_opt_hdr *)((caddr_t)pt + (pt->nd_opt_len << 3))) { if (pt->nd_opt_type != ND_OPT_PREFIX_INFORMATION) continue; pi = (struct nd_opt_prefix_info *)pt; if (pi->nd_opt_pi_len != 4) { nd6log((LOG_INFO, "nd6_ra_input: invalid option " "len %d for prefix information option, " "ignored\n", pi->nd_opt_pi_len)); continue; } if (128 < pi->nd_opt_pi_prefix_len) { nd6log((LOG_INFO, "nd6_ra_input: invalid prefix " "len %d for prefix information option, " "ignored\n", pi->nd_opt_pi_prefix_len)); continue; } if (IN6_IS_ADDR_MULTICAST(&pi->nd_opt_pi_prefix) || IN6_IS_ADDR_LINKLOCAL(&pi->nd_opt_pi_prefix)) { nd6log((LOG_INFO, "nd6_ra_input: invalid prefix " "%s, ignored\n", ip6_sprintf(ip6bufs, &pi->nd_opt_pi_prefix))); continue; } bzero(&pr, sizeof(pr)); pr.ndpr_prefix.sin6_family = AF_INET6; pr.ndpr_prefix.sin6_len = sizeof(pr.ndpr_prefix); pr.ndpr_prefix.sin6_addr = pi->nd_opt_pi_prefix; pr.ndpr_ifp = (struct ifnet *)m->m_pkthdr.rcvif; pr.ndpr_raf_onlink = (pi->nd_opt_pi_flags_reserved & ND_OPT_PI_FLAG_ONLINK) ? 1 : 0; pr.ndpr_raf_auto = (pi->nd_opt_pi_flags_reserved & ND_OPT_PI_FLAG_AUTO) ? 1 : 0; pr.ndpr_plen = pi->nd_opt_pi_prefix_len; pr.ndpr_vltime = ntohl(pi->nd_opt_pi_valid_time); pr.ndpr_pltime = ntohl(pi->nd_opt_pi_preferred_time); (void)prelist_update(&pr, dr, m, mcast); } } /* * MTU */ if (ndopts.nd_opts_mtu && ndopts.nd_opts_mtu->nd_opt_mtu_len == 1) { u_long mtu; u_long maxmtu; mtu = (u_long)ntohl(ndopts.nd_opts_mtu->nd_opt_mtu_mtu); /* lower bound */ if (mtu < IPV6_MMTU) { nd6log((LOG_INFO, "nd6_ra_input: bogus mtu option " "mtu=%lu sent from %s, ignoring\n", mtu, ip6_sprintf(ip6bufs, &ip6->ip6_src))); goto skip; } /* upper bound */ maxmtu = (ndi->maxmtu && ndi->maxmtu < ifp->if_mtu) ? ndi->maxmtu : ifp->if_mtu; if (mtu <= maxmtu) { int change = (ndi->linkmtu != mtu); ndi->linkmtu = mtu; if (change) /* in6_maxmtu may change */ in6_setmaxmtu(); } else { nd6log((LOG_INFO, "nd6_ra_input: bogus mtu " "mtu=%lu sent from %s; " "exceeds maxmtu %lu, ignoring\n", mtu, ip6_sprintf(ip6bufs, &ip6->ip6_src), maxmtu)); } } skip: /* * Source link layer address */ { char *lladdr = NULL; int lladdrlen = 0; if (ndopts.nd_opts_src_lladdr) { lladdr = (char *)(ndopts.nd_opts_src_lladdr + 1); lladdrlen = ndopts.nd_opts_src_lladdr->nd_opt_len << 3; } if (lladdr && ((ifp->if_addrlen + 2 + 7) & ~7) != lladdrlen) { nd6log((LOG_INFO, "nd6_ra_input: lladdrlen mismatch for %s " "(if %d, RA packet %d)\n", ip6_sprintf(ip6bufs, &saddr6), ifp->if_addrlen, lladdrlen - 2)); goto bad; } nd6_cache_lladdr(ifp, &saddr6, lladdr, lladdrlen, ND_ROUTER_ADVERT, 0); /* * Installing a link-layer address might change the state of the * router's neighbor cache, which might also affect our on-link * detection of adveritsed prefixes. */ pfxlist_onlink_check(); } freeit: m_freem(m); return; bad: ICMP6STAT_INC(icp6s_badra); m_freem(m); } /* * default router list proccessing sub routines */ /* tell the change to user processes watching the routing socket. */ static void nd6_rtmsg(int cmd, struct rtentry *rt) { struct rt_addrinfo info; struct ifnet *ifp; struct ifaddr *ifa; bzero((caddr_t)&info, sizeof(info)); info.rti_info[RTAX_DST] = rt_key(rt); info.rti_info[RTAX_GATEWAY] = rt->rt_gateway; info.rti_info[RTAX_NETMASK] = rt_mask(rt); ifp = rt->rt_ifp; if (ifp != NULL) { IF_ADDR_RLOCK(ifp); ifa = TAILQ_FIRST(&ifp->if_addrhead); info.rti_info[RTAX_IFP] = ifa->ifa_addr; ifa_ref(ifa); IF_ADDR_RUNLOCK(ifp); info.rti_info[RTAX_IFA] = rt->rt_ifa->ifa_addr; } else ifa = NULL; rt_missmsg_fib(cmd, &info, rt->rt_flags, 0, rt->rt_fibnum); if (ifa != NULL) ifa_free(ifa); } static void defrouter_addreq(struct nd_defrouter *new) { struct sockaddr_in6 def, mask, gate; struct rtentry *newrt = NULL; int error; bzero(&def, sizeof(def)); bzero(&mask, sizeof(mask)); bzero(&gate, sizeof(gate)); def.sin6_len = mask.sin6_len = gate.sin6_len = sizeof(struct sockaddr_in6); def.sin6_family = gate.sin6_family = AF_INET6; gate.sin6_addr = new->rtaddr; error = in6_rtrequest(RTM_ADD, (struct sockaddr *)&def, (struct sockaddr *)&gate, (struct sockaddr *)&mask, RTF_GATEWAY, &newrt, RT_DEFAULT_FIB); if (newrt) { nd6_rtmsg(RTM_ADD, newrt); /* tell user process */ RTFREE(newrt); } if (error == 0) new->installed = 1; return; } struct nd_defrouter * defrouter_lookup(struct in6_addr *addr, struct ifnet *ifp) { struct nd_defrouter *dr; TAILQ_FOREACH(dr, &V_nd_defrouter, dr_entry) { if (dr->ifp == ifp && IN6_ARE_ADDR_EQUAL(addr, &dr->rtaddr)) return (dr); } return (NULL); /* search failed */ } /* * Remove the default route for a given router. * This is just a subroutine function for defrouter_select(), and should * not be called from anywhere else. */ static void defrouter_delreq(struct nd_defrouter *dr) { struct sockaddr_in6 def, mask, gate; struct rtentry *oldrt = NULL; bzero(&def, sizeof(def)); bzero(&mask, sizeof(mask)); bzero(&gate, sizeof(gate)); def.sin6_len = mask.sin6_len = gate.sin6_len = sizeof(struct sockaddr_in6); def.sin6_family = gate.sin6_family = AF_INET6; gate.sin6_addr = dr->rtaddr; in6_rtrequest(RTM_DELETE, (struct sockaddr *)&def, (struct sockaddr *)&gate, (struct sockaddr *)&mask, RTF_GATEWAY, &oldrt, RT_DEFAULT_FIB); if (oldrt) { nd6_rtmsg(RTM_DELETE, oldrt); RTFREE(oldrt); } dr->installed = 0; } /* * remove all default routes from default router list */ void defrouter_reset(void) { struct nd_defrouter *dr; TAILQ_FOREACH(dr, &V_nd_defrouter, dr_entry) defrouter_delreq(dr); /* * XXX should we also nuke any default routers in the kernel, by * going through them by rtalloc1()? */ } void defrtrlist_del(struct nd_defrouter *dr) { struct nd_defrouter *deldr = NULL; struct nd_prefix *pr; /* * Flush all the routing table entries that use the router * as a next hop. */ if (ND_IFINFO(dr->ifp)->flags & ND6_IFF_ACCEPT_RTADV) rt6_flush(&dr->rtaddr, dr->ifp); if (dr->installed) { deldr = dr; defrouter_delreq(dr); } TAILQ_REMOVE(&V_nd_defrouter, dr, dr_entry); /* * Also delete all the pointers to the router in each prefix lists. */ LIST_FOREACH(pr, &V_nd_prefix, ndpr_entry) { struct nd_pfxrouter *pfxrtr; if ((pfxrtr = pfxrtr_lookup(pr, dr)) != NULL) pfxrtr_del(pfxrtr); } pfxlist_onlink_check(); /* * If the router is the primary one, choose a new one. * Note that defrouter_select() will remove the current gateway * from the routing table. */ if (deldr) defrouter_select(); free(dr, M_IP6NDP); } /* * Default Router Selection according to Section 6.3.6 of RFC 2461 and * draft-ietf-ipngwg-router-selection: * 1) Routers that are reachable or probably reachable should be preferred. * If we have more than one (probably) reachable router, prefer ones * with the highest router preference. * 2) When no routers on the list are known to be reachable or * probably reachable, routers SHOULD be selected in a round-robin * fashion, regardless of router preference values. * 3) If the Default Router List is empty, assume that all * destinations are on-link. * * We assume nd_defrouter is sorted by router preference value. * Since the code below covers both with and without router preference cases, * we do not need to classify the cases by ifdef. * * At this moment, we do not try to install more than one default router, * even when the multipath routing is available, because we're not sure about * the benefits for stub hosts comparing to the risk of making the code * complicated and the possibility of introducing bugs. */ void defrouter_select(void) { struct nd_defrouter *dr, *selected_dr = NULL, *installed_dr = NULL; struct llentry *ln = NULL; /* * Let's handle easy case (3) first: * If default router list is empty, there's nothing to be done. */ if (TAILQ_EMPTY(&V_nd_defrouter)) return; /* * Search for a (probably) reachable router from the list. * We just pick up the first reachable one (if any), assuming that * the ordering rule of the list described in defrtrlist_update(). */ TAILQ_FOREACH(dr, &V_nd_defrouter, dr_entry) { IF_AFDATA_RLOCK(dr->ifp); if (selected_dr == NULL && (ln = nd6_lookup(&dr->rtaddr, 0, dr->ifp)) && ND6_IS_LLINFO_PROBREACH(ln)) { selected_dr = dr; } IF_AFDATA_RUNLOCK(dr->ifp); if (ln != NULL) { LLE_RUNLOCK(ln); ln = NULL; } if (dr->installed && installed_dr == NULL) installed_dr = dr; else if (dr->installed && installed_dr) { /* this should not happen. warn for diagnosis. */ log(LOG_ERR, "defrouter_select: more than one router" " is installed\n"); } } /* * If none of the default routers was found to be reachable, * round-robin the list regardless of preference. * Otherwise, if we have an installed router, check if the selected * (reachable) router should really be preferred to the installed one. * We only prefer the new router when the old one is not reachable * or when the new one has a really higher preference value. */ if (selected_dr == NULL) { if (installed_dr == NULL || !TAILQ_NEXT(installed_dr, dr_entry)) selected_dr = TAILQ_FIRST(&V_nd_defrouter); else selected_dr = TAILQ_NEXT(installed_dr, dr_entry); } else if (installed_dr) { IF_AFDATA_RLOCK(installed_dr->ifp); if ((ln = nd6_lookup(&installed_dr->rtaddr, 0, installed_dr->ifp)) && ND6_IS_LLINFO_PROBREACH(ln) && rtpref(selected_dr) <= rtpref(installed_dr)) { selected_dr = installed_dr; } IF_AFDATA_RUNLOCK(installed_dr->ifp); if (ln != NULL) LLE_RUNLOCK(ln); } /* * If the selected router is different than the installed one, * remove the installed router and install the selected one. * Note that the selected router is never NULL here. */ if (installed_dr != selected_dr) { if (installed_dr) defrouter_delreq(installed_dr); defrouter_addreq(selected_dr); } return; } /* * for default router selection * regards router-preference field as a 2-bit signed integer */ static int rtpref(struct nd_defrouter *dr) { switch (dr->flags & ND_RA_FLAG_RTPREF_MASK) { case ND_RA_FLAG_RTPREF_HIGH: return (RTPREF_HIGH); case ND_RA_FLAG_RTPREF_MEDIUM: case ND_RA_FLAG_RTPREF_RSV: return (RTPREF_MEDIUM); case ND_RA_FLAG_RTPREF_LOW: return (RTPREF_LOW); default: /* * This case should never happen. If it did, it would mean a * serious bug of kernel internal. We thus always bark here. * Or, can we even panic? */ log(LOG_ERR, "rtpref: impossible RA flag %x\n", dr->flags); return (RTPREF_INVALID); } /* NOTREACHED */ } static struct nd_defrouter * defrtrlist_update(struct nd_defrouter *new) { struct nd_defrouter *dr, *n; if ((dr = defrouter_lookup(&new->rtaddr, new->ifp)) != NULL) { /* entry exists */ if (new->rtlifetime == 0) { defrtrlist_del(dr); dr = NULL; } else { int oldpref = rtpref(dr); /* override */ dr->flags = new->flags; /* xxx flag check */ dr->rtlifetime = new->rtlifetime; dr->expire = new->expire; /* * If the preference does not change, there's no need * to sort the entries. Also make sure the selected * router is still installed in the kernel. */ if (dr->installed && rtpref(new) == oldpref) return (dr); /* * preferred router may be changed, so relocate * this router. * XXX: calling TAILQ_REMOVE directly is a bad manner. * However, since defrtrlist_del() has many side * effects, we intentionally do so here. * defrouter_select() below will handle routing * changes later. */ TAILQ_REMOVE(&V_nd_defrouter, dr, dr_entry); n = dr; goto insert; } return (dr); } /* entry does not exist */ if (new->rtlifetime == 0) return (NULL); n = (struct nd_defrouter *)malloc(sizeof(*n), M_IP6NDP, M_NOWAIT); if (n == NULL) return (NULL); bzero(n, sizeof(*n)); *n = *new; insert: /* * Insert the new router in the Default Router List; * The Default Router List should be in the descending order * of router-preferece. Routers with the same preference are * sorted in the arriving time order. */ /* insert at the end of the group */ TAILQ_FOREACH(dr, &V_nd_defrouter, dr_entry) { if (rtpref(n) > rtpref(dr)) break; } if (dr) TAILQ_INSERT_BEFORE(dr, n, dr_entry); else TAILQ_INSERT_TAIL(&V_nd_defrouter, n, dr_entry); defrouter_select(); return (n); } static struct nd_pfxrouter * pfxrtr_lookup(struct nd_prefix *pr, struct nd_defrouter *dr) { struct nd_pfxrouter *search; LIST_FOREACH(search, &pr->ndpr_advrtrs, pfr_entry) { if (search->router == dr) break; } return (search); } static void pfxrtr_add(struct nd_prefix *pr, struct nd_defrouter *dr) { struct nd_pfxrouter *new; new = (struct nd_pfxrouter *)malloc(sizeof(*new), M_IP6NDP, M_NOWAIT); if (new == NULL) return; bzero(new, sizeof(*new)); new->router = dr; LIST_INSERT_HEAD(&pr->ndpr_advrtrs, new, pfr_entry); pfxlist_onlink_check(); } static void pfxrtr_del(struct nd_pfxrouter *pfr) { LIST_REMOVE(pfr, pfr_entry); free(pfr, M_IP6NDP); } struct nd_prefix * nd6_prefix_lookup(struct nd_prefixctl *key) { struct nd_prefix *search; LIST_FOREACH(search, &V_nd_prefix, ndpr_entry) { if (key->ndpr_ifp == search->ndpr_ifp && key->ndpr_plen == search->ndpr_plen && in6_are_prefix_equal(&key->ndpr_prefix.sin6_addr, &search->ndpr_prefix.sin6_addr, key->ndpr_plen)) { break; } } return (search); } int nd6_prelist_add(struct nd_prefixctl *pr, struct nd_defrouter *dr, struct nd_prefix **newp) { struct nd_prefix *new = NULL; int error = 0; char ip6buf[INET6_ADDRSTRLEN]; new = (struct nd_prefix *)malloc(sizeof(*new), M_IP6NDP, M_NOWAIT); if (new == NULL) return(ENOMEM); bzero(new, sizeof(*new)); new->ndpr_ifp = pr->ndpr_ifp; new->ndpr_prefix = pr->ndpr_prefix; new->ndpr_plen = pr->ndpr_plen; new->ndpr_vltime = pr->ndpr_vltime; new->ndpr_pltime = pr->ndpr_pltime; new->ndpr_flags = pr->ndpr_flags; if ((error = in6_init_prefix_ltimes(new)) != 0) { free(new, M_IP6NDP); return(error); } new->ndpr_lastupdate = time_uptime; if (newp != NULL) *newp = new; /* initialization */ LIST_INIT(&new->ndpr_advrtrs); in6_prefixlen2mask(&new->ndpr_mask, new->ndpr_plen); /* make prefix in the canonical form */ IN6_MASK_ADDR(&new->ndpr_prefix.sin6_addr, &new->ndpr_mask); /* link ndpr_entry to nd_prefix list */ LIST_INSERT_HEAD(&V_nd_prefix, new, ndpr_entry); /* ND_OPT_PI_FLAG_ONLINK processing */ if (new->ndpr_raf_onlink) { int e; if ((e = nd6_prefix_onlink(new)) != 0) { nd6log((LOG_ERR, "nd6_prelist_add: failed to make " "the prefix %s/%d on-link on %s (errno=%d)\n", ip6_sprintf(ip6buf, &pr->ndpr_prefix.sin6_addr), pr->ndpr_plen, if_name(pr->ndpr_ifp), e)); /* proceed anyway. XXX: is it correct? */ } } if (dr) pfxrtr_add(new, dr); return 0; } void prelist_remove(struct nd_prefix *pr) { struct nd_pfxrouter *pfr, *next; int e; char ip6buf[INET6_ADDRSTRLEN]; /* make sure to invalidate the prefix until it is really freed. */ pr->ndpr_vltime = 0; pr->ndpr_pltime = 0; /* * Though these flags are now meaningless, we'd rather keep the value * of pr->ndpr_raf_onlink and pr->ndpr_raf_auto not to confuse users * when executing "ndp -p". */ if ((pr->ndpr_stateflags & NDPRF_ONLINK) != 0 && (e = nd6_prefix_offlink(pr)) != 0) { nd6log((LOG_ERR, "prelist_remove: failed to make %s/%d offlink " "on %s, errno=%d\n", ip6_sprintf(ip6buf, &pr->ndpr_prefix.sin6_addr), pr->ndpr_plen, if_name(pr->ndpr_ifp), e)); /* what should we do? */ } if (pr->ndpr_refcnt > 0) return; /* notice here? */ /* unlink ndpr_entry from nd_prefix list */ LIST_REMOVE(pr, ndpr_entry); /* free list of routers that adversed the prefix */ LIST_FOREACH_SAFE(pfr, &pr->ndpr_advrtrs, pfr_entry, next) { free(pfr, M_IP6NDP); } free(pr, M_IP6NDP); pfxlist_onlink_check(); } /* * dr - may be NULL */ static int prelist_update(struct nd_prefixctl *new, struct nd_defrouter *dr, struct mbuf *m, int mcast) { struct in6_ifaddr *ia6 = NULL, *ia6_match = NULL; struct ifaddr *ifa; struct ifnet *ifp = new->ndpr_ifp; struct nd_prefix *pr; int error = 0; int newprefix = 0; int auth; struct in6_addrlifetime lt6_tmp; char ip6buf[INET6_ADDRSTRLEN]; auth = 0; if (m) { /* * Authenticity for NA consists authentication for * both IP header and IP datagrams, doesn't it ? */ #if defined(M_AUTHIPHDR) && defined(M_AUTHIPDGM) auth = ((m->m_flags & M_AUTHIPHDR) && (m->m_flags & M_AUTHIPDGM)); #endif } if ((pr = nd6_prefix_lookup(new)) != NULL) { /* * nd6_prefix_lookup() ensures that pr and new have the same * prefix on a same interface. */ /* * Update prefix information. Note that the on-link (L) bit * and the autonomous (A) bit should NOT be changed from 1 * to 0. */ if (new->ndpr_raf_onlink == 1) pr->ndpr_raf_onlink = 1; if (new->ndpr_raf_auto == 1) pr->ndpr_raf_auto = 1; if (new->ndpr_raf_onlink) { pr->ndpr_vltime = new->ndpr_vltime; pr->ndpr_pltime = new->ndpr_pltime; (void)in6_init_prefix_ltimes(pr); /* XXX error case? */ pr->ndpr_lastupdate = time_uptime; } if (new->ndpr_raf_onlink && (pr->ndpr_stateflags & NDPRF_ONLINK) == 0) { int e; if ((e = nd6_prefix_onlink(pr)) != 0) { nd6log((LOG_ERR, "prelist_update: failed to make " "the prefix %s/%d on-link on %s " "(errno=%d)\n", ip6_sprintf(ip6buf, &pr->ndpr_prefix.sin6_addr), pr->ndpr_plen, if_name(pr->ndpr_ifp), e)); /* proceed anyway. XXX: is it correct? */ } } if (dr && pfxrtr_lookup(pr, dr) == NULL) pfxrtr_add(pr, dr); } else { struct nd_prefix *newpr = NULL; newprefix = 1; if (new->ndpr_vltime == 0) goto end; if (new->ndpr_raf_onlink == 0 && new->ndpr_raf_auto == 0) goto end; error = nd6_prelist_add(new, dr, &newpr); if (error != 0 || newpr == NULL) { nd6log((LOG_NOTICE, "prelist_update: " "nd6_prelist_add failed for %s/%d on %s " "errno=%d, returnpr=%p\n", ip6_sprintf(ip6buf, &new->ndpr_prefix.sin6_addr), new->ndpr_plen, if_name(new->ndpr_ifp), error, newpr)); goto end; /* we should just give up in this case. */ } /* * XXX: from the ND point of view, we can ignore a prefix * with the on-link bit being zero. However, we need a * prefix structure for references from autoconfigured * addresses. Thus, we explicitly make sure that the prefix * itself expires now. */ if (newpr->ndpr_raf_onlink == 0) { newpr->ndpr_vltime = 0; newpr->ndpr_pltime = 0; in6_init_prefix_ltimes(newpr); } pr = newpr; } /* * Address autoconfiguration based on Section 5.5.3 of RFC 2462. * Note that pr must be non NULL at this point. */ /* 5.5.3 (a). Ignore the prefix without the A bit set. */ if (!new->ndpr_raf_auto) goto end; /* * 5.5.3 (b). the link-local prefix should have been ignored in * nd6_ra_input. */ /* 5.5.3 (c). Consistency check on lifetimes: pltime <= vltime. */ if (new->ndpr_pltime > new->ndpr_vltime) { error = EINVAL; /* XXX: won't be used */ goto end; } /* * 5.5.3 (d). If the prefix advertised is not equal to the prefix of * an address configured by stateless autoconfiguration already in the * list of addresses associated with the interface, and the Valid * Lifetime is not 0, form an address. We first check if we have * a matching prefix. * Note: we apply a clarification in rfc2462bis-02 here. We only * consider autoconfigured addresses while RFC2462 simply said * "address". */ IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { struct in6_ifaddr *ifa6; u_int32_t remaininglifetime; if (ifa->ifa_addr->sa_family != AF_INET6) continue; ifa6 = (struct in6_ifaddr *)ifa; /* * We only consider autoconfigured addresses as per rfc2462bis. */ if (!(ifa6->ia6_flags & IN6_IFF_AUTOCONF)) continue; /* * Spec is not clear here, but I believe we should concentrate * on unicast (i.e. not anycast) addresses. * XXX: other ia6_flags? detached or duplicated? */ if ((ifa6->ia6_flags & IN6_IFF_ANYCAST) != 0) continue; /* * Ignore the address if it is not associated with a prefix * or is associated with a prefix that is different from this * one. (pr is never NULL here) */ if (ifa6->ia6_ndpr != pr) continue; if (ia6_match == NULL) /* remember the first one */ ia6_match = ifa6; /* * An already autoconfigured address matched. Now that we * are sure there is at least one matched address, we can * proceed to 5.5.3. (e): update the lifetimes according to the * "two hours" rule and the privacy extension. * We apply some clarifications in rfc2462bis: * - use remaininglifetime instead of storedlifetime as a * variable name * - remove the dead code in the "two-hour" rule */ #define TWOHOUR (120*60) lt6_tmp = ifa6->ia6_lifetime; if (lt6_tmp.ia6t_vltime == ND6_INFINITE_LIFETIME) remaininglifetime = ND6_INFINITE_LIFETIME; else if (time_uptime - ifa6->ia6_updatetime > lt6_tmp.ia6t_vltime) { /* * The case of "invalid" address. We should usually * not see this case. */ remaininglifetime = 0; } else remaininglifetime = lt6_tmp.ia6t_vltime - (time_uptime - ifa6->ia6_updatetime); /* when not updating, keep the current stored lifetime. */ lt6_tmp.ia6t_vltime = remaininglifetime; if (TWOHOUR < new->ndpr_vltime || remaininglifetime < new->ndpr_vltime) { lt6_tmp.ia6t_vltime = new->ndpr_vltime; } else if (remaininglifetime <= TWOHOUR) { if (auth) { lt6_tmp.ia6t_vltime = new->ndpr_vltime; } } else { /* * new->ndpr_vltime <= TWOHOUR && * TWOHOUR < remaininglifetime */ lt6_tmp.ia6t_vltime = TWOHOUR; } /* The 2 hour rule is not imposed for preferred lifetime. */ lt6_tmp.ia6t_pltime = new->ndpr_pltime; in6_init_address_ltimes(pr, <6_tmp); /* * We need to treat lifetimes for temporary addresses * differently, according to * draft-ietf-ipv6-privacy-addrs-v2-01.txt 3.3 (1); * we only update the lifetimes when they are in the maximum * intervals. */ if ((ifa6->ia6_flags & IN6_IFF_TEMPORARY) != 0) { u_int32_t maxvltime, maxpltime; if (V_ip6_temp_valid_lifetime > (u_int32_t)((time_uptime - ifa6->ia6_createtime) + V_ip6_desync_factor)) { maxvltime = V_ip6_temp_valid_lifetime - (time_uptime - ifa6->ia6_createtime) - V_ip6_desync_factor; } else maxvltime = 0; if (V_ip6_temp_preferred_lifetime > (u_int32_t)((time_uptime - ifa6->ia6_createtime) + V_ip6_desync_factor)) { maxpltime = V_ip6_temp_preferred_lifetime - (time_uptime - ifa6->ia6_createtime) - V_ip6_desync_factor; } else maxpltime = 0; if (lt6_tmp.ia6t_vltime == ND6_INFINITE_LIFETIME || lt6_tmp.ia6t_vltime > maxvltime) { lt6_tmp.ia6t_vltime = maxvltime; } if (lt6_tmp.ia6t_pltime == ND6_INFINITE_LIFETIME || lt6_tmp.ia6t_pltime > maxpltime) { lt6_tmp.ia6t_pltime = maxpltime; } } ifa6->ia6_lifetime = lt6_tmp; ifa6->ia6_updatetime = time_uptime; } IF_ADDR_RUNLOCK(ifp); if (ia6_match == NULL && new->ndpr_vltime) { int ifidlen; /* * 5.5.3 (d) (continued) * No address matched and the valid lifetime is non-zero. * Create a new address. */ /* * Prefix Length check: * If the sum of the prefix length and interface identifier * length does not equal 128 bits, the Prefix Information * option MUST be ignored. The length of the interface * identifier is defined in a separate link-type specific * document. */ ifidlen = in6_if2idlen(ifp); if (ifidlen < 0) { /* this should not happen, so we always log it. */ log(LOG_ERR, "prelist_update: IFID undefined (%s)\n", if_name(ifp)); goto end; } if (ifidlen + pr->ndpr_plen != 128) { nd6log((LOG_INFO, "prelist_update: invalid prefixlen " "%d for %s, ignored\n", pr->ndpr_plen, if_name(ifp))); goto end; } if ((ia6 = in6_ifadd(new, mcast)) != NULL) { /* * note that we should use pr (not new) for reference. */ pr->ndpr_refcnt++; ia6->ia6_ndpr = pr; /* * RFC 3041 3.3 (2). * When a new public address is created as described * in RFC2462, also create a new temporary address. * * RFC 3041 3.5. * When an interface connects to a new link, a new * randomized interface identifier should be generated * immediately together with a new set of temporary * addresses. Thus, we specifiy 1 as the 2nd arg of * in6_tmpifadd(). */ if (V_ip6_use_tempaddr) { int e; if ((e = in6_tmpifadd(ia6, 1, 1)) != 0) { nd6log((LOG_NOTICE, "prelist_update: " "failed to create a temporary " "address, errno=%d\n", e)); } } ifa_free(&ia6->ia_ifa); /* * A newly added address might affect the status * of other addresses, so we check and update it. * XXX: what if address duplication happens? */ pfxlist_onlink_check(); } else { /* just set an error. do not bark here. */ error = EADDRNOTAVAIL; /* XXX: might be unused. */ } } end: return error; } /* * A supplement function used in the on-link detection below; * detect if a given prefix has a (probably) reachable advertising router. * XXX: lengthy function name... */ static struct nd_pfxrouter * find_pfxlist_reachable_router(struct nd_prefix *pr) { struct nd_pfxrouter *pfxrtr; struct llentry *ln; int canreach; LIST_FOREACH(pfxrtr, &pr->ndpr_advrtrs, pfr_entry) { IF_AFDATA_RLOCK(pfxrtr->router->ifp); ln = nd6_lookup(&pfxrtr->router->rtaddr, 0, pfxrtr->router->ifp); IF_AFDATA_RUNLOCK(pfxrtr->router->ifp); if (ln == NULL) continue; canreach = ND6_IS_LLINFO_PROBREACH(ln); LLE_RUNLOCK(ln); if (canreach) break; } return (pfxrtr); } /* * Check if each prefix in the prefix list has at least one available router * that advertised the prefix (a router is "available" if its neighbor cache * entry is reachable or probably reachable). * If the check fails, the prefix may be off-link, because, for example, * we have moved from the network but the lifetime of the prefix has not * expired yet. So we should not use the prefix if there is another prefix * that has an available router. * But, if there is no prefix that has an available router, we still regards * all the prefixes as on-link. This is because we can't tell if all the * routers are simply dead or if we really moved from the network and there * is no router around us. */ void pfxlist_onlink_check() { struct nd_prefix *pr; struct in6_ifaddr *ifa; struct nd_defrouter *dr; struct nd_pfxrouter *pfxrtr = NULL; /* * Check if there is a prefix that has a reachable advertising * router. */ LIST_FOREACH(pr, &V_nd_prefix, ndpr_entry) { if (pr->ndpr_raf_onlink && find_pfxlist_reachable_router(pr)) break; } /* * If we have no such prefix, check whether we still have a router * that does not advertise any prefixes. */ if (pr == NULL) { TAILQ_FOREACH(dr, &V_nd_defrouter, dr_entry) { struct nd_prefix *pr0; LIST_FOREACH(pr0, &V_nd_prefix, ndpr_entry) { if ((pfxrtr = pfxrtr_lookup(pr0, dr)) != NULL) break; } if (pfxrtr != NULL) break; } } if (pr != NULL || (!TAILQ_EMPTY(&V_nd_defrouter) && pfxrtr == NULL)) { /* * There is at least one prefix that has a reachable router, * or at least a router which probably does not advertise * any prefixes. The latter would be the case when we move * to a new link where we have a router that does not provide * prefixes and we configure an address by hand. * Detach prefixes which have no reachable advertising * router, and attach other prefixes. */ LIST_FOREACH(pr, &V_nd_prefix, ndpr_entry) { /* XXX: a link-local prefix should never be detached */ if (IN6_IS_ADDR_LINKLOCAL(&pr->ndpr_prefix.sin6_addr)) continue; /* * we aren't interested in prefixes without the L bit * set. */ if (pr->ndpr_raf_onlink == 0) continue; if (pr->ndpr_raf_auto == 0) continue; if ((pr->ndpr_stateflags & NDPRF_DETACHED) == 0 && find_pfxlist_reachable_router(pr) == NULL) pr->ndpr_stateflags |= NDPRF_DETACHED; if ((pr->ndpr_stateflags & NDPRF_DETACHED) != 0 && find_pfxlist_reachable_router(pr) != 0) pr->ndpr_stateflags &= ~NDPRF_DETACHED; } } else { /* there is no prefix that has a reachable router */ LIST_FOREACH(pr, &V_nd_prefix, ndpr_entry) { if (IN6_IS_ADDR_LINKLOCAL(&pr->ndpr_prefix.sin6_addr)) continue; if (pr->ndpr_raf_onlink == 0) continue; if (pr->ndpr_raf_auto == 0) continue; if ((pr->ndpr_stateflags & NDPRF_DETACHED) != 0) pr->ndpr_stateflags &= ~NDPRF_DETACHED; } } /* * Remove each interface route associated with a (just) detached * prefix, and reinstall the interface route for a (just) attached * prefix. Note that all attempt of reinstallation does not * necessarily success, when a same prefix is shared among multiple * interfaces. Such cases will be handled in nd6_prefix_onlink, * so we don't have to care about them. */ LIST_FOREACH(pr, &V_nd_prefix, ndpr_entry) { int e; char ip6buf[INET6_ADDRSTRLEN]; if (IN6_IS_ADDR_LINKLOCAL(&pr->ndpr_prefix.sin6_addr)) continue; if (pr->ndpr_raf_onlink == 0) continue; if (pr->ndpr_raf_auto == 0) continue; if ((pr->ndpr_stateflags & NDPRF_DETACHED) != 0 && (pr->ndpr_stateflags & NDPRF_ONLINK) != 0) { if ((e = nd6_prefix_offlink(pr)) != 0) { nd6log((LOG_ERR, "pfxlist_onlink_check: failed to " "make %s/%d offlink, errno=%d\n", ip6_sprintf(ip6buf, &pr->ndpr_prefix.sin6_addr), pr->ndpr_plen, e)); } } if ((pr->ndpr_stateflags & NDPRF_DETACHED) == 0 && (pr->ndpr_stateflags & NDPRF_ONLINK) == 0 && pr->ndpr_raf_onlink) { if ((e = nd6_prefix_onlink(pr)) != 0) { nd6log((LOG_ERR, "pfxlist_onlink_check: failed to " "make %s/%d onlink, errno=%d\n", ip6_sprintf(ip6buf, &pr->ndpr_prefix.sin6_addr), pr->ndpr_plen, e)); } } } /* * Changes on the prefix status might affect address status as well. * Make sure that all addresses derived from an attached prefix are * attached, and that all addresses derived from a detached prefix are * detached. Note, however, that a manually configured address should * always be attached. * The precise detection logic is same as the one for prefixes. * * XXXRW: in6_ifaddrhead locking. */ TAILQ_FOREACH(ifa, &V_in6_ifaddrhead, ia_link) { if (!(ifa->ia6_flags & IN6_IFF_AUTOCONF)) continue; if (ifa->ia6_ndpr == NULL) { /* * This can happen when we first configure the address * (i.e. the address exists, but the prefix does not). * XXX: complicated relationships... */ continue; } if (find_pfxlist_reachable_router(ifa->ia6_ndpr)) break; } if (ifa) { TAILQ_FOREACH(ifa, &V_in6_ifaddrhead, ia_link) { if ((ifa->ia6_flags & IN6_IFF_AUTOCONF) == 0) continue; if (ifa->ia6_ndpr == NULL) /* XXX: see above. */ continue; if (find_pfxlist_reachable_router(ifa->ia6_ndpr)) { if (ifa->ia6_flags & IN6_IFF_DETACHED) { ifa->ia6_flags &= ~IN6_IFF_DETACHED; ifa->ia6_flags |= IN6_IFF_TENTATIVE; nd6_dad_start((struct ifaddr *)ifa, 0); } } else { ifa->ia6_flags |= IN6_IFF_DETACHED; } } } else { TAILQ_FOREACH(ifa, &V_in6_ifaddrhead, ia_link) { if ((ifa->ia6_flags & IN6_IFF_AUTOCONF) == 0) continue; if (ifa->ia6_flags & IN6_IFF_DETACHED) { ifa->ia6_flags &= ~IN6_IFF_DETACHED; ifa->ia6_flags |= IN6_IFF_TENTATIVE; /* Do we need a delay in this case? */ nd6_dad_start((struct ifaddr *)ifa, 0); } } } } static int nd6_prefix_onlink_rtrequest(struct nd_prefix *pr, struct ifaddr *ifa) { static struct sockaddr_dl null_sdl = {sizeof(null_sdl), AF_LINK}; struct rib_head *rh; struct rtentry *rt; struct sockaddr_in6 mask6; u_long rtflags; int error, a_failure, fibnum; /* * in6_ifinit() sets nd6_rtrequest to ifa_rtrequest for all ifaddrs. * ifa->ifa_rtrequest = nd6_rtrequest; */ bzero(&mask6, sizeof(mask6)); mask6.sin6_len = sizeof(mask6); mask6.sin6_addr = pr->ndpr_mask; rtflags = (ifa->ifa_flags & ~IFA_RTSELF) | RTF_UP; a_failure = 0; for (fibnum = 0; fibnum < rt_numfibs; fibnum++) { rt = NULL; error = in6_rtrequest(RTM_ADD, (struct sockaddr *)&pr->ndpr_prefix, ifa->ifa_addr, (struct sockaddr *)&mask6, rtflags, &rt, fibnum); if (error == 0) { KASSERT(rt != NULL, ("%s: in6_rtrequest return no " "error(%d) but rt is NULL, pr=%p, ifa=%p", __func__, error, pr, ifa)); rh = rt_tables_get_rnh(rt->rt_fibnum, AF_INET6); /* XXX what if rhn == NULL? */ + RIB_CFG_WLOCK(rh); RIB_WLOCK(rh); RT_LOCK(rt); if (rt_setgate(rt, rt_key(rt), (struct sockaddr *)&null_sdl) == 0) { struct sockaddr_dl *dl; dl = (struct sockaddr_dl *)rt->rt_gateway; dl->sdl_type = rt->rt_ifp->if_type; dl->sdl_index = rt->rt_ifp->if_index; } RIB_WUNLOCK(rh); + RIB_CFG_WUNLOCK(rh); nd6_rtmsg(RTM_ADD, rt); RT_UNLOCK(rt); pr->ndpr_stateflags |= NDPRF_ONLINK; } else { char ip6buf[INET6_ADDRSTRLEN]; char ip6bufg[INET6_ADDRSTRLEN]; char ip6bufm[INET6_ADDRSTRLEN]; struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)ifa->ifa_addr; nd6log((LOG_ERR, "nd6_prefix_onlink: failed to add " "route for a prefix (%s/%d) on %s, gw=%s, mask=%s, " "flags=%lx errno = %d\n", ip6_sprintf(ip6buf, &pr->ndpr_prefix.sin6_addr), pr->ndpr_plen, if_name(pr->ndpr_ifp), ip6_sprintf(ip6bufg, &sin6->sin6_addr), ip6_sprintf(ip6bufm, &mask6.sin6_addr), rtflags, error)); /* Save last error to return, see rtinit(). */ a_failure = error; } if (rt != NULL) { RT_LOCK(rt); RT_REMREF(rt); RT_UNLOCK(rt); } } /* Return the last error we got. */ return (a_failure); } static int nd6_prefix_onlink(struct nd_prefix *pr) { struct ifaddr *ifa; struct ifnet *ifp = pr->ndpr_ifp; struct nd_prefix *opr; int error = 0; char ip6buf[INET6_ADDRSTRLEN]; /* sanity check */ if ((pr->ndpr_stateflags & NDPRF_ONLINK) != 0) { nd6log((LOG_ERR, "nd6_prefix_onlink: %s/%d is already on-link\n", ip6_sprintf(ip6buf, &pr->ndpr_prefix.sin6_addr), pr->ndpr_plen)); return (EEXIST); } /* * Add the interface route associated with the prefix. Before * installing the route, check if there's the same prefix on another * interface, and the prefix has already installed the interface route. * Although such a configuration is expected to be rare, we explicitly * allow it. */ LIST_FOREACH(opr, &V_nd_prefix, ndpr_entry) { if (opr == pr) continue; if ((opr->ndpr_stateflags & NDPRF_ONLINK) == 0) continue; if (opr->ndpr_plen == pr->ndpr_plen && in6_are_prefix_equal(&pr->ndpr_prefix.sin6_addr, &opr->ndpr_prefix.sin6_addr, pr->ndpr_plen)) return (0); } /* * We prefer link-local addresses as the associated interface address. */ /* search for a link-local addr */ ifa = (struct ifaddr *)in6ifa_ifpforlinklocal(ifp, IN6_IFF_NOTREADY | IN6_IFF_ANYCAST); if (ifa == NULL) { /* XXX: freebsd does not have ifa_ifwithaf */ IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family == AF_INET6) break; } if (ifa != NULL) ifa_ref(ifa); IF_ADDR_RUNLOCK(ifp); /* should we care about ia6_flags? */ } if (ifa == NULL) { /* * This can still happen, when, for example, we receive an RA * containing a prefix with the L bit set and the A bit clear, * after removing all IPv6 addresses on the receiving * interface. This should, of course, be rare though. */ nd6log((LOG_NOTICE, "nd6_prefix_onlink: failed to find any ifaddr" " to add route for a prefix(%s/%d) on %s\n", ip6_sprintf(ip6buf, &pr->ndpr_prefix.sin6_addr), pr->ndpr_plen, if_name(ifp))); return (0); } error = nd6_prefix_onlink_rtrequest(pr, ifa); if (ifa != NULL) ifa_free(ifa); return (error); } static int nd6_prefix_offlink(struct nd_prefix *pr) { int error = 0; struct ifnet *ifp = pr->ndpr_ifp; struct nd_prefix *opr; struct sockaddr_in6 sa6, mask6; struct rtentry *rt; char ip6buf[INET6_ADDRSTRLEN]; int fibnum, a_failure; /* sanity check */ if ((pr->ndpr_stateflags & NDPRF_ONLINK) == 0) { nd6log((LOG_ERR, "nd6_prefix_offlink: %s/%d is already off-link\n", ip6_sprintf(ip6buf, &pr->ndpr_prefix.sin6_addr), pr->ndpr_plen)); return (EEXIST); } bzero(&sa6, sizeof(sa6)); sa6.sin6_family = AF_INET6; sa6.sin6_len = sizeof(sa6); bcopy(&pr->ndpr_prefix.sin6_addr, &sa6.sin6_addr, sizeof(struct in6_addr)); bzero(&mask6, sizeof(mask6)); mask6.sin6_family = AF_INET6; mask6.sin6_len = sizeof(sa6); bcopy(&pr->ndpr_mask, &mask6.sin6_addr, sizeof(struct in6_addr)); a_failure = 0; for (fibnum = 0; fibnum < rt_numfibs; fibnum++) { rt = NULL; error = in6_rtrequest(RTM_DELETE, (struct sockaddr *)&sa6, NULL, (struct sockaddr *)&mask6, 0, &rt, fibnum); if (error == 0) { /* report the route deletion to the routing socket. */ if (rt != NULL) nd6_rtmsg(RTM_DELETE, rt); } else { /* Save last error to return, see rtinit(). */ a_failure = error; } if (rt != NULL) { RTFREE(rt); } } error = a_failure; a_failure = 1; if (error == 0) { pr->ndpr_stateflags &= ~NDPRF_ONLINK; /* * There might be the same prefix on another interface, * the prefix which could not be on-link just because we have * the interface route (see comments in nd6_prefix_onlink). * If there's one, try to make the prefix on-link on the * interface. */ LIST_FOREACH(opr, &V_nd_prefix, ndpr_entry) { if (opr == pr) continue; if ((opr->ndpr_stateflags & NDPRF_ONLINK) != 0) continue; /* * KAME specific: detached prefixes should not be * on-link. */ if ((opr->ndpr_stateflags & NDPRF_DETACHED) != 0) continue; if (opr->ndpr_plen == pr->ndpr_plen && in6_are_prefix_equal(&pr->ndpr_prefix.sin6_addr, &opr->ndpr_prefix.sin6_addr, pr->ndpr_plen)) { int e; if ((e = nd6_prefix_onlink(opr)) != 0) { nd6log((LOG_ERR, "nd6_prefix_offlink: failed to " "recover a prefix %s/%d from %s " "to %s (errno = %d)\n", ip6_sprintf(ip6buf, &opr->ndpr_prefix.sin6_addr), opr->ndpr_plen, if_name(ifp), if_name(opr->ndpr_ifp), e)); } else a_failure = 0; } } } else { /* XXX: can we still set the NDPRF_ONLINK flag? */ nd6log((LOG_ERR, "nd6_prefix_offlink: failed to delete route: " "%s/%d on %s (errno = %d)\n", ip6_sprintf(ip6buf, &sa6.sin6_addr), pr->ndpr_plen, if_name(ifp), error)); } if (a_failure) lltable_prefix_free(AF_INET6, (struct sockaddr *)&sa6, (struct sockaddr *)&mask6, LLE_STATIC); return (error); } static struct in6_ifaddr * in6_ifadd(struct nd_prefixctl *pr, int mcast) { struct ifnet *ifp = pr->ndpr_ifp; struct ifaddr *ifa; struct in6_aliasreq ifra; struct in6_ifaddr *ia, *ib; int error, plen0; struct in6_addr mask; int prefixlen = pr->ndpr_plen; int updateflags; char ip6buf[INET6_ADDRSTRLEN]; in6_prefixlen2mask(&mask, prefixlen); /* * find a link-local address (will be interface ID). * Is it really mandatory? Theoretically, a global or a site-local * address can be configured without a link-local address, if we * have a unique interface identifier... * * it is not mandatory to have a link-local address, we can generate * interface identifier on the fly. we do this because: * (1) it should be the easiest way to find interface identifier. * (2) RFC2462 5.4 suggesting the use of the same interface identifier * for multiple addresses on a single interface, and possible shortcut * of DAD. we omitted DAD for this reason in the past. * (3) a user can prevent autoconfiguration of global address * by removing link-local address by hand (this is partly because we * don't have other way to control the use of IPv6 on an interface. * this has been our design choice - cf. NRL's "ifconfig auto"). * (4) it is easier to manage when an interface has addresses * with the same interface identifier, than to have multiple addresses * with different interface identifiers. */ ifa = (struct ifaddr *)in6ifa_ifpforlinklocal(ifp, 0); /* 0 is OK? */ if (ifa) ib = (struct in6_ifaddr *)ifa; else return NULL; /* prefixlen + ifidlen must be equal to 128 */ plen0 = in6_mask2len(&ib->ia_prefixmask.sin6_addr, NULL); if (prefixlen != plen0) { ifa_free(ifa); nd6log((LOG_INFO, "in6_ifadd: wrong prefixlen for %s " "(prefix=%d ifid=%d)\n", if_name(ifp), prefixlen, 128 - plen0)); return NULL; } /* make ifaddr */ in6_prepare_ifra(&ifra, &pr->ndpr_prefix.sin6_addr, &mask); IN6_MASK_ADDR(&ifra.ifra_addr.sin6_addr, &mask); /* interface ID */ ifra.ifra_addr.sin6_addr.s6_addr32[0] |= (ib->ia_addr.sin6_addr.s6_addr32[0] & ~mask.s6_addr32[0]); ifra.ifra_addr.sin6_addr.s6_addr32[1] |= (ib->ia_addr.sin6_addr.s6_addr32[1] & ~mask.s6_addr32[1]); ifra.ifra_addr.sin6_addr.s6_addr32[2] |= (ib->ia_addr.sin6_addr.s6_addr32[2] & ~mask.s6_addr32[2]); ifra.ifra_addr.sin6_addr.s6_addr32[3] |= (ib->ia_addr.sin6_addr.s6_addr32[3] & ~mask.s6_addr32[3]); ifa_free(ifa); /* lifetimes. */ ifra.ifra_lifetime.ia6t_vltime = pr->ndpr_vltime; ifra.ifra_lifetime.ia6t_pltime = pr->ndpr_pltime; /* XXX: scope zone ID? */ ifra.ifra_flags |= IN6_IFF_AUTOCONF; /* obey autoconf */ /* * Make sure that we do not have this address already. This should * usually not happen, but we can still see this case, e.g., if we * have manually configured the exact address to be configured. */ ifa = (struct ifaddr *)in6ifa_ifpwithaddr(ifp, &ifra.ifra_addr.sin6_addr); if (ifa != NULL) { ifa_free(ifa); /* this should be rare enough to make an explicit log */ log(LOG_INFO, "in6_ifadd: %s is already configured\n", ip6_sprintf(ip6buf, &ifra.ifra_addr.sin6_addr)); return (NULL); } /* * Allocate ifaddr structure, link into chain, etc. * If we are going to create a new address upon receiving a multicasted * RA, we need to impose a random delay before starting DAD. * [draft-ietf-ipv6-rfc2462bis-02.txt, Section 5.4.2] */ updateflags = 0; if (mcast) updateflags |= IN6_IFAUPDATE_DADDELAY; if ((error = in6_update_ifa(ifp, &ifra, NULL, updateflags)) != 0) { nd6log((LOG_ERR, "in6_ifadd: failed to make ifaddr %s on %s (errno=%d)\n", ip6_sprintf(ip6buf, &ifra.ifra_addr.sin6_addr), if_name(ifp), error)); return (NULL); /* ifaddr must not have been allocated. */ } ia = in6ifa_ifpwithaddr(ifp, &ifra.ifra_addr.sin6_addr); /* * XXXRW: Assumption of non-NULLness here might not be true with * fine-grained locking -- should we validate it? Or just return * earlier ifa rather than looking it up again? */ return (ia); /* this is always non-NULL and referenced. */ } /* * ia0 - corresponding public address */ int in6_tmpifadd(const struct in6_ifaddr *ia0, int forcegen, int delay) { struct ifnet *ifp = ia0->ia_ifa.ifa_ifp; struct in6_ifaddr *newia; struct in6_aliasreq ifra; int error; int trylimit = 3; /* XXX: adhoc value */ int updateflags; u_int32_t randid[2]; time_t vltime0, pltime0; in6_prepare_ifra(&ifra, &ia0->ia_addr.sin6_addr, &ia0->ia_prefixmask.sin6_addr); ifra.ifra_addr = ia0->ia_addr; /* XXX: do we need this ? */ /* clear the old IFID */ IN6_MASK_ADDR(&ifra.ifra_addr.sin6_addr, &ifra.ifra_prefixmask.sin6_addr); again: if (in6_get_tmpifid(ifp, (u_int8_t *)randid, (const u_int8_t *)&ia0->ia_addr.sin6_addr.s6_addr[8], forcegen)) { nd6log((LOG_NOTICE, "in6_tmpifadd: failed to find a good " "random IFID\n")); return (EINVAL); } ifra.ifra_addr.sin6_addr.s6_addr32[2] |= (randid[0] & ~(ifra.ifra_prefixmask.sin6_addr.s6_addr32[2])); ifra.ifra_addr.sin6_addr.s6_addr32[3] |= (randid[1] & ~(ifra.ifra_prefixmask.sin6_addr.s6_addr32[3])); /* * in6_get_tmpifid() quite likely provided a unique interface ID. * However, we may still have a chance to see collision, because * there may be a time lag between generation of the ID and generation * of the address. So, we'll do one more sanity check. */ if (in6_localip(&ifra.ifra_addr.sin6_addr) != 0) { if (trylimit-- > 0) { forcegen = 1; goto again; } /* Give up. Something strange should have happened. */ nd6log((LOG_NOTICE, "in6_tmpifadd: failed to " "find a unique random IFID\n")); return (EEXIST); } /* * The Valid Lifetime is the lower of the Valid Lifetime of the * public address or TEMP_VALID_LIFETIME. * The Preferred Lifetime is the lower of the Preferred Lifetime * of the public address or TEMP_PREFERRED_LIFETIME - * DESYNC_FACTOR. */ if (ia0->ia6_lifetime.ia6t_vltime != ND6_INFINITE_LIFETIME) { vltime0 = IFA6_IS_INVALID(ia0) ? 0 : (ia0->ia6_lifetime.ia6t_vltime - (time_uptime - ia0->ia6_updatetime)); if (vltime0 > V_ip6_temp_valid_lifetime) vltime0 = V_ip6_temp_valid_lifetime; } else vltime0 = V_ip6_temp_valid_lifetime; if (ia0->ia6_lifetime.ia6t_pltime != ND6_INFINITE_LIFETIME) { pltime0 = IFA6_IS_DEPRECATED(ia0) ? 0 : (ia0->ia6_lifetime.ia6t_pltime - (time_uptime - ia0->ia6_updatetime)); if (pltime0 > V_ip6_temp_preferred_lifetime - V_ip6_desync_factor){ pltime0 = V_ip6_temp_preferred_lifetime - V_ip6_desync_factor; } } else pltime0 = V_ip6_temp_preferred_lifetime - V_ip6_desync_factor; ifra.ifra_lifetime.ia6t_vltime = vltime0; ifra.ifra_lifetime.ia6t_pltime = pltime0; /* * A temporary address is created only if this calculated Preferred * Lifetime is greater than REGEN_ADVANCE time units. */ if (ifra.ifra_lifetime.ia6t_pltime <= V_ip6_temp_regen_advance) return (0); /* XXX: scope zone ID? */ ifra.ifra_flags |= (IN6_IFF_AUTOCONF|IN6_IFF_TEMPORARY); /* allocate ifaddr structure, link into chain, etc. */ updateflags = 0; if (delay) updateflags |= IN6_IFAUPDATE_DADDELAY; if ((error = in6_update_ifa(ifp, &ifra, NULL, updateflags)) != 0) return (error); newia = in6ifa_ifpwithaddr(ifp, &ifra.ifra_addr.sin6_addr); if (newia == NULL) { /* XXX: can it happen? */ nd6log((LOG_ERR, "in6_tmpifadd: ifa update succeeded, but we got " "no ifaddr\n")); return (EINVAL); /* XXX */ } newia->ia6_ndpr = ia0->ia6_ndpr; newia->ia6_ndpr->ndpr_refcnt++; ifa_free(&newia->ia_ifa); /* * A newly added address might affect the status of other addresses. * XXX: when the temporary address is generated with a new public * address, the onlink check is redundant. However, it would be safe * to do the check explicitly everywhere a new address is generated, * and, in fact, we surely need the check when we create a new * temporary address due to deprecation of an old temporary address. */ pfxlist_onlink_check(); return (0); } static int in6_init_prefix_ltimes(struct nd_prefix *ndpr) { if (ndpr->ndpr_pltime == ND6_INFINITE_LIFETIME) ndpr->ndpr_preferred = 0; else ndpr->ndpr_preferred = time_uptime + ndpr->ndpr_pltime; if (ndpr->ndpr_vltime == ND6_INFINITE_LIFETIME) ndpr->ndpr_expire = 0; else ndpr->ndpr_expire = time_uptime + ndpr->ndpr_vltime; return 0; } static void in6_init_address_ltimes(struct nd_prefix *new, struct in6_addrlifetime *lt6) { /* init ia6t_expire */ if (lt6->ia6t_vltime == ND6_INFINITE_LIFETIME) lt6->ia6t_expire = 0; else { lt6->ia6t_expire = time_uptime; lt6->ia6t_expire += lt6->ia6t_vltime; } /* init ia6t_preferred */ if (lt6->ia6t_pltime == ND6_INFINITE_LIFETIME) lt6->ia6t_preferred = 0; else { lt6->ia6t_preferred = time_uptime; lt6->ia6t_preferred += lt6->ia6t_pltime; } } /* * Delete all the routing table entries that use the specified gateway. * XXX: this function causes search through all entries of routing table, so * it shouldn't be called when acting as a router. */ void rt6_flush(struct in6_addr *gateway, struct ifnet *ifp) { /* We'll care only link-local addresses */ if (!IN6_IS_ADDR_LINKLOCAL(gateway)) return; /* XXX Do we really need to walk any but the default FIB? */ rt_foreach_fib(AF_INET6, NULL, rt6_deleteroute, (void *)gateway); } static int rt6_deleteroute(struct rtentry *rt, void *arg) { #define SIN6(s) ((struct sockaddr_in6 *)s) struct in6_addr *gate = (struct in6_addr *)arg; if (rt->rt_gateway == NULL || rt->rt_gateway->sa_family != AF_INET6) return (0); if (!IN6_ARE_ADDR_EQUAL(gate, &SIN6(rt->rt_gateway)->sin6_addr)) { return (0); } /* * Do not delete a static route. * XXX: this seems to be a bit ad-hoc. Should we consider the * 'cloned' bit instead? */ if ((rt->rt_flags & RTF_STATIC) != 0) return (0); /* * We delete only host route. This means, in particular, we don't * delete default route. */ if ((rt->rt_flags & RTF_HOST) == 0) return (0); return (in6_rtrequest(RTM_DELETE, rt_key(rt), rt->rt_gateway, rt_mask(rt), rt->rt_flags, NULL, rt->rt_fibnum)); #undef SIN6 } int nd6_setdefaultiface(int ifindex) { int error = 0; if (ifindex < 0 || V_if_index < ifindex) return (EINVAL); if (ifindex != 0 && !ifnet_byindex(ifindex)) return (EINVAL); if (V_nd6_defifindex != ifindex) { V_nd6_defifindex = ifindex; if (V_nd6_defifindex > 0) V_nd6_defifp = ifnet_byindex(V_nd6_defifindex); else V_nd6_defifp = NULL; /* * Our current implementation assumes one-to-one maping between * interfaces and links, so it would be natural to use the * default interface as the default link. */ scope6_setdefault(V_nd6_defifp); } return (error); } Index: projects/routing/sys/netpfil/ipfw/ip_fw_table_algo.c =================================================================== --- projects/routing/sys/netpfil/ipfw/ip_fw_table_algo.c (revision 274335) +++ projects/routing/sys/netpfil/ipfw/ip_fw_table_algo.c (revision 274336) @@ -1,4082 +1,4083 @@ /*- * Copyright (c) 2014 Yandex LLC * Copyright (c) 2014 Alexander V. Chernikov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * Lookup table algorithms. * */ #include "opt_ipfw.h" #include "opt_inet.h" #ifndef INET #error IPFIREWALL requires INET. #endif /* INET */ #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include /* ip_fw.h requires IFNAMSIZ */ #include #include #include #include #include /* struct ipfw_rule_ref */ #include #include #include /* * IPFW table lookup algorithms. * * What is needed to add another table algo? * * Algo init: * * struct table_algo has to be filled with: * name: "type:algoname" format, e.g. "addr:radix". Currently * there are the following types: "addr", "iface", "number" and "flow". * type: one of IPFW_TABLE_* types * flags: one or more TA_FLAGS_* * ta_buf_size: size of structure used to store add/del item state. * Needs to be less than TA_BUF_SZ. * callbacks: see below for description. * * ipfw_add_table_algo / ipfw_del_table_algo has to be called * * Callbacks description: * * -init: request to initialize new table instance. * typedef int (ta_init)(struct ip_fw_chain *ch, void **ta_state, * struct table_info *ti, char *data, uint8_t tflags); * MANDATORY, unlocked. (M_WAITOK). Returns 0 on success. * * Allocate all structures needed for normal operations. * * Caller may want to parse @data for some algo-specific * options provided by userland. * * Caller may want to save configuration state pointer to @ta_state * * Caller needs to save desired runtime structure pointer(s) * inside @ti fields. Note that it is not correct to save * @ti pointer at this moment. Use -change_ti hook for that. * * Caller has to fill in ti->lookup to appropriate function * pointer. * * * * -destroy: request to destroy table instance. * typedef void (ta_destroy)(void *ta_state, struct table_info *ti); * MANDATORY, may be locked (UH+WLOCK). (M_NOWAIT). * * Frees all table entries and all tables structures allocated by -init. * * * * -prepare_add: request to allocate state for adding new entry. * typedef int (ta_prepare_add)(struct ip_fw_chain *ch, struct tentry_info *tei, * void *ta_buf); * MANDATORY, unlocked. (M_WAITOK). Returns 0 on success. * * Allocates state and fills it in with all necessary data (EXCEPT value) * from @tei to minimize operations needed to be done under WLOCK. * "value" field has to be copied to new entry in @add callback. * Buffer ta_buf of size ta->ta_buf_sz may be used to store * allocated state. * * * * -prepare_del: request to set state for deleting existing entry. * typedef int (ta_prepare_del)(struct ip_fw_chain *ch, struct tentry_info *tei, * void *ta_buf); * MANDATORY, locked, UH. (M_NOWAIT). Returns 0 on success. * * Buffer ta_buf of size ta->ta_buf_sz may be used to store * allocated state. Caller should use on-stack ta_buf allocation * instead of doing malloc(). * * * * -add: request to insert new entry into runtime/config structures. * typedef int (ta_add)(void *ta_state, struct table_info *ti, * struct tentry_info *tei, void *ta_buf, uint32_t *pnum); * MANDATORY, UH+WLOCK. (M_NOWAIT). Returns 0 on success. * * Insert new entry using previously-allocated state in @ta_buf. * * @tei may have the following flags: * TEI_FLAGS_UPDATE: request to add or update entry. * TEI_FLAGS_DONTADD: request to update (but not add) entry. * * Caller is required to do the following: * copy real entry value from @tei * entry added: return 0, set 1 to @pnum * entry updated: return 0, store 0 to @pnum, store old value in @tei, * add TEI_FLAGS_UPDATED flag to @tei. * entry exists: return EEXIST * entry not found: return ENOENT * other error: return non-zero error code. * * * * -del: request to delete existing entry from runtime/config structures. * typedef int (ta_del)(void *ta_state, struct table_info *ti, * struct tentry_info *tei, void *ta_buf, uint32_t *pnum); * MANDATORY, UH+WLOCK. (M_NOWAIT). Returns 0 on success. * * Delete entry using previously set up in @ta_buf. * * Caller is required to do the following: * entry deleted: return 0, set 1 to @pnum, store old value in @tei. * entry not found: return ENOENT * other error: return non-zero error code. * * * * -flush_entry: flush entry state created by -prepare_add / -del / others * typedef void (ta_flush_entry)(struct ip_fw_chain *ch, * struct tentry_info *tei, void *ta_buf); * MANDATORY, may be locked. (M_NOWAIT). * * Delete state allocated by: * -prepare_add (-add returned EEXIST|UPDATED) * -prepare_del (if any) * -del * * Caller is required to handle empty @ta_buf correctly. * * * -find_tentry: finds entry specified by key @tei * typedef int ta_find_tentry(void *ta_state, struct table_info *ti, * ipfw_obj_tentry *tent); * OPTIONAL, locked (UH). (M_NOWAIT). Returns 0 on success. * * Finds entry specified by given key. * * Caller is requred to do the following: * entry found: returns 0, export entry to @tent * entry not found: returns ENOENT * * * -need_modify: checks if @ti has enough space to hold another @count items. * typedef int (ta_need_modify)(void *ta_state, struct table_info *ti, * uint32_t count, uint64_t *pflags); * OPTIONAL, locked (UH). (M_NOWAIT). Returns 0 if has. * * Checks if given table has enough space to add @count items without * resize. Caller may use @pflags to store desired modification data. * * * * -prepare_mod: allocate structures for table modification. * typedef int (ta_prepare_mod)(void *ta_buf, uint64_t *pflags); * OPTIONAL(need_modify), unlocked. (M_WAITOK). Returns 0 on success. * * Allocate all needed state for table modification. Caller * should use `struct mod_item` to store new state in @ta_buf. * Up to TA_BUF_SZ (128 bytes) can be stored in @ta_buf. * * * * -fill_mod: copy some data to new state/ * typedef int (ta_fill_mod)(void *ta_state, struct table_info *ti, * void *ta_buf, uint64_t *pflags); * OPTIONAL(need_modify), locked (UH). (M_NOWAIT). Returns 0 on success. * * Copy as much data as we can to minimize changes under WLOCK. * For example, array can be merged inside this callback. * * * * -modify: perform final modification. * typedef void (ta_modify)(void *ta_state, struct table_info *ti, * void *ta_buf, uint64_t pflags); * OPTIONAL(need_modify), locked (UH+WLOCK). (M_NOWAIT). * * Performs all changes necessary to switch to new structures. * * Caller should save old pointers to @ta_buf storage. * * * * -flush_mod: flush table modification state. * typedef void (ta_flush_mod)(void *ta_buf); * OPTIONAL(need_modify), unlocked. (M_WAITOK). * * Performs flush for the following: * - prepare_mod (modification was not necessary) * - modify (for the old state) * * * * -change_gi: monitor table info pointer changes * typedef void (ta_change_ti)(void *ta_state, struct table_info *ti); * OPTIONAL, locked (UH). (M_NOWAIT). * * Called on @ti pointer changed. Called immediately after -init * to set initial state. * * * * -foreach: calls @f for each table entry * typedef void ta_foreach(void *ta_state, struct table_info *ti, * ta_foreach_f *f, void *arg); * MANDATORY, locked(UH). (M_NOWAIT). * * Runs callback with specified argument for each table entry, * Typically used for dumping table entries. * * * * -dump_tentry: dump table entry in current @tentry format. * typedef int ta_dump_tentry(void *ta_state, struct table_info *ti, void *e, * ipfw_obj_tentry *tent); * MANDATORY, locked(UH). (M_NOWAIT). Returns 0 on success. * * Dumps entry @e to @tent. * * * -print_config: prints custom algoritm options into buffer. * typedef void (ta_print_config)(void *ta_state, struct table_info *ti, * char *buf, size_t bufsize); * OPTIONAL. locked(UH). (M_NOWAIT). * * Prints custom algorithm options in the format suitable to pass * back to -init callback. * * * * -dump_tinfo: dumps algo-specific info. * typedef void ta_dump_tinfo(void *ta_state, struct table_info *ti, * ipfw_ta_tinfo *tinfo); * OPTIONAL. locked(UH). (M_NOWAIT). * * Dumps options like items size/hash size, etc. */ MALLOC_DEFINE(M_IPFW_TBL, "ipfw_tbl", "IpFw tables"); /* * Utility structures/functions common to more than one algo */ struct mod_item { void *main_ptr; size_t size; void *main_ptr6; size_t size6; }; static int badd(const void *key, void *item, void *base, size_t nmemb, size_t size, int (*compar) (const void *, const void *)); static int bdel(const void *key, void *base, size_t nmemb, size_t size, int (*compar) (const void *, const void *)); /* * ADDR implementation using radix * */ /* * The radix code expects addr and mask to be array of bytes, * with the first byte being the length of the array. rn_inithead * is called with the offset in bits of the lookup key within the * array. If we use a sockaddr_in as the underlying type, * sin_len is conveniently located at offset 0, sin_addr is at * offset 4 and normally aligned. * But for portability, let's avoid assumption and make the code explicit */ #define KEY_LEN(v) *((uint8_t *)&(v)) /* * Do not require radix to compare more than actual IPv4/IPv6 address */ #define KEY_LEN_INET (offsetof(struct sockaddr_in, sin_addr) + sizeof(in_addr_t)) #define KEY_LEN_INET6 (offsetof(struct sa_in6, sin6_addr) + sizeof(struct in6_addr)) #define OFF_LEN_INET (8 * offsetof(struct sockaddr_in, sin_addr)) #define OFF_LEN_INET6 (8 * offsetof(struct sa_in6, sin6_addr)) struct radix_addr_entry { struct radix_node rn[2]; struct sockaddr_in addr; uint32_t value; uint8_t masklen; }; struct sa_in6 { uint8_t sin6_len; uint8_t sin6_family; uint8_t pad[2]; struct in6_addr sin6_addr; }; struct radix_addr_xentry { struct radix_node rn[2]; struct sa_in6 addr6; uint32_t value; uint8_t masklen; }; struct radix_cfg { struct radix_node_head *head4; struct radix_node_head *head6; size_t count4; size_t count6; }; struct ta_buf_radix { void *ent_ptr; struct sockaddr *addr_ptr; struct sockaddr *mask_ptr; union { struct { struct sockaddr_in sa; struct sockaddr_in ma; } a4; struct { struct sa_in6 sa; struct sa_in6 ma; } a6; } addr; }; static int ta_lookup_radix(struct table_info *ti, void *key, uint32_t keylen, uint32_t *val); static int ta_init_radix(struct ip_fw_chain *ch, void **ta_state, struct table_info *ti, char *data, uint8_t tflags); static int flush_radix_entry(struct radix_node *rn, void *arg); static void ta_destroy_radix(void *ta_state, struct table_info *ti); static void ta_dump_radix_tinfo(void *ta_state, struct table_info *ti, ipfw_ta_tinfo *tinfo); static int ta_dump_radix_tentry(void *ta_state, struct table_info *ti, void *e, ipfw_obj_tentry *tent); static int ta_find_radix_tentry(void *ta_state, struct table_info *ti, ipfw_obj_tentry *tent); static void ta_foreach_radix(void *ta_state, struct table_info *ti, ta_foreach_f *f, void *arg); static void tei_to_sockaddr_ent(struct tentry_info *tei, struct sockaddr *sa, struct sockaddr *ma, int *set_mask); static int ta_prepare_add_radix(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf); static int ta_add_radix(void *ta_state, struct table_info *ti, struct tentry_info *tei, void *ta_buf, uint32_t *pnum); static int ta_prepare_del_radix(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf); static int ta_del_radix(void *ta_state, struct table_info *ti, struct tentry_info *tei, void *ta_buf, uint32_t *pnum); static void ta_flush_radix_entry(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf); static int ta_need_modify_radix(void *ta_state, struct table_info *ti, uint32_t count, uint64_t *pflags); static int ta_lookup_radix(struct table_info *ti, void *key, uint32_t keylen, uint32_t *val) { struct radix_node_head *rnh; if (keylen == sizeof(in_addr_t)) { struct radix_addr_entry *ent; struct sockaddr_in sa; KEY_LEN(sa) = KEY_LEN_INET; sa.sin_addr.s_addr = *((in_addr_t *)key); rnh = (struct radix_node_head *)ti->state; ent = (struct radix_addr_entry *)(rnh->rnh_matchaddr(&sa, &rnh->rh)); if (ent != NULL) { *val = ent->value; return (1); } } else { struct radix_addr_xentry *xent; struct sa_in6 sa6; KEY_LEN(sa6) = KEY_LEN_INET6; memcpy(&sa6.sin6_addr, key, sizeof(struct in6_addr)); rnh = (struct radix_node_head *)ti->xstate; xent = (struct radix_addr_xentry *)(rnh->rnh_matchaddr(&sa6, &rnh->rh)); if (xent != NULL) { *val = xent->value; return (1); } } return (0); } /* * New table */ static int ta_init_radix(struct ip_fw_chain *ch, void **ta_state, struct table_info *ti, char *data, uint8_t tflags) { struct radix_cfg *cfg; if (!rn_inithead(&ti->state, OFF_LEN_INET)) return (ENOMEM); if (!rn_inithead(&ti->xstate, OFF_LEN_INET6)) { rn_detachhead(&ti->state); return (ENOMEM); } cfg = malloc(sizeof(struct radix_cfg), M_IPFW, M_WAITOK | M_ZERO); *ta_state = cfg; ti->lookup = ta_lookup_radix; return (0); } static int flush_radix_entry(struct radix_node *rn, void *arg) { struct radix_node_head * const rnh = arg; struct radix_addr_entry *ent; ent = (struct radix_addr_entry *) rnh->rnh_deladdr(rn->rn_key, rn->rn_mask, &rnh->rh); if (ent != NULL) free(ent, M_IPFW_TBL); return (0); } static void ta_destroy_radix(void *ta_state, struct table_info *ti) { struct radix_cfg *cfg; struct radix_node_head *rnh; cfg = (struct radix_cfg *)ta_state; rnh = (struct radix_node_head *)(ti->state); rnh->rnh_walktree(&rnh->rh, flush_radix_entry, rnh); rn_detachhead(&ti->state); rnh = (struct radix_node_head *)(ti->xstate); rnh->rnh_walktree(&rnh->rh, flush_radix_entry, rnh); rn_detachhead(&ti->xstate); free(cfg, M_IPFW); } /* * Provide algo-specific table info */ static void ta_dump_radix_tinfo(void *ta_state, struct table_info *ti, ipfw_ta_tinfo *tinfo) { struct radix_cfg *cfg; cfg = (struct radix_cfg *)ta_state; tinfo->flags = IPFW_TATFLAGS_AFDATA | IPFW_TATFLAGS_AFITEM; tinfo->taclass4 = IPFW_TACLASS_RADIX; tinfo->count4 = cfg->count4; tinfo->itemsize4 = sizeof(struct radix_addr_entry); tinfo->taclass6 = IPFW_TACLASS_RADIX; tinfo->count6 = cfg->count6; tinfo->itemsize6 = sizeof(struct radix_addr_xentry); } static int ta_dump_radix_tentry(void *ta_state, struct table_info *ti, void *e, ipfw_obj_tentry *tent) { struct radix_addr_entry *n; #ifdef INET6 struct radix_addr_xentry *xn; #endif n = (struct radix_addr_entry *)e; /* Guess IPv4/IPv6 radix by sockaddr family */ if (n->addr.sin_family == AF_INET) { tent->k.addr.s_addr = n->addr.sin_addr.s_addr; tent->masklen = n->masklen; tent->subtype = AF_INET; tent->v.kidx = n->value; #ifdef INET6 } else { xn = (struct radix_addr_xentry *)e; memcpy(&tent->k, &xn->addr6.sin6_addr, sizeof(struct in6_addr)); tent->masklen = xn->masklen; tent->subtype = AF_INET6; tent->v.kidx = xn->value; #endif } return (0); } static int ta_find_radix_tentry(void *ta_state, struct table_info *ti, ipfw_obj_tentry *tent) { struct radix_node_head *rnh; void *e; e = NULL; if (tent->subtype == AF_INET) { struct sockaddr_in sa; KEY_LEN(sa) = KEY_LEN_INET; sa.sin_addr.s_addr = tent->k.addr.s_addr; rnh = (struct radix_node_head *)ti->state; e = rnh->rnh_matchaddr(&sa, &rnh->rh); } else { struct sa_in6 sa6; KEY_LEN(sa6) = KEY_LEN_INET6; memcpy(&sa6.sin6_addr, &tent->k.addr6, sizeof(struct in6_addr)); rnh = (struct radix_node_head *)ti->xstate; e = rnh->rnh_matchaddr(&sa6, &rnh->rh); } if (e != NULL) { ta_dump_radix_tentry(ta_state, ti, e, tent); return (0); } return (ENOENT); } static void ta_foreach_radix(void *ta_state, struct table_info *ti, ta_foreach_f *f, void *arg) { struct radix_node_head *rnh; rnh = (struct radix_node_head *)(ti->state); rnh->rnh_walktree(&rnh->rh, (walktree_f_t *)f, arg); rnh = (struct radix_node_head *)(ti->xstate); rnh->rnh_walktree(&rnh->rh, (walktree_f_t *)f, arg); } #ifdef INET6 static inline void ipv6_writemask(struct in6_addr *addr6, uint8_t mask); static inline void ipv6_writemask(struct in6_addr *addr6, uint8_t mask) { uint32_t *cp; for (cp = (uint32_t *)addr6; mask >= 32; mask -= 32) *cp++ = 0xFFFFFFFF; *cp = htonl(mask ? ~((1 << (32 - mask)) - 1) : 0); } #endif static void tei_to_sockaddr_ent(struct tentry_info *tei, struct sockaddr *sa, struct sockaddr *ma, int *set_mask) { int mlen; #ifdef INET struct sockaddr_in *addr, *mask; #endif #ifdef INET6 struct sa_in6 *addr6, *mask6; #endif in_addr_t a4; mlen = tei->masklen; if (tei->subtype == AF_INET) { #ifdef INET addr = (struct sockaddr_in *)sa; mask = (struct sockaddr_in *)ma; /* Set 'total' structure length */ KEY_LEN(*addr) = KEY_LEN_INET; KEY_LEN(*mask) = KEY_LEN_INET; addr->sin_family = AF_INET; mask->sin_addr.s_addr = htonl(mlen ? ~((1 << (32 - mlen)) - 1) : 0); a4 = *((in_addr_t *)tei->paddr); addr->sin_addr.s_addr = a4 & mask->sin_addr.s_addr; if (mlen != 32) *set_mask = 1; else *set_mask = 0; #endif #ifdef INET6 } else if (tei->subtype == AF_INET6) { /* IPv6 case */ addr6 = (struct sa_in6 *)sa; mask6 = (struct sa_in6 *)ma; /* Set 'total' structure length */ KEY_LEN(*addr6) = KEY_LEN_INET6; KEY_LEN(*mask6) = KEY_LEN_INET6; addr6->sin6_family = AF_INET6; ipv6_writemask(&mask6->sin6_addr, mlen); memcpy(&addr6->sin6_addr, tei->paddr, sizeof(struct in6_addr)); APPLY_MASK(&addr6->sin6_addr, &mask6->sin6_addr); if (mlen != 128) *set_mask = 1; else *set_mask = 0; #endif } } static int ta_prepare_add_radix(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf) { struct ta_buf_radix *tb; struct radix_addr_entry *ent; #ifdef INET6 struct radix_addr_xentry *xent; #endif struct sockaddr *addr, *mask; int mlen, set_mask; tb = (struct ta_buf_radix *)ta_buf; mlen = tei->masklen; set_mask = 0; if (tei->subtype == AF_INET) { #ifdef INET if (mlen > 32) return (EINVAL); ent = malloc(sizeof(*ent), M_IPFW_TBL, M_WAITOK | M_ZERO); ent->masklen = mlen; addr = (struct sockaddr *)&ent->addr; mask = (struct sockaddr *)&tb->addr.a4.ma; tb->ent_ptr = ent; #endif #ifdef INET6 } else if (tei->subtype == AF_INET6) { /* IPv6 case */ if (mlen > 128) return (EINVAL); xent = malloc(sizeof(*xent), M_IPFW_TBL, M_WAITOK | M_ZERO); xent->masklen = mlen; addr = (struct sockaddr *)&xent->addr6; mask = (struct sockaddr *)&tb->addr.a6.ma; tb->ent_ptr = xent; #endif } else { /* Unknown CIDR type */ return (EINVAL); } tei_to_sockaddr_ent(tei, addr, mask, &set_mask); /* Set pointers */ tb->addr_ptr = addr; if (set_mask != 0) tb->mask_ptr = mask; return (0); } static int ta_add_radix(void *ta_state, struct table_info *ti, struct tentry_info *tei, void *ta_buf, uint32_t *pnum) { struct radix_cfg *cfg; struct radix_node_head *rnh; struct radix_node *rn; struct ta_buf_radix *tb; uint32_t *old_value, value; cfg = (struct radix_cfg *)ta_state; tb = (struct ta_buf_radix *)ta_buf; /* Save current entry value from @tei */ if (tei->subtype == AF_INET) { rnh = ti->state; ((struct radix_addr_entry *)tb->ent_ptr)->value = tei->value; } else { rnh = ti->xstate; ((struct radix_addr_xentry *)tb->ent_ptr)->value = tei->value; } /* Search for an entry first */ rn = rnh->rnh_lookup(tb->addr_ptr, tb->mask_ptr, &rnh->rh); if (rn != NULL) { if ((tei->flags & TEI_FLAGS_UPDATE) == 0) return (EEXIST); /* Record already exists. Update value if we're asked to */ if (tei->subtype == AF_INET) old_value = &((struct radix_addr_entry *)rn)->value; else old_value = &((struct radix_addr_xentry *)rn)->value; value = *old_value; *old_value = tei->value; tei->value = value; /* Indicate that update has happened instead of addition */ tei->flags |= TEI_FLAGS_UPDATED; *pnum = 0; return (0); } if ((tei->flags & TEI_FLAGS_DONTADD) != 0) return (EFBIG); rn = rnh->rnh_addaddr(tb->addr_ptr, tb->mask_ptr, &rnh->rh, tb->ent_ptr); if (rn == NULL) { /* Unknown error */ return (EINVAL); } if (tei->subtype == AF_INET) cfg->count4++; else cfg->count6++; tb->ent_ptr = NULL; *pnum = 1; return (0); } static int ta_prepare_del_radix(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf) { struct ta_buf_radix *tb; struct sockaddr *addr, *mask; int mlen, set_mask; tb = (struct ta_buf_radix *)ta_buf; mlen = tei->masklen; set_mask = 0; if (tei->subtype == AF_INET) { if (mlen > 32) return (EINVAL); addr = (struct sockaddr *)&tb->addr.a4.sa; mask = (struct sockaddr *)&tb->addr.a4.ma; #ifdef INET6 } else if (tei->subtype == AF_INET6) { if (mlen > 128) return (EINVAL); addr = (struct sockaddr *)&tb->addr.a6.sa; mask = (struct sockaddr *)&tb->addr.a6.ma; #endif } else return (EINVAL); tei_to_sockaddr_ent(tei, addr, mask, &set_mask); tb->addr_ptr = addr; if (set_mask != 0) tb->mask_ptr = mask; return (0); } static int ta_del_radix(void *ta_state, struct table_info *ti, struct tentry_info *tei, void *ta_buf, uint32_t *pnum) { struct radix_cfg *cfg; struct radix_node_head *rnh; struct radix_node *rn; struct ta_buf_radix *tb; cfg = (struct radix_cfg *)ta_state; tb = (struct ta_buf_radix *)ta_buf; if (tei->subtype == AF_INET) rnh = ti->state; else rnh = ti->xstate; rn = rnh->rnh_deladdr(tb->addr_ptr, tb->mask_ptr, &rnh->rh); if (rn == NULL) return (ENOENT); /* Save entry value to @tei */ if (tei->subtype == AF_INET) tei->value = ((struct radix_addr_entry *)rn)->value; else tei->value = ((struct radix_addr_xentry *)rn)->value; tb->ent_ptr = rn; if (tei->subtype == AF_INET) cfg->count4--; else cfg->count6--; *pnum = 1; return (0); } static void ta_flush_radix_entry(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf) { struct ta_buf_radix *tb; tb = (struct ta_buf_radix *)ta_buf; if (tb->ent_ptr != NULL) free(tb->ent_ptr, M_IPFW_TBL); } static int ta_need_modify_radix(void *ta_state, struct table_info *ti, uint32_t count, uint64_t *pflags) { /* * radix does not require additional memory allocations * other than nodes itself. Adding new masks to the tree do * but we don't have any API to call (and we don't known which * sizes do we need). */ return (0); } struct table_algo addr_radix = { .name = "addr:radix", .type = IPFW_TABLE_ADDR, .flags = TA_FLAG_DEFAULT, .ta_buf_size = sizeof(struct ta_buf_radix), .init = ta_init_radix, .destroy = ta_destroy_radix, .prepare_add = ta_prepare_add_radix, .prepare_del = ta_prepare_del_radix, .add = ta_add_radix, .del = ta_del_radix, .flush_entry = ta_flush_radix_entry, .foreach = ta_foreach_radix, .dump_tentry = ta_dump_radix_tentry, .find_tentry = ta_find_radix_tentry, .dump_tinfo = ta_dump_radix_tinfo, .need_modify = ta_need_modify_radix, }; /* * addr:hash cmds * * * ti->data: * [inv.mask4][inv.mask6][log2hsize4][log2hsize6] * [ 8][ 8[ 8][ 8] * * inv.mask4: 32 - mask * inv.mask6: * 1) _slow lookup: mask * 2) _aligned: (128 - mask) / 8 * 3) _64: 8 * * * pflags: * [v4=1/v6=0][hsize] * [ 32][ 32] */ struct chashentry; SLIST_HEAD(chashbhead, chashentry); struct chash_cfg { struct chashbhead *head4; struct chashbhead *head6; size_t size4; size_t size6; size_t items4; size_t items6; uint8_t mask4; uint8_t mask6; }; struct chashentry { SLIST_ENTRY(chashentry) next; uint32_t value; uint32_t type; union { uint32_t a4; /* Host format */ struct in6_addr a6; /* Network format */ } a; }; struct ta_buf_chash { void *ent_ptr; struct chashentry ent; }; #ifdef INET static __inline uint32_t hash_ip(uint32_t addr, int hsize); #endif #ifdef INET6 static __inline uint32_t hash_ip6(struct in6_addr *addr6, int hsize); static __inline uint16_t hash_ip64(struct in6_addr *addr6, int hsize); static __inline uint32_t hash_ip6_slow(struct in6_addr *addr6, void *key, int mask, int hsize); static __inline uint32_t hash_ip6_al(struct in6_addr *addr6, void *key, int mask, int hsize); #endif static int ta_lookup_chash_slow(struct table_info *ti, void *key, uint32_t keylen, uint32_t *val); static int ta_lookup_chash_aligned(struct table_info *ti, void *key, uint32_t keylen, uint32_t *val); static int ta_lookup_chash_64(struct table_info *ti, void *key, uint32_t keylen, uint32_t *val); static int chash_parse_opts(struct chash_cfg *cfg, char *data); static void ta_print_chash_config(void *ta_state, struct table_info *ti, char *buf, size_t bufsize); static int ta_log2(uint32_t v); static int ta_init_chash(struct ip_fw_chain *ch, void **ta_state, struct table_info *ti, char *data, uint8_t tflags); static void ta_destroy_chash(void *ta_state, struct table_info *ti); static void ta_dump_chash_tinfo(void *ta_state, struct table_info *ti, ipfw_ta_tinfo *tinfo); static int ta_dump_chash_tentry(void *ta_state, struct table_info *ti, void *e, ipfw_obj_tentry *tent); static uint32_t hash_ent(struct chashentry *ent, int af, int mlen, uint32_t size); static int tei_to_chash_ent(struct tentry_info *tei, struct chashentry *ent); static int ta_find_chash_tentry(void *ta_state, struct table_info *ti, ipfw_obj_tentry *tent); static void ta_foreach_chash(void *ta_state, struct table_info *ti, ta_foreach_f *f, void *arg); static int ta_prepare_add_chash(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf); static int ta_add_chash(void *ta_state, struct table_info *ti, struct tentry_info *tei, void *ta_buf, uint32_t *pnum); static int ta_prepare_del_chash(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf); static int ta_del_chash(void *ta_state, struct table_info *ti, struct tentry_info *tei, void *ta_buf, uint32_t *pnum); static void ta_flush_chash_entry(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf); static int ta_need_modify_chash(void *ta_state, struct table_info *ti, uint32_t count, uint64_t *pflags); static int ta_prepare_mod_chash(void *ta_buf, uint64_t *pflags); static int ta_fill_mod_chash(void *ta_state, struct table_info *ti, void *ta_buf, uint64_t *pflags); static void ta_modify_chash(void *ta_state, struct table_info *ti, void *ta_buf, uint64_t pflags); static void ta_flush_mod_chash(void *ta_buf); #ifdef INET static __inline uint32_t hash_ip(uint32_t addr, int hsize) { return (addr % (hsize - 1)); } #endif #ifdef INET6 static __inline uint32_t hash_ip6(struct in6_addr *addr6, int hsize) { uint32_t i; i = addr6->s6_addr32[0] ^ addr6->s6_addr32[1] ^ addr6->s6_addr32[2] ^ addr6->s6_addr32[3]; return (i % (hsize - 1)); } static __inline uint16_t hash_ip64(struct in6_addr *addr6, int hsize) { uint32_t i; i = addr6->s6_addr32[0] ^ addr6->s6_addr32[1]; return (i % (hsize - 1)); } static __inline uint32_t hash_ip6_slow(struct in6_addr *addr6, void *key, int mask, int hsize) { struct in6_addr mask6; ipv6_writemask(&mask6, mask); memcpy(addr6, key, sizeof(struct in6_addr)); APPLY_MASK(addr6, &mask6); return (hash_ip6(addr6, hsize)); } static __inline uint32_t hash_ip6_al(struct in6_addr *addr6, void *key, int mask, int hsize) { uint64_t *paddr; paddr = (uint64_t *)addr6; *paddr = 0; *(paddr + 1) = 0; memcpy(addr6, key, mask); return (hash_ip6(addr6, hsize)); } #endif static int ta_lookup_chash_slow(struct table_info *ti, void *key, uint32_t keylen, uint32_t *val) { struct chashbhead *head; struct chashentry *ent; uint16_t hash, hsize; uint8_t imask; if (keylen == sizeof(in_addr_t)) { #ifdef INET head = (struct chashbhead *)ti->state; imask = ti->data >> 24; hsize = 1 << ((ti->data & 0xFFFF) >> 8); uint32_t a; a = ntohl(*((in_addr_t *)key)); a = a >> imask; hash = hash_ip(a, hsize); SLIST_FOREACH(ent, &head[hash], next) { if (ent->a.a4 == a) { *val = ent->value; return (1); } } #endif } else { #ifdef INET6 /* IPv6: worst scenario: non-round mask */ struct in6_addr addr6; head = (struct chashbhead *)ti->xstate; imask = (ti->data & 0xFF0000) >> 16; hsize = 1 << (ti->data & 0xFF); hash = hash_ip6_slow(&addr6, key, imask, hsize); SLIST_FOREACH(ent, &head[hash], next) { if (memcmp(&ent->a.a6, &addr6, 16) == 0) { *val = ent->value; return (1); } } #endif } return (0); } static int ta_lookup_chash_aligned(struct table_info *ti, void *key, uint32_t keylen, uint32_t *val) { struct chashbhead *head; struct chashentry *ent; uint16_t hash, hsize; uint8_t imask; if (keylen == sizeof(in_addr_t)) { #ifdef INET head = (struct chashbhead *)ti->state; imask = ti->data >> 24; hsize = 1 << ((ti->data & 0xFFFF) >> 8); uint32_t a; a = ntohl(*((in_addr_t *)key)); a = a >> imask; hash = hash_ip(a, hsize); SLIST_FOREACH(ent, &head[hash], next) { if (ent->a.a4 == a) { *val = ent->value; return (1); } } #endif } else { #ifdef INET6 /* IPv6: aligned to 8bit mask */ struct in6_addr addr6; uint64_t *paddr, *ptmp; head = (struct chashbhead *)ti->xstate; imask = (ti->data & 0xFF0000) >> 16; hsize = 1 << (ti->data & 0xFF); hash = hash_ip6_al(&addr6, key, imask, hsize); paddr = (uint64_t *)&addr6; SLIST_FOREACH(ent, &head[hash], next) { ptmp = (uint64_t *)&ent->a.a6; if (paddr[0] == ptmp[0] && paddr[1] == ptmp[1]) { *val = ent->value; return (1); } } #endif } return (0); } static int ta_lookup_chash_64(struct table_info *ti, void *key, uint32_t keylen, uint32_t *val) { struct chashbhead *head; struct chashentry *ent; uint16_t hash, hsize; uint8_t imask; if (keylen == sizeof(in_addr_t)) { #ifdef INET head = (struct chashbhead *)ti->state; imask = ti->data >> 24; hsize = 1 << ((ti->data & 0xFFFF) >> 8); uint32_t a; a = ntohl(*((in_addr_t *)key)); a = a >> imask; hash = hash_ip(a, hsize); SLIST_FOREACH(ent, &head[hash], next) { if (ent->a.a4 == a) { *val = ent->value; return (1); } } #endif } else { #ifdef INET6 /* IPv6: /64 */ uint64_t a6, *paddr; head = (struct chashbhead *)ti->xstate; paddr = (uint64_t *)key; hsize = 1 << (ti->data & 0xFF); a6 = *paddr; hash = hash_ip64((struct in6_addr *)key, hsize); SLIST_FOREACH(ent, &head[hash], next) { paddr = (uint64_t *)&ent->a.a6; if (a6 == *paddr) { *val = ent->value; return (1); } } #endif } return (0); } static int chash_parse_opts(struct chash_cfg *cfg, char *data) { char *pdel, *pend, *s; int mask4, mask6; mask4 = cfg->mask4; mask6 = cfg->mask6; if (data == NULL) return (0); if ((pdel = strchr(data, ' ')) == NULL) return (0); while (*pdel == ' ') pdel++; if (strncmp(pdel, "masks=", 6) != 0) return (EINVAL); if ((s = strchr(pdel, ' ')) != NULL) *s++ = '\0'; pdel += 6; /* Need /XX[,/YY] */ if (*pdel++ != '/') return (EINVAL); mask4 = strtol(pdel, &pend, 10); if (*pend == ',') { /* ,/YY */ pdel = pend + 1; if (*pdel++ != '/') return (EINVAL); mask6 = strtol(pdel, &pend, 10); if (*pend != '\0') return (EINVAL); } else if (*pend != '\0') return (EINVAL); if (mask4 < 0 || mask4 > 32 || mask6 < 0 || mask6 > 128) return (EINVAL); cfg->mask4 = mask4; cfg->mask6 = mask6; return (0); } static void ta_print_chash_config(void *ta_state, struct table_info *ti, char *buf, size_t bufsize) { struct chash_cfg *cfg; cfg = (struct chash_cfg *)ta_state; if (cfg->mask4 != 32 || cfg->mask6 != 128) snprintf(buf, bufsize, "%s masks=/%d,/%d", "addr:hash", cfg->mask4, cfg->mask6); else snprintf(buf, bufsize, "%s", "addr:hash"); } static int ta_log2(uint32_t v) { uint32_t r; r = 0; while (v >>= 1) r++; return (r); } /* * New table. * We assume 'data' to be either NULL or the following format: * 'addr:hash [masks=/32[,/128]]' */ static int ta_init_chash(struct ip_fw_chain *ch, void **ta_state, struct table_info *ti, char *data, uint8_t tflags) { int error, i; uint32_t hsize; struct chash_cfg *cfg; cfg = malloc(sizeof(struct chash_cfg), M_IPFW, M_WAITOK | M_ZERO); cfg->mask4 = 32; cfg->mask6 = 128; if ((error = chash_parse_opts(cfg, data)) != 0) { free(cfg, M_IPFW); return (error); } cfg->size4 = 128; cfg->size6 = 128; cfg->head4 = malloc(sizeof(struct chashbhead) * cfg->size4, M_IPFW, M_WAITOK | M_ZERO); cfg->head6 = malloc(sizeof(struct chashbhead) * cfg->size6, M_IPFW, M_WAITOK | M_ZERO); for (i = 0; i < cfg->size4; i++) SLIST_INIT(&cfg->head4[i]); for (i = 0; i < cfg->size6; i++) SLIST_INIT(&cfg->head6[i]); *ta_state = cfg; ti->state = cfg->head4; ti->xstate = cfg->head6; /* Store data depending on v6 mask length */ hsize = ta_log2(cfg->size4) << 8 | ta_log2(cfg->size6); if (cfg->mask6 == 64) { ti->data = (32 - cfg->mask4) << 24 | (128 - cfg->mask6) << 16| hsize; ti->lookup = ta_lookup_chash_64; } else if ((cfg->mask6 % 8) == 0) { ti->data = (32 - cfg->mask4) << 24 | cfg->mask6 << 13 | hsize; ti->lookup = ta_lookup_chash_aligned; } else { /* don't do that! */ ti->data = (32 - cfg->mask4) << 24 | cfg->mask6 << 16 | hsize; ti->lookup = ta_lookup_chash_slow; } return (0); } static void ta_destroy_chash(void *ta_state, struct table_info *ti) { struct chash_cfg *cfg; struct chashentry *ent, *ent_next; int i; cfg = (struct chash_cfg *)ta_state; for (i = 0; i < cfg->size4; i++) SLIST_FOREACH_SAFE(ent, &cfg->head4[i], next, ent_next) free(ent, M_IPFW_TBL); for (i = 0; i < cfg->size6; i++) SLIST_FOREACH_SAFE(ent, &cfg->head6[i], next, ent_next) free(ent, M_IPFW_TBL); free(cfg->head4, M_IPFW); free(cfg->head6, M_IPFW); free(cfg, M_IPFW); } static void ta_dump_chash_tinfo(void *ta_state, struct table_info *ti, ipfw_ta_tinfo *tinfo) { struct chash_cfg *cfg; cfg = (struct chash_cfg *)ta_state; tinfo->flags = IPFW_TATFLAGS_AFDATA | IPFW_TATFLAGS_AFITEM; tinfo->taclass4 = IPFW_TACLASS_HASH; tinfo->size4 = cfg->size4; tinfo->count4 = cfg->items4; tinfo->itemsize4 = sizeof(struct chashentry); tinfo->taclass6 = IPFW_TACLASS_HASH; tinfo->size6 = cfg->size6; tinfo->count6 = cfg->items6; tinfo->itemsize6 = sizeof(struct chashentry); } static int ta_dump_chash_tentry(void *ta_state, struct table_info *ti, void *e, ipfw_obj_tentry *tent) { struct chash_cfg *cfg; struct chashentry *ent; cfg = (struct chash_cfg *)ta_state; ent = (struct chashentry *)e; if (ent->type == AF_INET) { tent->k.addr.s_addr = htonl(ent->a.a4 << (32 - cfg->mask4)); tent->masklen = cfg->mask4; tent->subtype = AF_INET; tent->v.kidx = ent->value; #ifdef INET6 } else { memcpy(&tent->k, &ent->a.a6, sizeof(struct in6_addr)); tent->masklen = cfg->mask6; tent->subtype = AF_INET6; tent->v.kidx = ent->value; #endif } return (0); } static uint32_t hash_ent(struct chashentry *ent, int af, int mlen, uint32_t size) { uint32_t hash; hash = 0; if (af == AF_INET) { #ifdef INET hash = hash_ip(ent->a.a4, size); #endif } else { #ifdef INET6 if (mlen == 64) hash = hash_ip64(&ent->a.a6, size); else hash = hash_ip6(&ent->a.a6, size); #endif } return (hash); } static int tei_to_chash_ent(struct tentry_info *tei, struct chashentry *ent) { int mlen; #ifdef INET6 struct in6_addr mask6; #endif mlen = tei->masklen; if (tei->subtype == AF_INET) { #ifdef INET if (mlen > 32) return (EINVAL); ent->type = AF_INET; /* Calculate masked address */ ent->a.a4 = ntohl(*((in_addr_t *)tei->paddr)) >> (32 - mlen); #endif #ifdef INET6 } else if (tei->subtype == AF_INET6) { /* IPv6 case */ if (mlen > 128) return (EINVAL); ent->type = AF_INET6; ipv6_writemask(&mask6, mlen); memcpy(&ent->a.a6, tei->paddr, sizeof(struct in6_addr)); APPLY_MASK(&ent->a.a6, &mask6); #endif } else { /* Unknown CIDR type */ return (EINVAL); } return (0); } static int ta_find_chash_tentry(void *ta_state, struct table_info *ti, ipfw_obj_tentry *tent) { struct chash_cfg *cfg; struct chashbhead *head; struct chashentry ent, *tmp; struct tentry_info tei; int error; uint32_t hash; cfg = (struct chash_cfg *)ta_state; memset(&ent, 0, sizeof(ent)); memset(&tei, 0, sizeof(tei)); if (tent->subtype == AF_INET) { tei.paddr = &tent->k.addr; tei.masklen = cfg->mask4; tei.subtype = AF_INET; if ((error = tei_to_chash_ent(&tei, &ent)) != 0) return (error); head = cfg->head4; hash = hash_ent(&ent, AF_INET, cfg->mask4, cfg->size4); /* Check for existence */ SLIST_FOREACH(tmp, &head[hash], next) { if (tmp->a.a4 != ent.a.a4) continue; ta_dump_chash_tentry(ta_state, ti, tmp, tent); return (0); } } else { tei.paddr = &tent->k.addr6; tei.masklen = cfg->mask6; tei.subtype = AF_INET6; if ((error = tei_to_chash_ent(&tei, &ent)) != 0) return (error); head = cfg->head6; hash = hash_ent(&ent, AF_INET6, cfg->mask6, cfg->size6); /* Check for existence */ SLIST_FOREACH(tmp, &head[hash], next) { if (memcmp(&tmp->a.a6, &ent.a.a6, 16) != 0) continue; ta_dump_chash_tentry(ta_state, ti, tmp, tent); return (0); } } return (ENOENT); } static void ta_foreach_chash(void *ta_state, struct table_info *ti, ta_foreach_f *f, void *arg) { struct chash_cfg *cfg; struct chashentry *ent, *ent_next; int i; cfg = (struct chash_cfg *)ta_state; for (i = 0; i < cfg->size4; i++) SLIST_FOREACH_SAFE(ent, &cfg->head4[i], next, ent_next) f(ent, arg); for (i = 0; i < cfg->size6; i++) SLIST_FOREACH_SAFE(ent, &cfg->head6[i], next, ent_next) f(ent, arg); } static int ta_prepare_add_chash(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf) { struct ta_buf_chash *tb; struct chashentry *ent; int error; tb = (struct ta_buf_chash *)ta_buf; ent = malloc(sizeof(*ent), M_IPFW_TBL, M_WAITOK | M_ZERO); error = tei_to_chash_ent(tei, ent); if (error != 0) { free(ent, M_IPFW_TBL); return (error); } tb->ent_ptr = ent; return (0); } static int ta_add_chash(void *ta_state, struct table_info *ti, struct tentry_info *tei, void *ta_buf, uint32_t *pnum) { struct chash_cfg *cfg; struct chashbhead *head; struct chashentry *ent, *tmp; struct ta_buf_chash *tb; int exists; uint32_t hash, value; cfg = (struct chash_cfg *)ta_state; tb = (struct ta_buf_chash *)ta_buf; ent = (struct chashentry *)tb->ent_ptr; hash = 0; exists = 0; /* Read current value from @tei */ ent->value = tei->value; /* Read cuurrent value */ if (tei->subtype == AF_INET) { if (tei->masklen != cfg->mask4) return (EINVAL); head = cfg->head4; hash = hash_ent(ent, AF_INET, cfg->mask4, cfg->size4); /* Check for existence */ SLIST_FOREACH(tmp, &head[hash], next) { if (tmp->a.a4 == ent->a.a4) { exists = 1; break; } } } else { if (tei->masklen != cfg->mask6) return (EINVAL); head = cfg->head6; hash = hash_ent(ent, AF_INET6, cfg->mask6, cfg->size6); /* Check for existence */ SLIST_FOREACH(tmp, &head[hash], next) { if (memcmp(&tmp->a.a6, &ent->a.a6, 16) == 0) { exists = 1; break; } } } if (exists == 1) { if ((tei->flags & TEI_FLAGS_UPDATE) == 0) return (EEXIST); /* Record already exists. Update value if we're asked to */ value = tmp->value; tmp->value = tei->value; tei->value = value; /* Indicate that update has happened instead of addition */ tei->flags |= TEI_FLAGS_UPDATED; *pnum = 0; } else { if ((tei->flags & TEI_FLAGS_DONTADD) != 0) return (EFBIG); SLIST_INSERT_HEAD(&head[hash], ent, next); tb->ent_ptr = NULL; *pnum = 1; /* Update counters */ if (tei->subtype == AF_INET) cfg->items4++; else cfg->items6++; } return (0); } static int ta_prepare_del_chash(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf) { struct ta_buf_chash *tb; tb = (struct ta_buf_chash *)ta_buf; return (tei_to_chash_ent(tei, &tb->ent)); } static int ta_del_chash(void *ta_state, struct table_info *ti, struct tentry_info *tei, void *ta_buf, uint32_t *pnum) { struct chash_cfg *cfg; struct chashbhead *head; struct chashentry *tmp, *tmp_next, *ent; struct ta_buf_chash *tb; uint32_t hash; cfg = (struct chash_cfg *)ta_state; tb = (struct ta_buf_chash *)ta_buf; ent = &tb->ent; if (tei->subtype == AF_INET) { if (tei->masklen != cfg->mask4) return (EINVAL); head = cfg->head4; hash = hash_ent(ent, AF_INET, cfg->mask4, cfg->size4); SLIST_FOREACH_SAFE(tmp, &head[hash], next, tmp_next) { if (tmp->a.a4 != ent->a.a4) continue; SLIST_REMOVE(&head[hash], tmp, chashentry, next); cfg->items4--; tb->ent_ptr = tmp; tei->value = tmp->value; *pnum = 1; return (0); } } else { if (tei->masklen != cfg->mask6) return (EINVAL); head = cfg->head6; hash = hash_ent(ent, AF_INET6, cfg->mask6, cfg->size6); SLIST_FOREACH_SAFE(tmp, &head[hash], next, tmp_next) { if (memcmp(&tmp->a.a6, &ent->a.a6, 16) != 0) continue; SLIST_REMOVE(&head[hash], tmp, chashentry, next); cfg->items6--; tb->ent_ptr = tmp; tei->value = tmp->value; *pnum = 1; return (0); } } return (ENOENT); } static void ta_flush_chash_entry(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf) { struct ta_buf_chash *tb; tb = (struct ta_buf_chash *)ta_buf; if (tb->ent_ptr != NULL) free(tb->ent_ptr, M_IPFW_TBL); } /* * Hash growing callbacks. */ static int ta_need_modify_chash(void *ta_state, struct table_info *ti, uint32_t count, uint64_t *pflags) { struct chash_cfg *cfg; uint64_t data; /* * Since we don't know exact number of IPv4/IPv6 records in @count, * ignore non-zero @count value at all. Check current hash sizes * and return appropriate data. */ cfg = (struct chash_cfg *)ta_state; data = 0; if (cfg->items4 > cfg->size4 && cfg->size4 < 65536) data |= (cfg->size4 * 2) << 16; if (cfg->items6 > cfg->size6 && cfg->size6 < 65536) data |= cfg->size6 * 2; if (data != 0) { *pflags = data; return (1); } return (0); } /* * Allocate new, larger chash. */ static int ta_prepare_mod_chash(void *ta_buf, uint64_t *pflags) { struct mod_item *mi; struct chashbhead *head; int i; mi = (struct mod_item *)ta_buf; memset(mi, 0, sizeof(struct mod_item)); mi->size = (*pflags >> 16) & 0xFFFF; mi->size6 = *pflags & 0xFFFF; if (mi->size > 0) { head = malloc(sizeof(struct chashbhead) * mi->size, M_IPFW, M_WAITOK | M_ZERO); for (i = 0; i < mi->size; i++) SLIST_INIT(&head[i]); mi->main_ptr = head; } if (mi->size6 > 0) { head = malloc(sizeof(struct chashbhead) * mi->size6, M_IPFW, M_WAITOK | M_ZERO); for (i = 0; i < mi->size6; i++) SLIST_INIT(&head[i]); mi->main_ptr6 = head; } return (0); } /* * Copy data from old runtime array to new one. */ static int ta_fill_mod_chash(void *ta_state, struct table_info *ti, void *ta_buf, uint64_t *pflags) { /* In is not possible to do rehash if we're not holidng WLOCK. */ return (0); } /* * Switch old & new arrays. */ static void ta_modify_chash(void *ta_state, struct table_info *ti, void *ta_buf, uint64_t pflags) { struct mod_item *mi; struct chash_cfg *cfg; struct chashbhead *old_head, *new_head; struct chashentry *ent, *ent_next; int af, i, mlen; uint32_t nhash; size_t old_size, new_size; mi = (struct mod_item *)ta_buf; cfg = (struct chash_cfg *)ta_state; /* Check which hash we need to grow and do we still need that */ if (mi->size > 0 && cfg->size4 < mi->size) { new_head = (struct chashbhead *)mi->main_ptr; new_size = mi->size; old_size = cfg->size4; old_head = ti->state; mlen = cfg->mask4; af = AF_INET; for (i = 0; i < old_size; i++) { SLIST_FOREACH_SAFE(ent, &old_head[i], next, ent_next) { nhash = hash_ent(ent, af, mlen, new_size); SLIST_INSERT_HEAD(&new_head[nhash], ent, next); } } ti->state = new_head; cfg->head4 = new_head; cfg->size4 = mi->size; mi->main_ptr = old_head; } if (mi->size6 > 0 && cfg->size6 < mi->size6) { new_head = (struct chashbhead *)mi->main_ptr6; new_size = mi->size6; old_size = cfg->size6; old_head = ti->xstate; mlen = cfg->mask6; af = AF_INET6; for (i = 0; i < old_size; i++) { SLIST_FOREACH_SAFE(ent, &old_head[i], next, ent_next) { nhash = hash_ent(ent, af, mlen, new_size); SLIST_INSERT_HEAD(&new_head[nhash], ent, next); } } ti->xstate = new_head; cfg->head6 = new_head; cfg->size6 = mi->size6; mi->main_ptr6 = old_head; } /* Update lower 32 bits with new values */ ti->data &= 0xFFFFFFFF00000000; ti->data |= ta_log2(cfg->size4) << 8 | ta_log2(cfg->size6); } /* * Free unneded array. */ static void ta_flush_mod_chash(void *ta_buf) { struct mod_item *mi; mi = (struct mod_item *)ta_buf; if (mi->main_ptr != NULL) free(mi->main_ptr, M_IPFW); if (mi->main_ptr6 != NULL) free(mi->main_ptr6, M_IPFW); } struct table_algo addr_hash = { .name = "addr:hash", .type = IPFW_TABLE_ADDR, .ta_buf_size = sizeof(struct ta_buf_chash), .init = ta_init_chash, .destroy = ta_destroy_chash, .prepare_add = ta_prepare_add_chash, .prepare_del = ta_prepare_del_chash, .add = ta_add_chash, .del = ta_del_chash, .flush_entry = ta_flush_chash_entry, .foreach = ta_foreach_chash, .dump_tentry = ta_dump_chash_tentry, .find_tentry = ta_find_chash_tentry, .print_config = ta_print_chash_config, .dump_tinfo = ta_dump_chash_tinfo, .need_modify = ta_need_modify_chash, .prepare_mod = ta_prepare_mod_chash, .fill_mod = ta_fill_mod_chash, .modify = ta_modify_chash, .flush_mod = ta_flush_mod_chash, }; /* * Iface table cmds. * * Implementation: * * Runtime part: * - sorted array of "struct ifidx" pointed by ti->state. * Array is allocated with rounding up to IFIDX_CHUNK. Only existing * interfaces are stored in array, however its allocated size is * sufficient to hold all table records if needed. * - current array size is stored in ti->data * * Table data: * - "struct iftable_cfg" is allocated to store table state (ta_state). * - All table records are stored inside namedobj instance. * */ struct ifidx { uint16_t kidx; uint16_t spare; uint32_t value; }; #define DEFAULT_IFIDX_SIZE 64 struct iftable_cfg; struct ifentry { struct named_object no; struct ipfw_ifc ic; struct iftable_cfg *icfg; uint32_t value; int linked; }; struct iftable_cfg { struct namedobj_instance *ii; struct ip_fw_chain *ch; struct table_info *ti; void *main_ptr; size_t size; /* Number of items allocated in array */ size_t count; /* Number of all items */ size_t used; /* Number of items _active_ now */ }; struct ta_buf_ifidx { struct ifentry *ife; uint32_t value; }; int compare_ifidx(const void *k, const void *v); static struct ifidx * ifidx_find(struct table_info *ti, void *key); static int ta_lookup_ifidx(struct table_info *ti, void *key, uint32_t keylen, uint32_t *val); static int ta_init_ifidx(struct ip_fw_chain *ch, void **ta_state, struct table_info *ti, char *data, uint8_t tflags); static void ta_change_ti_ifidx(void *ta_state, struct table_info *ti); static void destroy_ifidx_locked(struct namedobj_instance *ii, struct named_object *no, void *arg); static void ta_destroy_ifidx(void *ta_state, struct table_info *ti); static void ta_dump_ifidx_tinfo(void *ta_state, struct table_info *ti, ipfw_ta_tinfo *tinfo); static int ta_prepare_add_ifidx(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf); static int ta_add_ifidx(void *ta_state, struct table_info *ti, struct tentry_info *tei, void *ta_buf, uint32_t *pnum); static int ta_prepare_del_ifidx(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf); static int ta_del_ifidx(void *ta_state, struct table_info *ti, struct tentry_info *tei, void *ta_buf, uint32_t *pnum); static void ta_flush_ifidx_entry(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf); static void if_notifier(struct ip_fw_chain *ch, void *cbdata, uint16_t ifindex); static int ta_need_modify_ifidx(void *ta_state, struct table_info *ti, uint32_t count, uint64_t *pflags); static int ta_prepare_mod_ifidx(void *ta_buf, uint64_t *pflags); static int ta_fill_mod_ifidx(void *ta_state, struct table_info *ti, void *ta_buf, uint64_t *pflags); static void ta_modify_ifidx(void *ta_state, struct table_info *ti, void *ta_buf, uint64_t pflags); static void ta_flush_mod_ifidx(void *ta_buf); static int ta_dump_ifidx_tentry(void *ta_state, struct table_info *ti, void *e, ipfw_obj_tentry *tent); static int ta_find_ifidx_tentry(void *ta_state, struct table_info *ti, ipfw_obj_tentry *tent); static void foreach_ifidx(struct namedobj_instance *ii, struct named_object *no, void *arg); static void ta_foreach_ifidx(void *ta_state, struct table_info *ti, ta_foreach_f *f, void *arg); int compare_ifidx(const void *k, const void *v) { const struct ifidx *ifidx; uint16_t key; key = *((const uint16_t *)k); ifidx = (const struct ifidx *)v; if (key < ifidx->kidx) return (-1); else if (key > ifidx->kidx) return (1); return (0); } /* * Adds item @item with key @key into ascending-sorted array @base. * Assumes @base has enough additional storage. * * Returns 1 on success, 0 on duplicate key. */ static int badd(const void *key, void *item, void *base, size_t nmemb, size_t size, int (*compar) (const void *, const void *)) { int min, max, mid, shift, res; caddr_t paddr; if (nmemb == 0) { memcpy(base, item, size); return (1); } /* Binary search */ min = 0; max = nmemb - 1; mid = 0; while (min <= max) { mid = (min + max) / 2; res = compar(key, (const void *)((caddr_t)base + mid * size)); if (res == 0) return (0); if (res > 0) min = mid + 1; else max = mid - 1; } /* Item not found. */ res = compar(key, (const void *)((caddr_t)base + mid * size)); if (res > 0) shift = mid + 1; else shift = mid; paddr = (caddr_t)base + shift * size; if (nmemb > shift) memmove(paddr + size, paddr, (nmemb - shift) * size); memcpy(paddr, item, size); return (1); } /* * Deletes item with key @key from ascending-sorted array @base. * * Returns 1 on success, 0 for non-existent key. */ static int bdel(const void *key, void *base, size_t nmemb, size_t size, int (*compar) (const void *, const void *)) { caddr_t item; size_t sz; item = (caddr_t)bsearch(key, base, nmemb, size, compar); if (item == NULL) return (0); sz = (caddr_t)base + nmemb * size - item; if (sz > 0) memmove(item, item + size, sz); return (1); } static struct ifidx * ifidx_find(struct table_info *ti, void *key) { struct ifidx *ifi; ifi = bsearch(key, ti->state, ti->data, sizeof(struct ifidx), compare_ifidx); return (ifi); } static int ta_lookup_ifidx(struct table_info *ti, void *key, uint32_t keylen, uint32_t *val) { struct ifidx *ifi; ifi = ifidx_find(ti, key); if (ifi != NULL) { *val = ifi->value; return (1); } return (0); } static int ta_init_ifidx(struct ip_fw_chain *ch, void **ta_state, struct table_info *ti, char *data, uint8_t tflags) { struct iftable_cfg *icfg; icfg = malloc(sizeof(struct iftable_cfg), M_IPFW, M_WAITOK | M_ZERO); icfg->ii = ipfw_objhash_create(DEFAULT_IFIDX_SIZE); icfg->size = DEFAULT_IFIDX_SIZE; icfg->main_ptr = malloc(sizeof(struct ifidx) * icfg->size, M_IPFW, M_WAITOK | M_ZERO); icfg->ch = ch; *ta_state = icfg; ti->state = icfg->main_ptr; ti->lookup = ta_lookup_ifidx; return (0); } /* * Handle tableinfo @ti pointer change (on table array resize). */ static void ta_change_ti_ifidx(void *ta_state, struct table_info *ti) { struct iftable_cfg *icfg; icfg = (struct iftable_cfg *)ta_state; icfg->ti = ti; } static void destroy_ifidx_locked(struct namedobj_instance *ii, struct named_object *no, void *arg) { struct ifentry *ife; struct ip_fw_chain *ch; ch = (struct ip_fw_chain *)arg; ife = (struct ifentry *)no; ipfw_iface_del_notify(ch, &ife->ic); free(ife, M_IPFW_TBL); } /* * Destroys table @ti */ static void ta_destroy_ifidx(void *ta_state, struct table_info *ti) { struct iftable_cfg *icfg; struct ip_fw_chain *ch; icfg = (struct iftable_cfg *)ta_state; ch = icfg->ch; if (icfg->main_ptr != NULL) free(icfg->main_ptr, M_IPFW); ipfw_objhash_foreach(icfg->ii, destroy_ifidx_locked, ch); ipfw_objhash_destroy(icfg->ii); free(icfg, M_IPFW); } /* * Provide algo-specific table info */ static void ta_dump_ifidx_tinfo(void *ta_state, struct table_info *ti, ipfw_ta_tinfo *tinfo) { struct iftable_cfg *cfg; cfg = (struct iftable_cfg *)ta_state; tinfo->taclass4 = IPFW_TACLASS_ARRAY; tinfo->size4 = cfg->size; tinfo->count4 = cfg->used; tinfo->itemsize4 = sizeof(struct ifidx); } /* * Prepare state to add to the table: * allocate ifentry and reference needed interface. */ static int ta_prepare_add_ifidx(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf) { struct ta_buf_ifidx *tb; char *ifname; struct ifentry *ife; tb = (struct ta_buf_ifidx *)ta_buf; /* Check if string is terminated */ ifname = (char *)tei->paddr; if (strnlen(ifname, IF_NAMESIZE) == IF_NAMESIZE) return (EINVAL); ife = malloc(sizeof(struct ifentry), M_IPFW_TBL, M_WAITOK | M_ZERO); ife->ic.cb = if_notifier; ife->ic.cbdata = ife; if (ipfw_iface_ref(ch, ifname, &ife->ic) != 0) { free(ife, M_IPFW_TBL); return (EINVAL); } /* Use ipfw_iface 'ifname' field as stable storage */ ife->no.name = ife->ic.iface->ifname; tb->ife = ife; return (0); } static int ta_add_ifidx(void *ta_state, struct table_info *ti, struct tentry_info *tei, void *ta_buf, uint32_t *pnum) { struct iftable_cfg *icfg; struct ifentry *ife, *tmp; struct ta_buf_ifidx *tb; struct ipfw_iface *iif; struct ifidx *ifi; char *ifname; uint32_t value; tb = (struct ta_buf_ifidx *)ta_buf; ifname = (char *)tei->paddr; icfg = (struct iftable_cfg *)ta_state; ife = tb->ife; ife->icfg = icfg; ife->value = tei->value; tmp = (struct ifentry *)ipfw_objhash_lookup_name(icfg->ii, 0, ifname); if (tmp != NULL) { if ((tei->flags & TEI_FLAGS_UPDATE) == 0) return (EEXIST); /* Exchange values in @tmp and @tei */ value = tmp->value; tmp->value = tei->value; tei->value = value; iif = tmp->ic.iface; if (iif->resolved != 0) { /* We have to update runtime value, too */ ifi = ifidx_find(ti, &iif->ifindex); ifi->value = ife->value; } /* Indicate that update has happened instead of addition */ tei->flags |= TEI_FLAGS_UPDATED; *pnum = 0; return (0); } if ((tei->flags & TEI_FLAGS_DONTADD) != 0) return (EFBIG); /* Link to internal list */ ipfw_objhash_add(icfg->ii, &ife->no); /* Link notifier (possible running its callback) */ ipfw_iface_add_notify(icfg->ch, &ife->ic); icfg->count++; tb->ife = NULL; *pnum = 1; return (0); } /* * Prepare to delete key from table. * Do basic interface name checks. */ static int ta_prepare_del_ifidx(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf) { struct ta_buf_ifidx *tb; char *ifname; tb = (struct ta_buf_ifidx *)ta_buf; /* Check if string is terminated */ ifname = (char *)tei->paddr; if (strnlen(ifname, IF_NAMESIZE) == IF_NAMESIZE) return (EINVAL); return (0); } /* * Remove key from both configuration list and * runtime array. Removed interface notification. */ static int ta_del_ifidx(void *ta_state, struct table_info *ti, struct tentry_info *tei, void *ta_buf, uint32_t *pnum) { struct iftable_cfg *icfg; struct ifentry *ife; struct ta_buf_ifidx *tb; char *ifname; uint16_t ifindex; int res; tb = (struct ta_buf_ifidx *)ta_buf; ifname = (char *)tei->paddr; icfg = (struct iftable_cfg *)ta_state; ife = tb->ife; ife = (struct ifentry *)ipfw_objhash_lookup_name(icfg->ii, 0, ifname); if (ife == NULL) return (ENOENT); if (ife->linked != 0) { /* We have to remove item from runtime */ ifindex = ife->ic.iface->ifindex; res = bdel(&ifindex, icfg->main_ptr, icfg->used, sizeof(struct ifidx), compare_ifidx); KASSERT(res == 1, ("index %d does not exist", ifindex)); icfg->used--; ti->data = icfg->used; ife->linked = 0; } /* Unlink from local list */ ipfw_objhash_del(icfg->ii, &ife->no); /* Unlink notifier */ ipfw_iface_del_notify(icfg->ch, &ife->ic); icfg->count--; tei->value = ife->value; tb->ife = ife; *pnum = 1; return (0); } /* * Flush deleted entry. * Drops interface reference and frees entry. */ static void ta_flush_ifidx_entry(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf) { struct ta_buf_ifidx *tb; tb = (struct ta_buf_ifidx *)ta_buf; if (tb->ife != NULL) { /* Unlink first */ ipfw_iface_unref(ch, &tb->ife->ic); free(tb->ife, M_IPFW_TBL); } } /* * Handle interface announce/withdrawal for particular table. * Every real runtime array modification happens here. */ static void if_notifier(struct ip_fw_chain *ch, void *cbdata, uint16_t ifindex) { struct ifentry *ife; struct ifidx ifi; struct iftable_cfg *icfg; struct table_info *ti; int res; ife = (struct ifentry *)cbdata; icfg = ife->icfg; ti = icfg->ti; KASSERT(ti != NULL, ("ti=NULL, check change_ti handler")); if (ife->linked == 0 && ifindex != 0) { /* Interface announce */ ifi.kidx = ifindex; ifi.spare = 0; ifi.value = ife->value; res = badd(&ifindex, &ifi, icfg->main_ptr, icfg->used, sizeof(struct ifidx), compare_ifidx); KASSERT(res == 1, ("index %d already exists", ifindex)); icfg->used++; ti->data = icfg->used; ife->linked = 1; } else if (ife->linked != 0 && ifindex == 0) { /* Interface withdrawal */ ifindex = ife->ic.iface->ifindex; res = bdel(&ifindex, icfg->main_ptr, icfg->used, sizeof(struct ifidx), compare_ifidx); KASSERT(res == 1, ("index %d does not exist", ifindex)); icfg->used--; ti->data = icfg->used; ife->linked = 0; } } /* * Table growing callbacks. */ static int ta_need_modify_ifidx(void *ta_state, struct table_info *ti, uint32_t count, uint64_t *pflags) { struct iftable_cfg *cfg; uint32_t size; cfg = (struct iftable_cfg *)ta_state; size = cfg->size; while (size < cfg->count + count) size *= 2; if (size != cfg->size) { *pflags = size; return (1); } return (0); } /* * Allocate ned, larger runtime ifidx array. */ static int ta_prepare_mod_ifidx(void *ta_buf, uint64_t *pflags) { struct mod_item *mi; mi = (struct mod_item *)ta_buf; memset(mi, 0, sizeof(struct mod_item)); mi->size = *pflags; mi->main_ptr = malloc(sizeof(struct ifidx) * mi->size, M_IPFW, M_WAITOK | M_ZERO); return (0); } /* * Copy data from old runtime array to new one. */ static int ta_fill_mod_ifidx(void *ta_state, struct table_info *ti, void *ta_buf, uint64_t *pflags) { struct mod_item *mi; struct iftable_cfg *icfg; mi = (struct mod_item *)ta_buf; icfg = (struct iftable_cfg *)ta_state; /* Check if we still need to grow array */ if (icfg->size >= mi->size) { *pflags = 0; return (0); } memcpy(mi->main_ptr, icfg->main_ptr, icfg->used * sizeof(struct ifidx)); return (0); } /* * Switch old & new arrays. */ static void ta_modify_ifidx(void *ta_state, struct table_info *ti, void *ta_buf, uint64_t pflags) { struct mod_item *mi; struct iftable_cfg *icfg; void *old_ptr; mi = (struct mod_item *)ta_buf; icfg = (struct iftable_cfg *)ta_state; old_ptr = icfg->main_ptr; icfg->main_ptr = mi->main_ptr; icfg->size = mi->size; ti->state = icfg->main_ptr; mi->main_ptr = old_ptr; } /* * Free unneded array. */ static void ta_flush_mod_ifidx(void *ta_buf) { struct mod_item *mi; mi = (struct mod_item *)ta_buf; if (mi->main_ptr != NULL) free(mi->main_ptr, M_IPFW); } static int ta_dump_ifidx_tentry(void *ta_state, struct table_info *ti, void *e, ipfw_obj_tentry *tent) { struct ifentry *ife; ife = (struct ifentry *)e; tent->masklen = 8 * IF_NAMESIZE; memcpy(&tent->k, ife->no.name, IF_NAMESIZE); tent->v.kidx = ife->value; return (0); } static int ta_find_ifidx_tentry(void *ta_state, struct table_info *ti, ipfw_obj_tentry *tent) { struct iftable_cfg *icfg; struct ifentry *ife; char *ifname; icfg = (struct iftable_cfg *)ta_state; ifname = tent->k.iface; if (strnlen(ifname, IF_NAMESIZE) == IF_NAMESIZE) return (EINVAL); ife = (struct ifentry *)ipfw_objhash_lookup_name(icfg->ii, 0, ifname); if (ife != NULL) { ta_dump_ifidx_tentry(ta_state, ti, ife, tent); return (0); } return (ENOENT); } struct wa_ifidx { ta_foreach_f *f; void *arg; }; static void foreach_ifidx(struct namedobj_instance *ii, struct named_object *no, void *arg) { struct ifentry *ife; struct wa_ifidx *wa; ife = (struct ifentry *)no; wa = (struct wa_ifidx *)arg; wa->f(ife, wa->arg); } static void ta_foreach_ifidx(void *ta_state, struct table_info *ti, ta_foreach_f *f, void *arg) { struct iftable_cfg *icfg; struct wa_ifidx wa; icfg = (struct iftable_cfg *)ta_state; wa.f = f; wa.arg = arg; ipfw_objhash_foreach(icfg->ii, foreach_ifidx, &wa); } struct table_algo iface_idx = { .name = "iface:array", .type = IPFW_TABLE_INTERFACE, .flags = TA_FLAG_DEFAULT, .ta_buf_size = sizeof(struct ta_buf_ifidx), .init = ta_init_ifidx, .destroy = ta_destroy_ifidx, .prepare_add = ta_prepare_add_ifidx, .prepare_del = ta_prepare_del_ifidx, .add = ta_add_ifidx, .del = ta_del_ifidx, .flush_entry = ta_flush_ifidx_entry, .foreach = ta_foreach_ifidx, .dump_tentry = ta_dump_ifidx_tentry, .find_tentry = ta_find_ifidx_tentry, .dump_tinfo = ta_dump_ifidx_tinfo, .need_modify = ta_need_modify_ifidx, .prepare_mod = ta_prepare_mod_ifidx, .fill_mod = ta_fill_mod_ifidx, .modify = ta_modify_ifidx, .flush_mod = ta_flush_mod_ifidx, .change_ti = ta_change_ti_ifidx, }; /* * Number array cmds. * * Implementation: * * Runtime part: * - sorted array of "struct numarray" pointed by ti->state. * Array is allocated with rounding up to NUMARRAY_CHUNK. * - current array size is stored in ti->data * */ struct numarray { uint32_t number; uint32_t value; }; struct numarray_cfg { void *main_ptr; size_t size; /* Number of items allocated in array */ size_t used; /* Number of items _active_ now */ }; struct ta_buf_numarray { struct numarray na; }; int compare_numarray(const void *k, const void *v); static struct numarray *numarray_find(struct table_info *ti, void *key); static int ta_lookup_numarray(struct table_info *ti, void *key, uint32_t keylen, uint32_t *val); static int ta_init_numarray(struct ip_fw_chain *ch, void **ta_state, struct table_info *ti, char *data, uint8_t tflags); static void ta_destroy_numarray(void *ta_state, struct table_info *ti); static void ta_dump_numarray_tinfo(void *ta_state, struct table_info *ti, ipfw_ta_tinfo *tinfo); static int ta_prepare_add_numarray(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf); static int ta_add_numarray(void *ta_state, struct table_info *ti, struct tentry_info *tei, void *ta_buf, uint32_t *pnum); static int ta_del_numarray(void *ta_state, struct table_info *ti, struct tentry_info *tei, void *ta_buf, uint32_t *pnum); static void ta_flush_numarray_entry(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf); static int ta_need_modify_numarray(void *ta_state, struct table_info *ti, uint32_t count, uint64_t *pflags); static int ta_prepare_mod_numarray(void *ta_buf, uint64_t *pflags); static int ta_fill_mod_numarray(void *ta_state, struct table_info *ti, void *ta_buf, uint64_t *pflags); static void ta_modify_numarray(void *ta_state, struct table_info *ti, void *ta_buf, uint64_t pflags); static void ta_flush_mod_numarray(void *ta_buf); static int ta_dump_numarray_tentry(void *ta_state, struct table_info *ti, void *e, ipfw_obj_tentry *tent); static int ta_find_numarray_tentry(void *ta_state, struct table_info *ti, ipfw_obj_tentry *tent); static void ta_foreach_numarray(void *ta_state, struct table_info *ti, ta_foreach_f *f, void *arg); int compare_numarray(const void *k, const void *v) { const struct numarray *na; uint32_t key; key = *((const uint32_t *)k); na = (const struct numarray *)v; if (key < na->number) return (-1); else if (key > na->number) return (1); return (0); } static struct numarray * numarray_find(struct table_info *ti, void *key) { struct numarray *ri; ri = bsearch(key, ti->state, ti->data, sizeof(struct numarray), compare_ifidx); return (ri); } static int ta_lookup_numarray(struct table_info *ti, void *key, uint32_t keylen, uint32_t *val) { struct numarray *ri; ri = numarray_find(ti, key); if (ri != NULL) { *val = ri->value; return (1); } return (0); } static int ta_init_numarray(struct ip_fw_chain *ch, void **ta_state, struct table_info *ti, char *data, uint8_t tflags) { struct numarray_cfg *cfg; cfg = malloc(sizeof(*cfg), M_IPFW, M_WAITOK | M_ZERO); cfg->size = 16; cfg->main_ptr = malloc(sizeof(struct numarray) * cfg->size, M_IPFW, M_WAITOK | M_ZERO); *ta_state = cfg; ti->state = cfg->main_ptr; ti->lookup = ta_lookup_numarray; return (0); } /* * Destroys table @ti */ static void ta_destroy_numarray(void *ta_state, struct table_info *ti) { struct numarray_cfg *cfg; cfg = (struct numarray_cfg *)ta_state; if (cfg->main_ptr != NULL) free(cfg->main_ptr, M_IPFW); free(cfg, M_IPFW); } /* * Provide algo-specific table info */ static void ta_dump_numarray_tinfo(void *ta_state, struct table_info *ti, ipfw_ta_tinfo *tinfo) { struct numarray_cfg *cfg; cfg = (struct numarray_cfg *)ta_state; tinfo->taclass4 = IPFW_TACLASS_ARRAY; tinfo->size4 = cfg->size; tinfo->count4 = cfg->used; tinfo->itemsize4 = sizeof(struct numarray); } /* * Prepare for addition/deletion to an array. */ static int ta_prepare_add_numarray(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf) { struct ta_buf_numarray *tb; tb = (struct ta_buf_numarray *)ta_buf; tb->na.number = *((uint32_t *)tei->paddr); return (0); } static int ta_add_numarray(void *ta_state, struct table_info *ti, struct tentry_info *tei, void *ta_buf, uint32_t *pnum) { struct numarray_cfg *cfg; struct ta_buf_numarray *tb; struct numarray *ri; int res; uint32_t value; tb = (struct ta_buf_numarray *)ta_buf; cfg = (struct numarray_cfg *)ta_state; /* Read current value from @tei */ tb->na.value = tei->value; ri = numarray_find(ti, &tb->na.number); if (ri != NULL) { if ((tei->flags & TEI_FLAGS_UPDATE) == 0) return (EEXIST); /* Exchange values between ri and @tei */ value = ri->value; ri->value = tei->value; tei->value = value; /* Indicate that update has happened instead of addition */ tei->flags |= TEI_FLAGS_UPDATED; *pnum = 0; return (0); } if ((tei->flags & TEI_FLAGS_DONTADD) != 0) return (EFBIG); res = badd(&tb->na.number, &tb->na, cfg->main_ptr, cfg->used, sizeof(struct numarray), compare_numarray); KASSERT(res == 1, ("number %d already exists", tb->na.number)); cfg->used++; ti->data = cfg->used; *pnum = 1; return (0); } /* * Remove key from both configuration list and * runtime array. Removed interface notification. */ static int ta_del_numarray(void *ta_state, struct table_info *ti, struct tentry_info *tei, void *ta_buf, uint32_t *pnum) { struct numarray_cfg *cfg; struct ta_buf_numarray *tb; struct numarray *ri; int res; tb = (struct ta_buf_numarray *)ta_buf; cfg = (struct numarray_cfg *)ta_state; ri = numarray_find(ti, &tb->na.number); if (ri == NULL) return (ENOENT); tei->value = ri->value; res = bdel(&tb->na.number, cfg->main_ptr, cfg->used, sizeof(struct numarray), compare_numarray); KASSERT(res == 1, ("number %u does not exist", tb->na.number)); cfg->used--; ti->data = cfg->used; *pnum = 1; return (0); } static void ta_flush_numarray_entry(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf) { /* We don't have any state, do nothing */ } /* * Table growing callbacks. */ static int ta_need_modify_numarray(void *ta_state, struct table_info *ti, uint32_t count, uint64_t *pflags) { struct numarray_cfg *cfg; size_t size; cfg = (struct numarray_cfg *)ta_state; size = cfg->size; while (size < cfg->used + count) size *= 2; if (size != cfg->size) { *pflags = size; return (1); } return (0); } /* * Allocate new, larger runtime array. */ static int ta_prepare_mod_numarray(void *ta_buf, uint64_t *pflags) { struct mod_item *mi; mi = (struct mod_item *)ta_buf; memset(mi, 0, sizeof(struct mod_item)); mi->size = *pflags; mi->main_ptr = malloc(sizeof(struct numarray) * mi->size, M_IPFW, M_WAITOK | M_ZERO); return (0); } /* * Copy data from old runtime array to new one. */ static int ta_fill_mod_numarray(void *ta_state, struct table_info *ti, void *ta_buf, uint64_t *pflags) { struct mod_item *mi; struct numarray_cfg *cfg; mi = (struct mod_item *)ta_buf; cfg = (struct numarray_cfg *)ta_state; /* Check if we still need to grow array */ if (cfg->size >= mi->size) { *pflags = 0; return (0); } memcpy(mi->main_ptr, cfg->main_ptr, cfg->used * sizeof(struct numarray)); return (0); } /* * Switch old & new arrays. */ static void ta_modify_numarray(void *ta_state, struct table_info *ti, void *ta_buf, uint64_t pflags) { struct mod_item *mi; struct numarray_cfg *cfg; void *old_ptr; mi = (struct mod_item *)ta_buf; cfg = (struct numarray_cfg *)ta_state; old_ptr = cfg->main_ptr; cfg->main_ptr = mi->main_ptr; cfg->size = mi->size; ti->state = cfg->main_ptr; mi->main_ptr = old_ptr; } /* * Free unneded array. */ static void ta_flush_mod_numarray(void *ta_buf) { struct mod_item *mi; mi = (struct mod_item *)ta_buf; if (mi->main_ptr != NULL) free(mi->main_ptr, M_IPFW); } static int ta_dump_numarray_tentry(void *ta_state, struct table_info *ti, void *e, ipfw_obj_tentry *tent) { struct numarray *na; na = (struct numarray *)e; tent->k.key = na->number; tent->v.kidx = na->value; return (0); } static int ta_find_numarray_tentry(void *ta_state, struct table_info *ti, ipfw_obj_tentry *tent) { struct numarray_cfg *cfg; struct numarray *ri; cfg = (struct numarray_cfg *)ta_state; ri = numarray_find(ti, &tent->k.key); if (ri != NULL) { ta_dump_numarray_tentry(ta_state, ti, ri, tent); return (0); } return (ENOENT); } static void ta_foreach_numarray(void *ta_state, struct table_info *ti, ta_foreach_f *f, void *arg) { struct numarray_cfg *cfg; struct numarray *array; int i; cfg = (struct numarray_cfg *)ta_state; array = cfg->main_ptr; for (i = 0; i < cfg->used; i++) f(&array[i], arg); } struct table_algo number_array = { .name = "number:array", .type = IPFW_TABLE_NUMBER, .ta_buf_size = sizeof(struct ta_buf_numarray), .init = ta_init_numarray, .destroy = ta_destroy_numarray, .prepare_add = ta_prepare_add_numarray, .prepare_del = ta_prepare_add_numarray, .add = ta_add_numarray, .del = ta_del_numarray, .flush_entry = ta_flush_numarray_entry, .foreach = ta_foreach_numarray, .dump_tentry = ta_dump_numarray_tentry, .find_tentry = ta_find_numarray_tentry, .dump_tinfo = ta_dump_numarray_tinfo, .need_modify = ta_need_modify_numarray, .prepare_mod = ta_prepare_mod_numarray, .fill_mod = ta_fill_mod_numarray, .modify = ta_modify_numarray, .flush_mod = ta_flush_mod_numarray, }; /* * flow:hash cmds * * * ti->data: * [inv.mask4][inv.mask6][log2hsize4][log2hsize6] * [ 8][ 8[ 8][ 8] * * inv.mask4: 32 - mask * inv.mask6: * 1) _slow lookup: mask * 2) _aligned: (128 - mask) / 8 * 3) _64: 8 * * * pflags: * [hsize4][hsize6] * [ 16][ 16] */ struct fhashentry; SLIST_HEAD(fhashbhead, fhashentry); struct fhashentry { SLIST_ENTRY(fhashentry) next; uint8_t af; uint8_t proto; uint16_t spare0; uint16_t dport; uint16_t sport; uint32_t value; uint32_t spare1; }; struct fhashentry4 { struct fhashentry e; struct in_addr dip; struct in_addr sip; }; struct fhashentry6 { struct fhashentry e; struct in6_addr dip6; struct in6_addr sip6; }; struct fhash_cfg { struct fhashbhead *head; size_t size; size_t items; struct fhashentry4 fe4; struct fhashentry6 fe6; }; struct ta_buf_fhash { void *ent_ptr; struct fhashentry6 fe6; }; static __inline int cmp_flow_ent(struct fhashentry *a, struct fhashentry *b, size_t sz); static __inline uint32_t hash_flow4(struct fhashentry4 *f, int hsize); static __inline uint32_t hash_flow6(struct fhashentry6 *f, int hsize); static uint32_t hash_flow_ent(struct fhashentry *ent, uint32_t size); static int ta_lookup_fhash(struct table_info *ti, void *key, uint32_t keylen, uint32_t *val); static int ta_init_fhash(struct ip_fw_chain *ch, void **ta_state, struct table_info *ti, char *data, uint8_t tflags); static void ta_destroy_fhash(void *ta_state, struct table_info *ti); static void ta_dump_fhash_tinfo(void *ta_state, struct table_info *ti, ipfw_ta_tinfo *tinfo); static int ta_dump_fhash_tentry(void *ta_state, struct table_info *ti, void *e, ipfw_obj_tentry *tent); static int tei_to_fhash_ent(struct tentry_info *tei, struct fhashentry *ent); static int ta_find_fhash_tentry(void *ta_state, struct table_info *ti, ipfw_obj_tentry *tent); static void ta_foreach_fhash(void *ta_state, struct table_info *ti, ta_foreach_f *f, void *arg); static int ta_prepare_add_fhash(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf); static int ta_add_fhash(void *ta_state, struct table_info *ti, struct tentry_info *tei, void *ta_buf, uint32_t *pnum); static int ta_prepare_del_fhash(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf); static int ta_del_fhash(void *ta_state, struct table_info *ti, struct tentry_info *tei, void *ta_buf, uint32_t *pnum); static void ta_flush_fhash_entry(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf); static int ta_need_modify_fhash(void *ta_state, struct table_info *ti, uint32_t count, uint64_t *pflags); static int ta_prepare_mod_fhash(void *ta_buf, uint64_t *pflags); static int ta_fill_mod_fhash(void *ta_state, struct table_info *ti, void *ta_buf, uint64_t *pflags); static void ta_modify_fhash(void *ta_state, struct table_info *ti, void *ta_buf, uint64_t pflags); static void ta_flush_mod_fhash(void *ta_buf); static __inline int cmp_flow_ent(struct fhashentry *a, struct fhashentry *b, size_t sz) { uint64_t *ka, *kb; ka = (uint64_t *)(&a->next + 1); kb = (uint64_t *)(&b->next + 1); if (*ka == *kb && (memcmp(a + 1, b + 1, sz) == 0)) return (1); return (0); } static __inline uint32_t hash_flow4(struct fhashentry4 *f, int hsize) { uint32_t i; i = (f->dip.s_addr) ^ (f->sip.s_addr) ^ (f->e.dport) ^ (f->e.sport); return (i % (hsize - 1)); } static __inline uint32_t hash_flow6(struct fhashentry6 *f, int hsize) { uint32_t i; i = (f->dip6.__u6_addr.__u6_addr32[2]) ^ (f->dip6.__u6_addr.__u6_addr32[3]) ^ (f->sip6.__u6_addr.__u6_addr32[2]) ^ (f->sip6.__u6_addr.__u6_addr32[3]) ^ (f->e.dport) ^ (f->e.sport); return (i % (hsize - 1)); } static uint32_t hash_flow_ent(struct fhashentry *ent, uint32_t size) { uint32_t hash; if (ent->af == AF_INET) { hash = hash_flow4((struct fhashentry4 *)ent, size); } else { hash = hash_flow6((struct fhashentry6 *)ent, size); } return (hash); } static int ta_lookup_fhash(struct table_info *ti, void *key, uint32_t keylen, uint32_t *val) { struct fhashbhead *head; struct fhashentry *ent; struct fhashentry4 *m4; struct ipfw_flow_id *id; uint16_t hash, hsize; id = (struct ipfw_flow_id *)key; head = (struct fhashbhead *)ti->state; hsize = ti->data; m4 = (struct fhashentry4 *)ti->xstate; if (id->addr_type == 4) { struct fhashentry4 f; /* Copy hash mask */ f = *m4; f.dip.s_addr &= id->dst_ip; f.sip.s_addr &= id->src_ip; f.e.dport &= id->dst_port; f.e.sport &= id->src_port; f.e.proto &= id->proto; hash = hash_flow4(&f, hsize); SLIST_FOREACH(ent, &head[hash], next) { if (cmp_flow_ent(ent, &f.e, 2 * 4) != 0) { *val = ent->value; return (1); } } } else if (id->addr_type == 6) { struct fhashentry6 f; uint64_t *fp, *idp; /* Copy hash mask */ f = *((struct fhashentry6 *)(m4 + 1)); /* Handle lack of __u6_addr.__u6_addr64 */ fp = (uint64_t *)&f.dip6; idp = (uint64_t *)&id->dst_ip6; /* src IPv6 is stored after dst IPv6 */ *fp++ &= *idp++; *fp++ &= *idp++; *fp++ &= *idp++; *fp &= *idp; f.e.dport &= id->dst_port; f.e.sport &= id->src_port; f.e.proto &= id->proto; hash = hash_flow6(&f, hsize); SLIST_FOREACH(ent, &head[hash], next) { if (cmp_flow_ent(ent, &f.e, 2 * 16) != 0) { *val = ent->value; return (1); } } } return (0); } /* * New table. */ static int ta_init_fhash(struct ip_fw_chain *ch, void **ta_state, struct table_info *ti, char *data, uint8_t tflags) { int i; struct fhash_cfg *cfg; struct fhashentry4 *fe4; struct fhashentry6 *fe6; cfg = malloc(sizeof(struct fhash_cfg), M_IPFW, M_WAITOK | M_ZERO); cfg->size = 512; cfg->head = malloc(sizeof(struct fhashbhead) * cfg->size, M_IPFW, M_WAITOK | M_ZERO); for (i = 0; i < cfg->size; i++) SLIST_INIT(&cfg->head[i]); /* Fill in fe masks based on @tflags */ fe4 = &cfg->fe4; fe6 = &cfg->fe6; if (tflags & IPFW_TFFLAG_SRCIP) { memset(&fe4->sip, 0xFF, sizeof(fe4->sip)); memset(&fe6->sip6, 0xFF, sizeof(fe6->sip6)); } if (tflags & IPFW_TFFLAG_DSTIP) { memset(&fe4->dip, 0xFF, sizeof(fe4->dip)); memset(&fe6->dip6, 0xFF, sizeof(fe6->dip6)); } if (tflags & IPFW_TFFLAG_SRCPORT) { memset(&fe4->e.sport, 0xFF, sizeof(fe4->e.sport)); memset(&fe6->e.sport, 0xFF, sizeof(fe6->e.sport)); } if (tflags & IPFW_TFFLAG_DSTPORT) { memset(&fe4->e.dport, 0xFF, sizeof(fe4->e.dport)); memset(&fe6->e.dport, 0xFF, sizeof(fe6->e.dport)); } if (tflags & IPFW_TFFLAG_PROTO) { memset(&fe4->e.proto, 0xFF, sizeof(fe4->e.proto)); memset(&fe6->e.proto, 0xFF, sizeof(fe6->e.proto)); } fe4->e.af = AF_INET; fe6->e.af = AF_INET6; *ta_state = cfg; ti->state = cfg->head; ti->xstate = &cfg->fe4; ti->data = cfg->size; ti->lookup = ta_lookup_fhash; return (0); } static void ta_destroy_fhash(void *ta_state, struct table_info *ti) { struct fhash_cfg *cfg; struct fhashentry *ent, *ent_next; int i; cfg = (struct fhash_cfg *)ta_state; for (i = 0; i < cfg->size; i++) SLIST_FOREACH_SAFE(ent, &cfg->head[i], next, ent_next) free(ent, M_IPFW_TBL); free(cfg->head, M_IPFW); free(cfg, M_IPFW); } /* * Provide algo-specific table info */ static void ta_dump_fhash_tinfo(void *ta_state, struct table_info *ti, ipfw_ta_tinfo *tinfo) { struct fhash_cfg *cfg; cfg = (struct fhash_cfg *)ta_state; tinfo->flags = IPFW_TATFLAGS_AFITEM; tinfo->taclass4 = IPFW_TACLASS_HASH; tinfo->size4 = cfg->size; tinfo->count4 = cfg->items; tinfo->itemsize4 = sizeof(struct fhashentry4); tinfo->itemsize6 = sizeof(struct fhashentry6); } static int ta_dump_fhash_tentry(void *ta_state, struct table_info *ti, void *e, ipfw_obj_tentry *tent) { struct fhash_cfg *cfg; struct fhashentry *ent; struct fhashentry4 *fe4; #ifdef INET6 struct fhashentry6 *fe6; #endif struct tflow_entry *tfe; cfg = (struct fhash_cfg *)ta_state; ent = (struct fhashentry *)e; tfe = &tent->k.flow; tfe->af = ent->af; tfe->proto = ent->proto; tfe->dport = htons(ent->dport); tfe->sport = htons(ent->sport); tent->v.kidx = ent->value; tent->subtype = ent->af; if (ent->af == AF_INET) { fe4 = (struct fhashentry4 *)ent; tfe->a.a4.sip.s_addr = htonl(fe4->sip.s_addr); tfe->a.a4.dip.s_addr = htonl(fe4->dip.s_addr); tent->masklen = 32; #ifdef INET6 } else { fe6 = (struct fhashentry6 *)ent; tfe->a.a6.sip6 = fe6->sip6; tfe->a.a6.dip6 = fe6->dip6; tent->masklen = 128; #endif } return (0); } static int tei_to_fhash_ent(struct tentry_info *tei, struct fhashentry *ent) { #ifdef INET struct fhashentry4 *fe4; #endif #ifdef INET6 struct fhashentry6 *fe6; #endif struct tflow_entry *tfe; tfe = (struct tflow_entry *)tei->paddr; ent->af = tei->subtype; ent->proto = tfe->proto; ent->dport = ntohs(tfe->dport); ent->sport = ntohs(tfe->sport); if (tei->subtype == AF_INET) { #ifdef INET fe4 = (struct fhashentry4 *)ent; fe4->sip.s_addr = ntohl(tfe->a.a4.sip.s_addr); fe4->dip.s_addr = ntohl(tfe->a.a4.dip.s_addr); #endif #ifdef INET6 } else if (tei->subtype == AF_INET6) { fe6 = (struct fhashentry6 *)ent; fe6->sip6 = tfe->a.a6.sip6; fe6->dip6 = tfe->a.a6.dip6; #endif } else { /* Unknown CIDR type */ return (EINVAL); } return (0); } static int ta_find_fhash_tentry(void *ta_state, struct table_info *ti, ipfw_obj_tentry *tent) { struct fhash_cfg *cfg; struct fhashbhead *head; struct fhashentry *ent, *tmp; struct fhashentry6 fe6; struct tentry_info tei; int error; uint32_t hash; size_t sz; cfg = (struct fhash_cfg *)ta_state; ent = &fe6.e; memset(&fe6, 0, sizeof(fe6)); memset(&tei, 0, sizeof(tei)); tei.paddr = &tent->k.flow; tei.subtype = tent->subtype; if ((error = tei_to_fhash_ent(&tei, ent)) != 0) return (error); head = cfg->head; hash = hash_flow_ent(ent, cfg->size); if (tei.subtype == AF_INET) sz = 2 * sizeof(struct in_addr); else sz = 2 * sizeof(struct in6_addr); /* Check for existence */ SLIST_FOREACH(tmp, &head[hash], next) { if (cmp_flow_ent(tmp, ent, sz) != 0) { ta_dump_fhash_tentry(ta_state, ti, tmp, tent); return (0); } } return (ENOENT); } static void ta_foreach_fhash(void *ta_state, struct table_info *ti, ta_foreach_f *f, void *arg) { struct fhash_cfg *cfg; struct fhashentry *ent, *ent_next; int i; cfg = (struct fhash_cfg *)ta_state; for (i = 0; i < cfg->size; i++) SLIST_FOREACH_SAFE(ent, &cfg->head[i], next, ent_next) f(ent, arg); } static int ta_prepare_add_fhash(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf) { struct ta_buf_fhash *tb; struct fhashentry *ent; size_t sz; int error; tb = (struct ta_buf_fhash *)ta_buf; if (tei->subtype == AF_INET) sz = sizeof(struct fhashentry4); else if (tei->subtype == AF_INET6) sz = sizeof(struct fhashentry6); else return (EINVAL); ent = malloc(sz, M_IPFW_TBL, M_WAITOK | M_ZERO); error = tei_to_fhash_ent(tei, ent); if (error != 0) { free(ent, M_IPFW_TBL); return (error); } tb->ent_ptr = ent; return (0); } static int ta_add_fhash(void *ta_state, struct table_info *ti, struct tentry_info *tei, void *ta_buf, uint32_t *pnum) { struct fhash_cfg *cfg; struct fhashbhead *head; struct fhashentry *ent, *tmp; struct ta_buf_fhash *tb; int exists; uint32_t hash, value; size_t sz; cfg = (struct fhash_cfg *)ta_state; tb = (struct ta_buf_fhash *)ta_buf; ent = (struct fhashentry *)tb->ent_ptr; exists = 0; /* Read current value from @tei */ ent->value = tei->value; head = cfg->head; hash = hash_flow_ent(ent, cfg->size); if (tei->subtype == AF_INET) sz = 2 * sizeof(struct in_addr); else sz = 2 * sizeof(struct in6_addr); /* Check for existence */ SLIST_FOREACH(tmp, &head[hash], next) { if (cmp_flow_ent(tmp, ent, sz) != 0) { exists = 1; break; } } if (exists == 1) { if ((tei->flags & TEI_FLAGS_UPDATE) == 0) return (EEXIST); /* Record already exists. Update value if we're asked to */ /* Exchange values between tmp and @tei */ value = tmp->value; tmp->value = tei->value; tei->value = value; /* Indicate that update has happened instead of addition */ tei->flags |= TEI_FLAGS_UPDATED; *pnum = 0; } else { if ((tei->flags & TEI_FLAGS_DONTADD) != 0) return (EFBIG); SLIST_INSERT_HEAD(&head[hash], ent, next); tb->ent_ptr = NULL; *pnum = 1; /* Update counters and check if we need to grow hash */ cfg->items++; } return (0); } static int ta_prepare_del_fhash(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf) { struct ta_buf_fhash *tb; tb = (struct ta_buf_fhash *)ta_buf; return (tei_to_fhash_ent(tei, &tb->fe6.e)); } static int ta_del_fhash(void *ta_state, struct table_info *ti, struct tentry_info *tei, void *ta_buf, uint32_t *pnum) { struct fhash_cfg *cfg; struct fhashbhead *head; struct fhashentry *ent, *tmp; struct ta_buf_fhash *tb; uint32_t hash; size_t sz; cfg = (struct fhash_cfg *)ta_state; tb = (struct ta_buf_fhash *)ta_buf; ent = &tb->fe6.e; head = cfg->head; hash = hash_flow_ent(ent, cfg->size); if (tei->subtype == AF_INET) sz = 2 * sizeof(struct in_addr); else sz = 2 * sizeof(struct in6_addr); /* Check for existence */ SLIST_FOREACH(tmp, &head[hash], next) { if (cmp_flow_ent(tmp, ent, sz) == 0) continue; SLIST_REMOVE(&head[hash], tmp, fhashentry, next); tei->value = tmp->value; *pnum = 1; cfg->items--; tb->ent_ptr = tmp; return (0); } return (ENOENT); } static void ta_flush_fhash_entry(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf) { struct ta_buf_fhash *tb; tb = (struct ta_buf_fhash *)ta_buf; if (tb->ent_ptr != NULL) free(tb->ent_ptr, M_IPFW_TBL); } /* * Hash growing callbacks. */ static int ta_need_modify_fhash(void *ta_state, struct table_info *ti, uint32_t count, uint64_t *pflags) { struct fhash_cfg *cfg; cfg = (struct fhash_cfg *)ta_state; if (cfg->items > cfg->size && cfg->size < 65536) { *pflags = cfg->size * 2; return (1); } return (0); } /* * Allocate new, larger fhash. */ static int ta_prepare_mod_fhash(void *ta_buf, uint64_t *pflags) { struct mod_item *mi; struct fhashbhead *head; int i; mi = (struct mod_item *)ta_buf; memset(mi, 0, sizeof(struct mod_item)); mi->size = *pflags; head = malloc(sizeof(struct fhashbhead) * mi->size, M_IPFW, M_WAITOK | M_ZERO); for (i = 0; i < mi->size; i++) SLIST_INIT(&head[i]); mi->main_ptr = head; return (0); } /* * Copy data from old runtime array to new one. */ static int ta_fill_mod_fhash(void *ta_state, struct table_info *ti, void *ta_buf, uint64_t *pflags) { /* In is not possible to do rehash if we're not holidng WLOCK. */ return (0); } /* * Switch old & new arrays. */ static void ta_modify_fhash(void *ta_state, struct table_info *ti, void *ta_buf, uint64_t pflags) { struct mod_item *mi; struct fhash_cfg *cfg; struct fhashbhead *old_head, *new_head; struct fhashentry *ent, *ent_next; int i; uint32_t nhash; size_t old_size; mi = (struct mod_item *)ta_buf; cfg = (struct fhash_cfg *)ta_state; old_size = cfg->size; old_head = ti->state; new_head = (struct fhashbhead *)mi->main_ptr; for (i = 0; i < old_size; i++) { SLIST_FOREACH_SAFE(ent, &old_head[i], next, ent_next) { nhash = hash_flow_ent(ent, mi->size); SLIST_INSERT_HEAD(&new_head[nhash], ent, next); } } ti->state = new_head; ti->data = mi->size; cfg->head = new_head; cfg->size = mi->size; mi->main_ptr = old_head; } /* * Free unneded array. */ static void ta_flush_mod_fhash(void *ta_buf) { struct mod_item *mi; mi = (struct mod_item *)ta_buf; if (mi->main_ptr != NULL) free(mi->main_ptr, M_IPFW); } struct table_algo flow_hash = { .name = "flow:hash", .type = IPFW_TABLE_FLOW, .flags = TA_FLAG_DEFAULT, .ta_buf_size = sizeof(struct ta_buf_fhash), .init = ta_init_fhash, .destroy = ta_destroy_fhash, .prepare_add = ta_prepare_add_fhash, .prepare_del = ta_prepare_del_fhash, .add = ta_add_fhash, .del = ta_del_fhash, .flush_entry = ta_flush_fhash_entry, .foreach = ta_foreach_fhash, .dump_tentry = ta_dump_fhash_tentry, .find_tentry = ta_find_fhash_tentry, .dump_tinfo = ta_dump_fhash_tinfo, .need_modify = ta_need_modify_fhash, .prepare_mod = ta_prepare_mod_fhash, .fill_mod = ta_fill_mod_fhash, .modify = ta_modify_fhash, .flush_mod = ta_flush_mod_fhash, }; /* * Kernel fibs bindings. * * Implementation: * * Runtime part: * - fully relies on route API * - fib number is stored in ti->data * */ static struct rtentry *lookup_kfib(void *key, int keylen, int fib); static int ta_lookup_kfib(struct table_info *ti, void *key, uint32_t keylen, uint32_t *val); static int kfib_parse_opts(int *pfib, char *data); static void ta_print_kfib_config(void *ta_state, struct table_info *ti, char *buf, size_t bufsize); static int ta_init_kfib(struct ip_fw_chain *ch, void **ta_state, struct table_info *ti, char *data, uint8_t tflags); static void ta_destroy_kfib(void *ta_state, struct table_info *ti); static void ta_dump_kfib_tinfo(void *ta_state, struct table_info *ti, ipfw_ta_tinfo *tinfo); static int contigmask(uint8_t *p, int len); static int ta_dump_kfib_tentry(void *ta_state, struct table_info *ti, void *e, ipfw_obj_tentry *tent); static int ta_find_kfib_tentry(void *ta_state, struct table_info *ti, ipfw_obj_tentry *tent); static void ta_foreach_kfib(void *ta_state, struct table_info *ti, ta_foreach_f *f, void *arg); static struct rtentry * lookup_kfib(void *key, int keylen, int fib) { struct sockaddr *s; if (keylen == 4) { struct sockaddr_in sin; bzero(&sin, sizeof(sin)); sin.sin_len = sizeof(struct sockaddr_in); sin.sin_family = AF_INET; sin.sin_addr.s_addr = *(in_addr_t *)key; s = (struct sockaddr *)&sin; } else { struct sockaddr_in6 sin6; bzero(&sin6, sizeof(sin6)); sin6.sin6_len = sizeof(struct sockaddr_in6); sin6.sin6_family = AF_INET6; sin6.sin6_addr = *(struct in6_addr *)key; s = (struct sockaddr *)&sin6; } return (rtalloc1_fib(s, 0, 0, fib)); } static int ta_lookup_kfib(struct table_info *ti, void *key, uint32_t keylen, uint32_t *val) { struct rtentry *rte; if ((rte = lookup_kfib(key, keylen, ti->data)) == NULL) return (0); *val = 0; RTFREE_LOCKED(rte); return (1); } /* Parse 'fib=%d' */ static int kfib_parse_opts(int *pfib, char *data) { char *pdel, *pend, *s; int fibnum; if (data == NULL) return (0); if ((pdel = strchr(data, ' ')) == NULL) return (0); while (*pdel == ' ') pdel++; if (strncmp(pdel, "fib=", 4) != 0) return (EINVAL); if ((s = strchr(pdel, ' ')) != NULL) *s++ = '\0'; pdel += 4; /* Need \d+ */ fibnum = strtol(pdel, &pend, 10); if (*pend != '\0') return (EINVAL); *pfib = fibnum; return (0); } static void ta_print_kfib_config(void *ta_state, struct table_info *ti, char *buf, size_t bufsize) { if (ti->data != 0) snprintf(buf, bufsize, "%s fib=%lu", "addr:kfib", ti->data); else snprintf(buf, bufsize, "%s", "addr:kfib"); } static int ta_init_kfib(struct ip_fw_chain *ch, void **ta_state, struct table_info *ti, char *data, uint8_t tflags) { int error, fibnum; fibnum = 0; if ((error = kfib_parse_opts(&fibnum, data)) != 0) return (error); if (fibnum >= rt_numfibs) return (E2BIG); ti->data = fibnum; ti->lookup = ta_lookup_kfib; return (0); } /* * Destroys table @ti */ static void ta_destroy_kfib(void *ta_state, struct table_info *ti) { } /* * Provide algo-specific table info */ static void ta_dump_kfib_tinfo(void *ta_state, struct table_info *ti, ipfw_ta_tinfo *tinfo) { tinfo->flags = IPFW_TATFLAGS_AFDATA; tinfo->taclass4 = IPFW_TACLASS_RADIX; tinfo->count4 = 0; tinfo->itemsize4 = sizeof(struct rtentry); tinfo->taclass6 = IPFW_TACLASS_RADIX; tinfo->count6 = 0; tinfo->itemsize6 = sizeof(struct rtentry); } static int contigmask(uint8_t *p, int len) { int i, n; for (i = 0; i < len ; i++) if ( (p[i/8] & (1 << (7 - (i%8)))) == 0) /* first bit unset */ break; for (n= i + 1; n < len; n++) if ( (p[n/8] & (1 << (7 - (n % 8)))) != 0) return (-1); /* mask not contiguous */ return (i); } static int ta_dump_kfib_tentry(void *ta_state, struct table_info *ti, void *e, ipfw_obj_tentry *tent) { struct rtentry *rte; #ifdef INET struct sockaddr_in *addr, *mask; #endif #ifdef INET6 struct sockaddr_in6 *addr6, *mask6; #endif int len; rte = (struct rtentry *)e; addr = (struct sockaddr_in *)rt_key(rte); mask = (struct sockaddr_in *)rt_mask(rte); len = 0; /* Guess IPv4/IPv6 radix by sockaddr family */ #ifdef INET if (addr->sin_family == AF_INET) { tent->k.addr.s_addr = addr->sin_addr.s_addr; len = 32; if (mask != NULL) len = contigmask((uint8_t *)&mask->sin_addr, 32); if (len == -1) len = 0; tent->masklen = len; tent->subtype = AF_INET; tent->v.kidx = 0; /* Do we need to put GW here? */ } #endif #ifdef INET6 if (addr->sin_family == AF_INET6) { addr6 = (struct sockaddr_in6 *)addr; mask6 = (struct sockaddr_in6 *)mask; memcpy(&tent->k, &addr6->sin6_addr, sizeof(struct in6_addr)); len = 128; if (mask6 != NULL) len = contigmask((uint8_t *)&mask6->sin6_addr, 128); if (len == -1) len = 0; tent->masklen = len; tent->subtype = AF_INET6; tent->v.kidx = 0; } #endif return (0); } static int ta_find_kfib_tentry(void *ta_state, struct table_info *ti, ipfw_obj_tentry *tent) { struct rtentry *rte; void *key; int keylen; if (tent->subtype == AF_INET) { key = &tent->k.addr; keylen = sizeof(struct in_addr); } else { key = &tent->k.addr6; keylen = sizeof(struct in6_addr); } if ((rte = lookup_kfib(key, keylen, ti->data)) == NULL) return (0); if (rte != NULL) { ta_dump_kfib_tentry(ta_state, ti, rte, tent); RTFREE_LOCKED(rte); return (0); } return (ENOENT); } static void ta_foreach_kfib(void *ta_state, struct table_info *ti, ta_foreach_f *f, void *arg) { struct rib_head *rh; + RIB_LOCK_READER; int error; rh = rt_tables_get_rnh(ti->data, AF_INET); if (rh != NULL) { RIB_RLOCK(rh); error = rh->rnh_walktree(&rh->head, (walktree_f_t *)f, arg); RIB_RUNLOCK(rh); } rh = rt_tables_get_rnh(ti->data, AF_INET6); if (rh != NULL) { RIB_RLOCK(rh); error = rh->rnh_walktree(&rh->head, (walktree_f_t *)f, arg); RIB_RUNLOCK(rh); } } struct table_algo addr_kfib = { .name = "addr:kfib", .type = IPFW_TABLE_ADDR, .flags = TA_FLAG_READONLY, .ta_buf_size = 0, .init = ta_init_kfib, .destroy = ta_destroy_kfib, .foreach = ta_foreach_kfib, .dump_tentry = ta_dump_kfib_tentry, .find_tentry = ta_find_kfib_tentry, .dump_tinfo = ta_dump_kfib_tinfo, .print_config = ta_print_kfib_config, }; void ipfw_table_algo_init(struct ip_fw_chain *ch) { size_t sz; /* * Register all algorithms presented here. */ sz = sizeof(struct table_algo); ipfw_add_table_algo(ch, &addr_radix, sz, &addr_radix.idx); ipfw_add_table_algo(ch, &addr_hash, sz, &addr_hash.idx); ipfw_add_table_algo(ch, &iface_idx, sz, &iface_idx.idx); ipfw_add_table_algo(ch, &number_array, sz, &number_array.idx); ipfw_add_table_algo(ch, &flow_hash, sz, &flow_hash.idx); ipfw_add_table_algo(ch, &addr_kfib, sz, &addr_kfib.idx); } void ipfw_table_algo_destroy(struct ip_fw_chain *ch) { ipfw_del_table_algo(ch, addr_radix.idx); ipfw_del_table_algo(ch, addr_hash.idx); ipfw_del_table_algo(ch, iface_idx.idx); ipfw_del_table_algo(ch, number_array.idx); ipfw_del_table_algo(ch, flow_hash.idx); ipfw_del_table_algo(ch, addr_kfib.idx); } Index: projects/routing/sys/nfs/bootp_subr.c =================================================================== --- projects/routing/sys/nfs/bootp_subr.c (revision 274335) +++ projects/routing/sys/nfs/bootp_subr.c (revision 274336) @@ -1,1866 +1,1866 @@ /*- * Copyright (c) 1995 Gordon Ross, Adam Glass * Copyright (c) 1992 Regents of the University of California. * All rights reserved. * * This software was developed by the Computer Systems Engineering group * at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and * contributed to Berkeley. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Lawrence Berkeley Laboratory and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * based on: * nfs/krpc_subr.c * $NetBSD: krpc_subr.c,v 1.10 1995/08/08 20:43:43 gwr Exp $ */ #include __FBSDID("$FreeBSD$"); #include "opt_bootp.h" #include "opt_nfs.h" #include "opt_rootdevname.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define BOOTP_MIN_LEN 300 /* Minimum size of bootp udp packet */ #ifndef BOOTP_SETTLE_DELAY #define BOOTP_SETTLE_DELAY 3 #endif /* * Wait 10 seconds for interface appearance * USB ethernet adapters might require some time to pop up */ #ifndef BOOTP_IFACE_WAIT_TIMEOUT #define BOOTP_IFACE_WAIT_TIMEOUT 10 #endif /* * What is the longest we will wait before re-sending a request? * Note this is also the frequency of "RPC timeout" messages. * The re-send loop count sup linearly to this maximum, so the * first complaint will happen after (1+2+3+4+5)=15 seconds. */ #define MAX_RESEND_DELAY 5 /* seconds */ /* Definitions from RFC951 */ struct bootp_packet { u_int8_t op; u_int8_t htype; u_int8_t hlen; u_int8_t hops; u_int32_t xid; u_int16_t secs; u_int16_t flags; struct in_addr ciaddr; struct in_addr yiaddr; struct in_addr siaddr; struct in_addr giaddr; unsigned char chaddr[16]; char sname[64]; char file[128]; unsigned char vend[1222]; }; struct bootpc_ifcontext { STAILQ_ENTRY(bootpc_ifcontext) next; struct bootp_packet call; struct bootp_packet reply; int replylen; int overload; union { struct ifreq _ifreq; struct in_aliasreq _in_alias_req; } _req; #define ireq _req._ifreq #define iareq _req._in_alias_req struct ifnet *ifp; struct sockaddr_dl *sdl; struct sockaddr_in myaddr; struct sockaddr_in netmask; struct sockaddr_in gw; int gotgw; int gotnetmask; int gotrootpath; int outstanding; int sentmsg; u_int32_t xid; enum { IF_BOOTP_UNRESOLVED, IF_BOOTP_RESOLVED, IF_BOOTP_FAILED, IF_DHCP_UNRESOLVED, IF_DHCP_OFFERED, IF_DHCP_RESOLVED, IF_DHCP_FAILED, } state; int dhcpquerytype; /* dhcp type sent */ struct in_addr dhcpserver; int gotdhcpserver; }; #define TAG_MAXLEN 1024 struct bootpc_tagcontext { char buf[TAG_MAXLEN + 1]; int overload; int badopt; int badtag; int foundopt; int taglen; }; struct bootpc_globalcontext { STAILQ_HEAD(, bootpc_ifcontext) interfaces; u_int32_t xid; int any_root_overrides; int gotrootpath; int gotgw; int ifnum; int secs; int starttime; struct bootp_packet reply; int replylen; struct bootpc_ifcontext *setrootfs; struct bootpc_ifcontext *sethostname; struct bootpc_tagcontext tmptag; struct bootpc_tagcontext tag; }; #define IPPORT_BOOTPC 68 #define IPPORT_BOOTPS 67 #define BOOTP_REQUEST 1 #define BOOTP_REPLY 2 /* Common tags */ #define TAG_PAD 0 /* Pad option, implicit length 1 */ #define TAG_SUBNETMASK 1 /* RFC 950 subnet mask */ #define TAG_ROUTERS 3 /* Routers (in order of preference) */ #define TAG_HOSTNAME 12 /* Client host name */ #define TAG_ROOT 17 /* Root path */ /* DHCP specific tags */ #define TAG_OVERLOAD 52 /* Option Overload */ #define TAG_MAXMSGSIZE 57 /* Maximum DHCP Message Size */ #define TAG_END 255 /* End Option (i.e. no more options) */ /* Overload values */ #define OVERLOAD_FILE 1 #define OVERLOAD_SNAME 2 /* Site specific tags: */ #define TAG_ROOTOPTS 130 #define TAG_COOKIE 134 /* ascii info for userland, via sysctl */ #define TAG_DHCP_MSGTYPE 53 #define TAG_DHCP_REQ_ADDR 50 #define TAG_DHCP_SERVERID 54 #define TAG_DHCP_LEASETIME 51 #define TAG_VENDOR_INDENTIFIER 60 #define DHCP_NOMSG 0 #define DHCP_DISCOVER 1 #define DHCP_OFFER 2 #define DHCP_REQUEST 3 #define DHCP_ACK 5 /* NFS read/write block size */ #ifndef BOOTP_BLOCKSIZE #define BOOTP_BLOCKSIZE 8192 #endif static char bootp_cookie[128]; static struct socket *bootp_so; SYSCTL_STRING(_kern, OID_AUTO, bootp_cookie, CTLFLAG_RD, bootp_cookie, 0, "Cookie (T134) supplied by bootp server"); /* mountd RPC */ static int md_mount(struct sockaddr_in *mdsin, char *path, u_char *fhp, int *fhsizep, struct nfs_args *args, struct thread *td); static int setfs(struct sockaddr_in *addr, char *path, char *p, const struct in_addr *siaddr); static int getdec(char **ptr); static int getip(char **ptr, struct in_addr *ip); static void mountopts(struct nfs_args *args, char *p); static int xdr_opaque_decode(struct mbuf **ptr, u_char *buf, int len); static int xdr_int_decode(struct mbuf **ptr, int *iptr); static void print_in_addr(struct in_addr addr); static void print_sin_addr(struct sockaddr_in *addr); static void clear_sinaddr(struct sockaddr_in *sin); static void allocifctx(struct bootpc_globalcontext *gctx); static void bootpc_compose_query(struct bootpc_ifcontext *ifctx, struct thread *td); static unsigned char *bootpc_tag(struct bootpc_tagcontext *tctx, struct bootp_packet *bp, int len, int tag); static void bootpc_tag_helper(struct bootpc_tagcontext *tctx, unsigned char *start, int len, int tag); #ifdef BOOTP_DEBUG void bootpboot_p_sa(struct sockaddr *sa, struct sockaddr *ma); void bootpboot_p_rtentry(struct rtentry *rt); void bootpboot_p_tree(struct radix_node *rn); void bootpboot_p_rtlist(void); void bootpboot_p_if(struct ifnet *ifp, struct ifaddr *ifa); void bootpboot_p_iflist(void); #endif static int bootpc_call(struct bootpc_globalcontext *gctx, struct thread *td); static void bootpc_fakeup_interface(struct bootpc_ifcontext *ifctx, struct thread *td); static int bootpc_adjust_interface(struct bootpc_ifcontext *ifctx, struct bootpc_globalcontext *gctx, struct thread *td); static void bootpc_decode_reply(struct nfsv3_diskless *nd, struct bootpc_ifcontext *ifctx, struct bootpc_globalcontext *gctx); static int bootpc_received(struct bootpc_globalcontext *gctx, struct bootpc_ifcontext *ifctx); static __inline int bootpc_ifctx_isresolved(struct bootpc_ifcontext *ifctx); static __inline int bootpc_ifctx_isunresolved(struct bootpc_ifcontext *ifctx); static __inline int bootpc_ifctx_isfailed(struct bootpc_ifcontext *ifctx); /* * In order to have multiple active interfaces with address 0.0.0.0 * and be able to send data to a selected interface, we first set * mask to /8 on all interfaces, and temporarily set it to /0 when * doing sosend(). */ #ifdef BOOTP_DEBUG void bootpboot_p_sa(struct sockaddr *sa, struct sockaddr *ma) { if (sa == NULL) { printf("(sockaddr *) "); return; } switch (sa->sa_family) { case AF_INET: { struct sockaddr_in *sin; sin = (struct sockaddr_in *) sa; printf("inet "); print_sin_addr(sin); if (ma != NULL) { sin = (struct sockaddr_in *) ma; printf(" mask "); print_sin_addr(sin); } } break; case AF_LINK: { struct sockaddr_dl *sli; int i; sli = (struct sockaddr_dl *) sa; printf("link %.*s ", sli->sdl_nlen, sli->sdl_data); for (i = 0; i < sli->sdl_alen; i++) { if (i > 0) printf(":"); printf("%x", ((unsigned char *) LLADDR(sli))[i]); } } break; default: printf("af%d", sa->sa_family); } } void bootpboot_p_rtentry(struct rtentry *rt) { bootpboot_p_sa(rt_key(rt), rt_mask(rt)); printf(" "); bootpboot_p_sa(rt->rt_gateway, NULL); printf(" "); printf("flags %x", (unsigned short) rt->rt_flags); printf(" %d", (int) rt->rt_expire); printf(" %s\n", rt->rt_ifp->if_xname); } void bootpboot_p_tree(struct radix_node *rn) { while (rn != NULL) { if (rn->rn_bit < 0) { if ((rn->rn_flags & RNF_ROOT) != 0) { } else { bootpboot_p_rtentry((struct rtentry *) rn); } rn = rn->rn_dupedkey; } else { bootpboot_p_tree(rn->rn_left); bootpboot_p_tree(rn->rn_right); return; } } } void bootpboot_p_rtlist(void) { struct rib_head *rh; printf("Routing table:\n"); rnh = rt_tables_get_rnh(0, AF_INET); if (rnh == NULL) return; - RIB_RLOCK(rnh); /* could sleep XXX */ + RIB_CFG_RLOCK(rnh); /* could sleep XXX */ bootpboot_p_tree(rh->rnh_treetop); - RIB_RUNLOCK(rnh); + RIB_CFG_RUNLOCK(rnh); } void bootpboot_p_if(struct ifnet *ifp, struct ifaddr *ifa) { printf("%s flags %x, addr ", ifp->if_xname, ifp->if_flags); print_sin_addr((struct sockaddr_in *) ifa->ifa_addr); printf(", broadcast "); print_sin_addr((struct sockaddr_in *) ifa->ifa_dstaddr); printf(", netmask "); print_sin_addr((struct sockaddr_in *) ifa->ifa_netmask); printf("\n"); } void bootpboot_p_iflist(void) { struct ifnet *ifp; struct ifaddr *ifa; printf("Interface list:\n"); IFNET_RLOCK(); for (ifp = TAILQ_FIRST(&V_ifnet); ifp != NULL; ifp = TAILQ_NEXT(ifp, if_link)) { for (ifa = TAILQ_FIRST(&ifp->if_addrhead); ifa != NULL; ifa = TAILQ_NEXT(ifa, ifa_link)) if (ifa->ifa_addr->sa_family == AF_INET) bootpboot_p_if(ifp, ifa); } IFNET_RUNLOCK(); } #endif /* defined(BOOTP_DEBUG) */ static void clear_sinaddr(struct sockaddr_in *sin) { bzero(sin, sizeof(*sin)); sin->sin_len = sizeof(*sin); sin->sin_family = AF_INET; sin->sin_addr.s_addr = INADDR_ANY; /* XXX: htonl(INAADDR_ANY) ? */ sin->sin_port = 0; } static void allocifctx(struct bootpc_globalcontext *gctx) { struct bootpc_ifcontext *ifctx; ifctx = malloc(sizeof(*ifctx), M_TEMP, M_WAITOK | M_ZERO); ifctx->xid = gctx->xid; #ifdef BOOTP_NO_DHCP ifctx->state = IF_BOOTP_UNRESOLVED; #else ifctx->state = IF_DHCP_UNRESOLVED; #endif gctx->xid += 0x100; STAILQ_INSERT_TAIL(&gctx->interfaces, ifctx, next); } static __inline int bootpc_ifctx_isresolved(struct bootpc_ifcontext *ifctx) { if (ifctx->state == IF_BOOTP_RESOLVED || ifctx->state == IF_DHCP_RESOLVED) return 1; return 0; } static __inline int bootpc_ifctx_isunresolved(struct bootpc_ifcontext *ifctx) { if (ifctx->state == IF_BOOTP_UNRESOLVED || ifctx->state == IF_DHCP_UNRESOLVED) return 1; return 0; } static __inline int bootpc_ifctx_isfailed(struct bootpc_ifcontext *ifctx) { if (ifctx->state == IF_BOOTP_FAILED || ifctx->state == IF_DHCP_FAILED) return 1; return 0; } static int bootpc_received(struct bootpc_globalcontext *gctx, struct bootpc_ifcontext *ifctx) { unsigned char dhcpreplytype; char *p; /* * Need timeout for fallback to less * desirable alternative. */ /* This call used for the side effect (badopt flag) */ (void) bootpc_tag(&gctx->tmptag, &gctx->reply, gctx->replylen, TAG_END); /* If packet is invalid, ignore it */ if (gctx->tmptag.badopt != 0) return 0; p = bootpc_tag(&gctx->tmptag, &gctx->reply, gctx->replylen, TAG_DHCP_MSGTYPE); if (p != NULL) dhcpreplytype = *p; else dhcpreplytype = DHCP_NOMSG; switch (ifctx->dhcpquerytype) { case DHCP_DISCOVER: if (dhcpreplytype != DHCP_OFFER /* Normal DHCP offer */ #ifndef BOOTP_FORCE_DHCP && dhcpreplytype != DHCP_NOMSG /* Fallback to BOOTP */ #endif ) return 0; break; case DHCP_REQUEST: if (dhcpreplytype != DHCP_ACK) return 0; case DHCP_NOMSG: break; } /* Ignore packet unless it gives us a root tag we didn't have */ if ((ifctx->state == IF_BOOTP_RESOLVED || (ifctx->dhcpquerytype == DHCP_DISCOVER && (ifctx->state == IF_DHCP_OFFERED || ifctx->state == IF_DHCP_RESOLVED))) && (bootpc_tag(&gctx->tmptag, &ifctx->reply, ifctx->replylen, TAG_ROOT) != NULL || bootpc_tag(&gctx->tmptag, &gctx->reply, gctx->replylen, TAG_ROOT) == NULL)) return 0; bcopy(&gctx->reply, &ifctx->reply, gctx->replylen); ifctx->replylen = gctx->replylen; /* XXX: Only reset if 'perfect' response */ if (ifctx->state == IF_BOOTP_UNRESOLVED) ifctx->state = IF_BOOTP_RESOLVED; else if (ifctx->state == IF_DHCP_UNRESOLVED && ifctx->dhcpquerytype == DHCP_DISCOVER) { if (dhcpreplytype == DHCP_OFFER) ifctx->state = IF_DHCP_OFFERED; else ifctx->state = IF_BOOTP_RESOLVED; /* Fallback */ } else if (ifctx->state == IF_DHCP_OFFERED && ifctx->dhcpquerytype == DHCP_REQUEST) ifctx->state = IF_DHCP_RESOLVED; if (ifctx->dhcpquerytype == DHCP_DISCOVER && ifctx->state != IF_BOOTP_RESOLVED) { p = bootpc_tag(&gctx->tmptag, &ifctx->reply, ifctx->replylen, TAG_DHCP_SERVERID); if (p != NULL && gctx->tmptag.taglen == 4) { memcpy(&ifctx->dhcpserver, p, 4); ifctx->gotdhcpserver = 1; } else ifctx->gotdhcpserver = 0; return 1; } ifctx->gotrootpath = (bootpc_tag(&gctx->tmptag, &ifctx->reply, ifctx->replylen, TAG_ROOT) != NULL); ifctx->gotgw = (bootpc_tag(&gctx->tmptag, &ifctx->reply, ifctx->replylen, TAG_ROUTERS) != NULL); ifctx->gotnetmask = (bootpc_tag(&gctx->tmptag, &ifctx->reply, ifctx->replylen, TAG_SUBNETMASK) != NULL); return 1; } static int bootpc_call(struct bootpc_globalcontext *gctx, struct thread *td) { struct sockaddr_in *sin, dst; struct uio auio; struct sockopt sopt; struct iovec aio; int error, on, rcvflg, timo, len; time_t atimo; time_t rtimo; struct timeval tv; struct bootpc_ifcontext *ifctx; int outstanding; int gotrootpath; int retry; const char *s; tv.tv_sec = 1; tv.tv_usec = 0; bzero(&sopt, sizeof(sopt)); sopt.sopt_dir = SOPT_SET; sopt.sopt_level = SOL_SOCKET; sopt.sopt_name = SO_RCVTIMEO; sopt.sopt_val = &tv; sopt.sopt_valsize = sizeof tv; error = sosetopt(bootp_so, &sopt); if (error != 0) goto out; /* * Enable broadcast. */ on = 1; sopt.sopt_name = SO_BROADCAST; sopt.sopt_val = &on; sopt.sopt_valsize = sizeof on; error = sosetopt(bootp_so, &sopt); if (error != 0) goto out; /* * Disable routing. */ on = 1; sopt.sopt_name = SO_DONTROUTE; sopt.sopt_val = &on; sopt.sopt_valsize = sizeof on; error = sosetopt(bootp_so, &sopt); if (error != 0) goto out; /* * Bind the local endpoint to a bootp client port. */ sin = &dst; clear_sinaddr(sin); sin->sin_port = htons(IPPORT_BOOTPC); error = sobind(bootp_so, (struct sockaddr *)sin, td); if (error != 0) { printf("bind failed\n"); goto out; } /* * Setup socket address for the server. */ sin = &dst; clear_sinaddr(sin); sin->sin_addr.s_addr = INADDR_BROADCAST; sin->sin_port = htons(IPPORT_BOOTPS); /* * Send it, repeatedly, until a reply is received, * but delay each re-send by an increasing amount. * If the delay hits the maximum, start complaining. */ timo = 0; rtimo = 0; for (;;) { outstanding = 0; gotrootpath = 0; STAILQ_FOREACH(ifctx, &gctx->interfaces, next) { if (bootpc_ifctx_isresolved(ifctx) != 0 && bootpc_tag(&gctx->tmptag, &ifctx->reply, ifctx->replylen, TAG_ROOT) != NULL) gotrootpath = 1; } STAILQ_FOREACH(ifctx, &gctx->interfaces, next) { struct in_aliasreq *ifra = &ifctx->iareq; sin = (struct sockaddr_in *)&ifra->ifra_mask; ifctx->outstanding = 0; if (bootpc_ifctx_isresolved(ifctx) != 0 && gotrootpath != 0) { continue; } if (bootpc_ifctx_isfailed(ifctx) != 0) continue; outstanding++; ifctx->outstanding = 1; /* Proceed to next step in DHCP negotiation */ if ((ifctx->state == IF_DHCP_OFFERED && ifctx->dhcpquerytype != DHCP_REQUEST) || (ifctx->state == IF_DHCP_UNRESOLVED && ifctx->dhcpquerytype != DHCP_DISCOVER) || (ifctx->state == IF_BOOTP_UNRESOLVED && ifctx->dhcpquerytype != DHCP_NOMSG)) { ifctx->sentmsg = 0; bootpc_compose_query(ifctx, td); } /* Send BOOTP request (or re-send). */ if (ifctx->sentmsg == 0) { switch(ifctx->dhcpquerytype) { case DHCP_DISCOVER: s = "DHCP Discover"; break; case DHCP_REQUEST: s = "DHCP Request"; break; case DHCP_NOMSG: default: s = "BOOTP Query"; break; } printf("Sending %s packet from " "interface %s (%*D)\n", s, ifctx->ireq.ifr_name, ifctx->sdl->sdl_alen, (unsigned char *) LLADDR(ifctx->sdl), ":"); ifctx->sentmsg = 1; } aio.iov_base = (caddr_t) &ifctx->call; aio.iov_len = sizeof(ifctx->call); auio.uio_iov = &aio; auio.uio_iovcnt = 1; auio.uio_segflg = UIO_SYSSPACE; auio.uio_rw = UIO_WRITE; auio.uio_offset = 0; auio.uio_resid = sizeof(ifctx->call); auio.uio_td = td; /* Set netmask to 0.0.0.0 */ clear_sinaddr(sin); error = ifioctl(bootp_so, SIOCAIFADDR, (caddr_t)ifra, td); if (error != 0) panic("%s: SIOCAIFADDR, error=%d", __func__, error); error = sosend(bootp_so, (struct sockaddr *) &dst, &auio, NULL, NULL, 0, td); if (error != 0) printf("%s: sosend: %d state %08x\n", __func__, error, (int )bootp_so->so_state); /* Set netmask to 255.0.0.0 */ sin->sin_addr.s_addr = htonl(IN_CLASSA_NET); error = ifioctl(bootp_so, SIOCAIFADDR, (caddr_t)ifra, td); if (error != 0) panic("%s: SIOCAIFADDR, error=%d", __func__, error); } if (outstanding == 0 && (rtimo == 0 || time_second >= rtimo)) { error = 0; goto out; } /* Determine new timeout. */ if (timo < MAX_RESEND_DELAY) timo++; else { printf("DHCP/BOOTP timeout for server "); print_sin_addr(&dst); printf("\n"); } /* * Wait for up to timo seconds for a reply. * The socket receive timeout was set to 1 second. */ atimo = timo + time_second; while (time_second < atimo) { aio.iov_base = (caddr_t) &gctx->reply; aio.iov_len = sizeof(gctx->reply); auio.uio_iov = &aio; auio.uio_iovcnt = 1; auio.uio_segflg = UIO_SYSSPACE; auio.uio_rw = UIO_READ; auio.uio_offset = 0; auio.uio_resid = sizeof(gctx->reply); auio.uio_td = td; rcvflg = 0; error = soreceive(bootp_so, NULL, &auio, NULL, NULL, &rcvflg); gctx->secs = time_second - gctx->starttime; STAILQ_FOREACH(ifctx, &gctx->interfaces, next) { if (bootpc_ifctx_isresolved(ifctx) != 0 || bootpc_ifctx_isfailed(ifctx) != 0) continue; ifctx->call.secs = htons(gctx->secs); } if (error == EWOULDBLOCK) continue; if (error != 0) goto out; len = sizeof(gctx->reply) - auio.uio_resid; /* Do we have the required number of bytes ? */ if (len < BOOTP_MIN_LEN) continue; gctx->replylen = len; /* Is it a reply? */ if (gctx->reply.op != BOOTP_REPLY) continue; /* Is this an answer to our query */ STAILQ_FOREACH(ifctx, &gctx->interfaces, next) { if (gctx->reply.xid != ifctx->call.xid) continue; /* Same HW address size ? */ if (gctx->reply.hlen != ifctx->call.hlen) continue; /* Correct HW address ? */ if (bcmp(gctx->reply.chaddr, ifctx->call.chaddr, ifctx->call.hlen) != 0) continue; break; } if (ifctx != NULL) { s = bootpc_tag(&gctx->tmptag, &gctx->reply, gctx->replylen, TAG_DHCP_MSGTYPE); if (s != NULL) { switch (*s) { case DHCP_OFFER: s = "DHCP Offer"; break; case DHCP_ACK: s = "DHCP Ack"; break; default: s = "DHCP (unexpected)"; break; } } else s = "BOOTP Reply"; printf("Received %s packet" " on %s from ", s, ifctx->ireq.ifr_name); print_in_addr(gctx->reply.siaddr); if (gctx->reply.giaddr.s_addr != htonl(INADDR_ANY)) { printf(" via "); print_in_addr(gctx->reply.giaddr); } if (bootpc_received(gctx, ifctx) != 0) { printf(" (accepted)"); if (ifctx->outstanding) { ifctx->outstanding = 0; outstanding--; } /* Network settle delay */ if (outstanding == 0) atimo = time_second + BOOTP_SETTLE_DELAY; } else printf(" (ignored)"); if (ifctx->gotrootpath || gctx->any_root_overrides) { gotrootpath = 1; rtimo = time_second + BOOTP_SETTLE_DELAY; if (ifctx->gotrootpath) printf(" (got root path)"); } printf("\n"); } } /* while secs */ #ifdef BOOTP_TIMEOUT if (gctx->secs > BOOTP_TIMEOUT && BOOTP_TIMEOUT > 0) break; #endif /* Force a retry if halfway in DHCP negotiation */ retry = 0; STAILQ_FOREACH(ifctx, &gctx->interfaces, next) if (ifctx->state == IF_DHCP_OFFERED) { if (ifctx->dhcpquerytype == DHCP_DISCOVER) retry = 1; else ifctx->state = IF_DHCP_UNRESOLVED; } if (retry != 0) continue; if (gotrootpath != 0) { gctx->gotrootpath = gotrootpath; if (rtimo != 0 && time_second >= rtimo) break; } } /* forever send/receive */ /* * XXX: These are errors of varying seriousness being silently * ignored */ STAILQ_FOREACH(ifctx, &gctx->interfaces, next) if (bootpc_ifctx_isresolved(ifctx) == 0) { printf("%s timeout for interface %s\n", ifctx->dhcpquerytype != DHCP_NOMSG ? "DHCP" : "BOOTP", ifctx->ireq.ifr_name); } if (gctx->gotrootpath != 0) { #if 0 printf("Got a root path, ignoring remaining timeout\n"); #endif error = 0; goto out; } #ifndef BOOTP_NFSROOT STAILQ_FOREACH(ifctx, &gctx->interfaces, next) if (bootpc_ifctx_isresolved(ifctx) != 0) { error = 0; goto out; } #endif error = ETIMEDOUT; out: return (error); } static void bootpc_fakeup_interface(struct bootpc_ifcontext *ifctx, struct thread *td) { struct ifreq *ifr; struct in_aliasreq *ifra; struct sockaddr_in *sin; int error; ifr = &ifctx->ireq; ifra = &ifctx->iareq; /* * Bring up the interface. * * Get the old interface flags and or IFF_UP into them; if * IFF_UP set blindly, interface selection can be clobbered. */ error = ifioctl(bootp_so, SIOCGIFFLAGS, (caddr_t)ifr, td); if (error != 0) panic("%s: SIOCGIFFLAGS, error=%d", __func__, error); ifr->ifr_flags |= IFF_UP; error = ifioctl(bootp_so, SIOCSIFFLAGS, (caddr_t)ifr, td); if (error != 0) panic("%s: SIOCSIFFLAGS, error=%d", __func__, error); /* * Do enough of ifconfig(8) so that the chosen interface * can talk to the servers. Set address to 0.0.0.0/8 and * broadcast address to local broadcast. */ sin = (struct sockaddr_in *)&ifra->ifra_addr; clear_sinaddr(sin); sin = (struct sockaddr_in *)&ifra->ifra_mask; clear_sinaddr(sin); sin->sin_addr.s_addr = htonl(IN_CLASSA_NET); sin = (struct sockaddr_in *)&ifra->ifra_broadaddr; clear_sinaddr(sin); sin->sin_addr.s_addr = htonl(INADDR_BROADCAST); error = ifioctl(bootp_so, SIOCAIFADDR, (caddr_t)ifra, td); if (error != 0) panic("%s: SIOCAIFADDR, error=%d", __func__, error); } static void bootpc_shutdown_interface(struct bootpc_ifcontext *ifctx, struct thread *td) { struct ifreq *ifr; struct sockaddr_in *sin; int error; ifr = &ifctx->ireq; printf("Shutdown interface %s\n", ifctx->ireq.ifr_name); error = ifioctl(bootp_so, SIOCGIFFLAGS, (caddr_t)ifr, td); if (error != 0) panic("%s: SIOCGIFFLAGS, error=%d", __func__, error); ifr->ifr_flags &= ~IFF_UP; error = ifioctl(bootp_so, SIOCSIFFLAGS, (caddr_t)ifr, td); if (error != 0) panic("%s: SIOCSIFFLAGS, error=%d", __func__, error); sin = (struct sockaddr_in *) &ifr->ifr_addr; clear_sinaddr(sin); error = ifioctl(bootp_so, SIOCDIFADDR, (caddr_t) ifr, td); if (error != 0) panic("%s: SIOCDIFADDR, error=%d", __func__, error); } static int bootpc_adjust_interface(struct bootpc_ifcontext *ifctx, struct bootpc_globalcontext *gctx, struct thread *td) { int error; struct sockaddr_in defdst; struct sockaddr_in defmask; struct sockaddr_in *sin; struct ifreq *ifr; struct in_aliasreq *ifra; struct sockaddr_in *myaddr; struct sockaddr_in *netmask; struct sockaddr_in *gw; ifr = &ifctx->ireq; ifra = &ifctx->iareq; myaddr = &ifctx->myaddr; netmask = &ifctx->netmask; gw = &ifctx->gw; if (bootpc_ifctx_isresolved(ifctx) == 0) { /* Shutdown interfaces where BOOTP failed */ bootpc_shutdown_interface(ifctx, td); return (0); } printf("Adjusted interface %s\n", ifctx->ireq.ifr_name); /* * Do enough of ifconfig(8) so that the chosen interface * can talk to the servers. (just set the address) */ sin = (struct sockaddr_in *) &ifr->ifr_addr; clear_sinaddr(sin); error = ifioctl(bootp_so, SIOCDIFADDR, (caddr_t) ifr, td); if (error != 0) panic("%s: SIOCDIFADDR, error=%d", __func__, error); bcopy(myaddr, &ifra->ifra_addr, sizeof(*myaddr)); bcopy(netmask, &ifra->ifra_mask, sizeof(*netmask)); clear_sinaddr(&ifra->ifra_broadaddr); ifra->ifra_broadaddr.sin_addr.s_addr = myaddr->sin_addr.s_addr | ~netmask->sin_addr.s_addr; error = ifioctl(bootp_so, SIOCAIFADDR, (caddr_t)ifra, td); if (error != 0) panic("%s: SIOCAIFADDR, error=%d", __func__, error); /* Add new default route */ if (ifctx->gotgw != 0 || gctx->gotgw == 0) { clear_sinaddr(&defdst); clear_sinaddr(&defmask); /* XXX MRT just table 0 */ error = rtrequest_fib(RTM_ADD, (struct sockaddr *) &defdst, (struct sockaddr *) gw, (struct sockaddr *) &defmask, (RTF_UP | RTF_GATEWAY | RTF_STATIC), NULL, RT_DEFAULT_FIB); if (error != 0) { printf("%s: RTM_ADD, error=%d\n", __func__, error); return (error); } } return (0); } static int setfs(struct sockaddr_in *addr, char *path, char *p, const struct in_addr *siaddr) { if (getip(&p, &addr->sin_addr) == 0) { if (siaddr != NULL && *p == '/') bcopy(siaddr, &addr->sin_addr, sizeof(struct in_addr)); else return 0; } else { if (*p != ':') return 0; p++; } addr->sin_len = sizeof(struct sockaddr_in); addr->sin_family = AF_INET; strlcpy(path, p, MNAMELEN); return 1; } static int getip(char **ptr, struct in_addr *addr) { char *p; unsigned int ip; int val; p = *ptr; ip = 0; if (((val = getdec(&p)) < 0) || (val > 255)) return 0; ip = val << 24; if (*p != '.') return 0; p++; if (((val = getdec(&p)) < 0) || (val > 255)) return 0; ip |= (val << 16); if (*p != '.') return 0; p++; if (((val = getdec(&p)) < 0) || (val > 255)) return 0; ip |= (val << 8); if (*p != '.') return 0; p++; if (((val = getdec(&p)) < 0) || (val > 255)) return 0; ip |= val; addr->s_addr = htonl(ip); *ptr = p; return 1; } static int getdec(char **ptr) { char *p; int ret; p = *ptr; ret = 0; if ((*p < '0') || (*p > '9')) return -1; while ((*p >= '0') && (*p <= '9')) { ret = ret * 10 + (*p - '0'); p++; } *ptr = p; return ret; } static void mountopts(struct nfs_args *args, char *p) { args->version = NFS_ARGSVERSION; args->rsize = BOOTP_BLOCKSIZE; args->wsize = BOOTP_BLOCKSIZE; args->flags = NFSMNT_RSIZE | NFSMNT_WSIZE | NFSMNT_RESVPORT; args->sotype = SOCK_DGRAM; if (p != NULL) nfs_parse_options(p, args); } static int xdr_opaque_decode(struct mbuf **mptr, u_char *buf, int len) { struct mbuf *m; int alignedlen; m = *mptr; alignedlen = ( len + 3 ) & ~3; if (m->m_len < alignedlen) { m = m_pullup(m, alignedlen); if (m == NULL) { *mptr = NULL; return EBADRPC; } } bcopy(mtod(m, u_char *), buf, len); m_adj(m, alignedlen); *mptr = m; return 0; } static int xdr_int_decode(struct mbuf **mptr, int *iptr) { u_int32_t i; if (xdr_opaque_decode(mptr, (u_char *) &i, sizeof(u_int32_t)) != 0) return EBADRPC; *iptr = fxdr_unsigned(u_int32_t, i); return 0; } static void print_sin_addr(struct sockaddr_in *sin) { print_in_addr(sin->sin_addr); } static void print_in_addr(struct in_addr addr) { unsigned int ip; ip = ntohl(addr.s_addr); printf("%d.%d.%d.%d", ip >> 24, (ip >> 16) & 255, (ip >> 8) & 255, ip & 255); } static void bootpc_compose_query(struct bootpc_ifcontext *ifctx, struct thread *td) { unsigned char *vendp; unsigned char vendor_client[64]; uint32_t leasetime; uint8_t vendor_client_len; ifctx->gotrootpath = 0; bzero((caddr_t) &ifctx->call, sizeof(ifctx->call)); /* bootpc part */ ifctx->call.op = BOOTP_REQUEST; /* BOOTREQUEST */ ifctx->call.htype = 1; /* 10mb ethernet */ ifctx->call.hlen = ifctx->sdl->sdl_alen;/* Hardware address length */ ifctx->call.hops = 0; if (bootpc_ifctx_isunresolved(ifctx) != 0) ifctx->xid++; ifctx->call.xid = txdr_unsigned(ifctx->xid); bcopy(LLADDR(ifctx->sdl), &ifctx->call.chaddr, ifctx->sdl->sdl_alen); vendp = ifctx->call.vend; *vendp++ = 99; /* RFC1048 cookie */ *vendp++ = 130; *vendp++ = 83; *vendp++ = 99; *vendp++ = TAG_MAXMSGSIZE; *vendp++ = 2; *vendp++ = (sizeof(struct bootp_packet) >> 8) & 255; *vendp++ = sizeof(struct bootp_packet) & 255; snprintf(vendor_client, sizeof(vendor_client), "%s:%s:%s", ostype, MACHINE, osrelease); vendor_client_len = strlen(vendor_client); *vendp++ = TAG_VENDOR_INDENTIFIER; *vendp++ = vendor_client_len; memcpy(vendp, vendor_client, vendor_client_len); vendp += vendor_client_len; ifctx->dhcpquerytype = DHCP_NOMSG; switch (ifctx->state) { case IF_DHCP_UNRESOLVED: *vendp++ = TAG_DHCP_MSGTYPE; *vendp++ = 1; *vendp++ = DHCP_DISCOVER; ifctx->dhcpquerytype = DHCP_DISCOVER; ifctx->gotdhcpserver = 0; break; case IF_DHCP_OFFERED: *vendp++ = TAG_DHCP_MSGTYPE; *vendp++ = 1; *vendp++ = DHCP_REQUEST; ifctx->dhcpquerytype = DHCP_REQUEST; *vendp++ = TAG_DHCP_REQ_ADDR; *vendp++ = 4; memcpy(vendp, &ifctx->reply.yiaddr, 4); vendp += 4; if (ifctx->gotdhcpserver != 0) { *vendp++ = TAG_DHCP_SERVERID; *vendp++ = 4; memcpy(vendp, &ifctx->dhcpserver, 4); vendp += 4; } *vendp++ = TAG_DHCP_LEASETIME; *vendp++ = 4; leasetime = htonl(300); memcpy(vendp, &leasetime, 4); vendp += 4; break; default: break; } *vendp = TAG_END; ifctx->call.secs = 0; ifctx->call.flags = htons(0x8000); /* We need a broadcast answer */ } static int bootpc_hascookie(struct bootp_packet *bp) { return (bp->vend[0] == 99 && bp->vend[1] == 130 && bp->vend[2] == 83 && bp->vend[3] == 99); } static void bootpc_tag_helper(struct bootpc_tagcontext *tctx, unsigned char *start, int len, int tag) { unsigned char *j; unsigned char *ej; unsigned char code; if (tctx->badtag != 0 || tctx->badopt != 0) return; j = start; ej = j + len; while (j < ej) { code = *j++; if (code == TAG_PAD) continue; if (code == TAG_END) return; if (j >= ej || j + *j + 1 > ej) { tctx->badopt = 1; return; } len = *j++; if (code == tag) { if (tctx->taglen + len > TAG_MAXLEN) { tctx->badtag = 1; return; } tctx->foundopt = 1; if (len > 0) memcpy(tctx->buf + tctx->taglen, j, len); tctx->taglen += len; } if (code == TAG_OVERLOAD) tctx->overload = *j; j += len; } } static unsigned char * bootpc_tag(struct bootpc_tagcontext *tctx, struct bootp_packet *bp, int len, int tag) { tctx->overload = 0; tctx->badopt = 0; tctx->badtag = 0; tctx->foundopt = 0; tctx->taglen = 0; if (bootpc_hascookie(bp) == 0) return NULL; bootpc_tag_helper(tctx, &bp->vend[4], (unsigned char *) bp + len - &bp->vend[4], tag); if ((tctx->overload & OVERLOAD_FILE) != 0) bootpc_tag_helper(tctx, (unsigned char *) bp->file, sizeof(bp->file), tag); if ((tctx->overload & OVERLOAD_SNAME) != 0) bootpc_tag_helper(tctx, (unsigned char *) bp->sname, sizeof(bp->sname), tag); if (tctx->badopt != 0 || tctx->badtag != 0 || tctx->foundopt == 0) return NULL; tctx->buf[tctx->taglen] = '\0'; return tctx->buf; } static void bootpc_decode_reply(struct nfsv3_diskless *nd, struct bootpc_ifcontext *ifctx, struct bootpc_globalcontext *gctx) { char *p, *s; unsigned int ip; ifctx->gotgw = 0; ifctx->gotnetmask = 0; clear_sinaddr(&ifctx->myaddr); clear_sinaddr(&ifctx->netmask); clear_sinaddr(&ifctx->gw); ifctx->myaddr.sin_addr = ifctx->reply.yiaddr; ip = ntohl(ifctx->myaddr.sin_addr.s_addr); printf("%s at ", ifctx->ireq.ifr_name); print_sin_addr(&ifctx->myaddr); printf(" server "); print_in_addr(ifctx->reply.siaddr); ifctx->gw.sin_addr = ifctx->reply.giaddr; if (ifctx->reply.giaddr.s_addr != htonl(INADDR_ANY)) { printf(" via gateway "); print_in_addr(ifctx->reply.giaddr); } /* This call used for the side effect (overload flag) */ (void) bootpc_tag(&gctx->tmptag, &ifctx->reply, ifctx->replylen, TAG_END); if ((gctx->tmptag.overload & OVERLOAD_SNAME) == 0) if (ifctx->reply.sname[0] != '\0') printf(" server name %s", ifctx->reply.sname); if ((gctx->tmptag.overload & OVERLOAD_FILE) == 0) if (ifctx->reply.file[0] != '\0') printf(" boot file %s", ifctx->reply.file); printf("\n"); p = bootpc_tag(&gctx->tag, &ifctx->reply, ifctx->replylen, TAG_SUBNETMASK); if (p != NULL) { if (gctx->tag.taglen != 4) panic("bootpc: subnet mask len is %d", gctx->tag.taglen); bcopy(p, &ifctx->netmask.sin_addr, 4); ifctx->gotnetmask = 1; printf("subnet mask "); print_sin_addr(&ifctx->netmask); printf(" "); } p = bootpc_tag(&gctx->tag, &ifctx->reply, ifctx->replylen, TAG_ROUTERS); if (p != NULL) { /* Routers */ if (gctx->tag.taglen % 4) panic("bootpc: Router Len is %d", gctx->tag.taglen); if (gctx->tag.taglen > 0) { bcopy(p, &ifctx->gw.sin_addr, 4); printf("router "); print_sin_addr(&ifctx->gw); printf(" "); ifctx->gotgw = 1; gctx->gotgw = 1; } } /* * Choose a root filesystem. If a value is forced in the environment * and it contains "nfs:", use it unconditionally. Otherwise, if the * kernel is compiled with the ROOTDEVNAME option, then use it if: * - The server doesn't provide a pathname. * - The boothowto flags include RB_DFLTROOT (user said to override * the server value). */ p = NULL; if ((s = kern_getenv("vfs.root.mountfrom")) != NULL) { if ((p = strstr(s, "nfs:")) != NULL) p = strdup(p + 4, M_TEMP); freeenv(s); } if (p == NULL) { p = bootpc_tag(&gctx->tag, &ifctx->reply, ifctx->replylen, TAG_ROOT); } #ifdef ROOTDEVNAME if ((p == NULL || (boothowto & RB_DFLTROOT) != 0) && (p = strstr(ROOTDEVNAME, "nfs:")) != NULL) { p += 4; } #endif if (p != NULL) { if (gctx->setrootfs != NULL) { printf("rootfs %s (ignored) ", p); } else if (setfs(&nd->root_saddr, nd->root_hostnam, p, &ifctx->reply.siaddr)) { if (*p == '/') { printf("root_server "); print_sin_addr(&nd->root_saddr); printf(" "); } printf("rootfs %s ", p); gctx->gotrootpath = 1; ifctx->gotrootpath = 1; gctx->setrootfs = ifctx; p = bootpc_tag(&gctx->tag, &ifctx->reply, ifctx->replylen, TAG_ROOTOPTS); if (p != NULL) { mountopts(&nd->root_args, p); printf("rootopts %s ", p); } } else panic("Failed to set rootfs to %s", p); } p = bootpc_tag(&gctx->tag, &ifctx->reply, ifctx->replylen, TAG_HOSTNAME); if (p != NULL) { if (gctx->tag.taglen >= MAXHOSTNAMELEN) panic("bootpc: hostname >= %d bytes", MAXHOSTNAMELEN); if (gctx->sethostname != NULL) { printf("hostname %s (ignored) ", p); } else { strcpy(nd->my_hostnam, p); mtx_lock(&prison0.pr_mtx); strcpy(prison0.pr_hostname, p); mtx_unlock(&prison0.pr_mtx); printf("hostname %s ", p); gctx->sethostname = ifctx; } } p = bootpc_tag(&gctx->tag, &ifctx->reply, ifctx->replylen, TAG_COOKIE); if (p != NULL) { /* store in a sysctl variable */ int i, l = sizeof(bootp_cookie) - 1; for (i = 0; i < l && p[i] != '\0'; i++) bootp_cookie[i] = p[i]; p[i] = '\0'; } printf("\n"); if (ifctx->gotnetmask == 0) { if (IN_CLASSA(ntohl(ifctx->myaddr.sin_addr.s_addr))) ifctx->netmask.sin_addr.s_addr = htonl(IN_CLASSA_NET); else if (IN_CLASSB(ntohl(ifctx->myaddr.sin_addr.s_addr))) ifctx->netmask.sin_addr.s_addr = htonl(IN_CLASSB_NET); else ifctx->netmask.sin_addr.s_addr = htonl(IN_CLASSC_NET); } if (ifctx->gotgw == 0) { /* Use proxyarp */ ifctx->gw.sin_addr.s_addr = ifctx->myaddr.sin_addr.s_addr; } } void bootpc_init(void) { struct bootpc_ifcontext *ifctx; /* Interface BOOTP contexts */ struct bootpc_globalcontext *gctx; /* Global BOOTP context */ struct ifnet *ifp; struct sockaddr_dl *sdl; struct ifaddr *ifa; int error; #ifndef BOOTP_WIRED_TO int ifcnt; #endif struct nfsv3_diskless *nd; struct thread *td; int timeout; int delay; timeout = BOOTP_IFACE_WAIT_TIMEOUT * hz; delay = hz / 10; nd = &nfsv3_diskless; td = curthread; /* * If already filled in, don't touch it here */ if (nfs_diskless_valid != 0) return; gctx = malloc(sizeof(*gctx), M_TEMP, M_WAITOK | M_ZERO); STAILQ_INIT(&gctx->interfaces); gctx->xid = ~0xFFFF; gctx->starttime = time_second; /* * If ROOTDEVNAME is defined or vfs.root.mountfrom is set then we have * root-path overrides that can potentially let us boot even if we don't * get a root path from the server, so we can treat that as a non-error. */ #ifdef ROOTDEVNAME gctx->any_root_overrides = 1; #else gctx->any_root_overrides = testenv("vfs.root.mountfrom"); #endif /* * Find a network interface. */ CURVNET_SET(TD_TO_VNET(td)); #ifdef BOOTP_WIRED_TO printf("%s: wired to interface '%s'\n", __func__, __XSTRING(BOOTP_WIRED_TO)); allocifctx(gctx); #else /* * Preallocate interface context storage, if another interface * attaches and wins the race, it won't be eligible for bootp. */ ifcnt = 0; IFNET_RLOCK(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { if ((ifp->if_flags & (IFF_LOOPBACK | IFF_POINTOPOINT | IFF_BROADCAST)) != IFF_BROADCAST) continue; switch (ifp->if_alloctype) { case IFT_ETHER: case IFT_FDDI: case IFT_ISO88025: break; default: continue; } ifcnt++; } IFNET_RUNLOCK(); if (ifcnt == 0) panic("%s: no eligible interfaces", __func__); for (; ifcnt > 0; ifcnt--) allocifctx(gctx); #endif retry: ifctx = STAILQ_FIRST(&gctx->interfaces); IFNET_RLOCK(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (ifctx == NULL) break; #ifdef BOOTP_WIRED_TO if (strcmp(ifp->if_xname, __XSTRING(BOOTP_WIRED_TO)) != 0) continue; #else if ((ifp->if_flags & (IFF_LOOPBACK | IFF_POINTOPOINT | IFF_BROADCAST)) != IFF_BROADCAST) continue; switch (ifp->if_alloctype) { case IFT_ETHER: case IFT_FDDI: case IFT_ISO88025: break; default: continue; } #endif strlcpy(ifctx->ireq.ifr_name, ifp->if_xname, sizeof(ifctx->ireq.ifr_name)); ifctx->ifp = ifp; /* Get HW address */ sdl = NULL; TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) if (ifa->ifa_addr->sa_family == AF_LINK) { sdl = (struct sockaddr_dl *)ifa->ifa_addr; if (sdl->sdl_type == IFT_ETHER) break; } if (sdl == NULL) panic("bootpc: Unable to find HW address for %s", ifctx->ireq.ifr_name); ifctx->sdl = sdl; ifctx = STAILQ_NEXT(ifctx, next); } IFNET_RUNLOCK(); CURVNET_RESTORE(); if (STAILQ_EMPTY(&gctx->interfaces) || STAILQ_FIRST(&gctx->interfaces)->ifp == NULL) { if (timeout > 0) { pause("bootpc", delay); timeout -= delay; goto retry; } #ifdef BOOTP_WIRED_TO panic("%s: Could not find interface specified " "by BOOTP_WIRED_TO: " __XSTRING(BOOTP_WIRED_TO), __func__); #else panic("%s: no suitable interface", __func__); #endif } error = socreate(AF_INET, &bootp_so, SOCK_DGRAM, 0, td->td_ucred, td); if (error != 0) panic("%s: socreate, error=%d", __func__, error); STAILQ_FOREACH(ifctx, &gctx->interfaces, next) bootpc_fakeup_interface(ifctx, td); STAILQ_FOREACH(ifctx, &gctx->interfaces, next) bootpc_compose_query(ifctx, td); error = bootpc_call(gctx, td); if (error != 0) { printf("BOOTP call failed\n"); } mountopts(&nd->root_args, NULL); STAILQ_FOREACH(ifctx, &gctx->interfaces, next) if (bootpc_ifctx_isresolved(ifctx) != 0) bootpc_decode_reply(nd, ifctx, gctx); #ifdef BOOTP_NFSROOT if (gctx->gotrootpath == 0 && gctx->any_root_overrides == 0) panic("bootpc: No root path offered"); #endif STAILQ_FOREACH(ifctx, &gctx->interfaces, next) bootpc_adjust_interface(ifctx, gctx, td); soclose(bootp_so); STAILQ_FOREACH(ifctx, &gctx->interfaces, next) if (ifctx->gotrootpath != 0) break; if (ifctx == NULL) { STAILQ_FOREACH(ifctx, &gctx->interfaces, next) if (bootpc_ifctx_isresolved(ifctx) != 0) break; } if (ifctx == NULL) goto out; if (gctx->gotrootpath != 0) { kern_setenv("boot.netif.name", ifctx->ifp->if_xname); error = md_mount(&nd->root_saddr, nd->root_hostnam, nd->root_fh, &nd->root_fhsize, &nd->root_args, td); if (error != 0) { if (gctx->any_root_overrides == 0) panic("nfs_boot: mount root, error=%d", error); else goto out; } rootdevnames[0] = "nfs:"; #ifdef NFSCLIENT rootdevnames[1] = "oldnfs:"; #endif nfs_diskless_valid = 3; } strcpy(nd->myif.ifra_name, ifctx->ireq.ifr_name); bcopy(&ifctx->myaddr, &nd->myif.ifra_addr, sizeof(ifctx->myaddr)); bcopy(&ifctx->myaddr, &nd->myif.ifra_broadaddr, sizeof(ifctx->myaddr)); ((struct sockaddr_in *) &nd->myif.ifra_broadaddr)->sin_addr.s_addr = ifctx->myaddr.sin_addr.s_addr | ~ ifctx->netmask.sin_addr.s_addr; bcopy(&ifctx->netmask, &nd->myif.ifra_mask, sizeof(ifctx->netmask)); out: while((ifctx = STAILQ_FIRST(&gctx->interfaces)) != NULL) { STAILQ_REMOVE_HEAD(&gctx->interfaces, next); free(ifctx, M_TEMP); } free(gctx, M_TEMP); } /* * RPC: mountd/mount * Given a server pathname, get an NFS file handle. * Also, sets sin->sin_port to the NFS service port. */ static int md_mount(struct sockaddr_in *mdsin, char *path, u_char *fhp, int *fhsizep, struct nfs_args *args, struct thread *td) { struct mbuf *m; int error; int authunixok; int authcount; int authver; #define RPCPROG_MNT 100005 #define RPCMNT_VER1 1 #define RPCMNT_VER3 3 #define RPCMNT_MOUNT 1 #define AUTH_SYS 1 /* unix style (uid, gids) */ #define AUTH_UNIX AUTH_SYS /* XXX honor v2/v3 flags in args->flags? */ #ifdef BOOTP_NFSV3 /* First try NFS v3 */ /* Get port number for MOUNTD. */ error = krpc_portmap(mdsin, RPCPROG_MNT, RPCMNT_VER3, &mdsin->sin_port, td); if (error == 0) { m = xdr_string_encode(path, strlen(path)); /* Do RPC to mountd. */ error = krpc_call(mdsin, RPCPROG_MNT, RPCMNT_VER3, RPCMNT_MOUNT, &m, NULL, td); } if (error == 0) { args->flags |= NFSMNT_NFSV3; } else { #endif /* Fallback to NFS v2 */ /* Get port number for MOUNTD. */ error = krpc_portmap(mdsin, RPCPROG_MNT, RPCMNT_VER1, &mdsin->sin_port, td); if (error != 0) return error; m = xdr_string_encode(path, strlen(path)); /* Do RPC to mountd. */ error = krpc_call(mdsin, RPCPROG_MNT, RPCMNT_VER1, RPCMNT_MOUNT, &m, NULL, td); if (error != 0) return error; /* message already freed */ #ifdef BOOTP_NFSV3 } #endif if (xdr_int_decode(&m, &error) != 0 || error != 0) goto bad; if ((args->flags & NFSMNT_NFSV3) != 0) { if (xdr_int_decode(&m, fhsizep) != 0 || *fhsizep > NFSX_V3FHMAX || *fhsizep <= 0) goto bad; } else *fhsizep = NFSX_V2FH; if (xdr_opaque_decode(&m, fhp, *fhsizep) != 0) goto bad; if (args->flags & NFSMNT_NFSV3) { if (xdr_int_decode(&m, &authcount) != 0) goto bad; authunixok = 0; if (authcount < 0 || authcount > 100) goto bad; while (authcount > 0) { if (xdr_int_decode(&m, &authver) != 0) goto bad; if (authver == AUTH_UNIX) authunixok = 1; authcount--; } if (authunixok == 0) goto bad; } /* Set port number for NFS use. */ error = krpc_portmap(mdsin, NFS_PROG, (args->flags & NFSMNT_NFSV3) ? NFS_VER3 : NFS_VER2, &mdsin->sin_port, td); goto out; bad: error = EBADRPC; out: m_freem(m); return error; } SYSINIT(bootp_rootconf, SI_SUB_ROOT_CONF, SI_ORDER_FIRST, bootpc_init, NULL);