diff --git a/sys/net/if.c b/sys/net/if.c index c5f0a65721fc..2d3d85ca5ee8 100644 --- a/sys/net/if.c +++ b/sys/net/if.c @@ -1,5208 +1,5258 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2010 Bjoern A. Zeeb * Copyright (c) 1980, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)if.c 8.5 (Berkeley) 1/9/95 * $FreeBSD$ */ #include "opt_bpf.h" #include "opt_inet6.h" #include "opt_inet.h" #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(INET) || defined(INET6) #include #include #include #include #include #ifdef INET #include #include #endif /* INET */ #ifdef INET6 #include #include #endif /* INET6 */ #endif /* INET || INET6 */ #include /* * Consumers of struct ifreq such as tcpdump assume no pad between ifr_name * and ifr_ifru when it is used in SIOCGIFCONF. */ _Static_assert(sizeof(((struct ifreq *)0)->ifr_name) == offsetof(struct ifreq, ifr_ifru), "gap between ifr_name and ifr_ifru"); __read_mostly epoch_t net_epoch_preempt; #ifdef COMPAT_FREEBSD32 #include #include struct ifreq_buffer32 { uint32_t length; /* (size_t) */ uint32_t buffer; /* (void *) */ }; /* * Interface request structure used for socket * ioctl's. All interface ioctl's must have parameter * definitions which begin with ifr_name. The * remainder may be interface specific. */ struct ifreq32 { char ifr_name[IFNAMSIZ]; /* if name, e.g. "en0" */ union { struct sockaddr ifru_addr; struct sockaddr ifru_dstaddr; struct sockaddr ifru_broadaddr; struct ifreq_buffer32 ifru_buffer; short ifru_flags[2]; short ifru_index; int ifru_jid; int ifru_metric; int ifru_mtu; int ifru_phys; int ifru_media; uint32_t ifru_data; int ifru_cap[2]; u_int ifru_fib; u_char ifru_vlan_pcp; } ifr_ifru; }; CTASSERT(sizeof(struct ifreq) == sizeof(struct ifreq32)); CTASSERT(__offsetof(struct ifreq, ifr_ifru) == __offsetof(struct ifreq32, ifr_ifru)); struct ifconf32 { int32_t ifc_len; union { uint32_t ifcu_buf; uint32_t ifcu_req; } ifc_ifcu; }; #define SIOCGIFCONF32 _IOWR('i', 36, struct ifconf32) struct ifdrv32 { char ifd_name[IFNAMSIZ]; uint32_t ifd_cmd; uint32_t ifd_len; uint32_t ifd_data; }; #define SIOCSDRVSPEC32 _IOC_NEWTYPE(SIOCSDRVSPEC, struct ifdrv32) #define SIOCGDRVSPEC32 _IOC_NEWTYPE(SIOCGDRVSPEC, struct ifdrv32) struct ifgroupreq32 { char ifgr_name[IFNAMSIZ]; u_int ifgr_len; union { char ifgru_group[IFNAMSIZ]; uint32_t ifgru_groups; } ifgr_ifgru; }; #define SIOCAIFGROUP32 _IOC_NEWTYPE(SIOCAIFGROUP, struct ifgroupreq32) #define SIOCGIFGROUP32 _IOC_NEWTYPE(SIOCGIFGROUP, struct ifgroupreq32) #define SIOCDIFGROUP32 _IOC_NEWTYPE(SIOCDIFGROUP, struct ifgroupreq32) #define SIOCGIFGMEMB32 _IOC_NEWTYPE(SIOCGIFGMEMB, struct ifgroupreq32) struct ifmediareq32 { char ifm_name[IFNAMSIZ]; int ifm_current; int ifm_mask; int ifm_status; int ifm_active; int ifm_count; uint32_t ifm_ulist; /* (int *) */ }; #define SIOCGIFMEDIA32 _IOC_NEWTYPE(SIOCGIFMEDIA, struct ifmediareq32) #define SIOCGIFXMEDIA32 _IOC_NEWTYPE(SIOCGIFXMEDIA, struct ifmediareq32) #endif /* COMPAT_FREEBSD32 */ union ifreq_union { struct ifreq ifr; #ifdef COMPAT_FREEBSD32 struct ifreq32 ifr32; #endif }; SYSCTL_NODE(_net, PF_LINK, link, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Link layers"); SYSCTL_NODE(_net_link, 0, generic, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Generic link-management"); SYSCTL_INT(_net_link, OID_AUTO, ifqmaxlen, CTLFLAG_RDTUN, &ifqmaxlen, 0, "max send queue size"); /* Log link state change events */ static int log_link_state_change = 1; SYSCTL_INT(_net_link, OID_AUTO, log_link_state_change, CTLFLAG_RW, &log_link_state_change, 0, "log interface link state change events"); /* Log promiscuous mode change events */ static int log_promisc_mode_change = 1; SYSCTL_INT(_net_link, OID_AUTO, log_promisc_mode_change, CTLFLAG_RDTUN, &log_promisc_mode_change, 1, "log promiscuous mode change events"); /* Interface description */ static unsigned int ifdescr_maxlen = 1024; SYSCTL_UINT(_net, OID_AUTO, ifdescr_maxlen, CTLFLAG_RW, &ifdescr_maxlen, 0, "administrative maximum length for interface description"); static MALLOC_DEFINE(M_IFDESCR, "ifdescr", "ifnet descriptions"); /* global sx for non-critical path ifdescr */ static struct sx ifdescr_sx; SX_SYSINIT(ifdescr_sx, &ifdescr_sx, "ifnet descr"); void (*ng_ether_link_state_p)(struct ifnet *ifp, int state); void (*lagg_linkstate_p)(struct ifnet *ifp, int state); /* These are external hooks for CARP. */ void (*carp_linkstate_p)(struct ifnet *ifp); void (*carp_demote_adj_p)(int, char *); int (*carp_master_p)(struct ifaddr *); #if defined(INET) || defined(INET6) int (*carp_forus_p)(struct ifnet *ifp, u_char *dhost); int (*carp_output_p)(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *sa); int (*carp_ioctl_p)(struct ifreq *, u_long, struct thread *); int (*carp_attach_p)(struct ifaddr *, int); void (*carp_detach_p)(struct ifaddr *, bool); #endif #ifdef INET int (*carp_iamatch_p)(struct ifaddr *, uint8_t **); #endif #ifdef INET6 struct ifaddr *(*carp_iamatch6_p)(struct ifnet *ifp, struct in6_addr *taddr6); caddr_t (*carp_macmatch6_p)(struct ifnet *ifp, struct mbuf *m, const struct in6_addr *taddr); #endif struct mbuf *(*tbr_dequeue_ptr)(struct ifaltq *, int) = NULL; /* * XXX: Style; these should be sorted alphabetically, and unprototyped * static functions should be prototyped. Currently they are sorted by * declaration order. */ static void if_attachdomain(void *); static void if_attachdomain1(struct ifnet *); static int ifconf(u_long, caddr_t); static void if_input_default(struct ifnet *, struct mbuf *); static int if_requestencap_default(struct ifnet *, struct if_encap_req *); static int if_setflag(struct ifnet *, int, int, int *, int); static int if_transmit_default(struct ifnet *ifp, struct mbuf *m); static void if_unroute(struct ifnet *, int flag, int fam); static int if_delmulti_locked(struct ifnet *, struct ifmultiaddr *, int); static void do_link_state_change(void *, int); static int if_getgroup(struct ifgroupreq *, struct ifnet *); static int if_getgroupmembers(struct ifgroupreq *); static void if_delgroups(struct ifnet *); static void if_attach_internal(struct ifnet *, bool); static int if_detach_internal(struct ifnet *, bool); static void if_siocaddmulti(void *, int); static void if_link_ifnet(struct ifnet *); static bool if_unlink_ifnet(struct ifnet *, bool); #ifdef VIMAGE static int if_vmove(struct ifnet *, struct vnet *); #endif #ifdef INET6 /* * XXX: declare here to avoid to include many inet6 related files.. * should be more generalized? */ extern void nd6_setmtu(struct ifnet *); #endif /* ipsec helper hooks */ VNET_DEFINE(struct hhook_head *, ipsec_hhh_in[HHOOK_IPSEC_COUNT]); VNET_DEFINE(struct hhook_head *, ipsec_hhh_out[HHOOK_IPSEC_COUNT]); int ifqmaxlen = IFQ_MAXLEN; VNET_DEFINE(struct ifnethead, ifnet); /* depend on static init XXX */ VNET_DEFINE(struct ifgrouphead, ifg_head); /* Table of ifnet by index. */ static int if_index; static int if_indexlim = 8; static struct ifindex_entry { struct ifnet *ife_ifnet; uint16_t ife_gencnt; } *ifindex_table; SYSCTL_NODE(_net_link_generic, IFMIB_SYSTEM, system, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Variables global to all interfaces"); static int sysctl_ifcount(SYSCTL_HANDLER_ARGS) { int rv = 0; IFNET_RLOCK(); for (int i = 1; i <= if_index; i++) if (ifindex_table[i].ife_ifnet != NULL && ifindex_table[i].ife_ifnet->if_vnet == curvnet) rv = i; IFNET_RUNLOCK(); return (sysctl_handle_int(oidp, &rv, 0, req)); } SYSCTL_PROC(_net_link_generic_system, IFMIB_IFCOUNT, ifcount, CTLTYPE_INT | CTLFLAG_VNET | CTLFLAG_RD, NULL, 0, sysctl_ifcount, "I", "Maximum known interface index"); /* * The global network interface list (V_ifnet) and related state (such as * if_index, if_indexlim, and ifindex_table) are protected by an sxlock. * This may be acquired to stabilise the list, or we may rely on NET_EPOCH. */ struct sx ifnet_sxlock; SX_SYSINIT_FLAGS(ifnet_sx, &ifnet_sxlock, "ifnet_sx", SX_RECURSE); struct sx ifnet_detach_sxlock; SX_SYSINIT_FLAGS(ifnet_detach, &ifnet_detach_sxlock, "ifnet_detach_sx", SX_RECURSE); #ifdef VIMAGE #define VNET_IS_SHUTTING_DOWN(_vnet) \ ((_vnet)->vnet_shutdown && (_vnet)->vnet_state < SI_SUB_VNET_DONE) #endif static if_com_alloc_t *if_com_alloc[256]; static if_com_free_t *if_com_free[256]; static MALLOC_DEFINE(M_IFNET, "ifnet", "interface internals"); MALLOC_DEFINE(M_IFADDR, "ifaddr", "interface address"); MALLOC_DEFINE(M_IFMADDR, "ether_multi", "link-level multicast address"); struct ifnet * ifnet_byindex(u_int idx) { struct ifnet *ifp; NET_EPOCH_ASSERT(); if (__predict_false(idx > if_index)) return (NULL); ifp = ck_pr_load_ptr(&ifindex_table[idx].ife_ifnet); if (curvnet != NULL && ifp != NULL && ifp->if_vnet != curvnet) ifp = NULL; return (ifp); } struct ifnet * ifnet_byindex_ref(u_int idx) { struct ifnet *ifp; ifp = ifnet_byindex(idx); if (ifp == NULL || (ifp->if_flags & IFF_DYING)) return (NULL); if (!if_try_ref(ifp)) return (NULL); return (ifp); } struct ifnet * ifnet_byindexgen(uint16_t idx, uint16_t gen) { struct ifnet *ifp; NET_EPOCH_ASSERT(); if (__predict_false(idx > if_index)) return (NULL); ifp = ck_pr_load_ptr(&ifindex_table[idx].ife_ifnet); if (ifindex_table[idx].ife_gencnt == gen) return (ifp); else return (NULL); } /* * Network interface utility routines. * * Routines with ifa_ifwith* names take sockaddr *'s as * parameters. */ static void if_init_idxtable(void *arg __unused) { ifindex_table = malloc(if_indexlim * sizeof(*ifindex_table), M_IFNET, M_WAITOK | M_ZERO); } SYSINIT(if_init, SI_SUB_INIT_IF, SI_ORDER_SECOND, if_init_idxtable, NULL); static void vnet_if_init(const void *unused __unused) { CK_STAILQ_INIT(&V_ifnet); CK_STAILQ_INIT(&V_ifg_head); vnet_if_clone_init(); } VNET_SYSINIT(vnet_if_init, SI_SUB_INIT_IF, SI_ORDER_SECOND, vnet_if_init, NULL); static void if_link_ifnet(struct ifnet *ifp) { IFNET_WLOCK(); CK_STAILQ_INSERT_TAIL(&V_ifnet, ifp, if_link); #ifdef VIMAGE curvnet->vnet_ifcnt++; #endif IFNET_WUNLOCK(); } static bool if_unlink_ifnet(struct ifnet *ifp, bool vmove) { struct ifnet *iter; int found = 0; IFNET_WLOCK(); CK_STAILQ_FOREACH(iter, &V_ifnet, if_link) if (iter == ifp) { CK_STAILQ_REMOVE(&V_ifnet, ifp, ifnet, if_link); if (!vmove) ifp->if_flags |= IFF_DYING; found = 1; break; } #ifdef VIMAGE curvnet->vnet_ifcnt--; #endif IFNET_WUNLOCK(); return (found); } #ifdef VIMAGE static void vnet_if_return(const void *unused __unused) { struct ifnet *ifp, *nifp; struct ifnet **pending; int found __diagused; int i; i = 0; /* * We need to protect our access to the V_ifnet tailq. Ordinarily we'd * enter NET_EPOCH, but that's not possible, because if_vmove() calls * if_detach_internal(), which waits for NET_EPOCH callbacks to * complete. We can't do that from within NET_EPOCH. * * However, we can also use the IFNET_xLOCK, which is the V_ifnet * read/write lock. We cannot hold the lock as we call if_vmove() * though, as that presents LOR w.r.t ifnet_sx, in_multi_sx and iflib * ctx lock. */ IFNET_WLOCK(); pending = malloc(sizeof(struct ifnet *) * curvnet->vnet_ifcnt, M_IFNET, M_WAITOK | M_ZERO); /* Return all inherited interfaces to their parent vnets. */ CK_STAILQ_FOREACH_SAFE(ifp, &V_ifnet, if_link, nifp) { if (ifp->if_home_vnet != ifp->if_vnet) { found = if_unlink_ifnet(ifp, true); MPASS(found); pending[i++] = ifp; } } IFNET_WUNLOCK(); for (int j = 0; j < i; j++) { sx_xlock(&ifnet_detach_sxlock); if_vmove(pending[j], pending[j]->if_home_vnet); sx_xunlock(&ifnet_detach_sxlock); } free(pending, M_IFNET); } VNET_SYSUNINIT(vnet_if_return, SI_SUB_VNET_DONE, SI_ORDER_ANY, vnet_if_return, NULL); #endif /* * Allocate a struct ifnet and an index for an interface. A layer 2 * common structure will also be allocated if an allocation routine is * registered for the passed type. */ static struct ifnet * if_alloc_domain(u_char type, int numa_domain) { struct ifnet *ifp; u_short idx; KASSERT(numa_domain <= IF_NODOM, ("numa_domain too large")); if (numa_domain == IF_NODOM) ifp = malloc(sizeof(struct ifnet), M_IFNET, M_WAITOK | M_ZERO); else ifp = malloc_domainset(sizeof(struct ifnet), M_IFNET, DOMAINSET_PREF(numa_domain), M_WAITOK | M_ZERO); ifp->if_type = type; ifp->if_alloctype = type; ifp->if_numa_domain = numa_domain; #ifdef VIMAGE ifp->if_vnet = curvnet; #endif if (if_com_alloc[type] != NULL) { ifp->if_l2com = if_com_alloc[type](type, ifp); KASSERT(ifp->if_l2com, ("%s: if_com_alloc[%u] failed", __func__, type)); } IF_ADDR_LOCK_INIT(ifp); TASK_INIT(&ifp->if_linktask, 0, do_link_state_change, ifp); TASK_INIT(&ifp->if_addmultitask, 0, if_siocaddmulti, ifp); ifp->if_afdata_initialized = 0; IF_AFDATA_LOCK_INIT(ifp); CK_STAILQ_INIT(&ifp->if_addrhead); CK_STAILQ_INIT(&ifp->if_multiaddrs); CK_STAILQ_INIT(&ifp->if_groups); #ifdef MAC mac_ifnet_init(ifp); #endif ifq_init(&ifp->if_snd, ifp); refcount_init(&ifp->if_refcount, 1); /* Index reference. */ for (int i = 0; i < IFCOUNTERS; i++) ifp->if_counters[i] = counter_u64_alloc(M_WAITOK); ifp->if_get_counter = if_get_counter_default; ifp->if_pcp = IFNET_PCP_NONE; /* Allocate an ifindex array entry. */ IFNET_WLOCK(); /* * Try to find an empty slot below if_index. If we fail, take the * next slot. */ for (idx = 1; idx <= if_index; idx++) { if (ifindex_table[idx].ife_ifnet == NULL) break; } /* Catch if_index overflow. */ if (idx >= if_indexlim) { struct ifindex_entry *new, *old; int newlim; newlim = if_indexlim * 2; new = malloc(newlim * sizeof(*new), M_IFNET, M_WAITOK | M_ZERO); memcpy(new, ifindex_table, if_indexlim * sizeof(*new)); old = ifindex_table; ck_pr_store_ptr(&ifindex_table, new); if_indexlim = newlim; epoch_wait_preempt(net_epoch_preempt); free(old, M_IFNET); } if (idx > if_index) if_index = idx; ifp->if_index = idx; ifp->if_idxgen = ifindex_table[idx].ife_gencnt; ck_pr_store_ptr(&ifindex_table[idx].ife_ifnet, ifp); IFNET_WUNLOCK(); return (ifp); } struct ifnet * if_alloc_dev(u_char type, device_t dev) { int numa_domain; if (dev == NULL || bus_get_domain(dev, &numa_domain) != 0) return (if_alloc_domain(type, IF_NODOM)); return (if_alloc_domain(type, numa_domain)); } struct ifnet * if_alloc(u_char type) { return (if_alloc_domain(type, IF_NODOM)); } /* * Do the actual work of freeing a struct ifnet, and layer 2 common * structure. This call is made when the network epoch guarantees * us that nobody holds a pointer to the interface. */ static void if_free_deferred(epoch_context_t ctx) { struct ifnet *ifp = __containerof(ctx, struct ifnet, if_epoch_ctx); KASSERT((ifp->if_flags & IFF_DYING), ("%s: interface not dying", __func__)); if (if_com_free[ifp->if_alloctype] != NULL) if_com_free[ifp->if_alloctype](ifp->if_l2com, ifp->if_alloctype); #ifdef MAC mac_ifnet_destroy(ifp); #endif /* MAC */ IF_AFDATA_DESTROY(ifp); IF_ADDR_LOCK_DESTROY(ifp); ifq_delete(&ifp->if_snd); for (int i = 0; i < IFCOUNTERS; i++) counter_u64_free(ifp->if_counters[i]); if_freedescr(ifp->if_description); free(ifp->if_hw_addr, M_IFADDR); free(ifp, M_IFNET); } /* * Deregister an interface and free the associated storage. */ void if_free(struct ifnet *ifp) { ifp->if_flags |= IFF_DYING; /* XXX: Locking */ /* * XXXGL: An interface index is really an alias to ifp pointer. * Why would we clear the alias now, and not in the deferred * context? Indeed there is nothing wrong with some network * thread obtaining ifp via ifnet_byindex() inside the network * epoch and then dereferencing ifp while we perform if_free(), * and after if_free() finished, too. * * This early index freeing was important back when ifindex was * virtualized and interface would outlive the vnet. */ IFNET_WLOCK(); MPASS(ifindex_table[ifp->if_index].ife_ifnet == ifp); ck_pr_store_ptr(&ifindex_table[ifp->if_index].ife_ifnet, NULL); ifindex_table[ifp->if_index].ife_gencnt++; while (if_index > 0 && ifindex_table[if_index].ife_ifnet == NULL) if_index--; IFNET_WUNLOCK(); if (refcount_release(&ifp->if_refcount)) NET_EPOCH_CALL(if_free_deferred, &ifp->if_epoch_ctx); } /* * Interfaces to keep an ifnet type-stable despite the possibility of the * driver calling if_free(). If there are additional references, we defer * freeing the underlying data structure. */ void if_ref(struct ifnet *ifp) { u_int old __diagused; /* We don't assert the ifnet list lock here, but arguably should. */ old = refcount_acquire(&ifp->if_refcount); KASSERT(old > 0, ("%s: ifp %p has 0 refs", __func__, ifp)); } bool if_try_ref(struct ifnet *ifp) { NET_EPOCH_ASSERT(); return (refcount_acquire_if_not_zero(&ifp->if_refcount)); } void if_rele(struct ifnet *ifp) { if (!refcount_release(&ifp->if_refcount)) return; NET_EPOCH_CALL(if_free_deferred, &ifp->if_epoch_ctx); } void ifq_init(struct ifaltq *ifq, struct ifnet *ifp) { mtx_init(&ifq->ifq_mtx, ifp->if_xname, "if send queue", MTX_DEF); if (ifq->ifq_maxlen == 0) ifq->ifq_maxlen = ifqmaxlen; ifq->altq_type = 0; ifq->altq_disc = NULL; ifq->altq_flags &= ALTQF_CANTCHANGE; ifq->altq_tbr = NULL; ifq->altq_ifp = ifp; } void ifq_delete(struct ifaltq *ifq) { mtx_destroy(&ifq->ifq_mtx); } /* * Perform generic interface initialization tasks and attach the interface * to the list of "active" interfaces. If vmove flag is set on entry * to if_attach_internal(), perform only a limited subset of initialization * tasks, given that we are moving from one vnet to another an ifnet which * has already been fully initialized. * * Note that if_detach_internal() removes group membership unconditionally * even when vmove flag is set, and if_attach_internal() adds only IFG_ALL. * Thus, when if_vmove() is applied to a cloned interface, group membership * is lost while a cloned one always joins a group whose name is * ifc->ifc_name. To recover this after if_detach_internal() and * if_attach_internal(), the cloner should be specified to * if_attach_internal() via ifc. If it is non-NULL, if_attach_internal() * attempts to join a group whose name is ifc->ifc_name. * * XXX: * - The decision to return void and thus require this function to * succeed is questionable. * - We should probably do more sanity checking. For instance we don't * do anything to insure if_xname is unique or non-empty. */ void if_attach(struct ifnet *ifp) { if_attach_internal(ifp, false); } /* * Compute the least common TSO limit. */ void if_hw_tsomax_common(if_t ifp, struct ifnet_hw_tsomax *pmax) { /* * 1) If there is no limit currently, take the limit from * the network adapter. * * 2) If the network adapter has a limit below the current * limit, apply it. */ if (pmax->tsomaxbytes == 0 || (ifp->if_hw_tsomax != 0 && ifp->if_hw_tsomax < pmax->tsomaxbytes)) { pmax->tsomaxbytes = ifp->if_hw_tsomax; } if (pmax->tsomaxsegcount == 0 || (ifp->if_hw_tsomaxsegcount != 0 && ifp->if_hw_tsomaxsegcount < pmax->tsomaxsegcount)) { pmax->tsomaxsegcount = ifp->if_hw_tsomaxsegcount; } if (pmax->tsomaxsegsize == 0 || (ifp->if_hw_tsomaxsegsize != 0 && ifp->if_hw_tsomaxsegsize < pmax->tsomaxsegsize)) { pmax->tsomaxsegsize = ifp->if_hw_tsomaxsegsize; } } /* * Update TSO limit of a network adapter. * * Returns zero if no change. Else non-zero. */ int if_hw_tsomax_update(if_t ifp, struct ifnet_hw_tsomax *pmax) { int retval = 0; if (ifp->if_hw_tsomax != pmax->tsomaxbytes) { ifp->if_hw_tsomax = pmax->tsomaxbytes; retval++; } if (ifp->if_hw_tsomaxsegsize != pmax->tsomaxsegsize) { ifp->if_hw_tsomaxsegsize = pmax->tsomaxsegsize; retval++; } if (ifp->if_hw_tsomaxsegcount != pmax->tsomaxsegcount) { ifp->if_hw_tsomaxsegcount = pmax->tsomaxsegcount; retval++; } return (retval); } static void if_attach_internal(struct ifnet *ifp, bool vmove) { unsigned socksize, ifasize; int namelen, masklen; struct sockaddr_dl *sdl; struct ifaddr *ifa; MPASS(ifindex_table[ifp->if_index].ife_ifnet == ifp); #ifdef VIMAGE ifp->if_vnet = curvnet; if (ifp->if_home_vnet == NULL) ifp->if_home_vnet = curvnet; #endif if_addgroup(ifp, IFG_ALL); #ifdef VIMAGE /* Restore group membership for cloned interface. */ if (vmove) if_clone_restoregroup(ifp); #endif getmicrotime(&ifp->if_lastchange); ifp->if_epoch = time_uptime; KASSERT((ifp->if_transmit == NULL && ifp->if_qflush == NULL) || (ifp->if_transmit != NULL && ifp->if_qflush != NULL), ("transmit and qflush must both either be set or both be NULL")); if (ifp->if_transmit == NULL) { ifp->if_transmit = if_transmit_default; ifp->if_qflush = if_qflush; } if (ifp->if_input == NULL) ifp->if_input = if_input_default; if (ifp->if_requestencap == NULL) ifp->if_requestencap = if_requestencap_default; if (!vmove) { #ifdef MAC mac_ifnet_create(ifp); #endif /* * Create a Link Level name for this device. */ namelen = strlen(ifp->if_xname); /* * Always save enough space for any possiable name so we * can do a rename in place later. */ masklen = offsetof(struct sockaddr_dl, sdl_data[0]) + IFNAMSIZ; socksize = masklen + ifp->if_addrlen; if (socksize < sizeof(*sdl)) socksize = sizeof(*sdl); socksize = roundup2(socksize, sizeof(long)); ifasize = sizeof(*ifa) + 2 * socksize; ifa = ifa_alloc(ifasize, M_WAITOK); sdl = (struct sockaddr_dl *)(ifa + 1); sdl->sdl_len = socksize; sdl->sdl_family = AF_LINK; bcopy(ifp->if_xname, sdl->sdl_data, namelen); sdl->sdl_nlen = namelen; sdl->sdl_index = ifp->if_index; sdl->sdl_type = ifp->if_type; ifp->if_addr = ifa; ifa->ifa_ifp = ifp; ifa->ifa_addr = (struct sockaddr *)sdl; sdl = (struct sockaddr_dl *)(socksize + (caddr_t)sdl); ifa->ifa_netmask = (struct sockaddr *)sdl; sdl->sdl_len = masklen; while (namelen != 0) sdl->sdl_data[--namelen] = 0xff; CK_STAILQ_INSERT_HEAD(&ifp->if_addrhead, ifa, ifa_link); /* Reliably crash if used uninitialized. */ ifp->if_broadcastaddr = NULL; if (ifp->if_type == IFT_ETHER) { ifp->if_hw_addr = malloc(ifp->if_addrlen, M_IFADDR, M_WAITOK | M_ZERO); } #if defined(INET) || defined(INET6) /* Use defaults for TSO, if nothing is set */ if (ifp->if_hw_tsomax == 0 && ifp->if_hw_tsomaxsegcount == 0 && ifp->if_hw_tsomaxsegsize == 0) { /* * The TSO defaults needs to be such that an * NFS mbuf list of 35 mbufs totalling just * below 64K works and that a chain of mbufs * can be defragged into at most 32 segments: */ ifp->if_hw_tsomax = min(IP_MAXPACKET, (32 * MCLBYTES) - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN)); ifp->if_hw_tsomaxsegcount = 35; ifp->if_hw_tsomaxsegsize = 2048; /* 2K */ /* XXX some drivers set IFCAP_TSO after ethernet attach */ if (ifp->if_capabilities & IFCAP_TSO) { if_printf(ifp, "Using defaults for TSO: %u/%u/%u\n", ifp->if_hw_tsomax, ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize); } } #endif } #ifdef VIMAGE else { /* * Update the interface index in the link layer address * of the interface. */ for (ifa = ifp->if_addr; ifa != NULL; ifa = CK_STAILQ_NEXT(ifa, ifa_link)) { if (ifa->ifa_addr->sa_family == AF_LINK) { sdl = (struct sockaddr_dl *)ifa->ifa_addr; sdl->sdl_index = ifp->if_index; } } } #endif if_link_ifnet(ifp); if (domain_init_status >= 2) if_attachdomain1(ifp); EVENTHANDLER_INVOKE(ifnet_arrival_event, ifp); if (IS_DEFAULT_VNET(curvnet)) devctl_notify("IFNET", ifp->if_xname, "ATTACH", NULL); } static void if_epochalloc(void *dummy __unused) { net_epoch_preempt = epoch_alloc("Net preemptible", EPOCH_PREEMPT); } SYSINIT(ifepochalloc, SI_SUB_EPOCH, SI_ORDER_ANY, if_epochalloc, NULL); static void if_attachdomain(void *dummy) { struct ifnet *ifp; CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) if_attachdomain1(ifp); } SYSINIT(domainifattach, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_SECOND, if_attachdomain, NULL); static void if_attachdomain1(struct ifnet *ifp) { struct domain *dp; /* * Since dp->dom_ifattach calls malloc() with M_WAITOK, we * cannot lock ifp->if_afdata initialization, entirely. */ IF_AFDATA_LOCK(ifp); if (ifp->if_afdata_initialized >= domain_init_status) { IF_AFDATA_UNLOCK(ifp); log(LOG_WARNING, "%s called more than once on %s\n", __func__, ifp->if_xname); return; } ifp->if_afdata_initialized = domain_init_status; IF_AFDATA_UNLOCK(ifp); /* address family dependent data region */ bzero(ifp->if_afdata, sizeof(ifp->if_afdata)); SLIST_FOREACH(dp, &domains, dom_next) { if (dp->dom_ifattach) ifp->if_afdata[dp->dom_family] = (*dp->dom_ifattach)(ifp); } } /* * Remove any unicast or broadcast network addresses from an interface. */ void if_purgeaddrs(struct ifnet *ifp) { struct ifaddr *ifa; #ifdef INET6 /* * Need to leave multicast addresses of proxy NDP llentries * before in6_purgeifaddr() because the llentries are keys * for in6_multi objects of proxy NDP entries. * in6_purgeifaddr()s clean up llentries including proxy NDPs * then we would lose the keys if they are called earlier. */ in6_purge_proxy_ndp(ifp); #endif while (1) { struct epoch_tracker et; NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_LINK) break; } NET_EPOCH_EXIT(et); if (ifa == NULL) break; #ifdef INET /* XXX: Ugly!! ad hoc just for INET */ if (ifa->ifa_addr->sa_family == AF_INET) { struct ifaliasreq ifr; bzero(&ifr, sizeof(ifr)); ifr.ifra_addr = *ifa->ifa_addr; if (ifa->ifa_dstaddr) ifr.ifra_broadaddr = *ifa->ifa_dstaddr; if (in_control(NULL, SIOCDIFADDR, (caddr_t)&ifr, ifp, NULL) == 0) continue; } #endif /* INET */ #ifdef INET6 if (ifa->ifa_addr->sa_family == AF_INET6) { in6_purgeifaddr((struct in6_ifaddr *)ifa); /* ifp_addrhead is already updated */ continue; } #endif /* INET6 */ IF_ADDR_WLOCK(ifp); CK_STAILQ_REMOVE(&ifp->if_addrhead, ifa, ifaddr, ifa_link); IF_ADDR_WUNLOCK(ifp); ifa_free(ifa); } } /* * Remove any multicast network addresses from an interface when an ifnet * is going away. */ static void if_purgemaddrs(struct ifnet *ifp) { struct ifmultiaddr *ifma; IF_ADDR_WLOCK(ifp); while (!CK_STAILQ_EMPTY(&ifp->if_multiaddrs)) { ifma = CK_STAILQ_FIRST(&ifp->if_multiaddrs); CK_STAILQ_REMOVE(&ifp->if_multiaddrs, ifma, ifmultiaddr, ifma_link); if_delmulti_locked(ifp, ifma, 1); } IF_ADDR_WUNLOCK(ifp); } /* * Detach an interface, removing it from the list of "active" interfaces. * If vmove flag is set on entry to if_detach_internal(), perform only a * limited subset of cleanup tasks, given that we are moving an ifnet from * one vnet to another, where it must be fully operational. * * XXXRW: There are some significant questions about event ordering, and * how to prevent things from starting to use the interface during detach. */ void if_detach(struct ifnet *ifp) { bool found; CURVNET_SET_QUIET(ifp->if_vnet); found = if_unlink_ifnet(ifp, false); if (found) { sx_xlock(&ifnet_detach_sxlock); if_detach_internal(ifp, false); sx_xunlock(&ifnet_detach_sxlock); } CURVNET_RESTORE(); } /* * The vmove flag, if set, indicates that we are called from a callpath * that is moving an interface to a different vnet instance. * * The shutdown flag, if set, indicates that we are called in the * process of shutting down a vnet instance. Currently only the * vnet_if_return SYSUNINIT function sets it. Note: we can be called * on a vnet instance shutdown without this flag being set, e.g., when * the cloned interfaces are destoyed as first thing of teardown. */ static int if_detach_internal(struct ifnet *ifp, bool vmove) { struct ifaddr *ifa; int i; struct domain *dp; #ifdef VIMAGE bool shutdown; shutdown = VNET_IS_SHUTTING_DOWN(ifp->if_vnet); #endif /* * At this point we know the interface still was on the ifnet list * and we removed it so we are in a stable state. */ epoch_wait_preempt(net_epoch_preempt); /* * Ensure all pending EPOCH(9) callbacks have been executed. This * fixes issues about late destruction of multicast options * which lead to leave group calls, which in turn access the * belonging ifnet structure: */ NET_EPOCH_DRAIN_CALLBACKS(); /* * In any case (destroy or vmove) detach us from the groups * and remove/wait for pending events on the taskq. * XXX-BZ in theory an interface could still enqueue a taskq change? */ if_delgroups(ifp); taskqueue_drain(taskqueue_swi, &ifp->if_linktask); taskqueue_drain(taskqueue_swi, &ifp->if_addmultitask); if_down(ifp); #ifdef VIMAGE /* * On VNET shutdown abort here as the stack teardown will do all * the work top-down for us. */ if (shutdown) { /* Give interface users the chance to clean up. */ EVENTHANDLER_INVOKE(ifnet_departure_event, ifp); /* * In case of a vmove we are done here without error. * If we would signal an error it would lead to the same * abort as if we did not find the ifnet anymore. * if_detach() calls us in void context and does not care * about an early abort notification, so life is splendid :) */ goto finish_vnet_shutdown; } #endif /* * At this point we are not tearing down a VNET and are either * going to destroy or vmove the interface and have to cleanup * accordingly. */ /* * Remove routes and flush queues. */ #ifdef ALTQ if (ALTQ_IS_ENABLED(&ifp->if_snd)) altq_disable(&ifp->if_snd); if (ALTQ_IS_ATTACHED(&ifp->if_snd)) altq_detach(&ifp->if_snd); #endif if_purgeaddrs(ifp); #ifdef INET in_ifdetach(ifp); #endif #ifdef INET6 /* * Remove all IPv6 kernel structs related to ifp. This should be done * before removing routing entries below, since IPv6 interface direct * routes are expected to be removed by the IPv6-specific kernel API. * Otherwise, the kernel will detect some inconsistency and bark it. */ in6_ifdetach(ifp); #endif if_purgemaddrs(ifp); EVENTHANDLER_INVOKE(ifnet_departure_event, ifp); if (IS_DEFAULT_VNET(curvnet)) devctl_notify("IFNET", ifp->if_xname, "DETACH", NULL); if (!vmove) { /* * Prevent further calls into the device driver via ifnet. */ if_dead(ifp); /* * Clean up all addresses. */ IF_ADDR_WLOCK(ifp); if (!CK_STAILQ_EMPTY(&ifp->if_addrhead)) { ifa = CK_STAILQ_FIRST(&ifp->if_addrhead); CK_STAILQ_REMOVE(&ifp->if_addrhead, ifa, ifaddr, ifa_link); IF_ADDR_WUNLOCK(ifp); ifa_free(ifa); } else IF_ADDR_WUNLOCK(ifp); } rt_flushifroutes(ifp); #ifdef VIMAGE finish_vnet_shutdown: #endif /* * We cannot hold the lock over dom_ifdetach calls as they might * sleep, for example trying to drain a callout, thus open up the * theoretical race with re-attaching. */ IF_AFDATA_LOCK(ifp); i = ifp->if_afdata_initialized; ifp->if_afdata_initialized = 0; IF_AFDATA_UNLOCK(ifp); if (i == 0) return (0); SLIST_FOREACH(dp, &domains, dom_next) { if (dp->dom_ifdetach && ifp->if_afdata[dp->dom_family]) { (*dp->dom_ifdetach)(ifp, ifp->if_afdata[dp->dom_family]); ifp->if_afdata[dp->dom_family] = NULL; } } return (0); } #ifdef VIMAGE /* * if_vmove() performs a limited version of if_detach() in current * vnet and if_attach()es the ifnet to the vnet specified as 2nd arg. */ static int if_vmove(struct ifnet *ifp, struct vnet *new_vnet) { #ifdef DEV_BPF u_int bif_dlt, bif_hdrlen; #endif int rc; #ifdef DEV_BPF /* * if_detach_internal() will call the eventhandler to notify * interface departure. That will detach if_bpf. We need to * safe the dlt and hdrlen so we can re-attach it later. */ bpf_get_bp_params(ifp->if_bpf, &bif_dlt, &bif_hdrlen); #endif /* * Detach from current vnet, but preserve LLADDR info, do not * mark as dead etc. so that the ifnet can be reattached later. * If we cannot find it, we lost the race to someone else. */ rc = if_detach_internal(ifp, true); if (rc != 0) return (rc); /* * Perform interface-specific reassignment tasks, if provided by * the driver. */ if (ifp->if_reassign != NULL) ifp->if_reassign(ifp, new_vnet, NULL); /* * Switch to the context of the target vnet. */ CURVNET_SET_QUIET(new_vnet); if_attach_internal(ifp, true); #ifdef DEV_BPF if (ifp->if_bpf == NULL) bpfattach(ifp, bif_dlt, bif_hdrlen); #endif CURVNET_RESTORE(); return (0); } /* * Move an ifnet to or from another child prison/vnet, specified by the jail id. */ static int if_vmove_loan(struct thread *td, struct ifnet *ifp, char *ifname, int jid) { struct prison *pr; struct ifnet *difp; int error; bool found __diagused; bool shutdown; MPASS(ifindex_table[ifp->if_index].ife_ifnet == ifp); /* Try to find the prison within our visibility. */ sx_slock(&allprison_lock); pr = prison_find_child(td->td_ucred->cr_prison, jid); sx_sunlock(&allprison_lock); if (pr == NULL) return (ENXIO); prison_hold_locked(pr); mtx_unlock(&pr->pr_mtx); /* Do not try to move the iface from and to the same prison. */ if (pr->pr_vnet == ifp->if_vnet) { prison_free(pr); return (EEXIST); } /* Make sure the named iface does not exists in the dst. prison/vnet. */ /* XXX Lock interfaces to avoid races. */ CURVNET_SET_QUIET(pr->pr_vnet); difp = ifunit(ifname); if (difp != NULL) { CURVNET_RESTORE(); prison_free(pr); return (EEXIST); } sx_xlock(&ifnet_detach_sxlock); /* Make sure the VNET is stable. */ shutdown = VNET_IS_SHUTTING_DOWN(ifp->if_vnet); if (shutdown) { sx_xunlock(&ifnet_detach_sxlock); CURVNET_RESTORE(); prison_free(pr); return (EBUSY); } CURVNET_RESTORE(); found = if_unlink_ifnet(ifp, true); if (! found) { sx_xunlock(&ifnet_detach_sxlock); CURVNET_RESTORE(); prison_free(pr); return (ENODEV); } /* Move the interface into the child jail/vnet. */ error = if_vmove(ifp, pr->pr_vnet); /* Report the new if_xname back to the userland on success. */ if (error == 0) sprintf(ifname, "%s", ifp->if_xname); sx_xunlock(&ifnet_detach_sxlock); prison_free(pr); return (error); } static int if_vmove_reclaim(struct thread *td, char *ifname, int jid) { struct prison *pr; struct vnet *vnet_dst; struct ifnet *ifp; int error, found __diagused; bool shutdown; /* Try to find the prison within our visibility. */ sx_slock(&allprison_lock); pr = prison_find_child(td->td_ucred->cr_prison, jid); sx_sunlock(&allprison_lock); if (pr == NULL) return (ENXIO); prison_hold_locked(pr); mtx_unlock(&pr->pr_mtx); /* Make sure the named iface exists in the source prison/vnet. */ CURVNET_SET(pr->pr_vnet); ifp = ifunit(ifname); /* XXX Lock to avoid races. */ if (ifp == NULL) { CURVNET_RESTORE(); prison_free(pr); return (ENXIO); } /* Do not try to move the iface from and to the same prison. */ vnet_dst = TD_TO_VNET(td); if (vnet_dst == ifp->if_vnet) { CURVNET_RESTORE(); prison_free(pr); return (EEXIST); } /* Make sure the VNET is stable. */ shutdown = VNET_IS_SHUTTING_DOWN(ifp->if_vnet); if (shutdown) { CURVNET_RESTORE(); prison_free(pr); return (EBUSY); } /* Get interface back from child jail/vnet. */ found = if_unlink_ifnet(ifp, true); MPASS(found); sx_xlock(&ifnet_detach_sxlock); error = if_vmove(ifp, vnet_dst); sx_xunlock(&ifnet_detach_sxlock); CURVNET_RESTORE(); /* Report the new if_xname back to the userland on success. */ if (error == 0) sprintf(ifname, "%s", ifp->if_xname); prison_free(pr); return (error); } #endif /* VIMAGE */ /* * Add a group to an interface */ int if_addgroup(struct ifnet *ifp, const char *groupname) { struct ifg_list *ifgl; struct ifg_group *ifg = NULL; struct ifg_member *ifgm; int new = 0; if (groupname[0] && groupname[strlen(groupname) - 1] >= '0' && groupname[strlen(groupname) - 1] <= '9') return (EINVAL); IFNET_WLOCK(); CK_STAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) if (!strcmp(ifgl->ifgl_group->ifg_group, groupname)) { IFNET_WUNLOCK(); return (EEXIST); } if ((ifgl = malloc(sizeof(*ifgl), M_TEMP, M_NOWAIT)) == NULL) { IFNET_WUNLOCK(); return (ENOMEM); } if ((ifgm = malloc(sizeof(*ifgm), M_TEMP, M_NOWAIT)) == NULL) { free(ifgl, M_TEMP); IFNET_WUNLOCK(); return (ENOMEM); } CK_STAILQ_FOREACH(ifg, &V_ifg_head, ifg_next) if (!strcmp(ifg->ifg_group, groupname)) break; if (ifg == NULL) { if ((ifg = malloc(sizeof(*ifg), M_TEMP, M_NOWAIT)) == NULL) { free(ifgl, M_TEMP); free(ifgm, M_TEMP); IFNET_WUNLOCK(); return (ENOMEM); } strlcpy(ifg->ifg_group, groupname, sizeof(ifg->ifg_group)); ifg->ifg_refcnt = 0; CK_STAILQ_INIT(&ifg->ifg_members); CK_STAILQ_INSERT_TAIL(&V_ifg_head, ifg, ifg_next); new = 1; } ifg->ifg_refcnt++; ifgl->ifgl_group = ifg; ifgm->ifgm_ifp = ifp; IF_ADDR_WLOCK(ifp); CK_STAILQ_INSERT_TAIL(&ifg->ifg_members, ifgm, ifgm_next); CK_STAILQ_INSERT_TAIL(&ifp->if_groups, ifgl, ifgl_next); IF_ADDR_WUNLOCK(ifp); IFNET_WUNLOCK(); if (new) EVENTHANDLER_INVOKE(group_attach_event, ifg); EVENTHANDLER_INVOKE(group_change_event, groupname); return (0); } /* * Helper function to remove a group out of an interface. Expects the global * ifnet lock to be write-locked, and drops it before returning. */ static void _if_delgroup_locked(struct ifnet *ifp, struct ifg_list *ifgl, const char *groupname) { struct ifg_member *ifgm; bool freeifgl; IFNET_WLOCK_ASSERT(); IF_ADDR_WLOCK(ifp); CK_STAILQ_REMOVE(&ifp->if_groups, ifgl, ifg_list, ifgl_next); IF_ADDR_WUNLOCK(ifp); CK_STAILQ_FOREACH(ifgm, &ifgl->ifgl_group->ifg_members, ifgm_next) { if (ifgm->ifgm_ifp == ifp) { CK_STAILQ_REMOVE(&ifgl->ifgl_group->ifg_members, ifgm, ifg_member, ifgm_next); break; } } if (--ifgl->ifgl_group->ifg_refcnt == 0) { CK_STAILQ_REMOVE(&V_ifg_head, ifgl->ifgl_group, ifg_group, ifg_next); freeifgl = true; } else { freeifgl = false; } IFNET_WUNLOCK(); epoch_wait_preempt(net_epoch_preempt); EVENTHANDLER_INVOKE(group_change_event, groupname); if (freeifgl) { EVENTHANDLER_INVOKE(group_detach_event, ifgl->ifgl_group); free(ifgl->ifgl_group, M_TEMP); } free(ifgm, M_TEMP); free(ifgl, M_TEMP); } /* * Remove a group from an interface */ int if_delgroup(struct ifnet *ifp, const char *groupname) { struct ifg_list *ifgl; IFNET_WLOCK(); CK_STAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) if (strcmp(ifgl->ifgl_group->ifg_group, groupname) == 0) break; if (ifgl == NULL) { IFNET_WUNLOCK(); return (ENOENT); } _if_delgroup_locked(ifp, ifgl, groupname); return (0); } /* * Remove an interface from all groups */ static void if_delgroups(struct ifnet *ifp) { struct ifg_list *ifgl; char groupname[IFNAMSIZ]; IFNET_WLOCK(); while ((ifgl = CK_STAILQ_FIRST(&ifp->if_groups)) != NULL) { strlcpy(groupname, ifgl->ifgl_group->ifg_group, IFNAMSIZ); _if_delgroup_locked(ifp, ifgl, groupname); IFNET_WLOCK(); } IFNET_WUNLOCK(); } /* * Stores all groups from an interface in memory pointed to by ifgr. */ static int if_getgroup(struct ifgroupreq *ifgr, struct ifnet *ifp) { int len, error; struct ifg_list *ifgl; struct ifg_req ifgrq, *ifgp; NET_EPOCH_ASSERT(); if (ifgr->ifgr_len == 0) { CK_STAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) ifgr->ifgr_len += sizeof(struct ifg_req); return (0); } len = ifgr->ifgr_len; ifgp = ifgr->ifgr_groups; /* XXX: wire */ CK_STAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) { if (len < sizeof(ifgrq)) return (EINVAL); bzero(&ifgrq, sizeof ifgrq); strlcpy(ifgrq.ifgrq_group, ifgl->ifgl_group->ifg_group, sizeof(ifgrq.ifgrq_group)); if ((error = copyout(&ifgrq, ifgp, sizeof(struct ifg_req)))) return (error); len -= sizeof(ifgrq); ifgp++; } return (0); } /* * Stores all members of a group in memory pointed to by igfr */ static int if_getgroupmembers(struct ifgroupreq *ifgr) { struct ifg_group *ifg; struct ifg_member *ifgm; struct ifg_req ifgrq, *ifgp; int len, error; IFNET_RLOCK(); CK_STAILQ_FOREACH(ifg, &V_ifg_head, ifg_next) if (strcmp(ifg->ifg_group, ifgr->ifgr_name) == 0) break; if (ifg == NULL) { IFNET_RUNLOCK(); return (ENOENT); } if (ifgr->ifgr_len == 0) { CK_STAILQ_FOREACH(ifgm, &ifg->ifg_members, ifgm_next) ifgr->ifgr_len += sizeof(ifgrq); IFNET_RUNLOCK(); return (0); } len = ifgr->ifgr_len; ifgp = ifgr->ifgr_groups; CK_STAILQ_FOREACH(ifgm, &ifg->ifg_members, ifgm_next) { if (len < sizeof(ifgrq)) { IFNET_RUNLOCK(); return (EINVAL); } bzero(&ifgrq, sizeof ifgrq); strlcpy(ifgrq.ifgrq_member, ifgm->ifgm_ifp->if_xname, sizeof(ifgrq.ifgrq_member)); if ((error = copyout(&ifgrq, ifgp, sizeof(struct ifg_req)))) { IFNET_RUNLOCK(); return (error); } len -= sizeof(ifgrq); ifgp++; } IFNET_RUNLOCK(); return (0); } /* * Return counter values from counter(9)s stored in ifnet. */ uint64_t if_get_counter_default(struct ifnet *ifp, ift_counter cnt) { KASSERT(cnt < IFCOUNTERS, ("%s: invalid cnt %d", __func__, cnt)); return (counter_u64_fetch(ifp->if_counters[cnt])); } /* * Increase an ifnet counter. Usually used for counters shared * between the stack and a driver, but function supports them all. */ void if_inc_counter(struct ifnet *ifp, ift_counter cnt, int64_t inc) { KASSERT(cnt < IFCOUNTERS, ("%s: invalid cnt %d", __func__, cnt)); counter_u64_add(ifp->if_counters[cnt], inc); } /* * Copy data from ifnet to userland API structure if_data. */ void if_data_copy(struct ifnet *ifp, struct if_data *ifd) { ifd->ifi_type = ifp->if_type; ifd->ifi_physical = 0; ifd->ifi_addrlen = ifp->if_addrlen; ifd->ifi_hdrlen = ifp->if_hdrlen; ifd->ifi_link_state = ifp->if_link_state; ifd->ifi_vhid = 0; ifd->ifi_datalen = sizeof(struct if_data); ifd->ifi_mtu = ifp->if_mtu; ifd->ifi_metric = ifp->if_metric; ifd->ifi_baudrate = ifp->if_baudrate; ifd->ifi_hwassist = ifp->if_hwassist; ifd->ifi_epoch = ifp->if_epoch; ifd->ifi_lastchange = ifp->if_lastchange; ifd->ifi_ipackets = ifp->if_get_counter(ifp, IFCOUNTER_IPACKETS); ifd->ifi_ierrors = ifp->if_get_counter(ifp, IFCOUNTER_IERRORS); ifd->ifi_opackets = ifp->if_get_counter(ifp, IFCOUNTER_OPACKETS); ifd->ifi_oerrors = ifp->if_get_counter(ifp, IFCOUNTER_OERRORS); ifd->ifi_collisions = ifp->if_get_counter(ifp, IFCOUNTER_COLLISIONS); ifd->ifi_ibytes = ifp->if_get_counter(ifp, IFCOUNTER_IBYTES); ifd->ifi_obytes = ifp->if_get_counter(ifp, IFCOUNTER_OBYTES); ifd->ifi_imcasts = ifp->if_get_counter(ifp, IFCOUNTER_IMCASTS); ifd->ifi_omcasts = ifp->if_get_counter(ifp, IFCOUNTER_OMCASTS); ifd->ifi_iqdrops = ifp->if_get_counter(ifp, IFCOUNTER_IQDROPS); ifd->ifi_oqdrops = ifp->if_get_counter(ifp, IFCOUNTER_OQDROPS); ifd->ifi_noproto = ifp->if_get_counter(ifp, IFCOUNTER_NOPROTO); } /* * Initialization, destruction and refcounting functions for ifaddrs. */ struct ifaddr * ifa_alloc(size_t size, int flags) { struct ifaddr *ifa; KASSERT(size >= sizeof(struct ifaddr), ("%s: invalid size %zu", __func__, size)); ifa = malloc(size, M_IFADDR, M_ZERO | flags); if (ifa == NULL) return (NULL); if ((ifa->ifa_opackets = counter_u64_alloc(flags)) == NULL) goto fail; if ((ifa->ifa_ipackets = counter_u64_alloc(flags)) == NULL) goto fail; if ((ifa->ifa_obytes = counter_u64_alloc(flags)) == NULL) goto fail; if ((ifa->ifa_ibytes = counter_u64_alloc(flags)) == NULL) goto fail; refcount_init(&ifa->ifa_refcnt, 1); return (ifa); fail: /* free(NULL) is okay */ counter_u64_free(ifa->ifa_opackets); counter_u64_free(ifa->ifa_ipackets); counter_u64_free(ifa->ifa_obytes); counter_u64_free(ifa->ifa_ibytes); free(ifa, M_IFADDR); return (NULL); } void ifa_ref(struct ifaddr *ifa) { u_int old __diagused; old = refcount_acquire(&ifa->ifa_refcnt); KASSERT(old > 0, ("%s: ifa %p has 0 refs", __func__, ifa)); } int ifa_try_ref(struct ifaddr *ifa) { NET_EPOCH_ASSERT(); return (refcount_acquire_if_not_zero(&ifa->ifa_refcnt)); } static void ifa_destroy(epoch_context_t ctx) { struct ifaddr *ifa; ifa = __containerof(ctx, struct ifaddr, ifa_epoch_ctx); counter_u64_free(ifa->ifa_opackets); counter_u64_free(ifa->ifa_ipackets); counter_u64_free(ifa->ifa_obytes); counter_u64_free(ifa->ifa_ibytes); free(ifa, M_IFADDR); } void ifa_free(struct ifaddr *ifa) { if (refcount_release(&ifa->ifa_refcnt)) NET_EPOCH_CALL(ifa_destroy, &ifa->ifa_epoch_ctx); } /* * XXX: Because sockaddr_dl has deeper structure than the sockaddr * structs used to represent other address families, it is necessary * to perform a different comparison. */ #define sa_dl_equal(a1, a2) \ ((((const struct sockaddr_dl *)(a1))->sdl_len == \ ((const struct sockaddr_dl *)(a2))->sdl_len) && \ (bcmp(CLLADDR((const struct sockaddr_dl *)(a1)), \ CLLADDR((const struct sockaddr_dl *)(a2)), \ ((const struct sockaddr_dl *)(a1))->sdl_alen) == 0)) /* * Locate an interface based on a complete address. */ /*ARGSUSED*/ struct ifaddr * ifa_ifwithaddr(const struct sockaddr *addr) { struct ifnet *ifp; struct ifaddr *ifa; NET_EPOCH_ASSERT(); CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != addr->sa_family) continue; if (sa_equal(addr, ifa->ifa_addr)) { goto done; } /* IP6 doesn't have broadcast */ if ((ifp->if_flags & IFF_BROADCAST) && ifa->ifa_broadaddr && ifa->ifa_broadaddr->sa_len != 0 && sa_equal(ifa->ifa_broadaddr, addr)) { goto done; } } } ifa = NULL; done: return (ifa); } int ifa_ifwithaddr_check(const struct sockaddr *addr) { struct epoch_tracker et; int rc; NET_EPOCH_ENTER(et); rc = (ifa_ifwithaddr(addr) != NULL); NET_EPOCH_EXIT(et); return (rc); } /* * Locate an interface based on the broadcast address. */ /* ARGSUSED */ struct ifaddr * ifa_ifwithbroadaddr(const struct sockaddr *addr, int fibnum) { struct ifnet *ifp; struct ifaddr *ifa; NET_EPOCH_ASSERT(); CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { if ((fibnum != RT_ALL_FIBS) && (ifp->if_fib != fibnum)) continue; CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != addr->sa_family) continue; if ((ifp->if_flags & IFF_BROADCAST) && ifa->ifa_broadaddr && ifa->ifa_broadaddr->sa_len != 0 && sa_equal(ifa->ifa_broadaddr, addr)) { goto done; } } } ifa = NULL; done: return (ifa); } /* * Locate the point to point interface with a given destination address. */ /*ARGSUSED*/ struct ifaddr * ifa_ifwithdstaddr(const struct sockaddr *addr, int fibnum) { struct ifnet *ifp; struct ifaddr *ifa; NET_EPOCH_ASSERT(); CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { if ((ifp->if_flags & IFF_POINTOPOINT) == 0) continue; if ((fibnum != RT_ALL_FIBS) && (ifp->if_fib != fibnum)) continue; CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != addr->sa_family) continue; if (ifa->ifa_dstaddr != NULL && sa_equal(addr, ifa->ifa_dstaddr)) { goto done; } } } ifa = NULL; done: return (ifa); } /* * Find an interface on a specific network. If many, choice * is most specific found. */ struct ifaddr * ifa_ifwithnet(const struct sockaddr *addr, int ignore_ptp, int fibnum) { struct ifnet *ifp; struct ifaddr *ifa; struct ifaddr *ifa_maybe = NULL; u_int af = addr->sa_family; const char *addr_data = addr->sa_data, *cplim; NET_EPOCH_ASSERT(); /* * AF_LINK addresses can be looked up directly by their index number, * so do that if we can. */ if (af == AF_LINK) { ifp = ifnet_byindex( ((const struct sockaddr_dl *)addr)->sdl_index); return (ifp ? ifp->if_addr : NULL); } /* * Scan though each interface, looking for ones that have addresses * in this address family and the requested fib. */ CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { if ((fibnum != RT_ALL_FIBS) && (ifp->if_fib != fibnum)) continue; CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { const char *cp, *cp2, *cp3; if (ifa->ifa_addr->sa_family != af) next: continue; if (af == AF_INET && ifp->if_flags & IFF_POINTOPOINT && !ignore_ptp) { /* * This is a bit broken as it doesn't * take into account that the remote end may * be a single node in the network we are * looking for. * The trouble is that we don't know the * netmask for the remote end. */ if (ifa->ifa_dstaddr != NULL && sa_equal(addr, ifa->ifa_dstaddr)) { goto done; } } else { /* * Scan all the bits in the ifa's address. * If a bit dissagrees with what we are * looking for, mask it with the netmask * to see if it really matters. * (A byte at a time) */ if (ifa->ifa_netmask == 0) continue; cp = addr_data; cp2 = ifa->ifa_addr->sa_data; cp3 = ifa->ifa_netmask->sa_data; cplim = ifa->ifa_netmask->sa_len + (char *)ifa->ifa_netmask; while (cp3 < cplim) if ((*cp++ ^ *cp2++) & *cp3++) goto next; /* next address! */ /* * If the netmask of what we just found * is more specific than what we had before * (if we had one), or if the virtual status * of new prefix is better than of the old one, * then remember the new one before continuing * to search for an even better one. */ if (ifa_maybe == NULL || ifa_preferred(ifa_maybe, ifa) || rn_refines((caddr_t)ifa->ifa_netmask, (caddr_t)ifa_maybe->ifa_netmask)) { ifa_maybe = ifa; } } } } ifa = ifa_maybe; ifa_maybe = NULL; done: return (ifa); } /* * Find an interface address specific to an interface best matching * a given address. */ struct ifaddr * ifaof_ifpforaddr(const struct sockaddr *addr, struct ifnet *ifp) { struct ifaddr *ifa; const char *cp, *cp2, *cp3; char *cplim; struct ifaddr *ifa_maybe = NULL; u_int af = addr->sa_family; if (af >= AF_MAX) return (NULL); NET_EPOCH_ASSERT(); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != af) continue; if (ifa_maybe == NULL) ifa_maybe = ifa; if (ifa->ifa_netmask == 0) { if (sa_equal(addr, ifa->ifa_addr) || (ifa->ifa_dstaddr && sa_equal(addr, ifa->ifa_dstaddr))) goto done; continue; } if (ifp->if_flags & IFF_POINTOPOINT) { if (ifa->ifa_dstaddr && sa_equal(addr, ifa->ifa_dstaddr)) goto done; } else { cp = addr->sa_data; cp2 = ifa->ifa_addr->sa_data; cp3 = ifa->ifa_netmask->sa_data; cplim = ifa->ifa_netmask->sa_len + (char *)ifa->ifa_netmask; for (; cp3 < cplim; cp3++) if ((*cp++ ^ *cp2++) & *cp3) break; if (cp3 == cplim) goto done; } } ifa = ifa_maybe; done: return (ifa); } /* * See whether new ifa is better than current one: * 1) A non-virtual one is preferred over virtual. * 2) A virtual in master state preferred over any other state. * * Used in several address selecting functions. */ int ifa_preferred(struct ifaddr *cur, struct ifaddr *next) { return (cur->ifa_carp && (!next->ifa_carp || ((*carp_master_p)(next) && !(*carp_master_p)(cur)))); } struct sockaddr_dl * link_alloc_sdl(size_t size, int flags) { return (malloc(size, M_TEMP, flags)); } void link_free_sdl(struct sockaddr *sa) { free(sa, M_TEMP); } /* * Fills in given sdl with interface basic info. * Returns pointer to filled sdl. */ struct sockaddr_dl * link_init_sdl(struct ifnet *ifp, struct sockaddr *paddr, u_char iftype) { struct sockaddr_dl *sdl; sdl = (struct sockaddr_dl *)paddr; memset(sdl, 0, sizeof(struct sockaddr_dl)); sdl->sdl_len = sizeof(struct sockaddr_dl); sdl->sdl_family = AF_LINK; sdl->sdl_index = ifp->if_index; sdl->sdl_type = iftype; return (sdl); } /* * Mark an interface down and notify protocols of * the transition. */ static void if_unroute(struct ifnet *ifp, int flag, int fam) { KASSERT(flag == IFF_UP, ("if_unroute: flag != IFF_UP")); ifp->if_flags &= ~flag; getmicrotime(&ifp->if_lastchange); ifp->if_qflush(ifp); if (ifp->if_carp) (*carp_linkstate_p)(ifp); rt_ifmsg(ifp, IFF_UP); } void (*vlan_link_state_p)(struct ifnet *); /* XXX: private from if_vlan */ void (*vlan_trunk_cap_p)(struct ifnet *); /* XXX: private from if_vlan */ struct ifnet *(*vlan_trunkdev_p)(struct ifnet *); struct ifnet *(*vlan_devat_p)(struct ifnet *, uint16_t); int (*vlan_tag_p)(struct ifnet *, uint16_t *); int (*vlan_pcp_p)(struct ifnet *, uint16_t *); int (*vlan_setcookie_p)(struct ifnet *, void *); void *(*vlan_cookie_p)(struct ifnet *); /* * Handle a change in the interface link state. To avoid LORs * between driver lock and upper layer locks, as well as possible * recursions, we post event to taskqueue, and all job * is done in static do_link_state_change(). */ void if_link_state_change(struct ifnet *ifp, int link_state) { /* Return if state hasn't changed. */ if (ifp->if_link_state == link_state) return; ifp->if_link_state = link_state; /* XXXGL: reference ifp? */ taskqueue_enqueue(taskqueue_swi, &ifp->if_linktask); } static void do_link_state_change(void *arg, int pending) { struct ifnet *ifp; int link_state; ifp = arg; link_state = ifp->if_link_state; CURVNET_SET(ifp->if_vnet); rt_ifmsg(ifp, 0); if (ifp->if_vlantrunk != NULL) (*vlan_link_state_p)(ifp); if ((ifp->if_type == IFT_ETHER || ifp->if_type == IFT_L2VLAN) && ifp->if_l2com != NULL) (*ng_ether_link_state_p)(ifp, link_state); if (ifp->if_carp) (*carp_linkstate_p)(ifp); if (ifp->if_bridge) ifp->if_bridge_linkstate(ifp); if (ifp->if_lagg) (*lagg_linkstate_p)(ifp, link_state); if (IS_DEFAULT_VNET(curvnet)) devctl_notify("IFNET", ifp->if_xname, (link_state == LINK_STATE_UP) ? "LINK_UP" : "LINK_DOWN", NULL); if (pending > 1) if_printf(ifp, "%d link states coalesced\n", pending); if (log_link_state_change) if_printf(ifp, "link state changed to %s\n", (link_state == LINK_STATE_UP) ? "UP" : "DOWN" ); EVENTHANDLER_INVOKE(ifnet_link_event, ifp, link_state); CURVNET_RESTORE(); } /* * Mark an interface down and notify protocols of * the transition. */ void if_down(struct ifnet *ifp) { EVENTHANDLER_INVOKE(ifnet_event, ifp, IFNET_EVENT_DOWN); if_unroute(ifp, IFF_UP, AF_UNSPEC); } /* * Mark an interface up and notify protocols of * the transition. */ void if_up(struct ifnet *ifp) { ifp->if_flags |= IFF_UP; getmicrotime(&ifp->if_lastchange); if (ifp->if_carp) (*carp_linkstate_p)(ifp); rt_ifmsg(ifp, IFF_UP); EVENTHANDLER_INVOKE(ifnet_event, ifp, IFNET_EVENT_UP); } /* * Flush an interface queue. */ void if_qflush(struct ifnet *ifp) { struct mbuf *m, *n; struct ifaltq *ifq; ifq = &ifp->if_snd; IFQ_LOCK(ifq); #ifdef ALTQ if (ALTQ_IS_ENABLED(ifq)) ALTQ_PURGE(ifq); #endif n = ifq->ifq_head; while ((m = n) != NULL) { n = m->m_nextpkt; m_freem(m); } ifq->ifq_head = 0; ifq->ifq_tail = 0; ifq->ifq_len = 0; IFQ_UNLOCK(ifq); } /* * Map interface name to interface structure pointer, with or without * returning a reference. */ struct ifnet * ifunit_ref(const char *name) { struct epoch_tracker et; struct ifnet *ifp; NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (strncmp(name, ifp->if_xname, IFNAMSIZ) == 0 && !(ifp->if_flags & IFF_DYING)) break; } if (ifp != NULL) { if_ref(ifp); MPASS(ifindex_table[ifp->if_index].ife_ifnet == ifp); } NET_EPOCH_EXIT(et); return (ifp); } struct ifnet * ifunit(const char *name) { struct epoch_tracker et; struct ifnet *ifp; NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (strncmp(name, ifp->if_xname, IFNAMSIZ) == 0) break; } NET_EPOCH_EXIT(et); return (ifp); } void * ifr_buffer_get_buffer(void *data) { union ifreq_union *ifrup; ifrup = data; #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) return ((void *)(uintptr_t) ifrup->ifr32.ifr_ifru.ifru_buffer.buffer); #endif return (ifrup->ifr.ifr_ifru.ifru_buffer.buffer); } static void ifr_buffer_set_buffer_null(void *data) { union ifreq_union *ifrup; ifrup = data; #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) ifrup->ifr32.ifr_ifru.ifru_buffer.buffer = 0; else #endif ifrup->ifr.ifr_ifru.ifru_buffer.buffer = NULL; } size_t ifr_buffer_get_length(void *data) { union ifreq_union *ifrup; ifrup = data; #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) return (ifrup->ifr32.ifr_ifru.ifru_buffer.length); #endif return (ifrup->ifr.ifr_ifru.ifru_buffer.length); } static void ifr_buffer_set_length(void *data, size_t len) { union ifreq_union *ifrup; ifrup = data; #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) ifrup->ifr32.ifr_ifru.ifru_buffer.length = len; else #endif ifrup->ifr.ifr_ifru.ifru_buffer.length = len; } void * ifr_data_get_ptr(void *ifrp) { union ifreq_union *ifrup; ifrup = ifrp; #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) return ((void *)(uintptr_t) ifrup->ifr32.ifr_ifru.ifru_data); #endif return (ifrup->ifr.ifr_ifru.ifru_data); } struct ifcap_nv_bit_name { uint64_t cap_bit; const char *cap_name; }; #define CAPNV(x) {.cap_bit = IFCAP_##x, \ .cap_name = __CONCAT(IFCAP_, __CONCAT(x, _NAME)) } const struct ifcap_nv_bit_name ifcap_nv_bit_names[] = { CAPNV(RXCSUM), CAPNV(TXCSUM), CAPNV(NETCONS), CAPNV(VLAN_MTU), CAPNV(VLAN_HWTAGGING), CAPNV(JUMBO_MTU), CAPNV(POLLING), CAPNV(VLAN_HWCSUM), CAPNV(TSO4), CAPNV(TSO6), CAPNV(LRO), CAPNV(WOL_UCAST), CAPNV(WOL_MCAST), CAPNV(WOL_MAGIC), CAPNV(TOE4), CAPNV(TOE6), CAPNV(VLAN_HWFILTER), CAPNV(VLAN_HWTSO), CAPNV(LINKSTATE), CAPNV(NETMAP), CAPNV(RXCSUM_IPV6), CAPNV(TXCSUM_IPV6), CAPNV(HWSTATS), CAPNV(TXRTLMT), CAPNV(HWRXTSTMP), CAPNV(MEXTPG), CAPNV(TXTLS4), CAPNV(TXTLS6), CAPNV(VXLAN_HWCSUM), CAPNV(VXLAN_HWTSO), CAPNV(TXTLS_RTLMT), {0, NULL} }; #define CAP2NV(x) {.cap_bit = IFCAP2_BIT(IFCAP2_##x), \ .cap_name = __CONCAT(IFCAP2_, __CONCAT(x, _NAME)) } const struct ifcap_nv_bit_name ifcap2_nv_bit_names[] = { CAP2NV(RXTLS4), CAP2NV(RXTLS6), {0, NULL} }; #undef CAPNV #undef CAP2NV int if_capnv_to_capint(const nvlist_t *nv, int *old_cap, const struct ifcap_nv_bit_name *nn, bool all) { int i, res; res = 0; for (i = 0; nn[i].cap_name != NULL; i++) { if (nvlist_exists_bool(nv, nn[i].cap_name)) { if (all || nvlist_get_bool(nv, nn[i].cap_name)) res |= nn[i].cap_bit; } else { res |= *old_cap & nn[i].cap_bit; } } return (res); } void if_capint_to_capnv(nvlist_t *nv, const struct ifcap_nv_bit_name *nn, int ifr_cap, int ifr_req) { int i; for (i = 0; nn[i].cap_name != NULL; i++) { if ((nn[i].cap_bit & ifr_cap) != 0) { nvlist_add_bool(nv, nn[i].cap_name, (nn[i].cap_bit & ifr_req) != 0); } } } /* * Hardware specific interface ioctls. */ int ifhwioctl(u_long cmd, struct ifnet *ifp, caddr_t data, struct thread *td) { struct ifreq *ifr; int error = 0, do_ifup = 0; int new_flags, temp_flags; size_t descrlen, nvbuflen; char *descrbuf; char new_name[IFNAMSIZ]; void *buf; nvlist_t *nvcap; struct siocsifcapnv_driver_data drv_ioctl_data; ifr = (struct ifreq *)data; switch (cmd) { case SIOCGIFINDEX: ifr->ifr_index = ifp->if_index; break; case SIOCGIFFLAGS: temp_flags = ifp->if_flags | ifp->if_drv_flags; ifr->ifr_flags = temp_flags & 0xffff; ifr->ifr_flagshigh = temp_flags >> 16; break; case SIOCGIFCAP: ifr->ifr_reqcap = ifp->if_capabilities; ifr->ifr_curcap = ifp->if_capenable; break; case SIOCGIFCAPNV: if ((ifp->if_capabilities & IFCAP_NV) == 0) { error = EINVAL; break; } buf = NULL; nvcap = nvlist_create(0); for (;;) { if_capint_to_capnv(nvcap, ifcap_nv_bit_names, ifp->if_capabilities, ifp->if_capenable); if_capint_to_capnv(nvcap, ifcap2_nv_bit_names, ifp->if_capabilities2, ifp->if_capenable2); error = (*ifp->if_ioctl)(ifp, SIOCGIFCAPNV, __DECONST(caddr_t, nvcap)); if (error != 0) { if_printf(ifp, "SIOCGIFCAPNV driver mistake: nvlist error %d\n", error); break; } buf = nvlist_pack(nvcap, &nvbuflen); if (buf == NULL) { error = nvlist_error(nvcap); if (error == 0) error = EDOOFUS; break; } if (nvbuflen > ifr->ifr_cap_nv.buf_length) { ifr->ifr_cap_nv.length = nvbuflen; ifr->ifr_cap_nv.buffer = NULL; error = EFBIG; break; } ifr->ifr_cap_nv.length = nvbuflen; error = copyout(buf, ifr->ifr_cap_nv.buffer, nvbuflen); break; } free(buf, M_NVLIST); nvlist_destroy(nvcap); break; case SIOCGIFDATA: { struct if_data ifd; /* Ensure uninitialised padding is not leaked. */ memset(&ifd, 0, sizeof(ifd)); if_data_copy(ifp, &ifd); error = copyout(&ifd, ifr_data_get_ptr(ifr), sizeof(ifd)); break; } #ifdef MAC case SIOCGIFMAC: error = mac_ifnet_ioctl_get(td->td_ucred, ifr, ifp); break; #endif case SIOCGIFMETRIC: ifr->ifr_metric = ifp->if_metric; break; case SIOCGIFMTU: ifr->ifr_mtu = ifp->if_mtu; break; case SIOCGIFPHYS: /* XXXGL: did this ever worked? */ ifr->ifr_phys = 0; break; case SIOCGIFDESCR: error = 0; sx_slock(&ifdescr_sx); if (ifp->if_description == NULL) error = ENOMSG; else { /* space for terminating nul */ descrlen = strlen(ifp->if_description) + 1; if (ifr_buffer_get_length(ifr) < descrlen) ifr_buffer_set_buffer_null(ifr); else error = copyout(ifp->if_description, ifr_buffer_get_buffer(ifr), descrlen); ifr_buffer_set_length(ifr, descrlen); } sx_sunlock(&ifdescr_sx); break; case SIOCSIFDESCR: error = priv_check(td, PRIV_NET_SETIFDESCR); if (error) return (error); /* * Copy only (length-1) bytes to make sure that * if_description is always nul terminated. The * length parameter is supposed to count the * terminating nul in. */ if (ifr_buffer_get_length(ifr) > ifdescr_maxlen) return (ENAMETOOLONG); else if (ifr_buffer_get_length(ifr) == 0) descrbuf = NULL; else { descrbuf = if_allocdescr(ifr_buffer_get_length(ifr), M_WAITOK); error = copyin(ifr_buffer_get_buffer(ifr), descrbuf, ifr_buffer_get_length(ifr) - 1); if (error) { if_freedescr(descrbuf); break; } } if_setdescr(ifp, descrbuf); getmicrotime(&ifp->if_lastchange); break; case SIOCGIFFIB: ifr->ifr_fib = ifp->if_fib; break; case SIOCSIFFIB: error = priv_check(td, PRIV_NET_SETIFFIB); if (error) return (error); if (ifr->ifr_fib >= rt_numfibs) return (EINVAL); ifp->if_fib = ifr->ifr_fib; break; case SIOCSIFFLAGS: error = priv_check(td, PRIV_NET_SETIFFLAGS); if (error) return (error); /* * Currently, no driver owned flags pass the IFF_CANTCHANGE * check, so we don't need special handling here yet. */ new_flags = (ifr->ifr_flags & 0xffff) | (ifr->ifr_flagshigh << 16); if (ifp->if_flags & IFF_UP && (new_flags & IFF_UP) == 0) { if_down(ifp); } else if (new_flags & IFF_UP && (ifp->if_flags & IFF_UP) == 0) { do_ifup = 1; } /* See if permanently promiscuous mode bit is about to flip */ if ((ifp->if_flags ^ new_flags) & IFF_PPROMISC) { if (new_flags & IFF_PPROMISC) ifp->if_flags |= IFF_PROMISC; else if (ifp->if_pcount == 0) ifp->if_flags &= ~IFF_PROMISC; if (log_promisc_mode_change) if_printf(ifp, "permanently promiscuous mode %s\n", ((new_flags & IFF_PPROMISC) ? "enabled" : "disabled")); } ifp->if_flags = (ifp->if_flags & IFF_CANTCHANGE) | (new_flags &~ IFF_CANTCHANGE); if (ifp->if_ioctl) { (void) (*ifp->if_ioctl)(ifp, cmd, data); } if (do_ifup) if_up(ifp); getmicrotime(&ifp->if_lastchange); break; case SIOCSIFCAP: error = priv_check(td, PRIV_NET_SETIFCAP); if (error != 0) return (error); if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); if (ifr->ifr_reqcap & ~ifp->if_capabilities) return (EINVAL); error = (*ifp->if_ioctl)(ifp, cmd, data); if (error == 0) getmicrotime(&ifp->if_lastchange); break; case SIOCSIFCAPNV: error = priv_check(td, PRIV_NET_SETIFCAP); if (error != 0) return (error); if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); if ((ifp->if_capabilities & IFCAP_NV) == 0) return (EINVAL); if (ifr->ifr_cap_nv.length > IFR_CAP_NV_MAXBUFSIZE) return (EINVAL); nvcap = NULL; buf = malloc(ifr->ifr_cap_nv.length, M_TEMP, M_WAITOK); for (;;) { error = copyin(ifr->ifr_cap_nv.buffer, buf, ifr->ifr_cap_nv.length); if (error != 0) break; nvcap = nvlist_unpack(buf, ifr->ifr_cap_nv.length, 0); if (nvcap == NULL) { error = EINVAL; break; } drv_ioctl_data.reqcap = if_capnv_to_capint(nvcap, &ifp->if_capenable, ifcap_nv_bit_names, false); if ((drv_ioctl_data.reqcap & ~ifp->if_capabilities) != 0) { error = EINVAL; break; } drv_ioctl_data.reqcap2 = if_capnv_to_capint(nvcap, &ifp->if_capenable2, ifcap2_nv_bit_names, false); if ((drv_ioctl_data.reqcap2 & ~ifp->if_capabilities2) != 0) { error = EINVAL; break; } drv_ioctl_data.nvcap = nvcap; error = (*ifp->if_ioctl)(ifp, SIOCSIFCAPNV, (caddr_t)&drv_ioctl_data); break; } nvlist_destroy(nvcap); free(buf, M_TEMP); if (error == 0) getmicrotime(&ifp->if_lastchange); break; #ifdef MAC case SIOCSIFMAC: error = mac_ifnet_ioctl_set(td->td_ucred, ifr, ifp); break; #endif case SIOCSIFNAME: error = priv_check(td, PRIV_NET_SETIFNAME); if (error) return (error); error = copyinstr(ifr_data_get_ptr(ifr), new_name, IFNAMSIZ, NULL); if (error != 0) return (error); error = if_rename(ifp, new_name); break; #ifdef VIMAGE case SIOCSIFVNET: error = priv_check(td, PRIV_NET_SETIFVNET); if (error) return (error); error = if_vmove_loan(td, ifp, ifr->ifr_name, ifr->ifr_jid); break; #endif case SIOCSIFMETRIC: error = priv_check(td, PRIV_NET_SETIFMETRIC); if (error) return (error); ifp->if_metric = ifr->ifr_metric; getmicrotime(&ifp->if_lastchange); break; case SIOCSIFPHYS: error = priv_check(td, PRIV_NET_SETIFPHYS); if (error) return (error); if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); error = (*ifp->if_ioctl)(ifp, cmd, data); if (error == 0) getmicrotime(&ifp->if_lastchange); break; case SIOCSIFMTU: { u_long oldmtu = ifp->if_mtu; error = priv_check(td, PRIV_NET_SETIFMTU); if (error) return (error); if (ifr->ifr_mtu < IF_MINMTU || ifr->ifr_mtu > IF_MAXMTU) return (EINVAL); if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); /* Disallow MTU changes on bridge member interfaces. */ if (ifp->if_bridge) return (EOPNOTSUPP); error = (*ifp->if_ioctl)(ifp, cmd, data); if (error == 0) { getmicrotime(&ifp->if_lastchange); rt_ifmsg(ifp, 0); #ifdef INET DEBUGNET_NOTIFY_MTU(ifp); #endif } /* * If the link MTU changed, do network layer specific procedure. */ if (ifp->if_mtu != oldmtu) if_notifymtu(ifp); break; } case SIOCADDMULTI: case SIOCDELMULTI: if (cmd == SIOCADDMULTI) error = priv_check(td, PRIV_NET_ADDMULTI); else error = priv_check(td, PRIV_NET_DELMULTI); if (error) return (error); /* Don't allow group membership on non-multicast interfaces. */ if ((ifp->if_flags & IFF_MULTICAST) == 0) return (EOPNOTSUPP); /* Don't let users screw up protocols' entries. */ if (ifr->ifr_addr.sa_family != AF_LINK) return (EINVAL); if (cmd == SIOCADDMULTI) { struct epoch_tracker et; struct ifmultiaddr *ifma; /* * Userland is only permitted to join groups once * via the if_addmulti() KPI, because it cannot hold * struct ifmultiaddr * between calls. It may also * lose a race while we check if the membership * already exists. */ NET_EPOCH_ENTER(et); ifma = if_findmulti(ifp, &ifr->ifr_addr); NET_EPOCH_EXIT(et); if (ifma != NULL) error = EADDRINUSE; else error = if_addmulti(ifp, &ifr->ifr_addr, &ifma); } else { error = if_delmulti(ifp, &ifr->ifr_addr); } if (error == 0) getmicrotime(&ifp->if_lastchange); break; case SIOCSIFPHYADDR: case SIOCDIFPHYADDR: #ifdef INET6 case SIOCSIFPHYADDR_IN6: #endif case SIOCSIFMEDIA: case SIOCSIFGENERIC: error = priv_check(td, PRIV_NET_HWIOCTL); if (error) return (error); if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); error = (*ifp->if_ioctl)(ifp, cmd, data); if (error == 0) getmicrotime(&ifp->if_lastchange); break; case SIOCGIFSTATUS: case SIOCGIFPSRCADDR: case SIOCGIFPDSTADDR: case SIOCGIFMEDIA: case SIOCGIFXMEDIA: case SIOCGIFGENERIC: case SIOCGIFRSSKEY: case SIOCGIFRSSHASH: case SIOCGIFDOWNREASON: if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); error = (*ifp->if_ioctl)(ifp, cmd, data); break; case SIOCSIFLLADDR: error = priv_check(td, PRIV_NET_SETLLADDR); if (error) return (error); error = if_setlladdr(ifp, ifr->ifr_addr.sa_data, ifr->ifr_addr.sa_len); break; case SIOCGHWADDR: error = if_gethwaddr(ifp, ifr); break; case SIOCAIFGROUP: error = priv_check(td, PRIV_NET_ADDIFGROUP); if (error) return (error); error = if_addgroup(ifp, ((struct ifgroupreq *)data)->ifgr_group); if (error != 0) return (error); break; case SIOCGIFGROUP: { struct epoch_tracker et; NET_EPOCH_ENTER(et); error = if_getgroup((struct ifgroupreq *)data, ifp); NET_EPOCH_EXIT(et); break; } case SIOCDIFGROUP: error = priv_check(td, PRIV_NET_DELIFGROUP); if (error) return (error); error = if_delgroup(ifp, ((struct ifgroupreq *)data)->ifgr_group); if (error != 0) return (error); break; default: error = ENOIOCTL; break; } return (error); } /* * Interface ioctls. */ int ifioctl(struct socket *so, u_long cmd, caddr_t data, struct thread *td) { #ifdef COMPAT_FREEBSD32 union { struct ifconf ifc; struct ifdrv ifd; struct ifgroupreq ifgr; struct ifmediareq ifmr; } thunk; u_long saved_cmd; struct ifconf32 *ifc32; struct ifdrv32 *ifd32; struct ifgroupreq32 *ifgr32; struct ifmediareq32 *ifmr32; #endif struct ifnet *ifp; struct ifreq *ifr; int error; int oif_flags; #ifdef VIMAGE bool shutdown; #endif CURVNET_SET(so->so_vnet); #ifdef VIMAGE /* Make sure the VNET is stable. */ shutdown = VNET_IS_SHUTTING_DOWN(so->so_vnet); if (shutdown) { CURVNET_RESTORE(); return (EBUSY); } #endif #ifdef COMPAT_FREEBSD32 saved_cmd = cmd; switch (cmd) { case SIOCGIFCONF32: ifc32 = (struct ifconf32 *)data; thunk.ifc.ifc_len = ifc32->ifc_len; thunk.ifc.ifc_buf = PTRIN(ifc32->ifc_buf); data = (caddr_t)&thunk.ifc; cmd = SIOCGIFCONF; break; case SIOCGDRVSPEC32: case SIOCSDRVSPEC32: ifd32 = (struct ifdrv32 *)data; memcpy(thunk.ifd.ifd_name, ifd32->ifd_name, sizeof(thunk.ifd.ifd_name)); thunk.ifd.ifd_cmd = ifd32->ifd_cmd; thunk.ifd.ifd_len = ifd32->ifd_len; thunk.ifd.ifd_data = PTRIN(ifd32->ifd_data); data = (caddr_t)&thunk.ifd; cmd = _IOC_NEWTYPE(cmd, struct ifdrv); break; case SIOCAIFGROUP32: case SIOCGIFGROUP32: case SIOCDIFGROUP32: case SIOCGIFGMEMB32: ifgr32 = (struct ifgroupreq32 *)data; memcpy(thunk.ifgr.ifgr_name, ifgr32->ifgr_name, sizeof(thunk.ifgr.ifgr_name)); thunk.ifgr.ifgr_len = ifgr32->ifgr_len; switch (cmd) { case SIOCAIFGROUP32: case SIOCDIFGROUP32: memcpy(thunk.ifgr.ifgr_group, ifgr32->ifgr_group, sizeof(thunk.ifgr.ifgr_group)); break; case SIOCGIFGROUP32: case SIOCGIFGMEMB32: thunk.ifgr.ifgr_groups = PTRIN(ifgr32->ifgr_groups); break; } data = (caddr_t)&thunk.ifgr; cmd = _IOC_NEWTYPE(cmd, struct ifgroupreq); break; case SIOCGIFMEDIA32: case SIOCGIFXMEDIA32: ifmr32 = (struct ifmediareq32 *)data; memcpy(thunk.ifmr.ifm_name, ifmr32->ifm_name, sizeof(thunk.ifmr.ifm_name)); thunk.ifmr.ifm_current = ifmr32->ifm_current; thunk.ifmr.ifm_mask = ifmr32->ifm_mask; thunk.ifmr.ifm_status = ifmr32->ifm_status; thunk.ifmr.ifm_active = ifmr32->ifm_active; thunk.ifmr.ifm_count = ifmr32->ifm_count; thunk.ifmr.ifm_ulist = PTRIN(ifmr32->ifm_ulist); data = (caddr_t)&thunk.ifmr; cmd = _IOC_NEWTYPE(cmd, struct ifmediareq); break; } #endif switch (cmd) { case SIOCGIFCONF: error = ifconf(cmd, data); goto out_noref; } ifr = (struct ifreq *)data; switch (cmd) { #ifdef VIMAGE case SIOCSIFRVNET: error = priv_check(td, PRIV_NET_SETIFVNET); if (error == 0) error = if_vmove_reclaim(td, ifr->ifr_name, ifr->ifr_jid); goto out_noref; #endif case SIOCIFCREATE: case SIOCIFCREATE2: error = priv_check(td, PRIV_NET_IFCREATE); if (error == 0) error = if_clone_create(ifr->ifr_name, sizeof(ifr->ifr_name), cmd == SIOCIFCREATE2 ? ifr_data_get_ptr(ifr) : NULL); goto out_noref; case SIOCIFDESTROY: error = priv_check(td, PRIV_NET_IFDESTROY); if (error == 0) { sx_xlock(&ifnet_detach_sxlock); error = if_clone_destroy(ifr->ifr_name); sx_xunlock(&ifnet_detach_sxlock); } goto out_noref; case SIOCIFGCLONERS: error = if_clone_list((struct if_clonereq *)data); goto out_noref; case SIOCGIFGMEMB: error = if_getgroupmembers((struct ifgroupreq *)data); goto out_noref; #if defined(INET) || defined(INET6) case SIOCSVH: case SIOCGVH: if (carp_ioctl_p == NULL) error = EPROTONOSUPPORT; else error = (*carp_ioctl_p)(ifr, cmd, td); goto out_noref; #endif } ifp = ifunit_ref(ifr->ifr_name); if (ifp == NULL) { error = ENXIO; goto out_noref; } error = ifhwioctl(cmd, ifp, data, td); if (error != ENOIOCTL) goto out_ref; oif_flags = ifp->if_flags; if (so->so_proto == NULL) { error = EOPNOTSUPP; goto out_ref; } /* * Pass the request on to the socket control method, and if the * latter returns EOPNOTSUPP, directly to the interface. * * Make an exception for the legacy SIOCSIF* requests. Drivers * trust SIOCSIFADDR et al to come from an already privileged * layer, and do not perform any credentials checks or input * validation. */ error = so->so_proto->pr_control(so, cmd, data, ifp, td); if (error == EOPNOTSUPP && ifp != NULL && ifp->if_ioctl != NULL && cmd != SIOCSIFADDR && cmd != SIOCSIFBRDADDR && cmd != SIOCSIFDSTADDR && cmd != SIOCSIFNETMASK) error = (*ifp->if_ioctl)(ifp, cmd, data); if (!(oif_flags & IFF_UP) && (ifp->if_flags & IFF_UP)) if_up(ifp); out_ref: if_rele(ifp); out_noref: CURVNET_RESTORE(); #ifdef COMPAT_FREEBSD32 if (error != 0) return (error); switch (saved_cmd) { case SIOCGIFCONF32: ifc32->ifc_len = thunk.ifc.ifc_len; break; case SIOCGDRVSPEC32: /* * SIOCGDRVSPEC is IOWR, but nothing actually touches * the struct so just assert that ifd_len (the only * field it might make sense to update) hasn't * changed. */ KASSERT(thunk.ifd.ifd_len == ifd32->ifd_len, ("ifd_len was updated %u -> %zu", ifd32->ifd_len, thunk.ifd.ifd_len)); break; case SIOCGIFGROUP32: case SIOCGIFGMEMB32: ifgr32->ifgr_len = thunk.ifgr.ifgr_len; break; case SIOCGIFMEDIA32: case SIOCGIFXMEDIA32: ifmr32->ifm_current = thunk.ifmr.ifm_current; ifmr32->ifm_mask = thunk.ifmr.ifm_mask; ifmr32->ifm_status = thunk.ifmr.ifm_status; ifmr32->ifm_active = thunk.ifmr.ifm_active; ifmr32->ifm_count = thunk.ifmr.ifm_count; break; } #endif return (error); } int if_rename(struct ifnet *ifp, char *new_name) { struct ifaddr *ifa; struct sockaddr_dl *sdl; size_t namelen, onamelen; char old_name[IFNAMSIZ]; char strbuf[IFNAMSIZ + 8]; if (new_name[0] == '\0') return (EINVAL); if (strcmp(new_name, ifp->if_xname) == 0) return (0); if (ifunit(new_name) != NULL) return (EEXIST); /* * XXX: Locking. Nothing else seems to lock if_flags, * and there are numerous other races with the * ifunit() checks not being atomic with namespace * changes (renames, vmoves, if_attach, etc). */ ifp->if_flags |= IFF_RENAMING; EVENTHANDLER_INVOKE(ifnet_departure_event, ifp); if_printf(ifp, "changing name to '%s'\n", new_name); IF_ADDR_WLOCK(ifp); strlcpy(old_name, ifp->if_xname, sizeof(old_name)); strlcpy(ifp->if_xname, new_name, sizeof(ifp->if_xname)); ifa = ifp->if_addr; sdl = (struct sockaddr_dl *)ifa->ifa_addr; namelen = strlen(new_name); onamelen = sdl->sdl_nlen; /* * Move the address if needed. This is safe because we * allocate space for a name of length IFNAMSIZ when we * create this in if_attach(). */ if (namelen != onamelen) { bcopy(sdl->sdl_data + onamelen, sdl->sdl_data + namelen, sdl->sdl_alen); } bcopy(new_name, sdl->sdl_data, namelen); sdl->sdl_nlen = namelen; sdl = (struct sockaddr_dl *)ifa->ifa_netmask; bzero(sdl->sdl_data, onamelen); while (namelen != 0) sdl->sdl_data[--namelen] = 0xff; IF_ADDR_WUNLOCK(ifp); EVENTHANDLER_INVOKE(ifnet_arrival_event, ifp); ifp->if_flags &= ~IFF_RENAMING; snprintf(strbuf, sizeof(strbuf), "name=%s", new_name); devctl_notify("IFNET", old_name, "RENAME", strbuf); return (0); } /* * The code common to handling reference counted flags, * e.g., in ifpromisc() and if_allmulti(). * The "pflag" argument can specify a permanent mode flag to check, * such as IFF_PPROMISC for promiscuous mode; should be 0 if none. * * Only to be used on stack-owned flags, not driver-owned flags. */ static int if_setflag(struct ifnet *ifp, int flag, int pflag, int *refcount, int onswitch) { struct ifreq ifr; int error; int oldflags, oldcount; /* Sanity checks to catch programming errors */ KASSERT((flag & (IFF_DRV_OACTIVE|IFF_DRV_RUNNING)) == 0, ("%s: setting driver-owned flag %d", __func__, flag)); if (onswitch) KASSERT(*refcount >= 0, ("%s: increment negative refcount %d for flag %d", __func__, *refcount, flag)); else KASSERT(*refcount > 0, ("%s: decrement non-positive refcount %d for flag %d", __func__, *refcount, flag)); /* In case this mode is permanent, just touch refcount */ if (ifp->if_flags & pflag) { *refcount += onswitch ? 1 : -1; return (0); } /* Save ifnet parameters for if_ioctl() may fail */ oldcount = *refcount; oldflags = ifp->if_flags; /* * See if we aren't the only and touching refcount is enough. * Actually toggle interface flag if we are the first or last. */ if (onswitch) { if ((*refcount)++) return (0); ifp->if_flags |= flag; } else { if (--(*refcount)) return (0); ifp->if_flags &= ~flag; } /* Call down the driver since we've changed interface flags */ if (ifp->if_ioctl == NULL) { error = EOPNOTSUPP; goto recover; } ifr.ifr_flags = ifp->if_flags & 0xffff; ifr.ifr_flagshigh = ifp->if_flags >> 16; error = (*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t)&ifr); if (error) goto recover; /* Notify userland that interface flags have changed */ rt_ifmsg(ifp, flag); return (0); recover: /* Recover after driver error */ *refcount = oldcount; ifp->if_flags = oldflags; return (error); } /* * Set/clear promiscuous mode on interface ifp based on the truth value * of pswitch. The calls are reference counted so that only the first * "on" request actually has an effect, as does the final "off" request. * Results are undefined if the "off" and "on" requests are not matched. */ int ifpromisc(struct ifnet *ifp, int pswitch) { int error; int oldflags = ifp->if_flags; error = if_setflag(ifp, IFF_PROMISC, IFF_PPROMISC, &ifp->if_pcount, pswitch); /* If promiscuous mode status has changed, log a message */ if (error == 0 && ((ifp->if_flags ^ oldflags) & IFF_PROMISC) && log_promisc_mode_change) if_printf(ifp, "promiscuous mode %s\n", (ifp->if_flags & IFF_PROMISC) ? "enabled" : "disabled"); return (error); } /* * Return interface configuration * of system. List may be used * in later ioctl's (above) to get * other information. */ /*ARGSUSED*/ static int ifconf(u_long cmd, caddr_t data) { struct ifconf *ifc = (struct ifconf *)data; struct ifnet *ifp; struct ifaddr *ifa; struct ifreq ifr; struct sbuf *sb; int error, full = 0, valid_len, max_len; /* Limit initial buffer size to maxphys to avoid DoS from userspace. */ max_len = maxphys - 1; /* Prevent hostile input from being able to crash the system */ if (ifc->ifc_len <= 0) return (EINVAL); again: if (ifc->ifc_len <= max_len) { max_len = ifc->ifc_len; full = 1; } sb = sbuf_new(NULL, NULL, max_len + 1, SBUF_FIXEDLEN); max_len = 0; valid_len = 0; IFNET_RLOCK(); CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { struct epoch_tracker et; int addrs; /* * Zero the ifr to make sure we don't disclose the contents * of the stack. */ memset(&ifr, 0, sizeof(ifr)); if (strlcpy(ifr.ifr_name, ifp->if_xname, sizeof(ifr.ifr_name)) >= sizeof(ifr.ifr_name)) { sbuf_delete(sb); IFNET_RUNLOCK(); return (ENAMETOOLONG); } addrs = 0; NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { struct sockaddr *sa = ifa->ifa_addr; if (prison_if(curthread->td_ucred, sa) != 0) continue; addrs++; if (sa->sa_len <= sizeof(*sa)) { if (sa->sa_len < sizeof(*sa)) { memset(&ifr.ifr_ifru.ifru_addr, 0, sizeof(ifr.ifr_ifru.ifru_addr)); memcpy(&ifr.ifr_ifru.ifru_addr, sa, sa->sa_len); } else ifr.ifr_ifru.ifru_addr = *sa; sbuf_bcat(sb, &ifr, sizeof(ifr)); max_len += sizeof(ifr); } else { sbuf_bcat(sb, &ifr, offsetof(struct ifreq, ifr_addr)); max_len += offsetof(struct ifreq, ifr_addr); sbuf_bcat(sb, sa, sa->sa_len); max_len += sa->sa_len; } if (sbuf_error(sb) == 0) valid_len = sbuf_len(sb); } NET_EPOCH_EXIT(et); if (addrs == 0) { sbuf_bcat(sb, &ifr, sizeof(ifr)); max_len += sizeof(ifr); if (sbuf_error(sb) == 0) valid_len = sbuf_len(sb); } } IFNET_RUNLOCK(); /* * If we didn't allocate enough space (uncommon), try again. If * we have already allocated as much space as we are allowed, * return what we've got. */ if (valid_len != max_len && !full) { sbuf_delete(sb); goto again; } ifc->ifc_len = valid_len; sbuf_finish(sb); error = copyout(sbuf_data(sb), ifc->ifc_req, ifc->ifc_len); sbuf_delete(sb); return (error); } /* * Just like ifpromisc(), but for all-multicast-reception mode. */ int if_allmulti(struct ifnet *ifp, int onswitch) { return (if_setflag(ifp, IFF_ALLMULTI, 0, &ifp->if_amcount, onswitch)); } struct ifmultiaddr * if_findmulti(struct ifnet *ifp, const struct sockaddr *sa) { struct ifmultiaddr *ifma; IF_ADDR_LOCK_ASSERT(ifp); CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (sa->sa_family == AF_LINK) { if (sa_dl_equal(ifma->ifma_addr, sa)) break; } else { if (sa_equal(ifma->ifma_addr, sa)) break; } } return ifma; } /* * Allocate a new ifmultiaddr and initialize based on passed arguments. We * make copies of passed sockaddrs. The ifmultiaddr will not be added to * the ifnet multicast address list here, so the caller must do that and * other setup work (such as notifying the device driver). The reference * count is initialized to 1. */ static struct ifmultiaddr * if_allocmulti(struct ifnet *ifp, struct sockaddr *sa, struct sockaddr *llsa, int mflags) { struct ifmultiaddr *ifma; struct sockaddr *dupsa; ifma = malloc(sizeof *ifma, M_IFMADDR, mflags | M_ZERO); if (ifma == NULL) return (NULL); dupsa = malloc(sa->sa_len, M_IFMADDR, mflags); if (dupsa == NULL) { free(ifma, M_IFMADDR); return (NULL); } bcopy(sa, dupsa, sa->sa_len); ifma->ifma_addr = dupsa; ifma->ifma_ifp = ifp; ifma->ifma_refcount = 1; ifma->ifma_protospec = NULL; if (llsa == NULL) { ifma->ifma_lladdr = NULL; return (ifma); } dupsa = malloc(llsa->sa_len, M_IFMADDR, mflags); if (dupsa == NULL) { free(ifma->ifma_addr, M_IFMADDR); free(ifma, M_IFMADDR); return (NULL); } bcopy(llsa, dupsa, llsa->sa_len); ifma->ifma_lladdr = dupsa; return (ifma); } /* * if_freemulti: free ifmultiaddr structure and possibly attached related * addresses. The caller is responsible for implementing reference * counting, notifying the driver, handling routing messages, and releasing * any dependent link layer state. */ #ifdef MCAST_VERBOSE extern void kdb_backtrace(void); #endif static void if_freemulti_internal(struct ifmultiaddr *ifma) { KASSERT(ifma->ifma_refcount == 0, ("if_freemulti: refcount %d", ifma->ifma_refcount)); if (ifma->ifma_lladdr != NULL) free(ifma->ifma_lladdr, M_IFMADDR); #ifdef MCAST_VERBOSE kdb_backtrace(); printf("%s freeing ifma: %p\n", __func__, ifma); #endif free(ifma->ifma_addr, M_IFMADDR); free(ifma, M_IFMADDR); } static void if_destroymulti(epoch_context_t ctx) { struct ifmultiaddr *ifma; ifma = __containerof(ctx, struct ifmultiaddr, ifma_epoch_ctx); if_freemulti_internal(ifma); } void if_freemulti(struct ifmultiaddr *ifma) { KASSERT(ifma->ifma_refcount == 0, ("if_freemulti_epoch: refcount %d", ifma->ifma_refcount)); NET_EPOCH_CALL(if_destroymulti, &ifma->ifma_epoch_ctx); } /* * Register an additional multicast address with a network interface. * * - If the address is already present, bump the reference count on the * address and return. * - If the address is not link-layer, look up a link layer address. * - Allocate address structures for one or both addresses, and attach to the * multicast address list on the interface. If automatically adding a link * layer address, the protocol address will own a reference to the link * layer address, to be freed when it is freed. * - Notify the network device driver of an addition to the multicast address * list. * * 'sa' points to caller-owned memory with the desired multicast address. * * 'retifma' will be used to return a pointer to the resulting multicast * address reference, if desired. */ int if_addmulti(struct ifnet *ifp, struct sockaddr *sa, struct ifmultiaddr **retifma) { struct ifmultiaddr *ifma, *ll_ifma; struct sockaddr *llsa; struct sockaddr_dl sdl; int error; #ifdef INET IN_MULTI_LIST_UNLOCK_ASSERT(); #endif #ifdef INET6 IN6_MULTI_LIST_UNLOCK_ASSERT(); #endif /* * If the address is already present, return a new reference to it; * otherwise, allocate storage and set up a new address. */ IF_ADDR_WLOCK(ifp); ifma = if_findmulti(ifp, sa); if (ifma != NULL) { ifma->ifma_refcount++; if (retifma != NULL) *retifma = ifma; IF_ADDR_WUNLOCK(ifp); return (0); } /* * The address isn't already present; resolve the protocol address * into a link layer address, and then look that up, bump its * refcount or allocate an ifma for that also. * Most link layer resolving functions returns address data which * fits inside default sockaddr_dl structure. However callback * can allocate another sockaddr structure, in that case we need to * free it later. */ llsa = NULL; ll_ifma = NULL; if (ifp->if_resolvemulti != NULL) { /* Provide called function with buffer size information */ sdl.sdl_len = sizeof(sdl); llsa = (struct sockaddr *)&sdl; error = ifp->if_resolvemulti(ifp, &llsa, sa); if (error) goto unlock_out; } /* * Allocate the new address. Don't hook it up yet, as we may also * need to allocate a link layer multicast address. */ ifma = if_allocmulti(ifp, sa, llsa, M_NOWAIT); if (ifma == NULL) { error = ENOMEM; goto free_llsa_out; } /* * If a link layer address is found, we'll need to see if it's * already present in the address list, or allocate is as well. * When this block finishes, the link layer address will be on the * list. */ if (llsa != NULL) { ll_ifma = if_findmulti(ifp, llsa); if (ll_ifma == NULL) { ll_ifma = if_allocmulti(ifp, llsa, NULL, M_NOWAIT); if (ll_ifma == NULL) { --ifma->ifma_refcount; if_freemulti(ifma); error = ENOMEM; goto free_llsa_out; } ll_ifma->ifma_flags |= IFMA_F_ENQUEUED; CK_STAILQ_INSERT_HEAD(&ifp->if_multiaddrs, ll_ifma, ifma_link); } else ll_ifma->ifma_refcount++; ifma->ifma_llifma = ll_ifma; } /* * We now have a new multicast address, ifma, and possibly a new or * referenced link layer address. Add the primary address to the * ifnet address list. */ ifma->ifma_flags |= IFMA_F_ENQUEUED; CK_STAILQ_INSERT_HEAD(&ifp->if_multiaddrs, ifma, ifma_link); if (retifma != NULL) *retifma = ifma; /* * Must generate the message while holding the lock so that 'ifma' * pointer is still valid. */ rt_newmaddrmsg(RTM_NEWMADDR, ifma); IF_ADDR_WUNLOCK(ifp); /* * We are certain we have added something, so call down to the * interface to let them know about it. */ if (ifp->if_ioctl != NULL) { if (THREAD_CAN_SLEEP()) (void )(*ifp->if_ioctl)(ifp, SIOCADDMULTI, 0); else taskqueue_enqueue(taskqueue_swi, &ifp->if_addmultitask); } if ((llsa != NULL) && (llsa != (struct sockaddr *)&sdl)) link_free_sdl(llsa); return (0); free_llsa_out: if ((llsa != NULL) && (llsa != (struct sockaddr *)&sdl)) link_free_sdl(llsa); unlock_out: IF_ADDR_WUNLOCK(ifp); return (error); } static void if_siocaddmulti(void *arg, int pending) { struct ifnet *ifp; ifp = arg; #ifdef DIAGNOSTIC if (pending > 1) if_printf(ifp, "%d SIOCADDMULTI coalesced\n", pending); #endif CURVNET_SET(ifp->if_vnet); (void )(*ifp->if_ioctl)(ifp, SIOCADDMULTI, 0); CURVNET_RESTORE(); } /* * Delete a multicast group membership by network-layer group address. * * Returns ENOENT if the entry could not be found. If ifp no longer * exists, results are undefined. This entry point should only be used * from subsystems which do appropriate locking to hold ifp for the * duration of the call. * Network-layer protocol domains must use if_delmulti_ifma(). */ int if_delmulti(struct ifnet *ifp, struct sockaddr *sa) { struct ifmultiaddr *ifma; int lastref; KASSERT(ifp, ("%s: NULL ifp", __func__)); IF_ADDR_WLOCK(ifp); lastref = 0; ifma = if_findmulti(ifp, sa); if (ifma != NULL) lastref = if_delmulti_locked(ifp, ifma, 0); IF_ADDR_WUNLOCK(ifp); if (ifma == NULL) return (ENOENT); if (lastref && ifp->if_ioctl != NULL) { (void)(*ifp->if_ioctl)(ifp, SIOCDELMULTI, 0); } return (0); } /* * Delete all multicast group membership for an interface. * Should be used to quickly flush all multicast filters. */ void if_delallmulti(struct ifnet *ifp) { struct ifmultiaddr *ifma; struct ifmultiaddr *next; IF_ADDR_WLOCK(ifp); CK_STAILQ_FOREACH_SAFE(ifma, &ifp->if_multiaddrs, ifma_link, next) if_delmulti_locked(ifp, ifma, 0); IF_ADDR_WUNLOCK(ifp); } void if_delmulti_ifma(struct ifmultiaddr *ifma) { if_delmulti_ifma_flags(ifma, 0); } /* * Delete a multicast group membership by group membership pointer. * Network-layer protocol domains must use this routine. * * It is safe to call this routine if the ifp disappeared. */ void if_delmulti_ifma_flags(struct ifmultiaddr *ifma, int flags) { struct ifnet *ifp; int lastref; MCDPRINTF("%s freeing ifma: %p\n", __func__, ifma); #ifdef INET IN_MULTI_LIST_UNLOCK_ASSERT(); #endif ifp = ifma->ifma_ifp; #ifdef DIAGNOSTIC if (ifp == NULL) { printf("%s: ifma_ifp seems to be detached\n", __func__); } else { struct epoch_tracker et; struct ifnet *oifp; NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(oifp, &V_ifnet, if_link) if (ifp == oifp) break; NET_EPOCH_EXIT(et); if (ifp != oifp) ifp = NULL; } #endif /* * If and only if the ifnet instance exists: Acquire the address lock. */ if (ifp != NULL) IF_ADDR_WLOCK(ifp); lastref = if_delmulti_locked(ifp, ifma, flags); if (ifp != NULL) { /* * If and only if the ifnet instance exists: * Release the address lock. * If the group was left: update the hardware hash filter. */ IF_ADDR_WUNLOCK(ifp); if (lastref && ifp->if_ioctl != NULL) { (void)(*ifp->if_ioctl)(ifp, SIOCDELMULTI, 0); } } } /* * Perform deletion of network-layer and/or link-layer multicast address. * * Return 0 if the reference count was decremented. * Return 1 if the final reference was released, indicating that the * hardware hash filter should be reprogrammed. */ static int if_delmulti_locked(struct ifnet *ifp, struct ifmultiaddr *ifma, int detaching) { struct ifmultiaddr *ll_ifma; if (ifp != NULL && ifma->ifma_ifp != NULL) { KASSERT(ifma->ifma_ifp == ifp, ("%s: inconsistent ifp %p", __func__, ifp)); IF_ADDR_WLOCK_ASSERT(ifp); } ifp = ifma->ifma_ifp; MCDPRINTF("%s freeing %p from %s \n", __func__, ifma, ifp ? ifp->if_xname : ""); /* * If the ifnet is detaching, null out references to ifnet, * so that upper protocol layers will notice, and not attempt * to obtain locks for an ifnet which no longer exists. The * routing socket announcement must happen before the ifnet * instance is detached from the system. */ if (detaching) { #ifdef DIAGNOSTIC printf("%s: detaching ifnet instance %p\n", __func__, ifp); #endif /* * ifp may already be nulled out if we are being reentered * to delete the ll_ifma. */ if (ifp != NULL) { rt_newmaddrmsg(RTM_DELMADDR, ifma); ifma->ifma_ifp = NULL; } } if (--ifma->ifma_refcount > 0) return 0; if (ifp != NULL && detaching == 0 && (ifma->ifma_flags & IFMA_F_ENQUEUED)) { CK_STAILQ_REMOVE(&ifp->if_multiaddrs, ifma, ifmultiaddr, ifma_link); ifma->ifma_flags &= ~IFMA_F_ENQUEUED; } /* * If this ifma is a network-layer ifma, a link-layer ifma may * have been associated with it. Release it first if so. */ ll_ifma = ifma->ifma_llifma; if (ll_ifma != NULL) { KASSERT(ifma->ifma_lladdr != NULL, ("%s: llifma w/o lladdr", __func__)); if (detaching) ll_ifma->ifma_ifp = NULL; /* XXX */ if (--ll_ifma->ifma_refcount == 0) { if (ifp != NULL) { if (ll_ifma->ifma_flags & IFMA_F_ENQUEUED) { CK_STAILQ_REMOVE(&ifp->if_multiaddrs, ll_ifma, ifmultiaddr, ifma_link); ll_ifma->ifma_flags &= ~IFMA_F_ENQUEUED; } } if_freemulti(ll_ifma); } } #ifdef INVARIANTS if (ifp) { struct ifmultiaddr *ifmatmp; CK_STAILQ_FOREACH(ifmatmp, &ifp->if_multiaddrs, ifma_link) MPASS(ifma != ifmatmp); } #endif if_freemulti(ifma); /* * The last reference to this instance of struct ifmultiaddr * was released; the hardware should be notified of this change. */ return 1; } /* * Set the link layer address on an interface. * * At this time we only support certain types of interfaces, * and we don't allow the length of the address to change. * * Set noinline to be dtrace-friendly */ __noinline int if_setlladdr(struct ifnet *ifp, const u_char *lladdr, int len) { struct sockaddr_dl *sdl; struct ifaddr *ifa; struct ifreq ifr; ifa = ifp->if_addr; if (ifa == NULL) return (EINVAL); sdl = (struct sockaddr_dl *)ifa->ifa_addr; if (sdl == NULL) return (EINVAL); if (len != sdl->sdl_alen) /* don't allow length to change */ return (EINVAL); switch (ifp->if_type) { case IFT_ETHER: case IFT_XETHER: case IFT_L2VLAN: case IFT_BRIDGE: case IFT_IEEE8023ADLAG: bcopy(lladdr, LLADDR(sdl), len); break; default: return (ENODEV); } /* * If the interface is already up, we need * to re-init it in order to reprogram its * address filter. */ if ((ifp->if_flags & IFF_UP) != 0) { if (ifp->if_ioctl) { ifp->if_flags &= ~IFF_UP; ifr.ifr_flags = ifp->if_flags & 0xffff; ifr.ifr_flagshigh = ifp->if_flags >> 16; (*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t)&ifr); ifp->if_flags |= IFF_UP; ifr.ifr_flags = ifp->if_flags & 0xffff; ifr.ifr_flagshigh = ifp->if_flags >> 16; (*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t)&ifr); } } EVENTHANDLER_INVOKE(iflladdr_event, ifp); return (0); } /* * Compat function for handling basic encapsulation requests. * Not converted stacks (FDDI, IB, ..) supports traditional * output model: ARP (and other similar L2 protocols) are handled * inside output routine, arpresolve/nd6_resolve() returns MAC * address instead of full prepend. * * This function creates calculated header==MAC for IPv4/IPv6 and * returns EAFNOSUPPORT (which is then handled in ARP code) for other * address families. */ static int if_requestencap_default(struct ifnet *ifp, struct if_encap_req *req) { if (req->rtype != IFENCAP_LL) return (EOPNOTSUPP); if (req->bufsize < req->lladdr_len) return (ENOMEM); switch (req->family) { case AF_INET: case AF_INET6: break; default: return (EAFNOSUPPORT); } /* Copy lladdr to storage as is */ memmove(req->buf, req->lladdr, req->lladdr_len); req->bufsize = req->lladdr_len; req->lladdr_off = 0; return (0); } /* * Tunnel interfaces can nest, also they may cause infinite recursion * calls when misconfigured. We'll prevent this by detecting loops. * High nesting level may cause stack exhaustion. We'll prevent this * by introducing upper limit. * * Return 0, if tunnel nesting count is equal or less than limit. */ int if_tunnel_check_nesting(struct ifnet *ifp, struct mbuf *m, uint32_t cookie, int limit) { struct m_tag *mtag; int count; count = 1; mtag = NULL; while ((mtag = m_tag_locate(m, cookie, 0, mtag)) != NULL) { if (*(struct ifnet **)(mtag + 1) == ifp) { log(LOG_NOTICE, "%s: loop detected\n", if_name(ifp)); return (EIO); } count++; } if (count > limit) { log(LOG_NOTICE, "%s: if_output recursively called too many times(%d)\n", if_name(ifp), count); return (EIO); } mtag = m_tag_alloc(cookie, 0, sizeof(struct ifnet *), M_NOWAIT); if (mtag == NULL) return (ENOMEM); *(struct ifnet **)(mtag + 1) = ifp; m_tag_prepend(m, mtag); return (0); } /* * Get the link layer address that was read from the hardware at attach. * * This is only set by Ethernet NICs (IFT_ETHER), but laggX interfaces re-type * their component interfaces as IFT_IEEE8023ADLAG. */ int if_gethwaddr(struct ifnet *ifp, struct ifreq *ifr) { if (ifp->if_hw_addr == NULL) return (ENODEV); switch (ifp->if_type) { case IFT_ETHER: case IFT_IEEE8023ADLAG: bcopy(ifp->if_hw_addr, ifr->ifr_addr.sa_data, ifp->if_addrlen); return (0); default: return (ENODEV); } } /* * The name argument must be a pointer to storage which will last as * long as the interface does. For physical devices, the result of * device_get_name(dev) is a good choice and for pseudo-devices a * static string works well. */ void if_initname(struct ifnet *ifp, const char *name, int unit) { ifp->if_dname = name; ifp->if_dunit = unit; if (unit != IF_DUNIT_NONE) snprintf(ifp->if_xname, IFNAMSIZ, "%s%d", name, unit); else strlcpy(ifp->if_xname, name, IFNAMSIZ); } static int if_vlog(struct ifnet *ifp, int pri, const char *fmt, va_list ap) { char if_fmt[256]; snprintf(if_fmt, sizeof(if_fmt), "%s: %s", ifp->if_xname, fmt); vlog(pri, if_fmt, ap); return (0); } int if_printf(struct ifnet *ifp, const char *fmt, ...) { va_list ap; va_start(ap, fmt); if_vlog(ifp, LOG_INFO, fmt, ap); va_end(ap); return (0); } int if_log(struct ifnet *ifp, int pri, const char *fmt, ...) { va_list ap; va_start(ap, fmt); if_vlog(ifp, pri, fmt, ap); va_end(ap); return (0); } void if_start(struct ifnet *ifp) { (*(ifp)->if_start)(ifp); } /* * Backwards compatibility interface for drivers * that have not implemented it */ static int if_transmit_default(struct ifnet *ifp, struct mbuf *m) { int error; IFQ_HANDOFF(ifp, m, error); return (error); } static void if_input_default(struct ifnet *ifp __unused, struct mbuf *m) { m_freem(m); } int if_handoff(struct ifqueue *ifq, struct mbuf *m, struct ifnet *ifp, int adjust) { int active = 0; IF_LOCK(ifq); if (_IF_QFULL(ifq)) { IF_UNLOCK(ifq); if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1); m_freem(m); return (0); } if (ifp != NULL) { if_inc_counter(ifp, IFCOUNTER_OBYTES, m->m_pkthdr.len + adjust); if (m->m_flags & (M_BCAST|M_MCAST)) if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1); active = ifp->if_drv_flags & IFF_DRV_OACTIVE; } _IF_ENQUEUE(ifq, m); IF_UNLOCK(ifq); if (ifp != NULL && !active) (*(ifp)->if_start)(ifp); return (1); } void if_register_com_alloc(u_char type, if_com_alloc_t *a, if_com_free_t *f) { KASSERT(if_com_alloc[type] == NULL, ("if_register_com_alloc: %d already registered", type)); KASSERT(if_com_free[type] == NULL, ("if_register_com_alloc: %d free already registered", type)); if_com_alloc[type] = a; if_com_free[type] = f; } void if_deregister_com_alloc(u_char type) { KASSERT(if_com_alloc[type] != NULL, ("if_deregister_com_alloc: %d not registered", type)); KASSERT(if_com_free[type] != NULL, ("if_deregister_com_alloc: %d free not registered", type)); /* * Ensure all pending EPOCH(9) callbacks have been executed. This * fixes issues about late invocation of if_destroy(), which leads * to memory leak from if_com_alloc[type] allocated if_l2com. */ NET_EPOCH_DRAIN_CALLBACKS(); if_com_alloc[type] = NULL; if_com_free[type] = NULL; } /* API for driver access to network stack owned ifnet.*/ uint64_t if_setbaudrate(struct ifnet *ifp, uint64_t baudrate) { uint64_t oldbrate; oldbrate = ifp->if_baudrate; ifp->if_baudrate = baudrate; return (oldbrate); } uint64_t if_getbaudrate(const if_t ifp) { return (ifp->if_baudrate); } int if_setcapabilities(if_t ifp, int capabilities) { ifp->if_capabilities = capabilities; return (0); } int if_setcapabilitiesbit(if_t ifp, int setbit, int clearbit) { ifp->if_capabilities &= ~clearbit; ifp->if_capabilities |= setbit; return (0); } int if_getcapabilities(const if_t ifp) { return (ifp->if_capabilities); } int if_setcapenable(if_t ifp, int capabilities) { ifp->if_capenable = capabilities; return (0); } int if_setcapenablebit(if_t ifp, int setcap, int clearcap) { ifp->if_capenable &= ~clearcap; ifp->if_capenable |= setcap; return (0); } int if_setcapabilities2(if_t ifp, int capabilities) { ifp->if_capabilities2 = capabilities; return (0); } int if_setcapabilities2bit(if_t ifp, int setbit, int clearbit) { ifp->if_capabilities2 &= ~clearbit; ifp->if_capabilities2 |= setbit; return (0); } int if_getcapabilities2(const if_t ifp) { return (ifp->if_capabilities2); } int if_setcapenable2(if_t ifp, int capabilities2) { ifp->if_capenable2 = capabilities2; return (0); } int if_setcapenable2bit(if_t ifp, int setcap, int clearcap) { ifp->if_capenable2 &= ~clearcap; ifp->if_capenable2 |= setcap; return (0); } const char * if_getdname(const if_t ifp) { return (ifp->if_dname); } void if_setdname(if_t ifp, const char *dname) { ifp->if_dname = dname; } const char * if_name(if_t ifp) { return (ifp->if_xname); } int if_setname(if_t ifp, const char *name) { if (strlen(name) > sizeof(ifp->if_xname) - 1) return (ENAMETOOLONG); strcpy(ifp->if_xname, name); return (0); } int if_togglecapenable(if_t ifp, int togglecap) { ifp->if_capenable ^= togglecap; return (0); } int if_getcapenable(const if_t ifp) { return (ifp->if_capenable); } int if_togglecapenable2(if_t ifp, int togglecap) { ifp->if_capenable2 ^= togglecap; return (0); } int if_getcapenable2(const if_t ifp) { return (ifp->if_capenable2); } int if_getdunit(const if_t ifp) { return (ifp->if_dunit); } int if_getindex(const if_t ifp) { return (ifp->if_index); } int if_getidxgen(const if_t ifp) { return (ifp->if_idxgen); } +const char * +if_getdescr(if_t ifp) +{ + return (ifp->if_description); +} + void if_setdescr(if_t ifp, char *descrbuf) { sx_xlock(&ifdescr_sx); char *odescrbuf = ifp->if_description; ifp->if_description = descrbuf; sx_xunlock(&ifdescr_sx); if_freedescr(odescrbuf); } char * if_allocdescr(size_t sz, int malloc_flag) { malloc_flag &= (M_WAITOK | M_NOWAIT); return (malloc(sz, M_IFDESCR, M_ZERO | malloc_flag)); } void if_freedescr(char *descrbuf) { free(descrbuf, M_IFDESCR); } int if_getalloctype(const if_t ifp) { return (ifp->if_alloctype); } +void +if_setlastchange(if_t ifp) +{ + getmicrotime(&ifp->if_lastchange); +} + /* * This is largely undesirable because it ties ifnet to a device, but does * provide flexiblity for an embedded product vendor. Should be used with * the understanding that it violates the interface boundaries, and should be * a last resort only. */ int if_setdev(if_t ifp, void *dev) { return (0); } int if_setdrvflagbits(if_t ifp, int set_flags, int clear_flags) { ifp->if_drv_flags &= ~clear_flags; ifp->if_drv_flags |= set_flags; return (0); } int if_getdrvflags(const if_t ifp) { return (ifp->if_drv_flags); } int if_setdrvflags(if_t ifp, int flags) { ifp->if_drv_flags = flags; return (0); } int if_setflags(if_t ifp, int flags) { ifp->if_flags = flags; return (0); } int if_setflagbits(if_t ifp, int set, int clear) { ifp->if_flags &= ~clear; ifp->if_flags |= set; return (0); } int if_getflags(const if_t ifp) { return (ifp->if_flags); } int if_clearhwassist(if_t ifp) { ifp->if_hwassist = 0; return (0); } int if_sethwassistbits(if_t ifp, int toset, int toclear) { ifp->if_hwassist &= ~toclear; ifp->if_hwassist |= toset; return (0); } int if_sethwassist(if_t ifp, int hwassist_bit) { ifp->if_hwassist = hwassist_bit; return (0); } int if_gethwassist(const if_t ifp) { return (ifp->if_hwassist); } int if_togglehwassist(if_t ifp, int toggle_bits) { ifp->if_hwassist ^= toggle_bits; return (0); } int if_setmtu(if_t ifp, int mtu) { ifp->if_mtu = mtu; return (0); } void if_notifymtu(if_t ifp) { #ifdef INET6 nd6_setmtu(ifp); #endif rt_updatemtu(ifp); } int if_getmtu(const if_t ifp) { return (ifp->if_mtu); } int if_getmtu_family(const if_t ifp, int family) { struct domain *dp; SLIST_FOREACH(dp, &domains, dom_next) { if (dp->dom_family == family && dp->dom_ifmtu != NULL) return (dp->dom_ifmtu(ifp)); } return (ifp->if_mtu); } /* * Methods for drivers to access interface unicast and multicast * link level addresses. Driver shall not know 'struct ifaddr' neither * 'struct ifmultiaddr'. */ u_int if_lladdr_count(if_t ifp) { struct epoch_tracker et; struct ifaddr *ifa; u_int count; count = 0; NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) if (ifa->ifa_addr->sa_family == AF_LINK) count++; NET_EPOCH_EXIT(et); return (count); } int if_foreach(if_foreach_cb_t cb, void *cb_arg) { if_t ifp; int error; NET_EPOCH_ASSERT(); MPASS(cb); error = 0; CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { error = cb(ifp, cb_arg); if (error != 0) break; } return (error); } /* * Iterates over the list of interfaces, permitting callback function @cb to sleep. * Stops iteration if @cb returns non-zero error code. * Returns the last error code from @cb. * @match_cb: optional match callback limiting the iteration to only matched interfaces * @match_arg: argument to pass to @match_cb * @cb: iteration callback * @cb_arg: argument to pass to @cb */ int if_foreach_sleep(if_foreach_match_t match_cb, void *match_arg, if_foreach_cb_t cb, void *cb_arg) { int match_count = 0, array_size = 16; /* 128 bytes for malloc */ struct ifnet **match_array = NULL; int error = 0; MPASS(cb); while (true) { struct ifnet **new_array; int new_size = array_size; struct epoch_tracker et; struct ifnet *ifp; while (new_size < match_count) new_size *= 2; new_array = malloc(new_size * sizeof(void *), M_TEMP, M_WAITOK); if (match_array != NULL) memcpy(new_array, match_array, array_size * sizeof(void *)); free(match_array, M_TEMP); match_array = new_array; array_size = new_size; match_count = 0; NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (match_cb != NULL && !match_cb(ifp, match_arg)) continue; if (match_count < array_size) { if (if_try_ref(ifp)) match_array[match_count++] = ifp; } else match_count++; } NET_EPOCH_EXIT(et); if (match_count > array_size) { for (int i = 0; i < array_size; i++) if_rele(match_array[i]); continue; } else { for (int i = 0; i < match_count; i++) { if (error == 0) error = cb(match_array[i], cb_arg); if_rele(match_array[i]); } free(match_array, M_TEMP); break; } } return (error); } /* * Uses just 1 pointer of the 4 available in the public struct. */ if_t if_iter_start(struct if_iter *iter) { if_t ifp; NET_EPOCH_ASSERT(); bzero(iter, sizeof(*iter)); ifp = CK_STAILQ_FIRST(&V_ifnet); if (ifp != NULL) iter->context[0] = CK_STAILQ_NEXT(ifp, if_link); else iter->context[0] = NULL; return (ifp); } if_t if_iter_next(struct if_iter *iter) { if_t cur_ifp = iter->context[0]; if (cur_ifp != NULL) iter->context[0] = CK_STAILQ_NEXT(cur_ifp, if_link); return (cur_ifp); } void if_iter_finish(struct if_iter *iter) { /* Nothing to do here for now. */ } u_int if_foreach_lladdr(if_t ifp, iflladdr_cb_t cb, void *cb_arg) { struct epoch_tracker et; struct ifaddr *ifa; u_int count; MPASS(cb); count = 0; NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_LINK) continue; count += (*cb)(cb_arg, (struct sockaddr_dl *)ifa->ifa_addr, count); } NET_EPOCH_EXIT(et); return (count); } u_int if_llmaddr_count(if_t ifp) { struct epoch_tracker et; struct ifmultiaddr *ifma; int count; count = 0; NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) if (ifma->ifma_addr->sa_family == AF_LINK) count++; NET_EPOCH_EXIT(et); return (count); } bool if_maddr_empty(if_t ifp) { return (CK_STAILQ_EMPTY(&ifp->if_multiaddrs)); } u_int if_foreach_llmaddr(if_t ifp, iflladdr_cb_t cb, void *cb_arg) { struct epoch_tracker et; struct ifmultiaddr *ifma; u_int count; MPASS(cb); count = 0; NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_LINK) continue; count += (*cb)(cb_arg, (struct sockaddr_dl *)ifma->ifma_addr, count); } NET_EPOCH_EXIT(et); return (count); } u_int if_foreach_addr_type(if_t ifp, int type, if_addr_cb_t cb, void *cb_arg) { struct epoch_tracker et; struct ifaddr *ifa; u_int count; MPASS(cb); count = 0; NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != type) continue; count += (*cb)(cb_arg, ifa, count); } NET_EPOCH_EXIT(et); return (count); } +struct ifaddr * +ifa_iter_start(if_t ifp, struct ifa_iter *iter) +{ + struct ifaddr *ifa; + + NET_EPOCH_ASSERT(); + + bzero(iter, sizeof(*iter)); + ifa = CK_STAILQ_FIRST(&ifp->if_addrhead); + if (ifa != NULL) + iter->context[0] = CK_STAILQ_NEXT(ifa, ifa_link); + else + iter->context[0] = NULL; + return (ifa); +} + +struct ifaddr * +ifa_iter_next(struct ifa_iter *iter) +{ + struct ifaddr *ifa = iter->context[0]; + + if (ifa != NULL) + iter->context[0] = CK_STAILQ_NEXT(ifa, ifa_link); + return (ifa); +} + +void +ifa_iter_finish(struct ifa_iter *iter) +{ + /* Nothing to do here for now. */ +} + int if_setsoftc(if_t ifp, void *softc) { ifp->if_softc = softc; return (0); } void * if_getsoftc(const if_t ifp) { return (ifp->if_softc); } void if_setrcvif(struct mbuf *m, if_t ifp) { MPASS((m->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0); m->m_pkthdr.rcvif = (struct ifnet *)ifp; } void if_setvtag(struct mbuf *m, uint16_t tag) { m->m_pkthdr.ether_vtag = tag; } uint16_t if_getvtag(struct mbuf *m) { return (m->m_pkthdr.ether_vtag); } int if_sendq_empty(if_t ifp) { return (IFQ_DRV_IS_EMPTY(&ifp->if_snd)); } struct ifaddr * if_getifaddr(const if_t ifp) { return (ifp->if_addr); } int if_getamcount(const if_t ifp) { return (ifp->if_amcount); } int if_setsendqready(if_t ifp) { IFQ_SET_READY(&ifp->if_snd); return (0); } int if_setsendqlen(if_t ifp, int tx_desc_count) { IFQ_SET_MAXLEN(&ifp->if_snd, tx_desc_count); ifp->if_snd.ifq_drv_maxlen = tx_desc_count; return (0); } void if_setnetmapadapter(if_t ifp, struct netmap_adapter *na) { ifp->if_netmap = na; } struct netmap_adapter * if_getnetmapadapter(if_t ifp) { return (ifp->if_netmap); } int if_vlantrunkinuse(if_t ifp) { return (ifp->if_vlantrunk != NULL); } void if_init(if_t ifp, void *ctx) { (*ifp->if_init)(ctx); } void if_input(if_t ifp, struct mbuf* sendmp) { (*ifp->if_input)(ifp, sendmp); } int if_transmit(if_t ifp, struct mbuf *m) { return ((*ifp->if_transmit)(ifp, m)); } int if_resolvemulti(if_t ifp, struct sockaddr **srcs, struct sockaddr *dst) { if (ifp->if_resolvemulti == NULL) return (EOPNOTSUPP); return (ifp->if_resolvemulti(ifp, srcs, dst)); } +int +if_ioctl(if_t ifp, u_long cmd, void *data) +{ + return (ifp->if_ioctl(ifp, cmd, data)); +} + struct mbuf * if_dequeue(if_t ifp) { struct mbuf *m; IFQ_DRV_DEQUEUE(&ifp->if_snd, m); return (m); } int if_sendq_prepend(if_t ifp, struct mbuf *m) { IFQ_DRV_PREPEND(&ifp->if_snd, m); return (0); } int if_setifheaderlen(if_t ifp, int len) { ifp->if_hdrlen = len; return (0); } caddr_t if_getlladdr(const if_t ifp) { return (IF_LLADDR(ifp)); } void * if_gethandle(u_char type) { return (if_alloc(type)); } void if_bpfmtap(if_t ifp, struct mbuf *m) { BPF_MTAP(ifp, m); } void if_etherbpfmtap(if_t ifp, struct mbuf *m) { ETHER_BPF_MTAP(ifp, m); } void if_vlancap(if_t ifp) { VLAN_CAPABILITIES(ifp); } int if_sethwtsomax(if_t ifp, u_int if_hw_tsomax) { ifp->if_hw_tsomax = if_hw_tsomax; return (0); } int if_sethwtsomaxsegcount(if_t ifp, u_int if_hw_tsomaxsegcount) { ifp->if_hw_tsomaxsegcount = if_hw_tsomaxsegcount; return (0); } int if_sethwtsomaxsegsize(if_t ifp, u_int if_hw_tsomaxsegsize) { ifp->if_hw_tsomaxsegsize = if_hw_tsomaxsegsize; return (0); } u_int if_gethwtsomax(const if_t ifp) { return (ifp->if_hw_tsomax); } u_int if_gethwtsomaxsegcount(const if_t ifp) { return (ifp->if_hw_tsomaxsegcount); } u_int if_gethwtsomaxsegsize(const if_t ifp) { return (ifp->if_hw_tsomaxsegsize); } void if_setinitfn(if_t ifp, if_init_fn_t init_fn) { ifp->if_init = init_fn; } void if_setinputfn(if_t ifp, if_input_fn_t input_fn) { ifp->if_input = input_fn; } if_input_fn_t if_getinputfn(if_t ifp) { return (ifp->if_input); } void if_setioctlfn(if_t ifp, if_ioctl_fn_t ioctl_fn) { ifp->if_ioctl = ioctl_fn; } void if_setoutputfn(if_t ifp, if_output_fn_t output_fn) { ifp->if_output = output_fn; } void if_setstartfn(if_t ifp, if_start_fn_t start_fn) { ifp->if_start = start_fn; } if_start_fn_t if_getstartfn(if_t ifp) { return (ifp->if_start); } void if_settransmitfn(if_t ifp, if_transmit_fn_t start_fn) { ifp->if_transmit = start_fn; } if_transmit_fn_t if_gettransmitfn(if_t ifp) { return (ifp->if_transmit); } void if_setqflushfn(if_t ifp, if_qflush_fn_t flush_fn) { ifp->if_qflush = flush_fn; } void if_setsndtagallocfn(if_t ifp, if_snd_tag_alloc_t alloc_fn) { ifp->if_snd_tag_alloc = alloc_fn; } int if_snd_tag_alloc(if_t ifp, union if_snd_tag_alloc_params *params, struct m_snd_tag **mstp) { if (ifp->if_snd_tag_alloc == NULL) return (EOPNOTSUPP); return (ifp->if_snd_tag_alloc(ifp, params, mstp)); } void if_setgetcounterfn(if_t ifp, if_get_counter_t fn) { ifp->if_get_counter = fn; } void if_setreassignfn(if_t ifp, if_reassign_fn_t fn) { ifp->if_reassign = fn; } void if_setratelimitqueryfn(if_t ifp, if_ratelimit_query_t fn) { ifp->if_ratelimit_query = fn; } void if_setdebugnet_methods(if_t ifp, struct debugnet_methods *m) { ifp->if_debugnet_methods = m; } struct label * if_getmaclabel(if_t ifp) { return (ifp->if_label); } void if_setmaclabel(if_t ifp, struct label *label) { ifp->if_label = label; } int if_gettype(if_t ifp) { return (ifp->if_type); } void * if_getllsoftc(if_t ifp) { return (ifp->if_llsoftc); } void if_setllsoftc(if_t ifp, void *llsoftc) { ifp->if_llsoftc = llsoftc; }; int if_getlinkstate(if_t ifp) { return (ifp->if_link_state); } const uint8_t * if_getbroadcastaddr(if_t ifp) { return (ifp->if_broadcastaddr); } void if_setbroadcastaddr(if_t ifp, const uint8_t *addr) { ifp->if_broadcastaddr = addr; } int if_getnumadomain(if_t ifp) { return (ifp->if_numa_domain); } uint64_t if_getcounter(if_t ifp, ift_counter counter) { return (ifp->if_get_counter(ifp, counter)); } bool if_altq_is_enabled(if_t ifp) { return (ALTQ_IS_ENABLED(&ifp->if_snd)); } struct vnet * if_getvnet(if_t ifp) { return (ifp->if_vnet); } void * if_getafdata(if_t ifp, int af) { return (ifp->if_afdata[af]); } u_int if_getfib(if_t ifp) { return (ifp->if_fib); } uint8_t if_getaddrlen(if_t ifp) { return (ifp->if_addrlen); } struct bpf_if * if_getbpf(if_t ifp) { return (ifp->if_bpf); } struct ifvlantrunk * if_getvlantrunk(if_t ifp) { return (ifp->if_vlantrunk); } uint8_t if_getpcp(if_t ifp) { return (ifp->if_pcp); } void * if_getl2com(if_t ifp) { return (ifp->if_l2com); } #ifdef DDB static void if_show_ifnet(struct ifnet *ifp) { if (ifp == NULL) return; db_printf("%s:\n", ifp->if_xname); #define IF_DB_PRINTF(f, e) db_printf(" %s = " f "\n", #e, ifp->e); IF_DB_PRINTF("%s", if_dname); IF_DB_PRINTF("%d", if_dunit); IF_DB_PRINTF("%s", if_description); IF_DB_PRINTF("%u", if_index); IF_DB_PRINTF("%d", if_idxgen); IF_DB_PRINTF("%u", if_refcount); IF_DB_PRINTF("%p", if_softc); IF_DB_PRINTF("%p", if_l2com); IF_DB_PRINTF("%p", if_llsoftc); IF_DB_PRINTF("%d", if_amcount); IF_DB_PRINTF("%p", if_addr); IF_DB_PRINTF("%p", if_broadcastaddr); IF_DB_PRINTF("%p", if_afdata); IF_DB_PRINTF("%d", if_afdata_initialized); IF_DB_PRINTF("%u", if_fib); IF_DB_PRINTF("%p", if_vnet); IF_DB_PRINTF("%p", if_home_vnet); IF_DB_PRINTF("%p", if_vlantrunk); IF_DB_PRINTF("%p", if_bpf); IF_DB_PRINTF("%u", if_pcount); IF_DB_PRINTF("%p", if_bridge); IF_DB_PRINTF("%p", if_lagg); IF_DB_PRINTF("%p", if_pf_kif); IF_DB_PRINTF("%p", if_carp); IF_DB_PRINTF("%p", if_label); IF_DB_PRINTF("%p", if_netmap); IF_DB_PRINTF("0x%08x", if_flags); IF_DB_PRINTF("0x%08x", if_drv_flags); IF_DB_PRINTF("0x%08x", if_capabilities); IF_DB_PRINTF("0x%08x", if_capenable); IF_DB_PRINTF("%p", if_snd.ifq_head); IF_DB_PRINTF("%p", if_snd.ifq_tail); IF_DB_PRINTF("%d", if_snd.ifq_len); IF_DB_PRINTF("%d", if_snd.ifq_maxlen); IF_DB_PRINTF("%p", if_snd.ifq_drv_head); IF_DB_PRINTF("%p", if_snd.ifq_drv_tail); IF_DB_PRINTF("%d", if_snd.ifq_drv_len); IF_DB_PRINTF("%d", if_snd.ifq_drv_maxlen); IF_DB_PRINTF("%d", if_snd.altq_type); IF_DB_PRINTF("%x", if_snd.altq_flags); #undef IF_DB_PRINTF } DB_SHOW_COMMAND(ifnet, db_show_ifnet) { if (!have_addr) { db_printf("usage: show ifnet \n"); return; } if_show_ifnet((struct ifnet *)addr); } DB_SHOW_ALL_COMMAND(ifnets, db_show_all_ifnets) { struct ifnet *ifp; u_short idx; for (idx = 1; idx <= if_index; idx++) { ifp = ifindex_table[idx].ife_ifnet; if (ifp == NULL) continue; db_printf( "%20s ifp=%p\n", ifp->if_xname, ifp); if (db_pager_quit) break; } } #endif /* DDB */ diff --git a/sys/net/if_var.h b/sys/net/if_var.h index 13a82d772c37..89d1c0c039c0 100644 --- a/sys/net/if_var.h +++ b/sys/net/if_var.h @@ -1,721 +1,732 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * From: @(#)if.h 8.1 (Berkeley) 6/10/93 * $FreeBSD$ */ #ifndef _NET_IF_VAR_H_ #define _NET_IF_VAR_H_ /* * Structures defining a network interface, providing a packet * transport mechanism (ala level 0 of the PUP protocols). * * Each interface accepts output datagrams of a specified maximum * length, and provides higher level routines with input datagrams * received from its medium. * * Output occurs when the routine if_output is called, with three parameters: * (*ifp->if_output)(ifp, m, dst, ro) * Here m is the mbuf chain to be sent and dst is the destination address. * The output routine encapsulates the supplied datagram if necessary, * and then transmits it on its medium. * * On input, each interface unwraps the data received by it, and either * places it on the input queue of an internetwork datagram routine * and posts the associated software interrupt, or passes the datagram to a raw * packet input routine. * * Routines exist for locating interfaces by their addresses * or for locating an interface on a certain network, as well as more general * routing and gateway routines maintaining information used to locate * interfaces. These routines live in the files if.c and route.c */ struct rtentry; /* ifa_rtrequest */ struct socket; struct carp_if; struct carp_softc; struct ifvlantrunk; struct route; /* if_output */ struct vnet; struct ifmedia; struct netmap_adapter; struct debugnet_methods; #ifdef _KERNEL #include #include /* ifqueue only? */ #include #include #endif /* _KERNEL */ #include #include #include #include /* XXX */ #include /* struct ifqueue */ #include /* XXX */ #include /* XXX */ #include /* if_link_task */ #define IF_DUNIT_NONE -1 #include CK_STAILQ_HEAD(ifnethead, ifnet); /* we use TAILQs so that the order of */ CK_STAILQ_HEAD(ifaddrhead, ifaddr); /* instantiation is preserved in the list */ CK_STAILQ_HEAD(ifmultihead, ifmultiaddr); CK_STAILQ_HEAD(ifgrouphead, ifg_group); #ifdef _KERNEL VNET_DECLARE(struct pfil_head *, link_pfil_head); #define V_link_pfil_head VNET(link_pfil_head) #define PFIL_ETHER_NAME "ethernet" #define HHOOK_IPSEC_INET 0 #define HHOOK_IPSEC_INET6 1 #define HHOOK_IPSEC_COUNT 2 VNET_DECLARE(struct hhook_head *, ipsec_hhh_in[HHOOK_IPSEC_COUNT]); VNET_DECLARE(struct hhook_head *, ipsec_hhh_out[HHOOK_IPSEC_COUNT]); #define V_ipsec_hhh_in VNET(ipsec_hhh_in) #define V_ipsec_hhh_out VNET(ipsec_hhh_out) #endif /* _KERNEL */ typedef enum { IFCOUNTER_IPACKETS = 0, IFCOUNTER_IERRORS, IFCOUNTER_OPACKETS, IFCOUNTER_OERRORS, IFCOUNTER_COLLISIONS, IFCOUNTER_IBYTES, IFCOUNTER_OBYTES, IFCOUNTER_IMCASTS, IFCOUNTER_OMCASTS, IFCOUNTER_IQDROPS, IFCOUNTER_OQDROPS, IFCOUNTER_NOPROTO, IFCOUNTERS /* Array size. */ } ift_counter; typedef void (*if_start_fn_t)(if_t); typedef int (*if_ioctl_fn_t)(if_t, u_long, caddr_t); typedef void (*if_init_fn_t)(void *); typedef void (*if_input_fn_t)(if_t, struct mbuf *); typedef int (*if_output_fn_t)(if_t, struct mbuf *, const struct sockaddr *, struct route *); typedef void (*if_qflush_fn_t)(if_t); typedef int (*if_transmit_fn_t)(if_t, struct mbuf *); typedef uint64_t (*if_get_counter_t)(if_t, ift_counter); typedef void (*if_reassign_fn_t)(if_t, struct vnet *, char *); struct ifnet_hw_tsomax { u_int tsomaxbytes; /* TSO total burst length limit in bytes */ u_int tsomaxsegcount; /* TSO maximum segment count */ u_int tsomaxsegsize; /* TSO maximum segment size in bytes */ }; /* Interface encap request types */ typedef enum { IFENCAP_LL = 1 /* pre-calculate link-layer header */ } ife_type; /* * The structure below allows to request various pre-calculated L2/L3 headers * for different media. Requests varies by type (rtype field). * * IFENCAP_LL type: pre-calculates link header based on address family * and destination lladdr. * * Input data fields: * buf: pointer to destination buffer * bufsize: buffer size * flags: IFENCAP_FLAG_BROADCAST if destination is broadcast * family: address family defined by AF_ constant. * lladdr: pointer to link-layer address * lladdr_len: length of link-layer address * hdata: pointer to L3 header (optional, used for ARP requests). * Output data fields: * buf: encap data is stored here * bufsize: resulting encap length is stored here * lladdr_off: offset of link-layer address from encap hdr start * hdata: L3 header may be altered if necessary */ struct if_encap_req { u_char *buf; /* Destination buffer (w) */ size_t bufsize; /* size of provided buffer (r) */ ife_type rtype; /* request type (r) */ uint32_t flags; /* Request flags (r) */ int family; /* Address family AF_* (r) */ int lladdr_off; /* offset from header start (w) */ int lladdr_len; /* lladdr length (r) */ char *lladdr; /* link-level address pointer (r) */ char *hdata; /* Upper layer header data (rw) */ }; #define IFENCAP_FLAG_BROADCAST 0x02 /* Destination is broadcast */ /* * Network interface send tag support. The storage of "struct * m_snd_tag" comes from the network driver and it is free to allocate * as much additional space as it wants for its own use. */ struct ktls_session; struct m_snd_tag; #define IF_SND_TAG_TYPE_RATE_LIMIT 0 #define IF_SND_TAG_TYPE_UNLIMITED 1 #define IF_SND_TAG_TYPE_TLS 2 #define IF_SND_TAG_TYPE_TLS_RATE_LIMIT 3 #define IF_SND_TAG_TYPE_TLS_RX 4 #define IF_SND_TAG_TYPE_MAX 5 struct if_snd_tag_alloc_header { uint32_t type; /* send tag type, see IF_SND_TAG_XXX */ uint32_t flowid; /* mbuf hash value */ uint32_t flowtype; /* mbuf hash type */ uint8_t numa_domain; /* numa domain of associated inp */ }; struct if_snd_tag_alloc_rate_limit { struct if_snd_tag_alloc_header hdr; uint64_t max_rate; /* in bytes/s */ uint32_t flags; /* M_NOWAIT or M_WAITOK */ uint32_t reserved; /* alignment */ }; struct if_snd_tag_alloc_tls { struct if_snd_tag_alloc_header hdr; struct inpcb *inp; const struct ktls_session *tls; }; struct if_snd_tag_alloc_tls_rx { struct if_snd_tag_alloc_header hdr; struct inpcb *inp; const struct ktls_session *tls; uint16_t vlan_id; /* valid if non-zero */ }; struct if_snd_tag_alloc_tls_rate_limit { struct if_snd_tag_alloc_header hdr; struct inpcb *inp; const struct ktls_session *tls; uint64_t max_rate; /* in bytes/s */ }; struct if_snd_tag_rate_limit_params { uint64_t max_rate; /* in bytes/s */ uint32_t queue_level; /* 0 (empty) .. 65535 (full) */ #define IF_SND_QUEUE_LEVEL_MIN 0 #define IF_SND_QUEUE_LEVEL_MAX 65535 uint32_t flags; /* M_NOWAIT or M_WAITOK */ }; struct if_snd_tag_modify_tls_rx { /* TCP sequence number of TLS header in host endian format */ uint32_t tls_hdr_tcp_sn; /* * TLS record length, including all headers, data and trailers. * If the tls_rec_length is zero, it means HW encryption resumed. */ uint32_t tls_rec_length; /* TLS sequence number in host endian format */ uint64_t tls_seq_number; }; union if_snd_tag_alloc_params { struct if_snd_tag_alloc_header hdr; struct if_snd_tag_alloc_rate_limit rate_limit; struct if_snd_tag_alloc_rate_limit unlimited; struct if_snd_tag_alloc_tls tls; struct if_snd_tag_alloc_tls_rx tls_rx; struct if_snd_tag_alloc_tls_rate_limit tls_rate_limit; }; union if_snd_tag_modify_params { struct if_snd_tag_rate_limit_params rate_limit; struct if_snd_tag_rate_limit_params unlimited; struct if_snd_tag_rate_limit_params tls_rate_limit; struct if_snd_tag_modify_tls_rx tls_rx; }; union if_snd_tag_query_params { struct if_snd_tag_rate_limit_params rate_limit; struct if_snd_tag_rate_limit_params unlimited; struct if_snd_tag_rate_limit_params tls_rate_limit; }; typedef int (if_snd_tag_alloc_t)(if_t, union if_snd_tag_alloc_params *, struct m_snd_tag **); typedef int (if_snd_tag_modify_t)(struct m_snd_tag *, union if_snd_tag_modify_params *); typedef int (if_snd_tag_query_t)(struct m_snd_tag *, union if_snd_tag_query_params *); typedef void (if_snd_tag_free_t)(struct m_snd_tag *); typedef struct m_snd_tag *(if_next_send_tag_t)(struct m_snd_tag *); struct if_snd_tag_sw { if_snd_tag_modify_t *snd_tag_modify; if_snd_tag_query_t *snd_tag_query; if_snd_tag_free_t *snd_tag_free; if_next_send_tag_t *next_snd_tag; u_int type; /* One of IF_SND_TAG_TYPE_*. */ }; /* Query return flags */ #define RT_NOSUPPORT 0x00000000 /* Not supported */ #define RT_IS_INDIRECT 0x00000001 /* * Interface like a lagg, select * the actual interface for * capabilities. */ #define RT_IS_SELECTABLE 0x00000002 /* * No rate table, you select * rates and the first * number_of_rates are created. */ #define RT_IS_FIXED_TABLE 0x00000004 /* A fixed table is attached */ #define RT_IS_UNUSABLE 0x00000008 /* It is not usable for this */ #define RT_IS_SETUP_REQ 0x00000010 /* The interface setup must be called before use */ struct if_ratelimit_query_results { const uint64_t *rate_table; /* Pointer to table if present */ uint32_t flags; /* Flags indicating results */ uint32_t max_flows; /* Max flows using, 0=unlimited */ uint32_t number_of_rates; /* How many unique rates can be created */ uint32_t min_segment_burst; /* The amount the adapter bursts at each send */ }; typedef void (if_ratelimit_query_t)(if_t, struct if_ratelimit_query_results *); typedef int (if_ratelimit_setup_t)(if_t, uint64_t, uint32_t); #define IF_NODOM 255 /* * Locks for address lists on the network interface. */ #define IF_ADDR_LOCK_INIT(if) mtx_init(&(if)->if_addr_lock, "if_addr_lock", NULL, MTX_DEF) #define IF_ADDR_LOCK_DESTROY(if) mtx_destroy(&(if)->if_addr_lock) #define IF_ADDR_WLOCK(if) mtx_lock(&(if)->if_addr_lock) #define IF_ADDR_WUNLOCK(if) mtx_unlock(&(if)->if_addr_lock) #define IF_ADDR_LOCK_ASSERT(if) MPASS(in_epoch(net_epoch_preempt) || mtx_owned(&(if)->if_addr_lock)) #define IF_ADDR_WLOCK_ASSERT(if) mtx_assert(&(if)->if_addr_lock, MA_OWNED) #ifdef _KERNEL /* interface link layer address change event */ typedef void (*iflladdr_event_handler_t)(void *, if_t); EVENTHANDLER_DECLARE(iflladdr_event, iflladdr_event_handler_t); /* interface address change event */ typedef void (*ifaddr_event_handler_t)(void *, if_t); EVENTHANDLER_DECLARE(ifaddr_event, ifaddr_event_handler_t); typedef void (*ifaddr_event_ext_handler_t)(void *, if_t, struct ifaddr *, int); EVENTHANDLER_DECLARE(ifaddr_event_ext, ifaddr_event_ext_handler_t); #define IFADDR_EVENT_ADD 0 #define IFADDR_EVENT_DEL 1 /* new interface arrival event */ typedef void (*ifnet_arrival_event_handler_t)(void *, if_t); EVENTHANDLER_DECLARE(ifnet_arrival_event, ifnet_arrival_event_handler_t); /* interface departure event */ typedef void (*ifnet_departure_event_handler_t)(void *, if_t); EVENTHANDLER_DECLARE(ifnet_departure_event, ifnet_departure_event_handler_t); /* Interface link state change event */ typedef void (*ifnet_link_event_handler_t)(void *, if_t, int); EVENTHANDLER_DECLARE(ifnet_link_event, ifnet_link_event_handler_t); /* Interface up/down event */ #define IFNET_EVENT_UP 0 #define IFNET_EVENT_DOWN 1 #define IFNET_EVENT_PCP 2 /* priority code point, PCP */ #define IFNET_EVENT_UPDATE_BAUDRATE 3 typedef void (*ifnet_event_fn)(void *, if_t ifp, int event); EVENTHANDLER_DECLARE(ifnet_event, ifnet_event_fn); /* * interface groups */ struct ifg_group { char ifg_group[IFNAMSIZ]; u_int ifg_refcnt; void *ifg_pf_kif; CK_STAILQ_HEAD(, ifg_member) ifg_members; /* (CK_) */ CK_STAILQ_ENTRY(ifg_group) ifg_next; /* (CK_) */ }; struct ifg_member { CK_STAILQ_ENTRY(ifg_member) ifgm_next; /* (CK_) */ if_t ifgm_ifp; }; struct ifg_list { struct ifg_group *ifgl_group; CK_STAILQ_ENTRY(ifg_list) ifgl_next; /* (CK_) */ }; #ifdef _SYS_EVENTHANDLER_H_ /* group attach event */ typedef void (*group_attach_event_handler_t)(void *, struct ifg_group *); EVENTHANDLER_DECLARE(group_attach_event, group_attach_event_handler_t); /* group detach event */ typedef void (*group_detach_event_handler_t)(void *, struct ifg_group *); EVENTHANDLER_DECLARE(group_detach_event, group_detach_event_handler_t); /* group change event */ typedef void (*group_change_event_handler_t)(void *, const char *); EVENTHANDLER_DECLARE(group_change_event, group_change_event_handler_t); #endif /* _SYS_EVENTHANDLER_H_ */ /* * 72 was chosen below because it is the size of a TCP/IP * header (40) + the minimum mss (32). */ #define IF_MINMTU 72 #define IF_MAXMTU 65535 #define TOEDEV(ifp) if_getllsoftc(ifp) #define SETTOEDEV(ifp, sc) if_setllsoftc((ifp), (sc)) /* * The ifaddr structure contains information about one address * of an interface. They are maintained by the different address families, * are allocated and attached when an address is set, and are linked * together so all addresses for an interface can be located. * * NOTE: a 'struct ifaddr' is always at the beginning of a larger * chunk of malloc'ed memory, where we store the three addresses * (ifa_addr, ifa_dstaddr and ifa_netmask) referenced here. */ struct ifaddr { struct sockaddr *ifa_addr; /* address of interface */ struct sockaddr *ifa_dstaddr; /* other end of p-to-p link */ #define ifa_broadaddr ifa_dstaddr /* broadcast address interface */ struct sockaddr *ifa_netmask; /* used to determine subnet */ if_t ifa_ifp; /* back-pointer to interface */ struct carp_softc *ifa_carp; /* pointer to CARP data */ CK_STAILQ_ENTRY(ifaddr) ifa_link; /* queue macro glue */ u_short ifa_flags; /* mostly rt_flags for cloning */ #define IFA_ROUTE RTF_UP /* route installed */ #define IFA_RTSELF RTF_HOST /* loopback route to self installed */ u_int ifa_refcnt; /* references to this structure */ counter_u64_t ifa_ipackets; counter_u64_t ifa_opackets; counter_u64_t ifa_ibytes; counter_u64_t ifa_obytes; struct epoch_context ifa_epoch_ctx; }; struct ifaddr * ifa_alloc(size_t size, int flags); void ifa_free(struct ifaddr *ifa); void ifa_ref(struct ifaddr *ifa); int __result_use_check ifa_try_ref(struct ifaddr *ifa); /* * Multicast address structure. This is analogous to the ifaddr * structure except that it keeps track of multicast addresses. */ #define IFMA_F_ENQUEUED 0x1 struct ifmultiaddr { CK_STAILQ_ENTRY(ifmultiaddr) ifma_link; /* queue macro glue */ struct sockaddr *ifma_addr; /* address this membership is for */ struct sockaddr *ifma_lladdr; /* link-layer translation, if any */ if_t ifma_ifp; /* back-pointer to interface */ u_int ifma_refcount; /* reference count */ int ifma_flags; void *ifma_protospec; /* protocol-specific state, if any */ struct ifmultiaddr *ifma_llifma; /* pointer to ifma for ifma_lladdr */ struct epoch_context ifma_epoch_ctx; }; extern struct sx ifnet_sxlock; #define IFNET_WLOCK() sx_xlock(&ifnet_sxlock) #define IFNET_WUNLOCK() sx_xunlock(&ifnet_sxlock) #define IFNET_RLOCK_ASSERT() sx_assert(&ifnet_sxlock, SA_SLOCKED) #define IFNET_WLOCK_ASSERT() sx_assert(&ifnet_sxlock, SA_XLOCKED) #define IFNET_RLOCK() sx_slock(&ifnet_sxlock) #define IFNET_RUNLOCK() sx_sunlock(&ifnet_sxlock) /* * Look up an ifnet given its index. The returned value protected from * being freed by the network epoch. The _ref variant also acquires a * reference that must be freed using if_rele(). */ if_t ifnet_byindex(u_int); if_t ifnet_byindex_ref(u_int); /* * ifnet_byindexgen() looks up ifnet by index and generation count, * attempting to restore a weak pointer that had been stored across * the epoch. */ if_t ifnet_byindexgen(uint16_t idx, uint16_t gen); VNET_DECLARE(struct ifnethead, ifnet); VNET_DECLARE(struct ifgrouphead, ifg_head); VNET_DECLARE(if_t, loif); /* first loopback interface */ #define V_ifnet VNET(ifnet) #define V_ifg_head VNET(ifg_head) #define V_loif VNET(loif) #ifdef MCAST_VERBOSE #define MCDPRINTF printf #else #define MCDPRINTF(...) #endif int if_addgroup(if_t, const char *); int if_delgroup(if_t, const char *); int if_addmulti(if_t, struct sockaddr *, struct ifmultiaddr **); int if_allmulti(if_t, int); if_t if_alloc(u_char); if_t if_alloc_dev(u_char, device_t dev); void if_attach(if_t); void if_dead(if_t); int if_delmulti(if_t, struct sockaddr *); void if_delmulti_ifma(struct ifmultiaddr *); void if_delmulti_ifma_flags(struct ifmultiaddr *, int flags); void if_detach(if_t); void if_purgeaddrs(if_t); void if_delallmulti(if_t); void if_down(if_t); struct ifmultiaddr * if_findmulti(if_t, const struct sockaddr *); void if_freemulti(struct ifmultiaddr *ifma); void if_free(if_t); void if_initname(if_t, const char *, int); void if_link_state_change(if_t, int); int if_printf(if_t, const char *, ...) __printflike(2, 3); int if_log(if_t, int, const char *, ...) __printflike(3, 4); void if_ref(if_t); void if_rele(if_t); bool __result_use_check if_try_ref(if_t); int if_setlladdr(if_t, const u_char *, int); int if_tunnel_check_nesting(if_t, struct mbuf *, uint32_t, int); void if_up(if_t); int ifioctl(struct socket *, u_long, caddr_t, struct thread *); int ifpromisc(if_t, int); if_t ifunit(const char *); if_t ifunit_ref(const char *); int ifa_add_loopback_route(struct ifaddr *, struct sockaddr *); int ifa_del_loopback_route(struct ifaddr *, struct sockaddr *); int ifa_switch_loopback_route(struct ifaddr *, struct sockaddr *); struct ifaddr *ifa_ifwithaddr(const struct sockaddr *); int ifa_ifwithaddr_check(const struct sockaddr *); struct ifaddr *ifa_ifwithbroadaddr(const struct sockaddr *, int); struct ifaddr *ifa_ifwithdstaddr(const struct sockaddr *, int); struct ifaddr *ifa_ifwithnet(const struct sockaddr *, int, int); struct ifaddr *ifa_ifwithroute(int, const struct sockaddr *, const struct sockaddr *, u_int); struct ifaddr *ifaof_ifpforaddr(const struct sockaddr *, if_t); int ifa_preferred(struct ifaddr *, struct ifaddr *); int if_simloop(if_t ifp, struct mbuf *m, int af, int hlen); typedef void *if_com_alloc_t(u_char type, if_t ifp); typedef void if_com_free_t(void *com, u_char type); void if_register_com_alloc(u_char type, if_com_alloc_t *a, if_com_free_t *f); void if_deregister_com_alloc(u_char type); void if_data_copy(if_t, struct if_data *); uint64_t if_get_counter_default(if_t, ift_counter); void if_inc_counter(if_t, ift_counter, int64_t); uint64_t if_setbaudrate(if_t ifp, uint64_t baudrate); uint64_t if_getbaudrate(const if_t ifp); int if_setcapabilities(if_t ifp, int capabilities); int if_setcapabilitiesbit(if_t ifp, int setbit, int clearbit); int if_getcapabilities(const if_t ifp); int if_togglecapenable(if_t ifp, int togglecap); int if_setcapenable(if_t ifp, int capenable); int if_setcapenablebit(if_t ifp, int setcap, int clearcap); int if_getcapenable(const if_t ifp); int if_setcapabilities2(if_t ifp, int capabilities); int if_setcapabilities2bit(if_t ifp, int setbit, int clearbit); int if_getcapabilities2(const if_t ifp); int if_togglecapenable2(if_t ifp, int togglecap); int if_setcapenable2(if_t ifp, int capenable); int if_setcapenable2bit(if_t ifp, int setcap, int clearcap); int if_getcapenable2(const if_t ifp); int if_getdunit(const if_t ifp); int if_getindex(const if_t ifp); int if_getidxgen(const if_t ifp); const char *if_getdname(const if_t ifp); void if_setdname(if_t ifp, const char *name); const char *if_name(if_t ifp); int if_setname(if_t ifp, const char *name); int if_rename(if_t ifp, char *new_name); +const char *if_getdescr(if_t ifp); void if_setdescr(if_t ifp, char *descrbuf); char *if_allocdescr(size_t sz, int malloc_flag); void if_freedescr(char *descrbuf); +void if_setlastchange(if_t ifp); int if_getalloctype(const if_t ifp); int if_gettype(const if_t ifp); int if_setdev(if_t ifp, void *dev); int if_setdrvflagbits(if_t ifp, int if_setflags, int clear_flags); int if_getdrvflags(const if_t ifp); int if_setdrvflags(if_t ifp, int flags); int if_getlinkstate(if_t ifp); int if_clearhwassist(if_t ifp); int if_sethwassistbits(if_t ifp, int toset, int toclear); int if_sethwassist(if_t ifp, int hwassist_bit); int if_gethwassist(const if_t ifp); int if_togglehwassist(if_t ifp, int toggle_bits); int if_setsoftc(if_t ifp, void *softc); void *if_getsoftc(if_t ifp); int if_setflags(if_t ifp, int flags); void if_setllsoftc(if_t ifp, void *softc); void *if_getllsoftc(if_t ifp); u_int if_getfib(if_t ifp); uint8_t if_getaddrlen(if_t ifp); int if_gethwaddr(const if_t ifp, struct ifreq *); const uint8_t *if_getbroadcastaddr(const if_t ifp); void if_setbroadcastaddr(if_t ifp, const uint8_t *); int if_setmtu(if_t ifp, int mtu); int if_getmtu(const if_t ifp); int if_getmtu_family(const if_t ifp, int family); void if_notifymtu(if_t ifp); int if_setflagbits(if_t ifp, int set, int clear); int if_setflags(if_t ifp, int flags); int if_getflags(const if_t ifp); int if_getnumadomain(if_t ifp); int if_sendq_empty(if_t ifp); int if_setsendqready(if_t ifp); int if_setsendqlen(if_t ifp, int tx_desc_count); int if_sethwtsomax(if_t ifp, u_int if_hw_tsomax); int if_sethwtsomaxsegcount(if_t ifp, u_int if_hw_tsomaxsegcount); int if_sethwtsomaxsegsize(if_t ifp, u_int if_hw_tsomaxsegsize); u_int if_gethwtsomax(const if_t ifp); u_int if_gethwtsomaxsegcount(const if_t ifp); u_int if_gethwtsomaxsegsize(const if_t ifp); void if_setnetmapadapter(if_t ifp, struct netmap_adapter *na); struct netmap_adapter *if_getnetmapadapter(if_t ifp); void if_input(if_t ifp, struct mbuf* sendmp); int if_sendq_prepend(if_t ifp, struct mbuf *m); struct mbuf *if_dequeue(if_t ifp); int if_setifheaderlen(if_t ifp, int len); void if_setrcvif(struct mbuf *m, if_t ifp); void if_setvtag(struct mbuf *m, u_int16_t tag); u_int16_t if_getvtag(struct mbuf *m); int if_vlantrunkinuse(if_t ifp); caddr_t if_getlladdr(const if_t ifp); struct vnet *if_getvnet(const if_t ifp); void *if_gethandle(u_char); void if_bpfmtap(if_t ifp, struct mbuf *m); void if_etherbpfmtap(if_t ifp, struct mbuf *m); void if_vlancap(if_t ifp); int if_transmit(if_t ifp, struct mbuf *m); void if_init(if_t ifp, void *ctx); +int if_ioctl(if_t ifp, u_long cmd, void *data); int if_resolvemulti(if_t ifp, struct sockaddr **, struct sockaddr *); uint64_t if_getcounter(if_t ifp, ift_counter counter); struct label *if_getmaclabel(if_t ifp); void if_setmaclabel(if_t ifp, struct label *label); struct bpf_if *if_getbpf(if_t ifp); uint8_t if_getpcp(if_t ifp); void *if_getl2com(if_t ifp); struct ifvlantrunk *if_getvlantrunk(if_t ifp); bool if_altq_is_enabled(if_t ifp); void *if_getafdata(if_t ifp, int); int if_snd_tag_alloc(if_t ifp, union if_snd_tag_alloc_params *params, struct m_snd_tag **mstp); /* * Traversing through interface address lists. */ struct sockaddr_dl; typedef u_int iflladdr_cb_t(void *, struct sockaddr_dl *, u_int); u_int if_foreach_lladdr(if_t, iflladdr_cb_t, void *); u_int if_foreach_llmaddr(if_t, iflladdr_cb_t, void *); u_int if_lladdr_count(if_t); u_int if_llmaddr_count(if_t); bool if_maddr_empty(if_t); int if_getamcount(const if_t ifp); struct ifaddr * if_getifaddr(const if_t ifp); typedef u_int if_addr_cb_t(void *, struct ifaddr *, u_int); u_int if_foreach_addr_type(if_t ifp, int type, if_addr_cb_t cb, void *cb_arg); typedef int (*if_foreach_cb_t)(if_t, void *); typedef bool (*if_foreach_match_t)(if_t, void *); int if_foreach(if_foreach_cb_t, void *); int if_foreach_sleep(if_foreach_match_t, void *, if_foreach_cb_t, void *); /* Opaque iterator structure for iterating over interfaces. */ struct if_iter { void *context[4]; }; if_t if_iter_start(struct if_iter *); if_t if_iter_next(struct if_iter *); void if_iter_finish(struct if_iter *); +struct ifa_iter { + void *context[4]; +}; + +struct ifaddr *ifa_iter_start(if_t ifp, struct ifa_iter *iter); +struct ifaddr *ifa_iter_next(struct ifa_iter *iter); +void ifa_iter_finish(struct ifa_iter *iter); + /* Functions */ void if_setinitfn(if_t ifp, if_init_fn_t); void if_setinputfn(if_t ifp, if_input_fn_t); if_input_fn_t if_getinputfn(if_t ifp); void if_setioctlfn(if_t ifp, if_ioctl_fn_t); void if_setoutputfn(if_t ifp, if_output_fn_t); void if_setstartfn(if_t ifp, if_start_fn_t); if_start_fn_t if_getstartfn(if_t ifp); void if_settransmitfn(if_t ifp, if_transmit_fn_t); if_transmit_fn_t if_gettransmitfn(if_t ifp); void if_setqflushfn(if_t ifp, if_qflush_fn_t); void if_setgetcounterfn(if_t ifp, if_get_counter_t); void if_setsndtagallocfn(if_t ifp, if_snd_tag_alloc_t); void if_setdebugnet_methods(if_t, struct debugnet_methods *); void if_setreassignfn(if_t ifp, if_reassign_fn_t); void if_setratelimitqueryfn(if_t ifp, if_ratelimit_query_t); /* TSO */ void if_hw_tsomax_common(if_t ifp, struct ifnet_hw_tsomax *); int if_hw_tsomax_update(if_t ifp, struct ifnet_hw_tsomax *); /* accessors for struct ifreq */ void *ifr_data_get_ptr(void *ifrp); void *ifr_buffer_get_buffer(void *data); size_t ifr_buffer_get_length(void *data); int ifhwioctl(u_long, if_t, caddr_t, struct thread *); #ifdef DEVICE_POLLING enum poll_cmd { POLL_ONLY, POLL_AND_CHECK_STATUS }; typedef int poll_handler_t(if_t ifp, enum poll_cmd cmd, int count); int ether_poll_register(poll_handler_t *h, if_t ifp); int ether_poll_deregister(if_t ifp); #endif /* DEVICE_POLLING */ #endif /* _KERNEL */ #include /* XXX: temporary until drivers converted. */ #include /* XXXAO: temporary unconditional include */ #endif /* !_NET_IF_VAR_H_ */ diff --git a/sys/netlink/route/iface.c b/sys/netlink/route/iface.c index 3d7f752fa613..0874b73a5c98 100644 --- a/sys/netlink/route/iface.c +++ b/sys/netlink/route/iface.c @@ -1,1522 +1,1530 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2022 Alexander V. Chernikov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include "opt_netlink.h" #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* scope deembedding */ #include #include #include #include #define DEBUG_MOD_NAME nl_iface #define DEBUG_MAX_LEVEL LOG_DEBUG3 #include _DECLARE_DEBUG(LOG_INFO); struct netlink_walkargs { struct nl_writer *nw; struct nlmsghdr hdr; struct nlpcb *so; struct ucred *cred; uint32_t fibnum; int family; int error; int count; int dumped; }; static eventhandler_tag ifdetach_event, ifattach_event, iflink_event, ifaddr_event; static SLIST_HEAD(, nl_cloner) nl_cloners = SLIST_HEAD_INITIALIZER(nl_cloners); static struct sx rtnl_cloner_lock; SX_SYSINIT(rtnl_cloner_lock, &rtnl_cloner_lock, "rtnl cloner lock"); /* These are external hooks for CARP. */ extern int (*carp_get_vhid_p)(struct ifaddr *); /* * RTM_GETLINK request * sendto(3, {{len=32, type=RTM_GETLINK, flags=NLM_F_REQUEST|NLM_F_DUMP, seq=1641940952, pid=0}, * {ifi_family=AF_INET, ifi_type=ARPHRD_NETROM, ifi_index=0, ifi_flags=0, ifi_change=0}}, 32, 0, NULL, 0) = 32 * * Reply: * {ifi_family=AF_UNSPEC, ifi_type=ARPHRD_ETHER, ifi_index=if_nametoindex("enp0s31f6"), ifi_flags=IFF_UP|IFF_BROADCAST|IFF_RUNNING|IFF_MULTICAST|IFF_LOWER_UP, ifi_change=0}, {{nla_len=10, nla_type=IFLA_ADDRESS}, "\xfe\x54\x00\x52\x3e\x90"} [ {{nla_len=14, nla_type=IFLA_IFNAME}, "enp0s31f6"}, {{nla_len=8, nla_type=IFLA_TXQLEN}, 1000}, {{nla_len=5, nla_type=IFLA_OPERSTATE}, 6}, {{nla_len=5, nla_type=IFLA_LINKMODE}, 0}, {{nla_len=8, nla_type=IFLA_MTU}, 1500}, {{nla_len=8, nla_type=IFLA_MIN_MTU}, 68}, {{nla_len=8, nla_type=IFLA_MAX_MTU}, 9000}, {{nla_len=8, nla_type=IFLA_GROUP}, 0}, {{nla_len=8, nla_type=IFLA_PROMISCUITY}, 0}, {{nla_len=8, nla_type=IFLA_NUM_TX_QUEUES}, 1}, {{nla_len=8, nla_type=IFLA_GSO_MAX_SEGS}, 65535}, {{nla_len=8, nla_type=IFLA_GSO_MAX_SIZE}, 65536}, {{nla_len=8, nla_type=IFLA_NUM_RX_QUEUES}, 1}, {{nla_len=5, nla_type=IFLA_CARRIER}, 1}, {{nla_len=13, nla_type=IFLA_QDISC}, "fq_codel"}, {{nla_len=8, nla_type=IFLA_CARRIER_CHANGES}, 2}, {{nla_len=5, nla_type=IFLA_PROTO_DOWN}, 0}, {{nla_len=8, nla_type=IFLA_CARRIER_UP_COUNT}, 1}, {{nla_len=8, nla_type=IFLA_CARRIER_DOWN_COUNT}, 1}, */ struct if_state { uint8_t ifla_operstate; uint8_t ifla_carrier; }; static void -get_operstate_ether(struct ifnet *ifp, struct if_state *pstate) +get_operstate_ether(if_t ifp, struct if_state *pstate) { struct ifmediareq ifmr = {}; int error; - error = (*ifp->if_ioctl)(ifp, SIOCGIFMEDIA, (void *)&ifmr); + error = if_ioctl(ifp, SIOCGIFMEDIA, (void *)&ifmr); if (error != 0) { NL_LOG(LOG_DEBUG, "error calling SIOCGIFMEDIA on %s: %d", if_name(ifp), error); return; } switch (IFM_TYPE(ifmr.ifm_active)) { case IFM_ETHER: if (ifmr.ifm_status & IFM_ACTIVE) { pstate->ifla_carrier = 1; - if (ifp->if_flags & IFF_MONITOR) + if (if_getflags(ifp) & IFF_MONITOR) pstate->ifla_operstate = IF_OPER_DORMANT; else pstate->ifla_operstate = IF_OPER_UP; } else pstate->ifla_operstate = IF_OPER_DOWN; } } static bool -get_stats(struct nl_writer *nw, struct ifnet *ifp) +get_stats(struct nl_writer *nw, if_t ifp) { struct rtnl_link_stats64 *stats; int nla_len = sizeof(struct nlattr) + sizeof(*stats); struct nlattr *nla = nlmsg_reserve_data(nw, nla_len, struct nlattr); if (nla == NULL) return (false); nla->nla_type = IFLA_STATS64; nla->nla_len = nla_len; stats = (struct rtnl_link_stats64 *)(nla + 1); - stats->rx_packets = ifp->if_get_counter(ifp, IFCOUNTER_IPACKETS); - stats->tx_packets = ifp->if_get_counter(ifp, IFCOUNTER_OPACKETS); - stats->rx_bytes = ifp->if_get_counter(ifp, IFCOUNTER_IBYTES); - stats->tx_bytes = ifp->if_get_counter(ifp, IFCOUNTER_OBYTES); - stats->rx_errors = ifp->if_get_counter(ifp, IFCOUNTER_IERRORS); - stats->tx_errors = ifp->if_get_counter(ifp, IFCOUNTER_OERRORS); - stats->rx_dropped = ifp->if_get_counter(ifp, IFCOUNTER_IQDROPS); - stats->tx_dropped = ifp->if_get_counter(ifp, IFCOUNTER_OQDROPS); - stats->multicast = ifp->if_get_counter(ifp, IFCOUNTER_IMCASTS); - stats->rx_nohandler = ifp->if_get_counter(ifp, IFCOUNTER_NOPROTO); + stats->rx_packets = if_getcounter(ifp, IFCOUNTER_IPACKETS); + stats->tx_packets = if_getcounter(ifp, IFCOUNTER_OPACKETS); + stats->rx_bytes = if_getcounter(ifp, IFCOUNTER_IBYTES); + stats->tx_bytes = if_getcounter(ifp, IFCOUNTER_OBYTES); + stats->rx_errors = if_getcounter(ifp, IFCOUNTER_IERRORS); + stats->tx_errors = if_getcounter(ifp, IFCOUNTER_OERRORS); + stats->rx_dropped = if_getcounter(ifp, IFCOUNTER_IQDROPS); + stats->tx_dropped = if_getcounter(ifp, IFCOUNTER_OQDROPS); + stats->multicast = if_getcounter(ifp, IFCOUNTER_IMCASTS); + stats->rx_nohandler = if_getcounter(ifp, IFCOUNTER_NOPROTO); return (true); } static void -get_operstate(struct ifnet *ifp, struct if_state *pstate) +get_operstate(if_t ifp, struct if_state *pstate) { pstate->ifla_operstate = IF_OPER_UNKNOWN; pstate->ifla_carrier = 0; /* no carrier */ - switch (ifp->if_type) { + switch (if_gettype(ifp)) { case IFT_ETHER: case IFT_L2VLAN: get_operstate_ether(ifp, pstate); break; default: /* Map admin state to the operstate */ - if (ifp->if_flags & IFF_UP) { + if (if_getflags(ifp) & IFF_UP) { pstate->ifla_operstate = IF_OPER_UP; pstate->ifla_carrier = 1; } else pstate->ifla_operstate = IF_OPER_DOWN; break; } } static void -get_hwaddr(struct nl_writer *nw, struct ifnet *ifp) +get_hwaddr(struct nl_writer *nw, if_t ifp) { struct ifreq ifr = {}; if (if_gethwaddr(ifp, &ifr) == 0) { nlattr_add(nw, IFLAF_ORIG_HWADDR, if_getaddrlen(ifp), ifr.ifr_addr.sa_data); } } static unsigned -ifp_flags_to_netlink(const struct ifnet *ifp) +ifp_flags_to_netlink(const if_t ifp) { - return (ifp->if_flags | ifp->if_drv_flags); + return (if_getflags(ifp) | if_getdrvflags(ifp)); } #define LLADDR_CONST(s) ((const void *)((s)->sdl_data + (s)->sdl_nlen)) static bool dump_sa(struct nl_writer *nw, int attr, const struct sockaddr *sa) { uint32_t addr_len = 0; const void *addr_data = NULL; #ifdef INET6 struct in6_addr addr6; #endif if (sa == NULL) return (true); switch (sa->sa_family) { #ifdef INET case AF_INET: addr_len = sizeof(struct in_addr); addr_data = &((const struct sockaddr_in *)sa)->sin_addr; break; #endif #ifdef INET6 case AF_INET6: in6_splitscope(&((const struct sockaddr_in6 *)sa)->sin6_addr, &addr6, &addr_len); addr_len = sizeof(struct in6_addr); addr_data = &addr6; break; #endif case AF_LINK: addr_len = ((const struct sockaddr_dl *)sa)->sdl_alen; addr_data = LLADDR_CONST((const struct sockaddr_dl *)sa); break; case AF_UNSPEC: /* Ignore empty SAs without warning */ return (true); default: NL_LOG(LOG_DEBUG2, "unsupported family: %d, skipping", sa->sa_family); return (true); } return (nlattr_add(nw, attr, addr_len, addr_data)); } static bool dump_iface_caps(struct nl_writer *nw, struct ifnet *ifp) { int off = nlattr_add_nested(nw, IFLAF_CAPS); uint32_t active_caps[roundup2(IFCAP_B_SIZE, 32) / 32] = {}; uint32_t all_caps[roundup2(IFCAP_B_SIZE, 32) / 32] = {}; MPASS(sizeof(active_caps) >= 8); MPASS(sizeof(all_caps) >= 8); if (off == 0) return (false); active_caps[0] = (uint32_t)if_getcapabilities(ifp); all_caps[0] = (uint32_t)if_getcapenable(ifp); active_caps[1] = (uint32_t)if_getcapabilities2(ifp); all_caps[1] = (uint32_t)if_getcapenable2(ifp); nlattr_add_u32(nw, NLA_BITSET_SIZE, IFCAP_B_SIZE); nlattr_add(nw, NLA_BITSET_MASK, sizeof(all_caps), all_caps); nlattr_add(nw, NLA_BITSET_VALUE, sizeof(active_caps), active_caps); nlattr_set_len(nw, off); return (true); } /* * Dumps interface state, properties and metrics. * @nw: message writer * @ifp: target interface * @hdr: template header * @if_flags_mask: changed if_[drv]_flags bitmask * * This function is called without epoch and MAY sleep. */ static bool -dump_iface(struct nl_writer *nw, struct ifnet *ifp, const struct nlmsghdr *hdr, +dump_iface(struct nl_writer *nw, if_t ifp, const struct nlmsghdr *hdr, int if_flags_mask) { struct ifinfomsg *ifinfo; NL_LOG(LOG_DEBUG3, "dumping interface %s data", if_name(ifp)); if (!nlmsg_reply(nw, hdr, sizeof(struct ifinfomsg))) goto enomem; ifinfo = nlmsg_reserve_object(nw, struct ifinfomsg); ifinfo->ifi_family = AF_UNSPEC; ifinfo->__ifi_pad = 0; - ifinfo->ifi_type = ifp->if_type; - ifinfo->ifi_index = ifp->if_index; + ifinfo->ifi_type = if_gettype(ifp); + ifinfo->ifi_index = if_getindex(ifp); ifinfo->ifi_flags = ifp_flags_to_netlink(ifp); ifinfo->ifi_change = if_flags_mask; struct if_state ifs = {}; get_operstate(ifp, &ifs); if (ifs.ifla_operstate == IF_OPER_UP) ifinfo->ifi_flags |= IFF_LOWER_UP; nlattr_add_string(nw, IFLA_IFNAME, if_name(ifp)); nlattr_add_u8(nw, IFLA_OPERSTATE, ifs.ifla_operstate); nlattr_add_u8(nw, IFLA_CARRIER, ifs.ifla_carrier); /* nlattr_add_u8(nw, IFLA_PROTO_DOWN, val); nlattr_add_u8(nw, IFLA_LINKMODE, val); */ if (if_getaddrlen(ifp) != 0) { struct ifaddr *ifa = if_getifaddr(ifp); dump_sa(nw, IFLA_ADDRESS, ifa->ifa_addr); } - if ((ifp->if_broadcastaddr != NULL)) { - nlattr_add(nw, IFLA_BROADCAST, ifp->if_addrlen, - ifp->if_broadcastaddr); + if ((if_getbroadcastaddr(ifp) != NULL)) { + nlattr_add(nw, IFLA_BROADCAST, if_getaddrlen(ifp), + if_getbroadcastaddr(ifp)); } - nlattr_add_u32(nw, IFLA_MTU, ifp->if_mtu); + nlattr_add_u32(nw, IFLA_MTU, if_getmtu(ifp)); /* nlattr_add_u32(nw, IFLA_MIN_MTU, 60); nlattr_add_u32(nw, IFLA_MAX_MTU, 9000); nlattr_add_u32(nw, IFLA_GROUP, 0); */ - if (ifp->if_description != NULL) - nlattr_add_string(nw, IFLA_IFALIAS, ifp->if_description); + if (if_getdescr(ifp) != NULL) + nlattr_add_string(nw, IFLA_IFALIAS, if_getdescr(ifp)); /* Store FreeBSD-specific attributes */ int off = nlattr_add_nested(nw, IFLA_FREEBSD); if (off != 0) { get_hwaddr(nw, ifp); dump_iface_caps(nw, ifp); nlattr_set_len(nw, off); } get_stats(nw, ifp); - uint32_t val = (ifp->if_flags & IFF_PROMISC) != 0; + uint32_t val = (if_getflags(ifp) & IFF_PROMISC) != 0; nlattr_add_u32(nw, IFLA_PROMISCUITY, val); ifc_dump_ifp_nl(ifp, nw); if (nlmsg_end(nw)) return (true); enomem: NL_LOG(LOG_DEBUG, "unable to dump interface %s state (ENOMEM)", if_name(ifp)); nlmsg_abort(nw); return (false); } static bool check_ifmsg(void *hdr, struct nl_pstate *npt) { struct ifinfomsg *ifm = hdr; if (ifm->__ifi_pad != 0 || ifm->ifi_type != 0 || ifm->ifi_flags != 0 || ifm->ifi_change != 0) { nlmsg_report_err_msg(npt, "strict checking: non-zero values in ifinfomsg header"); return (false); } return (true); } #define _IN(_field) offsetof(struct ifinfomsg, _field) #define _OUT(_field) offsetof(struct nl_parsed_link, _field) static const struct nlfield_parser nlf_p_if[] = { { .off_in = _IN(ifi_type), .off_out = _OUT(ifi_type), .cb = nlf_get_u16 }, { .off_in = _IN(ifi_index), .off_out = _OUT(ifi_index), .cb = nlf_get_u32 }, { .off_in = _IN(ifi_flags), .off_out = _OUT(ifi_flags), .cb = nlf_get_u32 }, { .off_in = _IN(ifi_change), .off_out = _OUT(ifi_change), .cb = nlf_get_u32 }, }; static const struct nlattr_parser nla_p_linfo[] = { { .type = IFLA_INFO_KIND, .off = _OUT(ifla_cloner), .cb = nlattr_get_stringn }, { .type = IFLA_INFO_DATA, .off = _OUT(ifla_idata), .cb = nlattr_get_nla }, }; NL_DECLARE_ATTR_PARSER(linfo_parser, nla_p_linfo); static const struct nlattr_parser nla_p_if[] = { { .type = IFLA_IFNAME, .off = _OUT(ifla_ifname), .cb = nlattr_get_string }, { .type = IFLA_MTU, .off = _OUT(ifla_mtu), .cb = nlattr_get_uint32 }, { .type = IFLA_LINK, .off = _OUT(ifla_link), .cb = nlattr_get_uint32 }, { .type = IFLA_LINKINFO, .arg = &linfo_parser, .cb = nlattr_get_nested }, { .type = IFLA_IFALIAS, .off = _OUT(ifla_ifalias), .cb = nlattr_get_string }, { .type = IFLA_GROUP, .off = _OUT(ifla_group), .cb = nlattr_get_string }, { .type = IFLA_ALT_IFNAME, .off = _OUT(ifla_ifname), .cb = nlattr_get_string }, }; #undef _IN #undef _OUT NL_DECLARE_STRICT_PARSER(ifmsg_parser, struct ifinfomsg, check_ifmsg, nlf_p_if, nla_p_if); static bool -match_iface(struct ifnet *ifp, void *_arg) +match_iface(if_t ifp, void *_arg) { struct nl_parsed_link *attrs = (struct nl_parsed_link *)_arg; - if (attrs->ifi_index != 0 && attrs->ifi_index != ifp->if_index) + if (attrs->ifi_index != 0 && attrs->ifi_index != if_getindex(ifp)) return (false); - if (attrs->ifi_type != 0 && attrs->ifi_index != ifp->if_type) + if (attrs->ifi_type != 0 && attrs->ifi_index != if_gettype(ifp)) return (false); if (attrs->ifla_ifname != NULL && strcmp(attrs->ifla_ifname, if_name(ifp))) return (false); /* TODO: add group match */ return (true); } static int -dump_cb(struct ifnet *ifp, void *_arg) +dump_cb(if_t ifp, void *_arg) { struct netlink_walkargs *wa = (struct netlink_walkargs *)_arg; if (!dump_iface(wa->nw, ifp, &wa->hdr, 0)) return (ENOMEM); return (0); } /* * {nlmsg_len=52, nlmsg_type=RTM_GETLINK, nlmsg_flags=NLM_F_REQUEST, nlmsg_seq=1662842818, nlmsg_pid=0}, * {ifi_family=AF_PACKET, ifi_type=ARPHRD_NETROM, ifi_index=0, ifi_flags=0, ifi_change=0}, * [ * [{nla_len=10, nla_type=IFLA_IFNAME}, "vnet9"], * [{nla_len=8, nla_type=IFLA_EXT_MASK}, RTEXT_FILTER_VF] * ] */ static int rtnl_handle_getlink(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_pstate *npt) { struct epoch_tracker et; - struct ifnet *ifp; + if_t ifp; int error = 0; struct nl_parsed_link attrs = {}; error = nl_parse_nlmsg(hdr, &ifmsg_parser, npt, &attrs); if (error != 0) return (error); struct netlink_walkargs wa = { .so = nlp, .nw = npt->nw, .hdr.nlmsg_pid = hdr->nlmsg_pid, .hdr.nlmsg_seq = hdr->nlmsg_seq, .hdr.nlmsg_flags = hdr->nlmsg_flags, .hdr.nlmsg_type = NL_RTM_NEWLINK, }; /* Fast track for an interface w/ explicit name or index match */ if ((attrs.ifi_index != 0) || (attrs.ifla_ifname != NULL)) { if (attrs.ifi_index != 0) { NLP_LOG(LOG_DEBUG3, nlp, "fast track -> searching index %u", attrs.ifi_index); NET_EPOCH_ENTER(et); ifp = ifnet_byindex_ref(attrs.ifi_index); NET_EPOCH_EXIT(et); } else { NLP_LOG(LOG_DEBUG3, nlp, "fast track -> searching name %s", attrs.ifla_ifname); ifp = ifunit_ref(attrs.ifla_ifname); } if (ifp != NULL) { if (match_iface(ifp, &attrs)) { if (!dump_iface(wa.nw, ifp, &wa.hdr, 0)) error = ENOMEM; } else error = ENODEV; if_rele(ifp); } else error = ENODEV; return (error); } /* Always treat non-direct-match as a multipart message */ wa.hdr.nlmsg_flags |= NLM_F_MULTI; /* * Fetching some link properties require performing ioctl's that may be blocking. * Address it by saving referenced pointers of the matching links, * exiting from epoch and going through the list one-by-one. */ NL_LOG(LOG_DEBUG2, "Start dump"); if_foreach_sleep(match_iface, &attrs, dump_cb, &wa); NL_LOG(LOG_DEBUG2, "End dump, iterated %d dumped %d", wa.count, wa.dumped); if (!nlmsg_end_dump(wa.nw, error, &wa.hdr)) { NL_LOG(LOG_DEBUG, "Unable to finalize the dump"); return (ENOMEM); } return (error); } /* * sendmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base=[ * {nlmsg_len=60, nlmsg_type=RTM_NEWLINK, nlmsg_flags=NLM_F_REQUEST|NLM_F_ACK|NLM_F_EXCL|NLM_F_CREATE, nlmsg_seq=1662715618, nlmsg_pid=0}, * {ifi_family=AF_UNSPEC, ifi_type=ARPHRD_NETROM, ifi_index=0, ifi_flags=0, ifi_change=0}, * {nla_len=11, nla_type=IFLA_IFNAME}, "dummy0"], * [ * {nla_len=16, nla_type=IFLA_LINKINFO}, * [ * {nla_len=9, nla_type=IFLA_INFO_KIND}, "dummy"... * ] * ] */ static int rtnl_handle_dellink(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_pstate *npt) { struct epoch_tracker et; - struct ifnet *ifp; + if_t ifp; int error; struct nl_parsed_link attrs = {}; error = nl_parse_nlmsg(hdr, &ifmsg_parser, npt, &attrs); if (error != 0) return (error); NET_EPOCH_ENTER(et); ifp = ifnet_byindex_ref(attrs.ifi_index); NET_EPOCH_EXIT(et); if (ifp == NULL) { NLP_LOG(LOG_DEBUG, nlp, "unable to find interface %u", attrs.ifi_index); return (ENOENT); } NLP_LOG(LOG_DEBUG3, nlp, "mapped ifindex %u to %s", attrs.ifi_index, if_name(ifp)); sx_xlock(&ifnet_detach_sxlock); error = if_clone_destroy(if_name(ifp)); sx_xunlock(&ifnet_detach_sxlock); NLP_LOG(LOG_DEBUG2, nlp, "deleting interface %s returned %d", if_name(ifp), error); if_rele(ifp); return (error); } /* * New link: * type=RTM_NEWLINK, flags=NLM_F_REQUEST|NLM_F_ACK|NLM_F_EXCL|NLM_F_CREATE, seq=1668185590, pid=0}, * {ifi_family=AF_UNSPEC, ifi_type=ARPHRD_NETROM, ifi_index=0, ifi_flags=0, ifi_change=0} * [ * {{nla_len=8, nla_type=IFLA_MTU}, 123}, * {{nla_len=10, nla_type=IFLA_IFNAME}, "vlan1"}, * {{nla_len=24, nla_type=IFLA_LINKINFO}, * [ * {{nla_len=8, nla_type=IFLA_INFO_KIND}, "vlan"...}, * {{nla_len=12, nla_type=IFLA_INFO_DATA}, "\x06\x00\x01\x00\x7b\x00\x00\x00"}]}]} * * Update link: * type=RTM_NEWLINK, flags=NLM_F_REQUEST|NLM_F_ACK, seq=1668185923, pid=0}, * {ifi_family=AF_UNSPEC, ifi_type=ARPHRD_NETROM, ifi_index=if_nametoindex("lo"), ifi_flags=0, ifi_change=0}, * {{nla_len=8, nla_type=IFLA_MTU}, 123}} * * * Check command availability: * type=RTM_NEWLINK, flags=NLM_F_REQUEST|NLM_F_ACK, seq=0, pid=0}, * {ifi_family=AF_UNSPEC, ifi_type=ARPHRD_NETROM, ifi_index=0, ifi_flags=0, ifi_change=0} */ static int create_link(struct nlmsghdr *hdr, struct nl_parsed_link *lattrs, struct nlattr_bmask *bm, struct nlpcb *nlp, struct nl_pstate *npt) { if (lattrs->ifla_ifname == NULL || strlen(lattrs->ifla_ifname) == 0) { NLMSG_REPORT_ERR_MSG(npt, "empty IFLA_IFNAME attribute"); return (EINVAL); } if (lattrs->ifla_cloner == NULL || strlen(lattrs->ifla_cloner) == 0) { NLMSG_REPORT_ERR_MSG(npt, "empty IFLA_INFO_KIND attribute"); return (EINVAL); } struct ifc_data_nl ifd = { .flags = IFC_F_CREATE, .lattrs = lattrs, .bm = bm, .npt = npt, }; if (ifc_create_ifp_nl(lattrs->ifla_ifname, &ifd) && ifd.error == 0) nl_store_ifp_cookie(npt, ifd.ifp); return (ifd.error); } static int modify_link(struct nlmsghdr *hdr, struct nl_parsed_link *lattrs, struct nlattr_bmask *bm, struct nlpcb *nlp, struct nl_pstate *npt) { - struct ifnet *ifp = NULL; + if_t ifp = NULL; struct epoch_tracker et; if (lattrs->ifi_index == 0 && lattrs->ifla_ifname == NULL) { /* * Applications like ip(8) verify RTM_NEWLINK command * existence by calling it with empty arguments. Always * return "innocent" error in that case. */ NLMSG_REPORT_ERR_MSG(npt, "empty ifi_index field"); return (EPERM); } if (lattrs->ifi_index != 0) { NET_EPOCH_ENTER(et); ifp = ifnet_byindex_ref(lattrs->ifi_index); NET_EPOCH_EXIT(et); if (ifp == NULL) { NLMSG_REPORT_ERR_MSG(npt, "unable to find interface #%u", lattrs->ifi_index); return (ENOENT); } } if (ifp == NULL && lattrs->ifla_ifname != NULL) { ifp = ifunit_ref(lattrs->ifla_ifname); if (ifp == NULL) { NLMSG_REPORT_ERR_MSG(npt, "unable to find interface %s", lattrs->ifla_ifname); return (ENOENT); } } MPASS(ifp != NULL); /* * Modification request can address either * 1) cloned interface, in which case we call the cloner-specific * modification routine * or * 2) non-cloned (e.g. "physical") interface, in which case we call * generic modification routine */ struct ifc_data_nl ifd = { .lattrs = lattrs, .bm = bm, .npt = npt }; if (!ifc_modify_ifp_nl(ifp, &ifd)) ifd.error = nl_modify_ifp_generic(ifp, lattrs, bm, npt); if_rele(ifp); return (ifd.error); } static int rtnl_handle_newlink(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_pstate *npt) { struct nlattr_bmask bm; int error; struct nl_parsed_link attrs = {}; error = nl_parse_nlmsg(hdr, &ifmsg_parser, npt, &attrs); if (error != 0) return (error); nl_get_attrs_bmask_nlmsg(hdr, &ifmsg_parser, &bm); if (hdr->nlmsg_flags & NLM_F_CREATE) return (create_link(hdr, &attrs, &bm, nlp, npt)); else return (modify_link(hdr, &attrs, &bm, nlp, npt)); } static void set_scope6(struct sockaddr *sa, uint32_t ifindex) { #ifdef INET6 if (sa != NULL && sa->sa_family == AF_INET6) { struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *)sa; if (IN6_IS_ADDR_LINKLOCAL(&sa6->sin6_addr)) in6_set_unicast_scopeid(&sa6->sin6_addr, ifindex); } #endif } static bool check_sa_family(const struct sockaddr *sa, int family, const char *attr_name, struct nl_pstate *npt) { if (sa == NULL || sa->sa_family == family) return (true); nlmsg_report_err_msg(npt, "wrong family for %s attribute: %d != %d", attr_name, family, sa->sa_family); return (false); } struct nl_parsed_ifa { uint8_t ifa_family; uint8_t ifa_prefixlen; uint8_t ifa_scope; uint32_t ifa_index; uint32_t ifa_flags; uint32_t ifaf_vhid; uint32_t ifaf_flags; struct sockaddr *ifa_address; struct sockaddr *ifa_local; struct sockaddr *ifa_broadcast; struct ifa_cacheinfo *ifa_cacheinfo; struct sockaddr *f_ifa_addr; struct sockaddr *f_ifa_dst; }; static int nlattr_get_cinfo(struct nlattr *nla, struct nl_pstate *npt, const void *arg __unused, void *target) { if (__predict_false(NLA_DATA_LEN(nla) != sizeof(struct ifa_cacheinfo))) { NLMSG_REPORT_ERR_MSG(npt, "nla type %d size(%u) is not ifa_cacheinfo", nla->nla_type, NLA_DATA_LEN(nla)); return (EINVAL); } *((struct ifa_cacheinfo **)target) = (struct ifa_cacheinfo *)NL_RTA_DATA(nla); return (0); } #define _IN(_field) offsetof(struct ifaddrmsg, _field) #define _OUT(_field) offsetof(struct nl_parsed_ifa, _field) static const struct nlfield_parser nlf_p_ifa[] = { { .off_in = _IN(ifa_family), .off_out = _OUT(ifa_family), .cb = nlf_get_u8 }, { .off_in = _IN(ifa_prefixlen), .off_out = _OUT(ifa_prefixlen), .cb = nlf_get_u8 }, { .off_in = _IN(ifa_scope), .off_out = _OUT(ifa_scope), .cb = nlf_get_u8 }, { .off_in = _IN(ifa_flags), .off_out = _OUT(ifa_flags), .cb = nlf_get_u8_u32 }, { .off_in = _IN(ifa_index), .off_out = _OUT(ifa_index), .cb = nlf_get_u32 }, }; static const struct nlattr_parser nla_p_ifa_fbsd[] = { { .type = IFAF_VHID, .off = _OUT(ifaf_vhid), .cb = nlattr_get_uint32 }, { .type = IFAF_FLAGS, .off = _OUT(ifaf_flags), .cb = nlattr_get_uint32 }, }; NL_DECLARE_ATTR_PARSER(ifa_fbsd_parser, nla_p_ifa_fbsd); static const struct nlattr_parser nla_p_ifa[] = { { .type = IFA_ADDRESS, .off = _OUT(ifa_address), .cb = nlattr_get_ip }, { .type = IFA_LOCAL, .off = _OUT(ifa_local), .cb = nlattr_get_ip }, { .type = IFA_BROADCAST, .off = _OUT(ifa_broadcast), .cb = nlattr_get_ip }, { .type = IFA_CACHEINFO, .off = _OUT(ifa_cacheinfo), .cb = nlattr_get_cinfo }, { .type = IFA_FLAGS, .off = _OUT(ifa_flags), .cb = nlattr_get_uint32 }, { .type = IFA_FREEBSD, .arg = &ifa_fbsd_parser, .cb = nlattr_get_nested }, }; #undef _IN #undef _OUT static bool post_p_ifa(void *_attrs, struct nl_pstate *npt) { struct nl_parsed_ifa *attrs = (struct nl_parsed_ifa *)_attrs; if (!check_sa_family(attrs->ifa_address, attrs->ifa_family, "IFA_ADDRESS", npt)) return (false); if (!check_sa_family(attrs->ifa_local, attrs->ifa_family, "IFA_LOCAL", npt)) return (false); if (!check_sa_family(attrs->ifa_broadcast, attrs->ifa_family, "IFA_BROADADDR", npt)) return (false); set_scope6(attrs->ifa_address, attrs->ifa_index); set_scope6(attrs->ifa_local, attrs->ifa_index); return (true); } NL_DECLARE_PARSER_EXT(ifa_parser, struct ifaddrmsg, NULL, nlf_p_ifa, nla_p_ifa, post_p_ifa); /* {ifa_family=AF_INET, ifa_prefixlen=8, ifa_flags=IFA_F_PERMANENT, ifa_scope=RT_SCOPE_HOST, ifa_index=if_nametoindex("lo")}, [ {{nla_len=8, nla_type=IFA_ADDRESS}, inet_addr("127.0.0.1")}, {{nla_len=8, nla_type=IFA_LOCAL}, inet_addr("127.0.0.1")}, {{nla_len=7, nla_type=IFA_LABEL}, "lo"}, {{nla_len=8, nla_type=IFA_FLAGS}, IFA_F_PERMANENT}, {{nla_len=20, nla_type=IFA_CACHEINFO}, {ifa_prefered=4294967295, ifa_valid=4294967295, cstamp=3619, tstamp=3619}}]}, --- {{len=72, type=RTM_NEWADDR, flags=NLM_F_MULTI, seq=1642191126, pid=566735}, {ifa_family=AF_INET6, ifa_prefixlen=96, ifa_flags=IFA_F_PERMANENT, ifa_scope=RT_SCOPE_UNIVERSE, ifa_index=if_nametoindex("virbr0")}, [ {{nla_len=20, nla_type=IFA_ADDRESS}, inet_pton(AF_INET6, "2a01:4f8:13a:70c:ffff::1")}, {{nla_len=20, nla_type=IFA_CACHEINFO}, {ifa_prefered=4294967295, ifa_valid=4294967295, cstamp=4283, tstamp=4283}}, {{nla_len=8, nla_type=IFA_FLAGS}, IFA_F_PERMANENT}]}, */ static uint8_t ifa_get_scope(const struct ifaddr *ifa) { const struct sockaddr *sa; uint8_t addr_scope = RT_SCOPE_UNIVERSE; sa = ifa->ifa_addr; switch (sa->sa_family) { #ifdef INET case AF_INET: { struct in_addr addr; addr = ((const struct sockaddr_in *)sa)->sin_addr; if (IN_LOOPBACK(addr.s_addr)) addr_scope = RT_SCOPE_HOST; else if (IN_LINKLOCAL(addr.s_addr)) addr_scope = RT_SCOPE_LINK; break; } #endif #ifdef INET6 case AF_INET6: { const struct in6_addr *addr; addr = &((const struct sockaddr_in6 *)sa)->sin6_addr; if (IN6_IS_ADDR_LOOPBACK(addr)) addr_scope = RT_SCOPE_HOST; else if (IN6_IS_ADDR_LINKLOCAL(addr)) addr_scope = RT_SCOPE_LINK; break; } #endif } return (addr_scope); } #ifdef INET6 static uint8_t inet6_get_plen(const struct in6_addr *addr) { return (bitcount32(addr->s6_addr32[0]) + bitcount32(addr->s6_addr32[1]) + bitcount32(addr->s6_addr32[2]) + bitcount32(addr->s6_addr32[3])); } #endif static uint8_t get_sa_plen(const struct sockaddr *sa) { #ifdef INET const struct in_addr *paddr; #endif #ifdef INET6 const struct in6_addr *paddr6; #endif switch (sa->sa_family) { #ifdef INET case AF_INET: paddr = &(((const struct sockaddr_in *)sa)->sin_addr); return bitcount32(paddr->s_addr);; #endif #ifdef INET6 case AF_INET6: paddr6 = &(((const struct sockaddr_in6 *)sa)->sin6_addr); return inet6_get_plen(paddr6); #endif } return (0); } #ifdef INET6 static uint32_t in6_flags_to_nl(uint32_t flags) { uint32_t nl_flags = 0; if (flags & IN6_IFF_TEMPORARY) nl_flags |= IFA_F_TEMPORARY; if (flags & IN6_IFF_NODAD) nl_flags |= IFA_F_NODAD; if (flags & IN6_IFF_DEPRECATED) nl_flags |= IFA_F_DEPRECATED; if (flags & IN6_IFF_TENTATIVE) nl_flags |= IFA_F_TENTATIVE; if ((flags & (IN6_IFF_AUTOCONF|IN6_IFF_TEMPORARY)) == 0) flags |= IFA_F_PERMANENT; if (flags & IN6_IFF_DUPLICATED) flags |= IFA_F_DADFAILED; return (nl_flags); } static uint32_t nl_flags_to_in6(uint32_t flags) { uint32_t in6_flags = 0; if (flags & IFA_F_TEMPORARY) in6_flags |= IN6_IFF_TEMPORARY; if (flags & IFA_F_NODAD) in6_flags |= IN6_IFF_NODAD; if (flags & IFA_F_DEPRECATED) in6_flags |= IN6_IFF_DEPRECATED; if (flags & IFA_F_TENTATIVE) in6_flags |= IN6_IFF_TENTATIVE; if (flags & IFA_F_DADFAILED) in6_flags |= IN6_IFF_DUPLICATED; return (in6_flags); } static void export_cache_info6(struct nl_writer *nw, const struct in6_ifaddr *ia) { struct ifa_cacheinfo ci = { .cstamp = ia->ia6_createtime * 1000, .tstamp = ia->ia6_updatetime * 1000, .ifa_prefered = ia->ia6_lifetime.ia6t_pltime, .ifa_valid = ia->ia6_lifetime.ia6t_vltime, }; nlattr_add(nw, IFA_CACHEINFO, sizeof(ci), &ci); } #endif static void export_cache_info(struct nl_writer *nw, struct ifaddr *ifa) { switch (ifa->ifa_addr->sa_family) { #ifdef INET6 case AF_INET6: export_cache_info6(nw, (struct in6_ifaddr *)ifa); break; #endif } } /* * {'attrs': [('IFA_ADDRESS', '12.0.0.1'), ('IFA_LOCAL', '12.0.0.1'), ('IFA_LABEL', 'eth10'), ('IFA_FLAGS', 128), ('IFA_CACHEINFO', {'ifa_preferred': 4294967295, 'ifa_valid': 4294967295, 'cstamp': 63745746, 'tstamp': 63745746})], */ static bool -dump_iface_addr(struct nl_writer *nw, struct ifnet *ifp, struct ifaddr *ifa, +dump_iface_addr(struct nl_writer *nw, if_t ifp, struct ifaddr *ifa, const struct nlmsghdr *hdr) { struct ifaddrmsg *ifamsg; struct sockaddr *sa = ifa->ifa_addr; struct sockaddr *sa_dst = ifa->ifa_dstaddr; NL_LOG(LOG_DEBUG3, "dumping ifa %p type %s(%d) for interface %s", ifa, rib_print_family(sa->sa_family), sa->sa_family, if_name(ifp)); if (!nlmsg_reply(nw, hdr, sizeof(struct ifaddrmsg))) goto enomem; ifamsg = nlmsg_reserve_object(nw, struct ifaddrmsg); ifamsg->ifa_family = sa->sa_family; ifamsg->ifa_prefixlen = get_sa_plen(ifa->ifa_netmask); ifamsg->ifa_flags = 0; // ifa_flags is useless ifamsg->ifa_scope = ifa_get_scope(ifa); - ifamsg->ifa_index = ifp->if_index; + ifamsg->ifa_index = if_getindex(ifp); - if ((ifp->if_flags & IFF_POINTOPOINT) && sa_dst != NULL && sa_dst->sa_family != 0) { + if ((if_getflags(ifp) & IFF_POINTOPOINT) && sa_dst != NULL && sa_dst->sa_family != 0) { /* P2P interface may have IPv6 LL with no dst address */ dump_sa(nw, IFA_ADDRESS, sa_dst); dump_sa(nw, IFA_LOCAL, sa); } else { dump_sa(nw, IFA_ADDRESS, sa); #ifdef INET /* * In most cases, IFA_ADDRESS == IFA_LOCAL * Skip IFA_LOCAL for anything except INET */ if (sa->sa_family == AF_INET) dump_sa(nw, IFA_LOCAL, sa); #endif } - if (ifp->if_flags & IFF_BROADCAST) + if (if_getflags(ifp) & IFF_BROADCAST) dump_sa(nw, IFA_BROADCAST, ifa->ifa_broadaddr); nlattr_add_string(nw, IFA_LABEL, if_name(ifp)); uint32_t nl_ifa_flags = 0; #ifdef INET6 if (sa->sa_family == AF_INET6) { struct in6_ifaddr *ia = (struct in6_ifaddr *)ifa; nl_ifa_flags = in6_flags_to_nl(ia->ia6_flags); } #endif nlattr_add_u32(nw, IFA_FLAGS, nl_ifa_flags); export_cache_info(nw, ifa); /* Store FreeBSD-specific attributes */ int off = nlattr_add_nested(nw, IFA_FREEBSD); if (off != 0) { if (ifa->ifa_carp != NULL && carp_get_vhid_p != NULL) { uint32_t vhid = (uint32_t)(*carp_get_vhid_p)(ifa); nlattr_add_u32(nw, IFAF_VHID, vhid); } #ifdef INET6 if (sa->sa_family == AF_INET6) { uint32_t ifa_flags = ((struct in6_ifaddr *)ifa)->ia6_flags; nlattr_add_u32(nw, IFAF_FLAGS, ifa_flags); } #endif nlattr_set_len(nw, off); } if (nlmsg_end(nw)) return (true); enomem: NL_LOG(LOG_DEBUG, "Failed to dump ifa type %s(%d) for interface %s", rib_print_family(sa->sa_family), sa->sa_family, if_name(ifp)); nlmsg_abort(nw); return (false); } static int -dump_iface_addrs(struct netlink_walkargs *wa, struct ifnet *ifp) +dump_iface_addrs(struct netlink_walkargs *wa, if_t ifp) { struct ifaddr *ifa; + struct ifa_iter it; + int error = 0; - CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { + for (ifa = ifa_iter_start(ifp, &it); ifa != NULL; ifa = ifa_iter_next(&it)) { if (wa->family != 0 && wa->family != ifa->ifa_addr->sa_family) continue; if (ifa->ifa_addr->sa_family == AF_LINK) continue; if (prison_if(wa->cred, ifa->ifa_addr) != 0) continue; wa->count++; - if (!dump_iface_addr(wa->nw, ifp, ifa, &wa->hdr)) - return (ENOMEM); + if (!dump_iface_addr(wa->nw, ifp, ifa, &wa->hdr)) { + error = ENOMEM; + break; + } wa->dumped++; } + ifa_iter_finish(&it); - return (0); + return (error); } static int rtnl_handle_getaddr(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_pstate *npt) { - struct ifnet *ifp; + if_t ifp; int error = 0; struct nl_parsed_ifa attrs = {}; error = nl_parse_nlmsg(hdr, &ifa_parser, npt, &attrs); if (error != 0) return (error); struct netlink_walkargs wa = { .so = nlp, .nw = npt->nw, .cred = nlp_get_cred(nlp), .family = attrs.ifa_family, .hdr.nlmsg_pid = hdr->nlmsg_pid, .hdr.nlmsg_seq = hdr->nlmsg_seq, .hdr.nlmsg_flags = hdr->nlmsg_flags | NLM_F_MULTI, .hdr.nlmsg_type = NL_RTM_NEWADDR, }; NL_LOG(LOG_DEBUG2, "Start dump"); if (attrs.ifa_index != 0) { ifp = ifnet_byindex(attrs.ifa_index); if (ifp == NULL) error = ENOENT; else error = dump_iface_addrs(&wa, ifp); } else { - CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { + struct if_iter it; + + for (ifp = if_iter_start(&it); ifp != NULL; ifp = if_iter_next(&it)) { error = dump_iface_addrs(&wa, ifp); if (error != 0) break; } + if_iter_finish(&it); } NL_LOG(LOG_DEBUG2, "End dump, iterated %d dumped %d", wa.count, wa.dumped); if (!nlmsg_end_dump(wa.nw, error, &wa.hdr)) { NL_LOG(LOG_DEBUG, "Unable to finalize the dump"); return (ENOMEM); } return (error); } #ifdef INET static int handle_newaddr_inet(struct nlmsghdr *hdr, struct nl_parsed_ifa *attrs, - struct ifnet *ifp, struct nlpcb *nlp, struct nl_pstate *npt) + if_t ifp, struct nlpcb *nlp, struct nl_pstate *npt) { int plen = attrs->ifa_prefixlen; int if_flags = if_getflags(ifp); struct sockaddr_in *addr, *dst; if (plen > 32) { nlmsg_report_err_msg(npt, "invalid ifa_prefixlen"); return (EINVAL); }; if (if_flags & IFF_POINTOPOINT) { /* * Only P2P IFAs are allowed by the implementation. */ if (attrs->ifa_address == NULL || attrs->ifa_local == NULL) { nlmsg_report_err_msg(npt, "Empty IFA_LOCAL/IFA_ADDRESS"); return (EINVAL); } addr = (struct sockaddr_in *)attrs->ifa_local; dst = (struct sockaddr_in *)attrs->ifa_address; } else { /* * Map the Netlink attributes to FreeBSD ifa layout. * If only IFA_ADDRESS or IFA_LOCAL is set OR * both are set to the same value => ifa is not p2p * and the attribute value contains interface address. * * Otherwise (both IFA_ADDRESS and IFA_LOCAL are set and * different), IFA_LOCAL contains an interface address and * IFA_ADDRESS contains peer address. */ addr = (struct sockaddr_in *)attrs->ifa_local; if (addr == NULL) addr = (struct sockaddr_in *)attrs->ifa_address; if (addr == NULL) { nlmsg_report_err_msg(npt, "Empty IFA_LOCAL/IFA_ADDRESS"); return (EINVAL); } /* Generate broadcast address if not set */ if ((if_flags & IFF_BROADCAST) && attrs->ifa_broadcast == NULL) { uint32_t s_baddr; struct sockaddr_in *sin_brd; if (plen == 31) s_baddr = INADDR_BROADCAST; /* RFC 3021 */ else { uint32_t s_mask; s_mask = htonl(plen ? ~((1 << (32 - plen)) - 1) : 0); s_baddr = addr->sin_addr.s_addr | ~s_mask; } sin_brd = (struct sockaddr_in *)npt_alloc(npt, sizeof(*sin_brd)); if (sin_brd == NULL) return (ENOMEM); sin_brd->sin_family = AF_INET; sin_brd->sin_len = sizeof(*sin_brd); sin_brd->sin_addr.s_addr = s_baddr; attrs->ifa_broadcast = (struct sockaddr *)sin_brd; } dst = (struct sockaddr_in *)attrs->ifa_broadcast; } struct sockaddr_in mask = { .sin_len = sizeof(struct sockaddr_in), .sin_family = AF_INET, .sin_addr.s_addr = htonl(plen ? ~((1 << (32 - plen)) - 1) : 0), }; struct in_aliasreq req = { .ifra_addr = *addr, .ifra_mask = mask, .ifra_vhid = attrs->ifaf_vhid, }; if (dst != NULL) req.ifra_dstaddr = *dst; return (in_control(NULL, SIOCAIFADDR, &req, ifp, curthread)); } static int handle_deladdr_inet(struct nlmsghdr *hdr, struct nl_parsed_ifa *attrs, - struct ifnet *ifp, struct nlpcb *nlp, struct nl_pstate *npt) + if_t ifp, struct nlpcb *nlp, struct nl_pstate *npt) { struct sockaddr_in *addr = (struct sockaddr_in *)attrs->ifa_local; if (addr == NULL) addr = (struct sockaddr_in *)attrs->ifa_address; if (addr == NULL) { nlmsg_report_err_msg(npt, "empty IFA_ADDRESS/IFA_LOCAL"); return (EINVAL); } struct in_aliasreq req = { .ifra_addr = *addr }; return (in_control(NULL, SIOCDIFADDR, &req, ifp, curthread)); } #endif #ifdef INET6 static int handle_newaddr_inet6(struct nlmsghdr *hdr, struct nl_parsed_ifa *attrs, - struct ifnet *ifp, struct nlpcb *nlp, struct nl_pstate *npt) + if_t ifp, struct nlpcb *nlp, struct nl_pstate *npt) { struct sockaddr_in6 *addr, *dst; if (attrs->ifa_prefixlen > 128) { nlmsg_report_err_msg(npt, "invalid ifa_prefixlen"); return (EINVAL); } /* * In IPv6 implementation, adding non-P2P address to the P2P interface * is allowed. */ addr = (struct sockaddr_in6 *)(attrs->ifa_local); dst = (struct sockaddr_in6 *)(attrs->ifa_address); if (addr == NULL) { addr = dst; dst = NULL; } else if (dst != NULL) { if (IN6_ARE_ADDR_EQUAL(&addr->sin6_addr, &dst->sin6_addr)) { /* * Sometimes Netlink users fills in both attributes * with the same address. It still means "non-p2p". */ dst = NULL; } } if (addr == NULL) { nlmsg_report_err_msg(npt, "Empty IFA_LOCAL/IFA_ADDRESS"); return (EINVAL); } uint32_t flags = nl_flags_to_in6(attrs->ifa_flags) | attrs->ifaf_flags; uint32_t pltime = 0, vltime = 0; if (attrs->ifa_cacheinfo != 0) { pltime = attrs->ifa_cacheinfo->ifa_prefered; vltime = attrs->ifa_cacheinfo->ifa_valid; } struct sockaddr_in6 mask = { .sin6_len = sizeof(struct sockaddr_in6), .sin6_family = AF_INET6, }; ip6_writemask(&mask.sin6_addr, attrs->ifa_prefixlen); struct in6_aliasreq req = { .ifra_addr = *addr, .ifra_prefixmask = mask, .ifra_flags = flags, .ifra_lifetime = { .ia6t_vltime = vltime, .ia6t_pltime = pltime }, .ifra_vhid = attrs->ifaf_vhid, }; if (dst != NULL) req.ifra_dstaddr = *dst; return (in6_control(NULL, SIOCAIFADDR_IN6, &req, ifp, curthread)); } static int handle_deladdr_inet6(struct nlmsghdr *hdr, struct nl_parsed_ifa *attrs, - struct ifnet *ifp, struct nlpcb *nlp, struct nl_pstate *npt) + if_t ifp, struct nlpcb *nlp, struct nl_pstate *npt) { struct sockaddr_in6 *addr = (struct sockaddr_in6 *)attrs->ifa_local; if (addr == NULL) addr = (struct sockaddr_in6 *)(attrs->ifa_address); if (addr == NULL) { nlmsg_report_err_msg(npt, "Empty IFA_LOCAL/IFA_ADDRESS"); return (EINVAL); } struct in6_aliasreq req = { .ifra_addr = *addr }; return (in6_control(NULL, SIOCDIFADDR_IN6, &req, ifp, curthread)); } #endif static int rtnl_handle_addr(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_pstate *npt) { struct epoch_tracker et; int error; struct nl_parsed_ifa attrs = {}; error = nl_parse_nlmsg(hdr, &ifa_parser, npt, &attrs); if (error != 0) return (error); NET_EPOCH_ENTER(et); - struct ifnet *ifp = ifnet_byindex_ref(attrs.ifa_index); + if_t ifp = ifnet_byindex_ref(attrs.ifa_index); NET_EPOCH_EXIT(et); if (ifp == NULL) { nlmsg_report_err_msg(npt, "Unable to find interface with index %u", attrs.ifa_index); return (ENOENT); } int if_flags = if_getflags(ifp); #if defined(INET) || defined(INET6) bool new = hdr->nlmsg_type == NL_RTM_NEWADDR; #endif /* * TODO: Properly handle NLM_F_CREATE / NLM_F_EXCL. * The current ioctl-based KPI always does an implicit create-or-replace. * It is not possible to specify fine-grained options. */ switch (attrs.ifa_family) { #ifdef INET case AF_INET: if (new) error = handle_newaddr_inet(hdr, &attrs, ifp, nlp, npt); else error = handle_deladdr_inet(hdr, &attrs, ifp, nlp, npt); break; #endif #ifdef INET6 case AF_INET6: if (new) error = handle_newaddr_inet6(hdr, &attrs, ifp, nlp, npt); else error = handle_deladdr_inet6(hdr, &attrs, ifp, nlp, npt); break; #endif default: error = EAFNOSUPPORT; } if (error == 0 && !(if_flags & IFF_UP) && (if_getflags(ifp) & IFF_UP)) if_up(ifp); if_rele(ifp); return (error); } static void rtnl_handle_ifaddr(void *arg __unused, struct ifaddr *ifa, int cmd) { struct nlmsghdr hdr = {}; struct nl_writer nw = {}; uint32_t group = 0; switch (ifa->ifa_addr->sa_family) { #ifdef INET case AF_INET: group = RTNLGRP_IPV4_IFADDR; break; #endif #ifdef INET6 case AF_INET6: group = RTNLGRP_IPV6_IFADDR; break; #endif default: NL_LOG(LOG_DEBUG2, "ifa notification for unknown AF: %d", ifa->ifa_addr->sa_family); return; } if (!nl_has_listeners(NETLINK_ROUTE, group)) return; if (!nlmsg_get_group_writer(&nw, NLMSG_LARGE, NETLINK_ROUTE, group)) { NL_LOG(LOG_DEBUG, "error allocating group writer"); return; } hdr.nlmsg_type = (cmd == RTM_DELETE) ? NL_RTM_DELADDR : NL_RTM_NEWADDR; dump_iface_addr(&nw, ifa->ifa_ifp, ifa, &hdr); nlmsg_flush(&nw); } static void -rtnl_handle_ifevent(struct ifnet *ifp, int nlmsg_type, int if_flags_mask) +rtnl_handle_ifevent(if_t ifp, int nlmsg_type, int if_flags_mask) { struct nlmsghdr hdr = { .nlmsg_type = nlmsg_type }; struct nl_writer nw = {}; if (!nl_has_listeners(NETLINK_ROUTE, RTNLGRP_LINK)) return; if (!nlmsg_get_group_writer(&nw, NLMSG_LARGE, NETLINK_ROUTE, RTNLGRP_LINK)) { NL_LOG(LOG_DEBUG, "error allocating mbuf"); return; } dump_iface(&nw, ifp, &hdr, if_flags_mask); nlmsg_flush(&nw); } static void -rtnl_handle_ifattach(void *arg, struct ifnet *ifp) +rtnl_handle_ifattach(void *arg, if_t ifp) { NL_LOG(LOG_DEBUG2, "ifnet %s", if_name(ifp)); rtnl_handle_ifevent(ifp, NL_RTM_NEWLINK, 0); } static void -rtnl_handle_ifdetach(void *arg, struct ifnet *ifp) +rtnl_handle_ifdetach(void *arg, if_t ifp) { NL_LOG(LOG_DEBUG2, "ifnet %s", if_name(ifp)); rtnl_handle_ifevent(ifp, NL_RTM_DELLINK, 0); } static void -rtnl_handle_iflink(void *arg, struct ifnet *ifp) +rtnl_handle_iflink(void *arg, if_t ifp) { NL_LOG(LOG_DEBUG2, "ifnet %s", if_name(ifp)); rtnl_handle_ifevent(ifp, NL_RTM_NEWLINK, 0); } void -rtnl_handle_ifnet_event(struct ifnet *ifp, int if_flags_mask) +rtnl_handle_ifnet_event(if_t ifp, int if_flags_mask) { NL_LOG(LOG_DEBUG2, "ifnet %s", if_name(ifp)); rtnl_handle_ifevent(ifp, NL_RTM_NEWLINK, if_flags_mask); } static const struct rtnl_cmd_handler cmd_handlers[] = { { .cmd = NL_RTM_GETLINK, .name = "RTM_GETLINK", .cb = &rtnl_handle_getlink, .flags = RTNL_F_NOEPOCH | RTNL_F_ALLOW_NONVNET_JAIL, }, { .cmd = NL_RTM_DELLINK, .name = "RTM_DELLINK", .cb = &rtnl_handle_dellink, .priv = PRIV_NET_IFDESTROY, .flags = RTNL_F_NOEPOCH, }, { .cmd = NL_RTM_NEWLINK, .name = "RTM_NEWLINK", .cb = &rtnl_handle_newlink, .priv = PRIV_NET_IFCREATE, .flags = RTNL_F_NOEPOCH, }, { .cmd = NL_RTM_GETADDR, .name = "RTM_GETADDR", .cb = &rtnl_handle_getaddr, .flags = RTNL_F_ALLOW_NONVNET_JAIL, }, { .cmd = NL_RTM_NEWADDR, .name = "RTM_NEWADDR", .cb = &rtnl_handle_addr, .priv = PRIV_NET_ADDIFADDR, .flags = RTNL_F_NOEPOCH, }, { .cmd = NL_RTM_DELADDR, .name = "RTM_DELADDR", .cb = &rtnl_handle_addr, .priv = PRIV_NET_DELIFADDR, .flags = RTNL_F_NOEPOCH, }, }; static const struct nlhdr_parser *all_parsers[] = { &ifmsg_parser, &ifa_parser, &ifa_fbsd_parser, }; void rtnl_iface_add_cloner(struct nl_cloner *cloner) { sx_xlock(&rtnl_cloner_lock); SLIST_INSERT_HEAD(&nl_cloners, cloner, next); sx_xunlock(&rtnl_cloner_lock); } void rtnl_iface_del_cloner(struct nl_cloner *cloner) { sx_xlock(&rtnl_cloner_lock); SLIST_REMOVE(&nl_cloners, cloner, nl_cloner, next); sx_xunlock(&rtnl_cloner_lock); } void rtnl_ifaces_init(void) { ifattach_event = EVENTHANDLER_REGISTER( ifnet_arrival_event, rtnl_handle_ifattach, NULL, EVENTHANDLER_PRI_ANY); ifdetach_event = EVENTHANDLER_REGISTER( ifnet_departure_event, rtnl_handle_ifdetach, NULL, EVENTHANDLER_PRI_ANY); ifaddr_event = EVENTHANDLER_REGISTER( rt_addrmsg, rtnl_handle_ifaddr, NULL, EVENTHANDLER_PRI_ANY); iflink_event = EVENTHANDLER_REGISTER( ifnet_link_event, rtnl_handle_iflink, NULL, EVENTHANDLER_PRI_ANY); NL_VERIFY_PARSERS(all_parsers); rtnl_register_messages(cmd_handlers, NL_ARRAY_LEN(cmd_handlers)); } void rtnl_ifaces_destroy(void) { EVENTHANDLER_DEREGISTER(ifnet_arrival_event, ifattach_event); EVENTHANDLER_DEREGISTER(ifnet_departure_event, ifdetach_event); EVENTHANDLER_DEREGISTER(rt_addrmsg, ifaddr_event); EVENTHANDLER_DEREGISTER(ifnet_link_event, iflink_event); } diff --git a/sys/netlink/route/iface_drivers.c b/sys/netlink/route/iface_drivers.c index 1e839695d960..a0f9ef925a04 100644 --- a/sys/netlink/route/iface_drivers.c +++ b/sys/netlink/route/iface_drivers.c @@ -1,147 +1,147 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2022 Alexander V. Chernikov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include "opt_netlink.h" #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* scope deembedding */ #define DEBUG_MOD_NAME nl_iface_drivers #define DEBUG_MAX_LEVEL LOG_DEBUG3 #include _DECLARE_DEBUG(LOG_INFO); /* * Generic modification interface handler. * Responsible for changing network stack interface attributes * such as state, mtu or description. */ int _nl_modify_ifp_generic(struct ifnet *ifp, struct nl_parsed_link *lattrs, const struct nlattr_bmask *bm, struct nl_pstate *npt) { int error; if (lattrs->ifla_ifalias != NULL) { if (nlp_has_priv(npt->nlp, PRIV_NET_SETIFDESCR)) { int len = strlen(lattrs->ifla_ifalias) + 1; char *buf = if_allocdescr(len, M_WAITOK); memcpy(buf, lattrs->ifla_ifalias, len); if_setdescr(ifp, buf); - getmicrotime(&ifp->if_lastchange); + if_setlastchange(ifp); } else { nlmsg_report_err_msg(npt, "Not enough privileges to set descr"); return (EPERM); } } if ((lattrs->ifi_change & IFF_UP) && (lattrs->ifi_flags & IFF_UP) == 0) { /* Request to down the interface */ if_down(ifp); } if (lattrs->ifla_mtu > 0) { if (nlp_has_priv(npt->nlp, PRIV_NET_SETIFMTU)) { struct ifreq ifr = { .ifr_mtu = lattrs->ifla_mtu }; error = ifhwioctl(SIOCSIFMTU, ifp, (char *)&ifr, curthread); } else { nlmsg_report_err_msg(npt, "Not enough privileges to set mtu"); return (EPERM); } } if (lattrs->ifi_change & IFF_PROMISC) { error = ifpromisc(ifp, lattrs->ifi_flags & IFF_PROMISC); if (error != 0) { nlmsg_report_err_msg(npt, "unable to set promisc"); return (error); } } return (0); } /* * Saves the resulting ifindex and ifname to report them * to userland along with the operation result. * NLA format: * NLMSGERR_ATTR_COOKIE(nested) * IFLA_NEW_IFINDEX(u32) * IFLA_IFNAME(string) */ void _nl_store_ifp_cookie(struct nl_pstate *npt, struct ifnet *ifp) { int ifname_len = strlen(if_name(ifp)); - uint32_t ifindex = (uint32_t)ifp->if_index; + uint32_t ifindex = (uint32_t)if_getindex(ifp); int nla_len = sizeof(struct nlattr) * 3 + sizeof(ifindex) + NL_ITEM_ALIGN(ifname_len + 1); struct nlattr *nla_cookie = npt_alloc(npt, nla_len); /* Nested TLV */ nla_cookie->nla_len = nla_len; nla_cookie->nla_type = NLMSGERR_ATTR_COOKIE; struct nlattr *nla = nla_cookie + 1; nla->nla_len = sizeof(struct nlattr) + sizeof(ifindex); nla->nla_type = IFLA_NEW_IFINDEX; memcpy(NLA_DATA(nla), &ifindex, sizeof(ifindex)); nla = NLA_NEXT(nla); nla->nla_len = sizeof(struct nlattr) + ifname_len + 1; nla->nla_type = IFLA_IFNAME; strlcpy(NLA_DATA(nla), if_name(ifp), ifname_len + 1); nlmsg_report_cookie(npt, nla_cookie); } diff --git a/sys/netlink/route/neigh.c b/sys/netlink/route/neigh.c index 74a162bb9464..55749fcbe761 100644 --- a/sys/netlink/route/neigh.c +++ b/sys/netlink/route/neigh.c @@ -1,600 +1,605 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2022 Alexander V. Chernikov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include "opt_netlink.h" #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include +#include +#include #include #include #include #include #include #include /* nd6.h requires this */ #include /* nd6 state machine */ #include /* scope deembedding */ #define DEBUG_MOD_NAME nl_neigh #define DEBUG_MAX_LEVEL LOG_DEBUG3 #include _DECLARE_DEBUG(LOG_INFO); static int lle_families[] = { AF_INET, AF_INET6 }; static eventhandler_tag lle_event_p; struct netlink_walkargs { struct nl_writer *nw; struct nlmsghdr hdr; struct nlpcb *so; - struct ifnet *ifp; + if_t ifp; int family; int error; int count; int dumped; }; static int lle_state_to_nl_state(int family, struct llentry *lle) { int state = lle->ln_state; switch (family) { case AF_INET: if (lle->la_flags & (LLE_STATIC | LLE_IFADDR)) state = 1; switch (state) { case 0: /* ARP_LLINFO_INCOMPLETE */ return (NUD_INCOMPLETE); case 1: /* ARP_LLINFO_REACHABLE */ return (NUD_REACHABLE); case 2: /* ARP_LLINFO_VERIFY */ return (NUD_PROBE); } break; case AF_INET6: switch (state) { case ND6_LLINFO_INCOMPLETE: return (NUD_INCOMPLETE); case ND6_LLINFO_REACHABLE: return (NUD_REACHABLE); case ND6_LLINFO_STALE: return (NUD_STALE); case ND6_LLINFO_DELAY: return (NUD_DELAY); case ND6_LLINFO_PROBE: return (NUD_PROBE); } break; } return (NUD_NONE); } static uint32_t lle_flags_to_nl_flags(const struct llentry *lle) { uint32_t nl_flags = 0; if (lle->la_flags & LLE_IFADDR) nl_flags |= NTF_SELF; if (lle->la_flags & LLE_PUB) nl_flags |= NTF_PROXY; if (lle->la_flags & LLE_STATIC) nl_flags |= NTF_STICKY; if (lle->ln_router != 0) nl_flags |= NTF_ROUTER; return (nl_flags); } static uint32_t get_lle_next_ts(const struct llentry *lle) { if (lle->la_expire == 0) return (0); return (lle->la_expire + lle->lle_remtime / hz + time_second - time_uptime); } static int dump_lle_locked(struct llentry *lle, void *arg) { struct netlink_walkargs *wa = (struct netlink_walkargs *)arg; struct nlmsghdr *hdr = &wa->hdr; struct nl_writer *nw = wa->nw; struct ndmsg *ndm; #if defined(INET) || defined(INET6) union { struct in_addr in; struct in6_addr in6; } addr; #endif IF_DEBUG_LEVEL(LOG_DEBUG2) { char llebuf[NHOP_PRINT_BUFSIZE]; llentry_print_buf_lltable(lle, llebuf, sizeof(llebuf)); NL_LOG(LOG_DEBUG2, "dumping %s", llebuf); } if (!nlmsg_reply(nw, hdr, sizeof(struct ndmsg))) goto enomem; ndm = nlmsg_reserve_object(nw, struct ndmsg); ndm->ndm_family = wa->family; - ndm->ndm_ifindex = wa->ifp->if_index; + ndm->ndm_ifindex = if_getindex(wa->ifp); ndm->ndm_state = lle_state_to_nl_state(wa->family, lle); ndm->ndm_flags = lle_flags_to_nl_flags(lle); switch (wa->family) { #ifdef INET case AF_INET: addr.in = lle->r_l3addr.addr4; nlattr_add(nw, NDA_DST, 4, &addr); break; #endif #ifdef INET6 case AF_INET6: addr.in6 = lle->r_l3addr.addr6; in6_clearscope(&addr.in6); nlattr_add(nw, NDA_DST, 16, &addr); break; #endif } if (lle->r_flags & RLLE_VALID) { /* Has L2 */ - int addrlen = wa->ifp->if_addrlen; + int addrlen = if_getaddrlen(wa->ifp); nlattr_add(nw, NDA_LLADDR, addrlen, lle->ll_addr); } nlattr_add_u32(nw, NDA_PROBES, lle->la_asked); struct nda_cacheinfo *cache; cache = nlmsg_reserve_attr(nw, NDA_CACHEINFO, struct nda_cacheinfo); if (cache == NULL) goto enomem; /* TODO: provide confirmed/updated */ cache->ndm_refcnt = lle->lle_refcnt; int off = nlattr_add_nested(nw, NDA_FREEBSD); if (off != 0) { nlattr_add_u32(nw, NDAF_NEXT_STATE_TS, get_lle_next_ts(lle)); nlattr_set_len(nw, off); } if (nlmsg_end(nw)) return (0); enomem: NL_LOG(LOG_DEBUG, "unable to dump lle state (ENOMEM)"); nlmsg_abort(nw); return (ENOMEM); } static int dump_lle(struct lltable *llt, struct llentry *lle, void *arg) { int error; LLE_RLOCK(lle); error = dump_lle_locked(lle, arg); LLE_RUNLOCK(lle); return (error); } static bool dump_llt(struct lltable *llt, struct netlink_walkargs *wa) { lltable_foreach_lle(llt, dump_lle, wa); return (true); } static int -dump_llts_iface(struct netlink_walkargs *wa, struct ifnet *ifp, int family) +dump_llts_iface(struct netlink_walkargs *wa, if_t ifp, int family) { int error = 0; wa->ifp = ifp; for (int i = 0; i < sizeof(lle_families) / sizeof(int); i++) { int fam = lle_families[i]; struct lltable *llt = lltable_get(ifp, fam); if (llt != NULL && (family == 0 || family == fam)) { wa->count++; wa->family = fam; if (!dump_llt(llt, wa)) { error = ENOMEM; break; } wa->dumped++; } } return (error); } static int -dump_llts(struct netlink_walkargs *wa, struct ifnet *ifp, int family) +dump_llts(struct netlink_walkargs *wa, if_t ifp, int family) { - NL_LOG(LOG_DEBUG, "Start dump ifp=%s family=%d", ifp ? if_name(ifp) : "NULL", family); + NL_LOG(LOG_DEBUG2, "Start dump ifp=%s family=%d", ifp ? if_name(ifp) : "NULL", family); wa->hdr.nlmsg_flags |= NLM_F_MULTI; if (ifp != NULL) { dump_llts_iface(wa, ifp, family); } else { - CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { + struct if_iter it; + + for (ifp = if_iter_start(&it); ifp != NULL; ifp = if_iter_next(&it)) { dump_llts_iface(wa, ifp, family); } + if_iter_finish(&it); } - NL_LOG(LOG_DEBUG, "End dump, iterated %d dumped %d", wa->count, wa->dumped); + NL_LOG(LOG_DEBUG2, "End dump, iterated %d dumped %d", wa->count, wa->dumped); if (!nlmsg_end_dump(wa->nw, wa->error, &wa->hdr)) { NL_LOG(LOG_DEBUG, "Unable to add new message"); return (ENOMEM); } return (0); } static int -get_lle(struct netlink_walkargs *wa, struct ifnet *ifp, int family, struct sockaddr *dst) +get_lle(struct netlink_walkargs *wa, if_t ifp, int family, struct sockaddr *dst) { struct lltable *llt = lltable_get(ifp, family); if (llt == NULL) return (ESRCH); struct llentry *lle = lla_lookup(llt, LLE_UNLOCKED, dst); if (lle == NULL) return (ESRCH); wa->ifp = ifp; wa->family = family; return (dump_lle(llt, lle, wa)); } static void -set_scope6(struct sockaddr *sa, struct ifnet *ifp) +set_scope6(struct sockaddr *sa, if_t ifp) { #ifdef INET6 if (sa != NULL && sa->sa_family == AF_INET6 && ifp != NULL) { struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *)sa; if (IN6_IS_ADDR_LINKLOCAL(&sa6->sin6_addr)) in6_set_unicast_scopeid(&sa6->sin6_addr, if_getindex(ifp)); } #endif } struct nl_parsed_neigh { struct sockaddr *nda_dst; struct ifnet *nda_ifp; struct nlattr *nda_lladdr; uint32_t ndaf_next_ts; uint32_t ndm_flags; uint16_t ndm_state; uint8_t ndm_family; }; #define _IN(_field) offsetof(struct ndmsg, _field) #define _OUT(_field) offsetof(struct nl_parsed_neigh, _field) static const struct nlattr_parser nla_p_neigh_fbsd[] = { { .type = NDAF_NEXT_STATE_TS, .off = _OUT(ndaf_next_ts), .cb = nlattr_get_uint32 }, }; NL_DECLARE_ATTR_PARSER(neigh_fbsd_parser, nla_p_neigh_fbsd); static const struct nlfield_parser nlf_p_neigh[] = { { .off_in = _IN(ndm_family), .off_out = _OUT(ndm_family), .cb = nlf_get_u8 }, { .off_in = _IN(ndm_flags), .off_out = _OUT(ndm_flags), .cb = nlf_get_u8_u32 }, { .off_in = _IN(ndm_state), .off_out = _OUT(ndm_state), .cb = nlf_get_u16 }, { .off_in = _IN(ndm_ifindex), .off_out = _OUT(nda_ifp), .cb = nlf_get_ifpz }, }; static const struct nlattr_parser nla_p_neigh[] = { { .type = NDA_DST, .off = _OUT(nda_dst), .cb = nlattr_get_ip }, { .type = NDA_LLADDR, .off = _OUT(nda_lladdr), .cb = nlattr_get_nla }, { .type = NDA_IFINDEX, .off = _OUT(nda_ifp), .cb = nlattr_get_ifp }, { .type = NDA_FLAGS_EXT, .off = _OUT(ndm_flags), .cb = nlattr_get_uint32 }, { .type = NDA_FREEBSD, .arg = &neigh_fbsd_parser, .cb = nlattr_get_nested }, }; #undef _IN #undef _OUT static bool post_p_neigh(void *_attrs, struct nl_pstate *npt __unused) { struct nl_parsed_neigh *attrs = (struct nl_parsed_neigh *)_attrs; set_scope6(attrs->nda_dst, attrs->nda_ifp); return (true); } NL_DECLARE_PARSER_EXT(ndmsg_parser, struct ndmsg, NULL, nlf_p_neigh, nla_p_neigh, post_p_neigh); /* * type=RTM_NEWNEIGH, flags=NLM_F_REQUEST|NLM_F_ACK|NLM_F_EXCL|NLM_F_CREATE, seq=1661941473, pid=0}, * {ndm_family=AF_INET6, ndm_ifindex=if_nametoindex("enp0s31f6"), ndm_state=NUD_PERMANENT, ndm_flags=0, ndm_type=RTN_UNSPEC}, * [ * {{nla_len=20, nla_type=NDA_DST}, inet_pton(AF_INET6, "2a01:4f8:13a:70c::3")}, * {{nla_len=10, nla_type=NDA_LLADDR}, 20:4e:71:62:ae:f2}]}, iov_len=60} */ static int rtnl_handle_newneigh(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_pstate *npt) { int error; struct nl_parsed_neigh attrs = {}; error = nl_parse_nlmsg(hdr, &ndmsg_parser, npt, &attrs); if (error != 0) return (error); if (attrs.nda_ifp == NULL || attrs.nda_dst == NULL || attrs.nda_lladdr == NULL) { if (attrs.nda_ifp == NULL) NLMSG_REPORT_ERR_MSG(npt, "NDA_IFINDEX / ndm_ifindex not set"); if (attrs.nda_dst == NULL) NLMSG_REPORT_ERR_MSG(npt, "NDA_DST not set"); if (attrs.nda_lladdr == NULL) NLMSG_REPORT_ERR_MSG(npt, "NDA_LLADDR not set"); return (EINVAL); } if (attrs.nda_dst->sa_family != attrs.ndm_family) { NLMSG_REPORT_ERR_MSG(npt, "NDA_DST family (%d) is different from ndm_family (%d)", attrs.nda_dst->sa_family, attrs.ndm_family); return (EINVAL); } - int addrlen = attrs.nda_ifp->if_addrlen; + int addrlen = if_getaddrlen(attrs.nda_ifp); if (attrs.nda_lladdr->nla_len != sizeof(struct nlattr) + addrlen) { NLMSG_REPORT_ERR_MSG(npt, "NDA_LLADDR address length (%d) is different from expected (%d)", (int)attrs.nda_lladdr->nla_len - (int)sizeof(struct nlattr), addrlen); return (EINVAL); } const uint16_t supported_flags = NTF_PROXY | NTF_STICKY; if ((attrs.ndm_flags & supported_flags) != attrs.ndm_flags) { NLMSG_REPORT_ERR_MSG(npt, "ndm_flags %X not supported", attrs.ndm_flags &~ supported_flags); return (ENOTSUP); } /* Replacement requires new entry creation anyway */ if ((hdr->nlmsg_flags & (NLM_F_CREATE | NLM_F_REPLACE)) == 0) return (ENOTSUP); struct lltable *llt = lltable_get(attrs.nda_ifp, attrs.ndm_family); if (llt == NULL) return (EAFNOSUPPORT); uint8_t linkhdr[LLE_MAX_LINKHDR]; size_t linkhdrsize = sizeof(linkhdr); int lladdr_off = 0; if (lltable_calc_llheader(attrs.nda_ifp, attrs.ndm_family, (char *)(attrs.nda_lladdr + 1), linkhdr, &linkhdrsize, &lladdr_off) != 0) { NLMSG_REPORT_ERR_MSG(npt, "unable to calculate lle prepend data"); return (EINVAL); } int lle_flags = (attrs.ndm_flags & NTF_PROXY) ? LLE_PUB : 0; if (attrs.ndm_flags & NTF_STICKY) lle_flags |= LLE_STATIC; struct llentry *lle = lltable_alloc_entry(llt, lle_flags, attrs.nda_dst); if (lle == NULL) return (ENOMEM); lltable_set_entry_addr(attrs.nda_ifp, lle, linkhdr, linkhdrsize, lladdr_off); if (attrs.ndm_flags & NTF_STICKY) lle->la_expire = 0; else lle->la_expire = attrs.ndaf_next_ts - time_second + time_uptime; /* llentry created, try to insert or update */ IF_AFDATA_WLOCK(attrs.nda_ifp); LLE_WLOCK(lle); struct llentry *lle_tmp = lla_lookup(llt, LLE_EXCLUSIVE, attrs.nda_dst); if (lle_tmp != NULL) { error = EEXIST; if (hdr->nlmsg_flags & NLM_F_EXCL) { LLE_WUNLOCK(lle_tmp); lle_tmp = NULL; } else if (hdr->nlmsg_flags & NLM_F_REPLACE) { if ((lle_tmp->la_flags & LLE_IFADDR) == 0) { lltable_unlink_entry(llt, lle_tmp); lltable_link_entry(llt, lle); error = 0; } else error = EPERM; } } else { if (hdr->nlmsg_flags & NLM_F_CREATE) lltable_link_entry(llt, lle); else error = ENOENT; } IF_AFDATA_WUNLOCK(attrs.nda_ifp); if (error != 0) { if (lle != NULL) llentry_free(lle); return (error); } if (lle_tmp != NULL) llentry_free(lle_tmp); /* XXX: We're inside epoch */ EVENTHANDLER_INVOKE(lle_event, lle, LLENTRY_RESOLVED); LLE_WUNLOCK(lle); llt->llt_post_resolved(llt, lle); return (0); } static int rtnl_handle_delneigh(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_pstate *npt) { int error; struct nl_parsed_neigh attrs = {}; error = nl_parse_nlmsg(hdr, &ndmsg_parser, npt, &attrs); if (error != 0) return (error); if (attrs.nda_dst == NULL) { NLMSG_REPORT_ERR_MSG(npt, "NDA_DST not set"); return (EINVAL); } if (attrs.nda_ifp == NULL) { NLMSG_REPORT_ERR_MSG(npt, "no ifindex provided"); return (EINVAL); } struct lltable *llt = lltable_get(attrs.nda_ifp, attrs.ndm_family); if (llt == NULL) return (EAFNOSUPPORT); return (lltable_delete_addr(llt, 0, attrs.nda_dst)); } static int rtnl_handle_getneigh(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_pstate *npt) { int error; struct nl_parsed_neigh attrs = {}; error = nl_parse_nlmsg(hdr, &ndmsg_parser, npt, &attrs); if (error != 0) return (error); if (attrs.nda_dst != NULL && attrs.nda_ifp == NULL) { NLMSG_REPORT_ERR_MSG(npt, "has NDA_DST but no ifindex provided"); return (EINVAL); } struct netlink_walkargs wa = { .so = nlp, .nw = npt->nw, .hdr.nlmsg_pid = hdr->nlmsg_pid, .hdr.nlmsg_seq = hdr->nlmsg_seq, .hdr.nlmsg_flags = hdr->nlmsg_flags, .hdr.nlmsg_type = NL_RTM_NEWNEIGH, }; if (attrs.nda_dst == NULL) error = dump_llts(&wa, attrs.nda_ifp, attrs.ndm_family); else error = get_lle(&wa, attrs.nda_ifp, attrs.ndm_family, attrs.nda_dst); return (error); } static const struct rtnl_cmd_handler cmd_handlers[] = { { .cmd = NL_RTM_NEWNEIGH, .name = "RTM_NEWNEIGH", .cb = &rtnl_handle_newneigh, .priv = PRIV_NET_ROUTE, }, { .cmd = NL_RTM_DELNEIGH, .name = "RTM_DELNEIGH", .cb = &rtnl_handle_delneigh, .priv = PRIV_NET_ROUTE, }, { .cmd = NL_RTM_GETNEIGH, .name = "RTM_GETNEIGH", .cb = &rtnl_handle_getneigh, } }; static void rtnl_lle_event(void *arg __unused, struct llentry *lle, int evt) { - struct ifnet *ifp; + if_t ifp; int family; LLE_WLOCK_ASSERT(lle); ifp = lltable_get_ifp(lle->lle_tbl); family = lltable_get_af(lle->lle_tbl); if (family != AF_INET && family != AF_INET6) return; int nlmsgs_type = evt == LLENTRY_RESOLVED ? NL_RTM_NEWNEIGH : NL_RTM_DELNEIGH; struct nl_writer nw = {}; if (!nlmsg_get_group_writer(&nw, NLMSG_SMALL, NETLINK_ROUTE, RTNLGRP_NEIGH)) { NL_LOG(LOG_DEBUG, "error allocating group writer"); return; } struct netlink_walkargs wa = { .hdr.nlmsg_type = nlmsgs_type, .nw = &nw, .ifp = ifp, .family = family, }; dump_lle_locked(lle, &wa); nlmsg_flush(&nw); } static const struct nlhdr_parser *all_parsers[] = { &ndmsg_parser, &neigh_fbsd_parser }; void rtnl_neighs_init(void) { NL_VERIFY_PARSERS(all_parsers); rtnl_register_messages(cmd_handlers, NL_ARRAY_LEN(cmd_handlers)); lle_event_p = EVENTHANDLER_REGISTER(lle_event, rtnl_lle_event, NULL, EVENTHANDLER_PRI_ANY); } void rtnl_neighs_destroy(void) { EVENTHANDLER_DEREGISTER(lle_event, lle_event_p); } diff --git a/sys/netlink/route/nexthop.c b/sys/netlink/route/nexthop.c index d1652cfb1508..fd3ef021a042 100644 --- a/sys/netlink/route/nexthop.c +++ b/sys/netlink/route/nexthop.c @@ -1,1125 +1,1125 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2022 Alexander V. Chernikov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include "opt_netlink.h" #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_route.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define DEBUG_MOD_NAME nl_nhop #define DEBUG_MAX_LEVEL LOG_DEBUG3 #include _DECLARE_DEBUG(LOG_INFO); /* * This file contains the logic to maintain kernel nexthops and * nexhop groups based om the data provided by the user. * * Kernel stores (nearly) all of the routing data in the nexthops, * including the prefix-specific flags (NHF_HOST and NHF_DEFAULT). * * Netlink API provides higher-level abstraction for the user. Each * user-created nexthop may map to multiple kernel nexthops. * * The following variations require separate kernel nexthop to be * created: * * prefix flags (NHF_HOST, NHF_DEFAULT) * * using IPv6 gateway for IPv4 routes * * different fibnum * * These kernel nexthops have the lifetime bound to the lifetime of * the user_nhop object. They are not collected until user requests * to delete the created user_nhop. * */ struct user_nhop { uint32_t un_idx; /* Userland-provided index */ uint32_t un_fibfam; /* fibnum+af(as highest byte) */ uint8_t un_protocol; /* protocol that install the record */ struct nhop_object *un_nhop; /* "production" nexthop */ struct nhop_object *un_nhop_src; /* nexthop to copy from */ struct weightened_nhop *un_nhgrp_src; /* nexthops for nhg */ uint32_t un_nhgrp_count; /* number of nexthops */ struct user_nhop *un_next; /* next item in hash chain */ struct user_nhop *un_nextchild; /* master -> children */ struct epoch_context un_epoch_ctx; /* epoch ctl helper */ }; /* produce hash value for an object */ #define unhop_hash_obj(_obj) (hash_unhop(_obj)) /* compare two objects */ #define unhop_cmp(_one, _two) (cmp_unhop(_one, _two)) /* next object accessor */ #define unhop_next(_obj) (_obj)->un_next CHT_SLIST_DEFINE(unhop, struct user_nhop); struct unhop_ctl { struct unhop_head un_head; struct rmlock un_lock; }; #define UN_LOCK_INIT(_ctl) rm_init(&(_ctl)->un_lock, "unhop_ctl") #define UN_TRACKER struct rm_priotracker un_tracker #define UN_RLOCK(_ctl) rm_rlock(&((_ctl)->un_lock), &un_tracker) #define UN_RUNLOCK(_ctl) rm_runlock(&((_ctl)->un_lock), &un_tracker) #define UN_WLOCK(_ctl) rm_wlock(&(_ctl)->un_lock); #define UN_WUNLOCK(_ctl) rm_wunlock(&(_ctl)->un_lock); VNET_DEFINE_STATIC(struct unhop_ctl *, un_ctl) = NULL; #define V_un_ctl VNET(un_ctl) static void consider_resize(struct unhop_ctl *ctl, uint32_t new_size); static int cmp_unhop(const struct user_nhop *a, const struct user_nhop *b); static unsigned int hash_unhop(const struct user_nhop *obj); static void destroy_unhop(struct user_nhop *unhop); static struct nhop_object *clone_unhop(const struct user_nhop *unhop, uint32_t fibnum, int family, int nh_flags); static int cmp_unhop(const struct user_nhop *a, const struct user_nhop *b) { return (a->un_idx == b->un_idx && a->un_fibfam == b->un_fibfam); } /* * Hash callback: calculate hash of an object */ static unsigned int hash_unhop(const struct user_nhop *obj) { return (obj->un_idx ^ obj->un_fibfam); } #define UNHOP_IS_MASTER(_unhop) ((_unhop)->un_fibfam == 0) /* * Factory interface for creating matching kernel nexthops/nexthop groups * * @uidx: userland nexhop index used to create the nexthop * @fibnum: fibnum nexthop will be used in * @family: upper family nexthop will be used in * @nh_flags: desired nexthop prefix flags * @perror: pointer to store error to * * Returns referenced nexthop linked to @fibnum/@family rib on success. */ struct nhop_object * nl_find_nhop(uint32_t fibnum, int family, uint32_t uidx, int nh_flags, int *perror) { struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl); UN_TRACKER; if (__predict_false(ctl == NULL)) return (NULL); struct user_nhop key= { .un_idx = uidx, .un_fibfam = fibnum | ((uint32_t)family) << 24, }; struct user_nhop *unhop; nh_flags = nh_flags & (NHF_HOST | NHF_DEFAULT); if (__predict_false(family == 0)) return (NULL); UN_RLOCK(ctl); CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop); if (unhop != NULL) { struct nhop_object *nh = unhop->un_nhop; UN_RLOCK(ctl); *perror = 0; nhop_ref_any(nh); return (nh); } /* * Exact nexthop not found. Search for template nexthop to clone from. */ key.un_fibfam = 0; CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop); if (unhop == NULL) { UN_RUNLOCK(ctl); *perror = ESRCH; return (NULL); } UN_RUNLOCK(ctl); /* Create entry to insert first */ struct user_nhop *un_new, *un_tmp; un_new = malloc(sizeof(struct user_nhop), M_NETLINK, M_NOWAIT | M_ZERO); if (un_new == NULL) { *perror = ENOMEM; return (NULL); } un_new->un_idx = uidx; un_new->un_fibfam = fibnum | ((uint32_t)family) << 24; /* Relying on epoch to protect unhop here */ un_new->un_nhop = clone_unhop(unhop, fibnum, family, nh_flags); if (un_new->un_nhop == NULL) { free(un_new, M_NETLINK); *perror = ENOMEM; return (NULL); } /* Insert back and report */ UN_WLOCK(ctl); /* First, find template record once again */ CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop); if (unhop == NULL) { /* Someone deleted the nexthop during the call */ UN_WUNLOCK(ctl); *perror = ESRCH; destroy_unhop(un_new); return (NULL); } /* Second, check the direct match */ CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, un_new, un_tmp); struct nhop_object *nh; if (un_tmp != NULL) { /* Another thread already created the desired nextop, use it */ nh = un_tmp->un_nhop; } else { /* Finally, insert the new nexthop and link it to the primary */ nh = un_new->un_nhop; CHT_SLIST_INSERT_HEAD(&ctl->un_head, unhop, un_new); un_new->un_nextchild = unhop->un_nextchild; unhop->un_nextchild = un_new; un_new = NULL; NL_LOG(LOG_DEBUG2, "linked cloned nexthop %p", nh); } UN_WUNLOCK(ctl); if (un_new != NULL) destroy_unhop(un_new); *perror = 0; nhop_ref_any(nh); return (nh); } static struct user_nhop * nl_find_base_unhop(struct unhop_ctl *ctl, uint32_t uidx) { struct user_nhop key= { .un_idx = uidx }; struct user_nhop *unhop = NULL; UN_TRACKER; UN_RLOCK(ctl); CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop); UN_RUNLOCK(ctl); return (unhop); } #define MAX_STACK_NHOPS 4 static struct nhop_object * clone_unhop(const struct user_nhop *unhop, uint32_t fibnum, int family, int nh_flags) { #ifdef ROUTE_MPATH const struct weightened_nhop *wn; struct weightened_nhop *wn_new, wn_base[MAX_STACK_NHOPS]; uint32_t num_nhops; #endif struct nhop_object *nh = NULL; int error; if (unhop->un_nhop_src != NULL) { IF_DEBUG_LEVEL(LOG_DEBUG2) { char nhbuf[NHOP_PRINT_BUFSIZE]; nhop_print_buf_any(unhop->un_nhop_src, nhbuf, sizeof(nhbuf)); FIB_NH_LOG(LOG_DEBUG2, unhop->un_nhop_src, "cloning nhop %s -> %u.%u flags 0x%X", nhbuf, fibnum, family, nh_flags); } struct nhop_object *nh; nh = nhop_alloc(fibnum, AF_UNSPEC); if (nh == NULL) return (NULL); nhop_copy(nh, unhop->un_nhop_src); /* Check that nexthop gateway is compatible with the new family */ if (!nhop_set_upper_family(nh, family)) { nhop_free(nh); return (NULL); } nhop_set_uidx(nh, unhop->un_idx); nhop_set_pxtype_flag(nh, nh_flags); return (nhop_get_nhop(nh, &error)); } #ifdef ROUTE_MPATH wn = unhop->un_nhgrp_src; num_nhops = unhop->un_nhgrp_count; if (num_nhops > MAX_STACK_NHOPS) { wn_new = malloc(num_nhops * sizeof(struct weightened_nhop), M_TEMP, M_NOWAIT); if (wn_new == NULL) return (NULL); } else wn_new = wn_base; for (int i = 0; i < num_nhops; i++) { uint32_t uidx = nhop_get_uidx(wn[i].nh); MPASS(uidx != 0); wn_new[i].nh = nl_find_nhop(fibnum, family, uidx, nh_flags, &error); if (error != 0) break; wn_new[i].weight = wn[i].weight; } if (error == 0) { struct rib_head *rh = nhop_get_rh(wn_new[0].nh); struct nhgrp_object *nhg; error = nhgrp_get_group(rh, wn_new, num_nhops, unhop->un_idx, &nhg); nh = (struct nhop_object *)nhg; } if (wn_new != wn_base) free(wn_new, M_TEMP); #endif return (nh); } static void destroy_unhop(struct user_nhop *unhop) { if (unhop->un_nhop != NULL) nhop_free_any(unhop->un_nhop); if (unhop->un_nhop_src != NULL) nhop_free_any(unhop->un_nhop_src); free(unhop, M_NETLINK); } static void destroy_unhop_epoch(epoch_context_t ctx) { struct user_nhop *unhop; unhop = __containerof(ctx, struct user_nhop, un_epoch_ctx); destroy_unhop(unhop); } static uint32_t find_spare_uidx(struct unhop_ctl *ctl) { struct user_nhop *unhop, key = {}; uint32_t uidx = 0; UN_TRACKER; UN_RLOCK(ctl); /* This should return spare uid with 75% of 65k used in ~99/100 cases */ for (int i = 0; i < 16; i++) { key.un_idx = (arc4random() % 65536) + 65536 * 4; CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop); if (unhop == NULL) { uidx = key.un_idx; break; } } UN_RUNLOCK(ctl); return (uidx); } /* * Actual netlink code */ struct netlink_walkargs { struct nl_writer *nw; struct nlmsghdr hdr; struct nlpcb *so; int family; int error; int count; int dumped; }; #define ENOMEM_IF_NULL(_v) if ((_v) == NULL) goto enomem static bool dump_nhgrp(const struct user_nhop *unhop, struct nlmsghdr *hdr, struct nl_writer *nw) { if (!nlmsg_reply(nw, hdr, sizeof(struct nhmsg))) goto enomem; struct nhmsg *nhm = nlmsg_reserve_object(nw, struct nhmsg); nhm->nh_family = AF_UNSPEC; nhm->nh_scope = 0; nhm->nh_protocol = unhop->un_protocol; nhm->nh_flags = 0; nlattr_add_u32(nw, NHA_ID, unhop->un_idx); nlattr_add_u16(nw, NHA_GROUP_TYPE, NEXTHOP_GRP_TYPE_MPATH); struct weightened_nhop *wn = unhop->un_nhgrp_src; uint32_t num_nhops = unhop->un_nhgrp_count; /* TODO: a better API? */ int nla_len = sizeof(struct nlattr); nla_len += NETLINK_ALIGN(num_nhops * sizeof(struct nexthop_grp)); struct nlattr *nla = nlmsg_reserve_data(nw, nla_len, struct nlattr); if (nla == NULL) goto enomem; nla->nla_type = NHA_GROUP; nla->nla_len = nla_len; for (int i = 0; i < num_nhops; i++) { struct nexthop_grp *grp = &((struct nexthop_grp *)(nla + 1))[i]; grp->id = nhop_get_uidx(wn[i].nh); grp->weight = wn[i].weight; grp->resvd1 = 0; grp->resvd2 = 0; } if (nlmsg_end(nw)) return (true); enomem: NL_LOG(LOG_DEBUG, "error: unable to allocate attribute memory"); nlmsg_abort(nw); return (false); } static bool dump_nhop(const struct nhop_object *nh, uint32_t uidx, struct nlmsghdr *hdr, struct nl_writer *nw) { if (!nlmsg_reply(nw, hdr, sizeof(struct nhmsg))) goto enomem; struct nhmsg *nhm = nlmsg_reserve_object(nw, struct nhmsg); ENOMEM_IF_NULL(nhm); nhm->nh_family = nhop_get_neigh_family(nh); nhm->nh_scope = 0; // XXX: what's that? nhm->nh_protocol = nhop_get_origin(nh); nhm->nh_flags = 0; if (uidx != 0) nlattr_add_u32(nw, NHA_ID, uidx); if (nh->nh_flags & NHF_BLACKHOLE) { nlattr_add_flag(nw, NHA_BLACKHOLE); goto done; } - nlattr_add_u32(nw, NHA_OIF, nh->nh_ifp->if_index); + nlattr_add_u32(nw, NHA_OIF, if_getindex(nh->nh_ifp)); switch (nh->gw_sa.sa_family) { #ifdef INET case AF_INET: nlattr_add(nw, NHA_GATEWAY, 4, &nh->gw4_sa.sin_addr); break; #endif #ifdef INET6 case AF_INET6: { struct in6_addr addr = nh->gw6_sa.sin6_addr; in6_clearscope(&addr); nlattr_add(nw, NHA_GATEWAY, 16, &addr); break; } #endif } int off = nlattr_add_nested(nw, NHA_FREEBSD); if (off != 0) { - nlattr_add_u32(nw, NHAF_AIF, nh->nh_aifp->if_index); + nlattr_add_u32(nw, NHAF_AIF, if_getindex(nh->nh_aifp)); if (uidx == 0) { nlattr_add_u32(nw, NHAF_KID, nhop_get_idx(nh)); nlattr_add_u32(nw, NHAF_FAMILY, nhop_get_upper_family(nh)); nlattr_add_u32(nw, NHAF_TABLE, nhop_get_fibnum(nh)); } nlattr_set_len(nw, off); } done: if (nlmsg_end(nw)) return (true); enomem: nlmsg_abort(nw); return (false); } static void dump_unhop(const struct user_nhop *unhop, struct nlmsghdr *hdr, struct nl_writer *nw) { if (unhop->un_nhop_src != NULL) dump_nhop(unhop->un_nhop_src, unhop->un_idx, hdr, nw); else dump_nhgrp(unhop, hdr, nw); } static int delete_unhop(struct unhop_ctl *ctl, struct nlmsghdr *hdr, uint32_t uidx) { struct user_nhop *unhop_ret, *unhop_base, *unhop_chain; struct user_nhop key = { .un_idx = uidx }; UN_WLOCK(ctl); CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop_base); if (unhop_base != NULL) { CHT_SLIST_REMOVE(&ctl->un_head, unhop, unhop_base, unhop_ret); IF_DEBUG_LEVEL(LOG_DEBUG2) { char nhbuf[NHOP_PRINT_BUFSIZE]; nhop_print_buf_any(unhop_base->un_nhop, nhbuf, sizeof(nhbuf)); FIB_NH_LOG(LOG_DEBUG3, unhop_base->un_nhop, "removed base nhop %u: %s", uidx, nhbuf); } /* Unlink all child nexhops as well, keeping the chain intact */ unhop_chain = unhop_base->un_nextchild; while (unhop_chain != NULL) { CHT_SLIST_REMOVE(&ctl->un_head, unhop, unhop_chain, unhop_ret); MPASS(unhop_chain == unhop_ret); IF_DEBUG_LEVEL(LOG_DEBUG3) { char nhbuf[NHOP_PRINT_BUFSIZE]; nhop_print_buf_any(unhop_chain->un_nhop, nhbuf, sizeof(nhbuf)); FIB_NH_LOG(LOG_DEBUG3, unhop_chain->un_nhop, "removed child nhop %u: %s", uidx, nhbuf); } unhop_chain = unhop_chain->un_nextchild; } } UN_WUNLOCK(ctl); if (unhop_base == NULL) { NL_LOG(LOG_DEBUG, "unable to find unhop %u", uidx); return (ENOENT); } /* Report nexthop deletion */ struct netlink_walkargs wa = { .hdr.nlmsg_pid = hdr->nlmsg_pid, .hdr.nlmsg_seq = hdr->nlmsg_seq, .hdr.nlmsg_flags = hdr->nlmsg_flags, .hdr.nlmsg_type = NL_RTM_DELNEXTHOP, }; struct nl_writer nw = {}; if (!nlmsg_get_group_writer(&nw, NLMSG_SMALL, NETLINK_ROUTE, RTNLGRP_NEXTHOP)) { NL_LOG(LOG_DEBUG, "error allocating message writer"); return (ENOMEM); } dump_unhop(unhop_base, &wa.hdr, &nw); nlmsg_flush(&nw); while (unhop_base != NULL) { unhop_chain = unhop_base->un_nextchild; NET_EPOCH_CALL(destroy_unhop_epoch, &unhop_base->un_epoch_ctx); unhop_base = unhop_chain; } return (0); } static void consider_resize(struct unhop_ctl *ctl, uint32_t new_size) { void *new_ptr = NULL; size_t alloc_size; if (new_size == 0) return; if (new_size != 0) { alloc_size = CHT_SLIST_GET_RESIZE_SIZE(new_size); new_ptr = malloc(alloc_size, M_NETLINK, M_NOWAIT | M_ZERO); if (new_ptr == NULL) return; } NL_LOG(LOG_DEBUG, "resizing hash: %u -> %u", ctl->un_head.hash_size, new_size); UN_WLOCK(ctl); if (new_ptr != NULL) { CHT_SLIST_RESIZE(&ctl->un_head, unhop, new_ptr, new_size); } UN_WUNLOCK(ctl); if (new_ptr != NULL) free(new_ptr, M_NETLINK); } static bool __noinline vnet_init_unhops(void) { uint32_t num_buckets = 16; size_t alloc_size = CHT_SLIST_GET_RESIZE_SIZE(num_buckets); struct unhop_ctl *ctl = malloc(sizeof(struct unhop_ctl), M_NETLINK, M_NOWAIT | M_ZERO); if (ctl == NULL) return (false); void *ptr = malloc(alloc_size, M_NETLINK, M_NOWAIT | M_ZERO); if (ptr == NULL) { free(ctl, M_NETLINK); return (false); } CHT_SLIST_INIT(&ctl->un_head, ptr, num_buckets); UN_LOCK_INIT(ctl); if (!atomic_cmpset_ptr((uintptr_t *)&V_un_ctl, (uintptr_t)NULL, (uintptr_t)ctl)) { free(ptr, M_NETLINK); free(ctl, M_NETLINK); } if (atomic_load_ptr(&V_un_ctl) == NULL) return (false); NL_LOG(LOG_NOTICE, "UNHOPS init done"); return (true); } static void vnet_destroy_unhops(const void *unused __unused) { struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl); struct user_nhop *unhop, *tmp; if (ctl == NULL) return; V_un_ctl = NULL; /* Wait till all unhop users finish their reads */ NET_EPOCH_WAIT(); UN_WLOCK(ctl); CHT_SLIST_FOREACH_SAFE(&ctl->un_head, unhop, unhop, tmp) { destroy_unhop(unhop); } CHT_SLIST_FOREACH_SAFE_END; UN_WUNLOCK(ctl); free(ctl->un_head.ptr, M_NETLINK); free(ctl, M_NETLINK); } VNET_SYSUNINIT(vnet_destroy_unhops, SI_SUB_PROTO_IF, SI_ORDER_ANY, vnet_destroy_unhops, NULL); static int nlattr_get_nhg(struct nlattr *nla, struct nl_pstate *npt, const void *arg, void *target) { int error = 0; /* Verify attribute correctness */ struct nexthop_grp *grp = NLA_DATA(nla); int data_len = NLA_DATA_LEN(nla); int count = data_len / sizeof(*grp); if (count == 0 || (count * sizeof(*grp) != data_len)) { NL_LOG(LOG_DEBUG, "Invalid length for RTA_GROUP: %d", data_len); return (EINVAL); } *((struct nlattr **)target) = nla; return (error); } static void -set_scope6(struct sockaddr *sa, struct ifnet *ifp) +set_scope6(struct sockaddr *sa, if_t ifp) { #ifdef INET6 if (sa != NULL && sa->sa_family == AF_INET6 && ifp != NULL) { struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *)sa; if (IN6_IS_ADDR_LINKLOCAL(&sa6->sin6_addr)) in6_set_unicast_scopeid(&sa6->sin6_addr, if_getindex(ifp)); } #endif } struct nl_parsed_nhop { uint32_t nha_id; uint8_t nha_blackhole; uint8_t nha_groups; uint8_t nhaf_knhops; uint8_t nhaf_family; struct ifnet *nha_oif; struct sockaddr *nha_gw; struct nlattr *nha_group; uint8_t nh_family; uint8_t nh_protocol; uint32_t nhaf_table; uint32_t nhaf_kid; uint32_t nhaf_aif; }; #define _IN(_field) offsetof(struct nhmsg, _field) #define _OUT(_field) offsetof(struct nl_parsed_nhop, _field) static struct nlattr_parser nla_p_nh_fbsd[] = { { .type = NHAF_KNHOPS, .off = _OUT(nhaf_knhops), .cb = nlattr_get_flag }, { .type = NHAF_TABLE, .off = _OUT(nhaf_table), .cb = nlattr_get_uint32 }, { .type = NHAF_FAMILY, .off = _OUT(nhaf_family), .cb = nlattr_get_uint8 }, { .type = NHAF_KID, .off = _OUT(nhaf_kid), .cb = nlattr_get_uint32 }, { .type = NHAF_AIF, .off = _OUT(nhaf_aif), .cb = nlattr_get_uint32 }, }; NL_DECLARE_ATTR_PARSER(nh_fbsd_parser, nla_p_nh_fbsd); static const struct nlfield_parser nlf_p_nh[] = { { .off_in = _IN(nh_family), .off_out = _OUT(nh_family), .cb = nlf_get_u8 }, { .off_in = _IN(nh_protocol), .off_out = _OUT(nh_protocol), .cb = nlf_get_u8 }, }; static const struct nlattr_parser nla_p_nh[] = { { .type = NHA_ID, .off = _OUT(nha_id), .cb = nlattr_get_uint32 }, { .type = NHA_GROUP, .off = _OUT(nha_group), .cb = nlattr_get_nhg }, { .type = NHA_BLACKHOLE, .off = _OUT(nha_blackhole), .cb = nlattr_get_flag }, { .type = NHA_OIF, .off = _OUT(nha_oif), .cb = nlattr_get_ifp }, { .type = NHA_GATEWAY, .off = _OUT(nha_gw), .cb = nlattr_get_ip }, { .type = NHA_GROUPS, .off = _OUT(nha_groups), .cb = nlattr_get_flag }, { .type = NHA_FREEBSD, .arg = &nh_fbsd_parser, .cb = nlattr_get_nested }, }; #undef _IN #undef _OUT static bool post_p_nh(void *_attrs, struct nl_pstate *npt) { struct nl_parsed_nhop *attrs = (struct nl_parsed_nhop *)_attrs; set_scope6(attrs->nha_gw, attrs->nha_oif); return (true); } NL_DECLARE_PARSER_EXT(nhmsg_parser, struct nhmsg, NULL, nlf_p_nh, nla_p_nh, post_p_nh); static bool eligible_nhg(const struct nhop_object *nh) { return (nh->nh_flags & NHF_GATEWAY); } static int newnhg(struct unhop_ctl *ctl, struct nl_parsed_nhop *attrs, struct user_nhop *unhop) { struct nexthop_grp *grp = NLA_DATA(attrs->nha_group); int count = NLA_DATA_LEN(attrs->nha_group) / sizeof(*grp); struct weightened_nhop *wn; wn = malloc(sizeof(*wn) * count, M_NETLINK, M_NOWAIT | M_ZERO); if (wn == NULL) return (ENOMEM); for (int i = 0; i < count; i++) { struct user_nhop *unhop; unhop = nl_find_base_unhop(ctl, grp[i].id); if (unhop == NULL) { NL_LOG(LOG_DEBUG, "unable to find uidx %u", grp[i].id); free(wn, M_NETLINK); return (ESRCH); } else if (unhop->un_nhop_src == NULL) { NL_LOG(LOG_DEBUG, "uidx %u is a group, nested group unsupported", grp[i].id); free(wn, M_NETLINK); return (ENOTSUP); } else if (!eligible_nhg(unhop->un_nhop_src)) { NL_LOG(LOG_DEBUG, "uidx %u nhop is not mpath-eligible", grp[i].id); free(wn, M_NETLINK); return (ENOTSUP); } /* * TODO: consider more rigid eligibility checks: * restrict nexthops with the same gateway */ wn[i].nh = unhop->un_nhop_src; wn[i].weight = grp[i].weight; } unhop->un_nhgrp_src = wn; unhop->un_nhgrp_count = count; return (0); } /* * Sets nexthop @nh gateway specified by @gw. * If gateway is IPv6 link-local, alters @gw to include scopeid equal to * @ifp ifindex. * Returns 0 on success or errno. */ int -nl_set_nexthop_gw(struct nhop_object *nh, struct sockaddr *gw, struct ifnet *ifp, +nl_set_nexthop_gw(struct nhop_object *nh, struct sockaddr *gw, if_t ifp, struct nl_pstate *npt) { #ifdef INET6 if (gw->sa_family == AF_INET6) { struct sockaddr_in6 *gw6 = (struct sockaddr_in6 *)gw; if (IN6_IS_ADDR_LINKLOCAL(&gw6->sin6_addr)) { if (ifp == NULL) { NLMSG_REPORT_ERR_MSG(npt, "interface not set"); return (EINVAL); } - in6_set_unicast_scopeid(&gw6->sin6_addr, ifp->if_index); + in6_set_unicast_scopeid(&gw6->sin6_addr, if_getindex(ifp)); } } #endif nhop_set_gw(nh, gw, true); return (0); } static int newnhop(struct nl_parsed_nhop *attrs, struct user_nhop *unhop, struct nl_pstate *npt) { struct ifaddr *ifa = NULL; struct nhop_object *nh; int error; if (!attrs->nha_blackhole) { if (attrs->nha_gw == NULL) { NLMSG_REPORT_ERR_MSG(npt, "missing NHA_GATEWAY"); return (EINVAL); } if (attrs->nha_oif == NULL) { NLMSG_REPORT_ERR_MSG(npt, "missing NHA_OIF"); return (EINVAL); } if (ifa == NULL) ifa = ifaof_ifpforaddr(attrs->nha_gw, attrs->nha_oif); if (ifa == NULL) { NLMSG_REPORT_ERR_MSG(npt, "Unable to determine default source IP"); return (EINVAL); } } int family = attrs->nha_gw != NULL ? attrs->nha_gw->sa_family : attrs->nh_family; nh = nhop_alloc(RT_DEFAULT_FIB, family); if (nh == NULL) { NL_LOG(LOG_DEBUG, "Unable to allocate nexthop"); return (ENOMEM); } nhop_set_uidx(nh, attrs->nha_id); nhop_set_origin(nh, attrs->nh_protocol); if (attrs->nha_blackhole) nhop_set_blackhole(nh, NHF_BLACKHOLE); else { error = nl_set_nexthop_gw(nh, attrs->nha_gw, attrs->nha_oif, npt); if (error != 0) { nhop_free(nh); return (error); } nhop_set_transmit_ifp(nh, attrs->nha_oif); nhop_set_src(nh, ifa); } error = nhop_get_unlinked(nh); if (error != 0) { NL_LOG(LOG_DEBUG, "unable to finalize nexthop"); return (error); } IF_DEBUG_LEVEL(LOG_DEBUG2) { char nhbuf[NHOP_PRINT_BUFSIZE]; nhop_print_buf(nh, nhbuf, sizeof(nhbuf)); NL_LOG(LOG_DEBUG2, "Adding unhop %u: %s", attrs->nha_id, nhbuf); } unhop->un_nhop_src = nh; return (0); } static int rtnl_handle_newnhop(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_pstate *npt) { struct user_nhop *unhop; int error; if ((__predict_false(V_un_ctl == NULL)) && (!vnet_init_unhops())) return (ENOMEM); struct unhop_ctl *ctl = V_un_ctl; struct nl_parsed_nhop attrs = {}; error = nl_parse_nlmsg(hdr, &nhmsg_parser, npt, &attrs); if (error != 0) return (error); /* * Get valid nha_id. Treat nha_id == 0 (auto-assignment) as a second-class * citizen. */ if (attrs.nha_id == 0) { attrs.nha_id = find_spare_uidx(ctl); if (attrs.nha_id == 0) { NL_LOG(LOG_DEBUG, "Unable to get spare uidx"); return (ENOSPC); } } - NL_LOG(LOG_DEBUG, "IFINDEX %d", attrs.nha_oif ? attrs.nha_oif->if_index : 0); + NL_LOG(LOG_DEBUG, "IFINDEX %d", attrs.nha_oif ? if_getindex(attrs.nha_oif) : 0); unhop = malloc(sizeof(struct user_nhop), M_NETLINK, M_NOWAIT | M_ZERO); if (unhop == NULL) { NL_LOG(LOG_DEBUG, "Unable to allocate user_nhop"); return (ENOMEM); } unhop->un_idx = attrs.nha_id; unhop->un_protocol = attrs.nh_protocol; if (attrs.nha_group) error = newnhg(ctl, &attrs, unhop); else error = newnhop(&attrs, unhop, npt); if (error != 0) { free(unhop, M_NETLINK); return (error); } UN_WLOCK(ctl); /* Check if uidx already exists */ struct user_nhop *tmp = NULL; CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, unhop, tmp); if (tmp != NULL) { UN_WUNLOCK(ctl); NL_LOG(LOG_DEBUG, "nhop idx %u already exists", attrs.nha_id); destroy_unhop(unhop); return (EEXIST); } CHT_SLIST_INSERT_HEAD(&ctl->un_head, unhop, unhop); uint32_t num_buckets_new = CHT_SLIST_GET_RESIZE_BUCKETS(&ctl->un_head); UN_WUNLOCK(ctl); /* Report addition of the next nexhop */ struct netlink_walkargs wa = { .hdr.nlmsg_pid = hdr->nlmsg_pid, .hdr.nlmsg_seq = hdr->nlmsg_seq, .hdr.nlmsg_flags = hdr->nlmsg_flags, .hdr.nlmsg_type = NL_RTM_NEWNEXTHOP, }; struct nl_writer nw = {}; if (!nlmsg_get_group_writer(&nw, NLMSG_SMALL, NETLINK_ROUTE, RTNLGRP_NEXTHOP)) { NL_LOG(LOG_DEBUG, "error allocating message writer"); return (ENOMEM); } dump_unhop(unhop, &wa.hdr, &nw); nlmsg_flush(&nw); consider_resize(ctl, num_buckets_new); return (0); } static int rtnl_handle_delnhop(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_pstate *npt) { struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl); int error; if (__predict_false(ctl == NULL)) return (ESRCH); struct nl_parsed_nhop attrs = {}; error = nl_parse_nlmsg(hdr, &nhmsg_parser, npt, &attrs); if (error != 0) return (error); if (attrs.nha_id == 0) { NL_LOG(LOG_DEBUG, "NHA_ID not set"); return (EINVAL); } error = delete_unhop(ctl, hdr, attrs.nha_id); return (error); } static bool match_unhop(const struct nl_parsed_nhop *attrs, struct user_nhop *unhop) { if (attrs->nha_id != 0 && unhop->un_idx != attrs->nha_id) return (false); if (attrs->nha_groups != 0 && unhop->un_nhgrp_src == NULL) return (false); if (attrs->nha_oif != NULL && (unhop->un_nhop_src == NULL || unhop->un_nhop_src->nh_ifp != attrs->nha_oif)) return (false); return (true); } static int rtnl_handle_getnhop(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_pstate *npt) { struct user_nhop *unhop; UN_TRACKER; int error; struct nl_parsed_nhop attrs = {}; error = nl_parse_nlmsg(hdr, &nhmsg_parser, npt, &attrs); if (error != 0) return (error); struct netlink_walkargs wa = { .nw = npt->nw, .hdr.nlmsg_pid = hdr->nlmsg_pid, .hdr.nlmsg_seq = hdr->nlmsg_seq, .hdr.nlmsg_flags = hdr->nlmsg_flags, .hdr.nlmsg_type = NL_RTM_NEWNEXTHOP, }; if (attrs.nha_id != 0) { struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl); struct user_nhop key = { .un_idx = attrs.nha_id }; if (__predict_false(ctl == NULL)) return (ESRCH); NL_LOG(LOG_DEBUG2, "searching for uidx %u", attrs.nha_id); UN_RLOCK(ctl); CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop); UN_RUNLOCK(ctl); if (unhop == NULL) return (ESRCH); dump_unhop(unhop, &wa.hdr, wa.nw); return (0); } else if (attrs.nhaf_kid != 0) { struct nhop_iter iter = { .fibnum = attrs.nhaf_table, .family = attrs.nhaf_family, }; int error = ESRCH; NL_LOG(LOG_DEBUG2, "START table %u family %d", attrs.nhaf_table, attrs.nhaf_family); for (struct nhop_object *nh = nhops_iter_start(&iter); nh; nh = nhops_iter_next(&iter)) { NL_LOG(LOG_DEBUG3, "get %u", nhop_get_idx(nh)); if (nhop_get_idx(nh) == attrs.nhaf_kid) { dump_nhop(nh, 0, &wa.hdr, wa.nw); error = 0; break; } } nhops_iter_stop(&iter); return (error); } else if (attrs.nhaf_knhops) { struct nhop_iter iter = { .fibnum = attrs.nhaf_table, .family = attrs.nhaf_family, }; NL_LOG(LOG_DEBUG2, "DUMP table %u family %d", attrs.nhaf_table, attrs.nhaf_family); wa.hdr.nlmsg_flags |= NLM_F_MULTI; for (struct nhop_object *nh = nhops_iter_start(&iter); nh; nh = nhops_iter_next(&iter)) { dump_nhop(nh, 0, &wa.hdr, wa.nw); } nhops_iter_stop(&iter); } else { struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl); if (__predict_false(ctl == NULL)) return (ESRCH); NL_LOG(LOG_DEBUG2, "DUMP unhops"); UN_RLOCK(ctl); wa.hdr.nlmsg_flags |= NLM_F_MULTI; CHT_SLIST_FOREACH(&ctl->un_head, unhop, unhop) { if (UNHOP_IS_MASTER(unhop) && match_unhop(&attrs, unhop)) dump_unhop(unhop, &wa.hdr, wa.nw); } CHT_SLIST_FOREACH_END; UN_RUNLOCK(ctl); } if (wa.error == 0) { if (!nlmsg_end_dump(wa.nw, wa.error, &wa.hdr)) return (ENOMEM); } return (0); } static const struct rtnl_cmd_handler cmd_handlers[] = { { .cmd = NL_RTM_NEWNEXTHOP, .name = "RTM_NEWNEXTHOP", .cb = &rtnl_handle_newnhop, .priv = PRIV_NET_ROUTE, }, { .cmd = NL_RTM_DELNEXTHOP, .name = "RTM_DELNEXTHOP", .cb = &rtnl_handle_delnhop, .priv = PRIV_NET_ROUTE, }, { .cmd = NL_RTM_GETNEXTHOP, .name = "RTM_GETNEXTHOP", .cb = &rtnl_handle_getnhop, } }; static const struct nlhdr_parser *all_parsers[] = { &nhmsg_parser, &nh_fbsd_parser }; void rtnl_nexthops_init(void) { NL_VERIFY_PARSERS(all_parsers); rtnl_register_messages(cmd_handlers, NL_ARRAY_LEN(cmd_handlers)); } diff --git a/sys/netlink/route/rt.c b/sys/netlink/route/rt.c index e194b8f009c1..70101680e2c0 100644 --- a/sys/netlink/route/rt.c +++ b/sys/netlink/route/rt.c @@ -1,1117 +1,1117 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2021 Ng Peng Nam Sean * Copyright (c) 2022 Alexander V. Chernikov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include "opt_netlink.h" #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_route.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define DEBUG_MOD_NAME nl_route #define DEBUG_MAX_LEVEL LOG_DEBUG3 #include _DECLARE_DEBUG(LOG_INFO); static unsigned char get_rtm_type(const struct nhop_object *nh) { int nh_flags = nh->nh_flags; /* Use the fact that nhg runtime flags are only NHF_MULTIPATH */ if (nh_flags & NHF_BLACKHOLE) return (RTN_BLACKHOLE); else if (nh_flags & NHF_REJECT) return (RTN_PROHIBIT); return (RTN_UNICAST); } static uint8_t nl_get_rtm_protocol(const struct nhop_object *nh) { #ifdef ROUTE_MPATH if (NH_IS_NHGRP(nh)) { const struct nhgrp_object *nhg = (const struct nhgrp_object *)nh; uint8_t origin = nhgrp_get_origin(nhg); if (origin != RTPROT_UNSPEC) return (origin); nh = nhg->nhops[0]; } #endif uint8_t origin = nhop_get_origin(nh); if (origin != RTPROT_UNSPEC) return (origin); /* TODO: remove guesswork once all kernel users fill in origin */ int rt_flags = nhop_get_rtflags(nh); if (rt_flags & RTF_PROTO1) return (RTPROT_ZEBRA); if (rt_flags & RTF_STATIC) return (RTPROT_STATIC); return (RTPROT_KERNEL); } static int get_rtmsg_type_from_rtsock(int cmd) { switch (cmd) { case RTM_ADD: case RTM_CHANGE: case RTM_GET: return NL_RTM_NEWROUTE; case RTM_DELETE: return NL_RTM_DELROUTE; } return (0); } /* * fibnum heuristics * * if (dump && rtm_table == 0 && !rta_table) RT_ALL_FIBS * msg rtm_table RTA_TABLE result * RTM_GETROUTE/dump 0 - RT_ALL_FIBS * RTM_GETROUTE/dump 1 - 1 * RTM_GETROUTE/get 0 - 0 * */ static struct nhop_object * rc_get_nhop(const struct rib_cmd_info *rc) { return ((rc->rc_cmd == RTM_DELETE) ? rc->rc_nh_old : rc->rc_nh_new); } static void dump_rc_nhop_gw(struct nl_writer *nw, const struct nhop_object *nh) { #ifdef INET6 int upper_family; #endif switch (nhop_get_neigh_family(nh)) { case AF_LINK: /* onlink prefix, skip */ break; case AF_INET: nlattr_add(nw, NL_RTA_GATEWAY, 4, &nh->gw4_sa.sin_addr); break; #ifdef INET6 case AF_INET6: upper_family = nhop_get_upper_family(nh); if (upper_family == AF_INET6) { struct in6_addr gw6 = nh->gw6_sa.sin6_addr; in6_clearscope(&gw6); nlattr_add(nw, NL_RTA_GATEWAY, 16, &gw6); } else if (upper_family == AF_INET) { /* IPv4 over IPv6 */ struct in6_addr gw6 = nh->gw6_sa.sin6_addr; in6_clearscope(&gw6); char buf[20]; struct rtvia *via = (struct rtvia *)&buf[0]; via->rtvia_family = AF_INET6; memcpy(via->rtvia_addr, &gw6, 16); nlattr_add(nw, NL_RTA_VIA, 17, via); } break; #endif } } static void dump_rc_nhop_mtu(struct nl_writer *nw, const struct nhop_object *nh) { int nla_len = sizeof(struct nlattr) * 2 + sizeof(uint32_t); struct nlattr *nla = nlmsg_reserve_data(nw, nla_len, struct nlattr); if (nla == NULL) return; nla->nla_type = NL_RTA_METRICS; nla->nla_len = nla_len; nla++; nla->nla_type = NL_RTAX_MTU; nla->nla_len = sizeof(struct nlattr) + sizeof(uint32_t); *((uint32_t *)(nla + 1)) = nh->nh_mtu; } #ifdef ROUTE_MPATH static void dump_rc_nhg(struct nl_writer *nw, const struct nhgrp_object *nhg, struct rtmsg *rtm) { uint32_t uidx = nhgrp_get_uidx(nhg); uint32_t num_nhops; const struct weightened_nhop *wn = nhgrp_get_nhops(nhg, &num_nhops); uint32_t base_rtflags = nhop_get_rtflags(wn[0].nh); if (uidx != 0) nlattr_add_u32(nw, NL_RTA_NH_ID, uidx); nlattr_add_u32(nw, NL_RTA_KNH_ID, nhgrp_get_idx(nhg)); nlattr_add_u32(nw, NL_RTA_RTFLAGS, base_rtflags); int off = nlattr_add_nested(nw, NL_RTA_MULTIPATH); if (off == 0) return; for (int i = 0; i < num_nhops; i++) { int nh_off = nlattr_save_offset(nw); struct rtnexthop *rtnh = nlmsg_reserve_object(nw, struct rtnexthop); if (rtnh == NULL) return; rtnh->rtnh_flags = 0; - rtnh->rtnh_ifindex = wn[i].nh->nh_ifp->if_index; + rtnh->rtnh_ifindex = if_getindex(wn[i].nh->nh_ifp); rtnh->rtnh_hops = wn[i].weight; dump_rc_nhop_gw(nw, wn[i].nh); uint32_t rtflags = nhop_get_rtflags(wn[i].nh); if (rtflags != base_rtflags) nlattr_add_u32(nw, NL_RTA_RTFLAGS, rtflags); if (rtflags & RTF_FIXEDMTU) dump_rc_nhop_mtu(nw, wn[i].nh); rtnh = nlattr_restore_offset(nw, nh_off, struct rtnexthop); /* * nlattr_add() allocates 4-byte aligned storage, no need to aligh * length here * */ rtnh->rtnh_len = nlattr_save_offset(nw) - nh_off; } nlattr_set_len(nw, off); } #endif static void dump_rc_nhop(struct nl_writer *nw, const struct route_nhop_data *rnd, struct rtmsg *rtm) { #ifdef ROUTE_MPATH if (NH_IS_NHGRP(rnd->rnd_nhop)) { dump_rc_nhg(nw, rnd->rnd_nhgrp, rtm); return; } #endif const struct nhop_object *nh = rnd->rnd_nhop; uint32_t rtflags = nhop_get_rtflags(nh); /* * IPv4 over IPv6 * ('RTA_VIA', {'family': 10, 'addr': 'fe80::20c:29ff:fe67:2dd'}), ('RTA_OIF', 2), * IPv4 w/ gw * ('RTA_GATEWAY', '172.16.107.131'), ('RTA_OIF', 2)], * Direct route: * ('RTA_OIF', 2) */ if (nh->nh_flags & NHF_GATEWAY) dump_rc_nhop_gw(nw, nh); uint32_t uidx = nhop_get_uidx(nh); if (uidx != 0) nlattr_add_u32(nw, NL_RTA_NH_ID, uidx); nlattr_add_u32(nw, NL_RTA_KNH_ID, nhop_get_idx(nh)); nlattr_add_u32(nw, NL_RTA_RTFLAGS, rtflags); if (rtflags & RTF_FIXEDMTU) dump_rc_nhop_mtu(nw, nh); uint32_t nh_expire = nhop_get_expire(nh); if (nh_expire > 0) nlattr_add_u32(nw, NL_RTA_EXPIRES, nh_expire - time_uptime); /* In any case, fill outgoing interface */ - nlattr_add_u32(nw, NL_RTA_OIF, nh->nh_ifp->if_index); + nlattr_add_u32(nw, NL_RTA_OIF, if_getindex(nh->nh_ifp)); if (rnd->rnd_weight != RT_DEFAULT_WEIGHT) nlattr_add_u32(nw, NL_RTA_WEIGHT, rnd->rnd_weight); } /* * Dumps output from a rib command into an rtmsg */ static int dump_px(uint32_t fibnum, const struct nlmsghdr *hdr, const struct rtentry *rt, struct route_nhop_data *rnd, struct nl_writer *nw) { struct rtmsg *rtm; int error = 0; NET_EPOCH_ASSERT(); if (!nlmsg_reply(nw, hdr, sizeof(struct rtmsg))) goto enomem; int family = rt_get_family(rt); int rtm_off = nlattr_save_offset(nw); rtm = nlmsg_reserve_object(nw, struct rtmsg); rtm->rtm_family = family; rtm->rtm_dst_len = 0; rtm->rtm_src_len = 0; rtm->rtm_tos = 0; if (fibnum < 255) rtm->rtm_table = (unsigned char)fibnum; rtm->rtm_scope = RT_SCOPE_UNIVERSE; rtm->rtm_protocol = nl_get_rtm_protocol(rnd->rnd_nhop); rtm->rtm_type = get_rtm_type(rnd->rnd_nhop); nlattr_add_u32(nw, NL_RTA_TABLE, fibnum); int plen = 0; #if defined(INET) || defined(INET6) uint32_t scopeid; #endif switch (family) { #ifdef INET case AF_INET: { struct in_addr addr; rt_get_inet_prefix_plen(rt, &addr, &plen, &scopeid); nlattr_add(nw, NL_RTA_DST, 4, &addr); break; } #endif #ifdef INET6 case AF_INET6: { struct in6_addr addr; rt_get_inet6_prefix_plen(rt, &addr, &plen, &scopeid); nlattr_add(nw, NL_RTA_DST, 16, &addr); break; } #endif default: FIB_LOG(LOG_NOTICE, fibnum, family, "unsupported rt family: %d", family); error = EAFNOSUPPORT; goto flush; } rtm = nlattr_restore_offset(nw, rtm_off, struct rtmsg); if (plen > 0) rtm->rtm_dst_len = plen; dump_rc_nhop(nw, rnd, rtm); if (nlmsg_end(nw)) return (0); enomem: error = ENOMEM; flush: nlmsg_abort(nw); return (error); } static int family_to_group(int family) { switch (family) { case AF_INET: return (RTNLGRP_IPV4_ROUTE); case AF_INET6: return (RTNLGRP_IPV6_ROUTE); } return (0); } static void report_operation(uint32_t fibnum, struct rib_cmd_info *rc, struct nlpcb *nlp, struct nlmsghdr *hdr) { struct nl_writer nw = {}; uint32_t group_id = family_to_group(rt_get_family(rc->rc_rt)); if (nlmsg_get_group_writer(&nw, NLMSG_SMALL, NETLINK_ROUTE, group_id)) { struct route_nhop_data rnd = { .rnd_nhop = rc_get_nhop(rc), .rnd_weight = rc->rc_nh_weight, }; hdr->nlmsg_flags &= ~(NLM_F_REPLACE | NLM_F_CREATE); hdr->nlmsg_flags &= ~(NLM_F_EXCL | NLM_F_APPEND); switch (rc->rc_cmd) { case RTM_ADD: hdr->nlmsg_type = NL_RTM_NEWROUTE; hdr->nlmsg_flags |= NLM_F_CREATE | NLM_F_EXCL; break; case RTM_CHANGE: hdr->nlmsg_type = NL_RTM_NEWROUTE; hdr->nlmsg_flags |= NLM_F_REPLACE; break; case RTM_DELETE: hdr->nlmsg_type = NL_RTM_DELROUTE; break; } dump_px(fibnum, hdr, rc->rc_rt, &rnd, &nw); nlmsg_flush(&nw); } rtsock_callback_p->route_f(fibnum, rc); } static void set_scope6(struct sockaddr *sa, struct ifnet *ifp) { #ifdef INET6 if (sa != NULL && sa->sa_family == AF_INET6 && ifp != NULL) { struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *)sa; if (IN6_IS_ADDR_LINKLOCAL(&sa6->sin6_addr)) in6_set_unicast_scopeid(&sa6->sin6_addr, if_getindex(ifp)); } #endif } struct rta_mpath_nh { struct sockaddr *gw; struct ifnet *ifp; uint8_t rtnh_flags; uint8_t rtnh_weight; }; #define _IN(_field) offsetof(struct rtnexthop, _field) #define _OUT(_field) offsetof(struct rta_mpath_nh, _field) const static struct nlattr_parser nla_p_rtnh[] = { { .type = NL_RTA_GATEWAY, .off = _OUT(gw), .cb = nlattr_get_ip }, { .type = NL_RTA_VIA, .off = _OUT(gw), .cb = nlattr_get_ipvia }, }; const static struct nlfield_parser nlf_p_rtnh[] = { { .off_in = _IN(rtnh_flags), .off_out = _OUT(rtnh_flags), .cb = nlf_get_u8 }, { .off_in = _IN(rtnh_hops), .off_out = _OUT(rtnh_weight), .cb = nlf_get_u8 }, { .off_in = _IN(rtnh_ifindex), .off_out = _OUT(ifp), .cb = nlf_get_ifpz }, }; #undef _IN #undef _OUT static bool post_p_rtnh(void *_attrs, struct nl_pstate *npt __unused) { struct rta_mpath_nh *attrs = (struct rta_mpath_nh *)_attrs; set_scope6(attrs->gw, attrs->ifp); return (true); } NL_DECLARE_PARSER_EXT(mpath_parser, struct rtnexthop, NULL, nlf_p_rtnh, nla_p_rtnh, post_p_rtnh); struct rta_mpath { int num_nhops; struct rta_mpath_nh nhops[0]; }; static int nlattr_get_multipath(struct nlattr *nla, struct nl_pstate *npt, const void *arg, void *target) { int data_len = nla->nla_len - sizeof(struct nlattr); struct rtnexthop *rtnh; int max_nhops = data_len / sizeof(struct rtnexthop); struct rta_mpath *mp = npt_alloc(npt, (max_nhops + 2) * sizeof(struct rta_mpath_nh)); mp->num_nhops = 0; for (rtnh = (struct rtnexthop *)(nla + 1); data_len > 0; ) { struct rta_mpath_nh *mpnh = &mp->nhops[mp->num_nhops++]; int error = nl_parse_header(rtnh, rtnh->rtnh_len, &mpath_parser, npt, mpnh); if (error != 0) { NLMSG_REPORT_ERR_MSG(npt, "RTA_MULTIPATH: nexhop %d: parse failed", mp->num_nhops - 1); return (error); } int len = NL_ITEM_ALIGN(rtnh->rtnh_len); data_len -= len; rtnh = (struct rtnexthop *)((char *)rtnh + len); } if (data_len != 0 || mp->num_nhops == 0) { NLMSG_REPORT_ERR_MSG(npt, "invalid RTA_MULTIPATH attr"); return (EINVAL); } *((struct rta_mpath **)target) = mp; return (0); } struct nl_parsed_route { struct sockaddr *rta_dst; struct sockaddr *rta_gw; struct ifnet *rta_oif; struct rta_mpath *rta_multipath; uint32_t rta_table; uint32_t rta_rtflags; uint32_t rta_nh_id; uint32_t rta_weight; uint32_t rtax_mtu; uint8_t rtm_family; uint8_t rtm_dst_len; uint8_t rtm_protocol; uint8_t rtm_type; uint32_t rtm_flags; }; #define _IN(_field) offsetof(struct rtmsg, _field) #define _OUT(_field) offsetof(struct nl_parsed_route, _field) static struct nlattr_parser nla_p_rtmetrics[] = { { .type = NL_RTAX_MTU, .off = _OUT(rtax_mtu), .cb = nlattr_get_uint32 }, }; NL_DECLARE_ATTR_PARSER(metrics_parser, nla_p_rtmetrics); static const struct nlattr_parser nla_p_rtmsg[] = { { .type = NL_RTA_DST, .off = _OUT(rta_dst), .cb = nlattr_get_ip }, { .type = NL_RTA_OIF, .off = _OUT(rta_oif), .cb = nlattr_get_ifp }, { .type = NL_RTA_GATEWAY, .off = _OUT(rta_gw), .cb = nlattr_get_ip }, { .type = NL_RTA_METRICS, .arg = &metrics_parser, .cb = nlattr_get_nested }, { .type = NL_RTA_MULTIPATH, .off = _OUT(rta_multipath), .cb = nlattr_get_multipath }, { .type = NL_RTA_WEIGHT, .off = _OUT(rta_weight), .cb = nlattr_get_uint32 }, { .type = NL_RTA_RTFLAGS, .off = _OUT(rta_rtflags), .cb = nlattr_get_uint32 }, { .type = NL_RTA_TABLE, .off = _OUT(rta_table), .cb = nlattr_get_uint32 }, { .type = NL_RTA_VIA, .off = _OUT(rta_gw), .cb = nlattr_get_ipvia }, { .type = NL_RTA_NH_ID, .off = _OUT(rta_nh_id), .cb = nlattr_get_uint32 }, }; static const struct nlfield_parser nlf_p_rtmsg[] = { { .off_in = _IN(rtm_family), .off_out = _OUT(rtm_family), .cb = nlf_get_u8 }, { .off_in = _IN(rtm_dst_len), .off_out = _OUT(rtm_dst_len), .cb = nlf_get_u8 }, { .off_in = _IN(rtm_protocol), .off_out = _OUT(rtm_protocol), .cb = nlf_get_u8 }, { .off_in = _IN(rtm_type), .off_out = _OUT(rtm_type), .cb = nlf_get_u8 }, { .off_in = _IN(rtm_flags), .off_out = _OUT(rtm_flags), .cb = nlf_get_u32 }, }; #undef _IN #undef _OUT static bool post_p_rtmsg(void *_attrs, struct nl_pstate *npt __unused) { struct nl_parsed_route *attrs = (struct nl_parsed_route *)_attrs; set_scope6(attrs->rta_dst, attrs->rta_oif); set_scope6(attrs->rta_gw, attrs->rta_oif); return (true); } NL_DECLARE_PARSER_EXT(rtm_parser, struct rtmsg, NULL, nlf_p_rtmsg, nla_p_rtmsg, post_p_rtmsg); struct netlink_walkargs { struct nl_writer *nw; struct route_nhop_data rnd; struct nlmsghdr hdr; struct nlpcb *nlp; uint32_t fibnum; int family; int error; int count; int dumped; int dumped_tables; }; static int dump_rtentry(struct rtentry *rt, void *_arg) { struct netlink_walkargs *wa = (struct netlink_walkargs *)_arg; int error; wa->count++; if (wa->error != 0) return (0); if (!rt_is_exportable(rt, nlp_get_cred(wa->nlp))) return (0); wa->dumped++; rt_get_rnd(rt, &wa->rnd); error = dump_px(wa->fibnum, &wa->hdr, rt, &wa->rnd, wa->nw); IF_DEBUG_LEVEL(LOG_DEBUG3) { char rtbuf[INET6_ADDRSTRLEN + 5]; FIB_LOG(LOG_DEBUG3, wa->fibnum, wa->family, "Dump %s, offset %u, error %d", rt_print_buf(rt, rtbuf, sizeof(rtbuf)), wa->nw->offset, error); } wa->error = error; return (0); } static void dump_rtable_one(struct netlink_walkargs *wa, uint32_t fibnum, int family) { FIB_LOG(LOG_DEBUG2, fibnum, family, "Start dump"); wa->count = 0; wa->dumped = 0; rib_walk(fibnum, family, false, dump_rtentry, wa); wa->dumped_tables++; FIB_LOG(LOG_DEBUG2, fibnum, family, "End dump, iterated %d dumped %d", wa->count, wa->dumped); NL_LOG(LOG_DEBUG2, "Current offset: %d", wa->nw->offset); } static int dump_rtable_fib(struct netlink_walkargs *wa, uint32_t fibnum, int family) { wa->fibnum = fibnum; if (family == AF_UNSPEC) { for (int i = 0; i < AF_MAX; i++) { if (rt_tables_get_rnh(fibnum, i) != 0) { wa->family = i; dump_rtable_one(wa, fibnum, i); if (wa->error != 0) break; } } } else { if (rt_tables_get_rnh(fibnum, family) != 0) { wa->family = family; dump_rtable_one(wa, fibnum, family); } } return (wa->error); } static int handle_rtm_getroute(struct nlpcb *nlp, struct nl_parsed_route *attrs, struct nlmsghdr *hdr, struct nl_pstate *npt) { RIB_RLOCK_TRACKER; struct rib_head *rnh; const struct rtentry *rt; struct route_nhop_data rnd; uint32_t fibnum = attrs->rta_table; sa_family_t family = attrs->rtm_family; if (attrs->rta_dst == NULL) { NLMSG_REPORT_ERR_MSG(npt, "No RTA_DST supplied"); return (EINVAL); } rnh = rt_tables_get_rnh(fibnum, family); if (rnh == NULL) return (EAFNOSUPPORT); RIB_RLOCK(rnh); struct sockaddr *dst = attrs->rta_dst; if (attrs->rtm_flags & RTM_F_PREFIX) rt = rib_lookup_prefix_plen(rnh, dst, attrs->rtm_dst_len, &rnd); else rt = (const struct rtentry *)rnh->rnh_matchaddr(dst, &rnh->head); if (rt == NULL) { RIB_RUNLOCK(rnh); return (ESRCH); } rt_get_rnd(rt, &rnd); rnd.rnd_nhop = nhop_select_func(rnd.rnd_nhop, 0); RIB_RUNLOCK(rnh); if (!rt_is_exportable(rt, nlp_get_cred(nlp))) return (ESRCH); IF_DEBUG_LEVEL(LOG_DEBUG2) { char rtbuf[NHOP_PRINT_BUFSIZE] __unused, nhbuf[NHOP_PRINT_BUFSIZE] __unused; FIB_LOG(LOG_DEBUG2, fibnum, family, "getroute completed: got %s for %s", nhop_print_buf_any(rnd.rnd_nhop, nhbuf, sizeof(nhbuf)), rt_print_buf(rt, rtbuf, sizeof(rtbuf))); } hdr->nlmsg_type = NL_RTM_NEWROUTE; dump_px(fibnum, hdr, rt, &rnd, npt->nw); return (0); } static int handle_rtm_dump(struct nlpcb *nlp, uint32_t fibnum, int family, struct nlmsghdr *hdr, struct nl_writer *nw) { struct netlink_walkargs wa = { .nlp = nlp, .nw = nw, .hdr.nlmsg_pid = hdr->nlmsg_pid, .hdr.nlmsg_seq = hdr->nlmsg_seq, .hdr.nlmsg_type = NL_RTM_NEWROUTE, .hdr.nlmsg_flags = hdr->nlmsg_flags | NLM_F_MULTI, }; if (fibnum == RT_TABLE_UNSPEC) { for (int i = 0; i < V_rt_numfibs; i++) { dump_rtable_fib(&wa, fibnum, family); if (wa.error != 0) break; } } else dump_rtable_fib(&wa, fibnum, family); if (wa.error == 0 && wa.dumped_tables == 0) { FIB_LOG(LOG_DEBUG, fibnum, family, "incorrect fibnum/family"); wa.error = ESRCH; // How do we propagate it? } if (!nlmsg_end_dump(wa.nw, wa.error, &wa.hdr)) { NL_LOG(LOG_DEBUG, "Unable to finalize the dump"); return (ENOMEM); } return (wa.error); } static struct nhop_object * finalize_nhop(struct nhop_object *nh, const struct sockaddr *dst, int *perror) { /* * The following MUST be filled: * nh_ifp, nh_ifa, nh_gw */ if (nh->gw_sa.sa_family == 0) { /* * Empty gateway. Can be direct route with RTA_OIF set. */ if (nh->nh_ifp != NULL) nhop_set_direct_gw(nh, nh->nh_ifp); else { NL_LOG(LOG_DEBUG, "empty gateway and interface, skipping"); *perror = EINVAL; return (NULL); } /* Both nh_ifp and gateway are set */ } else { /* Gateway is set up, we can derive ifp if not set */ if (nh->nh_ifp == NULL) { uint32_t fibnum = nhop_get_fibnum(nh); uint32_t flags = 0; if (nh->nh_flags & NHF_GATEWAY) flags = RTF_GATEWAY; else if (nh->nh_flags & NHF_HOST) flags = RTF_HOST; struct ifaddr *ifa = ifa_ifwithroute(flags, dst, &nh->gw_sa, fibnum); if (ifa == NULL) { NL_LOG(LOG_DEBUG, "Unable to determine ifp, skipping"); *perror = EINVAL; return (NULL); } nhop_set_transmit_ifp(nh, ifa->ifa_ifp); } } /* Both nh_ifp and gateway are set */ if (nh->nh_ifa == NULL) { const struct sockaddr *gw_sa = &nh->gw_sa; if (gw_sa->sa_family != dst->sa_family) { /* * Use dst as the target for determining the default * preferred ifa IF * 1) the gateway is link-level (e.g. direct route) * 2) the gateway family is different (e.g. IPv4 over IPv6). */ gw_sa = dst; } struct ifaddr *ifa = ifaof_ifpforaddr(gw_sa, nh->nh_ifp); if (ifa == NULL) { NL_LOG(LOG_DEBUG, "Unable to determine ifa, skipping"); *perror = EINVAL; return (NULL); } nhop_set_src(nh, ifa); } return (nhop_get_nhop(nh, perror)); } static int get_pxflag(const struct nl_parsed_route *attrs) { int pxflag = 0; switch (attrs->rtm_family) { case AF_INET: if (attrs->rtm_dst_len == 32) pxflag = NHF_HOST; else if (attrs->rtm_dst_len == 0) pxflag = NHF_DEFAULT; break; case AF_INET6: if (attrs->rtm_dst_len == 128) pxflag = NHF_HOST; else if (attrs->rtm_dst_len == 0) pxflag = NHF_DEFAULT; break; } return (pxflag); } static int get_op_flags(int nlm_flags) { int op_flags = 0; op_flags |= (nlm_flags & NLM_F_REPLACE) ? RTM_F_REPLACE : 0; op_flags |= (nlm_flags & NLM_F_EXCL) ? RTM_F_EXCL : 0; op_flags |= (nlm_flags & NLM_F_CREATE) ? RTM_F_CREATE : 0; op_flags |= (nlm_flags & NLM_F_APPEND) ? RTM_F_APPEND : 0; return (op_flags); } #ifdef ROUTE_MPATH static int create_nexthop_one(struct nl_parsed_route *attrs, struct rta_mpath_nh *mpnh, struct nl_pstate *npt, struct nhop_object **pnh) { int error; if (mpnh->gw == NULL) return (EINVAL); struct nhop_object *nh = nhop_alloc(attrs->rta_table, attrs->rtm_family); if (nh == NULL) return (ENOMEM); error = nl_set_nexthop_gw(nh, mpnh->gw, mpnh->ifp, npt); if (error != 0) { nhop_free(nh); return (error); } if (mpnh->ifp != NULL) nhop_set_transmit_ifp(nh, mpnh->ifp); nhop_set_pxtype_flag(nh, get_pxflag(attrs)); nhop_set_rtflags(nh, attrs->rta_rtflags); if (attrs->rtm_protocol > RTPROT_STATIC) nhop_set_origin(nh, attrs->rtm_protocol); *pnh = finalize_nhop(nh, attrs->rta_dst, &error); return (error); } #endif static struct nhop_object * create_nexthop_from_attrs(struct nl_parsed_route *attrs, struct nl_pstate *npt, int *perror) { struct nhop_object *nh = NULL; int error = 0; if (attrs->rta_multipath != NULL) { #ifdef ROUTE_MPATH /* Multipath w/o explicit nexthops */ int num_nhops = attrs->rta_multipath->num_nhops; struct weightened_nhop *wn = npt_alloc(npt, sizeof(*wn) * num_nhops); for (int i = 0; i < num_nhops; i++) { struct rta_mpath_nh *mpnh = &attrs->rta_multipath->nhops[i]; error = create_nexthop_one(attrs, mpnh, npt, &wn[i].nh); if (error != 0) { for (int j = 0; j < i; j++) nhop_free(wn[j].nh); break; } wn[i].weight = mpnh->rtnh_weight > 0 ? mpnh->rtnh_weight : 1; } if (error == 0) { struct rib_head *rh = nhop_get_rh(wn[0].nh); struct nhgrp_object *nhg; nhg = nhgrp_alloc(rh->rib_fibnum, rh->rib_family, wn, num_nhops, perror); if (nhg != NULL) { if (attrs->rtm_protocol > RTPROT_STATIC) nhgrp_set_origin(nhg, attrs->rtm_protocol); nhg = nhgrp_get_nhgrp(nhg, perror); } for (int i = 0; i < num_nhops; i++) nhop_free(wn[i].nh); if (nhg != NULL) return ((struct nhop_object *)nhg); error = *perror; } #else error = ENOTSUP; #endif *perror = error; } else { nh = nhop_alloc(attrs->rta_table, attrs->rtm_family); if (nh == NULL) { *perror = ENOMEM; return (NULL); } if (attrs->rta_gw != NULL) { *perror = nl_set_nexthop_gw(nh, attrs->rta_gw, attrs->rta_oif, npt); if (*perror != 0) { nhop_free(nh); return (NULL); } } if (attrs->rta_oif != NULL) nhop_set_transmit_ifp(nh, attrs->rta_oif); if (attrs->rtax_mtu != 0) nhop_set_mtu(nh, attrs->rtax_mtu, true); if (attrs->rta_rtflags & RTF_BROADCAST) nhop_set_broadcast(nh, true); if (attrs->rtm_protocol > RTPROT_STATIC) nhop_set_origin(nh, attrs->rtm_protocol); nhop_set_pxtype_flag(nh, get_pxflag(attrs)); nhop_set_rtflags(nh, attrs->rta_rtflags); switch (attrs->rtm_type) { case RTN_UNICAST: break; case RTN_BLACKHOLE: nhop_set_blackhole(nh, RTF_BLACKHOLE); break; case RTN_PROHIBIT: case RTN_UNREACHABLE: nhop_set_blackhole(nh, RTF_REJECT); break; /* TODO: return ENOTSUP for other types if strict option is set */ } nh = finalize_nhop(nh, attrs->rta_dst, perror); } return (nh); } static int rtnl_handle_newroute(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_pstate *npt) { struct rib_cmd_info rc = {}; struct nhop_object *nh = NULL; int error; struct nl_parsed_route attrs = {}; error = nl_parse_nlmsg(hdr, &rtm_parser, npt, &attrs); if (error != 0) return (error); /* Check if we have enough data */ if (attrs.rta_dst == NULL) { NL_LOG(LOG_DEBUG, "missing RTA_DST"); return (EINVAL); } if (attrs.rta_table >= V_rt_numfibs) { NLMSG_REPORT_ERR_MSG(npt, "invalid fib"); return (EINVAL); } if (attrs.rta_nh_id != 0) { /* Referenced uindex */ int pxflag = get_pxflag(&attrs); nh = nl_find_nhop(attrs.rta_table, attrs.rtm_family, attrs.rta_nh_id, pxflag, &error); if (error != 0) return (error); } else { nh = create_nexthop_from_attrs(&attrs, npt, &error); if (error != 0) { NL_LOG(LOG_DEBUG, "Error creating nexthop"); return (error); } } if (!NH_IS_NHGRP(nh) && attrs.rta_weight == 0) attrs.rta_weight = RT_DEFAULT_WEIGHT; struct route_nhop_data rnd = { .rnd_nhop = nh, .rnd_weight = attrs.rta_weight }; int op_flags = get_op_flags(hdr->nlmsg_flags); error = rib_add_route_px(attrs.rta_table, attrs.rta_dst, attrs.rtm_dst_len, &rnd, op_flags, &rc); if (error == 0) report_operation(attrs.rta_table, &rc, nlp, hdr); return (error); } static int path_match_func(const struct rtentry *rt, const struct nhop_object *nh, void *_data) { struct nl_parsed_route *attrs = (struct nl_parsed_route *)_data; if ((attrs->rta_gw != NULL) && !rib_match_gw(rt, nh, attrs->rta_gw)) return (0); if ((attrs->rta_oif != NULL) && (attrs->rta_oif != nh->nh_ifp)) return (0); return (1); } static int rtnl_handle_delroute(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_pstate *npt) { struct rib_cmd_info rc; int error; struct nl_parsed_route attrs = {}; error = nl_parse_nlmsg(hdr, &rtm_parser, npt, &attrs); if (error != 0) return (error); if (attrs.rta_dst == NULL) { NLMSG_REPORT_ERR_MSG(npt, "RTA_DST is not set"); return (ESRCH); } if (attrs.rta_table >= V_rt_numfibs) { NLMSG_REPORT_ERR_MSG(npt, "invalid fib"); return (EINVAL); } error = rib_del_route_px(attrs.rta_table, attrs.rta_dst, attrs.rtm_dst_len, path_match_func, &attrs, 0, &rc); if (error == 0) report_operation(attrs.rta_table, &rc, nlp, hdr); return (error); } static int rtnl_handle_getroute(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_pstate *npt) { int error; struct nl_parsed_route attrs = {}; error = nl_parse_nlmsg(hdr, &rtm_parser, npt, &attrs); if (error != 0) return (error); if (attrs.rta_table >= V_rt_numfibs) { NLMSG_REPORT_ERR_MSG(npt, "invalid fib"); return (EINVAL); } if (hdr->nlmsg_flags & NLM_F_DUMP) error = handle_rtm_dump(nlp, attrs.rta_table, attrs.rtm_family, hdr, npt->nw); else error = handle_rtm_getroute(nlp, &attrs, hdr, npt); return (error); } void rtnl_handle_route_event(uint32_t fibnum, const struct rib_cmd_info *rc) { struct nl_writer nw = {}; int family, nlm_flags = 0; family = rt_get_family(rc->rc_rt); /* XXX: check if there are active listeners first */ /* TODO: consider passing PID/type/seq */ switch (rc->rc_cmd) { case RTM_ADD: nlm_flags = NLM_F_EXCL | NLM_F_CREATE; break; case RTM_CHANGE: nlm_flags = NLM_F_REPLACE; break; case RTM_DELETE: nlm_flags = 0; break; } IF_DEBUG_LEVEL(LOG_DEBUG2) { char rtbuf[NHOP_PRINT_BUFSIZE] __unused; FIB_LOG(LOG_DEBUG2, fibnum, family, "received event %s for %s / nlm_flags=%X", rib_print_cmd(rc->rc_cmd), rt_print_buf(rc->rc_rt, rtbuf, sizeof(rtbuf)), nlm_flags); } struct nlmsghdr hdr = { .nlmsg_flags = nlm_flags, .nlmsg_type = get_rtmsg_type_from_rtsock(rc->rc_cmd), }; struct route_nhop_data rnd = { .rnd_nhop = rc_get_nhop(rc), .rnd_weight = rc->rc_nh_weight, }; uint32_t group_id = family_to_group(family); if (!nlmsg_get_group_writer(&nw, NLMSG_SMALL, NETLINK_ROUTE, group_id)) { NL_LOG(LOG_DEBUG, "error allocating event buffer"); return; } dump_px(fibnum, &hdr, rc->rc_rt, &rnd, &nw); nlmsg_flush(&nw); } static const struct rtnl_cmd_handler cmd_handlers[] = { { .cmd = NL_RTM_GETROUTE, .name = "RTM_GETROUTE", .cb = &rtnl_handle_getroute, .flags = RTNL_F_ALLOW_NONVNET_JAIL, }, { .cmd = NL_RTM_DELROUTE, .name = "RTM_DELROUTE", .cb = &rtnl_handle_delroute, .priv = PRIV_NET_ROUTE, }, { .cmd = NL_RTM_NEWROUTE, .name = "RTM_NEWROUTE", .cb = &rtnl_handle_newroute, .priv = PRIV_NET_ROUTE, } }; static const struct nlhdr_parser *all_parsers[] = {&mpath_parser, &metrics_parser, &rtm_parser}; void rtnl_routes_init(void) { NL_VERIFY_PARSERS(all_parsers); rtnl_register_messages(cmd_handlers, NL_ARRAY_LEN(cmd_handlers)); }