Index: head/sys/net/if.c =================================================================== --- head/sys/net/if.c (revision 333308) +++ head/sys/net/if.c (revision 333309) @@ -1,4481 +1,4501 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1980, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)if.c 8.5 (Berkeley) 1/9/95 * $FreeBSD$ */ #include "opt_inet6.h" #include "opt_inet.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(INET) || defined(INET6) #include #include #include #include #include #ifdef INET #include #include #endif /* INET */ #ifdef INET6 #include #include #endif /* INET6 */ #endif /* INET || INET6 */ #include /* * Consumers of struct ifreq such as tcpdump assume no pad between ifr_name * and ifr_ifru when it is used in SIOCGIFCONF. */ _Static_assert(sizeof(((struct ifreq *)0)->ifr_name) == offsetof(struct ifreq, ifr_ifru), "gap between ifr_name and ifr_ifru"); #ifdef COMPAT_FREEBSD32 #include #include struct ifreq_buffer32 { uint32_t length; /* (size_t) */ uint32_t buffer; /* (void *) */ }; /* * Interface request structure used for socket * ioctl's. All interface ioctl's must have parameter * definitions which begin with ifr_name. The * remainder may be interface specific. */ struct ifreq32 { char ifr_name[IFNAMSIZ]; /* if name, e.g. 
"en0" */ union { struct sockaddr ifru_addr; struct sockaddr ifru_dstaddr; struct sockaddr ifru_broadaddr; struct ifreq_buffer32 ifru_buffer; short ifru_flags[2]; short ifru_index; int ifru_jid; int ifru_metric; int ifru_mtu; int ifru_phys; int ifru_media; uint32_t ifru_data; int ifru_cap[2]; u_int ifru_fib; u_char ifru_vlan_pcp; } ifr_ifru; }; CTASSERT(sizeof(struct ifreq) == sizeof(struct ifreq32)); CTASSERT(__offsetof(struct ifreq, ifr_ifru) == __offsetof(struct ifreq32, ifr_ifru)); struct ifgroupreq32 { char ifgr_name[IFNAMSIZ]; u_int ifgr_len; union { char ifgru_group[IFNAMSIZ]; uint32_t ifgru_groups; } ifgr_ifgru; }; struct ifmediareq32 { char ifm_name[IFNAMSIZ]; int ifm_current; int ifm_mask; int ifm_status; int ifm_active; int ifm_count; uint32_t ifm_ulist; /* (int *) */ }; #define SIOCGIFMEDIA32 _IOC_NEWTYPE(SIOCGIFMEDIA, struct ifmediareq32) #define SIOCGIFXMEDIA32 _IOC_NEWTYPE(SIOCGIFXMEDIA, struct ifmediareq32) #define _CASE_IOC_IFGROUPREQ_32(cmd) \ case _IOC_NEWTYPE((cmd), struct ifgroupreq32): #else /* !COMPAT_FREEBSD32 */ #define _CASE_IOC_IFGROUPREQ_32(cmd) #endif /* !COMPAT_FREEBSD32 */ #define CASE_IOC_IFGROUPREQ(cmd) \ _CASE_IOC_IFGROUPREQ_32(cmd) \ case (cmd) union ifreq_union { struct ifreq ifr; #ifdef COMPAT_FREEBSD32 struct ifreq32 ifr32; #endif }; union ifgroupreq_union { struct ifgroupreq ifgr; #ifdef COMPAT_FREEBSD32 struct ifgroupreq32 ifgr32; #endif }; SYSCTL_NODE(_net, PF_LINK, link, CTLFLAG_RW, 0, "Link layers"); SYSCTL_NODE(_net_link, 0, generic, CTLFLAG_RW, 0, "Generic link-management"); SYSCTL_INT(_net_link, OID_AUTO, ifqmaxlen, CTLFLAG_RDTUN, &ifqmaxlen, 0, "max send queue size"); /* Log link state change events */ static int log_link_state_change = 1; SYSCTL_INT(_net_link, OID_AUTO, log_link_state_change, CTLFLAG_RW, &log_link_state_change, 0, "log interface link state change events"); /* Log promiscuous mode change events */ static int log_promisc_mode_change = 1; SYSCTL_INT(_net_link, OID_AUTO, log_promisc_mode_change, CTLFLAG_RDTUN, &log_promisc_mode_change, 1, "log promiscuous mode change events"); /* Interface description */ static unsigned int ifdescr_maxlen = 1024; SYSCTL_UINT(_net, OID_AUTO, ifdescr_maxlen, CTLFLAG_RW, &ifdescr_maxlen, 0, "administrative maximum length for interface description"); static MALLOC_DEFINE(M_IFDESCR, "ifdescr", "ifnet descriptions"); /* global sx for non-critical path ifdescr */ static struct sx ifdescr_sx; SX_SYSINIT(ifdescr_sx, &ifdescr_sx, "ifnet descr"); void (*bridge_linkstate_p)(struct ifnet *ifp); void (*ng_ether_link_state_p)(struct ifnet *ifp, int state); void (*lagg_linkstate_p)(struct ifnet *ifp, int state); /* These are external hooks for CARP. 
*/ void (*carp_linkstate_p)(struct ifnet *ifp); void (*carp_demote_adj_p)(int, char *); int (*carp_master_p)(struct ifaddr *); #if defined(INET) || defined(INET6) int (*carp_forus_p)(struct ifnet *ifp, u_char *dhost); int (*carp_output_p)(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *sa); int (*carp_ioctl_p)(struct ifreq *, u_long, struct thread *); int (*carp_attach_p)(struct ifaddr *, int); void (*carp_detach_p)(struct ifaddr *, bool); #endif #ifdef INET int (*carp_iamatch_p)(struct ifaddr *, uint8_t **); #endif #ifdef INET6 struct ifaddr *(*carp_iamatch6_p)(struct ifnet *ifp, struct in6_addr *taddr6); caddr_t (*carp_macmatch6_p)(struct ifnet *ifp, struct mbuf *m, const struct in6_addr *taddr); #endif struct mbuf *(*tbr_dequeue_ptr)(struct ifaltq *, int) = NULL; /* * XXX: Style; these should be sorted alphabetically, and unprototyped * static functions should be prototyped. Currently they are sorted by * declaration order. */ static void if_attachdomain(void *); static void if_attachdomain1(struct ifnet *); static int ifconf(u_long, caddr_t); -static void if_freemulti(struct ifmultiaddr *); static void if_grow(void); static void if_input_default(struct ifnet *, struct mbuf *); static int if_requestencap_default(struct ifnet *, struct if_encap_req *); static void if_route(struct ifnet *, int flag, int fam); static int if_setflag(struct ifnet *, int, int, int *, int); static int if_transmit(struct ifnet *ifp, struct mbuf *m); static void if_unroute(struct ifnet *, int flag, int fam); static void link_rtrequest(int, struct rtentry *, struct rt_addrinfo *); static int ifhwioctl(u_long, struct ifnet *, caddr_t, struct thread *); static int if_delmulti_locked(struct ifnet *, struct ifmultiaddr *, int); static void do_link_state_change(void *, int); static int if_getgroup(struct ifgroupreq *, struct ifnet *); static int if_getgroupmembers(struct ifgroupreq *); static void if_delgroups(struct ifnet *); static void if_attach_internal(struct ifnet *, int, struct if_clone *); static int if_detach_internal(struct ifnet *, int, struct if_clone **); #ifdef VIMAGE static void if_vmove(struct ifnet *, struct vnet *); #endif #ifdef INET6 /* * XXX: declare here to avoid to include many inet6 related files.. * should be more generalized? */ extern void nd6_setmtu(struct ifnet *); #endif /* ipsec helper hooks */ VNET_DEFINE(struct hhook_head *, ipsec_hhh_in[HHOOK_IPSEC_COUNT]); VNET_DEFINE(struct hhook_head *, ipsec_hhh_out[HHOOK_IPSEC_COUNT]); VNET_DEFINE(int, if_index); int ifqmaxlen = IFQ_MAXLEN; VNET_DEFINE(struct ifnethead, ifnet); /* depend on static init XXX */ VNET_DEFINE(struct ifgrouphead, ifg_head); static VNET_DEFINE(int, if_indexlim) = 8; /* Table of ifnet by index. */ VNET_DEFINE(struct ifnet **, ifindex_table); #define V_if_indexlim VNET(if_indexlim) #define V_ifindex_table VNET(ifindex_table) /* * The global network interface list (V_ifnet) and related state (such as * if_index, if_indexlim, and ifindex_table) are protected by an sxlock and * an rwlock. Either may be acquired shared to stablize the list, but both * must be acquired writable to modify the list. This model allows us to * both stablize the interface list during interrupt thread processing, but * also to stablize it over long-running ioctls, without introducing priority * inversions and deadlocks. 
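 *
 * As a rough illustration (a sketch using names from this file, not a new
 * API), the read-side pattern this locking model supports is the one
 * ifunit() and the ifa_ifwith*() routines below use:
 *
 *	IFNET_RLOCK_NOSLEEP();
 *	TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
 *		// examine ifp; call if_ref(ifp) before dropping the
 *		// lock if the pointer must remain valid afterwards
 *	}
 *	IFNET_RUNLOCK_NOSLEEP();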
*/ struct rwlock ifnet_rwlock; RW_SYSINIT_FLAGS(ifnet_rw, &ifnet_rwlock, "ifnet_rw", RW_RECURSE); struct sx ifnet_sxlock; SX_SYSINIT_FLAGS(ifnet_sx, &ifnet_sxlock, "ifnet_sx", SX_RECURSE); /* * The allocation of network interfaces is a rather non-atomic affair; we * need to select an index before we are ready to expose the interface for * use, so will use this pointer value to indicate reservation. */ #define IFNET_HOLD (void *)(uintptr_t)(-1) static if_com_alloc_t *if_com_alloc[256]; static if_com_free_t *if_com_free[256]; static MALLOC_DEFINE(M_IFNET, "ifnet", "interface internals"); MALLOC_DEFINE(M_IFADDR, "ifaddr", "interface address"); MALLOC_DEFINE(M_IFMADDR, "ether_multi", "link-level multicast address"); struct ifnet * ifnet_byindex_locked(u_short idx) { if (idx > V_if_index) return (NULL); if (V_ifindex_table[idx] == IFNET_HOLD) return (NULL); return (V_ifindex_table[idx]); } struct ifnet * ifnet_byindex(u_short idx) { struct ifnet *ifp; IFNET_RLOCK_NOSLEEP(); ifp = ifnet_byindex_locked(idx); IFNET_RUNLOCK_NOSLEEP(); return (ifp); } struct ifnet * ifnet_byindex_ref(u_short idx) { struct ifnet *ifp; IFNET_RLOCK_NOSLEEP(); ifp = ifnet_byindex_locked(idx); if (ifp == NULL || (ifp->if_flags & IFF_DYING)) { IFNET_RUNLOCK_NOSLEEP(); return (NULL); } if_ref(ifp); IFNET_RUNLOCK_NOSLEEP(); return (ifp); } /* * Allocate an ifindex array entry; return 0 on success or an error on * failure. */ static u_short ifindex_alloc(void) { u_short idx; IFNET_WLOCK_ASSERT(); retry: /* * Try to find an empty slot below V_if_index. If we fail, take the * next slot. */ for (idx = 1; idx <= V_if_index; idx++) { if (V_ifindex_table[idx] == NULL) break; } /* Catch if_index overflow. */ if (idx >= V_if_indexlim) { if_grow(); goto retry; } if (idx > V_if_index) V_if_index = idx; return (idx); } static void ifindex_free_locked(u_short idx) { IFNET_WLOCK_ASSERT(); V_ifindex_table[idx] = NULL; while (V_if_index > 0 && V_ifindex_table[V_if_index] == NULL) V_if_index--; } static void ifindex_free(u_short idx) { IFNET_WLOCK(); ifindex_free_locked(idx); IFNET_WUNLOCK(); } static void ifnet_setbyindex_locked(u_short idx, struct ifnet *ifp) { IFNET_WLOCK_ASSERT(); V_ifindex_table[idx] = ifp; } static void ifnet_setbyindex(u_short idx, struct ifnet *ifp) { IFNET_WLOCK(); ifnet_setbyindex_locked(idx, ifp); IFNET_WUNLOCK(); } struct ifaddr * ifaddr_byindex(u_short idx) { struct ifnet *ifp; struct ifaddr *ifa = NULL; IFNET_RLOCK_NOSLEEP(); ifp = ifnet_byindex_locked(idx); if (ifp != NULL && (ifa = ifp->if_addr) != NULL) ifa_ref(ifa); IFNET_RUNLOCK_NOSLEEP(); return (ifa); } /* * Network interface utility routines. * * Routines with ifa_ifwith* names take sockaddr *'s as * parameters. */ static void vnet_if_init(const void *unused __unused) { TAILQ_INIT(&V_ifnet); TAILQ_INIT(&V_ifg_head); IFNET_WLOCK(); if_grow(); /* create initial table */ IFNET_WUNLOCK(); vnet_if_clone_init(); } VNET_SYSINIT(vnet_if_init, SI_SUB_INIT_IF, SI_ORDER_SECOND, vnet_if_init, NULL); #ifdef VIMAGE static void vnet_if_uninit(const void *unused __unused) { VNET_ASSERT(TAILQ_EMPTY(&V_ifnet), ("%s:%d tailq &V_ifnet=%p " "not empty", __func__, __LINE__, &V_ifnet)); VNET_ASSERT(TAILQ_EMPTY(&V_ifg_head), ("%s:%d tailq &V_ifg_head=%p " "not empty", __func__, __LINE__, &V_ifg_head)); free((caddr_t)V_ifindex_table, M_IFNET); } VNET_SYSUNINIT(vnet_if_uninit, SI_SUB_INIT_IF, SI_ORDER_FIRST, vnet_if_uninit, NULL); static void vnet_if_return(const void *unused __unused) { struct ifnet *ifp, *nifp; /* Return all inherited interfaces to their parent vnets. 
*/ TAILQ_FOREACH_SAFE(ifp, &V_ifnet, if_link, nifp) { if (ifp->if_home_vnet != ifp->if_vnet) if_vmove(ifp, ifp->if_home_vnet); } } VNET_SYSUNINIT(vnet_if_return, SI_SUB_VNET_DONE, SI_ORDER_ANY, vnet_if_return, NULL); #endif static void if_grow(void) { int oldlim; u_int n; struct ifnet **e; IFNET_WLOCK_ASSERT(); oldlim = V_if_indexlim; IFNET_WUNLOCK(); n = (oldlim << 1) * sizeof(*e); e = malloc(n, M_IFNET, M_WAITOK | M_ZERO); IFNET_WLOCK(); if (V_if_indexlim != oldlim) { free(e, M_IFNET); return; } if (V_ifindex_table != NULL) { memcpy((caddr_t)e, (caddr_t)V_ifindex_table, n/2); free((caddr_t)V_ifindex_table, M_IFNET); } V_if_indexlim <<= 1; V_ifindex_table = e; } /* * Allocate a struct ifnet and an index for an interface. A layer 2 * common structure will also be allocated if an allocation routine is * registered for the passed type. */ struct ifnet * if_alloc(u_char type) { struct ifnet *ifp; u_short idx; ifp = malloc(sizeof(struct ifnet), M_IFNET, M_WAITOK|M_ZERO); IFNET_WLOCK(); idx = ifindex_alloc(); ifnet_setbyindex_locked(idx, IFNET_HOLD); IFNET_WUNLOCK(); ifp->if_index = idx; ifp->if_type = type; ifp->if_alloctype = type; #ifdef VIMAGE ifp->if_vnet = curvnet; #endif if (if_com_alloc[type] != NULL) { ifp->if_l2com = if_com_alloc[type](type, ifp); if (ifp->if_l2com == NULL) { free(ifp, M_IFNET); ifindex_free(idx); return (NULL); } } IF_ADDR_LOCK_INIT(ifp); TASK_INIT(&ifp->if_linktask, 0, do_link_state_change, ifp); ifp->if_afdata_initialized = 0; IF_AFDATA_LOCK_INIT(ifp); TAILQ_INIT(&ifp->if_addrhead); TAILQ_INIT(&ifp->if_multiaddrs); TAILQ_INIT(&ifp->if_groups); #ifdef MAC mac_ifnet_init(ifp); #endif ifq_init(&ifp->if_snd, ifp); refcount_init(&ifp->if_refcount, 1); /* Index reference. */ for (int i = 0; i < IFCOUNTERS; i++) ifp->if_counters[i] = counter_u64_alloc(M_WAITOK); ifp->if_get_counter = if_get_counter_default; ifp->if_pcp = IFNET_PCP_NONE; ifnet_setbyindex(ifp->if_index, ifp); return (ifp); } /* * Do the actual work of freeing a struct ifnet, and layer 2 common * structure. This call is made when the last reference to an * interface is released. */ static void if_free_internal(struct ifnet *ifp) { KASSERT((ifp->if_flags & IFF_DYING), ("if_free_internal: interface not dying")); if (if_com_free[ifp->if_alloctype] != NULL) if_com_free[ifp->if_alloctype](ifp->if_l2com, ifp->if_alloctype); #ifdef MAC mac_ifnet_destroy(ifp); #endif /* MAC */ if (ifp->if_description != NULL) free(ifp->if_description, M_IFDESCR); IF_AFDATA_DESTROY(ifp); IF_ADDR_LOCK_DESTROY(ifp); ifq_delete(&ifp->if_snd); for (int i = 0; i < IFCOUNTERS; i++) counter_u64_free(ifp->if_counters[i]); free(ifp, M_IFNET); } /* * Deregister an interface and free the associated storage. */ void if_free(struct ifnet *ifp) { ifp->if_flags |= IFF_DYING; /* XXX: Locking */ CURVNET_SET_QUIET(ifp->if_vnet); IFNET_WLOCK(); KASSERT(ifp == ifnet_byindex_locked(ifp->if_index), ("%s: freeing unallocated ifnet", ifp->if_xname)); ifindex_free_locked(ifp->if_index); IFNET_WUNLOCK(); if (refcount_release(&ifp->if_refcount)) if_free_internal(ifp); CURVNET_RESTORE(); } /* * Interfaces to keep an ifnet type-stable despite the possibility of the * driver calling if_free(). If there are additional references, we defer * freeing the underlying data structure. */ void if_ref(struct ifnet *ifp) { /* We don't assert the ifnet list lock here, but arguably should. 
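 *
 * A hedged usage sketch (names from this file): a caller that needs the
 * ifnet to stay valid beyond the list lock typically does
 *
 *	ifp = ifnet_byindex_ref(idx);	// takes a reference, or NULL
 *	if (ifp != NULL) {
 *		...use ifp, possibly sleeping...
 *		if_rele(ifp);		// drop the reference when done
 *	}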
*/ refcount_acquire(&ifp->if_refcount); } void if_rele(struct ifnet *ifp) { if (!refcount_release(&ifp->if_refcount)) return; if_free_internal(ifp); } void ifq_init(struct ifaltq *ifq, struct ifnet *ifp) { mtx_init(&ifq->ifq_mtx, ifp->if_xname, "if send queue", MTX_DEF); if (ifq->ifq_maxlen == 0) ifq->ifq_maxlen = ifqmaxlen; ifq->altq_type = 0; ifq->altq_disc = NULL; ifq->altq_flags &= ALTQF_CANTCHANGE; ifq->altq_tbr = NULL; ifq->altq_ifp = ifp; } void ifq_delete(struct ifaltq *ifq) { mtx_destroy(&ifq->ifq_mtx); } /* * Perform generic interface initialization tasks and attach the interface * to the list of "active" interfaces. If vmove flag is set on entry * to if_attach_internal(), perform only a limited subset of initialization * tasks, given that we are moving from one vnet to another an ifnet which * has already been fully initialized. * * Note that if_detach_internal() removes group membership unconditionally * even when vmove flag is set, and if_attach_internal() adds only IFG_ALL. * Thus, when if_vmove() is applied to a cloned interface, group membership * is lost while a cloned one always joins a group whose name is * ifc->ifc_name. To recover this after if_detach_internal() and * if_attach_internal(), the cloner should be specified to * if_attach_internal() via ifc. If it is non-NULL, if_attach_internal() * attempts to join a group whose name is ifc->ifc_name. * * XXX: * - The decision to return void and thus require this function to * succeed is questionable. * - We should probably do more sanity checking. For instance we don't * do anything to insure if_xname is unique or non-empty. */ void if_attach(struct ifnet *ifp) { if_attach_internal(ifp, 0, NULL); } /* * Compute the least common TSO limit. */ void if_hw_tsomax_common(if_t ifp, struct ifnet_hw_tsomax *pmax) { /* * 1) If there is no limit currently, take the limit from * the network adapter. * * 2) If the network adapter has a limit below the current * limit, apply it. */ if (pmax->tsomaxbytes == 0 || (ifp->if_hw_tsomax != 0 && ifp->if_hw_tsomax < pmax->tsomaxbytes)) { pmax->tsomaxbytes = ifp->if_hw_tsomax; } if (pmax->tsomaxsegcount == 0 || (ifp->if_hw_tsomaxsegcount != 0 && ifp->if_hw_tsomaxsegcount < pmax->tsomaxsegcount)) { pmax->tsomaxsegcount = ifp->if_hw_tsomaxsegcount; } if (pmax->tsomaxsegsize == 0 || (ifp->if_hw_tsomaxsegsize != 0 && ifp->if_hw_tsomaxsegsize < pmax->tsomaxsegsize)) { pmax->tsomaxsegsize = ifp->if_hw_tsomaxsegsize; } } /* * Update TSO limit of a network adapter. * * Returns zero if no change. Else non-zero. */ int if_hw_tsomax_update(if_t ifp, struct ifnet_hw_tsomax *pmax) { int retval = 0; if (ifp->if_hw_tsomax != pmax->tsomaxbytes) { ifp->if_hw_tsomax = pmax->tsomaxbytes; retval++; } if (ifp->if_hw_tsomaxsegsize != pmax->tsomaxsegsize) { ifp->if_hw_tsomaxsegsize = pmax->tsomaxsegsize; retval++; } if (ifp->if_hw_tsomaxsegcount != pmax->tsomaxsegcount) { ifp->if_hw_tsomaxsegcount = pmax->tsomaxsegcount; retval++; } return (retval); } static void if_attach_internal(struct ifnet *ifp, int vmove, struct if_clone *ifc) { unsigned socksize, ifasize; int namelen, masklen; struct sockaddr_dl *sdl; struct ifaddr *ifa; if (ifp->if_index == 0 || ifp != ifnet_byindex(ifp->if_index)) panic ("%s: BUG: if_attach called without if_alloc'd input()\n", ifp->if_xname); #ifdef VIMAGE ifp->if_vnet = curvnet; if (ifp->if_home_vnet == NULL) ifp->if_home_vnet = curvnet; #endif if_addgroup(ifp, IFG_ALL); /* Restore group membership for cloned interfaces. 
*/ if (vmove && ifc != NULL) if_clone_addgroup(ifp, ifc); getmicrotime(&ifp->if_lastchange); ifp->if_epoch = time_uptime; KASSERT((ifp->if_transmit == NULL && ifp->if_qflush == NULL) || (ifp->if_transmit != NULL && ifp->if_qflush != NULL), ("transmit and qflush must both either be set or both be NULL")); if (ifp->if_transmit == NULL) { ifp->if_transmit = if_transmit; ifp->if_qflush = if_qflush; } if (ifp->if_input == NULL) ifp->if_input = if_input_default; if (ifp->if_requestencap == NULL) ifp->if_requestencap = if_requestencap_default; if (!vmove) { #ifdef MAC mac_ifnet_create(ifp); #endif /* * Create a Link Level name for this device. */ namelen = strlen(ifp->if_xname); /* * Always save enough space for any possiable name so we * can do a rename in place later. */ masklen = offsetof(struct sockaddr_dl, sdl_data[0]) + IFNAMSIZ; socksize = masklen + ifp->if_addrlen; if (socksize < sizeof(*sdl)) socksize = sizeof(*sdl); socksize = roundup2(socksize, sizeof(long)); ifasize = sizeof(*ifa) + 2 * socksize; ifa = ifa_alloc(ifasize, M_WAITOK); sdl = (struct sockaddr_dl *)(ifa + 1); sdl->sdl_len = socksize; sdl->sdl_family = AF_LINK; bcopy(ifp->if_xname, sdl->sdl_data, namelen); sdl->sdl_nlen = namelen; sdl->sdl_index = ifp->if_index; sdl->sdl_type = ifp->if_type; ifp->if_addr = ifa; ifa->ifa_ifp = ifp; ifa->ifa_rtrequest = link_rtrequest; ifa->ifa_addr = (struct sockaddr *)sdl; sdl = (struct sockaddr_dl *)(socksize + (caddr_t)sdl); ifa->ifa_netmask = (struct sockaddr *)sdl; sdl->sdl_len = masklen; while (namelen != 0) sdl->sdl_data[--namelen] = 0xff; TAILQ_INSERT_HEAD(&ifp->if_addrhead, ifa, ifa_link); /* Reliably crash if used uninitialized. */ ifp->if_broadcastaddr = NULL; if (ifp->if_type == IFT_ETHER) { ifp->if_hw_addr = malloc(ifp->if_addrlen, M_IFADDR, M_WAITOK | M_ZERO); } #if defined(INET) || defined(INET6) /* Use defaults for TSO, if nothing is set */ if (ifp->if_hw_tsomax == 0 && ifp->if_hw_tsomaxsegcount == 0 && ifp->if_hw_tsomaxsegsize == 0) { /* * The TSO defaults needs to be such that an * NFS mbuf list of 35 mbufs totalling just * below 64K works and that a chain of mbufs * can be defragged into at most 32 segments: */ ifp->if_hw_tsomax = min(IP_MAXPACKET, (32 * MCLBYTES) - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN)); ifp->if_hw_tsomaxsegcount = 35; ifp->if_hw_tsomaxsegsize = 2048; /* 2K */ /* XXX some drivers set IFCAP_TSO after ethernet attach */ if (ifp->if_capabilities & IFCAP_TSO) { if_printf(ifp, "Using defaults for TSO: %u/%u/%u\n", ifp->if_hw_tsomax, ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize); } } #endif } #ifdef VIMAGE else { /* * Update the interface index in the link layer address * of the interface. */ for (ifa = ifp->if_addr; ifa != NULL; ifa = TAILQ_NEXT(ifa, ifa_link)) { if (ifa->ifa_addr->sa_family == AF_LINK) { sdl = (struct sockaddr_dl *)ifa->ifa_addr; sdl->sdl_index = ifp->if_index; } } } #endif IFNET_WLOCK(); TAILQ_INSERT_TAIL(&V_ifnet, ifp, if_link); #ifdef VIMAGE curvnet->vnet_ifcnt++; #endif IFNET_WUNLOCK(); if (domain_init_status >= 2) if_attachdomain1(ifp); EVENTHANDLER_INVOKE(ifnet_arrival_event, ifp); if (IS_DEFAULT_VNET(curvnet)) devctl_notify("IFNET", ifp->if_xname, "ATTACH", NULL); /* Announce the interface. 
*/ rt_ifannouncemsg(ifp, IFAN_ARRIVAL); } static void if_attachdomain(void *dummy) { struct ifnet *ifp; TAILQ_FOREACH(ifp, &V_ifnet, if_link) if_attachdomain1(ifp); } SYSINIT(domainifattach, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_SECOND, if_attachdomain, NULL); static void if_attachdomain1(struct ifnet *ifp) { struct domain *dp; /* * Since dp->dom_ifattach calls malloc() with M_WAITOK, we * cannot lock ifp->if_afdata initialization, entirely. */ IF_AFDATA_LOCK(ifp); if (ifp->if_afdata_initialized >= domain_init_status) { IF_AFDATA_UNLOCK(ifp); log(LOG_WARNING, "%s called more than once on %s\n", __func__, ifp->if_xname); return; } ifp->if_afdata_initialized = domain_init_status; IF_AFDATA_UNLOCK(ifp); /* address family dependent data region */ bzero(ifp->if_afdata, sizeof(ifp->if_afdata)); for (dp = domains; dp; dp = dp->dom_next) { if (dp->dom_ifattach) ifp->if_afdata[dp->dom_family] = (*dp->dom_ifattach)(ifp); } } /* * Remove any unicast or broadcast network addresses from an interface. */ void if_purgeaddrs(struct ifnet *ifp) { struct ifaddr *ifa, *next; /* XXX cannot hold IF_ADDR_WLOCK over called functions. */ TAILQ_FOREACH_SAFE(ifa, &ifp->if_addrhead, ifa_link, next) { if (ifa->ifa_addr->sa_family == AF_LINK) continue; #ifdef INET /* XXX: Ugly!! ad hoc just for INET */ if (ifa->ifa_addr->sa_family == AF_INET) { struct ifaliasreq ifr; bzero(&ifr, sizeof(ifr)); ifr.ifra_addr = *ifa->ifa_addr; if (ifa->ifa_dstaddr) ifr.ifra_broadaddr = *ifa->ifa_dstaddr; if (in_control(NULL, SIOCDIFADDR, (caddr_t)&ifr, ifp, NULL) == 0) continue; } #endif /* INET */ #ifdef INET6 if (ifa->ifa_addr->sa_family == AF_INET6) { in6_purgeaddr(ifa); /* ifp_addrhead is already updated */ continue; } #endif /* INET6 */ IF_ADDR_WLOCK(ifp); TAILQ_REMOVE(&ifp->if_addrhead, ifa, ifa_link); IF_ADDR_WUNLOCK(ifp); ifa_free(ifa); } } /* * Remove any multicast network addresses from an interface when an ifnet * is going away. */ static void if_purgemaddrs(struct ifnet *ifp) { struct ifmultiaddr *ifma; IF_ADDR_WLOCK(ifp); while (!TAILQ_EMPTY(&ifp->if_multiaddrs)) { ifma = TAILQ_FIRST(&ifp->if_multiaddrs); TAILQ_REMOVE(&ifp->if_multiaddrs, ifma, ifma_link); if_delmulti_locked(ifp, ifma, 1); } IF_ADDR_WUNLOCK(ifp); } /* * Detach an interface, removing it from the list of "active" interfaces. * If vmove flag is set on entry to if_detach_internal(), perform only a * limited subset of cleanup tasks, given that we are moving an ifnet from * one vnet to another, where it must be fully operational. * * XXXRW: There are some significant questions about event ordering, and * how to prevent things from starting to use the interface during detach. */ void if_detach(struct ifnet *ifp) { CURVNET_SET_QUIET(ifp->if_vnet); if_detach_internal(ifp, 0, NULL); CURVNET_RESTORE(); } /* * The vmove flag, if set, indicates that we are called from a callpath * that is moving an interface to a different vnet instance. * * The shutdown flag, if set, indicates that we are called in the * process of shutting down a vnet instance. Currently only the * vnet_if_return SYSUNINIT function sets it. Note: we can be called * on a vnet instance shutdown without this flag being set, e.g., when * the cloned interfaces are destoyed as first thing of teardown. */ static int if_detach_internal(struct ifnet *ifp, int vmove, struct if_clone **ifcp) { struct ifaddr *ifa; int i; struct domain *dp; struct ifnet *iter; int found = 0; #ifdef VIMAGE int shutdown; shutdown = (ifp->if_vnet->vnet_state > SI_SUB_VNET && ifp->if_vnet->vnet_state < SI_SUB_VNET_DONE) ? 
1 : 0; #endif IFNET_WLOCK(); TAILQ_FOREACH(iter, &V_ifnet, if_link) if (iter == ifp) { TAILQ_REMOVE(&V_ifnet, ifp, if_link); found = 1; break; } IFNET_WUNLOCK(); if (!found) { /* * While we would want to panic here, we cannot * guarantee that the interface is indeed still on * the list given we don't hold locks all the way. */ return (ENOENT); #if 0 if (vmove) panic("%s: ifp=%p not on the ifnet tailq %p", __func__, ifp, &V_ifnet); else return; /* XXX this should panic as well? */ #endif } /* * At this point we know the interface still was on the ifnet list * and we removed it so we are in a stable state. */ #ifdef VIMAGE curvnet->vnet_ifcnt--; #endif /* * In any case (destroy or vmove) detach us from the groups * and remove/wait for pending events on the taskq. * XXX-BZ in theory an interface could still enqueue a taskq change? */ if_delgroups(ifp); taskqueue_drain(taskqueue_swi, &ifp->if_linktask); /* * Check if this is a cloned interface or not. Must do even if * shutting down as a if_vmove_reclaim() would move the ifp and * the if_clone_addgroup() will have a corrupted string overwise * from a gibberish pointer. */ if (vmove && ifcp != NULL) *ifcp = if_clone_findifc(ifp); if_down(ifp); #ifdef VIMAGE /* * On VNET shutdown abort here as the stack teardown will do all * the work top-down for us. */ if (shutdown) { /* * In case of a vmove we are done here without error. * If we would signal an error it would lead to the same * abort as if we did not find the ifnet anymore. * if_detach() calls us in void context and does not care * about an early abort notification, so life is splendid :) */ goto finish_vnet_shutdown; } #endif /* * At this point we are not tearing down a VNET and are either * going to destroy or vmove the interface and have to cleanup * accordingly. */ /* * Remove routes and flush queues. */ #ifdef ALTQ if (ALTQ_IS_ENABLED(&ifp->if_snd)) altq_disable(&ifp->if_snd); if (ALTQ_IS_ATTACHED(&ifp->if_snd)) altq_detach(&ifp->if_snd); #endif if_purgeaddrs(ifp); #ifdef INET in_ifdetach(ifp); #endif #ifdef INET6 /* * Remove all IPv6 kernel structs related to ifp. This should be done * before removing routing entries below, since IPv6 interface direct * routes are expected to be removed by the IPv6-specific kernel API. * Otherwise, the kernel will detect some inconsistency and bark it. */ in6_ifdetach(ifp); #endif if_purgemaddrs(ifp); /* Announce that the interface is gone. */ rt_ifannouncemsg(ifp, IFAN_DEPARTURE); EVENTHANDLER_INVOKE(ifnet_departure_event, ifp); if (IS_DEFAULT_VNET(curvnet)) devctl_notify("IFNET", ifp->if_xname, "DETACH", NULL); if (!vmove) { /* * Prevent further calls into the device driver via ifnet. */ if_dead(ifp); /* * Remove link ifaddr pointer and maybe decrement if_index. * Clean up all addresses. */ free(ifp->if_hw_addr, M_IFADDR); ifp->if_hw_addr = NULL; ifp->if_addr = NULL; /* We can now free link ifaddr. */ IF_ADDR_WLOCK(ifp); if (!TAILQ_EMPTY(&ifp->if_addrhead)) { ifa = TAILQ_FIRST(&ifp->if_addrhead); TAILQ_REMOVE(&ifp->if_addrhead, ifa, ifa_link); IF_ADDR_WUNLOCK(ifp); ifa_free(ifa); } else IF_ADDR_WUNLOCK(ifp); } rt_flushifroutes(ifp); #ifdef VIMAGE finish_vnet_shutdown: #endif /* * We cannot hold the lock over dom_ifdetach calls as they might * sleep, for example trying to drain a callout, thus open up the * theoretical race with re-attaching. 
*/ IF_AFDATA_LOCK(ifp); i = ifp->if_afdata_initialized; ifp->if_afdata_initialized = 0; IF_AFDATA_UNLOCK(ifp); for (dp = domains; i > 0 && dp; dp = dp->dom_next) { if (dp->dom_ifdetach && ifp->if_afdata[dp->dom_family]) { (*dp->dom_ifdetach)(ifp, ifp->if_afdata[dp->dom_family]); ifp->if_afdata[dp->dom_family] = NULL; } } return (0); } #ifdef VIMAGE /* * if_vmove() performs a limited version of if_detach() in current * vnet and if_attach()es the ifnet to the vnet specified as 2nd arg. * An attempt is made to shrink if_index in current vnet, find an * unused if_index in target vnet and calls if_grow() if necessary, * and finally find an unused if_xname for the target vnet. */ static void if_vmove(struct ifnet *ifp, struct vnet *new_vnet) { struct if_clone *ifc; u_int bif_dlt, bif_hdrlen; int rc; /* * if_detach_internal() will call the eventhandler to notify * interface departure. That will detach if_bpf. We need to * safe the dlt and hdrlen so we can re-attach it later. */ bpf_get_bp_params(ifp->if_bpf, &bif_dlt, &bif_hdrlen); /* * Detach from current vnet, but preserve LLADDR info, do not * mark as dead etc. so that the ifnet can be reattached later. * If we cannot find it, we lost the race to someone else. */ rc = if_detach_internal(ifp, 1, &ifc); if (rc != 0) return; /* * Unlink the ifnet from ifindex_table[] in current vnet, and shrink * the if_index for that vnet if possible. * * NOTE: IFNET_WLOCK/IFNET_WUNLOCK() are assumed to be unvirtualized, * or we'd lock on one vnet and unlock on another. */ IFNET_WLOCK(); ifindex_free_locked(ifp->if_index); IFNET_WUNLOCK(); /* * Perform interface-specific reassignment tasks, if provided by * the driver. */ if (ifp->if_reassign != NULL) ifp->if_reassign(ifp, new_vnet, NULL); /* * Switch to the context of the target vnet. */ CURVNET_SET_QUIET(new_vnet); IFNET_WLOCK(); ifp->if_index = ifindex_alloc(); ifnet_setbyindex_locked(ifp->if_index, ifp); IFNET_WUNLOCK(); if_attach_internal(ifp, 1, ifc); if (ifp->if_bpf == NULL) bpfattach(ifp, bif_dlt, bif_hdrlen); CURVNET_RESTORE(); } /* * Move an ifnet to or from another child prison/vnet, specified by the jail id. */ static int if_vmove_loan(struct thread *td, struct ifnet *ifp, char *ifname, int jid) { struct prison *pr; struct ifnet *difp; int shutdown; /* Try to find the prison within our visibility. */ sx_slock(&allprison_lock); pr = prison_find_child(td->td_ucred->cr_prison, jid); sx_sunlock(&allprison_lock); if (pr == NULL) return (ENXIO); prison_hold_locked(pr); mtx_unlock(&pr->pr_mtx); /* Do not try to move the iface from and to the same prison. */ if (pr->pr_vnet == ifp->if_vnet) { prison_free(pr); return (EEXIST); } /* Make sure the named iface does not exists in the dst. prison/vnet. */ /* XXX Lock interfaces to avoid races. */ CURVNET_SET_QUIET(pr->pr_vnet); difp = ifunit(ifname); if (difp != NULL) { CURVNET_RESTORE(); prison_free(pr); return (EEXIST); } /* Make sure the VNET is stable. */ shutdown = (ifp->if_vnet->vnet_state > SI_SUB_VNET && ifp->if_vnet->vnet_state < SI_SUB_VNET_DONE) ? 1 : 0; if (shutdown) { CURVNET_RESTORE(); prison_free(pr); return (EBUSY); } CURVNET_RESTORE(); /* Move the interface into the child jail/vnet. */ if_vmove(ifp, pr->pr_vnet); /* Report the new if_xname back to the userland. */ sprintf(ifname, "%s", ifp->if_xname); prison_free(pr); return (0); } static int if_vmove_reclaim(struct thread *td, char *ifname, int jid) { struct prison *pr; struct vnet *vnet_dst; struct ifnet *ifp; int shutdown; /* Try to find the prison within our visibility. 
*/ sx_slock(&allprison_lock); pr = prison_find_child(td->td_ucred->cr_prison, jid); sx_sunlock(&allprison_lock); if (pr == NULL) return (ENXIO); prison_hold_locked(pr); mtx_unlock(&pr->pr_mtx); /* Make sure the named iface exists in the source prison/vnet. */ CURVNET_SET(pr->pr_vnet); ifp = ifunit(ifname); /* XXX Lock to avoid races. */ if (ifp == NULL) { CURVNET_RESTORE(); prison_free(pr); return (ENXIO); } /* Do not try to move the iface from and to the same prison. */ vnet_dst = TD_TO_VNET(td); if (vnet_dst == ifp->if_vnet) { CURVNET_RESTORE(); prison_free(pr); return (EEXIST); } /* Make sure the VNET is stable. */ shutdown = (ifp->if_vnet->vnet_state > SI_SUB_VNET && ifp->if_vnet->vnet_state < SI_SUB_VNET_DONE) ? 1 : 0; if (shutdown) { CURVNET_RESTORE(); prison_free(pr); return (EBUSY); } /* Get interface back from child jail/vnet. */ if_vmove(ifp, vnet_dst); CURVNET_RESTORE(); /* Report the new if_xname back to the userland. */ sprintf(ifname, "%s", ifp->if_xname); prison_free(pr); return (0); } #endif /* VIMAGE */ /* * Add a group to an interface */ int if_addgroup(struct ifnet *ifp, const char *groupname) { struct ifg_list *ifgl; struct ifg_group *ifg = NULL; struct ifg_member *ifgm; int new = 0; if (groupname[0] && groupname[strlen(groupname) - 1] >= '0' && groupname[strlen(groupname) - 1] <= '9') return (EINVAL); IFNET_WLOCK(); TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) if (!strcmp(ifgl->ifgl_group->ifg_group, groupname)) { IFNET_WUNLOCK(); return (EEXIST); } if ((ifgl = (struct ifg_list *)malloc(sizeof(struct ifg_list), M_TEMP, M_NOWAIT)) == NULL) { IFNET_WUNLOCK(); return (ENOMEM); } if ((ifgm = (struct ifg_member *)malloc(sizeof(struct ifg_member), M_TEMP, M_NOWAIT)) == NULL) { free(ifgl, M_TEMP); IFNET_WUNLOCK(); return (ENOMEM); } TAILQ_FOREACH(ifg, &V_ifg_head, ifg_next) if (!strcmp(ifg->ifg_group, groupname)) break; if (ifg == NULL) { if ((ifg = (struct ifg_group *)malloc(sizeof(struct ifg_group), M_TEMP, M_NOWAIT)) == NULL) { free(ifgl, M_TEMP); free(ifgm, M_TEMP); IFNET_WUNLOCK(); return (ENOMEM); } strlcpy(ifg->ifg_group, groupname, sizeof(ifg->ifg_group)); ifg->ifg_refcnt = 0; TAILQ_INIT(&ifg->ifg_members); TAILQ_INSERT_TAIL(&V_ifg_head, ifg, ifg_next); new = 1; } ifg->ifg_refcnt++; ifgl->ifgl_group = ifg; ifgm->ifgm_ifp = ifp; IF_ADDR_WLOCK(ifp); TAILQ_INSERT_TAIL(&ifg->ifg_members, ifgm, ifgm_next); TAILQ_INSERT_TAIL(&ifp->if_groups, ifgl, ifgl_next); IF_ADDR_WUNLOCK(ifp); IFNET_WUNLOCK(); if (new) EVENTHANDLER_INVOKE(group_attach_event, ifg); EVENTHANDLER_INVOKE(group_change_event, groupname); return (0); } /* * Remove a group from an interface */ int if_delgroup(struct ifnet *ifp, const char *groupname) { struct ifg_list *ifgl; struct ifg_member *ifgm; IFNET_WLOCK(); TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) if (!strcmp(ifgl->ifgl_group->ifg_group, groupname)) break; if (ifgl == NULL) { IFNET_WUNLOCK(); return (ENOENT); } IF_ADDR_WLOCK(ifp); TAILQ_REMOVE(&ifp->if_groups, ifgl, ifgl_next); IF_ADDR_WUNLOCK(ifp); TAILQ_FOREACH(ifgm, &ifgl->ifgl_group->ifg_members, ifgm_next) if (ifgm->ifgm_ifp == ifp) break; if (ifgm != NULL) { TAILQ_REMOVE(&ifgl->ifgl_group->ifg_members, ifgm, ifgm_next); free(ifgm, M_TEMP); } if (--ifgl->ifgl_group->ifg_refcnt == 0) { TAILQ_REMOVE(&V_ifg_head, ifgl->ifgl_group, ifg_next); IFNET_WUNLOCK(); EVENTHANDLER_INVOKE(group_detach_event, ifgl->ifgl_group); free(ifgl->ifgl_group, M_TEMP); } else IFNET_WUNLOCK(); free(ifgl, M_TEMP); EVENTHANDLER_INVOKE(group_change_event, groupname); return (0); } /* * Remove an interface from all 
groups */ static void if_delgroups(struct ifnet *ifp) { struct ifg_list *ifgl; struct ifg_member *ifgm; char groupname[IFNAMSIZ]; IFNET_WLOCK(); while (!TAILQ_EMPTY(&ifp->if_groups)) { ifgl = TAILQ_FIRST(&ifp->if_groups); strlcpy(groupname, ifgl->ifgl_group->ifg_group, IFNAMSIZ); IF_ADDR_WLOCK(ifp); TAILQ_REMOVE(&ifp->if_groups, ifgl, ifgl_next); IF_ADDR_WUNLOCK(ifp); TAILQ_FOREACH(ifgm, &ifgl->ifgl_group->ifg_members, ifgm_next) if (ifgm->ifgm_ifp == ifp) break; if (ifgm != NULL) { TAILQ_REMOVE(&ifgl->ifgl_group->ifg_members, ifgm, ifgm_next); free(ifgm, M_TEMP); } if (--ifgl->ifgl_group->ifg_refcnt == 0) { TAILQ_REMOVE(&V_ifg_head, ifgl->ifgl_group, ifg_next); IFNET_WUNLOCK(); EVENTHANDLER_INVOKE(group_detach_event, ifgl->ifgl_group); free(ifgl->ifgl_group, M_TEMP); } else IFNET_WUNLOCK(); free(ifgl, M_TEMP); EVENTHANDLER_INVOKE(group_change_event, groupname); IFNET_WLOCK(); } IFNET_WUNLOCK(); } static char * ifgr_group_get(void *ifgrp) { union ifgroupreq_union *ifgrup; ifgrup = ifgrp; #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) return (&ifgrup->ifgr32.ifgr_ifgru.ifgru_group[0]); #endif return (&ifgrup->ifgr.ifgr_ifgru.ifgru_group[0]); } static struct ifg_req * ifgr_groups_get(void *ifgrp) { union ifgroupreq_union *ifgrup; ifgrup = ifgrp; #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) return ((struct ifg_req *)(uintptr_t) ifgrup->ifgr32.ifgr_ifgru.ifgru_groups); #endif return (ifgrup->ifgr.ifgr_ifgru.ifgru_groups); } /* * Stores all groups from an interface in memory pointed to by ifgr. */ static int if_getgroup(struct ifgroupreq *ifgr, struct ifnet *ifp) { int len, error; struct ifg_list *ifgl; struct ifg_req ifgrq, *ifgp; if (ifgr->ifgr_len == 0) { IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) ifgr->ifgr_len += sizeof(struct ifg_req); IF_ADDR_RUNLOCK(ifp); return (0); } len = ifgr->ifgr_len; ifgp = ifgr_groups_get(ifgr); /* XXX: wire */ IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) { if (len < sizeof(ifgrq)) { IF_ADDR_RUNLOCK(ifp); return (EINVAL); } bzero(&ifgrq, sizeof ifgrq); strlcpy(ifgrq.ifgrq_group, ifgl->ifgl_group->ifg_group, sizeof(ifgrq.ifgrq_group)); if ((error = copyout(&ifgrq, ifgp, sizeof(struct ifg_req)))) { IF_ADDR_RUNLOCK(ifp); return (error); } len -= sizeof(ifgrq); ifgp++; } IF_ADDR_RUNLOCK(ifp); return (0); } /* * Stores all members of a group in memory pointed to by igfr */ static int if_getgroupmembers(struct ifgroupreq *ifgr) { struct ifg_group *ifg; struct ifg_member *ifgm; struct ifg_req ifgrq, *ifgp; int len, error; IFNET_RLOCK(); TAILQ_FOREACH(ifg, &V_ifg_head, ifg_next) if (!strcmp(ifg->ifg_group, ifgr->ifgr_name)) break; if (ifg == NULL) { IFNET_RUNLOCK(); return (ENOENT); } if (ifgr->ifgr_len == 0) { TAILQ_FOREACH(ifgm, &ifg->ifg_members, ifgm_next) ifgr->ifgr_len += sizeof(ifgrq); IFNET_RUNLOCK(); return (0); } len = ifgr->ifgr_len; ifgp = ifgr_groups_get(ifgr); TAILQ_FOREACH(ifgm, &ifg->ifg_members, ifgm_next) { if (len < sizeof(ifgrq)) { IFNET_RUNLOCK(); return (EINVAL); } bzero(&ifgrq, sizeof ifgrq); strlcpy(ifgrq.ifgrq_member, ifgm->ifgm_ifp->if_xname, sizeof(ifgrq.ifgrq_member)); if ((error = copyout(&ifgrq, ifgp, sizeof(struct ifg_req)))) { IFNET_RUNLOCK(); return (error); } len -= sizeof(ifgrq); ifgp++; } IFNET_RUNLOCK(); return (0); } /* * Return counter values from counter(9)s stored in ifnet. 
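 *
 * Loose sketch of how these are used (both helpers are defined just
 * below): a driver or the stack bumps a shared counter with
 *
 *	if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
 *
 * and the accumulated value is read back through ifp->if_get_counter(),
 * which defaults to if_get_counter_default().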
*/ uint64_t if_get_counter_default(struct ifnet *ifp, ift_counter cnt) { KASSERT(cnt < IFCOUNTERS, ("%s: invalid cnt %d", __func__, cnt)); return (counter_u64_fetch(ifp->if_counters[cnt])); } /* * Increase an ifnet counter. Usually used for counters shared * between the stack and a driver, but function supports them all. */ void if_inc_counter(struct ifnet *ifp, ift_counter cnt, int64_t inc) { KASSERT(cnt < IFCOUNTERS, ("%s: invalid cnt %d", __func__, cnt)); counter_u64_add(ifp->if_counters[cnt], inc); } /* * Copy data from ifnet to userland API structure if_data. */ void if_data_copy(struct ifnet *ifp, struct if_data *ifd) { ifd->ifi_type = ifp->if_type; ifd->ifi_physical = 0; ifd->ifi_addrlen = ifp->if_addrlen; ifd->ifi_hdrlen = ifp->if_hdrlen; ifd->ifi_link_state = ifp->if_link_state; ifd->ifi_vhid = 0; ifd->ifi_datalen = sizeof(struct if_data); ifd->ifi_mtu = ifp->if_mtu; ifd->ifi_metric = ifp->if_metric; ifd->ifi_baudrate = ifp->if_baudrate; ifd->ifi_hwassist = ifp->if_hwassist; ifd->ifi_epoch = ifp->if_epoch; ifd->ifi_lastchange = ifp->if_lastchange; ifd->ifi_ipackets = ifp->if_get_counter(ifp, IFCOUNTER_IPACKETS); ifd->ifi_ierrors = ifp->if_get_counter(ifp, IFCOUNTER_IERRORS); ifd->ifi_opackets = ifp->if_get_counter(ifp, IFCOUNTER_OPACKETS); ifd->ifi_oerrors = ifp->if_get_counter(ifp, IFCOUNTER_OERRORS); ifd->ifi_collisions = ifp->if_get_counter(ifp, IFCOUNTER_COLLISIONS); ifd->ifi_ibytes = ifp->if_get_counter(ifp, IFCOUNTER_IBYTES); ifd->ifi_obytes = ifp->if_get_counter(ifp, IFCOUNTER_OBYTES); ifd->ifi_imcasts = ifp->if_get_counter(ifp, IFCOUNTER_IMCASTS); ifd->ifi_omcasts = ifp->if_get_counter(ifp, IFCOUNTER_OMCASTS); ifd->ifi_iqdrops = ifp->if_get_counter(ifp, IFCOUNTER_IQDROPS); ifd->ifi_oqdrops = ifp->if_get_counter(ifp, IFCOUNTER_OQDROPS); ifd->ifi_noproto = ifp->if_get_counter(ifp, IFCOUNTER_NOPROTO); } /* * Wrapper functions for struct ifnet address list locking macros. These are * used by kernel modules to avoid encoding programming interface or binary * interface assumptions that may be violated when kernel-internal locking * approaches change. */ void if_addr_rlock(struct ifnet *ifp) { IF_ADDR_RLOCK(ifp); } void if_addr_runlock(struct ifnet *ifp) { IF_ADDR_RUNLOCK(ifp); } void if_maddr_rlock(if_t ifp) { IF_ADDR_RLOCK((struct ifnet *)ifp); } void if_maddr_runlock(if_t ifp) { IF_ADDR_RUNLOCK((struct ifnet *)ifp); } /* * Initialization, destruction and refcounting functions for ifaddrs. 
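 *
 * Rough lifecycle sketch (the functions are defined right below):
 *
 *	ifa = ifa_alloc(sizeof(*ifa), M_WAITOK);	// refcount starts at 1
 *	ifa_ref(ifa);					// per additional holder
 *	ifa_free(ifa);					// each holder releases;
 *							// the last release frees it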
*/ struct ifaddr * ifa_alloc(size_t size, int flags) { struct ifaddr *ifa; KASSERT(size >= sizeof(struct ifaddr), ("%s: invalid size %zu", __func__, size)); ifa = malloc(size, M_IFADDR, M_ZERO | flags); if (ifa == NULL) return (NULL); if ((ifa->ifa_opackets = counter_u64_alloc(flags)) == NULL) goto fail; if ((ifa->ifa_ipackets = counter_u64_alloc(flags)) == NULL) goto fail; if ((ifa->ifa_obytes = counter_u64_alloc(flags)) == NULL) goto fail; if ((ifa->ifa_ibytes = counter_u64_alloc(flags)) == NULL) goto fail; refcount_init(&ifa->ifa_refcnt, 1); return (ifa); fail: /* free(NULL) is okay */ counter_u64_free(ifa->ifa_opackets); counter_u64_free(ifa->ifa_ipackets); counter_u64_free(ifa->ifa_obytes); counter_u64_free(ifa->ifa_ibytes); free(ifa, M_IFADDR); return (NULL); } void ifa_ref(struct ifaddr *ifa) { refcount_acquire(&ifa->ifa_refcnt); } void ifa_free(struct ifaddr *ifa) { if (refcount_release(&ifa->ifa_refcnt)) { counter_u64_free(ifa->ifa_opackets); counter_u64_free(ifa->ifa_ipackets); counter_u64_free(ifa->ifa_obytes); counter_u64_free(ifa->ifa_ibytes); free(ifa, M_IFADDR); } } static int ifa_maintain_loopback_route(int cmd, const char *otype, struct ifaddr *ifa, struct sockaddr *ia) { int error; struct rt_addrinfo info; struct sockaddr_dl null_sdl; struct ifnet *ifp; ifp = ifa->ifa_ifp; bzero(&info, sizeof(info)); if (cmd != RTM_DELETE) info.rti_ifp = V_loif; info.rti_flags = ifa->ifa_flags | RTF_HOST | RTF_STATIC | RTF_PINNED; info.rti_info[RTAX_DST] = ia; info.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&null_sdl; link_init_sdl(ifp, (struct sockaddr *)&null_sdl, ifp->if_type); error = rtrequest1_fib(cmd, &info, NULL, ifp->if_fib); if (error != 0) log(LOG_DEBUG, "%s: %s failed for interface %s: %u\n", __func__, otype, if_name(ifp), error); return (error); } int ifa_add_loopback_route(struct ifaddr *ifa, struct sockaddr *ia) { return (ifa_maintain_loopback_route(RTM_ADD, "insertion", ifa, ia)); } int ifa_del_loopback_route(struct ifaddr *ifa, struct sockaddr *ia) { return (ifa_maintain_loopback_route(RTM_DELETE, "deletion", ifa, ia)); } int ifa_switch_loopback_route(struct ifaddr *ifa, struct sockaddr *ia) { return (ifa_maintain_loopback_route(RTM_CHANGE, "switch", ifa, ia)); } /* * XXX: Because sockaddr_dl has deeper structure than the sockaddr * structs used to represent other address families, it is necessary * to perform a different comparison. */ #define sa_dl_equal(a1, a2) \ ((((const struct sockaddr_dl *)(a1))->sdl_len == \ ((const struct sockaddr_dl *)(a2))->sdl_len) && \ (bcmp(CLLADDR((const struct sockaddr_dl *)(a1)), \ CLLADDR((const struct sockaddr_dl *)(a2)), \ ((const struct sockaddr_dl *)(a1))->sdl_alen) == 0)) /* * Locate an interface based on a complete address. 
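 *
 * Callers get back a referenced ifaddr (or NULL) and must drop it, e.g.:
 *
 *	ifa = ifa_ifwithaddr(addr);
 *	if (ifa != NULL) {
 *		...use ifa->ifa_ifp, ifa->ifa_addr...
 *		ifa_free(ifa);
 *	}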
*/ /*ARGSUSED*/ static struct ifaddr * ifa_ifwithaddr_internal(const struct sockaddr *addr, int getref) { struct ifnet *ifp; struct ifaddr *ifa; IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != addr->sa_family) continue; if (sa_equal(addr, ifa->ifa_addr)) { if (getref) ifa_ref(ifa); IF_ADDR_RUNLOCK(ifp); goto done; } /* IP6 doesn't have broadcast */ if ((ifp->if_flags & IFF_BROADCAST) && ifa->ifa_broadaddr && ifa->ifa_broadaddr->sa_len != 0 && sa_equal(ifa->ifa_broadaddr, addr)) { if (getref) ifa_ref(ifa); IF_ADDR_RUNLOCK(ifp); goto done; } } IF_ADDR_RUNLOCK(ifp); } ifa = NULL; done: IFNET_RUNLOCK_NOSLEEP(); return (ifa); } struct ifaddr * ifa_ifwithaddr(const struct sockaddr *addr) { return (ifa_ifwithaddr_internal(addr, 1)); } int ifa_ifwithaddr_check(const struct sockaddr *addr) { return (ifa_ifwithaddr_internal(addr, 0) != NULL); } /* * Locate an interface based on the broadcast address. */ /* ARGSUSED */ struct ifaddr * ifa_ifwithbroadaddr(const struct sockaddr *addr, int fibnum) { struct ifnet *ifp; struct ifaddr *ifa; IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { if ((fibnum != RT_ALL_FIBS) && (ifp->if_fib != fibnum)) continue; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != addr->sa_family) continue; if ((ifp->if_flags & IFF_BROADCAST) && ifa->ifa_broadaddr && ifa->ifa_broadaddr->sa_len != 0 && sa_equal(ifa->ifa_broadaddr, addr)) { ifa_ref(ifa); IF_ADDR_RUNLOCK(ifp); goto done; } } IF_ADDR_RUNLOCK(ifp); } ifa = NULL; done: IFNET_RUNLOCK_NOSLEEP(); return (ifa); } /* * Locate the point to point interface with a given destination address. */ /*ARGSUSED*/ struct ifaddr * ifa_ifwithdstaddr(const struct sockaddr *addr, int fibnum) { struct ifnet *ifp; struct ifaddr *ifa; IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { if ((ifp->if_flags & IFF_POINTOPOINT) == 0) continue; if ((fibnum != RT_ALL_FIBS) && (ifp->if_fib != fibnum)) continue; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != addr->sa_family) continue; if (ifa->ifa_dstaddr != NULL && sa_equal(addr, ifa->ifa_dstaddr)) { ifa_ref(ifa); IF_ADDR_RUNLOCK(ifp); goto done; } } IF_ADDR_RUNLOCK(ifp); } ifa = NULL; done: IFNET_RUNLOCK_NOSLEEP(); return (ifa); } /* * Find an interface on a specific network. If many, choice * is most specific found. */ struct ifaddr * ifa_ifwithnet(const struct sockaddr *addr, int ignore_ptp, int fibnum) { struct ifnet *ifp; struct ifaddr *ifa; struct ifaddr *ifa_maybe = NULL; u_int af = addr->sa_family; const char *addr_data = addr->sa_data, *cplim; /* * AF_LINK addresses can be looked up directly by their index number, * so do that if we can. */ if (af == AF_LINK) { const struct sockaddr_dl *sdl = (const struct sockaddr_dl *)addr; if (sdl->sdl_index && sdl->sdl_index <= V_if_index) return (ifaddr_byindex(sdl->sdl_index)); } /* * Scan though each interface, looking for ones that have addresses * in this address family and the requested fib. Maintain a reference * on ifa_maybe once we find one, as we release the IF_ADDR_RLOCK() that * kept it stable when we move onto the next interface. 
*/ IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { if ((fibnum != RT_ALL_FIBS) && (ifp->if_fib != fibnum)) continue; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { const char *cp, *cp2, *cp3; if (ifa->ifa_addr->sa_family != af) next: continue; if (af == AF_INET && ifp->if_flags & IFF_POINTOPOINT && !ignore_ptp) { /* * This is a bit broken as it doesn't * take into account that the remote end may * be a single node in the network we are * looking for. * The trouble is that we don't know the * netmask for the remote end. */ if (ifa->ifa_dstaddr != NULL && sa_equal(addr, ifa->ifa_dstaddr)) { ifa_ref(ifa); IF_ADDR_RUNLOCK(ifp); goto done; } } else { /* * Scan all the bits in the ifa's address. * If a bit dissagrees with what we are * looking for, mask it with the netmask * to see if it really matters. * (A byte at a time) */ if (ifa->ifa_netmask == 0) continue; cp = addr_data; cp2 = ifa->ifa_addr->sa_data; cp3 = ifa->ifa_netmask->sa_data; cplim = ifa->ifa_netmask->sa_len + (char *)ifa->ifa_netmask; while (cp3 < cplim) if ((*cp++ ^ *cp2++) & *cp3++) goto next; /* next address! */ /* * If the netmask of what we just found * is more specific than what we had before * (if we had one), or if the virtual status * of new prefix is better than of the old one, * then remember the new one before continuing * to search for an even better one. */ if (ifa_maybe == NULL || ifa_preferred(ifa_maybe, ifa) || rn_refines((caddr_t)ifa->ifa_netmask, (caddr_t)ifa_maybe->ifa_netmask)) { if (ifa_maybe != NULL) ifa_free(ifa_maybe); ifa_maybe = ifa; ifa_ref(ifa_maybe); } } } IF_ADDR_RUNLOCK(ifp); } ifa = ifa_maybe; ifa_maybe = NULL; done: IFNET_RUNLOCK_NOSLEEP(); if (ifa_maybe != NULL) ifa_free(ifa_maybe); return (ifa); } /* * Find an interface address specific to an interface best matching * a given address. */ struct ifaddr * ifaof_ifpforaddr(const struct sockaddr *addr, struct ifnet *ifp) { struct ifaddr *ifa; const char *cp, *cp2, *cp3; char *cplim; struct ifaddr *ifa_maybe = NULL; u_int af = addr->sa_family; if (af >= AF_MAX) return (NULL); IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != af) continue; if (ifa_maybe == NULL) ifa_maybe = ifa; if (ifa->ifa_netmask == 0) { if (sa_equal(addr, ifa->ifa_addr) || (ifa->ifa_dstaddr && sa_equal(addr, ifa->ifa_dstaddr))) goto done; continue; } if (ifp->if_flags & IFF_POINTOPOINT) { if (sa_equal(addr, ifa->ifa_dstaddr)) goto done; } else { cp = addr->sa_data; cp2 = ifa->ifa_addr->sa_data; cp3 = ifa->ifa_netmask->sa_data; cplim = ifa->ifa_netmask->sa_len + (char *)ifa->ifa_netmask; for (; cp3 < cplim; cp3++) if ((*cp++ ^ *cp2++) & *cp3) break; if (cp3 == cplim) goto done; } } ifa = ifa_maybe; done: if (ifa != NULL) ifa_ref(ifa); IF_ADDR_RUNLOCK(ifp); return (ifa); } /* * See whether new ifa is better than current one: * 1) A non-virtual one is preferred over virtual. * 2) A virtual in master state preferred over any other state. * * Used in several address selecting functions. */ int ifa_preferred(struct ifaddr *cur, struct ifaddr *next) { return (cur->ifa_carp && (!next->ifa_carp || ((*carp_master_p)(next) && !(*carp_master_p)(cur)))); } #include /* * Default action when installing a route with a Link Level gateway. * Lookup an appropriate real ifa to point to. * This should be moved to /sys/net/link.c eventually. 
*/ static void link_rtrequest(int cmd, struct rtentry *rt, struct rt_addrinfo *info) { struct ifaddr *ifa, *oifa; struct sockaddr *dst; struct ifnet *ifp; if (cmd != RTM_ADD || ((ifa = rt->rt_ifa) == NULL) || ((ifp = ifa->ifa_ifp) == NULL) || ((dst = rt_key(rt)) == NULL)) return; ifa = ifaof_ifpforaddr(dst, ifp); if (ifa) { oifa = rt->rt_ifa; rt->rt_ifa = ifa; ifa_free(oifa); if (ifa->ifa_rtrequest && ifa->ifa_rtrequest != link_rtrequest) ifa->ifa_rtrequest(cmd, rt, info); } } struct sockaddr_dl * link_alloc_sdl(size_t size, int flags) { return (malloc(size, M_TEMP, flags)); } void link_free_sdl(struct sockaddr *sa) { free(sa, M_TEMP); } /* * Fills in given sdl with interface basic info. * Returns pointer to filled sdl. */ struct sockaddr_dl * link_init_sdl(struct ifnet *ifp, struct sockaddr *paddr, u_char iftype) { struct sockaddr_dl *sdl; sdl = (struct sockaddr_dl *)paddr; memset(sdl, 0, sizeof(struct sockaddr_dl)); sdl->sdl_len = sizeof(struct sockaddr_dl); sdl->sdl_family = AF_LINK; sdl->sdl_index = ifp->if_index; sdl->sdl_type = iftype; return (sdl); } /* * Mark an interface down and notify protocols of * the transition. */ static void if_unroute(struct ifnet *ifp, int flag, int fam) { struct ifaddr *ifa; KASSERT(flag == IFF_UP, ("if_unroute: flag != IFF_UP")); ifp->if_flags &= ~flag; getmicrotime(&ifp->if_lastchange); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) if (fam == PF_UNSPEC || (fam == ifa->ifa_addr->sa_family)) pfctlinput(PRC_IFDOWN, ifa->ifa_addr); ifp->if_qflush(ifp); if (ifp->if_carp) (*carp_linkstate_p)(ifp); rt_ifmsg(ifp); } /* * Mark an interface up and notify protocols of * the transition. */ static void if_route(struct ifnet *ifp, int flag, int fam) { struct ifaddr *ifa; KASSERT(flag == IFF_UP, ("if_route: flag != IFF_UP")); ifp->if_flags |= flag; getmicrotime(&ifp->if_lastchange); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) if (fam == PF_UNSPEC || (fam == ifa->ifa_addr->sa_family)) pfctlinput(PRC_IFUP, ifa->ifa_addr); if (ifp->if_carp) (*carp_linkstate_p)(ifp); rt_ifmsg(ifp); #ifdef INET6 in6_if_up(ifp); #endif } void (*vlan_link_state_p)(struct ifnet *); /* XXX: private from if_vlan */ void (*vlan_trunk_cap_p)(struct ifnet *); /* XXX: private from if_vlan */ struct ifnet *(*vlan_trunkdev_p)(struct ifnet *); struct ifnet *(*vlan_devat_p)(struct ifnet *, uint16_t); int (*vlan_tag_p)(struct ifnet *, uint16_t *); int (*vlan_setcookie_p)(struct ifnet *, void *); void *(*vlan_cookie_p)(struct ifnet *); /* * Handle a change in the interface link state. To avoid LORs * between driver lock and upper layer locks, as well as possible * recursions, we post event to taskqueue, and all job * is done in static do_link_state_change(). */ void if_link_state_change(struct ifnet *ifp, int link_state) { /* Return if state hasn't changed. */ if (ifp->if_link_state == link_state) return; ifp->if_link_state = link_state; taskqueue_enqueue(taskqueue_swi, &ifp->if_linktask); } static void do_link_state_change(void *arg, int pending) { struct ifnet *ifp = (struct ifnet *)arg; int link_state = ifp->if_link_state; CURVNET_SET(ifp->if_vnet); /* Notify that the link state has changed. 
*/ rt_ifmsg(ifp); if (ifp->if_vlantrunk != NULL) (*vlan_link_state_p)(ifp); if ((ifp->if_type == IFT_ETHER || ifp->if_type == IFT_L2VLAN) && ifp->if_l2com != NULL) (*ng_ether_link_state_p)(ifp, link_state); if (ifp->if_carp) (*carp_linkstate_p)(ifp); if (ifp->if_bridge) (*bridge_linkstate_p)(ifp); if (ifp->if_lagg) (*lagg_linkstate_p)(ifp, link_state); if (IS_DEFAULT_VNET(curvnet)) devctl_notify("IFNET", ifp->if_xname, (link_state == LINK_STATE_UP) ? "LINK_UP" : "LINK_DOWN", NULL); if (pending > 1) if_printf(ifp, "%d link states coalesced\n", pending); if (log_link_state_change) log(LOG_NOTICE, "%s: link state changed to %s\n", ifp->if_xname, (link_state == LINK_STATE_UP) ? "UP" : "DOWN" ); EVENTHANDLER_INVOKE(ifnet_link_event, ifp, link_state); CURVNET_RESTORE(); } /* * Mark an interface down and notify protocols of * the transition. */ void if_down(struct ifnet *ifp) { EVENTHANDLER_INVOKE(ifnet_event, ifp, IFNET_EVENT_DOWN); if_unroute(ifp, IFF_UP, AF_UNSPEC); } /* * Mark an interface up and notify protocols of * the transition. */ void if_up(struct ifnet *ifp) { if_route(ifp, IFF_UP, AF_UNSPEC); EVENTHANDLER_INVOKE(ifnet_event, ifp, IFNET_EVENT_UP); } /* * Flush an interface queue. */ void if_qflush(struct ifnet *ifp) { struct mbuf *m, *n; struct ifaltq *ifq; ifq = &ifp->if_snd; IFQ_LOCK(ifq); #ifdef ALTQ if (ALTQ_IS_ENABLED(ifq)) ALTQ_PURGE(ifq); #endif n = ifq->ifq_head; while ((m = n) != NULL) { n = m->m_nextpkt; m_freem(m); } ifq->ifq_head = 0; ifq->ifq_tail = 0; ifq->ifq_len = 0; IFQ_UNLOCK(ifq); } /* * Map interface name to interface structure pointer, with or without * returning a reference. */ struct ifnet * ifunit_ref(const char *name) { struct ifnet *ifp; IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (strncmp(name, ifp->if_xname, IFNAMSIZ) == 0 && !(ifp->if_flags & IFF_DYING)) break; } if (ifp != NULL) if_ref(ifp); IFNET_RUNLOCK_NOSLEEP(); return (ifp); } struct ifnet * ifunit(const char *name) { struct ifnet *ifp; IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (strncmp(name, ifp->if_xname, IFNAMSIZ) == 0) break; } IFNET_RUNLOCK_NOSLEEP(); return (ifp); } static void * ifr_buffer_get_buffer(void *data) { union ifreq_union *ifrup; ifrup = data; #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) return ((void *)(uintptr_t) ifrup->ifr32.ifr_ifru.ifru_buffer.buffer); #endif return (ifrup->ifr.ifr_ifru.ifru_buffer.buffer); } static void ifr_buffer_set_buffer_null(void *data) { union ifreq_union *ifrup; ifrup = data; #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) ifrup->ifr32.ifr_ifru.ifru_buffer.buffer = 0; else #endif ifrup->ifr.ifr_ifru.ifru_buffer.buffer = NULL; } static size_t ifr_buffer_get_length(void *data) { union ifreq_union *ifrup; ifrup = data; #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) return (ifrup->ifr32.ifr_ifru.ifru_buffer.length); #endif return (ifrup->ifr.ifr_ifru.ifru_buffer.length); } static void ifr_buffer_set_length(void *data, size_t len) { union ifreq_union *ifrup; ifrup = data; #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) ifrup->ifr32.ifr_ifru.ifru_buffer.length = len; else #endif ifrup->ifr.ifr_ifru.ifru_buffer.length = len; } void * ifr_data_get_ptr(void *ifrp) { union ifreq_union *ifrup; ifrup = ifrp; #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) return ((void *)(uintptr_t) ifrup->ifr32.ifr_ifru.ifru_data); #endif return (ifrup->ifr.ifr_ifru.ifru_data); } /* * Hardware specific interface ioctls. 
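 *
 * These are reached from userland via ioctl(2) on a socket, with a
 * struct ifreq naming the interface; a hedged example (the interface
 * name is made up):
 *
 *	struct ifreq ifr = { 0 };
 *	strlcpy(ifr.ifr_name, "em0", sizeof(ifr.ifr_name));
 *	ioctl(s, SIOCGIFMTU, &ifr);	// MTU returned in ifr.ifr_mtu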
*/ static int ifhwioctl(u_long cmd, struct ifnet *ifp, caddr_t data, struct thread *td) { struct ifreq *ifr; int error = 0, do_ifup = 0; int new_flags, temp_flags; size_t namelen, onamelen; size_t descrlen; char *descrbuf, *odescrbuf; char new_name[IFNAMSIZ]; struct ifaddr *ifa; struct sockaddr_dl *sdl; ifr = (struct ifreq *)data; switch (cmd) { case SIOCGIFINDEX: ifr->ifr_index = ifp->if_index; break; case SIOCGIFFLAGS: temp_flags = ifp->if_flags | ifp->if_drv_flags; ifr->ifr_flags = temp_flags & 0xffff; ifr->ifr_flagshigh = temp_flags >> 16; break; case SIOCGIFCAP: ifr->ifr_reqcap = ifp->if_capabilities; ifr->ifr_curcap = ifp->if_capenable; break; #ifdef MAC case SIOCGIFMAC: error = mac_ifnet_ioctl_get(td->td_ucred, ifr, ifp); break; #endif case SIOCGIFMETRIC: ifr->ifr_metric = ifp->if_metric; break; case SIOCGIFMTU: ifr->ifr_mtu = ifp->if_mtu; break; case SIOCGIFPHYS: /* XXXGL: did this ever worked? */ ifr->ifr_phys = 0; break; case SIOCGIFDESCR: error = 0; sx_slock(&ifdescr_sx); if (ifp->if_description == NULL) error = ENOMSG; else { /* space for terminating nul */ descrlen = strlen(ifp->if_description) + 1; if (ifr_buffer_get_length(ifr) < descrlen) ifr_buffer_set_buffer_null(ifr); else error = copyout(ifp->if_description, ifr_buffer_get_buffer(ifr), descrlen); ifr_buffer_set_length(ifr, descrlen); } sx_sunlock(&ifdescr_sx); break; case SIOCSIFDESCR: error = priv_check(td, PRIV_NET_SETIFDESCR); if (error) return (error); /* * Copy only (length-1) bytes to make sure that * if_description is always nul terminated. The * length parameter is supposed to count the * terminating nul in. */ if (ifr_buffer_get_length(ifr) > ifdescr_maxlen) return (ENAMETOOLONG); else if (ifr_buffer_get_length(ifr) == 0) descrbuf = NULL; else { descrbuf = malloc(ifr_buffer_get_length(ifr), M_IFDESCR, M_WAITOK | M_ZERO); error = copyin(ifr_buffer_get_buffer(ifr), descrbuf, ifr_buffer_get_length(ifr) - 1); if (error) { free(descrbuf, M_IFDESCR); break; } } sx_xlock(&ifdescr_sx); odescrbuf = ifp->if_description; ifp->if_description = descrbuf; sx_xunlock(&ifdescr_sx); getmicrotime(&ifp->if_lastchange); free(odescrbuf, M_IFDESCR); break; case SIOCGIFFIB: ifr->ifr_fib = ifp->if_fib; break; case SIOCSIFFIB: error = priv_check(td, PRIV_NET_SETIFFIB); if (error) return (error); if (ifr->ifr_fib >= rt_numfibs) return (EINVAL); ifp->if_fib = ifr->ifr_fib; break; case SIOCSIFFLAGS: error = priv_check(td, PRIV_NET_SETIFFLAGS); if (error) return (error); /* * Currently, no driver owned flags pass the IFF_CANTCHANGE * check, so we don't need special handling here yet. */ new_flags = (ifr->ifr_flags & 0xffff) | (ifr->ifr_flagshigh << 16); if (ifp->if_flags & IFF_UP && (new_flags & IFF_UP) == 0) { if_down(ifp); } else if (new_flags & IFF_UP && (ifp->if_flags & IFF_UP) == 0) { do_ifup = 1; } /* See if permanently promiscuous mode bit is about to flip */ if ((ifp->if_flags ^ new_flags) & IFF_PPROMISC) { if (new_flags & IFF_PPROMISC) ifp->if_flags |= IFF_PROMISC; else if (ifp->if_pcount == 0) ifp->if_flags &= ~IFF_PROMISC; if (log_promisc_mode_change) log(LOG_INFO, "%s: permanently promiscuous mode %s\n", ifp->if_xname, ((new_flags & IFF_PPROMISC) ? 
"enabled" : "disabled")); } ifp->if_flags = (ifp->if_flags & IFF_CANTCHANGE) | (new_flags &~ IFF_CANTCHANGE); if (ifp->if_ioctl) { (void) (*ifp->if_ioctl)(ifp, cmd, data); } if (do_ifup) if_up(ifp); getmicrotime(&ifp->if_lastchange); break; case SIOCSIFCAP: error = priv_check(td, PRIV_NET_SETIFCAP); if (error) return (error); if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); if (ifr->ifr_reqcap & ~ifp->if_capabilities) return (EINVAL); error = (*ifp->if_ioctl)(ifp, cmd, data); if (error == 0) getmicrotime(&ifp->if_lastchange); break; #ifdef MAC case SIOCSIFMAC: error = mac_ifnet_ioctl_set(td->td_ucred, ifr, ifp); break; #endif case SIOCSIFNAME: error = priv_check(td, PRIV_NET_SETIFNAME); if (error) return (error); error = copyinstr(ifr_data_get_ptr(ifr), new_name, IFNAMSIZ, NULL); if (error != 0) return (error); if (new_name[0] == '\0') return (EINVAL); if (new_name[IFNAMSIZ-1] != '\0') { new_name[IFNAMSIZ-1] = '\0'; if (strlen(new_name) == IFNAMSIZ-1) return (EINVAL); } if (ifunit(new_name) != NULL) return (EEXIST); /* * XXX: Locking. Nothing else seems to lock if_flags, * and there are numerous other races with the * ifunit() checks not being atomic with namespace * changes (renames, vmoves, if_attach, etc). */ ifp->if_flags |= IFF_RENAMING; /* Announce the departure of the interface. */ rt_ifannouncemsg(ifp, IFAN_DEPARTURE); EVENTHANDLER_INVOKE(ifnet_departure_event, ifp); log(LOG_INFO, "%s: changing name to '%s'\n", ifp->if_xname, new_name); IF_ADDR_WLOCK(ifp); strlcpy(ifp->if_xname, new_name, sizeof(ifp->if_xname)); ifa = ifp->if_addr; sdl = (struct sockaddr_dl *)ifa->ifa_addr; namelen = strlen(new_name); onamelen = sdl->sdl_nlen; /* * Move the address if needed. This is safe because we * allocate space for a name of length IFNAMSIZ when we * create this in if_attach(). */ if (namelen != onamelen) { bcopy(sdl->sdl_data + onamelen, sdl->sdl_data + namelen, sdl->sdl_alen); } bcopy(new_name, sdl->sdl_data, namelen); sdl->sdl_nlen = namelen; sdl = (struct sockaddr_dl *)ifa->ifa_netmask; bzero(sdl->sdl_data, onamelen); while (namelen != 0) sdl->sdl_data[--namelen] = 0xff; IF_ADDR_WUNLOCK(ifp); EVENTHANDLER_INVOKE(ifnet_arrival_event, ifp); /* Announce the return of the interface. */ rt_ifannouncemsg(ifp, IFAN_ARRIVAL); ifp->if_flags &= ~IFF_RENAMING; break; #ifdef VIMAGE case SIOCSIFVNET: error = priv_check(td, PRIV_NET_SETIFVNET); if (error) return (error); error = if_vmove_loan(td, ifp, ifr->ifr_name, ifr->ifr_jid); break; #endif case SIOCSIFMETRIC: error = priv_check(td, PRIV_NET_SETIFMETRIC); if (error) return (error); ifp->if_metric = ifr->ifr_metric; getmicrotime(&ifp->if_lastchange); break; case SIOCSIFPHYS: error = priv_check(td, PRIV_NET_SETIFPHYS); if (error) return (error); if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); error = (*ifp->if_ioctl)(ifp, cmd, data); if (error == 0) getmicrotime(&ifp->if_lastchange); break; case SIOCSIFMTU: { u_long oldmtu = ifp->if_mtu; error = priv_check(td, PRIV_NET_SETIFMTU); if (error) return (error); if (ifr->ifr_mtu < IF_MINMTU || ifr->ifr_mtu > IF_MAXMTU) return (EINVAL); if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); error = (*ifp->if_ioctl)(ifp, cmd, data); if (error == 0) { getmicrotime(&ifp->if_lastchange); rt_ifmsg(ifp); #ifdef INET NETDUMP_REINIT(ifp); #endif } /* * If the link MTU changed, do network layer specific procedure. 
*/ if (ifp->if_mtu != oldmtu) { #ifdef INET6 nd6_setmtu(ifp); #endif rt_updatemtu(ifp); } break; } case SIOCADDMULTI: case SIOCDELMULTI: if (cmd == SIOCADDMULTI) error = priv_check(td, PRIV_NET_ADDMULTI); else error = priv_check(td, PRIV_NET_DELMULTI); if (error) return (error); /* Don't allow group membership on non-multicast interfaces. */ if ((ifp->if_flags & IFF_MULTICAST) == 0) return (EOPNOTSUPP); /* Don't let users screw up protocols' entries. */ if (ifr->ifr_addr.sa_family != AF_LINK) return (EINVAL); if (cmd == SIOCADDMULTI) { struct ifmultiaddr *ifma; /* * Userland is only permitted to join groups once * via the if_addmulti() KPI, because it cannot hold * struct ifmultiaddr * between calls. It may also * lose a race while we check if the membership * already exists. */ IF_ADDR_RLOCK(ifp); ifma = if_findmulti(ifp, &ifr->ifr_addr); IF_ADDR_RUNLOCK(ifp); if (ifma != NULL) error = EADDRINUSE; else error = if_addmulti(ifp, &ifr->ifr_addr, &ifma); } else { error = if_delmulti(ifp, &ifr->ifr_addr); } if (error == 0) getmicrotime(&ifp->if_lastchange); break; case SIOCSIFPHYADDR: case SIOCDIFPHYADDR: #ifdef INET6 case SIOCSIFPHYADDR_IN6: #endif case SIOCSIFMEDIA: case SIOCSIFGENERIC: error = priv_check(td, PRIV_NET_HWIOCTL); if (error) return (error); if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); error = (*ifp->if_ioctl)(ifp, cmd, data); if (error == 0) getmicrotime(&ifp->if_lastchange); break; case SIOCGIFSTATUS: case SIOCGIFPSRCADDR: case SIOCGIFPDSTADDR: case SIOCGIFMEDIA: case SIOCGIFXMEDIA: case SIOCGIFGENERIC: case SIOCGIFRSSKEY: case SIOCGIFRSSHASH: if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); error = (*ifp->if_ioctl)(ifp, cmd, data); break; case SIOCSIFLLADDR: error = priv_check(td, PRIV_NET_SETLLADDR); if (error) return (error); error = if_setlladdr(ifp, ifr->ifr_addr.sa_data, ifr->ifr_addr.sa_len); break; case SIOCGHWADDR: error = if_gethwaddr(ifp, ifr); break; CASE_IOC_IFGROUPREQ(SIOCAIFGROUP): error = priv_check(td, PRIV_NET_ADDIFGROUP); if (error) return (error); if ((error = if_addgroup(ifp, ifgr_group_get((struct ifgroupreq *)data)))) return (error); break; CASE_IOC_IFGROUPREQ(SIOCGIFGROUP): if ((error = if_getgroup((struct ifgroupreq *)data, ifp))) return (error); break; CASE_IOC_IFGROUPREQ(SIOCDIFGROUP): error = priv_check(td, PRIV_NET_DELIFGROUP); if (error) return (error); if ((error = if_delgroup(ifp, ifgr_group_get((struct ifgroupreq *)data)))) return (error); break; default: error = ENOIOCTL; break; } return (error); } #ifdef COMPAT_FREEBSD32 struct ifconf32 { int32_t ifc_len; union { uint32_t ifcu_buf; uint32_t ifcu_req; } ifc_ifcu; }; #define SIOCGIFCONF32 _IOWR('i', 36, struct ifconf32) #endif #ifdef COMPAT_FREEBSD32 static void ifmr_init(struct ifmediareq *ifmr, caddr_t data) { struct ifmediareq32 *ifmr32; ifmr32 = (struct ifmediareq32 *)data; memcpy(ifmr->ifm_name, ifmr32->ifm_name, sizeof(ifmr->ifm_name)); ifmr->ifm_current = ifmr32->ifm_current; ifmr->ifm_mask = ifmr32->ifm_mask; ifmr->ifm_status = ifmr32->ifm_status; ifmr->ifm_active = ifmr32->ifm_active; ifmr->ifm_count = ifmr32->ifm_count; ifmr->ifm_ulist = (int *)(uintptr_t)ifmr32->ifm_ulist; } static void ifmr_update(const struct ifmediareq *ifmr, caddr_t data) { struct ifmediareq32 *ifmr32; ifmr32 = (struct ifmediareq32 *)data; ifmr32->ifm_current = ifmr->ifm_current; ifmr32->ifm_mask = ifmr->ifm_mask; ifmr32->ifm_status = ifmr->ifm_status; ifmr32->ifm_active = ifmr->ifm_active; ifmr32->ifm_count = ifmr->ifm_count; } #endif /* * Interface ioctls. 
*/ int ifioctl(struct socket *so, u_long cmd, caddr_t data, struct thread *td) { #ifdef COMPAT_FREEBSD32 caddr_t saved_data; struct ifmediareq ifmr; #endif struct ifmediareq *ifmrp; struct ifnet *ifp; struct ifreq *ifr; int error; int oif_flags; #ifdef VIMAGE int shutdown; #endif CURVNET_SET(so->so_vnet); #ifdef VIMAGE /* Make sure the VNET is stable. */ shutdown = (so->so_vnet->vnet_state > SI_SUB_VNET && so->so_vnet->vnet_state < SI_SUB_VNET_DONE) ? 1 : 0; if (shutdown) { CURVNET_RESTORE(); return (EBUSY); } #endif switch (cmd) { case SIOCGIFCONF: error = ifconf(cmd, data); CURVNET_RESTORE(); return (error); #ifdef COMPAT_FREEBSD32 case SIOCGIFCONF32: { struct ifconf32 *ifc32; struct ifconf ifc; ifc32 = (struct ifconf32 *)data; ifc.ifc_len = ifc32->ifc_len; ifc.ifc_buf = PTRIN(ifc32->ifc_buf); error = ifconf(SIOCGIFCONF, (void *)&ifc); CURVNET_RESTORE(); if (error == 0) ifc32->ifc_len = ifc.ifc_len; return (error); } #endif } ifmrp = NULL; #ifdef COMPAT_FREEBSD32 switch (cmd) { case SIOCGIFMEDIA32: case SIOCGIFXMEDIA32: ifmrp = &ifmr; ifmr_init(ifmrp, data); cmd = _IOC_NEWTYPE(cmd, struct ifmediareq); saved_data = data; data = (caddr_t)ifmrp; } #endif ifr = (struct ifreq *)data; switch (cmd) { #ifdef VIMAGE case SIOCSIFRVNET: error = priv_check(td, PRIV_NET_SETIFVNET); if (error == 0) error = if_vmove_reclaim(td, ifr->ifr_name, ifr->ifr_jid); goto out_noref; #endif case SIOCIFCREATE: case SIOCIFCREATE2: error = priv_check(td, PRIV_NET_IFCREATE); if (error == 0) error = if_clone_create(ifr->ifr_name, sizeof(ifr->ifr_name), cmd == SIOCIFCREATE2 ? ifr_data_get_ptr(ifr) : NULL); goto out_noref; case SIOCIFDESTROY: error = priv_check(td, PRIV_NET_IFDESTROY); if (error == 0) error = if_clone_destroy(ifr->ifr_name); goto out_noref; case SIOCIFGCLONERS: error = if_clone_list((struct if_clonereq *)data); goto out_noref; CASE_IOC_IFGROUPREQ(SIOCGIFGMEMB): error = if_getgroupmembers((struct ifgroupreq *)data); goto out_noref; #if defined(INET) || defined(INET6) case SIOCSVH: case SIOCGVH: if (carp_ioctl_p == NULL) error = EPROTONOSUPPORT; else error = (*carp_ioctl_p)(ifr, cmd, td); goto out_noref; #endif } ifp = ifunit_ref(ifr->ifr_name); if (ifp == NULL) { error = ENXIO; goto out_noref; } error = ifhwioctl(cmd, ifp, data, td); if (error != ENOIOCTL) goto out_ref; oif_flags = ifp->if_flags; if (so->so_proto == NULL) { error = EOPNOTSUPP; goto out_ref; } /* * Pass the request on to the socket control method, and if the * latter returns EOPNOTSUPP, directly to the interface. * * Make an exception for the legacy SIOCSIF* requests. Drivers * trust SIOCSIFADDR et al to come from an already privileged * layer, and do not perform any credentials checks or input * validation. */ error = ((*so->so_proto->pr_usrreqs->pru_control)(so, cmd, data, ifp, td)); if (error == EOPNOTSUPP && ifp != NULL && ifp->if_ioctl != NULL && cmd != SIOCSIFADDR && cmd != SIOCSIFBRDADDR && cmd != SIOCSIFDSTADDR && cmd != SIOCSIFNETMASK) error = (*ifp->if_ioctl)(ifp, cmd, data); if ((oif_flags ^ ifp->if_flags) & IFF_UP) { #ifdef INET6 if (ifp->if_flags & IFF_UP) in6_if_up(ifp); #endif } out_ref: if_rele(ifp); out_noref: #ifdef COMPAT_FREEBSD32 if (ifmrp != NULL) { KASSERT((cmd == SIOCGIFMEDIA || cmd == SIOCGIFXMEDIA), ("ifmrp non-NULL, but cmd is not an ifmedia req 0x%lx", cmd)); data = saved_data; ifmr_update(ifmrp, data); } #endif CURVNET_RESTORE(); return (error); } /* * The code common to handling reference counted flags, * e.g., in ifpromisc() and if_allmulti(). 
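 *
 * A minimal, illustrative sketch of a balanced consumer (not part of
 * this file): every successful "on" request must eventually be matched
 * by an "off" request so that the reference count returns to zero:
 *
 *	error = ifpromisc(ifp, 1);
 *	if (error == 0) {
 *		... observe traffic ...
 *		(void)ifpromisc(ifp, 0);
 *	}
 *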
* The "pflag" argument can specify a permanent mode flag to check, * such as IFF_PPROMISC for promiscuous mode; should be 0 if none. * * Only to be used on stack-owned flags, not driver-owned flags. */ static int if_setflag(struct ifnet *ifp, int flag, int pflag, int *refcount, int onswitch) { struct ifreq ifr; int error; int oldflags, oldcount; /* Sanity checks to catch programming errors */ KASSERT((flag & (IFF_DRV_OACTIVE|IFF_DRV_RUNNING)) == 0, ("%s: setting driver-owned flag %d", __func__, flag)); if (onswitch) KASSERT(*refcount >= 0, ("%s: increment negative refcount %d for flag %d", __func__, *refcount, flag)); else KASSERT(*refcount > 0, ("%s: decrement non-positive refcount %d for flag %d", __func__, *refcount, flag)); /* In case this mode is permanent, just touch refcount */ if (ifp->if_flags & pflag) { *refcount += onswitch ? 1 : -1; return (0); } /* Save ifnet parameters for if_ioctl() may fail */ oldcount = *refcount; oldflags = ifp->if_flags; /* * See if we aren't the only and touching refcount is enough. * Actually toggle interface flag if we are the first or last. */ if (onswitch) { if ((*refcount)++) return (0); ifp->if_flags |= flag; } else { if (--(*refcount)) return (0); ifp->if_flags &= ~flag; } /* Call down the driver since we've changed interface flags */ if (ifp->if_ioctl == NULL) { error = EOPNOTSUPP; goto recover; } ifr.ifr_flags = ifp->if_flags & 0xffff; ifr.ifr_flagshigh = ifp->if_flags >> 16; error = (*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t)&ifr); if (error) goto recover; /* Notify userland that interface flags have changed */ rt_ifmsg(ifp); return (0); recover: /* Recover after driver error */ *refcount = oldcount; ifp->if_flags = oldflags; return (error); } /* * Set/clear promiscuous mode on interface ifp based on the truth value * of pswitch. The calls are reference counted so that only the first * "on" request actually has an effect, as does the final "off" request. * Results are undefined if the "off" and "on" requests are not matched. */ int ifpromisc(struct ifnet *ifp, int pswitch) { int error; int oldflags = ifp->if_flags; error = if_setflag(ifp, IFF_PROMISC, IFF_PPROMISC, &ifp->if_pcount, pswitch); /* If promiscuous mode status has changed, log a message */ if (error == 0 && ((ifp->if_flags ^ oldflags) & IFF_PROMISC) && log_promisc_mode_change) log(LOG_INFO, "%s: promiscuous mode %s\n", ifp->if_xname, (ifp->if_flags & IFF_PROMISC) ? "enabled" : "disabled"); return (error); } /* * Return interface configuration * of system. List may be used * in later ioctl's (above) to get * other information. */ /*ARGSUSED*/ static int ifconf(u_long cmd, caddr_t data) { struct ifconf *ifc = (struct ifconf *)data; struct ifnet *ifp; struct ifaddr *ifa; struct ifreq ifr; struct sbuf *sb; int error, full = 0, valid_len, max_len; /* Limit initial buffer size to MAXPHYS to avoid DoS from userspace. */ max_len = MAXPHYS - 1; /* Prevent hostile input from being able to crash the system */ if (ifc->ifc_len <= 0) return (EINVAL); again: if (ifc->ifc_len <= max_len) { max_len = ifc->ifc_len; full = 1; } sb = sbuf_new(NULL, NULL, max_len + 1, SBUF_FIXEDLEN); max_len = 0; valid_len = 0; IFNET_RLOCK(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { int addrs; /* * Zero the ifr to make sure we don't disclose the contents * of the stack. 
*/ memset(&ifr, 0, sizeof(ifr)); if (strlcpy(ifr.ifr_name, ifp->if_xname, sizeof(ifr.ifr_name)) >= sizeof(ifr.ifr_name)) { sbuf_delete(sb); IFNET_RUNLOCK(); return (ENAMETOOLONG); } addrs = 0; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { struct sockaddr *sa = ifa->ifa_addr; if (prison_if(curthread->td_ucred, sa) != 0) continue; addrs++; if (sa->sa_len <= sizeof(*sa)) { if (sa->sa_len < sizeof(*sa)) { memset(&ifr.ifr_ifru.ifru_addr, 0, sizeof(ifr.ifr_ifru.ifru_addr)); memcpy(&ifr.ifr_ifru.ifru_addr, sa, sa->sa_len); } else ifr.ifr_ifru.ifru_addr = *sa; sbuf_bcat(sb, &ifr, sizeof(ifr)); max_len += sizeof(ifr); } else { sbuf_bcat(sb, &ifr, offsetof(struct ifreq, ifr_addr)); max_len += offsetof(struct ifreq, ifr_addr); sbuf_bcat(sb, sa, sa->sa_len); max_len += sa->sa_len; } if (sbuf_error(sb) == 0) valid_len = sbuf_len(sb); } IF_ADDR_RUNLOCK(ifp); if (addrs == 0) { sbuf_bcat(sb, &ifr, sizeof(ifr)); max_len += sizeof(ifr); if (sbuf_error(sb) == 0) valid_len = sbuf_len(sb); } } IFNET_RUNLOCK(); /* * If we didn't allocate enough space (uncommon), try again. If * we have already allocated as much space as we are allowed, * return what we've got. */ if (valid_len != max_len && !full) { sbuf_delete(sb); goto again; } ifc->ifc_len = valid_len; sbuf_finish(sb); error = copyout(sbuf_data(sb), ifc->ifc_req, ifc->ifc_len); sbuf_delete(sb); return (error); } /* * Just like ifpromisc(), but for all-multicast-reception mode. */ int if_allmulti(struct ifnet *ifp, int onswitch) { return (if_setflag(ifp, IFF_ALLMULTI, 0, &ifp->if_amcount, onswitch)); } struct ifmultiaddr * if_findmulti(struct ifnet *ifp, const struct sockaddr *sa) { struct ifmultiaddr *ifma; IF_ADDR_LOCK_ASSERT(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (sa->sa_family == AF_LINK) { if (sa_dl_equal(ifma->ifma_addr, sa)) break; } else { if (sa_equal(ifma->ifma_addr, sa)) break; } } return ifma; } /* * Allocate a new ifmultiaddr and initialize based on passed arguments. We * make copies of passed sockaddrs. The ifmultiaddr will not be added to * the ifnet multicast address list here, so the caller must do that and * other setup work (such as notifying the device driver). The reference * count is initialized to 1. */ static struct ifmultiaddr * if_allocmulti(struct ifnet *ifp, struct sockaddr *sa, struct sockaddr *llsa, int mflags) { struct ifmultiaddr *ifma; struct sockaddr *dupsa; ifma = malloc(sizeof *ifma, M_IFMADDR, mflags | M_ZERO); if (ifma == NULL) return (NULL); dupsa = malloc(sa->sa_len, M_IFMADDR, mflags); if (dupsa == NULL) { free(ifma, M_IFMADDR); return (NULL); } bcopy(sa, dupsa, sa->sa_len); ifma->ifma_addr = dupsa; ifma->ifma_ifp = ifp; ifma->ifma_refcount = 1; ifma->ifma_protospec = NULL; if (llsa == NULL) { ifma->ifma_lladdr = NULL; return (ifma); } dupsa = malloc(llsa->sa_len, M_IFMADDR, mflags); if (dupsa == NULL) { free(ifma->ifma_addr, M_IFMADDR); free(ifma, M_IFMADDR); return (NULL); } bcopy(llsa, dupsa, llsa->sa_len); ifma->ifma_lladdr = dupsa; return (ifma); } /* * if_freemulti: free ifmultiaddr structure and possibly attached related * addresses. The caller is responsible for implementing reference * counting, notifying the driver, handling routing messages, and releasing * any dependent link layer state. 
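 *
 * A minimal sketch of a caller honouring that contract (illustrative
 * only; it mirrors how if_delmulti_locked() below drops the final
 * reference):
 *
 *	if (--ifma->ifma_refcount == 0)
 *		if_freemulti(ifma);
 *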
*/ -static void +#ifdef MCAST_VERBOSE +extern void kdb_backtrace(void); +#endif +void if_freemulti(struct ifmultiaddr *ifma) { KASSERT(ifma->ifma_refcount == 0, ("if_freemulti: refcount %d", ifma->ifma_refcount)); if (ifma->ifma_lladdr != NULL) free(ifma->ifma_lladdr, M_IFMADDR); +#ifdef MCAST_VERBOSE + kdb_backtrace(); + printf("%s freeing ifma: %p\n", __func__, ifma); +#endif free(ifma->ifma_addr, M_IFMADDR); free(ifma, M_IFMADDR); } /* * Register an additional multicast address with a network interface. * * - If the address is already present, bump the reference count on the * address and return. * - If the address is not link-layer, look up a link layer address. * - Allocate address structures for one or both addresses, and attach to the * multicast address list on the interface. If automatically adding a link * layer address, the protocol address will own a reference to the link * layer address, to be freed when it is freed. * - Notify the network device driver of an addition to the multicast address * list. * * 'sa' points to caller-owned memory with the desired multicast address. * * 'retifma' will be used to return a pointer to the resulting multicast * address reference, if desired. */ int if_addmulti(struct ifnet *ifp, struct sockaddr *sa, struct ifmultiaddr **retifma) { struct ifmultiaddr *ifma, *ll_ifma; struct sockaddr *llsa; struct sockaddr_dl sdl; int error; #ifdef INET IN_MULTI_LIST_UNLOCK_ASSERT(); #endif #ifdef INET6 IN6_MULTI_LIST_UNLOCK_ASSERT(); #endif /* * If the address is already present, return a new reference to it; * otherwise, allocate storage and set up a new address. */ IF_ADDR_WLOCK(ifp); ifma = if_findmulti(ifp, sa); if (ifma != NULL) { ifma->ifma_refcount++; if (retifma != NULL) *retifma = ifma; IF_ADDR_WUNLOCK(ifp); return (0); } /* * The address isn't already present; resolve the protocol address * into a link layer address, and then look that up, bump its * refcount or allocate an ifma for that also. * Most link layer resolving functions returns address data which * fits inside default sockaddr_dl structure. However callback * can allocate another sockaddr structure, in that case we need to * free it later. */ llsa = NULL; ll_ifma = NULL; if (ifp->if_resolvemulti != NULL) { /* Provide called function with buffer size information */ sdl.sdl_len = sizeof(sdl); llsa = (struct sockaddr *)&sdl; error = ifp->if_resolvemulti(ifp, &llsa, sa); if (error) goto unlock_out; } /* * Allocate the new address. Don't hook it up yet, as we may also * need to allocate a link layer multicast address. */ ifma = if_allocmulti(ifp, sa, llsa, M_NOWAIT); if (ifma == NULL) { error = ENOMEM; goto free_llsa_out; } /* * If a link layer address is found, we'll need to see if it's * already present in the address list, or allocate is as well. * When this block finishes, the link layer address will be on the * list. */ if (llsa != NULL) { ll_ifma = if_findmulti(ifp, llsa); if (ll_ifma == NULL) { ll_ifma = if_allocmulti(ifp, llsa, NULL, M_NOWAIT); if (ll_ifma == NULL) { --ifma->ifma_refcount; if_freemulti(ifma); error = ENOMEM; goto free_llsa_out; } TAILQ_INSERT_HEAD(&ifp->if_multiaddrs, ll_ifma, ifma_link); } else ll_ifma->ifma_refcount++; ifma->ifma_llifma = ll_ifma; } /* * We now have a new multicast address, ifma, and possibly a new or * referenced link layer address. Add the primary address to the * ifnet address list. 
*/ TAILQ_INSERT_HEAD(&ifp->if_multiaddrs, ifma, ifma_link); if (retifma != NULL) *retifma = ifma; /* * Must generate the message while holding the lock so that 'ifma' * pointer is still valid. */ rt_newmaddrmsg(RTM_NEWMADDR, ifma); IF_ADDR_WUNLOCK(ifp); /* * We are certain we have added something, so call down to the * interface to let them know about it. */ if (ifp->if_ioctl != NULL) { (void) (*ifp->if_ioctl)(ifp, SIOCADDMULTI, 0); } if ((llsa != NULL) && (llsa != (struct sockaddr *)&sdl)) link_free_sdl(llsa); return (0); free_llsa_out: if ((llsa != NULL) && (llsa != (struct sockaddr *)&sdl)) link_free_sdl(llsa); unlock_out: IF_ADDR_WUNLOCK(ifp); return (error); } /* * Delete a multicast group membership by network-layer group address. * * Returns ENOENT if the entry could not be found. If ifp no longer * exists, results are undefined. This entry point should only be used * from subsystems which do appropriate locking to hold ifp for the * duration of the call. * Network-layer protocol domains must use if_delmulti_ifma(). */ int if_delmulti(struct ifnet *ifp, struct sockaddr *sa) { struct ifmultiaddr *ifma; int lastref; #ifdef INVARIANTS struct ifnet *oifp; IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(oifp, &V_ifnet, if_link) if (ifp == oifp) break; if (ifp != oifp) ifp = NULL; IFNET_RUNLOCK_NOSLEEP(); KASSERT(ifp != NULL, ("%s: ifnet went away", __func__)); #endif if (ifp == NULL) return (ENOENT); IF_ADDR_WLOCK(ifp); lastref = 0; ifma = if_findmulti(ifp, sa); if (ifma != NULL) lastref = if_delmulti_locked(ifp, ifma, 0); IF_ADDR_WUNLOCK(ifp); if (ifma == NULL) return (ENOENT); if (lastref && ifp->if_ioctl != NULL) { (void)(*ifp->if_ioctl)(ifp, SIOCDELMULTI, 0); } return (0); } /* * Delete all multicast group membership for an interface. * Should be used to quickly flush all multicast filters. */ void if_delallmulti(struct ifnet *ifp) { struct ifmultiaddr *ifma; struct ifmultiaddr *next; IF_ADDR_WLOCK(ifp); TAILQ_FOREACH_SAFE(ifma, &ifp->if_multiaddrs, ifma_link, next) if_delmulti_locked(ifp, ifma, 0); IF_ADDR_WUNLOCK(ifp); } +void +if_delmulti_ifma(struct ifmultiaddr *ifma) +{ + if_delmulti_ifma_flags(ifma, 0); +} + /* * Delete a multicast group membership by group membership pointer. * Network-layer protocol domains must use this routine. * * It is safe to call this routine if the ifp disappeared. */ void -if_delmulti_ifma(struct ifmultiaddr *ifma) +if_delmulti_ifma_flags(struct ifmultiaddr *ifma, int flags) { struct ifnet *ifp; int lastref; - + MCDPRINTF("%s freeing ifma: %p\n", __func__, ifma); #ifdef INET IN_MULTI_LIST_UNLOCK_ASSERT(); #endif ifp = ifma->ifma_ifp; #ifdef DIAGNOSTIC if (ifp == NULL) { printf("%s: ifma_ifp seems to be detached\n", __func__); } else { struct ifnet *oifp; IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(oifp, &V_ifnet, if_link) if (ifp == oifp) break; if (ifp != oifp) { printf("%s: ifnet %p disappeared\n", __func__, ifp); ifp = NULL; } IFNET_RUNLOCK_NOSLEEP(); } #endif /* * If and only if the ifnet instance exists: Acquire the address lock. */ if (ifp != NULL) IF_ADDR_WLOCK(ifp); - lastref = if_delmulti_locked(ifp, ifma, 0); + lastref = if_delmulti_locked(ifp, ifma, flags); if (ifp != NULL) { /* * If and only if the ifnet instance exists: * Release the address lock. * If the group was left: update the hardware hash filter. */ IF_ADDR_WUNLOCK(ifp); if (lastref && ifp->if_ioctl != NULL) { (void)(*ifp->if_ioctl)(ifp, SIOCDELMULTI, 0); } } } /* * Perform deletion of network-layer and/or link-layer multicast address. * * Return 0 if the reference count was decremented. 
* Return 1 if the final reference was released, indicating that the * hardware hash filter should be reprogrammed. */ static int if_delmulti_locked(struct ifnet *ifp, struct ifmultiaddr *ifma, int detaching) { struct ifmultiaddr *ll_ifma; if (ifp != NULL && ifma->ifma_ifp != NULL) { KASSERT(ifma->ifma_ifp == ifp, ("%s: inconsistent ifp %p", __func__, ifp)); IF_ADDR_WLOCK_ASSERT(ifp); } ifp = ifma->ifma_ifp; + MCDPRINTF("%s freeing %p from %s \n", __func__, ifma, ifp ? ifp->if_xname : ""); /* * If the ifnet is detaching, null out references to ifnet, * so that upper protocol layers will notice, and not attempt * to obtain locks for an ifnet which no longer exists. The * routing socket announcement must happen before the ifnet * instance is detached from the system. */ if (detaching) { #ifdef DIAGNOSTIC printf("%s: detaching ifnet instance %p\n", __func__, ifp); #endif /* * ifp may already be nulled out if we are being reentered * to delete the ll_ifma. */ if (ifp != NULL) { rt_newmaddrmsg(RTM_DELMADDR, ifma); ifma->ifma_ifp = NULL; } } if (--ifma->ifma_refcount > 0) return 0; + if (ifp != NULL && detaching == 0) + TAILQ_REMOVE(&ifp->if_multiaddrs, ifma, ifma_link); + /* * If this ifma is a network-layer ifma, a link-layer ifma may * have been associated with it. Release it first if so. */ ll_ifma = ifma->ifma_llifma; if (ll_ifma != NULL) { KASSERT(ifma->ifma_lladdr != NULL, ("%s: llifma w/o lladdr", __func__)); if (detaching) ll_ifma->ifma_ifp = NULL; /* XXX */ if (--ll_ifma->ifma_refcount == 0) { if (ifp != NULL) { TAILQ_REMOVE(&ifp->if_multiaddrs, ll_ifma, ifma_link); } if_freemulti(ll_ifma); } } - if (ifp != NULL && detaching == 0) - TAILQ_REMOVE(&ifp->if_multiaddrs, ifma, ifma_link); +#ifdef INVARIANTS + if (ifp) { + struct ifmultiaddr *ifmatmp; + TAILQ_FOREACH(ifmatmp, &ifp->if_multiaddrs, ifma_link) + MPASS(ifma != ifmatmp); + } +#endif if_freemulti(ifma); - /* * The last reference to this instance of struct ifmultiaddr * was released; the hardware should be notified of this change. */ return 1; } /* * Set the link layer address on an interface. * * At this time we only support certain types of interfaces, * and we don't allow the length of the address to change. * * Set noinline to be dtrace-friendly */ __noinline int if_setlladdr(struct ifnet *ifp, const u_char *lladdr, int len) { struct sockaddr_dl *sdl; struct ifaddr *ifa; struct ifreq ifr; IF_ADDR_RLOCK(ifp); ifa = ifp->if_addr; if (ifa == NULL) { IF_ADDR_RUNLOCK(ifp); return (EINVAL); } ifa_ref(ifa); IF_ADDR_RUNLOCK(ifp); sdl = (struct sockaddr_dl *)ifa->ifa_addr; if (sdl == NULL) { ifa_free(ifa); return (EINVAL); } if (len != sdl->sdl_alen) { /* don't allow length to change */ ifa_free(ifa); return (EINVAL); } switch (ifp->if_type) { case IFT_ETHER: case IFT_XETHER: case IFT_L2VLAN: case IFT_BRIDGE: case IFT_IEEE8023ADLAG: bcopy(lladdr, LLADDR(sdl), len); ifa_free(ifa); break; default: ifa_free(ifa); return (ENODEV); } /* * If the interface is already up, we need * to re-init it in order to reprogram its * address filter. */ if ((ifp->if_flags & IFF_UP) != 0) { if (ifp->if_ioctl) { ifp->if_flags &= ~IFF_UP; ifr.ifr_flags = ifp->if_flags & 0xffff; ifr.ifr_flagshigh = ifp->if_flags >> 16; (*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t)&ifr); ifp->if_flags |= IFF_UP; ifr.ifr_flags = ifp->if_flags & 0xffff; ifr.ifr_flagshigh = ifp->if_flags >> 16; (*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t)&ifr); } } EVENTHANDLER_INVOKE(iflladdr_event, ifp); return (0); } /* * Compat function for handling basic encapsulation requests. 
* Not converted stacks (FDDI, IB, ..) supports traditional * output model: ARP (and other similar L2 protocols) are handled * inside output routine, arpresolve/nd6_resolve() returns MAC * address instead of full prepend. * * This function creates calculated header==MAC for IPv4/IPv6 and * returns EAFNOSUPPORT (which is then handled in ARP code) for other * address families. */ static int if_requestencap_default(struct ifnet *ifp, struct if_encap_req *req) { if (req->rtype != IFENCAP_LL) return (EOPNOTSUPP); if (req->bufsize < req->lladdr_len) return (ENOMEM); switch (req->family) { case AF_INET: case AF_INET6: break; default: return (EAFNOSUPPORT); } /* Copy lladdr to storage as is */ memmove(req->buf, req->lladdr, req->lladdr_len); req->bufsize = req->lladdr_len; req->lladdr_off = 0; return (0); } /* * Get the link layer address that was read from the hardware at attach. * * This is only set by Ethernet NICs (IFT_ETHER), but laggX interfaces re-type * their component interfaces as IFT_IEEE8023ADLAG. */ int if_gethwaddr(struct ifnet *ifp, struct ifreq *ifr) { if (ifp->if_hw_addr == NULL) return (ENODEV); switch (ifp->if_type) { case IFT_ETHER: case IFT_IEEE8023ADLAG: bcopy(ifp->if_hw_addr, ifr->ifr_addr.sa_data, ifp->if_addrlen); return (0); default: return (ENODEV); } } /* * The name argument must be a pointer to storage which will last as * long as the interface does. For physical devices, the result of * device_get_name(dev) is a good choice and for pseudo-devices a * static string works well. */ void if_initname(struct ifnet *ifp, const char *name, int unit) { ifp->if_dname = name; ifp->if_dunit = unit; if (unit != IF_DUNIT_NONE) snprintf(ifp->if_xname, IFNAMSIZ, "%s%d", name, unit); else strlcpy(ifp->if_xname, name, IFNAMSIZ); } int if_printf(struct ifnet *ifp, const char * fmt, ...) 
{ va_list ap; int retval; retval = printf("%s: ", ifp->if_xname); va_start(ap, fmt); retval += vprintf(fmt, ap); va_end(ap); return (retval); } void if_start(struct ifnet *ifp) { (*(ifp)->if_start)(ifp); } /* * Backwards compatibility interface for drivers * that have not implemented it */ static int if_transmit(struct ifnet *ifp, struct mbuf *m) { int error; IFQ_HANDOFF(ifp, m, error); return (error); } static void if_input_default(struct ifnet *ifp __unused, struct mbuf *m) { m_freem(m); } int if_handoff(struct ifqueue *ifq, struct mbuf *m, struct ifnet *ifp, int adjust) { int active = 0; IF_LOCK(ifq); if (_IF_QFULL(ifq)) { IF_UNLOCK(ifq); if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1); m_freem(m); return (0); } if (ifp != NULL) { if_inc_counter(ifp, IFCOUNTER_OBYTES, m->m_pkthdr.len + adjust); if (m->m_flags & (M_BCAST|M_MCAST)) if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1); active = ifp->if_drv_flags & IFF_DRV_OACTIVE; } _IF_ENQUEUE(ifq, m); IF_UNLOCK(ifq); if (ifp != NULL && !active) (*(ifp)->if_start)(ifp); return (1); } void if_register_com_alloc(u_char type, if_com_alloc_t *a, if_com_free_t *f) { KASSERT(if_com_alloc[type] == NULL, ("if_register_com_alloc: %d already registered", type)); KASSERT(if_com_free[type] == NULL, ("if_register_com_alloc: %d free already registered", type)); if_com_alloc[type] = a; if_com_free[type] = f; } void if_deregister_com_alloc(u_char type) { KASSERT(if_com_alloc[type] != NULL, ("if_deregister_com_alloc: %d not registered", type)); KASSERT(if_com_free[type] != NULL, ("if_deregister_com_alloc: %d free not registered", type)); if_com_alloc[type] = NULL; if_com_free[type] = NULL; } /* API for driver access to network stack owned ifnet.*/ uint64_t if_setbaudrate(struct ifnet *ifp, uint64_t baudrate) { uint64_t oldbrate; oldbrate = ifp->if_baudrate; ifp->if_baudrate = baudrate; return (oldbrate); } uint64_t if_getbaudrate(if_t ifp) { return (((struct ifnet *)ifp)->if_baudrate); } int if_setcapabilities(if_t ifp, int capabilities) { ((struct ifnet *)ifp)->if_capabilities = capabilities; return (0); } int if_setcapabilitiesbit(if_t ifp, int setbit, int clearbit) { ((struct ifnet *)ifp)->if_capabilities |= setbit; ((struct ifnet *)ifp)->if_capabilities &= ~clearbit; return (0); } int if_getcapabilities(if_t ifp) { return ((struct ifnet *)ifp)->if_capabilities; } int if_setcapenable(if_t ifp, int capabilities) { ((struct ifnet *)ifp)->if_capenable = capabilities; return (0); } int if_setcapenablebit(if_t ifp, int setcap, int clearcap) { if(setcap) ((struct ifnet *)ifp)->if_capenable |= setcap; if(clearcap) ((struct ifnet *)ifp)->if_capenable &= ~clearcap; return (0); } const char * if_getdname(if_t ifp) { return ((struct ifnet *)ifp)->if_dname; } int if_togglecapenable(if_t ifp, int togglecap) { ((struct ifnet *)ifp)->if_capenable ^= togglecap; return (0); } int if_getcapenable(if_t ifp) { return ((struct ifnet *)ifp)->if_capenable; } /* * This is largely undesirable because it ties ifnet to a device, but does * provide flexiblity for an embedded product vendor. Should be used with * the understanding that it violates the interface boundaries, and should be * a last resort only. 
*/ int if_setdev(if_t ifp, void *dev) { return (0); } int if_setdrvflagbits(if_t ifp, int set_flags, int clear_flags) { ((struct ifnet *)ifp)->if_drv_flags |= set_flags; ((struct ifnet *)ifp)->if_drv_flags &= ~clear_flags; return (0); } int if_getdrvflags(if_t ifp) { return ((struct ifnet *)ifp)->if_drv_flags; } int if_setdrvflags(if_t ifp, int flags) { ((struct ifnet *)ifp)->if_drv_flags = flags; return (0); } int if_setflags(if_t ifp, int flags) { ((struct ifnet *)ifp)->if_flags = flags; return (0); } int if_setflagbits(if_t ifp, int set, int clear) { ((struct ifnet *)ifp)->if_flags |= set; ((struct ifnet *)ifp)->if_flags &= ~clear; return (0); } int if_getflags(if_t ifp) { return ((struct ifnet *)ifp)->if_flags; } int if_clearhwassist(if_t ifp) { ((struct ifnet *)ifp)->if_hwassist = 0; return (0); } int if_sethwassistbits(if_t ifp, int toset, int toclear) { ((struct ifnet *)ifp)->if_hwassist |= toset; ((struct ifnet *)ifp)->if_hwassist &= ~toclear; return (0); } int if_sethwassist(if_t ifp, int hwassist_bit) { ((struct ifnet *)ifp)->if_hwassist = hwassist_bit; return (0); } int if_gethwassist(if_t ifp) { return ((struct ifnet *)ifp)->if_hwassist; } int if_setmtu(if_t ifp, int mtu) { ((struct ifnet *)ifp)->if_mtu = mtu; return (0); } int if_getmtu(if_t ifp) { return ((struct ifnet *)ifp)->if_mtu; } int if_getmtu_family(if_t ifp, int family) { struct domain *dp; for (dp = domains; dp; dp = dp->dom_next) { if (dp->dom_family == family && dp->dom_ifmtu != NULL) return (dp->dom_ifmtu((struct ifnet *)ifp)); } return (((struct ifnet *)ifp)->if_mtu); } int if_setsoftc(if_t ifp, void *softc) { ((struct ifnet *)ifp)->if_softc = softc; return (0); } void * if_getsoftc(if_t ifp) { return ((struct ifnet *)ifp)->if_softc; } void if_setrcvif(struct mbuf *m, if_t ifp) { m->m_pkthdr.rcvif = (struct ifnet *)ifp; } void if_setvtag(struct mbuf *m, uint16_t tag) { m->m_pkthdr.ether_vtag = tag; } uint16_t if_getvtag(struct mbuf *m) { return (m->m_pkthdr.ether_vtag); } int if_sendq_empty(if_t ifp) { return IFQ_DRV_IS_EMPTY(&((struct ifnet *)ifp)->if_snd); } struct ifaddr * if_getifaddr(if_t ifp) { return ((struct ifnet *)ifp)->if_addr; } int if_getamcount(if_t ifp) { return ((struct ifnet *)ifp)->if_amcount; } int if_setsendqready(if_t ifp) { IFQ_SET_READY(&((struct ifnet *)ifp)->if_snd); return (0); } int if_setsendqlen(if_t ifp, int tx_desc_count) { IFQ_SET_MAXLEN(&((struct ifnet *)ifp)->if_snd, tx_desc_count); ((struct ifnet *)ifp)->if_snd.ifq_drv_maxlen = tx_desc_count; return (0); } int if_vlantrunkinuse(if_t ifp) { return ((struct ifnet *)ifp)->if_vlantrunk != NULL?1:0; } int if_input(if_t ifp, struct mbuf* sendmp) { (*((struct ifnet *)ifp)->if_input)((struct ifnet *)ifp, sendmp); return (0); } /* XXX */ #ifndef ETH_ADDR_LEN #define ETH_ADDR_LEN 6 #endif int if_setupmultiaddr(if_t ifp, void *mta, int *cnt, int max) { struct ifmultiaddr *ifma; uint8_t *lmta = (uint8_t *)mta; int mcnt = 0; TAILQ_FOREACH(ifma, &((struct ifnet *)ifp)->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_LINK) continue; if (mcnt == max) break; bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr), &lmta[mcnt * ETH_ADDR_LEN], ETH_ADDR_LEN); mcnt++; } *cnt = mcnt; return (0); } int if_multiaddr_array(if_t ifp, void *mta, int *cnt, int max) { int error; if_maddr_rlock(ifp); error = if_setupmultiaddr(ifp, mta, cnt, max); if_maddr_runlock(ifp); return (error); } int if_multiaddr_count(if_t ifp, int max) { struct ifmultiaddr *ifma; int count; count = 0; if_maddr_rlock(ifp); TAILQ_FOREACH(ifma, &((struct ifnet 
*)ifp)->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_LINK) continue; count++; if (count == max) break; } if_maddr_runlock(ifp); return (count); } int if_multi_apply(struct ifnet *ifp, int (*filter)(void *, struct ifmultiaddr *, int), void *arg) { struct ifmultiaddr *ifma; int cnt = 0; if_maddr_rlock(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) cnt += filter(arg, ifma, cnt); if_maddr_runlock(ifp); return (cnt); } struct mbuf * if_dequeue(if_t ifp) { struct mbuf *m; IFQ_DRV_DEQUEUE(&((struct ifnet *)ifp)->if_snd, m); return (m); } int if_sendq_prepend(if_t ifp, struct mbuf *m) { IFQ_DRV_PREPEND(&((struct ifnet *)ifp)->if_snd, m); return (0); } int if_setifheaderlen(if_t ifp, int len) { ((struct ifnet *)ifp)->if_hdrlen = len; return (0); } caddr_t if_getlladdr(if_t ifp) { return (IF_LLADDR((struct ifnet *)ifp)); } void * if_gethandle(u_char type) { return (if_alloc(type)); } void if_bpfmtap(if_t ifh, struct mbuf *m) { struct ifnet *ifp = (struct ifnet *)ifh; BPF_MTAP(ifp, m); } void if_etherbpfmtap(if_t ifh, struct mbuf *m) { struct ifnet *ifp = (struct ifnet *)ifh; ETHER_BPF_MTAP(ifp, m); } void if_vlancap(if_t ifh) { struct ifnet *ifp = (struct ifnet *)ifh; VLAN_CAPABILITIES(ifp); } int if_sethwtsomax(if_t ifp, u_int if_hw_tsomax) { ((struct ifnet *)ifp)->if_hw_tsomax = if_hw_tsomax; return (0); } int if_sethwtsomaxsegcount(if_t ifp, u_int if_hw_tsomaxsegcount) { ((struct ifnet *)ifp)->if_hw_tsomaxsegcount = if_hw_tsomaxsegcount; return (0); } int if_sethwtsomaxsegsize(if_t ifp, u_int if_hw_tsomaxsegsize) { ((struct ifnet *)ifp)->if_hw_tsomaxsegsize = if_hw_tsomaxsegsize; return (0); } u_int if_gethwtsomax(if_t ifp) { return (((struct ifnet *)ifp)->if_hw_tsomax); } u_int if_gethwtsomaxsegcount(if_t ifp) { return (((struct ifnet *)ifp)->if_hw_tsomaxsegcount); } u_int if_gethwtsomaxsegsize(if_t ifp) { return (((struct ifnet *)ifp)->if_hw_tsomaxsegsize); } void if_setinitfn(if_t ifp, void (*init_fn)(void *)) { ((struct ifnet *)ifp)->if_init = init_fn; } void if_setioctlfn(if_t ifp, int (*ioctl_fn)(if_t, u_long, caddr_t)) { ((struct ifnet *)ifp)->if_ioctl = (void *)ioctl_fn; } void if_setstartfn(if_t ifp, void (*start_fn)(if_t)) { ((struct ifnet *)ifp)->if_start = (void *)start_fn; } void if_settransmitfn(if_t ifp, if_transmit_fn_t start_fn) { ((struct ifnet *)ifp)->if_transmit = start_fn; } void if_setqflushfn(if_t ifp, if_qflush_fn_t flush_fn) { ((struct ifnet *)ifp)->if_qflush = flush_fn; } void if_setgetcounterfn(if_t ifp, if_get_counter_t fn) { ifp->if_get_counter = fn; } /* Revisit these - These are inline functions originally. */ int drbr_inuse_drv(if_t ifh, struct buf_ring *br) { return drbr_inuse(ifh, br); } struct mbuf* drbr_dequeue_drv(if_t ifh, struct buf_ring *br) { return drbr_dequeue(ifh, br); } int drbr_needs_enqueue_drv(if_t ifh, struct buf_ring *br) { return drbr_needs_enqueue(ifh, br); } int drbr_enqueue_drv(if_t ifh, struct buf_ring *br, struct mbuf *m) { return drbr_enqueue(ifh, br, m); } Index: head/sys/net/if_var.h =================================================================== --- head/sys/net/if_var.h (revision 333308) +++ head/sys/net/if_var.h (revision 333309) @@ -1,745 +1,753 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * From: @(#)if.h 8.1 (Berkeley) 6/10/93 * $FreeBSD$ */ #ifndef _NET_IF_VAR_H_ #define _NET_IF_VAR_H_ /* * Structures defining a network interface, providing a packet * transport mechanism (ala level 0 of the PUP protocols). * * Each interface accepts output datagrams of a specified maximum * length, and provides higher level routines with input datagrams * received from its medium. * * Output occurs when the routine if_output is called, with three parameters: * (*ifp->if_output)(ifp, m, dst, rt) * Here m is the mbuf chain to be sent and dst is the destination address. * The output routine encapsulates the supplied datagram if necessary, * and then transmits it on its medium. * * On input, each interface unwraps the data received by it, and either * places it on the input queue of an internetwork datagram routine * and posts the associated software interrupt, or passes the datagram to a raw * packet input routine. * * Routines exist for locating interfaces by their addresses * or for locating an interface on a certain network, as well as more general * routing and gateway routines maintaining information used to locate * interfaces. These routines live in the files if.c and route.c */ struct rtentry; /* ifa_rtrequest */ struct rt_addrinfo; /* ifa_rtrequest */ struct socket; struct carp_if; struct carp_softc; struct ifvlantrunk; struct route; /* if_output */ struct vnet; struct ifmedia; struct netmap_adapter; struct netdump_methods; #ifdef _KERNEL #include /* ifqueue only? 
*/ #include #include #endif /* _KERNEL */ #include #include /* XXX */ #include /* struct ifqueue */ #include /* XXX */ #include /* XXX */ #include /* if_link_task */ #define IF_DUNIT_NONE -1 #include TAILQ_HEAD(ifnethead, ifnet); /* we use TAILQs so that the order of */ TAILQ_HEAD(ifaddrhead, ifaddr); /* instantiation is preserved in the list */ TAILQ_HEAD(ifmultihead, ifmultiaddr); TAILQ_HEAD(ifgrouphead, ifg_group); #ifdef _KERNEL VNET_DECLARE(struct pfil_head, link_pfil_hook); /* packet filter hooks */ #define V_link_pfil_hook VNET(link_pfil_hook) #define HHOOK_IPSEC_INET 0 #define HHOOK_IPSEC_INET6 1 #define HHOOK_IPSEC_COUNT 2 VNET_DECLARE(struct hhook_head *, ipsec_hhh_in[HHOOK_IPSEC_COUNT]); VNET_DECLARE(struct hhook_head *, ipsec_hhh_out[HHOOK_IPSEC_COUNT]); #define V_ipsec_hhh_in VNET(ipsec_hhh_in) #define V_ipsec_hhh_out VNET(ipsec_hhh_out) #endif /* _KERNEL */ typedef enum { IFCOUNTER_IPACKETS = 0, IFCOUNTER_IERRORS, IFCOUNTER_OPACKETS, IFCOUNTER_OERRORS, IFCOUNTER_COLLISIONS, IFCOUNTER_IBYTES, IFCOUNTER_OBYTES, IFCOUNTER_IMCASTS, IFCOUNTER_OMCASTS, IFCOUNTER_IQDROPS, IFCOUNTER_OQDROPS, IFCOUNTER_NOPROTO, IFCOUNTERS /* Array size. */ } ift_counter; typedef struct ifnet * if_t; typedef void (*if_start_fn_t)(if_t); typedef int (*if_ioctl_fn_t)(if_t, u_long, caddr_t); typedef void (*if_init_fn_t)(void *); typedef void (*if_qflush_fn_t)(if_t); typedef int (*if_transmit_fn_t)(if_t, struct mbuf *); typedef uint64_t (*if_get_counter_t)(if_t, ift_counter); struct ifnet_hw_tsomax { u_int tsomaxbytes; /* TSO total burst length limit in bytes */ u_int tsomaxsegcount; /* TSO maximum segment count */ u_int tsomaxsegsize; /* TSO maximum segment size in bytes */ }; /* Interface encap request types */ typedef enum { IFENCAP_LL = 1 /* pre-calculate link-layer header */ } ife_type; /* * The structure below allows to request various pre-calculated L2/L3 headers * for different media. Requests varies by type (rtype field). * * IFENCAP_LL type: pre-calculates link header based on address family * and destination lladdr. * * Input data fields: * buf: pointer to destination buffer * bufsize: buffer size * flags: IFENCAP_FLAG_BROADCAST if destination is broadcast * family: address family defined by AF_ constant. * lladdr: pointer to link-layer address * lladdr_len: length of link-layer address * hdata: pointer to L3 header (optional, used for ARP requests). * Output data fields: * buf: encap data is stored here * bufsize: resulting encap length is stored here * lladdr_off: offset of link-layer address from encap hdr start * hdata: L3 header may be altered if necessary */ struct if_encap_req { u_char *buf; /* Destination buffer (w) */ size_t bufsize; /* size of provided buffer (r) */ ife_type rtype; /* request type (r) */ uint32_t flags; /* Request flags (r) */ int family; /* Address family AF_* (r) */ int lladdr_off; /* offset from header start (w) */ int lladdr_len; /* lladdr length (r) */ char *lladdr; /* link-level address pointer (r) */ char *hdata; /* Upper layer header data (rw) */ }; #define IFENCAP_FLAG_BROADCAST 0x02 /* Destination is broadcast */ /* * Network interface send tag support. The storage of "struct * m_snd_tag" comes from the network driver and it is free to allocate * as much additional space as it wants for its own use. 
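 *
 * A driver typically wraps the tag in a larger, private structure of
 * its own; an illustrative sketch only (the mydrv_* names are
 * hypothetical):
 *
 *	struct mydrv_snd_tag {
 *		struct m_snd_tag	tag;
 *		uint64_t		max_rate;
 *	};
 *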
*/ struct m_snd_tag; #define IF_SND_TAG_TYPE_RATE_LIMIT 0 #define IF_SND_TAG_TYPE_UNLIMITED 1 #define IF_SND_TAG_TYPE_MAX 2 struct if_snd_tag_alloc_header { uint32_t type; /* send tag type, see IF_SND_TAG_XXX */ uint32_t flowid; /* mbuf hash value */ uint32_t flowtype; /* mbuf hash type */ }; struct if_snd_tag_alloc_rate_limit { struct if_snd_tag_alloc_header hdr; uint64_t max_rate; /* in bytes/s */ }; struct if_snd_tag_rate_limit_params { uint64_t max_rate; /* in bytes/s */ uint32_t queue_level; /* 0 (empty) .. 65535 (full) */ #define IF_SND_QUEUE_LEVEL_MIN 0 #define IF_SND_QUEUE_LEVEL_MAX 65535 uint32_t reserved; /* padding */ }; union if_snd_tag_alloc_params { struct if_snd_tag_alloc_header hdr; struct if_snd_tag_alloc_rate_limit rate_limit; struct if_snd_tag_alloc_rate_limit unlimited; }; union if_snd_tag_modify_params { struct if_snd_tag_rate_limit_params rate_limit; struct if_snd_tag_rate_limit_params unlimited; }; union if_snd_tag_query_params { struct if_snd_tag_rate_limit_params rate_limit; struct if_snd_tag_rate_limit_params unlimited; }; typedef int (if_snd_tag_alloc_t)(struct ifnet *, union if_snd_tag_alloc_params *, struct m_snd_tag **); typedef int (if_snd_tag_modify_t)(struct m_snd_tag *, union if_snd_tag_modify_params *); typedef int (if_snd_tag_query_t)(struct m_snd_tag *, union if_snd_tag_query_params *); typedef void (if_snd_tag_free_t)(struct m_snd_tag *); /* * Structure defining a network interface. */ struct ifnet { /* General book keeping of interface lists. */ TAILQ_ENTRY(ifnet) if_link; /* all struct ifnets are chained */ LIST_ENTRY(ifnet) if_clones; /* interfaces of a cloner */ TAILQ_HEAD(, ifg_list) if_groups; /* linked list of groups per if */ /* protected by if_addr_lock */ u_char if_alloctype; /* if_type at time of allocation */ /* Driver and protocol specific information that remains stable. */ void *if_softc; /* pointer to driver state */ void *if_llsoftc; /* link layer softc */ void *if_l2com; /* pointer to protocol bits */ const char *if_dname; /* driver name */ int if_dunit; /* unit or IF_DUNIT_NONE */ u_short if_index; /* numeric abbreviation for this if */ short if_index_reserved; /* spare space to grow if_index */ char if_xname[IFNAMSIZ]; /* external name (name + unit) */ char *if_description; /* interface description */ /* Variable fields that are touched by the stack and drivers. */ int if_flags; /* up/down, broadcast, etc. */ int if_drv_flags; /* driver-managed status flags */ int if_capabilities; /* interface features & capabilities */ int if_capenable; /* enabled features & capabilities */ void *if_linkmib; /* link-type-specific MIB data */ size_t if_linkmiblen; /* length of above data */ u_int if_refcount; /* reference count */ /* These fields are shared with struct if_data. */ uint8_t if_type; /* ethernet, tokenring, etc */ uint8_t if_addrlen; /* media address length */ uint8_t if_hdrlen; /* media header length */ uint8_t if_link_state; /* current link state */ uint32_t if_mtu; /* maximum transmission unit */ uint32_t if_metric; /* routing metric (external only) */ uint64_t if_baudrate; /* linespeed */ uint64_t if_hwassist; /* HW offload capabilities, see IFCAP */ time_t if_epoch; /* uptime at attach or stat reset */ struct timeval if_lastchange; /* time of last administrative change */ struct ifaltq if_snd; /* output queue (includes altq) */ struct task if_linktask; /* task for link change events */ /* Addresses of different protocol families assigned to this if. 
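 *
 * Readers walk these lists under the address lock; an illustrative
 * sketch of the usual pattern (compare ifconf() in if.c), with 'ifa'
 * a local struct ifaddr pointer:
 *
 *	IF_ADDR_RLOCK(ifp);
 *	TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 *		... inspect ifa->ifa_addr ...
 *	}
 *	IF_ADDR_RUNLOCK(ifp);
 *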
*/ struct rwlock if_addr_lock; /* lock to protect address lists */ /* * if_addrhead is the list of all addresses associated to * an interface. * Some code in the kernel assumes that first element * of the list has type AF_LINK, and contains sockaddr_dl * addresses which store the link-level address and the name * of the interface. * However, access to the AF_LINK address through this * field is deprecated. Use if_addr or ifaddr_byindex() instead. */ struct ifaddrhead if_addrhead; /* linked list of addresses per if */ struct ifmultihead if_multiaddrs; /* multicast addresses configured */ int if_amcount; /* number of all-multicast requests */ struct ifaddr *if_addr; /* pointer to link-level address */ void *if_hw_addr; /* hardware link-level address */ const u_int8_t *if_broadcastaddr; /* linklevel broadcast bytestring */ struct rwlock if_afdata_lock; void *if_afdata[AF_MAX]; int if_afdata_initialized; /* Additional features hung off the interface. */ u_int if_fib; /* interface FIB */ struct vnet *if_vnet; /* pointer to network stack instance */ struct vnet *if_home_vnet; /* where this ifnet originates from */ struct ifvlantrunk *if_vlantrunk; /* pointer to 802.1q data */ struct bpf_if *if_bpf; /* packet filter structure */ int if_pcount; /* number of promiscuous listeners */ void *if_bridge; /* bridge glue */ void *if_lagg; /* lagg glue */ void *if_pf_kif; /* pf glue */ struct carp_if *if_carp; /* carp interface structure */ struct label *if_label; /* interface MAC label */ struct netmap_adapter *if_netmap; /* netmap(4) softc */ /* Various procedures of the layer2 encapsulation and drivers. */ int (*if_output) /* output routine (enqueue) */ (struct ifnet *, struct mbuf *, const struct sockaddr *, struct route *); void (*if_input) /* input routine (from h/w driver) */ (struct ifnet *, struct mbuf *); if_start_fn_t if_start; /* initiate output routine */ if_ioctl_fn_t if_ioctl; /* ioctl routine */ if_init_fn_t if_init; /* Init routine */ int (*if_resolvemulti) /* validate/resolve multicast */ (struct ifnet *, struct sockaddr **, struct sockaddr *); if_qflush_fn_t if_qflush; /* flush any queue */ if_transmit_fn_t if_transmit; /* initiate output routine */ void (*if_reassign) /* reassign to vnet routine */ (struct ifnet *, struct vnet *, char *); if_get_counter_t if_get_counter; /* get counter values */ int (*if_requestencap) /* make link header from request */ (struct ifnet *, struct if_encap_req *); /* Statistics. */ counter_u64_t if_counters[IFCOUNTERS]; /* Stuff that's only temporary and doesn't belong here. */ /* * Network adapter TSO limits: * =========================== * * If the "if_hw_tsomax" field is zero the maximum segment * length limit does not apply. If the "if_hw_tsomaxsegcount" * or the "if_hw_tsomaxsegsize" field is zero the TSO segment * count limit does not apply. If all three fields are zero, * there is no TSO limit. * * NOTE: The TSO limits should reflect the values used in the * BUSDMA tag a network adapter is using to load a mbuf chain * for transmission. The TCP/IP network stack will subtract * space for all linklevel and protocol level headers and * ensure that the full mbuf chain passed to the network * adapter fits within the given limits. 
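 *
 * For example, a hypothetical driver whose DMA engine handles at most
 * 64 kilobytes per TSO burst, 32 segments and 4 kilobytes per segment
 * could advertise those limits with the accessors from if.c (sketch
 * only):
 *
 *	if_sethwtsomax(ifp, 65536);
 *	if_sethwtsomaxsegcount(ifp, 32);
 *	if_sethwtsomaxsegsize(ifp, 4096);
 *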
*/ u_int if_hw_tsomax; /* TSO maximum size in bytes */ u_int if_hw_tsomaxsegcount; /* TSO maximum segment count */ u_int if_hw_tsomaxsegsize; /* TSO maximum segment size in bytes */ /* * Network adapter send tag support: */ if_snd_tag_alloc_t *if_snd_tag_alloc; if_snd_tag_modify_t *if_snd_tag_modify; if_snd_tag_query_t *if_snd_tag_query; if_snd_tag_free_t *if_snd_tag_free; /* Ethernet PCP */ uint8_t if_pcp; /* * Netdump hooks to be called while dumping. */ struct netdump_methods *if_netdump_methods; /* * Spare fields to be added before branching a stable branch, so * that structure can be enhanced without changing the kernel * binary interface. */ int if_ispare[4]; /* general use */ }; /* for compatibility with other BSDs */ #define if_name(ifp) ((ifp)->if_xname) /* * Locks for address lists on the network interface. */ #define IF_ADDR_LOCK_INIT(if) rw_init(&(if)->if_addr_lock, "if_addr_lock") #define IF_ADDR_LOCK_DESTROY(if) rw_destroy(&(if)->if_addr_lock) #define IF_ADDR_WLOCK(if) rw_wlock(&(if)->if_addr_lock) #define IF_ADDR_WUNLOCK(if) rw_wunlock(&(if)->if_addr_lock) #define IF_ADDR_RLOCK(if) rw_rlock(&(if)->if_addr_lock) #define IF_ADDR_RUNLOCK(if) rw_runlock(&(if)->if_addr_lock) #define IF_ADDR_LOCK_ASSERT(if) rw_assert(&(if)->if_addr_lock, RA_LOCKED) #define IF_ADDR_WLOCK_ASSERT(if) rw_assert(&(if)->if_addr_lock, RA_WLOCKED) /* * Function variations on locking macros intended to be used by loadable * kernel modules in order to divorce them from the internals of address list * locking. */ void if_addr_rlock(struct ifnet *ifp); /* if_addrhead */ void if_addr_runlock(struct ifnet *ifp); /* if_addrhead */ void if_maddr_rlock(if_t ifp); /* if_multiaddrs */ void if_maddr_runlock(if_t ifp); /* if_multiaddrs */ #ifdef _KERNEL #ifdef _SYS_EVENTHANDLER_H_ /* interface link layer address change event */ typedef void (*iflladdr_event_handler_t)(void *, struct ifnet *); EVENTHANDLER_DECLARE(iflladdr_event, iflladdr_event_handler_t); /* interface address change event */ typedef void (*ifaddr_event_handler_t)(void *, struct ifnet *); EVENTHANDLER_DECLARE(ifaddr_event, ifaddr_event_handler_t); /* new interface arrival event */ typedef void (*ifnet_arrival_event_handler_t)(void *, struct ifnet *); EVENTHANDLER_DECLARE(ifnet_arrival_event, ifnet_arrival_event_handler_t); /* interface departure event */ typedef void (*ifnet_departure_event_handler_t)(void *, struct ifnet *); EVENTHANDLER_DECLARE(ifnet_departure_event, ifnet_departure_event_handler_t); /* Interface link state change event */ typedef void (*ifnet_link_event_handler_t)(void *, struct ifnet *, int); EVENTHANDLER_DECLARE(ifnet_link_event, ifnet_link_event_handler_t); /* Interface up/down event */ #define IFNET_EVENT_UP 0 #define IFNET_EVENT_DOWN 1 #define IFNET_EVENT_PCP 2 /* priority code point, PCP */ typedef void (*ifnet_event_fn)(void *, struct ifnet *ifp, int event); EVENTHANDLER_DECLARE(ifnet_event, ifnet_event_fn); #endif /* _SYS_EVENTHANDLER_H_ */ /* * interface groups */ struct ifg_group { char ifg_group[IFNAMSIZ]; u_int ifg_refcnt; void *ifg_pf_kif; TAILQ_HEAD(, ifg_member) ifg_members; TAILQ_ENTRY(ifg_group) ifg_next; }; struct ifg_member { TAILQ_ENTRY(ifg_member) ifgm_next; struct ifnet *ifgm_ifp; }; struct ifg_list { struct ifg_group *ifgl_group; TAILQ_ENTRY(ifg_list) ifgl_next; }; #ifdef _SYS_EVENTHANDLER_H_ /* group attach event */ typedef void (*group_attach_event_handler_t)(void *, struct ifg_group *); EVENTHANDLER_DECLARE(group_attach_event, group_attach_event_handler_t); /* group detach event */ typedef void 
(*group_detach_event_handler_t)(void *, struct ifg_group *); EVENTHANDLER_DECLARE(group_detach_event, group_detach_event_handler_t); /* group change event */ typedef void (*group_change_event_handler_t)(void *, const char *); EVENTHANDLER_DECLARE(group_change_event, group_change_event_handler_t); #endif /* _SYS_EVENTHANDLER_H_ */ #define IF_AFDATA_LOCK_INIT(ifp) \ rw_init(&(ifp)->if_afdata_lock, "if_afdata") #define IF_AFDATA_WLOCK(ifp) rw_wlock(&(ifp)->if_afdata_lock) #define IF_AFDATA_RLOCK(ifp) rw_rlock(&(ifp)->if_afdata_lock) #define IF_AFDATA_WUNLOCK(ifp) rw_wunlock(&(ifp)->if_afdata_lock) #define IF_AFDATA_RUNLOCK(ifp) rw_runlock(&(ifp)->if_afdata_lock) #define IF_AFDATA_LOCK(ifp) IF_AFDATA_WLOCK(ifp) #define IF_AFDATA_UNLOCK(ifp) IF_AFDATA_WUNLOCK(ifp) #define IF_AFDATA_TRYLOCK(ifp) rw_try_wlock(&(ifp)->if_afdata_lock) #define IF_AFDATA_DESTROY(ifp) rw_destroy(&(ifp)->if_afdata_lock) #define IF_AFDATA_LOCK_ASSERT(ifp) rw_assert(&(ifp)->if_afdata_lock, RA_LOCKED) #define IF_AFDATA_RLOCK_ASSERT(ifp) rw_assert(&(ifp)->if_afdata_lock, RA_RLOCKED) #define IF_AFDATA_WLOCK_ASSERT(ifp) rw_assert(&(ifp)->if_afdata_lock, RA_WLOCKED) #define IF_AFDATA_UNLOCK_ASSERT(ifp) rw_assert(&(ifp)->if_afdata_lock, RA_UNLOCKED) /* * 72 was chosen below because it is the size of a TCP/IP * header (40) + the minimum mss (32). */ #define IF_MINMTU 72 #define IF_MAXMTU 65535 #define TOEDEV(ifp) ((ifp)->if_llsoftc) /* * The ifaddr structure contains information about one address * of an interface. They are maintained by the different address families, * are allocated and attached when an address is set, and are linked * together so all addresses for an interface can be located. * * NOTE: a 'struct ifaddr' is always at the beginning of a larger * chunk of malloc'ed memory, where we store the three addresses * (ifa_addr, ifa_dstaddr and ifa_netmask) referenced here. */ struct ifaddr { struct sockaddr *ifa_addr; /* address of interface */ struct sockaddr *ifa_dstaddr; /* other end of p-to-p link */ #define ifa_broadaddr ifa_dstaddr /* broadcast address interface */ struct sockaddr *ifa_netmask; /* used to determine subnet */ struct ifnet *ifa_ifp; /* back-pointer to interface */ struct carp_softc *ifa_carp; /* pointer to CARP data */ TAILQ_ENTRY(ifaddr) ifa_link; /* queue macro glue */ void (*ifa_rtrequest) /* check or clean routes (+ or -)'d */ (int, struct rtentry *, struct rt_addrinfo *); u_short ifa_flags; /* mostly rt_flags for cloning */ #define IFA_ROUTE RTF_UP /* route installed */ #define IFA_RTSELF RTF_HOST /* loopback route to self installed */ u_int ifa_refcnt; /* references to this structure */ counter_u64_t ifa_ipackets; counter_u64_t ifa_opackets; counter_u64_t ifa_ibytes; counter_u64_t ifa_obytes; }; struct ifaddr * ifa_alloc(size_t size, int flags); void ifa_free(struct ifaddr *ifa); void ifa_ref(struct ifaddr *ifa); /* * Multicast address structure. This is analogous to the ifaddr * structure except that it keeps track of multicast addresses. 
*/ struct ifmultiaddr { TAILQ_ENTRY(ifmultiaddr) ifma_link; /* queue macro glue */ struct sockaddr *ifma_addr; /* address this membership is for */ struct sockaddr *ifma_lladdr; /* link-layer translation, if any */ struct ifnet *ifma_ifp; /* back-pointer to interface */ u_int ifma_refcount; /* reference count */ void *ifma_protospec; /* protocol-specific state, if any */ struct ifmultiaddr *ifma_llifma; /* pointer to ifma for ifma_lladdr */ }; extern struct rwlock ifnet_rwlock; extern struct sx ifnet_sxlock; #define IFNET_WLOCK() do { \ sx_xlock(&ifnet_sxlock); \ rw_wlock(&ifnet_rwlock); \ } while (0) #define IFNET_WUNLOCK() do { \ rw_wunlock(&ifnet_rwlock); \ sx_xunlock(&ifnet_sxlock); \ } while (0) /* * To assert the ifnet lock, you must know not only whether it's for read or * write, but also whether it was acquired with sleep support or not. */ #define IFNET_RLOCK_ASSERT() sx_assert(&ifnet_sxlock, SA_SLOCKED) #define IFNET_RLOCK_NOSLEEP_ASSERT() rw_assert(&ifnet_rwlock, RA_RLOCKED) #define IFNET_WLOCK_ASSERT() do { \ sx_assert(&ifnet_sxlock, SA_XLOCKED); \ rw_assert(&ifnet_rwlock, RA_WLOCKED); \ } while (0) #define IFNET_RLOCK() sx_slock(&ifnet_sxlock) #define IFNET_RLOCK_NOSLEEP() rw_rlock(&ifnet_rwlock) #define IFNET_RUNLOCK() sx_sunlock(&ifnet_sxlock) #define IFNET_RUNLOCK_NOSLEEP() rw_runlock(&ifnet_rwlock) /* * Look up an ifnet given its index; the _ref variant also acquires a * reference that must be freed using if_rele(). It is almost always a bug * to call ifnet_byindex() instead of ifnet_byindex_ref(). */ struct ifnet *ifnet_byindex(u_short idx); struct ifnet *ifnet_byindex_locked(u_short idx); struct ifnet *ifnet_byindex_ref(u_short idx); /* * Given the index, ifaddr_byindex() returns the one and only * link-level ifaddr for the interface. You are not supposed to use * it to traverse the list of addresses associated to the interface. */ struct ifaddr *ifaddr_byindex(u_short idx); VNET_DECLARE(struct ifnethead, ifnet); VNET_DECLARE(struct ifgrouphead, ifg_head); VNET_DECLARE(int, if_index); VNET_DECLARE(struct ifnet *, loif); /* first loopback interface */ #define V_ifnet VNET(ifnet) #define V_ifg_head VNET(ifg_head) #define V_if_index VNET(if_index) #define V_loif VNET(loif) +#ifdef MCAST_VERBOSE +#define MCDPRINTF printf +#else +#define MCDPRINTF(...) +#endif + int if_addgroup(struct ifnet *, const char *); int if_delgroup(struct ifnet *, const char *); int if_addmulti(struct ifnet *, struct sockaddr *, struct ifmultiaddr **); int if_allmulti(struct ifnet *, int); struct ifnet* if_alloc(u_char); void if_attach(struct ifnet *); void if_dead(struct ifnet *); int if_delmulti(struct ifnet *, struct sockaddr *); void if_delmulti_ifma(struct ifmultiaddr *); +void if_delmulti_ifma_flags(struct ifmultiaddr *, int flags); void if_detach(struct ifnet *); void if_purgeaddrs(struct ifnet *); void if_delallmulti(struct ifnet *); void if_down(struct ifnet *); struct ifmultiaddr * if_findmulti(struct ifnet *, const struct sockaddr *); +void if_freemulti(struct ifmultiaddr *ifma); void if_free(struct ifnet *); void if_initname(struct ifnet *, const char *, int); void if_link_state_change(struct ifnet *, int); int if_printf(struct ifnet *, const char *, ...) 
__printflike(2, 3); void if_ref(struct ifnet *); void if_rele(struct ifnet *); int if_setlladdr(struct ifnet *, const u_char *, int); void if_up(struct ifnet *); int ifioctl(struct socket *, u_long, caddr_t, struct thread *); int ifpromisc(struct ifnet *, int); struct ifnet *ifunit(const char *); struct ifnet *ifunit_ref(const char *); int ifa_add_loopback_route(struct ifaddr *, struct sockaddr *); int ifa_del_loopback_route(struct ifaddr *, struct sockaddr *); int ifa_switch_loopback_route(struct ifaddr *, struct sockaddr *); struct ifaddr *ifa_ifwithaddr(const struct sockaddr *); int ifa_ifwithaddr_check(const struct sockaddr *); struct ifaddr *ifa_ifwithbroadaddr(const struct sockaddr *, int); struct ifaddr *ifa_ifwithdstaddr(const struct sockaddr *, int); struct ifaddr *ifa_ifwithnet(const struct sockaddr *, int, int); struct ifaddr *ifa_ifwithroute(int, const struct sockaddr *, struct sockaddr *, u_int); struct ifaddr *ifaof_ifpforaddr(const struct sockaddr *, struct ifnet *); int ifa_preferred(struct ifaddr *, struct ifaddr *); int if_simloop(struct ifnet *ifp, struct mbuf *m, int af, int hlen); typedef void *if_com_alloc_t(u_char type, struct ifnet *ifp); typedef void if_com_free_t(void *com, u_char type); void if_register_com_alloc(u_char type, if_com_alloc_t *a, if_com_free_t *f); void if_deregister_com_alloc(u_char type); void if_data_copy(struct ifnet *, struct if_data *); uint64_t if_get_counter_default(struct ifnet *, ift_counter); void if_inc_counter(struct ifnet *, ift_counter, int64_t); #define IF_LLADDR(ifp) \ LLADDR((struct sockaddr_dl *)((ifp)->if_addr->ifa_addr)) uint64_t if_setbaudrate(if_t ifp, uint64_t baudrate); uint64_t if_getbaudrate(if_t ifp); int if_setcapabilities(if_t ifp, int capabilities); int if_setcapabilitiesbit(if_t ifp, int setbit, int clearbit); int if_getcapabilities(if_t ifp); int if_togglecapenable(if_t ifp, int togglecap); int if_setcapenable(if_t ifp, int capenable); int if_setcapenablebit(if_t ifp, int setcap, int clearcap); int if_getcapenable(if_t ifp); const char *if_getdname(if_t ifp); int if_setdev(if_t ifp, void *dev); int if_setdrvflagbits(if_t ifp, int if_setflags, int clear_flags); int if_getdrvflags(if_t ifp); int if_setdrvflags(if_t ifp, int flags); int if_clearhwassist(if_t ifp); int if_sethwassistbits(if_t ifp, int toset, int toclear); int if_sethwassist(if_t ifp, int hwassist_bit); int if_gethwassist(if_t ifp); int if_setsoftc(if_t ifp, void *softc); void *if_getsoftc(if_t ifp); int if_setflags(if_t ifp, int flags); int if_gethwaddr(if_t ifp, struct ifreq *); int if_setmtu(if_t ifp, int mtu); int if_getmtu(if_t ifp); int if_getmtu_family(if_t ifp, int family); int if_setflagbits(if_t ifp, int set, int clear); int if_getflags(if_t ifp); int if_sendq_empty(if_t ifp); int if_setsendqready(if_t ifp); int if_setsendqlen(if_t ifp, int tx_desc_count); int if_sethwtsomax(if_t ifp, u_int if_hw_tsomax); int if_sethwtsomaxsegcount(if_t ifp, u_int if_hw_tsomaxsegcount); int if_sethwtsomaxsegsize(if_t ifp, u_int if_hw_tsomaxsegsize); u_int if_gethwtsomax(if_t ifp); u_int if_gethwtsomaxsegcount(if_t ifp); u_int if_gethwtsomaxsegsize(if_t ifp); int if_input(if_t ifp, struct mbuf* sendmp); int if_sendq_prepend(if_t ifp, struct mbuf *m); struct mbuf *if_dequeue(if_t ifp); int if_setifheaderlen(if_t ifp, int len); void if_setrcvif(struct mbuf *m, if_t ifp); void if_setvtag(struct mbuf *m, u_int16_t tag); u_int16_t if_getvtag(struct mbuf *m); int if_vlantrunkinuse(if_t ifp); caddr_t if_getlladdr(if_t ifp); void *if_gethandle(u_char); void 
if_bpfmtap(if_t ifp, struct mbuf *m); void if_etherbpfmtap(if_t ifp, struct mbuf *m); void if_vlancap(if_t ifp); int if_setupmultiaddr(if_t ifp, void *mta, int *cnt, int max); int if_multiaddr_array(if_t ifp, void *mta, int *cnt, int max); int if_multiaddr_count(if_t ifp, int max); int if_multi_apply(struct ifnet *ifp, int (*filter)(void *, struct ifmultiaddr *, int), void *arg); int if_getamcount(if_t ifp); struct ifaddr * if_getifaddr(if_t ifp); /* Functions */ void if_setinitfn(if_t ifp, void (*)(void *)); void if_setioctlfn(if_t ifp, int (*)(if_t, u_long, caddr_t)); void if_setstartfn(if_t ifp, void (*)(if_t)); void if_settransmitfn(if_t ifp, if_transmit_fn_t); void if_setqflushfn(if_t ifp, if_qflush_fn_t); void if_setgetcounterfn(if_t ifp, if_get_counter_t); /* Revisit the below. These are inline functions originally */ int drbr_inuse_drv(if_t ifp, struct buf_ring *br); struct mbuf* drbr_dequeue_drv(if_t ifp, struct buf_ring *br); int drbr_needs_enqueue_drv(if_t ifp, struct buf_ring *br); int drbr_enqueue_drv(if_t ifp, struct buf_ring *br, struct mbuf *m); /* TSO */ void if_hw_tsomax_common(if_t ifp, struct ifnet_hw_tsomax *); int if_hw_tsomax_update(if_t ifp, struct ifnet_hw_tsomax *); /* accessors for struct ifreq */ void *ifr_data_get_ptr(void *ifrp); #ifdef DEVICE_POLLING enum poll_cmd { POLL_ONLY, POLL_AND_CHECK_STATUS }; typedef int poll_handler_t(if_t ifp, enum poll_cmd cmd, int count); int ether_poll_register(poll_handler_t *h, if_t ifp); int ether_poll_deregister(if_t ifp); #endif /* DEVICE_POLLING */ #endif /* _KERNEL */ #include /* XXXAO: temporary unconditional include */ #endif /* !_NET_IF_VAR_H_ */ Index: head/sys/netinet/igmp.c =================================================================== --- head/sys/netinet/igmp.c (revision 333308) +++ head/sys/netinet/igmp.c (revision 333309) @@ -1,3649 +1,3655 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2007-2009 Bruce Simpson. * Copyright (c) 1988 Stephen Deering. * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Stephen Deering of Stanford University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)igmp.c 8.1 (Berkeley) 7/19/93 */ /* * Internet Group Management Protocol (IGMP) routines. * [RFC1112, RFC2236, RFC3376] * * Written by Steve Deering, Stanford, May 1988. * Modified by Rosen Sharma, Stanford, Aug 1994. * Modified by Bill Fenner, Xerox PARC, Feb 1995. * Modified to fully comply to IGMPv2 by Bill Fenner, Oct 1995. * Significantly rewritten for IGMPv3, VIMAGE, and SMP by Bruce Simpson. * * MULTICAST Revision: 3.5.1.4 */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef KTR_IGMPV3 #define KTR_IGMPV3 KTR_INET #endif static struct igmp_ifsoftc * igi_alloc_locked(struct ifnet *); static void igi_delete_locked(const struct ifnet *); static void igmp_dispatch_queue(struct mbufq *, int, const int); static void igmp_fasttimo_vnet(void); static void igmp_final_leave(struct in_multi *, struct igmp_ifsoftc *); static int igmp_handle_state_change(struct in_multi *, struct igmp_ifsoftc *); static int igmp_initial_join(struct in_multi *, struct igmp_ifsoftc *); static int igmp_input_v1_query(struct ifnet *, const struct ip *, const struct igmp *); static int igmp_input_v2_query(struct ifnet *, const struct ip *, const struct igmp *); static int igmp_input_v3_query(struct ifnet *, const struct ip *, /*const*/ struct igmpv3 *); static int igmp_input_v3_group_query(struct in_multi *, struct igmp_ifsoftc *, int, /*const*/ struct igmpv3 *); static int igmp_input_v1_report(struct ifnet *, /*const*/ struct ip *, /*const*/ struct igmp *); static int igmp_input_v2_report(struct ifnet *, /*const*/ struct ip *, /*const*/ struct igmp *); static void igmp_intr(struct mbuf *); static int igmp_isgroupreported(const struct in_addr); static struct mbuf * igmp_ra_alloc(void); #ifdef KTR static char * igmp_rec_type_to_str(const int); #endif static void igmp_set_version(struct igmp_ifsoftc *, const int); static void igmp_slowtimo_vnet(void); static int igmp_v1v2_queue_report(struct in_multi *, const int); static void igmp_v1v2_process_group_timer(struct in_multi *, const int); static void igmp_v1v2_process_querier_timers(struct igmp_ifsoftc *); static void igmp_v2_update_group(struct in_multi *, const int); static void igmp_v3_cancel_link_timers(struct igmp_ifsoftc *); static void igmp_v3_dispatch_general_query(struct igmp_ifsoftc *); static struct mbuf * igmp_v3_encap_report(struct ifnet *, struct mbuf *); static int igmp_v3_enqueue_group_record(struct mbufq *, struct in_multi *, const int, const int, const int); static int igmp_v3_enqueue_filter_change(struct mbufq *, struct in_multi *); static void igmp_v3_process_group_timers(struct in_multi_head *, struct mbufq *, struct mbufq *, struct in_multi *, const int); static int igmp_v3_merge_state_changes(struct in_multi *, struct mbufq *); static void 
igmp_v3_suppress_group_record(struct in_multi *); static int sysctl_igmp_default_version(SYSCTL_HANDLER_ARGS); static int sysctl_igmp_gsr(SYSCTL_HANDLER_ARGS); static int sysctl_igmp_ifinfo(SYSCTL_HANDLER_ARGS); static const struct netisr_handler igmp_nh = { .nh_name = "igmp", .nh_handler = igmp_intr, .nh_proto = NETISR_IGMP, .nh_policy = NETISR_POLICY_SOURCE, }; /* * System-wide globals. * * Unlocked access to these is OK, except for the global IGMP output * queue. The IGMP subsystem lock ends up being system-wide for the moment, * because all VIMAGEs have to share a global output queue, as netisrs * themselves are not virtualized. * * Locking: * * The permitted lock order is: IN_MULTI_LIST_LOCK, IGMP_LOCK, IF_ADDR_LOCK. * Any may be taken independently; if any are held at the same * time, the above lock order must be followed. * * All output is delegated to the netisr. * Now that Giant has been eliminated, the netisr may be inlined. * * IN_MULTI_LIST_LOCK covers in_multi. * * IGMP_LOCK covers igmp_ifsoftc and any global variables in this file, * including the output queue. * * IF_ADDR_LOCK covers if_multiaddrs, which is used for a variety of * per-link state iterators. * * igmp_ifsoftc is valid as long as PF_INET is attached to the interface, * therefore it is not refcounted. * We allow unlocked reads of igmp_ifsoftc when accessed via in_multi. * * Reference counting * * IGMP acquires its own reference every time an in_multi is passed to * it and the group is being joined for the first time. * * IGMP releases its reference(s) on in_multi in a deferred way, * because the operations which process the release run as part of * a loop whose control variables are directly affected by the release * (that, and not recursing on the IF_ADDR_LOCK). * * VIMAGE: Each in_multi corresponds to an ifp, and each ifp corresponds * to a vnet in ifp->if_vnet. * * SMPng: XXX We may potentially race operations on ifma_protospec. * The problem is that we currently lack a clean way of taking the * IF_ADDR_LOCK() between the ifnet and in layers w/o recursing, * as anything which modifies ifma needs to be covered by that lock. * So check for ifma_protospec being NULL before proceeding. */ struct mtx igmp_mtx; struct mbuf *m_raopt; /* Router Alert option */ static MALLOC_DEFINE(M_IGMP, "igmp", "igmp state"); /* * VIMAGE-wide globals. * * The IGMPv3 timers themselves need to run per-image, however, * protosw timers run globally (see tcp). * An ifnet can only be in one vimage at a time, and the loopback * ifnet, loif, is itself virtualized. * It would otherwise be possible to seriously hose IGMP state, * and create inconsistencies in upstream multicast routing, if you have * multiple VIMAGEs running on the same link joining different multicast * groups, UNLESS the "primary IP address" is different. This is because * IGMP for IPv4 does not force link-local addresses to be used for each * node, unlike MLD for IPv6. * Obviously the IGMPv3 per-interface state has per-vimage granularity * also as a result. * * FUTURE: Stop using IFP_TO_IA/INADDR_ANY, and use source address selection * policy to control the address used by IGMP on the link. 
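 *
 * As a concrete illustration of the lock order documented above, the
 * query and timer paths in this file typically nest the locks as
 * follows (a sketch only; read or write variants of the IF_ADDR lock
 * are taken depending on the path):
 *
 *	IN_MULTI_LIST_LOCK();
 *	IGMP_LOCK();
 *	IF_ADDR_RLOCK(ifp);
 *	... walk ifp->if_multiaddrs ...
 *	IF_ADDR_RUNLOCK(ifp);
 *	IGMP_UNLOCK();
 *	IN_MULTI_LIST_UNLOCK();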
*/ static VNET_DEFINE(int, interface_timers_running); /* IGMPv3 general * query response */ static VNET_DEFINE(int, state_change_timers_running); /* IGMPv3 state-change * retransmit */ static VNET_DEFINE(int, current_state_timers_running); /* IGMPv1/v2 host * report; IGMPv3 g/sg * query response */ #define V_interface_timers_running VNET(interface_timers_running) #define V_state_change_timers_running VNET(state_change_timers_running) #define V_current_state_timers_running VNET(current_state_timers_running) static VNET_DEFINE(LIST_HEAD(, igmp_ifsoftc), igi_head) = LIST_HEAD_INITIALIZER(igi_head); static VNET_DEFINE(struct igmpstat, igmpstat) = { .igps_version = IGPS_VERSION_3, .igps_len = sizeof(struct igmpstat), }; static VNET_DEFINE(struct timeval, igmp_gsrdelay) = {10, 0}; #define V_igi_head VNET(igi_head) #define V_igmpstat VNET(igmpstat) #define V_igmp_gsrdelay VNET(igmp_gsrdelay) static VNET_DEFINE(int, igmp_recvifkludge) = 1; static VNET_DEFINE(int, igmp_sendra) = 1; static VNET_DEFINE(int, igmp_sendlocal) = 1; static VNET_DEFINE(int, igmp_v1enable) = 1; static VNET_DEFINE(int, igmp_v2enable) = 1; static VNET_DEFINE(int, igmp_legacysupp); static VNET_DEFINE(int, igmp_default_version) = IGMP_VERSION_3; #define V_igmp_recvifkludge VNET(igmp_recvifkludge) #define V_igmp_sendra VNET(igmp_sendra) #define V_igmp_sendlocal VNET(igmp_sendlocal) #define V_igmp_v1enable VNET(igmp_v1enable) #define V_igmp_v2enable VNET(igmp_v2enable) #define V_igmp_legacysupp VNET(igmp_legacysupp) #define V_igmp_default_version VNET(igmp_default_version) /* * Virtualized sysctls. */ SYSCTL_STRUCT(_net_inet_igmp, IGMPCTL_STATS, stats, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(igmpstat), igmpstat, ""); SYSCTL_INT(_net_inet_igmp, OID_AUTO, recvifkludge, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(igmp_recvifkludge), 0, "Rewrite IGMPv1/v2 reports from 0.0.0.0 to contain subnet address"); SYSCTL_INT(_net_inet_igmp, OID_AUTO, sendra, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(igmp_sendra), 0, "Send IP Router Alert option in IGMPv2/v3 messages"); SYSCTL_INT(_net_inet_igmp, OID_AUTO, sendlocal, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(igmp_sendlocal), 0, "Send IGMP membership reports for 224.0.0.0/24 groups"); SYSCTL_INT(_net_inet_igmp, OID_AUTO, v1enable, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(igmp_v1enable), 0, "Enable backwards compatibility with IGMPv1"); SYSCTL_INT(_net_inet_igmp, OID_AUTO, v2enable, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(igmp_v2enable), 0, "Enable backwards compatibility with IGMPv2"); SYSCTL_INT(_net_inet_igmp, OID_AUTO, legacysupp, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(igmp_legacysupp), 0, "Allow v1/v2 reports to suppress v3 group responses"); SYSCTL_PROC(_net_inet_igmp, OID_AUTO, default_version, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, &VNET_NAME(igmp_default_version), 0, sysctl_igmp_default_version, "I", "Default version of IGMP to run on each interface"); SYSCTL_PROC(_net_inet_igmp, OID_AUTO, gsrdelay, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, &VNET_NAME(igmp_gsrdelay.tv_sec), 0, sysctl_igmp_gsr, "I", "Rate limit for IGMPv3 Group-and-Source queries in seconds"); /* * Non-virtualized sysctls. 
*/ static SYSCTL_NODE(_net_inet_igmp, OID_AUTO, ifinfo, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_igmp_ifinfo, "Per-interface IGMPv3 state"); static __inline void igmp_save_context(struct mbuf *m, struct ifnet *ifp) { #ifdef VIMAGE m->m_pkthdr.PH_loc.ptr = ifp->if_vnet; #endif /* VIMAGE */ m->m_pkthdr.flowid = ifp->if_index; } static __inline void igmp_scrub_context(struct mbuf *m) { m->m_pkthdr.PH_loc.ptr = NULL; m->m_pkthdr.flowid = 0; } /* * Restore context from a queued IGMP output chain. * Return saved ifindex. * * VIMAGE: The assertion is there to make sure that we * actually called CURVNET_SET() with what's in the mbuf chain. */ static __inline uint32_t igmp_restore_context(struct mbuf *m) { #ifdef notyet #if defined(VIMAGE) && defined(INVARIANTS) KASSERT(curvnet == (m->m_pkthdr.PH_loc.ptr), ("%s: called when curvnet was not restored", __func__)); #endif #endif return (m->m_pkthdr.flowid); } /* * Retrieve or set default IGMP version. * * VIMAGE: Assume curvnet set by caller. * SMPng: NOTE: Serialized by IGMP lock. */ static int sysctl_igmp_default_version(SYSCTL_HANDLER_ARGS) { int error; int new; error = sysctl_wire_old_buffer(req, sizeof(int)); if (error) return (error); IGMP_LOCK(); new = V_igmp_default_version; error = sysctl_handle_int(oidp, &new, 0, req); if (error || !req->newptr) goto out_locked; if (new < IGMP_VERSION_1 || new > IGMP_VERSION_3) { error = EINVAL; goto out_locked; } CTR2(KTR_IGMPV3, "change igmp_default_version from %d to %d", V_igmp_default_version, new); V_igmp_default_version = new; out_locked: IGMP_UNLOCK(); return (error); } /* * Retrieve or set threshold between group-source queries in seconds. * * VIMAGE: Assume curvnet set by caller. * SMPng: NOTE: Serialized by IGMP lock. */ static int sysctl_igmp_gsr(SYSCTL_HANDLER_ARGS) { int error; int i; error = sysctl_wire_old_buffer(req, sizeof(int)); if (error) return (error); IGMP_LOCK(); i = V_igmp_gsrdelay.tv_sec; error = sysctl_handle_int(oidp, &i, 0, req); if (error || !req->newptr) goto out_locked; if (i < -1 || i >= 60) { error = EINVAL; goto out_locked; } CTR2(KTR_IGMPV3, "change igmp_gsrdelay from %d to %d", V_igmp_gsrdelay.tv_sec, i); V_igmp_gsrdelay.tv_sec = i; out_locked: IGMP_UNLOCK(); return (error); } /* * Expose struct igmp_ifsoftc to userland, keyed by ifindex. * For use by ifmcstat(8). * * SMPng: NOTE: Does an unlocked ifindex space read. * VIMAGE: Assume curvnet set by caller. The node handler itself * is not directly virtualized. 
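 *
 * A minimal userland sketch of how this ifindex-keyed node can be
 * read; this is only an illustration, not the ifmcstat(8) code, error
 * handling is omitted, "em0" is an arbitrary example interface, and
 * the sketch assumes <sys/param.h>, <sys/sysctl.h>, <net/if.h>,
 * <netinet/in.h> and <netinet/igmp_var.h> are included:
 *
 *	int mib[CTL_MAXNAME];
 *	size_t miblen = nitems(mib);
 *	struct igmp_ifinfo info;
 *	size_t len = sizeof(info);
 *
 *	sysctlnametomib("net.inet.igmp.ifinfo", mib, &miblen);
 *	mib[miblen] = if_nametoindex("em0");
 *	sysctl(mib, miblen + 1, &info, &len, NULL, 0);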
*/ static int sysctl_igmp_ifinfo(SYSCTL_HANDLER_ARGS) { int *name; int error; u_int namelen; struct ifnet *ifp; struct igmp_ifsoftc *igi; name = (int *)arg1; namelen = arg2; if (req->newptr != NULL) return (EPERM); if (namelen != 1) return (EINVAL); error = sysctl_wire_old_buffer(req, sizeof(struct igmp_ifinfo)); if (error) return (error); IN_MULTI_LIST_LOCK(); IGMP_LOCK(); if (name[0] <= 0 || name[0] > V_if_index) { error = ENOENT; goto out_locked; } error = ENOENT; ifp = ifnet_byindex(name[0]); if (ifp == NULL) goto out_locked; LIST_FOREACH(igi, &V_igi_head, igi_link) { if (ifp == igi->igi_ifp) { struct igmp_ifinfo info; info.igi_version = igi->igi_version; info.igi_v1_timer = igi->igi_v1_timer; info.igi_v2_timer = igi->igi_v2_timer; info.igi_v3_timer = igi->igi_v3_timer; info.igi_flags = igi->igi_flags; info.igi_rv = igi->igi_rv; info.igi_qi = igi->igi_qi; info.igi_qri = igi->igi_qri; info.igi_uri = igi->igi_uri; error = SYSCTL_OUT(req, &info, sizeof(info)); break; } } out_locked: IGMP_UNLOCK(); IN_MULTI_LIST_UNLOCK(); return (error); } /* * Dispatch an entire queue of pending packet chains * using the netisr. * VIMAGE: Assumes the vnet pointer has been set. */ static void igmp_dispatch_queue(struct mbufq *mq, int limit, const int loop) { struct mbuf *m; while ((m = mbufq_dequeue(mq)) != NULL) { CTR3(KTR_IGMPV3, "%s: dispatch %p from %p", __func__, mq, m); if (loop) m->m_flags |= M_IGMP_LOOP; netisr_dispatch(NETISR_IGMP, m); if (--limit == 0) break; } } /* * Filter outgoing IGMP report state by group. * * Reports are ALWAYS suppressed for ALL-HOSTS (224.0.0.1). * If the net.inet.igmp.sendlocal sysctl is 0, then IGMP reports are * disabled for all groups in the 224.0.0.0/24 link-local scope. However, * this may break certain IGMP snooping switches which rely on the old * report behaviour. * * Return zero if the given group is one for which IGMP reports * should be suppressed, or non-zero if reports should be issued. */ static __inline int igmp_isgroupreported(const struct in_addr addr) { if (in_allhosts(addr) || ((!V_igmp_sendlocal && IN_LOCAL_GROUP(ntohl(addr.s_addr))))) return (0); return (1); } /* * Construct a Router Alert option to use in outgoing packets. */ static struct mbuf * igmp_ra_alloc(void) { struct mbuf *m; struct ipoption *p; m = m_get(M_WAITOK, MT_DATA); p = mtod(m, struct ipoption *); p->ipopt_dst.s_addr = INADDR_ANY; p->ipopt_list[0] = (char)IPOPT_RA; /* Router Alert Option */ p->ipopt_list[1] = 0x04; /* 4 bytes long */ p->ipopt_list[2] = IPOPT_EOL; /* End of IP option list */ p->ipopt_list[3] = 0x00; /* pad byte */ m->m_len = sizeof(p->ipopt_dst) + p->ipopt_list[1]; return (m); } /* * Attach IGMP when PF_INET is attached to an interface. */ struct igmp_ifsoftc * igmp_domifattach(struct ifnet *ifp) { struct igmp_ifsoftc *igi; CTR3(KTR_IGMPV3, "%s: called for ifp %p(%s)", __func__, ifp, ifp->if_xname); IGMP_LOCK(); igi = igi_alloc_locked(ifp); if (!(ifp->if_flags & IFF_MULTICAST)) igi->igi_flags |= IGIF_SILENT; IGMP_UNLOCK(); return (igi); } /* * VIMAGE: assume curvnet set by caller. 
*/ static struct igmp_ifsoftc * igi_alloc_locked(/*const*/ struct ifnet *ifp) { struct igmp_ifsoftc *igi; IGMP_LOCK_ASSERT(); igi = malloc(sizeof(struct igmp_ifsoftc), M_IGMP, M_NOWAIT|M_ZERO); if (igi == NULL) goto out; igi->igi_ifp = ifp; igi->igi_version = V_igmp_default_version; igi->igi_flags = 0; igi->igi_rv = IGMP_RV_INIT; igi->igi_qi = IGMP_QI_INIT; igi->igi_qri = IGMP_QRI_INIT; igi->igi_uri = IGMP_URI_INIT; mbufq_init(&igi->igi_gq, IGMP_MAX_RESPONSE_PACKETS); LIST_INSERT_HEAD(&V_igi_head, igi, igi_link); CTR2(KTR_IGMPV3, "allocate igmp_ifsoftc for ifp %p(%s)", ifp, ifp->if_xname); out: return (igi); } /* * Hook for ifdetach. * * NOTE: Some finalization tasks need to run before the protocol domain * is detached, but also before the link layer does its cleanup. * * SMPNG: igmp_ifdetach() needs to take IF_ADDR_LOCK(). * XXX This is also bitten by unlocked ifma_protospec access. */ void igmp_ifdetach(struct ifnet *ifp) { struct igmp_ifsoftc *igi; - struct ifmultiaddr *ifma; + struct ifmultiaddr *ifma, *next; struct in_multi *inm; struct in_multi_head inm_free_tmp; CTR3(KTR_IGMPV3, "%s: called for ifp %p(%s)", __func__, ifp, ifp->if_xname); SLIST_INIT(&inm_free_tmp); IGMP_LOCK(); igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp; if (igi->igi_version == IGMP_VERSION_3) { - IF_ADDR_RLOCK(ifp); - TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { + IF_ADDR_WLOCK(ifp); + restart: + TAILQ_FOREACH_SAFE(ifma, &ifp->if_multiaddrs, ifma_link, next) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) continue; -#if 0 - KASSERT(ifma->ifma_protospec != NULL, - ("%s: ifma_protospec is NULL", __func__)); -#endif inm = (struct in_multi *)ifma->ifma_protospec; if (inm->inm_state == IGMP_LEAVING_MEMBER) inm_rele_locked(&inm_free_tmp, inm); inm_clear_recorded(inm); + if (__predict_false(ifma_restart)) { + ifma_restart = false; + goto restart; + } } - IF_ADDR_RUNLOCK(ifp); + IF_ADDR_WUNLOCK(ifp); inm_release_list_deferred(&inm_free_tmp); } IGMP_UNLOCK(); } /* * Hook for domifdetach. */ void igmp_domifdetach(struct ifnet *ifp) { CTR3(KTR_IGMPV3, "%s: called for ifp %p(%s)", __func__, ifp, ifp->if_xname); IGMP_LOCK(); igi_delete_locked(ifp); IGMP_UNLOCK(); } static void igi_delete_locked(const struct ifnet *ifp) { struct igmp_ifsoftc *igi, *tigi; CTR3(KTR_IGMPV3, "%s: freeing igmp_ifsoftc for ifp %p(%s)", __func__, ifp, ifp->if_xname); IGMP_LOCK_ASSERT(); LIST_FOREACH_SAFE(igi, &V_igi_head, igi_link, tigi) { if (igi->igi_ifp == ifp) { /* * Free deferred General Query responses. */ mbufq_drain(&igi->igi_gq); LIST_REMOVE(igi, igi_link); free(igi, M_IGMP); return; } } } /* * Process a received IGMPv1 query. * Return non-zero if the message should be dropped. * * VIMAGE: The curvnet pointer is derived from the input ifp. */ static int igmp_input_v1_query(struct ifnet *ifp, const struct ip *ip, const struct igmp *igmp) { struct ifmultiaddr *ifma; struct igmp_ifsoftc *igi; struct in_multi *inm; /* * IGMPv1 Host Mmembership Queries SHOULD always be addressed to * 224.0.0.1. They are always treated as General Queries. * igmp_group is always ignored. Do not drop it as a userland * daemon may wish to see it. * XXX SMPng: unlocked increments in igmpstat assumed atomic. 
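 *
 * In the check below, in_allhosts() matches the all-hosts group
 * 224.0.0.1 and in_nullhost() matches 0.0.0.0; a v1 query that fails
 * either test is only counted via igps_rcv_badqueries and is not
 * processed further here.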
*/ if (!in_allhosts(ip->ip_dst) || !in_nullhost(igmp->igmp_group)) { IGMPSTAT_INC(igps_rcv_badqueries); return (0); } IGMPSTAT_INC(igps_rcv_gen_queries); IN_MULTI_LIST_LOCK(); IGMP_LOCK(); igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp; KASSERT(igi != NULL, ("%s: no igmp_ifsoftc for ifp %p", __func__, ifp)); if (igi->igi_flags & IGIF_LOOPBACK) { CTR2(KTR_IGMPV3, "ignore v1 query on IGIF_LOOPBACK ifp %p(%s)", ifp, ifp->if_xname); goto out_locked; } /* * Switch to IGMPv1 host compatibility mode. */ igmp_set_version(igi, IGMP_VERSION_1); CTR2(KTR_IGMPV3, "process v1 query on ifp %p(%s)", ifp, ifp->if_xname); /* * Start the timers in all of our group records * for the interface on which the query arrived, * except those which are already running. */ IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) continue; inm = (struct in_multi *)ifma->ifma_protospec; if (inm->inm_timer != 0) continue; switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: break; case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_SG_QUERY_PENDING_MEMBER: case IGMP_REPORTING_MEMBER: case IGMP_IDLE_MEMBER: case IGMP_LAZY_MEMBER: case IGMP_SLEEPING_MEMBER: case IGMP_AWAKENING_MEMBER: inm->inm_state = IGMP_REPORTING_MEMBER; inm->inm_timer = IGMP_RANDOM_DELAY( IGMP_V1V2_MAX_RI * PR_FASTHZ); V_current_state_timers_running = 1; break; case IGMP_LEAVING_MEMBER: break; } } IF_ADDR_RUNLOCK(ifp); out_locked: IGMP_UNLOCK(); IN_MULTI_LIST_UNLOCK(); return (0); } /* * Process a received IGMPv2 general or group-specific query. */ static int igmp_input_v2_query(struct ifnet *ifp, const struct ip *ip, const struct igmp *igmp) { struct ifmultiaddr *ifma; struct igmp_ifsoftc *igi; struct in_multi *inm; int is_general_query; uint16_t timer; is_general_query = 0; /* * Validate address fields upfront. * XXX SMPng: unlocked increments in igmpstat assumed atomic. */ if (in_nullhost(igmp->igmp_group)) { /* * IGMPv2 General Query. * If this was not sent to the all-hosts group, ignore it. */ if (!in_allhosts(ip->ip_dst)) return (0); IGMPSTAT_INC(igps_rcv_gen_queries); is_general_query = 1; } else { /* IGMPv2 Group-Specific Query. */ IGMPSTAT_INC(igps_rcv_group_queries); } IN_MULTI_LIST_LOCK(); IGMP_LOCK(); igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp; KASSERT(igi != NULL, ("%s: no igmp_ifsoftc for ifp %p", __func__, ifp)); if (igi->igi_flags & IGIF_LOOPBACK) { CTR2(KTR_IGMPV3, "ignore v2 query on IGIF_LOOPBACK ifp %p(%s)", ifp, ifp->if_xname); goto out_locked; } /* * Ignore v2 query if in v1 Compatibility Mode. */ if (igi->igi_version == IGMP_VERSION_1) goto out_locked; igmp_set_version(igi, IGMP_VERSION_2); timer = igmp->igmp_code * PR_FASTHZ / IGMP_TIMER_SCALE; if (timer == 0) timer = 1; if (is_general_query) { /* * For each reporting group joined on this * interface, kick the report timer. */ CTR2(KTR_IGMPV3, "process v2 general query on ifp %p(%s)", ifp, ifp->if_xname); IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) continue; inm = (struct in_multi *)ifma->ifma_protospec; igmp_v2_update_group(inm, timer); } IF_ADDR_RUNLOCK(ifp); } else { /* * Group-specific IGMPv2 query, we need only * look up the single group to process it. 
*/ inm = inm_lookup(ifp, igmp->igmp_group); if (inm != NULL) { CTR3(KTR_IGMPV3, "process v2 query 0x%08x on ifp %p(%s)", ntohl(igmp->igmp_group.s_addr), ifp, ifp->if_xname); igmp_v2_update_group(inm, timer); } } out_locked: IGMP_UNLOCK(); IN_MULTI_LIST_UNLOCK(); return (0); } /* * Update the report timer on a group in response to an IGMPv2 query. * * If we are becoming the reporting member for this group, start the timer. * If we already are the reporting member for this group, and timer is * below the threshold, reset it. * * We may be updating the group for the first time since we switched * to IGMPv3. If we are, then we must clear any recorded source lists, * and transition to REPORTING state; the group timer is overloaded * for group and group-source query responses. * * Unlike IGMPv3, the delay per group should be jittered * to avoid bursts of IGMPv2 reports. */ static void igmp_v2_update_group(struct in_multi *inm, const int timer) { CTR4(KTR_IGMPV3, "0x%08x: %s/%s timer=%d", __func__, ntohl(inm->inm_addr.s_addr), inm->inm_ifp->if_xname, timer); IN_MULTI_LIST_LOCK_ASSERT(); switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: break; case IGMP_REPORTING_MEMBER: if (inm->inm_timer != 0 && inm->inm_timer <= timer) { CTR1(KTR_IGMPV3, "%s: REPORTING and timer running, " "skipping.", __func__); break; } /* FALLTHROUGH */ case IGMP_SG_QUERY_PENDING_MEMBER: case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_IDLE_MEMBER: case IGMP_LAZY_MEMBER: case IGMP_AWAKENING_MEMBER: CTR1(KTR_IGMPV3, "%s: ->REPORTING", __func__); inm->inm_state = IGMP_REPORTING_MEMBER; inm->inm_timer = IGMP_RANDOM_DELAY(timer); V_current_state_timers_running = 1; break; case IGMP_SLEEPING_MEMBER: CTR1(KTR_IGMPV3, "%s: ->AWAKENING", __func__); inm->inm_state = IGMP_AWAKENING_MEMBER; break; case IGMP_LEAVING_MEMBER: break; } } /* * Process a received IGMPv3 general, group-specific or * group-and-source-specific query. * Assumes m has already been pulled up to the full IGMP message length. * Return 0 if successful, otherwise an appropriate error code is returned. */ static int igmp_input_v3_query(struct ifnet *ifp, const struct ip *ip, /*const*/ struct igmpv3 *igmpv3) { struct igmp_ifsoftc *igi; struct in_multi *inm; int is_general_query; uint32_t maxresp, nsrc, qqi; uint16_t timer; uint8_t qrv; is_general_query = 0; CTR2(KTR_IGMPV3, "process v3 query on ifp %p(%s)", ifp, ifp->if_xname); maxresp = igmpv3->igmp_code; /* in 1/10ths of a second */ if (maxresp >= 128) { maxresp = IGMP_MANT(igmpv3->igmp_code) << (IGMP_EXP(igmpv3->igmp_code) + 3); } /* * Robustness must never be less than 2 for on-wire IGMPv3. * FUTURE: Check if ifp has IGIF_LOOPBACK set, as we will make * an exception for interfaces whose IGMPv3 state changes * are redirected to loopback (e.g. MANET). */ qrv = IGMP_QRV(igmpv3->igmp_misc); if (qrv < 2) { CTR3(KTR_IGMPV3, "%s: clamping qrv %d to %d", __func__, qrv, IGMP_RV_INIT); qrv = IGMP_RV_INIT; } qqi = igmpv3->igmp_qqi; if (qqi >= 128) { qqi = IGMP_MANT(igmpv3->igmp_qqi) << (IGMP_EXP(igmpv3->igmp_qqi) + 3); } timer = maxresp * PR_FASTHZ / IGMP_TIMER_SCALE; if (timer == 0) timer = 1; nsrc = ntohs(igmpv3->igmp_numsrc); /* * Validate address fields and versions upfront before * accepting v3 query. * XXX SMPng: Unlocked access to igmpstat counters here. */ if (in_nullhost(igmpv3->igmp_group)) { /* * IGMPv3 General Query. * * General Queries SHOULD be directed to 224.0.0.1. * A general query with a source list has undefined * behaviour; discard it. 
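 *
 * The Max Resp Code and QQIC fields decoded above use RFC 3376's
 * floating-point encoding (Section 4.1.1): on the wire, values of 128
 * and above have the form |1|exp(3 bits)|mant(4 bits)| and represent
 * (mant | 0x10) << (exp + 3) in the field's base unit (tenths of a
 * second for Max Resp Code, seconds for QQIC).  For example, under
 * that encoding 0x8A represents 208 tenths of a second, i.e. a
 * maximum response time of 20.8 seconds.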
*/ IGMPSTAT_INC(igps_rcv_gen_queries); if (!in_allhosts(ip->ip_dst) || nsrc > 0) { IGMPSTAT_INC(igps_rcv_badqueries); return (0); } is_general_query = 1; } else { /* Group or group-source specific query. */ if (nsrc == 0) IGMPSTAT_INC(igps_rcv_group_queries); else IGMPSTAT_INC(igps_rcv_gsr_queries); } IN_MULTI_LIST_LOCK(); IGMP_LOCK(); igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp; KASSERT(igi != NULL, ("%s: no igmp_ifsoftc for ifp %p", __func__, ifp)); if (igi->igi_flags & IGIF_LOOPBACK) { CTR2(KTR_IGMPV3, "ignore v3 query on IGIF_LOOPBACK ifp %p(%s)", ifp, ifp->if_xname); goto out_locked; } /* * Discard the v3 query if we're in Compatibility Mode. * The RFC is not obviously worded that hosts need to stay in * compatibility mode until the Old Version Querier Present * timer expires. */ if (igi->igi_version != IGMP_VERSION_3) { CTR3(KTR_IGMPV3, "ignore v3 query in v%d mode on ifp %p(%s)", igi->igi_version, ifp, ifp->if_xname); goto out_locked; } igmp_set_version(igi, IGMP_VERSION_3); igi->igi_rv = qrv; igi->igi_qi = qqi; igi->igi_qri = maxresp; CTR4(KTR_IGMPV3, "%s: qrv %d qi %d qri %d", __func__, qrv, qqi, maxresp); if (is_general_query) { /* * Schedule a current-state report on this ifp for * all groups, possibly containing source lists. * If there is a pending General Query response * scheduled earlier than the selected delay, do * not schedule any other reports. * Otherwise, reset the interface timer. */ CTR2(KTR_IGMPV3, "process v3 general query on ifp %p(%s)", ifp, ifp->if_xname); if (igi->igi_v3_timer == 0 || igi->igi_v3_timer >= timer) { igi->igi_v3_timer = IGMP_RANDOM_DELAY(timer); V_interface_timers_running = 1; } } else { /* * Group-source-specific queries are throttled on * a per-group basis to defeat denial-of-service attempts. * Queries for groups we are not a member of on this * link are simply ignored. */ inm = inm_lookup(ifp, igmpv3->igmp_group); if (inm == NULL) goto out_locked; if (nsrc > 0) { if (!ratecheck(&inm->inm_lastgsrtv, &V_igmp_gsrdelay)) { CTR1(KTR_IGMPV3, "%s: GS query throttled.", __func__); IGMPSTAT_INC(igps_drop_gsr_queries); goto out_locked; } } CTR3(KTR_IGMPV3, "process v3 0x%08x query on ifp %p(%s)", ntohl(igmpv3->igmp_group.s_addr), ifp, ifp->if_xname); /* * If there is a pending General Query response * scheduled sooner than the selected delay, no * further report need be scheduled. * Otherwise, prepare to respond to the * group-specific or group-and-source query. */ if (igi->igi_v3_timer == 0 || igi->igi_v3_timer >= timer) igmp_input_v3_group_query(inm, igi, timer, igmpv3); } out_locked: IGMP_UNLOCK(); IN_MULTI_LIST_UNLOCK(); return (0); } /* * Process a received IGMPv3 group-specific or group-and-source-specific * query. * Return <0 if any error occurred. Currently this is ignored. */ static int igmp_input_v3_group_query(struct in_multi *inm, struct igmp_ifsoftc *igi, int timer, /*const*/ struct igmpv3 *igmpv3) { int retval; uint16_t nsrc; IN_MULTI_LIST_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); retval = 0; switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: case IGMP_SLEEPING_MEMBER: case IGMP_LAZY_MEMBER: case IGMP_AWAKENING_MEMBER: case IGMP_IDLE_MEMBER: case IGMP_LEAVING_MEMBER: return (retval); break; case IGMP_REPORTING_MEMBER: case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_SG_QUERY_PENDING_MEMBER: break; } nsrc = ntohs(igmpv3->igmp_numsrc); /* * Deal with group-specific queries upfront. 
* If any group query is already pending, purge any recorded * source-list state if it exists, and schedule a query response * for this group-specific query. */ if (nsrc == 0) { if (inm->inm_state == IGMP_G_QUERY_PENDING_MEMBER || inm->inm_state == IGMP_SG_QUERY_PENDING_MEMBER) { inm_clear_recorded(inm); timer = min(inm->inm_timer, timer); } inm->inm_state = IGMP_G_QUERY_PENDING_MEMBER; inm->inm_timer = IGMP_RANDOM_DELAY(timer); V_current_state_timers_running = 1; return (retval); } /* * Deal with the case where a group-and-source-specific query has * been received but a group-specific query is already pending. */ if (inm->inm_state == IGMP_G_QUERY_PENDING_MEMBER) { timer = min(inm->inm_timer, timer); inm->inm_timer = IGMP_RANDOM_DELAY(timer); V_current_state_timers_running = 1; return (retval); } /* * Finally, deal with the case where a group-and-source-specific * query has been received, where a response to a previous g-s-r * query exists, or none exists. * In this case, we need to parse the source-list which the Querier * has provided us with and check if we have any source list filter * entries at T1 for these sources. If we do not, there is no need * schedule a report and the query may be dropped. * If we do, we must record them and schedule a current-state * report for those sources. * FIXME: Handling source lists larger than 1 mbuf requires that * we pass the mbuf chain pointer down to this function, and use * m_getptr() to walk the chain. */ if (inm->inm_nsrc > 0) { const struct in_addr *ap; int i, nrecorded; ap = (const struct in_addr *)(igmpv3 + 1); nrecorded = 0; for (i = 0; i < nsrc; i++, ap++) { retval = inm_record_source(inm, ap->s_addr); if (retval < 0) break; nrecorded += retval; } if (nrecorded > 0) { CTR1(KTR_IGMPV3, "%s: schedule response to SG query", __func__); inm->inm_state = IGMP_SG_QUERY_PENDING_MEMBER; inm->inm_timer = IGMP_RANDOM_DELAY(timer); V_current_state_timers_running = 1; } } return (retval); } /* * Process a received IGMPv1 host membership report. * * NOTE: 0.0.0.0 workaround breaks const correctness. */ static int igmp_input_v1_report(struct ifnet *ifp, /*const*/ struct ip *ip, /*const*/ struct igmp *igmp) { struct rm_priotracker in_ifa_tracker; struct in_ifaddr *ia; struct in_multi *inm; IGMPSTAT_INC(igps_rcv_reports); if (ifp->if_flags & IFF_LOOPBACK) return (0); if (!IN_MULTICAST(ntohl(igmp->igmp_group.s_addr)) || !in_hosteq(igmp->igmp_group, ip->ip_dst)) { IGMPSTAT_INC(igps_rcv_badreports); return (EINVAL); } /* * RFC 3376, Section 4.2.13, 9.2, 9.3: * Booting clients may use the source address 0.0.0.0. Some * IGMP daemons may not know how to use IP_RECVIF to determine * the interface upon which this message was received. * Replace 0.0.0.0 with the subnet address if told to do so. */ if (V_igmp_recvifkludge && in_nullhost(ip->ip_src)) { IFP_TO_IA(ifp, ia, &in_ifa_tracker); if (ia != NULL) { ip->ip_src.s_addr = htonl(ia->ia_subnet); ifa_free(&ia->ia_ifa); } } CTR3(KTR_IGMPV3, "process v1 report 0x%08x on ifp %p(%s)", ntohl(igmp->igmp_group.s_addr), ifp, ifp->if_xname); /* * IGMPv1 report suppression. * If we are a member of this group, and our membership should be * reported, stop our group timer and transition to the 'lazy' state. 
*/ IN_MULTI_LIST_LOCK(); inm = inm_lookup(ifp, igmp->igmp_group); if (inm != NULL) { struct igmp_ifsoftc *igi; igi = inm->inm_igi; if (igi == NULL) { KASSERT(igi != NULL, ("%s: no igi for ifp %p", __func__, ifp)); goto out_locked; } IGMPSTAT_INC(igps_rcv_ourreports); /* * If we are in IGMPv3 host mode, do not allow the * other host's IGMPv1 report to suppress our reports * unless explicitly configured to do so. */ if (igi->igi_version == IGMP_VERSION_3) { if (V_igmp_legacysupp) igmp_v3_suppress_group_record(inm); goto out_locked; } inm->inm_timer = 0; switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: break; case IGMP_IDLE_MEMBER: case IGMP_LAZY_MEMBER: case IGMP_AWAKENING_MEMBER: CTR3(KTR_IGMPV3, "report suppressed for 0x%08x on ifp %p(%s)", ntohl(igmp->igmp_group.s_addr), ifp, ifp->if_xname); case IGMP_SLEEPING_MEMBER: inm->inm_state = IGMP_SLEEPING_MEMBER; break; case IGMP_REPORTING_MEMBER: CTR3(KTR_IGMPV3, "report suppressed for 0x%08x on ifp %p(%s)", ntohl(igmp->igmp_group.s_addr), ifp, ifp->if_xname); if (igi->igi_version == IGMP_VERSION_1) inm->inm_state = IGMP_LAZY_MEMBER; else if (igi->igi_version == IGMP_VERSION_2) inm->inm_state = IGMP_SLEEPING_MEMBER; break; case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_SG_QUERY_PENDING_MEMBER: case IGMP_LEAVING_MEMBER: break; } } out_locked: IN_MULTI_LIST_UNLOCK(); return (0); } /* * Process a received IGMPv2 host membership report. * * NOTE: 0.0.0.0 workaround breaks const correctness. */ static int igmp_input_v2_report(struct ifnet *ifp, /*const*/ struct ip *ip, /*const*/ struct igmp *igmp) { struct rm_priotracker in_ifa_tracker; struct in_ifaddr *ia; struct in_multi *inm; /* * Make sure we don't hear our own membership report. Fast * leave requires knowing that we are the only member of a * group. */ IFP_TO_IA(ifp, ia, &in_ifa_tracker); if (ia != NULL && in_hosteq(ip->ip_src, IA_SIN(ia)->sin_addr)) { ifa_free(&ia->ia_ifa); return (0); } IGMPSTAT_INC(igps_rcv_reports); if (ifp->if_flags & IFF_LOOPBACK) { if (ia != NULL) ifa_free(&ia->ia_ifa); return (0); } if (!IN_MULTICAST(ntohl(igmp->igmp_group.s_addr)) || !in_hosteq(igmp->igmp_group, ip->ip_dst)) { if (ia != NULL) ifa_free(&ia->ia_ifa); IGMPSTAT_INC(igps_rcv_badreports); return (EINVAL); } /* * RFC 3376, Section 4.2.13, 9.2, 9.3: * Booting clients may use the source address 0.0.0.0. Some * IGMP daemons may not know how to use IP_RECVIF to determine * the interface upon which this message was received. * Replace 0.0.0.0 with the subnet address if told to do so. */ if (V_igmp_recvifkludge && in_nullhost(ip->ip_src)) { if (ia != NULL) ip->ip_src.s_addr = htonl(ia->ia_subnet); } if (ia != NULL) ifa_free(&ia->ia_ifa); CTR3(KTR_IGMPV3, "process v2 report 0x%08x on ifp %p(%s)", ntohl(igmp->igmp_group.s_addr), ifp, ifp->if_xname); /* * IGMPv2 report suppression. * If we are a member of this group, and our membership should be * reported, and our group timer is pending or about to be reset, * stop our group timer by transitioning to the 'lazy' state. */ IN_MULTI_LIST_LOCK(); inm = inm_lookup(ifp, igmp->igmp_group); if (inm != NULL) { struct igmp_ifsoftc *igi; igi = inm->inm_igi; KASSERT(igi != NULL, ("%s: no igi for ifp %p", __func__, ifp)); IGMPSTAT_INC(igps_rcv_ourreports); /* * If we are in IGMPv3 host mode, do not allow the * other host's IGMPv1 report to suppress our reports * unless explicitly configured to do so. 
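 *
 * The "explicitly configured" knob referred to here is the
 * net.inet.igmp.legacysupp sysctl (V_igmp_legacysupp), which is
 * checked immediately below before calling
 * igmp_v3_suppress_group_record().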
*/ if (igi->igi_version == IGMP_VERSION_3) { if (V_igmp_legacysupp) igmp_v3_suppress_group_record(inm); goto out_locked; } inm->inm_timer = 0; switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: case IGMP_SLEEPING_MEMBER: break; case IGMP_REPORTING_MEMBER: case IGMP_IDLE_MEMBER: case IGMP_AWAKENING_MEMBER: CTR3(KTR_IGMPV3, "report suppressed for 0x%08x on ifp %p(%s)", ntohl(igmp->igmp_group.s_addr), ifp, ifp->if_xname); case IGMP_LAZY_MEMBER: inm->inm_state = IGMP_LAZY_MEMBER; break; case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_SG_QUERY_PENDING_MEMBER: case IGMP_LEAVING_MEMBER: break; } } out_locked: IN_MULTI_LIST_UNLOCK(); return (0); } int igmp_input(struct mbuf **mp, int *offp, int proto) { int iphlen; struct ifnet *ifp; struct igmp *igmp; struct ip *ip; struct mbuf *m; int igmplen; int minlen; int queryver; CTR3(KTR_IGMPV3, "%s: called w/mbuf (%p,%d)", __func__, *mp, *offp); m = *mp; ifp = m->m_pkthdr.rcvif; *mp = NULL; IGMPSTAT_INC(igps_rcv_total); ip = mtod(m, struct ip *); iphlen = *offp; igmplen = ntohs(ip->ip_len) - iphlen; /* * Validate lengths. */ if (igmplen < IGMP_MINLEN) { IGMPSTAT_INC(igps_rcv_tooshort); m_freem(m); return (IPPROTO_DONE); } /* * Always pullup to the minimum size for v1/v2 or v3 * to amortize calls to m_pullup(). */ minlen = iphlen; if (igmplen >= IGMP_V3_QUERY_MINLEN) minlen += IGMP_V3_QUERY_MINLEN; else minlen += IGMP_MINLEN; if ((!M_WRITABLE(m) || m->m_len < minlen) && (m = m_pullup(m, minlen)) == NULL) { IGMPSTAT_INC(igps_rcv_tooshort); return (IPPROTO_DONE); } ip = mtod(m, struct ip *); /* * Validate checksum. */ m->m_data += iphlen; m->m_len -= iphlen; igmp = mtod(m, struct igmp *); if (in_cksum(m, igmplen)) { IGMPSTAT_INC(igps_rcv_badsum); m_freem(m); return (IPPROTO_DONE); } m->m_data -= iphlen; m->m_len += iphlen; /* * IGMP control traffic is link-scope, and must have a TTL of 1. * DVMRP traffic (e.g. mrinfo, mtrace) is an exception; * probe packets may come from beyond the LAN. */ if (igmp->igmp_type != IGMP_DVMRP && ip->ip_ttl != 1) { IGMPSTAT_INC(igps_rcv_badttl); m_freem(m); return (IPPROTO_DONE); } switch (igmp->igmp_type) { case IGMP_HOST_MEMBERSHIP_QUERY: if (igmplen == IGMP_MINLEN) { if (igmp->igmp_code == 0) queryver = IGMP_VERSION_1; else queryver = IGMP_VERSION_2; } else if (igmplen >= IGMP_V3_QUERY_MINLEN) { queryver = IGMP_VERSION_3; } else { IGMPSTAT_INC(igps_rcv_tooshort); m_freem(m); return (IPPROTO_DONE); } switch (queryver) { case IGMP_VERSION_1: IGMPSTAT_INC(igps_rcv_v1v2_queries); if (!V_igmp_v1enable) break; if (igmp_input_v1_query(ifp, ip, igmp) != 0) { m_freem(m); return (IPPROTO_DONE); } break; case IGMP_VERSION_2: IGMPSTAT_INC(igps_rcv_v1v2_queries); if (!V_igmp_v2enable) break; if (igmp_input_v2_query(ifp, ip, igmp) != 0) { m_freem(m); return (IPPROTO_DONE); } break; case IGMP_VERSION_3: { struct igmpv3 *igmpv3; uint16_t igmpv3len; uint16_t nsrc; IGMPSTAT_INC(igps_rcv_v3_queries); igmpv3 = (struct igmpv3 *)igmp; /* * Validate length based on source count. */ nsrc = ntohs(igmpv3->igmp_numsrc); if (nsrc * sizeof(in_addr_t) > UINT16_MAX - iphlen - IGMP_V3_QUERY_MINLEN) { IGMPSTAT_INC(igps_rcv_tooshort); return (IPPROTO_DONE); } /* * m_pullup() may modify m, so pullup in * this scope. 
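 *
 * As a worked example of the source-count check above: assuming the
 * usual 12-byte fixed IGMPv3 query header and a 24-byte IP header
 * (20 bytes plus the Router Alert option), at most
 * (65535 - 24 - 12) / 4 = 16374 sources can be accepted; anything
 * larger could not have fit in the 16-bit ip_len field, and the bound
 * also keeps the uint16_t igmpv3len computed below from overflowing.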
*/ igmpv3len = iphlen + IGMP_V3_QUERY_MINLEN + sizeof(struct in_addr) * nsrc; if ((!M_WRITABLE(m) || m->m_len < igmpv3len) && (m = m_pullup(m, igmpv3len)) == NULL) { IGMPSTAT_INC(igps_rcv_tooshort); return (IPPROTO_DONE); } igmpv3 = (struct igmpv3 *)(mtod(m, uint8_t *) + iphlen); if (igmp_input_v3_query(ifp, ip, igmpv3) != 0) { m_freem(m); return (IPPROTO_DONE); } } break; } break; case IGMP_v1_HOST_MEMBERSHIP_REPORT: if (!V_igmp_v1enable) break; if (igmp_input_v1_report(ifp, ip, igmp) != 0) { m_freem(m); return (IPPROTO_DONE); } break; case IGMP_v2_HOST_MEMBERSHIP_REPORT: if (!V_igmp_v2enable) break; if (!ip_checkrouteralert(m)) IGMPSTAT_INC(igps_rcv_nora); if (igmp_input_v2_report(ifp, ip, igmp) != 0) { m_freem(m); return (IPPROTO_DONE); } break; case IGMP_v3_HOST_MEMBERSHIP_REPORT: /* * Hosts do not need to process IGMPv3 membership reports, * as report suppression is no longer required. */ if (!ip_checkrouteralert(m)) IGMPSTAT_INC(igps_rcv_nora); break; default: break; } /* * Pass all valid IGMP packets up to any process(es) listening on a * raw IGMP socket. */ *mp = m; return (rip_input(mp, offp, proto)); } /* * Fast timeout handler (global). * VIMAGE: Timeout handlers are expected to service all vimages. */ void igmp_fasttimo(void) { VNET_ITERATOR_DECL(vnet_iter); VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); igmp_fasttimo_vnet(); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); } /* * Fast timeout handler (per-vnet). * Sends are shuffled off to a netisr to deal with Giant. * * VIMAGE: Assume caller has set up our curvnet. */ static void igmp_fasttimo_vnet(void) { struct mbufq scq; /* State-change packets */ struct mbufq qrq; /* Query response packets */ struct ifnet *ifp; struct igmp_ifsoftc *igi; - struct ifmultiaddr *ifma; + struct ifmultiaddr *ifma, *next; struct in_multi *inm; struct in_multi_head inm_free_tmp; int loop, uri_fasthz; loop = 0; uri_fasthz = 0; /* * Quick check to see if any work needs to be done, in order to * minimize the overhead of fasttimo processing. * SMPng: XXX Unlocked reads. */ if (!V_current_state_timers_running && !V_interface_timers_running && !V_state_change_timers_running) return; SLIST_INIT(&inm_free_tmp); IN_MULTI_LIST_LOCK(); IGMP_LOCK(); /* * IGMPv3 General Query response timer processing. */ if (V_interface_timers_running) { CTR1(KTR_IGMPV3, "%s: interface timers running", __func__); V_interface_timers_running = 0; LIST_FOREACH(igi, &V_igi_head, igi_link) { if (igi->igi_v3_timer == 0) { /* Do nothing. */ } else if (--igi->igi_v3_timer == 0) { igmp_v3_dispatch_general_query(igi); } else { V_interface_timers_running = 1; } } } if (!V_current_state_timers_running && !V_state_change_timers_running) goto out_locked; V_current_state_timers_running = 0; V_state_change_timers_running = 0; CTR1(KTR_IGMPV3, "%s: state change timers running", __func__); /* * IGMPv1/v2/v3 host report and state-change timer processing. * Note: Processing a v3 group timer may remove a node. */ LIST_FOREACH(igi, &V_igi_head, igi_link) { ifp = igi->igi_ifp; if (igi->igi_version == IGMP_VERSION_3) { loop = (igi->igi_flags & IGIF_LOOPBACK) ? 
1 : 0; uri_fasthz = IGMP_RANDOM_DELAY(igi->igi_uri * PR_FASTHZ); mbufq_init(&qrq, IGMP_MAX_G_GS_PACKETS); mbufq_init(&scq, IGMP_MAX_STATE_CHANGE_PACKETS); } - IF_ADDR_RLOCK(ifp); - TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { + IF_ADDR_WLOCK(ifp); + restart: + TAILQ_FOREACH_SAFE(ifma, &ifp->if_multiaddrs, ifma_link, next) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) continue; inm = (struct in_multi *)ifma->ifma_protospec; switch (igi->igi_version) { case IGMP_VERSION_1: case IGMP_VERSION_2: igmp_v1v2_process_group_timer(inm, igi->igi_version); break; case IGMP_VERSION_3: igmp_v3_process_group_timers(&inm_free_tmp, &qrq, &scq, inm, uri_fasthz); break; } + if (__predict_false(ifma_restart)) { + ifma_restart = false; + goto restart; + } } - IF_ADDR_RUNLOCK(ifp); + IF_ADDR_WUNLOCK(ifp); if (igi->igi_version == IGMP_VERSION_3) { igmp_dispatch_queue(&qrq, 0, loop); igmp_dispatch_queue(&scq, 0, loop); /* * Free the in_multi reference(s) for this * IGMP lifecycle. */ inm_release_list_deferred(&inm_free_tmp); } } out_locked: IGMP_UNLOCK(); IN_MULTI_LIST_UNLOCK(); } /* * Update host report group timer for IGMPv1/v2. * Will update the global pending timer flags. */ static void igmp_v1v2_process_group_timer(struct in_multi *inm, const int version) { int report_timer_expired; IN_MULTI_LIST_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); if (inm->inm_timer == 0) { report_timer_expired = 0; } else if (--inm->inm_timer == 0) { report_timer_expired = 1; } else { V_current_state_timers_running = 1; return; } switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: case IGMP_IDLE_MEMBER: case IGMP_LAZY_MEMBER: case IGMP_SLEEPING_MEMBER: case IGMP_AWAKENING_MEMBER: break; case IGMP_REPORTING_MEMBER: if (report_timer_expired) { inm->inm_state = IGMP_IDLE_MEMBER; (void)igmp_v1v2_queue_report(inm, (version == IGMP_VERSION_2) ? IGMP_v2_HOST_MEMBERSHIP_REPORT : IGMP_v1_HOST_MEMBERSHIP_REPORT); } break; case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_SG_QUERY_PENDING_MEMBER: case IGMP_LEAVING_MEMBER: break; } } /* * Update a group's timers for IGMPv3. * Will update the global pending timer flags. * Note: Unlocked read from igi. */ static void igmp_v3_process_group_timers(struct in_multi_head *inmh, struct mbufq *qrq, struct mbufq *scq, struct in_multi *inm, const int uri_fasthz) { int query_response_timer_expired; int state_change_retransmit_timer_expired; IN_MULTI_LIST_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); query_response_timer_expired = 0; state_change_retransmit_timer_expired = 0; /* * During a transition from v1/v2 compatibility mode back to v3, * a group record in REPORTING state may still have its group * timer active. This is a no-op in this function; it is easier * to deal with it here than to complicate the slow-timeout path. */ if (inm->inm_timer == 0) { query_response_timer_expired = 0; } else if (--inm->inm_timer == 0) { query_response_timer_expired = 1; } else { V_current_state_timers_running = 1; } if (inm->inm_sctimer == 0) { state_change_retransmit_timer_expired = 0; } else if (--inm->inm_sctimer == 0) { state_change_retransmit_timer_expired = 1; } else { V_state_change_timers_running = 1; } /* We are in fasttimo, so be quick about it. 
*/ if (!state_change_retransmit_timer_expired && !query_response_timer_expired) return; switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: case IGMP_SLEEPING_MEMBER: case IGMP_LAZY_MEMBER: case IGMP_AWAKENING_MEMBER: case IGMP_IDLE_MEMBER: break; case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_SG_QUERY_PENDING_MEMBER: /* * Respond to a previously pending Group-Specific * or Group-and-Source-Specific query by enqueueing * the appropriate Current-State report for * immediate transmission. */ if (query_response_timer_expired) { int retval; retval = igmp_v3_enqueue_group_record(qrq, inm, 0, 1, (inm->inm_state == IGMP_SG_QUERY_PENDING_MEMBER)); CTR2(KTR_IGMPV3, "%s: enqueue record = %d", __func__, retval); inm->inm_state = IGMP_REPORTING_MEMBER; /* XXX Clear recorded sources for next time. */ inm_clear_recorded(inm); } /* FALLTHROUGH */ case IGMP_REPORTING_MEMBER: case IGMP_LEAVING_MEMBER: if (state_change_retransmit_timer_expired) { /* * State-change retransmission timer fired. * If there are any further pending retransmissions, * set the global pending state-change flag, and * reset the timer. */ if (--inm->inm_scrv > 0) { inm->inm_sctimer = uri_fasthz; V_state_change_timers_running = 1; } /* * Retransmit the previously computed state-change * report. If there are no further pending * retransmissions, the mbuf queue will be consumed. * Update T0 state to T1 as we have now sent * a state-change. */ (void)igmp_v3_merge_state_changes(inm, scq); inm_commit(inm); CTR3(KTR_IGMPV3, "%s: T1 -> T0 for 0x%08x/%s", __func__, ntohl(inm->inm_addr.s_addr), inm->inm_ifp->if_xname); /* * If we are leaving the group for good, make sure * we release IGMP's reference to it. * This release must be deferred using a SLIST, * as we are called from a loop which traverses * the in_ifmultiaddr TAILQ. */ if (inm->inm_state == IGMP_LEAVING_MEMBER && inm->inm_scrv == 0) { inm->inm_state = IGMP_NOT_MEMBER; inm_rele_locked(inmh, inm); } } break; } } /* * Suppress a group's pending response to a group or source/group query. * * Do NOT suppress state changes. This leads to IGMPv3 inconsistency. * Do NOT update ST1/ST0 as this operation merely suppresses * the currently pending group record. * Do NOT suppress the response to a general query. It is possible but * it would require adding another state or flag. */ static void igmp_v3_suppress_group_record(struct in_multi *inm) { IN_MULTI_LIST_LOCK_ASSERT(); KASSERT(inm->inm_igi->igi_version == IGMP_VERSION_3, ("%s: not IGMPv3 mode on link", __func__)); if (inm->inm_state != IGMP_G_QUERY_PENDING_MEMBER || inm->inm_state != IGMP_SG_QUERY_PENDING_MEMBER) return; if (inm->inm_state == IGMP_SG_QUERY_PENDING_MEMBER) inm_clear_recorded(inm); inm->inm_timer = 0; inm->inm_state = IGMP_REPORTING_MEMBER; } /* * Switch to a different IGMP version on the given interface, * as per Section 7.2.1. */ static void igmp_set_version(struct igmp_ifsoftc *igi, const int version) { int old_version_timer; IGMP_LOCK_ASSERT(); CTR4(KTR_IGMPV3, "%s: switching to v%d on ifp %p(%s)", __func__, version, igi->igi_ifp, igi->igi_ifp->if_xname); if (version == IGMP_VERSION_1 || version == IGMP_VERSION_2) { /* * Compute the "Older Version Querier Present" timer as per * Section 8.12. 
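The sctimer/scrv bookkeeping above amounts to: when the state-change timer hits zero, transmit one copy of the report and, if the pending-retransmission count (seeded from the Robustness Variable) is not yet exhausted, rearm the timer with the per-interface URI. A small standalone sketch of that countdown, assuming RV = 2, a URI of 2 seconds, and 5 fast ticks per second:

    #include <stdio.h>

    #define FASTHZ  5                       /* fast ticks per second (assumed) */

    int
    main(void)
    {
            int scrv = 2;                   /* pending retransmissions */
            int sctimer = 1;                /* fire on the next fast tick */
            int uri_fasthz = 2 * FASTHZ;    /* assumed URI of 2 seconds */
            int tick;

            for (tick = 1; tick <= 40 && scrv > 0; tick++) {
                    if (sctimer == 0 || --sctimer != 0)
                            continue;       /* idle, or still counting down */
                    printf("tick %2d: transmit state-change report\n", tick);
                    if (--scrv > 0)
                            sctimer = uri_fasthz;   /* rearm for the next copy */
            }
            return (0);
    }

With those assumptions the first copy goes out on the first fast tick and the second follows roughly two seconds later, after which no further retransmissions are pending.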
*/ old_version_timer = igi->igi_rv * igi->igi_qi + igi->igi_qri; old_version_timer *= PR_SLOWHZ; if (version == IGMP_VERSION_1) { igi->igi_v1_timer = old_version_timer; igi->igi_v2_timer = 0; } else if (version == IGMP_VERSION_2) { igi->igi_v1_timer = 0; igi->igi_v2_timer = old_version_timer; } } if (igi->igi_v1_timer == 0 && igi->igi_v2_timer > 0) { if (igi->igi_version != IGMP_VERSION_2) { igi->igi_version = IGMP_VERSION_2; igmp_v3_cancel_link_timers(igi); } } else if (igi->igi_v1_timer > 0) { if (igi->igi_version != IGMP_VERSION_1) { igi->igi_version = IGMP_VERSION_1; igmp_v3_cancel_link_timers(igi); } } } /* * Cancel pending IGMPv3 timers for the given link and all groups * joined on it; state-change, general-query, and group-query timers. * * Only ever called on a transition from v3 to Compatibility mode. Kill * the timers stone dead (this may be expensive for large N groups), they * will be restarted if Compatibility Mode deems that they must be due to * query processing. */ static void igmp_v3_cancel_link_timers(struct igmp_ifsoftc *igi) { struct ifmultiaddr *ifma; struct ifnet *ifp; struct in_multi *inm; struct in_multi_head inm_free_tmp; CTR3(KTR_IGMPV3, "%s: cancel v3 timers on ifp %p(%s)", __func__, igi->igi_ifp, igi->igi_ifp->if_xname); IN_MULTI_LIST_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); SLIST_INIT(&inm_free_tmp); /* * Stop the v3 General Query Response on this link stone dead. * If fasttimo is woken up due to V_interface_timers_running, * the flag will be cleared if there are no pending link timers. */ igi->igi_v3_timer = 0; /* * Now clear the current-state and state-change report timers * for all memberships scoped to this link. */ ifp = igi->igi_ifp; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) continue; inm = (struct in_multi *)ifma->ifma_protospec; switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: case IGMP_IDLE_MEMBER: case IGMP_LAZY_MEMBER: case IGMP_SLEEPING_MEMBER: case IGMP_AWAKENING_MEMBER: /* * These states are either not relevant in v3 mode, * or are unreported. Do nothing. */ break; case IGMP_LEAVING_MEMBER: /* * If we are leaving the group and switching to * compatibility mode, we need to release the final * reference held for issuing the INCLUDE {}, and * transition to REPORTING to ensure the host leave * message is sent upstream to the old querier -- * transition to NOT would lose the leave and race. */ inm_rele_locked(&inm_free_tmp, inm); /* FALLTHROUGH */ case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_SG_QUERY_PENDING_MEMBER: inm_clear_recorded(inm); /* FALLTHROUGH */ case IGMP_REPORTING_MEMBER: inm->inm_state = IGMP_REPORTING_MEMBER; break; } /* * Always clear state-change and group report timers. * Free any pending IGMPv3 state-change records. */ inm->inm_sctimer = 0; inm->inm_timer = 0; mbufq_drain(&inm->inm_scq); } IF_ADDR_RUNLOCK(ifp); inm_release_list_deferred(&inm_free_tmp); } /* * Update the Older Version Querier Present timers for a link. * See Section 7.2.1 of RFC 3376. */ static void igmp_v1v2_process_querier_timers(struct igmp_ifsoftc *igi) { IGMP_LOCK_ASSERT(); if (igi->igi_v1_timer == 0 && igi->igi_v2_timer == 0) { /* * IGMPv1 and IGMPv2 Querier Present timers expired. * * Revert to IGMPv3. 
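With the RFC 3376 defaults (Robustness Variable 2, Query Interval 125 s, Query Response Interval 10 s) the formula above yields 260 seconds, or 520 slow-timer ticks assuming two slow ticks per second. A throwaway check of that arithmetic:

    #include <stdio.h>

    int
    main(void)
    {
            int rv = 2, qi = 125, qri = 10; /* RFC 3376 defaults, in seconds */
            int slowhz = 2;                 /* slow ticks per second (assumed) */
            int timer;

            timer = rv * qi + qri;          /* Older Version Querier Present */
            printf("%d seconds, %d slow ticks\n", timer, timer * slowhz);
            return (0);
    }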
*/ if (igi->igi_version != IGMP_VERSION_3) { CTR5(KTR_IGMPV3, "%s: transition from v%d -> v%d on %p(%s)", __func__, igi->igi_version, IGMP_VERSION_3, igi->igi_ifp, igi->igi_ifp->if_xname); igi->igi_version = IGMP_VERSION_3; } } else if (igi->igi_v1_timer == 0 && igi->igi_v2_timer > 0) { /* * IGMPv1 Querier Present timer expired, * IGMPv2 Querier Present timer running. * If IGMPv2 was disabled since last timeout, * revert to IGMPv3. * If IGMPv2 is enabled, revert to IGMPv2. */ if (!V_igmp_v2enable) { CTR5(KTR_IGMPV3, "%s: transition from v%d -> v%d on %p(%s)", __func__, igi->igi_version, IGMP_VERSION_3, igi->igi_ifp, igi->igi_ifp->if_xname); igi->igi_v2_timer = 0; igi->igi_version = IGMP_VERSION_3; } else { --igi->igi_v2_timer; if (igi->igi_version != IGMP_VERSION_2) { CTR5(KTR_IGMPV3, "%s: transition from v%d -> v%d on %p(%s)", __func__, igi->igi_version, IGMP_VERSION_2, igi->igi_ifp, igi->igi_ifp->if_xname); igi->igi_version = IGMP_VERSION_2; igmp_v3_cancel_link_timers(igi); } } } else if (igi->igi_v1_timer > 0) { /* * IGMPv1 Querier Present timer running. * Stop IGMPv2 timer if running. * * If IGMPv1 was disabled since last timeout, * revert to IGMPv3. * If IGMPv1 is enabled, reset IGMPv2 timer if running. */ if (!V_igmp_v1enable) { CTR5(KTR_IGMPV3, "%s: transition from v%d -> v%d on %p(%s)", __func__, igi->igi_version, IGMP_VERSION_3, igi->igi_ifp, igi->igi_ifp->if_xname); igi->igi_v1_timer = 0; igi->igi_version = IGMP_VERSION_3; } else { --igi->igi_v1_timer; } if (igi->igi_v2_timer > 0) { CTR3(KTR_IGMPV3, "%s: cancel v2 timer on %p(%s)", __func__, igi->igi_ifp, igi->igi_ifp->if_xname); igi->igi_v2_timer = 0; } } } /* * Global slowtimo handler. * VIMAGE: Timeout handlers are expected to service all vimages. */ void igmp_slowtimo(void) { VNET_ITERATOR_DECL(vnet_iter); VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); igmp_slowtimo_vnet(); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); } /* * Per-vnet slowtimo handler. */ static void igmp_slowtimo_vnet(void) { struct igmp_ifsoftc *igi; IGMP_LOCK(); LIST_FOREACH(igi, &V_igi_head, igi_link) { igmp_v1v2_process_querier_timers(igi); } IGMP_UNLOCK(); } /* * Dispatch an IGMPv1/v2 host report or leave message. * These are always small enough to fit inside a single mbuf. 
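An IGMPv1/v2 report or leave message is a fixed 8 bytes: type, code, checksum and group address, which is why a single small mbuf always suffices. A self-contained sketch that builds such a message and fills in the RFC 1071 checksum, analogous to what the code below does with in_cksum() (the struct and helper names here are local to the sketch):

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>
    #include <arpa/inet.h>

    struct igmp_msg {                       /* 8 bytes, as on the wire */
            uint8_t  type;
            uint8_t  code;
            uint16_t cksum;
            uint32_t group;                 /* network byte order */
    };

    /* One's-complement Internet checksum over an even number of bytes. */
    static uint16_t
    cksum16(const void *buf, size_t len)
    {
            const uint16_t *p = buf;
            uint32_t sum = 0;

            for (; len > 1; len -= 2)
                    sum += *p++;
            while (sum >> 16)
                    sum = (sum & 0xffff) + (sum >> 16);
            return ((uint16_t)~sum);
    }

    int
    main(void)
    {
            struct igmp_msg m;

            memset(&m, 0, sizeof(m));
            m.type = 0x16;                          /* v2 membership report */
            m.group = inet_addr("239.1.2.3");
            m.cksum = cksum16(&m, sizeof(m));       /* computed with cksum 0 */
            printf("checksum field: 0x%04x\n", m.cksum);
            return (0);
    }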
*/ static int igmp_v1v2_queue_report(struct in_multi *inm, const int type) { struct ifnet *ifp; struct igmp *igmp; struct ip *ip; struct mbuf *m; IN_MULTI_LIST_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); ifp = inm->inm_ifp; m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) return (ENOMEM); M_ALIGN(m, sizeof(struct ip) + sizeof(struct igmp)); m->m_pkthdr.len = sizeof(struct ip) + sizeof(struct igmp); m->m_data += sizeof(struct ip); m->m_len = sizeof(struct igmp); igmp = mtod(m, struct igmp *); igmp->igmp_type = type; igmp->igmp_code = 0; igmp->igmp_group = inm->inm_addr; igmp->igmp_cksum = 0; igmp->igmp_cksum = in_cksum(m, sizeof(struct igmp)); m->m_data -= sizeof(struct ip); m->m_len += sizeof(struct ip); ip = mtod(m, struct ip *); ip->ip_tos = 0; ip->ip_len = htons(sizeof(struct ip) + sizeof(struct igmp)); ip->ip_off = 0; ip->ip_p = IPPROTO_IGMP; ip->ip_src.s_addr = INADDR_ANY; if (type == IGMP_HOST_LEAVE_MESSAGE) ip->ip_dst.s_addr = htonl(INADDR_ALLRTRS_GROUP); else ip->ip_dst = inm->inm_addr; igmp_save_context(m, ifp); m->m_flags |= M_IGMPV2; if (inm->inm_igi->igi_flags & IGIF_LOOPBACK) m->m_flags |= M_IGMP_LOOP; CTR2(KTR_IGMPV3, "%s: netisr_dispatch(NETISR_IGMP, %p)", __func__, m); netisr_dispatch(NETISR_IGMP, m); return (0); } /* * Process a state change from the upper layer for the given IPv4 group. * * Each socket holds a reference on the in_multi in its own ip_moptions. * The socket layer will have made the necessary updates to.the group * state, it is now up to IGMP to issue a state change report if there * has been any change between T0 (when the last state-change was issued) * and T1 (now). * * We use the IGMPv3 state machine at group level. The IGMP module * however makes the decision as to which IGMP protocol version to speak. * A state change *from* INCLUDE {} always means an initial join. * A state change *to* INCLUDE {} always means a final leave. * * FUTURE: If IGIF_V3LITE is enabled for this interface, then we can * save ourselves a bunch of work; any exclusive mode groups need not * compute source filter lists. * * VIMAGE: curvnet should have been set by caller, as this routine * is called from the socket option handlers. */ int igmp_change_state(struct in_multi *inm) { struct igmp_ifsoftc *igi; struct ifnet *ifp; int error; error = 0; IN_MULTI_LOCK_ASSERT(); /* * Try to detect if the upper layer just asked us to change state * for an interface which has now gone away. */ KASSERT(inm->inm_ifma != NULL, ("%s: no ifma", __func__)); ifp = inm->inm_ifma->ifma_ifp; /* * Sanity check that netinet's notion of ifp is the * same as net's. */ KASSERT(inm->inm_ifp == ifp, ("%s: bad ifp", __func__)); IGMP_LOCK(); igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp; KASSERT(igi != NULL, ("%s: no igmp_ifsoftc for ifp %p", __func__, ifp)); /* * If we detect a state transition to or from MCAST_UNDEFINED * for this group, then we are starting or finishing an IGMP * life cycle for this group. 
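The decision that follows can be read as a classification of the T0 -> T1 filter-mode pair: a move out of MCAST_UNDEFINED is an initial join, a move into it is a final leave, and anything else is an ordinary mode or filter-set change. A compact sketch of that classification (the enum here merely mirrors the kernel constants rather than reusing them):

    #include <stdio.h>

    enum fmode { UNDEF, INCLUDE, EXCLUDE }; /* stand-ins for MCAST_* */

    static const char *
    classify(enum fmode t0, enum fmode t1)
    {
            if (t0 == UNDEF && t1 != UNDEF)
                    return ("initial join");
            if (t0 != UNDEF && t1 == UNDEF)
                    return ("final leave");
            if (t0 != t1)
                    return ("mode change");
            return ("filter set change");
    }

    int
    main(void)
    {
            printf("%s\n", classify(UNDEF, EXCLUDE));       /* initial join */
            printf("%s\n", classify(EXCLUDE, UNDEF));       /* final leave */
            printf("%s\n", classify(INCLUDE, INCLUDE));     /* filter set change */
            return (0);
    }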
*/ if (inm->inm_st[1].iss_fmode != inm->inm_st[0].iss_fmode) { CTR3(KTR_IGMPV3, "%s: inm transition %d -> %d", __func__, inm->inm_st[0].iss_fmode, inm->inm_st[1].iss_fmode); if (inm->inm_st[0].iss_fmode == MCAST_UNDEFINED) { CTR1(KTR_IGMPV3, "%s: initial join", __func__); error = igmp_initial_join(inm, igi); goto out_locked; } else if (inm->inm_st[1].iss_fmode == MCAST_UNDEFINED) { CTR1(KTR_IGMPV3, "%s: final leave", __func__); igmp_final_leave(inm, igi); goto out_locked; } } else { CTR1(KTR_IGMPV3, "%s: filter set change", __func__); } error = igmp_handle_state_change(inm, igi); out_locked: IGMP_UNLOCK(); return (error); } /* * Perform the initial join for an IGMP group. * * When joining a group: * If the group should have its IGMP traffic suppressed, do nothing. * IGMPv1 starts sending IGMPv1 host membership reports. * IGMPv2 starts sending IGMPv2 host membership reports. * IGMPv3 will schedule an IGMPv3 state-change report containing the * initial state of the membership. */ static int igmp_initial_join(struct in_multi *inm, struct igmp_ifsoftc *igi) { struct ifnet *ifp; struct mbufq *mq; int error, retval, syncstates; CTR4(KTR_IGMPV3, "%s: initial join 0x%08x on ifp %p(%s)", __func__, ntohl(inm->inm_addr.s_addr), inm->inm_ifp, inm->inm_ifp->if_xname); error = 0; syncstates = 1; ifp = inm->inm_ifp; IN_MULTI_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); KASSERT(igi && igi->igi_ifp == ifp, ("%s: inconsistent ifp", __func__)); /* * Groups joined on loopback or marked as 'not reported', * e.g. 224.0.0.1, enter the IGMP_SILENT_MEMBER state and * are never reported in any IGMP protocol exchanges. * All other groups enter the appropriate IGMP state machine * for the version in use on this link. * A link marked as IGIF_SILENT causes IGMP to be completely * disabled for the link. */ if ((ifp->if_flags & IFF_LOOPBACK) || (igi->igi_flags & IGIF_SILENT) || !igmp_isgroupreported(inm->inm_addr)) { CTR1(KTR_IGMPV3, "%s: not kicking state machine for silent group", __func__); inm->inm_state = IGMP_SILENT_MEMBER; inm->inm_timer = 0; } else { /* * Deal with overlapping in_multi lifecycle. * If this group was LEAVING, then make sure * we drop the reference we picked up to keep the * group around for the final INCLUDE {} enqueue. */ if (igi->igi_version == IGMP_VERSION_3 && inm->inm_state == IGMP_LEAVING_MEMBER) { MPASS(inm->inm_refcount > 1); inm_rele_locked(NULL, inm); } inm->inm_state = IGMP_REPORTING_MEMBER; switch (igi->igi_version) { case IGMP_VERSION_1: case IGMP_VERSION_2: inm->inm_state = IGMP_IDLE_MEMBER; error = igmp_v1v2_queue_report(inm, (igi->igi_version == IGMP_VERSION_2) ? IGMP_v2_HOST_MEMBERSHIP_REPORT : IGMP_v1_HOST_MEMBERSHIP_REPORT); if (error == 0) { inm->inm_timer = IGMP_RANDOM_DELAY( IGMP_V1V2_MAX_RI * PR_FASTHZ); V_current_state_timers_running = 1; } break; case IGMP_VERSION_3: /* * Defer update of T0 to T1, until the first copy * of the state change has been transmitted. */ syncstates = 0; /* * Immediately enqueue a State-Change Report for * this interface, freeing any previous reports. * Don't kick the timers if there is nothing to do, * or if an error occurred. */ mq = &inm->inm_scq; mbufq_drain(mq); retval = igmp_v3_enqueue_group_record(mq, inm, 1, 0, 0); CTR2(KTR_IGMPV3, "%s: enqueue record = %d", __func__, retval); if (retval <= 0) { error = retval * -1; break; } /* * Schedule transmission of pending state-change * report up to RV times for this link. The timer * will fire at the next igmp_fasttimo (~200ms), * giving us an opportunity to merge the reports. 
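For the v1/v2 path above, the host answers with a report after a random delay of at most the maximum report interval, expressed in fast ticks; the randomisation spreads reports from multiple hosts so that the first one heard can suppress the rest. A rough userland illustration of picking such a delay (the kernel uses its own RNG, and the 10-second maximum and 5 ticks per second are assumptions here):

    #include <stdio.h>
    #include <stdlib.h>
    #include <time.h>

    #define FASTHZ  5       /* fast ticks per second (assumed) */
    #define MAX_RI  10      /* max report interval, seconds (assumed) */

    int
    main(void)
    {
            int i, delay;

            srand((unsigned)time(NULL));
            for (i = 0; i < 3; i++) {
                    /* uniform in [1, MAX_RI * FASTHZ] fast ticks */
                    delay = rand() % (MAX_RI * FASTHZ) + 1;
                    printf("report delayed %d ticks (%.1f s)\n",
                        delay, (double)delay / FASTHZ);
            }
            return (0);
    }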
*/ if (igi->igi_flags & IGIF_LOOPBACK) { inm->inm_scrv = 1; } else { KASSERT(igi->igi_rv > 1, ("%s: invalid robustness %d", __func__, igi->igi_rv)); inm->inm_scrv = igi->igi_rv; } inm->inm_sctimer = 1; V_state_change_timers_running = 1; error = 0; break; } } /* * Only update the T0 state if state change is atomic, * i.e. we don't need to wait for a timer to fire before we * can consider the state change to have been communicated. */ if (syncstates) { inm_commit(inm); CTR3(KTR_IGMPV3, "%s: T1 -> T0 for 0x%08x/%s", __func__, ntohl(inm->inm_addr.s_addr), inm->inm_ifp->if_xname); } return (error); } /* * Issue an intermediate state change during the IGMP life-cycle. */ static int igmp_handle_state_change(struct in_multi *inm, struct igmp_ifsoftc *igi) { struct ifnet *ifp; int retval; CTR4(KTR_IGMPV3, "%s: state change for 0x%08x on ifp %p(%s)", __func__, ntohl(inm->inm_addr.s_addr), inm->inm_ifp, inm->inm_ifp->if_xname); ifp = inm->inm_ifp; IN_MULTI_LIST_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); KASSERT(igi && igi->igi_ifp == ifp, ("%s: inconsistent ifp", __func__)); if ((ifp->if_flags & IFF_LOOPBACK) || (igi->igi_flags & IGIF_SILENT) || !igmp_isgroupreported(inm->inm_addr) || (igi->igi_version != IGMP_VERSION_3)) { if (!igmp_isgroupreported(inm->inm_addr)) { CTR1(KTR_IGMPV3, "%s: not kicking state machine for silent group", __func__); } CTR1(KTR_IGMPV3, "%s: nothing to do", __func__); inm_commit(inm); CTR3(KTR_IGMPV3, "%s: T1 -> T0 for 0x%08x/%s", __func__, ntohl(inm->inm_addr.s_addr), inm->inm_ifp->if_xname); return (0); } mbufq_drain(&inm->inm_scq); retval = igmp_v3_enqueue_group_record(&inm->inm_scq, inm, 1, 0, 0); CTR2(KTR_IGMPV3, "%s: enqueue record = %d", __func__, retval); if (retval <= 0) return (-retval); /* * If record(s) were enqueued, start the state-change * report timer for this group. */ inm->inm_scrv = ((igi->igi_flags & IGIF_LOOPBACK) ? 1 : igi->igi_rv); inm->inm_sctimer = 1; V_state_change_timers_running = 1; return (0); } /* * Perform the final leave for an IGMP group. * * When leaving a group: * IGMPv1 does nothing. * IGMPv2 sends a host leave message, if and only if we are the reporter. * IGMPv3 enqueues a state-change report containing a transition * to INCLUDE {} for immediate transmission. */ static void igmp_final_leave(struct in_multi *inm, struct igmp_ifsoftc *igi) { int syncstates; syncstates = 1; CTR4(KTR_IGMPV3, "%s: final leave 0x%08x on ifp %p(%s)", __func__, ntohl(inm->inm_addr.s_addr), inm->inm_ifp, inm->inm_ifp->if_xname); IN_MULTI_LIST_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: case IGMP_LEAVING_MEMBER: /* Already leaving or left; do nothing. */ CTR1(KTR_IGMPV3, "%s: not kicking state machine for silent group", __func__); break; case IGMP_REPORTING_MEMBER: case IGMP_IDLE_MEMBER: case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_SG_QUERY_PENDING_MEMBER: if (igi->igi_version == IGMP_VERSION_2) { #ifdef INVARIANTS if (inm->inm_state == IGMP_G_QUERY_PENDING_MEMBER || inm->inm_state == IGMP_SG_QUERY_PENDING_MEMBER) panic("%s: IGMPv3 state reached, not IGMPv3 mode", __func__); #endif igmp_v1v2_queue_report(inm, IGMP_HOST_LEAVE_MESSAGE); inm->inm_state = IGMP_NOT_MEMBER; } else if (igi->igi_version == IGMP_VERSION_3) { /* * Stop group timer and all pending reports. * Immediately enqueue a state-change report * TO_IN {} to be sent on the next fast timeout, * giving us an opportunity to merge reports. 
*/ mbufq_drain(&inm->inm_scq); inm->inm_timer = 0; if (igi->igi_flags & IGIF_LOOPBACK) { inm->inm_scrv = 1; } else { inm->inm_scrv = igi->igi_rv; } CTR4(KTR_IGMPV3, "%s: Leaving 0x%08x/%s with %d " "pending retransmissions.", __func__, ntohl(inm->inm_addr.s_addr), inm->inm_ifp->if_xname, inm->inm_scrv); if (inm->inm_scrv == 0) { inm->inm_state = IGMP_NOT_MEMBER; inm->inm_sctimer = 0; } else { int retval; inm_acquire_locked(inm); retval = igmp_v3_enqueue_group_record( &inm->inm_scq, inm, 1, 0, 0); KASSERT(retval != 0, ("%s: enqueue record = %d", __func__, retval)); inm->inm_state = IGMP_LEAVING_MEMBER; inm->inm_sctimer = 1; V_state_change_timers_running = 1; syncstates = 0; } break; } break; case IGMP_LAZY_MEMBER: case IGMP_SLEEPING_MEMBER: case IGMP_AWAKENING_MEMBER: /* Our reports are suppressed; do nothing. */ break; } if (syncstates) { inm_commit(inm); CTR3(KTR_IGMPV3, "%s: T1 -> T0 for 0x%08x/%s", __func__, ntohl(inm->inm_addr.s_addr), inm->inm_ifp->if_xname); inm->inm_st[1].iss_fmode = MCAST_UNDEFINED; CTR3(KTR_IGMPV3, "%s: T1 now MCAST_UNDEFINED for 0x%08x/%s", __func__, ntohl(inm->inm_addr.s_addr), inm->inm_ifp->if_xname); } } /* * Enqueue an IGMPv3 group record to the given output queue. * * XXX This function could do with having the allocation code * split out, and the multiple-tree-walks coalesced into a single * routine as has been done in igmp_v3_enqueue_filter_change(). * * If is_state_change is zero, a current-state record is appended. * If is_state_change is non-zero, a state-change report is appended. * * If is_group_query is non-zero, an mbuf packet chain is allocated. * If is_group_query is zero, and if there is a packet with free space * at the tail of the queue, it will be appended to providing there * is enough free space. * Otherwise a new mbuf packet chain is allocated. * * If is_source_query is non-zero, each source is checked to see if * it was recorded for a Group-Source query, and will be omitted if * it is not both in-mode and recorded. * * The function will attempt to allocate leading space in the packet * for the IP/IGMP header to be prepended without fragmenting the chain. * * If successful the size of all data appended to the queue is returned, * otherwise an error code less than zero is returned, or zero if * no record(s) were appended. */ static int igmp_v3_enqueue_group_record(struct mbufq *mq, struct in_multi *inm, const int is_state_change, const int is_group_query, const int is_source_query) { struct igmp_grouprec ig; struct igmp_grouprec *pig; struct ifnet *ifp; struct ip_msource *ims, *nims; struct mbuf *m0, *m, *md; int error, is_filter_list_change; int minrec0len, m0srcs, msrcs, nbytes, off; int record_has_sources; int now; int type; in_addr_t naddr; uint8_t mode; IN_MULTI_LIST_LOCK_ASSERT(); error = 0; ifp = inm->inm_ifp; is_filter_list_change = 0; m = NULL; m0 = NULL; m0srcs = 0; msrcs = 0; nbytes = 0; nims = NULL; record_has_sources = 1; pig = NULL; type = IGMP_DO_NOTHING; mode = inm->inm_st[1].iss_fmode; /* * If we did not transition out of ASM mode during t0->t1, * and there are no source nodes to process, we can skip * the generation of source records. */ if (inm->inm_st[0].iss_asm > 0 && inm->inm_st[1].iss_asm > 0 && inm->inm_nsrc == 0) record_has_sources = 0; if (is_state_change) { /* * Queue a state change record. * If the mode did not change, and there are non-ASM * listeners or source filters present, * we potentially need to issue two records for the group. * If we are transitioning to MCAST_UNDEFINED, we need * not send any sources. 
* If there are ASM listeners, and there was no filter * mode transition of any kind, do nothing. */ if (mode != inm->inm_st[0].iss_fmode) { if (mode == MCAST_EXCLUDE) { CTR1(KTR_IGMPV3, "%s: change to EXCLUDE", __func__); type = IGMP_CHANGE_TO_EXCLUDE_MODE; } else { CTR1(KTR_IGMPV3, "%s: change to INCLUDE", __func__); type = IGMP_CHANGE_TO_INCLUDE_MODE; if (mode == MCAST_UNDEFINED) record_has_sources = 0; } } else { if (record_has_sources) { is_filter_list_change = 1; } else { type = IGMP_DO_NOTHING; } } } else { /* * Queue a current state record. */ if (mode == MCAST_EXCLUDE) { type = IGMP_MODE_IS_EXCLUDE; } else if (mode == MCAST_INCLUDE) { type = IGMP_MODE_IS_INCLUDE; KASSERT(inm->inm_st[1].iss_asm == 0, ("%s: inm %p is INCLUDE but ASM count is %d", __func__, inm, inm->inm_st[1].iss_asm)); } } /* * Generate the filter list changes using a separate function. */ if (is_filter_list_change) return (igmp_v3_enqueue_filter_change(mq, inm)); if (type == IGMP_DO_NOTHING) { CTR3(KTR_IGMPV3, "%s: nothing to do for 0x%08x/%s", __func__, ntohl(inm->inm_addr.s_addr), inm->inm_ifp->if_xname); return (0); } /* * If any sources are present, we must be able to fit at least * one in the trailing space of the tail packet's mbuf, * ideally more. */ minrec0len = sizeof(struct igmp_grouprec); if (record_has_sources) minrec0len += sizeof(in_addr_t); CTR4(KTR_IGMPV3, "%s: queueing %s for 0x%08x/%s", __func__, igmp_rec_type_to_str(type), ntohl(inm->inm_addr.s_addr), inm->inm_ifp->if_xname); /* * Check if we have a packet in the tail of the queue for this * group into which the first group record for this group will fit. * Otherwise allocate a new packet. * Always allocate leading space for IP+RA_OPT+IGMP+REPORT. * Note: Group records for G/GSR query responses MUST be sent * in their own packet. */ m0 = mbufq_last(mq); if (!is_group_query && m0 != NULL && (m0->m_pkthdr.PH_vt.vt_nrecs + 1 <= IGMP_V3_REPORT_MAXRECS) && (m0->m_pkthdr.len + minrec0len) < (ifp->if_mtu - IGMP_LEADINGSPACE)) { m0srcs = (ifp->if_mtu - m0->m_pkthdr.len - sizeof(struct igmp_grouprec)) / sizeof(in_addr_t); m = m0; CTR1(KTR_IGMPV3, "%s: use existing packet", __func__); } else { if (mbufq_full(mq)) { CTR1(KTR_IGMPV3, "%s: outbound queue full", __func__); return (-ENOMEM); } m = NULL; m0srcs = (ifp->if_mtu - IGMP_LEADINGSPACE - sizeof(struct igmp_grouprec)) / sizeof(in_addr_t); if (!is_state_change && !is_group_query) { m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); if (m) m->m_data += IGMP_LEADINGSPACE; } if (m == NULL) { m = m_gethdr(M_NOWAIT, MT_DATA); if (m) M_ALIGN(m, IGMP_LEADINGSPACE); } if (m == NULL) return (-ENOMEM); igmp_save_context(m, ifp); CTR1(KTR_IGMPV3, "%s: allocated first packet", __func__); } /* * Append group record. * If we have sources, we don't know how many yet. */ ig.ig_type = type; ig.ig_datalen = 0; ig.ig_numsrc = 0; ig.ig_group = inm->inm_addr; if (!m_append(m, sizeof(struct igmp_grouprec), (void *)&ig)) { if (m != m0) m_freem(m); CTR1(KTR_IGMPV3, "%s: m_append() failed.", __func__); return (-ENOMEM); } nbytes += sizeof(struct igmp_grouprec); /* * Append as many sources as will fit in the first packet. * If we are appending to a new packet, the chain allocation * may potentially use clusters; use m_getptr() in this case. * If we are appending to an existing packet, we need to obtain * a pointer to the group record after m_append(), in case a new * mbuf was allocated. * Only append sources which are in-mode at t1. If we are * transitioning to MCAST_UNDEFINED state on the group, do not * include source entries. 
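The m0srcs computations above divide the space left in a packet by the 4 bytes an IPv4 source address occupies. As a worked example, assuming a 1500-byte MTU, 32 bytes of leading space (20-byte IP header, 4-byte Router Alert option, 8-byte report header) and an 8-byte group record header, a fresh packet has room for 365 sources in its first group record:

    #include <stdio.h>

    int
    main(void)
    {
            int mtu = 1500;                 /* assumed interface MTU */
            int leadingspace = 20 + 4 + 8;  /* IP + RA option + report hdr */
            int grouprec = 8;               /* group record header */
            int srcsize = 4;                /* one IPv4 source address */

            printf("sources in first record: %d\n",
                (mtu - leadingspace - grouprec) / srcsize);
            return (0);
    }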
* Only report recorded sources in our filter set when responding * to a group-source query. */ if (record_has_sources) { if (m == m0) { md = m_last(m); pig = (struct igmp_grouprec *)(mtod(md, uint8_t *) + md->m_len - nbytes); } else { md = m_getptr(m, 0, &off); pig = (struct igmp_grouprec *)(mtod(md, uint8_t *) + off); } msrcs = 0; RB_FOREACH_SAFE(ims, ip_msource_tree, &inm->inm_srcs, nims) { CTR2(KTR_IGMPV3, "%s: visit node 0x%08x", __func__, ims->ims_haddr); now = ims_get_mode(inm, ims, 1); CTR2(KTR_IGMPV3, "%s: node is %d", __func__, now); if ((now != mode) || (now == mode && mode == MCAST_UNDEFINED)) { CTR1(KTR_IGMPV3, "%s: skip node", __func__); continue; } if (is_source_query && ims->ims_stp == 0) { CTR1(KTR_IGMPV3, "%s: skip unrecorded node", __func__); continue; } CTR1(KTR_IGMPV3, "%s: append node", __func__); naddr = htonl(ims->ims_haddr); if (!m_append(m, sizeof(in_addr_t), (void *)&naddr)) { if (m != m0) m_freem(m); CTR1(KTR_IGMPV3, "%s: m_append() failed.", __func__); return (-ENOMEM); } nbytes += sizeof(in_addr_t); ++msrcs; if (msrcs == m0srcs) break; } CTR2(KTR_IGMPV3, "%s: msrcs is %d this packet", __func__, msrcs); pig->ig_numsrc = htons(msrcs); nbytes += (msrcs * sizeof(in_addr_t)); } if (is_source_query && msrcs == 0) { CTR1(KTR_IGMPV3, "%s: no recorded sources to report", __func__); if (m != m0) m_freem(m); return (0); } /* * We are good to go with first packet. */ if (m != m0) { CTR1(KTR_IGMPV3, "%s: enqueueing first packet", __func__); m->m_pkthdr.PH_vt.vt_nrecs = 1; mbufq_enqueue(mq, m); } else m->m_pkthdr.PH_vt.vt_nrecs++; /* * No further work needed if no source list in packet(s). */ if (!record_has_sources) return (nbytes); /* * Whilst sources remain to be announced, we need to allocate * a new packet and fill out as many sources as will fit. * Always try for a cluster first. */ while (nims != NULL) { if (mbufq_full(mq)) { CTR1(KTR_IGMPV3, "%s: outbound queue full", __func__); return (-ENOMEM); } m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); if (m) m->m_data += IGMP_LEADINGSPACE; if (m == NULL) { m = m_gethdr(M_NOWAIT, MT_DATA); if (m) M_ALIGN(m, IGMP_LEADINGSPACE); } if (m == NULL) return (-ENOMEM); igmp_save_context(m, ifp); md = m_getptr(m, 0, &off); pig = (struct igmp_grouprec *)(mtod(md, uint8_t *) + off); CTR1(KTR_IGMPV3, "%s: allocated next packet", __func__); if (!m_append(m, sizeof(struct igmp_grouprec), (void *)&ig)) { if (m != m0) m_freem(m); CTR1(KTR_IGMPV3, "%s: m_append() failed.", __func__); return (-ENOMEM); } m->m_pkthdr.PH_vt.vt_nrecs = 1; nbytes += sizeof(struct igmp_grouprec); m0srcs = (ifp->if_mtu - IGMP_LEADINGSPACE - sizeof(struct igmp_grouprec)) / sizeof(in_addr_t); msrcs = 0; RB_FOREACH_FROM(ims, ip_msource_tree, nims) { CTR2(KTR_IGMPV3, "%s: visit node 0x%08x", __func__, ims->ims_haddr); now = ims_get_mode(inm, ims, 1); if ((now != mode) || (now == mode && mode == MCAST_UNDEFINED)) { CTR1(KTR_IGMPV3, "%s: skip node", __func__); continue; } if (is_source_query && ims->ims_stp == 0) { CTR1(KTR_IGMPV3, "%s: skip unrecorded node", __func__); continue; } CTR1(KTR_IGMPV3, "%s: append node", __func__); naddr = htonl(ims->ims_haddr); if (!m_append(m, sizeof(in_addr_t), (void *)&naddr)) { if (m != m0) m_freem(m); CTR1(KTR_IGMPV3, "%s: m_append() failed.", __func__); return (-ENOMEM); } ++msrcs; if (msrcs == m0srcs) break; } pig->ig_numsrc = htons(msrcs); nbytes += (msrcs * sizeof(in_addr_t)); CTR1(KTR_IGMPV3, "%s: enqueueing next packet", __func__); mbufq_enqueue(mq, m); } return (nbytes); } /* * Type used to mark record pass completion. 
* We exploit the fact we can cast to this easily from the * current filter modes on each ip_msource node. */ typedef enum { REC_NONE = 0x00, /* MCAST_UNDEFINED */ REC_ALLOW = 0x01, /* MCAST_INCLUDE */ REC_BLOCK = 0x02, /* MCAST_EXCLUDE */ REC_FULL = REC_ALLOW | REC_BLOCK } rectype_t; /* * Enqueue an IGMPv3 filter list change to the given output queue. * * Source list filter state is held in an RB-tree. When the filter list * for a group is changed without changing its mode, we need to compute * the deltas between T0 and T1 for each source in the filter set, * and enqueue the appropriate ALLOW_NEW/BLOCK_OLD records. * * As we may potentially queue two record types, and the entire R-B tree * needs to be walked at once, we break this out into its own function * so we can generate a tightly packed queue of packets. * * XXX This could be written to only use one tree walk, although that makes * serializing into the mbuf chains a bit harder. For now we do two walks * which makes things easier on us, and it may or may not be harder on * the L2 cache. * * If successful the size of all data appended to the queue is returned, * otherwise an error code less than zero is returned, or zero if * no record(s) were appended. */ static int igmp_v3_enqueue_filter_change(struct mbufq *mq, struct in_multi *inm) { static const int MINRECLEN = sizeof(struct igmp_grouprec) + sizeof(in_addr_t); struct ifnet *ifp; struct igmp_grouprec ig; struct igmp_grouprec *pig; struct ip_msource *ims, *nims; struct mbuf *m, *m0, *md; in_addr_t naddr; int m0srcs, nbytes, npbytes, off, rsrcs, schanged; int nallow, nblock; uint8_t mode, now, then; rectype_t crt, drt, nrt; IN_MULTI_LIST_LOCK_ASSERT(); if (inm->inm_nsrc == 0 || (inm->inm_st[0].iss_asm > 0 && inm->inm_st[1].iss_asm > 0)) return (0); ifp = inm->inm_ifp; /* interface */ mode = inm->inm_st[1].iss_fmode; /* filter mode at t1 */ crt = REC_NONE; /* current group record type */ drt = REC_NONE; /* mask of completed group record types */ nrt = REC_NONE; /* record type for current node */ m0srcs = 0; /* # source which will fit in current mbuf chain */ nbytes = 0; /* # of bytes appended to group's state-change queue */ npbytes = 0; /* # of bytes appended this packet */ rsrcs = 0; /* # sources encoded in current record */ schanged = 0; /* # nodes encoded in overall filter change */ nallow = 0; /* # of source entries in ALLOW_NEW */ nblock = 0; /* # of source entries in BLOCK_OLD */ nims = NULL; /* next tree node pointer */ /* * For each possible filter record mode. * The first kind of source we encounter tells us which * is the first kind of record we start appending. * If a node transitioned to UNDEFINED at t1, its mode is treated * as the inverse of the group's filter mode. 
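Concretely, for a group whose mode did not change, the filter-change pass boils down to set differences between the T0 and T1 source lists: sources present only at T1 go into an ALLOW_NEW record, sources present only at T0 go into a BLOCK_OLD record. A brute-force sketch with two plain arrays standing in for the RB-tree:

    #include <stdio.h>

    int
    main(void)
    {
            unsigned t0[] = { 0x0a000001, 0x0a000002 };     /* 10.0.0.1-2 at T0 */
            unsigned t1[] = { 0x0a000002, 0x0a000003 };     /* 10.0.0.2-3 at T1 */
            int n0 = sizeof(t0) / sizeof(t0[0]);
            int n1 = sizeof(t1) / sizeof(t1[0]);
            int i, j, found;

            for (i = 0; i < n1; i++) {              /* in T1 only: ALLOW_NEW */
                    for (found = 0, j = 0; j < n0; j++)
                            found |= (t1[i] == t0[j]);
                    if (!found)
                            printf("ALLOW_NEW 0x%08x\n", t1[i]);
            }
            for (i = 0; i < n0; i++) {              /* in T0 only: BLOCK_OLD */
                    for (found = 0, j = 0; j < n1; j++)
                            found |= (t0[i] == t1[j]);
                    if (!found)
                            printf("BLOCK_OLD 0x%08x\n", t0[i]);
            }
            return (0);
    }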
*/ while (drt != REC_FULL) { do { m0 = mbufq_last(mq); if (m0 != NULL && (m0->m_pkthdr.PH_vt.vt_nrecs + 1 <= IGMP_V3_REPORT_MAXRECS) && (m0->m_pkthdr.len + MINRECLEN) < (ifp->if_mtu - IGMP_LEADINGSPACE)) { m = m0; m0srcs = (ifp->if_mtu - m0->m_pkthdr.len - sizeof(struct igmp_grouprec)) / sizeof(in_addr_t); CTR1(KTR_IGMPV3, "%s: use previous packet", __func__); } else { m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); if (m) m->m_data += IGMP_LEADINGSPACE; if (m == NULL) { m = m_gethdr(M_NOWAIT, MT_DATA); if (m) M_ALIGN(m, IGMP_LEADINGSPACE); } if (m == NULL) { CTR1(KTR_IGMPV3, "%s: m_get*() failed", __func__); return (-ENOMEM); } m->m_pkthdr.PH_vt.vt_nrecs = 0; igmp_save_context(m, ifp); m0srcs = (ifp->if_mtu - IGMP_LEADINGSPACE - sizeof(struct igmp_grouprec)) / sizeof(in_addr_t); npbytes = 0; CTR1(KTR_IGMPV3, "%s: allocated new packet", __func__); } /* * Append the IGMP group record header to the * current packet's data area. * Recalculate pointer to free space for next * group record, in case m_append() allocated * a new mbuf or cluster. */ memset(&ig, 0, sizeof(ig)); ig.ig_group = inm->inm_addr; if (!m_append(m, sizeof(ig), (void *)&ig)) { if (m != m0) m_freem(m); CTR1(KTR_IGMPV3, "%s: m_append() failed", __func__); return (-ENOMEM); } npbytes += sizeof(struct igmp_grouprec); if (m != m0) { /* new packet; offset in c hain */ md = m_getptr(m, npbytes - sizeof(struct igmp_grouprec), &off); pig = (struct igmp_grouprec *)(mtod(md, uint8_t *) + off); } else { /* current packet; offset from last append */ md = m_last(m); pig = (struct igmp_grouprec *)(mtod(md, uint8_t *) + md->m_len - sizeof(struct igmp_grouprec)); } /* * Begin walking the tree for this record type * pass, or continue from where we left off * previously if we had to allocate a new packet. * Only report deltas in-mode at t1. * We need not report included sources as allowed * if we are in inclusive mode on the group, * however the converse is not true. */ rsrcs = 0; if (nims == NULL) nims = RB_MIN(ip_msource_tree, &inm->inm_srcs); RB_FOREACH_FROM(ims, ip_msource_tree, nims) { CTR2(KTR_IGMPV3, "%s: visit node 0x%08x", __func__, ims->ims_haddr); now = ims_get_mode(inm, ims, 1); then = ims_get_mode(inm, ims, 0); CTR3(KTR_IGMPV3, "%s: mode: t0 %d, t1 %d", __func__, then, now); if (now == then) { CTR1(KTR_IGMPV3, "%s: skip unchanged", __func__); continue; } if (mode == MCAST_EXCLUDE && now == MCAST_INCLUDE) { CTR1(KTR_IGMPV3, "%s: skip IN src on EX group", __func__); continue; } nrt = (rectype_t)now; if (nrt == REC_NONE) nrt = (rectype_t)(~mode & REC_FULL); if (schanged++ == 0) { crt = nrt; } else if (crt != nrt) continue; naddr = htonl(ims->ims_haddr); if (!m_append(m, sizeof(in_addr_t), (void *)&naddr)) { if (m != m0) m_freem(m); CTR1(KTR_IGMPV3, "%s: m_append() failed", __func__); return (-ENOMEM); } nallow += !!(crt == REC_ALLOW); nblock += !!(crt == REC_BLOCK); if (++rsrcs == m0srcs) break; } /* * If we did not append any tree nodes on this * pass, back out of allocations. */ if (rsrcs == 0) { npbytes -= sizeof(struct igmp_grouprec); if (m != m0) { CTR1(KTR_IGMPV3, "%s: m_free(m)", __func__); m_freem(m); } else { CTR1(KTR_IGMPV3, "%s: m_adj(m, -ig)", __func__); m_adj(m, -((int)sizeof( struct igmp_grouprec))); } continue; } npbytes += (rsrcs * sizeof(in_addr_t)); if (crt == REC_ALLOW) pig->ig_type = IGMP_ALLOW_NEW_SOURCES; else if (crt == REC_BLOCK) pig->ig_type = IGMP_BLOCK_OLD_SOURCES; pig->ig_numsrc = htons(rsrcs); /* * Count the new group record, and enqueue this * packet if it wasn't already queued. 
*/ m->m_pkthdr.PH_vt.vt_nrecs++; if (m != m0) mbufq_enqueue(mq, m); nbytes += npbytes; } while (nims != NULL); drt |= crt; crt = (~crt & REC_FULL); } CTR3(KTR_IGMPV3, "%s: queued %d ALLOW_NEW, %d BLOCK_OLD", __func__, nallow, nblock); return (nbytes); } static int igmp_v3_merge_state_changes(struct in_multi *inm, struct mbufq *scq) { struct mbufq *gq; struct mbuf *m; /* pending state-change */ struct mbuf *m0; /* copy of pending state-change */ struct mbuf *mt; /* last state-change in packet */ int docopy, domerge; u_int recslen; docopy = 0; domerge = 0; recslen = 0; IN_MULTI_LIST_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); /* * If there are further pending retransmissions, make a writable * copy of each queued state-change message before merging. */ if (inm->inm_scrv > 0) docopy = 1; gq = &inm->inm_scq; #ifdef KTR if (mbufq_first(gq) == NULL) { CTR2(KTR_IGMPV3, "%s: WARNING: queue for inm %p is empty", __func__, inm); } #endif m = mbufq_first(gq); while (m != NULL) { /* * Only merge the report into the current packet if * there is sufficient space to do so; an IGMPv3 report * packet may only contain 65,535 group records. * Always use a simple mbuf chain concatentation to do this, * as large state changes for single groups may have * allocated clusters. */ domerge = 0; mt = mbufq_last(scq); if (mt != NULL) { recslen = m_length(m, NULL); if ((mt->m_pkthdr.PH_vt.vt_nrecs + m->m_pkthdr.PH_vt.vt_nrecs <= IGMP_V3_REPORT_MAXRECS) && (mt->m_pkthdr.len + recslen <= (inm->inm_ifp->if_mtu - IGMP_LEADINGSPACE))) domerge = 1; } if (!domerge && mbufq_full(gq)) { CTR2(KTR_IGMPV3, "%s: outbound queue full, skipping whole packet %p", __func__, m); mt = m->m_nextpkt; if (!docopy) m_freem(m); m = mt; continue; } if (!docopy) { CTR2(KTR_IGMPV3, "%s: dequeueing %p", __func__, m); m0 = mbufq_dequeue(gq); m = m0->m_nextpkt; } else { CTR2(KTR_IGMPV3, "%s: copying %p", __func__, m); m0 = m_dup(m, M_NOWAIT); if (m0 == NULL) return (ENOMEM); m0->m_nextpkt = NULL; m = m->m_nextpkt; } if (!domerge) { CTR3(KTR_IGMPV3, "%s: queueing %p to scq %p)", __func__, m0, scq); mbufq_enqueue(scq, m0); } else { struct mbuf *mtl; /* last mbuf of packet mt */ CTR3(KTR_IGMPV3, "%s: merging %p with scq tail %p)", __func__, m0, mt); mtl = m_last(mt); m0->m_flags &= ~M_PKTHDR; mt->m_pkthdr.len += recslen; mt->m_pkthdr.PH_vt.vt_nrecs += m0->m_pkthdr.PH_vt.vt_nrecs; mtl->m_next = m0; } } return (0); } /* * Respond to a pending IGMPv3 General Query. */ static void igmp_v3_dispatch_general_query(struct igmp_ifsoftc *igi) { struct ifmultiaddr *ifma; struct ifnet *ifp; struct in_multi *inm; int retval, loop; IN_MULTI_LIST_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); KASSERT(igi->igi_version == IGMP_VERSION_3, ("%s: called when version %d", __func__, igi->igi_version)); /* * Check that there are some packets queued. If so, send them first. * For large number of groups the reply to general query can take * many packets, we should finish sending them before starting of * queuing the new reply. 
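The merge test in igmp_v3_merge_state_changes() above reduces to two bounds: the combined group record count must stay within the 16-bit limit of a single IGMPv3 report, and the combined length must still fit under the MTU once leading space is reserved. A small sketch of that predicate, with illustrative sizes (1500-byte MTU, 32 bytes of leading space assumed):

    #include <stdio.h>

    #define MAXRECS 65535   /* 16-bit group record count in a report */

    static int
    can_merge(int tail_recs, int tail_len, int pend_recs, int pend_len,
        int mtu, int leadingspace)
    {
            if (tail_recs + pend_recs > MAXRECS)
                    return (0);             /* too many records */
            if (tail_len + pend_len > mtu - leadingspace)
                    return (0);             /* would exceed the MTU */
            return (1);
    }

    int
    main(void)
    {
            printf("small report:     %s\n",
                can_merge(3, 200, 1, 24, 1500, 32) ? "merge" : "new packet");
            printf("oversized report: %s\n",
                can_merge(3, 1400, 1, 200, 1500, 32) ? "merge" : "new packet");
            return (0);
    }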
*/ if (mbufq_len(&igi->igi_gq) != 0) goto send; ifp = igi->igi_ifp; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) continue; inm = (struct in_multi *)ifma->ifma_protospec; KASSERT(ifp == inm->inm_ifp, ("%s: inconsistent ifp", __func__)); switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: break; case IGMP_REPORTING_MEMBER: case IGMP_IDLE_MEMBER: case IGMP_LAZY_MEMBER: case IGMP_SLEEPING_MEMBER: case IGMP_AWAKENING_MEMBER: inm->inm_state = IGMP_REPORTING_MEMBER; retval = igmp_v3_enqueue_group_record(&igi->igi_gq, inm, 0, 0, 0); CTR2(KTR_IGMPV3, "%s: enqueue record = %d", __func__, retval); break; case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_SG_QUERY_PENDING_MEMBER: case IGMP_LEAVING_MEMBER: break; } } IF_ADDR_RUNLOCK(ifp); send: loop = (igi->igi_flags & IGIF_LOOPBACK) ? 1 : 0; igmp_dispatch_queue(&igi->igi_gq, IGMP_MAX_RESPONSE_BURST, loop); /* * Slew transmission of bursts over 500ms intervals. */ if (mbufq_first(&igi->igi_gq) != NULL) { igi->igi_v3_timer = 1 + IGMP_RANDOM_DELAY( IGMP_RESPONSE_BURST_INTERVAL); V_interface_timers_running = 1; } } /* * Transmit the next pending IGMP message in the output queue. * * We get called from netisr_processqueue(). A mutex private to igmpoq * will be acquired and released around this routine. * * VIMAGE: Needs to store/restore vnet pointer on a per-mbuf-chain basis. * MRT: Nothing needs to be done, as IGMP traffic is always local to * a link and uses a link-scope multicast address. */ static void igmp_intr(struct mbuf *m) { struct ip_moptions imo; struct ifnet *ifp; struct mbuf *ipopts, *m0; int error; uint32_t ifindex; CTR2(KTR_IGMPV3, "%s: transmit %p", __func__, m); /* * Set VNET image pointer from enqueued mbuf chain * before doing anything else. Whilst we use interface * indexes to guard against interface detach, they are * unique to each VIMAGE and must be retrieved. */ CURVNET_SET((struct vnet *)(m->m_pkthdr.PH_loc.ptr)); ifindex = igmp_restore_context(m); /* * Check if the ifnet still exists. This limits the scope of * any race in the absence of a global ifp lock for low cost * (an array lookup). */ ifp = ifnet_byindex(ifindex); if (ifp == NULL) { CTR3(KTR_IGMPV3, "%s: dropped %p as ifindex %u went away.", __func__, m, ifindex); m_freem(m); IPSTAT_INC(ips_noroute); goto out; } ipopts = V_igmp_sendra ? m_raopt : NULL; imo.imo_multicast_ttl = 1; imo.imo_multicast_vif = -1; imo.imo_multicast_loop = (V_ip_mrouter != NULL); /* * If the user requested that IGMP traffic be explicitly * redirected to the loopback interface (e.g. they are running a * MANET interface and the routing protocol needs to see the * updates), handle this now. */ if (m->m_flags & M_IGMP_LOOP) imo.imo_multicast_ifp = V_loif; else imo.imo_multicast_ifp = ifp; if (m->m_flags & M_IGMPV2) { m0 = m; } else { m0 = igmp_v3_encap_report(ifp, m); if (m0 == NULL) { CTR2(KTR_IGMPV3, "%s: dropped %p", __func__, m); m_freem(m); IPSTAT_INC(ips_odropped); goto out; } } igmp_scrub_context(m0); m_clrprotoflags(m); m0->m_pkthdr.rcvif = V_loif; #ifdef MAC mac_netinet_igmp_send(ifp, m0); #endif error = ip_output(m0, ipopts, NULL, 0, &imo, NULL); if (error) { CTR3(KTR_IGMPV3, "%s: ip_output(%p) = %d", __func__, m0, error); goto out; } IGMPSTAT_INC(igps_snd_reports); out: /* * We must restore the existing vnet pointer before * continuing as we are run from netisr context. */ CURVNET_RESTORE(); } /* * Encapsulate an IGMPv3 report. 
* * The internal mbuf flag M_IGMPV3_HDR is used to indicate that the mbuf * chain has already had its IP/IGMPv3 header prepended. In this case * the function will not attempt to prepend; the lengths and checksums * will however be re-computed. * * Returns a pointer to the new mbuf chain head, or NULL if the * allocation failed. */ static struct mbuf * igmp_v3_encap_report(struct ifnet *ifp, struct mbuf *m) { struct rm_priotracker in_ifa_tracker; struct igmp_report *igmp; struct ip *ip; int hdrlen, igmpreclen; KASSERT((m->m_flags & M_PKTHDR), ("%s: mbuf chain %p is !M_PKTHDR", __func__, m)); igmpreclen = m_length(m, NULL); hdrlen = sizeof(struct ip) + sizeof(struct igmp_report); if (m->m_flags & M_IGMPV3_HDR) { igmpreclen -= hdrlen; } else { M_PREPEND(m, hdrlen, M_NOWAIT); if (m == NULL) return (NULL); m->m_flags |= M_IGMPV3_HDR; } CTR2(KTR_IGMPV3, "%s: igmpreclen is %d", __func__, igmpreclen); m->m_data += sizeof(struct ip); m->m_len -= sizeof(struct ip); igmp = mtod(m, struct igmp_report *); igmp->ir_type = IGMP_v3_HOST_MEMBERSHIP_REPORT; igmp->ir_rsv1 = 0; igmp->ir_rsv2 = 0; igmp->ir_numgrps = htons(m->m_pkthdr.PH_vt.vt_nrecs); igmp->ir_cksum = 0; igmp->ir_cksum = in_cksum(m, sizeof(struct igmp_report) + igmpreclen); m->m_pkthdr.PH_vt.vt_nrecs = 0; m->m_data -= sizeof(struct ip); m->m_len += sizeof(struct ip); ip = mtod(m, struct ip *); ip->ip_tos = IPTOS_PREC_INTERNETCONTROL; ip->ip_len = htons(hdrlen + igmpreclen); ip->ip_off = htons(IP_DF); ip->ip_p = IPPROTO_IGMP; ip->ip_sum = 0; ip->ip_src.s_addr = INADDR_ANY; if (m->m_flags & M_IGMP_LOOP) { struct in_ifaddr *ia; IFP_TO_IA(ifp, ia, &in_ifa_tracker); if (ia != NULL) { ip->ip_src = ia->ia_addr.sin_addr; ifa_free(&ia->ia_ifa); } } ip->ip_dst.s_addr = htonl(INADDR_ALLRPTS_GROUP); return (m); } #ifdef KTR static char * igmp_rec_type_to_str(const int type) { switch (type) { case IGMP_CHANGE_TO_EXCLUDE_MODE: return "TO_EX"; break; case IGMP_CHANGE_TO_INCLUDE_MODE: return "TO_IN"; break; case IGMP_MODE_IS_EXCLUDE: return "MODE_EX"; break; case IGMP_MODE_IS_INCLUDE: return "MODE_IN"; break; case IGMP_ALLOW_NEW_SOURCES: return "ALLOW_NEW"; break; case IGMP_BLOCK_OLD_SOURCES: return "BLOCK_OLD"; break; default: break; } return "unknown"; } #endif #ifdef VIMAGE static void vnet_igmp_init(const void *unused __unused) { netisr_register_vnet(&igmp_nh); } VNET_SYSINIT(vnet_igmp_init, SI_SUB_PROTO_MC, SI_ORDER_ANY, vnet_igmp_init, NULL); static void vnet_igmp_uninit(const void *unused __unused) { /* This can happen when we shutdown the entire network stack. 
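For the encapsulation in igmp_v3_encap_report() above, the length bookkeeping is straightforward: the prepended header is 28 bytes (20-byte IP header plus 8-byte report header), ip_len covers that header plus the records, and the checksum covers only the report header plus the records. A worked example for a report carrying two group records of one source each:

    #include <stdio.h>

    int
    main(void)
    {
            int ip_hdr = 20, report_hdr = 8;        /* assumed header sizes */
            int grouprec_hdr = 8, src = 4;
            int nrec = 2, nsrc = 1;                 /* two records, one source each */
            int igmpreclen, hdrlen;

            igmpreclen = nrec * (grouprec_hdr + nsrc * src);
            hdrlen = ip_hdr + report_hdr;
            printf("record payload : %d bytes\n", igmpreclen);
            printf("ip_len         : %d bytes\n", hdrlen + igmpreclen);
            printf("checksum covers: %d bytes\n", report_hdr + igmpreclen);
            return (0);
    }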
*/ CTR1(KTR_IGMPV3, "%s: tearing down", __func__); netisr_unregister_vnet(&igmp_nh); } VNET_SYSUNINIT(vnet_igmp_uninit, SI_SUB_PROTO_MC, SI_ORDER_ANY, vnet_igmp_uninit, NULL); #endif #ifdef DDB DB_SHOW_COMMAND(igi_list, db_show_igi_list) { struct igmp_ifsoftc *igi, *tigi; LIST_HEAD(_igi_list, igmp_ifsoftc) *igi_head; if (!have_addr) { db_printf("usage: show igi_list \n"); return; } igi_head = (struct _igi_list *)addr; LIST_FOREACH_SAFE(igi, igi_head, igi_link, tigi) { db_printf("igmp_ifsoftc %p:\n", igi); db_printf(" ifp %p\n", igi->igi_ifp); db_printf(" version %u\n", igi->igi_version); db_printf(" v1_timer %u\n", igi->igi_v1_timer); db_printf(" v2_timer %u\n", igi->igi_v2_timer); db_printf(" v3_timer %u\n", igi->igi_v3_timer); db_printf(" flags %#x\n", igi->igi_flags); db_printf(" rv %u\n", igi->igi_rv); db_printf(" qi %u\n", igi->igi_qi); db_printf(" qri %u\n", igi->igi_qri); db_printf(" uri %u\n", igi->igi_uri); /* struct mbufq igi_gq; */ db_printf("\n"); } } #endif static int igmp_modevent(module_t mod, int type, void *unused __unused) { switch (type) { case MOD_LOAD: CTR1(KTR_IGMPV3, "%s: initializing", __func__); IGMP_LOCK_INIT(); m_raopt = igmp_ra_alloc(); netisr_register(&igmp_nh); break; case MOD_UNLOAD: CTR1(KTR_IGMPV3, "%s: tearing down", __func__); netisr_unregister(&igmp_nh); m_free(m_raopt); m_raopt = NULL; IGMP_LOCK_DESTROY(); break; default: return (EOPNOTSUPP); } return (0); } static moduledata_t igmp_mod = { "igmp", igmp_modevent, 0 }; DECLARE_MODULE(igmp, igmp_mod, SI_SUB_PROTO_MC, SI_ORDER_MIDDLE); Index: head/sys/netinet/in.c =================================================================== --- head/sys/netinet/in.c (revision 333308) +++ head/sys/netinet/in.c (revision 333309) @@ -1,1508 +1,1509 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. * Copyright (C) 2001 WIDE Project. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)in.c 8.4 (Berkeley) 1/9/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_mpath.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int in_aifaddr_ioctl(u_long, caddr_t, struct ifnet *, struct thread *); static int in_difaddr_ioctl(u_long, caddr_t, struct ifnet *, struct thread *); static void in_socktrim(struct sockaddr_in *); static void in_purgemaddrs(struct ifnet *); static VNET_DEFINE(int, nosameprefix); #define V_nosameprefix VNET(nosameprefix) SYSCTL_INT(_net_inet_ip, OID_AUTO, no_same_prefix, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nosameprefix), 0, "Refuse to create same prefixes on different interfaces"); VNET_DECLARE(struct inpcbinfo, ripcbinfo); #define V_ripcbinfo VNET(ripcbinfo) static struct sx in_control_sx; SX_SYSINIT(in_control_sx, &in_control_sx, "in_control"); /* * Return 1 if an internet address is for a ``local'' host * (one to which we have a connection). */ int in_localaddr(struct in_addr in) { struct rm_priotracker in_ifa_tracker; u_long i = ntohl(in.s_addr); struct in_ifaddr *ia; IN_IFADDR_RLOCK(&in_ifa_tracker); TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { if ((i & ia->ia_subnetmask) == ia->ia_subnet) { IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (1); } } IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (0); } /* * Return 1 if an internet address is for the local host and configured * on one of its interfaces. */ int in_localip(struct in_addr in) { struct rm_priotracker in_ifa_tracker; struct in_ifaddr *ia; IN_IFADDR_RLOCK(&in_ifa_tracker); LIST_FOREACH(ia, INADDR_HASH(in.s_addr), ia_hash) { if (IA_SIN(ia)->sin_addr.s_addr == in.s_addr) { IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (1); } } IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (0); } /* * Return 1 if an internet address is configured on an interface. */ int in_ifhasaddr(struct ifnet *ifp, struct in_addr in) { struct ifaddr *ifa; struct in_ifaddr *ia; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET) continue; ia = (struct in_ifaddr *)ifa; if (ia->ia_addr.sin_addr.s_addr == in.s_addr) { IF_ADDR_RUNLOCK(ifp); return (1); } } IF_ADDR_RUNLOCK(ifp); return (0); } /* * Return a reference to the interface address which is different to * the supplied one but with same IP address value. */ static struct in_ifaddr * in_localip_more(struct in_ifaddr *ia) { struct rm_priotracker in_ifa_tracker; in_addr_t in = IA_SIN(ia)->sin_addr.s_addr; struct in_ifaddr *it; IN_IFADDR_RLOCK(&in_ifa_tracker); LIST_FOREACH(it, INADDR_HASH(in), ia_hash) { if (it != ia && IA_SIN(it)->sin_addr.s_addr == in) { ifa_ref(&it->ia_ifa); IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (it); } } IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (NULL); } /* * Determine whether an IP address is in a reserved set of addresses * that may not be forwarded, or whether datagrams to that destination * may be forwarded. 
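in_localaddr() above decides "local" by masking the candidate with each configured ia_subnetmask and comparing against ia_subnet, both kept in host byte order. A standalone illustration of that comparison:

    #include <stdio.h>
    #include <stdint.h>
    #include <arpa/inet.h>

    int
    main(void)
    {
            /* Host byte order, as ia_subnet/ia_subnetmask are stored. */
            uint32_t subnetmask = 0xffffff00U;              /* 255.255.255.0 */
            uint32_t subnet = ntohl(inet_addr("192.168.1.0"));
            uint32_t a = ntohl(inet_addr("192.168.1.42"));
            uint32_t b = ntohl(inet_addr("192.168.2.42"));

            printf("192.168.1.42 local: %d\n", (a & subnetmask) == subnet);
            printf("192.168.2.42 local: %d\n", (b & subnetmask) == subnet);
            return (0);
    }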
*/ int in_canforward(struct in_addr in) { u_long i = ntohl(in.s_addr); u_long net; if (IN_EXPERIMENTAL(i) || IN_MULTICAST(i) || IN_LINKLOCAL(i)) return (0); if (IN_CLASSA(i)) { net = i & IN_CLASSA_NET; if (net == 0 || net == (IN_LOOPBACKNET << IN_CLASSA_NSHIFT)) return (0); } return (1); } /* * Trim a mask in a sockaddr */ static void in_socktrim(struct sockaddr_in *ap) { char *cplim = (char *) &ap->sin_addr; char *cp = (char *) (&ap->sin_addr + 1); ap->sin_len = 0; while (--cp >= cplim) if (*cp) { (ap)->sin_len = cp - (char *) (ap) + 1; break; } } /* * Generic internet control operations (ioctl's). */ int in_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, struct thread *td) { struct ifreq *ifr = (struct ifreq *)data; struct sockaddr_in *addr = (struct sockaddr_in *)&ifr->ifr_addr; struct ifaddr *ifa; struct in_ifaddr *ia; int error; if (ifp == NULL) return (EADDRNOTAVAIL); /* * Filter out 4 ioctls we implement directly. Forward the rest * to specific functions and ifp->if_ioctl(). */ switch (cmd) { case SIOCGIFADDR: case SIOCGIFBRDADDR: case SIOCGIFDSTADDR: case SIOCGIFNETMASK: break; case SIOCDIFADDR: sx_xlock(&in_control_sx); error = in_difaddr_ioctl(cmd, data, ifp, td); sx_xunlock(&in_control_sx); return (error); case OSIOCAIFADDR: /* 9.x compat */ case SIOCAIFADDR: sx_xlock(&in_control_sx); error = in_aifaddr_ioctl(cmd, data, ifp, td); sx_xunlock(&in_control_sx); return (error); case SIOCSIFADDR: case SIOCSIFBRDADDR: case SIOCSIFDSTADDR: case SIOCSIFNETMASK: /* We no longer support that old commands. */ return (EINVAL); default: if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); return ((*ifp->if_ioctl)(ifp, cmd, data)); } if (addr->sin_addr.s_addr != INADDR_ANY && prison_check_ip4(td->td_ucred, &addr->sin_addr) != 0) return (EADDRNOTAVAIL); /* * Find address for this interface, if it exists. If an * address was specified, find that one instead of the * first one on the interface, if possible. */ IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET) continue; ia = (struct in_ifaddr *)ifa; if (ia->ia_addr.sin_addr.s_addr == addr->sin_addr.s_addr) break; } if (ifa == NULL) TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) if (ifa->ifa_addr->sa_family == AF_INET) { ia = (struct in_ifaddr *)ifa; if (prison_check_ip4(td->td_ucred, &ia->ia_addr.sin_addr) == 0) break; } if (ifa == NULL) { IF_ADDR_RUNLOCK(ifp); return (EADDRNOTAVAIL); } error = 0; switch (cmd) { case SIOCGIFADDR: *addr = ia->ia_addr; break; case SIOCGIFBRDADDR: if ((ifp->if_flags & IFF_BROADCAST) == 0) { error = EINVAL; break; } *addr = ia->ia_broadaddr; break; case SIOCGIFDSTADDR: if ((ifp->if_flags & IFF_POINTOPOINT) == 0) { error = EINVAL; break; } *addr = ia->ia_dstaddr; break; case SIOCGIFNETMASK: *addr = ia->ia_sockmask; break; } IF_ADDR_RUNLOCK(ifp); return (error); } static int in_aifaddr_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp, struct thread *td) { const struct in_aliasreq *ifra = (struct in_aliasreq *)data; const struct sockaddr_in *addr = &ifra->ifra_addr; const struct sockaddr_in *broadaddr = &ifra->ifra_broadaddr; const struct sockaddr_in *mask = &ifra->ifra_mask; const struct sockaddr_in *dstaddr = &ifra->ifra_dstaddr; const int vhid = (cmd == SIOCAIFADDR) ? ifra->ifra_vhid : 0; struct ifaddr *ifa; struct in_ifaddr *ia; bool iaIsFirst; int error = 0; error = priv_check(td, PRIV_NET_ADDIFADDR); if (error) return (error); /* * ifra_addr must be present and be of INET family. 
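in_socktrim() above shrinks the recorded length of a netmask sockaddr so that it only spans the significant bytes of the mask; a 255.255.0.0 mask, for example, trims to a length of 6 (four bytes of leading sockaddr fields plus the two non-zero mask bytes). A rough re-creation with a plain struct standing in for struct sockaddr_in, assuming the same field offsets:

    #include <stdio.h>
    #include <string.h>

    struct mask_sa {                /* stand-in for struct sockaddr_in */
            unsigned char len;
            unsigned char family;
            unsigned char port[2];
            unsigned char addr[4];  /* the netmask */
    };

    static void
    socktrim(struct mask_sa *ap)
    {
            int i;

            ap->len = 0;
            for (i = 3; i >= 0; i--)
                    if (ap->addr[i] != 0) {
                            /* leading fields plus mask bytes up to the
                             * last non-zero one */
                            ap->len = 4 + i + 1;
                            break;
                    }
    }

    int
    main(void)
    {
            struct mask_sa m;

            memset(&m, 0, sizeof(m));
            m.addr[0] = 0xff;
            m.addr[1] = 0xff;               /* 255.255.0.0 */
            socktrim(&m);
            printf("trimmed length: %u\n", m.len);  /* 6 */
            return (0);
    }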
* ifra_broadaddr/ifra_dstaddr and ifra_mask are optional. */ if (addr->sin_len != sizeof(struct sockaddr_in) || addr->sin_family != AF_INET) return (EINVAL); if (broadaddr->sin_len != 0 && (broadaddr->sin_len != sizeof(struct sockaddr_in) || broadaddr->sin_family != AF_INET)) return (EINVAL); if (mask->sin_len != 0 && (mask->sin_len != sizeof(struct sockaddr_in) || mask->sin_family != AF_INET)) return (EINVAL); if ((ifp->if_flags & IFF_POINTOPOINT) && (dstaddr->sin_len != sizeof(struct sockaddr_in) || dstaddr->sin_addr.s_addr == INADDR_ANY)) return (EDESTADDRREQ); if (vhid > 0 && carp_attach_p == NULL) return (EPROTONOSUPPORT); /* * See whether address already exist. */ iaIsFirst = true; ia = NULL; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { struct in_ifaddr *it; if (ifa->ifa_addr->sa_family != AF_INET) continue; it = (struct in_ifaddr *)ifa; iaIsFirst = false; if (it->ia_addr.sin_addr.s_addr == addr->sin_addr.s_addr && prison_check_ip4(td->td_ucred, &addr->sin_addr) == 0) ia = it; } IF_ADDR_RUNLOCK(ifp); if (ia != NULL) (void )in_difaddr_ioctl(cmd, data, ifp, td); ifa = ifa_alloc(sizeof(struct in_ifaddr), M_WAITOK); ia = (struct in_ifaddr *)ifa; ifa->ifa_addr = (struct sockaddr *)&ia->ia_addr; ifa->ifa_dstaddr = (struct sockaddr *)&ia->ia_dstaddr; ifa->ifa_netmask = (struct sockaddr *)&ia->ia_sockmask; callout_init_rw(&ia->ia_garp_timer, &ifp->if_addr_lock, CALLOUT_RETURNUNLOCKED); ia->ia_ifp = ifp; ia->ia_addr = *addr; if (mask->sin_len != 0) { ia->ia_sockmask = *mask; ia->ia_subnetmask = ntohl(ia->ia_sockmask.sin_addr.s_addr); } else { in_addr_t i = ntohl(addr->sin_addr.s_addr); /* * Be compatible with network classes, if netmask isn't * supplied, guess it based on classes. */ if (IN_CLASSA(i)) ia->ia_subnetmask = IN_CLASSA_NET; else if (IN_CLASSB(i)) ia->ia_subnetmask = IN_CLASSB_NET; else ia->ia_subnetmask = IN_CLASSC_NET; ia->ia_sockmask.sin_addr.s_addr = htonl(ia->ia_subnetmask); } ia->ia_subnet = ntohl(addr->sin_addr.s_addr) & ia->ia_subnetmask; in_socktrim(&ia->ia_sockmask); if (ifp->if_flags & IFF_BROADCAST) { if (broadaddr->sin_len != 0) { ia->ia_broadaddr = *broadaddr; } else if (ia->ia_subnetmask == IN_RFC3021_MASK) { ia->ia_broadaddr.sin_addr.s_addr = INADDR_BROADCAST; ia->ia_broadaddr.sin_len = sizeof(struct sockaddr_in); ia->ia_broadaddr.sin_family = AF_INET; } else { ia->ia_broadaddr.sin_addr.s_addr = htonl(ia->ia_subnet | ~ia->ia_subnetmask); ia->ia_broadaddr.sin_len = sizeof(struct sockaddr_in); ia->ia_broadaddr.sin_family = AF_INET; } } if (ifp->if_flags & IFF_POINTOPOINT) ia->ia_dstaddr = *dstaddr; /* XXXGL: rtinit() needs this strange assignment. */ if (ifp->if_flags & IFF_LOOPBACK) ia->ia_dstaddr = ia->ia_addr; if (vhid != 0) { error = (*carp_attach_p)(&ia->ia_ifa, vhid); if (error) return (error); } /* if_addrhead is already referenced by ifa_alloc() */ IF_ADDR_WLOCK(ifp); TAILQ_INSERT_TAIL(&ifp->if_addrhead, ifa, ifa_link); IF_ADDR_WUNLOCK(ifp); ifa_ref(ifa); /* in_ifaddrhead */ IN_IFADDR_WLOCK(); TAILQ_INSERT_TAIL(&V_in_ifaddrhead, ia, ia_link); LIST_INSERT_HEAD(INADDR_HASH(ia->ia_addr.sin_addr.s_addr), ia, ia_hash); IN_IFADDR_WUNLOCK(); /* * Give the interface a chance to initialize * if this is its first address, * and to validate the address if necessary. */ if (ifp->if_ioctl != NULL) { error = (*ifp->if_ioctl)(ifp, SIOCSIFADDR, (caddr_t)ia); if (error) goto fail1; } /* * Add route for the network. 
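The broadcast address assembled above is simply the subnet bits with every host bit set; for 192.168.1.10 with the class-C fallback mask 255.255.255.0 that is 192.168.1.255. A quick standalone check of that computation:

    #include <stdio.h>
    #include <stdint.h>
    #include <arpa/inet.h>

    int
    main(void)
    {
            /* Host byte order throughout, as ia_subnet/ia_subnetmask are kept. */
            uint32_t addr = ntohl(inet_addr("192.168.1.10"));
            uint32_t subnetmask = 0xffffff00U;      /* 255.255.255.0 */
            uint32_t subnet = addr & subnetmask;    /* 192.168.1.0 */
            struct in_addr bcast;

            /* Directed broadcast: subnet bits plus all host bits set. */
            bcast.s_addr = htonl(subnet | ~subnetmask);
            printf("broadcast: %s\n", inet_ntoa(bcast));    /* 192.168.1.255 */
            return (0);
    }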
*/ if (vhid == 0) { int flags = RTF_UP; if (ifp->if_flags & (IFF_LOOPBACK|IFF_POINTOPOINT)) flags |= RTF_HOST; error = in_addprefix(ia, flags); if (error) goto fail1; } /* * Add a loopback route to self. */ if (vhid == 0 && (ifp->if_flags & IFF_LOOPBACK) == 0 && ia->ia_addr.sin_addr.s_addr != INADDR_ANY && !((ifp->if_flags & IFF_POINTOPOINT) && ia->ia_dstaddr.sin_addr.s_addr == ia->ia_addr.sin_addr.s_addr)) { struct in_ifaddr *eia; eia = in_localip_more(ia); if (eia == NULL) { error = ifa_add_loopback_route((struct ifaddr *)ia, (struct sockaddr *)&ia->ia_addr); if (error) goto fail2; } else ifa_free(&eia->ia_ifa); } if (iaIsFirst && (ifp->if_flags & IFF_MULTICAST)) { struct in_addr allhosts_addr; struct in_ifinfo *ii; ii = ((struct in_ifinfo *)ifp->if_afdata[AF_INET]); allhosts_addr.s_addr = htonl(INADDR_ALLHOSTS_GROUP); error = in_joingroup(ifp, &allhosts_addr, NULL, &ii->ii_allhosts); } EVENTHANDLER_INVOKE(ifaddr_event, ifp); return (error); fail2: if (vhid == 0) (void )in_scrubprefix(ia, LLE_STATIC); fail1: if (ia->ia_ifa.ifa_carp) (*carp_detach_p)(&ia->ia_ifa, false); IF_ADDR_WLOCK(ifp); TAILQ_REMOVE(&ifp->if_addrhead, &ia->ia_ifa, ifa_link); IF_ADDR_WUNLOCK(ifp); ifa_free(&ia->ia_ifa); /* if_addrhead */ IN_IFADDR_WLOCK(); TAILQ_REMOVE(&V_in_ifaddrhead, ia, ia_link); LIST_REMOVE(ia, ia_hash); IN_IFADDR_WUNLOCK(); ifa_free(&ia->ia_ifa); /* in_ifaddrhead */ return (error); } static int in_difaddr_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp, struct thread *td) { const struct ifreq *ifr = (struct ifreq *)data; const struct sockaddr_in *addr = (const struct sockaddr_in *) &ifr->ifr_addr; struct ifaddr *ifa; struct in_ifaddr *ia; bool deleteAny, iaIsLast; int error; if (td != NULL) { error = priv_check(td, PRIV_NET_DELIFADDR); if (error) return (error); } if (addr->sin_len != sizeof(struct sockaddr_in) || addr->sin_family != AF_INET) deleteAny = true; else deleteAny = false; iaIsLast = true; ia = NULL; IF_ADDR_WLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { struct in_ifaddr *it; if (ifa->ifa_addr->sa_family != AF_INET) continue; it = (struct in_ifaddr *)ifa; if (deleteAny && ia == NULL && (td == NULL || prison_check_ip4(td->td_ucred, &it->ia_addr.sin_addr) == 0)) ia = it; if (it->ia_addr.sin_addr.s_addr == addr->sin_addr.s_addr && (td == NULL || prison_check_ip4(td->td_ucred, &addr->sin_addr) == 0)) ia = it; if (it != ia) iaIsLast = false; } if (ia == NULL) { IF_ADDR_WUNLOCK(ifp); return (EADDRNOTAVAIL); } TAILQ_REMOVE(&ifp->if_addrhead, &ia->ia_ifa, ifa_link); IF_ADDR_WUNLOCK(ifp); ifa_free(&ia->ia_ifa); /* if_addrhead */ IN_IFADDR_WLOCK(); TAILQ_REMOVE(&V_in_ifaddrhead, ia, ia_link); LIST_REMOVE(ia, ia_hash); IN_IFADDR_WUNLOCK(); /* * in_scrubprefix() kills the interface route. */ in_scrubprefix(ia, LLE_STATIC); /* * in_ifadown gets rid of all the rest of * the routes. This is not quite the right * thing to do, but at least if we are running * a routing process they will come back. */ in_ifadown(&ia->ia_ifa, 1); if (ia->ia_ifa.ifa_carp) (*carp_detach_p)(&ia->ia_ifa, (cmd == SIOCDIFADDR) ? false : true); /* * If this is the last IPv4 address configured on this * interface, leave the all-hosts group. * No state-change report need be transmitted. 
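
For reference, the add and delete handlers above (in_aifaddr_ioctl() and in_difaddr_ioctl()) sit behind the SIOCAIFADDR and SIOCDIFADDR ioctls, which is the path ifconfig(8) uses. A minimal, privileged userspace sketch of the add side follows; it is not part of this revision, error handling is abbreviated, and the interface name and addresses are placeholders.

#include <sys/ioctl.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <net/if.h>
#include <netinet/in.h>
#include <netinet/in_var.h>	/* struct in_aliasreq */
#include <arpa/inet.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static void
fill_sin(struct sockaddr_in *sin, const char *txt)
{
	memset(sin, 0, sizeof(*sin));
	sin->sin_len = sizeof(*sin);
	sin->sin_family = AF_INET;
	inet_pton(AF_INET, txt, &sin->sin_addr);
}

int
main(void)
{
	struct in_aliasreq ifra;
	int s;

	s = socket(AF_INET, SOCK_DGRAM, 0);
	if (s == -1)
		return (1);

	memset(&ifra, 0, sizeof(ifra));
	strlcpy(ifra.ifra_name, "em0", sizeof(ifra.ifra_name)); /* example */
	fill_sin(&ifra.ifra_addr, "192.0.2.42");
	fill_sin(&ifra.ifra_mask, "255.255.255.0");
	/* ifra_broadaddr left zeroed: the kernel derives subnet | ~mask. */

	/* Requires PRIV_NET_ADDIFADDR, i.e. root. */
	if (ioctl(s, SIOCAIFADDR, &ifra) == -1)
		perror("SIOCAIFADDR");
	close(s);
	return (0);
}
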
*/ if (iaIsLast && (ifp->if_flags & IFF_MULTICAST)) { struct in_ifinfo *ii; ii = ((struct in_ifinfo *)ifp->if_afdata[AF_INET]); if (ii->ii_allhosts) { (void)in_leavegroup(ii->ii_allhosts, NULL); ii->ii_allhosts = NULL; } } IF_ADDR_WLOCK(ifp); if (callout_stop(&ia->ia_garp_timer) == 1) { ifa_free(&ia->ia_ifa); } IF_ADDR_WUNLOCK(ifp); EVENTHANDLER_INVOKE(ifaddr_event, ifp); ifa_free(&ia->ia_ifa); /* in_ifaddrhead */ return (0); } #define rtinitflags(x) \ ((((x)->ia_ifp->if_flags & (IFF_LOOPBACK | IFF_POINTOPOINT)) != 0) \ ? RTF_HOST : 0) /* * Check if we have a route for the given prefix already or add one accordingly. */ int in_addprefix(struct in_ifaddr *target, int flags) { struct rm_priotracker in_ifa_tracker; struct in_ifaddr *ia; struct in_addr prefix, mask, p, m; int error; if ((flags & RTF_HOST) != 0) { prefix = target->ia_dstaddr.sin_addr; mask.s_addr = 0; } else { prefix = target->ia_addr.sin_addr; mask = target->ia_sockmask.sin_addr; prefix.s_addr &= mask.s_addr; } IN_IFADDR_RLOCK(&in_ifa_tracker); /* Look for an existing address with the same prefix, mask, and fib */ TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { if (rtinitflags(ia)) { p = ia->ia_dstaddr.sin_addr; if (prefix.s_addr != p.s_addr) continue; } else { p = ia->ia_addr.sin_addr; m = ia->ia_sockmask.sin_addr; p.s_addr &= m.s_addr; if (prefix.s_addr != p.s_addr || mask.s_addr != m.s_addr) continue; } if (target->ia_ifp->if_fib != ia->ia_ifp->if_fib) continue; /* * If we got a matching prefix route inserted by other * interface address, we are done here. */ if (ia->ia_flags & IFA_ROUTE) { #ifdef RADIX_MPATH if (ia->ia_addr.sin_addr.s_addr == target->ia_addr.sin_addr.s_addr) { IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (EEXIST); } else break; #endif if (V_nosameprefix) { IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (EEXIST); } else { int fibnum; fibnum = V_rt_add_addr_allfibs ? RT_ALL_FIBS : target->ia_ifp->if_fib; rt_addrmsg(RTM_ADD, &target->ia_ifa, fibnum); IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (0); } } } IN_IFADDR_RUNLOCK(&in_ifa_tracker); /* * No-one seem to have this prefix route, so we try to insert it. */ error = rtinit(&target->ia_ifa, (int)RTM_ADD, flags); if (!error) target->ia_flags |= IFA_ROUTE; return (error); } /* * Removes either all lle entries for given @ia, or lle * corresponding to @ia address. */ static void in_scrubprefixlle(struct in_ifaddr *ia, int all, u_int flags) { struct sockaddr_in addr, mask; struct sockaddr *saddr, *smask; struct ifnet *ifp; saddr = (struct sockaddr *)&addr; bzero(&addr, sizeof(addr)); addr.sin_len = sizeof(addr); addr.sin_family = AF_INET; smask = (struct sockaddr *)&mask; bzero(&mask, sizeof(mask)); mask.sin_len = sizeof(mask); mask.sin_family = AF_INET; mask.sin_addr.s_addr = ia->ia_subnetmask; ifp = ia->ia_ifp; if (all) { /* * Remove all L2 entries matching given prefix. * Convert address to host representation to avoid * doing this on every callback. ia_subnetmask is already * stored in host representation. */ addr.sin_addr.s_addr = ntohl(ia->ia_addr.sin_addr.s_addr); lltable_prefix_free(AF_INET, saddr, smask, flags); } else { /* Remove interface address only */ addr.sin_addr.s_addr = ia->ia_addr.sin_addr.s_addr; lltable_delete_addr(LLTABLE(ifp), LLE_IFADDR, saddr); } } /* * If there is no other address in the system that can serve a route to the * same prefix, remove the route. Hand over the route to the new address * otherwise. 
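
in_addprefix() above decides whether a new address would install the same prefix route as an existing one by masking each address with its netmask and requiring both the masked prefixes and the masks (and the FIBs) to match; host routes are compared on the destination address alone. Here is a standalone sketch of roughly that comparison, using host-order values for readability; the struct and function names are invented for illustration.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct pfx {
	uint32_t addr;	/* interface (or destination) address, host order */
	uint32_t mask;	/* netmask, host order; 0 stands for a host route */
};

/*
 * Roughly the rule used by the loop in in_addprefix(): a host route
 * matches on the address alone, a network route must match on both
 * the masked prefix and the mask itself.
 */
static bool
same_prefix_route(const struct pfx *a, const struct pfx *b)
{
	if (a->mask == 0 || b->mask == 0)
		return (a->addr == b->addr);
	return ((a->addr & a->mask) == (b->addr & b->mask) &&
	    a->mask == b->mask);
}

int
main(void)
{
	struct pfx p1 = { 0xc0000201, 0xffffff00 };	/* 192.0.2.1/24 */
	struct pfx p2 = { 0xc00002fe, 0xffffff00 };	/* 192.0.2.254/24 */

	printf("shared prefix route: %s\n",
	    same_prefix_route(&p1, &p2) ? "yes" : "no");
	return (0);
}
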
*/ int in_scrubprefix(struct in_ifaddr *target, u_int flags) { struct rm_priotracker in_ifa_tracker; struct in_ifaddr *ia; struct in_addr prefix, mask, p, m; int error = 0; /* * Remove the loopback route to the interface address. */ if ((target->ia_addr.sin_addr.s_addr != INADDR_ANY) && !(target->ia_ifp->if_flags & IFF_LOOPBACK) && (flags & LLE_STATIC)) { struct in_ifaddr *eia; /* * XXXME: add fib-aware in_localip. * We definitely don't want to switch between * prefixes in different fibs. */ eia = in_localip_more(target); if (eia != NULL) { error = ifa_switch_loopback_route((struct ifaddr *)eia, (struct sockaddr *)&target->ia_addr); ifa_free(&eia->ia_ifa); } else { error = ifa_del_loopback_route((struct ifaddr *)target, (struct sockaddr *)&target->ia_addr); } } if (rtinitflags(target)) { prefix = target->ia_dstaddr.sin_addr; mask.s_addr = 0; } else { prefix = target->ia_addr.sin_addr; mask = target->ia_sockmask.sin_addr; prefix.s_addr &= mask.s_addr; } if ((target->ia_flags & IFA_ROUTE) == 0) { int fibnum; fibnum = V_rt_add_addr_allfibs ? RT_ALL_FIBS : target->ia_ifp->if_fib; rt_addrmsg(RTM_DELETE, &target->ia_ifa, fibnum); /* * Removing address from !IFF_UP interface or * prefix which exists on other interface (along with route). * No entries should exist here except target addr. * Given that, delete this entry only. */ in_scrubprefixlle(target, 0, flags); return (0); } IN_IFADDR_RLOCK(&in_ifa_tracker); TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { if (rtinitflags(ia)) { p = ia->ia_dstaddr.sin_addr; if (prefix.s_addr != p.s_addr) continue; } else { p = ia->ia_addr.sin_addr; m = ia->ia_sockmask.sin_addr; p.s_addr &= m.s_addr; if (prefix.s_addr != p.s_addr || mask.s_addr != m.s_addr) continue; } if ((ia->ia_ifp->if_flags & IFF_UP) == 0) continue; /* * If we got a matching prefix address, move IFA_ROUTE and * the route itself to it. Make sure that routing daemons * get a heads-up. */ if ((ia->ia_flags & IFA_ROUTE) == 0) { ifa_ref(&ia->ia_ifa); IN_IFADDR_RUNLOCK(&in_ifa_tracker); error = rtinit(&(target->ia_ifa), (int)RTM_DELETE, rtinitflags(target)); if (error == 0) target->ia_flags &= ~IFA_ROUTE; else log(LOG_INFO, "in_scrubprefix: err=%d, old prefix delete failed\n", error); /* Scrub all entries IFF interface is different */ in_scrubprefixlle(target, target->ia_ifp != ia->ia_ifp, flags); error = rtinit(&ia->ia_ifa, (int)RTM_ADD, rtinitflags(ia) | RTF_UP); if (error == 0) ia->ia_flags |= IFA_ROUTE; else log(LOG_INFO, "in_scrubprefix: err=%d, new prefix add failed\n", error); ifa_free(&ia->ia_ifa); return (error); } } IN_IFADDR_RUNLOCK(&in_ifa_tracker); /* * remove all L2 entries on the given prefix */ in_scrubprefixlle(target, 1, flags); /* * As no-one seem to have this prefix, we can remove the route. */ error = rtinit(&(target->ia_ifa), (int)RTM_DELETE, rtinitflags(target)); if (error == 0) target->ia_flags &= ~IFA_ROUTE; else log(LOG_INFO, "in_scrubprefix: err=%d, prefix delete failed\n", error); return (error); } #undef rtinitflags void in_ifscrub_all(void) { struct ifnet *ifp; struct ifaddr *ifa, *nifa; struct ifaliasreq ifr; IFNET_RLOCK(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { /* Cannot lock here - lock recursion. */ /* IF_ADDR_RLOCK(ifp); */ TAILQ_FOREACH_SAFE(ifa, &ifp->if_addrhead, ifa_link, nifa) { if (ifa->ifa_addr->sa_family != AF_INET) continue; /* * This is ugly but the only way for legacy IP to * cleanly remove addresses and everything attached. 
*/ bzero(&ifr, sizeof(ifr)); ifr.ifra_addr = *ifa->ifa_addr; if (ifa->ifa_dstaddr) ifr.ifra_broadaddr = *ifa->ifa_dstaddr; (void)in_control(NULL, SIOCDIFADDR, (caddr_t)&ifr, ifp, NULL); } /* IF_ADDR_RUNLOCK(ifp); */ in_purgemaddrs(ifp); igmp_domifdetach(ifp); } IFNET_RUNLOCK(); } int in_ifaddr_broadcast(struct in_addr in, struct in_ifaddr *ia) { return ((in.s_addr == ia->ia_broadaddr.sin_addr.s_addr || /* * Check for old-style (host 0) broadcast, but * taking into account that RFC 3021 obsoletes it. */ (ia->ia_subnetmask != IN_RFC3021_MASK && ntohl(in.s_addr) == ia->ia_subnet)) && /* * Check for an all one subnetmask. These * only exist when an interface gets a secondary * address. */ ia->ia_subnetmask != (u_long)0xffffffff); } /* * Return 1 if the address might be a local broadcast address. */ int in_broadcast(struct in_addr in, struct ifnet *ifp) { struct ifaddr *ifa; int found; if (in.s_addr == INADDR_BROADCAST || in.s_addr == INADDR_ANY) return (1); if ((ifp->if_flags & IFF_BROADCAST) == 0) return (0); found = 0; /* * Look through the list of addresses for a match * with a broadcast address. */ IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) if (ifa->ifa_addr->sa_family == AF_INET && in_ifaddr_broadcast(in, (struct in_ifaddr *)ifa)) { found = 1; break; } IF_ADDR_RUNLOCK(ifp); return (found); } /* * On interface removal, clean up IPv4 data structures hung off of the ifnet. */ void in_ifdetach(struct ifnet *ifp) { IN_MULTI_LOCK(); in_pcbpurgeif0(&V_ripcbinfo, ifp); in_pcbpurgeif0(&V_udbinfo, ifp); in_pcbpurgeif0(&V_ulitecbinfo, ifp); in_purgemaddrs(ifp); IN_MULTI_UNLOCK(); } /* * Delete all IPv4 multicast address records, and associated link-layer * multicast address records, associated with ifp. * XXX It looks like domifdetach runs AFTER the link layer cleanup. * XXX This should not race with ifma_protospec being set during * a new allocation, if it does, we have bigger problems. */ static void in_purgemaddrs(struct ifnet *ifp) { struct in_multi_head purgeinms; struct in_multi *inm; - struct ifmultiaddr *ifma; + struct ifmultiaddr *ifma, *next; SLIST_INIT(&purgeinms); IN_MULTI_LIST_LOCK(); /* * Extract list of in_multi associated with the detaching ifp * which the PF_INET layer is about to release. * We need to do this as IF_ADDR_LOCK() may be re-acquired * by code further down. */ - IF_ADDR_RLOCK(ifp); - TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { + IF_ADDR_WLOCK(ifp); + restart: + TAILQ_FOREACH_SAFE(ifma, &ifp->if_multiaddrs, ifma_link, next) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) continue; -#if 0 - KASSERT(ifma->ifma_protospec != NULL, - ("%s: ifma_protospec is NULL", __func__)); -#endif inm = (struct in_multi *)ifma->ifma_protospec; inm_rele_locked(&purgeinms, inm); + if (__predict_false(ifma_restart)) { + ifma_restart = true; + goto restart; + } } - IF_ADDR_RUNLOCK(ifp); + IF_ADDR_WUNLOCK(ifp); inm_release_list_deferred(&purgeinms); igmp_ifdetach(ifp); IN_MULTI_LIST_UNLOCK(); } struct in_llentry { struct llentry base; }; #define IN_LLTBL_DEFAULT_HSIZE 32 #define IN_LLTBL_HASH(k, h) \ (((((((k >> 8) ^ k) >> 8) ^ k) >> 8) ^ k) & ((h) - 1)) /* * Do actual deallocation of @lle. */ static void in_lltable_destroy_lle_unlocked(struct llentry *lle) { LLE_LOCK_DESTROY(lle); LLE_REQ_DESTROY(lle); free(lle, M_LLTABLE); } /* * Called by the datapath to indicate that * the entry was used. 
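
in_ifaddr_broadcast() above accepts the configured (directed) broadcast address and, for pre-RFC 3021 compatibility, the all-zeros-host form, while refusing to treat anything on an all-ones (/32) mask as broadcast. Below is a userspace restatement of that predicate, not part of this revision; everything is in host byte order and the names are illustrative.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Approximation of in_ifaddr_broadcast(): 'bcast' is the configured
 * broadcast address, 'subnet' the masked network address, and 'mask'
 * the subnet mask of the interface address.
 */
static bool
is_ifaddr_broadcast(uint32_t in, uint32_t bcast, uint32_t subnet,
    uint32_t mask)
{
	bool match;

	/*
	 * Directed broadcast, or the old-style all-zeros-host form;
	 * the latter only when RFC 3021 /31 addressing is not in use.
	 */
	match = (in == bcast) ||
	    (mask != 0xfffffffeU && in == subnet);

	/* A /32 alias never has a broadcast address of its own. */
	return (match && mask != 0xffffffffU);
}

int
main(void)
{
	/* 192.0.2.0/24: both 192.0.2.255 and 192.0.2.0 are accepted. */
	printf("%d %d\n",
	    is_ifaddr_broadcast(0xc00002ff, 0xc00002ff, 0xc0000200, 0xffffff00),
	    is_ifaddr_broadcast(0xc0000200, 0xc00002ff, 0xc0000200, 0xffffff00));
	return (0);
}
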
*/ static void in_lltable_mark_used(struct llentry *lle) { LLE_REQ_LOCK(lle); lle->r_skip_req = 0; LLE_REQ_UNLOCK(lle); } /* * Called by LLE_FREE_LOCKED when number of references * drops to zero. */ static void in_lltable_destroy_lle(struct llentry *lle) { LLE_WUNLOCK(lle); in_lltable_destroy_lle_unlocked(lle); } static struct llentry * in_lltable_new(struct in_addr addr4, u_int flags) { struct in_llentry *lle; lle = malloc(sizeof(struct in_llentry), M_LLTABLE, M_NOWAIT | M_ZERO); if (lle == NULL) /* NB: caller generates msg */ return NULL; /* * For IPv4 this will trigger "arpresolve" to generate * an ARP request. */ lle->base.la_expire = time_uptime; /* mark expired */ lle->base.r_l3addr.addr4 = addr4; lle->base.lle_refcnt = 1; lle->base.lle_free = in_lltable_destroy_lle; LLE_LOCK_INIT(&lle->base); LLE_REQ_INIT(&lle->base); callout_init(&lle->base.lle_timer, 1); return (&lle->base); } #define IN_ARE_MASKED_ADDR_EQUAL(d, a, m) ( \ ((((d).s_addr ^ (a).s_addr) & (m).s_addr)) == 0 ) static int in_lltable_match_prefix(const struct sockaddr *saddr, const struct sockaddr *smask, u_int flags, struct llentry *lle) { struct in_addr addr, mask, lle_addr; addr = ((const struct sockaddr_in *)saddr)->sin_addr; mask = ((const struct sockaddr_in *)smask)->sin_addr; lle_addr.s_addr = ntohl(lle->r_l3addr.addr4.s_addr); if (IN_ARE_MASKED_ADDR_EQUAL(lle_addr, addr, mask) == 0) return (0); if (lle->la_flags & LLE_IFADDR) { /* * Delete LLE_IFADDR records IFF address & flag matches. * Note that addr is the interface address within prefix * being matched. * Note also we should handle 'ifdown' cases without removing * ifaddr macs. */ if (addr.s_addr == lle_addr.s_addr && (flags & LLE_STATIC) != 0) return (1); return (0); } /* flags & LLE_STATIC means deleting both dynamic and static entries */ if ((flags & LLE_STATIC) || !(lle->la_flags & LLE_STATIC)) return (1); return (0); } static void in_lltable_free_entry(struct lltable *llt, struct llentry *lle) { struct ifnet *ifp; size_t pkts_dropped; LLE_WLOCK_ASSERT(lle); KASSERT(llt != NULL, ("lltable is NULL")); /* Unlink entry from table if not already */ if ((lle->la_flags & LLE_LINKED) != 0) { ifp = llt->llt_ifp; IF_AFDATA_WLOCK_ASSERT(ifp); lltable_unlink_entry(llt, lle); } /* cancel timer */ if (callout_stop(&lle->lle_timer) > 0) LLE_REMREF(lle); /* Drop hold queue */ pkts_dropped = llentry_free(lle); ARPSTAT_ADD(dropped, pkts_dropped); } static int in_lltable_rtcheck(struct ifnet *ifp, u_int flags, const struct sockaddr *l3addr) { struct rt_addrinfo info; struct sockaddr_in rt_key, rt_mask; struct sockaddr rt_gateway; int rt_flags; KASSERT(l3addr->sa_family == AF_INET, ("sin_family %d", l3addr->sa_family)); bzero(&rt_key, sizeof(rt_key)); rt_key.sin_len = sizeof(rt_key); bzero(&rt_mask, sizeof(rt_mask)); rt_mask.sin_len = sizeof(rt_mask); bzero(&rt_gateway, sizeof(rt_gateway)); rt_gateway.sa_len = sizeof(rt_gateway); bzero(&info, sizeof(info)); info.rti_info[RTAX_DST] = (struct sockaddr *)&rt_key; info.rti_info[RTAX_NETMASK] = (struct sockaddr *)&rt_mask; info.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&rt_gateway; if (rib_lookup_info(ifp->if_fib, l3addr, NHR_REF, 0, &info) != 0) return (EINVAL); rt_flags = info.rti_flags; /* * If the gateway for an existing host route matches the target L3 * address, which is a special route inserted by some implementation * such as MANET, and the interface is of the correct type, then * allow for ARP to proceed. 
*/ if (rt_flags & RTF_GATEWAY) { if (!(rt_flags & RTF_HOST) || !info.rti_ifp || info.rti_ifp->if_type != IFT_ETHER || (info.rti_ifp->if_flags & (IFF_NOARP | IFF_STATICARP)) != 0 || memcmp(rt_gateway.sa_data, l3addr->sa_data, sizeof(in_addr_t)) != 0) { rib_free_info(&info); return (EINVAL); } } rib_free_info(&info); /* * Make sure that at least the destination address is covered * by the route. This is for handling the case where 2 or more * interfaces have the same prefix. An incoming packet arrives * on one interface and the corresponding outgoing packet leaves * another interface. */ if (!(rt_flags & RTF_HOST) && info.rti_ifp != ifp) { const char *sa, *mask, *addr, *lim; const struct sockaddr_in *l3sin; mask = (const char *)&rt_mask; /* * Just being extra cautious to avoid some custom * code getting into trouble. */ if ((info.rti_addrs & RTA_NETMASK) == 0) return (EINVAL); sa = (const char *)&rt_key; addr = (const char *)l3addr; l3sin = (const struct sockaddr_in *)l3addr; lim = addr + l3sin->sin_len; for ( ; addr < lim; sa++, mask++, addr++) { if ((*sa ^ *addr) & *mask) { #ifdef DIAGNOSTIC char addrbuf[INET_ADDRSTRLEN]; log(LOG_INFO, "IPv4 address: \"%s\" " "is not on the network\n", inet_ntoa_r(l3sin->sin_addr, addrbuf)); #endif return (EINVAL); } } } return (0); } static inline uint32_t in_lltable_hash_dst(const struct in_addr dst, uint32_t hsize) { return (IN_LLTBL_HASH(dst.s_addr, hsize)); } static uint32_t in_lltable_hash(const struct llentry *lle, uint32_t hsize) { return (in_lltable_hash_dst(lle->r_l3addr.addr4, hsize)); } static void in_lltable_fill_sa_entry(const struct llentry *lle, struct sockaddr *sa) { struct sockaddr_in *sin; sin = (struct sockaddr_in *)sa; bzero(sin, sizeof(*sin)); sin->sin_family = AF_INET; sin->sin_len = sizeof(*sin); sin->sin_addr = lle->r_l3addr.addr4; } static inline struct llentry * in_lltable_find_dst(struct lltable *llt, struct in_addr dst) { struct llentry *lle; struct llentries *lleh; u_int hashidx; hashidx = in_lltable_hash_dst(dst, llt->llt_hsize); lleh = &llt->lle_head[hashidx]; LIST_FOREACH(lle, lleh, lle_next) { if (lle->la_flags & LLE_DELETED) continue; if (lle->r_l3addr.addr4.s_addr == dst.s_addr) break; } return (lle); } static void in_lltable_delete_entry(struct lltable *llt, struct llentry *lle) { lle->la_flags |= LLE_DELETED; EVENTHANDLER_INVOKE(lle_event, lle, LLENTRY_DELETED); #ifdef DIAGNOSTIC log(LOG_INFO, "ifaddr cache = %p is deleted\n", lle); #endif llentry_free(lle); } static struct llentry * in_lltable_alloc(struct lltable *llt, u_int flags, const struct sockaddr *l3addr) { const struct sockaddr_in *sin = (const struct sockaddr_in *)l3addr; struct ifnet *ifp = llt->llt_ifp; struct llentry *lle; char linkhdr[LLE_MAX_LINKHDR]; size_t linkhdrsize; int lladdr_off; KASSERT(l3addr->sa_family == AF_INET, ("sin_family %d", l3addr->sa_family)); /* * A route that covers the given address must have * been installed 1st because we are doing a resolution, * verify this. 
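
The final check in in_lltable_rtcheck() walks the destination sockaddr byte by byte and declares the address off-net as soon as a masked byte differs from the route's key. The sketch below shows the same idea over plain byte arrays; it is illustrative only and the function name is invented.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/*
 * Byte-wise "is addr covered by key/mask" test, the same idea as the
 * loop at the end of in_lltable_rtcheck().  Any byte where the masked
 * bits of addr and key differ means addr is not on that network.
 */
static bool
covered_by(const unsigned char *key, const unsigned char *mask,
    const unsigned char *addr, size_t len)
{
	for (size_t i = 0; i < len; i++)
		if ((key[i] ^ addr[i]) & mask[i])
			return (false);
	return (true);
}

int
main(void)
{
	unsigned char key[4]  = { 192, 0, 2, 0 };	/* 192.0.2.0 */
	unsigned char mask[4] = { 255, 255, 255, 0 };	/* /24 */
	unsigned char in[4]   = { 192, 0, 2, 7 };
	unsigned char out[4]  = { 198, 51, 100, 7 };

	printf("%d %d\n", covered_by(key, mask, in, sizeof(in)),
	    covered_by(key, mask, out, sizeof(out)));
	return (0);
}
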
*/ if (!(flags & LLE_IFADDR) && in_lltable_rtcheck(ifp, flags, l3addr) != 0) return (NULL); lle = in_lltable_new(sin->sin_addr, flags); if (lle == NULL) { log(LOG_INFO, "lla_lookup: new lle malloc failed\n"); return (NULL); } lle->la_flags = flags; if (flags & LLE_STATIC) lle->r_flags |= RLLE_VALID; if ((flags & LLE_IFADDR) == LLE_IFADDR) { linkhdrsize = LLE_MAX_LINKHDR; if (lltable_calc_llheader(ifp, AF_INET, IF_LLADDR(ifp), linkhdr, &linkhdrsize, &lladdr_off) != 0) { in_lltable_destroy_lle_unlocked(lle); return (NULL); } lltable_set_entry_addr(ifp, lle, linkhdr, linkhdrsize, lladdr_off); lle->la_flags |= LLE_STATIC; lle->r_flags |= (RLLE_VALID | RLLE_IFADDR); } return (lle); } /* * Return NULL if not found or marked for deletion. * If found return lle read locked. */ static struct llentry * in_lltable_lookup(struct lltable *llt, u_int flags, const struct sockaddr *l3addr) { const struct sockaddr_in *sin = (const struct sockaddr_in *)l3addr; struct llentry *lle; IF_AFDATA_LOCK_ASSERT(llt->llt_ifp); KASSERT(l3addr->sa_family == AF_INET, ("sin_family %d", l3addr->sa_family)); lle = in_lltable_find_dst(llt, sin->sin_addr); if (lle == NULL) return (NULL); KASSERT((flags & (LLE_UNLOCKED|LLE_EXCLUSIVE)) != (LLE_UNLOCKED|LLE_EXCLUSIVE),("wrong lle request flags: 0x%X", flags)); if (flags & LLE_UNLOCKED) return (lle); if (flags & LLE_EXCLUSIVE) LLE_WLOCK(lle); else LLE_RLOCK(lle); return (lle); } static int in_lltable_dump_entry(struct lltable *llt, struct llentry *lle, struct sysctl_req *wr) { struct ifnet *ifp = llt->llt_ifp; /* XXX stack use */ struct { struct rt_msghdr rtm; struct sockaddr_in sin; struct sockaddr_dl sdl; } arpc; struct sockaddr_dl *sdl; int error; bzero(&arpc, sizeof(arpc)); /* skip deleted entries */ if ((lle->la_flags & LLE_DELETED) == LLE_DELETED) return (0); /* Skip if jailed and not a valid IP of the prison. */ lltable_fill_sa_entry(lle,(struct sockaddr *)&arpc.sin); if (prison_if(wr->td->td_ucred, (struct sockaddr *)&arpc.sin) != 0) return (0); /* * produce a msg made of: * struct rt_msghdr; * struct sockaddr_in; (IPv4) * struct sockaddr_dl; */ arpc.rtm.rtm_msglen = sizeof(arpc); arpc.rtm.rtm_version = RTM_VERSION; arpc.rtm.rtm_type = RTM_GET; arpc.rtm.rtm_flags = RTF_UP; arpc.rtm.rtm_addrs = RTA_DST | RTA_GATEWAY; /* publish */ if (lle->la_flags & LLE_PUB) arpc.rtm.rtm_flags |= RTF_ANNOUNCE; sdl = &arpc.sdl; sdl->sdl_family = AF_LINK; sdl->sdl_len = sizeof(*sdl); sdl->sdl_index = ifp->if_index; sdl->sdl_type = ifp->if_type; if ((lle->la_flags & LLE_VALID) == LLE_VALID) { sdl->sdl_alen = ifp->if_addrlen; bcopy(lle->ll_addr, LLADDR(sdl), ifp->if_addrlen); } else { sdl->sdl_alen = 0; bzero(LLADDR(sdl), ifp->if_addrlen); } arpc.rtm.rtm_rmx.rmx_expire = lle->la_flags & LLE_STATIC ? 
0 : lle->la_expire; arpc.rtm.rtm_flags |= (RTF_HOST | RTF_LLDATA); if (lle->la_flags & LLE_STATIC) arpc.rtm.rtm_flags |= RTF_STATIC; if (lle->la_flags & LLE_IFADDR) arpc.rtm.rtm_flags |= RTF_PINNED; arpc.rtm.rtm_index = ifp->if_index; error = SYSCTL_OUT(wr, &arpc, sizeof(arpc)); return (error); } static struct lltable * in_lltattach(struct ifnet *ifp) { struct lltable *llt; llt = lltable_allocate_htbl(IN_LLTBL_DEFAULT_HSIZE); llt->llt_af = AF_INET; llt->llt_ifp = ifp; llt->llt_lookup = in_lltable_lookup; llt->llt_alloc_entry = in_lltable_alloc; llt->llt_delete_entry = in_lltable_delete_entry; llt->llt_dump_entry = in_lltable_dump_entry; llt->llt_hash = in_lltable_hash; llt->llt_fill_sa_entry = in_lltable_fill_sa_entry; llt->llt_free_entry = in_lltable_free_entry; llt->llt_match_prefix = in_lltable_match_prefix; llt->llt_mark_used = in_lltable_mark_used; lltable_link(llt); return (llt); } void * in_domifattach(struct ifnet *ifp) { struct in_ifinfo *ii; ii = malloc(sizeof(struct in_ifinfo), M_IFADDR, M_WAITOK|M_ZERO); ii->ii_llt = in_lltattach(ifp); ii->ii_igmp = igmp_domifattach(ifp); return (ii); } void in_domifdetach(struct ifnet *ifp, void *aux) { struct in_ifinfo *ii = (struct in_ifinfo *)aux; igmp_domifdetach(ifp); lltable_free(ii->ii_llt); free(ii, M_IFADDR); } Index: head/sys/netinet/in_mcast.c =================================================================== --- head/sys/netinet/in_mcast.c (revision 333308) +++ head/sys/netinet/in_mcast.c (revision 333309) @@ -1,3094 +1,3128 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2007-2009 Bruce Simpson. * Copyright (c) 2005 Robert N. M. Watson. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * IPv4 multicast socket, group, and socket option processing module. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef KTR_IGMPV3 #define KTR_IGMPV3 KTR_INET #endif #ifndef __SOCKUNION_DECLARED union sockunion { struct sockaddr_storage ss; struct sockaddr sa; struct sockaddr_dl sdl; struct sockaddr_in sin; }; typedef union sockunion sockunion_t; #define __SOCKUNION_DECLARED #endif /* __SOCKUNION_DECLARED */ static MALLOC_DEFINE(M_INMFILTER, "in_mfilter", "IPv4 multicast PCB-layer source filter"); static MALLOC_DEFINE(M_IPMADDR, "in_multi", "IPv4 multicast group"); static MALLOC_DEFINE(M_IPMOPTS, "ip_moptions", "IPv4 multicast options"); static MALLOC_DEFINE(M_IPMSOURCE, "ip_msource", "IPv4 multicast IGMP-layer source filter"); /* * Locking: * - Lock order is: Giant, INP_WLOCK, IN_MULTI_LIST_LOCK, IGMP_LOCK, IF_ADDR_LOCK. * - The IF_ADDR_LOCK is implicitly taken by inm_lookup() earlier, however * it can be taken by code in net/if.c also. * - ip_moptions and in_mfilter are covered by the INP_WLOCK. * * struct in_multi is covered by IN_MULTI_LIST_LOCK. There isn't strictly * any need for in_multi itself to be virtualized -- it is bound to an ifp * anyway no matter what happens. */ struct mtx in_multi_list_mtx; MTX_SYSINIT(in_multi_mtx, &in_multi_list_mtx, "in_multi_list_mtx", MTX_DEF); struct mtx in_multi_free_mtx; MTX_SYSINIT(in_multi_free_mtx, &in_multi_free_mtx, "in_multi_free_mtx", MTX_DEF); struct sx in_multi_sx; SX_SYSINIT(in_multi_sx, &in_multi_sx, "in_multi_sx"); +int ifma_restart; + /* * Functions with non-static linkage defined in this file should be * declared in in_var.h: * imo_multi_filter() * in_addmulti() * in_delmulti() * in_joingroup() * in_joingroup_locked() * in_leavegroup() * in_leavegroup_locked() * and ip_var.h: * inp_freemoptions() * inp_getmoptions() * inp_setmoptions() * * XXX: Both carp and pf need to use the legacy (*,G) KPIs in_addmulti() * and in_delmulti(). 
*/ static void imf_commit(struct in_mfilter *); static int imf_get_source(struct in_mfilter *imf, const struct sockaddr_in *psin, struct in_msource **); static struct in_msource * imf_graft(struct in_mfilter *, const uint8_t, const struct sockaddr_in *); static void imf_leave(struct in_mfilter *); static int imf_prune(struct in_mfilter *, const struct sockaddr_in *); static void imf_purge(struct in_mfilter *); static void imf_rollback(struct in_mfilter *); static void imf_reap(struct in_mfilter *); static int imo_grow(struct ip_moptions *); static size_t imo_match_group(const struct ip_moptions *, const struct ifnet *, const struct sockaddr *); static struct in_msource * imo_match_source(const struct ip_moptions *, const size_t, const struct sockaddr *); static void ims_merge(struct ip_msource *ims, const struct in_msource *lims, const int rollback); static int in_getmulti(struct ifnet *, const struct in_addr *, struct in_multi **); static int inm_get_source(struct in_multi *inm, const in_addr_t haddr, const int noalloc, struct ip_msource **pims); #ifdef KTR static int inm_is_ifp_detached(const struct in_multi *); #endif static int inm_merge(struct in_multi *, /*const*/ struct in_mfilter *); static void inm_purge(struct in_multi *); static void inm_reap(struct in_multi *); static void inm_release(struct in_multi *); static struct ip_moptions * inp_findmoptions(struct inpcb *); static void inp_freemoptions_internal(struct ip_moptions *); static void inp_gcmoptions(void *, int); static int inp_get_source_filters(struct inpcb *, struct sockopt *); static int inp_join_group(struct inpcb *, struct sockopt *); static int inp_leave_group(struct inpcb *, struct sockopt *); static struct ifnet * inp_lookup_mcast_ifp(const struct inpcb *, const struct sockaddr_in *, const struct in_addr); static int inp_block_unblock_source(struct inpcb *, struct sockopt *); static int inp_set_multicast_if(struct inpcb *, struct sockopt *); static int inp_set_source_filters(struct inpcb *, struct sockopt *); static int sysctl_ip_mcast_filters(SYSCTL_HANDLER_ARGS); static SYSCTL_NODE(_net_inet_ip, OID_AUTO, mcast, CTLFLAG_RW, 0, "IPv4 multicast"); static u_long in_mcast_maxgrpsrc = IP_MAX_GROUP_SRC_FILTER; SYSCTL_ULONG(_net_inet_ip_mcast, OID_AUTO, maxgrpsrc, CTLFLAG_RWTUN, &in_mcast_maxgrpsrc, 0, "Max source filters per group"); static u_long in_mcast_maxsocksrc = IP_MAX_SOCK_SRC_FILTER; SYSCTL_ULONG(_net_inet_ip_mcast, OID_AUTO, maxsocksrc, CTLFLAG_RWTUN, &in_mcast_maxsocksrc, 0, "Max source filters per socket"); int in_mcast_loop = IP_DEFAULT_MULTICAST_LOOP; SYSCTL_INT(_net_inet_ip_mcast, OID_AUTO, loop, CTLFLAG_RWTUN, &in_mcast_loop, 0, "Loopback multicast datagrams by default"); static SYSCTL_NODE(_net_inet_ip_mcast, OID_AUTO, filters, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_ip_mcast_filters, "Per-interface stack-wide source filters"); static STAILQ_HEAD(, ip_moptions) imo_gc_list = STAILQ_HEAD_INITIALIZER(imo_gc_list); static struct task imo_gc_task = TASK_INITIALIZER(0, inp_gcmoptions, NULL); #ifdef KTR /* * Inline function which wraps assertions for a valid ifp. * The ifnet layer will set the ifma's ifp pointer to NULL if the ifp * is detached. */ static int __inline inm_is_ifp_detached(const struct in_multi *inm) { struct ifnet *ifp; KASSERT(inm->inm_ifma != NULL, ("%s: no ifma", __func__)); ifp = inm->inm_ifma->ifma_ifp; if (ifp != NULL) { /* * Sanity check that netinet's notion of ifp is the * same as net's. 
*/ KASSERT(inm->inm_ifp == ifp, ("%s: bad ifp", __func__)); } return (ifp == NULL); } #endif static struct grouptask free_gtask; static struct in_multi_head inm_free_list; static void inm_release_task(void *arg __unused); static void inm_init(void) { SLIST_INIT(&inm_free_list); taskqgroup_config_gtask_init(NULL, &free_gtask, inm_release_task, "inm release task"); } SYSINIT(inm_init, SI_SUB_SMP + 1, SI_ORDER_FIRST, inm_init, NULL); void inm_release_list_deferred(struct in_multi_head *inmh) { if (SLIST_EMPTY(inmh)) return; mtx_lock(&in_multi_free_mtx); SLIST_CONCAT(&inm_free_list, inmh, in_multi, inm_nrele); mtx_unlock(&in_multi_free_mtx); GROUPTASK_ENQUEUE(&free_gtask); } void +inm_disconnect(struct in_multi *inm) +{ + struct ifnet *ifp; + struct ifmultiaddr *ifma, *ll_ifma; + + ifp = inm->inm_ifp; + IF_ADDR_WLOCK_ASSERT(ifp); + ifma = inm->inm_ifma; + + if_ref(ifp); + TAILQ_REMOVE(&ifp->if_multiaddrs, ifma, ifma_link); + MCDPRINTF("removed ifma: %p from %s\n", ifma, ifp->if_xname); + if ((ll_ifma = ifma->ifma_llifma) != NULL) { + MPASS(ifma != ll_ifma); + ifma->ifma_llifma = NULL; + MPASS(ll_ifma->ifma_llifma == NULL); + MPASS(ll_ifma->ifma_ifp == ifp); + if (--ll_ifma->ifma_refcount == 0) { + TAILQ_REMOVE(&ifp->if_multiaddrs, ll_ifma, ifma_link); + MCDPRINTF("removed ll_ifma: %p from %s\n", ll_ifma, ifp->if_xname); + if_freemulti(ll_ifma); + ifma_restart = true; + } + } +} + +void inm_release_deferred(struct in_multi *inm) { struct in_multi_head tmp; IN_MULTI_LIST_LOCK_ASSERT(); MPASS(inm->inm_refcount > 0); if (--inm->inm_refcount == 0) { SLIST_INIT(&tmp); + inm_disconnect(inm); inm->inm_ifma->ifma_protospec = NULL; SLIST_INSERT_HEAD(&tmp, inm, inm_nrele); inm_release_list_deferred(&tmp); } } static void inm_release_task(void *arg __unused) { struct in_multi_head inm_free_tmp; struct in_multi *inm, *tinm; SLIST_INIT(&inm_free_tmp); mtx_lock(&in_multi_free_mtx); SLIST_CONCAT(&inm_free_tmp, &inm_free_list, in_multi, inm_nrele); mtx_unlock(&in_multi_free_mtx); IN_MULTI_LOCK(); SLIST_FOREACH_SAFE(inm, &inm_free_tmp, inm_nrele, tinm) { SLIST_REMOVE_HEAD(&inm_free_tmp, inm_nrele); MPASS(inm); inm_release(inm); } IN_MULTI_UNLOCK(); } /* * Initialize an in_mfilter structure to a known state at t0, t1 * with an empty source filter list. */ static __inline void imf_init(struct in_mfilter *imf, const int st0, const int st1) { memset(imf, 0, sizeof(struct in_mfilter)); RB_INIT(&imf->imf_sources); imf->imf_st[0] = st0; imf->imf_st[1] = st1; } /* * Function for looking up an in_multi record for an IPv4 multicast address * on a given interface. ifp must be valid. If no record found, return NULL. * The IN_MULTI_LIST_LOCK and IF_ADDR_LOCK on ifp must be held. */ struct in_multi * inm_lookup_locked(struct ifnet *ifp, const struct in_addr ina) { struct ifmultiaddr *ifma; struct in_multi *inm; IN_MULTI_LIST_LOCK_ASSERT(); IF_ADDR_LOCK_ASSERT(ifp); inm = NULL; TAILQ_FOREACH(ifma, &((ifp)->if_multiaddrs), ifma_link) { if (ifma->ifma_addr->sa_family == AF_INET) { inm = (struct in_multi *)ifma->ifma_protospec; if (inm->inm_addr.s_addr == ina.s_addr) break; inm = NULL; } } return (inm); } /* * Wrapper for inm_lookup_locked(). * The IF_ADDR_LOCK will be taken on ifp and released on return. */ struct in_multi * inm_lookup(struct ifnet *ifp, const struct in_addr ina) { struct in_multi *inm; IN_MULTI_LIST_LOCK_ASSERT(); IF_ADDR_RLOCK(ifp); inm = inm_lookup_locked(ifp, ina); IF_ADDR_RUNLOCK(ifp); return (inm); } /* * Resize the ip_moptions vector to the next power-of-two minus 1. 
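
The new inm_disconnect()/ifma_restart machinery above exists because releasing an in_multi can unlink one or two ifmultiaddr entries from the very list a caller such as in_purgemaddrs() is walking; the walker therefore uses TAILQ_FOREACH_SAFE and starts over from the head whenever the flag reports that the list changed underneath it. The following is a self-contained userspace sketch of that traversal pattern using sys/queue.h; the structure, flag, and helper names are invented for illustration and the restart handling is simplified relative to the kernel code.

#include <sys/queue.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct node {
	int			val;
	TAILQ_ENTRY(node)	link;
};
static TAILQ_HEAD(, node) head = TAILQ_HEAD_INITIALIZER(head);
static bool list_restart;	/* analogue of ifma_restart */

/*
 * Visiting an even element also unlinks the element after it,
 * mimicking a callee that removes additional list members (much as
 * inm_disconnect() may remove both an ifma and its ll_ifma).
 */
static void
visit(struct node *n)
{
	struct node *peer;

	if (n->val % 2 == 0 && (peer = TAILQ_NEXT(n, link)) != NULL) {
		TAILQ_REMOVE(&head, peer, link);
		free(peer);
		list_restart = true;	/* tell the walker to start over */
	}
	printf("visited %d\n", n->val);
}

int
main(void)
{
	struct node *n, *next;

	for (int i = 0; i < 6; i++) {
		n = malloc(sizeof(*n));
		n->val = i;
		TAILQ_INSERT_TAIL(&head, n, link);
	}
restart:
	TAILQ_FOREACH_SAFE(n, &head, link, next) {
		visit(n);
		if (list_restart) {
			list_restart = false;
			goto restart;
		}
	}
	return (0);
}
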
* May be called with locks held; do not sleep. */ static int imo_grow(struct ip_moptions *imo) { struct in_multi **nmships; struct in_multi **omships; struct in_mfilter *nmfilters; struct in_mfilter *omfilters; size_t idx; size_t newmax; size_t oldmax; nmships = NULL; nmfilters = NULL; omships = imo->imo_membership; omfilters = imo->imo_mfilters; oldmax = imo->imo_max_memberships; newmax = ((oldmax + 1) * 2) - 1; if (newmax <= IP_MAX_MEMBERSHIPS) { nmships = (struct in_multi **)realloc(omships, sizeof(struct in_multi *) * newmax, M_IPMOPTS, M_NOWAIT); nmfilters = (struct in_mfilter *)realloc(omfilters, sizeof(struct in_mfilter) * newmax, M_INMFILTER, M_NOWAIT); if (nmships != NULL && nmfilters != NULL) { /* Initialize newly allocated source filter heads. */ for (idx = oldmax; idx < newmax; idx++) { imf_init(&nmfilters[idx], MCAST_UNDEFINED, MCAST_EXCLUDE); } imo->imo_max_memberships = newmax; imo->imo_membership = nmships; imo->imo_mfilters = nmfilters; } } if (nmships == NULL || nmfilters == NULL) { if (nmships != NULL) free(nmships, M_IPMOPTS); if (nmfilters != NULL) free(nmfilters, M_INMFILTER); return (ETOOMANYREFS); } return (0); } /* * Find an IPv4 multicast group entry for this ip_moptions instance * which matches the specified group, and optionally an interface. * Return its index into the array, or -1 if not found. */ static size_t imo_match_group(const struct ip_moptions *imo, const struct ifnet *ifp, const struct sockaddr *group) { const struct sockaddr_in *gsin; struct in_multi **pinm; int idx; int nmships; gsin = (const struct sockaddr_in *)group; /* The imo_membership array may be lazy allocated. */ if (imo->imo_membership == NULL || imo->imo_num_memberships == 0) return (-1); nmships = imo->imo_num_memberships; pinm = &imo->imo_membership[0]; for (idx = 0; idx < nmships; idx++, pinm++) { if (*pinm == NULL) continue; if ((ifp == NULL || ((*pinm)->inm_ifp == ifp)) && in_hosteq((*pinm)->inm_addr, gsin->sin_addr)) { break; } } if (idx >= nmships) idx = -1; return (idx); } /* * Find an IPv4 multicast source entry for this imo which matches * the given group index for this socket, and source address. * * NOTE: This does not check if the entry is in-mode, merely if * it exists, which may not be the desired behaviour. */ static struct in_msource * imo_match_source(const struct ip_moptions *imo, const size_t gidx, const struct sockaddr *src) { struct ip_msource find; struct in_mfilter *imf; struct ip_msource *ims; const sockunion_t *psa; KASSERT(src->sa_family == AF_INET, ("%s: !AF_INET", __func__)); KASSERT(gidx != -1 && gidx < imo->imo_num_memberships, ("%s: invalid index %d\n", __func__, (int)gidx)); /* The imo_mfilters array may be lazy allocated. */ if (imo->imo_mfilters == NULL) return (NULL); imf = &imo->imo_mfilters[gidx]; /* Source trees are keyed in host byte order. */ psa = (const sockunion_t *)src; find.ims_haddr = ntohl(psa->sin.sin_addr.s_addr); ims = RB_FIND(ip_msource_tree, &imf->imf_sources, &find); return ((struct in_msource *)ims); } /* * Perform filtering for multicast datagrams on a socket by group and source. * * Returns 0 if a datagram should be allowed through, or various error codes * if the socket was not a member of the group, or the source was muted, etc. 
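
imo_grow() above keeps the membership and filter arrays the same length and grows both with a "next power of two minus one" rule (1, 3, 7, 15, ...), refusing to grow past the per-socket membership limit and backing out if either allocation fails. Here is a compact userspace sketch of that growth step; the names are illustrative, the IP_MAX_MEMBERSHIPS bound is replaced by a local constant, and the failure handling is simplified.

#include <stdio.h>
#include <stdlib.h>

#define MAX_MEMBERSHIPS	4095	/* stand-in for IP_MAX_MEMBERSHIPS */

struct vec {
	size_t	max;
	int	*memb;		/* stand-ins for the two parallel arrays */
	int	*filt;
};

/* Grow both arrays in lockstep: newmax = (oldmax + 1) * 2 - 1. */
static int
vec_grow(struct vec *v)
{
	size_t newmax = (v->max + 1) * 2 - 1;
	int *nm, *nf;

	if (newmax > MAX_MEMBERSHIPS)
		return (-1);
	nm = realloc(v->memb, newmax * sizeof(*nm));
	nf = realloc(v->filt, newmax * sizeof(*nf));
	if (nm == NULL || nf == NULL) {
		/* Keep whichever block realloc() already moved. */
		if (nm != NULL)
			v->memb = nm;
		if (nf != NULL)
			v->filt = nf;
		return (-1);
	}
	for (size_t i = v->max; i < newmax; i++)
		nf[i] = 0;	/* initialize the newly added filter slots */
	v->memb = nm;
	v->filt = nf;
	v->max = newmax;
	return (0);
}

int
main(void)
{
	struct vec v = { 0, NULL, NULL };

	while (vec_grow(&v) == 0)
		printf("max -> %zu\n", v.max);
	free(v.memb);
	free(v.filt);
	return (0);
}
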
*/ int imo_multi_filter(const struct ip_moptions *imo, const struct ifnet *ifp, const struct sockaddr *group, const struct sockaddr *src) { size_t gidx; struct in_msource *ims; int mode; KASSERT(ifp != NULL, ("%s: null ifp", __func__)); gidx = imo_match_group(imo, ifp, group); if (gidx == -1) return (MCAST_NOTGMEMBER); /* * Check if the source was included in an (S,G) join. * Allow reception on exclusive memberships by default, * reject reception on inclusive memberships by default. * Exclude source only if an in-mode exclude filter exists. * Include source only if an in-mode include filter exists. * NOTE: We are comparing group state here at IGMP t1 (now) * with socket-layer t0 (since last downcall). */ mode = imo->imo_mfilters[gidx].imf_st[1]; ims = imo_match_source(imo, gidx, src); if ((ims == NULL && mode == MCAST_INCLUDE) || (ims != NULL && ims->imsl_st[0] != mode)) return (MCAST_NOTSMEMBER); return (MCAST_PASS); } /* * Find and return a reference to an in_multi record for (ifp, group), * and bump its reference count. * If one does not exist, try to allocate it, and update link-layer multicast * filters on ifp to listen for group. * Assumes the IN_MULTI lock is held across the call. * Return 0 if successful, otherwise return an appropriate error code. */ static int in_getmulti(struct ifnet *ifp, const struct in_addr *group, struct in_multi **pinm) { struct sockaddr_in gsin; struct ifmultiaddr *ifma; struct in_ifinfo *ii; struct in_multi *inm; int error; IN_MULTI_LOCK_ASSERT(); ii = (struct in_ifinfo *)ifp->if_afdata[AF_INET]; IN_MULTI_LIST_LOCK(); inm = inm_lookup(ifp, *group); if (inm != NULL) { /* * If we already joined this group, just bump the * refcount and return it. */ KASSERT(inm->inm_refcount >= 1, ("%s: bad refcount %d", __func__, inm->inm_refcount)); inm_acquire_locked(inm); *pinm = inm; } IN_MULTI_LIST_UNLOCK(); if (inm != NULL) return (0); memset(&gsin, 0, sizeof(gsin)); gsin.sin_family = AF_INET; gsin.sin_len = sizeof(struct sockaddr_in); gsin.sin_addr = *group; /* * Check if a link-layer group is already associated * with this network-layer group on the given ifnet. */ error = if_addmulti(ifp, (struct sockaddr *)&gsin, &ifma); if (error != 0) return (error); /* XXX ifma_protospec must be covered by IF_ADDR_LOCK */ IN_MULTI_LIST_LOCK(); IF_ADDR_WLOCK(ifp); /* * If something other than netinet is occupying the link-layer * group, print a meaningful error message and back out of * the allocation. * Otherwise, bump the refcount on the existing network-layer * group association and return it. */ if (ifma->ifma_protospec != NULL) { inm = (struct in_multi *)ifma->ifma_protospec; #ifdef INVARIANTS KASSERT(ifma->ifma_addr != NULL, ("%s: no ifma_addr", __func__)); KASSERT(ifma->ifma_addr->sa_family == AF_INET, ("%s: ifma not AF_INET", __func__)); KASSERT(inm != NULL, ("%s: no ifma_protospec", __func__)); if (inm->inm_ifma != ifma || inm->inm_ifp != ifp || !in_hosteq(inm->inm_addr, *group)) { char addrbuf[INET_ADDRSTRLEN]; panic("%s: ifma %p is inconsistent with %p (%s)", __func__, ifma, inm, inet_ntoa_r(*group, addrbuf)); } #endif inm_acquire_locked(inm); *pinm = inm; goto out_locked; } IF_ADDR_WLOCK_ASSERT(ifp); /* * A new in_multi record is needed; allocate and initialize it. * We DO NOT perform an IGMP join as the in_ layer may need to * push an initial source list down to IGMP to support SSM. * * The initial source filter state is INCLUDE, {} as per the RFC. 
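
The accept/drop decision in imo_multi_filter() reduces to the membership's filter mode plus whether the packet's source has an entry whose recorded state matches that mode: exclusive memberships accept by default and drop listed (out-of-mode) sources, inclusive memberships drop by default and accept listed in-mode sources. A tiny truth-table sketch of that rule follows; the constants mirror the MCAST_INCLUDE/MCAST_EXCLUDE idea but are defined locally.

#include <stdbool.h>
#include <stdio.h>

enum fmode { FM_INCLUDE, FM_EXCLUDE };

/*
 * mode        : the socket's filter mode for the group
 * have_source : a filter entry exists for this source
 * src_mode    : that entry's recorded mode (only meaningful if have_source)
 * Returns true if the datagram should be delivered, mirroring the
 * MCAST_PASS / MCAST_NOTSMEMBER outcome of imo_multi_filter().
 */
static bool
source_passes(enum fmode mode, bool have_source, enum fmode src_mode)
{
	if (!have_source)
		return (mode == FM_EXCLUDE);	/* EX: accept by default */
	return (src_mode == mode);		/* in-mode entry required */
}

int
main(void)
{
	printf("EX, unlisted source:       %d\n",
	    source_passes(FM_EXCLUDE, false, FM_INCLUDE));
	printf("EX, listed out-of-mode:    %d\n",
	    source_passes(FM_EXCLUDE, true, FM_INCLUDE));
	printf("IN, listed in-mode source: %d\n",
	    source_passes(FM_INCLUDE, true, FM_INCLUDE));
	printf("IN, unlisted source:       %d\n",
	    source_passes(FM_INCLUDE, false, FM_INCLUDE));
	return (0);
}
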
*/ inm = malloc(sizeof(*inm), M_IPMADDR, M_NOWAIT | M_ZERO); if (inm == NULL) { IF_ADDR_WUNLOCK(ifp); IN_MULTI_LIST_UNLOCK(); if_delmulti_ifma(ifma); return (ENOMEM); } inm->inm_addr = *group; inm->inm_ifp = ifp; inm->inm_igi = ii->ii_igmp; inm->inm_ifma = ifma; inm->inm_refcount = 1; inm->inm_state = IGMP_NOT_MEMBER; mbufq_init(&inm->inm_scq, IGMP_MAX_STATE_CHANGES); inm->inm_st[0].iss_fmode = MCAST_UNDEFINED; inm->inm_st[1].iss_fmode = MCAST_UNDEFINED; RB_INIT(&inm->inm_srcs); ifma->ifma_protospec = inm; *pinm = inm; out_locked: IF_ADDR_WUNLOCK(ifp); IN_MULTI_LIST_UNLOCK(); return (0); } /* * Drop a reference to an in_multi record. * * If the refcount drops to 0, free the in_multi record and * delete the underlying link-layer membership. */ static void inm_release(struct in_multi *inm) { struct ifmultiaddr *ifma; struct ifnet *ifp; CTR2(KTR_IGMPV3, "%s: refcount is %d", __func__, inm->inm_refcount); MPASS(inm->inm_refcount == 0); CTR2(KTR_IGMPV3, "%s: freeing inm %p", __func__, inm); ifma = inm->inm_ifma; ifp = inm->inm_ifp; /* XXX this access is not covered by IF_ADDR_LOCK */ CTR2(KTR_IGMPV3, "%s: purging ifma %p", __func__, ifma); if (ifp) CURVNET_SET(ifp->if_vnet); inm_purge(inm); free(inm, M_IPMADDR); - if_delmulti_ifma(ifma); - if (ifp) + if_delmulti_ifma_flags(ifma, 1); + if (ifp) { CURVNET_RESTORE(); + if_rele(ifp); + } } /* * Clear recorded source entries for a group. * Used by the IGMP code. Caller must hold the IN_MULTI lock. * FIXME: Should reap. */ void inm_clear_recorded(struct in_multi *inm) { struct ip_msource *ims; IN_MULTI_LIST_LOCK_ASSERT(); RB_FOREACH(ims, ip_msource_tree, &inm->inm_srcs) { if (ims->ims_stp) { ims->ims_stp = 0; --inm->inm_st[1].iss_rec; } } KASSERT(inm->inm_st[1].iss_rec == 0, ("%s: iss_rec %d not 0", __func__, inm->inm_st[1].iss_rec)); } /* * Record a source as pending for a Source-Group IGMPv3 query. * This lives here as it modifies the shared tree. * * inm is the group descriptor. * naddr is the address of the source to record in network-byte order. * * If the net.inet.igmp.sgalloc sysctl is non-zero, we will * lazy-allocate a source node in response to an SG query. * Otherwise, no allocation is performed. This saves some memory * with the trade-off that the source will not be reported to the * router if joined in the window between the query response and * the group actually being joined on the local host. * * VIMAGE: XXX: Currently the igmp_sgalloc feature has been removed. * This turns off the allocation of a recorded source entry if * the group has not been joined. * * Return 0 if the source didn't exist or was already marked as recorded. * Return 1 if the source was marked as recorded by this function. * Return <0 if any error occurred (negated errno code). */ int inm_record_source(struct in_multi *inm, const in_addr_t naddr) { struct ip_msource find; struct ip_msource *ims, *nims; IN_MULTI_LIST_LOCK_ASSERT(); find.ims_haddr = ntohl(naddr); ims = RB_FIND(ip_msource_tree, &inm->inm_srcs, &find); if (ims && ims->ims_stp) return (0); if (ims == NULL) { if (inm->inm_nsrc == in_mcast_maxgrpsrc) return (-ENOSPC); nims = malloc(sizeof(struct ip_msource), M_IPMSOURCE, M_NOWAIT | M_ZERO); if (nims == NULL) return (-ENOMEM); nims->ims_haddr = find.ims_haddr; RB_INSERT(ip_msource_tree, &inm->inm_srcs, nims); ++inm->inm_nsrc; ims = nims; } /* * Mark the source as recorded and update the recorded * source count. 
*/ ++ims->ims_stp; ++inm->inm_st[1].iss_rec; return (1); } /* * Return a pointer to an in_msource owned by an in_mfilter, * given its source address. * Lazy-allocate if needed. If this is a new entry its filter state is * undefined at t0. * * imf is the filter set being modified. * haddr is the source address in *host* byte-order. * * SMPng: May be called with locks held; malloc must not block. */ static int imf_get_source(struct in_mfilter *imf, const struct sockaddr_in *psin, struct in_msource **plims) { struct ip_msource find; struct ip_msource *ims, *nims; struct in_msource *lims; int error; error = 0; ims = NULL; lims = NULL; /* key is host byte order */ find.ims_haddr = ntohl(psin->sin_addr.s_addr); ims = RB_FIND(ip_msource_tree, &imf->imf_sources, &find); lims = (struct in_msource *)ims; if (lims == NULL) { if (imf->imf_nsrc == in_mcast_maxsocksrc) return (ENOSPC); nims = malloc(sizeof(struct in_msource), M_INMFILTER, M_NOWAIT | M_ZERO); if (nims == NULL) return (ENOMEM); lims = (struct in_msource *)nims; lims->ims_haddr = find.ims_haddr; lims->imsl_st[0] = MCAST_UNDEFINED; RB_INSERT(ip_msource_tree, &imf->imf_sources, nims); ++imf->imf_nsrc; } *plims = lims; return (error); } /* * Graft a source entry into an existing socket-layer filter set, * maintaining any required invariants and checking allocations. * * The source is marked as being in the new filter mode at t1. * * Return the pointer to the new node, otherwise return NULL. */ static struct in_msource * imf_graft(struct in_mfilter *imf, const uint8_t st1, const struct sockaddr_in *psin) { struct ip_msource *nims; struct in_msource *lims; nims = malloc(sizeof(struct in_msource), M_INMFILTER, M_NOWAIT | M_ZERO); if (nims == NULL) return (NULL); lims = (struct in_msource *)nims; lims->ims_haddr = ntohl(psin->sin_addr.s_addr); lims->imsl_st[0] = MCAST_UNDEFINED; lims->imsl_st[1] = st1; RB_INSERT(ip_msource_tree, &imf->imf_sources, nims); ++imf->imf_nsrc; return (lims); } /* * Prune a source entry from an existing socket-layer filter set, * maintaining any required invariants and checking allocations. * * The source is marked as being left at t1, it is not freed. * * Return 0 if no error occurred, otherwise return an errno value. */ static int imf_prune(struct in_mfilter *imf, const struct sockaddr_in *psin) { struct ip_msource find; struct ip_msource *ims; struct in_msource *lims; /* key is host byte order */ find.ims_haddr = ntohl(psin->sin_addr.s_addr); ims = RB_FIND(ip_msource_tree, &imf->imf_sources, &find); if (ims == NULL) return (ENOENT); lims = (struct in_msource *)ims; lims->imsl_st[1] = MCAST_UNDEFINED; return (0); } /* * Revert socket-layer filter set deltas at t1 to t0 state. */ static void imf_rollback(struct in_mfilter *imf) { struct ip_msource *ims, *tims; struct in_msource *lims; RB_FOREACH_SAFE(ims, ip_msource_tree, &imf->imf_sources, tims) { lims = (struct in_msource *)ims; if (lims->imsl_st[0] == lims->imsl_st[1]) { /* no change at t1 */ continue; } else if (lims->imsl_st[0] != MCAST_UNDEFINED) { /* revert change to existing source at t1 */ lims->imsl_st[1] = lims->imsl_st[0]; } else { /* revert source added t1 */ CTR2(KTR_IGMPV3, "%s: free ims %p", __func__, ims); RB_REMOVE(ip_msource_tree, &imf->imf_sources, ims); free(ims, M_INMFILTER); imf->imf_nsrc--; } } imf->imf_st[1] = imf->imf_st[0]; } /* * Mark socket-layer filter set as INCLUDE {} at t1. 
*/ static void imf_leave(struct in_mfilter *imf) { struct ip_msource *ims; struct in_msource *lims; RB_FOREACH(ims, ip_msource_tree, &imf->imf_sources) { lims = (struct in_msource *)ims; lims->imsl_st[1] = MCAST_UNDEFINED; } imf->imf_st[1] = MCAST_INCLUDE; } /* * Mark socket-layer filter set deltas as committed. */ static void imf_commit(struct in_mfilter *imf) { struct ip_msource *ims; struct in_msource *lims; RB_FOREACH(ims, ip_msource_tree, &imf->imf_sources) { lims = (struct in_msource *)ims; lims->imsl_st[0] = lims->imsl_st[1]; } imf->imf_st[0] = imf->imf_st[1]; } /* * Reap unreferenced sources from socket-layer filter set. */ static void imf_reap(struct in_mfilter *imf) { struct ip_msource *ims, *tims; struct in_msource *lims; RB_FOREACH_SAFE(ims, ip_msource_tree, &imf->imf_sources, tims) { lims = (struct in_msource *)ims; if ((lims->imsl_st[0] == MCAST_UNDEFINED) && (lims->imsl_st[1] == MCAST_UNDEFINED)) { CTR2(KTR_IGMPV3, "%s: free lims %p", __func__, ims); RB_REMOVE(ip_msource_tree, &imf->imf_sources, ims); free(ims, M_INMFILTER); imf->imf_nsrc--; } } } /* * Purge socket-layer filter set. */ static void imf_purge(struct in_mfilter *imf) { struct ip_msource *ims, *tims; RB_FOREACH_SAFE(ims, ip_msource_tree, &imf->imf_sources, tims) { CTR2(KTR_IGMPV3, "%s: free ims %p", __func__, ims); RB_REMOVE(ip_msource_tree, &imf->imf_sources, ims); free(ims, M_INMFILTER); imf->imf_nsrc--; } imf->imf_st[0] = imf->imf_st[1] = MCAST_UNDEFINED; KASSERT(RB_EMPTY(&imf->imf_sources), ("%s: imf_sources not empty", __func__)); } /* * Look up a source filter entry for a multicast group. * * inm is the group descriptor to work with. * haddr is the host-byte-order IPv4 address to look up. * noalloc may be non-zero to suppress allocation of sources. * *pims will be set to the address of the retrieved or allocated source. * * SMPng: NOTE: may be called with locks held. * Return 0 if successful, otherwise return a non-zero error code. */ static int inm_get_source(struct in_multi *inm, const in_addr_t haddr, const int noalloc, struct ip_msource **pims) { struct ip_msource find; struct ip_msource *ims, *nims; find.ims_haddr = haddr; ims = RB_FIND(ip_msource_tree, &inm->inm_srcs, &find); if (ims == NULL && !noalloc) { if (inm->inm_nsrc == in_mcast_maxgrpsrc) return (ENOSPC); nims = malloc(sizeof(struct ip_msource), M_IPMSOURCE, M_NOWAIT | M_ZERO); if (nims == NULL) return (ENOMEM); nims->ims_haddr = haddr; RB_INSERT(ip_msource_tree, &inm->inm_srcs, nims); ++inm->inm_nsrc; ims = nims; #ifdef KTR CTR3(KTR_IGMPV3, "%s: allocated 0x%08x as %p", __func__, haddr, ims); #endif } *pims = ims; return (0); } /* * Merge socket-layer source into IGMP-layer source. * If rollback is non-zero, perform the inverse of the merge. */ static void ims_merge(struct ip_msource *ims, const struct in_msource *lims, const int rollback) { int n = rollback ? 
-1 : 1; if (lims->imsl_st[0] == MCAST_EXCLUDE) { CTR3(KTR_IGMPV3, "%s: t1 ex -= %d on 0x%08x", __func__, n, ims->ims_haddr); ims->ims_st[1].ex -= n; } else if (lims->imsl_st[0] == MCAST_INCLUDE) { CTR3(KTR_IGMPV3, "%s: t1 in -= %d on 0x%08x", __func__, n, ims->ims_haddr); ims->ims_st[1].in -= n; } if (lims->imsl_st[1] == MCAST_EXCLUDE) { CTR3(KTR_IGMPV3, "%s: t1 ex += %d on 0x%08x", __func__, n, ims->ims_haddr); ims->ims_st[1].ex += n; } else if (lims->imsl_st[1] == MCAST_INCLUDE) { CTR3(KTR_IGMPV3, "%s: t1 in += %d on 0x%08x", __func__, n, ims->ims_haddr); ims->ims_st[1].in += n; } } /* * Atomically update the global in_multi state, when a membership's * filter list is being updated in any way. * * imf is the per-inpcb-membership group filter pointer. * A fake imf may be passed for in-kernel consumers. * * XXX This is a candidate for a set-symmetric-difference style loop * which would eliminate the repeated lookup from root of ims nodes, * as they share the same key space. * * If any error occurred this function will back out of refcounts * and return a non-zero value. */ static int inm_merge(struct in_multi *inm, /*const*/ struct in_mfilter *imf) { struct ip_msource *ims, *nims; struct in_msource *lims; int schanged, error; int nsrc0, nsrc1; schanged = 0; error = 0; nsrc1 = nsrc0 = 0; IN_MULTI_LIST_LOCK_ASSERT(); /* * Update the source filters first, as this may fail. * Maintain count of in-mode filters at t0, t1. These are * used to work out if we transition into ASM mode or not. * Maintain a count of source filters whose state was * actually modified by this operation. */ RB_FOREACH(ims, ip_msource_tree, &imf->imf_sources) { lims = (struct in_msource *)ims; if (lims->imsl_st[0] == imf->imf_st[0]) nsrc0++; if (lims->imsl_st[1] == imf->imf_st[1]) nsrc1++; if (lims->imsl_st[0] == lims->imsl_st[1]) continue; error = inm_get_source(inm, lims->ims_haddr, 0, &nims); ++schanged; if (error) break; ims_merge(nims, lims, 0); } if (error) { struct ip_msource *bims; RB_FOREACH_REVERSE_FROM(ims, ip_msource_tree, nims) { lims = (struct in_msource *)ims; if (lims->imsl_st[0] == lims->imsl_st[1]) continue; (void)inm_get_source(inm, lims->ims_haddr, 1, &bims); if (bims == NULL) continue; ims_merge(bims, lims, 1); } goto out_reap; } CTR3(KTR_IGMPV3, "%s: imf filters in-mode: %d at t0, %d at t1", __func__, nsrc0, nsrc1); /* Handle transition between INCLUDE {n} and INCLUDE {} on socket. */ if (imf->imf_st[0] == imf->imf_st[1] && imf->imf_st[1] == MCAST_INCLUDE) { if (nsrc1 == 0) { CTR1(KTR_IGMPV3, "%s: --in on inm at t1", __func__); --inm->inm_st[1].iss_in; } } /* Handle filter mode transition on socket. */ if (imf->imf_st[0] != imf->imf_st[1]) { CTR3(KTR_IGMPV3, "%s: imf transition %d to %d", __func__, imf->imf_st[0], imf->imf_st[1]); if (imf->imf_st[0] == MCAST_EXCLUDE) { CTR1(KTR_IGMPV3, "%s: --ex on inm at t1", __func__); --inm->inm_st[1].iss_ex; } else if (imf->imf_st[0] == MCAST_INCLUDE) { CTR1(KTR_IGMPV3, "%s: --in on inm at t1", __func__); --inm->inm_st[1].iss_in; } if (imf->imf_st[1] == MCAST_EXCLUDE) { CTR1(KTR_IGMPV3, "%s: ex++ on inm at t1", __func__); inm->inm_st[1].iss_ex++; } else if (imf->imf_st[1] == MCAST_INCLUDE && nsrc1 > 0) { CTR1(KTR_IGMPV3, "%s: in++ on inm at t1", __func__); inm->inm_st[1].iss_in++; } } /* * Track inm filter state in terms of listener counts. * If there are any exclusive listeners, stack-wide * membership is exclusive. * Otherwise, if only inclusive listeners, stack-wide is inclusive. 
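
Stepping back to ims_merge() above: each IGMP-layer source carries a pair of reference counts (how many sockets include it, how many exclude it, at t1), and a socket's t0-to-t1 change is applied as a -1/+1 delta, with rollback simply inverting the sign. A minimal sketch of that bookkeeping; the struct and constants are local stand-ins, not the kernel types.

#include <stdio.h>

enum st { ST_UNDEFINED, ST_INCLUDE, ST_EXCLUDE };

struct src_counts {
	int in;		/* sockets listing this source in include mode */
	int ex;		/* sockets listing this source in exclude mode */
};

/* Apply a socket-layer source transition st0 -> st1, or undo it. */
static void
merge(struct src_counts *c, enum st st0, enum st st1, int rollback)
{
	int n = rollback ? -1 : 1;

	if (st0 == ST_EXCLUDE)
		c->ex -= n;
	else if (st0 == ST_INCLUDE)
		c->in -= n;
	if (st1 == ST_EXCLUDE)
		c->ex += n;
	else if (st1 == ST_INCLUDE)
		c->in += n;
}

int
main(void)
{
	struct src_counts c = { 0, 0 };

	merge(&c, ST_UNDEFINED, ST_EXCLUDE, 0);	/* socket blocks the source */
	printf("in=%d ex=%d\n", c.in, c.ex);	/* in=0 ex=1 */
	merge(&c, ST_UNDEFINED, ST_EXCLUDE, 1);	/* roll the change back */
	printf("in=%d ex=%d\n", c.in, c.ex);	/* in=0 ex=0 */
	return (0);
}
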
* If no listeners remain, state is undefined at t1, * and the IGMP lifecycle for this group should finish. */ if (inm->inm_st[1].iss_ex > 0) { CTR1(KTR_IGMPV3, "%s: transition to EX", __func__); inm->inm_st[1].iss_fmode = MCAST_EXCLUDE; } else if (inm->inm_st[1].iss_in > 0) { CTR1(KTR_IGMPV3, "%s: transition to IN", __func__); inm->inm_st[1].iss_fmode = MCAST_INCLUDE; } else { CTR1(KTR_IGMPV3, "%s: transition to UNDEF", __func__); inm->inm_st[1].iss_fmode = MCAST_UNDEFINED; } /* Decrement ASM listener count on transition out of ASM mode. */ if (imf->imf_st[0] == MCAST_EXCLUDE && nsrc0 == 0) { if ((imf->imf_st[1] != MCAST_EXCLUDE) || (imf->imf_st[1] == MCAST_EXCLUDE && nsrc1 > 0)) { CTR1(KTR_IGMPV3, "%s: --asm on inm at t1", __func__); --inm->inm_st[1].iss_asm; } } /* Increment ASM listener count on transition to ASM mode. */ if (imf->imf_st[1] == MCAST_EXCLUDE && nsrc1 == 0) { CTR1(KTR_IGMPV3, "%s: asm++ on inm at t1", __func__); inm->inm_st[1].iss_asm++; } CTR3(KTR_IGMPV3, "%s: merged imf %p to inm %p", __func__, imf, inm); inm_print(inm); out_reap: if (schanged > 0) { CTR1(KTR_IGMPV3, "%s: sources changed; reaping", __func__); inm_reap(inm); } return (error); } /* * Mark an in_multi's filter set deltas as committed. * Called by IGMP after a state change has been enqueued. */ void inm_commit(struct in_multi *inm) { struct ip_msource *ims; CTR2(KTR_IGMPV3, "%s: commit inm %p", __func__, inm); CTR1(KTR_IGMPV3, "%s: pre commit:", __func__); inm_print(inm); RB_FOREACH(ims, ip_msource_tree, &inm->inm_srcs) { ims->ims_st[0] = ims->ims_st[1]; } inm->inm_st[0] = inm->inm_st[1]; } /* * Reap unreferenced nodes from an in_multi's filter set. */ static void inm_reap(struct in_multi *inm) { struct ip_msource *ims, *tims; RB_FOREACH_SAFE(ims, ip_msource_tree, &inm->inm_srcs, tims) { if (ims->ims_st[0].ex > 0 || ims->ims_st[0].in > 0 || ims->ims_st[1].ex > 0 || ims->ims_st[1].in > 0 || ims->ims_stp != 0) continue; CTR2(KTR_IGMPV3, "%s: free ims %p", __func__, ims); RB_REMOVE(ip_msource_tree, &inm->inm_srcs, ims); free(ims, M_IPMSOURCE); inm->inm_nsrc--; } } /* * Purge all source nodes from an in_multi's filter set. */ static void inm_purge(struct in_multi *inm) { struct ip_msource *ims, *tims; RB_FOREACH_SAFE(ims, ip_msource_tree, &inm->inm_srcs, tims) { CTR2(KTR_IGMPV3, "%s: free ims %p", __func__, ims); RB_REMOVE(ip_msource_tree, &inm->inm_srcs, ims); free(ims, M_IPMSOURCE); inm->inm_nsrc--; } } /* * Join a multicast group; unlocked entry point. * * SMPng: XXX: in_joingroup() is called from in_control() when Giant * is not held. Fortunately, ifp is unlikely to have been detached * at this point, so we assume it's OK to recurse. */ int in_joingroup(struct ifnet *ifp, const struct in_addr *gina, /*const*/ struct in_mfilter *imf, struct in_multi **pinm) { int error; IN_MULTI_LOCK(); error = in_joingroup_locked(ifp, gina, imf, pinm); IN_MULTI_UNLOCK(); return (error); } /* * Join a multicast group; real entry point. * * Only preserves atomicity at inm level. * NOTE: imf argument cannot be const due to sys/tree.h limitations. * * If the IGMP downcall fails, the group is not joined, and an error * code is returned. 
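
As just described for inm_merge(): once the per-socket deltas are folded in, the group's stack-wide filter mode falls out of the listener counts, with any exclusive listener forcing EXCLUDE, otherwise any inclusive listener giving INCLUDE, and no listeners leaving the mode undefined so the IGMP lifecycle can wind down. A one-function sketch of that decision; the names are local stand-ins.

#include <stdio.h>

enum fmode { FM_UNDEFINED, FM_INCLUDE, FM_EXCLUDE };

/* Stack-wide mode, as computed near the end of inm_merge(). */
static enum fmode
group_mode(int iss_ex, int iss_in)
{
	if (iss_ex > 0)
		return (FM_EXCLUDE);	/* any exclusive listener wins */
	if (iss_in > 0)
		return (FM_INCLUDE);
	return (FM_UNDEFINED);		/* no listeners left */
}

int
main(void)
{
	printf("%d %d %d\n", group_mode(1, 3), group_mode(0, 2),
	    group_mode(0, 0));	/* EXCLUDE, INCLUDE, UNDEFINED */
	return (0);
}
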
*/ int in_joingroup_locked(struct ifnet *ifp, const struct in_addr *gina, /*const*/ struct in_mfilter *imf, struct in_multi **pinm) { struct in_mfilter timf; struct in_multi *inm; int error; IN_MULTI_LOCK_ASSERT(); IN_MULTI_LIST_UNLOCK_ASSERT(); CTR4(KTR_IGMPV3, "%s: join 0x%08x on %p(%s))", __func__, ntohl(gina->s_addr), ifp, ifp->if_xname); error = 0; inm = NULL; /* * If no imf was specified (i.e. kernel consumer), * fake one up and assume it is an ASM join. */ if (imf == NULL) { imf_init(&timf, MCAST_UNDEFINED, MCAST_EXCLUDE); imf = &timf; } error = in_getmulti(ifp, gina, &inm); if (error) { CTR1(KTR_IGMPV3, "%s: in_getmulti() failure", __func__); return (error); } IN_MULTI_LIST_LOCK(); CTR1(KTR_IGMPV3, "%s: merge inm state", __func__); error = inm_merge(inm, imf); if (error) { CTR1(KTR_IGMPV3, "%s: failed to merge inm state", __func__); goto out_inm_release; } CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__); error = igmp_change_state(inm); if (error) { CTR1(KTR_IGMPV3, "%s: failed to update source", __func__); goto out_inm_release; } out_inm_release: - IN_MULTI_LIST_UNLOCK(); if (error) { CTR2(KTR_IGMPV3, "%s: dropping ref on %p", __func__, inm); inm_release_deferred(inm); } else { *pinm = inm; } + IN_MULTI_LIST_UNLOCK(); return (error); } /* * Leave a multicast group; unlocked entry point. */ int in_leavegroup(struct in_multi *inm, /*const*/ struct in_mfilter *imf) { int error; IN_MULTI_LOCK(); error = in_leavegroup_locked(inm, imf); IN_MULTI_UNLOCK(); return (error); } /* * Leave a multicast group; real entry point. * All source filters will be expunged. * * Only preserves atomicity at inm level. * * Holding the write lock for the INP which contains imf * is highly advisable. We can't assert for it as imf does not * contain a back-pointer to the owning inp. * * Note: This is not the same as inm_release(*) as this function also * makes a state change downcall into IGMP. */ int in_leavegroup_locked(struct in_multi *inm, /*const*/ struct in_mfilter *imf) { struct in_mfilter timf; int error; error = 0; IN_MULTI_LOCK_ASSERT(); IN_MULTI_LIST_UNLOCK_ASSERT(); CTR5(KTR_IGMPV3, "%s: leave inm %p, 0x%08x/%s, imf %p", __func__, inm, ntohl(inm->inm_addr.s_addr), (inm_is_ifp_detached(inm) ? "null" : inm->inm_ifp->if_xname), imf); /* * If no imf was specified (i.e. kernel consumer), * fake one up and assume it is an ASM join. */ if (imf == NULL) { imf_init(&timf, MCAST_EXCLUDE, MCAST_UNDEFINED); imf = &timf; } /* * Begin state merge transaction at IGMP layer. * * As this particular invocation should not cause any memory * to be allocated, and there is no opportunity to roll back * the transaction, it MUST NOT fail. */ CTR1(KTR_IGMPV3, "%s: merge inm state", __func__); IN_MULTI_LIST_LOCK(); error = inm_merge(inm, imf); KASSERT(error == 0, ("%s: failed to merge inm state", __func__)); CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__); CURVNET_SET(inm->inm_ifp->if_vnet); error = igmp_change_state(inm); + IF_ADDR_WLOCK(inm->inm_ifp); inm_release_deferred(inm); + IF_ADDR_WUNLOCK(inm->inm_ifp); IN_MULTI_LIST_UNLOCK(); CURVNET_RESTORE(); if (error) CTR1(KTR_IGMPV3, "%s: failed igmp downcall", __func__); CTR2(KTR_IGMPV3, "%s: dropping ref on %p", __func__, inm); return (error); } /*#ifndef BURN_BRIDGES*/ /* * Join an IPv4 multicast group in (*,G) exclusive mode. * The group must be a 224.0.0.0/24 link-scope group. * This KPI is for legacy kernel consumers only. 
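The legacy in_addmulti() KPI described here (and defined just below) only accepts groups inside 224.0.0.0/24. A minimal sketch of that link-scope test on a network-order struct in_addr; FreeBSD's <netinet/in.h> expresses the same check as IN_LOCAL_GROUP() on the host-order value, and the addresses used are only examples:

#include <stdbool.h>
#include <netinet/in.h>
#include <arpa/inet.h>    /* inet_addr(), ntohl() */

/* True when the group lies in 224.0.0.0/24, the link-local scope. */
static bool
is_local_scope_group(struct in_addr group)
{
    return ((ntohl(group.s_addr) & 0xffffff00U) == 0xe0000000U);
}

int
main(void)
{
    struct in_addr allhosts = { .s_addr = inet_addr("224.0.0.1") };
    struct in_addr ssdp = { .s_addr = inet_addr("239.255.255.250") };

    /* Only the all-hosts group passes the link-scope restriction. */
    return (is_local_scope_group(allhosts) && !is_local_scope_group(ssdp))
        ? 0 : 1;
}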
*/ struct in_multi * in_addmulti(struct in_addr *ap, struct ifnet *ifp) { struct in_multi *pinm; int error; #ifdef INVARIANTS char addrbuf[INET_ADDRSTRLEN]; #endif KASSERT(IN_LOCAL_GROUP(ntohl(ap->s_addr)), ("%s: %s not in 224.0.0.0/24", __func__, inet_ntoa_r(*ap, addrbuf))); error = in_joingroup(ifp, ap, NULL, &pinm); if (error != 0) pinm = NULL; return (pinm); } /* * Block or unblock an ASM multicast source on an inpcb. * This implements the delta-based API described in RFC 3678. * * The delta-based API applies only to exclusive-mode memberships. * An IGMP downcall will be performed. * * SMPng: NOTE: Must take Giant as a join may create a new ifma. * * Return 0 if successful, otherwise return an appropriate error code. */ static int inp_block_unblock_source(struct inpcb *inp, struct sockopt *sopt) { struct group_source_req gsr; sockunion_t *gsa, *ssa; struct ifnet *ifp; struct in_mfilter *imf; struct ip_moptions *imo; struct in_msource *ims; struct in_multi *inm; size_t idx; uint16_t fmode; int error, doblock; ifp = NULL; error = 0; doblock = 0; memset(&gsr, 0, sizeof(struct group_source_req)); gsa = (sockunion_t *)&gsr.gsr_group; ssa = (sockunion_t *)&gsr.gsr_source; switch (sopt->sopt_name) { case IP_BLOCK_SOURCE: case IP_UNBLOCK_SOURCE: { struct ip_mreq_source mreqs; error = sooptcopyin(sopt, &mreqs, sizeof(struct ip_mreq_source), sizeof(struct ip_mreq_source)); if (error) return (error); gsa->sin.sin_family = AF_INET; gsa->sin.sin_len = sizeof(struct sockaddr_in); gsa->sin.sin_addr = mreqs.imr_multiaddr; ssa->sin.sin_family = AF_INET; ssa->sin.sin_len = sizeof(struct sockaddr_in); ssa->sin.sin_addr = mreqs.imr_sourceaddr; if (!in_nullhost(mreqs.imr_interface)) INADDR_TO_IFP(mreqs.imr_interface, ifp); if (sopt->sopt_name == IP_BLOCK_SOURCE) doblock = 1; CTR3(KTR_IGMPV3, "%s: imr_interface = 0x%08x, ifp = %p", __func__, ntohl(mreqs.imr_interface.s_addr), ifp); break; } case MCAST_BLOCK_SOURCE: case MCAST_UNBLOCK_SOURCE: error = sooptcopyin(sopt, &gsr, sizeof(struct group_source_req), sizeof(struct group_source_req)); if (error) return (error); if (gsa->sin.sin_family != AF_INET || gsa->sin.sin_len != sizeof(struct sockaddr_in)) return (EINVAL); if (ssa->sin.sin_family != AF_INET || ssa->sin.sin_len != sizeof(struct sockaddr_in)) return (EINVAL); if (gsr.gsr_interface == 0 || V_if_index < gsr.gsr_interface) return (EADDRNOTAVAIL); ifp = ifnet_byindex(gsr.gsr_interface); if (sopt->sopt_name == MCAST_BLOCK_SOURCE) doblock = 1; break; default: CTR2(KTR_IGMPV3, "%s: unknown sopt_name %d", __func__, sopt->sopt_name); return (EOPNOTSUPP); break; } if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr))) return (EINVAL); /* * Check if we are actually a member of this group. */ imo = inp_findmoptions(inp); idx = imo_match_group(imo, ifp, &gsa->sa); if (idx == -1 || imo->imo_mfilters == NULL) { error = EADDRNOTAVAIL; goto out_inp_locked; } KASSERT(imo->imo_mfilters != NULL, ("%s: imo_mfilters not allocated", __func__)); imf = &imo->imo_mfilters[idx]; inm = imo->imo_membership[idx]; /* * Attempting to use the delta-based API on an * non exclusive-mode membership is an error. */ fmode = imf->imf_st[0]; if (fmode != MCAST_EXCLUDE) { error = EINVAL; goto out_inp_locked; } /* * Deal with error cases up-front: * Asked to block, but already blocked; or * Asked to unblock, but nothing to unblock. * If adding a new block entry, allocate it. 
*/ ims = imo_match_source(imo, idx, &ssa->sa); if ((ims != NULL && doblock) || (ims == NULL && !doblock)) { CTR3(KTR_IGMPV3, "%s: source 0x%08x %spresent", __func__, ntohl(ssa->sin.sin_addr.s_addr), doblock ? "" : "not "); error = EADDRNOTAVAIL; goto out_inp_locked; } INP_WLOCK_ASSERT(inp); /* * Begin state merge transaction at socket layer. */ if (doblock) { CTR2(KTR_IGMPV3, "%s: %s source", __func__, "block"); ims = imf_graft(imf, fmode, &ssa->sin); if (ims == NULL) error = ENOMEM; } else { CTR2(KTR_IGMPV3, "%s: %s source", __func__, "allow"); error = imf_prune(imf, &ssa->sin); } if (error) { CTR1(KTR_IGMPV3, "%s: merge imf state failed", __func__); goto out_imf_rollback; } /* * Begin state merge transaction at IGMP layer. */ IN_MULTI_LOCK(); IN_MULTI_LIST_LOCK(); CTR1(KTR_IGMPV3, "%s: merge inm state", __func__); error = inm_merge(inm, imf); if (error) { CTR1(KTR_IGMPV3, "%s: failed to merge inm state", __func__); goto out_in_multi_locked; } CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__); error = igmp_change_state(inm); if (error) CTR1(KTR_IGMPV3, "%s: failed igmp downcall", __func__); out_in_multi_locked: IN_MULTI_UNLOCK(); IN_MULTI_UNLOCK(); out_imf_rollback: if (error) imf_rollback(imf); else imf_commit(imf); imf_reap(imf); out_inp_locked: INP_WUNLOCK(inp); return (error); } /* * Given an inpcb, return its multicast options structure pointer. Accepts * an unlocked inpcb pointer, but will return it locked. May sleep. * * SMPng: NOTE: Potentially calls malloc(M_WAITOK) with Giant held. * SMPng: NOTE: Returns with the INP write lock held. */ static struct ip_moptions * inp_findmoptions(struct inpcb *inp) { struct ip_moptions *imo; struct in_multi **immp; struct in_mfilter *imfp; size_t idx; INP_WLOCK(inp); if (inp->inp_moptions != NULL) return (inp->inp_moptions); INP_WUNLOCK(inp); imo = malloc(sizeof(*imo), M_IPMOPTS, M_WAITOK); immp = malloc(sizeof(*immp) * IP_MIN_MEMBERSHIPS, M_IPMOPTS, M_WAITOK | M_ZERO); imfp = malloc(sizeof(struct in_mfilter) * IP_MIN_MEMBERSHIPS, M_INMFILTER, M_WAITOK); imo->imo_multicast_ifp = NULL; imo->imo_multicast_addr.s_addr = INADDR_ANY; imo->imo_multicast_vif = -1; imo->imo_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; imo->imo_multicast_loop = in_mcast_loop; imo->imo_num_memberships = 0; imo->imo_max_memberships = IP_MIN_MEMBERSHIPS; imo->imo_membership = immp; /* Initialize per-group source filters. */ for (idx = 0; idx < IP_MIN_MEMBERSHIPS; idx++) imf_init(&imfp[idx], MCAST_UNDEFINED, MCAST_EXCLUDE); imo->imo_mfilters = imfp; INP_WLOCK(inp); if (inp->inp_moptions != NULL) { free(imfp, M_INMFILTER); free(immp, M_IPMOPTS); free(imo, M_IPMOPTS); return (inp->inp_moptions); } inp->inp_moptions = imo; return (imo); } /* * Discard the IP multicast options (and source filters). To minimize * the amount of work done while holding locks such as the INP's * pcbinfo lock (which is used in the receive path), the free * operation is performed asynchronously in a separate task. * * SMPng: NOTE: assumes INP write lock is held. 
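For reference, the handler above is what services the delta-based RFC 3678 socket options from userland. A minimal sketch that joins a group in exclusive (ASM) mode and then blocks a single sender; the group, source, and interface values are placeholders, and a real application would pin both calls to the same interface so the membership lookup succeeds:

#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <err.h>
#include <unistd.h>

int
main(void)
{
    struct ip_mreq mr;
    struct ip_mreq_source mrs;
    int s;

    s = socket(AF_INET, SOCK_DGRAM, 0);
    if (s == -1)
        err(1, "socket");

    /* Plain ASM join; the membership starts out in exclusive mode. */
    mr.imr_multiaddr.s_addr = inet_addr("239.1.1.1");    /* placeholder group */
    mr.imr_interface.s_addr = htonl(INADDR_ANY);
    if (setsockopt(s, IPPROTO_IP, IP_ADD_MEMBERSHIP, &mr, sizeof(mr)) == -1)
        err(1, "IP_ADD_MEMBERSHIP");

    /* Delta-based request: exclude one sender from that membership. */
    mrs.imr_multiaddr.s_addr = inet_addr("239.1.1.1");
    mrs.imr_sourceaddr.s_addr = inet_addr("192.0.2.10"); /* placeholder source */
    mrs.imr_interface.s_addr = htonl(INADDR_ANY);
    if (setsockopt(s, IPPROTO_IP, IP_BLOCK_SOURCE, &mrs, sizeof(mrs)) == -1)
        err(1, "IP_BLOCK_SOURCE");

    close(s);
    return (0);
}

Blocking a source that is already blocked, or unblocking one that is not, comes back as EADDRNOTAVAIL, matching the up-front error handling in the kernel path above.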
*/ void inp_freemoptions(struct ip_moptions *imo, struct inpcbinfo *pcbinfo) { int wlock; if (imo == NULL) return; INP_INFO_LOCK_ASSERT(pcbinfo); wlock = INP_INFO_WLOCKED(pcbinfo); if (wlock) INP_INFO_WUNLOCK(pcbinfo); else INP_INFO_RUNLOCK(pcbinfo); KASSERT(imo != NULL, ("%s: ip_moptions is NULL", __func__)); IN_MULTI_LIST_LOCK(); STAILQ_INSERT_TAIL(&imo_gc_list, imo, imo_link); IN_MULTI_LIST_UNLOCK(); taskqueue_enqueue(taskqueue_thread, &imo_gc_task); if (wlock) INP_INFO_WLOCK(pcbinfo); else INP_INFO_RLOCK(pcbinfo); } static void inp_freemoptions_internal(struct ip_moptions *imo) { struct in_mfilter *imf; size_t idx, nmships; nmships = imo->imo_num_memberships; for (idx = 0; idx < nmships; ++idx) { imf = imo->imo_mfilters ? &imo->imo_mfilters[idx] : NULL; if (imf) imf_leave(imf); (void)in_leavegroup(imo->imo_membership[idx], imf); if (imf) imf_purge(imf); } if (imo->imo_mfilters) free(imo->imo_mfilters, M_INMFILTER); free(imo->imo_membership, M_IPMOPTS); free(imo, M_IPMOPTS); } static void inp_gcmoptions(void *context, int pending) { struct ip_moptions *imo; IN_MULTI_LIST_LOCK(); while (!STAILQ_EMPTY(&imo_gc_list)) { imo = STAILQ_FIRST(&imo_gc_list); STAILQ_REMOVE_HEAD(&imo_gc_list, imo_link); IN_MULTI_LIST_UNLOCK(); inp_freemoptions_internal(imo); IN_MULTI_LIST_LOCK(); } IN_MULTI_LIST_UNLOCK(); } /* * Atomically get source filters on a socket for an IPv4 multicast group. * Called with INP lock held; returns with lock released. */ static int inp_get_source_filters(struct inpcb *inp, struct sockopt *sopt) { struct __msfilterreq msfr; sockunion_t *gsa; struct ifnet *ifp; struct ip_moptions *imo; struct in_mfilter *imf; struct ip_msource *ims; struct in_msource *lims; struct sockaddr_in *psin; struct sockaddr_storage *ptss; struct sockaddr_storage *tss; int error; size_t idx, nsrcs, ncsrcs; INP_WLOCK_ASSERT(inp); imo = inp->inp_moptions; KASSERT(imo != NULL, ("%s: null ip_moptions", __func__)); INP_WUNLOCK(inp); error = sooptcopyin(sopt, &msfr, sizeof(struct __msfilterreq), sizeof(struct __msfilterreq)); if (error) return (error); if (msfr.msfr_ifindex == 0 || V_if_index < msfr.msfr_ifindex) return (EINVAL); ifp = ifnet_byindex(msfr.msfr_ifindex); if (ifp == NULL) return (EINVAL); INP_WLOCK(inp); /* * Lookup group on the socket. */ gsa = (sockunion_t *)&msfr.msfr_group; idx = imo_match_group(imo, ifp, &gsa->sa); if (idx == -1 || imo->imo_mfilters == NULL) { INP_WUNLOCK(inp); return (EADDRNOTAVAIL); } imf = &imo->imo_mfilters[idx]; /* * Ignore memberships which are in limbo. */ if (imf->imf_st[1] == MCAST_UNDEFINED) { INP_WUNLOCK(inp); return (EAGAIN); } msfr.msfr_fmode = imf->imf_st[1]; /* * If the user specified a buffer, copy out the source filter * entries to userland gracefully. * We only copy out the number of entries which userland * has asked for, but we always tell userland how big the * buffer really needs to be. */ if (msfr.msfr_nsrcs > in_mcast_maxsocksrc) msfr.msfr_nsrcs = in_mcast_maxsocksrc; tss = NULL; if (msfr.msfr_srcs != NULL && msfr.msfr_nsrcs > 0) { tss = malloc(sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs, M_TEMP, M_NOWAIT | M_ZERO); if (tss == NULL) { INP_WUNLOCK(inp); return (ENOBUFS); } } /* * Count number of sources in-mode at t0. * If buffer space exists and remains, copy out source entries. 
*/ nsrcs = msfr.msfr_nsrcs; ncsrcs = 0; ptss = tss; RB_FOREACH(ims, ip_msource_tree, &imf->imf_sources) { lims = (struct in_msource *)ims; if (lims->imsl_st[0] == MCAST_UNDEFINED || lims->imsl_st[0] != imf->imf_st[0]) continue; ++ncsrcs; if (tss != NULL && nsrcs > 0) { psin = (struct sockaddr_in *)ptss; psin->sin_family = AF_INET; psin->sin_len = sizeof(struct sockaddr_in); psin->sin_addr.s_addr = htonl(lims->ims_haddr); psin->sin_port = 0; ++ptss; --nsrcs; } } INP_WUNLOCK(inp); if (tss != NULL) { error = copyout(tss, msfr.msfr_srcs, sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs); free(tss, M_TEMP); if (error) return (error); } msfr.msfr_nsrcs = ncsrcs; error = sooptcopyout(sopt, &msfr, sizeof(struct __msfilterreq)); return (error); } /* * Return the IP multicast options in response to user getsockopt(). */ int inp_getmoptions(struct inpcb *inp, struct sockopt *sopt) { struct rm_priotracker in_ifa_tracker; struct ip_mreqn mreqn; struct ip_moptions *imo; struct ifnet *ifp; struct in_ifaddr *ia; int error, optval; u_char coptval; INP_WLOCK(inp); imo = inp->inp_moptions; /* * If socket is neither of type SOCK_RAW or SOCK_DGRAM, * or is a divert socket, reject it. */ if (inp->inp_socket->so_proto->pr_protocol == IPPROTO_DIVERT || (inp->inp_socket->so_proto->pr_type != SOCK_RAW && inp->inp_socket->so_proto->pr_type != SOCK_DGRAM)) { INP_WUNLOCK(inp); return (EOPNOTSUPP); } error = 0; switch (sopt->sopt_name) { case IP_MULTICAST_VIF: if (imo != NULL) optval = imo->imo_multicast_vif; else optval = -1; INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof(int)); break; case IP_MULTICAST_IF: memset(&mreqn, 0, sizeof(struct ip_mreqn)); if (imo != NULL) { ifp = imo->imo_multicast_ifp; if (!in_nullhost(imo->imo_multicast_addr)) { mreqn.imr_address = imo->imo_multicast_addr; } else if (ifp != NULL) { mreqn.imr_ifindex = ifp->if_index; IFP_TO_IA(ifp, ia, &in_ifa_tracker); if (ia != NULL) { mreqn.imr_address = IA_SIN(ia)->sin_addr; ifa_free(&ia->ia_ifa); } } } INP_WUNLOCK(inp); if (sopt->sopt_valsize == sizeof(struct ip_mreqn)) { error = sooptcopyout(sopt, &mreqn, sizeof(struct ip_mreqn)); } else { error = sooptcopyout(sopt, &mreqn.imr_address, sizeof(struct in_addr)); } break; case IP_MULTICAST_TTL: if (imo == NULL) optval = coptval = IP_DEFAULT_MULTICAST_TTL; else optval = coptval = imo->imo_multicast_ttl; INP_WUNLOCK(inp); if (sopt->sopt_valsize == sizeof(u_char)) error = sooptcopyout(sopt, &coptval, sizeof(u_char)); else error = sooptcopyout(sopt, &optval, sizeof(int)); break; case IP_MULTICAST_LOOP: if (imo == NULL) optval = coptval = IP_DEFAULT_MULTICAST_LOOP; else optval = coptval = imo->imo_multicast_loop; INP_WUNLOCK(inp); if (sopt->sopt_valsize == sizeof(u_char)) error = sooptcopyout(sopt, &coptval, sizeof(u_char)); else error = sooptcopyout(sopt, &optval, sizeof(int)); break; case IP_MSFILTER: if (imo == NULL) { error = EADDRNOTAVAIL; INP_WUNLOCK(inp); } else { error = inp_get_source_filters(inp, sopt); } break; default: INP_WUNLOCK(inp); error = ENOPROTOOPT; break; } INP_UNLOCK_ASSERT(inp); return (error); } /* * Look up the ifnet to use for a multicast group membership, * given the IPv4 address of an interface, and the IPv4 group address. * * This routine exists to support legacy multicast applications * which do not understand that multicast memberships are scoped to * specific physical links in the networking stack, or which need * to join link-scope groups before IPv4 addresses are configured. 
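Userland normally reaches the IP_MSFILTER get path above through the RFC 3678 getsourcefilter(3) wrapper rather than raw getsockopt(). A minimal sketch, assuming the socket has already joined the group on the named interface; "em0" and the group address are placeholders:

#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <net/if.h>    /* if_nametoindex() */
#include <err.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
    struct sockaddr_in grp;
    struct sockaddr_storage srcs[16];
    uint32_t fmode, nsrcs;
    int s;

    s = socket(AF_INET, SOCK_DGRAM, 0);
    if (s == -1)
        err(1, "socket");

    memset(&grp, 0, sizeof(grp));
    grp.sin_family = AF_INET;
    grp.sin_len = sizeof(grp);                  /* FreeBSD sockaddr convention */
    grp.sin_addr.s_addr = inet_addr("239.1.1.1");  /* placeholder group */

    /* Ask for at most 16 sources; nsrcs comes back with the real count. */
    nsrcs = 16;
    if (getsourcefilter(s, if_nametoindex("em0"), (struct sockaddr *)&grp,
        sizeof(grp), &fmode, &nsrcs, srcs) == -1)
        err(1, "getsourcefilter");

    printf("mode %s, %u source(s)\n",
        fmode == MCAST_EXCLUDE ? "exclude" : "include", nsrcs);
    return (0);
}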
* * If inp is non-NULL, use this socket's current FIB number for any * required FIB lookup. * If ina is INADDR_ANY, look up the group address in the unicast FIB, * and use its ifp; usually, this points to the default next-hop. * * If the FIB lookup fails, attempt to use the first non-loopback * interface with multicast capability in the system as a * last resort. The legacy IPv4 ASM API requires that we do * this in order to allow groups to be joined when the routing * table has not yet been populated during boot. * * Returns NULL if no ifp could be found. * * SMPng: TODO: Acquire the appropriate locks for INADDR_TO_IFP. * FUTURE: Implement IPv4 source-address selection. */ static struct ifnet * inp_lookup_mcast_ifp(const struct inpcb *inp, const struct sockaddr_in *gsin, const struct in_addr ina) { struct rm_priotracker in_ifa_tracker; struct ifnet *ifp; struct nhop4_basic nh4; uint32_t fibnum; KASSERT(gsin->sin_family == AF_INET, ("%s: not AF_INET", __func__)); KASSERT(IN_MULTICAST(ntohl(gsin->sin_addr.s_addr)), ("%s: not multicast", __func__)); ifp = NULL; if (!in_nullhost(ina)) { INADDR_TO_IFP(ina, ifp); } else { fibnum = inp ? inp->inp_inc.inc_fibnum : 0; if (fib4_lookup_nh_basic(fibnum, gsin->sin_addr, 0, 0, &nh4)==0) ifp = nh4.nh_ifp; else { struct in_ifaddr *ia; struct ifnet *mifp; mifp = NULL; IN_IFADDR_RLOCK(&in_ifa_tracker); TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { mifp = ia->ia_ifp; if (!(mifp->if_flags & IFF_LOOPBACK) && (mifp->if_flags & IFF_MULTICAST)) { ifp = mifp; break; } } IN_IFADDR_RUNLOCK(&in_ifa_tracker); } } return (ifp); } /* * Join an IPv4 multicast group, possibly with a source. */ static int inp_join_group(struct inpcb *inp, struct sockopt *sopt) { struct group_source_req gsr; sockunion_t *gsa, *ssa; struct ifnet *ifp; struct in_mfilter *imf; struct ip_moptions *imo; struct in_multi *inm; struct in_msource *lims; size_t idx; int error, is_new; ifp = NULL; imf = NULL; lims = NULL; error = 0; is_new = 0; memset(&gsr, 0, sizeof(struct group_source_req)); gsa = (sockunion_t *)&gsr.gsr_group; gsa->ss.ss_family = AF_UNSPEC; ssa = (sockunion_t *)&gsr.gsr_source; ssa->ss.ss_family = AF_UNSPEC; switch (sopt->sopt_name) { case IP_ADD_MEMBERSHIP: case IP_ADD_SOURCE_MEMBERSHIP: { struct ip_mreq_source mreqs; if (sopt->sopt_name == IP_ADD_MEMBERSHIP) { error = sooptcopyin(sopt, &mreqs, sizeof(struct ip_mreq), sizeof(struct ip_mreq)); /* * Do argument switcharoo from ip_mreq into * ip_mreq_source to avoid using two instances. 
*/ mreqs.imr_interface = mreqs.imr_sourceaddr; mreqs.imr_sourceaddr.s_addr = INADDR_ANY; } else if (sopt->sopt_name == IP_ADD_SOURCE_MEMBERSHIP) { error = sooptcopyin(sopt, &mreqs, sizeof(struct ip_mreq_source), sizeof(struct ip_mreq_source)); } if (error) return (error); gsa->sin.sin_family = AF_INET; gsa->sin.sin_len = sizeof(struct sockaddr_in); gsa->sin.sin_addr = mreqs.imr_multiaddr; if (sopt->sopt_name == IP_ADD_SOURCE_MEMBERSHIP) { ssa->sin.sin_family = AF_INET; ssa->sin.sin_len = sizeof(struct sockaddr_in); ssa->sin.sin_addr = mreqs.imr_sourceaddr; } if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr))) return (EINVAL); ifp = inp_lookup_mcast_ifp(inp, &gsa->sin, mreqs.imr_interface); CTR3(KTR_IGMPV3, "%s: imr_interface = 0x%08x, ifp = %p", __func__, ntohl(mreqs.imr_interface.s_addr), ifp); break; } case MCAST_JOIN_GROUP: case MCAST_JOIN_SOURCE_GROUP: if (sopt->sopt_name == MCAST_JOIN_GROUP) { error = sooptcopyin(sopt, &gsr, sizeof(struct group_req), sizeof(struct group_req)); } else if (sopt->sopt_name == MCAST_JOIN_SOURCE_GROUP) { error = sooptcopyin(sopt, &gsr, sizeof(struct group_source_req), sizeof(struct group_source_req)); } if (error) return (error); if (gsa->sin.sin_family != AF_INET || gsa->sin.sin_len != sizeof(struct sockaddr_in)) return (EINVAL); /* * Overwrite the port field if present, as the sockaddr * being copied in may be matched with a binary comparison. */ gsa->sin.sin_port = 0; if (sopt->sopt_name == MCAST_JOIN_SOURCE_GROUP) { if (ssa->sin.sin_family != AF_INET || ssa->sin.sin_len != sizeof(struct sockaddr_in)) return (EINVAL); ssa->sin.sin_port = 0; } if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr))) return (EINVAL); if (gsr.gsr_interface == 0 || V_if_index < gsr.gsr_interface) return (EADDRNOTAVAIL); ifp = ifnet_byindex(gsr.gsr_interface); break; default: CTR2(KTR_IGMPV3, "%s: unknown sopt_name %d", __func__, sopt->sopt_name); return (EOPNOTSUPP); break; } if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) return (EADDRNOTAVAIL); imo = inp_findmoptions(inp); idx = imo_match_group(imo, ifp, &gsa->sa); if (idx == -1) { is_new = 1; } else { inm = imo->imo_membership[idx]; imf = &imo->imo_mfilters[idx]; if (ssa->ss.ss_family != AF_UNSPEC) { /* * MCAST_JOIN_SOURCE_GROUP on an exclusive membership * is an error. On an existing inclusive membership, * it just adds the source to the filter list. */ if (imf->imf_st[1] != MCAST_INCLUDE) { error = EINVAL; goto out_inp_locked; } /* * Throw out duplicates. * * XXX FIXME: This makes a naive assumption that * even if entries exist for *ssa in this imf, * they will be rejected as dupes, even if they * are not valid in the current mode (in-mode). * * in_msource is transactioned just as for anything * else in SSM -- but note naive use of inm_graft() * below for allocating new filter entries. * * This is only an issue if someone mixes the * full-state SSM API with the delta-based API, * which is discouraged in the relevant RFCs. */ lims = imo_match_source(imo, idx, &ssa->sa); if (lims != NULL /*&& lims->imsl_st[1] == MCAST_INCLUDE*/) { error = EADDRNOTAVAIL; goto out_inp_locked; } } else { /* * MCAST_JOIN_GROUP on an existing exclusive * membership is an error; return EADDRINUSE * to preserve 4.4BSD API idempotence, and * avoid tedious detour to code below. * NOTE: This is bending RFC 3678 a bit. * * On an existing inclusive membership, this is also * an error; if you want to change filter mode, * you must use the userland API setsourcefilter(). 
* XXX We don't reject this for imf in UNDEFINED * state at t1, because allocation of a filter * is atomic with allocation of a membership. */ error = EINVAL; if (imf->imf_st[1] == MCAST_EXCLUDE) error = EADDRINUSE; goto out_inp_locked; } } /* * Begin state merge transaction at socket layer. */ INP_WLOCK_ASSERT(inp); if (is_new) { if (imo->imo_num_memberships == imo->imo_max_memberships) { error = imo_grow(imo); if (error) goto out_inp_locked; } /* * Allocate the new slot upfront so we can deal with * grafting the new source filter in same code path * as for join-source on existing membership. */ idx = imo->imo_num_memberships; imo->imo_membership[idx] = NULL; imo->imo_num_memberships++; KASSERT(imo->imo_mfilters != NULL, ("%s: imf_mfilters vector was not allocated", __func__)); imf = &imo->imo_mfilters[idx]; KASSERT(RB_EMPTY(&imf->imf_sources), ("%s: imf_sources not empty", __func__)); } /* * Graft new source into filter list for this inpcb's * membership of the group. The in_multi may not have * been allocated yet if this is a new membership, however, * the in_mfilter slot will be allocated and must be initialized. * * Note: Grafting of exclusive mode filters doesn't happen * in this path. * XXX: Should check for non-NULL lims (node exists but may * not be in-mode) for interop with full-state API. */ if (ssa->ss.ss_family != AF_UNSPEC) { /* Membership starts in IN mode */ if (is_new) { CTR1(KTR_IGMPV3, "%s: new join w/source", __func__); imf_init(imf, MCAST_UNDEFINED, MCAST_INCLUDE); } else { CTR2(KTR_IGMPV3, "%s: %s source", __func__, "allow"); } lims = imf_graft(imf, MCAST_INCLUDE, &ssa->sin); if (lims == NULL) { CTR1(KTR_IGMPV3, "%s: merge imf state failed", __func__); error = ENOMEM; goto out_imo_free; } } else { /* No address specified; Membership starts in EX mode */ if (is_new) { CTR1(KTR_IGMPV3, "%s: new join w/o source", __func__); imf_init(imf, MCAST_UNDEFINED, MCAST_EXCLUDE); } } /* * Begin state merge transaction at IGMP layer. */ in_pcbref(inp); INP_WUNLOCK(inp); IN_MULTI_LOCK(); if (is_new) { error = in_joingroup_locked(ifp, &gsa->sin.sin_addr, imf, &inm); if (error) { CTR1(KTR_IGMPV3, "%s: in_joingroup_locked failed", __func__); IN_MULTI_LIST_UNLOCK(); goto out_imo_free; } imo->imo_membership[idx] = inm; } else { CTR1(KTR_IGMPV3, "%s: merge inm state", __func__); IN_MULTI_LIST_LOCK(); error = inm_merge(inm, imf); if (error) { CTR1(KTR_IGMPV3, "%s: failed to merge inm state", __func__); IN_MULTI_LIST_UNLOCK(); goto out_in_multi_locked; } CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__); error = igmp_change_state(inm); IN_MULTI_LIST_UNLOCK(); if (error) { CTR1(KTR_IGMPV3, "%s: failed igmp downcall", __func__); goto out_in_multi_locked; } } out_in_multi_locked: IN_MULTI_UNLOCK(); INP_WLOCK(inp); if (in_pcbrele_wlocked(inp)) return (ENXIO); if (error) { imf_rollback(imf); if (is_new) imf_purge(imf); else imf_reap(imf); } else { imf_commit(imf); } out_imo_free: if (error && is_new) { imo->imo_membership[idx] = NULL; --imo->imo_num_memberships; } out_inp_locked: INP_WUNLOCK(inp); return (error); } /* * Leave an IPv4 multicast group on an inpcb, possibly with a source. 
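The join path above accepts both the legacy ip_mreq forms and the RFC 3678 group_source_req forms. A minimal sketch of an SSM-style join keyed by interface index; the interface name and addresses are placeholders, and sin_len is filled in because the handler above validates it:

#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <net/if.h>
#include <err.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
    struct group_source_req gsr;
    struct sockaddr_in *grp, *src;
    int s;

    s = socket(AF_INET, SOCK_DGRAM, 0);
    if (s == -1)
        err(1, "socket");

    memset(&gsr, 0, sizeof(gsr));
    gsr.gsr_interface = if_nametoindex("em0");  /* placeholder ifname */

    grp = (struct sockaddr_in *)&gsr.gsr_group;
    grp->sin_family = AF_INET;
    grp->sin_len = sizeof(*grp);
    grp->sin_addr.s_addr = inet_addr("232.1.1.1");   /* SSM-range group */

    src = (struct sockaddr_in *)&gsr.gsr_source;
    src->sin_family = AF_INET;
    src->sin_len = sizeof(*src);
    src->sin_addr.s_addr = inet_addr("192.0.2.10");  /* placeholder source */

    /* The membership is created in include mode with exactly this source. */
    if (setsockopt(s, IPPROTO_IP, MCAST_JOIN_SOURCE_GROUP, &gsr,
        sizeof(gsr)) == -1)
        err(1, "MCAST_JOIN_SOURCE_GROUP");

    close(s);
    return (0);
}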
*/ static int inp_leave_group(struct inpcb *inp, struct sockopt *sopt) { struct group_source_req gsr; struct ip_mreq_source mreqs; sockunion_t *gsa, *ssa; struct ifnet *ifp; struct in_mfilter *imf; struct ip_moptions *imo; struct in_msource *ims; struct in_multi *inm; size_t idx; int error, is_final; ifp = NULL; error = 0; is_final = 1; memset(&gsr, 0, sizeof(struct group_source_req)); gsa = (sockunion_t *)&gsr.gsr_group; gsa->ss.ss_family = AF_UNSPEC; ssa = (sockunion_t *)&gsr.gsr_source; ssa->ss.ss_family = AF_UNSPEC; switch (sopt->sopt_name) { case IP_DROP_MEMBERSHIP: case IP_DROP_SOURCE_MEMBERSHIP: if (sopt->sopt_name == IP_DROP_MEMBERSHIP) { error = sooptcopyin(sopt, &mreqs, sizeof(struct ip_mreq), sizeof(struct ip_mreq)); /* * Swap interface and sourceaddr arguments, * as ip_mreq and ip_mreq_source are laid * out differently. */ mreqs.imr_interface = mreqs.imr_sourceaddr; mreqs.imr_sourceaddr.s_addr = INADDR_ANY; } else if (sopt->sopt_name == IP_DROP_SOURCE_MEMBERSHIP) { error = sooptcopyin(sopt, &mreqs, sizeof(struct ip_mreq_source), sizeof(struct ip_mreq_source)); } if (error) return (error); gsa->sin.sin_family = AF_INET; gsa->sin.sin_len = sizeof(struct sockaddr_in); gsa->sin.sin_addr = mreqs.imr_multiaddr; if (sopt->sopt_name == IP_DROP_SOURCE_MEMBERSHIP) { ssa->sin.sin_family = AF_INET; ssa->sin.sin_len = sizeof(struct sockaddr_in); ssa->sin.sin_addr = mreqs.imr_sourceaddr; } /* * Attempt to look up hinted ifp from interface address. * Fallthrough with null ifp iff lookup fails, to * preserve 4.4BSD mcast API idempotence. * XXX NOTE WELL: The RFC 3678 API is preferred because * using an IPv4 address as a key is racy. */ if (!in_nullhost(mreqs.imr_interface)) INADDR_TO_IFP(mreqs.imr_interface, ifp); CTR3(KTR_IGMPV3, "%s: imr_interface = 0x%08x, ifp = %p", __func__, ntohl(mreqs.imr_interface.s_addr), ifp); break; case MCAST_LEAVE_GROUP: case MCAST_LEAVE_SOURCE_GROUP: if (sopt->sopt_name == MCAST_LEAVE_GROUP) { error = sooptcopyin(sopt, &gsr, sizeof(struct group_req), sizeof(struct group_req)); } else if (sopt->sopt_name == MCAST_LEAVE_SOURCE_GROUP) { error = sooptcopyin(sopt, &gsr, sizeof(struct group_source_req), sizeof(struct group_source_req)); } if (error) return (error); if (gsa->sin.sin_family != AF_INET || gsa->sin.sin_len != sizeof(struct sockaddr_in)) return (EINVAL); if (sopt->sopt_name == MCAST_LEAVE_SOURCE_GROUP) { if (ssa->sin.sin_family != AF_INET || ssa->sin.sin_len != sizeof(struct sockaddr_in)) return (EINVAL); } if (gsr.gsr_interface == 0 || V_if_index < gsr.gsr_interface) return (EADDRNOTAVAIL); ifp = ifnet_byindex(gsr.gsr_interface); if (ifp == NULL) return (EADDRNOTAVAIL); break; default: CTR2(KTR_IGMPV3, "%s: unknown sopt_name %d", __func__, sopt->sopt_name); return (EOPNOTSUPP); break; } if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr))) return (EINVAL); /* * Find the membership in the membership array. */ imo = inp_findmoptions(inp); idx = imo_match_group(imo, ifp, &gsa->sa); if (idx == -1) { error = EADDRNOTAVAIL; goto out_inp_locked; } inm = imo->imo_membership[idx]; imf = &imo->imo_mfilters[idx]; if (ssa->ss.ss_family != AF_UNSPEC) is_final = 0; /* * Begin state merge transaction at socket layer. */ INP_WLOCK_ASSERT(inp); /* * If we were instructed only to leave a given source, do so. * MCAST_LEAVE_SOURCE_GROUP is only valid for inclusive memberships. 
*/ if (is_final) { imf_leave(imf); } else { if (imf->imf_st[0] == MCAST_EXCLUDE) { error = EADDRNOTAVAIL; goto out_inp_locked; } ims = imo_match_source(imo, idx, &ssa->sa); if (ims == NULL) { CTR3(KTR_IGMPV3, "%s: source 0x%08x %spresent", __func__, ntohl(ssa->sin.sin_addr.s_addr), "not "); error = EADDRNOTAVAIL; goto out_inp_locked; } CTR2(KTR_IGMPV3, "%s: %s source", __func__, "block"); error = imf_prune(imf, &ssa->sin); if (error) { CTR1(KTR_IGMPV3, "%s: merge imf state failed", __func__); goto out_inp_locked; } } /* * Begin state merge transaction at IGMP layer. */ IN_MULTI_LOCK(); if (is_final) { /* * Give up the multicast address record to which * the membership points. */ (void)in_leavegroup_locked(inm, imf); } else { CTR1(KTR_IGMPV3, "%s: merge inm state", __func__); IN_MULTI_LIST_LOCK(); error = inm_merge(inm, imf); if (error) { CTR1(KTR_IGMPV3, "%s: failed to merge inm state", __func__); goto out_in_multi_locked; } CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__); error = igmp_change_state(inm); IN_MULTI_LIST_UNLOCK(); if (error) { CTR1(KTR_IGMPV3, "%s: failed igmp downcall", __func__); } } out_in_multi_locked: IN_MULTI_UNLOCK(); if (error) imf_rollback(imf); else imf_commit(imf); imf_reap(imf); if (is_final) { /* Remove the gap in the membership and filter array. */ for (++idx; idx < imo->imo_num_memberships; ++idx) { imo->imo_membership[idx-1] = imo->imo_membership[idx]; imo->imo_mfilters[idx-1] = imo->imo_mfilters[idx]; } imo->imo_num_memberships--; } out_inp_locked: INP_WUNLOCK(inp); return (error); } /* * Select the interface for transmitting IPv4 multicast datagrams. * * Either an instance of struct in_addr or an instance of struct ip_mreqn * may be passed to this socket option. An address of INADDR_ANY or an * interface index of 0 is used to remove a previous selection. * When no interface is selected, one is chosen for every send. */ static int inp_set_multicast_if(struct inpcb *inp, struct sockopt *sopt) { struct in_addr addr; struct ip_mreqn mreqn; struct ifnet *ifp; struct ip_moptions *imo; int error; if (sopt->sopt_valsize == sizeof(struct ip_mreqn)) { /* * An interface index was specified using the * Linux-derived ip_mreqn structure. */ error = sooptcopyin(sopt, &mreqn, sizeof(struct ip_mreqn), sizeof(struct ip_mreqn)); if (error) return (error); if (mreqn.imr_ifindex < 0 || V_if_index < mreqn.imr_ifindex) return (EINVAL); if (mreqn.imr_ifindex == 0) { ifp = NULL; } else { ifp = ifnet_byindex(mreqn.imr_ifindex); if (ifp == NULL) return (EADDRNOTAVAIL); } } else { /* * An interface was specified by IPv4 address. * This is the traditional BSD usage. */ error = sooptcopyin(sopt, &addr, sizeof(struct in_addr), sizeof(struct in_addr)); if (error) return (error); if (in_nullhost(addr)) { ifp = NULL; } else { INADDR_TO_IFP(addr, ifp); if (ifp == NULL) return (EADDRNOTAVAIL); } CTR3(KTR_IGMPV3, "%s: ifp = %p, addr = 0x%08x", __func__, ifp, ntohl(addr.s_addr)); } /* Reject interfaces which do not support multicast. */ if (ifp != NULL && (ifp->if_flags & IFF_MULTICAST) == 0) return (EOPNOTSUPP); imo = inp_findmoptions(inp); imo->imo_multicast_ifp = ifp; imo->imo_multicast_addr.s_addr = INADDR_ANY; INP_WUNLOCK(inp); return (0); } /* * Atomically set source filters on a socket for an IPv4 multicast group. * * SMPng: NOTE: Potentially calls malloc(M_WAITOK) with Giant held. 
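Matching the is_final distinction above, leaving a single source only prunes the filter while the membership itself survives, whereas MCAST_LEAVE_GROUP drops it outright. A small sketch of both calls, assuming a group_source_req and group_req filled in the same way as the earlier join example:

#include <sys/socket.h>
#include <netinet/in.h>
#include <err.h>

/* Prune one source from an existing include-mode membership. */
static void
leave_one_source(int s, const struct group_source_req *gsr)
{
    if (setsockopt(s, IPPROTO_IP, MCAST_LEAVE_SOURCE_GROUP, gsr,
        sizeof(*gsr)) == -1)
        err(1, "MCAST_LEAVE_SOURCE_GROUP");
}

/* Drop the whole membership, sources and all. */
static void
leave_group(int s, const struct group_req *gr)
{
    if (setsockopt(s, IPPROTO_IP, MCAST_LEAVE_GROUP, gr, sizeof(*gr)) == -1)
        err(1, "MCAST_LEAVE_GROUP");
}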
*/ static int inp_set_source_filters(struct inpcb *inp, struct sockopt *sopt) { struct __msfilterreq msfr; sockunion_t *gsa; struct ifnet *ifp; struct in_mfilter *imf; struct ip_moptions *imo; struct in_multi *inm; size_t idx; int error; error = sooptcopyin(sopt, &msfr, sizeof(struct __msfilterreq), sizeof(struct __msfilterreq)); if (error) return (error); if (msfr.msfr_nsrcs > in_mcast_maxsocksrc) return (ENOBUFS); if ((msfr.msfr_fmode != MCAST_EXCLUDE && msfr.msfr_fmode != MCAST_INCLUDE)) return (EINVAL); if (msfr.msfr_group.ss_family != AF_INET || msfr.msfr_group.ss_len != sizeof(struct sockaddr_in)) return (EINVAL); gsa = (sockunion_t *)&msfr.msfr_group; if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr))) return (EINVAL); gsa->sin.sin_port = 0; /* ignore port */ if (msfr.msfr_ifindex == 0 || V_if_index < msfr.msfr_ifindex) return (EADDRNOTAVAIL); ifp = ifnet_byindex(msfr.msfr_ifindex); if (ifp == NULL) return (EADDRNOTAVAIL); /* * Take the INP write lock. * Check if this socket is a member of this group. */ imo = inp_findmoptions(inp); idx = imo_match_group(imo, ifp, &gsa->sa); if (idx == -1 || imo->imo_mfilters == NULL) { error = EADDRNOTAVAIL; goto out_inp_locked; } inm = imo->imo_membership[idx]; imf = &imo->imo_mfilters[idx]; /* * Begin state merge transaction at socket layer. */ INP_WLOCK_ASSERT(inp); imf->imf_st[1] = msfr.msfr_fmode; /* * Apply any new source filters, if present. * Make a copy of the user-space source vector so * that we may copy them with a single copyin. This * allows us to deal with page faults up-front. */ if (msfr.msfr_nsrcs > 0) { struct in_msource *lims; struct sockaddr_in *psin; struct sockaddr_storage *kss, *pkss; int i; INP_WUNLOCK(inp); CTR2(KTR_IGMPV3, "%s: loading %lu source list entries", __func__, (unsigned long)msfr.msfr_nsrcs); kss = malloc(sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs, M_TEMP, M_WAITOK); error = copyin(msfr.msfr_srcs, kss, sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs); if (error) { free(kss, M_TEMP); return (error); } INP_WLOCK(inp); /* * Mark all source filters as UNDEFINED at t1. * Restore new group filter mode, as imf_leave() * will set it to INCLUDE. */ imf_leave(imf); imf->imf_st[1] = msfr.msfr_fmode; /* * Update socket layer filters at t1, lazy-allocating * new entries. This saves a bunch of memory at the * cost of one RB_FIND() per source entry; duplicate * entries in the msfr_nsrcs vector are ignored. * If we encounter an error, rollback transaction. * * XXX This too could be replaced with a set-symmetric * difference like loop to avoid walking from root * every time, as the key space is common. */ for (i = 0, pkss = kss; i < msfr.msfr_nsrcs; i++, pkss++) { psin = (struct sockaddr_in *)pkss; if (psin->sin_family != AF_INET) { error = EAFNOSUPPORT; break; } if (psin->sin_len != sizeof(struct sockaddr_in)) { error = EINVAL; break; } error = imf_get_source(imf, psin, &lims); if (error) break; lims->imsl_st[1] = imf->imf_st[1]; } free(kss, M_TEMP); } if (error) goto out_imf_rollback; INP_WLOCK_ASSERT(inp); IN_MULTI_LOCK(); IN_MULTI_LIST_LOCK(); /* * Begin state merge transaction at IGMP layer. 
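Userland typically drives this full-state path through the RFC 3678 setsourcefilter(3) wrapper, which is what ultimately lands in msfr_srcs above. A minimal sketch that installs an include-mode filter of two placeholder sources; it assumes the socket has already joined the group on the named interface, and "em0" plus all addresses are placeholders:

#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <net/if.h>
#include <err.h>
#include <string.h>

int
main(void)
{
    struct sockaddr_in grp;
    struct sockaddr_storage srcs[2];
    struct sockaddr_in *sin;
    const char *src_addrs[2] = { "192.0.2.10", "192.0.2.11" };
    int s, i;

    s = socket(AF_INET, SOCK_DGRAM, 0);
    if (s == -1)
        err(1, "socket");

    memset(&grp, 0, sizeof(grp));
    grp.sin_family = AF_INET;
    grp.sin_len = sizeof(grp);
    grp.sin_addr.s_addr = inet_addr("232.1.1.1");   /* placeholder group */

    memset(srcs, 0, sizeof(srcs));
    for (i = 0; i < 2; i++) {
        sin = (struct sockaddr_in *)&srcs[i];
        sin->sin_family = AF_INET;
        sin->sin_len = sizeof(*sin);
        sin->sin_addr.s_addr = inet_addr(src_addrs[i]);
    }

    /* Atomically replace the membership's filter: include exactly these. */
    if (setsourcefilter(s, if_nametoindex("em0"), (struct sockaddr *)&grp,
        sizeof(grp), MCAST_INCLUDE, 2, srcs) == -1)
        err(1, "setsourcefilter");

    return (0);
}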
*/ CTR1(KTR_IGMPV3, "%s: merge inm state", __func__); error = inm_merge(inm, imf); if (error) { CTR1(KTR_IGMPV3, "%s: failed to merge inm state", __func__); IN_MULTI_LIST_UNLOCK(); goto out_in_multi_locked; } CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__); error = igmp_change_state(inm); IN_MULTI_LIST_UNLOCK(); if (error) CTR1(KTR_IGMPV3, "%s: failed igmp downcall", __func__); out_in_multi_locked: IN_MULTI_UNLOCK(); out_imf_rollback: if (error) imf_rollback(imf); else imf_commit(imf); imf_reap(imf); out_inp_locked: INP_WUNLOCK(inp); return (error); } /* * Set the IP multicast options in response to user setsockopt(). * * Many of the socket options handled in this function duplicate the * functionality of socket options in the regular unicast API. However, * it is not possible to merge the duplicate code, because the idempotence * of the IPv4 multicast part of the BSD Sockets API must be preserved; * the effects of these options must be treated as separate and distinct. * * SMPng: XXX: Unlocked read of inp_socket believed OK. * FUTURE: The IP_MULTICAST_VIF option may be eliminated if MROUTING * is refactored to no longer use vifs. */ int inp_setmoptions(struct inpcb *inp, struct sockopt *sopt) { struct ip_moptions *imo; int error; error = 0; /* * If socket is neither of type SOCK_RAW or SOCK_DGRAM, * or is a divert socket, reject it. */ if (inp->inp_socket->so_proto->pr_protocol == IPPROTO_DIVERT || (inp->inp_socket->so_proto->pr_type != SOCK_RAW && inp->inp_socket->so_proto->pr_type != SOCK_DGRAM)) return (EOPNOTSUPP); switch (sopt->sopt_name) { case IP_MULTICAST_VIF: { int vifi; /* * Select a multicast VIF for transmission. * Only useful if multicast forwarding is active. */ if (legal_vif_num == NULL) { error = EOPNOTSUPP; break; } error = sooptcopyin(sopt, &vifi, sizeof(int), sizeof(int)); if (error) break; if (!legal_vif_num(vifi) && (vifi != -1)) { error = EINVAL; break; } imo = inp_findmoptions(inp); imo->imo_multicast_vif = vifi; INP_WUNLOCK(inp); break; } case IP_MULTICAST_IF: error = inp_set_multicast_if(inp, sopt); break; case IP_MULTICAST_TTL: { u_char ttl; /* * Set the IP time-to-live for outgoing multicast packets. * The original multicast API required a char argument, * which is inconsistent with the rest of the socket API. * We allow either a char or an int. */ if (sopt->sopt_valsize == sizeof(u_char)) { error = sooptcopyin(sopt, &ttl, sizeof(u_char), sizeof(u_char)); if (error) break; } else { u_int ittl; error = sooptcopyin(sopt, &ittl, sizeof(u_int), sizeof(u_int)); if (error) break; if (ittl > 255) { error = EINVAL; break; } ttl = (u_char)ittl; } imo = inp_findmoptions(inp); imo->imo_multicast_ttl = ttl; INP_WUNLOCK(inp); break; } case IP_MULTICAST_LOOP: { u_char loop; /* * Set the loopback flag for outgoing multicast packets. * Must be zero or one. The original multicast API required a * char argument, which is inconsistent with the rest * of the socket API. We allow either a char or an int. 
*/ if (sopt->sopt_valsize == sizeof(u_char)) { error = sooptcopyin(sopt, &loop, sizeof(u_char), sizeof(u_char)); if (error) break; } else { u_int iloop; error = sooptcopyin(sopt, &iloop, sizeof(u_int), sizeof(u_int)); if (error) break; loop = (u_char)iloop; } imo = inp_findmoptions(inp); imo->imo_multicast_loop = !!loop; INP_WUNLOCK(inp); break; } case IP_ADD_MEMBERSHIP: case IP_ADD_SOURCE_MEMBERSHIP: case MCAST_JOIN_GROUP: case MCAST_JOIN_SOURCE_GROUP: error = inp_join_group(inp, sopt); break; case IP_DROP_MEMBERSHIP: case IP_DROP_SOURCE_MEMBERSHIP: case MCAST_LEAVE_GROUP: case MCAST_LEAVE_SOURCE_GROUP: error = inp_leave_group(inp, sopt); break; case IP_BLOCK_SOURCE: case IP_UNBLOCK_SOURCE: case MCAST_BLOCK_SOURCE: case MCAST_UNBLOCK_SOURCE: error = inp_block_unblock_source(inp, sopt); break; case IP_MSFILTER: error = inp_set_source_filters(inp, sopt); break; default: error = EOPNOTSUPP; break; } INP_UNLOCK_ASSERT(inp); return (error); } /* * Expose IGMP's multicast filter mode and source list(s) to userland, * keyed by (ifindex, group). * The filter mode is written out as a uint32_t, followed by * 0..n of struct in_addr. * For use by ifmcstat(8). * SMPng: NOTE: unlocked read of ifindex space. */ static int sysctl_ip_mcast_filters(SYSCTL_HANDLER_ARGS) { struct in_addr src, group; struct ifnet *ifp; struct ifmultiaddr *ifma; struct in_multi *inm; struct ip_msource *ims; int *name; int retval; u_int namelen; uint32_t fmode, ifindex; name = (int *)arg1; namelen = arg2; if (req->newptr != NULL) return (EPERM); if (namelen != 2) return (EINVAL); ifindex = name[0]; if (ifindex <= 0 || ifindex > V_if_index) { CTR2(KTR_IGMPV3, "%s: ifindex %u out of range", __func__, ifindex); return (ENOENT); } group.s_addr = name[1]; if (!IN_MULTICAST(ntohl(group.s_addr))) { CTR2(KTR_IGMPV3, "%s: group 0x%08x is not multicast", __func__, ntohl(group.s_addr)); return (EINVAL); } ifp = ifnet_byindex(ifindex); if (ifp == NULL) { CTR2(KTR_IGMPV3, "%s: no ifp for ifindex %u", __func__, ifindex); return (ENOENT); } retval = sysctl_wire_old_buffer(req, sizeof(uint32_t) + (in_mcast_maxgrpsrc * sizeof(struct in_addr))); if (retval) return (retval); IN_MULTI_LIST_LOCK(); IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) continue; inm = (struct in_multi *)ifma->ifma_protospec; if (!in_hosteq(inm->inm_addr, group)) continue; fmode = inm->inm_st[1].iss_fmode; retval = SYSCTL_OUT(req, &fmode, sizeof(uint32_t)); if (retval != 0) break; RB_FOREACH(ims, ip_msource_tree, &inm->inm_srcs) { CTR2(KTR_IGMPV3, "%s: visit node 0x%08x", __func__, ims->ims_haddr); /* * Only copy-out sources which are in-mode. 
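As the option parser above spells out, IP_MULTICAST_TTL and IP_MULTICAST_LOOP accept either the historical byte-sized argument or an int, distinguished by sopt_valsize. A minimal sketch showing both widths for the TTL case:

#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <err.h>

/*
 * Both calls are accepted: the original multicast API took a byte,
 * the modern convention is an int, and the kernel rejects int values
 * above 255 rather than silently truncating them.
 */
static void
set_mcast_ttl(int s)
{
    u_char ttl8 = 32;
    int ttl32 = 32;

    if (setsockopt(s, IPPROTO_IP, IP_MULTICAST_TTL, &ttl8,
        sizeof(ttl8)) == -1)
        err(1, "IP_MULTICAST_TTL (u_char)");
    if (setsockopt(s, IPPROTO_IP, IP_MULTICAST_TTL, &ttl32,
        sizeof(ttl32)) == -1)
        err(1, "IP_MULTICAST_TTL (int)");
}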
*/ if (fmode != ims_get_mode(inm, ims, 1)) { CTR1(KTR_IGMPV3, "%s: skip non-in-mode", __func__); continue; } src.s_addr = htonl(ims->ims_haddr); retval = SYSCTL_OUT(req, &src, sizeof(struct in_addr)); if (retval != 0) break; } } IF_ADDR_RUNLOCK(ifp); IN_MULTI_LIST_UNLOCK(); return (retval); } #if defined(KTR) && (KTR_COMPILE & KTR_IGMPV3) static const char *inm_modestrs[] = { "un", "in", "ex" }; static const char * inm_mode_str(const int mode) { if (mode >= MCAST_UNDEFINED && mode <= MCAST_EXCLUDE) return (inm_modestrs[mode]); return ("??"); } static const char *inm_statestrs[] = { "not-member", "silent", "idle", "lazy", "sleeping", "awakening", "query-pending", "sg-query-pending", "leaving" }; static const char * inm_state_str(const int state) { if (state >= IGMP_NOT_MEMBER && state <= IGMP_LEAVING_MEMBER) return (inm_statestrs[state]); return ("??"); } /* * Dump an in_multi structure to the console. */ void inm_print(const struct in_multi *inm) { int t; char addrbuf[INET_ADDRSTRLEN]; if ((ktr_mask & KTR_IGMPV3) == 0) return; printf("%s: --- begin inm %p ---\n", __func__, inm); printf("addr %s ifp %p(%s) ifma %p\n", inet_ntoa_r(inm->inm_addr, addrbuf), inm->inm_ifp, inm->inm_ifp->if_xname, inm->inm_ifma); printf("timer %u state %s refcount %u scq.len %u\n", inm->inm_timer, inm_state_str(inm->inm_state), inm->inm_refcount, inm->inm_scq.mq_len); printf("igi %p nsrc %lu sctimer %u scrv %u\n", inm->inm_igi, inm->inm_nsrc, inm->inm_sctimer, inm->inm_scrv); for (t = 0; t < 2; t++) { printf("t%d: fmode %s asm %u ex %u in %u rec %u\n", t, inm_mode_str(inm->inm_st[t].iss_fmode), inm->inm_st[t].iss_asm, inm->inm_st[t].iss_ex, inm->inm_st[t].iss_in, inm->inm_st[t].iss_rec); } printf("%s: --- end inm %p ---\n", __func__, inm); } #else /* !KTR || !(KTR_COMPILE & KTR_IGMPV3) */ void inm_print(const struct in_multi *inm) { } #endif /* KTR && (KTR_COMPILE & KTR_IGMPV3) */ RB_GENERATE(ip_msource_tree, ip_msource, ims_link, ip_msource_cmp); Index: head/sys/netinet/in_var.h =================================================================== --- head/sys/netinet/in_var.h (revision 333308) +++ head/sys/netinet/in_var.h (revision 333309) @@ -1,429 +1,433 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1985, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)in_var.h 8.2 (Berkeley) 1/9/95 * $FreeBSD$ */ #ifndef _NETINET_IN_VAR_H_ #define _NETINET_IN_VAR_H_ /* * Argument structure for SIOCAIFADDR. */ struct in_aliasreq { char ifra_name[IFNAMSIZ]; /* if name, e.g. "en0" */ struct sockaddr_in ifra_addr; struct sockaddr_in ifra_broadaddr; #define ifra_dstaddr ifra_broadaddr struct sockaddr_in ifra_mask; int ifra_vhid; }; #ifdef _KERNEL #include #include #include struct igmp_ifsoftc; struct in_multi; struct lltable; SLIST_HEAD(in_multi_head, in_multi); /* * IPv4 per-interface state. */ struct in_ifinfo { struct lltable *ii_llt; /* ARP state */ struct igmp_ifsoftc *ii_igmp; /* IGMP state */ struct in_multi *ii_allhosts; /* 224.0.0.1 membership */ }; /* * Interface address, Internet version. One of these structures * is allocated for each Internet address on an interface. * The ifaddr structure contains the protocol-independent part * of the structure and is assumed to be first. */ struct in_ifaddr { struct ifaddr ia_ifa; /* protocol-independent info */ #define ia_ifp ia_ifa.ifa_ifp #define ia_flags ia_ifa.ifa_flags /* ia_subnet{,mask} in host order */ u_long ia_subnet; /* subnet address */ u_long ia_subnetmask; /* mask of subnet */ LIST_ENTRY(in_ifaddr) ia_hash; /* entry in bucket of inet addresses */ TAILQ_ENTRY(in_ifaddr) ia_link; /* list of internet addresses */ struct sockaddr_in ia_addr; /* reserve space for interface name */ struct sockaddr_in ia_dstaddr; /* reserve space for broadcast addr */ #define ia_broadaddr ia_dstaddr struct sockaddr_in ia_sockmask; /* reserve space for general netmask */ struct callout ia_garp_timer; /* timer for retransmitting GARPs */ int ia_garp_count; /* count of retransmitted GARPs */ }; /* * Given a pointer to an in_ifaddr (ifaddr), * return a pointer to the addr as a sockaddr_in. */ #define IA_SIN(ia) (&(((struct in_ifaddr *)(ia))->ia_addr)) #define IA_DSTSIN(ia) (&(((struct in_ifaddr *)(ia))->ia_dstaddr)) #define IA_MASKSIN(ia) (&(((struct in_ifaddr *)(ia))->ia_sockmask)) #define IN_LNAOF(in, ifa) \ ((ntohl((in).s_addr) & ~((struct in_ifaddr *)(ifa)->ia_subnetmask)) extern u_char inetctlerrmap[]; #define LLTABLE(ifp) \ ((struct in_ifinfo *)(ifp)->if_afdata[AF_INET])->ii_llt /* * Hash table for IP addresses. 
*/ TAILQ_HEAD(in_ifaddrhead, in_ifaddr); LIST_HEAD(in_ifaddrhashhead, in_ifaddr); VNET_DECLARE(struct in_ifaddrhashhead *, in_ifaddrhashtbl); VNET_DECLARE(struct in_ifaddrhead, in_ifaddrhead); VNET_DECLARE(u_long, in_ifaddrhmask); /* mask for hash table */ #define V_in_ifaddrhashtbl VNET(in_ifaddrhashtbl) #define V_in_ifaddrhead VNET(in_ifaddrhead) #define V_in_ifaddrhmask VNET(in_ifaddrhmask) #define INADDR_NHASH_LOG2 9 #define INADDR_NHASH (1 << INADDR_NHASH_LOG2) #define INADDR_HASHVAL(x) fnv_32_buf((&(x)), sizeof(x), FNV1_32_INIT) #define INADDR_HASH(x) \ (&V_in_ifaddrhashtbl[INADDR_HASHVAL(x) & V_in_ifaddrhmask]) extern struct rmlock in_ifaddr_lock; #define IN_IFADDR_LOCK_ASSERT() rm_assert(&in_ifaddr_lock, RA_LOCKED) #define IN_IFADDR_RLOCK(t) rm_rlock(&in_ifaddr_lock, (t)) #define IN_IFADDR_RLOCK_ASSERT() rm_assert(&in_ifaddr_lock, RA_RLOCKED) #define IN_IFADDR_RUNLOCK(t) rm_runlock(&in_ifaddr_lock, (t)) #define IN_IFADDR_WLOCK() rm_wlock(&in_ifaddr_lock) #define IN_IFADDR_WLOCK_ASSERT() rm_assert(&in_ifaddr_lock, RA_WLOCKED) #define IN_IFADDR_WUNLOCK() rm_wunlock(&in_ifaddr_lock) /* * Macro for finding the internet address structure (in_ifaddr) * corresponding to one of our IP addresses (in_addr). */ #define INADDR_TO_IFADDR(addr, ia) \ /* struct in_addr addr; */ \ /* struct in_ifaddr *ia; */ \ do { \ \ LIST_FOREACH(ia, INADDR_HASH((addr).s_addr), ia_hash) \ if (IA_SIN(ia)->sin_addr.s_addr == (addr).s_addr) \ break; \ } while (0) /* * Macro for finding the interface (ifnet structure) corresponding to one * of our IP addresses. */ #define INADDR_TO_IFP(addr, ifp) \ /* struct in_addr addr; */ \ /* struct ifnet *ifp; */ \ { \ struct in_ifaddr *ia; \ \ INADDR_TO_IFADDR(addr, ia); \ (ifp) = (ia == NULL) ? NULL : ia->ia_ifp; \ } /* * Macro for finding the internet address structure (in_ifaddr) corresponding * to a given interface (ifnet structure). */ #define IFP_TO_IA(ifp, ia, t) \ /* struct ifnet *ifp; */ \ /* struct in_ifaddr *ia; */ \ /* struct rm_priotracker *t; */ \ do { \ IN_IFADDR_RLOCK((t)); \ for ((ia) = TAILQ_FIRST(&V_in_ifaddrhead); \ (ia) != NULL && (ia)->ia_ifp != (ifp); \ (ia) = TAILQ_NEXT((ia), ia_link)) \ continue; \ if ((ia) != NULL) \ ifa_ref(&(ia)->ia_ifa); \ IN_IFADDR_RUNLOCK((t)); \ } while (0) /* * Legacy IPv4 IGMP per-link structure. */ struct router_info { struct ifnet *rti_ifp; int rti_type; /* type of router which is querier on this interface */ int rti_time; /* # of slow timeouts since last old query */ SLIST_ENTRY(router_info) rti_list; }; /* * IPv4 multicast IGMP-layer source entry. */ struct ip_msource { RB_ENTRY(ip_msource) ims_link; /* RB tree links */ in_addr_t ims_haddr; /* host byte order */ struct ims_st { uint16_t ex; /* # of exclusive members */ uint16_t in; /* # of inclusive members */ } ims_st[2]; /* state at t0, t1 */ uint8_t ims_stp; /* pending query */ }; /* * IPv4 multicast PCB-layer source entry. */ struct in_msource { RB_ENTRY(ip_msource) ims_link; /* RB tree links */ in_addr_t ims_haddr; /* host byte order */ uint8_t imsl_st[2]; /* state before/at commit */ }; RB_HEAD(ip_msource_tree, ip_msource); /* define struct ip_msource_tree */ static __inline int ip_msource_cmp(const struct ip_msource *a, const struct ip_msource *b) { if (a->ims_haddr < b->ims_haddr) return (-1); if (a->ims_haddr == b->ims_haddr) return (0); return (1); } RB_PROTOTYPE(ip_msource_tree, ip_msource, ims_link, ip_msource_cmp); /* * IPv4 multicast PCB-layer group filter descriptor. 
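Both the PCB-layer and IGMP-layer source sets declared above are red-black trees keyed by the host-order source address through ip_msource_cmp(). A minimal userland sketch of the same pattern, assuming the <sys/tree.h> macros as shipped with FreeBSD and using a stand-in node type in place of struct ip_msource:

#include <sys/tree.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <stdio.h>
#include <stdlib.h>

struct src_node {
    RB_ENTRY(src_node) link;
    in_addr_t haddr;            /* host byte order, as in ip_msource */
};

static int
src_cmp(const struct src_node *a, const struct src_node *b)
{
    /* Same total order as ip_msource_cmp(): plain host-order compare. */
    if (a->haddr < b->haddr)
        return (-1);
    if (a->haddr == b->haddr)
        return (0);
    return (1);
}

RB_HEAD(src_tree, src_node);
RB_GENERATE_STATIC(src_tree, src_node, link, src_cmp);

int
main(void)
{
    struct src_tree head = RB_INITIALIZER(&head);
    struct src_node *n, key;
    const char *addrs[] = { "192.0.2.10", "192.0.2.4", "192.0.2.7" };
    size_t i;

    for (i = 0; i < 3; i++) {
        n = calloc(1, sizeof(*n));
        n->haddr = ntohl(inet_addr(addrs[i]));
        RB_INSERT(src_tree, &head, n);
    }

    /* Lookups descend from the root, much as imo_match_source() does. */
    key.haddr = ntohl(inet_addr("192.0.2.7"));
    n = RB_FIND(src_tree, &head, &key);
    printf("%s\n", n != NULL ? "found" : "missing");

    /* RB_FOREACH visits sources in increasing host-order address. */
    RB_FOREACH(n, src_tree, &head)
        printf("0x%08x\n", n->haddr);
    return (0);
}

Keeping the key in host order is what lets the comparator stay a plain integer compare while still giving a deterministic pre-order traversal for reports.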
*/ struct in_mfilter { struct ip_msource_tree imf_sources; /* source list for (S,G) */ u_long imf_nsrc; /* # of source entries */ uint8_t imf_st[2]; /* state before/at commit */ }; /* * IPv4 group descriptor. * * For every entry on an ifnet's if_multiaddrs list which represents * an IP multicast group, there is one of these structures. * * If any source filters are present, then a node will exist in the RB-tree * to permit fast lookup by source whenever an operation takes place. * This permits pre-order traversal when we issue reports. * Source filter trees are kept separately from the socket layer to * greatly simplify locking. * * When IGMPv3 is active, inm_timer is the response to group query timer. * The state-change timer inm_sctimer is separate; whenever state changes * for the group the state change record is generated and transmitted, * and kept if retransmissions are necessary. * * FUTURE: inm_link is now only used when groups are being purged * on a detaching ifnet. It could be demoted to a SLIST_ENTRY, but * because it is at the very start of the struct, we can't do this * w/o breaking the ABI for ifmcstat. */ struct in_multi { LIST_ENTRY(in_multi) inm_link; /* to-be-released by in_ifdetach */ struct in_addr inm_addr; /* IP multicast address, convenience */ struct ifnet *inm_ifp; /* back pointer to ifnet */ struct ifmultiaddr *inm_ifma; /* back pointer to ifmultiaddr */ u_int inm_timer; /* IGMPv1/v2 group / v3 query timer */ u_int inm_state; /* state of the membership */ void *inm_rti; /* unused, legacy field */ u_int inm_refcount; /* reference count */ /* New fields for IGMPv3 follow. */ struct igmp_ifsoftc *inm_igi; /* IGMP info */ SLIST_ENTRY(in_multi) inm_nrele; /* to-be-released by IGMP */ struct ip_msource_tree inm_srcs; /* tree of sources */ u_long inm_nsrc; /* # of tree entries */ struct mbufq inm_scq; /* queue of pending * state-change packets */ struct timeval inm_lastgsrtv; /* Time of last G-S-R query */ uint16_t inm_sctimer; /* state-change timer */ uint16_t inm_scrv; /* state-change rexmit count */ /* * SSM state counters which track state at T0 (the time the last * state-change report's RV timer went to zero) and T1 * (time of pending report, i.e. now). * Used for computing IGMPv3 state-change reports. Several refcounts * are maintained here to optimize for common use-cases. */ struct inm_st { uint16_t iss_fmode; /* IGMP filter mode */ uint16_t iss_asm; /* # of ASM listeners */ uint16_t iss_ex; /* # of exclusive members */ uint16_t iss_in; /* # of inclusive members */ uint16_t iss_rec; /* # of recorded sources */ } inm_st[2]; /* state at t0, t1 */ }; /* * Helper function to derive the filter mode on a source entry * from its internal counters. Predicates are: * A source is only excluded if all listeners exclude it. * A source is only included if no listeners exclude it, * and at least one listener includes it. * May be used by ifmcstat(8). */ static __inline uint8_t ims_get_mode(const struct in_multi *inm, const struct ip_msource *ims, uint8_t t) { t = !!t; if (inm->inm_st[t].iss_ex > 0 && inm->inm_st[t].iss_ex == ims->ims_st[t].ex) return (MCAST_EXCLUDE); else if (ims->ims_st[t].in > 0 && ims->ims_st[t].ex == 0) return (MCAST_INCLUDE); return (MCAST_UNDEFINED); } #ifdef SYSCTL_DECL SYSCTL_DECL(_net_inet); SYSCTL_DECL(_net_inet_ip); SYSCTL_DECL(_net_inet_raw); #endif /* * Lock macros for IPv4 layer multicast address lists. IPv4 lock goes * before link layer multicast locks in the lock order. 
In most cases, * consumers of IN_*_MULTI() macros should acquire the locks before * calling them; users of the in_{add,del}multi() functions should not. */ extern struct mtx in_multi_list_mtx; extern struct sx in_multi_sx; #define IN_MULTI_LIST_LOCK() mtx_lock(&in_multi_list_mtx) #define IN_MULTI_LIST_UNLOCK() mtx_unlock(&in_multi_list_mtx) #define IN_MULTI_LIST_LOCK_ASSERT() mtx_assert(&in_multi_list_mtx, MA_OWNED) #define IN_MULTI_LIST_UNLOCK_ASSERT() mtx_assert(&in_multi_list_mtx, MA_NOTOWNED) #define IN_MULTI_LOCK() sx_xlock(&in_multi_sx) #define IN_MULTI_UNLOCK() sx_xunlock(&in_multi_sx) #define IN_MULTI_LOCK_ASSERT() sx_assert(&in_multi_sx, SA_XLOCKED) #define IN_MULTI_UNLOCK_ASSERT() sx_assert(&in_multi_sx, SA_XUNLOCKED) +void inm_disconnect(struct in_multi *inm); +extern int ifma_restart; + /* Acquire an in_multi record. */ static __inline void inm_acquire_locked(struct in_multi *inm) { IN_MULTI_LIST_LOCK_ASSERT(); ++inm->inm_refcount; } static __inline void inm_acquire(struct in_multi *inm) { IN_MULTI_LIST_LOCK(); inm_acquire_locked(inm); IN_MULTI_LIST_UNLOCK(); } static __inline void inm_rele_locked(struct in_multi_head *inmh, struct in_multi *inm) { MPASS(inm->inm_refcount > 0); IN_MULTI_LIST_LOCK_ASSERT(); if (--inm->inm_refcount == 0) { MPASS(inmh != NULL); + inm_disconnect(inm); inm->inm_ifma->ifma_protospec = NULL; SLIST_INSERT_HEAD(inmh, inm, inm_nrele); } } /* * Return values for imo_multi_filter(). */ #define MCAST_PASS 0 /* Pass */ #define MCAST_NOTGMEMBER 1 /* This host not a member of group */ #define MCAST_NOTSMEMBER 2 /* This host excluded source */ #define MCAST_MUTED 3 /* [deprecated] */ struct rtentry; struct route; struct ip_moptions; struct in_multi *inm_lookup_locked(struct ifnet *, const struct in_addr); struct in_multi *inm_lookup(struct ifnet *, const struct in_addr); int imo_multi_filter(const struct ip_moptions *, const struct ifnet *, const struct sockaddr *, const struct sockaddr *); void inm_commit(struct in_multi *); void inm_clear_recorded(struct in_multi *); void inm_print(const struct in_multi *); int inm_record_source(struct in_multi *inm, const in_addr_t); void inm_release_deferred(struct in_multi *); void inm_release_list_deferred(struct in_multi_head *); struct in_multi * in_addmulti(struct in_addr *, struct ifnet *); int in_joingroup(struct ifnet *, const struct in_addr *, /*const*/ struct in_mfilter *, struct in_multi **); int in_joingroup_locked(struct ifnet *, const struct in_addr *, /*const*/ struct in_mfilter *, struct in_multi **); int in_leavegroup(struct in_multi *, /*const*/ struct in_mfilter *); int in_leavegroup_locked(struct in_multi *, /*const*/ struct in_mfilter *); int in_control(struct socket *, u_long, caddr_t, struct ifnet *, struct thread *); int in_addprefix(struct in_ifaddr *, int); int in_scrubprefix(struct in_ifaddr *, u_int); void in_ifscrub_all(void); void ip_input(struct mbuf *); void ip_direct_input(struct mbuf *); void in_ifadown(struct ifaddr *ifa, int); struct mbuf *ip_tryforward(struct mbuf *); void *in_domifattach(struct ifnet *); void in_domifdetach(struct ifnet *, void *); /* XXX */ void in_rtalloc_ign(struct route *ro, u_long ignflags, u_int fibnum); void in_rtredirect(struct sockaddr *, struct sockaddr *, struct sockaddr *, int, struct sockaddr *, u_int); #endif /* _KERNEL */ /* INET6 stuff */ #include #endif /* _NETINET_IN_VAR_H_ */ Index: head/sys/netinet6/in6_ifattach.c =================================================================== --- head/sys/netinet6/in6_ifattach.c (revision 333308) +++ 
head/sys/netinet6/in6_ifattach.c (revision 333309) @@ -1,901 +1,904 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: in6_ifattach.c,v 1.118 2001/05/24 07:44:00 itojun Exp $ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include VNET_DEFINE(unsigned long, in6_maxmtu) = 0; #ifdef IP6_AUTO_LINKLOCAL VNET_DEFINE(int, ip6_auto_linklocal) = IP6_AUTO_LINKLOCAL; #else VNET_DEFINE(int, ip6_auto_linklocal) = 1; /* enabled by default */ #endif VNET_DEFINE(struct callout, in6_tmpaddrtimer_ch); #define V_in6_tmpaddrtimer_ch VNET(in6_tmpaddrtimer_ch) VNET_DECLARE(struct inpcbinfo, ripcbinfo); #define V_ripcbinfo VNET(ripcbinfo) static int get_rand_ifid(struct ifnet *, struct in6_addr *); static int generate_tmp_ifid(u_int8_t *, const u_int8_t *, u_int8_t *); static int get_ifid(struct ifnet *, struct ifnet *, struct in6_addr *); static int in6_ifattach_linklocal(struct ifnet *, struct ifnet *); static int in6_ifattach_loopback(struct ifnet *); static void in6_purgemaddrs(struct ifnet *); #define EUI64_GBIT 0x01 #define EUI64_UBIT 0x02 #define EUI64_TO_IFID(in6) do {(in6)->s6_addr[8] ^= EUI64_UBIT; } while (0) #define EUI64_GROUP(in6) ((in6)->s6_addr[8] & EUI64_GBIT) #define EUI64_INDIVIDUAL(in6) (!EUI64_GROUP(in6)) #define EUI64_LOCAL(in6) ((in6)->s6_addr[8] & EUI64_UBIT) #define EUI64_UNIVERSAL(in6) (!EUI64_LOCAL(in6)) #define IFID_LOCAL(in6) (!EUI64_LOCAL(in6)) #define IFID_UNIVERSAL(in6) (!EUI64_UNIVERSAL(in6)) /* * Generate a last-resort interface identifier, when the machine has no * IEEE802/EUI64 address sources. * The goal here is to get an interface identifier that is * (1) random enough and (2) does not change across reboot. * We currently use MD5(hostname) for it. 
* * in6 - upper 64bits are preserved */ static int get_rand_ifid(struct ifnet *ifp, struct in6_addr *in6) { MD5_CTX ctxt; struct prison *pr; u_int8_t digest[16]; int hostnamelen; pr = curthread->td_ucred->cr_prison; mtx_lock(&pr->pr_mtx); hostnamelen = strlen(pr->pr_hostname); #if 0 /* we need at least several letters as seed for ifid */ if (hostnamelen < 3) { mtx_unlock(&pr->pr_mtx); return -1; } #endif /* generate 8 bytes of pseudo-random value. */ bzero(&ctxt, sizeof(ctxt)); MD5Init(&ctxt); MD5Update(&ctxt, pr->pr_hostname, hostnamelen); mtx_unlock(&pr->pr_mtx); MD5Final(digest, &ctxt); /* assumes sizeof(digest) > sizeof(ifid) */ bcopy(digest, &in6->s6_addr[8], 8); /* make sure to set "u" bit to local, and "g" bit to individual. */ in6->s6_addr[8] &= ~EUI64_GBIT; /* g bit to "individual" */ in6->s6_addr[8] |= EUI64_UBIT; /* u bit to "local" */ /* convert EUI64 into IPv6 interface identifier */ EUI64_TO_IFID(in6); return 0; } static int generate_tmp_ifid(u_int8_t *seed0, const u_int8_t *seed1, u_int8_t *ret) { MD5_CTX ctxt; u_int8_t seed[16], digest[16], nullbuf[8]; u_int32_t val32; /* If there's no history, start with a random seed. */ bzero(nullbuf, sizeof(nullbuf)); if (bcmp(nullbuf, seed0, sizeof(nullbuf)) == 0) { int i; for (i = 0; i < 2; i++) { val32 = arc4random(); bcopy(&val32, seed + sizeof(val32) * i, sizeof(val32)); } } else bcopy(seed0, seed, 8); /* copy the right-most 64-bits of the given address */ /* XXX assumption on the size of IFID */ bcopy(seed1, &seed[8], 8); if (0) { /* for debugging purposes only */ int i; printf("generate_tmp_ifid: new randomized ID from: "); for (i = 0; i < 16; i++) printf("%02x", seed[i]); printf(" "); } /* generate 16 bytes of pseudo-random value. */ bzero(&ctxt, sizeof(ctxt)); MD5Init(&ctxt); MD5Update(&ctxt, seed, sizeof(seed)); MD5Final(digest, &ctxt); /* * RFC 3041 3.2.1. (3) * Take the left-most 64-bits of the MD5 digest and set bit 6 (the * left-most bit is numbered 0) to zero. */ bcopy(digest, ret, 8); ret[0] &= ~EUI64_UBIT; /* * XXX: we'd like to ensure that the generated value is not zero * for simplicity. If the caclculated digest happens to be zero, * use a random non-zero value as the last resort. */ if (bcmp(nullbuf, ret, sizeof(nullbuf)) == 0) { nd6log((LOG_INFO, "generate_tmp_ifid: computed MD5 value is zero.\n")); val32 = arc4random(); val32 = 1 + (val32 % (0xffffffff - 1)); } /* * RFC 3041 3.2.1. (4) * Take the rightmost 64-bits of the MD5 digest and save them in * stable storage as the history value to be used in the next * iteration of the algorithm. */ bcopy(&digest[8], seed0, 8); if (0) { /* for debugging purposes only */ int i; printf("to: "); for (i = 0; i < 16; i++) printf("%02x", digest[i]); printf("\n"); } return 0; } /* * Get interface identifier for the specified interface. 
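 * For a 6-byte IEEE802 address this produces the modified EUI-64 form;
 * e.g. (worked example, not part of this change) a MAC of
 * 00:11:22:33:44:55 ends up as
 *
 *	in6->s6_addr[8..15] = 02 11 22 ff fe 33 44 55
 *
 * i.e. ff:fe is inserted in the middle and the universal/local bit is
 * flipped by EUI64_TO_IFID().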
* XXX assumes single sockaddr_dl (AF_LINK address) per an interface * * in6 - upper 64bits are preserved */ int in6_get_hw_ifid(struct ifnet *ifp, struct in6_addr *in6) { struct ifaddr *ifa; struct sockaddr_dl *sdl; u_int8_t *addr; size_t addrlen; static u_int8_t allzero[8] = { 0, 0, 0, 0, 0, 0, 0, 0 }; static u_int8_t allone[8] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_LINK) continue; sdl = (struct sockaddr_dl *)ifa->ifa_addr; if (sdl == NULL) continue; if (sdl->sdl_alen == 0) continue; goto found; } IF_ADDR_RUNLOCK(ifp); return -1; found: IF_ADDR_LOCK_ASSERT(ifp); addr = LLADDR(sdl); addrlen = sdl->sdl_alen; /* get EUI64 */ switch (ifp->if_type) { case IFT_BRIDGE: case IFT_ETHER: case IFT_L2VLAN: case IFT_ATM: case IFT_IEEE1394: /* IEEE802/EUI64 cases - what others? */ /* IEEE1394 uses 16byte length address starting with EUI64 */ if (addrlen > 8) addrlen = 8; /* look at IEEE802/EUI64 only */ if (addrlen != 8 && addrlen != 6) { IF_ADDR_RUNLOCK(ifp); return -1; } /* * check for invalid MAC address - on bsdi, we see it a lot * since wildboar configures all-zero MAC on pccard before * card insertion. */ if (bcmp(addr, allzero, addrlen) == 0) { IF_ADDR_RUNLOCK(ifp); return -1; } if (bcmp(addr, allone, addrlen) == 0) { IF_ADDR_RUNLOCK(ifp); return -1; } /* make EUI64 address */ if (addrlen == 8) bcopy(addr, &in6->s6_addr[8], 8); else if (addrlen == 6) { in6->s6_addr[8] = addr[0]; in6->s6_addr[9] = addr[1]; in6->s6_addr[10] = addr[2]; in6->s6_addr[11] = 0xff; in6->s6_addr[12] = 0xfe; in6->s6_addr[13] = addr[3]; in6->s6_addr[14] = addr[4]; in6->s6_addr[15] = addr[5]; } break; case IFT_GIF: case IFT_STF: /* * RFC2893 says: "SHOULD use IPv4 address as ifid source". * however, IPv4 address is not very suitable as unique * identifier source (can be renumbered). * we don't do this. */ IF_ADDR_RUNLOCK(ifp); return -1; default: IF_ADDR_RUNLOCK(ifp); return -1; } /* sanity check: g bit must not indicate "group" */ if (EUI64_GROUP(in6)) { IF_ADDR_RUNLOCK(ifp); return -1; } /* convert EUI64 into IPv6 interface identifier */ EUI64_TO_IFID(in6); /* * sanity check: ifid must not be all zero, avoid conflict with * subnet router anycast */ if ((in6->s6_addr[8] & ~(EUI64_GBIT | EUI64_UBIT)) == 0x00 && bcmp(&in6->s6_addr[9], allzero, 7) == 0) { IF_ADDR_RUNLOCK(ifp); return -1; } IF_ADDR_RUNLOCK(ifp); return 0; } /* * Get interface identifier for the specified interface. If it is not * available on ifp0, borrow interface identifier from other information * sources. * * altifp - secondary EUI64 source */ static int get_ifid(struct ifnet *ifp0, struct ifnet *altifp, struct in6_addr *in6) { struct ifnet *ifp; /* first, try to get it from the interface itself */ if (in6_get_hw_ifid(ifp0, in6) == 0) { nd6log((LOG_DEBUG, "%s: got interface identifier from itself\n", if_name(ifp0))); goto success; } /* try secondary EUI64 source. 
this basically is for ATM PVC */ if (altifp && in6_get_hw_ifid(altifp, in6) == 0) { nd6log((LOG_DEBUG, "%s: got interface identifier from %s\n", if_name(ifp0), if_name(altifp))); goto success; } /* next, try to get it from some other hardware interface */ IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (ifp == ifp0) continue; if (in6_get_hw_ifid(ifp, in6) != 0) continue; /* * to borrow ifid from other interface, ifid needs to be * globally unique */ if (IFID_UNIVERSAL(in6)) { nd6log((LOG_DEBUG, "%s: borrow interface identifier from %s\n", if_name(ifp0), if_name(ifp))); IFNET_RUNLOCK_NOSLEEP(); goto success; } } IFNET_RUNLOCK_NOSLEEP(); /* last resort: get from random number source */ if (get_rand_ifid(ifp, in6) == 0) { nd6log((LOG_DEBUG, "%s: interface identifier generated by random number\n", if_name(ifp0))); goto success; } printf("%s: failed to get interface identifier\n", if_name(ifp0)); return -1; success: nd6log((LOG_INFO, "%s: ifid: %02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x\n", if_name(ifp0), in6->s6_addr[8], in6->s6_addr[9], in6->s6_addr[10], in6->s6_addr[11], in6->s6_addr[12], in6->s6_addr[13], in6->s6_addr[14], in6->s6_addr[15])); return 0; } /* * altifp - secondary EUI64 source */ static int in6_ifattach_linklocal(struct ifnet *ifp, struct ifnet *altifp) { struct in6_ifaddr *ia; struct in6_aliasreq ifra; struct nd_prefixctl pr0; struct nd_prefix *pr; int error; /* * configure link-local address. */ in6_prepare_ifra(&ifra, NULL, &in6mask64); ifra.ifra_addr.sin6_addr.s6_addr32[0] = htonl(0xfe800000); ifra.ifra_addr.sin6_addr.s6_addr32[1] = 0; if ((ifp->if_flags & IFF_LOOPBACK) != 0) { ifra.ifra_addr.sin6_addr.s6_addr32[2] = 0; ifra.ifra_addr.sin6_addr.s6_addr32[3] = htonl(1); } else { if (get_ifid(ifp, altifp, &ifra.ifra_addr.sin6_addr) != 0) { nd6log((LOG_ERR, "%s: no ifid available\n", if_name(ifp))); return (-1); } } if (in6_setscope(&ifra.ifra_addr.sin6_addr, ifp, NULL)) return (-1); /* link-local addresses should NEVER expire. */ ifra.ifra_lifetime.ia6t_vltime = ND6_INFINITE_LIFETIME; ifra.ifra_lifetime.ia6t_pltime = ND6_INFINITE_LIFETIME; /* * Now call in6_update_ifa() to do a bunch of procedures to configure * a link-local address. We can set the 3rd argument to NULL, because * we know there's no other link-local address on the interface * and therefore we are adding one (instead of updating one). */ if ((error = in6_update_ifa(ifp, &ifra, NULL, IN6_IFAUPDATE_DADDELAY)) != 0) { /* * XXX: When the interface does not support IPv6, this call * would fail in the SIOCSIFADDR ioctl. I believe the * notification is rather confusing in this case, so just * suppress it. (jinmei@kame.net 20010130) */ if (error != EAFNOSUPPORT) nd6log((LOG_NOTICE, "in6_ifattach_linklocal: failed to " "configure a link-local address on %s " "(errno=%d)\n", if_name(ifp), error)); return (-1); } ia = in6ifa_ifpforlinklocal(ifp, 0); /* ia must not be NULL */ KASSERT(ia != NULL, ("%s: ia == NULL, ifp=%p", __func__, ifp)); ifa_free(&ia->ia_ifa); /* * Make the link-local prefix (fe80::%link/64) as on-link. * Since we'd like to manage prefixes separately from addresses, * we make an ND6 prefix structure for the link-local prefix, * and add it to the prefix list as a never-expire prefix. * XXX: this change might affect some existing code base... */ bzero(&pr0, sizeof(pr0)); pr0.ndpr_ifp = ifp; /* this should be 64 at this moment. */ pr0.ndpr_plen = in6_mask2len(&ifra.ifra_prefixmask.sin6_addr, NULL); pr0.ndpr_prefix = ifra.ifra_addr; /* apply the mask for safety. 
(nd6_prelist_add will apply it again) */ IN6_MASK_ADDR(&pr0.ndpr_prefix.sin6_addr, &in6mask64); /* * Initialize parameters. The link-local prefix must always be * on-link, and its lifetimes never expire. */ pr0.ndpr_raf_onlink = 1; pr0.ndpr_raf_auto = 1; /* probably meaningless */ pr0.ndpr_vltime = ND6_INFINITE_LIFETIME; pr0.ndpr_pltime = ND6_INFINITE_LIFETIME; /* * Since there is no other link-local addresses, nd6_prefix_lookup() * probably returns NULL. However, we cannot always expect the result. * For example, if we first remove the (only) existing link-local * address, and then reconfigure another one, the prefix is still * valid with referring to the old link-local address. */ if ((pr = nd6_prefix_lookup(&pr0)) == NULL) { if ((error = nd6_prelist_add(&pr0, NULL, NULL)) != 0) return (error); } else nd6_prefix_rele(pr); return 0; } /* * ifp - must be IFT_LOOP */ static int in6_ifattach_loopback(struct ifnet *ifp) { struct in6_aliasreq ifra; int error; in6_prepare_ifra(&ifra, &in6addr_loopback, &in6mask128); /* * Always initialize ia_dstaddr (= broadcast address) to loopback * address. Follows IPv4 practice - see in_ifinit(). */ ifra.ifra_dstaddr.sin6_len = sizeof(struct sockaddr_in6); ifra.ifra_dstaddr.sin6_family = AF_INET6; ifra.ifra_dstaddr.sin6_addr = in6addr_loopback; /* the loopback address should NEVER expire. */ ifra.ifra_lifetime.ia6t_vltime = ND6_INFINITE_LIFETIME; ifra.ifra_lifetime.ia6t_pltime = ND6_INFINITE_LIFETIME; /* * We are sure that this is a newly assigned address, so we can set * NULL to the 3rd arg. */ if ((error = in6_update_ifa(ifp, &ifra, NULL, 0)) != 0) { nd6log((LOG_ERR, "in6_ifattach_loopback: failed to configure " "the loopback address on %s (errno=%d)\n", if_name(ifp), error)); return (-1); } return 0; } /* * compute NI group address, based on the current hostname setting. * see RFC 4620. * * when ifp == NULL, the caller is responsible for filling scopeid. * * If oldmcprefix == 1, FF02:0:0:0:0:2::/96 is used for NI group address * while it is FF02:0:0:0:0:2:FF00::/104 in RFC 4620. */ static int in6_nigroup0(struct ifnet *ifp, const char *name, int namelen, struct in6_addr *in6, int oldmcprefix) { struct prison *pr; const char *p; u_char *q; MD5_CTX ctxt; u_int8_t digest[16]; char l; char n[64]; /* a single label must not exceed 63 chars */ /* * If no name is given and namelen is -1, * we try to do the hostname lookup ourselves. */ if (!name && namelen == -1) { pr = curthread->td_ucred->cr_prison; mtx_lock(&pr->pr_mtx); name = pr->pr_hostname; namelen = strlen(name); } else pr = NULL; if (!name || !namelen) { if (pr != NULL) mtx_unlock(&pr->pr_mtx); return -1; } p = name; while (p && *p && *p != '.' && p - name < namelen) p++; if (p == name || p - name > sizeof(n) - 1) { if (pr != NULL) mtx_unlock(&pr->pr_mtx); return -1; /* label too long */ } l = p - name; strncpy(n, name, l); if (pr != NULL) mtx_unlock(&pr->pr_mtx); n[(int)l] = '\0'; for (q = n; *q; q++) { if ('A' <= *q && *q <= 'Z') *q = *q - 'A' + 'a'; } /* generate 16 bytes of pseudo-random value. */ bzero(&ctxt, sizeof(ctxt)); MD5Init(&ctxt); MD5Update(&ctxt, &l, sizeof(l)); MD5Update(&ctxt, n, l); MD5Final(digest, &ctxt); bzero(in6, sizeof(*in6)); in6->s6_addr16[0] = IPV6_ADDR_INT16_MLL; in6->s6_addr8[11] = 2; if (oldmcprefix == 0) { in6->s6_addr8[12] = 0xff; /* Copy the first 24 bits of 128-bit hash into the address. */ bcopy(digest, &in6->s6_addr8[13], 3); } else { /* Copy the first 32 bits of 128-bit hash into the address. 
*/ bcopy(digest, &in6->s6_addr32[3], sizeof(in6->s6_addr32[3])); } if (in6_setscope(in6, ifp, NULL)) return (-1); /* XXX: should not fail */ return 0; } int in6_nigroup(struct ifnet *ifp, const char *name, int namelen, struct in6_addr *in6) { return (in6_nigroup0(ifp, name, namelen, in6, 0)); } int in6_nigroup_oldmcprefix(struct ifnet *ifp, const char *name, int namelen, struct in6_addr *in6) { return (in6_nigroup0(ifp, name, namelen, in6, 1)); } /* * XXX multiple loopback interface needs more care. for instance, * nodelocal address needs to be configured onto only one of them. * XXX multiple link-local address case * * altifp - secondary EUI64 source */ void in6_ifattach(struct ifnet *ifp, struct ifnet *altifp) { struct in6_ifaddr *ia; if (ifp->if_afdata[AF_INET6] == NULL) return; /* * quirks based on interface type */ switch (ifp->if_type) { case IFT_STF: /* * 6to4 interface is a very special kind of beast. * no multicast, no linklocal. RFC2529 specifies how to make * linklocals for 6to4 interface, but there's no use and * it is rather harmful to have one. */ ND_IFINFO(ifp)->flags &= ~ND6_IFF_AUTO_LINKLOCAL; break; default: break; } /* * usually, we require multicast capability to the interface */ if ((ifp->if_flags & IFF_MULTICAST) == 0) { nd6log((LOG_INFO, "in6_ifattach: " "%s is not multicast capable, IPv6 not enabled\n", if_name(ifp))); return; } /* * assign loopback address for loopback interface. */ if ((ifp->if_flags & IFF_LOOPBACK) != 0) { /* * check that loopback address doesn't exist yet. */ ia = in6ifa_ifwithaddr(&in6addr_loopback, 0); if (ia == NULL) in6_ifattach_loopback(ifp); else ifa_free(&ia->ia_ifa); } /* * assign a link-local address, if there's none. */ if (!(ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) && ND_IFINFO(ifp)->flags & ND6_IFF_AUTO_LINKLOCAL) { ia = in6ifa_ifpforlinklocal(ifp, 0); if (ia == NULL) in6_ifattach_linklocal(ifp, altifp); else ifa_free(&ia->ia_ifa); } /* update dynamically. */ if (V_in6_maxmtu < ifp->if_mtu) V_in6_maxmtu = ifp->if_mtu; } /* * NOTE: in6_ifdetach() does not support loopback if at this moment. * * When shutting down a VNET we clean up layers top-down. In that case * upper layer protocols (ulp) are cleaned up already and locks are destroyed * and we must not call into these cleanup functions anymore, thus purgeulp * is set to 0 in that case by in6_ifdetach_destroy(). * The normal case of destroying a (cloned) interface still needs to cleanup * everything related to the interface and will have purgeulp set to 1. */ static void _in6_ifdetach(struct ifnet *ifp, int purgeulp) { struct ifaddr *ifa, *next; if (ifp->if_afdata[AF_INET6] == NULL) return; /* * nuke any of IPv6 addresses we have * XXX: all addresses should be already removed */ TAILQ_FOREACH_SAFE(ifa, &ifp->if_addrhead, ifa_link, next) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; in6_purgeaddr(ifa); } if (purgeulp) { in6_pcbpurgeif0(&V_udbinfo, ifp); in6_pcbpurgeif0(&V_ulitecbinfo, ifp); in6_pcbpurgeif0(&V_ripcbinfo, ifp); } /* leave from all multicast groups joined */ in6_purgemaddrs(ifp); /* * Remove neighbor management table. * Enabling the nd6_purge will panic on vmove for interfaces on VNET * teardown as the IPv6 layer is cleaned up already and the locks * are destroyed. 
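 * (For reference, the two public wrappers below differ only in this flag:
 *	in6_ifdetach(ifp)          calls  _in6_ifdetach(ifp, 1)
 *	in6_ifdetach_destroy(ifp)  calls  _in6_ifdetach(ifp, 0)
 * the latter being the VNET-teardown path described above.)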
*/ if (purgeulp) nd6_purge(ifp); } void in6_ifdetach(struct ifnet *ifp) { _in6_ifdetach(ifp, 1); } void in6_ifdetach_destroy(struct ifnet *ifp) { _in6_ifdetach(ifp, 0); } int in6_get_tmpifid(struct ifnet *ifp, u_int8_t *retbuf, const u_int8_t *baseid, int generate) { u_int8_t nullbuf[8]; struct nd_ifinfo *ndi = ND_IFINFO(ifp); bzero(nullbuf, sizeof(nullbuf)); if (bcmp(ndi->randomid, nullbuf, sizeof(nullbuf)) == 0) { /* we've never created a random ID. Create a new one. */ generate = 1; } if (generate) { bcopy(baseid, ndi->randomseed1, sizeof(ndi->randomseed1)); /* generate_tmp_ifid will update seedn and buf */ (void)generate_tmp_ifid(ndi->randomseed0, ndi->randomseed1, ndi->randomid); } bcopy(ndi->randomid, retbuf, 8); return (0); } void in6_tmpaddrtimer(void *arg) { CURVNET_SET((struct vnet *) arg); struct nd_ifinfo *ndi; u_int8_t nullbuf[8]; struct ifnet *ifp; callout_reset(&V_in6_tmpaddrtimer_ch, (V_ip6_temp_preferred_lifetime - V_ip6_desync_factor - V_ip6_temp_regen_advance) * hz, in6_tmpaddrtimer, curvnet); bzero(nullbuf, sizeof(nullbuf)); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (ifp->if_afdata[AF_INET6] == NULL) continue; ndi = ND_IFINFO(ifp); if (bcmp(ndi->randomid, nullbuf, sizeof(nullbuf)) != 0) { /* * We've been generating a random ID on this interface. * Create a new one. */ (void)generate_tmp_ifid(ndi->randomseed0, ndi->randomseed1, ndi->randomid); } } CURVNET_RESTORE(); } static void in6_purgemaddrs(struct ifnet *ifp) { struct in6_multi_head purgeinms; struct in6_multi *inm; - struct ifmultiaddr *ifma; + struct ifmultiaddr *ifma, *next; SLIST_INIT(&purgeinms); IN6_MULTI_LOCK(); IN6_MULTI_LIST_LOCK(); + IF_ADDR_WLOCK(ifp); /* * Extract list of in6_multi associated with the detaching ifp * which the PF_INET6 layer is about to release. - * We need to do this as IF_ADDR_LOCK() may be re-acquired - * by code further down. */ - IF_ADDR_RLOCK(ifp); - TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { + restart: + TAILQ_FOREACH_SAFE(ifma, &ifp->if_multiaddrs, ifma_link, next) { if (ifma->ifma_addr->sa_family != AF_INET6 || ifma->ifma_protospec == NULL) continue; inm = (struct in6_multi *)ifma->ifma_protospec; in6m_rele_locked(&purgeinms, inm); + if (__predict_false(ifma6_restart)) { + ifma6_restart = false; + goto restart; + } } - IF_ADDR_RUNLOCK(ifp); + IF_ADDR_WUNLOCK(ifp); mld_ifdetach(ifp); IN6_MULTI_LIST_UNLOCK(); IN6_MULTI_UNLOCK(); in6m_release_list_deferred(&purgeinms); } void in6_ifattach_destroy(void) { callout_drain(&V_in6_tmpaddrtimer_ch); } static void in6_ifattach_init(void *dummy) { /* Timer for regeneranation of temporary addresses randomize ID. */ callout_init(&V_in6_tmpaddrtimer_ch, 0); callout_reset(&V_in6_tmpaddrtimer_ch, (V_ip6_temp_preferred_lifetime - V_ip6_desync_factor - V_ip6_temp_regen_advance) * hz, in6_tmpaddrtimer, curvnet); } /* * Cheat. * This must be after route_init(), which is now SI_ORDER_THIRD. */ SYSINIT(in6_ifattach_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE, in6_ifattach_init, NULL); Index: head/sys/netinet6/in6_mcast.c =================================================================== --- head/sys/netinet6/in6_mcast.c (revision 333308) +++ head/sys/netinet6/in6_mcast.c (revision 333309) @@ -1,2909 +1,2941 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2009 Bruce Simpson. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * IPv6 multicast socket, group, and socket option processing module. * Normative references: RFC 2292, RFC 3492, RFC 3542, RFC 3678, RFC 3810. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef KTR_MLD #define KTR_MLD KTR_INET6 #endif #ifndef __SOCKUNION_DECLARED union sockunion { struct sockaddr_storage ss; struct sockaddr sa; struct sockaddr_dl sdl; struct sockaddr_in6 sin6; }; typedef union sockunion sockunion_t; #define __SOCKUNION_DECLARED #endif /* __SOCKUNION_DECLARED */ static MALLOC_DEFINE(M_IN6MFILTER, "in6_mfilter", "IPv6 multicast PCB-layer source filter"); MALLOC_DEFINE(M_IP6MADDR, "in6_multi", "IPv6 multicast group"); static MALLOC_DEFINE(M_IP6MOPTS, "ip6_moptions", "IPv6 multicast options"); static MALLOC_DEFINE(M_IP6MSOURCE, "ip6_msource", "IPv6 multicast MLD-layer source filter"); RB_GENERATE(ip6_msource_tree, ip6_msource, im6s_link, ip6_msource_cmp); /* * Locking: * - Lock order is: Giant, INP_WLOCK, IN6_MULTI_LOCK, MLD_LOCK, IF_ADDR_LOCK. * - The IF_ADDR_LOCK is implicitly taken by in6m_lookup() earlier, however * it can be taken by code in net/if.c also. * - ip6_moptions and in6_mfilter are covered by the INP_WLOCK. * * struct in6_multi is covered by IN6_MULTI_LOCK. There isn't strictly * any need for in6_multi itself to be virtualized -- it is bound to an ifp * anyway no matter what happens. 
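 *
 * A minimal sketch of the resulting acquisition order (illustrative only,
 * mirroring in6_getmulti() and its callers; "ifp" is whatever interface
 * the membership lives on):
 *
 *	IN6_MULTI_LOCK();
 *	IN6_MULTI_LIST_LOCK();
 *	IF_ADDR_WLOCK(ifp);
 *	... look up or detach memberships ...
 *	IF_ADDR_WUNLOCK(ifp);
 *	IN6_MULTI_LIST_UNLOCK();
 *	IN6_MULTI_UNLOCK();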
*/ struct mtx in6_multi_list_mtx; MTX_SYSINIT(in6_multi_mtx, &in6_multi_list_mtx, "in6_multi_list_mtx", MTX_DEF); struct mtx in6_multi_free_mtx; MTX_SYSINIT(in6_multi_free_mtx, &in6_multi_free_mtx, "in6_multi_free_mtx", MTX_DEF); struct sx in6_multi_sx; SX_SYSINIT(in6_multi_sx, &in6_multi_sx, "in6_multi_sx"); static void im6f_commit(struct in6_mfilter *); static int im6f_get_source(struct in6_mfilter *imf, const struct sockaddr_in6 *psin, struct in6_msource **); static struct in6_msource * im6f_graft(struct in6_mfilter *, const uint8_t, const struct sockaddr_in6 *); static void im6f_leave(struct in6_mfilter *); static int im6f_prune(struct in6_mfilter *, const struct sockaddr_in6 *); static void im6f_purge(struct in6_mfilter *); static void im6f_rollback(struct in6_mfilter *); static void im6f_reap(struct in6_mfilter *); static int im6o_grow(struct ip6_moptions *); static size_t im6o_match_group(const struct ip6_moptions *, const struct ifnet *, const struct sockaddr *); static struct in6_msource * im6o_match_source(const struct ip6_moptions *, const size_t, const struct sockaddr *); static void im6s_merge(struct ip6_msource *ims, const struct in6_msource *lims, const int rollback); static int in6_getmulti(struct ifnet *, const struct in6_addr *, struct in6_multi **); static int in6m_get_source(struct in6_multi *inm, const struct in6_addr *addr, const int noalloc, struct ip6_msource **pims); #ifdef KTR static int in6m_is_ifp_detached(const struct in6_multi *); #endif static int in6m_merge(struct in6_multi *, /*const*/ struct in6_mfilter *); static void in6m_purge(struct in6_multi *); static void in6m_reap(struct in6_multi *); static struct ip6_moptions * in6p_findmoptions(struct inpcb *); static int in6p_get_source_filters(struct inpcb *, struct sockopt *); static int in6p_join_group(struct inpcb *, struct sockopt *); static int in6p_leave_group(struct inpcb *, struct sockopt *); static struct ifnet * in6p_lookup_mcast_ifp(const struct inpcb *, const struct sockaddr_in6 *); static int in6p_block_unblock_source(struct inpcb *, struct sockopt *); static int in6p_set_multicast_if(struct inpcb *, struct sockopt *); static int in6p_set_source_filters(struct inpcb *, struct sockopt *); static int sysctl_ip6_mcast_filters(SYSCTL_HANDLER_ARGS); SYSCTL_DECL(_net_inet6_ip6); /* XXX Not in any common header. */ static SYSCTL_NODE(_net_inet6_ip6, OID_AUTO, mcast, CTLFLAG_RW, 0, "IPv6 multicast"); static u_long in6_mcast_maxgrpsrc = IPV6_MAX_GROUP_SRC_FILTER; SYSCTL_ULONG(_net_inet6_ip6_mcast, OID_AUTO, maxgrpsrc, CTLFLAG_RWTUN, &in6_mcast_maxgrpsrc, 0, "Max source filters per group"); static u_long in6_mcast_maxsocksrc = IPV6_MAX_SOCK_SRC_FILTER; SYSCTL_ULONG(_net_inet6_ip6_mcast, OID_AUTO, maxsocksrc, CTLFLAG_RWTUN, &in6_mcast_maxsocksrc, 0, "Max source filters per socket"); /* TODO Virtualize this switch. */ int in6_mcast_loop = IPV6_DEFAULT_MULTICAST_LOOP; SYSCTL_INT(_net_inet6_ip6_mcast, OID_AUTO, loop, CTLFLAG_RWTUN, &in6_mcast_loop, 0, "Loopback multicast datagrams by default"); static SYSCTL_NODE(_net_inet6_ip6_mcast, OID_AUTO, filters, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_ip6_mcast_filters, "Per-interface stack-wide source filters"); +int ifma6_restart = 0; #ifdef KTR /* * Inline function which wraps assertions for a valid ifp. * The ifnet layer will set the ifma's ifp pointer to NULL if the ifp * is detached. 
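 * It is only compiled under KTR and is used from trace points, e.g. the
 * one in in6_leavegroup_locked() below (sketch):
 *
 *	(in6m_is_ifp_detached(inm) ? "null" : if_name(inm->in6m_ifp))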
*/ static int __inline in6m_is_ifp_detached(const struct in6_multi *inm) { struct ifnet *ifp; KASSERT(inm->in6m_ifma != NULL, ("%s: no ifma", __func__)); ifp = inm->in6m_ifma->ifma_ifp; if (ifp != NULL) { /* * Sanity check that network-layer notion of ifp is the * same as that of link-layer. */ KASSERT(inm->in6m_ifp == ifp, ("%s: bad ifp", __func__)); } return (ifp == NULL); } #endif /* * Initialize an in6_mfilter structure to a known state at t0, t1 * with an empty source filter list. */ static __inline void im6f_init(struct in6_mfilter *imf, const int st0, const int st1) { memset(imf, 0, sizeof(struct in6_mfilter)); RB_INIT(&imf->im6f_sources); imf->im6f_st[0] = st0; imf->im6f_st[1] = st1; } /* * Resize the ip6_moptions vector to the next power-of-two minus 1. * May be called with locks held; do not sleep. */ static int im6o_grow(struct ip6_moptions *imo) { struct in6_multi **nmships; struct in6_multi **omships; struct in6_mfilter *nmfilters; struct in6_mfilter *omfilters; size_t idx; size_t newmax; size_t oldmax; nmships = NULL; nmfilters = NULL; omships = imo->im6o_membership; omfilters = imo->im6o_mfilters; oldmax = imo->im6o_max_memberships; newmax = ((oldmax + 1) * 2) - 1; if (newmax <= IPV6_MAX_MEMBERSHIPS) { nmships = (struct in6_multi **)realloc(omships, sizeof(struct in6_multi *) * newmax, M_IP6MOPTS, M_NOWAIT); nmfilters = (struct in6_mfilter *)realloc(omfilters, sizeof(struct in6_mfilter) * newmax, M_IN6MFILTER, M_NOWAIT); if (nmships != NULL && nmfilters != NULL) { /* Initialize newly allocated source filter heads. */ for (idx = oldmax; idx < newmax; idx++) { im6f_init(&nmfilters[idx], MCAST_UNDEFINED, MCAST_EXCLUDE); } imo->im6o_max_memberships = newmax; imo->im6o_membership = nmships; imo->im6o_mfilters = nmfilters; } } if (nmships == NULL || nmfilters == NULL) { if (nmships != NULL) free(nmships, M_IP6MOPTS); if (nmfilters != NULL) free(nmfilters, M_IN6MFILTER); return (ETOOMANYREFS); } return (0); } /* * Find an IPv6 multicast group entry for this ip6_moptions instance * which matches the specified group, and optionally an interface. * Return its index into the array, or -1 if not found. */ static size_t im6o_match_group(const struct ip6_moptions *imo, const struct ifnet *ifp, const struct sockaddr *group) { const struct sockaddr_in6 *gsin6; struct in6_multi **pinm; int idx; int nmships; gsin6 = (const struct sockaddr_in6 *)group; /* The im6o_membership array may be lazy allocated. */ if (imo->im6o_membership == NULL || imo->im6o_num_memberships == 0) return (-1); nmships = imo->im6o_num_memberships; pinm = &imo->im6o_membership[0]; for (idx = 0; idx < nmships; idx++, pinm++) { if (*pinm == NULL) continue; if ((ifp == NULL || ((*pinm)->in6m_ifp == ifp)) && IN6_ARE_ADDR_EQUAL(&(*pinm)->in6m_addr, &gsin6->sin6_addr)) { break; } } if (idx >= nmships) idx = -1; return (idx); } /* * Find an IPv6 multicast source entry for this imo which matches * the given group index for this socket, and source address. * * XXX TODO: The scope ID, if present in src, is stripped before * any comparison. We SHOULD enforce scope/zone checks where the source * filter entry has a link scope. * * NOTE: This does not check if the entry is in-mode, merely if * it exists, which may not be the desired behaviour. 
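 *
 * Callers pair it with im6o_match_group() and interpret the result against
 * the membership's current filter mode; a rough sketch ("gidx" assumed
 * already validated, as in im6o_mc_filter() below):
 *
 *	gidx = im6o_match_group(imo, ifp, group);
 *	imf = &imo->im6o_mfilters[gidx];
 *	ims = im6o_match_source(imo, gidx, src);
 *	mode = imf->im6f_st[1];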
*/ static struct in6_msource * im6o_match_source(const struct ip6_moptions *imo, const size_t gidx, const struct sockaddr *src) { struct ip6_msource find; struct in6_mfilter *imf; struct ip6_msource *ims; const sockunion_t *psa; KASSERT(src->sa_family == AF_INET6, ("%s: !AF_INET6", __func__)); KASSERT(gidx != -1 && gidx < imo->im6o_num_memberships, ("%s: invalid index %d\n", __func__, (int)gidx)); /* The im6o_mfilters array may be lazy allocated. */ if (imo->im6o_mfilters == NULL) return (NULL); imf = &imo->im6o_mfilters[gidx]; psa = (const sockunion_t *)src; find.im6s_addr = psa->sin6.sin6_addr; in6_clearscope(&find.im6s_addr); /* XXX */ ims = RB_FIND(ip6_msource_tree, &imf->im6f_sources, &find); return ((struct in6_msource *)ims); } /* * Perform filtering for multicast datagrams on a socket by group and source. * * Returns 0 if a datagram should be allowed through, or various error codes * if the socket was not a member of the group, or the source was muted, etc. */ int im6o_mc_filter(const struct ip6_moptions *imo, const struct ifnet *ifp, const struct sockaddr *group, const struct sockaddr *src) { size_t gidx; struct in6_msource *ims; int mode; KASSERT(ifp != NULL, ("%s: null ifp", __func__)); gidx = im6o_match_group(imo, ifp, group); if (gidx == -1) return (MCAST_NOTGMEMBER); /* * Check if the source was included in an (S,G) join. * Allow reception on exclusive memberships by default, * reject reception on inclusive memberships by default. * Exclude source only if an in-mode exclude filter exists. * Include source only if an in-mode include filter exists. * NOTE: We are comparing group state here at MLD t1 (now) * with socket-layer t0 (since last downcall). */ mode = imo->im6o_mfilters[gidx].im6f_st[1]; ims = im6o_match_source(imo, gidx, src); if ((ims == NULL && mode == MCAST_INCLUDE) || (ims != NULL && ims->im6sl_st[0] != mode)) return (MCAST_NOTSMEMBER); return (MCAST_PASS); } /* * Find and return a reference to an in6_multi record for (ifp, group), * and bump its reference count. * If one does not exist, try to allocate it, and update link-layer multicast * filters on ifp to listen for group. * Assumes the IN6_MULTI lock is held across the call. * Return 0 if successful, otherwise return an appropriate error code. */ static int in6_getmulti(struct ifnet *ifp, const struct in6_addr *group, struct in6_multi **pinm) { struct sockaddr_in6 gsin6; struct ifmultiaddr *ifma; struct in6_multi *inm; int error; error = 0; /* * XXX: Accesses to ifma_protospec must be covered by IF_ADDR_LOCK; * if_addmulti() takes this mutex itself, so we must drop and * re-acquire around the call. */ IN6_MULTI_LOCK_ASSERT(); IN6_MULTI_LIST_LOCK(); IF_ADDR_WLOCK(ifp); inm = in6m_lookup_locked(ifp, group); if (inm != NULL) { /* * If we already joined this group, just bump the * refcount and return it. */ KASSERT(inm->in6m_refcount >= 1, ("%s: bad refcount %d", __func__, inm->in6m_refcount)); in6m_acquire_locked(inm); *pinm = inm; goto out_locked; } memset(&gsin6, 0, sizeof(gsin6)); gsin6.sin6_family = AF_INET6; gsin6.sin6_len = sizeof(struct sockaddr_in6); gsin6.sin6_addr = *group; /* * Check if a link-layer group is already associated * with this network-layer group on the given ifnet. */ IN6_MULTI_LIST_UNLOCK(); IF_ADDR_WUNLOCK(ifp); error = if_addmulti(ifp, (struct sockaddr *)&gsin6, &ifma); if (error != 0) return (error); IN6_MULTI_LIST_LOCK(); IF_ADDR_WLOCK(ifp); /* * If something other than netinet6 is occupying the link-layer * group, print a meaningful error message and back out of * the allocation. 
* Otherwise, bump the refcount on the existing network-layer * group association and return it. */ if (ifma->ifma_protospec != NULL) { inm = (struct in6_multi *)ifma->ifma_protospec; #ifdef INVARIANTS KASSERT(ifma->ifma_addr != NULL, ("%s: no ifma_addr", __func__)); KASSERT(ifma->ifma_addr->sa_family == AF_INET6, ("%s: ifma not AF_INET6", __func__)); KASSERT(inm != NULL, ("%s: no ifma_protospec", __func__)); if (inm->in6m_ifma != ifma || inm->in6m_ifp != ifp || !IN6_ARE_ADDR_EQUAL(&inm->in6m_addr, group)) panic("%s: ifma %p is inconsistent with %p (%p)", __func__, ifma, inm, group); #endif in6m_acquire_locked(inm); *pinm = inm; goto out_locked; } IF_ADDR_WLOCK_ASSERT(ifp); /* * A new in6_multi record is needed; allocate and initialize it. * We DO NOT perform an MLD join as the in6_ layer may need to * push an initial source list down to MLD to support SSM. * * The initial source filter state is INCLUDE, {} as per the RFC. * Pending state-changes per group are subject to a bounds check. */ inm = malloc(sizeof(*inm), M_IP6MADDR, M_NOWAIT | M_ZERO); if (inm == NULL) { IN6_MULTI_LIST_UNLOCK(); IF_ADDR_WUNLOCK(ifp); if_delmulti_ifma(ifma); return (ENOMEM); } inm->in6m_addr = *group; inm->in6m_ifp = ifp; inm->in6m_mli = MLD_IFINFO(ifp); inm->in6m_ifma = ifma; inm->in6m_refcount = 1; inm->in6m_state = MLD_NOT_MEMBER; mbufq_init(&inm->in6m_scq, MLD_MAX_STATE_CHANGES); inm->in6m_st[0].iss_fmode = MCAST_UNDEFINED; inm->in6m_st[1].iss_fmode = MCAST_UNDEFINED; RB_INIT(&inm->in6m_srcs); ifma->ifma_protospec = inm; *pinm = inm; out_locked: IN6_MULTI_LIST_UNLOCK(); IF_ADDR_WUNLOCK(ifp); return (error); } /* * Drop a reference to an in6_multi record. * * If the refcount drops to 0, free the in6_multi record and * delete the underlying link-layer membership. */ static void in6m_release(struct in6_multi *inm) { struct ifmultiaddr *ifma; struct ifnet *ifp; CTR2(KTR_MLD, "%s: refcount is %d", __func__, inm->in6m_refcount); MPASS(inm->in6m_refcount == 0); CTR2(KTR_MLD, "%s: freeing inm %p", __func__, inm); ifma = inm->in6m_ifma; ifp = inm->in6m_ifp; + MPASS(ifma->ifma_llifma == NULL); /* XXX this access is not covered by IF_ADDR_LOCK */ CTR2(KTR_MLD, "%s: purging ifma %p", __func__, ifma); KASSERT(ifma->ifma_protospec == NULL, ("%s: ifma_protospec != NULL", __func__)); if (ifp) CURVNET_SET(ifp->if_vnet); in6m_purge(inm); free(inm, M_IP6MADDR); - if_delmulti_ifma(ifma); - if (ifp) + if_delmulti_ifma_flags(ifma, 1); + if (ifp) { CURVNET_RESTORE(); + if_rele(ifp); + } } static struct grouptask free_gtask; static struct in6_multi_head in6m_free_list; static void in6m_release_task(void *arg __unused); static void in6m_init(void) { SLIST_INIT(&in6m_free_list); taskqgroup_config_gtask_init(NULL, &free_gtask, in6m_release_task, "in6m release task"); } SYSINIT(in6m_init, SI_SUB_SMP + 1, SI_ORDER_FIRST, in6m_init, NULL); void in6m_release_list_deferred(struct in6_multi_head *inmh) { if (SLIST_EMPTY(inmh)) return; mtx_lock(&in6_multi_free_mtx); SLIST_CONCAT(&in6m_free_list, inmh, in6_multi, in6m_nrele); mtx_unlock(&in6_multi_free_mtx); GROUPTASK_ENQUEUE(&free_gtask); } void -in6m_release_deferred(struct in6_multi *inm) +in6m_disconnect(struct in6_multi *inm) { - struct in6_multi_head tmp; struct ifnet *ifp; struct ifaddr *ifa; struct in6_ifaddr *ifa6; struct in6_multi_mship *imm; + struct ifmultiaddr *ifma, *ll_ifma; + ifp = inm->in6m_ifp; + IF_ADDR_WLOCK_ASSERT(ifp); + ifma = inm->in6m_ifma; + + if_ref(ifp); + TAILQ_REMOVE(&ifp->if_multiaddrs, ifma, ifma_link); + MCDPRINTF("removed ifma: %p from %s\n", ifma, 
ifp->if_xname); + if ((ll_ifma = ifma->ifma_llifma) != NULL) { + MPASS(ifma != ll_ifma); + ifma->ifma_llifma = NULL; + MPASS(ll_ifma->ifma_llifma == NULL); + MPASS(ll_ifma->ifma_ifp == ifp); + if (--ll_ifma->ifma_refcount == 0) { + ifma6_restart = true; + TAILQ_REMOVE(&ifp->if_multiaddrs, ll_ifma, ifma_link); + MCDPRINTF("removed ll_ifma: %p from %s\n", ll_ifma, ifp->if_xname); + if_freemulti(ll_ifma); + } + } + TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { + if (ifa->ifa_addr->sa_family != AF_INET6) + continue; + ifa6 = (void *)ifa; + LIST_FOREACH(imm, &ifa6->ia6_memberships, i6mm_chain) { + if (inm == imm->i6mm_maddr) { + LIST_REMOVE(imm, i6mm_chain); + free(imm, M_IP6MADDR); + } + } + } +} + +void +in6m_release_deferred(struct in6_multi *inm) +{ + struct in6_multi_head tmp; + IN6_MULTI_LIST_LOCK_ASSERT(); KASSERT(inm->in6m_refcount > 0, ("refcount == %d inm: %p", inm->in6m_refcount, inm)); if (--inm->in6m_refcount == 0) { - ifp = inm->in6m_ifp; - IF_ADDR_LOCK_ASSERT(ifp); - TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { - if (ifa->ifa_addr->sa_family != AF_INET6) - continue; - ifa6 = (void *)ifa; - LIST_FOREACH(imm, &ifa6->ia6_memberships, i6mm_chain) { - if (inm == imm->i6mm_maddr) - imm->i6mm_maddr = NULL; - } - } + in6m_disconnect(inm); SLIST_INIT(&tmp); inm->in6m_ifma->ifma_protospec = NULL; + MPASS(inm->in6m_ifma->ifma_llifma == NULL); SLIST_INSERT_HEAD(&tmp, inm, in6m_nrele); in6m_release_list_deferred(&tmp); } } static void in6m_release_task(void *arg __unused) { struct in6_multi_head in6m_free_tmp; struct in6_multi *inm, *tinm; SLIST_INIT(&in6m_free_tmp); mtx_lock(&in6_multi_free_mtx); SLIST_CONCAT(&in6m_free_tmp, &in6m_free_list, in6_multi, in6m_nrele); mtx_unlock(&in6_multi_free_mtx); IN6_MULTI_LOCK(); SLIST_FOREACH_SAFE(inm, &in6m_free_tmp, in6m_nrele, tinm) { SLIST_REMOVE_HEAD(&in6m_free_tmp, in6m_nrele); in6m_release(inm); } IN6_MULTI_UNLOCK(); } /* * Clear recorded source entries for a group. * Used by the MLD code. Caller must hold the IN6_MULTI lock. * FIXME: Should reap. */ void in6m_clear_recorded(struct in6_multi *inm) { struct ip6_msource *ims; IN6_MULTI_LIST_LOCK_ASSERT(); RB_FOREACH(ims, ip6_msource_tree, &inm->in6m_srcs) { if (ims->im6s_stp) { ims->im6s_stp = 0; --inm->in6m_st[1].iss_rec; } } KASSERT(inm->in6m_st[1].iss_rec == 0, ("%s: iss_rec %d not 0", __func__, inm->in6m_st[1].iss_rec)); } /* * Record a source as pending for a Source-Group MLDv2 query. * This lives here as it modifies the shared tree. * * inm is the group descriptor. * naddr is the address of the source to record in network-byte order. * * If the net.inet6.mld.sgalloc sysctl is non-zero, we will * lazy-allocate a source node in response to an SG query. * Otherwise, no allocation is performed. This saves some memory * with the trade-off that the source will not be reported to the * router if joined in the window between the query response and * the group actually being joined on the local host. * * VIMAGE: XXX: Currently the mld_sgalloc feature has been removed. * This turns off the allocation of a recorded source entry if * the group has not been joined. * * Return 0 if the source didn't exist or was already marked as recorded. * Return 1 if the source was marked as recorded by this function. * Return <0 if any error occurred (negated errno code). 
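 *
 * A minimal caller sketch (hypothetical; "src6" would be a source address
 * taken from an MLDv2 group-and-source query, processed with the
 * IN6_MULTI_LIST lock held):
 *
 *	retval = in6m_record_source(inm, &src6);
 *	if (retval < 0)
 *		error = -retval;	(ENOSPC or ENOMEM)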
*/ int in6m_record_source(struct in6_multi *inm, const struct in6_addr *addr) { struct ip6_msource find; struct ip6_msource *ims, *nims; IN6_MULTI_LIST_LOCK_ASSERT(); find.im6s_addr = *addr; ims = RB_FIND(ip6_msource_tree, &inm->in6m_srcs, &find); if (ims && ims->im6s_stp) return (0); if (ims == NULL) { if (inm->in6m_nsrc == in6_mcast_maxgrpsrc) return (-ENOSPC); nims = malloc(sizeof(struct ip6_msource), M_IP6MSOURCE, M_NOWAIT | M_ZERO); if (nims == NULL) return (-ENOMEM); nims->im6s_addr = find.im6s_addr; RB_INSERT(ip6_msource_tree, &inm->in6m_srcs, nims); ++inm->in6m_nsrc; ims = nims; } /* * Mark the source as recorded and update the recorded * source count. */ ++ims->im6s_stp; ++inm->in6m_st[1].iss_rec; return (1); } /* * Return a pointer to an in6_msource owned by an in6_mfilter, * given its source address. * Lazy-allocate if needed. If this is a new entry its filter state is * undefined at t0. * * imf is the filter set being modified. * addr is the source address. * * SMPng: May be called with locks held; malloc must not block. */ static int im6f_get_source(struct in6_mfilter *imf, const struct sockaddr_in6 *psin, struct in6_msource **plims) { struct ip6_msource find; struct ip6_msource *ims, *nims; struct in6_msource *lims; int error; error = 0; ims = NULL; lims = NULL; find.im6s_addr = psin->sin6_addr; ims = RB_FIND(ip6_msource_tree, &imf->im6f_sources, &find); lims = (struct in6_msource *)ims; if (lims == NULL) { if (imf->im6f_nsrc == in6_mcast_maxsocksrc) return (ENOSPC); nims = malloc(sizeof(struct in6_msource), M_IN6MFILTER, M_NOWAIT | M_ZERO); if (nims == NULL) return (ENOMEM); lims = (struct in6_msource *)nims; lims->im6s_addr = find.im6s_addr; lims->im6sl_st[0] = MCAST_UNDEFINED; RB_INSERT(ip6_msource_tree, &imf->im6f_sources, nims); ++imf->im6f_nsrc; } *plims = lims; return (error); } /* * Graft a source entry into an existing socket-layer filter set, * maintaining any required invariants and checking allocations. * * The source is marked as being in the new filter mode at t1. * * Return the pointer to the new node, otherwise return NULL. */ static struct in6_msource * im6f_graft(struct in6_mfilter *imf, const uint8_t st1, const struct sockaddr_in6 *psin) { struct ip6_msource *nims; struct in6_msource *lims; nims = malloc(sizeof(struct in6_msource), M_IN6MFILTER, M_NOWAIT | M_ZERO); if (nims == NULL) return (NULL); lims = (struct in6_msource *)nims; lims->im6s_addr = psin->sin6_addr; lims->im6sl_st[0] = MCAST_UNDEFINED; lims->im6sl_st[1] = st1; RB_INSERT(ip6_msource_tree, &imf->im6f_sources, nims); ++imf->im6f_nsrc; return (lims); } /* * Prune a source entry from an existing socket-layer filter set, * maintaining any required invariants and checking allocations. * * The source is marked as being left at t1, it is not freed. * * Return 0 if no error occurred, otherwise return an errno value. */ static int im6f_prune(struct in6_mfilter *imf, const struct sockaddr_in6 *psin) { struct ip6_msource find; struct ip6_msource *ims; struct in6_msource *lims; find.im6s_addr = psin->sin6_addr; ims = RB_FIND(ip6_msource_tree, &imf->im6f_sources, &find); if (ims == NULL) return (ENOENT); lims = (struct in6_msource *)ims; lims->im6sl_st[1] = MCAST_UNDEFINED; return (0); } /* * Revert socket-layer filter set deltas at t1 to t0 state. 
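 * It is the failure half of the commit pattern used by the socket-option
 * handlers, roughly (see in6p_block_unblock_source() below for the real
 * sequence):
 *
 *	error = in6m_merge(inm, imf);
 *	...
 *	if (error)
 *		im6f_rollback(imf);
 *	else
 *		im6f_commit(imf);
 *	im6f_reap(imf);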
*/ static void im6f_rollback(struct in6_mfilter *imf) { struct ip6_msource *ims, *tims; struct in6_msource *lims; RB_FOREACH_SAFE(ims, ip6_msource_tree, &imf->im6f_sources, tims) { lims = (struct in6_msource *)ims; if (lims->im6sl_st[0] == lims->im6sl_st[1]) { /* no change at t1 */ continue; } else if (lims->im6sl_st[0] != MCAST_UNDEFINED) { /* revert change to existing source at t1 */ lims->im6sl_st[1] = lims->im6sl_st[0]; } else { /* revert source added t1 */ CTR2(KTR_MLD, "%s: free ims %p", __func__, ims); RB_REMOVE(ip6_msource_tree, &imf->im6f_sources, ims); free(ims, M_IN6MFILTER); imf->im6f_nsrc--; } } imf->im6f_st[1] = imf->im6f_st[0]; } /* * Mark socket-layer filter set as INCLUDE {} at t1. */ static void im6f_leave(struct in6_mfilter *imf) { struct ip6_msource *ims; struct in6_msource *lims; RB_FOREACH(ims, ip6_msource_tree, &imf->im6f_sources) { lims = (struct in6_msource *)ims; lims->im6sl_st[1] = MCAST_UNDEFINED; } imf->im6f_st[1] = MCAST_INCLUDE; } /* * Mark socket-layer filter set deltas as committed. */ static void im6f_commit(struct in6_mfilter *imf) { struct ip6_msource *ims; struct in6_msource *lims; RB_FOREACH(ims, ip6_msource_tree, &imf->im6f_sources) { lims = (struct in6_msource *)ims; lims->im6sl_st[0] = lims->im6sl_st[1]; } imf->im6f_st[0] = imf->im6f_st[1]; } /* * Reap unreferenced sources from socket-layer filter set. */ static void im6f_reap(struct in6_mfilter *imf) { struct ip6_msource *ims, *tims; struct in6_msource *lims; RB_FOREACH_SAFE(ims, ip6_msource_tree, &imf->im6f_sources, tims) { lims = (struct in6_msource *)ims; if ((lims->im6sl_st[0] == MCAST_UNDEFINED) && (lims->im6sl_st[1] == MCAST_UNDEFINED)) { CTR2(KTR_MLD, "%s: free lims %p", __func__, ims); RB_REMOVE(ip6_msource_tree, &imf->im6f_sources, ims); free(ims, M_IN6MFILTER); imf->im6f_nsrc--; } } } /* * Purge socket-layer filter set. */ static void im6f_purge(struct in6_mfilter *imf) { struct ip6_msource *ims, *tims; RB_FOREACH_SAFE(ims, ip6_msource_tree, &imf->im6f_sources, tims) { CTR2(KTR_MLD, "%s: free ims %p", __func__, ims); RB_REMOVE(ip6_msource_tree, &imf->im6f_sources, ims); free(ims, M_IN6MFILTER); imf->im6f_nsrc--; } imf->im6f_st[0] = imf->im6f_st[1] = MCAST_UNDEFINED; KASSERT(RB_EMPTY(&imf->im6f_sources), ("%s: im6f_sources not empty", __func__)); } /* * Look up a source filter entry for a multicast group. * * inm is the group descriptor to work with. * addr is the IPv6 address to look up. * noalloc may be non-zero to suppress allocation of sources. * *pims will be set to the address of the retrieved or allocated source. * * SMPng: NOTE: may be called with locks held. * Return 0 if successful, otherwise return a non-zero error code. */ static int in6m_get_source(struct in6_multi *inm, const struct in6_addr *addr, const int noalloc, struct ip6_msource **pims) { struct ip6_msource find; struct ip6_msource *ims, *nims; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif find.im6s_addr = *addr; ims = RB_FIND(ip6_msource_tree, &inm->in6m_srcs, &find); if (ims == NULL && !noalloc) { if (inm->in6m_nsrc == in6_mcast_maxgrpsrc) return (ENOSPC); nims = malloc(sizeof(struct ip6_msource), M_IP6MSOURCE, M_NOWAIT | M_ZERO); if (nims == NULL) return (ENOMEM); nims->im6s_addr = *addr; RB_INSERT(ip6_msource_tree, &inm->in6m_srcs, nims); ++inm->in6m_nsrc; ims = nims; CTR3(KTR_MLD, "%s: allocated %s as %p", __func__, ip6_sprintf(ip6tbuf, addr), ims); } *pims = ims; return (0); } /* * Merge socket-layer source into MLD-layer source. * If rollback is non-zero, perform the inverse of the merge. 
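 * For example (sketch), a source whose socket-layer state moves from
 * INCLUDE at t0 to EXCLUDE at t1 is merged as
 *
 *	ims->im6s_st[1].in -= 1;
 *	ims->im6s_st[1].ex += 1;
 *
 * and a later call with rollback non-zero applies the same deltas with
 * n = -1, restoring the previous counts.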
*/ static void im6s_merge(struct ip6_msource *ims, const struct in6_msource *lims, const int rollback) { int n = rollback ? -1 : 1; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; ip6_sprintf(ip6tbuf, &lims->im6s_addr); #endif if (lims->im6sl_st[0] == MCAST_EXCLUDE) { CTR3(KTR_MLD, "%s: t1 ex -= %d on %s", __func__, n, ip6tbuf); ims->im6s_st[1].ex -= n; } else if (lims->im6sl_st[0] == MCAST_INCLUDE) { CTR3(KTR_MLD, "%s: t1 in -= %d on %s", __func__, n, ip6tbuf); ims->im6s_st[1].in -= n; } if (lims->im6sl_st[1] == MCAST_EXCLUDE) { CTR3(KTR_MLD, "%s: t1 ex += %d on %s", __func__, n, ip6tbuf); ims->im6s_st[1].ex += n; } else if (lims->im6sl_st[1] == MCAST_INCLUDE) { CTR3(KTR_MLD, "%s: t1 in += %d on %s", __func__, n, ip6tbuf); ims->im6s_st[1].in += n; } } /* * Atomically update the global in6_multi state, when a membership's * filter list is being updated in any way. * * imf is the per-inpcb-membership group filter pointer. * A fake imf may be passed for in-kernel consumers. * * XXX This is a candidate for a set-symmetric-difference style loop * which would eliminate the repeated lookup from root of ims nodes, * as they share the same key space. * * If any error occurred this function will back out of refcounts * and return a non-zero value. */ static int in6m_merge(struct in6_multi *inm, /*const*/ struct in6_mfilter *imf) { struct ip6_msource *ims, *nims; struct in6_msource *lims; int schanged, error; int nsrc0, nsrc1; schanged = 0; error = 0; nsrc1 = nsrc0 = 0; IN6_MULTI_LIST_LOCK_ASSERT(); /* * Update the source filters first, as this may fail. * Maintain count of in-mode filters at t0, t1. These are * used to work out if we transition into ASM mode or not. * Maintain a count of source filters whose state was * actually modified by this operation. */ RB_FOREACH(ims, ip6_msource_tree, &imf->im6f_sources) { lims = (struct in6_msource *)ims; if (lims->im6sl_st[0] == imf->im6f_st[0]) nsrc0++; if (lims->im6sl_st[1] == imf->im6f_st[1]) nsrc1++; if (lims->im6sl_st[0] == lims->im6sl_st[1]) continue; error = in6m_get_source(inm, &lims->im6s_addr, 0, &nims); ++schanged; if (error) break; im6s_merge(nims, lims, 0); } if (error) { struct ip6_msource *bims; RB_FOREACH_REVERSE_FROM(ims, ip6_msource_tree, nims) { lims = (struct in6_msource *)ims; if (lims->im6sl_st[0] == lims->im6sl_st[1]) continue; (void)in6m_get_source(inm, &lims->im6s_addr, 1, &bims); if (bims == NULL) continue; im6s_merge(bims, lims, 1); } goto out_reap; } CTR3(KTR_MLD, "%s: imf filters in-mode: %d at t0, %d at t1", __func__, nsrc0, nsrc1); /* Handle transition between INCLUDE {n} and INCLUDE {} on socket. */ if (imf->im6f_st[0] == imf->im6f_st[1] && imf->im6f_st[1] == MCAST_INCLUDE) { if (nsrc1 == 0) { CTR1(KTR_MLD, "%s: --in on inm at t1", __func__); --inm->in6m_st[1].iss_in; } } /* Handle filter mode transition on socket. */ if (imf->im6f_st[0] != imf->im6f_st[1]) { CTR3(KTR_MLD, "%s: imf transition %d to %d", __func__, imf->im6f_st[0], imf->im6f_st[1]); if (imf->im6f_st[0] == MCAST_EXCLUDE) { CTR1(KTR_MLD, "%s: --ex on inm at t1", __func__); --inm->in6m_st[1].iss_ex; } else if (imf->im6f_st[0] == MCAST_INCLUDE) { CTR1(KTR_MLD, "%s: --in on inm at t1", __func__); --inm->in6m_st[1].iss_in; } if (imf->im6f_st[1] == MCAST_EXCLUDE) { CTR1(KTR_MLD, "%s: ex++ on inm at t1", __func__); inm->in6m_st[1].iss_ex++; } else if (imf->im6f_st[1] == MCAST_INCLUDE && nsrc1 > 0) { CTR1(KTR_MLD, "%s: in++ on inm at t1", __func__); inm->in6m_st[1].iss_in++; } } /* * Track inm filter state in terms of listener counts. 
* If there are any exclusive listeners, stack-wide * membership is exclusive. * Otherwise, if only inclusive listeners, stack-wide is inclusive. * If no listeners remain, state is undefined at t1, * and the MLD lifecycle for this group should finish. */ if (inm->in6m_st[1].iss_ex > 0) { CTR1(KTR_MLD, "%s: transition to EX", __func__); inm->in6m_st[1].iss_fmode = MCAST_EXCLUDE; } else if (inm->in6m_st[1].iss_in > 0) { CTR1(KTR_MLD, "%s: transition to IN", __func__); inm->in6m_st[1].iss_fmode = MCAST_INCLUDE; } else { CTR1(KTR_MLD, "%s: transition to UNDEF", __func__); inm->in6m_st[1].iss_fmode = MCAST_UNDEFINED; } /* Decrement ASM listener count on transition out of ASM mode. */ if (imf->im6f_st[0] == MCAST_EXCLUDE && nsrc0 == 0) { if ((imf->im6f_st[1] != MCAST_EXCLUDE) || (imf->im6f_st[1] == MCAST_EXCLUDE && nsrc1 > 0)) { CTR1(KTR_MLD, "%s: --asm on inm at t1", __func__); --inm->in6m_st[1].iss_asm; } } /* Increment ASM listener count on transition to ASM mode. */ if (imf->im6f_st[1] == MCAST_EXCLUDE && nsrc1 == 0) { CTR1(KTR_MLD, "%s: asm++ on inm at t1", __func__); inm->in6m_st[1].iss_asm++; } CTR3(KTR_MLD, "%s: merged imf %p to inm %p", __func__, imf, inm); in6m_print(inm); out_reap: if (schanged > 0) { CTR1(KTR_MLD, "%s: sources changed; reaping", __func__); in6m_reap(inm); } return (error); } /* * Mark an in6_multi's filter set deltas as committed. * Called by MLD after a state change has been enqueued. */ void in6m_commit(struct in6_multi *inm) { struct ip6_msource *ims; CTR2(KTR_MLD, "%s: commit inm %p", __func__, inm); CTR1(KTR_MLD, "%s: pre commit:", __func__); in6m_print(inm); RB_FOREACH(ims, ip6_msource_tree, &inm->in6m_srcs) { ims->im6s_st[0] = ims->im6s_st[1]; } inm->in6m_st[0] = inm->in6m_st[1]; } /* * Reap unreferenced nodes from an in6_multi's filter set. */ static void in6m_reap(struct in6_multi *inm) { struct ip6_msource *ims, *tims; RB_FOREACH_SAFE(ims, ip6_msource_tree, &inm->in6m_srcs, tims) { if (ims->im6s_st[0].ex > 0 || ims->im6s_st[0].in > 0 || ims->im6s_st[1].ex > 0 || ims->im6s_st[1].in > 0 || ims->im6s_stp != 0) continue; CTR2(KTR_MLD, "%s: free ims %p", __func__, ims); RB_REMOVE(ip6_msource_tree, &inm->in6m_srcs, ims); free(ims, M_IP6MSOURCE); inm->in6m_nsrc--; } } /* * Purge all source nodes from an in6_multi's filter set. */ static void in6m_purge(struct in6_multi *inm) { struct ip6_msource *ims, *tims; RB_FOREACH_SAFE(ims, ip6_msource_tree, &inm->in6m_srcs, tims) { CTR2(KTR_MLD, "%s: free ims %p", __func__, ims); RB_REMOVE(ip6_msource_tree, &inm->in6m_srcs, ims); free(ims, M_IP6MSOURCE); inm->in6m_nsrc--; } /* Free state-change requests that might be queued. */ mbufq_drain(&inm->in6m_scq); } /* * Join a multicast address w/o sources. * KAME compatibility entry point. * * SMPng: Assume no mc locks held by caller. */ int in6_joingroup(struct ifnet *ifp, const struct in6_addr *mcaddr, /*const*/ struct in6_mfilter *imf, struct in6_multi **pinm, const int delay) { int error; IN6_MULTI_LOCK(); error = in6_joingroup_locked(ifp, mcaddr, NULL, pinm, delay); IN6_MULTI_UNLOCK(); return (error); } /* * Join a multicast group; real entry point. * * Only preserves atomicity at inm level. * NOTE: imf argument cannot be const due to sys/tree.h limitations. * * If the MLD downcall fails, the group is not joined, and an error * code is returned. 
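 *
 * A kernel consumer performing a plain ASM join typically goes through the
 * unlocked wrapper above with a NULL filter, e.g. (sketch, error handling
 * elided):
 *
 *	struct in6_multi *inm = NULL;
 *
 *	error = in6_joingroup(ifp, &mcaddr, NULL, &inm, 0);
 *	...
 *	error = in6_leavegroup(inm, NULL);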
*/ int in6_joingroup_locked(struct ifnet *ifp, const struct in6_addr *mcaddr, /*const*/ struct in6_mfilter *imf, struct in6_multi **pinm, const int delay) { struct in6_mfilter timf; struct in6_multi *inm; struct ifmultiaddr *ifma; int error; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif #ifdef INVARIANTS /* * Sanity: Check scope zone ID was set for ifp, if and * only if group is scoped to an interface. */ KASSERT(IN6_IS_ADDR_MULTICAST(mcaddr), ("%s: not a multicast address", __func__)); if (IN6_IS_ADDR_MC_LINKLOCAL(mcaddr) || IN6_IS_ADDR_MC_INTFACELOCAL(mcaddr)) { KASSERT(mcaddr->s6_addr16[1] != 0, ("%s: scope zone ID not set", __func__)); } #endif IN6_MULTI_LOCK_ASSERT(); IN6_MULTI_LIST_UNLOCK_ASSERT(); CTR4(KTR_MLD, "%s: join %s on %p(%s))", __func__, ip6_sprintf(ip6tbuf, mcaddr), ifp, if_name(ifp)); error = 0; inm = NULL; /* * If no imf was specified (i.e. kernel consumer), * fake one up and assume it is an ASM join. */ if (imf == NULL) { im6f_init(&timf, MCAST_UNDEFINED, MCAST_EXCLUDE); imf = &timf; } error = in6_getmulti(ifp, mcaddr, &inm); if (error) { CTR1(KTR_MLD, "%s: in6_getmulti() failure", __func__); return (error); } IN6_MULTI_LIST_LOCK(); CTR1(KTR_MLD, "%s: merge inm state", __func__); error = in6m_merge(inm, imf); if (error) { CTR1(KTR_MLD, "%s: failed to merge inm state", __func__); goto out_in6m_release; } CTR1(KTR_MLD, "%s: doing mld downcall", __func__); error = mld_change_state(inm, delay); if (error) { CTR1(KTR_MLD, "%s: failed to update source", __func__); goto out_in6m_release; } out_in6m_release: if (error) { CTR2(KTR_MLD, "%s: dropping ref on %p", __func__, inm); IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_protospec == inm) { ifma->ifma_protospec = NULL; break; } } in6m_release_deferred(inm); IF_ADDR_RUNLOCK(ifp); } else { *pinm = inm; } IN6_MULTI_LIST_UNLOCK(); return (error); } /* * Leave a multicast group; unlocked entry point. */ int in6_leavegroup(struct in6_multi *inm, /*const*/ struct in6_mfilter *imf) { int error; IN6_MULTI_LOCK(); error = in6_leavegroup_locked(inm, imf); IN6_MULTI_UNLOCK(); return (error); } /* * Leave a multicast group; real entry point. * All source filters will be expunged. * * Only preserves atomicity at inm level. * * Holding the write lock for the INP which contains imf * is highly advisable. We can't assert for it as imf does not * contain a back-pointer to the owning inp. * * Note: This is not the same as in6m_release(*) as this function also * makes a state change downcall into MLD. */ int in6_leavegroup_locked(struct in6_multi *inm, /*const*/ struct in6_mfilter *imf) { struct in6_mfilter timf; struct ifnet *ifp; int error; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif error = 0; IN6_MULTI_LOCK_ASSERT(); CTR5(KTR_MLD, "%s: leave inm %p, %s/%s, imf %p", __func__, inm, ip6_sprintf(ip6tbuf, &inm->in6m_addr), (in6m_is_ifp_detached(inm) ? "null" : if_name(inm->in6m_ifp)), imf); /* * If no imf was specified (i.e. kernel consumer), * fake one up and assume it is an ASM join. */ if (imf == NULL) { im6f_init(&timf, MCAST_EXCLUDE, MCAST_UNDEFINED); imf = &timf; } /* * Begin state merge transaction at MLD layer. * * As this particular invocation should not cause any memory * to be allocated, and there is no opportunity to roll back * the transaction, it MUST NOT fail. 
*/ ifp = inm->in6m_ifp; IN6_MULTI_LIST_LOCK(); CTR1(KTR_MLD, "%s: merge inm state", __func__); error = in6m_merge(inm, imf); KASSERT(error == 0, ("%s: failed to merge inm state", __func__)); CTR1(KTR_MLD, "%s: doing mld downcall", __func__); error = mld_change_state(inm, 0); if (error) CTR1(KTR_MLD, "%s: failed mld downcall", __func__); CTR2(KTR_MLD, "%s: dropping ref on %p", __func__, inm); if (ifp) - IF_ADDR_RLOCK(ifp); + IF_ADDR_WLOCK(ifp); in6m_release_deferred(inm); if (ifp) - IF_ADDR_RUNLOCK(ifp); + IF_ADDR_WUNLOCK(ifp); IN6_MULTI_LIST_UNLOCK(); return (error); } /* * Block or unblock an ASM multicast source on an inpcb. * This implements the delta-based API described in RFC 3678. * * The delta-based API applies only to exclusive-mode memberships. * An MLD downcall will be performed. * * SMPng: NOTE: Must take Giant as a join may create a new ifma. * * Return 0 if successful, otherwise return an appropriate error code. */ static int in6p_block_unblock_source(struct inpcb *inp, struct sockopt *sopt) { struct group_source_req gsr; sockunion_t *gsa, *ssa; struct ifnet *ifp; struct in6_mfilter *imf; struct ip6_moptions *imo; struct in6_msource *ims; struct in6_multi *inm; size_t idx; uint16_t fmode; int error, doblock; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif ifp = NULL; error = 0; doblock = 0; memset(&gsr, 0, sizeof(struct group_source_req)); gsa = (sockunion_t *)&gsr.gsr_group; ssa = (sockunion_t *)&gsr.gsr_source; switch (sopt->sopt_name) { case MCAST_BLOCK_SOURCE: case MCAST_UNBLOCK_SOURCE: error = sooptcopyin(sopt, &gsr, sizeof(struct group_source_req), sizeof(struct group_source_req)); if (error) return (error); if (gsa->sin6.sin6_family != AF_INET6 || gsa->sin6.sin6_len != sizeof(struct sockaddr_in6)) return (EINVAL); if (ssa->sin6.sin6_family != AF_INET6 || ssa->sin6.sin6_len != sizeof(struct sockaddr_in6)) return (EINVAL); if (gsr.gsr_interface == 0 || V_if_index < gsr.gsr_interface) return (EADDRNOTAVAIL); ifp = ifnet_byindex(gsr.gsr_interface); if (sopt->sopt_name == MCAST_BLOCK_SOURCE) doblock = 1; break; default: CTR2(KTR_MLD, "%s: unknown sopt_name %d", __func__, sopt->sopt_name); return (EOPNOTSUPP); break; } if (!IN6_IS_ADDR_MULTICAST(&gsa->sin6.sin6_addr)) return (EINVAL); (void)in6_setscope(&gsa->sin6.sin6_addr, ifp, NULL); /* * Check if we are actually a member of this group. */ imo = in6p_findmoptions(inp); idx = im6o_match_group(imo, ifp, &gsa->sa); if (idx == -1 || imo->im6o_mfilters == NULL) { error = EADDRNOTAVAIL; goto out_in6p_locked; } KASSERT(imo->im6o_mfilters != NULL, ("%s: im6o_mfilters not allocated", __func__)); imf = &imo->im6o_mfilters[idx]; inm = imo->im6o_membership[idx]; /* * Attempting to use the delta-based API on an * non exclusive-mode membership is an error. */ fmode = imf->im6f_st[0]; if (fmode != MCAST_EXCLUDE) { error = EINVAL; goto out_in6p_locked; } /* * Deal with error cases up-front: * Asked to block, but already blocked; or * Asked to unblock, but nothing to unblock. * If adding a new block entry, allocate it. */ ims = im6o_match_source(imo, idx, &ssa->sa); if ((ims != NULL && doblock) || (ims == NULL && !doblock)) { CTR3(KTR_MLD, "%s: source %s %spresent", __func__, ip6_sprintf(ip6tbuf, &ssa->sin6.sin6_addr), doblock ? "" : "not "); error = EADDRNOTAVAIL; goto out_in6p_locked; } INP_WLOCK_ASSERT(inp); /* * Begin state merge transaction at socket layer. 
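 *
 * The steps that follow are the usual pattern in this file: update the
 * per-socket filter first (im6f_graft() to record a blocked source,
 * im6f_prune() to remove one), then recompute the per-group state with
 * in6m_merge(), and finally let mld_change_state() issue the MLD
 * downcall.  On failure the socket-layer change is undone with
 * im6f_rollback(); on success it is made permanent with im6f_commit(),
 * and im6f_reap() then frees any source nodes left unreferenced.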
*/ if (doblock) { CTR2(KTR_MLD, "%s: %s source", __func__, "block"); ims = im6f_graft(imf, fmode, &ssa->sin6); if (ims == NULL) error = ENOMEM; } else { CTR2(KTR_MLD, "%s: %s source", __func__, "allow"); error = im6f_prune(imf, &ssa->sin6); } if (error) { CTR1(KTR_MLD, "%s: merge imf state failed", __func__); goto out_im6f_rollback; } /* * Begin state merge transaction at MLD layer. */ IN6_MULTI_LIST_LOCK(); CTR1(KTR_MLD, "%s: merge inm state", __func__); error = in6m_merge(inm, imf); if (error) CTR1(KTR_MLD, "%s: failed to merge inm state", __func__); else { CTR1(KTR_MLD, "%s: doing mld downcall", __func__); error = mld_change_state(inm, 0); if (error) CTR1(KTR_MLD, "%s: failed mld downcall", __func__); } IN6_MULTI_LIST_UNLOCK(); out_im6f_rollback: if (error) im6f_rollback(imf); else im6f_commit(imf); im6f_reap(imf); out_in6p_locked: INP_WUNLOCK(inp); return (error); } /* * Given an inpcb, return its multicast options structure pointer. Accepts * an unlocked inpcb pointer, but will return it locked. May sleep. * * SMPng: NOTE: Potentially calls malloc(M_WAITOK) with Giant held. * SMPng: NOTE: Returns with the INP write lock held. */ static struct ip6_moptions * in6p_findmoptions(struct inpcb *inp) { struct ip6_moptions *imo; struct in6_multi **immp; struct in6_mfilter *imfp; size_t idx; INP_WLOCK(inp); if (inp->in6p_moptions != NULL) return (inp->in6p_moptions); INP_WUNLOCK(inp); imo = malloc(sizeof(*imo), M_IP6MOPTS, M_WAITOK); immp = malloc(sizeof(*immp) * IPV6_MIN_MEMBERSHIPS, M_IP6MOPTS, M_WAITOK | M_ZERO); imfp = malloc(sizeof(struct in6_mfilter) * IPV6_MIN_MEMBERSHIPS, M_IN6MFILTER, M_WAITOK); imo->im6o_multicast_ifp = NULL; imo->im6o_multicast_hlim = V_ip6_defmcasthlim; imo->im6o_multicast_loop = in6_mcast_loop; imo->im6o_num_memberships = 0; imo->im6o_max_memberships = IPV6_MIN_MEMBERSHIPS; imo->im6o_membership = immp; /* Initialize per-group source filters. */ for (idx = 0; idx < IPV6_MIN_MEMBERSHIPS; idx++) im6f_init(&imfp[idx], MCAST_UNDEFINED, MCAST_EXCLUDE); imo->im6o_mfilters = imfp; INP_WLOCK(inp); if (inp->in6p_moptions != NULL) { free(imfp, M_IN6MFILTER); free(immp, M_IP6MOPTS); free(imo, M_IP6MOPTS); return (inp->in6p_moptions); } inp->in6p_moptions = imo; return (imo); } /* * Discard the IPv6 multicast options (and source filters). * * SMPng: NOTE: assumes INP write lock is held. */ void ip6_freemoptions(struct ip6_moptions *imo, struct inpcbinfo *pcbinfo) { struct in6_mfilter *imf; size_t idx, nmships; int wlock; if (imo == NULL) return; INP_INFO_LOCK_ASSERT(pcbinfo); wlock = INP_INFO_WLOCKED(pcbinfo); if (wlock) INP_INFO_WUNLOCK(pcbinfo); else INP_INFO_RUNLOCK(pcbinfo); nmships = imo->im6o_num_memberships; for (idx = 0; idx < nmships; ++idx) { imf = imo->im6o_mfilters ? &imo->im6o_mfilters[idx] : NULL; if (imf) im6f_leave(imf); /* XXX this will thrash the lock(s) */ (void)in6_leavegroup(imo->im6o_membership[idx], imf); if (imf) im6f_purge(imf); } if (imo->im6o_mfilters) free(imo->im6o_mfilters, M_IN6MFILTER); free(imo->im6o_membership, M_IP6MOPTS); free(imo, M_IP6MOPTS); if (wlock) INP_INFO_WLOCK(pcbinfo); else INP_INFO_RLOCK(pcbinfo); } /* * Atomically get source filters on a socket for an IPv6 multicast group. * Called with INP lock held; returns with lock released. 
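 *
 * Userland reaches this through getsockopt(IPV6_MSFILTER) carrying a
 * struct __msfilterreq, usually via the RFC 3678 library wrapper
 * getsourcefilter(3).  A rough sketch (illustrative only; see the
 * manual page for the exact prototype):
 *
 *	uint32_t fmode, nsrcs = 64;
 *	struct sockaddr_storage srcs[64];
 *	struct sockaddr_in6 grp;	(AF_INET6 group address)
 *
 *	if (getsourcefilter(s, ifindex, (struct sockaddr *)&grp,
 *	    sizeof(grp), &fmode, &nsrcs, srcs) == 0) {
 *		... fmode is the filter mode, nsrcs the source count ...
 *	}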
*/ static int in6p_get_source_filters(struct inpcb *inp, struct sockopt *sopt) { struct __msfilterreq msfr; sockunion_t *gsa; struct ifnet *ifp; struct ip6_moptions *imo; struct in6_mfilter *imf; struct ip6_msource *ims; struct in6_msource *lims; struct sockaddr_in6 *psin; struct sockaddr_storage *ptss; struct sockaddr_storage *tss; int error; size_t idx, nsrcs, ncsrcs; INP_WLOCK_ASSERT(inp); imo = inp->in6p_moptions; KASSERT(imo != NULL, ("%s: null ip6_moptions", __func__)); INP_WUNLOCK(inp); error = sooptcopyin(sopt, &msfr, sizeof(struct __msfilterreq), sizeof(struct __msfilterreq)); if (error) return (error); if (msfr.msfr_group.ss_family != AF_INET6 || msfr.msfr_group.ss_len != sizeof(struct sockaddr_in6)) return (EINVAL); gsa = (sockunion_t *)&msfr.msfr_group; if (!IN6_IS_ADDR_MULTICAST(&gsa->sin6.sin6_addr)) return (EINVAL); if (msfr.msfr_ifindex == 0 || V_if_index < msfr.msfr_ifindex) return (EADDRNOTAVAIL); ifp = ifnet_byindex(msfr.msfr_ifindex); if (ifp == NULL) return (EADDRNOTAVAIL); (void)in6_setscope(&gsa->sin6.sin6_addr, ifp, NULL); INP_WLOCK(inp); /* * Lookup group on the socket. */ idx = im6o_match_group(imo, ifp, &gsa->sa); if (idx == -1 || imo->im6o_mfilters == NULL) { INP_WUNLOCK(inp); return (EADDRNOTAVAIL); } imf = &imo->im6o_mfilters[idx]; /* * Ignore memberships which are in limbo. */ if (imf->im6f_st[1] == MCAST_UNDEFINED) { INP_WUNLOCK(inp); return (EAGAIN); } msfr.msfr_fmode = imf->im6f_st[1]; /* * If the user specified a buffer, copy out the source filter * entries to userland gracefully. * We only copy out the number of entries which userland * has asked for, but we always tell userland how big the * buffer really needs to be. */ if (msfr.msfr_nsrcs > in6_mcast_maxsocksrc) msfr.msfr_nsrcs = in6_mcast_maxsocksrc; tss = NULL; if (msfr.msfr_srcs != NULL && msfr.msfr_nsrcs > 0) { tss = malloc(sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs, M_TEMP, M_NOWAIT | M_ZERO); if (tss == NULL) { INP_WUNLOCK(inp); return (ENOBUFS); } } /* * Count number of sources in-mode at t0. * If buffer space exists and remains, copy out source entries. */ nsrcs = msfr.msfr_nsrcs; ncsrcs = 0; ptss = tss; RB_FOREACH(ims, ip6_msource_tree, &imf->im6f_sources) { lims = (struct in6_msource *)ims; if (lims->im6sl_st[0] == MCAST_UNDEFINED || lims->im6sl_st[0] != imf->im6f_st[0]) continue; ++ncsrcs; if (tss != NULL && nsrcs > 0) { psin = (struct sockaddr_in6 *)ptss; psin->sin6_family = AF_INET6; psin->sin6_len = sizeof(struct sockaddr_in6); psin->sin6_addr = lims->im6s_addr; psin->sin6_port = 0; --nsrcs; ++ptss; } } INP_WUNLOCK(inp); if (tss != NULL) { error = copyout(tss, msfr.msfr_srcs, sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs); free(tss, M_TEMP); if (error) return (error); } msfr.msfr_nsrcs = ncsrcs; error = sooptcopyout(sopt, &msfr, sizeof(struct __msfilterreq)); return (error); } /* * Return the IP multicast options in response to user getsockopt(). */ int ip6_getmoptions(struct inpcb *inp, struct sockopt *sopt) { struct ip6_moptions *im6o; int error; u_int optval; INP_WLOCK(inp); im6o = inp->in6p_moptions; /* * If socket is neither of type SOCK_RAW or SOCK_DGRAM, * or is a divert socket, reject it. 
*/ if (inp->inp_socket->so_proto->pr_protocol == IPPROTO_DIVERT || (inp->inp_socket->so_proto->pr_type != SOCK_RAW && inp->inp_socket->so_proto->pr_type != SOCK_DGRAM)) { INP_WUNLOCK(inp); return (EOPNOTSUPP); } error = 0; switch (sopt->sopt_name) { case IPV6_MULTICAST_IF: if (im6o == NULL || im6o->im6o_multicast_ifp == NULL) { optval = 0; } else { optval = im6o->im6o_multicast_ifp->if_index; } INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof(u_int)); break; case IPV6_MULTICAST_HOPS: if (im6o == NULL) optval = V_ip6_defmcasthlim; else optval = im6o->im6o_multicast_hlim; INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof(u_int)); break; case IPV6_MULTICAST_LOOP: if (im6o == NULL) optval = in6_mcast_loop; /* XXX VIMAGE */ else optval = im6o->im6o_multicast_loop; INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof(u_int)); break; case IPV6_MSFILTER: if (im6o == NULL) { error = EADDRNOTAVAIL; INP_WUNLOCK(inp); } else { error = in6p_get_source_filters(inp, sopt); } break; default: INP_WUNLOCK(inp); error = ENOPROTOOPT; break; } INP_UNLOCK_ASSERT(inp); return (error); } /* * Look up the ifnet to use for a multicast group membership, * given the address of an IPv6 group. * * This routine exists to support legacy IPv6 multicast applications. * * If inp is non-NULL, use this socket's current FIB number for any * required FIB lookup. Look up the group address in the unicast FIB, * and use its ifp; usually, this points to the default next-hop. * If the FIB lookup fails, return NULL. * * FUTURE: Support multiple forwarding tables for IPv6. * * Returns NULL if no ifp could be found. */ static struct ifnet * in6p_lookup_mcast_ifp(const struct inpcb *in6p, const struct sockaddr_in6 *gsin6) { struct nhop6_basic nh6; struct in6_addr dst; uint32_t scopeid; uint32_t fibnum; KASSERT(in6p->inp_vflag & INP_IPV6, ("%s: not INP_IPV6 inpcb", __func__)); KASSERT(gsin6->sin6_family == AF_INET6, ("%s: not AF_INET6 group", __func__)); in6_splitscope(&gsin6->sin6_addr, &dst, &scopeid); fibnum = in6p ? in6p->inp_inc.inc_fibnum : RT_DEFAULT_FIB; if (fib6_lookup_nh_basic(fibnum, &dst, scopeid, 0, 0, &nh6) != 0) return (NULL); return (nh6.nh_ifp); } /* * Join an IPv6 multicast group, possibly with a source. * * FIXME: The KAME use of the unspecified address (::) * to join *all* multicast groups is currently unsupported. */ static int in6p_join_group(struct inpcb *inp, struct sockopt *sopt) { struct group_source_req gsr; sockunion_t *gsa, *ssa; struct ifnet *ifp; struct in6_mfilter *imf; struct ip6_moptions *imo; struct in6_multi *inm; struct in6_msource *lims; size_t idx; int error, is_new; ifp = NULL; imf = NULL; lims = NULL; error = 0; is_new = 0; memset(&gsr, 0, sizeof(struct group_source_req)); gsa = (sockunion_t *)&gsr.gsr_group; gsa->ss.ss_family = AF_UNSPEC; ssa = (sockunion_t *)&gsr.gsr_source; ssa->ss.ss_family = AF_UNSPEC; /* * Chew everything into struct group_source_req. * Overwrite the port field if present, as the sockaddr * being copied in may be matched with a binary comparison. * Ignore passed-in scope ID. 
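 *
 * For reference, a protocol-independent join from userland looks
 * roughly like this (illustrative sketch, not part of this change);
 * it is the layout the MCAST_JOIN_GROUP case below copies in with
 * sooptcopyin():
 *
 *	struct group_req greq;
 *	struct sockaddr_in6 *gp = (struct sockaddr_in6 *)&greq.gr_group;
 *
 *	memset(&greq, 0, sizeof(greq));
 *	greq.gr_interface = ifindex;
 *	gp->sin6_family = AF_INET6;
 *	gp->sin6_len = sizeof(*gp);
 *	gp->sin6_addr = group;
 *	setsockopt(s, IPPROTO_IPV6, MCAST_JOIN_GROUP, &greq, sizeof(greq));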
*/ switch (sopt->sopt_name) { case IPV6_JOIN_GROUP: { struct ipv6_mreq mreq; error = sooptcopyin(sopt, &mreq, sizeof(struct ipv6_mreq), sizeof(struct ipv6_mreq)); if (error) return (error); gsa->sin6.sin6_family = AF_INET6; gsa->sin6.sin6_len = sizeof(struct sockaddr_in6); gsa->sin6.sin6_addr = mreq.ipv6mr_multiaddr; if (mreq.ipv6mr_interface == 0) { ifp = in6p_lookup_mcast_ifp(inp, &gsa->sin6); } else { if (V_if_index < mreq.ipv6mr_interface) return (EADDRNOTAVAIL); ifp = ifnet_byindex(mreq.ipv6mr_interface); } CTR3(KTR_MLD, "%s: ipv6mr_interface = %d, ifp = %p", __func__, mreq.ipv6mr_interface, ifp); } break; case MCAST_JOIN_GROUP: case MCAST_JOIN_SOURCE_GROUP: if (sopt->sopt_name == MCAST_JOIN_GROUP) { error = sooptcopyin(sopt, &gsr, sizeof(struct group_req), sizeof(struct group_req)); } else if (sopt->sopt_name == MCAST_JOIN_SOURCE_GROUP) { error = sooptcopyin(sopt, &gsr, sizeof(struct group_source_req), sizeof(struct group_source_req)); } if (error) return (error); if (gsa->sin6.sin6_family != AF_INET6 || gsa->sin6.sin6_len != sizeof(struct sockaddr_in6)) return (EINVAL); if (sopt->sopt_name == MCAST_JOIN_SOURCE_GROUP) { if (ssa->sin6.sin6_family != AF_INET6 || ssa->sin6.sin6_len != sizeof(struct sockaddr_in6)) return (EINVAL); if (IN6_IS_ADDR_MULTICAST(&ssa->sin6.sin6_addr)) return (EINVAL); /* * TODO: Validate embedded scope ID in source * list entry against passed-in ifp, if and only * if source list filter entry is iface or node local. */ in6_clearscope(&ssa->sin6.sin6_addr); ssa->sin6.sin6_port = 0; ssa->sin6.sin6_scope_id = 0; } if (gsr.gsr_interface == 0 || V_if_index < gsr.gsr_interface) return (EADDRNOTAVAIL); ifp = ifnet_byindex(gsr.gsr_interface); break; default: CTR2(KTR_MLD, "%s: unknown sopt_name %d", __func__, sopt->sopt_name); return (EOPNOTSUPP); break; } if (!IN6_IS_ADDR_MULTICAST(&gsa->sin6.sin6_addr)) return (EINVAL); if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) return (EADDRNOTAVAIL); gsa->sin6.sin6_port = 0; gsa->sin6.sin6_scope_id = 0; /* * Always set the scope zone ID on memberships created from userland. * Use the passed-in ifp to do this. * XXX The in6_setscope() return value is meaningless. * XXX SCOPE6_LOCK() is taken by in6_setscope(). */ (void)in6_setscope(&gsa->sin6.sin6_addr, ifp, NULL); imo = in6p_findmoptions(inp); idx = im6o_match_group(imo, ifp, &gsa->sa); if (idx == -1) { is_new = 1; } else { inm = imo->im6o_membership[idx]; imf = &imo->im6o_mfilters[idx]; if (ssa->ss.ss_family != AF_UNSPEC) { /* * MCAST_JOIN_SOURCE_GROUP on an exclusive membership * is an error. On an existing inclusive membership, * it just adds the source to the filter list. */ if (imf->im6f_st[1] != MCAST_INCLUDE) { error = EINVAL; goto out_in6p_locked; } /* * Throw out duplicates. * * XXX FIXME: This makes a naive assumption that * even if entries exist for *ssa in this imf, * they will be rejected as dupes, even if they * are not valid in the current mode (in-mode). * * in6_msource is transactioned just as for anything * else in SSM -- but note naive use of in6m_graft() * below for allocating new filter entries. * * This is only an issue if someone mixes the * full-state SSM API with the delta-based API, * which is discouraged in the relevant RFCs. */ lims = im6o_match_source(imo, idx, &ssa->sa); if (lims != NULL /*&& lims->im6sl_st[1] == MCAST_INCLUDE*/) { error = EADDRNOTAVAIL; goto out_in6p_locked; } } else { /* * MCAST_JOIN_GROUP alone, on any existing membership, * is rejected, to stop the same inpcb tying up * multiple refs to the in_multi. 
* On an existing inclusive membership, this is also * an error; if you want to change filter mode, * you must use the userland API setsourcefilter(). * XXX We don't reject this for imf in UNDEFINED * state at t1, because allocation of a filter * is atomic with allocation of a membership. */ error = EINVAL; goto out_in6p_locked; } } /* * Begin state merge transaction at socket layer. */ INP_WLOCK_ASSERT(inp); if (is_new) { if (imo->im6o_num_memberships == imo->im6o_max_memberships) { error = im6o_grow(imo); if (error) goto out_in6p_locked; } /* * Allocate the new slot upfront so we can deal with * grafting the new source filter in same code path * as for join-source on existing membership. */ idx = imo->im6o_num_memberships; imo->im6o_membership[idx] = NULL; imo->im6o_num_memberships++; KASSERT(imo->im6o_mfilters != NULL, ("%s: im6f_mfilters vector was not allocated", __func__)); imf = &imo->im6o_mfilters[idx]; KASSERT(RB_EMPTY(&imf->im6f_sources), ("%s: im6f_sources not empty", __func__)); } /* * Graft new source into filter list for this inpcb's * membership of the group. The in6_multi may not have * been allocated yet if this is a new membership, however, * the in_mfilter slot will be allocated and must be initialized. * * Note: Grafting of exclusive mode filters doesn't happen * in this path. * XXX: Should check for non-NULL lims (node exists but may * not be in-mode) for interop with full-state API. */ if (ssa->ss.ss_family != AF_UNSPEC) { /* Membership starts in IN mode */ if (is_new) { CTR1(KTR_MLD, "%s: new join w/source", __func__); im6f_init(imf, MCAST_UNDEFINED, MCAST_INCLUDE); } else { CTR2(KTR_MLD, "%s: %s source", __func__, "allow"); } lims = im6f_graft(imf, MCAST_INCLUDE, &ssa->sin6); if (lims == NULL) { CTR1(KTR_MLD, "%s: merge imf state failed", __func__); error = ENOMEM; goto out_im6o_free; } } else { /* No address specified; Membership starts in EX mode */ if (is_new) { CTR1(KTR_MLD, "%s: new join w/o source", __func__); im6f_init(imf, MCAST_UNDEFINED, MCAST_EXCLUDE); } } /* * Begin state merge transaction at MLD layer. */ in_pcbref(inp); INP_WUNLOCK(inp); IN6_MULTI_LOCK(); if (is_new) { error = in6_joingroup_locked(ifp, &gsa->sin6.sin6_addr, imf, &inm, 0); if (error) { IN6_MULTI_UNLOCK(); goto out_im6o_free; } imo->im6o_membership[idx] = inm; } else { CTR1(KTR_MLD, "%s: merge inm state", __func__); IN6_MULTI_LIST_LOCK(); error = in6m_merge(inm, imf); if (error) CTR1(KTR_MLD, "%s: failed to merge inm state", __func__); else { CTR1(KTR_MLD, "%s: doing mld downcall", __func__); error = mld_change_state(inm, 0); if (error) CTR1(KTR_MLD, "%s: failed mld downcall", __func__); } IN6_MULTI_LIST_UNLOCK(); } IN6_MULTI_UNLOCK(); INP_WLOCK(inp); if (in_pcbrele_wlocked(inp)) return (ENXIO); if (error) { im6f_rollback(imf); if (is_new) im6f_purge(imf); else im6f_reap(imf); } else { im6f_commit(imf); } out_im6o_free: if (error && is_new) { imo->im6o_membership[idx] = NULL; --imo->im6o_num_memberships; } out_in6p_locked: INP_WUNLOCK(inp); return (error); } /* * Leave an IPv6 multicast group on an inpcb, possibly with a source. 
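 *
 * The source-specific form corresponds to MCAST_LEAVE_SOURCE_GROUP;
 * leaving the group outright uses MCAST_LEAVE_GROUP (struct group_req)
 * or the legacy IPV6_LEAVE_GROUP (struct ipv6_mreq).  A rough userland
 * sketch of the source-specific case (illustrative only):
 *
 *	struct group_source_req gsr;
 *	struct sockaddr_in6 *g = (struct sockaddr_in6 *)&gsr.gsr_group;
 *	struct sockaddr_in6 *s6 = (struct sockaddr_in6 *)&gsr.gsr_source;
 *
 *	memset(&gsr, 0, sizeof(gsr));
 *	gsr.gsr_interface = ifindex;
 *	g->sin6_family = s6->sin6_family = AF_INET6;
 *	g->sin6_len = s6->sin6_len = sizeof(struct sockaddr_in6);
 *	g->sin6_addr = group;
 *	s6->sin6_addr = source;
 *	setsockopt(s, IPPROTO_IPV6, MCAST_LEAVE_SOURCE_GROUP,
 *	    &gsr, sizeof(gsr));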
*/ static int in6p_leave_group(struct inpcb *inp, struct sockopt *sopt) { struct ipv6_mreq mreq; struct group_source_req gsr; sockunion_t *gsa, *ssa; struct ifnet *ifp; struct in6_mfilter *imf; struct ip6_moptions *imo; struct in6_msource *ims; struct in6_multi *inm; uint32_t ifindex; size_t idx; int error, is_final; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif ifp = NULL; ifindex = 0; error = 0; is_final = 1; memset(&gsr, 0, sizeof(struct group_source_req)); gsa = (sockunion_t *)&gsr.gsr_group; gsa->ss.ss_family = AF_UNSPEC; ssa = (sockunion_t *)&gsr.gsr_source; ssa->ss.ss_family = AF_UNSPEC; /* * Chew everything passed in up into a struct group_source_req * as that is easier to process. * Note: Any embedded scope ID in the multicast group passed * in by userland is ignored, the interface index is the recommended * mechanism to specify an interface; see below. */ switch (sopt->sopt_name) { case IPV6_LEAVE_GROUP: error = sooptcopyin(sopt, &mreq, sizeof(struct ipv6_mreq), sizeof(struct ipv6_mreq)); if (error) return (error); gsa->sin6.sin6_family = AF_INET6; gsa->sin6.sin6_len = sizeof(struct sockaddr_in6); gsa->sin6.sin6_addr = mreq.ipv6mr_multiaddr; gsa->sin6.sin6_port = 0; gsa->sin6.sin6_scope_id = 0; ifindex = mreq.ipv6mr_interface; break; case MCAST_LEAVE_GROUP: case MCAST_LEAVE_SOURCE_GROUP: if (sopt->sopt_name == MCAST_LEAVE_GROUP) { error = sooptcopyin(sopt, &gsr, sizeof(struct group_req), sizeof(struct group_req)); } else if (sopt->sopt_name == MCAST_LEAVE_SOURCE_GROUP) { error = sooptcopyin(sopt, &gsr, sizeof(struct group_source_req), sizeof(struct group_source_req)); } if (error) return (error); if (gsa->sin6.sin6_family != AF_INET6 || gsa->sin6.sin6_len != sizeof(struct sockaddr_in6)) return (EINVAL); if (sopt->sopt_name == MCAST_LEAVE_SOURCE_GROUP) { if (ssa->sin6.sin6_family != AF_INET6 || ssa->sin6.sin6_len != sizeof(struct sockaddr_in6)) return (EINVAL); if (IN6_IS_ADDR_MULTICAST(&ssa->sin6.sin6_addr)) return (EINVAL); /* * TODO: Validate embedded scope ID in source * list entry against passed-in ifp, if and only * if source list filter entry is iface or node local. */ in6_clearscope(&ssa->sin6.sin6_addr); } gsa->sin6.sin6_port = 0; gsa->sin6.sin6_scope_id = 0; ifindex = gsr.gsr_interface; break; default: CTR2(KTR_MLD, "%s: unknown sopt_name %d", __func__, sopt->sopt_name); return (EOPNOTSUPP); break; } if (!IN6_IS_ADDR_MULTICAST(&gsa->sin6.sin6_addr)) return (EINVAL); /* * Validate interface index if provided. If no interface index * was provided separately, attempt to look the membership up * from the default scope as a last resort to disambiguate * the membership we are being asked to leave. * XXX SCOPE6 lock potentially taken here. */ if (ifindex != 0) { if (V_if_index < ifindex) return (EADDRNOTAVAIL); ifp = ifnet_byindex(ifindex); if (ifp == NULL) return (EADDRNOTAVAIL); (void)in6_setscope(&gsa->sin6.sin6_addr, ifp, NULL); } else { error = sa6_embedscope(&gsa->sin6, V_ip6_use_defzone); if (error) return (EADDRNOTAVAIL); /* * Some badly behaved applications don't pass an ifindex * or a scope ID, which is an API violation. In this case, * perform a lookup as per a v6 join. * * XXX For now, stomp on zone ID for the corner case. * This is not the 'KAME way', but we need to see the ifp * directly until such time as this implementation is * refactored, assuming the scope IDs are the way to go. 
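 *
 * (sa6_embedscope() above stores the zone ID in the second 16-bit word
 * of a link- or interface-local address, which is why the code below
 * can recover a usable ifindex from s6_addr16[1]; compare the
 * scope-zone KASSERT in in6_joingroup_locked().)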
*/ ifindex = ntohs(gsa->sin6.sin6_addr.s6_addr16[1]); if (ifindex == 0) { CTR2(KTR_MLD, "%s: warning: no ifindex, looking up " "ifp for group %s.", __func__, ip6_sprintf(ip6tbuf, &gsa->sin6.sin6_addr)); ifp = in6p_lookup_mcast_ifp(inp, &gsa->sin6); } else { ifp = ifnet_byindex(ifindex); } if (ifp == NULL) return (EADDRNOTAVAIL); } CTR2(KTR_MLD, "%s: ifp = %p", __func__, ifp); KASSERT(ifp != NULL, ("%s: ifp did not resolve", __func__)); /* * Find the membership in the membership array. */ imo = in6p_findmoptions(inp); idx = im6o_match_group(imo, ifp, &gsa->sa); if (idx == -1) { error = EADDRNOTAVAIL; goto out_in6p_locked; } inm = imo->im6o_membership[idx]; imf = &imo->im6o_mfilters[idx]; if (ssa->ss.ss_family != AF_UNSPEC) is_final = 0; /* * Begin state merge transaction at socket layer. */ INP_WLOCK_ASSERT(inp); /* * If we were instructed only to leave a given source, do so. * MCAST_LEAVE_SOURCE_GROUP is only valid for inclusive memberships. */ if (is_final) { im6f_leave(imf); } else { if (imf->im6f_st[0] == MCAST_EXCLUDE) { error = EADDRNOTAVAIL; goto out_in6p_locked; } ims = im6o_match_source(imo, idx, &ssa->sa); if (ims == NULL) { CTR3(KTR_MLD, "%s: source %p %spresent", __func__, ip6_sprintf(ip6tbuf, &ssa->sin6.sin6_addr), "not "); error = EADDRNOTAVAIL; goto out_in6p_locked; } CTR2(KTR_MLD, "%s: %s source", __func__, "block"); error = im6f_prune(imf, &ssa->sin6); if (error) { CTR1(KTR_MLD, "%s: merge imf state failed", __func__); goto out_in6p_locked; } } /* * Begin state merge transaction at MLD layer. */ IN6_MULTI_LOCK(); if (is_final) { /* * Give up the multicast address record to which * the membership points. */ (void)in6_leavegroup_locked(inm, imf); } else { CTR1(KTR_MLD, "%s: merge inm state", __func__); IN6_MULTI_LIST_LOCK(); error = in6m_merge(inm, imf); if (error) CTR1(KTR_MLD, "%s: failed to merge inm state", __func__); else { CTR1(KTR_MLD, "%s: doing mld downcall", __func__); error = mld_change_state(inm, 0); if (error) CTR1(KTR_MLD, "%s: failed mld downcall", __func__); } IN6_MULTI_LIST_UNLOCK(); } IN6_MULTI_UNLOCK(); if (error) im6f_rollback(imf); else im6f_commit(imf); im6f_reap(imf); if (is_final) { /* Remove the gap in the membership array. */ for (++idx; idx < imo->im6o_num_memberships; ++idx) { imo->im6o_membership[idx-1] = imo->im6o_membership[idx]; imo->im6o_mfilters[idx-1] = imo->im6o_mfilters[idx]; } imo->im6o_num_memberships--; } out_in6p_locked: INP_WUNLOCK(inp); return (error); } /* * Select the interface for transmitting IPv6 multicast datagrams. * * Either an instance of struct in6_addr or an instance of struct ipv6_mreqn * may be passed to this socket option. An address of in6addr_any or an * interface index of 0 is used to remove a previous selection. * When no interface is selected, one is chosen for every send. */ static int in6p_set_multicast_if(struct inpcb *inp, struct sockopt *sopt) { struct ifnet *ifp; struct ip6_moptions *imo; u_int ifindex; int error; if (sopt->sopt_valsize != sizeof(u_int)) return (EINVAL); error = sooptcopyin(sopt, &ifindex, sizeof(u_int), sizeof(u_int)); if (error) return (error); if (V_if_index < ifindex) return (EINVAL); if (ifindex == 0) ifp = NULL; else { ifp = ifnet_byindex(ifindex); if (ifp == NULL) return (EINVAL); if ((ifp->if_flags & IFF_MULTICAST) == 0) return (EADDRNOTAVAIL); } imo = in6p_findmoptions(inp); imo->im6o_multicast_ifp = ifp; INP_WUNLOCK(inp); return (0); } /* * Atomically set source filters on a socket for an IPv6 multicast group. * * SMPng: NOTE: Potentially calls malloc(M_WAITOK) with Giant held. 
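 *
 * This is the full-state RFC 3678 API (IPV6_MSFILTER), normally driven
 * through the library wrapper setsourcefilter(3), in contrast to the
 * delta-based MCAST_BLOCK_SOURCE/MCAST_UNBLOCK_SOURCE path above.  A
 * rough userland sketch (illustrative only; see the manual page for
 * the exact prototype):
 *
 *	struct sockaddr_storage srcs[2];	(two AF_INET6 sources)
 *	struct sockaddr_in6 grp;		(the AF_INET6 group)
 *
 *	setsourcefilter(s, ifindex, (struct sockaddr *)&grp, sizeof(grp),
 *	    MCAST_INCLUDE, 2, srcs);
 *
 * replaces the membership's entire source filter in a single call.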
*/ static int in6p_set_source_filters(struct inpcb *inp, struct sockopt *sopt) { struct __msfilterreq msfr; sockunion_t *gsa; struct ifnet *ifp; struct in6_mfilter *imf; struct ip6_moptions *imo; struct in6_multi *inm; size_t idx; int error; error = sooptcopyin(sopt, &msfr, sizeof(struct __msfilterreq), sizeof(struct __msfilterreq)); if (error) return (error); if (msfr.msfr_nsrcs > in6_mcast_maxsocksrc) return (ENOBUFS); if (msfr.msfr_fmode != MCAST_EXCLUDE && msfr.msfr_fmode != MCAST_INCLUDE) return (EINVAL); if (msfr.msfr_group.ss_family != AF_INET6 || msfr.msfr_group.ss_len != sizeof(struct sockaddr_in6)) return (EINVAL); gsa = (sockunion_t *)&msfr.msfr_group; if (!IN6_IS_ADDR_MULTICAST(&gsa->sin6.sin6_addr)) return (EINVAL); gsa->sin6.sin6_port = 0; /* ignore port */ if (msfr.msfr_ifindex == 0 || V_if_index < msfr.msfr_ifindex) return (EADDRNOTAVAIL); ifp = ifnet_byindex(msfr.msfr_ifindex); if (ifp == NULL) return (EADDRNOTAVAIL); (void)in6_setscope(&gsa->sin6.sin6_addr, ifp, NULL); /* * Take the INP write lock. * Check if this socket is a member of this group. */ imo = in6p_findmoptions(inp); idx = im6o_match_group(imo, ifp, &gsa->sa); if (idx == -1 || imo->im6o_mfilters == NULL) { error = EADDRNOTAVAIL; goto out_in6p_locked; } inm = imo->im6o_membership[idx]; imf = &imo->im6o_mfilters[idx]; /* * Begin state merge transaction at socket layer. */ INP_WLOCK_ASSERT(inp); imf->im6f_st[1] = msfr.msfr_fmode; /* * Apply any new source filters, if present. * Make a copy of the user-space source vector so * that we may copy them with a single copyin. This * allows us to deal with page faults up-front. */ if (msfr.msfr_nsrcs > 0) { struct in6_msource *lims; struct sockaddr_in6 *psin; struct sockaddr_storage *kss, *pkss; int i; INP_WUNLOCK(inp); CTR2(KTR_MLD, "%s: loading %lu source list entries", __func__, (unsigned long)msfr.msfr_nsrcs); kss = malloc(sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs, M_TEMP, M_WAITOK); error = copyin(msfr.msfr_srcs, kss, sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs); if (error) { free(kss, M_TEMP); return (error); } INP_WLOCK(inp); /* * Mark all source filters as UNDEFINED at t1. * Restore new group filter mode, as im6f_leave() * will set it to INCLUDE. */ im6f_leave(imf); imf->im6f_st[1] = msfr.msfr_fmode; /* * Update socket layer filters at t1, lazy-allocating * new entries. This saves a bunch of memory at the * cost of one RB_FIND() per source entry; duplicate * entries in the msfr_nsrcs vector are ignored. * If we encounter an error, rollback transaction. * * XXX This too could be replaced with a set-symmetric * difference like loop to avoid walking from root * every time, as the key space is common. */ for (i = 0, pkss = kss; i < msfr.msfr_nsrcs; i++, pkss++) { psin = (struct sockaddr_in6 *)pkss; if (psin->sin6_family != AF_INET6) { error = EAFNOSUPPORT; break; } if (psin->sin6_len != sizeof(struct sockaddr_in6)) { error = EINVAL; break; } if (IN6_IS_ADDR_MULTICAST(&psin->sin6_addr)) { error = EINVAL; break; } /* * TODO: Validate embedded scope ID in source * list entry against passed-in ifp, if and only * if source list filter entry is iface or node local. */ in6_clearscope(&psin->sin6_addr); error = im6f_get_source(imf, psin, &lims); if (error) break; lims->im6sl_st[1] = imf->im6f_st[1]; } free(kss, M_TEMP); } if (error) goto out_im6f_rollback; INP_WLOCK_ASSERT(inp); IN6_MULTI_LIST_LOCK(); /* * Begin state merge transaction at MLD layer. 
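 *
 * in6m_merge() folds this socket's updated filter into the per-group
 * counters, recomputing the group-wide INCLUDE/EXCLUDE mode, and
 * mld_change_state() turns any resulting difference into an MLDv2
 * state-change report.  As elsewhere, a failure here rolls the
 * socket-layer filter back below, while success commits it.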
*/ CTR1(KTR_MLD, "%s: merge inm state", __func__); error = in6m_merge(inm, imf); if (error) CTR1(KTR_MLD, "%s: failed to merge inm state", __func__); else { CTR1(KTR_MLD, "%s: doing mld downcall", __func__); error = mld_change_state(inm, 0); if (error) CTR1(KTR_MLD, "%s: failed mld downcall", __func__); } IN6_MULTI_LIST_UNLOCK(); out_im6f_rollback: if (error) im6f_rollback(imf); else im6f_commit(imf); im6f_reap(imf); out_in6p_locked: INP_WUNLOCK(inp); return (error); } /* * Set the IP multicast options in response to user setsockopt(). * * Many of the socket options handled in this function duplicate the * functionality of socket options in the regular unicast API. However, * it is not possible to merge the duplicate code, because the idempotence * of the IPv6 multicast part of the BSD Sockets API must be preserved; * the effects of these options must be treated as separate and distinct. * * SMPng: XXX: Unlocked read of inp_socket believed OK. */ int ip6_setmoptions(struct inpcb *inp, struct sockopt *sopt) { struct ip6_moptions *im6o; int error; error = 0; /* * If socket is neither of type SOCK_RAW or SOCK_DGRAM, * or is a divert socket, reject it. */ if (inp->inp_socket->so_proto->pr_protocol == IPPROTO_DIVERT || (inp->inp_socket->so_proto->pr_type != SOCK_RAW && inp->inp_socket->so_proto->pr_type != SOCK_DGRAM)) return (EOPNOTSUPP); switch (sopt->sopt_name) { case IPV6_MULTICAST_IF: error = in6p_set_multicast_if(inp, sopt); break; case IPV6_MULTICAST_HOPS: { int hlim; if (sopt->sopt_valsize != sizeof(int)) { error = EINVAL; break; } error = sooptcopyin(sopt, &hlim, sizeof(hlim), sizeof(int)); if (error) break; if (hlim < -1 || hlim > 255) { error = EINVAL; break; } else if (hlim == -1) { hlim = V_ip6_defmcasthlim; } im6o = in6p_findmoptions(inp); im6o->im6o_multicast_hlim = hlim; INP_WUNLOCK(inp); break; } case IPV6_MULTICAST_LOOP: { u_int loop; /* * Set the loopback flag for outgoing multicast packets. * Must be zero or one. */ if (sopt->sopt_valsize != sizeof(u_int)) { error = EINVAL; break; } error = sooptcopyin(sopt, &loop, sizeof(u_int), sizeof(u_int)); if (error) break; if (loop > 1) { error = EINVAL; break; } im6o = in6p_findmoptions(inp); im6o->im6o_multicast_loop = loop; INP_WUNLOCK(inp); break; } case IPV6_JOIN_GROUP: case MCAST_JOIN_GROUP: case MCAST_JOIN_SOURCE_GROUP: error = in6p_join_group(inp, sopt); break; case IPV6_LEAVE_GROUP: case MCAST_LEAVE_GROUP: case MCAST_LEAVE_SOURCE_GROUP: error = in6p_leave_group(inp, sopt); break; case MCAST_BLOCK_SOURCE: case MCAST_UNBLOCK_SOURCE: error = in6p_block_unblock_source(inp, sopt); break; case IPV6_MSFILTER: error = in6p_set_source_filters(inp, sopt); break; default: error = EOPNOTSUPP; break; } INP_UNLOCK_ASSERT(inp); return (error); } /* * Expose MLD's multicast filter mode and source list(s) to userland, * keyed by (ifindex, group). * The filter mode is written out as a uint32_t, followed by * 0..n of struct in6_addr. * For use by ifmcstat(8). * SMPng: NOTE: unlocked read of ifindex space. 
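 *
 * The request appends five ints to the node's OID: the ifindex
 * followed by the four 32-bit words of the group address, as checked
 * against namelen below.  The reply for the matching group is then
 * laid out as
 *
 *	uint32_t		filter mode (MCAST_INCLUDE/MCAST_EXCLUDE)
 *	struct in6_addr[0..n]	the sources currently in-mode
 *
 * which is the stream a consumer such as ifmcstat(8) walks.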
*/ static int sysctl_ip6_mcast_filters(SYSCTL_HANDLER_ARGS) { struct in6_addr mcaddr; struct in6_addr src; struct ifnet *ifp; struct ifmultiaddr *ifma; struct in6_multi *inm; struct ip6_msource *ims; int *name; int retval; u_int namelen; uint32_t fmode, ifindex; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif name = (int *)arg1; namelen = arg2; if (req->newptr != NULL) return (EPERM); /* int: ifindex + 4 * 32 bits of IPv6 address */ if (namelen != 5) return (EINVAL); ifindex = name[0]; if (ifindex <= 0 || ifindex > V_if_index) { CTR2(KTR_MLD, "%s: ifindex %u out of range", __func__, ifindex); return (ENOENT); } memcpy(&mcaddr, &name[1], sizeof(struct in6_addr)); if (!IN6_IS_ADDR_MULTICAST(&mcaddr)) { CTR2(KTR_MLD, "%s: group %s is not multicast", __func__, ip6_sprintf(ip6tbuf, &mcaddr)); return (EINVAL); } ifp = ifnet_byindex(ifindex); if (ifp == NULL) { CTR2(KTR_MLD, "%s: no ifp for ifindex %u", __func__, ifindex); return (ENOENT); } /* * Internal MLD lookups require that scope/zone ID is set. */ (void)in6_setscope(&mcaddr, ifp, NULL); retval = sysctl_wire_old_buffer(req, sizeof(uint32_t) + (in6_mcast_maxgrpsrc * sizeof(struct in6_addr))); if (retval) return (retval); IN6_MULTI_LOCK(); IN6_MULTI_LIST_LOCK(); IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET6 || ifma->ifma_protospec == NULL) continue; inm = (struct in6_multi *)ifma->ifma_protospec; if (!IN6_ARE_ADDR_EQUAL(&inm->in6m_addr, &mcaddr)) continue; fmode = inm->in6m_st[1].iss_fmode; retval = SYSCTL_OUT(req, &fmode, sizeof(uint32_t)); if (retval != 0) break; RB_FOREACH(ims, ip6_msource_tree, &inm->in6m_srcs) { CTR2(KTR_MLD, "%s: visit node %p", __func__, ims); /* * Only copy-out sources which are in-mode. */ if (fmode != im6s_get_mode(inm, ims, 1)) { CTR1(KTR_MLD, "%s: skip non-in-mode", __func__); continue; } src = ims->im6s_addr; retval = SYSCTL_OUT(req, &src, sizeof(struct in6_addr)); if (retval != 0) break; } } IF_ADDR_RUNLOCK(ifp); IN6_MULTI_LIST_UNLOCK(); IN6_MULTI_UNLOCK(); return (retval); } #ifdef KTR static const char *in6m_modestrs[] = { "un", "in", "ex" }; static const char * in6m_mode_str(const int mode) { if (mode >= MCAST_UNDEFINED && mode <= MCAST_EXCLUDE) return (in6m_modestrs[mode]); return ("??"); } static const char *in6m_statestrs[] = { "not-member", "silent", "idle", "lazy", "sleeping", "awakening", "query-pending", "sg-query-pending", "leaving" }; static const char * in6m_state_str(const int state) { if (state >= MLD_NOT_MEMBER && state <= MLD_LEAVING_MEMBER) return (in6m_statestrs[state]); return ("??"); } /* * Dump an in6_multi structure to the console. 
*/ void in6m_print(const struct in6_multi *inm) { int t; char ip6tbuf[INET6_ADDRSTRLEN]; if ((ktr_mask & KTR_MLD) == 0) return; printf("%s: --- begin in6m %p ---\n", __func__, inm); printf("addr %s ifp %p(%s) ifma %p\n", ip6_sprintf(ip6tbuf, &inm->in6m_addr), inm->in6m_ifp, if_name(inm->in6m_ifp), inm->in6m_ifma); printf("timer %u state %s refcount %u scq.len %u\n", inm->in6m_timer, in6m_state_str(inm->in6m_state), inm->in6m_refcount, mbufq_len(&inm->in6m_scq)); printf("mli %p nsrc %lu sctimer %u scrv %u\n", inm->in6m_mli, inm->in6m_nsrc, inm->in6m_sctimer, inm->in6m_scrv); for (t = 0; t < 2; t++) { printf("t%d: fmode %s asm %u ex %u in %u rec %u\n", t, in6m_mode_str(inm->in6m_st[t].iss_fmode), inm->in6m_st[t].iss_asm, inm->in6m_st[t].iss_ex, inm->in6m_st[t].iss_in, inm->in6m_st[t].iss_rec); } printf("%s: --- end in6m %p ---\n", __func__, inm); } #else /* !KTR */ void in6m_print(const struct in6_multi *inm) { } #endif /* KTR */ Index: head/sys/netinet6/in6_var.h =================================================================== --- head/sys/netinet6/in6_var.h (revision 333308) +++ head/sys/netinet6/in6_var.h (revision 333309) @@ -1,880 +1,868 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: in6_var.h,v 1.56 2001/03/29 05:34:31 itojun Exp $ */ /*- * Copyright (c) 1985, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. 
Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)in_var.h 8.1 (Berkeley) 6/10/93 * $FreeBSD$ */ #ifndef _NETINET6_IN6_VAR_H_ #define _NETINET6_IN6_VAR_H_ #include #include #ifdef _KERNEL #include #include #endif /* * Interface address, Internet version. One of these structures * is allocated for each interface with an Internet address. * The ifaddr structure contains the protocol-independent part * of the structure and is assumed to be first. */ /* * pltime/vltime are just for future reference (required to implements 2 * hour rule for hosts). they should never be modified by nd6_timeout or * anywhere else. * userland -> kernel: accept pltime/vltime * kernel -> userland: throw up everything * in kernel: modify preferred/expire only */ struct in6_addrlifetime { time_t ia6t_expire; /* valid lifetime expiration time */ time_t ia6t_preferred; /* preferred lifetime expiration time */ u_int32_t ia6t_vltime; /* valid lifetime */ u_int32_t ia6t_pltime; /* prefix lifetime */ }; struct nd_ifinfo; struct scope6_id; struct lltable; struct mld_ifsoftc; struct in6_multi; struct in6_ifextra { counter_u64_t *in6_ifstat; counter_u64_t *icmp6_ifstat; struct nd_ifinfo *nd_ifinfo; struct scope6_id *scope6_id; struct lltable *lltable; struct mld_ifsoftc *mld_ifinfo; }; #define LLTABLE6(ifp) (((struct in6_ifextra *)(ifp)->if_afdata[AF_INET6])->lltable) #ifdef _KERNEL SLIST_HEAD(in6_multi_head, in6_multi); MALLOC_DECLARE(M_IP6MADDR); struct in6_ifaddr { struct ifaddr ia_ifa; /* protocol-independent info */ #define ia_ifp ia_ifa.ifa_ifp #define ia_flags ia_ifa.ifa_flags struct sockaddr_in6 ia_addr; /* interface address */ struct sockaddr_in6 ia_net; /* network number of interface */ struct sockaddr_in6 ia_dstaddr; /* space for destination addr */ struct sockaddr_in6 ia_prefixmask; /* prefix mask */ u_int32_t ia_plen; /* prefix length */ TAILQ_ENTRY(in6_ifaddr) ia_link; /* list of IPv6 addresses */ int ia6_flags; struct in6_addrlifetime ia6_lifetime; time_t ia6_createtime; /* the creation time of this address, which is * currently used for temporary addresses only. */ time_t ia6_updatetime; /* back pointer to the ND prefix (for autoconfigured addresses only) */ struct nd_prefix *ia6_ndpr; /* multicast addresses joined from the kernel */ LIST_HEAD(, in6_multi_mship) ia6_memberships; /* entry in bucket of inet6 addresses */ LIST_ENTRY(in6_ifaddr) ia6_hash; }; /* List of in6_ifaddr's. 
*/ TAILQ_HEAD(in6_ifaddrhead, in6_ifaddr); LIST_HEAD(in6_ifaddrlisthead, in6_ifaddr); #endif /* _KERNEL */ /* control structure to manage address selection policy */ struct in6_addrpolicy { struct sockaddr_in6 addr; /* prefix address */ struct sockaddr_in6 addrmask; /* prefix mask */ int preced; /* precedence */ int label; /* matching label */ u_quad_t use; /* statistics */ }; /* * IPv6 interface statistics, as defined in RFC2465 Ipv6IfStatsEntry (p12). */ struct in6_ifstat { uint64_t ifs6_in_receive; /* # of total input datagram */ uint64_t ifs6_in_hdrerr; /* # of datagrams with invalid hdr */ uint64_t ifs6_in_toobig; /* # of datagrams exceeded MTU */ uint64_t ifs6_in_noroute; /* # of datagrams with no route */ uint64_t ifs6_in_addrerr; /* # of datagrams with invalid dst */ uint64_t ifs6_in_protounknown; /* # of datagrams with unknown proto */ /* NOTE: increment on final dst if */ uint64_t ifs6_in_truncated; /* # of truncated datagrams */ uint64_t ifs6_in_discard; /* # of discarded datagrams */ /* NOTE: fragment timeout is not here */ uint64_t ifs6_in_deliver; /* # of datagrams delivered to ULP */ /* NOTE: increment on final dst if */ uint64_t ifs6_out_forward; /* # of datagrams forwarded */ /* NOTE: increment on outgoing if */ uint64_t ifs6_out_request; /* # of outgoing datagrams from ULP */ /* NOTE: does not include forwrads */ uint64_t ifs6_out_discard; /* # of discarded datagrams */ uint64_t ifs6_out_fragok; /* # of datagrams fragmented */ uint64_t ifs6_out_fragfail; /* # of datagrams failed on fragment */ uint64_t ifs6_out_fragcreat; /* # of fragment datagrams */ /* NOTE: this is # after fragment */ uint64_t ifs6_reass_reqd; /* # of incoming fragmented packets */ /* NOTE: increment on final dst if */ uint64_t ifs6_reass_ok; /* # of reassembled packets */ /* NOTE: this is # after reass */ /* NOTE: increment on final dst if */ uint64_t ifs6_reass_fail; /* # of reass failures */ /* NOTE: may not be packet count */ /* NOTE: increment on final dst if */ uint64_t ifs6_in_mcast; /* # of inbound multicast datagrams */ uint64_t ifs6_out_mcast; /* # of outbound multicast datagrams */ }; /* * ICMPv6 interface statistics, as defined in RFC2466 Ipv6IfIcmpEntry. * XXX: I'm not sure if this file is the right place for this structure... 
*/ struct icmp6_ifstat { /* * Input statistics */ /* ipv6IfIcmpInMsgs, total # of input messages */ uint64_t ifs6_in_msg; /* ipv6IfIcmpInErrors, # of input error messages */ uint64_t ifs6_in_error; /* ipv6IfIcmpInDestUnreachs, # of input dest unreach errors */ uint64_t ifs6_in_dstunreach; /* ipv6IfIcmpInAdminProhibs, # of input administratively prohibited errs */ uint64_t ifs6_in_adminprohib; /* ipv6IfIcmpInTimeExcds, # of input time exceeded errors */ uint64_t ifs6_in_timeexceed; /* ipv6IfIcmpInParmProblems, # of input parameter problem errors */ uint64_t ifs6_in_paramprob; /* ipv6IfIcmpInPktTooBigs, # of input packet too big errors */ uint64_t ifs6_in_pkttoobig; /* ipv6IfIcmpInEchos, # of input echo requests */ uint64_t ifs6_in_echo; /* ipv6IfIcmpInEchoReplies, # of input echo replies */ uint64_t ifs6_in_echoreply; /* ipv6IfIcmpInRouterSolicits, # of input router solicitations */ uint64_t ifs6_in_routersolicit; /* ipv6IfIcmpInRouterAdvertisements, # of input router advertisements */ uint64_t ifs6_in_routeradvert; /* ipv6IfIcmpInNeighborSolicits, # of input neighbor solicitations */ uint64_t ifs6_in_neighborsolicit; /* ipv6IfIcmpInNeighborAdvertisements, # of input neighbor advertisements */ uint64_t ifs6_in_neighboradvert; /* ipv6IfIcmpInRedirects, # of input redirects */ uint64_t ifs6_in_redirect; /* ipv6IfIcmpInGroupMembQueries, # of input MLD queries */ uint64_t ifs6_in_mldquery; /* ipv6IfIcmpInGroupMembResponses, # of input MLD reports */ uint64_t ifs6_in_mldreport; /* ipv6IfIcmpInGroupMembReductions, # of input MLD done */ uint64_t ifs6_in_mlddone; /* * Output statistics. We should solve unresolved routing problem... */ /* ipv6IfIcmpOutMsgs, total # of output messages */ uint64_t ifs6_out_msg; /* ipv6IfIcmpOutErrors, # of output error messages */ uint64_t ifs6_out_error; /* ipv6IfIcmpOutDestUnreachs, # of output dest unreach errors */ uint64_t ifs6_out_dstunreach; /* ipv6IfIcmpOutAdminProhibs, # of output administratively prohibited errs */ uint64_t ifs6_out_adminprohib; /* ipv6IfIcmpOutTimeExcds, # of output time exceeded errors */ uint64_t ifs6_out_timeexceed; /* ipv6IfIcmpOutParmProblems, # of output parameter problem errors */ uint64_t ifs6_out_paramprob; /* ipv6IfIcmpOutPktTooBigs, # of output packet too big errors */ uint64_t ifs6_out_pkttoobig; /* ipv6IfIcmpOutEchos, # of output echo requests */ uint64_t ifs6_out_echo; /* ipv6IfIcmpOutEchoReplies, # of output echo replies */ uint64_t ifs6_out_echoreply; /* ipv6IfIcmpOutRouterSolicits, # of output router solicitations */ uint64_t ifs6_out_routersolicit; /* ipv6IfIcmpOutRouterAdvertisements, # of output router advertisements */ uint64_t ifs6_out_routeradvert; /* ipv6IfIcmpOutNeighborSolicits, # of output neighbor solicitations */ uint64_t ifs6_out_neighborsolicit; /* ipv6IfIcmpOutNeighborAdvertisements, # of output neighbor advertisements */ uint64_t ifs6_out_neighboradvert; /* ipv6IfIcmpOutRedirects, # of output redirects */ uint64_t ifs6_out_redirect; /* ipv6IfIcmpOutGroupMembQueries, # of output MLD queries */ uint64_t ifs6_out_mldquery; /* ipv6IfIcmpOutGroupMembResponses, # of output MLD reports */ uint64_t ifs6_out_mldreport; /* ipv6IfIcmpOutGroupMembReductions, # of output MLD done */ uint64_t ifs6_out_mlddone; }; struct in6_ifreq { char ifr_name[IFNAMSIZ]; union { struct sockaddr_in6 ifru_addr; struct sockaddr_in6 ifru_dstaddr; int ifru_flags; int ifru_flags6; int ifru_metric; caddr_t ifru_data; struct in6_addrlifetime ifru_lifetime; struct in6_ifstat ifru_stat; struct icmp6_ifstat ifru_icmp6stat; u_int32_t 
ifru_scope_id[16]; } ifr_ifru; }; struct in6_aliasreq { char ifra_name[IFNAMSIZ]; struct sockaddr_in6 ifra_addr; struct sockaddr_in6 ifra_dstaddr; struct sockaddr_in6 ifra_prefixmask; int ifra_flags; struct in6_addrlifetime ifra_lifetime; int ifra_vhid; }; /* pre-10.x compat */ struct oin6_aliasreq { char ifra_name[IFNAMSIZ]; struct sockaddr_in6 ifra_addr; struct sockaddr_in6 ifra_dstaddr; struct sockaddr_in6 ifra_prefixmask; int ifra_flags; struct in6_addrlifetime ifra_lifetime; }; /* prefix type macro */ #define IN6_PREFIX_ND 1 #define IN6_PREFIX_RR 2 /* * prefix related flags passed between kernel(NDP related part) and * user land command(ifconfig) and daemon(rtadvd). */ struct in6_prflags { struct prf_ra { u_char onlink : 1; u_char autonomous : 1; u_char reserved : 6; } prf_ra; u_char prf_reserved1; u_short prf_reserved2; /* want to put this on 4byte offset */ struct prf_rr { u_char decrvalid : 1; u_char decrprefd : 1; u_char reserved : 6; } prf_rr; u_char prf_reserved3; u_short prf_reserved4; }; struct in6_prefixreq { char ipr_name[IFNAMSIZ]; u_char ipr_origin; u_char ipr_plen; u_int32_t ipr_vltime; u_int32_t ipr_pltime; struct in6_prflags ipr_flags; struct sockaddr_in6 ipr_prefix; }; #define PR_ORIG_RA 0 #define PR_ORIG_RR 1 #define PR_ORIG_STATIC 2 #define PR_ORIG_KERNEL 3 #define ipr_raf_onlink ipr_flags.prf_ra.onlink #define ipr_raf_auto ipr_flags.prf_ra.autonomous #define ipr_statef_onlink ipr_flags.prf_state.onlink #define ipr_rrf_decrvalid ipr_flags.prf_rr.decrvalid #define ipr_rrf_decrprefd ipr_flags.prf_rr.decrprefd struct in6_rrenumreq { char irr_name[IFNAMSIZ]; u_char irr_origin; u_char irr_m_len; /* match len for matchprefix */ u_char irr_m_minlen; /* minlen for matching prefix */ u_char irr_m_maxlen; /* maxlen for matching prefix */ u_char irr_u_uselen; /* uselen for adding prefix */ u_char irr_u_keeplen; /* keeplen from matching prefix */ struct irr_raflagmask { u_char onlink : 1; u_char autonomous : 1; u_char reserved : 6; } irr_raflagmask; u_int32_t irr_vltime; u_int32_t irr_pltime; struct in6_prflags irr_flags; struct sockaddr_in6 irr_matchprefix; struct sockaddr_in6 irr_useprefix; }; #define irr_raf_mask_onlink irr_raflagmask.onlink #define irr_raf_mask_auto irr_raflagmask.autonomous #define irr_raf_mask_reserved irr_raflagmask.reserved #define irr_raf_onlink irr_flags.prf_ra.onlink #define irr_raf_auto irr_flags.prf_ra.autonomous #define irr_statef_onlink irr_flags.prf_state.onlink #define irr_rrf irr_flags.prf_rr #define irr_rrf_decrvalid irr_flags.prf_rr.decrvalid #define irr_rrf_decrprefd irr_flags.prf_rr.decrprefd /* * Given a pointer to an in6_ifaddr (ifaddr), * return a pointer to the addr as a sockaddr_in6 */ #define IA6_IN6(ia) (&((ia)->ia_addr.sin6_addr)) #define IA6_DSTIN6(ia) (&((ia)->ia_dstaddr.sin6_addr)) #define IA6_MASKIN6(ia) (&((ia)->ia_prefixmask.sin6_addr)) #define IA6_SIN6(ia) (&((ia)->ia_addr)) #define IA6_DSTSIN6(ia) (&((ia)->ia_dstaddr)) #define IFA_IN6(x) (&((struct sockaddr_in6 *)((x)->ifa_addr))->sin6_addr) #define IFA_DSTIN6(x) (&((struct sockaddr_in6 *)((x)->ifa_dstaddr))->sin6_addr) #define IFPR_IN6(x) (&((struct sockaddr_in6 *)((x)->ifpr_prefix))->sin6_addr) #ifdef _KERNEL #define IN6_ARE_MASKED_ADDR_EQUAL(d, a, m) ( \ (((d)->s6_addr32[0] ^ (a)->s6_addr32[0]) & (m)->s6_addr32[0]) == 0 && \ (((d)->s6_addr32[1] ^ (a)->s6_addr32[1]) & (m)->s6_addr32[1]) == 0 && \ (((d)->s6_addr32[2] ^ (a)->s6_addr32[2]) & (m)->s6_addr32[2]) == 0 && \ (((d)->s6_addr32[3] ^ (a)->s6_addr32[3]) & (m)->s6_addr32[3]) == 0 ) #define IN6_MASK_ADDR(a, m) do { \ 
(a)->s6_addr32[0] &= (m)->s6_addr32[0]; \ (a)->s6_addr32[1] &= (m)->s6_addr32[1]; \ (a)->s6_addr32[2] &= (m)->s6_addr32[2]; \ (a)->s6_addr32[3] &= (m)->s6_addr32[3]; \ } while (0) #endif #define SIOCSIFADDR_IN6 _IOW('i', 12, struct in6_ifreq) #define SIOCGIFADDR_IN6 _IOWR('i', 33, struct in6_ifreq) #ifdef _KERNEL /* * SIOCSxxx ioctls should be unused (see comments in in6.c), but * we do not shift numbers for binary compatibility. */ #define SIOCSIFDSTADDR_IN6 _IOW('i', 14, struct in6_ifreq) #define SIOCSIFNETMASK_IN6 _IOW('i', 22, struct in6_ifreq) #endif #define SIOCGIFDSTADDR_IN6 _IOWR('i', 34, struct in6_ifreq) #define SIOCGIFNETMASK_IN6 _IOWR('i', 37, struct in6_ifreq) #define SIOCDIFADDR_IN6 _IOW('i', 25, struct in6_ifreq) #define OSIOCAIFADDR_IN6 _IOW('i', 26, struct oin6_aliasreq) #define SIOCAIFADDR_IN6 _IOW('i', 27, struct in6_aliasreq) #define SIOCSIFPHYADDR_IN6 _IOW('i', 70, struct in6_aliasreq) #define SIOCGIFPSRCADDR_IN6 _IOWR('i', 71, struct in6_ifreq) #define SIOCGIFPDSTADDR_IN6 _IOWR('i', 72, struct in6_ifreq) #define SIOCGIFAFLAG_IN6 _IOWR('i', 73, struct in6_ifreq) #ifdef _KERNEL #define OSIOCGIFINFO_IN6 _IOWR('i', 76, struct in6_ondireq) #endif #define SIOCGIFINFO_IN6 _IOWR('i', 108, struct in6_ndireq) #define SIOCSIFINFO_IN6 _IOWR('i', 109, struct in6_ndireq) #define SIOCSNDFLUSH_IN6 _IOWR('i', 77, struct in6_ifreq) #define SIOCGNBRINFO_IN6 _IOWR('i', 78, struct in6_nbrinfo) #define SIOCSPFXFLUSH_IN6 _IOWR('i', 79, struct in6_ifreq) #define SIOCSRTRFLUSH_IN6 _IOWR('i', 80, struct in6_ifreq) #define SIOCGIFALIFETIME_IN6 _IOWR('i', 81, struct in6_ifreq) #define SIOCGIFSTAT_IN6 _IOWR('i', 83, struct in6_ifreq) #define SIOCGIFSTAT_ICMP6 _IOWR('i', 84, struct in6_ifreq) #define SIOCSDEFIFACE_IN6 _IOWR('i', 85, struct in6_ndifreq) #define SIOCGDEFIFACE_IN6 _IOWR('i', 86, struct in6_ndifreq) #define SIOCSIFINFO_FLAGS _IOWR('i', 87, struct in6_ndireq) /* XXX */ #define SIOCSSCOPE6 _IOW('i', 88, struct in6_ifreq) #define SIOCGSCOPE6 _IOWR('i', 89, struct in6_ifreq) #define SIOCGSCOPE6DEF _IOWR('i', 90, struct in6_ifreq) #define SIOCSIFPREFIX_IN6 _IOW('i', 100, struct in6_prefixreq) /* set */ #define SIOCGIFPREFIX_IN6 _IOWR('i', 101, struct in6_prefixreq) /* get */ #define SIOCDIFPREFIX_IN6 _IOW('i', 102, struct in6_prefixreq) /* del */ #define SIOCAIFPREFIX_IN6 _IOW('i', 103, struct in6_rrenumreq) /* add */ #define SIOCCIFPREFIX_IN6 _IOW('i', 104, \ struct in6_rrenumreq) /* change */ #define SIOCSGIFPREFIX_IN6 _IOW('i', 105, \ struct in6_rrenumreq) /* set global */ #define SIOCGETSGCNT_IN6 _IOWR('u', 106, \ struct sioc_sg_req6) /* get s,g pkt cnt */ #define SIOCGETMIFCNT_IN6 _IOWR('u', 107, \ struct sioc_mif_req6) /* get pkt cnt per if */ #define SIOCAADDRCTL_POLICY _IOW('u', 108, struct in6_addrpolicy) #define SIOCDADDRCTL_POLICY _IOW('u', 109, struct in6_addrpolicy) #define IN6_IFF_ANYCAST 0x01 /* anycast address */ #define IN6_IFF_TENTATIVE 0x02 /* tentative address */ #define IN6_IFF_DUPLICATED 0x04 /* DAD detected duplicate */ #define IN6_IFF_DETACHED 0x08 /* may be detached from the link */ #define IN6_IFF_DEPRECATED 0x10 /* deprecated address */ #define IN6_IFF_NODAD 0x20 /* don't perform DAD on this address * (obsolete) */ #define IN6_IFF_AUTOCONF 0x40 /* autoconfigurable address. */ #define IN6_IFF_TEMPORARY 0x80 /* temporary (anonymous) address. 
*/ #define IN6_IFF_PREFER_SOURCE 0x0100 /* preferred address for SAS */ /* do not input/output */ #define IN6_IFF_NOTREADY (IN6_IFF_TENTATIVE|IN6_IFF_DUPLICATED) #ifdef _KERNEL #define IN6_ARE_SCOPE_CMP(a,b) ((a)-(b)) #define IN6_ARE_SCOPE_EQUAL(a,b) ((a)==(b)) #endif #ifdef _KERNEL VNET_DECLARE(struct in6_ifaddrhead, in6_ifaddrhead); VNET_DECLARE(struct in6_ifaddrlisthead *, in6_ifaddrhashtbl); VNET_DECLARE(u_long, in6_ifaddrhmask); #define V_in6_ifaddrhead VNET(in6_ifaddrhead) #define V_in6_ifaddrhashtbl VNET(in6_ifaddrhashtbl) #define V_in6_ifaddrhmask VNET(in6_ifaddrhmask) #define IN6ADDR_NHASH_LOG2 8 #define IN6ADDR_NHASH (1 << IN6ADDR_NHASH_LOG2) #define IN6ADDR_HASHVAL(x) (in6_addrhash(x)) #define IN6ADDR_HASH(x) \ (&V_in6_ifaddrhashtbl[IN6ADDR_HASHVAL(x) & V_in6_ifaddrhmask]) static __inline uint32_t in6_addrhash(const struct in6_addr *in6) { uint32_t x; x = in6->s6_addr32[0] ^ in6->s6_addr32[1] ^ in6->s6_addr32[2] ^ in6->s6_addr32[3]; return (fnv_32_buf(&x, sizeof(x), FNV1_32_INIT)); } extern struct rmlock in6_ifaddr_lock; #define IN6_IFADDR_LOCK_ASSERT() rm_assert(&in6_ifaddr_lock, RA_LOCKED) #define IN6_IFADDR_RLOCK(t) rm_rlock(&in6_ifaddr_lock, (t)) #define IN6_IFADDR_RLOCK_ASSERT() rm_assert(&in6_ifaddr_lock, RA_RLOCKED) #define IN6_IFADDR_RUNLOCK(t) rm_runlock(&in6_ifaddr_lock, (t)) #define IN6_IFADDR_WLOCK() rm_wlock(&in6_ifaddr_lock) #define IN6_IFADDR_WLOCK_ASSERT() rm_assert(&in6_ifaddr_lock, RA_WLOCKED) #define IN6_IFADDR_WUNLOCK() rm_wunlock(&in6_ifaddr_lock) #define in6_ifstat_inc(ifp, tag) \ do { \ if (ifp) \ counter_u64_add(((struct in6_ifextra *) \ ((ifp)->if_afdata[AF_INET6]))->in6_ifstat[ \ offsetof(struct in6_ifstat, tag) / sizeof(uint64_t)], 1);\ } while (/*CONSTCOND*/ 0) extern u_char inet6ctlerrmap[]; VNET_DECLARE(unsigned long, in6_maxmtu); #define V_in6_maxmtu VNET(in6_maxmtu) #endif /* _KERNEL */ /* * IPv6 multicast MLD-layer source entry. */ struct ip6_msource { RB_ENTRY(ip6_msource) im6s_link; /* RB tree links */ struct in6_addr im6s_addr; struct im6s_st { uint16_t ex; /* # of exclusive members */ uint16_t in; /* # of inclusive members */ } im6s_st[2]; /* state at t0, t1 */ uint8_t im6s_stp; /* pending query */ }; RB_HEAD(ip6_msource_tree, ip6_msource); /* * IPv6 multicast PCB-layer source entry. * * NOTE: overlapping use of struct ip6_msource fields at start. */ struct in6_msource { RB_ENTRY(ip6_msource) im6s_link; /* Common field */ struct in6_addr im6s_addr; /* Common field */ uint8_t im6sl_st[2]; /* state before/at commit */ }; #ifdef _KERNEL /* * IPv6 source tree comparison function. * * An ordered predicate is necessary; bcmp() is not documented to return * an indication of order, memcmp() is, and is an ISO C99 requirement. */ static __inline int ip6_msource_cmp(const struct ip6_msource *a, const struct ip6_msource *b) { return (memcmp(&a->im6s_addr, &b->im6s_addr, sizeof(struct in6_addr))); } RB_PROTOTYPE(ip6_msource_tree, ip6_msource, im6s_link, ip6_msource_cmp); /* * IPv6 multicast PCB-layer group filter descriptor. */ struct in6_mfilter { struct ip6_msource_tree im6f_sources; /* source list for (S,G) */ u_long im6f_nsrc; /* # of source entries */ uint8_t im6f_st[2]; /* state before/at commit */ }; /* * Legacy KAME IPv6 multicast membership descriptor. */ struct in6_multi_mship { struct in6_multi *i6mm_maddr; LIST_ENTRY(in6_multi_mship) i6mm_chain; }; /* * IPv6 group descriptor. * * For every entry on an ifnet's if_multiaddrs list which represents * an IP multicast group, there is one of these structures. 
* * If any source filters are present, then a node will exist in the RB-tree * to permit fast lookup by source whenever an operation takes place. * This permits pre-order traversal when we issue reports. * Source filter trees are kept separately from the socket layer to * greatly simplify locking. * * When MLDv2 is active, in6m_timer is the response to group query timer. * The state-change timer in6m_sctimer is separate; whenever state changes * for the group the state change record is generated and transmitted, * and kept if retransmissions are necessary. * * FUTURE: in6m_link is now only used when groups are being purged * on a detaching ifnet. It could be demoted to a SLIST_ENTRY, but * because it is at the very start of the struct, we can't do this * w/o breaking the ABI for ifmcstat. */ struct in6_multi { struct in6_addr in6m_addr; /* IPv6 multicast address */ struct ifnet *in6m_ifp; /* back pointer to ifnet */ struct ifmultiaddr *in6m_ifma; /* back pointer to ifmultiaddr */ u_int in6m_refcount; /* reference count */ u_int in6m_state; /* state of the membership */ u_int in6m_timer; /* MLD6 listener report timer */ /* New fields for MLDv2 follow. */ struct mld_ifsoftc *in6m_mli; /* MLD info */ SLIST_ENTRY(in6_multi) in6m_nrele; /* to-be-released by MLD */ struct ip6_msource_tree in6m_srcs; /* tree of sources */ u_long in6m_nsrc; /* # of tree entries */ struct mbufq in6m_scq; /* queue of pending * state-change packets */ struct timeval in6m_lastgsrtv; /* last G-S-R query */ uint16_t in6m_sctimer; /* state-change timer */ uint16_t in6m_scrv; /* state-change rexmit count */ /* * SSM state counters which track state at T0 (the time the last * state-change report's RV timer went to zero) and T1 * (time of pending report, i.e. now). * Used for computing MLDv2 state-change reports. Several refcounts * are maintained here to optimize for common use-cases. */ struct in6m_st { uint16_t iss_fmode; /* MLD filter mode */ uint16_t iss_asm; /* # of ASM listeners */ uint16_t iss_ex; /* # of exclusive members */ uint16_t iss_in; /* # of inclusive members */ uint16_t iss_rec; /* # of recorded sources */ } in6m_st[2]; /* state at t0, t1 */ }; +void in6m_disconnect(struct in6_multi *inm); +extern int ifma6_restart; /* * Helper function to derive the filter mode on a source entry * from its internal counters. Predicates are: * A source is only excluded if all listeners exclude it. * A source is only included if no listeners exclude it, * and at least one listener includes it. * May be used by ifmcstat(8). */ static __inline uint8_t im6s_get_mode(const struct in6_multi *inm, const struct ip6_msource *ims, uint8_t t) { t = !!t; if (inm->in6m_st[t].iss_ex > 0 && inm->in6m_st[t].iss_ex == ims->im6s_st[t].ex) return (MCAST_EXCLUDE); else if (ims->im6s_st[t].in > 0 && ims->im6s_st[t].ex == 0) return (MCAST_INCLUDE); return (MCAST_UNDEFINED); } /* * Lock macros for IPv6 layer multicast address lists. IPv6 lock goes * before link layer multicast locks in the lock order. In most cases, * consumers of IN_*_MULTI() macros should acquire the locks before * calling them; users of the in_{add,del}multi() functions should not. 
*/ extern struct mtx in6_multi_list_mtx; extern struct sx in6_multi_sx; #define IN6_MULTI_LIST_LOCK() mtx_lock(&in6_multi_list_mtx) #define IN6_MULTI_LIST_UNLOCK() mtx_unlock(&in6_multi_list_mtx) #define IN6_MULTI_LIST_LOCK_ASSERT() mtx_assert(&in6_multi_list_mtx, MA_OWNED) #define IN6_MULTI_LIST_UNLOCK_ASSERT() mtx_assert(&in6_multi_list_mtx, MA_NOTOWNED) #define IN6_MULTI_LOCK() sx_xlock(&in6_multi_sx) #define IN6_MULTI_UNLOCK() sx_xunlock(&in6_multi_sx) #define IN6_MULTI_LOCK_ASSERT() sx_assert(&in6_multi_sx, SA_XLOCKED) #define IN6_MULTI_UNLOCK_ASSERT() sx_assert(&in6_multi_sx, SA_XUNLOCKED) + /* * Look up an in6_multi record for an IPv6 multicast address * on the interface ifp. * If no record found, return NULL. * * SMPng: The IN6_MULTI_LOCK and IF_ADDR_LOCK on ifp must be held. */ static __inline struct in6_multi * in6m_lookup_locked(struct ifnet *ifp, const struct in6_addr *mcaddr) { struct ifmultiaddr *ifma; struct in6_multi *inm; IF_ADDR_LOCK_ASSERT(ifp); inm = NULL; TAILQ_FOREACH(ifma, &((ifp)->if_multiaddrs), ifma_link) { if (ifma->ifma_addr->sa_family == AF_INET6) { inm = (struct in6_multi *)ifma->ifma_protospec; if (inm == NULL) continue; if (IN6_ARE_ADDR_EQUAL(&inm->in6m_addr, mcaddr)) break; inm = NULL; } } return (inm); } /* * Wrapper for in6m_lookup_locked(). * * SMPng: Assumes that neither the IN6_MULTI_LOCK() nor the IF_ADDR_LOCK() is held. */ static __inline struct in6_multi * in6m_lookup(struct ifnet *ifp, const struct in6_addr *mcaddr) { struct in6_multi *inm; IN6_MULTI_LIST_LOCK(); IF_ADDR_RLOCK(ifp); inm = in6m_lookup_locked(ifp, mcaddr); IF_ADDR_RUNLOCK(ifp); IN6_MULTI_LIST_UNLOCK(); return (inm); } /* Acquire an in6_multi record. */ static __inline void in6m_acquire_locked(struct in6_multi *inm) { IN6_MULTI_LIST_LOCK_ASSERT(); ++inm->in6m_refcount; } static __inline void in6m_acquire(struct in6_multi *inm) { IN6_MULTI_LIST_LOCK(); in6m_acquire_locked(inm); IN6_MULTI_LIST_UNLOCK(); } static __inline void in6m_rele_locked(struct in6_multi_head *inmh, struct in6_multi *inm) { - struct ifnet *ifp; - struct ifaddr *ifa; - struct in6_ifaddr *ifa6; - struct in6_multi_mship *imm; - KASSERT(inm->in6m_refcount > 0, ("refcount == %d inm: %p", inm->in6m_refcount, inm)); IN6_MULTI_LIST_LOCK_ASSERT(); if (--inm->in6m_refcount == 0) { - ifp = inm->in6m_ifp; - IF_ADDR_LOCK_ASSERT(ifp); - TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { - if (ifa->ifa_addr->sa_family != AF_INET6) - continue; - - ifa6 = (void *)ifa; - LIST_FOREACH(imm, &ifa6->ia6_memberships, i6mm_chain) { - if (inm == imm->i6mm_maddr) - imm->i6mm_maddr = NULL; - } - } + in6m_disconnect(inm); inm->in6m_ifma->ifma_protospec = NULL; + MPASS(inm->in6m_ifma->ifma_llifma == NULL); SLIST_INSERT_HEAD(inmh, inm, in6m_nrele); } } struct ip6_moptions; struct sockopt; struct inpcbinfo; /* Multicast KPIs.
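To illustrate the KPI contract above: a hypothetical consumer takes IN6_MULTI_LIST_LOCK and the interface address lock around the lookup, pins the record with in6m_acquire_locked() before dropping the locks, and later hands the reference back through in6m_release_deferred(). This is a sketch only; example_hold_group() is an invented name, not part of the tree:

/*
 * Hypothetical consumer of the lookup/refcount KPIs above (sketch only).
 * Lock order matches in6m_lookup(): IN6_MULTI_LIST_LOCK, then IF_ADDR lock.
 */
static struct in6_multi *
example_hold_group(struct ifnet *ifp, const struct in6_addr *group)
{
	struct in6_multi *inm;

	IN6_MULTI_LIST_LOCK();
	IF_ADDR_RLOCK(ifp);
	inm = in6m_lookup_locked(ifp, group);
	if (inm != NULL)
		in6m_acquire_locked(inm);	/* pin before unlocking */
	IF_ADDR_RUNLOCK(ifp);
	IN6_MULTI_LIST_UNLOCK();

	/* Caller eventually drops the hold via in6m_release_deferred(inm). */
	return (inm);
}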
*/ int im6o_mc_filter(const struct ip6_moptions *, const struct ifnet *, const struct sockaddr *, const struct sockaddr *); int in6_joingroup(struct ifnet *, const struct in6_addr *, struct in6_mfilter *, struct in6_multi **, int); int in6_joingroup_locked(struct ifnet *, const struct in6_addr *, struct in6_mfilter *, struct in6_multi **, int); int in6_leavegroup(struct in6_multi *, struct in6_mfilter *); int in6_leavegroup_locked(struct in6_multi *, struct in6_mfilter *); void in6m_clear_recorded(struct in6_multi *); void in6m_commit(struct in6_multi *); void in6m_print(const struct in6_multi *); int in6m_record_source(struct in6_multi *, const struct in6_addr *); void in6m_release_deferred(struct in6_multi *); void in6m_release_list_deferred(struct in6_multi_head *); void ip6_freemoptions(struct ip6_moptions *, struct inpcbinfo *); int ip6_getmoptions(struct inpcb *, struct sockopt *); int ip6_setmoptions(struct inpcb *, struct sockopt *); /* flags to in6_update_ifa */ #define IN6_IFAUPDATE_DADDELAY 0x1 /* first time to configure an address */ int in6_mask2len(struct in6_addr *, u_char *); int in6_control(struct socket *, u_long, caddr_t, struct ifnet *, struct thread *); int in6_update_ifa(struct ifnet *, struct in6_aliasreq *, struct in6_ifaddr *, int); void in6_prepare_ifra(struct in6_aliasreq *, const struct in6_addr *, const struct in6_addr *); void in6_purgeaddr(struct ifaddr *); int in6if_do_dad(struct ifnet *); void in6_savemkludge(struct in6_ifaddr *); void *in6_domifattach(struct ifnet *); void in6_domifdetach(struct ifnet *, void *); int in6_domifmtu(struct ifnet *); void in6_setmaxmtu(void); int in6_if2idlen(struct ifnet *); struct in6_ifaddr *in6ifa_ifpforlinklocal(struct ifnet *, int); struct in6_ifaddr *in6ifa_ifpwithaddr(struct ifnet *, const struct in6_addr *); struct in6_ifaddr *in6ifa_ifwithaddr(const struct in6_addr *, uint32_t); struct in6_ifaddr *in6ifa_llaonifp(struct ifnet *); int in6_addr2zoneid(struct ifnet *, struct in6_addr *, u_int32_t *); int in6_matchlen(struct in6_addr *, struct in6_addr *); int in6_are_prefix_equal(struct in6_addr *, struct in6_addr *, int); void in6_prefixlen2mask(struct in6_addr *, int); int in6_prefix_ioctl(struct socket *, u_long, caddr_t, struct ifnet *); int in6_prefix_add_ifid(int, struct in6_ifaddr *); void in6_prefix_remove_ifid(int, struct in6_ifaddr *); void in6_purgeprefix(struct ifnet *); int in6_is_addr_deprecated(struct sockaddr_in6 *); int in6_src_ioctl(u_long, caddr_t); void in6_newaddrmsg(struct in6_ifaddr *, int); /* * Extended API for IPv6 FIB support. */ struct mbuf *ip6_tryforward(struct mbuf *); void in6_rtredirect(struct sockaddr *, struct sockaddr *, struct sockaddr *, int, struct sockaddr *, u_int); int in6_rtrequest(int, struct sockaddr *, struct sockaddr *, struct sockaddr *, int, struct rtentry **, u_int); void in6_rtalloc(struct route_in6 *, u_int); void in6_rtalloc_ign(struct route_in6 *, u_long, u_int); struct rtentry *in6_rtalloc1(struct sockaddr *, int, u_long, u_int); #endif /* _KERNEL */ #endif /* _NETINET6_IN6_VAR_H_ */ Index: head/sys/netinet6/mld6.c =================================================================== --- head/sys/netinet6/mld6.c (revision 333308) +++ head/sys/netinet6/mld6.c (revision 333309) @@ -1,3305 +1,3320 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2009 Bruce Simpson. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: mld6.c,v 1.27 2001/04/04 05:17:30 itojun Exp $ */ /*- * Copyright (c) 1988 Stephen Deering. * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Stephen Deering of Stanford University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)igmp.c 8.1 (Berkeley) 7/19/93 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef KTR_MLD #define KTR_MLD KTR_INET6 #endif static struct mld_ifsoftc * mli_alloc_locked(struct ifnet *); static void mli_delete_locked(const struct ifnet *); static void mld_dispatch_packet(struct mbuf *); static void mld_dispatch_queue(struct mbufq *, int); static void mld_final_leave(struct in6_multi *, struct mld_ifsoftc *); static void mld_fasttimo_vnet(void); static int mld_handle_state_change(struct in6_multi *, struct mld_ifsoftc *); static int mld_initial_join(struct in6_multi *, struct mld_ifsoftc *, const int); #ifdef KTR static char * mld_rec_type_to_str(const int); #endif static void mld_set_version(struct mld_ifsoftc *, const int); static void mld_slowtimo_vnet(void); static int mld_v1_input_query(struct ifnet *, const struct ip6_hdr *, /*const*/ struct mld_hdr *); static int mld_v1_input_report(struct ifnet *, const struct ip6_hdr *, /*const*/ struct mld_hdr *); static void mld_v1_process_group_timer(struct in6_multi_head *, struct in6_multi *); static void mld_v1_process_querier_timers(struct mld_ifsoftc *); static int mld_v1_transmit_report(struct in6_multi *, const int); static void mld_v1_update_group(struct in6_multi *, const int); static void mld_v2_cancel_link_timers(struct mld_ifsoftc *); static void mld_v2_dispatch_general_query(struct mld_ifsoftc *); static struct mbuf * mld_v2_encap_report(struct ifnet *, struct mbuf *); static int mld_v2_enqueue_filter_change(struct mbufq *, struct in6_multi *); static int mld_v2_enqueue_group_record(struct mbufq *, struct in6_multi *, const int, const int, const int, const int); static int mld_v2_input_query(struct ifnet *, const struct ip6_hdr *, struct mbuf *, const int, const int); static int mld_v2_merge_state_changes(struct in6_multi *, struct mbufq *); static void mld_v2_process_group_timers(struct in6_multi_head *, struct mbufq *, struct mbufq *, struct in6_multi *, const int); static int mld_v2_process_group_query(struct in6_multi *, struct mld_ifsoftc *mli, int, struct mbuf *, const int); static int sysctl_mld_gsr(SYSCTL_HANDLER_ARGS); static int sysctl_mld_ifinfo(SYSCTL_HANDLER_ARGS); /* * Normative references: RFC 2710, RFC 3590, RFC 3810. * * Locking: * * The MLD subsystem lock ends up being system-wide for the moment, * but could be per-VIMAGE later on. * * The permitted lock order is: IN6_MULTI_LOCK, MLD_LOCK, IF_ADDR_LOCK. * Any may be taken independently; if any are held at the same * time, the above lock order must be followed. * * IN6_MULTI_LOCK covers in_multi. * * MLD_LOCK covers per-link state and any global variables in this file. * * IF_ADDR_LOCK covers if_multiaddrs, which is used for a variety of * per-link state iterators. * * XXX LOR PREVENTION * A special case for IPv6 is the in6_setscope() routine. ip6_output() * will not accept an ifp; it wants an embedded scope ID, unlike * ip_output(), which happily takes the ifp given to it. The embedded * scope ID is only used by MLD to select the outgoing interface. * * During interface attach and detach, MLD will take MLD_LOCK *after* * the IF_AFDATA_LOCK. * As in6_setscope() takes IF_AFDATA_LOCK then SCOPE_LOCK, we can't call * it with MLD_LOCK held without triggering an LOR. 
A netisr with indirect * dispatch could work around this, but we'd rather not do that, as it * can introduce other races. * * As such, we exploit the fact that the scope ID is just the interface * index, and embed it in the IPv6 destination address accordingly. * This is potentially NOT VALID for MLDv1 reports, as they * are always sent to the multicast group itself; as MLDv2 * reports are always sent to ff02::16, this is not an issue * when MLDv2 is in use. * * This does not however eliminate the LOR when ip6_output() itself * calls in6_setscope() internally whilst MLD_LOCK is held. This will * trigger a LOR warning in WITNESS when the ifnet is detached. * * The right answer is probably to make IF_AFDATA_LOCK an rwlock, given * how it's used across the network stack. Here we're simply exploiting * the fact that MLD runs at a similar layer in the stack to scope6.c. * * VIMAGE: * * Each in6_multi corresponds to an ifp, and each ifp corresponds * to a vnet in ifp->if_vnet. */ static struct mtx mld_mtx; static MALLOC_DEFINE(M_MLD, "mld", "mld state"); #define MLD_EMBEDSCOPE(pin6, zoneid) \ if (IN6_IS_SCOPE_LINKLOCAL(pin6) || \ IN6_IS_ADDR_MC_INTFACELOCAL(pin6)) \ (pin6)->s6_addr16[1] = htons((zoneid) & 0xFFFF) \ /* * VIMAGE-wide globals. */ static VNET_DEFINE(struct timeval, mld_gsrdelay) = {10, 0}; static VNET_DEFINE(LIST_HEAD(, mld_ifsoftc), mli_head); static VNET_DEFINE(int, interface_timers_running6); static VNET_DEFINE(int, state_change_timers_running6); static VNET_DEFINE(int, current_state_timers_running6); #define V_mld_gsrdelay VNET(mld_gsrdelay) #define V_mli_head VNET(mli_head) #define V_interface_timers_running6 VNET(interface_timers_running6) #define V_state_change_timers_running6 VNET(state_change_timers_running6) #define V_current_state_timers_running6 VNET(current_state_timers_running6) SYSCTL_DECL(_net_inet6); /* Note: Not in any common header. */ SYSCTL_NODE(_net_inet6, OID_AUTO, mld, CTLFLAG_RW, 0, "IPv6 Multicast Listener Discovery"); /* * Virtualized sysctls. */ SYSCTL_PROC(_net_inet6_mld, OID_AUTO, gsrdelay, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, &VNET_NAME(mld_gsrdelay.tv_sec), 0, sysctl_mld_gsr, "I", "Rate limit for MLDv2 Group-and-Source queries in seconds"); /* * Non-virtualized sysctls. */ static SYSCTL_NODE(_net_inet6_mld, OID_AUTO, ifinfo, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_mld_ifinfo, "Per-interface MLDv2 state"); static int mld_v1enable = 1; SYSCTL_INT(_net_inet6_mld, OID_AUTO, v1enable, CTLFLAG_RWTUN, &mld_v1enable, 0, "Enable fallback to MLDv1"); static int mld_use_allow = 1; SYSCTL_INT(_net_inet6_mld, OID_AUTO, use_allow, CTLFLAG_RWTUN, &mld_use_allow, 0, "Use ALLOW/BLOCK for RFC 4604 SSM joins/leaves"); /* * Packed Router Alert option structure declaration. */ struct mld_raopt { struct ip6_hbh hbh; struct ip6_opt pad; struct ip6_opt_router ra; } __packed; /* * Router Alert hop-by-hop option header. 
*/ static struct mld_raopt mld_ra = { .hbh = { 0, 0 }, .pad = { .ip6o_type = IP6OPT_PADN, 0 }, .ra = { .ip6or_type = IP6OPT_ROUTER_ALERT, .ip6or_len = IP6OPT_RTALERT_LEN - 2, .ip6or_value[0] = ((IP6OPT_RTALERT_MLD >> 8) & 0xFF), .ip6or_value[1] = (IP6OPT_RTALERT_MLD & 0xFF) } }; static struct ip6_pktopts mld_po; static __inline void mld_save_context(struct mbuf *m, struct ifnet *ifp) { #ifdef VIMAGE m->m_pkthdr.PH_loc.ptr = ifp->if_vnet; #endif /* VIMAGE */ m->m_pkthdr.flowid = ifp->if_index; } static __inline void mld_scrub_context(struct mbuf *m) { m->m_pkthdr.PH_loc.ptr = NULL; m->m_pkthdr.flowid = 0; } /* * Restore context from a queued output chain. * Return saved ifindex. * * VIMAGE: The assertion is there to make sure that we * actually called CURVNET_SET() with what's in the mbuf chain. */ static __inline uint32_t mld_restore_context(struct mbuf *m) { #if defined(VIMAGE) && defined(INVARIANTS) KASSERT(curvnet == m->m_pkthdr.PH_loc.ptr, ("%s: called when curvnet was not restored: cuvnet %p m ptr %p", __func__, curvnet, m->m_pkthdr.PH_loc.ptr)); #endif return (m->m_pkthdr.flowid); } /* * Retrieve or set threshold between group-source queries in seconds. * * VIMAGE: Assume curvnet set by caller. * SMPng: NOTE: Serialized by MLD lock. */ static int sysctl_mld_gsr(SYSCTL_HANDLER_ARGS) { int error; int i; error = sysctl_wire_old_buffer(req, sizeof(int)); if (error) return (error); MLD_LOCK(); i = V_mld_gsrdelay.tv_sec; error = sysctl_handle_int(oidp, &i, 0, req); if (error || !req->newptr) goto out_locked; if (i < -1 || i >= 60) { error = EINVAL; goto out_locked; } CTR2(KTR_MLD, "change mld_gsrdelay from %d to %d", V_mld_gsrdelay.tv_sec, i); V_mld_gsrdelay.tv_sec = i; out_locked: MLD_UNLOCK(); return (error); } /* * Expose struct mld_ifsoftc to userland, keyed by ifindex. * For use by ifmcstat(8). * * SMPng: NOTE: Does an unlocked ifindex space read. * VIMAGE: Assume curvnet set by caller. The node handler itself * is not directly virtualized. */ static int sysctl_mld_ifinfo(SYSCTL_HANDLER_ARGS) { int *name; int error; u_int namelen; struct ifnet *ifp; struct mld_ifsoftc *mli; name = (int *)arg1; namelen = arg2; if (req->newptr != NULL) return (EPERM); if (namelen != 1) return (EINVAL); error = sysctl_wire_old_buffer(req, sizeof(struct mld_ifinfo)); if (error) return (error); IN6_MULTI_LOCK(); IN6_MULTI_LIST_LOCK(); MLD_LOCK(); if (name[0] <= 0 || name[0] > V_if_index) { error = ENOENT; goto out_locked; } error = ENOENT; ifp = ifnet_byindex(name[0]); if (ifp == NULL) goto out_locked; LIST_FOREACH(mli, &V_mli_head, mli_link) { if (ifp == mli->mli_ifp) { struct mld_ifinfo info; info.mli_version = mli->mli_version; info.mli_v1_timer = mli->mli_v1_timer; info.mli_v2_timer = mli->mli_v2_timer; info.mli_flags = mli->mli_flags; info.mli_rv = mli->mli_rv; info.mli_qi = mli->mli_qi; info.mli_qri = mli->mli_qri; info.mli_uri = mli->mli_uri; error = SYSCTL_OUT(req, &info, sizeof(info)); break; } } out_locked: MLD_UNLOCK(); IN6_MULTI_LIST_UNLOCK(); IN6_MULTI_UNLOCK(); return (error); } /* * Dispatch an entire queue of pending packet chains. * VIMAGE: Assumes the vnet pointer has been set. */ static void mld_dispatch_queue(struct mbufq *mq, int limit) { struct mbuf *m; while ((m = mbufq_dequeue(mq)) != NULL) { CTR3(KTR_MLD, "%s: dispatch %p from %p", __func__, mq, m); mld_dispatch_packet(m); if (--limit == 0) break; } } /* * Filter outgoing MLD report state by group. * * Reports are ALWAYS suppressed for ALL-HOSTS (ff02::1) * and node-local addresses. 
However, kernel and socket consumers * always embed the KAME scope ID in the address provided, so strip it * when performing comparison. * Note: This is not the same as the *multicast* scope. * * Return zero if the given group is one for which MLD reports * should be suppressed, or non-zero if reports should be issued. */ static __inline int mld_is_addr_reported(const struct in6_addr *addr) { KASSERT(IN6_IS_ADDR_MULTICAST(addr), ("%s: not multicast", __func__)); if (IPV6_ADDR_MC_SCOPE(addr) == IPV6_ADDR_SCOPE_NODELOCAL) return (0); if (IPV6_ADDR_MC_SCOPE(addr) == IPV6_ADDR_SCOPE_LINKLOCAL) { struct in6_addr tmp = *addr; in6_clearscope(&tmp); if (IN6_ARE_ADDR_EQUAL(&tmp, &in6addr_linklocal_allnodes)) return (0); } return (1); } /* * Attach MLD when PF_INET6 is attached to an interface. * * SMPng: Normally called with IF_AFDATA_LOCK held. */ struct mld_ifsoftc * mld_domifattach(struct ifnet *ifp) { struct mld_ifsoftc *mli; CTR3(KTR_MLD, "%s: called for ifp %p(%s)", __func__, ifp, if_name(ifp)); MLD_LOCK(); mli = mli_alloc_locked(ifp); if (!(ifp->if_flags & IFF_MULTICAST)) mli->mli_flags |= MLIF_SILENT; if (mld_use_allow) mli->mli_flags |= MLIF_USEALLOW; MLD_UNLOCK(); return (mli); } /* * VIMAGE: assume curvnet set by caller. */ static struct mld_ifsoftc * mli_alloc_locked(/*const*/ struct ifnet *ifp) { struct mld_ifsoftc *mli; MLD_LOCK_ASSERT(); mli = malloc(sizeof(struct mld_ifsoftc), M_MLD, M_NOWAIT|M_ZERO); if (mli == NULL) goto out; mli->mli_ifp = ifp; mli->mli_version = MLD_VERSION_2; mli->mli_flags = 0; mli->mli_rv = MLD_RV_INIT; mli->mli_qi = MLD_QI_INIT; mli->mli_qri = MLD_QRI_INIT; mli->mli_uri = MLD_URI_INIT; mbufq_init(&mli->mli_gq, MLD_MAX_RESPONSE_PACKETS); LIST_INSERT_HEAD(&V_mli_head, mli, mli_link); CTR2(KTR_MLD, "allocate mld_ifsoftc for ifp %p(%s)", ifp, if_name(ifp)); out: return (mli); } /* * Hook for ifdetach. * * NOTE: Some finalization tasks need to run before the protocol domain * is detached, but also before the link layer does its cleanup. * Run before link-layer cleanup; cleanup groups, but do not free MLD state. * * SMPng: Caller must hold IN6_MULTI_LOCK(). * Must take IF_ADDR_LOCK() to cover if_multiaddrs iterator. * XXX This routine is also bitten by unlocked ifma_protospec access. */ void mld_ifdetach(struct ifnet *ifp) { struct mld_ifsoftc *mli; - struct ifmultiaddr *ifma; + struct ifmultiaddr *ifma, *next; struct in6_multi *inm; struct in6_multi_head inmh; CTR3(KTR_MLD, "%s: called for ifp %p(%s)", __func__, ifp, if_name(ifp)); SLIST_INIT(&inmh); IN6_MULTI_LIST_LOCK_ASSERT(); MLD_LOCK(); mli = MLD_IFINFO(ifp); if (mli->mli_version == MLD_VERSION_2) { - IF_ADDR_RLOCK(ifp); - TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { + IF_ADDR_WLOCK(ifp); + restart: + TAILQ_FOREACH_SAFE(ifma, &ifp->if_multiaddrs, ifma_link, next) { if (ifma->ifma_addr->sa_family != AF_INET6 || ifma->ifma_protospec == NULL) continue; inm = (struct in6_multi *)ifma->ifma_protospec; if (inm->in6m_state == MLD_LEAVING_MEMBER) { in6m_rele_locked(&inmh, inm); ifma->ifma_protospec = NULL; } in6m_clear_recorded(inm); + if (__predict_false(ifma6_restart)) { + ifma6_restart = false; + goto restart; + } } - IF_ADDR_RUNLOCK(ifp); + IF_ADDR_WUNLOCK(ifp); } MLD_UNLOCK(); in6m_release_list_deferred(&inmh); } /* * Hook for domifdetach. * Runs after link-layer cleanup; free MLD state. * * SMPng: Normally called with IF_AFDATA_LOCK held. 
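The restart: label and ifma6_restart flag introduced here follow a simple convention: when the walk notices that if_multiaddrs changed underneath it, it clears the flag and starts the traversal over instead of chasing a possibly stale next pointer. A stripped-down userspace sketch of the same pattern, using an invented TAILQ of nodes rather than ifmultiaddr entries:

#include <stdbool.h>
#include <stdio.h>
#include <sys/queue.h>

struct node {
	int			 id;
	TAILQ_ENTRY(node)	 link;
};
TAILQ_HEAD(nodehead, node);

static bool list_restart;	/* analogue of ifma6_restart */

/* A visitor that may unlink entries and flags that the walk must restart. */
static void
visit(struct nodehead *head, struct node *n)
{
	if (n->id % 2 == 0) {
		TAILQ_REMOVE(head, n, link);
		list_restart = true;
	}
}

int
main(void)
{
	struct nodehead head = TAILQ_HEAD_INITIALIZER(head);
	struct node nodes[5], *n, *next;
	int i;

	for (i = 0; i < 5; i++) {
		nodes[i].id = i;
		TAILQ_INSERT_TAIL(&head, &nodes[i], link);
	}
restart:
	TAILQ_FOREACH_SAFE(n, &head, link, next) {
		visit(&head, n);
		if (list_restart) {
			/* List mutated; forget 'next' and walk again. */
			list_restart = false;
			goto restart;
		}
	}
	TAILQ_FOREACH(n, &head, link)
		printf("kept %d\n", n->id);	/* prints 1 and 3 */
	return (0);
}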
*/ void mld_domifdetach(struct ifnet *ifp) { CTR3(KTR_MLD, "%s: called for ifp %p(%s)", __func__, ifp, if_name(ifp)); MLD_LOCK(); mli_delete_locked(ifp); MLD_UNLOCK(); } static void mli_delete_locked(const struct ifnet *ifp) { struct mld_ifsoftc *mli, *tmli; CTR3(KTR_MLD, "%s: freeing mld_ifsoftc for ifp %p(%s)", __func__, ifp, if_name(ifp)); MLD_LOCK_ASSERT(); LIST_FOREACH_SAFE(mli, &V_mli_head, mli_link, tmli) { if (mli->mli_ifp == ifp) { /* * Free deferred General Query responses. */ mbufq_drain(&mli->mli_gq); LIST_REMOVE(mli, mli_link); free(mli, M_MLD); return; } } } /* * Process a received MLDv1 general or address-specific query. * Assumes that the query header has been pulled up to sizeof(mld_hdr). * * NOTE: Can't be fully const correct as we temporarily embed scope ID in * mld_addr. This is OK as we own the mbuf chain. */ static int mld_v1_input_query(struct ifnet *ifp, const struct ip6_hdr *ip6, /*const*/ struct mld_hdr *mld) { struct ifmultiaddr *ifma; struct mld_ifsoftc *mli; struct in6_multi *inm; int is_general_query; uint16_t timer; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif is_general_query = 0; if (!mld_v1enable) { CTR3(KTR_MLD, "ignore v1 query %s on ifp %p(%s)", ip6_sprintf(ip6tbuf, &mld->mld_addr), ifp, if_name(ifp)); return (0); } /* * RFC3810 Section 6.2: MLD queries must originate from * a router's link-local address. */ if (!IN6_IS_SCOPE_LINKLOCAL(&ip6->ip6_src)) { CTR3(KTR_MLD, "ignore v1 query src %s on ifp %p(%s)", ip6_sprintf(ip6tbuf, &ip6->ip6_src), ifp, if_name(ifp)); return (0); } /* * Do address field validation upfront before we accept * the query. */ if (IN6_IS_ADDR_UNSPECIFIED(&mld->mld_addr)) { /* * MLDv1 General Query. * If this was not sent to the all-nodes group, ignore it. */ struct in6_addr dst; dst = ip6->ip6_dst; in6_clearscope(&dst); if (!IN6_ARE_ADDR_EQUAL(&dst, &in6addr_linklocal_allnodes)) return (EINVAL); is_general_query = 1; } else { /* * Embed scope ID of receiving interface in MLD query for * lookup whilst we don't hold other locks. */ in6_setscope(&mld->mld_addr, ifp, NULL); } IN6_MULTI_LIST_LOCK(); MLD_LOCK(); /* * Switch to MLDv1 host compatibility mode. */ mli = MLD_IFINFO(ifp); KASSERT(mli != NULL, ("%s: no mld_ifsoftc for ifp %p", __func__, ifp)); mld_set_version(mli, MLD_VERSION_1); timer = (ntohs(mld->mld_maxdelay) * PR_FASTHZ) / MLD_TIMER_SCALE; if (timer == 0) timer = 1; IF_ADDR_RLOCK(ifp); if (is_general_query) { /* * For each reporting group joined on this * interface, kick the report timer. */ CTR2(KTR_MLD, "process v1 general query on ifp %p(%s)", - ifp, if_name(ifp)); + ifp, if_name(ifp)); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET6 || ifma->ifma_protospec == NULL) continue; inm = (struct in6_multi *)ifma->ifma_protospec; mld_v1_update_group(inm, timer); } } else { /* * MLDv1 Group-Specific Query. * If this is a group-specific MLDv1 query, we need only * look up the single group to process it. */ inm = in6m_lookup_locked(ifp, &mld->mld_addr); if (inm != NULL) { CTR3(KTR_MLD, "process v1 query %s on ifp %p(%s)", ip6_sprintf(ip6tbuf, &mld->mld_addr), ifp, if_name(ifp)); mld_v1_update_group(inm, timer); } /* XXX Clear embedded scope ID as userland won't expect it. */ in6_clearscope(&mld->mld_addr); } IF_ADDR_RUNLOCK(ifp); MLD_UNLOCK(); IN6_MULTI_LIST_UNLOCK(); return (0); } /* * Update the report timer on a group in response to an MLDv1 query. * * If we are becoming the reporting member for this group, start the timer. 
* If we already are the reporting member for this group, and timer is * below the threshold, reset it. * * We may be updating the group for the first time since we switched * to MLDv2. If we are, then we must clear any recorded source lists, * and transition to REPORTING state; the group timer is overloaded * for group and group-source query responses. * * Unlike MLDv2, the delay per group should be jittered * to avoid bursts of MLDv1 reports. */ static void mld_v1_update_group(struct in6_multi *inm, const int timer) { #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif CTR4(KTR_MLD, "%s: %s/%s timer=%d", __func__, ip6_sprintf(ip6tbuf, &inm->in6m_addr), if_name(inm->in6m_ifp), timer); IN6_MULTI_LIST_LOCK_ASSERT(); switch (inm->in6m_state) { case MLD_NOT_MEMBER: case MLD_SILENT_MEMBER: break; case MLD_REPORTING_MEMBER: if (inm->in6m_timer != 0 && inm->in6m_timer <= timer) { CTR1(KTR_MLD, "%s: REPORTING and timer running, " "skipping.", __func__); break; } /* FALLTHROUGH */ case MLD_SG_QUERY_PENDING_MEMBER: case MLD_G_QUERY_PENDING_MEMBER: case MLD_IDLE_MEMBER: case MLD_LAZY_MEMBER: case MLD_AWAKENING_MEMBER: CTR1(KTR_MLD, "%s: ->REPORTING", __func__); inm->in6m_state = MLD_REPORTING_MEMBER; inm->in6m_timer = MLD_RANDOM_DELAY(timer); V_current_state_timers_running6 = 1; break; case MLD_SLEEPING_MEMBER: CTR1(KTR_MLD, "%s: ->AWAKENING", __func__); inm->in6m_state = MLD_AWAKENING_MEMBER; break; case MLD_LEAVING_MEMBER: break; } } /* * Process a received MLDv2 general, group-specific or * group-and-source-specific query. * * Assumes that the query header has been pulled up to sizeof(mldv2_query). * * Return 0 if successful, otherwise an appropriate error code is returned. */ static int mld_v2_input_query(struct ifnet *ifp, const struct ip6_hdr *ip6, struct mbuf *m, const int off, const int icmp6len) { struct mld_ifsoftc *mli; struct mldv2_query *mld; struct in6_multi *inm; uint32_t maxdelay, nsrc, qqi; int is_general_query; uint16_t timer; uint8_t qrv; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif is_general_query = 0; /* * RFC3810 Section 6.2: MLD queries must originate from * a router's link-local address. */ if (!IN6_IS_SCOPE_LINKLOCAL(&ip6->ip6_src)) { CTR3(KTR_MLD, "ignore v1 query src %s on ifp %p(%s)", ip6_sprintf(ip6tbuf, &ip6->ip6_src), ifp, if_name(ifp)); return (0); } CTR2(KTR_MLD, "input v2 query on ifp %p(%s)", ifp, if_name(ifp)); mld = (struct mldv2_query *)(mtod(m, uint8_t *) + off); maxdelay = ntohs(mld->mld_maxdelay); /* in 1/10ths of a second */ if (maxdelay >= 32768) { maxdelay = (MLD_MRC_MANT(maxdelay) | 0x1000) << (MLD_MRC_EXP(maxdelay) + 3); } timer = (maxdelay * PR_FASTHZ) / MLD_TIMER_SCALE; if (timer == 0) timer = 1; qrv = MLD_QRV(mld->mld_misc); if (qrv < 2) { CTR3(KTR_MLD, "%s: clamping qrv %d to %d", __func__, qrv, MLD_RV_INIT); qrv = MLD_RV_INIT; } qqi = mld->mld_qqi; if (qqi >= 128) { qqi = MLD_QQIC_MANT(mld->mld_qqi) << (MLD_QQIC_EXP(mld->mld_qqi) + 3); } nsrc = ntohs(mld->mld_numsrc); if (nsrc > MLD_MAX_GS_SOURCES) return (EMSGSIZE); if (icmp6len < sizeof(struct mldv2_query) + (nsrc * sizeof(struct in6_addr))) return (EMSGSIZE); /* * Do further input validation upfront to avoid resetting timers * should we need to discard this query. */ if (IN6_IS_ADDR_UNSPECIFIED(&mld->mld_addr)) { /* * A general query with a source list has undefined * behaviour; discard it. */ if (nsrc > 0) return (EINVAL); is_general_query = 1; } else { /* * Embed scope ID of receiving interface in MLD query for * lookup whilst we don't hold other locks (due to KAME * locking lameness). 
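The Maximum Response Code handling above treats values of 32768 and up as a mantissa/exponent encoding (RFC 3810, Section 5.1.3) and then scales the result into fast-timeout ticks. A small standalone sketch of that decode; the macro spellings and the PR_FASTHZ/MLD_TIMER_SCALE values (5 and 1000) are assumptions mirroring the stock headers:

#include <stdint.h>
#include <stdio.h>

/* Assumed to mirror MLD_MRC_MANT()/MLD_MRC_EXP() from mld6.h. */
#define MRC_MANT(x)	((x) & 0x0fff)
#define MRC_EXP(x)	(((x) >> 12) & 0x0007)

/*
 * Maximum Response Code -> Maximum Response Delay, as in the query
 * handler above: small values are taken literally, large ones carry
 * a 12-bit mantissa with an implicit leading 1 and a 3-bit exponent.
 */
static uint32_t
decode_maxdelay(uint16_t mrc)
{
	if (mrc < 32768)
		return (mrc);
	return ((MRC_MANT(mrc) | 0x1000) << (MRC_EXP(mrc) + 3));
}

int
main(void)
{
	/* 0x8400 decodes to (0x400 | 0x1000) << 3 == 40960 protocol units. */
	uint32_t maxdelay = decode_maxdelay(0x8400);
	/* Same scaling as the kernel: ticks = maxdelay * PR_FASTHZ / MLD_TIMER_SCALE. */
	uint32_t timer = maxdelay * 5 / 1000;

	printf("maxdelay %u -> %u fasttimo ticks\n", maxdelay, timer);
	return (0);
}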
We own this mbuf chain just now. */ in6_setscope(&mld->mld_addr, ifp, NULL); } IN6_MULTI_LIST_LOCK(); MLD_LOCK(); mli = MLD_IFINFO(ifp); KASSERT(mli != NULL, ("%s: no mld_ifsoftc for ifp %p", __func__, ifp)); /* * Discard the v2 query if we're in Compatibility Mode. * The RFC is pretty clear that hosts need to stay in MLDv1 mode * until the Old Version Querier Present timer expires. */ if (mli->mli_version != MLD_VERSION_2) goto out_locked; mld_set_version(mli, MLD_VERSION_2); mli->mli_rv = qrv; mli->mli_qi = qqi; mli->mli_qri = maxdelay; CTR4(KTR_MLD, "%s: qrv %d qi %d maxdelay %d", __func__, qrv, qqi, maxdelay); if (is_general_query) { /* * MLDv2 General Query. * * Schedule a current-state report on this ifp for * all groups, possibly containing source lists. * * If there is a pending General Query response * scheduled earlier than the selected delay, do * not schedule any other reports. * Otherwise, reset the interface timer. */ CTR2(KTR_MLD, "process v2 general query on ifp %p(%s)", ifp, if_name(ifp)); if (mli->mli_v2_timer == 0 || mli->mli_v2_timer >= timer) { mli->mli_v2_timer = MLD_RANDOM_DELAY(timer); V_interface_timers_running6 = 1; } } else { /* * MLDv2 Group-specific or Group-and-source-specific Query. * * Group-source-specific queries are throttled on * a per-group basis to defeat denial-of-service attempts. * Queries for groups we are not a member of on this * link are simply ignored. */ IF_ADDR_RLOCK(ifp); inm = in6m_lookup_locked(ifp, &mld->mld_addr); if (inm == NULL) { IF_ADDR_RUNLOCK(ifp); goto out_locked; } if (nsrc > 0) { if (!ratecheck(&inm->in6m_lastgsrtv, &V_mld_gsrdelay)) { CTR1(KTR_MLD, "%s: GS query throttled.", __func__); IF_ADDR_RUNLOCK(ifp); goto out_locked; } } CTR2(KTR_MLD, "process v2 group query on ifp %p(%s)", ifp, if_name(ifp)); /* * If there is a pending General Query response * scheduled sooner than the selected delay, no * further report need be scheduled. * Otherwise, prepare to respond to the * group-specific or group-and-source query. */ if (mli->mli_v2_timer == 0 || mli->mli_v2_timer >= timer) mld_v2_process_group_query(inm, mli, timer, m, off); /* XXX Clear embedded scope ID as userland won't expect it. */ in6_clearscope(&mld->mld_addr); IF_ADDR_RUNLOCK(ifp); } out_locked: MLD_UNLOCK(); IN6_MULTI_LIST_UNLOCK(); return (0); } /* * Process a received MLDv2 group-specific or group-and-source-specific * query. * Return <0 if any error occurred. Currently this is ignored. */ static int mld_v2_process_group_query(struct in6_multi *inm, struct mld_ifsoftc *mli, int timer, struct mbuf *m0, const int off) { struct mldv2_query *mld; int retval; uint16_t nsrc; IN6_MULTI_LIST_LOCK_ASSERT(); MLD_LOCK_ASSERT(); retval = 0; mld = (struct mldv2_query *)(mtod(m0, uint8_t *) + off); switch (inm->in6m_state) { case MLD_NOT_MEMBER: case MLD_SILENT_MEMBER: case MLD_SLEEPING_MEMBER: case MLD_LAZY_MEMBER: case MLD_AWAKENING_MEMBER: case MLD_IDLE_MEMBER: case MLD_LEAVING_MEMBER: return (retval); break; case MLD_REPORTING_MEMBER: case MLD_G_QUERY_PENDING_MEMBER: case MLD_SG_QUERY_PENDING_MEMBER: break; } nsrc = ntohs(mld->mld_numsrc); /* * Deal with group-specific queries upfront. * If any group query is already pending, purge any recorded * source-list state if it exists, and schedule a query response * for this group-specific query. 
*/ if (nsrc == 0) { if (inm->in6m_state == MLD_G_QUERY_PENDING_MEMBER || inm->in6m_state == MLD_SG_QUERY_PENDING_MEMBER) { in6m_clear_recorded(inm); timer = min(inm->in6m_timer, timer); } inm->in6m_state = MLD_G_QUERY_PENDING_MEMBER; inm->in6m_timer = MLD_RANDOM_DELAY(timer); V_current_state_timers_running6 = 1; return (retval); } /* * Deal with the case where a group-and-source-specific query has * been received but a group-specific query is already pending. */ if (inm->in6m_state == MLD_G_QUERY_PENDING_MEMBER) { timer = min(inm->in6m_timer, timer); inm->in6m_timer = MLD_RANDOM_DELAY(timer); V_current_state_timers_running6 = 1; return (retval); } /* * Finally, deal with the case where a group-and-source-specific * query has been received, where a response to a previous g-s-r * query exists, or none exists. * In this case, we need to parse the source-list which the Querier * has provided us with and check if we have any source list filter * entries at T1 for these sources. If we do not, there is no need to * schedule a report and the query may be dropped. * If we do, we must record them and schedule a current-state * report for those sources. */ if (inm->in6m_nsrc > 0) { struct mbuf *m; uint8_t *sp; int i, nrecorded; int soff; m = m0; soff = off + sizeof(struct mldv2_query); nrecorded = 0; for (i = 0; i < nsrc; i++) { sp = mtod(m, uint8_t *) + soff; retval = in6m_record_source(inm, (const struct in6_addr *)sp); if (retval < 0) break; nrecorded += retval; soff += sizeof(struct in6_addr); if (soff >= m->m_len) { soff = soff - m->m_len; m = m->m_next; if (m == NULL) break; } } if (nrecorded > 0) { CTR1(KTR_MLD, "%s: schedule response to SG query", __func__); inm->in6m_state = MLD_SG_QUERY_PENDING_MEMBER; inm->in6m_timer = MLD_RANDOM_DELAY(timer); V_current_state_timers_running6 = 1; } } return (retval); } /* * Process a received MLDv1 host membership report. * Assumes mld points to mld_hdr in pulled up mbuf chain. * * NOTE: Can't be fully const correct as we temporarily embed scope ID in * mld_addr. This is OK as we own the mbuf chain. */ static int mld_v1_input_report(struct ifnet *ifp, const struct ip6_hdr *ip6, /*const*/ struct mld_hdr *mld) { struct in6_addr src, dst; struct in6_ifaddr *ia; struct in6_multi *inm; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif if (!mld_v1enable) { CTR3(KTR_MLD, "ignore v1 report %s on ifp %p(%s)", ip6_sprintf(ip6tbuf, &mld->mld_addr), ifp, if_name(ifp)); return (0); } if (ifp->if_flags & IFF_LOOPBACK) return (0); /* * MLDv1 reports must originate from a host's link-local address, * or the unspecified address (when booting). */ src = ip6->ip6_src; in6_clearscope(&src); if (!IN6_IS_SCOPE_LINKLOCAL(&src) && !IN6_IS_ADDR_UNSPECIFIED(&src)) { CTR3(KTR_MLD, "ignore v1 query src %s on ifp %p(%s)", ip6_sprintf(ip6tbuf, &ip6->ip6_src), ifp, if_name(ifp)); return (EINVAL); } /* * RFC2710 Section 4: MLDv1 reports must pertain to a multicast * group, and must be directed to the group itself. */ dst = ip6->ip6_dst; in6_clearscope(&dst); if (!IN6_IS_ADDR_MULTICAST(&mld->mld_addr) || !IN6_ARE_ADDR_EQUAL(&mld->mld_addr, &dst)) { CTR3(KTR_MLD, "ignore v1 query dst %s on ifp %p(%s)", ip6_sprintf(ip6tbuf, &ip6->ip6_dst), ifp, if_name(ifp)); return (EINVAL); } /* * Make sure we don't hear our own membership report, as fast * leave requires knowing that we are the only member of a * group. Assume we used the link-local address if available, * otherwise look for ::.
* * XXX Note that scope ID comparison is needed for the address * returned by in6ifa_ifpforlinklocal(), but SHOULD NOT be * performed for the on-wire address. */ ia = in6ifa_ifpforlinklocal(ifp, IN6_IFF_NOTREADY|IN6_IFF_ANYCAST); if ((ia && IN6_ARE_ADDR_EQUAL(&ip6->ip6_src, IA6_IN6(ia))) || (ia == NULL && IN6_IS_ADDR_UNSPECIFIED(&src))) { if (ia != NULL) ifa_free(&ia->ia_ifa); return (0); } if (ia != NULL) ifa_free(&ia->ia_ifa); CTR3(KTR_MLD, "process v1 report %s on ifp %p(%s)", ip6_sprintf(ip6tbuf, &mld->mld_addr), ifp, if_name(ifp)); /* * Embed scope ID of receiving interface in MLD query for lookup * whilst we don't hold other locks (due to KAME locking lameness). */ if (!IN6_IS_ADDR_UNSPECIFIED(&mld->mld_addr)) in6_setscope(&mld->mld_addr, ifp, NULL); IN6_MULTI_LIST_LOCK(); MLD_LOCK(); IF_ADDR_RLOCK(ifp); /* * MLDv1 report suppression. * If we are a member of this group, and our membership should be * reported, and our group timer is pending or about to be reset, * stop our group timer by transitioning to the 'lazy' state. */ inm = in6m_lookup_locked(ifp, &mld->mld_addr); if (inm != NULL) { struct mld_ifsoftc *mli; mli = inm->in6m_mli; KASSERT(mli != NULL, ("%s: no mli for ifp %p", __func__, ifp)); /* * If we are in MLDv2 host mode, do not allow the * other host's MLDv1 report to suppress our reports. */ if (mli->mli_version == MLD_VERSION_2) goto out_locked; inm->in6m_timer = 0; switch (inm->in6m_state) { case MLD_NOT_MEMBER: case MLD_SILENT_MEMBER: case MLD_SLEEPING_MEMBER: break; case MLD_REPORTING_MEMBER: case MLD_IDLE_MEMBER: case MLD_AWAKENING_MEMBER: CTR3(KTR_MLD, "report suppressed for %s on ifp %p(%s)", ip6_sprintf(ip6tbuf, &mld->mld_addr), ifp, if_name(ifp)); case MLD_LAZY_MEMBER: inm->in6m_state = MLD_LAZY_MEMBER; break; case MLD_G_QUERY_PENDING_MEMBER: case MLD_SG_QUERY_PENDING_MEMBER: case MLD_LEAVING_MEMBER: break; } } out_locked: IF_ADDR_RUNLOCK(ifp); MLD_UNLOCK(); IN6_MULTI_LIST_UNLOCK(); /* XXX Clear embedded scope ID as userland won't expect it. */ in6_clearscope(&mld->mld_addr); return (0); } /* * MLD input path. * * Assume query messages which fit in a single ICMPv6 message header * have been pulled up. * Assume that userland will want to see the message, even if it * otherwise fails kernel input validation; do not free it. * Pullup may however free the mbuf chain m if it fails. * * Return IPPROTO_DONE if we freed m. Otherwise, return 0. */ int mld_input(struct mbuf *m, int off, int icmp6len) { struct ifnet *ifp; struct ip6_hdr *ip6; struct mld_hdr *mld; int mldlen; CTR3(KTR_MLD, "%s: called w/mbuf (%p,%d)", __func__, m, off); ifp = m->m_pkthdr.rcvif; ip6 = mtod(m, struct ip6_hdr *); /* Pullup to appropriate size. */ mld = (struct mld_hdr *)(mtod(m, uint8_t *) + off); if (mld->mld_type == MLD_LISTENER_QUERY && icmp6len >= sizeof(struct mldv2_query)) { mldlen = sizeof(struct mldv2_query); } else { mldlen = sizeof(struct mld_hdr); } IP6_EXTHDR_GET(mld, struct mld_hdr *, m, off, mldlen); if (mld == NULL) { ICMP6STAT_INC(icp6s_badlen); return (IPPROTO_DONE); } /* * Userland needs to see all of this traffic for implementing * the endpoint discovery portion of multicast routing. 
*/ switch (mld->mld_type) { case MLD_LISTENER_QUERY: icmp6_ifstat_inc(ifp, ifs6_in_mldquery); if (icmp6len == sizeof(struct mld_hdr)) { if (mld_v1_input_query(ifp, ip6, mld) != 0) return (0); } else if (icmp6len >= sizeof(struct mldv2_query)) { if (mld_v2_input_query(ifp, ip6, m, off, icmp6len) != 0) return (0); } break; case MLD_LISTENER_REPORT: icmp6_ifstat_inc(ifp, ifs6_in_mldreport); if (mld_v1_input_report(ifp, ip6, mld) != 0) return (0); break; case MLDV2_LISTENER_REPORT: icmp6_ifstat_inc(ifp, ifs6_in_mldreport); break; case MLD_LISTENER_DONE: icmp6_ifstat_inc(ifp, ifs6_in_mlddone); break; default: break; } return (0); } /* * Fast timeout handler (global). * VIMAGE: Timeout handlers are expected to service all vimages. */ void mld_fasttimo(void) { VNET_ITERATOR_DECL(vnet_iter); VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); mld_fasttimo_vnet(); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); } /* * Fast timeout handler (per-vnet). * * VIMAGE: Assume caller has set up our curvnet. */ static void mld_fasttimo_vnet(void) { struct mbufq scq; /* State-change packets */ struct mbufq qrq; /* Query response packets */ struct ifnet *ifp; struct mld_ifsoftc *mli; - struct ifmultiaddr *ifma; + struct ifmultiaddr *ifma, *next; struct in6_multi *inm, *tinm; struct in6_multi_head inmh; int uri_fasthz; uri_fasthz = 0; /* * Quick check to see if any work needs to be done, in order to * minimize the overhead of fasttimo processing. * SMPng: XXX Unlocked reads. */ if (!V_current_state_timers_running6 && !V_interface_timers_running6 && !V_state_change_timers_running6) return; SLIST_INIT(&inmh); IN6_MULTI_LIST_LOCK(); MLD_LOCK(); /* * MLDv2 General Query response timer processing. */ if (V_interface_timers_running6) { CTR1(KTR_MLD, "%s: interface timers running", __func__); V_interface_timers_running6 = 0; LIST_FOREACH(mli, &V_mli_head, mli_link) { if (mli->mli_v2_timer == 0) { /* Do nothing. */ } else if (--mli->mli_v2_timer == 0) { mld_v2_dispatch_general_query(mli); } else { V_interface_timers_running6 = 1; } } } if (!V_current_state_timers_running6 && !V_state_change_timers_running6) goto out_locked; V_current_state_timers_running6 = 0; V_state_change_timers_running6 = 0; CTR1(KTR_MLD, "%s: state change timers running", __func__); /* * MLD host report and state-change timer processing. * Note: Processing a v2 group timer may remove a node. */ LIST_FOREACH(mli, &V_mli_head, mli_link) { ifp = mli->mli_ifp; if (mli->mli_version == MLD_VERSION_2) { uri_fasthz = MLD_RANDOM_DELAY(mli->mli_uri * PR_FASTHZ); mbufq_init(&qrq, MLD_MAX_G_GS_PACKETS); mbufq_init(&scq, MLD_MAX_STATE_CHANGE_PACKETS); } - IF_ADDR_RLOCK(ifp); - TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { + IF_ADDR_WLOCK(ifp); + restart: + TAILQ_FOREACH_SAFE(ifma, &ifp->if_multiaddrs, ifma_link, next) { if (ifma->ifma_addr->sa_family != AF_INET6 || ifma->ifma_protospec == NULL) continue; inm = (struct in6_multi *)ifma->ifma_protospec; switch (mli->mli_version) { case MLD_VERSION_1: mld_v1_process_group_timer(&inmh, inm); break; case MLD_VERSION_2: mld_v2_process_group_timers(&inmh, &qrq, &scq, inm, uri_fasthz); break; } + if (__predict_false(ifma6_restart)) { + ifma6_restart = false; + goto restart; + } } - IF_ADDR_RUNLOCK(ifp); + IF_ADDR_WUNLOCK(ifp); switch (mli->mli_version) { case MLD_VERSION_1: /* * Transmit reports for this lifecycle. 
This * is done while not holding IF_ADDR_LOCK * since this can call * in6ifa_ifpforlinklocal() which locks * IF_ADDR_LOCK internally as well as * ip6_output() to transmit a packet. */ SLIST_FOREACH_SAFE(inm, &inmh, in6m_nrele, tinm) { SLIST_REMOVE_HEAD(&inmh, in6m_nrele); (void)mld_v1_transmit_report(inm, MLD_LISTENER_REPORT); } break; case MLD_VERSION_2: mld_dispatch_queue(&qrq, 0); mld_dispatch_queue(&scq, 0); /* * Free the in_multi reference(s) for * this lifecycle. */ in6m_release_list_deferred(&inmh); break; } } out_locked: MLD_UNLOCK(); IN6_MULTI_LIST_UNLOCK(); } /* * Update host report group timer. * Will update the global pending timer flags. */ static void mld_v1_process_group_timer(struct in6_multi_head *inmh, struct in6_multi *inm) { int report_timer_expired; IN6_MULTI_LIST_LOCK_ASSERT(); MLD_LOCK_ASSERT(); if (inm->in6m_timer == 0) { report_timer_expired = 0; } else if (--inm->in6m_timer == 0) { report_timer_expired = 1; } else { V_current_state_timers_running6 = 1; return; } switch (inm->in6m_state) { case MLD_NOT_MEMBER: case MLD_SILENT_MEMBER: case MLD_IDLE_MEMBER: case MLD_LAZY_MEMBER: case MLD_SLEEPING_MEMBER: case MLD_AWAKENING_MEMBER: break; case MLD_REPORTING_MEMBER: if (report_timer_expired) { inm->in6m_state = MLD_IDLE_MEMBER; in6m_rele_locked(inmh, inm); } break; case MLD_G_QUERY_PENDING_MEMBER: case MLD_SG_QUERY_PENDING_MEMBER: case MLD_LEAVING_MEMBER: break; } } /* * Update a group's timers for MLDv2. * Will update the global pending timer flags. * Note: Unlocked read from mli. */ static void mld_v2_process_group_timers(struct in6_multi_head *inmh, struct mbufq *qrq, struct mbufq *scq, struct in6_multi *inm, const int uri_fasthz) { int query_response_timer_expired; int state_change_retransmit_timer_expired; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif IN6_MULTI_LIST_LOCK_ASSERT(); MLD_LOCK_ASSERT(); query_response_timer_expired = 0; state_change_retransmit_timer_expired = 0; /* * During a transition from compatibility mode back to MLDv2, * a group record in REPORTING state may still have its group * timer active. This is a no-op in this function; it is easier * to deal with it here than to complicate the slow-timeout path. */ if (inm->in6m_timer == 0) { query_response_timer_expired = 0; } else if (--inm->in6m_timer == 0) { query_response_timer_expired = 1; } else { V_current_state_timers_running6 = 1; } if (inm->in6m_sctimer == 0) { state_change_retransmit_timer_expired = 0; } else if (--inm->in6m_sctimer == 0) { state_change_retransmit_timer_expired = 1; } else { V_state_change_timers_running6 = 1; } /* We are in fasttimo, so be quick about it. */ if (!state_change_retransmit_timer_expired && !query_response_timer_expired) return; switch (inm->in6m_state) { case MLD_NOT_MEMBER: case MLD_SILENT_MEMBER: case MLD_SLEEPING_MEMBER: case MLD_LAZY_MEMBER: case MLD_AWAKENING_MEMBER: case MLD_IDLE_MEMBER: break; case MLD_G_QUERY_PENDING_MEMBER: case MLD_SG_QUERY_PENDING_MEMBER: /* * Respond to a previously pending Group-Specific * or Group-and-Source-Specific query by enqueueing * the appropriate Current-State report for * immediate transmission. 
*/ if (query_response_timer_expired) { int retval; retval = mld_v2_enqueue_group_record(qrq, inm, 0, 1, (inm->in6m_state == MLD_SG_QUERY_PENDING_MEMBER), 0); CTR2(KTR_MLD, "%s: enqueue record = %d", __func__, retval); inm->in6m_state = MLD_REPORTING_MEMBER; in6m_clear_recorded(inm); } /* FALLTHROUGH */ case MLD_REPORTING_MEMBER: case MLD_LEAVING_MEMBER: if (state_change_retransmit_timer_expired) { /* * State-change retransmission timer fired. * If there are any further pending retransmissions, * set the global pending state-change flag, and * reset the timer. */ if (--inm->in6m_scrv > 0) { inm->in6m_sctimer = uri_fasthz; V_state_change_timers_running6 = 1; } /* * Retransmit the previously computed state-change * report. If there are no further pending * retransmissions, the mbuf queue will be consumed. * Update T0 state to T1 as we have now sent * a state-change. */ (void)mld_v2_merge_state_changes(inm, scq); in6m_commit(inm); CTR3(KTR_MLD, "%s: T1 -> T0 for %s/%s", __func__, ip6_sprintf(ip6tbuf, &inm->in6m_addr), if_name(inm->in6m_ifp)); /* * If we are leaving the group for good, make sure * we release MLD's reference to it. * This release must be deferred using a SLIST, * as we are called from a loop which traverses * the in_ifmultiaddr TAILQ. */ if (inm->in6m_state == MLD_LEAVING_MEMBER && inm->in6m_scrv == 0) { inm->in6m_state = MLD_NOT_MEMBER; in6m_rele_locked(inmh, inm); } } break; } } /* * Switch to a different version on the given interface, * as per Section 9.12. */ static void mld_set_version(struct mld_ifsoftc *mli, const int version) { int old_version_timer; MLD_LOCK_ASSERT(); CTR4(KTR_MLD, "%s: switching to v%d on ifp %p(%s)", __func__, version, mli->mli_ifp, if_name(mli->mli_ifp)); if (version == MLD_VERSION_1) { /* * Compute the "Older Version Querier Present" timer as per * Section 9.12. */ old_version_timer = (mli->mli_rv * mli->mli_qi) + mli->mli_qri; old_version_timer *= PR_SLOWHZ; mli->mli_v1_timer = old_version_timer; } if (mli->mli_v1_timer > 0 && mli->mli_version != MLD_VERSION_1) { mli->mli_version = MLD_VERSION_1; mld_v2_cancel_link_timers(mli); } } /* * Cancel pending MLDv2 timers for the given link and all groups * joined on it; state-change, general-query, and group-query timers. */ static void mld_v2_cancel_link_timers(struct mld_ifsoftc *mli) { - struct ifmultiaddr *ifma; + struct ifmultiaddr *ifma, *next; struct ifnet *ifp; struct in6_multi *inm; struct in6_multi_head inmh; CTR3(KTR_MLD, "%s: cancel v2 timers on ifp %p(%s)", __func__, mli->mli_ifp, if_name(mli->mli_ifp)); SLIST_INIT(&inmh); IN6_MULTI_LIST_LOCK_ASSERT(); MLD_LOCK_ASSERT(); /* * Fast-track this potentially expensive operation * by checking all the global 'timer pending' flags. */ if (!V_interface_timers_running6 && !V_state_change_timers_running6 && !V_current_state_timers_running6) return; mli->mli_v2_timer = 0; ifp = mli->mli_ifp; - IF_ADDR_RLOCK(ifp); - TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { + IF_ADDR_WLOCK(ifp); + restart: + TAILQ_FOREACH_SAFE(ifma, &ifp->if_multiaddrs, ifma_link, next) { if (ifma->ifma_addr->sa_family != AF_INET6) continue; inm = (struct in6_multi *)ifma->ifma_protospec; switch (inm->in6m_state) { case MLD_NOT_MEMBER: case MLD_SILENT_MEMBER: case MLD_IDLE_MEMBER: case MLD_LAZY_MEMBER: case MLD_SLEEPING_MEMBER: case MLD_AWAKENING_MEMBER: break; case MLD_LEAVING_MEMBER: /* * If we are leaving the group and switching * version, we need to release the final * reference held for issuing the INCLUDE {}. 
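The Older Version Querier Present timeout computed in mld_set_version() above is simply (robustness * query interval) + query response interval, converted to slow-timeout ticks. A worked sketch, assuming the stock defaults of MLD_RV_INIT == 2, MLD_QI_INIT == 125, MLD_QRI_INIT == 10 and PR_SLOWHZ == 2 (treat these values as assumptions; the real constants live in mld6.h and sys/protosw.h):

#include <stdio.h>

int
main(void)
{
	/* Assumed stock values. */
	int rv = 2;		/* MLD_RV_INIT: robustness variable */
	int qi = 125;		/* MLD_QI_INIT: query interval, seconds */
	int qri = 10;		/* MLD_QRI_INIT: query response interval, seconds */
	int pr_slowhz = 2;	/* slow timeout runs twice a second */

	int seconds = rv * qi + qri;		/* 260 s of v1 compatibility */
	int ticks = seconds * pr_slowhz;	/* what mli_v1_timer is loaded with */

	printf("stay in MLDv1 compatibility mode for %d s (%d slowtimo ticks)\n",
	    seconds, ticks);
	return (0);
}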
*/ in6m_rele_locked(&inmh, inm); ifma->ifma_protospec = NULL; /* FALLTHROUGH */ case MLD_G_QUERY_PENDING_MEMBER: case MLD_SG_QUERY_PENDING_MEMBER: in6m_clear_recorded(inm); /* FALLTHROUGH */ case MLD_REPORTING_MEMBER: inm->in6m_sctimer = 0; inm->in6m_timer = 0; inm->in6m_state = MLD_REPORTING_MEMBER; /* * Free any pending MLDv2 state-change records. */ mbufq_drain(&inm->in6m_scq); break; } + if (__predict_false(ifma6_restart)) { + ifma6_restart = false; + goto restart; + } } - IF_ADDR_RUNLOCK(ifp); + IF_ADDR_WUNLOCK(ifp); in6m_release_list_deferred(&inmh); } /* * Global slowtimo handler. * VIMAGE: Timeout handlers are expected to service all vimages. */ void mld_slowtimo(void) { VNET_ITERATOR_DECL(vnet_iter); VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); mld_slowtimo_vnet(); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); } /* * Per-vnet slowtimo handler. */ static void mld_slowtimo_vnet(void) { struct mld_ifsoftc *mli; MLD_LOCK(); LIST_FOREACH(mli, &V_mli_head, mli_link) { mld_v1_process_querier_timers(mli); } MLD_UNLOCK(); } /* * Update the Older Version Querier Present timers for a link. * See Section 9.12 of RFC 3810. */ static void mld_v1_process_querier_timers(struct mld_ifsoftc *mli) { MLD_LOCK_ASSERT(); if (mli->mli_version != MLD_VERSION_2 && --mli->mli_v1_timer == 0) { /* * MLDv1 Querier Present timer expired; revert to MLDv2. */ CTR5(KTR_MLD, "%s: transition from v%d -> v%d on %p(%s)", __func__, mli->mli_version, MLD_VERSION_2, mli->mli_ifp, if_name(mli->mli_ifp)); mli->mli_version = MLD_VERSION_2; } } /* * Transmit an MLDv1 report immediately. */ static int mld_v1_transmit_report(struct in6_multi *in6m, const int type) { struct ifnet *ifp; struct in6_ifaddr *ia; struct ip6_hdr *ip6; struct mbuf *mh, *md; struct mld_hdr *mld; IN6_MULTI_LIST_LOCK_ASSERT(); MLD_LOCK_ASSERT(); ifp = in6m->in6m_ifp; ia = in6ifa_ifpforlinklocal(ifp, IN6_IFF_NOTREADY|IN6_IFF_ANYCAST); /* ia may be NULL if link-local address is tentative. */ mh = m_gethdr(M_NOWAIT, MT_DATA); if (mh == NULL) { if (ia != NULL) ifa_free(&ia->ia_ifa); return (ENOMEM); } md = m_get(M_NOWAIT, MT_DATA); if (md == NULL) { m_free(mh); if (ia != NULL) ifa_free(&ia->ia_ifa); return (ENOMEM); } mh->m_next = md; /* * FUTURE: Consider increasing alignment by ETHER_HDR_LEN, so * that ether_output() does not need to allocate another mbuf * for the header in the most common case. */ M_ALIGN(mh, sizeof(struct ip6_hdr)); mh->m_pkthdr.len = sizeof(struct ip6_hdr) + sizeof(struct mld_hdr); mh->m_len = sizeof(struct ip6_hdr); ip6 = mtod(mh, struct ip6_hdr *); ip6->ip6_flow = 0; ip6->ip6_vfc &= ~IPV6_VERSION_MASK; ip6->ip6_vfc |= IPV6_VERSION; ip6->ip6_nxt = IPPROTO_ICMPV6; ip6->ip6_src = ia ? ia->ia_addr.sin6_addr : in6addr_any; ip6->ip6_dst = in6m->in6m_addr; md->m_len = sizeof(struct mld_hdr); mld = mtod(md, struct mld_hdr *); mld->mld_type = type; mld->mld_code = 0; mld->mld_cksum = 0; mld->mld_maxdelay = 0; mld->mld_reserved = 0; mld->mld_addr = in6m->in6m_addr; in6_clearscope(&mld->mld_addr); mld->mld_cksum = in6_cksum(mh, IPPROTO_ICMPV6, sizeof(struct ip6_hdr), sizeof(struct mld_hdr)); mld_save_context(mh, ifp); mh->m_flags |= M_MLDV1; mld_dispatch_packet(mh); if (ia != NULL) ifa_free(&ia->ia_ifa); return (0); } /* * Process a state change from the upper layer for the given IPv6 group. * * Each socket holds a reference on the in_multi in its own ip_moptions. 
* The socket layer will have made the necessary updates to the group * state; it is now up to MLD to issue a state change report if there * has been any change between T0 (when the last state-change was issued) * and T1 (now). * * We use the MLDv2 state machine at group level. The MLD module * however makes the decision as to which MLD protocol version to speak. * A state change *from* INCLUDE {} always means an initial join. * A state change *to* INCLUDE {} always means a final leave. * * If delay is non-zero, and the state change is an initial multicast * join, the state change report will be delayed by 'delay' ticks * in units of PR_FASTHZ if MLDv1 is active on the link; otherwise * the initial MLDv2 state change report will be delayed by whichever * is sooner, a pending state-change timer or delay itself. * * VIMAGE: curvnet should have been set by caller, as this routine * is called from the socket option handlers. */ int mld_change_state(struct in6_multi *inm, const int delay) { struct mld_ifsoftc *mli; struct ifnet *ifp; int error; IN6_MULTI_LIST_LOCK_ASSERT(); error = 0; /* * Try to detect if the upper layer just asked us to change state * for an interface which has now gone away. */ KASSERT(inm->in6m_ifma != NULL, ("%s: no ifma", __func__)); ifp = inm->in6m_ifma->ifma_ifp; if (ifp != NULL) { /* * Sanity check that netinet6's notion of ifp is the * same as net's. */ KASSERT(inm->in6m_ifp == ifp, ("%s: bad ifp", __func__)); } MLD_LOCK(); mli = MLD_IFINFO(ifp); KASSERT(mli != NULL, ("%s: no mld_ifsoftc for ifp %p", __func__, ifp)); /* * If we detect a state transition to or from MCAST_UNDEFINED * for this group, then we are starting or finishing an MLD * life cycle for this group. */ if (inm->in6m_st[1].iss_fmode != inm->in6m_st[0].iss_fmode) { CTR3(KTR_MLD, "%s: inm transition %d -> %d", __func__, inm->in6m_st[0].iss_fmode, inm->in6m_st[1].iss_fmode); if (inm->in6m_st[0].iss_fmode == MCAST_UNDEFINED) { CTR1(KTR_MLD, "%s: initial join", __func__); error = mld_initial_join(inm, mli, delay); goto out_locked; } else if (inm->in6m_st[1].iss_fmode == MCAST_UNDEFINED) { CTR1(KTR_MLD, "%s: final leave", __func__); mld_final_leave(inm, mli); goto out_locked; } } else { CTR1(KTR_MLD, "%s: filter set change", __func__); } error = mld_handle_state_change(inm, mli); out_locked: MLD_UNLOCK(); return (error); } /* * Perform the initial join for an MLD group. * * When joining a group: * If the group should have its MLD traffic suppressed, do nothing. * MLDv1 starts sending MLDv1 host membership reports. * MLDv2 will schedule an MLDv2 state-change report containing the * initial state of the membership. * * If the delay argument is non-zero, then we must delay sending the * initial state change for delay ticks (in units of PR_FASTHZ). */ static int mld_initial_join(struct in6_multi *inm, struct mld_ifsoftc *mli, const int delay) { struct ifnet *ifp; struct mbufq *mq; int error, retval, syncstates; int odelay; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif CTR4(KTR_MLD, "%s: initial join %s on ifp %p(%s)", __func__, ip6_sprintf(ip6tbuf, &inm->in6m_addr), inm->in6m_ifp, if_name(inm->in6m_ifp)); error = 0; syncstates = 1; ifp = inm->in6m_ifp; IN6_MULTI_LIST_LOCK_ASSERT(); MLD_LOCK_ASSERT(); KASSERT(mli && mli->mli_ifp == ifp, ("%s: inconsistent ifp", __func__)); /* * Groups joined on loopback or marked as 'not reported' * enter the MLD_SILENT_MEMBER state and * are never reported in any protocol exchanges. * All other groups enter the appropriate state machine * for the version in use on this link.
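The dispatch rule described above reduces to comparing the group's filter mode at T0 and T1 against MCAST_UNDEFINED. A compact sketch of that decision; the enum of outcomes is invented for illustration and the MCAST_* values are those commonly defined for netinet/in.h:

#include <stdio.h>

/* Values as commonly defined for multicast source filter APIs. */
#define MCAST_UNDEFINED	0
#define MCAST_INCLUDE	1
#define MCAST_EXCLUDE	2

enum action { INITIAL_JOIN, FINAL_LEAVE, FILTER_CHANGE };

/* Decide what kind of report a T0 -> T1 filter-mode transition needs. */
static enum action
classify(int t0_fmode, int t1_fmode)
{
	if (t0_fmode != t1_fmode) {
		if (t0_fmode == MCAST_UNDEFINED)
			return (INITIAL_JOIN);	/* group springs into existence */
		if (t1_fmode == MCAST_UNDEFINED)
			return (FINAL_LEAVE);	/* group is going away */
	}
	/* Same mode (or a mode flip with sources): a filter-set change. */
	return (FILTER_CHANGE);
}

int
main(void)
{
	printf("%d %d %d\n",
	    classify(MCAST_UNDEFINED, MCAST_EXCLUDE),	/* INITIAL_JOIN */
	    classify(MCAST_EXCLUDE, MCAST_UNDEFINED),	/* FINAL_LEAVE */
	    classify(MCAST_INCLUDE, MCAST_EXCLUDE));	/* FILTER_CHANGE */
	return (0);
}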
* A link marked as MLIF_SILENT causes MLD to be completely * disabled for the link. */ if ((ifp->if_flags & IFF_LOOPBACK) || (mli->mli_flags & MLIF_SILENT) || !mld_is_addr_reported(&inm->in6m_addr)) { CTR1(KTR_MLD, "%s: not kicking state machine for silent group", __func__); inm->in6m_state = MLD_SILENT_MEMBER; inm->in6m_timer = 0; } else { /* * Deal with overlapping in_multi lifecycle. * If this group was LEAVING, then make sure * we drop the reference we picked up to keep the * group around for the final INCLUDE {} enqueue. */ if (mli->mli_version == MLD_VERSION_2 && inm->in6m_state == MLD_LEAVING_MEMBER) in6m_release_deferred(inm); inm->in6m_state = MLD_REPORTING_MEMBER; switch (mli->mli_version) { case MLD_VERSION_1: /* * If a delay was provided, only use it if * it is greater than the delay normally * used for an MLDv1 state change report, * and delay sending the initial MLDv1 report * by not transitioning to the IDLE state. */ odelay = MLD_RANDOM_DELAY(MLD_V1_MAX_RI * PR_FASTHZ); if (delay) { inm->in6m_timer = max(delay, odelay); V_current_state_timers_running6 = 1; } else { inm->in6m_state = MLD_IDLE_MEMBER; error = mld_v1_transmit_report(inm, MLD_LISTENER_REPORT); if (error == 0) { inm->in6m_timer = odelay; V_current_state_timers_running6 = 1; } } break; case MLD_VERSION_2: /* * Defer update of T0 to T1, until the first copy * of the state change has been transmitted. */ syncstates = 0; /* * Immediately enqueue a State-Change Report for * this interface, freeing any previous reports. * Don't kick the timers if there is nothing to do, * or if an error occurred. */ mq = &inm->in6m_scq; mbufq_drain(mq); retval = mld_v2_enqueue_group_record(mq, inm, 1, 0, 0, (mli->mli_flags & MLIF_USEALLOW)); CTR2(KTR_MLD, "%s: enqueue record = %d", __func__, retval); if (retval <= 0) { error = retval * -1; break; } /* * Schedule transmission of pending state-change * report up to RV times for this link. The timer * will fire at the next mld_fasttimo (~200ms), * giving us an opportunity to merge the reports. * * If a delay was provided to this function, only * use this delay if sooner than the existing one. */ KASSERT(mli->mli_rv > 1, ("%s: invalid robustness %d", __func__, mli->mli_rv)); inm->in6m_scrv = mli->mli_rv; if (delay) { if (inm->in6m_sctimer > 1) { inm->in6m_sctimer = min(inm->in6m_sctimer, delay); } else inm->in6m_sctimer = delay; } else inm->in6m_sctimer = 1; V_state_change_timers_running6 = 1; error = 0; break; } } /* * Only update the T0 state if state change is atomic, * i.e. we don't need to wait for a timer to fire before we * can consider the state change to have been communicated. */ if (syncstates) { in6m_commit(inm); CTR3(KTR_MLD, "%s: T1 -> T0 for %s/%s", __func__, ip6_sprintf(ip6tbuf, &inm->in6m_addr), if_name(inm->in6m_ifp)); } return (error); } /* * Issue an intermediate state change during the life-cycle. 
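 *
 * One illustrative case: a socket that is already a member of a group
 * in EXCLUDE mode updates its source filter list, so the filter mode
 * at both T0 and T1 is EXCLUDE and only the per-source deltas need to
 * be reported; neither an initial join nor a final leave is involved.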
*/ static int mld_handle_state_change(struct in6_multi *inm, struct mld_ifsoftc *mli) { struct ifnet *ifp; int retval; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif CTR4(KTR_MLD, "%s: state change for %s on ifp %p(%s)", __func__, ip6_sprintf(ip6tbuf, &inm->in6m_addr), inm->in6m_ifp, if_name(inm->in6m_ifp)); ifp = inm->in6m_ifp; IN6_MULTI_LIST_LOCK_ASSERT(); MLD_LOCK_ASSERT(); KASSERT(mli && mli->mli_ifp == ifp, ("%s: inconsistent ifp", __func__)); if ((ifp->if_flags & IFF_LOOPBACK) || (mli->mli_flags & MLIF_SILENT) || !mld_is_addr_reported(&inm->in6m_addr) || (mli->mli_version != MLD_VERSION_2)) { if (!mld_is_addr_reported(&inm->in6m_addr)) { CTR1(KTR_MLD, "%s: not kicking state machine for silent group", __func__); } CTR1(KTR_MLD, "%s: nothing to do", __func__); in6m_commit(inm); CTR3(KTR_MLD, "%s: T1 -> T0 for %s/%s", __func__, ip6_sprintf(ip6tbuf, &inm->in6m_addr), if_name(inm->in6m_ifp)); return (0); } mbufq_drain(&inm->in6m_scq); retval = mld_v2_enqueue_group_record(&inm->in6m_scq, inm, 1, 0, 0, (mli->mli_flags & MLIF_USEALLOW)); CTR2(KTR_MLD, "%s: enqueue record = %d", __func__, retval); if (retval <= 0) return (-retval); /* * If record(s) were enqueued, start the state-change * report timer for this group. */ inm->in6m_scrv = mli->mli_rv; inm->in6m_sctimer = 1; V_state_change_timers_running6 = 1; return (0); } /* * Perform the final leave for a multicast address. * * When leaving a group: * MLDv1 sends a DONE message, if and only if we are the reporter. * MLDv2 enqueues a state-change report containing a transition * to INCLUDE {} for immediate transmission. */ static void mld_final_leave(struct in6_multi *inm, struct mld_ifsoftc *mli) { int syncstates; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif syncstates = 1; CTR4(KTR_MLD, "%s: final leave %s on ifp %p(%s)", __func__, ip6_sprintf(ip6tbuf, &inm->in6m_addr), inm->in6m_ifp, if_name(inm->in6m_ifp)); IN6_MULTI_LIST_LOCK_ASSERT(); MLD_LOCK_ASSERT(); switch (inm->in6m_state) { case MLD_NOT_MEMBER: case MLD_SILENT_MEMBER: case MLD_LEAVING_MEMBER: /* Already leaving or left; do nothing. */ CTR1(KTR_MLD, "%s: not kicking state machine for silent group", __func__); break; case MLD_REPORTING_MEMBER: case MLD_IDLE_MEMBER: case MLD_G_QUERY_PENDING_MEMBER: case MLD_SG_QUERY_PENDING_MEMBER: if (mli->mli_version == MLD_VERSION_1) { #ifdef INVARIANTS if (inm->in6m_state == MLD_G_QUERY_PENDING_MEMBER || inm->in6m_state == MLD_SG_QUERY_PENDING_MEMBER) panic("%s: MLDv2 state reached, not MLDv2 mode", __func__); #endif mld_v1_transmit_report(inm, MLD_LISTENER_DONE); inm->in6m_state = MLD_NOT_MEMBER; V_current_state_timers_running6 = 1; } else if (mli->mli_version == MLD_VERSION_2) { /* * Stop group timer and all pending reports. * Immediately enqueue a state-change report * TO_IN {} to be sent on the next fast timeout, * giving us an opportunity to merge reports. 
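 *
 * The record is retransmitted up to mli_rv (the link's Robustness
 * Variable) times; with the RFC 3810 default of 2, for example, the
 * TO_IN {} report goes out on the next ~200ms fast timeout and is
 * repeated once more before the leave completes.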
*/ mbufq_drain(&inm->in6m_scq); inm->in6m_timer = 0; inm->in6m_scrv = mli->mli_rv; CTR4(KTR_MLD, "%s: Leaving %s/%s with %d " "pending retransmissions.", __func__, ip6_sprintf(ip6tbuf, &inm->in6m_addr), if_name(inm->in6m_ifp), inm->in6m_scrv); if (inm->in6m_scrv == 0) { inm->in6m_state = MLD_NOT_MEMBER; inm->in6m_sctimer = 0; } else { int retval; in6m_acquire_locked(inm); retval = mld_v2_enqueue_group_record( &inm->in6m_scq, inm, 1, 0, 0, (mli->mli_flags & MLIF_USEALLOW)); KASSERT(retval != 0, ("%s: enqueue record = %d", __func__, retval)); inm->in6m_state = MLD_LEAVING_MEMBER; inm->in6m_sctimer = 1; V_state_change_timers_running6 = 1; syncstates = 0; } break; } break; case MLD_LAZY_MEMBER: case MLD_SLEEPING_MEMBER: case MLD_AWAKENING_MEMBER: /* Our reports are suppressed; do nothing. */ break; } if (syncstates) { in6m_commit(inm); CTR3(KTR_MLD, "%s: T1 -> T0 for %s/%s", __func__, ip6_sprintf(ip6tbuf, &inm->in6m_addr), if_name(inm->in6m_ifp)); inm->in6m_st[1].iss_fmode = MCAST_UNDEFINED; CTR3(KTR_MLD, "%s: T1 now MCAST_UNDEFINED for %p/%s", __func__, &inm->in6m_addr, if_name(inm->in6m_ifp)); } } /* * Enqueue an MLDv2 group record to the given output queue. * * If is_state_change is zero, a current-state record is appended. * If is_state_change is non-zero, a state-change report is appended. * * If is_group_query is non-zero, an mbuf packet chain is allocated. * If is_group_query is zero, and if there is a packet with free space * at the tail of the queue, it will be appended to providing there * is enough free space. * Otherwise a new mbuf packet chain is allocated. * * If is_source_query is non-zero, each source is checked to see if * it was recorded for a Group-Source query, and will be omitted if * it is not both in-mode and recorded. * * If use_block_allow is non-zero, state change reports for initial join * and final leave, on an inclusive mode group with a source list, will be * rewritten to use the ALLOW_NEW and BLOCK_OLD record types, respectively. * * The function will attempt to allocate leading space in the packet * for the IPv6+ICMP headers to be prepended without fragmenting the chain. * * If successful the size of all data appended to the queue is returned, * otherwise an error code less than zero is returned, or zero if * no record(s) were appended. */ static int mld_v2_enqueue_group_record(struct mbufq *mq, struct in6_multi *inm, const int is_state_change, const int is_group_query, const int is_source_query, const int use_block_allow) { struct mldv2_record mr; struct mldv2_record *pmr; struct ifnet *ifp; struct ip6_msource *ims, *nims; struct mbuf *m0, *m, *md; int is_filter_list_change; int minrec0len, m0srcs, msrcs, nbytes, off; int record_has_sources; int now; int type; uint8_t mode; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif IN6_MULTI_LIST_LOCK_ASSERT(); ifp = inm->in6m_ifp; is_filter_list_change = 0; m = NULL; m0 = NULL; m0srcs = 0; msrcs = 0; nbytes = 0; nims = NULL; record_has_sources = 1; pmr = NULL; type = MLD_DO_NOTHING; mode = inm->in6m_st[1].iss_fmode; /* * If we did not transition out of ASM mode during t0->t1, * and there are no source nodes to process, we can skip * the generation of source records. */ if (inm->in6m_st[0].iss_asm > 0 && inm->in6m_st[1].iss_asm > 0 && inm->in6m_nsrc == 0) record_has_sources = 0; if (is_state_change) { /* * Queue a state change record. * If the mode did not change, and there are non-ASM * listeners or source filters present, * we potentially need to issue two records for the group. 
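 *
 * (One concrete case: a group whose filter mode stays EXCLUDE but
 * whose source list both gains and loses entries between T0 and T1
 * may need an ALLOW_NEW_SOURCES record and a BLOCK_OLD_SOURCES record
 * in the same report; mld_v2_enqueue_filter_change() below produces
 * that split.)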
* If there are ASM listeners, and there was no filter * mode transition of any kind, do nothing. * * If we are transitioning to MCAST_UNDEFINED, we need * not send any sources. A transition to/from this state is * considered inclusive with some special treatment. * * If we are rewriting initial joins/leaves to use * ALLOW/BLOCK, and the group's membership is inclusive, * we need to send sources in all cases. */ if (mode != inm->in6m_st[0].iss_fmode) { if (mode == MCAST_EXCLUDE) { CTR1(KTR_MLD, "%s: change to EXCLUDE", __func__); type = MLD_CHANGE_TO_EXCLUDE_MODE; } else { CTR1(KTR_MLD, "%s: change to INCLUDE", __func__); if (use_block_allow) { /* * XXX * Here we're interested in state * edges either direction between * MCAST_UNDEFINED and MCAST_INCLUDE. * Perhaps we should just check * the group state, rather than * the filter mode. */ if (mode == MCAST_UNDEFINED) { type = MLD_BLOCK_OLD_SOURCES; } else { type = MLD_ALLOW_NEW_SOURCES; } } else { type = MLD_CHANGE_TO_INCLUDE_MODE; if (mode == MCAST_UNDEFINED) record_has_sources = 0; } } } else { if (record_has_sources) { is_filter_list_change = 1; } else { type = MLD_DO_NOTHING; } } } else { /* * Queue a current state record. */ if (mode == MCAST_EXCLUDE) { type = MLD_MODE_IS_EXCLUDE; } else if (mode == MCAST_INCLUDE) { type = MLD_MODE_IS_INCLUDE; KASSERT(inm->in6m_st[1].iss_asm == 0, ("%s: inm %p is INCLUDE but ASM count is %d", __func__, inm, inm->in6m_st[1].iss_asm)); } } /* * Generate the filter list changes using a separate function. */ if (is_filter_list_change) return (mld_v2_enqueue_filter_change(mq, inm)); if (type == MLD_DO_NOTHING) { CTR3(KTR_MLD, "%s: nothing to do for %s/%s", __func__, ip6_sprintf(ip6tbuf, &inm->in6m_addr), if_name(inm->in6m_ifp)); return (0); } /* * If any sources are present, we must be able to fit at least * one in the trailing space of the tail packet's mbuf, * ideally more. */ minrec0len = sizeof(struct mldv2_record); if (record_has_sources) minrec0len += sizeof(struct in6_addr); CTR4(KTR_MLD, "%s: queueing %s for %s/%s", __func__, mld_rec_type_to_str(type), ip6_sprintf(ip6tbuf, &inm->in6m_addr), if_name(inm->in6m_ifp)); /* * Check if we have a packet in the tail of the queue for this * group into which the first group record for this group will fit. * Otherwise allocate a new packet. * Always allocate leading space for IP6+RA+ICMPV6+REPORT. * Note: Group records for G/GSR query responses MUST be sent * in their own packet. */ m0 = mbufq_last(mq); if (!is_group_query && m0 != NULL && (m0->m_pkthdr.PH_vt.vt_nrecs + 1 <= MLD_V2_REPORT_MAXRECS) && (m0->m_pkthdr.len + minrec0len) < (ifp->if_mtu - MLD_MTUSPACE)) { m0srcs = (ifp->if_mtu - m0->m_pkthdr.len - sizeof(struct mldv2_record)) / sizeof(struct in6_addr); m = m0; CTR1(KTR_MLD, "%s: use existing packet", __func__); } else { if (mbufq_full(mq)) { CTR1(KTR_MLD, "%s: outbound queue full", __func__); return (-ENOMEM); } m = NULL; m0srcs = (ifp->if_mtu - MLD_MTUSPACE - sizeof(struct mldv2_record)) / sizeof(struct in6_addr); if (!is_state_change && !is_group_query) m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); if (m == NULL) m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) return (-ENOMEM); mld_save_context(m, ifp); CTR1(KTR_MLD, "%s: allocated first packet", __func__); } /* * Append group record. * If we have sources, we don't know how many yet. 
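 *
 * The fixed portion of a group record follows RFC 3810 section 5.2:
 * Record Type (1 byte), Aux Data Len (1), Number of Sources (2) and
 * the 16-byte Multicast Address, i.e. 20 bytes, followed by any
 * 16-byte source addresses; mr_numsrc is patched in below once the
 * number of sources that actually fit is known.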
*/ mr.mr_type = type; mr.mr_datalen = 0; mr.mr_numsrc = 0; mr.mr_addr = inm->in6m_addr; in6_clearscope(&mr.mr_addr); if (!m_append(m, sizeof(struct mldv2_record), (void *)&mr)) { if (m != m0) m_freem(m); CTR1(KTR_MLD, "%s: m_append() failed.", __func__); return (-ENOMEM); } nbytes += sizeof(struct mldv2_record); /* * Append as many sources as will fit in the first packet. * If we are appending to a new packet, the chain allocation * may potentially use clusters; use m_getptr() in this case. * If we are appending to an existing packet, we need to obtain * a pointer to the group record after m_append(), in case a new * mbuf was allocated. * * Only append sources which are in-mode at t1. If we are * transitioning to MCAST_UNDEFINED state on the group, and * use_block_allow is zero, do not include source entries. * Otherwise, we need to include this source in the report. * * Only report recorded sources in our filter set when responding * to a group-source query. */ if (record_has_sources) { if (m == m0) { md = m_last(m); pmr = (struct mldv2_record *)(mtod(md, uint8_t *) + md->m_len - nbytes); } else { md = m_getptr(m, 0, &off); pmr = (struct mldv2_record *)(mtod(md, uint8_t *) + off); } msrcs = 0; RB_FOREACH_SAFE(ims, ip6_msource_tree, &inm->in6m_srcs, nims) { CTR2(KTR_MLD, "%s: visit node %s", __func__, ip6_sprintf(ip6tbuf, &ims->im6s_addr)); now = im6s_get_mode(inm, ims, 1); CTR2(KTR_MLD, "%s: node is %d", __func__, now); if ((now != mode) || (now == mode && (!use_block_allow && mode == MCAST_UNDEFINED))) { CTR1(KTR_MLD, "%s: skip node", __func__); continue; } if (is_source_query && ims->im6s_stp == 0) { CTR1(KTR_MLD, "%s: skip unrecorded node", __func__); continue; } CTR1(KTR_MLD, "%s: append node", __func__); if (!m_append(m, sizeof(struct in6_addr), (void *)&ims->im6s_addr)) { if (m != m0) m_freem(m); CTR1(KTR_MLD, "%s: m_append() failed.", __func__); return (-ENOMEM); } nbytes += sizeof(struct in6_addr); ++msrcs; if (msrcs == m0srcs) break; } CTR2(KTR_MLD, "%s: msrcs is %d this packet", __func__, msrcs); pmr->mr_numsrc = htons(msrcs); nbytes += (msrcs * sizeof(struct in6_addr)); } if (is_source_query && msrcs == 0) { CTR1(KTR_MLD, "%s: no recorded sources to report", __func__); if (m != m0) m_freem(m); return (0); } /* * We are good to go with first packet. */ if (m != m0) { CTR1(KTR_MLD, "%s: enqueueing first packet", __func__); m->m_pkthdr.PH_vt.vt_nrecs = 1; mbufq_enqueue(mq, m); } else m->m_pkthdr.PH_vt.vt_nrecs++; /* * No further work needed if no source list in packet(s). */ if (!record_has_sources) return (nbytes); /* * Whilst sources remain to be announced, we need to allocate * a new packet and fill out as many sources as will fit. * Always try for a cluster first. 
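 *
 * As a rough worked example (assuming MLD_MTUSPACE accounts for the
 * IPv6 header, the hop-by-hop Router Alert option and the report
 * header, about 56 bytes): on a 1500-byte MTU link each follow-on
 * packet can carry m0srcs = (1500 - 56 - 20) / 16, i.e. roughly 89
 * source addresses in its group record.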
*/ while (nims != NULL) { if (mbufq_full(mq)) { CTR1(KTR_MLD, "%s: outbound queue full", __func__); return (-ENOMEM); } m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); if (m == NULL) m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) return (-ENOMEM); mld_save_context(m, ifp); md = m_getptr(m, 0, &off); pmr = (struct mldv2_record *)(mtod(md, uint8_t *) + off); CTR1(KTR_MLD, "%s: allocated next packet", __func__); if (!m_append(m, sizeof(struct mldv2_record), (void *)&mr)) { if (m != m0) m_freem(m); CTR1(KTR_MLD, "%s: m_append() failed.", __func__); return (-ENOMEM); } m->m_pkthdr.PH_vt.vt_nrecs = 1; nbytes += sizeof(struct mldv2_record); m0srcs = (ifp->if_mtu - MLD_MTUSPACE - sizeof(struct mldv2_record)) / sizeof(struct in6_addr); msrcs = 0; RB_FOREACH_FROM(ims, ip6_msource_tree, nims) { CTR2(KTR_MLD, "%s: visit node %s", __func__, ip6_sprintf(ip6tbuf, &ims->im6s_addr)); now = im6s_get_mode(inm, ims, 1); if ((now != mode) || (now == mode && (!use_block_allow && mode == MCAST_UNDEFINED))) { CTR1(KTR_MLD, "%s: skip node", __func__); continue; } if (is_source_query && ims->im6s_stp == 0) { CTR1(KTR_MLD, "%s: skip unrecorded node", __func__); continue; } CTR1(KTR_MLD, "%s: append node", __func__); if (!m_append(m, sizeof(struct in6_addr), (void *)&ims->im6s_addr)) { if (m != m0) m_freem(m); CTR1(KTR_MLD, "%s: m_append() failed.", __func__); return (-ENOMEM); } ++msrcs; if (msrcs == m0srcs) break; } pmr->mr_numsrc = htons(msrcs); nbytes += (msrcs * sizeof(struct in6_addr)); CTR1(KTR_MLD, "%s: enqueueing next packet", __func__); mbufq_enqueue(mq, m); } return (nbytes); } /* * Type used to mark record pass completion. * We exploit the fact we can cast to this easily from the * current filter modes on each ip_msource node. */ typedef enum { REC_NONE = 0x00, /* MCAST_UNDEFINED */ REC_ALLOW = 0x01, /* MCAST_INCLUDE */ REC_BLOCK = 0x02, /* MCAST_EXCLUDE */ REC_FULL = REC_ALLOW | REC_BLOCK } rectype_t; /* * Enqueue an MLDv2 filter list change to the given output queue. * * Source list filter state is held in an RB-tree. When the filter list * for a group is changed without changing its mode, we need to compute * the deltas between T0 and T1 for each source in the filter set, * and enqueue the appropriate ALLOW_NEW/BLOCK_OLD records. * * As we may potentially queue two record types, and the entire R-B tree * needs to be walked at once, we break this out into its own function * so we can generate a tightly packed queue of packets. * * XXX This could be written to only use one tree walk, although that makes * serializing into the mbuf chains a bit harder. For now we do two walks * which makes things easier on us, and it may or may not be harder on * the L2 cache. * * If successful the size of all data appended to the queue is returned, * otherwise an error code less than zero is returned, or zero if * no record(s) were appended. 
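 *
 * For instance, a membership that moves from INCLUDE {A, B} at T0 to
 * INCLUDE {B, C} at T1, with the mode unchanged, is encoded as two
 * records: ALLOW_NEW_SOURCES {C} and BLOCK_OLD_SOURCES {A}; B did not
 * change and is not reported at all.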
*/ static int mld_v2_enqueue_filter_change(struct mbufq *mq, struct in6_multi *inm) { static const int MINRECLEN = sizeof(struct mldv2_record) + sizeof(struct in6_addr); struct ifnet *ifp; struct mldv2_record mr; struct mldv2_record *pmr; struct ip6_msource *ims, *nims; struct mbuf *m, *m0, *md; int m0srcs, nbytes, npbytes, off, rsrcs, schanged; int nallow, nblock; uint8_t mode, now, then; rectype_t crt, drt, nrt; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif IN6_MULTI_LIST_LOCK_ASSERT(); if (inm->in6m_nsrc == 0 || (inm->in6m_st[0].iss_asm > 0 && inm->in6m_st[1].iss_asm > 0)) return (0); ifp = inm->in6m_ifp; /* interface */ mode = inm->in6m_st[1].iss_fmode; /* filter mode at t1 */ crt = REC_NONE; /* current group record type */ drt = REC_NONE; /* mask of completed group record types */ nrt = REC_NONE; /* record type for current node */ m0srcs = 0; /* # source which will fit in current mbuf chain */ npbytes = 0; /* # of bytes appended this packet */ nbytes = 0; /* # of bytes appended to group's state-change queue */ rsrcs = 0; /* # sources encoded in current record */ schanged = 0; /* # nodes encoded in overall filter change */ nallow = 0; /* # of source entries in ALLOW_NEW */ nblock = 0; /* # of source entries in BLOCK_OLD */ nims = NULL; /* next tree node pointer */ /* * For each possible filter record mode. * The first kind of source we encounter tells us which * is the first kind of record we start appending. * If a node transitioned to UNDEFINED at t1, its mode is treated * as the inverse of the group's filter mode. */ while (drt != REC_FULL) { do { m0 = mbufq_last(mq); if (m0 != NULL && (m0->m_pkthdr.PH_vt.vt_nrecs + 1 <= MLD_V2_REPORT_MAXRECS) && (m0->m_pkthdr.len + MINRECLEN) < (ifp->if_mtu - MLD_MTUSPACE)) { m = m0; m0srcs = (ifp->if_mtu - m0->m_pkthdr.len - sizeof(struct mldv2_record)) / sizeof(struct in6_addr); CTR1(KTR_MLD, "%s: use previous packet", __func__); } else { m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); if (m == NULL) m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) { CTR1(KTR_MLD, "%s: m_get*() failed", __func__); return (-ENOMEM); } m->m_pkthdr.PH_vt.vt_nrecs = 0; mld_save_context(m, ifp); m0srcs = (ifp->if_mtu - MLD_MTUSPACE - sizeof(struct mldv2_record)) / sizeof(struct in6_addr); npbytes = 0; CTR1(KTR_MLD, "%s: allocated new packet", __func__); } /* * Append the MLD group record header to the * current packet's data area. * Recalculate pointer to free space for next * group record, in case m_append() allocated * a new mbuf or cluster. */ memset(&mr, 0, sizeof(mr)); mr.mr_addr = inm->in6m_addr; in6_clearscope(&mr.mr_addr); if (!m_append(m, sizeof(mr), (void *)&mr)) { if (m != m0) m_freem(m); CTR1(KTR_MLD, "%s: m_append() failed", __func__); return (-ENOMEM); } npbytes += sizeof(struct mldv2_record); if (m != m0) { /* new packet; offset in chain */ md = m_getptr(m, npbytes - sizeof(struct mldv2_record), &off); pmr = (struct mldv2_record *)(mtod(md, uint8_t *) + off); } else { /* current packet; offset from last append */ md = m_last(m); pmr = (struct mldv2_record *)(mtod(md, uint8_t *) + md->m_len - sizeof(struct mldv2_record)); } /* * Begin walking the tree for this record type * pass, or continue from where we left off * previously if we had to allocate a new packet. * Only report deltas in-mode at t1. * We need not report included sources as allowed * if we are in inclusive mode on the group, * however the converse is not true. 
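 *
 * Concretely, a source that transitioned to UNDEFINED at t1 on an
 * EXCLUDE-mode group ends up in an ALLOW_NEW_SOURCES record (it is no
 * longer blocked), while on an INCLUDE-mode group it ends up in a
 * BLOCK_OLD_SOURCES record.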
*/ rsrcs = 0; if (nims == NULL) { nims = RB_MIN(ip6_msource_tree, &inm->in6m_srcs); } RB_FOREACH_FROM(ims, ip6_msource_tree, nims) { CTR2(KTR_MLD, "%s: visit node %s", __func__, ip6_sprintf(ip6tbuf, &ims->im6s_addr)); now = im6s_get_mode(inm, ims, 1); then = im6s_get_mode(inm, ims, 0); CTR3(KTR_MLD, "%s: mode: t0 %d, t1 %d", __func__, then, now); if (now == then) { CTR1(KTR_MLD, "%s: skip unchanged", __func__); continue; } if (mode == MCAST_EXCLUDE && now == MCAST_INCLUDE) { CTR1(KTR_MLD, "%s: skip IN src on EX group", __func__); continue; } nrt = (rectype_t)now; if (nrt == REC_NONE) nrt = (rectype_t)(~mode & REC_FULL); if (schanged++ == 0) { crt = nrt; } else if (crt != nrt) continue; if (!m_append(m, sizeof(struct in6_addr), (void *)&ims->im6s_addr)) { if (m != m0) m_freem(m); CTR1(KTR_MLD, "%s: m_append() failed", __func__); return (-ENOMEM); } nallow += !!(crt == REC_ALLOW); nblock += !!(crt == REC_BLOCK); if (++rsrcs == m0srcs) break; } /* * If we did not append any tree nodes on this * pass, back out of allocations. */ if (rsrcs == 0) { npbytes -= sizeof(struct mldv2_record); if (m != m0) { CTR1(KTR_MLD, "%s: m_free(m)", __func__); m_freem(m); } else { CTR1(KTR_MLD, "%s: m_adj(m, -mr)", __func__); m_adj(m, -((int)sizeof( struct mldv2_record))); } continue; } npbytes += (rsrcs * sizeof(struct in6_addr)); if (crt == REC_ALLOW) pmr->mr_type = MLD_ALLOW_NEW_SOURCES; else if (crt == REC_BLOCK) pmr->mr_type = MLD_BLOCK_OLD_SOURCES; pmr->mr_numsrc = htons(rsrcs); /* * Count the new group record, and enqueue this * packet if it wasn't already queued. */ m->m_pkthdr.PH_vt.vt_nrecs++; if (m != m0) mbufq_enqueue(mq, m); nbytes += npbytes; } while (nims != NULL); drt |= crt; crt = (~crt & REC_FULL); } CTR3(KTR_MLD, "%s: queued %d ALLOW_NEW, %d BLOCK_OLD", __func__, nallow, nblock); return (nbytes); } static int mld_v2_merge_state_changes(struct in6_multi *inm, struct mbufq *scq) { struct mbufq *gq; struct mbuf *m; /* pending state-change */ struct mbuf *m0; /* copy of pending state-change */ struct mbuf *mt; /* last state-change in packet */ int docopy, domerge; u_int recslen; docopy = 0; domerge = 0; recslen = 0; IN6_MULTI_LIST_LOCK_ASSERT(); MLD_LOCK_ASSERT(); /* * If there are further pending retransmissions, make a writable * copy of each queued state-change message before merging. */ if (inm->in6m_scrv > 0) docopy = 1; gq = &inm->in6m_scq; #ifdef KTR if (mbufq_first(gq) == NULL) { CTR2(KTR_MLD, "%s: WARNING: queue for inm %p is empty", __func__, inm); } #endif m = mbufq_first(gq); while (m != NULL) { /* * Only merge the report into the current packet if * there is sufficient space to do so; an MLDv2 report * packet may only contain 65,535 group records. * Always use a simple mbuf chain concatentation to do this, * as large state changes for single groups may have * allocated clusters. 
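 *
 * (The 65,535 limit simply reflects the 16-bit Number of Group
 * Records field in the MLDv2 report header; the merge test below also
 * keeps the combined packet within the interface MTU less
 * MLD_MTUSPACE.)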
*/ domerge = 0; mt = mbufq_last(scq); if (mt != NULL) { recslen = m_length(m, NULL); if ((mt->m_pkthdr.PH_vt.vt_nrecs + m->m_pkthdr.PH_vt.vt_nrecs <= MLD_V2_REPORT_MAXRECS) && (mt->m_pkthdr.len + recslen <= (inm->in6m_ifp->if_mtu - MLD_MTUSPACE))) domerge = 1; } if (!domerge && mbufq_full(gq)) { CTR2(KTR_MLD, "%s: outbound queue full, skipping whole packet %p", __func__, m); mt = m->m_nextpkt; if (!docopy) m_freem(m); m = mt; continue; } if (!docopy) { CTR2(KTR_MLD, "%s: dequeueing %p", __func__, m); m0 = mbufq_dequeue(gq); m = m0->m_nextpkt; } else { CTR2(KTR_MLD, "%s: copying %p", __func__, m); m0 = m_dup(m, M_NOWAIT); if (m0 == NULL) return (ENOMEM); m0->m_nextpkt = NULL; m = m->m_nextpkt; } if (!domerge) { CTR3(KTR_MLD, "%s: queueing %p to scq %p)", __func__, m0, scq); mbufq_enqueue(scq, m0); } else { struct mbuf *mtl; /* last mbuf of packet mt */ CTR3(KTR_MLD, "%s: merging %p with ifscq tail %p)", __func__, m0, mt); mtl = m_last(mt); m0->m_flags &= ~M_PKTHDR; mt->m_pkthdr.len += recslen; mt->m_pkthdr.PH_vt.vt_nrecs += m0->m_pkthdr.PH_vt.vt_nrecs; mtl->m_next = m0; } } return (0); } /* * Respond to a pending MLDv2 General Query. */ static void mld_v2_dispatch_general_query(struct mld_ifsoftc *mli) { struct ifmultiaddr *ifma; struct ifnet *ifp; struct in6_multi *inm; int retval; IN6_MULTI_LIST_LOCK_ASSERT(); MLD_LOCK_ASSERT(); KASSERT(mli->mli_version == MLD_VERSION_2, ("%s: called when version %d", __func__, mli->mli_version)); /* * Check that there are some packets queued. If so, send them first. * For large number of groups the reply to general query can take * many packets, we should finish sending them before starting of * queuing the new reply. */ if (mbufq_len(&mli->mli_gq) != 0) goto send; ifp = mli->mli_ifp; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET6 || ifma->ifma_protospec == NULL) continue; inm = (struct in6_multi *)ifma->ifma_protospec; KASSERT(ifp == inm->in6m_ifp, ("%s: inconsistent ifp", __func__)); switch (inm->in6m_state) { case MLD_NOT_MEMBER: case MLD_SILENT_MEMBER: break; case MLD_REPORTING_MEMBER: case MLD_IDLE_MEMBER: case MLD_LAZY_MEMBER: case MLD_SLEEPING_MEMBER: case MLD_AWAKENING_MEMBER: inm->in6m_state = MLD_REPORTING_MEMBER; retval = mld_v2_enqueue_group_record(&mli->mli_gq, inm, 0, 0, 0, 0); CTR2(KTR_MLD, "%s: enqueue record = %d", __func__, retval); break; case MLD_G_QUERY_PENDING_MEMBER: case MLD_SG_QUERY_PENDING_MEMBER: case MLD_LEAVING_MEMBER: break; } } IF_ADDR_RUNLOCK(ifp); send: mld_dispatch_queue(&mli->mli_gq, MLD_MAX_RESPONSE_BURST); /* * Slew transmission of bursts over 500ms intervals. */ if (mbufq_first(&mli->mli_gq) != NULL) { mli->mli_v2_timer = 1 + MLD_RANDOM_DELAY( MLD_RESPONSE_BURST_INTERVAL); V_interface_timers_running6 = 1; } } /* * Transmit the next pending message in the output queue. * * VIMAGE: Needs to store/restore vnet pointer on a per-mbuf-chain basis. * MRT: Nothing needs to be done, as MLD traffic is always local to * a link and uses a link-scope multicast address. */ static void mld_dispatch_packet(struct mbuf *m) { struct ip6_moptions im6o; struct ifnet *ifp; struct ifnet *oifp; struct mbuf *m0; struct mbuf *md; struct ip6_hdr *ip6; struct mld_hdr *mld; int error; int off; int type; uint32_t ifindex; CTR2(KTR_MLD, "%s: transmit %p", __func__, m); /* * Set VNET image pointer from enqueued mbuf chain * before doing anything else. Whilst we use interface * indexes to guard against interface detach, they are * unique to each VIMAGE and must be retrieved. 
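 *
 * (mld_save_context() stashed the ifnet's if_index in the mbuf packet
 * header when the report was enqueued; mld_restore_context() recovers
 * it here, and if that interface has since been destroyed the
 * ifnet_byindex() lookup below returns NULL and the chain is freed.)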
*/ ifindex = mld_restore_context(m); /* * Check if the ifnet still exists. This limits the scope of * any race in the absence of a global ifp lock for low cost * (an array lookup). */ ifp = ifnet_byindex(ifindex); if (ifp == NULL) { CTR3(KTR_MLD, "%s: dropped %p as ifindex %u went away.", __func__, m, ifindex); m_freem(m); IP6STAT_INC(ip6s_noroute); goto out; } im6o.im6o_multicast_hlim = 1; im6o.im6o_multicast_loop = (V_ip6_mrouter != NULL); im6o.im6o_multicast_ifp = ifp; if (m->m_flags & M_MLDV1) { m0 = m; } else { m0 = mld_v2_encap_report(ifp, m); if (m0 == NULL) { CTR2(KTR_MLD, "%s: dropped %p", __func__, m); IP6STAT_INC(ip6s_odropped); goto out; } } mld_scrub_context(m0); m_clrprotoflags(m); m0->m_pkthdr.rcvif = V_loif; ip6 = mtod(m0, struct ip6_hdr *); #if 0 (void)in6_setscope(&ip6->ip6_dst, ifp, NULL); /* XXX LOR */ #else /* * XXX XXX Break some KPI rules to prevent an LOR which would * occur if we called in6_setscope() at transmission. * See comments at top of file. */ MLD_EMBEDSCOPE(&ip6->ip6_dst, ifp->if_index); #endif /* * Retrieve the ICMPv6 type before handoff to ip6_output(), * so we can bump the stats. */ md = m_getptr(m0, sizeof(struct ip6_hdr), &off); mld = (struct mld_hdr *)(mtod(md, uint8_t *) + off); type = mld->mld_type; error = ip6_output(m0, &mld_po, NULL, IPV6_UNSPECSRC, &im6o, &oifp, NULL); if (error) { CTR3(KTR_MLD, "%s: ip6_output(%p) = %d", __func__, m0, error); goto out; } ICMP6STAT_INC(icp6s_outhist[type]); if (oifp != NULL) { icmp6_ifstat_inc(oifp, ifs6_out_msg); switch (type) { case MLD_LISTENER_REPORT: case MLDV2_LISTENER_REPORT: icmp6_ifstat_inc(oifp, ifs6_out_mldreport); break; case MLD_LISTENER_DONE: icmp6_ifstat_inc(oifp, ifs6_out_mlddone); break; } } out: return; } /* * Encapsulate an MLDv2 report. * * KAME IPv6 requires that hop-by-hop options be passed separately, * and that the IPv6 header be prepended in a separate mbuf. * * Returns a pointer to the new mbuf chain head, or NULL if the * allocation failed. */ static struct mbuf * mld_v2_encap_report(struct ifnet *ifp, struct mbuf *m) { struct mbuf *mh; struct mldv2_report *mld; struct ip6_hdr *ip6; struct in6_ifaddr *ia; int mldreclen; KASSERT(ifp != NULL, ("%s: null ifp", __func__)); KASSERT((m->m_flags & M_PKTHDR), ("%s: mbuf chain %p is !M_PKTHDR", __func__, m)); /* * RFC3590: OK to send as :: or tentative during DAD. */ ia = in6ifa_ifpforlinklocal(ifp, IN6_IFF_NOTREADY|IN6_IFF_ANYCAST); if (ia == NULL) CTR1(KTR_MLD, "%s: warning: ia is NULL", __func__); mh = m_gethdr(M_NOWAIT, MT_DATA); if (mh == NULL) { if (ia != NULL) ifa_free(&ia->ia_ifa); m_freem(m); return (NULL); } M_ALIGN(mh, sizeof(struct ip6_hdr) + sizeof(struct mldv2_report)); mldreclen = m_length(m, NULL); CTR2(KTR_MLD, "%s: mldreclen is %d", __func__, mldreclen); mh->m_len = sizeof(struct ip6_hdr) + sizeof(struct mldv2_report); mh->m_pkthdr.len = sizeof(struct ip6_hdr) + sizeof(struct mldv2_report) + mldreclen; ip6 = mtod(mh, struct ip6_hdr *); ip6->ip6_flow = 0; ip6->ip6_vfc &= ~IPV6_VERSION_MASK; ip6->ip6_vfc |= IPV6_VERSION; ip6->ip6_nxt = IPPROTO_ICMPV6; ip6->ip6_src = ia ? 
ia->ia_addr.sin6_addr : in6addr_any; if (ia != NULL) ifa_free(&ia->ia_ifa); ip6->ip6_dst = in6addr_linklocal_allv2routers; /* scope ID will be set in netisr */ mld = (struct mldv2_report *)(ip6 + 1); mld->mld_type = MLDV2_LISTENER_REPORT; mld->mld_code = 0; mld->mld_cksum = 0; mld->mld_v2_reserved = 0; mld->mld_v2_numrecs = htons(m->m_pkthdr.PH_vt.vt_nrecs); m->m_pkthdr.PH_vt.vt_nrecs = 0; mh->m_next = m; mld->mld_cksum = in6_cksum(mh, IPPROTO_ICMPV6, sizeof(struct ip6_hdr), sizeof(struct mldv2_report) + mldreclen); return (mh); } #ifdef KTR static char * mld_rec_type_to_str(const int type) { switch (type) { case MLD_CHANGE_TO_EXCLUDE_MODE: return "TO_EX"; break; case MLD_CHANGE_TO_INCLUDE_MODE: return "TO_IN"; break; case MLD_MODE_IS_EXCLUDE: return "MODE_EX"; break; case MLD_MODE_IS_INCLUDE: return "MODE_IN"; break; case MLD_ALLOW_NEW_SOURCES: return "ALLOW_NEW"; break; case MLD_BLOCK_OLD_SOURCES: return "BLOCK_OLD"; break; default: break; } return "unknown"; } #endif static void mld_init(void *unused __unused) { CTR1(KTR_MLD, "%s: initializing", __func__); MLD_LOCK_INIT(); ip6_initpktopts(&mld_po); mld_po.ip6po_hlim = 1; mld_po.ip6po_hbh = &mld_ra.hbh; mld_po.ip6po_prefer_tempaddr = IP6PO_TEMPADDR_NOTPREFER; mld_po.ip6po_flags = IP6PO_DONTFRAG; } SYSINIT(mld_init, SI_SUB_PROTO_MC, SI_ORDER_MIDDLE, mld_init, NULL); static void mld_uninit(void *unused __unused) { CTR1(KTR_MLD, "%s: tearing down", __func__); MLD_LOCK_DESTROY(); } SYSUNINIT(mld_uninit, SI_SUB_PROTO_MC, SI_ORDER_MIDDLE, mld_uninit, NULL); static void vnet_mld_init(const void *unused __unused) { CTR1(KTR_MLD, "%s: initializing", __func__); LIST_INIT(&V_mli_head); } VNET_SYSINIT(vnet_mld_init, SI_SUB_PROTO_MC, SI_ORDER_ANY, vnet_mld_init, NULL); static void vnet_mld_uninit(const void *unused __unused) { /* This can happen if we shutdown the network stack. */ CTR1(KTR_MLD, "%s: tearing down", __func__); } VNET_SYSUNINIT(vnet_mld_uninit, SI_SUB_PROTO_MC, SI_ORDER_ANY, vnet_mld_uninit, NULL); static int mld_modevent(module_t mod, int type, void *unused __unused) { switch (type) { case MOD_LOAD: case MOD_UNLOAD: break; default: return (EOPNOTSUPP); } return (0); } static moduledata_t mld_mod = { "mld", mld_modevent, 0 }; DECLARE_MODULE(mld, mld_mod, SI_SUB_PROTO_MC, SI_ORDER_ANY);