Index: head/sys/net/if_clone.c =================================================================== --- head/sys/net/if_clone.c (revision 333863) +++ head/sys/net/if_clone.c (revision 333864) @@ -1,756 +1,756 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2012 Gleb Smirnoff * Copyright (c) 1980, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)if.c 8.5 (Berkeley) 1/9/95 * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* Current IF_MAXUNIT expands maximum to 5 characters. */ #define IFCLOSIZ (IFNAMSIZ - 5) /* * Structure describing a `cloning' interface. * * List of locks * (c) const until freeing * (d) driver specific data, may need external protection. * (e) locked by if_cloners_mtx * (i) locked by ifc_mtx mtx */ struct if_clone { char ifc_name[IFCLOSIZ]; /* (c) Name of device, e.g. `gif' */ struct unrhdr *ifc_unrhdr; /* (c) alloc_unr(9) header */ int ifc_maxunit; /* (c) maximum unit number */ int ifc_flags; long ifc_refcnt; /* (i) Reference count. */ LIST_HEAD(, ifnet) ifc_iflist; /* (i) List of cloned interfaces */ struct mtx ifc_mtx; /* Mutex to protect members. */ enum { SIMPLE, ADVANCED } ifc_type; /* (c) */ /* (c) Driver specific cloning functions. Called with no locks held. */ union { struct { /* advanced cloner */ ifc_match_t *_ifc_match; ifc_create_t *_ifc_create; ifc_destroy_t *_ifc_destroy; } A; struct { /* simple cloner */ ifcs_create_t *_ifcs_create; ifcs_destroy_t *_ifcs_destroy; int _ifcs_minifs; /* minimum ifs */ } S; } U; #define ifc_match U.A._ifc_match #define ifc_create U.A._ifc_create #define ifc_destroy U.A._ifc_destroy #define ifcs_create U.S._ifcs_create #define ifcs_destroy U.S._ifcs_destroy #define ifcs_minifs U.S._ifcs_minifs LIST_ENTRY(if_clone) ifc_list; /* (e) On list of cloners */ }; static void if_clone_free(struct if_clone *ifc); static int if_clone_createif(struct if_clone *ifc, char *name, size_t len, caddr_t params); static int ifc_simple_match(struct if_clone *, const char *); static int ifc_simple_create(struct if_clone *, char *, size_t, caddr_t); static int ifc_simple_destroy(struct if_clone *, struct ifnet *); static struct mtx if_cloners_mtx; MTX_SYSINIT(if_cloners_lock, &if_cloners_mtx, "if_cloners lock", MTX_DEF); static VNET_DEFINE(int, if_cloners_count); VNET_DEFINE(LIST_HEAD(, if_clone), if_cloners); #define V_if_cloners_count VNET(if_cloners_count) #define V_if_cloners VNET(if_cloners) #define IF_CLONERS_LOCK_ASSERT() mtx_assert(&if_cloners_mtx, MA_OWNED) #define IF_CLONERS_LOCK() mtx_lock(&if_cloners_mtx) #define IF_CLONERS_UNLOCK() mtx_unlock(&if_cloners_mtx) #define IF_CLONE_LOCK_INIT(ifc) \ mtx_init(&(ifc)->ifc_mtx, "if_clone lock", NULL, MTX_DEF) #define IF_CLONE_LOCK_DESTROY(ifc) mtx_destroy(&(ifc)->ifc_mtx) #define IF_CLONE_LOCK_ASSERT(ifc) mtx_assert(&(ifc)->ifc_mtx, MA_OWNED) #define IF_CLONE_LOCK(ifc) mtx_lock(&(ifc)->ifc_mtx) #define IF_CLONE_UNLOCK(ifc) mtx_unlock(&(ifc)->ifc_mtx) #define IF_CLONE_ADDREF(ifc) \ do { \ IF_CLONE_LOCK(ifc); \ IF_CLONE_ADDREF_LOCKED(ifc); \ IF_CLONE_UNLOCK(ifc); \ } while (0) #define IF_CLONE_ADDREF_LOCKED(ifc) \ do { \ IF_CLONE_LOCK_ASSERT(ifc); \ KASSERT((ifc)->ifc_refcnt >= 0, \ ("negative refcnt %ld", (ifc)->ifc_refcnt)); \ (ifc)->ifc_refcnt++; \ } while (0) #define IF_CLONE_REMREF(ifc) \ do { \ IF_CLONE_LOCK(ifc); \ IF_CLONE_REMREF_LOCKED(ifc); \ } while (0) #define IF_CLONE_REMREF_LOCKED(ifc) \ do { \ IF_CLONE_LOCK_ASSERT(ifc); \ KASSERT((ifc)->ifc_refcnt > 0, \ ("bogus refcnt %ld", (ifc)->ifc_refcnt)); \ if (--(ifc)->ifc_refcnt == 0) { \ IF_CLONE_UNLOCK(ifc); \ if_clone_free(ifc); \ } else { \ /* silently free the lock */ \ IF_CLONE_UNLOCK(ifc); \ } \ } while (0) #define IFC_IFLIST_INSERT(_ifc, _ifp) \ LIST_INSERT_HEAD(&_ifc->ifc_iflist, _ifp, if_clones) #define IFC_IFLIST_REMOVE(_ifc, _ifp) \ LIST_REMOVE(_ifp, if_clones) static MALLOC_DEFINE(M_CLONE, "clone", "interface cloning framework"); void vnet_if_clone_init(void) { LIST_INIT(&V_if_cloners); } /* * Lookup and create a clone network interface. */ int if_clone_create(char *name, size_t len, caddr_t params) { struct if_clone *ifc; /* Try to find an applicable cloner for this request */ IF_CLONERS_LOCK(); LIST_FOREACH(ifc, &V_if_cloners, ifc_list) if (ifc->ifc_type == SIMPLE) { if (ifc_simple_match(ifc, name)) break; } else { if (ifc->ifc_match(ifc, name)) break; } #ifdef VIMAGE if (ifc == NULL && !IS_DEFAULT_VNET(curvnet)) { CURVNET_SET_QUIET(vnet0); LIST_FOREACH(ifc, &V_if_cloners, ifc_list) if (ifc->ifc_type == SIMPLE) { if (ifc_simple_match(ifc, name)) break; } else { if (ifc->ifc_match(ifc, name)) break; } CURVNET_RESTORE(); } #endif IF_CLONERS_UNLOCK(); if (ifc == NULL) return (EINVAL); return (if_clone_createif(ifc, name, len, params)); } /* * Create a clone network interface. */ static int if_clone_createif(struct if_clone *ifc, char *name, size_t len, caddr_t params) { int err; struct ifnet *ifp; if (ifunit(name) != NULL) return (EEXIST); if (ifc->ifc_type == SIMPLE) err = ifc_simple_create(ifc, name, len, params); else err = (*ifc->ifc_create)(ifc, name, len, params); if (!err) { ifp = ifunit(name); if (ifp == NULL) panic("%s: lookup failed for %s", __func__, name); if ((ifc->ifc_flags & IFC_NOGROUP) == 0) if_addgroup(ifp, ifc->ifc_name); IF_CLONE_LOCK(ifc); IFC_IFLIST_INSERT(ifc, ifp); IF_CLONE_UNLOCK(ifc); } return (err); } /* * Lookup and destroy a clone network interface. */ int if_clone_destroy(const char *name) { int err; struct if_clone *ifc; struct ifnet *ifp; ifp = ifunit_ref(name); if (ifp == NULL) return (ENXIO); /* Find the cloner for this interface */ IF_CLONERS_LOCK(); LIST_FOREACH(ifc, &V_if_cloners, ifc_list) { if (strcmp(ifc->ifc_name, ifp->if_dname) == 0) { break; } } #ifdef VIMAGE if (ifc == NULL && !IS_DEFAULT_VNET(curvnet)) { CURVNET_SET_QUIET(vnet0); LIST_FOREACH(ifc, &V_if_cloners, ifc_list) if (ifc->ifc_type == SIMPLE) { if (ifc_simple_match(ifc, name)) break; } else { if (ifc->ifc_match(ifc, name)) break; } CURVNET_RESTORE(); } #endif IF_CLONERS_UNLOCK(); if (ifc == NULL) { if_rele(ifp); return (EINVAL); } err = if_clone_destroyif(ifc, ifp); if_rele(ifp); return err; } /* * Destroy a clone network interface. */ int if_clone_destroyif(struct if_clone *ifc, struct ifnet *ifp) { int err; struct ifnet *ifcifp; if (ifc->ifc_type == ADVANCED && ifc->ifc_destroy == NULL) return(EOPNOTSUPP); /* * Given that the cloned ifnet might be attached to a different * vnet from where its cloner was registered, we have to * switch to the vnet context of the target vnet. */ CURVNET_SET_QUIET(ifp->if_vnet); IF_CLONE_LOCK(ifc); LIST_FOREACH(ifcifp, &ifc->ifc_iflist, if_clones) { if (ifcifp == ifp) { IFC_IFLIST_REMOVE(ifc, ifp); break; } } IF_CLONE_UNLOCK(ifc); if (ifcifp == NULL) { CURVNET_RESTORE(); return (ENXIO); /* ifp is not on the list. */ } if ((ifc->ifc_flags & IFC_NOGROUP) == 0) if_delgroup(ifp, ifc->ifc_name); if (ifc->ifc_type == SIMPLE) err = ifc_simple_destroy(ifc, ifp); else err = (*ifc->ifc_destroy)(ifc, ifp); if (err != 0) { if ((ifc->ifc_flags & IFC_NOGROUP) == 0) if_addgroup(ifp, ifc->ifc_name); IF_CLONE_LOCK(ifc); IFC_IFLIST_INSERT(ifc, ifp); IF_CLONE_UNLOCK(ifc); } CURVNET_RESTORE(); return (err); } static struct if_clone * if_clone_alloc(const char *name, int maxunit) { struct if_clone *ifc; KASSERT(name != NULL, ("%s: no name\n", __func__)); ifc = malloc(sizeof(struct if_clone), M_CLONE, M_WAITOK | M_ZERO); strncpy(ifc->ifc_name, name, IFCLOSIZ-1); IF_CLONE_LOCK_INIT(ifc); IF_CLONE_ADDREF(ifc); ifc->ifc_maxunit = maxunit ? maxunit : IF_MAXUNIT; ifc->ifc_unrhdr = new_unrhdr(0, ifc->ifc_maxunit, &ifc->ifc_mtx); LIST_INIT(&ifc->ifc_iflist); return (ifc); } static int if_clone_attach(struct if_clone *ifc) { struct if_clone *ifc1; IF_CLONERS_LOCK(); LIST_FOREACH(ifc1, &V_if_cloners, ifc_list) if (strcmp(ifc->ifc_name, ifc1->ifc_name) == 0) { IF_CLONERS_UNLOCK(); IF_CLONE_REMREF(ifc); return (EEXIST); } LIST_INSERT_HEAD(&V_if_cloners, ifc, ifc_list); V_if_cloners_count++; IF_CLONERS_UNLOCK(); return (0); } struct if_clone * if_clone_advanced(const char *name, u_int maxunit, ifc_match_t match, ifc_create_t create, ifc_destroy_t destroy) { struct if_clone *ifc; ifc = if_clone_alloc(name, maxunit); ifc->ifc_type = ADVANCED; ifc->ifc_match = match; ifc->ifc_create = create; ifc->ifc_destroy = destroy; if (if_clone_attach(ifc) != 0) return (NULL); EVENTHANDLER_INVOKE(if_clone_event, ifc); return (ifc); } struct if_clone * if_clone_simple(const char *name, ifcs_create_t create, ifcs_destroy_t destroy, u_int minifs) { struct if_clone *ifc; u_int unit; ifc = if_clone_alloc(name, 0); ifc->ifc_type = SIMPLE; ifc->ifcs_create = create; ifc->ifcs_destroy = destroy; ifc->ifcs_minifs = minifs; if (if_clone_attach(ifc) != 0) return (NULL); for (unit = 0; unit < minifs; unit++) { char name[IFNAMSIZ]; - int error; + int error __unused; snprintf(name, IFNAMSIZ, "%s%d", ifc->ifc_name, unit); error = if_clone_createif(ifc, name, IFNAMSIZ, NULL); KASSERT(error == 0, ("%s: failed to create required interface %s", __func__, name)); } EVENTHANDLER_INVOKE(if_clone_event, ifc); return (ifc); } /* * Unregister a network interface cloner. */ void if_clone_detach(struct if_clone *ifc) { IF_CLONERS_LOCK(); LIST_REMOVE(ifc, ifc_list); V_if_cloners_count--; IF_CLONERS_UNLOCK(); /* Allow all simples to be destroyed */ if (ifc->ifc_type == SIMPLE) ifc->ifcs_minifs = 0; /* destroy all interfaces for this cloner */ while (!LIST_EMPTY(&ifc->ifc_iflist)) if_clone_destroyif(ifc, LIST_FIRST(&ifc->ifc_iflist)); IF_CLONE_REMREF(ifc); } static void if_clone_free(struct if_clone *ifc) { KASSERT(LIST_EMPTY(&ifc->ifc_iflist), ("%s: ifc_iflist not empty", __func__)); IF_CLONE_LOCK_DESTROY(ifc); delete_unrhdr(ifc->ifc_unrhdr); free(ifc, M_CLONE); } /* * Provide list of interface cloners to userspace. */ int if_clone_list(struct if_clonereq *ifcr) { char *buf, *dst, *outbuf = NULL; struct if_clone *ifc; int buf_count, count, err = 0; if (ifcr->ifcr_count < 0) return (EINVAL); IF_CLONERS_LOCK(); /* * Set our internal output buffer size. We could end up not * reporting a cloner that is added between the unlock and lock * below, but that's not a major problem. Not caping our * allocation to the number of cloners actually in the system * could be because that would let arbitrary users cause us to * allocate arbitrary amounts of kernel memory. */ buf_count = (V_if_cloners_count < ifcr->ifcr_count) ? V_if_cloners_count : ifcr->ifcr_count; IF_CLONERS_UNLOCK(); outbuf = malloc(IFNAMSIZ*buf_count, M_CLONE, M_WAITOK | M_ZERO); IF_CLONERS_LOCK(); ifcr->ifcr_total = V_if_cloners_count; if ((dst = ifcr->ifcr_buffer) == NULL) { /* Just asking how many there are. */ goto done; } count = (V_if_cloners_count < buf_count) ? V_if_cloners_count : buf_count; for (ifc = LIST_FIRST(&V_if_cloners), buf = outbuf; ifc != NULL && count != 0; ifc = LIST_NEXT(ifc, ifc_list), count--, buf += IFNAMSIZ) { strlcpy(buf, ifc->ifc_name, IFNAMSIZ); } done: IF_CLONERS_UNLOCK(); if (err == 0 && dst != NULL) err = copyout(outbuf, dst, buf_count*IFNAMSIZ); if (outbuf != NULL) free(outbuf, M_CLONE); return (err); } /* * if_clone_findifc() looks up ifnet from the current * cloner list, and returns ifc if found. Note that ifc_refcnt * is incremented. */ struct if_clone * if_clone_findifc(struct ifnet *ifp) { struct if_clone *ifc, *ifc0; struct ifnet *ifcifp; ifc0 = NULL; IF_CLONERS_LOCK(); LIST_FOREACH(ifc, &V_if_cloners, ifc_list) { IF_CLONE_LOCK(ifc); LIST_FOREACH(ifcifp, &ifc->ifc_iflist, if_clones) { if (ifp == ifcifp) { ifc0 = ifc; IF_CLONE_ADDREF_LOCKED(ifc); break; } } IF_CLONE_UNLOCK(ifc); if (ifc0 != NULL) break; } IF_CLONERS_UNLOCK(); return (ifc0); } /* * if_clone_addgroup() decrements ifc_refcnt because it is called after * if_clone_findifc(). */ void if_clone_addgroup(struct ifnet *ifp, struct if_clone *ifc) { if ((ifc->ifc_flags & IFC_NOGROUP) == 0) { if_addgroup(ifp, ifc->ifc_name); IF_CLONE_REMREF(ifc); } } /* * A utility function to extract unit numbers from interface names of * the form name###. * * Returns 0 on success and an error on failure. */ int ifc_name2unit(const char *name, int *unit) { const char *cp; int cutoff = INT_MAX / 10; int cutlim = INT_MAX % 10; for (cp = name; *cp != '\0' && (*cp < '0' || *cp > '9'); cp++); if (*cp == '\0') { *unit = -1; } else if (cp[0] == '0' && cp[1] != '\0') { /* Disallow leading zeroes. */ return (EINVAL); } else { for (*unit = 0; *cp != '\0'; cp++) { if (*cp < '0' || *cp > '9') { /* Bogus unit number. */ return (EINVAL); } if (*unit > cutoff || (*unit == cutoff && *cp - '0' > cutlim)) return (EINVAL); *unit = (*unit * 10) + (*cp - '0'); } } return (0); } static int ifc_alloc_unit_specific(struct if_clone *ifc, int *unit) { char name[IFNAMSIZ]; if (*unit > ifc->ifc_maxunit) return (ENOSPC); if (alloc_unr_specific(ifc->ifc_unrhdr, *unit) == -1) return (EEXIST); snprintf(name, IFNAMSIZ, "%s%d", ifc->ifc_name, *unit); if (ifunit(name) != NULL) { free_unr(ifc->ifc_unrhdr, *unit); return (EEXIST); } IF_CLONE_ADDREF(ifc); return (0); } static int ifc_alloc_unit_next(struct if_clone *ifc, int *unit) { int error; *unit = alloc_unr(ifc->ifc_unrhdr); if (*unit == -1) return (ENOSPC); free_unr(ifc->ifc_unrhdr, *unit); for (;;) { error = ifc_alloc_unit_specific(ifc, unit); if (error != EEXIST) break; (*unit)++; } return (error); } int ifc_alloc_unit(struct if_clone *ifc, int *unit) { if (*unit < 0) return (ifc_alloc_unit_next(ifc, unit)); else return (ifc_alloc_unit_specific(ifc, unit)); } void ifc_free_unit(struct if_clone *ifc, int unit) { free_unr(ifc->ifc_unrhdr, unit); IF_CLONE_REMREF(ifc); } static int ifc_simple_match(struct if_clone *ifc, const char *name) { const char *cp; int i; /* Match the name */ for (cp = name, i = 0; i < strlen(ifc->ifc_name); i++, cp++) { if (ifc->ifc_name[i] != *cp) return (0); } /* Make sure there's a unit number or nothing after the name */ for (; *cp != '\0'; cp++) { if (*cp < '0' || *cp > '9') return (0); } return (1); } static int ifc_simple_create(struct if_clone *ifc, char *name, size_t len, caddr_t params) { char *dp; int wildcard; int unit; int err; err = ifc_name2unit(name, &unit); if (err != 0) return (err); wildcard = (unit < 0); err = ifc_alloc_unit(ifc, &unit); if (err != 0) return (err); err = ifc->ifcs_create(ifc, unit, params); if (err != 0) { ifc_free_unit(ifc, unit); return (err); } /* In the wildcard case, we need to update the name. */ if (wildcard) { for (dp = name; *dp != '\0'; dp++); if (snprintf(dp, len - (dp-name), "%d", unit) > len - (dp-name) - 1) { /* * This can only be a programmer error and * there's no straightforward way to recover if * it happens. */ panic("if_clone_create(): interface name too long"); } } return (0); } static int ifc_simple_destroy(struct if_clone *ifc, struct ifnet *ifp) { int unit; unit = ifp->if_dunit; if (unit < ifc->ifcs_minifs) return (EINVAL); ifc->ifcs_destroy(ifp); ifc_free_unit(ifc, unit); return (0); } const char * ifc_name(struct if_clone *ifc) { return (ifc->ifc_name); } void ifc_flags_set(struct if_clone *ifc, int flags) { ifc->ifc_flags = flags; } int ifc_flags_get(struct if_clone *ifc) { return (ifc->ifc_flags); } Index: head/sys/net/if_epair.c =================================================================== --- head/sys/net/if_epair.c (revision 333863) +++ head/sys/net/if_epair.c (revision 333864) @@ -1,1028 +1,1028 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2008 The FreeBSD Foundation * Copyright (c) 2009-2010 Bjoern A. Zeeb * All rights reserved. * * This software was developed by CK Software GmbH under sponsorship * from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * A pair of virtual back-to-back connected ethernet like interfaces * (``two interfaces with a virtual cross-over cable''). * * This is mostly intended to be used to provide connectivity between * different virtual network stack instances. */ /* * Things to re-think once we have more experience: * - ifp->if_reassign function once we can test with vimage. Depending on * how if_vmove() is going to be improved. * - Real random etheraddrs that are checked to be uniquish; we would need * to re-do them in case we move the interface between network stacks * in a private if_reassign function. * In case we bridge to a real interface/network or between indepedent * epairs on multiple stacks/machines, we may need this. * For now let the user handle that case. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include SYSCTL_DECL(_net_link); static SYSCTL_NODE(_net_link, OID_AUTO, epair, CTLFLAG_RW, 0, "epair sysctl"); #ifdef EPAIR_DEBUG static int epair_debug = 0; SYSCTL_INT(_net_link_epair, OID_AUTO, epair_debug, CTLFLAG_RW, &epair_debug, 0, "if_epair(4) debugging."); #define DPRINTF(fmt, arg...) \ if (epair_debug) \ printf("[%s:%d] " fmt, __func__, __LINE__, ##arg) #else #define DPRINTF(fmt, arg...) #endif static void epair_nh_sintr(struct mbuf *); static struct mbuf *epair_nh_m2cpuid(struct mbuf *, uintptr_t, u_int *); static void epair_nh_drainedcpu(u_int); static void epair_start_locked(struct ifnet *); static int epair_media_change(struct ifnet *); static void epair_media_status(struct ifnet *, struct ifmediareq *); static int epair_clone_match(struct if_clone *, const char *); static int epair_clone_create(struct if_clone *, char *, size_t, caddr_t); static int epair_clone_destroy(struct if_clone *, struct ifnet *); static const char epairname[] = "epair"; /* Netisr related definitions and sysctl. */ static struct netisr_handler epair_nh = { .nh_name = epairname, .nh_proto = NETISR_EPAIR, .nh_policy = NETISR_POLICY_CPU, .nh_handler = epair_nh_sintr, .nh_m2cpuid = epair_nh_m2cpuid, .nh_drainedcpu = epair_nh_drainedcpu, }; static int sysctl_epair_netisr_maxqlen(SYSCTL_HANDLER_ARGS) { int error, qlimit; netisr_getqlimit(&epair_nh, &qlimit); error = sysctl_handle_int(oidp, &qlimit, 0, req); if (error || !req->newptr) return (error); if (qlimit < 1) return (EINVAL); return (netisr_setqlimit(&epair_nh, qlimit)); } SYSCTL_PROC(_net_link_epair, OID_AUTO, netisr_maxqlen, CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_epair_netisr_maxqlen, "I", "Maximum if_epair(4) netisr \"hw\" queue length"); struct epair_softc { struct ifnet *ifp; /* This ifp. */ struct ifnet *oifp; /* other ifp of pair. */ struct ifmedia media; /* Media config (fake). */ u_int refcount; /* # of mbufs in flight. */ u_int cpuid; /* CPU ID assigned upon creation. */ void (*if_qflush)(struct ifnet *); /* Original if_qflush routine. */ }; /* * Per-CPU list of ifps with data in the ifq that needs to be flushed * to the netisr ``hw'' queue before we allow any further direct queuing * to the ``hw'' queue. */ struct epair_ifp_drain { STAILQ_ENTRY(epair_ifp_drain) ifp_next; struct ifnet *ifp; }; STAILQ_HEAD(eid_list, epair_ifp_drain); #define EPAIR_LOCK_INIT(dpcpu) mtx_init(&(dpcpu)->if_epair_mtx, \ "if_epair", NULL, MTX_DEF) #define EPAIR_LOCK_DESTROY(dpcpu) mtx_destroy(&(dpcpu)->if_epair_mtx) #define EPAIR_LOCK_ASSERT(dpcpu) mtx_assert(&(dpcpu)->if_epair_mtx, \ MA_OWNED) #define EPAIR_LOCK(dpcpu) mtx_lock(&(dpcpu)->if_epair_mtx) #define EPAIR_UNLOCK(dpcpu) mtx_unlock(&(dpcpu)->if_epair_mtx) #ifdef INVARIANTS #define EPAIR_REFCOUNT_INIT(r, v) refcount_init((r), (v)) #define EPAIR_REFCOUNT_AQUIRE(r) refcount_acquire((r)) #define EPAIR_REFCOUNT_RELEASE(r) refcount_release((r)) #define EPAIR_REFCOUNT_ASSERT(a, p) KASSERT(a, p) #else #define EPAIR_REFCOUNT_INIT(r, v) #define EPAIR_REFCOUNT_AQUIRE(r) #define EPAIR_REFCOUNT_RELEASE(r) #define EPAIR_REFCOUNT_ASSERT(a, p) #endif static MALLOC_DEFINE(M_EPAIR, epairname, "Pair of virtual cross-over connected Ethernet-like interfaces"); static VNET_DEFINE(struct if_clone *, epair_cloner); #define V_epair_cloner VNET(epair_cloner) /* * DPCPU area and functions. */ struct epair_dpcpu { struct mtx if_epair_mtx; /* Per-CPU locking. */ int epair_drv_flags; /* Per-CPU ``hw'' drv flags. */ struct eid_list epair_ifp_drain_list; /* Per-CPU list of ifps with * data in the ifq. */ }; DPCPU_DEFINE(struct epair_dpcpu, epair_dpcpu); static void epair_dpcpu_init(void) { struct epair_dpcpu *epair_dpcpu; struct eid_list *s; u_int cpuid; CPU_FOREACH(cpuid) { epair_dpcpu = DPCPU_ID_PTR(cpuid, epair_dpcpu); /* Initialize per-cpu lock. */ EPAIR_LOCK_INIT(epair_dpcpu); /* Driver flags are per-cpu as are our netisr "hw" queues. */ epair_dpcpu->epair_drv_flags = 0; /* * Initialize per-cpu drain list. * Manually do what STAILQ_HEAD_INITIALIZER would do. */ s = &epair_dpcpu->epair_ifp_drain_list; s->stqh_first = NULL; s->stqh_last = &s->stqh_first; } } static void epair_dpcpu_detach(void) { struct epair_dpcpu *epair_dpcpu; u_int cpuid; CPU_FOREACH(cpuid) { epair_dpcpu = DPCPU_ID_PTR(cpuid, epair_dpcpu); /* Destroy per-cpu lock. */ EPAIR_LOCK_DESTROY(epair_dpcpu); } } /* * Helper functions. */ static u_int cpuid_from_ifp(struct ifnet *ifp) { struct epair_softc *sc; if (ifp == NULL) return (0); sc = ifp->if_softc; return (sc->cpuid); } /* * Netisr handler functions. */ static void epair_nh_sintr(struct mbuf *m) { struct ifnet *ifp; - struct epair_softc *sc; + struct epair_softc *sc __unused; ifp = m->m_pkthdr.rcvif; (*ifp->if_input)(ifp, m); sc = ifp->if_softc; EPAIR_REFCOUNT_RELEASE(&sc->refcount); EPAIR_REFCOUNT_ASSERT((int)sc->refcount >= 1, ("%s: ifp=%p sc->refcount not >= 1: %d", __func__, ifp, sc->refcount)); DPRINTF("ifp=%p refcount=%u\n", ifp, sc->refcount); } static struct mbuf * epair_nh_m2cpuid(struct mbuf *m, uintptr_t source, u_int *cpuid) { *cpuid = cpuid_from_ifp(m->m_pkthdr.rcvif); return (m); } static void epair_nh_drainedcpu(u_int cpuid) { struct epair_dpcpu *epair_dpcpu; struct epair_ifp_drain *elm, *tvar; struct ifnet *ifp; epair_dpcpu = DPCPU_ID_PTR(cpuid, epair_dpcpu); EPAIR_LOCK(epair_dpcpu); /* * Assume our "hw" queue and possibly ifq will be emptied * again. In case we will overflow the "hw" queue while * draining, epair_start_locked will set IFF_DRV_OACTIVE * again and we will stop and return. */ STAILQ_FOREACH_SAFE(elm, &epair_dpcpu->epair_ifp_drain_list, ifp_next, tvar) { ifp = elm->ifp; epair_dpcpu->epair_drv_flags &= ~IFF_DRV_OACTIVE; ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; epair_start_locked(ifp); IFQ_LOCK(&ifp->if_snd); if (IFQ_IS_EMPTY(&ifp->if_snd)) { - struct epair_softc *sc; + struct epair_softc *sc __unused; STAILQ_REMOVE(&epair_dpcpu->epair_ifp_drain_list, elm, epair_ifp_drain, ifp_next); /* The cached ifp goes off the list. */ sc = ifp->if_softc; EPAIR_REFCOUNT_RELEASE(&sc->refcount); EPAIR_REFCOUNT_ASSERT((int)sc->refcount >= 1, ("%s: ifp=%p sc->refcount not >= 1: %d", __func__, ifp, sc->refcount)); free(elm, M_EPAIR); } IFQ_UNLOCK(&ifp->if_snd); if ((ifp->if_drv_flags & IFF_DRV_OACTIVE) != 0) { /* Our "hw"q overflew again. */ epair_dpcpu->epair_drv_flags |= IFF_DRV_OACTIVE; DPRINTF("hw queue length overflow at %u\n", epair_nh.nh_qlimit); break; } } EPAIR_UNLOCK(epair_dpcpu); } /* * Network interface (`if') related functions. */ static void epair_remove_ifp_from_draining(struct ifnet *ifp) { struct epair_dpcpu *epair_dpcpu; struct epair_ifp_drain *elm, *tvar; u_int cpuid; CPU_FOREACH(cpuid) { epair_dpcpu = DPCPU_ID_PTR(cpuid, epair_dpcpu); EPAIR_LOCK(epair_dpcpu); STAILQ_FOREACH_SAFE(elm, &epair_dpcpu->epair_ifp_drain_list, ifp_next, tvar) { if (ifp == elm->ifp) { - struct epair_softc *sc; + struct epair_softc *sc __unused; STAILQ_REMOVE( &epair_dpcpu->epair_ifp_drain_list, elm, epair_ifp_drain, ifp_next); /* The cached ifp goes off the list. */ sc = ifp->if_softc; EPAIR_REFCOUNT_RELEASE(&sc->refcount); EPAIR_REFCOUNT_ASSERT((int)sc->refcount >= 1, ("%s: ifp=%p sc->refcount not >= 1: %d", __func__, ifp, sc->refcount)); free(elm, M_EPAIR); } } EPAIR_UNLOCK(epair_dpcpu); } } static int epair_add_ifp_for_draining(struct ifnet *ifp) { struct epair_dpcpu *epair_dpcpu; struct epair_softc *sc; struct epair_ifp_drain *elm = NULL; sc = ifp->if_softc; epair_dpcpu = DPCPU_ID_PTR(sc->cpuid, epair_dpcpu); EPAIR_LOCK_ASSERT(epair_dpcpu); STAILQ_FOREACH(elm, &epair_dpcpu->epair_ifp_drain_list, ifp_next) if (elm->ifp == ifp) break; /* If the ifp is there already, return success. */ if (elm != NULL) return (0); elm = malloc(sizeof(struct epair_ifp_drain), M_EPAIR, M_NOWAIT|M_ZERO); if (elm == NULL) return (ENOMEM); elm->ifp = ifp; /* Add a reference for the ifp pointer on the list. */ EPAIR_REFCOUNT_AQUIRE(&sc->refcount); STAILQ_INSERT_TAIL(&epair_dpcpu->epair_ifp_drain_list, elm, ifp_next); return (0); } static void epair_start_locked(struct ifnet *ifp) { struct epair_dpcpu *epair_dpcpu; struct mbuf *m; struct epair_softc *sc; struct ifnet *oifp; int error; DPRINTF("ifp=%p\n", ifp); sc = ifp->if_softc; epair_dpcpu = DPCPU_ID_PTR(sc->cpuid, epair_dpcpu); EPAIR_LOCK_ASSERT(epair_dpcpu); if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) return; if ((ifp->if_flags & IFF_UP) == 0) return; /* * We get packets here from ether_output via if_handoff() * and need to put them into the input queue of the oifp * and call oifp->if_input() via netisr/epair_sintr(). */ oifp = sc->oifp; sc = oifp->if_softc; for (;;) { IFQ_DEQUEUE(&ifp->if_snd, m); if (m == NULL) break; BPF_MTAP(ifp, m); /* * In case the outgoing interface is not usable, * drop the packet. */ if ((oifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || (oifp->if_flags & IFF_UP) ==0) { if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); m_freem(m); continue; } DPRINTF("packet %s -> %s\n", ifp->if_xname, oifp->if_xname); /* * Add a reference so the interface cannot go while the * packet is in transit as we rely on rcvif to stay valid. */ EPAIR_REFCOUNT_AQUIRE(&sc->refcount); m->m_pkthdr.rcvif = oifp; CURVNET_SET_QUIET(oifp->if_vnet); error = netisr_queue(NETISR_EPAIR, m); CURVNET_RESTORE(); if (!error) { if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); /* Someone else received the packet. */ if_inc_counter(oifp, IFCOUNTER_IPACKETS, 1); } else { /* The packet was freed already. */ epair_dpcpu->epair_drv_flags |= IFF_DRV_OACTIVE; ifp->if_drv_flags |= IFF_DRV_OACTIVE; (void) epair_add_ifp_for_draining(ifp); if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); EPAIR_REFCOUNT_RELEASE(&sc->refcount); EPAIR_REFCOUNT_ASSERT((int)sc->refcount >= 1, ("%s: ifp=%p sc->refcount not >= 1: %d", __func__, oifp, sc->refcount)); } } } static void epair_start(struct ifnet *ifp) { struct epair_dpcpu *epair_dpcpu; epair_dpcpu = DPCPU_ID_PTR(cpuid_from_ifp(ifp), epair_dpcpu); EPAIR_LOCK(epair_dpcpu); epair_start_locked(ifp); EPAIR_UNLOCK(epair_dpcpu); } static int epair_transmit_locked(struct ifnet *ifp, struct mbuf *m) { struct epair_dpcpu *epair_dpcpu; struct epair_softc *sc; struct ifnet *oifp; int error, len; short mflags; DPRINTF("ifp=%p m=%p\n", ifp, m); sc = ifp->if_softc; epair_dpcpu = DPCPU_ID_PTR(sc->cpuid, epair_dpcpu); EPAIR_LOCK_ASSERT(epair_dpcpu); if (m == NULL) return (0); /* * We are not going to use the interface en/dequeue mechanism * on the TX side. We are called from ether_output_frame() * and will put the packet into the incoming queue of the * other interface of our pair via the netsir. */ if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) { m_freem(m); return (ENXIO); } if ((ifp->if_flags & IFF_UP) == 0) { m_freem(m); return (ENETDOWN); } BPF_MTAP(ifp, m); /* * In case the outgoing interface is not usable, * drop the packet. */ oifp = sc->oifp; if ((oifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || (oifp->if_flags & IFF_UP) ==0) { if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); m_freem(m); return (0); } len = m->m_pkthdr.len; mflags = m->m_flags; DPRINTF("packet %s -> %s\n", ifp->if_xname, oifp->if_xname); #ifdef ALTQ /* Support ALTQ via the classic if_start() path. */ IF_LOCK(&ifp->if_snd); if (ALTQ_IS_ENABLED(&ifp->if_snd)) { ALTQ_ENQUEUE(&ifp->if_snd, m, NULL, error); if (error) if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1); IF_UNLOCK(&ifp->if_snd); if (!error) { if_inc_counter(ifp, IFCOUNTER_OBYTES, len); if (mflags & (M_BCAST|M_MCAST)) if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1); if ((ifp->if_drv_flags & IFF_DRV_OACTIVE) == 0) epair_start_locked(ifp); else (void)epair_add_ifp_for_draining(ifp); } return (error); } IF_UNLOCK(&ifp->if_snd); #endif if ((epair_dpcpu->epair_drv_flags & IFF_DRV_OACTIVE) != 0) { /* * Our hardware queue is full, try to fall back * queuing to the ifq but do not call ifp->if_start. * Either we are lucky or the packet is gone. */ IFQ_ENQUEUE(&ifp->if_snd, m, error); if (!error) (void)epair_add_ifp_for_draining(ifp); return (error); } sc = oifp->if_softc; /* * Add a reference so the interface cannot go while the * packet is in transit as we rely on rcvif to stay valid. */ EPAIR_REFCOUNT_AQUIRE(&sc->refcount); m->m_pkthdr.rcvif = oifp; CURVNET_SET_QUIET(oifp->if_vnet); error = netisr_queue(NETISR_EPAIR, m); CURVNET_RESTORE(); if (!error) { if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); /* * IFQ_HANDOFF_ADJ/ip_handoff() update statistics, * but as we bypass all this we have to duplicate * the logic another time. */ if_inc_counter(ifp, IFCOUNTER_OBYTES, len); if (mflags & (M_BCAST|M_MCAST)) if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1); /* Someone else received the packet. */ if_inc_counter(oifp, IFCOUNTER_IPACKETS, 1); } else { /* The packet was freed already. */ epair_dpcpu->epair_drv_flags |= IFF_DRV_OACTIVE; ifp->if_drv_flags |= IFF_DRV_OACTIVE; if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); EPAIR_REFCOUNT_RELEASE(&sc->refcount); EPAIR_REFCOUNT_ASSERT((int)sc->refcount >= 1, ("%s: ifp=%p sc->refcount not >= 1: %d", __func__, oifp, sc->refcount)); } return (error); } static int epair_transmit(struct ifnet *ifp, struct mbuf *m) { struct epair_dpcpu *epair_dpcpu; int error; epair_dpcpu = DPCPU_ID_PTR(cpuid_from_ifp(ifp), epair_dpcpu); EPAIR_LOCK(epair_dpcpu); error = epair_transmit_locked(ifp, m); EPAIR_UNLOCK(epair_dpcpu); return (error); } static void epair_qflush(struct ifnet *ifp) { struct epair_softc *sc; sc = ifp->if_softc; KASSERT(sc != NULL, ("%s: ifp=%p, epair_softc gone? sc=%p\n", __func__, ifp, sc)); /* * Remove this ifp from all backpointer lists. The interface will not * usable for flushing anyway nor should it have anything to flush * after if_qflush(). */ epair_remove_ifp_from_draining(ifp); if (sc->if_qflush) sc->if_qflush(ifp); } static int epair_media_change(struct ifnet *ifp __unused) { /* Do nothing. */ return (0); } static void epair_media_status(struct ifnet *ifp __unused, struct ifmediareq *imr) { imr->ifm_status = IFM_AVALID | IFM_ACTIVE; imr->ifm_active = IFM_ETHER | IFM_10G_T | IFM_FDX; } static int epair_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct epair_softc *sc; struct ifreq *ifr; int error; ifr = (struct ifreq *)data; switch (cmd) { case SIOCSIFFLAGS: case SIOCADDMULTI: case SIOCDELMULTI: error = 0; break; case SIOCSIFMEDIA: case SIOCGIFMEDIA: sc = ifp->if_softc; error = ifmedia_ioctl(ifp, ifr, &sc->media, cmd); break; case SIOCSIFMTU: /* We basically allow all kinds of MTUs. */ ifp->if_mtu = ifr->ifr_mtu; error = 0; break; default: /* Let the common ethernet handler process this. */ error = ether_ioctl(ifp, cmd, data); break; } return (error); } static void epair_init(void *dummy __unused) { } /* * Interface cloning functions. * We use our private ones so that we can create/destroy our secondary * device along with the primary one. */ static int epair_clone_match(struct if_clone *ifc, const char *name) { const char *cp; DPRINTF("name='%s'\n", name); /* * Our base name is epair. * Our interfaces will be named epair[ab]. * So accept anything of the following list: * - epair * - epair * but not the epair[ab] versions. */ if (strncmp(epairname, name, sizeof(epairname)-1) != 0) return (0); for (cp = name + sizeof(epairname) - 1; *cp != '\0'; cp++) { if (*cp < '0' || *cp > '9') return (0); } return (1); } static int epair_clone_create(struct if_clone *ifc, char *name, size_t len, caddr_t params) { struct epair_softc *sca, *scb; struct ifnet *ifp; char *dp; int error, unit, wildcard; uint8_t eaddr[ETHER_ADDR_LEN]; /* 00:00:00:00:00:00 */ /* * We are abusing params to create our second interface. * Actually we already created it and called if_clone_create() * for it to do the official insertion procedure the moment we knew * it cannot fail anymore. So just do attach it here. */ if (params) { scb = (struct epair_softc *)params; ifp = scb->ifp; /* Assign a hopefully unique, locally administered etheraddr. */ eaddr[0] = 0x02; eaddr[3] = (ifp->if_index >> 8) & 0xff; eaddr[4] = ifp->if_index & 0xff; eaddr[5] = 0x0b; ether_ifattach(ifp, eaddr); /* Correctly set the name for the cloner list. */ strlcpy(name, scb->ifp->if_xname, len); return (0); } /* Try to see if a special unit was requested. */ error = ifc_name2unit(name, &unit); if (error != 0) return (error); wildcard = (unit < 0); error = ifc_alloc_unit(ifc, &unit); if (error != 0) return (error); /* * If no unit had been given, we need to adjust the ifName. * Also make sure there is space for our extra [ab] suffix. */ for (dp = name; *dp != '\0'; dp++); if (wildcard) { error = snprintf(dp, len - (dp - name), "%d", unit); if (error > len - (dp - name) - 1) { /* ifName too long. */ ifc_free_unit(ifc, unit); return (ENOSPC); } dp += error; } if (len - (dp - name) - 1 < 1) { /* No space left for our [ab] suffix. */ ifc_free_unit(ifc, unit); return (ENOSPC); } *dp = 'b'; /* Must not change dp so we can replace 'a' by 'b' later. */ *(dp+1) = '\0'; /* Check if 'a' and 'b' interfaces already exist. */ if (ifunit(name) != NULL) return (EEXIST); *dp = 'a'; if (ifunit(name) != NULL) return (EEXIST); /* Allocate memory for both [ab] interfaces */ sca = malloc(sizeof(struct epair_softc), M_EPAIR, M_WAITOK | M_ZERO); EPAIR_REFCOUNT_INIT(&sca->refcount, 1); sca->ifp = if_alloc(IFT_ETHER); if (sca->ifp == NULL) { free(sca, M_EPAIR); ifc_free_unit(ifc, unit); return (ENOSPC); } scb = malloc(sizeof(struct epair_softc), M_EPAIR, M_WAITOK | M_ZERO); EPAIR_REFCOUNT_INIT(&scb->refcount, 1); scb->ifp = if_alloc(IFT_ETHER); if (scb->ifp == NULL) { free(scb, M_EPAIR); if_free(sca->ifp); free(sca, M_EPAIR); ifc_free_unit(ifc, unit); return (ENOSPC); } /* * Cross-reference the interfaces so we will be able to free both. */ sca->oifp = scb->ifp; scb->oifp = sca->ifp; /* * Calculate the cpuid for netisr queueing based on the * ifIndex of the interfaces. As long as we cannot configure * this or use cpuset information easily we cannot guarantee * cache locality but we can at least allow parallelism. */ sca->cpuid = netisr_get_cpuid(sca->ifp->if_index); scb->cpuid = netisr_get_cpuid(scb->ifp->if_index); /* Initialise pseudo media types. */ ifmedia_init(&sca->media, 0, epair_media_change, epair_media_status); ifmedia_add(&sca->media, IFM_ETHER | IFM_10G_T, 0, NULL); ifmedia_set(&sca->media, IFM_ETHER | IFM_10G_T); ifmedia_init(&scb->media, 0, epair_media_change, epair_media_status); ifmedia_add(&scb->media, IFM_ETHER | IFM_10G_T, 0, NULL); ifmedia_set(&scb->media, IFM_ETHER | IFM_10G_T); /* Finish initialization of interface a. */ ifp = sca->ifp; ifp->if_softc = sca; strlcpy(ifp->if_xname, name, IFNAMSIZ); ifp->if_dname = epairname; ifp->if_dunit = unit; ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; ifp->if_capabilities = IFCAP_VLAN_MTU; ifp->if_capenable = IFCAP_VLAN_MTU; ifp->if_start = epair_start; ifp->if_ioctl = epair_ioctl; ifp->if_init = epair_init; if_setsendqlen(ifp, ifqmaxlen); if_setsendqready(ifp); /* Assign a hopefully unique, locally administered etheraddr. */ eaddr[0] = 0x02; eaddr[3] = (ifp->if_index >> 8) & 0xff; eaddr[4] = ifp->if_index & 0xff; eaddr[5] = 0x0a; ether_ifattach(ifp, eaddr); sca->if_qflush = ifp->if_qflush; ifp->if_qflush = epair_qflush; ifp->if_transmit = epair_transmit; ifp->if_baudrate = IF_Gbps(10); /* arbitrary maximum */ /* Swap the name and finish initialization of interface b. */ *dp = 'b'; ifp = scb->ifp; ifp->if_softc = scb; strlcpy(ifp->if_xname, name, IFNAMSIZ); ifp->if_dname = epairname; ifp->if_dunit = unit; ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; ifp->if_capabilities = IFCAP_VLAN_MTU; ifp->if_capenable = IFCAP_VLAN_MTU; ifp->if_start = epair_start; ifp->if_ioctl = epair_ioctl; ifp->if_init = epair_init; if_setsendqlen(ifp, ifqmaxlen); if_setsendqready(ifp); /* We need to play some tricks here for the second interface. */ strlcpy(name, epairname, len); error = if_clone_create(name, len, (caddr_t)scb); if (error) panic("%s: if_clone_create() for our 2nd iface failed: %d", __func__, error); scb->if_qflush = ifp->if_qflush; ifp->if_qflush = epair_qflush; ifp->if_transmit = epair_transmit; ifp->if_baudrate = IF_Gbps(10); /* arbitrary maximum */ /* * Restore name to a as the ifp for this will go into the * cloner list for the initial call. */ strlcpy(name, sca->ifp->if_xname, len); DPRINTF("name='%s/%db' created sca=%p scb=%p\n", name, unit, sca, scb); /* Tell the world, that we are ready to rock. */ sca->ifp->if_drv_flags |= IFF_DRV_RUNNING; scb->ifp->if_drv_flags |= IFF_DRV_RUNNING; if_link_state_change(sca->ifp, LINK_STATE_UP); if_link_state_change(scb->ifp, LINK_STATE_UP); return (0); } static int epair_clone_destroy(struct if_clone *ifc, struct ifnet *ifp) { struct ifnet *oifp; struct epair_softc *sca, *scb; int unit, error; DPRINTF("ifp=%p\n", ifp); /* * In case we called into if_clone_destroyif() ourselves * again to remove the second interface, the softc will be * NULL. In that case so not do anything but return success. */ if (ifp->if_softc == NULL) return (0); unit = ifp->if_dunit; sca = ifp->if_softc; oifp = sca->oifp; scb = oifp->if_softc; DPRINTF("ifp=%p oifp=%p\n", ifp, oifp); if_link_state_change(ifp, LINK_STATE_DOWN); if_link_state_change(oifp, LINK_STATE_DOWN); ifp->if_drv_flags &= ~IFF_DRV_RUNNING; oifp->if_drv_flags &= ~IFF_DRV_RUNNING; /* * Get rid of our second half. As the other of the two * interfaces may reside in a different vnet, we need to * switch before freeing them. */ CURVNET_SET_QUIET(oifp->if_vnet); ether_ifdetach(oifp); /* * Wait for all packets to be dispatched to if_input. * The numbers can only go down as the interface is * detached so there is no need to use atomics. */ DPRINTF("scb refcnt=%u\n", scb->refcount); EPAIR_REFCOUNT_ASSERT(scb->refcount == 1, ("%s: ifp=%p scb->refcount!=1: %d", __func__, oifp, scb->refcount)); oifp->if_softc = NULL; error = if_clone_destroyif(ifc, oifp); if (error) panic("%s: if_clone_destroyif() for our 2nd iface failed: %d", __func__, error); if_free(oifp); ifmedia_removeall(&scb->media); free(scb, M_EPAIR); CURVNET_RESTORE(); ether_ifdetach(ifp); /* * Wait for all packets to be dispatched to if_input. */ DPRINTF("sca refcnt=%u\n", sca->refcount); EPAIR_REFCOUNT_ASSERT(sca->refcount == 1, ("%s: ifp=%p sca->refcount!=1: %d", __func__, ifp, sca->refcount)); if_free(ifp); ifmedia_removeall(&sca->media); free(sca, M_EPAIR); ifc_free_unit(ifc, unit); return (0); } static void vnet_epair_init(const void *unused __unused) { V_epair_cloner = if_clone_advanced(epairname, 0, epair_clone_match, epair_clone_create, epair_clone_destroy); #ifdef VIMAGE netisr_register_vnet(&epair_nh); #endif } VNET_SYSINIT(vnet_epair_init, SI_SUB_PSEUDO, SI_ORDER_ANY, vnet_epair_init, NULL); static void vnet_epair_uninit(const void *unused __unused) { #ifdef VIMAGE netisr_unregister_vnet(&epair_nh); #endif if_clone_detach(V_epair_cloner); } VNET_SYSUNINIT(vnet_epair_uninit, SI_SUB_INIT_IF, SI_ORDER_ANY, vnet_epair_uninit, NULL); static void epair_uninit(const void *unused __unused) { netisr_unregister(&epair_nh); epair_dpcpu_detach(); if (bootverbose) printf("%s unloaded.\n", epairname); } SYSUNINIT(epair_uninit, SI_SUB_INIT_IF, SI_ORDER_MIDDLE, epair_uninit, NULL); static int epair_modevent(module_t mod, int type, void *data) { int qlimit; switch (type) { case MOD_LOAD: /* For now limit us to one global mutex and one inq. */ epair_dpcpu_init(); epair_nh.nh_qlimit = 42 * ifqmaxlen; /* 42 shall be the number. */ if (TUNABLE_INT_FETCH("net.link.epair.netisr_maxqlen", &qlimit)) epair_nh.nh_qlimit = qlimit; netisr_register(&epair_nh); if (bootverbose) printf("%s initialized.\n", epairname); break; case MOD_UNLOAD: /* Handled in epair_uninit() */ break; default: return (EOPNOTSUPP); } return (0); } static moduledata_t epair_mod = { "if_epair", epair_modevent, 0 }; DECLARE_MODULE(if_epair, epair_mod, SI_SUB_PSEUDO, SI_ORDER_MIDDLE); MODULE_VERSION(if_epair, 1); Index: head/sys/net/if_lagg.c =================================================================== --- head/sys/net/if_lagg.c (revision 333863) +++ head/sys/net/if_lagg.c (revision 333864) @@ -1,2203 +1,2200 @@ /* $OpenBSD: if_trunk.c,v 1.30 2007/01/31 06:20:19 reyk Exp $ */ /* * Copyright (c) 2005, 2006 Reyk Floeter * Copyright (c) 2007 Andrew Thompson * Copyright (c) 2014, 2016 Marcelo Araujo * * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ratelimit.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(INET) || defined(INET6) #include #include #endif #ifdef INET #include #include #endif #ifdef INET6 #include #include #include #endif #include #include #include #define LAGG_RLOCK() epoch_enter_preempt(net_epoch_preempt) #define LAGG_RUNLOCK() epoch_exit_preempt(net_epoch_preempt) #define LAGG_RLOCK_ASSERT() MPASS(in_epoch()) #define LAGG_UNLOCK_ASSERT() MPASS(!in_epoch()) #define LAGG_SX_INIT(_sc) sx_init(&(_sc)->sc_sx, "if_lagg sx") #define LAGG_SX_DESTROY(_sc) sx_destroy(&(_sc)->sc_sx) #define LAGG_XLOCK(_sc) sx_xlock(&(_sc)->sc_sx) #define LAGG_XUNLOCK(_sc) sx_xunlock(&(_sc)->sc_sx) #define LAGG_SXLOCK_ASSERT(_sc) sx_assert(&(_sc)->sc_sx, SA_LOCKED) #define LAGG_XLOCK_ASSERT(_sc) sx_assert(&(_sc)->sc_sx, SA_XLOCKED) /* Special flags we should propagate to the lagg ports. */ static struct { int flag; int (*func)(struct ifnet *, int); } lagg_pflags[] = { {IFF_PROMISC, ifpromisc}, {IFF_ALLMULTI, if_allmulti}, {0, NULL} }; VNET_DEFINE(SLIST_HEAD(__trhead, lagg_softc), lagg_list); /* list of laggs */ #define V_lagg_list VNET(lagg_list) static VNET_DEFINE(struct mtx, lagg_list_mtx); #define V_lagg_list_mtx VNET(lagg_list_mtx) #define LAGG_LIST_LOCK_INIT(x) mtx_init(&V_lagg_list_mtx, \ "if_lagg list", NULL, MTX_DEF) #define LAGG_LIST_LOCK_DESTROY(x) mtx_destroy(&V_lagg_list_mtx) #define LAGG_LIST_LOCK(x) mtx_lock(&V_lagg_list_mtx) #define LAGG_LIST_UNLOCK(x) mtx_unlock(&V_lagg_list_mtx) eventhandler_tag lagg_detach_cookie = NULL; static int lagg_clone_create(struct if_clone *, int, caddr_t); static void lagg_clone_destroy(struct ifnet *); static VNET_DEFINE(struct if_clone *, lagg_cloner); #define V_lagg_cloner VNET(lagg_cloner) static const char laggname[] = "lagg"; static void lagg_capabilities(struct lagg_softc *); static int lagg_port_create(struct lagg_softc *, struct ifnet *); static int lagg_port_destroy(struct lagg_port *, int); static struct mbuf *lagg_input(struct ifnet *, struct mbuf *); static void lagg_linkstate(struct lagg_softc *); static void lagg_port_state(struct ifnet *, int); static int lagg_port_ioctl(struct ifnet *, u_long, caddr_t); static int lagg_port_output(struct ifnet *, struct mbuf *, const struct sockaddr *, struct route *); static void lagg_port_ifdetach(void *arg __unused, struct ifnet *); #ifdef LAGG_PORT_STACKING static int lagg_port_checkstacking(struct lagg_softc *); #endif static void lagg_port2req(struct lagg_port *, struct lagg_reqport *); static void lagg_init(void *); static void lagg_stop(struct lagg_softc *); static int lagg_ioctl(struct ifnet *, u_long, caddr_t); #ifdef RATELIMIT static int lagg_snd_tag_alloc(struct ifnet *, union if_snd_tag_alloc_params *, struct m_snd_tag **); #endif static int lagg_setmulti(struct lagg_port *); static int lagg_clrmulti(struct lagg_port *); static int lagg_setcaps(struct lagg_port *, int cap); static int lagg_setflag(struct lagg_port *, int, int, int (*func)(struct ifnet *, int)); static int lagg_setflags(struct lagg_port *, int status); static uint64_t lagg_get_counter(struct ifnet *ifp, ift_counter cnt); static int lagg_transmit(struct ifnet *, struct mbuf *); static void lagg_qflush(struct ifnet *); static int lagg_media_change(struct ifnet *); static void lagg_media_status(struct ifnet *, struct ifmediareq *); static struct lagg_port *lagg_link_active(struct lagg_softc *, struct lagg_port *); /* Simple round robin */ static void lagg_rr_attach(struct lagg_softc *); static int lagg_rr_start(struct lagg_softc *, struct mbuf *); static struct mbuf *lagg_rr_input(struct lagg_softc *, struct lagg_port *, struct mbuf *); /* Active failover */ static int lagg_fail_start(struct lagg_softc *, struct mbuf *); static struct mbuf *lagg_fail_input(struct lagg_softc *, struct lagg_port *, struct mbuf *); /* Loadbalancing */ static void lagg_lb_attach(struct lagg_softc *); static void lagg_lb_detach(struct lagg_softc *); static int lagg_lb_port_create(struct lagg_port *); static void lagg_lb_port_destroy(struct lagg_port *); static int lagg_lb_start(struct lagg_softc *, struct mbuf *); static struct mbuf *lagg_lb_input(struct lagg_softc *, struct lagg_port *, struct mbuf *); static int lagg_lb_porttable(struct lagg_softc *, struct lagg_port *); /* Broadcast */ static int lagg_bcast_start(struct lagg_softc *, struct mbuf *); static struct mbuf *lagg_bcast_input(struct lagg_softc *, struct lagg_port *, struct mbuf *); /* 802.3ad LACP */ static void lagg_lacp_attach(struct lagg_softc *); static void lagg_lacp_detach(struct lagg_softc *); static int lagg_lacp_start(struct lagg_softc *, struct mbuf *); static struct mbuf *lagg_lacp_input(struct lagg_softc *, struct lagg_port *, struct mbuf *); static void lagg_lacp_lladdr(struct lagg_softc *); /* lagg protocol table */ static const struct lagg_proto { lagg_proto pr_num; void (*pr_attach)(struct lagg_softc *); void (*pr_detach)(struct lagg_softc *); int (*pr_start)(struct lagg_softc *, struct mbuf *); struct mbuf * (*pr_input)(struct lagg_softc *, struct lagg_port *, struct mbuf *); int (*pr_addport)(struct lagg_port *); void (*pr_delport)(struct lagg_port *); void (*pr_linkstate)(struct lagg_port *); void (*pr_init)(struct lagg_softc *); void (*pr_stop)(struct lagg_softc *); void (*pr_lladdr)(struct lagg_softc *); void (*pr_request)(struct lagg_softc *, void *); void (*pr_portreq)(struct lagg_port *, void *); } lagg_protos[] = { { .pr_num = LAGG_PROTO_NONE }, { .pr_num = LAGG_PROTO_ROUNDROBIN, .pr_attach = lagg_rr_attach, .pr_start = lagg_rr_start, .pr_input = lagg_rr_input, }, { .pr_num = LAGG_PROTO_FAILOVER, .pr_start = lagg_fail_start, .pr_input = lagg_fail_input, }, { .pr_num = LAGG_PROTO_LOADBALANCE, .pr_attach = lagg_lb_attach, .pr_detach = lagg_lb_detach, .pr_start = lagg_lb_start, .pr_input = lagg_lb_input, .pr_addport = lagg_lb_port_create, .pr_delport = lagg_lb_port_destroy, }, { .pr_num = LAGG_PROTO_LACP, .pr_attach = lagg_lacp_attach, .pr_detach = lagg_lacp_detach, .pr_start = lagg_lacp_start, .pr_input = lagg_lacp_input, .pr_addport = lacp_port_create, .pr_delport = lacp_port_destroy, .pr_linkstate = lacp_linkstate, .pr_init = lacp_init, .pr_stop = lacp_stop, .pr_lladdr = lagg_lacp_lladdr, .pr_request = lacp_req, .pr_portreq = lacp_portreq, }, { .pr_num = LAGG_PROTO_BROADCAST, .pr_start = lagg_bcast_start, .pr_input = lagg_bcast_input, }, }; SYSCTL_DECL(_net_link); SYSCTL_NODE(_net_link, OID_AUTO, lagg, CTLFLAG_RW, 0, "Link Aggregation"); /* Allow input on any failover links */ static VNET_DEFINE(int, lagg_failover_rx_all); #define V_lagg_failover_rx_all VNET(lagg_failover_rx_all) SYSCTL_INT(_net_link_lagg, OID_AUTO, failover_rx_all, CTLFLAG_RW | CTLFLAG_VNET, &VNET_NAME(lagg_failover_rx_all), 0, "Accept input from any interface in a failover lagg"); /* Default value for using flowid */ static VNET_DEFINE(int, def_use_flowid) = 0; #define V_def_use_flowid VNET(def_use_flowid) SYSCTL_INT(_net_link_lagg, OID_AUTO, default_use_flowid, CTLFLAG_RWTUN, &VNET_NAME(def_use_flowid), 0, "Default setting for using flow id for load sharing"); /* Default value for flowid shift */ static VNET_DEFINE(int, def_flowid_shift) = 16; #define V_def_flowid_shift VNET(def_flowid_shift) SYSCTL_INT(_net_link_lagg, OID_AUTO, default_flowid_shift, CTLFLAG_RWTUN, &VNET_NAME(def_flowid_shift), 0, "Default setting for flowid shift for load sharing"); static void vnet_lagg_init(const void *unused __unused) { LAGG_LIST_LOCK_INIT(); SLIST_INIT(&V_lagg_list); V_lagg_cloner = if_clone_simple(laggname, lagg_clone_create, lagg_clone_destroy, 0); } VNET_SYSINIT(vnet_lagg_init, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, vnet_lagg_init, NULL); static void vnet_lagg_uninit(const void *unused __unused) { if_clone_detach(V_lagg_cloner); LAGG_LIST_LOCK_DESTROY(); } VNET_SYSUNINIT(vnet_lagg_uninit, SI_SUB_INIT_IF, SI_ORDER_ANY, vnet_lagg_uninit, NULL); static int lagg_modevent(module_t mod, int type, void *data) { switch (type) { case MOD_LOAD: lagg_input_p = lagg_input; lagg_linkstate_p = lagg_port_state; lagg_detach_cookie = EVENTHANDLER_REGISTER( ifnet_departure_event, lagg_port_ifdetach, NULL, EVENTHANDLER_PRI_ANY); break; case MOD_UNLOAD: EVENTHANDLER_DEREGISTER(ifnet_departure_event, lagg_detach_cookie); lagg_input_p = NULL; lagg_linkstate_p = NULL; break; default: return (EOPNOTSUPP); } return (0); } static moduledata_t lagg_mod = { "if_lagg", lagg_modevent, 0 }; DECLARE_MODULE(if_lagg, lagg_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); MODULE_VERSION(if_lagg, 1); static void lagg_proto_attach(struct lagg_softc *sc, lagg_proto pr) { LAGG_XLOCK_ASSERT(sc); KASSERT(sc->sc_proto == LAGG_PROTO_NONE, ("%s: sc %p has proto", __func__, sc)); if (sc->sc_ifflags & IFF_DEBUG) if_printf(sc->sc_ifp, "using proto %u\n", pr); if (lagg_protos[pr].pr_attach != NULL) lagg_protos[pr].pr_attach(sc); sc->sc_proto = pr; } static void lagg_proto_detach(struct lagg_softc *sc) { lagg_proto pr; LAGG_XLOCK_ASSERT(sc); pr = sc->sc_proto; sc->sc_proto = LAGG_PROTO_NONE; if (lagg_protos[pr].pr_detach != NULL) lagg_protos[pr].pr_detach(sc); } static int lagg_proto_start(struct lagg_softc *sc, struct mbuf *m) { return (lagg_protos[sc->sc_proto].pr_start(sc, m)); } static struct mbuf * lagg_proto_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m) { return (lagg_protos[sc->sc_proto].pr_input(sc, lp, m)); } static int lagg_proto_addport(struct lagg_softc *sc, struct lagg_port *lp) { if (lagg_protos[sc->sc_proto].pr_addport == NULL) return (0); else return (lagg_protos[sc->sc_proto].pr_addport(lp)); } static void lagg_proto_delport(struct lagg_softc *sc, struct lagg_port *lp) { if (lagg_protos[sc->sc_proto].pr_delport != NULL) lagg_protos[sc->sc_proto].pr_delport(lp); } static void lagg_proto_linkstate(struct lagg_softc *sc, struct lagg_port *lp) { if (lagg_protos[sc->sc_proto].pr_linkstate != NULL) lagg_protos[sc->sc_proto].pr_linkstate(lp); } static void lagg_proto_init(struct lagg_softc *sc) { if (lagg_protos[sc->sc_proto].pr_init != NULL) lagg_protos[sc->sc_proto].pr_init(sc); } static void lagg_proto_stop(struct lagg_softc *sc) { if (lagg_protos[sc->sc_proto].pr_stop != NULL) lagg_protos[sc->sc_proto].pr_stop(sc); } static void lagg_proto_lladdr(struct lagg_softc *sc) { if (lagg_protos[sc->sc_proto].pr_lladdr != NULL) lagg_protos[sc->sc_proto].pr_lladdr(sc); } static void lagg_proto_request(struct lagg_softc *sc, void *v) { if (lagg_protos[sc->sc_proto].pr_request != NULL) lagg_protos[sc->sc_proto].pr_request(sc, v); } static void lagg_proto_portreq(struct lagg_softc *sc, struct lagg_port *lp, void *v) { if (lagg_protos[sc->sc_proto].pr_portreq != NULL) lagg_protos[sc->sc_proto].pr_portreq(lp, v); } /* * This routine is run via an vlan * config EVENT */ static void lagg_register_vlan(void *arg, struct ifnet *ifp, u_int16_t vtag) { struct lagg_softc *sc = ifp->if_softc; struct lagg_port *lp; if (ifp->if_softc != arg) /* Not our event */ return; LAGG_RLOCK(); CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) EVENTHANDLER_INVOKE(vlan_config, lp->lp_ifp, vtag); LAGG_RUNLOCK(); } /* * This routine is run via an vlan * unconfig EVENT */ static void lagg_unregister_vlan(void *arg, struct ifnet *ifp, u_int16_t vtag) { struct lagg_softc *sc = ifp->if_softc; struct lagg_port *lp; if (ifp->if_softc != arg) /* Not our event */ return; LAGG_RLOCK(); CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) EVENTHANDLER_INVOKE(vlan_unconfig, lp->lp_ifp, vtag); LAGG_RUNLOCK(); } static int lagg_clone_create(struct if_clone *ifc, int unit, caddr_t params) { struct lagg_softc *sc; struct ifnet *ifp; static const u_char eaddr[6]; /* 00:00:00:00:00:00 */ sc = malloc(sizeof(*sc), M_DEVBUF, M_WAITOK|M_ZERO); ifp = sc->sc_ifp = if_alloc(IFT_ETHER); if (ifp == NULL) { free(sc, M_DEVBUF); return (ENOSPC); } LAGG_SX_INIT(sc); LAGG_XLOCK(sc); if (V_def_use_flowid) sc->sc_opts |= LAGG_OPT_USE_FLOWID; sc->flowid_shift = V_def_flowid_shift; /* Hash all layers by default */ sc->sc_flags = MBUF_HASHFLAG_L2|MBUF_HASHFLAG_L3|MBUF_HASHFLAG_L4; lagg_proto_attach(sc, LAGG_PROTO_DEFAULT); SLIST_INIT(&sc->sc_ports); /* Initialise pseudo media types */ ifmedia_init(&sc->sc_media, 0, lagg_media_change, lagg_media_status); ifmedia_add(&sc->sc_media, IFM_ETHER | IFM_AUTO, 0, NULL); ifmedia_set(&sc->sc_media, IFM_ETHER | IFM_AUTO); if_initname(ifp, laggname, unit); ifp->if_softc = sc; ifp->if_transmit = lagg_transmit; ifp->if_qflush = lagg_qflush; ifp->if_init = lagg_init; ifp->if_ioctl = lagg_ioctl; ifp->if_get_counter = lagg_get_counter; ifp->if_flags = IFF_SIMPLEX | IFF_BROADCAST | IFF_MULTICAST; #ifdef RATELIMIT ifp->if_snd_tag_alloc = lagg_snd_tag_alloc; ifp->if_capenable = ifp->if_capabilities = IFCAP_HWSTATS | IFCAP_TXRTLMT; #else ifp->if_capenable = ifp->if_capabilities = IFCAP_HWSTATS; #endif /* * Attach as an ordinary ethernet device, children will be attached * as special device IFT_IEEE8023ADLAG. */ ether_ifattach(ifp, eaddr); sc->vlan_attach = EVENTHANDLER_REGISTER(vlan_config, lagg_register_vlan, sc, EVENTHANDLER_PRI_FIRST); sc->vlan_detach = EVENTHANDLER_REGISTER(vlan_unconfig, lagg_unregister_vlan, sc, EVENTHANDLER_PRI_FIRST); /* Insert into the global list of laggs */ LAGG_LIST_LOCK(); SLIST_INSERT_HEAD(&V_lagg_list, sc, sc_entries); LAGG_LIST_UNLOCK(); LAGG_XUNLOCK(sc); return (0); } static void lagg_clone_destroy(struct ifnet *ifp) { struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc; struct lagg_port *lp; LAGG_XLOCK(sc); sc->sc_destroying = 1; lagg_stop(sc); ifp->if_flags &= ~IFF_UP; EVENTHANDLER_DEREGISTER(vlan_config, sc->vlan_attach); EVENTHANDLER_DEREGISTER(vlan_unconfig, sc->vlan_detach); /* Shutdown and remove lagg ports */ while ((lp = SLIST_FIRST(&sc->sc_ports)) != NULL) lagg_port_destroy(lp, 1); /* Unhook the aggregation protocol */ lagg_proto_detach(sc); LAGG_XUNLOCK(sc); ifmedia_removeall(&sc->sc_media); ether_ifdetach(ifp); if_free(ifp); LAGG_LIST_LOCK(); SLIST_REMOVE(&V_lagg_list, sc, lagg_softc, sc_entries); LAGG_LIST_UNLOCK(); LAGG_SX_DESTROY(sc); free(sc, M_DEVBUF); } static void lagg_capabilities(struct lagg_softc *sc) { struct lagg_port *lp; int cap, ena, pena; uint64_t hwa; struct ifnet_hw_tsomax hw_tsomax; LAGG_XLOCK_ASSERT(sc); /* Get common enabled capabilities for the lagg ports */ ena = ~0; CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) ena &= lp->lp_ifp->if_capenable; ena = (ena == ~0 ? 0 : ena); /* * Apply common enabled capabilities back to the lagg ports. * May require several iterations if they are dependent. */ do { pena = ena; CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { lagg_setcaps(lp, ena); ena &= lp->lp_ifp->if_capenable; } } while (pena != ena); /* Get other capabilities from the lagg ports */ cap = ~0; hwa = ~(uint64_t)0; memset(&hw_tsomax, 0, sizeof(hw_tsomax)); CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { cap &= lp->lp_ifp->if_capabilities; hwa &= lp->lp_ifp->if_hwassist; if_hw_tsomax_common(lp->lp_ifp, &hw_tsomax); } cap = (cap == ~0 ? 0 : cap); hwa = (hwa == ~(uint64_t)0 ? 0 : hwa); if (sc->sc_ifp->if_capabilities != cap || sc->sc_ifp->if_capenable != ena || sc->sc_ifp->if_hwassist != hwa || if_hw_tsomax_update(sc->sc_ifp, &hw_tsomax) != 0) { sc->sc_ifp->if_capabilities = cap; sc->sc_ifp->if_capenable = ena; sc->sc_ifp->if_hwassist = hwa; getmicrotime(&sc->sc_ifp->if_lastchange); if (sc->sc_ifflags & IFF_DEBUG) if_printf(sc->sc_ifp, "capabilities 0x%08x enabled 0x%08x\n", cap, ena); } } static int lagg_port_create(struct lagg_softc *sc, struct ifnet *ifp) { struct lagg_softc *sc_ptr; struct lagg_port *lp, *tlp; int error, i; uint64_t *pval; LAGG_XLOCK_ASSERT(sc); /* Limit the maximal number of lagg ports */ if (sc->sc_count >= LAGG_MAX_PORTS) return (ENOSPC); /* Check if port has already been associated to a lagg */ if (ifp->if_lagg != NULL) { /* Port is already in the current lagg? */ lp = (struct lagg_port *)ifp->if_lagg; if (lp->lp_softc == sc) return (EEXIST); return (EBUSY); } /* XXX Disallow non-ethernet interfaces (this should be any of 802) */ if (ifp->if_type != IFT_ETHER && ifp->if_type != IFT_L2VLAN) return (EPROTONOSUPPORT); /* Allow the first Ethernet member to define the MTU */ if (SLIST_EMPTY(&sc->sc_ports)) sc->sc_ifp->if_mtu = ifp->if_mtu; else if (sc->sc_ifp->if_mtu != ifp->if_mtu) { if_printf(sc->sc_ifp, "invalid MTU for %s\n", ifp->if_xname); return (EINVAL); } lp = malloc(sizeof(struct lagg_port), M_DEVBUF, M_WAITOK|M_ZERO); lp->lp_softc = sc; /* Check if port is a stacked lagg */ LAGG_LIST_LOCK(); SLIST_FOREACH(sc_ptr, &V_lagg_list, sc_entries) { if (ifp == sc_ptr->sc_ifp) { LAGG_LIST_UNLOCK(); free(lp, M_DEVBUF); return (EINVAL); /* XXX disable stacking for the moment, its untested */ #ifdef LAGG_PORT_STACKING lp->lp_flags |= LAGG_PORT_STACK; if (lagg_port_checkstacking(sc_ptr) >= LAGG_MAX_STACKING) { LAGG_LIST_UNLOCK(); free(lp, M_DEVBUF); return (E2BIG); } #endif } } LAGG_LIST_UNLOCK(); if_ref(ifp); lp->lp_ifp = ifp; bcopy(IF_LLADDR(ifp), lp->lp_lladdr, ETHER_ADDR_LEN); lp->lp_ifcapenable = ifp->if_capenable; if (SLIST_EMPTY(&sc->sc_ports)) { bcopy(IF_LLADDR(ifp), IF_LLADDR(sc->sc_ifp), ETHER_ADDR_LEN); lagg_proto_lladdr(sc); EVENTHANDLER_INVOKE(iflladdr_event, sc->sc_ifp); } else { if_setlladdr(ifp, IF_LLADDR(sc->sc_ifp), ETHER_ADDR_LEN); } lagg_setflags(lp, 1); if (SLIST_EMPTY(&sc->sc_ports)) sc->sc_primary = lp; /* Change the interface type */ lp->lp_iftype = ifp->if_type; ifp->if_type = IFT_IEEE8023ADLAG; ifp->if_lagg = lp; lp->lp_ioctl = ifp->if_ioctl; ifp->if_ioctl = lagg_port_ioctl; lp->lp_output = ifp->if_output; ifp->if_output = lagg_port_output; /* Read port counters */ pval = lp->port_counters.val; for (i = 0; i < IFCOUNTERS; i++, pval++) *pval = ifp->if_get_counter(ifp, i); /* * Insert into the list of ports. * Keep ports sorted by if_index. It is handy, when configuration * is predictable and `ifconfig laggN create ...` command * will lead to the same result each time. */ LAGG_RLOCK(); CK_SLIST_FOREACH(tlp, &sc->sc_ports, lp_entries) { if (tlp->lp_ifp->if_index < ifp->if_index && ( SLIST_NEXT(tlp, lp_entries) == NULL || SLIST_NEXT(tlp, lp_entries)->lp_ifp->if_index > ifp->if_index)) break; } LAGG_RUNLOCK(); if (tlp != NULL) SLIST_INSERT_AFTER(tlp, lp, lp_entries); else SLIST_INSERT_HEAD(&sc->sc_ports, lp, lp_entries); sc->sc_count++; lagg_setmulti(lp); if ((error = lagg_proto_addport(sc, lp)) != 0) { /* Remove the port, without calling pr_delport. */ lagg_port_destroy(lp, 0); return (error); } /* Update lagg capabilities */ lagg_capabilities(sc); lagg_linkstate(sc); return (0); } #ifdef LAGG_PORT_STACKING static int lagg_port_checkstacking(struct lagg_softc *sc) { struct lagg_softc *sc_ptr; struct lagg_port *lp; int m = 0; LAGG_SXLOCK_ASSERT(sc); CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { if (lp->lp_flags & LAGG_PORT_STACK) { sc_ptr = (struct lagg_softc *)lp->lp_ifp->if_softc; m = MAX(m, lagg_port_checkstacking(sc_ptr)); } } return (m + 1); } #endif static void lagg_port_destroy_cb(epoch_context_t ec) { struct lagg_port *lp; struct ifnet *ifp; lp = __containerof(ec, struct lagg_port, lp_epoch_ctx); ifp = lp->lp_ifp; if_rele(ifp); free(lp, M_DEVBUF); } static int lagg_port_destroy(struct lagg_port *lp, int rundelport) { struct lagg_softc *sc = lp->lp_softc; struct lagg_port *lp_ptr, *lp0; struct ifnet *ifp = lp->lp_ifp; uint64_t *pval, vdiff; int i; LAGG_XLOCK_ASSERT(sc); if (rundelport) lagg_proto_delport(sc, lp); if (lp->lp_detaching == 0) lagg_clrmulti(lp); /* Restore interface */ ifp->if_type = lp->lp_iftype; ifp->if_ioctl = lp->lp_ioctl; ifp->if_output = lp->lp_output; ifp->if_lagg = NULL; /* Update detached port counters */ pval = lp->port_counters.val; for (i = 0; i < IFCOUNTERS; i++, pval++) { vdiff = ifp->if_get_counter(ifp, i) - *pval; sc->detached_counters.val[i] += vdiff; } /* Finally, remove the port from the lagg */ CK_SLIST_REMOVE(&sc->sc_ports, lp, lagg_port, lp_entries); sc->sc_count--; /* Update the primary interface */ if (lp == sc->sc_primary) { uint8_t lladdr[ETHER_ADDR_LEN]; if ((lp0 = SLIST_FIRST(&sc->sc_ports)) == NULL) bzero(&lladdr, ETHER_ADDR_LEN); else bcopy(lp0->lp_lladdr, lladdr, ETHER_ADDR_LEN); sc->sc_primary = lp0; if (sc->sc_destroying == 0) { bcopy(lladdr, IF_LLADDR(sc->sc_ifp), ETHER_ADDR_LEN); lagg_proto_lladdr(sc); EVENTHANDLER_INVOKE(iflladdr_event, sc->sc_ifp); } /* * Update lladdr for each port (new primary needs update * as well, to switch from old lladdr to its 'real' one) */ CK_SLIST_FOREACH(lp_ptr, &sc->sc_ports, lp_entries) if_setlladdr(lp_ptr->lp_ifp, lladdr, ETHER_ADDR_LEN); } if (lp->lp_ifflags) if_printf(ifp, "%s: lp_ifflags unclean\n", __func__); if (lp->lp_detaching == 0) { lagg_setflags(lp, 0); lagg_setcaps(lp, lp->lp_ifcapenable); if_setlladdr(ifp, lp->lp_lladdr, ETHER_ADDR_LEN); } /* * free port and release it's ifnet reference after a grace period has * elapsed. */ epoch_call(net_epoch_preempt, &lp->lp_epoch_ctx, lagg_port_destroy_cb); /* Update lagg capabilities */ lagg_capabilities(sc); lagg_linkstate(sc); return (0); } static int lagg_port_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct lagg_reqport *rp = (struct lagg_reqport *)data; struct lagg_softc *sc; struct lagg_port *lp = NULL; int error = 0; /* Should be checked by the caller */ if (ifp->if_type != IFT_IEEE8023ADLAG || (lp = ifp->if_lagg) == NULL || (sc = lp->lp_softc) == NULL) goto fallback; switch (cmd) { case SIOCGLAGGPORT: if (rp->rp_portname[0] == '\0' || ifunit(rp->rp_portname) != ifp) { error = EINVAL; break; } LAGG_RLOCK(); if ((lp = ifp->if_lagg) == NULL || lp->lp_softc != sc) { error = ENOENT; LAGG_RUNLOCK(); break; } lagg_port2req(lp, rp); LAGG_RUNLOCK(); break; case SIOCSIFCAP: if (lp->lp_ioctl == NULL) { error = EINVAL; break; } error = (*lp->lp_ioctl)(ifp, cmd, data); if (error) break; /* Update lagg interface capabilities */ LAGG_XLOCK(sc); lagg_capabilities(sc); LAGG_XUNLOCK(sc); VLAN_CAPABILITIES(sc->sc_ifp); break; case SIOCSIFMTU: /* Do not allow the MTU to be changed once joined */ error = EINVAL; break; default: goto fallback; } return (error); fallback: if (lp != NULL && lp->lp_ioctl != NULL) return ((*lp->lp_ioctl)(ifp, cmd, data)); return (EINVAL); } /* * Requests counter @cnt data. * * Counter value is calculated the following way: * 1) for each port, sum difference between current and "initial" measurements. * 2) add lagg logical interface counters. * 3) add data from detached_counters array. * * We also do the following things on ports attach/detach: * 1) On port attach we store all counters it has into port_counter array. * 2) On port detach we add the different between "initial" and * current counters data to detached_counters array. */ static uint64_t lagg_get_counter(struct ifnet *ifp, ift_counter cnt) { struct lagg_softc *sc; struct lagg_port *lp; struct ifnet *lpifp; uint64_t newval, oldval, vsum; /* Revise this when we've got non-generic counters. */ KASSERT(cnt < IFCOUNTERS, ("%s: invalid cnt %d", __func__, cnt)); sc = (struct lagg_softc *)ifp->if_softc; vsum = 0; LAGG_RLOCK(); CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { /* Saved attached value */ oldval = lp->port_counters.val[cnt]; /* current value */ lpifp = lp->lp_ifp; newval = lpifp->if_get_counter(lpifp, cnt); /* Calculate diff and save new */ vsum += newval - oldval; } LAGG_RUNLOCK(); /* * Add counter data which might be added by upper * layer protocols operating on logical interface. */ vsum += if_get_counter_default(ifp, cnt); /* * Add counter data from detached ports counters */ vsum += sc->detached_counters.val[cnt]; return (vsum); } /* * For direct output to child ports. */ static int lagg_port_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro) { struct lagg_port *lp = ifp->if_lagg; switch (dst->sa_family) { case pseudo_AF_HDRCMPLT: case AF_UNSPEC: return ((*lp->lp_output)(ifp, m, dst, ro)); } /* drop any other frames */ m_freem(m); return (ENETDOWN); } static void lagg_port_ifdetach(void *arg __unused, struct ifnet *ifp) { struct lagg_port *lp; struct lagg_softc *sc; if ((lp = ifp->if_lagg) == NULL) return; /* If the ifnet is just being renamed, don't do anything. */ if (ifp->if_flags & IFF_RENAMING) return; sc = lp->lp_softc; LAGG_XLOCK(sc); lp->lp_detaching = 1; lagg_port_destroy(lp, 1); LAGG_XUNLOCK(sc); VLAN_CAPABILITIES(sc->sc_ifp); } static void lagg_port2req(struct lagg_port *lp, struct lagg_reqport *rp) { struct lagg_softc *sc = lp->lp_softc; strlcpy(rp->rp_ifname, sc->sc_ifname, sizeof(rp->rp_ifname)); strlcpy(rp->rp_portname, lp->lp_ifp->if_xname, sizeof(rp->rp_portname)); rp->rp_prio = lp->lp_prio; rp->rp_flags = lp->lp_flags; lagg_proto_portreq(sc, lp, &rp->rp_psc); /* Add protocol specific flags */ switch (sc->sc_proto) { case LAGG_PROTO_FAILOVER: if (lp == sc->sc_primary) rp->rp_flags |= LAGG_PORT_MASTER; if (lp == lagg_link_active(sc, sc->sc_primary)) rp->rp_flags |= LAGG_PORT_ACTIVE; break; case LAGG_PROTO_ROUNDROBIN: case LAGG_PROTO_LOADBALANCE: case LAGG_PROTO_BROADCAST: if (LAGG_PORTACTIVE(lp)) rp->rp_flags |= LAGG_PORT_ACTIVE; break; case LAGG_PROTO_LACP: /* LACP has a different definition of active */ if (lacp_isactive(lp)) rp->rp_flags |= LAGG_PORT_ACTIVE; if (lacp_iscollecting(lp)) rp->rp_flags |= LAGG_PORT_COLLECTING; if (lacp_isdistributing(lp)) rp->rp_flags |= LAGG_PORT_DISTRIBUTING; break; } } static void lagg_init(void *xsc) { struct lagg_softc *sc = (struct lagg_softc *)xsc; struct ifnet *ifp = sc->sc_ifp; struct lagg_port *lp; LAGG_XLOCK(sc); if (ifp->if_drv_flags & IFF_DRV_RUNNING) { LAGG_XUNLOCK(sc); return; } ifp->if_drv_flags |= IFF_DRV_RUNNING; /* * Update the port lladdrs if needed. * This might be if_setlladdr() notification * that lladdr has been changed. */ CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { if (memcmp(IF_LLADDR(ifp), IF_LLADDR(lp->lp_ifp), ETHER_ADDR_LEN) != 0) if_setlladdr(lp->lp_ifp, IF_LLADDR(ifp), ETHER_ADDR_LEN); } lagg_proto_init(sc); LAGG_XUNLOCK(sc); } static void lagg_stop(struct lagg_softc *sc) { struct ifnet *ifp = sc->sc_ifp; LAGG_XLOCK_ASSERT(sc); if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) return; ifp->if_drv_flags &= ~IFF_DRV_RUNNING; lagg_proto_stop(sc); } static int lagg_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc; struct lagg_reqall *ra = (struct lagg_reqall *)data; struct lagg_reqopts *ro = (struct lagg_reqopts *)data; struct lagg_reqport *rp = (struct lagg_reqport *)data, rpbuf; struct lagg_reqflags *rf = (struct lagg_reqflags *)data; struct ifreq *ifr = (struct ifreq *)data; struct lagg_port *lp; struct ifnet *tpif; struct thread *td = curthread; char *buf, *outbuf; int count, buflen, len, error = 0; bzero(&rpbuf, sizeof(rpbuf)); switch (cmd) { case SIOCGLAGG: LAGG_XLOCK(sc); buflen = sc->sc_count * sizeof(struct lagg_reqport); outbuf = malloc(buflen, M_TEMP, M_WAITOK | M_ZERO); ra->ra_proto = sc->sc_proto; lagg_proto_request(sc, &ra->ra_psc); count = 0; buf = outbuf; len = min(ra->ra_size, buflen); CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { if (len < sizeof(rpbuf)) break; lagg_port2req(lp, &rpbuf); memcpy(buf, &rpbuf, sizeof(rpbuf)); count++; buf += sizeof(rpbuf); len -= sizeof(rpbuf); } LAGG_XUNLOCK(sc); ra->ra_ports = count; ra->ra_size = count * sizeof(rpbuf); error = copyout(outbuf, ra->ra_port, ra->ra_size); free(outbuf, M_TEMP); break; case SIOCSLAGG: error = priv_check(td, PRIV_NET_LAGG); if (error) break; if (ra->ra_proto >= LAGG_PROTO_MAX) { error = EPROTONOSUPPORT; break; } LAGG_XLOCK(sc); lagg_proto_detach(sc); LAGG_UNLOCK_ASSERT(); lagg_proto_attach(sc, ra->ra_proto); LAGG_XUNLOCK(sc); break; case SIOCGLAGGOPTS: LAGG_XLOCK(sc); ro->ro_opts = sc->sc_opts; if (sc->sc_proto == LAGG_PROTO_LACP) { struct lacp_softc *lsc; lsc = (struct lacp_softc *)sc->sc_psc; if (lsc->lsc_debug.lsc_tx_test != 0) ro->ro_opts |= LAGG_OPT_LACP_TXTEST; if (lsc->lsc_debug.lsc_rx_test != 0) ro->ro_opts |= LAGG_OPT_LACP_RXTEST; if (lsc->lsc_strict_mode != 0) ro->ro_opts |= LAGG_OPT_LACP_STRICT; if (lsc->lsc_fast_timeout != 0) ro->ro_opts |= LAGG_OPT_LACP_TIMEOUT; ro->ro_active = sc->sc_active; } else { ro->ro_active = 0; CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) ro->ro_active += LAGG_PORTACTIVE(lp); } ro->ro_bkt = sc->sc_bkt; ro->ro_flapping = sc->sc_flapping; ro->ro_flowid_shift = sc->flowid_shift; LAGG_XUNLOCK(sc); break; case SIOCSLAGGOPTS: if (sc->sc_proto == LAGG_PROTO_ROUNDROBIN) { if (ro->ro_bkt == 0) sc->sc_bkt = 1; // Minimum 1 packet per iface. else sc->sc_bkt = ro->ro_bkt; } error = priv_check(td, PRIV_NET_LAGG); if (error) break; if (ro->ro_opts == 0) break; /* * Set options. LACP options are stored in sc->sc_psc, * not in sc_opts. */ int valid, lacp; switch (ro->ro_opts) { case LAGG_OPT_USE_FLOWID: case -LAGG_OPT_USE_FLOWID: case LAGG_OPT_FLOWIDSHIFT: valid = 1; lacp = 0; break; case LAGG_OPT_LACP_TXTEST: case -LAGG_OPT_LACP_TXTEST: case LAGG_OPT_LACP_RXTEST: case -LAGG_OPT_LACP_RXTEST: case LAGG_OPT_LACP_STRICT: case -LAGG_OPT_LACP_STRICT: case LAGG_OPT_LACP_TIMEOUT: case -LAGG_OPT_LACP_TIMEOUT: valid = lacp = 1; break; default: valid = lacp = 0; break; } LAGG_XLOCK(sc); if (valid == 0 || (lacp == 1 && sc->sc_proto != LAGG_PROTO_LACP)) { /* Invalid combination of options specified. */ error = EINVAL; LAGG_XUNLOCK(sc); break; /* Return from SIOCSLAGGOPTS. */ } /* * Store new options into sc->sc_opts except for * FLOWIDSHIFT and LACP options. */ if (lacp == 0) { if (ro->ro_opts == LAGG_OPT_FLOWIDSHIFT) sc->flowid_shift = ro->ro_flowid_shift; else if (ro->ro_opts > 0) sc->sc_opts |= ro->ro_opts; else sc->sc_opts &= ~ro->ro_opts; } else { struct lacp_softc *lsc; struct lacp_port *lp; lsc = (struct lacp_softc *)sc->sc_psc; switch (ro->ro_opts) { case LAGG_OPT_LACP_TXTEST: lsc->lsc_debug.lsc_tx_test = 1; break; case -LAGG_OPT_LACP_TXTEST: lsc->lsc_debug.lsc_tx_test = 0; break; case LAGG_OPT_LACP_RXTEST: lsc->lsc_debug.lsc_rx_test = 1; break; case -LAGG_OPT_LACP_RXTEST: lsc->lsc_debug.lsc_rx_test = 0; break; case LAGG_OPT_LACP_STRICT: lsc->lsc_strict_mode = 1; break; case -LAGG_OPT_LACP_STRICT: lsc->lsc_strict_mode = 0; break; case LAGG_OPT_LACP_TIMEOUT: LACP_LOCK(lsc); LIST_FOREACH(lp, &lsc->lsc_ports, lp_next) lp->lp_state |= LACP_STATE_TIMEOUT; LACP_UNLOCK(lsc); lsc->lsc_fast_timeout = 1; break; case -LAGG_OPT_LACP_TIMEOUT: LACP_LOCK(lsc); LIST_FOREACH(lp, &lsc->lsc_ports, lp_next) lp->lp_state &= ~LACP_STATE_TIMEOUT; LACP_UNLOCK(lsc); lsc->lsc_fast_timeout = 0; break; } } LAGG_XUNLOCK(sc); break; case SIOCGLAGGFLAGS: rf->rf_flags = 0; LAGG_XLOCK(sc); if (sc->sc_flags & MBUF_HASHFLAG_L2) rf->rf_flags |= LAGG_F_HASHL2; if (sc->sc_flags & MBUF_HASHFLAG_L3) rf->rf_flags |= LAGG_F_HASHL3; if (sc->sc_flags & MBUF_HASHFLAG_L4) rf->rf_flags |= LAGG_F_HASHL4; LAGG_XUNLOCK(sc); break; case SIOCSLAGGHASH: error = priv_check(td, PRIV_NET_LAGG); if (error) break; if ((rf->rf_flags & LAGG_F_HASHMASK) == 0) { error = EINVAL; break; } LAGG_XLOCK(sc); sc->sc_flags = 0; if (rf->rf_flags & LAGG_F_HASHL2) sc->sc_flags |= MBUF_HASHFLAG_L2; if (rf->rf_flags & LAGG_F_HASHL3) sc->sc_flags |= MBUF_HASHFLAG_L3; if (rf->rf_flags & LAGG_F_HASHL4) sc->sc_flags |= MBUF_HASHFLAG_L4; LAGG_XUNLOCK(sc); break; case SIOCGLAGGPORT: if (rp->rp_portname[0] == '\0' || (tpif = ifunit_ref(rp->rp_portname)) == NULL) { error = EINVAL; break; } LAGG_RLOCK(); if ((lp = (struct lagg_port *)tpif->if_lagg) == NULL || lp->lp_softc != sc) { error = ENOENT; LAGG_RUNLOCK(); if_rele(tpif); break; } lagg_port2req(lp, rp); LAGG_RUNLOCK(); if_rele(tpif); break; case SIOCSLAGGPORT: error = priv_check(td, PRIV_NET_LAGG); if (error) break; if (rp->rp_portname[0] == '\0' || (tpif = ifunit_ref(rp->rp_portname)) == NULL) { error = EINVAL; break; } #ifdef INET6 /* * A laggport interface should not have inet6 address * because two interfaces with a valid link-local * scope zone must not be merged in any form. This * restriction is needed to prevent violation of * link-local scope zone. Attempts to add a laggport * interface which has inet6 addresses triggers * removal of all inet6 addresses on the member * interface. */ if (in6ifa_llaonifp(tpif)) { in6_ifdetach(tpif); if_printf(sc->sc_ifp, "IPv6 addresses on %s have been removed " "before adding it as a member to prevent " "IPv6 address scope violation.\n", tpif->if_xname); } #endif LAGG_XLOCK(sc); error = lagg_port_create(sc, tpif); LAGG_XUNLOCK(sc); if_rele(tpif); VLAN_CAPABILITIES(ifp); break; case SIOCSLAGGDELPORT: error = priv_check(td, PRIV_NET_LAGG); if (error) break; if (rp->rp_portname[0] == '\0' || (tpif = ifunit_ref(rp->rp_portname)) == NULL) { error = EINVAL; break; } LAGG_XLOCK(sc); if ((lp = (struct lagg_port *)tpif->if_lagg) == NULL || lp->lp_softc != sc) { error = ENOENT; LAGG_XUNLOCK(sc); if_rele(tpif); break; } error = lagg_port_destroy(lp, 1); LAGG_XUNLOCK(sc); if_rele(tpif); VLAN_CAPABILITIES(ifp); break; case SIOCSIFFLAGS: /* Set flags on ports too */ LAGG_XLOCK(sc); CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { lagg_setflags(lp, 1); } if (!(ifp->if_flags & IFF_UP) && (ifp->if_drv_flags & IFF_DRV_RUNNING)) { /* * If interface is marked down and it is running, * then stop and disable it. */ lagg_stop(sc); LAGG_XUNLOCK(sc); } else if ((ifp->if_flags & IFF_UP) && !(ifp->if_drv_flags & IFF_DRV_RUNNING)) { /* * If interface is marked up and it is stopped, then * start it. */ LAGG_XUNLOCK(sc); (*ifp->if_init)(sc); } else LAGG_XUNLOCK(sc); break; case SIOCADDMULTI: case SIOCDELMULTI: LAGG_XLOCK(sc); CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { lagg_clrmulti(lp); lagg_setmulti(lp); } LAGG_XUNLOCK(sc); error = 0; break; case SIOCSIFMEDIA: case SIOCGIFMEDIA: error = ifmedia_ioctl(ifp, ifr, &sc->sc_media, cmd); break; case SIOCSIFCAP: LAGG_XLOCK(sc); CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { if (lp->lp_ioctl != NULL) (*lp->lp_ioctl)(lp->lp_ifp, cmd, data); } lagg_capabilities(sc); LAGG_XUNLOCK(sc); VLAN_CAPABILITIES(ifp); error = 0; break; case SIOCSIFMTU: /* Do not allow the MTU to be directly changed */ error = EINVAL; break; default: error = ether_ioctl(ifp, cmd, data); break; } return (error); } #ifdef RATELIMIT static int lagg_snd_tag_alloc(struct ifnet *ifp, union if_snd_tag_alloc_params *params, struct m_snd_tag **ppmt) { struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc; struct lagg_port *lp; struct lagg_lb *lb; uint32_t p; switch (sc->sc_proto) { case LAGG_PROTO_FAILOVER: lp = lagg_link_active(sc, sc->sc_primary); break; case LAGG_PROTO_LOADBALANCE: if ((sc->sc_opts & LAGG_OPT_USE_FLOWID) == 0 || params->hdr.flowtype == M_HASHTYPE_NONE) return (EOPNOTSUPP); p = params->hdr.flowid >> sc->flowid_shift; p %= sc->sc_count; lb = (struct lagg_lb *)sc->sc_psc; lp = lb->lb_ports[p]; lp = lagg_link_active(sc, lp); break; case LAGG_PROTO_LACP: if ((sc->sc_opts & LAGG_OPT_USE_FLOWID) == 0 || params->hdr.flowtype == M_HASHTYPE_NONE) return (EOPNOTSUPP); lp = lacp_select_tx_port_by_hash(sc, params->hdr.flowid); break; default: return (EOPNOTSUPP); } if (lp == NULL) return (EOPNOTSUPP); ifp = lp->lp_ifp; if (ifp == NULL || ifp->if_snd_tag_alloc == NULL || (ifp->if_capenable & IFCAP_TXRTLMT) == 0) return (EOPNOTSUPP); /* forward allocation request */ return (ifp->if_snd_tag_alloc(ifp, params, ppmt)); } #endif static int lagg_setmulti(struct lagg_port *lp) { struct lagg_softc *sc = lp->lp_softc; struct ifnet *ifp = lp->lp_ifp; struct ifnet *scifp = sc->sc_ifp; struct lagg_mc *mc; struct ifmultiaddr *ifma; int error; IF_ADDR_WLOCK(scifp); CK_STAILQ_FOREACH(ifma, &scifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_LINK) continue; mc = malloc(sizeof(struct lagg_mc), M_DEVBUF, M_NOWAIT); if (mc == NULL) { IF_ADDR_WUNLOCK(scifp); return (ENOMEM); } bcopy(ifma->ifma_addr, &mc->mc_addr, ifma->ifma_addr->sa_len); mc->mc_addr.sdl_index = ifp->if_index; mc->mc_ifma = NULL; SLIST_INSERT_HEAD(&lp->lp_mc_head, mc, mc_entries); } IF_ADDR_WUNLOCK(scifp); SLIST_FOREACH (mc, &lp->lp_mc_head, mc_entries) { error = if_addmulti(ifp, (struct sockaddr *)&mc->mc_addr, &mc->mc_ifma); if (error) return (error); } return (0); } static int lagg_clrmulti(struct lagg_port *lp) { struct lagg_mc *mc; LAGG_XLOCK_ASSERT(lp->lp_softc); while ((mc = SLIST_FIRST(&lp->lp_mc_head)) != NULL) { SLIST_REMOVE(&lp->lp_mc_head, mc, lagg_mc, mc_entries); if (mc->mc_ifma && lp->lp_detaching == 0) if_delmulti_ifma(mc->mc_ifma); free(mc, M_DEVBUF); } return (0); } static int lagg_setcaps(struct lagg_port *lp, int cap) { struct ifreq ifr; if (lp->lp_ifp->if_capenable == cap) return (0); if (lp->lp_ioctl == NULL) return (ENXIO); ifr.ifr_reqcap = cap; return ((*lp->lp_ioctl)(lp->lp_ifp, SIOCSIFCAP, (caddr_t)&ifr)); } /* Handle a ref counted flag that should be set on the lagg port as well */ static int lagg_setflag(struct lagg_port *lp, int flag, int status, int (*func)(struct ifnet *, int)) { struct lagg_softc *sc = lp->lp_softc; struct ifnet *scifp = sc->sc_ifp; struct ifnet *ifp = lp->lp_ifp; int error; LAGG_XLOCK_ASSERT(sc); status = status ? (scifp->if_flags & flag) : 0; /* Now "status" contains the flag value or 0 */ /* * See if recorded ports status is different from what * we want it to be. If it is, flip it. We record ports * status in lp_ifflags so that we won't clear ports flag * we haven't set. In fact, we don't clear or set ports * flags directly, but get or release references to them. * That's why we can be sure that recorded flags still are * in accord with actual ports flags. */ if (status != (lp->lp_ifflags & flag)) { error = (*func)(ifp, status); if (error) return (error); lp->lp_ifflags &= ~flag; lp->lp_ifflags |= status; } return (0); } /* * Handle IFF_* flags that require certain changes on the lagg port * if "status" is true, update ports flags respective to the lagg * if "status" is false, forcedly clear the flags set on port. */ static int lagg_setflags(struct lagg_port *lp, int status) { int error, i; for (i = 0; lagg_pflags[i].flag; i++) { error = lagg_setflag(lp, lagg_pflags[i].flag, status, lagg_pflags[i].func); if (error) return (error); } return (0); } static int lagg_transmit(struct ifnet *ifp, struct mbuf *m) { struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc; - int error, len, mcast; - - len = m->m_pkthdr.len; - mcast = (m->m_flags & (M_MCAST | M_BCAST)) ? 1 : 0; + int error; LAGG_RLOCK(); /* We need a Tx algorithm and at least one port */ if (sc->sc_proto == LAGG_PROTO_NONE || sc->sc_count == 0) { LAGG_RUNLOCK(); m_freem(m); if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return (ENXIO); } ETHER_BPF_MTAP(ifp, m); error = lagg_proto_start(sc, m); LAGG_RUNLOCK(); if (error != 0) if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return (error); } /* * The ifp->if_qflush entry point for lagg(4) is no-op. */ static void lagg_qflush(struct ifnet *ifp __unused) { } static struct mbuf * lagg_input(struct ifnet *ifp, struct mbuf *m) { struct lagg_port *lp = ifp->if_lagg; struct lagg_softc *sc = lp->lp_softc; struct ifnet *scifp = sc->sc_ifp; LAGG_RLOCK(); if ((scifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || (lp->lp_flags & LAGG_PORT_DISABLED) || sc->sc_proto == LAGG_PROTO_NONE) { LAGG_RUNLOCK(); m_freem(m); return (NULL); } ETHER_BPF_MTAP(scifp, m); if (lp->lp_detaching != 0) { m_freem(m); m = NULL; } else m = lagg_proto_input(sc, lp, m); if (m != NULL) { if (scifp->if_flags & IFF_MONITOR) { m_freem(m); m = NULL; } } LAGG_RUNLOCK(); return (m); } static int lagg_media_change(struct ifnet *ifp) { struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc; if (sc->sc_ifflags & IFF_DEBUG) printf("%s\n", __func__); /* Ignore */ return (0); } static void lagg_media_status(struct ifnet *ifp, struct ifmediareq *imr) { struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc; struct lagg_port *lp; imr->ifm_status = IFM_AVALID; imr->ifm_active = IFM_ETHER | IFM_AUTO; LAGG_RLOCK(); CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { if (LAGG_PORTACTIVE(lp)) imr->ifm_status |= IFM_ACTIVE; } LAGG_RUNLOCK(); } static void lagg_linkstate(struct lagg_softc *sc) { struct lagg_port *lp; int new_link = LINK_STATE_DOWN; uint64_t speed; LAGG_XLOCK_ASSERT(sc); /* Our link is considered up if at least one of our ports is active */ LAGG_RLOCK(); CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { if (lp->lp_ifp->if_link_state == LINK_STATE_UP) { new_link = LINK_STATE_UP; break; } } LAGG_RUNLOCK(); if_link_state_change(sc->sc_ifp, new_link); /* Update if_baudrate to reflect the max possible speed */ switch (sc->sc_proto) { case LAGG_PROTO_FAILOVER: sc->sc_ifp->if_baudrate = sc->sc_primary != NULL ? sc->sc_primary->lp_ifp->if_baudrate : 0; break; case LAGG_PROTO_ROUNDROBIN: case LAGG_PROTO_LOADBALANCE: case LAGG_PROTO_BROADCAST: speed = 0; LAGG_RLOCK(); CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) speed += lp->lp_ifp->if_baudrate; LAGG_RUNLOCK(); sc->sc_ifp->if_baudrate = speed; break; case LAGG_PROTO_LACP: /* LACP updates if_baudrate itself */ break; } } static void lagg_port_state(struct ifnet *ifp, int state) { struct lagg_port *lp = (struct lagg_port *)ifp->if_lagg; struct lagg_softc *sc = NULL; if (lp != NULL) sc = lp->lp_softc; if (sc == NULL) return; LAGG_XLOCK(sc); lagg_linkstate(sc); lagg_proto_linkstate(sc, lp); LAGG_XUNLOCK(sc); } struct lagg_port * lagg_link_active(struct lagg_softc *sc, struct lagg_port *lp) { struct lagg_port *lp_next, *rval = NULL; /* * Search a port which reports an active link state. */ if (lp == NULL) goto search; if (LAGG_PORTACTIVE(lp)) { rval = lp; goto found; } if ((lp_next = SLIST_NEXT(lp, lp_entries)) != NULL && LAGG_PORTACTIVE(lp_next)) { rval = lp_next; goto found; } search: LAGG_RLOCK(); CK_SLIST_FOREACH(lp_next, &sc->sc_ports, lp_entries) { if (LAGG_PORTACTIVE(lp_next)) { LAGG_RUNLOCK(); rval = lp_next; goto found; } } LAGG_RUNLOCK(); found: return (rval); } int lagg_enqueue(struct ifnet *ifp, struct mbuf *m) { return (ifp->if_transmit)(ifp, m); } /* * Simple round robin aggregation */ static void lagg_rr_attach(struct lagg_softc *sc) { sc->sc_seq = 0; sc->sc_bkt_count = sc->sc_bkt; } static int lagg_rr_start(struct lagg_softc *sc, struct mbuf *m) { struct lagg_port *lp; uint32_t p; if (sc->sc_bkt_count == 0 && sc->sc_bkt > 0) sc->sc_bkt_count = sc->sc_bkt; if (sc->sc_bkt > 0) { atomic_subtract_int(&sc->sc_bkt_count, 1); if (atomic_cmpset_int(&sc->sc_bkt_count, 0, sc->sc_bkt)) p = atomic_fetchadd_32(&sc->sc_seq, 1); else p = sc->sc_seq; } else p = atomic_fetchadd_32(&sc->sc_seq, 1); p %= sc->sc_count; lp = SLIST_FIRST(&sc->sc_ports); while (p--) lp = SLIST_NEXT(lp, lp_entries); /* * Check the port's link state. This will return the next active * port if the link is down or the port is NULL. */ if ((lp = lagg_link_active(sc, lp)) == NULL) { m_freem(m); return (ENETDOWN); } /* Send mbuf */ return (lagg_enqueue(lp->lp_ifp, m)); } static struct mbuf * lagg_rr_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m) { struct ifnet *ifp = sc->sc_ifp; /* Just pass in the packet to our lagg device */ m->m_pkthdr.rcvif = ifp; return (m); } /* * Broadcast mode */ static int lagg_bcast_start(struct lagg_softc *sc, struct mbuf *m) { int active_ports = 0; int errors = 0; int ret; struct lagg_port *lp, *last = NULL; struct mbuf *m0; LAGG_RLOCK(); CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { if (!LAGG_PORTACTIVE(lp)) continue; active_ports++; if (last != NULL) { m0 = m_copym(m, 0, M_COPYALL, M_NOWAIT); if (m0 == NULL) { ret = ENOBUFS; errors++; break; } ret = lagg_enqueue(last->lp_ifp, m0); if (ret != 0) errors++; } last = lp; } LAGG_RUNLOCK(); if (last == NULL) { m_freem(m); return (ENOENT); } if ((last = lagg_link_active(sc, last)) == NULL) { m_freem(m); return (ENETDOWN); } ret = lagg_enqueue(last->lp_ifp, m); if (ret != 0) errors++; if (errors == 0) return (ret); return (0); } static struct mbuf* lagg_bcast_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m) { struct ifnet *ifp = sc->sc_ifp; /* Just pass in the packet to our lagg device */ m->m_pkthdr.rcvif = ifp; return (m); } /* * Active failover */ static int lagg_fail_start(struct lagg_softc *sc, struct mbuf *m) { struct lagg_port *lp; /* Use the master port if active or the next available port */ if ((lp = lagg_link_active(sc, sc->sc_primary)) == NULL) { m_freem(m); return (ENETDOWN); } /* Send mbuf */ return (lagg_enqueue(lp->lp_ifp, m)); } static struct mbuf * lagg_fail_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m) { struct ifnet *ifp = sc->sc_ifp; struct lagg_port *tmp_tp; if (lp == sc->sc_primary || V_lagg_failover_rx_all) { m->m_pkthdr.rcvif = ifp; return (m); } if (!LAGG_PORTACTIVE(sc->sc_primary)) { tmp_tp = lagg_link_active(sc, sc->sc_primary); /* * If tmp_tp is null, we've received a packet when all * our links are down. Weird, but process it anyways. */ if ((tmp_tp == NULL || tmp_tp == lp)) { m->m_pkthdr.rcvif = ifp; return (m); } } m_freem(m); return (NULL); } /* * Loadbalancing */ static void lagg_lb_attach(struct lagg_softc *sc) { struct lagg_port *lp; struct lagg_lb *lb; LAGG_XLOCK_ASSERT(sc); lb = malloc(sizeof(struct lagg_lb), M_DEVBUF, M_WAITOK | M_ZERO); lb->lb_key = m_ether_tcpip_hash_init(); sc->sc_psc = lb; CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) lagg_lb_port_create(lp); } static void lagg_lb_detach(struct lagg_softc *sc) { struct lagg_lb *lb; lb = (struct lagg_lb *)sc->sc_psc; if (lb != NULL) free(lb, M_DEVBUF); } static int lagg_lb_porttable(struct lagg_softc *sc, struct lagg_port *lp) { struct lagg_lb *lb = (struct lagg_lb *)sc->sc_psc; struct lagg_port *lp_next; int i = 0; bzero(&lb->lb_ports, sizeof(lb->lb_ports)); LAGG_RLOCK(); CK_SLIST_FOREACH(lp_next, &sc->sc_ports, lp_entries) { if (lp_next == lp) continue; if (i >= LAGG_MAX_PORTS) return (EINVAL); if (sc->sc_ifflags & IFF_DEBUG) printf("%s: port %s at index %d\n", sc->sc_ifname, lp_next->lp_ifp->if_xname, i); lb->lb_ports[i++] = lp_next; } LAGG_RUNLOCK(); return (0); } static int lagg_lb_port_create(struct lagg_port *lp) { struct lagg_softc *sc = lp->lp_softc; return (lagg_lb_porttable(sc, NULL)); } static void lagg_lb_port_destroy(struct lagg_port *lp) { struct lagg_softc *sc = lp->lp_softc; lagg_lb_porttable(sc, lp); } static int lagg_lb_start(struct lagg_softc *sc, struct mbuf *m) { struct lagg_lb *lb = (struct lagg_lb *)sc->sc_psc; struct lagg_port *lp = NULL; uint32_t p = 0; if ((sc->sc_opts & LAGG_OPT_USE_FLOWID) && M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) p = m->m_pkthdr.flowid >> sc->flowid_shift; else p = m_ether_tcpip_hash(sc->sc_flags, m, lb->lb_key); p %= sc->sc_count; lp = lb->lb_ports[p]; /* * Check the port's link state. This will return the next active * port if the link is down or the port is NULL. */ if ((lp = lagg_link_active(sc, lp)) == NULL) { m_freem(m); return (ENETDOWN); } /* Send mbuf */ return (lagg_enqueue(lp->lp_ifp, m)); } static struct mbuf * lagg_lb_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m) { struct ifnet *ifp = sc->sc_ifp; /* Just pass in the packet to our lagg device */ m->m_pkthdr.rcvif = ifp; return (m); } /* * 802.3ad LACP */ static void lagg_lacp_attach(struct lagg_softc *sc) { struct lagg_port *lp; lacp_attach(sc); LAGG_XLOCK_ASSERT(sc); CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) lacp_port_create(lp); } static void lagg_lacp_detach(struct lagg_softc *sc) { struct lagg_port *lp; void *psc; LAGG_XLOCK_ASSERT(sc); CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) lacp_port_destroy(lp); psc = sc->sc_psc; sc->sc_psc = NULL; lacp_detach(psc); } static void lagg_lacp_lladdr(struct lagg_softc *sc) { struct lagg_port *lp; LAGG_SXLOCK_ASSERT(sc); /* purge all the lacp ports */ CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) lacp_port_destroy(lp); /* add them back in */ CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) lacp_port_create(lp); } static int lagg_lacp_start(struct lagg_softc *sc, struct mbuf *m) { struct lagg_port *lp; lp = lacp_select_tx_port(sc, m); if (lp == NULL) { m_freem(m); return (ENETDOWN); } /* Send mbuf */ return (lagg_enqueue(lp->lp_ifp, m)); } static struct mbuf * lagg_lacp_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m) { struct ifnet *ifp = sc->sc_ifp; struct ether_header *eh; u_short etype; eh = mtod(m, struct ether_header *); etype = ntohs(eh->ether_type); /* Tap off LACP control messages */ if ((m->m_flags & M_VLANTAG) == 0 && etype == ETHERTYPE_SLOW) { m = lacp_input(lp, m); if (m == NULL) return (NULL); } /* * If the port is not collecting or not in the active aggregator then * free and return. */ if (lacp_iscollecting(lp) == 0 || lacp_isactive(lp) == 0) { m_freem(m); return (NULL); } m->m_pkthdr.rcvif = ifp; return (m); } Index: head/sys/net/if_stf.c =================================================================== --- head/sys/net/if_stf.c (revision 333863) +++ head/sys/net/if_stf.c (revision 333864) @@ -1,767 +1,767 @@ /* $FreeBSD$ */ /* $KAME: if_stf.c,v 1.73 2001/12/03 11:08:30 keiichi Exp $ */ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (C) 2000 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * 6to4 interface, based on RFC3056. * * 6to4 interface is NOT capable of link-layer (I mean, IPv4) multicasting. * There is no address mapping defined from IPv6 multicast address to IPv4 * address. Therefore, we do not have IFF_MULTICAST on the interface. * * Due to the lack of address mapping for link-local addresses, we cannot * throw packets toward link-local addresses (fe80::x). Also, we cannot throw * packets to link-local multicast addresses (ff02::x). * * Here are interesting symptoms due to the lack of link-local address: * * Unicast routing exchange: * - RIPng: Impossible. Uses link-local multicast packet toward ff02::9, * and link-local addresses as nexthop. * - OSPFv6: Impossible. OSPFv6 assumes that there's link-local address * assigned to the link, and makes use of them. Also, HELLO packets use * link-local multicast addresses (ff02::5 and ff02::6). * - BGP4+: Maybe. You can only use global address as nexthop, and global * address as TCP endpoint address. * * Multicast routing protocols: * - PIM: Hello packet cannot be used to discover adjacent PIM routers. * Adjacent PIM routers must be configured manually (is it really spec-wise * correct thing to do?). * * ICMPv6: * - Redirects cannot be used due to the lack of link-local address. * * stf interface does not have, and will not need, a link-local address. * It seems to have no real benefit and does not help the above symptoms much. * Even if we assign link-locals to interface, we cannot really * use link-local unicast/multicast on top of 6to4 cloud (since there's no * encapsulation defined for link-local address), and the above analysis does * not change. RFC3056 does not mandate the assignment of link-local address * either. * * 6to4 interface has security issues. Refer to * http://playground.iijlab.net/i-d/draft-itojun-ipv6-transition-abuse-00.txt * for details. The code tries to filter out some of malicious packets. * Note that there is no way to be 100% secure. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include SYSCTL_DECL(_net_link); static SYSCTL_NODE(_net_link, IFT_STF, stf, CTLFLAG_RW, 0, "6to4 Interface"); static int stf_permit_rfc1918 = 0; SYSCTL_INT(_net_link_stf, OID_AUTO, permit_rfc1918, CTLFLAG_RWTUN, &stf_permit_rfc1918, 0, "Permit the use of private IPv4 addresses"); #define STFUNIT 0 #define IN6_IS_ADDR_6TO4(x) (ntohs((x)->s6_addr16[0]) == 0x2002) /* * XXX: Return a pointer with 16-bit aligned. Don't cast it to * struct in_addr *; use bcopy() instead. */ #define GET_V4(x) (&(x)->s6_addr16[1]) struct stf_softc { struct ifnet *sc_ifp; u_int sc_fibnum; const struct encaptab *encap_cookie; }; #define STF2IFP(sc) ((sc)->sc_ifp) static const char stfname[] = "stf"; static MALLOC_DEFINE(M_STF, stfname, "6to4 Tunnel Interface"); static const int ip_stf_ttl = 40; extern struct domain inetdomain; static int in_stf_input(struct mbuf **, int *, int); static struct protosw in_stf_protosw = { .pr_type = SOCK_RAW, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_IPV6, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = in_stf_input, .pr_output = rip_output, .pr_ctloutput = rip_ctloutput, .pr_usrreqs = &rip_usrreqs }; static char *stfnames[] = {"stf0", "stf", "6to4", NULL}; static int stfmodevent(module_t, int, void *); static int stf_encapcheck(const struct mbuf *, int, int, void *); static int stf_getsrcifa6(struct ifnet *, struct in6_addr *, struct in6_addr *); static int stf_output(struct ifnet *, struct mbuf *, const struct sockaddr *, struct route *); static int isrfc1918addr(struct in_addr *); static int stf_checkaddr4(struct stf_softc *, struct in_addr *, struct ifnet *); static int stf_checkaddr6(struct stf_softc *, struct in6_addr *, struct ifnet *); static int stf_ioctl(struct ifnet *, u_long, caddr_t); static int stf_clone_match(struct if_clone *, const char *); static int stf_clone_create(struct if_clone *, char *, size_t, caddr_t); static int stf_clone_destroy(struct if_clone *, struct ifnet *); static struct if_clone *stf_cloner; static int stf_clone_match(struct if_clone *ifc, const char *name) { int i; for(i = 0; stfnames[i] != NULL; i++) { if (strcmp(stfnames[i], name) == 0) return (1); } return (0); } static int stf_clone_create(struct if_clone *ifc, char *name, size_t len, caddr_t params) { char *dp; int err, unit, wildcard; struct stf_softc *sc; struct ifnet *ifp; err = ifc_name2unit(name, &unit); if (err != 0) return (err); wildcard = (unit < 0); /* * We can only have one unit, but since unit allocation is * already locked, we use it to keep from allocating extra * interfaces. */ unit = STFUNIT; err = ifc_alloc_unit(ifc, &unit); if (err != 0) return (err); sc = malloc(sizeof(struct stf_softc), M_STF, M_WAITOK | M_ZERO); ifp = STF2IFP(sc) = if_alloc(IFT_STF); if (ifp == NULL) { free(sc, M_STF); ifc_free_unit(ifc, unit); return (ENOSPC); } ifp->if_softc = sc; sc->sc_fibnum = curthread->td_proc->p_fibnum; /* * Set the name manually rather then using if_initname because * we don't conform to the default naming convention for interfaces. * In the wildcard case, we need to update the name. */ if (wildcard) { for (dp = name; *dp != '\0'; dp++); if (snprintf(dp, len - (dp-name), "%d", unit) > len - (dp-name) - 1) { /* * This can only be a programmer error and * there's no straightforward way to recover if * it happens. */ panic("if_clone_create(): interface name too long"); } } strlcpy(ifp->if_xname, name, IFNAMSIZ); ifp->if_dname = stfname; ifp->if_dunit = IF_DUNIT_NONE; sc->encap_cookie = encap_attach_func(AF_INET, IPPROTO_IPV6, stf_encapcheck, &in_stf_protosw, sc); if (sc->encap_cookie == NULL) { if_printf(ifp, "attach failed\n"); free(sc, M_STF); ifc_free_unit(ifc, unit); return (ENOMEM); } ifp->if_mtu = IPV6_MMTU; ifp->if_ioctl = stf_ioctl; ifp->if_output = stf_output; ifp->if_snd.ifq_maxlen = ifqmaxlen; if_attach(ifp); bpfattach(ifp, DLT_NULL, sizeof(u_int32_t)); return (0); } static int stf_clone_destroy(struct if_clone *ifc, struct ifnet *ifp) { struct stf_softc *sc = ifp->if_softc; - int err; + int err __unused; err = encap_detach(sc->encap_cookie); KASSERT(err == 0, ("Unexpected error detaching encap_cookie")); bpfdetach(ifp); if_detach(ifp); if_free(ifp); free(sc, M_STF); ifc_free_unit(ifc, STFUNIT); return (0); } static int stfmodevent(module_t mod, int type, void *data) { switch (type) { case MOD_LOAD: stf_cloner = if_clone_advanced(stfname, 0, stf_clone_match, stf_clone_create, stf_clone_destroy); break; case MOD_UNLOAD: if_clone_detach(stf_cloner); break; default: return (EOPNOTSUPP); } return (0); } static moduledata_t stf_mod = { "if_stf", stfmodevent, 0 }; DECLARE_MODULE(if_stf, stf_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); static int stf_encapcheck(const struct mbuf *m, int off, int proto, void *arg) { struct ip ip; struct stf_softc *sc; struct in_addr a, b, mask; struct in6_addr addr6, mask6; sc = (struct stf_softc *)arg; if (sc == NULL) return 0; if ((STF2IFP(sc)->if_flags & IFF_UP) == 0) return 0; /* IFF_LINK0 means "no decapsulation" */ if ((STF2IFP(sc)->if_flags & IFF_LINK0) != 0) return 0; if (proto != IPPROTO_IPV6) return 0; m_copydata(m, 0, sizeof(ip), (caddr_t)&ip); if (ip.ip_v != 4) return 0; if (stf_getsrcifa6(STF2IFP(sc), &addr6, &mask6) != 0) return (0); /* * check if IPv4 dst matches the IPv4 address derived from the * local 6to4 address. * success on: dst = 10.1.1.1, ia6->ia_addr = 2002:0a01:0101:... */ if (bcmp(GET_V4(&addr6), &ip.ip_dst, sizeof(ip.ip_dst)) != 0) return 0; /* * check if IPv4 src matches the IPv4 address derived from the * local 6to4 address masked by prefixmask. * success on: src = 10.1.1.1, ia6->ia_addr = 2002:0a00:.../24 * fail on: src = 10.1.1.1, ia6->ia_addr = 2002:0b00:.../24 */ bzero(&a, sizeof(a)); bcopy(GET_V4(&addr6), &a, sizeof(a)); bcopy(GET_V4(&mask6), &mask, sizeof(mask)); a.s_addr &= mask.s_addr; b = ip.ip_src; b.s_addr &= mask.s_addr; if (a.s_addr != b.s_addr) return 0; /* stf interface makes single side match only */ return 32; } static int stf_getsrcifa6(struct ifnet *ifp, struct in6_addr *addr, struct in6_addr *mask) { struct ifaddr *ia; struct in_ifaddr *ia4; struct in6_ifaddr *ia6; struct sockaddr_in6 *sin6; struct in_addr in; if_addr_rlock(ifp); CK_STAILQ_FOREACH(ia, &ifp->if_addrhead, ifa_link) { if (ia->ifa_addr->sa_family != AF_INET6) continue; sin6 = (struct sockaddr_in6 *)ia->ifa_addr; if (!IN6_IS_ADDR_6TO4(&sin6->sin6_addr)) continue; bcopy(GET_V4(&sin6->sin6_addr), &in, sizeof(in)); LIST_FOREACH(ia4, INADDR_HASH(in.s_addr), ia_hash) if (ia4->ia_addr.sin_addr.s_addr == in.s_addr) break; if (ia4 == NULL) continue; ia6 = (struct in6_ifaddr *)ia; *addr = sin6->sin6_addr; *mask = ia6->ia_prefixmask.sin6_addr; if_addr_runlock(ifp); return (0); } if_addr_runlock(ifp); return (ENOENT); } static int stf_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro) { struct stf_softc *sc; const struct sockaddr_in6 *dst6; struct in_addr in4; const void *ptr; u_int8_t tos; struct ip *ip; struct ip6_hdr *ip6; struct in6_addr addr6, mask6; int error; #ifdef MAC error = mac_ifnet_check_transmit(ifp, m); if (error) { m_freem(m); return (error); } #endif sc = ifp->if_softc; dst6 = (const struct sockaddr_in6 *)dst; /* just in case */ if ((ifp->if_flags & IFF_UP) == 0) { m_freem(m); if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return ENETDOWN; } /* * If we don't have an ip4 address that match my inner ip6 address, * we shouldn't generate output. Without this check, we'll end up * using wrong IPv4 source. */ if (stf_getsrcifa6(ifp, &addr6, &mask6) != 0) { m_freem(m); if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return ENETDOWN; } if (m->m_len < sizeof(*ip6)) { m = m_pullup(m, sizeof(*ip6)); if (!m) { if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return ENOBUFS; } } ip6 = mtod(m, struct ip6_hdr *); tos = (ntohl(ip6->ip6_flow) >> 20) & 0xff; /* * Pickup the right outer dst addr from the list of candidates. * ip6_dst has priority as it may be able to give us shorter IPv4 hops. */ ptr = NULL; if (IN6_IS_ADDR_6TO4(&ip6->ip6_dst)) ptr = GET_V4(&ip6->ip6_dst); else if (IN6_IS_ADDR_6TO4(&dst6->sin6_addr)) ptr = GET_V4(&dst6->sin6_addr); else { m_freem(m); if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return ENETUNREACH; } bcopy(ptr, &in4, sizeof(in4)); if (bpf_peers_present(ifp->if_bpf)) { /* * We need to prepend the address family as * a four byte field. Cons up a dummy header * to pacify bpf. This is safe because bpf * will only read from the mbuf (i.e., it won't * try to free it or keep a pointer a to it). */ u_int af = AF_INET6; bpf_mtap2(ifp->if_bpf, &af, sizeof(af), m); } M_PREPEND(m, sizeof(struct ip), M_NOWAIT); if (m == NULL) { if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return ENOBUFS; } ip = mtod(m, struct ip *); bzero(ip, sizeof(*ip)); bcopy(GET_V4(&addr6), &ip->ip_src, sizeof(ip->ip_src)); bcopy(&in4, &ip->ip_dst, sizeof(ip->ip_dst)); ip->ip_p = IPPROTO_IPV6; ip->ip_ttl = ip_stf_ttl; ip->ip_len = htons(m->m_pkthdr.len); if (ifp->if_flags & IFF_LINK1) ip_ecn_ingress(ECN_ALLOWED, &ip->ip_tos, &tos); else ip_ecn_ingress(ECN_NOCARE, &ip->ip_tos, &tos); M_SETFIB(m, sc->sc_fibnum); if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); error = ip_output(m, NULL, NULL, 0, NULL, NULL); return error; } static int isrfc1918addr(struct in_addr *in) { /* * returns 1 if private address range: * 10.0.0.0/8 172.16.0.0/12 192.168.0.0/16 */ if (stf_permit_rfc1918 == 0 && ( (ntohl(in->s_addr) & 0xff000000) >> 24 == 10 || (ntohl(in->s_addr) & 0xfff00000) >> 16 == 172 * 256 + 16 || (ntohl(in->s_addr) & 0xffff0000) >> 16 == 192 * 256 + 168)) return 1; return 0; } static int stf_checkaddr4(struct stf_softc *sc, struct in_addr *in, struct ifnet *inifp) { struct rm_priotracker in_ifa_tracker; struct in_ifaddr *ia4; /* * reject packets with the following address: * 224.0.0.0/4 0.0.0.0/8 127.0.0.0/8 255.0.0.0/8 */ if (IN_MULTICAST(ntohl(in->s_addr))) return -1; switch ((ntohl(in->s_addr) & 0xff000000) >> 24) { case 0: case 127: case 255: return -1; } /* * reject packets with private address range. * (requirement from RFC3056 section 2 1st paragraph) */ if (isrfc1918addr(in)) return -1; /* * reject packets with broadcast */ IN_IFADDR_RLOCK(&in_ifa_tracker); CK_STAILQ_FOREACH(ia4, &V_in_ifaddrhead, ia_link) { if ((ia4->ia_ifa.ifa_ifp->if_flags & IFF_BROADCAST) == 0) continue; if (in->s_addr == ia4->ia_broadaddr.sin_addr.s_addr) { IN_IFADDR_RUNLOCK(&in_ifa_tracker); return -1; } } IN_IFADDR_RUNLOCK(&in_ifa_tracker); /* * perform ingress filter */ if (sc && (STF2IFP(sc)->if_flags & IFF_LINK2) == 0 && inifp) { struct nhop4_basic nh4; if (fib4_lookup_nh_basic(sc->sc_fibnum, *in, 0, 0, &nh4) != 0) return (-1); if (nh4.nh_ifp != inifp) return (-1); } return 0; } static int stf_checkaddr6(struct stf_softc *sc, struct in6_addr *in6, struct ifnet *inifp) { /* * check 6to4 addresses */ if (IN6_IS_ADDR_6TO4(in6)) { struct in_addr in4; bcopy(GET_V4(in6), &in4, sizeof(in4)); return stf_checkaddr4(sc, &in4, inifp); } /* * reject anything that look suspicious. the test is implemented * in ip6_input too, but we check here as well to * (1) reject bad packets earlier, and * (2) to be safe against future ip6_input change. */ if (IN6_IS_ADDR_V4COMPAT(in6) || IN6_IS_ADDR_V4MAPPED(in6)) return -1; return 0; } static int in_stf_input(struct mbuf **mp, int *offp, int proto) { struct stf_softc *sc; struct ip *ip; struct ip6_hdr *ip6; struct mbuf *m; u_int8_t otos, itos; struct ifnet *ifp; int off; m = *mp; off = *offp; if (proto != IPPROTO_IPV6) { m_freem(m); return (IPPROTO_DONE); } ip = mtod(m, struct ip *); sc = (struct stf_softc *)encap_getarg(m); if (sc == NULL || (STF2IFP(sc)->if_flags & IFF_UP) == 0) { m_freem(m); return (IPPROTO_DONE); } ifp = STF2IFP(sc); #ifdef MAC mac_ifnet_create_mbuf(ifp, m); #endif /* * perform sanity check against outer src/dst. * for source, perform ingress filter as well. */ if (stf_checkaddr4(sc, &ip->ip_dst, NULL) < 0 || stf_checkaddr4(sc, &ip->ip_src, m->m_pkthdr.rcvif) < 0) { m_freem(m); return (IPPROTO_DONE); } otos = ip->ip_tos; m_adj(m, off); if (m->m_len < sizeof(*ip6)) { m = m_pullup(m, sizeof(*ip6)); if (!m) return (IPPROTO_DONE); } ip6 = mtod(m, struct ip6_hdr *); /* * perform sanity check against inner src/dst. * for source, perform ingress filter as well. */ if (stf_checkaddr6(sc, &ip6->ip6_dst, NULL) < 0 || stf_checkaddr6(sc, &ip6->ip6_src, m->m_pkthdr.rcvif) < 0) { m_freem(m); return (IPPROTO_DONE); } itos = (ntohl(ip6->ip6_flow) >> 20) & 0xff; if ((ifp->if_flags & IFF_LINK1) != 0) ip_ecn_egress(ECN_ALLOWED, &otos, &itos); else ip_ecn_egress(ECN_NOCARE, &otos, &itos); ip6->ip6_flow &= ~htonl(0xff << 20); ip6->ip6_flow |= htonl((u_int32_t)itos << 20); m->m_pkthdr.rcvif = ifp; if (bpf_peers_present(ifp->if_bpf)) { /* * We need to prepend the address family as * a four byte field. Cons up a dummy header * to pacify bpf. This is safe because bpf * will only read from the mbuf (i.e., it won't * try to free it or keep a pointer a to it). */ u_int32_t af = AF_INET6; bpf_mtap2(ifp->if_bpf, &af, sizeof(af), m); } /* * Put the packet to the network layer input queue according to the * specified address family. * See net/if_gif.c for possible issues with packet processing * reorder due to extra queueing. */ if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len); M_SETFIB(m, ifp->if_fib); netisr_dispatch(NETISR_IPV6, m); return (IPPROTO_DONE); } static int stf_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct ifaddr *ifa; struct ifreq *ifr; struct sockaddr_in6 *sin6; struct in_addr addr; int error, mtu; error = 0; switch (cmd) { case SIOCSIFADDR: ifa = (struct ifaddr *)data; if (ifa == NULL || ifa->ifa_addr->sa_family != AF_INET6) { error = EAFNOSUPPORT; break; } sin6 = (struct sockaddr_in6 *)ifa->ifa_addr; if (!IN6_IS_ADDR_6TO4(&sin6->sin6_addr)) { error = EINVAL; break; } bcopy(GET_V4(&sin6->sin6_addr), &addr, sizeof(addr)); if (isrfc1918addr(&addr)) { error = EINVAL; break; } ifp->if_flags |= IFF_UP; break; case SIOCADDMULTI: case SIOCDELMULTI: ifr = (struct ifreq *)data; if (ifr && ifr->ifr_addr.sa_family == AF_INET6) ; else error = EAFNOSUPPORT; break; case SIOCGIFMTU: break; case SIOCSIFMTU: ifr = (struct ifreq *)data; mtu = ifr->ifr_mtu; /* RFC 4213 3.2 ideal world MTU */ if (mtu < IPV6_MINMTU || mtu > IF_MAXMTU - 20) return (EINVAL); ifp->if_mtu = mtu; break; default: error = EINVAL; break; } return error; } Index: head/sys/net/if_vxlan.c =================================================================== --- head/sys/net/if_vxlan.c (revision 333863) +++ head/sys/net/if_vxlan.c (revision 333864) @@ -1,3190 +1,3191 @@ /*- * Copyright (c) 2014, Bryan Venteicher * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "opt_inet.h" #include "opt_inet6.h" #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include struct vxlan_softc; LIST_HEAD(vxlan_softc_head, vxlan_softc); struct vxlan_socket_mc_info { union vxlan_sockaddr vxlsomc_saddr; union vxlan_sockaddr vxlsomc_gaddr; int vxlsomc_ifidx; int vxlsomc_users; }; #define VXLAN_SO_MC_MAX_GROUPS 32 #define VXLAN_SO_VNI_HASH_SHIFT 6 #define VXLAN_SO_VNI_HASH_SIZE (1 << VXLAN_SO_VNI_HASH_SHIFT) #define VXLAN_SO_VNI_HASH(_vni) ((_vni) % VXLAN_SO_VNI_HASH_SIZE) struct vxlan_socket { struct socket *vxlso_sock; struct rmlock vxlso_lock; u_int vxlso_refcnt; union vxlan_sockaddr vxlso_laddr; LIST_ENTRY(vxlan_socket) vxlso_entry; struct vxlan_softc_head vxlso_vni_hash[VXLAN_SO_VNI_HASH_SIZE]; struct vxlan_socket_mc_info vxlso_mc[VXLAN_SO_MC_MAX_GROUPS]; }; #define VXLAN_SO_RLOCK(_vso, _p) rm_rlock(&(_vso)->vxlso_lock, (_p)) #define VXLAN_SO_RUNLOCK(_vso, _p) rm_runlock(&(_vso)->vxlso_lock, (_p)) #define VXLAN_SO_WLOCK(_vso) rm_wlock(&(_vso)->vxlso_lock) #define VXLAN_SO_WUNLOCK(_vso) rm_wunlock(&(_vso)->vxlso_lock) #define VXLAN_SO_LOCK_ASSERT(_vso) \ rm_assert(&(_vso)->vxlso_lock, RA_LOCKED) #define VXLAN_SO_LOCK_WASSERT(_vso) \ rm_assert(&(_vso)->vxlso_lock, RA_WLOCKED) #define VXLAN_SO_ACQUIRE(_vso) refcount_acquire(&(_vso)->vxlso_refcnt) #define VXLAN_SO_RELEASE(_vso) refcount_release(&(_vso)->vxlso_refcnt) struct vxlan_ftable_entry { LIST_ENTRY(vxlan_ftable_entry) vxlfe_hash; uint16_t vxlfe_flags; uint8_t vxlfe_mac[ETHER_ADDR_LEN]; union vxlan_sockaddr vxlfe_raddr; time_t vxlfe_expire; }; #define VXLAN_FE_FLAG_DYNAMIC 0x01 #define VXLAN_FE_FLAG_STATIC 0x02 #define VXLAN_FE_IS_DYNAMIC(_fe) \ ((_fe)->vxlfe_flags & VXLAN_FE_FLAG_DYNAMIC) #define VXLAN_SC_FTABLE_SHIFT 9 #define VXLAN_SC_FTABLE_SIZE (1 << VXLAN_SC_FTABLE_SHIFT) #define VXLAN_SC_FTABLE_MASK (VXLAN_SC_FTABLE_SIZE - 1) #define VXLAN_SC_FTABLE_HASH(_sc, _mac) \ (vxlan_mac_hash(_sc, _mac) % VXLAN_SC_FTABLE_SIZE) LIST_HEAD(vxlan_ftable_head, vxlan_ftable_entry); struct vxlan_statistics { uint32_t ftable_nospace; uint32_t ftable_lock_upgrade_failed; }; struct vxlan_softc { struct ifnet *vxl_ifp; struct vxlan_socket *vxl_sock; uint32_t vxl_vni; union vxlan_sockaddr vxl_src_addr; union vxlan_sockaddr vxl_dst_addr; uint32_t vxl_flags; #define VXLAN_FLAG_INIT 0x0001 #define VXLAN_FLAG_TEARDOWN 0x0002 #define VXLAN_FLAG_LEARN 0x0004 uint32_t vxl_port_hash_key; uint16_t vxl_min_port; uint16_t vxl_max_port; uint8_t vxl_ttl; /* Lookup table from MAC address to forwarding entry. */ uint32_t vxl_ftable_cnt; uint32_t vxl_ftable_max; uint32_t vxl_ftable_timeout; uint32_t vxl_ftable_hash_key; struct vxlan_ftable_head *vxl_ftable; /* Derived from vxl_dst_addr. */ struct vxlan_ftable_entry vxl_default_fe; struct ip_moptions *vxl_im4o; struct ip6_moptions *vxl_im6o; struct rmlock vxl_lock; volatile u_int vxl_refcnt; int vxl_unit; int vxl_vso_mc_index; struct vxlan_statistics vxl_stats; struct sysctl_oid *vxl_sysctl_node; struct sysctl_ctx_list vxl_sysctl_ctx; struct callout vxl_callout; uint8_t vxl_hwaddr[ETHER_ADDR_LEN]; int vxl_mc_ifindex; struct ifnet *vxl_mc_ifp; struct ifmedia vxl_media; char vxl_mc_ifname[IFNAMSIZ]; LIST_ENTRY(vxlan_softc) vxl_entry; LIST_ENTRY(vxlan_softc) vxl_ifdetach_list; }; #define VXLAN_RLOCK(_sc, _p) rm_rlock(&(_sc)->vxl_lock, (_p)) #define VXLAN_RUNLOCK(_sc, _p) rm_runlock(&(_sc)->vxl_lock, (_p)) #define VXLAN_WLOCK(_sc) rm_wlock(&(_sc)->vxl_lock) #define VXLAN_WUNLOCK(_sc) rm_wunlock(&(_sc)->vxl_lock) #define VXLAN_LOCK_WOWNED(_sc) rm_wowned(&(_sc)->vxl_lock) #define VXLAN_LOCK_ASSERT(_sc) rm_assert(&(_sc)->vxl_lock, RA_LOCKED) #define VXLAN_LOCK_WASSERT(_sc) rm_assert(&(_sc)->vxl_lock, RA_WLOCKED) #define VXLAN_UNLOCK(_sc, _p) do { \ if (VXLAN_LOCK_WOWNED(_sc)) \ VXLAN_WUNLOCK(_sc); \ else \ VXLAN_RUNLOCK(_sc, _p); \ } while (0) #define VXLAN_ACQUIRE(_sc) refcount_acquire(&(_sc)->vxl_refcnt) #define VXLAN_RELEASE(_sc) refcount_release(&(_sc)->vxl_refcnt) #define satoconstsin(sa) ((const struct sockaddr_in *)(sa)) #define satoconstsin6(sa) ((const struct sockaddr_in6 *)(sa)) struct vxlanudphdr { struct udphdr vxlh_udp; struct vxlan_header vxlh_hdr; } __packed; static int vxlan_ftable_addr_cmp(const uint8_t *, const uint8_t *); static void vxlan_ftable_init(struct vxlan_softc *); static void vxlan_ftable_fini(struct vxlan_softc *); static void vxlan_ftable_flush(struct vxlan_softc *, int); static void vxlan_ftable_expire(struct vxlan_softc *); static int vxlan_ftable_update_locked(struct vxlan_softc *, const union vxlan_sockaddr *, const uint8_t *, struct rm_priotracker *); static int vxlan_ftable_learn(struct vxlan_softc *, const struct sockaddr *, const uint8_t *); static int vxlan_ftable_sysctl_dump(SYSCTL_HANDLER_ARGS); static struct vxlan_ftable_entry * vxlan_ftable_entry_alloc(void); static void vxlan_ftable_entry_free(struct vxlan_ftable_entry *); static void vxlan_ftable_entry_init(struct vxlan_softc *, struct vxlan_ftable_entry *, const uint8_t *, const struct sockaddr *, uint32_t); static void vxlan_ftable_entry_destroy(struct vxlan_softc *, struct vxlan_ftable_entry *); static int vxlan_ftable_entry_insert(struct vxlan_softc *, struct vxlan_ftable_entry *); static struct vxlan_ftable_entry * vxlan_ftable_entry_lookup(struct vxlan_softc *, const uint8_t *); static void vxlan_ftable_entry_dump(struct vxlan_ftable_entry *, struct sbuf *); static struct vxlan_socket * vxlan_socket_alloc(const union vxlan_sockaddr *); static void vxlan_socket_destroy(struct vxlan_socket *); static void vxlan_socket_release(struct vxlan_socket *); static struct vxlan_socket * vxlan_socket_lookup(union vxlan_sockaddr *vxlsa); static void vxlan_socket_insert(struct vxlan_socket *); static int vxlan_socket_init(struct vxlan_socket *, struct ifnet *); static int vxlan_socket_bind(struct vxlan_socket *, struct ifnet *); static int vxlan_socket_create(struct ifnet *, int, const union vxlan_sockaddr *, struct vxlan_socket **); static void vxlan_socket_ifdetach(struct vxlan_socket *, struct ifnet *, struct vxlan_softc_head *); static struct vxlan_socket * vxlan_socket_mc_lookup(const union vxlan_sockaddr *); static int vxlan_sockaddr_mc_info_match( const struct vxlan_socket_mc_info *, const union vxlan_sockaddr *, const union vxlan_sockaddr *, int); static int vxlan_socket_mc_join_group(struct vxlan_socket *, const union vxlan_sockaddr *, const union vxlan_sockaddr *, int *, union vxlan_sockaddr *); static int vxlan_socket_mc_leave_group(struct vxlan_socket *, const union vxlan_sockaddr *, const union vxlan_sockaddr *, int); static int vxlan_socket_mc_add_group(struct vxlan_socket *, const union vxlan_sockaddr *, const union vxlan_sockaddr *, int, int *); static void vxlan_socket_mc_release_group_by_idx(struct vxlan_socket *, int); static struct vxlan_softc * vxlan_socket_lookup_softc_locked(struct vxlan_socket *, uint32_t); static struct vxlan_softc * vxlan_socket_lookup_softc(struct vxlan_socket *, uint32_t); static int vxlan_socket_insert_softc(struct vxlan_socket *, struct vxlan_softc *); static void vxlan_socket_remove_softc(struct vxlan_socket *, struct vxlan_softc *); static struct ifnet * vxlan_multicast_if_ref(struct vxlan_softc *, int); static void vxlan_free_multicast(struct vxlan_softc *); static int vxlan_setup_multicast_interface(struct vxlan_softc *); static int vxlan_setup_multicast(struct vxlan_softc *); static int vxlan_setup_socket(struct vxlan_softc *); static void vxlan_setup_interface(struct vxlan_softc *); static int vxlan_valid_init_config(struct vxlan_softc *); static void vxlan_init_wait(struct vxlan_softc *); static void vxlan_init_complete(struct vxlan_softc *); static void vxlan_init(void *); static void vxlan_release(struct vxlan_softc *); static void vxlan_teardown_wait(struct vxlan_softc *); static void vxlan_teardown_complete(struct vxlan_softc *); static void vxlan_teardown_locked(struct vxlan_softc *); static void vxlan_teardown(struct vxlan_softc *); static void vxlan_ifdetach(struct vxlan_softc *, struct ifnet *, struct vxlan_softc_head *); static void vxlan_timer(void *); static int vxlan_ctrl_get_config(struct vxlan_softc *, void *); static int vxlan_ctrl_set_vni(struct vxlan_softc *, void *); static int vxlan_ctrl_set_local_addr(struct vxlan_softc *, void *); static int vxlan_ctrl_set_remote_addr(struct vxlan_softc *, void *); static int vxlan_ctrl_set_local_port(struct vxlan_softc *, void *); static int vxlan_ctrl_set_remote_port(struct vxlan_softc *, void *); static int vxlan_ctrl_set_port_range(struct vxlan_softc *, void *); static int vxlan_ctrl_set_ftable_timeout(struct vxlan_softc *, void *); static int vxlan_ctrl_set_ftable_max(struct vxlan_softc *, void *); static int vxlan_ctrl_set_multicast_if(struct vxlan_softc * , void *); static int vxlan_ctrl_set_ttl(struct vxlan_softc *, void *); static int vxlan_ctrl_set_learn(struct vxlan_softc *, void *); static int vxlan_ctrl_ftable_entry_add(struct vxlan_softc *, void *); static int vxlan_ctrl_ftable_entry_rem(struct vxlan_softc *, void *); static int vxlan_ctrl_flush(struct vxlan_softc *, void *); static int vxlan_ioctl_drvspec(struct vxlan_softc *, struct ifdrv *, int); static int vxlan_ioctl_ifflags(struct vxlan_softc *); static int vxlan_ioctl(struct ifnet *, u_long, caddr_t); #if defined(INET) || defined(INET6) static uint16_t vxlan_pick_source_port(struct vxlan_softc *, struct mbuf *); static void vxlan_encap_header(struct vxlan_softc *, struct mbuf *, int, uint16_t, uint16_t); #endif static int vxlan_encap4(struct vxlan_softc *, const union vxlan_sockaddr *, struct mbuf *); static int vxlan_encap6(struct vxlan_softc *, const union vxlan_sockaddr *, struct mbuf *); static int vxlan_transmit(struct ifnet *, struct mbuf *); static void vxlan_qflush(struct ifnet *); static void vxlan_rcv_udp_packet(struct mbuf *, int, struct inpcb *, const struct sockaddr *, void *); static int vxlan_input(struct vxlan_socket *, uint32_t, struct mbuf **, const struct sockaddr *); static void vxlan_set_default_config(struct vxlan_softc *); static int vxlan_set_user_config(struct vxlan_softc *, struct ifvxlanparam *); static int vxlan_clone_create(struct if_clone *, int, caddr_t); static void vxlan_clone_destroy(struct ifnet *); static uint32_t vxlan_mac_hash(struct vxlan_softc *, const uint8_t *); static void vxlan_fakeaddr(struct vxlan_softc *); static int vxlan_media_change(struct ifnet *); static void vxlan_media_status(struct ifnet *, struct ifmediareq *); static int vxlan_sockaddr_cmp(const union vxlan_sockaddr *, const struct sockaddr *); static void vxlan_sockaddr_copy(union vxlan_sockaddr *, const struct sockaddr *); static int vxlan_sockaddr_in_equal(const union vxlan_sockaddr *, const struct sockaddr *); static void vxlan_sockaddr_in_copy(union vxlan_sockaddr *, const struct sockaddr *); static int vxlan_sockaddr_supported(const union vxlan_sockaddr *, int); static int vxlan_sockaddr_in_any(const union vxlan_sockaddr *); static int vxlan_sockaddr_in_multicast(const union vxlan_sockaddr *); static int vxlan_sockaddr_in6_embedscope(union vxlan_sockaddr *); static int vxlan_can_change_config(struct vxlan_softc *); static int vxlan_check_vni(uint32_t); static int vxlan_check_ttl(int); static int vxlan_check_ftable_timeout(uint32_t); static int vxlan_check_ftable_max(uint32_t); static void vxlan_sysctl_setup(struct vxlan_softc *); static void vxlan_sysctl_destroy(struct vxlan_softc *); static int vxlan_tunable_int(struct vxlan_softc *, const char *, int); static void vxlan_ifdetach_event(void *, struct ifnet *); static void vxlan_load(void); static void vxlan_unload(void); static int vxlan_modevent(module_t, int, void *); static const char vxlan_name[] = "vxlan"; static MALLOC_DEFINE(M_VXLAN, vxlan_name, "Virtual eXtensible LAN Interface"); static struct if_clone *vxlan_cloner; static struct mtx vxlan_list_mtx; #define VXLAN_LIST_LOCK() mtx_lock(&vxlan_list_mtx) #define VXLAN_LIST_UNLOCK() mtx_unlock(&vxlan_list_mtx) static LIST_HEAD(, vxlan_socket) vxlan_socket_list; static eventhandler_tag vxlan_ifdetach_event_tag; SYSCTL_DECL(_net_link); SYSCTL_NODE(_net_link, OID_AUTO, vxlan, CTLFLAG_RW, 0, "Virtual eXtensible Local Area Network"); static int vxlan_legacy_port = 0; TUNABLE_INT("net.link.vxlan.legacy_port", &vxlan_legacy_port); static int vxlan_reuse_port = 0; TUNABLE_INT("net.link.vxlan.reuse_port", &vxlan_reuse_port); /* Default maximum number of addresses in the forwarding table. */ #ifndef VXLAN_FTABLE_MAX #define VXLAN_FTABLE_MAX 2000 #endif /* Timeout (in seconds) of addresses learned in the forwarding table. */ #ifndef VXLAN_FTABLE_TIMEOUT #define VXLAN_FTABLE_TIMEOUT (20 * 60) #endif /* * Maximum timeout (in seconds) of addresses learned in the forwarding * table. */ #ifndef VXLAN_FTABLE_MAX_TIMEOUT #define VXLAN_FTABLE_MAX_TIMEOUT (60 * 60 * 24) #endif /* Number of seconds between pruning attempts of the forwarding table. */ #ifndef VXLAN_FTABLE_PRUNE #define VXLAN_FTABLE_PRUNE (5 * 60) #endif static int vxlan_ftable_prune_period = VXLAN_FTABLE_PRUNE; struct vxlan_control { int (*vxlc_func)(struct vxlan_softc *, void *); int vxlc_argsize; int vxlc_flags; #define VXLAN_CTRL_FLAG_COPYIN 0x01 #define VXLAN_CTRL_FLAG_COPYOUT 0x02 #define VXLAN_CTRL_FLAG_SUSER 0x04 }; static const struct vxlan_control vxlan_control_table[] = { [VXLAN_CMD_GET_CONFIG] = { vxlan_ctrl_get_config, sizeof(struct ifvxlancfg), VXLAN_CTRL_FLAG_COPYOUT }, [VXLAN_CMD_SET_VNI] = { vxlan_ctrl_set_vni, sizeof(struct ifvxlancmd), VXLAN_CTRL_FLAG_COPYIN | VXLAN_CTRL_FLAG_SUSER, }, [VXLAN_CMD_SET_LOCAL_ADDR] = { vxlan_ctrl_set_local_addr, sizeof(struct ifvxlancmd), VXLAN_CTRL_FLAG_COPYIN | VXLAN_CTRL_FLAG_SUSER, }, [VXLAN_CMD_SET_REMOTE_ADDR] = { vxlan_ctrl_set_remote_addr, sizeof(struct ifvxlancmd), VXLAN_CTRL_FLAG_COPYIN | VXLAN_CTRL_FLAG_SUSER, }, [VXLAN_CMD_SET_LOCAL_PORT] = { vxlan_ctrl_set_local_port, sizeof(struct ifvxlancmd), VXLAN_CTRL_FLAG_COPYIN | VXLAN_CTRL_FLAG_SUSER, }, [VXLAN_CMD_SET_REMOTE_PORT] = { vxlan_ctrl_set_remote_port, sizeof(struct ifvxlancmd), VXLAN_CTRL_FLAG_COPYIN | VXLAN_CTRL_FLAG_SUSER, }, [VXLAN_CMD_SET_PORT_RANGE] = { vxlan_ctrl_set_port_range, sizeof(struct ifvxlancmd), VXLAN_CTRL_FLAG_COPYIN | VXLAN_CTRL_FLAG_SUSER, }, [VXLAN_CMD_SET_FTABLE_TIMEOUT] = { vxlan_ctrl_set_ftable_timeout, sizeof(struct ifvxlancmd), VXLAN_CTRL_FLAG_COPYIN | VXLAN_CTRL_FLAG_SUSER, }, [VXLAN_CMD_SET_FTABLE_MAX] = { vxlan_ctrl_set_ftable_max, sizeof(struct ifvxlancmd), VXLAN_CTRL_FLAG_COPYIN | VXLAN_CTRL_FLAG_SUSER, }, [VXLAN_CMD_SET_MULTICAST_IF] = { vxlan_ctrl_set_multicast_if, sizeof(struct ifvxlancmd), VXLAN_CTRL_FLAG_COPYIN | VXLAN_CTRL_FLAG_SUSER, }, [VXLAN_CMD_SET_TTL] = { vxlan_ctrl_set_ttl, sizeof(struct ifvxlancmd), VXLAN_CTRL_FLAG_COPYIN | VXLAN_CTRL_FLAG_SUSER, }, [VXLAN_CMD_SET_LEARN] = { vxlan_ctrl_set_learn, sizeof(struct ifvxlancmd), VXLAN_CTRL_FLAG_COPYIN | VXLAN_CTRL_FLAG_SUSER, }, [VXLAN_CMD_FTABLE_ENTRY_ADD] = { vxlan_ctrl_ftable_entry_add, sizeof(struct ifvxlancmd), VXLAN_CTRL_FLAG_COPYIN | VXLAN_CTRL_FLAG_SUSER, }, [VXLAN_CMD_FTABLE_ENTRY_REM] = { vxlan_ctrl_ftable_entry_rem, sizeof(struct ifvxlancmd), VXLAN_CTRL_FLAG_COPYIN | VXLAN_CTRL_FLAG_SUSER, }, [VXLAN_CMD_FLUSH] = { vxlan_ctrl_flush, sizeof(struct ifvxlancmd), VXLAN_CTRL_FLAG_COPYIN | VXLAN_CTRL_FLAG_SUSER, }, }; static const int vxlan_control_table_size = nitems(vxlan_control_table); static int vxlan_ftable_addr_cmp(const uint8_t *a, const uint8_t *b) { int i, d; for (i = 0, d = 0; i < ETHER_ADDR_LEN && d == 0; i++) d = ((int)a[i]) - ((int)b[i]); return (d); } static void vxlan_ftable_init(struct vxlan_softc *sc) { int i; sc->vxl_ftable = malloc(sizeof(struct vxlan_ftable_head) * VXLAN_SC_FTABLE_SIZE, M_VXLAN, M_ZERO | M_WAITOK); for (i = 0; i < VXLAN_SC_FTABLE_SIZE; i++) LIST_INIT(&sc->vxl_ftable[i]); sc->vxl_ftable_hash_key = arc4random(); } static void vxlan_ftable_fini(struct vxlan_softc *sc) { int i; for (i = 0; i < VXLAN_SC_FTABLE_SIZE; i++) { KASSERT(LIST_EMPTY(&sc->vxl_ftable[i]), ("%s: vxlan %p ftable[%d] not empty", __func__, sc, i)); } MPASS(sc->vxl_ftable_cnt == 0); free(sc->vxl_ftable, M_VXLAN); sc->vxl_ftable = NULL; } static void vxlan_ftable_flush(struct vxlan_softc *sc, int all) { struct vxlan_ftable_entry *fe, *tfe; int i; for (i = 0; i < VXLAN_SC_FTABLE_SIZE; i++) { LIST_FOREACH_SAFE(fe, &sc->vxl_ftable[i], vxlfe_hash, tfe) { if (all || VXLAN_FE_IS_DYNAMIC(fe)) vxlan_ftable_entry_destroy(sc, fe); } } } static void vxlan_ftable_expire(struct vxlan_softc *sc) { struct vxlan_ftable_entry *fe, *tfe; int i; VXLAN_LOCK_WASSERT(sc); for (i = 0; i < VXLAN_SC_FTABLE_SIZE; i++) { LIST_FOREACH_SAFE(fe, &sc->vxl_ftable[i], vxlfe_hash, tfe) { if (VXLAN_FE_IS_DYNAMIC(fe) && time_uptime >= fe->vxlfe_expire) vxlan_ftable_entry_destroy(sc, fe); } } } static int vxlan_ftable_update_locked(struct vxlan_softc *sc, const union vxlan_sockaddr *vxlsa, const uint8_t *mac, struct rm_priotracker *tracker) { struct vxlan_ftable_entry *fe; - int error; + int error __unused; VXLAN_LOCK_ASSERT(sc); again: /* * A forwarding entry for this MAC address might already exist. If * so, update it, otherwise create a new one. We may have to upgrade * the lock if we have to change or create an entry. */ fe = vxlan_ftable_entry_lookup(sc, mac); if (fe != NULL) { fe->vxlfe_expire = time_uptime + sc->vxl_ftable_timeout; if (!VXLAN_FE_IS_DYNAMIC(fe) || vxlan_sockaddr_in_equal(&fe->vxlfe_raddr, &vxlsa->sa)) return (0); if (!VXLAN_LOCK_WOWNED(sc)) { VXLAN_RUNLOCK(sc, tracker); VXLAN_WLOCK(sc); sc->vxl_stats.ftable_lock_upgrade_failed++; goto again; } vxlan_sockaddr_in_copy(&fe->vxlfe_raddr, &vxlsa->sa); return (0); } if (!VXLAN_LOCK_WOWNED(sc)) { VXLAN_RUNLOCK(sc, tracker); VXLAN_WLOCK(sc); sc->vxl_stats.ftable_lock_upgrade_failed++; goto again; } if (sc->vxl_ftable_cnt >= sc->vxl_ftable_max) { sc->vxl_stats.ftable_nospace++; return (ENOSPC); } fe = vxlan_ftable_entry_alloc(); if (fe == NULL) return (ENOMEM); vxlan_ftable_entry_init(sc, fe, mac, &vxlsa->sa, VXLAN_FE_FLAG_DYNAMIC); /* The prior lookup failed, so the insert should not. */ error = vxlan_ftable_entry_insert(sc, fe); MPASS(error == 0); return (0); } static int vxlan_ftable_learn(struct vxlan_softc *sc, const struct sockaddr *sa, const uint8_t *mac) { struct rm_priotracker tracker; union vxlan_sockaddr vxlsa; int error; /* * The source port may be randomly selected by the remote host, so * use the port of the default destination address. */ vxlan_sockaddr_copy(&vxlsa, sa); vxlsa.in4.sin_port = sc->vxl_dst_addr.in4.sin_port; if (VXLAN_SOCKADDR_IS_IPV6(&vxlsa)) { error = vxlan_sockaddr_in6_embedscope(&vxlsa); if (error) return (error); } VXLAN_RLOCK(sc, &tracker); error = vxlan_ftable_update_locked(sc, &vxlsa, mac, &tracker); VXLAN_UNLOCK(sc, &tracker); return (error); } static int vxlan_ftable_sysctl_dump(SYSCTL_HANDLER_ARGS) { struct rm_priotracker tracker; struct sbuf sb; struct vxlan_softc *sc; struct vxlan_ftable_entry *fe; size_t size; int i, error; /* * This is mostly intended for debugging during development. It is * not practical to dump an entire large table this way. */ sc = arg1; size = PAGE_SIZE; /* Calculate later. */ sbuf_new(&sb, NULL, size, SBUF_FIXEDLEN); sbuf_putc(&sb, '\n'); VXLAN_RLOCK(sc, &tracker); for (i = 0; i < VXLAN_SC_FTABLE_SIZE; i++) { LIST_FOREACH(fe, &sc->vxl_ftable[i], vxlfe_hash) { if (sbuf_error(&sb) != 0) break; vxlan_ftable_entry_dump(fe, &sb); } } VXLAN_RUNLOCK(sc, &tracker); if (sbuf_len(&sb) == 1) sbuf_setpos(&sb, 0); sbuf_finish(&sb); error = sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req); sbuf_delete(&sb); return (error); } static struct vxlan_ftable_entry * vxlan_ftable_entry_alloc(void) { struct vxlan_ftable_entry *fe; fe = malloc(sizeof(*fe), M_VXLAN, M_ZERO | M_NOWAIT); return (fe); } static void vxlan_ftable_entry_free(struct vxlan_ftable_entry *fe) { free(fe, M_VXLAN); } static void vxlan_ftable_entry_init(struct vxlan_softc *sc, struct vxlan_ftable_entry *fe, const uint8_t *mac, const struct sockaddr *sa, uint32_t flags) { fe->vxlfe_flags = flags; fe->vxlfe_expire = time_uptime + sc->vxl_ftable_timeout; memcpy(fe->vxlfe_mac, mac, ETHER_ADDR_LEN); vxlan_sockaddr_copy(&fe->vxlfe_raddr, sa); } static void vxlan_ftable_entry_destroy(struct vxlan_softc *sc, struct vxlan_ftable_entry *fe) { sc->vxl_ftable_cnt--; LIST_REMOVE(fe, vxlfe_hash); vxlan_ftable_entry_free(fe); } static int vxlan_ftable_entry_insert(struct vxlan_softc *sc, struct vxlan_ftable_entry *fe) { struct vxlan_ftable_entry *lfe; uint32_t hash; int dir; VXLAN_LOCK_WASSERT(sc); hash = VXLAN_SC_FTABLE_HASH(sc, fe->vxlfe_mac); lfe = LIST_FIRST(&sc->vxl_ftable[hash]); if (lfe == NULL) { LIST_INSERT_HEAD(&sc->vxl_ftable[hash], fe, vxlfe_hash); goto out; } do { dir = vxlan_ftable_addr_cmp(fe->vxlfe_mac, lfe->vxlfe_mac); if (dir == 0) return (EEXIST); if (dir > 0) { LIST_INSERT_BEFORE(lfe, fe, vxlfe_hash); goto out; } else if (LIST_NEXT(lfe, vxlfe_hash) == NULL) { LIST_INSERT_AFTER(lfe, fe, vxlfe_hash); goto out; } else lfe = LIST_NEXT(lfe, vxlfe_hash); } while (lfe != NULL); out: sc->vxl_ftable_cnt++; return (0); } static struct vxlan_ftable_entry * vxlan_ftable_entry_lookup(struct vxlan_softc *sc, const uint8_t *mac) { struct vxlan_ftable_entry *fe; uint32_t hash; int dir; VXLAN_LOCK_ASSERT(sc); hash = VXLAN_SC_FTABLE_HASH(sc, mac); LIST_FOREACH(fe, &sc->vxl_ftable[hash], vxlfe_hash) { dir = vxlan_ftable_addr_cmp(mac, fe->vxlfe_mac); if (dir == 0) return (fe); if (dir > 0) break; } return (NULL); } static void vxlan_ftable_entry_dump(struct vxlan_ftable_entry *fe, struct sbuf *sb) { char buf[64]; const union vxlan_sockaddr *sa; const void *addr; int i, len, af, width; sa = &fe->vxlfe_raddr; af = sa->sa.sa_family; len = sbuf_len(sb); sbuf_printf(sb, "%c 0x%02X ", VXLAN_FE_IS_DYNAMIC(fe) ? 'D' : 'S', fe->vxlfe_flags); for (i = 0; i < ETHER_ADDR_LEN - 1; i++) sbuf_printf(sb, "%02X:", fe->vxlfe_mac[i]); sbuf_printf(sb, "%02X ", fe->vxlfe_mac[i]); if (af == AF_INET) { addr = &sa->in4.sin_addr; width = INET_ADDRSTRLEN - 1; } else { addr = &sa->in6.sin6_addr; width = INET6_ADDRSTRLEN - 1; } inet_ntop(af, addr, buf, sizeof(buf)); sbuf_printf(sb, "%*s ", width, buf); sbuf_printf(sb, "%08jd", (intmax_t)fe->vxlfe_expire); sbuf_putc(sb, '\n'); /* Truncate a partial line. */ if (sbuf_error(sb) != 0) sbuf_setpos(sb, len); } static struct vxlan_socket * vxlan_socket_alloc(const union vxlan_sockaddr *sa) { struct vxlan_socket *vso; int i; vso = malloc(sizeof(*vso), M_VXLAN, M_WAITOK | M_ZERO); rm_init(&vso->vxlso_lock, "vxlansorm"); refcount_init(&vso->vxlso_refcnt, 0); for (i = 0; i < VXLAN_SO_VNI_HASH_SIZE; i++) LIST_INIT(&vso->vxlso_vni_hash[i]); vso->vxlso_laddr = *sa; return (vso); } static void vxlan_socket_destroy(struct vxlan_socket *vso) { struct socket *so; - struct vxlan_socket_mc_info *mc; +#ifdef INVARIANTS int i; + struct vxlan_socket_mc_info *mc; for (i = 0; i < VXLAN_SO_MC_MAX_GROUPS; i++) { mc = &vso->vxlso_mc[i]; KASSERT(mc->vxlsomc_gaddr.sa.sa_family == AF_UNSPEC, ("%s: socket %p mc[%d] still has address", __func__, vso, i)); } for (i = 0; i < VXLAN_SO_VNI_HASH_SIZE; i++) { KASSERT(LIST_EMPTY(&vso->vxlso_vni_hash[i]), ("%s: socket %p vni_hash[%d] not empty", __func__, vso, i)); } - +#endif so = vso->vxlso_sock; if (so != NULL) { vso->vxlso_sock = NULL; soclose(so); } rm_destroy(&vso->vxlso_lock); free(vso, M_VXLAN); } static void vxlan_socket_release(struct vxlan_socket *vso) { int destroy; VXLAN_LIST_LOCK(); destroy = VXLAN_SO_RELEASE(vso); if (destroy != 0) LIST_REMOVE(vso, vxlso_entry); VXLAN_LIST_UNLOCK(); if (destroy != 0) vxlan_socket_destroy(vso); } static struct vxlan_socket * vxlan_socket_lookup(union vxlan_sockaddr *vxlsa) { struct vxlan_socket *vso; VXLAN_LIST_LOCK(); LIST_FOREACH(vso, &vxlan_socket_list, vxlso_entry) { if (vxlan_sockaddr_cmp(&vso->vxlso_laddr, &vxlsa->sa) == 0) { VXLAN_SO_ACQUIRE(vso); break; } } VXLAN_LIST_UNLOCK(); return (vso); } static void vxlan_socket_insert(struct vxlan_socket *vso) { VXLAN_LIST_LOCK(); VXLAN_SO_ACQUIRE(vso); LIST_INSERT_HEAD(&vxlan_socket_list, vso, vxlso_entry); VXLAN_LIST_UNLOCK(); } static int vxlan_socket_init(struct vxlan_socket *vso, struct ifnet *ifp) { struct thread *td; int error; td = curthread; error = socreate(vso->vxlso_laddr.sa.sa_family, &vso->vxlso_sock, SOCK_DGRAM, IPPROTO_UDP, td->td_ucred, td); if (error) { if_printf(ifp, "cannot create socket: %d\n", error); return (error); } error = udp_set_kernel_tunneling(vso->vxlso_sock, vxlan_rcv_udp_packet, NULL, vso); if (error) { if_printf(ifp, "cannot set tunneling function: %d\n", error); return (error); } if (vxlan_reuse_port != 0) { struct sockopt sopt; int val = 1; bzero(&sopt, sizeof(sopt)); sopt.sopt_dir = SOPT_SET; sopt.sopt_level = IPPROTO_IP; sopt.sopt_name = SO_REUSEPORT; sopt.sopt_val = &val; sopt.sopt_valsize = sizeof(val); error = sosetopt(vso->vxlso_sock, &sopt); if (error) { if_printf(ifp, "cannot set REUSEADDR socket opt: %d\n", error); return (error); } } return (0); } static int vxlan_socket_bind(struct vxlan_socket *vso, struct ifnet *ifp) { union vxlan_sockaddr laddr; struct thread *td; int error; td = curthread; laddr = vso->vxlso_laddr; error = sobind(vso->vxlso_sock, &laddr.sa, td); if (error) { if (error != EADDRINUSE) if_printf(ifp, "cannot bind socket: %d\n", error); return (error); } return (0); } static int vxlan_socket_create(struct ifnet *ifp, int multicast, const union vxlan_sockaddr *saddr, struct vxlan_socket **vsop) { union vxlan_sockaddr laddr; struct vxlan_socket *vso; int error; laddr = *saddr; /* * If this socket will be multicast, then only the local port * must be specified when binding. */ if (multicast != 0) { if (VXLAN_SOCKADDR_IS_IPV4(&laddr)) laddr.in4.sin_addr.s_addr = INADDR_ANY; #ifdef INET6 else laddr.in6.sin6_addr = in6addr_any; #endif } vso = vxlan_socket_alloc(&laddr); if (vso == NULL) return (ENOMEM); error = vxlan_socket_init(vso, ifp); if (error) goto fail; error = vxlan_socket_bind(vso, ifp); if (error) goto fail; /* * There is a small window between the bind completing and * inserting the socket, so that a concurrent create may fail. * Let's not worry about that for now. */ vxlan_socket_insert(vso); *vsop = vso; return (0); fail: vxlan_socket_destroy(vso); return (error); } static void vxlan_socket_ifdetach(struct vxlan_socket *vso, struct ifnet *ifp, struct vxlan_softc_head *list) { struct rm_priotracker tracker; struct vxlan_softc *sc; int i; VXLAN_SO_RLOCK(vso, &tracker); for (i = 0; i < VXLAN_SO_VNI_HASH_SIZE; i++) { LIST_FOREACH(sc, &vso->vxlso_vni_hash[i], vxl_entry) vxlan_ifdetach(sc, ifp, list); } VXLAN_SO_RUNLOCK(vso, &tracker); } static struct vxlan_socket * vxlan_socket_mc_lookup(const union vxlan_sockaddr *vxlsa) { union vxlan_sockaddr laddr; struct vxlan_socket *vso; laddr = *vxlsa; if (VXLAN_SOCKADDR_IS_IPV4(&laddr)) laddr.in4.sin_addr.s_addr = INADDR_ANY; #ifdef INET6 else laddr.in6.sin6_addr = in6addr_any; #endif vso = vxlan_socket_lookup(&laddr); return (vso); } static int vxlan_sockaddr_mc_info_match(const struct vxlan_socket_mc_info *mc, const union vxlan_sockaddr *group, const union vxlan_sockaddr *local, int ifidx) { if (!vxlan_sockaddr_in_any(local) && !vxlan_sockaddr_in_equal(&mc->vxlsomc_saddr, &local->sa)) return (0); if (!vxlan_sockaddr_in_equal(&mc->vxlsomc_gaddr, &group->sa)) return (0); if (ifidx != 0 && ifidx != mc->vxlsomc_ifidx) return (0); return (1); } static int vxlan_socket_mc_join_group(struct vxlan_socket *vso, const union vxlan_sockaddr *group, const union vxlan_sockaddr *local, int *ifidx, union vxlan_sockaddr *source) { struct sockopt sopt; int error; *source = *local; if (VXLAN_SOCKADDR_IS_IPV4(group)) { struct ip_mreq mreq; mreq.imr_multiaddr = group->in4.sin_addr; mreq.imr_interface = local->in4.sin_addr; bzero(&sopt, sizeof(sopt)); sopt.sopt_dir = SOPT_SET; sopt.sopt_level = IPPROTO_IP; sopt.sopt_name = IP_ADD_MEMBERSHIP; sopt.sopt_val = &mreq; sopt.sopt_valsize = sizeof(mreq); error = sosetopt(vso->vxlso_sock, &sopt); if (error) return (error); /* * BMV: Ideally, there would be a formal way for us to get * the local interface that was selected based on the * imr_interface address. We could then update *ifidx so * vxlan_sockaddr_mc_info_match() would return a match for * later creates that explicitly set the multicast interface. * * If we really need to, we can of course look in the INP's * membership list: * sotoinpcb(vso->vxlso_sock)->inp_moptions-> * imo_membership[]->inm_ifp * similarly to imo_match_group(). */ source->in4.sin_addr = local->in4.sin_addr; } else if (VXLAN_SOCKADDR_IS_IPV6(group)) { struct ipv6_mreq mreq; mreq.ipv6mr_multiaddr = group->in6.sin6_addr; mreq.ipv6mr_interface = *ifidx; bzero(&sopt, sizeof(sopt)); sopt.sopt_dir = SOPT_SET; sopt.sopt_level = IPPROTO_IPV6; sopt.sopt_name = IPV6_JOIN_GROUP; sopt.sopt_val = &mreq; sopt.sopt_valsize = sizeof(mreq); error = sosetopt(vso->vxlso_sock, &sopt); if (error) return (error); /* * BMV: As with IPv4, we would really like to know what * interface in6p_lookup_mcast_ifp() selected. */ } else error = EAFNOSUPPORT; return (error); } static int vxlan_socket_mc_leave_group(struct vxlan_socket *vso, const union vxlan_sockaddr *group, const union vxlan_sockaddr *source, int ifidx) { struct sockopt sopt; int error; bzero(&sopt, sizeof(sopt)); sopt.sopt_dir = SOPT_SET; if (VXLAN_SOCKADDR_IS_IPV4(group)) { struct ip_mreq mreq; mreq.imr_multiaddr = group->in4.sin_addr; mreq.imr_interface = source->in4.sin_addr; sopt.sopt_level = IPPROTO_IP; sopt.sopt_name = IP_DROP_MEMBERSHIP; sopt.sopt_val = &mreq; sopt.sopt_valsize = sizeof(mreq); error = sosetopt(vso->vxlso_sock, &sopt); } else if (VXLAN_SOCKADDR_IS_IPV6(group)) { struct ipv6_mreq mreq; mreq.ipv6mr_multiaddr = group->in6.sin6_addr; mreq.ipv6mr_interface = ifidx; sopt.sopt_level = IPPROTO_IPV6; sopt.sopt_name = IPV6_LEAVE_GROUP; sopt.sopt_val = &mreq; sopt.sopt_valsize = sizeof(mreq); error = sosetopt(vso->vxlso_sock, &sopt); } else error = EAFNOSUPPORT; return (error); } static int vxlan_socket_mc_add_group(struct vxlan_socket *vso, const union vxlan_sockaddr *group, const union vxlan_sockaddr *local, int ifidx, int *idx) { union vxlan_sockaddr source; struct vxlan_socket_mc_info *mc; int i, empty, error; /* * Within a socket, the same multicast group may be used by multiple * interfaces, each with a different network identifier. But a socket * may only join a multicast group once, so keep track of the users * here. */ VXLAN_SO_WLOCK(vso); for (empty = 0, i = 0; i < VXLAN_SO_MC_MAX_GROUPS; i++) { mc = &vso->vxlso_mc[i]; if (mc->vxlsomc_gaddr.sa.sa_family == AF_UNSPEC) { empty++; continue; } if (vxlan_sockaddr_mc_info_match(mc, group, local, ifidx)) goto out; } VXLAN_SO_WUNLOCK(vso); if (empty == 0) return (ENOSPC); error = vxlan_socket_mc_join_group(vso, group, local, &ifidx, &source); if (error) return (error); VXLAN_SO_WLOCK(vso); for (i = 0; i < VXLAN_SO_MC_MAX_GROUPS; i++) { mc = &vso->vxlso_mc[i]; if (mc->vxlsomc_gaddr.sa.sa_family == AF_UNSPEC) { vxlan_sockaddr_copy(&mc->vxlsomc_gaddr, &group->sa); vxlan_sockaddr_copy(&mc->vxlsomc_saddr, &source.sa); mc->vxlsomc_ifidx = ifidx; goto out; } } VXLAN_SO_WUNLOCK(vso); error = vxlan_socket_mc_leave_group(vso, group, &source, ifidx); MPASS(error == 0); return (ENOSPC); out: mc->vxlsomc_users++; VXLAN_SO_WUNLOCK(vso); *idx = i; return (0); } static void vxlan_socket_mc_release_group_by_idx(struct vxlan_socket *vso, int idx) { union vxlan_sockaddr group, source; struct vxlan_socket_mc_info *mc; int ifidx, leave; KASSERT(idx >= 0 && idx < VXLAN_SO_MC_MAX_GROUPS, ("%s: vso %p idx %d out of bounds", __func__, vso, idx)); leave = 0; mc = &vso->vxlso_mc[idx]; VXLAN_SO_WLOCK(vso); mc->vxlsomc_users--; if (mc->vxlsomc_users == 0) { group = mc->vxlsomc_gaddr; source = mc->vxlsomc_saddr; ifidx = mc->vxlsomc_ifidx; bzero(mc, sizeof(*mc)); leave = 1; } VXLAN_SO_WUNLOCK(vso); if (leave != 0) { /* * Our socket's membership in this group may have already * been removed if we joined through an interface that's * been detached. */ vxlan_socket_mc_leave_group(vso, &group, &source, ifidx); } } static struct vxlan_softc * vxlan_socket_lookup_softc_locked(struct vxlan_socket *vso, uint32_t vni) { struct vxlan_softc *sc; uint32_t hash; VXLAN_SO_LOCK_ASSERT(vso); hash = VXLAN_SO_VNI_HASH(vni); LIST_FOREACH(sc, &vso->vxlso_vni_hash[hash], vxl_entry) { if (sc->vxl_vni == vni) { VXLAN_ACQUIRE(sc); break; } } return (sc); } static struct vxlan_softc * vxlan_socket_lookup_softc(struct vxlan_socket *vso, uint32_t vni) { struct rm_priotracker tracker; struct vxlan_softc *sc; VXLAN_SO_RLOCK(vso, &tracker); sc = vxlan_socket_lookup_softc_locked(vso, vni); VXLAN_SO_RUNLOCK(vso, &tracker); return (sc); } static int vxlan_socket_insert_softc(struct vxlan_socket *vso, struct vxlan_softc *sc) { struct vxlan_softc *tsc; uint32_t vni, hash; vni = sc->vxl_vni; hash = VXLAN_SO_VNI_HASH(vni); VXLAN_SO_WLOCK(vso); tsc = vxlan_socket_lookup_softc_locked(vso, vni); if (tsc != NULL) { VXLAN_SO_WUNLOCK(vso); vxlan_release(tsc); return (EEXIST); } VXLAN_ACQUIRE(sc); LIST_INSERT_HEAD(&vso->vxlso_vni_hash[hash], sc, vxl_entry); VXLAN_SO_WUNLOCK(vso); return (0); } static void vxlan_socket_remove_softc(struct vxlan_socket *vso, struct vxlan_softc *sc) { VXLAN_SO_WLOCK(vso); LIST_REMOVE(sc, vxl_entry); VXLAN_SO_WUNLOCK(vso); vxlan_release(sc); } static struct ifnet * vxlan_multicast_if_ref(struct vxlan_softc *sc, int ipv4) { struct ifnet *ifp; VXLAN_LOCK_ASSERT(sc); if (ipv4 && sc->vxl_im4o != NULL) ifp = sc->vxl_im4o->imo_multicast_ifp; else if (!ipv4 && sc->vxl_im6o != NULL) ifp = sc->vxl_im6o->im6o_multicast_ifp; else ifp = NULL; if (ifp != NULL) if_ref(ifp); return (ifp); } static void vxlan_free_multicast(struct vxlan_softc *sc) { if (sc->vxl_mc_ifp != NULL) { if_rele(sc->vxl_mc_ifp); sc->vxl_mc_ifp = NULL; sc->vxl_mc_ifindex = 0; } if (sc->vxl_im4o != NULL) { free(sc->vxl_im4o, M_VXLAN); sc->vxl_im4o = NULL; } if (sc->vxl_im6o != NULL) { free(sc->vxl_im6o, M_VXLAN); sc->vxl_im6o = NULL; } } static int vxlan_setup_multicast_interface(struct vxlan_softc *sc) { struct ifnet *ifp; ifp = ifunit_ref(sc->vxl_mc_ifname); if (ifp == NULL) { if_printf(sc->vxl_ifp, "multicast interface %s does " "not exist\n", sc->vxl_mc_ifname); return (ENOENT); } if ((ifp->if_flags & IFF_MULTICAST) == 0) { if_printf(sc->vxl_ifp, "interface %s does not support " "multicast\n", sc->vxl_mc_ifname); if_rele(ifp); return (ENOTSUP); } sc->vxl_mc_ifp = ifp; sc->vxl_mc_ifindex = ifp->if_index; return (0); } static int vxlan_setup_multicast(struct vxlan_softc *sc) { const union vxlan_sockaddr *group; int error; group = &sc->vxl_dst_addr; error = 0; if (sc->vxl_mc_ifname[0] != '\0') { error = vxlan_setup_multicast_interface(sc); if (error) return (error); } /* * Initialize an multicast options structure that is sufficiently * populated for use in the respective IP output routine. This * structure is typically stored in the socket, but our sockets * may be shared among multiple interfaces. */ if (VXLAN_SOCKADDR_IS_IPV4(group)) { sc->vxl_im4o = malloc(sizeof(struct ip_moptions), M_VXLAN, M_ZERO | M_WAITOK); sc->vxl_im4o->imo_multicast_ifp = sc->vxl_mc_ifp; sc->vxl_im4o->imo_multicast_ttl = sc->vxl_ttl; sc->vxl_im4o->imo_multicast_vif = -1; } else if (VXLAN_SOCKADDR_IS_IPV6(group)) { sc->vxl_im6o = malloc(sizeof(struct ip6_moptions), M_VXLAN, M_ZERO | M_WAITOK); sc->vxl_im6o->im6o_multicast_ifp = sc->vxl_mc_ifp; sc->vxl_im6o->im6o_multicast_hlim = sc->vxl_ttl; } return (error); } static int vxlan_setup_socket(struct vxlan_softc *sc) { struct vxlan_socket *vso; struct ifnet *ifp; union vxlan_sockaddr *saddr, *daddr; int multicast, error; vso = NULL; ifp = sc->vxl_ifp; saddr = &sc->vxl_src_addr; daddr = &sc->vxl_dst_addr; multicast = vxlan_sockaddr_in_multicast(daddr); MPASS(multicast != -1); sc->vxl_vso_mc_index = -1; /* * Try to create the socket. If that fails, attempt to use an * existing socket. */ error = vxlan_socket_create(ifp, multicast, saddr, &vso); if (error) { if (multicast != 0) vso = vxlan_socket_mc_lookup(saddr); else vso = vxlan_socket_lookup(saddr); if (vso == NULL) { if_printf(ifp, "cannot create socket (error: %d), " "and no existing socket found\n", error); goto out; } } if (multicast != 0) { error = vxlan_setup_multicast(sc); if (error) goto out; error = vxlan_socket_mc_add_group(vso, daddr, saddr, sc->vxl_mc_ifindex, &sc->vxl_vso_mc_index); if (error) goto out; } sc->vxl_sock = vso; error = vxlan_socket_insert_softc(vso, sc); if (error) { sc->vxl_sock = NULL; if_printf(ifp, "network identifier %d already exists in " "this socket\n", sc->vxl_vni); goto out; } return (0); out: if (vso != NULL) { if (sc->vxl_vso_mc_index != -1) { vxlan_socket_mc_release_group_by_idx(vso, sc->vxl_vso_mc_index); sc->vxl_vso_mc_index = -1; } if (multicast != 0) vxlan_free_multicast(sc); vxlan_socket_release(vso); } return (error); } static void vxlan_setup_interface(struct vxlan_softc *sc) { struct ifnet *ifp; ifp = sc->vxl_ifp; ifp->if_hdrlen = ETHER_HDR_LEN + sizeof(struct vxlanudphdr); if (VXLAN_SOCKADDR_IS_IPV4(&sc->vxl_dst_addr) != 0) ifp->if_hdrlen += sizeof(struct ip); else if (VXLAN_SOCKADDR_IS_IPV6(&sc->vxl_dst_addr) != 0) ifp->if_hdrlen += sizeof(struct ip6_hdr); } static int vxlan_valid_init_config(struct vxlan_softc *sc) { const char *reason; if (vxlan_check_vni(sc->vxl_vni) != 0) { reason = "invalid virtual network identifier specified"; goto fail; } if (vxlan_sockaddr_supported(&sc->vxl_src_addr, 1) == 0) { reason = "source address type is not supported"; goto fail; } if (vxlan_sockaddr_supported(&sc->vxl_dst_addr, 0) == 0) { reason = "destination address type is not supported"; goto fail; } if (vxlan_sockaddr_in_any(&sc->vxl_dst_addr) != 0) { reason = "no valid destination address specified"; goto fail; } if (vxlan_sockaddr_in_multicast(&sc->vxl_dst_addr) == 0 && sc->vxl_mc_ifname[0] != '\0') { reason = "can only specify interface with a group address"; goto fail; } if (vxlan_sockaddr_in_any(&sc->vxl_src_addr) == 0) { if (VXLAN_SOCKADDR_IS_IPV4(&sc->vxl_src_addr) ^ VXLAN_SOCKADDR_IS_IPV4(&sc->vxl_dst_addr)) { reason = "source and destination address must both " "be either IPv4 or IPv6"; goto fail; } } if (sc->vxl_src_addr.in4.sin_port == 0) { reason = "local port not specified"; goto fail; } if (sc->vxl_dst_addr.in4.sin_port == 0) { reason = "remote port not specified"; goto fail; } return (0); fail: if_printf(sc->vxl_ifp, "cannot initialize interface: %s\n", reason); return (EINVAL); } static void vxlan_init_wait(struct vxlan_softc *sc) { VXLAN_LOCK_WASSERT(sc); while (sc->vxl_flags & VXLAN_FLAG_INIT) rm_sleep(sc, &sc->vxl_lock, 0, "vxlint", hz); } static void vxlan_init_complete(struct vxlan_softc *sc) { VXLAN_WLOCK(sc); sc->vxl_flags &= ~VXLAN_FLAG_INIT; wakeup(sc); VXLAN_WUNLOCK(sc); } static void vxlan_init(void *xsc) { static const uint8_t empty_mac[ETHER_ADDR_LEN]; struct vxlan_softc *sc; struct ifnet *ifp; sc = xsc; ifp = sc->vxl_ifp; VXLAN_WLOCK(sc); if (ifp->if_drv_flags & IFF_DRV_RUNNING) { VXLAN_WUNLOCK(sc); return; } sc->vxl_flags |= VXLAN_FLAG_INIT; VXLAN_WUNLOCK(sc); if (vxlan_valid_init_config(sc) != 0) goto out; vxlan_setup_interface(sc); if (vxlan_setup_socket(sc) != 0) goto out; /* Initialize the default forwarding entry. */ vxlan_ftable_entry_init(sc, &sc->vxl_default_fe, empty_mac, &sc->vxl_dst_addr.sa, VXLAN_FE_FLAG_STATIC); VXLAN_WLOCK(sc); ifp->if_drv_flags |= IFF_DRV_RUNNING; callout_reset(&sc->vxl_callout, vxlan_ftable_prune_period * hz, vxlan_timer, sc); VXLAN_WUNLOCK(sc); if_link_state_change(ifp, LINK_STATE_UP); out: vxlan_init_complete(sc); } static void vxlan_release(struct vxlan_softc *sc) { /* * The softc may be destroyed as soon as we release our reference, * so we cannot serialize the wakeup with the softc lock. We use a * timeout in our sleeps so a missed wakeup is unfortunate but not * fatal. */ if (VXLAN_RELEASE(sc) != 0) wakeup(sc); } static void vxlan_teardown_wait(struct vxlan_softc *sc) { VXLAN_LOCK_WASSERT(sc); while (sc->vxl_flags & VXLAN_FLAG_TEARDOWN) rm_sleep(sc, &sc->vxl_lock, 0, "vxltrn", hz); } static void vxlan_teardown_complete(struct vxlan_softc *sc) { VXLAN_WLOCK(sc); sc->vxl_flags &= ~VXLAN_FLAG_TEARDOWN; wakeup(sc); VXLAN_WUNLOCK(sc); } static void vxlan_teardown_locked(struct vxlan_softc *sc) { struct ifnet *ifp; struct vxlan_socket *vso; ifp = sc->vxl_ifp; VXLAN_LOCK_WASSERT(sc); MPASS(sc->vxl_flags & VXLAN_FLAG_TEARDOWN); ifp->if_flags &= ~IFF_UP; ifp->if_drv_flags &= ~IFF_DRV_RUNNING; callout_stop(&sc->vxl_callout); vso = sc->vxl_sock; sc->vxl_sock = NULL; VXLAN_WUNLOCK(sc); if_link_state_change(ifp, LINK_STATE_DOWN); if (vso != NULL) { vxlan_socket_remove_softc(vso, sc); if (sc->vxl_vso_mc_index != -1) { vxlan_socket_mc_release_group_by_idx(vso, sc->vxl_vso_mc_index); sc->vxl_vso_mc_index = -1; } } VXLAN_WLOCK(sc); while (sc->vxl_refcnt != 0) rm_sleep(sc, &sc->vxl_lock, 0, "vxldrn", hz); VXLAN_WUNLOCK(sc); callout_drain(&sc->vxl_callout); vxlan_free_multicast(sc); if (vso != NULL) vxlan_socket_release(vso); vxlan_teardown_complete(sc); } static void vxlan_teardown(struct vxlan_softc *sc) { VXLAN_WLOCK(sc); if (sc->vxl_flags & VXLAN_FLAG_TEARDOWN) { vxlan_teardown_wait(sc); VXLAN_WUNLOCK(sc); return; } sc->vxl_flags |= VXLAN_FLAG_TEARDOWN; vxlan_teardown_locked(sc); } static void vxlan_ifdetach(struct vxlan_softc *sc, struct ifnet *ifp, struct vxlan_softc_head *list) { VXLAN_WLOCK(sc); if (sc->vxl_mc_ifp != ifp) goto out; if (sc->vxl_flags & VXLAN_FLAG_TEARDOWN) goto out; sc->vxl_flags |= VXLAN_FLAG_TEARDOWN; LIST_INSERT_HEAD(list, sc, vxl_ifdetach_list); out: VXLAN_WUNLOCK(sc); } static void vxlan_timer(void *xsc) { struct vxlan_softc *sc; sc = xsc; VXLAN_LOCK_WASSERT(sc); vxlan_ftable_expire(sc); callout_schedule(&sc->vxl_callout, vxlan_ftable_prune_period * hz); } static int vxlan_ioctl_ifflags(struct vxlan_softc *sc) { struct ifnet *ifp; ifp = sc->vxl_ifp; if (ifp->if_flags & IFF_UP) { if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) vxlan_init(sc); } else { if (ifp->if_drv_flags & IFF_DRV_RUNNING) vxlan_teardown(sc); } return (0); } static int vxlan_ctrl_get_config(struct vxlan_softc *sc, void *arg) { struct rm_priotracker tracker; struct ifvxlancfg *cfg; cfg = arg; bzero(cfg, sizeof(*cfg)); VXLAN_RLOCK(sc, &tracker); cfg->vxlc_vni = sc->vxl_vni; memcpy(&cfg->vxlc_local_sa, &sc->vxl_src_addr, sizeof(union vxlan_sockaddr)); memcpy(&cfg->vxlc_remote_sa, &sc->vxl_dst_addr, sizeof(union vxlan_sockaddr)); cfg->vxlc_mc_ifindex = sc->vxl_mc_ifindex; cfg->vxlc_ftable_cnt = sc->vxl_ftable_cnt; cfg->vxlc_ftable_max = sc->vxl_ftable_max; cfg->vxlc_ftable_timeout = sc->vxl_ftable_timeout; cfg->vxlc_port_min = sc->vxl_min_port; cfg->vxlc_port_max = sc->vxl_max_port; cfg->vxlc_learn = (sc->vxl_flags & VXLAN_FLAG_LEARN) != 0; cfg->vxlc_ttl = sc->vxl_ttl; VXLAN_RUNLOCK(sc, &tracker); #ifdef INET6 if (VXLAN_SOCKADDR_IS_IPV6(&cfg->vxlc_local_sa)) sa6_recoverscope(&cfg->vxlc_local_sa.in6); if (VXLAN_SOCKADDR_IS_IPV6(&cfg->vxlc_remote_sa)) sa6_recoverscope(&cfg->vxlc_remote_sa.in6); #endif return (0); } static int vxlan_ctrl_set_vni(struct vxlan_softc *sc, void *arg) { struct ifvxlancmd *cmd; int error; cmd = arg; if (vxlan_check_vni(cmd->vxlcmd_vni) != 0) return (EINVAL); VXLAN_WLOCK(sc); if (vxlan_can_change_config(sc)) { sc->vxl_vni = cmd->vxlcmd_vni; error = 0; } else error = EBUSY; VXLAN_WUNLOCK(sc); return (error); } static int vxlan_ctrl_set_local_addr(struct vxlan_softc *sc, void *arg) { struct ifvxlancmd *cmd; union vxlan_sockaddr *vxlsa; int error; cmd = arg; vxlsa = &cmd->vxlcmd_sa; if (!VXLAN_SOCKADDR_IS_IPV46(vxlsa)) return (EINVAL); if (vxlan_sockaddr_in_multicast(vxlsa) != 0) return (EINVAL); if (VXLAN_SOCKADDR_IS_IPV6(vxlsa)) { error = vxlan_sockaddr_in6_embedscope(vxlsa); if (error) return (error); } VXLAN_WLOCK(sc); if (vxlan_can_change_config(sc)) { vxlan_sockaddr_in_copy(&sc->vxl_src_addr, &vxlsa->sa); error = 0; } else error = EBUSY; VXLAN_WUNLOCK(sc); return (error); } static int vxlan_ctrl_set_remote_addr(struct vxlan_softc *sc, void *arg) { struct ifvxlancmd *cmd; union vxlan_sockaddr *vxlsa; int error; cmd = arg; vxlsa = &cmd->vxlcmd_sa; if (!VXLAN_SOCKADDR_IS_IPV46(vxlsa)) return (EINVAL); if (VXLAN_SOCKADDR_IS_IPV6(vxlsa)) { error = vxlan_sockaddr_in6_embedscope(vxlsa); if (error) return (error); } VXLAN_WLOCK(sc); if (vxlan_can_change_config(sc)) { vxlan_sockaddr_in_copy(&sc->vxl_dst_addr, &vxlsa->sa); error = 0; } else error = EBUSY; VXLAN_WUNLOCK(sc); return (error); } static int vxlan_ctrl_set_local_port(struct vxlan_softc *sc, void *arg) { struct ifvxlancmd *cmd; int error; cmd = arg; if (cmd->vxlcmd_port == 0) return (EINVAL); VXLAN_WLOCK(sc); if (vxlan_can_change_config(sc)) { sc->vxl_src_addr.in4.sin_port = htons(cmd->vxlcmd_port); error = 0; } else error = EBUSY; VXLAN_WUNLOCK(sc); return (error); } static int vxlan_ctrl_set_remote_port(struct vxlan_softc *sc, void *arg) { struct ifvxlancmd *cmd; int error; cmd = arg; if (cmd->vxlcmd_port == 0) return (EINVAL); VXLAN_WLOCK(sc); if (vxlan_can_change_config(sc)) { sc->vxl_dst_addr.in4.sin_port = htons(cmd->vxlcmd_port); error = 0; } else error = EBUSY; VXLAN_WUNLOCK(sc); return (error); } static int vxlan_ctrl_set_port_range(struct vxlan_softc *sc, void *arg) { struct ifvxlancmd *cmd; uint16_t min, max; int error; cmd = arg; min = cmd->vxlcmd_port_min; max = cmd->vxlcmd_port_max; if (max < min) return (EINVAL); VXLAN_WLOCK(sc); if (vxlan_can_change_config(sc)) { sc->vxl_min_port = min; sc->vxl_max_port = max; error = 0; } else error = EBUSY; VXLAN_WUNLOCK(sc); return (error); } static int vxlan_ctrl_set_ftable_timeout(struct vxlan_softc *sc, void *arg) { struct ifvxlancmd *cmd; int error; cmd = arg; VXLAN_WLOCK(sc); if (vxlan_check_ftable_timeout(cmd->vxlcmd_ftable_timeout) == 0) { sc->vxl_ftable_timeout = cmd->vxlcmd_ftable_timeout; error = 0; } else error = EINVAL; VXLAN_WUNLOCK(sc); return (error); } static int vxlan_ctrl_set_ftable_max(struct vxlan_softc *sc, void *arg) { struct ifvxlancmd *cmd; int error; cmd = arg; VXLAN_WLOCK(sc); if (vxlan_check_ftable_max(cmd->vxlcmd_ftable_max) == 0) { sc->vxl_ftable_max = cmd->vxlcmd_ftable_max; error = 0; } else error = EINVAL; VXLAN_WUNLOCK(sc); return (error); } static int vxlan_ctrl_set_multicast_if(struct vxlan_softc * sc, void *arg) { struct ifvxlancmd *cmd; int error; cmd = arg; VXLAN_WLOCK(sc); if (vxlan_can_change_config(sc)) { strlcpy(sc->vxl_mc_ifname, cmd->vxlcmd_ifname, IFNAMSIZ); error = 0; } else error = EBUSY; VXLAN_WUNLOCK(sc); return (error); } static int vxlan_ctrl_set_ttl(struct vxlan_softc *sc, void *arg) { struct ifvxlancmd *cmd; int error; cmd = arg; VXLAN_WLOCK(sc); if (vxlan_check_ttl(cmd->vxlcmd_ttl) == 0) { sc->vxl_ttl = cmd->vxlcmd_ttl; if (sc->vxl_im4o != NULL) sc->vxl_im4o->imo_multicast_ttl = sc->vxl_ttl; if (sc->vxl_im6o != NULL) sc->vxl_im6o->im6o_multicast_hlim = sc->vxl_ttl; error = 0; } else error = EINVAL; VXLAN_WUNLOCK(sc); return (error); } static int vxlan_ctrl_set_learn(struct vxlan_softc *sc, void *arg) { struct ifvxlancmd *cmd; cmd = arg; VXLAN_WLOCK(sc); if (cmd->vxlcmd_flags & VXLAN_CMD_FLAG_LEARN) sc->vxl_flags |= VXLAN_FLAG_LEARN; else sc->vxl_flags &= ~VXLAN_FLAG_LEARN; VXLAN_WUNLOCK(sc); return (0); } static int vxlan_ctrl_ftable_entry_add(struct vxlan_softc *sc, void *arg) { union vxlan_sockaddr vxlsa; struct ifvxlancmd *cmd; struct vxlan_ftable_entry *fe; int error; cmd = arg; vxlsa = cmd->vxlcmd_sa; if (!VXLAN_SOCKADDR_IS_IPV46(&vxlsa)) return (EINVAL); if (vxlan_sockaddr_in_any(&vxlsa) != 0) return (EINVAL); if (vxlan_sockaddr_in_multicast(&vxlsa) != 0) return (EINVAL); /* BMV: We could support both IPv4 and IPv6 later. */ if (vxlsa.sa.sa_family != sc->vxl_dst_addr.sa.sa_family) return (EAFNOSUPPORT); if (VXLAN_SOCKADDR_IS_IPV6(&vxlsa)) { error = vxlan_sockaddr_in6_embedscope(&vxlsa); if (error) return (error); } fe = vxlan_ftable_entry_alloc(); if (fe == NULL) return (ENOMEM); if (vxlsa.in4.sin_port == 0) vxlsa.in4.sin_port = sc->vxl_dst_addr.in4.sin_port; vxlan_ftable_entry_init(sc, fe, cmd->vxlcmd_mac, &vxlsa.sa, VXLAN_FE_FLAG_STATIC); VXLAN_WLOCK(sc); error = vxlan_ftable_entry_insert(sc, fe); VXLAN_WUNLOCK(sc); if (error) vxlan_ftable_entry_free(fe); return (error); } static int vxlan_ctrl_ftable_entry_rem(struct vxlan_softc *sc, void *arg) { struct ifvxlancmd *cmd; struct vxlan_ftable_entry *fe; int error; cmd = arg; VXLAN_WLOCK(sc); fe = vxlan_ftable_entry_lookup(sc, cmd->vxlcmd_mac); if (fe != NULL) { vxlan_ftable_entry_destroy(sc, fe); error = 0; } else error = ENOENT; VXLAN_WUNLOCK(sc); return (error); } static int vxlan_ctrl_flush(struct vxlan_softc *sc, void *arg) { struct ifvxlancmd *cmd; int all; cmd = arg; all = cmd->vxlcmd_flags & VXLAN_CMD_FLAG_FLUSH_ALL; VXLAN_WLOCK(sc); vxlan_ftable_flush(sc, all); VXLAN_WUNLOCK(sc); return (0); } static int vxlan_ioctl_drvspec(struct vxlan_softc *sc, struct ifdrv *ifd, int get) { const struct vxlan_control *vc; union { struct ifvxlancfg cfg; struct ifvxlancmd cmd; } args; int out, error; if (ifd->ifd_cmd >= vxlan_control_table_size) return (EINVAL); bzero(&args, sizeof(args)); vc = &vxlan_control_table[ifd->ifd_cmd]; out = (vc->vxlc_flags & VXLAN_CTRL_FLAG_COPYOUT) != 0; if ((get != 0 && out == 0) || (get == 0 && out != 0)) return (EINVAL); if (vc->vxlc_flags & VXLAN_CTRL_FLAG_SUSER) { error = priv_check(curthread, PRIV_NET_VXLAN); if (error) return (error); } if (ifd->ifd_len != vc->vxlc_argsize || ifd->ifd_len > sizeof(args)) return (EINVAL); if (vc->vxlc_flags & VXLAN_CTRL_FLAG_COPYIN) { error = copyin(ifd->ifd_data, &args, ifd->ifd_len); if (error) return (error); } error = vc->vxlc_func(sc, &args); if (error) return (error); if (vc->vxlc_flags & VXLAN_CTRL_FLAG_COPYOUT) { error = copyout(&args, ifd->ifd_data, ifd->ifd_len); if (error) return (error); } return (0); } static int vxlan_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct vxlan_softc *sc; struct ifreq *ifr; struct ifdrv *ifd; int error; sc = ifp->if_softc; ifr = (struct ifreq *) data; ifd = (struct ifdrv *) data; switch (cmd) { case SIOCADDMULTI: case SIOCDELMULTI: error = 0; break; case SIOCGDRVSPEC: case SIOCSDRVSPEC: error = vxlan_ioctl_drvspec(sc, ifd, cmd == SIOCGDRVSPEC); break; case SIOCSIFFLAGS: error = vxlan_ioctl_ifflags(sc); break; case SIOCSIFMEDIA: case SIOCGIFMEDIA: error = ifmedia_ioctl(ifp, ifr, &sc->vxl_media, cmd); break; default: error = ether_ioctl(ifp, cmd, data); break; } return (error); } #if defined(INET) || defined(INET6) static uint16_t vxlan_pick_source_port(struct vxlan_softc *sc, struct mbuf *m) { int range; uint32_t hash; range = sc->vxl_max_port - sc->vxl_min_port + 1; if (M_HASHTYPE_ISHASH(m)) hash = m->m_pkthdr.flowid; else hash = jenkins_hash(m->m_data, ETHER_HDR_LEN, sc->vxl_port_hash_key); return (sc->vxl_min_port + (hash % range)); } static void vxlan_encap_header(struct vxlan_softc *sc, struct mbuf *m, int ipoff, uint16_t srcport, uint16_t dstport) { struct vxlanudphdr *hdr; struct udphdr *udph; struct vxlan_header *vxh; int len; len = m->m_pkthdr.len - ipoff; MPASS(len >= sizeof(struct vxlanudphdr)); hdr = mtodo(m, ipoff); udph = &hdr->vxlh_udp; udph->uh_sport = srcport; udph->uh_dport = dstport; udph->uh_ulen = htons(len); udph->uh_sum = 0; vxh = &hdr->vxlh_hdr; vxh->vxlh_flags = htonl(VXLAN_HDR_FLAGS_VALID_VNI); vxh->vxlh_vni = htonl(sc->vxl_vni << VXLAN_HDR_VNI_SHIFT); } #endif static int vxlan_encap4(struct vxlan_softc *sc, const union vxlan_sockaddr *fvxlsa, struct mbuf *m) { #ifdef INET struct ifnet *ifp; struct ip *ip; struct in_addr srcaddr, dstaddr; uint16_t srcport, dstport; int len, mcast, error; ifp = sc->vxl_ifp; srcaddr = sc->vxl_src_addr.in4.sin_addr; srcport = vxlan_pick_source_port(sc, m); dstaddr = fvxlsa->in4.sin_addr; dstport = fvxlsa->in4.sin_port; M_PREPEND(m, sizeof(struct ip) + sizeof(struct vxlanudphdr), M_NOWAIT); if (m == NULL) { if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return (ENOBUFS); } len = m->m_pkthdr.len; ip = mtod(m, struct ip *); ip->ip_tos = 0; ip->ip_len = htons(len); ip->ip_off = 0; ip->ip_ttl = sc->vxl_ttl; ip->ip_p = IPPROTO_UDP; ip->ip_sum = 0; ip->ip_src = srcaddr; ip->ip_dst = dstaddr; vxlan_encap_header(sc, m, sizeof(struct ip), srcport, dstport); mcast = (m->m_flags & (M_MCAST | M_BCAST)) ? 1 : 0; m->m_flags &= ~(M_MCAST | M_BCAST); error = ip_output(m, NULL, NULL, 0, sc->vxl_im4o, NULL); if (error == 0) { if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); if_inc_counter(ifp, IFCOUNTER_OBYTES, len); if (mcast != 0) if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1); } else if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return (error); #else m_freem(m); return (ENOTSUP); #endif } static int vxlan_encap6(struct vxlan_softc *sc, const union vxlan_sockaddr *fvxlsa, struct mbuf *m) { #ifdef INET6 struct ifnet *ifp; struct ip6_hdr *ip6; const struct in6_addr *srcaddr, *dstaddr; uint16_t srcport, dstport; int len, mcast, error; ifp = sc->vxl_ifp; srcaddr = &sc->vxl_src_addr.in6.sin6_addr; srcport = vxlan_pick_source_port(sc, m); dstaddr = &fvxlsa->in6.sin6_addr; dstport = fvxlsa->in6.sin6_port; M_PREPEND(m, sizeof(struct ip6_hdr) + sizeof(struct vxlanudphdr), M_NOWAIT); if (m == NULL) { if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return (ENOBUFS); } len = m->m_pkthdr.len; ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_flow = 0; /* BMV: Keep in forwarding entry? */ ip6->ip6_vfc = IPV6_VERSION; ip6->ip6_plen = 0; ip6->ip6_nxt = IPPROTO_UDP; ip6->ip6_hlim = sc->vxl_ttl; ip6->ip6_src = *srcaddr; ip6->ip6_dst = *dstaddr; vxlan_encap_header(sc, m, sizeof(struct ip6_hdr), srcport, dstport); /* * XXX BMV We need support for RFC6935 before we can send and * receive IPv6 UDP packets with a zero checksum. */ { struct udphdr *hdr = mtodo(m, sizeof(struct ip6_hdr)); hdr->uh_sum = in6_cksum_pseudo(ip6, m->m_pkthdr.len - sizeof(struct ip6_hdr), IPPROTO_UDP, 0); m->m_pkthdr.csum_flags = CSUM_UDP_IPV6; m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); } mcast = (m->m_flags & (M_MCAST | M_BCAST)) ? 1 : 0; m->m_flags &= ~(M_MCAST | M_BCAST); error = ip6_output(m, NULL, NULL, 0, sc->vxl_im6o, NULL, NULL); if (error == 0) { if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); if_inc_counter(ifp, IFCOUNTER_OBYTES, len); if (mcast != 0) if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1); } else if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return (error); #else m_freem(m); return (ENOTSUP); #endif } static int vxlan_transmit(struct ifnet *ifp, struct mbuf *m) { struct rm_priotracker tracker; union vxlan_sockaddr vxlsa; struct vxlan_softc *sc; struct vxlan_ftable_entry *fe; struct ifnet *mcifp; struct ether_header *eh; int ipv4, error; sc = ifp->if_softc; eh = mtod(m, struct ether_header *); fe = NULL; mcifp = NULL; ETHER_BPF_MTAP(ifp, m); VXLAN_RLOCK(sc, &tracker); if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) { VXLAN_RUNLOCK(sc, &tracker); m_freem(m); return (ENETDOWN); } if ((m->m_flags & (M_BCAST | M_MCAST)) == 0) fe = vxlan_ftable_entry_lookup(sc, eh->ether_dhost); if (fe == NULL) fe = &sc->vxl_default_fe; vxlan_sockaddr_copy(&vxlsa, &fe->vxlfe_raddr.sa); ipv4 = VXLAN_SOCKADDR_IS_IPV4(&vxlsa) != 0; if (vxlan_sockaddr_in_multicast(&vxlsa) != 0) mcifp = vxlan_multicast_if_ref(sc, ipv4); VXLAN_ACQUIRE(sc); VXLAN_RUNLOCK(sc, &tracker); if (ipv4 != 0) error = vxlan_encap4(sc, &vxlsa, m); else error = vxlan_encap6(sc, &vxlsa, m); vxlan_release(sc); if (mcifp != NULL) if_rele(mcifp); return (error); } static void vxlan_qflush(struct ifnet *ifp __unused) { } static void vxlan_rcv_udp_packet(struct mbuf *m, int offset, struct inpcb *inpcb, const struct sockaddr *srcsa, void *xvso) { struct vxlan_socket *vso; struct vxlan_header *vxh, vxlanhdr; uint32_t vni; - int error; + int error __unused; M_ASSERTPKTHDR(m); vso = xvso; offset += sizeof(struct udphdr); if (m->m_pkthdr.len < offset + sizeof(struct vxlan_header)) goto out; if (__predict_false(m->m_len < offset + sizeof(struct vxlan_header))) { m_copydata(m, offset, sizeof(struct vxlan_header), (caddr_t) &vxlanhdr); vxh = &vxlanhdr; } else vxh = mtodo(m, offset); /* * Drop if there is a reserved bit set in either the flags or VNI * fields of the header. This goes against the specification, but * a bit set may indicate an unsupported new feature. This matches * the behavior of the Linux implementation. */ if (vxh->vxlh_flags != htonl(VXLAN_HDR_FLAGS_VALID_VNI) || vxh->vxlh_vni & ~htonl(VXLAN_VNI_MASK)) goto out; vni = ntohl(vxh->vxlh_vni) >> VXLAN_HDR_VNI_SHIFT; /* Adjust to the start of the inner Ethernet frame. */ m_adj(m, offset + sizeof(struct vxlan_header)); error = vxlan_input(vso, vni, &m, srcsa); MPASS(error != 0 || m == NULL); out: if (m != NULL) m_freem(m); } static int vxlan_input(struct vxlan_socket *vso, uint32_t vni, struct mbuf **m0, const struct sockaddr *sa) { struct vxlan_softc *sc; struct ifnet *ifp; struct mbuf *m; struct ether_header *eh; int error; sc = vxlan_socket_lookup_softc(vso, vni); if (sc == NULL) return (ENOENT); ifp = sc->vxl_ifp; m = *m0; eh = mtod(m, struct ether_header *); if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) { error = ENETDOWN; goto out; } else if (ifp == m->m_pkthdr.rcvif) { /* XXX Does not catch more complex loops. */ error = EDEADLK; goto out; } if (sc->vxl_flags & VXLAN_FLAG_LEARN) vxlan_ftable_learn(sc, sa, eh->ether_shost); m_clrprotoflags(m); m->m_pkthdr.rcvif = ifp; M_SETFIB(m, ifp->if_fib); error = netisr_queue_src(NETISR_ETHER, 0, m); *m0 = NULL; out: vxlan_release(sc); return (error); } static void vxlan_set_default_config(struct vxlan_softc *sc) { sc->vxl_flags |= VXLAN_FLAG_LEARN; sc->vxl_vni = VXLAN_VNI_MAX; sc->vxl_ttl = IPDEFTTL; if (!vxlan_tunable_int(sc, "legacy_port", vxlan_legacy_port)) { sc->vxl_src_addr.in4.sin_port = htons(VXLAN_PORT); sc->vxl_dst_addr.in4.sin_port = htons(VXLAN_PORT); } else { sc->vxl_src_addr.in4.sin_port = htons(VXLAN_LEGACY_PORT); sc->vxl_dst_addr.in4.sin_port = htons(VXLAN_LEGACY_PORT); } sc->vxl_min_port = V_ipport_firstauto; sc->vxl_max_port = V_ipport_lastauto; sc->vxl_ftable_max = VXLAN_FTABLE_MAX; sc->vxl_ftable_timeout = VXLAN_FTABLE_TIMEOUT; } static int vxlan_set_user_config(struct vxlan_softc *sc, struct ifvxlanparam *vxlp) { #ifndef INET if (vxlp->vxlp_with & (VXLAN_PARAM_WITH_LOCAL_ADDR4 | VXLAN_PARAM_WITH_REMOTE_ADDR4)) return (EAFNOSUPPORT); #endif #ifndef INET6 if (vxlp->vxlp_with & (VXLAN_PARAM_WITH_LOCAL_ADDR6 | VXLAN_PARAM_WITH_REMOTE_ADDR6)) return (EAFNOSUPPORT); #else if (vxlp->vxlp_with & VXLAN_PARAM_WITH_LOCAL_ADDR6) { int error = vxlan_sockaddr_in6_embedscope(&vxlp->vxlp_local_sa); if (error) return (error); } if (vxlp->vxlp_with & VXLAN_PARAM_WITH_REMOTE_ADDR6) { int error = vxlan_sockaddr_in6_embedscope( &vxlp->vxlp_remote_sa); if (error) return (error); } #endif if (vxlp->vxlp_with & VXLAN_PARAM_WITH_VNI) { if (vxlan_check_vni(vxlp->vxlp_vni) == 0) sc->vxl_vni = vxlp->vxlp_vni; } if (vxlp->vxlp_with & VXLAN_PARAM_WITH_LOCAL_ADDR4) { sc->vxl_src_addr.in4.sin_len = sizeof(struct sockaddr_in); sc->vxl_src_addr.in4.sin_family = AF_INET; sc->vxl_src_addr.in4.sin_addr = vxlp->vxlp_local_sa.in4.sin_addr; } else if (vxlp->vxlp_with & VXLAN_PARAM_WITH_LOCAL_ADDR6) { sc->vxl_src_addr.in6.sin6_len = sizeof(struct sockaddr_in6); sc->vxl_src_addr.in6.sin6_family = AF_INET6; sc->vxl_src_addr.in6.sin6_addr = vxlp->vxlp_local_sa.in6.sin6_addr; } if (vxlp->vxlp_with & VXLAN_PARAM_WITH_REMOTE_ADDR4) { sc->vxl_dst_addr.in4.sin_len = sizeof(struct sockaddr_in); sc->vxl_dst_addr.in4.sin_family = AF_INET; sc->vxl_dst_addr.in4.sin_addr = vxlp->vxlp_remote_sa.in4.sin_addr; } else if (vxlp->vxlp_with & VXLAN_PARAM_WITH_REMOTE_ADDR6) { sc->vxl_dst_addr.in6.sin6_len = sizeof(struct sockaddr_in6); sc->vxl_dst_addr.in6.sin6_family = AF_INET6; sc->vxl_dst_addr.in6.sin6_addr = vxlp->vxlp_remote_sa.in6.sin6_addr; } if (vxlp->vxlp_with & VXLAN_PARAM_WITH_LOCAL_PORT) sc->vxl_src_addr.in4.sin_port = htons(vxlp->vxlp_local_port); if (vxlp->vxlp_with & VXLAN_PARAM_WITH_REMOTE_PORT) sc->vxl_dst_addr.in4.sin_port = htons(vxlp->vxlp_remote_port); if (vxlp->vxlp_with & VXLAN_PARAM_WITH_PORT_RANGE) { if (vxlp->vxlp_min_port <= vxlp->vxlp_max_port) { sc->vxl_min_port = vxlp->vxlp_min_port; sc->vxl_max_port = vxlp->vxlp_max_port; } } if (vxlp->vxlp_with & VXLAN_PARAM_WITH_MULTICAST_IF) strlcpy(sc->vxl_mc_ifname, vxlp->vxlp_mc_ifname, IFNAMSIZ); if (vxlp->vxlp_with & VXLAN_PARAM_WITH_FTABLE_TIMEOUT) { if (vxlan_check_ftable_timeout(vxlp->vxlp_ftable_timeout) == 0) sc->vxl_ftable_timeout = vxlp->vxlp_ftable_timeout; } if (vxlp->vxlp_with & VXLAN_PARAM_WITH_FTABLE_MAX) { if (vxlan_check_ftable_max(vxlp->vxlp_ftable_max) == 0) sc->vxl_ftable_max = vxlp->vxlp_ftable_max; } if (vxlp->vxlp_with & VXLAN_PARAM_WITH_TTL) { if (vxlan_check_ttl(vxlp->vxlp_ttl) == 0) sc->vxl_ttl = vxlp->vxlp_ttl; } if (vxlp->vxlp_with & VXLAN_PARAM_WITH_LEARN) { if (vxlp->vxlp_learn == 0) sc->vxl_flags &= ~VXLAN_FLAG_LEARN; } return (0); } static int vxlan_clone_create(struct if_clone *ifc, int unit, caddr_t params) { struct vxlan_softc *sc; struct ifnet *ifp; struct ifvxlanparam vxlp; int error; sc = malloc(sizeof(struct vxlan_softc), M_VXLAN, M_WAITOK | M_ZERO); sc->vxl_unit = unit; vxlan_set_default_config(sc); if (params != 0) { error = copyin(params, &vxlp, sizeof(vxlp)); if (error) goto fail; error = vxlan_set_user_config(sc, &vxlp); if (error) goto fail; } ifp = if_alloc(IFT_ETHER); if (ifp == NULL) { error = ENOSPC; goto fail; } sc->vxl_ifp = ifp; rm_init(&sc->vxl_lock, "vxlanrm"); callout_init_rw(&sc->vxl_callout, &sc->vxl_lock, 0); sc->vxl_port_hash_key = arc4random(); vxlan_ftable_init(sc); vxlan_sysctl_setup(sc); ifp->if_softc = sc; if_initname(ifp, vxlan_name, unit); ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; ifp->if_init = vxlan_init; ifp->if_ioctl = vxlan_ioctl; ifp->if_transmit = vxlan_transmit; ifp->if_qflush = vxlan_qflush; ifp->if_capabilities |= IFCAP_LINKSTATE; ifp->if_capenable |= IFCAP_LINKSTATE; ifmedia_init(&sc->vxl_media, 0, vxlan_media_change, vxlan_media_status); ifmedia_add(&sc->vxl_media, IFM_ETHER | IFM_AUTO, 0, NULL); ifmedia_set(&sc->vxl_media, IFM_ETHER | IFM_AUTO); vxlan_fakeaddr(sc); ether_ifattach(ifp, sc->vxl_hwaddr); ifp->if_baudrate = 0; ifp->if_hdrlen = 0; return (0); fail: free(sc, M_VXLAN); return (error); } static void vxlan_clone_destroy(struct ifnet *ifp) { struct vxlan_softc *sc; sc = ifp->if_softc; vxlan_teardown(sc); vxlan_ftable_flush(sc, 1); ether_ifdetach(ifp); if_free(ifp); ifmedia_removeall(&sc->vxl_media); vxlan_ftable_fini(sc); vxlan_sysctl_destroy(sc); rm_destroy(&sc->vxl_lock); free(sc, M_VXLAN); } /* BMV: Taken from if_bridge. */ static uint32_t vxlan_mac_hash(struct vxlan_softc *sc, const uint8_t *addr) { uint32_t a = 0x9e3779b9, b = 0x9e3779b9, c = sc->vxl_ftable_hash_key; b += addr[5] << 8; b += addr[4]; a += addr[3] << 24; a += addr[2] << 16; a += addr[1] << 8; a += addr[0]; /* * The following hash function is adapted from "Hash Functions" by Bob Jenkins * ("Algorithm Alley", Dr. Dobbs Journal, September 1997). */ #define mix(a, b, c) \ do { \ a -= b; a -= c; a ^= (c >> 13); \ b -= c; b -= a; b ^= (a << 8); \ c -= a; c -= b; c ^= (b >> 13); \ a -= b; a -= c; a ^= (c >> 12); \ b -= c; b -= a; b ^= (a << 16); \ c -= a; c -= b; c ^= (b >> 5); \ a -= b; a -= c; a ^= (c >> 3); \ b -= c; b -= a; b ^= (a << 10); \ c -= a; c -= b; c ^= (b >> 15); \ } while (0) mix(a, b, c); #undef mix return (c); } static void vxlan_fakeaddr(struct vxlan_softc *sc) { /* * Generate a non-multicast, locally administered address. * * BMV: Should we use the FreeBSD OUI range instead? */ arc4rand(sc->vxl_hwaddr, ETHER_ADDR_LEN, 1); sc->vxl_hwaddr[0] &= ~1; sc->vxl_hwaddr[0] |= 2; } static int vxlan_media_change(struct ifnet *ifp) { /* Ignore. */ return (0); } static void vxlan_media_status(struct ifnet *ifp, struct ifmediareq *ifmr) { ifmr->ifm_status = IFM_ACTIVE | IFM_AVALID; ifmr->ifm_active = IFM_ETHER | IFM_FDX; } static int vxlan_sockaddr_cmp(const union vxlan_sockaddr *vxladdr, const struct sockaddr *sa) { return (bcmp(&vxladdr->sa, sa, vxladdr->sa.sa_len)); } static void vxlan_sockaddr_copy(union vxlan_sockaddr *vxladdr, const struct sockaddr *sa) { MPASS(sa->sa_family == AF_INET || sa->sa_family == AF_INET6); bzero(vxladdr, sizeof(*vxladdr)); if (sa->sa_family == AF_INET) { vxladdr->in4 = *satoconstsin(sa); vxladdr->in4.sin_len = sizeof(struct sockaddr_in); } else if (sa->sa_family == AF_INET6) { vxladdr->in6 = *satoconstsin6(sa); vxladdr->in6.sin6_len = sizeof(struct sockaddr_in6); } } static int vxlan_sockaddr_in_equal(const union vxlan_sockaddr *vxladdr, const struct sockaddr *sa) { int equal; if (sa->sa_family == AF_INET) { const struct in_addr *in4 = &satoconstsin(sa)->sin_addr; equal = in4->s_addr == vxladdr->in4.sin_addr.s_addr; } else if (sa->sa_family == AF_INET6) { const struct in6_addr *in6 = &satoconstsin6(sa)->sin6_addr; equal = IN6_ARE_ADDR_EQUAL(in6, &vxladdr->in6.sin6_addr); } else equal = 0; return (equal); } static void vxlan_sockaddr_in_copy(union vxlan_sockaddr *vxladdr, const struct sockaddr *sa) { MPASS(sa->sa_family == AF_INET || sa->sa_family == AF_INET6); if (sa->sa_family == AF_INET) { const struct in_addr *in4 = &satoconstsin(sa)->sin_addr; vxladdr->in4.sin_family = AF_INET; vxladdr->in4.sin_len = sizeof(struct sockaddr_in); vxladdr->in4.sin_addr = *in4; } else if (sa->sa_family == AF_INET6) { const struct in6_addr *in6 = &satoconstsin6(sa)->sin6_addr; vxladdr->in6.sin6_family = AF_INET6; vxladdr->in6.sin6_len = sizeof(struct sockaddr_in6); vxladdr->in6.sin6_addr = *in6; } } static int vxlan_sockaddr_supported(const union vxlan_sockaddr *vxladdr, int unspec) { const struct sockaddr *sa; int supported; sa = &vxladdr->sa; supported = 0; if (sa->sa_family == AF_UNSPEC && unspec != 0) { supported = 1; } else if (sa->sa_family == AF_INET) { #ifdef INET supported = 1; #endif } else if (sa->sa_family == AF_INET6) { #ifdef INET6 supported = 1; #endif } return (supported); } static int vxlan_sockaddr_in_any(const union vxlan_sockaddr *vxladdr) { const struct sockaddr *sa; int any; sa = &vxladdr->sa; if (sa->sa_family == AF_INET) { const struct in_addr *in4 = &satoconstsin(sa)->sin_addr; any = in4->s_addr == INADDR_ANY; } else if (sa->sa_family == AF_INET6) { const struct in6_addr *in6 = &satoconstsin6(sa)->sin6_addr; any = IN6_IS_ADDR_UNSPECIFIED(in6); } else any = -1; return (any); } static int vxlan_sockaddr_in_multicast(const union vxlan_sockaddr *vxladdr) { const struct sockaddr *sa; int mc; sa = &vxladdr->sa; if (sa->sa_family == AF_INET) { const struct in_addr *in4 = &satoconstsin(sa)->sin_addr; mc = IN_MULTICAST(ntohl(in4->s_addr)); } else if (sa->sa_family == AF_INET6) { const struct in6_addr *in6 = &satoconstsin6(sa)->sin6_addr; mc = IN6_IS_ADDR_MULTICAST(in6); } else mc = -1; return (mc); } static int vxlan_sockaddr_in6_embedscope(union vxlan_sockaddr *vxladdr) { int error; MPASS(VXLAN_SOCKADDR_IS_IPV6(vxladdr)); #ifdef INET6 error = sa6_embedscope(&vxladdr->in6, V_ip6_use_defzone); #else error = EAFNOSUPPORT; #endif return (error); } static int vxlan_can_change_config(struct vxlan_softc *sc) { struct ifnet *ifp; ifp = sc->vxl_ifp; VXLAN_LOCK_ASSERT(sc); if (ifp->if_drv_flags & IFF_DRV_RUNNING) return (0); if (sc->vxl_flags & (VXLAN_FLAG_INIT | VXLAN_FLAG_TEARDOWN)) return (0); return (1); } static int vxlan_check_vni(uint32_t vni) { return (vni >= VXLAN_VNI_MAX); } static int vxlan_check_ttl(int ttl) { return (ttl > MAXTTL); } static int vxlan_check_ftable_timeout(uint32_t timeout) { return (timeout > VXLAN_FTABLE_MAX_TIMEOUT); } static int vxlan_check_ftable_max(uint32_t max) { return (max > VXLAN_FTABLE_MAX); } static void vxlan_sysctl_setup(struct vxlan_softc *sc) { struct sysctl_ctx_list *ctx; struct sysctl_oid *node; struct vxlan_statistics *stats; char namebuf[8]; ctx = &sc->vxl_sysctl_ctx; stats = &sc->vxl_stats; snprintf(namebuf, sizeof(namebuf), "%d", sc->vxl_unit); sysctl_ctx_init(ctx); sc->vxl_sysctl_node = SYSCTL_ADD_NODE(ctx, SYSCTL_STATIC_CHILDREN(_net_link_vxlan), OID_AUTO, namebuf, CTLFLAG_RD, NULL, ""); node = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(sc->vxl_sysctl_node), OID_AUTO, "ftable", CTLFLAG_RD, NULL, ""); SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(node), OID_AUTO, "count", CTLFLAG_RD, &sc->vxl_ftable_cnt, 0, "Number of entries in fowarding table"); SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(node), OID_AUTO, "max", CTLFLAG_RD, &sc->vxl_ftable_max, 0, "Maximum number of entries allowed in fowarding table"); SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(node), OID_AUTO, "timeout", CTLFLAG_RD, &sc->vxl_ftable_timeout, 0, "Number of seconds between prunes of the forwarding table"); SYSCTL_ADD_PROC(ctx, SYSCTL_CHILDREN(node), OID_AUTO, "dump", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_SKIP, sc, 0, vxlan_ftable_sysctl_dump, "A", "Dump the forwarding table entries"); node = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(sc->vxl_sysctl_node), OID_AUTO, "stats", CTLFLAG_RD, NULL, ""); SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(node), OID_AUTO, "ftable_nospace", CTLFLAG_RD, &stats->ftable_nospace, 0, "Fowarding table reached maximum entries"); SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(node), OID_AUTO, "ftable_lock_upgrade_failed", CTLFLAG_RD, &stats->ftable_lock_upgrade_failed, 0, "Forwarding table update required lock upgrade"); } static void vxlan_sysctl_destroy(struct vxlan_softc *sc) { sysctl_ctx_free(&sc->vxl_sysctl_ctx); sc->vxl_sysctl_node = NULL; } static int vxlan_tunable_int(struct vxlan_softc *sc, const char *knob, int def) { char path[64]; snprintf(path, sizeof(path), "net.link.vxlan.%d.%s", sc->vxl_unit, knob); TUNABLE_INT_FETCH(path, &def); return (def); } static void vxlan_ifdetach_event(void *arg __unused, struct ifnet *ifp) { struct vxlan_softc_head list; struct vxlan_socket *vso; struct vxlan_softc *sc, *tsc; LIST_INIT(&list); if (ifp->if_flags & IFF_RENAMING) return; if ((ifp->if_flags & IFF_MULTICAST) == 0) return; VXLAN_LIST_LOCK(); LIST_FOREACH(vso, &vxlan_socket_list, vxlso_entry) vxlan_socket_ifdetach(vso, ifp, &list); VXLAN_LIST_UNLOCK(); LIST_FOREACH_SAFE(sc, &list, vxl_ifdetach_list, tsc) { LIST_REMOVE(sc, vxl_ifdetach_list); VXLAN_WLOCK(sc); if (sc->vxl_flags & VXLAN_FLAG_INIT) vxlan_init_wait(sc); vxlan_teardown_locked(sc); } } static void vxlan_load(void) { mtx_init(&vxlan_list_mtx, "vxlan list", NULL, MTX_DEF); LIST_INIT(&vxlan_socket_list); vxlan_ifdetach_event_tag = EVENTHANDLER_REGISTER(ifnet_departure_event, vxlan_ifdetach_event, NULL, EVENTHANDLER_PRI_ANY); vxlan_cloner = if_clone_simple(vxlan_name, vxlan_clone_create, vxlan_clone_destroy, 0); } static void vxlan_unload(void) { EVENTHANDLER_DEREGISTER(ifnet_departure_event, vxlan_ifdetach_event_tag); if_clone_detach(vxlan_cloner); mtx_destroy(&vxlan_list_mtx); MPASS(LIST_EMPTY(&vxlan_socket_list)); } static int vxlan_modevent(module_t mod, int type, void *unused) { int error; error = 0; switch (type) { case MOD_LOAD: vxlan_load(); break; case MOD_UNLOAD: vxlan_unload(); break; default: error = ENOTSUP; break; } return (error); } static moduledata_t vxlan_mod = { "if_vxlan", vxlan_modevent, 0 }; DECLARE_MODULE(if_vxlan, vxlan_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); MODULE_VERSION(if_vxlan, 1); Index: head/sys/net/iflib.c =================================================================== --- head/sys/net/iflib.c (revision 333863) +++ head/sys/net/iflib.c (revision 333864) @@ -1,6426 +1,6421 @@ /*- * Copyright (c) 2014-2018, Matthew Macy * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Neither the name of Matthew Macy nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_acpi.h" #include "opt_sched.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "ifdi_if.h" #if defined(__i386__) || defined(__amd64__) #include #include #include #include #include #include #endif #include /* * enable accounting of every mbuf as it comes in to and goes out of * iflib's software descriptor references */ #define MEMORY_LOGGING 0 /* * Enable mbuf vectors for compressing long mbuf chains */ /* * NB: * - Prefetching in tx cleaning should perhaps be a tunable. The distance ahead * we prefetch needs to be determined by the time spent in m_free vis a vis * the cost of a prefetch. This will of course vary based on the workload: * - NFLX's m_free path is dominated by vm-based M_EXT manipulation which * is quite expensive, thus suggesting very little prefetch. * - small packet forwarding which is just returning a single mbuf to * UMA will typically be very fast vis a vis the cost of a memory * access. */ /* * File organization: * - private structures * - iflib private utility functions * - ifnet functions * - vlan registry and other exported functions * - iflib public core functions * * */ MALLOC_DEFINE(M_IFLIB, "iflib", "ifnet library"); struct iflib_txq; typedef struct iflib_txq *iflib_txq_t; struct iflib_rxq; typedef struct iflib_rxq *iflib_rxq_t; struct iflib_fl; typedef struct iflib_fl *iflib_fl_t; struct iflib_ctx; static void iru_init(if_rxd_update_t iru, iflib_rxq_t rxq, uint8_t flid); typedef struct iflib_filter_info { driver_filter_t *ifi_filter; void *ifi_filter_arg; struct grouptask *ifi_task; void *ifi_ctx; } *iflib_filter_info_t; struct iflib_ctx { KOBJ_FIELDS; /* * Pointer to hardware driver's softc */ void *ifc_softc; device_t ifc_dev; if_t ifc_ifp; cpuset_t ifc_cpus; if_shared_ctx_t ifc_sctx; struct if_softc_ctx ifc_softc_ctx; struct sx ifc_ctx_sx; struct mtx ifc_state_mtx; uint16_t ifc_nhwtxqs; iflib_txq_t ifc_txqs; iflib_rxq_t ifc_rxqs; uint32_t ifc_if_flags; uint32_t ifc_flags; uint32_t ifc_max_fl_buf_size; int ifc_in_detach; int ifc_link_state; int ifc_link_irq; int ifc_watchdog_events; struct cdev *ifc_led_dev; struct resource *ifc_msix_mem; struct if_irq ifc_legacy_irq; struct grouptask ifc_admin_task; struct grouptask ifc_vflr_task; struct iflib_filter_info ifc_filter_info; struct ifmedia ifc_media; struct sysctl_oid *ifc_sysctl_node; uint16_t ifc_sysctl_ntxqs; uint16_t ifc_sysctl_nrxqs; uint16_t ifc_sysctl_qs_eq_override; uint16_t ifc_sysctl_rx_budget; qidx_t ifc_sysctl_ntxds[8]; qidx_t ifc_sysctl_nrxds[8]; struct if_txrx ifc_txrx; #define isc_txd_encap ifc_txrx.ift_txd_encap #define isc_txd_flush ifc_txrx.ift_txd_flush #define isc_txd_credits_update ifc_txrx.ift_txd_credits_update #define isc_rxd_available ifc_txrx.ift_rxd_available #define isc_rxd_pkt_get ifc_txrx.ift_rxd_pkt_get #define isc_rxd_refill ifc_txrx.ift_rxd_refill #define isc_rxd_flush ifc_txrx.ift_rxd_flush #define isc_rxd_refill ifc_txrx.ift_rxd_refill #define isc_rxd_refill ifc_txrx.ift_rxd_refill #define isc_legacy_intr ifc_txrx.ift_legacy_intr eventhandler_tag ifc_vlan_attach_event; eventhandler_tag ifc_vlan_detach_event; uint8_t ifc_mac[ETHER_ADDR_LEN]; char ifc_mtx_name[16]; }; void * iflib_get_softc(if_ctx_t ctx) { return (ctx->ifc_softc); } device_t iflib_get_dev(if_ctx_t ctx) { return (ctx->ifc_dev); } if_t iflib_get_ifp(if_ctx_t ctx) { return (ctx->ifc_ifp); } struct ifmedia * iflib_get_media(if_ctx_t ctx) { return (&ctx->ifc_media); } uint32_t iflib_get_flags(if_ctx_t ctx) { return (ctx->ifc_flags); } void iflib_set_detach(if_ctx_t ctx) { ctx->ifc_in_detach = 1; } void iflib_set_mac(if_ctx_t ctx, uint8_t mac[ETHER_ADDR_LEN]) { bcopy(mac, ctx->ifc_mac, ETHER_ADDR_LEN); } if_softc_ctx_t iflib_get_softc_ctx(if_ctx_t ctx) { return (&ctx->ifc_softc_ctx); } if_shared_ctx_t iflib_get_sctx(if_ctx_t ctx) { return (ctx->ifc_sctx); } #define IP_ALIGNED(m) ((((uintptr_t)(m)->m_data) & 0x3) == 0x2) #define CACHE_PTR_INCREMENT (CACHE_LINE_SIZE/sizeof(void*)) #define CACHE_PTR_NEXT(ptr) ((void *)(((uintptr_t)(ptr)+CACHE_LINE_SIZE-1) & (CACHE_LINE_SIZE-1))) #define LINK_ACTIVE(ctx) ((ctx)->ifc_link_state == LINK_STATE_UP) #define CTX_IS_VF(ctx) ((ctx)->ifc_sctx->isc_flags & IFLIB_IS_VF) #define RX_SW_DESC_MAP_CREATED (1 << 0) #define TX_SW_DESC_MAP_CREATED (1 << 1) #define RX_SW_DESC_INUSE (1 << 3) #define TX_SW_DESC_MAPPED (1 << 4) #define M_TOOBIG M_PROTO1 typedef struct iflib_sw_rx_desc_array { bus_dmamap_t *ifsd_map; /* bus_dma maps for packet */ struct mbuf **ifsd_m; /* pkthdr mbufs */ caddr_t *ifsd_cl; /* direct cluster pointer for rx */ uint8_t *ifsd_flags; } iflib_rxsd_array_t; typedef struct iflib_sw_tx_desc_array { bus_dmamap_t *ifsd_map; /* bus_dma maps for packet */ struct mbuf **ifsd_m; /* pkthdr mbufs */ uint8_t *ifsd_flags; } if_txsd_vec_t; /* magic number that should be high enough for any hardware */ #define IFLIB_MAX_TX_SEGS 128 /* bnxt supports 64 with hardware LRO enabled */ #define IFLIB_MAX_RX_SEGS 64 #define IFLIB_RX_COPY_THRESH 128 #define IFLIB_MAX_RX_REFRESH 32 /* The minimum descriptors per second before we start coalescing */ #define IFLIB_MIN_DESC_SEC 16384 #define IFLIB_DEFAULT_TX_UPDATE_FREQ 16 #define IFLIB_QUEUE_IDLE 0 #define IFLIB_QUEUE_HUNG 1 #define IFLIB_QUEUE_WORKING 2 /* maximum number of txqs that can share an rx interrupt */ #define IFLIB_MAX_TX_SHARED_INTR 4 /* this should really scale with ring size - this is a fairly arbitrary value */ #define TX_BATCH_SIZE 32 #define IFLIB_RESTART_BUDGET 8 #define CSUM_OFFLOAD (CSUM_IP_TSO|CSUM_IP6_TSO|CSUM_IP| \ CSUM_IP_UDP|CSUM_IP_TCP|CSUM_IP_SCTP| \ CSUM_IP6_UDP|CSUM_IP6_TCP|CSUM_IP6_SCTP) struct iflib_txq { qidx_t ift_in_use; qidx_t ift_cidx; qidx_t ift_cidx_processed; qidx_t ift_pidx; uint8_t ift_gen; uint8_t ift_br_offset; uint16_t ift_npending; uint16_t ift_db_pending; uint16_t ift_rs_pending; /* implicit pad */ uint8_t ift_txd_size[8]; uint64_t ift_processed; uint64_t ift_cleaned; uint64_t ift_cleaned_prev; #if MEMORY_LOGGING uint64_t ift_enqueued; uint64_t ift_dequeued; #endif uint64_t ift_no_tx_dma_setup; uint64_t ift_no_desc_avail; uint64_t ift_mbuf_defrag_failed; uint64_t ift_mbuf_defrag; uint64_t ift_map_failed; uint64_t ift_txd_encap_efbig; uint64_t ift_pullups; struct mtx ift_mtx; struct mtx ift_db_mtx; /* constant values */ if_ctx_t ift_ctx; struct ifmp_ring *ift_br; struct grouptask ift_task; qidx_t ift_size; uint16_t ift_id; struct callout ift_timer; if_txsd_vec_t ift_sds; uint8_t ift_qstatus; uint8_t ift_closed; uint8_t ift_update_freq; struct iflib_filter_info ift_filter_info; bus_dma_tag_t ift_desc_tag; bus_dma_tag_t ift_tso_desc_tag; iflib_dma_info_t ift_ifdi; #define MTX_NAME_LEN 16 char ift_mtx_name[MTX_NAME_LEN]; char ift_db_mtx_name[MTX_NAME_LEN]; bus_dma_segment_t ift_segs[IFLIB_MAX_TX_SEGS] __aligned(CACHE_LINE_SIZE); #ifdef IFLIB_DIAGNOSTICS uint64_t ift_cpu_exec_count[256]; #endif } __aligned(CACHE_LINE_SIZE); struct iflib_fl { qidx_t ifl_cidx; qidx_t ifl_pidx; qidx_t ifl_credits; uint8_t ifl_gen; uint8_t ifl_rxd_size; #if MEMORY_LOGGING uint64_t ifl_m_enqueued; uint64_t ifl_m_dequeued; uint64_t ifl_cl_enqueued; uint64_t ifl_cl_dequeued; #endif /* implicit pad */ bitstr_t *ifl_rx_bitmap; qidx_t ifl_fragidx; /* constant */ qidx_t ifl_size; uint16_t ifl_buf_size; uint16_t ifl_cltype; uma_zone_t ifl_zone; iflib_rxsd_array_t ifl_sds; iflib_rxq_t ifl_rxq; uint8_t ifl_id; bus_dma_tag_t ifl_desc_tag; iflib_dma_info_t ifl_ifdi; uint64_t ifl_bus_addrs[IFLIB_MAX_RX_REFRESH] __aligned(CACHE_LINE_SIZE); caddr_t ifl_vm_addrs[IFLIB_MAX_RX_REFRESH]; qidx_t ifl_rxd_idxs[IFLIB_MAX_RX_REFRESH]; } __aligned(CACHE_LINE_SIZE); static inline qidx_t get_inuse(int size, qidx_t cidx, qidx_t pidx, uint8_t gen) { qidx_t used; if (pidx > cidx) used = pidx - cidx; else if (pidx < cidx) used = size - cidx + pidx; else if (gen == 0 && pidx == cidx) used = 0; else if (gen == 1 && pidx == cidx) used = size; else panic("bad state"); return (used); } #define TXQ_AVAIL(txq) (txq->ift_size - get_inuse(txq->ift_size, txq->ift_cidx, txq->ift_pidx, txq->ift_gen)) #define IDXDIFF(head, tail, wrap) \ ((head) >= (tail) ? (head) - (tail) : (wrap) - (tail) + (head)) struct iflib_rxq { /* If there is a separate completion queue - * these are the cq cidx and pidx. Otherwise * these are unused. */ qidx_t ifr_size; qidx_t ifr_cq_cidx; qidx_t ifr_cq_pidx; uint8_t ifr_cq_gen; uint8_t ifr_fl_offset; if_ctx_t ifr_ctx; iflib_fl_t ifr_fl; uint64_t ifr_rx_irq; uint16_t ifr_id; uint8_t ifr_lro_enabled; uint8_t ifr_nfl; uint8_t ifr_ntxqirq; uint8_t ifr_txqid[IFLIB_MAX_TX_SHARED_INTR]; struct lro_ctrl ifr_lc; struct grouptask ifr_task; struct iflib_filter_info ifr_filter_info; iflib_dma_info_t ifr_ifdi; /* dynamically allocate if any drivers need a value substantially larger than this */ struct if_rxd_frag ifr_frags[IFLIB_MAX_RX_SEGS] __aligned(CACHE_LINE_SIZE); #ifdef IFLIB_DIAGNOSTICS uint64_t ifr_cpu_exec_count[256]; #endif } __aligned(CACHE_LINE_SIZE); typedef struct if_rxsd { caddr_t *ifsd_cl; struct mbuf **ifsd_m; iflib_fl_t ifsd_fl; qidx_t ifsd_cidx; } *if_rxsd_t; /* multiple of word size */ #ifdef __LP64__ #define PKT_INFO_SIZE 6 #define RXD_INFO_SIZE 5 #define PKT_TYPE uint64_t #else #define PKT_INFO_SIZE 11 #define RXD_INFO_SIZE 8 #define PKT_TYPE uint32_t #endif #define PKT_LOOP_BOUND ((PKT_INFO_SIZE/3)*3) #define RXD_LOOP_BOUND ((RXD_INFO_SIZE/4)*4) typedef struct if_pkt_info_pad { PKT_TYPE pkt_val[PKT_INFO_SIZE]; } *if_pkt_info_pad_t; typedef struct if_rxd_info_pad { PKT_TYPE rxd_val[RXD_INFO_SIZE]; } *if_rxd_info_pad_t; CTASSERT(sizeof(struct if_pkt_info_pad) == sizeof(struct if_pkt_info)); CTASSERT(sizeof(struct if_rxd_info_pad) == sizeof(struct if_rxd_info)); static inline void pkt_info_zero(if_pkt_info_t pi) { if_pkt_info_pad_t pi_pad; pi_pad = (if_pkt_info_pad_t)pi; pi_pad->pkt_val[0] = 0; pi_pad->pkt_val[1] = 0; pi_pad->pkt_val[2] = 0; pi_pad->pkt_val[3] = 0; pi_pad->pkt_val[4] = 0; pi_pad->pkt_val[5] = 0; #ifndef __LP64__ pi_pad->pkt_val[6] = 0; pi_pad->pkt_val[7] = 0; pi_pad->pkt_val[8] = 0; pi_pad->pkt_val[9] = 0; pi_pad->pkt_val[10] = 0; #endif } static device_method_t iflib_pseudo_methods[] = { DEVMETHOD(device_attach, noop_attach), DEVMETHOD(device_detach, iflib_pseudo_detach), DEVMETHOD_END }; driver_t iflib_pseudodriver = { "iflib_pseudo", iflib_pseudo_methods, sizeof(struct iflib_ctx), }; static inline void rxd_info_zero(if_rxd_info_t ri) { if_rxd_info_pad_t ri_pad; int i; ri_pad = (if_rxd_info_pad_t)ri; for (i = 0; i < RXD_LOOP_BOUND; i += 4) { ri_pad->rxd_val[i] = 0; ri_pad->rxd_val[i+1] = 0; ri_pad->rxd_val[i+2] = 0; ri_pad->rxd_val[i+3] = 0; } #ifdef __LP64__ ri_pad->rxd_val[RXD_INFO_SIZE-1] = 0; #endif } /* * Only allow a single packet to take up most 1/nth of the tx ring */ #define MAX_SINGLE_PACKET_FRACTION 12 #define IF_BAD_DMA (bus_addr_t)-1 #define CTX_ACTIVE(ctx) ((if_getdrvflags((ctx)->ifc_ifp) & IFF_DRV_RUNNING)) #define CTX_LOCK_INIT(_sc) sx_init(&(_sc)->ifc_ctx_sx, "iflib ctx lock") #define CTX_LOCK(ctx) sx_xlock(&(ctx)->ifc_ctx_sx) #define CTX_UNLOCK(ctx) sx_xunlock(&(ctx)->ifc_ctx_sx) #define CTX_LOCK_DESTROY(ctx) sx_destroy(&(ctx)->ifc_ctx_sx) #define STATE_LOCK_INIT(_sc, _name) mtx_init(&(_sc)->ifc_state_mtx, _name, "iflib state lock", MTX_DEF) #define STATE_LOCK(ctx) mtx_lock(&(ctx)->ifc_state_mtx) #define STATE_UNLOCK(ctx) mtx_unlock(&(ctx)->ifc_state_mtx) #define STATE_LOCK_DESTROY(ctx) mtx_destroy(&(ctx)->ifc_state_mtx) #define CALLOUT_LOCK(txq) mtx_lock(&txq->ift_mtx) #define CALLOUT_UNLOCK(txq) mtx_unlock(&txq->ift_mtx) /* Our boot-time initialization hook */ static int iflib_module_event_handler(module_t, int, void *); static moduledata_t iflib_moduledata = { "iflib", iflib_module_event_handler, NULL }; DECLARE_MODULE(iflib, iflib_moduledata, SI_SUB_INIT_IF, SI_ORDER_ANY); MODULE_VERSION(iflib, 1); MODULE_DEPEND(iflib, pci, 1, 1, 1); MODULE_DEPEND(iflib, ether, 1, 1, 1); TASKQGROUP_DEFINE(if_io_tqg, mp_ncpus, 1); TASKQGROUP_DEFINE(if_config_tqg, 1, 1); #ifndef IFLIB_DEBUG_COUNTERS #ifdef INVARIANTS #define IFLIB_DEBUG_COUNTERS 1 #else #define IFLIB_DEBUG_COUNTERS 0 #endif /* !INVARIANTS */ #endif static SYSCTL_NODE(_net, OID_AUTO, iflib, CTLFLAG_RD, 0, "iflib driver parameters"); /* * XXX need to ensure that this can't accidentally cause the head to be moved backwards */ static int iflib_min_tx_latency = 0; SYSCTL_INT(_net_iflib, OID_AUTO, min_tx_latency, CTLFLAG_RW, &iflib_min_tx_latency, 0, "minimize transmit latency at the possible expense of throughput"); static int iflib_no_tx_batch = 0; SYSCTL_INT(_net_iflib, OID_AUTO, no_tx_batch, CTLFLAG_RW, &iflib_no_tx_batch, 0, "minimize transmit latency at the possible expense of throughput"); #if IFLIB_DEBUG_COUNTERS static int iflib_tx_seen; static int iflib_tx_sent; static int iflib_tx_encap; static int iflib_rx_allocs; static int iflib_fl_refills; static int iflib_fl_refills_large; static int iflib_tx_frees; SYSCTL_INT(_net_iflib, OID_AUTO, tx_seen, CTLFLAG_RD, &iflib_tx_seen, 0, "# tx mbufs seen"); SYSCTL_INT(_net_iflib, OID_AUTO, tx_sent, CTLFLAG_RD, &iflib_tx_sent, 0, "# tx mbufs sent"); SYSCTL_INT(_net_iflib, OID_AUTO, tx_encap, CTLFLAG_RD, &iflib_tx_encap, 0, "# tx mbufs encapped"); SYSCTL_INT(_net_iflib, OID_AUTO, tx_frees, CTLFLAG_RD, &iflib_tx_frees, 0, "# tx frees"); SYSCTL_INT(_net_iflib, OID_AUTO, rx_allocs, CTLFLAG_RD, &iflib_rx_allocs, 0, "# rx allocations"); SYSCTL_INT(_net_iflib, OID_AUTO, fl_refills, CTLFLAG_RD, &iflib_fl_refills, 0, "# refills"); SYSCTL_INT(_net_iflib, OID_AUTO, fl_refills_large, CTLFLAG_RD, &iflib_fl_refills_large, 0, "# large refills"); static int iflib_txq_drain_flushing; static int iflib_txq_drain_oactive; static int iflib_txq_drain_notready; static int iflib_txq_drain_encapfail; SYSCTL_INT(_net_iflib, OID_AUTO, txq_drain_flushing, CTLFLAG_RD, &iflib_txq_drain_flushing, 0, "# drain flushes"); SYSCTL_INT(_net_iflib, OID_AUTO, txq_drain_oactive, CTLFLAG_RD, &iflib_txq_drain_oactive, 0, "# drain oactives"); SYSCTL_INT(_net_iflib, OID_AUTO, txq_drain_notready, CTLFLAG_RD, &iflib_txq_drain_notready, 0, "# drain notready"); SYSCTL_INT(_net_iflib, OID_AUTO, txq_drain_encapfail, CTLFLAG_RD, &iflib_txq_drain_encapfail, 0, "# drain encap fails"); static int iflib_encap_load_mbuf_fail; static int iflib_encap_pad_mbuf_fail; static int iflib_encap_txq_avail_fail; static int iflib_encap_txd_encap_fail; SYSCTL_INT(_net_iflib, OID_AUTO, encap_load_mbuf_fail, CTLFLAG_RD, &iflib_encap_load_mbuf_fail, 0, "# busdma load failures"); SYSCTL_INT(_net_iflib, OID_AUTO, encap_pad_mbuf_fail, CTLFLAG_RD, &iflib_encap_pad_mbuf_fail, 0, "# runt frame pad failures"); SYSCTL_INT(_net_iflib, OID_AUTO, encap_txq_avail_fail, CTLFLAG_RD, &iflib_encap_txq_avail_fail, 0, "# txq avail failures"); SYSCTL_INT(_net_iflib, OID_AUTO, encap_txd_encap_fail, CTLFLAG_RD, &iflib_encap_txd_encap_fail, 0, "# driver encap failures"); static int iflib_task_fn_rxs; static int iflib_rx_intr_enables; static int iflib_fast_intrs; static int iflib_intr_link; static int iflib_intr_msix; static int iflib_rx_unavail; static int iflib_rx_ctx_inactive; static int iflib_rx_zero_len; static int iflib_rx_if_input; static int iflib_rx_mbuf_null; static int iflib_rxd_flush; static int iflib_verbose_debug; SYSCTL_INT(_net_iflib, OID_AUTO, intr_link, CTLFLAG_RD, &iflib_intr_link, 0, "# intr link calls"); SYSCTL_INT(_net_iflib, OID_AUTO, intr_msix, CTLFLAG_RD, &iflib_intr_msix, 0, "# intr msix calls"); SYSCTL_INT(_net_iflib, OID_AUTO, task_fn_rx, CTLFLAG_RD, &iflib_task_fn_rxs, 0, "# task_fn_rx calls"); SYSCTL_INT(_net_iflib, OID_AUTO, rx_intr_enables, CTLFLAG_RD, &iflib_rx_intr_enables, 0, "# rx intr enables"); SYSCTL_INT(_net_iflib, OID_AUTO, fast_intrs, CTLFLAG_RD, &iflib_fast_intrs, 0, "# fast_intr calls"); SYSCTL_INT(_net_iflib, OID_AUTO, rx_unavail, CTLFLAG_RD, &iflib_rx_unavail, 0, "# times rxeof called with no available data"); SYSCTL_INT(_net_iflib, OID_AUTO, rx_ctx_inactive, CTLFLAG_RD, &iflib_rx_ctx_inactive, 0, "# times rxeof called with inactive context"); SYSCTL_INT(_net_iflib, OID_AUTO, rx_zero_len, CTLFLAG_RD, &iflib_rx_zero_len, 0, "# times rxeof saw zero len mbuf"); SYSCTL_INT(_net_iflib, OID_AUTO, rx_if_input, CTLFLAG_RD, &iflib_rx_if_input, 0, "# times rxeof called if_input"); SYSCTL_INT(_net_iflib, OID_AUTO, rx_mbuf_null, CTLFLAG_RD, &iflib_rx_mbuf_null, 0, "# times rxeof got null mbuf"); SYSCTL_INT(_net_iflib, OID_AUTO, rxd_flush, CTLFLAG_RD, &iflib_rxd_flush, 0, "# times rxd_flush called"); SYSCTL_INT(_net_iflib, OID_AUTO, verbose_debug, CTLFLAG_RW, &iflib_verbose_debug, 0, "enable verbose debugging"); #define DBG_COUNTER_INC(name) atomic_add_int(&(iflib_ ## name), 1) static void iflib_debug_reset(void) { iflib_tx_seen = iflib_tx_sent = iflib_tx_encap = iflib_rx_allocs = iflib_fl_refills = iflib_fl_refills_large = iflib_tx_frees = iflib_txq_drain_flushing = iflib_txq_drain_oactive = iflib_txq_drain_notready = iflib_txq_drain_encapfail = iflib_encap_load_mbuf_fail = iflib_encap_pad_mbuf_fail = iflib_encap_txq_avail_fail = iflib_encap_txd_encap_fail = iflib_task_fn_rxs = iflib_rx_intr_enables = iflib_fast_intrs = iflib_intr_link = iflib_intr_msix = iflib_rx_unavail = iflib_rx_ctx_inactive = iflib_rx_zero_len = iflib_rx_if_input = iflib_rx_mbuf_null = iflib_rxd_flush = 0; } #else #define DBG_COUNTER_INC(name) static void iflib_debug_reset(void) {} #endif #define IFLIB_DEBUG 0 static void iflib_tx_structures_free(if_ctx_t ctx); static void iflib_rx_structures_free(if_ctx_t ctx); static int iflib_queues_alloc(if_ctx_t ctx); static int iflib_tx_credits_update(if_ctx_t ctx, iflib_txq_t txq); static int iflib_rxd_avail(if_ctx_t ctx, iflib_rxq_t rxq, qidx_t cidx, qidx_t budget); static int iflib_qset_structures_setup(if_ctx_t ctx); static int iflib_msix_init(if_ctx_t ctx); static int iflib_legacy_setup(if_ctx_t ctx, driver_filter_t filter, void *filterarg, int *rid, char *str); static void iflib_txq_check_drain(iflib_txq_t txq, int budget); static uint32_t iflib_txq_can_drain(struct ifmp_ring *); static int iflib_register(if_ctx_t); static void iflib_init_locked(if_ctx_t ctx); static void iflib_add_device_sysctl_pre(if_ctx_t ctx); static void iflib_add_device_sysctl_post(if_ctx_t ctx); static void iflib_ifmp_purge(iflib_txq_t txq); static void _iflib_pre_assert(if_softc_ctx_t scctx); static void iflib_if_init_locked(if_ctx_t ctx); #ifndef __NO_STRICT_ALIGNMENT static struct mbuf * iflib_fixup_rx(struct mbuf *m); #endif NETDUMP_DEFINE(iflib); #ifdef DEV_NETMAP #include #include #include MODULE_DEPEND(iflib, netmap, 1, 1, 1); static int netmap_fl_refill(iflib_rxq_t rxq, struct netmap_kring *kring, uint32_t nm_i, bool init); /* * device-specific sysctl variables: * * iflib_crcstrip: 0: keep CRC in rx frames (default), 1: strip it. * During regular operations the CRC is stripped, but on some * hardware reception of frames not multiple of 64 is slower, * so using crcstrip=0 helps in benchmarks. * * iflib_rx_miss, iflib_rx_miss_bufs: * count packets that might be missed due to lost interrupts. */ SYSCTL_DECL(_dev_netmap); /* * The xl driver by default strips CRCs and we do not override it. */ int iflib_crcstrip = 1; SYSCTL_INT(_dev_netmap, OID_AUTO, iflib_crcstrip, CTLFLAG_RW, &iflib_crcstrip, 1, "strip CRC on rx frames"); int iflib_rx_miss, iflib_rx_miss_bufs; SYSCTL_INT(_dev_netmap, OID_AUTO, iflib_rx_miss, CTLFLAG_RW, &iflib_rx_miss, 0, "potentially missed rx intr"); SYSCTL_INT(_dev_netmap, OID_AUTO, iflib_rx_miss_bufs, CTLFLAG_RW, &iflib_rx_miss_bufs, 0, "potentially missed rx intr bufs"); /* * Register/unregister. We are already under netmap lock. * Only called on the first register or the last unregister. */ static int iflib_netmap_register(struct netmap_adapter *na, int onoff) { struct ifnet *ifp = na->ifp; if_ctx_t ctx = ifp->if_softc; int status; CTX_LOCK(ctx); IFDI_INTR_DISABLE(ctx); /* Tell the stack that the interface is no longer active */ ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE); if (!CTX_IS_VF(ctx)) IFDI_CRCSTRIP_SET(ctx, onoff, iflib_crcstrip); /* enable or disable flags and callbacks in na and ifp */ if (onoff) { nm_set_native_flags(na); } else { nm_clear_native_flags(na); } iflib_stop(ctx); iflib_init_locked(ctx); IFDI_CRCSTRIP_SET(ctx, onoff, iflib_crcstrip); // XXX why twice ? status = ifp->if_drv_flags & IFF_DRV_RUNNING ? 0 : 1; if (status) nm_clear_native_flags(na); CTX_UNLOCK(ctx); return (status); } static int netmap_fl_refill(iflib_rxq_t rxq, struct netmap_kring *kring, uint32_t nm_i, bool init) { struct netmap_adapter *na = kring->na; u_int const lim = kring->nkr_num_slots - 1; u_int head = kring->rhead; struct netmap_ring *ring = kring->ring; bus_dmamap_t *map; struct if_rxd_update iru; if_ctx_t ctx = rxq->ifr_ctx; iflib_fl_t fl = &rxq->ifr_fl[0]; uint32_t refill_pidx, nic_i; if (nm_i == head && __predict_true(!init)) return 0; iru_init(&iru, rxq, 0 /* flid */); map = fl->ifl_sds.ifsd_map; refill_pidx = netmap_idx_k2n(kring, nm_i); /* * IMPORTANT: we must leave one free slot in the ring, * so move head back by one unit */ head = nm_prev(head, lim); nic_i = UINT_MAX; while (nm_i != head) { for (int tmp_pidx = 0; tmp_pidx < IFLIB_MAX_RX_REFRESH && nm_i != head; tmp_pidx++) { struct netmap_slot *slot = &ring->slot[nm_i]; void *addr = PNMB(na, slot, &fl->ifl_bus_addrs[tmp_pidx]); uint32_t nic_i_dma = refill_pidx; nic_i = netmap_idx_k2n(kring, nm_i); MPASS(tmp_pidx < IFLIB_MAX_RX_REFRESH); if (addr == NETMAP_BUF_BASE(na)) /* bad buf */ return netmap_ring_reinit(kring); fl->ifl_vm_addrs[tmp_pidx] = addr; if (__predict_false(init) && map) { netmap_load_map(na, fl->ifl_ifdi->idi_tag, map[nic_i], addr); } else if (map && (slot->flags & NS_BUF_CHANGED)) { /* buffer has changed, reload map */ netmap_reload_map(na, fl->ifl_ifdi->idi_tag, map[nic_i], addr); } slot->flags &= ~NS_BUF_CHANGED; nm_i = nm_next(nm_i, lim); fl->ifl_rxd_idxs[tmp_pidx] = nic_i = nm_next(nic_i, lim); if (nm_i != head && tmp_pidx < IFLIB_MAX_RX_REFRESH-1) continue; iru.iru_pidx = refill_pidx; iru.iru_count = tmp_pidx+1; ctx->isc_rxd_refill(ctx->ifc_softc, &iru); refill_pidx = nic_i; if (map == NULL) continue; for (int n = 0; n < iru.iru_count; n++) { bus_dmamap_sync(fl->ifl_ifdi->idi_tag, map[nic_i_dma], BUS_DMASYNC_PREREAD); /* XXX - change this to not use the netmap func*/ nic_i_dma = nm_next(nic_i_dma, lim); } } } kring->nr_hwcur = head; if (map) bus_dmamap_sync(fl->ifl_ifdi->idi_tag, fl->ifl_ifdi->idi_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); if (__predict_true(nic_i != UINT_MAX)) ctx->isc_rxd_flush(ctx->ifc_softc, rxq->ifr_id, fl->ifl_id, nic_i); return (0); } /* * Reconcile kernel and user view of the transmit ring. * * All information is in the kring. * Userspace wants to send packets up to the one before kring->rhead, * kernel knows kring->nr_hwcur is the first unsent packet. * * Here we push packets out (as many as possible), and possibly * reclaim buffers from previously completed transmission. * * The caller (netmap) guarantees that there is only one instance * running at any time. Any interference with other driver * methods should be handled by the individual drivers. */ static int iflib_netmap_txsync(struct netmap_kring *kring, int flags) { struct netmap_adapter *na = kring->na; struct ifnet *ifp = na->ifp; struct netmap_ring *ring = kring->ring; u_int nm_i; /* index into the netmap ring */ u_int nic_i; /* index into the NIC ring */ u_int n; u_int const lim = kring->nkr_num_slots - 1; u_int const head = kring->rhead; struct if_pkt_info pi; /* * interrupts on every tx packet are expensive so request * them every half ring, or where NS_REPORT is set */ u_int report_frequency = kring->nkr_num_slots >> 1; /* device-specific */ if_ctx_t ctx = ifp->if_softc; iflib_txq_t txq = &ctx->ifc_txqs[kring->ring_id]; if (txq->ift_sds.ifsd_map) bus_dmamap_sync(txq->ift_desc_tag, txq->ift_ifdi->idi_map, BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); /* * First part: process new packets to send. * nm_i is the current index in the netmap ring, * nic_i is the corresponding index in the NIC ring. * * If we have packets to send (nm_i != head) * iterate over the netmap ring, fetch length and update * the corresponding slot in the NIC ring. Some drivers also * need to update the buffer's physical address in the NIC slot * even NS_BUF_CHANGED is not set (PNMB computes the addresses). * * The netmap_reload_map() calls is especially expensive, * even when (as in this case) the tag is 0, so do only * when the buffer has actually changed. * * If possible do not set the report/intr bit on all slots, * but only a few times per ring or when NS_REPORT is set. * * Finally, on 10G and faster drivers, it might be useful * to prefetch the next slot and txr entry. */ nm_i = netmap_idx_n2k(kring, kring->nr_hwcur); if (nm_i != head) { /* we have new packets to send */ pkt_info_zero(&pi); pi.ipi_segs = txq->ift_segs; pi.ipi_qsidx = kring->ring_id; nic_i = netmap_idx_k2n(kring, nm_i); __builtin_prefetch(&ring->slot[nm_i]); __builtin_prefetch(&txq->ift_sds.ifsd_m[nic_i]); if (txq->ift_sds.ifsd_map) __builtin_prefetch(&txq->ift_sds.ifsd_map[nic_i]); for (n = 0; nm_i != head; n++) { struct netmap_slot *slot = &ring->slot[nm_i]; u_int len = slot->len; uint64_t paddr; void *addr = PNMB(na, slot, &paddr); int flags = (slot->flags & NS_REPORT || nic_i == 0 || nic_i == report_frequency) ? IPI_TX_INTR : 0; /* device-specific */ pi.ipi_len = len; pi.ipi_segs[0].ds_addr = paddr; pi.ipi_segs[0].ds_len = len; pi.ipi_nsegs = 1; pi.ipi_ndescs = 0; pi.ipi_pidx = nic_i; pi.ipi_flags = flags; /* Fill the slot in the NIC ring. */ ctx->isc_txd_encap(ctx->ifc_softc, &pi); /* prefetch for next round */ __builtin_prefetch(&ring->slot[nm_i + 1]); __builtin_prefetch(&txq->ift_sds.ifsd_m[nic_i + 1]); if (txq->ift_sds.ifsd_map) { __builtin_prefetch(&txq->ift_sds.ifsd_map[nic_i + 1]); NM_CHECK_ADDR_LEN(na, addr, len); if (slot->flags & NS_BUF_CHANGED) { /* buffer has changed, reload map */ netmap_reload_map(na, txq->ift_desc_tag, txq->ift_sds.ifsd_map[nic_i], addr); } /* make sure changes to the buffer are synced */ bus_dmamap_sync(txq->ift_ifdi->idi_tag, txq->ift_sds.ifsd_map[nic_i], BUS_DMASYNC_PREWRITE); } slot->flags &= ~(NS_REPORT | NS_BUF_CHANGED); nm_i = nm_next(nm_i, lim); nic_i = nm_next(nic_i, lim); } kring->nr_hwcur = head; /* synchronize the NIC ring */ if (txq->ift_sds.ifsd_map) bus_dmamap_sync(txq->ift_desc_tag, txq->ift_ifdi->idi_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); /* (re)start the tx unit up to slot nic_i (excluded) */ ctx->isc_txd_flush(ctx->ifc_softc, txq->ift_id, nic_i); } /* * Second part: reclaim buffers for completed transmissions. * * If there are unclaimed buffers, attempt to reclaim them. * If none are reclaimed, and TX IRQs are not in use, do an initial * minimal delay, then trigger the tx handler which will spin in the * group task queue. */ if (kring->nr_hwtail != nm_prev(head, lim)) { if (iflib_tx_credits_update(ctx, txq)) { /* some tx completed, increment avail */ nic_i = txq->ift_cidx_processed; kring->nr_hwtail = nm_prev(netmap_idx_n2k(kring, nic_i), lim); } else { if (!(ctx->ifc_flags & IFC_NETMAP_TX_IRQ)) { DELAY(1); GROUPTASK_ENQUEUE(&ctx->ifc_txqs[txq->ift_id].ift_task); } } } return (0); } /* * Reconcile kernel and user view of the receive ring. * Same as for the txsync, this routine must be efficient. * The caller guarantees a single invocations, but races against * the rest of the driver should be handled here. * * On call, kring->rhead is the first packet that userspace wants * to keep, and kring->rcur is the wakeup point. * The kernel has previously reported packets up to kring->rtail. * * If (flags & NAF_FORCE_READ) also check for incoming packets irrespective * of whether or not we received an interrupt. */ static int iflib_netmap_rxsync(struct netmap_kring *kring, int flags) { struct netmap_adapter *na = kring->na; struct netmap_ring *ring = kring->ring; uint32_t nm_i; /* index into the netmap ring */ uint32_t nic_i; /* index into the NIC ring */ u_int i, n; u_int const lim = kring->nkr_num_slots - 1; u_int const head = netmap_idx_n2k(kring, kring->rhead); int force_update = (flags & NAF_FORCE_READ) || kring->nr_kflags & NKR_PENDINTR; struct if_rxd_info ri; struct ifnet *ifp = na->ifp; if_ctx_t ctx = ifp->if_softc; iflib_rxq_t rxq = &ctx->ifc_rxqs[kring->ring_id]; iflib_fl_t fl = rxq->ifr_fl; if (head > lim) return netmap_ring_reinit(kring); /* XXX check sync modes */ for (i = 0, fl = rxq->ifr_fl; i < rxq->ifr_nfl; i++, fl++) { if (fl->ifl_sds.ifsd_map == NULL) continue; bus_dmamap_sync(rxq->ifr_fl[i].ifl_desc_tag, fl->ifl_ifdi->idi_map, BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); } /* * First part: import newly received packets. * * nm_i is the index of the next free slot in the netmap ring, * nic_i is the index of the next received packet in the NIC ring, * and they may differ in case if_init() has been called while * in netmap mode. For the receive ring we have * * nic_i = rxr->next_check; * nm_i = kring->nr_hwtail (previous) * and * nm_i == (nic_i + kring->nkr_hwofs) % ring_size * * rxr->next_check is set to 0 on a ring reinit */ if (netmap_no_pendintr || force_update) { int crclen = iflib_crcstrip ? 0 : 4; int error, avail; for (i = 0; i < rxq->ifr_nfl; i++) { fl = &rxq->ifr_fl[i]; nic_i = fl->ifl_cidx; nm_i = netmap_idx_n2k(kring, nic_i); avail = iflib_rxd_avail(ctx, rxq, nic_i, USHRT_MAX); for (n = 0; avail > 0; n++, avail--) { rxd_info_zero(&ri); ri.iri_frags = rxq->ifr_frags; ri.iri_qsidx = kring->ring_id; ri.iri_ifp = ctx->ifc_ifp; ri.iri_cidx = nic_i; error = ctx->isc_rxd_pkt_get(ctx->ifc_softc, &ri); ring->slot[nm_i].len = error ? 0 : ri.iri_len - crclen; ring->slot[nm_i].flags = 0; if (fl->ifl_sds.ifsd_map) bus_dmamap_sync(fl->ifl_ifdi->idi_tag, fl->ifl_sds.ifsd_map[nic_i], BUS_DMASYNC_POSTREAD); nm_i = nm_next(nm_i, lim); nic_i = nm_next(nic_i, lim); } if (n) { /* update the state variables */ if (netmap_no_pendintr && !force_update) { /* diagnostics */ iflib_rx_miss ++; iflib_rx_miss_bufs += n; } fl->ifl_cidx = nic_i; kring->nr_hwtail = netmap_idx_k2n(kring, nm_i); } kring->nr_kflags &= ~NKR_PENDINTR; } } /* * Second part: skip past packets that userspace has released. * (kring->nr_hwcur to head excluded), * and make the buffers available for reception. * As usual nm_i is the index in the netmap ring, * nic_i is the index in the NIC ring, and * nm_i == (nic_i + kring->nkr_hwofs) % ring_size */ /* XXX not sure how this will work with multiple free lists */ nm_i = netmap_idx_n2k(kring, kring->nr_hwcur); return (netmap_fl_refill(rxq, kring, nm_i, false)); } static void iflib_netmap_intr(struct netmap_adapter *na, int onoff) { struct ifnet *ifp = na->ifp; if_ctx_t ctx = ifp->if_softc; CTX_LOCK(ctx); if (onoff) { IFDI_INTR_ENABLE(ctx); } else { IFDI_INTR_DISABLE(ctx); } CTX_UNLOCK(ctx); } static int iflib_netmap_attach(if_ctx_t ctx) { struct netmap_adapter na; if_softc_ctx_t scctx = &ctx->ifc_softc_ctx; bzero(&na, sizeof(na)); na.ifp = ctx->ifc_ifp; na.na_flags = NAF_BDG_MAYSLEEP; MPASS(ctx->ifc_softc_ctx.isc_ntxqsets); MPASS(ctx->ifc_softc_ctx.isc_nrxqsets); na.num_tx_desc = scctx->isc_ntxd[0]; na.num_rx_desc = scctx->isc_nrxd[0]; na.nm_txsync = iflib_netmap_txsync; na.nm_rxsync = iflib_netmap_rxsync; na.nm_register = iflib_netmap_register; na.nm_intr = iflib_netmap_intr; na.num_tx_rings = ctx->ifc_softc_ctx.isc_ntxqsets; na.num_rx_rings = ctx->ifc_softc_ctx.isc_nrxqsets; return (netmap_attach(&na)); } static void iflib_netmap_txq_init(if_ctx_t ctx, iflib_txq_t txq) { struct netmap_adapter *na = NA(ctx->ifc_ifp); struct netmap_slot *slot; slot = netmap_reset(na, NR_TX, txq->ift_id, 0); if (slot == NULL) return; if (txq->ift_sds.ifsd_map == NULL) return; for (int i = 0; i < ctx->ifc_softc_ctx.isc_ntxd[0]; i++) { /* * In netmap mode, set the map for the packet buffer. * NOTE: Some drivers (not this one) also need to set * the physical buffer address in the NIC ring. * netmap_idx_n2k() maps a nic index, i, into the corresponding * netmap slot index, si */ int si = netmap_idx_n2k(na->tx_rings[txq->ift_id], i); netmap_load_map(na, txq->ift_desc_tag, txq->ift_sds.ifsd_map[i], NMB(na, slot + si)); } } static void iflib_netmap_rxq_init(if_ctx_t ctx, iflib_rxq_t rxq) { struct netmap_adapter *na = NA(ctx->ifc_ifp); struct netmap_kring *kring = na->rx_rings[rxq->ifr_id]; struct netmap_slot *slot; uint32_t nm_i; slot = netmap_reset(na, NR_RX, rxq->ifr_id, 0); if (slot == NULL) return; nm_i = netmap_idx_n2k(kring, 0); netmap_fl_refill(rxq, kring, nm_i, true); } #define iflib_netmap_detach(ifp) netmap_detach(ifp) #else #define iflib_netmap_txq_init(ctx, txq) #define iflib_netmap_rxq_init(ctx, rxq) #define iflib_netmap_detach(ifp) #define iflib_netmap_attach(ctx) (0) #define netmap_rx_irq(ifp, qid, budget) (0) #define netmap_tx_irq(ifp, qid) do {} while (0) #endif #if defined(__i386__) || defined(__amd64__) static __inline void prefetch(void *x) { __asm volatile("prefetcht0 %0" :: "m" (*(unsigned long *)x)); } static __inline void prefetch2cachelines(void *x) { __asm volatile("prefetcht0 %0" :: "m" (*(unsigned long *)x)); #if (CACHE_LINE_SIZE < 128) __asm volatile("prefetcht0 %0" :: "m" (*(((unsigned long *)x)+CACHE_LINE_SIZE/(sizeof(unsigned long))))); #endif } #else #define prefetch(x) #define prefetch2cachelines(x) #endif static void iflib_gen_mac(if_ctx_t ctx) { struct thread *td; - struct ifnet *ifp; MD5_CTX mdctx; char uuid[HOSTUUIDLEN+1]; char buf[HOSTUUIDLEN+16]; uint8_t *mac; unsigned char digest[16]; td = curthread; - ifp = ctx->ifc_ifp; mac = ctx->ifc_mac; uuid[HOSTUUIDLEN] = 0; bcopy(td->td_ucred->cr_prison->pr_hostuuid, uuid, HOSTUUIDLEN); snprintf(buf, HOSTUUIDLEN+16, "%s-%s", uuid, device_get_nameunit(ctx->ifc_dev)); /* * Generate a pseudo-random, deterministic MAC * address based on the UUID and unit number. * The FreeBSD Foundation OUI of 58-9C-FC is used. */ MD5Init(&mdctx); MD5Update(&mdctx, buf, strlen(buf)); MD5Final(digest, &mdctx); mac[0] = 0x58; mac[1] = 0x9C; mac[2] = 0xFC; mac[3] = digest[0]; mac[4] = digest[1]; mac[5] = digest[2]; } static void iru_init(if_rxd_update_t iru, iflib_rxq_t rxq, uint8_t flid) { iflib_fl_t fl; fl = &rxq->ifr_fl[flid]; iru->iru_paddrs = fl->ifl_bus_addrs; iru->iru_vaddrs = &fl->ifl_vm_addrs[0]; iru->iru_idxs = fl->ifl_rxd_idxs; iru->iru_qsidx = rxq->ifr_id; iru->iru_buf_size = fl->ifl_buf_size; iru->iru_flidx = fl->ifl_id; } static void _iflib_dmamap_cb(void *arg, bus_dma_segment_t *segs, int nseg, int err) { if (err) return; *(bus_addr_t *) arg = segs[0].ds_addr; } int iflib_dma_alloc(if_ctx_t ctx, int size, iflib_dma_info_t dma, int mapflags) { int err; if_shared_ctx_t sctx = ctx->ifc_sctx; device_t dev = ctx->ifc_dev; KASSERT(sctx->isc_q_align != 0, ("alignment value not initialized")); err = bus_dma_tag_create(bus_get_dma_tag(dev), /* parent */ sctx->isc_q_align, 0, /* alignment, bounds */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ size, /* maxsize */ 1, /* nsegments */ size, /* maxsegsize */ BUS_DMA_ALLOCNOW, /* flags */ NULL, /* lockfunc */ NULL, /* lockarg */ &dma->idi_tag); if (err) { device_printf(dev, "%s: bus_dma_tag_create failed: %d\n", __func__, err); goto fail_0; } err = bus_dmamem_alloc(dma->idi_tag, (void**) &dma->idi_vaddr, BUS_DMA_NOWAIT | BUS_DMA_COHERENT | BUS_DMA_ZERO, &dma->idi_map); if (err) { device_printf(dev, "%s: bus_dmamem_alloc(%ju) failed: %d\n", __func__, (uintmax_t)size, err); goto fail_1; } dma->idi_paddr = IF_BAD_DMA; err = bus_dmamap_load(dma->idi_tag, dma->idi_map, dma->idi_vaddr, size, _iflib_dmamap_cb, &dma->idi_paddr, mapflags | BUS_DMA_NOWAIT); if (err || dma->idi_paddr == IF_BAD_DMA) { device_printf(dev, "%s: bus_dmamap_load failed: %d\n", __func__, err); goto fail_2; } dma->idi_size = size; return (0); fail_2: bus_dmamem_free(dma->idi_tag, dma->idi_vaddr, dma->idi_map); fail_1: bus_dma_tag_destroy(dma->idi_tag); fail_0: dma->idi_tag = NULL; return (err); } int iflib_dma_alloc_multi(if_ctx_t ctx, int *sizes, iflib_dma_info_t *dmalist, int mapflags, int count) { int i, err; iflib_dma_info_t *dmaiter; dmaiter = dmalist; for (i = 0; i < count; i++, dmaiter++) { if ((err = iflib_dma_alloc(ctx, sizes[i], *dmaiter, mapflags)) != 0) break; } if (err) iflib_dma_free_multi(dmalist, i); return (err); } void iflib_dma_free(iflib_dma_info_t dma) { if (dma->idi_tag == NULL) return; if (dma->idi_paddr != IF_BAD_DMA) { bus_dmamap_sync(dma->idi_tag, dma->idi_map, BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(dma->idi_tag, dma->idi_map); dma->idi_paddr = IF_BAD_DMA; } if (dma->idi_vaddr != NULL) { bus_dmamem_free(dma->idi_tag, dma->idi_vaddr, dma->idi_map); dma->idi_vaddr = NULL; } bus_dma_tag_destroy(dma->idi_tag); dma->idi_tag = NULL; } void iflib_dma_free_multi(iflib_dma_info_t *dmalist, int count) { int i; iflib_dma_info_t *dmaiter = dmalist; for (i = 0; i < count; i++, dmaiter++) iflib_dma_free(*dmaiter); } #ifdef EARLY_AP_STARTUP static const int iflib_started = 1; #else /* * We used to abuse the smp_started flag to decide if the queues have been * fully initialized (by late taskqgroup_adjust() calls in a SYSINIT()). * That gave bad races, since the SYSINIT() runs strictly after smp_started * is set. Run a SYSINIT() strictly after that to just set a usable * completion flag. */ static int iflib_started; static void iflib_record_started(void *arg) { iflib_started = 1; } SYSINIT(iflib_record_started, SI_SUB_SMP + 1, SI_ORDER_FIRST, iflib_record_started, NULL); #endif static int iflib_fast_intr(void *arg) { iflib_filter_info_t info = arg; struct grouptask *gtask = info->ifi_task; if (!iflib_started) return (FILTER_HANDLED); DBG_COUNTER_INC(fast_intrs); if (info->ifi_filter != NULL && info->ifi_filter(info->ifi_filter_arg) == FILTER_HANDLED) return (FILTER_HANDLED); GROUPTASK_ENQUEUE(gtask); return (FILTER_HANDLED); } static int iflib_fast_intr_rxtx(void *arg) { iflib_filter_info_t info = arg; struct grouptask *gtask = info->ifi_task; iflib_rxq_t rxq = (iflib_rxq_t)info->ifi_ctx; if_ctx_t ctx = NULL;; int i, cidx; if (!iflib_started) return (FILTER_HANDLED); DBG_COUNTER_INC(fast_intrs); if (info->ifi_filter != NULL && info->ifi_filter(info->ifi_filter_arg) == FILTER_HANDLED) return (FILTER_HANDLED); MPASS(rxq->ifr_ntxqirq); for (i = 0; i < rxq->ifr_ntxqirq; i++) { qidx_t txqid = rxq->ifr_txqid[i]; ctx = rxq->ifr_ctx; if (!ctx->isc_txd_credits_update(ctx->ifc_softc, txqid, false)) { IFDI_TX_QUEUE_INTR_ENABLE(ctx, txqid); continue; } GROUPTASK_ENQUEUE(&ctx->ifc_txqs[txqid].ift_task); } if (ctx->ifc_sctx->isc_flags & IFLIB_HAS_RXCQ) cidx = rxq->ifr_cq_cidx; else cidx = rxq->ifr_fl[0].ifl_cidx; if (iflib_rxd_avail(ctx, rxq, cidx, 1)) GROUPTASK_ENQUEUE(gtask); else IFDI_RX_QUEUE_INTR_ENABLE(ctx, rxq->ifr_id); return (FILTER_HANDLED); } static int iflib_fast_intr_ctx(void *arg) { iflib_filter_info_t info = arg; struct grouptask *gtask = info->ifi_task; if (!iflib_started) return (FILTER_HANDLED); DBG_COUNTER_INC(fast_intrs); if (info->ifi_filter != NULL && info->ifi_filter(info->ifi_filter_arg) == FILTER_HANDLED) return (FILTER_HANDLED); GROUPTASK_ENQUEUE(gtask); return (FILTER_HANDLED); } static int _iflib_irq_alloc(if_ctx_t ctx, if_irq_t irq, int rid, driver_filter_t filter, driver_intr_t handler, void *arg, char *name) { int rc, flags; struct resource *res; void *tag = NULL; device_t dev = ctx->ifc_dev; flags = RF_ACTIVE; if (ctx->ifc_flags & IFC_LEGACY) flags |= RF_SHAREABLE; MPASS(rid < 512); irq->ii_rid = rid; res = bus_alloc_resource_any(dev, SYS_RES_IRQ, &irq->ii_rid, flags); if (res == NULL) { device_printf(dev, "failed to allocate IRQ for rid %d, name %s.\n", rid, name); return (ENOMEM); } irq->ii_res = res; KASSERT(filter == NULL || handler == NULL, ("filter and handler can't both be non-NULL")); rc = bus_setup_intr(dev, res, INTR_MPSAFE | INTR_TYPE_NET, filter, handler, arg, &tag); if (rc != 0) { device_printf(dev, "failed to setup interrupt for rid %d, name %s: %d\n", rid, name ? name : "unknown", rc); return (rc); } else if (name) bus_describe_intr(dev, res, tag, "%s", name); irq->ii_tag = tag; return (0); } /********************************************************************* * * Allocate memory for tx_buffer structures. The tx_buffer stores all * the information needed to transmit a packet on the wire. This is * called only once at attach, setup is done every reset. * **********************************************************************/ static int iflib_txsd_alloc(iflib_txq_t txq) { if_ctx_t ctx = txq->ift_ctx; if_shared_ctx_t sctx = ctx->ifc_sctx; if_softc_ctx_t scctx = &ctx->ifc_softc_ctx; device_t dev = ctx->ifc_dev; int err, nsegments, ntsosegments; nsegments = scctx->isc_tx_nsegments; ntsosegments = scctx->isc_tx_tso_segments_max; MPASS(scctx->isc_ntxd[0] > 0); MPASS(scctx->isc_ntxd[txq->ift_br_offset] > 0); MPASS(nsegments > 0); MPASS(ntsosegments > 0); /* * Setup DMA descriptor areas. */ if ((err = bus_dma_tag_create(bus_get_dma_tag(dev), 1, 0, /* alignment, bounds */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ sctx->isc_tx_maxsize, /* maxsize */ nsegments, /* nsegments */ sctx->isc_tx_maxsegsize, /* maxsegsize */ 0, /* flags */ NULL, /* lockfunc */ NULL, /* lockfuncarg */ &txq->ift_desc_tag))) { device_printf(dev,"Unable to allocate TX DMA tag: %d\n", err); device_printf(dev,"maxsize: %ju nsegments: %d maxsegsize: %ju\n", (uintmax_t)sctx->isc_tx_maxsize, nsegments, (uintmax_t)sctx->isc_tx_maxsegsize); goto fail; } if ((err = bus_dma_tag_create(bus_get_dma_tag(dev), 1, 0, /* alignment, bounds */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ scctx->isc_tx_tso_size_max, /* maxsize */ ntsosegments, /* nsegments */ scctx->isc_tx_tso_segsize_max, /* maxsegsize */ 0, /* flags */ NULL, /* lockfunc */ NULL, /* lockfuncarg */ &txq->ift_tso_desc_tag))) { device_printf(dev,"Unable to allocate TX TSO DMA tag: %d\n", err); goto fail; } if (!(txq->ift_sds.ifsd_flags = (uint8_t *) malloc(sizeof(uint8_t) * scctx->isc_ntxd[txq->ift_br_offset], M_IFLIB, M_NOWAIT | M_ZERO))) { device_printf(dev, "Unable to allocate tx_buffer memory\n"); err = ENOMEM; goto fail; } if (!(txq->ift_sds.ifsd_m = (struct mbuf **) malloc(sizeof(struct mbuf *) * scctx->isc_ntxd[txq->ift_br_offset], M_IFLIB, M_NOWAIT | M_ZERO))) { device_printf(dev, "Unable to allocate tx_buffer memory\n"); err = ENOMEM; goto fail; } /* Create the descriptor buffer dma maps */ #if defined(ACPI_DMAR) || (! (defined(__i386__) || defined(__amd64__))) if ((ctx->ifc_flags & IFC_DMAR) == 0) return (0); if (!(txq->ift_sds.ifsd_map = (bus_dmamap_t *) malloc(sizeof(bus_dmamap_t) * scctx->isc_ntxd[txq->ift_br_offset], M_IFLIB, M_NOWAIT | M_ZERO))) { device_printf(dev, "Unable to allocate tx_buffer map memory\n"); err = ENOMEM; goto fail; } for (int i = 0; i < scctx->isc_ntxd[txq->ift_br_offset]; i++) { err = bus_dmamap_create(txq->ift_desc_tag, 0, &txq->ift_sds.ifsd_map[i]); if (err != 0) { device_printf(dev, "Unable to create TX DMA map\n"); goto fail; } } #endif return (0); fail: /* We free all, it handles case where we are in the middle */ iflib_tx_structures_free(ctx); return (err); } static void iflib_txsd_destroy(if_ctx_t ctx, iflib_txq_t txq, int i) { bus_dmamap_t map; map = NULL; if (txq->ift_sds.ifsd_map != NULL) map = txq->ift_sds.ifsd_map[i]; if (map != NULL) { bus_dmamap_unload(txq->ift_desc_tag, map); bus_dmamap_destroy(txq->ift_desc_tag, map); txq->ift_sds.ifsd_map[i] = NULL; } } static void iflib_txq_destroy(iflib_txq_t txq) { if_ctx_t ctx = txq->ift_ctx; for (int i = 0; i < txq->ift_size; i++) iflib_txsd_destroy(ctx, txq, i); if (txq->ift_sds.ifsd_map != NULL) { free(txq->ift_sds.ifsd_map, M_IFLIB); txq->ift_sds.ifsd_map = NULL; } if (txq->ift_sds.ifsd_m != NULL) { free(txq->ift_sds.ifsd_m, M_IFLIB); txq->ift_sds.ifsd_m = NULL; } if (txq->ift_sds.ifsd_flags != NULL) { free(txq->ift_sds.ifsd_flags, M_IFLIB); txq->ift_sds.ifsd_flags = NULL; } if (txq->ift_desc_tag != NULL) { bus_dma_tag_destroy(txq->ift_desc_tag); txq->ift_desc_tag = NULL; } if (txq->ift_tso_desc_tag != NULL) { bus_dma_tag_destroy(txq->ift_tso_desc_tag); txq->ift_tso_desc_tag = NULL; } } static void iflib_txsd_free(if_ctx_t ctx, iflib_txq_t txq, int i) { struct mbuf **mp; mp = &txq->ift_sds.ifsd_m[i]; if (*mp == NULL) return; if (txq->ift_sds.ifsd_map != NULL) { bus_dmamap_sync(txq->ift_desc_tag, txq->ift_sds.ifsd_map[i], BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(txq->ift_desc_tag, txq->ift_sds.ifsd_map[i]); } m_free(*mp); DBG_COUNTER_INC(tx_frees); *mp = NULL; } static int iflib_txq_setup(iflib_txq_t txq) { if_ctx_t ctx = txq->ift_ctx; if_softc_ctx_t scctx = &ctx->ifc_softc_ctx; iflib_dma_info_t di; int i; /* Set number of descriptors available */ txq->ift_qstatus = IFLIB_QUEUE_IDLE; /* XXX make configurable */ txq->ift_update_freq = IFLIB_DEFAULT_TX_UPDATE_FREQ; /* Reset indices */ txq->ift_cidx_processed = 0; txq->ift_pidx = txq->ift_cidx = txq->ift_npending = 0; txq->ift_size = scctx->isc_ntxd[txq->ift_br_offset]; for (i = 0, di = txq->ift_ifdi; i < ctx->ifc_nhwtxqs; i++, di++) bzero((void *)di->idi_vaddr, di->idi_size); IFDI_TXQ_SETUP(ctx, txq->ift_id); for (i = 0, di = txq->ift_ifdi; i < ctx->ifc_nhwtxqs; i++, di++) bus_dmamap_sync(di->idi_tag, di->idi_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); return (0); } /********************************************************************* * * Allocate memory for rx_buffer structures. Since we use one * rx_buffer per received packet, the maximum number of rx_buffer's * that we'll need is equal to the number of receive descriptors * that we've allocated. * **********************************************************************/ static int iflib_rxsd_alloc(iflib_rxq_t rxq) { if_ctx_t ctx = rxq->ifr_ctx; if_shared_ctx_t sctx = ctx->ifc_sctx; if_softc_ctx_t scctx = &ctx->ifc_softc_ctx; device_t dev = ctx->ifc_dev; iflib_fl_t fl; int err; MPASS(scctx->isc_nrxd[0] > 0); MPASS(scctx->isc_nrxd[rxq->ifr_fl_offset] > 0); fl = rxq->ifr_fl; for (int i = 0; i < rxq->ifr_nfl; i++, fl++) { fl->ifl_size = scctx->isc_nrxd[rxq->ifr_fl_offset]; /* this isn't necessarily the same */ err = bus_dma_tag_create(bus_get_dma_tag(dev), /* parent */ 1, 0, /* alignment, bounds */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ sctx->isc_rx_maxsize, /* maxsize */ sctx->isc_rx_nsegments, /* nsegments */ sctx->isc_rx_maxsegsize, /* maxsegsize */ 0, /* flags */ NULL, /* lockfunc */ NULL, /* lockarg */ &fl->ifl_desc_tag); if (err) { device_printf(dev, "%s: bus_dma_tag_create failed %d\n", __func__, err); goto fail; } if (!(fl->ifl_sds.ifsd_flags = (uint8_t *) malloc(sizeof(uint8_t) * scctx->isc_nrxd[rxq->ifr_fl_offset], M_IFLIB, M_NOWAIT | M_ZERO))) { device_printf(dev, "Unable to allocate tx_buffer memory\n"); err = ENOMEM; goto fail; } if (!(fl->ifl_sds.ifsd_m = (struct mbuf **) malloc(sizeof(struct mbuf *) * scctx->isc_nrxd[rxq->ifr_fl_offset], M_IFLIB, M_NOWAIT | M_ZERO))) { device_printf(dev, "Unable to allocate tx_buffer memory\n"); err = ENOMEM; goto fail; } if (!(fl->ifl_sds.ifsd_cl = (caddr_t *) malloc(sizeof(caddr_t) * scctx->isc_nrxd[rxq->ifr_fl_offset], M_IFLIB, M_NOWAIT | M_ZERO))) { device_printf(dev, "Unable to allocate tx_buffer memory\n"); err = ENOMEM; goto fail; } /* Create the descriptor buffer dma maps */ #if defined(ACPI_DMAR) || (! (defined(__i386__) || defined(__amd64__))) if ((ctx->ifc_flags & IFC_DMAR) == 0) continue; if (!(fl->ifl_sds.ifsd_map = (bus_dmamap_t *) malloc(sizeof(bus_dmamap_t) * scctx->isc_nrxd[rxq->ifr_fl_offset], M_IFLIB, M_NOWAIT | M_ZERO))) { device_printf(dev, "Unable to allocate tx_buffer map memory\n"); err = ENOMEM; goto fail; } for (int i = 0; i < scctx->isc_nrxd[rxq->ifr_fl_offset]; i++) { err = bus_dmamap_create(fl->ifl_desc_tag, 0, &fl->ifl_sds.ifsd_map[i]); if (err != 0) { device_printf(dev, "Unable to create RX buffer DMA map\n"); goto fail; } } #endif } return (0); fail: iflib_rx_structures_free(ctx); return (err); } /* * Internal service routines */ struct rxq_refill_cb_arg { int error; bus_dma_segment_t seg; int nseg; }; static void _rxq_refill_cb(void *arg, bus_dma_segment_t *segs, int nseg, int error) { struct rxq_refill_cb_arg *cb_arg = arg; cb_arg->error = error; cb_arg->seg = segs[0]; cb_arg->nseg = nseg; } #ifdef ACPI_DMAR #define IS_DMAR(ctx) (ctx->ifc_flags & IFC_DMAR) #else #define IS_DMAR(ctx) (0) #endif /** * rxq_refill - refill an rxq free-buffer list * @ctx: the iflib context * @rxq: the free-list to refill * @n: the number of new buffers to allocate * * (Re)populate an rxq free-buffer list with up to @n new packet buffers. * The caller must assure that @n does not exceed the queue's capacity. */ static void _iflib_fl_refill(if_ctx_t ctx, iflib_fl_t fl, int count) { struct mbuf *m; int idx, frag_idx = fl->ifl_fragidx; int pidx = fl->ifl_pidx; caddr_t cl, *sd_cl; struct mbuf **sd_m; uint8_t *sd_flags; struct if_rxd_update iru; bus_dmamap_t *sd_map; int n, i = 0; uint64_t bus_addr; int err; qidx_t credits; sd_m = fl->ifl_sds.ifsd_m; sd_map = fl->ifl_sds.ifsd_map; sd_cl = fl->ifl_sds.ifsd_cl; sd_flags = fl->ifl_sds.ifsd_flags; idx = pidx; credits = fl->ifl_credits; n = count; MPASS(n > 0); MPASS(credits + n <= fl->ifl_size); if (pidx < fl->ifl_cidx) MPASS(pidx + n <= fl->ifl_cidx); if (pidx == fl->ifl_cidx && (credits < fl->ifl_size)) MPASS(fl->ifl_gen == 0); if (pidx > fl->ifl_cidx) MPASS(n <= fl->ifl_size - pidx + fl->ifl_cidx); DBG_COUNTER_INC(fl_refills); if (n > 8) DBG_COUNTER_INC(fl_refills_large); iru_init(&iru, fl->ifl_rxq, fl->ifl_id); while (n--) { /* * We allocate an uninitialized mbuf + cluster, mbuf is * initialized after rx. * * If the cluster is still set then we know a minimum sized packet was received */ bit_ffc_at(fl->ifl_rx_bitmap, frag_idx, fl->ifl_size, &frag_idx); if ((frag_idx < 0) || (frag_idx >= fl->ifl_size)) bit_ffc(fl->ifl_rx_bitmap, fl->ifl_size, &frag_idx); if ((cl = sd_cl[frag_idx]) == NULL) { if ((cl = sd_cl[frag_idx] = m_cljget(NULL, M_NOWAIT, fl->ifl_buf_size)) == NULL) break; #if MEMORY_LOGGING fl->ifl_cl_enqueued++; #endif } if ((m = m_gethdr(M_NOWAIT, MT_NOINIT)) == NULL) { break; } #if MEMORY_LOGGING fl->ifl_m_enqueued++; #endif DBG_COUNTER_INC(rx_allocs); #if defined(__i386__) || defined(__amd64__) if (!IS_DMAR(ctx)) { bus_addr = pmap_kextract((vm_offset_t)cl); } else #endif { struct rxq_refill_cb_arg cb_arg; cb_arg.error = 0; MPASS(sd_map != NULL); MPASS(sd_map[frag_idx] != NULL); err = bus_dmamap_load(fl->ifl_desc_tag, sd_map[frag_idx], cl, fl->ifl_buf_size, _rxq_refill_cb, &cb_arg, 0); bus_dmamap_sync(fl->ifl_desc_tag, sd_map[frag_idx], BUS_DMASYNC_PREREAD); if (err != 0 || cb_arg.error) { /* * !zone_pack ? */ if (fl->ifl_zone == zone_pack) uma_zfree(fl->ifl_zone, cl); m_free(m); n = 0; goto done; } bus_addr = cb_arg.seg.ds_addr; } bit_set(fl->ifl_rx_bitmap, frag_idx); sd_flags[frag_idx] |= RX_SW_DESC_INUSE; MPASS(sd_m[frag_idx] == NULL); sd_cl[frag_idx] = cl; sd_m[frag_idx] = m; fl->ifl_rxd_idxs[i] = frag_idx; fl->ifl_bus_addrs[i] = bus_addr; fl->ifl_vm_addrs[i] = cl; credits++; i++; MPASS(credits <= fl->ifl_size); if (++idx == fl->ifl_size) { fl->ifl_gen = 1; idx = 0; } if (n == 0 || i == IFLIB_MAX_RX_REFRESH) { iru.iru_pidx = pidx; iru.iru_count = i; ctx->isc_rxd_refill(ctx->ifc_softc, &iru); i = 0; pidx = idx; fl->ifl_pidx = idx; fl->ifl_credits = credits; } } done: if (i) { iru.iru_pidx = pidx; iru.iru_count = i; ctx->isc_rxd_refill(ctx->ifc_softc, &iru); fl->ifl_pidx = idx; fl->ifl_credits = credits; } DBG_COUNTER_INC(rxd_flush); if (fl->ifl_pidx == 0) pidx = fl->ifl_size - 1; else pidx = fl->ifl_pidx - 1; if (sd_map) bus_dmamap_sync(fl->ifl_ifdi->idi_tag, fl->ifl_ifdi->idi_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); ctx->isc_rxd_flush(ctx->ifc_softc, fl->ifl_rxq->ifr_id, fl->ifl_id, pidx); fl->ifl_fragidx = frag_idx; } static __inline void __iflib_fl_refill_lt(if_ctx_t ctx, iflib_fl_t fl, int max) { /* we avoid allowing pidx to catch up with cidx as it confuses ixl */ int32_t reclaimable = fl->ifl_size - fl->ifl_credits - 1; #ifdef INVARIANTS int32_t delta = fl->ifl_size - get_inuse(fl->ifl_size, fl->ifl_cidx, fl->ifl_pidx, fl->ifl_gen) - 1; #endif MPASS(fl->ifl_credits <= fl->ifl_size); MPASS(reclaimable == delta); if (reclaimable > 0) _iflib_fl_refill(ctx, fl, min(max, reclaimable)); } static void iflib_fl_bufs_free(iflib_fl_t fl) { iflib_dma_info_t idi = fl->ifl_ifdi; uint32_t i; for (i = 0; i < fl->ifl_size; i++) { struct mbuf **sd_m = &fl->ifl_sds.ifsd_m[i]; uint8_t *sd_flags = &fl->ifl_sds.ifsd_flags[i]; caddr_t *sd_cl = &fl->ifl_sds.ifsd_cl[i]; if (*sd_flags & RX_SW_DESC_INUSE) { if (fl->ifl_sds.ifsd_map != NULL) { bus_dmamap_t sd_map = fl->ifl_sds.ifsd_map[i]; bus_dmamap_unload(fl->ifl_desc_tag, sd_map); if (fl->ifl_rxq->ifr_ctx->ifc_in_detach) bus_dmamap_destroy(fl->ifl_desc_tag, sd_map); } if (*sd_m != NULL) { m_init(*sd_m, M_NOWAIT, MT_DATA, 0); uma_zfree(zone_mbuf, *sd_m); } if (*sd_cl != NULL) uma_zfree(fl->ifl_zone, *sd_cl); *sd_flags = 0; } else { MPASS(*sd_cl == NULL); MPASS(*sd_m == NULL); } #if MEMORY_LOGGING fl->ifl_m_dequeued++; fl->ifl_cl_dequeued++; #endif *sd_cl = NULL; *sd_m = NULL; } #ifdef INVARIANTS for (i = 0; i < fl->ifl_size; i++) { MPASS(fl->ifl_sds.ifsd_flags[i] == 0); MPASS(fl->ifl_sds.ifsd_cl[i] == NULL); MPASS(fl->ifl_sds.ifsd_m[i] == NULL); } #endif /* * Reset free list values */ fl->ifl_credits = fl->ifl_cidx = fl->ifl_pidx = fl->ifl_gen = fl->ifl_fragidx = 0; bzero(idi->idi_vaddr, idi->idi_size); } /********************************************************************* * * Initialize a receive ring and its buffers. * **********************************************************************/ static int iflib_fl_setup(iflib_fl_t fl) { iflib_rxq_t rxq = fl->ifl_rxq; if_ctx_t ctx = rxq->ifr_ctx; if_softc_ctx_t sctx = &ctx->ifc_softc_ctx; bit_nclear(fl->ifl_rx_bitmap, 0, fl->ifl_size - 1); /* ** Free current RX buffer structs and their mbufs */ iflib_fl_bufs_free(fl); /* Now replenish the mbufs */ MPASS(fl->ifl_credits == 0); /* * XXX don't set the max_frame_size to larger * than the hardware can handle */ if (sctx->isc_max_frame_size <= 2048) fl->ifl_buf_size = MCLBYTES; #ifndef CONTIGMALLOC_WORKS else fl->ifl_buf_size = MJUMPAGESIZE; #else else if (sctx->isc_max_frame_size <= 4096) fl->ifl_buf_size = MJUMPAGESIZE; else if (sctx->isc_max_frame_size <= 9216) fl->ifl_buf_size = MJUM9BYTES; else fl->ifl_buf_size = MJUM16BYTES; #endif if (fl->ifl_buf_size > ctx->ifc_max_fl_buf_size) ctx->ifc_max_fl_buf_size = fl->ifl_buf_size; fl->ifl_cltype = m_gettype(fl->ifl_buf_size); fl->ifl_zone = m_getzone(fl->ifl_buf_size); /* avoid pre-allocating zillions of clusters to an idle card * potentially speeding up attach */ _iflib_fl_refill(ctx, fl, min(128, fl->ifl_size)); MPASS(min(128, fl->ifl_size) == fl->ifl_credits); if (min(128, fl->ifl_size) != fl->ifl_credits) return (ENOBUFS); /* * handle failure */ MPASS(rxq != NULL); MPASS(fl->ifl_ifdi != NULL); bus_dmamap_sync(fl->ifl_ifdi->idi_tag, fl->ifl_ifdi->idi_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); return (0); } /********************************************************************* * * Free receive ring data structures * **********************************************************************/ static void iflib_rx_sds_free(iflib_rxq_t rxq) { iflib_fl_t fl; int i; if (rxq->ifr_fl != NULL) { for (i = 0; i < rxq->ifr_nfl; i++) { fl = &rxq->ifr_fl[i]; if (fl->ifl_desc_tag != NULL) { bus_dma_tag_destroy(fl->ifl_desc_tag); fl->ifl_desc_tag = NULL; } free(fl->ifl_sds.ifsd_m, M_IFLIB); free(fl->ifl_sds.ifsd_cl, M_IFLIB); /* XXX destroy maps first */ free(fl->ifl_sds.ifsd_map, M_IFLIB); fl->ifl_sds.ifsd_m = NULL; fl->ifl_sds.ifsd_cl = NULL; fl->ifl_sds.ifsd_map = NULL; } free(rxq->ifr_fl, M_IFLIB); rxq->ifr_fl = NULL; rxq->ifr_cq_gen = rxq->ifr_cq_cidx = rxq->ifr_cq_pidx = 0; } } /* * MI independent logic * */ static void iflib_timer(void *arg) { iflib_txq_t txq = arg; if_ctx_t ctx = txq->ift_ctx; if_softc_ctx_t sctx = &ctx->ifc_softc_ctx; if (!(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING)) return; /* ** Check on the state of the TX queue(s), this ** can be done without the lock because its RO ** and the HUNG state will be static if set. */ IFDI_TIMER(ctx, txq->ift_id); if ((txq->ift_qstatus == IFLIB_QUEUE_HUNG) && ((txq->ift_cleaned_prev == txq->ift_cleaned) || (sctx->isc_pause_frames == 0))) goto hung; if (ifmp_ring_is_stalled(txq->ift_br)) txq->ift_qstatus = IFLIB_QUEUE_HUNG; txq->ift_cleaned_prev = txq->ift_cleaned; /* handle any laggards */ if (txq->ift_db_pending) GROUPTASK_ENQUEUE(&txq->ift_task); sctx->isc_pause_frames = 0; if (if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING) callout_reset_on(&txq->ift_timer, hz/2, iflib_timer, txq, txq->ift_timer.c_cpu); return; hung: device_printf(ctx->ifc_dev, "TX(%d) desc avail = %d, pidx = %d\n", txq->ift_id, TXQ_AVAIL(txq), txq->ift_pidx); STATE_LOCK(ctx); if_setdrvflagbits(ctx->ifc_ifp, IFF_DRV_OACTIVE, IFF_DRV_RUNNING); ctx->ifc_flags |= (IFC_DO_WATCHDOG|IFC_DO_RESET); iflib_admin_intr_deferred(ctx); STATE_UNLOCK(ctx); } static void iflib_init_locked(if_ctx_t ctx) { if_softc_ctx_t sctx = &ctx->ifc_softc_ctx; if_softc_ctx_t scctx = &ctx->ifc_softc_ctx; if_t ifp = ctx->ifc_ifp; iflib_fl_t fl; iflib_txq_t txq; iflib_rxq_t rxq; int i, j, tx_ip_csum_flags, tx_ip6_csum_flags; if_setdrvflagbits(ifp, IFF_DRV_OACTIVE, IFF_DRV_RUNNING); IFDI_INTR_DISABLE(ctx); tx_ip_csum_flags = scctx->isc_tx_csum_flags & (CSUM_IP | CSUM_TCP | CSUM_UDP | CSUM_SCTP); tx_ip6_csum_flags = scctx->isc_tx_csum_flags & (CSUM_IP6_TCP | CSUM_IP6_UDP | CSUM_IP6_SCTP); /* Set hardware offload abilities */ if_clearhwassist(ifp); if (if_getcapenable(ifp) & IFCAP_TXCSUM) if_sethwassistbits(ifp, tx_ip_csum_flags, 0); if (if_getcapenable(ifp) & IFCAP_TXCSUM_IPV6) if_sethwassistbits(ifp, tx_ip6_csum_flags, 0); if (if_getcapenable(ifp) & IFCAP_TSO4) if_sethwassistbits(ifp, CSUM_IP_TSO, 0); if (if_getcapenable(ifp) & IFCAP_TSO6) if_sethwassistbits(ifp, CSUM_IP6_TSO, 0); for (i = 0, txq = ctx->ifc_txqs; i < sctx->isc_ntxqsets; i++, txq++) { CALLOUT_LOCK(txq); callout_stop(&txq->ift_timer); CALLOUT_UNLOCK(txq); iflib_netmap_txq_init(ctx, txq); } #ifdef INVARIANTS i = if_getdrvflags(ifp); #endif IFDI_INIT(ctx); MPASS(if_getdrvflags(ifp) == i); for (i = 0, rxq = ctx->ifc_rxqs; i < sctx->isc_nrxqsets; i++, rxq++) { /* XXX this should really be done on a per-queue basis */ if (if_getcapenable(ifp) & IFCAP_NETMAP) { MPASS(rxq->ifr_id == i); iflib_netmap_rxq_init(ctx, rxq); continue; } for (j = 0, fl = rxq->ifr_fl; j < rxq->ifr_nfl; j++, fl++) { if (iflib_fl_setup(fl)) { device_printf(ctx->ifc_dev, "freelist setup failed - check cluster settings\n"); goto done; } } } done: if_setdrvflagbits(ctx->ifc_ifp, IFF_DRV_RUNNING, IFF_DRV_OACTIVE); IFDI_INTR_ENABLE(ctx); txq = ctx->ifc_txqs; for (i = 0; i < sctx->isc_ntxqsets; i++, txq++) callout_reset_on(&txq->ift_timer, hz/2, iflib_timer, txq, txq->ift_timer.c_cpu); } static int iflib_media_change(if_t ifp) { if_ctx_t ctx = if_getsoftc(ifp); int err; CTX_LOCK(ctx); if ((err = IFDI_MEDIA_CHANGE(ctx)) == 0) iflib_init_locked(ctx); CTX_UNLOCK(ctx); return (err); } static void iflib_media_status(if_t ifp, struct ifmediareq *ifmr) { if_ctx_t ctx = if_getsoftc(ifp); CTX_LOCK(ctx); IFDI_UPDATE_ADMIN_STATUS(ctx); IFDI_MEDIA_STATUS(ctx, ifmr); CTX_UNLOCK(ctx); } void iflib_stop(if_ctx_t ctx) { iflib_txq_t txq = ctx->ifc_txqs; iflib_rxq_t rxq = ctx->ifc_rxqs; if_softc_ctx_t scctx = &ctx->ifc_softc_ctx; iflib_dma_info_t di; iflib_fl_t fl; int i, j; /* Tell the stack that the interface is no longer active */ if_setdrvflagbits(ctx->ifc_ifp, IFF_DRV_OACTIVE, IFF_DRV_RUNNING); IFDI_INTR_DISABLE(ctx); DELAY(1000); IFDI_STOP(ctx); DELAY(1000); iflib_debug_reset(); /* Wait for current tx queue users to exit to disarm watchdog timer. */ for (i = 0; i < scctx->isc_ntxqsets; i++, txq++) { /* make sure all transmitters have completed before proceeding XXX */ CALLOUT_LOCK(txq); callout_stop(&txq->ift_timer); CALLOUT_UNLOCK(txq); /* clean any enqueued buffers */ iflib_ifmp_purge(txq); /* Free any existing tx buffers. */ for (j = 0; j < txq->ift_size; j++) { iflib_txsd_free(ctx, txq, j); } txq->ift_processed = txq->ift_cleaned = txq->ift_cidx_processed = 0; txq->ift_in_use = txq->ift_gen = txq->ift_cidx = txq->ift_pidx = txq->ift_no_desc_avail = 0; txq->ift_closed = txq->ift_mbuf_defrag = txq->ift_mbuf_defrag_failed = 0; txq->ift_no_tx_dma_setup = txq->ift_txd_encap_efbig = txq->ift_map_failed = 0; txq->ift_pullups = 0; ifmp_ring_reset_stats(txq->ift_br); for (j = 0, di = txq->ift_ifdi; j < ctx->ifc_nhwtxqs; j++, di++) bzero((void *)di->idi_vaddr, di->idi_size); } for (i = 0; i < scctx->isc_nrxqsets; i++, rxq++) { /* make sure all transmitters have completed before proceeding XXX */ for (j = 0, di = rxq->ifr_ifdi; j < rxq->ifr_nfl; j++, di++) bzero((void *)di->idi_vaddr, di->idi_size); /* also resets the free lists pidx/cidx */ for (j = 0, fl = rxq->ifr_fl; j < rxq->ifr_nfl; j++, fl++) iflib_fl_bufs_free(fl); } } static inline caddr_t calc_next_rxd(iflib_fl_t fl, int cidx) { qidx_t size; int nrxd; caddr_t start, end, cur, next; nrxd = fl->ifl_size; size = fl->ifl_rxd_size; start = fl->ifl_ifdi->idi_vaddr; if (__predict_false(size == 0)) return (start); cur = start + size*cidx; end = start + size*nrxd; next = CACHE_PTR_NEXT(cur); return (next < end ? next : start); } static inline void prefetch_pkts(iflib_fl_t fl, int cidx) { int nextptr; int nrxd = fl->ifl_size; caddr_t next_rxd; nextptr = (cidx + CACHE_PTR_INCREMENT) & (nrxd-1); prefetch(&fl->ifl_sds.ifsd_m[nextptr]); prefetch(&fl->ifl_sds.ifsd_cl[nextptr]); next_rxd = calc_next_rxd(fl, cidx); prefetch(next_rxd); prefetch(fl->ifl_sds.ifsd_m[(cidx + 1) & (nrxd-1)]); prefetch(fl->ifl_sds.ifsd_m[(cidx + 2) & (nrxd-1)]); prefetch(fl->ifl_sds.ifsd_m[(cidx + 3) & (nrxd-1)]); prefetch(fl->ifl_sds.ifsd_m[(cidx + 4) & (nrxd-1)]); prefetch(fl->ifl_sds.ifsd_cl[(cidx + 1) & (nrxd-1)]); prefetch(fl->ifl_sds.ifsd_cl[(cidx + 2) & (nrxd-1)]); prefetch(fl->ifl_sds.ifsd_cl[(cidx + 3) & (nrxd-1)]); prefetch(fl->ifl_sds.ifsd_cl[(cidx + 4) & (nrxd-1)]); } static void rxd_frag_to_sd(iflib_rxq_t rxq, if_rxd_frag_t irf, int unload, if_rxsd_t sd) { int flid, cidx; bus_dmamap_t map; iflib_fl_t fl; iflib_dma_info_t di; int next; map = NULL; flid = irf->irf_flid; cidx = irf->irf_idx; fl = &rxq->ifr_fl[flid]; sd->ifsd_fl = fl; sd->ifsd_cidx = cidx; sd->ifsd_m = &fl->ifl_sds.ifsd_m[cidx]; sd->ifsd_cl = &fl->ifl_sds.ifsd_cl[cidx]; fl->ifl_credits--; #if MEMORY_LOGGING fl->ifl_m_dequeued++; #endif if (rxq->ifr_ctx->ifc_flags & IFC_PREFETCH) prefetch_pkts(fl, cidx); if (fl->ifl_sds.ifsd_map != NULL) { next = (cidx + CACHE_PTR_INCREMENT) & (fl->ifl_size-1); prefetch(&fl->ifl_sds.ifsd_map[next]); map = fl->ifl_sds.ifsd_map[cidx]; di = fl->ifl_ifdi; next = (cidx + CACHE_LINE_SIZE) & (fl->ifl_size-1); prefetch(&fl->ifl_sds.ifsd_flags[next]); bus_dmamap_sync(di->idi_tag, di->idi_map, BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); /* not valid assert if bxe really does SGE from non-contiguous elements */ MPASS(fl->ifl_cidx == cidx); if (unload) bus_dmamap_unload(fl->ifl_desc_tag, map); } fl->ifl_cidx = (fl->ifl_cidx + 1) & (fl->ifl_size-1); if (__predict_false(fl->ifl_cidx == 0)) fl->ifl_gen = 0; if (map != NULL) bus_dmamap_sync(fl->ifl_ifdi->idi_tag, fl->ifl_ifdi->idi_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); bit_clear(fl->ifl_rx_bitmap, cidx); } static struct mbuf * assemble_segments(iflib_rxq_t rxq, if_rxd_info_t ri, if_rxsd_t sd) { int i, padlen , flags; struct mbuf *m, *mh, *mt; caddr_t cl; i = 0; mh = NULL; do { rxd_frag_to_sd(rxq, &ri->iri_frags[i], TRUE, sd); MPASS(*sd->ifsd_cl != NULL); MPASS(*sd->ifsd_m != NULL); /* Don't include zero-length frags */ if (ri->iri_frags[i].irf_len == 0) { /* XXX we can save the cluster here, but not the mbuf */ m_init(*sd->ifsd_m, M_NOWAIT, MT_DATA, 0); m_free(*sd->ifsd_m); *sd->ifsd_m = NULL; continue; } m = *sd->ifsd_m; *sd->ifsd_m = NULL; if (mh == NULL) { flags = M_PKTHDR|M_EXT; mh = mt = m; padlen = ri->iri_pad; } else { flags = M_EXT; mt->m_next = m; mt = m; /* assuming padding is only on the first fragment */ padlen = 0; } cl = *sd->ifsd_cl; *sd->ifsd_cl = NULL; /* Can these two be made one ? */ m_init(m, M_NOWAIT, MT_DATA, flags); m_cljset(m, cl, sd->ifsd_fl->ifl_cltype); /* * These must follow m_init and m_cljset */ m->m_data += padlen; ri->iri_len -= padlen; m->m_len = ri->iri_frags[i].irf_len; } while (++i < ri->iri_nfrags); return (mh); } /* * Process one software descriptor */ static struct mbuf * iflib_rxd_pkt_get(iflib_rxq_t rxq, if_rxd_info_t ri) { struct if_rxsd sd; struct mbuf *m; /* should I merge this back in now that the two paths are basically duplicated? */ if (ri->iri_nfrags == 1 && ri->iri_frags[0].irf_len <= MIN(IFLIB_RX_COPY_THRESH, MHLEN)) { rxd_frag_to_sd(rxq, &ri->iri_frags[0], FALSE, &sd); m = *sd.ifsd_m; *sd.ifsd_m = NULL; m_init(m, M_NOWAIT, MT_DATA, M_PKTHDR); #ifndef __NO_STRICT_ALIGNMENT if (!IP_ALIGNED(m)) m->m_data += 2; #endif memcpy(m->m_data, *sd.ifsd_cl, ri->iri_len); m->m_len = ri->iri_frags[0].irf_len; } else { m = assemble_segments(rxq, ri, &sd); } m->m_pkthdr.len = ri->iri_len; m->m_pkthdr.rcvif = ri->iri_ifp; m->m_flags |= ri->iri_flags; m->m_pkthdr.ether_vtag = ri->iri_vtag; m->m_pkthdr.flowid = ri->iri_flowid; M_HASHTYPE_SET(m, ri->iri_rsstype); m->m_pkthdr.csum_flags = ri->iri_csum_flags; m->m_pkthdr.csum_data = ri->iri_csum_data; return (m); } #if defined(INET6) || defined(INET) static void iflib_get_ip_forwarding(struct lro_ctrl *lc, bool *v4, bool *v6) { CURVNET_SET(lc->ifp->if_vnet); #if defined(INET6) *v6 = VNET(ip6_forwarding); #endif #if defined(INET) *v4 = VNET(ipforwarding); #endif CURVNET_RESTORE(); } /* * Returns true if it's possible this packet could be LROed. * if it returns false, it is guaranteed that tcp_lro_rx() * would not return zero. */ static bool iflib_check_lro_possible(struct mbuf *m, bool v4_forwarding, bool v6_forwarding) { struct ether_header *eh; uint16_t eh_type; eh = mtod(m, struct ether_header *); eh_type = ntohs(eh->ether_type); switch (eh_type) { #if defined(INET6) case ETHERTYPE_IPV6: return !v6_forwarding; #endif #if defined (INET) case ETHERTYPE_IP: return !v4_forwarding; #endif } return false; } #else static void iflib_get_ip_forwarding(struct lro_ctrl *lc __unused, bool *v4 __unused, bool *v6 __unused) { } #endif static bool iflib_rxeof(iflib_rxq_t rxq, qidx_t budget) { if_ctx_t ctx = rxq->ifr_ctx; if_shared_ctx_t sctx = ctx->ifc_sctx; if_softc_ctx_t scctx = &ctx->ifc_softc_ctx; int avail, i; qidx_t *cidxp; struct if_rxd_info ri; int err, budget_left, rx_bytes, rx_pkts; iflib_fl_t fl; struct ifnet *ifp; int lro_enabled; bool lro_possible = false; bool v4_forwarding, v6_forwarding; /* * XXX early demux data packets so that if_input processing only handles * acks in interrupt context */ struct mbuf *m, *mh, *mt, *mf; ifp = ctx->ifc_ifp; mh = mt = NULL; MPASS(budget > 0); rx_pkts = rx_bytes = 0; if (sctx->isc_flags & IFLIB_HAS_RXCQ) cidxp = &rxq->ifr_cq_cidx; else cidxp = &rxq->ifr_fl[0].ifl_cidx; if ((avail = iflib_rxd_avail(ctx, rxq, *cidxp, budget)) == 0) { for (i = 0, fl = &rxq->ifr_fl[0]; i < sctx->isc_nfl; i++, fl++) __iflib_fl_refill_lt(ctx, fl, budget + 8); DBG_COUNTER_INC(rx_unavail); return (false); } for (budget_left = budget; (budget_left > 0) && (avail > 0); budget_left--, avail--) { if (__predict_false(!CTX_ACTIVE(ctx))) { DBG_COUNTER_INC(rx_ctx_inactive); break; } /* * Reset client set fields to their default values */ rxd_info_zero(&ri); ri.iri_qsidx = rxq->ifr_id; ri.iri_cidx = *cidxp; ri.iri_ifp = ifp; ri.iri_frags = rxq->ifr_frags; err = ctx->isc_rxd_pkt_get(ctx->ifc_softc, &ri); if (err) goto err; if (sctx->isc_flags & IFLIB_HAS_RXCQ) { *cidxp = ri.iri_cidx; /* Update our consumer index */ /* XXX NB: shurd - check if this is still safe */ while (rxq->ifr_cq_cidx >= scctx->isc_nrxd[0]) { rxq->ifr_cq_cidx -= scctx->isc_nrxd[0]; rxq->ifr_cq_gen = 0; } /* was this only a completion queue message? */ if (__predict_false(ri.iri_nfrags == 0)) continue; } MPASS(ri.iri_nfrags != 0); MPASS(ri.iri_len != 0); /* will advance the cidx on the corresponding free lists */ m = iflib_rxd_pkt_get(rxq, &ri); if (avail == 0 && budget_left) avail = iflib_rxd_avail(ctx, rxq, *cidxp, budget_left); if (__predict_false(m == NULL)) { DBG_COUNTER_INC(rx_mbuf_null); continue; } /* imm_pkt: -- cxgb */ if (mh == NULL) mh = mt = m; else { mt->m_nextpkt = m; mt = m; } } /* make sure that we can refill faster than drain */ for (i = 0, fl = &rxq->ifr_fl[0]; i < sctx->isc_nfl; i++, fl++) __iflib_fl_refill_lt(ctx, fl, budget + 8); lro_enabled = (if_getcapenable(ifp) & IFCAP_LRO); if (lro_enabled) iflib_get_ip_forwarding(&rxq->ifr_lc, &v4_forwarding, &v6_forwarding); mt = mf = NULL; while (mh != NULL) { m = mh; mh = mh->m_nextpkt; m->m_nextpkt = NULL; #ifndef __NO_STRICT_ALIGNMENT if (!IP_ALIGNED(m) && (m = iflib_fixup_rx(m)) == NULL) continue; #endif rx_bytes += m->m_pkthdr.len; rx_pkts++; #if defined(INET6) || defined(INET) if (lro_enabled) { if (!lro_possible) { lro_possible = iflib_check_lro_possible(m, v4_forwarding, v6_forwarding); if (lro_possible && mf != NULL) { ifp->if_input(ifp, mf); DBG_COUNTER_INC(rx_if_input); mt = mf = NULL; } } if ((m->m_pkthdr.csum_flags & (CSUM_L4_CALC|CSUM_L4_VALID)) == (CSUM_L4_CALC|CSUM_L4_VALID)) { if (lro_possible && tcp_lro_rx(&rxq->ifr_lc, m, 0) == 0) continue; } } #endif if (lro_possible) { ifp->if_input(ifp, m); DBG_COUNTER_INC(rx_if_input); continue; } if (mf == NULL) mf = m; if (mt != NULL) mt->m_nextpkt = m; mt = m; } if (mf != NULL) { ifp->if_input(ifp, mf); DBG_COUNTER_INC(rx_if_input); } if_inc_counter(ifp, IFCOUNTER_IBYTES, rx_bytes); if_inc_counter(ifp, IFCOUNTER_IPACKETS, rx_pkts); /* * Flush any outstanding LRO work */ #if defined(INET6) || defined(INET) tcp_lro_flush_all(&rxq->ifr_lc); #endif if (avail) return true; return (iflib_rxd_avail(ctx, rxq, *cidxp, 1)); err: STATE_LOCK(ctx); ctx->ifc_flags |= IFC_DO_RESET; iflib_admin_intr_deferred(ctx); STATE_UNLOCK(ctx); return (false); } #define TXD_NOTIFY_COUNT(txq) (((txq)->ift_size / (txq)->ift_update_freq)-1) static inline qidx_t txq_max_db_deferred(iflib_txq_t txq, qidx_t in_use) { qidx_t notify_count = TXD_NOTIFY_COUNT(txq); qidx_t minthresh = txq->ift_size / 8; if (in_use > 4*minthresh) return (notify_count); if (in_use > 2*minthresh) return (notify_count >> 1); if (in_use > minthresh) return (notify_count >> 3); return (0); } static inline qidx_t txq_max_rs_deferred(iflib_txq_t txq) { qidx_t notify_count = TXD_NOTIFY_COUNT(txq); qidx_t minthresh = txq->ift_size / 8; if (txq->ift_in_use > 4*minthresh) return (notify_count); if (txq->ift_in_use > 2*minthresh) return (notify_count >> 1); if (txq->ift_in_use > minthresh) return (notify_count >> 2); return (2); } #define M_CSUM_FLAGS(m) ((m)->m_pkthdr.csum_flags) #define M_HAS_VLANTAG(m) (m->m_flags & M_VLANTAG) #define TXQ_MAX_DB_DEFERRED(txq, in_use) txq_max_db_deferred((txq), (in_use)) #define TXQ_MAX_RS_DEFERRED(txq) txq_max_rs_deferred(txq) #define TXQ_MAX_DB_CONSUMED(size) (size >> 4) /* forward compatibility for cxgb */ #define FIRST_QSET(ctx) 0 #define NTXQSETS(ctx) ((ctx)->ifc_softc_ctx.isc_ntxqsets) #define NRXQSETS(ctx) ((ctx)->ifc_softc_ctx.isc_nrxqsets) #define QIDX(ctx, m) ((((m)->m_pkthdr.flowid & ctx->ifc_softc_ctx.isc_rss_table_mask) % NTXQSETS(ctx)) + FIRST_QSET(ctx)) #define DESC_RECLAIMABLE(q) ((int)((q)->ift_processed - (q)->ift_cleaned - (q)->ift_ctx->ifc_softc_ctx.isc_tx_nsegments)) /* XXX we should be setting this to something other than zero */ #define RECLAIM_THRESH(ctx) ((ctx)->ifc_sctx->isc_tx_reclaim_thresh) #define MAX_TX_DESC(ctx) ((ctx)->ifc_softc_ctx.isc_tx_tso_segments_max) static inline bool iflib_txd_db_check(if_ctx_t ctx, iflib_txq_t txq, int ring, qidx_t in_use) { qidx_t dbval, max; bool rang; rang = false; max = TXQ_MAX_DB_DEFERRED(txq, in_use); if (ring || txq->ift_db_pending >= max) { dbval = txq->ift_npending ? txq->ift_npending : txq->ift_pidx; ctx->isc_txd_flush(ctx->ifc_softc, txq->ift_id, dbval); txq->ift_db_pending = txq->ift_npending = 0; rang = true; } return (rang); } #ifdef PKT_DEBUG static void print_pkt(if_pkt_info_t pi) { printf("pi len: %d qsidx: %d nsegs: %d ndescs: %d flags: %x pidx: %d\n", pi->ipi_len, pi->ipi_qsidx, pi->ipi_nsegs, pi->ipi_ndescs, pi->ipi_flags, pi->ipi_pidx); printf("pi new_pidx: %d csum_flags: %lx tso_segsz: %d mflags: %x vtag: %d\n", pi->ipi_new_pidx, pi->ipi_csum_flags, pi->ipi_tso_segsz, pi->ipi_mflags, pi->ipi_vtag); printf("pi etype: %d ehdrlen: %d ip_hlen: %d ipproto: %d\n", pi->ipi_etype, pi->ipi_ehdrlen, pi->ipi_ip_hlen, pi->ipi_ipproto); } #endif #define IS_TSO4(pi) ((pi)->ipi_csum_flags & CSUM_IP_TSO) #define IS_TSO6(pi) ((pi)->ipi_csum_flags & CSUM_IP6_TSO) static int iflib_parse_header(iflib_txq_t txq, if_pkt_info_t pi, struct mbuf **mp) { if_shared_ctx_t sctx = txq->ift_ctx->ifc_sctx; struct ether_vlan_header *eh; struct mbuf *m, *n; n = m = *mp; if ((sctx->isc_flags & IFLIB_NEED_SCRATCH) && M_WRITABLE(m) == 0) { if ((m = m_dup(m, M_NOWAIT)) == NULL) { return (ENOMEM); } else { m_freem(*mp); n = *mp = m; } } /* * Determine where frame payload starts. * Jump over vlan headers if already present, * helpful for QinQ too. */ if (__predict_false(m->m_len < sizeof(*eh))) { txq->ift_pullups++; if (__predict_false((m = m_pullup(m, sizeof(*eh))) == NULL)) return (ENOMEM); } eh = mtod(m, struct ether_vlan_header *); if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) { pi->ipi_etype = ntohs(eh->evl_proto); pi->ipi_ehdrlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN; } else { pi->ipi_etype = ntohs(eh->evl_encap_proto); pi->ipi_ehdrlen = ETHER_HDR_LEN; } switch (pi->ipi_etype) { #ifdef INET case ETHERTYPE_IP: { struct ip *ip = NULL; struct tcphdr *th = NULL; int minthlen; minthlen = min(m->m_pkthdr.len, pi->ipi_ehdrlen + sizeof(*ip) + sizeof(*th)); if (__predict_false(m->m_len < minthlen)) { /* * if this code bloat is causing too much of a hit * move it to a separate function and mark it noinline */ if (m->m_len == pi->ipi_ehdrlen) { n = m->m_next; MPASS(n); if (n->m_len >= sizeof(*ip)) { ip = (struct ip *)n->m_data; if (n->m_len >= (ip->ip_hl << 2) + sizeof(*th)) th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2)); } else { txq->ift_pullups++; if (__predict_false((m = m_pullup(m, minthlen)) == NULL)) return (ENOMEM); ip = (struct ip *)(m->m_data + pi->ipi_ehdrlen); } } else { txq->ift_pullups++; if (__predict_false((m = m_pullup(m, minthlen)) == NULL)) return (ENOMEM); ip = (struct ip *)(m->m_data + pi->ipi_ehdrlen); if (m->m_len >= (ip->ip_hl << 2) + sizeof(*th)) th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2)); } } else { ip = (struct ip *)(m->m_data + pi->ipi_ehdrlen); if (m->m_len >= (ip->ip_hl << 2) + sizeof(*th)) th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2)); } pi->ipi_ip_hlen = ip->ip_hl << 2; pi->ipi_ipproto = ip->ip_p; pi->ipi_flags |= IPI_TX_IPV4; if ((sctx->isc_flags & IFLIB_NEED_ZERO_CSUM) && (pi->ipi_csum_flags & CSUM_IP)) ip->ip_sum = 0; if (IS_TSO4(pi)) { if (pi->ipi_ipproto == IPPROTO_TCP) { if (__predict_false(th == NULL)) { txq->ift_pullups++; if (__predict_false((m = m_pullup(m, (ip->ip_hl << 2) + sizeof(*th))) == NULL)) return (ENOMEM); th = (struct tcphdr *)((caddr_t)ip + pi->ipi_ip_hlen); } pi->ipi_tcp_hflags = th->th_flags; pi->ipi_tcp_hlen = th->th_off << 2; pi->ipi_tcp_seq = th->th_seq; } if (__predict_false(ip->ip_p != IPPROTO_TCP)) return (ENXIO); th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(IPPROTO_TCP)); pi->ipi_tso_segsz = m->m_pkthdr.tso_segsz; if (sctx->isc_flags & IFLIB_TSO_INIT_IP) { ip->ip_sum = 0; ip->ip_len = htons(pi->ipi_ip_hlen + pi->ipi_tcp_hlen + pi->ipi_tso_segsz); } } break; } #endif #ifdef INET6 case ETHERTYPE_IPV6: { struct ip6_hdr *ip6 = (struct ip6_hdr *)(m->m_data + pi->ipi_ehdrlen); struct tcphdr *th; pi->ipi_ip_hlen = sizeof(struct ip6_hdr); if (__predict_false(m->m_len < pi->ipi_ehdrlen + sizeof(struct ip6_hdr))) { if (__predict_false((m = m_pullup(m, pi->ipi_ehdrlen + sizeof(struct ip6_hdr))) == NULL)) return (ENOMEM); } th = (struct tcphdr *)((caddr_t)ip6 + pi->ipi_ip_hlen); /* XXX-BZ this will go badly in case of ext hdrs. */ pi->ipi_ipproto = ip6->ip6_nxt; pi->ipi_flags |= IPI_TX_IPV6; if (IS_TSO6(pi)) { if (pi->ipi_ipproto == IPPROTO_TCP) { if (__predict_false(m->m_len < pi->ipi_ehdrlen + sizeof(struct ip6_hdr) + sizeof(struct tcphdr))) { if (__predict_false((m = m_pullup(m, pi->ipi_ehdrlen + sizeof(struct ip6_hdr) + sizeof(struct tcphdr))) == NULL)) return (ENOMEM); } pi->ipi_tcp_hflags = th->th_flags; pi->ipi_tcp_hlen = th->th_off << 2; } if (__predict_false(ip6->ip6_nxt != IPPROTO_TCP)) return (ENXIO); /* * The corresponding flag is set by the stack in the IPv4 * TSO case, but not in IPv6 (at least in FreeBSD 10.2). * So, set it here because the rest of the flow requires it. */ pi->ipi_csum_flags |= CSUM_TCP_IPV6; th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0); pi->ipi_tso_segsz = m->m_pkthdr.tso_segsz; } break; } #endif default: pi->ipi_csum_flags &= ~CSUM_OFFLOAD; pi->ipi_ip_hlen = 0; break; } *mp = m; return (0); } static __noinline struct mbuf * collapse_pkthdr(struct mbuf *m0) { struct mbuf *m, *m_next, *tmp; m = m0; m_next = m->m_next; while (m_next != NULL && m_next->m_len == 0) { m = m_next; m->m_next = NULL; m_free(m); m_next = m_next->m_next; } m = m0; m->m_next = m_next; if ((m_next->m_flags & M_EXT) == 0) { m = m_defrag(m, M_NOWAIT); } else { tmp = m_next->m_next; memcpy(m_next, m, MPKTHSIZE); m = m_next; m->m_next = tmp; } return (m); } /* * If dodgy hardware rejects the scatter gather chain we've handed it * we'll need to remove the mbuf chain from ifsg_m[] before we can add the * m_defrag'd mbufs */ static __noinline struct mbuf * iflib_remove_mbuf(iflib_txq_t txq) { int ntxd, i, pidx; struct mbuf *m, *mh, **ifsd_m; pidx = txq->ift_pidx; ifsd_m = txq->ift_sds.ifsd_m; ntxd = txq->ift_size; mh = m = ifsd_m[pidx]; ifsd_m[pidx] = NULL; #if MEMORY_LOGGING txq->ift_dequeued++; #endif i = 1; while (m) { ifsd_m[(pidx + i) & (ntxd -1)] = NULL; #if MEMORY_LOGGING txq->ift_dequeued++; #endif m = m->m_next; i++; } return (mh); } static int iflib_busdma_load_mbuf_sg(iflib_txq_t txq, bus_dma_tag_t tag, bus_dmamap_t map, struct mbuf **m0, bus_dma_segment_t *segs, int *nsegs, int max_segs, int flags) { if_ctx_t ctx; if_shared_ctx_t sctx; if_softc_ctx_t scctx; int i, next, pidx, err, ntxd, count; struct mbuf *m, *tmp, **ifsd_m; m = *m0; /* * Please don't ever do this */ if (__predict_false(m->m_len == 0)) *m0 = m = collapse_pkthdr(m); ctx = txq->ift_ctx; sctx = ctx->ifc_sctx; scctx = &ctx->ifc_softc_ctx; ifsd_m = txq->ift_sds.ifsd_m; ntxd = txq->ift_size; pidx = txq->ift_pidx; if (map != NULL) { uint8_t *ifsd_flags = txq->ift_sds.ifsd_flags; err = bus_dmamap_load_mbuf_sg(tag, map, *m0, segs, nsegs, BUS_DMA_NOWAIT); if (err) return (err); ifsd_flags[pidx] |= TX_SW_DESC_MAPPED; count = 0; m = *m0; do { if (__predict_false(m->m_len <= 0)) { tmp = m; m = m->m_next; tmp->m_next = NULL; m_free(tmp); continue; } m = m->m_next; count++; } while (m != NULL); if (count > *nsegs) { ifsd_m[pidx] = *m0; ifsd_m[pidx]->m_flags |= M_TOOBIG; return (0); } m = *m0; count = 0; do { next = (pidx + count) & (ntxd-1); MPASS(ifsd_m[next] == NULL); ifsd_m[next] = m; count++; tmp = m; m = m->m_next; } while (m != NULL); } else { int buflen, sgsize, maxsegsz, max_sgsize; vm_offset_t vaddr; vm_paddr_t curaddr; count = i = 0; m = *m0; if (m->m_pkthdr.csum_flags & CSUM_TSO) maxsegsz = scctx->isc_tx_tso_segsize_max; else maxsegsz = sctx->isc_tx_maxsegsize; do { if (__predict_false(m->m_len <= 0)) { tmp = m; m = m->m_next; tmp->m_next = NULL; m_free(tmp); continue; } buflen = m->m_len; vaddr = (vm_offset_t)m->m_data; /* * see if we can't be smarter about physically * contiguous mappings */ next = (pidx + count) & (ntxd-1); MPASS(ifsd_m[next] == NULL); #if MEMORY_LOGGING txq->ift_enqueued++; #endif ifsd_m[next] = m; while (buflen > 0) { if (i >= max_segs) goto err; max_sgsize = MIN(buflen, maxsegsz); curaddr = pmap_kextract(vaddr); sgsize = PAGE_SIZE - (curaddr & PAGE_MASK); sgsize = MIN(sgsize, max_sgsize); segs[i].ds_addr = curaddr; segs[i].ds_len = sgsize; vaddr += sgsize; buflen -= sgsize; i++; } count++; tmp = m; m = m->m_next; } while (m != NULL); *nsegs = i; } return (0); err: *m0 = iflib_remove_mbuf(txq); return (EFBIG); } static inline caddr_t calc_next_txd(iflib_txq_t txq, int cidx, uint8_t qid) { qidx_t size; int ntxd; caddr_t start, end, cur, next; ntxd = txq->ift_size; size = txq->ift_txd_size[qid]; start = txq->ift_ifdi[qid].idi_vaddr; if (__predict_false(size == 0)) return (start); cur = start + size*cidx; end = start + size*ntxd; next = CACHE_PTR_NEXT(cur); return (next < end ? next : start); } /* * Pad an mbuf to ensure a minimum ethernet frame size. * min_frame_size is the frame size (less CRC) to pad the mbuf to */ static __noinline int iflib_ether_pad(device_t dev, struct mbuf **m_head, uint16_t min_frame_size) { /* * 18 is enough bytes to pad an ARP packet to 46 bytes, and * and ARP message is the smallest common payload I can think of */ static char pad[18]; /* just zeros */ int n; struct mbuf *new_head; if (!M_WRITABLE(*m_head)) { new_head = m_dup(*m_head, M_NOWAIT); if (new_head == NULL) { m_freem(*m_head); device_printf(dev, "cannot pad short frame, m_dup() failed"); DBG_COUNTER_INC(encap_pad_mbuf_fail); return ENOMEM; } m_freem(*m_head); *m_head = new_head; } for (n = min_frame_size - (*m_head)->m_pkthdr.len; n > 0; n -= sizeof(pad)) if (!m_append(*m_head, min(n, sizeof(pad)), pad)) break; if (n > 0) { m_freem(*m_head); device_printf(dev, "cannot pad short frame\n"); DBG_COUNTER_INC(encap_pad_mbuf_fail); return (ENOBUFS); } return 0; } static int iflib_encap(iflib_txq_t txq, struct mbuf **m_headp) { if_ctx_t ctx; if_shared_ctx_t sctx; if_softc_ctx_t scctx; bus_dma_segment_t *segs; struct mbuf *m_head; void *next_txd; bus_dmamap_t map; struct if_pkt_info pi; int remap = 0; int err, nsegs, ndesc, max_segs, pidx, cidx, next, ntxd; bus_dma_tag_t desc_tag; segs = txq->ift_segs; ctx = txq->ift_ctx; sctx = ctx->ifc_sctx; scctx = &ctx->ifc_softc_ctx; segs = txq->ift_segs; ntxd = txq->ift_size; m_head = *m_headp; map = NULL; /* * If we're doing TSO the next descriptor to clean may be quite far ahead */ cidx = txq->ift_cidx; pidx = txq->ift_pidx; if (ctx->ifc_flags & IFC_PREFETCH) { next = (cidx + CACHE_PTR_INCREMENT) & (ntxd-1); if (!(ctx->ifc_flags & IFLIB_HAS_TXCQ)) { next_txd = calc_next_txd(txq, cidx, 0); prefetch(next_txd); } /* prefetch the next cache line of mbuf pointers and flags */ prefetch(&txq->ift_sds.ifsd_m[next]); if (txq->ift_sds.ifsd_map != NULL) { prefetch(&txq->ift_sds.ifsd_map[next]); next = (cidx + CACHE_LINE_SIZE) & (ntxd-1); prefetch(&txq->ift_sds.ifsd_flags[next]); } } else if (txq->ift_sds.ifsd_map != NULL) map = txq->ift_sds.ifsd_map[pidx]; if (m_head->m_pkthdr.csum_flags & CSUM_TSO) { desc_tag = txq->ift_tso_desc_tag; max_segs = scctx->isc_tx_tso_segments_max; } else { desc_tag = txq->ift_desc_tag; max_segs = scctx->isc_tx_nsegments; } if ((sctx->isc_flags & IFLIB_NEED_ETHER_PAD) && __predict_false(m_head->m_pkthdr.len < scctx->isc_min_frame_size)) { err = iflib_ether_pad(ctx->ifc_dev, m_headp, scctx->isc_min_frame_size); if (err) return err; } m_head = *m_headp; pkt_info_zero(&pi); pi.ipi_mflags = (m_head->m_flags & (M_VLANTAG|M_BCAST|M_MCAST)); pi.ipi_pidx = pidx; pi.ipi_qsidx = txq->ift_id; pi.ipi_len = m_head->m_pkthdr.len; pi.ipi_csum_flags = m_head->m_pkthdr.csum_flags; pi.ipi_vtag = (m_head->m_flags & M_VLANTAG) ? m_head->m_pkthdr.ether_vtag : 0; /* deliberate bitwise OR to make one condition */ if (__predict_true((pi.ipi_csum_flags | pi.ipi_vtag))) { if (__predict_false((err = iflib_parse_header(txq, &pi, m_headp)) != 0)) return (err); m_head = *m_headp; } retry: err = iflib_busdma_load_mbuf_sg(txq, desc_tag, map, m_headp, segs, &nsegs, max_segs, BUS_DMA_NOWAIT); defrag: if (__predict_false(err)) { switch (err) { case EFBIG: /* try collapse once and defrag once */ if (remap == 0) { m_head = m_collapse(*m_headp, M_NOWAIT, max_segs); /* try defrag if collapsing fails */ if (m_head == NULL) remap++; } if (remap == 1) m_head = m_defrag(*m_headp, M_NOWAIT); remap++; if (__predict_false(m_head == NULL)) goto defrag_failed; txq->ift_mbuf_defrag++; *m_headp = m_head; goto retry; break; case ENOMEM: txq->ift_no_tx_dma_setup++; break; default: txq->ift_no_tx_dma_setup++; m_freem(*m_headp); DBG_COUNTER_INC(tx_frees); *m_headp = NULL; break; } txq->ift_map_failed++; DBG_COUNTER_INC(encap_load_mbuf_fail); return (err); } /* * XXX assumes a 1 to 1 relationship between segments and * descriptors - this does not hold true on all drivers, e.g. * cxgb */ if (__predict_false(nsegs + 2 > TXQ_AVAIL(txq))) { txq->ift_no_desc_avail++; if (map != NULL) bus_dmamap_unload(desc_tag, map); DBG_COUNTER_INC(encap_txq_avail_fail); if ((txq->ift_task.gt_task.ta_flags & TASK_ENQUEUED) == 0) GROUPTASK_ENQUEUE(&txq->ift_task); return (ENOBUFS); } /* * On Intel cards we can greatly reduce the number of TX interrupts * we see by only setting report status on every Nth descriptor. * However, this also means that the driver will need to keep track * of the descriptors that RS was set on to check them for the DD bit. */ txq->ift_rs_pending += nsegs + 1; if (txq->ift_rs_pending > TXQ_MAX_RS_DEFERRED(txq) || iflib_no_tx_batch || (TXQ_AVAIL(txq) - nsegs) <= MAX_TX_DESC(ctx) + 2) { pi.ipi_flags |= IPI_TX_INTR; txq->ift_rs_pending = 0; } pi.ipi_segs = segs; pi.ipi_nsegs = nsegs; MPASS(pidx >= 0 && pidx < txq->ift_size); #ifdef PKT_DEBUG print_pkt(&pi); #endif if (map != NULL) bus_dmamap_sync(desc_tag, map, BUS_DMASYNC_PREWRITE); if ((err = ctx->isc_txd_encap(ctx->ifc_softc, &pi)) == 0) { if (map != NULL) bus_dmamap_sync(txq->ift_ifdi->idi_tag, txq->ift_ifdi->idi_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); DBG_COUNTER_INC(tx_encap); MPASS(pi.ipi_new_pidx < txq->ift_size); ndesc = pi.ipi_new_pidx - pi.ipi_pidx; if (pi.ipi_new_pidx < pi.ipi_pidx) { ndesc += txq->ift_size; txq->ift_gen = 1; } /* * drivers can need as many as * two sentinels */ MPASS(ndesc <= pi.ipi_nsegs + 2); MPASS(pi.ipi_new_pidx != pidx); MPASS(ndesc > 0); txq->ift_in_use += ndesc; /* * We update the last software descriptor again here because there may * be a sentinel and/or there may be more mbufs than segments */ txq->ift_pidx = pi.ipi_new_pidx; txq->ift_npending += pi.ipi_ndescs; } else { *m_headp = m_head = iflib_remove_mbuf(txq); if (err == EFBIG) { txq->ift_txd_encap_efbig++; if (remap < 2) { remap = 1; goto defrag; } } DBG_COUNTER_INC(encap_txd_encap_fail); goto defrag_failed; } return (err); defrag_failed: txq->ift_mbuf_defrag_failed++; txq->ift_map_failed++; m_freem(*m_headp); DBG_COUNTER_INC(tx_frees); *m_headp = NULL; return (ENOMEM); } static void iflib_tx_desc_free(iflib_txq_t txq, int n) { int hasmap; uint32_t qsize, cidx, mask, gen; struct mbuf *m, **ifsd_m; uint8_t *ifsd_flags; bus_dmamap_t *ifsd_map; bool do_prefetch; cidx = txq->ift_cidx; gen = txq->ift_gen; qsize = txq->ift_size; mask = qsize-1; hasmap = txq->ift_sds.ifsd_map != NULL; ifsd_flags = txq->ift_sds.ifsd_flags; ifsd_m = txq->ift_sds.ifsd_m; ifsd_map = txq->ift_sds.ifsd_map; do_prefetch = (txq->ift_ctx->ifc_flags & IFC_PREFETCH); while (n-- > 0) { if (do_prefetch) { prefetch(ifsd_m[(cidx + 3) & mask]); prefetch(ifsd_m[(cidx + 4) & mask]); } if (ifsd_m[cidx] != NULL) { prefetch(&ifsd_m[(cidx + CACHE_PTR_INCREMENT) & mask]); prefetch(&ifsd_flags[(cidx + CACHE_PTR_INCREMENT) & mask]); if (hasmap && (ifsd_flags[cidx] & TX_SW_DESC_MAPPED)) { /* * does it matter if it's not the TSO tag? If so we'll * have to add the type to flags */ bus_dmamap_unload(txq->ift_desc_tag, ifsd_map[cidx]); ifsd_flags[cidx] &= ~TX_SW_DESC_MAPPED; } if ((m = ifsd_m[cidx]) != NULL) { /* XXX we don't support any drivers that batch packets yet */ MPASS(m->m_nextpkt == NULL); /* if the number of clusters exceeds the number of segments * there won't be space on the ring to save a pointer to each * cluster so we simply free the list here */ if (m->m_flags & M_TOOBIG) { m_freem(m); } else { m_free(m); } ifsd_m[cidx] = NULL; #if MEMORY_LOGGING txq->ift_dequeued++; #endif DBG_COUNTER_INC(tx_frees); } } if (__predict_false(++cidx == qsize)) { cidx = 0; gen = 0; } } txq->ift_cidx = cidx; txq->ift_gen = gen; } static __inline int iflib_completed_tx_reclaim(iflib_txq_t txq, int thresh) { int reclaim; if_ctx_t ctx = txq->ift_ctx; KASSERT(thresh >= 0, ("invalid threshold to reclaim")); MPASS(thresh /*+ MAX_TX_DESC(txq->ift_ctx) */ < txq->ift_size); /* * Need a rate-limiting check so that this isn't called every time */ iflib_tx_credits_update(ctx, txq); reclaim = DESC_RECLAIMABLE(txq); if (reclaim <= thresh /* + MAX_TX_DESC(txq->ift_ctx) */) { #ifdef INVARIANTS if (iflib_verbose_debug) { printf("%s processed=%ju cleaned=%ju tx_nsegments=%d reclaim=%d thresh=%d\n", __FUNCTION__, txq->ift_processed, txq->ift_cleaned, txq->ift_ctx->ifc_softc_ctx.isc_tx_nsegments, reclaim, thresh); } #endif return (0); } iflib_tx_desc_free(txq, reclaim); txq->ift_cleaned += reclaim; txq->ift_in_use -= reclaim; return (reclaim); } static struct mbuf ** _ring_peek_one(struct ifmp_ring *r, int cidx, int offset, int remaining) { int next, size; struct mbuf **items; size = r->size; next = (cidx + CACHE_PTR_INCREMENT) & (size-1); items = __DEVOLATILE(struct mbuf **, &r->items[0]); prefetch(items[(cidx + offset) & (size-1)]); if (remaining > 1) { prefetch2cachelines(&items[next]); prefetch2cachelines(items[(cidx + offset + 1) & (size-1)]); prefetch2cachelines(items[(cidx + offset + 2) & (size-1)]); prefetch2cachelines(items[(cidx + offset + 3) & (size-1)]); } return (__DEVOLATILE(struct mbuf **, &r->items[(cidx + offset) & (size-1)])); } static void iflib_txq_check_drain(iflib_txq_t txq, int budget) { ifmp_ring_check_drainage(txq->ift_br, budget); } static uint32_t iflib_txq_can_drain(struct ifmp_ring *r) { iflib_txq_t txq = r->cookie; if_ctx_t ctx = txq->ift_ctx; return ((TXQ_AVAIL(txq) > MAX_TX_DESC(ctx) + 2) || ctx->isc_txd_credits_update(ctx->ifc_softc, txq->ift_id, false)); } static uint32_t iflib_txq_drain(struct ifmp_ring *r, uint32_t cidx, uint32_t pidx) { iflib_txq_t txq = r->cookie; if_ctx_t ctx = txq->ift_ctx; struct ifnet *ifp = ctx->ifc_ifp; struct mbuf **mp, *m; int i, count, consumed, pkt_sent, bytes_sent, mcast_sent, avail; int reclaimed, err, in_use_prev, desc_used; bool do_prefetch, ring, rang; if (__predict_false(!(if_getdrvflags(ifp) & IFF_DRV_RUNNING) || !LINK_ACTIVE(ctx))) { DBG_COUNTER_INC(txq_drain_notready); return (0); } reclaimed = iflib_completed_tx_reclaim(txq, RECLAIM_THRESH(ctx)); rang = iflib_txd_db_check(ctx, txq, reclaimed, txq->ift_in_use); avail = IDXDIFF(pidx, cidx, r->size); if (__predict_false(ctx->ifc_flags & IFC_QFLUSH)) { DBG_COUNTER_INC(txq_drain_flushing); for (i = 0; i < avail; i++) { m_free(r->items[(cidx + i) & (r->size-1)]); r->items[(cidx + i) & (r->size-1)] = NULL; } return (avail); } if (__predict_false(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_OACTIVE)) { txq->ift_qstatus = IFLIB_QUEUE_IDLE; CALLOUT_LOCK(txq); callout_stop(&txq->ift_timer); CALLOUT_UNLOCK(txq); DBG_COUNTER_INC(txq_drain_oactive); return (0); } if (reclaimed) txq->ift_qstatus = IFLIB_QUEUE_IDLE; consumed = mcast_sent = bytes_sent = pkt_sent = 0; count = MIN(avail, TX_BATCH_SIZE); #ifdef INVARIANTS if (iflib_verbose_debug) printf("%s avail=%d ifc_flags=%x txq_avail=%d ", __FUNCTION__, avail, ctx->ifc_flags, TXQ_AVAIL(txq)); #endif do_prefetch = (ctx->ifc_flags & IFC_PREFETCH); avail = TXQ_AVAIL(txq); err = 0; for (desc_used = i = 0; i < count && avail > MAX_TX_DESC(ctx) + 2; i++) { int rem = do_prefetch ? count - i : 0; mp = _ring_peek_one(r, cidx, i, rem); MPASS(mp != NULL && *mp != NULL); if (__predict_false(*mp == (struct mbuf *)txq)) { consumed++; reclaimed++; continue; } in_use_prev = txq->ift_in_use; err = iflib_encap(txq, mp); if (__predict_false(err)) { DBG_COUNTER_INC(txq_drain_encapfail); /* no room - bail out */ if (err == ENOBUFS) break; consumed++; DBG_COUNTER_INC(txq_drain_encapfail); /* we can't send this packet - skip it */ continue; } consumed++; pkt_sent++; m = *mp; DBG_COUNTER_INC(tx_sent); bytes_sent += m->m_pkthdr.len; mcast_sent += !!(m->m_flags & M_MCAST); avail = TXQ_AVAIL(txq); txq->ift_db_pending += (txq->ift_in_use - in_use_prev); desc_used += (txq->ift_in_use - in_use_prev); ETHER_BPF_MTAP(ifp, m); if (__predict_false(!(ifp->if_drv_flags & IFF_DRV_RUNNING))) break; rang = iflib_txd_db_check(ctx, txq, false, in_use_prev); } /* deliberate use of bitwise or to avoid gratuitous short-circuit */ ring = rang ? false : (iflib_min_tx_latency | err) || (TXQ_AVAIL(txq) < MAX_TX_DESC(ctx)); iflib_txd_db_check(ctx, txq, ring, txq->ift_in_use); if_inc_counter(ifp, IFCOUNTER_OBYTES, bytes_sent); if_inc_counter(ifp, IFCOUNTER_OPACKETS, pkt_sent); if (mcast_sent) if_inc_counter(ifp, IFCOUNTER_OMCASTS, mcast_sent); #ifdef INVARIANTS if (iflib_verbose_debug) printf("consumed=%d\n", consumed); #endif return (consumed); } static uint32_t iflib_txq_drain_always(struct ifmp_ring *r) { return (1); } static uint32_t iflib_txq_drain_free(struct ifmp_ring *r, uint32_t cidx, uint32_t pidx) { int i, avail; struct mbuf **mp; iflib_txq_t txq; txq = r->cookie; txq->ift_qstatus = IFLIB_QUEUE_IDLE; CALLOUT_LOCK(txq); callout_stop(&txq->ift_timer); CALLOUT_UNLOCK(txq); avail = IDXDIFF(pidx, cidx, r->size); for (i = 0; i < avail; i++) { mp = _ring_peek_one(r, cidx, i, avail - i); if (__predict_false(*mp == (struct mbuf *)txq)) continue; m_freem(*mp); } MPASS(ifmp_ring_is_stalled(r) == 0); return (avail); } static void iflib_ifmp_purge(iflib_txq_t txq) { struct ifmp_ring *r; r = txq->ift_br; r->drain = iflib_txq_drain_free; r->can_drain = iflib_txq_drain_always; ifmp_ring_check_drainage(r, r->size); r->drain = iflib_txq_drain; r->can_drain = iflib_txq_can_drain; } static void _task_fn_tx(void *context) { iflib_txq_t txq = context; if_ctx_t ctx = txq->ift_ctx; struct ifnet *ifp = ctx->ifc_ifp; #ifdef IFLIB_DIAGNOSTICS txq->ift_cpu_exec_count[curcpu]++; #endif if (!(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING)) return; if (if_getcapenable(ifp) & IFCAP_NETMAP) { /* * If there are no available credits, and TX IRQs are not in use, * re-schedule the task immediately. */ if (ctx->isc_txd_credits_update(ctx->ifc_softc, txq->ift_id, false)) netmap_tx_irq(ifp, txq->ift_id); else { #ifdef DEV_NETMAP if (!(ctx->ifc_flags & IFC_NETMAP_TX_IRQ)) { struct netmap_kring *kring = NA(ctx->ifc_ifp)->tx_rings[txq->ift_id]; if (kring->nr_hwtail != nm_prev(kring->rhead, kring->nkr_num_slots - 1)) GROUPTASK_ENQUEUE(&txq->ift_task); } #endif } IFDI_TX_QUEUE_INTR_ENABLE(ctx, txq->ift_id); return; } if (txq->ift_db_pending) ifmp_ring_enqueue(txq->ift_br, (void **)&txq, 1, TX_BATCH_SIZE); ifmp_ring_check_drainage(txq->ift_br, TX_BATCH_SIZE); if (ctx->ifc_flags & IFC_LEGACY) IFDI_INTR_ENABLE(ctx); else { #ifdef INVARIANTS int rc = #endif IFDI_TX_QUEUE_INTR_ENABLE(ctx, txq->ift_id); KASSERT(rc != ENOTSUP, ("MSI-X support requires queue_intr_enable, but not implemented in driver")); } } static void _task_fn_rx(void *context) { iflib_rxq_t rxq = context; if_ctx_t ctx = rxq->ifr_ctx; bool more; uint16_t budget; #ifdef IFLIB_DIAGNOSTICS rxq->ifr_cpu_exec_count[curcpu]++; #endif DBG_COUNTER_INC(task_fn_rxs); if (__predict_false(!(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING))) return; more = true; #ifdef DEV_NETMAP if (if_getcapenable(ctx->ifc_ifp) & IFCAP_NETMAP) { u_int work = 0; if (netmap_rx_irq(ctx->ifc_ifp, rxq->ifr_id, &work)) { more = false; } } #endif budget = ctx->ifc_sysctl_rx_budget; if (budget == 0) budget = 16; /* XXX */ if (more == false || (more = iflib_rxeof(rxq, budget)) == false) { if (ctx->ifc_flags & IFC_LEGACY) IFDI_INTR_ENABLE(ctx); else { #ifdef INVARIANTS int rc = #endif IFDI_RX_QUEUE_INTR_ENABLE(ctx, rxq->ifr_id); KASSERT(rc != ENOTSUP, ("MSI-X support requires queue_intr_enable, but not implemented in driver")); DBG_COUNTER_INC(rx_intr_enables); } } if (__predict_false(!(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING))) return; if (more) GROUPTASK_ENQUEUE(&rxq->ifr_task); } static void _task_fn_admin(void *context) { if_ctx_t ctx = context; if_softc_ctx_t sctx = &ctx->ifc_softc_ctx; iflib_txq_t txq; int i; bool oactive, running, do_reset, do_watchdog; STATE_LOCK(ctx); running = (if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING); oactive = (if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_OACTIVE); do_reset = (ctx->ifc_flags & IFC_DO_RESET); do_watchdog = (ctx->ifc_flags & IFC_DO_WATCHDOG); ctx->ifc_flags &= ~(IFC_DO_RESET|IFC_DO_WATCHDOG); STATE_UNLOCK(ctx); if (!running & !oactive) return; CTX_LOCK(ctx); for (txq = ctx->ifc_txqs, i = 0; i < sctx->isc_ntxqsets; i++, txq++) { CALLOUT_LOCK(txq); callout_stop(&txq->ift_timer); CALLOUT_UNLOCK(txq); } if (do_watchdog) { ctx->ifc_watchdog_events++; IFDI_WATCHDOG_RESET(ctx); } IFDI_UPDATE_ADMIN_STATUS(ctx); for (txq = ctx->ifc_txqs, i = 0; i < sctx->isc_ntxqsets; i++, txq++) callout_reset_on(&txq->ift_timer, hz/2, iflib_timer, txq, txq->ift_timer.c_cpu); IFDI_LINK_INTR_ENABLE(ctx); if (do_reset) iflib_if_init_locked(ctx); CTX_UNLOCK(ctx); if (LINK_ACTIVE(ctx) == 0) return; for (txq = ctx->ifc_txqs, i = 0; i < sctx->isc_ntxqsets; i++, txq++) iflib_txq_check_drain(txq, IFLIB_RESTART_BUDGET); } static void _task_fn_iov(void *context) { if_ctx_t ctx = context; if (!(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING)) return; CTX_LOCK(ctx); IFDI_VFLR_HANDLE(ctx); CTX_UNLOCK(ctx); } static int iflib_sysctl_int_delay(SYSCTL_HANDLER_ARGS) { int err; if_int_delay_info_t info; if_ctx_t ctx; info = (if_int_delay_info_t)arg1; ctx = info->iidi_ctx; info->iidi_req = req; info->iidi_oidp = oidp; CTX_LOCK(ctx); err = IFDI_SYSCTL_INT_DELAY(ctx, info); CTX_UNLOCK(ctx); return (err); } /********************************************************************* * * IFNET FUNCTIONS * **********************************************************************/ static void iflib_if_init_locked(if_ctx_t ctx) { iflib_stop(ctx); iflib_init_locked(ctx); } static void iflib_if_init(void *arg) { if_ctx_t ctx = arg; CTX_LOCK(ctx); iflib_if_init_locked(ctx); CTX_UNLOCK(ctx); } static int iflib_if_transmit(if_t ifp, struct mbuf *m) { if_ctx_t ctx = if_getsoftc(ifp); iflib_txq_t txq; int err, qidx; if (__predict_false((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || !LINK_ACTIVE(ctx))) { DBG_COUNTER_INC(tx_frees); m_freem(m); return (ENOBUFS); } MPASS(m->m_nextpkt == NULL); qidx = 0; if ((NTXQSETS(ctx) > 1) && M_HASHTYPE_GET(m)) qidx = QIDX(ctx, m); /* * XXX calculate buf_ring based on flowid (divvy up bits?) */ txq = &ctx->ifc_txqs[qidx]; #ifdef DRIVER_BACKPRESSURE if (txq->ift_closed) { while (m != NULL) { next = m->m_nextpkt; m->m_nextpkt = NULL; m_freem(m); m = next; } return (ENOBUFS); } #endif #ifdef notyet qidx = count = 0; mp = marr; next = m; do { count++; next = next->m_nextpkt; } while (next != NULL); if (count > nitems(marr)) if ((mp = malloc(count*sizeof(struct mbuf *), M_IFLIB, M_NOWAIT)) == NULL) { /* XXX check nextpkt */ m_freem(m); /* XXX simplify for now */ DBG_COUNTER_INC(tx_frees); return (ENOBUFS); } for (next = m, i = 0; next != NULL; i++) { mp[i] = next; next = next->m_nextpkt; mp[i]->m_nextpkt = NULL; } #endif DBG_COUNTER_INC(tx_seen); err = ifmp_ring_enqueue(txq->ift_br, (void **)&m, 1, TX_BATCH_SIZE); GROUPTASK_ENQUEUE(&txq->ift_task); if (err) { /* support forthcoming later */ #ifdef DRIVER_BACKPRESSURE txq->ift_closed = TRUE; #endif ifmp_ring_check_drainage(txq->ift_br, TX_BATCH_SIZE); m_freem(m); } return (err); } static void iflib_if_qflush(if_t ifp) { if_ctx_t ctx = if_getsoftc(ifp); iflib_txq_t txq = ctx->ifc_txqs; int i; STATE_LOCK(ctx); ctx->ifc_flags |= IFC_QFLUSH; STATE_UNLOCK(ctx); for (i = 0; i < NTXQSETS(ctx); i++, txq++) while (!(ifmp_ring_is_idle(txq->ift_br) || ifmp_ring_is_stalled(txq->ift_br))) iflib_txq_check_drain(txq, 0); STATE_LOCK(ctx); ctx->ifc_flags &= ~IFC_QFLUSH; STATE_UNLOCK(ctx); if_qflush(ifp); } #define IFCAP_FLAGS (IFCAP_TXCSUM_IPV6 | IFCAP_RXCSUM_IPV6 | IFCAP_HWCSUM | IFCAP_LRO | \ IFCAP_TSO4 | IFCAP_TSO6 | IFCAP_VLAN_HWTAGGING | IFCAP_HWSTATS | \ IFCAP_VLAN_MTU | IFCAP_VLAN_HWFILTER | IFCAP_VLAN_HWTSO) static int iflib_if_ioctl(if_t ifp, u_long command, caddr_t data) { if_ctx_t ctx = if_getsoftc(ifp); struct ifreq *ifr = (struct ifreq *)data; #if defined(INET) || defined(INET6) struct ifaddr *ifa = (struct ifaddr *)data; #endif bool avoid_reset = FALSE; int err = 0, reinit = 0, bits; switch (command) { case SIOCSIFADDR: #ifdef INET if (ifa->ifa_addr->sa_family == AF_INET) avoid_reset = TRUE; #endif #ifdef INET6 if (ifa->ifa_addr->sa_family == AF_INET6) avoid_reset = TRUE; #endif /* ** Calling init results in link renegotiation, ** so we avoid doing it when possible. */ if (avoid_reset) { if_setflagbits(ifp, IFF_UP,0); if (!(if_getdrvflags(ifp)& IFF_DRV_RUNNING)) reinit = 1; #ifdef INET if (!(if_getflags(ifp) & IFF_NOARP)) arp_ifinit(ifp, ifa); #endif } else err = ether_ioctl(ifp, command, data); break; case SIOCSIFMTU: CTX_LOCK(ctx); if (ifr->ifr_mtu == if_getmtu(ifp)) { CTX_UNLOCK(ctx); break; } bits = if_getdrvflags(ifp); /* stop the driver and free any clusters before proceeding */ iflib_stop(ctx); if ((err = IFDI_MTU_SET(ctx, ifr->ifr_mtu)) == 0) { STATE_LOCK(ctx); if (ifr->ifr_mtu > ctx->ifc_max_fl_buf_size) ctx->ifc_flags |= IFC_MULTISEG; else ctx->ifc_flags &= ~IFC_MULTISEG; STATE_UNLOCK(ctx); err = if_setmtu(ifp, ifr->ifr_mtu); } iflib_init_locked(ctx); STATE_LOCK(ctx); if_setdrvflags(ifp, bits); STATE_UNLOCK(ctx); CTX_UNLOCK(ctx); break; case SIOCSIFFLAGS: CTX_LOCK(ctx); if (if_getflags(ifp) & IFF_UP) { if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) { if ((if_getflags(ifp) ^ ctx->ifc_if_flags) & (IFF_PROMISC | IFF_ALLMULTI)) { err = IFDI_PROMISC_SET(ctx, if_getflags(ifp)); } } else reinit = 1; } else if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) { iflib_stop(ctx); } ctx->ifc_if_flags = if_getflags(ifp); CTX_UNLOCK(ctx); break; case SIOCADDMULTI: case SIOCDELMULTI: if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) { CTX_LOCK(ctx); IFDI_INTR_DISABLE(ctx); IFDI_MULTI_SET(ctx); IFDI_INTR_ENABLE(ctx); CTX_UNLOCK(ctx); } break; case SIOCSIFMEDIA: CTX_LOCK(ctx); IFDI_MEDIA_SET(ctx); CTX_UNLOCK(ctx); /* falls thru */ case SIOCGIFMEDIA: case SIOCGIFXMEDIA: err = ifmedia_ioctl(ifp, ifr, &ctx->ifc_media, command); break; case SIOCGI2C: { struct ifi2creq i2c; err = copyin(ifr_data_get_ptr(ifr), &i2c, sizeof(i2c)); if (err != 0) break; if (i2c.dev_addr != 0xA0 && i2c.dev_addr != 0xA2) { err = EINVAL; break; } if (i2c.len > sizeof(i2c.data)) { err = EINVAL; break; } if ((err = IFDI_I2C_REQ(ctx, &i2c)) == 0) err = copyout(&i2c, ifr_data_get_ptr(ifr), sizeof(i2c)); break; } case SIOCSIFCAP: { int mask, setmask; mask = ifr->ifr_reqcap ^ if_getcapenable(ifp); setmask = 0; #ifdef TCP_OFFLOAD setmask |= mask & (IFCAP_TOE4|IFCAP_TOE6); #endif setmask |= (mask & IFCAP_FLAGS); if (setmask & (IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6)) setmask |= (IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6); if ((mask & IFCAP_WOL) && (if_getcapabilities(ifp) & IFCAP_WOL) != 0) setmask |= (mask & (IFCAP_WOL_MCAST|IFCAP_WOL_MAGIC)); if_vlancap(ifp); /* * want to ensure that traffic has stopped before we change any of the flags */ if (setmask) { CTX_LOCK(ctx); bits = if_getdrvflags(ifp); if (bits & IFF_DRV_RUNNING) iflib_stop(ctx); STATE_LOCK(ctx); if_togglecapenable(ifp, setmask); STATE_UNLOCK(ctx); if (bits & IFF_DRV_RUNNING) iflib_init_locked(ctx); STATE_LOCK(ctx); if_setdrvflags(ifp, bits); STATE_UNLOCK(ctx); CTX_UNLOCK(ctx); } break; } case SIOCGPRIVATE_0: case SIOCSDRVSPEC: case SIOCGDRVSPEC: CTX_LOCK(ctx); err = IFDI_PRIV_IOCTL(ctx, command, data); CTX_UNLOCK(ctx); break; default: err = ether_ioctl(ifp, command, data); break; } if (reinit) iflib_if_init(ctx); return (err); } static uint64_t iflib_if_get_counter(if_t ifp, ift_counter cnt) { if_ctx_t ctx = if_getsoftc(ifp); return (IFDI_GET_COUNTER(ctx, cnt)); } /********************************************************************* * * OTHER FUNCTIONS EXPORTED TO THE STACK * **********************************************************************/ static void iflib_vlan_register(void *arg, if_t ifp, uint16_t vtag) { if_ctx_t ctx = if_getsoftc(ifp); if ((void *)ctx != arg) return; if ((vtag == 0) || (vtag > 4095)) return; CTX_LOCK(ctx); IFDI_VLAN_REGISTER(ctx, vtag); /* Re-init to load the changes */ if (if_getcapenable(ifp) & IFCAP_VLAN_HWFILTER) iflib_if_init_locked(ctx); CTX_UNLOCK(ctx); } static void iflib_vlan_unregister(void *arg, if_t ifp, uint16_t vtag) { if_ctx_t ctx = if_getsoftc(ifp); if ((void *)ctx != arg) return; if ((vtag == 0) || (vtag > 4095)) return; CTX_LOCK(ctx); IFDI_VLAN_UNREGISTER(ctx, vtag); /* Re-init to load the changes */ if (if_getcapenable(ifp) & IFCAP_VLAN_HWFILTER) iflib_if_init_locked(ctx); CTX_UNLOCK(ctx); } static void iflib_led_func(void *arg, int onoff) { if_ctx_t ctx = arg; CTX_LOCK(ctx); IFDI_LED_FUNC(ctx, onoff); CTX_UNLOCK(ctx); } /********************************************************************* * * BUS FUNCTION DEFINITIONS * **********************************************************************/ int iflib_device_probe(device_t dev) { pci_vendor_info_t *ent; uint16_t pci_vendor_id, pci_device_id; uint16_t pci_subvendor_id, pci_subdevice_id; uint16_t pci_rev_id; if_shared_ctx_t sctx; if ((sctx = DEVICE_REGISTER(dev)) == NULL || sctx->isc_magic != IFLIB_MAGIC) return (ENOTSUP); pci_vendor_id = pci_get_vendor(dev); pci_device_id = pci_get_device(dev); pci_subvendor_id = pci_get_subvendor(dev); pci_subdevice_id = pci_get_subdevice(dev); pci_rev_id = pci_get_revid(dev); if (sctx->isc_parse_devinfo != NULL) sctx->isc_parse_devinfo(&pci_device_id, &pci_subvendor_id, &pci_subdevice_id, &pci_rev_id); ent = sctx->isc_vendor_info; while (ent->pvi_vendor_id != 0) { if (pci_vendor_id != ent->pvi_vendor_id) { ent++; continue; } if ((pci_device_id == ent->pvi_device_id) && ((pci_subvendor_id == ent->pvi_subvendor_id) || (ent->pvi_subvendor_id == 0)) && ((pci_subdevice_id == ent->pvi_subdevice_id) || (ent->pvi_subdevice_id == 0)) && ((pci_rev_id == ent->pvi_rev_id) || (ent->pvi_rev_id == 0))) { device_set_desc_copy(dev, ent->pvi_name); /* this needs to be changed to zero if the bus probing code * ever stops re-probing on best match because the sctx * may have its values over written by register calls * in subsequent probes */ return (BUS_PROBE_DEFAULT); } ent++; } return (ENXIO); } static void iflib_reset_qvalues(if_ctx_t ctx) { if_softc_ctx_t scctx = &ctx->ifc_softc_ctx; if_shared_ctx_t sctx = ctx->ifc_sctx; device_t dev = ctx->ifc_dev; - int i, main_txq, main_rxq; - - main_txq = (sctx->isc_flags & IFLIB_HAS_TXCQ) ? 1 : 0; - main_rxq = (sctx->isc_flags & IFLIB_HAS_RXCQ) ? 1 : 0; + int i; scctx->isc_txrx_budget_bytes_max = IFLIB_MAX_TX_BYTES; scctx->isc_tx_qdepth = IFLIB_DEFAULT_TX_QDEPTH; /* * XXX sanity check that ntxd & nrxd are a power of 2 */ if (ctx->ifc_sysctl_ntxqs != 0) scctx->isc_ntxqsets = ctx->ifc_sysctl_ntxqs; if (ctx->ifc_sysctl_nrxqs != 0) scctx->isc_nrxqsets = ctx->ifc_sysctl_nrxqs; for (i = 0; i < sctx->isc_ntxqs; i++) { if (ctx->ifc_sysctl_ntxds[i] != 0) scctx->isc_ntxd[i] = ctx->ifc_sysctl_ntxds[i]; else scctx->isc_ntxd[i] = sctx->isc_ntxd_default[i]; } for (i = 0; i < sctx->isc_nrxqs; i++) { if (ctx->ifc_sysctl_nrxds[i] != 0) scctx->isc_nrxd[i] = ctx->ifc_sysctl_nrxds[i]; else scctx->isc_nrxd[i] = sctx->isc_nrxd_default[i]; } for (i = 0; i < sctx->isc_nrxqs; i++) { if (scctx->isc_nrxd[i] < sctx->isc_nrxd_min[i]) { device_printf(dev, "nrxd%d: %d less than nrxd_min %d - resetting to min\n", i, scctx->isc_nrxd[i], sctx->isc_nrxd_min[i]); scctx->isc_nrxd[i] = sctx->isc_nrxd_min[i]; } if (scctx->isc_nrxd[i] > sctx->isc_nrxd_max[i]) { device_printf(dev, "nrxd%d: %d greater than nrxd_max %d - resetting to max\n", i, scctx->isc_nrxd[i], sctx->isc_nrxd_max[i]); scctx->isc_nrxd[i] = sctx->isc_nrxd_max[i]; } } for (i = 0; i < sctx->isc_ntxqs; i++) { if (scctx->isc_ntxd[i] < sctx->isc_ntxd_min[i]) { device_printf(dev, "ntxd%d: %d less than ntxd_min %d - resetting to min\n", i, scctx->isc_ntxd[i], sctx->isc_ntxd_min[i]); scctx->isc_ntxd[i] = sctx->isc_ntxd_min[i]; } if (scctx->isc_ntxd[i] > sctx->isc_ntxd_max[i]) { device_printf(dev, "ntxd%d: %d greater than ntxd_max %d - resetting to max\n", i, scctx->isc_ntxd[i], sctx->isc_ntxd_max[i]); scctx->isc_ntxd[i] = sctx->isc_ntxd_max[i]; } } } int iflib_device_register(device_t dev, void *sc, if_shared_ctx_t sctx, if_ctx_t *ctxp) { int err, rid, msix; if_ctx_t ctx; if_t ifp; if_softc_ctx_t scctx; int i; uint16_t main_txq; uint16_t main_rxq; ctx = malloc(sizeof(* ctx), M_IFLIB, M_WAITOK|M_ZERO); if (sc == NULL) { sc = malloc(sctx->isc_driver->size, M_IFLIB, M_WAITOK|M_ZERO); device_set_softc(dev, ctx); ctx->ifc_flags |= IFC_SC_ALLOCATED; } ctx->ifc_sctx = sctx; ctx->ifc_dev = dev; ctx->ifc_softc = sc; if ((err = iflib_register(ctx)) != 0) { if (ctx->ifc_flags & IFC_SC_ALLOCATED) free(sc, M_IFLIB); free(ctx, M_IFLIB); device_printf(dev, "iflib_register failed %d\n", err); return (err); } iflib_add_device_sysctl_pre(ctx); scctx = &ctx->ifc_softc_ctx; ifp = ctx->ifc_ifp; iflib_reset_qvalues(ctx); CTX_LOCK(ctx); if ((err = IFDI_ATTACH_PRE(ctx)) != 0) { CTX_UNLOCK(ctx); device_printf(dev, "IFDI_ATTACH_PRE failed %d\n", err); return (err); } _iflib_pre_assert(scctx); ctx->ifc_txrx = *scctx->isc_txrx; #ifdef INVARIANTS MPASS(scctx->isc_capenable); if (scctx->isc_capenable & IFCAP_TXCSUM) MPASS(scctx->isc_tx_csum_flags); #endif if_setcapabilities(ifp, scctx->isc_capenable | IFCAP_HWSTATS); if_setcapenable(ifp, scctx->isc_capenable | IFCAP_HWSTATS); if (scctx->isc_ntxqsets == 0 || (scctx->isc_ntxqsets_max && scctx->isc_ntxqsets_max < scctx->isc_ntxqsets)) scctx->isc_ntxqsets = scctx->isc_ntxqsets_max; if (scctx->isc_nrxqsets == 0 || (scctx->isc_nrxqsets_max && scctx->isc_nrxqsets_max < scctx->isc_nrxqsets)) scctx->isc_nrxqsets = scctx->isc_nrxqsets_max; #ifdef ACPI_DMAR if (dmar_get_dma_tag(device_get_parent(dev), dev) != NULL) ctx->ifc_flags |= IFC_DMAR; #elif !(defined(__i386__) || defined(__amd64__)) /* set unconditionally for !x86 */ ctx->ifc_flags |= IFC_DMAR; #endif main_txq = (sctx->isc_flags & IFLIB_HAS_TXCQ) ? 1 : 0; main_rxq = (sctx->isc_flags & IFLIB_HAS_RXCQ) ? 1 : 0; /* XXX change for per-queue sizes */ device_printf(dev, "using %d tx descriptors and %d rx descriptors\n", scctx->isc_ntxd[main_txq], scctx->isc_nrxd[main_rxq]); for (i = 0; i < sctx->isc_nrxqs; i++) { if (!powerof2(scctx->isc_nrxd[i])) { /* round down instead? */ device_printf(dev, "# rx descriptors must be a power of 2\n"); err = EINVAL; goto fail; } } for (i = 0; i < sctx->isc_ntxqs; i++) { if (!powerof2(scctx->isc_ntxd[i])) { device_printf(dev, "# tx descriptors must be a power of 2"); err = EINVAL; goto fail; } } if (scctx->isc_tx_nsegments > scctx->isc_ntxd[main_txq] / MAX_SINGLE_PACKET_FRACTION) scctx->isc_tx_nsegments = max(1, scctx->isc_ntxd[main_txq] / MAX_SINGLE_PACKET_FRACTION); if (scctx->isc_tx_tso_segments_max > scctx->isc_ntxd[main_txq] / MAX_SINGLE_PACKET_FRACTION) scctx->isc_tx_tso_segments_max = max(1, scctx->isc_ntxd[main_txq] / MAX_SINGLE_PACKET_FRACTION); /* * Protect the stack against modern hardware */ if (scctx->isc_tx_tso_size_max > FREEBSD_TSO_SIZE_MAX) scctx->isc_tx_tso_size_max = FREEBSD_TSO_SIZE_MAX; /* TSO parameters - dig these out of the data sheet - simply correspond to tag setup */ ifp->if_hw_tsomaxsegcount = scctx->isc_tx_tso_segments_max; ifp->if_hw_tsomax = scctx->isc_tx_tso_size_max; ifp->if_hw_tsomaxsegsize = scctx->isc_tx_tso_segsize_max; if (scctx->isc_rss_table_size == 0) scctx->isc_rss_table_size = 64; scctx->isc_rss_table_mask = scctx->isc_rss_table_size-1; GROUPTASK_INIT(&ctx->ifc_admin_task, 0, _task_fn_admin, ctx); /* XXX format name */ taskqgroup_attach(qgroup_if_config_tqg, &ctx->ifc_admin_task, ctx, -1, "admin"); /* Set up cpu set. If it fails, use the set of all CPUs. */ if (bus_get_cpus(dev, INTR_CPUS, sizeof(ctx->ifc_cpus), &ctx->ifc_cpus) != 0) { device_printf(dev, "Unable to fetch CPU list\n"); CPU_COPY(&all_cpus, &ctx->ifc_cpus); } MPASS(CPU_COUNT(&ctx->ifc_cpus) > 0); /* ** Now setup MSI or MSI/X, should ** return us the number of supported ** vectors. (Will be 1 for MSI) */ if (sctx->isc_flags & IFLIB_SKIP_MSIX) { msix = scctx->isc_vectors; } else if (scctx->isc_msix_bar != 0) /* * The simple fact that isc_msix_bar is not 0 does not mean we * we have a good value there that is known to work. */ msix = iflib_msix_init(ctx); else { scctx->isc_vectors = 1; scctx->isc_ntxqsets = 1; scctx->isc_nrxqsets = 1; scctx->isc_intr = IFLIB_INTR_LEGACY; msix = 0; } /* Get memory for the station queues */ if ((err = iflib_queues_alloc(ctx))) { device_printf(dev, "Unable to allocate queue memory\n"); goto fail; } if ((err = iflib_qset_structures_setup(ctx))) goto fail_queues; /* * Group taskqueues aren't properly set up until SMP is started, * so we disable interrupts until we can handle them post * SI_SUB_SMP. * * XXX: disabling interrupts doesn't actually work, at least for * the non-MSI case. When they occur before SI_SUB_SMP completes, * we do null handling and depend on this not causing too large an * interrupt storm. */ IFDI_INTR_DISABLE(ctx); if (msix > 1 && (err = IFDI_MSIX_INTR_ASSIGN(ctx, msix)) != 0) { device_printf(dev, "IFDI_MSIX_INTR_ASSIGN failed %d\n", err); goto fail_intr_free; } if (msix <= 1) { rid = 0; if (scctx->isc_intr == IFLIB_INTR_MSI) { MPASS(msix == 1); rid = 1; } if ((err = iflib_legacy_setup(ctx, ctx->isc_legacy_intr, ctx->ifc_softc, &rid, "irq0")) != 0) { device_printf(dev, "iflib_legacy_setup failed %d\n", err); goto fail_intr_free; } } ether_ifattach(ctx->ifc_ifp, ctx->ifc_mac); if ((err = IFDI_ATTACH_POST(ctx)) != 0) { device_printf(dev, "IFDI_ATTACH_POST failed %d\n", err); goto fail_detach; } if ((err = iflib_netmap_attach(ctx))) { device_printf(ctx->ifc_dev, "netmap attach failed: %d\n", err); goto fail_detach; } *ctxp = ctx; NETDUMP_SET(ctx->ifc_ifp, iflib); if_setgetcounterfn(ctx->ifc_ifp, iflib_if_get_counter); iflib_add_device_sysctl_post(ctx); ctx->ifc_flags |= IFC_INIT_DONE; CTX_UNLOCK(ctx); return (0); fail_detach: ether_ifdetach(ctx->ifc_ifp); fail_intr_free: if (scctx->isc_intr == IFLIB_INTR_MSIX || scctx->isc_intr == IFLIB_INTR_MSI) pci_release_msi(ctx->ifc_dev); fail_queues: iflib_tx_structures_free(ctx); iflib_rx_structures_free(ctx); fail: IFDI_DETACH(ctx); CTX_UNLOCK(ctx); return (err); } int iflib_pseudo_register(device_t dev, if_shared_ctx_t sctx, if_ctx_t *ctxp, struct iflib_cloneattach_ctx *clctx) { int err; if_ctx_t ctx; if_t ifp; if_softc_ctx_t scctx; int i; void *sc; uint16_t main_txq; uint16_t main_rxq; ctx = malloc(sizeof(*ctx), M_IFLIB, M_WAITOK|M_ZERO); sc = malloc(sctx->isc_driver->size, M_IFLIB, M_WAITOK|M_ZERO); ctx->ifc_flags |= IFC_SC_ALLOCATED; if (sctx->isc_flags & (IFLIB_PSEUDO|IFLIB_VIRTUAL)) ctx->ifc_flags |= IFC_PSEUDO; ctx->ifc_sctx = sctx; ctx->ifc_softc = sc; ctx->ifc_dev = dev; if ((err = iflib_register(ctx)) != 0) { device_printf(dev, "%s: iflib_register failed %d\n", __func__, err); free(sc, M_IFLIB); free(ctx, M_IFLIB); return (err); } iflib_add_device_sysctl_pre(ctx); scctx = &ctx->ifc_softc_ctx; ifp = ctx->ifc_ifp; /* * XXX sanity check that ntxd & nrxd are a power of 2 */ iflib_reset_qvalues(ctx); if ((err = IFDI_ATTACH_PRE(ctx)) != 0) { device_printf(dev, "IFDI_ATTACH_PRE failed %d\n", err); return (err); } if (sctx->isc_flags & IFLIB_GEN_MAC) iflib_gen_mac(ctx); if ((err = IFDI_CLONEATTACH(ctx, clctx->cc_ifc, clctx->cc_name, clctx->cc_params)) != 0) { device_printf(dev, "IFDI_CLONEATTACH failed %d\n", err); return (err); } ifmedia_add(&ctx->ifc_media, IFM_ETHER | IFM_1000_T | IFM_FDX, 0, NULL); ifmedia_add(&ctx->ifc_media, IFM_ETHER | IFM_AUTO, 0, NULL); ifmedia_set(&ctx->ifc_media, IFM_ETHER | IFM_AUTO); #ifdef INVARIANTS MPASS(scctx->isc_capenable); if (scctx->isc_capenable & IFCAP_TXCSUM) MPASS(scctx->isc_tx_csum_flags); #endif if_setcapabilities(ifp, scctx->isc_capenable | IFCAP_HWSTATS | IFCAP_LINKSTATE); if_setcapenable(ifp, scctx->isc_capenable | IFCAP_HWSTATS | IFCAP_LINKSTATE); ifp->if_flags |= IFF_NOGROUP; if (sctx->isc_flags & IFLIB_PSEUDO) { ether_ifattach(ctx->ifc_ifp, ctx->ifc_mac); if ((err = IFDI_ATTACH_POST(ctx)) != 0) { device_printf(dev, "IFDI_ATTACH_POST failed %d\n", err); goto fail_detach; } *ctxp = ctx; if_setgetcounterfn(ctx->ifc_ifp, iflib_if_get_counter); iflib_add_device_sysctl_post(ctx); ctx->ifc_flags |= IFC_INIT_DONE; return (0); } _iflib_pre_assert(scctx); ctx->ifc_txrx = *scctx->isc_txrx; if (scctx->isc_ntxqsets == 0 || (scctx->isc_ntxqsets_max && scctx->isc_ntxqsets_max < scctx->isc_ntxqsets)) scctx->isc_ntxqsets = scctx->isc_ntxqsets_max; if (scctx->isc_nrxqsets == 0 || (scctx->isc_nrxqsets_max && scctx->isc_nrxqsets_max < scctx->isc_nrxqsets)) scctx->isc_nrxqsets = scctx->isc_nrxqsets_max; main_txq = (sctx->isc_flags & IFLIB_HAS_TXCQ) ? 1 : 0; main_rxq = (sctx->isc_flags & IFLIB_HAS_RXCQ) ? 1 : 0; /* XXX change for per-queue sizes */ device_printf(dev, "using %d tx descriptors and %d rx descriptors\n", scctx->isc_ntxd[main_txq], scctx->isc_nrxd[main_rxq]); for (i = 0; i < sctx->isc_nrxqs; i++) { if (!powerof2(scctx->isc_nrxd[i])) { /* round down instead? */ device_printf(dev, "# rx descriptors must be a power of 2\n"); err = EINVAL; goto fail; } } for (i = 0; i < sctx->isc_ntxqs; i++) { if (!powerof2(scctx->isc_ntxd[i])) { device_printf(dev, "# tx descriptors must be a power of 2"); err = EINVAL; goto fail; } } if (scctx->isc_tx_nsegments > scctx->isc_ntxd[main_txq] / MAX_SINGLE_PACKET_FRACTION) scctx->isc_tx_nsegments = max(1, scctx->isc_ntxd[main_txq] / MAX_SINGLE_PACKET_FRACTION); if (scctx->isc_tx_tso_segments_max > scctx->isc_ntxd[main_txq] / MAX_SINGLE_PACKET_FRACTION) scctx->isc_tx_tso_segments_max = max(1, scctx->isc_ntxd[main_txq] / MAX_SINGLE_PACKET_FRACTION); /* * Protect the stack against modern hardware */ if (scctx->isc_tx_tso_size_max > FREEBSD_TSO_SIZE_MAX) scctx->isc_tx_tso_size_max = FREEBSD_TSO_SIZE_MAX; /* TSO parameters - dig these out of the data sheet - simply correspond to tag setup */ ifp->if_hw_tsomaxsegcount = scctx->isc_tx_tso_segments_max; ifp->if_hw_tsomax = scctx->isc_tx_tso_size_max; ifp->if_hw_tsomaxsegsize = scctx->isc_tx_tso_segsize_max; if (scctx->isc_rss_table_size == 0) scctx->isc_rss_table_size = 64; scctx->isc_rss_table_mask = scctx->isc_rss_table_size-1; GROUPTASK_INIT(&ctx->ifc_admin_task, 0, _task_fn_admin, ctx); /* XXX format name */ taskqgroup_attach(qgroup_if_config_tqg, &ctx->ifc_admin_task, ctx, -1, "admin"); /* XXX --- can support > 1 -- but keep it simple for now */ scctx->isc_intr = IFLIB_INTR_LEGACY; /* Get memory for the station queues */ if ((err = iflib_queues_alloc(ctx))) { device_printf(dev, "Unable to allocate queue memory\n"); goto fail; } if ((err = iflib_qset_structures_setup(ctx))) { device_printf(dev, "qset structure setup failed %d\n", err); goto fail_queues; } /* * XXX What if anything do we want to do about interrupts? */ ether_ifattach(ctx->ifc_ifp, ctx->ifc_mac); if ((err = IFDI_ATTACH_POST(ctx)) != 0) { device_printf(dev, "IFDI_ATTACH_POST failed %d\n", err); goto fail_detach; } /* XXX handle more than one queue */ for (i = 0; i < scctx->isc_nrxqsets; i++) IFDI_RX_CLSET(ctx, 0, i, ctx->ifc_rxqs[i].ifr_fl[0].ifl_sds.ifsd_cl); *ctxp = ctx; if_setgetcounterfn(ctx->ifc_ifp, iflib_if_get_counter); iflib_add_device_sysctl_post(ctx); ctx->ifc_flags |= IFC_INIT_DONE; return (0); fail_detach: ether_ifdetach(ctx->ifc_ifp); fail_queues: iflib_tx_structures_free(ctx); iflib_rx_structures_free(ctx); fail: IFDI_DETACH(ctx); return (err); } int iflib_pseudo_deregister(if_ctx_t ctx) { if_t ifp = ctx->ifc_ifp; iflib_txq_t txq; iflib_rxq_t rxq; int i, j; struct taskqgroup *tqg; iflib_fl_t fl; /* Unregister VLAN events */ if (ctx->ifc_vlan_attach_event != NULL) EVENTHANDLER_DEREGISTER(vlan_config, ctx->ifc_vlan_attach_event); if (ctx->ifc_vlan_detach_event != NULL) EVENTHANDLER_DEREGISTER(vlan_unconfig, ctx->ifc_vlan_detach_event); ether_ifdetach(ifp); /* ether_ifdetach calls if_qflush - lock must be destroy afterwards*/ CTX_LOCK_DESTROY(ctx); /* XXX drain any dependent tasks */ tqg = qgroup_if_io_tqg; for (txq = ctx->ifc_txqs, i = 0; i < NTXQSETS(ctx); i++, txq++) { callout_drain(&txq->ift_timer); if (txq->ift_task.gt_uniq != NULL) taskqgroup_detach(tqg, &txq->ift_task); } for (i = 0, rxq = ctx->ifc_rxqs; i < NRXQSETS(ctx); i++, rxq++) { if (rxq->ifr_task.gt_uniq != NULL) taskqgroup_detach(tqg, &rxq->ifr_task); for (j = 0, fl = rxq->ifr_fl; j < rxq->ifr_nfl; j++, fl++) free(fl->ifl_rx_bitmap, M_IFLIB); } tqg = qgroup_if_config_tqg; if (ctx->ifc_admin_task.gt_uniq != NULL) taskqgroup_detach(tqg, &ctx->ifc_admin_task); if (ctx->ifc_vflr_task.gt_uniq != NULL) taskqgroup_detach(tqg, &ctx->ifc_vflr_task); if_free(ifp); iflib_tx_structures_free(ctx); iflib_rx_structures_free(ctx); if (ctx->ifc_flags & IFC_SC_ALLOCATED) free(ctx->ifc_softc, M_IFLIB); free(ctx, M_IFLIB); return (0); } int iflib_device_attach(device_t dev) { if_ctx_t ctx; if_shared_ctx_t sctx; if ((sctx = DEVICE_REGISTER(dev)) == NULL || sctx->isc_magic != IFLIB_MAGIC) return (ENOTSUP); pci_enable_busmaster(dev); return (iflib_device_register(dev, NULL, sctx, &ctx)); } int iflib_device_deregister(if_ctx_t ctx) { if_t ifp = ctx->ifc_ifp; iflib_txq_t txq; iflib_rxq_t rxq; device_t dev = ctx->ifc_dev; int i, j; struct taskqgroup *tqg; iflib_fl_t fl; /* Make sure VLANS are not using driver */ if (if_vlantrunkinuse(ifp)) { device_printf(dev,"Vlan in use, detach first\n"); return (EBUSY); } CTX_LOCK(ctx); ctx->ifc_in_detach = 1; iflib_stop(ctx); CTX_UNLOCK(ctx); /* Unregister VLAN events */ if (ctx->ifc_vlan_attach_event != NULL) EVENTHANDLER_DEREGISTER(vlan_config, ctx->ifc_vlan_attach_event); if (ctx->ifc_vlan_detach_event != NULL) EVENTHANDLER_DEREGISTER(vlan_unconfig, ctx->ifc_vlan_detach_event); iflib_netmap_detach(ifp); ether_ifdetach(ifp); /* ether_ifdetach calls if_qflush - lock must be destroy afterwards*/ CTX_LOCK_DESTROY(ctx); if (ctx->ifc_led_dev != NULL) led_destroy(ctx->ifc_led_dev); /* XXX drain any dependent tasks */ tqg = qgroup_if_io_tqg; for (txq = ctx->ifc_txqs, i = 0; i < NTXQSETS(ctx); i++, txq++) { callout_drain(&txq->ift_timer); if (txq->ift_task.gt_uniq != NULL) taskqgroup_detach(tqg, &txq->ift_task); } for (i = 0, rxq = ctx->ifc_rxqs; i < NRXQSETS(ctx); i++, rxq++) { if (rxq->ifr_task.gt_uniq != NULL) taskqgroup_detach(tqg, &rxq->ifr_task); for (j = 0, fl = rxq->ifr_fl; j < rxq->ifr_nfl; j++, fl++) free(fl->ifl_rx_bitmap, M_IFLIB); } tqg = qgroup_if_config_tqg; if (ctx->ifc_admin_task.gt_uniq != NULL) taskqgroup_detach(tqg, &ctx->ifc_admin_task); if (ctx->ifc_vflr_task.gt_uniq != NULL) taskqgroup_detach(tqg, &ctx->ifc_vflr_task); IFDI_DETACH(ctx); device_set_softc(ctx->ifc_dev, NULL); if (ctx->ifc_softc_ctx.isc_intr != IFLIB_INTR_LEGACY) { pci_release_msi(dev); } if (ctx->ifc_softc_ctx.isc_intr != IFLIB_INTR_MSIX) { iflib_irq_free(ctx, &ctx->ifc_legacy_irq); } if (ctx->ifc_msix_mem != NULL) { bus_release_resource(ctx->ifc_dev, SYS_RES_MEMORY, ctx->ifc_softc_ctx.isc_msix_bar, ctx->ifc_msix_mem); ctx->ifc_msix_mem = NULL; } bus_generic_detach(dev); if_free(ifp); iflib_tx_structures_free(ctx); iflib_rx_structures_free(ctx); if (ctx->ifc_flags & IFC_SC_ALLOCATED) free(ctx->ifc_softc, M_IFLIB); free(ctx, M_IFLIB); return (0); } int iflib_device_detach(device_t dev) { if_ctx_t ctx = device_get_softc(dev); return (iflib_device_deregister(ctx)); } int iflib_device_suspend(device_t dev) { if_ctx_t ctx = device_get_softc(dev); CTX_LOCK(ctx); IFDI_SUSPEND(ctx); CTX_UNLOCK(ctx); return bus_generic_suspend(dev); } int iflib_device_shutdown(device_t dev) { if_ctx_t ctx = device_get_softc(dev); CTX_LOCK(ctx); IFDI_SHUTDOWN(ctx); CTX_UNLOCK(ctx); return bus_generic_suspend(dev); } int iflib_device_resume(device_t dev) { if_ctx_t ctx = device_get_softc(dev); iflib_txq_t txq = ctx->ifc_txqs; CTX_LOCK(ctx); IFDI_RESUME(ctx); iflib_init_locked(ctx); CTX_UNLOCK(ctx); for (int i = 0; i < NTXQSETS(ctx); i++, txq++) iflib_txq_check_drain(txq, IFLIB_RESTART_BUDGET); return (bus_generic_resume(dev)); } int iflib_device_iov_init(device_t dev, uint16_t num_vfs, const nvlist_t *params) { int error; if_ctx_t ctx = device_get_softc(dev); CTX_LOCK(ctx); error = IFDI_IOV_INIT(ctx, num_vfs, params); CTX_UNLOCK(ctx); return (error); } void iflib_device_iov_uninit(device_t dev) { if_ctx_t ctx = device_get_softc(dev); CTX_LOCK(ctx); IFDI_IOV_UNINIT(ctx); CTX_UNLOCK(ctx); } int iflib_device_iov_add_vf(device_t dev, uint16_t vfnum, const nvlist_t *params) { int error; if_ctx_t ctx = device_get_softc(dev); CTX_LOCK(ctx); error = IFDI_IOV_VF_ADD(ctx, vfnum, params); CTX_UNLOCK(ctx); return (error); } /********************************************************************* * * MODULE FUNCTION DEFINITIONS * **********************************************************************/ /* * - Start a fast taskqueue thread for each core * - Start a taskqueue for control operations */ static int iflib_module_init(void) { return (0); } static int iflib_module_event_handler(module_t mod, int what, void *arg) { int err; switch (what) { case MOD_LOAD: if ((err = iflib_module_init()) != 0) return (err); break; case MOD_UNLOAD: return (EBUSY); default: return (EOPNOTSUPP); } return (0); } /********************************************************************* * * PUBLIC FUNCTION DEFINITIONS * ordered as in iflib.h * **********************************************************************/ static void _iflib_assert(if_shared_ctx_t sctx) { MPASS(sctx->isc_tx_maxsize); MPASS(sctx->isc_tx_maxsegsize); MPASS(sctx->isc_rx_maxsize); MPASS(sctx->isc_rx_nsegments); MPASS(sctx->isc_rx_maxsegsize); MPASS(sctx->isc_nrxd_min[0]); MPASS(sctx->isc_nrxd_max[0]); MPASS(sctx->isc_nrxd_default[0]); MPASS(sctx->isc_ntxd_min[0]); MPASS(sctx->isc_ntxd_max[0]); MPASS(sctx->isc_ntxd_default[0]); } static void _iflib_pre_assert(if_softc_ctx_t scctx) { MPASS(scctx->isc_txrx->ift_txd_encap); MPASS(scctx->isc_txrx->ift_txd_flush); MPASS(scctx->isc_txrx->ift_txd_credits_update); MPASS(scctx->isc_txrx->ift_rxd_available); MPASS(scctx->isc_txrx->ift_rxd_pkt_get); MPASS(scctx->isc_txrx->ift_rxd_refill); MPASS(scctx->isc_txrx->ift_rxd_flush); } static int iflib_register(if_ctx_t ctx) { if_shared_ctx_t sctx = ctx->ifc_sctx; driver_t *driver = sctx->isc_driver; device_t dev = ctx->ifc_dev; if_t ifp; _iflib_assert(sctx); CTX_LOCK_INIT(ctx); STATE_LOCK_INIT(ctx, device_get_nameunit(ctx->ifc_dev)); ifp = ctx->ifc_ifp = if_gethandle(IFT_ETHER); if (ifp == NULL) { device_printf(dev, "can not allocate ifnet structure\n"); return (ENOMEM); } /* * Initialize our context's device specific methods */ kobj_init((kobj_t) ctx, (kobj_class_t) driver); kobj_class_compile((kobj_class_t) driver); driver->refs++; if_initname(ifp, device_get_name(dev), device_get_unit(dev)); if_setsoftc(ifp, ctx); if_setdev(ifp, dev); if_setinitfn(ifp, iflib_if_init); if_setioctlfn(ifp, iflib_if_ioctl); if_settransmitfn(ifp, iflib_if_transmit); if_setqflushfn(ifp, iflib_if_qflush); if_setflags(ifp, IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST); ctx->ifc_vlan_attach_event = EVENTHANDLER_REGISTER(vlan_config, iflib_vlan_register, ctx, EVENTHANDLER_PRI_FIRST); ctx->ifc_vlan_detach_event = EVENTHANDLER_REGISTER(vlan_unconfig, iflib_vlan_unregister, ctx, EVENTHANDLER_PRI_FIRST); ifmedia_init(&ctx->ifc_media, IFM_IMASK, iflib_media_change, iflib_media_status); return (0); } static int iflib_queues_alloc(if_ctx_t ctx) { if_shared_ctx_t sctx = ctx->ifc_sctx; if_softc_ctx_t scctx = &ctx->ifc_softc_ctx; device_t dev = ctx->ifc_dev; int nrxqsets = scctx->isc_nrxqsets; int ntxqsets = scctx->isc_ntxqsets; iflib_txq_t txq; iflib_rxq_t rxq; iflib_fl_t fl = NULL; int i, j, cpu, err, txconf, rxconf; iflib_dma_info_t ifdip; uint32_t *rxqsizes = scctx->isc_rxqsizes; uint32_t *txqsizes = scctx->isc_txqsizes; uint8_t nrxqs = sctx->isc_nrxqs; uint8_t ntxqs = sctx->isc_ntxqs; int nfree_lists = sctx->isc_nfl ? sctx->isc_nfl : 1; caddr_t *vaddrs; uint64_t *paddrs; KASSERT(ntxqs > 0, ("number of queues per qset must be at least 1")); KASSERT(nrxqs > 0, ("number of queues per qset must be at least 1")); /* Allocate the TX ring struct memory */ if (!(ctx->ifc_txqs = (iflib_txq_t) malloc(sizeof(struct iflib_txq) * ntxqsets, M_IFLIB, M_NOWAIT | M_ZERO))) { device_printf(dev, "Unable to allocate TX ring memory\n"); err = ENOMEM; goto fail; } /* Now allocate the RX */ if (!(ctx->ifc_rxqs = (iflib_rxq_t) malloc(sizeof(struct iflib_rxq) * nrxqsets, M_IFLIB, M_NOWAIT | M_ZERO))) { device_printf(dev, "Unable to allocate RX ring memory\n"); err = ENOMEM; goto rx_fail; } txq = ctx->ifc_txqs; rxq = ctx->ifc_rxqs; /* * XXX handle allocation failure */ for (txconf = i = 0, cpu = CPU_FIRST(); i < ntxqsets; i++, txconf++, txq++, cpu = CPU_NEXT(cpu)) { /* Set up some basics */ if ((ifdip = malloc(sizeof(struct iflib_dma_info) * ntxqs, M_IFLIB, M_WAITOK|M_ZERO)) == NULL) { device_printf(dev, "failed to allocate iflib_dma_info\n"); err = ENOMEM; goto err_tx_desc; } txq->ift_ifdi = ifdip; for (j = 0; j < ntxqs; j++, ifdip++) { if (iflib_dma_alloc(ctx, txqsizes[j], ifdip, BUS_DMA_NOWAIT)) { device_printf(dev, "Unable to allocate Descriptor memory\n"); err = ENOMEM; goto err_tx_desc; } txq->ift_txd_size[j] = scctx->isc_txd_size[j]; bzero((void *)ifdip->idi_vaddr, txqsizes[j]); } txq->ift_ctx = ctx; txq->ift_id = i; if (sctx->isc_flags & IFLIB_HAS_TXCQ) { txq->ift_br_offset = 1; } else { txq->ift_br_offset = 0; } /* XXX fix this */ txq->ift_timer.c_cpu = cpu; if (iflib_txsd_alloc(txq)) { device_printf(dev, "Critical Failure setting up TX buffers\n"); err = ENOMEM; goto err_tx_desc; } /* Initialize the TX lock */ snprintf(txq->ift_mtx_name, MTX_NAME_LEN, "%s:tx(%d):callout", device_get_nameunit(dev), txq->ift_id); mtx_init(&txq->ift_mtx, txq->ift_mtx_name, NULL, MTX_DEF); callout_init_mtx(&txq->ift_timer, &txq->ift_mtx, 0); snprintf(txq->ift_db_mtx_name, MTX_NAME_LEN, "%s:tx(%d):db", device_get_nameunit(dev), txq->ift_id); err = ifmp_ring_alloc(&txq->ift_br, 2048, txq, iflib_txq_drain, iflib_txq_can_drain, M_IFLIB, M_WAITOK); if (err) { /* XXX free any allocated rings */ device_printf(dev, "Unable to allocate buf_ring\n"); goto err_tx_desc; } } for (rxconf = i = 0; i < nrxqsets; i++, rxconf++, rxq++) { /* Set up some basics */ if ((ifdip = malloc(sizeof(struct iflib_dma_info) * nrxqs, M_IFLIB, M_WAITOK|M_ZERO)) == NULL) { device_printf(dev, "failed to allocate iflib_dma_info\n"); err = ENOMEM; goto err_tx_desc; } rxq->ifr_ifdi = ifdip; /* XXX this needs to be changed if #rx queues != #tx queues */ rxq->ifr_ntxqirq = 1; rxq->ifr_txqid[0] = i; for (j = 0; j < nrxqs; j++, ifdip++) { if (iflib_dma_alloc(ctx, rxqsizes[j], ifdip, BUS_DMA_NOWAIT)) { device_printf(dev, "Unable to allocate Descriptor memory\n"); err = ENOMEM; goto err_tx_desc; } bzero((void *)ifdip->idi_vaddr, rxqsizes[j]); } rxq->ifr_ctx = ctx; rxq->ifr_id = i; if (sctx->isc_flags & IFLIB_HAS_RXCQ) { rxq->ifr_fl_offset = 1; } else { rxq->ifr_fl_offset = 0; } rxq->ifr_nfl = nfree_lists; if (!(fl = (iflib_fl_t) malloc(sizeof(struct iflib_fl) * nfree_lists, M_IFLIB, M_NOWAIT | M_ZERO))) { device_printf(dev, "Unable to allocate free list memory\n"); err = ENOMEM; goto err_tx_desc; } rxq->ifr_fl = fl; for (j = 0; j < nfree_lists; j++) { fl[j].ifl_rxq = rxq; fl[j].ifl_id = j; fl[j].ifl_ifdi = &rxq->ifr_ifdi[j + rxq->ifr_fl_offset]; fl[j].ifl_rxd_size = scctx->isc_rxd_size[j]; } /* Allocate receive buffers for the ring*/ if (iflib_rxsd_alloc(rxq)) { device_printf(dev, "Critical Failure setting up receive buffers\n"); err = ENOMEM; goto err_rx_desc; } for (j = 0, fl = rxq->ifr_fl; j < rxq->ifr_nfl; j++, fl++) fl->ifl_rx_bitmap = bit_alloc(fl->ifl_size, M_IFLIB, M_WAITOK|M_ZERO); } /* TXQs */ vaddrs = malloc(sizeof(caddr_t)*ntxqsets*ntxqs, M_IFLIB, M_WAITOK); paddrs = malloc(sizeof(uint64_t)*ntxqsets*ntxqs, M_IFLIB, M_WAITOK); for (i = 0; i < ntxqsets; i++) { iflib_dma_info_t di = ctx->ifc_txqs[i].ift_ifdi; for (j = 0; j < ntxqs; j++, di++) { vaddrs[i*ntxqs + j] = di->idi_vaddr; paddrs[i*ntxqs + j] = di->idi_paddr; } } if ((err = IFDI_TX_QUEUES_ALLOC(ctx, vaddrs, paddrs, ntxqs, ntxqsets)) != 0) { device_printf(ctx->ifc_dev, "device queue allocation failed\n"); iflib_tx_structures_free(ctx); free(vaddrs, M_IFLIB); free(paddrs, M_IFLIB); goto err_rx_desc; } free(vaddrs, M_IFLIB); free(paddrs, M_IFLIB); /* RXQs */ vaddrs = malloc(sizeof(caddr_t)*nrxqsets*nrxqs, M_IFLIB, M_WAITOK); paddrs = malloc(sizeof(uint64_t)*nrxqsets*nrxqs, M_IFLIB, M_WAITOK); for (i = 0; i < nrxqsets; i++) { iflib_dma_info_t di = ctx->ifc_rxqs[i].ifr_ifdi; for (j = 0; j < nrxqs; j++, di++) { vaddrs[i*nrxqs + j] = di->idi_vaddr; paddrs[i*nrxqs + j] = di->idi_paddr; } } if ((err = IFDI_RX_QUEUES_ALLOC(ctx, vaddrs, paddrs, nrxqs, nrxqsets)) != 0) { device_printf(ctx->ifc_dev, "device queue allocation failed\n"); iflib_tx_structures_free(ctx); free(vaddrs, M_IFLIB); free(paddrs, M_IFLIB); goto err_rx_desc; } free(vaddrs, M_IFLIB); free(paddrs, M_IFLIB); return (0); /* XXX handle allocation failure changes */ err_rx_desc: err_tx_desc: rx_fail: if (ctx->ifc_rxqs != NULL) free(ctx->ifc_rxqs, M_IFLIB); ctx->ifc_rxqs = NULL; if (ctx->ifc_txqs != NULL) free(ctx->ifc_txqs, M_IFLIB); ctx->ifc_txqs = NULL; fail: return (err); } static int iflib_tx_structures_setup(if_ctx_t ctx) { iflib_txq_t txq = ctx->ifc_txqs; int i; for (i = 0; i < NTXQSETS(ctx); i++, txq++) iflib_txq_setup(txq); return (0); } static void iflib_tx_structures_free(if_ctx_t ctx) { iflib_txq_t txq = ctx->ifc_txqs; int i, j; for (i = 0; i < NTXQSETS(ctx); i++, txq++) { iflib_txq_destroy(txq); for (j = 0; j < ctx->ifc_nhwtxqs; j++) iflib_dma_free(&txq->ift_ifdi[j]); } free(ctx->ifc_txqs, M_IFLIB); ctx->ifc_txqs = NULL; IFDI_QUEUES_FREE(ctx); } /********************************************************************* * * Initialize all receive rings. * **********************************************************************/ static int iflib_rx_structures_setup(if_ctx_t ctx) { iflib_rxq_t rxq = ctx->ifc_rxqs; int q; #if defined(INET6) || defined(INET) int i, err; #endif for (q = 0; q < ctx->ifc_softc_ctx.isc_nrxqsets; q++, rxq++) { #if defined(INET6) || defined(INET) tcp_lro_free(&rxq->ifr_lc); if ((err = tcp_lro_init_args(&rxq->ifr_lc, ctx->ifc_ifp, TCP_LRO_ENTRIES, min(1024, ctx->ifc_softc_ctx.isc_nrxd[rxq->ifr_fl_offset]))) != 0) { device_printf(ctx->ifc_dev, "LRO Initialization failed!\n"); goto fail; } rxq->ifr_lro_enabled = TRUE; #endif IFDI_RXQ_SETUP(ctx, rxq->ifr_id); } return (0); #if defined(INET6) || defined(INET) fail: /* * Free RX software descriptors allocated so far, we will only handle * the rings that completed, the failing case will have * cleaned up for itself. 'q' failed, so its the terminus. */ rxq = ctx->ifc_rxqs; for (i = 0; i < q; ++i, rxq++) { iflib_rx_sds_free(rxq); rxq->ifr_cq_gen = rxq->ifr_cq_cidx = rxq->ifr_cq_pidx = 0; } return (err); #endif } /********************************************************************* * * Free all receive rings. * **********************************************************************/ static void iflib_rx_structures_free(if_ctx_t ctx) { iflib_rxq_t rxq = ctx->ifc_rxqs; for (int i = 0; i < ctx->ifc_softc_ctx.isc_nrxqsets; i++, rxq++) { iflib_rx_sds_free(rxq); } } static int iflib_qset_structures_setup(if_ctx_t ctx) { int err; /* * It is expected that the caller takes care of freeing queues if this * fails. */ if ((err = iflib_tx_structures_setup(ctx)) != 0) { device_printf(ctx->ifc_dev, "iflib_tx_structures_setup failed: %d\n", err); return (err); } if ((err = iflib_rx_structures_setup(ctx)) != 0) device_printf(ctx->ifc_dev, "iflib_rx_structures_setup failed: %d\n", err); return (err); } int iflib_irq_alloc(if_ctx_t ctx, if_irq_t irq, int rid, driver_filter_t filter, void *filter_arg, driver_intr_t handler, void *arg, char *name) { return (_iflib_irq_alloc(ctx, irq, rid, filter, handler, arg, name)); } #ifdef SMP static int find_nth(if_ctx_t ctx, int qid) { cpuset_t cpus; int i, cpuid, eqid, count; CPU_COPY(&ctx->ifc_cpus, &cpus); count = CPU_COUNT(&cpus); eqid = qid % count; /* clear up to the qid'th bit */ for (i = 0; i < eqid; i++) { cpuid = CPU_FFS(&cpus); MPASS(cpuid != 0); CPU_CLR(cpuid-1, &cpus); } cpuid = CPU_FFS(&cpus); MPASS(cpuid != 0); return (cpuid-1); } #ifdef SCHED_ULE extern struct cpu_group *cpu_top; /* CPU topology */ static int find_child_with_core(int cpu, struct cpu_group *grp) { int i; if (grp->cg_children == 0) return -1; MPASS(grp->cg_child); for (i = 0; i < grp->cg_children; i++) { if (CPU_ISSET(cpu, &grp->cg_child[i].cg_mask)) return i; } return -1; } /* * Find the nth "close" core to the specified core * "close" is defined as the deepest level that shares * at least an L2 cache. With threads, this will be * threads on the same core. If the sahred cache is L3 * or higher, simply returns the same core. */ static int find_close_core(int cpu, int core_offset) { struct cpu_group *grp; int i; int fcpu; cpuset_t cs; grp = cpu_top; if (grp == NULL) return cpu; i = 0; while ((i = find_child_with_core(cpu, grp)) != -1) { /* If the child only has one cpu, don't descend */ if (grp->cg_child[i].cg_count <= 1) break; grp = &grp->cg_child[i]; } /* If they don't share at least an L2 cache, use the same CPU */ if (grp->cg_level > CG_SHARE_L2 || grp->cg_level == CG_SHARE_NONE) return cpu; /* Now pick one */ CPU_COPY(&grp->cg_mask, &cs); /* Add the selected CPU offset to core offset. */ for (i = 0; (fcpu = CPU_FFS(&cs)) != 0; i++) { if (fcpu - 1 == cpu) break; CPU_CLR(fcpu - 1, &cs); } MPASS(fcpu); core_offset += i; CPU_COPY(&grp->cg_mask, &cs); for (i = core_offset % grp->cg_count; i > 0; i--) { MPASS(CPU_FFS(&cs)); CPU_CLR(CPU_FFS(&cs) - 1, &cs); } MPASS(CPU_FFS(&cs)); return CPU_FFS(&cs) - 1; } #else static int find_close_core(int cpu, int core_offset __unused) { return cpu; } #endif static int get_core_offset(if_ctx_t ctx, iflib_intr_type_t type, int qid) { switch (type) { case IFLIB_INTR_TX: /* TX queues get cores which share at least an L2 cache with the corresponding RX queue */ /* XXX handle multiple RX threads per core and more than two core per L2 group */ return qid / CPU_COUNT(&ctx->ifc_cpus) + 1; case IFLIB_INTR_RX: case IFLIB_INTR_RXTX: /* RX queues get the specified core */ return qid / CPU_COUNT(&ctx->ifc_cpus); default: return -1; } } #else #define get_core_offset(ctx, type, qid) CPU_FIRST() #define find_close_core(cpuid, tid) CPU_FIRST() #define find_nth(ctx, gid) CPU_FIRST() #endif /* Just to avoid copy/paste */ static inline int iflib_irq_set_affinity(if_ctx_t ctx, int irq, iflib_intr_type_t type, int qid, struct grouptask *gtask, struct taskqgroup *tqg, void *uniq, char *name) { int cpuid; int err, tid; cpuid = find_nth(ctx, qid); tid = get_core_offset(ctx, type, qid); MPASS(tid >= 0); cpuid = find_close_core(cpuid, tid); err = taskqgroup_attach_cpu(tqg, gtask, uniq, cpuid, irq, name); if (err) { device_printf(ctx->ifc_dev, "taskqgroup_attach_cpu failed %d\n", err); return (err); } #ifdef notyet if (cpuid > ctx->ifc_cpuid_highest) ctx->ifc_cpuid_highest = cpuid; #endif return 0; } int iflib_irq_alloc_generic(if_ctx_t ctx, if_irq_t irq, int rid, iflib_intr_type_t type, driver_filter_t *filter, void *filter_arg, int qid, char *name) { struct grouptask *gtask; struct taskqgroup *tqg; iflib_filter_info_t info; gtask_fn_t *fn; int tqrid, err; driver_filter_t *intr_fast; void *q; info = &ctx->ifc_filter_info; tqrid = rid; switch (type) { /* XXX merge tx/rx for netmap? */ case IFLIB_INTR_TX: q = &ctx->ifc_txqs[qid]; info = &ctx->ifc_txqs[qid].ift_filter_info; gtask = &ctx->ifc_txqs[qid].ift_task; tqg = qgroup_if_io_tqg; fn = _task_fn_tx; intr_fast = iflib_fast_intr; GROUPTASK_INIT(gtask, 0, fn, q); ctx->ifc_flags |= IFC_NETMAP_TX_IRQ; break; case IFLIB_INTR_RX: q = &ctx->ifc_rxqs[qid]; info = &ctx->ifc_rxqs[qid].ifr_filter_info; gtask = &ctx->ifc_rxqs[qid].ifr_task; tqg = qgroup_if_io_tqg; fn = _task_fn_rx; intr_fast = iflib_fast_intr; GROUPTASK_INIT(gtask, 0, fn, q); break; case IFLIB_INTR_RXTX: q = &ctx->ifc_rxqs[qid]; info = &ctx->ifc_rxqs[qid].ifr_filter_info; gtask = &ctx->ifc_rxqs[qid].ifr_task; tqg = qgroup_if_io_tqg; fn = _task_fn_rx; intr_fast = iflib_fast_intr_rxtx; GROUPTASK_INIT(gtask, 0, fn, q); break; case IFLIB_INTR_ADMIN: q = ctx; tqrid = -1; info = &ctx->ifc_filter_info; gtask = &ctx->ifc_admin_task; tqg = qgroup_if_config_tqg; fn = _task_fn_admin; intr_fast = iflib_fast_intr_ctx; break; default: panic("unknown net intr type"); } info->ifi_filter = filter; info->ifi_filter_arg = filter_arg; info->ifi_task = gtask; info->ifi_ctx = q; err = _iflib_irq_alloc(ctx, irq, rid, intr_fast, NULL, info, name); if (err != 0) { device_printf(ctx->ifc_dev, "_iflib_irq_alloc failed %d\n", err); return (err); } if (type == IFLIB_INTR_ADMIN) return (0); if (tqrid != -1) { err = iflib_irq_set_affinity(ctx, rman_get_start(irq->ii_res), type, qid, gtask, tqg, q, name); if (err) return (err); } else { taskqgroup_attach(tqg, gtask, q, rman_get_start(irq->ii_res), name); } return (0); } void iflib_softirq_alloc_generic(if_ctx_t ctx, if_irq_t irq, iflib_intr_type_t type, void *arg, int qid, char *name) { struct grouptask *gtask; struct taskqgroup *tqg; gtask_fn_t *fn; void *q; int irq_num = -1; int err; switch (type) { case IFLIB_INTR_TX: q = &ctx->ifc_txqs[qid]; gtask = &ctx->ifc_txqs[qid].ift_task; tqg = qgroup_if_io_tqg; fn = _task_fn_tx; if (irq != NULL) irq_num = rman_get_start(irq->ii_res); break; case IFLIB_INTR_RX: q = &ctx->ifc_rxqs[qid]; gtask = &ctx->ifc_rxqs[qid].ifr_task; tqg = qgroup_if_io_tqg; fn = _task_fn_rx; if (irq != NULL) irq_num = rman_get_start(irq->ii_res); break; case IFLIB_INTR_IOV: q = ctx; gtask = &ctx->ifc_vflr_task; tqg = qgroup_if_config_tqg; fn = _task_fn_iov; break; default: panic("unknown net intr type"); } GROUPTASK_INIT(gtask, 0, fn, q); if (irq_num != -1) { err = iflib_irq_set_affinity(ctx, irq_num, type, qid, gtask, tqg, q, name); if (err) taskqgroup_attach(tqg, gtask, q, irq_num, name); } else { taskqgroup_attach(tqg, gtask, q, irq_num, name); } } void iflib_irq_free(if_ctx_t ctx, if_irq_t irq) { if (irq->ii_tag) bus_teardown_intr(ctx->ifc_dev, irq->ii_res, irq->ii_tag); if (irq->ii_res) bus_release_resource(ctx->ifc_dev, SYS_RES_IRQ, irq->ii_rid, irq->ii_res); } static int iflib_legacy_setup(if_ctx_t ctx, driver_filter_t filter, void *filter_arg, int *rid, char *name) { iflib_txq_t txq = ctx->ifc_txqs; iflib_rxq_t rxq = ctx->ifc_rxqs; if_irq_t irq = &ctx->ifc_legacy_irq; iflib_filter_info_t info; struct grouptask *gtask; struct taskqgroup *tqg; gtask_fn_t *fn; int tqrid; void *q; int err; q = &ctx->ifc_rxqs[0]; info = &rxq[0].ifr_filter_info; gtask = &rxq[0].ifr_task; tqg = qgroup_if_io_tqg; tqrid = irq->ii_rid = *rid; fn = _task_fn_rx; ctx->ifc_flags |= IFC_LEGACY; info->ifi_filter = filter; info->ifi_filter_arg = filter_arg; info->ifi_task = gtask; info->ifi_ctx = ctx; /* We allocate a single interrupt resource */ if ((err = _iflib_irq_alloc(ctx, irq, tqrid, iflib_fast_intr_ctx, NULL, info, name)) != 0) return (err); GROUPTASK_INIT(gtask, 0, fn, q); taskqgroup_attach(tqg, gtask, q, rman_get_start(irq->ii_res), name); GROUPTASK_INIT(&txq->ift_task, 0, _task_fn_tx, txq); taskqgroup_attach(qgroup_if_io_tqg, &txq->ift_task, txq, rman_get_start(irq->ii_res), "tx"); return (0); } void iflib_led_create(if_ctx_t ctx) { ctx->ifc_led_dev = led_create(iflib_led_func, ctx, device_get_nameunit(ctx->ifc_dev)); } void iflib_tx_intr_deferred(if_ctx_t ctx, int txqid) { GROUPTASK_ENQUEUE(&ctx->ifc_txqs[txqid].ift_task); } void iflib_rx_intr_deferred(if_ctx_t ctx, int rxqid) { GROUPTASK_ENQUEUE(&ctx->ifc_rxqs[rxqid].ifr_task); } void iflib_admin_intr_deferred(if_ctx_t ctx) { #ifdef INVARIANTS struct grouptask *gtask; gtask = &ctx->ifc_admin_task; MPASS(gtask != NULL && gtask->gt_taskqueue != NULL); #endif GROUPTASK_ENQUEUE(&ctx->ifc_admin_task); } void iflib_iov_intr_deferred(if_ctx_t ctx) { GROUPTASK_ENQUEUE(&ctx->ifc_vflr_task); } void iflib_io_tqg_attach(struct grouptask *gt, void *uniq, int cpu, char *name) { taskqgroup_attach_cpu(qgroup_if_io_tqg, gt, uniq, cpu, -1, name); } void iflib_config_gtask_init(void *ctx, struct grouptask *gtask, gtask_fn_t *fn, const char *name) { GROUPTASK_INIT(gtask, 0, fn, ctx); taskqgroup_attach(qgroup_if_config_tqg, gtask, gtask, -1, name); } void iflib_config_gtask_deinit(struct grouptask *gtask) { taskqgroup_detach(qgroup_if_config_tqg, gtask); } void iflib_link_state_change(if_ctx_t ctx, int link_state, uint64_t baudrate) { if_t ifp = ctx->ifc_ifp; iflib_txq_t txq = ctx->ifc_txqs; if_setbaudrate(ifp, baudrate); if (baudrate >= IF_Gbps(10)) { STATE_LOCK(ctx); ctx->ifc_flags |= IFC_PREFETCH; STATE_UNLOCK(ctx); } /* If link down, disable watchdog */ if ((ctx->ifc_link_state == LINK_STATE_UP) && (link_state == LINK_STATE_DOWN)) { for (int i = 0; i < ctx->ifc_softc_ctx.isc_ntxqsets; i++, txq++) txq->ift_qstatus = IFLIB_QUEUE_IDLE; } ctx->ifc_link_state = link_state; if_link_state_change(ifp, link_state); } static int iflib_tx_credits_update(if_ctx_t ctx, iflib_txq_t txq) { int credits; #ifdef INVARIANTS int credits_pre = txq->ift_cidx_processed; #endif if (ctx->isc_txd_credits_update == NULL) return (0); if ((credits = ctx->isc_txd_credits_update(ctx->ifc_softc, txq->ift_id, true)) == 0) return (0); txq->ift_processed += credits; txq->ift_cidx_processed += credits; MPASS(credits_pre + credits == txq->ift_cidx_processed); if (txq->ift_cidx_processed >= txq->ift_size) txq->ift_cidx_processed -= txq->ift_size; return (credits); } static int iflib_rxd_avail(if_ctx_t ctx, iflib_rxq_t rxq, qidx_t cidx, qidx_t budget) { return (ctx->isc_rxd_available(ctx->ifc_softc, rxq->ifr_id, cidx, budget)); } void iflib_add_int_delay_sysctl(if_ctx_t ctx, const char *name, const char *description, if_int_delay_info_t info, int offset, int value) { info->iidi_ctx = ctx; info->iidi_offset = offset; info->iidi_value = value; SYSCTL_ADD_PROC(device_get_sysctl_ctx(ctx->ifc_dev), SYSCTL_CHILDREN(device_get_sysctl_tree(ctx->ifc_dev)), OID_AUTO, name, CTLTYPE_INT|CTLFLAG_RW, info, 0, iflib_sysctl_int_delay, "I", description); } struct sx * iflib_ctx_lock_get(if_ctx_t ctx) { return (&ctx->ifc_ctx_sx); } static int iflib_msix_init(if_ctx_t ctx) { device_t dev = ctx->ifc_dev; if_shared_ctx_t sctx = ctx->ifc_sctx; if_softc_ctx_t scctx = &ctx->ifc_softc_ctx; int vectors, queues, rx_queues, tx_queues, queuemsgs, msgs; int iflib_num_tx_queues, iflib_num_rx_queues; int err, admincnt, bar; iflib_num_tx_queues = ctx->ifc_sysctl_ntxqs; iflib_num_rx_queues = ctx->ifc_sysctl_nrxqs; device_printf(dev, "msix_init qsets capped at %d\n", imax(scctx->isc_ntxqsets, scctx->isc_nrxqsets)); bar = ctx->ifc_softc_ctx.isc_msix_bar; admincnt = sctx->isc_admin_intrcnt; /* Override by global tuneable */ { int i; size_t len = sizeof(i); err = kernel_sysctlbyname(curthread, "hw.pci.enable_msix", &i, &len, NULL, 0, NULL, 0); if (err == 0) { if (i == 0) goto msi; } else { device_printf(dev, "unable to read hw.pci.enable_msix."); } } /* Override by tuneable */ if (scctx->isc_disable_msix) goto msi; /* ** When used in a virtualized environment ** PCI BUSMASTER capability may not be set ** so explicity set it here and rewrite ** the ENABLE in the MSIX control register ** at this point to cause the host to ** successfully initialize us. */ { int msix_ctrl, rid; pci_enable_busmaster(dev); rid = 0; if (pci_find_cap(dev, PCIY_MSIX, &rid) == 0 && rid != 0) { rid += PCIR_MSIX_CTRL; msix_ctrl = pci_read_config(dev, rid, 2); msix_ctrl |= PCIM_MSIXCTRL_MSIX_ENABLE; pci_write_config(dev, rid, msix_ctrl, 2); } else { device_printf(dev, "PCIY_MSIX capability not found; " "or rid %d == 0.\n", rid); goto msi; } } /* * bar == -1 => "trust me I know what I'm doing" * Some drivers are for hardware that is so shoddily * documented that no one knows which bars are which * so the developer has to map all bars. This hack * allows shoddy garbage to use msix in this framework. */ if (bar != -1) { ctx->ifc_msix_mem = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &bar, RF_ACTIVE); if (ctx->ifc_msix_mem == NULL) { /* May not be enabled */ device_printf(dev, "Unable to map MSIX table \n"); goto msi; } } /* First try MSI/X */ if ((msgs = pci_msix_count(dev)) == 0) { /* system has msix disabled */ device_printf(dev, "System has MSIX disabled \n"); bus_release_resource(dev, SYS_RES_MEMORY, bar, ctx->ifc_msix_mem); ctx->ifc_msix_mem = NULL; goto msi; } #if IFLIB_DEBUG /* use only 1 qset in debug mode */ queuemsgs = min(msgs - admincnt, 1); #else queuemsgs = msgs - admincnt; #endif #ifdef RSS queues = imin(queuemsgs, rss_getnumbuckets()); #else queues = queuemsgs; #endif queues = imin(CPU_COUNT(&ctx->ifc_cpus), queues); device_printf(dev, "pxm cpus: %d queue msgs: %d admincnt: %d\n", CPU_COUNT(&ctx->ifc_cpus), queuemsgs, admincnt); #ifdef RSS /* If we're doing RSS, clamp at the number of RSS buckets */ if (queues > rss_getnumbuckets()) queues = rss_getnumbuckets(); #endif if (iflib_num_rx_queues > 0 && iflib_num_rx_queues < queuemsgs - admincnt) rx_queues = iflib_num_rx_queues; else rx_queues = queues; if (rx_queues > scctx->isc_nrxqsets) rx_queues = scctx->isc_nrxqsets; /* * We want this to be all logical CPUs by default */ if (iflib_num_tx_queues > 0 && iflib_num_tx_queues < queues) tx_queues = iflib_num_tx_queues; else tx_queues = mp_ncpus; if (tx_queues > scctx->isc_ntxqsets) tx_queues = scctx->isc_ntxqsets; if (ctx->ifc_sysctl_qs_eq_override == 0) { #ifdef INVARIANTS if (tx_queues != rx_queues) device_printf(dev, "queue equality override not set, capping rx_queues at %d and tx_queues at %d\n", min(rx_queues, tx_queues), min(rx_queues, tx_queues)); #endif tx_queues = min(rx_queues, tx_queues); rx_queues = min(rx_queues, tx_queues); } device_printf(dev, "using %d rx queues %d tx queues \n", rx_queues, tx_queues); vectors = rx_queues + admincnt; if ((err = pci_alloc_msix(dev, &vectors)) == 0) { device_printf(dev, "Using MSIX interrupts with %d vectors\n", vectors); scctx->isc_vectors = vectors; scctx->isc_nrxqsets = rx_queues; scctx->isc_ntxqsets = tx_queues; scctx->isc_intr = IFLIB_INTR_MSIX; return (vectors); } else { device_printf(dev, "failed to allocate %d msix vectors, err: %d - using MSI\n", vectors, err); } msi: vectors = pci_msi_count(dev); scctx->isc_nrxqsets = 1; scctx->isc_ntxqsets = 1; scctx->isc_vectors = vectors; if (vectors == 1 && pci_alloc_msi(dev, &vectors) == 0) { device_printf(dev,"Using an MSI interrupt\n"); scctx->isc_intr = IFLIB_INTR_MSI; } else { device_printf(dev,"Using a Legacy interrupt\n"); scctx->isc_intr = IFLIB_INTR_LEGACY; } return (vectors); } char * ring_states[] = { "IDLE", "BUSY", "STALLED", "ABDICATED" }; static int mp_ring_state_handler(SYSCTL_HANDLER_ARGS) { int rc; uint16_t *state = ((uint16_t *)oidp->oid_arg1); struct sbuf *sb; char *ring_state = "UNKNOWN"; /* XXX needed ? */ rc = sysctl_wire_old_buffer(req, 0); MPASS(rc == 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 80, req); MPASS(sb != NULL); if (sb == NULL) return (ENOMEM); if (state[3] <= 3) ring_state = ring_states[state[3]]; sbuf_printf(sb, "pidx_head: %04hd pidx_tail: %04hd cidx: %04hd state: %s", state[0], state[1], state[2], ring_state); rc = sbuf_finish(sb); sbuf_delete(sb); return(rc); } enum iflib_ndesc_handler { IFLIB_NTXD_HANDLER, IFLIB_NRXD_HANDLER, }; static int mp_ndesc_handler(SYSCTL_HANDLER_ARGS) { if_ctx_t ctx = (void *)arg1; enum iflib_ndesc_handler type = arg2; char buf[256] = {0}; qidx_t *ndesc; char *p, *next; int nqs, rc, i; MPASS(type == IFLIB_NTXD_HANDLER || type == IFLIB_NRXD_HANDLER); nqs = 8; switch(type) { case IFLIB_NTXD_HANDLER: ndesc = ctx->ifc_sysctl_ntxds; if (ctx->ifc_sctx) nqs = ctx->ifc_sctx->isc_ntxqs; break; case IFLIB_NRXD_HANDLER: ndesc = ctx->ifc_sysctl_nrxds; if (ctx->ifc_sctx) nqs = ctx->ifc_sctx->isc_nrxqs; break; default: panic("unhandled type"); } if (nqs == 0) nqs = 8; for (i=0; i<8; i++) { if (i >= nqs) break; if (i) strcat(buf, ","); sprintf(strchr(buf, 0), "%d", ndesc[i]); } rc = sysctl_handle_string(oidp, buf, sizeof(buf), req); if (rc || req->newptr == NULL) return rc; for (i = 0, next = buf, p = strsep(&next, " ,"); i < 8 && p; i++, p = strsep(&next, " ,")) { ndesc[i] = strtoul(p, NULL, 10); } return(rc); } #define NAME_BUFLEN 32 static void iflib_add_device_sysctl_pre(if_ctx_t ctx) { device_t dev = iflib_get_dev(ctx); struct sysctl_oid_list *child, *oid_list; struct sysctl_ctx_list *ctx_list; struct sysctl_oid *node; ctx_list = device_get_sysctl_ctx(dev); child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); ctx->ifc_sysctl_node = node = SYSCTL_ADD_NODE(ctx_list, child, OID_AUTO, "iflib", CTLFLAG_RD, NULL, "IFLIB fields"); oid_list = SYSCTL_CHILDREN(node); SYSCTL_ADD_STRING(ctx_list, oid_list, OID_AUTO, "driver_version", CTLFLAG_RD, ctx->ifc_sctx->isc_driver_version, 0, "driver version"); SYSCTL_ADD_U16(ctx_list, oid_list, OID_AUTO, "override_ntxqs", CTLFLAG_RWTUN, &ctx->ifc_sysctl_ntxqs, 0, "# of txqs to use, 0 => use default #"); SYSCTL_ADD_U16(ctx_list, oid_list, OID_AUTO, "override_nrxqs", CTLFLAG_RWTUN, &ctx->ifc_sysctl_nrxqs, 0, "# of rxqs to use, 0 => use default #"); SYSCTL_ADD_U16(ctx_list, oid_list, OID_AUTO, "override_qs_enable", CTLFLAG_RWTUN, &ctx->ifc_sysctl_qs_eq_override, 0, "permit #txq != #rxq"); SYSCTL_ADD_INT(ctx_list, oid_list, OID_AUTO, "disable_msix", CTLFLAG_RWTUN, &ctx->ifc_softc_ctx.isc_disable_msix, 0, "disable MSIX (default 0)"); SYSCTL_ADD_U16(ctx_list, oid_list, OID_AUTO, "rx_budget", CTLFLAG_RWTUN, &ctx->ifc_sysctl_rx_budget, 0, "set the rx budget"); /* XXX change for per-queue sizes */ SYSCTL_ADD_PROC(ctx_list, oid_list, OID_AUTO, "override_ntxds", CTLTYPE_STRING|CTLFLAG_RWTUN, ctx, IFLIB_NTXD_HANDLER, mp_ndesc_handler, "A", "list of # of tx descriptors to use, 0 = use default #"); SYSCTL_ADD_PROC(ctx_list, oid_list, OID_AUTO, "override_nrxds", CTLTYPE_STRING|CTLFLAG_RWTUN, ctx, IFLIB_NRXD_HANDLER, mp_ndesc_handler, "A", "list of # of rx descriptors to use, 0 = use default #"); } static void iflib_add_device_sysctl_post(if_ctx_t ctx) { if_shared_ctx_t sctx = ctx->ifc_sctx; if_softc_ctx_t scctx = &ctx->ifc_softc_ctx; device_t dev = iflib_get_dev(ctx); struct sysctl_oid_list *child; struct sysctl_ctx_list *ctx_list; iflib_fl_t fl; iflib_txq_t txq; iflib_rxq_t rxq; int i, j; char namebuf[NAME_BUFLEN]; char *qfmt; struct sysctl_oid *queue_node, *fl_node, *node; struct sysctl_oid_list *queue_list, *fl_list; ctx_list = device_get_sysctl_ctx(dev); node = ctx->ifc_sysctl_node; child = SYSCTL_CHILDREN(node); if (scctx->isc_ntxqsets > 100) qfmt = "txq%03d"; else if (scctx->isc_ntxqsets > 10) qfmt = "txq%02d"; else qfmt = "txq%d"; for (i = 0, txq = ctx->ifc_txqs; i < scctx->isc_ntxqsets; i++, txq++) { snprintf(namebuf, NAME_BUFLEN, qfmt, i); queue_node = SYSCTL_ADD_NODE(ctx_list, child, OID_AUTO, namebuf, CTLFLAG_RD, NULL, "Queue Name"); queue_list = SYSCTL_CHILDREN(queue_node); #if MEMORY_LOGGING SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "txq_dequeued", CTLFLAG_RD, &txq->ift_dequeued, "total mbufs freed"); SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "txq_enqueued", CTLFLAG_RD, &txq->ift_enqueued, "total mbufs enqueued"); #endif SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "mbuf_defrag", CTLFLAG_RD, &txq->ift_mbuf_defrag, "# of times m_defrag was called"); SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "m_pullups", CTLFLAG_RD, &txq->ift_pullups, "# of times m_pullup was called"); SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "mbuf_defrag_failed", CTLFLAG_RD, &txq->ift_mbuf_defrag_failed, "# of times m_defrag failed"); SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "no_desc_avail", CTLFLAG_RD, &txq->ift_no_desc_avail, "# of times no descriptors were available"); SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "tx_map_failed", CTLFLAG_RD, &txq->ift_map_failed, "# of times dma map failed"); SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "txd_encap_efbig", CTLFLAG_RD, &txq->ift_txd_encap_efbig, "# of times txd_encap returned EFBIG"); SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "no_tx_dma_setup", CTLFLAG_RD, &txq->ift_no_tx_dma_setup, "# of times map failed for other than EFBIG"); SYSCTL_ADD_U16(ctx_list, queue_list, OID_AUTO, "txq_pidx", CTLFLAG_RD, &txq->ift_pidx, 1, "Producer Index"); SYSCTL_ADD_U16(ctx_list, queue_list, OID_AUTO, "txq_cidx", CTLFLAG_RD, &txq->ift_cidx, 1, "Consumer Index"); SYSCTL_ADD_U16(ctx_list, queue_list, OID_AUTO, "txq_cidx_processed", CTLFLAG_RD, &txq->ift_cidx_processed, 1, "Consumer Index seen by credit update"); SYSCTL_ADD_U16(ctx_list, queue_list, OID_AUTO, "txq_in_use", CTLFLAG_RD, &txq->ift_in_use, 1, "descriptors in use"); SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "txq_processed", CTLFLAG_RD, &txq->ift_processed, "descriptors procesed for clean"); SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "txq_cleaned", CTLFLAG_RD, &txq->ift_cleaned, "total cleaned"); SYSCTL_ADD_PROC(ctx_list, queue_list, OID_AUTO, "ring_state", CTLTYPE_STRING | CTLFLAG_RD, __DEVOLATILE(uint64_t *, &txq->ift_br->state), 0, mp_ring_state_handler, "A", "soft ring state"); SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO, "r_enqueues", CTLFLAG_RD, &txq->ift_br->enqueues, "# of enqueues to the mp_ring for this queue"); SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO, "r_drops", CTLFLAG_RD, &txq->ift_br->drops, "# of drops in the mp_ring for this queue"); SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO, "r_starts", CTLFLAG_RD, &txq->ift_br->starts, "# of normal consumer starts in the mp_ring for this queue"); SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO, "r_stalls", CTLFLAG_RD, &txq->ift_br->stalls, "# of consumer stalls in the mp_ring for this queue"); SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO, "r_restarts", CTLFLAG_RD, &txq->ift_br->restarts, "# of consumer restarts in the mp_ring for this queue"); SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO, "r_abdications", CTLFLAG_RD, &txq->ift_br->abdications, "# of consumer abdications in the mp_ring for this queue"); } if (scctx->isc_nrxqsets > 100) qfmt = "rxq%03d"; else if (scctx->isc_nrxqsets > 10) qfmt = "rxq%02d"; else qfmt = "rxq%d"; for (i = 0, rxq = ctx->ifc_rxqs; i < scctx->isc_nrxqsets; i++, rxq++) { snprintf(namebuf, NAME_BUFLEN, qfmt, i); queue_node = SYSCTL_ADD_NODE(ctx_list, child, OID_AUTO, namebuf, CTLFLAG_RD, NULL, "Queue Name"); queue_list = SYSCTL_CHILDREN(queue_node); if (sctx->isc_flags & IFLIB_HAS_RXCQ) { SYSCTL_ADD_U16(ctx_list, queue_list, OID_AUTO, "rxq_cq_pidx", CTLFLAG_RD, &rxq->ifr_cq_pidx, 1, "Producer Index"); SYSCTL_ADD_U16(ctx_list, queue_list, OID_AUTO, "rxq_cq_cidx", CTLFLAG_RD, &rxq->ifr_cq_cidx, 1, "Consumer Index"); } for (j = 0, fl = rxq->ifr_fl; j < rxq->ifr_nfl; j++, fl++) { snprintf(namebuf, NAME_BUFLEN, "rxq_fl%d", j); fl_node = SYSCTL_ADD_NODE(ctx_list, queue_list, OID_AUTO, namebuf, CTLFLAG_RD, NULL, "freelist Name"); fl_list = SYSCTL_CHILDREN(fl_node); SYSCTL_ADD_U16(ctx_list, fl_list, OID_AUTO, "pidx", CTLFLAG_RD, &fl->ifl_pidx, 1, "Producer Index"); SYSCTL_ADD_U16(ctx_list, fl_list, OID_AUTO, "cidx", CTLFLAG_RD, &fl->ifl_cidx, 1, "Consumer Index"); SYSCTL_ADD_U16(ctx_list, fl_list, OID_AUTO, "credits", CTLFLAG_RD, &fl->ifl_credits, 1, "credits available"); #if MEMORY_LOGGING SYSCTL_ADD_QUAD(ctx_list, fl_list, OID_AUTO, "fl_m_enqueued", CTLFLAG_RD, &fl->ifl_m_enqueued, "mbufs allocated"); SYSCTL_ADD_QUAD(ctx_list, fl_list, OID_AUTO, "fl_m_dequeued", CTLFLAG_RD, &fl->ifl_m_dequeued, "mbufs freed"); SYSCTL_ADD_QUAD(ctx_list, fl_list, OID_AUTO, "fl_cl_enqueued", CTLFLAG_RD, &fl->ifl_cl_enqueued, "clusters allocated"); SYSCTL_ADD_QUAD(ctx_list, fl_list, OID_AUTO, "fl_cl_dequeued", CTLFLAG_RD, &fl->ifl_cl_dequeued, "clusters freed"); #endif } } } #ifndef __NO_STRICT_ALIGNMENT static struct mbuf * iflib_fixup_rx(struct mbuf *m) { struct mbuf *n; if (m->m_len <= (MCLBYTES - ETHER_HDR_LEN)) { bcopy(m->m_data, m->m_data + ETHER_HDR_LEN, m->m_len); m->m_data += ETHER_HDR_LEN; n = m; } else { MGETHDR(n, M_NOWAIT, MT_DATA); if (n == NULL) { m_freem(m); return (NULL); } bcopy(m->m_data, n->m_data, ETHER_HDR_LEN); m->m_data += ETHER_HDR_LEN; m->m_len -= ETHER_HDR_LEN; n->m_len = ETHER_HDR_LEN; M_MOVE_PKTHDR(n, m); n->m_next = m; } return (n); } #endif #ifdef NETDUMP static void iflib_netdump_init(struct ifnet *ifp, int *nrxr, int *ncl, int *clsize) { if_ctx_t ctx; ctx = if_getsoftc(ifp); CTX_LOCK(ctx); *nrxr = NRXQSETS(ctx); *ncl = ctx->ifc_rxqs[0].ifr_fl->ifl_size; *clsize = ctx->ifc_rxqs[0].ifr_fl->ifl_buf_size; CTX_UNLOCK(ctx); } static void iflib_netdump_event(struct ifnet *ifp, enum netdump_ev event) { if_ctx_t ctx; if_softc_ctx_t scctx; iflib_fl_t fl; iflib_rxq_t rxq; int i, j; ctx = if_getsoftc(ifp); scctx = &ctx->ifc_softc_ctx; switch (event) { case NETDUMP_START: for (i = 0; i < scctx->isc_nrxqsets; i++) { rxq = &ctx->ifc_rxqs[i]; for (j = 0; j < rxq->ifr_nfl; j++) { fl = rxq->ifr_fl; fl->ifl_zone = m_getzone(fl->ifl_buf_size); } } iflib_no_tx_batch = 1; break; default: break; } } static int iflib_netdump_transmit(struct ifnet *ifp, struct mbuf *m) { if_ctx_t ctx; iflib_txq_t txq; int error; ctx = if_getsoftc(ifp); if ((if_getdrvflags(ifp) & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) != IFF_DRV_RUNNING) return (EBUSY); txq = &ctx->ifc_txqs[0]; error = iflib_encap(txq, &m); if (error == 0) (void)iflib_txd_db_check(ctx, txq, true, txq->ift_in_use); return (error); } static int iflib_netdump_poll(struct ifnet *ifp, int count) { if_ctx_t ctx; if_softc_ctx_t scctx; iflib_txq_t txq; int i; ctx = if_getsoftc(ifp); scctx = &ctx->ifc_softc_ctx; if ((if_getdrvflags(ifp) & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) != IFF_DRV_RUNNING) return (EBUSY); txq = &ctx->ifc_txqs[0]; (void)iflib_completed_tx_reclaim(txq, RECLAIM_THRESH(ctx)); for (i = 0; i < scctx->isc_nrxqsets; i++) (void)iflib_rxeof(&ctx->ifc_rxqs[i], 16 /* XXX */); return (0); } #endif /* NETDUMP */