diff --git a/sys/dev/beri/virtio/network/if_vtbe.c b/sys/dev/beri/virtio/network/if_vtbe.c index 03853435a9de..5975a4a1c7f3 100644 --- a/sys/dev/beri/virtio/network/if_vtbe.c +++ b/sys/dev/beri/virtio/network/if_vtbe.c @@ -1,651 +1,651 @@ /*- * Copyright (c) 2014 Ruslan Bukin * All rights reserved. * * This software was developed by SRI International and the University of * Cambridge Computer Laboratory under DARPA/AFRL contract (FA8750-10-C-0237) * ("CTSRD"), as part of the DARPA CRASH research programme. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * BERI Virtio Networking Frontend */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "pio_if.h" #define DPRINTF(fmt, args...) printf(fmt, ##args) #define READ4(_sc, _reg) \ bus_read_4((_sc)->res[0], _reg) #define WRITE4(_sc, _reg, _val) \ bus_write_4((_sc)->res[0], _reg, _val) #define VTBE_LOCK(sc) mtx_lock(&(sc)->mtx) #define VTBE_UNLOCK(sc) mtx_unlock(&(sc)->mtx) #define VTBE_ASSERT_LOCKED(sc) mtx_assert(&(sc)->mtx, MA_OWNED); #define VTBE_ASSERT_UNLOCKED(sc) mtx_assert(&(sc)->mtx, MA_NOTOWNED); /* * Driver data and defines. 
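 *
 * Illustrative sketch only (not part of this change): the READ4()/WRITE4()
 * and VTBE_LOCK()/VTBE_UNLOCK() helpers above are meant to be used
 * together, e.g. a hypothetical register poke would look like:
 *
 *      VTBE_LOCK(sc);
 *      pfn = be32toh(READ4(sc, VIRTIO_MMIO_QUEUE_PFN));
 *      WRITE4(sc, VIRTIO_MMIO_INTERRUPT_STATUS,
 *          htobe32(VIRTIO_MMIO_INT_VRING));
 *      VTBE_UNLOCK(sc);
 *
 * The register contents are kept big-endian, hence the htobe32()/be32toh()
 * conversions seen throughout the driver.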
*/ #define DESC_COUNT 256 struct vtbe_softc { struct resource *res[2]; bus_space_tag_t bst; bus_space_handle_t bsh; device_t dev; struct ifnet *ifp; int if_flags; struct mtx mtx; boolean_t is_attached; int beri_mem_offset; device_t pio_send; device_t pio_recv; int opened; struct vqueue_info vs_queues[2]; int vs_curq; int hdrsize; }; static struct resource_spec vtbe_spec[] = { { SYS_RES_MEMORY, 0, RF_ACTIVE }, { -1, 0 } }; static void vtbe_txfinish_locked(struct vtbe_softc *sc); static void vtbe_rxfinish_locked(struct vtbe_softc *sc); static void vtbe_stop_locked(struct vtbe_softc *sc); static int pio_enable_irq(struct vtbe_softc *sc, int enable); static void vtbe_txstart_locked(struct vtbe_softc *sc) { struct iovec iov[DESC_COUNT]; struct virtio_net_hdr *vnh; struct vqueue_info *vq; struct iovec *tiov; struct ifnet *ifp; struct mbuf *m; struct uio uio; int enqueued; int iolen; int error; int reg; int len; int n; VTBE_ASSERT_LOCKED(sc); /* RX queue */ vq = &sc->vs_queues[0]; if (!vq_has_descs(vq)) { return; } ifp = sc->ifp; if (ifp->if_drv_flags & IFF_DRV_OACTIVE) { return; } enqueued = 0; if (!vq_ring_ready(vq)) return; vq->vq_save_used = be16toh(vq->vq_used->idx); for (;;) { if (!vq_has_descs(vq)) { ifp->if_drv_flags |= IFF_DRV_OACTIVE; break; } IFQ_DRV_DEQUEUE(&ifp->if_snd, m); if (m == NULL) { break; } n = vq_getchain(sc->beri_mem_offset, vq, iov, DESC_COUNT, NULL); KASSERT(n == 2, ("Unexpected amount of descriptors (%d)", n)); tiov = getcopy(iov, n); vnh = iov[0].iov_base; memset(vnh, 0, sc->hdrsize); len = iov[1].iov_len; uio.uio_resid = len; uio.uio_iov = &tiov[1]; uio.uio_segflg = UIO_SYSSPACE; uio.uio_iovcnt = 1; uio.uio_offset = 0; uio.uio_rw = UIO_READ; error = m_mbuftouio(&uio, m, 0); if (error) panic("m_mbuftouio failed\n"); iolen = (len - uio.uio_resid + sc->hdrsize); free(tiov, M_DEVBUF); vq_relchain(vq, iov, n, iolen); if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); BPF_MTAP(ifp, m); m_freem(m); ++enqueued; } if (enqueued != 0) { reg = htobe32(VIRTIO_MMIO_INT_VRING); WRITE4(sc, VIRTIO_MMIO_INTERRUPT_STATUS, reg); PIO_SET(sc->pio_send, Q_INTR, 1); } } static void vtbe_txstart(struct ifnet *ifp) { struct vtbe_softc *sc = ifp->if_softc; VTBE_LOCK(sc); vtbe_txstart_locked(sc); VTBE_UNLOCK(sc); } static void vtbe_stop_locked(struct vtbe_softc *sc) { struct ifnet *ifp; VTBE_ASSERT_LOCKED(sc); ifp = sc->ifp; ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE); } static void vtbe_init_locked(struct vtbe_softc *sc) { struct ifnet *ifp = sc->ifp; VTBE_ASSERT_LOCKED(sc); if (ifp->if_drv_flags & IFF_DRV_RUNNING) return; ifp->if_drv_flags |= IFF_DRV_RUNNING; } static void vtbe_init(void *if_softc) { struct vtbe_softc *sc = if_softc; VTBE_LOCK(sc); vtbe_init_locked(sc); VTBE_UNLOCK(sc); } static int vtbe_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct ifmediareq *ifmr; struct vtbe_softc *sc; struct ifreq *ifr; int mask, error; sc = ifp->if_softc; ifr = (struct ifreq *)data; error = 0; switch (cmd) { case SIOCSIFFLAGS: VTBE_LOCK(sc); if (ifp->if_flags & IFF_UP) { pio_enable_irq(sc, 1); if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) { vtbe_init_locked(sc); } } else { pio_enable_irq(sc, 0); if (ifp->if_drv_flags & IFF_DRV_RUNNING) { vtbe_stop_locked(sc); } } sc->if_flags = ifp->if_flags; VTBE_UNLOCK(sc); break; case SIOCADDMULTI: case SIOCDELMULTI: break; case SIOCSIFMEDIA: case SIOCGIFMEDIA: ifmr = (struct ifmediareq *)data; ifmr->ifm_count = 1; ifmr->ifm_status = (IFM_AVALID | IFM_ACTIVE); ifmr->ifm_active = (IFM_ETHER | IFM_10G_T | IFM_FDX); ifmr->ifm_current = 
ifmr->ifm_active; break; case SIOCSIFCAP: mask = ifp->if_capenable ^ ifr->ifr_reqcap; if (mask & IFCAP_VLAN_MTU) { ifp->if_capenable ^= IFCAP_VLAN_MTU; } break; case SIOCSIFADDR: pio_enable_irq(sc, 1); default: error = ether_ioctl(ifp, cmd, data); break; } return (error); } static void vtbe_txfinish_locked(struct vtbe_softc *sc) { struct ifnet *ifp; VTBE_ASSERT_LOCKED(sc); ifp = sc->ifp; } static int vq_init(struct vtbe_softc *sc) { struct vqueue_info *vq; uint8_t *base; int size; int reg; int pfn; vq = &sc->vs_queues[sc->vs_curq]; vq->vq_qsize = DESC_COUNT; reg = READ4(sc, VIRTIO_MMIO_QUEUE_PFN); pfn = be32toh(reg); vq->vq_pfn = pfn; size = vring_size(vq->vq_qsize, VRING_ALIGN); base = paddr_map(sc->beri_mem_offset, (pfn << PAGE_SHIFT), size); /* First pages are descriptors */ vq->vq_desc = (struct vring_desc *)base; base += vq->vq_qsize * sizeof(struct vring_desc); /* Then avail ring */ vq->vq_avail = (struct vring_avail *)base; base += (2 + vq->vq_qsize + 1) * sizeof(uint16_t); /* Then it's rounded up to the next page */ base = (uint8_t *)roundup2((uintptr_t)base, VRING_ALIGN); /* And the last pages are the used ring */ vq->vq_used = (struct vring_used *)base; /* Mark queue as allocated, and start at 0 when we use it. */ vq->vq_flags = VQ_ALLOC; vq->vq_last_avail = 0; return (0); } static void vtbe_proc_rx(struct vtbe_softc *sc, struct vqueue_info *vq) { struct iovec iov[DESC_COUNT]; struct iovec *tiov; struct ifnet *ifp; struct uio uio; struct mbuf *m; int iolen; int i; int n; ifp = sc->ifp; n = vq_getchain(sc->beri_mem_offset, vq, iov, DESC_COUNT, NULL); KASSERT(n >= 1 && n <= DESC_COUNT, ("wrong n %d", n)); tiov = getcopy(iov, n); iolen = 0; for (i = 1; i < n; i++) { iolen += iov[i].iov_len; } uio.uio_resid = iolen; uio.uio_iov = &tiov[1]; uio.uio_segflg = UIO_SYSSPACE; uio.uio_iovcnt = (n - 1); uio.uio_rw = UIO_WRITE; if ((m = m_uiotombuf(&uio, M_NOWAIT, 0, ETHER_ALIGN, M_PKTHDR)) == NULL) { if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); goto done; } m->m_pkthdr.rcvif = ifp; if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); CURVNET_SET(ifp->if_vnet); VTBE_UNLOCK(sc); (*ifp->if_input)(ifp, m); VTBE_LOCK(sc); CURVNET_RESTORE(); done: free(tiov, M_DEVBUF); vq_relchain(vq, iov, n, iolen + sc->hdrsize); } static void vtbe_rxfinish_locked(struct vtbe_softc *sc) { struct vqueue_info *vq; int reg; /* TX queue */ vq = &sc->vs_queues[1]; if (!vq_ring_ready(vq)) return; /* Process new descriptors */ vq->vq_save_used = be16toh(vq->vq_used->idx); while (vq_has_descs(vq)) { vtbe_proc_rx(sc, vq); } /* Interrupt the other side */ reg = htobe32(VIRTIO_MMIO_INT_VRING); WRITE4(sc, VIRTIO_MMIO_INTERRUPT_STATUS, reg); PIO_SET(sc->pio_send, Q_INTR, 1); } static void vtbe_intr(void *arg) { struct vtbe_softc *sc; int pending; uint32_t reg; sc = arg; VTBE_LOCK(sc); reg = PIO_READ(sc->pio_recv); /* Ack */ PIO_SET(sc->pio_recv, reg, 0); pending = htobe32(reg); if (pending & Q_SEL) { reg = READ4(sc, VIRTIO_MMIO_QUEUE_SEL); sc->vs_curq = be32toh(reg); } if (pending & Q_PFN) { vq_init(sc); } if (pending & Q_NOTIFY) { /* beri rx / arm tx notify */ vtbe_txfinish_locked(sc); } if (pending & Q_NOTIFY1) { vtbe_rxfinish_locked(sc); } VTBE_UNLOCK(sc); } static int vtbe_get_hwaddr(struct vtbe_softc *sc, uint8_t *hwaddr) { int rnd; /* * Generate MAC address, use 'bsd' + random 24 low-order bits. 
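 *
 * Worked example (sketch, not from this change): with rnd == 0x123456 the
 * address becomes 62:73:64:12:34:56, since 'b' == 0x62, 's' == 0x73 and
 * 'd' == 0x64, and the low 24 bits are taken from rnd:
 *
 *      hwaddr[3] = rnd >> 16;    0x12
 *      hwaddr[4] = rnd >> 8;     0x34
 *      hwaddr[5] = rnd >> 0;     0x56
 *
 * 'b' (0x62) has the multicast bit (0x01) clear and the locally
 * administered bit (0x02) set, so the result is a valid unicast,
 * locally administered address.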
*/ rnd = arc4random() & 0x00ffffff; hwaddr[0] = 'b'; hwaddr[1] = 's'; hwaddr[2] = 'd'; hwaddr[3] = rnd >> 16; hwaddr[4] = rnd >> 8; hwaddr[5] = rnd >> 0; return (0); } static int pio_enable_irq(struct vtbe_softc *sc, int enable) { /* * IRQ lines should be disabled while reprogram FPGA core. */ if (enable) { if (sc->opened == 0) { sc->opened = 1; PIO_SETUP_IRQ(sc->pio_recv, vtbe_intr, sc); } } else { if (sc->opened == 1) { PIO_TEARDOWN_IRQ(sc->pio_recv); sc->opened = 0; } } return (0); } static int vtbe_probe(device_t dev) { if (!ofw_bus_status_okay(dev)) return (ENXIO); if (!ofw_bus_is_compatible(dev, "sri-cambridge,beri-vtnet")) return (ENXIO); device_set_desc(dev, "Virtio BERI Ethernet Controller"); return (BUS_PROBE_DEFAULT); } static int vtbe_attach(device_t dev) { uint8_t macaddr[ETHER_ADDR_LEN]; struct vtbe_softc *sc; struct ifnet *ifp; int reg; sc = device_get_softc(dev); sc->dev = dev; sc->hdrsize = sizeof(struct virtio_net_hdr); if (bus_alloc_resources(dev, vtbe_spec, sc->res)) { device_printf(dev, "could not allocate resources\n"); return (ENXIO); } /* Memory interface */ sc->bst = rman_get_bustag(sc->res[0]); sc->bsh = rman_get_bushandle(sc->res[0]); mtx_init(&sc->mtx, device_get_nameunit(sc->dev), MTX_NETWORK_LOCK, MTX_DEF); if (setup_offset(dev, &sc->beri_mem_offset) != 0) return (ENXIO); if (setup_pio(dev, "pio-send", &sc->pio_send) != 0) return (ENXIO); if (setup_pio(dev, "pio-recv", &sc->pio_recv) != 0) return (ENXIO); /* Setup MMIO */ /* Specify that we provide network device */ reg = htobe32(VIRTIO_ID_NETWORK); WRITE4(sc, VIRTIO_MMIO_DEVICE_ID, reg); /* The number of desc we support */ reg = htobe32(DESC_COUNT); WRITE4(sc, VIRTIO_MMIO_QUEUE_NUM_MAX, reg); /* Our features */ reg = htobe32(VIRTIO_NET_F_MAC | VIRTIO_F_NOTIFY_ON_EMPTY); WRITE4(sc, VIRTIO_MMIO_HOST_FEATURES, reg); /* Get MAC */ if (vtbe_get_hwaddr(sc, macaddr)) { device_printf(sc->dev, "can't get mac\n"); return (ENXIO); } /* Set up the ethernet interface. */ sc->ifp = ifp = if_alloc(IFT_ETHER); ifp->if_baudrate = IF_Gbps(10); ifp->if_softc = sc; if_initname(ifp, device_get_name(dev), device_get_unit(dev)); ifp->if_flags = (IFF_BROADCAST | IFF_SIMPLEX | - IFF_MULTICAST | IFF_PROMISC | IFF_NEEDSEPOCH); + IFF_MULTICAST | IFF_PROMISC); ifp->if_capabilities = IFCAP_VLAN_MTU; ifp->if_capenable = ifp->if_capabilities; ifp->if_start = vtbe_txstart; ifp->if_ioctl = vtbe_ioctl; ifp->if_init = vtbe_init; IFQ_SET_MAXLEN(&ifp->if_snd, DESC_COUNT - 1); ifp->if_snd.ifq_drv_maxlen = DESC_COUNT - 1; IFQ_SET_READY(&ifp->if_snd); ifp->if_hdrlen = sizeof(struct ether_vlan_header); /* All ready to run, attach the ethernet interface. */ ether_ifattach(ifp, macaddr); sc->is_attached = true; return (0); } static device_method_t vtbe_methods[] = { DEVMETHOD(device_probe, vtbe_probe), DEVMETHOD(device_attach, vtbe_attach), { 0, 0 } }; static driver_t vtbe_driver = { "vtbe", vtbe_methods, sizeof(struct vtbe_softc), }; static devclass_t vtbe_devclass; DRIVER_MODULE(vtbe, simplebus, vtbe_driver, vtbe_devclass, 0, 0); MODULE_DEPEND(vtbe, ether, 1, 1, 1); diff --git a/sys/dev/dpaa/if_dtsec.c b/sys/dev/dpaa/if_dtsec.c index 704aa22eda54..2c6291b07e34 100644 --- a/sys/dev/dpaa/if_dtsec.c +++ b/sys/dev/dpaa/if_dtsec.c @@ -1,856 +1,856 @@ /*- * Copyright (c) 2011-2012 Semihalf. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "miibus_if.h" #include #include #include #include #include "fman.h" #include "if_dtsec.h" #include "if_dtsec_im.h" #include "if_dtsec_rm.h" #define DTSEC_MIN_FRAME_SIZE 64 #define DTSEC_MAX_FRAME_SIZE 9600 #define DTSEC_REG_MAXFRM 0x110 /** * @group dTSEC private defines. * @{ */ /** * dTSEC FMan MAC exceptions info struct. */ struct dtsec_fm_mac_ex_str { const int num; const char *str; }; /** @} */ /** * @group FMan MAC routines. * @{ */ #define DTSEC_MAC_EXCEPTIONS_END (-1) /** * FMan MAC exceptions. 
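 *
 * The table below is a plain linear-search map terminated by the
 * DTSEC_MAC_EXCEPTIONS_END sentinel; dtsec_fm_mac_ex_to_str() walks it and
 * returns an empty string for unknown exception numbers, so callers can
 * print unconditionally, as the MDIO/MAC exception callbacks below do:
 *
 *      device_printf(sc->sc_dev, "MAC exception %i: %s.\n",
 *          exception, dtsec_fm_mac_ex_to_str(exception));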
*/ static const struct dtsec_fm_mac_ex_str dtsec_fm_mac_exceptions[] = { { e_FM_MAC_EX_10G_MDIO_SCAN_EVENTMDIO, "MDIO scan event" }, { e_FM_MAC_EX_10G_MDIO_CMD_CMPL, "MDIO command completion" }, { e_FM_MAC_EX_10G_REM_FAULT, "Remote fault" }, { e_FM_MAC_EX_10G_LOC_FAULT, "Local fault" }, { e_FM_MAC_EX_10G_1TX_ECC_ER, "Transmit frame ECC error" }, { e_FM_MAC_EX_10G_TX_FIFO_UNFL, "Transmit FIFO underflow" }, { e_FM_MAC_EX_10G_TX_FIFO_OVFL, "Receive FIFO overflow" }, { e_FM_MAC_EX_10G_TX_ER, "Transmit frame error" }, { e_FM_MAC_EX_10G_RX_FIFO_OVFL, "Receive FIFO overflow" }, { e_FM_MAC_EX_10G_RX_ECC_ER, "Receive frame ECC error" }, { e_FM_MAC_EX_10G_RX_JAB_FRM, "Receive jabber frame" }, { e_FM_MAC_EX_10G_RX_OVRSZ_FRM, "Receive oversized frame" }, { e_FM_MAC_EX_10G_RX_RUNT_FRM, "Receive runt frame" }, { e_FM_MAC_EX_10G_RX_FRAG_FRM, "Receive fragment frame" }, { e_FM_MAC_EX_10G_RX_LEN_ER, "Receive payload length error" }, { e_FM_MAC_EX_10G_RX_CRC_ER, "Receive CRC error" }, { e_FM_MAC_EX_10G_RX_ALIGN_ER, "Receive alignment error" }, { e_FM_MAC_EX_1G_BAB_RX, "Babbling receive error" }, { e_FM_MAC_EX_1G_RX_CTL, "Receive control (pause frame) interrupt" }, { e_FM_MAC_EX_1G_GRATEFUL_TX_STP_COMPLET, "Graceful transmit stop " "complete" }, { e_FM_MAC_EX_1G_BAB_TX, "Babbling transmit error" }, { e_FM_MAC_EX_1G_TX_CTL, "Transmit control (pause frame) interrupt" }, { e_FM_MAC_EX_1G_TX_ERR, "Transmit error" }, { e_FM_MAC_EX_1G_LATE_COL, "Late collision" }, { e_FM_MAC_EX_1G_COL_RET_LMT, "Collision retry limit" }, { e_FM_MAC_EX_1G_TX_FIFO_UNDRN, "Transmit FIFO underrun" }, { e_FM_MAC_EX_1G_MAG_PCKT, "Magic Packet detected when dTSEC is in " "Magic Packet detection mode" }, { e_FM_MAC_EX_1G_MII_MNG_RD_COMPLET, "MII management read completion" }, { e_FM_MAC_EX_1G_MII_MNG_WR_COMPLET, "MII management write completion" }, { e_FM_MAC_EX_1G_GRATEFUL_RX_STP_COMPLET, "Graceful receive stop " "complete" }, { e_FM_MAC_EX_1G_TX_DATA_ERR, "Internal data error on transmit" }, { e_FM_MAC_EX_1G_RX_DATA_ERR, "Internal data error on receive" }, { e_FM_MAC_EX_1G_1588_TS_RX_ERR, "Time-Stamp Receive Error" }, { e_FM_MAC_EX_1G_RX_MIB_CNT_OVFL, "MIB counter overflow" }, { DTSEC_MAC_EXCEPTIONS_END, "" } }; static const char * dtsec_fm_mac_ex_to_str(e_FmMacExceptions exception) { int i; for (i = 0; dtsec_fm_mac_exceptions[i].num != exception && dtsec_fm_mac_exceptions[i].num != DTSEC_MAC_EXCEPTIONS_END; ++i) ; if (dtsec_fm_mac_exceptions[i].num == DTSEC_MAC_EXCEPTIONS_END) return (""); return (dtsec_fm_mac_exceptions[i].str); } static void dtsec_fm_mac_mdio_event_callback(t_Handle h_App, e_FmMacExceptions exception) { struct dtsec_softc *sc; sc = h_App; device_printf(sc->sc_dev, "MDIO event %i: %s.\n", exception, dtsec_fm_mac_ex_to_str(exception)); } static void dtsec_fm_mac_exception_callback(t_Handle app, e_FmMacExceptions exception) { struct dtsec_softc *sc; sc = app; device_printf(sc->sc_dev, "MAC exception %i: %s.\n", exception, dtsec_fm_mac_ex_to_str(exception)); } static void dtsec_fm_mac_free(struct dtsec_softc *sc) { if (sc->sc_mach == NULL) return; FM_MAC_Disable(sc->sc_mach, e_COMM_MODE_RX_AND_TX); FM_MAC_Free(sc->sc_mach); sc->sc_mach = NULL; } static int dtsec_fm_mac_init(struct dtsec_softc *sc, uint8_t *mac) { t_FmMacParams params; t_Error error; memset(¶ms, 0, sizeof(params)); memcpy(¶ms.addr, mac, sizeof(params.addr)); params.baseAddr = rman_get_bushandle(sc->sc_mem); params.enetMode = sc->sc_mac_enet_mode; params.macId = sc->sc_eth_id; params.mdioIrq = sc->sc_mac_mdio_irq; params.f_Event = 
dtsec_fm_mac_mdio_event_callback; params.f_Exception = dtsec_fm_mac_exception_callback; params.h_App = sc; params.h_Fm = sc->sc_fmh; sc->sc_mach = FM_MAC_Config(¶ms); if (sc->sc_mach == NULL) { device_printf(sc->sc_dev, "couldn't configure FM_MAC module.\n" ); return (ENXIO); } error = FM_MAC_ConfigResetOnInit(sc->sc_mach, TRUE); if (error != E_OK) { device_printf(sc->sc_dev, "couldn't enable reset on init " "feature.\n"); dtsec_fm_mac_free(sc); return (ENXIO); } /* Do not inform about pause frames */ error = FM_MAC_ConfigException(sc->sc_mach, e_FM_MAC_EX_1G_RX_CTL, FALSE); if (error != E_OK) { device_printf(sc->sc_dev, "couldn't disable pause frames " "exception.\n"); dtsec_fm_mac_free(sc); return (ENXIO); } error = FM_MAC_Init(sc->sc_mach); if (error != E_OK) { device_printf(sc->sc_dev, "couldn't initialize FM_MAC module." "\n"); dtsec_fm_mac_free(sc); return (ENXIO); } return (0); } /** @} */ /** * @group FMan PORT routines. * @{ */ static const char * dtsec_fm_port_ex_to_str(e_FmPortExceptions exception) { switch (exception) { case e_FM_PORT_EXCEPTION_IM_BUSY: return ("IM: RX busy"); default: return (""); } } void dtsec_fm_port_rx_exception_callback(t_Handle app, e_FmPortExceptions exception) { struct dtsec_softc *sc; sc = app; device_printf(sc->sc_dev, "RX exception: %i: %s.\n", exception, dtsec_fm_port_ex_to_str(exception)); } void dtsec_fm_port_tx_exception_callback(t_Handle app, e_FmPortExceptions exception) { struct dtsec_softc *sc; sc = app; device_printf(sc->sc_dev, "TX exception: %i: %s.\n", exception, dtsec_fm_port_ex_to_str(exception)); } e_FmPortType dtsec_fm_port_rx_type(enum eth_dev_type type) { switch (type) { case ETH_DTSEC: return (e_FM_PORT_TYPE_RX); case ETH_10GSEC: return (e_FM_PORT_TYPE_RX_10G); default: return (e_FM_PORT_TYPE_DUMMY); } } e_FmPortType dtsec_fm_port_tx_type(enum eth_dev_type type) { switch (type) { case ETH_DTSEC: return (e_FM_PORT_TYPE_TX); case ETH_10GSEC: return (e_FM_PORT_TYPE_TX_10G); default: return (e_FM_PORT_TYPE_DUMMY); } } static void dtsec_fm_port_free_both(struct dtsec_softc *sc) { if (sc->sc_rxph) { FM_PORT_Free(sc->sc_rxph); sc->sc_rxph = NULL; } if (sc->sc_txph) { FM_PORT_Free(sc->sc_txph); sc->sc_txph = NULL; } } /** @} */ /** * @group IFnet routines. 
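 *
 * dtsec_set_mtu() below converts the requested MTU into a maximum frame
 * size before programming DTSEC_REG_MAXFRM.  Worked example: a 1500 byte
 * MTU becomes 1500 + 14 (Ethernet header) + 4 (VLAN tag) + 4 (CRC) = 1522
 * bytes, well inside the accepted 64..9600 byte range.  Sketch mirroring
 * the SIOCSIFMTU handler (values for illustration only):
 *
 *      DTSEC_LOCK(sc);
 *      if (dtsec_set_mtu(sc, ETHERMTU))
 *              ifp->if_mtu = ETHERMTU;
 *      DTSEC_UNLOCK(sc);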
* @{ */ static int dtsec_set_mtu(struct dtsec_softc *sc, unsigned int mtu) { mtu += ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + ETHER_CRC_LEN; DTSEC_LOCK_ASSERT(sc); if (mtu >= DTSEC_MIN_FRAME_SIZE && mtu <= DTSEC_MAX_FRAME_SIZE) { bus_write_4(sc->sc_mem, DTSEC_REG_MAXFRM, mtu); return (mtu); } return (0); } static int dtsec_if_enable_locked(struct dtsec_softc *sc) { int error; DTSEC_LOCK_ASSERT(sc); error = FM_MAC_Enable(sc->sc_mach, e_COMM_MODE_RX_AND_TX); if (error != E_OK) return (EIO); error = FM_PORT_Enable(sc->sc_rxph); if (error != E_OK) return (EIO); error = FM_PORT_Enable(sc->sc_txph); if (error != E_OK) return (EIO); sc->sc_ifnet->if_drv_flags |= IFF_DRV_RUNNING; /* Refresh link state */ dtsec_miibus_statchg(sc->sc_dev); return (0); } static int dtsec_if_disable_locked(struct dtsec_softc *sc) { int error; DTSEC_LOCK_ASSERT(sc); error = FM_MAC_Disable(sc->sc_mach, e_COMM_MODE_RX_AND_TX); if (error != E_OK) return (EIO); error = FM_PORT_Disable(sc->sc_rxph); if (error != E_OK) return (EIO); error = FM_PORT_Disable(sc->sc_txph); if (error != E_OK) return (EIO); sc->sc_ifnet->if_drv_flags &= ~IFF_DRV_RUNNING; return (0); } static int dtsec_if_ioctl(struct ifnet *ifp, u_long command, caddr_t data) { struct dtsec_softc *sc; struct ifreq *ifr; int error; sc = ifp->if_softc; ifr = (struct ifreq *)data; error = 0; /* Basic functionality to achieve media status reports */ switch (command) { case SIOCSIFMTU: DTSEC_LOCK(sc); if (dtsec_set_mtu(sc, ifr->ifr_mtu)) ifp->if_mtu = ifr->ifr_mtu; else error = EINVAL; DTSEC_UNLOCK(sc); break; case SIOCSIFFLAGS: DTSEC_LOCK(sc); if (sc->sc_ifnet->if_flags & IFF_UP) error = dtsec_if_enable_locked(sc); else error = dtsec_if_disable_locked(sc); DTSEC_UNLOCK(sc); break; case SIOCGIFMEDIA: case SIOCSIFMEDIA: error = ifmedia_ioctl(ifp, ifr, &sc->sc_mii->mii_media, command); break; default: error = ether_ioctl(ifp, command, data); } return (error); } static void dtsec_if_tick(void *arg) { struct dtsec_softc *sc; sc = arg; /* TODO */ DTSEC_LOCK(sc); mii_tick(sc->sc_mii); callout_reset(&sc->sc_tick_callout, hz, dtsec_if_tick, sc); DTSEC_UNLOCK(sc); } static void dtsec_if_deinit_locked(struct dtsec_softc *sc) { DTSEC_LOCK_ASSERT(sc); DTSEC_UNLOCK(sc); callout_drain(&sc->sc_tick_callout); DTSEC_LOCK(sc); } static void dtsec_if_init_locked(struct dtsec_softc *sc) { int error; DTSEC_LOCK_ASSERT(sc); /* Set MAC address */ error = FM_MAC_ModifyMacAddr(sc->sc_mach, (t_EnetAddr *)IF_LLADDR(sc->sc_ifnet)); if (error != E_OK) { device_printf(sc->sc_dev, "couldn't set MAC address.\n"); goto err; } /* Start MII polling */ if (sc->sc_mii) callout_reset(&sc->sc_tick_callout, hz, dtsec_if_tick, sc); if (sc->sc_ifnet->if_flags & IFF_UP) { error = dtsec_if_enable_locked(sc); if (error != 0) goto err; } else { error = dtsec_if_disable_locked(sc); if (error != 0) goto err; } return; err: dtsec_if_deinit_locked(sc); device_printf(sc->sc_dev, "initialization error.\n"); return; } static void dtsec_if_init(void *data) { struct dtsec_softc *sc; sc = data; DTSEC_LOCK(sc); dtsec_if_init_locked(sc); DTSEC_UNLOCK(sc); } static void dtsec_if_start(struct ifnet *ifp) { struct dtsec_softc *sc; sc = ifp->if_softc; DTSEC_LOCK(sc); sc->sc_start_locked(sc); DTSEC_UNLOCK(sc); } static void dtsec_if_watchdog(struct ifnet *ifp) { /* TODO */ } /** @} */ /** * @group IFmedia routines. 
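 *
 * The two handlers below implement the usual ifmedia/mii split:
 * dtsec_ifmedia_upd() pushes a media change to the PHY via mii_mediachg(),
 * while dtsec_ifmedia_sts() refreshes and reports link state via
 * mii_pollstat().  Both take the softc lock, e.g. (sketch):
 *
 *      DTSEC_LOCK(sc);
 *      mii_pollstat(sc->sc_mii);
 *      ifmr->ifm_active = sc->sc_mii->mii_media_active;
 *      ifmr->ifm_status = sc->sc_mii->mii_media_status;
 *      DTSEC_UNLOCK(sc);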
* @{ */ static int dtsec_ifmedia_upd(struct ifnet *ifp) { struct dtsec_softc *sc = ifp->if_softc; DTSEC_LOCK(sc); mii_mediachg(sc->sc_mii); DTSEC_UNLOCK(sc); return (0); } static void dtsec_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr) { struct dtsec_softc *sc = ifp->if_softc; DTSEC_LOCK(sc); mii_pollstat(sc->sc_mii); ifmr->ifm_active = sc->sc_mii->mii_media_active; ifmr->ifm_status = sc->sc_mii->mii_media_status; DTSEC_UNLOCK(sc); } /** @} */ /** * @group dTSEC bus interface. * @{ */ static void dtsec_configure_mode(struct dtsec_softc *sc) { char tunable[64]; snprintf(tunable, sizeof(tunable), "%s.independent_mode", device_get_nameunit(sc->sc_dev)); sc->sc_mode = DTSEC_MODE_REGULAR; TUNABLE_INT_FETCH(tunable, &sc->sc_mode); if (sc->sc_mode == DTSEC_MODE_REGULAR) { sc->sc_port_rx_init = dtsec_rm_fm_port_rx_init; sc->sc_port_tx_init = dtsec_rm_fm_port_tx_init; sc->sc_start_locked = dtsec_rm_if_start_locked; } else { sc->sc_port_rx_init = dtsec_im_fm_port_rx_init; sc->sc_port_tx_init = dtsec_im_fm_port_tx_init; sc->sc_start_locked = dtsec_im_if_start_locked; } device_printf(sc->sc_dev, "Configured for %s mode.\n", (sc->sc_mode == DTSEC_MODE_REGULAR) ? "regular" : "independent"); } int dtsec_attach(device_t dev) { struct dtsec_softc *sc; device_t parent; int error; struct ifnet *ifp; sc = device_get_softc(dev); parent = device_get_parent(dev); sc->sc_dev = dev; sc->sc_mac_mdio_irq = NO_IRQ; /* Check if MallocSmart allocator is ready */ if (XX_MallocSmartInit() != E_OK) return (ENXIO); /* Init locks */ mtx_init(&sc->sc_lock, device_get_nameunit(dev), "DTSEC Global Lock", MTX_DEF); mtx_init(&sc->sc_mii_lock, device_get_nameunit(dev), "DTSEC MII Lock", MTX_DEF); /* Init callouts */ callout_init(&sc->sc_tick_callout, CALLOUT_MPSAFE); /* Read configuraton */ if ((error = fman_get_handle(parent, &sc->sc_fmh)) != 0) return (error); if ((error = fman_get_muram_handle(parent, &sc->sc_muramh)) != 0) return (error); if ((error = fman_get_bushandle(parent, &sc->sc_fm_base)) != 0) return (error); /* Configure working mode */ dtsec_configure_mode(sc); /* If we are working in regular mode configure BMAN and QMAN */ if (sc->sc_mode == DTSEC_MODE_REGULAR) { /* Create RX buffer pool */ error = dtsec_rm_pool_rx_init(sc); if (error != 0) return (EIO); /* Create RX frame queue range */ error = dtsec_rm_fqr_rx_init(sc); if (error != 0) return (EIO); /* Create frame info pool */ error = dtsec_rm_fi_pool_init(sc); if (error != 0) return (EIO); /* Create TX frame queue range */ error = dtsec_rm_fqr_tx_init(sc); if (error != 0) return (EIO); } /* Init FMan MAC module. 
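 *
 * From this point on every failed initialization step unwinds through
 * dtsec_detach() instead of returning directly, so partially constructed
 * state (MAC handle, ports, ifnet) is always released.  The pattern used
 * below is:
 *
 *      error = dtsec_fm_mac_init(sc, sc->sc_mac_addr);
 *      if (error != 0) {
 *              dtsec_detach(dev);
 *              return (ENXIO);
 *      }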
*/ error = dtsec_fm_mac_init(sc, sc->sc_mac_addr); if (error != 0) { dtsec_detach(dev); return (ENXIO); } /* Init FMan TX port */ error = sc->sc_port_tx_init(sc, device_get_unit(sc->sc_dev)); if (error != 0) { dtsec_detach(dev); return (ENXIO); } /* Init FMan RX port */ error = sc->sc_port_rx_init(sc, device_get_unit(sc->sc_dev)); if (error != 0) { dtsec_detach(dev); return (ENXIO); } /* Create network interface for upper layers */ ifp = sc->sc_ifnet = if_alloc(IFT_ETHER); if (ifp == NULL) { device_printf(sc->sc_dev, "if_alloc() failed.\n"); dtsec_detach(dev); return (ENOMEM); } ifp->if_softc = sc; ifp->if_mtu = ETHERMTU; /* TODO: Configure */ - ifp->if_flags = IFF_SIMPLEX | IFF_BROADCAST | IFF_NEEDSEPOCH; + ifp->if_flags = IFF_SIMPLEX | IFF_BROADCAST; ifp->if_init = dtsec_if_init; ifp->if_start = dtsec_if_start; ifp->if_ioctl = dtsec_if_ioctl; ifp->if_snd.ifq_maxlen = IFQ_MAXLEN; if (sc->sc_phy_addr >= 0) if_initname(ifp, device_get_name(sc->sc_dev), device_get_unit(sc->sc_dev)); else if_initname(ifp, "dtsec_phy", device_get_unit(sc->sc_dev)); /* TODO */ #if 0 IFQ_SET_MAXLEN(&ifp->if_snd, TSEC_TX_NUM_DESC - 1); ifp->if_snd.ifq_drv_maxlen = TSEC_TX_NUM_DESC - 1; IFQ_SET_READY(&ifp->if_snd); #endif ifp->if_capabilities = IFCAP_JUMBO_MTU; /* TODO: HWCSUM */ ifp->if_capenable = ifp->if_capabilities; /* Attach PHY(s) */ error = mii_attach(sc->sc_dev, &sc->sc_mii_dev, ifp, dtsec_ifmedia_upd, dtsec_ifmedia_sts, BMSR_DEFCAPMASK, sc->sc_phy_addr, MII_OFFSET_ANY, 0); if (error) { device_printf(sc->sc_dev, "attaching PHYs failed: %d\n", error); dtsec_detach(sc->sc_dev); return (error); } sc->sc_mii = device_get_softc(sc->sc_mii_dev); /* Attach to stack */ ether_ifattach(ifp, sc->sc_mac_addr); return (0); } int dtsec_detach(device_t dev) { struct dtsec_softc *sc; if_t ifp; sc = device_get_softc(dev); ifp = sc->sc_ifnet; if (device_is_attached(dev)) { ether_ifdetach(ifp); /* Shutdown interface */ DTSEC_LOCK(sc); dtsec_if_deinit_locked(sc); DTSEC_UNLOCK(sc); } if (sc->sc_ifnet) { if_free(sc->sc_ifnet); sc->sc_ifnet = NULL; } if (sc->sc_mode == DTSEC_MODE_REGULAR) { /* Free RX/TX FQRs */ dtsec_rm_fqr_rx_free(sc); dtsec_rm_fqr_tx_free(sc); /* Free frame info pool */ dtsec_rm_fi_pool_free(sc); /* Free RX buffer pool */ dtsec_rm_pool_rx_free(sc); } dtsec_fm_mac_free(sc); dtsec_fm_port_free_both(sc); /* Destroy lock */ mtx_destroy(&sc->sc_lock); return (0); } int dtsec_suspend(device_t dev) { return (0); } int dtsec_resume(device_t dev) { return (0); } int dtsec_shutdown(device_t dev) { return (0); } /** @} */ /** * @group MII bus interface. 
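 *
 * The MII accessors below simply forward to the attached MDIO device via
 * MIIBUS_READREG()/MIIBUS_WRITEREG(), while dtsec_miibus_statchg()
 * translates the negotiated media word into the Frame Manager's
 * e_EnetSpeed for FM_MAC_AdjustLink(): IFM_1000_T/IFM_1000_SX map to
 * e_ENET_SPEED_1000, IFM_100_TX to e_ENET_SPEED_100 and everything else
 * to e_ENET_SPEED_10; duplex comes from the IFM_FDX bit:
 *
 *      duplex = ((sc->sc_mii->mii_media_active & IFM_GMASK) == IFM_FDX);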
* @{ */ int dtsec_miibus_readreg(device_t dev, int phy, int reg) { struct dtsec_softc *sc; sc = device_get_softc(dev); return (MIIBUS_READREG(sc->sc_mdio, phy, reg)); } int dtsec_miibus_writereg(device_t dev, int phy, int reg, int value) { struct dtsec_softc *sc; sc = device_get_softc(dev); return (MIIBUS_WRITEREG(sc->sc_mdio, phy, reg, value)); } void dtsec_miibus_statchg(device_t dev) { struct dtsec_softc *sc; e_EnetSpeed speed; bool duplex; int error; sc = device_get_softc(dev); DTSEC_LOCK_ASSERT(sc); duplex = ((sc->sc_mii->mii_media_active & IFM_GMASK) == IFM_FDX); switch (IFM_SUBTYPE(sc->sc_mii->mii_media_active)) { case IFM_1000_T: case IFM_1000_SX: speed = e_ENET_SPEED_1000; break; case IFM_100_TX: speed = e_ENET_SPEED_100; break; case IFM_10_T: speed = e_ENET_SPEED_10; break; default: speed = e_ENET_SPEED_10; } error = FM_MAC_AdjustLink(sc->sc_mach, speed, duplex); if (error != E_OK) device_printf(sc->sc_dev, "error while adjusting MAC speed.\n"); } /** @} */ diff --git a/sys/dev/hyperv/netvsc/if_hn.c b/sys/dev/hyperv/netvsc/if_hn.c index bc96775ad553..e9d1b9439671 100644 --- a/sys/dev/hyperv/netvsc/if_hn.c +++ b/sys/dev/hyperv/netvsc/if_hn.c @@ -1,7575 +1,7574 @@ /*- * Copyright (c) 2010-2012 Citrix Inc. * Copyright (c) 2009-2012,2016-2017 Microsoft Corp. * Copyright (c) 2012 NetApp Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /*- * Copyright (c) 2004-2006 Kip Macy * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_hn.h" #include "opt_inet6.h" #include "opt_inet.h" #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef RSS #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "vmbus_if.h" #define HN_IFSTART_SUPPORT #define HN_RING_CNT_DEF_MAX 8 #define HN_VFMAP_SIZE_DEF 8 #define HN_XPNT_VF_ATTWAIT_MIN 2 /* seconds */ /* YYY should get it from the underlying channel */ #define HN_TX_DESC_CNT 512 #define HN_RNDIS_PKT_LEN \ (sizeof(struct rndis_packet_msg) + \ HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) + \ HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) + \ HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) + \ HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE)) #define HN_RNDIS_PKT_BOUNDARY PAGE_SIZE #define HN_RNDIS_PKT_ALIGN CACHE_LINE_SIZE #define HN_TX_DATA_BOUNDARY PAGE_SIZE #define HN_TX_DATA_MAXSIZE IP_MAXPACKET #define HN_TX_DATA_SEGSIZE PAGE_SIZE /* -1 for RNDIS packet message */ #define HN_TX_DATA_SEGCNT_MAX (HN_GPACNT_MAX - 1) #define HN_DIRECT_TX_SIZE_DEF 128 #define HN_EARLY_TXEOF_THRESH 8 #define HN_PKTBUF_LEN_DEF (16 * 1024) #define HN_LROENT_CNT_DEF 128 #define HN_LRO_LENLIM_MULTIRX_DEF (12 * ETHERMTU) #define HN_LRO_LENLIM_DEF (25 * ETHERMTU) /* YYY 2*MTU is a bit rough, but should be good enough. */ #define HN_LRO_LENLIM_MIN(ifp) (2 * (ifp)->if_mtu) #define HN_LRO_ACKCNT_DEF 1 #define HN_LOCK_INIT(sc) \ sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev)) #define HN_LOCK_DESTROY(sc) sx_destroy(&(sc)->hn_lock) #define HN_LOCK_ASSERT(sc) sx_assert(&(sc)->hn_lock, SA_XLOCKED) #define HN_LOCK(sc) \ do { \ while (sx_try_xlock(&(sc)->hn_lock) == 0) \ DELAY(1000); \ } while (0) #define HN_UNLOCK(sc) sx_xunlock(&(sc)->hn_lock) #define HN_CSUM_IP_MASK (CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP) #define HN_CSUM_IP6_MASK (CSUM_IP6_TCP | CSUM_IP6_UDP) #define HN_CSUM_IP_HWASSIST(sc) \ ((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK) #define HN_CSUM_IP6_HWASSIST(sc) \ ((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK) #define HN_PKTSIZE_MIN(align) \ roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \ HN_RNDIS_PKT_LEN, (align)) #define HN_PKTSIZE(m, align) \ roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align)) #ifdef RSS #define HN_RING_IDX2CPU(sc, idx) rss_getcpu((idx) % rss_getnumbuckets()) #else #define HN_RING_IDX2CPU(sc, idx) (((sc)->hn_cpu + (idx)) % mp_ncpus) #endif struct hn_txdesc { #ifndef HN_USE_TXDESC_BUFRING SLIST_ENTRY(hn_txdesc) link; #endif STAILQ_ENTRY(hn_txdesc) agg_link; /* Aggregated txdescs, in sending order. */ STAILQ_HEAD(, hn_txdesc) agg_list; /* The oldest packet, if transmission aggregation happens. 
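 *
 * Aggregation sketch (a reading of the fields below, not new code in this
 * change): the head txdesc of an aggregated batch owns agg_list, later
 * txdescs are linked in through agg_link in sending order and marked with
 * HN_TXD_FLAG_ONAGG, and only the head is handed to the channel.  A
 * hypothetical walk over a batch, with tmp_txd and agg_cnt being local
 * illustration-only variables:
 *
 *      struct hn_txdesc *tmp_txd;
 *      int agg_cnt = 0;
 *
 *      STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link)
 *              agg_cnt++;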
*/ struct mbuf *m; struct hn_tx_ring *txr; int refs; uint32_t flags; /* HN_TXD_FLAG_ */ struct hn_nvs_sendctx send_ctx; uint32_t chim_index; int chim_size; bus_dmamap_t data_dmap; bus_addr_t rndis_pkt_paddr; struct rndis_packet_msg *rndis_pkt; bus_dmamap_t rndis_pkt_dmap; }; #define HN_TXD_FLAG_ONLIST 0x0001 #define HN_TXD_FLAG_DMAMAP 0x0002 #define HN_TXD_FLAG_ONAGG 0x0004 struct hn_rxinfo { uint32_t vlan_info; uint32_t csum_info; uint32_t hash_info; uint32_t hash_value; }; struct hn_rxvf_setarg { struct hn_rx_ring *rxr; struct ifnet *vf_ifp; }; #define HN_RXINFO_VLAN 0x0001 #define HN_RXINFO_CSUM 0x0002 #define HN_RXINFO_HASHINF 0x0004 #define HN_RXINFO_HASHVAL 0x0008 #define HN_RXINFO_ALL \ (HN_RXINFO_VLAN | \ HN_RXINFO_CSUM | \ HN_RXINFO_HASHINF | \ HN_RXINFO_HASHVAL) #define HN_NDIS_VLAN_INFO_INVALID 0xffffffff #define HN_NDIS_RXCSUM_INFO_INVALID 0 #define HN_NDIS_HASH_INFO_INVALID 0 static int hn_probe(device_t); static int hn_attach(device_t); static int hn_detach(device_t); static int hn_shutdown(device_t); static void hn_chan_callback(struct vmbus_channel *, void *); static void hn_init(void *); static int hn_ioctl(struct ifnet *, u_long, caddr_t); #ifdef HN_IFSTART_SUPPORT static void hn_start(struct ifnet *); #endif static int hn_transmit(struct ifnet *, struct mbuf *); static void hn_xmit_qflush(struct ifnet *); static int hn_ifmedia_upd(struct ifnet *); static void hn_ifmedia_sts(struct ifnet *, struct ifmediareq *); static void hn_ifnet_event(void *, struct ifnet *, int); static void hn_ifaddr_event(void *, struct ifnet *); static void hn_ifnet_attevent(void *, struct ifnet *); static void hn_ifnet_detevent(void *, struct ifnet *); static void hn_ifnet_lnkevent(void *, struct ifnet *, int); static bool hn_ismyvf(const struct hn_softc *, const struct ifnet *); static void hn_rxvf_change(struct hn_softc *, struct ifnet *, bool); static void hn_rxvf_set(struct hn_softc *, struct ifnet *); static void hn_rxvf_set_task(void *, int); static void hn_xpnt_vf_input(struct ifnet *, struct mbuf *); static int hn_xpnt_vf_iocsetflags(struct hn_softc *); static int hn_xpnt_vf_iocsetcaps(struct hn_softc *, struct ifreq *); static void hn_xpnt_vf_saveifflags(struct hn_softc *); static bool hn_xpnt_vf_isready(struct hn_softc *); static void hn_xpnt_vf_setready(struct hn_softc *); static void hn_xpnt_vf_init_taskfunc(void *, int); static void hn_xpnt_vf_init(struct hn_softc *); static void hn_xpnt_vf_setenable(struct hn_softc *); static void hn_xpnt_vf_setdisable(struct hn_softc *, bool); static void hn_vf_rss_fixup(struct hn_softc *, bool); static void hn_vf_rss_restore(struct hn_softc *); static int hn_rndis_rxinfo(const void *, int, struct hn_rxinfo *); static void hn_rndis_rx_data(struct hn_rx_ring *, const void *, int); static void hn_rndis_rx_status(struct hn_softc *, const void *, int); static void hn_rndis_init_fixat(struct hn_softc *, int); static void hn_nvs_handle_notify(struct hn_softc *, const struct vmbus_chanpkt_hdr *); static void hn_nvs_handle_comp(struct hn_softc *, struct vmbus_channel *, const struct vmbus_chanpkt_hdr *); static void hn_nvs_handle_rxbuf(struct hn_rx_ring *, struct vmbus_channel *, const struct vmbus_chanpkt_hdr *); static void hn_nvs_ack_rxbuf(struct hn_rx_ring *, struct vmbus_channel *, uint64_t); #if __FreeBSD_version >= 1100099 static int hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS); static int hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS); #endif static int hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS); static int hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS); #if 
__FreeBSD_version < 1100095 static int hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS); #else static int hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS); #endif static int hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS); static int hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS); static int hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS); static int hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS); static int hn_caps_sysctl(SYSCTL_HANDLER_ARGS); static int hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS); static int hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS); #ifndef RSS static int hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS); static int hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS); #endif static int hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS); static int hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS); static int hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS); static int hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS); static int hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS); static int hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS); static int hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS); static int hn_polling_sysctl(SYSCTL_HANDLER_ARGS); static int hn_vf_sysctl(SYSCTL_HANDLER_ARGS); static int hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS); static int hn_vflist_sysctl(SYSCTL_HANDLER_ARGS); static int hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS); static int hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS); static int hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS); static void hn_stop(struct hn_softc *, bool); static void hn_init_locked(struct hn_softc *); static int hn_chan_attach(struct hn_softc *, struct vmbus_channel *); static void hn_chan_detach(struct hn_softc *, struct vmbus_channel *); static int hn_attach_subchans(struct hn_softc *); static void hn_detach_allchans(struct hn_softc *); static void hn_chan_rollup(struct hn_rx_ring *, struct hn_tx_ring *); static void hn_set_ring_inuse(struct hn_softc *, int); static int hn_synth_attach(struct hn_softc *, int); static void hn_synth_detach(struct hn_softc *); static int hn_synth_alloc_subchans(struct hn_softc *, int *); static bool hn_synth_attachable(const struct hn_softc *); static void hn_suspend(struct hn_softc *); static void hn_suspend_data(struct hn_softc *); static void hn_suspend_mgmt(struct hn_softc *); static void hn_resume(struct hn_softc *); static void hn_resume_data(struct hn_softc *); static void hn_resume_mgmt(struct hn_softc *); static void hn_suspend_mgmt_taskfunc(void *, int); static void hn_chan_drain(struct hn_softc *, struct vmbus_channel *); static void hn_disable_rx(struct hn_softc *); static void hn_drain_rxtx(struct hn_softc *, int); static void hn_polling(struct hn_softc *, u_int); static void hn_chan_polling(struct vmbus_channel *, u_int); static void hn_mtu_change_fixup(struct hn_softc *); static void hn_update_link_status(struct hn_softc *); static void hn_change_network(struct hn_softc *); static void hn_link_taskfunc(void *, int); static void hn_netchg_init_taskfunc(void *, int); static void hn_netchg_status_taskfunc(void *, int); static void hn_link_status(struct hn_softc *); static int hn_create_rx_data(struct hn_softc *, int); static void hn_destroy_rx_data(struct hn_softc *); static int hn_check_iplen(const struct mbuf *, int); static void hn_rxpkt_proto(const struct mbuf *, int *, int *); static int hn_set_rxfilter(struct hn_softc *, uint32_t); static int hn_rxfilter_config(struct hn_softc *); static int hn_rss_reconfig(struct hn_softc *); static void hn_rss_ind_fixup(struct hn_softc *); static void hn_rss_mbuf_hash(struct hn_softc *, uint32_t); static int hn_rxpkt(struct hn_rx_ring *, const void *, 
int, const struct hn_rxinfo *); static uint32_t hn_rss_type_fromndis(uint32_t); static uint32_t hn_rss_type_tondis(uint32_t); static int hn_tx_ring_create(struct hn_softc *, int); static void hn_tx_ring_destroy(struct hn_tx_ring *); static int hn_create_tx_data(struct hn_softc *, int); static void hn_fixup_tx_data(struct hn_softc *); static void hn_fixup_rx_data(struct hn_softc *); static void hn_destroy_tx_data(struct hn_softc *); static void hn_txdesc_dmamap_destroy(struct hn_txdesc *); static void hn_txdesc_gc(struct hn_tx_ring *, struct hn_txdesc *); static int hn_encap(struct ifnet *, struct hn_tx_ring *, struct hn_txdesc *, struct mbuf **); static int hn_txpkt(struct ifnet *, struct hn_tx_ring *, struct hn_txdesc *); static void hn_set_chim_size(struct hn_softc *, int); static void hn_set_tso_maxsize(struct hn_softc *, int, int); static bool hn_tx_ring_pending(struct hn_tx_ring *); static void hn_tx_ring_qflush(struct hn_tx_ring *); static void hn_resume_tx(struct hn_softc *, int); static void hn_set_txagg(struct hn_softc *); static void *hn_try_txagg(struct ifnet *, struct hn_tx_ring *, struct hn_txdesc *, int); static int hn_get_txswq_depth(const struct hn_tx_ring *); static void hn_txpkt_done(struct hn_nvs_sendctx *, struct hn_softc *, struct vmbus_channel *, const void *, int); static int hn_txpkt_sglist(struct hn_tx_ring *, struct hn_txdesc *); static int hn_txpkt_chim(struct hn_tx_ring *, struct hn_txdesc *); static int hn_xmit(struct hn_tx_ring *, int); static void hn_xmit_taskfunc(void *, int); static void hn_xmit_txeof(struct hn_tx_ring *); static void hn_xmit_txeof_taskfunc(void *, int); #ifdef HN_IFSTART_SUPPORT static int hn_start_locked(struct hn_tx_ring *, int); static void hn_start_taskfunc(void *, int); static void hn_start_txeof(struct hn_tx_ring *); static void hn_start_txeof_taskfunc(void *, int); #endif SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Hyper-V network interface"); /* Trust tcp segements verification on host side. */ static int hn_trust_hosttcp = 1; SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN, &hn_trust_hosttcp, 0, "Trust tcp segement verification on host side, " "when csum info is missing (global setting)"); /* Trust udp datagrams verification on host side. */ static int hn_trust_hostudp = 1; SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN, &hn_trust_hostudp, 0, "Trust udp datagram verification on host side, " "when csum info is missing (global setting)"); /* Trust ip packets verification on host side. */ static int hn_trust_hostip = 1; SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN, &hn_trust_hostip, 0, "Trust ip packet verification on host side, " "when csum info is missing (global setting)"); /* * Offload UDP/IPv4 checksum. */ static int hn_enable_udp4cs = 1; SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp4cs, CTLFLAG_RDTUN, &hn_enable_udp4cs, 0, "Offload UDP/IPv4 checksum"); /* * Offload UDP/IPv6 checksum. */ static int hn_enable_udp6cs = 1; SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp6cs, CTLFLAG_RDTUN, &hn_enable_udp6cs, 0, "Offload UDP/IPv6 checksum"); /* Stats. */ static counter_u64_t hn_udpcs_fixup; SYSCTL_COUNTER_U64(_hw_hn, OID_AUTO, udpcs_fixup, CTLFLAG_RW, &hn_udpcs_fixup, "# of UDP checksum fixup"); /* * See hn_set_hlen(). * * This value is for Azure. For Hyper-V, set this above * 65536 to disable UDP datagram checksum fixup. 
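 *
 * The fixup in hn_set_hlen() only triggers when CSUM_IP_UDP is requested,
 * the datagram is longer than hn_udpcs_fixup_mtu plus the L2 header, and
 * IP_DF is clear; the UDP checksum is then computed in software with
 * in_cksum_skip() and CSUM_IP_UDP is dropped from csum_flags.  Hedged
 * tuning example (loader.conf syntax, value chosen only for illustration):
 *
 *      hw.hn.udpcs_fixup_mtu="65536"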
*/ static int hn_udpcs_fixup_mtu = 1420; SYSCTL_INT(_hw_hn, OID_AUTO, udpcs_fixup_mtu, CTLFLAG_RWTUN, &hn_udpcs_fixup_mtu, 0, "UDP checksum fixup MTU threshold"); /* Limit TSO burst size */ static int hn_tso_maxlen = IP_MAXPACKET; SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN, &hn_tso_maxlen, 0, "TSO burst limit"); /* Limit chimney send size */ static int hn_tx_chimney_size = 0; SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN, &hn_tx_chimney_size, 0, "Chimney send packet size limit"); /* Limit the size of packet for direct transmission */ static int hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF; SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN, &hn_direct_tx_size, 0, "Size of the packet for direct transmission"); /* # of LRO entries per RX ring */ #if defined(INET) || defined(INET6) #if __FreeBSD_version >= 1100095 static int hn_lro_entry_count = HN_LROENT_CNT_DEF; SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN, &hn_lro_entry_count, 0, "LRO entry count"); #endif #endif static int hn_tx_taskq_cnt = 1; SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_cnt, CTLFLAG_RDTUN, &hn_tx_taskq_cnt, 0, "# of TX taskqueues"); #define HN_TX_TASKQ_M_INDEP 0 #define HN_TX_TASKQ_M_GLOBAL 1 #define HN_TX_TASKQ_M_EVTTQ 2 static int hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP; SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_mode, CTLFLAG_RDTUN, &hn_tx_taskq_mode, 0, "TX taskqueue modes: " "0 - independent, 1 - share global tx taskqs, 2 - share event taskqs"); #ifndef HN_USE_TXDESC_BUFRING static int hn_use_txdesc_bufring = 0; #else static int hn_use_txdesc_bufring = 1; #endif SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD, &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors"); #ifdef HN_IFSTART_SUPPORT /* Use ifnet.if_start instead of ifnet.if_transmit */ static int hn_use_if_start = 0; SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN, &hn_use_if_start, 0, "Use if_start TX method"); #endif /* # of channels to use */ static int hn_chan_cnt = 0; SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN, &hn_chan_cnt, 0, "# of channels to use; each channel has one RX ring and one TX ring"); /* # of transmit rings to use */ static int hn_tx_ring_cnt = 0; SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN, &hn_tx_ring_cnt, 0, "# of TX rings to use"); /* Software TX ring deptch */ static int hn_tx_swq_depth = 0; SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN, &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING"); /* Enable sorted LRO, and the depth of the per-channel mbuf queue */ #if __FreeBSD_version >= 1100095 static u_int hn_lro_mbufq_depth = 0; SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN, &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue"); #endif /* Packet transmission aggregation size limit */ static int hn_tx_agg_size = -1; SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN, &hn_tx_agg_size, 0, "Packet transmission aggregation size limit"); /* Packet transmission aggregation count limit */ static int hn_tx_agg_pkts = -1; SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN, &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit"); /* VF list */ SYSCTL_PROC(_hw_hn, OID_AUTO, vflist, CTLFLAG_RD | CTLTYPE_STRING, 0, 0, hn_vflist_sysctl, "A", "VF list"); /* VF mapping */ SYSCTL_PROC(_hw_hn, OID_AUTO, vfmap, CTLFLAG_RD | CTLTYPE_STRING, 0, 0, hn_vfmap_sysctl, "A", "VF mapping"); /* Transparent VF */ static int hn_xpnt_vf = 1; SYSCTL_INT(_hw_hn, OID_AUTO, vf_transparent, CTLFLAG_RDTUN, &hn_xpnt_vf, 0, "Transparent VF mod"); /* 
Accurate BPF support for Transparent VF */ static int hn_xpnt_vf_accbpf = 0; SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_accbpf, CTLFLAG_RDTUN, &hn_xpnt_vf_accbpf, 0, "Accurate BPF for transparent VF"); /* Extra wait for transparent VF attach routing; unit seconds. */ static int hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN; SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_attwait, CTLFLAG_RWTUN, &hn_xpnt_vf_attwait, 0, "Extra wait for transparent VF attach routing; unit: seconds"); static u_int hn_cpu_index; /* next CPU for channel */ static struct taskqueue **hn_tx_taskque;/* shared TX taskqueues */ static struct rmlock hn_vfmap_lock; static int hn_vfmap_size; static struct ifnet **hn_vfmap; #ifndef RSS static const uint8_t hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = { 0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2, 0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0, 0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4, 0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c, 0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa }; #endif /* !RSS */ static const struct hyperv_guid hn_guid = { .hv_guid = { 0x63, 0x51, 0x61, 0xf8, 0x3e, 0xdf, 0xc5, 0x46, 0x91, 0x3f, 0xf2, 0xd2, 0xf9, 0x65, 0xed, 0x0e } }; static device_method_t hn_methods[] = { /* Device interface */ DEVMETHOD(device_probe, hn_probe), DEVMETHOD(device_attach, hn_attach), DEVMETHOD(device_detach, hn_detach), DEVMETHOD(device_shutdown, hn_shutdown), DEVMETHOD_END }; static driver_t hn_driver = { "hn", hn_methods, sizeof(struct hn_softc) }; static devclass_t hn_devclass; DRIVER_MODULE(hn, vmbus, hn_driver, hn_devclass, 0, 0); MODULE_VERSION(hn, 1); MODULE_DEPEND(hn, vmbus, 1, 1, 1); #if __FreeBSD_version >= 1100099 static void hn_set_lro_lenlim(struct hn_softc *sc, int lenlim) { int i; for (i = 0; i < sc->hn_rx_ring_cnt; ++i) sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim; } #endif static int hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd) { KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID && txd->chim_size == 0, ("invalid rndis sglist txd")); return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA, &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt)); } static int hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd) { struct hn_nvs_rndis rndis; KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID && txd->chim_size > 0, ("invalid rndis chim txd")); rndis.nvs_type = HN_NVS_TYPE_RNDIS; rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA; rndis.nvs_chim_idx = txd->chim_index; rndis.nvs_chim_sz = txd->chim_size; return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC, &rndis, sizeof(rndis), &txd->send_ctx)); } static __inline uint32_t hn_chim_alloc(struct hn_softc *sc) { int i, bmap_cnt = sc->hn_chim_bmap_cnt; u_long *bmap = sc->hn_chim_bmap; uint32_t ret = HN_NVS_CHIM_IDX_INVALID; for (i = 0; i < bmap_cnt; ++i) { int idx; idx = ffsl(~bmap[i]); if (idx == 0) continue; --idx; /* ffsl is 1-based */ KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt, ("invalid i %d and idx %d", i, idx)); if (atomic_testandset_long(&bmap[i], idx)) continue; ret = i * LONG_BIT + idx; break; } return (ret); } static __inline void hn_chim_free(struct hn_softc *sc, uint32_t chim_idx) { u_long mask; uint32_t idx; idx = chim_idx / LONG_BIT; KASSERT(idx < sc->hn_chim_bmap_cnt, ("invalid chimney index 0x%x", chim_idx)); mask = 1UL << (chim_idx % LONG_BIT); KASSERT(sc->hn_chim_bmap[idx] & mask, ("index bitmap 0x%lx, chimney index %u, " "bitmap idx %d, bitmask 0x%lx", sc->hn_chim_bmap[idx], chim_idx, idx, mask)); atomic_clear_long(&sc->hn_chim_bmap[idx], mask); } #if 
defined(INET6) || defined(INET) #define PULLUP_HDR(m, len) \ do { \ if (__predict_false((m)->m_len < (len))) { \ (m) = m_pullup((m), (len)); \ if ((m) == NULL) \ return (NULL); \ } \ } while (0) /* * NOTE: If this function failed, the m_head would be freed. */ static __inline struct mbuf * hn_tso_fixup(struct mbuf *m_head) { struct ether_vlan_header *evl; struct tcphdr *th; int ehlen; KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable")); PULLUP_HDR(m_head, sizeof(*evl)); evl = mtod(m_head, struct ether_vlan_header *); if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN)) ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN; else ehlen = ETHER_HDR_LEN; m_head->m_pkthdr.l2hlen = ehlen; #ifdef INET if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) { struct ip *ip; int iphlen; PULLUP_HDR(m_head, ehlen + sizeof(*ip)); ip = mtodo(m_head, ehlen); iphlen = ip->ip_hl << 2; m_head->m_pkthdr.l3hlen = iphlen; PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th)); th = mtodo(m_head, ehlen + iphlen); ip->ip_len = 0; ip->ip_sum = 0; th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(IPPROTO_TCP)); } #endif #if defined(INET6) && defined(INET) else #endif #ifdef INET6 { struct ip6_hdr *ip6; PULLUP_HDR(m_head, ehlen + sizeof(*ip6)); ip6 = mtodo(m_head, ehlen); if (ip6->ip6_nxt != IPPROTO_TCP) { m_freem(m_head); return (NULL); } m_head->m_pkthdr.l3hlen = sizeof(*ip6); PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th)); th = mtodo(m_head, ehlen + sizeof(*ip6)); ip6->ip6_plen = 0; th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0); } #endif return (m_head); } /* * NOTE: If this function failed, the m_head would be freed. */ static __inline struct mbuf * hn_set_hlen(struct mbuf *m_head) { const struct ether_vlan_header *evl; int ehlen; PULLUP_HDR(m_head, sizeof(*evl)); evl = mtod(m_head, const struct ether_vlan_header *); if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN)) ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN; else ehlen = ETHER_HDR_LEN; m_head->m_pkthdr.l2hlen = ehlen; #ifdef INET if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP_UDP)) { const struct ip *ip; int iphlen; PULLUP_HDR(m_head, ehlen + sizeof(*ip)); ip = mtodo(m_head, ehlen); iphlen = ip->ip_hl << 2; m_head->m_pkthdr.l3hlen = iphlen; /* * UDP checksum offload does not work in Azure, if the * following conditions meet: * - sizeof(IP hdr + UDP hdr + payload) > 1420. * - IP_DF is not set in the IP hdr. * * Fallback to software checksum for these UDP datagrams. */ if ((m_head->m_pkthdr.csum_flags & CSUM_IP_UDP) && m_head->m_pkthdr.len > hn_udpcs_fixup_mtu + ehlen && (ntohs(ip->ip_off) & IP_DF) == 0) { uint16_t off = ehlen + iphlen; counter_u64_add(hn_udpcs_fixup, 1); PULLUP_HDR(m_head, off + sizeof(struct udphdr)); *(uint16_t *)(m_head->m_data + off + m_head->m_pkthdr.csum_data) = in_cksum_skip( m_head, m_head->m_pkthdr.len, off); m_head->m_pkthdr.csum_flags &= ~CSUM_IP_UDP; } } #endif #if defined(INET6) && defined(INET) else #endif #ifdef INET6 { const struct ip6_hdr *ip6; PULLUP_HDR(m_head, ehlen + sizeof(*ip6)); ip6 = mtodo(m_head, ehlen); if (ip6->ip6_nxt != IPPROTO_TCP && ip6->ip6_nxt != IPPROTO_UDP) { m_freem(m_head); return (NULL); } m_head->m_pkthdr.l3hlen = sizeof(*ip6); } #endif return (m_head); } /* * NOTE: If this function failed, the m_head would be freed. 
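 *
 * hn_check_tcpsyn() below relies on l2hlen/l3hlen having been filled in by
 * hn_set_hlen() (or hn_tso_fixup()) on the same mbuf, so the TCP header
 * sits at offset l2hlen + l3hlen.  The PULLUP_HDR() macro above hides the
 * usual m_pullup() dance; an equivalent open-coded sketch:
 *
 *      if (m_head->m_len < ehlen + iphlen + sizeof(struct tcphdr)) {
 *              m_head = m_pullup(m_head,
 *                  ehlen + iphlen + sizeof(struct tcphdr));
 *              if (m_head == NULL)
 *                      return (NULL);    chain already freed by m_pullup()
 *      }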
*/ static __inline struct mbuf * hn_check_tcpsyn(struct mbuf *m_head, int *tcpsyn) { const struct tcphdr *th; int ehlen, iphlen; *tcpsyn = 0; ehlen = m_head->m_pkthdr.l2hlen; iphlen = m_head->m_pkthdr.l3hlen; PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th)); th = mtodo(m_head, ehlen + iphlen); if (th->th_flags & TH_SYN) *tcpsyn = 1; return (m_head); } #undef PULLUP_HDR #endif /* INET6 || INET */ static int hn_set_rxfilter(struct hn_softc *sc, uint32_t filter) { int error = 0; HN_LOCK_ASSERT(sc); if (sc->hn_rx_filter != filter) { error = hn_rndis_set_rxfilter(sc, filter); if (!error) sc->hn_rx_filter = filter; } return (error); } static int hn_rxfilter_config(struct hn_softc *sc) { struct ifnet *ifp = sc->hn_ifp; uint32_t filter; HN_LOCK_ASSERT(sc); /* * If the non-transparent mode VF is activated, we don't know how * its RX filter is configured, so stick the synthetic device in * the promiscous mode. */ if ((ifp->if_flags & IFF_PROMISC) || (sc->hn_flags & HN_FLAG_RXVF)) { filter = NDIS_PACKET_TYPE_PROMISCUOUS; } else { filter = NDIS_PACKET_TYPE_DIRECTED; if (ifp->if_flags & IFF_BROADCAST) filter |= NDIS_PACKET_TYPE_BROADCAST; /* TODO: support multicast list */ if ((ifp->if_flags & IFF_ALLMULTI) || !CK_STAILQ_EMPTY(&ifp->if_multiaddrs)) filter |= NDIS_PACKET_TYPE_ALL_MULTICAST; } return (hn_set_rxfilter(sc, filter)); } static void hn_set_txagg(struct hn_softc *sc) { uint32_t size, pkts; int i; /* * Setup aggregation size. */ if (sc->hn_agg_size < 0) size = UINT32_MAX; else size = sc->hn_agg_size; if (sc->hn_rndis_agg_size < size) size = sc->hn_rndis_agg_size; /* NOTE: We only aggregate packets using chimney sending buffers. */ if (size > (uint32_t)sc->hn_chim_szmax) size = sc->hn_chim_szmax; if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) { /* Disable */ size = 0; pkts = 0; goto done; } /* NOTE: Type of the per TX ring setting is 'int'. */ if (size > INT_MAX) size = INT_MAX; /* * Setup aggregation packet count. */ if (sc->hn_agg_pkts < 0) pkts = UINT32_MAX; else pkts = sc->hn_agg_pkts; if (sc->hn_rndis_agg_pkts < pkts) pkts = sc->hn_rndis_agg_pkts; if (pkts <= 1) { /* Disable */ size = 0; pkts = 0; goto done; } /* NOTE: Type of the per TX ring setting is 'short'. */ if (pkts > SHRT_MAX) pkts = SHRT_MAX; done: /* NOTE: Type of the per TX ring setting is 'short'. */ if (sc->hn_rndis_agg_align > SHRT_MAX) { /* Disable */ size = 0; pkts = 0; } if (bootverbose) { if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n", size, pkts, sc->hn_rndis_agg_align); } for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { struct hn_tx_ring *txr = &sc->hn_tx_ring[i]; mtx_lock(&txr->hn_tx_lock); txr->hn_agg_szmax = size; txr->hn_agg_pktmax = pkts; txr->hn_agg_align = sc->hn_rndis_agg_align; mtx_unlock(&txr->hn_tx_lock); } } static int hn_get_txswq_depth(const struct hn_tx_ring *txr) { KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet")); if (hn_tx_swq_depth < txr->hn_txdesc_cnt) return txr->hn_txdesc_cnt; return hn_tx_swq_depth; } static int hn_rss_reconfig(struct hn_softc *sc) { int error; HN_LOCK_ASSERT(sc); if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) return (ENXIO); /* * Disable RSS first. * * NOTE: * Direct reconfiguration by setting the UNCHG flags does * _not_ work properly. */ if (bootverbose) if_printf(sc->hn_ifp, "disable RSS\n"); error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE); if (error) { if_printf(sc->hn_ifp, "RSS disable failed\n"); return (error); } /* * Reenable the RSS w/ the updated RSS key or indirect * table. 
*/ if (bootverbose) if_printf(sc->hn_ifp, "reconfig RSS\n"); error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE); if (error) { if_printf(sc->hn_ifp, "RSS reconfig failed\n"); return (error); } return (0); } static void hn_rss_ind_fixup(struct hn_softc *sc) { struct ndis_rssprm_toeplitz *rss = &sc->hn_rss; int i, nchan; nchan = sc->hn_rx_ring_inuse; KASSERT(nchan > 1, ("invalid # of channels %d", nchan)); /* * Check indirect table to make sure that all channels in it * can be used. */ for (i = 0; i < NDIS_HASH_INDCNT; ++i) { if (rss->rss_ind[i] >= nchan) { if_printf(sc->hn_ifp, "RSS indirect table %d fixup: %u -> %d\n", i, rss->rss_ind[i], nchan - 1); rss->rss_ind[i] = nchan - 1; } } } static int hn_ifmedia_upd(struct ifnet *ifp __unused) { return EOPNOTSUPP; } static void hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr) { struct hn_softc *sc = ifp->if_softc; ifmr->ifm_status = IFM_AVALID; ifmr->ifm_active = IFM_ETHER; if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) { ifmr->ifm_active |= IFM_NONE; return; } ifmr->ifm_status |= IFM_ACTIVE; ifmr->ifm_active |= IFM_10G_T | IFM_FDX; } static void hn_rxvf_set_task(void *xarg, int pending __unused) { struct hn_rxvf_setarg *arg = xarg; arg->rxr->hn_rxvf_ifp = arg->vf_ifp; } static void hn_rxvf_set(struct hn_softc *sc, struct ifnet *vf_ifp) { struct hn_rx_ring *rxr; struct hn_rxvf_setarg arg; struct task task; int i; HN_LOCK_ASSERT(sc); TASK_INIT(&task, 0, hn_rxvf_set_task, &arg); for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { rxr = &sc->hn_rx_ring[i]; if (i < sc->hn_rx_ring_inuse) { arg.rxr = rxr; arg.vf_ifp = vf_ifp; vmbus_chan_run_task(rxr->hn_chan, &task); } else { rxr->hn_rxvf_ifp = vf_ifp; } } } static bool hn_ismyvf(const struct hn_softc *sc, const struct ifnet *ifp) { const struct ifnet *hn_ifp; hn_ifp = sc->hn_ifp; if (ifp == hn_ifp) return (false); if (ifp->if_alloctype != IFT_ETHER) return (false); /* Ignore lagg/vlan interfaces */ if (strcmp(ifp->if_dname, "lagg") == 0 || strcmp(ifp->if_dname, "vlan") == 0) return (false); /* * During detach events ifp->if_addr might be NULL. * Make sure the bcmp() below doesn't panic on that: */ if (ifp->if_addr == NULL || hn_ifp->if_addr == NULL) return (false); if (bcmp(IF_LLADDR(ifp), IF_LLADDR(hn_ifp), ETHER_ADDR_LEN) != 0) return (false); return (true); } static void hn_rxvf_change(struct hn_softc *sc, struct ifnet *ifp, bool rxvf) { struct ifnet *hn_ifp; HN_LOCK(sc); if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)) goto out; if (!hn_ismyvf(sc, ifp)) goto out; hn_ifp = sc->hn_ifp; if (rxvf) { if (sc->hn_flags & HN_FLAG_RXVF) goto out; sc->hn_flags |= HN_FLAG_RXVF; hn_rxfilter_config(sc); } else { if (!(sc->hn_flags & HN_FLAG_RXVF)) goto out; sc->hn_flags &= ~HN_FLAG_RXVF; if (hn_ifp->if_drv_flags & IFF_DRV_RUNNING) hn_rxfilter_config(sc); else hn_set_rxfilter(sc, NDIS_PACKET_TYPE_NONE); } hn_nvs_set_datapath(sc, rxvf ? HN_NVS_DATAPATH_VF : HN_NVS_DATAPATH_SYNTH); hn_rxvf_set(sc, rxvf ? ifp : NULL); if (rxvf) { hn_vf_rss_fixup(sc, true); hn_suspend_mgmt(sc); sc->hn_link_flags &= ~(HN_LINK_FLAG_LINKUP | HN_LINK_FLAG_NETCHG); if_link_state_change(hn_ifp, LINK_STATE_DOWN); } else { hn_vf_rss_restore(sc); hn_resume_mgmt(sc); } devctl_notify("HYPERV_NIC_VF", hn_ifp->if_xname, rxvf ? "VF_UP" : "VF_DOWN", NULL); if (bootverbose) { if_printf(hn_ifp, "datapath is switched %s %s\n", rxvf ? 
"to" : "from", ifp->if_xname); } out: HN_UNLOCK(sc); } static void hn_ifnet_event(void *arg, struct ifnet *ifp, int event) { if (event != IFNET_EVENT_UP && event != IFNET_EVENT_DOWN) return; hn_rxvf_change(arg, ifp, event == IFNET_EVENT_UP); } static void hn_ifaddr_event(void *arg, struct ifnet *ifp) { hn_rxvf_change(arg, ifp, ifp->if_flags & IFF_UP); } static int hn_xpnt_vf_iocsetcaps(struct hn_softc *sc, struct ifreq *ifr) { struct ifnet *ifp, *vf_ifp; uint64_t tmp; int error; HN_LOCK_ASSERT(sc); ifp = sc->hn_ifp; vf_ifp = sc->hn_vf_ifp; /* * Fix up requested capabilities w/ supported capabilities, * since the supported capabilities could have been changed. */ ifr->ifr_reqcap &= ifp->if_capabilities; /* Pass SIOCSIFCAP to VF. */ error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFCAP, (caddr_t)ifr); /* * NOTE: * The error will be propagated to the callers, however, it * is _not_ useful here. */ /* * Merge VF's enabled capabilities. */ ifp->if_capenable = vf_ifp->if_capenable & ifp->if_capabilities; tmp = vf_ifp->if_hwassist & HN_CSUM_IP_HWASSIST(sc); if (ifp->if_capenable & IFCAP_TXCSUM) ifp->if_hwassist |= tmp; else ifp->if_hwassist &= ~tmp; tmp = vf_ifp->if_hwassist & HN_CSUM_IP6_HWASSIST(sc); if (ifp->if_capenable & IFCAP_TXCSUM_IPV6) ifp->if_hwassist |= tmp; else ifp->if_hwassist &= ~tmp; tmp = vf_ifp->if_hwassist & CSUM_IP_TSO; if (ifp->if_capenable & IFCAP_TSO4) ifp->if_hwassist |= tmp; else ifp->if_hwassist &= ~tmp; tmp = vf_ifp->if_hwassist & CSUM_IP6_TSO; if (ifp->if_capenable & IFCAP_TSO6) ifp->if_hwassist |= tmp; else ifp->if_hwassist &= ~tmp; return (error); } static int hn_xpnt_vf_iocsetflags(struct hn_softc *sc) { struct ifnet *vf_ifp; struct ifreq ifr; HN_LOCK_ASSERT(sc); vf_ifp = sc->hn_vf_ifp; memset(&ifr, 0, sizeof(ifr)); strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name)); ifr.ifr_flags = vf_ifp->if_flags & 0xffff; ifr.ifr_flagshigh = vf_ifp->if_flags >> 16; return (vf_ifp->if_ioctl(vf_ifp, SIOCSIFFLAGS, (caddr_t)&ifr)); } static void hn_xpnt_vf_saveifflags(struct hn_softc *sc) { struct ifnet *ifp = sc->hn_ifp; int allmulti = 0; HN_LOCK_ASSERT(sc); /* XXX vlan(4) style mcast addr maintenance */ if (!CK_STAILQ_EMPTY(&ifp->if_multiaddrs)) allmulti = IFF_ALLMULTI; /* Always set the VF's if_flags */ sc->hn_vf_ifp->if_flags = ifp->if_flags | allmulti; } static void hn_xpnt_vf_input(struct ifnet *vf_ifp, struct mbuf *m) { struct rm_priotracker pt; struct ifnet *hn_ifp = NULL; struct mbuf *mn; /* * XXX racy, if hn(4) ever detached. */ rm_rlock(&hn_vfmap_lock, &pt); if (vf_ifp->if_index < hn_vfmap_size) hn_ifp = hn_vfmap[vf_ifp->if_index]; rm_runlock(&hn_vfmap_lock, &pt); if (hn_ifp != NULL) { for (mn = m; mn != NULL; mn = mn->m_nextpkt) { /* * Allow tapping on the VF. */ ETHER_BPF_MTAP(vf_ifp, mn); /* * Update VF stats. */ if ((vf_ifp->if_capenable & IFCAP_HWSTATS) == 0) { if_inc_counter(vf_ifp, IFCOUNTER_IBYTES, mn->m_pkthdr.len); } /* * XXX IFCOUNTER_IMCAST * This stat updating is kinda invasive, since it * requires two checks on the mbuf: the length check * and the ethernet header check. As of this write, * all multicast packets go directly to hn(4), which * makes imcast stat updating in the VF a try in vian. */ /* * Fix up rcvif and increase hn(4)'s ipackets. */ mn->m_pkthdr.rcvif = hn_ifp; if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1); } /* * Go through hn(4)'s if_input. */ hn_ifp->if_input(hn_ifp, m); } else { /* * In the middle of the transition; free this * mbuf chain. 
*/ while (m != NULL) { mn = m->m_nextpkt; m->m_nextpkt = NULL; m_freem(m); m = mn; } } } static void hn_mtu_change_fixup(struct hn_softc *sc) { struct ifnet *ifp; HN_LOCK_ASSERT(sc); ifp = sc->hn_ifp; hn_set_tso_maxsize(sc, hn_tso_maxlen, ifp->if_mtu); #if __FreeBSD_version >= 1100099 if (sc->hn_rx_ring[0].hn_lro.lro_length_lim < HN_LRO_LENLIM_MIN(ifp)) hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp)); #endif } static uint32_t hn_rss_type_fromndis(uint32_t rss_hash) { uint32_t types = 0; if (rss_hash & NDIS_HASH_IPV4) types |= RSS_TYPE_IPV4; if (rss_hash & NDIS_HASH_TCP_IPV4) types |= RSS_TYPE_TCP_IPV4; if (rss_hash & NDIS_HASH_IPV6) types |= RSS_TYPE_IPV6; if (rss_hash & NDIS_HASH_IPV6_EX) types |= RSS_TYPE_IPV6_EX; if (rss_hash & NDIS_HASH_TCP_IPV6) types |= RSS_TYPE_TCP_IPV6; if (rss_hash & NDIS_HASH_TCP_IPV6_EX) types |= RSS_TYPE_TCP_IPV6_EX; if (rss_hash & NDIS_HASH_UDP_IPV4_X) types |= RSS_TYPE_UDP_IPV4; return (types); } static uint32_t hn_rss_type_tondis(uint32_t types) { uint32_t rss_hash = 0; KASSERT((types & (RSS_TYPE_UDP_IPV6 | RSS_TYPE_UDP_IPV6_EX)) == 0, ("UDP6 and UDP6EX are not supported")); if (types & RSS_TYPE_IPV4) rss_hash |= NDIS_HASH_IPV4; if (types & RSS_TYPE_TCP_IPV4) rss_hash |= NDIS_HASH_TCP_IPV4; if (types & RSS_TYPE_IPV6) rss_hash |= NDIS_HASH_IPV6; if (types & RSS_TYPE_IPV6_EX) rss_hash |= NDIS_HASH_IPV6_EX; if (types & RSS_TYPE_TCP_IPV6) rss_hash |= NDIS_HASH_TCP_IPV6; if (types & RSS_TYPE_TCP_IPV6_EX) rss_hash |= NDIS_HASH_TCP_IPV6_EX; if (types & RSS_TYPE_UDP_IPV4) rss_hash |= NDIS_HASH_UDP_IPV4_X; return (rss_hash); } static void hn_rss_mbuf_hash(struct hn_softc *sc, uint32_t mbuf_hash) { int i; HN_LOCK_ASSERT(sc); for (i = 0; i < sc->hn_rx_ring_cnt; ++i) sc->hn_rx_ring[i].hn_mbuf_hash = mbuf_hash; } static void hn_vf_rss_fixup(struct hn_softc *sc, bool reconf) { struct ifnet *ifp, *vf_ifp; struct ifrsshash ifrh; struct ifrsskey ifrk; int error; uint32_t my_types, diff_types, mbuf_types = 0; HN_LOCK_ASSERT(sc); KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED, ("%s: synthetic parts are not attached", sc->hn_ifp->if_xname)); if (sc->hn_rx_ring_inuse == 1) { /* No RSS on synthetic parts; done. */ return; } if ((sc->hn_rss_hcap & NDIS_HASH_FUNCTION_TOEPLITZ) == 0) { /* Synthetic parts do not support Toeplitz; done. */ return; } ifp = sc->hn_ifp; vf_ifp = sc->hn_vf_ifp; /* * Extract VF's RSS key. Only 40 bytes key for Toeplitz is * supported. */ memset(&ifrk, 0, sizeof(ifrk)); strlcpy(ifrk.ifrk_name, vf_ifp->if_xname, sizeof(ifrk.ifrk_name)); error = vf_ifp->if_ioctl(vf_ifp, SIOCGIFRSSKEY, (caddr_t)&ifrk); if (error) { if_printf(ifp, "%s SIOCGIFRSSKEY failed: %d\n", vf_ifp->if_xname, error); goto done; } if (ifrk.ifrk_func != RSS_FUNC_TOEPLITZ) { if_printf(ifp, "%s RSS function %u is not Toeplitz\n", vf_ifp->if_xname, ifrk.ifrk_func); goto done; } if (ifrk.ifrk_keylen != NDIS_HASH_KEYSIZE_TOEPLITZ) { if_printf(ifp, "%s invalid RSS Toeplitz key length %d\n", vf_ifp->if_xname, ifrk.ifrk_keylen); goto done; } /* * Extract VF's RSS hash. Only Toeplitz is supported. 
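     * The query goes through the VF's if_ioctl directly; the hash types
     * it reports are then intersected with what the synthetic device
     * advertises (hn_rss_hcap), and any conflicting type/value
     * combinations are masked out of mbuf hash delivery below.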
     */
    memset(&ifrh, 0, sizeof(ifrh));
    strlcpy(ifrh.ifrh_name, vf_ifp->if_xname, sizeof(ifrh.ifrh_name));
    error = vf_ifp->if_ioctl(vf_ifp, SIOCGIFRSSHASH, (caddr_t)&ifrh);
    if (error) {
        if_printf(ifp, "%s SIOCGIFRSSHASH failed: %d\n",
            vf_ifp->if_xname, error);
        goto done;
    }
    if (ifrh.ifrh_func != RSS_FUNC_TOEPLITZ) {
        if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
            vf_ifp->if_xname, ifrh.ifrh_func);
        goto done;
    }

    my_types = hn_rss_type_fromndis(sc->hn_rss_hcap);
    if ((ifrh.ifrh_types & my_types) == 0) {
        /* This disables RSS; ignore it then */
        if_printf(ifp, "%s intersection of RSS types failed. "
            "VF %#x, mine %#x\n", vf_ifp->if_xname,
            ifrh.ifrh_types, my_types);
        goto done;
    }

    diff_types = my_types ^ ifrh.ifrh_types;
    my_types &= ifrh.ifrh_types;
    mbuf_types = my_types;

    /*
     * Detect RSS hash value/type conflicts.
     *
     * NOTE:
     * We don't disable the hash type, but stop delivering the hash
     * value/type through mbufs on the RX path.
     *
     * XXX If HN_CAP_UDPHASH is set in hn_caps, then UDP 4-tuple
     * hash is delivered with type of TCP_IPV4. This means if
     * UDP_IPV4 is enabled, then TCP_IPV4 should be forced, at
     * least to hn_mbuf_hash. However, given that _all_ of the
     * NICs implement TCP_IPV4, this will _not_ impose any issues
     * here.
     */
    if ((my_types & RSS_TYPE_IPV4) &&
        (diff_types & ifrh.ifrh_types &
         (RSS_TYPE_TCP_IPV4 | RSS_TYPE_UDP_IPV4))) {
        /* Conflict; disable IPV4 hash type/value delivery. */
        if_printf(ifp, "disable IPV4 mbuf hash delivery\n");
        mbuf_types &= ~RSS_TYPE_IPV4;
    }
    if ((my_types & RSS_TYPE_IPV6) &&
        (diff_types & ifrh.ifrh_types &
         (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
          RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
          RSS_TYPE_IPV6_EX))) {
        /* Conflict; disable IPV6 hash type/value delivery. */
        if_printf(ifp, "disable IPV6 mbuf hash delivery\n");
        mbuf_types &= ~RSS_TYPE_IPV6;
    }
    if ((my_types & RSS_TYPE_IPV6_EX) &&
        (diff_types & ifrh.ifrh_types &
         (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
          RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
          RSS_TYPE_IPV6))) {
        /* Conflict; disable IPV6_EX hash type/value delivery. */
        if_printf(ifp, "disable IPV6_EX mbuf hash delivery\n");
        mbuf_types &= ~RSS_TYPE_IPV6_EX;
    }
    if ((my_types & RSS_TYPE_TCP_IPV6) &&
        (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6_EX)) {
        /* Conflict; disable TCP_IPV6 hash type/value delivery. */
        if_printf(ifp, "disable TCP_IPV6 mbuf hash delivery\n");
        mbuf_types &= ~RSS_TYPE_TCP_IPV6;
    }
    if ((my_types & RSS_TYPE_TCP_IPV6_EX) &&
        (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6)) {
        /* Conflict; disable TCP_IPV6_EX hash type/value delivery. */
        if_printf(ifp, "disable TCP_IPV6_EX mbuf hash delivery\n");
        mbuf_types &= ~RSS_TYPE_TCP_IPV6_EX;
    }
    if ((my_types & RSS_TYPE_UDP_IPV6) &&
        (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6_EX)) {
        /* Conflict; disable UDP_IPV6 hash type/value delivery. */
        if_printf(ifp, "disable UDP_IPV6 mbuf hash delivery\n");
        mbuf_types &= ~RSS_TYPE_UDP_IPV6;
    }
    if ((my_types & RSS_TYPE_UDP_IPV6_EX) &&
        (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6)) {
        /* Conflict; disable UDP_IPV6_EX hash type/value delivery. */
        if_printf(ifp, "disable UDP_IPV6_EX mbuf hash delivery\n");
        mbuf_types &= ~RSS_TYPE_UDP_IPV6_EX;
    }

    /*
     * Indirect table does not matter.
     */
    sc->hn_rss_hash = (sc->hn_rss_hcap & NDIS_HASH_FUNCTION_MASK) |
        hn_rss_type_tondis(my_types);
    memcpy(sc->hn_rss.rss_key, ifrk.ifrk_key, sizeof(sc->hn_rss.rss_key));
    sc->hn_flags |= HN_FLAG_HAS_RSSKEY;

    if (reconf) {
        error = hn_rss_reconfig(sc);
        if (error) {
            /* XXX roll-back? */
            if_printf(ifp, "hn_rss_reconfig failed: %d\n", error);
            /* XXX keep going.
*/ } } done: /* Hash deliverability for mbufs. */ hn_rss_mbuf_hash(sc, hn_rss_type_tondis(mbuf_types)); } static void hn_vf_rss_restore(struct hn_softc *sc) { HN_LOCK_ASSERT(sc); KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED, ("%s: synthetic parts are not attached", sc->hn_ifp->if_xname)); if (sc->hn_rx_ring_inuse == 1) goto done; /* * Restore hash types. Key does _not_ matter. */ if (sc->hn_rss_hash != sc->hn_rss_hcap) { int error; sc->hn_rss_hash = sc->hn_rss_hcap; error = hn_rss_reconfig(sc); if (error) { if_printf(sc->hn_ifp, "hn_rss_reconfig failed: %d\n", error); /* XXX keep going. */ } } done: /* Hash deliverability for mbufs. */ hn_rss_mbuf_hash(sc, NDIS_HASH_ALL); } static void hn_xpnt_vf_setready(struct hn_softc *sc) { struct ifnet *ifp, *vf_ifp; struct ifreq ifr; HN_LOCK_ASSERT(sc); ifp = sc->hn_ifp; vf_ifp = sc->hn_vf_ifp; /* * Mark the VF ready. */ sc->hn_vf_rdytick = 0; /* * Save information for restoration. */ sc->hn_saved_caps = ifp->if_capabilities; sc->hn_saved_tsomax = ifp->if_hw_tsomax; sc->hn_saved_tsosegcnt = ifp->if_hw_tsomaxsegcount; sc->hn_saved_tsosegsz = ifp->if_hw_tsomaxsegsize; /* * Intersect supported/enabled capabilities. * * NOTE: * if_hwassist is not changed here. */ ifp->if_capabilities &= vf_ifp->if_capabilities; ifp->if_capenable &= ifp->if_capabilities; /* * Fix TSO settings. */ if (ifp->if_hw_tsomax > vf_ifp->if_hw_tsomax) ifp->if_hw_tsomax = vf_ifp->if_hw_tsomax; if (ifp->if_hw_tsomaxsegcount > vf_ifp->if_hw_tsomaxsegcount) ifp->if_hw_tsomaxsegcount = vf_ifp->if_hw_tsomaxsegcount; if (ifp->if_hw_tsomaxsegsize > vf_ifp->if_hw_tsomaxsegsize) ifp->if_hw_tsomaxsegsize = vf_ifp->if_hw_tsomaxsegsize; /* * Change VF's enabled capabilities. */ memset(&ifr, 0, sizeof(ifr)); strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name)); ifr.ifr_reqcap = ifp->if_capenable; hn_xpnt_vf_iocsetcaps(sc, &ifr); if (ifp->if_mtu != ETHERMTU) { int error; /* * Change VF's MTU. */ memset(&ifr, 0, sizeof(ifr)); strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name)); ifr.ifr_mtu = ifp->if_mtu; error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU, (caddr_t)&ifr); if (error) { if_printf(ifp, "%s SIOCSIFMTU %u failed\n", vf_ifp->if_xname, ifp->if_mtu); if (ifp->if_mtu > ETHERMTU) { if_printf(ifp, "change MTU to %d\n", ETHERMTU); /* * XXX * No need to adjust the synthetic parts' MTU; * failure of the adjustment will cause us * infinite headache. */ ifp->if_mtu = ETHERMTU; hn_mtu_change_fixup(sc); } } } } static bool hn_xpnt_vf_isready(struct hn_softc *sc) { HN_LOCK_ASSERT(sc); if (!hn_xpnt_vf || sc->hn_vf_ifp == NULL) return (false); if (sc->hn_vf_rdytick == 0) return (true); if (sc->hn_vf_rdytick > ticks) return (false); /* Mark VF as ready. 
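     * Readiness is tracked by hn_vf_rdytick: hn_ifnet_attevent() arms it
     * with "ticks + attach wait", and clearing it to 0 in
     * hn_xpnt_vf_setready() is what hn_xpnt_vf_isready() treats as
     * "ready".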
*/ hn_xpnt_vf_setready(sc); return (true); } static void hn_xpnt_vf_setenable(struct hn_softc *sc) { int i; HN_LOCK_ASSERT(sc); /* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */ rm_wlock(&sc->hn_vf_lock); sc->hn_xvf_flags |= HN_XVFFLAG_ENABLED; rm_wunlock(&sc->hn_vf_lock); for (i = 0; i < sc->hn_rx_ring_cnt; ++i) sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_XPNT_VF; } static void hn_xpnt_vf_setdisable(struct hn_softc *sc, bool clear_vf) { int i; HN_LOCK_ASSERT(sc); /* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */ rm_wlock(&sc->hn_vf_lock); sc->hn_xvf_flags &= ~HN_XVFFLAG_ENABLED; if (clear_vf) sc->hn_vf_ifp = NULL; rm_wunlock(&sc->hn_vf_lock); for (i = 0; i < sc->hn_rx_ring_cnt; ++i) sc->hn_rx_ring[i].hn_rx_flags &= ~HN_RX_FLAG_XPNT_VF; } static void hn_xpnt_vf_init(struct hn_softc *sc) { int error; HN_LOCK_ASSERT(sc); KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0, ("%s: transparent VF was enabled", sc->hn_ifp->if_xname)); if (bootverbose) { if_printf(sc->hn_ifp, "try bringing up %s\n", sc->hn_vf_ifp->if_xname); } /* * Bring the VF up. */ hn_xpnt_vf_saveifflags(sc); sc->hn_vf_ifp->if_flags |= IFF_UP; error = hn_xpnt_vf_iocsetflags(sc); if (error) { if_printf(sc->hn_ifp, "bringing up %s failed: %d\n", sc->hn_vf_ifp->if_xname, error); return; } /* * NOTE: * Datapath setting must happen _after_ bringing the VF up. */ hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF); /* * NOTE: * Fixup RSS related bits _after_ the VF is brought up, since * many VFs generate RSS key during it's initialization. */ hn_vf_rss_fixup(sc, true); /* Mark transparent mode VF as enabled. */ hn_xpnt_vf_setenable(sc); } static void hn_xpnt_vf_init_taskfunc(void *xsc, int pending __unused) { struct hn_softc *sc = xsc; HN_LOCK(sc); if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) goto done; if (sc->hn_vf_ifp == NULL) goto done; if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) goto done; if (sc->hn_vf_rdytick != 0) { /* Mark VF as ready. */ hn_xpnt_vf_setready(sc); } if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) { /* * Delayed VF initialization. */ if (bootverbose) { if_printf(sc->hn_ifp, "delayed initialize %s\n", sc->hn_vf_ifp->if_xname); } hn_xpnt_vf_init(sc); } done: HN_UNLOCK(sc); } static void hn_ifnet_attevent(void *xsc, struct ifnet *ifp) { struct hn_softc *sc = xsc; HN_LOCK(sc); if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)) goto done; if (!hn_ismyvf(sc, ifp)) goto done; if (sc->hn_vf_ifp != NULL) { if_printf(sc->hn_ifp, "%s was attached as VF\n", sc->hn_vf_ifp->if_xname); goto done; } if (hn_xpnt_vf && ifp->if_start != NULL) { /* * ifnet.if_start is _not_ supported by transparent * mode VF; mainly due to the IFF_DRV_OACTIVE flag. 
*/ if_printf(sc->hn_ifp, "%s uses if_start, which is unsupported " "in transparent VF mode.\n", ifp->if_xname); goto done; } rm_wlock(&hn_vfmap_lock); if (ifp->if_index >= hn_vfmap_size) { struct ifnet **newmap; int newsize; newsize = ifp->if_index + HN_VFMAP_SIZE_DEF; newmap = malloc(sizeof(struct ifnet *) * newsize, M_DEVBUF, M_WAITOK | M_ZERO); memcpy(newmap, hn_vfmap, sizeof(struct ifnet *) * hn_vfmap_size); free(hn_vfmap, M_DEVBUF); hn_vfmap = newmap; hn_vfmap_size = newsize; } KASSERT(hn_vfmap[ifp->if_index] == NULL, ("%s: ifindex %d was mapped to %s", ifp->if_xname, ifp->if_index, hn_vfmap[ifp->if_index]->if_xname)); hn_vfmap[ifp->if_index] = sc->hn_ifp; rm_wunlock(&hn_vfmap_lock); /* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */ rm_wlock(&sc->hn_vf_lock); KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0, ("%s: transparent VF was enabled", sc->hn_ifp->if_xname)); sc->hn_vf_ifp = ifp; rm_wunlock(&sc->hn_vf_lock); if (hn_xpnt_vf) { int wait_ticks; /* * Install if_input for vf_ifp, which does vf_ifp -> hn_ifp. * Save vf_ifp's current if_input for later restoration. */ sc->hn_vf_input = ifp->if_input; ifp->if_input = hn_xpnt_vf_input; /* * Stop link status management; use the VF's. */ hn_suspend_mgmt(sc); /* * Give VF sometime to complete its attach routing. */ wait_ticks = hn_xpnt_vf_attwait * hz; sc->hn_vf_rdytick = ticks + wait_ticks; taskqueue_enqueue_timeout(sc->hn_vf_taskq, &sc->hn_vf_init, wait_ticks); } done: HN_UNLOCK(sc); } static void hn_ifnet_detevent(void *xsc, struct ifnet *ifp) { struct hn_softc *sc = xsc; HN_LOCK(sc); if (sc->hn_vf_ifp == NULL) goto done; if (!hn_ismyvf(sc, ifp)) goto done; if (hn_xpnt_vf) { /* * Make sure that the delayed initialization is not running. * * NOTE: * - This lock _must_ be released, since the hn_vf_init task * will try holding this lock. * - It is safe to release this lock here, since the * hn_ifnet_attevent() is interlocked by the hn_vf_ifp. * * XXX racy, if hn(4) ever detached. */ HN_UNLOCK(sc); taskqueue_drain_timeout(sc->hn_vf_taskq, &sc->hn_vf_init); HN_LOCK(sc); KASSERT(sc->hn_vf_input != NULL, ("%s VF input is not saved", sc->hn_ifp->if_xname)); ifp->if_input = sc->hn_vf_input; sc->hn_vf_input = NULL; if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) && (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH); if (sc->hn_vf_rdytick == 0) { /* * The VF was ready; restore some settings. */ sc->hn_ifp->if_capabilities = sc->hn_saved_caps; /* * NOTE: * There is _no_ need to fixup if_capenable and * if_hwassist, since the if_capabilities before * restoration was an intersection of the VF's * if_capabilites and the synthetic device's * if_capabilites. */ sc->hn_ifp->if_hw_tsomax = sc->hn_saved_tsomax; sc->hn_ifp->if_hw_tsomaxsegcount = sc->hn_saved_tsosegcnt; sc->hn_ifp->if_hw_tsomaxsegsize = sc->hn_saved_tsosegsz; } if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) { /* * Restore RSS settings. */ hn_vf_rss_restore(sc); /* * Resume link status management, which was suspended * by hn_ifnet_attevent(). */ hn_resume_mgmt(sc); } } /* Mark transparent mode VF as disabled. 
*/ hn_xpnt_vf_setdisable(sc, true /* clear hn_vf_ifp */); rm_wlock(&hn_vfmap_lock); KASSERT(ifp->if_index < hn_vfmap_size, ("ifindex %d, vfmapsize %d", ifp->if_index, hn_vfmap_size)); if (hn_vfmap[ifp->if_index] != NULL) { KASSERT(hn_vfmap[ifp->if_index] == sc->hn_ifp, ("%s: ifindex %d was mapped to %s", ifp->if_xname, ifp->if_index, hn_vfmap[ifp->if_index]->if_xname)); hn_vfmap[ifp->if_index] = NULL; } rm_wunlock(&hn_vfmap_lock); done: HN_UNLOCK(sc); } static void hn_ifnet_lnkevent(void *xsc, struct ifnet *ifp, int link_state) { struct hn_softc *sc = xsc; if (sc->hn_vf_ifp == ifp) if_link_state_change(sc->hn_ifp, link_state); } static int hn_probe(device_t dev) { if (VMBUS_PROBE_GUID(device_get_parent(dev), dev, &hn_guid) == 0) { device_set_desc(dev, "Hyper-V Network Interface"); return BUS_PROBE_DEFAULT; } return ENXIO; } static int hn_attach(device_t dev) { struct hn_softc *sc = device_get_softc(dev); struct sysctl_oid_list *child; struct sysctl_ctx_list *ctx; uint8_t eaddr[ETHER_ADDR_LEN]; struct ifnet *ifp = NULL; int error, ring_cnt, tx_ring_cnt; uint32_t mtu; sc->hn_dev = dev; sc->hn_prichan = vmbus_get_channel(dev); HN_LOCK_INIT(sc); rm_init(&sc->hn_vf_lock, "hnvf"); if (hn_xpnt_vf && hn_xpnt_vf_accbpf) sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF; /* * Initialize these tunables once. */ sc->hn_agg_size = hn_tx_agg_size; sc->hn_agg_pkts = hn_tx_agg_pkts; /* * Setup taskqueue for transmission. */ if (hn_tx_taskq_mode == HN_TX_TASKQ_M_INDEP) { int i; sc->hn_tx_taskqs = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *), M_DEVBUF, M_WAITOK); for (i = 0; i < hn_tx_taskq_cnt; ++i) { sc->hn_tx_taskqs[i] = taskqueue_create("hn_tx", M_WAITOK, taskqueue_thread_enqueue, &sc->hn_tx_taskqs[i]); taskqueue_start_threads(&sc->hn_tx_taskqs[i], 1, PI_NET, "%s tx%d", device_get_nameunit(dev), i); } } else if (hn_tx_taskq_mode == HN_TX_TASKQ_M_GLOBAL) { sc->hn_tx_taskqs = hn_tx_taskque; } /* * Setup taskqueue for mangement tasks, e.g. link status. */ sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK, taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0); taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt", device_get_nameunit(dev)); TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc); TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc); TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0, hn_netchg_status_taskfunc, sc); if (hn_xpnt_vf) { /* * Setup taskqueue for VF tasks, e.g. delayed VF bringing up. */ sc->hn_vf_taskq = taskqueue_create("hn_vf", M_WAITOK, taskqueue_thread_enqueue, &sc->hn_vf_taskq); taskqueue_start_threads(&sc->hn_vf_taskq, 1, PI_NET, "%s vf", device_get_nameunit(dev)); TIMEOUT_TASK_INIT(sc->hn_vf_taskq, &sc->hn_vf_init, 0, hn_xpnt_vf_init_taskfunc, sc); } /* * Allocate ifnet and setup its name earlier, so that if_printf * can be used by functions, which will be called after * ether_ifattach(). */ ifp = sc->hn_ifp = if_alloc(IFT_ETHER); ifp->if_softc = sc; if_initname(ifp, device_get_name(dev), device_get_unit(dev)); /* * Initialize ifmedia earlier so that it can be unconditionally * destroyed, if error happened later on. */ ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts); /* * Figure out the # of RX rings (ring_cnt) and the # of TX rings * to use (tx_ring_cnt). * * NOTE: * The # of RX rings to use is same as the # of channels to use. 
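     * For example, with hn_chan_cnt <= 0 (the default path below) on an
     * 8-CPU guest, ring_cnt becomes min(8, HN_RING_CNT_DEF_MAX), further
     * clamped to the RSS bucket count when RSS is compiled in;
     * tx_ring_cnt then follows ring_cnt unless hn_tx_ring_cnt narrows it,
     * or ifnet.if_start is used, which forces a single TX ring.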
*/ ring_cnt = hn_chan_cnt; if (ring_cnt <= 0) { /* Default */ ring_cnt = mp_ncpus; if (ring_cnt > HN_RING_CNT_DEF_MAX) ring_cnt = HN_RING_CNT_DEF_MAX; } else if (ring_cnt > mp_ncpus) { ring_cnt = mp_ncpus; } #ifdef RSS if (ring_cnt > rss_getnumbuckets()) ring_cnt = rss_getnumbuckets(); #endif tx_ring_cnt = hn_tx_ring_cnt; if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt) tx_ring_cnt = ring_cnt; #ifdef HN_IFSTART_SUPPORT if (hn_use_if_start) { /* ifnet.if_start only needs one TX ring. */ tx_ring_cnt = 1; } #endif /* * Set the leader CPU for channels. */ sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus; /* * Create enough TX/RX rings, even if only limited number of * channels can be allocated. */ error = hn_create_tx_data(sc, tx_ring_cnt); if (error) goto failed; error = hn_create_rx_data(sc, ring_cnt); if (error) goto failed; /* * Create transaction context for NVS and RNDIS transactions. */ sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev), HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0); if (sc->hn_xact == NULL) { error = ENXIO; goto failed; } /* * Install orphan handler for the revocation of this device's * primary channel. * * NOTE: * The processing order is critical here: * Install the orphan handler, _before_ testing whether this * device's primary channel has been revoked or not. */ vmbus_chan_set_orphan(sc->hn_prichan, sc->hn_xact); if (vmbus_chan_is_revoked(sc->hn_prichan)) { error = ENXIO; goto failed; } /* * Attach the synthetic parts, i.e. NVS and RNDIS. */ error = hn_synth_attach(sc, ETHERMTU); if (error) goto failed; error = hn_rndis_get_eaddr(sc, eaddr); if (error) goto failed; error = hn_rndis_get_mtu(sc, &mtu); if (error) mtu = ETHERMTU; else if (bootverbose) device_printf(dev, "RNDIS mtu %u\n", mtu); #if __FreeBSD_version >= 1100099 if (sc->hn_rx_ring_inuse > 1) { /* * Reduce TCP segment aggregation limit for multiple * RX rings to increase ACK timeliness. */ hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF); } #endif /* * Fixup TX/RX stuffs after synthetic parts are attached. 
*/ hn_fixup_tx_data(sc); hn_fixup_rx_data(sc); ctx = device_get_sysctl_ctx(dev); child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD, &sc->hn_nvs_ver, 0, "NVS version"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, hn_ndis_version_sysctl, "A", "NDIS version"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, hn_caps_sysctl, "A", "capabilities"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, hn_hwassist_sysctl, "A", "hwassist"); SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_max", CTLFLAG_RD, &ifp->if_hw_tsomax, 0, "max TSO size"); SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegcnt", CTLFLAG_RD, &ifp->if_hw_tsomaxsegcount, 0, "max # of TSO segments"); SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegsz", CTLFLAG_RD, &ifp->if_hw_tsomaxsegsize, 0, "max size of TSO segment"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, hn_rxfilter_sysctl, "A", "rxfilter"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, hn_rss_hash_sysctl, "A", "RSS hash"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hashcap", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, hn_rss_hcap_sysctl, "A", "RSS hash capabilities"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "mbuf_hash", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, hn_rss_mbuf_sysctl, "A", "RSS hash for mbufs"); SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size", CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count"); #ifndef RSS /* * Don't allow RSS key/indirect table changes, if RSS is defined. 
*/ SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key", CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, hn_rss_key_sysctl, "IU", "RSS key"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind", CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, hn_rss_ind_sysctl, "IU", "RSS indirect table"); #endif SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size", CTLFLAG_RD, &sc->hn_rndis_agg_size, 0, "RNDIS offered packet transmission aggregation size limit"); SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts", CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0, "RNDIS offered packet transmission aggregation count limit"); SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align", CTLFLAG_RD, &sc->hn_rndis_agg_align, 0, "RNDIS packet transmission aggregation alignment"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, hn_txagg_size_sysctl, "I", "Packet transmission aggregation size, 0 -- disable, -1 -- auto"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, hn_txagg_pkts_sysctl, "I", "Packet transmission aggregation packets, " "0 -- disable, -1 -- auto"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "polling", CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, hn_polling_sysctl, "I", "Polling frequency: [100,1000000], 0 disable polling"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, hn_vf_sysctl, "A", "Virtual Function's name"); if (!hn_xpnt_vf) { SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxvf", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, hn_rxvf_sysctl, "A", "activated Virtual Function's name"); } else { SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_enabled", CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, hn_xpnt_vf_enabled_sysctl, "I", "Transparent VF enabled"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_accbpf", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, hn_xpnt_vf_accbpf_sysctl, "I", "Accurate BPF for transparent VF"); } /* * Setup the ifmedia, which has been initialized earlier. */ ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL); ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO); /* XXX ifmedia_set really should do this for us */ sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media; /* * Setup the ifnet for this interface. */ ifp->if_baudrate = IF_Gbps(10); - ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST | - IFF_NEEDSEPOCH; + ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; ifp->if_ioctl = hn_ioctl; ifp->if_init = hn_init; #ifdef HN_IFSTART_SUPPORT if (hn_use_if_start) { int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]); ifp->if_start = hn_start; IFQ_SET_MAXLEN(&ifp->if_snd, qdepth); ifp->if_snd.ifq_drv_maxlen = qdepth - 1; IFQ_SET_READY(&ifp->if_snd); } else #endif { ifp->if_transmit = hn_transmit; ifp->if_qflush = hn_xmit_qflush; } ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_LRO | IFCAP_LINKSTATE; #ifdef foo /* We can't diff IPv6 packets from IPv4 packets on RX path. */ ifp->if_capabilities |= IFCAP_RXCSUM_IPV6; #endif if (sc->hn_caps & HN_CAP_VLAN) { /* XXX not sure about VLAN_MTU. 
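     * IFCAP_VLAN_HWTAGGING itself is safe here: VLAN tags are carried
     * out-of-band as NDIS_PKTINFO_TYPE_VLAN pktinfo on TX (see hn_encap())
     * and as info->vlan_info on RX, not inside the frame payload.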
*/ ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU; } ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist; if (ifp->if_hwassist & HN_CSUM_IP_MASK) ifp->if_capabilities |= IFCAP_TXCSUM; if (ifp->if_hwassist & HN_CSUM_IP6_MASK) ifp->if_capabilities |= IFCAP_TXCSUM_IPV6; if (sc->hn_caps & HN_CAP_TSO4) { ifp->if_capabilities |= IFCAP_TSO4; ifp->if_hwassist |= CSUM_IP_TSO; } if (sc->hn_caps & HN_CAP_TSO6) { ifp->if_capabilities |= IFCAP_TSO6; ifp->if_hwassist |= CSUM_IP6_TSO; } /* Enable all available capabilities by default. */ ifp->if_capenable = ifp->if_capabilities; /* * Disable IPv6 TSO and TXCSUM by default, they still can * be enabled through SIOCSIFCAP. */ ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6 | IFCAP_TSO6); ifp->if_hwassist &= ~(HN_CSUM_IP6_MASK | CSUM_IP6_TSO); if (ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) { /* * Lock hn_set_tso_maxsize() to simplify its * internal logic. */ HN_LOCK(sc); hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU); HN_UNLOCK(sc); ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX; ifp->if_hw_tsomaxsegsize = PAGE_SIZE; } ether_ifattach(ifp, eaddr); if ((ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) { if_printf(ifp, "TSO segcnt %u segsz %u\n", ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize); } if (mtu < ETHERMTU) { if_printf(ifp, "fixup mtu %u -> %u\n", ifp->if_mtu, mtu); ifp->if_mtu = mtu; } /* Inform the upper layer about the long frame support. */ ifp->if_hdrlen = sizeof(struct ether_vlan_header); /* * Kick off link status check. */ sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0; hn_update_link_status(sc); if (!hn_xpnt_vf) { sc->hn_ifnet_evthand = EVENTHANDLER_REGISTER(ifnet_event, hn_ifnet_event, sc, EVENTHANDLER_PRI_ANY); sc->hn_ifaddr_evthand = EVENTHANDLER_REGISTER(ifaddr_event, hn_ifaddr_event, sc, EVENTHANDLER_PRI_ANY); } else { sc->hn_ifnet_lnkhand = EVENTHANDLER_REGISTER(ifnet_link_event, hn_ifnet_lnkevent, sc, EVENTHANDLER_PRI_ANY); } /* * NOTE: * Subscribe ether_ifattach event, instead of ifnet_arrival event, * since interface's LLADDR is needed; interface LLADDR is not * available when ifnet_arrival event is triggered. */ sc->hn_ifnet_atthand = EVENTHANDLER_REGISTER(ether_ifattach_event, hn_ifnet_attevent, sc, EVENTHANDLER_PRI_ANY); sc->hn_ifnet_dethand = EVENTHANDLER_REGISTER(ifnet_departure_event, hn_ifnet_detevent, sc, EVENTHANDLER_PRI_ANY); return (0); failed: if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) hn_synth_detach(sc); hn_detach(dev); return (error); } static int hn_detach(device_t dev) { struct hn_softc *sc = device_get_softc(dev); struct ifnet *ifp = sc->hn_ifp, *vf_ifp; if (sc->hn_xact != NULL && vmbus_chan_is_revoked(sc->hn_prichan)) { /* * In case that the vmbus missed the orphan handler * installation. 
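         * Orphaning the transaction context here should wake up any
         * thread still blocked on an NVS/RNDIS exchange over the revoked
         * primary channel, so detach cannot get stuck waiting for a
         * reply that will never arrive (best-effort reading of the
         * vmbus_xact API).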
         */
        vmbus_xact_ctx_orphan(sc->hn_xact);
    }

    if (sc->hn_ifaddr_evthand != NULL)
        EVENTHANDLER_DEREGISTER(ifaddr_event, sc->hn_ifaddr_evthand);
    if (sc->hn_ifnet_evthand != NULL)
        EVENTHANDLER_DEREGISTER(ifnet_event, sc->hn_ifnet_evthand);
    if (sc->hn_ifnet_atthand != NULL) {
        EVENTHANDLER_DEREGISTER(ether_ifattach_event,
            sc->hn_ifnet_atthand);
    }
    if (sc->hn_ifnet_dethand != NULL) {
        EVENTHANDLER_DEREGISTER(ifnet_departure_event,
            sc->hn_ifnet_dethand);
    }
    if (sc->hn_ifnet_lnkhand != NULL)
        EVENTHANDLER_DEREGISTER(ifnet_link_event, sc->hn_ifnet_lnkhand);

    vf_ifp = sc->hn_vf_ifp;
    __compiler_membar();
    if (vf_ifp != NULL)
        hn_ifnet_detevent(sc, vf_ifp);

    if (device_is_attached(dev)) {
        HN_LOCK(sc);
        if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
            if (ifp->if_drv_flags & IFF_DRV_RUNNING)
                hn_stop(sc, true);
            /*
             * NOTE:
             * hn_stop() only suspends data, so management
             * tasks have to be suspended manually here.
             */
            hn_suspend_mgmt(sc);
            hn_synth_detach(sc);
        }
        HN_UNLOCK(sc);
        ether_ifdetach(ifp);
    }

    ifmedia_removeall(&sc->hn_media);
    hn_destroy_rx_data(sc);
    hn_destroy_tx_data(sc);

    if (sc->hn_tx_taskqs != NULL && sc->hn_tx_taskqs != hn_tx_taskque) {
        int i;

        for (i = 0; i < hn_tx_taskq_cnt; ++i)
            taskqueue_free(sc->hn_tx_taskqs[i]);
        free(sc->hn_tx_taskqs, M_DEVBUF);
    }
    taskqueue_free(sc->hn_mgmt_taskq0);
    if (sc->hn_vf_taskq != NULL)
        taskqueue_free(sc->hn_vf_taskq);

    if (sc->hn_xact != NULL) {
        /*
         * Uninstall the orphan handler _before_ the xact is
         * destructed.
         */
        vmbus_chan_unset_orphan(sc->hn_prichan);
        vmbus_xact_ctx_destroy(sc->hn_xact);
    }

    if_free(ifp);

    HN_LOCK_DESTROY(sc);
    rm_destroy(&sc->hn_vf_lock);
    return (0);
}

static int
hn_shutdown(device_t dev)
{

    return (0);
}

static void
hn_link_status(struct hn_softc *sc)
{
    uint32_t link_status;
    int error;

    error = hn_rndis_get_linkstatus(sc, &link_status);
    if (error) {
        /* XXX what to do? */
        return;
    }

    if (link_status == NDIS_MEDIA_STATE_CONNECTED)
        sc->hn_link_flags |= HN_LINK_FLAG_LINKUP;
    else
        sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
    if_link_state_change(sc->hn_ifp,
        (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ?
        LINK_STATE_UP : LINK_STATE_DOWN);
}

static void
hn_link_taskfunc(void *xsc, int pending __unused)
{
    struct hn_softc *sc = xsc;

    if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
        return;
    hn_link_status(sc);
}

static void
hn_netchg_init_taskfunc(void *xsc, int pending __unused)
{
    struct hn_softc *sc = xsc;

    /* Prevent any link status checks from running. */
    sc->hn_link_flags |= HN_LINK_FLAG_NETCHG;

    /*
     * Fake up a [link down --> link up] state change; a 5 second
     * delay is used, which closely simulates miibus reaction upon
     * a link down event.
     */
    sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
    if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN);
    taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0,
        &sc->hn_netchg_status, 5 * hz);
}

static void
hn_netchg_status_taskfunc(void *xsc, int pending __unused)
{
    struct hn_softc *sc = xsc;

    /* Re-allow link status checks. */
    sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG;
    hn_link_status(sc);
}

static void
hn_update_link_status(struct hn_softc *sc)
{

    if (sc->hn_mgmt_taskq != NULL)
        taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task);
}

static void
hn_change_network(struct hn_softc *sc)
{

    if (sc->hn_mgmt_taskq != NULL)
        taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init);
}

static __inline int
hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd,
    struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs)
{
    struct mbuf *m = *m_head;
    int error;

    KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim"));

    error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap,
        m, segs, nsegs, BUS_DMA_NOWAIT);
    if (error == EFBIG) {
        struct mbuf *m_new;

        m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX);
        if (m_new == NULL)
            return ENOBUFS;
        else
            *m_head = m = m_new;
        txr->hn_tx_collapsed++;

        error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag,
            txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT);
    }
    if (!error) {
        bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap,
            BUS_DMASYNC_PREWRITE);
        txd->flags |= HN_TXD_FLAG_DMAMAP;
    }
    return error;
}

static __inline int
hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{

    KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0,
        ("put an onlist txd %#x", txd->flags));
    KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
        ("put an onagg txd %#x", txd->flags));

    KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
    if (atomic_fetchadd_int(&txd->refs, -1) != 1)
        return 0;

    if (!STAILQ_EMPTY(&txd->agg_list)) {
        struct hn_txdesc *tmp_txd;

        while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) {
            int freed;

            KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list),
                ("recursive aggregation on aggregated txdesc"));
            KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG),
                ("not aggregated txdesc"));
            KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
                ("aggregated txdesc uses dmamap"));
            KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
                ("aggregated txdesc consumes "
                 "chimney sending buffer"));
            KASSERT(tmp_txd->chim_size == 0,
                ("aggregated txdesc has non-zero "
                 "chimney sending size"));

            STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link);
            tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG;
            freed = hn_txdesc_put(txr, tmp_txd);
            KASSERT(freed, ("failed to free aggregated txdesc"));
        }
    }

    if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) {
        KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
            ("chim txd uses dmamap"));
        hn_chim_free(txr->hn_sc, txd->chim_index);
        txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
        txd->chim_size = 0;
    } else if (txd->flags & HN_TXD_FLAG_DMAMAP) {
        bus_dmamap_sync(txr->hn_tx_data_dtag,
            txd->data_dmap, BUS_DMASYNC_POSTWRITE);
        bus_dmamap_unload(txr->hn_tx_data_dtag,
            txd->data_dmap);
        txd->flags &= ~HN_TXD_FLAG_DMAMAP;
    }

    if (txd->m != NULL) {
        m_freem(txd->m);
        txd->m = NULL;
    }

    txd->flags |= HN_TXD_FLAG_ONLIST;
#ifndef HN_USE_TXDESC_BUFRING
    mtx_lock_spin(&txr->hn_txlist_spin);
    KASSERT(txr->hn_txdesc_avail >= 0 &&
        txr->hn_txdesc_avail < txr->hn_txdesc_cnt,
        ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail));
    txr->hn_txdesc_avail++;
    SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
    mtx_unlock_spin(&txr->hn_txlist_spin);
#else   /* HN_USE_TXDESC_BUFRING */
#ifdef HN_DEBUG
    atomic_add_int(&txr->hn_txdesc_avail, 1);
#endif
    buf_ring_enqueue(txr->hn_txdesc_br, txd);
#endif  /* !HN_USE_TXDESC_BUFRING */

    return 1;
}

static __inline struct hn_txdesc *
hn_txdesc_get(struct hn_tx_ring *txr)
{
    struct hn_txdesc *txd;

#ifndef HN_USE_TXDESC_BUFRING
    mtx_lock_spin(&txr->hn_txlist_spin);
    txd = SLIST_FIRST(&txr->hn_txlist);
    if
(txd != NULL) { KASSERT(txr->hn_txdesc_avail > 0, ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail)); txr->hn_txdesc_avail--; SLIST_REMOVE_HEAD(&txr->hn_txlist, link); } mtx_unlock_spin(&txr->hn_txlist_spin); #else txd = buf_ring_dequeue_sc(txr->hn_txdesc_br); #endif if (txd != NULL) { #ifdef HN_USE_TXDESC_BUFRING #ifdef HN_DEBUG atomic_subtract_int(&txr->hn_txdesc_avail, 1); #endif #endif /* HN_USE_TXDESC_BUFRING */ KASSERT(txd->m == NULL && txd->refs == 0 && STAILQ_EMPTY(&txd->agg_list) && txd->chim_index == HN_NVS_CHIM_IDX_INVALID && txd->chim_size == 0 && (txd->flags & HN_TXD_FLAG_ONLIST) && (txd->flags & HN_TXD_FLAG_ONAGG) == 0 && (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd")); txd->flags &= ~HN_TXD_FLAG_ONLIST; txd->refs = 1; } return txd; } static __inline void hn_txdesc_hold(struct hn_txdesc *txd) { /* 0->1 transition will never work */ KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs)); atomic_add_int(&txd->refs, 1); } static __inline void hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd) { KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0, ("recursive aggregation on aggregating txdesc")); KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0, ("already aggregated")); KASSERT(STAILQ_EMPTY(&txd->agg_list), ("recursive aggregation on to-be-aggregated txdesc")); txd->flags |= HN_TXD_FLAG_ONAGG; STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link); } static bool hn_tx_ring_pending(struct hn_tx_ring *txr) { bool pending = false; #ifndef HN_USE_TXDESC_BUFRING mtx_lock_spin(&txr->hn_txlist_spin); if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt) pending = true; mtx_unlock_spin(&txr->hn_txlist_spin); #else if (!buf_ring_full(txr->hn_txdesc_br)) pending = true; #endif return (pending); } static __inline void hn_txeof(struct hn_tx_ring *txr) { txr->hn_has_txeof = 0; txr->hn_txeof(txr); } static void hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc, struct vmbus_channel *chan, const void *data __unused, int dlen __unused) { struct hn_txdesc *txd = sndc->hn_cbarg; struct hn_tx_ring *txr; txr = txd->txr; KASSERT(txr->hn_chan == chan, ("channel mismatch, on chan%u, should be chan%u", vmbus_chan_id(chan), vmbus_chan_id(txr->hn_chan))); txr->hn_has_txeof = 1; hn_txdesc_put(txr, txd); ++txr->hn_txdone_cnt; if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) { txr->hn_txdone_cnt = 0; if (txr->hn_oactive) hn_txeof(txr); } } static void hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr) { #if defined(INET) || defined(INET6) tcp_lro_flush_all(&rxr->hn_lro); #endif /* * NOTE: * 'txr' could be NULL, if multiple channels and * ifnet.if_start method are enabled. */ if (txr == NULL || !txr->hn_has_txeof) return; txr->hn_txdone_cnt = 0; hn_txeof(txr); } static __inline uint32_t hn_rndis_pktmsg_offset(uint32_t ofs) { KASSERT(ofs >= sizeof(struct rndis_packet_msg), ("invalid RNDIS packet msg offset %u", ofs)); return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset)); } static __inline void * hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize, size_t pi_dlen, uint32_t pi_type) { const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen); struct rndis_pktinfo *pi; KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0, ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen)); /* * Per-packet-info does not move; it only grows. * * NOTE: * rm_pktinfooffset in this phase counts from the beginning * of rndis_packet_msg. 
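     * hn_encap() later converts it (and rm_dataoffset) with
     * hn_rndis_pktmsg_offset(), which rebases the offsets so they are
     * relative to the rm_dataoffset field, which appears to be how the
     * host interprets them.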
*/ KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize, ("%u pktinfo overflows RNDIS packet msg", pi_type)); pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset + pkt->rm_pktinfolen); pkt->rm_pktinfolen += pi_size; pi->rm_size = pi_size; pi->rm_type = pi_type; pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET; return (pi->rm_data); } static __inline int hn_flush_txagg(struct ifnet *ifp, struct hn_tx_ring *txr) { struct hn_txdesc *txd; struct mbuf *m; int error, pkts; txd = txr->hn_agg_txd; KASSERT(txd != NULL, ("no aggregate txdesc")); /* * Since hn_txpkt() will reset this temporary stat, save * it now, so that oerrors can be updated properly, if * hn_txpkt() ever fails. */ pkts = txr->hn_stat_pkts; /* * Since txd's mbuf will _not_ be freed upon hn_txpkt() * failure, save it for later freeing, if hn_txpkt() ever * fails. */ m = txd->m; error = hn_txpkt(ifp, txr, txd); if (__predict_false(error)) { /* txd is freed, but m is not. */ m_freem(m); txr->hn_flush_failed++; if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts); } /* Reset all aggregation states. */ txr->hn_agg_txd = NULL; txr->hn_agg_szleft = 0; txr->hn_agg_pktleft = 0; txr->hn_agg_prevpkt = NULL; return (error); } static void * hn_try_txagg(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd, int pktsize) { void *chim; if (txr->hn_agg_txd != NULL) { if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) { struct hn_txdesc *agg_txd = txr->hn_agg_txd; struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt; int olen; /* * Update the previous RNDIS packet's total length, * it can be increased due to the mandatory alignment * padding for this RNDIS packet. And update the * aggregating txdesc's chimney sending buffer size * accordingly. * * XXX * Zero-out the padding, as required by the RNDIS spec. */ olen = pkt->rm_len; pkt->rm_len = roundup2(olen, txr->hn_agg_align); agg_txd->chim_size += pkt->rm_len - olen; /* Link this txdesc to the parent. */ hn_txdesc_agg(agg_txd, txd); chim = (uint8_t *)pkt + pkt->rm_len; /* Save the current packet for later fixup. */ txr->hn_agg_prevpkt = chim; txr->hn_agg_pktleft--; txr->hn_agg_szleft -= pktsize; if (txr->hn_agg_szleft <= HN_PKTSIZE_MIN(txr->hn_agg_align)) { /* * Probably can't aggregate more packets, * flush this aggregating txdesc proactively. */ txr->hn_agg_pktleft = 0; } /* Done! */ return (chim); } hn_flush_txagg(ifp, txr); } KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc")); txr->hn_tx_chimney_tried++; txd->chim_index = hn_chim_alloc(txr->hn_sc); if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID) return (NULL); txr->hn_tx_chimney++; chim = txr->hn_sc->hn_chim + (txd->chim_index * txr->hn_sc->hn_chim_szmax); if (txr->hn_agg_pktmax > 1 && txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) { txr->hn_agg_txd = txd; txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1; txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize; txr->hn_agg_prevpkt = chim; } return (chim); } /* * NOTE: * If this function fails, then both txd and m_head0 will be freed. 
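 * This is the opposite contract from hn_txpkt() below, which frees the
 * txd but leaves the mbuf alone; that asymmetry is why hn_flush_txagg()
 * saves txd->m before calling hn_txpkt().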
*/ static int hn_encap(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd, struct mbuf **m_head0) { bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX]; int error, nsegs, i; struct mbuf *m_head = *m_head0; struct rndis_packet_msg *pkt; uint32_t *pi_data; void *chim = NULL; int pkt_hlen, pkt_size; pkt = txd->rndis_pkt; pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align); if (pkt_size < txr->hn_chim_size) { chim = hn_try_txagg(ifp, txr, txd, pkt_size); if (chim != NULL) pkt = chim; } else { if (txr->hn_agg_txd != NULL) hn_flush_txagg(ifp, txr); } pkt->rm_type = REMOTE_NDIS_PACKET_MSG; pkt->rm_len = m_head->m_pkthdr.len; pkt->rm_dataoffset = 0; pkt->rm_datalen = m_head->m_pkthdr.len; pkt->rm_oobdataoffset = 0; pkt->rm_oobdatalen = 0; pkt->rm_oobdataelements = 0; pkt->rm_pktinfooffset = sizeof(*pkt); pkt->rm_pktinfolen = 0; pkt->rm_vchandle = 0; pkt->rm_reserved = 0; if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) { /* * Set the hash value for this packet, so that the host could * dispatch the TX done event for this packet back to this TX * ring's channel. */ pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL); *pi_data = txr->hn_tx_idx; } if (m_head->m_flags & M_VLANTAG) { pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN); *pi_data = NDIS_VLAN_INFO_MAKE( EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag), EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag), EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag)); } if (m_head->m_pkthdr.csum_flags & CSUM_TSO) { #if defined(INET6) || defined(INET) pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO); #ifdef INET if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) { *pi_data = NDIS_LSO2_INFO_MAKEIPV4( m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen, m_head->m_pkthdr.tso_segsz); } #endif #if defined(INET6) && defined(INET) else #endif #ifdef INET6 { *pi_data = NDIS_LSO2_INFO_MAKEIPV6( m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen, m_head->m_pkthdr.tso_segsz); } #endif #endif /* INET6 || INET */ } else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) { pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM); if (m_head->m_pkthdr.csum_flags & (CSUM_IP6_TCP | CSUM_IP6_UDP)) { *pi_data = NDIS_TXCSUM_INFO_IPV6; } else { *pi_data = NDIS_TXCSUM_INFO_IPV4; if (m_head->m_pkthdr.csum_flags & CSUM_IP) *pi_data |= NDIS_TXCSUM_INFO_IPCS; } if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP6_TCP)) { *pi_data |= NDIS_TXCSUM_INFO_MKTCPCS( m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen); } else if (m_head->m_pkthdr.csum_flags & (CSUM_IP_UDP | CSUM_IP6_UDP)) { *pi_data |= NDIS_TXCSUM_INFO_MKUDPCS( m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen); } } pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen; /* Fixup RNDIS packet message total length */ pkt->rm_len += pkt_hlen; /* Convert RNDIS packet message offsets */ pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt_hlen); pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset); /* * Fast path: Chimney sending. 
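     * The RNDIS header was built directly in the pre-posted chimney
     * buffer (pkt == chim) and the payload is copied in below, so no
     * gather list or DMA map load is needed; only the chimney
     * index/size is handed to the host.  The slow path afterwards
     * builds a vmbus_gpa gather list instead.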
*/ if (chim != NULL) { struct hn_txdesc *tgt_txd = txd; if (txr->hn_agg_txd != NULL) { tgt_txd = txr->hn_agg_txd; #ifdef INVARIANTS *m_head0 = NULL; #endif } KASSERT(pkt == chim, ("RNDIS pkt not in chimney sending buffer")); KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID, ("chimney sending buffer is not used")); tgt_txd->chim_size += pkt->rm_len; m_copydata(m_head, 0, m_head->m_pkthdr.len, ((uint8_t *)chim) + pkt_hlen); txr->hn_gpa_cnt = 0; txr->hn_sendpkt = hn_txpkt_chim; goto done; } KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc")); KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("chimney buffer is used")); KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc")); error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs); if (__predict_false(error)) { int freed; /* * This mbuf is not linked w/ the txd yet, so free it now. */ m_freem(m_head); *m_head0 = NULL; freed = hn_txdesc_put(txr, txd); KASSERT(freed != 0, ("fail to free txd upon txdma error")); txr->hn_txdma_failed++; if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return error; } *m_head0 = m_head; /* +1 RNDIS packet message */ txr->hn_gpa_cnt = nsegs + 1; /* send packet with page buffer */ txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr); txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK; txr->hn_gpa[0].gpa_len = pkt_hlen; /* * Fill the page buffers with mbuf info after the page * buffer for RNDIS packet message. */ for (i = 0; i < nsegs; ++i) { struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1]; gpa->gpa_page = atop(segs[i].ds_addr); gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK; gpa->gpa_len = segs[i].ds_len; } txd->chim_index = HN_NVS_CHIM_IDX_INVALID; txd->chim_size = 0; txr->hn_sendpkt = hn_txpkt_sglist; done: txd->m = m_head; /* Set the completion routine */ hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd); /* Update temporary stats for later use. */ txr->hn_stat_pkts++; txr->hn_stat_size += m_head->m_pkthdr.len; if (m_head->m_flags & M_MCAST) txr->hn_stat_mcasts++; return 0; } /* * NOTE: * If this function fails, then txd will be freed, but the mbuf * associated w/ the txd will _not_ be freed. */ static int hn_txpkt(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd) { int error, send_failed = 0, has_bpf; again: has_bpf = bpf_peers_present(ifp->if_bpf); if (has_bpf) { /* * Make sure that this txd and any aggregated txds are not * freed before ETHER_BPF_MTAP. */ hn_txdesc_hold(txd); } error = txr->hn_sendpkt(txr, txd); if (!error) { if (has_bpf) { const struct hn_txdesc *tmp_txd; ETHER_BPF_MTAP(ifp, txd->m); STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link) ETHER_BPF_MTAP(ifp, tmp_txd->m); } if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts); #ifdef HN_IFSTART_SUPPORT if (!hn_use_if_start) #endif { if_inc_counter(ifp, IFCOUNTER_OBYTES, txr->hn_stat_size); if (txr->hn_stat_mcasts != 0) { if_inc_counter(ifp, IFCOUNTER_OMCASTS, txr->hn_stat_mcasts); } } txr->hn_pkts += txr->hn_stat_pkts; txr->hn_sends++; } if (has_bpf) hn_txdesc_put(txr, txd); if (__predict_false(error)) { int freed; /* * This should "really rarely" happen. * * XXX Too many RX to be acked or too many sideband * commands to run? Ask netvsc_channel_rollup() * to kick start later. */ txr->hn_has_txeof = 1; if (!send_failed) { txr->hn_send_failed++; send_failed = 1; /* * Try sending again after set hn_has_txeof; * in case that we missed the last * netvsc_channel_rollup(). 
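             * Only a single retry is attempted: send_failed is set
             * before the goto, so a second failure falls through to
             * the error path below.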
*/ goto again; } if_printf(ifp, "send failed\n"); /* * Caller will perform further processing on the * associated mbuf, so don't free it in hn_txdesc_put(); * only unload it from the DMA map in hn_txdesc_put(), * if it was loaded. */ txd->m = NULL; freed = hn_txdesc_put(txr, txd); KASSERT(freed != 0, ("fail to free txd upon send error")); txr->hn_send_failed++; } /* Reset temporary stats, after this sending is done. */ txr->hn_stat_size = 0; txr->hn_stat_pkts = 0; txr->hn_stat_mcasts = 0; return (error); } /* * Append the specified data to the indicated mbuf chain, * Extend the mbuf chain if the new data does not fit in * existing space. * * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c. * There should be an equivalent in the kernel mbuf code, * but there does not appear to be one yet. * * Differs from m_append() in that additional mbufs are * allocated with cluster size MJUMPAGESIZE, and filled * accordingly. * * Return 1 if able to complete the job; otherwise 0. */ static int hv_m_append(struct mbuf *m0, int len, c_caddr_t cp) { struct mbuf *m, *n; int remainder, space; for (m = m0; m->m_next != NULL; m = m->m_next) ; remainder = len; space = M_TRAILINGSPACE(m); if (space > 0) { /* * Copy into available space. */ if (space > remainder) space = remainder; bcopy(cp, mtod(m, caddr_t) + m->m_len, space); m->m_len += space; cp += space; remainder -= space; } while (remainder > 0) { /* * Allocate a new mbuf; could check space * and allocate a cluster instead. */ n = m_getjcl(M_NOWAIT, m->m_type, 0, MJUMPAGESIZE); if (n == NULL) break; n->m_len = min(MJUMPAGESIZE, remainder); bcopy(cp, mtod(n, caddr_t), n->m_len); cp += n->m_len; remainder -= n->m_len; m->m_next = n; m = n; } if (m0->m_flags & M_PKTHDR) m0->m_pkthdr.len += len - remainder; return (remainder == 0); } #if defined(INET) || defined(INET6) static __inline int hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m) { #if __FreeBSD_version >= 1100095 if (hn_lro_mbufq_depth) { tcp_lro_queue_mbuf(lc, m); return 0; } #endif return tcp_lro_rx(lc, m, 0); } #endif static int hn_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen, const struct hn_rxinfo *info) { struct ifnet *ifp, *hn_ifp = rxr->hn_ifp; struct mbuf *m_new; int size, do_lro = 0, do_csum = 1, is_vf = 0; int hash_type = M_HASHTYPE_NONE; int l3proto = ETHERTYPE_MAX, l4proto = IPPROTO_DONE; ifp = hn_ifp; if (rxr->hn_rxvf_ifp != NULL) { /* * Non-transparent mode VF; pretend this packet is from * the VF. */ ifp = rxr->hn_rxvf_ifp; is_vf = 1; } else if (rxr->hn_rx_flags & HN_RX_FLAG_XPNT_VF) { /* Transparent mode VF. */ is_vf = 1; } if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) { /* * NOTE: * See the NOTE of hn_rndis_init_fixat(). This * function can be reached, immediately after the * RNDIS is initialized but before the ifnet is * setup on the hn_attach() path; drop the unexpected * packets. */ return (0); } if (__predict_false(dlen < ETHER_HDR_LEN)) { if_inc_counter(hn_ifp, IFCOUNTER_IERRORS, 1); return (0); } if (dlen <= MHLEN) { m_new = m_gethdr(M_NOWAIT, MT_DATA); if (m_new == NULL) { if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1); return (0); } memcpy(mtod(m_new, void *), data, dlen); m_new->m_pkthdr.len = m_new->m_len = dlen; rxr->hn_small_pkts++; } else { /* * Get an mbuf with a cluster. For packets 2K or less, * get a standard 2K cluster. For anything larger, get a * 4K cluster. Any buffers larger than 4K can cause problems * if looped around to the Hyper-V TX channel, so avoid them. 
*/ size = MCLBYTES; if (dlen > MCLBYTES) { /* 4096 */ size = MJUMPAGESIZE; } m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size); if (m_new == NULL) { if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1); return (0); } hv_m_append(m_new, dlen, data); } m_new->m_pkthdr.rcvif = ifp; if (__predict_false((hn_ifp->if_capenable & IFCAP_RXCSUM) == 0)) do_csum = 0; /* receive side checksum offload */ if (info->csum_info != HN_NDIS_RXCSUM_INFO_INVALID) { /* IP csum offload */ if ((info->csum_info & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) { m_new->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID); rxr->hn_csum_ip++; } /* TCP/UDP csum offload */ if ((info->csum_info & (NDIS_RXCSUM_INFO_UDPCS_OK | NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) { m_new->m_pkthdr.csum_flags |= (CSUM_DATA_VALID | CSUM_PSEUDO_HDR); m_new->m_pkthdr.csum_data = 0xffff; if (info->csum_info & NDIS_RXCSUM_INFO_TCPCS_OK) rxr->hn_csum_tcp++; else rxr->hn_csum_udp++; } /* * XXX * As of this write (Oct 28th, 2016), host side will turn * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so * the do_lro setting here is actually _not_ accurate. We * depend on the RSS hash type check to reset do_lro. */ if ((info->csum_info & (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) == (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) do_lro = 1; } else { hn_rxpkt_proto(m_new, &l3proto, &l4proto); if (l3proto == ETHERTYPE_IP) { if (l4proto == IPPROTO_TCP) { if (do_csum && (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_TCP)) { rxr->hn_csum_trusted++; m_new->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR); m_new->m_pkthdr.csum_data = 0xffff; } do_lro = 1; } else if (l4proto == IPPROTO_UDP) { if (do_csum && (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_UDP)) { rxr->hn_csum_trusted++; m_new->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR); m_new->m_pkthdr.csum_data = 0xffff; } } else if (l4proto != IPPROTO_DONE && do_csum && (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) { rxr->hn_csum_trusted++; m_new->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID); } } } if (info->vlan_info != HN_NDIS_VLAN_INFO_INVALID) { m_new->m_pkthdr.ether_vtag = EVL_MAKETAG( NDIS_VLAN_INFO_ID(info->vlan_info), NDIS_VLAN_INFO_PRI(info->vlan_info), NDIS_VLAN_INFO_CFI(info->vlan_info)); m_new->m_flags |= M_VLANTAG; } /* * If VF is activated (tranparent/non-transparent mode does not * matter here). * * - Disable LRO * * hn(4) will only receive broadcast packets, multicast packets, * TCP SYN and SYN|ACK (in Azure), LRO is useless for these * packet types. * * For non-transparent, we definitely _cannot_ enable LRO at * all, since the LRO flush will use hn(4) as the receiving * interface; i.e. hn_ifp->if_input(hn_ifp, m). */ if (is_vf) do_lro = 0; /* * If VF is activated (tranparent/non-transparent mode does not * matter here), do _not_ mess with unsupported hash types or * functions. */ if (info->hash_info != HN_NDIS_HASH_INFO_INVALID) { rxr->hn_rss_pkts++; m_new->m_pkthdr.flowid = info->hash_value; if (!is_vf) hash_type = M_HASHTYPE_OPAQUE_HASH; if ((info->hash_info & NDIS_HASH_FUNCTION_MASK) == NDIS_HASH_FUNCTION_TOEPLITZ) { uint32_t type = (info->hash_info & NDIS_HASH_TYPE_MASK & rxr->hn_mbuf_hash); /* * NOTE: * do_lro is resetted, if the hash types are not TCP * related. See the comment in the above csum_flags * setup section. 
*/ switch (type) { case NDIS_HASH_IPV4: hash_type = M_HASHTYPE_RSS_IPV4; do_lro = 0; break; case NDIS_HASH_TCP_IPV4: hash_type = M_HASHTYPE_RSS_TCP_IPV4; if (rxr->hn_rx_flags & HN_RX_FLAG_UDP_HASH) { int def_htype = M_HASHTYPE_OPAQUE_HASH; if (is_vf) def_htype = M_HASHTYPE_NONE; /* * UDP 4-tuple hash is delivered as * TCP 4-tuple hash. */ if (l3proto == ETHERTYPE_MAX) { hn_rxpkt_proto(m_new, &l3proto, &l4proto); } if (l3proto == ETHERTYPE_IP) { if (l4proto == IPPROTO_UDP && (rxr->hn_mbuf_hash & NDIS_HASH_UDP_IPV4_X)) { hash_type = M_HASHTYPE_RSS_UDP_IPV4; do_lro = 0; } else if (l4proto != IPPROTO_TCP) { hash_type = def_htype; do_lro = 0; } } else { hash_type = def_htype; do_lro = 0; } } break; case NDIS_HASH_IPV6: hash_type = M_HASHTYPE_RSS_IPV6; do_lro = 0; break; case NDIS_HASH_IPV6_EX: hash_type = M_HASHTYPE_RSS_IPV6_EX; do_lro = 0; break; case NDIS_HASH_TCP_IPV6: hash_type = M_HASHTYPE_RSS_TCP_IPV6; break; case NDIS_HASH_TCP_IPV6_EX: hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX; break; } } } else if (!is_vf) { m_new->m_pkthdr.flowid = rxr->hn_rx_idx; hash_type = M_HASHTYPE_OPAQUE; } M_HASHTYPE_SET(m_new, hash_type); if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); if (hn_ifp != ifp) { const struct ether_header *eh; /* * Non-transparent mode VF is activated. */ /* * Allow tapping on hn(4). */ ETHER_BPF_MTAP(hn_ifp, m_new); /* * Update hn(4)'s stats. */ if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1); if_inc_counter(hn_ifp, IFCOUNTER_IBYTES, m_new->m_pkthdr.len); /* Checked at the beginning of this function. */ KASSERT(m_new->m_len >= ETHER_HDR_LEN, ("not ethernet frame")); eh = mtod(m_new, struct ether_header *); if (ETHER_IS_MULTICAST(eh->ether_dhost)) if_inc_counter(hn_ifp, IFCOUNTER_IMCASTS, 1); } rxr->hn_pkts++; if ((hn_ifp->if_capenable & IFCAP_LRO) && do_lro) { #if defined(INET) || defined(INET6) struct lro_ctrl *lro = &rxr->hn_lro; if (lro->lro_cnt) { rxr->hn_lro_tried++; if (hn_lro_rx(lro, m_new) == 0) { /* DONE! */ return 0; } } #endif } ifp->if_input(ifp, m_new); return (0); } static int hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct hn_softc *sc = ifp->if_softc; struct ifreq *ifr = (struct ifreq *)data, ifr_vf; struct ifnet *vf_ifp; int mask, error = 0; struct ifrsskey *ifrk; struct ifrsshash *ifrh; uint32_t mtu; switch (cmd) { case SIOCSIFMTU: if (ifr->ifr_mtu > HN_MTU_MAX) { error = EINVAL; break; } HN_LOCK(sc); if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) { HN_UNLOCK(sc); break; } if ((sc->hn_caps & HN_CAP_MTU) == 0) { /* Can't change MTU */ HN_UNLOCK(sc); error = EOPNOTSUPP; break; } if (ifp->if_mtu == ifr->ifr_mtu) { HN_UNLOCK(sc); break; } if (hn_xpnt_vf_isready(sc)) { vf_ifp = sc->hn_vf_ifp; ifr_vf = *ifr; strlcpy(ifr_vf.ifr_name, vf_ifp->if_xname, sizeof(ifr_vf.ifr_name)); error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU, (caddr_t)&ifr_vf); if (error) { HN_UNLOCK(sc); if_printf(ifp, "%s SIOCSIFMTU %d failed: %d\n", vf_ifp->if_xname, ifr->ifr_mtu, error); break; } } /* * Suspend this interface before the synthetic parts * are ripped. */ hn_suspend(sc); /* * Detach the synthetics parts, i.e. NVS and RNDIS. */ hn_synth_detach(sc); /* * Reattach the synthetic parts, i.e. NVS and RNDIS, * with the new MTU setting. */ error = hn_synth_attach(sc, ifr->ifr_mtu); if (error) { HN_UNLOCK(sc); break; } error = hn_rndis_get_mtu(sc, &mtu); if (error) mtu = ifr->ifr_mtu; else if (bootverbose) if_printf(ifp, "RNDIS mtu %u\n", mtu); /* * Commit the requested MTU, after the synthetic parts * have been successfully attached. 
*/ if (mtu >= ifr->ifr_mtu) { mtu = ifr->ifr_mtu; } else { if_printf(ifp, "fixup mtu %d -> %u\n", ifr->ifr_mtu, mtu); } ifp->if_mtu = mtu; /* * Synthetic parts' reattach may change the chimney * sending size; update it. */ if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax) hn_set_chim_size(sc, sc->hn_chim_szmax); /* * Make sure that various parameters based on MTU are * still valid, after the MTU change. */ hn_mtu_change_fixup(sc); /* * All done! Resume the interface now. */ hn_resume(sc); if ((sc->hn_flags & HN_FLAG_RXVF) || (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) { /* * Since we have reattached the NVS part, * change the datapath to VF again; in case * that it is lost, after the NVS was detached. */ hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF); } HN_UNLOCK(sc); break; case SIOCSIFFLAGS: HN_LOCK(sc); if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) { HN_UNLOCK(sc); break; } if (hn_xpnt_vf_isready(sc)) hn_xpnt_vf_saveifflags(sc); if (ifp->if_flags & IFF_UP) { if (ifp->if_drv_flags & IFF_DRV_RUNNING) { /* * Caller meight hold mutex, e.g. * bpf; use busy-wait for the RNDIS * reply. */ HN_NO_SLEEPING(sc); hn_rxfilter_config(sc); HN_SLEEPING_OK(sc); if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) error = hn_xpnt_vf_iocsetflags(sc); } else { hn_init_locked(sc); } } else { if (ifp->if_drv_flags & IFF_DRV_RUNNING) hn_stop(sc, false); } sc->hn_if_flags = ifp->if_flags; HN_UNLOCK(sc); break; case SIOCSIFCAP: HN_LOCK(sc); if (hn_xpnt_vf_isready(sc)) { ifr_vf = *ifr; strlcpy(ifr_vf.ifr_name, sc->hn_vf_ifp->if_xname, sizeof(ifr_vf.ifr_name)); error = hn_xpnt_vf_iocsetcaps(sc, &ifr_vf); HN_UNLOCK(sc); break; } /* * Fix up requested capabilities w/ supported capabilities, * since the supported capabilities could have been changed. */ mask = (ifr->ifr_reqcap & ifp->if_capabilities) ^ ifp->if_capenable; if (mask & IFCAP_TXCSUM) { ifp->if_capenable ^= IFCAP_TXCSUM; if (ifp->if_capenable & IFCAP_TXCSUM) ifp->if_hwassist |= HN_CSUM_IP_HWASSIST(sc); else ifp->if_hwassist &= ~HN_CSUM_IP_HWASSIST(sc); } if (mask & IFCAP_TXCSUM_IPV6) { ifp->if_capenable ^= IFCAP_TXCSUM_IPV6; if (ifp->if_capenable & IFCAP_TXCSUM_IPV6) ifp->if_hwassist |= HN_CSUM_IP6_HWASSIST(sc); else ifp->if_hwassist &= ~HN_CSUM_IP6_HWASSIST(sc); } /* TODO: flip RNDIS offload parameters for RXCSUM. */ if (mask & IFCAP_RXCSUM) ifp->if_capenable ^= IFCAP_RXCSUM; #ifdef foo /* We can't diff IPv6 packets from IPv4 packets on RX path. */ if (mask & IFCAP_RXCSUM_IPV6) ifp->if_capenable ^= IFCAP_RXCSUM_IPV6; #endif if (mask & IFCAP_LRO) ifp->if_capenable ^= IFCAP_LRO; if (mask & IFCAP_TSO4) { ifp->if_capenable ^= IFCAP_TSO4; if (ifp->if_capenable & IFCAP_TSO4) ifp->if_hwassist |= CSUM_IP_TSO; else ifp->if_hwassist &= ~CSUM_IP_TSO; } if (mask & IFCAP_TSO6) { ifp->if_capenable ^= IFCAP_TSO6; if (ifp->if_capenable & IFCAP_TSO6) ifp->if_hwassist |= CSUM_IP6_TSO; else ifp->if_hwassist &= ~CSUM_IP6_TSO; } HN_UNLOCK(sc); break; case SIOCADDMULTI: case SIOCDELMULTI: HN_LOCK(sc); if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) { HN_UNLOCK(sc); break; } if (ifp->if_drv_flags & IFF_DRV_RUNNING) { /* * Multicast uses mutex; use busy-wait for * the RNDIS reply. 
*/ HN_NO_SLEEPING(sc); hn_rxfilter_config(sc); HN_SLEEPING_OK(sc); } /* XXX vlan(4) style mcast addr maintenance */ if (hn_xpnt_vf_isready(sc)) { int old_if_flags; old_if_flags = sc->hn_vf_ifp->if_flags; hn_xpnt_vf_saveifflags(sc); if ((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) && ((old_if_flags ^ sc->hn_vf_ifp->if_flags) & IFF_ALLMULTI)) error = hn_xpnt_vf_iocsetflags(sc); } HN_UNLOCK(sc); break; case SIOCSIFMEDIA: case SIOCGIFMEDIA: HN_LOCK(sc); if (hn_xpnt_vf_isready(sc)) { /* * SIOCGIFMEDIA expects ifmediareq, so don't * create and pass ifr_vf to the VF here; just * replace the ifr_name. */ vf_ifp = sc->hn_vf_ifp; strlcpy(ifr->ifr_name, vf_ifp->if_xname, sizeof(ifr->ifr_name)); error = vf_ifp->if_ioctl(vf_ifp, cmd, data); /* Restore the ifr_name. */ strlcpy(ifr->ifr_name, ifp->if_xname, sizeof(ifr->ifr_name)); HN_UNLOCK(sc); break; } HN_UNLOCK(sc); error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd); break; case SIOCGIFRSSHASH: ifrh = (struct ifrsshash *)data; HN_LOCK(sc); if (sc->hn_rx_ring_inuse == 1) { HN_UNLOCK(sc); ifrh->ifrh_func = RSS_FUNC_NONE; ifrh->ifrh_types = 0; break; } if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ) ifrh->ifrh_func = RSS_FUNC_TOEPLITZ; else ifrh->ifrh_func = RSS_FUNC_PRIVATE; ifrh->ifrh_types = hn_rss_type_fromndis(sc->hn_rss_hash); HN_UNLOCK(sc); break; case SIOCGIFRSSKEY: ifrk = (struct ifrsskey *)data; HN_LOCK(sc); if (sc->hn_rx_ring_inuse == 1) { HN_UNLOCK(sc); ifrk->ifrk_func = RSS_FUNC_NONE; ifrk->ifrk_keylen = 0; break; } if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ) ifrk->ifrk_func = RSS_FUNC_TOEPLITZ; else ifrk->ifrk_func = RSS_FUNC_PRIVATE; ifrk->ifrk_keylen = NDIS_HASH_KEYSIZE_TOEPLITZ; memcpy(ifrk->ifrk_key, sc->hn_rss.rss_key, NDIS_HASH_KEYSIZE_TOEPLITZ); HN_UNLOCK(sc); break; default: error = ether_ioctl(ifp, cmd, data); break; } return (error); } static void hn_stop(struct hn_softc *sc, bool detaching) { struct ifnet *ifp = sc->hn_ifp; int i; HN_LOCK_ASSERT(sc); KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED, ("synthetic parts were not attached")); /* Clear RUNNING bit ASAP. */ atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_RUNNING); /* Disable polling. */ hn_polling(sc, 0); if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) { KASSERT(sc->hn_vf_ifp != NULL, ("%s: VF is not attached", ifp->if_xname)); /* Mark transparent mode VF as disabled. */ hn_xpnt_vf_setdisable(sc, false /* keep hn_vf_ifp */); /* * NOTE: * Datapath setting must happen _before_ bringing * the VF down. */ hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH); /* * Bring the VF down. */ hn_xpnt_vf_saveifflags(sc); sc->hn_vf_ifp->if_flags &= ~IFF_UP; hn_xpnt_vf_iocsetflags(sc); } /* Suspend data transfers. */ hn_suspend_data(sc); /* Clear OACTIVE bit. */ atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); for (i = 0; i < sc->hn_tx_ring_inuse; ++i) sc->hn_tx_ring[i].hn_oactive = 0; /* * If the non-transparent mode VF is active, make sure * that the RX filter still allows packet reception. */ if (!detaching && (sc->hn_flags & HN_FLAG_RXVF)) hn_rxfilter_config(sc); } static void hn_init_locked(struct hn_softc *sc) { struct ifnet *ifp = sc->hn_ifp; int i; HN_LOCK_ASSERT(sc); if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) return; if (ifp->if_drv_flags & IFF_DRV_RUNNING) return; /* Configure RX filter */ hn_rxfilter_config(sc); /* Clear OACTIVE bit. */ atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); for (i = 0; i < sc->hn_tx_ring_inuse; ++i) sc->hn_tx_ring[i].hn_oactive = 0; /* Clear TX 'suspended' bit. 
*/ hn_resume_tx(sc, sc->hn_tx_ring_inuse); if (hn_xpnt_vf_isready(sc)) { /* Initialize transparent VF. */ hn_xpnt_vf_init(sc); } /* Everything is ready; unleash! */ atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING); /* Re-enable polling if requested. */ if (sc->hn_pollhz > 0) hn_polling(sc, sc->hn_pollhz); } static void hn_init(void *xsc) { struct hn_softc *sc = xsc; HN_LOCK(sc); hn_init_locked(sc); HN_UNLOCK(sc); } #if __FreeBSD_version >= 1100099 static int hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; unsigned int lenlim; int error; lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim; error = sysctl_handle_int(oidp, &lenlim, 0, req); if (error || req->newptr == NULL) return error; HN_LOCK(sc); if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) || lenlim > TCP_LRO_LENGTH_MAX) { HN_UNLOCK(sc); return EINVAL; } hn_set_lro_lenlim(sc, lenlim); HN_UNLOCK(sc); return 0; } static int hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int ackcnt, error, i; /* * lro_ackcnt_lim is append count limit, * +1 to turn it into aggregation limit. */ ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1; error = sysctl_handle_int(oidp, &ackcnt, 0, req); if (error || req->newptr == NULL) return error; if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1)) return EINVAL; /* * Convert aggregation limit back to append * count limit. */ --ackcnt; HN_LOCK(sc); for (i = 0; i < sc->hn_rx_ring_cnt; ++i) sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt; HN_UNLOCK(sc); return 0; } #endif static int hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int hcsum = arg2; int on, error, i; on = 0; if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum) on = 1; error = sysctl_handle_int(oidp, &on, 0, req); if (error || req->newptr == NULL) return error; HN_LOCK(sc); for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; if (on) rxr->hn_trust_hcsum |= hcsum; else rxr->hn_trust_hcsum &= ~hcsum; } HN_UNLOCK(sc); return 0; } static int hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int chim_size, error; chim_size = sc->hn_tx_ring[0].hn_chim_size; error = sysctl_handle_int(oidp, &chim_size, 0, req); if (error || req->newptr == NULL) return error; if (chim_size > sc->hn_chim_szmax || chim_size <= 0) return EINVAL; HN_LOCK(sc); hn_set_chim_size(sc, chim_size); HN_UNLOCK(sc); return 0; } #if __FreeBSD_version < 1100095 static int hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int ofs = arg2, i, error; struct hn_rx_ring *rxr; uint64_t stat; stat = 0; for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { rxr = &sc->hn_rx_ring[i]; stat += *((int *)((uint8_t *)rxr + ofs)); } error = sysctl_handle_64(oidp, &stat, 0, req); if (error || req->newptr == NULL) return error; /* Zero out this stat. */ for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { rxr = &sc->hn_rx_ring[i]; *((int *)((uint8_t *)rxr + ofs)) = 0; } return 0; } #else static int hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int ofs = arg2, i, error; struct hn_rx_ring *rxr; uint64_t stat; stat = 0; for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { rxr = &sc->hn_rx_ring[i]; stat += *((uint64_t *)((uint8_t *)rxr + ofs)); } error = sysctl_handle_64(oidp, &stat, 0, req); if (error || req->newptr == NULL) return error; /* Zero out this stat. 
*/ for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { rxr = &sc->hn_rx_ring[i]; *((uint64_t *)((uint8_t *)rxr + ofs)) = 0; } return 0; } #endif static int hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int ofs = arg2, i, error; struct hn_rx_ring *rxr; u_long stat; stat = 0; for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { rxr = &sc->hn_rx_ring[i]; stat += *((u_long *)((uint8_t *)rxr + ofs)); } error = sysctl_handle_long(oidp, &stat, 0, req); if (error || req->newptr == NULL) return error; /* Zero out this stat. */ for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { rxr = &sc->hn_rx_ring[i]; *((u_long *)((uint8_t *)rxr + ofs)) = 0; } return 0; } static int hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int ofs = arg2, i, error; struct hn_tx_ring *txr; u_long stat; stat = 0; for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { txr = &sc->hn_tx_ring[i]; stat += *((u_long *)((uint8_t *)txr + ofs)); } error = sysctl_handle_long(oidp, &stat, 0, req); if (error || req->newptr == NULL) return error; /* Zero out this stat. */ for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { txr = &sc->hn_tx_ring[i]; *((u_long *)((uint8_t *)txr + ofs)) = 0; } return 0; } static int hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int ofs = arg2, i, error, conf; struct hn_tx_ring *txr; txr = &sc->hn_tx_ring[0]; conf = *((int *)((uint8_t *)txr + ofs)); error = sysctl_handle_int(oidp, &conf, 0, req); if (error || req->newptr == NULL) return error; HN_LOCK(sc); for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { txr = &sc->hn_tx_ring[i]; *((int *)((uint8_t *)txr + ofs)) = conf; } HN_UNLOCK(sc); return 0; } static int hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int error, size; size = sc->hn_agg_size; error = sysctl_handle_int(oidp, &size, 0, req); if (error || req->newptr == NULL) return (error); HN_LOCK(sc); sc->hn_agg_size = size; hn_set_txagg(sc); HN_UNLOCK(sc); return (0); } static int hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int error, pkts; pkts = sc->hn_agg_pkts; error = sysctl_handle_int(oidp, &pkts, 0, req); if (error || req->newptr == NULL) return (error); HN_LOCK(sc); sc->hn_agg_pkts = pkts; hn_set_txagg(sc); HN_UNLOCK(sc); return (0); } static int hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int pkts; pkts = sc->hn_tx_ring[0].hn_agg_pktmax; return (sysctl_handle_int(oidp, &pkts, 0, req)); } static int hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int align; align = sc->hn_tx_ring[0].hn_agg_align; return (sysctl_handle_int(oidp, &align, 0, req)); } static void hn_chan_polling(struct vmbus_channel *chan, u_int pollhz) { if (pollhz == 0) vmbus_chan_poll_disable(chan); else vmbus_chan_poll_enable(chan, pollhz); } static void hn_polling(struct hn_softc *sc, u_int pollhz) { int nsubch = sc->hn_rx_ring_inuse - 1; HN_LOCK_ASSERT(sc); if (nsubch > 0) { struct vmbus_channel **subch; int i; subch = vmbus_subchan_get(sc->hn_prichan, nsubch); for (i = 0; i < nsubch; ++i) hn_chan_polling(subch[i], pollhz); vmbus_subchan_rel(subch, nsubch); } hn_chan_polling(sc->hn_prichan, pollhz); } static int hn_polling_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int pollhz, error; pollhz = sc->hn_pollhz; error = sysctl_handle_int(oidp, &pollhz, 0, req); if (error || req->newptr == NULL) return (error); if (pollhz != 0 && (pollhz < VMBUS_CHAN_POLLHZ_MIN || pollhz > VMBUS_CHAN_POLLHZ_MAX)) return (EINVAL); HN_LOCK(sc); if (sc->hn_pollhz != pollhz) { sc->hn_pollhz = pollhz; if 
((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) && (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)) hn_polling(sc, sc->hn_pollhz); } HN_UNLOCK(sc); return (0); } static int hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; char verstr[16]; snprintf(verstr, sizeof(verstr), "%u.%u", HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver), HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver)); return sysctl_handle_string(oidp, verstr, sizeof(verstr), req); } static int hn_caps_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; char caps_str[128]; uint32_t caps; HN_LOCK(sc); caps = sc->hn_caps; HN_UNLOCK(sc); snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS); return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req); } static int hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; char assist_str[128]; uint32_t hwassist; HN_LOCK(sc); hwassist = sc->hn_ifp->if_hwassist; HN_UNLOCK(sc); snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS); return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req); } static int hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; char filter_str[128]; uint32_t filter; HN_LOCK(sc); filter = sc->hn_rx_filter; HN_UNLOCK(sc); snprintf(filter_str, sizeof(filter_str), "%b", filter, NDIS_PACKET_TYPES); return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req); } #ifndef RSS static int hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int error; HN_LOCK(sc); error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key)); if (error || req->newptr == NULL) goto back; if ((sc->hn_flags & HN_FLAG_RXVF) || (hn_xpnt_vf && sc->hn_vf_ifp != NULL)) { /* * RSS key is synchronized w/ VF's, don't allow users * to change it. */ error = EBUSY; goto back; } error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key)); if (error) goto back; sc->hn_flags |= HN_FLAG_HAS_RSSKEY; if (sc->hn_rx_ring_inuse > 1) { error = hn_rss_reconfig(sc); } else { /* Not RSS capable, at least for now; just save the RSS key. */ error = 0; } back: HN_UNLOCK(sc); return (error); } static int hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int error; HN_LOCK(sc); error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind)); if (error || req->newptr == NULL) goto back; /* * Don't allow RSS indirect table change, if this interface is not * RSS capable currently. 
*/ if (sc->hn_rx_ring_inuse == 1) { error = EOPNOTSUPP; goto back; } error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind)); if (error) goto back; sc->hn_flags |= HN_FLAG_HAS_RSSIND; hn_rss_ind_fixup(sc); error = hn_rss_reconfig(sc); back: HN_UNLOCK(sc); return (error); } #endif /* !RSS */ static int hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; char hash_str[128]; uint32_t hash; HN_LOCK(sc); hash = sc->hn_rss_hash; HN_UNLOCK(sc); snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS); return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req); } static int hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; char hash_str[128]; uint32_t hash; HN_LOCK(sc); hash = sc->hn_rss_hcap; HN_UNLOCK(sc); snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS); return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req); } static int hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; char hash_str[128]; uint32_t hash; HN_LOCK(sc); hash = sc->hn_rx_ring[0].hn_mbuf_hash; HN_UNLOCK(sc); snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS); return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req); } static int hn_vf_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; char vf_name[IFNAMSIZ + 1]; struct ifnet *vf_ifp; HN_LOCK(sc); vf_name[0] = '\0'; vf_ifp = sc->hn_vf_ifp; if (vf_ifp != NULL) snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname); HN_UNLOCK(sc); return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req); } static int hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; char vf_name[IFNAMSIZ + 1]; struct ifnet *vf_ifp; HN_LOCK(sc); vf_name[0] = '\0'; vf_ifp = sc->hn_rx_ring[0].hn_rxvf_ifp; if (vf_ifp != NULL) snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname); HN_UNLOCK(sc); return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req); } static int hn_vflist_sysctl(SYSCTL_HANDLER_ARGS) { struct rm_priotracker pt; struct sbuf *sb; int error, i; bool first; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sb = sbuf_new_for_sysctl(NULL, NULL, 128, req); if (sb == NULL) return (ENOMEM); rm_rlock(&hn_vfmap_lock, &pt); first = true; for (i = 0; i < hn_vfmap_size; ++i) { struct ifnet *ifp; if (hn_vfmap[i] == NULL) continue; ifp = ifnet_byindex(i); if (ifp != NULL) { if (first) sbuf_printf(sb, "%s", ifp->if_xname); else sbuf_printf(sb, " %s", ifp->if_xname); first = false; } } rm_runlock(&hn_vfmap_lock, &pt); error = sbuf_finish(sb); sbuf_delete(sb); return (error); } static int hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS) { struct rm_priotracker pt; struct sbuf *sb; int error, i; bool first; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sb = sbuf_new_for_sysctl(NULL, NULL, 128, req); if (sb == NULL) return (ENOMEM); rm_rlock(&hn_vfmap_lock, &pt); first = true; for (i = 0; i < hn_vfmap_size; ++i) { struct ifnet *ifp, *hn_ifp; hn_ifp = hn_vfmap[i]; if (hn_ifp == NULL) continue; ifp = ifnet_byindex(i); if (ifp != NULL) { if (first) { sbuf_printf(sb, "%s:%s", ifp->if_xname, hn_ifp->if_xname); } else { sbuf_printf(sb, " %s:%s", ifp->if_xname, hn_ifp->if_xname); } first = false; } } rm_runlock(&hn_vfmap_lock, &pt); error = sbuf_finish(sb); sbuf_delete(sb); return (error); } static int hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int error, onoff = 0; if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF) onoff = 1; error = sysctl_handle_int(oidp, &onoff, 0, req); if (error || 
req->newptr == NULL) return (error); HN_LOCK(sc); /* NOTE: hn_vf_lock for hn_transmit() */ rm_wlock(&sc->hn_vf_lock); if (onoff) sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF; else sc->hn_xvf_flags &= ~HN_XVFFLAG_ACCBPF; rm_wunlock(&sc->hn_vf_lock); HN_UNLOCK(sc); return (0); } static int hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int enabled = 0; if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) enabled = 1; return (sysctl_handle_int(oidp, &enabled, 0, req)); } static int hn_check_iplen(const struct mbuf *m, int hoff) { const struct ip *ip; int len, iphlen, iplen; const struct tcphdr *th; int thoff; /* TCP data offset */ len = hoff + sizeof(struct ip); /* The packet must be at least the size of an IP header. */ if (m->m_pkthdr.len < len) return IPPROTO_DONE; /* The fixed IP header must reside completely in the first mbuf. */ if (m->m_len < len) return IPPROTO_DONE; ip = mtodo(m, hoff); /* Bound check the packet's stated IP header length. */ iphlen = ip->ip_hl << 2; if (iphlen < sizeof(struct ip)) /* minimum header length */ return IPPROTO_DONE; /* The full IP header must reside completely in the one mbuf. */ if (m->m_len < hoff + iphlen) return IPPROTO_DONE; iplen = ntohs(ip->ip_len); /* * Check that the amount of data in the buffers is as * at least much as the IP header would have us expect. */ if (m->m_pkthdr.len < hoff + iplen) return IPPROTO_DONE; /* * Ignore IP fragments. */ if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF)) return IPPROTO_DONE; /* * The TCP/IP or UDP/IP header must be entirely contained within * the first fragment of a packet. */ switch (ip->ip_p) { case IPPROTO_TCP: if (iplen < iphlen + sizeof(struct tcphdr)) return IPPROTO_DONE; if (m->m_len < hoff + iphlen + sizeof(struct tcphdr)) return IPPROTO_DONE; th = (const struct tcphdr *)((const uint8_t *)ip + iphlen); thoff = th->th_off << 2; if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen) return IPPROTO_DONE; if (m->m_len < hoff + iphlen + thoff) return IPPROTO_DONE; break; case IPPROTO_UDP: if (iplen < iphlen + sizeof(struct udphdr)) return IPPROTO_DONE; if (m->m_len < hoff + iphlen + sizeof(struct udphdr)) return IPPROTO_DONE; break; default: if (iplen < iphlen) return IPPROTO_DONE; break; } return ip->ip_p; } static void hn_rxpkt_proto(const struct mbuf *m_new, int *l3proto, int *l4proto) { const struct ether_header *eh; uint16_t etype; int hoff; hoff = sizeof(*eh); /* Checked at the beginning of this function. */ KASSERT(m_new->m_len >= hoff, ("not ethernet frame")); eh = mtod(m_new, const struct ether_header *); etype = ntohs(eh->ether_type); if (etype == ETHERTYPE_VLAN) { const struct ether_vlan_header *evl; hoff = sizeof(*evl); if (m_new->m_len < hoff) return; evl = mtod(m_new, const struct ether_vlan_header *); etype = ntohs(evl->evl_proto); } *l3proto = etype; if (etype == ETHERTYPE_IP) *l4proto = hn_check_iplen(m_new, hoff); else *l4proto = IPPROTO_DONE; } static int hn_create_rx_data(struct hn_softc *sc, int ring_cnt) { struct sysctl_oid_list *child; struct sysctl_ctx_list *ctx; device_t dev = sc->hn_dev; #if defined(INET) || defined(INET6) #if __FreeBSD_version >= 1100095 int lroent_cnt; #endif #endif int i; /* * Create RXBUF for reception. * * NOTE: * - It is shared by all channels. * - A large enough buffer is allocated, certain version of NVSes * may further limit the usable space. 
*/ sc->hn_rxbuf = hyperv_dmamem_alloc(bus_get_dma_tag(dev), PAGE_SIZE, 0, HN_RXBUF_SIZE, &sc->hn_rxbuf_dma, BUS_DMA_WAITOK | BUS_DMA_ZERO); if (sc->hn_rxbuf == NULL) { device_printf(sc->hn_dev, "allocate rxbuf failed\n"); return (ENOMEM); } sc->hn_rx_ring_cnt = ring_cnt; sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt; sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt, M_DEVBUF, M_WAITOK | M_ZERO); #if defined(INET) || defined(INET6) #if __FreeBSD_version >= 1100095 lroent_cnt = hn_lro_entry_count; if (lroent_cnt < TCP_LRO_ENTRIES) lroent_cnt = TCP_LRO_ENTRIES; if (bootverbose) device_printf(dev, "LRO: entry count %d\n", lroent_cnt); #endif #endif /* INET || INET6 */ ctx = device_get_sysctl_ctx(dev); child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); /* Create dev.hn.UNIT.rx sysctl tree */ sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx", CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; rxr->hn_br = hyperv_dmamem_alloc(bus_get_dma_tag(dev), PAGE_SIZE, 0, HN_TXBR_SIZE + HN_RXBR_SIZE, &rxr->hn_br_dma, BUS_DMA_WAITOK); if (rxr->hn_br == NULL) { device_printf(dev, "allocate bufring failed\n"); return (ENOMEM); } if (hn_trust_hosttcp) rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP; if (hn_trust_hostudp) rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP; if (hn_trust_hostip) rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP; rxr->hn_mbuf_hash = NDIS_HASH_ALL; rxr->hn_ifp = sc->hn_ifp; if (i < sc->hn_tx_ring_cnt) rxr->hn_txr = &sc->hn_tx_ring[i]; rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF; rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK); rxr->hn_rx_idx = i; rxr->hn_rxbuf = sc->hn_rxbuf; /* * Initialize LRO. */ #if defined(INET) || defined(INET6) #if __FreeBSD_version >= 1100095 tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt, hn_lro_mbufq_depth); #else tcp_lro_init(&rxr->hn_lro); rxr->hn_lro.ifp = sc->hn_ifp; #endif #if __FreeBSD_version >= 1100099 rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF; rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF; #endif #endif /* INET || INET6 */ if (sc->hn_rx_sysctl_tree != NULL) { char name[16]; /* * Create per RX ring sysctl tree: * dev.hn.UNIT.rx.RINGID */ snprintf(name, sizeof(name), "%d", i); rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree), OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); if (rxr->hn_rx_sysctl_tree != NULL) { SYSCTL_ADD_ULONG(ctx, SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), OID_AUTO, "packets", CTLFLAG_RW, &rxr->hn_pkts, "# of packets received"); SYSCTL_ADD_ULONG(ctx, SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), OID_AUTO, "rss_pkts", CTLFLAG_RW, &rxr->hn_rss_pkts, "# of packets w/ RSS info received"); SYSCTL_ADD_INT(ctx, SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), OID_AUTO, "pktbuf_len", CTLFLAG_RD, &rxr->hn_pktbuf_len, 0, "Temporary channel packet buffer length"); } } } SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued", CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_rx_ring, hn_lro.lro_queued), #if __FreeBSD_version < 1100095 hn_rx_stat_int_sysctl, #else hn_rx_stat_u64_sysctl, #endif "LU", "LRO queued"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed", CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_rx_ring, hn_lro.lro_flushed), #if __FreeBSD_version < 1100095 hn_rx_stat_int_sysctl, #else hn_rx_stat_u64_sysctl, #endif "LU", "LRO flushed"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried", CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct 
hn_rx_ring, hn_lro_tried), hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries"); #if __FreeBSD_version >= 1100099 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim", CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, hn_lro_lenlim_sysctl, "IU", "Max # of data bytes to be aggregated by LRO"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, hn_lro_ackcnt_sysctl, "I", "Max # of ACKs to be aggregated by LRO"); #endif SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP, hn_trust_hcsum_sysctl, "I", "Trust tcp segement verification on host side, " "when csum info is missing"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP, hn_trust_hcsum_sysctl, "I", "Trust udp datagram verification on host side, " "when csum info is missing"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP, hn_trust_hcsum_sysctl, "I", "Trust ip packet verification on host side, " "when csum info is missing"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip", CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_rx_ring, hn_csum_ip), hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp", CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_rx_ring, hn_csum_tcp), hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp", CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_rx_ring, hn_csum_udp), hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted", CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_rx_ring, hn_csum_trusted), hn_rx_stat_ulong_sysctl, "LU", "# of packets that we trust host's csum verification"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts", CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_rx_ring, hn_small_pkts), hn_rx_stat_ulong_sysctl, "LU", "# of small packets received"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed", CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_rx_ring, hn_ack_failed), hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures"); SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt", CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings"); SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse", CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings"); return (0); } static void hn_destroy_rx_data(struct hn_softc *sc) { int i; if (sc->hn_rxbuf != NULL) { if ((sc->hn_flags & HN_FLAG_RXBUF_REF) == 0) hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf); else device_printf(sc->hn_dev, "RXBUF is referenced\n"); sc->hn_rxbuf = NULL; } if (sc->hn_rx_ring_cnt == 0) return; for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; if (rxr->hn_br == NULL) continue; if ((rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) == 0) { hyperv_dmamem_free(&rxr->hn_br_dma, rxr->hn_br); } else { device_printf(sc->hn_dev, "%dth channel bufring is referenced", i); } rxr->hn_br = NULL; #if defined(INET) || defined(INET6) tcp_lro_free(&rxr->hn_lro); #endif free(rxr->hn_pktbuf, M_DEVBUF); } free(sc->hn_rx_ring, M_DEVBUF); sc->hn_rx_ring = NULL; sc->hn_rx_ring_cnt = 0; sc->hn_rx_ring_inuse = 0; } static int hn_tx_ring_create(struct hn_softc *sc, int id) { struct hn_tx_ring *txr = 
&sc->hn_tx_ring[id]; device_t dev = sc->hn_dev; bus_dma_tag_t parent_dtag; int error, i; txr->hn_sc = sc; txr->hn_tx_idx = id; #ifndef HN_USE_TXDESC_BUFRING mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN); #endif mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF); txr->hn_txdesc_cnt = HN_TX_DESC_CNT; txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt, M_DEVBUF, M_WAITOK | M_ZERO); #ifndef HN_USE_TXDESC_BUFRING SLIST_INIT(&txr->hn_txlist); #else txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF, M_WAITOK, &txr->hn_tx_lock); #endif if (hn_tx_taskq_mode == HN_TX_TASKQ_M_EVTTQ) { txr->hn_tx_taskq = VMBUS_GET_EVENT_TASKQ( device_get_parent(dev), dev, HN_RING_IDX2CPU(sc, id)); } else { txr->hn_tx_taskq = sc->hn_tx_taskqs[id % hn_tx_taskq_cnt]; } #ifdef HN_IFSTART_SUPPORT if (hn_use_if_start) { txr->hn_txeof = hn_start_txeof; TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr); TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr); } else #endif { int br_depth; txr->hn_txeof = hn_xmit_txeof; TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr); TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr); br_depth = hn_get_txswq_depth(txr); txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF, M_WAITOK, &txr->hn_tx_lock); } txr->hn_direct_tx_size = hn_direct_tx_size; /* * Always schedule transmission instead of trying to do direct * transmission. This one gives the best performance so far. */ txr->hn_sched_tx = 1; parent_dtag = bus_get_dma_tag(dev); /* DMA tag for RNDIS packet messages. */ error = bus_dma_tag_create(parent_dtag, /* parent */ HN_RNDIS_PKT_ALIGN, /* alignment */ HN_RNDIS_PKT_BOUNDARY, /* boundary */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ HN_RNDIS_PKT_LEN, /* maxsize */ 1, /* nsegments */ HN_RNDIS_PKT_LEN, /* maxsegsize */ 0, /* flags */ NULL, /* lockfunc */ NULL, /* lockfuncarg */ &txr->hn_tx_rndis_dtag); if (error) { device_printf(dev, "failed to create rndis dmatag\n"); return error; } /* DMA tag for data. */ error = bus_dma_tag_create(parent_dtag, /* parent */ 1, /* alignment */ HN_TX_DATA_BOUNDARY, /* boundary */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ HN_TX_DATA_MAXSIZE, /* maxsize */ HN_TX_DATA_SEGCNT_MAX, /* nsegments */ HN_TX_DATA_SEGSIZE, /* maxsegsize */ 0, /* flags */ NULL, /* lockfunc */ NULL, /* lockfuncarg */ &txr->hn_tx_data_dtag); if (error) { device_printf(dev, "failed to create data dmatag\n"); return error; } for (i = 0; i < txr->hn_txdesc_cnt; ++i) { struct hn_txdesc *txd = &txr->hn_txdesc[i]; txd->txr = txr; txd->chim_index = HN_NVS_CHIM_IDX_INVALID; STAILQ_INIT(&txd->agg_list); /* * Allocate and load RNDIS packet message. */ error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag, (void **)&txd->rndis_pkt, BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO, &txd->rndis_pkt_dmap); if (error) { device_printf(dev, "failed to allocate rndis_packet_msg, %d\n", i); return error; } error = bus_dmamap_load(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap, txd->rndis_pkt, HN_RNDIS_PKT_LEN, hyperv_dma_map_paddr, &txd->rndis_pkt_paddr, BUS_DMA_NOWAIT); if (error) { device_printf(dev, "failed to load rndis_packet_msg, %d\n", i); bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt, txd->rndis_pkt_dmap); return error; } /* DMA map for TX data. 
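 *
 * Each txdesc ends up with two DMA resources: the coherent, preloaded
 * RNDIS packet message allocated above (its bus address becomes
 * hn_gpa[0] in hn_encap()) and the data map created below, used to
 * load the mbuf chain into the remaining gather entries.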
*/ error = bus_dmamap_create(txr->hn_tx_data_dtag, 0, &txd->data_dmap); if (error) { device_printf(dev, "failed to allocate tx data dmamap\n"); bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap); bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt, txd->rndis_pkt_dmap); return error; } /* All set, put it to list */ txd->flags |= HN_TXD_FLAG_ONLIST; #ifndef HN_USE_TXDESC_BUFRING SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link); #else buf_ring_enqueue(txr->hn_txdesc_br, txd); #endif } txr->hn_txdesc_avail = txr->hn_txdesc_cnt; if (sc->hn_tx_sysctl_tree != NULL) { struct sysctl_oid_list *child; struct sysctl_ctx_list *ctx; char name[16]; /* * Create per TX ring sysctl tree: * dev.hn.UNIT.tx.RINGID */ ctx = device_get_sysctl_ctx(dev); child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree); snprintf(name, sizeof(name), "%d", id); txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); if (txr->hn_tx_sysctl_tree != NULL) { child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree); #ifdef HN_DEBUG SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail", CTLFLAG_RD, &txr->hn_txdesc_avail, 0, "# of available TX descs"); #endif #ifdef HN_IFSTART_SUPPORT if (!hn_use_if_start) #endif { SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive", CTLFLAG_RD, &txr->hn_oactive, 0, "over active"); } SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets", CTLFLAG_RW, &txr->hn_pkts, "# of packets transmitted"); SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends", CTLFLAG_RW, &txr->hn_sends, "# of sends"); } } return 0; } static void hn_txdesc_dmamap_destroy(struct hn_txdesc *txd) { struct hn_tx_ring *txr = txd->txr; KASSERT(txd->m == NULL, ("still has mbuf installed")); KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped")); bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap); bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt, txd->rndis_pkt_dmap); bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap); } static void hn_txdesc_gc(struct hn_tx_ring *txr, struct hn_txdesc *txd) { KASSERT(txd->refs == 0 || txd->refs == 1, ("invalid txd refs %d", txd->refs)); /* Aggregated txds will be freed by their aggregating txd. */ if (txd->refs > 0 && (txd->flags & HN_TXD_FLAG_ONAGG) == 0) { int freed; freed = hn_txdesc_put(txr, txd); KASSERT(freed, ("can't free txdesc")); } } static void hn_tx_ring_destroy(struct hn_tx_ring *txr) { int i; if (txr->hn_txdesc == NULL) return; /* * NOTE: * Because the freeing of aggregated txds will be deferred * to the aggregating txd, two passes are used here: * - The first pass GCes any pending txds. This GC is necessary, * since if the channels are revoked, hypervisor will not * deliver send-done for all pending txds. * - The second pass frees the busdma stuffs, i.e. after all txds * were freed. 
*/ for (i = 0; i < txr->hn_txdesc_cnt; ++i) hn_txdesc_gc(txr, &txr->hn_txdesc[i]); for (i = 0; i < txr->hn_txdesc_cnt; ++i) hn_txdesc_dmamap_destroy(&txr->hn_txdesc[i]); if (txr->hn_tx_data_dtag != NULL) bus_dma_tag_destroy(txr->hn_tx_data_dtag); if (txr->hn_tx_rndis_dtag != NULL) bus_dma_tag_destroy(txr->hn_tx_rndis_dtag); #ifdef HN_USE_TXDESC_BUFRING buf_ring_free(txr->hn_txdesc_br, M_DEVBUF); #endif free(txr->hn_txdesc, M_DEVBUF); txr->hn_txdesc = NULL; if (txr->hn_mbuf_br != NULL) buf_ring_free(txr->hn_mbuf_br, M_DEVBUF); #ifndef HN_USE_TXDESC_BUFRING mtx_destroy(&txr->hn_txlist_spin); #endif mtx_destroy(&txr->hn_tx_lock); } static int hn_create_tx_data(struct hn_softc *sc, int ring_cnt) { struct sysctl_oid_list *child; struct sysctl_ctx_list *ctx; int i; /* * Create TXBUF for chimney sending. * * NOTE: It is shared by all channels. */ sc->hn_chim = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev), PAGE_SIZE, 0, HN_CHIM_SIZE, &sc->hn_chim_dma, BUS_DMA_WAITOK | BUS_DMA_ZERO); if (sc->hn_chim == NULL) { device_printf(sc->hn_dev, "allocate txbuf failed\n"); return (ENOMEM); } sc->hn_tx_ring_cnt = ring_cnt; sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt; sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt, M_DEVBUF, M_WAITOK | M_ZERO); ctx = device_get_sysctl_ctx(sc->hn_dev); child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev)); /* Create dev.hn.UNIT.tx sysctl tree */ sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx", CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { int error; error = hn_tx_ring_create(sc, i); if (error) return error; } SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs", CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_tx_ring, hn_no_txdescs), hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed", CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_tx_ring, hn_send_failed), hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failure"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed", CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_tx_ring, hn_txdma_failed), hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failure"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed", CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_tx_ring, hn_flush_failed), hn_tx_stat_ulong_sysctl, "LU", "# of packet transmission aggregation flush failure"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed", CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_tx_ring, hn_tx_collapsed), hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney", CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_tx_ring, hn_tx_chimney), hn_tx_stat_ulong_sysctl, "LU", "# of chimney send"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried", CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_tx_ring, hn_tx_chimney_tried), hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries"); SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt", CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0, "# of total TX descs"); SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max", CTLFLAG_RD, &sc->hn_chim_szmax, 0, "Chimney send packet size upper boundary"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, hn_chim_size_sysctl, "I", "Chimney send packet size limit"); 
SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_tx_ring, hn_direct_tx_size), hn_tx_conf_int_sysctl, "I", "Size of the packet for direct transmission"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_tx_ring, hn_sched_tx), hn_tx_conf_int_sysctl, "I", "Always schedule transmission " "instead of doing direct transmission"); SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt", CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings"); SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse", CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings"); SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax", CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0, "Applied packet transmission aggregation size"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax", CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, hn_txagg_pktmax_sysctl, "I", "Applied packet transmission aggregation packets"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align", CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, hn_txagg_align_sysctl, "I", "Applied packet transmission aggregation alignment"); return 0; } static void hn_set_chim_size(struct hn_softc *sc, int chim_size) { int i; for (i = 0; i < sc->hn_tx_ring_cnt; ++i) sc->hn_tx_ring[i].hn_chim_size = chim_size; } static void hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu) { struct ifnet *ifp = sc->hn_ifp; u_int hw_tsomax; int tso_minlen; HN_LOCK_ASSERT(sc); if ((ifp->if_capabilities & (IFCAP_TSO4 | IFCAP_TSO6)) == 0) return; KASSERT(sc->hn_ndis_tso_sgmin >= 2, ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin)); tso_minlen = sc->hn_ndis_tso_sgmin * mtu; KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen && sc->hn_ndis_tso_szmax <= IP_MAXPACKET, ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax)); if (tso_maxlen < tso_minlen) tso_maxlen = tso_minlen; else if (tso_maxlen > IP_MAXPACKET) tso_maxlen = IP_MAXPACKET; if (tso_maxlen > sc->hn_ndis_tso_szmax) tso_maxlen = sc->hn_ndis_tso_szmax; hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN); if (hn_xpnt_vf_isready(sc)) { if (hw_tsomax > sc->hn_vf_ifp->if_hw_tsomax) hw_tsomax = sc->hn_vf_ifp->if_hw_tsomax; } ifp->if_hw_tsomax = hw_tsomax; if (bootverbose) if_printf(ifp, "TSO size max %u\n", ifp->if_hw_tsomax); } static void hn_fixup_tx_data(struct hn_softc *sc) { uint64_t csum_assist; int i; hn_set_chim_size(sc, sc->hn_chim_szmax); if (hn_tx_chimney_size > 0 && hn_tx_chimney_size < sc->hn_chim_szmax) hn_set_chim_size(sc, hn_tx_chimney_size); csum_assist = 0; if (sc->hn_caps & HN_CAP_IPCS) csum_assist |= CSUM_IP; if (sc->hn_caps & HN_CAP_TCP4CS) csum_assist |= CSUM_IP_TCP; if ((sc->hn_caps & HN_CAP_UDP4CS) && hn_enable_udp4cs) csum_assist |= CSUM_IP_UDP; if (sc->hn_caps & HN_CAP_TCP6CS) csum_assist |= CSUM_IP6_TCP; if ((sc->hn_caps & HN_CAP_UDP6CS) && hn_enable_udp6cs) csum_assist |= CSUM_IP6_UDP; for (i = 0; i < sc->hn_tx_ring_cnt; ++i) sc->hn_tx_ring[i].hn_csum_assist = csum_assist; if (sc->hn_caps & HN_CAP_HASHVAL) { /* * Support HASHVAL pktinfo on TX path. 
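 *
 * With HN_TX_FLAG_HASHVAL set, hn_encap() appends a HASHVAL pktinfo
 * carrying the TX ring index, so the host delivers the send-done
 * event on this ring's channel (see the pktinfo setup in hn_encap()).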
*/ if (bootverbose) if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n"); for (i = 0; i < sc->hn_tx_ring_cnt; ++i) sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL; } } static void hn_fixup_rx_data(struct hn_softc *sc) { if (sc->hn_caps & HN_CAP_UDPHASH) { int i; for (i = 0; i < sc->hn_rx_ring_cnt; ++i) sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_UDP_HASH; } } static void hn_destroy_tx_data(struct hn_softc *sc) { int i; if (sc->hn_chim != NULL) { if ((sc->hn_flags & HN_FLAG_CHIM_REF) == 0) { hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim); } else { device_printf(sc->hn_dev, "chimney sending buffer is referenced"); } sc->hn_chim = NULL; } if (sc->hn_tx_ring_cnt == 0) return; for (i = 0; i < sc->hn_tx_ring_cnt; ++i) hn_tx_ring_destroy(&sc->hn_tx_ring[i]); free(sc->hn_tx_ring, M_DEVBUF); sc->hn_tx_ring = NULL; sc->hn_tx_ring_cnt = 0; sc->hn_tx_ring_inuse = 0; } #ifdef HN_IFSTART_SUPPORT static void hn_start_taskfunc(void *xtxr, int pending __unused) { struct hn_tx_ring *txr = xtxr; mtx_lock(&txr->hn_tx_lock); hn_start_locked(txr, 0); mtx_unlock(&txr->hn_tx_lock); } static int hn_start_locked(struct hn_tx_ring *txr, int len) { struct hn_softc *sc = txr->hn_sc; struct ifnet *ifp = sc->hn_ifp; int sched = 0; KASSERT(hn_use_if_start, ("hn_start_locked is called, when if_start is disabled")); KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring")); mtx_assert(&txr->hn_tx_lock, MA_OWNED); KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc")); if (__predict_false(txr->hn_suspended)) return (0); if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) != IFF_DRV_RUNNING) return (0); while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) { struct hn_txdesc *txd; struct mbuf *m_head; int error; IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head); if (m_head == NULL) break; if (len > 0 && m_head->m_pkthdr.len > len) { /* * This sending could be time consuming; let callers * dispatch this packet sending (and sending of any * following up packets) to tx taskqueue. 
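 *
 * Packets larger than the 'len' threshold (hn_direct_tx_size when
 * called from hn_start()) are prepended back onto if_snd and sched
 * is returned non-zero, so the caller enqueues hn_tx_task and the
 * rest of the queue drains from the taskqueue.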
*/ IFQ_DRV_PREPEND(&ifp->if_snd, m_head); sched = 1; break; } #if defined(INET6) || defined(INET) if (m_head->m_pkthdr.csum_flags & CSUM_TSO) { m_head = hn_tso_fixup(m_head); if (__predict_false(m_head == NULL)) { if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); continue; } } else if (m_head->m_pkthdr.csum_flags & (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) { m_head = hn_set_hlen(m_head); if (__predict_false(m_head == NULL)) { if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); continue; } } #endif txd = hn_txdesc_get(txr); if (txd == NULL) { txr->hn_no_txdescs++; IFQ_DRV_PREPEND(&ifp->if_snd, m_head); atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); break; } error = hn_encap(ifp, txr, txd, &m_head); if (error) { /* Both txd and m_head are freed */ KASSERT(txr->hn_agg_txd == NULL, ("encap failed w/ pending aggregating txdesc")); continue; } if (txr->hn_agg_pktleft == 0) { if (txr->hn_agg_txd != NULL) { KASSERT(m_head == NULL, ("pending mbuf for aggregating txdesc")); error = hn_flush_txagg(ifp, txr); if (__predict_false(error)) { atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); break; } } else { KASSERT(m_head != NULL, ("mbuf was freed")); error = hn_txpkt(ifp, txr, txd); if (__predict_false(error)) { /* txd is freed, but m_head is not */ IFQ_DRV_PREPEND(&ifp->if_snd, m_head); atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); break; } } } #ifdef INVARIANTS else { KASSERT(txr->hn_agg_txd != NULL, ("no aggregating txdesc")); KASSERT(m_head == NULL, ("pending mbuf for aggregating txdesc")); } #endif } /* Flush pending aggerated transmission. */ if (txr->hn_agg_txd != NULL) hn_flush_txagg(ifp, txr); return (sched); } static void hn_start(struct ifnet *ifp) { struct hn_softc *sc = ifp->if_softc; struct hn_tx_ring *txr = &sc->hn_tx_ring[0]; if (txr->hn_sched_tx) goto do_sched; if (mtx_trylock(&txr->hn_tx_lock)) { int sched; sched = hn_start_locked(txr, txr->hn_direct_tx_size); mtx_unlock(&txr->hn_tx_lock); if (!sched) return; } do_sched: taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task); } static void hn_start_txeof_taskfunc(void *xtxr, int pending __unused) { struct hn_tx_ring *txr = xtxr; mtx_lock(&txr->hn_tx_lock); atomic_clear_int(&txr->hn_sc->hn_ifp->if_drv_flags, IFF_DRV_OACTIVE); hn_start_locked(txr, 0); mtx_unlock(&txr->hn_tx_lock); } static void hn_start_txeof(struct hn_tx_ring *txr) { struct hn_softc *sc = txr->hn_sc; struct ifnet *ifp = sc->hn_ifp; KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring")); if (txr->hn_sched_tx) goto do_sched; if (mtx_trylock(&txr->hn_tx_lock)) { int sched; atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); sched = hn_start_locked(txr, txr->hn_direct_tx_size); mtx_unlock(&txr->hn_tx_lock); if (sched) { taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task); } } else { do_sched: /* * Release the OACTIVE earlier, with the hope, that * others could catch up. The task will clear the * flag again with the hn_tx_lock to avoid possible * races. 
*/ atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task); } } #endif /* HN_IFSTART_SUPPORT */ static int hn_xmit(struct hn_tx_ring *txr, int len) { struct hn_softc *sc = txr->hn_sc; struct ifnet *ifp = sc->hn_ifp; struct mbuf *m_head; int sched = 0; mtx_assert(&txr->hn_tx_lock, MA_OWNED); #ifdef HN_IFSTART_SUPPORT KASSERT(hn_use_if_start == 0, ("hn_xmit is called, when if_start is enabled")); #endif KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc")); if (__predict_false(txr->hn_suspended)) return (0); if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive) return (0); while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) { struct hn_txdesc *txd; int error; if (len > 0 && m_head->m_pkthdr.len > len) { /* * This sending could be time consuming; let callers * dispatch this packet sending (and sending of any * following up packets) to tx taskqueue. */ drbr_putback(ifp, txr->hn_mbuf_br, m_head); sched = 1; break; } txd = hn_txdesc_get(txr); if (txd == NULL) { txr->hn_no_txdescs++; drbr_putback(ifp, txr->hn_mbuf_br, m_head); txr->hn_oactive = 1; break; } error = hn_encap(ifp, txr, txd, &m_head); if (error) { /* Both txd and m_head are freed; discard */ KASSERT(txr->hn_agg_txd == NULL, ("encap failed w/ pending aggregating txdesc")); drbr_advance(ifp, txr->hn_mbuf_br); continue; } if (txr->hn_agg_pktleft == 0) { if (txr->hn_agg_txd != NULL) { KASSERT(m_head == NULL, ("pending mbuf for aggregating txdesc")); error = hn_flush_txagg(ifp, txr); if (__predict_false(error)) { txr->hn_oactive = 1; break; } } else { KASSERT(m_head != NULL, ("mbuf was freed")); error = hn_txpkt(ifp, txr, txd); if (__predict_false(error)) { /* txd is freed, but m_head is not */ drbr_putback(ifp, txr->hn_mbuf_br, m_head); txr->hn_oactive = 1; break; } } } #ifdef INVARIANTS else { KASSERT(txr->hn_agg_txd != NULL, ("no aggregating txdesc")); KASSERT(m_head == NULL, ("pending mbuf for aggregating txdesc")); } #endif /* Sent */ drbr_advance(ifp, txr->hn_mbuf_br); } /* Flush pending aggerated transmission. */ if (txr->hn_agg_txd != NULL) hn_flush_txagg(ifp, txr); return (sched); } static int hn_transmit(struct ifnet *ifp, struct mbuf *m) { struct hn_softc *sc = ifp->if_softc; struct hn_tx_ring *txr; int error, idx = 0; if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) { struct rm_priotracker pt; rm_rlock(&sc->hn_vf_lock, &pt); if (__predict_true(sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) { struct mbuf *m_bpf = NULL; int obytes, omcast; obytes = m->m_pkthdr.len; omcast = (m->m_flags & M_MCAST) != 0; if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF) { if (bpf_peers_present(ifp->if_bpf)) { m_bpf = m_copypacket(m, M_NOWAIT); if (m_bpf == NULL) { /* * Failed to grab a shallow * copy; tap now. */ ETHER_BPF_MTAP(ifp, m); } } } else { ETHER_BPF_MTAP(ifp, m); } error = sc->hn_vf_ifp->if_transmit(sc->hn_vf_ifp, m); rm_runlock(&sc->hn_vf_lock, &pt); if (m_bpf != NULL) { if (!error) ETHER_BPF_MTAP(ifp, m_bpf); m_freem(m_bpf); } if (error == ENOBUFS) { if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1); } else if (error) { if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); } else { if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); if_inc_counter(ifp, IFCOUNTER_OBYTES, obytes); if (omcast) { if_inc_counter(ifp, IFCOUNTER_OMCASTS, omcast); } } return (error); } rm_runlock(&sc->hn_vf_lock, &pt); } #if defined(INET6) || defined(INET) /* * Perform TSO packet header fixup or get l2/l3 header length now, * since packet headers should be cache-hot. 
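 * (The if_start path in hn_start_locked() performs the same fixup;
 * when the fixup fails the packet is dropped and counted as an
 * output error.)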
*/ if (m->m_pkthdr.csum_flags & CSUM_TSO) { m = hn_tso_fixup(m); if (__predict_false(m == NULL)) { if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return EIO; } } else if (m->m_pkthdr.csum_flags & (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) { m = hn_set_hlen(m); if (__predict_false(m == NULL)) { if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return EIO; } } #endif /* * Select the TX ring based on flowid */ if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) { #ifdef RSS uint32_t bid; if (rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m), &bid) == 0) idx = bid % sc->hn_tx_ring_inuse; else #endif { #if defined(INET6) || defined(INET) int tcpsyn = 0; if (m->m_pkthdr.len < 128 && (m->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP6_TCP)) && (m->m_pkthdr.csum_flags & CSUM_TSO) == 0) { m = hn_check_tcpsyn(m, &tcpsyn); if (__predict_false(m == NULL)) { if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return (EIO); } } #else const int tcpsyn = 0; #endif if (tcpsyn) idx = 0; else idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse; } } txr = &sc->hn_tx_ring[idx]; error = drbr_enqueue(ifp, txr->hn_mbuf_br, m); if (error) { if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1); return error; } if (txr->hn_oactive) return 0; if (txr->hn_sched_tx) goto do_sched; if (mtx_trylock(&txr->hn_tx_lock)) { int sched; sched = hn_xmit(txr, txr->hn_direct_tx_size); mtx_unlock(&txr->hn_tx_lock); if (!sched) return 0; } do_sched: taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task); return 0; } static void hn_tx_ring_qflush(struct hn_tx_ring *txr) { struct mbuf *m; mtx_lock(&txr->hn_tx_lock); while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL) m_freem(m); mtx_unlock(&txr->hn_tx_lock); } static void hn_xmit_qflush(struct ifnet *ifp) { struct hn_softc *sc = ifp->if_softc; struct rm_priotracker pt; int i; for (i = 0; i < sc->hn_tx_ring_inuse; ++i) hn_tx_ring_qflush(&sc->hn_tx_ring[i]); if_qflush(ifp); rm_rlock(&sc->hn_vf_lock, &pt); if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) sc->hn_vf_ifp->if_qflush(sc->hn_vf_ifp); rm_runlock(&sc->hn_vf_lock, &pt); } static void hn_xmit_txeof(struct hn_tx_ring *txr) { if (txr->hn_sched_tx) goto do_sched; if (mtx_trylock(&txr->hn_tx_lock)) { int sched; txr->hn_oactive = 0; sched = hn_xmit(txr, txr->hn_direct_tx_size); mtx_unlock(&txr->hn_tx_lock); if (sched) { taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task); } } else { do_sched: /* * Release the oactive earlier, with the hope, that * others could catch up. The task will clear the * oactive again with the hn_tx_lock to avoid possible * races. */ txr->hn_oactive = 0; taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task); } } static void hn_xmit_taskfunc(void *xtxr, int pending __unused) { struct hn_tx_ring *txr = xtxr; mtx_lock(&txr->hn_tx_lock); hn_xmit(txr, 0); mtx_unlock(&txr->hn_tx_lock); } static void hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused) { struct hn_tx_ring *txr = xtxr; mtx_lock(&txr->hn_tx_lock); txr->hn_oactive = 0; hn_xmit(txr, 0); mtx_unlock(&txr->hn_tx_lock); } static int hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan) { struct vmbus_chan_br cbr; struct hn_rx_ring *rxr; struct hn_tx_ring *txr = NULL; int idx, error; idx = vmbus_chan_subidx(chan); /* * Link this channel to RX/TX ring. 
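 * The sub-channel index selects the RX ring directly; a TX ring is
 * linked only when the index is within hn_tx_ring_inuse.  The channel
 * is then bound to the CPU serving that ring.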
*/ KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse, ("invalid channel index %d, should > 0 && < %d", idx, sc->hn_rx_ring_inuse)); rxr = &sc->hn_rx_ring[idx]; KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0, ("RX ring %d already attached", idx)); rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED; rxr->hn_chan = chan; if (bootverbose) { if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n", idx, vmbus_chan_id(chan)); } if (idx < sc->hn_tx_ring_inuse) { txr = &sc->hn_tx_ring[idx]; KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0, ("TX ring %d already attached", idx)); txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED; txr->hn_chan = chan; if (bootverbose) { if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n", idx, vmbus_chan_id(chan)); } } /* Bind this channel to a proper CPU. */ vmbus_chan_cpu_set(chan, HN_RING_IDX2CPU(sc, idx)); /* * Open this channel */ cbr.cbr = rxr->hn_br; cbr.cbr_paddr = rxr->hn_br_dma.hv_paddr; cbr.cbr_txsz = HN_TXBR_SIZE; cbr.cbr_rxsz = HN_RXBR_SIZE; error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr); if (error) { if (error == EISCONN) { if_printf(sc->hn_ifp, "bufring is connected after " "chan%u open failure\n", vmbus_chan_id(chan)); rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF; } else { if_printf(sc->hn_ifp, "open chan%u failed: %d\n", vmbus_chan_id(chan), error); } } return (error); } static void hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan) { struct hn_rx_ring *rxr; int idx, error; idx = vmbus_chan_subidx(chan); /* * Link this channel to RX/TX ring. */ KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse, ("invalid channel index %d, should > 0 && < %d", idx, sc->hn_rx_ring_inuse)); rxr = &sc->hn_rx_ring[idx]; KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED), ("RX ring %d is not attached", idx)); rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED; if (idx < sc->hn_tx_ring_inuse) { struct hn_tx_ring *txr = &sc->hn_tx_ring[idx]; KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED), ("TX ring %d is not attached attached", idx)); txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED; } /* * Close this channel. * * NOTE: * Channel closing does _not_ destroy the target channel. */ error = vmbus_chan_close_direct(chan); if (error == EISCONN) { if_printf(sc->hn_ifp, "chan%u bufring is connected " "after being closed\n", vmbus_chan_id(chan)); rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF; } else if (error) { if_printf(sc->hn_ifp, "chan%u close failed: %d\n", vmbus_chan_id(chan), error); } } static int hn_attach_subchans(struct hn_softc *sc) { struct vmbus_channel **subchans; int subchan_cnt = sc->hn_rx_ring_inuse - 1; int i, error = 0; KASSERT(subchan_cnt > 0, ("no sub-channels")); /* Attach the sub-channels. */ subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt); for (i = 0; i < subchan_cnt; ++i) { int error1; error1 = hn_chan_attach(sc, subchans[i]); if (error1) { error = error1; /* Move on; all channels will be detached later. */ } } vmbus_subchan_rel(subchans, subchan_cnt); if (error) { if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error); } else { if (bootverbose) { if_printf(sc->hn_ifp, "%d sub-channels attached\n", subchan_cnt); } } return (error); } static void hn_detach_allchans(struct hn_softc *sc) { struct vmbus_channel **subchans; int subchan_cnt = sc->hn_rx_ring_inuse - 1; int i; if (subchan_cnt == 0) goto back; /* Detach the sub-channels. 
*/ subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt); for (i = 0; i < subchan_cnt; ++i) hn_chan_detach(sc, subchans[i]); vmbus_subchan_rel(subchans, subchan_cnt); back: /* * Detach the primary channel, _after_ all sub-channels * are detached. */ hn_chan_detach(sc, sc->hn_prichan); /* Wait for sub-channels to be destroyed, if any. */ vmbus_subchan_drain(sc->hn_prichan); #ifdef INVARIANTS for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { KASSERT((sc->hn_rx_ring[i].hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0, ("%dth RX ring is still attached", i)); } for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { KASSERT((sc->hn_tx_ring[i].hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0, ("%dth TX ring is still attached", i)); } #endif } static int hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch) { struct vmbus_channel **subchans; int nchan, rxr_cnt, error; nchan = *nsubch + 1; if (nchan == 1) { /* * Multiple RX/TX rings are not requested. */ *nsubch = 0; return (0); } /* * Query RSS capabilities, e.g. # of RX rings, and # of indirect * table entries. */ error = hn_rndis_query_rsscaps(sc, &rxr_cnt); if (error) { /* No RSS; this is benign. */ *nsubch = 0; return (0); } if (bootverbose) { if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n", rxr_cnt, nchan); } if (nchan > rxr_cnt) nchan = rxr_cnt; if (nchan == 1) { if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n"); *nsubch = 0; return (0); } /* * Allocate sub-channels from NVS. */ *nsubch = nchan - 1; error = hn_nvs_alloc_subchans(sc, nsubch); if (error || *nsubch == 0) { /* Failed to allocate sub-channels. */ *nsubch = 0; return (0); } /* * Wait for all sub-channels to become ready before moving on. */ subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch); vmbus_subchan_rel(subchans, *nsubch); return (0); } static bool hn_synth_attachable(const struct hn_softc *sc) { int i; if (sc->hn_flags & HN_FLAG_ERRORS) return (false); for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { const struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; if (rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) return (false); } return (true); } /* * Make sure that the RX filter is zero after the successful * RNDIS initialization. * * NOTE: * Under certain conditions on certain versions of Hyper-V, * the RNDIS rxfilter is _not_ zero on the hypervisor side * after the successful RNDIS initialization, which breaks * the assumption of any following code (well, it breaks the * RNDIS API contract actually). Clear the RNDIS rxfilter * explicitly, drain packets sneaking through, and drain the * interrupt taskqueues scheduled due to the stealth packets. */ static void hn_rndis_init_fixat(struct hn_softc *sc, int nchan) { hn_disable_rx(sc); hn_drain_rxtx(sc, nchan); } static int hn_synth_attach(struct hn_softc *sc, int mtu) { #define ATTACHED_NVS 0x0002 #define ATTACHED_RNDIS 0x0004 struct ndis_rssprm_toeplitz *rss = &sc->hn_rss; int error, nsubch, nchan = 1, i, rndis_inited; uint32_t old_caps, attached = 0; KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0, ("synthetic parts were attached")); if (!hn_synth_attachable(sc)) return (ENXIO); /* Save capabilities for later verification. */ old_caps = sc->hn_caps; sc->hn_caps = 0; /* Clear RSS stuffs. */ sc->hn_rss_ind_size = 0; sc->hn_rss_hash = 0; sc->hn_rss_hcap = 0; /* * Attach the primary channel _before_ attaching NVS and RNDIS. */ error = hn_chan_attach(sc, sc->hn_prichan); if (error) goto failed; /* * Attach NVS. */ error = hn_nvs_attach(sc, mtu); if (error) goto failed; attached |= ATTACHED_NVS; /* * Attach RNDIS _after_ NVS is attached. 
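 * hn_rndis_attach() reports through rndis_inited whether RNDIS was
 * initialized even when it fails, so the error path below knows
 * whether an RNDIS detach is required.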
*/ error = hn_rndis_attach(sc, mtu, &rndis_inited); if (rndis_inited) attached |= ATTACHED_RNDIS; if (error) goto failed; /* * Make sure capabilities are not changed. */ if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) { if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n", old_caps, sc->hn_caps); error = ENXIO; goto failed; } /* * Allocate sub-channels for multi-TX/RX rings. * * NOTE: * The # of RX rings that can be used is equivalent to the # of * channels to be requested. */ nsubch = sc->hn_rx_ring_cnt - 1; error = hn_synth_alloc_subchans(sc, &nsubch); if (error) goto failed; /* NOTE: _Full_ synthetic parts detach is required now. */ sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED; /* * Set the # of TX/RX rings that could be used according to * the # of channels that NVS offered. */ nchan = nsubch + 1; hn_set_ring_inuse(sc, nchan); if (nchan == 1) { /* Only the primary channel can be used; done */ goto back; } /* * Attach the sub-channels. * * NOTE: hn_set_ring_inuse() _must_ have been called. */ error = hn_attach_subchans(sc); if (error) goto failed; /* * Configure RSS key and indirect table _after_ all sub-channels * are attached. */ if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) { /* * RSS key is not set yet; set it to the default RSS key. */ if (bootverbose) if_printf(sc->hn_ifp, "setup default RSS key\n"); #ifdef RSS rss_getkey(rss->rss_key); #else memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key)); #endif sc->hn_flags |= HN_FLAG_HAS_RSSKEY; } if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) { /* * RSS indirect table is not set yet; set it up in round- * robin fashion. */ if (bootverbose) { if_printf(sc->hn_ifp, "setup default RSS indirect " "table\n"); } for (i = 0; i < NDIS_HASH_INDCNT; ++i) { uint32_t subidx; #ifdef RSS subidx = rss_get_indirection_to_bucket(i); #else subidx = i; #endif rss->rss_ind[i] = subidx % nchan; } sc->hn_flags |= HN_FLAG_HAS_RSSIND; } else { /* * # of usable channels may be changed, so we have to * make sure that all entries in RSS indirect table * are valid. * * NOTE: hn_set_ring_inuse() _must_ have been called. */ hn_rss_ind_fixup(sc); } sc->hn_rss_hash = sc->hn_rss_hcap; if ((sc->hn_flags & HN_FLAG_RXVF) || (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) { /* NOTE: Don't reconfigure RSS; will do immediately. */ hn_vf_rss_fixup(sc, false); } error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE); if (error) goto failed; back: /* * Fixup transmission aggregation setup. */ hn_set_txagg(sc); hn_rndis_init_fixat(sc, nchan); return (0); failed: if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) { hn_rndis_init_fixat(sc, nchan); hn_synth_detach(sc); } else { if (attached & ATTACHED_RNDIS) { hn_rndis_init_fixat(sc, nchan); hn_rndis_detach(sc); } if (attached & ATTACHED_NVS) hn_nvs_detach(sc); hn_chan_detach(sc, sc->hn_prichan); /* Restore old capabilities. */ sc->hn_caps = old_caps; } return (error); #undef ATTACHED_RNDIS #undef ATTACHED_NVS } /* * NOTE: * The interface must have been suspended though hn_suspend(), before * this function get called. */ static void hn_synth_detach(struct hn_softc *sc) { KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED, ("synthetic parts were not attached")); /* Detach the RNDIS first. */ hn_rndis_detach(sc); /* Detach NVS. */ hn_nvs_detach(sc); /* Detach all of the channels. */ hn_detach_allchans(sc); if (vmbus_current_version >= VMBUS_VERSION_WIN10 && sc->hn_rxbuf_gpadl != 0) { /* * Host is post-Win2016, disconnect RXBUF from primary channel here. 
*/ int error; error = vmbus_chan_gpadl_disconnect(sc->hn_prichan, sc->hn_rxbuf_gpadl); if (error) { if_printf(sc->hn_ifp, "rxbuf gpadl disconn failed: %d\n", error); sc->hn_flags |= HN_FLAG_RXBUF_REF; } sc->hn_rxbuf_gpadl = 0; } if (vmbus_current_version >= VMBUS_VERSION_WIN10 && sc->hn_chim_gpadl != 0) { /* * Host is post-Win2016, disconnect chimney sending buffer from * primary channel here. */ int error; error = vmbus_chan_gpadl_disconnect(sc->hn_prichan, sc->hn_chim_gpadl); if (error) { if_printf(sc->hn_ifp, "chim gpadl disconn failed: %d\n", error); sc->hn_flags |= HN_FLAG_CHIM_REF; } sc->hn_chim_gpadl = 0; } sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED; } static void hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt) { KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt, ("invalid ring count %d", ring_cnt)); if (sc->hn_tx_ring_cnt > ring_cnt) sc->hn_tx_ring_inuse = ring_cnt; else sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt; sc->hn_rx_ring_inuse = ring_cnt; #ifdef RSS if (sc->hn_rx_ring_inuse != rss_getnumbuckets()) { if_printf(sc->hn_ifp, "# of RX rings (%d) does not match " "# of RSS buckets (%d)\n", sc->hn_rx_ring_inuse, rss_getnumbuckets()); } #endif if (bootverbose) { if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n", sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse); } } static void hn_chan_drain(struct hn_softc *sc, struct vmbus_channel *chan) { /* * NOTE: * The TX bufring will not be drained by the hypervisor, * if the primary channel is revoked. */ while (!vmbus_chan_rx_empty(chan) || (!vmbus_chan_is_revoked(sc->hn_prichan) && !vmbus_chan_tx_empty(chan))) pause("waitch", 1); vmbus_chan_intr_drain(chan); } static void hn_disable_rx(struct hn_softc *sc) { /* * Disable RX by clearing RX filter forcefully. */ sc->hn_rx_filter = NDIS_PACKET_TYPE_NONE; hn_rndis_set_rxfilter(sc, sc->hn_rx_filter); /* ignore error */ /* * Give RNDIS enough time to flush all pending data packets. */ pause("waitrx", (200 * hz) / 1000); } /* * NOTE: * RX/TX _must_ have been suspended/disabled, before this function * is called. */ static void hn_drain_rxtx(struct hn_softc *sc, int nchan) { struct vmbus_channel **subch = NULL; int nsubch; /* * Drain RX/TX bufrings and interrupts. */ nsubch = nchan - 1; if (nsubch > 0) subch = vmbus_subchan_get(sc->hn_prichan, nsubch); if (subch != NULL) { int i; for (i = 0; i < nsubch; ++i) hn_chan_drain(sc, subch[i]); } hn_chan_drain(sc, sc->hn_prichan); if (subch != NULL) vmbus_subchan_rel(subch, nsubch); } static void hn_suspend_data(struct hn_softc *sc) { struct hn_tx_ring *txr; int i; HN_LOCK_ASSERT(sc); /* * Suspend TX. */ for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { txr = &sc->hn_tx_ring[i]; mtx_lock(&txr->hn_tx_lock); txr->hn_suspended = 1; mtx_unlock(&txr->hn_tx_lock); /* No one is able send more packets now. */ /* * Wait for all pending sends to finish. * * NOTE: * We will _not_ receive all pending send-done, if the * primary channel is revoked. */ while (hn_tx_ring_pending(txr) && !vmbus_chan_is_revoked(sc->hn_prichan)) pause("hnwtx", 1 /* 1 tick */); } /* * Disable RX. */ hn_disable_rx(sc); /* * Drain RX/TX. */ hn_drain_rxtx(sc, sc->hn_rx_ring_inuse); /* * Drain any pending TX tasks. * * NOTE: * The above hn_drain_rxtx() can dispatch TX tasks, so the TX * tasks will have to be drained _after_ the above hn_drain_rxtx(). 
*/ for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { txr = &sc->hn_tx_ring[i]; taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task); taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task); } } static void hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused) { ((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL; } static void hn_suspend_mgmt(struct hn_softc *sc) { struct task task; HN_LOCK_ASSERT(sc); /* * Make sure that hn_mgmt_taskq0 can nolonger be accessed * through hn_mgmt_taskq. */ TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc); vmbus_chan_run_task(sc->hn_prichan, &task); /* * Make sure that all pending management tasks are completed. */ taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init); taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status); taskqueue_drain_all(sc->hn_mgmt_taskq0); } static void hn_suspend(struct hn_softc *sc) { /* Disable polling. */ hn_polling(sc, 0); /* * If the non-transparent mode VF is activated, the synthetic * device is receiving packets, so the data path of the * synthetic device must be suspended. */ if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) || (sc->hn_flags & HN_FLAG_RXVF)) hn_suspend_data(sc); hn_suspend_mgmt(sc); } static void hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt) { int i; KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt, ("invalid TX ring count %d", tx_ring_cnt)); for (i = 0; i < tx_ring_cnt; ++i) { struct hn_tx_ring *txr = &sc->hn_tx_ring[i]; mtx_lock(&txr->hn_tx_lock); txr->hn_suspended = 0; mtx_unlock(&txr->hn_tx_lock); } } static void hn_resume_data(struct hn_softc *sc) { int i; HN_LOCK_ASSERT(sc); /* * Re-enable RX. */ hn_rxfilter_config(sc); /* * Make sure to clear suspend status on "all" TX rings, * since hn_tx_ring_inuse can be changed after * hn_suspend_data(). */ hn_resume_tx(sc, sc->hn_tx_ring_cnt); #ifdef HN_IFSTART_SUPPORT if (!hn_use_if_start) #endif { /* * Flush unused drbrs, since hn_tx_ring_inuse may be * reduced. */ for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i) hn_tx_ring_qflush(&sc->hn_tx_ring[i]); } /* * Kick start TX. */ for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { struct hn_tx_ring *txr = &sc->hn_tx_ring[i]; /* * Use txeof task, so that any pending oactive can be * cleared properly. */ taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task); } } static void hn_resume_mgmt(struct hn_softc *sc) { sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0; /* * Kick off network change detection, if it was pending. * If no network change was pending, start link status * checks, which is more lightweight than network change * detection. */ if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG) hn_change_network(sc); else hn_update_link_status(sc); } static void hn_resume(struct hn_softc *sc) { /* * If the non-transparent mode VF is activated, the synthetic * device have to receive packets, so the data path of the * synthetic device must be resumed. */ if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) || (sc->hn_flags & HN_FLAG_RXVF)) hn_resume_data(sc); /* * Don't resume link status change if VF is attached/activated. * - In the non-transparent VF mode, the synthetic device marks * link down until the VF is deactivated; i.e. VF is down. * - In transparent VF mode, VF's media status is used until * the VF is detached. */ if ((sc->hn_flags & HN_FLAG_RXVF) == 0 && !(hn_xpnt_vf && sc->hn_vf_ifp != NULL)) hn_resume_mgmt(sc); /* * Re-enable polling if this interface is running and * the polling is requested. 
*/ if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) && sc->hn_pollhz > 0) hn_polling(sc, sc->hn_pollhz); } static void hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen) { const struct rndis_status_msg *msg; int ofs; if (dlen < sizeof(*msg)) { if_printf(sc->hn_ifp, "invalid RNDIS status\n"); return; } msg = data; switch (msg->rm_status) { case RNDIS_STATUS_MEDIA_CONNECT: case RNDIS_STATUS_MEDIA_DISCONNECT: hn_update_link_status(sc); break; case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG: case RNDIS_STATUS_LINK_SPEED_CHANGE: /* Not really useful; ignore. */ break; case RNDIS_STATUS_NETWORK_CHANGE: ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset); if (dlen < ofs + msg->rm_stbuflen || msg->rm_stbuflen < sizeof(uint32_t)) { if_printf(sc->hn_ifp, "network changed\n"); } else { uint32_t change; memcpy(&change, ((const uint8_t *)msg) + ofs, sizeof(change)); if_printf(sc->hn_ifp, "network changed, change %u\n", change); } hn_change_network(sc); break; default: if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n", msg->rm_status); break; } } static int hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info) { const struct rndis_pktinfo *pi = info_data; uint32_t mask = 0; while (info_dlen != 0) { const void *data; uint32_t dlen; if (__predict_false(info_dlen < sizeof(*pi))) return (EINVAL); if (__predict_false(info_dlen < pi->rm_size)) return (EINVAL); info_dlen -= pi->rm_size; if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK)) return (EINVAL); if (__predict_false(pi->rm_size < pi->rm_pktinfooffset)) return (EINVAL); dlen = pi->rm_size - pi->rm_pktinfooffset; data = pi->rm_data; switch (pi->rm_type) { case NDIS_PKTINFO_TYPE_VLAN: if (__predict_false(dlen < NDIS_VLAN_INFO_SIZE)) return (EINVAL); info->vlan_info = *((const uint32_t *)data); mask |= HN_RXINFO_VLAN; break; case NDIS_PKTINFO_TYPE_CSUM: if (__predict_false(dlen < NDIS_RXCSUM_INFO_SIZE)) return (EINVAL); info->csum_info = *((const uint32_t *)data); mask |= HN_RXINFO_CSUM; break; case HN_NDIS_PKTINFO_TYPE_HASHVAL: if (__predict_false(dlen < HN_NDIS_HASH_VALUE_SIZE)) return (EINVAL); info->hash_value = *((const uint32_t *)data); mask |= HN_RXINFO_HASHVAL; break; case HN_NDIS_PKTINFO_TYPE_HASHINF: if (__predict_false(dlen < HN_NDIS_HASH_INFO_SIZE)) return (EINVAL); info->hash_info = *((const uint32_t *)data); mask |= HN_RXINFO_HASHINF; break; default: goto next; } if (mask == HN_RXINFO_ALL) { /* All found; done */ break; } next: pi = (const struct rndis_pktinfo *) ((const uint8_t *)pi + pi->rm_size); } /* * Final fixup. * - If there is no hash value, invalidate the hash info. */ if ((mask & HN_RXINFO_HASHVAL) == 0) info->hash_info = HN_NDIS_HASH_INFO_INVALID; return (0); } static __inline bool hn_rndis_check_overlap(int off, int len, int check_off, int check_len) { if (off < check_off) { if (__predict_true(off + len <= check_off)) return (false); } else if (off > check_off) { if (__predict_true(check_off + check_len <= off)) return (false); } return (true); } static void hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen) { const struct rndis_packet_msg *pkt; struct hn_rxinfo info; int data_off, pktinfo_off, data_len, pktinfo_len; /* * Check length. 
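 * The message must cover the fixed packet header, the advertised
 * data/oob/pktinfo regions must fit within rm_len, and a zero-length
 * data region is rejected.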
*/ if (__predict_false(dlen < sizeof(*pkt))) { if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n"); return; } pkt = data; if (__predict_false(dlen < pkt->rm_len)) { if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, " "dlen %d, msglen %u\n", dlen, pkt->rm_len); return; } if (__predict_false(pkt->rm_len < pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) { if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, " "msglen %u, data %u, oob %u, pktinfo %u\n", pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen, pkt->rm_pktinfolen); return; } if (__predict_false(pkt->rm_datalen == 0)) { if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n"); return; } /* * Check offests. */ #define IS_OFFSET_INVALID(ofs) \ ((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN || \ ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK)) /* XXX Hyper-V does not meet data offset alignment requirement */ if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) { if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " "data offset %u\n", pkt->rm_dataoffset); return; } if (__predict_false(pkt->rm_oobdataoffset > 0 && IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) { if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " "oob offset %u\n", pkt->rm_oobdataoffset); return; } if (__predict_true(pkt->rm_pktinfooffset > 0) && __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) { if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " "pktinfo offset %u\n", pkt->rm_pktinfooffset); return; } #undef IS_OFFSET_INVALID data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset); data_len = pkt->rm_datalen; pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset); pktinfo_len = pkt->rm_pktinfolen; /* * Check OOB coverage. */ if (__predict_false(pkt->rm_oobdatalen != 0)) { int oob_off, oob_len; if_printf(rxr->hn_ifp, "got oobdata\n"); oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset); oob_len = pkt->rm_oobdatalen; if (__predict_false(oob_off + oob_len > pkt->rm_len)) { if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " "oob overflow, msglen %u, oob abs %d len %d\n", pkt->rm_len, oob_off, oob_len); return; } /* * Check against data. */ if (hn_rndis_check_overlap(oob_off, oob_len, data_off, data_len)) { if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " "oob overlaps data, oob abs %d len %d, " "data abs %d len %d\n", oob_off, oob_len, data_off, data_len); return; } /* * Check against pktinfo. */ if (pktinfo_len != 0 && hn_rndis_check_overlap(oob_off, oob_len, pktinfo_off, pktinfo_len)) { if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " "oob overlaps pktinfo, oob abs %d len %d, " "pktinfo abs %d len %d\n", oob_off, oob_len, pktinfo_off, pktinfo_len); return; } } /* * Check per-packet-info coverage and find useful per-packet-info. */ info.vlan_info = HN_NDIS_VLAN_INFO_INVALID; info.csum_info = HN_NDIS_RXCSUM_INFO_INVALID; info.hash_info = HN_NDIS_HASH_INFO_INVALID; if (__predict_true(pktinfo_len != 0)) { bool overlap; int error; if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) { if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " "pktinfo overflow, msglen %u, " "pktinfo abs %d len %d\n", pkt->rm_len, pktinfo_off, pktinfo_len); return; } /* * Check packet info coverage. */ overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len, data_off, data_len); if (__predict_false(overlap)) { if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " "pktinfo overlap data, pktinfo abs %d len %d, " "data abs %d len %d\n", pktinfo_off, pktinfo_len, data_off, data_len); return; } /* * Find useful per-packet-info. 
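 * hn_rndis_rxinfo() walks the pktinfo records and extracts the VLAN,
 * checksum and hash fields; malformed records are rejected.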
*/ error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off, pktinfo_len, &info); if (__predict_false(error)) { if_printf(rxr->hn_ifp, "invalid RNDIS packet msg " "pktinfo\n"); return; } } if (__predict_false(data_off + data_len > pkt->rm_len)) { if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " "data overflow, msglen %u, data abs %d len %d\n", pkt->rm_len, data_off, data_len); return; } hn_rxpkt(rxr, ((const uint8_t *)pkt) + data_off, data_len, &info); } static __inline void hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen) { const struct rndis_msghdr *hdr; if (__predict_false(dlen < sizeof(*hdr))) { if_printf(rxr->hn_ifp, "invalid RNDIS msg\n"); return; } hdr = data; if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) { /* Hot data path. */ hn_rndis_rx_data(rxr, data, dlen); /* Done! */ return; } if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG) hn_rndis_rx_status(rxr->hn_ifp->if_softc, data, dlen); else hn_rndis_rx_ctrl(rxr->hn_ifp->if_softc, data, dlen); } static void hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt) { const struct hn_nvs_hdr *hdr; if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) { if_printf(sc->hn_ifp, "invalid nvs notify\n"); return; } hdr = VMBUS_CHANPKT_CONST_DATA(pkt); if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) { /* Useless; ignore */ return; } if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type); } static void hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan, const struct vmbus_chanpkt_hdr *pkt) { struct hn_nvs_sendctx *sndc; sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid; sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt), VMBUS_CHANPKT_DATALEN(pkt)); /* * NOTE: * 'sndc' CAN NOT be accessed anymore, since it can be freed by * its callback. */ } static void hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan, const struct vmbus_chanpkt_hdr *pkthdr) { const struct vmbus_chanpkt_rxbuf *pkt; const struct hn_nvs_hdr *nvs_hdr; int count, i, hlen; if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) { if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n"); return; } nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr); /* Make sure that this is a RNDIS message. */ if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) { if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n", nvs_hdr->nvs_type); return; } hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen); if (__predict_false(hlen < sizeof(*pkt))) { if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n"); return; } pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr; if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) { if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n", pkt->cp_rxbuf_id); return; } count = pkt->cp_rxbuf_cnt; if (__predict_false(hlen < __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) { if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count); return; } /* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */ for (i = 0; i < count; ++i) { int ofs, len; ofs = pkt->cp_rxbuf[i].rb_ofs; len = pkt->cp_rxbuf[i].rb_len; if (__predict_false(ofs + len > HN_RXBUF_SIZE)) { if_printf(rxr->hn_ifp, "%dth RNDIS msg overflow rxbuf, " "ofs %d, len %d\n", i, ofs, len); continue; } hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len); } /* * Ack the consumed RXBUF associated w/ this channel packet, * so that this RXBUF can be recycled by the hypervisor. 
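 * The ack is a completion packet carrying the original transaction id;
 * hn_nvs_ack_rxbuf() retries a bounded number of times on EAGAIN and
 * reports a leaked RXBUF if the ack still cannot be sent.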
*/ hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid); } static void hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan, uint64_t tid) { struct hn_nvs_rndis_ack ack; int retries, error; ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK; ack.nvs_status = HN_NVS_STATUS_OK; retries = 0; again: error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP, VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid); if (__predict_false(error == EAGAIN)) { /* * NOTE: * This should _not_ happen in real world, since the * consumption of the TX bufring from the TX path is * controlled. */ if (rxr->hn_ack_failed == 0) if_printf(rxr->hn_ifp, "RXBUF ack retry\n"); rxr->hn_ack_failed++; retries++; if (retries < 10) { DELAY(100); goto again; } /* RXBUF leaks! */ if_printf(rxr->hn_ifp, "RXBUF ack failed\n"); } } static void hn_chan_callback(struct vmbus_channel *chan, void *xrxr) { struct hn_rx_ring *rxr = xrxr; struct hn_softc *sc = rxr->hn_ifp->if_softc; for (;;) { struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf; int error, pktlen; pktlen = rxr->hn_pktbuf_len; error = vmbus_chan_recv_pkt(chan, pkt, &pktlen); if (__predict_false(error == ENOBUFS)) { void *nbuf; int nlen; /* * Expand channel packet buffer. * * XXX * Use M_WAITOK here, since allocation failure * is fatal. */ nlen = rxr->hn_pktbuf_len * 2; while (nlen < pktlen) nlen *= 2; nbuf = malloc(nlen, M_DEVBUF, M_WAITOK); if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n", rxr->hn_pktbuf_len, nlen); free(rxr->hn_pktbuf, M_DEVBUF); rxr->hn_pktbuf = nbuf; rxr->hn_pktbuf_len = nlen; /* Retry! */ continue; } else if (__predict_false(error == EAGAIN)) { /* No more channel packets; done! */ break; } KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error)); switch (pkt->cph_type) { case VMBUS_CHANPKT_TYPE_COMP: hn_nvs_handle_comp(sc, chan, pkt); break; case VMBUS_CHANPKT_TYPE_RXBUF: hn_nvs_handle_rxbuf(rxr, chan, pkt); break; case VMBUS_CHANPKT_TYPE_INBAND: hn_nvs_handle_notify(sc, pkt); break; default: if_printf(rxr->hn_ifp, "unknown chan pkt %u\n", pkt->cph_type); break; } } hn_chan_rollup(rxr, rxr->hn_txr); } static void hn_sysinit(void *arg __unused) { int i; hn_udpcs_fixup = counter_u64_alloc(M_WAITOK); #ifdef HN_IFSTART_SUPPORT /* * Don't use ifnet.if_start if transparent VF mode is requested; * mainly due to the IFF_DRV_OACTIVE flag. */ if (hn_xpnt_vf && hn_use_if_start) { hn_use_if_start = 0; printf("hn: tranparent VF mode, if_transmit will be used, " "instead of if_start\n"); } #endif if (hn_xpnt_vf_attwait < HN_XPNT_VF_ATTWAIT_MIN) { printf("hn: invalid transparent VF attach routing " "wait timeout %d, reset to %d\n", hn_xpnt_vf_attwait, HN_XPNT_VF_ATTWAIT_MIN); hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN; } /* * Initialize VF map. */ rm_init_flags(&hn_vfmap_lock, "hn_vfmap", RM_SLEEPABLE); hn_vfmap_size = HN_VFMAP_SIZE_DEF; hn_vfmap = malloc(sizeof(struct ifnet *) * hn_vfmap_size, M_DEVBUF, M_WAITOK | M_ZERO); /* * Fix the # of TX taskqueues. */ if (hn_tx_taskq_cnt <= 0) hn_tx_taskq_cnt = 1; else if (hn_tx_taskq_cnt > mp_ncpus) hn_tx_taskq_cnt = mp_ncpus; /* * Fix the TX taskqueue mode. 
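 * Unknown modes fall back to HN_TX_TASKQ_M_INDEP; the shared TX
 * taskqueues below are only created for HN_TX_TASKQ_M_GLOBAL when
 * running on Hyper-V.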
*/ switch (hn_tx_taskq_mode) { case HN_TX_TASKQ_M_INDEP: case HN_TX_TASKQ_M_GLOBAL: case HN_TX_TASKQ_M_EVTTQ: break; default: hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP; break; } if (vm_guest != VM_GUEST_HV) return; if (hn_tx_taskq_mode != HN_TX_TASKQ_M_GLOBAL) return; hn_tx_taskque = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *), M_DEVBUF, M_WAITOK); for (i = 0; i < hn_tx_taskq_cnt; ++i) { hn_tx_taskque[i] = taskqueue_create("hn_tx", M_WAITOK, taskqueue_thread_enqueue, &hn_tx_taskque[i]); taskqueue_start_threads(&hn_tx_taskque[i], 1, PI_NET, "hn tx%d", i); } } SYSINIT(hn_sysinit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysinit, NULL); static void hn_sysuninit(void *arg __unused) { if (hn_tx_taskque != NULL) { int i; for (i = 0; i < hn_tx_taskq_cnt; ++i) taskqueue_free(hn_tx_taskque[i]); free(hn_tx_taskque, M_DEVBUF); } if (hn_vfmap != NULL) free(hn_vfmap, M_DEVBUF); rm_destroy(&hn_vfmap_lock); counter_u64_free(hn_udpcs_fixup); } SYSUNINIT(hn_sysuninit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysuninit, NULL); diff --git a/sys/dev/if_ndis/if_ndis.c b/sys/dev/if_ndis/if_ndis.c index 37cf0e6bf703..1a5e8eeed4be 100644 --- a/sys/dev/if_ndis/if_ndis.c +++ b/sys/dev/if_ndis/if_ndis.c @@ -1,3424 +1,3423 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (c) 2003 * Bill Paul . All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Bill Paul. * 4. Neither the name of the author nor the names of any co-contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY Bill Paul AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL Bill Paul OR THE VOICES IN HIS HEAD * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. * * WPA support originally contributed by Arvind Srinivasan * then hacked upon mercilessly by my. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define NDIS_DEBUG #ifdef NDIS_DEBUG #define DPRINTF(x) do { if (ndis_debug > 0) printf x; } while (0) int ndis_debug = 0; SYSCTL_INT(_debug, OID_AUTO, ndis, CTLFLAG_RW, &ndis_debug, 0, "if_ndis debug level"); #else #define DPRINTF(x) #endif SYSCTL_DECL(_hw_ndisusb); int ndisusb_halt = 1; SYSCTL_INT(_hw_ndisusb, OID_AUTO, halt, CTLFLAG_RW, &ndisusb_halt, 0, "Halt NDIS USB driver when it's attached"); /* 0 - 30 dBm to mW conversion table */ static const uint16_t dBm2mW[] = { 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 6, 6, 7, 8, 9, 10, 11, 13, 14, 16, 18, 20, 22, 25, 28, 32, 35, 40, 45, 50, 56, 63, 71, 79, 89, 100, 112, 126, 141, 158, 178, 200, 224, 251, 282, 316, 355, 398, 447, 501, 562, 631, 708, 794, 891, 1000 }; MODULE_DEPEND(ndis, ether, 1, 1, 1); MODULE_DEPEND(ndis, wlan, 1, 1, 1); MODULE_DEPEND(ndis, ndisapi, 1, 1, 1); MODULE_VERSION(ndis, 1); int ndis_attach (device_t); int ndis_detach (device_t); int ndis_suspend (device_t); int ndis_resume (device_t); void ndis_shutdown (device_t); int ndisdrv_modevent (module_t, int, void *); static void ndis_txeof (ndis_handle, ndis_packet *, ndis_status); static void ndis_rxeof (ndis_handle, ndis_packet **, uint32_t); static void ndis_rxeof_eth (ndis_handle, ndis_handle, char *, void *, uint32_t, void *, uint32_t, uint32_t); static void ndis_rxeof_done (ndis_handle); static void ndis_rxeof_xfr (kdpc *, ndis_handle, void *, void *); static void ndis_rxeof_xfr_done (ndis_handle, ndis_packet *, uint32_t, uint32_t); static void ndis_linksts (ndis_handle, ndis_status, void *, uint32_t); static void ndis_linksts_done (ndis_handle); /* We need to wrap these functions for amd64. 
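 * The wrappers are created in ndisdrv_modevent() via windrv_wrap(),
 * which is given each callback's argument count and the
 * WINDRV_WRAP_STDCALL convention.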
*/ static funcptr ndis_txeof_wrap; static funcptr ndis_rxeof_wrap; static funcptr ndis_rxeof_eth_wrap; static funcptr ndis_rxeof_done_wrap; static funcptr ndis_rxeof_xfr_wrap; static funcptr ndis_rxeof_xfr_done_wrap; static funcptr ndis_linksts_wrap; static funcptr ndis_linksts_done_wrap; static funcptr ndis_ticktask_wrap; static funcptr ndis_ifstarttask_wrap; static funcptr ndis_resettask_wrap; static funcptr ndis_inputtask_wrap; static struct ieee80211vap *ndis_vap_create(struct ieee80211com *, const char [IFNAMSIZ], int, enum ieee80211_opmode, int, const uint8_t [IEEE80211_ADDR_LEN], const uint8_t [IEEE80211_ADDR_LEN]); static void ndis_vap_delete (struct ieee80211vap *); static void ndis_tick (void *); static void ndis_ticktask (device_object *, void *); static int ndis_raw_xmit (struct ieee80211_node *, struct mbuf *, const struct ieee80211_bpf_params *); static void ndis_update_mcast (struct ieee80211com *); static void ndis_update_promisc (struct ieee80211com *); static void ndis_ifstart (struct ifnet *); static void ndis_ifstarttask (device_object *, void *); static void ndis_resettask (device_object *, void *); static void ndis_inputtask (device_object *, void *); static int ndis_ifioctl (struct ifnet *, u_long, caddr_t); static int ndis_newstate (struct ieee80211vap *, enum ieee80211_state, int); static int ndis_nettype_chan (uint32_t); static int ndis_nettype_mode (uint32_t); static void ndis_scan (void *); static void ndis_scan_results (struct ndis_softc *); static void ndis_scan_start (struct ieee80211com *); static void ndis_scan_end (struct ieee80211com *); static void ndis_set_channel (struct ieee80211com *); static void ndis_scan_curchan (struct ieee80211_scan_state *, unsigned long); static void ndis_scan_mindwell (struct ieee80211_scan_state *); static void ndis_init (void *); static void ndis_stop (struct ndis_softc *); static int ndis_ifmedia_upd (struct ifnet *); static void ndis_ifmedia_sts (struct ifnet *, struct ifmediareq *); static int ndis_get_bssid_list (struct ndis_softc *, ndis_80211_bssid_list_ex **); static int ndis_get_assoc (struct ndis_softc *, ndis_wlan_bssid_ex **); static int ndis_probe_offload (struct ndis_softc *); static int ndis_set_offload (struct ndis_softc *); static void ndis_getstate_80211 (struct ndis_softc *); static void ndis_setstate_80211 (struct ndis_softc *); static void ndis_auth_and_assoc (struct ndis_softc *, struct ieee80211vap *); static void ndis_media_status (struct ifnet *, struct ifmediareq *); static int ndis_set_cipher (struct ndis_softc *, int); static int ndis_set_wpa (struct ndis_softc *, void *, int); static int ndis_add_key (struct ieee80211vap *, const struct ieee80211_key *); static int ndis_del_key (struct ieee80211vap *, const struct ieee80211_key *); static void ndis_setmulti (struct ndis_softc *); static void ndis_map_sclist (void *, bus_dma_segment_t *, int, bus_size_t, int); static int ndis_ifattach(struct ndis_softc *); static int ndis_80211attach(struct ndis_softc *); static int ndis_80211ioctl(struct ieee80211com *, u_long , void *); static int ndis_80211transmit(struct ieee80211com *, struct mbuf *); static void ndis_80211parent(struct ieee80211com *); static int ndisdrv_loaded = 0; /* * This routine should call windrv_load() once for each driver * image. This will do the relocation and dynalinking for the * image, and create a Windows driver object which will be * saved in our driver database. 
*/ int ndisdrv_modevent(mod, cmd, arg) module_t mod; int cmd; void *arg; { int error = 0; switch (cmd) { case MOD_LOAD: ndisdrv_loaded++; if (ndisdrv_loaded > 1) break; windrv_wrap((funcptr)ndis_rxeof, &ndis_rxeof_wrap, 3, WINDRV_WRAP_STDCALL); windrv_wrap((funcptr)ndis_rxeof_eth, &ndis_rxeof_eth_wrap, 8, WINDRV_WRAP_STDCALL); windrv_wrap((funcptr)ndis_rxeof_done, &ndis_rxeof_done_wrap, 1, WINDRV_WRAP_STDCALL); windrv_wrap((funcptr)ndis_rxeof_xfr, &ndis_rxeof_xfr_wrap, 4, WINDRV_WRAP_STDCALL); windrv_wrap((funcptr)ndis_rxeof_xfr_done, &ndis_rxeof_xfr_done_wrap, 4, WINDRV_WRAP_STDCALL); windrv_wrap((funcptr)ndis_txeof, &ndis_txeof_wrap, 3, WINDRV_WRAP_STDCALL); windrv_wrap((funcptr)ndis_linksts, &ndis_linksts_wrap, 4, WINDRV_WRAP_STDCALL); windrv_wrap((funcptr)ndis_linksts_done, &ndis_linksts_done_wrap, 1, WINDRV_WRAP_STDCALL); windrv_wrap((funcptr)ndis_ticktask, &ndis_ticktask_wrap, 2, WINDRV_WRAP_STDCALL); windrv_wrap((funcptr)ndis_ifstarttask, &ndis_ifstarttask_wrap, 2, WINDRV_WRAP_STDCALL); windrv_wrap((funcptr)ndis_resettask, &ndis_resettask_wrap, 2, WINDRV_WRAP_STDCALL); windrv_wrap((funcptr)ndis_inputtask, &ndis_inputtask_wrap, 2, WINDRV_WRAP_STDCALL); break; case MOD_UNLOAD: ndisdrv_loaded--; if (ndisdrv_loaded > 0) break; /* fallthrough */ case MOD_SHUTDOWN: windrv_unwrap(ndis_rxeof_wrap); windrv_unwrap(ndis_rxeof_eth_wrap); windrv_unwrap(ndis_rxeof_done_wrap); windrv_unwrap(ndis_rxeof_xfr_wrap); windrv_unwrap(ndis_rxeof_xfr_done_wrap); windrv_unwrap(ndis_txeof_wrap); windrv_unwrap(ndis_linksts_wrap); windrv_unwrap(ndis_linksts_done_wrap); windrv_unwrap(ndis_ticktask_wrap); windrv_unwrap(ndis_ifstarttask_wrap); windrv_unwrap(ndis_resettask_wrap); windrv_unwrap(ndis_inputtask_wrap); break; default: error = EINVAL; break; } return (error); } struct mclist_ctx { uint8_t *mclist; int mclistsz; }; static u_int ndis_copy_maddr(void *arg, struct sockaddr_dl *sdl, u_int cnt) { struct mclist_ctx *ctx = arg; if (cnt < ctx->mclistsz) bcopy(LLADDR(sdl), ctx->mclist + (ETHER_ADDR_LEN * cnt), ETHER_ADDR_LEN); return (1); } /* * Program the 64-bit multicast hash filter. 
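 * (Strictly speaking, ndis_setmulti() below hands the miniport an
 * explicit address list via OID_802_3_MULTICAST_LIST and adjusts the
 * packet filter, rather than programming a hash filter.)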
*/ static void ndis_setmulti(sc) struct ndis_softc *sc; { struct ifnet *ifp; struct mclist_ctx ctx; int len, error; if (!NDIS_INITIALIZED(sc)) return; if (sc->ndis_80211) return; ifp = sc->ifp; if (ifp->if_flags & IFF_ALLMULTI || ifp->if_flags & IFF_PROMISC) { sc->ndis_filter |= NDIS_PACKET_TYPE_ALL_MULTICAST; len = sizeof(sc->ndis_filter); error = ndis_set_info(sc, OID_GEN_CURRENT_PACKET_FILTER, &sc->ndis_filter, &len); if (error) device_printf(sc->ndis_dev, "set allmulti failed: %d\n", error); return; } if (if_llmaddr_count(ifp) == 0) return; len = sizeof(ctx.mclistsz); ndis_get_info(sc, OID_802_3_MAXIMUM_LIST_SIZE, &ctx.mclistsz, &len); ctx.mclist = malloc(ETHER_ADDR_LEN * ctx.mclistsz, M_TEMP, M_NOWAIT | M_ZERO); if (ctx.mclist == NULL) { sc->ndis_filter |= NDIS_PACKET_TYPE_ALL_MULTICAST; goto out; } sc->ndis_filter |= NDIS_PACKET_TYPE_MULTICAST; len = if_foreach_llmaddr(ifp, ndis_copy_maddr, &ctx); if (len > ctx.mclistsz) { sc->ndis_filter |= NDIS_PACKET_TYPE_ALL_MULTICAST; sc->ndis_filter &= ~NDIS_PACKET_TYPE_MULTICAST; goto out; } len = len * ETHER_ADDR_LEN; error = ndis_set_info(sc, OID_802_3_MULTICAST_LIST, ctx.mclist, &len); if (error) { device_printf(sc->ndis_dev, "set mclist failed: %d\n", error); sc->ndis_filter |= NDIS_PACKET_TYPE_ALL_MULTICAST; sc->ndis_filter &= ~NDIS_PACKET_TYPE_MULTICAST; } out: free(ctx.mclist, M_TEMP); len = sizeof(sc->ndis_filter); error = ndis_set_info(sc, OID_GEN_CURRENT_PACKET_FILTER, &sc->ndis_filter, &len); if (error) device_printf(sc->ndis_dev, "set multi failed: %d\n", error); } static int ndis_set_offload(sc) struct ndis_softc *sc; { ndis_task_offload *nto; ndis_task_offload_hdr *ntoh; ndis_task_tcpip_csum *nttc; struct ifnet *ifp; int len, error; if (!NDIS_INITIALIZED(sc)) return (EINVAL); if (sc->ndis_80211) return (EINVAL); /* See if there's anything to set. 
*/ ifp = sc->ifp; error = ndis_probe_offload(sc); if (error) return (error); if (sc->ndis_hwassist == 0 && ifp->if_capabilities == 0) return (0); len = sizeof(ndis_task_offload_hdr) + sizeof(ndis_task_offload) + sizeof(ndis_task_tcpip_csum); ntoh = malloc(len, M_TEMP, M_NOWAIT|M_ZERO); if (ntoh == NULL) return (ENOMEM); ntoh->ntoh_vers = NDIS_TASK_OFFLOAD_VERSION; ntoh->ntoh_len = sizeof(ndis_task_offload_hdr); ntoh->ntoh_offset_firsttask = sizeof(ndis_task_offload_hdr); ntoh->ntoh_encapfmt.nef_encaphdrlen = sizeof(struct ether_header); ntoh->ntoh_encapfmt.nef_encap = NDIS_ENCAP_IEEE802_3; ntoh->ntoh_encapfmt.nef_flags = NDIS_ENCAPFLAG_FIXEDHDRLEN; nto = (ndis_task_offload *)((char *)ntoh + ntoh->ntoh_offset_firsttask); nto->nto_vers = NDIS_TASK_OFFLOAD_VERSION; nto->nto_len = sizeof(ndis_task_offload); nto->nto_task = NDIS_TASK_TCPIP_CSUM; nto->nto_offset_nexttask = 0; nto->nto_taskbuflen = sizeof(ndis_task_tcpip_csum); nttc = (ndis_task_tcpip_csum *)nto->nto_taskbuf; if (ifp->if_capenable & IFCAP_TXCSUM) nttc->nttc_v4tx = sc->ndis_v4tx; if (ifp->if_capenable & IFCAP_RXCSUM) nttc->nttc_v4rx = sc->ndis_v4rx; error = ndis_set_info(sc, OID_TCP_TASK_OFFLOAD, ntoh, &len); free(ntoh, M_TEMP); return (error); } static int ndis_probe_offload(sc) struct ndis_softc *sc; { ndis_task_offload *nto; ndis_task_offload_hdr *ntoh; ndis_task_tcpip_csum *nttc = NULL; struct ifnet *ifp; int len, error, dummy; ifp = sc->ifp; len = sizeof(dummy); error = ndis_get_info(sc, OID_TCP_TASK_OFFLOAD, &dummy, &len); if (error != ENOSPC) return (error); ntoh = malloc(len, M_TEMP, M_NOWAIT|M_ZERO); if (ntoh == NULL) return (ENOMEM); ntoh->ntoh_vers = NDIS_TASK_OFFLOAD_VERSION; ntoh->ntoh_len = sizeof(ndis_task_offload_hdr); ntoh->ntoh_encapfmt.nef_encaphdrlen = sizeof(struct ether_header); ntoh->ntoh_encapfmt.nef_encap = NDIS_ENCAP_IEEE802_3; ntoh->ntoh_encapfmt.nef_flags = NDIS_ENCAPFLAG_FIXEDHDRLEN; error = ndis_get_info(sc, OID_TCP_TASK_OFFLOAD, ntoh, &len); if (error) { free(ntoh, M_TEMP); return (error); } if (ntoh->ntoh_vers != NDIS_TASK_OFFLOAD_VERSION) { free(ntoh, M_TEMP); return (EINVAL); } nto = (ndis_task_offload *)((char *)ntoh + ntoh->ntoh_offset_firsttask); while (1) { switch (nto->nto_task) { case NDIS_TASK_TCPIP_CSUM: nttc = (ndis_task_tcpip_csum *)nto->nto_taskbuf; break; /* Don't handle these yet. 
*/ case NDIS_TASK_IPSEC: case NDIS_TASK_TCP_LARGESEND: default: break; } if (nto->nto_offset_nexttask == 0) break; nto = (ndis_task_offload *)((char *)nto + nto->nto_offset_nexttask); } if (nttc == NULL) { free(ntoh, M_TEMP); return (ENOENT); } sc->ndis_v4tx = nttc->nttc_v4tx; sc->ndis_v4rx = nttc->nttc_v4rx; if (nttc->nttc_v4tx & NDIS_TCPSUM_FLAGS_IP_CSUM) sc->ndis_hwassist |= CSUM_IP; if (nttc->nttc_v4tx & NDIS_TCPSUM_FLAGS_TCP_CSUM) sc->ndis_hwassist |= CSUM_TCP; if (nttc->nttc_v4tx & NDIS_TCPSUM_FLAGS_UDP_CSUM) sc->ndis_hwassist |= CSUM_UDP; if (sc->ndis_hwassist) ifp->if_capabilities |= IFCAP_TXCSUM; if (nttc->nttc_v4rx & NDIS_TCPSUM_FLAGS_IP_CSUM) ifp->if_capabilities |= IFCAP_RXCSUM; if (nttc->nttc_v4rx & NDIS_TCPSUM_FLAGS_TCP_CSUM) ifp->if_capabilities |= IFCAP_RXCSUM; if (nttc->nttc_v4rx & NDIS_TCPSUM_FLAGS_UDP_CSUM) ifp->if_capabilities |= IFCAP_RXCSUM; free(ntoh, M_TEMP); return (0); } static int ndis_nettype_chan(uint32_t type) { switch (type) { case NDIS_80211_NETTYPE_11FH: return (IEEE80211_CHAN_FHSS); case NDIS_80211_NETTYPE_11DS: return (IEEE80211_CHAN_B); case NDIS_80211_NETTYPE_11OFDM5: return (IEEE80211_CHAN_A); case NDIS_80211_NETTYPE_11OFDM24: return (IEEE80211_CHAN_G); } DPRINTF(("unknown channel nettype %d\n", type)); return (IEEE80211_CHAN_B); /* Default to 11B chan */ } static int ndis_nettype_mode(uint32_t type) { switch (type) { case NDIS_80211_NETTYPE_11FH: return (IEEE80211_MODE_FH); case NDIS_80211_NETTYPE_11DS: return (IEEE80211_MODE_11B); case NDIS_80211_NETTYPE_11OFDM5: return (IEEE80211_MODE_11A); case NDIS_80211_NETTYPE_11OFDM24: return (IEEE80211_MODE_11G); } DPRINTF(("unknown mode nettype %d\n", type)); return (IEEE80211_MODE_AUTO); } /* * Attach the interface. Allocate softc structures, do ifmedia * setup and ethernet/BPF attach. */ int ndis_attach(device_t dev) { struct ndis_softc *sc; driver_object *pdrv; device_object *pdo; int error = 0, len; int i; sc = device_get_softc(dev); mtx_init(&sc->ndis_mtx, device_get_nameunit(dev), MTX_NETWORK_LOCK, MTX_DEF); KeInitializeSpinLock(&sc->ndis_rxlock); KeInitializeSpinLock(&sc->ndisusb_tasklock); KeInitializeSpinLock(&sc->ndisusb_xferdonelock); InitializeListHead(&sc->ndis_shlist); InitializeListHead(&sc->ndisusb_tasklist); InitializeListHead(&sc->ndisusb_xferdonelist); callout_init(&sc->ndis_stat_callout, 1); mbufq_init(&sc->ndis_rxqueue, INT_MAX); /* XXXGL: sane maximum */ /* Create sysctl registry nodes */ ndis_create_sysctls(sc); /* Find the PDO for this device instance. */ if (sc->ndis_iftype == PCIBus) pdrv = windrv_lookup(0, "PCI Bus"); else if (sc->ndis_iftype == PCMCIABus) pdrv = windrv_lookup(0, "PCCARD Bus"); else pdrv = windrv_lookup(0, "USB Bus"); pdo = windrv_find_pdo(pdrv, dev); /* * Create a new functional device object for this * device. This is what creates the miniport block * for this device instance. */ if (NdisAddDevice(sc->ndis_dobj, pdo) != STATUS_SUCCESS) { device_printf(dev, "failed to create FDO!\n"); error = ENXIO; goto fail; } /* Tell the user what version of the API the driver is using. */ device_printf(dev, "NDIS API version: %d.%d\n", sc->ndis_chars->nmc_version_major, sc->ndis_chars->nmc_version_minor); /* Do resource conversion. */ if (sc->ndis_iftype == PCMCIABus || sc->ndis_iftype == PCIBus) ndis_convert_res(sc); else sc->ndis_block->nmb_rlist = NULL; /* Install our RX and TX interrupt handlers. 
*/ sc->ndis_block->nmb_senddone_func = ndis_txeof_wrap; sc->ndis_block->nmb_pktind_func = ndis_rxeof_wrap; sc->ndis_block->nmb_ethrxindicate_func = ndis_rxeof_eth_wrap; sc->ndis_block->nmb_ethrxdone_func = ndis_rxeof_done_wrap; sc->ndis_block->nmb_tdcond_func = ndis_rxeof_xfr_done_wrap; /* Override the status handler so we can detect link changes. */ sc->ndis_block->nmb_status_func = ndis_linksts_wrap; sc->ndis_block->nmb_statusdone_func = ndis_linksts_done_wrap; /* Set up work item handlers. */ sc->ndis_tickitem = IoAllocateWorkItem(sc->ndis_block->nmb_deviceobj); sc->ndis_startitem = IoAllocateWorkItem(sc->ndis_block->nmb_deviceobj); sc->ndis_resetitem = IoAllocateWorkItem(sc->ndis_block->nmb_deviceobj); sc->ndis_inputitem = IoAllocateWorkItem(sc->ndis_block->nmb_deviceobj); sc->ndisusb_xferdoneitem = IoAllocateWorkItem(sc->ndis_block->nmb_deviceobj); sc->ndisusb_taskitem = IoAllocateWorkItem(sc->ndis_block->nmb_deviceobj); KeInitializeDpc(&sc->ndis_rxdpc, ndis_rxeof_xfr_wrap, sc->ndis_block); /* Call driver's init routine. */ if (ndis_init_nic(sc)) { device_printf(dev, "init handler failed\n"); error = ENXIO; goto fail; } /* * Figure out how big to make the TX buffer pool. */ len = sizeof(sc->ndis_maxpkts); if (ndis_get_info(sc, OID_GEN_MAXIMUM_SEND_PACKETS, &sc->ndis_maxpkts, &len)) { device_printf(dev, "failed to get max TX packets\n"); error = ENXIO; goto fail; } /* * If this is a deserialized miniport, we don't have * to honor the OID_GEN_MAXIMUM_SEND_PACKETS result. */ if (!NDIS_SERIALIZED(sc->ndis_block)) sc->ndis_maxpkts = NDIS_TXPKTS; /* Enforce some sanity, just in case. */ if (sc->ndis_maxpkts == 0) sc->ndis_maxpkts = 10; sc->ndis_txarray = malloc(sizeof(ndis_packet *) * sc->ndis_maxpkts, M_DEVBUF, M_NOWAIT|M_ZERO); /* Allocate a pool of ndis_packets for TX encapsulation. */ NdisAllocatePacketPool(&i, &sc->ndis_txpool, sc->ndis_maxpkts, PROTOCOL_RESERVED_SIZE_IN_PACKET); if (i != NDIS_STATUS_SUCCESS) { sc->ndis_txpool = NULL; device_printf(dev, "failed to allocate TX packet pool"); error = ENOMEM; goto fail; } sc->ndis_txpending = sc->ndis_maxpkts; sc->ndis_oidcnt = 0; /* Get supported oid list. */ ndis_get_supported_oids(sc, &sc->ndis_oids, &sc->ndis_oidcnt); /* If the NDIS module requested scatter/gather, init maps. */ if (sc->ndis_sc) ndis_init_dma(sc); /* * See if the OID_802_11_CONFIGURATION OID is * supported by this driver. If it is, then this an 802.11 * wireless driver, and we should set up media for wireless. */ for (i = 0; i < sc->ndis_oidcnt; i++) if (sc->ndis_oids[i] == OID_802_11_CONFIGURATION) { sc->ndis_80211 = 1; break; } if (sc->ndis_80211) error = ndis_80211attach(sc); else error = ndis_ifattach(sc); fail: if (error) { ndis_detach(dev); return (error); } if (sc->ndis_iftype == PNPBus && ndisusb_halt == 0) return (error); DPRINTF(("attach done.\n")); /* We're done talking to the NIC for now; halt it. 
*/ ndis_halt_nic(sc); DPRINTF(("halting done.\n")); return (error); } static int ndis_80211attach(struct ndis_softc *sc) { struct ieee80211com *ic = &sc->ndis_ic; ndis_80211_rates_ex rates; struct ndis_80211_nettype_list *ntl; uint32_t arg; int mode, i, r, len, nonettypes = 1; uint8_t bands[IEEE80211_MODE_BYTES] = { 0 }; callout_init(&sc->ndis_scan_callout, 1); ic->ic_softc = sc; ic->ic_ioctl = ndis_80211ioctl; ic->ic_name = device_get_nameunit(sc->ndis_dev); ic->ic_opmode = IEEE80211_M_STA; ic->ic_phytype = IEEE80211_T_DS; ic->ic_caps = IEEE80211_C_8023ENCAP | IEEE80211_C_STA | IEEE80211_C_IBSS; setbit(ic->ic_modecaps, IEEE80211_MODE_AUTO); len = 0; r = ndis_get_info(sc, OID_802_11_NETWORK_TYPES_SUPPORTED, NULL, &len); if (r != ENOSPC) goto nonettypes; ntl = malloc(len, M_DEVBUF, M_WAITOK | M_ZERO); r = ndis_get_info(sc, OID_802_11_NETWORK_TYPES_SUPPORTED, ntl, &len); if (r != 0) { free(ntl, M_DEVBUF); goto nonettypes; } for (i = 0; i < ntl->ntl_items; i++) { mode = ndis_nettype_mode(ntl->ntl_type[i]); if (mode) { nonettypes = 0; setbit(ic->ic_modecaps, mode); setbit(bands, mode); } else device_printf(sc->ndis_dev, "Unknown nettype %d\n", ntl->ntl_type[i]); } free(ntl, M_DEVBUF); nonettypes: /* Default to 11b channels if the card did not supply any */ if (nonettypes) { setbit(ic->ic_modecaps, IEEE80211_MODE_11B); setbit(bands, IEEE80211_MODE_11B); } len = sizeof(rates); bzero((char *)&rates, len); r = ndis_get_info(sc, OID_802_11_SUPPORTED_RATES, (void *)rates, &len); if (r != 0) device_printf(sc->ndis_dev, "get rates failed: 0x%x\n", r); /* * Since the supported rates only up to 8 can be supported, * if this is not 802.11b we're just going to be faking it * all up to heck. */ #define TESTSETRATE(x, y) \ do { \ int i; \ for (i = 0; i < ic->ic_sup_rates[x].rs_nrates; i++) { \ if (ic->ic_sup_rates[x].rs_rates[i] == (y)) \ break; \ } \ if (i == ic->ic_sup_rates[x].rs_nrates) { \ ic->ic_sup_rates[x].rs_rates[i] = (y); \ ic->ic_sup_rates[x].rs_nrates++; \ } \ } while (0) #define SETRATE(x, y) \ ic->ic_sup_rates[x].rs_rates[ic->ic_sup_rates[x].rs_nrates] = (y) #define INCRATE(x) \ ic->ic_sup_rates[x].rs_nrates++ ic->ic_curmode = IEEE80211_MODE_AUTO; if (isset(ic->ic_modecaps, IEEE80211_MODE_11A)) ic->ic_sup_rates[IEEE80211_MODE_11A].rs_nrates = 0; if (isset(ic->ic_modecaps, IEEE80211_MODE_11B)) ic->ic_sup_rates[IEEE80211_MODE_11B].rs_nrates = 0; if (isset(ic->ic_modecaps, IEEE80211_MODE_11G)) ic->ic_sup_rates[IEEE80211_MODE_11G].rs_nrates = 0; for (i = 0; i < len; i++) { switch (rates[i] & IEEE80211_RATE_VAL) { case 2: case 4: case 11: case 10: case 22: if (isclr(ic->ic_modecaps, IEEE80211_MODE_11B)) { /* Lazy-init 802.11b. */ setbit(ic->ic_modecaps, IEEE80211_MODE_11B); ic->ic_sup_rates[IEEE80211_MODE_11B]. rs_nrates = 0; } SETRATE(IEEE80211_MODE_11B, rates[i]); INCRATE(IEEE80211_MODE_11B); break; default: if (isset(ic->ic_modecaps, IEEE80211_MODE_11A)) { SETRATE(IEEE80211_MODE_11A, rates[i]); INCRATE(IEEE80211_MODE_11A); } if (isset(ic->ic_modecaps, IEEE80211_MODE_11G)) { SETRATE(IEEE80211_MODE_11G, rates[i]); INCRATE(IEEE80211_MODE_11G); } break; } } /* * If the hardware supports 802.11g, it most * likely supports 802.11b and all of the * 802.11b and 802.11g speeds, so maybe we can * just cheat here. Just how in the heck do * we detect turbo modes, though? 
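*/

/*
 * Editor's note: illustrative sketch only.  The TESTSETRATE() macro above
 * appends a rate to a mode's rate set only when it is not already present
 * and there is room left.  The function below expresses the same
 * test-then-append idea on a plain array; the names are stand-ins, not
 * net80211 structures.
 */
#include <stdbool.h>
#include <stdint.h>

#define RATESET_MAX	15

struct rateset {
	uint8_t	nrates;
	uint8_t	rates[RATESET_MAX];
};

static bool
rateset_add_unique(struct rateset *rs, uint8_t rate)
{
	int i;

	for (i = 0; i < rs->nrates; i++)
		if (rs->rates[i] == rate)
			return (true);	/* already present */
	if (rs->nrates >= RATESET_MAX)
		return (false);		/* no room left */
	rs->rates[rs->nrates++] = rate;
	return (true);
}

/*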
*/ if (isset(ic->ic_modecaps, IEEE80211_MODE_11B)) { TESTSETRATE(IEEE80211_MODE_11B, IEEE80211_RATE_BASIC|2); TESTSETRATE(IEEE80211_MODE_11B, IEEE80211_RATE_BASIC|4); TESTSETRATE(IEEE80211_MODE_11B, IEEE80211_RATE_BASIC|11); TESTSETRATE(IEEE80211_MODE_11B, IEEE80211_RATE_BASIC|22); } if (isset(ic->ic_modecaps, IEEE80211_MODE_11G)) { TESTSETRATE(IEEE80211_MODE_11G, 48); TESTSETRATE(IEEE80211_MODE_11G, 72); TESTSETRATE(IEEE80211_MODE_11G, 96); TESTSETRATE(IEEE80211_MODE_11G, 108); } if (isset(ic->ic_modecaps, IEEE80211_MODE_11A)) { TESTSETRATE(IEEE80211_MODE_11A, 48); TESTSETRATE(IEEE80211_MODE_11A, 72); TESTSETRATE(IEEE80211_MODE_11A, 96); TESTSETRATE(IEEE80211_MODE_11A, 108); } #undef SETRATE #undef INCRATE #undef TESTSETRATE ieee80211_init_channels(ic, NULL, bands); /* * To test for WPA support, we need to see if we can * set AUTHENTICATION_MODE to WPA and read it back * successfully. */ i = sizeof(arg); arg = NDIS_80211_AUTHMODE_WPA; r = ndis_set_info(sc, OID_802_11_AUTHENTICATION_MODE, &arg, &i); if (r == 0) { r = ndis_get_info(sc, OID_802_11_AUTHENTICATION_MODE, &arg, &i); if (r == 0 && arg == NDIS_80211_AUTHMODE_WPA) ic->ic_caps |= IEEE80211_C_WPA; } /* * To test for supported ciphers, we set each * available encryption type in descending order. * If ENC3 works, then we have WEP, TKIP and AES. * If only ENC2 works, then we have WEP and TKIP. * If only ENC1 works, then we have just WEP. */ i = sizeof(arg); arg = NDIS_80211_WEPSTAT_ENC3ENABLED; r = ndis_set_info(sc, OID_802_11_ENCRYPTION_STATUS, &arg, &i); if (r == 0) { ic->ic_cryptocaps |= IEEE80211_CRYPTO_WEP | IEEE80211_CRYPTO_TKIP | IEEE80211_CRYPTO_AES_CCM; goto got_crypto; } arg = NDIS_80211_WEPSTAT_ENC2ENABLED; r = ndis_set_info(sc, OID_802_11_ENCRYPTION_STATUS, &arg, &i); if (r == 0) { ic->ic_cryptocaps |= IEEE80211_CRYPTO_WEP | IEEE80211_CRYPTO_TKIP; goto got_crypto; } arg = NDIS_80211_WEPSTAT_ENC1ENABLED; r = ndis_set_info(sc, OID_802_11_ENCRYPTION_STATUS, &arg, &i); if (r == 0) ic->ic_cryptocaps |= IEEE80211_CRYPTO_WEP; got_crypto: i = sizeof(arg); r = ndis_get_info(sc, OID_802_11_POWER_MODE, &arg, &i); if (r == 0) ic->ic_caps |= IEEE80211_C_PMGT; r = ndis_get_info(sc, OID_802_11_TX_POWER_LEVEL, &arg, &i); if (r == 0) ic->ic_caps |= IEEE80211_C_TXPMGT; /* * Get station address from the driver. */ len = sizeof(ic->ic_macaddr); ndis_get_info(sc, OID_802_3_CURRENT_ADDRESS, &ic->ic_macaddr, &len); ieee80211_ifattach(ic); ic->ic_raw_xmit = ndis_raw_xmit; ic->ic_scan_start = ndis_scan_start; ic->ic_scan_end = ndis_scan_end; ic->ic_set_channel = ndis_set_channel; ic->ic_scan_curchan = ndis_scan_curchan; ic->ic_scan_mindwell = ndis_scan_mindwell; ic->ic_bsschan = IEEE80211_CHAN_ANYC; ic->ic_vap_create = ndis_vap_create; ic->ic_vap_delete = ndis_vap_delete; ic->ic_update_mcast = ndis_update_mcast; ic->ic_update_promisc = ndis_update_promisc; ic->ic_transmit = ndis_80211transmit; ic->ic_parent = ndis_80211parent; if (bootverbose) ieee80211_announce(ic); return (0); } static int ndis_ifattach(struct ndis_softc *sc) { struct ifnet *ifp; u_char eaddr[ETHER_ADDR_LEN]; int len; ifp = if_alloc(IFT_ETHER); if (ifp == NULL) return (ENOSPC); sc->ifp = ifp; ifp->if_softc = sc; /* Check for task offload support. */ ndis_probe_offload(sc); /* * Get station address from the driver. 
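*/

/*
 * Editor's note: illustrative sketch only.  ndis_80211attach() above probes
 * WPA and cipher support by writing a setting and checking that it can be
 * read back, trying the strongest option first.  The helper below captures
 * that set-then-read-back pattern with caller-supplied callbacks; everything
 * here is a stand-in, not the NDIS OID interface.
 */
#include <stdint.h>

typedef int (*setter_t)(void *dev, uint32_t val);
typedef int (*getter_t)(void *dev, uint32_t *val);

/*
 * Try each candidate value in the order given (strongest first) and return
 * the index of the first one the device accepts and reports back, or -1 if
 * none of them stick.
 */
static int
probe_best(void *dev, setter_t set, getter_t get,
    const uint32_t *cand, int ncand)
{
	uint32_t rd;
	int i;

	for (i = 0; i < ncand; i++) {
		if (set(dev, cand[i]) != 0)
			continue;
		if (get(dev, &rd) == 0 && rd == cand[i])
			return (i);
	}
	return (-1);
}

/*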
*/ len = sizeof(eaddr); ndis_get_info(sc, OID_802_3_CURRENT_ADDRESS, eaddr, &len); if_initname(ifp, device_get_name(sc->ndis_dev), device_get_unit(sc->ndis_dev)); - ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST | - IFF_NEEDSEPOCH; + ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; ifp->if_ioctl = ndis_ifioctl; ifp->if_start = ndis_ifstart; ifp->if_init = ndis_init; ifp->if_baudrate = 10000000; IFQ_SET_MAXLEN(&ifp->if_snd, 50); ifp->if_snd.ifq_drv_maxlen = 25; IFQ_SET_READY(&ifp->if_snd); ifp->if_capenable = ifp->if_capabilities; ifp->if_hwassist = sc->ndis_hwassist; ifmedia_init(&sc->ifmedia, IFM_IMASK, ndis_ifmedia_upd, ndis_ifmedia_sts); ifmedia_add(&sc->ifmedia, IFM_ETHER|IFM_10_T, 0, NULL); ifmedia_add(&sc->ifmedia, IFM_ETHER|IFM_10_T|IFM_FDX, 0, NULL); ifmedia_add(&sc->ifmedia, IFM_ETHER|IFM_100_TX, 0, NULL); ifmedia_add(&sc->ifmedia, IFM_ETHER|IFM_100_TX|IFM_FDX, 0, NULL); ifmedia_add(&sc->ifmedia, IFM_ETHER|IFM_AUTO, 0, NULL); ifmedia_set(&sc->ifmedia, IFM_ETHER|IFM_AUTO); ether_ifattach(ifp, eaddr); return (0); } static struct ieee80211vap * ndis_vap_create(struct ieee80211com *ic, const char name[IFNAMSIZ], int unit, enum ieee80211_opmode opmode, int flags, const uint8_t bssid[IEEE80211_ADDR_LEN], const uint8_t mac[IEEE80211_ADDR_LEN]) { struct ndis_vap *nvp; struct ieee80211vap *vap; if (!TAILQ_EMPTY(&ic->ic_vaps)) /* only one at a time */ return NULL; nvp = malloc(sizeof(struct ndis_vap), M_80211_VAP, M_WAITOK | M_ZERO); vap = &nvp->vap; ieee80211_vap_setup(ic, vap, name, unit, opmode, flags, bssid); /* override with driver methods */ nvp->newstate = vap->iv_newstate; vap->iv_newstate = ndis_newstate; /* complete setup */ ieee80211_vap_attach(vap, ieee80211_media_change, ndis_media_status, mac); ic->ic_opmode = opmode; /* install key handing routines */ vap->iv_key_set = ndis_add_key; vap->iv_key_delete = ndis_del_key; return vap; } static void ndis_vap_delete(struct ieee80211vap *vap) { struct ndis_vap *nvp = NDIS_VAP(vap); struct ieee80211com *ic = vap->iv_ic; struct ndis_softc *sc = ic->ic_softc; ndis_stop(sc); callout_drain(&sc->ndis_scan_callout); ieee80211_vap_detach(vap); free(nvp, M_80211_VAP); } /* * Shutdown hardware and free up resources. This can be called any * time after the mutex has been initialized. It is called in both * the error case in attach and the normal detach case so it needs * to be careful about only freeing resources that have actually been * allocated. 
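*/

/*
 * Editor's note: illustrative sketch only.  ndis_vap_create() above saves
 * net80211's original iv_newstate handler in the containing ndis_vap and
 * installs its own, chaining to the saved pointer for states it does not
 * handle itself.  The fragment below shows that save-and-chain pattern in
 * isolation; the types and names are hypothetical.
 */
struct obj;
typedef int (*newstate_fn)(struct obj *, int, int);

struct obj {
	newstate_fn	newstate;	/* currently installed handler */
};

struct wrapper {
	struct obj	base;		/* must stay the first member */
	newstate_fn	saved;		/* handler present before the override */
};

static int
wrapped_newstate(struct obj *o, int state, int arg)
{
	struct wrapper *w = (struct wrapper *)o;	/* base is first */

	/* Driver-specific handling for selected states would go here. */
	return (w->saved(o, state, arg));		/* chain to the original */
}

/* Assumes base.newstate was already filled in by earlier setup. */
static void
install_wrapper(struct wrapper *w)
{
	w->saved = w->base.newstate;
	w->base.newstate = wrapped_newstate;
}

/*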
*/ int ndis_detach(device_t dev) { struct ifnet *ifp; struct ndis_softc *sc; driver_object *drv; sc = device_get_softc(dev); NDIS_LOCK(sc); if (!sc->ndis_80211) ifp = sc->ifp; else ifp = NULL; if (ifp != NULL) ifp->if_flags &= ~IFF_UP; if (device_is_attached(dev)) { NDIS_UNLOCK(sc); ndis_stop(sc); if (sc->ndis_80211) ieee80211_ifdetach(&sc->ndis_ic); else if (ifp != NULL) ether_ifdetach(ifp); } else NDIS_UNLOCK(sc); if (sc->ndis_tickitem != NULL) IoFreeWorkItem(sc->ndis_tickitem); if (sc->ndis_startitem != NULL) IoFreeWorkItem(sc->ndis_startitem); if (sc->ndis_resetitem != NULL) IoFreeWorkItem(sc->ndis_resetitem); if (sc->ndis_inputitem != NULL) IoFreeWorkItem(sc->ndis_inputitem); if (sc->ndisusb_xferdoneitem != NULL) IoFreeWorkItem(sc->ndisusb_xferdoneitem); if (sc->ndisusb_taskitem != NULL) IoFreeWorkItem(sc->ndisusb_taskitem); bus_generic_detach(dev); ndis_unload_driver(sc); if (sc->ndis_irq) bus_release_resource(dev, SYS_RES_IRQ, 0, sc->ndis_irq); if (sc->ndis_res_io) bus_release_resource(dev, SYS_RES_IOPORT, sc->ndis_io_rid, sc->ndis_res_io); if (sc->ndis_res_mem) bus_release_resource(dev, SYS_RES_MEMORY, sc->ndis_mem_rid, sc->ndis_res_mem); if (sc->ndis_res_altmem) bus_release_resource(dev, SYS_RES_MEMORY, sc->ndis_altmem_rid, sc->ndis_res_altmem); if (ifp != NULL) if_free(ifp); if (sc->ndis_sc) ndis_destroy_dma(sc); if (sc->ndis_txarray) free(sc->ndis_txarray, M_DEVBUF); if (!sc->ndis_80211) ifmedia_removeall(&sc->ifmedia); if (sc->ndis_txpool != NULL) NdisFreePacketPool(sc->ndis_txpool); /* Destroy the PDO for this device. */ if (sc->ndis_iftype == PCIBus) drv = windrv_lookup(0, "PCI Bus"); else if (sc->ndis_iftype == PCMCIABus) drv = windrv_lookup(0, "PCCARD Bus"); else drv = windrv_lookup(0, "USB Bus"); if (drv == NULL) panic("couldn't find driver object"); windrv_destroy_pdo(drv, dev); if (sc->ndis_iftype == PCIBus) bus_dma_tag_destroy(sc->ndis_parent_tag); return (0); } int ndis_suspend(dev) device_t dev; { struct ndis_softc *sc; struct ifnet *ifp; sc = device_get_softc(dev); ifp = sc->ifp; #ifdef notdef if (NDIS_INITIALIZED(sc)) ndis_stop(sc); #endif return (0); } int ndis_resume(dev) device_t dev; { struct ndis_softc *sc; struct ifnet *ifp; sc = device_get_softc(dev); ifp = sc->ifp; if (NDIS_INITIALIZED(sc)) ndis_init(sc); return (0); } /* * The following bunch of routines are here to support drivers that * use the NdisMEthIndicateReceive()/MiniportTransferData() mechanism. * The NdisMEthIndicateReceive() handler runs at DISPATCH_LEVEL for * serialized miniports, or IRQL <= DISPATCH_LEVEL for deserialized * miniports. */ static void ndis_rxeof_eth(adapter, ctx, addr, hdr, hdrlen, lookahead, lookaheadlen, pktlen) ndis_handle adapter; ndis_handle ctx; char *addr; void *hdr; uint32_t hdrlen; void *lookahead; uint32_t lookaheadlen; uint32_t pktlen; { ndis_miniport_block *block; uint8_t irql = 0; uint32_t status; ndis_buffer *b; ndis_packet *p; struct mbuf *m; ndis_ethpriv *priv; block = adapter; m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); if (m == NULL) return; /* Save the data provided to us so far. 
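*/

/*
 * Editor's note: illustrative sketch only.  As the comment before
 * ndis_detach() above explains, the detach path may also run from a failed
 * attach, so every resource is released only if it was actually allocated.
 * The helper below shows that defensive shape with plain malloc/free; the
 * member names are made up.
 */
#include <stdlib.h>

struct res {
	void	*buf_a;
	void	*buf_b;
};

static void
res_teardown(struct res *r)
{
	if (r->buf_a != NULL) {
		free(r->buf_a);
		r->buf_a = NULL;	/* safe to call teardown again */
	}
	if (r->buf_b != NULL) {
		free(r->buf_b);
		r->buf_b = NULL;
	}
}

/*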
*/ m->m_len = lookaheadlen + hdrlen; m->m_pkthdr.len = pktlen + hdrlen; m->m_next = NULL; m_copyback(m, 0, hdrlen, hdr); m_copyback(m, hdrlen, lookaheadlen, lookahead); /* Now create a fake NDIS_PACKET to hold the data */ NdisAllocatePacket(&status, &p, block->nmb_rxpool); if (status != NDIS_STATUS_SUCCESS) { m_freem(m); return; } p->np_m0 = m; b = IoAllocateMdl(m->m_data, m->m_pkthdr.len, FALSE, FALSE, NULL); if (b == NULL) { NdisFreePacket(p); m_freem(m); return; } p->np_private.npp_head = p->np_private.npp_tail = b; p->np_private.npp_totlen = m->m_pkthdr.len; /* Save the packet RX context somewhere. */ priv = (ndis_ethpriv *)&p->np_protocolreserved; priv->nep_ctx = ctx; if (!NDIS_SERIALIZED(block)) KeAcquireSpinLock(&block->nmb_lock, &irql); InsertTailList((&block->nmb_packetlist), (&p->np_list)); if (!NDIS_SERIALIZED(block)) KeReleaseSpinLock(&block->nmb_lock, irql); } /* * NdisMEthIndicateReceiveComplete() handler, runs at DISPATCH_LEVEL * for serialized miniports, or IRQL <= DISPATCH_LEVEL for deserialized * miniports. */ static void ndis_rxeof_done(adapter) ndis_handle adapter; { struct ndis_softc *sc; ndis_miniport_block *block; block = adapter; /* Schedule transfer/RX of queued packets. */ sc = device_get_softc(block->nmb_physdeviceobj->do_devext); KeInsertQueueDpc(&sc->ndis_rxdpc, NULL, NULL); } /* * MiniportTransferData() handler, runs at DISPATCH_LEVEL. */ static void ndis_rxeof_xfr(dpc, adapter, sysarg1, sysarg2) kdpc *dpc; ndis_handle adapter; void *sysarg1; void *sysarg2; { ndis_miniport_block *block; struct ndis_softc *sc; ndis_packet *p; list_entry *l; uint32_t status; ndis_ethpriv *priv; struct ifnet *ifp; struct mbuf *m; block = adapter; sc = device_get_softc(block->nmb_physdeviceobj->do_devext); ifp = sc->ifp; KeAcquireSpinLockAtDpcLevel(&block->nmb_lock); l = block->nmb_packetlist.nle_flink; while(!IsListEmpty(&block->nmb_packetlist)) { l = RemoveHeadList((&block->nmb_packetlist)); p = CONTAINING_RECORD(l, ndis_packet, np_list); InitializeListHead((&p->np_list)); priv = (ndis_ethpriv *)&p->np_protocolreserved; m = p->np_m0; p->np_softc = sc; p->np_m0 = NULL; KeReleaseSpinLockFromDpcLevel(&block->nmb_lock); status = MSCALL6(sc->ndis_chars->nmc_transferdata_func, p, &p->np_private.npp_totlen, block, priv->nep_ctx, m->m_len, m->m_pkthdr.len - m->m_len); KeAcquireSpinLockAtDpcLevel(&block->nmb_lock); /* * If status is NDIS_STATUS_PENDING, do nothing and * wait for a callback to the ndis_rxeof_xfr_done() * handler. */ m->m_len = m->m_pkthdr.len; m->m_pkthdr.rcvif = ifp; if (status == NDIS_STATUS_SUCCESS) { IoFreeMdl(p->np_private.npp_head); NdisFreePacket(p); KeAcquireSpinLockAtDpcLevel(&sc->ndis_rxlock); mbufq_enqueue(&sc->ndis_rxqueue, m); KeReleaseSpinLockFromDpcLevel(&sc->ndis_rxlock); IoQueueWorkItem(sc->ndis_inputitem, (io_workitem_func)ndis_inputtask_wrap, WORKQUEUE_CRITICAL, sc); } if (status == NDIS_STATUS_FAILURE) m_freem(m); /* Advance to next packet */ l = block->nmb_packetlist.nle_flink; } KeReleaseSpinLockFromDpcLevel(&block->nmb_lock); } /* * NdisMTransferDataComplete() handler, runs at DISPATCH_LEVEL. 
*/ static void ndis_rxeof_xfr_done(adapter, packet, status, len) ndis_handle adapter; ndis_packet *packet; uint32_t status; uint32_t len; { ndis_miniport_block *block; struct ndis_softc *sc; struct ifnet *ifp; struct mbuf *m; block = adapter; sc = device_get_softc(block->nmb_physdeviceobj->do_devext); ifp = sc->ifp; m = packet->np_m0; IoFreeMdl(packet->np_private.npp_head); NdisFreePacket(packet); if (status != NDIS_STATUS_SUCCESS) { m_freem(m); return; } m->m_len = m->m_pkthdr.len; m->m_pkthdr.rcvif = ifp; KeAcquireSpinLockAtDpcLevel(&sc->ndis_rxlock); mbufq_enqueue(&sc->ndis_rxqueue, m); KeReleaseSpinLockFromDpcLevel(&sc->ndis_rxlock); IoQueueWorkItem(sc->ndis_inputitem, (io_workitem_func)ndis_inputtask_wrap, WORKQUEUE_CRITICAL, sc); } /* * A frame has been uploaded: pass the resulting mbuf chain up to * the higher level protocols. * * When handling received NDIS packets, the 'status' field in the * out-of-band portion of the ndis_packet has special meaning. In the * most common case, the underlying NDIS driver will set this field * to NDIS_STATUS_SUCCESS, which indicates that it's ok for us to * take possession of it. We then change the status field to * NDIS_STATUS_PENDING to tell the driver that we now own the packet, * and that we will return it at some point in the future via the * return packet handler. * * If the driver hands us a packet with a status of NDIS_STATUS_RESOURCES, * this means the driver is running out of packet/buffer resources and * wants to maintain ownership of the packet. In this case, we have to * copy the packet data into local storage and let the driver keep the * packet. */ static void ndis_rxeof(adapter, packets, pktcnt) ndis_handle adapter; ndis_packet **packets; uint32_t pktcnt; { struct ndis_softc *sc; ndis_miniport_block *block; ndis_packet *p; uint32_t s; ndis_tcpip_csum *csum; struct ifnet *ifp; struct mbuf *m0, *m; int i; block = (ndis_miniport_block *)adapter; sc = device_get_softc(block->nmb_physdeviceobj->do_devext); ifp = sc->ifp; /* * There's a slim chance the driver may indicate some packets * before we're completely ready to handle them. If we detect this, * we need to return them to the miniport and ignore them. */ if (!sc->ndis_running) { for (i = 0; i < pktcnt; i++) { p = packets[i]; if (p->np_oob.npo_status == NDIS_STATUS_SUCCESS) { p->np_refcnt++; ndis_return_packet(p); } } return; } for (i = 0; i < pktcnt; i++) { p = packets[i]; /* Stash the softc here so ptom can use it. */ p->np_softc = sc; if (ndis_ptom(&m0, p)) { device_printf(sc->ndis_dev, "ptom failed\n"); if (p->np_oob.npo_status == NDIS_STATUS_SUCCESS) ndis_return_packet(p); } else { #ifdef notdef if (p->np_oob.npo_status == NDIS_STATUS_RESOURCES) { m = m_dup(m0, M_NOWAIT); /* * NOTE: we want to destroy the mbuf here, but * we don't actually want to return it to the * driver via the return packet handler. By * bumping np_refcnt, we can prevent the * ndis_return_packet() routine from actually * doing anything. */ p->np_refcnt++; m_freem(m0); if (m == NULL) if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); else m0 = m; } else p->np_oob.npo_status = NDIS_STATUS_PENDING; #endif m = m_dup(m0, M_NOWAIT); if (p->np_oob.npo_status == NDIS_STATUS_RESOURCES) p->np_refcnt++; else p->np_oob.npo_status = NDIS_STATUS_PENDING; m_freem(m0); if (m == NULL) { if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); continue; } m0 = m; m0->m_pkthdr.rcvif = ifp; /* Deal with checksum offload. 
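*/

/*
 * Editor's note: illustrative sketch only.  The comment above ndis_rxeof()
 * describes an ownership handshake: on NDIS_STATUS_SUCCESS the host may keep
 * the packet (marking it pending and returning it to the driver later),
 * while on NDIS_STATUS_RESOURCES the driver keeps it and the host must copy
 * the data out first.  The fragment below restates that copy-and-mark
 * decision with stand-in types.
 */
#include <stdlib.h>
#include <string.h>

enum pkt_status { PKT_SUCCESS, PKT_RESOURCES, PKT_PENDING };

struct pkt {
	enum pkt_status	status;
	size_t		len;
	unsigned char	data[1518];
};

/* Returns a buffer the caller owns, or NULL on allocation failure. */
static unsigned char *
rx_take(struct pkt *p, size_t *lenp)
{
	unsigned char *copy;

	copy = malloc(p->len);
	if (copy == NULL)
		return (NULL);
	memcpy(copy, p->data, p->len);
	*lenp = p->len;
	if (p->status == PKT_SUCCESS)
		p->status = PKT_PENDING;	/* we now owe the driver a return */
	/* On PKT_RESOURCES the driver keeps the packet; nothing to mark. */
	return (copy);
}

/*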
*/ if (ifp->if_capenable & IFCAP_RXCSUM && p->np_ext.npe_info[ndis_tcpipcsum_info] != NULL) { s = (uintptr_t) p->np_ext.npe_info[ndis_tcpipcsum_info]; csum = (ndis_tcpip_csum *)&s; if (csum->u.ntc_rxflags & NDIS_RXCSUM_IP_PASSED) m0->m_pkthdr.csum_flags |= CSUM_IP_CHECKED|CSUM_IP_VALID; if (csum->u.ntc_rxflags & (NDIS_RXCSUM_TCP_PASSED | NDIS_RXCSUM_UDP_PASSED)) { m0->m_pkthdr.csum_flags |= CSUM_DATA_VALID|CSUM_PSEUDO_HDR; m0->m_pkthdr.csum_data = 0xFFFF; } } KeAcquireSpinLockAtDpcLevel(&sc->ndis_rxlock); mbufq_enqueue(&sc->ndis_rxqueue, m0); KeReleaseSpinLockFromDpcLevel(&sc->ndis_rxlock); IoQueueWorkItem(sc->ndis_inputitem, (io_workitem_func)ndis_inputtask_wrap, WORKQUEUE_CRITICAL, sc); } } } /* * This routine is run at PASSIVE_LEVEL. We use this routine to pass * packets into the stack in order to avoid calling (*ifp->if_input)() * with any locks held (at DISPATCH_LEVEL, we'll be holding the * 'dispatch level' per-cpu sleep lock). */ static void ndis_inputtask(device_object *dobj, void *arg) { ndis_miniport_block *block; struct ndis_softc *sc = arg; struct mbuf *m; uint8_t irql; block = dobj->do_devext; KeAcquireSpinLock(&sc->ndis_rxlock, &irql); while ((m = mbufq_dequeue(&sc->ndis_rxqueue)) != NULL) { KeReleaseSpinLock(&sc->ndis_rxlock, irql); if ((sc->ndis_80211 != 0)) { struct ieee80211com *ic = &sc->ndis_ic; struct ieee80211vap *vap = TAILQ_FIRST(&ic->ic_vaps); if (vap != NULL) vap->iv_deliver_data(vap, vap->iv_bss, m); } else { struct ifnet *ifp = sc->ifp; (*ifp->if_input)(ifp, m); } KeAcquireSpinLock(&sc->ndis_rxlock, &irql); } KeReleaseSpinLock(&sc->ndis_rxlock, irql); } /* * A frame was downloaded to the chip. It's safe for us to clean up * the list buffers. */ static void ndis_txeof(adapter, packet, status) ndis_handle adapter; ndis_packet *packet; ndis_status status; { struct ndis_softc *sc; ndis_miniport_block *block; struct ifnet *ifp; int idx; struct mbuf *m; block = (ndis_miniport_block *)adapter; sc = device_get_softc(block->nmb_physdeviceobj->do_devext); ifp = sc->ifp; m = packet->np_m0; idx = packet->np_txidx; if (sc->ndis_sc) bus_dmamap_unload(sc->ndis_ttag, sc->ndis_tmaps[idx]); ndis_free_packet(packet); m_freem(m); NDIS_LOCK(sc); sc->ndis_txarray[idx] = NULL; sc->ndis_txpending++; if (!sc->ndis_80211) { struct ifnet *ifp = sc->ifp; if (status == NDIS_STATUS_SUCCESS) if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); else if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; } sc->ndis_tx_timer = 0; NDIS_UNLOCK(sc); if (!sc->ndis_80211) IoQueueWorkItem(sc->ndis_startitem, (io_workitem_func)ndis_ifstarttask_wrap, WORKQUEUE_CRITICAL, sc); DPRINTF(("%s: ndis_ifstarttask_wrap sc=%p\n", __func__, sc)); } static void ndis_linksts(adapter, status, sbuf, slen) ndis_handle adapter; ndis_status status; void *sbuf; uint32_t slen; { ndis_miniport_block *block; struct ndis_softc *sc; block = adapter; sc = device_get_softc(block->nmb_physdeviceobj->do_devext); sc->ndis_sts = status; /* Event list is all full up, drop this one. */ NDIS_LOCK(sc); if (sc->ndis_evt[sc->ndis_evtpidx].ne_sts) { NDIS_UNLOCK(sc); return; } /* Cache the event. 
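*/

/*
 * Editor's note: illustrative sketch only.  ndis_inputtask() above dequeues
 * one packet at a time and drops the receive lock while it hands the packet
 * up, so the input path never runs with the lock held.  The loop below shows
 * the same drop-deliver-reacquire shape using pthreads; the queue layout and
 * deliver() callback are stand-ins.
 */
#include <pthread.h>
#include <stddef.h>

struct node {
	struct node	*next;
	void		*payload;
};

struct queue {
	pthread_mutex_t	lock;
	struct node	*head;
};

static void
queue_drain(struct queue *q, void (*deliver)(struct node *))
{
	struct node *n;

	pthread_mutex_lock(&q->lock);
	while ((n = q->head) != NULL) {
		q->head = n->next;
		pthread_mutex_unlock(&q->lock);	/* never deliver locked */
		deliver(n);			/* callback takes ownership */
		pthread_mutex_lock(&q->lock);
	}
	pthread_mutex_unlock(&q->lock);
}

/*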
*/ if (slen) { sc->ndis_evt[sc->ndis_evtpidx].ne_buf = malloc(slen, M_TEMP, M_NOWAIT); if (sc->ndis_evt[sc->ndis_evtpidx].ne_buf == NULL) { NDIS_UNLOCK(sc); return; } bcopy((char *)sbuf, sc->ndis_evt[sc->ndis_evtpidx].ne_buf, slen); } sc->ndis_evt[sc->ndis_evtpidx].ne_sts = status; sc->ndis_evt[sc->ndis_evtpidx].ne_len = slen; NDIS_EVTINC(sc->ndis_evtpidx); NDIS_UNLOCK(sc); } static void ndis_linksts_done(adapter) ndis_handle adapter; { ndis_miniport_block *block; struct ndis_softc *sc; struct ifnet *ifp; block = adapter; sc = device_get_softc(block->nmb_physdeviceobj->do_devext); ifp = sc->ifp; if (!NDIS_INITIALIZED(sc)) return; switch (sc->ndis_sts) { case NDIS_STATUS_MEDIA_CONNECT: IoQueueWorkItem(sc->ndis_tickitem, (io_workitem_func)ndis_ticktask_wrap, WORKQUEUE_CRITICAL, sc); if (!sc->ndis_80211) IoQueueWorkItem(sc->ndis_startitem, (io_workitem_func)ndis_ifstarttask_wrap, WORKQUEUE_CRITICAL, sc); break; case NDIS_STATUS_MEDIA_DISCONNECT: if (sc->ndis_link) IoQueueWorkItem(sc->ndis_tickitem, (io_workitem_func)ndis_ticktask_wrap, WORKQUEUE_CRITICAL, sc); break; default: break; } } static void ndis_tick(xsc) void *xsc; { struct ndis_softc *sc; sc = xsc; if (sc->ndis_hang_timer && --sc->ndis_hang_timer == 0) { IoQueueWorkItem(sc->ndis_tickitem, (io_workitem_func)ndis_ticktask_wrap, WORKQUEUE_CRITICAL, sc); sc->ndis_hang_timer = sc->ndis_block->nmb_checkforhangsecs; } if (sc->ndis_tx_timer && --sc->ndis_tx_timer == 0) { if_inc_counter(sc->ifp, IFCOUNTER_OERRORS, 1); device_printf(sc->ndis_dev, "watchdog timeout\n"); IoQueueWorkItem(sc->ndis_resetitem, (io_workitem_func)ndis_resettask_wrap, WORKQUEUE_CRITICAL, sc); if (!sc->ndis_80211) IoQueueWorkItem(sc->ndis_startitem, (io_workitem_func)ndis_ifstarttask_wrap, WORKQUEUE_CRITICAL, sc); } callout_reset(&sc->ndis_stat_callout, hz, ndis_tick, sc); } static void ndis_ticktask(device_object *d, void *xsc) { struct ndis_softc *sc = xsc; ndis_checkforhang_handler hangfunc; uint8_t rval; NDIS_LOCK(sc); if (!NDIS_INITIALIZED(sc)) { NDIS_UNLOCK(sc); return; } NDIS_UNLOCK(sc); hangfunc = sc->ndis_chars->nmc_checkhang_func; if (hangfunc != NULL) { rval = MSCALL1(hangfunc, sc->ndis_block->nmb_miniportadapterctx); if (rval == TRUE) { ndis_reset_nic(sc); return; } } NDIS_LOCK(sc); if (sc->ndis_link == 0 && sc->ndis_sts == NDIS_STATUS_MEDIA_CONNECT) { sc->ndis_link = 1; if (sc->ndis_80211 != 0) { struct ieee80211com *ic = &sc->ndis_ic; struct ieee80211vap *vap = TAILQ_FIRST(&ic->ic_vaps); if (vap != NULL) { NDIS_UNLOCK(sc); ndis_getstate_80211(sc); ieee80211_new_state(vap, IEEE80211_S_RUN, -1); NDIS_LOCK(sc); if_link_state_change(vap->iv_ifp, LINK_STATE_UP); } } else if_link_state_change(sc->ifp, LINK_STATE_UP); } if (sc->ndis_link == 1 && sc->ndis_sts == NDIS_STATUS_MEDIA_DISCONNECT) { sc->ndis_link = 0; if (sc->ndis_80211 != 0) { struct ieee80211com *ic = &sc->ndis_ic; struct ieee80211vap *vap = TAILQ_FIRST(&ic->ic_vaps); if (vap != NULL) { NDIS_UNLOCK(sc); ieee80211_new_state(vap, IEEE80211_S_SCAN, 0); NDIS_LOCK(sc); if_link_state_change(vap->iv_ifp, LINK_STATE_DOWN); } } else if_link_state_change(sc->ifp, LINK_STATE_DOWN); } NDIS_UNLOCK(sc); } static void ndis_map_sclist(arg, segs, nseg, mapsize, error) void *arg; bus_dma_segment_t *segs; int nseg; bus_size_t mapsize; int error; { struct ndis_sc_list *sclist; int i; if (error || arg == NULL) return; sclist = arg; sclist->nsl_frags = nseg; for (i = 0; i < nseg; i++) { sclist->nsl_elements[i].nse_addr.np_quad = segs[i].ds_addr; sclist->nsl_elements[i].nse_len = segs[i].ds_len; } } static int 
ndis_raw_xmit(struct ieee80211_node *ni, struct mbuf *m, const struct ieee80211_bpf_params *params) { /* no support; just discard */ m_freem(m); ieee80211_free_node(ni); return (0); } static void ndis_update_mcast(struct ieee80211com *ic) { struct ndis_softc *sc = ic->ic_softc; ndis_setmulti(sc); } static void ndis_update_promisc(struct ieee80211com *ic) { /* not supported */ } static void ndis_ifstarttask(device_object *d, void *arg) { struct ndis_softc *sc = arg; DPRINTF(("%s: sc=%p, ifp=%p\n", __func__, sc, sc->ifp)); if (sc->ndis_80211) return; struct ifnet *ifp = sc->ifp; if (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) ndis_ifstart(ifp); } /* * Main transmit routine. To make NDIS drivers happy, we need to * transform mbuf chains into NDIS packets and feed them to the * send packet routines. Most drivers allow you to send several * packets at once (up to the maxpkts limit). Unfortunately, rather * that accepting them in the form of a linked list, they expect * a contiguous array of pointers to packets. * * For those drivers which use the NDIS scatter/gather DMA mechanism, * we need to perform busdma work here. Those that use map registers * will do the mapping themselves on a buffer by buffer basis. */ static void ndis_ifstart(struct ifnet *ifp) { struct ndis_softc *sc; struct mbuf *m = NULL; ndis_packet **p0 = NULL, *p = NULL; ndis_tcpip_csum *csum; int pcnt = 0, status; sc = ifp->if_softc; NDIS_LOCK(sc); if (!sc->ndis_link || ifp->if_drv_flags & IFF_DRV_OACTIVE) { NDIS_UNLOCK(sc); return; } p0 = &sc->ndis_txarray[sc->ndis_txidx]; while(sc->ndis_txpending) { IFQ_DRV_DEQUEUE(&ifp->if_snd, m); if (m == NULL) break; NdisAllocatePacket(&status, &sc->ndis_txarray[sc->ndis_txidx], sc->ndis_txpool); if (status != NDIS_STATUS_SUCCESS) break; if (ndis_mtop(m, &sc->ndis_txarray[sc->ndis_txidx])) { IFQ_DRV_PREPEND(&ifp->if_snd, m); NDIS_UNLOCK(sc); return; } /* * Save pointer to original mbuf * so we can free it later. */ p = sc->ndis_txarray[sc->ndis_txidx]; p->np_txidx = sc->ndis_txidx; p->np_m0 = m; p->np_oob.npo_status = NDIS_STATUS_PENDING; /* * Do scatter/gather processing, if driver requested it. */ if (sc->ndis_sc) { bus_dmamap_load_mbuf(sc->ndis_ttag, sc->ndis_tmaps[sc->ndis_txidx], m, ndis_map_sclist, &p->np_sclist, BUS_DMA_NOWAIT); bus_dmamap_sync(sc->ndis_ttag, sc->ndis_tmaps[sc->ndis_txidx], BUS_DMASYNC_PREREAD); p->np_ext.npe_info[ndis_sclist_info] = &p->np_sclist; } /* Handle checksum offload. */ if (ifp->if_capenable & IFCAP_TXCSUM && m->m_pkthdr.csum_flags) { csum = (ndis_tcpip_csum *) &p->np_ext.npe_info[ndis_tcpipcsum_info]; csum->u.ntc_txflags = NDIS_TXCSUM_DO_IPV4; if (m->m_pkthdr.csum_flags & CSUM_IP) csum->u.ntc_txflags |= NDIS_TXCSUM_DO_IP; if (m->m_pkthdr.csum_flags & CSUM_TCP) csum->u.ntc_txflags |= NDIS_TXCSUM_DO_TCP; if (m->m_pkthdr.csum_flags & CSUM_UDP) csum->u.ntc_txflags |= NDIS_TXCSUM_DO_UDP; p->np_private.npp_flags = NDIS_PROTOCOL_ID_TCP_IP; } NDIS_INC(sc); sc->ndis_txpending--; pcnt++; /* * If there's a BPF listener, bounce a copy of this frame * to him. */ if (!sc->ndis_80211) /* XXX handle 80211 */ BPF_MTAP(ifp, m); /* * The array that p0 points to must appear contiguous, * so we must not wrap past the end of sc->ndis_txarray[]. * If it looks like we're about to wrap, break out here * so the this batch of packets can be transmitted, then * wait for txeof to ask us to send the rest. 
*/ if (sc->ndis_txidx == 0) break; } if (pcnt == 0) { NDIS_UNLOCK(sc); return; } if (sc->ndis_txpending == 0) ifp->if_drv_flags |= IFF_DRV_OACTIVE; /* * Set a timeout in case the chip goes out to lunch. */ sc->ndis_tx_timer = 5; NDIS_UNLOCK(sc); /* * According to NDIS documentation, if a driver exports * a MiniportSendPackets() routine, we prefer that over * a MiniportSend() routine (which sends just a single * packet). */ if (sc->ndis_chars->nmc_sendmulti_func != NULL) ndis_send_packets(sc, p0, pcnt); else ndis_send_packet(sc, p); return; } static int ndis_80211transmit(struct ieee80211com *ic, struct mbuf *m) { struct ndis_softc *sc = ic->ic_softc; ndis_packet **p0 = NULL, *p = NULL; int status; NDIS_LOCK(sc); if (!sc->ndis_link || !sc->ndis_running) { NDIS_UNLOCK(sc); return (ENXIO); } if (sc->ndis_txpending == 0) { NDIS_UNLOCK(sc); return (ENOBUFS); } p0 = &sc->ndis_txarray[sc->ndis_txidx]; NdisAllocatePacket(&status, &sc->ndis_txarray[sc->ndis_txidx], sc->ndis_txpool); if (status != NDIS_STATUS_SUCCESS) { NDIS_UNLOCK(sc); return (ENOBUFS); } if (ndis_mtop(m, &sc->ndis_txarray[sc->ndis_txidx])) { NDIS_UNLOCK(sc); return (ENOBUFS); } /* * Save pointer to original mbuf * so we can free it later. */ p = sc->ndis_txarray[sc->ndis_txidx]; p->np_txidx = sc->ndis_txidx; p->np_m0 = m; p->np_oob.npo_status = NDIS_STATUS_PENDING; /* * Do scatter/gather processing, if driver requested it. */ if (sc->ndis_sc) { bus_dmamap_load_mbuf(sc->ndis_ttag, sc->ndis_tmaps[sc->ndis_txidx], m, ndis_map_sclist, &p->np_sclist, BUS_DMA_NOWAIT); bus_dmamap_sync(sc->ndis_ttag, sc->ndis_tmaps[sc->ndis_txidx], BUS_DMASYNC_PREREAD); p->np_ext.npe_info[ndis_sclist_info] = &p->np_sclist; } NDIS_INC(sc); sc->ndis_txpending--; /* * Set a timeout in case the chip goes out to lunch. */ sc->ndis_tx_timer = 5; NDIS_UNLOCK(sc); /* * According to NDIS documentation, if a driver exports * a MiniportSendPackets() routine, we prefer that over * a MiniportSend() routine (which sends just a single * packet). */ if (sc->ndis_chars->nmc_sendmulti_func != NULL) ndis_send_packets(sc, p0, 1); else ndis_send_packet(sc, p); return (0); } static void ndis_80211parent(struct ieee80211com *ic) { struct ndis_softc *sc = ic->ic_softc; /*NDIS_LOCK(sc);*/ if (ic->ic_nrunning > 0) { if (!sc->ndis_running) ndis_init(sc); } else if (sc->ndis_running) ndis_stop(sc); /*NDIS_UNLOCK(sc);*/ } static void ndis_init(void *xsc) { struct ndis_softc *sc = xsc; int i, len, error; /* * Avoid reintializing the link unnecessarily. * This should be dealt with in a better way by * fixing the upper layer modules so they don't * call ifp->if_init() quite as often. */ if (sc->ndis_link) return; /* * Cancel pending I/O and free all RX/TX buffers. */ ndis_stop(sc); if (!(sc->ndis_iftype == PNPBus && ndisusb_halt == 0)) { error = ndis_init_nic(sc); if (error != 0) { device_printf(sc->ndis_dev, "failed to initialize the device: %d\n", error); return; } } /* Program the packet filter */ sc->ndis_filter = NDIS_PACKET_TYPE_DIRECTED | NDIS_PACKET_TYPE_BROADCAST; if (sc->ndis_80211) { struct ieee80211com *ic = &sc->ndis_ic; if (ic->ic_promisc > 0) sc->ndis_filter |= NDIS_PACKET_TYPE_PROMISCUOUS; } else { struct ifnet *ifp = sc->ifp; if (ifp->if_flags & IFF_PROMISC) sc->ndis_filter |= NDIS_PACKET_TYPE_PROMISCUOUS; } len = sizeof(sc->ndis_filter); error = ndis_set_info(sc, OID_GEN_CURRENT_PACKET_FILTER, &sc->ndis_filter, &len); if (error) device_printf(sc->ndis_dev, "set filter failed: %d\n", error); /* * Set lookahead. 
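*/

/*
 * Editor's note: illustrative sketch only.  ndis_ifstart() above builds each
 * transmit batch from a fixed array used as a ring and deliberately stops at
 * the wrap point, because the device expects one contiguous array of packet
 * pointers.  The helper below shows the same "advance until wrap"
 * bookkeeping; the names are stand-ins.
 */
#include <stddef.h>

#define SLOTS	64

struct txring {
	void	*slot[SLOTS];
	size_t	idx;	/* next free slot */
	size_t	avail;	/* free slots remaining */
};

/*
 * Claim up to 'want' contiguous slots starting at the current index.  The
 * returned count never crosses the end of the array; the caller submits the
 * batch and retries for the remainder once completions free more slots.
 */
static size_t
txring_claim(struct txring *r, size_t want, size_t *start)
{
	size_t n = want;

	if (n > r->avail)
		n = r->avail;
	if (n > SLOTS - r->idx)
		n = SLOTS - r->idx;	/* stop at the wrap point */
	*start = r->idx;
	r->idx = (r->idx + n) % SLOTS;
	r->avail -= n;
	return (n);
}

/*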
*/ if (sc->ndis_80211) i = ETHERMTU; else i = sc->ifp->if_mtu; len = sizeof(i); ndis_set_info(sc, OID_GEN_CURRENT_LOOKAHEAD, &i, &len); /* * Program the multicast filter, if necessary. */ ndis_setmulti(sc); /* Setup task offload. */ ndis_set_offload(sc); NDIS_LOCK(sc); sc->ndis_txidx = 0; sc->ndis_txpending = sc->ndis_maxpkts; sc->ndis_link = 0; if (!sc->ndis_80211) { if_link_state_change(sc->ifp, LINK_STATE_UNKNOWN); sc->ifp->if_drv_flags |= IFF_DRV_RUNNING; sc->ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; } sc->ndis_tx_timer = 0; /* * Some drivers don't set this value. The NDIS spec says * the default checkforhang timeout is "approximately 2 * seconds." We use 3 seconds, because it seems for some * drivers, exactly 2 seconds is too fast. */ if (sc->ndis_block->nmb_checkforhangsecs == 0) sc->ndis_block->nmb_checkforhangsecs = 3; sc->ndis_hang_timer = sc->ndis_block->nmb_checkforhangsecs; callout_reset(&sc->ndis_stat_callout, hz, ndis_tick, sc); sc->ndis_running = 1; NDIS_UNLOCK(sc); /* XXX force handling */ if (sc->ndis_80211) ieee80211_start_all(&sc->ndis_ic); /* start all vap's */ } /* * Set media options. */ static int ndis_ifmedia_upd(ifp) struct ifnet *ifp; { struct ndis_softc *sc; sc = ifp->if_softc; if (NDIS_INITIALIZED(sc)) ndis_init(sc); return (0); } /* * Report current media status. */ static void ndis_ifmedia_sts(ifp, ifmr) struct ifnet *ifp; struct ifmediareq *ifmr; { struct ndis_softc *sc; uint32_t media_info; ndis_media_state linkstate; int len; ifmr->ifm_status = IFM_AVALID; ifmr->ifm_active = IFM_ETHER; sc = ifp->if_softc; if (!NDIS_INITIALIZED(sc)) return; len = sizeof(linkstate); ndis_get_info(sc, OID_GEN_MEDIA_CONNECT_STATUS, (void *)&linkstate, &len); len = sizeof(media_info); ndis_get_info(sc, OID_GEN_LINK_SPEED, (void *)&media_info, &len); if (linkstate == nmc_connected) ifmr->ifm_status |= IFM_ACTIVE; switch (media_info) { case 100000: ifmr->ifm_active |= IFM_10_T; break; case 1000000: ifmr->ifm_active |= IFM_100_TX; break; case 10000000: ifmr->ifm_active |= IFM_1000_T; break; default: device_printf(sc->ndis_dev, "unknown speed: %d\n", media_info); break; } } static int ndis_set_cipher(struct ndis_softc *sc, int cipher) { struct ieee80211com *ic = &sc->ndis_ic; int rval = 0, len; uint32_t arg, save; len = sizeof(arg); if (cipher == WPA_CSE_WEP40 || cipher == WPA_CSE_WEP104) { if (!(ic->ic_cryptocaps & IEEE80211_CRYPTO_WEP)) return (ENOTSUP); arg = NDIS_80211_WEPSTAT_ENC1ENABLED; } if (cipher == WPA_CSE_TKIP) { if (!(ic->ic_cryptocaps & IEEE80211_CRYPTO_TKIP)) return (ENOTSUP); arg = NDIS_80211_WEPSTAT_ENC2ENABLED; } if (cipher == WPA_CSE_CCMP) { if (!(ic->ic_cryptocaps & IEEE80211_CRYPTO_AES_CCM)) return (ENOTSUP); arg = NDIS_80211_WEPSTAT_ENC3ENABLED; } DPRINTF(("Setting cipher to %d\n", arg)); save = arg; rval = ndis_set_info(sc, OID_802_11_ENCRYPTION_STATUS, &arg, &len); if (rval) return (rval); /* Check that the cipher was set correctly. */ len = sizeof(save); rval = ndis_get_info(sc, OID_802_11_ENCRYPTION_STATUS, &arg, &len); if (rval != 0 || arg != save) return (ENODEV); return (0); } /* * WPA is hairy to set up. Do the work in a separate routine * so we don't clutter the setstate function too much. * Important yet undocumented fact: first we have to set the * authentication mode, _then_ we enable the ciphers. If one * of the WPA authentication modes isn't enabled, the driver * might not permit the TKIP or AES ciphers to be selected. 
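*/

/*
 * Editor's note: illustrative sketch only.  ndis_ifmedia_sts() above compares
 * the raw OID_GEN_LINK_SPEED value against 100000/1000000/10000000; the OID
 * reports speed in units of 100 bit/s, so those cases correspond to 10, 100
 * and 1000 Mbit/s.  The helper below just makes the unit conversion explicit;
 * the name is a stand-in.
 */
#include <stdint.h>

/* Convert an NDIS-style link speed (units of 100 bit/s) to Mbit/s. */
static uint32_t
linkspeed_to_mbps(uint32_t units_100bps)
{
	return (units_100bps / 10000);	/* 10000 * 100 bit/s = 1 Mbit/s */
}

/*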
*/ static int ndis_set_wpa(sc, ie, ielen) struct ndis_softc *sc; void *ie; int ielen; { struct ieee80211_ie_wpa *w; struct ndis_ie *n; char *pos; uint32_t arg; int i; /* * Apparently, the only way for us to know what ciphers * and key management/authentication mode to use is for * us to inspect the optional information element (IE) * stored in the 802.11 state machine. This IE should be * supplied by the WPA supplicant. */ w = (struct ieee80211_ie_wpa *)ie; /* Check for the right kind of IE. */ if (w->wpa_id != IEEE80211_ELEMID_VENDOR) { DPRINTF(("Incorrect IE type %d\n", w->wpa_id)); return (EINVAL); } /* Skip over the ucast cipher OIDs. */ pos = (char *)&w->wpa_uciphers[0]; pos += w->wpa_uciphercnt * sizeof(struct ndis_ie); /* Skip over the authmode count. */ pos += sizeof(u_int16_t); /* * Check for the authentication modes. I'm * pretty sure there's only supposed to be one. */ n = (struct ndis_ie *)pos; if (n->ni_val == WPA_ASE_NONE) arg = NDIS_80211_AUTHMODE_WPANONE; if (n->ni_val == WPA_ASE_8021X_UNSPEC) arg = NDIS_80211_AUTHMODE_WPA; if (n->ni_val == WPA_ASE_8021X_PSK) arg = NDIS_80211_AUTHMODE_WPAPSK; DPRINTF(("Setting WPA auth mode to %d\n", arg)); i = sizeof(arg); if (ndis_set_info(sc, OID_802_11_AUTHENTICATION_MODE, &arg, &i)) return (ENOTSUP); i = sizeof(arg); ndis_get_info(sc, OID_802_11_AUTHENTICATION_MODE, &arg, &i); /* Now configure the desired ciphers. */ /* First, set up the multicast group cipher. */ n = (struct ndis_ie *)&w->wpa_mcipher[0]; if (ndis_set_cipher(sc, n->ni_val)) return (ENOTSUP); /* Now start looking around for the unicast ciphers. */ pos = (char *)&w->wpa_uciphers[0]; n = (struct ndis_ie *)pos; for (i = 0; i < w->wpa_uciphercnt; i++) { if (ndis_set_cipher(sc, n->ni_val)) return (ENOTSUP); n++; } return (0); } static void ndis_media_status(struct ifnet *ifp, struct ifmediareq *imr) { struct ieee80211vap *vap = ifp->if_softc; struct ndis_softc *sc = vap->iv_ic->ic_softc; uint32_t txrate; int len; if (!NDIS_INITIALIZED(sc)) return; len = sizeof(txrate); if (ndis_get_info(sc, OID_GEN_LINK_SPEED, &txrate, &len) == 0) vap->iv_bss->ni_txrate = txrate / 5000; ieee80211_media_status(ifp, imr); } static void ndis_setstate_80211(struct ndis_softc *sc) { struct ieee80211com *ic = &sc->ndis_ic; struct ieee80211vap *vap = TAILQ_FIRST(&ic->ic_vaps); ndis_80211_macaddr bssid; ndis_80211_config config; int rval = 0, len; uint32_t arg; if (!NDIS_INITIALIZED(sc)) { DPRINTF(("%s: NDIS not initialized\n", __func__)); return; } /* Disassociate and turn off radio. */ len = sizeof(arg); arg = 1; ndis_set_info(sc, OID_802_11_DISASSOCIATE, &arg, &len); /* Set network infrastructure mode. */ len = sizeof(arg); if (ic->ic_opmode == IEEE80211_M_IBSS) arg = NDIS_80211_NET_INFRA_IBSS; else arg = NDIS_80211_NET_INFRA_BSS; rval = ndis_set_info(sc, OID_802_11_INFRASTRUCTURE_MODE, &arg, &len); if (rval) device_printf (sc->ndis_dev, "set infra failed: %d\n", rval); /* Set power management */ len = sizeof(arg); if (vap->iv_flags & IEEE80211_F_PMGTON) arg = NDIS_80211_POWERMODE_FAST_PSP; else arg = NDIS_80211_POWERMODE_CAM; ndis_set_info(sc, OID_802_11_POWER_MODE, &arg, &len); /* Set TX power */ if ((ic->ic_caps & IEEE80211_C_TXPMGT) && ic->ic_txpowlimit < nitems(dBm2mW)) { arg = dBm2mW[ic->ic_txpowlimit]; len = sizeof(arg); ndis_set_info(sc, OID_802_11_TX_POWER_LEVEL, &arg, &len); } /* * Default encryption mode to off, authentication * to open and privacy to 'accept everything.' 
*/ len = sizeof(arg); arg = NDIS_80211_WEPSTAT_DISABLED; ndis_set_info(sc, OID_802_11_ENCRYPTION_STATUS, &arg, &len); len = sizeof(arg); arg = NDIS_80211_AUTHMODE_OPEN; ndis_set_info(sc, OID_802_11_AUTHENTICATION_MODE, &arg, &len); /* * Note that OID_802_11_PRIVACY_FILTER is optional: * not all drivers implement it. */ len = sizeof(arg); arg = NDIS_80211_PRIVFILT_8021XWEP; ndis_set_info(sc, OID_802_11_PRIVACY_FILTER, &arg, &len); len = sizeof(config); bzero((char *)&config, len); config.nc_length = len; config.nc_fhconfig.ncf_length = sizeof(ndis_80211_config_fh); rval = ndis_get_info(sc, OID_802_11_CONFIGURATION, &config, &len); /* * Some drivers expect us to initialize these values, so * provide some defaults. */ if (config.nc_beaconperiod == 0) config.nc_beaconperiod = 100; if (config.nc_atimwin == 0) config.nc_atimwin = 100; if (config.nc_fhconfig.ncf_dwelltime == 0) config.nc_fhconfig.ncf_dwelltime = 200; if (rval == 0 && ic->ic_bsschan != IEEE80211_CHAN_ANYC) { int chan, chanflag; chan = ieee80211_chan2ieee(ic, ic->ic_bsschan); chanflag = config.nc_dsconfig > 2500000 ? IEEE80211_CHAN_2GHZ : IEEE80211_CHAN_5GHZ; if (chan != ieee80211_mhz2ieee(config.nc_dsconfig / 1000, 0)) { config.nc_dsconfig = ic->ic_bsschan->ic_freq * 1000; len = sizeof(config); config.nc_length = len; config.nc_fhconfig.ncf_length = sizeof(ndis_80211_config_fh); DPRINTF(("Setting channel to %ukHz\n", config.nc_dsconfig)); rval = ndis_set_info(sc, OID_802_11_CONFIGURATION, &config, &len); if (rval) device_printf(sc->ndis_dev, "couldn't change " "DS config to %ukHz: %d\n", config.nc_dsconfig, rval); } } else if (rval) device_printf(sc->ndis_dev, "couldn't retrieve " "channel info: %d\n", rval); /* Set the BSSID to our value so the driver doesn't associate */ len = IEEE80211_ADDR_LEN; bcopy(vap->iv_myaddr, bssid, len); DPRINTF(("Setting BSSID to %6D\n", (uint8_t *)&bssid, ":")); rval = ndis_set_info(sc, OID_802_11_BSSID, &bssid, &len); if (rval) device_printf(sc->ndis_dev, "setting BSSID failed: %d\n", rval); } static void ndis_auth_and_assoc(struct ndis_softc *sc, struct ieee80211vap *vap) { struct ieee80211_node *ni = vap->iv_bss; ndis_80211_ssid ssid; ndis_80211_macaddr bssid; ndis_80211_wep wep; int i, rval = 0, len, error; uint32_t arg; if (!NDIS_INITIALIZED(sc)) { DPRINTF(("%s: NDIS not initialized\n", __func__)); return; } /* Initial setup */ ndis_setstate_80211(sc); /* Set network infrastructure mode. 
*/ len = sizeof(arg); if (vap->iv_opmode == IEEE80211_M_IBSS) arg = NDIS_80211_NET_INFRA_IBSS; else arg = NDIS_80211_NET_INFRA_BSS; rval = ndis_set_info(sc, OID_802_11_INFRASTRUCTURE_MODE, &arg, &len); if (rval) device_printf (sc->ndis_dev, "set infra failed: %d\n", rval); /* Set RTS threshold */ len = sizeof(arg); arg = vap->iv_rtsthreshold; ndis_set_info(sc, OID_802_11_RTS_THRESHOLD, &arg, &len); /* Set fragmentation threshold */ len = sizeof(arg); arg = vap->iv_fragthreshold; ndis_set_info(sc, OID_802_11_FRAGMENTATION_THRESHOLD, &arg, &len); /* Set WEP */ if (vap->iv_flags & IEEE80211_F_PRIVACY && !(vap->iv_flags & IEEE80211_F_WPA)) { int keys_set = 0; if (ni->ni_authmode == IEEE80211_AUTH_SHARED) { len = sizeof(arg); arg = NDIS_80211_AUTHMODE_SHARED; DPRINTF(("Setting shared auth\n")); ndis_set_info(sc, OID_802_11_AUTHENTICATION_MODE, &arg, &len); } for (i = 0; i < IEEE80211_WEP_NKID; i++) { if (vap->iv_nw_keys[i].wk_keylen) { if (vap->iv_nw_keys[i].wk_cipher->ic_cipher != IEEE80211_CIPHER_WEP) continue; bzero((char *)&wep, sizeof(wep)); wep.nw_keylen = vap->iv_nw_keys[i].wk_keylen; /* * 5, 13 and 16 are the only valid * key lengths. Anything in between * will be zero padded out to the * next highest boundary. */ if (vap->iv_nw_keys[i].wk_keylen < 5) wep.nw_keylen = 5; else if (vap->iv_nw_keys[i].wk_keylen > 5 && vap->iv_nw_keys[i].wk_keylen < 13) wep.nw_keylen = 13; else if (vap->iv_nw_keys[i].wk_keylen > 13 && vap->iv_nw_keys[i].wk_keylen < 16) wep.nw_keylen = 16; wep.nw_keyidx = i; wep.nw_length = (sizeof(uint32_t) * 3) + wep.nw_keylen; if (i == vap->iv_def_txkey) wep.nw_keyidx |= NDIS_80211_WEPKEY_TX; bcopy(vap->iv_nw_keys[i].wk_key, wep.nw_keydata, wep.nw_length); len = sizeof(wep); DPRINTF(("Setting WEP key %d\n", i)); rval = ndis_set_info(sc, OID_802_11_ADD_WEP, &wep, &len); if (rval) device_printf(sc->ndis_dev, "set wepkey failed: %d\n", rval); keys_set++; } } if (keys_set) { DPRINTF(("Setting WEP on\n")); arg = NDIS_80211_WEPSTAT_ENABLED; len = sizeof(arg); rval = ndis_set_info(sc, OID_802_11_WEP_STATUS, &arg, &len); if (rval) device_printf(sc->ndis_dev, "enable WEP failed: %d\n", rval); if (vap->iv_flags & IEEE80211_F_DROPUNENC) arg = NDIS_80211_PRIVFILT_8021XWEP; else arg = NDIS_80211_PRIVFILT_ACCEPTALL; len = sizeof(arg); ndis_set_info(sc, OID_802_11_PRIVACY_FILTER, &arg, &len); } } /* Set up WPA. */ if ((vap->iv_flags & IEEE80211_F_WPA) && vap->iv_appie_assocreq != NULL) { struct ieee80211_appie *ie = vap->iv_appie_assocreq; error = ndis_set_wpa(sc, ie->ie_data, ie->ie_len); if (error != 0) device_printf(sc->ndis_dev, "WPA setup failed\n"); } #ifdef notyet /* Set network type. */ arg = 0; switch (vap->iv_curmode) { case IEEE80211_MODE_11A: arg = NDIS_80211_NETTYPE_11OFDM5; break; case IEEE80211_MODE_11B: arg = NDIS_80211_NETTYPE_11DS; break; case IEEE80211_MODE_11G: arg = NDIS_80211_NETTYPE_11OFDM24; break; default: device_printf(sc->ndis_dev, "unknown mode: %d\n", vap->iv_curmode); } if (arg) { DPRINTF(("Setting network type to %d\n", arg)); len = sizeof(arg); rval = ndis_set_info(sc, OID_802_11_NETWORK_TYPE_IN_USE, &arg, &len); if (rval) device_printf(sc->ndis_dev, "set nettype failed: %d\n", rval); } #endif /* * If the user selected a specific BSSID, try * to use that one. This is useful in the case where * there are several APs in range with the same network * name. To delete the BSSID, we use the broadcast * address as the BSSID. 
* Note that some drivers seem to allow setting a BSSID * in ad-hoc mode, which has the effect of forcing the * NIC to create an ad-hoc cell with a specific BSSID, * instead of a randomly chosen one. However, the net80211 * code makes the assumtion that the BSSID setting is invalid * when you're in ad-hoc mode, so we don't allow that here. */ len = IEEE80211_ADDR_LEN; if (vap->iv_flags & IEEE80211_F_DESBSSID && vap->iv_opmode != IEEE80211_M_IBSS) bcopy(ni->ni_bssid, bssid, len); else bcopy(ieee80211broadcastaddr, bssid, len); DPRINTF(("Setting BSSID to %6D\n", (uint8_t *)&bssid, ":")); rval = ndis_set_info(sc, OID_802_11_BSSID, &bssid, &len); if (rval) device_printf(sc->ndis_dev, "setting BSSID failed: %d\n", rval); /* Set SSID -- always do this last. */ #ifdef NDIS_DEBUG if (ndis_debug > 0) { printf("Setting ESSID to "); ieee80211_print_essid(ni->ni_essid, ni->ni_esslen); printf("\n"); } #endif len = sizeof(ssid); bzero((char *)&ssid, len); ssid.ns_ssidlen = ni->ni_esslen; if (ssid.ns_ssidlen == 0) { ssid.ns_ssidlen = 1; } else bcopy(ni->ni_essid, ssid.ns_ssid, ssid.ns_ssidlen); rval = ndis_set_info(sc, OID_802_11_SSID, &ssid, &len); if (rval) device_printf (sc->ndis_dev, "set ssid failed: %d\n", rval); return; } static int ndis_get_bssid_list(sc, bl) struct ndis_softc *sc; ndis_80211_bssid_list_ex **bl; { int len, error; len = sizeof(uint32_t) + (sizeof(ndis_wlan_bssid_ex) * 16); *bl = malloc(len, M_DEVBUF, M_NOWAIT | M_ZERO); if (*bl == NULL) return (ENOMEM); error = ndis_get_info(sc, OID_802_11_BSSID_LIST, *bl, &len); if (error == ENOSPC) { free(*bl, M_DEVBUF); *bl = malloc(len, M_DEVBUF, M_NOWAIT | M_ZERO); if (*bl == NULL) return (ENOMEM); error = ndis_get_info(sc, OID_802_11_BSSID_LIST, *bl, &len); } if (error) { DPRINTF(("%s: failed to read\n", __func__)); free(*bl, M_DEVBUF); return (error); } return (0); } static int ndis_get_assoc(struct ndis_softc *sc, ndis_wlan_bssid_ex **assoc) { struct ieee80211com *ic = &sc->ndis_ic; struct ieee80211vap *vap; struct ieee80211_node *ni; ndis_80211_bssid_list_ex *bl; ndis_wlan_bssid_ex *bs; ndis_80211_macaddr bssid; int i, len, error; if (!sc->ndis_link) return (ENOENT); len = sizeof(bssid); error = ndis_get_info(sc, OID_802_11_BSSID, &bssid, &len); if (error) { device_printf(sc->ndis_dev, "failed to get bssid\n"); return (ENOENT); } vap = TAILQ_FIRST(&ic->ic_vaps); ni = vap->iv_bss; error = ndis_get_bssid_list(sc, &bl); if (error) return (error); bs = (ndis_wlan_bssid_ex *)&bl->nblx_bssid[0]; for (i = 0; i < bl->nblx_items; i++) { if (bcmp(bs->nwbx_macaddr, bssid, sizeof(bssid)) == 0) { *assoc = malloc(bs->nwbx_len, M_TEMP, M_NOWAIT); if (*assoc == NULL) { free(bl, M_TEMP); return (ENOMEM); } bcopy((char *)bs, (char *)*assoc, bs->nwbx_len); free(bl, M_TEMP); if (ic->ic_opmode == IEEE80211_M_STA) ni->ni_associd = 1 | 0xc000; /* fake associd */ return (0); } bs = (ndis_wlan_bssid_ex *)((char *)bs + bs->nwbx_len); } free(bl, M_TEMP); return (ENOENT); } static void ndis_getstate_80211(struct ndis_softc *sc) { struct ieee80211com *ic = &sc->ndis_ic; struct ieee80211vap *vap = TAILQ_FIRST(&ic->ic_vaps); struct ieee80211_node *ni = vap->iv_bss; ndis_wlan_bssid_ex *bs; int rval, len, i = 0; int chanflag; uint32_t arg; if (!NDIS_INITIALIZED(sc)) return; if ((rval = ndis_get_assoc(sc, &bs)) != 0) return; /* We're associated, retrieve info on the current bssid. 
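*/

/*
 * Editor's note: illustrative sketch only.  ndis_get_bssid_list() above
 * issues the query with a guessed buffer, and when the driver answers "too
 * small" (ENOSPC) it reallocates using the length the driver wrote back and
 * retries once.  The helper below shows that grow-and-retry shape with a
 * stand-in query callback.
 */
#include <errno.h>
#include <stddef.h>
#include <stdlib.h>

typedef int (*query_fn)(void *dev, void *buf, size_t *len);

static int
query_grow(void *dev, query_fn query, void **bufp, size_t *lenp)
{
	void *buf, *nbuf;
	int error;

	buf = malloc(*lenp);
	if (buf == NULL)
		return (ENOMEM);
	error = query(dev, buf, lenp);
	if (error == ENOSPC) {
		nbuf = realloc(buf, *lenp);	/* *lenp updated by query() */
		if (nbuf == NULL) {
			free(buf);
			return (ENOMEM);
		}
		buf = nbuf;
		error = query(dev, buf, lenp);
	}
	if (error != 0) {
		free(buf);
		return (error);
	}
	*bufp = buf;
	return (0);
}

/*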
*/ ic->ic_curmode = ndis_nettype_mode(bs->nwbx_nettype); chanflag = ndis_nettype_chan(bs->nwbx_nettype); IEEE80211_ADDR_COPY(ni->ni_bssid, bs->nwbx_macaddr); /* Get SSID from current association info. */ bcopy(bs->nwbx_ssid.ns_ssid, ni->ni_essid, bs->nwbx_ssid.ns_ssidlen); ni->ni_esslen = bs->nwbx_ssid.ns_ssidlen; if (ic->ic_caps & IEEE80211_C_PMGT) { len = sizeof(arg); rval = ndis_get_info(sc, OID_802_11_POWER_MODE, &arg, &len); if (rval) device_printf(sc->ndis_dev, "get power mode failed: %d\n", rval); if (arg == NDIS_80211_POWERMODE_CAM) vap->iv_flags &= ~IEEE80211_F_PMGTON; else vap->iv_flags |= IEEE80211_F_PMGTON; } /* Get TX power */ if (ic->ic_caps & IEEE80211_C_TXPMGT) { len = sizeof(arg); ndis_get_info(sc, OID_802_11_TX_POWER_LEVEL, &arg, &len); for (i = 0; i < nitems(dBm2mW); i++) if (dBm2mW[i] >= arg) break; ic->ic_txpowlimit = i; } /* * Use the current association information to reflect * what channel we're on. */ ic->ic_curchan = ieee80211_find_channel(ic, bs->nwbx_config.nc_dsconfig / 1000, chanflag); if (ic->ic_curchan == NULL) ic->ic_curchan = &ic->ic_channels[0]; ni->ni_chan = ic->ic_curchan; ic->ic_bsschan = ic->ic_curchan; free(bs, M_TEMP); /* * Determine current authentication mode. */ len = sizeof(arg); rval = ndis_get_info(sc, OID_802_11_AUTHENTICATION_MODE, &arg, &len); if (rval) device_printf(sc->ndis_dev, "get authmode status failed: %d\n", rval); else { vap->iv_flags &= ~IEEE80211_F_WPA; switch (arg) { case NDIS_80211_AUTHMODE_OPEN: ni->ni_authmode = IEEE80211_AUTH_OPEN; break; case NDIS_80211_AUTHMODE_SHARED: ni->ni_authmode = IEEE80211_AUTH_SHARED; break; case NDIS_80211_AUTHMODE_AUTO: ni->ni_authmode = IEEE80211_AUTH_AUTO; break; case NDIS_80211_AUTHMODE_WPA: case NDIS_80211_AUTHMODE_WPAPSK: case NDIS_80211_AUTHMODE_WPANONE: ni->ni_authmode = IEEE80211_AUTH_WPA; vap->iv_flags |= IEEE80211_F_WPA1; break; case NDIS_80211_AUTHMODE_WPA2: case NDIS_80211_AUTHMODE_WPA2PSK: ni->ni_authmode = IEEE80211_AUTH_WPA; vap->iv_flags |= IEEE80211_F_WPA2; break; default: ni->ni_authmode = IEEE80211_AUTH_NONE; break; } } len = sizeof(arg); rval = ndis_get_info(sc, OID_802_11_WEP_STATUS, &arg, &len); if (rval) device_printf(sc->ndis_dev, "get wep status failed: %d\n", rval); if (arg == NDIS_80211_WEPSTAT_ENABLED) vap->iv_flags |= IEEE80211_F_PRIVACY|IEEE80211_F_DROPUNENC; else vap->iv_flags &= ~(IEEE80211_F_PRIVACY|IEEE80211_F_DROPUNENC); } static int ndis_ifioctl(ifp, command, data) struct ifnet *ifp; u_long command; caddr_t data; { struct ndis_softc *sc = ifp->if_softc; struct ifreq *ifr = (struct ifreq *) data; int i, error = 0; /*NDIS_LOCK(sc);*/ switch (command) { case SIOCSIFFLAGS: if (ifp->if_flags & IFF_UP) { if (sc->ndis_running && ifp->if_flags & IFF_PROMISC && !(sc->ndis_if_flags & IFF_PROMISC)) { sc->ndis_filter |= NDIS_PACKET_TYPE_PROMISCUOUS; i = sizeof(sc->ndis_filter); error = ndis_set_info(sc, OID_GEN_CURRENT_PACKET_FILTER, &sc->ndis_filter, &i); } else if (sc->ndis_running && !(ifp->if_flags & IFF_PROMISC) && sc->ndis_if_flags & IFF_PROMISC) { sc->ndis_filter &= ~NDIS_PACKET_TYPE_PROMISCUOUS; i = sizeof(sc->ndis_filter); error = ndis_set_info(sc, OID_GEN_CURRENT_PACKET_FILTER, &sc->ndis_filter, &i); } else ndis_init(sc); } else { if (sc->ndis_running) ndis_stop(sc); } sc->ndis_if_flags = ifp->if_flags; error = 0; break; case SIOCADDMULTI: case SIOCDELMULTI: ndis_setmulti(sc); error = 0; break; case SIOCGIFMEDIA: case SIOCSIFMEDIA: error = ifmedia_ioctl(ifp, ifr, &sc->ifmedia, command); break; case SIOCSIFCAP: ifp->if_capenable = ifr->ifr_reqcap; if 
(ifp->if_capenable & IFCAP_TXCSUM) ifp->if_hwassist = sc->ndis_hwassist; else ifp->if_hwassist = 0; ndis_set_offload(sc); break; default: error = ether_ioctl(ifp, command, data); break; } /*NDIS_UNLOCK(sc);*/ return(error); } static int ndis_80211ioctl(struct ieee80211com *ic, u_long cmd, void *data) { struct ndis_softc *sc = ic->ic_softc; struct ifreq *ifr = data; struct ndis_oid_data oid; struct ndis_evt evt; void *oidbuf = NULL; int error = 0; if ((error = priv_check(curthread, PRIV_DRIVER)) != 0) return (error); switch (cmd) { case SIOCGDRVSPEC: case SIOCSDRVSPEC: error = copyin(ifr_data_get_ptr(ifr), &oid, sizeof(oid)); if (error) break; oidbuf = malloc(oid.len, M_TEMP, M_WAITOK | M_ZERO); error = copyin((caddr_t)ifr_data_get_ptr(ifr) + sizeof(oid), oidbuf, oid.len); } if (error) { free(oidbuf, M_TEMP); return (error); } switch (cmd) { case SIOCGDRVSPEC: error = ndis_get_info(sc, oid.oid, oidbuf, &oid.len); break; case SIOCSDRVSPEC: error = ndis_set_info(sc, oid.oid, oidbuf, &oid.len); break; case SIOCGPRIVATE_0: NDIS_LOCK(sc); if (sc->ndis_evt[sc->ndis_evtcidx].ne_sts == 0) { error = ENOENT; NDIS_UNLOCK(sc); break; } error = copyin(ifr_data_get_ptr(ifr), &evt, sizeof(evt)); if (error) { NDIS_UNLOCK(sc); break; } if (evt.ne_len < sc->ndis_evt[sc->ndis_evtcidx].ne_len) { error = ENOSPC; NDIS_UNLOCK(sc); break; } error = copyout(&sc->ndis_evt[sc->ndis_evtcidx], ifr_data_get_ptr(ifr), sizeof(uint32_t) * 2); if (error) { NDIS_UNLOCK(sc); break; } if (sc->ndis_evt[sc->ndis_evtcidx].ne_len) { error = copyout(sc->ndis_evt[sc->ndis_evtcidx].ne_buf, (caddr_t)ifr_data_get_ptr(ifr) + (sizeof(uint32_t) * 2), sc->ndis_evt[sc->ndis_evtcidx].ne_len); if (error) { NDIS_UNLOCK(sc); break; } free(sc->ndis_evt[sc->ndis_evtcidx].ne_buf, M_TEMP); sc->ndis_evt[sc->ndis_evtcidx].ne_buf = NULL; } sc->ndis_evt[sc->ndis_evtcidx].ne_len = 0; sc->ndis_evt[sc->ndis_evtcidx].ne_sts = 0; NDIS_EVTINC(sc->ndis_evtcidx); NDIS_UNLOCK(sc); break; default: error = ENOTTY; break; } switch (cmd) { case SIOCGDRVSPEC: case SIOCSDRVSPEC: error = copyout(&oid, ifr_data_get_ptr(ifr), sizeof(oid)); if (error) break; error = copyout(oidbuf, (caddr_t)ifr_data_get_ptr(ifr) + sizeof(oid), oid.len); } free(oidbuf, M_TEMP); return (error); } int ndis_del_key(struct ieee80211vap *vap, const struct ieee80211_key *key) { struct ndis_softc *sc = vap->iv_ic->ic_softc; ndis_80211_key rkey; int len, error = 0; bzero((char *)&rkey, sizeof(rkey)); len = sizeof(rkey); rkey.nk_len = len; rkey.nk_keyidx = key->wk_keyix; bcopy(vap->iv_ifp->if_broadcastaddr, rkey.nk_bssid, IEEE80211_ADDR_LEN); error = ndis_set_info(sc, OID_802_11_REMOVE_KEY, &rkey, &len); if (error) return (0); return (1); } /* * In theory this could be called for any key, but we'll * only use it for WPA TKIP or AES keys. These need to be * set after initial authentication with the AP. 
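*/

/*
 * Editor's note: illustrative sketch only.  The SIOCGPRIVATE_0 handler above
 * consumes entries from the same fixed event array that ndis_linksts() fills:
 * a producer index and a consumer index both advance modulo the array size,
 * and a slot whose status is still set means the ring is full.  The structure
 * below shows that scheme in isolation; the names are stand-ins.
 */
#include <stdbool.h>
#include <stdint.h>

#define EVT_SLOTS	4

struct evt {
	uint32_t	sts;	/* zero means the slot is free */
	uint32_t	len;
};

struct evtring {
	struct evt	slot[EVT_SLOTS];
	unsigned	prod, cons;
};

static bool
evt_post(struct evtring *r, uint32_t sts, uint32_t len)
{
	struct evt *e = &r->slot[r->prod];

	if (e->sts != 0)
		return (false);		/* ring full, drop the event */
	e->sts = sts;
	e->len = len;
	r->prod = (r->prod + 1) % EVT_SLOTS;
	return (true);
}

static bool
evt_fetch(struct evtring *r, uint32_t *sts, uint32_t *len)
{
	struct evt *e = &r->slot[r->cons];

	if (e->sts == 0)
		return (false);		/* nothing pending */
	*sts = e->sts;
	*len = e->len;
	e->sts = 0;
	e->len = 0;
	r->cons = (r->cons + 1) % EVT_SLOTS;
	return (true);
}

/*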
*/ static int ndis_add_key(struct ieee80211vap *vap, const struct ieee80211_key *key) { struct ndis_softc *sc = vap->iv_ic->ic_softc; ndis_80211_key rkey; int len, error = 0; switch (key->wk_cipher->ic_cipher) { case IEEE80211_CIPHER_TKIP: len = sizeof(ndis_80211_key); bzero((char *)&rkey, sizeof(rkey)); rkey.nk_len = len; rkey.nk_keylen = key->wk_keylen; if (key->wk_flags & IEEE80211_KEY_SWMIC) rkey.nk_keylen += 16; /* key index - gets weird in NDIS */ if (key->wk_keyix != IEEE80211_KEYIX_NONE) rkey.nk_keyidx = key->wk_keyix; else rkey.nk_keyidx = 0; if (key->wk_flags & IEEE80211_KEY_XMIT) rkey.nk_keyidx |= 1 << 31; if (key->wk_flags & IEEE80211_KEY_GROUP) { bcopy(ieee80211broadcastaddr, rkey.nk_bssid, IEEE80211_ADDR_LEN); } else { bcopy(vap->iv_bss->ni_bssid, rkey.nk_bssid, IEEE80211_ADDR_LEN); /* pairwise key */ rkey.nk_keyidx |= 1 << 30; } /* need to set bit 29 based on keyrsc */ rkey.nk_keyrsc = key->wk_keyrsc[0]; /* XXX need tid */ if (rkey.nk_keyrsc) rkey.nk_keyidx |= 1 << 29; if (key->wk_flags & IEEE80211_KEY_SWMIC) { bcopy(key->wk_key, rkey.nk_keydata, 16); bcopy(key->wk_key + 24, rkey.nk_keydata + 16, 8); bcopy(key->wk_key + 16, rkey.nk_keydata + 24, 8); } else bcopy(key->wk_key, rkey.nk_keydata, key->wk_keylen); error = ndis_set_info(sc, OID_802_11_ADD_KEY, &rkey, &len); break; case IEEE80211_CIPHER_WEP: error = 0; break; /* * I don't know how to set up keys for the AES * cipher yet. Is it the same as TKIP? */ case IEEE80211_CIPHER_AES_CCM: default: error = ENOTTY; break; } /* We need to return 1 for success, 0 for failure. */ if (error) return (0); return (1); } static void ndis_resettask(d, arg) device_object *d; void *arg; { struct ndis_softc *sc; sc = arg; ndis_reset_nic(sc); } /* * Stop the adapter and free any mbufs allocated to the * RX and TX lists. */ static void ndis_stop(struct ndis_softc *sc) { int i; callout_drain(&sc->ndis_stat_callout); NDIS_LOCK(sc); sc->ndis_tx_timer = 0; sc->ndis_link = 0; if (!sc->ndis_80211) sc->ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE); sc->ndis_running = 0; NDIS_UNLOCK(sc); if (sc->ndis_iftype != PNPBus || (sc->ndis_iftype == PNPBus && !(sc->ndisusb_status & NDISUSB_STATUS_DETACH) && ndisusb_halt != 0)) ndis_halt_nic(sc); NDIS_LOCK(sc); for (i = 0; i < NDIS_EVENTS; i++) { if (sc->ndis_evt[i].ne_sts && sc->ndis_evt[i].ne_buf != NULL) { free(sc->ndis_evt[i].ne_buf, M_TEMP); sc->ndis_evt[i].ne_buf = NULL; } sc->ndis_evt[i].ne_sts = 0; sc->ndis_evt[i].ne_len = 0; } sc->ndis_evtcidx = 0; sc->ndis_evtpidx = 0; NDIS_UNLOCK(sc); } /* * Stop all chip I/O so that the kernel's probe routines don't * get confused by errant DMAs when rebooting. 
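*/

/*
 * Editor's note: illustrative sketch only.  ndis_add_key() above folds key
 * attributes into the high bits of the 32-bit key index: bit 31 marks the
 * transmit key, bit 30 a pairwise (as opposed to group) key, and bit 29 says
 * a receive sequence counter accompanies the key.  The helper below just
 * packs those bits; the names and the low-byte slot mask are stand-ins.
 */
#include <stdbool.h>
#include <stdint.h>

static uint32_t
pack_keyidx(uint32_t idx, bool txkey, bool pairwise, bool have_rsc)
{
	uint32_t v = idx & 0xff;	/* low bits carry the key slot */

	if (txkey)
		v |= 1U << 31;
	if (pairwise)
		v |= 1U << 30;
	if (have_rsc)
		v |= 1U << 29;
	return (v);
}

/*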
*/ void ndis_shutdown(dev) device_t dev; { struct ndis_softc *sc; sc = device_get_softc(dev); ndis_stop(sc); } static int ndis_newstate(struct ieee80211vap *vap, enum ieee80211_state nstate, int arg) { struct ndis_vap *nvp = NDIS_VAP(vap); struct ieee80211com *ic = vap->iv_ic; struct ndis_softc *sc = ic->ic_softc; enum ieee80211_state ostate; DPRINTF(("%s: %s -> %s\n", __func__, ieee80211_state_name[vap->iv_state], ieee80211_state_name[nstate])); ostate = vap->iv_state; vap->iv_state = nstate; switch (nstate) { /* pass on to net80211 */ case IEEE80211_S_INIT: case IEEE80211_S_SCAN: return nvp->newstate(vap, nstate, arg); case IEEE80211_S_ASSOC: if (ostate != IEEE80211_S_AUTH) { IEEE80211_UNLOCK(ic); ndis_auth_and_assoc(sc, vap); IEEE80211_LOCK(ic); } break; case IEEE80211_S_AUTH: IEEE80211_UNLOCK(ic); ndis_auth_and_assoc(sc, vap); if (vap->iv_state == IEEE80211_S_AUTH) /* XXX */ ieee80211_new_state(vap, IEEE80211_S_ASSOC, 0); IEEE80211_LOCK(ic); break; default: break; } return (0); } static void ndis_scan(void *arg) { struct ieee80211vap *vap = arg; ieee80211_scan_done(vap); } static void ndis_scan_results(struct ndis_softc *sc) { struct ieee80211com *ic = &sc->ndis_ic; struct ieee80211vap *vap = TAILQ_FIRST(&ic->ic_vaps); ndis_80211_bssid_list_ex *bl; ndis_wlan_bssid_ex *wb; struct ieee80211_scanparams sp; struct ieee80211_frame wh; struct ieee80211_channel *saved_chan; int i, j; int rssi, noise, freq, chanflag; uint8_t ssid[2+IEEE80211_NWID_LEN]; uint8_t rates[2+IEEE80211_RATE_MAXSIZE]; uint8_t *frm, *efrm; saved_chan = ic->ic_curchan; noise = -96; if (ndis_get_bssid_list(sc, &bl)) return; DPRINTF(("%s: %d results\n", __func__, bl->nblx_items)); wb = &bl->nblx_bssid[0]; for (i = 0; i < bl->nblx_items; i++) { memset(&sp, 0, sizeof(sp)); memcpy(wh.i_addr2, wb->nwbx_macaddr, sizeof(wh.i_addr2)); memcpy(wh.i_addr3, wb->nwbx_macaddr, sizeof(wh.i_addr3)); rssi = 100 * (wb->nwbx_rssi - noise) / (-32 - noise); rssi = max(0, min(rssi, 100)); /* limit 0 <= rssi <= 100 */ if (wb->nwbx_privacy) sp.capinfo |= IEEE80211_CAPINFO_PRIVACY; sp.bintval = wb->nwbx_config.nc_beaconperiod; switch (wb->nwbx_netinfra) { case NDIS_80211_NET_INFRA_IBSS: sp.capinfo |= IEEE80211_CAPINFO_IBSS; break; case NDIS_80211_NET_INFRA_BSS: sp.capinfo |= IEEE80211_CAPINFO_ESS; break; } sp.rates = &rates[0]; for (j = 0; j < IEEE80211_RATE_MAXSIZE; j++) { /* XXX - check units */ if (wb->nwbx_supportedrates[j] == 0) break; rates[2 + j] = wb->nwbx_supportedrates[j] & 0x7f; } rates[1] = j; sp.ssid = (uint8_t *)&ssid[0]; memcpy(sp.ssid + 2, &wb->nwbx_ssid.ns_ssid, wb->nwbx_ssid.ns_ssidlen); sp.ssid[1] = wb->nwbx_ssid.ns_ssidlen; chanflag = ndis_nettype_chan(wb->nwbx_nettype); freq = wb->nwbx_config.nc_dsconfig / 1000; sp.chan = sp.bchan = ieee80211_mhz2ieee(freq, chanflag); /* Hack ic->ic_curchan to be in sync with the scan result */ ic->ic_curchan = ieee80211_find_channel(ic, freq, chanflag); if (ic->ic_curchan == NULL) ic->ic_curchan = &ic->ic_channels[0]; /* Process extended info from AP */ if (wb->nwbx_len > sizeof(ndis_wlan_bssid)) { frm = (uint8_t *)&wb->nwbx_ies; efrm = frm + wb->nwbx_ielen; if (efrm - frm < 12) goto done; sp.tstamp = frm; frm += 8; sp.bintval = le16toh(*(uint16_t *)frm); frm += 2; sp.capinfo = le16toh(*(uint16_t *)frm); frm += 2; sp.ies = frm; sp.ies_len = efrm - frm; } done: DPRINTF(("scan: bssid %s chan %dMHz (%d/%d) rssi %d\n", ether_sprintf(wb->nwbx_macaddr), freq, sp.bchan, chanflag, rssi)); ieee80211_add_scan(vap, ic->ic_curchan, &sp, &wh, 0, rssi, noise); wb = (ndis_wlan_bssid_ex *)((char *)wb + 
wb->nwbx_len); } free(bl, M_DEVBUF); /* Restore the channel after messing with it */ ic->ic_curchan = saved_chan; } static void ndis_scan_start(struct ieee80211com *ic) { struct ndis_softc *sc = ic->ic_softc; struct ieee80211vap *vap; struct ieee80211_scan_state *ss; ndis_80211_ssid ssid; int error, len; ss = ic->ic_scan; vap = TAILQ_FIRST(&ic->ic_vaps); if (!NDIS_INITIALIZED(sc)) { DPRINTF(("%s: scan aborted\n", __func__)); ieee80211_cancel_scan(vap); return; } len = sizeof(ssid); bzero((char *)&ssid, len); if (ss->ss_nssid == 0) ssid.ns_ssidlen = 1; else { /* Perform a directed scan */ ssid.ns_ssidlen = ss->ss_ssid[0].len; bcopy(ss->ss_ssid[0].ssid, ssid.ns_ssid, ssid.ns_ssidlen); } error = ndis_set_info(sc, OID_802_11_SSID, &ssid, &len); if (error) DPRINTF(("%s: set ESSID failed\n", __func__)); len = 0; error = ndis_set_info(sc, OID_802_11_BSSID_LIST_SCAN, NULL, &len); if (error) { DPRINTF(("%s: scan command failed\n", __func__)); ieee80211_cancel_scan(vap); return; } /* Set a timer to collect the results */ callout_reset(&sc->ndis_scan_callout, hz * 3, ndis_scan, vap); } static void ndis_set_channel(struct ieee80211com *ic) { /* ignore */ } static void ndis_scan_curchan(struct ieee80211_scan_state *ss, unsigned long maxdwell) { /* ignore */ } static void ndis_scan_mindwell(struct ieee80211_scan_state *ss) { /* NB: don't try to abort scan; wait for firmware to finish */ } static void ndis_scan_end(struct ieee80211com *ic) { struct ndis_softc *sc = ic->ic_softc; ndis_scan_results(sc); } diff --git a/sys/dev/mlx5/mlx5_en/mlx5_en_main.c b/sys/dev/mlx5/mlx5_en/mlx5_en_main.c index 98f06af5230c..45aa824eae9b 100644 --- a/sys/dev/mlx5/mlx5_en/mlx5_en_main.c +++ b/sys/dev/mlx5/mlx5_en/mlx5_en_main.c @@ -1,4709 +1,4710 @@ /*- * Copyright (c) 2015-2018 Mellanox Technologies. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #include "opt_kern_tls.h" #include "en.h" #include #include #include #include #ifndef ETH_DRIVER_VERSION #define ETH_DRIVER_VERSION "3.5.2" #endif #define DRIVER_RELDATE "September 2019" static const char mlx5e_version[] = "mlx5en: Mellanox Ethernet driver " ETH_DRIVER_VERSION " (" DRIVER_RELDATE ")\n"; static int mlx5e_get_wqe_sz(struct mlx5e_priv *priv, u32 *wqe_sz, u32 *nsegs); struct mlx5e_channel_param { struct mlx5e_rq_param rq; struct mlx5e_sq_param sq; struct mlx5e_cq_param rx_cq; struct mlx5e_cq_param tx_cq; }; struct media { u32 subtype; u64 baudrate; }; static const struct media mlx5e_mode_table[MLX5E_LINK_SPEEDS_NUMBER][MLX5E_LINK_MODES_NUMBER] = { [MLX5E_1000BASE_CX_SGMII][MLX5E_SGMII] = { .subtype = IFM_1000_CX_SGMII, .baudrate = IF_Mbps(1000ULL), }, [MLX5E_1000BASE_KX][MLX5E_KX] = { .subtype = IFM_1000_KX, .baudrate = IF_Mbps(1000ULL), }, [MLX5E_10GBASE_CX4][MLX5E_CX4] = { .subtype = IFM_10G_CX4, .baudrate = IF_Gbps(10ULL), }, [MLX5E_10GBASE_KX4][MLX5E_KX4] = { .subtype = IFM_10G_KX4, .baudrate = IF_Gbps(10ULL), }, [MLX5E_10GBASE_KR][MLX5E_KR] = { .subtype = IFM_10G_KR, .baudrate = IF_Gbps(10ULL), }, [MLX5E_20GBASE_KR2][MLX5E_KR2] = { .subtype = IFM_20G_KR2, .baudrate = IF_Gbps(20ULL), }, [MLX5E_40GBASE_CR4][MLX5E_CR4] = { .subtype = IFM_40G_CR4, .baudrate = IF_Gbps(40ULL), }, [MLX5E_40GBASE_KR4][MLX5E_KR4] = { .subtype = IFM_40G_KR4, .baudrate = IF_Gbps(40ULL), }, [MLX5E_56GBASE_R4][MLX5E_R] = { .subtype = IFM_56G_R4, .baudrate = IF_Gbps(56ULL), }, [MLX5E_10GBASE_CR][MLX5E_CR1] = { .subtype = IFM_10G_CR1, .baudrate = IF_Gbps(10ULL), }, [MLX5E_10GBASE_SR][MLX5E_SR] = { .subtype = IFM_10G_SR, .baudrate = IF_Gbps(10ULL), }, [MLX5E_10GBASE_ER_LR][MLX5E_ER] = { .subtype = IFM_10G_ER, .baudrate = IF_Gbps(10ULL), }, [MLX5E_10GBASE_ER_LR][MLX5E_LR] = { .subtype = IFM_10G_LR, .baudrate = IF_Gbps(10ULL), }, [MLX5E_40GBASE_SR4][MLX5E_SR4] = { .subtype = IFM_40G_SR4, .baudrate = IF_Gbps(40ULL), }, [MLX5E_40GBASE_LR4_ER4][MLX5E_LR4] = { .subtype = IFM_40G_LR4, .baudrate = IF_Gbps(40ULL), }, [MLX5E_40GBASE_LR4_ER4][MLX5E_ER4] = { .subtype = IFM_40G_ER4, .baudrate = IF_Gbps(40ULL), }, [MLX5E_100GBASE_CR4][MLX5E_CR4] = { .subtype = IFM_100G_CR4, .baudrate = IF_Gbps(100ULL), }, [MLX5E_100GBASE_SR4][MLX5E_SR4] = { .subtype = IFM_100G_SR4, .baudrate = IF_Gbps(100ULL), }, [MLX5E_100GBASE_KR4][MLX5E_KR4] = { .subtype = IFM_100G_KR4, .baudrate = IF_Gbps(100ULL), }, [MLX5E_100GBASE_LR4][MLX5E_LR4] = { .subtype = IFM_100G_LR4, .baudrate = IF_Gbps(100ULL), }, [MLX5E_100BASE_TX][MLX5E_TX] = { .subtype = IFM_100_TX, .baudrate = IF_Mbps(100ULL), }, [MLX5E_1000BASE_T][MLX5E_T] = { .subtype = IFM_1000_T, .baudrate = IF_Mbps(1000ULL), }, [MLX5E_10GBASE_T][MLX5E_T] = { .subtype = IFM_10G_T, .baudrate = IF_Gbps(10ULL), }, [MLX5E_25GBASE_CR][MLX5E_CR] = { .subtype = IFM_25G_CR, .baudrate = IF_Gbps(25ULL), }, [MLX5E_25GBASE_KR][MLX5E_KR] = { .subtype = IFM_25G_KR, .baudrate = IF_Gbps(25ULL), }, [MLX5E_25GBASE_SR][MLX5E_SR] = { .subtype = IFM_25G_SR, .baudrate = IF_Gbps(25ULL), }, [MLX5E_50GBASE_CR2][MLX5E_CR2] = { .subtype = IFM_50G_CR2, .baudrate = IF_Gbps(50ULL), }, [MLX5E_50GBASE_KR2][MLX5E_KR2] = { .subtype = IFM_50G_KR2, .baudrate = IF_Gbps(50ULL), }, }; static const struct media mlx5e_ext_mode_table[MLX5E_EXT_LINK_SPEEDS_NUMBER][MLX5E_LINK_MODES_NUMBER] = { [MLX5E_SGMII_100M][MLX5E_SGMII] = { .subtype = IFM_100_SGMII, .baudrate = IF_Mbps(100), }, [MLX5E_1000BASE_X_SGMII][MLX5E_KX] = { .subtype = IFM_1000_KX, .baudrate = IF_Mbps(1000), }, [MLX5E_1000BASE_X_SGMII][MLX5E_CX_SGMII] = { 
.subtype = IFM_1000_CX_SGMII, .baudrate = IF_Mbps(1000), }, [MLX5E_1000BASE_X_SGMII][MLX5E_CX] = { .subtype = IFM_1000_CX, .baudrate = IF_Mbps(1000), }, [MLX5E_1000BASE_X_SGMII][MLX5E_LX] = { .subtype = IFM_1000_LX, .baudrate = IF_Mbps(1000), }, [MLX5E_1000BASE_X_SGMII][MLX5E_SX] = { .subtype = IFM_1000_SX, .baudrate = IF_Mbps(1000), }, [MLX5E_1000BASE_X_SGMII][MLX5E_T] = { .subtype = IFM_1000_T, .baudrate = IF_Mbps(1000), }, [MLX5E_5GBASE_R][MLX5E_T] = { .subtype = IFM_5000_T, .baudrate = IF_Mbps(5000), }, [MLX5E_5GBASE_R][MLX5E_KR] = { .subtype = IFM_5000_KR, .baudrate = IF_Mbps(5000), }, [MLX5E_5GBASE_R][MLX5E_KR1] = { .subtype = IFM_5000_KR1, .baudrate = IF_Mbps(5000), }, [MLX5E_5GBASE_R][MLX5E_KR_S] = { .subtype = IFM_5000_KR_S, .baudrate = IF_Mbps(5000), }, [MLX5E_10GBASE_XFI_XAUI_1][MLX5E_ER] = { .subtype = IFM_10G_ER, .baudrate = IF_Gbps(10ULL), }, [MLX5E_10GBASE_XFI_XAUI_1][MLX5E_KR] = { .subtype = IFM_10G_KR, .baudrate = IF_Gbps(10ULL), }, [MLX5E_10GBASE_XFI_XAUI_1][MLX5E_LR] = { .subtype = IFM_10G_LR, .baudrate = IF_Gbps(10ULL), }, [MLX5E_10GBASE_XFI_XAUI_1][MLX5E_SR] = { .subtype = IFM_10G_SR, .baudrate = IF_Gbps(10ULL), }, [MLX5E_10GBASE_XFI_XAUI_1][MLX5E_T] = { .subtype = IFM_10G_T, .baudrate = IF_Gbps(10ULL), }, [MLX5E_10GBASE_XFI_XAUI_1][MLX5E_AOC] = { .subtype = IFM_10G_AOC, .baudrate = IF_Gbps(10ULL), }, [MLX5E_10GBASE_XFI_XAUI_1][MLX5E_CR1] = { .subtype = IFM_10G_CR1, .baudrate = IF_Gbps(10ULL), }, [MLX5E_40GBASE_XLAUI_4_XLPPI_4][MLX5E_CR4] = { .subtype = IFM_40G_CR4, .baudrate = IF_Gbps(40ULL), }, [MLX5E_40GBASE_XLAUI_4_XLPPI_4][MLX5E_KR4] = { .subtype = IFM_40G_KR4, .baudrate = IF_Gbps(40ULL), }, [MLX5E_40GBASE_XLAUI_4_XLPPI_4][MLX5E_LR4] = { .subtype = IFM_40G_LR4, .baudrate = IF_Gbps(40ULL), }, [MLX5E_40GBASE_XLAUI_4_XLPPI_4][MLX5E_SR4] = { .subtype = IFM_40G_SR4, .baudrate = IF_Gbps(40ULL), }, [MLX5E_40GBASE_XLAUI_4_XLPPI_4][MLX5E_ER4] = { .subtype = IFM_40G_ER4, .baudrate = IF_Gbps(40ULL), }, [MLX5E_25GAUI_1_25GBASE_CR_KR][MLX5E_CR] = { .subtype = IFM_25G_CR, .baudrate = IF_Gbps(25ULL), }, [MLX5E_25GAUI_1_25GBASE_CR_KR][MLX5E_KR] = { .subtype = IFM_25G_KR, .baudrate = IF_Gbps(25ULL), }, [MLX5E_25GAUI_1_25GBASE_CR_KR][MLX5E_SR] = { .subtype = IFM_25G_SR, .baudrate = IF_Gbps(25ULL), }, [MLX5E_25GAUI_1_25GBASE_CR_KR][MLX5E_ACC] = { .subtype = IFM_25G_ACC, .baudrate = IF_Gbps(25ULL), }, [MLX5E_25GAUI_1_25GBASE_CR_KR][MLX5E_AOC] = { .subtype = IFM_25G_AOC, .baudrate = IF_Gbps(25ULL), }, [MLX5E_25GAUI_1_25GBASE_CR_KR][MLX5E_CR1] = { .subtype = IFM_25G_CR1, .baudrate = IF_Gbps(25ULL), }, [MLX5E_25GAUI_1_25GBASE_CR_KR][MLX5E_CR_S] = { .subtype = IFM_25G_CR_S, .baudrate = IF_Gbps(25ULL), }, [MLX5E_25GAUI_1_25GBASE_CR_KR][MLX5E_KR1] = { .subtype = IFM_5000_KR1, .baudrate = IF_Gbps(25ULL), }, [MLX5E_25GAUI_1_25GBASE_CR_KR][MLX5E_KR_S] = { .subtype = IFM_25G_KR_S, .baudrate = IF_Gbps(25ULL), }, [MLX5E_25GAUI_1_25GBASE_CR_KR][MLX5E_LR] = { .subtype = IFM_25G_LR, .baudrate = IF_Gbps(25ULL), }, [MLX5E_25GAUI_1_25GBASE_CR_KR][MLX5E_T] = { .subtype = IFM_25G_T, .baudrate = IF_Gbps(25ULL), }, [MLX5E_50GAUI_2_LAUI_2_50GBASE_CR2_KR2][MLX5E_CR2] = { .subtype = IFM_50G_CR2, .baudrate = IF_Gbps(50ULL), }, [MLX5E_50GAUI_2_LAUI_2_50GBASE_CR2_KR2][MLX5E_KR2] = { .subtype = IFM_50G_KR2, .baudrate = IF_Gbps(50ULL), }, [MLX5E_50GAUI_2_LAUI_2_50GBASE_CR2_KR2][MLX5E_SR2] = { .subtype = IFM_50G_SR2, .baudrate = IF_Gbps(50ULL), }, [MLX5E_50GAUI_2_LAUI_2_50GBASE_CR2_KR2][MLX5E_LR2] = { .subtype = IFM_50G_LR2, .baudrate = IF_Gbps(50ULL), }, [MLX5E_50GAUI_1_LAUI_1_50GBASE_CR_KR][MLX5E_LR] = { 
.subtype = IFM_50G_LR, .baudrate = IF_Gbps(50ULL), }, [MLX5E_50GAUI_1_LAUI_1_50GBASE_CR_KR][MLX5E_SR] = { .subtype = IFM_50G_SR, .baudrate = IF_Gbps(50ULL), }, [MLX5E_50GAUI_1_LAUI_1_50GBASE_CR_KR][MLX5E_CP] = { .subtype = IFM_50G_CP, .baudrate = IF_Gbps(50ULL), }, [MLX5E_50GAUI_1_LAUI_1_50GBASE_CR_KR][MLX5E_FR] = { .subtype = IFM_50G_FR, .baudrate = IF_Gbps(50ULL), }, [MLX5E_50GAUI_1_LAUI_1_50GBASE_CR_KR][MLX5E_KR_PAM4] = { .subtype = IFM_50G_KR_PAM4, .baudrate = IF_Gbps(50ULL), }, [MLX5E_CAUI_4_100GBASE_CR4_KR4][MLX5E_CR4] = { .subtype = IFM_100G_CR4, .baudrate = IF_Gbps(100ULL), }, [MLX5E_CAUI_4_100GBASE_CR4_KR4][MLX5E_KR4] = { .subtype = IFM_100G_KR4, .baudrate = IF_Gbps(100ULL), }, [MLX5E_CAUI_4_100GBASE_CR4_KR4][MLX5E_LR4] = { .subtype = IFM_100G_LR4, .baudrate = IF_Gbps(100ULL), }, [MLX5E_CAUI_4_100GBASE_CR4_KR4][MLX5E_SR4] = { .subtype = IFM_100G_SR4, .baudrate = IF_Gbps(100ULL), }, [MLX5E_100GAUI_2_100GBASE_CR2_KR2][MLX5E_SR2] = { .subtype = IFM_100G_SR2, .baudrate = IF_Gbps(100ULL), }, [MLX5E_100GAUI_2_100GBASE_CR2_KR2][MLX5E_CP2] = { .subtype = IFM_100G_CP2, .baudrate = IF_Gbps(100ULL), }, [MLX5E_100GAUI_2_100GBASE_CR2_KR2][MLX5E_KR2_PAM4] = { .subtype = IFM_100G_KR2_PAM4, .baudrate = IF_Gbps(100ULL), }, [MLX5E_200GAUI_4_200GBASE_CR4_KR4][MLX5E_DR4] = { .subtype = IFM_200G_DR4, .baudrate = IF_Gbps(200ULL), }, [MLX5E_200GAUI_4_200GBASE_CR4_KR4][MLX5E_LR4] = { .subtype = IFM_200G_LR4, .baudrate = IF_Gbps(200ULL), }, [MLX5E_200GAUI_4_200GBASE_CR4_KR4][MLX5E_SR4] = { .subtype = IFM_200G_SR4, .baudrate = IF_Gbps(200ULL), }, [MLX5E_200GAUI_4_200GBASE_CR4_KR4][MLX5E_FR4] = { .subtype = IFM_200G_FR4, .baudrate = IF_Gbps(200ULL), }, [MLX5E_200GAUI_4_200GBASE_CR4_KR4][MLX5E_CR4_PAM4] = { .subtype = IFM_200G_CR4_PAM4, .baudrate = IF_Gbps(200ULL), }, [MLX5E_200GAUI_4_200GBASE_CR4_KR4][MLX5E_KR4_PAM4] = { .subtype = IFM_200G_KR4_PAM4, .baudrate = IF_Gbps(200ULL), }, }; DEBUGNET_DEFINE(mlx5_en); MALLOC_DEFINE(M_MLX5EN, "MLX5EN", "MLX5 Ethernet"); static void mlx5e_update_carrier(struct mlx5e_priv *priv) { struct mlx5_core_dev *mdev = priv->mdev; u32 out[MLX5_ST_SZ_DW(ptys_reg)]; u32 eth_proto_oper; int error; u8 port_state; u8 is_er_type; u8 i, j; bool ext; struct media media_entry = {}; port_state = mlx5_query_vport_state(mdev, MLX5_QUERY_VPORT_STATE_IN_OP_MOD_VNIC_VPORT, 0); if (port_state == VPORT_STATE_UP) { priv->media_status_last |= IFM_ACTIVE; } else { priv->media_status_last &= ~IFM_ACTIVE; priv->media_active_last = IFM_ETHER; if_link_state_change(priv->ifp, LINK_STATE_DOWN); return; } error = mlx5_query_port_ptys(mdev, out, sizeof(out), MLX5_PTYS_EN, 1); if (error) { priv->media_active_last = IFM_ETHER; priv->ifp->if_baudrate = 1; mlx5_en_err(priv->ifp, "query port ptys failed: 0x%x\n", error); return; } ext = MLX5_CAP_PCAM_FEATURE(mdev, ptys_extended_ethernet); eth_proto_oper = MLX5_GET_ETH_PROTO(ptys_reg, out, ext, eth_proto_oper); i = ilog2(eth_proto_oper); for (j = 0; j != MLX5E_LINK_MODES_NUMBER; j++) { media_entry = ext ? 
mlx5e_ext_mode_table[i][j] : mlx5e_mode_table[i][j]; if (media_entry.baudrate != 0) break; } if (media_entry.subtype == 0) { mlx5_en_err(priv->ifp, "Could not find operational media subtype\n"); return; } switch (media_entry.subtype) { case IFM_10G_ER: error = mlx5_query_pddr_range_info(mdev, 1, &is_er_type); if (error != 0) { mlx5_en_err(priv->ifp, "query port pddr failed: %d\n", error); } if (error != 0 || is_er_type == 0) media_entry.subtype = IFM_10G_LR; break; case IFM_40G_LR4: error = mlx5_query_pddr_range_info(mdev, 1, &is_er_type); if (error != 0) { mlx5_en_err(priv->ifp, "query port pddr failed: %d\n", error); } if (error == 0 && is_er_type != 0) media_entry.subtype = IFM_40G_ER4; break; } priv->media_active_last = media_entry.subtype | IFM_ETHER | IFM_FDX; priv->ifp->if_baudrate = media_entry.baudrate; if_link_state_change(priv->ifp, LINK_STATE_UP); } static void mlx5e_media_status(struct ifnet *dev, struct ifmediareq *ifmr) { struct mlx5e_priv *priv = dev->if_softc; ifmr->ifm_status = priv->media_status_last; ifmr->ifm_active = priv->media_active_last | (priv->params.rx_pauseframe_control ? IFM_ETH_RXPAUSE : 0) | (priv->params.tx_pauseframe_control ? IFM_ETH_TXPAUSE : 0); } static u32 mlx5e_find_link_mode(u32 subtype, bool ext) { u32 i; u32 j; u32 link_mode = 0; u32 speeds_num = 0; struct media media_entry = {}; switch (subtype) { case IFM_10G_LR: subtype = IFM_10G_ER; break; case IFM_40G_ER4: subtype = IFM_40G_LR4; break; } speeds_num = ext ? MLX5E_EXT_LINK_SPEEDS_NUMBER : MLX5E_LINK_SPEEDS_NUMBER; for (i = 0; i != speeds_num; i++) { for (j = 0; j < MLX5E_LINK_MODES_NUMBER ; ++j) { media_entry = ext ? mlx5e_ext_mode_table[i][j] : mlx5e_mode_table[i][j]; if (media_entry.baudrate == 0) continue; if (media_entry.subtype == subtype) { link_mode |= MLX5E_PROT_MASK(i); } } } return (link_mode); } static int mlx5e_set_port_pause_and_pfc(struct mlx5e_priv *priv) { return (mlx5_set_port_pause_and_pfc(priv->mdev, 1, priv->params.rx_pauseframe_control, priv->params.tx_pauseframe_control, priv->params.rx_priority_flow_control, priv->params.tx_priority_flow_control)); } static int mlx5e_set_port_pfc(struct mlx5e_priv *priv) { int error; if (priv->gone != 0) { error = -ENXIO; } else if (priv->params.rx_pauseframe_control || priv->params.tx_pauseframe_control) { mlx5_en_err(priv->ifp, "Global pauseframes must be disabled before enabling PFC.\n"); error = -EINVAL; } else { error = mlx5e_set_port_pause_and_pfc(priv); } return (error); } static int mlx5e_media_change(struct ifnet *dev) { struct mlx5e_priv *priv = dev->if_softc; struct mlx5_core_dev *mdev = priv->mdev; u32 eth_proto_cap; u32 link_mode; u32 out[MLX5_ST_SZ_DW(ptys_reg)]; int was_opened; int locked; int error; bool ext; locked = PRIV_LOCKED(priv); if (!locked) PRIV_LOCK(priv); if (IFM_TYPE(priv->media.ifm_media) != IFM_ETHER) { error = EINVAL; goto done; } error = mlx5_query_port_ptys(mdev, out, sizeof(out), MLX5_PTYS_EN, 1); if (error != 0) { mlx5_en_err(dev, "Query port media capability failed\n"); goto done; } ext = MLX5_CAP_PCAM_FEATURE(mdev, ptys_extended_ethernet); link_mode = mlx5e_find_link_mode(IFM_SUBTYPE(priv->media.ifm_media), ext); /* query supported capabilities */ eth_proto_cap = MLX5_GET_ETH_PROTO(ptys_reg, out, ext, eth_proto_capability); /* check for autoselect */ if (IFM_SUBTYPE(priv->media.ifm_media) == IFM_AUTO) { link_mode = eth_proto_cap; if (link_mode == 0) { mlx5_en_err(dev, "Port media capability is zero\n"); error = EINVAL; goto done; } } else { link_mode = link_mode & eth_proto_cap; if (link_mode == 0) { 
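/* the requested fixed media is not advertised by the port */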
mlx5_en_err(dev, "Not supported link mode requested\n"); error = EINVAL; goto done; } } if (priv->media.ifm_media & (IFM_ETH_RXPAUSE | IFM_ETH_TXPAUSE)) { /* check if PFC is enabled */ if (priv->params.rx_priority_flow_control || priv->params.tx_priority_flow_control) { mlx5_en_err(dev, "PFC must be disabled before enabling global pauseframes.\n"); error = EINVAL; goto done; } } /* update pauseframe control bits */ priv->params.rx_pauseframe_control = (priv->media.ifm_media & IFM_ETH_RXPAUSE) ? 1 : 0; priv->params.tx_pauseframe_control = (priv->media.ifm_media & IFM_ETH_TXPAUSE) ? 1 : 0; /* check if device is opened */ was_opened = test_bit(MLX5E_STATE_OPENED, &priv->state); /* reconfigure the hardware */ mlx5_set_port_status(mdev, MLX5_PORT_DOWN); mlx5_set_port_proto(mdev, link_mode, MLX5_PTYS_EN, ext); error = -mlx5e_set_port_pause_and_pfc(priv); if (was_opened) mlx5_set_port_status(mdev, MLX5_PORT_UP); done: if (!locked) PRIV_UNLOCK(priv); return (error); } static void mlx5e_update_carrier_work(struct work_struct *work) { struct mlx5e_priv *priv = container_of(work, struct mlx5e_priv, update_carrier_work); PRIV_LOCK(priv); if (test_bit(MLX5E_STATE_OPENED, &priv->state)) mlx5e_update_carrier(priv); PRIV_UNLOCK(priv); } #define MLX5E_PCIE_PERF_GET_64(a,b,c,d,e,f) \ s_debug->c = MLX5_GET64(mpcnt_reg, out, counter_set.f.c); #define MLX5E_PCIE_PERF_GET_32(a,b,c,d,e,f) \ s_debug->c = MLX5_GET(mpcnt_reg, out, counter_set.f.c); static void mlx5e_update_pcie_counters(struct mlx5e_priv *priv) { struct mlx5_core_dev *mdev = priv->mdev; struct mlx5e_port_stats_debug *s_debug = &priv->stats.port_stats_debug; const unsigned sz = MLX5_ST_SZ_BYTES(mpcnt_reg); void *out; void *in; int err; /* allocate firmware request structures */ in = mlx5_vzalloc(sz); out = mlx5_vzalloc(sz); if (in == NULL || out == NULL) goto free_out; MLX5_SET(mpcnt_reg, in, grp, MLX5_PCIE_PERFORMANCE_COUNTERS_GROUP); err = mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_MPCNT, 0, 0); if (err != 0) goto free_out; MLX5E_PCIE_PERFORMANCE_COUNTERS_64(MLX5E_PCIE_PERF_GET_64) MLX5E_PCIE_PERFORMANCE_COUNTERS_32(MLX5E_PCIE_PERF_GET_32) MLX5_SET(mpcnt_reg, in, grp, MLX5_PCIE_TIMERS_AND_STATES_COUNTERS_GROUP); err = mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_MPCNT, 0, 0); if (err != 0) goto free_out; MLX5E_PCIE_TIMERS_AND_STATES_COUNTERS_32(MLX5E_PCIE_PERF_GET_32) MLX5_SET(mpcnt_reg, in, grp, MLX5_PCIE_LANE_COUNTERS_GROUP); err = mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_MPCNT, 0, 0); if (err != 0) goto free_out; MLX5E_PCIE_LANE_COUNTERS_32(MLX5E_PCIE_PERF_GET_32) free_out: /* free firmware request structures */ kvfree(in); kvfree(out); } /* * This function reads the physical port counters from the firmware * using a pre-defined layout defined by various MLX5E_PPORT_XXX() * macros. The output is converted from big-endian 64-bit values into * host endian ones and stored in the "priv->stats.pport" structure. 
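 * The same request/response buffers are reused for every counter group;
 * only the "grp" field (and "prio_tc" for the per-priority group) in the
 * request is changed between the mlx5_core_access_reg() calls.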
*/ static void mlx5e_update_pport_counters(struct mlx5e_priv *priv) { struct mlx5_core_dev *mdev = priv->mdev; struct mlx5e_pport_stats *s = &priv->stats.pport; struct mlx5e_port_stats_debug *s_debug = &priv->stats.port_stats_debug; u32 *in; u32 *out; const u64 *ptr; unsigned sz = MLX5_ST_SZ_BYTES(ppcnt_reg); unsigned x; unsigned y; unsigned z; /* allocate firmware request structures */ in = mlx5_vzalloc(sz); out = mlx5_vzalloc(sz); if (in == NULL || out == NULL) goto free_out; /* * Get pointer to the 64-bit counter set which is located at a * fixed offset in the output firmware request structure: */ ptr = (const uint64_t *)MLX5_ADDR_OF(ppcnt_reg, out, counter_set); MLX5_SET(ppcnt_reg, in, local_port, 1); /* read IEEE802_3 counter group using predefined counter layout */ MLX5_SET(ppcnt_reg, in, grp, MLX5_IEEE_802_3_COUNTERS_GROUP); mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0); for (x = 0, y = MLX5E_PPORT_PER_PRIO_STATS_NUM; x != MLX5E_PPORT_IEEE802_3_STATS_NUM; x++, y++) s->arg[y] = be64toh(ptr[x]); /* read RFC2819 counter group using predefined counter layout */ MLX5_SET(ppcnt_reg, in, grp, MLX5_RFC_2819_COUNTERS_GROUP); mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0); for (x = 0; x != MLX5E_PPORT_RFC2819_STATS_NUM; x++, y++) s->arg[y] = be64toh(ptr[x]); for (y = 0; x != MLX5E_PPORT_RFC2819_STATS_NUM + MLX5E_PPORT_RFC2819_STATS_DEBUG_NUM; x++, y++) s_debug->arg[y] = be64toh(ptr[x]); /* read RFC2863 counter group using predefined counter layout */ MLX5_SET(ppcnt_reg, in, grp, MLX5_RFC_2863_COUNTERS_GROUP); mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0); for (x = 0; x != MLX5E_PPORT_RFC2863_STATS_DEBUG_NUM; x++, y++) s_debug->arg[y] = be64toh(ptr[x]); /* read physical layer stats counter group using predefined counter layout */ MLX5_SET(ppcnt_reg, in, grp, MLX5_PHYSICAL_LAYER_COUNTERS_GROUP); mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0); for (x = 0; x != MLX5E_PPORT_PHYSICAL_LAYER_STATS_DEBUG_NUM; x++, y++) s_debug->arg[y] = be64toh(ptr[x]); /* read Extended Ethernet counter group using predefined counter layout */ MLX5_SET(ppcnt_reg, in, grp, MLX5_ETHERNET_EXTENDED_COUNTERS_GROUP); mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0); for (x = 0; x != MLX5E_PPORT_ETHERNET_EXTENDED_STATS_DEBUG_NUM; x++, y++) s_debug->arg[y] = be64toh(ptr[x]); /* read Extended Statistical Group */ if (MLX5_CAP_GEN(mdev, pcam_reg) && MLX5_CAP_PCAM_FEATURE(mdev, ppcnt_statistical_group) && MLX5_CAP_PCAM_FEATURE(mdev, per_lane_error_counters)) { /* read Extended Statistical counter group using predefined counter layout */ MLX5_SET(ppcnt_reg, in, grp, MLX5_PHYSICAL_LAYER_STATISTICAL_GROUP); mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0); for (x = 0; x != MLX5E_PPORT_STATISTICAL_DEBUG_NUM; x++, y++) s_debug->arg[y] = be64toh(ptr[x]); } /* read PCIE counters */ mlx5e_update_pcie_counters(priv); /* read per-priority counters */ MLX5_SET(ppcnt_reg, in, grp, MLX5_PER_PRIORITY_COUNTERS_GROUP); /* iterate all the priorities */ for (y = z = 0; z != MLX5E_PPORT_PER_PRIO_STATS_NUM_PRIO; z++) { MLX5_SET(ppcnt_reg, in, prio_tc, z); mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0); /* read per priority stats counter group using predefined counter layout */ for (x = 0; x != (MLX5E_PPORT_PER_PRIO_STATS_NUM / MLX5E_PPORT_PER_PRIO_STATS_NUM_PRIO); x++, y++) s->arg[y] = be64toh(ptr[x]); } free_out: /* free firmware request structures */ kvfree(in); kvfree(out); } static void 
mlx5e_grp_vnic_env_update_stats(struct mlx5e_priv *priv) { u32 out[MLX5_ST_SZ_DW(query_vnic_env_out)] = {}; u32 in[MLX5_ST_SZ_DW(query_vnic_env_in)] = {}; if (!MLX5_CAP_GEN(priv->mdev, nic_receive_steering_discard)) return; MLX5_SET(query_vnic_env_in, in, opcode, MLX5_CMD_OP_QUERY_VNIC_ENV); MLX5_SET(query_vnic_env_in, in, op_mod, 0); MLX5_SET(query_vnic_env_in, in, other_vport, 0); if (mlx5_cmd_exec(priv->mdev, in, sizeof(in), out, sizeof(out)) != 0) return; priv->stats.vport.rx_steer_missed_packets = MLX5_GET64(query_vnic_env_out, out, vport_env.nic_receive_steering_discard); } /* * This function is called regularly to collect all statistics * counters from the firmware. The values can be viewed through the * sysctl interface. Execution is serialized using the priv's global * configuration lock. */ static void mlx5e_update_stats_locked(struct mlx5e_priv *priv) { struct mlx5_core_dev *mdev = priv->mdev; struct mlx5e_vport_stats *s = &priv->stats.vport; struct mlx5e_sq_stats *sq_stats; struct buf_ring *sq_br; #if (__FreeBSD_version < 1100000) struct ifnet *ifp = priv->ifp; #endif u32 in[MLX5_ST_SZ_DW(query_vport_counter_in)]; u32 *out; int outlen = MLX5_ST_SZ_BYTES(query_vport_counter_out); u64 tso_packets = 0; u64 tso_bytes = 0; u64 tx_queue_dropped = 0; u64 tx_defragged = 0; u64 tx_offload_none = 0; u64 lro_packets = 0; u64 lro_bytes = 0; u64 sw_lro_queued = 0; u64 sw_lro_flushed = 0; u64 rx_csum_none = 0; u64 rx_wqe_err = 0; u64 rx_packets = 0; u64 rx_bytes = 0; u32 rx_out_of_buffer = 0; int error; int i; int j; out = mlx5_vzalloc(outlen); if (out == NULL) goto free_out; /* Collect firts the SW counters and then HW for consistency */ for (i = 0; i < priv->params.num_channels; i++) { struct mlx5e_channel *pch = priv->channel + i; struct mlx5e_rq *rq = &pch->rq; struct mlx5e_rq_stats *rq_stats = &pch->rq.stats; /* collect stats from LRO */ rq_stats->sw_lro_queued = rq->lro.lro_queued; rq_stats->sw_lro_flushed = rq->lro.lro_flushed; sw_lro_queued += rq_stats->sw_lro_queued; sw_lro_flushed += rq_stats->sw_lro_flushed; lro_packets += rq_stats->lro_packets; lro_bytes += rq_stats->lro_bytes; rx_csum_none += rq_stats->csum_none; rx_wqe_err += rq_stats->wqe_err; rx_packets += rq_stats->packets; rx_bytes += rq_stats->bytes; for (j = 0; j < priv->num_tc; j++) { sq_stats = &pch->sq[j].stats; sq_br = pch->sq[j].br; tso_packets += sq_stats->tso_packets; tso_bytes += sq_stats->tso_bytes; tx_queue_dropped += sq_stats->dropped; if (sq_br != NULL) tx_queue_dropped += sq_br->br_drops; tx_defragged += sq_stats->defragged; tx_offload_none += sq_stats->csum_offload_none; } } /* update counters */ s->tso_packets = tso_packets; s->tso_bytes = tso_bytes; s->tx_queue_dropped = tx_queue_dropped; s->tx_defragged = tx_defragged; s->lro_packets = lro_packets; s->lro_bytes = lro_bytes; s->sw_lro_queued = sw_lro_queued; s->sw_lro_flushed = sw_lro_flushed; s->rx_csum_none = rx_csum_none; s->rx_wqe_err = rx_wqe_err; s->rx_packets = rx_packets; s->rx_bytes = rx_bytes; mlx5e_grp_vnic_env_update_stats(priv); /* HW counters */ memset(in, 0, sizeof(in)); MLX5_SET(query_vport_counter_in, in, opcode, MLX5_CMD_OP_QUERY_VPORT_COUNTER); MLX5_SET(query_vport_counter_in, in, op_mod, 0); MLX5_SET(query_vport_counter_in, in, other_vport, 0); memset(out, 0, outlen); /* get number of out-of-buffer drops first */ if (test_bit(MLX5E_STATE_OPENED, &priv->state) != 0 && mlx5_vport_query_out_of_rx_buffer(mdev, priv->counter_set_id, &rx_out_of_buffer) == 0) { s->rx_out_of_buffer = rx_out_of_buffer; } /* get port statistics */ if 
(mlx5_cmd_exec(mdev, in, sizeof(in), out, outlen) == 0) { #define MLX5_GET_CTR(out, x) \ MLX5_GET64(query_vport_counter_out, out, x) s->rx_error_packets = MLX5_GET_CTR(out, received_errors.packets); s->rx_error_bytes = MLX5_GET_CTR(out, received_errors.octets); s->tx_error_packets = MLX5_GET_CTR(out, transmit_errors.packets); s->tx_error_bytes = MLX5_GET_CTR(out, transmit_errors.octets); s->rx_unicast_packets = MLX5_GET_CTR(out, received_eth_unicast.packets); s->rx_unicast_bytes = MLX5_GET_CTR(out, received_eth_unicast.octets); s->tx_unicast_packets = MLX5_GET_CTR(out, transmitted_eth_unicast.packets); s->tx_unicast_bytes = MLX5_GET_CTR(out, transmitted_eth_unicast.octets); s->rx_multicast_packets = MLX5_GET_CTR(out, received_eth_multicast.packets); s->rx_multicast_bytes = MLX5_GET_CTR(out, received_eth_multicast.octets); s->tx_multicast_packets = MLX5_GET_CTR(out, transmitted_eth_multicast.packets); s->tx_multicast_bytes = MLX5_GET_CTR(out, transmitted_eth_multicast.octets); s->rx_broadcast_packets = MLX5_GET_CTR(out, received_eth_broadcast.packets); s->rx_broadcast_bytes = MLX5_GET_CTR(out, received_eth_broadcast.octets); s->tx_broadcast_packets = MLX5_GET_CTR(out, transmitted_eth_broadcast.packets); s->tx_broadcast_bytes = MLX5_GET_CTR(out, transmitted_eth_broadcast.octets); s->tx_packets = s->tx_unicast_packets + s->tx_multicast_packets + s->tx_broadcast_packets; s->tx_bytes = s->tx_unicast_bytes + s->tx_multicast_bytes + s->tx_broadcast_bytes; /* Update calculated offload counters */ s->tx_csum_offload = s->tx_packets - tx_offload_none; s->rx_csum_good = s->rx_packets - s->rx_csum_none; } /* Get physical port counters */ mlx5e_update_pport_counters(priv); s->tx_jumbo_packets = priv->stats.port_stats_debug.tx_stat_p1519to2047octets + priv->stats.port_stats_debug.tx_stat_p2048to4095octets + priv->stats.port_stats_debug.tx_stat_p4096to8191octets + priv->stats.port_stats_debug.tx_stat_p8192to10239octets; #if (__FreeBSD_version < 1100000) /* no get_counters interface in fbsd 10 */ ifp->if_ipackets = s->rx_packets; ifp->if_ierrors = priv->stats.pport.in_range_len_errors + priv->stats.pport.out_of_range_len + priv->stats.pport.too_long_errors + priv->stats.pport.check_seq_err + priv->stats.pport.alignment_err; ifp->if_iqdrops = s->rx_out_of_buffer; ifp->if_opackets = s->tx_packets; ifp->if_oerrors = priv->stats.port_stats_debug.out_discards; ifp->if_snd.ifq_drops = s->tx_queue_dropped; ifp->if_ibytes = s->rx_bytes; ifp->if_obytes = s->tx_bytes; ifp->if_collisions = priv->stats.pport.collisions; #endif free_out: kvfree(out); /* Update diagnostics, if any */ if (priv->params_ethtool.diag_pci_enable || priv->params_ethtool.diag_general_enable) { error = mlx5_core_get_diagnostics_full(mdev, priv->params_ethtool.diag_pci_enable ? &priv->params_pci : NULL, priv->params_ethtool.diag_general_enable ? 
&priv->params_general : NULL);
		if (error != 0)
			mlx5_en_err(priv->ifp,
			    "Failed reading diagnostics: %d\n", error);
	}

	/* Update FEC, if any */
	error = mlx5e_fec_update(priv);
	if (error != 0 && error != EOPNOTSUPP) {
		mlx5_en_err(priv->ifp,
		    "Updating FEC failed: %d\n", error);
	}
}

static void
mlx5e_update_stats_work(struct work_struct *work)
{
	struct mlx5e_priv *priv;

	priv = container_of(work, struct mlx5e_priv, update_stats_work);
	PRIV_LOCK(priv);
	if (test_bit(MLX5E_STATE_OPENED, &priv->state) != 0 &&
	    !test_bit(MLX5_INTERFACE_STATE_TEARDOWN, &priv->mdev->intf_state))
		mlx5e_update_stats_locked(priv);
	PRIV_UNLOCK(priv);
}

static void
mlx5e_update_stats(void *arg)
{
	struct mlx5e_priv *priv = arg;

	queue_work(priv->wq, &priv->update_stats_work);

	callout_reset(&priv->watchdog, hz, &mlx5e_update_stats, priv);
}

static void
mlx5e_async_event_sub(struct mlx5e_priv *priv,
    enum mlx5_dev_event event)
{
	switch (event) {
	case MLX5_DEV_EVENT_PORT_UP:
	case MLX5_DEV_EVENT_PORT_DOWN:
		queue_work(priv->wq, &priv->update_carrier_work);
		break;

	default:
		break;
	}
}

static void
mlx5e_async_event(struct mlx5_core_dev *mdev, void *vpriv,
    enum mlx5_dev_event event, unsigned long param)
{
	struct mlx5e_priv *priv = vpriv;

	mtx_lock(&priv->async_events_mtx);
	if (test_bit(MLX5E_STATE_ASYNC_EVENTS_ENABLE, &priv->state))
		mlx5e_async_event_sub(priv, event);
	mtx_unlock(&priv->async_events_mtx);
}

static void
mlx5e_enable_async_events(struct mlx5e_priv *priv)
{
	set_bit(MLX5E_STATE_ASYNC_EVENTS_ENABLE, &priv->state);
}

static void
mlx5e_disable_async_events(struct mlx5e_priv *priv)
{
	mtx_lock(&priv->async_events_mtx);
	clear_bit(MLX5E_STATE_ASYNC_EVENTS_ENABLE, &priv->state);
	mtx_unlock(&priv->async_events_mtx);
}

static void mlx5e_calibration_callout(void *arg);
static int mlx5e_calibration_duration = 20;
static int mlx5e_fast_calibration = 1;
static int mlx5e_normal_calibration = 30;

static SYSCTL_NODE(_hw_mlx5, OID_AUTO, calibr, CTLFLAG_RW, 0,
    "MLX5 timestamp calibration parameters");

SYSCTL_INT(_hw_mlx5_calibr, OID_AUTO, duration, CTLFLAG_RWTUN,
    &mlx5e_calibration_duration, 0,
    "Duration of initial calibration");
SYSCTL_INT(_hw_mlx5_calibr, OID_AUTO, fast, CTLFLAG_RWTUN,
    &mlx5e_fast_calibration, 0,
    "Recalibration interval during initial calibration");
SYSCTL_INT(_hw_mlx5_calibr, OID_AUTO, normal, CTLFLAG_RWTUN,
    &mlx5e_normal_calibration, 0,
    "Recalibration interval during normal operations");

/*
 * Ignites the calibration process.
 */
static void
mlx5e_reset_calibration_callout(struct mlx5e_priv *priv)
{
	if (priv->clbr_done == 0)
		mlx5e_calibration_callout(priv);
	else
		callout_reset_curcpu(&priv->tstmp_clbr, (priv->clbr_done <
		    mlx5e_calibration_duration ? mlx5e_fast_calibration :
		    mlx5e_normal_calibration) * hz, mlx5e_calibration_callout,
		    priv);
}

static uint64_t
mlx5e_timespec2usec(const struct timespec *ts)
{
	return ((uint64_t)ts->tv_sec * 1000000000 + ts->tv_nsec);
}

static uint64_t
mlx5e_hw_clock(struct mlx5e_priv *priv)
{
	struct mlx5_init_seg *iseg;
	uint32_t hw_h, hw_h1, hw_l;

	iseg = priv->mdev->iseg;
	do {
		hw_h = ioread32be(&iseg->internal_timer_h);
		hw_l = ioread32be(&iseg->internal_timer_l);
		hw_h1 = ioread32be(&iseg->internal_timer_h);
	} while (hw_h1 != hw_h);
	return (((uint64_t)hw_h << 32) | hw_l);
}

/*
 * The calibration callout runs either in the context of the thread
 * which enables calibration, or from the callout itself.  It takes a
 * snapshot of the system and adapter clocks, then advances the current
 * calibration point so that the rx path can read consistent data
 * locklessly.
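 * Each calibration point carries a generation counter (clbr_gen): it is
 * cleared while a point is being rewritten and set to a fresh non-zero
 * value once the new data has been published, so the rx path is expected
 * to re-check the generation and retry instead of taking a lock.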
*/ static void mlx5e_calibration_callout(void *arg) { struct mlx5e_priv *priv; struct mlx5e_clbr_point *next, *curr; struct timespec ts; int clbr_curr_next; priv = arg; curr = &priv->clbr_points[priv->clbr_curr]; clbr_curr_next = priv->clbr_curr + 1; if (clbr_curr_next >= nitems(priv->clbr_points)) clbr_curr_next = 0; next = &priv->clbr_points[clbr_curr_next]; next->base_prev = curr->base_curr; next->clbr_hw_prev = curr->clbr_hw_curr; next->clbr_hw_curr = mlx5e_hw_clock(priv); if (((next->clbr_hw_curr - curr->clbr_hw_curr) >> MLX5E_TSTMP_PREC) == 0) { if (priv->clbr_done != 0) { mlx5_en_err(priv->ifp, "HW failed tstmp frozen %#jx %#jx, disabling\n", next->clbr_hw_curr, curr->clbr_hw_prev); priv->clbr_done = 0; } atomic_store_rel_int(&curr->clbr_gen, 0); return; } nanouptime(&ts); next->base_curr = mlx5e_timespec2usec(&ts); curr->clbr_gen = 0; atomic_thread_fence_rel(); priv->clbr_curr = clbr_curr_next; atomic_store_rel_int(&next->clbr_gen, ++(priv->clbr_gen)); if (priv->clbr_done < mlx5e_calibration_duration) priv->clbr_done++; mlx5e_reset_calibration_callout(priv); } static const char *mlx5e_rq_stats_desc[] = { MLX5E_RQ_STATS(MLX5E_STATS_DESC) }; static int mlx5e_create_rq(struct mlx5e_channel *c, struct mlx5e_rq_param *param, struct mlx5e_rq *rq) { struct mlx5e_priv *priv = c->priv; struct mlx5_core_dev *mdev = priv->mdev; char buffer[16]; void *rqc = param->rqc; void *rqc_wq = MLX5_ADDR_OF(rqc, rqc, wq); int wq_sz; int err; int i; u32 nsegs, wqe_sz; err = mlx5e_get_wqe_sz(priv, &wqe_sz, &nsegs); if (err != 0) goto done; /* Create DMA descriptor TAG */ if ((err = -bus_dma_tag_create( bus_get_dma_tag(mdev->pdev->dev.bsddev), 1, /* any alignment */ 0, /* no boundary */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ nsegs * MLX5E_MAX_RX_BYTES, /* maxsize */ nsegs, /* nsegments */ nsegs * MLX5E_MAX_RX_BYTES, /* maxsegsize */ 0, /* flags */ NULL, NULL, /* lockfunc, lockfuncarg */ &rq->dma_tag))) goto done; err = mlx5_wq_ll_create(mdev, ¶m->wq, rqc_wq, &rq->wq, &rq->wq_ctrl); if (err) goto err_free_dma_tag; rq->wq.db = &rq->wq.db[MLX5_RCV_DBR]; err = mlx5e_get_wqe_sz(priv, &rq->wqe_sz, &rq->nsegs); if (err != 0) goto err_rq_wq_destroy; wq_sz = mlx5_wq_ll_get_size(&rq->wq); err = -tcp_lro_init_args(&rq->lro, priv->ifp, TCP_LRO_ENTRIES, wq_sz); if (err) goto err_rq_wq_destroy; rq->mbuf = malloc(wq_sz * sizeof(rq->mbuf[0]), M_MLX5EN, M_WAITOK | M_ZERO); for (i = 0; i != wq_sz; i++) { struct mlx5e_rx_wqe *wqe = mlx5_wq_ll_get_wqe(&rq->wq, i); int j; err = -bus_dmamap_create(rq->dma_tag, 0, &rq->mbuf[i].dma_map); if (err != 0) { while (i--) bus_dmamap_destroy(rq->dma_tag, rq->mbuf[i].dma_map); goto err_rq_mbuf_free; } /* set value for constant fields */ for (j = 0; j < rq->nsegs; j++) wqe->data[j].lkey = cpu_to_be32(priv->mr.key); } INIT_WORK(&rq->dim.work, mlx5e_dim_work); if (priv->params.rx_cq_moderation_mode < 2) { rq->dim.mode = NET_DIM_CQ_PERIOD_MODE_DISABLED; } else { void *cqc = container_of(param, struct mlx5e_channel_param, rq)->rx_cq.cqc; switch (MLX5_GET(cqc, cqc, cq_period_mode)) { case MLX5_CQ_PERIOD_MODE_START_FROM_EQE: rq->dim.mode = NET_DIM_CQ_PERIOD_MODE_START_FROM_EQE; break; case MLX5_CQ_PERIOD_MODE_START_FROM_CQE: rq->dim.mode = NET_DIM_CQ_PERIOD_MODE_START_FROM_CQE; break; default: rq->dim.mode = NET_DIM_CQ_PERIOD_MODE_DISABLED; break; } } rq->ifp = priv->ifp; rq->channel = c; rq->ix = c->ix; snprintf(buffer, sizeof(buffer), "rxstat%d", c->ix); mlx5e_create_stats(&rq->stats.ctx, SYSCTL_CHILDREN(priv->sysctl_ifnet), 
buffer, mlx5e_rq_stats_desc, MLX5E_RQ_STATS_NUM, rq->stats.arg); return (0); err_rq_mbuf_free: free(rq->mbuf, M_MLX5EN); tcp_lro_free(&rq->lro); err_rq_wq_destroy: mlx5_wq_destroy(&rq->wq_ctrl); err_free_dma_tag: bus_dma_tag_destroy(rq->dma_tag); done: return (err); } static void mlx5e_destroy_rq(struct mlx5e_rq *rq) { int wq_sz; int i; /* destroy all sysctl nodes */ sysctl_ctx_free(&rq->stats.ctx); /* free leftover LRO packets, if any */ tcp_lro_free(&rq->lro); wq_sz = mlx5_wq_ll_get_size(&rq->wq); for (i = 0; i != wq_sz; i++) { if (rq->mbuf[i].mbuf != NULL) { bus_dmamap_unload(rq->dma_tag, rq->mbuf[i].dma_map); m_freem(rq->mbuf[i].mbuf); } bus_dmamap_destroy(rq->dma_tag, rq->mbuf[i].dma_map); } free(rq->mbuf, M_MLX5EN); mlx5_wq_destroy(&rq->wq_ctrl); bus_dma_tag_destroy(rq->dma_tag); } static int mlx5e_enable_rq(struct mlx5e_rq *rq, struct mlx5e_rq_param *param) { struct mlx5e_channel *c = rq->channel; struct mlx5e_priv *priv = c->priv; struct mlx5_core_dev *mdev = priv->mdev; void *in; void *rqc; void *wq; int inlen; int err; inlen = MLX5_ST_SZ_BYTES(create_rq_in) + sizeof(u64) * rq->wq_ctrl.buf.npages; in = mlx5_vzalloc(inlen); if (in == NULL) return (-ENOMEM); rqc = MLX5_ADDR_OF(create_rq_in, in, ctx); wq = MLX5_ADDR_OF(rqc, rqc, wq); memcpy(rqc, param->rqc, sizeof(param->rqc)); MLX5_SET(rqc, rqc, cqn, c->rq.cq.mcq.cqn); MLX5_SET(rqc, rqc, state, MLX5_RQC_STATE_RST); MLX5_SET(rqc, rqc, flush_in_error_en, 1); if (priv->counter_set_id >= 0) MLX5_SET(rqc, rqc, counter_set_id, priv->counter_set_id); MLX5_SET(wq, wq, log_wq_pg_sz, rq->wq_ctrl.buf.page_shift - PAGE_SHIFT); MLX5_SET64(wq, wq, dbr_addr, rq->wq_ctrl.db.dma); mlx5_fill_page_array(&rq->wq_ctrl.buf, (__be64 *) MLX5_ADDR_OF(wq, wq, pas)); err = mlx5_core_create_rq(mdev, in, inlen, &rq->rqn); kvfree(in); return (err); } static int mlx5e_modify_rq(struct mlx5e_rq *rq, int curr_state, int next_state) { struct mlx5e_channel *c = rq->channel; struct mlx5e_priv *priv = c->priv; struct mlx5_core_dev *mdev = priv->mdev; void *in; void *rqc; int inlen; int err; inlen = MLX5_ST_SZ_BYTES(modify_rq_in); in = mlx5_vzalloc(inlen); if (in == NULL) return (-ENOMEM); rqc = MLX5_ADDR_OF(modify_rq_in, in, ctx); MLX5_SET(modify_rq_in, in, rqn, rq->rqn); MLX5_SET(modify_rq_in, in, rq_state, curr_state); MLX5_SET(rqc, rqc, state, next_state); err = mlx5_core_modify_rq(mdev, in, inlen); kvfree(in); return (err); } static void mlx5e_disable_rq(struct mlx5e_rq *rq) { struct mlx5e_channel *c = rq->channel; struct mlx5e_priv *priv = c->priv; struct mlx5_core_dev *mdev = priv->mdev; mlx5_core_destroy_rq(mdev, rq->rqn); } static int mlx5e_wait_for_min_rx_wqes(struct mlx5e_rq *rq) { struct mlx5e_channel *c = rq->channel; struct mlx5e_priv *priv = c->priv; struct mlx5_wq_ll *wq = &rq->wq; int i; for (i = 0; i < 1000; i++) { if (wq->cur_sz >= priv->params.min_rx_wqes) return (0); msleep(4); } return (-ETIMEDOUT); } static int mlx5e_open_rq(struct mlx5e_channel *c, struct mlx5e_rq_param *param, struct mlx5e_rq *rq) { int err; err = mlx5e_create_rq(c, param, rq); if (err) return (err); err = mlx5e_enable_rq(rq, param); if (err) goto err_destroy_rq; err = mlx5e_modify_rq(rq, MLX5_RQC_STATE_RST, MLX5_RQC_STATE_RDY); if (err) goto err_disable_rq; c->rq.enabled = 1; return (0); err_disable_rq: mlx5e_disable_rq(rq); err_destroy_rq: mlx5e_destroy_rq(rq); return (err); } static void mlx5e_close_rq(struct mlx5e_rq *rq) { mtx_lock(&rq->mtx); rq->enabled = 0; callout_stop(&rq->watchdog); mtx_unlock(&rq->mtx); mlx5e_modify_rq(rq, MLX5_RQC_STATE_RDY, MLX5_RQC_STATE_ERR); } 
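/*
 * RQ teardown is split in two: mlx5e_close_rq() above stops the receive
 * path and moves the RQ into the error state, while mlx5e_close_rq_wait()
 * below destroys the hardware queue, its completion queue and the DIM
 * work once the queue has been quiesced.
 */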
static void mlx5e_close_rq_wait(struct mlx5e_rq *rq) { mlx5e_disable_rq(rq); mlx5e_close_cq(&rq->cq); cancel_work_sync(&rq->dim.work); mlx5e_destroy_rq(rq); } void mlx5e_free_sq_db(struct mlx5e_sq *sq) { int wq_sz = mlx5_wq_cyc_get_size(&sq->wq); int x; for (x = 0; x != wq_sz; x++) { if (unlikely(sq->mbuf[x].p_refcount != NULL)) { atomic_add_int(sq->mbuf[x].p_refcount, -1); sq->mbuf[x].p_refcount = NULL; } if (sq->mbuf[x].mbuf != NULL) { bus_dmamap_unload(sq->dma_tag, sq->mbuf[x].dma_map); m_freem(sq->mbuf[x].mbuf); } bus_dmamap_destroy(sq->dma_tag, sq->mbuf[x].dma_map); } free(sq->mbuf, M_MLX5EN); } int mlx5e_alloc_sq_db(struct mlx5e_sq *sq) { int wq_sz = mlx5_wq_cyc_get_size(&sq->wq); int err; int x; sq->mbuf = malloc(wq_sz * sizeof(sq->mbuf[0]), M_MLX5EN, M_WAITOK | M_ZERO); /* Create DMA descriptor MAPs */ for (x = 0; x != wq_sz; x++) { err = -bus_dmamap_create(sq->dma_tag, 0, &sq->mbuf[x].dma_map); if (err != 0) { while (x--) bus_dmamap_destroy(sq->dma_tag, sq->mbuf[x].dma_map); free(sq->mbuf, M_MLX5EN); return (err); } } return (0); } static const char *mlx5e_sq_stats_desc[] = { MLX5E_SQ_STATS(MLX5E_STATS_DESC) }; void mlx5e_update_sq_inline(struct mlx5e_sq *sq) { sq->max_inline = sq->priv->params.tx_max_inline; sq->min_inline_mode = sq->priv->params.tx_min_inline_mode; /* * Check if trust state is DSCP or if inline mode is NONE which * indicates CX-5 or newer hardware. */ if (sq->priv->params_ethtool.trust_state != MLX5_QPTS_TRUST_PCP || sq->min_inline_mode == MLX5_INLINE_MODE_NONE) { if (MLX5_CAP_ETH(sq->priv->mdev, wqe_vlan_insert)) sq->min_insert_caps = MLX5E_INSERT_VLAN | MLX5E_INSERT_NON_VLAN; else sq->min_insert_caps = MLX5E_INSERT_NON_VLAN; } else { sq->min_insert_caps = 0; } } static void mlx5e_refresh_sq_inline_sub(struct mlx5e_priv *priv, struct mlx5e_channel *c) { int i; for (i = 0; i != priv->num_tc; i++) { mtx_lock(&c->sq[i].lock); mlx5e_update_sq_inline(&c->sq[i]); mtx_unlock(&c->sq[i].lock); } } void mlx5e_refresh_sq_inline(struct mlx5e_priv *priv) { int i; /* check if channels are closed */ if (test_bit(MLX5E_STATE_OPENED, &priv->state) == 0) return; for (i = 0; i < priv->params.num_channels; i++) mlx5e_refresh_sq_inline_sub(priv, &priv->channel[i]); } static int mlx5e_create_sq(struct mlx5e_channel *c, int tc, struct mlx5e_sq_param *param, struct mlx5e_sq *sq) { struct mlx5e_priv *priv = c->priv; struct mlx5_core_dev *mdev = priv->mdev; char buffer[16]; void *sqc = param->sqc; void *sqc_wq = MLX5_ADDR_OF(sqc, sqc, wq); int err; /* Create DMA descriptor TAG */ if ((err = -bus_dma_tag_create( bus_get_dma_tag(mdev->pdev->dev.bsddev), 1, /* any alignment */ 0, /* no boundary */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ MLX5E_MAX_TX_PAYLOAD_SIZE, /* maxsize */ MLX5E_MAX_TX_MBUF_FRAGS, /* nsegments */ MLX5E_MAX_TX_MBUF_SIZE, /* maxsegsize */ 0, /* flags */ NULL, NULL, /* lockfunc, lockfuncarg */ &sq->dma_tag))) goto done; err = mlx5_alloc_map_uar(mdev, &sq->uar); if (err) goto err_free_dma_tag; err = mlx5_wq_cyc_create(mdev, ¶m->wq, sqc_wq, &sq->wq, &sq->wq_ctrl); if (err) goto err_unmap_free_uar; sq->wq.db = &sq->wq.db[MLX5_SND_DBR]; sq->bf_buf_size = (1 << MLX5_CAP_GEN(mdev, log_bf_reg_size)) / 2; err = mlx5e_alloc_sq_db(sq); if (err) goto err_sq_wq_destroy; sq->mkey_be = cpu_to_be32(priv->mr.key); sq->ifp = priv->ifp; sq->priv = priv; sq->tc = tc; mlx5e_update_sq_inline(sq); snprintf(buffer, sizeof(buffer), "txstat%dtc%d", c->ix, tc); mlx5e_create_stats(&sq->stats.ctx, SYSCTL_CHILDREN(priv->sysctl_ifnet), 
buffer, mlx5e_sq_stats_desc, MLX5E_SQ_STATS_NUM, sq->stats.arg); return (0); err_sq_wq_destroy: mlx5_wq_destroy(&sq->wq_ctrl); err_unmap_free_uar: mlx5_unmap_free_uar(mdev, &sq->uar); err_free_dma_tag: bus_dma_tag_destroy(sq->dma_tag); done: return (err); } static void mlx5e_destroy_sq(struct mlx5e_sq *sq) { /* destroy all sysctl nodes */ sysctl_ctx_free(&sq->stats.ctx); mlx5e_free_sq_db(sq); mlx5_wq_destroy(&sq->wq_ctrl); mlx5_unmap_free_uar(sq->priv->mdev, &sq->uar); bus_dma_tag_destroy(sq->dma_tag); } int mlx5e_enable_sq(struct mlx5e_sq *sq, struct mlx5e_sq_param *param, int tis_num) { void *in; void *sqc; void *wq; int inlen; int err; inlen = MLX5_ST_SZ_BYTES(create_sq_in) + sizeof(u64) * sq->wq_ctrl.buf.npages; in = mlx5_vzalloc(inlen); if (in == NULL) return (-ENOMEM); sqc = MLX5_ADDR_OF(create_sq_in, in, ctx); wq = MLX5_ADDR_OF(sqc, sqc, wq); memcpy(sqc, param->sqc, sizeof(param->sqc)); MLX5_SET(sqc, sqc, tis_num_0, tis_num); MLX5_SET(sqc, sqc, cqn, sq->cq.mcq.cqn); MLX5_SET(sqc, sqc, state, MLX5_SQC_STATE_RST); MLX5_SET(sqc, sqc, tis_lst_sz, 1); MLX5_SET(sqc, sqc, flush_in_error_en, 1); MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_CYCLIC); MLX5_SET(wq, wq, uar_page, sq->uar.index); MLX5_SET(wq, wq, log_wq_pg_sz, sq->wq_ctrl.buf.page_shift - PAGE_SHIFT); MLX5_SET64(wq, wq, dbr_addr, sq->wq_ctrl.db.dma); mlx5_fill_page_array(&sq->wq_ctrl.buf, (__be64 *) MLX5_ADDR_OF(wq, wq, pas)); err = mlx5_core_create_sq(sq->priv->mdev, in, inlen, &sq->sqn); kvfree(in); return (err); } int mlx5e_modify_sq(struct mlx5e_sq *sq, int curr_state, int next_state) { void *in; void *sqc; int inlen; int err; inlen = MLX5_ST_SZ_BYTES(modify_sq_in); in = mlx5_vzalloc(inlen); if (in == NULL) return (-ENOMEM); sqc = MLX5_ADDR_OF(modify_sq_in, in, ctx); MLX5_SET(modify_sq_in, in, sqn, sq->sqn); MLX5_SET(modify_sq_in, in, sq_state, curr_state); MLX5_SET(sqc, sqc, state, next_state); err = mlx5_core_modify_sq(sq->priv->mdev, in, inlen); kvfree(in); return (err); } void mlx5e_disable_sq(struct mlx5e_sq *sq) { mlx5_core_destroy_sq(sq->priv->mdev, sq->sqn); } static int mlx5e_open_sq(struct mlx5e_channel *c, int tc, struct mlx5e_sq_param *param, struct mlx5e_sq *sq) { int err; sq->cev_factor = c->priv->params_ethtool.tx_completion_fact; /* ensure the TX completion event factor is not zero */ if (sq->cev_factor == 0) sq->cev_factor = 1; err = mlx5e_create_sq(c, tc, param, sq); if (err) return (err); err = mlx5e_enable_sq(sq, param, c->priv->tisn[tc]); if (err) goto err_destroy_sq; err = mlx5e_modify_sq(sq, MLX5_SQC_STATE_RST, MLX5_SQC_STATE_RDY); if (err) goto err_disable_sq; WRITE_ONCE(sq->running, 1); return (0); err_disable_sq: mlx5e_disable_sq(sq); err_destroy_sq: mlx5e_destroy_sq(sq); return (err); } static void mlx5e_sq_send_nops_locked(struct mlx5e_sq *sq, int can_sleep) { /* fill up remainder with NOPs */ while (sq->cev_counter != 0) { while (!mlx5e_sq_has_room_for(sq, 1)) { if (can_sleep != 0) { mtx_unlock(&sq->lock); msleep(4); mtx_lock(&sq->lock); } else { goto done; } } /* send a single NOP */ mlx5e_send_nop(sq, 1); atomic_thread_fence_rel(); } done: /* Check if we need to write the doorbell */ if (likely(sq->doorbell.d64 != 0)) { mlx5e_tx_notify_hw(sq, sq->doorbell.d32, 0); sq->doorbell.d64 = 0; } } void mlx5e_sq_cev_timeout(void *arg) { struct mlx5e_sq *sq = arg; mtx_assert(&sq->lock, MA_OWNED); /* check next state */ switch (sq->cev_next_state) { case MLX5E_CEV_STATE_SEND_NOPS: /* fill TX ring with NOPs, if any */ mlx5e_sq_send_nops_locked(sq, 0); /* check if completed */ if (sq->cev_counter == 0) { 
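/* NOP fill-up completed; return to the initial state without re-arming */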
sq->cev_next_state = MLX5E_CEV_STATE_INITIAL; return; } break; default: /* send NOPs on next timeout */ sq->cev_next_state = MLX5E_CEV_STATE_SEND_NOPS; break; } /* restart timer */ callout_reset_curcpu(&sq->cev_callout, hz, mlx5e_sq_cev_timeout, sq); } void mlx5e_drain_sq(struct mlx5e_sq *sq) { int error; struct mlx5_core_dev *mdev= sq->priv->mdev; /* * Check if already stopped. * * NOTE: Serialization of this function is managed by the * caller ensuring the priv's state lock is locked or in case * of rate limit support, a single thread manages drain and * resume of SQs. The "running" variable can therefore safely * be read without any locks. */ if (READ_ONCE(sq->running) == 0) return; /* don't put more packets into the SQ */ WRITE_ONCE(sq->running, 0); /* serialize access to DMA rings */ mtx_lock(&sq->lock); /* teardown event factor timer, if any */ sq->cev_next_state = MLX5E_CEV_STATE_HOLD_NOPS; callout_stop(&sq->cev_callout); /* send dummy NOPs in order to flush the transmit ring */ mlx5e_sq_send_nops_locked(sq, 1); mtx_unlock(&sq->lock); /* wait till SQ is empty or link is down */ mtx_lock(&sq->lock); while (sq->cc != sq->pc && (sq->priv->media_status_last & IFM_ACTIVE) != 0 && mdev->state != MLX5_DEVICE_STATE_INTERNAL_ERROR) { mtx_unlock(&sq->lock); msleep(1); sq->cq.mcq.comp(&sq->cq.mcq); mtx_lock(&sq->lock); } mtx_unlock(&sq->lock); /* error out remaining requests */ error = mlx5e_modify_sq(sq, MLX5_SQC_STATE_RDY, MLX5_SQC_STATE_ERR); if (error != 0) { mlx5_en_err(sq->ifp, "mlx5e_modify_sq() from RDY to ERR failed: %d\n", error); } /* wait till SQ is empty */ mtx_lock(&sq->lock); while (sq->cc != sq->pc && mdev->state != MLX5_DEVICE_STATE_INTERNAL_ERROR) { mtx_unlock(&sq->lock); msleep(1); sq->cq.mcq.comp(&sq->cq.mcq); mtx_lock(&sq->lock); } mtx_unlock(&sq->lock); } static void mlx5e_close_sq_wait(struct mlx5e_sq *sq) { mlx5e_drain_sq(sq); mlx5e_disable_sq(sq); mlx5e_destroy_sq(sq); } static int mlx5e_create_cq(struct mlx5e_priv *priv, struct mlx5e_cq_param *param, struct mlx5e_cq *cq, mlx5e_cq_comp_t *comp, int eq_ix) { struct mlx5_core_dev *mdev = priv->mdev; struct mlx5_core_cq *mcq = &cq->mcq; int eqn_not_used; int irqn; int err; u32 i; param->wq.buf_numa_node = 0; param->wq.db_numa_node = 0; err = mlx5_vector2eqn(mdev, eq_ix, &eqn_not_used, &irqn); if (err) return (err); err = mlx5_cqwq_create(mdev, ¶m->wq, param->cqc, &cq->wq, &cq->wq_ctrl); if (err) return (err); mcq->cqe_sz = 64; mcq->set_ci_db = cq->wq_ctrl.db.db; mcq->arm_db = cq->wq_ctrl.db.db + 1; *mcq->set_ci_db = 0; *mcq->arm_db = 0; mcq->vector = eq_ix; mcq->comp = comp; mcq->event = mlx5e_cq_error_event; mcq->irqn = irqn; mcq->uar = &priv->cq_uar; for (i = 0; i < mlx5_cqwq_get_size(&cq->wq); i++) { struct mlx5_cqe64 *cqe = mlx5_cqwq_get_wqe(&cq->wq, i); cqe->op_own = 0xf1; } cq->priv = priv; return (0); } static void mlx5e_destroy_cq(struct mlx5e_cq *cq) { mlx5_wq_destroy(&cq->wq_ctrl); } static int mlx5e_enable_cq(struct mlx5e_cq *cq, struct mlx5e_cq_param *param, int eq_ix) { struct mlx5_core_cq *mcq = &cq->mcq; void *in; void *cqc; int inlen; int irqn_not_used; int eqn; int err; inlen = MLX5_ST_SZ_BYTES(create_cq_in) + sizeof(u64) * cq->wq_ctrl.buf.npages; in = mlx5_vzalloc(inlen); if (in == NULL) return (-ENOMEM); cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context); memcpy(cqc, param->cqc, sizeof(param->cqc)); mlx5_fill_page_array(&cq->wq_ctrl.buf, (__be64 *) MLX5_ADDR_OF(create_cq_in, in, pas)); mlx5_vector2eqn(cq->priv->mdev, eq_ix, &eqn, &irqn_not_used); MLX5_SET(cqc, cqc, c_eqn, eqn); MLX5_SET(cqc, cqc, 
uar_page, mcq->uar->index); MLX5_SET(cqc, cqc, log_page_size, cq->wq_ctrl.buf.page_shift - PAGE_SHIFT); MLX5_SET64(cqc, cqc, dbr_addr, cq->wq_ctrl.db.dma); err = mlx5_core_create_cq(cq->priv->mdev, mcq, in, inlen); kvfree(in); if (err) return (err); mlx5e_cq_arm(cq, MLX5_GET_DOORBELL_LOCK(&cq->priv->doorbell_lock)); return (0); } static void mlx5e_disable_cq(struct mlx5e_cq *cq) { mlx5_core_destroy_cq(cq->priv->mdev, &cq->mcq); } int mlx5e_open_cq(struct mlx5e_priv *priv, struct mlx5e_cq_param *param, struct mlx5e_cq *cq, mlx5e_cq_comp_t *comp, int eq_ix) { int err; err = mlx5e_create_cq(priv, param, cq, comp, eq_ix); if (err) return (err); err = mlx5e_enable_cq(cq, param, eq_ix); if (err) goto err_destroy_cq; return (0); err_destroy_cq: mlx5e_destroy_cq(cq); return (err); } void mlx5e_close_cq(struct mlx5e_cq *cq) { mlx5e_disable_cq(cq); mlx5e_destroy_cq(cq); } static int mlx5e_open_tx_cqs(struct mlx5e_channel *c, struct mlx5e_channel_param *cparam) { int err; int tc; for (tc = 0; tc < c->priv->num_tc; tc++) { /* open completion queue */ err = mlx5e_open_cq(c->priv, &cparam->tx_cq, &c->sq[tc].cq, &mlx5e_tx_cq_comp, c->ix); if (err) goto err_close_tx_cqs; } return (0); err_close_tx_cqs: for (tc--; tc >= 0; tc--) mlx5e_close_cq(&c->sq[tc].cq); return (err); } static void mlx5e_close_tx_cqs(struct mlx5e_channel *c) { int tc; for (tc = 0; tc < c->priv->num_tc; tc++) mlx5e_close_cq(&c->sq[tc].cq); } static int mlx5e_open_sqs(struct mlx5e_channel *c, struct mlx5e_channel_param *cparam) { int err; int tc; for (tc = 0; tc < c->priv->num_tc; tc++) { err = mlx5e_open_sq(c, tc, &cparam->sq, &c->sq[tc]); if (err) goto err_close_sqs; } return (0); err_close_sqs: for (tc--; tc >= 0; tc--) mlx5e_close_sq_wait(&c->sq[tc]); return (err); } static void mlx5e_close_sqs_wait(struct mlx5e_channel *c) { int tc; for (tc = 0; tc < c->priv->num_tc; tc++) mlx5e_close_sq_wait(&c->sq[tc]); } static void mlx5e_chan_static_init(struct mlx5e_priv *priv, struct mlx5e_channel *c, int ix) { int tc; /* setup priv and channel number */ c->priv = priv; c->ix = ix; /* setup send tag */ c->tag.type = IF_SND_TAG_TYPE_UNLIMITED; m_snd_tag_init(&c->tag.m_snd_tag, c->priv->ifp); init_completion(&c->completion); mtx_init(&c->rq.mtx, "mlx5rx", MTX_NETWORK_LOCK, MTX_DEF); callout_init_mtx(&c->rq.watchdog, &c->rq.mtx, 0); for (tc = 0; tc != MLX5E_MAX_TX_NUM_TC; tc++) { struct mlx5e_sq *sq = c->sq + tc; mtx_init(&sq->lock, "mlx5tx", MTX_NETWORK_LOCK " TX", MTX_DEF); mtx_init(&sq->comp_lock, "mlx5comp", MTX_NETWORK_LOCK " TX", MTX_DEF); callout_init_mtx(&sq->cev_callout, &sq->lock, 0); } } static void mlx5e_chan_wait_for_completion(struct mlx5e_channel *c) { m_snd_tag_rele(&c->tag.m_snd_tag); wait_for_completion(&c->completion); } static void mlx5e_priv_wait_for_completion(struct mlx5e_priv *priv, const uint32_t channels) { uint32_t x; for (x = 0; x != channels; x++) mlx5e_chan_wait_for_completion(&priv->channel[x]); } static void mlx5e_chan_static_destroy(struct mlx5e_channel *c) { int tc; callout_drain(&c->rq.watchdog); mtx_destroy(&c->rq.mtx); for (tc = 0; tc != MLX5E_MAX_TX_NUM_TC; tc++) { callout_drain(&c->sq[tc].cev_callout); mtx_destroy(&c->sq[tc].lock); mtx_destroy(&c->sq[tc].comp_lock); } } static int mlx5e_open_channel(struct mlx5e_priv *priv, struct mlx5e_channel_param *cparam, struct mlx5e_channel *c) { struct epoch_tracker et; int i, err; /* zero non-persistant data */ MLX5E_ZERO(&c->rq, mlx5e_rq_zero_start); for (i = 0; i != priv->num_tc; i++) MLX5E_ZERO(&c->sq[i], mlx5e_sq_zero_start); /* open transmit completion queue */ 
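/*
 * Completion queues are created before the SQs and the RQ because queue
 * creation records the CQ number (cq.mcq.cqn) in the queue context.
 */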
err = mlx5e_open_tx_cqs(c, cparam); if (err) goto err_free; /* open receive completion queue */ err = mlx5e_open_cq(c->priv, &cparam->rx_cq, &c->rq.cq, &mlx5e_rx_cq_comp, c->ix); if (err) goto err_close_tx_cqs; err = mlx5e_open_sqs(c, cparam); if (err) goto err_close_rx_cq; err = mlx5e_open_rq(c, &cparam->rq, &c->rq); if (err) goto err_close_sqs; /* poll receive queue initially */ NET_EPOCH_ENTER(et); c->rq.cq.mcq.comp(&c->rq.cq.mcq); NET_EPOCH_EXIT(et); return (0); err_close_sqs: mlx5e_close_sqs_wait(c); err_close_rx_cq: mlx5e_close_cq(&c->rq.cq); err_close_tx_cqs: mlx5e_close_tx_cqs(c); err_free: return (err); } static void mlx5e_close_channel(struct mlx5e_channel *c) { mlx5e_close_rq(&c->rq); } static void mlx5e_close_channel_wait(struct mlx5e_channel *c) { mlx5e_close_rq_wait(&c->rq); mlx5e_close_sqs_wait(c); mlx5e_close_tx_cqs(c); } static int mlx5e_get_wqe_sz(struct mlx5e_priv *priv, u32 *wqe_sz, u32 *nsegs) { u32 r, n; r = priv->params.hw_lro_en ? priv->params.lro_wqe_sz : MLX5E_SW2MB_MTU(priv->ifp->if_mtu); if (r > MJUM16BYTES) return (-ENOMEM); if (r > MJUM9BYTES) r = MJUM16BYTES; else if (r > MJUMPAGESIZE) r = MJUM9BYTES; else if (r > MCLBYTES) r = MJUMPAGESIZE; else r = MCLBYTES; /* * n + 1 must be a power of two, because stride size must be. * Stride size is 16 * (n + 1), as the first segment is * control. */ for (n = howmany(r, MLX5E_MAX_RX_BYTES); !powerof2(n + 1); n++) ; if (n > MLX5E_MAX_BUSDMA_RX_SEGS) return (-ENOMEM); *wqe_sz = r; *nsegs = n; return (0); } static void mlx5e_build_rq_param(struct mlx5e_priv *priv, struct mlx5e_rq_param *param) { void *rqc = param->rqc; void *wq = MLX5_ADDR_OF(rqc, rqc, wq); u32 wqe_sz, nsegs; mlx5e_get_wqe_sz(priv, &wqe_sz, &nsegs); MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_LINKED_LIST); MLX5_SET(wq, wq, end_padding_mode, MLX5_WQ_END_PAD_MODE_ALIGN); MLX5_SET(wq, wq, log_wq_stride, ilog2(sizeof(struct mlx5e_rx_wqe) + nsegs * sizeof(struct mlx5_wqe_data_seg))); MLX5_SET(wq, wq, log_wq_sz, priv->params.log_rq_size); MLX5_SET(wq, wq, pd, priv->pdn); param->wq.buf_numa_node = 0; param->wq.db_numa_node = 0; param->wq.linear = 1; } static void mlx5e_build_sq_param(struct mlx5e_priv *priv, struct mlx5e_sq_param *param) { void *sqc = param->sqc; void *wq = MLX5_ADDR_OF(sqc, sqc, wq); MLX5_SET(wq, wq, log_wq_sz, priv->params.log_sq_size); MLX5_SET(wq, wq, log_wq_stride, ilog2(MLX5_SEND_WQE_BB)); MLX5_SET(wq, wq, pd, priv->pdn); param->wq.buf_numa_node = 0; param->wq.db_numa_node = 0; param->wq.linear = 1; } static void mlx5e_build_common_cq_param(struct mlx5e_priv *priv, struct mlx5e_cq_param *param) { void *cqc = param->cqc; MLX5_SET(cqc, cqc, uar_page, priv->cq_uar.index); } static void mlx5e_get_default_profile(struct mlx5e_priv *priv, int mode, struct net_dim_cq_moder *ptr) { *ptr = net_dim_get_profile(mode, MLX5E_DIM_DEFAULT_PROFILE); /* apply LRO restrictions */ if (priv->params.hw_lro_en && ptr->pkts > MLX5E_DIM_MAX_RX_CQ_MODERATION_PKTS_WITH_LRO) { ptr->pkts = MLX5E_DIM_MAX_RX_CQ_MODERATION_PKTS_WITH_LRO; } } static void mlx5e_build_rx_cq_param(struct mlx5e_priv *priv, struct mlx5e_cq_param *param) { struct net_dim_cq_moder curr; void *cqc = param->cqc; /* * We use MLX5_CQE_FORMAT_HASH because the RX hash mini CQE * format is more beneficial for FreeBSD use case. * * Adding support for MLX5_CQE_FORMAT_CSUM will require changes * in mlx5e_decompress_cqe. 
*/ if (priv->params.cqe_zipping_en) { MLX5_SET(cqc, cqc, mini_cqe_res_format, MLX5_CQE_FORMAT_HASH); MLX5_SET(cqc, cqc, cqe_compression_en, 1); } MLX5_SET(cqc, cqc, log_cq_size, priv->params.log_rq_size); switch (priv->params.rx_cq_moderation_mode) { case 0: MLX5_SET(cqc, cqc, cq_period, priv->params.rx_cq_moderation_usec); MLX5_SET(cqc, cqc, cq_max_count, priv->params.rx_cq_moderation_pkts); MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE); break; case 1: MLX5_SET(cqc, cqc, cq_period, priv->params.rx_cq_moderation_usec); MLX5_SET(cqc, cqc, cq_max_count, priv->params.rx_cq_moderation_pkts); if (MLX5_CAP_GEN(priv->mdev, cq_period_start_from_cqe)) MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_CQE); else MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE); break; case 2: mlx5e_get_default_profile(priv, NET_DIM_CQ_PERIOD_MODE_START_FROM_EQE, &curr); MLX5_SET(cqc, cqc, cq_period, curr.usec); MLX5_SET(cqc, cqc, cq_max_count, curr.pkts); MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE); break; case 3: mlx5e_get_default_profile(priv, NET_DIM_CQ_PERIOD_MODE_START_FROM_CQE, &curr); MLX5_SET(cqc, cqc, cq_period, curr.usec); MLX5_SET(cqc, cqc, cq_max_count, curr.pkts); if (MLX5_CAP_GEN(priv->mdev, cq_period_start_from_cqe)) MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_CQE); else MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE); break; default: break; } mlx5e_dim_build_cq_param(priv, param); mlx5e_build_common_cq_param(priv, param); } static void mlx5e_build_tx_cq_param(struct mlx5e_priv *priv, struct mlx5e_cq_param *param) { void *cqc = param->cqc; MLX5_SET(cqc, cqc, log_cq_size, priv->params.log_sq_size); MLX5_SET(cqc, cqc, cq_period, priv->params.tx_cq_moderation_usec); MLX5_SET(cqc, cqc, cq_max_count, priv->params.tx_cq_moderation_pkts); switch (priv->params.tx_cq_moderation_mode) { case 0: MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE); break; default: if (MLX5_CAP_GEN(priv->mdev, cq_period_start_from_cqe)) MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_CQE); else MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE); break; } mlx5e_build_common_cq_param(priv, param); } static void mlx5e_build_channel_param(struct mlx5e_priv *priv, struct mlx5e_channel_param *cparam) { memset(cparam, 0, sizeof(*cparam)); mlx5e_build_rq_param(priv, &cparam->rq); mlx5e_build_sq_param(priv, &cparam->sq); mlx5e_build_rx_cq_param(priv, &cparam->rx_cq); mlx5e_build_tx_cq_param(priv, &cparam->tx_cq); } static int mlx5e_open_channels(struct mlx5e_priv *priv) { struct mlx5e_channel_param *cparam; int err; int i; int j; cparam = malloc(sizeof(*cparam), M_MLX5EN, M_WAITOK); mlx5e_build_channel_param(priv, cparam); for (i = 0; i < priv->params.num_channels; i++) { err = mlx5e_open_channel(priv, cparam, &priv->channel[i]); if (err) goto err_close_channels; } for (j = 0; j < priv->params.num_channels; j++) { err = mlx5e_wait_for_min_rx_wqes(&priv->channel[j].rq); if (err) goto err_close_channels; } free(cparam, M_MLX5EN); return (0); err_close_channels: while (i--) { mlx5e_close_channel(&priv->channel[i]); mlx5e_close_channel_wait(&priv->channel[i]); } free(cparam, M_MLX5EN); return (err); } static void mlx5e_close_channels(struct mlx5e_priv *priv) { int i; for (i = 0; i < priv->params.num_channels; i++) mlx5e_close_channel(&priv->channel[i]); for (i = 0; i < priv->params.num_channels; i++) mlx5e_close_channel_wait(&priv->channel[i]); } static 
int mlx5e_refresh_sq_params(struct mlx5e_priv *priv, struct mlx5e_sq *sq) { if (MLX5_CAP_GEN(priv->mdev, cq_period_mode_modify)) { uint8_t cq_mode; switch (priv->params.tx_cq_moderation_mode) { case 0: case 2: cq_mode = MLX5_CQ_PERIOD_MODE_START_FROM_EQE; break; default: cq_mode = MLX5_CQ_PERIOD_MODE_START_FROM_CQE; break; } return (mlx5_core_modify_cq_moderation_mode(priv->mdev, &sq->cq.mcq, priv->params.tx_cq_moderation_usec, priv->params.tx_cq_moderation_pkts, cq_mode)); } return (mlx5_core_modify_cq_moderation(priv->mdev, &sq->cq.mcq, priv->params.tx_cq_moderation_usec, priv->params.tx_cq_moderation_pkts)); } static int mlx5e_refresh_rq_params(struct mlx5e_priv *priv, struct mlx5e_rq *rq) { if (MLX5_CAP_GEN(priv->mdev, cq_period_mode_modify)) { uint8_t cq_mode; uint8_t dim_mode; int retval; switch (priv->params.rx_cq_moderation_mode) { case 0: case 2: cq_mode = MLX5_CQ_PERIOD_MODE_START_FROM_EQE; dim_mode = NET_DIM_CQ_PERIOD_MODE_START_FROM_EQE; break; default: cq_mode = MLX5_CQ_PERIOD_MODE_START_FROM_CQE; dim_mode = NET_DIM_CQ_PERIOD_MODE_START_FROM_CQE; break; } /* tear down dynamic interrupt moderation */ mtx_lock(&rq->mtx); rq->dim.mode = NET_DIM_CQ_PERIOD_MODE_DISABLED; mtx_unlock(&rq->mtx); /* wait for dynamic interrupt moderation work task, if any */ cancel_work_sync(&rq->dim.work); if (priv->params.rx_cq_moderation_mode >= 2) { struct net_dim_cq_moder curr; mlx5e_get_default_profile(priv, dim_mode, &curr); retval = mlx5_core_modify_cq_moderation_mode(priv->mdev, &rq->cq.mcq, curr.usec, curr.pkts, cq_mode); /* set dynamic interrupt moderation mode and zero defaults */ mtx_lock(&rq->mtx); rq->dim.mode = dim_mode; rq->dim.state = 0; rq->dim.profile_ix = MLX5E_DIM_DEFAULT_PROFILE; mtx_unlock(&rq->mtx); } else { retval = mlx5_core_modify_cq_moderation_mode(priv->mdev, &rq->cq.mcq, priv->params.rx_cq_moderation_usec, priv->params.rx_cq_moderation_pkts, cq_mode); } return (retval); } return (mlx5_core_modify_cq_moderation(priv->mdev, &rq->cq.mcq, priv->params.rx_cq_moderation_usec, priv->params.rx_cq_moderation_pkts)); } static int mlx5e_refresh_channel_params_sub(struct mlx5e_priv *priv, struct mlx5e_channel *c) { int err; int i; err = mlx5e_refresh_rq_params(priv, &c->rq); if (err) goto done; for (i = 0; i != priv->num_tc; i++) { err = mlx5e_refresh_sq_params(priv, &c->sq[i]); if (err) goto done; } done: return (err); } int mlx5e_refresh_channel_params(struct mlx5e_priv *priv) { int i; /* check if channels are closed */ if (test_bit(MLX5E_STATE_OPENED, &priv->state) == 0) return (EINVAL); for (i = 0; i < priv->params.num_channels; i++) { int err; err = mlx5e_refresh_channel_params_sub(priv, &priv->channel[i]); if (err) return (err); } return (0); } static int mlx5e_open_tis(struct mlx5e_priv *priv, int tc) { struct mlx5_core_dev *mdev = priv->mdev; u32 in[MLX5_ST_SZ_DW(create_tis_in)]; void *tisc = MLX5_ADDR_OF(create_tis_in, in, ctx); memset(in, 0, sizeof(in)); MLX5_SET(tisc, tisc, prio, tc); MLX5_SET(tisc, tisc, transport_domain, priv->tdn); return (mlx5_core_create_tis(mdev, in, sizeof(in), &priv->tisn[tc])); } static void mlx5e_close_tis(struct mlx5e_priv *priv, int tc) { mlx5_core_destroy_tis(priv->mdev, priv->tisn[tc]); } static int mlx5e_open_tises(struct mlx5e_priv *priv) { int num_tc = priv->num_tc; int err; int tc; for (tc = 0; tc < num_tc; tc++) { err = mlx5e_open_tis(priv, tc); if (err) goto err_close_tises; } return (0); err_close_tises: for (tc--; tc >= 0; tc--) mlx5e_close_tis(priv, tc); return (err); } static void mlx5e_close_tises(struct mlx5e_priv *priv) { int 
num_tc = priv->num_tc; int tc; for (tc = 0; tc < num_tc; tc++) mlx5e_close_tis(priv, tc); } static int mlx5e_open_rqt(struct mlx5e_priv *priv) { struct mlx5_core_dev *mdev = priv->mdev; u32 *in; u32 out[MLX5_ST_SZ_DW(create_rqt_out)] = {0}; void *rqtc; int inlen; int err; int sz; int i; sz = 1 << priv->params.rx_hash_log_tbl_sz; inlen = MLX5_ST_SZ_BYTES(create_rqt_in) + sizeof(u32) * sz; in = mlx5_vzalloc(inlen); if (in == NULL) return (-ENOMEM); rqtc = MLX5_ADDR_OF(create_rqt_in, in, rqt_context); MLX5_SET(rqtc, rqtc, rqt_actual_size, sz); MLX5_SET(rqtc, rqtc, rqt_max_size, sz); for (i = 0; i < sz; i++) { int ix = i; #ifdef RSS ix = rss_get_indirection_to_bucket(ix); #endif /* ensure we don't overflow */ ix %= priv->params.num_channels; /* apply receive side scaling stride, if any */ ix -= ix % (int)priv->params.channels_rsss; MLX5_SET(rqtc, rqtc, rq_num[i], priv->channel[ix].rq.rqn); } MLX5_SET(create_rqt_in, in, opcode, MLX5_CMD_OP_CREATE_RQT); err = mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out)); if (!err) priv->rqtn = MLX5_GET(create_rqt_out, out, rqtn); kvfree(in); return (err); } static void mlx5e_close_rqt(struct mlx5e_priv *priv) { u32 in[MLX5_ST_SZ_DW(destroy_rqt_in)] = {0}; u32 out[MLX5_ST_SZ_DW(destroy_rqt_out)] = {0}; MLX5_SET(destroy_rqt_in, in, opcode, MLX5_CMD_OP_DESTROY_RQT); MLX5_SET(destroy_rqt_in, in, rqtn, priv->rqtn); mlx5_cmd_exec(priv->mdev, in, sizeof(in), out, sizeof(out)); } static void mlx5e_build_tir_ctx(struct mlx5e_priv *priv, u32 * tirc, int tt) { void *hfso = MLX5_ADDR_OF(tirc, tirc, rx_hash_field_selector_outer); __be32 *hkey; MLX5_SET(tirc, tirc, transport_domain, priv->tdn); #define ROUGH_MAX_L2_L3_HDR_SZ 256 #define MLX5_HASH_IP (MLX5_HASH_FIELD_SEL_SRC_IP |\ MLX5_HASH_FIELD_SEL_DST_IP) #define MLX5_HASH_ALL (MLX5_HASH_FIELD_SEL_SRC_IP |\ MLX5_HASH_FIELD_SEL_DST_IP |\ MLX5_HASH_FIELD_SEL_L4_SPORT |\ MLX5_HASH_FIELD_SEL_L4_DPORT) #define MLX5_HASH_IP_IPSEC_SPI (MLX5_HASH_FIELD_SEL_SRC_IP |\ MLX5_HASH_FIELD_SEL_DST_IP |\ MLX5_HASH_FIELD_SEL_IPSEC_SPI) if (priv->params.hw_lro_en) { MLX5_SET(tirc, tirc, lro_enable_mask, MLX5_TIRC_LRO_ENABLE_MASK_IPV4_LRO | MLX5_TIRC_LRO_ENABLE_MASK_IPV6_LRO); MLX5_SET(tirc, tirc, lro_max_msg_sz, (priv->params.lro_wqe_sz - ROUGH_MAX_L2_L3_HDR_SZ) >> 8); /* TODO: add the option to choose timer value dynamically */ MLX5_SET(tirc, tirc, lro_timeout_period_usecs, MLX5_CAP_ETH(priv->mdev, lro_timer_supported_periods[2])); } /* setup parameters for hashing TIR type, if any */ switch (tt) { case MLX5E_TT_ANY: MLX5_SET(tirc, tirc, disp_type, MLX5_TIRC_DISP_TYPE_DIRECT); MLX5_SET(tirc, tirc, inline_rqn, priv->channel[0].rq.rqn); break; default: MLX5_SET(tirc, tirc, disp_type, MLX5_TIRC_DISP_TYPE_INDIRECT); MLX5_SET(tirc, tirc, indirect_table, priv->rqtn); MLX5_SET(tirc, tirc, rx_hash_fn, MLX5_TIRC_RX_HASH_FN_HASH_TOEPLITZ); hkey = (__be32 *) MLX5_ADDR_OF(tirc, tirc, rx_hash_toeplitz_key); #ifdef RSS /* * The FreeBSD RSS implementation does currently not * support symmetric Toeplitz hashes: */ MLX5_SET(tirc, tirc, rx_hash_symmetric, 0); rss_getkey((uint8_t *)hkey); #else MLX5_SET(tirc, tirc, rx_hash_symmetric, 1); hkey[0] = cpu_to_be32(0xD181C62C); hkey[1] = cpu_to_be32(0xF7F4DB5B); hkey[2] = cpu_to_be32(0x1983A2FC); hkey[3] = cpu_to_be32(0x943E1ADB); hkey[4] = cpu_to_be32(0xD9389E6B); hkey[5] = cpu_to_be32(0xD1039C2C); hkey[6] = cpu_to_be32(0xA74499AD); hkey[7] = cpu_to_be32(0x593D56D9); hkey[8] = cpu_to_be32(0xF3253C06); hkey[9] = cpu_to_be32(0x2ADC1FFC); #endif break; } switch (tt) { case MLX5E_TT_IPV4_TCP: 
MLX5_SET(rx_hash_field_select, hfso, l3_prot_type, MLX5_L3_PROT_TYPE_IPV4); MLX5_SET(rx_hash_field_select, hfso, l4_prot_type, MLX5_L4_PROT_TYPE_TCP); #ifdef RSS if (!(rss_gethashconfig() & RSS_HASHTYPE_RSS_TCP_IPV4)) { MLX5_SET(rx_hash_field_select, hfso, selected_fields, MLX5_HASH_IP); } else #endif MLX5_SET(rx_hash_field_select, hfso, selected_fields, MLX5_HASH_ALL); break; case MLX5E_TT_IPV6_TCP: MLX5_SET(rx_hash_field_select, hfso, l3_prot_type, MLX5_L3_PROT_TYPE_IPV6); MLX5_SET(rx_hash_field_select, hfso, l4_prot_type, MLX5_L4_PROT_TYPE_TCP); #ifdef RSS if (!(rss_gethashconfig() & RSS_HASHTYPE_RSS_TCP_IPV6)) { MLX5_SET(rx_hash_field_select, hfso, selected_fields, MLX5_HASH_IP); } else #endif MLX5_SET(rx_hash_field_select, hfso, selected_fields, MLX5_HASH_ALL); break; case MLX5E_TT_IPV4_UDP: MLX5_SET(rx_hash_field_select, hfso, l3_prot_type, MLX5_L3_PROT_TYPE_IPV4); MLX5_SET(rx_hash_field_select, hfso, l4_prot_type, MLX5_L4_PROT_TYPE_UDP); #ifdef RSS if (!(rss_gethashconfig() & RSS_HASHTYPE_RSS_UDP_IPV4)) { MLX5_SET(rx_hash_field_select, hfso, selected_fields, MLX5_HASH_IP); } else #endif MLX5_SET(rx_hash_field_select, hfso, selected_fields, MLX5_HASH_ALL); break; case MLX5E_TT_IPV6_UDP: MLX5_SET(rx_hash_field_select, hfso, l3_prot_type, MLX5_L3_PROT_TYPE_IPV6); MLX5_SET(rx_hash_field_select, hfso, l4_prot_type, MLX5_L4_PROT_TYPE_UDP); #ifdef RSS if (!(rss_gethashconfig() & RSS_HASHTYPE_RSS_UDP_IPV6)) { MLX5_SET(rx_hash_field_select, hfso, selected_fields, MLX5_HASH_IP); } else #endif MLX5_SET(rx_hash_field_select, hfso, selected_fields, MLX5_HASH_ALL); break; case MLX5E_TT_IPV4_IPSEC_AH: MLX5_SET(rx_hash_field_select, hfso, l3_prot_type, MLX5_L3_PROT_TYPE_IPV4); MLX5_SET(rx_hash_field_select, hfso, selected_fields, MLX5_HASH_IP_IPSEC_SPI); break; case MLX5E_TT_IPV6_IPSEC_AH: MLX5_SET(rx_hash_field_select, hfso, l3_prot_type, MLX5_L3_PROT_TYPE_IPV6); MLX5_SET(rx_hash_field_select, hfso, selected_fields, MLX5_HASH_IP_IPSEC_SPI); break; case MLX5E_TT_IPV4_IPSEC_ESP: MLX5_SET(rx_hash_field_select, hfso, l3_prot_type, MLX5_L3_PROT_TYPE_IPV4); MLX5_SET(rx_hash_field_select, hfso, selected_fields, MLX5_HASH_IP_IPSEC_SPI); break; case MLX5E_TT_IPV6_IPSEC_ESP: MLX5_SET(rx_hash_field_select, hfso, l3_prot_type, MLX5_L3_PROT_TYPE_IPV6); MLX5_SET(rx_hash_field_select, hfso, selected_fields, MLX5_HASH_IP_IPSEC_SPI); break; case MLX5E_TT_IPV4: MLX5_SET(rx_hash_field_select, hfso, l3_prot_type, MLX5_L3_PROT_TYPE_IPV4); MLX5_SET(rx_hash_field_select, hfso, selected_fields, MLX5_HASH_IP); break; case MLX5E_TT_IPV6: MLX5_SET(rx_hash_field_select, hfso, l3_prot_type, MLX5_L3_PROT_TYPE_IPV6); MLX5_SET(rx_hash_field_select, hfso, selected_fields, MLX5_HASH_IP); break; default: break; } } static int mlx5e_open_tir(struct mlx5e_priv *priv, int tt) { struct mlx5_core_dev *mdev = priv->mdev; u32 *in; void *tirc; int inlen; int err; inlen = MLX5_ST_SZ_BYTES(create_tir_in); in = mlx5_vzalloc(inlen); if (in == NULL) return (-ENOMEM); tirc = MLX5_ADDR_OF(create_tir_in, in, tir_context); mlx5e_build_tir_ctx(priv, tirc, tt); err = mlx5_core_create_tir(mdev, in, inlen, &priv->tirn[tt]); kvfree(in); return (err); } static void mlx5e_close_tir(struct mlx5e_priv *priv, int tt) { mlx5_core_destroy_tir(priv->mdev, priv->tirn[tt]); } static int mlx5e_open_tirs(struct mlx5e_priv *priv) { int err; int i; for (i = 0; i < MLX5E_NUM_TT; i++) { err = mlx5e_open_tir(priv, i); if (err) goto err_close_tirs; } return (0); err_close_tirs: for (i--; i >= 0; i--) mlx5e_close_tir(priv, i); return (err); } static void 
mlx5e_close_tirs(struct mlx5e_priv *priv) { int i; for (i = 0; i < MLX5E_NUM_TT; i++) mlx5e_close_tir(priv, i); } /* * SW MTU does not include headers, * HW MTU includes all headers and checksums. */ static int mlx5e_set_dev_port_mtu(struct ifnet *ifp, int sw_mtu) { struct mlx5e_priv *priv = ifp->if_softc; struct mlx5_core_dev *mdev = priv->mdev; int hw_mtu; int err; hw_mtu = MLX5E_SW2HW_MTU(sw_mtu); err = mlx5_set_port_mtu(mdev, hw_mtu); if (err) { mlx5_en_err(ifp, "mlx5_set_port_mtu failed setting %d, err=%d\n", sw_mtu, err); return (err); } /* Update vport context MTU */ err = mlx5_set_vport_mtu(mdev, hw_mtu); if (err) { mlx5_en_err(ifp, "Failed updating vport context with MTU size, err=%d\n", err); } ifp->if_mtu = sw_mtu; err = mlx5_query_vport_mtu(mdev, &hw_mtu); if (err || !hw_mtu) { /* fallback to port oper mtu */ err = mlx5_query_port_oper_mtu(mdev, &hw_mtu); } if (err) { mlx5_en_err(ifp, "Query port MTU, after setting new MTU value, failed\n"); return (err); } else if (MLX5E_HW2SW_MTU(hw_mtu) < sw_mtu) { err = -E2BIG, mlx5_en_err(ifp, "Port MTU %d is smaller than ifp mtu %d\n", hw_mtu, sw_mtu); } else if (MLX5E_HW2SW_MTU(hw_mtu) > sw_mtu) { err = -EINVAL; mlx5_en_err(ifp, "Port MTU %d is bigger than ifp mtu %d\n", hw_mtu, sw_mtu); } priv->params_ethtool.hw_mtu = hw_mtu; /* compute MSB */ while (hw_mtu & (hw_mtu - 1)) hw_mtu &= (hw_mtu - 1); priv->params_ethtool.hw_mtu_msb = hw_mtu; return (err); } int mlx5e_open_locked(struct ifnet *ifp) { struct mlx5e_priv *priv = ifp->if_softc; int err; u16 set_id; /* check if already opened */ if (test_bit(MLX5E_STATE_OPENED, &priv->state) != 0) return (0); #ifdef RSS if (rss_getnumbuckets() > priv->params.num_channels) { mlx5_en_info(ifp, "NOTE: There are more RSS buckets(%u) than channels(%u) available\n", rss_getnumbuckets(), priv->params.num_channels); } #endif err = mlx5e_open_tises(priv); if (err) { mlx5_en_err(ifp, "mlx5e_open_tises failed, %d\n", err); return (err); } err = mlx5_vport_alloc_q_counter(priv->mdev, MLX5_INTERFACE_PROTOCOL_ETH, &set_id); if (err) { mlx5_en_err(priv->ifp, "mlx5_vport_alloc_q_counter failed: %d\n", err); goto err_close_tises; } /* store counter set ID */ priv->counter_set_id = set_id; err = mlx5e_open_channels(priv); if (err) { mlx5_en_err(ifp, "mlx5e_open_channels failed, %d\n", err); goto err_dalloc_q_counter; } err = mlx5e_open_rqt(priv); if (err) { mlx5_en_err(ifp, "mlx5e_open_rqt failed, %d\n", err); goto err_close_channels; } err = mlx5e_open_tirs(priv); if (err) { mlx5_en_err(ifp, "mlx5e_open_tir failed, %d\n", err); goto err_close_rqls; } err = mlx5e_open_flow_table(priv); if (err) { mlx5_en_err(ifp, "mlx5e_open_flow_table failed, %d\n", err); goto err_close_tirs; } err = mlx5e_add_all_vlan_rules(priv); if (err) { mlx5_en_err(ifp, "mlx5e_add_all_vlan_rules failed, %d\n", err); goto err_close_flow_table; } set_bit(MLX5E_STATE_OPENED, &priv->state); mlx5e_update_carrier(priv); mlx5e_set_rx_mode_core(priv); return (0); err_close_flow_table: mlx5e_close_flow_table(priv); err_close_tirs: mlx5e_close_tirs(priv); err_close_rqls: mlx5e_close_rqt(priv); err_close_channels: mlx5e_close_channels(priv); err_dalloc_q_counter: mlx5_vport_dealloc_q_counter(priv->mdev, MLX5_INTERFACE_PROTOCOL_ETH, priv->counter_set_id); err_close_tises: mlx5e_close_tises(priv); return (err); } static void mlx5e_open(void *arg) { struct mlx5e_priv *priv = arg; PRIV_LOCK(priv); if (mlx5_set_port_status(priv->mdev, MLX5_PORT_UP)) mlx5_en_err(priv->ifp, "Setting port status to up failed\n"); mlx5e_open_locked(priv->ifp); 
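/*
 * Editor's aside: mlx5e_set_dev_port_mtu() above derives
 * params_ethtool.hw_mtu_msb by repeatedly clearing the lowest set bit
 * of the hardware MTU until only its most significant bit remains
 * ("while (hw_mtu & (hw_mtu - 1)) hw_mtu &= (hw_mtu - 1);").  A
 * standalone sketch of that loop, with a few sample MTU values chosen
 * for illustration only:
 */
#include <stdio.h>

static unsigned
msb_only(unsigned x)
{
	/* clear the lowest set bit until a single bit remains */
	while (x & (x - 1))
		x &= (x - 1);
	return (x);
}

int
main(void)
{
	unsigned samples[] = { 1522, 9018, 9216 };
	unsigned i;

	for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		printf("hw_mtu %u -> msb %u\n", samples[i], msb_only(samples[i]));
	return (0);
}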
priv->ifp->if_drv_flags |= IFF_DRV_RUNNING; PRIV_UNLOCK(priv); } int mlx5e_close_locked(struct ifnet *ifp) { struct mlx5e_priv *priv = ifp->if_softc; /* check if already closed */ if (test_bit(MLX5E_STATE_OPENED, &priv->state) == 0) return (0); clear_bit(MLX5E_STATE_OPENED, &priv->state); mlx5e_set_rx_mode_core(priv); mlx5e_del_all_vlan_rules(priv); if_link_state_change(priv->ifp, LINK_STATE_DOWN); mlx5e_close_flow_table(priv); mlx5e_close_tirs(priv); mlx5e_close_rqt(priv); mlx5e_close_channels(priv); mlx5_vport_dealloc_q_counter(priv->mdev, MLX5_INTERFACE_PROTOCOL_ETH, priv->counter_set_id); mlx5e_close_tises(priv); return (0); } #if (__FreeBSD_version >= 1100000) static uint64_t mlx5e_get_counter(struct ifnet *ifp, ift_counter cnt) { struct mlx5e_priv *priv = ifp->if_softc; u64 retval; /* PRIV_LOCK(priv); XXX not allowed */ switch (cnt) { case IFCOUNTER_IPACKETS: retval = priv->stats.vport.rx_packets; break; case IFCOUNTER_IERRORS: retval = priv->stats.pport.in_range_len_errors + priv->stats.pport.out_of_range_len + priv->stats.pport.too_long_errors + priv->stats.pport.check_seq_err + priv->stats.pport.alignment_err; break; case IFCOUNTER_IQDROPS: retval = priv->stats.vport.rx_out_of_buffer; break; case IFCOUNTER_OPACKETS: retval = priv->stats.vport.tx_packets; break; case IFCOUNTER_OERRORS: retval = priv->stats.port_stats_debug.out_discards; break; case IFCOUNTER_IBYTES: retval = priv->stats.vport.rx_bytes; break; case IFCOUNTER_OBYTES: retval = priv->stats.vport.tx_bytes; break; case IFCOUNTER_IMCASTS: retval = priv->stats.vport.rx_multicast_packets; break; case IFCOUNTER_OMCASTS: retval = priv->stats.vport.tx_multicast_packets; break; case IFCOUNTER_OQDROPS: retval = priv->stats.vport.tx_queue_dropped; break; case IFCOUNTER_COLLISIONS: retval = priv->stats.pport.collisions; break; default: retval = if_get_counter_default(ifp, cnt); break; } /* PRIV_UNLOCK(priv); XXX not allowed */ return (retval); } #endif static void mlx5e_set_rx_mode(struct ifnet *ifp) { struct mlx5e_priv *priv = ifp->if_softc; queue_work(priv->wq, &priv->set_rx_mode_work); } static int mlx5e_ioctl(struct ifnet *ifp, u_long command, caddr_t data) { struct mlx5e_priv *priv; struct ifreq *ifr; struct ifi2creq i2c; int error = 0; int mask = 0; int size_read = 0; int module_status; int module_num; int max_mtu; uint8_t read_addr; priv = ifp->if_softc; /* check if detaching */ if (priv == NULL || priv->gone != 0) return (ENXIO); switch (command) { case SIOCSIFMTU: ifr = (struct ifreq *)data; PRIV_LOCK(priv); mlx5_query_port_max_mtu(priv->mdev, &max_mtu); if (ifr->ifr_mtu >= MLX5E_MTU_MIN && ifr->ifr_mtu <= MIN(MLX5E_MTU_MAX, max_mtu)) { int was_opened; was_opened = test_bit(MLX5E_STATE_OPENED, &priv->state); if (was_opened) mlx5e_close_locked(ifp); /* set new MTU */ mlx5e_set_dev_port_mtu(ifp, ifr->ifr_mtu); if (was_opened) mlx5e_open_locked(ifp); } else { error = EINVAL; mlx5_en_err(ifp, "Invalid MTU value. 
Min val: %d, Max val: %d\n", MLX5E_MTU_MIN, MIN(MLX5E_MTU_MAX, max_mtu)); } PRIV_UNLOCK(priv); break; case SIOCSIFFLAGS: if ((ifp->if_flags & IFF_UP) && (ifp->if_drv_flags & IFF_DRV_RUNNING)) { mlx5e_set_rx_mode(ifp); break; } PRIV_LOCK(priv); if (ifp->if_flags & IFF_UP) { if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) { if (test_bit(MLX5E_STATE_OPENED, &priv->state) == 0) mlx5e_open_locked(ifp); ifp->if_drv_flags |= IFF_DRV_RUNNING; mlx5_set_port_status(priv->mdev, MLX5_PORT_UP); } } else { if (ifp->if_drv_flags & IFF_DRV_RUNNING) { mlx5_set_port_status(priv->mdev, MLX5_PORT_DOWN); if (test_bit(MLX5E_STATE_OPENED, &priv->state) != 0) mlx5e_close_locked(ifp); mlx5e_update_carrier(priv); ifp->if_drv_flags &= ~IFF_DRV_RUNNING; } } PRIV_UNLOCK(priv); break; case SIOCADDMULTI: case SIOCDELMULTI: mlx5e_set_rx_mode(ifp); break; case SIOCSIFMEDIA: case SIOCGIFMEDIA: case SIOCGIFXMEDIA: ifr = (struct ifreq *)data; error = ifmedia_ioctl(ifp, ifr, &priv->media, command); break; case SIOCSIFCAP: ifr = (struct ifreq *)data; PRIV_LOCK(priv); mask = ifr->ifr_reqcap ^ ifp->if_capenable; if (mask & IFCAP_TXCSUM) { ifp->if_capenable ^= IFCAP_TXCSUM; ifp->if_hwassist ^= (CSUM_TCP | CSUM_UDP | CSUM_IP); if (IFCAP_TSO4 & ifp->if_capenable && !(IFCAP_TXCSUM & ifp->if_capenable)) { ifp->if_capenable &= ~IFCAP_TSO4; ifp->if_hwassist &= ~CSUM_IP_TSO; mlx5_en_err(ifp, "tso4 disabled due to -txcsum.\n"); } } if (mask & IFCAP_TXCSUM_IPV6) { ifp->if_capenable ^= IFCAP_TXCSUM_IPV6; ifp->if_hwassist ^= (CSUM_UDP_IPV6 | CSUM_TCP_IPV6); if (IFCAP_TSO6 & ifp->if_capenable && !(IFCAP_TXCSUM_IPV6 & ifp->if_capenable)) { ifp->if_capenable &= ~IFCAP_TSO6; ifp->if_hwassist &= ~CSUM_IP6_TSO; mlx5_en_err(ifp, "tso6 disabled due to -txcsum6.\n"); } } if (mask & IFCAP_NOMAP) ifp->if_capenable ^= IFCAP_NOMAP; if (mask & IFCAP_TXTLS4) ifp->if_capenable ^= IFCAP_TXTLS4; if (mask & IFCAP_TXTLS6) ifp->if_capenable ^= IFCAP_TXTLS6; if (mask & IFCAP_RXCSUM) ifp->if_capenable ^= IFCAP_RXCSUM; if (mask & IFCAP_RXCSUM_IPV6) ifp->if_capenable ^= IFCAP_RXCSUM_IPV6; if (mask & IFCAP_TSO4) { if (!(IFCAP_TSO4 & ifp->if_capenable) && !(IFCAP_TXCSUM & ifp->if_capenable)) { mlx5_en_err(ifp, "enable txcsum first.\n"); error = EAGAIN; goto out; } ifp->if_capenable ^= IFCAP_TSO4; ifp->if_hwassist ^= CSUM_IP_TSO; } if (mask & IFCAP_TSO6) { if (!(IFCAP_TSO6 & ifp->if_capenable) && !(IFCAP_TXCSUM_IPV6 & ifp->if_capenable)) { mlx5_en_err(ifp, "enable txcsum6 first.\n"); error = EAGAIN; goto out; } ifp->if_capenable ^= IFCAP_TSO6; ifp->if_hwassist ^= CSUM_IP6_TSO; } if (mask & IFCAP_VLAN_HWFILTER) { if (ifp->if_capenable & IFCAP_VLAN_HWFILTER) mlx5e_disable_vlan_filter(priv); else mlx5e_enable_vlan_filter(priv); ifp->if_capenable ^= IFCAP_VLAN_HWFILTER; } if (mask & IFCAP_VLAN_HWTAGGING) ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING; if (mask & IFCAP_WOL_MAGIC) ifp->if_capenable ^= IFCAP_WOL_MAGIC; VLAN_CAPABILITIES(ifp); /* turn off LRO means also turn of HW LRO - if it's on */ if (mask & IFCAP_LRO) { int was_opened = test_bit(MLX5E_STATE_OPENED, &priv->state); bool need_restart = false; ifp->if_capenable ^= IFCAP_LRO; /* figure out if updating HW LRO is needed */ if (!(ifp->if_capenable & IFCAP_LRO)) { if (priv->params.hw_lro_en) { priv->params.hw_lro_en = false; need_restart = true; } } else { if (priv->params.hw_lro_en == false && priv->params_ethtool.hw_lro != 0) { priv->params.hw_lro_en = true; need_restart = true; } } if (was_opened && need_restart) { mlx5e_close_locked(ifp); mlx5e_open_locked(ifp); } } if (mask & IFCAP_HWRXTSTMP) { ifp->if_capenable 
^= IFCAP_HWRXTSTMP; if (ifp->if_capenable & IFCAP_HWRXTSTMP) { if (priv->clbr_done == 0) mlx5e_reset_calibration_callout(priv); } else { callout_drain(&priv->tstmp_clbr); priv->clbr_done = 0; } } out: PRIV_UNLOCK(priv); break; case SIOCGI2C: ifr = (struct ifreq *)data; /* * Copy from the user-space address ifr_data to the * kernel-space address i2c */ error = copyin(ifr_data_get_ptr(ifr), &i2c, sizeof(i2c)); if (error) break; if (i2c.len > sizeof(i2c.data)) { error = EINVAL; break; } PRIV_LOCK(priv); /* Get module_num which is required for the query_eeprom */ error = mlx5_query_module_num(priv->mdev, &module_num); if (error) { mlx5_en_err(ifp, "Query module num failed, eeprom reading is not supported\n"); error = EINVAL; goto err_i2c; } /* Check if module is present before doing an access */ module_status = mlx5_query_module_status(priv->mdev, module_num); if (module_status != MLX5_MODULE_STATUS_PLUGGED_ENABLED) { error = EINVAL; goto err_i2c; } /* * Currently 0XA0 and 0xA2 are the only addresses permitted. * The internal conversion is as follows: */ if (i2c.dev_addr == 0xA0) read_addr = MLX5_I2C_ADDR_LOW; else if (i2c.dev_addr == 0xA2) read_addr = MLX5_I2C_ADDR_HIGH; else { mlx5_en_err(ifp, "Query eeprom failed, Invalid Address: %X\n", i2c.dev_addr); error = EINVAL; goto err_i2c; } error = mlx5_query_eeprom(priv->mdev, read_addr, MLX5_EEPROM_LOW_PAGE, (uint32_t)i2c.offset, (uint32_t)i2c.len, module_num, (uint32_t *)i2c.data, &size_read); if (error) { mlx5_en_err(ifp, "Query eeprom failed, eeprom reading is not supported\n"); error = EINVAL; goto err_i2c; } if (i2c.len > MLX5_EEPROM_MAX_BYTES) { error = mlx5_query_eeprom(priv->mdev, read_addr, MLX5_EEPROM_LOW_PAGE, (uint32_t)(i2c.offset + size_read), (uint32_t)(i2c.len - size_read), module_num, (uint32_t *)(i2c.data + size_read), &size_read); } if (error) { mlx5_en_err(ifp, "Query eeprom failed, eeprom reading is not supported\n"); error = EINVAL; goto err_i2c; } error = copyout(&i2c, ifr_data_get_ptr(ifr), sizeof(i2c)); err_i2c: PRIV_UNLOCK(priv); break; default: error = ether_ioctl(ifp, command, data); break; } return (error); } static int mlx5e_check_required_hca_cap(struct mlx5_core_dev *mdev) { /* * TODO: uncoment once FW really sets all these bits if * (!mdev->caps.eth.rss_ind_tbl_cap || !mdev->caps.eth.csum_cap || * !mdev->caps.eth.max_lso_cap || !mdev->caps.eth.vlan_cap || * !(mdev->caps.gen.flags & MLX5_DEV_CAP_FLAG_SCQE_BRK_MOD)) return * -ENOTSUPP; */ /* TODO: add more must-to-have features */ if (MLX5_CAP_GEN(mdev, port_type) != MLX5_CAP_PORT_TYPE_ETH) return (-ENODEV); return (0); } static u16 mlx5e_get_max_inline_cap(struct mlx5_core_dev *mdev) { uint32_t bf_buf_size = (1U << MLX5_CAP_GEN(mdev, log_bf_reg_size)) / 2U; bf_buf_size -= sizeof(struct mlx5e_tx_wqe) - 2; /* verify against driver hardware limit */ if (bf_buf_size > MLX5E_MAX_TX_INLINE) bf_buf_size = MLX5E_MAX_TX_INLINE; return (bf_buf_size); } static int mlx5e_build_ifp_priv(struct mlx5_core_dev *mdev, struct mlx5e_priv *priv, int num_comp_vectors) { int err; /* * TODO: Consider link speed for setting "log_sq_size", * "log_rq_size" and "cq_moderation_xxx": */ priv->params.log_sq_size = MLX5E_PARAMS_DEFAULT_LOG_SQ_SIZE; priv->params.log_rq_size = MLX5E_PARAMS_DEFAULT_LOG_RQ_SIZE; priv->params.rx_cq_moderation_usec = MLX5_CAP_GEN(mdev, cq_period_start_from_cqe) ? MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_USEC_FROM_CQE : MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_USEC; priv->params.rx_cq_moderation_mode = MLX5_CAP_GEN(mdev, cq_period_start_from_cqe) ? 
1 : 0; priv->params.rx_cq_moderation_pkts = MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_PKTS; priv->params.tx_cq_moderation_usec = MLX5E_PARAMS_DEFAULT_TX_CQ_MODERATION_USEC; priv->params.tx_cq_moderation_pkts = MLX5E_PARAMS_DEFAULT_TX_CQ_MODERATION_PKTS; priv->params.min_rx_wqes = MLX5E_PARAMS_DEFAULT_MIN_RX_WQES; priv->params.rx_hash_log_tbl_sz = (order_base_2(num_comp_vectors) > MLX5E_PARAMS_DEFAULT_RX_HASH_LOG_TBL_SZ) ? order_base_2(num_comp_vectors) : MLX5E_PARAMS_DEFAULT_RX_HASH_LOG_TBL_SZ; priv->params.num_tc = 1; priv->params.default_vlan_prio = 0; priv->counter_set_id = -1; priv->params.tx_max_inline = mlx5e_get_max_inline_cap(mdev); err = mlx5_query_min_inline(mdev, &priv->params.tx_min_inline_mode); if (err) return (err); /* * hw lro is currently defaulted to off. when it won't anymore we * will consider the HW capability: "!!MLX5_CAP_ETH(mdev, lro_cap)" */ priv->params.hw_lro_en = false; priv->params.lro_wqe_sz = MLX5E_PARAMS_DEFAULT_LRO_WQE_SZ; /* * CQE zipping is currently defaulted to off. when it won't * anymore we will consider the HW capability: * "!!MLX5_CAP_GEN(mdev, cqe_compression)" */ priv->params.cqe_zipping_en = false; priv->mdev = mdev; priv->params.num_channels = num_comp_vectors; priv->params.channels_rsss = 1; priv->order_base_2_num_channels = order_base_2(num_comp_vectors); priv->queue_mapping_channel_mask = roundup_pow_of_two(num_comp_vectors) - 1; priv->num_tc = priv->params.num_tc; priv->default_vlan_prio = priv->params.default_vlan_prio; INIT_WORK(&priv->update_stats_work, mlx5e_update_stats_work); INIT_WORK(&priv->update_carrier_work, mlx5e_update_carrier_work); INIT_WORK(&priv->set_rx_mode_work, mlx5e_set_rx_mode_work); return (0); } static int mlx5e_create_mkey(struct mlx5e_priv *priv, u32 pdn, struct mlx5_core_mr *mkey) { struct ifnet *ifp = priv->ifp; struct mlx5_core_dev *mdev = priv->mdev; int inlen = MLX5_ST_SZ_BYTES(create_mkey_in); void *mkc; u32 *in; int err; in = mlx5_vzalloc(inlen); if (in == NULL) { mlx5_en_err(ifp, "failed to allocate inbox\n"); return (-ENOMEM); } mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); MLX5_SET(mkc, mkc, access_mode, MLX5_ACCESS_MODE_PA); MLX5_SET(mkc, mkc, umr_en, 1); /* used by HW TLS */ MLX5_SET(mkc, mkc, lw, 1); MLX5_SET(mkc, mkc, lr, 1); MLX5_SET(mkc, mkc, pd, pdn); MLX5_SET(mkc, mkc, length64, 1); MLX5_SET(mkc, mkc, qpn, 0xffffff); err = mlx5_core_create_mkey(mdev, mkey, in, inlen); if (err) mlx5_en_err(ifp, "mlx5_core_create_mkey failed, %d\n", err); kvfree(in); return (err); } static const char *mlx5e_vport_stats_desc[] = { MLX5E_VPORT_STATS(MLX5E_STATS_DESC) }; static const char *mlx5e_pport_stats_desc[] = { MLX5E_PPORT_STATS(MLX5E_STATS_DESC) }; static void mlx5e_priv_static_init(struct mlx5e_priv *priv, const uint32_t channels) { uint32_t x; mtx_init(&priv->async_events_mtx, "mlx5async", MTX_NETWORK_LOCK, MTX_DEF); sx_init(&priv->state_lock, "mlx5state"); callout_init_mtx(&priv->watchdog, &priv->async_events_mtx, 0); MLX5_INIT_DOORBELL_LOCK(&priv->doorbell_lock); for (x = 0; x != channels; x++) mlx5e_chan_static_init(priv, &priv->channel[x], x); } static void mlx5e_priv_static_destroy(struct mlx5e_priv *priv, const uint32_t channels) { uint32_t x; for (x = 0; x != channels; x++) mlx5e_chan_static_destroy(&priv->channel[x]); callout_drain(&priv->watchdog); mtx_destroy(&priv->async_events_mtx); sx_destroy(&priv->state_lock); } static int sysctl_firmware(SYSCTL_HANDLER_ARGS) { /* * %d.%d%.d the string format. * fw_rev_{maj,min,sub} return u16, 2^16 = 65536. * We need at most 5 chars to store that. 
* It also has: two "." and NULL at the end, which means we need 18 * (5*3 + 3) chars at most. */ char fw[18]; struct mlx5e_priv *priv = arg1; int error; snprintf(fw, sizeof(fw), "%d.%d.%d", fw_rev_maj(priv->mdev), fw_rev_min(priv->mdev), fw_rev_sub(priv->mdev)); error = sysctl_handle_string(oidp, fw, sizeof(fw), req); return (error); } static void mlx5e_disable_tx_dma(struct mlx5e_channel *ch) { int i; for (i = 0; i < ch->priv->num_tc; i++) mlx5e_drain_sq(&ch->sq[i]); } static void mlx5e_reset_sq_doorbell_record(struct mlx5e_sq *sq) { sq->doorbell.d32[0] = cpu_to_be32(MLX5_OPCODE_NOP); sq->doorbell.d32[1] = cpu_to_be32(sq->sqn << 8); mlx5e_tx_notify_hw(sq, sq->doorbell.d32, 0); sq->doorbell.d64 = 0; } void mlx5e_resume_sq(struct mlx5e_sq *sq) { int err; /* check if already enabled */ if (READ_ONCE(sq->running) != 0) return; err = mlx5e_modify_sq(sq, MLX5_SQC_STATE_ERR, MLX5_SQC_STATE_RST); if (err != 0) { mlx5_en_err(sq->ifp, "mlx5e_modify_sq() from ERR to RST failed: %d\n", err); } sq->cc = 0; sq->pc = 0; /* reset doorbell prior to moving from RST to RDY */ mlx5e_reset_sq_doorbell_record(sq); err = mlx5e_modify_sq(sq, MLX5_SQC_STATE_RST, MLX5_SQC_STATE_RDY); if (err != 0) { mlx5_en_err(sq->ifp, "mlx5e_modify_sq() from RST to RDY failed: %d\n", err); } sq->cev_next_state = MLX5E_CEV_STATE_INITIAL; WRITE_ONCE(sq->running, 1); } static void mlx5e_enable_tx_dma(struct mlx5e_channel *ch) { int i; for (i = 0; i < ch->priv->num_tc; i++) mlx5e_resume_sq(&ch->sq[i]); } static void mlx5e_disable_rx_dma(struct mlx5e_channel *ch) { struct mlx5e_rq *rq = &ch->rq; struct epoch_tracker et; int err; mtx_lock(&rq->mtx); rq->enabled = 0; callout_stop(&rq->watchdog); mtx_unlock(&rq->mtx); err = mlx5e_modify_rq(rq, MLX5_RQC_STATE_RDY, MLX5_RQC_STATE_ERR); if (err != 0) { mlx5_en_err(rq->ifp, "mlx5e_modify_rq() from RDY to RST failed: %d\n", err); } while (!mlx5_wq_ll_is_empty(&rq->wq)) { msleep(1); NET_EPOCH_ENTER(et); rq->cq.mcq.comp(&rq->cq.mcq); NET_EPOCH_EXIT(et); } /* * Transitioning into RST state will allow the FW to track less ERR state queues, * thus reducing the recv queue flushing time */ err = mlx5e_modify_rq(rq, MLX5_RQC_STATE_ERR, MLX5_RQC_STATE_RST); if (err != 0) { mlx5_en_err(rq->ifp, "mlx5e_modify_rq() from ERR to RST failed: %d\n", err); } } static void mlx5e_enable_rx_dma(struct mlx5e_channel *ch) { struct mlx5e_rq *rq = &ch->rq; struct epoch_tracker et; int err; rq->wq.wqe_ctr = 0; mlx5_wq_ll_update_db_record(&rq->wq); err = mlx5e_modify_rq(rq, MLX5_RQC_STATE_RST, MLX5_RQC_STATE_RDY); if (err != 0) { mlx5_en_err(rq->ifp, "mlx5e_modify_rq() from RST to RDY failed: %d\n", err); } rq->enabled = 1; NET_EPOCH_ENTER(et); rq->cq.mcq.comp(&rq->cq.mcq); NET_EPOCH_EXIT(et); } void mlx5e_modify_tx_dma(struct mlx5e_priv *priv, uint8_t value) { int i; if (test_bit(MLX5E_STATE_OPENED, &priv->state) == 0) return; for (i = 0; i < priv->params.num_channels; i++) { if (value) mlx5e_disable_tx_dma(&priv->channel[i]); else mlx5e_enable_tx_dma(&priv->channel[i]); } } void mlx5e_modify_rx_dma(struct mlx5e_priv *priv, uint8_t value) { int i; if (test_bit(MLX5E_STATE_OPENED, &priv->state) == 0) return; for (i = 0; i < priv->params.num_channels; i++) { if (value) mlx5e_disable_rx_dma(&priv->channel[i]); else mlx5e_enable_rx_dma(&priv->channel[i]); } } static void mlx5e_add_hw_stats(struct mlx5e_priv *priv) { SYSCTL_ADD_PROC(&priv->sysctl_ctx, SYSCTL_CHILDREN(priv->sysctl_hw), OID_AUTO, "fw_version", CTLTYPE_STRING | CTLFLAG_RD, priv, 0, sysctl_firmware, "A", "HCA firmware version"); 
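/*
 * Editor's aside: the sysctl_firmware() handler registered just above
 * sizes its buffer as 3 * 5 + 3 = 18 bytes: three u16 revision fields
 * of at most five decimal digits each, two '.' separators and the
 * terminating NUL.  The standalone sketch below checks that worst case
 * with snprintf(); the 65535 sample values are the u16 maximum, used
 * here purely for illustration.
 */
#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	char fw[18];
	uint16_t maj = 65535, min = 65535, sub = 65535;
	int n;

	n = snprintf(fw, sizeof(fw), "%d.%d.%d", maj, min, sub);
	printf("\"%s\": %d characters + NUL fit in %zu bytes\n",
	    fw, n, sizeof(fw));
	return (0);
}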
SYSCTL_ADD_STRING(&priv->sysctl_ctx, SYSCTL_CHILDREN(priv->sysctl_hw), OID_AUTO, "board_id", CTLFLAG_RD, priv->mdev->board_id, 0, "Board ID"); } static int mlx5e_sysctl_tx_priority_flow_control(SYSCTL_HANDLER_ARGS) { struct mlx5e_priv *priv = arg1; uint8_t temp[MLX5E_MAX_PRIORITY]; uint32_t tx_pfc; int err; int i; PRIV_LOCK(priv); tx_pfc = priv->params.tx_priority_flow_control; for (i = 0; i != MLX5E_MAX_PRIORITY; i++) temp[i] = (tx_pfc >> i) & 1; err = SYSCTL_OUT(req, temp, MLX5E_MAX_PRIORITY); if (err || !req->newptr) goto done; err = SYSCTL_IN(req, temp, MLX5E_MAX_PRIORITY); if (err) goto done; priv->params.tx_priority_flow_control = 0; /* range check input value */ for (i = 0; i != MLX5E_MAX_PRIORITY; i++) { if (temp[i] > 1) { err = ERANGE; goto done; } priv->params.tx_priority_flow_control |= (temp[i] << i); } /* check if update is required */ if (tx_pfc != priv->params.tx_priority_flow_control) err = -mlx5e_set_port_pfc(priv); done: if (err != 0) priv->params.tx_priority_flow_control= tx_pfc; PRIV_UNLOCK(priv); return (err); } static int mlx5e_sysctl_rx_priority_flow_control(SYSCTL_HANDLER_ARGS) { struct mlx5e_priv *priv = arg1; uint8_t temp[MLX5E_MAX_PRIORITY]; uint32_t rx_pfc; int err; int i; PRIV_LOCK(priv); rx_pfc = priv->params.rx_priority_flow_control; for (i = 0; i != MLX5E_MAX_PRIORITY; i++) temp[i] = (rx_pfc >> i) & 1; err = SYSCTL_OUT(req, temp, MLX5E_MAX_PRIORITY); if (err || !req->newptr) goto done; err = SYSCTL_IN(req, temp, MLX5E_MAX_PRIORITY); if (err) goto done; priv->params.rx_priority_flow_control = 0; /* range check input value */ for (i = 0; i != MLX5E_MAX_PRIORITY; i++) { if (temp[i] > 1) { err = ERANGE; goto done; } priv->params.rx_priority_flow_control |= (temp[i] << i); } /* check if update is required */ if (rx_pfc != priv->params.rx_priority_flow_control) { err = -mlx5e_set_port_pfc(priv); if (err == 0 && priv->sw_is_port_buf_owner) err = mlx5e_update_buf_lossy(priv); } done: if (err != 0) priv->params.rx_priority_flow_control= rx_pfc; PRIV_UNLOCK(priv); return (err); } static void mlx5e_setup_pauseframes(struct mlx5e_priv *priv) { #if (__FreeBSD_version < 1100000) char path[96]; #endif int error; /* enable pauseframes by default */ priv->params.tx_pauseframe_control = 1; priv->params.rx_pauseframe_control = 1; /* disable ports flow control, PFC, by default */ priv->params.tx_priority_flow_control = 0; priv->params.rx_priority_flow_control = 0; #if (__FreeBSD_version < 1100000) /* compute path for sysctl */ snprintf(path, sizeof(path), "dev.mce.%d.tx_pauseframe_control", device_get_unit(priv->mdev->pdev->dev.bsddev)); /* try to fetch tunable, if any */ TUNABLE_INT_FETCH(path, &priv->params.tx_pauseframe_control); /* compute path for sysctl */ snprintf(path, sizeof(path), "dev.mce.%d.rx_pauseframe_control", device_get_unit(priv->mdev->pdev->dev.bsddev)); /* try to fetch tunable, if any */ TUNABLE_INT_FETCH(path, &priv->params.rx_pauseframe_control); #endif /* register pauseframe SYSCTLs */ SYSCTL_ADD_INT(&priv->sysctl_ctx, SYSCTL_CHILDREN(priv->sysctl_ifnet), OID_AUTO, "tx_pauseframe_control", CTLFLAG_RDTUN, &priv->params.tx_pauseframe_control, 0, "Set to enable TX pause frames. Clear to disable."); SYSCTL_ADD_INT(&priv->sysctl_ctx, SYSCTL_CHILDREN(priv->sysctl_ifnet), OID_AUTO, "rx_pauseframe_control", CTLFLAG_RDTUN, &priv->params.rx_pauseframe_control, 0, "Set to enable RX pause frames. 
Clear to disable."); /* register priority flow control, PFC, SYSCTLs */ SYSCTL_ADD_PROC(&priv->sysctl_ctx, SYSCTL_CHILDREN(priv->sysctl_ifnet), OID_AUTO, "tx_priority_flow_control", CTLTYPE_U8 | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, priv, 0, &mlx5e_sysctl_tx_priority_flow_control, "CU", "Set to enable TX ports flow control frames for priorities 0..7. Clear to disable."); SYSCTL_ADD_PROC(&priv->sysctl_ctx, SYSCTL_CHILDREN(priv->sysctl_ifnet), OID_AUTO, "rx_priority_flow_control", CTLTYPE_U8 | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, priv, 0, &mlx5e_sysctl_rx_priority_flow_control, "CU", "Set to enable RX ports flow control frames for priorities 0..7. Clear to disable."); PRIV_LOCK(priv); /* range check */ priv->params.tx_pauseframe_control = priv->params.tx_pauseframe_control ? 1 : 0; priv->params.rx_pauseframe_control = priv->params.rx_pauseframe_control ? 1 : 0; /* update firmware */ error = mlx5e_set_port_pause_and_pfc(priv); if (error == -EINVAL) { mlx5_en_err(priv->ifp, "Global pauseframes must be disabled before enabling PFC.\n"); priv->params.rx_priority_flow_control = 0; priv->params.tx_priority_flow_control = 0; /* update firmware */ (void) mlx5e_set_port_pause_and_pfc(priv); } PRIV_UNLOCK(priv); } int mlx5e_ul_snd_tag_alloc(struct ifnet *ifp, union if_snd_tag_alloc_params *params, struct m_snd_tag **ppmt) { struct mlx5e_priv *priv; struct mlx5e_channel *pch; priv = ifp->if_softc; if (unlikely(priv->gone || params->hdr.flowtype == M_HASHTYPE_NONE)) { return (EOPNOTSUPP); } else { /* keep this code synced with mlx5e_select_queue() */ u32 ch = priv->params.num_channels; #ifdef RSS u32 temp; if (rss_hash2bucket(params->hdr.flowid, params->hdr.flowtype, &temp) == 0) ch = temp % ch; else #endif ch = (params->hdr.flowid % 128) % ch; /* * NOTE: The channels array is only freed at detach * and it safe to return a pointer to the send tag * inside the channels structure as long as we * reference the priv. 
*/ pch = priv->channel + ch; /* check if send queue is not running */ if (unlikely(pch->sq[0].running == 0)) return (ENXIO); m_snd_tag_ref(&pch->tag.m_snd_tag); *ppmt = &pch->tag.m_snd_tag; return (0); } } int mlx5e_ul_snd_tag_query(struct m_snd_tag *pmt, union if_snd_tag_query_params *params) { struct mlx5e_channel *pch = container_of(pmt, struct mlx5e_channel, tag.m_snd_tag); params->unlimited.max_rate = -1ULL; params->unlimited.queue_level = mlx5e_sq_queue_level(&pch->sq[0]); return (0); } void mlx5e_ul_snd_tag_free(struct m_snd_tag *pmt) { struct mlx5e_channel *pch = container_of(pmt, struct mlx5e_channel, tag.m_snd_tag); complete(&pch->completion); } static int mlx5e_snd_tag_alloc(struct ifnet *ifp, union if_snd_tag_alloc_params *params, struct m_snd_tag **ppmt) { switch (params->hdr.type) { #ifdef RATELIMIT case IF_SND_TAG_TYPE_RATE_LIMIT: return (mlx5e_rl_snd_tag_alloc(ifp, params, ppmt)); #if defined(KERN_TLS) && defined(IF_SND_TAG_TYPE_TLS_RATE_LIMIT) case IF_SND_TAG_TYPE_TLS_RATE_LIMIT: return (mlx5e_tls_snd_tag_alloc(ifp, params, ppmt)); #endif #endif case IF_SND_TAG_TYPE_UNLIMITED: return (mlx5e_ul_snd_tag_alloc(ifp, params, ppmt)); #ifdef KERN_TLS case IF_SND_TAG_TYPE_TLS: return (mlx5e_tls_snd_tag_alloc(ifp, params, ppmt)); #endif default: return (EOPNOTSUPP); } } static int mlx5e_snd_tag_modify(struct m_snd_tag *pmt, union if_snd_tag_modify_params *params) { struct mlx5e_snd_tag *tag = container_of(pmt, struct mlx5e_snd_tag, m_snd_tag); switch (tag->type) { #ifdef RATELIMIT case IF_SND_TAG_TYPE_RATE_LIMIT: return (mlx5e_rl_snd_tag_modify(pmt, params)); #if defined(KERN_TLS) && defined(IF_SND_TAG_TYPE_TLS_RATE_LIMIT) case IF_SND_TAG_TYPE_TLS_RATE_LIMIT: return (mlx5e_tls_snd_tag_modify(pmt, params)); #endif #endif case IF_SND_TAG_TYPE_UNLIMITED: #ifdef KERN_TLS case IF_SND_TAG_TYPE_TLS: #endif default: return (EOPNOTSUPP); } } static int mlx5e_snd_tag_query(struct m_snd_tag *pmt, union if_snd_tag_query_params *params) { struct mlx5e_snd_tag *tag = container_of(pmt, struct mlx5e_snd_tag, m_snd_tag); switch (tag->type) { #ifdef RATELIMIT case IF_SND_TAG_TYPE_RATE_LIMIT: return (mlx5e_rl_snd_tag_query(pmt, params)); #if defined(KERN_TLS) && defined(IF_SND_TAG_TYPE_TLS_RATE_LIMIT) case IF_SND_TAG_TYPE_TLS_RATE_LIMIT: return (mlx5e_tls_snd_tag_query(pmt, params)); #endif #endif case IF_SND_TAG_TYPE_UNLIMITED: return (mlx5e_ul_snd_tag_query(pmt, params)); #ifdef KERN_TLS case IF_SND_TAG_TYPE_TLS: return (mlx5e_tls_snd_tag_query(pmt, params)); #endif default: return (EOPNOTSUPP); } } #ifdef RATELIMIT #define NUM_HDWR_RATES_MLX 13 static const uint64_t adapter_rates_mlx[NUM_HDWR_RATES_MLX] = { 135375, /* 1,083,000 */ 180500, /* 1,444,000 */ 270750, /* 2,166,000 */ 361000, /* 2,888,000 */ 541500, /* 4,332,000 */ 721875, /* 5,775,000 */ 1082875, /* 8,663,000 */ 1443875, /* 11,551,000 */ 2165750, /* 17,326,000 */ 2887750, /* 23,102,000 */ 4331625, /* 34,653,000 */ 5775500, /* 46,204,000 */ 8663125 /* 69,305,000 */ }; static void mlx5e_ratelimit_query(struct ifnet *ifp __unused, struct if_ratelimit_query_results *q) { /* * This function needs updating by the driver maintainer! * For the MLX card there are currently (ConectX-4?) 13 * pre-set rates and others i.e. ConnectX-5, 6, 7?? * * This will change based on later adapters * and this code should be updated to look at ifp * and figure out the specific adapter type * settings i.e. how many rates as well * as if they are fixed (as is shown here) or * if they are dynamic (example chelsio t4). 
Also if there * is a maximum number of flows that the adapter * can handle that too needs to be updated in * the max_flows field. */ q->rate_table = adapter_rates_mlx; q->flags = RT_IS_FIXED_TABLE; q->max_flows = 0; /* mlx has no limit */ q->number_of_rates = NUM_HDWR_RATES_MLX; q->min_segment_burst = 1; } #endif static void mlx5e_snd_tag_free(struct m_snd_tag *pmt) { struct mlx5e_snd_tag *tag = container_of(pmt, struct mlx5e_snd_tag, m_snd_tag); switch (tag->type) { #ifdef RATELIMIT case IF_SND_TAG_TYPE_RATE_LIMIT: mlx5e_rl_snd_tag_free(pmt); break; #if defined(KERN_TLS) && defined(IF_SND_TAG_TYPE_TLS_RATE_LIMIT) case IF_SND_TAG_TYPE_TLS_RATE_LIMIT: mlx5e_tls_snd_tag_free(pmt); break; #endif #endif case IF_SND_TAG_TYPE_UNLIMITED: mlx5e_ul_snd_tag_free(pmt); break; #ifdef KERN_TLS case IF_SND_TAG_TYPE_TLS: mlx5e_tls_snd_tag_free(pmt); break; #endif default: break; } } static void * mlx5e_create_ifp(struct mlx5_core_dev *mdev) { struct ifnet *ifp; struct mlx5e_priv *priv; u8 dev_addr[ETHER_ADDR_LEN] __aligned(4); u8 connector_type; struct sysctl_oid_list *child; int ncv = mdev->priv.eq_table.num_comp_vectors; char unit[16]; struct pfil_head_args pa; int err; int i,j; u32 eth_proto_cap; u32 out[MLX5_ST_SZ_DW(ptys_reg)]; bool ext = 0; u32 speeds_num; struct media media_entry = {}; if (mlx5e_check_required_hca_cap(mdev)) { mlx5_core_dbg(mdev, "mlx5e_check_required_hca_cap() failed\n"); return (NULL); } /* * Try to allocate the priv and make room for worst-case * number of channel structures: */ priv = malloc(sizeof(*priv) + (sizeof(priv->channel[0]) * mdev->priv.eq_table.num_comp_vectors), M_MLX5EN, M_WAITOK | M_ZERO); ifp = priv->ifp = if_alloc_dev(IFT_ETHER, mdev->pdev->dev.bsddev); if (ifp == NULL) { mlx5_core_err(mdev, "if_alloc() failed\n"); goto err_free_priv; } /* setup all static fields */ mlx5e_priv_static_init(priv, mdev->priv.eq_table.num_comp_vectors); ifp->if_softc = priv; if_initname(ifp, "mce", device_get_unit(mdev->pdev->dev.bsddev)); ifp->if_mtu = ETHERMTU; ifp->if_init = mlx5e_open; - ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; + ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST | + IFF_KNOWSEPOCH; ifp->if_ioctl = mlx5e_ioctl; ifp->if_transmit = mlx5e_xmit; ifp->if_qflush = if_qflush; #if (__FreeBSD_version >= 1100000) ifp->if_get_counter = mlx5e_get_counter; #endif ifp->if_snd.ifq_maxlen = ifqmaxlen; /* * Set driver features */ ifp->if_capabilities |= IFCAP_HWCSUM | IFCAP_HWCSUM_IPV6; ifp->if_capabilities |= IFCAP_VLAN_MTU | IFCAP_VLAN_HWTAGGING; ifp->if_capabilities |= IFCAP_VLAN_HWCSUM | IFCAP_VLAN_HWFILTER; ifp->if_capabilities |= IFCAP_LINKSTATE | IFCAP_JUMBO_MTU; ifp->if_capabilities |= IFCAP_LRO; ifp->if_capabilities |= IFCAP_TSO | IFCAP_VLAN_HWTSO; ifp->if_capabilities |= IFCAP_HWSTATS | IFCAP_HWRXTSTMP; ifp->if_capabilities |= IFCAP_NOMAP; ifp->if_capabilities |= IFCAP_TXTLS4 | IFCAP_TXTLS6; ifp->if_capabilities |= IFCAP_TXRTLMT; ifp->if_snd_tag_alloc = mlx5e_snd_tag_alloc; ifp->if_snd_tag_free = mlx5e_snd_tag_free; ifp->if_snd_tag_modify = mlx5e_snd_tag_modify; ifp->if_snd_tag_query = mlx5e_snd_tag_query; #ifdef RATELIMIT ifp->if_ratelimit_query = mlx5e_ratelimit_query; #endif /* set TSO limits so that we don't have to drop TX packets */ ifp->if_hw_tsomax = MLX5E_MAX_TX_PAYLOAD_SIZE - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN); ifp->if_hw_tsomaxsegcount = MLX5E_MAX_TX_MBUF_FRAGS - 1 /* hdr */; ifp->if_hw_tsomaxsegsize = MLX5E_MAX_TX_MBUF_SIZE; ifp->if_capenable = ifp->if_capabilities; ifp->if_hwassist = 0; if (ifp->if_capenable & IFCAP_TSO) 
ifp->if_hwassist |= CSUM_TSO; if (ifp->if_capenable & IFCAP_TXCSUM) ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP | CSUM_IP); if (ifp->if_capenable & IFCAP_TXCSUM_IPV6) ifp->if_hwassist |= (CSUM_UDP_IPV6 | CSUM_TCP_IPV6); /* ifnet sysctl tree */ sysctl_ctx_init(&priv->sysctl_ctx); priv->sysctl_ifnet = SYSCTL_ADD_NODE(&priv->sysctl_ctx, SYSCTL_STATIC_CHILDREN(_dev), OID_AUTO, ifp->if_dname, CTLFLAG_RD, 0, "MLX5 ethernet - interface name"); if (priv->sysctl_ifnet == NULL) { mlx5_core_err(mdev, "SYSCTL_ADD_NODE() failed\n"); goto err_free_sysctl; } snprintf(unit, sizeof(unit), "%d", ifp->if_dunit); priv->sysctl_ifnet = SYSCTL_ADD_NODE(&priv->sysctl_ctx, SYSCTL_CHILDREN(priv->sysctl_ifnet), OID_AUTO, unit, CTLFLAG_RD, 0, "MLX5 ethernet - interface unit"); if (priv->sysctl_ifnet == NULL) { mlx5_core_err(mdev, "SYSCTL_ADD_NODE() failed\n"); goto err_free_sysctl; } /* HW sysctl tree */ child = SYSCTL_CHILDREN(device_get_sysctl_tree(mdev->pdev->dev.bsddev)); priv->sysctl_hw = SYSCTL_ADD_NODE(&priv->sysctl_ctx, child, OID_AUTO, "hw", CTLFLAG_RD, 0, "MLX5 ethernet dev hw"); if (priv->sysctl_hw == NULL) { mlx5_core_err(mdev, "SYSCTL_ADD_NODE() failed\n"); goto err_free_sysctl; } err = mlx5e_build_ifp_priv(mdev, priv, ncv); if (err) { mlx5_core_err(mdev, "mlx5e_build_ifp_priv() failed (%d)\n", err); goto err_free_sysctl; } /* reuse mlx5core's watchdog workqueue */ priv->wq = mdev->priv.health.wq_watchdog; err = mlx5_alloc_map_uar(mdev, &priv->cq_uar); if (err) { mlx5_en_err(ifp, "mlx5_alloc_map_uar failed, %d\n", err); goto err_free_wq; } err = mlx5_core_alloc_pd(mdev, &priv->pdn); if (err) { mlx5_en_err(ifp, "mlx5_core_alloc_pd failed, %d\n", err); goto err_unmap_free_uar; } err = mlx5_alloc_transport_domain(mdev, &priv->tdn); if (err) { mlx5_en_err(ifp, "mlx5_alloc_transport_domain failed, %d\n", err); goto err_dealloc_pd; } err = mlx5e_create_mkey(priv, priv->pdn, &priv->mr); if (err) { mlx5_en_err(ifp, "mlx5e_create_mkey failed, %d\n", err); goto err_dealloc_transport_domain; } mlx5_query_nic_vport_mac_address(priv->mdev, 0, dev_addr); /* check if we should generate a random MAC address */ if (MLX5_CAP_GEN(priv->mdev, vport_group_manager) == 0 && is_zero_ether_addr(dev_addr)) { random_ether_addr(dev_addr); mlx5_en_err(ifp, "Assigned random MAC address\n"); } err = mlx5e_rl_init(priv); if (err) { mlx5_en_err(ifp, "mlx5e_rl_init failed, %d\n", err); goto err_create_mkey; } err = mlx5e_tls_init(priv); if (err) { if_printf(ifp, "%s: mlx5e_tls_init failed\n", __func__); goto err_rl_init; } /* set default MTU */ mlx5e_set_dev_port_mtu(ifp, ifp->if_mtu); /* Set default media status */ priv->media_status_last = IFM_AVALID; priv->media_active_last = IFM_ETHER | IFM_AUTO | IFM_ETH_RXPAUSE | IFM_FDX; /* setup default pauseframes configuration */ mlx5e_setup_pauseframes(priv); /* Setup supported medias */ //TODO: If we failed to query ptys is it ok to proceed?? if (!mlx5_query_port_ptys(mdev, out, sizeof(out), MLX5_PTYS_EN, 1)) { ext = MLX5_CAP_PCAM_FEATURE(mdev, ptys_extended_ethernet); eth_proto_cap = MLX5_GET_ETH_PROTO(ptys_reg, out, ext, eth_proto_capability); if (MLX5_CAP_PCAM_FEATURE(mdev, ptys_connector_type)) connector_type = MLX5_GET(ptys_reg, out, connector_type); } else { eth_proto_cap = 0; mlx5_en_err(ifp, "Query port media capability failed, %d\n", err); } ifmedia_init(&priv->media, IFM_IMASK | IFM_ETH_FMASK, mlx5e_media_change, mlx5e_media_status); speeds_num = ext ? 
MLX5E_EXT_LINK_SPEEDS_NUMBER : MLX5E_LINK_SPEEDS_NUMBER; for (i = 0; i != speeds_num; i++) { for (j = 0; j < MLX5E_LINK_MODES_NUMBER ; ++j) { media_entry = ext ? mlx5e_ext_mode_table[i][j] : mlx5e_mode_table[i][j]; if (media_entry.baudrate == 0) continue; if (MLX5E_PROT_MASK(i) & eth_proto_cap) { ifmedia_add(&priv->media, media_entry.subtype | IFM_ETHER, 0, NULL); ifmedia_add(&priv->media, media_entry.subtype | IFM_ETHER | IFM_FDX | IFM_ETH_RXPAUSE | IFM_ETH_TXPAUSE, 0, NULL); } } } ifmedia_add(&priv->media, IFM_ETHER | IFM_AUTO, 0, NULL); ifmedia_add(&priv->media, IFM_ETHER | IFM_AUTO | IFM_FDX | IFM_ETH_RXPAUSE | IFM_ETH_TXPAUSE, 0, NULL); /* Set autoselect by default */ ifmedia_set(&priv->media, IFM_ETHER | IFM_AUTO | IFM_FDX | IFM_ETH_RXPAUSE | IFM_ETH_TXPAUSE); DEBUGNET_SET(ifp, mlx5_en); ether_ifattach(ifp, dev_addr); /* Register for VLAN events */ priv->vlan_attach = EVENTHANDLER_REGISTER(vlan_config, mlx5e_vlan_rx_add_vid, priv, EVENTHANDLER_PRI_FIRST); priv->vlan_detach = EVENTHANDLER_REGISTER(vlan_unconfig, mlx5e_vlan_rx_kill_vid, priv, EVENTHANDLER_PRI_FIRST); /* Link is down by default */ if_link_state_change(ifp, LINK_STATE_DOWN); mlx5e_enable_async_events(priv); mlx5e_add_hw_stats(priv); mlx5e_create_stats(&priv->stats.vport.ctx, SYSCTL_CHILDREN(priv->sysctl_ifnet), "vstats", mlx5e_vport_stats_desc, MLX5E_VPORT_STATS_NUM, priv->stats.vport.arg); mlx5e_create_stats(&priv->stats.pport.ctx, SYSCTL_CHILDREN(priv->sysctl_ifnet), "pstats", mlx5e_pport_stats_desc, MLX5E_PPORT_STATS_NUM, priv->stats.pport.arg); mlx5e_create_ethtool(priv); mtx_lock(&priv->async_events_mtx); mlx5e_update_stats(priv); mtx_unlock(&priv->async_events_mtx); SYSCTL_ADD_INT(&priv->sysctl_ctx, SYSCTL_CHILDREN(priv->sysctl_ifnet), OID_AUTO, "rx_clbr_done", CTLFLAG_RD, &priv->clbr_done, 0, "RX timestamps calibration state"); callout_init(&priv->tstmp_clbr, CALLOUT_DIRECT); mlx5e_reset_calibration_callout(priv); pa.pa_version = PFIL_VERSION; pa.pa_flags = PFIL_IN; pa.pa_type = PFIL_TYPE_ETHERNET; pa.pa_headname = ifp->if_xname; priv->pfil = pfil_head_register(&pa); return (priv); err_rl_init: mlx5e_rl_cleanup(priv); err_create_mkey: mlx5_core_destroy_mkey(priv->mdev, &priv->mr); err_dealloc_transport_domain: mlx5_dealloc_transport_domain(mdev, priv->tdn); err_dealloc_pd: mlx5_core_dealloc_pd(mdev, priv->pdn); err_unmap_free_uar: mlx5_unmap_free_uar(mdev, &priv->cq_uar); err_free_wq: flush_workqueue(priv->wq); err_free_sysctl: sysctl_ctx_free(&priv->sysctl_ctx); if (priv->sysctl_debug) sysctl_ctx_free(&priv->stats.port_stats_debug.ctx); mlx5e_priv_static_destroy(priv, mdev->priv.eq_table.num_comp_vectors); if_free(ifp); err_free_priv: free(priv, M_MLX5EN); return (NULL); } static void mlx5e_destroy_ifp(struct mlx5_core_dev *mdev, void *vpriv) { struct mlx5e_priv *priv = vpriv; struct ifnet *ifp = priv->ifp; /* don't allow more IOCTLs */ priv->gone = 1; /* XXX wait a bit to allow IOCTL handlers to complete */ pause("W", hz); #ifdef RATELIMIT /* * The kernel can have reference(s) via the m_snd_tag's into * the ratelimit channels, and these must go away before * detaching: */ while (READ_ONCE(priv->rl.stats.tx_active_connections) != 0) { mlx5_en_err(priv->ifp, "Waiting for all ratelimit connections to terminate\n"); pause("W", hz); } #endif /* wait for all unlimited send tags to complete */ mlx5e_priv_wait_for_completion(priv, mdev->priv.eq_table.num_comp_vectors); /* stop watchdog timer */ callout_drain(&priv->watchdog); callout_drain(&priv->tstmp_clbr); if (priv->vlan_attach != NULL) 
EVENTHANDLER_DEREGISTER(vlan_config, priv->vlan_attach); if (priv->vlan_detach != NULL) EVENTHANDLER_DEREGISTER(vlan_unconfig, priv->vlan_detach); /* make sure device gets closed */ PRIV_LOCK(priv); mlx5e_close_locked(ifp); PRIV_UNLOCK(priv); /* deregister pfil */ if (priv->pfil != NULL) { pfil_head_unregister(priv->pfil); priv->pfil = NULL; } /* unregister device */ ifmedia_removeall(&priv->media); ether_ifdetach(ifp); mlx5e_tls_cleanup(priv); mlx5e_rl_cleanup(priv); /* destroy all remaining sysctl nodes */ sysctl_ctx_free(&priv->stats.vport.ctx); sysctl_ctx_free(&priv->stats.pport.ctx); if (priv->sysctl_debug) sysctl_ctx_free(&priv->stats.port_stats_debug.ctx); sysctl_ctx_free(&priv->sysctl_ctx); mlx5_core_destroy_mkey(priv->mdev, &priv->mr); mlx5_dealloc_transport_domain(priv->mdev, priv->tdn); mlx5_core_dealloc_pd(priv->mdev, priv->pdn); mlx5_unmap_free_uar(priv->mdev, &priv->cq_uar); mlx5e_disable_async_events(priv); flush_workqueue(priv->wq); mlx5e_priv_static_destroy(priv, mdev->priv.eq_table.num_comp_vectors); if_free(ifp); free(priv, M_MLX5EN); } #ifdef DEBUGNET static void mlx5_en_debugnet_init(struct ifnet *dev, int *nrxr, int *ncl, int *clsize) { struct mlx5e_priv *priv = if_getsoftc(dev); PRIV_LOCK(priv); *nrxr = priv->params.num_channels; *ncl = DEBUGNET_MAX_IN_FLIGHT; *clsize = MLX5E_MAX_RX_BYTES; PRIV_UNLOCK(priv); } static void mlx5_en_debugnet_event(struct ifnet *dev, enum debugnet_ev event) { } static int mlx5_en_debugnet_transmit(struct ifnet *dev, struct mbuf *m) { struct mlx5e_priv *priv = if_getsoftc(dev); struct mlx5e_sq *sq; int err; if ((if_getdrvflags(dev) & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) != IFF_DRV_RUNNING || (priv->media_status_last & IFM_ACTIVE) == 0) return (ENOENT); sq = &priv->channel[0].sq[0]; if (sq->running == 0) { m_freem(m); return (ENOENT); } if (mlx5e_sq_xmit(sq, &m) != 0) { m_freem(m); err = ENOBUFS; } else { err = 0; } if (likely(sq->doorbell.d64 != 0)) { mlx5e_tx_notify_hw(sq, sq->doorbell.d32, 0); sq->doorbell.d64 = 0; } return (err); } static int mlx5_en_debugnet_poll(struct ifnet *dev, int count) { struct mlx5e_priv *priv = if_getsoftc(dev); if ((if_getdrvflags(dev) & IFF_DRV_RUNNING) == 0 || (priv->media_status_last & IFM_ACTIVE) == 0) return (ENOENT); mlx5_poll_interrupts(priv->mdev); return (0); } #endif /* DEBUGNET */ static void * mlx5e_get_ifp(void *vpriv) { struct mlx5e_priv *priv = vpriv; return (priv->ifp); } static struct mlx5_interface mlx5e_interface = { .add = mlx5e_create_ifp, .remove = mlx5e_destroy_ifp, .event = mlx5e_async_event, .protocol = MLX5_INTERFACE_PROTOCOL_ETH, .get_dev = mlx5e_get_ifp, }; void mlx5e_init(void) { mlx5_register_interface(&mlx5e_interface); } void mlx5e_cleanup(void) { mlx5_unregister_interface(&mlx5e_interface); } static void mlx5e_show_version(void __unused *arg) { printf("%s", mlx5e_version); } SYSINIT(mlx5e_show_version, SI_SUB_DRIVERS, SI_ORDER_ANY, mlx5e_show_version, NULL); module_init_order(mlx5e_init, SI_ORDER_THIRD); module_exit_order(mlx5e_cleanup, SI_ORDER_THIRD); #if (__FreeBSD_version >= 1100000) MODULE_DEPEND(mlx5en, linuxkpi, 1, 1, 1); #endif MODULE_DEPEND(mlx5en, mlx5, 1, 1, 1); MODULE_VERSION(mlx5en, 1); diff --git a/sys/dev/ntb/if_ntb/if_ntb.c b/sys/dev/ntb/if_ntb/if_ntb.c index 1e1f98a54132..3bae01aae49d 100644 --- a/sys/dev/ntb/if_ntb/if_ntb.c +++ b/sys/dev/ntb/if_ntb/if_ntb.c @@ -1,514 +1,513 @@ /*- * Copyright (c) 2016 Alexander Motin * Copyright (C) 2013 Intel Corporation * Copyright (C) 2015 EMC Corporation * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * The Non-Transparent Bridge (NTB) is a device that allows you to connect * two or more systems using a PCI-e links, providing remote memory access. * * This module contains a driver for simulated Ethernet device, using * underlying NTB Transport device. * * NOTE: Much of the code in this module is shared with Linux. Any patches may * be picked up and redistributed in Linux with a dual GPL/BSD license. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "../ntb_transport.h" #define KTR_NTB KTR_SPARE3 #define NTB_MEDIATYPE (IFM_ETHER | IFM_AUTO | IFM_FDX) #define NTB_CSUM_FEATURES (CSUM_IP | CSUM_TCP | CSUM_UDP | CSUM_SCTP) #define NTB_CSUM_FEATURES6 (CSUM_TCP_IPV6 | CSUM_UDP_IPV6 | CSUM_SCTP_IPV6) #define NTB_CSUM_SET (CSUM_DATA_VALID | CSUM_DATA_VALID_IPV6 | \ CSUM_PSEUDO_HDR | \ CSUM_IP_CHECKED | CSUM_IP_VALID | \ CSUM_SCTP_VALID) static SYSCTL_NODE(_hw, OID_AUTO, if_ntb, CTLFLAG_RW, 0, "if_ntb"); static unsigned g_if_ntb_num_queues = UINT_MAX; SYSCTL_UINT(_hw_if_ntb, OID_AUTO, num_queues, CTLFLAG_RWTUN, &g_if_ntb_num_queues, 0, "Number of queues per interface"); struct ntb_net_queue { struct ntb_net_ctx *sc; if_t ifp; struct ntb_transport_qp *qp; struct buf_ring *br; struct task tx_task; struct taskqueue *tx_tq; struct mtx tx_lock; struct callout queue_full; }; struct ntb_net_ctx { if_t ifp; struct ifmedia media; u_char eaddr[ETHER_ADDR_LEN]; int num_queues; struct ntb_net_queue *queues; int mtu; }; static int ntb_net_probe(device_t dev); static int ntb_net_attach(device_t dev); static int ntb_net_detach(device_t dev); static void ntb_net_init(void *arg); static int ntb_ifmedia_upd(struct ifnet *); static void ntb_ifmedia_sts(struct ifnet *, struct ifmediareq *); static int ntb_ioctl(if_t ifp, u_long command, caddr_t data); static int ntb_transmit(if_t ifp, struct mbuf *m); static void ntb_net_tx_handler(struct ntb_transport_qp *qp, void *qp_data, void *data, int len); static void ntb_net_rx_handler(struct ntb_transport_qp *qp, void *qp_data, void *data, int len); static void ntb_net_event_handler(void *data, enum ntb_link_event status); static void ntb_handle_tx(void *arg, int 
pending); static void ntb_qp_full(void *arg); static void ntb_qflush(if_t ifp); static void create_random_local_eui48(u_char *eaddr); static int ntb_net_probe(device_t dev) { device_set_desc(dev, "NTB Network Interface"); return (0); } static int ntb_net_attach(device_t dev) { struct ntb_net_ctx *sc = device_get_softc(dev); struct ntb_net_queue *q; if_t ifp; struct ntb_queue_handlers handlers = { ntb_net_rx_handler, ntb_net_tx_handler, ntb_net_event_handler }; int i; ifp = sc->ifp = if_gethandle(IFT_ETHER); if (ifp == NULL) { printf("ntb: Cannot allocate ifnet structure\n"); return (ENOMEM); } if_initname(ifp, device_get_name(dev), device_get_unit(dev)); if_setdev(ifp, dev); sc->num_queues = min(g_if_ntb_num_queues, ntb_transport_queue_count(dev)); sc->queues = malloc(sc->num_queues * sizeof(struct ntb_net_queue), M_DEVBUF, M_WAITOK | M_ZERO); sc->mtu = INT_MAX; for (i = 0; i < sc->num_queues; i++) { q = &sc->queues[i]; q->sc = sc; q->ifp = ifp; q->qp = ntb_transport_create_queue(dev, i, &handlers, q); if (q->qp == NULL) break; sc->mtu = imin(sc->mtu, ntb_transport_max_size(q->qp)); mtx_init(&q->tx_lock, "ntb tx", NULL, MTX_DEF); q->br = buf_ring_alloc(4096, M_DEVBUF, M_WAITOK, &q->tx_lock); TASK_INIT(&q->tx_task, 0, ntb_handle_tx, q); q->tx_tq = taskqueue_create_fast("ntb_txq", M_NOWAIT, taskqueue_thread_enqueue, &q->tx_tq); taskqueue_start_threads(&q->tx_tq, 1, PI_NET, "%s txq%d", device_get_nameunit(dev), i); callout_init(&q->queue_full, 1); } sc->num_queues = i; device_printf(dev, "%d queue(s)\n", sc->num_queues); if_setinitfn(ifp, ntb_net_init); if_setsoftc(ifp, sc); - if_setflags(ifp, IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST | - IFF_NEEDSEPOCH); + if_setflags(ifp, IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST); if_setioctlfn(ifp, ntb_ioctl); if_settransmitfn(ifp, ntb_transmit); if_setqflushfn(ifp, ntb_qflush); create_random_local_eui48(sc->eaddr); ether_ifattach(ifp, sc->eaddr); if_setcapabilities(ifp, IFCAP_HWCSUM | IFCAP_HWCSUM_IPV6 | IFCAP_JUMBO_MTU | IFCAP_LINKSTATE); if_setcapenable(ifp, IFCAP_JUMBO_MTU | IFCAP_LINKSTATE); if_setmtu(ifp, sc->mtu - ETHER_HDR_LEN); ifmedia_init(&sc->media, IFM_IMASK, ntb_ifmedia_upd, ntb_ifmedia_sts); ifmedia_add(&sc->media, NTB_MEDIATYPE, 0, NULL); ifmedia_set(&sc->media, NTB_MEDIATYPE); for (i = 0; i < sc->num_queues; i++) ntb_transport_link_up(sc->queues[i].qp); return (0); } static int ntb_net_detach(device_t dev) { struct ntb_net_ctx *sc = device_get_softc(dev); struct ntb_net_queue *q; int i; for (i = 0; i < sc->num_queues; i++) ntb_transport_link_down(sc->queues[i].qp); ether_ifdetach(sc->ifp); if_free(sc->ifp); ifmedia_removeall(&sc->media); for (i = 0; i < sc->num_queues; i++) { q = &sc->queues[i]; ntb_transport_free_queue(q->qp); buf_ring_free(q->br, M_DEVBUF); callout_drain(&q->queue_full); taskqueue_drain_all(q->tx_tq); mtx_destroy(&q->tx_lock); } free(sc->queues, M_DEVBUF); return (0); } /* Network device interface */ static void ntb_net_init(void *arg) { struct ntb_net_ctx *sc = arg; if_t ifp = sc->ifp; if_setdrvflagbits(ifp, IFF_DRV_RUNNING, IFF_DRV_OACTIVE); if_setbaudrate(ifp, ntb_transport_link_speed(sc->queues[0].qp)); if_link_state_change(ifp, ntb_transport_link_query(sc->queues[0].qp) ? 
LINK_STATE_UP : LINK_STATE_DOWN); } static int ntb_ioctl(if_t ifp, u_long command, caddr_t data) { struct ntb_net_ctx *sc = if_getsoftc(ifp); struct ifreq *ifr = (struct ifreq *)data; int error = 0; switch (command) { case SIOCSIFFLAGS: case SIOCADDMULTI: case SIOCDELMULTI: break; case SIOCSIFMTU: { if (ifr->ifr_mtu > sc->mtu - ETHER_HDR_LEN) { error = EINVAL; break; } if_setmtu(ifp, ifr->ifr_mtu); break; } case SIOCSIFMEDIA: case SIOCGIFMEDIA: error = ifmedia_ioctl(ifp, ifr, &sc->media, command); break; case SIOCSIFCAP: if (ifr->ifr_reqcap & IFCAP_RXCSUM) if_setcapenablebit(ifp, IFCAP_RXCSUM, 0); else if_setcapenablebit(ifp, 0, IFCAP_RXCSUM); if (ifr->ifr_reqcap & IFCAP_TXCSUM) { if_setcapenablebit(ifp, IFCAP_TXCSUM, 0); if_sethwassistbits(ifp, NTB_CSUM_FEATURES, 0); } else { if_setcapenablebit(ifp, 0, IFCAP_TXCSUM); if_sethwassistbits(ifp, 0, NTB_CSUM_FEATURES); } if (ifr->ifr_reqcap & IFCAP_RXCSUM_IPV6) if_setcapenablebit(ifp, IFCAP_RXCSUM_IPV6, 0); else if_setcapenablebit(ifp, 0, IFCAP_RXCSUM_IPV6); if (ifr->ifr_reqcap & IFCAP_TXCSUM_IPV6) { if_setcapenablebit(ifp, IFCAP_TXCSUM_IPV6, 0); if_sethwassistbits(ifp, NTB_CSUM_FEATURES6, 0); } else { if_setcapenablebit(ifp, 0, IFCAP_TXCSUM_IPV6); if_sethwassistbits(ifp, 0, NTB_CSUM_FEATURES6); } break; default: error = ether_ioctl(ifp, command, data); break; } return (error); } static int ntb_ifmedia_upd(struct ifnet *ifp) { struct ntb_net_ctx *sc = if_getsoftc(ifp); struct ifmedia *ifm = &sc->media; if (IFM_TYPE(ifm->ifm_media) != IFM_ETHER) return (EINVAL); return (0); } static void ntb_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr) { struct ntb_net_ctx *sc = if_getsoftc(ifp); ifmr->ifm_status = IFM_AVALID; ifmr->ifm_active = NTB_MEDIATYPE; if (ntb_transport_link_query(sc->queues[0].qp)) ifmr->ifm_status |= IFM_ACTIVE; } static void ntb_transmit_locked(struct ntb_net_queue *q) { if_t ifp = q->ifp; struct mbuf *m; int rc, len; short mflags; CTR0(KTR_NTB, "TX: ntb_transmit_locked"); while ((m = drbr_peek(ifp, q->br)) != NULL) { CTR1(KTR_NTB, "TX: start mbuf %p", m); if_etherbpfmtap(ifp, m); len = m->m_pkthdr.len; mflags = m->m_flags; rc = ntb_transport_tx_enqueue(q->qp, m, m, len); if (rc != 0) { CTR2(KTR_NTB, "TX: could not tx mbuf %p: %d", m, rc); if (rc == EAGAIN) { drbr_putback(ifp, q->br, m); callout_reset_sbt(&q->queue_full, SBT_1MS / 4, SBT_1MS / 4, ntb_qp_full, q, 0); } else { m_freem(m); drbr_advance(ifp, q->br); if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); } break; } drbr_advance(ifp, q->br); if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); if_inc_counter(ifp, IFCOUNTER_OBYTES, len); if (mflags & M_MCAST) if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1); } } static int ntb_transmit(if_t ifp, struct mbuf *m) { struct ntb_net_ctx *sc = if_getsoftc(ifp); struct ntb_net_queue *q; int error, i; CTR0(KTR_NTB, "TX: ntb_transmit"); if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) i = m->m_pkthdr.flowid % sc->num_queues; else i = curcpu % sc->num_queues; q = &sc->queues[i]; error = drbr_enqueue(ifp, q->br, m); if (error) return (error); if (mtx_trylock(&q->tx_lock)) { ntb_transmit_locked(q); mtx_unlock(&q->tx_lock); } else taskqueue_enqueue(q->tx_tq, &q->tx_task); return (0); } static void ntb_handle_tx(void *arg, int pending) { struct ntb_net_queue *q = arg; mtx_lock(&q->tx_lock); ntb_transmit_locked(q); mtx_unlock(&q->tx_lock); } static void ntb_qp_full(void *arg) { struct ntb_net_queue *q = arg; CTR0(KTR_NTB, "TX: qp_full callout"); if (ntb_transport_tx_free_entry(q->qp) > 0) taskqueue_enqueue(q->tx_tq, &q->tx_task); else 
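/*
 * Transport ring still full: re-arm the ~250us (SBT_1MS / 4) callout and keep
 * polling until ntb_transport_tx_free_entry() reports free slots, at which
 * point the tx task is queued to drain the buf_ring.
 */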
callout_schedule_sbt(&q->queue_full, SBT_1MS / 4, SBT_1MS / 4, 0); } static void ntb_qflush(if_t ifp) { struct ntb_net_ctx *sc = if_getsoftc(ifp); struct ntb_net_queue *q; struct mbuf *m; int i; for (i = 0; i < sc->num_queues; i++) { q = &sc->queues[i]; mtx_lock(&q->tx_lock); while ((m = buf_ring_dequeue_sc(q->br)) != NULL) m_freem(m); mtx_unlock(&q->tx_lock); } if_qflush(ifp); } /* Network Device Callbacks */ static void ntb_net_tx_handler(struct ntb_transport_qp *qp, void *qp_data, void *data, int len) { m_freem(data); CTR1(KTR_NTB, "TX: tx_handler freeing mbuf %p", data); } static void ntb_net_rx_handler(struct ntb_transport_qp *qp, void *qp_data, void *data, int len) { struct ntb_net_queue *q = qp_data; struct ntb_net_ctx *sc = q->sc; struct mbuf *m = data; if_t ifp = q->ifp; uint16_t proto; CTR1(KTR_NTB, "RX: rx handler (%d)", len); if (len < 0) { if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); return; } m->m_pkthdr.rcvif = ifp; if (sc->num_queues > 1) { m->m_pkthdr.flowid = q - sc->queues; M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE); } if (if_getcapenable(ifp) & (IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6)) { m_copydata(m, 12, 2, (void *)&proto); switch (ntohs(proto)) { case ETHERTYPE_IP: if (if_getcapenable(ifp) & IFCAP_RXCSUM) { m->m_pkthdr.csum_data = 0xffff; m->m_pkthdr.csum_flags = NTB_CSUM_SET; } break; case ETHERTYPE_IPV6: if (if_getcapenable(ifp) & IFCAP_RXCSUM_IPV6) { m->m_pkthdr.csum_data = 0xffff; m->m_pkthdr.csum_flags = NTB_CSUM_SET; } break; } } if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); if_input(ifp, m); } static void ntb_net_event_handler(void *data, enum ntb_link_event status) { struct ntb_net_queue *q = data; if_setbaudrate(q->ifp, ntb_transport_link_speed(q->qp)); if_link_state_change(q->ifp, (status == NTB_LINK_UP) ? LINK_STATE_UP : LINK_STATE_DOWN); } /* Helper functions */ /* TODO: This too should really be part of the kernel */ #define EUI48_MULTICAST 1 << 0 #define EUI48_LOCALLY_ADMINISTERED 1 << 1 static void create_random_local_eui48(u_char *eaddr) { static uint8_t counter = 0; eaddr[0] = EUI48_LOCALLY_ADMINISTERED; arc4rand(&eaddr[1], 4, 0); eaddr[5] = counter++; } static device_method_t ntb_net_methods[] = { /* Device interface */ DEVMETHOD(device_probe, ntb_net_probe), DEVMETHOD(device_attach, ntb_net_attach), DEVMETHOD(device_detach, ntb_net_detach), DEVMETHOD_END }; devclass_t ntb_net_devclass; static DEFINE_CLASS_0(ntb, ntb_net_driver, ntb_net_methods, sizeof(struct ntb_net_ctx)); DRIVER_MODULE(if_ntb, ntb_transport, ntb_net_driver, ntb_net_devclass, NULL, NULL); MODULE_DEPEND(if_ntb, ntb_transport, 1, 1, 1); MODULE_VERSION(if_ntb, 1); diff --git a/sys/dev/sbni/if_sbni.c b/sys/dev/sbni/if_sbni.c index 62b86112b26f..267001f7897d 100644 --- a/sys/dev/sbni/if_sbni.c +++ b/sys/dev/sbni/if_sbni.c @@ -1,1278 +1,1277 @@ /*- * Copyright (c) 1997-2001 Granch, Ltd. All rights reserved. * Author: Denis I.Timofeev * * Redistributon and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ #include __FBSDID("$FreeBSD$"); /* * Device driver for Granch SBNI12 leased line adapters * * Revision 2.0.0 1997/08/06 * Initial revision by Alexey Zverev * * Revision 2.0.1 1997/08/11 * Additional internal statistics support (tx statistics) * * Revision 2.0.2 1997/11/05 * if_bpf bug has been fixed * * Revision 2.0.3 1998/12/20 * Memory leakage has been eliminated in * the sbni_st and sbni_timeout routines. * * Revision 3.0 2000/08/10 by Yaroslav Polyakov * Support for PCI cards. 4.1 modification. * * Revision 3.1 2000/09/12 * Removed extra #defines around bpf functions * * Revision 4.0 2000/11/23 by Denis Timofeev * Completely redesigned the buffer management * * Revision 4.1 2001/01/21 * Support for PCI Dual cards and new SBNI12D-10, -11 Dual/ISA cards * * Written with reference to NE2000 driver developed by David Greenman. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static void sbni_init(void *); static void sbni_init_locked(struct sbni_softc *); static void sbni_start(struct ifnet *); static void sbni_start_locked(struct ifnet *); static int sbni_ioctl(struct ifnet *, u_long, caddr_t); static void sbni_stop(struct sbni_softc *); static void handle_channel(struct sbni_softc *); static void card_start(struct sbni_softc *); static int recv_frame(struct sbni_softc *); static void send_frame(struct sbni_softc *); static int upload_data(struct sbni_softc *, u_int, u_int, u_int, u_int32_t); static int skip_tail(struct sbni_softc *, u_int, u_int32_t); static void interpret_ack(struct sbni_softc *, u_int); static void download_data(struct sbni_softc *, u_int32_t *); static void prepare_to_send(struct sbni_softc *); static void drop_xmit_queue(struct sbni_softc *); static int get_rx_buf(struct sbni_softc *); static void indicate_pkt(struct sbni_softc *); static void change_level(struct sbni_softc *); static int check_fhdr(struct sbni_softc *, u_int *, u_int *, u_int *, u_int *, u_int32_t *); static int append_frame_to_pkt(struct sbni_softc *, u_int, u_int32_t); static void timeout_change_level(struct sbni_softc *); static void send_frame_header(struct sbni_softc *, u_int32_t *); static void set_initial_values(struct sbni_softc *, struct sbni_flags); static u_int32_t calc_crc32(u_int32_t, caddr_t, u_int); static callout_func_t sbni_timeout; static __inline u_char sbni_inb(struct sbni_softc *, enum sbni_reg); static __inline void sbni_outb(struct sbni_softc *, enum sbni_reg, u_char); static __inline void sbni_insb(struct sbni_softc *, u_char *, u_int); static __inline void sbni_outsb(struct sbni_softc *, u_char *, u_int); static u_int32_t crc32tab[]; #ifdef SBNI_DUAL_COMPOUND static 
struct mtx headlist_lock; MTX_SYSINIT(headlist_lock, &headlist_lock, "sbni headlist", MTX_DEF); static struct sbni_softc *sbni_headlist; #endif /* -------------------------------------------------------------------------- */ static __inline u_char sbni_inb(struct sbni_softc *sc, enum sbni_reg reg) { return bus_space_read_1( rman_get_bustag(sc->io_res), rman_get_bushandle(sc->io_res), sc->io_off + reg); } static __inline void sbni_outb(struct sbni_softc *sc, enum sbni_reg reg, u_char value) { bus_space_write_1( rman_get_bustag(sc->io_res), rman_get_bushandle(sc->io_res), sc->io_off + reg, value); } static __inline void sbni_insb(struct sbni_softc *sc, u_char *to, u_int len) { bus_space_read_multi_1( rman_get_bustag(sc->io_res), rman_get_bushandle(sc->io_res), sc->io_off + DAT, to, len); } static __inline void sbni_outsb(struct sbni_softc *sc, u_char *from, u_int len) { bus_space_write_multi_1( rman_get_bustag(sc->io_res), rman_get_bushandle(sc->io_res), sc->io_off + DAT, from, len); } /* Valid combinations in CSR0 (for probing): VALID_DECODER 0000,0011,1011,1010 ; 0 ; - TR_REQ ; 1 ; + TR_RDY ; 2 ; - TR_RDY TR_REQ ; 3 ; + BU_EMP ; 4 ; + BU_EMP TR_REQ ; 5 ; + BU_EMP TR_RDY ; 6 ; - BU_EMP TR_RDY TR_REQ ; 7 ; + RC_RDY ; 8 ; + RC_RDY TR_REQ ; 9 ; + RC_RDY TR_RDY ; 10 ; - RC_RDY TR_RDY TR_REQ ; 11 ; - RC_RDY BU_EMP ; 12 ; - RC_RDY BU_EMP TR_REQ ; 13 ; - RC_RDY BU_EMP TR_RDY ; 14 ; - RC_RDY BU_EMP TR_RDY TR_REQ ; 15 ; - */ #define VALID_DECODER (2 + 8 + 0x10 + 0x20 + 0x80 + 0x100 + 0x200) int sbni_probe(struct sbni_softc *sc) { u_char csr0; csr0 = sbni_inb(sc, CSR0); if (csr0 != 0xff && csr0 != 0x00) { csr0 &= ~EN_INT; if (csr0 & BU_EMP) csr0 |= EN_INT; if (VALID_DECODER & (1 << (csr0 >> 4))) return (0); } return (ENXIO); } /* * Install interface into kernel networking data structures */ int sbni_attach(struct sbni_softc *sc, int unit, struct sbni_flags flags) { struct ifnet *ifp; u_char csr0; ifp = sc->ifp = if_alloc(IFT_ETHER); if (ifp == NULL) return (ENOMEM); sbni_outb(sc, CSR0, 0); set_initial_values(sc, flags); /* Initialize ifnet structure */ ifp->if_softc = sc; if_initname(ifp, "sbni", unit); ifp->if_init = sbni_init; ifp->if_start = sbni_start; ifp->if_ioctl = sbni_ioctl; IFQ_SET_MAXLEN(&ifp->if_snd, ifqmaxlen); /* report real baud rate */ csr0 = sbni_inb(sc, CSR0); ifp->if_baudrate = (csr0 & 0x01 ? 
500000 : 2000000) / (1 << flags.rate); - ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST | - IFF_NEEDSEPOCH; + ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; mtx_init(&sc->lock, ifp->if_xname, MTX_NETWORK_LOCK, MTX_DEF); callout_init_mtx(&sc->wch, &sc->lock, 0); ether_ifattach(ifp, sc->enaddr); /* device attach does transition from UNCONFIGURED to IDLE state */ if_printf(ifp, "speed %ju, rxl ", (uintmax_t)ifp->if_baudrate); if (sc->delta_rxl) printf("auto\n"); else printf("%d (fixed)\n", sc->cur_rxl_index); return (0); } void sbni_detach(struct sbni_softc *sc) { SBNI_LOCK(sc); sbni_stop(sc); SBNI_UNLOCK(sc); callout_drain(&sc->wch); ether_ifdetach(sc->ifp); if (sc->irq_handle) bus_teardown_intr(sc->dev, sc->irq_res, sc->irq_handle); mtx_destroy(&sc->lock); if_free(sc->ifp); } void sbni_release_resources(struct sbni_softc *sc) { if (sc->irq_res) bus_release_resource(sc->dev, SYS_RES_IRQ, sc->irq_rid, sc->irq_res); if (sc->io_res && sc->io_off == 0) bus_release_resource(sc->dev, SYS_RES_IOPORT, sc->io_rid, sc->io_res); } /* -------------------------------------------------------------------------- */ static void sbni_init(void *xsc) { struct sbni_softc *sc; sc = (struct sbni_softc *)xsc; SBNI_LOCK(sc); sbni_init_locked(sc); SBNI_UNLOCK(sc); } static void sbni_init_locked(struct sbni_softc *sc) { struct ifnet *ifp; ifp = sc->ifp; /* * kludge to avoid multiple initialization when more than once * protocols configured */ if (ifp->if_drv_flags & IFF_DRV_RUNNING) return; card_start(sc); callout_reset(&sc->wch, hz/SBNI_HZ, sbni_timeout, sc); ifp->if_drv_flags |= IFF_DRV_RUNNING; ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; /* attempt to start output */ sbni_start_locked(ifp); } static void sbni_start(struct ifnet *ifp) { struct sbni_softc *sc = ifp->if_softc; SBNI_LOCK(sc); sbni_start_locked(ifp); SBNI_UNLOCK(sc); } static void sbni_start_locked(struct ifnet *ifp) { struct sbni_softc *sc = ifp->if_softc; if (sc->tx_frameno == 0) prepare_to_send(sc); } static void sbni_stop(struct sbni_softc *sc) { sbni_outb(sc, CSR0, 0); drop_xmit_queue(sc); if (sc->rx_buf_p) { m_freem(sc->rx_buf_p); sc->rx_buf_p = NULL; } callout_stop(&sc->wch); sc->ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE); } /* -------------------------------------------------------------------------- */ /* interrupt handler */ /* * SBNI12D-10, -11/ISA boards within "common interrupt" mode could not * be looked as two independent single-channel devices. Every channel seems * as Ethernet interface but interrupt handler must be common. Really, first * channel ("master") driver only registers the handler. In it's struct softc * it has got pointer to "slave" channel's struct softc and handles that's * interrupts too. * softc of successfully attached ISA SBNI boards is linked to list. * While next board driver is initialized, it scans this list. If one * has found softc with same irq and ioaddr different by 4 then it assumes * this board to be "master". 
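 * Under SBNI_DUAL_COMPOUND that list is sbni_headlist (protected by
 * headlist_lock): sbni_add() links a newly attached channel onto it and
 * connect_to_master() pairs softcs whose I/O bases differ by 4. sbni_intr()
 * below then services the master and, if present, the slave channel for as
 * long as either reports RC_RDY or TR_RDY in CSR0.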
*/ void sbni_intr(void *arg) { struct sbni_softc *sc; int repeat; sc = (struct sbni_softc *)arg; do { repeat = 0; SBNI_LOCK(sc); if (sbni_inb(sc, CSR0) & (RC_RDY | TR_RDY)) { handle_channel(sc); repeat = 1; } SBNI_UNLOCK(sc); if (sc->slave_sc) { /* second channel present */ SBNI_LOCK(sc->slave_sc); if (sbni_inb(sc->slave_sc, CSR0) & (RC_RDY | TR_RDY)) { handle_channel(sc->slave_sc); repeat = 1; } SBNI_UNLOCK(sc->slave_sc); } } while (repeat); } static void handle_channel(struct sbni_softc *sc) { int req_ans; u_char csr0; sbni_outb(sc, CSR0, (sbni_inb(sc, CSR0) & ~EN_INT) | TR_REQ); sc->timer_ticks = CHANGE_LEVEL_START_TICKS; for (;;) { csr0 = sbni_inb(sc, CSR0); if ((csr0 & (RC_RDY | TR_RDY)) == 0) break; req_ans = !(sc->state & FL_PREV_OK); if (csr0 & RC_RDY) req_ans = recv_frame(sc); /* * TR_RDY always equals 1 here because we have owned the marker, * and we set TR_REQ when disabled interrupts */ csr0 = sbni_inb(sc, CSR0); if ((csr0 & TR_RDY) == 0 || (csr0 & RC_RDY) != 0) if_printf(sc->ifp, "internal error!\n"); /* if state & FL_NEED_RESEND != 0 then tx_frameno != 0 */ if (req_ans || sc->tx_frameno != 0) send_frame(sc); else { /* send the marker without any data */ sbni_outb(sc, CSR0, sbni_inb(sc, CSR0) & ~TR_REQ); } } sbni_outb(sc, CSR0, sbni_inb(sc, CSR0) | EN_INT); } /* * Routine returns 1 if it need to acknoweledge received frame. * Empty frame received without errors won't be acknoweledged. */ static int recv_frame(struct sbni_softc *sc) { u_int32_t crc; u_int framelen, frameno, ack; u_int is_first, frame_ok; crc = CRC32_INITIAL; if (check_fhdr(sc, &framelen, &frameno, &ack, &is_first, &crc)) { frame_ok = framelen > 4 ? upload_data(sc, framelen, frameno, is_first, crc) : skip_tail(sc, framelen, crc); if (frame_ok) interpret_ack(sc, ack); } else { framelen = 0; frame_ok = 0; } sbni_outb(sc, CSR0, sbni_inb(sc, CSR0) ^ CT_ZER); if (frame_ok) { sc->state |= FL_PREV_OK; if (framelen > 4) sc->in_stats.all_rx_number++; } else { sc->state &= ~FL_PREV_OK; change_level(sc); sc->in_stats.all_rx_number++; sc->in_stats.bad_rx_number++; } return (!frame_ok || framelen > 4); } static void send_frame(struct sbni_softc *sc) { u_int32_t crc; u_char csr0; crc = CRC32_INITIAL; if (sc->state & FL_NEED_RESEND) { /* if frame was sended but not ACK'ed - resend it */ if (sc->trans_errors) { sc->trans_errors--; if (sc->framelen != 0) sc->in_stats.resend_tx_number++; } else { /* cannot xmit with many attempts */ drop_xmit_queue(sc); goto do_send; } } else sc->trans_errors = TR_ERROR_COUNT; send_frame_header(sc, &crc); sc->state |= FL_NEED_RESEND; /* * FL_NEED_RESEND will be cleared after ACK, but if empty * frame sended then in prepare_to_send next frame */ if (sc->framelen) { download_data(sc, &crc); sc->in_stats.all_tx_number++; sc->state |= FL_WAIT_ACK; } sbni_outsb(sc, (u_char *)&crc, sizeof crc); do_send: csr0 = sbni_inb(sc, CSR0); sbni_outb(sc, CSR0, csr0 & ~TR_REQ); if (sc->tx_frameno) { /* next frame exists - request to send */ sbni_outb(sc, CSR0, csr0 | TR_REQ); } } static void download_data(struct sbni_softc *sc, u_int32_t *crc_p) { struct mbuf *m; caddr_t data_p; u_int data_len, pos, slice; data_p = NULL; /* initialized to avoid warn */ pos = 0; for (m = sc->tx_buf_p; m != NULL && pos < sc->pktlen; m = m->m_next) { if (pos + m->m_len > sc->outpos) { data_len = m->m_len - (sc->outpos - pos); data_p = mtod(m, caddr_t) + (sc->outpos - pos); goto do_copy; } else pos += m->m_len; } data_len = 0; do_copy: pos = 0; do { if (data_len) { slice = min(data_len, sc->framelen - pos); sbni_outsb(sc, data_p, 
slice); *crc_p = calc_crc32(*crc_p, data_p, slice); pos += slice; if (data_len -= slice) data_p += slice; else { do { m = m->m_next; } while (m != NULL && m->m_len == 0); if (m) { data_len = m->m_len; data_p = mtod(m, caddr_t); } } } else { /* frame too short - zero padding */ pos = sc->framelen - pos; while (pos--) { sbni_outb(sc, DAT, 0); *crc_p = CRC32(0, *crc_p); } return; } } while (pos < sc->framelen); } static int upload_data(struct sbni_softc *sc, u_int framelen, u_int frameno, u_int is_first, u_int32_t crc) { int frame_ok; if (is_first) { sc->wait_frameno = frameno; sc->inppos = 0; } if (sc->wait_frameno == frameno) { if (sc->inppos + framelen <= ETHER_MAX_LEN) { frame_ok = append_frame_to_pkt(sc, framelen, crc); /* * if CRC is right but framelen incorrect then transmitter * error was occurred... drop entire packet */ } else if ((frame_ok = skip_tail(sc, framelen, crc)) != 0) { sc->wait_frameno = 0; sc->inppos = 0; if_inc_counter(sc->ifp, IFCOUNTER_IERRORS, 1); /* now skip all frames until is_first != 0 */ } } else frame_ok = skip_tail(sc, framelen, crc); if (is_first && !frame_ok) { /* * Frame has been violated, but we have stored * is_first already... Drop entire packet. */ sc->wait_frameno = 0; if_inc_counter(sc->ifp, IFCOUNTER_IERRORS, 1); } return (frame_ok); } static __inline void send_complete(struct sbni_softc *); static __inline void send_complete(struct sbni_softc *sc) { m_freem(sc->tx_buf_p); sc->tx_buf_p = NULL; if_inc_counter(sc->ifp, IFCOUNTER_OPACKETS, 1); } static void interpret_ack(struct sbni_softc *sc, u_int ack) { if (ack == FRAME_SENT_OK) { sc->state &= ~FL_NEED_RESEND; if (sc->state & FL_WAIT_ACK) { sc->outpos += sc->framelen; if (--sc->tx_frameno) { sc->framelen = min( sc->maxframe, sc->pktlen - sc->outpos); } else { send_complete(sc); prepare_to_send(sc); } } } sc->state &= ~FL_WAIT_ACK; } /* * Glue received frame with previous fragments of packet. * Indicate packet when last frame would be accepted. */ static int append_frame_to_pkt(struct sbni_softc *sc, u_int framelen, u_int32_t crc) { caddr_t p; if (sc->inppos + framelen > ETHER_MAX_LEN) return (0); if (!sc->rx_buf_p && !get_rx_buf(sc)) return (0); p = sc->rx_buf_p->m_data + sc->inppos; sbni_insb(sc, p, framelen); if (calc_crc32(crc, p, framelen) != CRC32_REMAINDER) return (0); sc->inppos += framelen - 4; if (--sc->wait_frameno == 0) { /* last frame received */ indicate_pkt(sc); if_inc_counter(sc->ifp, IFCOUNTER_IPACKETS, 1); } return (1); } /* * Prepare to start output on adapter. Current priority must be set to splimp * before this routine is called. * Transmitter will be actually activated when marker has been accepted. */ static void prepare_to_send(struct sbni_softc *sc) { struct mbuf *m; u_int len; /* sc->tx_buf_p == NULL here! */ if (sc->tx_buf_p) printf("sbni: memory leak!\n"); sc->outpos = 0; sc->state &= ~(FL_WAIT_ACK | FL_NEED_RESEND); for (;;) { IF_DEQUEUE(&sc->ifp->if_snd, sc->tx_buf_p); if (!sc->tx_buf_p) { /* nothing to transmit... 
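 * reset the frame bookkeeping and clear IFF_DRV_OACTIVE before returning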
*/ sc->pktlen = 0; sc->tx_frameno = 0; sc->framelen = 0; sc->ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; return; } for (len = 0, m = sc->tx_buf_p; m; m = m->m_next) len += m->m_len; if (len != 0) break; m_freem(sc->tx_buf_p); } if (len < SBNI_MIN_LEN) len = SBNI_MIN_LEN; sc->pktlen = len; sc->tx_frameno = howmany(len, sc->maxframe); sc->framelen = min(len, sc->maxframe); sbni_outb(sc, CSR0, sbni_inb(sc, CSR0) | TR_REQ); sc->ifp->if_drv_flags |= IFF_DRV_OACTIVE; BPF_MTAP(sc->ifp, sc->tx_buf_p); } static void drop_xmit_queue(struct sbni_softc *sc) { struct mbuf *m; if (sc->tx_buf_p) { m_freem(sc->tx_buf_p); sc->tx_buf_p = NULL; if_inc_counter(sc->ifp, IFCOUNTER_OERRORS, 1); } for (;;) { IF_DEQUEUE(&sc->ifp->if_snd, m); if (m == NULL) break; m_freem(m); if_inc_counter(sc->ifp, IFCOUNTER_OERRORS, 1); } sc->tx_frameno = 0; sc->framelen = 0; sc->outpos = 0; sc->state &= ~(FL_WAIT_ACK | FL_NEED_RESEND); sc->ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; } static void send_frame_header(struct sbni_softc *sc, u_int32_t *crc_p) { u_int32_t crc; u_int len_field; u_char value; crc = *crc_p; len_field = sc->framelen + 6; /* CRC + frameno + reserved */ if (sc->state & FL_NEED_RESEND) len_field |= FRAME_RETRY; /* non-first attempt... */ if (sc->outpos == 0) len_field |= FRAME_FIRST; len_field |= (sc->state & FL_PREV_OK) ? FRAME_SENT_OK : FRAME_SENT_BAD; sbni_outb(sc, DAT, SBNI_SIG); value = (u_char)len_field; sbni_outb(sc, DAT, value); crc = CRC32(value, crc); value = (u_char)(len_field >> 8); sbni_outb(sc, DAT, value); crc = CRC32(value, crc); sbni_outb(sc, DAT, sc->tx_frameno); crc = CRC32(sc->tx_frameno, crc); sbni_outb(sc, DAT, 0); crc = CRC32(0, crc); *crc_p = crc; } /* * if frame tail not needed (incorrect number or received twice), * it won't store, but CRC will be calculated */ static int skip_tail(struct sbni_softc *sc, u_int tail_len, u_int32_t crc) { while (tail_len--) crc = CRC32(sbni_inb(sc, DAT), crc); return (crc == CRC32_REMAINDER); } static int check_fhdr(struct sbni_softc *sc, u_int *framelen, u_int *frameno, u_int *ack, u_int *is_first, u_int32_t *crc_p) { u_int32_t crc; u_char value; crc = *crc_p; if (sbni_inb(sc, DAT) != SBNI_SIG) return (0); value = sbni_inb(sc, DAT); *framelen = (u_int)value; crc = CRC32(value, crc); value = sbni_inb(sc, DAT); *framelen |= ((u_int)value) << 8; crc = CRC32(value, crc); *ack = *framelen & FRAME_ACK_MASK; *is_first = (*framelen & FRAME_FIRST) != 0; if ((*framelen &= FRAME_LEN_MASK) < 6 || *framelen > SBNI_MAX_FRAME - 3) return (0); value = sbni_inb(sc, DAT); *frameno = (u_int)value; crc = CRC32(value, crc); crc = CRC32(sbni_inb(sc, DAT), crc); /* reserved byte */ *framelen -= 2; *crc_p = crc; return (1); } static int get_rx_buf(struct sbni_softc *sc) { struct mbuf *m; MGETHDR(m, M_NOWAIT, MT_DATA); if (m == NULL) { if_printf(sc->ifp, "cannot allocate header mbuf\n"); return (0); } /* * We always put the received packet in a single buffer - * either with just an mbuf header or in a cluster attached * to the header. The +2 is to compensate for the alignment * fixup below. */ if (ETHER_MAX_LEN + 2 > MHLEN) { /* Attach an mbuf cluster */ if (!(MCLGET(m, M_NOWAIT))) { m_freem(m); return (0); } } m->m_pkthdr.len = m->m_len = ETHER_MAX_LEN + 2; /* * The +2 is to longword align the start of the real packet. * (sizeof ether_header == 14) * This is important for NFS. 
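 * (2 + the 14-byte Ethernet header = 16, so the IP header and the
 * payload behind it end up longword aligned.)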
*/ m_adj(m, 2); sc->rx_buf_p = m; return (1); } static void indicate_pkt(struct sbni_softc *sc) { struct ifnet *ifp = sc->ifp; struct mbuf *m; m = sc->rx_buf_p; m->m_pkthdr.rcvif = ifp; m->m_pkthdr.len = m->m_len = sc->inppos; sc->rx_buf_p = NULL; SBNI_UNLOCK(sc); (*ifp->if_input)(ifp, m); SBNI_LOCK(sc); } /* -------------------------------------------------------------------------- */ /* * Routine checks periodically wire activity and regenerates marker if * connect was inactive for a long time. */ static void sbni_timeout(void *xsc) { struct sbni_softc *sc; u_char csr0; sc = (struct sbni_softc *)xsc; SBNI_ASSERT_LOCKED(sc); csr0 = sbni_inb(sc, CSR0); if (csr0 & RC_CHK) { if (sc->timer_ticks) { if (csr0 & (RC_RDY | BU_EMP)) /* receiving not active */ sc->timer_ticks--; } else { sc->in_stats.timeout_number++; if (sc->delta_rxl) timeout_change_level(sc); sbni_outb(sc, CSR1, *(u_char *)&sc->csr1 | PR_RES); csr0 = sbni_inb(sc, CSR0); } } sbni_outb(sc, CSR0, csr0 | RC_CHK); callout_reset(&sc->wch, hz/SBNI_HZ, sbni_timeout, sc); } /* -------------------------------------------------------------------------- */ static void card_start(struct sbni_softc *sc) { sc->timer_ticks = CHANGE_LEVEL_START_TICKS; sc->state &= ~(FL_WAIT_ACK | FL_NEED_RESEND); sc->state |= FL_PREV_OK; sc->inppos = 0; sc->wait_frameno = 0; sbni_outb(sc, CSR1, *(u_char *)&sc->csr1 | PR_RES); sbni_outb(sc, CSR0, EN_INT); } /* -------------------------------------------------------------------------- */ static u_char rxl_tab[] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x08, 0x0a, 0x0c, 0x0f, 0x16, 0x18, 0x1a, 0x1c, 0x1f }; #define SIZE_OF_TIMEOUT_RXL_TAB 4 static u_char timeout_rxl_tab[] = { 0x03, 0x05, 0x08, 0x0b }; static void set_initial_values(struct sbni_softc *sc, struct sbni_flags flags) { if (flags.fixed_rxl) { sc->delta_rxl = 0; /* disable receive level autodetection */ sc->cur_rxl_index = flags.rxl; } else { sc->delta_rxl = DEF_RXL_DELTA; sc->cur_rxl_index = DEF_RXL; } sc->csr1.rate = flags.fixed_rate ? 
flags.rate : DEFAULT_RATE; sc->csr1.rxl = rxl_tab[sc->cur_rxl_index]; sc->maxframe = DEFAULT_FRAME_LEN; /* * generate Ethernet address (0x00ff01xxxxxx) */ *(u_int16_t *) sc->enaddr = htons(0x00ff); if (flags.mac_addr) { *(u_int32_t *) (sc->enaddr + 2) = htonl(flags.mac_addr | 0x01000000); } else { *(u_char *) (sc->enaddr + 2) = 0x01; read_random(sc->enaddr + 3, 3); } } #ifdef SBNI_DUAL_COMPOUND void sbni_add(struct sbni_softc *sc) { mtx_lock(&headlist_lock); sc->link = sbni_headlist; sbni_headlist = sc; mtx_unlock(&headlist_lock); } struct sbni_softc * connect_to_master(struct sbni_softc *sc) { struct sbni_softc *p, *p_prev; mtx_lock(&headlist_lock); for (p = sbni_headlist, p_prev = NULL; p; p_prev = p, p = p->link) { if (rman_get_start(p->io_res) == rman_get_start(sc->io_res) + 4 || rman_get_start(p->io_res) == rman_get_start(sc->io_res) - 4) { p->slave_sc = sc; if (p_prev) p_prev->link = p->link; else sbni_headlist = p->link; mtx_unlock(&headlist_lock); return p; } } mtx_unlock(&headlist_lock); return (NULL); } #endif /* SBNI_DUAL_COMPOUND */ /* Receive level auto-selection */ static void change_level(struct sbni_softc *sc) { if (sc->delta_rxl == 0) /* do not auto-negotiate RxL */ return; if (sc->cur_rxl_index == 0) sc->delta_rxl = 1; else if (sc->cur_rxl_index == 15) sc->delta_rxl = -1; else if (sc->cur_rxl_rcvd < sc->prev_rxl_rcvd) sc->delta_rxl = -sc->delta_rxl; sc->csr1.rxl = rxl_tab[sc->cur_rxl_index += sc->delta_rxl]; sbni_inb(sc, CSR0); /* it needed for PCI cards */ sbni_outb(sc, CSR1, *(u_char *)&sc->csr1); sc->prev_rxl_rcvd = sc->cur_rxl_rcvd; sc->cur_rxl_rcvd = 0; } static void timeout_change_level(struct sbni_softc *sc) { sc->cur_rxl_index = timeout_rxl_tab[sc->timeout_rxl]; if (++sc->timeout_rxl >= 4) sc->timeout_rxl = 0; sc->csr1.rxl = rxl_tab[sc->cur_rxl_index]; sbni_inb(sc, CSR0); sbni_outb(sc, CSR1, *(u_char *)&sc->csr1); sc->prev_rxl_rcvd = sc->cur_rxl_rcvd; sc->cur_rxl_rcvd = 0; } /* -------------------------------------------------------------------------- */ /* * Process an ioctl request. This code needs some work - it looks * pretty ugly. */ static int sbni_ioctl(struct ifnet *ifp, u_long command, caddr_t data) { struct sbni_softc *sc; struct ifreq *ifr; struct thread *td; struct sbni_in_stats *in_stats; struct sbni_flags flags; int error; sc = ifp->if_softc; ifr = (struct ifreq *)data; td = curthread; error = 0; switch (command) { case SIOCSIFFLAGS: /* * If the interface is marked up and stopped, then start it. * If it is marked down and running, then stop it. */ SBNI_LOCK(sc); if (ifp->if_flags & IFF_UP) { if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) sbni_init_locked(sc); } else { if (ifp->if_drv_flags & IFF_DRV_RUNNING) { sbni_stop(sc); } } SBNI_UNLOCK(sc); break; case SIOCADDMULTI: case SIOCDELMULTI: /* * Multicast list has changed; set the hardware filter * accordingly. 
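 * Nothing is programmed here; the request is simply accepted
 * (error = 0).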
*/ error = 0; /* if (ifr == NULL) error = EAFNOSUPPORT; */ break; /* * SBNI specific ioctl */ case SIOCGHWFLAGS: /* get flags */ SBNI_LOCK(sc); bcopy((caddr_t)IF_LLADDR(sc->ifp)+3, (caddr_t) &flags, 3); flags.rxl = sc->cur_rxl_index; flags.rate = sc->csr1.rate; flags.fixed_rxl = (sc->delta_rxl == 0); flags.fixed_rate = 1; SBNI_UNLOCK(sc); bcopy(&flags, &ifr->ifr_ifru, sizeof(flags)); break; case SIOCGINSTATS: in_stats = malloc(sizeof(struct sbni_in_stats), M_DEVBUF, M_WAITOK); SBNI_LOCK(sc); bcopy(&sc->in_stats, in_stats, sizeof(struct sbni_in_stats)); SBNI_UNLOCK(sc); error = copyout(in_stats, ifr_data_get_ptr(ifr), sizeof(struct sbni_in_stats)); free(in_stats, M_DEVBUF); break; case SIOCSHWFLAGS: /* set flags */ /* root only */ error = priv_check(td, PRIV_DRIVER); if (error) break; bcopy(&ifr->ifr_ifru, &flags, sizeof(flags)); SBNI_LOCK(sc); if (flags.fixed_rxl) { sc->delta_rxl = 0; sc->cur_rxl_index = flags.rxl; } else { sc->delta_rxl = DEF_RXL_DELTA; sc->cur_rxl_index = DEF_RXL; } sc->csr1.rxl = rxl_tab[sc->cur_rxl_index]; sc->csr1.rate = flags.fixed_rate ? flags.rate : DEFAULT_RATE; if (flags.mac_addr) bcopy((caddr_t) &flags, (caddr_t) IF_LLADDR(sc->ifp)+3, 3); /* Don't be afraid... */ sbni_outb(sc, CSR1, *(char*)(&sc->csr1) | PR_RES); SBNI_UNLOCK(sc); break; case SIOCRINSTATS: SBNI_LOCK(sc); if (!(error = priv_check(td, PRIV_DRIVER))) /* root only */ bzero(&sc->in_stats, sizeof(struct sbni_in_stats)); SBNI_UNLOCK(sc); break; default: error = ether_ioctl(ifp, command, data); break; } return (error); } /* -------------------------------------------------------------------------- */ static u_int32_t calc_crc32(u_int32_t crc, caddr_t p, u_int len) { while (len--) crc = CRC32(*p++, crc); return (crc); } static u_int32_t crc32tab[] __aligned(8) = { 0xD202EF8D, 0xA505DF1B, 0x3C0C8EA1, 0x4B0BBE37, 0xD56F2B94, 0xA2681B02, 0x3B614AB8, 0x4C667A2E, 0xDCD967BF, 0xABDE5729, 0x32D70693, 0x45D03605, 0xDBB4A3A6, 0xACB39330, 0x35BAC28A, 0x42BDF21C, 0xCFB5FFE9, 0xB8B2CF7F, 0x21BB9EC5, 0x56BCAE53, 0xC8D83BF0, 0xBFDF0B66, 0x26D65ADC, 0x51D16A4A, 0xC16E77DB, 0xB669474D, 0x2F6016F7, 0x58672661, 0xC603B3C2, 0xB1048354, 0x280DD2EE, 0x5F0AE278, 0xE96CCF45, 0x9E6BFFD3, 0x0762AE69, 0x70659EFF, 0xEE010B5C, 0x99063BCA, 0x000F6A70, 0x77085AE6, 0xE7B74777, 0x90B077E1, 0x09B9265B, 0x7EBE16CD, 0xE0DA836E, 0x97DDB3F8, 0x0ED4E242, 0x79D3D2D4, 0xF4DBDF21, 0x83DCEFB7, 0x1AD5BE0D, 0x6DD28E9B, 0xF3B61B38, 0x84B12BAE, 0x1DB87A14, 0x6ABF4A82, 0xFA005713, 0x8D076785, 0x140E363F, 0x630906A9, 0xFD6D930A, 0x8A6AA39C, 0x1363F226, 0x6464C2B0, 0xA4DEAE1D, 0xD3D99E8B, 0x4AD0CF31, 0x3DD7FFA7, 0xA3B36A04, 0xD4B45A92, 0x4DBD0B28, 0x3ABA3BBE, 0xAA05262F, 0xDD0216B9, 0x440B4703, 0x330C7795, 0xAD68E236, 0xDA6FD2A0, 0x4366831A, 0x3461B38C, 0xB969BE79, 0xCE6E8EEF, 0x5767DF55, 0x2060EFC3, 0xBE047A60, 0xC9034AF6, 0x500A1B4C, 0x270D2BDA, 0xB7B2364B, 0xC0B506DD, 0x59BC5767, 0x2EBB67F1, 0xB0DFF252, 0xC7D8C2C4, 0x5ED1937E, 0x29D6A3E8, 0x9FB08ED5, 0xE8B7BE43, 0x71BEEFF9, 0x06B9DF6F, 0x98DD4ACC, 0xEFDA7A5A, 0x76D32BE0, 0x01D41B76, 0x916B06E7, 0xE66C3671, 0x7F6567CB, 0x0862575D, 0x9606C2FE, 0xE101F268, 0x7808A3D2, 0x0F0F9344, 0x82079EB1, 0xF500AE27, 0x6C09FF9D, 0x1B0ECF0B, 0x856A5AA8, 0xF26D6A3E, 0x6B643B84, 0x1C630B12, 0x8CDC1683, 0xFBDB2615, 0x62D277AF, 0x15D54739, 0x8BB1D29A, 0xFCB6E20C, 0x65BFB3B6, 0x12B88320, 0x3FBA6CAD, 0x48BD5C3B, 0xD1B40D81, 0xA6B33D17, 0x38D7A8B4, 0x4FD09822, 0xD6D9C998, 0xA1DEF90E, 0x3161E49F, 0x4666D409, 0xDF6F85B3, 0xA868B525, 0x360C2086, 0x410B1010, 0xD80241AA, 0xAF05713C, 0x220D7CC9, 0x550A4C5F, 0xCC031DE5, 0xBB042D73, 
0x2560B8D0, 0x52678846, 0xCB6ED9FC, 0xBC69E96A, 0x2CD6F4FB, 0x5BD1C46D, 0xC2D895D7, 0xB5DFA541, 0x2BBB30E2, 0x5CBC0074, 0xC5B551CE, 0xB2B26158, 0x04D44C65, 0x73D37CF3, 0xEADA2D49, 0x9DDD1DDF, 0x03B9887C, 0x74BEB8EA, 0xEDB7E950, 0x9AB0D9C6, 0x0A0FC457, 0x7D08F4C1, 0xE401A57B, 0x930695ED, 0x0D62004E, 0x7A6530D8, 0xE36C6162, 0x946B51F4, 0x19635C01, 0x6E646C97, 0xF76D3D2D, 0x806A0DBB, 0x1E0E9818, 0x6909A88E, 0xF000F934, 0x8707C9A2, 0x17B8D433, 0x60BFE4A5, 0xF9B6B51F, 0x8EB18589, 0x10D5102A, 0x67D220BC, 0xFEDB7106, 0x89DC4190, 0x49662D3D, 0x3E611DAB, 0xA7684C11, 0xD06F7C87, 0x4E0BE924, 0x390CD9B2, 0xA0058808, 0xD702B89E, 0x47BDA50F, 0x30BA9599, 0xA9B3C423, 0xDEB4F4B5, 0x40D06116, 0x37D75180, 0xAEDE003A, 0xD9D930AC, 0x54D13D59, 0x23D60DCF, 0xBADF5C75, 0xCDD86CE3, 0x53BCF940, 0x24BBC9D6, 0xBDB2986C, 0xCAB5A8FA, 0x5A0AB56B, 0x2D0D85FD, 0xB404D447, 0xC303E4D1, 0x5D677172, 0x2A6041E4, 0xB369105E, 0xC46E20C8, 0x72080DF5, 0x050F3D63, 0x9C066CD9, 0xEB015C4F, 0x7565C9EC, 0x0262F97A, 0x9B6BA8C0, 0xEC6C9856, 0x7CD385C7, 0x0BD4B551, 0x92DDE4EB, 0xE5DAD47D, 0x7BBE41DE, 0x0CB97148, 0x95B020F2, 0xE2B71064, 0x6FBF1D91, 0x18B82D07, 0x81B17CBD, 0xF6B64C2B, 0x68D2D988, 0x1FD5E91E, 0x86DCB8A4, 0xF1DB8832, 0x616495A3, 0x1663A535, 0x8F6AF48F, 0xF86DC419, 0x660951BA, 0x110E612C, 0x88073096, 0xFF000000 }; diff --git a/sys/dev/virtio/network/if_vtnet.c b/sys/dev/virtio/network/if_vtnet.c index ceb3ffaaf2b4..b388e43d92a6 100644 --- a/sys/dev/virtio/network/if_vtnet.c +++ b/sys/dev/virtio/network/if_vtnet.c @@ -1,4090 +1,4091 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011, Bryan Venteicher * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Driver for VirtIO network devices. 
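 * The device provides paired receive/transmit virtqueues (several pairs when
 * VIRTIO_NET_F_MQ is negotiated) plus an optional control virtqueue used for
 * MAC, RX-mode and VLAN filter programming.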
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "virtio_if.h" #include "opt_inet.h" #include "opt_inet6.h" static int vtnet_modevent(module_t, int, void *); static int vtnet_probe(device_t); static int vtnet_attach(device_t); static int vtnet_detach(device_t); static int vtnet_suspend(device_t); static int vtnet_resume(device_t); static int vtnet_shutdown(device_t); static int vtnet_attach_completed(device_t); static int vtnet_config_change(device_t); static void vtnet_negotiate_features(struct vtnet_softc *); static void vtnet_setup_features(struct vtnet_softc *); static int vtnet_init_rxq(struct vtnet_softc *, int); static int vtnet_init_txq(struct vtnet_softc *, int); static int vtnet_alloc_rxtx_queues(struct vtnet_softc *); static void vtnet_free_rxtx_queues(struct vtnet_softc *); static int vtnet_alloc_rx_filters(struct vtnet_softc *); static void vtnet_free_rx_filters(struct vtnet_softc *); static int vtnet_alloc_virtqueues(struct vtnet_softc *); static int vtnet_setup_interface(struct vtnet_softc *); static int vtnet_change_mtu(struct vtnet_softc *, int); static int vtnet_ioctl(struct ifnet *, u_long, caddr_t); static uint64_t vtnet_get_counter(struct ifnet *, ift_counter); static int vtnet_rxq_populate(struct vtnet_rxq *); static void vtnet_rxq_free_mbufs(struct vtnet_rxq *); static struct mbuf * vtnet_rx_alloc_buf(struct vtnet_softc *, int , struct mbuf **); static int vtnet_rxq_replace_lro_nomgr_buf(struct vtnet_rxq *, struct mbuf *, int); static int vtnet_rxq_replace_buf(struct vtnet_rxq *, struct mbuf *, int); static int vtnet_rxq_enqueue_buf(struct vtnet_rxq *, struct mbuf *); static int vtnet_rxq_new_buf(struct vtnet_rxq *); static int vtnet_rxq_csum(struct vtnet_rxq *, struct mbuf *, struct virtio_net_hdr *); static void vtnet_rxq_discard_merged_bufs(struct vtnet_rxq *, int); static void vtnet_rxq_discard_buf(struct vtnet_rxq *, struct mbuf *); static int vtnet_rxq_merged_eof(struct vtnet_rxq *, struct mbuf *, int); static void vtnet_rxq_input(struct vtnet_rxq *, struct mbuf *, struct virtio_net_hdr *); static int vtnet_rxq_eof(struct vtnet_rxq *); static void vtnet_rx_vq_intr(void *); static void vtnet_rxq_tq_intr(void *, int); static int vtnet_txq_below_threshold(struct vtnet_txq *); static int vtnet_txq_notify(struct vtnet_txq *); static void vtnet_txq_free_mbufs(struct vtnet_txq *); static int vtnet_txq_offload_ctx(struct vtnet_txq *, struct mbuf *, int *, int *, int *); static int vtnet_txq_offload_tso(struct vtnet_txq *, struct mbuf *, int, int, struct virtio_net_hdr *); static struct mbuf * vtnet_txq_offload(struct vtnet_txq *, struct mbuf *, struct virtio_net_hdr *); static int vtnet_txq_enqueue_buf(struct vtnet_txq *, struct mbuf **, struct vtnet_tx_header *); static int vtnet_txq_encap(struct vtnet_txq *, struct mbuf **, int); #ifdef VTNET_LEGACY_TX static void vtnet_start_locked(struct vtnet_txq *, struct ifnet *); static void vtnet_start(struct ifnet *); #else static int vtnet_txq_mq_start_locked(struct vtnet_txq *, struct mbuf *); static int vtnet_txq_mq_start(struct ifnet *, struct mbuf *); static void vtnet_txq_tq_deferred(void *, int); #endif static void 
vtnet_txq_start(struct vtnet_txq *); static void vtnet_txq_tq_intr(void *, int); static int vtnet_txq_eof(struct vtnet_txq *); static void vtnet_tx_vq_intr(void *); static void vtnet_tx_start_all(struct vtnet_softc *); #ifndef VTNET_LEGACY_TX static void vtnet_qflush(struct ifnet *); #endif static int vtnet_watchdog(struct vtnet_txq *); static void vtnet_accum_stats(struct vtnet_softc *, struct vtnet_rxq_stats *, struct vtnet_txq_stats *); static void vtnet_tick(void *); static void vtnet_start_taskqueues(struct vtnet_softc *); static void vtnet_free_taskqueues(struct vtnet_softc *); static void vtnet_drain_taskqueues(struct vtnet_softc *); static void vtnet_drain_rxtx_queues(struct vtnet_softc *); static void vtnet_stop_rendezvous(struct vtnet_softc *); static void vtnet_stop(struct vtnet_softc *); static int vtnet_virtio_reinit(struct vtnet_softc *); static void vtnet_init_rx_filters(struct vtnet_softc *); static int vtnet_init_rx_queues(struct vtnet_softc *); static int vtnet_init_tx_queues(struct vtnet_softc *); static int vtnet_init_rxtx_queues(struct vtnet_softc *); static void vtnet_set_active_vq_pairs(struct vtnet_softc *); static int vtnet_reinit(struct vtnet_softc *); static void vtnet_init_locked(struct vtnet_softc *); static void vtnet_init(void *); static void vtnet_free_ctrl_vq(struct vtnet_softc *); static void vtnet_exec_ctrl_cmd(struct vtnet_softc *, void *, struct sglist *, int, int); static int vtnet_ctrl_mac_cmd(struct vtnet_softc *, uint8_t *); static int vtnet_ctrl_mq_cmd(struct vtnet_softc *, uint16_t); static int vtnet_ctrl_rx_cmd(struct vtnet_softc *, int, int); static int vtnet_set_promisc(struct vtnet_softc *, int); static int vtnet_set_allmulti(struct vtnet_softc *, int); static void vtnet_attach_disable_promisc(struct vtnet_softc *); static void vtnet_rx_filter(struct vtnet_softc *); static void vtnet_rx_filter_mac(struct vtnet_softc *); static int vtnet_exec_vlan_filter(struct vtnet_softc *, int, uint16_t); static void vtnet_rx_filter_vlan(struct vtnet_softc *); static void vtnet_update_vlan_filter(struct vtnet_softc *, int, uint16_t); static void vtnet_register_vlan(void *, struct ifnet *, uint16_t); static void vtnet_unregister_vlan(void *, struct ifnet *, uint16_t); static int vtnet_is_link_up(struct vtnet_softc *); static void vtnet_update_link_status(struct vtnet_softc *); static int vtnet_ifmedia_upd(struct ifnet *); static void vtnet_ifmedia_sts(struct ifnet *, struct ifmediareq *); static void vtnet_get_hwaddr(struct vtnet_softc *); static void vtnet_set_hwaddr(struct vtnet_softc *); static void vtnet_vlan_tag_remove(struct mbuf *); static void vtnet_set_rx_process_limit(struct vtnet_softc *); static void vtnet_set_tx_intr_threshold(struct vtnet_softc *); static void vtnet_setup_rxq_sysctl(struct sysctl_ctx_list *, struct sysctl_oid_list *, struct vtnet_rxq *); static void vtnet_setup_txq_sysctl(struct sysctl_ctx_list *, struct sysctl_oid_list *, struct vtnet_txq *); static void vtnet_setup_queue_sysctl(struct vtnet_softc *); static void vtnet_setup_sysctl(struct vtnet_softc *); static int vtnet_rxq_enable_intr(struct vtnet_rxq *); static void vtnet_rxq_disable_intr(struct vtnet_rxq *); static int vtnet_txq_enable_intr(struct vtnet_txq *); static void vtnet_txq_disable_intr(struct vtnet_txq *); static void vtnet_enable_rx_interrupts(struct vtnet_softc *); static void vtnet_enable_tx_interrupts(struct vtnet_softc *); static void vtnet_enable_interrupts(struct vtnet_softc *); static void vtnet_disable_rx_interrupts(struct vtnet_softc *); static void 
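/*
 * The hw.vtnet.* knobs declared below are read-only boot tunables
 * (CTLFLAG_RDTUN); set them from /boot/loader.conf, e.g. (illustrative
 * values):
 *
 *	hw.vtnet.lro_disable=1
 *	hw.vtnet.mq_max_pairs=4
 */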
vtnet_disable_tx_interrupts(struct vtnet_softc *); static void vtnet_disable_interrupts(struct vtnet_softc *); static int vtnet_tunable_int(struct vtnet_softc *, const char *, int); DEBUGNET_DEFINE(vtnet); /* Tunables. */ static SYSCTL_NODE(_hw, OID_AUTO, vtnet, CTLFLAG_RD, 0, "VNET driver parameters"); static int vtnet_csum_disable = 0; TUNABLE_INT("hw.vtnet.csum_disable", &vtnet_csum_disable); SYSCTL_INT(_hw_vtnet, OID_AUTO, csum_disable, CTLFLAG_RDTUN, &vtnet_csum_disable, 0, "Disables receive and send checksum offload"); static int vtnet_tso_disable = 0; TUNABLE_INT("hw.vtnet.tso_disable", &vtnet_tso_disable); SYSCTL_INT(_hw_vtnet, OID_AUTO, tso_disable, CTLFLAG_RDTUN, &vtnet_tso_disable, 0, "Disables TCP Segmentation Offload"); static int vtnet_lro_disable = 0; TUNABLE_INT("hw.vtnet.lro_disable", &vtnet_lro_disable); SYSCTL_INT(_hw_vtnet, OID_AUTO, lro_disable, CTLFLAG_RDTUN, &vtnet_lro_disable, 0, "Disables TCP Large Receive Offload"); static int vtnet_mq_disable = 0; TUNABLE_INT("hw.vtnet.mq_disable", &vtnet_mq_disable); SYSCTL_INT(_hw_vtnet, OID_AUTO, mq_disable, CTLFLAG_RDTUN, &vtnet_mq_disable, 0, "Disables Multi Queue support"); static int vtnet_mq_max_pairs = VTNET_MAX_QUEUE_PAIRS; TUNABLE_INT("hw.vtnet.mq_max_pairs", &vtnet_mq_max_pairs); SYSCTL_INT(_hw_vtnet, OID_AUTO, mq_max_pairs, CTLFLAG_RDTUN, &vtnet_mq_max_pairs, 0, "Sets the maximum number of Multi Queue pairs"); static int vtnet_rx_process_limit = 512; TUNABLE_INT("hw.vtnet.rx_process_limit", &vtnet_rx_process_limit); SYSCTL_INT(_hw_vtnet, OID_AUTO, rx_process_limit, CTLFLAG_RDTUN, &vtnet_rx_process_limit, 0, "Limits the number RX segments processed in a single pass"); static uma_zone_t vtnet_tx_header_zone; static struct virtio_feature_desc vtnet_feature_desc[] = { { VIRTIO_NET_F_CSUM, "TxChecksum" }, { VIRTIO_NET_F_GUEST_CSUM, "RxChecksum" }, { VIRTIO_NET_F_MAC, "MacAddress" }, { VIRTIO_NET_F_GSO, "TxAllGSO" }, { VIRTIO_NET_F_GUEST_TSO4, "RxTSOv4" }, { VIRTIO_NET_F_GUEST_TSO6, "RxTSOv6" }, { VIRTIO_NET_F_GUEST_ECN, "RxECN" }, { VIRTIO_NET_F_GUEST_UFO, "RxUFO" }, { VIRTIO_NET_F_HOST_TSO4, "TxTSOv4" }, { VIRTIO_NET_F_HOST_TSO6, "TxTSOv6" }, { VIRTIO_NET_F_HOST_ECN, "TxTSOECN" }, { VIRTIO_NET_F_HOST_UFO, "TxUFO" }, { VIRTIO_NET_F_MRG_RXBUF, "MrgRxBuf" }, { VIRTIO_NET_F_STATUS, "Status" }, { VIRTIO_NET_F_CTRL_VQ, "ControlVq" }, { VIRTIO_NET_F_CTRL_RX, "RxMode" }, { VIRTIO_NET_F_CTRL_VLAN, "VLanFilter" }, { VIRTIO_NET_F_CTRL_RX_EXTRA, "RxModeExtra" }, { VIRTIO_NET_F_GUEST_ANNOUNCE, "GuestAnnounce" }, { VIRTIO_NET_F_MQ, "Multiqueue" }, { VIRTIO_NET_F_CTRL_MAC_ADDR, "SetMacAddress" }, { 0, NULL } }; static device_method_t vtnet_methods[] = { /* Device methods. */ DEVMETHOD(device_probe, vtnet_probe), DEVMETHOD(device_attach, vtnet_attach), DEVMETHOD(device_detach, vtnet_detach), DEVMETHOD(device_suspend, vtnet_suspend), DEVMETHOD(device_resume, vtnet_resume), DEVMETHOD(device_shutdown, vtnet_shutdown), /* VirtIO methods. 
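 * (virtio_attach_completed fires once device attach has finished and is
 * used here to run vtnet_attach_disable_promisc(); virtio_config_change
 * fires when the host updates the device configuration, e.g. link status,
 * and triggers vtnet_update_link_status().)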
*/ DEVMETHOD(virtio_attach_completed, vtnet_attach_completed), DEVMETHOD(virtio_config_change, vtnet_config_change), DEVMETHOD_END }; #ifdef DEV_NETMAP #include #endif /* DEV_NETMAP */ static driver_t vtnet_driver = { "vtnet", vtnet_methods, sizeof(struct vtnet_softc) }; static devclass_t vtnet_devclass; DRIVER_MODULE(vtnet, virtio_mmio, vtnet_driver, vtnet_devclass, vtnet_modevent, 0); DRIVER_MODULE(vtnet, virtio_pci, vtnet_driver, vtnet_devclass, vtnet_modevent, 0); MODULE_VERSION(vtnet, 1); MODULE_DEPEND(vtnet, virtio, 1, 1, 1); #ifdef DEV_NETMAP MODULE_DEPEND(vtnet, netmap, 1, 1, 1); #endif /* DEV_NETMAP */ VIRTIO_SIMPLE_PNPTABLE(vtnet, VIRTIO_ID_NETWORK, "VirtIO Networking Adapter"); VIRTIO_SIMPLE_PNPINFO(virtio_mmio, vtnet); VIRTIO_SIMPLE_PNPINFO(virtio_pci, vtnet); static int vtnet_modevent(module_t mod, int type, void *unused) { int error = 0; static int loaded = 0; switch (type) { case MOD_LOAD: if (loaded++ == 0) { vtnet_tx_header_zone = uma_zcreate("vtnet_tx_hdr", sizeof(struct vtnet_tx_header), NULL, NULL, NULL, NULL, 0, 0); #ifdef DEBUGNET /* * We need to allocate from this zone in the transmit path, so ensure * that we have at least one item per header available. * XXX add a separate zone like we do for mbufs? otherwise we may alloc * buckets */ uma_zone_reserve(vtnet_tx_header_zone, DEBUGNET_MAX_IN_FLIGHT * 2); uma_prealloc(vtnet_tx_header_zone, DEBUGNET_MAX_IN_FLIGHT * 2); #endif } break; case MOD_QUIESCE: if (uma_zone_get_cur(vtnet_tx_header_zone) > 0) error = EBUSY; break; case MOD_UNLOAD: if (--loaded == 0) { uma_zdestroy(vtnet_tx_header_zone); vtnet_tx_header_zone = NULL; } break; case MOD_SHUTDOWN: break; default: error = EOPNOTSUPP; break; } return (error); } static int vtnet_probe(device_t dev) { return (VIRTIO_SIMPLE_PROBE(dev, vtnet)); } static int vtnet_attach(device_t dev) { struct vtnet_softc *sc; int error; sc = device_get_softc(dev); sc->vtnet_dev = dev; /* Register our feature descriptions. */ virtio_set_feature_desc(dev, vtnet_feature_desc); VTNET_CORE_LOCK_INIT(sc); callout_init_mtx(&sc->vtnet_tick_ch, VTNET_CORE_MTX(sc), 0); vtnet_setup_sysctl(sc); vtnet_setup_features(sc); error = vtnet_alloc_rx_filters(sc); if (error) { device_printf(dev, "cannot allocate Rx filters\n"); goto fail; } error = vtnet_alloc_rxtx_queues(sc); if (error) { device_printf(dev, "cannot allocate queues\n"); goto fail; } error = vtnet_alloc_virtqueues(sc); if (error) { device_printf(dev, "cannot allocate virtqueues\n"); goto fail; } error = vtnet_setup_interface(sc); if (error) { device_printf(dev, "cannot setup interface\n"); goto fail; } error = virtio_setup_intr(dev, INTR_TYPE_NET); if (error) { device_printf(dev, "cannot setup virtqueue interrupts\n"); /* BMV: This will crash if during boot! 
*/ ether_ifdetach(sc->vtnet_ifp); goto fail; } #ifdef DEV_NETMAP vtnet_netmap_attach(sc); #endif /* DEV_NETMAP */ vtnet_start_taskqueues(sc); fail: if (error) vtnet_detach(dev); return (error); } static int vtnet_detach(device_t dev) { struct vtnet_softc *sc; struct ifnet *ifp; sc = device_get_softc(dev); ifp = sc->vtnet_ifp; if (device_is_attached(dev)) { VTNET_CORE_LOCK(sc); vtnet_stop(sc); VTNET_CORE_UNLOCK(sc); callout_drain(&sc->vtnet_tick_ch); vtnet_drain_taskqueues(sc); ether_ifdetach(ifp); } #ifdef DEV_NETMAP netmap_detach(ifp); #endif /* DEV_NETMAP */ vtnet_free_taskqueues(sc); if (sc->vtnet_vlan_attach != NULL) { EVENTHANDLER_DEREGISTER(vlan_config, sc->vtnet_vlan_attach); sc->vtnet_vlan_attach = NULL; } if (sc->vtnet_vlan_detach != NULL) { EVENTHANDLER_DEREGISTER(vlan_unconfig, sc->vtnet_vlan_detach); sc->vtnet_vlan_detach = NULL; } ifmedia_removeall(&sc->vtnet_media); if (ifp != NULL) { if_free(ifp); sc->vtnet_ifp = NULL; } vtnet_free_rxtx_queues(sc); vtnet_free_rx_filters(sc); if (sc->vtnet_ctrl_vq != NULL) vtnet_free_ctrl_vq(sc); VTNET_CORE_LOCK_DESTROY(sc); return (0); } static int vtnet_suspend(device_t dev) { struct vtnet_softc *sc; sc = device_get_softc(dev); VTNET_CORE_LOCK(sc); vtnet_stop(sc); sc->vtnet_flags |= VTNET_FLAG_SUSPENDED; VTNET_CORE_UNLOCK(sc); return (0); } static int vtnet_resume(device_t dev) { struct vtnet_softc *sc; struct ifnet *ifp; sc = device_get_softc(dev); ifp = sc->vtnet_ifp; VTNET_CORE_LOCK(sc); if (ifp->if_flags & IFF_UP) vtnet_init_locked(sc); sc->vtnet_flags &= ~VTNET_FLAG_SUSPENDED; VTNET_CORE_UNLOCK(sc); return (0); } static int vtnet_shutdown(device_t dev) { /* * Suspend already does all of what we need to * do here; we just never expect to be resumed. */ return (vtnet_suspend(dev)); } static int vtnet_attach_completed(device_t dev) { vtnet_attach_disable_promisc(device_get_softc(dev)); return (0); } static int vtnet_config_change(device_t dev) { struct vtnet_softc *sc; sc = device_get_softc(dev); VTNET_CORE_LOCK(sc); vtnet_update_link_status(sc); if (sc->vtnet_link_active != 0) vtnet_tx_start_all(sc); VTNET_CORE_UNLOCK(sc); return (0); } static void vtnet_negotiate_features(struct vtnet_softc *sc) { device_t dev; uint64_t mask, features; dev = sc->vtnet_dev; mask = 0; /* * TSO and LRO are only available when their corresponding checksum * offload feature is also negotiated. */ if (vtnet_tunable_int(sc, "csum_disable", vtnet_csum_disable)) { mask |= VIRTIO_NET_F_CSUM | VIRTIO_NET_F_GUEST_CSUM; mask |= VTNET_TSO_FEATURES | VTNET_LRO_FEATURES; } if (vtnet_tunable_int(sc, "tso_disable", vtnet_tso_disable)) mask |= VTNET_TSO_FEATURES; if (vtnet_tunable_int(sc, "lro_disable", vtnet_lro_disable)) mask |= VTNET_LRO_FEATURES; #ifndef VTNET_LEGACY_TX if (vtnet_tunable_int(sc, "mq_disable", vtnet_mq_disable)) mask |= VIRTIO_NET_F_MQ; #else mask |= VIRTIO_NET_F_MQ; #endif features = VTNET_FEATURES & ~mask; sc->vtnet_features = virtio_negotiate_features(dev, features); if (virtio_with_feature(dev, VTNET_LRO_FEATURES) && virtio_with_feature(dev, VIRTIO_NET_F_MRG_RXBUF) == 0) { /* * LRO without mergeable buffers requires special care. This * is not ideal because every receive buffer must be large * enough to hold the maximum TCP packet, the Ethernet header, * and the header. This requires up to 34 descriptors with * MCLBYTES clusters. If we do not have indirect descriptors, * LRO is disabled since the virtqueue will not contain very * many receive buffers. 
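 * (A rough sketch of that arithmetic, assuming the usual sizes: a 64 KB
 * TCP frame split across 2 KB MCLBYTES clusters needs about 32 data
 * segments, plus a couple more for the virtio-net header and Ethernet
 * header area, which is roughly where the figure of 34 descriptors per
 * receive buffer comes from.)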
*/ if (!virtio_with_feature(dev, VIRTIO_RING_F_INDIRECT_DESC)) { device_printf(dev, "LRO disabled due to both mergeable buffers and " "indirect descriptors not negotiated\n"); features &= ~VTNET_LRO_FEATURES; sc->vtnet_features = virtio_negotiate_features(dev, features); } else sc->vtnet_flags |= VTNET_FLAG_LRO_NOMRG; } } static void vtnet_setup_features(struct vtnet_softc *sc) { device_t dev; dev = sc->vtnet_dev; vtnet_negotiate_features(sc); if (virtio_with_feature(dev, VIRTIO_RING_F_INDIRECT_DESC)) sc->vtnet_flags |= VTNET_FLAG_INDIRECT; if (virtio_with_feature(dev, VIRTIO_RING_F_EVENT_IDX)) sc->vtnet_flags |= VTNET_FLAG_EVENT_IDX; if (virtio_with_feature(dev, VIRTIO_NET_F_MAC)) { /* This feature should always be negotiated. */ sc->vtnet_flags |= VTNET_FLAG_MAC; } if (virtio_with_feature(dev, VIRTIO_NET_F_MRG_RXBUF)) { sc->vtnet_flags |= VTNET_FLAG_MRG_RXBUFS; sc->vtnet_hdr_size = sizeof(struct virtio_net_hdr_mrg_rxbuf); } else sc->vtnet_hdr_size = sizeof(struct virtio_net_hdr); if (sc->vtnet_flags & VTNET_FLAG_MRG_RXBUFS) sc->vtnet_rx_nsegs = VTNET_MRG_RX_SEGS; else if (sc->vtnet_flags & VTNET_FLAG_LRO_NOMRG) sc->vtnet_rx_nsegs = VTNET_MAX_RX_SEGS; else sc->vtnet_rx_nsegs = VTNET_MIN_RX_SEGS; if (virtio_with_feature(dev, VIRTIO_NET_F_GSO) || virtio_with_feature(dev, VIRTIO_NET_F_HOST_TSO4) || virtio_with_feature(dev, VIRTIO_NET_F_HOST_TSO6)) sc->vtnet_tx_nsegs = VTNET_MAX_TX_SEGS; else sc->vtnet_tx_nsegs = VTNET_MIN_TX_SEGS; if (virtio_with_feature(dev, VIRTIO_NET_F_CTRL_VQ)) { sc->vtnet_flags |= VTNET_FLAG_CTRL_VQ; if (virtio_with_feature(dev, VIRTIO_NET_F_CTRL_RX)) sc->vtnet_flags |= VTNET_FLAG_CTRL_RX; if (virtio_with_feature(dev, VIRTIO_NET_F_CTRL_VLAN)) sc->vtnet_flags |= VTNET_FLAG_VLAN_FILTER; if (virtio_with_feature(dev, VIRTIO_NET_F_CTRL_MAC_ADDR)) sc->vtnet_flags |= VTNET_FLAG_CTRL_MAC; } if (virtio_with_feature(dev, VIRTIO_NET_F_MQ) && sc->vtnet_flags & VTNET_FLAG_CTRL_VQ) { sc->vtnet_max_vq_pairs = virtio_read_dev_config_2(dev, offsetof(struct virtio_net_config, max_virtqueue_pairs)); } else sc->vtnet_max_vq_pairs = 1; if (sc->vtnet_max_vq_pairs > 1) { /* * Limit the maximum number of queue pairs to the lower of * the number of CPUs and the configured maximum. * The actual number of queues that get used may be less. */ int max; max = vtnet_tunable_int(sc, "mq_max_pairs", vtnet_mq_max_pairs); if (max > VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MIN) { if (max > mp_ncpus) max = mp_ncpus; if (max > VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX) max = VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX; if (max > 1) { sc->vtnet_requested_vq_pairs = max; sc->vtnet_flags |= VTNET_FLAG_MULTIQ; } } } } static int vtnet_init_rxq(struct vtnet_softc *sc, int id) { struct vtnet_rxq *rxq; rxq = &sc->vtnet_rxqs[id]; snprintf(rxq->vtnrx_name, sizeof(rxq->vtnrx_name), "%s-rx%d", device_get_nameunit(sc->vtnet_dev), id); mtx_init(&rxq->vtnrx_mtx, rxq->vtnrx_name, NULL, MTX_DEF); rxq->vtnrx_sc = sc; rxq->vtnrx_id = id; rxq->vtnrx_sg = sglist_alloc(sc->vtnet_rx_nsegs, M_NOWAIT); if (rxq->vtnrx_sg == NULL) return (ENOMEM); NET_TASK_INIT(&rxq->vtnrx_intrtask, 0, vtnet_rxq_tq_intr, rxq); rxq->vtnrx_tq = taskqueue_create(rxq->vtnrx_name, M_NOWAIT, taskqueue_thread_enqueue, &rxq->vtnrx_tq); return (rxq->vtnrx_tq == NULL ? 
ENOMEM : 0); } static int vtnet_init_txq(struct vtnet_softc *sc, int id) { struct vtnet_txq *txq; txq = &sc->vtnet_txqs[id]; snprintf(txq->vtntx_name, sizeof(txq->vtntx_name), "%s-tx%d", device_get_nameunit(sc->vtnet_dev), id); mtx_init(&txq->vtntx_mtx, txq->vtntx_name, NULL, MTX_DEF); txq->vtntx_sc = sc; txq->vtntx_id = id; txq->vtntx_sg = sglist_alloc(sc->vtnet_tx_nsegs, M_NOWAIT); if (txq->vtntx_sg == NULL) return (ENOMEM); #ifndef VTNET_LEGACY_TX txq->vtntx_br = buf_ring_alloc(VTNET_DEFAULT_BUFRING_SIZE, M_DEVBUF, M_NOWAIT, &txq->vtntx_mtx); if (txq->vtntx_br == NULL) return (ENOMEM); TASK_INIT(&txq->vtntx_defrtask, 0, vtnet_txq_tq_deferred, txq); #endif TASK_INIT(&txq->vtntx_intrtask, 0, vtnet_txq_tq_intr, txq); txq->vtntx_tq = taskqueue_create(txq->vtntx_name, M_NOWAIT, taskqueue_thread_enqueue, &txq->vtntx_tq); if (txq->vtntx_tq == NULL) return (ENOMEM); return (0); } static int vtnet_alloc_rxtx_queues(struct vtnet_softc *sc) { int i, npairs, error; npairs = sc->vtnet_max_vq_pairs; sc->vtnet_rxqs = malloc(sizeof(struct vtnet_rxq) * npairs, M_DEVBUF, M_NOWAIT | M_ZERO); sc->vtnet_txqs = malloc(sizeof(struct vtnet_txq) * npairs, M_DEVBUF, M_NOWAIT | M_ZERO); if (sc->vtnet_rxqs == NULL || sc->vtnet_txqs == NULL) return (ENOMEM); for (i = 0; i < npairs; i++) { error = vtnet_init_rxq(sc, i); if (error) return (error); error = vtnet_init_txq(sc, i); if (error) return (error); } vtnet_setup_queue_sysctl(sc); return (0); } static void vtnet_destroy_rxq(struct vtnet_rxq *rxq) { rxq->vtnrx_sc = NULL; rxq->vtnrx_id = -1; if (rxq->vtnrx_sg != NULL) { sglist_free(rxq->vtnrx_sg); rxq->vtnrx_sg = NULL; } if (mtx_initialized(&rxq->vtnrx_mtx) != 0) mtx_destroy(&rxq->vtnrx_mtx); } static void vtnet_destroy_txq(struct vtnet_txq *txq) { txq->vtntx_sc = NULL; txq->vtntx_id = -1; if (txq->vtntx_sg != NULL) { sglist_free(txq->vtntx_sg); txq->vtntx_sg = NULL; } #ifndef VTNET_LEGACY_TX if (txq->vtntx_br != NULL) { buf_ring_free(txq->vtntx_br, M_DEVBUF); txq->vtntx_br = NULL; } #endif if (mtx_initialized(&txq->vtntx_mtx) != 0) mtx_destroy(&txq->vtntx_mtx); } static void vtnet_free_rxtx_queues(struct vtnet_softc *sc) { int i; if (sc->vtnet_rxqs != NULL) { for (i = 0; i < sc->vtnet_max_vq_pairs; i++) vtnet_destroy_rxq(&sc->vtnet_rxqs[i]); free(sc->vtnet_rxqs, M_DEVBUF); sc->vtnet_rxqs = NULL; } if (sc->vtnet_txqs != NULL) { for (i = 0; i < sc->vtnet_max_vq_pairs; i++) vtnet_destroy_txq(&sc->vtnet_txqs[i]); free(sc->vtnet_txqs, M_DEVBUF); sc->vtnet_txqs = NULL; } } static int vtnet_alloc_rx_filters(struct vtnet_softc *sc) { if (sc->vtnet_flags & VTNET_FLAG_CTRL_RX) { sc->vtnet_mac_filter = malloc(sizeof(struct vtnet_mac_filter), M_DEVBUF, M_NOWAIT | M_ZERO); if (sc->vtnet_mac_filter == NULL) return (ENOMEM); } if (sc->vtnet_flags & VTNET_FLAG_VLAN_FILTER) { sc->vtnet_vlan_filter = malloc(sizeof(uint32_t) * VTNET_VLAN_FILTER_NWORDS, M_DEVBUF, M_NOWAIT | M_ZERO); if (sc->vtnet_vlan_filter == NULL) return (ENOMEM); } return (0); } static void vtnet_free_rx_filters(struct vtnet_softc *sc) { if (sc->vtnet_mac_filter != NULL) { free(sc->vtnet_mac_filter, M_DEVBUF); sc->vtnet_mac_filter = NULL; } if (sc->vtnet_vlan_filter != NULL) { free(sc->vtnet_vlan_filter, M_DEVBUF); sc->vtnet_vlan_filter = NULL; } } static int vtnet_alloc_virtqueues(struct vtnet_softc *sc) { device_t dev; struct vq_alloc_info *info; struct vtnet_rxq *rxq; struct vtnet_txq *txq; int i, idx, flags, nvqs, error; dev = sc->vtnet_dev; flags = 0; nvqs = sc->vtnet_max_vq_pairs * 2; if (sc->vtnet_flags & VTNET_FLAG_CTRL_VQ) nvqs++; info = 
malloc(sizeof(struct vq_alloc_info) * nvqs, M_TEMP, M_NOWAIT); if (info == NULL) return (ENOMEM); for (i = 0, idx = 0; i < sc->vtnet_max_vq_pairs; i++, idx+=2) { rxq = &sc->vtnet_rxqs[i]; VQ_ALLOC_INFO_INIT(&info[idx], sc->vtnet_rx_nsegs, vtnet_rx_vq_intr, rxq, &rxq->vtnrx_vq, "%s-%d rx", device_get_nameunit(dev), rxq->vtnrx_id); txq = &sc->vtnet_txqs[i]; VQ_ALLOC_INFO_INIT(&info[idx+1], sc->vtnet_tx_nsegs, vtnet_tx_vq_intr, txq, &txq->vtntx_vq, "%s-%d tx", device_get_nameunit(dev), txq->vtntx_id); } if (sc->vtnet_flags & VTNET_FLAG_CTRL_VQ) { VQ_ALLOC_INFO_INIT(&info[idx], 0, NULL, NULL, &sc->vtnet_ctrl_vq, "%s ctrl", device_get_nameunit(dev)); } /* * Enable interrupt binding if this is multiqueue. This only matters * when per-vq MSIX is available. */ if (sc->vtnet_flags & VTNET_FLAG_MULTIQ) flags |= 0; error = virtio_alloc_virtqueues(dev, flags, nvqs, info); free(info, M_TEMP); return (error); } static int vtnet_setup_interface(struct vtnet_softc *sc) { device_t dev; struct pfil_head_args pa; struct ifnet *ifp; dev = sc->vtnet_dev; ifp = sc->vtnet_ifp = if_alloc(IFT_ETHER); if (ifp == NULL) { device_printf(dev, "cannot allocate ifnet structure\n"); return (ENOSPC); } if_initname(ifp, device_get_name(dev), device_get_unit(dev)); ifp->if_baudrate = IF_Gbps(10); /* Approx. */ ifp->if_softc = sc; - ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; + ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST | + IFF_KNOWSEPOCH; ifp->if_init = vtnet_init; ifp->if_ioctl = vtnet_ioctl; ifp->if_get_counter = vtnet_get_counter; #ifndef VTNET_LEGACY_TX ifp->if_transmit = vtnet_txq_mq_start; ifp->if_qflush = vtnet_qflush; #else struct virtqueue *vq = sc->vtnet_txqs[0].vtntx_vq; ifp->if_start = vtnet_start; IFQ_SET_MAXLEN(&ifp->if_snd, virtqueue_size(vq) - 1); ifp->if_snd.ifq_drv_maxlen = virtqueue_size(vq) - 1; IFQ_SET_READY(&ifp->if_snd); #endif ifmedia_init(&sc->vtnet_media, IFM_IMASK, vtnet_ifmedia_upd, vtnet_ifmedia_sts); ifmedia_add(&sc->vtnet_media, VTNET_MEDIATYPE, 0, NULL); ifmedia_set(&sc->vtnet_media, VTNET_MEDIATYPE); /* Read (or generate) the MAC address for the adapter. */ vtnet_get_hwaddr(sc); ether_ifattach(ifp, sc->vtnet_hwaddr); if (virtio_with_feature(dev, VIRTIO_NET_F_STATUS)) ifp->if_capabilities |= IFCAP_LINKSTATE; /* Tell the upper layer(s) we support long frames. */ ifp->if_hdrlen = sizeof(struct ether_vlan_header); ifp->if_capabilities |= IFCAP_JUMBO_MTU | IFCAP_VLAN_MTU; if (virtio_with_feature(dev, VIRTIO_NET_F_CSUM)) { ifp->if_capabilities |= IFCAP_TXCSUM | IFCAP_TXCSUM_IPV6; if (virtio_with_feature(dev, VIRTIO_NET_F_GSO)) { ifp->if_capabilities |= IFCAP_TSO4 | IFCAP_TSO6; sc->vtnet_flags |= VTNET_FLAG_TSO_ECN; } else { if (virtio_with_feature(dev, VIRTIO_NET_F_HOST_TSO4)) ifp->if_capabilities |= IFCAP_TSO4; if (virtio_with_feature(dev, VIRTIO_NET_F_HOST_TSO6)) ifp->if_capabilities |= IFCAP_TSO6; if (virtio_with_feature(dev, VIRTIO_NET_F_HOST_ECN)) sc->vtnet_flags |= VTNET_FLAG_TSO_ECN; } if (ifp->if_capabilities & IFCAP_TSO) ifp->if_capabilities |= IFCAP_VLAN_HWTSO; } if (virtio_with_feature(dev, VIRTIO_NET_F_GUEST_CSUM)) { ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6; if (virtio_with_feature(dev, VIRTIO_NET_F_GUEST_TSO4) || virtio_with_feature(dev, VIRTIO_NET_F_GUEST_TSO6)) ifp->if_capabilities |= IFCAP_LRO; } if (ifp->if_capabilities & IFCAP_HWCSUM) { /* * VirtIO does not support VLAN tagging, but we can fake * it by inserting and removing the 802.1Q header during * transmit and receive. 
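 * (On transmit, vtnet_txq_encap() reinserts the tag itself with
 * ether_vlanencap() before handing the frame to the host; on receive,
 * vtnet_rxq_input() strips it again via vtnet_vlan_tag_remove().)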
We are then able to do checksum * offloading of VLAN frames. */ ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWCSUM; } ifp->if_capenable = ifp->if_capabilities; /* * Capabilities after here are not enabled by default. */ if (sc->vtnet_flags & VTNET_FLAG_VLAN_FILTER) { ifp->if_capabilities |= IFCAP_VLAN_HWFILTER; sc->vtnet_vlan_attach = EVENTHANDLER_REGISTER(vlan_config, vtnet_register_vlan, sc, EVENTHANDLER_PRI_FIRST); sc->vtnet_vlan_detach = EVENTHANDLER_REGISTER(vlan_unconfig, vtnet_unregister_vlan, sc, EVENTHANDLER_PRI_FIRST); } vtnet_set_rx_process_limit(sc); vtnet_set_tx_intr_threshold(sc); DEBUGNET_SET(ifp, vtnet); pa.pa_version = PFIL_VERSION; pa.pa_flags = PFIL_IN; pa.pa_type = PFIL_TYPE_ETHERNET; pa.pa_headname = ifp->if_xname; sc->vtnet_pfil = pfil_head_register(&pa); return (0); } static int vtnet_change_mtu(struct vtnet_softc *sc, int new_mtu) { struct ifnet *ifp; int frame_size, clsize; ifp = sc->vtnet_ifp; if (new_mtu < ETHERMIN || new_mtu > VTNET_MAX_MTU) return (EINVAL); frame_size = sc->vtnet_hdr_size + sizeof(struct ether_vlan_header) + new_mtu; /* * Based on the new MTU (and hence frame size) determine which * cluster size is most appropriate for the receive queues. */ if (frame_size <= MCLBYTES) { clsize = MCLBYTES; } else if ((sc->vtnet_flags & VTNET_FLAG_MRG_RXBUFS) == 0) { /* Avoid going past 9K jumbos. */ if (frame_size > MJUM9BYTES) return (EINVAL); clsize = MJUM9BYTES; } else clsize = MJUMPAGESIZE; ifp->if_mtu = new_mtu; sc->vtnet_rx_new_clsize = clsize; if (ifp->if_drv_flags & IFF_DRV_RUNNING) { ifp->if_drv_flags &= ~IFF_DRV_RUNNING; vtnet_init_locked(sc); } return (0); } static int vtnet_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct vtnet_softc *sc; struct ifreq *ifr; int reinit, mask, error; sc = ifp->if_softc; ifr = (struct ifreq *) data; error = 0; switch (cmd) { case SIOCSIFMTU: if (ifp->if_mtu != ifr->ifr_mtu) { VTNET_CORE_LOCK(sc); error = vtnet_change_mtu(sc, ifr->ifr_mtu); VTNET_CORE_UNLOCK(sc); } break; case SIOCSIFFLAGS: VTNET_CORE_LOCK(sc); if ((ifp->if_flags & IFF_UP) == 0) { if (ifp->if_drv_flags & IFF_DRV_RUNNING) vtnet_stop(sc); } else if (ifp->if_drv_flags & IFF_DRV_RUNNING) { if ((ifp->if_flags ^ sc->vtnet_if_flags) & (IFF_PROMISC | IFF_ALLMULTI)) { if (sc->vtnet_flags & VTNET_FLAG_CTRL_RX) vtnet_rx_filter(sc); else { ifp->if_flags |= IFF_PROMISC; if ((ifp->if_flags ^ sc->vtnet_if_flags) & IFF_ALLMULTI) error = ENOTSUP; } } } else vtnet_init_locked(sc); if (error == 0) sc->vtnet_if_flags = ifp->if_flags; VTNET_CORE_UNLOCK(sc); break; case SIOCADDMULTI: case SIOCDELMULTI: if ((sc->vtnet_flags & VTNET_FLAG_CTRL_RX) == 0) break; VTNET_CORE_LOCK(sc); if (ifp->if_drv_flags & IFF_DRV_RUNNING) vtnet_rx_filter_mac(sc); VTNET_CORE_UNLOCK(sc); break; case SIOCSIFMEDIA: case SIOCGIFMEDIA: error = ifmedia_ioctl(ifp, ifr, &sc->vtnet_media, cmd); break; case SIOCSIFCAP: VTNET_CORE_LOCK(sc); mask = ifr->ifr_reqcap ^ ifp->if_capenable; if (mask & IFCAP_TXCSUM) ifp->if_capenable ^= IFCAP_TXCSUM; if (mask & IFCAP_TXCSUM_IPV6) ifp->if_capenable ^= IFCAP_TXCSUM_IPV6; if (mask & IFCAP_TSO4) ifp->if_capenable ^= IFCAP_TSO4; if (mask & IFCAP_TSO6) ifp->if_capenable ^= IFCAP_TSO6; if (mask & (IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6 | IFCAP_LRO | IFCAP_VLAN_HWFILTER)) { /* These Rx features require us to renegotiate. 
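 * They correspond to VirtIO features negotiated with the host
 * (VIRTIO_NET_F_GUEST_CSUM, the LRO features, VIRTIO_NET_F_CTRL_VLAN),
 * so toggling them tears the interface down and goes through
 * vtnet_virtio_reinit(), which re-negotiates with the matching feature
 * bits masked out.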
*/ reinit = 1; if (mask & IFCAP_RXCSUM) ifp->if_capenable ^= IFCAP_RXCSUM; if (mask & IFCAP_RXCSUM_IPV6) ifp->if_capenable ^= IFCAP_RXCSUM_IPV6; if (mask & IFCAP_LRO) ifp->if_capenable ^= IFCAP_LRO; if (mask & IFCAP_VLAN_HWFILTER) ifp->if_capenable ^= IFCAP_VLAN_HWFILTER; } else reinit = 0; if (mask & IFCAP_VLAN_HWTSO) ifp->if_capenable ^= IFCAP_VLAN_HWTSO; if (mask & IFCAP_VLAN_HWTAGGING) ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING; if (reinit && (ifp->if_drv_flags & IFF_DRV_RUNNING)) { ifp->if_drv_flags &= ~IFF_DRV_RUNNING; vtnet_init_locked(sc); } VTNET_CORE_UNLOCK(sc); VLAN_CAPABILITIES(ifp); break; default: error = ether_ioctl(ifp, cmd, data); break; } VTNET_CORE_LOCK_ASSERT_NOTOWNED(sc); return (error); } static int vtnet_rxq_populate(struct vtnet_rxq *rxq) { struct virtqueue *vq; int nbufs, error; #ifdef DEV_NETMAP error = vtnet_netmap_rxq_populate(rxq); if (error >= 0) return (error); #endif /* DEV_NETMAP */ vq = rxq->vtnrx_vq; error = ENOSPC; for (nbufs = 0; !virtqueue_full(vq); nbufs++) { error = vtnet_rxq_new_buf(rxq); if (error) break; } if (nbufs > 0) { virtqueue_notify(vq); /* * EMSGSIZE signifies the virtqueue did not have enough * entries available to hold the last mbuf. This is not * an error. */ if (error == EMSGSIZE) error = 0; } return (error); } static void vtnet_rxq_free_mbufs(struct vtnet_rxq *rxq) { struct virtqueue *vq; struct mbuf *m; int last; #ifdef DEV_NETMAP int netmap_bufs = vtnet_netmap_queue_on(rxq->vtnrx_sc, NR_RX, rxq->vtnrx_id); #else /* !DEV_NETMAP */ int netmap_bufs = 0; #endif /* !DEV_NETMAP */ vq = rxq->vtnrx_vq; last = 0; while ((m = virtqueue_drain(vq, &last)) != NULL) { if (!netmap_bufs) m_freem(m); } KASSERT(virtqueue_empty(vq), ("%s: mbufs remaining in rx queue %p", __func__, rxq)); } static struct mbuf * vtnet_rx_alloc_buf(struct vtnet_softc *sc, int nbufs, struct mbuf **m_tailp) { struct mbuf *m_head, *m_tail, *m; int i, clsize; clsize = sc->vtnet_rx_clsize; KASSERT(nbufs == 1 || sc->vtnet_flags & VTNET_FLAG_LRO_NOMRG, ("%s: chained mbuf %d request without LRO_NOMRG", __func__, nbufs)); m_head = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, clsize); if (m_head == NULL) goto fail; m_head->m_len = clsize; m_tail = m_head; /* Allocate the rest of the chain. */ for (i = 1; i < nbufs; i++) { m = m_getjcl(M_NOWAIT, MT_DATA, 0, clsize); if (m == NULL) goto fail; m->m_len = clsize; m_tail->m_next = m; m_tail = m; } if (m_tailp != NULL) *m_tailp = m_tail; return (m_head); fail: sc->vtnet_stats.mbuf_alloc_failed++; m_freem(m_head); return (NULL); } /* * Slow path for when LRO without mergeable buffers is negotiated. */ static int vtnet_rxq_replace_lro_nomgr_buf(struct vtnet_rxq *rxq, struct mbuf *m0, int len0) { struct vtnet_softc *sc; struct mbuf *m, *m_prev; struct mbuf *m_new, *m_tail; int len, clsize, nreplace, error; sc = rxq->vtnrx_sc; clsize = sc->vtnet_rx_clsize; m_prev = NULL; m_tail = NULL; nreplace = 0; m = m0; len = len0; /* * Since these mbuf chains are so large, we avoid allocating an * entire replacement chain if possible. When the received frame * did not consume the entire chain, the unused mbufs are moved * to the replacement chain. */ while (len > 0) { /* * Something is seriously wrong if we received a frame * larger than the chain. Drop it. */ if (m == NULL) { sc->vtnet_stats.rx_frame_too_large++; return (EMSGSIZE); } /* We always allocate the same cluster size. 
*/ KASSERT(m->m_len == clsize, ("%s: mbuf size %d is not the cluster size %d", __func__, m->m_len, clsize)); m->m_len = MIN(m->m_len, len); len -= m->m_len; m_prev = m; m = m->m_next; nreplace++; } KASSERT(nreplace <= sc->vtnet_rx_nmbufs, ("%s: too many replacement mbufs %d max %d", __func__, nreplace, sc->vtnet_rx_nmbufs)); m_new = vtnet_rx_alloc_buf(sc, nreplace, &m_tail); if (m_new == NULL) { m_prev->m_len = clsize; return (ENOBUFS); } /* * Move any unused mbufs from the received chain onto the end * of the new chain. */ if (m_prev->m_next != NULL) { m_tail->m_next = m_prev->m_next; m_prev->m_next = NULL; } error = vtnet_rxq_enqueue_buf(rxq, m_new); if (error) { /* * BAD! We could not enqueue the replacement mbuf chain. We * must restore the m0 chain to the original state if it was * modified so we can subsequently discard it. * * NOTE: The replacement is suppose to be an identical copy * to the one just dequeued so this is an unexpected error. */ sc->vtnet_stats.rx_enq_replacement_failed++; if (m_tail->m_next != NULL) { m_prev->m_next = m_tail->m_next; m_tail->m_next = NULL; } m_prev->m_len = clsize; m_freem(m_new); } return (error); } static int vtnet_rxq_replace_buf(struct vtnet_rxq *rxq, struct mbuf *m, int len) { struct vtnet_softc *sc; struct mbuf *m_new; int error; sc = rxq->vtnrx_sc; KASSERT(sc->vtnet_flags & VTNET_FLAG_LRO_NOMRG || m->m_next == NULL, ("%s: chained mbuf without LRO_NOMRG", __func__)); if (m->m_next == NULL) { /* Fast-path for the common case of just one mbuf. */ if (m->m_len < len) return (EINVAL); m_new = vtnet_rx_alloc_buf(sc, 1, NULL); if (m_new == NULL) return (ENOBUFS); error = vtnet_rxq_enqueue_buf(rxq, m_new); if (error) { /* * The new mbuf is suppose to be an identical * copy of the one just dequeued so this is an * unexpected error. */ m_freem(m_new); sc->vtnet_stats.rx_enq_replacement_failed++; } else m->m_len = len; } else error = vtnet_rxq_replace_lro_nomgr_buf(rxq, m, len); return (error); } static int vtnet_rxq_enqueue_buf(struct vtnet_rxq *rxq, struct mbuf *m) { struct vtnet_softc *sc; struct sglist *sg; struct vtnet_rx_header *rxhdr; uint8_t *mdata; int offset, error; sc = rxq->vtnrx_sc; sg = rxq->vtnrx_sg; mdata = mtod(m, uint8_t *); VTNET_RXQ_LOCK_ASSERT(rxq); KASSERT(sc->vtnet_flags & VTNET_FLAG_LRO_NOMRG || m->m_next == NULL, ("%s: chained mbuf without LRO_NOMRG", __func__)); KASSERT(m->m_len == sc->vtnet_rx_clsize, ("%s: unexpected cluster size %d/%d", __func__, m->m_len, sc->vtnet_rx_clsize)); sglist_reset(sg); if ((sc->vtnet_flags & VTNET_FLAG_MRG_RXBUFS) == 0) { MPASS(sc->vtnet_hdr_size == sizeof(struct virtio_net_hdr)); rxhdr = (struct vtnet_rx_header *) mdata; sglist_append(sg, &rxhdr->vrh_hdr, sc->vtnet_hdr_size); offset = sizeof(struct vtnet_rx_header); } else offset = 0; sglist_append(sg, mdata + offset, m->m_len - offset); if (m->m_next != NULL) { error = sglist_append_mbuf(sg, m->m_next); MPASS(error == 0); } error = virtqueue_enqueue(rxq->vtnrx_vq, m, sg, 0, sg->sg_nseg); return (error); } static int vtnet_rxq_new_buf(struct vtnet_rxq *rxq) { struct vtnet_softc *sc; struct mbuf *m; int error; sc = rxq->vtnrx_sc; m = vtnet_rx_alloc_buf(sc, sc->vtnet_rx_nmbufs, NULL); if (m == NULL) return (ENOBUFS); error = vtnet_rxq_enqueue_buf(rxq, m); if (error) m_freem(m); return (error); } /* * Use the checksum offset in the VirtIO header to set the * correct CSUM_* flags. 
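 * This works because the transport checksum field sits at a different
 * offset in the two headers we care about:
 * offsetof(struct udphdr, uh_sum) is 6, while
 * offsetof(struct tcphdr, th_sum) is 16,
 * so csum_offset alone is enough to tell UDP from TCP without parsing
 * the packet payload.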
*/ static int vtnet_rxq_csum_by_offset(struct vtnet_rxq *rxq, struct mbuf *m, uint16_t eth_type, int ip_start, struct virtio_net_hdr *hdr) { struct vtnet_softc *sc; #if defined(INET) || defined(INET6) int offset = hdr->csum_start + hdr->csum_offset; #endif sc = rxq->vtnrx_sc; /* Only do a basic sanity check on the offset. */ switch (eth_type) { #if defined(INET) case ETHERTYPE_IP: if (__predict_false(offset < ip_start + sizeof(struct ip))) return (1); break; #endif #if defined(INET6) case ETHERTYPE_IPV6: if (__predict_false(offset < ip_start + sizeof(struct ip6_hdr))) return (1); break; #endif default: sc->vtnet_stats.rx_csum_bad_ethtype++; return (1); } /* * Use the offset to determine the appropriate CSUM_* flags. This is * a bit dirty, but we can get by with it since the checksum offsets * happen to be different. We assume the host host does not do IPv4 * header checksum offloading. */ switch (hdr->csum_offset) { case offsetof(struct udphdr, uh_sum): case offsetof(struct tcphdr, th_sum): m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR; m->m_pkthdr.csum_data = 0xFFFF; break; default: sc->vtnet_stats.rx_csum_bad_offset++; return (1); } return (0); } static int vtnet_rxq_csum_by_parse(struct vtnet_rxq *rxq, struct mbuf *m, uint16_t eth_type, int ip_start, struct virtio_net_hdr *hdr) { struct vtnet_softc *sc; int offset, proto; sc = rxq->vtnrx_sc; switch (eth_type) { #if defined(INET) case ETHERTYPE_IP: { struct ip *ip; if (__predict_false(m->m_len < ip_start + sizeof(struct ip))) return (1); ip = (struct ip *)(m->m_data + ip_start); proto = ip->ip_p; offset = ip_start + (ip->ip_hl << 2); break; } #endif #if defined(INET6) case ETHERTYPE_IPV6: if (__predict_false(m->m_len < ip_start + sizeof(struct ip6_hdr))) return (1); offset = ip6_lasthdr(m, ip_start, IPPROTO_IPV6, &proto); if (__predict_false(offset < 0)) return (1); break; #endif default: sc->vtnet_stats.rx_csum_bad_ethtype++; return (1); } switch (proto) { case IPPROTO_TCP: if (__predict_false(m->m_len < offset + sizeof(struct tcphdr))) return (1); m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR; m->m_pkthdr.csum_data = 0xFFFF; break; case IPPROTO_UDP: if (__predict_false(m->m_len < offset + sizeof(struct udphdr))) return (1); m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR; m->m_pkthdr.csum_data = 0xFFFF; break; default: /* * For the remaining protocols, FreeBSD does not support * checksum offloading, so the checksum will be recomputed. */ #if 0 if_printf(sc->vtnet_ifp, "cksum offload of unsupported " "protocol eth_type=%#x proto=%d csum_start=%d " "csum_offset=%d\n", __func__, eth_type, proto, hdr->csum_start, hdr->csum_offset); #endif break; } return (0); } /* * Set the appropriate CSUM_* flags. Unfortunately, the information * provided is not directly useful to us. The VirtIO header gives the * offset of the checksum, which is all Linux needs, but this is not * how FreeBSD does things. We are forced to peek inside the packet * a bit. * * It would be nice if VirtIO gave us the L4 protocol or if FreeBSD * could accept the offsets and let the stack figure it out. */ static int vtnet_rxq_csum(struct vtnet_rxq *rxq, struct mbuf *m, struct virtio_net_hdr *hdr) { struct ether_header *eh; struct ether_vlan_header *evh; uint16_t eth_type; int offset, error; eh = mtod(m, struct ether_header *); eth_type = ntohs(eh->ether_type); if (eth_type == ETHERTYPE_VLAN) { /* BMV: We should handle nested VLAN tags too. 
*/ evh = mtod(m, struct ether_vlan_header *); eth_type = ntohs(evh->evl_proto); offset = sizeof(struct ether_vlan_header); } else offset = sizeof(struct ether_header); if (hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) error = vtnet_rxq_csum_by_offset(rxq, m, eth_type, offset, hdr); else error = vtnet_rxq_csum_by_parse(rxq, m, eth_type, offset, hdr); return (error); } static void vtnet_rxq_discard_merged_bufs(struct vtnet_rxq *rxq, int nbufs) { struct mbuf *m; while (--nbufs > 0) { m = virtqueue_dequeue(rxq->vtnrx_vq, NULL); if (m == NULL) break; vtnet_rxq_discard_buf(rxq, m); } } static void vtnet_rxq_discard_buf(struct vtnet_rxq *rxq, struct mbuf *m) { int error; /* * Requeue the discarded mbuf. This should always be successful * since it was just dequeued. */ error = vtnet_rxq_enqueue_buf(rxq, m); KASSERT(error == 0, ("%s: cannot requeue discarded mbuf %d", __func__, error)); } static int vtnet_rxq_merged_eof(struct vtnet_rxq *rxq, struct mbuf *m_head, int nbufs) { struct vtnet_softc *sc; struct virtqueue *vq; struct mbuf *m, *m_tail; int len; sc = rxq->vtnrx_sc; vq = rxq->vtnrx_vq; m_tail = m_head; while (--nbufs > 0) { m = virtqueue_dequeue(vq, &len); if (m == NULL) { rxq->vtnrx_stats.vrxs_ierrors++; goto fail; } if (vtnet_rxq_new_buf(rxq) != 0) { rxq->vtnrx_stats.vrxs_iqdrops++; vtnet_rxq_discard_buf(rxq, m); if (nbufs > 1) vtnet_rxq_discard_merged_bufs(rxq, nbufs); goto fail; } if (m->m_len < len) len = m->m_len; m->m_len = len; m->m_flags &= ~M_PKTHDR; m_head->m_pkthdr.len += len; m_tail->m_next = m; m_tail = m; } return (0); fail: sc->vtnet_stats.rx_mergeable_failed++; m_freem(m_head); return (1); } static void vtnet_rxq_input(struct vtnet_rxq *rxq, struct mbuf *m, struct virtio_net_hdr *hdr) { struct vtnet_softc *sc; struct ifnet *ifp; struct ether_header *eh; sc = rxq->vtnrx_sc; ifp = sc->vtnet_ifp; if (ifp->if_capenable & IFCAP_VLAN_HWTAGGING) { eh = mtod(m, struct ether_header *); if (eh->ether_type == htons(ETHERTYPE_VLAN)) { vtnet_vlan_tag_remove(m); /* * With the 802.1Q header removed, update the * checksum starting location accordingly. */ if (hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) hdr->csum_start -= ETHER_VLAN_ENCAP_LEN; } } m->m_pkthdr.flowid = rxq->vtnrx_id; M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE); /* * BMV: FreeBSD does not have the UNNECESSARY and PARTIAL checksum * distinction that Linux does. Need to reevaluate if performing * offloading for the NEEDS_CSUM case is really appropriate. 
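 * Roughly: VIRTIO_NET_HDR_F_DATA_VALID means the host already verified
 * the checksum (Linux CHECKSUM_UNNECESSARY), while NEEDS_CSUM means the
 * checksum was left partial and we are expected to complete it (Linux
 * CHECKSUM_PARTIAL); for now both cases are handled by marking the
 * mbuf's checksum data as valid.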
*/ if (hdr->flags & (VIRTIO_NET_HDR_F_NEEDS_CSUM | VIRTIO_NET_HDR_F_DATA_VALID)) { if (vtnet_rxq_csum(rxq, m, hdr) == 0) rxq->vtnrx_stats.vrxs_csum++; else rxq->vtnrx_stats.vrxs_csum_failed++; } rxq->vtnrx_stats.vrxs_ipackets++; rxq->vtnrx_stats.vrxs_ibytes += m->m_pkthdr.len; VTNET_RXQ_UNLOCK(rxq); (*ifp->if_input)(ifp, m); VTNET_RXQ_LOCK(rxq); } static int vtnet_rxq_eof(struct vtnet_rxq *rxq) { struct virtio_net_hdr lhdr, *hdr; struct vtnet_softc *sc; struct ifnet *ifp; struct virtqueue *vq; struct mbuf *m, *mr; struct virtio_net_hdr_mrg_rxbuf *mhdr; int len, deq, nbufs, adjsz, count; pfil_return_t pfil; bool pfil_done; sc = rxq->vtnrx_sc; vq = rxq->vtnrx_vq; ifp = sc->vtnet_ifp; hdr = &lhdr; deq = 0; count = sc->vtnet_rx_process_limit; VTNET_RXQ_LOCK_ASSERT(rxq); while (count-- > 0) { m = virtqueue_dequeue(vq, &len); if (m == NULL) break; deq++; if (len < sc->vtnet_hdr_size + ETHER_HDR_LEN) { rxq->vtnrx_stats.vrxs_ierrors++; vtnet_rxq_discard_buf(rxq, m); continue; } if ((sc->vtnet_flags & VTNET_FLAG_MRG_RXBUFS) == 0) { nbufs = 1; adjsz = sizeof(struct vtnet_rx_header); /* * Account for our pad inserted between the header * and the actual start of the frame. */ len += VTNET_RX_HEADER_PAD; } else { mhdr = mtod(m, struct virtio_net_hdr_mrg_rxbuf *); nbufs = mhdr->num_buffers; adjsz = sizeof(struct virtio_net_hdr_mrg_rxbuf); } /* * If we have enough data in first mbuf, run it through * pfil as a memory buffer before dequeueing the rest. */ if (PFIL_HOOKED_IN(sc->vtnet_pfil) && len - adjsz >= ETHER_HDR_LEN + max_protohdr) { pfil = pfil_run_hooks(sc->vtnet_pfil, m->m_data + adjsz, ifp, (len - adjsz) | PFIL_MEMPTR | PFIL_IN, NULL); switch (pfil) { case PFIL_REALLOCED: mr = pfil_mem2mbuf(m->m_data + adjsz); vtnet_rxq_input(rxq, mr, hdr); /* FALLTHROUGH */ case PFIL_DROPPED: case PFIL_CONSUMED: vtnet_rxq_discard_buf(rxq, m); if (nbufs > 1) vtnet_rxq_discard_merged_bufs(rxq, nbufs); continue; default: KASSERT(pfil == PFIL_PASS, ("Filter returned %d!\n", pfil)); }; pfil_done = true; } else pfil_done = false; if (vtnet_rxq_replace_buf(rxq, m, len) != 0) { rxq->vtnrx_stats.vrxs_iqdrops++; vtnet_rxq_discard_buf(rxq, m); if (nbufs > 1) vtnet_rxq_discard_merged_bufs(rxq, nbufs); continue; } m->m_pkthdr.len = len; m->m_pkthdr.rcvif = ifp; m->m_pkthdr.csum_flags = 0; if (nbufs > 1) { /* Dequeue the rest of chain. */ if (vtnet_rxq_merged_eof(rxq, m, nbufs) != 0) continue; } /* * Save copy of header before we strip it. For both mergeable * and non-mergeable, the header is at the beginning of the * mbuf data. We no longer need num_buffers, so always use a * regular header. * * BMV: Is this memcpy() expensive? We know the mbuf data is * still valid even after the m_adj(). */ memcpy(hdr, mtod(m, void *), sizeof(struct virtio_net_hdr)); m_adj(m, adjsz); if (PFIL_HOOKED_IN(sc->vtnet_pfil) && pfil_done == false) { pfil = pfil_run_hooks(sc->vtnet_pfil, &m, ifp, PFIL_IN, NULL); switch (pfil) { case PFIL_DROPPED: case PFIL_CONSUMED: continue; default: KASSERT(pfil == PFIL_PASS, ("Filter returned %d!\n", pfil)); } } vtnet_rxq_input(rxq, m, hdr); /* Must recheck after dropping the Rx lock. */ if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) break; } if (deq > 0) virtqueue_notify(vq); return (count > 0 ? 0 : EAGAIN); } static void vtnet_rx_vq_intr(void *xrxq) { struct vtnet_softc *sc; struct vtnet_rxq *rxq; struct ifnet *ifp; int tries, more; rxq = xrxq; sc = rxq->vtnrx_sc; ifp = sc->vtnet_ifp; tries = 0; if (__predict_false(rxq->vtnrx_id >= sc->vtnet_act_vq_pairs)) { /* * Ignore this interrupt. 
Either this is a spurious interrupt * or multiqueue without per-VQ MSIX so every queue needs to * be polled (a brain dead configuration we could try harder * to avoid). */ vtnet_rxq_disable_intr(rxq); return; } #ifdef DEV_NETMAP if (netmap_rx_irq(ifp, rxq->vtnrx_id, &more) != NM_IRQ_PASS) return; #endif /* DEV_NETMAP */ VTNET_RXQ_LOCK(rxq); again: if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) { VTNET_RXQ_UNLOCK(rxq); return; } more = vtnet_rxq_eof(rxq); if (more || vtnet_rxq_enable_intr(rxq) != 0) { if (!more) vtnet_rxq_disable_intr(rxq); /* * This is an occasional condition or race (when !more), * so retry a few times before scheduling the taskqueue. */ if (tries++ < VTNET_INTR_DISABLE_RETRIES) goto again; VTNET_RXQ_UNLOCK(rxq); rxq->vtnrx_stats.vrxs_rescheduled++; taskqueue_enqueue(rxq->vtnrx_tq, &rxq->vtnrx_intrtask); } else VTNET_RXQ_UNLOCK(rxq); } static void vtnet_rxq_tq_intr(void *xrxq, int pending) { struct vtnet_softc *sc; struct vtnet_rxq *rxq; struct ifnet *ifp; int more; rxq = xrxq; sc = rxq->vtnrx_sc; ifp = sc->vtnet_ifp; VTNET_RXQ_LOCK(rxq); if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) { VTNET_RXQ_UNLOCK(rxq); return; } more = vtnet_rxq_eof(rxq); if (more || vtnet_rxq_enable_intr(rxq) != 0) { if (!more) vtnet_rxq_disable_intr(rxq); rxq->vtnrx_stats.vrxs_rescheduled++; taskqueue_enqueue(rxq->vtnrx_tq, &rxq->vtnrx_intrtask); } VTNET_RXQ_UNLOCK(rxq); } static int vtnet_txq_below_threshold(struct vtnet_txq *txq) { struct vtnet_softc *sc; struct virtqueue *vq; sc = txq->vtntx_sc; vq = txq->vtntx_vq; return (virtqueue_nfree(vq) <= sc->vtnet_tx_intr_thresh); } static int vtnet_txq_notify(struct vtnet_txq *txq) { struct virtqueue *vq; vq = txq->vtntx_vq; txq->vtntx_watchdog = VTNET_TX_TIMEOUT; virtqueue_notify(vq); if (vtnet_txq_enable_intr(txq) == 0) return (0); /* * Drain frames that were completed since last checked. If this * causes the queue to go above the threshold, the caller should * continue transmitting. */ if (vtnet_txq_eof(txq) != 0 && vtnet_txq_below_threshold(txq) == 0) { virtqueue_disable_intr(vq); return (1); } return (0); } static void vtnet_txq_free_mbufs(struct vtnet_txq *txq) { struct virtqueue *vq; struct vtnet_tx_header *txhdr; int last; #ifdef DEV_NETMAP int netmap_bufs = vtnet_netmap_queue_on(txq->vtntx_sc, NR_TX, txq->vtntx_id); #else /* !DEV_NETMAP */ int netmap_bufs = 0; #endif /* !DEV_NETMAP */ vq = txq->vtntx_vq; last = 0; while ((txhdr = virtqueue_drain(vq, &last)) != NULL) { if (!netmap_bufs) { m_freem(txhdr->vth_mbuf); uma_zfree(vtnet_tx_header_zone, txhdr); } } KASSERT(virtqueue_empty(vq), ("%s: mbufs remaining in tx queue %p", __func__, txq)); } /* * BMV: Much of this can go away once we finally have offsets in * the mbuf packet header. Bug andre@. */ static int vtnet_txq_offload_ctx(struct vtnet_txq *txq, struct mbuf *m, int *etype, int *proto, int *start) { struct vtnet_softc *sc; struct ether_vlan_header *evh; int offset; sc = txq->vtntx_sc; evh = mtod(m, struct ether_vlan_header *); if (evh->evl_encap_proto == htons(ETHERTYPE_VLAN)) { /* BMV: We should handle nested VLAN tags too. 
*/ *etype = ntohs(evh->evl_proto); offset = sizeof(struct ether_vlan_header); } else { *etype = ntohs(evh->evl_encap_proto); offset = sizeof(struct ether_header); } switch (*etype) { #if defined(INET) case ETHERTYPE_IP: { struct ip *ip, iphdr; if (__predict_false(m->m_len < offset + sizeof(struct ip))) { m_copydata(m, offset, sizeof(struct ip), (caddr_t) &iphdr); ip = &iphdr; } else ip = (struct ip *)(m->m_data + offset); *proto = ip->ip_p; *start = offset + (ip->ip_hl << 2); break; } #endif #if defined(INET6) case ETHERTYPE_IPV6: *proto = -1; *start = ip6_lasthdr(m, offset, IPPROTO_IPV6, proto); /* Assert the network stack sent us a valid packet. */ KASSERT(*start > offset, ("%s: mbuf %p start %d offset %d proto %d", __func__, m, *start, offset, *proto)); break; #endif default: sc->vtnet_stats.tx_csum_bad_ethtype++; return (EINVAL); } return (0); } static int vtnet_txq_offload_tso(struct vtnet_txq *txq, struct mbuf *m, int eth_type, int offset, struct virtio_net_hdr *hdr) { static struct timeval lastecn; static int curecn; struct vtnet_softc *sc; struct tcphdr *tcp, tcphdr; sc = txq->vtntx_sc; if (__predict_false(m->m_len < offset + sizeof(struct tcphdr))) { m_copydata(m, offset, sizeof(struct tcphdr), (caddr_t) &tcphdr); tcp = &tcphdr; } else tcp = (struct tcphdr *)(m->m_data + offset); hdr->hdr_len = offset + (tcp->th_off << 2); hdr->gso_size = m->m_pkthdr.tso_segsz; hdr->gso_type = eth_type == ETHERTYPE_IP ? VIRTIO_NET_HDR_GSO_TCPV4 : VIRTIO_NET_HDR_GSO_TCPV6; if (tcp->th_flags & TH_CWR) { /* * Drop if VIRTIO_NET_F_HOST_ECN was not negotiated. In FreeBSD, * ECN support is not on a per-interface basis, but globally via * the net.inet.tcp.ecn.enable sysctl knob. The default is off. */ if ((sc->vtnet_flags & VTNET_FLAG_TSO_ECN) == 0) { if (ppsratecheck(&lastecn, &curecn, 1)) if_printf(sc->vtnet_ifp, "TSO with ECN not negotiated with host\n"); return (ENOTSUP); } hdr->gso_type |= VIRTIO_NET_HDR_GSO_ECN; } txq->vtntx_stats.vtxs_tso++; return (0); } static struct mbuf * vtnet_txq_offload(struct vtnet_txq *txq, struct mbuf *m, struct virtio_net_hdr *hdr) { struct vtnet_softc *sc; int flags, etype, csum_start, proto, error; sc = txq->vtntx_sc; flags = m->m_pkthdr.csum_flags; error = vtnet_txq_offload_ctx(txq, m, &etype, &proto, &csum_start); if (error) goto drop; if ((etype == ETHERTYPE_IP && flags & VTNET_CSUM_OFFLOAD) || (etype == ETHERTYPE_IPV6 && flags & VTNET_CSUM_OFFLOAD_IPV6)) { /* * We could compare the IP protocol vs the CSUM_ flag too, * but that really should not be necessary. */ hdr->flags |= VIRTIO_NET_HDR_F_NEEDS_CSUM; hdr->csum_start = csum_start; hdr->csum_offset = m->m_pkthdr.csum_data; txq->vtntx_stats.vtxs_csum++; } if (flags & CSUM_TSO) { if (__predict_false(proto != IPPROTO_TCP)) { /* Likely failed to correctly parse the mbuf. 
*/ sc->vtnet_stats.tx_tso_not_tcp++; goto drop; } KASSERT(hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM, ("%s: mbuf %p TSO without checksum offload %#x", __func__, m, flags)); error = vtnet_txq_offload_tso(txq, m, etype, csum_start, hdr); if (error) goto drop; } return (m); drop: m_freem(m); return (NULL); } static int vtnet_txq_enqueue_buf(struct vtnet_txq *txq, struct mbuf **m_head, struct vtnet_tx_header *txhdr) { struct vtnet_softc *sc; struct virtqueue *vq; struct sglist *sg; struct mbuf *m; int error; sc = txq->vtntx_sc; vq = txq->vtntx_vq; sg = txq->vtntx_sg; m = *m_head; sglist_reset(sg); error = sglist_append(sg, &txhdr->vth_uhdr, sc->vtnet_hdr_size); KASSERT(error == 0 && sg->sg_nseg == 1, ("%s: error %d adding header to sglist", __func__, error)); error = sglist_append_mbuf(sg, m); if (error) { m = m_defrag(m, M_NOWAIT); if (m == NULL) goto fail; *m_head = m; sc->vtnet_stats.tx_defragged++; error = sglist_append_mbuf(sg, m); if (error) goto fail; } txhdr->vth_mbuf = m; error = virtqueue_enqueue(vq, txhdr, sg, sg->sg_nseg, 0); return (error); fail: sc->vtnet_stats.tx_defrag_failed++; m_freem(*m_head); *m_head = NULL; return (ENOBUFS); } static int vtnet_txq_encap(struct vtnet_txq *txq, struct mbuf **m_head, int flags) { struct vtnet_tx_header *txhdr; struct virtio_net_hdr *hdr; struct mbuf *m; int error; m = *m_head; M_ASSERTPKTHDR(m); txhdr = uma_zalloc(vtnet_tx_header_zone, flags | M_ZERO); if (txhdr == NULL) { m_freem(m); *m_head = NULL; return (ENOMEM); } /* * Always use the non-mergeable header, regardless if the feature * was negotiated. For transmit, num_buffers is always zero. The * vtnet_hdr_size is used to enqueue the correct header size. */ hdr = &txhdr->vth_uhdr.hdr; if (m->m_flags & M_VLANTAG) { m = ether_vlanencap(m, m->m_pkthdr.ether_vtag); if ((*m_head = m) == NULL) { error = ENOBUFS; goto fail; } m->m_flags &= ~M_VLANTAG; } if (m->m_pkthdr.csum_flags & VTNET_CSUM_ALL_OFFLOAD) { m = vtnet_txq_offload(txq, m, hdr); if ((*m_head = m) == NULL) { error = ENOBUFS; goto fail; } } error = vtnet_txq_enqueue_buf(txq, m_head, txhdr); if (error == 0) return (0); fail: uma_zfree(vtnet_tx_header_zone, txhdr); return (error); } #ifdef VTNET_LEGACY_TX static void vtnet_start_locked(struct vtnet_txq *txq, struct ifnet *ifp) { struct vtnet_softc *sc; struct virtqueue *vq; struct mbuf *m0; int tries, enq; sc = txq->vtntx_sc; vq = txq->vtntx_vq; tries = 0; VTNET_TXQ_LOCK_ASSERT(txq); if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || sc->vtnet_link_active == 0) return; vtnet_txq_eof(txq); again: enq = 0; while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) { if (virtqueue_full(vq)) break; IFQ_DRV_DEQUEUE(&ifp->if_snd, m0); if (m0 == NULL) break; if (vtnet_txq_encap(txq, &m0, M_NOWAIT) != 0) { if (m0 != NULL) IFQ_DRV_PREPEND(&ifp->if_snd, m0); break; } enq++; ETHER_BPF_MTAP(ifp, m0); } if (enq > 0 && vtnet_txq_notify(txq) != 0) { if (tries++ < VTNET_NOTIFY_RETRIES) goto again; txq->vtntx_stats.vtxs_rescheduled++; taskqueue_enqueue(txq->vtntx_tq, &txq->vtntx_intrtask); } } static void vtnet_start(struct ifnet *ifp) { struct vtnet_softc *sc; struct vtnet_txq *txq; sc = ifp->if_softc; txq = &sc->vtnet_txqs[0]; VTNET_TXQ_LOCK(txq); vtnet_start_locked(txq, ifp); VTNET_TXQ_UNLOCK(txq); } #else /* !VTNET_LEGACY_TX */ static int vtnet_txq_mq_start_locked(struct vtnet_txq *txq, struct mbuf *m) { struct vtnet_softc *sc; struct virtqueue *vq; struct buf_ring *br; struct ifnet *ifp; int enq, tries, error; sc = txq->vtntx_sc; vq = txq->vtntx_vq; br = txq->vtntx_br; ifp = sc->vtnet_ifp; tries = 0; error = 0; 
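	/*
	 * Multiqueue (if_transmit) send path used when VTNET_LEGACY_TX is
	 * not defined: frames are staged on the per-queue buf_ring and
	 * drained here while the queue lock is held.  If the caller in
	 * vtnet_txq_mq_start() cannot take the lock, the frame is only
	 * enqueued and the deferred task drains it later.
	 */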
VTNET_TXQ_LOCK_ASSERT(txq); if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || sc->vtnet_link_active == 0) { if (m != NULL) error = drbr_enqueue(ifp, br, m); return (error); } if (m != NULL) { error = drbr_enqueue(ifp, br, m); if (error) return (error); } vtnet_txq_eof(txq); again: enq = 0; while ((m = drbr_peek(ifp, br)) != NULL) { if (virtqueue_full(vq)) { drbr_putback(ifp, br, m); break; } if (vtnet_txq_encap(txq, &m, M_NOWAIT) != 0) { if (m != NULL) drbr_putback(ifp, br, m); else drbr_advance(ifp, br); break; } drbr_advance(ifp, br); enq++; ETHER_BPF_MTAP(ifp, m); } if (enq > 0 && vtnet_txq_notify(txq) != 0) { if (tries++ < VTNET_NOTIFY_RETRIES) goto again; txq->vtntx_stats.vtxs_rescheduled++; taskqueue_enqueue(txq->vtntx_tq, &txq->vtntx_intrtask); } return (0); } static int vtnet_txq_mq_start(struct ifnet *ifp, struct mbuf *m) { struct vtnet_softc *sc; struct vtnet_txq *txq; int i, npairs, error; sc = ifp->if_softc; npairs = sc->vtnet_act_vq_pairs; /* check if flowid is set */ if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) i = m->m_pkthdr.flowid % npairs; else i = curcpu % npairs; txq = &sc->vtnet_txqs[i]; if (VTNET_TXQ_TRYLOCK(txq) != 0) { error = vtnet_txq_mq_start_locked(txq, m); VTNET_TXQ_UNLOCK(txq); } else { error = drbr_enqueue(ifp, txq->vtntx_br, m); taskqueue_enqueue(txq->vtntx_tq, &txq->vtntx_defrtask); } return (error); } static void vtnet_txq_tq_deferred(void *xtxq, int pending) { struct vtnet_softc *sc; struct vtnet_txq *txq; txq = xtxq; sc = txq->vtntx_sc; VTNET_TXQ_LOCK(txq); if (!drbr_empty(sc->vtnet_ifp, txq->vtntx_br)) vtnet_txq_mq_start_locked(txq, NULL); VTNET_TXQ_UNLOCK(txq); } #endif /* VTNET_LEGACY_TX */ static void vtnet_txq_start(struct vtnet_txq *txq) { struct vtnet_softc *sc; struct ifnet *ifp; sc = txq->vtntx_sc; ifp = sc->vtnet_ifp; #ifdef VTNET_LEGACY_TX if (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) vtnet_start_locked(txq, ifp); #else if (!drbr_empty(ifp, txq->vtntx_br)) vtnet_txq_mq_start_locked(txq, NULL); #endif } static void vtnet_txq_tq_intr(void *xtxq, int pending) { struct vtnet_softc *sc; struct vtnet_txq *txq; struct ifnet *ifp; txq = xtxq; sc = txq->vtntx_sc; ifp = sc->vtnet_ifp; VTNET_TXQ_LOCK(txq); if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) { VTNET_TXQ_UNLOCK(txq); return; } vtnet_txq_eof(txq); vtnet_txq_start(txq); VTNET_TXQ_UNLOCK(txq); } static int vtnet_txq_eof(struct vtnet_txq *txq) { struct virtqueue *vq; struct vtnet_tx_header *txhdr; struct mbuf *m; int deq; vq = txq->vtntx_vq; deq = 0; VTNET_TXQ_LOCK_ASSERT(txq); while ((txhdr = virtqueue_dequeue(vq, NULL)) != NULL) { m = txhdr->vth_mbuf; deq++; txq->vtntx_stats.vtxs_opackets++; txq->vtntx_stats.vtxs_obytes += m->m_pkthdr.len; if (m->m_flags & M_MCAST) txq->vtntx_stats.vtxs_omcasts++; m_freem(m); uma_zfree(vtnet_tx_header_zone, txhdr); } if (virtqueue_empty(vq)) txq->vtntx_watchdog = 0; return (deq); } static void vtnet_tx_vq_intr(void *xtxq) { struct vtnet_softc *sc; struct vtnet_txq *txq; struct ifnet *ifp; txq = xtxq; sc = txq->vtntx_sc; ifp = sc->vtnet_ifp; if (__predict_false(txq->vtntx_id >= sc->vtnet_act_vq_pairs)) { /* * Ignore this interrupt. Either this is a spurious interrupt * or multiqueue without per-VQ MSIX so every queue needs to * be polled (a brain dead configuration we could try harder * to avoid). 
*/ vtnet_txq_disable_intr(txq); return; } #ifdef DEV_NETMAP if (netmap_tx_irq(ifp, txq->vtntx_id) != NM_IRQ_PASS) return; #endif /* DEV_NETMAP */ VTNET_TXQ_LOCK(txq); if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) { VTNET_TXQ_UNLOCK(txq); return; } vtnet_txq_eof(txq); vtnet_txq_start(txq); VTNET_TXQ_UNLOCK(txq); } static void vtnet_tx_start_all(struct vtnet_softc *sc) { struct vtnet_txq *txq; int i; VTNET_CORE_LOCK_ASSERT(sc); for (i = 0; i < sc->vtnet_act_vq_pairs; i++) { txq = &sc->vtnet_txqs[i]; VTNET_TXQ_LOCK(txq); vtnet_txq_start(txq); VTNET_TXQ_UNLOCK(txq); } } #ifndef VTNET_LEGACY_TX static void vtnet_qflush(struct ifnet *ifp) { struct vtnet_softc *sc; struct vtnet_txq *txq; struct mbuf *m; int i; sc = ifp->if_softc; for (i = 0; i < sc->vtnet_act_vq_pairs; i++) { txq = &sc->vtnet_txqs[i]; VTNET_TXQ_LOCK(txq); while ((m = buf_ring_dequeue_sc(txq->vtntx_br)) != NULL) m_freem(m); VTNET_TXQ_UNLOCK(txq); } if_qflush(ifp); } #endif static int vtnet_watchdog(struct vtnet_txq *txq) { struct ifnet *ifp; ifp = txq->vtntx_sc->vtnet_ifp; VTNET_TXQ_LOCK(txq); if (txq->vtntx_watchdog == 1) { /* * Only drain completed frames if the watchdog is about to * expire. If any frames were drained, there may be enough * free descriptors now available to transmit queued frames. * In that case, the timer will immediately be decremented * below, but the timeout is generous enough that should not * be a problem. */ if (vtnet_txq_eof(txq) != 0) vtnet_txq_start(txq); } if (txq->vtntx_watchdog == 0 || --txq->vtntx_watchdog) { VTNET_TXQ_UNLOCK(txq); return (0); } VTNET_TXQ_UNLOCK(txq); if_printf(ifp, "watchdog timeout on queue %d\n", txq->vtntx_id); return (1); } static void vtnet_accum_stats(struct vtnet_softc *sc, struct vtnet_rxq_stats *rxacc, struct vtnet_txq_stats *txacc) { bzero(rxacc, sizeof(struct vtnet_rxq_stats)); bzero(txacc, sizeof(struct vtnet_txq_stats)); for (int i = 0; i < sc->vtnet_max_vq_pairs; i++) { struct vtnet_rxq_stats *rxst; struct vtnet_txq_stats *txst; rxst = &sc->vtnet_rxqs[i].vtnrx_stats; rxacc->vrxs_ipackets += rxst->vrxs_ipackets; rxacc->vrxs_ibytes += rxst->vrxs_ibytes; rxacc->vrxs_iqdrops += rxst->vrxs_iqdrops; rxacc->vrxs_csum += rxst->vrxs_csum; rxacc->vrxs_csum_failed += rxst->vrxs_csum_failed; rxacc->vrxs_rescheduled += rxst->vrxs_rescheduled; txst = &sc->vtnet_txqs[i].vtntx_stats; txacc->vtxs_opackets += txst->vtxs_opackets; txacc->vtxs_obytes += txst->vtxs_obytes; txacc->vtxs_csum += txst->vtxs_csum; txacc->vtxs_tso += txst->vtxs_tso; txacc->vtxs_rescheduled += txst->vtxs_rescheduled; } } static uint64_t vtnet_get_counter(if_t ifp, ift_counter cnt) { struct vtnet_softc *sc; struct vtnet_rxq_stats rxaccum; struct vtnet_txq_stats txaccum; sc = if_getsoftc(ifp); vtnet_accum_stats(sc, &rxaccum, &txaccum); switch (cnt) { case IFCOUNTER_IPACKETS: return (rxaccum.vrxs_ipackets); case IFCOUNTER_IQDROPS: return (rxaccum.vrxs_iqdrops); case IFCOUNTER_IERRORS: return (rxaccum.vrxs_ierrors); case IFCOUNTER_OPACKETS: return (txaccum.vtxs_opackets); #ifndef VTNET_LEGACY_TX case IFCOUNTER_OBYTES: return (txaccum.vtxs_obytes); case IFCOUNTER_OMCASTS: return (txaccum.vtxs_omcasts); #endif default: return (if_get_counter_default(ifp, cnt)); } } static void vtnet_tick(void *xsc) { struct vtnet_softc *sc; struct ifnet *ifp; int i, timedout; sc = xsc; ifp = sc->vtnet_ifp; timedout = 0; VTNET_CORE_LOCK_ASSERT(sc); for (i = 0; i < sc->vtnet_act_vq_pairs; i++) timedout |= vtnet_watchdog(&sc->vtnet_txqs[i]); if (timedout != 0) { ifp->if_drv_flags &= ~IFF_DRV_RUNNING; vtnet_init_locked(sc); } else 
callout_schedule(&sc->vtnet_tick_ch, hz); } static void vtnet_start_taskqueues(struct vtnet_softc *sc) { device_t dev; struct vtnet_rxq *rxq; struct vtnet_txq *txq; int i, error; dev = sc->vtnet_dev; /* * Errors here are very difficult to recover from - we cannot * easily fail because, if this is during boot, we will hang * when freeing any successfully started taskqueues because * the scheduler isn't up yet. * * Most drivers just ignore the return value - it only fails * with ENOMEM so an error is not likely. */ for (i = 0; i < sc->vtnet_max_vq_pairs; i++) { rxq = &sc->vtnet_rxqs[i]; error = taskqueue_start_threads(&rxq->vtnrx_tq, 1, PI_NET, "%s rxq %d", device_get_nameunit(dev), rxq->vtnrx_id); if (error) { device_printf(dev, "failed to start rx taskq %d\n", rxq->vtnrx_id); } txq = &sc->vtnet_txqs[i]; error = taskqueue_start_threads(&txq->vtntx_tq, 1, PI_NET, "%s txq %d", device_get_nameunit(dev), txq->vtntx_id); if (error) { device_printf(dev, "failed to start tx taskq %d\n", txq->vtntx_id); } } } static void vtnet_free_taskqueues(struct vtnet_softc *sc) { struct vtnet_rxq *rxq; struct vtnet_txq *txq; int i; for (i = 0; i < sc->vtnet_max_vq_pairs; i++) { rxq = &sc->vtnet_rxqs[i]; if (rxq->vtnrx_tq != NULL) { taskqueue_free(rxq->vtnrx_tq); rxq->vtnrx_tq = NULL; } txq = &sc->vtnet_txqs[i]; if (txq->vtntx_tq != NULL) { taskqueue_free(txq->vtntx_tq); txq->vtntx_tq = NULL; } } } static void vtnet_drain_taskqueues(struct vtnet_softc *sc) { struct vtnet_rxq *rxq; struct vtnet_txq *txq; int i; for (i = 0; i < sc->vtnet_max_vq_pairs; i++) { rxq = &sc->vtnet_rxqs[i]; if (rxq->vtnrx_tq != NULL) taskqueue_drain(rxq->vtnrx_tq, &rxq->vtnrx_intrtask); txq = &sc->vtnet_txqs[i]; if (txq->vtntx_tq != NULL) { taskqueue_drain(txq->vtntx_tq, &txq->vtntx_intrtask); #ifndef VTNET_LEGACY_TX taskqueue_drain(txq->vtntx_tq, &txq->vtntx_defrtask); #endif } } } static void vtnet_drain_rxtx_queues(struct vtnet_softc *sc) { struct vtnet_rxq *rxq; struct vtnet_txq *txq; int i; for (i = 0; i < sc->vtnet_act_vq_pairs; i++) { rxq = &sc->vtnet_rxqs[i]; vtnet_rxq_free_mbufs(rxq); txq = &sc->vtnet_txqs[i]; vtnet_txq_free_mbufs(txq); } } static void vtnet_stop_rendezvous(struct vtnet_softc *sc) { struct vtnet_rxq *rxq; struct vtnet_txq *txq; int i; /* * Lock and unlock the per-queue mutex so we known the stop * state is visible. Doing only the active queues should be * sufficient, but it does not cost much extra to do all the * queues. Note we hold the core mutex here too. */ for (i = 0; i < sc->vtnet_max_vq_pairs; i++) { rxq = &sc->vtnet_rxqs[i]; VTNET_RXQ_LOCK(rxq); VTNET_RXQ_UNLOCK(rxq); txq = &sc->vtnet_txqs[i]; VTNET_TXQ_LOCK(txq); VTNET_TXQ_UNLOCK(txq); } } static void vtnet_stop(struct vtnet_softc *sc) { device_t dev; struct ifnet *ifp; dev = sc->vtnet_dev; ifp = sc->vtnet_ifp; VTNET_CORE_LOCK_ASSERT(sc); ifp->if_drv_flags &= ~IFF_DRV_RUNNING; sc->vtnet_link_active = 0; callout_stop(&sc->vtnet_tick_ch); /* Only advisory. */ vtnet_disable_interrupts(sc); /* * Stop the host adapter. This resets it to the pre-initialized * state. It will not generate any interrupts until after it is * reinitialized. */ virtio_stop(dev); vtnet_stop_rendezvous(sc); /* Free any mbufs left in the virtqueues. 
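 * By this point virtio_stop() has reset the device, so the host side
 * should no longer be using any of the descriptors and the queued
 * mbufs can be reclaimed safely.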
*/ vtnet_drain_rxtx_queues(sc); } static int vtnet_virtio_reinit(struct vtnet_softc *sc) { device_t dev; struct ifnet *ifp; uint64_t features; int mask, error; dev = sc->vtnet_dev; ifp = sc->vtnet_ifp; features = sc->vtnet_features; mask = 0; #if defined(INET) mask |= IFCAP_RXCSUM; #endif #if defined (INET6) mask |= IFCAP_RXCSUM_IPV6; #endif /* * Re-negotiate with the host, removing any disabled receive * features. Transmit features are disabled only on our side * via if_capenable and if_hwassist. */ if (ifp->if_capabilities & mask) { /* * We require both IPv4 and IPv6 offloading to be enabled * in order to negotiated it: VirtIO does not distinguish * between the two. */ if ((ifp->if_capenable & mask) != mask) features &= ~VIRTIO_NET_F_GUEST_CSUM; } if (ifp->if_capabilities & IFCAP_LRO) { if ((ifp->if_capenable & IFCAP_LRO) == 0) features &= ~VTNET_LRO_FEATURES; } if (ifp->if_capabilities & IFCAP_VLAN_HWFILTER) { if ((ifp->if_capenable & IFCAP_VLAN_HWFILTER) == 0) features &= ~VIRTIO_NET_F_CTRL_VLAN; } error = virtio_reinit(dev, features); if (error) device_printf(dev, "virtio reinit error %d\n", error); return (error); } static void vtnet_init_rx_filters(struct vtnet_softc *sc) { struct ifnet *ifp; ifp = sc->vtnet_ifp; if (sc->vtnet_flags & VTNET_FLAG_CTRL_RX) { /* Restore promiscuous and all-multicast modes. */ vtnet_rx_filter(sc); /* Restore filtered MAC addresses. */ vtnet_rx_filter_mac(sc); } if (ifp->if_capenable & IFCAP_VLAN_HWFILTER) vtnet_rx_filter_vlan(sc); } static int vtnet_init_rx_queues(struct vtnet_softc *sc) { device_t dev; struct vtnet_rxq *rxq; int i, clsize, error; dev = sc->vtnet_dev; /* * Use the new cluster size if one has been set (via a MTU * change). Otherwise, use the standard 2K clusters. * * BMV: It might make sense to use page sized clusters as * the default (depending on the features negotiated). */ if (sc->vtnet_rx_new_clsize != 0) { clsize = sc->vtnet_rx_new_clsize; sc->vtnet_rx_new_clsize = 0; } else clsize = MCLBYTES; sc->vtnet_rx_clsize = clsize; sc->vtnet_rx_nmbufs = VTNET_NEEDED_RX_MBUFS(sc, clsize); KASSERT(sc->vtnet_flags & VTNET_FLAG_MRG_RXBUFS || sc->vtnet_rx_nmbufs < sc->vtnet_rx_nsegs, ("%s: too many rx mbufs %d for %d segments", __func__, sc->vtnet_rx_nmbufs, sc->vtnet_rx_nsegs)); for (i = 0; i < sc->vtnet_act_vq_pairs; i++) { rxq = &sc->vtnet_rxqs[i]; /* Hold the lock to satisfy asserts. */ VTNET_RXQ_LOCK(rxq); error = vtnet_rxq_populate(rxq); VTNET_RXQ_UNLOCK(rxq); if (error) { device_printf(dev, "cannot allocate mbufs for Rx queue %d\n", i); return (error); } } return (0); } static int vtnet_init_tx_queues(struct vtnet_softc *sc) { struct vtnet_txq *txq; int i; for (i = 0; i < sc->vtnet_act_vq_pairs; i++) { txq = &sc->vtnet_txqs[i]; txq->vtntx_watchdog = 0; } return (0); } static int vtnet_init_rxtx_queues(struct vtnet_softc *sc) { int error; error = vtnet_init_rx_queues(sc); if (error) return (error); error = vtnet_init_tx_queues(sc); if (error) return (error); return (0); } static void vtnet_set_active_vq_pairs(struct vtnet_softc *sc) { device_t dev; int npairs; dev = sc->vtnet_dev; if ((sc->vtnet_flags & VTNET_FLAG_MULTIQ) == 0) { sc->vtnet_act_vq_pairs = 1; return; } npairs = sc->vtnet_requested_vq_pairs; if (vtnet_ctrl_mq_cmd(sc, npairs) != 0) { device_printf(dev, "cannot set active queue pairs to %d\n", npairs); npairs = 1; } sc->vtnet_act_vq_pairs = npairs; } static int vtnet_reinit(struct vtnet_softc *sc) { struct ifnet *ifp; int error; ifp = sc->vtnet_ifp; /* Use the current MAC address. 
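 * The lladdr may have been changed while the interface was down, so
 * copy whatever the ifnet currently reports back into vtnet_hwaddr and
 * push it to the host via vtnet_set_hwaddr().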
*/ bcopy(IF_LLADDR(ifp), sc->vtnet_hwaddr, ETHER_ADDR_LEN); vtnet_set_hwaddr(sc); vtnet_set_active_vq_pairs(sc); ifp->if_hwassist = 0; if (ifp->if_capenable & IFCAP_TXCSUM) ifp->if_hwassist |= VTNET_CSUM_OFFLOAD; if (ifp->if_capenable & IFCAP_TXCSUM_IPV6) ifp->if_hwassist |= VTNET_CSUM_OFFLOAD_IPV6; if (ifp->if_capenable & IFCAP_TSO4) ifp->if_hwassist |= CSUM_IP_TSO; if (ifp->if_capenable & IFCAP_TSO6) ifp->if_hwassist |= CSUM_IP6_TSO; if (sc->vtnet_flags & VTNET_FLAG_CTRL_VQ) vtnet_init_rx_filters(sc); error = vtnet_init_rxtx_queues(sc); if (error) return (error); vtnet_enable_interrupts(sc); ifp->if_drv_flags |= IFF_DRV_RUNNING; return (0); } static void vtnet_init_locked(struct vtnet_softc *sc) { device_t dev; struct ifnet *ifp; dev = sc->vtnet_dev; ifp = sc->vtnet_ifp; VTNET_CORE_LOCK_ASSERT(sc); if (ifp->if_drv_flags & IFF_DRV_RUNNING) return; vtnet_stop(sc); /* Reinitialize with the host. */ if (vtnet_virtio_reinit(sc) != 0) goto fail; if (vtnet_reinit(sc) != 0) goto fail; virtio_reinit_complete(dev); vtnet_update_link_status(sc); callout_reset(&sc->vtnet_tick_ch, hz, vtnet_tick, sc); return; fail: vtnet_stop(sc); } static void vtnet_init(void *xsc) { struct vtnet_softc *sc; sc = xsc; VTNET_CORE_LOCK(sc); vtnet_init_locked(sc); VTNET_CORE_UNLOCK(sc); } static void vtnet_free_ctrl_vq(struct vtnet_softc *sc) { struct virtqueue *vq; vq = sc->vtnet_ctrl_vq; /* * The control virtqueue is only polled and therefore it should * already be empty. */ KASSERT(virtqueue_empty(vq), ("%s: ctrl vq %p not empty", __func__, vq)); } static void vtnet_exec_ctrl_cmd(struct vtnet_softc *sc, void *cookie, struct sglist *sg, int readable, int writable) { struct virtqueue *vq; vq = sc->vtnet_ctrl_vq; VTNET_CORE_LOCK_ASSERT(sc); KASSERT(sc->vtnet_flags & VTNET_FLAG_CTRL_VQ, ("%s: CTRL_VQ feature not negotiated", __func__)); if (!virtqueue_empty(vq)) return; if (virtqueue_enqueue(vq, cookie, sg, readable, writable) != 0) return; /* * Poll for the response, but the command is likely already * done when we return from the notify. */ virtqueue_notify(vq); virtqueue_poll(vq, NULL); } static int vtnet_ctrl_mac_cmd(struct vtnet_softc *sc, uint8_t *hwaddr) { struct virtio_net_ctrl_hdr hdr __aligned(2); struct sglist_seg segs[3]; struct sglist sg; uint8_t ack; int error; hdr.class = VIRTIO_NET_CTRL_MAC; hdr.cmd = VIRTIO_NET_CTRL_MAC_ADDR_SET; ack = VIRTIO_NET_ERR; sglist_init(&sg, 3, segs); error = 0; error |= sglist_append(&sg, &hdr, sizeof(struct virtio_net_ctrl_hdr)); error |= sglist_append(&sg, hwaddr, ETHER_ADDR_LEN); error |= sglist_append(&sg, &ack, sizeof(uint8_t)); KASSERT(error == 0 && sg.sg_nseg == 3, ("%s: error %d adding set MAC msg to sglist", __func__, error)); vtnet_exec_ctrl_cmd(sc, &ack, &sg, sg.sg_nseg - 1, 1); return (ack == VIRTIO_NET_OK ? 
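/*
 * Sketch of the control-virtqueue buffer convention used by the
 * vtnet_ctrl_*_cmd() helpers, based on how the segments are handed to
 * vtnet_exec_ctrl_cmd() here:
 *
 *   seg 0 .. n-2: device-readable (struct virtio_net_ctrl_hdr + payload)
 *   seg n-1:      device-writable ack byte, preset to VIRTIO_NET_ERR
 *
 * Hence the "sg.sg_nseg - 1" readable / 1 writable split, and the
 * command is treated as successful only if the host overwrote the
 * ack byte with VIRTIO_NET_OK.
 */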
0 : EIO); } static int vtnet_ctrl_mq_cmd(struct vtnet_softc *sc, uint16_t npairs) { struct sglist_seg segs[3]; struct sglist sg; struct { struct virtio_net_ctrl_hdr hdr; uint8_t pad1; struct virtio_net_ctrl_mq mq; uint8_t pad2; uint8_t ack; } s __aligned(2); int error; s.hdr.class = VIRTIO_NET_CTRL_MQ; s.hdr.cmd = VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET; s.mq.virtqueue_pairs = npairs; s.ack = VIRTIO_NET_ERR; sglist_init(&sg, 3, segs); error = 0; error |= sglist_append(&sg, &s.hdr, sizeof(struct virtio_net_ctrl_hdr)); error |= sglist_append(&sg, &s.mq, sizeof(struct virtio_net_ctrl_mq)); error |= sglist_append(&sg, &s.ack, sizeof(uint8_t)); KASSERT(error == 0 && sg.sg_nseg == 3, ("%s: error %d adding MQ message to sglist", __func__, error)); vtnet_exec_ctrl_cmd(sc, &s.ack, &sg, sg.sg_nseg - 1, 1); return (s.ack == VIRTIO_NET_OK ? 0 : EIO); } static int vtnet_ctrl_rx_cmd(struct vtnet_softc *sc, int cmd, int on) { struct sglist_seg segs[3]; struct sglist sg; struct { struct virtio_net_ctrl_hdr hdr; uint8_t pad1; uint8_t onoff; uint8_t pad2; uint8_t ack; } s __aligned(2); int error; KASSERT(sc->vtnet_flags & VTNET_FLAG_CTRL_RX, ("%s: CTRL_RX feature not negotiated", __func__)); s.hdr.class = VIRTIO_NET_CTRL_RX; s.hdr.cmd = cmd; s.onoff = !!on; s.ack = VIRTIO_NET_ERR; sglist_init(&sg, 3, segs); error = 0; error |= sglist_append(&sg, &s.hdr, sizeof(struct virtio_net_ctrl_hdr)); error |= sglist_append(&sg, &s.onoff, sizeof(uint8_t)); error |= sglist_append(&sg, &s.ack, sizeof(uint8_t)); KASSERT(error == 0 && sg.sg_nseg == 3, ("%s: error %d adding Rx message to sglist", __func__, error)); vtnet_exec_ctrl_cmd(sc, &s.ack, &sg, sg.sg_nseg - 1, 1); return (s.ack == VIRTIO_NET_OK ? 0 : EIO); } static int vtnet_set_promisc(struct vtnet_softc *sc, int on) { return (vtnet_ctrl_rx_cmd(sc, VIRTIO_NET_CTRL_RX_PROMISC, on)); } static int vtnet_set_allmulti(struct vtnet_softc *sc, int on) { return (vtnet_ctrl_rx_cmd(sc, VIRTIO_NET_CTRL_RX_ALLMULTI, on)); } /* * The device defaults to promiscuous mode for backwards compatibility. * Turn it off at attach time if possible. */ static void vtnet_attach_disable_promisc(struct vtnet_softc *sc) { struct ifnet *ifp; ifp = sc->vtnet_ifp; VTNET_CORE_LOCK(sc); if ((sc->vtnet_flags & VTNET_FLAG_CTRL_RX) == 0) { ifp->if_flags |= IFF_PROMISC; } else if (vtnet_set_promisc(sc, 0) != 0) { ifp->if_flags |= IFF_PROMISC; device_printf(sc->vtnet_dev, "cannot disable default promiscuous mode\n"); } VTNET_CORE_UNLOCK(sc); } static void vtnet_rx_filter(struct vtnet_softc *sc) { device_t dev; struct ifnet *ifp; dev = sc->vtnet_dev; ifp = sc->vtnet_ifp; VTNET_CORE_LOCK_ASSERT(sc); if (vtnet_set_promisc(sc, ifp->if_flags & IFF_PROMISC) != 0) device_printf(dev, "cannot %s promiscuous mode\n", ifp->if_flags & IFF_PROMISC ? "enable" : "disable"); if (vtnet_set_allmulti(sc, ifp->if_flags & IFF_ALLMULTI) != 0) device_printf(dev, "cannot %s all-multicast mode\n", ifp->if_flags & IFF_ALLMULTI ? 
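/*
 * vtnet_rx_filter() mirrors the interface flags into the
 * VIRTIO_NET_CTRL_RX class: IFF_PROMISC maps to
 * VIRTIO_NET_CTRL_RX_PROMISC and IFF_ALLMULTI to
 * VIRTIO_NET_CTRL_RX_ALLMULTI via vtnet_set_promisc() and
 * vtnet_set_allmulti(). A failure of either command is only
 * reported; the interface flags themselves are left unchanged.
 */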
"enable" : "disable"); } static u_int vtnet_copy_ifaddr(void *arg, struct sockaddr_dl *sdl, u_int ucnt) { struct vtnet_softc *sc = arg; if (memcmp(LLADDR(sdl), sc->vtnet_hwaddr, ETHER_ADDR_LEN) == 0) return (0); if (ucnt < VTNET_MAX_MAC_ENTRIES) bcopy(LLADDR(sdl), &sc->vtnet_mac_filter->vmf_unicast.macs[ucnt], ETHER_ADDR_LEN); return (1); } static u_int vtnet_copy_maddr(void *arg, struct sockaddr_dl *sdl, u_int mcnt) { struct vtnet_mac_filter *filter = arg; if (mcnt < VTNET_MAX_MAC_ENTRIES) bcopy(LLADDR(sdl), &filter->vmf_multicast.macs[mcnt], ETHER_ADDR_LEN); return (1); } static void vtnet_rx_filter_mac(struct vtnet_softc *sc) { struct virtio_net_ctrl_hdr hdr __aligned(2); struct vtnet_mac_filter *filter; struct sglist_seg segs[4]; struct sglist sg; struct ifnet *ifp; bool promisc, allmulti; u_int ucnt, mcnt; int error; uint8_t ack; ifp = sc->vtnet_ifp; filter = sc->vtnet_mac_filter; VTNET_CORE_LOCK_ASSERT(sc); KASSERT(sc->vtnet_flags & VTNET_FLAG_CTRL_RX, ("%s: CTRL_RX feature not negotiated", __func__)); /* Unicast MAC addresses: */ ucnt = if_foreach_lladdr(ifp, vtnet_copy_ifaddr, sc); promisc = (ucnt > VTNET_MAX_MAC_ENTRIES); if (promisc) { filter->vmf_unicast.nentries = 0; if_printf(ifp, "more than %d MAC addresses assigned, " "falling back to promiscuous mode\n", VTNET_MAX_MAC_ENTRIES); } else filter->vmf_unicast.nentries = ucnt; /* Multicast MAC addresses: */ mcnt = if_foreach_llmaddr(ifp, vtnet_copy_maddr, filter); allmulti = (mcnt > VTNET_MAX_MAC_ENTRIES); if (allmulti) { filter->vmf_multicast.nentries = 0; if_printf(ifp, "more than %d multicast MAC addresses " "assigned, falling back to all-multicast mode\n", VTNET_MAX_MAC_ENTRIES); } else filter->vmf_multicast.nentries = mcnt; if (promisc && allmulti) goto out; hdr.class = VIRTIO_NET_CTRL_MAC; hdr.cmd = VIRTIO_NET_CTRL_MAC_TABLE_SET; ack = VIRTIO_NET_ERR; sglist_init(&sg, 4, segs); error = 0; error |= sglist_append(&sg, &hdr, sizeof(struct virtio_net_ctrl_hdr)); error |= sglist_append(&sg, &filter->vmf_unicast, sizeof(uint32_t) + filter->vmf_unicast.nentries * ETHER_ADDR_LEN); error |= sglist_append(&sg, &filter->vmf_multicast, sizeof(uint32_t) + filter->vmf_multicast.nentries * ETHER_ADDR_LEN); error |= sglist_append(&sg, &ack, sizeof(uint8_t)); KASSERT(error == 0 && sg.sg_nseg == 4, ("%s: error %d adding MAC filter msg to sglist", __func__, error)); vtnet_exec_ctrl_cmd(sc, &ack, &sg, sg.sg_nseg - 1, 1); if (ack != VIRTIO_NET_OK) if_printf(ifp, "error setting host MAC filter table\n"); out: if (promisc != 0 && vtnet_set_promisc(sc, 1) != 0) if_printf(ifp, "cannot enable promiscuous mode\n"); if (allmulti != 0 && vtnet_set_allmulti(sc, 1) != 0) if_printf(ifp, "cannot enable all-multicast mode\n"); } static int vtnet_exec_vlan_filter(struct vtnet_softc *sc, int add, uint16_t tag) { struct sglist_seg segs[3]; struct sglist sg; struct { struct virtio_net_ctrl_hdr hdr; uint8_t pad1; uint16_t tag; uint8_t pad2; uint8_t ack; } s __aligned(2); int error; s.hdr.class = VIRTIO_NET_CTRL_VLAN; s.hdr.cmd = add ? VIRTIO_NET_CTRL_VLAN_ADD : VIRTIO_NET_CTRL_VLAN_DEL; s.tag = tag; s.ack = VIRTIO_NET_ERR; sglist_init(&sg, 3, segs); error = 0; error |= sglist_append(&sg, &s.hdr, sizeof(struct virtio_net_ctrl_hdr)); error |= sglist_append(&sg, &s.tag, sizeof(uint16_t)); error |= sglist_append(&sg, &s.ack, sizeof(uint8_t)); KASSERT(error == 0 && sg.sg_nseg == 3, ("%s: error %d adding VLAN message to sglist", __func__, error)); vtnet_exec_ctrl_cmd(sc, &s.ack, &sg, sg.sg_nseg - 1, 1); return (s.ack == VIRTIO_NET_OK ? 
0 : EIO); } static void vtnet_rx_filter_vlan(struct vtnet_softc *sc) { uint32_t w; uint16_t tag; int i, bit; VTNET_CORE_LOCK_ASSERT(sc); KASSERT(sc->vtnet_flags & VTNET_FLAG_VLAN_FILTER, ("%s: VLAN_FILTER feature not negotiated", __func__)); /* Enable the filter for each configured VLAN. */ for (i = 0; i < VTNET_VLAN_FILTER_NWORDS; i++) { w = sc->vtnet_vlan_filter[i]; while ((bit = ffs(w) - 1) != -1) { w &= ~(1 << bit); tag = sizeof(w) * CHAR_BIT * i + bit; if (vtnet_exec_vlan_filter(sc, 1, tag) != 0) { device_printf(sc->vtnet_dev, "cannot enable VLAN %d filter\n", tag); } } } } static void vtnet_update_vlan_filter(struct vtnet_softc *sc, int add, uint16_t tag) { struct ifnet *ifp; int idx, bit; ifp = sc->vtnet_ifp; idx = (tag >> 5) & 0x7F; bit = tag & 0x1F; if (tag == 0 || tag > 4095) return; VTNET_CORE_LOCK(sc); if (add) sc->vtnet_vlan_filter[idx] |= (1 << bit); else sc->vtnet_vlan_filter[idx] &= ~(1 << bit); if (ifp->if_capenable & IFCAP_VLAN_HWFILTER && ifp->if_drv_flags & IFF_DRV_RUNNING && vtnet_exec_vlan_filter(sc, add, tag) != 0) { device_printf(sc->vtnet_dev, "cannot %s VLAN %d %s the host filter table\n", add ? "add" : "remove", tag, add ? "to" : "from"); } VTNET_CORE_UNLOCK(sc); } static void vtnet_register_vlan(void *arg, struct ifnet *ifp, uint16_t tag) { if (ifp->if_softc != arg) return; vtnet_update_vlan_filter(arg, 1, tag); } static void vtnet_unregister_vlan(void *arg, struct ifnet *ifp, uint16_t tag) { if (ifp->if_softc != arg) return; vtnet_update_vlan_filter(arg, 0, tag); } static int vtnet_is_link_up(struct vtnet_softc *sc) { device_t dev; struct ifnet *ifp; uint16_t status; dev = sc->vtnet_dev; ifp = sc->vtnet_ifp; if ((ifp->if_capabilities & IFCAP_LINKSTATE) == 0) status = VIRTIO_NET_S_LINK_UP; else status = virtio_read_dev_config_2(dev, offsetof(struct virtio_net_config, status)); return ((status & VIRTIO_NET_S_LINK_UP) != 0); } static void vtnet_update_link_status(struct vtnet_softc *sc) { struct ifnet *ifp; int link; ifp = sc->vtnet_ifp; VTNET_CORE_LOCK_ASSERT(sc); link = vtnet_is_link_up(sc); /* Notify if the link status has changed. 
*/ if (link != 0 && sc->vtnet_link_active == 0) { sc->vtnet_link_active = 1; if_link_state_change(ifp, LINK_STATE_UP); } else if (link == 0 && sc->vtnet_link_active != 0) { sc->vtnet_link_active = 0; if_link_state_change(ifp, LINK_STATE_DOWN); } } static int vtnet_ifmedia_upd(struct ifnet *ifp) { struct vtnet_softc *sc; struct ifmedia *ifm; sc = ifp->if_softc; ifm = &sc->vtnet_media; if (IFM_TYPE(ifm->ifm_media) != IFM_ETHER) return (EINVAL); return (0); } static void vtnet_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr) { struct vtnet_softc *sc; sc = ifp->if_softc; ifmr->ifm_status = IFM_AVALID; ifmr->ifm_active = IFM_ETHER; VTNET_CORE_LOCK(sc); if (vtnet_is_link_up(sc) != 0) { ifmr->ifm_status |= IFM_ACTIVE; ifmr->ifm_active |= VTNET_MEDIATYPE; } else ifmr->ifm_active |= IFM_NONE; VTNET_CORE_UNLOCK(sc); } static void vtnet_set_hwaddr(struct vtnet_softc *sc) { device_t dev; int i; dev = sc->vtnet_dev; if (sc->vtnet_flags & VTNET_FLAG_CTRL_MAC) { if (vtnet_ctrl_mac_cmd(sc, sc->vtnet_hwaddr) != 0) device_printf(dev, "unable to set MAC address\n"); } else if (sc->vtnet_flags & VTNET_FLAG_MAC) { for (i = 0; i < ETHER_ADDR_LEN; i++) { virtio_write_dev_config_1(dev, offsetof(struct virtio_net_config, mac) + i, sc->vtnet_hwaddr[i]); } } } static void vtnet_get_hwaddr(struct vtnet_softc *sc) { device_t dev; int i; dev = sc->vtnet_dev; if ((sc->vtnet_flags & VTNET_FLAG_MAC) == 0) { /* * Generate a random locally administered unicast address. * * It would be nice to generate the same MAC address across * reboots, but it seems all the hosts currently available * support the MAC feature, so this isn't too important. */ sc->vtnet_hwaddr[0] = 0xB2; arc4rand(&sc->vtnet_hwaddr[1], ETHER_ADDR_LEN - 1, 0); vtnet_set_hwaddr(sc); return; } for (i = 0; i < ETHER_ADDR_LEN; i++) { sc->vtnet_hwaddr[i] = virtio_read_dev_config_1(dev, offsetof(struct virtio_net_config, mac) + i); } } static void vtnet_vlan_tag_remove(struct mbuf *m) { struct ether_vlan_header *evh; evh = mtod(m, struct ether_vlan_header *); m->m_pkthdr.ether_vtag = ntohs(evh->evl_tag); m->m_flags |= M_VLANTAG; /* Strip the 802.1Q header. */ bcopy((char *) evh, (char *) evh + ETHER_VLAN_ENCAP_LEN, ETHER_HDR_LEN - ETHER_TYPE_LEN); m_adj(m, ETHER_VLAN_ENCAP_LEN); } static void vtnet_set_rx_process_limit(struct vtnet_softc *sc) { int limit; limit = vtnet_tunable_int(sc, "rx_process_limit", vtnet_rx_process_limit); if (limit < 0) limit = INT_MAX; sc->vtnet_rx_process_limit = limit; } static void vtnet_set_tx_intr_threshold(struct vtnet_softc *sc) { int size, thresh; size = virtqueue_size(sc->vtnet_txqs[0].vtntx_vq); /* * The Tx interrupt is disabled until the queue free count falls * below our threshold. Completed frames are drained from the Tx * virtqueue before transmitting new frames and in the watchdog * callout, so the frequency of Tx interrupts is greatly reduced, * at the cost of not freeing mbufs as quickly as they otherwise * would be. * * N.B. We assume all the Tx queues are the same size. */ thresh = size / 4; /* * Without indirect descriptors, leave enough room for the most * segments we handle. 
*/ if ((sc->vtnet_flags & VTNET_FLAG_INDIRECT) == 0 && thresh < sc->vtnet_tx_nsegs) thresh = sc->vtnet_tx_nsegs; sc->vtnet_tx_intr_thresh = thresh; } static void vtnet_setup_rxq_sysctl(struct sysctl_ctx_list *ctx, struct sysctl_oid_list *child, struct vtnet_rxq *rxq) { struct sysctl_oid *node; struct sysctl_oid_list *list; struct vtnet_rxq_stats *stats; char namebuf[16]; snprintf(namebuf, sizeof(namebuf), "rxq%d", rxq->vtnrx_id); node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, namebuf, CTLFLAG_RD, NULL, "Receive Queue"); list = SYSCTL_CHILDREN(node); stats = &rxq->vtnrx_stats; SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "ipackets", CTLFLAG_RD, &stats->vrxs_ipackets, "Receive packets"); SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "ibytes", CTLFLAG_RD, &stats->vrxs_ibytes, "Receive bytes"); SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "iqdrops", CTLFLAG_RD, &stats->vrxs_iqdrops, "Receive drops"); SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "ierrors", CTLFLAG_RD, &stats->vrxs_ierrors, "Receive errors"); SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "csum", CTLFLAG_RD, &stats->vrxs_csum, "Receive checksum offloaded"); SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "csum_failed", CTLFLAG_RD, &stats->vrxs_csum_failed, "Receive checksum offload failed"); SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "rescheduled", CTLFLAG_RD, &stats->vrxs_rescheduled, "Receive interrupt handler rescheduled"); } static void vtnet_setup_txq_sysctl(struct sysctl_ctx_list *ctx, struct sysctl_oid_list *child, struct vtnet_txq *txq) { struct sysctl_oid *node; struct sysctl_oid_list *list; struct vtnet_txq_stats *stats; char namebuf[16]; snprintf(namebuf, sizeof(namebuf), "txq%d", txq->vtntx_id); node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, namebuf, CTLFLAG_RD, NULL, "Transmit Queue"); list = SYSCTL_CHILDREN(node); stats = &txq->vtntx_stats; SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "opackets", CTLFLAG_RD, &stats->vtxs_opackets, "Transmit packets"); SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "obytes", CTLFLAG_RD, &stats->vtxs_obytes, "Transmit bytes"); SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "omcasts", CTLFLAG_RD, &stats->vtxs_omcasts, "Transmit multicasts"); SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "csum", CTLFLAG_RD, &stats->vtxs_csum, "Transmit checksum offloaded"); SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "tso", CTLFLAG_RD, &stats->vtxs_tso, "Transmit segmentation offloaded"); SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "rescheduled", CTLFLAG_RD, &stats->vtxs_rescheduled, "Transmit interrupt handler rescheduled"); } static void vtnet_setup_queue_sysctl(struct vtnet_softc *sc) { device_t dev; struct sysctl_ctx_list *ctx; struct sysctl_oid *tree; struct sysctl_oid_list *child; int i; dev = sc->vtnet_dev; ctx = device_get_sysctl_ctx(dev); tree = device_get_sysctl_tree(dev); child = SYSCTL_CHILDREN(tree); for (i = 0; i < sc->vtnet_max_vq_pairs; i++) { vtnet_setup_rxq_sysctl(ctx, child, &sc->vtnet_rxqs[i]); vtnet_setup_txq_sysctl(ctx, child, &sc->vtnet_txqs[i]); } } static void vtnet_setup_stat_sysctl(struct sysctl_ctx_list *ctx, struct sysctl_oid_list *child, struct vtnet_softc *sc) { struct vtnet_statistics *stats; struct vtnet_rxq_stats rxaccum; struct vtnet_txq_stats txaccum; vtnet_accum_stats(sc, &rxaccum, &txaccum); stats = &sc->vtnet_stats; stats->rx_csum_offloaded = rxaccum.vrxs_csum; stats->rx_csum_failed = rxaccum.vrxs_csum_failed; stats->rx_task_rescheduled = rxaccum.vrxs_rescheduled; stats->tx_csum_offloaded = txaccum.vtxs_csum; stats->tx_tso_offloaded = txaccum.vtxs_tso; stats->tx_task_rescheduled = txaccum.vtxs_rescheduled; SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, 
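/*
 * Note on the softc-wide counters exported below:
 * vtnet_accum_stats() (called just above) folds the per-queue vrxs_
 * and vtxs_ counters into sc->vtnet_stats, so the global
 * rx_csum_offloaded, tx_tso_offloaded and task_rescheduled values
 * seeded here reflect the sum over all queue pairs at the time the
 * read-only sysctl nodes are created.
 */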
"mbuf_alloc_failed", CTLFLAG_RD, &stats->mbuf_alloc_failed, "Mbuf cluster allocation failures"); SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "rx_frame_too_large", CTLFLAG_RD, &stats->rx_frame_too_large, "Received frame larger than the mbuf chain"); SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "rx_enq_replacement_failed", CTLFLAG_RD, &stats->rx_enq_replacement_failed, "Enqueuing the replacement receive mbuf failed"); SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "rx_mergeable_failed", CTLFLAG_RD, &stats->rx_mergeable_failed, "Mergeable buffers receive failures"); SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "rx_csum_bad_ethtype", CTLFLAG_RD, &stats->rx_csum_bad_ethtype, "Received checksum offloaded buffer with unsupported " "Ethernet type"); SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "rx_csum_bad_ipproto", CTLFLAG_RD, &stats->rx_csum_bad_ipproto, "Received checksum offloaded buffer with incorrect IP protocol"); SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "rx_csum_bad_offset", CTLFLAG_RD, &stats->rx_csum_bad_offset, "Received checksum offloaded buffer with incorrect offset"); SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "rx_csum_bad_proto", CTLFLAG_RD, &stats->rx_csum_bad_proto, "Received checksum offloaded buffer with incorrect protocol"); SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "rx_csum_failed", CTLFLAG_RD, &stats->rx_csum_failed, "Received buffer checksum offload failed"); SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "rx_csum_offloaded", CTLFLAG_RD, &stats->rx_csum_offloaded, "Received buffer checksum offload succeeded"); SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "rx_task_rescheduled", CTLFLAG_RD, &stats->rx_task_rescheduled, "Times the receive interrupt task rescheduled itself"); SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "tx_csum_bad_ethtype", CTLFLAG_RD, &stats->tx_csum_bad_ethtype, "Aborted transmit of checksum offloaded buffer with unknown " "Ethernet type"); SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "tx_tso_bad_ethtype", CTLFLAG_RD, &stats->tx_tso_bad_ethtype, "Aborted transmit of TSO buffer with unknown Ethernet type"); SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "tx_tso_not_tcp", CTLFLAG_RD, &stats->tx_tso_not_tcp, "Aborted transmit of TSO buffer with non TCP protocol"); SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "tx_defragged", CTLFLAG_RD, &stats->tx_defragged, "Transmit mbufs defragged"); SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "tx_defrag_failed", CTLFLAG_RD, &stats->tx_defrag_failed, "Aborted transmit of buffer because defrag failed"); SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "tx_csum_offloaded", CTLFLAG_RD, &stats->tx_csum_offloaded, "Offloaded checksum of transmitted buffer"); SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "tx_tso_offloaded", CTLFLAG_RD, &stats->tx_tso_offloaded, "Segmentation offload of transmitted buffer"); SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "tx_task_rescheduled", CTLFLAG_RD, &stats->tx_task_rescheduled, "Times the transmit interrupt task rescheduled itself"); } static void vtnet_setup_sysctl(struct vtnet_softc *sc) { device_t dev; struct sysctl_ctx_list *ctx; struct sysctl_oid *tree; struct sysctl_oid_list *child; dev = sc->vtnet_dev; ctx = device_get_sysctl_ctx(dev); tree = device_get_sysctl_tree(dev); child = SYSCTL_CHILDREN(tree); SYSCTL_ADD_INT(ctx, child, OID_AUTO, "max_vq_pairs", CTLFLAG_RD, &sc->vtnet_max_vq_pairs, 0, "Maximum number of supported virtqueue pairs"); SYSCTL_ADD_INT(ctx, child, OID_AUTO, "requested_vq_pairs", CTLFLAG_RD, &sc->vtnet_requested_vq_pairs, 0, "Requested number of virtqueue pairs"); SYSCTL_ADD_INT(ctx, child, OID_AUTO, "act_vq_pairs", CTLFLAG_RD, &sc->vtnet_act_vq_pairs, 0, "Number of active 
virtqueue pairs"); vtnet_setup_stat_sysctl(ctx, child, sc); } static int vtnet_rxq_enable_intr(struct vtnet_rxq *rxq) { return (virtqueue_enable_intr(rxq->vtnrx_vq)); } static void vtnet_rxq_disable_intr(struct vtnet_rxq *rxq) { virtqueue_disable_intr(rxq->vtnrx_vq); } static int vtnet_txq_enable_intr(struct vtnet_txq *txq) { struct virtqueue *vq; vq = txq->vtntx_vq; if (vtnet_txq_below_threshold(txq) != 0) return (virtqueue_postpone_intr(vq, VQ_POSTPONE_LONG)); /* * The free count is above our threshold. Keep the Tx interrupt * disabled until the queue is fuller. */ return (0); } static void vtnet_txq_disable_intr(struct vtnet_txq *txq) { virtqueue_disable_intr(txq->vtntx_vq); } static void vtnet_enable_rx_interrupts(struct vtnet_softc *sc) { int i; for (i = 0; i < sc->vtnet_act_vq_pairs; i++) vtnet_rxq_enable_intr(&sc->vtnet_rxqs[i]); } static void vtnet_enable_tx_interrupts(struct vtnet_softc *sc) { int i; for (i = 0; i < sc->vtnet_act_vq_pairs; i++) vtnet_txq_enable_intr(&sc->vtnet_txqs[i]); } static void vtnet_enable_interrupts(struct vtnet_softc *sc) { vtnet_enable_rx_interrupts(sc); vtnet_enable_tx_interrupts(sc); } static void vtnet_disable_rx_interrupts(struct vtnet_softc *sc) { int i; for (i = 0; i < sc->vtnet_act_vq_pairs; i++) vtnet_rxq_disable_intr(&sc->vtnet_rxqs[i]); } static void vtnet_disable_tx_interrupts(struct vtnet_softc *sc) { int i; for (i = 0; i < sc->vtnet_act_vq_pairs; i++) vtnet_txq_disable_intr(&sc->vtnet_txqs[i]); } static void vtnet_disable_interrupts(struct vtnet_softc *sc) { vtnet_disable_rx_interrupts(sc); vtnet_disable_tx_interrupts(sc); } static int vtnet_tunable_int(struct vtnet_softc *sc, const char *knob, int def) { char path[64]; snprintf(path, sizeof(path), "hw.vtnet.%d.%s", device_get_unit(sc->vtnet_dev), knob); TUNABLE_INT_FETCH(path, &def); return (def); } #ifdef DEBUGNET static void vtnet_debugnet_init(struct ifnet *ifp, int *nrxr, int *ncl, int *clsize) { struct vtnet_softc *sc; sc = if_getsoftc(ifp); VTNET_CORE_LOCK(sc); *nrxr = sc->vtnet_max_vq_pairs; *ncl = DEBUGNET_MAX_IN_FLIGHT; *clsize = sc->vtnet_rx_clsize; VTNET_CORE_UNLOCK(sc); } static void vtnet_debugnet_event(struct ifnet *ifp __unused, enum debugnet_ev event __unused) { } static int vtnet_debugnet_transmit(struct ifnet *ifp, struct mbuf *m) { struct vtnet_softc *sc; struct vtnet_txq *txq; int error; sc = if_getsoftc(ifp); if ((if_getdrvflags(ifp) & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) != IFF_DRV_RUNNING) return (EBUSY); txq = &sc->vtnet_txqs[0]; error = vtnet_txq_encap(txq, &m, M_NOWAIT | M_USE_RESERVE); if (error == 0) (void)vtnet_txq_notify(txq); return (error); } static int vtnet_debugnet_poll(struct ifnet *ifp, int count) { struct vtnet_softc *sc; int i; sc = if_getsoftc(ifp); if ((if_getdrvflags(ifp) & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) != IFF_DRV_RUNNING) return (EBUSY); (void)vtnet_txq_eof(&sc->vtnet_txqs[0]); for (i = 0; i < sc->vtnet_max_vq_pairs; i++) (void)vtnet_rxq_eof(&sc->vtnet_rxqs[i]); return (0); } #endif /* DEBUGNET */ diff --git a/sys/mips/nlm/dev/net/xlpge.c b/sys/mips/nlm/dev/net/xlpge.c index e1beb9ad79aa..ac0c4d6e843d 100644 --- a/sys/mips/nlm/dev/net/xlpge.c +++ b/sys/mips/nlm/dev/net/xlpge.c @@ -1,1544 +1,1543 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2003-2012 Broadcom Corporation * All Rights Reserved * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the * distribution. * * THIS SOFTWARE IS PROVIDED BY BROADCOM ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL BROADCOM OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define __RMAN_RESOURCE_VISIBLE #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* for DELAY */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "miidevs.h" #include #include "miibus_if.h" #include #include /*#define XLP_DRIVER_LOOPBACK*/ static struct nae_port_config nae_port_config[64]; int poe_cl_tbl[MAX_POE_CLASSES] = { 0x0, 0x249249, 0x492492, 0x6db6db, 0x924924, 0xb6db6d, 0xdb6db6, 0xffffff }; /* #define DUMP_PACKET */ static uint64_t nlm_paddr_ld(uint64_t paddr) { uint64_t xkaddr = 0x9800000000000000 | paddr; return (nlm_load_dword_daddr(xkaddr)); } struct nlm_xlp_portdata ifp_ports[64]; static uma_zone_t nl_tx_desc_zone; /* This implementation will register the following tree of device * registration: * pcibus * | * xlpnae (1 instance - virtual entity) * | * xlpge * (18 sgmii / 4 xaui / 2 interlaken instances) * | * miibus */ static int nlm_xlpnae_probe(device_t); static int nlm_xlpnae_attach(device_t); static int nlm_xlpnae_detach(device_t); static int nlm_xlpnae_suspend(device_t); static int nlm_xlpnae_resume(device_t); static int nlm_xlpnae_shutdown(device_t); static device_method_t nlm_xlpnae_methods[] = { /* Methods from the device interface */ DEVMETHOD(device_probe, nlm_xlpnae_probe), DEVMETHOD(device_attach, nlm_xlpnae_attach), DEVMETHOD(device_detach, nlm_xlpnae_detach), DEVMETHOD(device_suspend, nlm_xlpnae_suspend), DEVMETHOD(device_resume, nlm_xlpnae_resume), DEVMETHOD(device_shutdown, nlm_xlpnae_shutdown), DEVMETHOD(bus_driver_added, bus_generic_driver_added), DEVMETHOD_END }; static driver_t nlm_xlpnae_driver = { "xlpnae", nlm_xlpnae_methods, sizeof(struct nlm_xlpnae_softc) }; static devclass_t nlm_xlpnae_devclass; static int nlm_xlpge_probe(device_t); static int nlm_xlpge_attach(device_t); static int nlm_xlpge_detach(device_t); static int nlm_xlpge_suspend(device_t); static int nlm_xlpge_resume(device_t); static int nlm_xlpge_shutdown(device_t); /* mii override functions */ static int nlm_xlpge_mii_read(device_t, int, int); static int 
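/*
 * These MII prototypes are the custom accessors wired into
 * nlm_xlpge_methods[] below as miibus_readreg, miibus_writereg and
 * miibus_statchg, so the generic miibus code drives the PHYs through
 * the NAE MDIO block rather than a conventional MAC register window.
 */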
nlm_xlpge_mii_write(device_t, int, int, int); static void nlm_xlpge_mii_statchg(device_t); static device_method_t nlm_xlpge_methods[] = { /* Methods from the device interface */ DEVMETHOD(device_probe, nlm_xlpge_probe), DEVMETHOD(device_attach, nlm_xlpge_attach), DEVMETHOD(device_detach, nlm_xlpge_detach), DEVMETHOD(device_suspend, nlm_xlpge_suspend), DEVMETHOD(device_resume, nlm_xlpge_resume), DEVMETHOD(device_shutdown, nlm_xlpge_shutdown), /* Methods from the nexus bus needed for explicitly * probing children when driver is loaded as a kernel module */ DEVMETHOD(miibus_readreg, nlm_xlpge_mii_read), DEVMETHOD(miibus_writereg, nlm_xlpge_mii_write), DEVMETHOD(miibus_statchg, nlm_xlpge_mii_statchg), /* Terminate method list */ DEVMETHOD_END }; static driver_t nlm_xlpge_driver = { "xlpge", nlm_xlpge_methods, sizeof(struct nlm_xlpge_softc) }; static devclass_t nlm_xlpge_devclass; DRIVER_MODULE(xlpnae, pci, nlm_xlpnae_driver, nlm_xlpnae_devclass, 0, 0); DRIVER_MODULE(xlpge, xlpnae, nlm_xlpge_driver, nlm_xlpge_devclass, 0, 0); DRIVER_MODULE(miibus, xlpge, miibus_driver, miibus_devclass, 0, 0); MODULE_DEPEND(pci, xlpnae, 1, 1, 1); MODULE_DEPEND(xlpnae, xlpge, 1, 1, 1); MODULE_DEPEND(xlpge, ether, 1, 1, 1); MODULE_DEPEND(xlpge, miibus, 1, 1, 1); #define SGMII_RCV_CONTEXT_WIDTH 8 /* prototypes */ static void nlm_xlpge_msgring_handler(int vc, int size, int code, int srcid, struct nlm_fmn_msg *msg, void *data); static void nlm_xlpge_submit_rx_free_desc(struct nlm_xlpge_softc *sc, int num); static void nlm_xlpge_init(void *addr); static void nlm_xlpge_port_disable(struct nlm_xlpge_softc *sc); static void nlm_xlpge_port_enable(struct nlm_xlpge_softc *sc); /* globals */ int dbg_on = 1; int cntx2port[524]; static __inline void atomic_incr_long(unsigned long *addr) { atomic_add_long(addr, 1); } /* * xlpnae driver implementation */ static int nlm_xlpnae_probe(device_t dev) { if (pci_get_vendor(dev) != PCI_VENDOR_NETLOGIC || pci_get_device(dev) != PCI_DEVICE_ID_NLM_NAE) return (ENXIO); return (BUS_PROBE_DEFAULT); } static void nlm_xlpnae_print_frin_desc_carving(struct nlm_xlpnae_softc *sc) { int intf; uint32_t value; int start, size; /* XXXJC: use max_ports instead of 20 ? 
*/ for (intf = 0; intf < 20; intf++) { nlm_write_nae_reg(sc->base, NAE_FREE_IN_FIFO_CFG, (0x80000000 | intf)); value = nlm_read_nae_reg(sc->base, NAE_FREE_IN_FIFO_CFG); size = 2 * ((value >> 20) & 0x3ff); start = 2 * ((value >> 8) & 0x1ff); } } static void nlm_config_egress(struct nlm_xlpnae_softc *sc, int nblock, int context_base, int hwport, int max_channels) { int offset, num_channels; uint32_t data; num_channels = sc->portcfg[hwport].num_channels; data = (2048 << 12) | (hwport << 4) | 1; nlm_write_nae_reg(sc->base, NAE_TX_IF_BURSTMAX_CMD, data); data = ((context_base + num_channels - 1) << 22) | (context_base << 12) | (hwport << 4) | 1; nlm_write_nae_reg(sc->base, NAE_TX_DDR_ACTVLIST_CMD, data); config_egress_fifo_carvings(sc->base, hwport, context_base, num_channels, max_channels, sc->portcfg); config_egress_fifo_credits(sc->base, hwport, context_base, num_channels, max_channels, sc->portcfg); data = nlm_read_nae_reg(sc->base, NAE_DMA_TX_CREDIT_TH); data |= (1 << 25) | (1 << 24); nlm_write_nae_reg(sc->base, NAE_DMA_TX_CREDIT_TH, data); for (offset = 0; offset < num_channels; offset++) { nlm_write_nae_reg(sc->base, NAE_TX_SCHED_MAP_CMD1, NAE_DRR_QUANTA); data = (hwport << 15) | ((context_base + offset) << 5); if (sc->cmplx_type[nblock] == ILC) data |= (offset << 20); nlm_write_nae_reg(sc->base, NAE_TX_SCHED_MAP_CMD0, data | 1); nlm_write_nae_reg(sc->base, NAE_TX_SCHED_MAP_CMD0, data); } } static int xlpnae_get_maxchannels(struct nlm_xlpnae_softc *sc) { int maxchans = 0; int i; for (i = 0; i < sc->max_ports; i++) { if (sc->portcfg[i].type == UNKNOWN) continue; maxchans += sc->portcfg[i].num_channels; } return (maxchans); } static void nlm_setup_interface(struct nlm_xlpnae_softc *sc, int nblock, int port, uint32_t cur_flow_base, uint32_t flow_mask, int max_channels, int context) { uint64_t nae_base = sc->base; int mtu = 1536; /* XXXJC: don't hard code */ uint32_t ucore_mask; if (sc->cmplx_type[nblock] == XAUIC) nlm_config_xaui(nae_base, nblock, mtu, mtu, sc->portcfg[port].vlan_pri_en); nlm_config_freein_fifo_uniq_cfg(nae_base, port, sc->portcfg[port].free_desc_sizes); nlm_config_ucore_iface_mask_cfg(nae_base, port, sc->portcfg[port].ucore_mask); nlm_program_flow_cfg(nae_base, port, cur_flow_base, flow_mask); if (sc->cmplx_type[nblock] == SGMIIC) nlm_configure_sgmii_interface(nae_base, nblock, port, mtu, 0); nlm_config_egress(sc, nblock, context, port, max_channels); nlm_nae_init_netior(nae_base, sc->nblocks); nlm_nae_open_if(nae_base, nblock, sc->cmplx_type[nblock], port, sc->portcfg[port].free_desc_sizes); /* XXXJC: check mask calculation */ ucore_mask = (1 << sc->nucores) - 1; nlm_nae_init_ucore(nae_base, port, ucore_mask); } static void nlm_setup_interfaces(struct nlm_xlpnae_softc *sc) { uint64_t nae_base; uint32_t cur_slot, cur_slot_base; uint32_t cur_flow_base, port, flow_mask; int max_channels; int i, context; cur_slot = 0; cur_slot_base = 0; cur_flow_base = 0; nae_base = sc->base; flow_mask = nlm_get_flow_mask(sc->total_num_ports); /* calculate max_channels */ max_channels = xlpnae_get_maxchannels(sc); port = 0; context = 0; for (i = 0; i < sc->max_ports; i++) { if (sc->portcfg[i].type == UNKNOWN) continue; nlm_setup_interface(sc, sc->portcfg[i].block, i, cur_flow_base, flow_mask, max_channels, context); cur_flow_base += sc->per_port_num_flows; context += sc->portcfg[i].num_channels; } } static void nlm_xlpnae_init(int node, struct nlm_xlpnae_softc *sc) { uint64_t nae_base; uint32_t ucoremask = 0; uint32_t val; int i; nae_base = sc->base; nlm_nae_flush_free_fifo(nae_base, 
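/*
 * nlm_setup_interfaces() above walks every configured port and hands
 * it a contiguous block of resources: cur_flow_base advances by
 * sc->per_port_num_flows flow IDs per port, and the egress context
 * base advances by that port's num_channels, matching the carving
 * programmed by nlm_config_egress().  The nlm_xlpnae_init() sequence
 * started here then loads the ucores, sets the NAE frequency and
 * brings up the SGMII/XAUI PCS blocks before the interfaces are
 * opened.
 */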
sc->nblocks); nlm_deflate_frin_fifo_carving(nae_base, sc->max_ports); nlm_reset_nae(node); for (i = 0; i < sc->nucores; i++) /* XXXJC: code repeated below */ ucoremask |= (0x1 << i); printf("Loading 0x%x ucores with microcode\n", ucoremask); nlm_ucore_load_all(nae_base, ucoremask, 1); val = nlm_set_device_frequency(node, DFS_DEVICE_NAE, sc->freq); printf("Setup NAE frequency to %dMHz\n", val); nlm_mdio_reset_all(nae_base); printf("Initialze SGMII PCS for blocks 0x%x\n", sc->sgmiimask); nlm_sgmii_pcs_init(nae_base, sc->sgmiimask); printf("Initialze XAUI PCS for blocks 0x%x\n", sc->xauimask); nlm_xaui_pcs_init(nae_base, sc->xauimask); /* clear NETIOR soft reset */ nlm_write_nae_reg(nae_base, NAE_LANE_CFG_SOFTRESET, 0x0); /* Disable RX enable bit in RX_CONFIG */ val = nlm_read_nae_reg(nae_base, NAE_RX_CONFIG); val &= 0xfffffffe; nlm_write_nae_reg(nae_base, NAE_RX_CONFIG, val); if (nlm_is_xlp8xx_ax() == 0) { val = nlm_read_nae_reg(nae_base, NAE_TX_CONFIG); val &= ~(1 << 3); nlm_write_nae_reg(nae_base, NAE_TX_CONFIG, val); } nlm_setup_poe_class_config(nae_base, MAX_POE_CLASSES, sc->ncontexts, poe_cl_tbl); nlm_setup_vfbid_mapping(nae_base); nlm_setup_flow_crc_poly(nae_base, sc->flow_crc_poly); nlm_setup_rx_cal_cfg(nae_base, sc->max_ports, sc->portcfg); /* note: xlp8xx Ax does not have Tx Calendering */ if (!nlm_is_xlp8xx_ax()) nlm_setup_tx_cal_cfg(nae_base, sc->max_ports, sc->portcfg); nlm_setup_interfaces(sc); nlm_config_poe(sc->poe_base, sc->poedv_base); if (sc->hw_parser_en) nlm_enable_hardware_parser(nae_base); if (sc->prepad_en) nlm_prepad_enable(nae_base, sc->prepad_size); if (sc->ieee_1588_en) nlm_setup_1588_timer(sc->base, sc->portcfg); } static void nlm_xlpnae_update_pde(void *dummy __unused) { struct nlm_xlpnae_softc *sc; uint32_t dv[NUM_WORDS_PER_DV]; device_t dev; int vec; dev = devclass_get_device(devclass_find("xlpnae"), 0); sc = device_get_softc(dev); nlm_write_poe_reg(sc->poe_base, POE_DISTR_EN, 0); for (vec = 0; vec < NUM_DIST_VEC; vec++) { if (nlm_get_poe_distvec(vec, dv) != 0) continue; nlm_write_poe_distvec(sc->poedv_base, vec, dv); } nlm_write_poe_reg(sc->poe_base, POE_DISTR_EN, 1); } SYSINIT(nlm_xlpnae_update_pde, SI_SUB_SMP, SI_ORDER_ANY, nlm_xlpnae_update_pde, NULL); /* configuration common for sgmii, xaui, ilaken goes here */ static void nlm_setup_portcfg(struct nlm_xlpnae_softc *sc, struct xlp_nae_ivars *naep, int block, int port) { int i; uint32_t ucore_mask = 0; struct xlp_block_ivars *bp; struct xlp_port_ivars *p; bp = &(naep->block_ivars[block]); p = &(bp->port_ivars[port & 0x3]); sc->portcfg[port].node = p->node; sc->portcfg[port].block = p->block; sc->portcfg[port].port = p->port; sc->portcfg[port].type = p->type; sc->portcfg[port].mdio_bus = p->mdio_bus; sc->portcfg[port].phy_addr = p->phy_addr; sc->portcfg[port].loopback_mode = p->loopback_mode; sc->portcfg[port].num_channels = p->num_channels; if (p->free_desc_sizes != MCLBYTES) { printf("[%d, %d] Error: free_desc_sizes %d != %d\n", block, port, p->free_desc_sizes, MCLBYTES); return; } sc->portcfg[port].free_desc_sizes = p->free_desc_sizes; for (i = 0; i < sc->nucores; i++) /* XXXJC: configure this */ ucore_mask |= (0x1 << i); sc->portcfg[port].ucore_mask = ucore_mask; sc->portcfg[port].vlan_pri_en = p->vlan_pri_en; sc->portcfg[port].num_free_descs = p->num_free_descs; sc->portcfg[port].iface_fifo_size = p->iface_fifo_size; sc->portcfg[port].rxbuf_size = p->rxbuf_size; sc->portcfg[port].rx_slots_reqd = p->rx_slots_reqd; sc->portcfg[port].tx_slots_reqd = p->tx_slots_reqd; sc->portcfg[port].pseq_fifo_size = 
p->pseq_fifo_size; sc->portcfg[port].stg2_fifo_size = p->stg2_fifo_size; sc->portcfg[port].eh_fifo_size = p->eh_fifo_size; sc->portcfg[port].frout_fifo_size = p->frout_fifo_size; sc->portcfg[port].ms_fifo_size = p->ms_fifo_size; sc->portcfg[port].pkt_fifo_size = p->pkt_fifo_size; sc->portcfg[port].pktlen_fifo_size = p->pktlen_fifo_size; sc->portcfg[port].max_stg2_offset = p->max_stg2_offset; sc->portcfg[port].max_eh_offset = p->max_eh_offset; sc->portcfg[port].max_frout_offset = p->max_frout_offset; sc->portcfg[port].max_ms_offset = p->max_ms_offset; sc->portcfg[port].max_pmem_offset = p->max_pmem_offset; sc->portcfg[port].stg1_2_credit = p->stg1_2_credit; sc->portcfg[port].stg2_eh_credit = p->stg2_eh_credit; sc->portcfg[port].stg2_frout_credit = p->stg2_frout_credit; sc->portcfg[port].stg2_ms_credit = p->stg2_ms_credit; sc->portcfg[port].ieee1588_inc_intg = p->ieee1588_inc_intg; sc->portcfg[port].ieee1588_inc_den = p->ieee1588_inc_den; sc->portcfg[port].ieee1588_inc_num = p->ieee1588_inc_num; sc->portcfg[port].ieee1588_userval = p->ieee1588_userval; sc->portcfg[port].ieee1588_ptpoff = p->ieee1588_ptpoff; sc->portcfg[port].ieee1588_tmr1 = p->ieee1588_tmr1; sc->portcfg[port].ieee1588_tmr2 = p->ieee1588_tmr2; sc->portcfg[port].ieee1588_tmr3 = p->ieee1588_tmr3; sc->total_free_desc += sc->portcfg[port].free_desc_sizes; sc->total_num_ports++; } static int nlm_xlpnae_attach(device_t dev) { struct xlp_nae_ivars *nae_ivars; struct nlm_xlpnae_softc *sc; device_t tmpd; uint32_t dv[NUM_WORDS_PER_DV]; int port, i, j, nchan, nblock, node, qstart, qnum; int offset, context, txq_base, rxvcbase; uint64_t poe_pcibase, nae_pcibase; node = pci_get_slot(dev) / 8; nae_ivars = &xlp_board_info.nodes[node].nae_ivars; sc = device_get_softc(dev); sc->xlpnae_dev = dev; sc->node = nae_ivars->node; sc->base = nlm_get_nae_regbase(sc->node); sc->poe_base = nlm_get_poe_regbase(sc->node); sc->poedv_base = nlm_get_poedv_regbase(sc->node); sc->portcfg = nae_port_config; sc->blockmask = nae_ivars->blockmask; sc->ilmask = nae_ivars->ilmask; sc->xauimask = nae_ivars->xauimask; sc->sgmiimask = nae_ivars->sgmiimask; sc->nblocks = nae_ivars->nblocks; sc->freq = nae_ivars->freq; /* flow table generation is done by CRC16 polynomial */ sc->flow_crc_poly = nae_ivars->flow_crc_poly; sc->hw_parser_en = nae_ivars->hw_parser_en; sc->prepad_en = nae_ivars->prepad_en; sc->prepad_size = nae_ivars->prepad_size; sc->ieee_1588_en = nae_ivars->ieee_1588_en; nae_pcibase = nlm_get_nae_pcibase(sc->node); sc->ncontexts = nlm_read_reg(nae_pcibase, XLP_PCI_DEVINFO_REG5); sc->nucores = nlm_num_uengines(nae_pcibase); for (nblock = 0; nblock < sc->nblocks; nblock++) { sc->cmplx_type[nblock] = nae_ivars->block_ivars[nblock].type; sc->portmask[nblock] = nae_ivars->block_ivars[nblock].portmask; } for (i = 0; i < sc->ncontexts; i++) cntx2port[i] = 18; /* 18 is an invalid port */ if (sc->nblocks == 5) sc->max_ports = 18; /* 8xx has a block 4 with 2 ports */ else sc->max_ports = sc->nblocks * PORTS_PER_CMPLX; for (i = 0; i < sc->max_ports; i++) sc->portcfg[i].type = UNKNOWN; /* Port Not Present */ /* * Now setup all internal fifo carvings based on * total number of ports in the system */ sc->total_free_desc = 0; sc->total_num_ports = 0; port = 0; context = 0; txq_base = nlm_qidstart(nae_pcibase); rxvcbase = txq_base + sc->ncontexts; for (i = 0; i < sc->nblocks; i++) { uint32_t portmask; if ((nae_ivars->blockmask & (1 << i)) == 0) { port += 4; continue; } portmask = nae_ivars->block_ivars[i].portmask; for (j = 0; j < PORTS_PER_CMPLX; j++, port++) { if 
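/*
 * Per-port queue wiring done in this loop: each present port gets
 * num_channels consecutive FMN contexts starting at 'context', its
 * transmit queue is txq_base + context and its receive
 * free-descriptor queue is rxvcbase + port.  cntx2port[] records the
 * reverse mapping so later message handling can translate a source
 * context back to the owning port.
 */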
((portmask & (1 << j)) == 0) continue; nlm_setup_portcfg(sc, nae_ivars, i, port); nchan = sc->portcfg[port].num_channels; for (offset = 0; offset < nchan; offset++) cntx2port[context + offset] = port; sc->portcfg[port].txq = txq_base + context; sc->portcfg[port].rxfreeq = rxvcbase + port; context += nchan; } } poe_pcibase = nlm_get_poe_pcibase(sc->node); sc->per_port_num_flows = nlm_poe_max_flows(poe_pcibase) / sc->total_num_ports; /* zone for P2P descriptors */ nl_tx_desc_zone = uma_zcreate("NL Tx Desc", sizeof(struct xlpge_tx_desc), NULL, NULL, NULL, NULL, NAE_CACHELINE_SIZE, 0); /* NAE FMN messages have CMS src station id's in the * range of qstart to qnum. */ qstart = nlm_qidstart(nae_pcibase); qnum = nlm_qnum(nae_pcibase); if (register_msgring_handler(qstart, qstart + qnum - 1, nlm_xlpge_msgring_handler, sc)) { panic("Couldn't register NAE msgring handler\n"); } /* POE FMN messages have CMS src station id's in the * range of qstart to qnum. */ qstart = nlm_qidstart(poe_pcibase); qnum = nlm_qnum(poe_pcibase); if (register_msgring_handler(qstart, qstart + qnum - 1, nlm_xlpge_msgring_handler, sc)) { panic("Couldn't register POE msgring handler\n"); } nlm_xlpnae_init(node, sc); for (i = 0; i < sc->max_ports; i++) { char desc[32]; int block, port; if (sc->portcfg[i].type == UNKNOWN) continue; block = sc->portcfg[i].block; port = sc->portcfg[i].port; tmpd = device_add_child(dev, "xlpge", i); device_set_ivars(tmpd, &(nae_ivars->block_ivars[block].port_ivars[port])); sprintf(desc, "XLP NAE Port %d,%d", block, port); device_set_desc_copy(tmpd, desc); } nlm_setup_iface_fifo_cfg(sc->base, sc->max_ports, sc->portcfg); nlm_setup_rx_base_config(sc->base, sc->max_ports, sc->portcfg); nlm_setup_rx_buf_config(sc->base, sc->max_ports, sc->portcfg); nlm_setup_freein_fifo_cfg(sc->base, sc->portcfg); nlm_program_nae_parser_seq_fifo(sc->base, sc->max_ports, sc->portcfg); nlm_xlpnae_print_frin_desc_carving(sc); bus_generic_probe(dev); bus_generic_attach(dev); /* * Enable only boot cpu at this point, full distribution comes * only after SMP is started */ nlm_write_poe_reg(sc->poe_base, POE_DISTR_EN, 0); nlm_calc_poe_distvec(0x1, 0, 0, 0, 0x1 << XLPGE_RX_VC, dv); nlm_write_poe_distvec(sc->poedv_base, 0, dv); nlm_write_poe_reg(sc->poe_base, POE_DISTR_EN, 1); return (0); } static int nlm_xlpnae_detach(device_t dev) { /* TODO - free zone here */ return (0); } static int nlm_xlpnae_suspend(device_t dev) { return (0); } static int nlm_xlpnae_resume(device_t dev) { return (0); } static int nlm_xlpnae_shutdown(device_t dev) { return (0); } /* * xlpge driver implementation */ static void nlm_xlpge_mac_set_rx_mode(struct nlm_xlpge_softc *sc) { if (sc->if_flags & IFF_PROMISC) { if (sc->type == SGMIIC) nlm_nae_setup_rx_mode_sgmii(sc->base_addr, sc->block, sc->port, sc->type, 1 /* broadcast */, 1/* multicast */, 0 /* pause */, 1 /* promisc */); else nlm_nae_setup_rx_mode_xaui(sc->base_addr, sc->block, sc->port, sc->type, 1 /* broadcast */, 1/* multicast */, 0 /* pause */, 1 /* promisc */); } else { if (sc->type == SGMIIC) nlm_nae_setup_rx_mode_sgmii(sc->base_addr, sc->block, sc->port, sc->type, 1 /* broadcast */, 1/* multicast */, 0 /* pause */, 0 /* promisc */); else nlm_nae_setup_rx_mode_xaui(sc->base_addr, sc->block, sc->port, sc->type, 1 /* broadcast */, 1/* multicast */, 0 /* pause */, 0 /* promisc */); } } static int nlm_xlpge_ioctl(struct ifnet *ifp, u_long command, caddr_t data) { struct mii_data *mii; struct nlm_xlpge_softc *sc; struct ifreq *ifr; int error; sc = ifp->if_softc; error = 0; ifr = (struct ifreq 
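/*
 * Ioctl handling sketch: SIOCSIFFLAGS brings the port up through
 * nlm_xlpge_init() or nlm_xlpge_port_enable() and refreshes the RX
 * mode, or takes it down through nlm_xlpge_port_disable();
 * SIOCSIFMEDIA and SIOCGIFMEDIA are delegated to the attached mii
 * bus when one exists, and everything else falls through to
 * ether_ioctl().
 */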
*)data; switch (command) { case SIOCSIFFLAGS: XLPGE_LOCK(sc); sc->if_flags = ifp->if_flags; if (ifp->if_flags & IFF_UP) { if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) nlm_xlpge_init(sc); else nlm_xlpge_port_enable(sc); nlm_xlpge_mac_set_rx_mode(sc); sc->link = NLM_LINK_UP; } else { if (ifp->if_drv_flags & IFF_DRV_RUNNING) nlm_xlpge_port_disable(sc); sc->link = NLM_LINK_DOWN; } XLPGE_UNLOCK(sc); error = 0; break; case SIOCGIFMEDIA: case SIOCSIFMEDIA: if (sc->mii_bus != NULL) { mii = device_get_softc(sc->mii_bus); error = ifmedia_ioctl(ifp, ifr, &mii->mii_media, command); } break; default: error = ether_ioctl(ifp, command, data); break; } return (error); } static int xlpge_tx(struct ifnet *ifp, struct mbuf *mbuf_chain) { struct nlm_fmn_msg msg; struct xlpge_tx_desc *p2p; struct nlm_xlpge_softc *sc; struct mbuf *m; vm_paddr_t paddr; int fbid, dst, pos, err; int ret = 0, tx_msgstatus, retries; err = 0; if (mbuf_chain == NULL) return (0); sc = ifp->if_softc; p2p = NULL; if (!(ifp->if_drv_flags & IFF_DRV_RUNNING) || ifp->if_drv_flags & IFF_DRV_OACTIVE) { err = ENXIO; goto fail; } /* free a few in coming messages on the fb vc */ xlp_handle_msg_vc(1 << XLPGE_FB_VC, 2); /* vfb id table is setup to map cpu to vc 3 of the cpu */ fbid = nlm_cpuid(); dst = sc->txq; pos = 0; p2p = uma_zalloc(nl_tx_desc_zone, M_NOWAIT); if (p2p == NULL) { printf("alloc fail\n"); err = ENOBUFS; goto fail; } for (m = mbuf_chain; m != NULL; m = m->m_next) { vm_offset_t buf = (vm_offset_t) m->m_data; int len = m->m_len; int frag_sz; uint64_t desc; /*printf("m_data = %p len %d\n", m->m_data, len); */ while (len) { if (pos == XLP_NTXFRAGS - 3) { device_printf(sc->xlpge_dev, "packet defrag %d\n", m_length(mbuf_chain, NULL)); err = ENOBUFS; /* TODO fix error */ goto fail; } paddr = vtophys(buf); frag_sz = PAGE_SIZE - (buf & PAGE_MASK); if (len < frag_sz) frag_sz = len; desc = nae_tx_desc(P2D_NEOP, 0, 127, frag_sz, paddr); p2p->frag[pos] = htobe64(desc); pos++; len -= frag_sz; buf += frag_sz; } } KASSERT(pos != 0, ("Zero-length mbuf chain?\n")); /* Make the last one P2D EOP */ p2p->frag[pos-1] |= htobe64((uint64_t)P2D_EOP << 62); /* stash useful pointers in the desc */ p2p->frag[XLP_NTXFRAGS-3] = 0xf00bad; p2p->frag[XLP_NTXFRAGS-2] = (uintptr_t)p2p; p2p->frag[XLP_NTXFRAGS-1] = (uintptr_t)mbuf_chain; paddr = vtophys(p2p); msg.msg[0] = nae_tx_desc(P2P, 0, fbid, pos, paddr); for (retries = 16; retries > 0; retries--) { ret = nlm_fmn_msgsend(dst, 1, FMN_SWCODE_NAE, &msg); if (ret == 0) return (0); } fail: if (ret != 0) { tx_msgstatus = nlm_read_c2_txmsgstatus(); if ((tx_msgstatus >> 24) & 0x1) device_printf(sc->xlpge_dev, "Transmit queue full - "); if ((tx_msgstatus >> 3) & 0x1) device_printf(sc->xlpge_dev, "ECC error - "); if ((tx_msgstatus >> 2) & 0x1) device_printf(sc->xlpge_dev, "Pending Sync - "); if ((tx_msgstatus >> 1) & 0x1) device_printf(sc->xlpge_dev, "Insufficient input queue credits - "); if (tx_msgstatus & 0x1) device_printf(sc->xlpge_dev, "Insufficient output queue credits - "); } device_printf(sc->xlpge_dev, "Send failed! 
err = %d\n", err); if (p2p) uma_zfree(nl_tx_desc_zone, p2p); m_freem(mbuf_chain); if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1); return (err); } static int nlm_xlpge_gmac_config_speed(struct nlm_xlpge_softc *sc) { struct mii_data *mii; if (sc->type == XAUIC || sc->type == ILC) return (0); if (sc->mii_bus) { mii = device_get_softc(sc->mii_bus); mii_pollstat(mii); } return (0); } static void nlm_xlpge_port_disable(struct nlm_xlpge_softc *sc) { struct ifnet *ifp; ifp = sc->xlpge_if; ifp->if_drv_flags &= ~IFF_DRV_RUNNING; callout_stop(&sc->xlpge_callout); nlm_mac_disable(sc->base_addr, sc->block, sc->type, sc->port); } static void nlm_mii_pollstat(void *arg) { struct nlm_xlpge_softc *sc = (struct nlm_xlpge_softc *)arg; struct mii_data *mii = NULL; if (sc->mii_bus) { mii = device_get_softc(sc->mii_bus); KASSERT(mii != NULL, ("mii ptr is NULL")); mii_pollstat(mii); callout_reset(&sc->xlpge_callout, hz, nlm_mii_pollstat, sc); } } static void nlm_xlpge_port_enable(struct nlm_xlpge_softc *sc) { if ((sc->type != SGMIIC) && (sc->type != XAUIC)) return; nlm_mac_enable(sc->base_addr, sc->block, sc->type, sc->port); nlm_mii_pollstat((void *)sc); } static void nlm_xlpge_init(void *addr) { struct nlm_xlpge_softc *sc; struct ifnet *ifp; struct mii_data *mii = NULL; sc = (struct nlm_xlpge_softc *)addr; ifp = sc->xlpge_if; if (ifp->if_drv_flags & IFF_DRV_RUNNING) return; if (sc->mii_bus) { mii = device_get_softc(sc->mii_bus); mii_mediachg(mii); } nlm_xlpge_gmac_config_speed(sc); ifp->if_drv_flags |= IFF_DRV_RUNNING; ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; nlm_xlpge_port_enable(sc); /* start the callout */ callout_reset(&sc->xlpge_callout, hz, nlm_mii_pollstat, sc); } /* * Read the MAC address from FDT or board eeprom. */ static void xlpge_read_mac_addr(struct nlm_xlpge_softc *sc) { xlpge_get_macaddr(sc->dev_addr); /* last octet is port specific */ sc->dev_addr[5] += (sc->block * 4) + sc->port; if (sc->type == SGMIIC) nlm_nae_setup_mac_addr_sgmii(sc->base_addr, sc->block, sc->port, sc->type, sc->dev_addr); else if (sc->type == XAUIC) nlm_nae_setup_mac_addr_xaui(sc->base_addr, sc->block, sc->port, sc->type, sc->dev_addr); } static int xlpge_mediachange(struct ifnet *ifp) { return (0); } static void xlpge_mediastatus(struct ifnet *ifp, struct ifmediareq *ifmr) { struct nlm_xlpge_softc *sc; struct mii_data *md; md = NULL; sc = ifp->if_softc; if (sc->mii_bus) md = device_get_softc(sc->mii_bus); ifmr->ifm_status = IFM_AVALID; ifmr->ifm_active = IFM_ETHER; if (sc->link == NLM_LINK_DOWN) return; if (md != NULL) ifmr->ifm_active = md->mii_media.ifm_cur->ifm_media; ifmr->ifm_status |= IFM_ACTIVE; } static int nlm_xlpge_ifinit(struct nlm_xlpge_softc *sc) { struct ifnet *ifp; device_t dev; int port = sc->block * 4 + sc->port; dev = sc->xlpge_dev; ifp = sc->xlpge_if = if_alloc(IFT_ETHER); /*(sc->network_sc)->ifp_ports[port].xlpge_if = ifp;*/ ifp_ports[port].xlpge_if = ifp; if (ifp == NULL) { device_printf(dev, "cannot if_alloc()\n"); return (ENOSPC); } ifp->if_softc = sc; if_initname(ifp, device_get_name(dev), device_get_unit(dev)); - ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST | - IFF_NEEDSEPOCH; + ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; sc->if_flags = ifp->if_flags; /*ifp->if_capabilities = IFCAP_TXCSUM | IFCAP_VLAN_HWTAGGING;*/ ifp->if_capabilities = 0; ifp->if_capenable = ifp->if_capabilities; ifp->if_ioctl = nlm_xlpge_ioctl; ifp->if_init = nlm_xlpge_init ; ifp->if_hwassist = 0; ifp->if_snd.ifq_drv_maxlen = NLM_XLPGE_TXQ_SIZE; /* TODO: make this a sysint */ IFQ_SET_MAXLEN(&ifp->if_snd, 
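/*
 * Transmit path recap for the ifnet being set up here: if_transmit
 * is overridden to xlpge_tx() after ether_ifattach(), so the if_snd
 * length configured here matters only for code paths that still use
 * the legacy queue.  xlpge_tx() builds a P2P descriptor with up to
 * XLP_NTXFRAGS - 3 P2D fragments (the last one tagged P2D_EOP) and
 * stashes a 0xf00bad magic, the descriptor pointer and the mbuf
 * chain pointer in the final three slots so that
 * nlm_xlpge_release_mbuf() can reclaim both on completion.
 */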
ifp->if_snd.ifq_drv_maxlen); IFQ_SET_READY(&ifp->if_snd); ifmedia_init(&sc->xlpge_mii.mii_media, 0, xlpge_mediachange, xlpge_mediastatus); ifmedia_add(&sc->xlpge_mii.mii_media, IFM_ETHER | IFM_AUTO, 0, NULL); ifmedia_set(&sc->xlpge_mii.mii_media, IFM_ETHER | IFM_AUTO); sc->xlpge_mii.mii_media.ifm_media = sc->xlpge_mii.mii_media.ifm_cur->ifm_media; xlpge_read_mac_addr(sc); ether_ifattach(ifp, sc->dev_addr); /* override if_transmit : per ifnet(9), do it after if_attach */ ifp->if_transmit = xlpge_tx; return (0); } static int nlm_xlpge_probe(device_t dev) { return (BUS_PROBE_DEFAULT); } static void * get_buf(void) { struct mbuf *m_new; uint64_t *md; #ifdef INVARIANTS vm_paddr_t temp1, temp2; #endif if ((m_new = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR)) == NULL) return (NULL); m_new->m_len = m_new->m_pkthdr.len = MCLBYTES; KASSERT(((uintptr_t)m_new->m_data & (NAE_CACHELINE_SIZE - 1)) == 0, ("m_new->m_data is not cacheline aligned")); md = (uint64_t *)m_new->m_data; md[0] = (intptr_t)m_new; /* Back Ptr */ md[1] = 0xf00bad; m_adj(m_new, NAE_CACHELINE_SIZE); #ifdef INVARIANTS temp1 = vtophys((vm_offset_t) m_new->m_data); temp2 = vtophys((vm_offset_t) m_new->m_data + 1536); KASSERT((temp1 + 1536) == temp2, ("Alloced buffer is not contiguous")); #endif return ((void *)m_new->m_data); } static void nlm_xlpge_mii_init(device_t dev, struct nlm_xlpge_softc *sc) { int error; error = mii_attach(dev, &sc->mii_bus, sc->xlpge_if, xlpge_mediachange, xlpge_mediastatus, BMSR_DEFCAPMASK, sc->phy_addr, MII_OFFSET_ANY, 0); if (error) { device_printf(dev, "attaching PHYs failed\n"); sc->mii_bus = NULL; } if (sc->mii_bus != NULL) { /* enable MDIO interrupts in the PHY */ /* XXXJC: TODO */ } } static int xlpge_stats_sysctl(SYSCTL_HANDLER_ARGS) { struct nlm_xlpge_softc *sc; uint32_t val; int reg, field; sc = arg1; field = arg2; reg = SGMII_STATS_MLR(sc->block, sc->port) + field; val = nlm_read_nae_reg(sc->base_addr, reg); return (sysctl_handle_int(oidp, &val, 0, req)); } static void nlm_xlpge_setup_stats_sysctl(device_t dev, struct nlm_xlpge_softc *sc) { struct sysctl_ctx_list *ctx; struct sysctl_oid_list *child; struct sysctl_oid *tree; ctx = device_get_sysctl_ctx(dev); tree = device_get_sysctl_tree(dev); child = SYSCTL_CHILDREN(tree); #define XLPGE_STAT(name, offset, desc) \ SYSCTL_ADD_PROC(ctx, child, OID_AUTO, name, \ CTLTYPE_UINT | CTLFLAG_RD, sc, offset, \ xlpge_stats_sysctl, "IU", desc) XLPGE_STAT("tr127", nlm_sgmii_stats_tr127, "TxRx 64 - 127 Bytes"); XLPGE_STAT("tr255", nlm_sgmii_stats_tr255, "TxRx 128 - 255 Bytes"); XLPGE_STAT("tr511", nlm_sgmii_stats_tr511, "TxRx 256 - 511 Bytes"); XLPGE_STAT("tr1k", nlm_sgmii_stats_tr1k, "TxRx 512 - 1023 Bytes"); XLPGE_STAT("trmax", nlm_sgmii_stats_trmax, "TxRx 1024 - 1518 Bytes"); XLPGE_STAT("trmgv", nlm_sgmii_stats_trmgv, "TxRx 1519 - 1522 Bytes"); XLPGE_STAT("rbyt", nlm_sgmii_stats_rbyt, "Rx Bytes"); XLPGE_STAT("rpkt", nlm_sgmii_stats_rpkt, "Rx Packets"); XLPGE_STAT("rfcs", nlm_sgmii_stats_rfcs, "Rx FCS Error"); XLPGE_STAT("rmca", nlm_sgmii_stats_rmca, "Rx Multicast Packets"); XLPGE_STAT("rbca", nlm_sgmii_stats_rbca, "Rx Broadcast Packets"); XLPGE_STAT("rxcf", nlm_sgmii_stats_rxcf, "Rx Control Frames"); XLPGE_STAT("rxpf", nlm_sgmii_stats_rxpf, "Rx Pause Frames"); XLPGE_STAT("rxuo", nlm_sgmii_stats_rxuo, "Rx Unknown Opcode"); XLPGE_STAT("raln", nlm_sgmii_stats_raln, "Rx Alignment Errors"); XLPGE_STAT("rflr", nlm_sgmii_stats_rflr, "Rx Framelength Errors"); XLPGE_STAT("rcde", nlm_sgmii_stats_rcde, "Rx Code Errors"); XLPGE_STAT("rcse", nlm_sgmii_stats_rcse, "Rx Carrier Sense 
Errors"); XLPGE_STAT("rund", nlm_sgmii_stats_rund, "Rx Undersize Packet Errors"); XLPGE_STAT("rovr", nlm_sgmii_stats_rovr, "Rx Oversize Packet Errors"); XLPGE_STAT("rfrg", nlm_sgmii_stats_rfrg, "Rx Fragments"); XLPGE_STAT("rjbr", nlm_sgmii_stats_rjbr, "Rx Jabber"); XLPGE_STAT("tbyt", nlm_sgmii_stats_tbyt, "Tx Bytes"); XLPGE_STAT("tpkt", nlm_sgmii_stats_tpkt, "Tx Packets"); XLPGE_STAT("tmca", nlm_sgmii_stats_tmca, "Tx Multicast Packets"); XLPGE_STAT("tbca", nlm_sgmii_stats_tbca, "Tx Broadcast Packets"); XLPGE_STAT("txpf", nlm_sgmii_stats_txpf, "Tx Pause Frame"); XLPGE_STAT("tdfr", nlm_sgmii_stats_tdfr, "Tx Deferral Packets"); XLPGE_STAT("tedf", nlm_sgmii_stats_tedf, "Tx Excessive Deferral Pkts"); XLPGE_STAT("tscl", nlm_sgmii_stats_tscl, "Tx Single Collisions"); XLPGE_STAT("tmcl", nlm_sgmii_stats_tmcl, "Tx Multiple Collisions"); XLPGE_STAT("tlcl", nlm_sgmii_stats_tlcl, "Tx Late Collision Pkts"); XLPGE_STAT("txcl", nlm_sgmii_stats_txcl, "Tx Excessive Collisions"); XLPGE_STAT("tncl", nlm_sgmii_stats_tncl, "Tx Total Collisions"); XLPGE_STAT("tjbr", nlm_sgmii_stats_tjbr, "Tx Jabber Frames"); XLPGE_STAT("tfcs", nlm_sgmii_stats_tfcs, "Tx FCS Errors"); XLPGE_STAT("txcf", nlm_sgmii_stats_txcf, "Tx Control Frames"); XLPGE_STAT("tovr", nlm_sgmii_stats_tovr, "Tx Oversize Frames"); XLPGE_STAT("tund", nlm_sgmii_stats_tund, "Tx Undersize Frames"); XLPGE_STAT("tfrg", nlm_sgmii_stats_tfrg, "Tx Fragments"); #undef XLPGE_STAT } static int nlm_xlpge_attach(device_t dev) { struct xlp_port_ivars *pv; struct nlm_xlpge_softc *sc; int port; pv = device_get_ivars(dev); sc = device_get_softc(dev); sc->xlpge_dev = dev; sc->mii_bus = NULL; sc->block = pv->block; sc->node = pv->node; sc->port = pv->port; sc->type = pv->type; sc->xlpge_if = NULL; sc->phy_addr = pv->phy_addr; sc->mdio_bus = pv->mdio_bus; sc->portcfg = nae_port_config; sc->hw_parser_en = pv->hw_parser_en; /* default settings */ sc->speed = NLM_SGMII_SPEED_10; sc->duplexity = NLM_SGMII_DUPLEX_FULL; sc->link = NLM_LINK_DOWN; sc->flowctrl = NLM_FLOWCTRL_DISABLED; sc->network_sc = device_get_softc(device_get_parent(dev)); sc->base_addr = sc->network_sc->base; sc->prepad_en = sc->network_sc->prepad_en; sc->prepad_size = sc->network_sc->prepad_size; callout_init(&sc->xlpge_callout, 1); XLPGE_LOCK_INIT(sc, device_get_nameunit(dev)); port = (sc->block*4)+sc->port; sc->nfree_desc = nae_port_config[port].num_free_descs; sc->txq = nae_port_config[port].txq; sc->rxfreeq = nae_port_config[port].rxfreeq; nlm_xlpge_submit_rx_free_desc(sc, sc->nfree_desc); if (sc->hw_parser_en) nlm_enable_hardware_parser_per_port(sc->base_addr, sc->block, sc->port); nlm_xlpge_ifinit(sc); ifp_ports[port].xlpge_sc = sc; nlm_xlpge_mii_init(dev, sc); nlm_xlpge_setup_stats_sysctl(dev, sc); return (0); } static int nlm_xlpge_detach(device_t dev) { return (0); } static int nlm_xlpge_suspend(device_t dev) { return (0); } static int nlm_xlpge_resume(device_t dev) { return (0); } static int nlm_xlpge_shutdown(device_t dev) { return (0); } /* * miibus function with custom implementation */ static int nlm_xlpge_mii_read(device_t dev, int phyaddr, int regidx) { struct nlm_xlpge_softc *sc; int val; sc = device_get_softc(dev); if (sc->type == SGMIIC) val = nlm_gmac_mdio_read(sc->base_addr, sc->mdio_bus, BLOCK_7, LANE_CFG, phyaddr, regidx); else val = 0xffff; return (val); } static int nlm_xlpge_mii_write(device_t dev, int phyaddr, int regidx, int val) { struct nlm_xlpge_softc *sc; sc = device_get_softc(dev); if (sc->type == SGMIIC) nlm_gmac_mdio_write(sc->base_addr, sc->mdio_bus, BLOCK_7, LANE_CFG, 
phyaddr, regidx, val); return (0); } static void nlm_xlpge_mii_statchg(device_t dev) { struct nlm_xlpge_softc *sc; struct mii_data *mii; char *speed, *duplexity; sc = device_get_softc(dev); if (sc->mii_bus == NULL) return; mii = device_get_softc(sc->mii_bus); if (mii->mii_media_status & IFM_ACTIVE) { if (IFM_SUBTYPE(mii->mii_media_active) == IFM_10_T) { sc->speed = NLM_SGMII_SPEED_10; speed = "10Mbps"; } else if (IFM_SUBTYPE(mii->mii_media_active) == IFM_100_TX) { sc->speed = NLM_SGMII_SPEED_100; speed = "100Mbps"; } else { /* default to 1G */ sc->speed = NLM_SGMII_SPEED_1000; speed = "1Gbps"; } if ((mii->mii_media_active & IFM_GMASK) == IFM_FDX) { sc->duplexity = NLM_SGMII_DUPLEX_FULL; duplexity = "full"; } else { sc->duplexity = NLM_SGMII_DUPLEX_HALF; duplexity = "half"; } printf("Port [%d, %d] setup with speed=%s duplex=%s\n", sc->block, sc->port, speed, duplexity); nlm_nae_setup_mac(sc->base_addr, sc->block, sc->port, 0, 1, 1, sc->speed, sc->duplexity); } } /* * xlpge support function implementations */ static void nlm_xlpge_release_mbuf(uint64_t paddr) { uint64_t mag, desc, mbuf; paddr += (XLP_NTXFRAGS - 3) * sizeof(uint64_t); mag = nlm_paddr_ld(paddr); desc = nlm_paddr_ld(paddr + sizeof(uint64_t)); mbuf = nlm_paddr_ld(paddr + 2 * sizeof(uint64_t)); if (mag != 0xf00bad) { /* somebody else packet Error - FIXME in intialization */ printf("cpu %d: ERR Tx packet paddr %jx, mag %jx, desc %jx mbuf %jx\n", nlm_cpuid(), (uintmax_t)paddr, (uintmax_t)mag, (intmax_t)desc, (uintmax_t)mbuf); return; } m_freem((struct mbuf *)(uintptr_t)mbuf); uma_zfree(nl_tx_desc_zone, (void *)(uintptr_t)desc); } static void nlm_xlpge_rx(struct nlm_xlpge_softc *sc, int port, vm_paddr_t paddr, int len) { struct ifnet *ifp; struct mbuf *m; vm_offset_t temp; unsigned long mag; int prepad_size; ifp = sc->xlpge_if; temp = nlm_paddr_ld(paddr - NAE_CACHELINE_SIZE); mag = nlm_paddr_ld(paddr - NAE_CACHELINE_SIZE + sizeof(uint64_t)); m = (struct mbuf *)(intptr_t)temp; if (mag != 0xf00bad) { /* somebody else packet Error - FIXME in intialization */ printf("cpu %d: ERR Rx packet paddr %jx, temp %p, mag %lx\n", nlm_cpuid(), (uintmax_t)paddr, (void *)temp, mag); return; } m->m_pkthdr.rcvif = ifp; #ifdef DUMP_PACKET { int i = 0, j = 64; unsigned char *buf = (char *)m->m_data; printf("(cpu_%d: nlge_rx, !RX_COPY) Rx Packet: length=%d\n", nlm_cpuid(), len); if (len < j) j = len; if (sc->prepad_en) j += ((sc->prepad_size + 1) * 16); for (i = 0; i < j; i++) { if (i && (i % 16) == 0) printf("\n"); printf("%02x ", buf[i]); } printf("\n"); } #endif if (sc->prepad_en) { prepad_size = ((sc->prepad_size + 1) * 16); m->m_data += prepad_size; m->m_pkthdr.len = m->m_len = (len - prepad_size); } else m->m_pkthdr.len = m->m_len = len; if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); #ifdef XLP_DRIVER_LOOPBACK if (port == 16 || port == 17) (*ifp->if_input)(ifp, m); else xlpge_tx(ifp, m); #else (*ifp->if_input)(ifp, m); #endif } void nlm_xlpge_submit_rx_free_desc(struct nlm_xlpge_softc *sc, int num) { int i, size, ret, n; struct nlm_fmn_msg msg; void *ptr; for(i = 0; i < num; i++) { memset(&msg, 0, sizeof(msg)); ptr = get_buf(); if (!ptr) { device_printf(sc->xlpge_dev, "Cannot allocate mbuf\n"); break; } msg.msg[0] = vtophys(ptr); if (msg.msg[0] == 0) { printf("Bad ptr for %p\n", ptr); break; } size = 1; n = 0; while (1) { /* on success returns 1, else 0 */ ret = nlm_fmn_msgsend(sc->rxfreeq, size, 0, &msg); if (ret == 0) break; if (n++ > 10000) { printf("Too many credit fails for send free desc\n"); break; } } } } void nlm_xlpge_msgring_handler(int vc, 
int size, int code, int src_id, struct nlm_fmn_msg *msg, void *data) { uint64_t phys_addr; struct nlm_xlpnae_softc *sc; struct nlm_xlpge_softc *xlpge_sc; struct ifnet *ifp; uint32_t context; uint32_t port = 0; uint32_t length; sc = (struct nlm_xlpnae_softc *)data; KASSERT(sc != NULL, ("Null sc in msgring handler")); if (size == 1) { /* process transmit complete */ phys_addr = msg->msg[0] & 0xffffffffffULL; /* context is SGMII_RCV_CONTEXT_NUM + three bit vlan type * or vlan priority */ context = (msg->msg[0] >> 40) & 0x3fff; port = cntx2port[context]; if (port >= XLP_MAX_PORTS) { printf("%s:%d Bad port %d (context=%d)\n", __func__, __LINE__, port, context); return; } ifp = ifp_ports[port].xlpge_if; xlpge_sc = ifp_ports[port].xlpge_sc; nlm_xlpge_release_mbuf(phys_addr); if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); } else if (size > 1) { /* Recieve packet */ phys_addr = msg->msg[1] & 0xffffffffc0ULL; length = (msg->msg[1] >> 40) & 0x3fff; length -= MAC_CRC_LEN; /* context is SGMII_RCV_CONTEXT_NUM + three bit vlan type * or vlan priority */ context = (msg->msg[1] >> 54) & 0x3ff; port = cntx2port[context]; if (port >= XLP_MAX_PORTS) { printf("%s:%d Bad port %d (context=%d)\n", __func__, __LINE__, port, context); return; } ifp = ifp_ports[port].xlpge_if; xlpge_sc = ifp_ports[port].xlpge_sc; nlm_xlpge_rx(xlpge_sc, port, phys_addr, length); /* return back a free descriptor to NA */ nlm_xlpge_submit_rx_free_desc(xlpge_sc, 1); } } diff --git a/sys/net/if.c b/sys/net/if.c index 42d34bb21ddb..d1c3cfba811c 100644 --- a/sys/net/if.c +++ b/sys/net/if.c @@ -1,4575 +1,4573 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1980, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)if.c 8.5 (Berkeley) 1/9/95 * $FreeBSD$ */ #include "opt_bpf.h" #include "opt_inet6.h" #include "opt_inet.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(INET) || defined(INET6) #include #include #include #include #include #ifdef INET #include #include #endif /* INET */ #ifdef INET6 #include #include #endif /* INET6 */ #endif /* INET || INET6 */ #include /* * Consumers of struct ifreq such as tcpdump assume no pad between ifr_name * and ifr_ifru when it is used in SIOCGIFCONF. */ _Static_assert(sizeof(((struct ifreq *)0)->ifr_name) == offsetof(struct ifreq, ifr_ifru), "gap between ifr_name and ifr_ifru"); __read_mostly epoch_t net_epoch_preempt; #ifdef COMPAT_FREEBSD32 #include #include struct ifreq_buffer32 { uint32_t length; /* (size_t) */ uint32_t buffer; /* (void *) */ }; /* * Interface request structure used for socket * ioctl's. All interface ioctl's must have parameter * definitions which begin with ifr_name. The * remainder may be interface specific. */ struct ifreq32 { char ifr_name[IFNAMSIZ]; /* if name, e.g. "en0" */ union { struct sockaddr ifru_addr; struct sockaddr ifru_dstaddr; struct sockaddr ifru_broadaddr; struct ifreq_buffer32 ifru_buffer; short ifru_flags[2]; short ifru_index; int ifru_jid; int ifru_metric; int ifru_mtu; int ifru_phys; int ifru_media; uint32_t ifru_data; int ifru_cap[2]; u_int ifru_fib; u_char ifru_vlan_pcp; } ifr_ifru; }; CTASSERT(sizeof(struct ifreq) == sizeof(struct ifreq32)); CTASSERT(__offsetof(struct ifreq, ifr_ifru) == __offsetof(struct ifreq32, ifr_ifru)); struct ifgroupreq32 { char ifgr_name[IFNAMSIZ]; u_int ifgr_len; union { char ifgru_group[IFNAMSIZ]; uint32_t ifgru_groups; } ifgr_ifgru; }; struct ifmediareq32 { char ifm_name[IFNAMSIZ]; int ifm_current; int ifm_mask; int ifm_status; int ifm_active; int ifm_count; uint32_t ifm_ulist; /* (int *) */ }; #define SIOCGIFMEDIA32 _IOC_NEWTYPE(SIOCGIFMEDIA, struct ifmediareq32) #define SIOCGIFXMEDIA32 _IOC_NEWTYPE(SIOCGIFXMEDIA, struct ifmediareq32) #define _CASE_IOC_IFGROUPREQ_32(cmd) \ _IOC_NEWTYPE((cmd), struct ifgroupreq32): case #else /* !COMPAT_FREEBSD32 */ #define _CASE_IOC_IFGROUPREQ_32(cmd) #endif /* !COMPAT_FREEBSD32 */ #define CASE_IOC_IFGROUPREQ(cmd) \ _CASE_IOC_IFGROUPREQ_32(cmd) \ (cmd) union ifreq_union { struct ifreq ifr; #ifdef COMPAT_FREEBSD32 struct ifreq32 ifr32; #endif }; union ifgroupreq_union { struct ifgroupreq ifgr; #ifdef COMPAT_FREEBSD32 struct ifgroupreq32 ifgr32; #endif }; SYSCTL_NODE(_net, PF_LINK, link, CTLFLAG_RW, 0, "Link layers"); SYSCTL_NODE(_net_link, 0, generic, CTLFLAG_RW, 0, "Generic link-management"); SYSCTL_INT(_net_link, OID_AUTO, ifqmaxlen, CTLFLAG_RDTUN, &ifqmaxlen, 0, "max send queue size"); /* Log link state change events */ static int log_link_state_change = 1; SYSCTL_INT(_net_link, OID_AUTO, log_link_state_change, CTLFLAG_RW, &log_link_state_change, 0, "log interface link state change events"); /* Log promiscuous mode change events */ static int log_promisc_mode_change = 1; SYSCTL_INT(_net_link, OID_AUTO, log_promisc_mode_change, CTLFLAG_RDTUN, &log_promisc_mode_change, 1, "log promiscuous mode change events"); /* Interface description */ static unsigned int ifdescr_maxlen = 1024; SYSCTL_UINT(_net, 
OID_AUTO, ifdescr_maxlen, CTLFLAG_RW, &ifdescr_maxlen, 0, "administrative maximum length for interface description"); static MALLOC_DEFINE(M_IFDESCR, "ifdescr", "ifnet descriptions"); /* global sx for non-critical path ifdescr */ static struct sx ifdescr_sx; SX_SYSINIT(ifdescr_sx, &ifdescr_sx, "ifnet descr"); void (*ng_ether_link_state_p)(struct ifnet *ifp, int state); void (*lagg_linkstate_p)(struct ifnet *ifp, int state); /* These are external hooks for CARP. */ void (*carp_linkstate_p)(struct ifnet *ifp); void (*carp_demote_adj_p)(int, char *); int (*carp_master_p)(struct ifaddr *); #if defined(INET) || defined(INET6) int (*carp_forus_p)(struct ifnet *ifp, u_char *dhost); int (*carp_output_p)(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *sa); int (*carp_ioctl_p)(struct ifreq *, u_long, struct thread *); int (*carp_attach_p)(struct ifaddr *, int); void (*carp_detach_p)(struct ifaddr *, bool); #endif #ifdef INET int (*carp_iamatch_p)(struct ifaddr *, uint8_t **); #endif #ifdef INET6 struct ifaddr *(*carp_iamatch6_p)(struct ifnet *ifp, struct in6_addr *taddr6); caddr_t (*carp_macmatch6_p)(struct ifnet *ifp, struct mbuf *m, const struct in6_addr *taddr); #endif struct mbuf *(*tbr_dequeue_ptr)(struct ifaltq *, int) = NULL; /* * XXX: Style; these should be sorted alphabetically, and unprototyped * static functions should be prototyped. Currently they are sorted by * declaration order. */ static void if_attachdomain(void *); static void if_attachdomain1(struct ifnet *); static int ifconf(u_long, caddr_t); static void *if_grow(void); static void if_input_default(struct ifnet *, struct mbuf *); static int if_requestencap_default(struct ifnet *, struct if_encap_req *); static void if_route(struct ifnet *, int flag, int fam); static int if_setflag(struct ifnet *, int, int, int *, int); static int if_transmit(struct ifnet *ifp, struct mbuf *m); static void if_unroute(struct ifnet *, int flag, int fam); static int if_delmulti_locked(struct ifnet *, struct ifmultiaddr *, int); static void do_link_state_change(void *, int); static int if_getgroup(struct ifgroupreq *, struct ifnet *); static int if_getgroupmembers(struct ifgroupreq *); static void if_delgroups(struct ifnet *); static void if_attach_internal(struct ifnet *, int, struct if_clone *); static int if_detach_internal(struct ifnet *, int, struct if_clone **); static void if_siocaddmulti(void *, int); #ifdef VIMAGE static int if_vmove(struct ifnet *, struct vnet *); #endif #ifdef INET6 /* * XXX: declare here to avoid to include many inet6 related files.. * should be more generalized? */ extern void nd6_setmtu(struct ifnet *); #endif /* ipsec helper hooks */ VNET_DEFINE(struct hhook_head *, ipsec_hhh_in[HHOOK_IPSEC_COUNT]); VNET_DEFINE(struct hhook_head *, ipsec_hhh_out[HHOOK_IPSEC_COUNT]); VNET_DEFINE(int, if_index); int ifqmaxlen = IFQ_MAXLEN; VNET_DEFINE(struct ifnethead, ifnet); /* depend on static init XXX */ VNET_DEFINE(struct ifgrouphead, ifg_head); VNET_DEFINE_STATIC(int, if_indexlim) = 8; /* Table of ifnet by index. */ VNET_DEFINE(struct ifnet **, ifindex_table); #define V_if_indexlim VNET(if_indexlim) #define V_ifindex_table VNET(ifindex_table) /* * The global network interface list (V_ifnet) and related state (such as * if_index, if_indexlim, and ifindex_table) are protected by an sxlock and * an rwlock. Either may be acquired shared to stablize the list, but both * must be acquired writable to modify the list. 
This model allows us to * both stablize the interface list during interrupt thread processing, but * also to stablize it over long-running ioctls, without introducing priority * inversions and deadlocks. */ struct rwlock ifnet_rwlock; RW_SYSINIT_FLAGS(ifnet_rw, &ifnet_rwlock, "ifnet_rw", RW_RECURSE); struct sx ifnet_sxlock; SX_SYSINIT_FLAGS(ifnet_sx, &ifnet_sxlock, "ifnet_sx", SX_RECURSE); /* * The allocation of network interfaces is a rather non-atomic affair; we * need to select an index before we are ready to expose the interface for * use, so will use this pointer value to indicate reservation. */ #define IFNET_HOLD (void *)(uintptr_t)(-1) #ifdef VIMAGE #define VNET_IS_SHUTTING_DOWN(_vnet) \ ((_vnet)->vnet_shutdown && (_vnet)->vnet_state < SI_SUB_VNET_DONE) #endif static if_com_alloc_t *if_com_alloc[256]; static if_com_free_t *if_com_free[256]; static MALLOC_DEFINE(M_IFNET, "ifnet", "interface internals"); MALLOC_DEFINE(M_IFADDR, "ifaddr", "interface address"); MALLOC_DEFINE(M_IFMADDR, "ether_multi", "link-level multicast address"); struct ifnet * ifnet_byindex(u_short idx) { struct ifnet *ifp; if (__predict_false(idx > V_if_index)) return (NULL); ifp = *(struct ifnet * const volatile *)(V_ifindex_table + idx); return (__predict_false(ifp == IFNET_HOLD) ? NULL : ifp); } struct ifnet * ifnet_byindex_ref(u_short idx) { struct ifnet *ifp; NET_EPOCH_ASSERT(); ifp = ifnet_byindex(idx); if (ifp == NULL || (ifp->if_flags & IFF_DYING)) return (NULL); if_ref(ifp); return (ifp); } /* * Allocate an ifindex array entry; return 0 on success or an error on * failure. */ static u_short ifindex_alloc(void **old) { u_short idx; IFNET_WLOCK_ASSERT(); /* * Try to find an empty slot below V_if_index. If we fail, take the * next slot. */ for (idx = 1; idx <= V_if_index; idx++) { if (V_ifindex_table[idx] == NULL) break; } /* Catch if_index overflow. */ if (idx >= V_if_indexlim) { *old = if_grow(); return (USHRT_MAX); } if (idx > V_if_index) V_if_index = idx; return (idx); } static void ifindex_free_locked(u_short idx) { IFNET_WLOCK_ASSERT(); V_ifindex_table[idx] = NULL; while (V_if_index > 0 && V_ifindex_table[V_if_index] == NULL) V_if_index--; } static void ifindex_free(u_short idx) { IFNET_WLOCK(); ifindex_free_locked(idx); IFNET_WUNLOCK(); } static void ifnet_setbyindex(u_short idx, struct ifnet *ifp) { V_ifindex_table[idx] = ifp; } struct ifaddr * ifaddr_byindex(u_short idx) { struct ifnet *ifp; struct ifaddr *ifa = NULL; NET_EPOCH_ASSERT(); ifp = ifnet_byindex(idx); if (ifp != NULL && (ifa = ifp->if_addr) != NULL) ifa_ref(ifa); return (ifa); } /* * Network interface utility routines. * * Routines with ifa_ifwith* names take sockaddr *'s as * parameters. 
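As a concrete illustration of that convention, a caller outside this file would typically enter the network epoch before using one of the ifa_ifwith*() lookups and take its own reference on anything it keeps past the epoch section. The helper below is a hypothetical sketch, not code from this change; example_ifp_for_addr() is an invented name, and it only uses functions that appear in this file (ifa_ifwithaddr(), if_ref()).

#include <sys/param.h>
#include <sys/socket.h>
#include <sys/epoch.h>
#include <net/if.h>
#include <net/if_var.h>

/*
 * Hypothetical caller of the ifa_ifwith*() lookups: enter the network
 * epoch to stabilize the interface and address lists, and take a
 * reference on anything that must outlive the epoch section.
 */
static struct ifnet *
example_ifp_for_addr(const struct sockaddr *sa)
{
	struct epoch_tracker et;
	struct ifaddr *ifa;
	struct ifnet *ifp = NULL;

	NET_EPOCH_ENTER(et);
	ifa = ifa_ifwithaddr(sa);
	if (ifa != NULL) {
		ifp = ifa->ifa_ifp;
		if_ref(ifp);	/* hold the ifnet past NET_EPOCH_EXIT() */
	}
	NET_EPOCH_EXIT(et);
	return (ifp);
}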
*/ static void vnet_if_init(const void *unused __unused) { void *old; CK_STAILQ_INIT(&V_ifnet); CK_STAILQ_INIT(&V_ifg_head); IFNET_WLOCK(); old = if_grow(); /* create initial table */ IFNET_WUNLOCK(); epoch_wait_preempt(net_epoch_preempt); free(old, M_IFNET); vnet_if_clone_init(); } VNET_SYSINIT(vnet_if_init, SI_SUB_INIT_IF, SI_ORDER_SECOND, vnet_if_init, NULL); #ifdef VIMAGE static void vnet_if_uninit(const void *unused __unused) { VNET_ASSERT(CK_STAILQ_EMPTY(&V_ifnet), ("%s:%d tailq &V_ifnet=%p " "not empty", __func__, __LINE__, &V_ifnet)); VNET_ASSERT(CK_STAILQ_EMPTY(&V_ifg_head), ("%s:%d tailq &V_ifg_head=%p " "not empty", __func__, __LINE__, &V_ifg_head)); free((caddr_t)V_ifindex_table, M_IFNET); } VNET_SYSUNINIT(vnet_if_uninit, SI_SUB_INIT_IF, SI_ORDER_FIRST, vnet_if_uninit, NULL); static void vnet_if_return(const void *unused __unused) { struct ifnet *ifp, *nifp; /* Return all inherited interfaces to their parent vnets. */ CK_STAILQ_FOREACH_SAFE(ifp, &V_ifnet, if_link, nifp) { if (ifp->if_home_vnet != ifp->if_vnet) if_vmove(ifp, ifp->if_home_vnet); } } VNET_SYSUNINIT(vnet_if_return, SI_SUB_VNET_DONE, SI_ORDER_ANY, vnet_if_return, NULL); #endif static void * if_grow(void) { int oldlim; u_int n; struct ifnet **e; void *old; old = NULL; IFNET_WLOCK_ASSERT(); oldlim = V_if_indexlim; IFNET_WUNLOCK(); n = (oldlim << 1) * sizeof(*e); e = malloc(n, M_IFNET, M_WAITOK | M_ZERO); IFNET_WLOCK(); if (V_if_indexlim != oldlim) { free(e, M_IFNET); return (NULL); } if (V_ifindex_table != NULL) { memcpy((caddr_t)e, (caddr_t)V_ifindex_table, n/2); old = V_ifindex_table; } V_if_indexlim <<= 1; V_ifindex_table = e; return (old); } /* * Allocate a struct ifnet and an index for an interface. A layer 2 * common structure will also be allocated if an allocation routine is * registered for the passed type. */ struct ifnet * if_alloc_domain(u_char type, int numa_domain) { struct ifnet *ifp; u_short idx; void *old; KASSERT(numa_domain <= IF_NODOM, ("numa_domain too large")); if (numa_domain == IF_NODOM) ifp = malloc(sizeof(struct ifnet), M_IFNET, M_WAITOK | M_ZERO); else ifp = malloc_domainset(sizeof(struct ifnet), M_IFNET, DOMAINSET_PREF(numa_domain), M_WAITOK | M_ZERO); restart: IFNET_WLOCK(); idx = ifindex_alloc(&old); if (__predict_false(idx == USHRT_MAX)) { IFNET_WUNLOCK(); epoch_wait_preempt(net_epoch_preempt); free(old, M_IFNET); goto restart; } ifnet_setbyindex(idx, IFNET_HOLD); IFNET_WUNLOCK(); ifp->if_index = idx; ifp->if_type = type; ifp->if_alloctype = type; ifp->if_numa_domain = numa_domain; #ifdef VIMAGE ifp->if_vnet = curvnet; #endif - /* XXX */ - ifp->if_flags |= IFF_NEEDSEPOCH; if (if_com_alloc[type] != NULL) { ifp->if_l2com = if_com_alloc[type](type, ifp); if (ifp->if_l2com == NULL) { free(ifp, M_IFNET); ifindex_free(idx); return (NULL); } } IF_ADDR_LOCK_INIT(ifp); TASK_INIT(&ifp->if_linktask, 0, do_link_state_change, ifp); TASK_INIT(&ifp->if_addmultitask, 0, if_siocaddmulti, ifp); ifp->if_afdata_initialized = 0; IF_AFDATA_LOCK_INIT(ifp); CK_STAILQ_INIT(&ifp->if_addrhead); CK_STAILQ_INIT(&ifp->if_multiaddrs); CK_STAILQ_INIT(&ifp->if_groups); #ifdef MAC mac_ifnet_init(ifp); #endif ifq_init(&ifp->if_snd, ifp); refcount_init(&ifp->if_refcount, 1); /* Index reference. 
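For context on that initial reference, the allocation path above is normally consumed by a driver roughly as sketched below. This is a hypothetical, heavily condensed example (foo_softc, foo_attach() and foo_detach() are invented); a real driver also fills in if_initname(), if_flags and the if_init/if_ioctl/if_transmit methods before calling ether_ifattach().

#include <sys/param.h>
#include <sys/errno.h>
#include <sys/socket.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/if_types.h>
#include <net/ethernet.h>

/* Hypothetical driver softc; only the fields used here. */
struct foo_softc {
	struct ifnet	*ifp;
	uint8_t		 lladdr[ETHER_ADDR_LEN];
};

static int
foo_attach(struct foo_softc *sc)
{
	struct ifnet *ifp;

	ifp = if_alloc(IFT_ETHER);	/* holds the "index reference" */
	if (ifp == NULL)
		return (ENOMEM);
	sc->ifp = ifp;
	ifp->if_softc = sc;
	/* ... if_initname(), if_flags, if_init/if_ioctl/if_transmit ... */
	ether_ifattach(ifp, sc->lladdr);
	return (0);
}

static void
foo_detach(struct foo_softc *sc)
{
	ether_ifdetach(sc->ifp);
	if_free(sc->ifp);	/* releases the if_index slot and last ref */
}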
*/ for (int i = 0; i < IFCOUNTERS; i++) ifp->if_counters[i] = counter_u64_alloc(M_WAITOK); ifp->if_get_counter = if_get_counter_default; ifp->if_pcp = IFNET_PCP_NONE; ifnet_setbyindex(ifp->if_index, ifp); return (ifp); } struct ifnet * if_alloc_dev(u_char type, device_t dev) { int numa_domain; if (dev == NULL || bus_get_domain(dev, &numa_domain) != 0) return (if_alloc_domain(type, IF_NODOM)); return (if_alloc_domain(type, numa_domain)); } struct ifnet * if_alloc(u_char type) { return (if_alloc_domain(type, IF_NODOM)); } /* * Do the actual work of freeing a struct ifnet, and layer 2 common * structure. This call is made when the last reference to an * interface is released. */ static void if_free_internal(struct ifnet *ifp) { KASSERT((ifp->if_flags & IFF_DYING), ("if_free_internal: interface not dying")); if (if_com_free[ifp->if_alloctype] != NULL) if_com_free[ifp->if_alloctype](ifp->if_l2com, ifp->if_alloctype); #ifdef MAC mac_ifnet_destroy(ifp); #endif /* MAC */ IF_AFDATA_DESTROY(ifp); IF_ADDR_LOCK_DESTROY(ifp); ifq_delete(&ifp->if_snd); for (int i = 0; i < IFCOUNTERS; i++) counter_u64_free(ifp->if_counters[i]); free(ifp->if_description, M_IFDESCR); free(ifp->if_hw_addr, M_IFADDR); if (ifp->if_numa_domain == IF_NODOM) free(ifp, M_IFNET); else free_domain(ifp, M_IFNET); } static void if_destroy(epoch_context_t ctx) { struct ifnet *ifp; ifp = __containerof(ctx, struct ifnet, if_epoch_ctx); if_free_internal(ifp); } /* * Deregister an interface and free the associated storage. */ void if_free(struct ifnet *ifp) { ifp->if_flags |= IFF_DYING; /* XXX: Locking */ CURVNET_SET_QUIET(ifp->if_vnet); IFNET_WLOCK(); KASSERT(ifp == ifnet_byindex(ifp->if_index), ("%s: freeing unallocated ifnet", ifp->if_xname)); ifindex_free_locked(ifp->if_index); IFNET_WUNLOCK(); if (refcount_release(&ifp->if_refcount)) NET_EPOCH_CALL(if_destroy, &ifp->if_epoch_ctx); CURVNET_RESTORE(); } /* * Interfaces to keep an ifnet type-stable despite the possibility of the * driver calling if_free(). If there are additional references, we defer * freeing the underlying data structure. */ void if_ref(struct ifnet *ifp) { /* We don't assert the ifnet list lock here, but arguably should. */ refcount_acquire(&ifp->if_refcount); } void if_rele(struct ifnet *ifp) { if (!refcount_release(&ifp->if_refcount)) return; NET_EPOCH_CALL(if_destroy, &ifp->if_epoch_ctx); } void ifq_init(struct ifaltq *ifq, struct ifnet *ifp) { mtx_init(&ifq->ifq_mtx, ifp->if_xname, "if send queue", MTX_DEF); if (ifq->ifq_maxlen == 0) ifq->ifq_maxlen = ifqmaxlen; ifq->altq_type = 0; ifq->altq_disc = NULL; ifq->altq_flags &= ALTQF_CANTCHANGE; ifq->altq_tbr = NULL; ifq->altq_ifp = ifp; } void ifq_delete(struct ifaltq *ifq) { mtx_destroy(&ifq->ifq_mtx); } /* * Perform generic interface initialization tasks and attach the interface * to the list of "active" interfaces. If vmove flag is set on entry * to if_attach_internal(), perform only a limited subset of initialization * tasks, given that we are moving from one vnet to another an ifnet which * has already been fully initialized. * * Note that if_detach_internal() removes group membership unconditionally * even when vmove flag is set, and if_attach_internal() adds only IFG_ALL. * Thus, when if_vmove() is applied to a cloned interface, group membership * is lost while a cloned one always joins a group whose name is * ifc->ifc_name. To recover this after if_detach_internal() and * if_attach_internal(), the cloner should be specified to * if_attach_internal() via ifc. 
If it is non-NULL, if_attach_internal() * attempts to join a group whose name is ifc->ifc_name. * * XXX: * - The decision to return void and thus require this function to * succeed is questionable. * - We should probably do more sanity checking. For instance we don't * do anything to insure if_xname is unique or non-empty. */ void if_attach(struct ifnet *ifp) { if_attach_internal(ifp, 0, NULL); } /* * Compute the least common TSO limit. */ void if_hw_tsomax_common(if_t ifp, struct ifnet_hw_tsomax *pmax) { /* * 1) If there is no limit currently, take the limit from * the network adapter. * * 2) If the network adapter has a limit below the current * limit, apply it. */ if (pmax->tsomaxbytes == 0 || (ifp->if_hw_tsomax != 0 && ifp->if_hw_tsomax < pmax->tsomaxbytes)) { pmax->tsomaxbytes = ifp->if_hw_tsomax; } if (pmax->tsomaxsegcount == 0 || (ifp->if_hw_tsomaxsegcount != 0 && ifp->if_hw_tsomaxsegcount < pmax->tsomaxsegcount)) { pmax->tsomaxsegcount = ifp->if_hw_tsomaxsegcount; } if (pmax->tsomaxsegsize == 0 || (ifp->if_hw_tsomaxsegsize != 0 && ifp->if_hw_tsomaxsegsize < pmax->tsomaxsegsize)) { pmax->tsomaxsegsize = ifp->if_hw_tsomaxsegsize; } } /* * Update TSO limit of a network adapter. * * Returns zero if no change. Else non-zero. */ int if_hw_tsomax_update(if_t ifp, struct ifnet_hw_tsomax *pmax) { int retval = 0; if (ifp->if_hw_tsomax != pmax->tsomaxbytes) { ifp->if_hw_tsomax = pmax->tsomaxbytes; retval++; } if (ifp->if_hw_tsomaxsegsize != pmax->tsomaxsegsize) { ifp->if_hw_tsomaxsegsize = pmax->tsomaxsegsize; retval++; } if (ifp->if_hw_tsomaxsegcount != pmax->tsomaxsegcount) { ifp->if_hw_tsomaxsegcount = pmax->tsomaxsegcount; retval++; } return (retval); } static void if_attach_internal(struct ifnet *ifp, int vmove, struct if_clone *ifc) { unsigned socksize, ifasize; int namelen, masklen; struct sockaddr_dl *sdl; struct ifaddr *ifa; if (ifp->if_index == 0 || ifp != ifnet_byindex(ifp->if_index)) panic ("%s: BUG: if_attach called without if_alloc'd input()\n", ifp->if_xname); #ifdef VIMAGE ifp->if_vnet = curvnet; if (ifp->if_home_vnet == NULL) ifp->if_home_vnet = curvnet; #endif if_addgroup(ifp, IFG_ALL); /* Restore group membership for cloned interfaces. */ if (vmove && ifc != NULL) if_clone_addgroup(ifp, ifc); getmicrotime(&ifp->if_lastchange); ifp->if_epoch = time_uptime; KASSERT((ifp->if_transmit == NULL && ifp->if_qflush == NULL) || (ifp->if_transmit != NULL && ifp->if_qflush != NULL), ("transmit and qflush must both either be set or both be NULL")); if (ifp->if_transmit == NULL) { ifp->if_transmit = if_transmit; ifp->if_qflush = if_qflush; } if (ifp->if_input == NULL) ifp->if_input = if_input_default; if (ifp->if_requestencap == NULL) ifp->if_requestencap = if_requestencap_default; if (!vmove) { #ifdef MAC mac_ifnet_create(ifp); #endif /* * Create a Link Level name for this device. */ namelen = strlen(ifp->if_xname); /* * Always save enough space for any possiable name so we * can do a rename in place later. 
*/ masklen = offsetof(struct sockaddr_dl, sdl_data[0]) + IFNAMSIZ; socksize = masklen + ifp->if_addrlen; if (socksize < sizeof(*sdl)) socksize = sizeof(*sdl); socksize = roundup2(socksize, sizeof(long)); ifasize = sizeof(*ifa) + 2 * socksize; ifa = ifa_alloc(ifasize, M_WAITOK); sdl = (struct sockaddr_dl *)(ifa + 1); sdl->sdl_len = socksize; sdl->sdl_family = AF_LINK; bcopy(ifp->if_xname, sdl->sdl_data, namelen); sdl->sdl_nlen = namelen; sdl->sdl_index = ifp->if_index; sdl->sdl_type = ifp->if_type; ifp->if_addr = ifa; ifa->ifa_ifp = ifp; ifa->ifa_addr = (struct sockaddr *)sdl; sdl = (struct sockaddr_dl *)(socksize + (caddr_t)sdl); ifa->ifa_netmask = (struct sockaddr *)sdl; sdl->sdl_len = masklen; while (namelen != 0) sdl->sdl_data[--namelen] = 0xff; CK_STAILQ_INSERT_HEAD(&ifp->if_addrhead, ifa, ifa_link); /* Reliably crash if used uninitialized. */ ifp->if_broadcastaddr = NULL; if (ifp->if_type == IFT_ETHER) { ifp->if_hw_addr = malloc(ifp->if_addrlen, M_IFADDR, M_WAITOK | M_ZERO); } #if defined(INET) || defined(INET6) /* Use defaults for TSO, if nothing is set */ if (ifp->if_hw_tsomax == 0 && ifp->if_hw_tsomaxsegcount == 0 && ifp->if_hw_tsomaxsegsize == 0) { /* * The TSO defaults needs to be such that an * NFS mbuf list of 35 mbufs totalling just * below 64K works and that a chain of mbufs * can be defragged into at most 32 segments: */ ifp->if_hw_tsomax = min(IP_MAXPACKET, (32 * MCLBYTES) - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN)); ifp->if_hw_tsomaxsegcount = 35; ifp->if_hw_tsomaxsegsize = 2048; /* 2K */ /* XXX some drivers set IFCAP_TSO after ethernet attach */ if (ifp->if_capabilities & IFCAP_TSO) { if_printf(ifp, "Using defaults for TSO: %u/%u/%u\n", ifp->if_hw_tsomax, ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize); } } #endif } #ifdef VIMAGE else { /* * Update the interface index in the link layer address * of the interface. */ for (ifa = ifp->if_addr; ifa != NULL; ifa = CK_STAILQ_NEXT(ifa, ifa_link)) { if (ifa->ifa_addr->sa_family == AF_LINK) { sdl = (struct sockaddr_dl *)ifa->ifa_addr; sdl->sdl_index = ifp->if_index; } } } #endif IFNET_WLOCK(); CK_STAILQ_INSERT_TAIL(&V_ifnet, ifp, if_link); #ifdef VIMAGE curvnet->vnet_ifcnt++; #endif IFNET_WUNLOCK(); if (domain_init_status >= 2) if_attachdomain1(ifp); EVENTHANDLER_INVOKE(ifnet_arrival_event, ifp); if (IS_DEFAULT_VNET(curvnet)) devctl_notify("IFNET", ifp->if_xname, "ATTACH", NULL); /* Announce the interface. */ rt_ifannouncemsg(ifp, IFAN_ARRIVAL); } static void if_epochalloc(void *dummy __unused) { net_epoch_preempt = epoch_alloc("Net preemptible", EPOCH_PREEMPT); } SYSINIT(ifepochalloc, SI_SUB_EPOCH, SI_ORDER_ANY, if_epochalloc, NULL); static void if_attachdomain(void *dummy) { struct ifnet *ifp; CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) if_attachdomain1(ifp); } SYSINIT(domainifattach, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_SECOND, if_attachdomain, NULL); static void if_attachdomain1(struct ifnet *ifp) { struct domain *dp; /* * Since dp->dom_ifattach calls malloc() with M_WAITOK, we * cannot lock ifp->if_afdata initialization, entirely. 
*/ IF_AFDATA_LOCK(ifp); if (ifp->if_afdata_initialized >= domain_init_status) { IF_AFDATA_UNLOCK(ifp); log(LOG_WARNING, "%s called more than once on %s\n", __func__, ifp->if_xname); return; } ifp->if_afdata_initialized = domain_init_status; IF_AFDATA_UNLOCK(ifp); /* address family dependent data region */ bzero(ifp->if_afdata, sizeof(ifp->if_afdata)); for (dp = domains; dp; dp = dp->dom_next) { if (dp->dom_ifattach) ifp->if_afdata[dp->dom_family] = (*dp->dom_ifattach)(ifp); } } /* * Remove any unicast or broadcast network addresses from an interface. */ void if_purgeaddrs(struct ifnet *ifp) { struct ifaddr *ifa; while (1) { struct epoch_tracker et; NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_LINK) break; } NET_EPOCH_EXIT(et); if (ifa == NULL) break; #ifdef INET /* XXX: Ugly!! ad hoc just for INET */ if (ifa->ifa_addr->sa_family == AF_INET) { struct ifaliasreq ifr; bzero(&ifr, sizeof(ifr)); ifr.ifra_addr = *ifa->ifa_addr; if (ifa->ifa_dstaddr) ifr.ifra_broadaddr = *ifa->ifa_dstaddr; if (in_control(NULL, SIOCDIFADDR, (caddr_t)&ifr, ifp, NULL) == 0) continue; } #endif /* INET */ #ifdef INET6 if (ifa->ifa_addr->sa_family == AF_INET6) { in6_purgeaddr(ifa); /* ifp_addrhead is already updated */ continue; } #endif /* INET6 */ IF_ADDR_WLOCK(ifp); CK_STAILQ_REMOVE(&ifp->if_addrhead, ifa, ifaddr, ifa_link); IF_ADDR_WUNLOCK(ifp); ifa_free(ifa); } } /* * Remove any multicast network addresses from an interface when an ifnet * is going away. */ static void if_purgemaddrs(struct ifnet *ifp) { struct ifmultiaddr *ifma; IF_ADDR_WLOCK(ifp); while (!CK_STAILQ_EMPTY(&ifp->if_multiaddrs)) { ifma = CK_STAILQ_FIRST(&ifp->if_multiaddrs); CK_STAILQ_REMOVE(&ifp->if_multiaddrs, ifma, ifmultiaddr, ifma_link); if_delmulti_locked(ifp, ifma, 1); } IF_ADDR_WUNLOCK(ifp); } /* * Detach an interface, removing it from the list of "active" interfaces. * If vmove flag is set on entry to if_detach_internal(), perform only a * limited subset of cleanup tasks, given that we are moving an ifnet from * one vnet to another, where it must be fully operational. * * XXXRW: There are some significant questions about event ordering, and * how to prevent things from starting to use the interface during detach. */ void if_detach(struct ifnet *ifp) { CURVNET_SET_QUIET(ifp->if_vnet); if_detach_internal(ifp, 0, NULL); CURVNET_RESTORE(); } /* * The vmove flag, if set, indicates that we are called from a callpath * that is moving an interface to a different vnet instance. * * The shutdown flag, if set, indicates that we are called in the * process of shutting down a vnet instance. Currently only the * vnet_if_return SYSUNINIT function sets it. Note: we can be called * on a vnet instance shutdown without this flag being set, e.g., when * the cloned interfaces are destoyed as first thing of teardown. */ static int if_detach_internal(struct ifnet *ifp, int vmove, struct if_clone **ifcp) { struct ifaddr *ifa; int i; struct domain *dp; struct ifnet *iter; int found = 0; #ifdef VIMAGE bool shutdown; shutdown = VNET_IS_SHUTTING_DOWN(ifp->if_vnet); #endif IFNET_WLOCK(); CK_STAILQ_FOREACH(iter, &V_ifnet, if_link) if (iter == ifp) { CK_STAILQ_REMOVE(&V_ifnet, ifp, ifnet, if_link); if (!vmove) ifp->if_flags |= IFF_DYING; found = 1; break; } IFNET_WUNLOCK(); if (!found) { /* * While we would want to panic here, we cannot * guarantee that the interface is indeed still on * the list given we don't hold locks all the way. 
*/ return (ENOENT); #if 0 if (vmove) panic("%s: ifp=%p not on the ifnet tailq %p", __func__, ifp, &V_ifnet); else return; /* XXX this should panic as well? */ #endif } /* * At this point we know the interface still was on the ifnet list * and we removed it so we are in a stable state. */ #ifdef VIMAGE curvnet->vnet_ifcnt--; #endif epoch_wait_preempt(net_epoch_preempt); /* * Ensure all pending EPOCH(9) callbacks have been executed. This * fixes issues about late destruction of multicast options * which lead to leave group calls, which in turn access the * belonging ifnet structure: */ epoch_drain_callbacks(net_epoch_preempt); /* * In any case (destroy or vmove) detach us from the groups * and remove/wait for pending events on the taskq. * XXX-BZ in theory an interface could still enqueue a taskq change? */ if_delgroups(ifp); taskqueue_drain(taskqueue_swi, &ifp->if_linktask); taskqueue_drain(taskqueue_swi, &ifp->if_addmultitask); /* * Check if this is a cloned interface or not. Must do even if * shutting down as a if_vmove_reclaim() would move the ifp and * the if_clone_addgroup() will have a corrupted string overwise * from a gibberish pointer. */ if (vmove && ifcp != NULL) *ifcp = if_clone_findifc(ifp); if_down(ifp); #ifdef VIMAGE /* * On VNET shutdown abort here as the stack teardown will do all * the work top-down for us. */ if (shutdown) { /* Give interface users the chance to clean up. */ EVENTHANDLER_INVOKE(ifnet_departure_event, ifp); /* * In case of a vmove we are done here without error. * If we would signal an error it would lead to the same * abort as if we did not find the ifnet anymore. * if_detach() calls us in void context and does not care * about an early abort notification, so life is splendid :) */ goto finish_vnet_shutdown; } #endif /* * At this point we are not tearing down a VNET and are either * going to destroy or vmove the interface and have to cleanup * accordingly. */ /* * Remove routes and flush queues. */ #ifdef ALTQ if (ALTQ_IS_ENABLED(&ifp->if_snd)) altq_disable(&ifp->if_snd); if (ALTQ_IS_ATTACHED(&ifp->if_snd)) altq_detach(&ifp->if_snd); #endif if_purgeaddrs(ifp); #ifdef INET in_ifdetach(ifp); #endif #ifdef INET6 /* * Remove all IPv6 kernel structs related to ifp. This should be done * before removing routing entries below, since IPv6 interface direct * routes are expected to be removed by the IPv6-specific kernel API. * Otherwise, the kernel will detect some inconsistency and bark it. */ in6_ifdetach(ifp); #endif if_purgemaddrs(ifp); /* Announce that the interface is gone. */ rt_ifannouncemsg(ifp, IFAN_DEPARTURE); EVENTHANDLER_INVOKE(ifnet_departure_event, ifp); if (IS_DEFAULT_VNET(curvnet)) devctl_notify("IFNET", ifp->if_xname, "DETACH", NULL); if (!vmove) { /* * Prevent further calls into the device driver via ifnet. */ if_dead(ifp); /* * Clean up all addresses. */ IF_ADDR_WLOCK(ifp); if (!CK_STAILQ_EMPTY(&ifp->if_addrhead)) { ifa = CK_STAILQ_FIRST(&ifp->if_addrhead); CK_STAILQ_REMOVE(&ifp->if_addrhead, ifa, ifaddr, ifa_link); IF_ADDR_WUNLOCK(ifp); ifa_free(ifa); } else IF_ADDR_WUNLOCK(ifp); } rt_flushifroutes(ifp); #ifdef VIMAGE finish_vnet_shutdown: #endif /* * We cannot hold the lock over dom_ifdetach calls as they might * sleep, for example trying to drain a callout, thus open up the * theoretical race with re-attaching. 
*/ IF_AFDATA_LOCK(ifp); i = ifp->if_afdata_initialized; ifp->if_afdata_initialized = 0; IF_AFDATA_UNLOCK(ifp); for (dp = domains; i > 0 && dp; dp = dp->dom_next) { if (dp->dom_ifdetach && ifp->if_afdata[dp->dom_family]) { (*dp->dom_ifdetach)(ifp, ifp->if_afdata[dp->dom_family]); ifp->if_afdata[dp->dom_family] = NULL; } } return (0); } #ifdef VIMAGE /* * if_vmove() performs a limited version of if_detach() in current * vnet and if_attach()es the ifnet to the vnet specified as 2nd arg. * An attempt is made to shrink if_index in current vnet, find an * unused if_index in target vnet and calls if_grow() if necessary, * and finally find an unused if_xname for the target vnet. */ static int if_vmove(struct ifnet *ifp, struct vnet *new_vnet) { struct if_clone *ifc; #ifdef DEV_BPF u_int bif_dlt, bif_hdrlen; #endif void *old; int rc; #ifdef DEV_BPF /* * if_detach_internal() will call the eventhandler to notify * interface departure. That will detach if_bpf. We need to * safe the dlt and hdrlen so we can re-attach it later. */ bpf_get_bp_params(ifp->if_bpf, &bif_dlt, &bif_hdrlen); #endif /* * Detach from current vnet, but preserve LLADDR info, do not * mark as dead etc. so that the ifnet can be reattached later. * If we cannot find it, we lost the race to someone else. */ rc = if_detach_internal(ifp, 1, &ifc); if (rc != 0) return (rc); /* * Unlink the ifnet from ifindex_table[] in current vnet, and shrink * the if_index for that vnet if possible. * * NOTE: IFNET_WLOCK/IFNET_WUNLOCK() are assumed to be unvirtualized, * or we'd lock on one vnet and unlock on another. */ IFNET_WLOCK(); ifindex_free_locked(ifp->if_index); IFNET_WUNLOCK(); /* * Perform interface-specific reassignment tasks, if provided by * the driver. */ if (ifp->if_reassign != NULL) ifp->if_reassign(ifp, new_vnet, NULL); /* * Switch to the context of the target vnet. */ CURVNET_SET_QUIET(new_vnet); restart: IFNET_WLOCK(); ifp->if_index = ifindex_alloc(&old); if (__predict_false(ifp->if_index == USHRT_MAX)) { IFNET_WUNLOCK(); epoch_wait_preempt(net_epoch_preempt); free(old, M_IFNET); goto restart; } ifnet_setbyindex(ifp->if_index, ifp); IFNET_WUNLOCK(); if_attach_internal(ifp, 1, ifc); #ifdef DEV_BPF if (ifp->if_bpf == NULL) bpfattach(ifp, bif_dlt, bif_hdrlen); #endif CURVNET_RESTORE(); return (0); } /* * Move an ifnet to or from another child prison/vnet, specified by the jail id. */ static int if_vmove_loan(struct thread *td, struct ifnet *ifp, char *ifname, int jid) { struct prison *pr; struct ifnet *difp; int error; bool shutdown; /* Try to find the prison within our visibility. */ sx_slock(&allprison_lock); pr = prison_find_child(td->td_ucred->cr_prison, jid); sx_sunlock(&allprison_lock); if (pr == NULL) return (ENXIO); prison_hold_locked(pr); mtx_unlock(&pr->pr_mtx); /* Do not try to move the iface from and to the same prison. */ if (pr->pr_vnet == ifp->if_vnet) { prison_free(pr); return (EEXIST); } /* Make sure the named iface does not exists in the dst. prison/vnet. */ /* XXX Lock interfaces to avoid races. */ CURVNET_SET_QUIET(pr->pr_vnet); difp = ifunit(ifname); if (difp != NULL) { CURVNET_RESTORE(); prison_free(pr); return (EEXIST); } /* Make sure the VNET is stable. */ shutdown = VNET_IS_SHUTTING_DOWN(ifp->if_vnet); if (shutdown) { CURVNET_RESTORE(); prison_free(pr); return (EBUSY); } CURVNET_RESTORE(); /* Move the interface into the child jail/vnet. */ error = if_vmove(ifp, pr->pr_vnet); /* Report the new if_xname back to the userland on success. 
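The usual initiator of this path is userland moving an interface into a vnet jail. The program below is illustrative only and is not part of this change; SIOCSIFVNET and the ifr_jid field are assumed from the stock sys/sockio.h and net/if.h headers, so treat both as assumptions and adjust for your tree. It must run with privilege, against a jail created with its own vnet.

/*
 * Userland sketch: push an existing interface into a vnet jail, which
 * lands in if_vmove_loan() above.  SIOCSIFVNET and ifr_jid are assumed
 * from the stock <sys/sockio.h> and <net/if.h>.
 */
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <net/if.h>
#include <err.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int
main(int argc, char **argv)
{
	struct ifreq ifr;
	int s;

	if (argc != 3)
		errx(1, "usage: %s ifname jid", argv[0]);
	if ((s = socket(AF_LOCAL, SOCK_DGRAM, 0)) == -1)
		err(1, "socket");
	memset(&ifr, 0, sizeof(ifr));
	strlcpy(ifr.ifr_name, argv[1], sizeof(ifr.ifr_name));
	ifr.ifr_jid = atoi(argv[2]);
	if (ioctl(s, SIOCSIFVNET, &ifr) == -1)
		err(1, "SIOCSIFVNET");
	/* On success the kernel wrote the (possibly renamed) name back. */
	printf("moved as %s\n", ifr.ifr_name);
	close(s);
	return (0);
}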
*/ if (error == 0) sprintf(ifname, "%s", ifp->if_xname); prison_free(pr); return (error); } static int if_vmove_reclaim(struct thread *td, char *ifname, int jid) { struct prison *pr; struct vnet *vnet_dst; struct ifnet *ifp; int error; bool shutdown; /* Try to find the prison within our visibility. */ sx_slock(&allprison_lock); pr = prison_find_child(td->td_ucred->cr_prison, jid); sx_sunlock(&allprison_lock); if (pr == NULL) return (ENXIO); prison_hold_locked(pr); mtx_unlock(&pr->pr_mtx); /* Make sure the named iface exists in the source prison/vnet. */ CURVNET_SET(pr->pr_vnet); ifp = ifunit(ifname); /* XXX Lock to avoid races. */ if (ifp == NULL) { CURVNET_RESTORE(); prison_free(pr); return (ENXIO); } /* Do not try to move the iface from and to the same prison. */ vnet_dst = TD_TO_VNET(td); if (vnet_dst == ifp->if_vnet) { CURVNET_RESTORE(); prison_free(pr); return (EEXIST); } /* Make sure the VNET is stable. */ shutdown = VNET_IS_SHUTTING_DOWN(ifp->if_vnet); if (shutdown) { CURVNET_RESTORE(); prison_free(pr); return (EBUSY); } /* Get interface back from child jail/vnet. */ error = if_vmove(ifp, vnet_dst); CURVNET_RESTORE(); /* Report the new if_xname back to the userland on success. */ if (error == 0) sprintf(ifname, "%s", ifp->if_xname); prison_free(pr); return (error); } #endif /* VIMAGE */ /* * Add a group to an interface */ int if_addgroup(struct ifnet *ifp, const char *groupname) { struct ifg_list *ifgl; struct ifg_group *ifg = NULL; struct ifg_member *ifgm; int new = 0; if (groupname[0] && groupname[strlen(groupname) - 1] >= '0' && groupname[strlen(groupname) - 1] <= '9') return (EINVAL); IFNET_WLOCK(); CK_STAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) if (!strcmp(ifgl->ifgl_group->ifg_group, groupname)) { IFNET_WUNLOCK(); return (EEXIST); } if ((ifgl = malloc(sizeof(*ifgl), M_TEMP, M_NOWAIT)) == NULL) { IFNET_WUNLOCK(); return (ENOMEM); } if ((ifgm = malloc(sizeof(*ifgm), M_TEMP, M_NOWAIT)) == NULL) { free(ifgl, M_TEMP); IFNET_WUNLOCK(); return (ENOMEM); } CK_STAILQ_FOREACH(ifg, &V_ifg_head, ifg_next) if (!strcmp(ifg->ifg_group, groupname)) break; if (ifg == NULL) { if ((ifg = malloc(sizeof(*ifg), M_TEMP, M_NOWAIT)) == NULL) { free(ifgl, M_TEMP); free(ifgm, M_TEMP); IFNET_WUNLOCK(); return (ENOMEM); } strlcpy(ifg->ifg_group, groupname, sizeof(ifg->ifg_group)); ifg->ifg_refcnt = 0; CK_STAILQ_INIT(&ifg->ifg_members); CK_STAILQ_INSERT_TAIL(&V_ifg_head, ifg, ifg_next); new = 1; } ifg->ifg_refcnt++; ifgl->ifgl_group = ifg; ifgm->ifgm_ifp = ifp; IF_ADDR_WLOCK(ifp); CK_STAILQ_INSERT_TAIL(&ifg->ifg_members, ifgm, ifgm_next); CK_STAILQ_INSERT_TAIL(&ifp->if_groups, ifgl, ifgl_next); IF_ADDR_WUNLOCK(ifp); IFNET_WUNLOCK(); if (new) EVENTHANDLER_INVOKE(group_attach_event, ifg); EVENTHANDLER_INVOKE(group_change_event, groupname); return (0); } /* * Helper function to remove a group out of an interface. Expects the global * ifnet lock to be write-locked, and drops it before returning. 
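For reference, kernel consumers pair if_addgroup() with if_delgroup() around attach and detach. The fragment below is a hypothetical sketch (the group name "offload" and the example_* function names are invented); the error handling follows the return values visible in if_addgroup() above.

#include <sys/param.h>
#include <sys/errno.h>
#include <sys/socket.h>
#include <net/if.h>
#include <net/if_var.h>

/*
 * Hypothetical driver/cloner hooks: make the interface visible under a
 * group name for the lifetime of the device.
 */
static void
example_join_group(struct ifnet *ifp)
{
	int error;

	/* Names ending in a digit are rejected by if_addgroup(). */
	error = if_addgroup(ifp, "offload");
	if (error != 0 && error != EEXIST)
		if_printf(ifp, "could not join group: %d\n", error);
}

static void
example_leave_group(struct ifnet *ifp)
{
	/* ENOENT just means the group was already gone. */
	(void)if_delgroup(ifp, "offload");
}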
*/ static void _if_delgroup_locked(struct ifnet *ifp, struct ifg_list *ifgl, const char *groupname) { struct ifg_member *ifgm; bool freeifgl; IFNET_WLOCK_ASSERT(); IF_ADDR_WLOCK(ifp); CK_STAILQ_REMOVE(&ifp->if_groups, ifgl, ifg_list, ifgl_next); IF_ADDR_WUNLOCK(ifp); CK_STAILQ_FOREACH(ifgm, &ifgl->ifgl_group->ifg_members, ifgm_next) { if (ifgm->ifgm_ifp == ifp) { CK_STAILQ_REMOVE(&ifgl->ifgl_group->ifg_members, ifgm, ifg_member, ifgm_next); break; } } if (--ifgl->ifgl_group->ifg_refcnt == 0) { CK_STAILQ_REMOVE(&V_ifg_head, ifgl->ifgl_group, ifg_group, ifg_next); freeifgl = true; } else { freeifgl = false; } IFNET_WUNLOCK(); epoch_wait_preempt(net_epoch_preempt); if (freeifgl) { EVENTHANDLER_INVOKE(group_detach_event, ifgl->ifgl_group); free(ifgl->ifgl_group, M_TEMP); } free(ifgm, M_TEMP); free(ifgl, M_TEMP); EVENTHANDLER_INVOKE(group_change_event, groupname); } /* * Remove a group from an interface */ int if_delgroup(struct ifnet *ifp, const char *groupname) { struct ifg_list *ifgl; IFNET_WLOCK(); CK_STAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) if (strcmp(ifgl->ifgl_group->ifg_group, groupname) == 0) break; if (ifgl == NULL) { IFNET_WUNLOCK(); return (ENOENT); } _if_delgroup_locked(ifp, ifgl, groupname); return (0); } /* * Remove an interface from all groups */ static void if_delgroups(struct ifnet *ifp) { struct ifg_list *ifgl; char groupname[IFNAMSIZ]; IFNET_WLOCK(); while ((ifgl = CK_STAILQ_FIRST(&ifp->if_groups)) != NULL) { strlcpy(groupname, ifgl->ifgl_group->ifg_group, IFNAMSIZ); _if_delgroup_locked(ifp, ifgl, groupname); IFNET_WLOCK(); } IFNET_WUNLOCK(); } static char * ifgr_group_get(void *ifgrp) { union ifgroupreq_union *ifgrup; ifgrup = ifgrp; #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) return (&ifgrup->ifgr32.ifgr_ifgru.ifgru_group[0]); #endif return (&ifgrup->ifgr.ifgr_ifgru.ifgru_group[0]); } static struct ifg_req * ifgr_groups_get(void *ifgrp) { union ifgroupreq_union *ifgrup; ifgrup = ifgrp; #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) return ((struct ifg_req *)(uintptr_t) ifgrup->ifgr32.ifgr_ifgru.ifgru_groups); #endif return (ifgrup->ifgr.ifgr_ifgru.ifgru_groups); } /* * Stores all groups from an interface in memory pointed to by ifgr. 
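The function that follows implements the kernel half of a two-pass protocol: a first call with ifgr_len set to zero reports the required buffer size, and a second call copies out one ifg_req per group. A minimal userland sketch of that protocol is shown below; it is illustrative only, and SIOCGIFGROUP plus the ifgr_groups accessor are assumed from the stock headers rather than taken from this change.

/*
 * Userland sketch of the two-pass SIOCGIFGROUP protocol served by
 * if_getgroup() below.
 */
#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <net/if.h>
#include <err.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int
main(int argc, char **argv)
{
	struct ifgroupreq ifgr;
	struct ifg_req *ifg;
	u_int i, n;
	int s;

	if (argc != 2)
		errx(1, "usage: %s ifname", argv[0]);
	if ((s = socket(AF_LOCAL, SOCK_DGRAM, 0)) == -1)
		err(1, "socket");

	memset(&ifgr, 0, sizeof(ifgr));
	strlcpy(ifgr.ifgr_name, argv[1], sizeof(ifgr.ifgr_name));

	/* Pass 1: ifgr_len == 0 asks the kernel for the required size. */
	if (ioctl(s, SIOCGIFGROUP, &ifgr) == -1)
		err(1, "SIOCGIFGROUP (size)");
	if ((ifg = calloc(1, ifgr.ifgr_len)) == NULL)
		err(1, "calloc");
	ifgr.ifgr_groups = ifg;

	/* Pass 2: the kernel copies out one ifg_req per group. */
	if (ioctl(s, SIOCGIFGROUP, &ifgr) == -1)
		err(1, "SIOCGIFGROUP (fetch)");
	n = ifgr.ifgr_len / sizeof(*ifg);
	for (i = 0; i < n; i++)
		printf("%s\n", ifg[i].ifgrq_group);

	free(ifg);
	return (0);
}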
*/ static int if_getgroup(struct ifgroupreq *ifgr, struct ifnet *ifp) { int len, error; struct ifg_list *ifgl; struct ifg_req ifgrq, *ifgp; NET_EPOCH_ASSERT(); if (ifgr->ifgr_len == 0) { CK_STAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) ifgr->ifgr_len += sizeof(struct ifg_req); return (0); } len = ifgr->ifgr_len; ifgp = ifgr_groups_get(ifgr); /* XXX: wire */ CK_STAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) { if (len < sizeof(ifgrq)) return (EINVAL); bzero(&ifgrq, sizeof ifgrq); strlcpy(ifgrq.ifgrq_group, ifgl->ifgl_group->ifg_group, sizeof(ifgrq.ifgrq_group)); if ((error = copyout(&ifgrq, ifgp, sizeof(struct ifg_req)))) return (error); len -= sizeof(ifgrq); ifgp++; } return (0); } /* * Stores all members of a group in memory pointed to by igfr */ static int if_getgroupmembers(struct ifgroupreq *ifgr) { struct ifg_group *ifg; struct ifg_member *ifgm; struct ifg_req ifgrq, *ifgp; int len, error; IFNET_RLOCK(); CK_STAILQ_FOREACH(ifg, &V_ifg_head, ifg_next) if (strcmp(ifg->ifg_group, ifgr->ifgr_name) == 0) break; if (ifg == NULL) { IFNET_RUNLOCK(); return (ENOENT); } if (ifgr->ifgr_len == 0) { CK_STAILQ_FOREACH(ifgm, &ifg->ifg_members, ifgm_next) ifgr->ifgr_len += sizeof(ifgrq); IFNET_RUNLOCK(); return (0); } len = ifgr->ifgr_len; ifgp = ifgr_groups_get(ifgr); CK_STAILQ_FOREACH(ifgm, &ifg->ifg_members, ifgm_next) { if (len < sizeof(ifgrq)) { IFNET_RUNLOCK(); return (EINVAL); } bzero(&ifgrq, sizeof ifgrq); strlcpy(ifgrq.ifgrq_member, ifgm->ifgm_ifp->if_xname, sizeof(ifgrq.ifgrq_member)); if ((error = copyout(&ifgrq, ifgp, sizeof(struct ifg_req)))) { IFNET_RUNLOCK(); return (error); } len -= sizeof(ifgrq); ifgp++; } IFNET_RUNLOCK(); return (0); } /* * Return counter values from counter(9)s stored in ifnet. */ uint64_t if_get_counter_default(struct ifnet *ifp, ift_counter cnt) { KASSERT(cnt < IFCOUNTERS, ("%s: invalid cnt %d", __func__, cnt)); return (counter_u64_fetch(ifp->if_counters[cnt])); } /* * Increase an ifnet counter. Usually used for counters shared * between the stack and a driver, but function supports them all. */ void if_inc_counter(struct ifnet *ifp, ift_counter cnt, int64_t inc) { KASSERT(cnt < IFCOUNTERS, ("%s: invalid cnt %d", __func__, cnt)); counter_u64_add(ifp->if_counters[cnt], inc); } /* * Copy data from ifnet to userland API structure if_data. 
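The snapshot built by the next function is what eventually reaches userland, for example through getifaddrs(3), which exposes a struct if_data via ifa_data for AF_LINK entries. The program below is a small sketch under that assumption and is not part of this change.

/*
 * Userland sketch: the counters and metadata assembled by if_data_copy()
 * are what getifaddrs(3) hands back through ifa_data for AF_LINK
 * entries.  Illustrative only.
 */
#include <sys/types.h>
#include <sys/socket.h>
#include <net/if.h>
#include <ifaddrs.h>
#include <err.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	struct ifaddrs *ifap, *ifa;
	const struct if_data *ifd;

	if (getifaddrs(&ifap) == -1)
		err(1, "getifaddrs");
	for (ifa = ifap; ifa != NULL; ifa = ifa->ifa_next) {
		if (ifa->ifa_addr == NULL ||
		    ifa->ifa_addr->sa_family != AF_LINK)
			continue;
		ifd = ifa->ifa_data;
		printf("%-16s ipackets %ju opackets %ju\n", ifa->ifa_name,
		    (uintmax_t)ifd->ifi_ipackets,
		    (uintmax_t)ifd->ifi_opackets);
	}
	freeifaddrs(ifap);
	return (0);
}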
*/ void if_data_copy(struct ifnet *ifp, struct if_data *ifd) { ifd->ifi_type = ifp->if_type; ifd->ifi_physical = 0; ifd->ifi_addrlen = ifp->if_addrlen; ifd->ifi_hdrlen = ifp->if_hdrlen; ifd->ifi_link_state = ifp->if_link_state; ifd->ifi_vhid = 0; ifd->ifi_datalen = sizeof(struct if_data); ifd->ifi_mtu = ifp->if_mtu; ifd->ifi_metric = ifp->if_metric; ifd->ifi_baudrate = ifp->if_baudrate; ifd->ifi_hwassist = ifp->if_hwassist; ifd->ifi_epoch = ifp->if_epoch; ifd->ifi_lastchange = ifp->if_lastchange; ifd->ifi_ipackets = ifp->if_get_counter(ifp, IFCOUNTER_IPACKETS); ifd->ifi_ierrors = ifp->if_get_counter(ifp, IFCOUNTER_IERRORS); ifd->ifi_opackets = ifp->if_get_counter(ifp, IFCOUNTER_OPACKETS); ifd->ifi_oerrors = ifp->if_get_counter(ifp, IFCOUNTER_OERRORS); ifd->ifi_collisions = ifp->if_get_counter(ifp, IFCOUNTER_COLLISIONS); ifd->ifi_ibytes = ifp->if_get_counter(ifp, IFCOUNTER_IBYTES); ifd->ifi_obytes = ifp->if_get_counter(ifp, IFCOUNTER_OBYTES); ifd->ifi_imcasts = ifp->if_get_counter(ifp, IFCOUNTER_IMCASTS); ifd->ifi_omcasts = ifp->if_get_counter(ifp, IFCOUNTER_OMCASTS); ifd->ifi_iqdrops = ifp->if_get_counter(ifp, IFCOUNTER_IQDROPS); ifd->ifi_oqdrops = ifp->if_get_counter(ifp, IFCOUNTER_OQDROPS); ifd->ifi_noproto = ifp->if_get_counter(ifp, IFCOUNTER_NOPROTO); } /* * Initialization, destruction and refcounting functions for ifaddrs. */ struct ifaddr * ifa_alloc(size_t size, int flags) { struct ifaddr *ifa; KASSERT(size >= sizeof(struct ifaddr), ("%s: invalid size %zu", __func__, size)); ifa = malloc(size, M_IFADDR, M_ZERO | flags); if (ifa == NULL) return (NULL); if ((ifa->ifa_opackets = counter_u64_alloc(flags)) == NULL) goto fail; if ((ifa->ifa_ipackets = counter_u64_alloc(flags)) == NULL) goto fail; if ((ifa->ifa_obytes = counter_u64_alloc(flags)) == NULL) goto fail; if ((ifa->ifa_ibytes = counter_u64_alloc(flags)) == NULL) goto fail; refcount_init(&ifa->ifa_refcnt, 1); return (ifa); fail: /* free(NULL) is okay */ counter_u64_free(ifa->ifa_opackets); counter_u64_free(ifa->ifa_ipackets); counter_u64_free(ifa->ifa_obytes); counter_u64_free(ifa->ifa_ibytes); free(ifa, M_IFADDR); return (NULL); } void ifa_ref(struct ifaddr *ifa) { refcount_acquire(&ifa->ifa_refcnt); } static void ifa_destroy(epoch_context_t ctx) { struct ifaddr *ifa; ifa = __containerof(ctx, struct ifaddr, ifa_epoch_ctx); counter_u64_free(ifa->ifa_opackets); counter_u64_free(ifa->ifa_ipackets); counter_u64_free(ifa->ifa_obytes); counter_u64_free(ifa->ifa_ibytes); free(ifa, M_IFADDR); } void ifa_free(struct ifaddr *ifa) { if (refcount_release(&ifa->ifa_refcnt)) NET_EPOCH_CALL(ifa_destroy, &ifa->ifa_epoch_ctx); } static int ifa_maintain_loopback_route(int cmd, const char *otype, struct ifaddr *ifa, struct sockaddr *ia) { struct epoch_tracker et; int error; struct rt_addrinfo info; struct sockaddr_dl null_sdl; struct ifnet *ifp; struct ifaddr *rti_ifa = NULL; ifp = ifa->ifa_ifp; bzero(&info, sizeof(info)); if (cmd != RTM_DELETE) info.rti_ifp = V_loif; if (cmd == RTM_ADD) { /* explicitly specify (loopback) ifa */ if (info.rti_ifp != NULL) { NET_EPOCH_ENTER(et); rti_ifa = ifaof_ifpforaddr(ifa->ifa_addr, info.rti_ifp); if (rti_ifa != NULL) ifa_ref(rti_ifa); info.rti_ifa = rti_ifa; NET_EPOCH_EXIT(et); } } info.rti_flags = ifa->ifa_flags | RTF_HOST | RTF_STATIC | RTF_PINNED; info.rti_info[RTAX_DST] = ia; info.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&null_sdl; link_init_sdl(ifp, (struct sockaddr *)&null_sdl, ifp->if_type); error = rtrequest1_fib(cmd, &info, NULL, ifp->if_fib); if (rti_ifa != NULL) ifa_free(rti_ifa); if (error == 0 || 
(cmd == RTM_ADD && error == EEXIST) || (cmd == RTM_DELETE && (error == ENOENT || error == ESRCH))) return (error); log(LOG_DEBUG, "%s: %s failed for interface %s: %u\n", __func__, otype, if_name(ifp), error); return (error); } int ifa_add_loopback_route(struct ifaddr *ifa, struct sockaddr *ia) { return (ifa_maintain_loopback_route(RTM_ADD, "insertion", ifa, ia)); } int ifa_del_loopback_route(struct ifaddr *ifa, struct sockaddr *ia) { return (ifa_maintain_loopback_route(RTM_DELETE, "deletion", ifa, ia)); } int ifa_switch_loopback_route(struct ifaddr *ifa, struct sockaddr *ia) { return (ifa_maintain_loopback_route(RTM_CHANGE, "switch", ifa, ia)); } /* * XXX: Because sockaddr_dl has deeper structure than the sockaddr * structs used to represent other address families, it is necessary * to perform a different comparison. */ #define sa_dl_equal(a1, a2) \ ((((const struct sockaddr_dl *)(a1))->sdl_len == \ ((const struct sockaddr_dl *)(a2))->sdl_len) && \ (bcmp(CLLADDR((const struct sockaddr_dl *)(a1)), \ CLLADDR((const struct sockaddr_dl *)(a2)), \ ((const struct sockaddr_dl *)(a1))->sdl_alen) == 0)) /* * Locate an interface based on a complete address. */ /*ARGSUSED*/ struct ifaddr * ifa_ifwithaddr(const struct sockaddr *addr) { struct ifnet *ifp; struct ifaddr *ifa; NET_EPOCH_ASSERT(); CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != addr->sa_family) continue; if (sa_equal(addr, ifa->ifa_addr)) { goto done; } /* IP6 doesn't have broadcast */ if ((ifp->if_flags & IFF_BROADCAST) && ifa->ifa_broadaddr && ifa->ifa_broadaddr->sa_len != 0 && sa_equal(ifa->ifa_broadaddr, addr)) { goto done; } } } ifa = NULL; done: return (ifa); } int ifa_ifwithaddr_check(const struct sockaddr *addr) { struct epoch_tracker et; int rc; NET_EPOCH_ENTER(et); rc = (ifa_ifwithaddr(addr) != NULL); NET_EPOCH_EXIT(et); return (rc); } /* * Locate an interface based on the broadcast address. */ /* ARGSUSED */ struct ifaddr * ifa_ifwithbroadaddr(const struct sockaddr *addr, int fibnum) { struct ifnet *ifp; struct ifaddr *ifa; NET_EPOCH_ASSERT(); CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { if ((fibnum != RT_ALL_FIBS) && (ifp->if_fib != fibnum)) continue; CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != addr->sa_family) continue; if ((ifp->if_flags & IFF_BROADCAST) && ifa->ifa_broadaddr && ifa->ifa_broadaddr->sa_len != 0 && sa_equal(ifa->ifa_broadaddr, addr)) { goto done; } } } ifa = NULL; done: return (ifa); } /* * Locate the point to point interface with a given destination address. */ /*ARGSUSED*/ struct ifaddr * ifa_ifwithdstaddr(const struct sockaddr *addr, int fibnum) { struct ifnet *ifp; struct ifaddr *ifa; NET_EPOCH_ASSERT(); CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { if ((ifp->if_flags & IFF_POINTOPOINT) == 0) continue; if ((fibnum != RT_ALL_FIBS) && (ifp->if_fib != fibnum)) continue; CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != addr->sa_family) continue; if (ifa->ifa_dstaddr != NULL && sa_equal(addr, ifa->ifa_dstaddr)) { goto done; } } } ifa = NULL; done: return (ifa); } /* * Find an interface on a specific network. If many, choice * is most specific found. 
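The matching rule described here is a byte-at-a-time masked comparison. Pulled out on its own it looks like the hypothetical helper below (example_prefix_match() is an invented name); it restates the inner loop of ifa_ifwithnet() that follows, where a candidate matches when no bit covered by the netmask differs.

#include <sys/param.h>
#include <sys/socket.h>

/*
 * Hypothetical restatement of the masked comparison used by
 * ifa_ifwithnet(): an address lies on an ifaddr's network when no bit
 * covered by the netmask differs.  Returns 1 on a match, 0 otherwise.
 */
static int
example_prefix_match(const struct sockaddr *addr, const struct sockaddr *net,
    const struct sockaddr *mask)
{
	const char *cp = addr->sa_data;
	const char *cp2 = net->sa_data;
	const char *cp3 = mask->sa_data;
	const char *cplim = (const char *)mask + mask->sa_len;

	while (cp3 < cplim)
		if ((*cp++ ^ *cp2++) & *cp3++)
			return (0);	/* a masked bit disagrees */
	return (1);
}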
*/ struct ifaddr * ifa_ifwithnet(const struct sockaddr *addr, int ignore_ptp, int fibnum) { struct ifnet *ifp; struct ifaddr *ifa; struct ifaddr *ifa_maybe = NULL; u_int af = addr->sa_family; const char *addr_data = addr->sa_data, *cplim; NET_EPOCH_ASSERT(); /* * AF_LINK addresses can be looked up directly by their index number, * so do that if we can. */ if (af == AF_LINK) { const struct sockaddr_dl *sdl = (const struct sockaddr_dl *)addr; if (sdl->sdl_index && sdl->sdl_index <= V_if_index) return (ifaddr_byindex(sdl->sdl_index)); } /* * Scan though each interface, looking for ones that have addresses * in this address family and the requested fib. */ CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { if ((fibnum != RT_ALL_FIBS) && (ifp->if_fib != fibnum)) continue; CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { const char *cp, *cp2, *cp3; if (ifa->ifa_addr->sa_family != af) next: continue; if (af == AF_INET && ifp->if_flags & IFF_POINTOPOINT && !ignore_ptp) { /* * This is a bit broken as it doesn't * take into account that the remote end may * be a single node in the network we are * looking for. * The trouble is that we don't know the * netmask for the remote end. */ if (ifa->ifa_dstaddr != NULL && sa_equal(addr, ifa->ifa_dstaddr)) { goto done; } } else { /* * Scan all the bits in the ifa's address. * If a bit dissagrees with what we are * looking for, mask it with the netmask * to see if it really matters. * (A byte at a time) */ if (ifa->ifa_netmask == 0) continue; cp = addr_data; cp2 = ifa->ifa_addr->sa_data; cp3 = ifa->ifa_netmask->sa_data; cplim = ifa->ifa_netmask->sa_len + (char *)ifa->ifa_netmask; while (cp3 < cplim) if ((*cp++ ^ *cp2++) & *cp3++) goto next; /* next address! */ /* * If the netmask of what we just found * is more specific than what we had before * (if we had one), or if the virtual status * of new prefix is better than of the old one, * then remember the new one before continuing * to search for an even better one. */ if (ifa_maybe == NULL || ifa_preferred(ifa_maybe, ifa) || rn_refines((caddr_t)ifa->ifa_netmask, (caddr_t)ifa_maybe->ifa_netmask)) { ifa_maybe = ifa; } } } } ifa = ifa_maybe; ifa_maybe = NULL; done: return (ifa); } /* * Find an interface address specific to an interface best matching * a given address. */ struct ifaddr * ifaof_ifpforaddr(const struct sockaddr *addr, struct ifnet *ifp) { struct ifaddr *ifa; const char *cp, *cp2, *cp3; char *cplim; struct ifaddr *ifa_maybe = NULL; u_int af = addr->sa_family; if (af >= AF_MAX) return (NULL); NET_EPOCH_ASSERT(); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != af) continue; if (ifa_maybe == NULL) ifa_maybe = ifa; if (ifa->ifa_netmask == 0) { if (sa_equal(addr, ifa->ifa_addr) || (ifa->ifa_dstaddr && sa_equal(addr, ifa->ifa_dstaddr))) goto done; continue; } if (ifp->if_flags & IFF_POINTOPOINT) { if (sa_equal(addr, ifa->ifa_dstaddr)) goto done; } else { cp = addr->sa_data; cp2 = ifa->ifa_addr->sa_data; cp3 = ifa->ifa_netmask->sa_data; cplim = ifa->ifa_netmask->sa_len + (char *)ifa->ifa_netmask; for (; cp3 < cplim; cp3++) if ((*cp++ ^ *cp2++) & *cp3) break; if (cp3 == cplim) goto done; } } ifa = ifa_maybe; done: return (ifa); } /* * See whether new ifa is better than current one: * 1) A non-virtual one is preferred over virtual. * 2) A virtual in master state preferred over any other state. * * Used in several address selecting functions. 
*/ int ifa_preferred(struct ifaddr *cur, struct ifaddr *next) { return (cur->ifa_carp && (!next->ifa_carp || ((*carp_master_p)(next) && !(*carp_master_p)(cur)))); } struct sockaddr_dl * link_alloc_sdl(size_t size, int flags) { return (malloc(size, M_TEMP, flags)); } void link_free_sdl(struct sockaddr *sa) { free(sa, M_TEMP); } /* * Fills in given sdl with interface basic info. * Returns pointer to filled sdl. */ struct sockaddr_dl * link_init_sdl(struct ifnet *ifp, struct sockaddr *paddr, u_char iftype) { struct sockaddr_dl *sdl; sdl = (struct sockaddr_dl *)paddr; memset(sdl, 0, sizeof(struct sockaddr_dl)); sdl->sdl_len = sizeof(struct sockaddr_dl); sdl->sdl_family = AF_LINK; sdl->sdl_index = ifp->if_index; sdl->sdl_type = iftype; return (sdl); } /* * Mark an interface down and notify protocols of * the transition. */ static void if_unroute(struct ifnet *ifp, int flag, int fam) { struct ifaddr *ifa; KASSERT(flag == IFF_UP, ("if_unroute: flag != IFF_UP")); ifp->if_flags &= ~flag; getmicrotime(&ifp->if_lastchange); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) if (fam == PF_UNSPEC || (fam == ifa->ifa_addr->sa_family)) pfctlinput(PRC_IFDOWN, ifa->ifa_addr); ifp->if_qflush(ifp); if (ifp->if_carp) (*carp_linkstate_p)(ifp); rt_ifmsg(ifp); } /* * Mark an interface up and notify protocols of * the transition. */ static void if_route(struct ifnet *ifp, int flag, int fam) { struct ifaddr *ifa; KASSERT(flag == IFF_UP, ("if_route: flag != IFF_UP")); ifp->if_flags |= flag; getmicrotime(&ifp->if_lastchange); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) if (fam == PF_UNSPEC || (fam == ifa->ifa_addr->sa_family)) pfctlinput(PRC_IFUP, ifa->ifa_addr); if (ifp->if_carp) (*carp_linkstate_p)(ifp); rt_ifmsg(ifp); #ifdef INET6 in6_if_up(ifp); #endif } void (*vlan_link_state_p)(struct ifnet *); /* XXX: private from if_vlan */ void (*vlan_trunk_cap_p)(struct ifnet *); /* XXX: private from if_vlan */ struct ifnet *(*vlan_trunkdev_p)(struct ifnet *); struct ifnet *(*vlan_devat_p)(struct ifnet *, uint16_t); int (*vlan_tag_p)(struct ifnet *, uint16_t *); int (*vlan_pcp_p)(struct ifnet *, uint16_t *); int (*vlan_setcookie_p)(struct ifnet *, void *); void *(*vlan_cookie_p)(struct ifnet *); /* * Handle a change in the interface link state. To avoid LORs * between driver lock and upper layer locks, as well as possible * recursions, we post event to taskqueue, and all job * is done in static do_link_state_change(). */ void if_link_state_change(struct ifnet *ifp, int link_state) { /* Return if state hasn't changed. */ if (ifp->if_link_state == link_state) return; ifp->if_link_state = link_state; /* XXXGL: reference ifp? */ taskqueue_enqueue(taskqueue_swi, &ifp->if_linktask); } static void do_link_state_change(void *arg, int pending) { struct ifnet *ifp; int link_state; ifp = arg; link_state = ifp->if_link_state; CURVNET_SET(ifp->if_vnet); rt_ifmsg(ifp); if (ifp->if_vlantrunk != NULL) (*vlan_link_state_p)(ifp); if ((ifp->if_type == IFT_ETHER || ifp->if_type == IFT_L2VLAN) && ifp->if_l2com != NULL) (*ng_ether_link_state_p)(ifp, link_state); if (ifp->if_carp) (*carp_linkstate_p)(ifp); if (ifp->if_bridge) ifp->if_bridge_linkstate(ifp); if (ifp->if_lagg) (*lagg_linkstate_p)(ifp, link_state); if (IS_DEFAULT_VNET(curvnet)) devctl_notify("IFNET", ifp->if_xname, (link_state == LINK_STATE_UP) ? "LINK_UP" : "LINK_DOWN", NULL); if (pending > 1) if_printf(ifp, "%d link states coalesced\n", pending); if (log_link_state_change) if_printf(ifp, "link state changed to %s\n", (link_state == LINK_STATE_UP) ? 
"UP" : "DOWN" ); EVENTHANDLER_INVOKE(ifnet_link_event, ifp, link_state); CURVNET_RESTORE(); } /* * Mark an interface down and notify protocols of * the transition. */ void if_down(struct ifnet *ifp) { EVENTHANDLER_INVOKE(ifnet_event, ifp, IFNET_EVENT_DOWN); if_unroute(ifp, IFF_UP, AF_UNSPEC); } /* * Mark an interface up and notify protocols of * the transition. */ void if_up(struct ifnet *ifp) { if_route(ifp, IFF_UP, AF_UNSPEC); EVENTHANDLER_INVOKE(ifnet_event, ifp, IFNET_EVENT_UP); } /* * Flush an interface queue. */ void if_qflush(struct ifnet *ifp) { struct mbuf *m, *n; struct ifaltq *ifq; ifq = &ifp->if_snd; IFQ_LOCK(ifq); #ifdef ALTQ if (ALTQ_IS_ENABLED(ifq)) ALTQ_PURGE(ifq); #endif n = ifq->ifq_head; while ((m = n) != NULL) { n = m->m_nextpkt; m_freem(m); } ifq->ifq_head = 0; ifq->ifq_tail = 0; ifq->ifq_len = 0; IFQ_UNLOCK(ifq); } /* * Map interface name to interface structure pointer, with or without * returning a reference. */ struct ifnet * ifunit_ref(const char *name) { struct epoch_tracker et; struct ifnet *ifp; NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (strncmp(name, ifp->if_xname, IFNAMSIZ) == 0 && !(ifp->if_flags & IFF_DYING)) break; } if (ifp != NULL) if_ref(ifp); NET_EPOCH_EXIT(et); return (ifp); } struct ifnet * ifunit(const char *name) { struct epoch_tracker et; struct ifnet *ifp; NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (strncmp(name, ifp->if_xname, IFNAMSIZ) == 0) break; } NET_EPOCH_EXIT(et); return (ifp); } static void * ifr_buffer_get_buffer(void *data) { union ifreq_union *ifrup; ifrup = data; #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) return ((void *)(uintptr_t) ifrup->ifr32.ifr_ifru.ifru_buffer.buffer); #endif return (ifrup->ifr.ifr_ifru.ifru_buffer.buffer); } static void ifr_buffer_set_buffer_null(void *data) { union ifreq_union *ifrup; ifrup = data; #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) ifrup->ifr32.ifr_ifru.ifru_buffer.buffer = 0; else #endif ifrup->ifr.ifr_ifru.ifru_buffer.buffer = NULL; } static size_t ifr_buffer_get_length(void *data) { union ifreq_union *ifrup; ifrup = data; #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) return (ifrup->ifr32.ifr_ifru.ifru_buffer.length); #endif return (ifrup->ifr.ifr_ifru.ifru_buffer.length); } static void ifr_buffer_set_length(void *data, size_t len) { union ifreq_union *ifrup; ifrup = data; #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) ifrup->ifr32.ifr_ifru.ifru_buffer.length = len; else #endif ifrup->ifr.ifr_ifru.ifru_buffer.length = len; } void * ifr_data_get_ptr(void *ifrp) { union ifreq_union *ifrup; ifrup = ifrp; #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) return ((void *)(uintptr_t) ifrup->ifr32.ifr_ifru.ifru_data); #endif return (ifrup->ifr.ifr_ifru.ifru_data); } /* * Hardware specific interface ioctls. 
*/ int ifhwioctl(u_long cmd, struct ifnet *ifp, caddr_t data, struct thread *td) { struct ifreq *ifr; int error = 0, do_ifup = 0; int new_flags, temp_flags; size_t namelen, onamelen; size_t descrlen; char *descrbuf, *odescrbuf; char new_name[IFNAMSIZ]; struct ifaddr *ifa; struct sockaddr_dl *sdl; ifr = (struct ifreq *)data; switch (cmd) { case SIOCGIFINDEX: ifr->ifr_index = ifp->if_index; break; case SIOCGIFFLAGS: temp_flags = ifp->if_flags | ifp->if_drv_flags; ifr->ifr_flags = temp_flags & 0xffff; ifr->ifr_flagshigh = temp_flags >> 16; break; case SIOCGIFCAP: ifr->ifr_reqcap = ifp->if_capabilities; ifr->ifr_curcap = ifp->if_capenable; break; #ifdef MAC case SIOCGIFMAC: error = mac_ifnet_ioctl_get(td->td_ucred, ifr, ifp); break; #endif case SIOCGIFMETRIC: ifr->ifr_metric = ifp->if_metric; break; case SIOCGIFMTU: ifr->ifr_mtu = ifp->if_mtu; break; case SIOCGIFPHYS: /* XXXGL: did this ever worked? */ ifr->ifr_phys = 0; break; case SIOCGIFDESCR: error = 0; sx_slock(&ifdescr_sx); if (ifp->if_description == NULL) error = ENOMSG; else { /* space for terminating nul */ descrlen = strlen(ifp->if_description) + 1; if (ifr_buffer_get_length(ifr) < descrlen) ifr_buffer_set_buffer_null(ifr); else error = copyout(ifp->if_description, ifr_buffer_get_buffer(ifr), descrlen); ifr_buffer_set_length(ifr, descrlen); } sx_sunlock(&ifdescr_sx); break; case SIOCSIFDESCR: error = priv_check(td, PRIV_NET_SETIFDESCR); if (error) return (error); /* * Copy only (length-1) bytes to make sure that * if_description is always nul terminated. The * length parameter is supposed to count the * terminating nul in. */ if (ifr_buffer_get_length(ifr) > ifdescr_maxlen) return (ENAMETOOLONG); else if (ifr_buffer_get_length(ifr) == 0) descrbuf = NULL; else { descrbuf = malloc(ifr_buffer_get_length(ifr), M_IFDESCR, M_WAITOK | M_ZERO); error = copyin(ifr_buffer_get_buffer(ifr), descrbuf, ifr_buffer_get_length(ifr) - 1); if (error) { free(descrbuf, M_IFDESCR); break; } } sx_xlock(&ifdescr_sx); odescrbuf = ifp->if_description; ifp->if_description = descrbuf; sx_xunlock(&ifdescr_sx); getmicrotime(&ifp->if_lastchange); free(odescrbuf, M_IFDESCR); break; case SIOCGIFFIB: ifr->ifr_fib = ifp->if_fib; break; case SIOCSIFFIB: error = priv_check(td, PRIV_NET_SETIFFIB); if (error) return (error); if (ifr->ifr_fib >= rt_numfibs) return (EINVAL); ifp->if_fib = ifr->ifr_fib; break; case SIOCSIFFLAGS: error = priv_check(td, PRIV_NET_SETIFFLAGS); if (error) return (error); /* * Currently, no driver owned flags pass the IFF_CANTCHANGE * check, so we don't need special handling here yet. */ new_flags = (ifr->ifr_flags & 0xffff) | (ifr->ifr_flagshigh << 16); if (ifp->if_flags & IFF_UP && (new_flags & IFF_UP) == 0) { if_down(ifp); } else if (new_flags & IFF_UP && (ifp->if_flags & IFF_UP) == 0) { do_ifup = 1; } /* See if permanently promiscuous mode bit is about to flip */ if ((ifp->if_flags ^ new_flags) & IFF_PPROMISC) { if (new_flags & IFF_PPROMISC) ifp->if_flags |= IFF_PROMISC; else if (ifp->if_pcount == 0) ifp->if_flags &= ~IFF_PROMISC; if (log_promisc_mode_change) if_printf(ifp, "permanently promiscuous mode %s\n", ((new_flags & IFF_PPROMISC) ? 
"enabled" : "disabled")); } ifp->if_flags = (ifp->if_flags & IFF_CANTCHANGE) | (new_flags &~ IFF_CANTCHANGE); if (ifp->if_ioctl) { (void) (*ifp->if_ioctl)(ifp, cmd, data); } if (do_ifup) if_up(ifp); getmicrotime(&ifp->if_lastchange); break; case SIOCSIFCAP: error = priv_check(td, PRIV_NET_SETIFCAP); if (error) return (error); if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); if (ifr->ifr_reqcap & ~ifp->if_capabilities) return (EINVAL); error = (*ifp->if_ioctl)(ifp, cmd, data); if (error == 0) getmicrotime(&ifp->if_lastchange); break; #ifdef MAC case SIOCSIFMAC: error = mac_ifnet_ioctl_set(td->td_ucred, ifr, ifp); break; #endif case SIOCSIFNAME: error = priv_check(td, PRIV_NET_SETIFNAME); if (error) return (error); error = copyinstr(ifr_data_get_ptr(ifr), new_name, IFNAMSIZ, NULL); if (error != 0) return (error); if (new_name[0] == '\0') return (EINVAL); if (new_name[IFNAMSIZ-1] != '\0') { new_name[IFNAMSIZ-1] = '\0'; if (strlen(new_name) == IFNAMSIZ-1) return (EINVAL); } if (strcmp(new_name, ifp->if_xname) == 0) break; if (ifunit(new_name) != NULL) return (EEXIST); /* * XXX: Locking. Nothing else seems to lock if_flags, * and there are numerous other races with the * ifunit() checks not being atomic with namespace * changes (renames, vmoves, if_attach, etc). */ ifp->if_flags |= IFF_RENAMING; /* Announce the departure of the interface. */ rt_ifannouncemsg(ifp, IFAN_DEPARTURE); EVENTHANDLER_INVOKE(ifnet_departure_event, ifp); if_printf(ifp, "changing name to '%s'\n", new_name); IF_ADDR_WLOCK(ifp); strlcpy(ifp->if_xname, new_name, sizeof(ifp->if_xname)); ifa = ifp->if_addr; sdl = (struct sockaddr_dl *)ifa->ifa_addr; namelen = strlen(new_name); onamelen = sdl->sdl_nlen; /* * Move the address if needed. This is safe because we * allocate space for a name of length IFNAMSIZ when we * create this in if_attach(). */ if (namelen != onamelen) { bcopy(sdl->sdl_data + onamelen, sdl->sdl_data + namelen, sdl->sdl_alen); } bcopy(new_name, sdl->sdl_data, namelen); sdl->sdl_nlen = namelen; sdl = (struct sockaddr_dl *)ifa->ifa_netmask; bzero(sdl->sdl_data, onamelen); while (namelen != 0) sdl->sdl_data[--namelen] = 0xff; IF_ADDR_WUNLOCK(ifp); EVENTHANDLER_INVOKE(ifnet_arrival_event, ifp); /* Announce the return of the interface. */ rt_ifannouncemsg(ifp, IFAN_ARRIVAL); ifp->if_flags &= ~IFF_RENAMING; break; #ifdef VIMAGE case SIOCSIFVNET: error = priv_check(td, PRIV_NET_SETIFVNET); if (error) return (error); error = if_vmove_loan(td, ifp, ifr->ifr_name, ifr->ifr_jid); break; #endif case SIOCSIFMETRIC: error = priv_check(td, PRIV_NET_SETIFMETRIC); if (error) return (error); ifp->if_metric = ifr->ifr_metric; getmicrotime(&ifp->if_lastchange); break; case SIOCSIFPHYS: error = priv_check(td, PRIV_NET_SETIFPHYS); if (error) return (error); if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); error = (*ifp->if_ioctl)(ifp, cmd, data); if (error == 0) getmicrotime(&ifp->if_lastchange); break; case SIOCSIFMTU: { u_long oldmtu = ifp->if_mtu; error = priv_check(td, PRIV_NET_SETIFMTU); if (error) return (error); if (ifr->ifr_mtu < IF_MINMTU || ifr->ifr_mtu > IF_MAXMTU) return (EINVAL); if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); error = (*ifp->if_ioctl)(ifp, cmd, data); if (error == 0) { getmicrotime(&ifp->if_lastchange); rt_ifmsg(ifp); #ifdef INET DEBUGNET_NOTIFY_MTU(ifp); #endif } /* * If the link MTU changed, do network layer specific procedure. 
*/ if (ifp->if_mtu != oldmtu) { #ifdef INET6 nd6_setmtu(ifp); #endif rt_updatemtu(ifp); } break; } case SIOCADDMULTI: case SIOCDELMULTI: if (cmd == SIOCADDMULTI) error = priv_check(td, PRIV_NET_ADDMULTI); else error = priv_check(td, PRIV_NET_DELMULTI); if (error) return (error); /* Don't allow group membership on non-multicast interfaces. */ if ((ifp->if_flags & IFF_MULTICAST) == 0) return (EOPNOTSUPP); /* Don't let users screw up protocols' entries. */ if (ifr->ifr_addr.sa_family != AF_LINK) return (EINVAL); if (cmd == SIOCADDMULTI) { struct epoch_tracker et; struct ifmultiaddr *ifma; /* * Userland is only permitted to join groups once * via the if_addmulti() KPI, because it cannot hold * struct ifmultiaddr * between calls. It may also * lose a race while we check if the membership * already exists. */ NET_EPOCH_ENTER(et); ifma = if_findmulti(ifp, &ifr->ifr_addr); NET_EPOCH_EXIT(et); if (ifma != NULL) error = EADDRINUSE; else error = if_addmulti(ifp, &ifr->ifr_addr, &ifma); } else { error = if_delmulti(ifp, &ifr->ifr_addr); } if (error == 0) getmicrotime(&ifp->if_lastchange); break; case SIOCSIFPHYADDR: case SIOCDIFPHYADDR: #ifdef INET6 case SIOCSIFPHYADDR_IN6: #endif case SIOCSIFMEDIA: case SIOCSIFGENERIC: error = priv_check(td, PRIV_NET_HWIOCTL); if (error) return (error); if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); error = (*ifp->if_ioctl)(ifp, cmd, data); if (error == 0) getmicrotime(&ifp->if_lastchange); break; case SIOCGIFSTATUS: case SIOCGIFPSRCADDR: case SIOCGIFPDSTADDR: case SIOCGIFMEDIA: case SIOCGIFXMEDIA: case SIOCGIFGENERIC: case SIOCGIFRSSKEY: case SIOCGIFRSSHASH: case SIOCGIFDOWNREASON: if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); error = (*ifp->if_ioctl)(ifp, cmd, data); break; case SIOCSIFLLADDR: error = priv_check(td, PRIV_NET_SETLLADDR); if (error) return (error); error = if_setlladdr(ifp, ifr->ifr_addr.sa_data, ifr->ifr_addr.sa_len); break; case SIOCGHWADDR: error = if_gethwaddr(ifp, ifr); break; case CASE_IOC_IFGROUPREQ(SIOCAIFGROUP): error = priv_check(td, PRIV_NET_ADDIFGROUP); if (error) return (error); if ((error = if_addgroup(ifp, ifgr_group_get((struct ifgroupreq *)data)))) return (error); break; case CASE_IOC_IFGROUPREQ(SIOCGIFGROUP): { struct epoch_tracker et; NET_EPOCH_ENTER(et); error = if_getgroup((struct ifgroupreq *)data, ifp); NET_EPOCH_EXIT(et); break; } case CASE_IOC_IFGROUPREQ(SIOCDIFGROUP): error = priv_check(td, PRIV_NET_DELIFGROUP); if (error) return (error); if ((error = if_delgroup(ifp, ifgr_group_get((struct ifgroupreq *)data)))) return (error); break; default: error = ENOIOCTL; break; } return (error); } #ifdef COMPAT_FREEBSD32 struct ifconf32 { int32_t ifc_len; union { uint32_t ifcu_buf; uint32_t ifcu_req; } ifc_ifcu; }; #define SIOCGIFCONF32 _IOWR('i', 36, struct ifconf32) #endif #ifdef COMPAT_FREEBSD32 static void ifmr_init(struct ifmediareq *ifmr, caddr_t data) { struct ifmediareq32 *ifmr32; ifmr32 = (struct ifmediareq32 *)data; memcpy(ifmr->ifm_name, ifmr32->ifm_name, sizeof(ifmr->ifm_name)); ifmr->ifm_current = ifmr32->ifm_current; ifmr->ifm_mask = ifmr32->ifm_mask; ifmr->ifm_status = ifmr32->ifm_status; ifmr->ifm_active = ifmr32->ifm_active; ifmr->ifm_count = ifmr32->ifm_count; ifmr->ifm_ulist = (int *)(uintptr_t)ifmr32->ifm_ulist; } static void ifmr_update(const struct ifmediareq *ifmr, caddr_t data) { struct ifmediareq32 *ifmr32; ifmr32 = (struct ifmediareq32 *)data; ifmr32->ifm_current = ifmr->ifm_current; ifmr32->ifm_mask = ifmr->ifm_mask; ifmr32->ifm_status = ifmr->ifm_status; ifmr32->ifm_active = ifmr->ifm_active; 
ifmr32->ifm_count = ifmr->ifm_count; } #endif /* * Interface ioctls. */ int ifioctl(struct socket *so, u_long cmd, caddr_t data, struct thread *td) { #ifdef COMPAT_FREEBSD32 caddr_t saved_data = NULL; struct ifmediareq ifmr; struct ifmediareq *ifmrp = NULL; #endif struct ifnet *ifp; struct ifreq *ifr; int error; int oif_flags; #ifdef VIMAGE bool shutdown; #endif CURVNET_SET(so->so_vnet); #ifdef VIMAGE /* Make sure the VNET is stable. */ shutdown = VNET_IS_SHUTTING_DOWN(so->so_vnet); if (shutdown) { CURVNET_RESTORE(); return (EBUSY); } #endif switch (cmd) { case SIOCGIFCONF: error = ifconf(cmd, data); goto out_noref; #ifdef COMPAT_FREEBSD32 case SIOCGIFCONF32: { struct ifconf32 *ifc32; struct ifconf ifc; ifc32 = (struct ifconf32 *)data; ifc.ifc_len = ifc32->ifc_len; ifc.ifc_buf = PTRIN(ifc32->ifc_buf); error = ifconf(SIOCGIFCONF, (void *)&ifc); if (error == 0) ifc32->ifc_len = ifc.ifc_len; goto out_noref; } #endif } #ifdef COMPAT_FREEBSD32 switch (cmd) { case SIOCGIFMEDIA32: case SIOCGIFXMEDIA32: ifmrp = &ifmr; ifmr_init(ifmrp, data); cmd = _IOC_NEWTYPE(cmd, struct ifmediareq); saved_data = data; data = (caddr_t)ifmrp; } #endif ifr = (struct ifreq *)data; switch (cmd) { #ifdef VIMAGE case SIOCSIFRVNET: error = priv_check(td, PRIV_NET_SETIFVNET); if (error == 0) error = if_vmove_reclaim(td, ifr->ifr_name, ifr->ifr_jid); goto out_noref; #endif case SIOCIFCREATE: case SIOCIFCREATE2: error = priv_check(td, PRIV_NET_IFCREATE); if (error == 0) error = if_clone_create(ifr->ifr_name, sizeof(ifr->ifr_name), cmd == SIOCIFCREATE2 ? ifr_data_get_ptr(ifr) : NULL); goto out_noref; case SIOCIFDESTROY: error = priv_check(td, PRIV_NET_IFDESTROY); if (error == 0) error = if_clone_destroy(ifr->ifr_name); goto out_noref; case SIOCIFGCLONERS: error = if_clone_list((struct if_clonereq *)data); goto out_noref; case CASE_IOC_IFGROUPREQ(SIOCGIFGMEMB): error = if_getgroupmembers((struct ifgroupreq *)data); goto out_noref; #if defined(INET) || defined(INET6) case SIOCSVH: case SIOCGVH: if (carp_ioctl_p == NULL) error = EPROTONOSUPPORT; else error = (*carp_ioctl_p)(ifr, cmd, td); goto out_noref; #endif } ifp = ifunit_ref(ifr->ifr_name); if (ifp == NULL) { error = ENXIO; goto out_noref; } error = ifhwioctl(cmd, ifp, data, td); if (error != ENOIOCTL) goto out_ref; oif_flags = ifp->if_flags; if (so->so_proto == NULL) { error = EOPNOTSUPP; goto out_ref; } /* * Pass the request on to the socket control method, and if the * latter returns EOPNOTSUPP, directly to the interface. * * Make an exception for the legacy SIOCSIF* requests. Drivers * trust SIOCSIFADDR et al to come from an already privileged * layer, and do not perform any credentials checks or input * validation. */ error = ((*so->so_proto->pr_usrreqs->pru_control)(so, cmd, data, ifp, td)); if (error == EOPNOTSUPP && ifp != NULL && ifp->if_ioctl != NULL && cmd != SIOCSIFADDR && cmd != SIOCSIFBRDADDR && cmd != SIOCSIFDSTADDR && cmd != SIOCSIFNETMASK) error = (*ifp->if_ioctl)(ifp, cmd, data); if ((oif_flags ^ ifp->if_flags) & IFF_UP) { #ifdef INET6 if (ifp->if_flags & IFF_UP) in6_if_up(ifp); #endif } out_ref: if_rele(ifp); out_noref: #ifdef COMPAT_FREEBSD32 if (ifmrp != NULL) { KASSERT((cmd == SIOCGIFMEDIA || cmd == SIOCGIFXMEDIA), ("ifmrp non-NULL, but cmd is not an ifmedia req 0x%lx", cmd)); data = saved_data; ifmr_update(ifmrp, data); } #endif CURVNET_RESTORE(); return (error); } /* * The code common to handling reference counted flags, * e.g., in ifpromisc() and if_allmulti(). 
* The "pflag" argument can specify a permanent mode flag to check, * such as IFF_PPROMISC for promiscuous mode; should be 0 if none. * * Only to be used on stack-owned flags, not driver-owned flags. */ static int if_setflag(struct ifnet *ifp, int flag, int pflag, int *refcount, int onswitch) { struct ifreq ifr; int error; int oldflags, oldcount; /* Sanity checks to catch programming errors */ KASSERT((flag & (IFF_DRV_OACTIVE|IFF_DRV_RUNNING)) == 0, ("%s: setting driver-owned flag %d", __func__, flag)); if (onswitch) KASSERT(*refcount >= 0, ("%s: increment negative refcount %d for flag %d", __func__, *refcount, flag)); else KASSERT(*refcount > 0, ("%s: decrement non-positive refcount %d for flag %d", __func__, *refcount, flag)); /* In case this mode is permanent, just touch refcount */ if (ifp->if_flags & pflag) { *refcount += onswitch ? 1 : -1; return (0); } /* Save ifnet parameters for if_ioctl() may fail */ oldcount = *refcount; oldflags = ifp->if_flags; /* * See if we aren't the only and touching refcount is enough. * Actually toggle interface flag if we are the first or last. */ if (onswitch) { if ((*refcount)++) return (0); ifp->if_flags |= flag; } else { if (--(*refcount)) return (0); ifp->if_flags &= ~flag; } /* Call down the driver since we've changed interface flags */ if (ifp->if_ioctl == NULL) { error = EOPNOTSUPP; goto recover; } ifr.ifr_flags = ifp->if_flags & 0xffff; ifr.ifr_flagshigh = ifp->if_flags >> 16; error = (*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t)&ifr); if (error) goto recover; /* Notify userland that interface flags have changed */ rt_ifmsg(ifp); return (0); recover: /* Recover after driver error */ *refcount = oldcount; ifp->if_flags = oldflags; return (error); } /* * Set/clear promiscuous mode on interface ifp based on the truth value * of pswitch. The calls are reference counted so that only the first * "on" request actually has an effect, as does the final "off" request. * Results are undefined if the "off" and "on" requests are not matched. */ int ifpromisc(struct ifnet *ifp, int pswitch) { int error; int oldflags = ifp->if_flags; error = if_setflag(ifp, IFF_PROMISC, IFF_PPROMISC, &ifp->if_pcount, pswitch); /* If promiscuous mode status has changed, log a message */ if (error == 0 && ((ifp->if_flags ^ oldflags) & IFF_PROMISC) && log_promisc_mode_change) if_printf(ifp, "promiscuous mode %s\n", (ifp->if_flags & IFF_PROMISC) ? "enabled" : "disabled"); return (error); } /* * Return interface configuration * of system. List may be used * in later ioctl's (above) to get * other information. */ /*ARGSUSED*/ static int ifconf(u_long cmd, caddr_t data) { struct ifconf *ifc = (struct ifconf *)data; struct ifnet *ifp; struct ifaddr *ifa; struct ifreq ifr; struct sbuf *sb; int error, full = 0, valid_len, max_len; /* Limit initial buffer size to MAXPHYS to avoid DoS from userspace. */ max_len = MAXPHYS - 1; /* Prevent hostile input from being able to crash the system */ if (ifc->ifc_len <= 0) return (EINVAL); again: if (ifc->ifc_len <= max_len) { max_len = ifc->ifc_len; full = 1; } sb = sbuf_new(NULL, NULL, max_len + 1, SBUF_FIXEDLEN); max_len = 0; valid_len = 0; IFNET_RLOCK(); CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { struct epoch_tracker et; int addrs; /* * Zero the ifr to make sure we don't disclose the contents * of the stack. 
*/ memset(&ifr, 0, sizeof(ifr)); if (strlcpy(ifr.ifr_name, ifp->if_xname, sizeof(ifr.ifr_name)) >= sizeof(ifr.ifr_name)) { sbuf_delete(sb); IFNET_RUNLOCK(); return (ENAMETOOLONG); } addrs = 0; NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { struct sockaddr *sa = ifa->ifa_addr; if (prison_if(curthread->td_ucred, sa) != 0) continue; addrs++; if (sa->sa_len <= sizeof(*sa)) { if (sa->sa_len < sizeof(*sa)) { memset(&ifr.ifr_ifru.ifru_addr, 0, sizeof(ifr.ifr_ifru.ifru_addr)); memcpy(&ifr.ifr_ifru.ifru_addr, sa, sa->sa_len); } else ifr.ifr_ifru.ifru_addr = *sa; sbuf_bcat(sb, &ifr, sizeof(ifr)); max_len += sizeof(ifr); } else { sbuf_bcat(sb, &ifr, offsetof(struct ifreq, ifr_addr)); max_len += offsetof(struct ifreq, ifr_addr); sbuf_bcat(sb, sa, sa->sa_len); max_len += sa->sa_len; } if (sbuf_error(sb) == 0) valid_len = sbuf_len(sb); } NET_EPOCH_EXIT(et); if (addrs == 0) { sbuf_bcat(sb, &ifr, sizeof(ifr)); max_len += sizeof(ifr); if (sbuf_error(sb) == 0) valid_len = sbuf_len(sb); } } IFNET_RUNLOCK(); /* * If we didn't allocate enough space (uncommon), try again. If * we have already allocated as much space as we are allowed, * return what we've got. */ if (valid_len != max_len && !full) { sbuf_delete(sb); goto again; } ifc->ifc_len = valid_len; sbuf_finish(sb); error = copyout(sbuf_data(sb), ifc->ifc_req, ifc->ifc_len); sbuf_delete(sb); return (error); } /* * Just like ifpromisc(), but for all-multicast-reception mode. */ int if_allmulti(struct ifnet *ifp, int onswitch) { return (if_setflag(ifp, IFF_ALLMULTI, 0, &ifp->if_amcount, onswitch)); } struct ifmultiaddr * if_findmulti(struct ifnet *ifp, const struct sockaddr *sa) { struct ifmultiaddr *ifma; IF_ADDR_LOCK_ASSERT(ifp); CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (sa->sa_family == AF_LINK) { if (sa_dl_equal(ifma->ifma_addr, sa)) break; } else { if (sa_equal(ifma->ifma_addr, sa)) break; } } return ifma; } /* * Allocate a new ifmultiaddr and initialize based on passed arguments. We * make copies of passed sockaddrs. The ifmultiaddr will not be added to * the ifnet multicast address list here, so the caller must do that and * other setup work (such as notifying the device driver). The reference * count is initialized to 1. */ static struct ifmultiaddr * if_allocmulti(struct ifnet *ifp, struct sockaddr *sa, struct sockaddr *llsa, int mflags) { struct ifmultiaddr *ifma; struct sockaddr *dupsa; ifma = malloc(sizeof *ifma, M_IFMADDR, mflags | M_ZERO); if (ifma == NULL) return (NULL); dupsa = malloc(sa->sa_len, M_IFMADDR, mflags); if (dupsa == NULL) { free(ifma, M_IFMADDR); return (NULL); } bcopy(sa, dupsa, sa->sa_len); ifma->ifma_addr = dupsa; ifma->ifma_ifp = ifp; ifma->ifma_refcount = 1; ifma->ifma_protospec = NULL; if (llsa == NULL) { ifma->ifma_lladdr = NULL; return (ifma); } dupsa = malloc(llsa->sa_len, M_IFMADDR, mflags); if (dupsa == NULL) { free(ifma->ifma_addr, M_IFMADDR); free(ifma, M_IFMADDR); return (NULL); } bcopy(llsa, dupsa, llsa->sa_len); ifma->ifma_lladdr = dupsa; return (ifma); } /* * if_freemulti: free ifmultiaddr structure and possibly attached related * addresses. The caller is responsible for implementing reference * counting, notifying the driver, handling routing messages, and releasing * any dependent link layer state. 
*/ #ifdef MCAST_VERBOSE extern void kdb_backtrace(void); #endif static void if_freemulti_internal(struct ifmultiaddr *ifma) { KASSERT(ifma->ifma_refcount == 0, ("if_freemulti: refcount %d", ifma->ifma_refcount)); if (ifma->ifma_lladdr != NULL) free(ifma->ifma_lladdr, M_IFMADDR); #ifdef MCAST_VERBOSE kdb_backtrace(); printf("%s freeing ifma: %p\n", __func__, ifma); #endif free(ifma->ifma_addr, M_IFMADDR); free(ifma, M_IFMADDR); } static void if_destroymulti(epoch_context_t ctx) { struct ifmultiaddr *ifma; ifma = __containerof(ctx, struct ifmultiaddr, ifma_epoch_ctx); if_freemulti_internal(ifma); } void if_freemulti(struct ifmultiaddr *ifma) { KASSERT(ifma->ifma_refcount == 0, ("if_freemulti_epoch: refcount %d", ifma->ifma_refcount)); NET_EPOCH_CALL(if_destroymulti, &ifma->ifma_epoch_ctx); } /* * Register an additional multicast address with a network interface. * * - If the address is already present, bump the reference count on the * address and return. * - If the address is not link-layer, look up a link layer address. * - Allocate address structures for one or both addresses, and attach to the * multicast address list on the interface. If automatically adding a link * layer address, the protocol address will own a reference to the link * layer address, to be freed when it is freed. * - Notify the network device driver of an addition to the multicast address * list. * * 'sa' points to caller-owned memory with the desired multicast address. * * 'retifma' will be used to return a pointer to the resulting multicast * address reference, if desired. */ int if_addmulti(struct ifnet *ifp, struct sockaddr *sa, struct ifmultiaddr **retifma) { struct ifmultiaddr *ifma, *ll_ifma; struct sockaddr *llsa; struct sockaddr_dl sdl; int error; #ifdef INET IN_MULTI_LIST_UNLOCK_ASSERT(); #endif #ifdef INET6 IN6_MULTI_LIST_UNLOCK_ASSERT(); #endif /* * If the address is already present, return a new reference to it; * otherwise, allocate storage and set up a new address. */ IF_ADDR_WLOCK(ifp); ifma = if_findmulti(ifp, sa); if (ifma != NULL) { ifma->ifma_refcount++; if (retifma != NULL) *retifma = ifma; IF_ADDR_WUNLOCK(ifp); return (0); } /* * The address isn't already present; resolve the protocol address * into a link layer address, and then look that up, bump its * refcount or allocate an ifma for that also. * Most link layer resolving functions returns address data which * fits inside default sockaddr_dl structure. However callback * can allocate another sockaddr structure, in that case we need to * free it later. */ llsa = NULL; ll_ifma = NULL; if (ifp->if_resolvemulti != NULL) { /* Provide called function with buffer size information */ sdl.sdl_len = sizeof(sdl); llsa = (struct sockaddr *)&sdl; error = ifp->if_resolvemulti(ifp, &llsa, sa); if (error) goto unlock_out; } /* * Allocate the new address. Don't hook it up yet, as we may also * need to allocate a link layer multicast address. */ ifma = if_allocmulti(ifp, sa, llsa, M_NOWAIT); if (ifma == NULL) { error = ENOMEM; goto free_llsa_out; } /* * If a link layer address is found, we'll need to see if it's * already present in the address list, or allocate is as well. * When this block finishes, the link layer address will be on the * list. 
*/ if (llsa != NULL) { ll_ifma = if_findmulti(ifp, llsa); if (ll_ifma == NULL) { ll_ifma = if_allocmulti(ifp, llsa, NULL, M_NOWAIT); if (ll_ifma == NULL) { --ifma->ifma_refcount; if_freemulti(ifma); error = ENOMEM; goto free_llsa_out; } ll_ifma->ifma_flags |= IFMA_F_ENQUEUED; CK_STAILQ_INSERT_HEAD(&ifp->if_multiaddrs, ll_ifma, ifma_link); } else ll_ifma->ifma_refcount++; ifma->ifma_llifma = ll_ifma; } /* * We now have a new multicast address, ifma, and possibly a new or * referenced link layer address. Add the primary address to the * ifnet address list. */ ifma->ifma_flags |= IFMA_F_ENQUEUED; CK_STAILQ_INSERT_HEAD(&ifp->if_multiaddrs, ifma, ifma_link); if (retifma != NULL) *retifma = ifma; /* * Must generate the message while holding the lock so that 'ifma' * pointer is still valid. */ rt_newmaddrmsg(RTM_NEWMADDR, ifma); IF_ADDR_WUNLOCK(ifp); /* * We are certain we have added something, so call down to the * interface to let them know about it. */ if (ifp->if_ioctl != NULL) { if (THREAD_CAN_SLEEP()) (void )(*ifp->if_ioctl)(ifp, SIOCADDMULTI, 0); else taskqueue_enqueue(taskqueue_swi, &ifp->if_addmultitask); } if ((llsa != NULL) && (llsa != (struct sockaddr *)&sdl)) link_free_sdl(llsa); return (0); free_llsa_out: if ((llsa != NULL) && (llsa != (struct sockaddr *)&sdl)) link_free_sdl(llsa); unlock_out: IF_ADDR_WUNLOCK(ifp); return (error); } static void if_siocaddmulti(void *arg, int pending) { struct ifnet *ifp; ifp = arg; #ifdef DIAGNOSTIC if (pending > 1) if_printf(ifp, "%d SIOCADDMULTI coalesced\n", pending); #endif CURVNET_SET(ifp->if_vnet); (void )(*ifp->if_ioctl)(ifp, SIOCADDMULTI, 0); CURVNET_RESTORE(); } /* * Delete a multicast group membership by network-layer group address. * * Returns ENOENT if the entry could not be found. If ifp no longer * exists, results are undefined. This entry point should only be used * from subsystems which do appropriate locking to hold ifp for the * duration of the call. * Network-layer protocol domains must use if_delmulti_ifma(). */ int if_delmulti(struct ifnet *ifp, struct sockaddr *sa) { struct ifmultiaddr *ifma; int lastref; KASSERT(ifp, ("%s: NULL ifp", __func__)); IF_ADDR_WLOCK(ifp); lastref = 0; ifma = if_findmulti(ifp, sa); if (ifma != NULL) lastref = if_delmulti_locked(ifp, ifma, 0); IF_ADDR_WUNLOCK(ifp); if (ifma == NULL) return (ENOENT); if (lastref && ifp->if_ioctl != NULL) { (void)(*ifp->if_ioctl)(ifp, SIOCDELMULTI, 0); } return (0); } /* * Delete all multicast group membership for an interface. * Should be used to quickly flush all multicast filters. */ void if_delallmulti(struct ifnet *ifp) { struct ifmultiaddr *ifma; struct ifmultiaddr *next; IF_ADDR_WLOCK(ifp); CK_STAILQ_FOREACH_SAFE(ifma, &ifp->if_multiaddrs, ifma_link, next) if_delmulti_locked(ifp, ifma, 0); IF_ADDR_WUNLOCK(ifp); } void if_delmulti_ifma(struct ifmultiaddr *ifma) { if_delmulti_ifma_flags(ifma, 0); } /* * Delete a multicast group membership by group membership pointer. * Network-layer protocol domains must use this routine. * * It is safe to call this routine if the ifp disappeared. 
*/ void if_delmulti_ifma_flags(struct ifmultiaddr *ifma, int flags) { struct ifnet *ifp; int lastref; MCDPRINTF("%s freeing ifma: %p\n", __func__, ifma); #ifdef INET IN_MULTI_LIST_UNLOCK_ASSERT(); #endif ifp = ifma->ifma_ifp; #ifdef DIAGNOSTIC if (ifp == NULL) { printf("%s: ifma_ifp seems to be detached\n", __func__); } else { struct epoch_tracker et; struct ifnet *oifp; NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(oifp, &V_ifnet, if_link) if (ifp == oifp) break; NET_EPOCH_EXIT(et); if (ifp != oifp) ifp = NULL; } #endif /* * If and only if the ifnet instance exists: Acquire the address lock. */ if (ifp != NULL) IF_ADDR_WLOCK(ifp); lastref = if_delmulti_locked(ifp, ifma, flags); if (ifp != NULL) { /* * If and only if the ifnet instance exists: * Release the address lock. * If the group was left: update the hardware hash filter. */ IF_ADDR_WUNLOCK(ifp); if (lastref && ifp->if_ioctl != NULL) { (void)(*ifp->if_ioctl)(ifp, SIOCDELMULTI, 0); } } } /* * Perform deletion of network-layer and/or link-layer multicast address. * * Return 0 if the reference count was decremented. * Return 1 if the final reference was released, indicating that the * hardware hash filter should be reprogrammed. */ static int if_delmulti_locked(struct ifnet *ifp, struct ifmultiaddr *ifma, int detaching) { struct ifmultiaddr *ll_ifma; if (ifp != NULL && ifma->ifma_ifp != NULL) { KASSERT(ifma->ifma_ifp == ifp, ("%s: inconsistent ifp %p", __func__, ifp)); IF_ADDR_WLOCK_ASSERT(ifp); } ifp = ifma->ifma_ifp; MCDPRINTF("%s freeing %p from %s \n", __func__, ifma, ifp ? ifp->if_xname : ""); /* * If the ifnet is detaching, null out references to ifnet, * so that upper protocol layers will notice, and not attempt * to obtain locks for an ifnet which no longer exists. The * routing socket announcement must happen before the ifnet * instance is detached from the system. */ if (detaching) { #ifdef DIAGNOSTIC printf("%s: detaching ifnet instance %p\n", __func__, ifp); #endif /* * ifp may already be nulled out if we are being reentered * to delete the ll_ifma. */ if (ifp != NULL) { rt_newmaddrmsg(RTM_DELMADDR, ifma); ifma->ifma_ifp = NULL; } } if (--ifma->ifma_refcount > 0) return 0; if (ifp != NULL && detaching == 0 && (ifma->ifma_flags & IFMA_F_ENQUEUED)) { CK_STAILQ_REMOVE(&ifp->if_multiaddrs, ifma, ifmultiaddr, ifma_link); ifma->ifma_flags &= ~IFMA_F_ENQUEUED; } /* * If this ifma is a network-layer ifma, a link-layer ifma may * have been associated with it. Release it first if so. */ ll_ifma = ifma->ifma_llifma; if (ll_ifma != NULL) { KASSERT(ifma->ifma_lladdr != NULL, ("%s: llifma w/o lladdr", __func__)); if (detaching) ll_ifma->ifma_ifp = NULL; /* XXX */ if (--ll_ifma->ifma_refcount == 0) { if (ifp != NULL) { if (ll_ifma->ifma_flags & IFMA_F_ENQUEUED) { CK_STAILQ_REMOVE(&ifp->if_multiaddrs, ll_ifma, ifmultiaddr, ifma_link); ll_ifma->ifma_flags &= ~IFMA_F_ENQUEUED; } } if_freemulti(ll_ifma); } } #ifdef INVARIANTS if (ifp) { struct ifmultiaddr *ifmatmp; CK_STAILQ_FOREACH(ifmatmp, &ifp->if_multiaddrs, ifma_link) MPASS(ifma != ifmatmp); } #endif if_freemulti(ifma); /* * The last reference to this instance of struct ifmultiaddr * was released; the hardware should be notified of this change. */ return 1; } /* * Set the link layer address on an interface. * * At this time we only support certain types of interfaces, * and we don't allow the length of the address to change. 
* * Set noinline to be dtrace-friendly */ __noinline int if_setlladdr(struct ifnet *ifp, const u_char *lladdr, int len) { struct sockaddr_dl *sdl; struct ifaddr *ifa; struct ifreq ifr; ifa = ifp->if_addr; if (ifa == NULL) return (EINVAL); sdl = (struct sockaddr_dl *)ifa->ifa_addr; if (sdl == NULL) return (EINVAL); if (len != sdl->sdl_alen) /* don't allow length to change */ return (EINVAL); switch (ifp->if_type) { case IFT_ETHER: case IFT_XETHER: case IFT_L2VLAN: case IFT_BRIDGE: case IFT_IEEE8023ADLAG: bcopy(lladdr, LLADDR(sdl), len); break; default: return (ENODEV); } /* * If the interface is already up, we need * to re-init it in order to reprogram its * address filter. */ if ((ifp->if_flags & IFF_UP) != 0) { if (ifp->if_ioctl) { ifp->if_flags &= ~IFF_UP; ifr.ifr_flags = ifp->if_flags & 0xffff; ifr.ifr_flagshigh = ifp->if_flags >> 16; (*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t)&ifr); ifp->if_flags |= IFF_UP; ifr.ifr_flags = ifp->if_flags & 0xffff; ifr.ifr_flagshigh = ifp->if_flags >> 16; (*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t)&ifr); } } EVENTHANDLER_INVOKE(iflladdr_event, ifp); return (0); } /* * Compat function for handling basic encapsulation requests. * Not converted stacks (FDDI, IB, ..) supports traditional * output model: ARP (and other similar L2 protocols) are handled * inside output routine, arpresolve/nd6_resolve() returns MAC * address instead of full prepend. * * This function creates calculated header==MAC for IPv4/IPv6 and * returns EAFNOSUPPORT (which is then handled in ARP code) for other * address families. */ static int if_requestencap_default(struct ifnet *ifp, struct if_encap_req *req) { if (req->rtype != IFENCAP_LL) return (EOPNOTSUPP); if (req->bufsize < req->lladdr_len) return (ENOMEM); switch (req->family) { case AF_INET: case AF_INET6: break; default: return (EAFNOSUPPORT); } /* Copy lladdr to storage as is */ memmove(req->buf, req->lladdr, req->lladdr_len); req->bufsize = req->lladdr_len; req->lladdr_off = 0; return (0); } /* * Tunnel interfaces can nest, also they may cause infinite recursion * calls when misconfigured. We'll prevent this by detecting loops. * High nesting level may cause stack exhaustion. We'll prevent this * by introducing upper limit. * * Return 0, if tunnel nesting count is equal or less than limit. */ int if_tunnel_check_nesting(struct ifnet *ifp, struct mbuf *m, uint32_t cookie, int limit) { struct m_tag *mtag; int count; count = 1; mtag = NULL; while ((mtag = m_tag_locate(m, cookie, 0, mtag)) != NULL) { if (*(struct ifnet **)(mtag + 1) == ifp) { log(LOG_NOTICE, "%s: loop detected\n", if_name(ifp)); return (EIO); } count++; } if (count > limit) { log(LOG_NOTICE, "%s: if_output recursively called too many times(%d)\n", if_name(ifp), count); return (EIO); } mtag = m_tag_alloc(cookie, 0, sizeof(struct ifnet *), M_NOWAIT); if (mtag == NULL) return (ENOMEM); *(struct ifnet **)(mtag + 1) = ifp; m_tag_prepend(m, mtag); return (0); } /* * Get the link layer address that was read from the hardware at attach. * * This is only set by Ethernet NICs (IFT_ETHER), but laggX interfaces re-type * their component interfaces as IFT_IEEE8023ADLAG. */ int if_gethwaddr(struct ifnet *ifp, struct ifreq *ifr) { if (ifp->if_hw_addr == NULL) return (ENODEV); switch (ifp->if_type) { case IFT_ETHER: case IFT_IEEE8023ADLAG: bcopy(ifp->if_hw_addr, ifr->ifr_addr.sa_data, ifp->if_addrlen); return (0); default: return (ENODEV); } } /* * The name argument must be a pointer to storage which will last as * long as the interface does. 
For physical devices, the result of * device_get_name(dev) is a good choice and for pseudo-devices a * static string works well. */ void if_initname(struct ifnet *ifp, const char *name, int unit) { ifp->if_dname = name; ifp->if_dunit = unit; if (unit != IF_DUNIT_NONE) snprintf(ifp->if_xname, IFNAMSIZ, "%s%d", name, unit); else strlcpy(ifp->if_xname, name, IFNAMSIZ); } int if_printf(struct ifnet *ifp, const char *fmt, ...) { char if_fmt[256]; va_list ap; snprintf(if_fmt, sizeof(if_fmt), "%s: %s", ifp->if_xname, fmt); va_start(ap, fmt); vlog(LOG_INFO, if_fmt, ap); va_end(ap); return (0); } void if_start(struct ifnet *ifp) { (*(ifp)->if_start)(ifp); } /* * Backwards compatibility interface for drivers * that have not implemented it */ static int if_transmit(struct ifnet *ifp, struct mbuf *m) { int error; IFQ_HANDOFF(ifp, m, error); return (error); } static void if_input_default(struct ifnet *ifp __unused, struct mbuf *m) { m_freem(m); } int if_handoff(struct ifqueue *ifq, struct mbuf *m, struct ifnet *ifp, int adjust) { int active = 0; IF_LOCK(ifq); if (_IF_QFULL(ifq)) { IF_UNLOCK(ifq); if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1); m_freem(m); return (0); } if (ifp != NULL) { if_inc_counter(ifp, IFCOUNTER_OBYTES, m->m_pkthdr.len + adjust); if (m->m_flags & (M_BCAST|M_MCAST)) if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1); active = ifp->if_drv_flags & IFF_DRV_OACTIVE; } _IF_ENQUEUE(ifq, m); IF_UNLOCK(ifq); if (ifp != NULL && !active) (*(ifp)->if_start)(ifp); return (1); } void if_register_com_alloc(u_char type, if_com_alloc_t *a, if_com_free_t *f) { KASSERT(if_com_alloc[type] == NULL, ("if_register_com_alloc: %d already registered", type)); KASSERT(if_com_free[type] == NULL, ("if_register_com_alloc: %d free already registered", type)); if_com_alloc[type] = a; if_com_free[type] = f; } void if_deregister_com_alloc(u_char type) { KASSERT(if_com_alloc[type] != NULL, ("if_deregister_com_alloc: %d not registered", type)); KASSERT(if_com_free[type] != NULL, ("if_deregister_com_alloc: %d free not registered", type)); if_com_alloc[type] = NULL; if_com_free[type] = NULL; } /* API for driver access to network stack owned ifnet.*/ uint64_t if_setbaudrate(struct ifnet *ifp, uint64_t baudrate) { uint64_t oldbrate; oldbrate = ifp->if_baudrate; ifp->if_baudrate = baudrate; return (oldbrate); } uint64_t if_getbaudrate(if_t ifp) { return (((struct ifnet *)ifp)->if_baudrate); } int if_setcapabilities(if_t ifp, int capabilities) { ((struct ifnet *)ifp)->if_capabilities = capabilities; return (0); } int if_setcapabilitiesbit(if_t ifp, int setbit, int clearbit) { ((struct ifnet *)ifp)->if_capabilities |= setbit; ((struct ifnet *)ifp)->if_capabilities &= ~clearbit; return (0); } int if_getcapabilities(if_t ifp) { return ((struct ifnet *)ifp)->if_capabilities; } int if_setcapenable(if_t ifp, int capabilities) { ((struct ifnet *)ifp)->if_capenable = capabilities; return (0); } int if_setcapenablebit(if_t ifp, int setcap, int clearcap) { if(setcap) ((struct ifnet *)ifp)->if_capenable |= setcap; if(clearcap) ((struct ifnet *)ifp)->if_capenable &= ~clearcap; return (0); } const char * if_getdname(if_t ifp) { return ((struct ifnet *)ifp)->if_dname; } int if_togglecapenable(if_t ifp, int togglecap) { ((struct ifnet *)ifp)->if_capenable ^= togglecap; return (0); } int if_getcapenable(if_t ifp) { return ((struct ifnet *)ifp)->if_capenable; } /* * This is largely undesirable because it ties ifnet to a device, but does * provide flexiblity for an embedded product vendor. 
Should be used with
 * the understanding that it violates the interface boundaries, and should be
 * a last resort only.
 */
int
if_setdev(if_t ifp, void *dev)
{
	return (0);
}

int
if_setdrvflagbits(if_t ifp, int set_flags, int clear_flags)
{
	((struct ifnet *)ifp)->if_drv_flags |= set_flags;
	((struct ifnet *)ifp)->if_drv_flags &= ~clear_flags;
	return (0);
}

int
if_getdrvflags(if_t ifp)
{
	return ((struct ifnet *)ifp)->if_drv_flags;
}

int
if_setdrvflags(if_t ifp, int flags)
{
	((struct ifnet *)ifp)->if_drv_flags = flags;
	return (0);
}

int
if_setflags(if_t ifp, int flags)
{
-	/* XXX Temporary */
-	((struct ifnet *)ifp)->if_flags = flags | IFF_NEEDSEPOCH;
+
+	ifp->if_flags = flags;
	return (0);
}

int
if_setflagbits(if_t ifp, int set, int clear)
{
	((struct ifnet *)ifp)->if_flags |= set;
	((struct ifnet *)ifp)->if_flags &= ~clear;
	return (0);
}

int
if_getflags(if_t ifp)
{
	return ((struct ifnet *)ifp)->if_flags;
}

int
if_clearhwassist(if_t ifp)
{
	((struct ifnet *)ifp)->if_hwassist = 0;
	return (0);
}

int
if_sethwassistbits(if_t ifp, int toset, int toclear)
{
	((struct ifnet *)ifp)->if_hwassist |= toset;
	((struct ifnet *)ifp)->if_hwassist &= ~toclear;
	return (0);
}

int
if_sethwassist(if_t ifp, int hwassist_bit)
{
	((struct ifnet *)ifp)->if_hwassist = hwassist_bit;
	return (0);
}

int
if_gethwassist(if_t ifp)
{
	return ((struct ifnet *)ifp)->if_hwassist;
}

int
if_setmtu(if_t ifp, int mtu)
{
	((struct ifnet *)ifp)->if_mtu = mtu;
	return (0);
}

int
if_getmtu(if_t ifp)
{
	return ((struct ifnet *)ifp)->if_mtu;
}

int
if_getmtu_family(if_t ifp, int family)
{
	struct domain *dp;

	for (dp = domains; dp; dp = dp->dom_next) {
		if (dp->dom_family == family && dp->dom_ifmtu != NULL)
			return (dp->dom_ifmtu((struct ifnet *)ifp));
	}

	return (((struct ifnet *)ifp)->if_mtu);
}

/*
 * Methods for drivers to access interface unicast and multicast
 * link level addresses.  Driver shall not know 'struct ifaddr' neither
 * 'struct ifmultiaddr'.
*/ u_int if_lladdr_count(if_t ifp) { struct epoch_tracker et; struct ifaddr *ifa; u_int count; count = 0; NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) if (ifa->ifa_addr->sa_family == AF_LINK) count++; NET_EPOCH_EXIT(et); return (count); } u_int if_foreach_lladdr(if_t ifp, iflladdr_cb_t cb, void *cb_arg) { struct epoch_tracker et; struct ifaddr *ifa; u_int count; MPASS(cb); count = 0; NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_LINK) continue; count += (*cb)(cb_arg, (struct sockaddr_dl *)ifa->ifa_addr, count); } NET_EPOCH_EXIT(et); return (count); } u_int if_llmaddr_count(if_t ifp) { struct epoch_tracker et; struct ifmultiaddr *ifma; int count; count = 0; NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) if (ifma->ifma_addr->sa_family == AF_LINK) count++; NET_EPOCH_EXIT(et); return (count); } u_int if_foreach_llmaddr(if_t ifp, iflladdr_cb_t cb, void *cb_arg) { struct epoch_tracker et; struct ifmultiaddr *ifma; u_int count; MPASS(cb); count = 0; NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_LINK) continue; count += (*cb)(cb_arg, (struct sockaddr_dl *)ifma->ifma_addr, count); } NET_EPOCH_EXIT(et); return (count); } int if_setsoftc(if_t ifp, void *softc) { ((struct ifnet *)ifp)->if_softc = softc; return (0); } void * if_getsoftc(if_t ifp) { return ((struct ifnet *)ifp)->if_softc; } void if_setrcvif(struct mbuf *m, if_t ifp) { MPASS((m->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0); m->m_pkthdr.rcvif = (struct ifnet *)ifp; } void if_setvtag(struct mbuf *m, uint16_t tag) { m->m_pkthdr.ether_vtag = tag; } uint16_t if_getvtag(struct mbuf *m) { return (m->m_pkthdr.ether_vtag); } int if_sendq_empty(if_t ifp) { return IFQ_DRV_IS_EMPTY(&((struct ifnet *)ifp)->if_snd); } struct ifaddr * if_getifaddr(if_t ifp) { return ((struct ifnet *)ifp)->if_addr; } int if_getamcount(if_t ifp) { return ((struct ifnet *)ifp)->if_amcount; } int if_setsendqready(if_t ifp) { IFQ_SET_READY(&((struct ifnet *)ifp)->if_snd); return (0); } int if_setsendqlen(if_t ifp, int tx_desc_count) { IFQ_SET_MAXLEN(&((struct ifnet *)ifp)->if_snd, tx_desc_count); ((struct ifnet *)ifp)->if_snd.ifq_drv_maxlen = tx_desc_count; return (0); } int if_vlantrunkinuse(if_t ifp) { return ((struct ifnet *)ifp)->if_vlantrunk != NULL?1:0; } int if_input(if_t ifp, struct mbuf* sendmp) { (*((struct ifnet *)ifp)->if_input)((struct ifnet *)ifp, sendmp); return (0); } struct mbuf * if_dequeue(if_t ifp) { struct mbuf *m; IFQ_DRV_DEQUEUE(&((struct ifnet *)ifp)->if_snd, m); return (m); } int if_sendq_prepend(if_t ifp, struct mbuf *m) { IFQ_DRV_PREPEND(&((struct ifnet *)ifp)->if_snd, m); return (0); } int if_setifheaderlen(if_t ifp, int len) { ((struct ifnet *)ifp)->if_hdrlen = len; return (0); } caddr_t if_getlladdr(if_t ifp) { return (IF_LLADDR((struct ifnet *)ifp)); } void * if_gethandle(u_char type) { return (if_alloc(type)); } void if_bpfmtap(if_t ifh, struct mbuf *m) { struct ifnet *ifp = (struct ifnet *)ifh; BPF_MTAP(ifp, m); } void if_etherbpfmtap(if_t ifh, struct mbuf *m) { struct ifnet *ifp = (struct ifnet *)ifh; ETHER_BPF_MTAP(ifp, m); } void if_vlancap(if_t ifh) { struct ifnet *ifp = (struct ifnet *)ifh; VLAN_CAPABILITIES(ifp); } int if_sethwtsomax(if_t ifp, u_int if_hw_tsomax) { ((struct ifnet *)ifp)->if_hw_tsomax = if_hw_tsomax; return (0); } int if_sethwtsomaxsegcount(if_t ifp, u_int if_hw_tsomaxsegcount) { ((struct ifnet *)ifp)->if_hw_tsomaxsegcount = 
	    if_hw_tsomaxsegcount;
	return (0);
}

int
if_sethwtsomaxsegsize(if_t ifp, u_int if_hw_tsomaxsegsize)
{
	((struct ifnet *)ifp)->if_hw_tsomaxsegsize = if_hw_tsomaxsegsize;
	return (0);
}

u_int
if_gethwtsomax(if_t ifp)
{
	return (((struct ifnet *)ifp)->if_hw_tsomax);
}

u_int
if_gethwtsomaxsegcount(if_t ifp)
{
	return (((struct ifnet *)ifp)->if_hw_tsomaxsegcount);
}

u_int
if_gethwtsomaxsegsize(if_t ifp)
{
	return (((struct ifnet *)ifp)->if_hw_tsomaxsegsize);
}

void
if_setinitfn(if_t ifp, void (*init_fn)(void *))
{
	((struct ifnet *)ifp)->if_init = init_fn;
}

void
if_setioctlfn(if_t ifp, int (*ioctl_fn)(if_t, u_long, caddr_t))
{
	((struct ifnet *)ifp)->if_ioctl = (void *)ioctl_fn;
}

void
if_setstartfn(if_t ifp, void (*start_fn)(if_t))
{
	((struct ifnet *)ifp)->if_start = (void *)start_fn;
}

void
if_settransmitfn(if_t ifp, if_transmit_fn_t start_fn)
{
	((struct ifnet *)ifp)->if_transmit = start_fn;
}

void
if_setqflushfn(if_t ifp, if_qflush_fn_t flush_fn)
{
	((struct ifnet *)ifp)->if_qflush = flush_fn;
}

void
if_setgetcounterfn(if_t ifp, if_get_counter_t fn)
{
	ifp->if_get_counter = fn;
}

/* Revisit these - These are inline functions originally. */
int
drbr_inuse_drv(if_t ifh, struct buf_ring *br)
{
	return drbr_inuse(ifh, br);
}

struct mbuf*
drbr_dequeue_drv(if_t ifh, struct buf_ring *br)
{
	return drbr_dequeue(ifh, br);
}

int
drbr_needs_enqueue_drv(if_t ifh, struct buf_ring *br)
{
	return drbr_needs_enqueue(ifh, br);
}

int
drbr_enqueue_drv(if_t ifh, struct buf_ring *br, struct mbuf *m)
{
	return drbr_enqueue(ifh, br, m);
}

diff --git a/sys/net/if.h b/sys/net/if.h
index 974074473e1b..1e7430263fc3 100644
--- a/sys/net/if.h
+++ b/sys/net/if.h
@@ -1,620 +1,620 @@
/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)if.h	8.1 (Berkeley) 6/10/93
 * $FreeBSD$
 */

#ifndef _NET_IF_H_
#define	_NET_IF_H_

#include <sys/cdefs.h>

#if __BSD_VISIBLE
/*
 * <net/if.h> does not depend on <sys/time.h> on most other systems.  This
 * helps userland compatibility.  (struct timeval ifi_lastchange)
 * The same holds for <sys/socket.h>.
(struct sockaddr ifru_addr) */ #ifndef _KERNEL #include #include #endif #endif /* * Length of interface external name, including terminating '\0'. * Note: this is the same size as a generic device's external name. */ #define IF_NAMESIZE 16 #if __BSD_VISIBLE #define IFNAMSIZ IF_NAMESIZE #define IF_MAXUNIT 0x7fff /* historical value */ #endif #if __BSD_VISIBLE /* * Structure used to query names of interface cloners. */ struct if_clonereq { int ifcr_total; /* total cloners (out) */ int ifcr_count; /* room for this many in user buffer */ char *ifcr_buffer; /* buffer for cloner names */ }; /* * Structure describing information about an interface * which may be of interest to management entities. */ struct if_data { /* generic interface information */ uint8_t ifi_type; /* ethernet, tokenring, etc */ uint8_t ifi_physical; /* e.g., AUI, Thinnet, 10base-T, etc */ uint8_t ifi_addrlen; /* media address length */ uint8_t ifi_hdrlen; /* media header length */ uint8_t ifi_link_state; /* current link state */ uint8_t ifi_vhid; /* carp vhid */ uint16_t ifi_datalen; /* length of this data struct */ uint32_t ifi_mtu; /* maximum transmission unit */ uint32_t ifi_metric; /* routing metric (external only) */ uint64_t ifi_baudrate; /* linespeed */ /* volatile statistics */ uint64_t ifi_ipackets; /* packets received on interface */ uint64_t ifi_ierrors; /* input errors on interface */ uint64_t ifi_opackets; /* packets sent on interface */ uint64_t ifi_oerrors; /* output errors on interface */ uint64_t ifi_collisions; /* collisions on csma interfaces */ uint64_t ifi_ibytes; /* total number of octets received */ uint64_t ifi_obytes; /* total number of octets sent */ uint64_t ifi_imcasts; /* packets received via multicast */ uint64_t ifi_omcasts; /* packets sent via multicast */ uint64_t ifi_iqdrops; /* dropped on input */ uint64_t ifi_oqdrops; /* dropped on output */ uint64_t ifi_noproto; /* destined for unsupported protocol */ uint64_t ifi_hwassist; /* HW offload capabilities, see IFCAP */ /* Unions are here to make sizes MI. */ union { /* uptime at attach or stat reset */ time_t tt; uint64_t ph; } __ifi_epoch; #define ifi_epoch __ifi_epoch.tt union { /* time of last administrative change */ struct timeval tv; struct { uint64_t ph1; uint64_t ph2; } ph; } __ifi_lastchange; #define ifi_lastchange __ifi_lastchange.tv }; /*- * Interface flags are of two types: network stack owned flags, and driver * owned flags. Historically, these values were stored in the same ifnet * flags field, but with the advent of fine-grained locking, they have been * broken out such that the network stack is responsible for synchronizing * the stack-owned fields, and the device driver the device-owned fields. * Both halves can perform lockless reads of the other half's field, subject * to accepting the involved races. * * Both sets of flags come from the same number space, and should not be * permitted to conflict, as they are exposed to user space via a single * field. * * The following symbols identify read and write requirements for fields: * * (i) if_flags field set by device driver before attach, read-only there * after. * (n) if_flags field written only by the network stack, read by either the * stack or driver. * (d) if_drv_flags field written only by the device driver, read by either * the stack or driver. 
 */
#define	IFF_UP		0x1		/* (n) interface is up */
#define	IFF_BROADCAST	0x2		/* (i) broadcast address valid */
#define	IFF_DEBUG	0x4		/* (n) turn on debugging */
#define	IFF_LOOPBACK	0x8		/* (i) is a loopback net */
#define	IFF_POINTOPOINT	0x10		/* (i) is a point-to-point link */
-#define	IFF_NEEDSEPOCH	0x20		/* (i) calls if_input w/o epoch */
+#define	IFF_KNOWSEPOCH	0x20		/* (i) calls if_input in net epoch */
#define	IFF_DRV_RUNNING	0x40		/* (d) resources allocated */
#define	IFF_NOARP	0x80		/* (n) no address resolution protocol */
#define	IFF_PROMISC	0x100		/* (n) receive all packets */
#define	IFF_ALLMULTI	0x200		/* (n) receive all multicast packets */
#define	IFF_DRV_OACTIVE	0x400		/* (d) tx hardware queue is full */
#define	IFF_SIMPLEX	0x800		/* (i) can't hear own transmissions */
#define	IFF_LINK0	0x1000		/* per link layer defined bit */
#define	IFF_LINK1	0x2000		/* per link layer defined bit */
#define	IFF_LINK2	0x4000		/* per link layer defined bit */
#define	IFF_ALTPHYS	IFF_LINK2	/* use alternate physical connection */
#define	IFF_MULTICAST	0x8000		/* (i) supports multicast */
#define	IFF_CANTCONFIG	0x10000		/* (i) unconfigurable using ioctl(2) */
#define	IFF_PPROMISC	0x20000		/* (n) user-requested promisc mode */
#define	IFF_MONITOR	0x40000		/* (n) user-requested monitor mode */
#define	IFF_STATICARP	0x80000		/* (n) static ARP */
#define	IFF_DYING	0x200000	/* (n) interface is winding down */
#define	IFF_RENAMING	0x400000	/* (n) interface is being renamed */
#define	IFF_NOGROUP	0x800000	/* (n) interface is not part of any groups */

/*
 * Old names for driver flags so that user space tools can continue to use
 * the old (portable) names.
 */
#ifndef _KERNEL
#define	IFF_RUNNING	IFF_DRV_RUNNING
#define	IFF_OACTIVE	IFF_DRV_OACTIVE
#endif

/* flags set internally only: */
#define	IFF_CANTCHANGE \
	(IFF_BROADCAST|IFF_POINTOPOINT|IFF_DRV_RUNNING|IFF_DRV_OACTIVE|\
	    IFF_SIMPLEX|IFF_MULTICAST|IFF_ALLMULTI|IFF_PROMISC|\
-	    IFF_DYING|IFF_CANTCONFIG)
+	    IFF_DYING|IFF_CANTCONFIG|IFF_KNOWSEPOCH)

/*
 * Values for if_link_state.
 */
#define	LINK_STATE_UNKNOWN	0	/* link invalid/unknown */
#define	LINK_STATE_DOWN		1	/* link is down */
#define	LINK_STATE_UP		2	/* link is up */

/*
 * Some convenience macros used for setting ifi_baudrate.
 * XXX 1000 vs. 1024? --thorpej@netbsd.org
 */
#define	IF_Kbps(x)	((uintmax_t)(x) * 1000)	/* kilobits/sec. */
#define	IF_Mbps(x)	(IF_Kbps((x) * 1000))	/* megabits/sec. */
#define	IF_Gbps(x)	(IF_Mbps((x) * 1000))	/* gigabits/sec. */

/*
 * Capabilities that interfaces can advertise.
 *
 * struct ifnet.if_capabilities
 *   contains the optional features & capabilities a particular interface
 *   supports (not only the driver but also the detected hw revision).
 *   Capabilities are defined by IFCAP_* below.
 * struct ifnet.if_capenable
 *   contains the enabled (either by default or through ifconfig) optional
 *   features & capabilities on this interface.
 *   Capabilities are defined by IFCAP_* below.
 * struct if_data.ifi_hwassist in mbuf CSUM_ flag form, controlled by above
 *   contains the enabled optional feature & capabilites that can be used
 *   individually per packet and are specified in the mbuf pkthdr.csum_flags
 *   field.  IFCAP_* and CSUM_* do not match one to one and CSUM_* may be
 *   more detailed or differenciated than IFCAP_*.
 *   Hwassist features are defined CSUM_* in sys/mbuf.h
 *
 * Capabilities that cannot be arbitrarily changed with ifconfig/ioctl
 * are listed in IFCAP_CANTCHANGE, similar to IFF_CANTCHANGE.
* This is not strictly necessary because the common code never * changes capabilities, and it is left to the individual driver * to do the right thing. However, having the filter here * avoids replication of the same code in all individual drivers. */ #define IFCAP_RXCSUM 0x00001 /* can offload checksum on RX */ #define IFCAP_TXCSUM 0x00002 /* can offload checksum on TX */ #define IFCAP_NETCONS 0x00004 /* can be a network console */ #define IFCAP_VLAN_MTU 0x00008 /* VLAN-compatible MTU */ #define IFCAP_VLAN_HWTAGGING 0x00010 /* hardware VLAN tag support */ #define IFCAP_JUMBO_MTU 0x00020 /* 9000 byte MTU supported */ #define IFCAP_POLLING 0x00040 /* driver supports polling */ #define IFCAP_VLAN_HWCSUM 0x00080 /* can do IFCAP_HWCSUM on VLANs */ #define IFCAP_TSO4 0x00100 /* can do TCP Segmentation Offload */ #define IFCAP_TSO6 0x00200 /* can do TCP6 Segmentation Offload */ #define IFCAP_LRO 0x00400 /* can do Large Receive Offload */ #define IFCAP_WOL_UCAST 0x00800 /* wake on any unicast frame */ #define IFCAP_WOL_MCAST 0x01000 /* wake on any multicast frame */ #define IFCAP_WOL_MAGIC 0x02000 /* wake on any Magic Packet */ #define IFCAP_TOE4 0x04000 /* interface can offload TCP */ #define IFCAP_TOE6 0x08000 /* interface can offload TCP6 */ #define IFCAP_VLAN_HWFILTER 0x10000 /* interface hw can filter vlan tag */ /* available 0x20000 */ #define IFCAP_VLAN_HWTSO 0x40000 /* can do IFCAP_TSO on VLANs */ #define IFCAP_LINKSTATE 0x80000 /* the runtime link state is dynamic */ #define IFCAP_NETMAP 0x100000 /* netmap mode supported/enabled */ #define IFCAP_RXCSUM_IPV6 0x200000 /* can offload checksum on IPv6 RX */ #define IFCAP_TXCSUM_IPV6 0x400000 /* can offload checksum on IPv6 TX */ #define IFCAP_HWSTATS 0x800000 /* manages counters internally */ #define IFCAP_TXRTLMT 0x1000000 /* hardware supports TX rate limiting */ #define IFCAP_HWRXTSTMP 0x2000000 /* hardware rx timestamping */ #define IFCAP_NOMAP 0x4000000 /* can TX unmapped mbufs */ #define IFCAP_TXTLS4 0x8000000 /* can do TLS encryption and segmentation for TCP */ #define IFCAP_TXTLS6 0x10000000 /* can do TLS encryption and segmentation for TCP6 */ #define IFCAP_HWCSUM_IPV6 (IFCAP_RXCSUM_IPV6 | IFCAP_TXCSUM_IPV6) #define IFCAP_HWCSUM (IFCAP_RXCSUM | IFCAP_TXCSUM) #define IFCAP_TSO (IFCAP_TSO4 | IFCAP_TSO6) #define IFCAP_WOL (IFCAP_WOL_UCAST | IFCAP_WOL_MCAST | IFCAP_WOL_MAGIC) #define IFCAP_TOE (IFCAP_TOE4 | IFCAP_TOE6) #define IFCAP_TXTLS (IFCAP_TXTLS4 | IFCAP_TXTLS6) #define IFCAP_CANTCHANGE (IFCAP_NETMAP) #define IFQ_MAXLEN 50 #define IFNET_SLOWHZ 1 /* granularity is 1 second */ /* * Message format for use in obtaining information about interfaces * from getkerninfo and the routing socket * For the new, extensible interface see struct if_msghdrl below. */ struct if_msghdr { u_short ifm_msglen; /* to skip over non-understood messages */ u_char ifm_version; /* future binary compatibility */ u_char ifm_type; /* message type */ int ifm_addrs; /* like rtm_addrs */ int ifm_flags; /* value of if_flags */ u_short ifm_index; /* index for associated ifp */ u_short _ifm_spare1; struct if_data ifm_data;/* statistics and other data about if */ }; /* * The 'l' version shall be used by new interfaces, like NET_RT_IFLISTL. It is * extensible after ifm_data_off or within ifm_data. Both the if_msghdr and * if_data now have a member field detailing the struct length in addition to * the routing message length. 
Macros are provided to find the start of * ifm_data and the start of the socket address structures immediately following * struct if_msghdrl given a pointer to struct if_msghdrl. */ #define IF_MSGHDRL_IFM_DATA(_l) \ (struct if_data *)((char *)(_l) + (_l)->ifm_data_off) #define IF_MSGHDRL_RTA(_l) \ (void *)((uintptr_t)(_l) + (_l)->ifm_len) struct if_msghdrl { u_short ifm_msglen; /* to skip over non-understood messages */ u_char ifm_version; /* future binary compatibility */ u_char ifm_type; /* message type */ int ifm_addrs; /* like rtm_addrs */ int ifm_flags; /* value of if_flags */ u_short ifm_index; /* index for associated ifp */ u_short _ifm_spare1; /* spare space to grow if_index, see if_var.h */ u_short ifm_len; /* length of if_msghdrl incl. if_data */ u_short ifm_data_off; /* offset of if_data from beginning */ int _ifm_spare2; struct if_data ifm_data;/* statistics and other data about if */ }; /* * Message format for use in obtaining information about interface addresses * from getkerninfo and the routing socket * For the new, extensible interface see struct ifa_msghdrl below. */ struct ifa_msghdr { u_short ifam_msglen; /* to skip over non-understood messages */ u_char ifam_version; /* future binary compatibility */ u_char ifam_type; /* message type */ int ifam_addrs; /* like rtm_addrs */ int ifam_flags; /* value of ifa_flags */ u_short ifam_index; /* index for associated ifp */ u_short _ifam_spare1; int ifam_metric; /* value of ifa_ifp->if_metric */ }; /* * The 'l' version shall be used by new interfaces, like NET_RT_IFLISTL. It is * extensible after ifam_metric or within ifam_data. Both the ifa_msghdrl and * if_data now have a member field detailing the struct length in addition to * the routing message length. Macros are provided to find the start of * ifm_data and the start of the socket address structures immediately following * struct ifa_msghdrl given a pointer to struct ifa_msghdrl. */ #define IFA_MSGHDRL_IFAM_DATA(_l) \ (struct if_data *)((char *)(_l) + (_l)->ifam_data_off) #define IFA_MSGHDRL_RTA(_l) \ (void *)((uintptr_t)(_l) + (_l)->ifam_len) struct ifa_msghdrl { u_short ifam_msglen; /* to skip over non-understood messages */ u_char ifam_version; /* future binary compatibility */ u_char ifam_type; /* message type */ int ifam_addrs; /* like rtm_addrs */ int ifam_flags; /* value of ifa_flags */ u_short ifam_index; /* index for associated ifp */ u_short _ifam_spare1; /* spare space to grow if_index, see if_var.h */ u_short ifam_len; /* length of ifa_msghdrl incl. if_data */ u_short ifam_data_off; /* offset of if_data from beginning */ int ifam_metric; /* value of ifa_ifp->if_metric */ struct if_data ifam_data;/* statistics and other data about if or * address */ }; /* * Message format for use in obtaining information about multicast addresses * from the routing socket */ struct ifma_msghdr { u_short ifmam_msglen; /* to skip over non-understood messages */ u_char ifmam_version; /* future binary compatibility */ u_char ifmam_type; /* message type */ int ifmam_addrs; /* like rtm_addrs */ int ifmam_flags; /* value of ifa_flags */ u_short ifmam_index; /* index for associated ifp */ u_short _ifmam_spare1; }; /* * Message format announcing the arrival or departure of a network interface. */ struct if_announcemsghdr { u_short ifan_msglen; /* to skip over non-understood messages */ u_char ifan_version; /* future binary compatibility */ u_char ifan_type; /* message type */ u_short ifan_index; /* index for associated ifp */ char ifan_name[IFNAMSIZ]; /* if name, e.g.
"en0" */ u_short ifan_what; /* what type of announcement */ }; #define IFAN_ARRIVAL 0 /* interface arrival */ #define IFAN_DEPARTURE 1 /* interface departure */ /* * Buffer with length to be used in SIOCGIFDESCR/SIOCSIFDESCR requests */ struct ifreq_buffer { size_t length; void *buffer; }; /* * Interface request structure used for socket * ioctl's. All interface ioctl's must have parameter * definitions which begin with ifr_name. The * remainder may be interface specific. */ struct ifreq { char ifr_name[IFNAMSIZ]; /* if name, e.g. "en0" */ union { struct sockaddr ifru_addr; struct sockaddr ifru_dstaddr; struct sockaddr ifru_broadaddr; struct ifreq_buffer ifru_buffer; short ifru_flags[2]; short ifru_index; int ifru_jid; int ifru_metric; int ifru_mtu; int ifru_phys; int ifru_media; caddr_t ifru_data; int ifru_cap[2]; u_int ifru_fib; u_char ifru_vlan_pcp; } ifr_ifru; #define ifr_addr ifr_ifru.ifru_addr /* address */ #define ifr_dstaddr ifr_ifru.ifru_dstaddr /* other end of p-to-p link */ #define ifr_broadaddr ifr_ifru.ifru_broadaddr /* broadcast address */ #ifndef _KERNEL #define ifr_buffer ifr_ifru.ifru_buffer /* user supplied buffer with its length */ #endif #define ifr_flags ifr_ifru.ifru_flags[0] /* flags (low 16 bits) */ #define ifr_flagshigh ifr_ifru.ifru_flags[1] /* flags (high 16 bits) */ #define ifr_jid ifr_ifru.ifru_jid /* jail/vnet */ #define ifr_metric ifr_ifru.ifru_metric /* metric */ #define ifr_mtu ifr_ifru.ifru_mtu /* mtu */ #define ifr_phys ifr_ifru.ifru_phys /* physical wire */ #define ifr_media ifr_ifru.ifru_media /* physical media */ #ifndef _KERNEL #define ifr_data ifr_ifru.ifru_data /* for use by interface */ #endif #define ifr_reqcap ifr_ifru.ifru_cap[0] /* requested capabilities */ #define ifr_curcap ifr_ifru.ifru_cap[1] /* current capabilities */ #define ifr_index ifr_ifru.ifru_index /* interface index */ #define ifr_fib ifr_ifru.ifru_fib /* interface fib */ #define ifr_vlan_pcp ifr_ifru.ifru_vlan_pcp /* VLAN priority */ #define ifr_lan_pcp ifr_ifru.ifru_vlan_pcp /* VLAN priority */ }; #define _SIZEOF_ADDR_IFREQ(ifr) \ ((ifr).ifr_addr.sa_len > sizeof(struct sockaddr) ? \ (sizeof(struct ifreq) - sizeof(struct sockaddr) + \ (ifr).ifr_addr.sa_len) : sizeof(struct ifreq)) struct ifaliasreq { char ifra_name[IFNAMSIZ]; /* if name, e.g. "en0" */ struct sockaddr ifra_addr; struct sockaddr ifra_broadaddr; struct sockaddr ifra_mask; int ifra_vhid; }; /* 9.x compat */ struct oifaliasreq { char ifra_name[IFNAMSIZ]; struct sockaddr ifra_addr; struct sockaddr ifra_broadaddr; struct sockaddr ifra_mask; }; struct ifmediareq { char ifm_name[IFNAMSIZ]; /* if name, e.g. "en0" */ int ifm_current; /* current media options */ int ifm_mask; /* don't care mask */ int ifm_status; /* media status */ int ifm_active; /* active options */ int ifm_count; /* # entries in ifm_ulist array */ int *ifm_ulist; /* media words */ }; struct ifdrv { char ifd_name[IFNAMSIZ]; /* if name, e.g. "en0" */ unsigned long ifd_cmd; size_t ifd_len; void *ifd_data; }; /* * Structure used to retrieve aux status data from interfaces. * Kernel suppliers to this interface should respect the formatting * needed by ifconfig(8): each line starts with a TAB and ends with * a newline. The canonical example to copy and paste is in if_tun.c. */ #define IFSTATMAX 800 /* 10 lines of text */ struct ifstat { char ifs_name[IFNAMSIZ]; /* if name, e.g. "en0" */ char ascii[IFSTATMAX + 1]; }; /* * Structure used in SIOCGIFCONF request. 
* Used to retrieve interface configuration * for machine (useful for programs which * must know all networks accessible). */ struct ifconf { int ifc_len; /* size of associated buffer */ union { caddr_t ifcu_buf; struct ifreq *ifcu_req; } ifc_ifcu; #define ifc_buf ifc_ifcu.ifcu_buf /* buffer address */ #define ifc_req ifc_ifcu.ifcu_req /* array of structures returned */ }; /* * interface groups */ #define IFG_ALL "all" /* group contains all interfaces */ /* XXX: will we implement this? */ #define IFG_EGRESS "egress" /* if(s) default route(s) point to */ struct ifg_req { union { char ifgrqu_group[IFNAMSIZ]; char ifgrqu_member[IFNAMSIZ]; } ifgrq_ifgrqu; #define ifgrq_group ifgrq_ifgrqu.ifgrqu_group #define ifgrq_member ifgrq_ifgrqu.ifgrqu_member }; /* * Used to lookup groups for an interface */ struct ifgroupreq { char ifgr_name[IFNAMSIZ]; u_int ifgr_len; union { char ifgru_group[IFNAMSIZ]; struct ifg_req *ifgru_groups; } ifgr_ifgru; #ifndef _KERNEL #define ifgr_group ifgr_ifgru.ifgru_group #define ifgr_groups ifgr_ifgru.ifgru_groups #endif }; /* * Structure used to request i2c data * from interface transceivers. */ struct ifi2creq { uint8_t dev_addr; /* i2c address (0xA0, 0xA2) */ uint8_t offset; /* read offset */ uint8_t len; /* read length */ uint8_t spare0; uint32_t spare1; uint8_t data[8]; /* read buffer */ }; /* * RSS hash. */ #define RSS_FUNC_NONE 0 /* RSS disabled */ #define RSS_FUNC_PRIVATE 1 /* non-standard */ #define RSS_FUNC_TOEPLITZ 2 #define RSS_TYPE_IPV4 0x00000001 #define RSS_TYPE_TCP_IPV4 0x00000002 #define RSS_TYPE_IPV6 0x00000004 #define RSS_TYPE_IPV6_EX 0x00000008 #define RSS_TYPE_TCP_IPV6 0x00000010 #define RSS_TYPE_TCP_IPV6_EX 0x00000020 #define RSS_TYPE_UDP_IPV4 0x00000040 #define RSS_TYPE_UDP_IPV6 0x00000080 #define RSS_TYPE_UDP_IPV6_EX 0x00000100 #define RSS_KEYLEN 128 struct ifrsskey { char ifrk_name[IFNAMSIZ]; /* if name, e.g. "en0" */ uint8_t ifrk_func; /* RSS_FUNC_ */ uint8_t ifrk_spare0; uint16_t ifrk_keylen; uint8_t ifrk_key[RSS_KEYLEN]; }; struct ifrsshash { char ifrh_name[IFNAMSIZ]; /* if name, e.g. "en0" */ uint8_t ifrh_func; /* RSS_FUNC_ */ uint8_t ifrh_spare0; uint16_t ifrh_spare1; uint32_t ifrh_types; /* RSS_TYPE_ */ }; #define IFNET_PCP_NONE 0xff /* PCP disabled */ #define IFDR_MSG_SIZE 64 #define IFDR_REASON_MSG 1 #define IFDR_REASON_VENDOR 2 struct ifdownreason { char ifdr_name[IFNAMSIZ]; uint32_t ifdr_reason; uint32_t ifdr_vendor; char ifdr_msg[IFDR_MSG_SIZE]; }; #endif /* __BSD_VISIBLE */ #ifdef _KERNEL #ifdef MALLOC_DECLARE MALLOC_DECLARE(M_IFADDR); MALLOC_DECLARE(M_IFMADDR); #endif #endif #ifndef _KERNEL struct if_nameindex { unsigned int if_index; /* 1, 2, ... */ char *if_name; /* null terminated name: "le0", ... */ }; __BEGIN_DECLS void if_freenameindex(struct if_nameindex *); char *if_indextoname(unsigned int, char *); struct if_nameindex *if_nameindex(void); unsigned int if_nametoindex(const char *); __END_DECLS #endif #endif /* !_NET_IF_H_ */ diff --git a/sys/net/if_ethersubr.c b/sys/net/if_ethersubr.c index 7d918216b300..6d33173462cb 100644 --- a/sys/net/if_ethersubr.c +++ b/sys/net/if_ethersubr.c @@ -1,1449 +1,1452 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. 
* 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)if_ethersubr.c 8.1 (Berkeley) 6/10/93 * $FreeBSD$ */ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_netgraph.h" #include "opt_mbuf_profiling.h" #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(INET) || defined(INET6) #include #include #include #include #include #endif #ifdef INET6 #include #endif #include #include #ifdef CTASSERT CTASSERT(sizeof (struct ether_header) == ETHER_ADDR_LEN * 2 + 2); CTASSERT(sizeof (struct ether_addr) == ETHER_ADDR_LEN); #endif VNET_DEFINE(pfil_head_t, link_pfil_head); /* Packet filter hooks */ /* netgraph node hooks for ng_ether(4) */ void (*ng_ether_input_p)(struct ifnet *ifp, struct mbuf **mp); void (*ng_ether_input_orphan_p)(struct ifnet *ifp, struct mbuf *m); int (*ng_ether_output_p)(struct ifnet *ifp, struct mbuf **mp); void (*ng_ether_attach_p)(struct ifnet *ifp); void (*ng_ether_detach_p)(struct ifnet *ifp); void (*vlan_input_p)(struct ifnet *, struct mbuf *); /* if_bridge(4) support */ void (*bridge_dn_p)(struct mbuf *, struct ifnet *); /* if_lagg(4) support */ struct mbuf *(*lagg_input_p)(struct ifnet *, struct mbuf *); static const u_char etherbroadcastaddr[ETHER_ADDR_LEN] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; static int ether_resolvemulti(struct ifnet *, struct sockaddr **, struct sockaddr *); #ifdef VIMAGE static void ether_reassign(struct ifnet *, struct vnet *, char *); #endif static int ether_requestencap(struct ifnet *, struct if_encap_req *); #define senderr(e) do { error = (e); goto bad;} while (0) static void update_mbuf_csumflags(struct mbuf *src, struct mbuf *dst) { int csum_flags = 0; if (src->m_pkthdr.csum_flags & CSUM_IP) csum_flags |= (CSUM_IP_CHECKED|CSUM_IP_VALID); if (src->m_pkthdr.csum_flags & CSUM_DELAY_DATA) csum_flags |= (CSUM_DATA_VALID|CSUM_PSEUDO_HDR); if (src->m_pkthdr.csum_flags & CSUM_SCTP) csum_flags |= CSUM_SCTP_VALID; dst->m_pkthdr.csum_flags |= csum_flags; if (csum_flags & CSUM_DATA_VALID) dst->m_pkthdr.csum_data = 0xffff; } /* * Handle link-layer encapsulation requests. 
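 *
 * Minimal sketch of a caller, for illustration only: "dst_mac" stands for
 * an already-resolved destination MAC address and "error"/"ifp" for locals
 * of the caller; the field names are the ones consumed by the handler
 * below.
 *
 *	struct if_encap_req ereq;
 *	char linkhdr[ETHER_HDR_LEN];
 *
 *	bzero(&ereq, sizeof(ereq));
 *	ereq.rtype = IFENCAP_LL;
 *	ereq.family = AF_INET;
 *	ereq.buf = linkhdr;
 *	ereq.bufsize = sizeof(linkhdr);
 *	ereq.lladdr = dst_mac;
 *	error = ifp->if_requestencap(ifp, &ereq);
 *
 * On success the prepared Ethernet header is left in linkhdr and
 * ereq.bufsize is set to its length.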
*/ static int ether_requestencap(struct ifnet *ifp, struct if_encap_req *req) { struct ether_header *eh; struct arphdr *ah; uint16_t etype; const u_char *lladdr; if (req->rtype != IFENCAP_LL) return (EOPNOTSUPP); if (req->bufsize < ETHER_HDR_LEN) return (ENOMEM); eh = (struct ether_header *)req->buf; lladdr = req->lladdr; req->lladdr_off = 0; switch (req->family) { case AF_INET: etype = htons(ETHERTYPE_IP); break; case AF_INET6: etype = htons(ETHERTYPE_IPV6); break; case AF_ARP: ah = (struct arphdr *)req->hdata; ah->ar_hrd = htons(ARPHRD_ETHER); switch(ntohs(ah->ar_op)) { case ARPOP_REVREQUEST: case ARPOP_REVREPLY: etype = htons(ETHERTYPE_REVARP); break; case ARPOP_REQUEST: case ARPOP_REPLY: default: etype = htons(ETHERTYPE_ARP); break; } if (req->flags & IFENCAP_FLAG_BROADCAST) lladdr = ifp->if_broadcastaddr; break; default: return (EAFNOSUPPORT); } memcpy(&eh->ether_type, &etype, sizeof(eh->ether_type)); memcpy(eh->ether_dhost, lladdr, ETHER_ADDR_LEN); memcpy(eh->ether_shost, IF_LLADDR(ifp), ETHER_ADDR_LEN); req->bufsize = sizeof(struct ether_header); return (0); } static int ether_resolve_addr(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro, u_char *phdr, uint32_t *pflags, struct llentry **plle) { struct ether_header *eh; uint32_t lleflags = 0; int error = 0; #if defined(INET) || defined(INET6) uint16_t etype; #endif if (plle) *plle = NULL; eh = (struct ether_header *)phdr; switch (dst->sa_family) { #ifdef INET case AF_INET: if ((m->m_flags & (M_BCAST | M_MCAST)) == 0) error = arpresolve(ifp, 0, m, dst, phdr, &lleflags, plle); else { if (m->m_flags & M_BCAST) memcpy(eh->ether_dhost, ifp->if_broadcastaddr, ETHER_ADDR_LEN); else { const struct in_addr *a; a = &(((const struct sockaddr_in *)dst)->sin_addr); ETHER_MAP_IP_MULTICAST(a, eh->ether_dhost); } etype = htons(ETHERTYPE_IP); memcpy(&eh->ether_type, &etype, sizeof(etype)); memcpy(eh->ether_shost, IF_LLADDR(ifp), ETHER_ADDR_LEN); } break; #endif #ifdef INET6 case AF_INET6: if ((m->m_flags & M_MCAST) == 0) error = nd6_resolve(ifp, 0, m, dst, phdr, &lleflags, plle); else { const struct in6_addr *a6; a6 = &(((const struct sockaddr_in6 *)dst)->sin6_addr); ETHER_MAP_IPV6_MULTICAST(a6, eh->ether_dhost); etype = htons(ETHERTYPE_IPV6); memcpy(&eh->ether_type, &etype, sizeof(etype)); memcpy(eh->ether_shost, IF_LLADDR(ifp), ETHER_ADDR_LEN); } break; #endif default: if_printf(ifp, "can't handle af%d\n", dst->sa_family); if (m != NULL) m_freem(m); return (EAFNOSUPPORT); } if (error == EHOSTDOWN) { if (ro != NULL && (ro->ro_flags & RT_HAS_GW) != 0) error = EHOSTUNREACH; } if (error != 0) return (error); *pflags = RT_MAY_LOOP; if (lleflags & LLE_IFADDR) *pflags |= RT_L2_ME; return (0); } /* * Ethernet output routine. * Encapsulate a packet of type family for the local net. * Use trailer local net encapsulation if enough data in first * packet leaves a multiple of 512 bytes of data in remainder. 
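 *
 * In outline, the function below obtains the link-layer header either from
 * ro->ro_prepend, from a cached llentry, or by resolving the destination
 * with ether_resolve_addr(); loops frames addressed to ourselves back with
 * if_simloop(); prepends the header; hands the frame to if_bridge(4),
 * carp(4) or ng_ether(4) when those are attached; and finally passes it to
 * ether_output_frame().  (The trailer encapsulation note above is
 * historical; no trailers are generated here.)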
*/ int ether_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro) { int error = 0; char linkhdr[ETHER_HDR_LEN], *phdr; struct ether_header *eh; struct pf_mtag *t; int loop_copy = 1; int hlen; /* link layer header length */ uint32_t pflags; struct llentry *lle = NULL; int addref = 0; phdr = NULL; pflags = 0; if (ro != NULL) { /* XXX BPF uses ro_prepend */ if (ro->ro_prepend != NULL) { phdr = ro->ro_prepend; hlen = ro->ro_plen; } else if (!(m->m_flags & (M_BCAST | M_MCAST))) { if ((ro->ro_flags & RT_LLE_CACHE) != 0) { lle = ro->ro_lle; if (lle != NULL && (lle->la_flags & LLE_VALID) == 0) { LLE_FREE(lle); lle = NULL; /* redundant */ ro->ro_lle = NULL; } if (lle == NULL) { /* if we lookup, keep cache */ addref = 1; } else /* * Notify LLE code that * the entry was used * by datapath. */ llentry_mark_used(lle); } if (lle != NULL) { phdr = lle->r_linkdata; hlen = lle->r_hdrlen; pflags = lle->r_flags; } } } #ifdef MAC error = mac_ifnet_check_transmit(ifp, m); if (error) senderr(error); #endif M_PROFILE(m); if (ifp->if_flags & IFF_MONITOR) senderr(ENETDOWN); if (!((ifp->if_flags & IFF_UP) && (ifp->if_drv_flags & IFF_DRV_RUNNING))) senderr(ENETDOWN); if (phdr == NULL) { /* No prepend data supplied. Try to calculate ourselves. */ phdr = linkhdr; hlen = ETHER_HDR_LEN; error = ether_resolve_addr(ifp, m, dst, ro, phdr, &pflags, addref ? &lle : NULL); if (addref && lle != NULL) ro->ro_lle = lle; if (error != 0) return (error == EWOULDBLOCK ? 0 : error); } if ((pflags & RT_L2_ME) != 0) { update_mbuf_csumflags(m, m); return (if_simloop(ifp, m, dst->sa_family, 0)); } loop_copy = pflags & RT_MAY_LOOP; /* * Add local net header. If no space in first mbuf, * allocate another. * * Note that we do prepend regardless of RT_HAS_HEADER flag. * This is done because BPF code shifts m_data pointer * to the end of ethernet header prior to calling if_output(). */ M_PREPEND(m, hlen, M_NOWAIT); if (m == NULL) senderr(ENOBUFS); if ((pflags & RT_HAS_HEADER) == 0) { eh = mtod(m, struct ether_header *); memcpy(eh, phdr, hlen); } /* * If a simplex interface, and the packet is being sent to our * Ethernet address or a broadcast address, loopback a copy. * XXX To make a simplex device behave exactly like a duplex * device, we should copy in the case of sending to our own * ethernet address (thus letting the original actually appear * on the wire). However, we don't do that here for security * reasons and compatibility with the original behavior. */ if ((m->m_flags & M_BCAST) && loop_copy && (ifp->if_flags & IFF_SIMPLEX) && ((t = pf_find_mtag(m)) == NULL || !t->routed)) { struct mbuf *n; /* * Because if_simloop() modifies the packet, we need a * writable copy through m_dup() instead of a readonly * one as m_copy[m] would give us. The alternative would * be to modify if_simloop() to handle the readonly mbuf, * but performancewise it is mostly equivalent (trading * extra data copying vs. extra locking). * * XXX This is a local workaround. A number of less * often used kernel parts suffer from the same bug. * See PR kern/105943 for a proposed general solution. */ if ((n = m_dup(m, M_NOWAIT)) != NULL) { update_mbuf_csumflags(m, n); (void)if_simloop(ifp, n, dst->sa_family, hlen); } else if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1); } /* * Bridges require special output handling. 
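 * BRIDGE_OUTPUT() takes ownership of the mbuf when the interface is a
 * bridge member; whatever error it reports is simply returned.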
*/ if (ifp->if_bridge) { BRIDGE_OUTPUT(ifp, m, error); return (error); } #if defined(INET) || defined(INET6) if (ifp->if_carp && (error = (*carp_output_p)(ifp, m, dst))) goto bad; #endif /* Handle ng_ether(4) processing, if any */ if (ifp->if_l2com != NULL) { KASSERT(ng_ether_output_p != NULL, ("ng_ether_output_p is NULL")); if ((error = (*ng_ether_output_p)(ifp, &m)) != 0) { bad: if (m != NULL) m_freem(m); return (error); } if (m == NULL) return (0); } /* Continue with link-layer output */ return ether_output_frame(ifp, m); } static bool ether_set_pcp(struct mbuf **mp, struct ifnet *ifp, uint8_t pcp) { struct ether_header *eh; eh = mtod(*mp, struct ether_header *); if (ntohs(eh->ether_type) == ETHERTYPE_VLAN || ether_8021q_frame(mp, ifp, ifp, 0, pcp)) return (true); if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return (false); } /* * Ethernet link layer output routine to send a raw frame to the device. * * This assumes that the 14 byte Ethernet header is present and contiguous * in the first mbuf (if BRIDGE'ing). */ int ether_output_frame(struct ifnet *ifp, struct mbuf *m) { uint8_t pcp; pcp = ifp->if_pcp; if (pcp != IFNET_PCP_NONE && ifp->if_type != IFT_L2VLAN && !ether_set_pcp(&m, ifp, pcp)) return (0); if (PFIL_HOOKED_OUT(V_link_pfil_head)) switch (pfil_run_hooks(V_link_pfil_head, &m, ifp, PFIL_OUT, NULL)) { case PFIL_DROPPED: return (EACCES); case PFIL_CONSUMED: return (0); } #ifdef EXPERIMENTAL #if defined(INET6) && defined(INET) /* draft-ietf-6man-ipv6only-flag */ /* Catch ETHERTYPE_IP, and ETHERTYPE_[REV]ARP if we are v6-only. */ if ((ND_IFINFO(ifp)->flags & ND6_IFF_IPV6_ONLY_MASK) != 0) { struct ether_header *eh; eh = mtod(m, struct ether_header *); switch (ntohs(eh->ether_type)) { case ETHERTYPE_IP: case ETHERTYPE_ARP: case ETHERTYPE_REVARP: m_freem(m); return (EAFNOSUPPORT); /* NOTREACHED */ break; }; } #endif #endif /* * Queue message on interface, update output statistics if * successful, and start output if interface not yet active. */ return ((ifp->if_transmit)(ifp, m)); } /* * Process a received Ethernet packet; the packet is in the * mbuf chain m with the ethernet header at the front. */ static void ether_input_internal(struct ifnet *ifp, struct mbuf *m) { struct ether_header *eh; u_short etype; if ((ifp->if_flags & IFF_UP) == 0) { m_freem(m); return; } #ifdef DIAGNOSTIC if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) { if_printf(ifp, "discard frame at !IFF_DRV_RUNNING\n"); m_freem(m); return; } #endif if (m->m_len < ETHER_HDR_LEN) { /* XXX maybe should pullup? */ if_printf(ifp, "discard frame w/o leading ethernet " "header (len %u pkt len %u)\n", m->m_len, m->m_pkthdr.len); if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); m_freem(m); return; } eh = mtod(m, struct ether_header *); etype = ntohs(eh->ether_type); random_harvest_queue_ether(m, sizeof(*m)); #ifdef EXPERIMENTAL #if defined(INET6) && defined(INET) /* draft-ietf-6man-ipv6only-flag */ /* Catch ETHERTYPE_IP, and ETHERTYPE_[REV]ARP if we are v6-only. */ if ((ND_IFINFO(ifp)->flags & ND6_IFF_IPV6_ONLY_MASK) != 0) { switch (etype) { case ETHERTYPE_IP: case ETHERTYPE_ARP: case ETHERTYPE_REVARP: m_freem(m); return; /* NOTREACHED */ break; }; } #endif #endif CURVNET_SET_QUIET(ifp->if_vnet); if (ETHER_IS_MULTICAST(eh->ether_dhost)) { if (ETHER_IS_BROADCAST(eh->ether_dhost)) m->m_flags |= M_BCAST; else m->m_flags |= M_MCAST; if_inc_counter(ifp, IFCOUNTER_IMCASTS, 1); } #ifdef MAC /* * Tag the mbuf with an appropriate MAC label before any other * consumers can get to it. 
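 * (Compiled only with "options MAC"; the label has to be attached before
 * bpf(4), lagg(4), vlan(4) or if_bridge(4) below can see the mbuf.)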
*/ mac_ifnet_create_mbuf(ifp, m); #endif /* * Give bpf a chance at the packet. */ ETHER_BPF_MTAP(ifp, m); /* * If the CRC is still on the packet, trim it off. We do this once * and once only in case we are re-entered. Nothing else on the * Ethernet receive path expects to see the FCS. */ if (m->m_flags & M_HASFCS) { m_adj(m, -ETHER_CRC_LEN); m->m_flags &= ~M_HASFCS; } if (!(ifp->if_capenable & IFCAP_HWSTATS)) if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len); /* Allow monitor mode to claim this frame, after stats are updated. */ if (ifp->if_flags & IFF_MONITOR) { m_freem(m); CURVNET_RESTORE(); return; } /* Handle input from a lagg(4) port */ if (ifp->if_type == IFT_IEEE8023ADLAG) { KASSERT(lagg_input_p != NULL, ("%s: if_lagg not loaded!", __func__)); m = (*lagg_input_p)(ifp, m); if (m != NULL) ifp = m->m_pkthdr.rcvif; else { CURVNET_RESTORE(); return; } } /* * If the hardware did not process an 802.1Q tag, do this now, * to allow 802.1P priority frames to be passed to the main input * path correctly. * TODO: Deal with Q-in-Q frames, but not arbitrary nesting levels. */ if ((m->m_flags & M_VLANTAG) == 0 && etype == ETHERTYPE_VLAN) { struct ether_vlan_header *evl; if (m->m_len < sizeof(*evl) && (m = m_pullup(m, sizeof(*evl))) == NULL) { #ifdef DIAGNOSTIC if_printf(ifp, "cannot pullup VLAN header\n"); #endif if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); CURVNET_RESTORE(); return; } evl = mtod(m, struct ether_vlan_header *); m->m_pkthdr.ether_vtag = ntohs(evl->evl_tag); m->m_flags |= M_VLANTAG; bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN, ETHER_HDR_LEN - ETHER_TYPE_LEN); m_adj(m, ETHER_VLAN_ENCAP_LEN); eh = mtod(m, struct ether_header *); } M_SETFIB(m, ifp->if_fib); /* Allow ng_ether(4) to claim this frame. */ if (ifp->if_l2com != NULL) { KASSERT(ng_ether_input_p != NULL, ("%s: ng_ether_input_p is NULL", __func__)); m->m_flags &= ~M_PROMISC; (*ng_ether_input_p)(ifp, &m); if (m == NULL) { CURVNET_RESTORE(); return; } eh = mtod(m, struct ether_header *); } /* * Allow if_bridge(4) to claim this frame. * The BRIDGE_INPUT() macro will update ifp if the bridge changed it * and the frame should be delivered locally. */ if (ifp->if_bridge != NULL) { m->m_flags &= ~M_PROMISC; BRIDGE_INPUT(ifp, m); if (m == NULL) { CURVNET_RESTORE(); return; } eh = mtod(m, struct ether_header *); } #if defined(INET) || defined(INET6) /* * Clear M_PROMISC on frame so that carp(4) will see it when the * mbuf flows up to Layer 3. * FreeBSD's implementation of carp(4) uses the inprotosw * to dispatch IPPROTO_CARP. carp(4) also allocates its own * Ethernet addresses of the form 00:00:5e:00:01:xx, which * is outside the scope of the M_PROMISC test below. * TODO: Maintain a hash table of ethernet addresses other than * ether_dhost which may be active on this ifp. */ if (ifp->if_carp && (*carp_forus_p)(ifp, eh->ether_dhost)) { m->m_flags &= ~M_PROMISC; } else #endif { /* * If the frame received was not for our MAC address, set the * M_PROMISC flag on the mbuf chain. The frame may need to * be seen by the rest of the Ethernet input path in case of * re-entry (e.g. bridge, vlan, netgraph) but should not be * seen by upper protocol layers. */ if (!ETHER_IS_MULTICAST(eh->ether_dhost) && bcmp(IF_LLADDR(ifp), eh->ether_dhost, ETHER_ADDR_LEN) != 0) m->m_flags |= M_PROMISC; } ether_demux(ifp, m); CURVNET_RESTORE(); } /* * Ethernet input dispatch; by default, direct dispatch here regardless of * global configuration. 
However, if RSS is enabled, hook up RSS affinity * so that when deferred or hybrid dispatch is enabled, we can redistribute * load based on RSS. * * XXXRW: Would be nice if the ifnet passed up a flag indicating whether or * not it had already done work distribution via multi-queue. Then we could * direct dispatch in the event load balancing was already complete and * handle the case of interfaces with different capabilities better. * * XXXRW: Sort of want an M_DISTRIBUTED flag to avoid multiple distributions * at multiple layers? * * XXXRW: For now, enable all this only if RSS is compiled in, although it * works fine without RSS. Need to characterise the performance overhead * of the detour through the netisr code in the event the result is always * direct dispatch. */ static void ether_nh_input(struct mbuf *m) { M_ASSERTPKTHDR(m); KASSERT(m->m_pkthdr.rcvif != NULL, ("%s: NULL interface pointer", __func__)); ether_input_internal(m->m_pkthdr.rcvif, m); } static struct netisr_handler ether_nh = { .nh_name = "ether", .nh_handler = ether_nh_input, .nh_proto = NETISR_ETHER, #ifdef RSS .nh_policy = NETISR_POLICY_CPU, .nh_dispatch = NETISR_DISPATCH_DIRECT, .nh_m2cpuid = rss_m2cpuid, #else .nh_policy = NETISR_POLICY_SOURCE, .nh_dispatch = NETISR_DISPATCH_DIRECT, #endif }; static void ether_init(__unused void *arg) { netisr_register(ðer_nh); } SYSINIT(ether, SI_SUB_INIT_IF, SI_ORDER_ANY, ether_init, NULL); static void vnet_ether_init(__unused void *arg) { struct pfil_head_args args; args.pa_version = PFIL_VERSION; args.pa_flags = PFIL_IN | PFIL_OUT; args.pa_type = PFIL_TYPE_ETHERNET; args.pa_headname = PFIL_ETHER_NAME; V_link_pfil_head = pfil_head_register(&args); #ifdef VIMAGE netisr_register_vnet(ðer_nh); #endif } VNET_SYSINIT(vnet_ether_init, SI_SUB_PROTO_IF, SI_ORDER_ANY, vnet_ether_init, NULL); #ifdef VIMAGE static void vnet_ether_pfil_destroy(__unused void *arg) { pfil_head_unregister(V_link_pfil_head); } VNET_SYSUNINIT(vnet_ether_pfil_uninit, SI_SUB_PROTO_PFIL, SI_ORDER_ANY, vnet_ether_pfil_destroy, NULL); static void vnet_ether_destroy(__unused void *arg) { netisr_unregister_vnet(ðer_nh); } VNET_SYSUNINIT(vnet_ether_uninit, SI_SUB_PROTO_IF, SI_ORDER_ANY, vnet_ether_destroy, NULL); #endif static void ether_input(struct ifnet *ifp, struct mbuf *m) { struct epoch_tracker et; struct mbuf *mn; + bool needs_epoch; + + needs_epoch = !(ifp->if_flags & IFF_KNOWSEPOCH); /* * The drivers are allowed to pass in a chain of packets linked with * m_nextpkt. We split them up into separate packets here and pass * them up. This allows the drivers to amortize the receive lock. */ CURVNET_SET_QUIET(ifp->if_vnet); - if (__predict_false(ifp->if_flags & IFF_NEEDSEPOCH)) + if (__predict_false(needs_epoch)) NET_EPOCH_ENTER(et); while (m) { mn = m->m_nextpkt; m->m_nextpkt = NULL; /* * We will rely on rcvif being set properly in the deferred * context, so assert it is correct here. */ MPASS((m->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0); KASSERT(m->m_pkthdr.rcvif == ifp, ("%s: ifnet mismatch m %p " "rcvif %p ifp %p", __func__, m, m->m_pkthdr.rcvif, ifp)); netisr_dispatch(NETISR_ETHER, m); m = mn; } - if (__predict_false(ifp->if_flags & IFF_NEEDSEPOCH)) + if (__predict_false(needs_epoch)) NET_EPOCH_EXIT(et); CURVNET_RESTORE(); } /* * Upper layer processing for a received Ethernet packet. 
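 *
 * ether_demux() runs the link-layer pfil(9) input hooks (skipping
 * M_PROMISC frames), hands 802.1Q-tagged frames to vlan(4), drops
 * promiscuously received frames unless IFF_PPROMISC is set, strips the
 * Ethernet header and dispatches the payload to the IP, ARP or IPv6
 * netisr; unrecognized ethertypes are offered to ng_ether(4) or freed.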
*/ void ether_demux(struct ifnet *ifp, struct mbuf *m) { struct ether_header *eh; int i, isr; u_short ether_type; NET_EPOCH_ASSERT(); KASSERT(ifp != NULL, ("%s: NULL interface pointer", __func__)); /* Do not grab PROMISC frames in case we are re-entered. */ if (PFIL_HOOKED_IN(V_link_pfil_head) && !(m->m_flags & M_PROMISC)) { i = pfil_run_hooks(V_link_pfil_head, &m, ifp, PFIL_IN, NULL); if (i != 0 || m == NULL) return; } eh = mtod(m, struct ether_header *); ether_type = ntohs(eh->ether_type); /* * If this frame has a VLAN tag other than 0, call vlan_input() * if its module is loaded. Otherwise, drop. */ if ((m->m_flags & M_VLANTAG) && EVL_VLANOFTAG(m->m_pkthdr.ether_vtag) != 0) { if (ifp->if_vlantrunk == NULL) { if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1); m_freem(m); return; } KASSERT(vlan_input_p != NULL,("%s: VLAN not loaded!", __func__)); /* Clear before possibly re-entering ether_input(). */ m->m_flags &= ~M_PROMISC; (*vlan_input_p)(ifp, m); return; } /* * Pass promiscuously received frames to the upper layer if the user * requested this by setting IFF_PPROMISC. Otherwise, drop them. */ if ((ifp->if_flags & IFF_PPROMISC) == 0 && (m->m_flags & M_PROMISC)) { m_freem(m); return; } /* * Reset layer specific mbuf flags to avoid confusing upper layers. * Strip off Ethernet header. */ m->m_flags &= ~M_VLANTAG; m_clrprotoflags(m); m_adj(m, ETHER_HDR_LEN); /* * Dispatch frame to upper layer. */ switch (ether_type) { #ifdef INET case ETHERTYPE_IP: isr = NETISR_IP; break; case ETHERTYPE_ARP: if (ifp->if_flags & IFF_NOARP) { /* Discard packet if ARP is disabled on interface */ m_freem(m); return; } isr = NETISR_ARP; break; #endif #ifdef INET6 case ETHERTYPE_IPV6: isr = NETISR_IPV6; break; #endif default: goto discard; } netisr_dispatch(isr, m); return; discard: /* * Packet is to be discarded. If netgraph is present, * hand the packet to it for last chance processing; * otherwise dispose of it. */ if (ifp->if_l2com != NULL) { KASSERT(ng_ether_input_orphan_p != NULL, ("ng_ether_input_orphan_p is NULL")); /* * Put back the ethernet header so netgraph has a * consistent view of inbound packets. */ M_PREPEND(m, ETHER_HDR_LEN, M_NOWAIT); (*ng_ether_input_orphan_p)(ifp, m); return; } m_freem(m); } /* * Convert Ethernet address to printable (loggable) representation. * This routine is for compatibility; it's better to just use * * printf("%6D", , ":"); * * since there's no static buffer involved. 
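 *
 * Illustrative call only ("eaddr" stands for any six-byte address; note
 * that the returned buffer is static and is overwritten by the next call):
 *
 *	if_printf(ifp, "unknown station %s\n", ether_sprintf(eaddr));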
*/ char * ether_sprintf(const u_char *ap) { static char etherbuf[18]; snprintf(etherbuf, sizeof (etherbuf), "%6D", ap, ":"); return (etherbuf); } /* * Perform common duties while attaching to interface list */ void ether_ifattach(struct ifnet *ifp, const u_int8_t *lla) { int i; struct ifaddr *ifa; struct sockaddr_dl *sdl; ifp->if_addrlen = ETHER_ADDR_LEN; ifp->if_hdrlen = ETHER_HDR_LEN; if_attach(ifp); ifp->if_mtu = ETHERMTU; ifp->if_output = ether_output; ifp->if_input = ether_input; ifp->if_resolvemulti = ether_resolvemulti; ifp->if_requestencap = ether_requestencap; #ifdef VIMAGE ifp->if_reassign = ether_reassign; #endif if (ifp->if_baudrate == 0) ifp->if_baudrate = IF_Mbps(10); /* just a default */ ifp->if_broadcastaddr = etherbroadcastaddr; ifa = ifp->if_addr; KASSERT(ifa != NULL, ("%s: no lladdr!\n", __func__)); sdl = (struct sockaddr_dl *)ifa->ifa_addr; sdl->sdl_type = IFT_ETHER; sdl->sdl_alen = ifp->if_addrlen; bcopy(lla, LLADDR(sdl), ifp->if_addrlen); if (ifp->if_hw_addr != NULL) bcopy(lla, ifp->if_hw_addr, ifp->if_addrlen); bpfattach(ifp, DLT_EN10MB, ETHER_HDR_LEN); if (ng_ether_attach_p != NULL) (*ng_ether_attach_p)(ifp); /* Announce Ethernet MAC address if non-zero. */ for (i = 0; i < ifp->if_addrlen; i++) if (lla[i] != 0) break; if (i != ifp->if_addrlen) if_printf(ifp, "Ethernet address: %6D\n", lla, ":"); uuid_ether_add(LLADDR(sdl)); /* Add necessary bits are setup; announce it now. */ EVENTHANDLER_INVOKE(ether_ifattach_event, ifp); if (IS_DEFAULT_VNET(curvnet)) devctl_notify("ETHERNET", ifp->if_xname, "IFATTACH", NULL); } /* * Perform common duties while detaching an Ethernet interface */ void ether_ifdetach(struct ifnet *ifp) { struct sockaddr_dl *sdl; sdl = (struct sockaddr_dl *)(ifp->if_addr->ifa_addr); uuid_ether_del(LLADDR(sdl)); if (ifp->if_l2com != NULL) { KASSERT(ng_ether_detach_p != NULL, ("ng_ether_detach_p is NULL")); (*ng_ether_detach_p)(ifp); } bpfdetach(ifp); if_detach(ifp); } #ifdef VIMAGE void ether_reassign(struct ifnet *ifp, struct vnet *new_vnet, char *unused __unused) { if (ifp->if_l2com != NULL) { KASSERT(ng_ether_detach_p != NULL, ("ng_ether_detach_p is NULL")); (*ng_ether_detach_p)(ifp); } if (ng_ether_attach_p != NULL) { CURVNET_SET_QUIET(new_vnet); (*ng_ether_attach_p)(ifp); CURVNET_RESTORE(); } } #endif SYSCTL_DECL(_net_link); SYSCTL_NODE(_net_link, IFT_ETHER, ether, CTLFLAG_RW, 0, "Ethernet"); #if 0 /* * This is for reference. We have a table-driven version * of the little-endian crc32 generator, which is faster * than the double-loop. 
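 *
 * The compiled version in the #else branch consumes the data four bits at
 * a time: crctab[n] is the CRC-32 of the nibble n under the reflected
 * polynomial 0xedb88320 (ETHER_CRC_POLY_LE), so each input byte is folded
 * with two "(crc >> 4) ^ crctab[crc & 0xf]" steps.  The reference
 * double-loop kept here is never compiled (as written it does not even
 * declare "carry").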
*/ uint32_t ether_crc32_le(const uint8_t *buf, size_t len) { size_t i; uint32_t crc; int bit; uint8_t data; crc = 0xffffffff; /* initial value */ for (i = 0; i < len; i++) { for (data = *buf++, bit = 0; bit < 8; bit++, data >>= 1) { carry = (crc ^ data) & 1; crc >>= 1; if (carry) crc = (crc ^ ETHER_CRC_POLY_LE); } } return (crc); } #else uint32_t ether_crc32_le(const uint8_t *buf, size_t len) { static const uint32_t crctab[] = { 0x00000000, 0x1db71064, 0x3b6e20c8, 0x26d930ac, 0x76dc4190, 0x6b6b51f4, 0x4db26158, 0x5005713c, 0xedb88320, 0xf00f9344, 0xd6d6a3e8, 0xcb61b38c, 0x9b64c2b0, 0x86d3d2d4, 0xa00ae278, 0xbdbdf21c }; size_t i; uint32_t crc; crc = 0xffffffff; /* initial value */ for (i = 0; i < len; i++) { crc ^= buf[i]; crc = (crc >> 4) ^ crctab[crc & 0xf]; crc = (crc >> 4) ^ crctab[crc & 0xf]; } return (crc); } #endif uint32_t ether_crc32_be(const uint8_t *buf, size_t len) { size_t i; uint32_t crc, carry; int bit; uint8_t data; crc = 0xffffffff; /* initial value */ for (i = 0; i < len; i++) { for (data = *buf++, bit = 0; bit < 8; bit++, data >>= 1) { carry = ((crc & 0x80000000) ? 1 : 0) ^ (data & 0x01); crc <<= 1; if (carry) crc = (crc ^ ETHER_CRC_POLY_BE) | carry; } } return (crc); } int ether_ioctl(struct ifnet *ifp, u_long command, caddr_t data) { struct ifaddr *ifa = (struct ifaddr *) data; struct ifreq *ifr = (struct ifreq *) data; int error = 0; switch (command) { case SIOCSIFADDR: ifp->if_flags |= IFF_UP; switch (ifa->ifa_addr->sa_family) { #ifdef INET case AF_INET: ifp->if_init(ifp->if_softc); /* before arpwhohas */ arp_ifinit(ifp, ifa); break; #endif default: ifp->if_init(ifp->if_softc); break; } break; case SIOCGIFADDR: bcopy(IF_LLADDR(ifp), &ifr->ifr_addr.sa_data[0], ETHER_ADDR_LEN); break; case SIOCSIFMTU: /* * Set the interface MTU. */ if (ifr->ifr_mtu > ETHERMTU) { error = EINVAL; } else { ifp->if_mtu = ifr->ifr_mtu; } break; case SIOCSLANPCP: error = priv_check(curthread, PRIV_NET_SETLANPCP); if (error != 0) break; if (ifr->ifr_lan_pcp > 7 && ifr->ifr_lan_pcp != IFNET_PCP_NONE) { error = EINVAL; } else { ifp->if_pcp = ifr->ifr_lan_pcp; /* broadcast event about PCP change */ EVENTHANDLER_INVOKE(ifnet_event, ifp, IFNET_EVENT_PCP); } break; case SIOCGLANPCP: ifr->ifr_lan_pcp = ifp->if_pcp; break; default: error = EINVAL; /* XXX netbsd has ENOTTY??? */ break; } return (error); } static int ether_resolvemulti(struct ifnet *ifp, struct sockaddr **llsa, struct sockaddr *sa) { struct sockaddr_dl *sdl; #ifdef INET struct sockaddr_in *sin; #endif #ifdef INET6 struct sockaddr_in6 *sin6; #endif u_char *e_addr; switch(sa->sa_family) { case AF_LINK: /* * No mapping needed. Just check that it's a valid MC address. */ sdl = (struct sockaddr_dl *)sa; e_addr = LLADDR(sdl); if (!ETHER_IS_MULTICAST(e_addr)) return EADDRNOTAVAIL; *llsa = NULL; return 0; #ifdef INET case AF_INET: sin = (struct sockaddr_in *)sa; if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) return EADDRNOTAVAIL; sdl = link_init_sdl(ifp, *llsa, IFT_ETHER); sdl->sdl_alen = ETHER_ADDR_LEN; e_addr = LLADDR(sdl); ETHER_MAP_IP_MULTICAST(&sin->sin_addr, e_addr); *llsa = (struct sockaddr *)sdl; return 0; #endif #ifdef INET6 case AF_INET6: sin6 = (struct sockaddr_in6 *)sa; if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { /* * An IP6 address of 0 means listen to all * of the Ethernet multicast address used for IP6. * (This is used for multicast routers.) 
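 * No link-level address is synthesized in that case: the code below just
 * sets IFF_ALLMULTI on the interface and returns *llsa as NULL.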
*/ ifp->if_flags |= IFF_ALLMULTI; *llsa = NULL; return 0; } if (!IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) return EADDRNOTAVAIL; sdl = link_init_sdl(ifp, *llsa, IFT_ETHER); sdl->sdl_alen = ETHER_ADDR_LEN; e_addr = LLADDR(sdl); ETHER_MAP_IPV6_MULTICAST(&sin6->sin6_addr, e_addr); *llsa = (struct sockaddr *)sdl; return 0; #endif default: /* * Well, the text isn't quite right, but it's the name * that counts... */ return EAFNOSUPPORT; } } static moduledata_t ether_mod = { .name = "ether", }; void ether_vlan_mtap(struct bpf_if *bp, struct mbuf *m, void *data, u_int dlen) { struct ether_vlan_header vlan; struct mbuf mv, mb; KASSERT((m->m_flags & M_VLANTAG) != 0, ("%s: vlan information not present", __func__)); KASSERT(m->m_len >= sizeof(struct ether_header), ("%s: mbuf not large enough for header", __func__)); bcopy(mtod(m, char *), &vlan, sizeof(struct ether_header)); vlan.evl_proto = vlan.evl_encap_proto; vlan.evl_encap_proto = htons(ETHERTYPE_VLAN); vlan.evl_tag = htons(m->m_pkthdr.ether_vtag); m->m_len -= sizeof(struct ether_header); m->m_data += sizeof(struct ether_header); /* * If a data link has been supplied by the caller, then we will need to * re-create a stack allocated mbuf chain with the following structure: * * (1) mbuf #1 will contain the supplied data link * (2) mbuf #2 will contain the vlan header * (3) mbuf #3 will contain the original mbuf's packet data * * Otherwise, submit the packet and vlan header via bpf_mtap2(). */ if (data != NULL) { mv.m_next = m; mv.m_data = (caddr_t)&vlan; mv.m_len = sizeof(vlan); mb.m_next = &mv; mb.m_data = data; mb.m_len = dlen; bpf_mtap(bp, &mb); } else bpf_mtap2(bp, &vlan, sizeof(vlan), m); m->m_len += sizeof(struct ether_header); m->m_data -= sizeof(struct ether_header); } struct mbuf * ether_vlanencap(struct mbuf *m, uint16_t tag) { struct ether_vlan_header *evl; M_PREPEND(m, ETHER_VLAN_ENCAP_LEN, M_NOWAIT); if (m == NULL) return (NULL); /* M_PREPEND takes care of m_len, m_pkthdr.len for us */ if (m->m_len < sizeof(*evl)) { m = m_pullup(m, sizeof(*evl)); if (m == NULL) return (NULL); } /* * Transform the Ethernet header into an Ethernet header * with 802.1Q encapsulation. */ evl = mtod(m, struct ether_vlan_header *); bcopy((char *)evl + ETHER_VLAN_ENCAP_LEN, (char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN); evl->evl_encap_proto = htons(ETHERTYPE_VLAN); evl->evl_tag = htons(tag); return (m); } static SYSCTL_NODE(_net_link, IFT_L2VLAN, vlan, CTLFLAG_RW, 0, "IEEE 802.1Q VLAN"); static SYSCTL_NODE(_net_link_vlan, PF_LINK, link, CTLFLAG_RW, 0, "for consistency"); VNET_DEFINE_STATIC(int, soft_pad); #define V_soft_pad VNET(soft_pad) SYSCTL_INT(_net_link_vlan, OID_AUTO, soft_pad, CTLFLAG_RW | CTLFLAG_VNET, &VNET_NAME(soft_pad), 0, "pad short frames before tagging"); /* * For now, make preserving PCP via an mbuf tag optional, as it increases * per-packet memory allocations and frees. In the future, it would be * preferable to reuse ether_vtag for this, or similar. */ int vlan_mtag_pcp = 0; SYSCTL_INT(_net_link_vlan, OID_AUTO, mtag_pcp, CTLFLAG_RW, &vlan_mtag_pcp, 0, "Retain VLAN PCP information as packets are passed up the stack"); bool ether_8021q_frame(struct mbuf **mp, struct ifnet *ife, struct ifnet *p, uint16_t vid, uint8_t pcp) { struct m_tag *mtag; int n; uint16_t tag; static const char pad[8]; /* just zeros */ /* * Pad the frame to the minimum size allowed if told to. * This option is in accord with IEEE Std 802.1Q, 2003 Ed., * paragraph C.4.4.3.b. 
It can help to work around buggy * bridges that violate paragraph C.4.4.3.a from the same * document, i.e., fail to pad short frames after untagging. * E.g., a tagged frame 66 bytes long (incl. FCS) is OK, but * untagging it will produce a 62-byte frame, which is a runt * and requires padding. There are VLAN-enabled network * devices that just discard such runts instead or mishandle * them somehow. */ if (V_soft_pad && p->if_type == IFT_ETHER) { for (n = ETHERMIN + ETHER_HDR_LEN - (*mp)->m_pkthdr.len; n > 0; n -= sizeof(pad)) { if (!m_append(*mp, min(n, sizeof(pad)), pad)) break; } if (n > 0) { m_freem(*mp); *mp = NULL; if_printf(ife, "cannot pad short frame"); return (false); } } /* * If underlying interface can do VLAN tag insertion itself, * just pass the packet along. However, we need some way to * tell the interface where the packet came from so that it * knows how to find the VLAN tag to use, so we attach a * packet tag that holds it. */ if (vlan_mtag_pcp && (mtag = m_tag_locate(*mp, MTAG_8021Q, MTAG_8021Q_PCP_OUT, NULL)) != NULL) tag = EVL_MAKETAG(vid, *(uint8_t *)(mtag + 1), 0); else tag = EVL_MAKETAG(vid, pcp, 0); if (p->if_capenable & IFCAP_VLAN_HWTAGGING) { (*mp)->m_pkthdr.ether_vtag = tag; (*mp)->m_flags |= M_VLANTAG; } else { *mp = ether_vlanencap(*mp, tag); if (*mp == NULL) { if_printf(ife, "unable to prepend 802.1Q header"); return (false); } } return (true); } /* * Allocate an address from the FreeBSD Foundation OUI. This uses a * cryptographic hash function on the containing jail's UUID and the interface * name to attempt to provide a unique but stable address. Pseudo-interfaces * which require a MAC address should use this function to allocate * non-locally-administered addresses. */ void ether_gen_addr(struct ifnet *ifp, struct ether_addr *hwaddr) { #define ETHER_GEN_ADDR_BUFSIZ HOSTUUIDLEN + IFNAMSIZ + 2 SHA1_CTX ctx; char buf[ETHER_GEN_ADDR_BUFSIZ]; char uuid[HOSTUUIDLEN + 1]; uint64_t addr; int i, sz; char digest[SHA1_RESULTLEN]; getcredhostuuid(curthread->td_ucred, uuid, sizeof(uuid)); sz = snprintf(buf, ETHER_GEN_ADDR_BUFSIZ, "%s-%s", uuid, ifp->if_xname); SHA1Init(&ctx); SHA1Update(&ctx, buf, sz); SHA1Final(digest, &ctx); addr = ((digest[0] << 16) | (digest[1] << 8) | digest[2]) & OUI_FREEBSD_GENERATED_MASK; addr = OUI_FREEBSD(addr); for (i = 0; i < ETHER_ADDR_LEN; ++i) { hwaddr->octet[i] = addr >> ((ETHER_ADDR_LEN - i - 1) * 8) & 0xFF; } } DECLARE_MODULE(ether, ether_mod, SI_SUB_INIT_IF, SI_ORDER_ANY); MODULE_VERSION(ether, 1); diff --git a/sys/net/iflib.c b/sys/net/iflib.c index d06fbf6ec01b..95fbdbf7c446 100644 --- a/sys/net/iflib.c +++ b/sys/net/iflib.c @@ -1,6863 +1,6864 @@ /*- * Copyright (c) 2014-2018, Matthew Macy * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Neither the name of Matthew Macy nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_acpi.h" #include "opt_sched.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "ifdi_if.h" #ifdef PCI_IOV #include #endif #include /* * enable accounting of every mbuf as it comes in to and goes out of * iflib's software descriptor references */ #define MEMORY_LOGGING 0 /* * Enable mbuf vectors for compressing long mbuf chains */ /* * NB: * - Prefetching in tx cleaning should perhaps be a tunable. The distance ahead * we prefetch needs to be determined by the time spent in m_free vis a vis * the cost of a prefetch. This will of course vary based on the workload: * - NFLX's m_free path is dominated by vm-based M_EXT manipulation which * is quite expensive, thus suggesting very little prefetch. * - small packet forwarding which is just returning a single mbuf to * UMA will typically be very fast vis a vis the cost of a memory * access. 
*/ /* * File organization: * - private structures * - iflib private utility functions * - ifnet functions * - vlan registry and other exported functions * - iflib public core functions * * */ MALLOC_DEFINE(M_IFLIB, "iflib", "ifnet library"); #define IFLIB_RXEOF_MORE (1U << 0) #define IFLIB_RXEOF_EMPTY (2U << 0) struct iflib_txq; typedef struct iflib_txq *iflib_txq_t; struct iflib_rxq; typedef struct iflib_rxq *iflib_rxq_t; struct iflib_fl; typedef struct iflib_fl *iflib_fl_t; struct iflib_ctx; static void iru_init(if_rxd_update_t iru, iflib_rxq_t rxq, uint8_t flid); static void iflib_timer(void *arg); typedef struct iflib_filter_info { driver_filter_t *ifi_filter; void *ifi_filter_arg; struct grouptask *ifi_task; void *ifi_ctx; } *iflib_filter_info_t; struct iflib_ctx { KOBJ_FIELDS; /* * Pointer to hardware driver's softc */ void *ifc_softc; device_t ifc_dev; if_t ifc_ifp; cpuset_t ifc_cpus; if_shared_ctx_t ifc_sctx; struct if_softc_ctx ifc_softc_ctx; struct sx ifc_ctx_sx; struct mtx ifc_state_mtx; iflib_txq_t ifc_txqs; iflib_rxq_t ifc_rxqs; uint32_t ifc_if_flags; uint32_t ifc_flags; uint32_t ifc_max_fl_buf_size; uint32_t ifc_rx_mbuf_sz; int ifc_link_state; int ifc_watchdog_events; struct cdev *ifc_led_dev; struct resource *ifc_msix_mem; struct if_irq ifc_legacy_irq; struct grouptask ifc_admin_task; struct grouptask ifc_vflr_task; struct iflib_filter_info ifc_filter_info; struct ifmedia ifc_media; struct ifmedia *ifc_mediap; struct sysctl_oid *ifc_sysctl_node; uint16_t ifc_sysctl_ntxqs; uint16_t ifc_sysctl_nrxqs; uint16_t ifc_sysctl_qs_eq_override; uint16_t ifc_sysctl_rx_budget; uint16_t ifc_sysctl_tx_abdicate; uint16_t ifc_sysctl_core_offset; #define CORE_OFFSET_UNSPECIFIED 0xffff uint8_t ifc_sysctl_separate_txrx; qidx_t ifc_sysctl_ntxds[8]; qidx_t ifc_sysctl_nrxds[8]; struct if_txrx ifc_txrx; #define isc_txd_encap ifc_txrx.ift_txd_encap #define isc_txd_flush ifc_txrx.ift_txd_flush #define isc_txd_credits_update ifc_txrx.ift_txd_credits_update #define isc_rxd_available ifc_txrx.ift_rxd_available #define isc_rxd_pkt_get ifc_txrx.ift_rxd_pkt_get #define isc_rxd_refill ifc_txrx.ift_rxd_refill #define isc_rxd_flush ifc_txrx.ift_rxd_flush #define isc_rxd_refill ifc_txrx.ift_rxd_refill #define isc_rxd_refill ifc_txrx.ift_rxd_refill #define isc_legacy_intr ifc_txrx.ift_legacy_intr eventhandler_tag ifc_vlan_attach_event; eventhandler_tag ifc_vlan_detach_event; struct ether_addr ifc_mac; }; void * iflib_get_softc(if_ctx_t ctx) { return (ctx->ifc_softc); } device_t iflib_get_dev(if_ctx_t ctx) { return (ctx->ifc_dev); } if_t iflib_get_ifp(if_ctx_t ctx) { return (ctx->ifc_ifp); } struct ifmedia * iflib_get_media(if_ctx_t ctx) { return (ctx->ifc_mediap); } uint32_t iflib_get_flags(if_ctx_t ctx) { return (ctx->ifc_flags); } void iflib_set_mac(if_ctx_t ctx, uint8_t mac[ETHER_ADDR_LEN]) { bcopy(mac, ctx->ifc_mac.octet, ETHER_ADDR_LEN); } if_softc_ctx_t iflib_get_softc_ctx(if_ctx_t ctx) { return (&ctx->ifc_softc_ctx); } if_shared_ctx_t iflib_get_sctx(if_ctx_t ctx) { return (ctx->ifc_sctx); } #define IP_ALIGNED(m) ((((uintptr_t)(m)->m_data) & 0x3) == 0x2) #define CACHE_PTR_INCREMENT (CACHE_LINE_SIZE/sizeof(void*)) #define CACHE_PTR_NEXT(ptr) ((void *)(((uintptr_t)(ptr)+CACHE_LINE_SIZE-1) & (CACHE_LINE_SIZE-1))) #define LINK_ACTIVE(ctx) ((ctx)->ifc_link_state == LINK_STATE_UP) #define CTX_IS_VF(ctx) ((ctx)->ifc_sctx->isc_flags & IFLIB_IS_VF) typedef struct iflib_sw_rx_desc_array { bus_dmamap_t *ifsd_map; /* bus_dma maps for packet */ struct mbuf **ifsd_m; /* pkthdr mbufs */ caddr_t *ifsd_cl; /* direct 
cluster pointer for rx */ bus_addr_t *ifsd_ba; /* bus addr of cluster for rx */ } iflib_rxsd_array_t; typedef struct iflib_sw_tx_desc_array { bus_dmamap_t *ifsd_map; /* bus_dma maps for packet */ bus_dmamap_t *ifsd_tso_map; /* bus_dma maps for TSO packet */ struct mbuf **ifsd_m; /* pkthdr mbufs */ } if_txsd_vec_t; /* magic number that should be high enough for any hardware */ #define IFLIB_MAX_TX_SEGS 128 #define IFLIB_RX_COPY_THRESH 128 #define IFLIB_MAX_RX_REFRESH 32 /* The minimum descriptors per second before we start coalescing */ #define IFLIB_MIN_DESC_SEC 16384 #define IFLIB_DEFAULT_TX_UPDATE_FREQ 16 #define IFLIB_QUEUE_IDLE 0 #define IFLIB_QUEUE_HUNG 1 #define IFLIB_QUEUE_WORKING 2 /* maximum number of txqs that can share an rx interrupt */ #define IFLIB_MAX_TX_SHARED_INTR 4 /* this should really scale with ring size - this is a fairly arbitrary value */ #define TX_BATCH_SIZE 32 #define IFLIB_RESTART_BUDGET 8 #define CSUM_OFFLOAD (CSUM_IP_TSO|CSUM_IP6_TSO|CSUM_IP| \ CSUM_IP_UDP|CSUM_IP_TCP|CSUM_IP_SCTP| \ CSUM_IP6_UDP|CSUM_IP6_TCP|CSUM_IP6_SCTP) struct iflib_txq { qidx_t ift_in_use; qidx_t ift_cidx; qidx_t ift_cidx_processed; qidx_t ift_pidx; uint8_t ift_gen; uint8_t ift_br_offset; uint16_t ift_npending; uint16_t ift_db_pending; uint16_t ift_rs_pending; /* implicit pad */ uint8_t ift_txd_size[8]; uint64_t ift_processed; uint64_t ift_cleaned; uint64_t ift_cleaned_prev; #if MEMORY_LOGGING uint64_t ift_enqueued; uint64_t ift_dequeued; #endif uint64_t ift_no_tx_dma_setup; uint64_t ift_no_desc_avail; uint64_t ift_mbuf_defrag_failed; uint64_t ift_mbuf_defrag; uint64_t ift_map_failed; uint64_t ift_txd_encap_efbig; uint64_t ift_pullups; uint64_t ift_last_timer_tick; struct mtx ift_mtx; struct mtx ift_db_mtx; /* constant values */ if_ctx_t ift_ctx; struct ifmp_ring *ift_br; struct grouptask ift_task; qidx_t ift_size; uint16_t ift_id; struct callout ift_timer; if_txsd_vec_t ift_sds; uint8_t ift_qstatus; uint8_t ift_closed; uint8_t ift_update_freq; struct iflib_filter_info ift_filter_info; bus_dma_tag_t ift_buf_tag; bus_dma_tag_t ift_tso_buf_tag; iflib_dma_info_t ift_ifdi; #define MTX_NAME_LEN 16 char ift_mtx_name[MTX_NAME_LEN]; bus_dma_segment_t ift_segs[IFLIB_MAX_TX_SEGS] __aligned(CACHE_LINE_SIZE); #ifdef IFLIB_DIAGNOSTICS uint64_t ift_cpu_exec_count[256]; #endif } __aligned(CACHE_LINE_SIZE); struct iflib_fl { qidx_t ifl_cidx; qidx_t ifl_pidx; qidx_t ifl_credits; uint8_t ifl_gen; uint8_t ifl_rxd_size; #if MEMORY_LOGGING uint64_t ifl_m_enqueued; uint64_t ifl_m_dequeued; uint64_t ifl_cl_enqueued; uint64_t ifl_cl_dequeued; #endif /* implicit pad */ bitstr_t *ifl_rx_bitmap; qidx_t ifl_fragidx; /* constant */ qidx_t ifl_size; uint16_t ifl_buf_size; uint16_t ifl_cltype; uma_zone_t ifl_zone; iflib_rxsd_array_t ifl_sds; iflib_rxq_t ifl_rxq; uint8_t ifl_id; bus_dma_tag_t ifl_buf_tag; iflib_dma_info_t ifl_ifdi; uint64_t ifl_bus_addrs[IFLIB_MAX_RX_REFRESH] __aligned(CACHE_LINE_SIZE); caddr_t ifl_vm_addrs[IFLIB_MAX_RX_REFRESH]; qidx_t ifl_rxd_idxs[IFLIB_MAX_RX_REFRESH]; } __aligned(CACHE_LINE_SIZE); static inline qidx_t get_inuse(int size, qidx_t cidx, qidx_t pidx, uint8_t gen) { qidx_t used; if (pidx > cidx) used = pidx - cidx; else if (pidx < cidx) used = size - cidx + pidx; else if (gen == 0 && pidx == cidx) used = 0; else if (gen == 1 && pidx == cidx) used = size; else panic("bad state"); return (used); } #define TXQ_AVAIL(txq) (txq->ift_size - get_inuse(txq->ift_size, txq->ift_cidx, txq->ift_pidx, txq->ift_gen)) #define IDXDIFF(head, tail, wrap) \ ((head) >= (tail) ? 
(head) - (tail) : (wrap) - (tail) + (head)) struct iflib_rxq { if_ctx_t ifr_ctx; iflib_fl_t ifr_fl; uint64_t ifr_rx_irq; struct pfil_head *pfil; /* * If there is a separate completion queue (IFLIB_HAS_RXCQ), this is * the command queue consumer index. Otherwise it's unused. */ qidx_t ifr_cq_cidx; uint16_t ifr_id; uint8_t ifr_nfl; uint8_t ifr_ntxqirq; uint8_t ifr_txqid[IFLIB_MAX_TX_SHARED_INTR]; uint8_t ifr_fl_offset; struct lro_ctrl ifr_lc; struct grouptask ifr_task; struct callout ifr_watchdog; struct iflib_filter_info ifr_filter_info; iflib_dma_info_t ifr_ifdi; /* dynamically allocate if any drivers need a value substantially larger than this */ struct if_rxd_frag ifr_frags[IFLIB_MAX_RX_SEGS] __aligned(CACHE_LINE_SIZE); #ifdef IFLIB_DIAGNOSTICS uint64_t ifr_cpu_exec_count[256]; #endif } __aligned(CACHE_LINE_SIZE); typedef struct if_rxsd { caddr_t *ifsd_cl; iflib_fl_t ifsd_fl; qidx_t ifsd_cidx; } *if_rxsd_t; /* multiple of word size */ #ifdef __LP64__ #define PKT_INFO_SIZE 6 #define RXD_INFO_SIZE 5 #define PKT_TYPE uint64_t #else #define PKT_INFO_SIZE 11 #define RXD_INFO_SIZE 8 #define PKT_TYPE uint32_t #endif #define PKT_LOOP_BOUND ((PKT_INFO_SIZE/3)*3) #define RXD_LOOP_BOUND ((RXD_INFO_SIZE/4)*4) typedef struct if_pkt_info_pad { PKT_TYPE pkt_val[PKT_INFO_SIZE]; } *if_pkt_info_pad_t; typedef struct if_rxd_info_pad { PKT_TYPE rxd_val[RXD_INFO_SIZE]; } *if_rxd_info_pad_t; CTASSERT(sizeof(struct if_pkt_info_pad) == sizeof(struct if_pkt_info)); CTASSERT(sizeof(struct if_rxd_info_pad) == sizeof(struct if_rxd_info)); static inline void pkt_info_zero(if_pkt_info_t pi) { if_pkt_info_pad_t pi_pad; pi_pad = (if_pkt_info_pad_t)pi; pi_pad->pkt_val[0] = 0; pi_pad->pkt_val[1] = 0; pi_pad->pkt_val[2] = 0; pi_pad->pkt_val[3] = 0; pi_pad->pkt_val[4] = 0; pi_pad->pkt_val[5] = 0; #ifndef __LP64__ pi_pad->pkt_val[6] = 0; pi_pad->pkt_val[7] = 0; pi_pad->pkt_val[8] = 0; pi_pad->pkt_val[9] = 0; pi_pad->pkt_val[10] = 0; #endif } static device_method_t iflib_pseudo_methods[] = { DEVMETHOD(device_attach, noop_attach), DEVMETHOD(device_detach, iflib_pseudo_detach), DEVMETHOD_END }; driver_t iflib_pseudodriver = { "iflib_pseudo", iflib_pseudo_methods, sizeof(struct iflib_ctx), }; static inline void rxd_info_zero(if_rxd_info_t ri) { if_rxd_info_pad_t ri_pad; int i; ri_pad = (if_rxd_info_pad_t)ri; for (i = 0; i < RXD_LOOP_BOUND; i += 4) { ri_pad->rxd_val[i] = 0; ri_pad->rxd_val[i+1] = 0; ri_pad->rxd_val[i+2] = 0; ri_pad->rxd_val[i+3] = 0; } #ifdef __LP64__ ri_pad->rxd_val[RXD_INFO_SIZE-1] = 0; #endif } /* * Only allow a single packet to take up most 1/nth of the tx ring */ #define MAX_SINGLE_PACKET_FRACTION 12 #define IF_BAD_DMA (bus_addr_t)-1 #define CTX_ACTIVE(ctx) ((if_getdrvflags((ctx)->ifc_ifp) & IFF_DRV_RUNNING)) #define CTX_LOCK_INIT(_sc) sx_init(&(_sc)->ifc_ctx_sx, "iflib ctx lock") #define CTX_LOCK(ctx) sx_xlock(&(ctx)->ifc_ctx_sx) #define CTX_UNLOCK(ctx) sx_xunlock(&(ctx)->ifc_ctx_sx) #define CTX_LOCK_DESTROY(ctx) sx_destroy(&(ctx)->ifc_ctx_sx) #define STATE_LOCK_INIT(_sc, _name) mtx_init(&(_sc)->ifc_state_mtx, _name, "iflib state lock", MTX_DEF) #define STATE_LOCK(ctx) mtx_lock(&(ctx)->ifc_state_mtx) #define STATE_UNLOCK(ctx) mtx_unlock(&(ctx)->ifc_state_mtx) #define STATE_LOCK_DESTROY(ctx) mtx_destroy(&(ctx)->ifc_state_mtx) #define CALLOUT_LOCK(txq) mtx_lock(&txq->ift_mtx) #define CALLOUT_UNLOCK(txq) mtx_unlock(&txq->ift_mtx) void iflib_set_detach(if_ctx_t ctx) { STATE_LOCK(ctx); ctx->ifc_flags |= IFC_IN_DETACH; STATE_UNLOCK(ctx); } /* Our boot-time initialization hook */ static int 
iflib_module_event_handler(module_t, int, void *); static moduledata_t iflib_moduledata = { "iflib", iflib_module_event_handler, NULL }; DECLARE_MODULE(iflib, iflib_moduledata, SI_SUB_INIT_IF, SI_ORDER_ANY); MODULE_VERSION(iflib, 1); MODULE_DEPEND(iflib, pci, 1, 1, 1); MODULE_DEPEND(iflib, ether, 1, 1, 1); TASKQGROUP_DEFINE(if_io_tqg, mp_ncpus, 1); TASKQGROUP_DEFINE(if_config_tqg, 1, 1); #ifndef IFLIB_DEBUG_COUNTERS #ifdef INVARIANTS #define IFLIB_DEBUG_COUNTERS 1 #else #define IFLIB_DEBUG_COUNTERS 0 #endif /* !INVARIANTS */ #endif static SYSCTL_NODE(_net, OID_AUTO, iflib, CTLFLAG_RD, 0, "iflib driver parameters"); /* * XXX need to ensure that this can't accidentally cause the head to be moved backwards */ static int iflib_min_tx_latency = 0; SYSCTL_INT(_net_iflib, OID_AUTO, min_tx_latency, CTLFLAG_RW, &iflib_min_tx_latency, 0, "minimize transmit latency at the possible expense of throughput"); static int iflib_no_tx_batch = 0; SYSCTL_INT(_net_iflib, OID_AUTO, no_tx_batch, CTLFLAG_RW, &iflib_no_tx_batch, 0, "minimize transmit latency at the possible expense of throughput"); #if IFLIB_DEBUG_COUNTERS static int iflib_tx_seen; static int iflib_tx_sent; static int iflib_tx_encap; static int iflib_rx_allocs; static int iflib_fl_refills; static int iflib_fl_refills_large; static int iflib_tx_frees; SYSCTL_INT(_net_iflib, OID_AUTO, tx_seen, CTLFLAG_RD, &iflib_tx_seen, 0, "# TX mbufs seen"); SYSCTL_INT(_net_iflib, OID_AUTO, tx_sent, CTLFLAG_RD, &iflib_tx_sent, 0, "# TX mbufs sent"); SYSCTL_INT(_net_iflib, OID_AUTO, tx_encap, CTLFLAG_RD, &iflib_tx_encap, 0, "# TX mbufs encapped"); SYSCTL_INT(_net_iflib, OID_AUTO, tx_frees, CTLFLAG_RD, &iflib_tx_frees, 0, "# TX frees"); SYSCTL_INT(_net_iflib, OID_AUTO, rx_allocs, CTLFLAG_RD, &iflib_rx_allocs, 0, "# RX allocations"); SYSCTL_INT(_net_iflib, OID_AUTO, fl_refills, CTLFLAG_RD, &iflib_fl_refills, 0, "# refills"); SYSCTL_INT(_net_iflib, OID_AUTO, fl_refills_large, CTLFLAG_RD, &iflib_fl_refills_large, 0, "# large refills"); static int iflib_txq_drain_flushing; static int iflib_txq_drain_oactive; static int iflib_txq_drain_notready; SYSCTL_INT(_net_iflib, OID_AUTO, txq_drain_flushing, CTLFLAG_RD, &iflib_txq_drain_flushing, 0, "# drain flushes"); SYSCTL_INT(_net_iflib, OID_AUTO, txq_drain_oactive, CTLFLAG_RD, &iflib_txq_drain_oactive, 0, "# drain oactives"); SYSCTL_INT(_net_iflib, OID_AUTO, txq_drain_notready, CTLFLAG_RD, &iflib_txq_drain_notready, 0, "# drain notready"); static int iflib_encap_load_mbuf_fail; static int iflib_encap_pad_mbuf_fail; static int iflib_encap_txq_avail_fail; static int iflib_encap_txd_encap_fail; SYSCTL_INT(_net_iflib, OID_AUTO, encap_load_mbuf_fail, CTLFLAG_RD, &iflib_encap_load_mbuf_fail, 0, "# busdma load failures"); SYSCTL_INT(_net_iflib, OID_AUTO, encap_pad_mbuf_fail, CTLFLAG_RD, &iflib_encap_pad_mbuf_fail, 0, "# runt frame pad failures"); SYSCTL_INT(_net_iflib, OID_AUTO, encap_txq_avail_fail, CTLFLAG_RD, &iflib_encap_txq_avail_fail, 0, "# txq avail failures"); SYSCTL_INT(_net_iflib, OID_AUTO, encap_txd_encap_fail, CTLFLAG_RD, &iflib_encap_txd_encap_fail, 0, "# driver encap failures"); static int iflib_task_fn_rxs; static int iflib_rx_intr_enables; static int iflib_fast_intrs; static int iflib_rx_unavail; static int iflib_rx_ctx_inactive; static int iflib_rx_if_input; static int iflib_rxd_flush; static int iflib_verbose_debug; SYSCTL_INT(_net_iflib, OID_AUTO, task_fn_rx, CTLFLAG_RD, &iflib_task_fn_rxs, 0, "# task_fn_rx calls"); SYSCTL_INT(_net_iflib, OID_AUTO, rx_intr_enables, CTLFLAG_RD, &iflib_rx_intr_enables, 0, "# RX intr 
enables"); SYSCTL_INT(_net_iflib, OID_AUTO, fast_intrs, CTLFLAG_RD, &iflib_fast_intrs, 0, "# fast_intr calls"); SYSCTL_INT(_net_iflib, OID_AUTO, rx_unavail, CTLFLAG_RD, &iflib_rx_unavail, 0, "# times rxeof called with no available data"); SYSCTL_INT(_net_iflib, OID_AUTO, rx_ctx_inactive, CTLFLAG_RD, &iflib_rx_ctx_inactive, 0, "# times rxeof called with inactive context"); SYSCTL_INT(_net_iflib, OID_AUTO, rx_if_input, CTLFLAG_RD, &iflib_rx_if_input, 0, "# times rxeof called if_input"); SYSCTL_INT(_net_iflib, OID_AUTO, rxd_flush, CTLFLAG_RD, &iflib_rxd_flush, 0, "# times rxd_flush called"); SYSCTL_INT(_net_iflib, OID_AUTO, verbose_debug, CTLFLAG_RW, &iflib_verbose_debug, 0, "enable verbose debugging"); #define DBG_COUNTER_INC(name) atomic_add_int(&(iflib_ ## name), 1) static void iflib_debug_reset(void) { iflib_tx_seen = iflib_tx_sent = iflib_tx_encap = iflib_rx_allocs = iflib_fl_refills = iflib_fl_refills_large = iflib_tx_frees = iflib_txq_drain_flushing = iflib_txq_drain_oactive = iflib_txq_drain_notready = iflib_encap_load_mbuf_fail = iflib_encap_pad_mbuf_fail = iflib_encap_txq_avail_fail = iflib_encap_txd_encap_fail = iflib_task_fn_rxs = iflib_rx_intr_enables = iflib_fast_intrs = iflib_rx_unavail = iflib_rx_ctx_inactive = iflib_rx_if_input = iflib_rxd_flush = 0; } #else #define DBG_COUNTER_INC(name) static void iflib_debug_reset(void) {} #endif #define IFLIB_DEBUG 0 static void iflib_tx_structures_free(if_ctx_t ctx); static void iflib_rx_structures_free(if_ctx_t ctx); static int iflib_queues_alloc(if_ctx_t ctx); static int iflib_tx_credits_update(if_ctx_t ctx, iflib_txq_t txq); static int iflib_rxd_avail(if_ctx_t ctx, iflib_rxq_t rxq, qidx_t cidx, qidx_t budget); static int iflib_qset_structures_setup(if_ctx_t ctx); static int iflib_msix_init(if_ctx_t ctx); static int iflib_legacy_setup(if_ctx_t ctx, driver_filter_t filter, void *filterarg, int *rid, const char *str); static void iflib_txq_check_drain(iflib_txq_t txq, int budget); static uint32_t iflib_txq_can_drain(struct ifmp_ring *); #ifdef ALTQ static void iflib_altq_if_start(if_t ifp); static int iflib_altq_if_transmit(if_t ifp, struct mbuf *m); #endif static int iflib_register(if_ctx_t); static void iflib_deregister(if_ctx_t); static void iflib_unregister_vlan_handlers(if_ctx_t ctx); static void iflib_init_locked(if_ctx_t ctx); static void iflib_add_device_sysctl_pre(if_ctx_t ctx); static void iflib_add_device_sysctl_post(if_ctx_t ctx); static void iflib_ifmp_purge(iflib_txq_t txq); static void _iflib_pre_assert(if_softc_ctx_t scctx); static void iflib_if_init_locked(if_ctx_t ctx); static void iflib_free_intr_mem(if_ctx_t ctx); #ifndef __NO_STRICT_ALIGNMENT static struct mbuf * iflib_fixup_rx(struct mbuf *m); #endif static SLIST_HEAD(cpu_offset_list, cpu_offset) cpu_offsets = SLIST_HEAD_INITIALIZER(cpu_offsets); struct cpu_offset { SLIST_ENTRY(cpu_offset) entries; cpuset_t set; unsigned int refcount; uint16_t offset; }; static struct mtx cpu_offset_mtx; MTX_SYSINIT(iflib_cpu_offset, &cpu_offset_mtx, "iflib_cpu_offset lock", MTX_DEF); DEBUGNET_DEFINE(iflib); #ifdef DEV_NETMAP #include #include #include MODULE_DEPEND(iflib, netmap, 1, 1, 1); static int netmap_fl_refill(iflib_rxq_t rxq, struct netmap_kring *kring, uint32_t nm_i, bool init); /* * device-specific sysctl variables: * * iflib_crcstrip: 0: keep CRC in rx frames (default), 1: strip it. * During regular operations the CRC is stripped, but on some * hardware reception of frames not multiple of 64 is slower, * so using crcstrip=0 helps in benchmarks. 
* * iflib_rx_miss, iflib_rx_miss_bufs: * count packets that might be missed due to lost interrupts. */ SYSCTL_DECL(_dev_netmap); /* * The xl driver by default strips CRCs and we do not override it. */ int iflib_crcstrip = 1; SYSCTL_INT(_dev_netmap, OID_AUTO, iflib_crcstrip, CTLFLAG_RW, &iflib_crcstrip, 1, "strip CRC on RX frames"); int iflib_rx_miss, iflib_rx_miss_bufs; SYSCTL_INT(_dev_netmap, OID_AUTO, iflib_rx_miss, CTLFLAG_RW, &iflib_rx_miss, 0, "potentially missed RX intr"); SYSCTL_INT(_dev_netmap, OID_AUTO, iflib_rx_miss_bufs, CTLFLAG_RW, &iflib_rx_miss_bufs, 0, "potentially missed RX intr bufs"); /* * Register/unregister. We are already under netmap lock. * Only called on the first register or the last unregister. */ static int iflib_netmap_register(struct netmap_adapter *na, int onoff) { if_t ifp = na->ifp; if_ctx_t ctx = ifp->if_softc; int status; CTX_LOCK(ctx); IFDI_INTR_DISABLE(ctx); /* Tell the stack that the interface is no longer active */ ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE); if (!CTX_IS_VF(ctx)) IFDI_CRCSTRIP_SET(ctx, onoff, iflib_crcstrip); /* enable or disable flags and callbacks in na and ifp */ if (onoff) { nm_set_native_flags(na); } else { nm_clear_native_flags(na); } iflib_stop(ctx); iflib_init_locked(ctx); IFDI_CRCSTRIP_SET(ctx, onoff, iflib_crcstrip); // XXX why twice ? status = ifp->if_drv_flags & IFF_DRV_RUNNING ? 0 : 1; if (status) nm_clear_native_flags(na); CTX_UNLOCK(ctx); return (status); } static int netmap_fl_refill(iflib_rxq_t rxq, struct netmap_kring *kring, uint32_t nm_i, bool init) { struct netmap_adapter *na = kring->na; u_int const lim = kring->nkr_num_slots - 1; u_int head = kring->rhead; struct netmap_ring *ring = kring->ring; bus_dmamap_t *map; struct if_rxd_update iru; if_ctx_t ctx = rxq->ifr_ctx; iflib_fl_t fl = &rxq->ifr_fl[0]; uint32_t refill_pidx, nic_i; #if IFLIB_DEBUG_COUNTERS int rf_count = 0; #endif if (nm_i == head && __predict_true(!init)) return 0; iru_init(&iru, rxq, 0 /* flid */); map = fl->ifl_sds.ifsd_map; refill_pidx = netmap_idx_k2n(kring, nm_i); /* * IMPORTANT: we must leave one free slot in the ring, * so move head back by one unit */ head = nm_prev(head, lim); nic_i = UINT_MAX; DBG_COUNTER_INC(fl_refills); while (nm_i != head) { #if IFLIB_DEBUG_COUNTERS if (++rf_count == 9) DBG_COUNTER_INC(fl_refills_large); #endif for (int tmp_pidx = 0; tmp_pidx < IFLIB_MAX_RX_REFRESH && nm_i != head; tmp_pidx++) { struct netmap_slot *slot = &ring->slot[nm_i]; void *addr = PNMB(na, slot, &fl->ifl_bus_addrs[tmp_pidx]); uint32_t nic_i_dma = refill_pidx; nic_i = netmap_idx_k2n(kring, nm_i); MPASS(tmp_pidx < IFLIB_MAX_RX_REFRESH); if (addr == NETMAP_BUF_BASE(na)) /* bad buf */ return netmap_ring_reinit(kring); fl->ifl_vm_addrs[tmp_pidx] = addr; if (__predict_false(init)) { netmap_load_map(na, fl->ifl_buf_tag, map[nic_i], addr); } else if (slot->flags & NS_BUF_CHANGED) { /* buffer has changed, reload map */ netmap_reload_map(na, fl->ifl_buf_tag, map[nic_i], addr); } slot->flags &= ~NS_BUF_CHANGED; nm_i = nm_next(nm_i, lim); fl->ifl_rxd_idxs[tmp_pidx] = nic_i = nm_next(nic_i, lim); if (nm_i != head && tmp_pidx < IFLIB_MAX_RX_REFRESH-1) continue; iru.iru_pidx = refill_pidx; iru.iru_count = tmp_pidx+1; ctx->isc_rxd_refill(ctx->ifc_softc, &iru); refill_pidx = nic_i; for (int n = 0; n < iru.iru_count; n++) { bus_dmamap_sync(fl->ifl_buf_tag, map[nic_i_dma], BUS_DMASYNC_PREREAD); /* XXX - change this to not use the netmap func*/ nic_i_dma = nm_next(nic_i_dma, lim); } } } kring->nr_hwcur = head; bus_dmamap_sync(fl->ifl_ifdi->idi_tag, 
fl->ifl_ifdi->idi_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); if (__predict_true(nic_i != UINT_MAX)) { ctx->isc_rxd_flush(ctx->ifc_softc, rxq->ifr_id, fl->ifl_id, nic_i); DBG_COUNTER_INC(rxd_flush); } return (0); } /* * Reconcile kernel and user view of the transmit ring. * * All information is in the kring. * Userspace wants to send packets up to the one before kring->rhead, * kernel knows kring->nr_hwcur is the first unsent packet. * * Here we push packets out (as many as possible), and possibly * reclaim buffers from previously completed transmission. * * The caller (netmap) guarantees that there is only one instance * running at any time. Any interference with other driver * methods should be handled by the individual drivers. */ static int iflib_netmap_txsync(struct netmap_kring *kring, int flags) { struct netmap_adapter *na = kring->na; if_t ifp = na->ifp; struct netmap_ring *ring = kring->ring; u_int nm_i; /* index into the netmap kring */ u_int nic_i; /* index into the NIC ring */ u_int n; u_int const lim = kring->nkr_num_slots - 1; u_int const head = kring->rhead; struct if_pkt_info pi; /* * interrupts on every tx packet are expensive so request * them every half ring, or where NS_REPORT is set */ u_int report_frequency = kring->nkr_num_slots >> 1; /* device-specific */ if_ctx_t ctx = ifp->if_softc; iflib_txq_t txq = &ctx->ifc_txqs[kring->ring_id]; bus_dmamap_sync(txq->ift_ifdi->idi_tag, txq->ift_ifdi->idi_map, BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); /* * First part: process new packets to send. * nm_i is the current index in the netmap kring, * nic_i is the corresponding index in the NIC ring. * * If we have packets to send (nm_i != head) * iterate over the netmap ring, fetch length and update * the corresponding slot in the NIC ring. Some drivers also * need to update the buffer's physical address in the NIC slot * even NS_BUF_CHANGED is not set (PNMB computes the addresses). * * The netmap_reload_map() calls is especially expensive, * even when (as in this case) the tag is 0, so do only * when the buffer has actually changed. * * If possible do not set the report/intr bit on all slots, * but only a few times per ring or when NS_REPORT is set. * * Finally, on 10G and faster drivers, it might be useful * to prefetch the next slot and txr entry. */ nm_i = kring->nr_hwcur; if (nm_i != head) { /* we have new packets to send */ pkt_info_zero(&pi); pi.ipi_segs = txq->ift_segs; pi.ipi_qsidx = kring->ring_id; nic_i = netmap_idx_k2n(kring, nm_i); __builtin_prefetch(&ring->slot[nm_i]); __builtin_prefetch(&txq->ift_sds.ifsd_m[nic_i]); __builtin_prefetch(&txq->ift_sds.ifsd_map[nic_i]); for (n = 0; nm_i != head; n++) { struct netmap_slot *slot = &ring->slot[nm_i]; u_int len = slot->len; uint64_t paddr; void *addr = PNMB(na, slot, &paddr); int flags = (slot->flags & NS_REPORT || nic_i == 0 || nic_i == report_frequency) ? IPI_TX_INTR : 0; /* device-specific */ pi.ipi_len = len; pi.ipi_segs[0].ds_addr = paddr; pi.ipi_segs[0].ds_len = len; pi.ipi_nsegs = 1; pi.ipi_ndescs = 0; pi.ipi_pidx = nic_i; pi.ipi_flags = flags; /* Fill the slot in the NIC ring. 
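			 * The driver's txd_encap callback is handed a single
			 * contiguous segment (ds_addr/ds_len) and is expected
			 * to write its descriptor(s) at index nic_i;
			 * IPI_TX_INTR in ipi_flags requests a completion
			 * interrupt for this slot.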
*/ ctx->isc_txd_encap(ctx->ifc_softc, &pi); DBG_COUNTER_INC(tx_encap); /* prefetch for next round */ __builtin_prefetch(&ring->slot[nm_i + 1]); __builtin_prefetch(&txq->ift_sds.ifsd_m[nic_i + 1]); __builtin_prefetch(&txq->ift_sds.ifsd_map[nic_i + 1]); NM_CHECK_ADDR_LEN(na, addr, len); if (slot->flags & NS_BUF_CHANGED) { /* buffer has changed, reload map */ netmap_reload_map(na, txq->ift_buf_tag, txq->ift_sds.ifsd_map[nic_i], addr); } /* make sure changes to the buffer are synced */ bus_dmamap_sync(txq->ift_buf_tag, txq->ift_sds.ifsd_map[nic_i], BUS_DMASYNC_PREWRITE); slot->flags &= ~(NS_REPORT | NS_BUF_CHANGED); nm_i = nm_next(nm_i, lim); nic_i = nm_next(nic_i, lim); } kring->nr_hwcur = nm_i; /* synchronize the NIC ring */ bus_dmamap_sync(txq->ift_ifdi->idi_tag, txq->ift_ifdi->idi_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); /* (re)start the tx unit up to slot nic_i (excluded) */ ctx->isc_txd_flush(ctx->ifc_softc, txq->ift_id, nic_i); } /* * Second part: reclaim buffers for completed transmissions. * * If there are unclaimed buffers, attempt to reclaim them. * If none are reclaimed, and TX IRQs are not in use, do an initial * minimal delay, then trigger the tx handler which will spin in the * group task queue. */ if (kring->nr_hwtail != nm_prev(kring->nr_hwcur, lim)) { if (iflib_tx_credits_update(ctx, txq)) { /* some tx completed, increment avail */ nic_i = txq->ift_cidx_processed; kring->nr_hwtail = nm_prev(netmap_idx_n2k(kring, nic_i), lim); } } if (!(ctx->ifc_flags & IFC_NETMAP_TX_IRQ)) if (kring->nr_hwtail != nm_prev(kring->nr_hwcur, lim)) { callout_reset_on(&txq->ift_timer, hz < 2000 ? 1 : hz / 1000, iflib_timer, txq, txq->ift_timer.c_cpu); } return (0); } /* * Reconcile kernel and user view of the receive ring. * Same as for the txsync, this routine must be efficient. * The caller guarantees a single invocations, but races against * the rest of the driver should be handled here. * * On call, kring->rhead is the first packet that userspace wants * to keep, and kring->rcur is the wakeup point. * The kernel has previously reported packets up to kring->rtail. * * If (flags & NAF_FORCE_READ) also check for incoming packets irrespective * of whether or not we received an interrupt. */ static int iflib_netmap_rxsync(struct netmap_kring *kring, int flags) { struct netmap_adapter *na = kring->na; struct netmap_ring *ring = kring->ring; if_t ifp = na->ifp; iflib_fl_t fl; uint32_t nm_i; /* index into the netmap ring */ uint32_t nic_i; /* index into the NIC ring */ u_int i, n; u_int const lim = kring->nkr_num_slots - 1; u_int const head = kring->rhead; int force_update = (flags & NAF_FORCE_READ) || kring->nr_kflags & NKR_PENDINTR; struct if_rxd_info ri; if_ctx_t ctx = ifp->if_softc; iflib_rxq_t rxq = &ctx->ifc_rxqs[kring->ring_id]; if (head > lim) return netmap_ring_reinit(kring); /* * XXX netmap_fl_refill() only ever (re)fills free list 0 so far. */ for (i = 0, fl = rxq->ifr_fl; i < rxq->ifr_nfl; i++, fl++) { bus_dmamap_sync(fl->ifl_ifdi->idi_tag, fl->ifl_ifdi->idi_map, BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); } /* * First part: import newly received packets. * * nm_i is the index of the next free slot in the netmap ring, * nic_i is the index of the next received packet in the NIC ring, * and they may differ in case if_init() has been called while * in netmap mode. 
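	 * The difference is just the constant ring offset nkr_hwofs: with,
	 * say, 1024 slots and nkr_hwofs == 3, NIC index 1021 corresponds to
	 * netmap index 0 under the relation below.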
For the receive ring we have * * nic_i = rxr->next_check; * nm_i = kring->nr_hwtail (previous) * and * nm_i == (nic_i + kring->nkr_hwofs) % ring_size * * rxr->next_check is set to 0 on a ring reinit */ if (netmap_no_pendintr || force_update) { int crclen = iflib_crcstrip ? 0 : 4; int error, avail; for (i = 0; i < rxq->ifr_nfl; i++) { fl = &rxq->ifr_fl[i]; nic_i = fl->ifl_cidx; nm_i = netmap_idx_n2k(kring, nic_i); avail = ctx->isc_rxd_available(ctx->ifc_softc, rxq->ifr_id, nic_i, USHRT_MAX); for (n = 0; avail > 0; n++, avail--) { rxd_info_zero(&ri); ri.iri_frags = rxq->ifr_frags; ri.iri_qsidx = kring->ring_id; ri.iri_ifp = ctx->ifc_ifp; ri.iri_cidx = nic_i; error = ctx->isc_rxd_pkt_get(ctx->ifc_softc, &ri); ring->slot[nm_i].len = error ? 0 : ri.iri_len - crclen; ring->slot[nm_i].flags = 0; bus_dmamap_sync(fl->ifl_buf_tag, fl->ifl_sds.ifsd_map[nic_i], BUS_DMASYNC_POSTREAD); nm_i = nm_next(nm_i, lim); nic_i = nm_next(nic_i, lim); } if (n) { /* update the state variables */ if (netmap_no_pendintr && !force_update) { /* diagnostics */ iflib_rx_miss ++; iflib_rx_miss_bufs += n; } fl->ifl_cidx = nic_i; kring->nr_hwtail = nm_i; } kring->nr_kflags &= ~NKR_PENDINTR; } } /* * Second part: skip past packets that userspace has released. * (kring->nr_hwcur to head excluded), * and make the buffers available for reception. * As usual nm_i is the index in the netmap ring, * nic_i is the index in the NIC ring, and * nm_i == (nic_i + kring->nkr_hwofs) % ring_size */ /* XXX not sure how this will work with multiple free lists */ nm_i = kring->nr_hwcur; return (netmap_fl_refill(rxq, kring, nm_i, false)); } static void iflib_netmap_intr(struct netmap_adapter *na, int onoff) { if_ctx_t ctx = na->ifp->if_softc; CTX_LOCK(ctx); if (onoff) { IFDI_INTR_ENABLE(ctx); } else { IFDI_INTR_DISABLE(ctx); } CTX_UNLOCK(ctx); } static int iflib_netmap_attach(if_ctx_t ctx) { struct netmap_adapter na; if_softc_ctx_t scctx = &ctx->ifc_softc_ctx; bzero(&na, sizeof(na)); na.ifp = ctx->ifc_ifp; na.na_flags = NAF_BDG_MAYSLEEP; MPASS(ctx->ifc_softc_ctx.isc_ntxqsets); MPASS(ctx->ifc_softc_ctx.isc_nrxqsets); na.num_tx_desc = scctx->isc_ntxd[0]; na.num_rx_desc = scctx->isc_nrxd[0]; na.nm_txsync = iflib_netmap_txsync; na.nm_rxsync = iflib_netmap_rxsync; na.nm_register = iflib_netmap_register; na.nm_intr = iflib_netmap_intr; na.num_tx_rings = ctx->ifc_softc_ctx.isc_ntxqsets; na.num_rx_rings = ctx->ifc_softc_ctx.isc_nrxqsets; return (netmap_attach(&na)); } static void iflib_netmap_txq_init(if_ctx_t ctx, iflib_txq_t txq) { struct netmap_adapter *na = NA(ctx->ifc_ifp); struct netmap_slot *slot; slot = netmap_reset(na, NR_TX, txq->ift_id, 0); if (slot == NULL) return; for (int i = 0; i < ctx->ifc_softc_ctx.isc_ntxd[0]; i++) { /* * In netmap mode, set the map for the packet buffer. * NOTE: Some drivers (not this one) also need to set * the physical buffer address in the NIC ring. 
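		 * Only the bus_dma map is preloaded for each slot here;
		 * txsync later reloads a map only when the slot is flagged
		 * NS_BUF_CHANGED.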
* netmap_idx_n2k() maps a nic index, i, into the corresponding * netmap slot index, si */ int si = netmap_idx_n2k(na->tx_rings[txq->ift_id], i); netmap_load_map(na, txq->ift_buf_tag, txq->ift_sds.ifsd_map[i], NMB(na, slot + si)); } } static void iflib_netmap_rxq_init(if_ctx_t ctx, iflib_rxq_t rxq) { struct netmap_adapter *na = NA(ctx->ifc_ifp); struct netmap_kring *kring = na->rx_rings[rxq->ifr_id]; struct netmap_slot *slot; uint32_t nm_i; slot = netmap_reset(na, NR_RX, rxq->ifr_id, 0); if (slot == NULL) return; nm_i = netmap_idx_n2k(kring, 0); netmap_fl_refill(rxq, kring, nm_i, true); } static void iflib_netmap_timer_adjust(if_ctx_t ctx, iflib_txq_t txq, uint32_t *reset_on) { struct netmap_kring *kring; uint16_t txqid; txqid = txq->ift_id; kring = NA(ctx->ifc_ifp)->tx_rings[txqid]; if (kring->nr_hwcur != nm_next(kring->nr_hwtail, kring->nkr_num_slots - 1)) { bus_dmamap_sync(txq->ift_ifdi->idi_tag, txq->ift_ifdi->idi_map, BUS_DMASYNC_POSTREAD); if (ctx->isc_txd_credits_update(ctx->ifc_softc, txqid, false)) netmap_tx_irq(ctx->ifc_ifp, txqid); if (!(ctx->ifc_flags & IFC_NETMAP_TX_IRQ)) { if (hz < 2000) *reset_on = 1; else *reset_on = hz / 1000; } } } #define iflib_netmap_detach(ifp) netmap_detach(ifp) #else #define iflib_netmap_txq_init(ctx, txq) #define iflib_netmap_rxq_init(ctx, rxq) #define iflib_netmap_detach(ifp) #define iflib_netmap_attach(ctx) (0) #define netmap_rx_irq(ifp, qid, budget) (0) #define netmap_tx_irq(ifp, qid) do {} while (0) #define iflib_netmap_timer_adjust(ctx, txq, reset_on) #endif #if defined(__i386__) || defined(__amd64__) static __inline void prefetch(void *x) { __asm volatile("prefetcht0 %0" :: "m" (*(unsigned long *)x)); } static __inline void prefetch2cachelines(void *x) { __asm volatile("prefetcht0 %0" :: "m" (*(unsigned long *)x)); #if (CACHE_LINE_SIZE < 128) __asm volatile("prefetcht0 %0" :: "m" (*(((unsigned long *)x)+CACHE_LINE_SIZE/(sizeof(unsigned long))))); #endif } #else #define prefetch(x) #define prefetch2cachelines(x) #endif static void iru_init(if_rxd_update_t iru, iflib_rxq_t rxq, uint8_t flid) { iflib_fl_t fl; fl = &rxq->ifr_fl[flid]; iru->iru_paddrs = fl->ifl_bus_addrs; iru->iru_vaddrs = &fl->ifl_vm_addrs[0]; iru->iru_idxs = fl->ifl_rxd_idxs; iru->iru_qsidx = rxq->ifr_id; iru->iru_buf_size = fl->ifl_buf_size; iru->iru_flidx = fl->ifl_id; } static void _iflib_dmamap_cb(void *arg, bus_dma_segment_t *segs, int nseg, int err) { if (err) return; *(bus_addr_t *) arg = segs[0].ds_addr; } int iflib_dma_alloc_align(if_ctx_t ctx, int size, int align, iflib_dma_info_t dma, int mapflags) { int err; device_t dev = ctx->ifc_dev; err = bus_dma_tag_create(bus_get_dma_tag(dev), /* parent */ align, 0, /* alignment, bounds */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ size, /* maxsize */ 1, /* nsegments */ size, /* maxsegsize */ BUS_DMA_ALLOCNOW, /* flags */ NULL, /* lockfunc */ NULL, /* lockarg */ &dma->idi_tag); if (err) { device_printf(dev, "%s: bus_dma_tag_create failed: %d\n", __func__, err); goto fail_0; } err = bus_dmamem_alloc(dma->idi_tag, (void**) &dma->idi_vaddr, BUS_DMA_NOWAIT | BUS_DMA_COHERENT | BUS_DMA_ZERO, &dma->idi_map); if (err) { device_printf(dev, "%s: bus_dmamem_alloc(%ju) failed: %d\n", __func__, (uintmax_t)size, err); goto fail_1; } dma->idi_paddr = IF_BAD_DMA; err = bus_dmamap_load(dma->idi_tag, dma->idi_map, dma->idi_vaddr, size, _iflib_dmamap_cb, &dma->idi_paddr, mapflags | BUS_DMA_NOWAIT); if (err || dma->idi_paddr == IF_BAD_DMA) { device_printf(dev, "%s: bus_dmamap_load failed: 
%d\n", __func__, err); goto fail_2; } dma->idi_size = size; return (0); fail_2: bus_dmamem_free(dma->idi_tag, dma->idi_vaddr, dma->idi_map); fail_1: bus_dma_tag_destroy(dma->idi_tag); fail_0: dma->idi_tag = NULL; return (err); } int iflib_dma_alloc(if_ctx_t ctx, int size, iflib_dma_info_t dma, int mapflags) { if_shared_ctx_t sctx = ctx->ifc_sctx; KASSERT(sctx->isc_q_align != 0, ("alignment value not initialized")); return (iflib_dma_alloc_align(ctx, size, sctx->isc_q_align, dma, mapflags)); } int iflib_dma_alloc_multi(if_ctx_t ctx, int *sizes, iflib_dma_info_t *dmalist, int mapflags, int count) { int i, err; iflib_dma_info_t *dmaiter; dmaiter = dmalist; for (i = 0; i < count; i++, dmaiter++) { if ((err = iflib_dma_alloc(ctx, sizes[i], *dmaiter, mapflags)) != 0) break; } if (err) iflib_dma_free_multi(dmalist, i); return (err); } void iflib_dma_free(iflib_dma_info_t dma) { if (dma->idi_tag == NULL) return; if (dma->idi_paddr != IF_BAD_DMA) { bus_dmamap_sync(dma->idi_tag, dma->idi_map, BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(dma->idi_tag, dma->idi_map); dma->idi_paddr = IF_BAD_DMA; } if (dma->idi_vaddr != NULL) { bus_dmamem_free(dma->idi_tag, dma->idi_vaddr, dma->idi_map); dma->idi_vaddr = NULL; } bus_dma_tag_destroy(dma->idi_tag); dma->idi_tag = NULL; } void iflib_dma_free_multi(iflib_dma_info_t *dmalist, int count) { int i; iflib_dma_info_t *dmaiter = dmalist; for (i = 0; i < count; i++, dmaiter++) iflib_dma_free(*dmaiter); } #ifdef EARLY_AP_STARTUP static const int iflib_started = 1; #else /* * We used to abuse the smp_started flag to decide if the queues have been * fully initialized (by late taskqgroup_adjust() calls in a SYSINIT()). * That gave bad races, since the SYSINIT() runs strictly after smp_started * is set. Run a SYSINIT() strictly after that to just set a usable * completion flag. 
*/ static int iflib_started; static void iflib_record_started(void *arg) { iflib_started = 1; } SYSINIT(iflib_record_started, SI_SUB_SMP + 1, SI_ORDER_FIRST, iflib_record_started, NULL); #endif static int iflib_fast_intr(void *arg) { iflib_filter_info_t info = arg; struct grouptask *gtask = info->ifi_task; int result; if (!iflib_started) return (FILTER_STRAY); DBG_COUNTER_INC(fast_intrs); if (info->ifi_filter != NULL) { result = info->ifi_filter(info->ifi_filter_arg); if ((result & FILTER_SCHEDULE_THREAD) == 0) return (result); } GROUPTASK_ENQUEUE(gtask); return (FILTER_HANDLED); } static int iflib_fast_intr_rxtx(void *arg) { iflib_filter_info_t info = arg; struct grouptask *gtask = info->ifi_task; if_ctx_t ctx; iflib_rxq_t rxq = (iflib_rxq_t)info->ifi_ctx; iflib_txq_t txq; void *sc; int i, cidx, result; qidx_t txqid; bool intr_enable, intr_legacy; if (!iflib_started) return (FILTER_STRAY); DBG_COUNTER_INC(fast_intrs); if (info->ifi_filter != NULL) { result = info->ifi_filter(info->ifi_filter_arg); if ((result & FILTER_SCHEDULE_THREAD) == 0) return (result); } ctx = rxq->ifr_ctx; sc = ctx->ifc_softc; intr_enable = false; intr_legacy = !!(ctx->ifc_flags & IFC_LEGACY); MPASS(rxq->ifr_ntxqirq); for (i = 0; i < rxq->ifr_ntxqirq; i++) { txqid = rxq->ifr_txqid[i]; txq = &ctx->ifc_txqs[txqid]; bus_dmamap_sync(txq->ift_ifdi->idi_tag, txq->ift_ifdi->idi_map, BUS_DMASYNC_POSTREAD); if (!ctx->isc_txd_credits_update(sc, txqid, false)) { if (intr_legacy) intr_enable = true; else IFDI_TX_QUEUE_INTR_ENABLE(ctx, txqid); continue; } GROUPTASK_ENQUEUE(&txq->ift_task); } if (ctx->ifc_sctx->isc_flags & IFLIB_HAS_RXCQ) cidx = rxq->ifr_cq_cidx; else cidx = rxq->ifr_fl[0].ifl_cidx; if (iflib_rxd_avail(ctx, rxq, cidx, 1)) GROUPTASK_ENQUEUE(gtask); else { if (intr_legacy) intr_enable = true; else IFDI_RX_QUEUE_INTR_ENABLE(ctx, rxq->ifr_id); DBG_COUNTER_INC(rx_intr_enables); } if (intr_enable) IFDI_INTR_ENABLE(ctx); return (FILTER_HANDLED); } static int iflib_fast_intr_ctx(void *arg) { iflib_filter_info_t info = arg; struct grouptask *gtask = info->ifi_task; int result; if (!iflib_started) return (FILTER_STRAY); DBG_COUNTER_INC(fast_intrs); if (info->ifi_filter != NULL) { result = info->ifi_filter(info->ifi_filter_arg); if ((result & FILTER_SCHEDULE_THREAD) == 0) return (result); } GROUPTASK_ENQUEUE(gtask); return (FILTER_HANDLED); } static int _iflib_irq_alloc(if_ctx_t ctx, if_irq_t irq, int rid, driver_filter_t filter, driver_intr_t handler, void *arg, const char *name) { struct resource *res; void *tag = NULL; device_t dev = ctx->ifc_dev; int flags, i, rc; flags = RF_ACTIVE; if (ctx->ifc_flags & IFC_LEGACY) flags |= RF_SHAREABLE; MPASS(rid < 512); i = rid; res = bus_alloc_resource_any(dev, SYS_RES_IRQ, &i, flags); if (res == NULL) { device_printf(dev, "failed to allocate IRQ for rid %d, name %s.\n", rid, name); return (ENOMEM); } irq->ii_res = res; KASSERT(filter == NULL || handler == NULL, ("filter and handler can't both be non-NULL")); rc = bus_setup_intr(dev, res, INTR_MPSAFE | INTR_TYPE_NET, filter, handler, arg, &tag); if (rc != 0) { device_printf(dev, "failed to setup interrupt for rid %d, name %s: %d\n", rid, name ? name : "unknown", rc); return (rc); } else if (name) bus_describe_intr(dev, res, tag, "%s", name); irq->ii_tag = tag; return (0); } /********************************************************************* * * Allocate DMA resources for TX buffers as well as memory for the TX * mbuf map. 
TX DMA maps (non-TSO/TSO) and TX mbuf map are kept in a * iflib_sw_tx_desc_array structure, storing all the information that * is needed to transmit a packet on the wire. This is called only * once at attach, setup is done every reset. * **********************************************************************/ static int iflib_txsd_alloc(iflib_txq_t txq) { if_ctx_t ctx = txq->ift_ctx; if_shared_ctx_t sctx = ctx->ifc_sctx; if_softc_ctx_t scctx = &ctx->ifc_softc_ctx; device_t dev = ctx->ifc_dev; bus_size_t tsomaxsize; int err, nsegments, ntsosegments; bool tso; nsegments = scctx->isc_tx_nsegments; ntsosegments = scctx->isc_tx_tso_segments_max; tsomaxsize = scctx->isc_tx_tso_size_max; if (if_getcapabilities(ctx->ifc_ifp) & IFCAP_VLAN_MTU) tsomaxsize += sizeof(struct ether_vlan_header); MPASS(scctx->isc_ntxd[0] > 0); MPASS(scctx->isc_ntxd[txq->ift_br_offset] > 0); MPASS(nsegments > 0); if (if_getcapabilities(ctx->ifc_ifp) & IFCAP_TSO) { MPASS(ntsosegments > 0); MPASS(sctx->isc_tso_maxsize >= tsomaxsize); } /* * Set up DMA tags for TX buffers. */ if ((err = bus_dma_tag_create(bus_get_dma_tag(dev), 1, 0, /* alignment, bounds */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ sctx->isc_tx_maxsize, /* maxsize */ nsegments, /* nsegments */ sctx->isc_tx_maxsegsize, /* maxsegsize */ 0, /* flags */ NULL, /* lockfunc */ NULL, /* lockfuncarg */ &txq->ift_buf_tag))) { device_printf(dev,"Unable to allocate TX DMA tag: %d\n", err); device_printf(dev,"maxsize: %ju nsegments: %d maxsegsize: %ju\n", (uintmax_t)sctx->isc_tx_maxsize, nsegments, (uintmax_t)sctx->isc_tx_maxsegsize); goto fail; } tso = (if_getcapabilities(ctx->ifc_ifp) & IFCAP_TSO) != 0; if (tso && (err = bus_dma_tag_create(bus_get_dma_tag(dev), 1, 0, /* alignment, bounds */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ tsomaxsize, /* maxsize */ ntsosegments, /* nsegments */ sctx->isc_tso_maxsegsize,/* maxsegsize */ 0, /* flags */ NULL, /* lockfunc */ NULL, /* lockfuncarg */ &txq->ift_tso_buf_tag))) { device_printf(dev, "Unable to allocate TSO TX DMA tag: %d\n", err); goto fail; } /* Allocate memory for the TX mbuf map. */ if (!(txq->ift_sds.ifsd_m = (struct mbuf **) malloc(sizeof(struct mbuf *) * scctx->isc_ntxd[txq->ift_br_offset], M_IFLIB, M_NOWAIT | M_ZERO))) { device_printf(dev, "Unable to allocate TX mbuf map memory\n"); err = ENOMEM; goto fail; } /* * Create the DMA maps for TX buffers. 
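	 * One map is created per TX descriptor; the TSO maps are only
	 * created when the interface advertises IFCAP_TSO.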
*/ if ((txq->ift_sds.ifsd_map = (bus_dmamap_t *)malloc( sizeof(bus_dmamap_t) * scctx->isc_ntxd[txq->ift_br_offset], M_IFLIB, M_NOWAIT | M_ZERO)) == NULL) { device_printf(dev, "Unable to allocate TX buffer DMA map memory\n"); err = ENOMEM; goto fail; } if (tso && (txq->ift_sds.ifsd_tso_map = (bus_dmamap_t *)malloc( sizeof(bus_dmamap_t) * scctx->isc_ntxd[txq->ift_br_offset], M_IFLIB, M_NOWAIT | M_ZERO)) == NULL) { device_printf(dev, "Unable to allocate TSO TX buffer map memory\n"); err = ENOMEM; goto fail; } for (int i = 0; i < scctx->isc_ntxd[txq->ift_br_offset]; i++) { err = bus_dmamap_create(txq->ift_buf_tag, 0, &txq->ift_sds.ifsd_map[i]); if (err != 0) { device_printf(dev, "Unable to create TX DMA map\n"); goto fail; } if (!tso) continue; err = bus_dmamap_create(txq->ift_tso_buf_tag, 0, &txq->ift_sds.ifsd_tso_map[i]); if (err != 0) { device_printf(dev, "Unable to create TSO TX DMA map\n"); goto fail; } } return (0); fail: /* We free all, it handles case where we are in the middle */ iflib_tx_structures_free(ctx); return (err); } static void iflib_txsd_destroy(if_ctx_t ctx, iflib_txq_t txq, int i) { bus_dmamap_t map; if (txq->ift_sds.ifsd_map != NULL) { map = txq->ift_sds.ifsd_map[i]; bus_dmamap_sync(txq->ift_buf_tag, map, BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(txq->ift_buf_tag, map); bus_dmamap_destroy(txq->ift_buf_tag, map); txq->ift_sds.ifsd_map[i] = NULL; } if (txq->ift_sds.ifsd_tso_map != NULL) { map = txq->ift_sds.ifsd_tso_map[i]; bus_dmamap_sync(txq->ift_tso_buf_tag, map, BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(txq->ift_tso_buf_tag, map); bus_dmamap_destroy(txq->ift_tso_buf_tag, map); txq->ift_sds.ifsd_tso_map[i] = NULL; } } static void iflib_txq_destroy(iflib_txq_t txq) { if_ctx_t ctx = txq->ift_ctx; for (int i = 0; i < txq->ift_size; i++) iflib_txsd_destroy(ctx, txq, i); if (txq->ift_br != NULL) { ifmp_ring_free(txq->ift_br); txq->ift_br = NULL; } mtx_destroy(&txq->ift_mtx); if (txq->ift_sds.ifsd_map != NULL) { free(txq->ift_sds.ifsd_map, M_IFLIB); txq->ift_sds.ifsd_map = NULL; } if (txq->ift_sds.ifsd_tso_map != NULL) { free(txq->ift_sds.ifsd_tso_map, M_IFLIB); txq->ift_sds.ifsd_tso_map = NULL; } if (txq->ift_sds.ifsd_m != NULL) { free(txq->ift_sds.ifsd_m, M_IFLIB); txq->ift_sds.ifsd_m = NULL; } if (txq->ift_buf_tag != NULL) { bus_dma_tag_destroy(txq->ift_buf_tag); txq->ift_buf_tag = NULL; } if (txq->ift_tso_buf_tag != NULL) { bus_dma_tag_destroy(txq->ift_tso_buf_tag); txq->ift_tso_buf_tag = NULL; } if (txq->ift_ifdi != NULL) { free(txq->ift_ifdi, M_IFLIB); } } static void iflib_txsd_free(if_ctx_t ctx, iflib_txq_t txq, int i) { struct mbuf **mp; mp = &txq->ift_sds.ifsd_m[i]; if (*mp == NULL) return; if (txq->ift_sds.ifsd_map != NULL) { bus_dmamap_sync(txq->ift_buf_tag, txq->ift_sds.ifsd_map[i], BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(txq->ift_buf_tag, txq->ift_sds.ifsd_map[i]); } if (txq->ift_sds.ifsd_tso_map != NULL) { bus_dmamap_sync(txq->ift_tso_buf_tag, txq->ift_sds.ifsd_tso_map[i], BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(txq->ift_tso_buf_tag, txq->ift_sds.ifsd_tso_map[i]); } m_free(*mp); DBG_COUNTER_INC(tx_frees); *mp = NULL; } static int iflib_txq_setup(iflib_txq_t txq) { if_ctx_t ctx = txq->ift_ctx; if_softc_ctx_t scctx = &ctx->ifc_softc_ctx; if_shared_ctx_t sctx = ctx->ifc_sctx; iflib_dma_info_t di; int i; /* Set number of descriptors available */ txq->ift_qstatus = IFLIB_QUEUE_IDLE; /* XXX make configurable */ txq->ift_update_freq = IFLIB_DEFAULT_TX_UPDATE_FREQ; /* Reset indices */ txq->ift_cidx_processed = 0; txq->ift_pidx = txq->ift_cidx = txq->ift_npending = 
0; txq->ift_size = scctx->isc_ntxd[txq->ift_br_offset]; for (i = 0, di = txq->ift_ifdi; i < sctx->isc_ntxqs; i++, di++) bzero((void *)di->idi_vaddr, di->idi_size); IFDI_TXQ_SETUP(ctx, txq->ift_id); for (i = 0, di = txq->ift_ifdi; i < sctx->isc_ntxqs; i++, di++) bus_dmamap_sync(di->idi_tag, di->idi_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); return (0); } /********************************************************************* * * Allocate DMA resources for RX buffers as well as memory for the RX * mbuf map, direct RX cluster pointer map and RX cluster bus address * map. RX DMA map, RX mbuf map, direct RX cluster pointer map and * RX cluster map are kept in a iflib_sw_rx_desc_array structure. * Since we use use one entry in iflib_sw_rx_desc_array per received * packet, the maximum number of entries we'll need is equal to the * number of hardware receive descriptors that we've allocated. * **********************************************************************/ static int iflib_rxsd_alloc(iflib_rxq_t rxq) { if_ctx_t ctx = rxq->ifr_ctx; if_shared_ctx_t sctx = ctx->ifc_sctx; if_softc_ctx_t scctx = &ctx->ifc_softc_ctx; device_t dev = ctx->ifc_dev; iflib_fl_t fl; int err; MPASS(scctx->isc_nrxd[0] > 0); MPASS(scctx->isc_nrxd[rxq->ifr_fl_offset] > 0); fl = rxq->ifr_fl; for (int i = 0; i < rxq->ifr_nfl; i++, fl++) { fl->ifl_size = scctx->isc_nrxd[rxq->ifr_fl_offset]; /* this isn't necessarily the same */ /* Set up DMA tag for RX buffers. */ err = bus_dma_tag_create(bus_get_dma_tag(dev), /* parent */ 1, 0, /* alignment, bounds */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ sctx->isc_rx_maxsize, /* maxsize */ sctx->isc_rx_nsegments, /* nsegments */ sctx->isc_rx_maxsegsize, /* maxsegsize */ 0, /* flags */ NULL, /* lockfunc */ NULL, /* lockarg */ &fl->ifl_buf_tag); if (err) { device_printf(dev, "Unable to allocate RX DMA tag: %d\n", err); goto fail; } /* Allocate memory for the RX mbuf map. */ if (!(fl->ifl_sds.ifsd_m = (struct mbuf **) malloc(sizeof(struct mbuf *) * scctx->isc_nrxd[rxq->ifr_fl_offset], M_IFLIB, M_NOWAIT | M_ZERO))) { device_printf(dev, "Unable to allocate RX mbuf map memory\n"); err = ENOMEM; goto fail; } /* Allocate memory for the direct RX cluster pointer map. */ if (!(fl->ifl_sds.ifsd_cl = (caddr_t *) malloc(sizeof(caddr_t) * scctx->isc_nrxd[rxq->ifr_fl_offset], M_IFLIB, M_NOWAIT | M_ZERO))) { device_printf(dev, "Unable to allocate RX cluster map memory\n"); err = ENOMEM; goto fail; } /* Allocate memory for the RX cluster bus address map. */ if (!(fl->ifl_sds.ifsd_ba = (bus_addr_t *) malloc(sizeof(bus_addr_t) * scctx->isc_nrxd[rxq->ifr_fl_offset], M_IFLIB, M_NOWAIT | M_ZERO))) { device_printf(dev, "Unable to allocate RX bus address map memory\n"); err = ENOMEM; goto fail; } /* * Create the DMA maps for RX buffers. 
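		 * One map per RX descriptor in each free list; any failure
		 * below unwinds through iflib_rx_structures_free().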
*/ if (!(fl->ifl_sds.ifsd_map = (bus_dmamap_t *) malloc(sizeof(bus_dmamap_t) * scctx->isc_nrxd[rxq->ifr_fl_offset], M_IFLIB, M_NOWAIT | M_ZERO))) { device_printf(dev, "Unable to allocate RX buffer DMA map memory\n"); err = ENOMEM; goto fail; } for (int i = 0; i < scctx->isc_nrxd[rxq->ifr_fl_offset]; i++) { err = bus_dmamap_create(fl->ifl_buf_tag, 0, &fl->ifl_sds.ifsd_map[i]); if (err != 0) { device_printf(dev, "Unable to create RX buffer DMA map\n"); goto fail; } } } return (0); fail: iflib_rx_structures_free(ctx); return (err); } /* * Internal service routines */ struct rxq_refill_cb_arg { int error; bus_dma_segment_t seg; int nseg; }; static void _rxq_refill_cb(void *arg, bus_dma_segment_t *segs, int nseg, int error) { struct rxq_refill_cb_arg *cb_arg = arg; cb_arg->error = error; cb_arg->seg = segs[0]; cb_arg->nseg = nseg; } /** * _iflib_fl_refill - refill an rxq free-buffer list * @ctx: the iflib context * @fl: the free list to refill * @count: the number of new buffers to allocate * * (Re)populate an rxq free-buffer list with up to @count new packet buffers. * The caller must assure that @count does not exceed the queue's capacity. */ static uint8_t _iflib_fl_refill(if_ctx_t ctx, iflib_fl_t fl, int count) { struct if_rxd_update iru; struct rxq_refill_cb_arg cb_arg; struct mbuf *m; caddr_t cl, *sd_cl; struct mbuf **sd_m; bus_dmamap_t *sd_map; bus_addr_t bus_addr, *sd_ba; int err, frag_idx, i, idx, n, pidx; qidx_t credits; sd_m = fl->ifl_sds.ifsd_m; sd_map = fl->ifl_sds.ifsd_map; sd_cl = fl->ifl_sds.ifsd_cl; sd_ba = fl->ifl_sds.ifsd_ba; pidx = fl->ifl_pidx; idx = pidx; frag_idx = fl->ifl_fragidx; credits = fl->ifl_credits; i = 0; n = count; MPASS(n > 0); MPASS(credits + n <= fl->ifl_size); if (pidx < fl->ifl_cidx) MPASS(pidx + n <= fl->ifl_cidx); if (pidx == fl->ifl_cidx && (credits < fl->ifl_size)) MPASS(fl->ifl_gen == 0); if (pidx > fl->ifl_cidx) MPASS(n <= fl->ifl_size - pidx + fl->ifl_cidx); DBG_COUNTER_INC(fl_refills); if (n > 8) DBG_COUNTER_INC(fl_refills_large); iru_init(&iru, fl->ifl_rxq, fl->ifl_id); while (n--) { /* * We allocate an uninitialized mbuf + cluster, mbuf is * initialized after rx. * * If the cluster is still set then we know a minimum sized packet was received */ bit_ffc_at(fl->ifl_rx_bitmap, frag_idx, fl->ifl_size, &frag_idx); if (frag_idx < 0) bit_ffc(fl->ifl_rx_bitmap, fl->ifl_size, &frag_idx); MPASS(frag_idx >= 0); if ((cl = sd_cl[frag_idx]) == NULL) { if ((cl = m_cljget(NULL, M_NOWAIT, fl->ifl_buf_size)) == NULL) break; cb_arg.error = 0; MPASS(sd_map != NULL); err = bus_dmamap_load(fl->ifl_buf_tag, sd_map[frag_idx], cl, fl->ifl_buf_size, _rxq_refill_cb, &cb_arg, BUS_DMA_NOWAIT); if (err != 0 || cb_arg.error) { /* * !zone_pack ? 
*/ if (fl->ifl_zone == zone_pack) uma_zfree(fl->ifl_zone, cl); break; } sd_ba[frag_idx] = bus_addr = cb_arg.seg.ds_addr; sd_cl[frag_idx] = cl; #if MEMORY_LOGGING fl->ifl_cl_enqueued++; #endif } else { bus_addr = sd_ba[frag_idx]; } bus_dmamap_sync(fl->ifl_buf_tag, sd_map[frag_idx], BUS_DMASYNC_PREREAD); if (sd_m[frag_idx] == NULL) { if ((m = m_gethdr(M_NOWAIT, MT_NOINIT)) == NULL) { break; } sd_m[frag_idx] = m; } bit_set(fl->ifl_rx_bitmap, frag_idx); #if MEMORY_LOGGING fl->ifl_m_enqueued++; #endif DBG_COUNTER_INC(rx_allocs); fl->ifl_rxd_idxs[i] = frag_idx; fl->ifl_bus_addrs[i] = bus_addr; fl->ifl_vm_addrs[i] = cl; credits++; i++; MPASS(credits <= fl->ifl_size); if (++idx == fl->ifl_size) { fl->ifl_gen = 1; idx = 0; } if (n == 0 || i == IFLIB_MAX_RX_REFRESH) { iru.iru_pidx = pidx; iru.iru_count = i; ctx->isc_rxd_refill(ctx->ifc_softc, &iru); i = 0; pidx = idx; fl->ifl_pidx = idx; fl->ifl_credits = credits; } } if (i) { iru.iru_pidx = pidx; iru.iru_count = i; ctx->isc_rxd_refill(ctx->ifc_softc, &iru); fl->ifl_pidx = idx; fl->ifl_credits = credits; } DBG_COUNTER_INC(rxd_flush); if (fl->ifl_pidx == 0) pidx = fl->ifl_size - 1; else pidx = fl->ifl_pidx - 1; bus_dmamap_sync(fl->ifl_ifdi->idi_tag, fl->ifl_ifdi->idi_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); ctx->isc_rxd_flush(ctx->ifc_softc, fl->ifl_rxq->ifr_id, fl->ifl_id, pidx); fl->ifl_fragidx = frag_idx; return (n == -1 ? 0 : IFLIB_RXEOF_EMPTY); } static __inline uint8_t __iflib_fl_refill_lt(if_ctx_t ctx, iflib_fl_t fl, int max) { /* we avoid allowing pidx to catch up with cidx as it confuses ixl */ int32_t reclaimable = fl->ifl_size - fl->ifl_credits - 1; #ifdef INVARIANTS int32_t delta = fl->ifl_size - get_inuse(fl->ifl_size, fl->ifl_cidx, fl->ifl_pidx, fl->ifl_gen) - 1; #endif MPASS(fl->ifl_credits <= fl->ifl_size); MPASS(reclaimable == delta); if (reclaimable > 0) return (_iflib_fl_refill(ctx, fl, min(max, reclaimable))); return (0); } uint8_t iflib_in_detach(if_ctx_t ctx) { bool in_detach; STATE_LOCK(ctx); in_detach = !!(ctx->ifc_flags & IFC_IN_DETACH); STATE_UNLOCK(ctx); return (in_detach); } static void iflib_fl_bufs_free(iflib_fl_t fl) { iflib_dma_info_t idi = fl->ifl_ifdi; bus_dmamap_t sd_map; uint32_t i; for (i = 0; i < fl->ifl_size; i++) { struct mbuf **sd_m = &fl->ifl_sds.ifsd_m[i]; caddr_t *sd_cl = &fl->ifl_sds.ifsd_cl[i]; if (*sd_cl != NULL) { sd_map = fl->ifl_sds.ifsd_map[i]; bus_dmamap_sync(fl->ifl_buf_tag, sd_map, BUS_DMASYNC_POSTREAD); bus_dmamap_unload(fl->ifl_buf_tag, sd_map); if (*sd_cl != NULL) uma_zfree(fl->ifl_zone, *sd_cl); if (*sd_m != NULL) { m_init(*sd_m, M_NOWAIT, MT_DATA, 0); uma_zfree(zone_mbuf, *sd_m); } } else { MPASS(*sd_cl == NULL); MPASS(*sd_m == NULL); } #if MEMORY_LOGGING fl->ifl_m_dequeued++; fl->ifl_cl_dequeued++; #endif *sd_cl = NULL; *sd_m = NULL; } #ifdef INVARIANTS for (i = 0; i < fl->ifl_size; i++) { MPASS(fl->ifl_sds.ifsd_cl[i] == NULL); MPASS(fl->ifl_sds.ifsd_m[i] == NULL); } #endif /* * Reset free list values */ fl->ifl_credits = fl->ifl_cidx = fl->ifl_pidx = fl->ifl_gen = fl->ifl_fragidx = 0; bzero(idi->idi_vaddr, idi->idi_size); } /********************************************************************* * * Initialize a free list and its buffers. 
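 *  This frees whatever is currently on the list and then pre-fills at
 *  most 128 descriptors, so an idle interface does not pin a full ring
 *  of clusters.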
* **********************************************************************/ static int iflib_fl_setup(iflib_fl_t fl) { iflib_rxq_t rxq = fl->ifl_rxq; if_ctx_t ctx = rxq->ifr_ctx; bit_nclear(fl->ifl_rx_bitmap, 0, fl->ifl_size - 1); /* ** Free current RX buffer structs and their mbufs */ iflib_fl_bufs_free(fl); /* Now replenish the mbufs */ MPASS(fl->ifl_credits == 0); fl->ifl_buf_size = ctx->ifc_rx_mbuf_sz; if (fl->ifl_buf_size > ctx->ifc_max_fl_buf_size) ctx->ifc_max_fl_buf_size = fl->ifl_buf_size; fl->ifl_cltype = m_gettype(fl->ifl_buf_size); fl->ifl_zone = m_getzone(fl->ifl_buf_size); /* avoid pre-allocating zillions of clusters to an idle card * potentially speeding up attach */ (void) _iflib_fl_refill(ctx, fl, min(128, fl->ifl_size)); MPASS(min(128, fl->ifl_size) == fl->ifl_credits); if (min(128, fl->ifl_size) != fl->ifl_credits) return (ENOBUFS); /* * handle failure */ MPASS(rxq != NULL); MPASS(fl->ifl_ifdi != NULL); bus_dmamap_sync(fl->ifl_ifdi->idi_tag, fl->ifl_ifdi->idi_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); return (0); } /********************************************************************* * * Free receive ring data structures * **********************************************************************/ static void iflib_rx_sds_free(iflib_rxq_t rxq) { iflib_fl_t fl; int i, j; if (rxq->ifr_fl != NULL) { for (i = 0; i < rxq->ifr_nfl; i++) { fl = &rxq->ifr_fl[i]; if (fl->ifl_buf_tag != NULL) { if (fl->ifl_sds.ifsd_map != NULL) { for (j = 0; j < fl->ifl_size; j++) { bus_dmamap_sync( fl->ifl_buf_tag, fl->ifl_sds.ifsd_map[j], BUS_DMASYNC_POSTREAD); bus_dmamap_unload( fl->ifl_buf_tag, fl->ifl_sds.ifsd_map[j]); bus_dmamap_destroy( fl->ifl_buf_tag, fl->ifl_sds.ifsd_map[j]); } } bus_dma_tag_destroy(fl->ifl_buf_tag); fl->ifl_buf_tag = NULL; } free(fl->ifl_sds.ifsd_m, M_IFLIB); free(fl->ifl_sds.ifsd_cl, M_IFLIB); free(fl->ifl_sds.ifsd_ba, M_IFLIB); free(fl->ifl_sds.ifsd_map, M_IFLIB); fl->ifl_sds.ifsd_m = NULL; fl->ifl_sds.ifsd_cl = NULL; fl->ifl_sds.ifsd_ba = NULL; fl->ifl_sds.ifsd_map = NULL; } free(rxq->ifr_fl, M_IFLIB); rxq->ifr_fl = NULL; free(rxq->ifr_ifdi, M_IFLIB); rxq->ifr_ifdi = NULL; rxq->ifr_cq_cidx = 0; } } /* * Timer routine */ static void iflib_timer(void *arg) { iflib_txq_t txq = arg; if_ctx_t ctx = txq->ift_ctx; if_softc_ctx_t sctx = &ctx->ifc_softc_ctx; uint64_t this_tick = ticks; uint32_t reset_on = hz / 2; if (!(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING)) return; /* ** Check on the state of the TX queue(s), this ** can be done without the lock because its RO ** and the HUNG state will be static if set. 
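	** A queue is marked HUNG once its mp ring stalls while not idle; if
	** it is still HUNG on a later pass and either no descriptors were
	** cleaned since then or no pause frames were seen, the watchdog
	** path below resets the interface.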
*/ if (this_tick - txq->ift_last_timer_tick >= hz / 2) { txq->ift_last_timer_tick = this_tick; IFDI_TIMER(ctx, txq->ift_id); if ((txq->ift_qstatus == IFLIB_QUEUE_HUNG) && ((txq->ift_cleaned_prev == txq->ift_cleaned) || (sctx->isc_pause_frames == 0))) goto hung; if (txq->ift_qstatus != IFLIB_QUEUE_IDLE && ifmp_ring_is_stalled(txq->ift_br)) { KASSERT(ctx->ifc_link_state == LINK_STATE_UP, ("queue can't be marked as hung if interface is down")); txq->ift_qstatus = IFLIB_QUEUE_HUNG; } txq->ift_cleaned_prev = txq->ift_cleaned; } #ifdef DEV_NETMAP if (if_getcapenable(ctx->ifc_ifp) & IFCAP_NETMAP) iflib_netmap_timer_adjust(ctx, txq, &reset_on); #endif /* handle any laggards */ if (txq->ift_db_pending) GROUPTASK_ENQUEUE(&txq->ift_task); sctx->isc_pause_frames = 0; if (if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING) callout_reset_on(&txq->ift_timer, reset_on, iflib_timer, txq, txq->ift_timer.c_cpu); return; hung: device_printf(ctx->ifc_dev, "Watchdog timeout (TX: %d desc avail: %d pidx: %d) -- resetting\n", txq->ift_id, TXQ_AVAIL(txq), txq->ift_pidx); STATE_LOCK(ctx); if_setdrvflagbits(ctx->ifc_ifp, IFF_DRV_OACTIVE, IFF_DRV_RUNNING); ctx->ifc_flags |= (IFC_DO_WATCHDOG|IFC_DO_RESET); iflib_admin_intr_deferred(ctx); STATE_UNLOCK(ctx); } static void iflib_calc_rx_mbuf_sz(if_ctx_t ctx) { if_softc_ctx_t sctx = &ctx->ifc_softc_ctx; /* * XXX don't set the max_frame_size to larger * than the hardware can handle */ if (sctx->isc_max_frame_size <= MCLBYTES) ctx->ifc_rx_mbuf_sz = MCLBYTES; else ctx->ifc_rx_mbuf_sz = MJUMPAGESIZE; } uint32_t iflib_get_rx_mbuf_sz(if_ctx_t ctx) { return (ctx->ifc_rx_mbuf_sz); } static void iflib_init_locked(if_ctx_t ctx) { if_softc_ctx_t sctx = &ctx->ifc_softc_ctx; if_softc_ctx_t scctx = &ctx->ifc_softc_ctx; if_t ifp = ctx->ifc_ifp; iflib_fl_t fl; iflib_txq_t txq; iflib_rxq_t rxq; int i, j, tx_ip_csum_flags, tx_ip6_csum_flags; if_setdrvflagbits(ifp, IFF_DRV_OACTIVE, IFF_DRV_RUNNING); IFDI_INTR_DISABLE(ctx); tx_ip_csum_flags = scctx->isc_tx_csum_flags & (CSUM_IP | CSUM_TCP | CSUM_UDP | CSUM_SCTP); tx_ip6_csum_flags = scctx->isc_tx_csum_flags & (CSUM_IP6_TCP | CSUM_IP6_UDP | CSUM_IP6_SCTP); /* Set hardware offload abilities */ if_clearhwassist(ifp); if (if_getcapenable(ifp) & IFCAP_TXCSUM) if_sethwassistbits(ifp, tx_ip_csum_flags, 0); if (if_getcapenable(ifp) & IFCAP_TXCSUM_IPV6) if_sethwassistbits(ifp, tx_ip6_csum_flags, 0); if (if_getcapenable(ifp) & IFCAP_TSO4) if_sethwassistbits(ifp, CSUM_IP_TSO, 0); if (if_getcapenable(ifp) & IFCAP_TSO6) if_sethwassistbits(ifp, CSUM_IP6_TSO, 0); for (i = 0, txq = ctx->ifc_txqs; i < sctx->isc_ntxqsets; i++, txq++) { CALLOUT_LOCK(txq); callout_stop(&txq->ift_timer); CALLOUT_UNLOCK(txq); iflib_netmap_txq_init(ctx, txq); } /* * Calculate a suitable Rx mbuf size prior to calling IFDI_INIT, so * that drivers can use the value when setting up the hardware receive * buffers. 
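	 * iflib_calc_rx_mbuf_sz() simply picks MCLBYTES when the maximum
	 * frame fits in a 2k cluster and MJUMPAGESIZE otherwise.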
*/ iflib_calc_rx_mbuf_sz(ctx); #ifdef INVARIANTS i = if_getdrvflags(ifp); #endif IFDI_INIT(ctx); MPASS(if_getdrvflags(ifp) == i); for (i = 0, rxq = ctx->ifc_rxqs; i < sctx->isc_nrxqsets; i++, rxq++) { /* XXX this should really be done on a per-queue basis */ if (if_getcapenable(ifp) & IFCAP_NETMAP) { MPASS(rxq->ifr_id == i); iflib_netmap_rxq_init(ctx, rxq); continue; } for (j = 0, fl = rxq->ifr_fl; j < rxq->ifr_nfl; j++, fl++) { if (iflib_fl_setup(fl)) { device_printf(ctx->ifc_dev, "setting up free list %d failed - " "check cluster settings\n", j); goto done; } } } done: if_setdrvflagbits(ctx->ifc_ifp, IFF_DRV_RUNNING, IFF_DRV_OACTIVE); IFDI_INTR_ENABLE(ctx); txq = ctx->ifc_txqs; for (i = 0; i < sctx->isc_ntxqsets; i++, txq++) callout_reset_on(&txq->ift_timer, hz/2, iflib_timer, txq, txq->ift_timer.c_cpu); } static int iflib_media_change(if_t ifp) { if_ctx_t ctx = if_getsoftc(ifp); int err; CTX_LOCK(ctx); if ((err = IFDI_MEDIA_CHANGE(ctx)) == 0) iflib_init_locked(ctx); CTX_UNLOCK(ctx); return (err); } static void iflib_media_status(if_t ifp, struct ifmediareq *ifmr) { if_ctx_t ctx = if_getsoftc(ifp); CTX_LOCK(ctx); IFDI_UPDATE_ADMIN_STATUS(ctx); IFDI_MEDIA_STATUS(ctx, ifmr); CTX_UNLOCK(ctx); } void iflib_stop(if_ctx_t ctx) { iflib_txq_t txq = ctx->ifc_txqs; iflib_rxq_t rxq = ctx->ifc_rxqs; if_softc_ctx_t scctx = &ctx->ifc_softc_ctx; if_shared_ctx_t sctx = ctx->ifc_sctx; iflib_dma_info_t di; iflib_fl_t fl; int i, j; /* Tell the stack that the interface is no longer active */ if_setdrvflagbits(ctx->ifc_ifp, IFF_DRV_OACTIVE, IFF_DRV_RUNNING); IFDI_INTR_DISABLE(ctx); DELAY(1000); IFDI_STOP(ctx); DELAY(1000); iflib_debug_reset(); /* Wait for current tx queue users to exit to disarm watchdog timer. */ for (i = 0; i < scctx->isc_ntxqsets; i++, txq++) { /* make sure all transmitters have completed before proceeding XXX */ CALLOUT_LOCK(txq); callout_stop(&txq->ift_timer); CALLOUT_UNLOCK(txq); /* clean any enqueued buffers */ iflib_ifmp_purge(txq); /* Free any existing tx buffers. */ for (j = 0; j < txq->ift_size; j++) { iflib_txsd_free(ctx, txq, j); } txq->ift_processed = txq->ift_cleaned = txq->ift_cidx_processed = 0; txq->ift_in_use = txq->ift_gen = txq->ift_cidx = txq->ift_pidx = txq->ift_no_desc_avail = 0; txq->ift_closed = txq->ift_mbuf_defrag = txq->ift_mbuf_defrag_failed = 0; txq->ift_no_tx_dma_setup = txq->ift_txd_encap_efbig = txq->ift_map_failed = 0; txq->ift_pullups = 0; ifmp_ring_reset_stats(txq->ift_br); for (j = 0, di = txq->ift_ifdi; j < sctx->isc_ntxqs; j++, di++) bzero((void *)di->idi_vaddr, di->idi_size); } for (i = 0; i < scctx->isc_nrxqsets; i++, rxq++) { /* make sure all transmitters have completed before proceeding XXX */ rxq->ifr_cq_cidx = 0; for (j = 0, di = rxq->ifr_ifdi; j < sctx->isc_nrxqs; j++, di++) bzero((void *)di->idi_vaddr, di->idi_size); /* also resets the free lists pidx/cidx */ for (j = 0, fl = rxq->ifr_fl; j < rxq->ifr_nfl; j++, fl++) iflib_fl_bufs_free(fl); } } static inline caddr_t calc_next_rxd(iflib_fl_t fl, int cidx) { qidx_t size; int nrxd; caddr_t start, end, cur, next; nrxd = fl->ifl_size; size = fl->ifl_rxd_size; start = fl->ifl_ifdi->idi_vaddr; if (__predict_false(size == 0)) return (start); cur = start + size*cidx; end = start + size*nrxd; next = CACHE_PTR_NEXT(cur); return (next < end ? 
next : start); } static inline void prefetch_pkts(iflib_fl_t fl, int cidx) { int nextptr; int nrxd = fl->ifl_size; caddr_t next_rxd; nextptr = (cidx + CACHE_PTR_INCREMENT) & (nrxd-1); prefetch(&fl->ifl_sds.ifsd_m[nextptr]); prefetch(&fl->ifl_sds.ifsd_cl[nextptr]); next_rxd = calc_next_rxd(fl, cidx); prefetch(next_rxd); prefetch(fl->ifl_sds.ifsd_m[(cidx + 1) & (nrxd-1)]); prefetch(fl->ifl_sds.ifsd_m[(cidx + 2) & (nrxd-1)]); prefetch(fl->ifl_sds.ifsd_m[(cidx + 3) & (nrxd-1)]); prefetch(fl->ifl_sds.ifsd_m[(cidx + 4) & (nrxd-1)]); prefetch(fl->ifl_sds.ifsd_cl[(cidx + 1) & (nrxd-1)]); prefetch(fl->ifl_sds.ifsd_cl[(cidx + 2) & (nrxd-1)]); prefetch(fl->ifl_sds.ifsd_cl[(cidx + 3) & (nrxd-1)]); prefetch(fl->ifl_sds.ifsd_cl[(cidx + 4) & (nrxd-1)]); } static struct mbuf * rxd_frag_to_sd(iflib_rxq_t rxq, if_rxd_frag_t irf, bool unload, if_rxsd_t sd, int *pf_rv, if_rxd_info_t ri) { bus_dmamap_t map; iflib_fl_t fl; caddr_t payload; struct mbuf *m; int flid, cidx, len, next; map = NULL; flid = irf->irf_flid; cidx = irf->irf_idx; fl = &rxq->ifr_fl[flid]; sd->ifsd_fl = fl; sd->ifsd_cidx = cidx; m = fl->ifl_sds.ifsd_m[cidx]; sd->ifsd_cl = &fl->ifl_sds.ifsd_cl[cidx]; fl->ifl_credits--; #if MEMORY_LOGGING fl->ifl_m_dequeued++; #endif if (rxq->ifr_ctx->ifc_flags & IFC_PREFETCH) prefetch_pkts(fl, cidx); next = (cidx + CACHE_PTR_INCREMENT) & (fl->ifl_size-1); prefetch(&fl->ifl_sds.ifsd_map[next]); map = fl->ifl_sds.ifsd_map[cidx]; next = (cidx + CACHE_LINE_SIZE) & (fl->ifl_size-1); /* not valid assert if bxe really does SGE from non-contiguous elements */ MPASS(fl->ifl_cidx == cidx); bus_dmamap_sync(fl->ifl_buf_tag, map, BUS_DMASYNC_POSTREAD); if (rxq->pfil != NULL && PFIL_HOOKED_IN(rxq->pfil) && pf_rv != NULL) { payload = *sd->ifsd_cl; payload += ri->iri_pad; len = ri->iri_len - ri->iri_pad; *pf_rv = pfil_run_hooks(rxq->pfil, payload, ri->iri_ifp, len | PFIL_MEMPTR | PFIL_IN, NULL); switch (*pf_rv) { case PFIL_DROPPED: case PFIL_CONSUMED: /* * The filter ate it. Everything is recycled. */ m = NULL; unload = 0; break; case PFIL_REALLOCED: /* * The filter copied it. Everything is recycled. 
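			 * The mbuf returned below carries the filter's copy;
			 * the original mbuf, cluster, and loaded DMA map stay
			 * in the free-list arrays for the refill path to
			 * reuse, hence unload is cleared.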
*/ m = pfil_mem2mbuf(payload); unload = 0; break; case PFIL_PASS: /* * Filter said it was OK, so receive like * normal */ fl->ifl_sds.ifsd_m[cidx] = NULL; break; default: MPASS(0); } } else { fl->ifl_sds.ifsd_m[cidx] = NULL; *pf_rv = PFIL_PASS; } if (unload) bus_dmamap_unload(fl->ifl_buf_tag, map); fl->ifl_cidx = (fl->ifl_cidx + 1) & (fl->ifl_size-1); if (__predict_false(fl->ifl_cidx == 0)) fl->ifl_gen = 0; bit_clear(fl->ifl_rx_bitmap, cidx); return (m); } static struct mbuf * assemble_segments(iflib_rxq_t rxq, if_rxd_info_t ri, if_rxsd_t sd, int *pf_rv) { struct mbuf *m, *mh, *mt; caddr_t cl; int *pf_rv_ptr, flags, i, padlen; bool consumed; i = 0; mh = NULL; consumed = false; *pf_rv = PFIL_PASS; pf_rv_ptr = pf_rv; do { m = rxd_frag_to_sd(rxq, &ri->iri_frags[i], !consumed, sd, pf_rv_ptr, ri); MPASS(*sd->ifsd_cl != NULL); /* * Exclude zero-length frags & frags from * packets the filter has consumed or dropped */ if (ri->iri_frags[i].irf_len == 0 || consumed || *pf_rv == PFIL_CONSUMED || *pf_rv == PFIL_DROPPED) { if (mh == NULL) { /* everything saved here */ consumed = true; pf_rv_ptr = NULL; continue; } /* XXX we can save the cluster here, but not the mbuf */ m_init(m, M_NOWAIT, MT_DATA, 0); m_free(m); continue; } if (mh == NULL) { flags = M_PKTHDR|M_EXT; mh = mt = m; padlen = ri->iri_pad; } else { flags = M_EXT; mt->m_next = m; mt = m; /* assuming padding is only on the first fragment */ padlen = 0; } cl = *sd->ifsd_cl; *sd->ifsd_cl = NULL; /* Can these two be made one ? */ m_init(m, M_NOWAIT, MT_DATA, flags); m_cljset(m, cl, sd->ifsd_fl->ifl_cltype); /* * These must follow m_init and m_cljset */ m->m_data += padlen; ri->iri_len -= padlen; m->m_len = ri->iri_frags[i].irf_len; } while (++i < ri->iri_nfrags); return (mh); } /* * Process one software descriptor */ static struct mbuf * iflib_rxd_pkt_get(iflib_rxq_t rxq, if_rxd_info_t ri) { struct if_rxsd sd; struct mbuf *m; int pf_rv; /* should I merge this back in now that the two paths are basically duplicated? */ if (ri->iri_nfrags == 1 && ri->iri_frags[0].irf_len <= MIN(IFLIB_RX_COPY_THRESH, MHLEN)) { m = rxd_frag_to_sd(rxq, &ri->iri_frags[0], false, &sd, &pf_rv, ri); if (pf_rv != PFIL_PASS && pf_rv != PFIL_REALLOCED) return (m); if (pf_rv == PFIL_PASS) { m_init(m, M_NOWAIT, MT_DATA, M_PKTHDR); #ifndef __NO_STRICT_ALIGNMENT if (!IP_ALIGNED(m)) m->m_data += 2; #endif memcpy(m->m_data, *sd.ifsd_cl, ri->iri_len); m->m_len = ri->iri_frags[0].irf_len; } } else { m = assemble_segments(rxq, ri, &sd, &pf_rv); if (pf_rv != PFIL_PASS && pf_rv != PFIL_REALLOCED) return (m); } m->m_pkthdr.len = ri->iri_len; m->m_pkthdr.rcvif = ri->iri_ifp; m->m_flags |= ri->iri_flags; m->m_pkthdr.ether_vtag = ri->iri_vtag; m->m_pkthdr.flowid = ri->iri_flowid; M_HASHTYPE_SET(m, ri->iri_rsstype); m->m_pkthdr.csum_flags = ri->iri_csum_flags; m->m_pkthdr.csum_data = ri->iri_csum_data; return (m); } #if defined(INET6) || defined(INET) static void iflib_get_ip_forwarding(struct lro_ctrl *lc, bool *v4, bool *v6) { CURVNET_SET(lc->ifp->if_vnet); #if defined(INET6) *v6 = V_ip6_forwarding; #endif #if defined(INET) *v4 = V_ipforwarding; #endif CURVNET_RESTORE(); } /* * Returns true if it's possible this packet could be LROed. * if it returns false, it is guaranteed that tcp_lro_rx() * would not return zero. 
*/ static bool iflib_check_lro_possible(struct mbuf *m, bool v4_forwarding, bool v6_forwarding) { struct ether_header *eh; eh = mtod(m, struct ether_header *); switch (eh->ether_type) { #if defined(INET6) case htons(ETHERTYPE_IPV6): return (!v6_forwarding); #endif #if defined (INET) case htons(ETHERTYPE_IP): return (!v4_forwarding); #endif } return false; } #else static void iflib_get_ip_forwarding(struct lro_ctrl *lc __unused, bool *v4 __unused, bool *v6 __unused) { } #endif static void _task_fn_rx_watchdog(void *context) { iflib_rxq_t rxq = context; GROUPTASK_ENQUEUE(&rxq->ifr_task); } static uint8_t iflib_rxeof(iflib_rxq_t rxq, qidx_t budget) { if_t ifp; if_ctx_t ctx = rxq->ifr_ctx; if_shared_ctx_t sctx = ctx->ifc_sctx; if_softc_ctx_t scctx = &ctx->ifc_softc_ctx; int avail, i; qidx_t *cidxp; struct if_rxd_info ri; int err, budget_left, rx_bytes, rx_pkts; iflib_fl_t fl; int lro_enabled; bool v4_forwarding, v6_forwarding, lro_possible; uint8_t retval = 0; /* * XXX early demux data packets so that if_input processing only handles * acks in interrupt context */ struct mbuf *m, *mh, *mt, *mf; NET_EPOCH_ASSERT(); lro_possible = v4_forwarding = v6_forwarding = false; ifp = ctx->ifc_ifp; mh = mt = NULL; MPASS(budget > 0); rx_pkts = rx_bytes = 0; if (sctx->isc_flags & IFLIB_HAS_RXCQ) cidxp = &rxq->ifr_cq_cidx; else cidxp = &rxq->ifr_fl[0].ifl_cidx; if ((avail = iflib_rxd_avail(ctx, rxq, *cidxp, budget)) == 0) { for (i = 0, fl = &rxq->ifr_fl[0]; i < sctx->isc_nfl; i++, fl++) retval |= __iflib_fl_refill_lt(ctx, fl, budget + 8); DBG_COUNTER_INC(rx_unavail); return (retval); } /* pfil needs the vnet to be set */ CURVNET_SET_QUIET(ifp->if_vnet); for (budget_left = budget; budget_left > 0 && avail > 0;) { if (__predict_false(!CTX_ACTIVE(ctx))) { DBG_COUNTER_INC(rx_ctx_inactive); break; } /* * Reset client set fields to their default values */ rxd_info_zero(&ri); ri.iri_qsidx = rxq->ifr_id; ri.iri_cidx = *cidxp; ri.iri_ifp = ifp; ri.iri_frags = rxq->ifr_frags; err = ctx->isc_rxd_pkt_get(ctx->ifc_softc, &ri); if (err) goto err; rx_pkts += 1; rx_bytes += ri.iri_len; if (sctx->isc_flags & IFLIB_HAS_RXCQ) { *cidxp = ri.iri_cidx; /* Update our consumer index */ /* XXX NB: shurd - check if this is still safe */ while (rxq->ifr_cq_cidx >= scctx->isc_nrxd[0]) rxq->ifr_cq_cidx -= scctx->isc_nrxd[0]; /* was this only a completion queue message? 
*/ if (__predict_false(ri.iri_nfrags == 0)) continue; } MPASS(ri.iri_nfrags != 0); MPASS(ri.iri_len != 0); /* will advance the cidx on the corresponding free lists */ m = iflib_rxd_pkt_get(rxq, &ri); avail--; budget_left--; if (avail == 0 && budget_left) avail = iflib_rxd_avail(ctx, rxq, *cidxp, budget_left); if (__predict_false(m == NULL)) continue; /* imm_pkt: -- cxgb */ if (mh == NULL) mh = mt = m; else { mt->m_nextpkt = m; mt = m; } } CURVNET_RESTORE(); /* make sure that we can refill faster than drain */ for (i = 0, fl = &rxq->ifr_fl[0]; i < sctx->isc_nfl; i++, fl++) retval |= __iflib_fl_refill_lt(ctx, fl, budget + 8); lro_enabled = (if_getcapenable(ifp) & IFCAP_LRO); if (lro_enabled) iflib_get_ip_forwarding(&rxq->ifr_lc, &v4_forwarding, &v6_forwarding); mt = mf = NULL; while (mh != NULL) { m = mh; mh = mh->m_nextpkt; m->m_nextpkt = NULL; #ifndef __NO_STRICT_ALIGNMENT if (!IP_ALIGNED(m) && (m = iflib_fixup_rx(m)) == NULL) continue; #endif rx_bytes += m->m_pkthdr.len; rx_pkts++; #if defined(INET6) || defined(INET) if (lro_enabled) { if (!lro_possible) { lro_possible = iflib_check_lro_possible(m, v4_forwarding, v6_forwarding); if (lro_possible && mf != NULL) { ifp->if_input(ifp, mf); DBG_COUNTER_INC(rx_if_input); mt = mf = NULL; } } if ((m->m_pkthdr.csum_flags & (CSUM_L4_CALC|CSUM_L4_VALID)) == (CSUM_L4_CALC|CSUM_L4_VALID)) { if (lro_possible && tcp_lro_rx(&rxq->ifr_lc, m, 0) == 0) continue; } } #endif if (lro_possible) { ifp->if_input(ifp, m); DBG_COUNTER_INC(rx_if_input); continue; } if (mf == NULL) mf = m; if (mt != NULL) mt->m_nextpkt = m; mt = m; } if (mf != NULL) { ifp->if_input(ifp, mf); DBG_COUNTER_INC(rx_if_input); } if_inc_counter(ifp, IFCOUNTER_IBYTES, rx_bytes); if_inc_counter(ifp, IFCOUNTER_IPACKETS, rx_pkts); /* * Flush any outstanding LRO work */ #if defined(INET6) || defined(INET) tcp_lro_flush_all(&rxq->ifr_lc); #endif if (avail != 0 || iflib_rxd_avail(ctx, rxq, *cidxp, 1) != 0) retval |= IFLIB_RXEOF_MORE; return (retval); err: STATE_LOCK(ctx); ctx->ifc_flags |= IFC_DO_RESET; iflib_admin_intr_deferred(ctx); STATE_UNLOCK(ctx); return (0); } #define TXD_NOTIFY_COUNT(txq) (((txq)->ift_size / (txq)->ift_update_freq)-1) static inline qidx_t txq_max_db_deferred(iflib_txq_t txq, qidx_t in_use) { qidx_t notify_count = TXD_NOTIFY_COUNT(txq); qidx_t minthresh = txq->ift_size / 8; if (in_use > 4*minthresh) return (notify_count); if (in_use > 2*minthresh) return (notify_count >> 1); if (in_use > minthresh) return (notify_count >> 3); return (0); } static inline qidx_t txq_max_rs_deferred(iflib_txq_t txq) { qidx_t notify_count = TXD_NOTIFY_COUNT(txq); qidx_t minthresh = txq->ift_size / 8; if (txq->ift_in_use > 4*minthresh) return (notify_count); if (txq->ift_in_use > 2*minthresh) return (notify_count >> 1); if (txq->ift_in_use > minthresh) return (notify_count >> 2); return (2); } #define M_CSUM_FLAGS(m) ((m)->m_pkthdr.csum_flags) #define M_HAS_VLANTAG(m) (m->m_flags & M_VLANTAG) #define TXQ_MAX_DB_DEFERRED(txq, in_use) txq_max_db_deferred((txq), (in_use)) #define TXQ_MAX_RS_DEFERRED(txq) txq_max_rs_deferred(txq) #define TXQ_MAX_DB_CONSUMED(size) (size >> 4) /* forward compatibility for cxgb */ #define FIRST_QSET(ctx) 0 #define NTXQSETS(ctx) ((ctx)->ifc_softc_ctx.isc_ntxqsets) #define NRXQSETS(ctx) ((ctx)->ifc_softc_ctx.isc_nrxqsets) #define QIDX(ctx, m) ((((m)->m_pkthdr.flowid & ctx->ifc_softc_ctx.isc_rss_table_mask) % NTXQSETS(ctx)) + FIRST_QSET(ctx)) #define DESC_RECLAIMABLE(q) ((int)((q)->ift_processed - (q)->ift_cleaned - (q)->ift_ctx->ifc_softc_ctx.isc_tx_nsegments)) /* XXX we 
should be setting this to something other than zero */ #define RECLAIM_THRESH(ctx) ((ctx)->ifc_sctx->isc_tx_reclaim_thresh) #define MAX_TX_DESC(ctx) max((ctx)->ifc_softc_ctx.isc_tx_tso_segments_max, \ (ctx)->ifc_softc_ctx.isc_tx_nsegments) static inline bool iflib_txd_db_check(if_ctx_t ctx, iflib_txq_t txq, int ring, qidx_t in_use) { qidx_t dbval, max; bool rang; rang = false; max = TXQ_MAX_DB_DEFERRED(txq, in_use); if (ring || txq->ift_db_pending >= max) { dbval = txq->ift_npending ? txq->ift_npending : txq->ift_pidx; bus_dmamap_sync(txq->ift_ifdi->idi_tag, txq->ift_ifdi->idi_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); ctx->isc_txd_flush(ctx->ifc_softc, txq->ift_id, dbval); txq->ift_db_pending = txq->ift_npending = 0; rang = true; } return (rang); } #ifdef PKT_DEBUG static void print_pkt(if_pkt_info_t pi) { printf("pi len: %d qsidx: %d nsegs: %d ndescs: %d flags: %x pidx: %d\n", pi->ipi_len, pi->ipi_qsidx, pi->ipi_nsegs, pi->ipi_ndescs, pi->ipi_flags, pi->ipi_pidx); printf("pi new_pidx: %d csum_flags: %lx tso_segsz: %d mflags: %x vtag: %d\n", pi->ipi_new_pidx, pi->ipi_csum_flags, pi->ipi_tso_segsz, pi->ipi_mflags, pi->ipi_vtag); printf("pi etype: %d ehdrlen: %d ip_hlen: %d ipproto: %d\n", pi->ipi_etype, pi->ipi_ehdrlen, pi->ipi_ip_hlen, pi->ipi_ipproto); } #endif #define IS_TSO4(pi) ((pi)->ipi_csum_flags & CSUM_IP_TSO) #define IS_TX_OFFLOAD4(pi) ((pi)->ipi_csum_flags & (CSUM_IP_TCP | CSUM_IP_TSO)) #define IS_TSO6(pi) ((pi)->ipi_csum_flags & CSUM_IP6_TSO) #define IS_TX_OFFLOAD6(pi) ((pi)->ipi_csum_flags & (CSUM_IP6_TCP | CSUM_IP6_TSO)) static int iflib_parse_header(iflib_txq_t txq, if_pkt_info_t pi, struct mbuf **mp) { if_shared_ctx_t sctx = txq->ift_ctx->ifc_sctx; struct ether_vlan_header *eh; struct mbuf *m; m = *mp; if ((sctx->isc_flags & IFLIB_NEED_SCRATCH) && M_WRITABLE(m) == 0) { if ((m = m_dup(m, M_NOWAIT)) == NULL) { return (ENOMEM); } else { m_freem(*mp); DBG_COUNTER_INC(tx_frees); *mp = m; } } /* * Determine where frame payload starts. * Jump over vlan headers if already present, * helpful for QinQ too. 
*/ if (__predict_false(m->m_len < sizeof(*eh))) { txq->ift_pullups++; if (__predict_false((m = m_pullup(m, sizeof(*eh))) == NULL)) return (ENOMEM); } eh = mtod(m, struct ether_vlan_header *); if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) { pi->ipi_etype = ntohs(eh->evl_proto); pi->ipi_ehdrlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN; } else { pi->ipi_etype = ntohs(eh->evl_encap_proto); pi->ipi_ehdrlen = ETHER_HDR_LEN; } switch (pi->ipi_etype) { #ifdef INET case ETHERTYPE_IP: { struct mbuf *n; struct ip *ip = NULL; struct tcphdr *th = NULL; int minthlen; minthlen = min(m->m_pkthdr.len, pi->ipi_ehdrlen + sizeof(*ip) + sizeof(*th)); if (__predict_false(m->m_len < minthlen)) { /* * if this code bloat is causing too much of a hit * move it to a separate function and mark it noinline */ if (m->m_len == pi->ipi_ehdrlen) { n = m->m_next; MPASS(n); if (n->m_len >= sizeof(*ip)) { ip = (struct ip *)n->m_data; if (n->m_len >= (ip->ip_hl << 2) + sizeof(*th)) th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2)); } else { txq->ift_pullups++; if (__predict_false((m = m_pullup(m, minthlen)) == NULL)) return (ENOMEM); ip = (struct ip *)(m->m_data + pi->ipi_ehdrlen); } } else { txq->ift_pullups++; if (__predict_false((m = m_pullup(m, minthlen)) == NULL)) return (ENOMEM); ip = (struct ip *)(m->m_data + pi->ipi_ehdrlen); if (m->m_len >= (ip->ip_hl << 2) + sizeof(*th)) th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2)); } } else { ip = (struct ip *)(m->m_data + pi->ipi_ehdrlen); if (m->m_len >= (ip->ip_hl << 2) + sizeof(*th)) th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2)); } pi->ipi_ip_hlen = ip->ip_hl << 2; pi->ipi_ipproto = ip->ip_p; pi->ipi_flags |= IPI_TX_IPV4; /* TCP checksum offload may require TCP header length */ if (IS_TX_OFFLOAD4(pi)) { if (__predict_true(pi->ipi_ipproto == IPPROTO_TCP)) { if (__predict_false(th == NULL)) { txq->ift_pullups++; if (__predict_false((m = m_pullup(m, (ip->ip_hl << 2) + sizeof(*th))) == NULL)) return (ENOMEM); th = (struct tcphdr *)((caddr_t)ip + pi->ipi_ip_hlen); } pi->ipi_tcp_hflags = th->th_flags; pi->ipi_tcp_hlen = th->th_off << 2; pi->ipi_tcp_seq = th->th_seq; } if (IS_TSO4(pi)) { if (__predict_false(ip->ip_p != IPPROTO_TCP)) return (ENXIO); /* * TSO always requires hardware checksum offload. */ pi->ipi_csum_flags |= (CSUM_IP_TCP | CSUM_IP); th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(IPPROTO_TCP)); pi->ipi_tso_segsz = m->m_pkthdr.tso_segsz; if (sctx->isc_flags & IFLIB_TSO_INIT_IP) { ip->ip_sum = 0; ip->ip_len = htons(pi->ipi_ip_hlen + pi->ipi_tcp_hlen + pi->ipi_tso_segsz); } } } if ((sctx->isc_flags & IFLIB_NEED_ZERO_CSUM) && (pi->ipi_csum_flags & CSUM_IP)) ip->ip_sum = 0; break; } #endif #ifdef INET6 case ETHERTYPE_IPV6: { struct ip6_hdr *ip6 = (struct ip6_hdr *)(m->m_data + pi->ipi_ehdrlen); struct tcphdr *th; pi->ipi_ip_hlen = sizeof(struct ip6_hdr); if (__predict_false(m->m_len < pi->ipi_ehdrlen + sizeof(struct ip6_hdr))) { txq->ift_pullups++; if (__predict_false((m = m_pullup(m, pi->ipi_ehdrlen + sizeof(struct ip6_hdr))) == NULL)) return (ENOMEM); } th = (struct tcphdr *)((caddr_t)ip6 + pi->ipi_ip_hlen); /* XXX-BZ this will go badly in case of ext hdrs. 
*/ pi->ipi_ipproto = ip6->ip6_nxt; pi->ipi_flags |= IPI_TX_IPV6; /* TCP checksum offload may require TCP header length */ if (IS_TX_OFFLOAD6(pi)) { if (pi->ipi_ipproto == IPPROTO_TCP) { if (__predict_false(m->m_len < pi->ipi_ehdrlen + sizeof(struct ip6_hdr) + sizeof(struct tcphdr))) { txq->ift_pullups++; if (__predict_false((m = m_pullup(m, pi->ipi_ehdrlen + sizeof(struct ip6_hdr) + sizeof(struct tcphdr))) == NULL)) return (ENOMEM); } pi->ipi_tcp_hflags = th->th_flags; pi->ipi_tcp_hlen = th->th_off << 2; pi->ipi_tcp_seq = th->th_seq; } if (IS_TSO6(pi)) { if (__predict_false(ip6->ip6_nxt != IPPROTO_TCP)) return (ENXIO); /* * TSO always requires hardware checksum offload. */ pi->ipi_csum_flags |= CSUM_IP6_TCP; th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0); pi->ipi_tso_segsz = m->m_pkthdr.tso_segsz; } } break; } #endif default: pi->ipi_csum_flags &= ~CSUM_OFFLOAD; pi->ipi_ip_hlen = 0; break; } *mp = m; return (0); } /* * If dodgy hardware rejects the scatter gather chain we've handed it * we'll need to remove the mbuf chain from ifsg_m[] before we can add the * m_defrag'd mbufs */ static __noinline struct mbuf * iflib_remove_mbuf(iflib_txq_t txq) { int ntxd, pidx; struct mbuf *m, **ifsd_m; ifsd_m = txq->ift_sds.ifsd_m; ntxd = txq->ift_size; pidx = txq->ift_pidx & (ntxd - 1); ifsd_m = txq->ift_sds.ifsd_m; m = ifsd_m[pidx]; ifsd_m[pidx] = NULL; bus_dmamap_unload(txq->ift_buf_tag, txq->ift_sds.ifsd_map[pidx]); if (txq->ift_sds.ifsd_tso_map != NULL) bus_dmamap_unload(txq->ift_tso_buf_tag, txq->ift_sds.ifsd_tso_map[pidx]); #if MEMORY_LOGGING txq->ift_dequeued++; #endif return (m); } static inline caddr_t calc_next_txd(iflib_txq_t txq, int cidx, uint8_t qid) { qidx_t size; int ntxd; caddr_t start, end, cur, next; ntxd = txq->ift_size; size = txq->ift_txd_size[qid]; start = txq->ift_ifdi[qid].idi_vaddr; if (__predict_false(size == 0)) return (start); cur = start + size*cidx; end = start + size*ntxd; next = CACHE_PTR_NEXT(cur); return (next < end ? next : start); } /* * Pad an mbuf to ensure a minimum ethernet frame size. 
 * min_frame_size is the frame size (less CRC) to pad the mbuf to
 */
static __noinline int
iflib_ether_pad(device_t dev, struct mbuf **m_head, uint16_t min_frame_size)
{
	/*
	 * 18 is enough bytes to pad an ARP packet to 46 bytes, and
	 * an ARP message is the smallest common payload I can think of
	 */
	static char pad[18];	/* just zeros */
	int n;
	struct mbuf *new_head;

	if (!M_WRITABLE(*m_head)) {
		new_head = m_dup(*m_head, M_NOWAIT);
		if (new_head == NULL) {
			m_freem(*m_head);
			device_printf(dev, "cannot pad short frame, m_dup() failed");
			DBG_COUNTER_INC(encap_pad_mbuf_fail);
			DBG_COUNTER_INC(tx_frees);
			return ENOMEM;
		}
		m_freem(*m_head);
		*m_head = new_head;
	}

	for (n = min_frame_size - (*m_head)->m_pkthdr.len;
	     n > 0; n -= sizeof(pad))
		if (!m_append(*m_head, min(n, sizeof(pad)), pad))
			break;

	if (n > 0) {
		m_freem(*m_head);
		device_printf(dev, "cannot pad short frame\n");
		DBG_COUNTER_INC(encap_pad_mbuf_fail);
		DBG_COUNTER_INC(tx_frees);
		return (ENOBUFS);
	}

	return 0;
}

static int
iflib_encap(iflib_txq_t txq, struct mbuf **m_headp)
{
	if_ctx_t ctx;
	if_shared_ctx_t sctx;
	if_softc_ctx_t scctx;
	bus_dma_tag_t buf_tag;
	bus_dma_segment_t *segs;
	struct mbuf *m_head, **ifsd_m;
	void *next_txd;
	bus_dmamap_t map;
	struct if_pkt_info pi;
	int remap = 0;
	int err, nsegs, ndesc, max_segs, pidx, cidx, next, ntxd;

	ctx = txq->ift_ctx;
	sctx = ctx->ifc_sctx;
	scctx = &ctx->ifc_softc_ctx;
	segs = txq->ift_segs;
	ntxd = txq->ift_size;
	m_head = *m_headp;
	map = NULL;

	/*
	 * If we're doing TSO the next descriptor to clean may be quite far ahead
	 */
	cidx = txq->ift_cidx;
	pidx = txq->ift_pidx;
	if (ctx->ifc_flags & IFC_PREFETCH) {
		next = (cidx + CACHE_PTR_INCREMENT) & (ntxd-1);
		if (!(ctx->ifc_flags & IFLIB_HAS_TXCQ)) {
			next_txd = calc_next_txd(txq, cidx, 0);
			prefetch(next_txd);
		}

		/* prefetch the next cache line of mbuf pointers and flags */
		prefetch(&txq->ift_sds.ifsd_m[next]);
		prefetch(&txq->ift_sds.ifsd_map[next]);
		next = (cidx + CACHE_LINE_SIZE) & (ntxd-1);
	}
	map = txq->ift_sds.ifsd_map[pidx];
	ifsd_m = txq->ift_sds.ifsd_m;

	if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
		buf_tag = txq->ift_tso_buf_tag;
		max_segs = scctx->isc_tx_tso_segments_max;
		map = txq->ift_sds.ifsd_tso_map[pidx];
		MPASS(buf_tag != NULL);
		MPASS(max_segs > 0);
	} else {
		buf_tag = txq->ift_buf_tag;
		max_segs = scctx->isc_tx_nsegments;
		map = txq->ift_sds.ifsd_map[pidx];
	}
	if ((sctx->isc_flags & IFLIB_NEED_ETHER_PAD) &&
	    __predict_false(m_head->m_pkthdr.len < scctx->isc_min_frame_size)) {
		err = iflib_ether_pad(ctx->ifc_dev, m_headp, scctx->isc_min_frame_size);
		if (err) {
			DBG_COUNTER_INC(encap_txd_encap_fail);
			return err;
		}
	}
	m_head = *m_headp;

	pkt_info_zero(&pi);
	pi.ipi_mflags = (m_head->m_flags & (M_VLANTAG|M_BCAST|M_MCAST));
	pi.ipi_pidx = pidx;
	pi.ipi_qsidx = txq->ift_id;
	pi.ipi_len = m_head->m_pkthdr.len;
	pi.ipi_csum_flags = m_head->m_pkthdr.csum_flags;
	pi.ipi_vtag = M_HAS_VLANTAG(m_head) ?
	    m_head->m_pkthdr.ether_vtag : 0;

	/* deliberate bitwise OR to make one condition */
	if (__predict_true((pi.ipi_csum_flags | pi.ipi_vtag))) {
		if (__predict_false((err = iflib_parse_header(txq, &pi, m_headp)) != 0)) {
			DBG_COUNTER_INC(encap_txd_encap_fail);
			return (err);
		}
		m_head = *m_headp;
	}

retry:
	err = bus_dmamap_load_mbuf_sg(buf_tag, map, m_head, segs, &nsegs,
	    BUS_DMA_NOWAIT);
defrag:
	if (__predict_false(err)) {
		switch (err) {
		case EFBIG:
			/* try collapse once and defrag once */
			if (remap == 0) {
				m_head = m_collapse(*m_headp, M_NOWAIT, max_segs);
				/* try defrag if collapsing fails */
				if (m_head == NULL)
					remap++;
			}
			if (remap == 1) {
				txq->ift_mbuf_defrag++;
				m_head = m_defrag(*m_headp, M_NOWAIT);
			}
			/*
			 * remap should never be >1 unless bus_dmamap_load_mbuf_sg
			 * failed to map an mbuf that was run through m_defrag
			 */
			MPASS(remap <= 1);
			if (__predict_false(m_head == NULL || remap > 1))
				goto defrag_failed;
			remap++;
			*m_headp = m_head;
			goto retry;
			break;
		case ENOMEM:
			txq->ift_no_tx_dma_setup++;
			break;
		default:
			txq->ift_no_tx_dma_setup++;
			m_freem(*m_headp);
			DBG_COUNTER_INC(tx_frees);
			*m_headp = NULL;
			break;
		}
		txq->ift_map_failed++;
		DBG_COUNTER_INC(encap_load_mbuf_fail);
		DBG_COUNTER_INC(encap_txd_encap_fail);
		return (err);
	}
	ifsd_m[pidx] = m_head;
	/*
	 * XXX assumes a 1 to 1 relationship between segments and
	 * descriptors - this does not hold true on all drivers, e.g.
	 * cxgb
	 */
	if (__predict_false(nsegs + 2 > TXQ_AVAIL(txq))) {
		txq->ift_no_desc_avail++;
		bus_dmamap_unload(buf_tag, map);
		DBG_COUNTER_INC(encap_txq_avail_fail);
		DBG_COUNTER_INC(encap_txd_encap_fail);
		if ((txq->ift_task.gt_task.ta_flags & TASK_ENQUEUED) == 0)
			GROUPTASK_ENQUEUE(&txq->ift_task);
		return (ENOBUFS);
	}
	/*
	 * On Intel cards we can greatly reduce the number of TX interrupts
	 * we see by only setting report status on every Nth descriptor.
	 * However, this also means that the driver will need to keep track
	 * of the descriptors that RS was set on to check them for the DD bit.
	 */
	txq->ift_rs_pending += nsegs + 1;
	if (txq->ift_rs_pending > TXQ_MAX_RS_DEFERRED(txq) ||
	     iflib_no_tx_batch || (TXQ_AVAIL(txq) - nsegs) <= MAX_TX_DESC(ctx) + 2) {
		pi.ipi_flags |= IPI_TX_INTR;
		txq->ift_rs_pending = 0;
	}

	pi.ipi_segs = segs;
	pi.ipi_nsegs = nsegs;

	MPASS(pidx >= 0 && pidx < txq->ift_size);
#ifdef PKT_DEBUG
	print_pkt(&pi);
#endif
	if ((err = ctx->isc_txd_encap(ctx->ifc_softc, &pi)) == 0) {
		bus_dmamap_sync(buf_tag, map, BUS_DMASYNC_PREWRITE);
		DBG_COUNTER_INC(tx_encap);
		MPASS(pi.ipi_new_pidx < txq->ift_size);

		ndesc = pi.ipi_new_pidx - pi.ipi_pidx;
		if (pi.ipi_new_pidx < pi.ipi_pidx) {
			ndesc += txq->ift_size;
			txq->ift_gen = 1;
		}
		/*
		 * drivers can need as many as
		 * two sentinels
		 */
		MPASS(ndesc <= pi.ipi_nsegs + 2);
		MPASS(pi.ipi_new_pidx != pidx);
		MPASS(ndesc > 0);
		txq->ift_in_use += ndesc;

		/*
		 * We update the last software descriptor again here because there may
		 * be a sentinel and/or there may be more mbufs than segments
		 */
		txq->ift_pidx = pi.ipi_new_pidx;
		txq->ift_npending += pi.ipi_ndescs;
	} else {
		*m_headp = m_head = iflib_remove_mbuf(txq);
		if (err == EFBIG) {
			txq->ift_txd_encap_efbig++;
			if (remap < 2) {
				remap = 1;
				goto defrag;
			}
		}
		goto defrag_failed;
	}
	/*
	 * err can't possibly be non-zero here, so we don't need to test it
	 * to see if we need to DBG_COUNTER_INC(encap_txd_encap_fail).
*/ return (err); defrag_failed: txq->ift_mbuf_defrag_failed++; txq->ift_map_failed++; m_freem(*m_headp); DBG_COUNTER_INC(tx_frees); *m_headp = NULL; DBG_COUNTER_INC(encap_txd_encap_fail); return (ENOMEM); } static void iflib_tx_desc_free(iflib_txq_t txq, int n) { uint32_t qsize, cidx, mask, gen; struct mbuf *m, **ifsd_m; bool do_prefetch; cidx = txq->ift_cidx; gen = txq->ift_gen; qsize = txq->ift_size; mask = qsize-1; ifsd_m = txq->ift_sds.ifsd_m; do_prefetch = (txq->ift_ctx->ifc_flags & IFC_PREFETCH); while (n-- > 0) { if (do_prefetch) { prefetch(ifsd_m[(cidx + 3) & mask]); prefetch(ifsd_m[(cidx + 4) & mask]); } if ((m = ifsd_m[cidx]) != NULL) { prefetch(&ifsd_m[(cidx + CACHE_PTR_INCREMENT) & mask]); if (m->m_pkthdr.csum_flags & CSUM_TSO) { bus_dmamap_sync(txq->ift_tso_buf_tag, txq->ift_sds.ifsd_tso_map[cidx], BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(txq->ift_tso_buf_tag, txq->ift_sds.ifsd_tso_map[cidx]); } else { bus_dmamap_sync(txq->ift_buf_tag, txq->ift_sds.ifsd_map[cidx], BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(txq->ift_buf_tag, txq->ift_sds.ifsd_map[cidx]); } /* XXX we don't support any drivers that batch packets yet */ MPASS(m->m_nextpkt == NULL); m_freem(m); ifsd_m[cidx] = NULL; #if MEMORY_LOGGING txq->ift_dequeued++; #endif DBG_COUNTER_INC(tx_frees); } if (__predict_false(++cidx == qsize)) { cidx = 0; gen = 0; } } txq->ift_cidx = cidx; txq->ift_gen = gen; } static __inline int iflib_completed_tx_reclaim(iflib_txq_t txq, int thresh) { int reclaim; if_ctx_t ctx = txq->ift_ctx; KASSERT(thresh >= 0, ("invalid threshold to reclaim")); MPASS(thresh /*+ MAX_TX_DESC(txq->ift_ctx) */ < txq->ift_size); /* * Need a rate-limiting check so that this isn't called every time */ iflib_tx_credits_update(ctx, txq); reclaim = DESC_RECLAIMABLE(txq); if (reclaim <= thresh /* + MAX_TX_DESC(txq->ift_ctx) */) { #ifdef INVARIANTS if (iflib_verbose_debug) { printf("%s processed=%ju cleaned=%ju tx_nsegments=%d reclaim=%d thresh=%d\n", __FUNCTION__, txq->ift_processed, txq->ift_cleaned, txq->ift_ctx->ifc_softc_ctx.isc_tx_nsegments, reclaim, thresh); } #endif return (0); } iflib_tx_desc_free(txq, reclaim); txq->ift_cleaned += reclaim; txq->ift_in_use -= reclaim; return (reclaim); } static struct mbuf ** _ring_peek_one(struct ifmp_ring *r, int cidx, int offset, int remaining) { int next, size; struct mbuf **items; size = r->size; next = (cidx + CACHE_PTR_INCREMENT) & (size-1); items = __DEVOLATILE(struct mbuf **, &r->items[0]); prefetch(items[(cidx + offset) & (size-1)]); if (remaining > 1) { prefetch2cachelines(&items[next]); prefetch2cachelines(items[(cidx + offset + 1) & (size-1)]); prefetch2cachelines(items[(cidx + offset + 2) & (size-1)]); prefetch2cachelines(items[(cidx + offset + 3) & (size-1)]); } return (__DEVOLATILE(struct mbuf **, &r->items[(cidx + offset) & (size-1)])); } static void iflib_txq_check_drain(iflib_txq_t txq, int budget) { ifmp_ring_check_drainage(txq->ift_br, budget); } static uint32_t iflib_txq_can_drain(struct ifmp_ring *r) { iflib_txq_t txq = r->cookie; if_ctx_t ctx = txq->ift_ctx; if (TXQ_AVAIL(txq) > MAX_TX_DESC(ctx) + 2) return (1); bus_dmamap_sync(txq->ift_ifdi->idi_tag, txq->ift_ifdi->idi_map, BUS_DMASYNC_POSTREAD); return (ctx->isc_txd_credits_update(ctx->ifc_softc, txq->ift_id, false)); } static uint32_t iflib_txq_drain(struct ifmp_ring *r, uint32_t cidx, uint32_t pidx) { iflib_txq_t txq = r->cookie; if_ctx_t ctx = txq->ift_ctx; if_t ifp = ctx->ifc_ifp; struct mbuf *m, **mp; int avail, bytes_sent, consumed, count, err, i, in_use_prev; int mcast_sent, pkt_sent, reclaimed, 
txq_avail; bool do_prefetch, rang, ring; if (__predict_false(!(if_getdrvflags(ifp) & IFF_DRV_RUNNING) || !LINK_ACTIVE(ctx))) { DBG_COUNTER_INC(txq_drain_notready); return (0); } reclaimed = iflib_completed_tx_reclaim(txq, RECLAIM_THRESH(ctx)); rang = iflib_txd_db_check(ctx, txq, reclaimed, txq->ift_in_use); avail = IDXDIFF(pidx, cidx, r->size); if (__predict_false(ctx->ifc_flags & IFC_QFLUSH)) { DBG_COUNTER_INC(txq_drain_flushing); for (i = 0; i < avail; i++) { if (__predict_true(r->items[(cidx + i) & (r->size-1)] != (void *)txq)) m_free(r->items[(cidx + i) & (r->size-1)]); r->items[(cidx + i) & (r->size-1)] = NULL; } return (avail); } if (__predict_false(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_OACTIVE)) { txq->ift_qstatus = IFLIB_QUEUE_IDLE; CALLOUT_LOCK(txq); callout_stop(&txq->ift_timer); CALLOUT_UNLOCK(txq); DBG_COUNTER_INC(txq_drain_oactive); return (0); } if (reclaimed) txq->ift_qstatus = IFLIB_QUEUE_IDLE; consumed = mcast_sent = bytes_sent = pkt_sent = 0; count = MIN(avail, TX_BATCH_SIZE); #ifdef INVARIANTS if (iflib_verbose_debug) printf("%s avail=%d ifc_flags=%x txq_avail=%d ", __FUNCTION__, avail, ctx->ifc_flags, TXQ_AVAIL(txq)); #endif do_prefetch = (ctx->ifc_flags & IFC_PREFETCH); txq_avail = TXQ_AVAIL(txq); err = 0; for (i = 0; i < count && txq_avail > MAX_TX_DESC(ctx) + 2; i++) { int rem = do_prefetch ? count - i : 0; mp = _ring_peek_one(r, cidx, i, rem); MPASS(mp != NULL && *mp != NULL); if (__predict_false(*mp == (struct mbuf *)txq)) { consumed++; continue; } in_use_prev = txq->ift_in_use; err = iflib_encap(txq, mp); if (__predict_false(err)) { /* no room - bail out */ if (err == ENOBUFS) break; consumed++; /* we can't send this packet - skip it */ continue; } consumed++; pkt_sent++; m = *mp; DBG_COUNTER_INC(tx_sent); bytes_sent += m->m_pkthdr.len; mcast_sent += !!(m->m_flags & M_MCAST); txq_avail = TXQ_AVAIL(txq); txq->ift_db_pending += (txq->ift_in_use - in_use_prev); ETHER_BPF_MTAP(ifp, m); if (__predict_false(!(ifp->if_drv_flags & IFF_DRV_RUNNING))) break; rang = iflib_txd_db_check(ctx, txq, false, in_use_prev); } /* deliberate use of bitwise or to avoid gratuitous short-circuit */ ring = rang ? 
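	    /* if the doorbell was already rung for the final packet above,
	     * don't force a second ring; otherwise force one when tx latency
	     * is being minimized, the last encap attempt failed, or the ring
	     * is nearly out of descriptors */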
false : (iflib_min_tx_latency | err) || (TXQ_AVAIL(txq) < MAX_TX_DESC(ctx)); iflib_txd_db_check(ctx, txq, ring, txq->ift_in_use); if_inc_counter(ifp, IFCOUNTER_OBYTES, bytes_sent); if_inc_counter(ifp, IFCOUNTER_OPACKETS, pkt_sent); if (mcast_sent) if_inc_counter(ifp, IFCOUNTER_OMCASTS, mcast_sent); #ifdef INVARIANTS if (iflib_verbose_debug) printf("consumed=%d\n", consumed); #endif return (consumed); } static uint32_t iflib_txq_drain_always(struct ifmp_ring *r) { return (1); } static uint32_t iflib_txq_drain_free(struct ifmp_ring *r, uint32_t cidx, uint32_t pidx) { int i, avail; struct mbuf **mp; iflib_txq_t txq; txq = r->cookie; txq->ift_qstatus = IFLIB_QUEUE_IDLE; CALLOUT_LOCK(txq); callout_stop(&txq->ift_timer); CALLOUT_UNLOCK(txq); avail = IDXDIFF(pidx, cidx, r->size); for (i = 0; i < avail; i++) { mp = _ring_peek_one(r, cidx, i, avail - i); if (__predict_false(*mp == (struct mbuf *)txq)) continue; m_freem(*mp); DBG_COUNTER_INC(tx_frees); } MPASS(ifmp_ring_is_stalled(r) == 0); return (avail); } static void iflib_ifmp_purge(iflib_txq_t txq) { struct ifmp_ring *r; r = txq->ift_br; r->drain = iflib_txq_drain_free; r->can_drain = iflib_txq_drain_always; ifmp_ring_check_drainage(r, r->size); r->drain = iflib_txq_drain; r->can_drain = iflib_txq_can_drain; } static void _task_fn_tx(void *context) { iflib_txq_t txq = context; if_ctx_t ctx = txq->ift_ctx; #if defined(ALTQ) || defined(DEV_NETMAP) if_t ifp = ctx->ifc_ifp; #endif int abdicate = ctx->ifc_sysctl_tx_abdicate; #ifdef IFLIB_DIAGNOSTICS txq->ift_cpu_exec_count[curcpu]++; #endif if (!(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING)) return; #ifdef DEV_NETMAP if (if_getcapenable(ifp) & IFCAP_NETMAP) { bus_dmamap_sync(txq->ift_ifdi->idi_tag, txq->ift_ifdi->idi_map, BUS_DMASYNC_POSTREAD); if (ctx->isc_txd_credits_update(ctx->ifc_softc, txq->ift_id, false)) netmap_tx_irq(ifp, txq->ift_id); if (ctx->ifc_flags & IFC_LEGACY) IFDI_INTR_ENABLE(ctx); else IFDI_TX_QUEUE_INTR_ENABLE(ctx, txq->ift_id); return; } #endif #ifdef ALTQ if (ALTQ_IS_ENABLED(&ifp->if_snd)) iflib_altq_if_start(ifp); #endif if (txq->ift_db_pending) ifmp_ring_enqueue(txq->ift_br, (void **)&txq, 1, TX_BATCH_SIZE, abdicate); else if (!abdicate) ifmp_ring_check_drainage(txq->ift_br, TX_BATCH_SIZE); /* * When abdicating, we always need to check drainage, not just when we don't enqueue */ if (abdicate) ifmp_ring_check_drainage(txq->ift_br, TX_BATCH_SIZE); if (ctx->ifc_flags & IFC_LEGACY) IFDI_INTR_ENABLE(ctx); else IFDI_TX_QUEUE_INTR_ENABLE(ctx, txq->ift_id); } static void _task_fn_rx(void *context) { iflib_rxq_t rxq = context; if_ctx_t ctx = rxq->ifr_ctx; uint8_t more; uint16_t budget; #ifdef IFLIB_DIAGNOSTICS rxq->ifr_cpu_exec_count[curcpu]++; #endif DBG_COUNTER_INC(task_fn_rxs); if (__predict_false(!(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING))) return; #ifdef DEV_NETMAP if (if_getcapenable(ctx->ifc_ifp) & IFCAP_NETMAP) { u_int work = 0; if (netmap_rx_irq(ctx->ifc_ifp, rxq->ifr_id, &work)) { more = 0; goto skip_rxeof; } } #endif budget = ctx->ifc_sysctl_rx_budget; if (budget == 0) budget = 16; /* XXX */ more = iflib_rxeof(rxq, budget); #ifdef DEV_NETMAP skip_rxeof: #endif if ((more & IFLIB_RXEOF_MORE) == 0) { if (ctx->ifc_flags & IFC_LEGACY) IFDI_INTR_ENABLE(ctx); else IFDI_RX_QUEUE_INTR_ENABLE(ctx, rxq->ifr_id); DBG_COUNTER_INC(rx_intr_enables); } if (__predict_false(!(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING))) return; if (more & IFLIB_RXEOF_MORE) GROUPTASK_ENQUEUE(&rxq->ifr_task); else if (more & IFLIB_RXEOF_EMPTY) callout_reset_curcpu(&rxq->ifr_watchdog, 1, 
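	    /* IFLIB_RXEOF_EMPTY: arm the watchdog so _task_fn_rx_watchdog
	     * re-enqueues the RX group task one tick from now */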
&_task_fn_rx_watchdog, rxq); } static void _task_fn_admin(void *context) { if_ctx_t ctx = context; if_softc_ctx_t sctx = &ctx->ifc_softc_ctx; iflib_txq_t txq; int i; bool oactive, running, do_reset, do_watchdog, in_detach; uint32_t reset_on = hz / 2; STATE_LOCK(ctx); running = (if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING); oactive = (if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_OACTIVE); do_reset = (ctx->ifc_flags & IFC_DO_RESET); do_watchdog = (ctx->ifc_flags & IFC_DO_WATCHDOG); in_detach = (ctx->ifc_flags & IFC_IN_DETACH); ctx->ifc_flags &= ~(IFC_DO_RESET|IFC_DO_WATCHDOG); STATE_UNLOCK(ctx); if ((!running && !oactive) && !(ctx->ifc_sctx->isc_flags & IFLIB_ADMIN_ALWAYS_RUN)) return; if (in_detach) return; CTX_LOCK(ctx); for (txq = ctx->ifc_txqs, i = 0; i < sctx->isc_ntxqsets; i++, txq++) { CALLOUT_LOCK(txq); callout_stop(&txq->ift_timer); CALLOUT_UNLOCK(txq); } if (do_watchdog) { ctx->ifc_watchdog_events++; IFDI_WATCHDOG_RESET(ctx); } IFDI_UPDATE_ADMIN_STATUS(ctx); for (txq = ctx->ifc_txqs, i = 0; i < sctx->isc_ntxqsets; i++, txq++) { #ifdef DEV_NETMAP reset_on = hz / 2; if (if_getcapenable(ctx->ifc_ifp) & IFCAP_NETMAP) iflib_netmap_timer_adjust(ctx, txq, &reset_on); #endif callout_reset_on(&txq->ift_timer, reset_on, iflib_timer, txq, txq->ift_timer.c_cpu); } IFDI_LINK_INTR_ENABLE(ctx); if (do_reset) iflib_if_init_locked(ctx); CTX_UNLOCK(ctx); if (LINK_ACTIVE(ctx) == 0) return; for (txq = ctx->ifc_txqs, i = 0; i < sctx->isc_ntxqsets; i++, txq++) iflib_txq_check_drain(txq, IFLIB_RESTART_BUDGET); } static void _task_fn_iov(void *context) { if_ctx_t ctx = context; if (!(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING) && !(ctx->ifc_sctx->isc_flags & IFLIB_ADMIN_ALWAYS_RUN)) return; CTX_LOCK(ctx); IFDI_VFLR_HANDLE(ctx); CTX_UNLOCK(ctx); } static int iflib_sysctl_int_delay(SYSCTL_HANDLER_ARGS) { int err; if_int_delay_info_t info; if_ctx_t ctx; info = (if_int_delay_info_t)arg1; ctx = info->iidi_ctx; info->iidi_req = req; info->iidi_oidp = oidp; CTX_LOCK(ctx); err = IFDI_SYSCTL_INT_DELAY(ctx, info); CTX_UNLOCK(ctx); return (err); } /********************************************************************* * * IFNET FUNCTIONS * **********************************************************************/ static void iflib_if_init_locked(if_ctx_t ctx) { iflib_stop(ctx); iflib_init_locked(ctx); } static void iflib_if_init(void *arg) { if_ctx_t ctx = arg; CTX_LOCK(ctx); iflib_if_init_locked(ctx); CTX_UNLOCK(ctx); } static int iflib_if_transmit(if_t ifp, struct mbuf *m) { if_ctx_t ctx = if_getsoftc(ifp); iflib_txq_t txq; int err, qidx; int abdicate = ctx->ifc_sysctl_tx_abdicate; if (__predict_false((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || !LINK_ACTIVE(ctx))) { DBG_COUNTER_INC(tx_frees); m_freem(m); return (ENETDOWN); } MPASS(m->m_nextpkt == NULL); /* ALTQ-enabled interfaces always use queue 0. */ qidx = 0; if ((NTXQSETS(ctx) > 1) && M_HASHTYPE_GET(m) && !ALTQ_IS_ENABLED(&ifp->if_snd)) qidx = QIDX(ctx, m); /* * XXX calculate buf_ring based on flowid (divvy up bits?) 
 */
	txq = &ctx->ifc_txqs[qidx];

#ifdef DRIVER_BACKPRESSURE
	if (txq->ift_closed) {
		while (m != NULL) {
			next = m->m_nextpkt;
			m->m_nextpkt = NULL;
			m_freem(m);
			DBG_COUNTER_INC(tx_frees);
			m = next;
		}
		return (ENOBUFS);
	}
#endif
#ifdef notyet
	qidx = count = 0;
	mp = marr;
	next = m;
	do {
		count++;
		next = next->m_nextpkt;
	} while (next != NULL);

	if (count > nitems(marr))
		if ((mp = malloc(count*sizeof(struct mbuf *), M_IFLIB, M_NOWAIT)) == NULL) {
			/* XXX check nextpkt */
			m_freem(m);
			/* XXX simplify for now */
			DBG_COUNTER_INC(tx_frees);
			return (ENOBUFS);
		}
	for (next = m, i = 0; next != NULL; i++) {
		mp[i] = next;
		next = next->m_nextpkt;
		mp[i]->m_nextpkt = NULL;
	}
#endif
	DBG_COUNTER_INC(tx_seen);
	err = ifmp_ring_enqueue(txq->ift_br, (void **)&m, 1, TX_BATCH_SIZE, abdicate);

	if (abdicate)
		GROUPTASK_ENQUEUE(&txq->ift_task);
	if (err) {
		if (!abdicate)
			GROUPTASK_ENQUEUE(&txq->ift_task);
		/* support forthcoming later */
#ifdef DRIVER_BACKPRESSURE
		txq->ift_closed = TRUE;
#endif
		ifmp_ring_check_drainage(txq->ift_br, TX_BATCH_SIZE);
		m_freem(m);
		DBG_COUNTER_INC(tx_frees);
	}

	return (err);
}

#ifdef ALTQ
/*
 * The overall approach to integrating iflib with ALTQ is to continue to use
 * the iflib mp_ring machinery between the ALTQ queue(s) and the hardware
 * ring.  Technically, when using ALTQ, queueing to an intermediate mp_ring
 * is redundant/unnecessary, but doing so minimizes the amount of
 * ALTQ-specific code required in iflib.  It is assumed that the overhead of
 * redundantly queueing to an intermediate mp_ring is swamped by the
 * performance limitations inherent in using ALTQ.
 *
 * When ALTQ support is compiled in, all iflib drivers will use a transmit
 * routine, iflib_altq_if_transmit(), that checks if ALTQ is enabled for the
 * given interface.  If ALTQ is enabled for an interface, then all
 * transmitted packets for that interface will be submitted to the ALTQ
 * subsystem via IFQ_ENQUEUE().  We don't use the legacy if_transmit()
 * implementation because it uses IFQ_HANDOFF(), which will duplicatively
 * update stats that the iflib machinery handles, and which is sensitive to
 * the disused IFF_DRV_OACTIVE flag.  Additionally, iflib_altq_if_start()
 * will be installed as the start routine for use by ALTQ facilities that
 * need to trigger queue drains on a scheduled basis.
 *
 */
static void
iflib_altq_if_start(if_t ifp)
{
	struct ifaltq *ifq = &ifp->if_snd;
	struct mbuf *m;

	IFQ_LOCK(ifq);
	IFQ_DEQUEUE_NOLOCK(ifq, m);
	while (m != NULL) {
		iflib_if_transmit(ifp, m);
		IFQ_DEQUEUE_NOLOCK(ifq, m);
	}
	IFQ_UNLOCK(ifq);
}

static int
iflib_altq_if_transmit(if_t ifp, struct mbuf *m)
{
	int err;

	if (ALTQ_IS_ENABLED(&ifp->if_snd)) {
		IFQ_ENQUEUE(&ifp->if_snd, m, err);
		if (err == 0)
			iflib_altq_if_start(ifp);
	} else
		err = iflib_if_transmit(ifp, m);

	return (err);
}
#endif /* ALTQ */

static void
iflib_if_qflush(if_t ifp)
{
	if_ctx_t ctx = if_getsoftc(ifp);
	iflib_txq_t txq = ctx->ifc_txqs;
	int i;

	STATE_LOCK(ctx);
	ctx->ifc_flags |= IFC_QFLUSH;
	STATE_UNLOCK(ctx);
	for (i = 0; i < NTXQSETS(ctx); i++, txq++)
		while (!(ifmp_ring_is_idle(txq->ift_br) || ifmp_ring_is_stalled(txq->ift_br)))
			iflib_txq_check_drain(txq, 0);
	STATE_LOCK(ctx);
	ctx->ifc_flags &= ~IFC_QFLUSH;
	STATE_UNLOCK(ctx);

	/*
	 * When ALTQ is enabled, this will also take care of purging the
	 * ALTQ queue(s).
*/ if_qflush(ifp); } #define IFCAP_FLAGS (IFCAP_HWCSUM_IPV6 | IFCAP_HWCSUM | IFCAP_LRO | \ IFCAP_TSO | IFCAP_VLAN_HWTAGGING | IFCAP_HWSTATS | \ IFCAP_VLAN_MTU | IFCAP_VLAN_HWFILTER | \ IFCAP_VLAN_HWTSO | IFCAP_VLAN_HWCSUM | IFCAP_NOMAP) static int iflib_if_ioctl(if_t ifp, u_long command, caddr_t data) { if_ctx_t ctx = if_getsoftc(ifp); struct ifreq *ifr = (struct ifreq *)data; #if defined(INET) || defined(INET6) struct ifaddr *ifa = (struct ifaddr *)data; #endif bool avoid_reset = false; int err = 0, reinit = 0, bits; switch (command) { case SIOCSIFADDR: #ifdef INET if (ifa->ifa_addr->sa_family == AF_INET) avoid_reset = true; #endif #ifdef INET6 if (ifa->ifa_addr->sa_family == AF_INET6) avoid_reset = true; #endif /* ** Calling init results in link renegotiation, ** so we avoid doing it when possible. */ if (avoid_reset) { if_setflagbits(ifp, IFF_UP,0); if (!(if_getdrvflags(ifp) & IFF_DRV_RUNNING)) reinit = 1; #ifdef INET if (!(if_getflags(ifp) & IFF_NOARP)) arp_ifinit(ifp, ifa); #endif } else err = ether_ioctl(ifp, command, data); break; case SIOCSIFMTU: CTX_LOCK(ctx); if (ifr->ifr_mtu == if_getmtu(ifp)) { CTX_UNLOCK(ctx); break; } bits = if_getdrvflags(ifp); /* stop the driver and free any clusters before proceeding */ iflib_stop(ctx); if ((err = IFDI_MTU_SET(ctx, ifr->ifr_mtu)) == 0) { STATE_LOCK(ctx); if (ifr->ifr_mtu > ctx->ifc_max_fl_buf_size) ctx->ifc_flags |= IFC_MULTISEG; else ctx->ifc_flags &= ~IFC_MULTISEG; STATE_UNLOCK(ctx); err = if_setmtu(ifp, ifr->ifr_mtu); } iflib_init_locked(ctx); STATE_LOCK(ctx); if_setdrvflags(ifp, bits); STATE_UNLOCK(ctx); CTX_UNLOCK(ctx); break; case SIOCSIFFLAGS: CTX_LOCK(ctx); if (if_getflags(ifp) & IFF_UP) { if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) { if ((if_getflags(ifp) ^ ctx->ifc_if_flags) & (IFF_PROMISC | IFF_ALLMULTI)) { err = IFDI_PROMISC_SET(ctx, if_getflags(ifp)); } } else reinit = 1; } else if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) { iflib_stop(ctx); } ctx->ifc_if_flags = if_getflags(ifp); CTX_UNLOCK(ctx); break; case SIOCADDMULTI: case SIOCDELMULTI: if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) { CTX_LOCK(ctx); IFDI_INTR_DISABLE(ctx); IFDI_MULTI_SET(ctx); IFDI_INTR_ENABLE(ctx); CTX_UNLOCK(ctx); } break; case SIOCSIFMEDIA: CTX_LOCK(ctx); IFDI_MEDIA_SET(ctx); CTX_UNLOCK(ctx); /* FALLTHROUGH */ case SIOCGIFMEDIA: case SIOCGIFXMEDIA: err = ifmedia_ioctl(ifp, ifr, ctx->ifc_mediap, command); break; case SIOCGI2C: { struct ifi2creq i2c; err = copyin(ifr_data_get_ptr(ifr), &i2c, sizeof(i2c)); if (err != 0) break; if (i2c.dev_addr != 0xA0 && i2c.dev_addr != 0xA2) { err = EINVAL; break; } if (i2c.len > sizeof(i2c.data)) { err = EINVAL; break; } if ((err = IFDI_I2C_REQ(ctx, &i2c)) == 0) err = copyout(&i2c, ifr_data_get_ptr(ifr), sizeof(i2c)); break; } case SIOCSIFCAP: { int mask, setmask, oldmask; oldmask = if_getcapenable(ifp); mask = ifr->ifr_reqcap ^ oldmask; mask &= ctx->ifc_softc_ctx.isc_capabilities | IFCAP_NOMAP; setmask = 0; #ifdef TCP_OFFLOAD setmask |= mask & (IFCAP_TOE4|IFCAP_TOE6); #endif setmask |= (mask & IFCAP_FLAGS); setmask |= (mask & IFCAP_WOL); /* * If any RX csum has changed, change all the ones that * are supported by the driver. 
*/ if (setmask & (IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6)) { setmask |= ctx->ifc_softc_ctx.isc_capabilities & (IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6); } /* * want to ensure that traffic has stopped before we change any of the flags */ if (setmask) { CTX_LOCK(ctx); bits = if_getdrvflags(ifp); if (bits & IFF_DRV_RUNNING && setmask & ~IFCAP_WOL) iflib_stop(ctx); STATE_LOCK(ctx); if_togglecapenable(ifp, setmask); STATE_UNLOCK(ctx); if (bits & IFF_DRV_RUNNING && setmask & ~IFCAP_WOL) iflib_init_locked(ctx); STATE_LOCK(ctx); if_setdrvflags(ifp, bits); STATE_UNLOCK(ctx); CTX_UNLOCK(ctx); } if_vlancap(ifp); break; } case SIOCGPRIVATE_0: case SIOCSDRVSPEC: case SIOCGDRVSPEC: CTX_LOCK(ctx); err = IFDI_PRIV_IOCTL(ctx, command, data); CTX_UNLOCK(ctx); break; default: err = ether_ioctl(ifp, command, data); break; } if (reinit) iflib_if_init(ctx); return (err); } static uint64_t iflib_if_get_counter(if_t ifp, ift_counter cnt) { if_ctx_t ctx = if_getsoftc(ifp); return (IFDI_GET_COUNTER(ctx, cnt)); } /********************************************************************* * * OTHER FUNCTIONS EXPORTED TO THE STACK * **********************************************************************/ static void iflib_vlan_register(void *arg, if_t ifp, uint16_t vtag) { if_ctx_t ctx = if_getsoftc(ifp); if ((void *)ctx != arg) return; if ((vtag == 0) || (vtag > 4095)) return; if (iflib_in_detach(ctx)) return; CTX_LOCK(ctx); IFDI_VLAN_REGISTER(ctx, vtag); /* Re-init to load the changes */ if (if_getcapenable(ifp) & IFCAP_VLAN_HWFILTER) iflib_if_init_locked(ctx); CTX_UNLOCK(ctx); } static void iflib_vlan_unregister(void *arg, if_t ifp, uint16_t vtag) { if_ctx_t ctx = if_getsoftc(ifp); if ((void *)ctx != arg) return; if ((vtag == 0) || (vtag > 4095)) return; CTX_LOCK(ctx); IFDI_VLAN_UNREGISTER(ctx, vtag); /* Re-init to load the changes */ if (if_getcapenable(ifp) & IFCAP_VLAN_HWFILTER) iflib_if_init_locked(ctx); CTX_UNLOCK(ctx); } static void iflib_led_func(void *arg, int onoff) { if_ctx_t ctx = arg; CTX_LOCK(ctx); IFDI_LED_FUNC(ctx, onoff); CTX_UNLOCK(ctx); } /********************************************************************* * * BUS FUNCTION DEFINITIONS * **********************************************************************/ int iflib_device_probe(device_t dev) { const pci_vendor_info_t *ent; if_shared_ctx_t sctx; uint16_t pci_device_id, pci_rev_id, pci_subdevice_id, pci_subvendor_id; uint16_t pci_vendor_id; if ((sctx = DEVICE_REGISTER(dev)) == NULL || sctx->isc_magic != IFLIB_MAGIC) return (ENOTSUP); pci_vendor_id = pci_get_vendor(dev); pci_device_id = pci_get_device(dev); pci_subvendor_id = pci_get_subvendor(dev); pci_subdevice_id = pci_get_subdevice(dev); pci_rev_id = pci_get_revid(dev); if (sctx->isc_parse_devinfo != NULL) sctx->isc_parse_devinfo(&pci_device_id, &pci_subvendor_id, &pci_subdevice_id, &pci_rev_id); ent = sctx->isc_vendor_info; while (ent->pvi_vendor_id != 0) { if (pci_vendor_id != ent->pvi_vendor_id) { ent++; continue; } if ((pci_device_id == ent->pvi_device_id) && ((pci_subvendor_id == ent->pvi_subvendor_id) || (ent->pvi_subvendor_id == 0)) && ((pci_subdevice_id == ent->pvi_subdevice_id) || (ent->pvi_subdevice_id == 0)) && ((pci_rev_id == ent->pvi_rev_id) || (ent->pvi_rev_id == 0))) { device_set_desc_copy(dev, ent->pvi_name); /* this needs to be changed to zero if the bus probing code * ever stops re-probing on best match because the sctx * may have its values over written by register calls * in subsequent probes */ return (BUS_PROBE_DEFAULT); } ent++; } return (ENXIO); } int iflib_device_probe_vendor(device_t 
dev) { int probe; probe = iflib_device_probe(dev); if (probe == BUS_PROBE_DEFAULT) return (BUS_PROBE_VENDOR); else return (probe); } static void iflib_reset_qvalues(if_ctx_t ctx) { if_softc_ctx_t scctx = &ctx->ifc_softc_ctx; if_shared_ctx_t sctx = ctx->ifc_sctx; device_t dev = ctx->ifc_dev; int i; if (ctx->ifc_sysctl_ntxqs != 0) scctx->isc_ntxqsets = ctx->ifc_sysctl_ntxqs; if (ctx->ifc_sysctl_nrxqs != 0) scctx->isc_nrxqsets = ctx->ifc_sysctl_nrxqs; for (i = 0; i < sctx->isc_ntxqs; i++) { if (ctx->ifc_sysctl_ntxds[i] != 0) scctx->isc_ntxd[i] = ctx->ifc_sysctl_ntxds[i]; else scctx->isc_ntxd[i] = sctx->isc_ntxd_default[i]; } for (i = 0; i < sctx->isc_nrxqs; i++) { if (ctx->ifc_sysctl_nrxds[i] != 0) scctx->isc_nrxd[i] = ctx->ifc_sysctl_nrxds[i]; else scctx->isc_nrxd[i] = sctx->isc_nrxd_default[i]; } for (i = 0; i < sctx->isc_nrxqs; i++) { if (scctx->isc_nrxd[i] < sctx->isc_nrxd_min[i]) { device_printf(dev, "nrxd%d: %d less than nrxd_min %d - resetting to min\n", i, scctx->isc_nrxd[i], sctx->isc_nrxd_min[i]); scctx->isc_nrxd[i] = sctx->isc_nrxd_min[i]; } if (scctx->isc_nrxd[i] > sctx->isc_nrxd_max[i]) { device_printf(dev, "nrxd%d: %d greater than nrxd_max %d - resetting to max\n", i, scctx->isc_nrxd[i], sctx->isc_nrxd_max[i]); scctx->isc_nrxd[i] = sctx->isc_nrxd_max[i]; } if (!powerof2(scctx->isc_nrxd[i])) { device_printf(dev, "nrxd%d: %d is not a power of 2 - using default value of %d\n", i, scctx->isc_nrxd[i], sctx->isc_nrxd_default[i]); scctx->isc_nrxd[i] = sctx->isc_nrxd_default[i]; } } for (i = 0; i < sctx->isc_ntxqs; i++) { if (scctx->isc_ntxd[i] < sctx->isc_ntxd_min[i]) { device_printf(dev, "ntxd%d: %d less than ntxd_min %d - resetting to min\n", i, scctx->isc_ntxd[i], sctx->isc_ntxd_min[i]); scctx->isc_ntxd[i] = sctx->isc_ntxd_min[i]; } if (scctx->isc_ntxd[i] > sctx->isc_ntxd_max[i]) { device_printf(dev, "ntxd%d: %d greater than ntxd_max %d - resetting to max\n", i, scctx->isc_ntxd[i], sctx->isc_ntxd_max[i]); scctx->isc_ntxd[i] = sctx->isc_ntxd_max[i]; } if (!powerof2(scctx->isc_ntxd[i])) { device_printf(dev, "ntxd%d: %d is not a power of 2 - using default value of %d\n", i, scctx->isc_ntxd[i], sctx->isc_ntxd_default[i]); scctx->isc_ntxd[i] = sctx->isc_ntxd_default[i]; } } } static void iflib_add_pfil(if_ctx_t ctx) { struct pfil_head *pfil; struct pfil_head_args pa; iflib_rxq_t rxq; int i; pa.pa_version = PFIL_VERSION; pa.pa_flags = PFIL_IN; pa.pa_type = PFIL_TYPE_ETHERNET; pa.pa_headname = ctx->ifc_ifp->if_xname; pfil = pfil_head_register(&pa); for (i = 0, rxq = ctx->ifc_rxqs; i < NRXQSETS(ctx); i++, rxq++) { rxq->pfil = pfil; } } static void iflib_rem_pfil(if_ctx_t ctx) { struct pfil_head *pfil; iflib_rxq_t rxq; int i; rxq = ctx->ifc_rxqs; pfil = rxq->pfil; for (i = 0; i < NRXQSETS(ctx); i++, rxq++) { rxq->pfil = NULL; } pfil_head_unregister(pfil); } static uint16_t get_ctx_core_offset(if_ctx_t ctx) { if_softc_ctx_t scctx = &ctx->ifc_softc_ctx; struct cpu_offset *op; uint16_t qc; uint16_t ret = ctx->ifc_sysctl_core_offset; if (ret != CORE_OFFSET_UNSPECIFIED) return (ret); if (ctx->ifc_sysctl_separate_txrx) qc = scctx->isc_ntxqsets + scctx->isc_nrxqsets; else qc = max(scctx->isc_ntxqsets, scctx->isc_nrxqsets); mtx_lock(&cpu_offset_mtx); SLIST_FOREACH(op, &cpu_offsets, entries) { if (CPU_CMP(&ctx->ifc_cpus, &op->set) == 0) { ret = op->offset; op->offset += qc; MPASS(op->refcount < UINT_MAX); op->refcount++; break; } } if (ret == CORE_OFFSET_UNSPECIFIED) { ret = 0; op = malloc(sizeof(struct cpu_offset), M_IFLIB, M_NOWAIT | M_ZERO); if (op == NULL) { device_printf(ctx->ifc_dev, 
"allocation for cpu offset failed.\n"); } else { op->offset = qc; op->refcount = 1; CPU_COPY(&ctx->ifc_cpus, &op->set); SLIST_INSERT_HEAD(&cpu_offsets, op, entries); } } mtx_unlock(&cpu_offset_mtx); return (ret); } static void unref_ctx_core_offset(if_ctx_t ctx) { struct cpu_offset *op, *top; mtx_lock(&cpu_offset_mtx); SLIST_FOREACH_SAFE(op, &cpu_offsets, entries, top) { if (CPU_CMP(&ctx->ifc_cpus, &op->set) == 0) { MPASS(op->refcount > 0); op->refcount--; if (op->refcount == 0) { SLIST_REMOVE(&cpu_offsets, op, cpu_offset, entries); free(op, M_IFLIB); } break; } } mtx_unlock(&cpu_offset_mtx); } int iflib_device_register(device_t dev, void *sc, if_shared_ctx_t sctx, if_ctx_t *ctxp) { if_ctx_t ctx; if_t ifp; if_softc_ctx_t scctx; kobjop_desc_t kobj_desc; kobj_method_t *kobj_method; int err, msix, rid; uint16_t main_rxq, main_txq; ctx = malloc(sizeof(* ctx), M_IFLIB, M_WAITOK|M_ZERO); if (sc == NULL) { sc = malloc(sctx->isc_driver->size, M_IFLIB, M_WAITOK|M_ZERO); device_set_softc(dev, ctx); ctx->ifc_flags |= IFC_SC_ALLOCATED; } ctx->ifc_sctx = sctx; ctx->ifc_dev = dev; ctx->ifc_softc = sc; if ((err = iflib_register(ctx)) != 0) { device_printf(dev, "iflib_register failed %d\n", err); goto fail_ctx_free; } iflib_add_device_sysctl_pre(ctx); scctx = &ctx->ifc_softc_ctx; ifp = ctx->ifc_ifp; iflib_reset_qvalues(ctx); CTX_LOCK(ctx); if ((err = IFDI_ATTACH_PRE(ctx)) != 0) { device_printf(dev, "IFDI_ATTACH_PRE failed %d\n", err); goto fail_unlock; } _iflib_pre_assert(scctx); ctx->ifc_txrx = *scctx->isc_txrx; if (sctx->isc_flags & IFLIB_DRIVER_MEDIA) ctx->ifc_mediap = scctx->isc_media; #ifdef INVARIANTS if (scctx->isc_capabilities & IFCAP_TXCSUM) MPASS(scctx->isc_tx_csum_flags); #endif if_setcapabilities(ifp, scctx->isc_capabilities | IFCAP_HWSTATS | IFCAP_NOMAP); if_setcapenable(ifp, scctx->isc_capenable | IFCAP_HWSTATS | IFCAP_NOMAP); if (scctx->isc_ntxqsets == 0 || (scctx->isc_ntxqsets_max && scctx->isc_ntxqsets_max < scctx->isc_ntxqsets)) scctx->isc_ntxqsets = scctx->isc_ntxqsets_max; if (scctx->isc_nrxqsets == 0 || (scctx->isc_nrxqsets_max && scctx->isc_nrxqsets_max < scctx->isc_nrxqsets)) scctx->isc_nrxqsets = scctx->isc_nrxqsets_max; main_txq = (sctx->isc_flags & IFLIB_HAS_TXCQ) ? 1 : 0; main_rxq = (sctx->isc_flags & IFLIB_HAS_RXCQ) ? 1 : 0; /* XXX change for per-queue sizes */ device_printf(dev, "Using %d TX descriptors and %d RX descriptors\n", scctx->isc_ntxd[main_txq], scctx->isc_nrxd[main_rxq]); if (scctx->isc_tx_nsegments > scctx->isc_ntxd[main_txq] / MAX_SINGLE_PACKET_FRACTION) scctx->isc_tx_nsegments = max(1, scctx->isc_ntxd[main_txq] / MAX_SINGLE_PACKET_FRACTION); if (scctx->isc_tx_tso_segments_max > scctx->isc_ntxd[main_txq] / MAX_SINGLE_PACKET_FRACTION) scctx->isc_tx_tso_segments_max = max(1, scctx->isc_ntxd[main_txq] / MAX_SINGLE_PACKET_FRACTION); /* TSO parameters - dig these out of the data sheet - simply correspond to tag setup */ if (if_getcapabilities(ifp) & IFCAP_TSO) { /* * The stack can't handle a TSO size larger than IP_MAXPACKET, * but some MACs do. */ if_sethwtsomax(ifp, min(scctx->isc_tx_tso_size_max, IP_MAXPACKET)); /* * Take maximum number of m_pullup(9)'s in iflib_parse_header() * into account. In the worst case, each of these calls will * add another mbuf and, thus, the requirement for another DMA * segment. So for best performance, it doesn't make sense to * advertize a maximum of TSO segments that typically will * require defragmentation in iflib_encap(). 
*/ if_sethwtsomaxsegcount(ifp, scctx->isc_tx_tso_segments_max - 3); if_sethwtsomaxsegsize(ifp, scctx->isc_tx_tso_segsize_max); } if (scctx->isc_rss_table_size == 0) scctx->isc_rss_table_size = 64; scctx->isc_rss_table_mask = scctx->isc_rss_table_size-1; GROUPTASK_INIT(&ctx->ifc_admin_task, 0, _task_fn_admin, ctx); /* XXX format name */ taskqgroup_attach(qgroup_if_config_tqg, &ctx->ifc_admin_task, ctx, NULL, NULL, "admin"); /* Set up cpu set. If it fails, use the set of all CPUs. */ if (bus_get_cpus(dev, INTR_CPUS, sizeof(ctx->ifc_cpus), &ctx->ifc_cpus) != 0) { device_printf(dev, "Unable to fetch CPU list\n"); CPU_COPY(&all_cpus, &ctx->ifc_cpus); } MPASS(CPU_COUNT(&ctx->ifc_cpus) > 0); /* ** Now set up MSI or MSI-X, should return us the number of supported ** vectors (will be 1 for a legacy interrupt and MSI). */ if (sctx->isc_flags & IFLIB_SKIP_MSIX) { msix = scctx->isc_vectors; } else if (scctx->isc_msix_bar != 0) /* * The simple fact that isc_msix_bar is not 0 does not mean we * we have a good value there that is known to work. */ msix = iflib_msix_init(ctx); else { scctx->isc_vectors = 1; scctx->isc_ntxqsets = 1; scctx->isc_nrxqsets = 1; scctx->isc_intr = IFLIB_INTR_LEGACY; msix = 0; } /* Get memory for the station queues */ if ((err = iflib_queues_alloc(ctx))) { device_printf(dev, "Unable to allocate queue memory\n"); goto fail_intr_free; } if ((err = iflib_qset_structures_setup(ctx))) goto fail_queues; /* * Now that we know how many queues there are, get the core offset. */ ctx->ifc_sysctl_core_offset = get_ctx_core_offset(ctx); /* * Group taskqueues aren't properly set up until SMP is started, * so we disable interrupts until we can handle them post * SI_SUB_SMP. * * XXX: disabling interrupts doesn't actually work, at least for * the non-MSI case. When they occur before SI_SUB_SMP completes, * we do null handling and depend on this not causing too large an * interrupt storm. */ IFDI_INTR_DISABLE(ctx); if (msix > 1) { /* * When using MSI-X, ensure that ifdi_{r,t}x_queue_intr_enable * aren't the default NULL implementation. */ kobj_desc = &ifdi_rx_queue_intr_enable_desc; kobj_method = kobj_lookup_method(((kobj_t)ctx)->ops->cls, NULL, kobj_desc); if (kobj_method == &kobj_desc->deflt) { device_printf(dev, "MSI-X requires ifdi_rx_queue_intr_enable method"); err = EOPNOTSUPP; goto fail_queues; } kobj_desc = &ifdi_tx_queue_intr_enable_desc; kobj_method = kobj_lookup_method(((kobj_t)ctx)->ops->cls, NULL, kobj_desc); if (kobj_method == &kobj_desc->deflt) { device_printf(dev, "MSI-X requires ifdi_tx_queue_intr_enable method"); err = EOPNOTSUPP; goto fail_queues; } /* * Assign the MSI-X vectors. * Note that the default NULL ifdi_msix_intr_assign method will * fail here, too. */ err = IFDI_MSIX_INTR_ASSIGN(ctx, msix); if (err != 0) { device_printf(dev, "IFDI_MSIX_INTR_ASSIGN failed %d\n", err); goto fail_queues; } } else if (scctx->isc_intr != IFLIB_INTR_MSIX) { rid = 0; if (scctx->isc_intr == IFLIB_INTR_MSI) { MPASS(msix == 1); rid = 1; } if ((err = iflib_legacy_setup(ctx, ctx->isc_legacy_intr, ctx->ifc_softc, &rid, "irq0")) != 0) { device_printf(dev, "iflib_legacy_setup failed %d\n", err); goto fail_queues; } } else { device_printf(dev, "Cannot use iflib with only 1 MSI-X interrupt!\n"); err = ENODEV; goto fail_intr_free; } ether_ifattach(ctx->ifc_ifp, ctx->ifc_mac.octet); if ((err = IFDI_ATTACH_POST(ctx)) != 0) { device_printf(dev, "IFDI_ATTACH_POST failed %d\n", err); goto fail_detach; } /* * Tell the upper layer(s) if IFCAP_VLAN_MTU is supported. 
* This must appear after the call to ether_ifattach() because * ether_ifattach() sets if_hdrlen to the default value. */ if (if_getcapabilities(ifp) & IFCAP_VLAN_MTU) if_setifheaderlen(ifp, sizeof(struct ether_vlan_header)); if ((err = iflib_netmap_attach(ctx))) { device_printf(ctx->ifc_dev, "netmap attach failed: %d\n", err); goto fail_detach; } *ctxp = ctx; DEBUGNET_SET(ctx->ifc_ifp, iflib); if_setgetcounterfn(ctx->ifc_ifp, iflib_if_get_counter); iflib_add_device_sysctl_post(ctx); iflib_add_pfil(ctx); ctx->ifc_flags |= IFC_INIT_DONE; CTX_UNLOCK(ctx); return (0); fail_detach: ether_ifdetach(ctx->ifc_ifp); fail_intr_free: iflib_free_intr_mem(ctx); fail_queues: iflib_tx_structures_free(ctx); iflib_rx_structures_free(ctx); taskqgroup_detach(qgroup_if_config_tqg, &ctx->ifc_admin_task); IFDI_DETACH(ctx); fail_unlock: CTX_UNLOCK(ctx); iflib_deregister(ctx); fail_ctx_free: device_set_softc(ctx->ifc_dev, NULL); if (ctx->ifc_flags & IFC_SC_ALLOCATED) free(ctx->ifc_softc, M_IFLIB); free(ctx, M_IFLIB); return (err); } int iflib_pseudo_register(device_t dev, if_shared_ctx_t sctx, if_ctx_t *ctxp, struct iflib_cloneattach_ctx *clctx) { int err; if_ctx_t ctx; if_t ifp; if_softc_ctx_t scctx; int i; void *sc; uint16_t main_txq; uint16_t main_rxq; ctx = malloc(sizeof(*ctx), M_IFLIB, M_WAITOK|M_ZERO); sc = malloc(sctx->isc_driver->size, M_IFLIB, M_WAITOK|M_ZERO); ctx->ifc_flags |= IFC_SC_ALLOCATED; if (sctx->isc_flags & (IFLIB_PSEUDO|IFLIB_VIRTUAL)) ctx->ifc_flags |= IFC_PSEUDO; ctx->ifc_sctx = sctx; ctx->ifc_softc = sc; ctx->ifc_dev = dev; if ((err = iflib_register(ctx)) != 0) { device_printf(dev, "%s: iflib_register failed %d\n", __func__, err); goto fail_ctx_free; } iflib_add_device_sysctl_pre(ctx); scctx = &ctx->ifc_softc_ctx; ifp = ctx->ifc_ifp; iflib_reset_qvalues(ctx); CTX_LOCK(ctx); if ((err = IFDI_ATTACH_PRE(ctx)) != 0) { device_printf(dev, "IFDI_ATTACH_PRE failed %d\n", err); goto fail_unlock; } if (sctx->isc_flags & IFLIB_GEN_MAC) ether_gen_addr(ifp, &ctx->ifc_mac); if ((err = IFDI_CLONEATTACH(ctx, clctx->cc_ifc, clctx->cc_name, clctx->cc_params)) != 0) { device_printf(dev, "IFDI_CLONEATTACH failed %d\n", err); goto fail_ctx_free; } ifmedia_add(ctx->ifc_mediap, IFM_ETHER | IFM_1000_T | IFM_FDX, 0, NULL); ifmedia_add(ctx->ifc_mediap, IFM_ETHER | IFM_AUTO, 0, NULL); ifmedia_set(ctx->ifc_mediap, IFM_ETHER | IFM_AUTO); #ifdef INVARIANTS if (scctx->isc_capabilities & IFCAP_TXCSUM) MPASS(scctx->isc_tx_csum_flags); #endif if_setcapabilities(ifp, scctx->isc_capabilities | IFCAP_HWSTATS | IFCAP_LINKSTATE); if_setcapenable(ifp, scctx->isc_capenable | IFCAP_HWSTATS | IFCAP_LINKSTATE); ifp->if_flags |= IFF_NOGROUP; if (sctx->isc_flags & IFLIB_PSEUDO) { ether_ifattach(ctx->ifc_ifp, ctx->ifc_mac.octet); if ((err = IFDI_ATTACH_POST(ctx)) != 0) { device_printf(dev, "IFDI_ATTACH_POST failed %d\n", err); goto fail_detach; } *ctxp = ctx; /* * Tell the upper layer(s) if IFCAP_VLAN_MTU is supported. * This must appear after the call to ether_ifattach() because * ether_ifattach() sets if_hdrlen to the default value. 
*/ if (if_getcapabilities(ifp) & IFCAP_VLAN_MTU) if_setifheaderlen(ifp, sizeof(struct ether_vlan_header)); if_setgetcounterfn(ctx->ifc_ifp, iflib_if_get_counter); iflib_add_device_sysctl_post(ctx); ctx->ifc_flags |= IFC_INIT_DONE; return (0); } _iflib_pre_assert(scctx); ctx->ifc_txrx = *scctx->isc_txrx; if (scctx->isc_ntxqsets == 0 || (scctx->isc_ntxqsets_max && scctx->isc_ntxqsets_max < scctx->isc_ntxqsets)) scctx->isc_ntxqsets = scctx->isc_ntxqsets_max; if (scctx->isc_nrxqsets == 0 || (scctx->isc_nrxqsets_max && scctx->isc_nrxqsets_max < scctx->isc_nrxqsets)) scctx->isc_nrxqsets = scctx->isc_nrxqsets_max; main_txq = (sctx->isc_flags & IFLIB_HAS_TXCQ) ? 1 : 0; main_rxq = (sctx->isc_flags & IFLIB_HAS_RXCQ) ? 1 : 0; /* XXX change for per-queue sizes */ device_printf(dev, "Using %d TX descriptors and %d RX descriptors\n", scctx->isc_ntxd[main_txq], scctx->isc_nrxd[main_rxq]); if (scctx->isc_tx_nsegments > scctx->isc_ntxd[main_txq] / MAX_SINGLE_PACKET_FRACTION) scctx->isc_tx_nsegments = max(1, scctx->isc_ntxd[main_txq] / MAX_SINGLE_PACKET_FRACTION); if (scctx->isc_tx_tso_segments_max > scctx->isc_ntxd[main_txq] / MAX_SINGLE_PACKET_FRACTION) scctx->isc_tx_tso_segments_max = max(1, scctx->isc_ntxd[main_txq] / MAX_SINGLE_PACKET_FRACTION); /* TSO parameters - dig these out of the data sheet - simply correspond to tag setup */ if (if_getcapabilities(ifp) & IFCAP_TSO) { /* * The stack can't handle a TSO size larger than IP_MAXPACKET, * but some MACs do. */ if_sethwtsomax(ifp, min(scctx->isc_tx_tso_size_max, IP_MAXPACKET)); /* * Take maximum number of m_pullup(9)'s in iflib_parse_header() * into account. In the worst case, each of these calls will * add another mbuf and, thus, the requirement for another DMA * segment. So for best performance, it doesn't make sense to * advertize a maximum of TSO segments that typically will * require defragmentation in iflib_encap(). */ if_sethwtsomaxsegcount(ifp, scctx->isc_tx_tso_segments_max - 3); if_sethwtsomaxsegsize(ifp, scctx->isc_tx_tso_segsize_max); } if (scctx->isc_rss_table_size == 0) scctx->isc_rss_table_size = 64; scctx->isc_rss_table_mask = scctx->isc_rss_table_size-1; GROUPTASK_INIT(&ctx->ifc_admin_task, 0, _task_fn_admin, ctx); /* XXX format name */ taskqgroup_attach(qgroup_if_config_tqg, &ctx->ifc_admin_task, ctx, NULL, NULL, "admin"); /* XXX --- can support > 1 -- but keep it simple for now */ scctx->isc_intr = IFLIB_INTR_LEGACY; /* Get memory for the station queues */ if ((err = iflib_queues_alloc(ctx))) { device_printf(dev, "Unable to allocate queue memory\n"); goto fail_iflib_detach; } if ((err = iflib_qset_structures_setup(ctx))) { device_printf(dev, "qset structure setup failed %d\n", err); goto fail_queues; } /* * XXX What if anything do we want to do about interrupts? */ ether_ifattach(ctx->ifc_ifp, ctx->ifc_mac.octet); if ((err = IFDI_ATTACH_POST(ctx)) != 0) { device_printf(dev, "IFDI_ATTACH_POST failed %d\n", err); goto fail_detach; } /* * Tell the upper layer(s) if IFCAP_VLAN_MTU is supported. * This must appear after the call to ether_ifattach() because * ether_ifattach() sets if_hdrlen to the default value. 
*/ if (if_getcapabilities(ifp) & IFCAP_VLAN_MTU) if_setifheaderlen(ifp, sizeof(struct ether_vlan_header)); /* XXX handle more than one queue */ for (i = 0; i < scctx->isc_nrxqsets; i++) IFDI_RX_CLSET(ctx, 0, i, ctx->ifc_rxqs[i].ifr_fl[0].ifl_sds.ifsd_cl); *ctxp = ctx; if_setgetcounterfn(ctx->ifc_ifp, iflib_if_get_counter); iflib_add_device_sysctl_post(ctx); ctx->ifc_flags |= IFC_INIT_DONE; CTX_UNLOCK(ctx); return (0); fail_detach: ether_ifdetach(ctx->ifc_ifp); fail_queues: iflib_tx_structures_free(ctx); iflib_rx_structures_free(ctx); fail_iflib_detach: IFDI_DETACH(ctx); fail_unlock: CTX_UNLOCK(ctx); iflib_deregister(ctx); fail_ctx_free: free(ctx->ifc_softc, M_IFLIB); free(ctx, M_IFLIB); return (err); } int iflib_pseudo_deregister(if_ctx_t ctx) { if_t ifp = ctx->ifc_ifp; iflib_txq_t txq; iflib_rxq_t rxq; int i, j; struct taskqgroup *tqg; iflib_fl_t fl; /* Unregister VLAN event handlers early */ iflib_unregister_vlan_handlers(ctx); ether_ifdetach(ifp); /* XXX drain any dependent tasks */ tqg = qgroup_if_io_tqg; for (txq = ctx->ifc_txqs, i = 0; i < NTXQSETS(ctx); i++, txq++) { callout_drain(&txq->ift_timer); if (txq->ift_task.gt_uniq != NULL) taskqgroup_detach(tqg, &txq->ift_task); } for (i = 0, rxq = ctx->ifc_rxqs; i < NRXQSETS(ctx); i++, rxq++) { callout_drain(&rxq->ifr_watchdog); if (rxq->ifr_task.gt_uniq != NULL) taskqgroup_detach(tqg, &rxq->ifr_task); for (j = 0, fl = rxq->ifr_fl; j < rxq->ifr_nfl; j++, fl++) free(fl->ifl_rx_bitmap, M_IFLIB); } tqg = qgroup_if_config_tqg; if (ctx->ifc_admin_task.gt_uniq != NULL) taskqgroup_detach(tqg, &ctx->ifc_admin_task); if (ctx->ifc_vflr_task.gt_uniq != NULL) taskqgroup_detach(tqg, &ctx->ifc_vflr_task); iflib_tx_structures_free(ctx); iflib_rx_structures_free(ctx); iflib_deregister(ctx); if (ctx->ifc_flags & IFC_SC_ALLOCATED) free(ctx->ifc_softc, M_IFLIB); free(ctx, M_IFLIB); return (0); } int iflib_device_attach(device_t dev) { if_ctx_t ctx; if_shared_ctx_t sctx; if ((sctx = DEVICE_REGISTER(dev)) == NULL || sctx->isc_magic != IFLIB_MAGIC) return (ENOTSUP); pci_enable_busmaster(dev); return (iflib_device_register(dev, NULL, sctx, &ctx)); } int iflib_device_deregister(if_ctx_t ctx) { if_t ifp = ctx->ifc_ifp; iflib_txq_t txq; iflib_rxq_t rxq; device_t dev = ctx->ifc_dev; int i, j; struct taskqgroup *tqg; iflib_fl_t fl; /* Make sure VLANS are not using driver */ if (if_vlantrunkinuse(ifp)) { device_printf(dev, "Vlan in use, detach first\n"); return (EBUSY); } #ifdef PCI_IOV if (!CTX_IS_VF(ctx) && pci_iov_detach(dev) != 0) { device_printf(dev, "SR-IOV in use; detach first.\n"); return (EBUSY); } #endif STATE_LOCK(ctx); ctx->ifc_flags |= IFC_IN_DETACH; STATE_UNLOCK(ctx); /* Unregister VLAN handlers before calling iflib_stop() */ iflib_unregister_vlan_handlers(ctx); iflib_netmap_detach(ifp); ether_ifdetach(ifp); CTX_LOCK(ctx); iflib_stop(ctx); CTX_UNLOCK(ctx); iflib_rem_pfil(ctx); if (ctx->ifc_led_dev != NULL) led_destroy(ctx->ifc_led_dev); /* XXX drain any dependent tasks */ tqg = qgroup_if_io_tqg; for (txq = ctx->ifc_txqs, i = 0; i < NTXQSETS(ctx); i++, txq++) { callout_drain(&txq->ift_timer); if (txq->ift_task.gt_uniq != NULL) taskqgroup_detach(tqg, &txq->ift_task); } for (i = 0, rxq = ctx->ifc_rxqs; i < NRXQSETS(ctx); i++, rxq++) { if (rxq->ifr_task.gt_uniq != NULL) taskqgroup_detach(tqg, &rxq->ifr_task); for (j = 0, fl = rxq->ifr_fl; j < rxq->ifr_nfl; j++, fl++) free(fl->ifl_rx_bitmap, M_IFLIB); } tqg = qgroup_if_config_tqg; if (ctx->ifc_admin_task.gt_uniq != NULL) taskqgroup_detach(tqg, &ctx->ifc_admin_task); if (ctx->ifc_vflr_task.gt_uniq != NULL) 
taskqgroup_detach(tqg, &ctx->ifc_vflr_task); CTX_LOCK(ctx); IFDI_DETACH(ctx); CTX_UNLOCK(ctx); /* ether_ifdetach calls if_qflush - lock must be destroy afterwards*/ iflib_free_intr_mem(ctx); bus_generic_detach(dev); iflib_tx_structures_free(ctx); iflib_rx_structures_free(ctx); iflib_deregister(ctx); device_set_softc(ctx->ifc_dev, NULL); if (ctx->ifc_flags & IFC_SC_ALLOCATED) free(ctx->ifc_softc, M_IFLIB); unref_ctx_core_offset(ctx); free(ctx, M_IFLIB); return (0); } static void iflib_free_intr_mem(if_ctx_t ctx) { if (ctx->ifc_softc_ctx.isc_intr != IFLIB_INTR_MSIX) { iflib_irq_free(ctx, &ctx->ifc_legacy_irq); } if (ctx->ifc_softc_ctx.isc_intr != IFLIB_INTR_LEGACY) { pci_release_msi(ctx->ifc_dev); } if (ctx->ifc_msix_mem != NULL) { bus_release_resource(ctx->ifc_dev, SYS_RES_MEMORY, rman_get_rid(ctx->ifc_msix_mem), ctx->ifc_msix_mem); ctx->ifc_msix_mem = NULL; } } int iflib_device_detach(device_t dev) { if_ctx_t ctx = device_get_softc(dev); return (iflib_device_deregister(ctx)); } int iflib_device_suspend(device_t dev) { if_ctx_t ctx = device_get_softc(dev); CTX_LOCK(ctx); IFDI_SUSPEND(ctx); CTX_UNLOCK(ctx); return bus_generic_suspend(dev); } int iflib_device_shutdown(device_t dev) { if_ctx_t ctx = device_get_softc(dev); CTX_LOCK(ctx); IFDI_SHUTDOWN(ctx); CTX_UNLOCK(ctx); return bus_generic_suspend(dev); } int iflib_device_resume(device_t dev) { if_ctx_t ctx = device_get_softc(dev); iflib_txq_t txq = ctx->ifc_txqs; CTX_LOCK(ctx); IFDI_RESUME(ctx); iflib_if_init_locked(ctx); CTX_UNLOCK(ctx); for (int i = 0; i < NTXQSETS(ctx); i++, txq++) iflib_txq_check_drain(txq, IFLIB_RESTART_BUDGET); return (bus_generic_resume(dev)); } int iflib_device_iov_init(device_t dev, uint16_t num_vfs, const nvlist_t *params) { int error; if_ctx_t ctx = device_get_softc(dev); CTX_LOCK(ctx); error = IFDI_IOV_INIT(ctx, num_vfs, params); CTX_UNLOCK(ctx); return (error); } void iflib_device_iov_uninit(device_t dev) { if_ctx_t ctx = device_get_softc(dev); CTX_LOCK(ctx); IFDI_IOV_UNINIT(ctx); CTX_UNLOCK(ctx); } int iflib_device_iov_add_vf(device_t dev, uint16_t vfnum, const nvlist_t *params) { int error; if_ctx_t ctx = device_get_softc(dev); CTX_LOCK(ctx); error = IFDI_IOV_VF_ADD(ctx, vfnum, params); CTX_UNLOCK(ctx); return (error); } /********************************************************************* * * MODULE FUNCTION DEFINITIONS * **********************************************************************/ /* * - Start a fast taskqueue thread for each core * - Start a taskqueue for control operations */ static int iflib_module_init(void) { return (0); } static int iflib_module_event_handler(module_t mod, int what, void *arg) { int err; switch (what) { case MOD_LOAD: if ((err = iflib_module_init()) != 0) return (err); break; case MOD_UNLOAD: return (EBUSY); default: return (EOPNOTSUPP); } return (0); } /********************************************************************* * * PUBLIC FUNCTION DEFINITIONS * ordered as in iflib.h * **********************************************************************/ static void _iflib_assert(if_shared_ctx_t sctx) { int i; MPASS(sctx->isc_tx_maxsize); MPASS(sctx->isc_tx_maxsegsize); MPASS(sctx->isc_rx_maxsize); MPASS(sctx->isc_rx_nsegments); MPASS(sctx->isc_rx_maxsegsize); MPASS(sctx->isc_nrxqs >= 1 && sctx->isc_nrxqs <= 8); for (i = 0; i < sctx->isc_nrxqs; i++) { MPASS(sctx->isc_nrxd_min[i]); MPASS(powerof2(sctx->isc_nrxd_min[i])); MPASS(sctx->isc_nrxd_max[i]); MPASS(powerof2(sctx->isc_nrxd_max[i])); MPASS(sctx->isc_nrxd_default[i]); MPASS(powerof2(sctx->isc_nrxd_default[i])); } 
MPASS(sctx->isc_ntxqs >= 1 && sctx->isc_ntxqs <= 8); for (i = 0; i < sctx->isc_ntxqs; i++) { MPASS(sctx->isc_ntxd_min[i]); MPASS(powerof2(sctx->isc_ntxd_min[i])); MPASS(sctx->isc_ntxd_max[i]); MPASS(powerof2(sctx->isc_ntxd_max[i])); MPASS(sctx->isc_ntxd_default[i]); MPASS(powerof2(sctx->isc_ntxd_default[i])); } } static void _iflib_pre_assert(if_softc_ctx_t scctx) { MPASS(scctx->isc_txrx->ift_txd_encap); MPASS(scctx->isc_txrx->ift_txd_flush); MPASS(scctx->isc_txrx->ift_txd_credits_update); MPASS(scctx->isc_txrx->ift_rxd_available); MPASS(scctx->isc_txrx->ift_rxd_pkt_get); MPASS(scctx->isc_txrx->ift_rxd_refill); MPASS(scctx->isc_txrx->ift_rxd_flush); } static int iflib_register(if_ctx_t ctx) { if_shared_ctx_t sctx = ctx->ifc_sctx; driver_t *driver = sctx->isc_driver; device_t dev = ctx->ifc_dev; if_t ifp; _iflib_assert(sctx); CTX_LOCK_INIT(ctx); STATE_LOCK_INIT(ctx, device_get_nameunit(ctx->ifc_dev)); ifp = ctx->ifc_ifp = if_alloc(IFT_ETHER); if (ifp == NULL) { device_printf(dev, "can not allocate ifnet structure\n"); return (ENOMEM); } /* * Initialize our context's device specific methods */ kobj_init((kobj_t) ctx, (kobj_class_t) driver); kobj_class_compile((kobj_class_t) driver); if_initname(ifp, device_get_name(dev), device_get_unit(dev)); if_setsoftc(ifp, ctx); if_setdev(ifp, dev); if_setinitfn(ifp, iflib_if_init); if_setioctlfn(ifp, iflib_if_ioctl); #ifdef ALTQ if_setstartfn(ifp, iflib_altq_if_start); if_settransmitfn(ifp, iflib_altq_if_transmit); if_setsendqready(ifp); #else if_settransmitfn(ifp, iflib_if_transmit); #endif if_setqflushfn(ifp, iflib_if_qflush); - if_setflags(ifp, IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST); + if_setflags(ifp, IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST | + IFF_KNOWSEPOCH); ctx->ifc_vlan_attach_event = EVENTHANDLER_REGISTER(vlan_config, iflib_vlan_register, ctx, EVENTHANDLER_PRI_FIRST); ctx->ifc_vlan_detach_event = EVENTHANDLER_REGISTER(vlan_unconfig, iflib_vlan_unregister, ctx, EVENTHANDLER_PRI_FIRST); if ((sctx->isc_flags & IFLIB_DRIVER_MEDIA) == 0) { ctx->ifc_mediap = &ctx->ifc_media; ifmedia_init(ctx->ifc_mediap, IFM_IMASK, iflib_media_change, iflib_media_status); } return (0); } static void iflib_unregister_vlan_handlers(if_ctx_t ctx) { /* Unregister VLAN events */ if (ctx->ifc_vlan_attach_event != NULL) { EVENTHANDLER_DEREGISTER(vlan_config, ctx->ifc_vlan_attach_event); ctx->ifc_vlan_attach_event = NULL; } if (ctx->ifc_vlan_detach_event != NULL) { EVENTHANDLER_DEREGISTER(vlan_unconfig, ctx->ifc_vlan_detach_event); ctx->ifc_vlan_detach_event = NULL; } } static void iflib_deregister(if_ctx_t ctx) { if_t ifp = ctx->ifc_ifp; /* Remove all media */ ifmedia_removeall(&ctx->ifc_media); /* Ensure that VLAN event handlers are unregistered */ iflib_unregister_vlan_handlers(ctx); /* Release kobject reference */ kobj_delete((kobj_t) ctx, NULL); /* Free the ifnet structure */ if_free(ifp); STATE_LOCK_DESTROY(ctx); /* ether_ifdetach calls if_qflush - lock must be destroy afterwards*/ CTX_LOCK_DESTROY(ctx); } static int iflib_queues_alloc(if_ctx_t ctx) { if_shared_ctx_t sctx = ctx->ifc_sctx; if_softc_ctx_t scctx = &ctx->ifc_softc_ctx; device_t dev = ctx->ifc_dev; int nrxqsets = scctx->isc_nrxqsets; int ntxqsets = scctx->isc_ntxqsets; iflib_txq_t txq; iflib_rxq_t rxq; iflib_fl_t fl = NULL; int i, j, cpu, err, txconf, rxconf; iflib_dma_info_t ifdip; uint32_t *rxqsizes = scctx->isc_rxqsizes; uint32_t *txqsizes = scctx->isc_txqsizes; uint8_t nrxqs = sctx->isc_nrxqs; uint8_t ntxqs = sctx->isc_ntxqs; int nfree_lists = sctx->isc_nfl ? 
sctx->isc_nfl : 1; caddr_t *vaddrs; uint64_t *paddrs; KASSERT(ntxqs > 0, ("number of queues per qset must be at least 1")); KASSERT(nrxqs > 0, ("number of queues per qset must be at least 1")); /* Allocate the TX ring struct memory */ if (!(ctx->ifc_txqs = (iflib_txq_t) malloc(sizeof(struct iflib_txq) * ntxqsets, M_IFLIB, M_NOWAIT | M_ZERO))) { device_printf(dev, "Unable to allocate TX ring memory\n"); err = ENOMEM; goto fail; } /* Now allocate the RX */ if (!(ctx->ifc_rxqs = (iflib_rxq_t) malloc(sizeof(struct iflib_rxq) * nrxqsets, M_IFLIB, M_NOWAIT | M_ZERO))) { device_printf(dev, "Unable to allocate RX ring memory\n"); err = ENOMEM; goto rx_fail; } txq = ctx->ifc_txqs; rxq = ctx->ifc_rxqs; /* * XXX handle allocation failure */ for (txconf = i = 0, cpu = CPU_FIRST(); i < ntxqsets; i++, txconf++, txq++, cpu = CPU_NEXT(cpu)) { /* Set up some basics */ if ((ifdip = malloc(sizeof(struct iflib_dma_info) * ntxqs, M_IFLIB, M_NOWAIT | M_ZERO)) == NULL) { device_printf(dev, "Unable to allocate TX DMA info memory\n"); err = ENOMEM; goto err_tx_desc; } txq->ift_ifdi = ifdip; for (j = 0; j < ntxqs; j++, ifdip++) { if (iflib_dma_alloc(ctx, txqsizes[j], ifdip, 0)) { device_printf(dev, "Unable to allocate TX descriptors\n"); err = ENOMEM; goto err_tx_desc; } txq->ift_txd_size[j] = scctx->isc_txd_size[j]; bzero((void *)ifdip->idi_vaddr, txqsizes[j]); } txq->ift_ctx = ctx; txq->ift_id = i; if (sctx->isc_flags & IFLIB_HAS_TXCQ) { txq->ift_br_offset = 1; } else { txq->ift_br_offset = 0; } /* XXX fix this */ txq->ift_timer.c_cpu = cpu; if (iflib_txsd_alloc(txq)) { device_printf(dev, "Critical Failure setting up TX buffers\n"); err = ENOMEM; goto err_tx_desc; } /* Initialize the TX lock */ snprintf(txq->ift_mtx_name, MTX_NAME_LEN, "%s:TX(%d):callout", device_get_nameunit(dev), txq->ift_id); mtx_init(&txq->ift_mtx, txq->ift_mtx_name, NULL, MTX_DEF); callout_init_mtx(&txq->ift_timer, &txq->ift_mtx, 0); err = ifmp_ring_alloc(&txq->ift_br, 2048, txq, iflib_txq_drain, iflib_txq_can_drain, M_IFLIB, M_WAITOK); if (err) { /* XXX free any allocated rings */ device_printf(dev, "Unable to allocate buf_ring\n"); goto err_tx_desc; } } for (rxconf = i = 0; i < nrxqsets; i++, rxconf++, rxq++) { /* Set up some basics */ callout_init(&rxq->ifr_watchdog, 1); if ((ifdip = malloc(sizeof(struct iflib_dma_info) * nrxqs, M_IFLIB, M_NOWAIT | M_ZERO)) == NULL) { device_printf(dev, "Unable to allocate RX DMA info memory\n"); err = ENOMEM; goto err_tx_desc; } rxq->ifr_ifdi = ifdip; /* XXX this needs to be changed if #rx queues != #tx queues */ rxq->ifr_ntxqirq = 1; rxq->ifr_txqid[0] = i; for (j = 0; j < nrxqs; j++, ifdip++) { if (iflib_dma_alloc(ctx, rxqsizes[j], ifdip, 0)) { device_printf(dev, "Unable to allocate RX descriptors\n"); err = ENOMEM; goto err_tx_desc; } bzero((void *)ifdip->idi_vaddr, rxqsizes[j]); } rxq->ifr_ctx = ctx; rxq->ifr_id = i; if (sctx->isc_flags & IFLIB_HAS_RXCQ) { rxq->ifr_fl_offset = 1; } else { rxq->ifr_fl_offset = 0; } rxq->ifr_nfl = nfree_lists; if (!(fl = (iflib_fl_t) malloc(sizeof(struct iflib_fl) * nfree_lists, M_IFLIB, M_NOWAIT | M_ZERO))) { device_printf(dev, "Unable to allocate free list memory\n"); err = ENOMEM; goto err_tx_desc; } rxq->ifr_fl = fl; for (j = 0; j < nfree_lists; j++) { fl[j].ifl_rxq = rxq; fl[j].ifl_id = j; fl[j].ifl_ifdi = &rxq->ifr_ifdi[j + rxq->ifr_fl_offset]; fl[j].ifl_rxd_size = scctx->isc_rxd_size[j]; } /* Allocate receive buffers for the ring */ if (iflib_rxsd_alloc(rxq)) { device_printf(dev, "Critical Failure setting up receive buffers\n"); err = ENOMEM; goto err_rx_desc; } 
for (j = 0, fl = rxq->ifr_fl; j < rxq->ifr_nfl; j++, fl++) fl->ifl_rx_bitmap = bit_alloc(fl->ifl_size, M_IFLIB, M_WAITOK); } /* TXQs */ vaddrs = malloc(sizeof(caddr_t)*ntxqsets*ntxqs, M_IFLIB, M_WAITOK); paddrs = malloc(sizeof(uint64_t)*ntxqsets*ntxqs, M_IFLIB, M_WAITOK); for (i = 0; i < ntxqsets; i++) { iflib_dma_info_t di = ctx->ifc_txqs[i].ift_ifdi; for (j = 0; j < ntxqs; j++, di++) { vaddrs[i*ntxqs + j] = di->idi_vaddr; paddrs[i*ntxqs + j] = di->idi_paddr; } } if ((err = IFDI_TX_QUEUES_ALLOC(ctx, vaddrs, paddrs, ntxqs, ntxqsets)) != 0) { device_printf(ctx->ifc_dev, "Unable to allocate device TX queue\n"); iflib_tx_structures_free(ctx); free(vaddrs, M_IFLIB); free(paddrs, M_IFLIB); goto err_rx_desc; } free(vaddrs, M_IFLIB); free(paddrs, M_IFLIB); /* RXQs */ vaddrs = malloc(sizeof(caddr_t)*nrxqsets*nrxqs, M_IFLIB, M_WAITOK); paddrs = malloc(sizeof(uint64_t)*nrxqsets*nrxqs, M_IFLIB, M_WAITOK); for (i = 0; i < nrxqsets; i++) { iflib_dma_info_t di = ctx->ifc_rxqs[i].ifr_ifdi; for (j = 0; j < nrxqs; j++, di++) { vaddrs[i*nrxqs + j] = di->idi_vaddr; paddrs[i*nrxqs + j] = di->idi_paddr; } } if ((err = IFDI_RX_QUEUES_ALLOC(ctx, vaddrs, paddrs, nrxqs, nrxqsets)) != 0) { device_printf(ctx->ifc_dev, "Unable to allocate device RX queue\n"); iflib_tx_structures_free(ctx); free(vaddrs, M_IFLIB); free(paddrs, M_IFLIB); goto err_rx_desc; } free(vaddrs, M_IFLIB); free(paddrs, M_IFLIB); return (0); /* XXX handle allocation failure changes */ err_rx_desc: err_tx_desc: rx_fail: if (ctx->ifc_rxqs != NULL) free(ctx->ifc_rxqs, M_IFLIB); ctx->ifc_rxqs = NULL; if (ctx->ifc_txqs != NULL) free(ctx->ifc_txqs, M_IFLIB); ctx->ifc_txqs = NULL; fail: return (err); } static int iflib_tx_structures_setup(if_ctx_t ctx) { iflib_txq_t txq = ctx->ifc_txqs; int i; for (i = 0; i < NTXQSETS(ctx); i++, txq++) iflib_txq_setup(txq); return (0); } static void iflib_tx_structures_free(if_ctx_t ctx) { iflib_txq_t txq = ctx->ifc_txqs; if_shared_ctx_t sctx = ctx->ifc_sctx; int i, j; for (i = 0; i < NTXQSETS(ctx); i++, txq++) { for (j = 0; j < sctx->isc_ntxqs; j++) iflib_dma_free(&txq->ift_ifdi[j]); iflib_txq_destroy(txq); } free(ctx->ifc_txqs, M_IFLIB); ctx->ifc_txqs = NULL; IFDI_QUEUES_FREE(ctx); } /********************************************************************* * * Initialize all receive rings. * **********************************************************************/ static int iflib_rx_structures_setup(if_ctx_t ctx) { iflib_rxq_t rxq = ctx->ifc_rxqs; int q; #if defined(INET6) || defined(INET) int err, i; #endif for (q = 0; q < ctx->ifc_softc_ctx.isc_nrxqsets; q++, rxq++) { #if defined(INET6) || defined(INET) if (if_getcapabilities(ctx->ifc_ifp) & IFCAP_LRO) { err = tcp_lro_init_args(&rxq->ifr_lc, ctx->ifc_ifp, TCP_LRO_ENTRIES, min(1024, ctx->ifc_softc_ctx.isc_nrxd[rxq->ifr_fl_offset])); if (err != 0) { device_printf(ctx->ifc_dev, "LRO Initialization failed!\n"); goto fail; } } #endif IFDI_RXQ_SETUP(ctx, rxq->ifr_id); } return (0); #if defined(INET6) || defined(INET) fail: /* * Free LRO resources allocated so far, we will only handle * the rings that completed, the failing case will have * cleaned up for itself. 'q' failed, so its the terminus. */ rxq = ctx->ifc_rxqs; for (i = 0; i < q; ++i, rxq++) { if (if_getcapabilities(ctx->ifc_ifp) & IFCAP_LRO) tcp_lro_free(&rxq->ifr_lc); } return (err); #endif } /********************************************************************* * * Free all receive rings. 
* **********************************************************************/ static void iflib_rx_structures_free(if_ctx_t ctx) { iflib_rxq_t rxq = ctx->ifc_rxqs; if_shared_ctx_t sctx = ctx->ifc_sctx; int i, j; for (i = 0; i < ctx->ifc_softc_ctx.isc_nrxqsets; i++, rxq++) { for (j = 0; j < sctx->isc_nrxqs; j++) iflib_dma_free(&rxq->ifr_ifdi[j]); iflib_rx_sds_free(rxq); #if defined(INET6) || defined(INET) if (if_getcapabilities(ctx->ifc_ifp) & IFCAP_LRO) tcp_lro_free(&rxq->ifr_lc); #endif } free(ctx->ifc_rxqs, M_IFLIB); ctx->ifc_rxqs = NULL; } static int iflib_qset_structures_setup(if_ctx_t ctx) { int err; /* * It is expected that the caller takes care of freeing queues if this * fails. */ if ((err = iflib_tx_structures_setup(ctx)) != 0) { device_printf(ctx->ifc_dev, "iflib_tx_structures_setup failed: %d\n", err); return (err); } if ((err = iflib_rx_structures_setup(ctx)) != 0) device_printf(ctx->ifc_dev, "iflib_rx_structures_setup failed: %d\n", err); return (err); } int iflib_irq_alloc(if_ctx_t ctx, if_irq_t irq, int rid, driver_filter_t filter, void *filter_arg, driver_intr_t handler, void *arg, const char *name) { return (_iflib_irq_alloc(ctx, irq, rid, filter, handler, arg, name)); } #ifdef SMP static int find_nth(if_ctx_t ctx, int qid) { cpuset_t cpus; int i, cpuid, eqid, count; CPU_COPY(&ctx->ifc_cpus, &cpus); count = CPU_COUNT(&cpus); eqid = qid % count; /* clear up to the qid'th bit */ for (i = 0; i < eqid; i++) { cpuid = CPU_FFS(&cpus); MPASS(cpuid != 0); CPU_CLR(cpuid-1, &cpus); } cpuid = CPU_FFS(&cpus); MPASS(cpuid != 0); return (cpuid-1); } #ifdef SCHED_ULE extern struct cpu_group *cpu_top; /* CPU topology */ static int find_child_with_core(int cpu, struct cpu_group *grp) { int i; if (grp->cg_children == 0) return -1; MPASS(grp->cg_child); for (i = 0; i < grp->cg_children; i++) { if (CPU_ISSET(cpu, &grp->cg_child[i].cg_mask)) return i; } return -1; } /* * Find the nth "close" core to the specified core * "close" is defined as the deepest level that shares * at least an L2 cache. With threads, this will be * threads on the same core. If the shared cache is L3 * or higher, simply returns the same core. */ static int find_close_core(int cpu, int core_offset) { struct cpu_group *grp; int i; int fcpu; cpuset_t cs; grp = cpu_top; if (grp == NULL) return cpu; i = 0; while ((i = find_child_with_core(cpu, grp)) != -1) { /* If the child only has one cpu, don't descend */ if (grp->cg_child[i].cg_count <= 1) break; grp = &grp->cg_child[i]; } /* If they don't share at least an L2 cache, use the same CPU */ if (grp->cg_level > CG_SHARE_L2 || grp->cg_level == CG_SHARE_NONE) return cpu; /* Now pick one */ CPU_COPY(&grp->cg_mask, &cs); /* Add the selected CPU offset to core offset. 
*/ for (i = 0; (fcpu = CPU_FFS(&cs)) != 0; i++) { if (fcpu - 1 == cpu) break; CPU_CLR(fcpu - 1, &cs); } MPASS(fcpu); core_offset += i; CPU_COPY(&grp->cg_mask, &cs); for (i = core_offset % grp->cg_count; i > 0; i--) { MPASS(CPU_FFS(&cs)); CPU_CLR(CPU_FFS(&cs) - 1, &cs); } MPASS(CPU_FFS(&cs)); return CPU_FFS(&cs) - 1; } #else static int find_close_core(int cpu, int core_offset __unused) { return cpu; } #endif static int get_core_offset(if_ctx_t ctx, iflib_intr_type_t type, int qid) { switch (type) { case IFLIB_INTR_TX: /* TX queues get cores which share at least an L2 cache with the corresponding RX queue */ /* XXX handle multiple RX threads per core and more than two core per L2 group */ return qid / CPU_COUNT(&ctx->ifc_cpus) + 1; case IFLIB_INTR_RX: case IFLIB_INTR_RXTX: /* RX queues get the specified core */ return qid / CPU_COUNT(&ctx->ifc_cpus); default: return -1; } } #else #define get_core_offset(ctx, type, qid) CPU_FIRST() #define find_close_core(cpuid, tid) CPU_FIRST() #define find_nth(ctx, gid) CPU_FIRST() #endif /* Just to avoid copy/paste */ static inline int iflib_irq_set_affinity(if_ctx_t ctx, if_irq_t irq, iflib_intr_type_t type, int qid, struct grouptask *gtask, struct taskqgroup *tqg, void *uniq, const char *name) { device_t dev; int co, cpuid, err, tid; dev = ctx->ifc_dev; co = ctx->ifc_sysctl_core_offset; if (ctx->ifc_sysctl_separate_txrx && type == IFLIB_INTR_TX) co += ctx->ifc_softc_ctx.isc_nrxqsets; cpuid = find_nth(ctx, qid + co); tid = get_core_offset(ctx, type, qid); if (tid < 0) { device_printf(dev, "get_core_offset failed\n"); return (EOPNOTSUPP); } cpuid = find_close_core(cpuid, tid); err = taskqgroup_attach_cpu(tqg, gtask, uniq, cpuid, dev, irq->ii_res, name); if (err) { device_printf(dev, "taskqgroup_attach_cpu failed %d\n", err); return (err); } #ifdef notyet if (cpuid > ctx->ifc_cpuid_highest) ctx->ifc_cpuid_highest = cpuid; #endif return (0); } int iflib_irq_alloc_generic(if_ctx_t ctx, if_irq_t irq, int rid, iflib_intr_type_t type, driver_filter_t *filter, void *filter_arg, int qid, const char *name) { device_t dev; struct grouptask *gtask; struct taskqgroup *tqg; iflib_filter_info_t info; gtask_fn_t *fn; int tqrid, err; driver_filter_t *intr_fast; void *q; info = &ctx->ifc_filter_info; tqrid = rid; switch (type) { /* XXX merge tx/rx for netmap? 
*/ case IFLIB_INTR_TX: q = &ctx->ifc_txqs[qid]; info = &ctx->ifc_txqs[qid].ift_filter_info; gtask = &ctx->ifc_txqs[qid].ift_task; tqg = qgroup_if_io_tqg; fn = _task_fn_tx; intr_fast = iflib_fast_intr; GROUPTASK_INIT(gtask, 0, fn, q); ctx->ifc_flags |= IFC_NETMAP_TX_IRQ; break; case IFLIB_INTR_RX: q = &ctx->ifc_rxqs[qid]; info = &ctx->ifc_rxqs[qid].ifr_filter_info; gtask = &ctx->ifc_rxqs[qid].ifr_task; tqg = qgroup_if_io_tqg; fn = _task_fn_rx; intr_fast = iflib_fast_intr; NET_GROUPTASK_INIT(gtask, 0, fn, q); break; case IFLIB_INTR_RXTX: q = &ctx->ifc_rxqs[qid]; info = &ctx->ifc_rxqs[qid].ifr_filter_info; gtask = &ctx->ifc_rxqs[qid].ifr_task; tqg = qgroup_if_io_tqg; fn = _task_fn_rx; intr_fast = iflib_fast_intr_rxtx; NET_GROUPTASK_INIT(gtask, 0, fn, q); break; case IFLIB_INTR_ADMIN: q = ctx; tqrid = -1; info = &ctx->ifc_filter_info; gtask = &ctx->ifc_admin_task; tqg = qgroup_if_config_tqg; fn = _task_fn_admin; intr_fast = iflib_fast_intr_ctx; break; default: device_printf(ctx->ifc_dev, "%s: unknown net intr type\n", __func__); return (EINVAL); } info->ifi_filter = filter; info->ifi_filter_arg = filter_arg; info->ifi_task = gtask; info->ifi_ctx = q; dev = ctx->ifc_dev; err = _iflib_irq_alloc(ctx, irq, rid, intr_fast, NULL, info, name); if (err != 0) { device_printf(dev, "_iflib_irq_alloc failed %d\n", err); return (err); } if (type == IFLIB_INTR_ADMIN) return (0); if (tqrid != -1) { err = iflib_irq_set_affinity(ctx, irq, type, qid, gtask, tqg, q, name); if (err) return (err); } else { taskqgroup_attach(tqg, gtask, q, dev, irq->ii_res, name); } return (0); } void iflib_softirq_alloc_generic(if_ctx_t ctx, if_irq_t irq, iflib_intr_type_t type, void *arg, int qid, const char *name) { struct grouptask *gtask; struct taskqgroup *tqg; gtask_fn_t *fn; void *q; int err; switch (type) { case IFLIB_INTR_TX: q = &ctx->ifc_txqs[qid]; gtask = &ctx->ifc_txqs[qid].ift_task; tqg = qgroup_if_io_tqg; fn = _task_fn_tx; GROUPTASK_INIT(gtask, 0, fn, q); break; case IFLIB_INTR_RX: q = &ctx->ifc_rxqs[qid]; gtask = &ctx->ifc_rxqs[qid].ifr_task; tqg = qgroup_if_io_tqg; fn = _task_fn_rx; NET_GROUPTASK_INIT(gtask, 0, fn, q); break; case IFLIB_INTR_IOV: q = ctx; gtask = &ctx->ifc_vflr_task; tqg = qgroup_if_config_tqg; fn = _task_fn_iov; GROUPTASK_INIT(gtask, 0, fn, q); break; default: panic("unknown net intr type"); } if (irq != NULL) { err = iflib_irq_set_affinity(ctx, irq, type, qid, gtask, tqg, q, name); if (err) taskqgroup_attach(tqg, gtask, q, ctx->ifc_dev, irq->ii_res, name); } else { taskqgroup_attach(tqg, gtask, q, NULL, NULL, name); } } void iflib_irq_free(if_ctx_t ctx, if_irq_t irq) { if (irq->ii_tag) bus_teardown_intr(ctx->ifc_dev, irq->ii_res, irq->ii_tag); if (irq->ii_res) bus_release_resource(ctx->ifc_dev, SYS_RES_IRQ, rman_get_rid(irq->ii_res), irq->ii_res); } static int iflib_legacy_setup(if_ctx_t ctx, driver_filter_t filter, void *filter_arg, int *rid, const char *name) { iflib_txq_t txq = ctx->ifc_txqs; iflib_rxq_t rxq = ctx->ifc_rxqs; if_irq_t irq = &ctx->ifc_legacy_irq; iflib_filter_info_t info; device_t dev; struct grouptask *gtask; struct resource *res; struct taskqgroup *tqg; void *q; int err, tqrid; bool rx_only; q = &ctx->ifc_rxqs[0]; info = &rxq[0].ifr_filter_info; gtask = &rxq[0].ifr_task; tqg = qgroup_if_io_tqg; tqrid = *rid; rx_only = (ctx->ifc_sctx->isc_flags & IFLIB_SINGLE_IRQ_RX_ONLY) != 0; ctx->ifc_flags |= IFC_LEGACY; info->ifi_filter = filter; info->ifi_filter_arg = filter_arg; info->ifi_task = gtask; info->ifi_ctx = rx_only ? 
ctx : q; dev = ctx->ifc_dev; /* We allocate a single interrupt resource */ err = _iflib_irq_alloc(ctx, irq, tqrid, rx_only ? iflib_fast_intr_ctx : iflib_fast_intr_rxtx, NULL, info, name); if (err != 0) return (err); NET_GROUPTASK_INIT(gtask, 0, _task_fn_rx, q); res = irq->ii_res; taskqgroup_attach(tqg, gtask, q, dev, res, name); GROUPTASK_INIT(&txq->ift_task, 0, _task_fn_tx, txq); taskqgroup_attach(qgroup_if_io_tqg, &txq->ift_task, txq, dev, res, "tx"); return (0); } void iflib_led_create(if_ctx_t ctx) { ctx->ifc_led_dev = led_create(iflib_led_func, ctx, device_get_nameunit(ctx->ifc_dev)); } void iflib_tx_intr_deferred(if_ctx_t ctx, int txqid) { GROUPTASK_ENQUEUE(&ctx->ifc_txqs[txqid].ift_task); } void iflib_rx_intr_deferred(if_ctx_t ctx, int rxqid) { GROUPTASK_ENQUEUE(&ctx->ifc_rxqs[rxqid].ifr_task); } void iflib_admin_intr_deferred(if_ctx_t ctx) { #ifdef INVARIANTS struct grouptask *gtask; gtask = &ctx->ifc_admin_task; MPASS(gtask != NULL && gtask->gt_taskqueue != NULL); #endif GROUPTASK_ENQUEUE(&ctx->ifc_admin_task); } void iflib_iov_intr_deferred(if_ctx_t ctx) { GROUPTASK_ENQUEUE(&ctx->ifc_vflr_task); } void iflib_io_tqg_attach(struct grouptask *gt, void *uniq, int cpu, const char *name) { taskqgroup_attach_cpu(qgroup_if_io_tqg, gt, uniq, cpu, NULL, NULL, name); } void iflib_config_gtask_init(void *ctx, struct grouptask *gtask, gtask_fn_t *fn, const char *name) { GROUPTASK_INIT(gtask, 0, fn, ctx); taskqgroup_attach(qgroup_if_config_tqg, gtask, gtask, NULL, NULL, name); } void iflib_config_gtask_deinit(struct grouptask *gtask) { taskqgroup_detach(qgroup_if_config_tqg, gtask); } void iflib_link_state_change(if_ctx_t ctx, int link_state, uint64_t baudrate) { if_t ifp = ctx->ifc_ifp; iflib_txq_t txq = ctx->ifc_txqs; if_setbaudrate(ifp, baudrate); if (baudrate >= IF_Gbps(10)) { STATE_LOCK(ctx); ctx->ifc_flags |= IFC_PREFETCH; STATE_UNLOCK(ctx); } /* If link down, disable watchdog */ if ((ctx->ifc_link_state == LINK_STATE_UP) && (link_state == LINK_STATE_DOWN)) { for (int i = 0; i < ctx->ifc_softc_ctx.isc_ntxqsets; i++, txq++) txq->ift_qstatus = IFLIB_QUEUE_IDLE; } ctx->ifc_link_state = link_state; if_link_state_change(ifp, link_state); } static int iflib_tx_credits_update(if_ctx_t ctx, iflib_txq_t txq) { int credits; #ifdef INVARIANTS int credits_pre = txq->ift_cidx_processed; #endif bus_dmamap_sync(txq->ift_ifdi->idi_tag, txq->ift_ifdi->idi_map, BUS_DMASYNC_POSTREAD); if ((credits = ctx->isc_txd_credits_update(ctx->ifc_softc, txq->ift_id, true)) == 0) return (0); txq->ift_processed += credits; txq->ift_cidx_processed += credits; MPASS(credits_pre + credits == txq->ift_cidx_processed); if (txq->ift_cidx_processed >= txq->ift_size) txq->ift_cidx_processed -= txq->ift_size; return (credits); } static int iflib_rxd_avail(if_ctx_t ctx, iflib_rxq_t rxq, qidx_t cidx, qidx_t budget) { iflib_fl_t fl; u_int i; for (i = 0, fl = &rxq->ifr_fl[0]; i < rxq->ifr_nfl; i++, fl++) bus_dmamap_sync(fl->ifl_ifdi->idi_tag, fl->ifl_ifdi->idi_map, BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); return (ctx->isc_rxd_available(ctx->ifc_softc, rxq->ifr_id, cidx, budget)); } void iflib_add_int_delay_sysctl(if_ctx_t ctx, const char *name, const char *description, if_int_delay_info_t info, int offset, int value) { info->iidi_ctx = ctx; info->iidi_offset = offset; info->iidi_value = value; SYSCTL_ADD_PROC(device_get_sysctl_ctx(ctx->ifc_dev), SYSCTL_CHILDREN(device_get_sysctl_tree(ctx->ifc_dev)), OID_AUTO, name, CTLTYPE_INT|CTLFLAG_RW, info, 0, iflib_sysctl_int_delay, "I", description); } struct sx * 
iflib_ctx_lock_get(if_ctx_t ctx) { return (&ctx->ifc_ctx_sx); } static int iflib_msix_init(if_ctx_t ctx) { device_t dev = ctx->ifc_dev; if_shared_ctx_t sctx = ctx->ifc_sctx; if_softc_ctx_t scctx = &ctx->ifc_softc_ctx; int admincnt, bar, err, iflib_num_rx_queues, iflib_num_tx_queues; int msgs, queuemsgs, queues, rx_queues, tx_queues, vectors; iflib_num_tx_queues = ctx->ifc_sysctl_ntxqs; iflib_num_rx_queues = ctx->ifc_sysctl_nrxqs; if (bootverbose) device_printf(dev, "msix_init qsets capped at %d\n", imax(scctx->isc_ntxqsets, scctx->isc_nrxqsets)); /* Override by tuneable */ if (scctx->isc_disable_msix) goto msi; /* First try MSI-X */ if ((msgs = pci_msix_count(dev)) == 0) { if (bootverbose) device_printf(dev, "MSI-X not supported or disabled\n"); goto msi; } bar = ctx->ifc_softc_ctx.isc_msix_bar; /* * bar == -1 => "trust me I know what I'm doing" * Some drivers are for hardware that is so shoddily * documented that no one knows which bars are which * so the developer has to map all bars. This hack * allows shoddy garbage to use MSI-X in this framework. */ if (bar != -1) { ctx->ifc_msix_mem = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &bar, RF_ACTIVE); if (ctx->ifc_msix_mem == NULL) { device_printf(dev, "Unable to map MSI-X table\n"); goto msi; } } admincnt = sctx->isc_admin_intrcnt; #if IFLIB_DEBUG /* use only 1 qset in debug mode */ queuemsgs = min(msgs - admincnt, 1); #else queuemsgs = msgs - admincnt; #endif #ifdef RSS queues = imin(queuemsgs, rss_getnumbuckets()); #else queues = queuemsgs; #endif queues = imin(CPU_COUNT(&ctx->ifc_cpus), queues); if (bootverbose) device_printf(dev, "intr CPUs: %d queue msgs: %d admincnt: %d\n", CPU_COUNT(&ctx->ifc_cpus), queuemsgs, admincnt); #ifdef RSS /* If we're doing RSS, clamp at the number of RSS buckets */ if (queues > rss_getnumbuckets()) queues = rss_getnumbuckets(); #endif if (iflib_num_rx_queues > 0 && iflib_num_rx_queues < queuemsgs - admincnt) rx_queues = iflib_num_rx_queues; else rx_queues = queues; if (rx_queues > scctx->isc_nrxqsets) rx_queues = scctx->isc_nrxqsets; /* * We want this to be all logical CPUs by default */ if (iflib_num_tx_queues > 0 && iflib_num_tx_queues < queues) tx_queues = iflib_num_tx_queues; else tx_queues = mp_ncpus; if (tx_queues > scctx->isc_ntxqsets) tx_queues = scctx->isc_ntxqsets; if (ctx->ifc_sysctl_qs_eq_override == 0) { #ifdef INVARIANTS if (tx_queues != rx_queues) device_printf(dev, "queue equality override not set, capping rx_queues at %d and tx_queues at %d\n", min(rx_queues, tx_queues), min(rx_queues, tx_queues)); #endif tx_queues = min(rx_queues, tx_queues); rx_queues = min(rx_queues, tx_queues); } vectors = rx_queues + admincnt; if (msgs < vectors) { device_printf(dev, "insufficient number of MSI-X vectors " "(supported %d, need %d)\n", msgs, vectors); goto msi; } device_printf(dev, "Using %d RX queues %d TX queues\n", rx_queues, tx_queues); msgs = vectors; if ((err = pci_alloc_msix(dev, &vectors)) == 0) { if (vectors != msgs) { device_printf(dev, "Unable to allocate sufficient MSI-X vectors " "(got %d, need %d)\n", vectors, msgs); pci_release_msi(dev); if (bar != -1) { bus_release_resource(dev, SYS_RES_MEMORY, bar, ctx->ifc_msix_mem); ctx->ifc_msix_mem = NULL; } goto msi; } device_printf(dev, "Using MSI-X interrupts with %d vectors\n", vectors); scctx->isc_vectors = vectors; scctx->isc_nrxqsets = rx_queues; scctx->isc_ntxqsets = tx_queues; scctx->isc_intr = IFLIB_INTR_MSIX; return (vectors); } else { device_printf(dev, "failed to allocate %d MSI-X vectors, err: %d\n", vectors, err); if (bar != -1) { 
bus_release_resource(dev, SYS_RES_MEMORY, bar, ctx->ifc_msix_mem); ctx->ifc_msix_mem = NULL; } } msi: vectors = pci_msi_count(dev); scctx->isc_nrxqsets = 1; scctx->isc_ntxqsets = 1; scctx->isc_vectors = vectors; if (vectors == 1 && pci_alloc_msi(dev, &vectors) == 0) { device_printf(dev,"Using an MSI interrupt\n"); scctx->isc_intr = IFLIB_INTR_MSI; } else { scctx->isc_vectors = 1; device_printf(dev,"Using a Legacy interrupt\n"); scctx->isc_intr = IFLIB_INTR_LEGACY; } return (vectors); } static const char *ring_states[] = { "IDLE", "BUSY", "STALLED", "ABDICATED" }; static int mp_ring_state_handler(SYSCTL_HANDLER_ARGS) { int rc; uint16_t *state = ((uint16_t *)oidp->oid_arg1); struct sbuf *sb; const char *ring_state = "UNKNOWN"; /* XXX needed ? */ rc = sysctl_wire_old_buffer(req, 0); MPASS(rc == 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 80, req); MPASS(sb != NULL); if (sb == NULL) return (ENOMEM); if (state[3] <= 3) ring_state = ring_states[state[3]]; sbuf_printf(sb, "pidx_head: %04hd pidx_tail: %04hd cidx: %04hd state: %s", state[0], state[1], state[2], ring_state); rc = sbuf_finish(sb); sbuf_delete(sb); return(rc); } enum iflib_ndesc_handler { IFLIB_NTXD_HANDLER, IFLIB_NRXD_HANDLER, }; static int mp_ndesc_handler(SYSCTL_HANDLER_ARGS) { if_ctx_t ctx = (void *)arg1; enum iflib_ndesc_handler type = arg2; char buf[256] = {0}; qidx_t *ndesc; char *p, *next; int nqs, rc, i; nqs = 8; switch(type) { case IFLIB_NTXD_HANDLER: ndesc = ctx->ifc_sysctl_ntxds; if (ctx->ifc_sctx) nqs = ctx->ifc_sctx->isc_ntxqs; break; case IFLIB_NRXD_HANDLER: ndesc = ctx->ifc_sysctl_nrxds; if (ctx->ifc_sctx) nqs = ctx->ifc_sctx->isc_nrxqs; break; default: printf("%s: unhandled type\n", __func__); return (EINVAL); } if (nqs == 0) nqs = 8; for (i=0; i<8; i++) { if (i >= nqs) break; if (i) strcat(buf, ","); sprintf(strchr(buf, 0), "%d", ndesc[i]); } rc = sysctl_handle_string(oidp, buf, sizeof(buf), req); if (rc || req->newptr == NULL) return rc; for (i = 0, next = buf, p = strsep(&next, " ,"); i < 8 && p; i++, p = strsep(&next, " ,")) { ndesc[i] = strtoul(p, NULL, 10); } return(rc); } #define NAME_BUFLEN 32 static void iflib_add_device_sysctl_pre(if_ctx_t ctx) { device_t dev = iflib_get_dev(ctx); struct sysctl_oid_list *child, *oid_list; struct sysctl_ctx_list *ctx_list; struct sysctl_oid *node; ctx_list = device_get_sysctl_ctx(dev); child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); ctx->ifc_sysctl_node = node = SYSCTL_ADD_NODE(ctx_list, child, OID_AUTO, "iflib", CTLFLAG_RD, NULL, "IFLIB fields"); oid_list = SYSCTL_CHILDREN(node); SYSCTL_ADD_CONST_STRING(ctx_list, oid_list, OID_AUTO, "driver_version", CTLFLAG_RD, ctx->ifc_sctx->isc_driver_version, "driver version"); SYSCTL_ADD_U16(ctx_list, oid_list, OID_AUTO, "override_ntxqs", CTLFLAG_RWTUN, &ctx->ifc_sysctl_ntxqs, 0, "# of txqs to use, 0 => use default #"); SYSCTL_ADD_U16(ctx_list, oid_list, OID_AUTO, "override_nrxqs", CTLFLAG_RWTUN, &ctx->ifc_sysctl_nrxqs, 0, "# of rxqs to use, 0 => use default #"); SYSCTL_ADD_U16(ctx_list, oid_list, OID_AUTO, "override_qs_enable", CTLFLAG_RWTUN, &ctx->ifc_sysctl_qs_eq_override, 0, "permit #txq != #rxq"); SYSCTL_ADD_INT(ctx_list, oid_list, OID_AUTO, "disable_msix", CTLFLAG_RWTUN, &ctx->ifc_softc_ctx.isc_disable_msix, 0, "disable MSI-X (default 0)"); SYSCTL_ADD_U16(ctx_list, oid_list, OID_AUTO, "rx_budget", CTLFLAG_RWTUN, &ctx->ifc_sysctl_rx_budget, 0, "set the RX budget"); SYSCTL_ADD_U16(ctx_list, oid_list, OID_AUTO, "tx_abdicate", CTLFLAG_RWTUN, &ctx->ifc_sysctl_tx_abdicate, 0, "cause TX to abdicate instead of 
running to completion"); ctx->ifc_sysctl_core_offset = CORE_OFFSET_UNSPECIFIED; SYSCTL_ADD_U16(ctx_list, oid_list, OID_AUTO, "core_offset", CTLFLAG_RDTUN, &ctx->ifc_sysctl_core_offset, 0, "offset to start using cores at"); SYSCTL_ADD_U8(ctx_list, oid_list, OID_AUTO, "separate_txrx", CTLFLAG_RDTUN, &ctx->ifc_sysctl_separate_txrx, 0, "use separate cores for TX and RX"); /* XXX change for per-queue sizes */ SYSCTL_ADD_PROC(ctx_list, oid_list, OID_AUTO, "override_ntxds", CTLTYPE_STRING|CTLFLAG_RWTUN, ctx, IFLIB_NTXD_HANDLER, mp_ndesc_handler, "A", "list of # of TX descriptors to use, 0 = use default #"); SYSCTL_ADD_PROC(ctx_list, oid_list, OID_AUTO, "override_nrxds", CTLTYPE_STRING|CTLFLAG_RWTUN, ctx, IFLIB_NRXD_HANDLER, mp_ndesc_handler, "A", "list of # of RX descriptors to use, 0 = use default #"); } static void iflib_add_device_sysctl_post(if_ctx_t ctx) { if_shared_ctx_t sctx = ctx->ifc_sctx; if_softc_ctx_t scctx = &ctx->ifc_softc_ctx; device_t dev = iflib_get_dev(ctx); struct sysctl_oid_list *child; struct sysctl_ctx_list *ctx_list; iflib_fl_t fl; iflib_txq_t txq; iflib_rxq_t rxq; int i, j; char namebuf[NAME_BUFLEN]; char *qfmt; struct sysctl_oid *queue_node, *fl_node, *node; struct sysctl_oid_list *queue_list, *fl_list; ctx_list = device_get_sysctl_ctx(dev); node = ctx->ifc_sysctl_node; child = SYSCTL_CHILDREN(node); if (scctx->isc_ntxqsets > 100) qfmt = "txq%03d"; else if (scctx->isc_ntxqsets > 10) qfmt = "txq%02d"; else qfmt = "txq%d"; for (i = 0, txq = ctx->ifc_txqs; i < scctx->isc_ntxqsets; i++, txq++) { snprintf(namebuf, NAME_BUFLEN, qfmt, i); queue_node = SYSCTL_ADD_NODE(ctx_list, child, OID_AUTO, namebuf, CTLFLAG_RD, NULL, "Queue Name"); queue_list = SYSCTL_CHILDREN(queue_node); #if MEMORY_LOGGING SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "txq_dequeued", CTLFLAG_RD, &txq->ift_dequeued, "total mbufs freed"); SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "txq_enqueued", CTLFLAG_RD, &txq->ift_enqueued, "total mbufs enqueued"); #endif SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "mbuf_defrag", CTLFLAG_RD, &txq->ift_mbuf_defrag, "# of times m_defrag was called"); SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "m_pullups", CTLFLAG_RD, &txq->ift_pullups, "# of times m_pullup was called"); SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "mbuf_defrag_failed", CTLFLAG_RD, &txq->ift_mbuf_defrag_failed, "# of times m_defrag failed"); SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "no_desc_avail", CTLFLAG_RD, &txq->ift_no_desc_avail, "# of times no descriptors were available"); SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "tx_map_failed", CTLFLAG_RD, &txq->ift_map_failed, "# of times DMA map failed"); SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "txd_encap_efbig", CTLFLAG_RD, &txq->ift_txd_encap_efbig, "# of times txd_encap returned EFBIG"); SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "no_tx_dma_setup", CTLFLAG_RD, &txq->ift_no_tx_dma_setup, "# of times map failed for other than EFBIG"); SYSCTL_ADD_U16(ctx_list, queue_list, OID_AUTO, "txq_pidx", CTLFLAG_RD, &txq->ift_pidx, 1, "Producer Index"); SYSCTL_ADD_U16(ctx_list, queue_list, OID_AUTO, "txq_cidx", CTLFLAG_RD, &txq->ift_cidx, 1, "Consumer Index"); SYSCTL_ADD_U16(ctx_list, queue_list, OID_AUTO, "txq_cidx_processed", CTLFLAG_RD, &txq->ift_cidx_processed, 1, "Consumer Index seen by credit update"); SYSCTL_ADD_U16(ctx_list, queue_list, OID_AUTO, "txq_in_use", CTLFLAG_RD, &txq->ift_in_use, 1, "descriptors in use"); SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "txq_processed", CTLFLAG_RD, &txq->ift_processed, 
"descriptors procesed for clean"); SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "txq_cleaned", CTLFLAG_RD, &txq->ift_cleaned, "total cleaned"); SYSCTL_ADD_PROC(ctx_list, queue_list, OID_AUTO, "ring_state", CTLTYPE_STRING | CTLFLAG_RD, __DEVOLATILE(uint64_t *, &txq->ift_br->state), 0, mp_ring_state_handler, "A", "soft ring state"); SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO, "r_enqueues", CTLFLAG_RD, &txq->ift_br->enqueues, "# of enqueues to the mp_ring for this queue"); SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO, "r_drops", CTLFLAG_RD, &txq->ift_br->drops, "# of drops in the mp_ring for this queue"); SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO, "r_starts", CTLFLAG_RD, &txq->ift_br->starts, "# of normal consumer starts in the mp_ring for this queue"); SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO, "r_stalls", CTLFLAG_RD, &txq->ift_br->stalls, "# of consumer stalls in the mp_ring for this queue"); SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO, "r_restarts", CTLFLAG_RD, &txq->ift_br->restarts, "# of consumer restarts in the mp_ring for this queue"); SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO, "r_abdications", CTLFLAG_RD, &txq->ift_br->abdications, "# of consumer abdications in the mp_ring for this queue"); } if (scctx->isc_nrxqsets > 100) qfmt = "rxq%03d"; else if (scctx->isc_nrxqsets > 10) qfmt = "rxq%02d"; else qfmt = "rxq%d"; for (i = 0, rxq = ctx->ifc_rxqs; i < scctx->isc_nrxqsets; i++, rxq++) { snprintf(namebuf, NAME_BUFLEN, qfmt, i); queue_node = SYSCTL_ADD_NODE(ctx_list, child, OID_AUTO, namebuf, CTLFLAG_RD, NULL, "Queue Name"); queue_list = SYSCTL_CHILDREN(queue_node); if (sctx->isc_flags & IFLIB_HAS_RXCQ) { SYSCTL_ADD_U16(ctx_list, queue_list, OID_AUTO, "rxq_cq_cidx", CTLFLAG_RD, &rxq->ifr_cq_cidx, 1, "Consumer Index"); } for (j = 0, fl = rxq->ifr_fl; j < rxq->ifr_nfl; j++, fl++) { snprintf(namebuf, NAME_BUFLEN, "rxq_fl%d", j); fl_node = SYSCTL_ADD_NODE(ctx_list, queue_list, OID_AUTO, namebuf, CTLFLAG_RD, NULL, "freelist Name"); fl_list = SYSCTL_CHILDREN(fl_node); SYSCTL_ADD_U16(ctx_list, fl_list, OID_AUTO, "pidx", CTLFLAG_RD, &fl->ifl_pidx, 1, "Producer Index"); SYSCTL_ADD_U16(ctx_list, fl_list, OID_AUTO, "cidx", CTLFLAG_RD, &fl->ifl_cidx, 1, "Consumer Index"); SYSCTL_ADD_U16(ctx_list, fl_list, OID_AUTO, "credits", CTLFLAG_RD, &fl->ifl_credits, 1, "credits available"); #if MEMORY_LOGGING SYSCTL_ADD_QUAD(ctx_list, fl_list, OID_AUTO, "fl_m_enqueued", CTLFLAG_RD, &fl->ifl_m_enqueued, "mbufs allocated"); SYSCTL_ADD_QUAD(ctx_list, fl_list, OID_AUTO, "fl_m_dequeued", CTLFLAG_RD, &fl->ifl_m_dequeued, "mbufs freed"); SYSCTL_ADD_QUAD(ctx_list, fl_list, OID_AUTO, "fl_cl_enqueued", CTLFLAG_RD, &fl->ifl_cl_enqueued, "clusters allocated"); SYSCTL_ADD_QUAD(ctx_list, fl_list, OID_AUTO, "fl_cl_dequeued", CTLFLAG_RD, &fl->ifl_cl_dequeued, "clusters freed"); #endif } } } void iflib_request_reset(if_ctx_t ctx) { STATE_LOCK(ctx); ctx->ifc_flags |= IFC_DO_RESET; STATE_UNLOCK(ctx); } #ifndef __NO_STRICT_ALIGNMENT static struct mbuf * iflib_fixup_rx(struct mbuf *m) { struct mbuf *n; if (m->m_len <= (MCLBYTES - ETHER_HDR_LEN)) { bcopy(m->m_data, m->m_data + ETHER_HDR_LEN, m->m_len); m->m_data += ETHER_HDR_LEN; n = m; } else { MGETHDR(n, M_NOWAIT, MT_DATA); if (n == NULL) { m_freem(m); return (NULL); } bcopy(m->m_data, n->m_data, ETHER_HDR_LEN); m->m_data += ETHER_HDR_LEN; m->m_len -= ETHER_HDR_LEN; n->m_len = ETHER_HDR_LEN; M_MOVE_PKTHDR(n, m); n->m_next = m; } return (n); } #endif #ifdef DEBUGNET static void iflib_debugnet_init(if_t ifp, int 
    *nrxr, int *ncl, int *clsize)
{
        if_ctx_t ctx;

        ctx = if_getsoftc(ifp);
        CTX_LOCK(ctx);
        *nrxr = NRXQSETS(ctx);
        *ncl = ctx->ifc_rxqs[0].ifr_fl->ifl_size;
        *clsize = ctx->ifc_rxqs[0].ifr_fl->ifl_buf_size;
        CTX_UNLOCK(ctx);
}

static void
iflib_debugnet_event(if_t ifp, enum debugnet_ev event)
{
        if_ctx_t ctx;
        if_softc_ctx_t scctx;
        iflib_fl_t fl;
        iflib_rxq_t rxq;
        int i, j;

        ctx = if_getsoftc(ifp);
        scctx = &ctx->ifc_softc_ctx;

        switch (event) {
        case DEBUGNET_START:
                for (i = 0; i < scctx->isc_nrxqsets; i++) {
                        rxq = &ctx->ifc_rxqs[i];
                        for (j = 0; j < rxq->ifr_nfl; j++) {
                                fl = rxq->ifr_fl;
                                fl->ifl_zone = m_getzone(fl->ifl_buf_size);
                        }
                }
                iflib_no_tx_batch = 1;
                break;
        default:
                break;
        }
}

static int
iflib_debugnet_transmit(if_t ifp, struct mbuf *m)
{
        if_ctx_t ctx;
        iflib_txq_t txq;
        int error;

        ctx = if_getsoftc(ifp);
        if ((if_getdrvflags(ifp) & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
            IFF_DRV_RUNNING)
                return (EBUSY);

        txq = &ctx->ifc_txqs[0];
        error = iflib_encap(txq, &m);
        if (error == 0)
                (void)iflib_txd_db_check(ctx, txq, true, txq->ift_in_use);
        return (error);
}

static int
iflib_debugnet_poll(if_t ifp, int count)
{
        struct epoch_tracker et;
        if_ctx_t ctx;
        if_softc_ctx_t scctx;
        iflib_txq_t txq;
        int i;

        ctx = if_getsoftc(ifp);
        scctx = &ctx->ifc_softc_ctx;

        if ((if_getdrvflags(ifp) & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
            IFF_DRV_RUNNING)
                return (EBUSY);

        txq = &ctx->ifc_txqs[0];
        (void)iflib_completed_tx_reclaim(txq, RECLAIM_THRESH(ctx));

        NET_EPOCH_ENTER(et);
        for (i = 0; i < scctx->isc_nrxqsets; i++)
                (void)iflib_rxeof(&ctx->ifc_rxqs[i], 16 /* XXX */);
        NET_EPOCH_EXIT(et);
        return (0);
}
#endif /* DEBUGNET */
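/*
 * Illustrative sketch only: the "did the driver implement the optional
 * per-queue interrupt-enable methods?" check that iflib_device_register()
 * performs above with kobj_lookup_method() on ifdi_rx_queue_intr_enable and
 * ifdi_tx_queue_intr_enable.  The kobj machinery is replaced here by a plain
 * struct of function pointers; every name below is a hypothetical stand-in,
 * not a kernel interface.
 */
#include <stdio.h>

struct model_driver_ops {
        int     (*rx_queue_intr_enable)(int qid);       /* optional method */
        int     (*tx_queue_intr_enable)(int qid);       /* optional method */
};

static int
model_msix_allowed(const struct model_driver_ops *ops)
{
        /*
         * MSI-X needs per-queue interrupt enable hooks; if the driver left
         * either one unimplemented, refuse MSI-X (the kernel code returns
         * EOPNOTSUPP and falls back through the error path).
         */
        return (ops->rx_queue_intr_enable != NULL &&
            ops->tx_queue_intr_enable != NULL);
}

static int
model_rx_enable(int qid)
{
        return (0);
}

int
main(void)
{
        struct model_driver_ops partial = { model_rx_enable, NULL };
        struct model_driver_ops full = { model_rx_enable, model_rx_enable };

        printf("partial driver: MSI-X %s\n",
            model_msix_allowed(&partial) ? "allowed" : "refused");
        printf("full driver:    MSI-X %s\n",
            model_msix_allowed(&full) ? "allowed" : "refused");
        return (0);
}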
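/*
 * Illustrative sketch only: the TSO/descriptor budgeting applied around
 * if_sethwtsomaxsegcount() above, restated as plain arithmetic.  tso_budget()
 * and its parameters are hypothetical; the "- 3" mirrors the in-code comment
 * about reserving segments for mbufs that iflib_parse_header() may prepend
 * via m_pullup(9), and 12 is only an assumed value for the single-packet
 * fraction.
 */
#include <stdio.h>

#define MODEL_IP_MAXPACKET      65535   /* stand-in for IP_MAXPACKET */

static void
tso_budget(int ntxd, int frac, int tso_segs_max, int tso_size_max)
{
        int hw_segcount, hw_tsomax, nsegments;

        /* Never let one packet consume more than 1/frac of the TX ring. */
        nsegments = tso_segs_max;
        if (nsegments > ntxd / frac)
                nsegments = (ntxd / frac > 1) ? ntxd / frac : 1;

        /* Leave headroom for up to three m_pullup()-added mbufs. */
        hw_segcount = nsegments - 3;

        /* The stack cannot exceed IP_MAXPACKET even if the MAC could. */
        hw_tsomax = (tso_size_max < MODEL_IP_MAXPACKET) ?
            tso_size_max : MODEL_IP_MAXPACKET;

        printf("tso segs=%d hw segcount=%d hw tsomax=%d\n",
            nsegments, hw_segcount, hw_tsomax);
}

int
main(void)
{
        /* 1024 TX descriptors, 1/12 ring cap, 64 advertised TSO segments. */
        tso_budget(1024, 12, 64, 262144);
        return (0);
}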
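/*
 * Illustrative sketch only: the "pick the qid'th usable CPU" walk done by
 * find_nth() above, modelled on a plain 64-bit mask instead of a cpuset_t.
 * find_nth_bit() is a hypothetical stand-in for the CPU_FFS()/CPU_CLR()
 * loop; queue ids wrap around the number of CPUs in the set, just as in
 * the kernel routine.
 */
#include <stdint.h>
#include <stdio.h>

static int
find_nth_bit(uint64_t mask, int qid)
{
        int count, eqid, i;

        count = __builtin_popcountll(mask);
        if (count == 0)
                return (-1);
        eqid = qid % count;             /* wrap queue ids around the set */

        /* Drop the first eqid set bits, then report the next one. */
        for (i = 0; i < eqid; i++)
                mask &= mask - 1;       /* clear the lowest set bit */
        return (__builtin_ffsll((long long)mask) - 1);
}

int
main(void)
{
        uint64_t cpus = 0xAA;           /* CPUs 1, 3, 5, 7 are usable */

        /* Queue 0 -> CPU 1, queue 1 -> CPU 3, queue 5 wraps back to CPU 3. */
        printf("%d %d %d\n", find_nth_bit(cpus, 0), find_nth_bit(cpus, 1),
            find_nth_bit(cpus, 5));
        return (0);
}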
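/*
 * Illustrative sketch only: a userland model of the vector-budgeting
 * arithmetic in iflib_msix_init() above.  msix_budget() and its parameters
 * are hypothetical; the real routine additionally honors RSS bucket counts,
 * sysctl overrides and the MSI-X table BAR, and falls back to MSI or a
 * legacy interrupt when the budget cannot be met.
 */
#include <stdio.h>

static int
imin2(int a, int b)
{
        return (a < b ? a : b);
}

/*
 * Model: 'msgs' MSI-X messages advertised by the device, 'admincnt' vectors
 * reserved for admin/link interrupts, 'ncpus' usable CPUs and a per-driver
 * cap on RX queue sets.  Returns the number of vectors that would be
 * requested, or 0 to indicate a fallback to MSI/legacy.
 */
static int
msix_budget(int msgs, int admincnt, int ncpus, int max_rxqsets)
{
        int queuemsgs, rx_queues, vectors;

        if (msgs <= admincnt)
                return (0);
        queuemsgs = msgs - admincnt;    /* messages left for queue vectors */
        rx_queues = imin2(ncpus, queuemsgs);
        rx_queues = imin2(rx_queues, max_rxqsets);
        vectors = rx_queues + admincnt; /* one per RX queue, plus admin */
        return (vectors <= msgs ? vectors : 0);
}

int
main(void)
{
        /* 16 messages, 1 admin vector, 8 CPUs, driver caps RX at 4 qsets. */
        printf("vectors requested: %d\n", msix_budget(16, 1, 8, 4));
        return (0);
}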
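/*
 * Illustrative sketch only: the strsep()/strtoul() parsing used by
 * mp_ndesc_handler() above for the override_ntxds/override_nrxds sysctl
 * strings.  parse_ndescs() is a hypothetical userland stand-in; the kernel
 * handler writes the values into ctx->ifc_sysctl_ntxds or
 * ctx->ifc_sysctl_nrxds instead of a caller-supplied array.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MODEL_MAX_NQS   8       /* iflib tracks at most 8 queues per qset */

static int
parse_ndescs(const char *input, unsigned long ndesc[MODEL_MAX_NQS])
{
        char buf[256], *next, *p;
        int i;

        snprintf(buf, sizeof(buf), "%s", input);
        next = buf;
        for (i = 0; i < MODEL_MAX_NQS &&
            (p = strsep(&next, " ,")) != NULL; i++)
                ndesc[i] = strtoul(p, NULL, 10);
        return (i);     /* number of per-queue values parsed */
}

int
main(void)
{
        unsigned long ndesc[MODEL_MAX_NQS] = { 0 };
        int i, n;

        /* e.g. a tunable such as dev.<driver>.<unit>.iflib.override_ntxds */
        n = parse_ndescs("1024,1024", ndesc);
        for (i = 0; i < n; i++)
                printf("queue %d: %lu descriptors\n", i, ndesc[i]);
        return (0);
}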