Index: head/sys/dev/oce/oce_if.c =================================================================== --- head/sys/dev/oce/oce_if.c (revision 271550) +++ head/sys/dev/oce/oce_if.c (revision 271551) @@ -1,2360 +1,2357 @@ /*- * Copyright (C) 2013 Emulex * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * 3. Neither the name of the Emulex Corporation nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * * Contact Information: * freebsd-drivers@emulex.com * * Emulex * 3333 Susan Street * Costa Mesa, CA 92626 */ /* $FreeBSD$ */ #include "opt_inet6.h" #include "opt_inet.h" #include "oce_if.h" /* UE Status Low CSR */ static char *ue_status_low_desc[] = { "CEV", "CTX", "DBUF", "ERX", "Host", "MPU", "NDMA", "PTC ", "RDMA ", "RXF ", "RXIPS ", "RXULP0 ", "RXULP1 ", "RXULP2 ", "TIM ", "TPOST ", "TPRE ", "TXIPS ", "TXULP0 ", "TXULP1 ", "UC ", "WDMA ", "TXULP2 ", "HOST1 ", "P0_OB_LINK ", "P1_OB_LINK ", "HOST_GPIO ", "MBOX ", "AXGMAC0", "AXGMAC1", "JTAG", "MPU_INTPEND" }; /* UE Status High CSR */ static char *ue_status_hi_desc[] = { "LPCMEMHOST", "MGMT_MAC", "PCS0ONLINE", "MPU_IRAM", "PCS1ONLINE", "PCTL0", "PCTL1", "PMEM", "RR", "TXPB", "RXPP", "XAUI", "TXP", "ARM", "IPC", "HOST2", "HOST3", "HOST4", "HOST5", "HOST6", "HOST7", "HOST8", "HOST9", "NETC", "Unknown", "Unknown", "Unknown", "Unknown", "Unknown", "Unknown", "Unknown", "Unknown" }; /* Driver entry points prototypes */ static int oce_probe(device_t dev); static int oce_attach(device_t dev); static int oce_detach(device_t dev); static int oce_shutdown(device_t dev); static int oce_ioctl(struct ifnet *ifp, u_long command, caddr_t data); static void oce_init(void *xsc); static int oce_multiq_start(struct ifnet *ifp, struct mbuf *m); static void oce_multiq_flush(struct ifnet *ifp); /* Driver interrupt routines protypes */ static void oce_intr(void *arg, int pending); static int oce_setup_intr(POCE_SOFTC sc); static int oce_fast_isr(void *arg); static int oce_alloc_intr(POCE_SOFTC sc, int vector, void (*isr) (void *arg, int pending)); /* Media callbacks prototypes */ static void oce_media_status(struct ifnet *ifp, struct ifmediareq *req); static int oce_media_change(struct ifnet *ifp); /* Transmit routines prototypes */ static int oce_tx(POCE_SOFTC sc, struct mbuf **mpp, int wq_index); static void oce_tx_restart(POCE_SOFTC sc, struct oce_wq *wq); static void oce_tx_complete(struct oce_wq *wq, uint32_t wqe_idx, uint32_t status); static int oce_multiq_transmit(struct ifnet *ifp, struct mbuf *m, struct oce_wq *wq); /* Receive routines prototypes */ static void oce_discard_rx_comp(struct oce_rq *rq, struct oce_nic_rx_cqe *cqe); static int oce_cqe_vtp_valid(POCE_SOFTC sc, struct oce_nic_rx_cqe *cqe); static int oce_cqe_portid_valid(POCE_SOFTC sc, struct oce_nic_rx_cqe *cqe); static void oce_rx(struct oce_rq *rq, uint32_t rqe_idx, struct oce_nic_rx_cqe *cqe); /* Helper function prototypes in this file */ static int oce_attach_ifp(POCE_SOFTC sc); static void oce_add_vlan(void *arg, struct ifnet *ifp, uint16_t vtag); static void oce_del_vlan(void *arg, struct ifnet *ifp, uint16_t vtag); static int oce_vid_config(POCE_SOFTC sc); static void oce_mac_addr_set(POCE_SOFTC sc); static int oce_handle_passthrough(struct ifnet *ifp, caddr_t data); static void oce_local_timer(void *arg); static void oce_if_deactivate(POCE_SOFTC sc); static void oce_if_activate(POCE_SOFTC sc); static void setup_max_queues_want(POCE_SOFTC sc); static void update_queues_got(POCE_SOFTC sc); static void process_link_state(POCE_SOFTC sc, struct oce_async_cqe_link_state *acqe); static int oce_tx_asic_stall_verify(POCE_SOFTC sc, struct mbuf *m); static void oce_get_config(POCE_SOFTC sc); static struct mbuf *oce_insert_vlan_tag(POCE_SOFTC sc, struct mbuf *m, boolean_t *complete); /* IP specific */ #if defined(INET6) || defined(INET) static int oce_init_lro(POCE_SOFTC sc); static void oce_rx_flush_lro(struct oce_rq *rq); static struct mbuf * oce_tso_setup(POCE_SOFTC sc, struct mbuf **mpp); #endif static device_method_t oce_dispatch[] = { DEVMETHOD(device_probe, oce_probe), DEVMETHOD(device_attach, oce_attach), DEVMETHOD(device_detach, oce_detach), DEVMETHOD(device_shutdown, oce_shutdown), DEVMETHOD_END }; static driver_t oce_driver = { "oce", oce_dispatch, sizeof(OCE_SOFTC) }; static devclass_t oce_devclass; DRIVER_MODULE(oce, pci, oce_driver, oce_devclass, 0, 0); MODULE_DEPEND(oce, pci, 1, 1, 1); MODULE_DEPEND(oce, ether, 1, 1, 1); MODULE_VERSION(oce, 1); /* global vars */ const char component_revision[32] = {"///" COMPONENT_REVISION "///"}; /* Module capabilites and parameters */ uint32_t oce_max_rsp_handled = OCE_MAX_RSP_HANDLED; uint32_t oce_enable_rss = OCE_MODCAP_RSS; TUNABLE_INT("hw.oce.max_rsp_handled", &oce_max_rsp_handled); TUNABLE_INT("hw.oce.enable_rss", &oce_enable_rss); /* Supported devices table */ static uint32_t supportedDevices[] = { (PCI_VENDOR_SERVERENGINES << 16) | PCI_PRODUCT_BE2, (PCI_VENDOR_SERVERENGINES << 16) | PCI_PRODUCT_BE3, (PCI_VENDOR_EMULEX << 16) | PCI_PRODUCT_BE3, (PCI_VENDOR_EMULEX << 16) | PCI_PRODUCT_XE201, (PCI_VENDOR_EMULEX << 16) | PCI_PRODUCT_XE201_VF, (PCI_VENDOR_EMULEX << 16) | PCI_PRODUCT_SH }; /***************************************************************************** * Driver entry points functions * *****************************************************************************/ static int oce_probe(device_t dev) { uint16_t vendor = 0; uint16_t device = 0; int i = 0; char str[256] = {0}; POCE_SOFTC sc; sc = device_get_softc(dev); bzero(sc, sizeof(OCE_SOFTC)); sc->dev = dev; vendor = pci_get_vendor(dev); device = pci_get_device(dev); for (i = 0; i < (sizeof(supportedDevices) / sizeof(uint32_t)); i++) { if (vendor == ((supportedDevices[i] >> 16) & 0xffff)) { if (device == (supportedDevices[i] & 0xffff)) { sprintf(str, "%s:%s", "Emulex CNA NIC function", component_revision); device_set_desc_copy(dev, str); switch (device) { case PCI_PRODUCT_BE2: sc->flags |= OCE_FLAGS_BE2; break; case PCI_PRODUCT_BE3: sc->flags |= OCE_FLAGS_BE3; break; case PCI_PRODUCT_XE201: case PCI_PRODUCT_XE201_VF: sc->flags |= OCE_FLAGS_XE201; break; case PCI_PRODUCT_SH: sc->flags |= OCE_FLAGS_SH; break; default: return ENXIO; } return BUS_PROBE_DEFAULT; } } } return ENXIO; } static int oce_attach(device_t dev) { POCE_SOFTC sc; int rc = 0; sc = device_get_softc(dev); rc = oce_hw_pci_alloc(sc); if (rc) return rc; sc->tx_ring_size = OCE_TX_RING_SIZE; sc->rx_ring_size = OCE_RX_RING_SIZE; sc->rq_frag_size = OCE_RQ_BUF_SIZE; sc->flow_control = OCE_DEFAULT_FLOW_CONTROL; sc->promisc = OCE_DEFAULT_PROMISCUOUS; LOCK_CREATE(&sc->bmbx_lock, "Mailbox_lock"); LOCK_CREATE(&sc->dev_lock, "Device_lock"); /* initialise the hardware */ rc = oce_hw_init(sc); if (rc) goto pci_res_free; oce_get_config(sc); setup_max_queues_want(sc); rc = oce_setup_intr(sc); if (rc) goto mbox_free; rc = oce_queue_init_all(sc); if (rc) goto intr_free; rc = oce_attach_ifp(sc); if (rc) goto queues_free; #if defined(INET6) || defined(INET) rc = oce_init_lro(sc); if (rc) goto ifp_free; #endif rc = oce_hw_start(sc); if (rc) goto lro_free; sc->vlan_attach = EVENTHANDLER_REGISTER(vlan_config, oce_add_vlan, sc, EVENTHANDLER_PRI_FIRST); sc->vlan_detach = EVENTHANDLER_REGISTER(vlan_unconfig, oce_del_vlan, sc, EVENTHANDLER_PRI_FIRST); rc = oce_stats_init(sc); if (rc) goto vlan_free; oce_add_sysctls(sc); callout_init(&sc->timer, CALLOUT_MPSAFE); rc = callout_reset(&sc->timer, 2 * hz, oce_local_timer, sc); if (rc) goto stats_free; return 0; stats_free: callout_drain(&sc->timer); oce_stats_free(sc); vlan_free: if (sc->vlan_attach) EVENTHANDLER_DEREGISTER(vlan_config, sc->vlan_attach); if (sc->vlan_detach) EVENTHANDLER_DEREGISTER(vlan_unconfig, sc->vlan_detach); oce_hw_intr_disable(sc); lro_free: #if defined(INET6) || defined(INET) oce_free_lro(sc); ifp_free: #endif ether_ifdetach(sc->ifp); if_free(sc->ifp); queues_free: oce_queue_release_all(sc); intr_free: oce_intr_free(sc); mbox_free: oce_dma_free(sc, &sc->bsmbx); pci_res_free: oce_hw_pci_free(sc); LOCK_DESTROY(&sc->dev_lock); LOCK_DESTROY(&sc->bmbx_lock); return rc; } static int oce_detach(device_t dev) { POCE_SOFTC sc = device_get_softc(dev); LOCK(&sc->dev_lock); oce_if_deactivate(sc); UNLOCK(&sc->dev_lock); callout_drain(&sc->timer); if (sc->vlan_attach != NULL) EVENTHANDLER_DEREGISTER(vlan_config, sc->vlan_attach); if (sc->vlan_detach != NULL) EVENTHANDLER_DEREGISTER(vlan_unconfig, sc->vlan_detach); ether_ifdetach(sc->ifp); if_free(sc->ifp); oce_hw_shutdown(sc); bus_generic_detach(dev); return 0; } static int oce_shutdown(device_t dev) { int rc; rc = oce_detach(dev); return rc; } static int oce_ioctl(struct ifnet *ifp, u_long command, caddr_t data) { struct ifreq *ifr = (struct ifreq *)data; POCE_SOFTC sc = ifp->if_softc; int rc = 0; uint32_t u; switch (command) { case SIOCGIFMEDIA: rc = ifmedia_ioctl(ifp, ifr, &sc->media, command); break; case SIOCSIFMTU: if (ifr->ifr_mtu > OCE_MAX_MTU) rc = EINVAL; else ifp->if_mtu = ifr->ifr_mtu; break; case SIOCSIFFLAGS: if (ifp->if_flags & IFF_UP) { if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) { sc->ifp->if_drv_flags |= IFF_DRV_RUNNING; oce_init(sc); } device_printf(sc->dev, "Interface Up\n"); } else { LOCK(&sc->dev_lock); sc->ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE); oce_if_deactivate(sc); UNLOCK(&sc->dev_lock); device_printf(sc->dev, "Interface Down\n"); } if ((ifp->if_flags & IFF_PROMISC) && !sc->promisc) { if (!oce_rxf_set_promiscuous(sc, (1 | (1 << 1)))) sc->promisc = TRUE; } else if (!(ifp->if_flags & IFF_PROMISC) && sc->promisc) { if (!oce_rxf_set_promiscuous(sc, 0)) sc->promisc = FALSE; } break; case SIOCADDMULTI: case SIOCDELMULTI: rc = oce_hw_update_multicast(sc); if (rc) device_printf(sc->dev, "Update multicast address failed\n"); break; case SIOCSIFCAP: u = ifr->ifr_reqcap ^ ifp->if_capenable; if (u & IFCAP_TXCSUM) { ifp->if_capenable ^= IFCAP_TXCSUM; ifp->if_hwassist ^= (CSUM_TCP | CSUM_UDP | CSUM_IP); if (IFCAP_TSO & ifp->if_capenable && !(IFCAP_TXCSUM & ifp->if_capenable)) { ifp->if_capenable &= ~IFCAP_TSO; ifp->if_hwassist &= ~CSUM_TSO; if_printf(ifp, "TSO disabled due to -txcsum.\n"); } } if (u & IFCAP_RXCSUM) ifp->if_capenable ^= IFCAP_RXCSUM; if (u & IFCAP_TSO4) { ifp->if_capenable ^= IFCAP_TSO4; if (IFCAP_TSO & ifp->if_capenable) { if (IFCAP_TXCSUM & ifp->if_capenable) ifp->if_hwassist |= CSUM_TSO; else { ifp->if_capenable &= ~IFCAP_TSO; ifp->if_hwassist &= ~CSUM_TSO; if_printf(ifp, "Enable txcsum first.\n"); rc = EAGAIN; } } else ifp->if_hwassist &= ~CSUM_TSO; } if (u & IFCAP_VLAN_HWTAGGING) ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING; if (u & IFCAP_VLAN_HWFILTER) { ifp->if_capenable ^= IFCAP_VLAN_HWFILTER; oce_vid_config(sc); } #if defined(INET6) || defined(INET) if (u & IFCAP_LRO) ifp->if_capenable ^= IFCAP_LRO; #endif break; case SIOCGPRIVATE_0: rc = oce_handle_passthrough(ifp, data); break; default: rc = ether_ioctl(ifp, command, data); break; } return rc; } static void oce_init(void *arg) { POCE_SOFTC sc = arg; LOCK(&sc->dev_lock); if (sc->ifp->if_flags & IFF_UP) { oce_if_deactivate(sc); oce_if_activate(sc); } UNLOCK(&sc->dev_lock); } static int oce_multiq_start(struct ifnet *ifp, struct mbuf *m) { POCE_SOFTC sc = ifp->if_softc; struct oce_wq *wq = NULL; int queue_index = 0; int status = 0; if ((m->m_flags & M_FLOWID) != 0) queue_index = m->m_pkthdr.flowid % sc->nwqs; wq = sc->wq[queue_index]; LOCK(&wq->tx_lock); status = oce_multiq_transmit(ifp, m, wq); UNLOCK(&wq->tx_lock); return status; } static void oce_multiq_flush(struct ifnet *ifp) { POCE_SOFTC sc = ifp->if_softc; struct mbuf *m; int i = 0; for (i = 0; i < sc->nwqs; i++) { while ((m = buf_ring_dequeue_sc(sc->wq[i]->br)) != NULL) m_freem(m); } if_qflush(ifp); } /***************************************************************************** * Driver interrupt routines functions * *****************************************************************************/ static void oce_intr(void *arg, int pending) { POCE_INTR_INFO ii = (POCE_INTR_INFO) arg; POCE_SOFTC sc = ii->sc; struct oce_eq *eq = ii->eq; struct oce_eqe *eqe; struct oce_cq *cq = NULL; int i, num_eqes = 0; bus_dmamap_sync(eq->ring->dma.tag, eq->ring->dma.map, BUS_DMASYNC_POSTWRITE); do { eqe = RING_GET_CONSUMER_ITEM_VA(eq->ring, struct oce_eqe); if (eqe->evnt == 0) break; eqe->evnt = 0; bus_dmamap_sync(eq->ring->dma.tag, eq->ring->dma.map, BUS_DMASYNC_POSTWRITE); RING_GET(eq->ring, 1); num_eqes++; } while (TRUE); if (!num_eqes) goto eq_arm; /* Spurious */ /* Clear EQ entries, but dont arm */ oce_arm_eq(sc, eq->eq_id, num_eqes, FALSE, FALSE); /* Process TX, RX and MCC. But dont arm CQ*/ for (i = 0; i < eq->cq_valid; i++) { cq = eq->cq[i]; (*cq->cq_handler)(cq->cb_arg); } /* Arm all cqs connected to this EQ */ for (i = 0; i < eq->cq_valid; i++) { cq = eq->cq[i]; oce_arm_cq(sc, cq->cq_id, 0, TRUE); } eq_arm: oce_arm_eq(sc, eq->eq_id, 0, TRUE, FALSE); return; } static int oce_setup_intr(POCE_SOFTC sc) { int rc = 0, use_intx = 0; int vector = 0, req_vectors = 0; if (is_rss_enabled(sc)) req_vectors = MAX((sc->nrqs - 1), sc->nwqs); else req_vectors = 1; if (sc->flags & OCE_FLAGS_MSIX_CAPABLE) { sc->intr_count = req_vectors; rc = pci_alloc_msix(sc->dev, &sc->intr_count); if (rc != 0) { use_intx = 1; pci_release_msi(sc->dev); } else sc->flags |= OCE_FLAGS_USING_MSIX; } else use_intx = 1; if (use_intx) sc->intr_count = 1; /* Scale number of queues based on intr we got */ update_queues_got(sc); if (use_intx) { device_printf(sc->dev, "Using legacy interrupt\n"); rc = oce_alloc_intr(sc, vector, oce_intr); if (rc) goto error; } else { for (; vector < sc->intr_count; vector++) { rc = oce_alloc_intr(sc, vector, oce_intr); if (rc) goto error; } } return 0; error: oce_intr_free(sc); return rc; } static int oce_fast_isr(void *arg) { POCE_INTR_INFO ii = (POCE_INTR_INFO) arg; POCE_SOFTC sc = ii->sc; if (ii->eq == NULL) return FILTER_STRAY; oce_arm_eq(sc, ii->eq->eq_id, 0, FALSE, TRUE); taskqueue_enqueue_fast(ii->tq, &ii->task); ii->eq->intr++; return FILTER_HANDLED; } static int oce_alloc_intr(POCE_SOFTC sc, int vector, void (*isr) (void *arg, int pending)) { POCE_INTR_INFO ii = &sc->intrs[vector]; int rc = 0, rr; if (vector >= OCE_MAX_EQ) return (EINVAL); /* Set the resource id for the interrupt. * MSIx is vector + 1 for the resource id, * INTx is 0 for the resource id. */ if (sc->flags & OCE_FLAGS_USING_MSIX) rr = vector + 1; else rr = 0; ii->intr_res = bus_alloc_resource_any(sc->dev, SYS_RES_IRQ, &rr, RF_ACTIVE|RF_SHAREABLE); ii->irq_rr = rr; if (ii->intr_res == NULL) { device_printf(sc->dev, "Could not allocate interrupt\n"); rc = ENXIO; return rc; } TASK_INIT(&ii->task, 0, isr, ii); ii->vector = vector; sprintf(ii->task_name, "oce_task[%d]", ii->vector); ii->tq = taskqueue_create_fast(ii->task_name, M_NOWAIT, taskqueue_thread_enqueue, &ii->tq); taskqueue_start_threads(&ii->tq, 1, PI_NET, "%s taskq", device_get_nameunit(sc->dev)); ii->sc = sc; rc = bus_setup_intr(sc->dev, ii->intr_res, INTR_TYPE_NET, oce_fast_isr, NULL, ii, &ii->tag); return rc; } void oce_intr_free(POCE_SOFTC sc) { int i = 0; for (i = 0; i < sc->intr_count; i++) { if (sc->intrs[i].tag != NULL) bus_teardown_intr(sc->dev, sc->intrs[i].intr_res, sc->intrs[i].tag); if (sc->intrs[i].tq != NULL) taskqueue_free(sc->intrs[i].tq); if (sc->intrs[i].intr_res != NULL) bus_release_resource(sc->dev, SYS_RES_IRQ, sc->intrs[i].irq_rr, sc->intrs[i].intr_res); sc->intrs[i].tag = NULL; sc->intrs[i].intr_res = NULL; } if (sc->flags & OCE_FLAGS_USING_MSIX) pci_release_msi(sc->dev); } /****************************************************************************** * Media callbacks functions * ******************************************************************************/ static void oce_media_status(struct ifnet *ifp, struct ifmediareq *req) { POCE_SOFTC sc = (POCE_SOFTC) ifp->if_softc; req->ifm_status = IFM_AVALID; req->ifm_active = IFM_ETHER; if (sc->link_status == 1) req->ifm_status |= IFM_ACTIVE; else return; switch (sc->link_speed) { case 1: /* 10 Mbps */ req->ifm_active |= IFM_10_T | IFM_FDX; sc->speed = 10; break; case 2: /* 100 Mbps */ req->ifm_active |= IFM_100_TX | IFM_FDX; sc->speed = 100; break; case 3: /* 1 Gbps */ req->ifm_active |= IFM_1000_T | IFM_FDX; sc->speed = 1000; break; case 4: /* 10 Gbps */ req->ifm_active |= IFM_10G_SR | IFM_FDX; sc->speed = 10000; break; case 5: /* 20 Gbps */ req->ifm_active |= IFM_10G_SR | IFM_FDX; sc->speed = 20000; break; case 6: /* 25 Gbps */ req->ifm_active |= IFM_10G_SR | IFM_FDX; sc->speed = 25000; break; case 7: /* 40 Gbps */ req->ifm_active |= IFM_40G_SR4 | IFM_FDX; sc->speed = 40000; break; default: sc->speed = 0; break; } return; } int oce_media_change(struct ifnet *ifp) { return 0; } /***************************************************************************** * Transmit routines functions * *****************************************************************************/ static int oce_tx(POCE_SOFTC sc, struct mbuf **mpp, int wq_index) { int rc = 0, i, retry_cnt = 0; bus_dma_segment_t segs[OCE_MAX_TX_ELEMENTS]; struct mbuf *m, *m_temp; struct oce_wq *wq = sc->wq[wq_index]; struct oce_packet_desc *pd; struct oce_nic_hdr_wqe *nichdr; struct oce_nic_frag_wqe *nicfrag; int num_wqes; uint32_t reg_value; boolean_t complete = TRUE; m = *mpp; if (!m) return EINVAL; if (!(m->m_flags & M_PKTHDR)) { rc = ENXIO; goto free_ret; } if(oce_tx_asic_stall_verify(sc, m)) { m = oce_insert_vlan_tag(sc, m, &complete); if(!m) { device_printf(sc->dev, "Insertion unsuccessful\n"); return 0; } } if (m->m_pkthdr.csum_flags & CSUM_TSO) { /* consolidate packet buffers for TSO/LSO segment offload */ #if defined(INET6) || defined(INET) m = oce_tso_setup(sc, mpp); #else m = NULL; #endif if (m == NULL) { rc = ENXIO; goto free_ret; } } pd = &wq->pckts[wq->pkt_desc_head]; retry: rc = bus_dmamap_load_mbuf_sg(wq->tag, pd->map, m, segs, &pd->nsegs, BUS_DMA_NOWAIT); if (rc == 0) { num_wqes = pd->nsegs + 1; if (IS_BE(sc) || IS_SH(sc)) { /*Dummy required only for BE3.*/ if (num_wqes & 1) num_wqes++; } if (num_wqes >= RING_NUM_FREE(wq->ring)) { bus_dmamap_unload(wq->tag, pd->map); return EBUSY; } atomic_store_rel_int(&wq->pkt_desc_head, (wq->pkt_desc_head + 1) % \ OCE_WQ_PACKET_ARRAY_SIZE); bus_dmamap_sync(wq->tag, pd->map, BUS_DMASYNC_PREWRITE); pd->mbuf = m; nichdr = RING_GET_PRODUCER_ITEM_VA(wq->ring, struct oce_nic_hdr_wqe); nichdr->u0.dw[0] = 0; nichdr->u0.dw[1] = 0; nichdr->u0.dw[2] = 0; nichdr->u0.dw[3] = 0; nichdr->u0.s.complete = complete; nichdr->u0.s.event = 1; nichdr->u0.s.crc = 1; nichdr->u0.s.forward = 0; nichdr->u0.s.ipcs = (m->m_pkthdr.csum_flags & CSUM_IP) ? 1 : 0; nichdr->u0.s.udpcs = (m->m_pkthdr.csum_flags & CSUM_UDP) ? 1 : 0; nichdr->u0.s.tcpcs = (m->m_pkthdr.csum_flags & CSUM_TCP) ? 1 : 0; nichdr->u0.s.num_wqe = num_wqes; nichdr->u0.s.total_length = m->m_pkthdr.len; if (m->m_flags & M_VLANTAG) { nichdr->u0.s.vlan = 1; /*Vlan present*/ nichdr->u0.s.vlan_tag = m->m_pkthdr.ether_vtag; } if (m->m_pkthdr.csum_flags & CSUM_TSO) { if (m->m_pkthdr.tso_segsz) { nichdr->u0.s.lso = 1; nichdr->u0.s.lso_mss = m->m_pkthdr.tso_segsz; } if (!IS_BE(sc) || !IS_SH(sc)) nichdr->u0.s.ipcs = 1; } RING_PUT(wq->ring, 1); atomic_add_int(&wq->ring->num_used, 1); for (i = 0; i < pd->nsegs; i++) { nicfrag = RING_GET_PRODUCER_ITEM_VA(wq->ring, struct oce_nic_frag_wqe); nicfrag->u0.s.rsvd0 = 0; nicfrag->u0.s.frag_pa_hi = ADDR_HI(segs[i].ds_addr); nicfrag->u0.s.frag_pa_lo = ADDR_LO(segs[i].ds_addr); nicfrag->u0.s.frag_len = segs[i].ds_len; pd->wqe_idx = wq->ring->pidx; RING_PUT(wq->ring, 1); atomic_add_int(&wq->ring->num_used, 1); } if (num_wqes > (pd->nsegs + 1)) { nicfrag = RING_GET_PRODUCER_ITEM_VA(wq->ring, struct oce_nic_frag_wqe); nicfrag->u0.dw[0] = 0; nicfrag->u0.dw[1] = 0; nicfrag->u0.dw[2] = 0; nicfrag->u0.dw[3] = 0; pd->wqe_idx = wq->ring->pidx; RING_PUT(wq->ring, 1); atomic_add_int(&wq->ring->num_used, 1); pd->nsegs++; } sc->ifp->if_opackets++; wq->tx_stats.tx_reqs++; wq->tx_stats.tx_wrbs += num_wqes; wq->tx_stats.tx_bytes += m->m_pkthdr.len; wq->tx_stats.tx_pkts++; bus_dmamap_sync(wq->ring->dma.tag, wq->ring->dma.map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); reg_value = (num_wqes << 16) | wq->wq_id; OCE_WRITE_REG32(sc, db, wq->db_offset, reg_value); } else if (rc == EFBIG) { if (retry_cnt == 0) { m_temp = m_defrag(m, M_NOWAIT); if (m_temp == NULL) goto free_ret; m = m_temp; *mpp = m_temp; retry_cnt = retry_cnt + 1; goto retry; } else goto free_ret; } else if (rc == ENOMEM) return rc; else goto free_ret; return 0; free_ret: m_freem(*mpp); *mpp = NULL; return rc; } static void oce_tx_complete(struct oce_wq *wq, uint32_t wqe_idx, uint32_t status) { struct oce_packet_desc *pd; POCE_SOFTC sc = (POCE_SOFTC) wq->parent; struct mbuf *m; pd = &wq->pckts[wq->pkt_desc_tail]; atomic_store_rel_int(&wq->pkt_desc_tail, (wq->pkt_desc_tail + 1) % OCE_WQ_PACKET_ARRAY_SIZE); atomic_subtract_int(&wq->ring->num_used, pd->nsegs + 1); bus_dmamap_sync(wq->tag, pd->map, BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(wq->tag, pd->map); m = pd->mbuf; m_freem(m); pd->mbuf = NULL; if (sc->ifp->if_drv_flags & IFF_DRV_OACTIVE) { if (wq->ring->num_used < (wq->ring->num_items / 2)) { sc->ifp->if_drv_flags &= ~(IFF_DRV_OACTIVE); oce_tx_restart(sc, wq); } } } static void oce_tx_restart(POCE_SOFTC sc, struct oce_wq *wq) { if ((sc->ifp->if_drv_flags & IFF_DRV_RUNNING) != IFF_DRV_RUNNING) return; #if __FreeBSD_version >= 800000 if (!drbr_empty(sc->ifp, wq->br)) #else if (!IFQ_DRV_IS_EMPTY(&sc->ifp->if_snd)) #endif taskqueue_enqueue_fast(taskqueue_swi, &wq->txtask); } #if defined(INET6) || defined(INET) static struct mbuf * oce_tso_setup(POCE_SOFTC sc, struct mbuf **mpp) { struct mbuf *m; #ifdef INET struct ip *ip; #endif #ifdef INET6 struct ip6_hdr *ip6; #endif struct ether_vlan_header *eh; struct tcphdr *th; uint16_t etype; int total_len = 0, ehdrlen = 0; m = *mpp; if (M_WRITABLE(m) == 0) { m = m_dup(*mpp, M_NOWAIT); if (!m) return NULL; m_freem(*mpp); *mpp = m; } eh = mtod(m, struct ether_vlan_header *); if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) { etype = ntohs(eh->evl_proto); ehdrlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN; } else { etype = ntohs(eh->evl_encap_proto); ehdrlen = ETHER_HDR_LEN; } switch (etype) { #ifdef INET case ETHERTYPE_IP: ip = (struct ip *)(m->m_data + ehdrlen); if (ip->ip_p != IPPROTO_TCP) return NULL; th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2)); total_len = ehdrlen + (ip->ip_hl << 2) + (th->th_off << 2); break; #endif #ifdef INET6 case ETHERTYPE_IPV6: ip6 = (struct ip6_hdr *)(m->m_data + ehdrlen); if (ip6->ip6_nxt != IPPROTO_TCP) return NULL; th = (struct tcphdr *)((caddr_t)ip6 + sizeof(struct ip6_hdr)); total_len = ehdrlen + sizeof(struct ip6_hdr) + (th->th_off << 2); break; #endif default: return NULL; } m = m_pullup(m, total_len); if (!m) return NULL; *mpp = m; return m; } #endif /* INET6 || INET */ void oce_tx_task(void *arg, int npending) { struct oce_wq *wq = arg; POCE_SOFTC sc = wq->parent; struct ifnet *ifp = sc->ifp; int rc = 0; #if __FreeBSD_version >= 800000 LOCK(&wq->tx_lock); rc = oce_multiq_transmit(ifp, NULL, wq); if (rc) { device_printf(sc->dev, "TX[%d] restart failed\n", wq->queue_index); } UNLOCK(&wq->tx_lock); #else oce_start(ifp); #endif } void oce_start(struct ifnet *ifp) { POCE_SOFTC sc = ifp->if_softc; struct mbuf *m; int rc = 0; int def_q = 0; /* Defualt tx queue is 0*/ if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) != IFF_DRV_RUNNING) return; if (!sc->link_status) return; do { IF_DEQUEUE(&sc->ifp->if_snd, m); if (m == NULL) break; LOCK(&sc->wq[def_q]->tx_lock); rc = oce_tx(sc, &m, def_q); UNLOCK(&sc->wq[def_q]->tx_lock); if (rc) { if (m != NULL) { sc->wq[def_q]->tx_stats.tx_stops ++; ifp->if_drv_flags |= IFF_DRV_OACTIVE; IFQ_DRV_PREPEND(&ifp->if_snd, m); m = NULL; } break; } if (m != NULL) ETHER_BPF_MTAP(ifp, m); } while (TRUE); return; } /* Handle the Completion Queue for transmit */ uint16_t oce_wq_handler(void *arg) { struct oce_wq *wq = (struct oce_wq *)arg; POCE_SOFTC sc = wq->parent; struct oce_cq *cq = wq->cq; struct oce_nic_tx_cqe *cqe; int num_cqes = 0; bus_dmamap_sync(cq->ring->dma.tag, cq->ring->dma.map, BUS_DMASYNC_POSTWRITE); cqe = RING_GET_CONSUMER_ITEM_VA(cq->ring, struct oce_nic_tx_cqe); while (cqe->u0.dw[3]) { DW_SWAP((uint32_t *) cqe, sizeof(oce_wq_cqe)); wq->ring->cidx = cqe->u0.s.wqe_index + 1; if (wq->ring->cidx >= wq->ring->num_items) wq->ring->cidx -= wq->ring->num_items; oce_tx_complete(wq, cqe->u0.s.wqe_index, cqe->u0.s.status); wq->tx_stats.tx_compl++; cqe->u0.dw[3] = 0; RING_GET(cq->ring, 1); bus_dmamap_sync(cq->ring->dma.tag, cq->ring->dma.map, BUS_DMASYNC_POSTWRITE); cqe = RING_GET_CONSUMER_ITEM_VA(cq->ring, struct oce_nic_tx_cqe); num_cqes++; } if (num_cqes) oce_arm_cq(sc, cq->cq_id, num_cqes, FALSE); return 0; } static int oce_multiq_transmit(struct ifnet *ifp, struct mbuf *m, struct oce_wq *wq) { POCE_SOFTC sc = ifp->if_softc; int status = 0, queue_index = 0; struct mbuf *next = NULL; struct buf_ring *br = NULL; br = wq->br; queue_index = wq->queue_index; if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) != IFF_DRV_RUNNING) { if (m != NULL) status = drbr_enqueue(ifp, br, m); return status; } if (m != NULL) { if ((status = drbr_enqueue(ifp, br, m)) != 0) return status; } while ((next = drbr_peek(ifp, br)) != NULL) { if (oce_tx(sc, &next, queue_index)) { if (next == NULL) { drbr_advance(ifp, br); } else { drbr_putback(ifp, br, next); wq->tx_stats.tx_stops ++; ifp->if_drv_flags |= IFF_DRV_OACTIVE; } break; } drbr_advance(ifp, br); ifp->if_obytes += next->m_pkthdr.len; if (next->m_flags & M_MCAST) ifp->if_omcasts++; ETHER_BPF_MTAP(ifp, next); } return 0; } /***************************************************************************** * Receive routines functions * *****************************************************************************/ static void oce_rx(struct oce_rq *rq, uint32_t rqe_idx, struct oce_nic_rx_cqe *cqe) { uint32_t out; struct oce_packet_desc *pd; POCE_SOFTC sc = (POCE_SOFTC) rq->parent; int i, len, frag_len; struct mbuf *m = NULL, *tail = NULL; uint16_t vtag; len = cqe->u0.s.pkt_size; if (!len) { /*partial DMA workaround for Lancer*/ oce_discard_rx_comp(rq, cqe); goto exit; } /* Get vlan_tag value */ if(IS_BE(sc) || IS_SH(sc)) vtag = BSWAP_16(cqe->u0.s.vlan_tag); else vtag = cqe->u0.s.vlan_tag; for (i = 0; i < cqe->u0.s.num_fragments; i++) { if (rq->packets_out == rq->packets_in) { device_printf(sc->dev, "RQ transmit descriptor missing\n"); } out = rq->packets_out + 1; if (out == OCE_RQ_PACKET_ARRAY_SIZE) out = 0; pd = &rq->pckts[rq->packets_out]; rq->packets_out = out; bus_dmamap_sync(rq->tag, pd->map, BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(rq->tag, pd->map); rq->pending--; frag_len = (len > rq->cfg.frag_size) ? rq->cfg.frag_size : len; pd->mbuf->m_len = frag_len; if (tail != NULL) { /* additional fragments */ pd->mbuf->m_flags &= ~M_PKTHDR; tail->m_next = pd->mbuf; tail = pd->mbuf; } else { /* first fragment, fill out much of the packet header */ pd->mbuf->m_pkthdr.len = len; pd->mbuf->m_pkthdr.csum_flags = 0; if (IF_CSUM_ENABLED(sc)) { if (cqe->u0.s.l4_cksum_pass) { pd->mbuf->m_pkthdr.csum_flags |= (CSUM_DATA_VALID | CSUM_PSEUDO_HDR); pd->mbuf->m_pkthdr.csum_data = 0xffff; } if (cqe->u0.s.ip_cksum_pass) { if (!cqe->u0.s.ip_ver) { /* IPV4 */ pd->mbuf->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED|CSUM_IP_VALID); } } } m = tail = pd->mbuf; } pd->mbuf = NULL; len -= frag_len; } if (m) { if (!oce_cqe_portid_valid(sc, cqe)) { m_freem(m); goto exit; } m->m_pkthdr.rcvif = sc->ifp; #if __FreeBSD_version >= 800000 if (rq->queue_index) m->m_pkthdr.flowid = (rq->queue_index - 1); else m->m_pkthdr.flowid = rq->queue_index; m->m_flags |= M_FLOWID; #endif /* This deternies if vlan tag is Valid */ if (oce_cqe_vtp_valid(sc, cqe)) { if (sc->function_mode & FNM_FLEX10_MODE) { /* FLEX10. If QnQ is not set, neglect VLAN */ if (cqe->u0.s.qnq) { m->m_pkthdr.ether_vtag = vtag; m->m_flags |= M_VLANTAG; } } else if (sc->pvid != (vtag & VLAN_VID_MASK)) { /* In UMC mode generally pvid will be striped by hw. But in some cases we have seen it comes with pvid. So if pvid == vlan, neglect vlan. */ m->m_pkthdr.ether_vtag = vtag; m->m_flags |= M_VLANTAG; } } sc->ifp->if_ipackets++; #if defined(INET6) || defined(INET) /* Try to queue to LRO */ if (IF_LRO_ENABLED(sc) && (cqe->u0.s.ip_cksum_pass) && (cqe->u0.s.l4_cksum_pass) && (!cqe->u0.s.ip_ver) && (rq->lro.lro_cnt != 0)) { if (tcp_lro_rx(&rq->lro, m, 0) == 0) { rq->lro_pkts_queued ++; goto post_done; } /* If LRO posting fails then try to post to STACK */ } #endif (*sc->ifp->if_input) (sc->ifp, m); #if defined(INET6) || defined(INET) post_done: #endif /* Update rx stats per queue */ rq->rx_stats.rx_pkts++; rq->rx_stats.rx_bytes += cqe->u0.s.pkt_size; rq->rx_stats.rx_frags += cqe->u0.s.num_fragments; if (cqe->u0.s.pkt_type == OCE_MULTICAST_PACKET) rq->rx_stats.rx_mcast_pkts++; if (cqe->u0.s.pkt_type == OCE_UNICAST_PACKET) rq->rx_stats.rx_ucast_pkts++; } exit: return; } static void oce_discard_rx_comp(struct oce_rq *rq, struct oce_nic_rx_cqe *cqe) { uint32_t out, i = 0; struct oce_packet_desc *pd; POCE_SOFTC sc = (POCE_SOFTC) rq->parent; int num_frags = cqe->u0.s.num_fragments; for (i = 0; i < num_frags; i++) { if (rq->packets_out == rq->packets_in) { device_printf(sc->dev, "RQ transmit descriptor missing\n"); } out = rq->packets_out + 1; if (out == OCE_RQ_PACKET_ARRAY_SIZE) out = 0; pd = &rq->pckts[rq->packets_out]; rq->packets_out = out; bus_dmamap_sync(rq->tag, pd->map, BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(rq->tag, pd->map); rq->pending--; m_freem(pd->mbuf); } } static int oce_cqe_vtp_valid(POCE_SOFTC sc, struct oce_nic_rx_cqe *cqe) { struct oce_nic_rx_cqe_v1 *cqe_v1; int vtp = 0; if (sc->be3_native) { cqe_v1 = (struct oce_nic_rx_cqe_v1 *)cqe; vtp = cqe_v1->u0.s.vlan_tag_present; } else vtp = cqe->u0.s.vlan_tag_present; return vtp; } static int oce_cqe_portid_valid(POCE_SOFTC sc, struct oce_nic_rx_cqe *cqe) { struct oce_nic_rx_cqe_v1 *cqe_v1; int port_id = 0; if (sc->be3_native && (IS_BE(sc) || IS_SH(sc))) { cqe_v1 = (struct oce_nic_rx_cqe_v1 *)cqe; port_id = cqe_v1->u0.s.port; if (sc->port_id != port_id) return 0; } else ;/* For BE3 legacy and Lancer this is dummy */ return 1; } #if defined(INET6) || defined(INET) static void oce_rx_flush_lro(struct oce_rq *rq) { struct lro_ctrl *lro = &rq->lro; struct lro_entry *queued; POCE_SOFTC sc = (POCE_SOFTC) rq->parent; if (!IF_LRO_ENABLED(sc)) return; while ((queued = SLIST_FIRST(&lro->lro_active)) != NULL) { SLIST_REMOVE_HEAD(&lro->lro_active, next); tcp_lro_flush(lro, queued); } rq->lro_pkts_queued = 0; return; } static int oce_init_lro(POCE_SOFTC sc) { struct lro_ctrl *lro = NULL; int i = 0, rc = 0; for (i = 0; i < sc->nrqs; i++) { lro = &sc->rq[i]->lro; rc = tcp_lro_init(lro); if (rc != 0) { device_printf(sc->dev, "LRO init failed\n"); return rc; } lro->ifp = sc->ifp; } return rc; } void oce_free_lro(POCE_SOFTC sc) { struct lro_ctrl *lro = NULL; int i = 0; for (i = 0; i < sc->nrqs; i++) { lro = &sc->rq[i]->lro; if (lro) tcp_lro_free(lro); } } #endif int oce_alloc_rx_bufs(struct oce_rq *rq, int count) { POCE_SOFTC sc = (POCE_SOFTC) rq->parent; int i, in, rc; struct oce_packet_desc *pd; bus_dma_segment_t segs[6]; int nsegs, added = 0; struct oce_nic_rqe *rqe; pd_rxulp_db_t rxdb_reg; bzero(&rxdb_reg, sizeof(pd_rxulp_db_t)); for (i = 0; i < count; i++) { in = rq->packets_in + 1; if (in == OCE_RQ_PACKET_ARRAY_SIZE) in = 0; if (in == rq->packets_out) break; /* no more room */ pd = &rq->pckts[rq->packets_in]; pd->mbuf = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); if (pd->mbuf == NULL) break; pd->mbuf->m_len = pd->mbuf->m_pkthdr.len = MCLBYTES; rc = bus_dmamap_load_mbuf_sg(rq->tag, pd->map, pd->mbuf, segs, &nsegs, BUS_DMA_NOWAIT); if (rc) { m_free(pd->mbuf); break; } if (nsegs != 1) { i--; continue; } rq->packets_in = in; bus_dmamap_sync(rq->tag, pd->map, BUS_DMASYNC_PREREAD); rqe = RING_GET_PRODUCER_ITEM_VA(rq->ring, struct oce_nic_rqe); rqe->u0.s.frag_pa_hi = ADDR_HI(segs[0].ds_addr); rqe->u0.s.frag_pa_lo = ADDR_LO(segs[0].ds_addr); DW_SWAP(u32ptr(rqe), sizeof(struct oce_nic_rqe)); RING_PUT(rq->ring, 1); added++; rq->pending++; } if (added != 0) { for (i = added / OCE_MAX_RQ_POSTS; i > 0; i--) { rxdb_reg.bits.num_posted = OCE_MAX_RQ_POSTS; rxdb_reg.bits.qid = rq->rq_id; OCE_WRITE_REG32(sc, db, PD_RXULP_DB, rxdb_reg.dw0); added -= OCE_MAX_RQ_POSTS; } if (added > 0) { rxdb_reg.bits.qid = rq->rq_id; rxdb_reg.bits.num_posted = added; OCE_WRITE_REG32(sc, db, PD_RXULP_DB, rxdb_reg.dw0); } } return 0; } /* Handle the Completion Queue for receive */ uint16_t oce_rq_handler(void *arg) { struct oce_rq *rq = (struct oce_rq *)arg; struct oce_cq *cq = rq->cq; POCE_SOFTC sc = rq->parent; struct oce_nic_rx_cqe *cqe; int num_cqes = 0, rq_buffers_used = 0; bus_dmamap_sync(cq->ring->dma.tag, cq->ring->dma.map, BUS_DMASYNC_POSTWRITE); cqe = RING_GET_CONSUMER_ITEM_VA(cq->ring, struct oce_nic_rx_cqe); while (cqe->u0.dw[2]) { DW_SWAP((uint32_t *) cqe, sizeof(oce_rq_cqe)); RING_GET(rq->ring, 1); if (cqe->u0.s.error == 0) { oce_rx(rq, cqe->u0.s.frag_index, cqe); } else { rq->rx_stats.rxcp_err++; sc->ifp->if_ierrors++; /* Post L3/L4 errors to stack.*/ oce_rx(rq, cqe->u0.s.frag_index, cqe); } rq->rx_stats.rx_compl++; cqe->u0.dw[2] = 0; #if defined(INET6) || defined(INET) if (IF_LRO_ENABLED(sc) && rq->lro_pkts_queued >= 16) { oce_rx_flush_lro(rq); } #endif RING_GET(cq->ring, 1); bus_dmamap_sync(cq->ring->dma.tag, cq->ring->dma.map, BUS_DMASYNC_POSTWRITE); cqe = RING_GET_CONSUMER_ITEM_VA(cq->ring, struct oce_nic_rx_cqe); num_cqes++; if (num_cqes >= (IS_XE201(sc) ? 8 : oce_max_rsp_handled)) break; } #if defined(INET6) || defined(INET) if (IF_LRO_ENABLED(sc)) oce_rx_flush_lro(rq); #endif if (num_cqes) { oce_arm_cq(sc, cq->cq_id, num_cqes, FALSE); rq_buffers_used = OCE_RQ_PACKET_ARRAY_SIZE - rq->pending; if (rq_buffers_used > 1) oce_alloc_rx_bufs(rq, (rq_buffers_used - 1)); } return 0; } /***************************************************************************** * Helper function prototypes in this file * *****************************************************************************/ static int oce_attach_ifp(POCE_SOFTC sc) { sc->ifp = if_alloc(IFT_ETHER); if (!sc->ifp) return ENOMEM; ifmedia_init(&sc->media, IFM_IMASK, oce_media_change, oce_media_status); ifmedia_add(&sc->media, IFM_ETHER | IFM_AUTO, 0, NULL); ifmedia_set(&sc->media, IFM_ETHER | IFM_AUTO); sc->ifp->if_flags = IFF_BROADCAST | IFF_MULTICAST; sc->ifp->if_ioctl = oce_ioctl; sc->ifp->if_start = oce_start; sc->ifp->if_init = oce_init; sc->ifp->if_mtu = ETHERMTU; sc->ifp->if_softc = sc; #if __FreeBSD_version >= 800000 sc->ifp->if_transmit = oce_multiq_start; sc->ifp->if_qflush = oce_multiq_flush; #endif if_initname(sc->ifp, device_get_name(sc->dev), device_get_unit(sc->dev)); sc->ifp->if_snd.ifq_drv_maxlen = OCE_MAX_TX_DESC - 1; IFQ_SET_MAXLEN(&sc->ifp->if_snd, sc->ifp->if_snd.ifq_drv_maxlen); IFQ_SET_READY(&sc->ifp->if_snd); sc->ifp->if_hwassist = OCE_IF_HWASSIST; sc->ifp->if_hwassist |= CSUM_TSO; sc->ifp->if_hwassist |= (CSUM_IP | CSUM_TCP | CSUM_UDP); sc->ifp->if_capabilities = OCE_IF_CAPABILITIES; sc->ifp->if_capabilities |= IFCAP_HWCSUM; sc->ifp->if_capabilities |= IFCAP_VLAN_HWFILTER; #if defined(INET6) || defined(INET) sc->ifp->if_capabilities |= IFCAP_TSO; sc->ifp->if_capabilities |= IFCAP_LRO; sc->ifp->if_capabilities |= IFCAP_VLAN_HWTSO; #endif sc->ifp->if_capenable = sc->ifp->if_capabilities; sc->ifp->if_baudrate = IF_Gbps(10); #if __FreeBSD_version >= 1000000 - sc->ifp->if_hw_tsomax = IF_HW_TSOMAX_BUILD_VALUE( - 65535 - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN) /* bytes */, - OCE_MAX_TX_ELEMENTS /* maximum frag count */, - 12 /* 4K frag size */); + sc->ifp->if_hw_tsomax = OCE_MAX_TSO_SIZE; #endif ether_ifattach(sc->ifp, sc->macaddr.mac_addr); return 0; } static void oce_add_vlan(void *arg, struct ifnet *ifp, uint16_t vtag) { POCE_SOFTC sc = ifp->if_softc; if (ifp->if_softc != arg) return; if ((vtag == 0) || (vtag > 4095)) return; sc->vlan_tag[vtag] = 1; sc->vlans_added++; if (sc->vlans_added <= (sc->max_vlans + 1)) oce_vid_config(sc); } static void oce_del_vlan(void *arg, struct ifnet *ifp, uint16_t vtag) { POCE_SOFTC sc = ifp->if_softc; if (ifp->if_softc != arg) return; if ((vtag == 0) || (vtag > 4095)) return; sc->vlan_tag[vtag] = 0; sc->vlans_added--; oce_vid_config(sc); } /* * A max of 64 vlans can be configured in BE. If the user configures * more, place the card in vlan promiscuous mode. */ static int oce_vid_config(POCE_SOFTC sc) { struct normal_vlan vtags[MAX_VLANFILTER_SIZE]; uint16_t ntags = 0, i; int status = 0; if ((sc->vlans_added <= MAX_VLANFILTER_SIZE) && (sc->ifp->if_capenable & IFCAP_VLAN_HWFILTER)) { for (i = 0; i < MAX_VLANS; i++) { if (sc->vlan_tag[i]) { vtags[ntags].vtag = i; ntags++; } } if (ntags) status = oce_config_vlan(sc, (uint8_t) sc->if_id, vtags, ntags, 1, 0); } else status = oce_config_vlan(sc, (uint8_t) sc->if_id, NULL, 0, 1, 1); return status; } static void oce_mac_addr_set(POCE_SOFTC sc) { uint32_t old_pmac_id = sc->pmac_id; int status = 0; status = bcmp((IF_LLADDR(sc->ifp)), sc->macaddr.mac_addr, sc->macaddr.size_of_struct); if (!status) return; status = oce_mbox_macaddr_add(sc, (uint8_t *)(IF_LLADDR(sc->ifp)), sc->if_id, &sc->pmac_id); if (!status) { status = oce_mbox_macaddr_del(sc, sc->if_id, old_pmac_id); bcopy((IF_LLADDR(sc->ifp)), sc->macaddr.mac_addr, sc->macaddr.size_of_struct); } if (status) device_printf(sc->dev, "Failed update macaddress\n"); } static int oce_handle_passthrough(struct ifnet *ifp, caddr_t data) { POCE_SOFTC sc = ifp->if_softc; struct ifreq *ifr = (struct ifreq *)data; int rc = ENXIO; char cookie[32] = {0}; void *priv_data = (void *)ifr->ifr_data; void *ioctl_ptr; uint32_t req_size; struct mbx_hdr req; OCE_DMA_MEM dma_mem; struct mbx_common_get_cntl_attr *fw_cmd; if (copyin(priv_data, cookie, strlen(IOCTL_COOKIE))) return EFAULT; if (memcmp(cookie, IOCTL_COOKIE, strlen(IOCTL_COOKIE))) return EINVAL; ioctl_ptr = (char *)priv_data + strlen(IOCTL_COOKIE); if (copyin(ioctl_ptr, &req, sizeof(struct mbx_hdr))) return EFAULT; req_size = le32toh(req.u0.req.request_length); if (req_size > 65536) return EINVAL; req_size += sizeof(struct mbx_hdr); rc = oce_dma_alloc(sc, req_size, &dma_mem, 0); if (rc) return ENOMEM; if (copyin(ioctl_ptr, OCE_DMAPTR(&dma_mem,char), req_size)) { rc = EFAULT; goto dma_free; } rc = oce_pass_through_mbox(sc, &dma_mem, req_size); if (rc) { rc = EIO; goto dma_free; } if (copyout(OCE_DMAPTR(&dma_mem,char), ioctl_ptr, req_size)) rc = EFAULT; /* firmware is filling all the attributes for this ioctl except the driver version..so fill it */ if(req.u0.rsp.opcode == OPCODE_COMMON_GET_CNTL_ATTRIBUTES) { fw_cmd = (struct mbx_common_get_cntl_attr *) ioctl_ptr; strncpy(fw_cmd->params.rsp.cntl_attr_info.hba_attr.drv_ver_str, COMPONENT_REVISION, strlen(COMPONENT_REVISION)); } dma_free: oce_dma_free(sc, &dma_mem); return rc; } static void oce_eqd_set_periodic(POCE_SOFTC sc) { struct oce_set_eqd set_eqd[OCE_MAX_EQ]; struct oce_aic_obj *aic; struct oce_eq *eqo; uint64_t now = 0, delta; int eqd, i, num = 0; uint32_t ips = 0; int tps; for (i = 0 ; i < sc->neqs; i++) { eqo = sc->eq[i]; aic = &sc->aic_obj[i]; /* When setting the static eq delay from the user space */ if (!aic->enable) { eqd = aic->et_eqd; goto modify_eqd; } now = ticks; /* Over flow check */ if ((now < aic->ticks) || (eqo->intr < aic->intr_prev)) goto done; delta = now - aic->ticks; tps = delta/hz; /* Interrupt rate based on elapsed ticks */ if(tps) ips = (uint32_t)(eqo->intr - aic->intr_prev) / tps; if (ips > INTR_RATE_HWM) eqd = aic->cur_eqd + 20; else if (ips < INTR_RATE_LWM) eqd = aic->cur_eqd / 2; else goto done; if (eqd < 10) eqd = 0; /* Make sure that the eq delay is in the known range */ eqd = min(eqd, aic->max_eqd); eqd = max(eqd, aic->min_eqd); modify_eqd: if (eqd != aic->cur_eqd) { set_eqd[num].delay_multiplier = (eqd * 65)/100; set_eqd[num].eq_id = eqo->eq_id; aic->cur_eqd = eqd; num++; } done: aic->intr_prev = eqo->intr; aic->ticks = now; } /* Is there atleast one eq that needs to be modified? */ if(num) oce_mbox_eqd_modify_periodic(sc, set_eqd, num); } static void oce_detect_hw_error(POCE_SOFTC sc) { uint32_t ue_low = 0, ue_high = 0, ue_low_mask = 0, ue_high_mask = 0; uint32_t sliport_status = 0, sliport_err1 = 0, sliport_err2 = 0; uint32_t i; if (sc->hw_error) return; if (IS_XE201(sc)) { sliport_status = OCE_READ_REG32(sc, db, SLIPORT_STATUS_OFFSET); if (sliport_status & SLIPORT_STATUS_ERR_MASK) { sliport_err1 = OCE_READ_REG32(sc, db, SLIPORT_ERROR1_OFFSET); sliport_err2 = OCE_READ_REG32(sc, db, SLIPORT_ERROR2_OFFSET); } } else { ue_low = OCE_READ_REG32(sc, devcfg, PCICFG_UE_STATUS_LOW); ue_high = OCE_READ_REG32(sc, devcfg, PCICFG_UE_STATUS_HIGH); ue_low_mask = OCE_READ_REG32(sc, devcfg, PCICFG_UE_STATUS_LOW_MASK); ue_high_mask = OCE_READ_REG32(sc, devcfg, PCICFG_UE_STATUS_HI_MASK); ue_low = (ue_low & ~ue_low_mask); ue_high = (ue_high & ~ue_high_mask); } /* On certain platforms BE hardware can indicate spurious UEs. * Allow the h/w to stop working completely in case of a real UE. * Hence not setting the hw_error for UE detection. */ if (sliport_status & SLIPORT_STATUS_ERR_MASK) { sc->hw_error = TRUE; device_printf(sc->dev, "Error detected in the card\n"); } if (sliport_status & SLIPORT_STATUS_ERR_MASK) { device_printf(sc->dev, "ERR: sliport status 0x%x\n", sliport_status); device_printf(sc->dev, "ERR: sliport error1 0x%x\n", sliport_err1); device_printf(sc->dev, "ERR: sliport error2 0x%x\n", sliport_err2); } if (ue_low) { for (i = 0; ue_low; ue_low >>= 1, i++) { if (ue_low & 1) device_printf(sc->dev, "UE: %s bit set\n", ue_status_low_desc[i]); } } if (ue_high) { for (i = 0; ue_high; ue_high >>= 1, i++) { if (ue_high & 1) device_printf(sc->dev, "UE: %s bit set\n", ue_status_hi_desc[i]); } } } static void oce_local_timer(void *arg) { POCE_SOFTC sc = arg; int i = 0; oce_detect_hw_error(sc); oce_refresh_nic_stats(sc); oce_refresh_queue_stats(sc); oce_mac_addr_set(sc); /* TX Watch Dog*/ for (i = 0; i < sc->nwqs; i++) oce_tx_restart(sc, sc->wq[i]); /* calculate and set the eq delay for optimal interrupt rate */ if (IS_BE(sc) || IS_SH(sc)) oce_eqd_set_periodic(sc); callout_reset(&sc->timer, hz, oce_local_timer, sc); } /* NOTE : This should only be called holding * DEVICE_LOCK. */ static void oce_if_deactivate(POCE_SOFTC sc) { int i, mtime = 0; int wait_req = 0; struct oce_rq *rq; struct oce_wq *wq; struct oce_eq *eq; sc->ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE); /*Wait for max of 400ms for TX completions to be done */ while (mtime < 400) { wait_req = 0; for_all_wq_queues(sc, wq, i) { if (wq->ring->num_used) { wait_req = 1; DELAY(1); break; } } mtime += 1; if (!wait_req) break; } /* Stop intrs and finish any bottom halves pending */ oce_hw_intr_disable(sc); /* Since taskqueue_drain takes a Gaint Lock, We should not acquire any other lock. So unlock device lock and require after completing taskqueue_drain. */ UNLOCK(&sc->dev_lock); for (i = 0; i < sc->intr_count; i++) { if (sc->intrs[i].tq != NULL) { taskqueue_drain(sc->intrs[i].tq, &sc->intrs[i].task); } } LOCK(&sc->dev_lock); /* Delete RX queue in card with flush param */ oce_stop_rx(sc); /* Invalidate any pending cq and eq entries*/ for_all_evnt_queues(sc, eq, i) oce_drain_eq(eq); for_all_rq_queues(sc, rq, i) oce_drain_rq_cq(rq); for_all_wq_queues(sc, wq, i) oce_drain_wq_cq(wq); /* But still we need to get MCC aync events. So enable intrs and also arm first EQ */ oce_hw_intr_enable(sc); oce_arm_eq(sc, sc->eq[0]->eq_id, 0, TRUE, FALSE); DELAY(10); } static void oce_if_activate(POCE_SOFTC sc) { struct oce_eq *eq; struct oce_rq *rq; struct oce_wq *wq; int i, rc = 0; sc->ifp->if_drv_flags |= IFF_DRV_RUNNING; oce_hw_intr_disable(sc); oce_start_rx(sc); for_all_rq_queues(sc, rq, i) { rc = oce_start_rq(rq); if (rc) device_printf(sc->dev, "Unable to start RX\n"); } for_all_wq_queues(sc, wq, i) { rc = oce_start_wq(wq); if (rc) device_printf(sc->dev, "Unable to start TX\n"); } for_all_evnt_queues(sc, eq, i) oce_arm_eq(sc, eq->eq_id, 0, TRUE, FALSE); oce_hw_intr_enable(sc); } static void process_link_state(POCE_SOFTC sc, struct oce_async_cqe_link_state *acqe) { /* Update Link status */ if ((acqe->u0.s.link_status & ~ASYNC_EVENT_LOGICAL) == ASYNC_EVENT_LINK_UP) { sc->link_status = ASYNC_EVENT_LINK_UP; if_link_state_change(sc->ifp, LINK_STATE_UP); } else { sc->link_status = ASYNC_EVENT_LINK_DOWN; if_link_state_change(sc->ifp, LINK_STATE_DOWN); } } /* Handle the Completion Queue for the Mailbox/Async notifications */ uint16_t oce_mq_handler(void *arg) { struct oce_mq *mq = (struct oce_mq *)arg; POCE_SOFTC sc = mq->parent; struct oce_cq *cq = mq->cq; int num_cqes = 0, evt_type = 0, optype = 0; struct oce_mq_cqe *cqe; struct oce_async_cqe_link_state *acqe; struct oce_async_event_grp5_pvid_state *gcqe; struct oce_async_event_qnq *dbgcqe; bus_dmamap_sync(cq->ring->dma.tag, cq->ring->dma.map, BUS_DMASYNC_POSTWRITE); cqe = RING_GET_CONSUMER_ITEM_VA(cq->ring, struct oce_mq_cqe); while (cqe->u0.dw[3]) { DW_SWAP((uint32_t *) cqe, sizeof(oce_mq_cqe)); if (cqe->u0.s.async_event) { evt_type = cqe->u0.s.event_type; optype = cqe->u0.s.async_type; if (evt_type == ASYNC_EVENT_CODE_LINK_STATE) { /* Link status evt */ acqe = (struct oce_async_cqe_link_state *)cqe; process_link_state(sc, acqe); } else if ((evt_type == ASYNC_EVENT_GRP5) && (optype == ASYNC_EVENT_PVID_STATE)) { /* GRP5 PVID */ gcqe = (struct oce_async_event_grp5_pvid_state *)cqe; if (gcqe->enabled) sc->pvid = gcqe->tag & VLAN_VID_MASK; else sc->pvid = 0; } else if(evt_type == ASYNC_EVENT_CODE_DEBUG && optype == ASYNC_EVENT_DEBUG_QNQ) { dbgcqe = (struct oce_async_event_qnq *)cqe; if(dbgcqe->valid) sc->qnqid = dbgcqe->vlan_tag; sc->qnq_debug_event = TRUE; } } cqe->u0.dw[3] = 0; RING_GET(cq->ring, 1); bus_dmamap_sync(cq->ring->dma.tag, cq->ring->dma.map, BUS_DMASYNC_POSTWRITE); cqe = RING_GET_CONSUMER_ITEM_VA(cq->ring, struct oce_mq_cqe); num_cqes++; } if (num_cqes) oce_arm_cq(sc, cq->cq_id, num_cqes, FALSE); return 0; } static void setup_max_queues_want(POCE_SOFTC sc) { /* Check if it is FLEX machine. Is so dont use RSS */ if ((sc->function_mode & FNM_FLEX10_MODE) || (sc->function_mode & FNM_UMC_MODE) || (sc->function_mode & FNM_VNIC_MODE) || (!is_rss_enabled(sc)) || IS_BE2(sc)) { sc->nrqs = 1; sc->nwqs = 1; } else { sc->nrqs = MIN(OCE_NCPUS, sc->nrssqs) + 1; sc->nwqs = MIN(OCE_NCPUS, sc->nrssqs); } if (IS_BE2(sc) && is_rss_enabled(sc)) sc->nrqs = MIN(OCE_NCPUS, sc->nrssqs) + 1; } static void update_queues_got(POCE_SOFTC sc) { if (is_rss_enabled(sc)) { sc->nrqs = sc->intr_count + 1; sc->nwqs = sc->intr_count; } else { sc->nrqs = 1; sc->nwqs = 1; } if (IS_BE2(sc)) sc->nwqs = 1; } static int oce_check_ipv6_ext_hdr(struct mbuf *m) { struct ether_header *eh = mtod(m, struct ether_header *); caddr_t m_datatemp = m->m_data; if (eh->ether_type == htons(ETHERTYPE_IPV6)) { m->m_data += sizeof(struct ether_header); struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); if((ip6->ip6_nxt != IPPROTO_TCP) && \ (ip6->ip6_nxt != IPPROTO_UDP)){ struct ip6_ext *ip6e = NULL; m->m_data += sizeof(struct ip6_hdr); ip6e = (struct ip6_ext *) mtod(m, struct ip6_ext *); if(ip6e->ip6e_len == 0xff) { m->m_data = m_datatemp; return TRUE; } } m->m_data = m_datatemp; } return FALSE; } static int is_be3_a1(POCE_SOFTC sc) { if((sc->flags & OCE_FLAGS_BE3) && ((sc->asic_revision & 0xFF) < 2)) { return TRUE; } return FALSE; } static struct mbuf * oce_insert_vlan_tag(POCE_SOFTC sc, struct mbuf *m, boolean_t *complete) { uint16_t vlan_tag = 0; if(!M_WRITABLE(m)) return NULL; /* Embed vlan tag in the packet if it is not part of it */ if(m->m_flags & M_VLANTAG) { vlan_tag = EVL_VLANOFTAG(m->m_pkthdr.ether_vtag); m->m_flags &= ~M_VLANTAG; } /* if UMC, ignore vlan tag insertion and instead insert pvid */ if(sc->pvid) { if(!vlan_tag) vlan_tag = sc->pvid; *complete = FALSE; } if(vlan_tag) { m = ether_vlanencap(m, vlan_tag); } if(sc->qnqid) { m = ether_vlanencap(m, sc->qnqid); *complete = FALSE; } return m; } static int oce_tx_asic_stall_verify(POCE_SOFTC sc, struct mbuf *m) { if(is_be3_a1(sc) && IS_QNQ_OR_UMC(sc) && \ oce_check_ipv6_ext_hdr(m)) { return TRUE; } return FALSE; } static void oce_get_config(POCE_SOFTC sc) { int rc = 0; uint32_t max_rss = 0; if ((IS_BE(sc) || IS_SH(sc)) && (!sc->be3_native)) max_rss = OCE_LEGACY_MODE_RSS; else max_rss = OCE_MAX_RSS; if (!IS_BE(sc)) { rc = oce_get_profile_config(sc, max_rss); if (rc) { sc->nwqs = OCE_MAX_WQ; sc->nrssqs = max_rss; sc->nrqs = sc->nrssqs + 1; } } else { /* For BE3 don't rely on fw for determining the resources */ sc->nrssqs = max_rss; sc->nrqs = sc->nrssqs + 1; sc->nwqs = OCE_MAX_WQ; sc->max_vlans = MAX_VLANFILTER_SIZE; } } Index: head/sys/dev/oce/oce_if.h =================================================================== --- head/sys/dev/oce/oce_if.h (revision 271550) +++ head/sys/dev/oce/oce_if.h (revision 271551) @@ -1,1160 +1,1161 @@ /*- * Copyright (C) 2013 Emulex * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * 3. Neither the name of the Emulex Corporation nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * * Contact Information: * freebsd-drivers@emulex.com * * Emulex * 3333 Susan Street * Costa Mesa, CA 92626 */ /* $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "oce_hw.h" /* OCE device driver module component revision informaiton */ #define COMPONENT_REVISION "10.0.664.0" /* OCE devices supported by this driver */ #define PCI_VENDOR_EMULEX 0x10df /* Emulex */ #define PCI_VENDOR_SERVERENGINES 0x19a2 /* ServerEngines (BE) */ #define PCI_PRODUCT_BE2 0x0700 /* BE2 network adapter */ #define PCI_PRODUCT_BE3 0x0710 /* BE3 network adapter */ #define PCI_PRODUCT_XE201 0xe220 /* XE201 network adapter */ #define PCI_PRODUCT_XE201_VF 0xe228 /* XE201 with VF in Lancer */ #define PCI_PRODUCT_SH 0x0720 /* Skyhawk network adapter */ #define IS_BE(sc) (((sc->flags & OCE_FLAGS_BE3) | \ (sc->flags & OCE_FLAGS_BE2))? 1:0) #define IS_BE3(sc) (sc->flags & OCE_FLAGS_BE3) #define IS_BE2(sc) (sc->flags & OCE_FLAGS_BE2) #define IS_XE201(sc) ((sc->flags & OCE_FLAGS_XE201) ? 1:0) #define HAS_A0_CHIP(sc) ((sc->flags & OCE_FLAGS_HAS_A0_CHIP) ? 1:0) #define IS_SH(sc) ((sc->flags & OCE_FLAGS_SH) ? 1 : 0) #define is_be_mode_mc(sc) ((sc->function_mode & FNM_FLEX10_MODE) || \ (sc->function_mode & FNM_UMC_MODE) || \ (sc->function_mode & FNM_VNIC_MODE)) #define OCE_FUNCTION_CAPS_SUPER_NIC 0x40 #define IS_PROFILE_SUPER_NIC(sc) (sc->function_caps & OCE_FUNCTION_CAPS_SUPER_NIC) /* proportion Service Level Interface queues */ #define OCE_MAX_UNITS 2 #define OCE_MAX_PPORT OCE_MAX_UNITS #define OCE_MAX_VPORT OCE_MAX_UNITS extern int mp_ncpus; /* system's total active cpu cores */ #define OCE_NCPUS mp_ncpus /* This should be powers of 2. Like 2,4,8 & 16 */ #define OCE_MAX_RSS 8 #define OCE_LEGACY_MODE_RSS 4 /* For BE3 Legacy mode*/ #define is_rss_enabled(sc) ((sc->function_caps & FNC_RSS) && !is_be_mode_mc(sc)) #define OCE_MIN_RQ 1 #define OCE_MIN_WQ 1 #define OCE_MAX_RQ OCE_MAX_RSS + 1 /* one default queue */ #define OCE_MAX_WQ 8 #define OCE_MAX_EQ 32 #define OCE_MAX_CQ OCE_MAX_RQ + OCE_MAX_WQ + 1 /* one MCC queue */ #define OCE_MAX_CQ_EQ 8 /* Max CQ that can attached to an EQ */ #define OCE_DEFAULT_WQ_EQD 16 #define OCE_MAX_PACKET_Q 16 #define OCE_RQ_BUF_SIZE 2048 #define OCE_LSO_MAX_SIZE (64 * 1024) #define LONG_TIMEOUT 30 #define OCE_MAX_JUMBO_FRAME_SIZE 9018 #define OCE_MAX_MTU (OCE_MAX_JUMBO_FRAME_SIZE - \ ETHER_VLAN_ENCAP_LEN - \ ETHER_HDR_LEN) #define OCE_MAX_TX_ELEMENTS 29 #define OCE_MAX_TX_DESC 1024 #define OCE_MAX_TX_SIZE 65535 +#define OCE_MAX_TSO_SIZE (65535 - ETHER_HDR_LEN) #define OCE_MAX_RX_SIZE 4096 #define OCE_MAX_RQ_POSTS 255 #define OCE_DEFAULT_PROMISCUOUS 0 #define RSS_ENABLE_IPV4 0x1 #define RSS_ENABLE_TCP_IPV4 0x2 #define RSS_ENABLE_IPV6 0x4 #define RSS_ENABLE_TCP_IPV6 0x8 #define INDIRECTION_TABLE_ENTRIES 128 /* flow control definitions */ #define OCE_FC_NONE 0x00000000 #define OCE_FC_TX 0x00000001 #define OCE_FC_RX 0x00000002 #define OCE_DEFAULT_FLOW_CONTROL (OCE_FC_TX | OCE_FC_RX) /* Interface capabilities to give device when creating interface */ #define OCE_CAPAB_FLAGS (MBX_RX_IFACE_FLAGS_BROADCAST | \ MBX_RX_IFACE_FLAGS_UNTAGGED | \ MBX_RX_IFACE_FLAGS_PROMISCUOUS | \ MBX_RX_IFACE_FLAGS_VLAN_PROMISCUOUS | \ MBX_RX_IFACE_FLAGS_MCAST_PROMISCUOUS | \ MBX_RX_IFACE_FLAGS_RSS | \ MBX_RX_IFACE_FLAGS_PASS_L3L4_ERR) /* Interface capabilities to enable by default (others set dynamically) */ #define OCE_CAPAB_ENABLE (MBX_RX_IFACE_FLAGS_BROADCAST | \ MBX_RX_IFACE_FLAGS_UNTAGGED | \ MBX_RX_IFACE_FLAGS_PASS_L3L4_ERR) #define OCE_IF_HWASSIST (CSUM_IP | CSUM_TCP | CSUM_UDP) #define OCE_IF_CAPABILITIES (IFCAP_VLAN_MTU | IFCAP_VLAN_HWTAGGING | \ IFCAP_HWCSUM | IFCAP_VLAN_HWCSUM | \ IFCAP_JUMBO_MTU | IFCAP_VLAN_MTU) #define OCE_IF_HWASSIST_NONE 0 #define OCE_IF_CAPABILITIES_NONE 0 #define ETH_ADDR_LEN 6 #define MAX_VLANFILTER_SIZE 64 #define MAX_VLANS 4096 #define upper_32_bits(n) ((uint32_t)(((n) >> 16) >> 16)) #define BSWAP_8(x) ((x) & 0xff) #define BSWAP_16(x) ((BSWAP_8(x) << 8) | BSWAP_8((x) >> 8)) #define BSWAP_32(x) ((BSWAP_16(x) << 16) | \ BSWAP_16((x) >> 16)) #define BSWAP_64(x) ((BSWAP_32(x) << 32) | \ BSWAP_32((x) >> 32)) #define for_all_wq_queues(sc, wq, i) \ for (i = 0, wq = sc->wq[0]; i < sc->nwqs; i++, wq = sc->wq[i]) #define for_all_rq_queues(sc, rq, i) \ for (i = 0, rq = sc->rq[0]; i < sc->nrqs; i++, rq = sc->rq[i]) #define for_all_rss_queues(sc, rq, i) \ for (i = 0, rq = sc->rq[i + 1]; i < (sc->nrqs - 1); \ i++, rq = sc->rq[i + 1]) #define for_all_evnt_queues(sc, eq, i) \ for (i = 0, eq = sc->eq[0]; i < sc->neqs; i++, eq = sc->eq[i]) #define for_all_cq_queues(sc, cq, i) \ for (i = 0, cq = sc->cq[0]; i < sc->ncqs; i++, cq = sc->cq[i]) /* Flash specific */ #define IOCTL_COOKIE "SERVERENGINES CORP" #define MAX_FLASH_COMP 32 #define IMG_ISCSI 160 #define IMG_REDBOOT 224 #define IMG_BIOS 34 #define IMG_PXEBIOS 32 #define IMG_FCOEBIOS 33 #define IMG_ISCSI_BAK 176 #define IMG_FCOE 162 #define IMG_FCOE_BAK 178 #define IMG_NCSI 16 #define IMG_PHY 192 #define FLASHROM_OPER_FLASH 1 #define FLASHROM_OPER_SAVE 2 #define FLASHROM_OPER_REPORT 4 #define FLASHROM_OPER_FLASH_PHY 9 #define FLASHROM_OPER_SAVE_PHY 10 #define TN_8022 13 enum { PHY_TYPE_CX4_10GB = 0, PHY_TYPE_XFP_10GB, PHY_TYPE_SFP_1GB, PHY_TYPE_SFP_PLUS_10GB, PHY_TYPE_KR_10GB, PHY_TYPE_KX4_10GB, PHY_TYPE_BASET_10GB, PHY_TYPE_BASET_1GB, PHY_TYPE_BASEX_1GB, PHY_TYPE_SGMII, PHY_TYPE_DISABLED = 255 }; /** * @brief Define and hold all necessary info for a single interrupt */ #define OCE_MAX_MSI 32 /* Message Signaled Interrupts */ #define OCE_MAX_MSIX 2048 /* PCI Express MSI Interrrupts */ typedef struct oce_intr_info { void *tag; /* cookie returned by bus_setup_intr */ struct resource *intr_res; /* PCI resource container */ int irq_rr; /* resource id for the interrupt */ struct oce_softc *sc; /* pointer to the parent soft c */ struct oce_eq *eq; /* pointer to the connected EQ */ struct taskqueue *tq; /* Associated task queue */ struct task task; /* task queue task */ char task_name[32]; /* task name */ int vector; /* interrupt vector number */ } OCE_INTR_INFO, *POCE_INTR_INFO; /* Ring related */ #define GET_Q_NEXT(_START, _STEP, _END) \ (((_START) + (_STEP)) < (_END) ? ((_START) + (_STEP)) \ : (((_START) + (_STEP)) - (_END))) #define DBUF_PA(obj) ((obj)->addr) #define DBUF_VA(obj) ((obj)->ptr) #define DBUF_TAG(obj) ((obj)->tag) #define DBUF_MAP(obj) ((obj)->map) #define DBUF_SYNC(obj, flags) \ (void) bus_dmamap_sync(DBUF_TAG(obj), DBUF_MAP(obj), (flags)) #define RING_NUM_PENDING(ring) ring->num_used #define RING_FULL(ring) (ring->num_used == ring->num_items) #define RING_EMPTY(ring) (ring->num_used == 0) #define RING_NUM_FREE(ring) \ (uint32_t)(ring->num_items - ring->num_used) #define RING_GET(ring, n) \ ring->cidx = GET_Q_NEXT(ring->cidx, n, ring->num_items) #define RING_PUT(ring, n) \ ring->pidx = GET_Q_NEXT(ring->pidx, n, ring->num_items) #define RING_GET_CONSUMER_ITEM_VA(ring, type) \ (void*)((type *)DBUF_VA(&ring->dma) + ring->cidx) #define RING_GET_CONSUMER_ITEM_PA(ring, type) \ (uint64_t)(((type *)DBUF_PA(ring->dbuf)) + ring->cidx) #define RING_GET_PRODUCER_ITEM_VA(ring, type) \ (void *)(((type *)DBUF_VA(&ring->dma)) + ring->pidx) #define RING_GET_PRODUCER_ITEM_PA(ring, type) \ (uint64_t)(((type *)DBUF_PA(ring->dbuf)) + ring->pidx) #define OCE_DMAPTR(o, c) ((c *)(o)->ptr) struct oce_packet_desc { struct mbuf *mbuf; bus_dmamap_t map; int nsegs; uint32_t wqe_idx; }; typedef struct oce_dma_mem { bus_dma_tag_t tag; bus_dmamap_t map; void *ptr; bus_addr_t paddr; } OCE_DMA_MEM, *POCE_DMA_MEM; typedef struct oce_ring_buffer_s { uint16_t cidx; /* Get ptr */ uint16_t pidx; /* Put Ptr */ size_t item_size; size_t num_items; uint32_t num_used; OCE_DMA_MEM dma; } oce_ring_buffer_t; /* Stats */ #define OCE_UNICAST_PACKET 0 #define OCE_MULTICAST_PACKET 1 #define OCE_BROADCAST_PACKET 2 #define OCE_RSVD_PACKET 3 struct oce_rx_stats { /* Total Receive Stats*/ uint64_t t_rx_pkts; uint64_t t_rx_bytes; uint32_t t_rx_frags; uint32_t t_rx_mcast_pkts; uint32_t t_rx_ucast_pkts; uint32_t t_rxcp_errs; }; struct oce_tx_stats { /*Total Transmit Stats */ uint64_t t_tx_pkts; uint64_t t_tx_bytes; uint32_t t_tx_reqs; uint32_t t_tx_stops; uint32_t t_tx_wrbs; uint32_t t_tx_compl; uint32_t t_ipv6_ext_hdr_tx_drop; }; struct oce_be_stats { uint8_t be_on_die_temperature; uint32_t be_tx_events; uint32_t eth_red_drops; uint32_t rx_drops_no_pbuf; uint32_t rx_drops_no_txpb; uint32_t rx_drops_no_erx_descr; uint32_t rx_drops_no_tpre_descr; uint32_t rx_drops_too_many_frags; uint32_t rx_drops_invalid_ring; uint32_t forwarded_packets; uint32_t rx_drops_mtu; uint32_t rx_crc_errors; uint32_t rx_alignment_symbol_errors; uint32_t rx_pause_frames; uint32_t rx_priority_pause_frames; uint32_t rx_control_frames; uint32_t rx_in_range_errors; uint32_t rx_out_range_errors; uint32_t rx_frame_too_long; uint32_t rx_address_match_errors; uint32_t rx_dropped_too_small; uint32_t rx_dropped_too_short; uint32_t rx_dropped_header_too_small; uint32_t rx_dropped_tcp_length; uint32_t rx_dropped_runt; uint32_t rx_ip_checksum_errs; uint32_t rx_tcp_checksum_errs; uint32_t rx_udp_checksum_errs; uint32_t rx_switched_unicast_packets; uint32_t rx_switched_multicast_packets; uint32_t rx_switched_broadcast_packets; uint32_t tx_pauseframes; uint32_t tx_priority_pauseframes; uint32_t tx_controlframes; uint32_t rxpp_fifo_overflow_drop; uint32_t rx_input_fifo_overflow_drop; uint32_t pmem_fifo_overflow_drop; uint32_t jabber_events; }; struct oce_xe201_stats { uint64_t tx_pkts; uint64_t tx_unicast_pkts; uint64_t tx_multicast_pkts; uint64_t tx_broadcast_pkts; uint64_t tx_bytes; uint64_t tx_unicast_bytes; uint64_t tx_multicast_bytes; uint64_t tx_broadcast_bytes; uint64_t tx_discards; uint64_t tx_errors; uint64_t tx_pause_frames; uint64_t tx_pause_on_frames; uint64_t tx_pause_off_frames; uint64_t tx_internal_mac_errors; uint64_t tx_control_frames; uint64_t tx_pkts_64_bytes; uint64_t tx_pkts_65_to_127_bytes; uint64_t tx_pkts_128_to_255_bytes; uint64_t tx_pkts_256_to_511_bytes; uint64_t tx_pkts_512_to_1023_bytes; uint64_t tx_pkts_1024_to_1518_bytes; uint64_t tx_pkts_1519_to_2047_bytes; uint64_t tx_pkts_2048_to_4095_bytes; uint64_t tx_pkts_4096_to_8191_bytes; uint64_t tx_pkts_8192_to_9216_bytes; uint64_t tx_lso_pkts; uint64_t rx_pkts; uint64_t rx_unicast_pkts; uint64_t rx_multicast_pkts; uint64_t rx_broadcast_pkts; uint64_t rx_bytes; uint64_t rx_unicast_bytes; uint64_t rx_multicast_bytes; uint64_t rx_broadcast_bytes; uint32_t rx_unknown_protos; uint64_t rx_discards; uint64_t rx_errors; uint64_t rx_crc_errors; uint64_t rx_alignment_errors; uint64_t rx_symbol_errors; uint64_t rx_pause_frames; uint64_t rx_pause_on_frames; uint64_t rx_pause_off_frames; uint64_t rx_frames_too_long; uint64_t rx_internal_mac_errors; uint32_t rx_undersize_pkts; uint32_t rx_oversize_pkts; uint32_t rx_fragment_pkts; uint32_t rx_jabbers; uint64_t rx_control_frames; uint64_t rx_control_frames_unknown_opcode; uint32_t rx_in_range_errors; uint32_t rx_out_of_range_errors; uint32_t rx_address_match_errors; uint32_t rx_vlan_mismatch_errors; uint32_t rx_dropped_too_small; uint32_t rx_dropped_too_short; uint32_t rx_dropped_header_too_small; uint32_t rx_dropped_invalid_tcp_length; uint32_t rx_dropped_runt; uint32_t rx_ip_checksum_errors; uint32_t rx_tcp_checksum_errors; uint32_t rx_udp_checksum_errors; uint32_t rx_non_rss_pkts; uint64_t rx_ipv4_pkts; uint64_t rx_ipv6_pkts; uint64_t rx_ipv4_bytes; uint64_t rx_ipv6_bytes; uint64_t rx_nic_pkts; uint64_t rx_tcp_pkts; uint64_t rx_iscsi_pkts; uint64_t rx_management_pkts; uint64_t rx_switched_unicast_pkts; uint64_t rx_switched_multicast_pkts; uint64_t rx_switched_broadcast_pkts; uint64_t num_forwards; uint32_t rx_fifo_overflow; uint32_t rx_input_fifo_overflow; uint64_t rx_drops_too_many_frags; uint32_t rx_drops_invalid_queue; uint64_t rx_drops_mtu; uint64_t rx_pkts_64_bytes; uint64_t rx_pkts_65_to_127_bytes; uint64_t rx_pkts_128_to_255_bytes; uint64_t rx_pkts_256_to_511_bytes; uint64_t rx_pkts_512_to_1023_bytes; uint64_t rx_pkts_1024_to_1518_bytes; uint64_t rx_pkts_1519_to_2047_bytes; uint64_t rx_pkts_2048_to_4095_bytes; uint64_t rx_pkts_4096_to_8191_bytes; uint64_t rx_pkts_8192_to_9216_bytes; }; struct oce_drv_stats { struct oce_rx_stats rx; struct oce_tx_stats tx; union { struct oce_be_stats be; struct oce_xe201_stats xe201; } u0; }; #define INTR_RATE_HWM 15000 #define INTR_RATE_LWM 10000 #define OCE_MAX_EQD 128u #define OCE_MIN_EQD 50u struct oce_set_eqd { uint32_t eq_id; uint32_t phase; uint32_t delay_multiplier; }; struct oce_aic_obj { /* Adaptive interrupt coalescing (AIC) info */ boolean_t enable; uint32_t min_eqd; /* in usecs */ uint32_t max_eqd; /* in usecs */ uint32_t cur_eqd; /* in usecs */ uint32_t et_eqd; /* configured value when aic is off */ uint64_t ticks; uint64_t intr_prev; }; #define MAX_LOCK_DESC_LEN 32 struct oce_lock { struct mtx mutex; char name[MAX_LOCK_DESC_LEN+1]; }; #define OCE_LOCK struct oce_lock #define LOCK_CREATE(lock, desc) { \ strncpy((lock)->name, (desc), MAX_LOCK_DESC_LEN); \ (lock)->name[MAX_LOCK_DESC_LEN] = '\0'; \ mtx_init(&(lock)->mutex, (lock)->name, NULL, MTX_DEF); \ } #define LOCK_DESTROY(lock) \ if (mtx_initialized(&(lock)->mutex))\ mtx_destroy(&(lock)->mutex) #define TRY_LOCK(lock) mtx_trylock(&(lock)->mutex) #define LOCK(lock) mtx_lock(&(lock)->mutex) #define LOCKED(lock) mtx_owned(&(lock)->mutex) #define UNLOCK(lock) mtx_unlock(&(lock)->mutex) #define DEFAULT_MQ_MBOX_TIMEOUT (5 * 1000 * 1000) #define MBX_READY_TIMEOUT (1 * 1000 * 1000) #define DEFAULT_DRAIN_TIME 200 #define MBX_TIMEOUT_SEC 5 #define STAT_TIMEOUT 2000000 /* size of the packet descriptor array in a transmit queue */ #define OCE_TX_RING_SIZE 2048 #define OCE_RX_RING_SIZE 1024 #define OCE_WQ_PACKET_ARRAY_SIZE (OCE_TX_RING_SIZE/2) #define OCE_RQ_PACKET_ARRAY_SIZE (OCE_RX_RING_SIZE) struct oce_dev; enum eq_len { EQ_LEN_256 = 256, EQ_LEN_512 = 512, EQ_LEN_1024 = 1024, EQ_LEN_2048 = 2048, EQ_LEN_4096 = 4096 }; enum eqe_size { EQE_SIZE_4 = 4, EQE_SIZE_16 = 16 }; enum qtype { QTYPE_EQ, QTYPE_MQ, QTYPE_WQ, QTYPE_RQ, QTYPE_CQ, QTYPE_RSS }; typedef enum qstate_e { QDELETED = 0x0, QCREATED = 0x1 } qstate_t; struct eq_config { enum eq_len q_len; enum eqe_size item_size; uint32_t q_vector_num; uint8_t min_eqd; uint8_t max_eqd; uint8_t cur_eqd; uint8_t pad; }; struct oce_eq { uint32_t eq_id; void *parent; void *cb_context; oce_ring_buffer_t *ring; uint32_t ref_count; qstate_t qstate; struct oce_cq *cq[OCE_MAX_CQ_EQ]; int cq_valid; struct eq_config eq_cfg; int vector; uint64_t intr; }; enum cq_len { CQ_LEN_256 = 256, CQ_LEN_512 = 512, CQ_LEN_1024 = 1024 }; struct cq_config { enum cq_len q_len; uint32_t item_size; boolean_t is_eventable; boolean_t sol_eventable; boolean_t nodelay; uint16_t dma_coalescing; }; typedef uint16_t(*cq_handler_t) (void *arg1); struct oce_cq { uint32_t cq_id; void *parent; struct oce_eq *eq; cq_handler_t cq_handler; void *cb_arg; oce_ring_buffer_t *ring; qstate_t qstate; struct cq_config cq_cfg; uint32_t ref_count; }; struct mq_config { uint32_t eqd; uint8_t q_len; uint8_t pad[3]; }; struct oce_mq { void *parent; oce_ring_buffer_t *ring; uint32_t mq_id; struct oce_cq *cq; struct oce_cq *async_cq; uint32_t mq_free; qstate_t qstate; struct mq_config cfg; }; struct oce_mbx_ctx { struct oce_mbx *mbx; void (*cb) (void *ctx); void *cb_ctx; }; struct wq_config { uint8_t wq_type; uint16_t buf_size; uint8_t pad[1]; uint32_t q_len; uint16_t pd_id; uint16_t pci_fn_num; uint32_t eqd; /* interrupt delay */ uint32_t nbufs; uint32_t nhdl; }; struct oce_tx_queue_stats { uint64_t tx_pkts; uint64_t tx_bytes; uint32_t tx_reqs; uint32_t tx_stops; /* number of times TX Q was stopped */ uint32_t tx_wrbs; uint32_t tx_compl; uint32_t tx_rate; uint32_t ipv6_ext_hdr_tx_drop; }; struct oce_wq { OCE_LOCK tx_lock; void *parent; oce_ring_buffer_t *ring; struct oce_cq *cq; bus_dma_tag_t tag; struct oce_packet_desc pckts[OCE_WQ_PACKET_ARRAY_SIZE]; uint32_t pkt_desc_tail; uint32_t pkt_desc_head; uint32_t wqm_used; boolean_t resched; uint32_t wq_free; uint32_t tx_deferd; uint32_t pkt_drops; qstate_t qstate; uint16_t wq_id; struct wq_config cfg; int queue_index; struct oce_tx_queue_stats tx_stats; struct buf_ring *br; struct task txtask; uint32_t db_offset; }; struct rq_config { uint32_t q_len; uint32_t frag_size; uint32_t mtu; uint32_t if_id; uint32_t is_rss_queue; uint32_t eqd; uint32_t nbufs; }; struct oce_rx_queue_stats { uint32_t rx_post_fail; uint32_t rx_ucast_pkts; uint32_t rx_compl; uint64_t rx_bytes; uint64_t rx_bytes_prev; uint64_t rx_pkts; uint32_t rx_rate; uint32_t rx_mcast_pkts; uint32_t rxcp_err; uint32_t rx_frags; uint32_t prev_rx_frags; uint32_t rx_fps; }; struct oce_rq { struct rq_config cfg; uint32_t rq_id; int queue_index; uint32_t rss_cpuid; void *parent; oce_ring_buffer_t *ring; struct oce_cq *cq; void *pad1; bus_dma_tag_t tag; struct oce_packet_desc pckts[OCE_RQ_PACKET_ARRAY_SIZE]; uint32_t packets_in; uint32_t packets_out; uint32_t pending; #ifdef notdef struct mbuf *head; struct mbuf *tail; int fragsleft; #endif qstate_t qstate; OCE_LOCK rx_lock; struct oce_rx_queue_stats rx_stats; struct lro_ctrl lro; int lro_pkts_queued; }; struct link_status { uint8_t phys_port_speed; uint8_t logical_link_status; uint16_t qos_link_speed; }; #define OCE_FLAGS_PCIX 0x00000001 #define OCE_FLAGS_PCIE 0x00000002 #define OCE_FLAGS_MSI_CAPABLE 0x00000004 #define OCE_FLAGS_MSIX_CAPABLE 0x00000008 #define OCE_FLAGS_USING_MSI 0x00000010 #define OCE_FLAGS_USING_MSIX 0x00000020 #define OCE_FLAGS_FUNCRESET_RQD 0x00000040 #define OCE_FLAGS_VIRTUAL_PORT 0x00000080 #define OCE_FLAGS_MBOX_ENDIAN_RQD 0x00000100 #define OCE_FLAGS_BE3 0x00000200 #define OCE_FLAGS_XE201 0x00000400 #define OCE_FLAGS_BE2 0x00000800 #define OCE_FLAGS_SH 0x00001000 #define OCE_DEV_BE2_CFG_BAR 1 #define OCE_DEV_CFG_BAR 0 #define OCE_PCI_CSR_BAR 2 #define OCE_PCI_DB_BAR 4 typedef struct oce_softc { device_t dev; OCE_LOCK dev_lock; uint32_t flags; uint32_t pcie_link_speed; uint32_t pcie_link_width; uint8_t fn; /* PCI function number */ struct resource *devcfg_res; bus_space_tag_t devcfg_btag; bus_space_handle_t devcfg_bhandle; void *devcfg_vhandle; struct resource *csr_res; bus_space_tag_t csr_btag; bus_space_handle_t csr_bhandle; void *csr_vhandle; struct resource *db_res; bus_space_tag_t db_btag; bus_space_handle_t db_bhandle; void *db_vhandle; OCE_INTR_INFO intrs[OCE_MAX_EQ]; int intr_count; struct ifnet *ifp; struct ifmedia media; uint8_t link_status; uint8_t link_speed; uint8_t duplex; uint32_t qos_link_speed; uint32_t speed; char fw_version[32]; struct mac_address_format macaddr; OCE_DMA_MEM bsmbx; OCE_LOCK bmbx_lock; uint32_t config_number; uint32_t asic_revision; uint32_t port_id; uint32_t function_mode; uint32_t function_caps; uint32_t max_tx_rings; uint32_t max_rx_rings; struct oce_wq *wq[OCE_MAX_WQ]; /* TX work queues */ struct oce_rq *rq[OCE_MAX_RQ]; /* RX work queues */ struct oce_cq *cq[OCE_MAX_CQ]; /* Completion queues */ struct oce_eq *eq[OCE_MAX_EQ]; /* Event queues */ struct oce_mq *mq; /* Mailbox queue */ uint32_t neqs; uint32_t ncqs; uint32_t nrqs; uint32_t nwqs; uint32_t nrssqs; uint32_t tx_ring_size; uint32_t rx_ring_size; uint32_t rq_frag_size; uint32_t if_id; /* interface ID */ uint32_t nifs; /* number of adapter interfaces, 0 or 1 */ uint32_t pmac_id; /* PMAC id */ uint32_t if_cap_flags; uint32_t flow_control; uint8_t promisc; struct oce_aic_obj aic_obj[OCE_MAX_EQ]; /*Vlan Filtering related */ eventhandler_tag vlan_attach; eventhandler_tag vlan_detach; uint16_t vlans_added; uint8_t vlan_tag[MAX_VLANS]; /*stats */ OCE_DMA_MEM stats_mem; struct oce_drv_stats oce_stats_info; struct callout timer; int8_t be3_native; uint8_t hw_error; uint16_t qnq_debug_event; uint16_t qnqid; uint32_t pvid; uint32_t max_vlans; } OCE_SOFTC, *POCE_SOFTC; /************************************************** * BUS memory read/write macros * BE3: accesses three BAR spaces (CFG, CSR, DB) * Lancer: accesses one BAR space (CFG) **************************************************/ #define OCE_READ_CSR_MPU(sc, space, o) \ ((IS_BE(sc)) ? (bus_space_read_4((sc)->space##_btag, \ (sc)->space##_bhandle,o)) \ : (bus_space_read_4((sc)->devcfg_btag, \ (sc)->devcfg_bhandle,o))) #define OCE_READ_REG32(sc, space, o) \ ((IS_BE(sc) || IS_SH(sc)) ? (bus_space_read_4((sc)->space##_btag, \ (sc)->space##_bhandle,o)) \ : (bus_space_read_4((sc)->devcfg_btag, \ (sc)->devcfg_bhandle,o))) #define OCE_READ_REG16(sc, space, o) \ ((IS_BE(sc) || IS_SH(sc)) ? (bus_space_read_2((sc)->space##_btag, \ (sc)->space##_bhandle,o)) \ : (bus_space_read_2((sc)->devcfg_btag, \ (sc)->devcfg_bhandle,o))) #define OCE_READ_REG8(sc, space, o) \ ((IS_BE(sc) || IS_SH(sc)) ? (bus_space_read_1((sc)->space##_btag, \ (sc)->space##_bhandle,o)) \ : (bus_space_read_1((sc)->devcfg_btag, \ (sc)->devcfg_bhandle,o))) #define OCE_WRITE_CSR_MPU(sc, space, o, v) \ ((IS_BE(sc)) ? (bus_space_write_4((sc)->space##_btag, \ (sc)->space##_bhandle,o,v)) \ : (bus_space_write_4((sc)->devcfg_btag, \ (sc)->devcfg_bhandle,o,v))) #define OCE_WRITE_REG32(sc, space, o, v) \ ((IS_BE(sc) || IS_SH(sc)) ? (bus_space_write_4((sc)->space##_btag, \ (sc)->space##_bhandle,o,v)) \ : (bus_space_write_4((sc)->devcfg_btag, \ (sc)->devcfg_bhandle,o,v))) #define OCE_WRITE_REG16(sc, space, o, v) \ ((IS_BE(sc) || IS_SH(sc)) ? (bus_space_write_2((sc)->space##_btag, \ (sc)->space##_bhandle,o,v)) \ : (bus_space_write_2((sc)->devcfg_btag, \ (sc)->devcfg_bhandle,o,v))) #define OCE_WRITE_REG8(sc, space, o, v) \ ((IS_BE(sc) || IS_SH(sc)) ? (bus_space_write_1((sc)->space##_btag, \ (sc)->space##_bhandle,o,v)) \ : (bus_space_write_1((sc)->devcfg_btag, \ (sc)->devcfg_bhandle,o,v))) /*********************************************************** * DMA memory functions ***********************************************************/ #define oce_dma_sync(d, f) bus_dmamap_sync((d)->tag, (d)->map, f) int oce_dma_alloc(POCE_SOFTC sc, bus_size_t size, POCE_DMA_MEM dma, int flags); void oce_dma_free(POCE_SOFTC sc, POCE_DMA_MEM dma); void oce_dma_map_addr(void *arg, bus_dma_segment_t * segs, int nseg, int error); void oce_destroy_ring_buffer(POCE_SOFTC sc, oce_ring_buffer_t *ring); oce_ring_buffer_t *oce_create_ring_buffer(POCE_SOFTC sc, uint32_t q_len, uint32_t num_entries); /************************************************************ * oce_hw_xxx functions ************************************************************/ int oce_clear_rx_buf(struct oce_rq *rq); int oce_hw_pci_alloc(POCE_SOFTC sc); int oce_hw_init(POCE_SOFTC sc); int oce_hw_start(POCE_SOFTC sc); int oce_create_nw_interface(POCE_SOFTC sc); int oce_pci_soft_reset(POCE_SOFTC sc); int oce_hw_update_multicast(POCE_SOFTC sc); void oce_delete_nw_interface(POCE_SOFTC sc); void oce_hw_shutdown(POCE_SOFTC sc); void oce_hw_intr_enable(POCE_SOFTC sc); void oce_hw_intr_disable(POCE_SOFTC sc); void oce_hw_pci_free(POCE_SOFTC sc); /*********************************************************** * oce_queue_xxx functions ***********************************************************/ int oce_queue_init_all(POCE_SOFTC sc); int oce_start_rq(struct oce_rq *rq); int oce_start_wq(struct oce_wq *wq); int oce_start_mq(struct oce_mq *mq); int oce_start_rx(POCE_SOFTC sc); void oce_arm_eq(POCE_SOFTC sc, int16_t qid, int npopped, uint32_t rearm, uint32_t clearint); void oce_queue_release_all(POCE_SOFTC sc); void oce_arm_cq(POCE_SOFTC sc, int16_t qid, int npopped, uint32_t rearm); void oce_drain_eq(struct oce_eq *eq); void oce_drain_mq_cq(void *arg); void oce_drain_rq_cq(struct oce_rq *rq); void oce_drain_wq_cq(struct oce_wq *wq); uint32_t oce_page_list(oce_ring_buffer_t *ring, struct phys_addr *pa_list); /*********************************************************** * cleanup functions ***********************************************************/ void oce_stop_rx(POCE_SOFTC sc); void oce_intr_free(POCE_SOFTC sc); void oce_free_posted_rxbuf(struct oce_rq *rq); #if defined(INET6) || defined(INET) void oce_free_lro(POCE_SOFTC sc); #endif /************************************************************ * Mailbox functions ************************************************************/ int oce_fw_clean(POCE_SOFTC sc); int oce_reset_fun(POCE_SOFTC sc); int oce_mbox_init(POCE_SOFTC sc); int oce_mbox_dispatch(POCE_SOFTC sc, uint32_t tmo_sec); int oce_get_fw_version(POCE_SOFTC sc); int oce_first_mcc_cmd(POCE_SOFTC sc); int oce_read_mac_addr(POCE_SOFTC sc, uint32_t if_id, uint8_t perm, uint8_t type, struct mac_address_format *mac); int oce_get_fw_config(POCE_SOFTC sc); int oce_if_create(POCE_SOFTC sc, uint32_t cap_flags, uint32_t en_flags, uint16_t vlan_tag, uint8_t *mac_addr, uint32_t *if_id); int oce_if_del(POCE_SOFTC sc, uint32_t if_id); int oce_config_vlan(POCE_SOFTC sc, uint32_t if_id, struct normal_vlan *vtag_arr, uint8_t vtag_cnt, uint32_t untagged, uint32_t enable_promisc); int oce_set_flow_control(POCE_SOFTC sc, uint32_t flow_control); int oce_config_nic_rss(POCE_SOFTC sc, uint32_t if_id, uint16_t enable_rss); int oce_rxf_set_promiscuous(POCE_SOFTC sc, uint8_t enable); int oce_set_common_iface_rx_filter(POCE_SOFTC sc, POCE_DMA_MEM sgl); int oce_get_link_status(POCE_SOFTC sc, struct link_status *link); int oce_mbox_get_nic_stats_v0(POCE_SOFTC sc, POCE_DMA_MEM pstats_dma_mem); int oce_mbox_get_nic_stats(POCE_SOFTC sc, POCE_DMA_MEM pstats_dma_mem); int oce_mbox_get_pport_stats(POCE_SOFTC sc, POCE_DMA_MEM pstats_dma_mem, uint32_t reset_stats); int oce_mbox_get_vport_stats(POCE_SOFTC sc, POCE_DMA_MEM pstats_dma_mem, uint32_t req_size, uint32_t reset_stats); int oce_update_multicast(POCE_SOFTC sc, POCE_DMA_MEM pdma_mem); int oce_pass_through_mbox(POCE_SOFTC sc, POCE_DMA_MEM dma_mem, uint32_t req_size); int oce_mbox_macaddr_del(POCE_SOFTC sc, uint32_t if_id, uint32_t pmac_id); int oce_mbox_macaddr_add(POCE_SOFTC sc, uint8_t *mac_addr, uint32_t if_id, uint32_t *pmac_id); int oce_mbox_cmd_test_loopback(POCE_SOFTC sc, uint32_t port_num, uint32_t loopback_type, uint32_t pkt_size, uint32_t num_pkts, uint64_t pattern); int oce_mbox_cmd_set_loopback(POCE_SOFTC sc, uint8_t port_num, uint8_t loopback_type, uint8_t enable); int oce_mbox_check_native_mode(POCE_SOFTC sc); int oce_mbox_post(POCE_SOFTC sc, struct oce_mbx *mbx, struct oce_mbx_ctx *mbxctx); int oce_mbox_write_flashrom(POCE_SOFTC sc, uint32_t optype,uint32_t opcode, POCE_DMA_MEM pdma_mem, uint32_t num_bytes); int oce_mbox_lancer_write_flashrom(POCE_SOFTC sc, uint32_t data_size, uint32_t data_offset,POCE_DMA_MEM pdma_mem, uint32_t *written_data, uint32_t *additional_status); int oce_mbox_get_flashrom_crc(POCE_SOFTC sc, uint8_t *flash_crc, uint32_t offset, uint32_t optype); int oce_mbox_get_phy_info(POCE_SOFTC sc, struct oce_phy_info *phy_info); int oce_mbox_create_rq(struct oce_rq *rq); int oce_mbox_create_wq(struct oce_wq *wq); int oce_mbox_create_eq(struct oce_eq *eq); int oce_mbox_cq_create(struct oce_cq *cq, uint32_t ncoalesce, uint32_t is_eventable); int oce_mbox_read_transrecv_data(POCE_SOFTC sc, uint32_t page_num); void oce_mbox_eqd_modify_periodic(POCE_SOFTC sc, struct oce_set_eqd *set_eqd, int num); int oce_get_profile_config(POCE_SOFTC sc, uint32_t max_rss); int oce_get_func_config(POCE_SOFTC sc); void mbx_common_req_hdr_init(struct mbx_hdr *hdr, uint8_t dom, uint8_t port, uint8_t subsys, uint8_t opcode, uint32_t timeout, uint32_t pyld_len, uint8_t version); uint16_t oce_mq_handler(void *arg); /************************************************************ * Transmit functions ************************************************************/ uint16_t oce_wq_handler(void *arg); void oce_start(struct ifnet *ifp); void oce_tx_task(void *arg, int npending); /************************************************************ * Receive functions ************************************************************/ int oce_alloc_rx_bufs(struct oce_rq *rq, int count); uint16_t oce_rq_handler(void *arg); /* Sysctl functions */ void oce_add_sysctls(POCE_SOFTC sc); void oce_refresh_queue_stats(POCE_SOFTC sc); int oce_refresh_nic_stats(POCE_SOFTC sc); int oce_stats_init(POCE_SOFTC sc); void oce_stats_free(POCE_SOFTC sc); /* Capabilities */ #define OCE_MODCAP_RSS 1 #define OCE_MAX_RSP_HANDLED 64 extern uint32_t oce_max_rsp_handled; /* max responses */ #define OCE_MAC_LOOPBACK 0x0 #define OCE_PHY_LOOPBACK 0x1 #define OCE_ONE_PORT_EXT_LOOPBACK 0x2 #define OCE_NO_LOOPBACK 0xff #undef IFM_40G_SR4 #define IFM_40G_SR4 28 #define atomic_inc_32(x) atomic_add_32(x, 1) #define atomic_dec_32(x) atomic_subtract_32(x, 1) #define LE_64(x) htole64(x) #define LE_32(x) htole32(x) #define LE_16(x) htole16(x) #define HOST_64(x) le64toh(x) #define HOST_32(x) le32toh(x) #define HOST_16(x) le16toh(x) #define DW_SWAP(x, l) #define IS_ALIGNED(x,a) ((x % a) == 0) #define ADDR_HI(x) ((uint32_t)((uint64_t)(x) >> 32)) #define ADDR_LO(x) ((uint32_t)((uint64_t)(x) & 0xffffffff)); #define IF_LRO_ENABLED(sc) (((sc)->ifp->if_capenable & IFCAP_LRO) ? 1:0) #define IF_LSO_ENABLED(sc) (((sc)->ifp->if_capenable & IFCAP_TSO4) ? 1:0) #define IF_CSUM_ENABLED(sc) (((sc)->ifp->if_capenable & IFCAP_HWCSUM) ? 1:0) #define OCE_LOG2(x) (oce_highbit(x)) static inline uint32_t oce_highbit(uint32_t x) { int i; int c; int b; c = 0; b = 0; for (i = 0; i < 32; i++) { if ((1 << i) & x) { c++; b = i; } } if (c == 1) return b; return 0; } static inline int MPU_EP_SEMAPHORE(POCE_SOFTC sc) { if (IS_BE(sc)) return MPU_EP_SEMAPHORE_BE3; else if (IS_SH(sc)) return MPU_EP_SEMAPHORE_SH; else return MPU_EP_SEMAPHORE_XE201; } #define TRANSCEIVER_DATA_NUM_ELE 64 #define TRANSCEIVER_DATA_SIZE 256 #define TRANSCEIVER_A0_SIZE 128 #define TRANSCEIVER_A2_SIZE 128 #define PAGE_NUM_A0 0xa0 #define PAGE_NUM_A2 0xa2 #define IS_QNQ_OR_UMC(sc) ((sc->pvid && (sc->function_mode & FNM_UMC_MODE ))\ || (sc->qnqid && (sc->function_mode & FNM_FLEX10_MODE))) Index: head/sys/dev/vmware/vmxnet3/if_vmx.c =================================================================== --- head/sys/dev/vmware/vmxnet3/if_vmx.c (revision 271550) +++ head/sys/dev/vmware/vmxnet3/if_vmx.c (revision 271551) @@ -1,3939 +1,3935 @@ /*- * Copyright (c) 2013 Tsubai Masanari * Copyright (c) 2013 Bryan Venteicher * * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. * * $OpenBSD: src/sys/dev/pci/if_vmx.c,v 1.11 2013/06/22 00:28:10 uebayasi Exp $ */ /* Driver for VMware vmxnet3 virtual ethernet devices. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "if_vmxreg.h" #include "if_vmxvar.h" #include "opt_inet.h" #include "opt_inet6.h" #ifdef VMXNET3_FAILPOINTS #include static SYSCTL_NODE(DEBUG_FP, OID_AUTO, vmxnet3, CTLFLAG_RW, 0, "vmxnet3 fail points"); #define VMXNET3_FP _debug_fail_point_vmxnet3 #endif static int vmxnet3_probe(device_t); static int vmxnet3_attach(device_t); static int vmxnet3_detach(device_t); static int vmxnet3_shutdown(device_t); static int vmxnet3_alloc_resources(struct vmxnet3_softc *); static void vmxnet3_free_resources(struct vmxnet3_softc *); static int vmxnet3_check_version(struct vmxnet3_softc *); static void vmxnet3_initial_config(struct vmxnet3_softc *); static void vmxnet3_check_multiqueue(struct vmxnet3_softc *); static int vmxnet3_alloc_msix_interrupts(struct vmxnet3_softc *); static int vmxnet3_alloc_msi_interrupts(struct vmxnet3_softc *); static int vmxnet3_alloc_legacy_interrupts(struct vmxnet3_softc *); static int vmxnet3_alloc_interrupt(struct vmxnet3_softc *, int, int, struct vmxnet3_interrupt *); static int vmxnet3_alloc_intr_resources(struct vmxnet3_softc *); static int vmxnet3_setup_msix_interrupts(struct vmxnet3_softc *); static int vmxnet3_setup_legacy_interrupt(struct vmxnet3_softc *); static int vmxnet3_setup_interrupts(struct vmxnet3_softc *); static int vmxnet3_alloc_interrupts(struct vmxnet3_softc *); static void vmxnet3_free_interrupt(struct vmxnet3_softc *, struct vmxnet3_interrupt *); static void vmxnet3_free_interrupts(struct vmxnet3_softc *); #ifndef VMXNET3_LEGACY_TX static int vmxnet3_alloc_taskqueue(struct vmxnet3_softc *); static void vmxnet3_start_taskqueue(struct vmxnet3_softc *); static void vmxnet3_drain_taskqueue(struct vmxnet3_softc *); static void vmxnet3_free_taskqueue(struct vmxnet3_softc *); #endif static int vmxnet3_init_rxq(struct vmxnet3_softc *, int); static int vmxnet3_init_txq(struct vmxnet3_softc *, int); static int vmxnet3_alloc_rxtx_queues(struct vmxnet3_softc *); static void vmxnet3_destroy_rxq(struct vmxnet3_rxqueue *); static void vmxnet3_destroy_txq(struct vmxnet3_txqueue *); static void vmxnet3_free_rxtx_queues(struct vmxnet3_softc *); static int vmxnet3_alloc_shared_data(struct vmxnet3_softc *); static void vmxnet3_free_shared_data(struct vmxnet3_softc *); static int vmxnet3_alloc_txq_data(struct vmxnet3_softc *); static void vmxnet3_free_txq_data(struct vmxnet3_softc *); static int vmxnet3_alloc_rxq_data(struct vmxnet3_softc *); static void vmxnet3_free_rxq_data(struct vmxnet3_softc *); static int vmxnet3_alloc_queue_data(struct vmxnet3_softc *); static void vmxnet3_free_queue_data(struct vmxnet3_softc *); static int vmxnet3_alloc_mcast_table(struct vmxnet3_softc *); static void vmxnet3_init_shared_data(struct vmxnet3_softc *); static void vmxnet3_reinit_interface(struct vmxnet3_softc *); static void vmxnet3_reinit_rss_shared_data(struct vmxnet3_softc *); static void vmxnet3_reinit_shared_data(struct vmxnet3_softc *); static int vmxnet3_alloc_data(struct vmxnet3_softc *); static void vmxnet3_free_data(struct vmxnet3_softc *); static int vmxnet3_setup_interface(struct vmxnet3_softc *); static void vmxnet3_evintr(struct vmxnet3_softc *); static void vmxnet3_txq_eof(struct vmxnet3_txqueue *); static void vmxnet3_rx_csum(struct vmxnet3_rxcompdesc *, struct mbuf *); static int vmxnet3_newbuf(struct vmxnet3_softc *, struct vmxnet3_rxring *); static void vmxnet3_rxq_eof_discard(struct vmxnet3_rxqueue *, struct vmxnet3_rxring *, int); static void vmxnet3_rxq_eof(struct vmxnet3_rxqueue *); static void vmxnet3_legacy_intr(void *); static void vmxnet3_txq_intr(void *); static void vmxnet3_rxq_intr(void *); static void vmxnet3_event_intr(void *); static void vmxnet3_txstop(struct vmxnet3_softc *, struct vmxnet3_txqueue *); static void vmxnet3_rxstop(struct vmxnet3_softc *, struct vmxnet3_rxqueue *); static void vmxnet3_stop(struct vmxnet3_softc *); static void vmxnet3_txinit(struct vmxnet3_softc *, struct vmxnet3_txqueue *); static int vmxnet3_rxinit(struct vmxnet3_softc *, struct vmxnet3_rxqueue *); static int vmxnet3_reinit_queues(struct vmxnet3_softc *); static int vmxnet3_enable_device(struct vmxnet3_softc *); static void vmxnet3_reinit_rxfilters(struct vmxnet3_softc *); static int vmxnet3_reinit(struct vmxnet3_softc *); static void vmxnet3_init_locked(struct vmxnet3_softc *); static void vmxnet3_init(void *); static int vmxnet3_txq_offload_ctx(struct vmxnet3_txqueue *,struct mbuf *, int *, int *, int *); static int vmxnet3_txq_load_mbuf(struct vmxnet3_txqueue *, struct mbuf **, bus_dmamap_t, bus_dma_segment_t [], int *); static void vmxnet3_txq_unload_mbuf(struct vmxnet3_txqueue *, bus_dmamap_t); static int vmxnet3_txq_encap(struct vmxnet3_txqueue *, struct mbuf **); #ifdef VMXNET3_LEGACY_TX static void vmxnet3_start_locked(struct ifnet *); static void vmxnet3_start(struct ifnet *); #else static int vmxnet3_txq_mq_start_locked(struct vmxnet3_txqueue *, struct mbuf *); static int vmxnet3_txq_mq_start(struct ifnet *, struct mbuf *); static void vmxnet3_txq_tq_deferred(void *, int); #endif static void vmxnet3_txq_start(struct vmxnet3_txqueue *); static void vmxnet3_tx_start_all(struct vmxnet3_softc *); static void vmxnet3_update_vlan_filter(struct vmxnet3_softc *, int, uint16_t); static void vmxnet3_register_vlan(void *, struct ifnet *, uint16_t); static void vmxnet3_unregister_vlan(void *, struct ifnet *, uint16_t); static void vmxnet3_set_rxfilter(struct vmxnet3_softc *); static int vmxnet3_change_mtu(struct vmxnet3_softc *, int); static int vmxnet3_ioctl(struct ifnet *, u_long, caddr_t); #ifndef VMXNET3_LEGACY_TX static void vmxnet3_qflush(struct ifnet *); #endif static int vmxnet3_watchdog(struct vmxnet3_txqueue *); static void vmxnet3_refresh_host_stats(struct vmxnet3_softc *); static void vmxnet3_txq_accum_stats(struct vmxnet3_txqueue *, struct vmxnet3_txq_stats *); static void vmxnet3_rxq_accum_stats(struct vmxnet3_rxqueue *, struct vmxnet3_rxq_stats *); static void vmxnet3_tick(void *); static void vmxnet3_link_status(struct vmxnet3_softc *); static void vmxnet3_media_status(struct ifnet *, struct ifmediareq *); static int vmxnet3_media_change(struct ifnet *); static void vmxnet3_set_lladdr(struct vmxnet3_softc *); static void vmxnet3_get_lladdr(struct vmxnet3_softc *); static void vmxnet3_setup_txq_sysctl(struct vmxnet3_txqueue *, struct sysctl_ctx_list *, struct sysctl_oid_list *); static void vmxnet3_setup_rxq_sysctl(struct vmxnet3_rxqueue *, struct sysctl_ctx_list *, struct sysctl_oid_list *); static void vmxnet3_setup_queue_sysctl(struct vmxnet3_softc *, struct sysctl_ctx_list *, struct sysctl_oid_list *); static void vmxnet3_setup_sysctl(struct vmxnet3_softc *); static void vmxnet3_write_bar0(struct vmxnet3_softc *, bus_size_t, uint32_t); static uint32_t vmxnet3_read_bar1(struct vmxnet3_softc *, bus_size_t); static void vmxnet3_write_bar1(struct vmxnet3_softc *, bus_size_t, uint32_t); static void vmxnet3_write_cmd(struct vmxnet3_softc *, uint32_t); static uint32_t vmxnet3_read_cmd(struct vmxnet3_softc *, uint32_t); static void vmxnet3_enable_intr(struct vmxnet3_softc *, int); static void vmxnet3_disable_intr(struct vmxnet3_softc *, int); static void vmxnet3_enable_all_intrs(struct vmxnet3_softc *); static void vmxnet3_disable_all_intrs(struct vmxnet3_softc *); static int vmxnet3_dma_malloc(struct vmxnet3_softc *, bus_size_t, bus_size_t, struct vmxnet3_dma_alloc *); static void vmxnet3_dma_free(struct vmxnet3_softc *, struct vmxnet3_dma_alloc *); static int vmxnet3_tunable_int(struct vmxnet3_softc *, const char *, int); typedef enum { VMXNET3_BARRIER_RD, VMXNET3_BARRIER_WR, VMXNET3_BARRIER_RDWR, } vmxnet3_barrier_t; static void vmxnet3_barrier(struct vmxnet3_softc *, vmxnet3_barrier_t); /* Tunables. */ static int vmxnet3_mq_disable = 0; TUNABLE_INT("hw.vmx.mq_disable", &vmxnet3_mq_disable); static int vmxnet3_default_txnqueue = VMXNET3_DEF_TX_QUEUES; TUNABLE_INT("hw.vmx.txnqueue", &vmxnet3_default_txnqueue); static int vmxnet3_default_rxnqueue = VMXNET3_DEF_RX_QUEUES; TUNABLE_INT("hw.vmx.rxnqueue", &vmxnet3_default_rxnqueue); static int vmxnet3_default_txndesc = VMXNET3_DEF_TX_NDESC; TUNABLE_INT("hw.vmx.txndesc", &vmxnet3_default_txndesc); static int vmxnet3_default_rxndesc = VMXNET3_DEF_RX_NDESC; TUNABLE_INT("hw.vmx.rxndesc", &vmxnet3_default_rxndesc); static device_method_t vmxnet3_methods[] = { /* Device interface. */ DEVMETHOD(device_probe, vmxnet3_probe), DEVMETHOD(device_attach, vmxnet3_attach), DEVMETHOD(device_detach, vmxnet3_detach), DEVMETHOD(device_shutdown, vmxnet3_shutdown), DEVMETHOD_END }; static driver_t vmxnet3_driver = { "vmx", vmxnet3_methods, sizeof(struct vmxnet3_softc) }; static devclass_t vmxnet3_devclass; DRIVER_MODULE(vmx, pci, vmxnet3_driver, vmxnet3_devclass, 0, 0); MODULE_DEPEND(vmx, pci, 1, 1, 1); MODULE_DEPEND(vmx, ether, 1, 1, 1); #define VMXNET3_VMWARE_VENDOR_ID 0x15AD #define VMXNET3_VMWARE_DEVICE_ID 0x07B0 static int vmxnet3_probe(device_t dev) { if (pci_get_vendor(dev) == VMXNET3_VMWARE_VENDOR_ID && pci_get_device(dev) == VMXNET3_VMWARE_DEVICE_ID) { device_set_desc(dev, "VMware VMXNET3 Ethernet Adapter"); return (BUS_PROBE_DEFAULT); } return (ENXIO); } static int vmxnet3_attach(device_t dev) { struct vmxnet3_softc *sc; int error; sc = device_get_softc(dev); sc->vmx_dev = dev; pci_enable_busmaster(dev); VMXNET3_CORE_LOCK_INIT(sc, device_get_nameunit(dev)); callout_init_mtx(&sc->vmx_tick, &sc->vmx_mtx, 0); vmxnet3_initial_config(sc); error = vmxnet3_alloc_resources(sc); if (error) goto fail; error = vmxnet3_check_version(sc); if (error) goto fail; error = vmxnet3_alloc_rxtx_queues(sc); if (error) goto fail; #ifndef VMXNET3_LEGACY_TX error = vmxnet3_alloc_taskqueue(sc); if (error) goto fail; #endif error = vmxnet3_alloc_interrupts(sc); if (error) goto fail; vmxnet3_check_multiqueue(sc); error = vmxnet3_alloc_data(sc); if (error) goto fail; error = vmxnet3_setup_interface(sc); if (error) goto fail; error = vmxnet3_setup_interrupts(sc); if (error) { ether_ifdetach(sc->vmx_ifp); device_printf(dev, "could not set up interrupt\n"); goto fail; } vmxnet3_setup_sysctl(sc); #ifndef VMXNET3_LEGACY_TX vmxnet3_start_taskqueue(sc); #endif fail: if (error) vmxnet3_detach(dev); return (error); } static int vmxnet3_detach(device_t dev) { struct vmxnet3_softc *sc; struct ifnet *ifp; sc = device_get_softc(dev); ifp = sc->vmx_ifp; if (device_is_attached(dev)) { VMXNET3_CORE_LOCK(sc); vmxnet3_stop(sc); VMXNET3_CORE_UNLOCK(sc); callout_drain(&sc->vmx_tick); #ifndef VMXNET3_LEGACY_TX vmxnet3_drain_taskqueue(sc); #endif ether_ifdetach(ifp); } if (sc->vmx_vlan_attach != NULL) { EVENTHANDLER_DEREGISTER(vlan_config, sc->vmx_vlan_attach); sc->vmx_vlan_attach = NULL; } if (sc->vmx_vlan_detach != NULL) { EVENTHANDLER_DEREGISTER(vlan_config, sc->vmx_vlan_detach); sc->vmx_vlan_detach = NULL; } #ifndef VMXNET3_LEGACY_TX vmxnet3_free_taskqueue(sc); #endif vmxnet3_free_interrupts(sc); if (ifp != NULL) { if_free(ifp); sc->vmx_ifp = NULL; } ifmedia_removeall(&sc->vmx_media); vmxnet3_free_data(sc); vmxnet3_free_resources(sc); vmxnet3_free_rxtx_queues(sc); VMXNET3_CORE_LOCK_DESTROY(sc); return (0); } static int vmxnet3_shutdown(device_t dev) { return (0); } static int vmxnet3_alloc_resources(struct vmxnet3_softc *sc) { device_t dev; int rid; dev = sc->vmx_dev; rid = PCIR_BAR(0); sc->vmx_res0 = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid, RF_ACTIVE); if (sc->vmx_res0 == NULL) { device_printf(dev, "could not map BAR0 memory\n"); return (ENXIO); } sc->vmx_iot0 = rman_get_bustag(sc->vmx_res0); sc->vmx_ioh0 = rman_get_bushandle(sc->vmx_res0); rid = PCIR_BAR(1); sc->vmx_res1 = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid, RF_ACTIVE); if (sc->vmx_res1 == NULL) { device_printf(dev, "could not map BAR1 memory\n"); return (ENXIO); } sc->vmx_iot1 = rman_get_bustag(sc->vmx_res1); sc->vmx_ioh1 = rman_get_bushandle(sc->vmx_res1); if (pci_find_cap(dev, PCIY_MSIX, NULL) == 0) { rid = PCIR_BAR(2); sc->vmx_msix_res = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid, RF_ACTIVE); } if (sc->vmx_msix_res == NULL) sc->vmx_flags |= VMXNET3_FLAG_NO_MSIX; return (0); } static void vmxnet3_free_resources(struct vmxnet3_softc *sc) { device_t dev; int rid; dev = sc->vmx_dev; if (sc->vmx_res0 != NULL) { rid = PCIR_BAR(0); bus_release_resource(dev, SYS_RES_MEMORY, rid, sc->vmx_res0); sc->vmx_res0 = NULL; } if (sc->vmx_res1 != NULL) { rid = PCIR_BAR(1); bus_release_resource(dev, SYS_RES_MEMORY, rid, sc->vmx_res1); sc->vmx_res1 = NULL; } if (sc->vmx_msix_res != NULL) { rid = PCIR_BAR(2); bus_release_resource(dev, SYS_RES_MEMORY, rid, sc->vmx_msix_res); sc->vmx_msix_res = NULL; } } static int vmxnet3_check_version(struct vmxnet3_softc *sc) { device_t dev; uint32_t version; dev = sc->vmx_dev; version = vmxnet3_read_bar1(sc, VMXNET3_BAR1_VRRS); if ((version & 0x01) == 0) { device_printf(dev, "unsupported hardware version %#x\n", version); return (ENOTSUP); } vmxnet3_write_bar1(sc, VMXNET3_BAR1_VRRS, 1); version = vmxnet3_read_bar1(sc, VMXNET3_BAR1_UVRS); if ((version & 0x01) == 0) { device_printf(dev, "unsupported UPT version %#x\n", version); return (ENOTSUP); } vmxnet3_write_bar1(sc, VMXNET3_BAR1_UVRS, 1); return (0); } static void vmxnet3_initial_config(struct vmxnet3_softc *sc) { int nqueue, ndesc; nqueue = vmxnet3_tunable_int(sc, "txnqueue", vmxnet3_default_txnqueue); if (nqueue > VMXNET3_MAX_TX_QUEUES || nqueue < 1) nqueue = VMXNET3_DEF_TX_QUEUES; if (nqueue > mp_ncpus) nqueue = mp_ncpus; sc->vmx_max_ntxqueues = nqueue; nqueue = vmxnet3_tunable_int(sc, "rxnqueue", vmxnet3_default_rxnqueue); if (nqueue > VMXNET3_MAX_RX_QUEUES || nqueue < 1) nqueue = VMXNET3_DEF_RX_QUEUES; if (nqueue > mp_ncpus) nqueue = mp_ncpus; sc->vmx_max_nrxqueues = nqueue; if (vmxnet3_tunable_int(sc, "mq_disable", vmxnet3_mq_disable)) { sc->vmx_max_nrxqueues = 1; sc->vmx_max_ntxqueues = 1; } ndesc = vmxnet3_tunable_int(sc, "txd", vmxnet3_default_txndesc); if (ndesc > VMXNET3_MAX_TX_NDESC || ndesc < VMXNET3_MIN_TX_NDESC) ndesc = VMXNET3_DEF_TX_NDESC; if (ndesc & VMXNET3_MASK_TX_NDESC) ndesc &= ~VMXNET3_MASK_TX_NDESC; sc->vmx_ntxdescs = ndesc; ndesc = vmxnet3_tunable_int(sc, "rxd", vmxnet3_default_rxndesc); if (ndesc > VMXNET3_MAX_RX_NDESC || ndesc < VMXNET3_MIN_RX_NDESC) ndesc = VMXNET3_DEF_RX_NDESC; if (ndesc & VMXNET3_MASK_RX_NDESC) ndesc &= ~VMXNET3_MASK_RX_NDESC; sc->vmx_nrxdescs = ndesc; sc->vmx_max_rxsegs = VMXNET3_MAX_RX_SEGS; } static void vmxnet3_check_multiqueue(struct vmxnet3_softc *sc) { if (sc->vmx_intr_type != VMXNET3_IT_MSIX) goto out; /* BMV: Just use the maximum configured for now. */ sc->vmx_nrxqueues = sc->vmx_max_nrxqueues; sc->vmx_ntxqueues = sc->vmx_max_ntxqueues; if (sc->vmx_nrxqueues > 1) sc->vmx_flags |= VMXNET3_FLAG_RSS; return; out: sc->vmx_ntxqueues = 1; sc->vmx_nrxqueues = 1; } static int vmxnet3_alloc_msix_interrupts(struct vmxnet3_softc *sc) { device_t dev; int nmsix, cnt, required; dev = sc->vmx_dev; if (sc->vmx_flags & VMXNET3_FLAG_NO_MSIX) return (1); /* Allocate an additional vector for the events interrupt. */ required = sc->vmx_max_nrxqueues + sc->vmx_max_ntxqueues + 1; nmsix = pci_msix_count(dev); if (nmsix < required) return (1); cnt = required; if (pci_alloc_msix(dev, &cnt) == 0 && cnt >= required) { sc->vmx_nintrs = required; return (0); } else pci_release_msi(dev); /* BMV TODO Fallback to sharing MSIX vectors if possible. */ return (1); } static int vmxnet3_alloc_msi_interrupts(struct vmxnet3_softc *sc) { device_t dev; int nmsi, cnt, required; dev = sc->vmx_dev; required = 1; nmsi = pci_msi_count(dev); if (nmsi < required) return (1); cnt = required; if (pci_alloc_msi(dev, &cnt) == 0 && cnt >= required) { sc->vmx_nintrs = 1; return (0); } else pci_release_msi(dev); return (1); } static int vmxnet3_alloc_legacy_interrupts(struct vmxnet3_softc *sc) { sc->vmx_nintrs = 1; return (0); } static int vmxnet3_alloc_interrupt(struct vmxnet3_softc *sc, int rid, int flags, struct vmxnet3_interrupt *intr) { struct resource *irq; irq = bus_alloc_resource_any(sc->vmx_dev, SYS_RES_IRQ, &rid, flags); if (irq == NULL) return (ENXIO); intr->vmxi_irq = irq; intr->vmxi_rid = rid; return (0); } static int vmxnet3_alloc_intr_resources(struct vmxnet3_softc *sc) { int i, rid, flags, error; rid = 0; flags = RF_ACTIVE; if (sc->vmx_intr_type == VMXNET3_IT_LEGACY) flags |= RF_SHAREABLE; else rid = 1; for (i = 0; i < sc->vmx_nintrs; i++, rid++) { error = vmxnet3_alloc_interrupt(sc, rid, flags, &sc->vmx_intrs[i]); if (error) return (error); } return (0); } static int vmxnet3_setup_msix_interrupts(struct vmxnet3_softc *sc) { device_t dev; struct vmxnet3_txqueue *txq; struct vmxnet3_rxqueue *rxq; struct vmxnet3_interrupt *intr; enum intr_type type; int i, error; dev = sc->vmx_dev; intr = &sc->vmx_intrs[0]; type = INTR_TYPE_NET | INTR_MPSAFE; for (i = 0; i < sc->vmx_ntxqueues; i++, intr++) { txq = &sc->vmx_txq[i]; error = bus_setup_intr(dev, intr->vmxi_irq, type, NULL, vmxnet3_txq_intr, txq, &intr->vmxi_handler); if (error) return (error); bus_describe_intr(dev, intr->vmxi_irq, intr->vmxi_handler, "tq%d", i); txq->vxtxq_intr_idx = intr->vmxi_rid - 1; } for (i = 0; i < sc->vmx_nrxqueues; i++, intr++) { rxq = &sc->vmx_rxq[i]; error = bus_setup_intr(dev, intr->vmxi_irq, type, NULL, vmxnet3_rxq_intr, rxq, &intr->vmxi_handler); if (error) return (error); bus_describe_intr(dev, intr->vmxi_irq, intr->vmxi_handler, "rq%d", i); rxq->vxrxq_intr_idx = intr->vmxi_rid - 1; } error = bus_setup_intr(dev, intr->vmxi_irq, type, NULL, vmxnet3_event_intr, sc, &intr->vmxi_handler); if (error) return (error); bus_describe_intr(dev, intr->vmxi_irq, intr->vmxi_handler, "event"); sc->vmx_event_intr_idx = intr->vmxi_rid - 1; return (0); } static int vmxnet3_setup_legacy_interrupt(struct vmxnet3_softc *sc) { struct vmxnet3_interrupt *intr; int i, error; intr = &sc->vmx_intrs[0]; error = bus_setup_intr(sc->vmx_dev, intr->vmxi_irq, INTR_TYPE_NET | INTR_MPSAFE, NULL, vmxnet3_legacy_intr, sc, &intr->vmxi_handler); for (i = 0; i < sc->vmx_ntxqueues; i++) sc->vmx_txq[i].vxtxq_intr_idx = 0; for (i = 0; i < sc->vmx_nrxqueues; i++) sc->vmx_rxq[i].vxrxq_intr_idx = 0; sc->vmx_event_intr_idx = 0; return (error); } static void vmxnet3_set_interrupt_idx(struct vmxnet3_softc *sc) { struct vmxnet3_txqueue *txq; struct vmxnet3_txq_shared *txs; struct vmxnet3_rxqueue *rxq; struct vmxnet3_rxq_shared *rxs; int i; sc->vmx_ds->evintr = sc->vmx_event_intr_idx; for (i = 0; i < sc->vmx_ntxqueues; i++) { txq = &sc->vmx_txq[i]; txs = txq->vxtxq_ts; txs->intr_idx = txq->vxtxq_intr_idx; } for (i = 0; i < sc->vmx_nrxqueues; i++) { rxq = &sc->vmx_rxq[i]; rxs = rxq->vxrxq_rs; rxs->intr_idx = rxq->vxrxq_intr_idx; } } static int vmxnet3_setup_interrupts(struct vmxnet3_softc *sc) { int error; error = vmxnet3_alloc_intr_resources(sc); if (error) return (error); switch (sc->vmx_intr_type) { case VMXNET3_IT_MSIX: error = vmxnet3_setup_msix_interrupts(sc); break; case VMXNET3_IT_MSI: case VMXNET3_IT_LEGACY: error = vmxnet3_setup_legacy_interrupt(sc); break; default: panic("%s: invalid interrupt type %d", __func__, sc->vmx_intr_type); } if (error == 0) vmxnet3_set_interrupt_idx(sc); return (error); } static int vmxnet3_alloc_interrupts(struct vmxnet3_softc *sc) { device_t dev; uint32_t config; int error; dev = sc->vmx_dev; config = vmxnet3_read_cmd(sc, VMXNET3_CMD_GET_INTRCFG); sc->vmx_intr_type = config & 0x03; sc->vmx_intr_mask_mode = (config >> 2) & 0x03; switch (sc->vmx_intr_type) { case VMXNET3_IT_AUTO: sc->vmx_intr_type = VMXNET3_IT_MSIX; /* FALLTHROUGH */ case VMXNET3_IT_MSIX: error = vmxnet3_alloc_msix_interrupts(sc); if (error == 0) break; sc->vmx_intr_type = VMXNET3_IT_MSI; /* FALLTHROUGH */ case VMXNET3_IT_MSI: error = vmxnet3_alloc_msi_interrupts(sc); if (error == 0) break; sc->vmx_intr_type = VMXNET3_IT_LEGACY; /* FALLTHROUGH */ case VMXNET3_IT_LEGACY: error = vmxnet3_alloc_legacy_interrupts(sc); if (error == 0) break; /* FALLTHROUGH */ default: sc->vmx_intr_type = -1; device_printf(dev, "cannot allocate any interrupt resources\n"); return (ENXIO); } return (error); } static void vmxnet3_free_interrupt(struct vmxnet3_softc *sc, struct vmxnet3_interrupt *intr) { device_t dev; dev = sc->vmx_dev; if (intr->vmxi_handler != NULL) { bus_teardown_intr(dev, intr->vmxi_irq, intr->vmxi_handler); intr->vmxi_handler = NULL; } if (intr->vmxi_irq != NULL) { bus_release_resource(dev, SYS_RES_IRQ, intr->vmxi_rid, intr->vmxi_irq); intr->vmxi_irq = NULL; intr->vmxi_rid = -1; } } static void vmxnet3_free_interrupts(struct vmxnet3_softc *sc) { int i; for (i = 0; i < sc->vmx_nintrs; i++) vmxnet3_free_interrupt(sc, &sc->vmx_intrs[i]); if (sc->vmx_intr_type == VMXNET3_IT_MSI || sc->vmx_intr_type == VMXNET3_IT_MSIX) pci_release_msi(sc->vmx_dev); } #ifndef VMXNET3_LEGACY_TX static int vmxnet3_alloc_taskqueue(struct vmxnet3_softc *sc) { device_t dev; dev = sc->vmx_dev; sc->vmx_tq = taskqueue_create(device_get_nameunit(dev), M_NOWAIT, taskqueue_thread_enqueue, &sc->vmx_tq); if (sc->vmx_tq == NULL) return (ENOMEM); return (0); } static void vmxnet3_start_taskqueue(struct vmxnet3_softc *sc) { device_t dev; int nthreads, error; dev = sc->vmx_dev; /* * The taskqueue is typically not frequently used, so a dedicated * thread for each queue is unnecessary. */ nthreads = MAX(1, sc->vmx_ntxqueues / 2); /* * Most drivers just ignore the return value - it only fails * with ENOMEM so an error is not likely. It is hard for us * to recover from an error here. */ error = taskqueue_start_threads(&sc->vmx_tq, nthreads, PI_NET, "%s taskq", device_get_nameunit(dev)); if (error) device_printf(dev, "failed to start taskqueue: %d", error); } static void vmxnet3_drain_taskqueue(struct vmxnet3_softc *sc) { struct vmxnet3_txqueue *txq; int i; if (sc->vmx_tq != NULL) { for (i = 0; i < sc->vmx_max_ntxqueues; i++) { txq = &sc->vmx_txq[i]; taskqueue_drain(sc->vmx_tq, &txq->vxtxq_defrtask); } } } static void vmxnet3_free_taskqueue(struct vmxnet3_softc *sc) { if (sc->vmx_tq != NULL) { taskqueue_free(sc->vmx_tq); sc->vmx_tq = NULL; } } #endif static int vmxnet3_init_rxq(struct vmxnet3_softc *sc, int q) { struct vmxnet3_rxqueue *rxq; struct vmxnet3_rxring *rxr; int i; rxq = &sc->vmx_rxq[q]; snprintf(rxq->vxrxq_name, sizeof(rxq->vxrxq_name), "%s-rx%d", device_get_nameunit(sc->vmx_dev), q); mtx_init(&rxq->vxrxq_mtx, rxq->vxrxq_name, NULL, MTX_DEF); rxq->vxrxq_sc = sc; rxq->vxrxq_id = q; for (i = 0; i < VMXNET3_RXRINGS_PERQ; i++) { rxr = &rxq->vxrxq_cmd_ring[i]; rxr->vxrxr_rid = i; rxr->vxrxr_ndesc = sc->vmx_nrxdescs; rxr->vxrxr_rxbuf = malloc(rxr->vxrxr_ndesc * sizeof(struct vmxnet3_rxbuf), M_DEVBUF, M_NOWAIT | M_ZERO); if (rxr->vxrxr_rxbuf == NULL) return (ENOMEM); rxq->vxrxq_comp_ring.vxcr_ndesc += sc->vmx_nrxdescs; } return (0); } static int vmxnet3_init_txq(struct vmxnet3_softc *sc, int q) { struct vmxnet3_txqueue *txq; struct vmxnet3_txring *txr; txq = &sc->vmx_txq[q]; txr = &txq->vxtxq_cmd_ring; snprintf(txq->vxtxq_name, sizeof(txq->vxtxq_name), "%s-tx%d", device_get_nameunit(sc->vmx_dev), q); mtx_init(&txq->vxtxq_mtx, txq->vxtxq_name, NULL, MTX_DEF); txq->vxtxq_sc = sc; txq->vxtxq_id = q; txr->vxtxr_ndesc = sc->vmx_ntxdescs; txr->vxtxr_txbuf = malloc(txr->vxtxr_ndesc * sizeof(struct vmxnet3_txbuf), M_DEVBUF, M_NOWAIT | M_ZERO); if (txr->vxtxr_txbuf == NULL) return (ENOMEM); txq->vxtxq_comp_ring.vxcr_ndesc = sc->vmx_ntxdescs; #ifndef VMXNET3_LEGACY_TX TASK_INIT(&txq->vxtxq_defrtask, 0, vmxnet3_txq_tq_deferred, txq); txq->vxtxq_br = buf_ring_alloc(VMXNET3_DEF_BUFRING_SIZE, M_DEVBUF, M_NOWAIT, &txq->vxtxq_mtx); if (txq->vxtxq_br == NULL) return (ENOMEM); #endif return (0); } static int vmxnet3_alloc_rxtx_queues(struct vmxnet3_softc *sc) { int i, error; /* * Only attempt to create multiple queues if MSIX is available. MSIX is * disabled by default because its apparently broken for devices passed * through by at least ESXi 5.1. The hw.pci.honor_msi_blacklist tunable * must be set to zero for MSIX. This check prevents us from allocating * queue structures that we will not use. */ if (sc->vmx_flags & VMXNET3_FLAG_NO_MSIX) { sc->vmx_max_nrxqueues = 1; sc->vmx_max_ntxqueues = 1; } sc->vmx_rxq = malloc(sizeof(struct vmxnet3_rxqueue) * sc->vmx_max_nrxqueues, M_DEVBUF, M_NOWAIT | M_ZERO); sc->vmx_txq = malloc(sizeof(struct vmxnet3_txqueue) * sc->vmx_max_ntxqueues, M_DEVBUF, M_NOWAIT | M_ZERO); if (sc->vmx_rxq == NULL || sc->vmx_txq == NULL) return (ENOMEM); for (i = 0; i < sc->vmx_max_nrxqueues; i++) { error = vmxnet3_init_rxq(sc, i); if (error) return (error); } for (i = 0; i < sc->vmx_max_ntxqueues; i++) { error = vmxnet3_init_txq(sc, i); if (error) return (error); } return (0); } static void vmxnet3_destroy_rxq(struct vmxnet3_rxqueue *rxq) { struct vmxnet3_rxring *rxr; int i; rxq->vxrxq_sc = NULL; rxq->vxrxq_id = -1; for (i = 0; i < VMXNET3_RXRINGS_PERQ; i++) { rxr = &rxq->vxrxq_cmd_ring[i]; if (rxr->vxrxr_rxbuf != NULL) { free(rxr->vxrxr_rxbuf, M_DEVBUF); rxr->vxrxr_rxbuf = NULL; } } if (mtx_initialized(&rxq->vxrxq_mtx) != 0) mtx_destroy(&rxq->vxrxq_mtx); } static void vmxnet3_destroy_txq(struct vmxnet3_txqueue *txq) { struct vmxnet3_txring *txr; txr = &txq->vxtxq_cmd_ring; txq->vxtxq_sc = NULL; txq->vxtxq_id = -1; #ifndef VMXNET3_LEGACY_TX if (txq->vxtxq_br != NULL) { buf_ring_free(txq->vxtxq_br, M_DEVBUF); txq->vxtxq_br = NULL; } #endif if (txr->vxtxr_txbuf != NULL) { free(txr->vxtxr_txbuf, M_DEVBUF); txr->vxtxr_txbuf = NULL; } if (mtx_initialized(&txq->vxtxq_mtx) != 0) mtx_destroy(&txq->vxtxq_mtx); } static void vmxnet3_free_rxtx_queues(struct vmxnet3_softc *sc) { int i; if (sc->vmx_rxq != NULL) { for (i = 0; i < sc->vmx_max_nrxqueues; i++) vmxnet3_destroy_rxq(&sc->vmx_rxq[i]); free(sc->vmx_rxq, M_DEVBUF); sc->vmx_rxq = NULL; } if (sc->vmx_txq != NULL) { for (i = 0; i < sc->vmx_max_ntxqueues; i++) vmxnet3_destroy_txq(&sc->vmx_txq[i]); free(sc->vmx_txq, M_DEVBUF); sc->vmx_txq = NULL; } } static int vmxnet3_alloc_shared_data(struct vmxnet3_softc *sc) { device_t dev; uint8_t *kva; size_t size; int i, error; dev = sc->vmx_dev; size = sizeof(struct vmxnet3_driver_shared); error = vmxnet3_dma_malloc(sc, size, 1, &sc->vmx_ds_dma); if (error) { device_printf(dev, "cannot alloc shared memory\n"); return (error); } sc->vmx_ds = (struct vmxnet3_driver_shared *) sc->vmx_ds_dma.dma_vaddr; size = sc->vmx_ntxqueues * sizeof(struct vmxnet3_txq_shared) + sc->vmx_nrxqueues * sizeof(struct vmxnet3_rxq_shared); error = vmxnet3_dma_malloc(sc, size, 128, &sc->vmx_qs_dma); if (error) { device_printf(dev, "cannot alloc queue shared memory\n"); return (error); } sc->vmx_qs = (void *) sc->vmx_qs_dma.dma_vaddr; kva = sc->vmx_qs; for (i = 0; i < sc->vmx_ntxqueues; i++) { sc->vmx_txq[i].vxtxq_ts = (struct vmxnet3_txq_shared *) kva; kva += sizeof(struct vmxnet3_txq_shared); } for (i = 0; i < sc->vmx_nrxqueues; i++) { sc->vmx_rxq[i].vxrxq_rs = (struct vmxnet3_rxq_shared *) kva; kva += sizeof(struct vmxnet3_rxq_shared); } if (sc->vmx_flags & VMXNET3_FLAG_RSS) { size = sizeof(struct vmxnet3_rss_shared); error = vmxnet3_dma_malloc(sc, size, 128, &sc->vmx_rss_dma); if (error) { device_printf(dev, "cannot alloc rss shared memory\n"); return (error); } sc->vmx_rss = (struct vmxnet3_rss_shared *) sc->vmx_rss_dma.dma_vaddr; } return (0); } static void vmxnet3_free_shared_data(struct vmxnet3_softc *sc) { if (sc->vmx_rss != NULL) { vmxnet3_dma_free(sc, &sc->vmx_rss_dma); sc->vmx_rss = NULL; } if (sc->vmx_qs != NULL) { vmxnet3_dma_free(sc, &sc->vmx_qs_dma); sc->vmx_qs = NULL; } if (sc->vmx_ds != NULL) { vmxnet3_dma_free(sc, &sc->vmx_ds_dma); sc->vmx_ds = NULL; } } static int vmxnet3_alloc_txq_data(struct vmxnet3_softc *sc) { device_t dev; struct vmxnet3_txqueue *txq; struct vmxnet3_txring *txr; struct vmxnet3_comp_ring *txc; size_t descsz, compsz; int i, q, error; dev = sc->vmx_dev; for (q = 0; q < sc->vmx_ntxqueues; q++) { txq = &sc->vmx_txq[q]; txr = &txq->vxtxq_cmd_ring; txc = &txq->vxtxq_comp_ring; descsz = txr->vxtxr_ndesc * sizeof(struct vmxnet3_txdesc); compsz = txr->vxtxr_ndesc * sizeof(struct vmxnet3_txcompdesc); error = bus_dma_tag_create(bus_get_dma_tag(dev), 1, 0, /* alignment, boundary */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ VMXNET3_TX_MAXSIZE, /* maxsize */ VMXNET3_TX_MAXSEGS, /* nsegments */ VMXNET3_TX_MAXSEGSIZE, /* maxsegsize */ 0, /* flags */ NULL, NULL, /* lockfunc, lockarg */ &txr->vxtxr_txtag); if (error) { device_printf(dev, "unable to create Tx buffer tag for queue %d\n", q); return (error); } error = vmxnet3_dma_malloc(sc, descsz, 512, &txr->vxtxr_dma); if (error) { device_printf(dev, "cannot alloc Tx descriptors for " "queue %d error %d\n", q, error); return (error); } txr->vxtxr_txd = (struct vmxnet3_txdesc *) txr->vxtxr_dma.dma_vaddr; error = vmxnet3_dma_malloc(sc, compsz, 512, &txc->vxcr_dma); if (error) { device_printf(dev, "cannot alloc Tx comp descriptors " "for queue %d error %d\n", q, error); return (error); } txc->vxcr_u.txcd = (struct vmxnet3_txcompdesc *) txc->vxcr_dma.dma_vaddr; for (i = 0; i < txr->vxtxr_ndesc; i++) { error = bus_dmamap_create(txr->vxtxr_txtag, 0, &txr->vxtxr_txbuf[i].vtxb_dmamap); if (error) { device_printf(dev, "unable to create Tx buf " "dmamap for queue %d idx %d\n", q, i); return (error); } } } return (0); } static void vmxnet3_free_txq_data(struct vmxnet3_softc *sc) { device_t dev; struct vmxnet3_txqueue *txq; struct vmxnet3_txring *txr; struct vmxnet3_comp_ring *txc; struct vmxnet3_txbuf *txb; int i, q; dev = sc->vmx_dev; for (q = 0; q < sc->vmx_ntxqueues; q++) { txq = &sc->vmx_txq[q]; txr = &txq->vxtxq_cmd_ring; txc = &txq->vxtxq_comp_ring; for (i = 0; i < txr->vxtxr_ndesc; i++) { txb = &txr->vxtxr_txbuf[i]; if (txb->vtxb_dmamap != NULL) { bus_dmamap_destroy(txr->vxtxr_txtag, txb->vtxb_dmamap); txb->vtxb_dmamap = NULL; } } if (txc->vxcr_u.txcd != NULL) { vmxnet3_dma_free(sc, &txc->vxcr_dma); txc->vxcr_u.txcd = NULL; } if (txr->vxtxr_txd != NULL) { vmxnet3_dma_free(sc, &txr->vxtxr_dma); txr->vxtxr_txd = NULL; } if (txr->vxtxr_txtag != NULL) { bus_dma_tag_destroy(txr->vxtxr_txtag); txr->vxtxr_txtag = NULL; } } } static int vmxnet3_alloc_rxq_data(struct vmxnet3_softc *sc) { device_t dev; struct vmxnet3_rxqueue *rxq; struct vmxnet3_rxring *rxr; struct vmxnet3_comp_ring *rxc; int descsz, compsz; int i, j, q, error; dev = sc->vmx_dev; for (q = 0; q < sc->vmx_nrxqueues; q++) { rxq = &sc->vmx_rxq[q]; rxc = &rxq->vxrxq_comp_ring; compsz = 0; for (i = 0; i < VMXNET3_RXRINGS_PERQ; i++) { rxr = &rxq->vxrxq_cmd_ring[i]; descsz = rxr->vxrxr_ndesc * sizeof(struct vmxnet3_rxdesc); compsz += rxr->vxrxr_ndesc * sizeof(struct vmxnet3_rxcompdesc); error = bus_dma_tag_create(bus_get_dma_tag(dev), 1, 0, /* alignment, boundary */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ MJUMPAGESIZE, /* maxsize */ 1, /* nsegments */ MJUMPAGESIZE, /* maxsegsize */ 0, /* flags */ NULL, NULL, /* lockfunc, lockarg */ &rxr->vxrxr_rxtag); if (error) { device_printf(dev, "unable to create Rx buffer tag for " "queue %d\n", q); return (error); } error = vmxnet3_dma_malloc(sc, descsz, 512, &rxr->vxrxr_dma); if (error) { device_printf(dev, "cannot allocate Rx " "descriptors for queue %d/%d error %d\n", i, q, error); return (error); } rxr->vxrxr_rxd = (struct vmxnet3_rxdesc *) rxr->vxrxr_dma.dma_vaddr; } error = vmxnet3_dma_malloc(sc, compsz, 512, &rxc->vxcr_dma); if (error) { device_printf(dev, "cannot alloc Rx comp descriptors " "for queue %d error %d\n", q, error); return (error); } rxc->vxcr_u.rxcd = (struct vmxnet3_rxcompdesc *) rxc->vxcr_dma.dma_vaddr; for (i = 0; i < VMXNET3_RXRINGS_PERQ; i++) { rxr = &rxq->vxrxq_cmd_ring[i]; error = bus_dmamap_create(rxr->vxrxr_rxtag, 0, &rxr->vxrxr_spare_dmap); if (error) { device_printf(dev, "unable to create spare " "dmamap for queue %d/%d error %d\n", q, i, error); return (error); } for (j = 0; j < rxr->vxrxr_ndesc; j++) { error = bus_dmamap_create(rxr->vxrxr_rxtag, 0, &rxr->vxrxr_rxbuf[j].vrxb_dmamap); if (error) { device_printf(dev, "unable to create " "dmamap for queue %d/%d slot %d " "error %d\n", q, i, j, error); return (error); } } } } return (0); } static void vmxnet3_free_rxq_data(struct vmxnet3_softc *sc) { device_t dev; struct vmxnet3_rxqueue *rxq; struct vmxnet3_rxring *rxr; struct vmxnet3_comp_ring *rxc; struct vmxnet3_rxbuf *rxb; int i, j, q; dev = sc->vmx_dev; for (q = 0; q < sc->vmx_nrxqueues; q++) { rxq = &sc->vmx_rxq[q]; rxc = &rxq->vxrxq_comp_ring; for (i = 0; i < VMXNET3_RXRINGS_PERQ; i++) { rxr = &rxq->vxrxq_cmd_ring[i]; if (rxr->vxrxr_spare_dmap != NULL) { bus_dmamap_destroy(rxr->vxrxr_rxtag, rxr->vxrxr_spare_dmap); rxr->vxrxr_spare_dmap = NULL; } for (j = 0; j < rxr->vxrxr_ndesc; j++) { rxb = &rxr->vxrxr_rxbuf[j]; if (rxb->vrxb_dmamap != NULL) { bus_dmamap_destroy(rxr->vxrxr_rxtag, rxb->vrxb_dmamap); rxb->vrxb_dmamap = NULL; } } } if (rxc->vxcr_u.rxcd != NULL) { vmxnet3_dma_free(sc, &rxc->vxcr_dma); rxc->vxcr_u.rxcd = NULL; } for (i = 0; i < VMXNET3_RXRINGS_PERQ; i++) { rxr = &rxq->vxrxq_cmd_ring[i]; if (rxr->vxrxr_rxd != NULL) { vmxnet3_dma_free(sc, &rxr->vxrxr_dma); rxr->vxrxr_rxd = NULL; } if (rxr->vxrxr_rxtag != NULL) { bus_dma_tag_destroy(rxr->vxrxr_rxtag); rxr->vxrxr_rxtag = NULL; } } } } static int vmxnet3_alloc_queue_data(struct vmxnet3_softc *sc) { int error; error = vmxnet3_alloc_txq_data(sc); if (error) return (error); error = vmxnet3_alloc_rxq_data(sc); if (error) return (error); return (0); } static void vmxnet3_free_queue_data(struct vmxnet3_softc *sc) { if (sc->vmx_rxq != NULL) vmxnet3_free_rxq_data(sc); if (sc->vmx_txq != NULL) vmxnet3_free_txq_data(sc); } static int vmxnet3_alloc_mcast_table(struct vmxnet3_softc *sc) { int error; error = vmxnet3_dma_malloc(sc, VMXNET3_MULTICAST_MAX * ETHER_ADDR_LEN, 32, &sc->vmx_mcast_dma); if (error) device_printf(sc->vmx_dev, "unable to alloc multicast table\n"); else sc->vmx_mcast = sc->vmx_mcast_dma.dma_vaddr; return (error); } static void vmxnet3_free_mcast_table(struct vmxnet3_softc *sc) { if (sc->vmx_mcast != NULL) { vmxnet3_dma_free(sc, &sc->vmx_mcast_dma); sc->vmx_mcast = NULL; } } static void vmxnet3_init_shared_data(struct vmxnet3_softc *sc) { struct vmxnet3_driver_shared *ds; struct vmxnet3_txqueue *txq; struct vmxnet3_txq_shared *txs; struct vmxnet3_rxqueue *rxq; struct vmxnet3_rxq_shared *rxs; int i; ds = sc->vmx_ds; /* * Initialize fields of the shared data that remains the same across * reinits. Note the shared data is zero'd when allocated. */ ds->magic = VMXNET3_REV1_MAGIC; /* DriverInfo */ ds->version = VMXNET3_DRIVER_VERSION; ds->guest = VMXNET3_GOS_FREEBSD | #ifdef __LP64__ VMXNET3_GOS_64BIT; #else VMXNET3_GOS_32BIT; #endif ds->vmxnet3_revision = 1; ds->upt_version = 1; /* Misc. conf */ ds->driver_data = vtophys(sc); ds->driver_data_len = sizeof(struct vmxnet3_softc); ds->queue_shared = sc->vmx_qs_dma.dma_paddr; ds->queue_shared_len = sc->vmx_qs_dma.dma_size; ds->nrxsg_max = sc->vmx_max_rxsegs; /* RSS conf */ if (sc->vmx_flags & VMXNET3_FLAG_RSS) { ds->rss.version = 1; ds->rss.paddr = sc->vmx_rss_dma.dma_paddr; ds->rss.len = sc->vmx_rss_dma.dma_size; } /* Interrupt control. */ ds->automask = sc->vmx_intr_mask_mode == VMXNET3_IMM_AUTO; ds->nintr = sc->vmx_nintrs; ds->evintr = sc->vmx_event_intr_idx; ds->ictrl = VMXNET3_ICTRL_DISABLE_ALL; for (i = 0; i < sc->vmx_nintrs; i++) ds->modlevel[i] = UPT1_IMOD_ADAPTIVE; /* Receive filter. */ ds->mcast_table = sc->vmx_mcast_dma.dma_paddr; ds->mcast_tablelen = sc->vmx_mcast_dma.dma_size; /* Tx queues */ for (i = 0; i < sc->vmx_ntxqueues; i++) { txq = &sc->vmx_txq[i]; txs = txq->vxtxq_ts; txs->cmd_ring = txq->vxtxq_cmd_ring.vxtxr_dma.dma_paddr; txs->cmd_ring_len = txq->vxtxq_cmd_ring.vxtxr_ndesc; txs->comp_ring = txq->vxtxq_comp_ring.vxcr_dma.dma_paddr; txs->comp_ring_len = txq->vxtxq_comp_ring.vxcr_ndesc; txs->driver_data = vtophys(txq); txs->driver_data_len = sizeof(struct vmxnet3_txqueue); } /* Rx queues */ for (i = 0; i < sc->vmx_nrxqueues; i++) { rxq = &sc->vmx_rxq[i]; rxs = rxq->vxrxq_rs; rxs->cmd_ring[0] = rxq->vxrxq_cmd_ring[0].vxrxr_dma.dma_paddr; rxs->cmd_ring_len[0] = rxq->vxrxq_cmd_ring[0].vxrxr_ndesc; rxs->cmd_ring[1] = rxq->vxrxq_cmd_ring[1].vxrxr_dma.dma_paddr; rxs->cmd_ring_len[1] = rxq->vxrxq_cmd_ring[1].vxrxr_ndesc; rxs->comp_ring = rxq->vxrxq_comp_ring.vxcr_dma.dma_paddr; rxs->comp_ring_len = rxq->vxrxq_comp_ring.vxcr_ndesc; rxs->driver_data = vtophys(rxq); rxs->driver_data_len = sizeof(struct vmxnet3_rxqueue); } } static void vmxnet3_reinit_interface(struct vmxnet3_softc *sc) { struct ifnet *ifp; ifp = sc->vmx_ifp; /* Use the current MAC address. */ bcopy(IF_LLADDR(sc->vmx_ifp), sc->vmx_lladdr, ETHER_ADDR_LEN); vmxnet3_set_lladdr(sc); ifp->if_hwassist = 0; if (ifp->if_capenable & IFCAP_TXCSUM) ifp->if_hwassist |= VMXNET3_CSUM_OFFLOAD; if (ifp->if_capenable & IFCAP_TXCSUM_IPV6) ifp->if_hwassist |= VMXNET3_CSUM_OFFLOAD_IPV6; if (ifp->if_capenable & IFCAP_TSO4) ifp->if_hwassist |= CSUM_IP_TSO; if (ifp->if_capenable & IFCAP_TSO6) ifp->if_hwassist |= CSUM_IP6_TSO; } static void vmxnet3_reinit_rss_shared_data(struct vmxnet3_softc *sc) { /* * Use the same key as the Linux driver until FreeBSD can do * RSS (presumably Toeplitz) in software. */ static const uint8_t rss_key[UPT1_RSS_MAX_KEY_SIZE] = { 0x3b, 0x56, 0xd1, 0x56, 0x13, 0x4a, 0xe7, 0xac, 0xe8, 0x79, 0x09, 0x75, 0xe8, 0x65, 0x79, 0x28, 0x35, 0x12, 0xb9, 0x56, 0x7c, 0x76, 0x4b, 0x70, 0xd8, 0x56, 0xa3, 0x18, 0x9b, 0x0a, 0xee, 0xf3, 0x96, 0xa6, 0x9f, 0x8f, 0x9e, 0x8c, 0x90, 0xc9, }; struct vmxnet3_driver_shared *ds; struct vmxnet3_rss_shared *rss; int i; ds = sc->vmx_ds; rss = sc->vmx_rss; rss->hash_type = UPT1_RSS_HASH_TYPE_IPV4 | UPT1_RSS_HASH_TYPE_TCP_IPV4 | UPT1_RSS_HASH_TYPE_IPV6 | UPT1_RSS_HASH_TYPE_TCP_IPV6; rss->hash_func = UPT1_RSS_HASH_FUNC_TOEPLITZ; rss->hash_key_size = UPT1_RSS_MAX_KEY_SIZE; rss->ind_table_size = UPT1_RSS_MAX_IND_TABLE_SIZE; memcpy(rss->hash_key, rss_key, UPT1_RSS_MAX_KEY_SIZE); for (i = 0; i < UPT1_RSS_MAX_IND_TABLE_SIZE; i++) rss->ind_table[i] = i % sc->vmx_nrxqueues; } static void vmxnet3_reinit_shared_data(struct vmxnet3_softc *sc) { struct ifnet *ifp; struct vmxnet3_driver_shared *ds; ifp = sc->vmx_ifp; ds = sc->vmx_ds; ds->mtu = ifp->if_mtu; ds->ntxqueue = sc->vmx_ntxqueues; ds->nrxqueue = sc->vmx_nrxqueues; ds->upt_features = 0; if (ifp->if_capenable & (IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6)) ds->upt_features |= UPT1_F_CSUM; if (ifp->if_capenable & IFCAP_VLAN_HWTAGGING) ds->upt_features |= UPT1_F_VLAN; if (ifp->if_capenable & IFCAP_LRO) ds->upt_features |= UPT1_F_LRO; if (sc->vmx_flags & VMXNET3_FLAG_RSS) { ds->upt_features |= UPT1_F_RSS; vmxnet3_reinit_rss_shared_data(sc); } vmxnet3_write_bar1(sc, VMXNET3_BAR1_DSL, sc->vmx_ds_dma.dma_paddr); vmxnet3_write_bar1(sc, VMXNET3_BAR1_DSH, (uint64_t) sc->vmx_ds_dma.dma_paddr >> 32); } static int vmxnet3_alloc_data(struct vmxnet3_softc *sc) { int error; error = vmxnet3_alloc_shared_data(sc); if (error) return (error); error = vmxnet3_alloc_queue_data(sc); if (error) return (error); error = vmxnet3_alloc_mcast_table(sc); if (error) return (error); vmxnet3_init_shared_data(sc); return (0); } static void vmxnet3_free_data(struct vmxnet3_softc *sc) { vmxnet3_free_mcast_table(sc); vmxnet3_free_queue_data(sc); vmxnet3_free_shared_data(sc); } static int vmxnet3_setup_interface(struct vmxnet3_softc *sc) { device_t dev; struct ifnet *ifp; dev = sc->vmx_dev; ifp = sc->vmx_ifp = if_alloc(IFT_ETHER); if (ifp == NULL) { device_printf(dev, "cannot allocate ifnet structure\n"); return (ENOSPC); } if_initname(ifp, device_get_name(dev), device_get_unit(dev)); #if __FreeBSD_version < 1000025 ifp->if_baudrate = 1000000000; #elif __FreeBSD_version < 1100011 if_initbaudrate(ifp, IF_Gbps(10)); #else ifp->if_baudrate = IF_Gbps(10); #endif ifp->if_softc = sc; ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; ifp->if_init = vmxnet3_init; ifp->if_ioctl = vmxnet3_ioctl; - - ifp->if_hw_tsomax = IF_HW_TSOMAX_BUILD_VALUE( - 65535 - sizeof(struct ether_vlan_header) /* bytes */, - VMXNET3_TX_MAXSEGS /* maximum frag count */, - VMXNET3_TX_MAXSEGSHIFT /* frag size */); + ifp->if_hw_tsomax = VMXNET3_TSO_MAXSIZE; #ifdef VMXNET3_LEGACY_TX ifp->if_start = vmxnet3_start; ifp->if_snd.ifq_drv_maxlen = sc->vmx_ntxdescs - 1; IFQ_SET_MAXLEN(&ifp->if_snd, sc->vmx_ntxdescs - 1); IFQ_SET_READY(&ifp->if_snd); #else ifp->if_transmit = vmxnet3_txq_mq_start; ifp->if_qflush = vmxnet3_qflush; #endif vmxnet3_get_lladdr(sc); ether_ifattach(ifp, sc->vmx_lladdr); ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_TXCSUM; ifp->if_capabilities |= IFCAP_RXCSUM_IPV6 | IFCAP_TXCSUM_IPV6; ifp->if_capabilities |= IFCAP_TSO4 | IFCAP_TSO6; ifp->if_capabilities |= IFCAP_VLAN_MTU | IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWCSUM; ifp->if_capenable = ifp->if_capabilities; /* These capabilities are not enabled by default. */ ifp->if_capabilities |= IFCAP_LRO | IFCAP_VLAN_HWFILTER; sc->vmx_vlan_attach = EVENTHANDLER_REGISTER(vlan_config, vmxnet3_register_vlan, sc, EVENTHANDLER_PRI_FIRST); sc->vmx_vlan_detach = EVENTHANDLER_REGISTER(vlan_config, vmxnet3_unregister_vlan, sc, EVENTHANDLER_PRI_FIRST); ifmedia_init(&sc->vmx_media, 0, vmxnet3_media_change, vmxnet3_media_status); ifmedia_add(&sc->vmx_media, IFM_ETHER | IFM_AUTO, 0, NULL); ifmedia_set(&sc->vmx_media, IFM_ETHER | IFM_AUTO); return (0); } static void vmxnet3_evintr(struct vmxnet3_softc *sc) { device_t dev; struct ifnet *ifp; struct vmxnet3_txq_shared *ts; struct vmxnet3_rxq_shared *rs; uint32_t event; int reset; dev = sc->vmx_dev; ifp = sc->vmx_ifp; reset = 0; VMXNET3_CORE_LOCK(sc); /* Clear events. */ event = sc->vmx_ds->event; vmxnet3_write_bar1(sc, VMXNET3_BAR1_EVENT, event); if (event & VMXNET3_EVENT_LINK) { vmxnet3_link_status(sc); if (sc->vmx_link_active != 0) vmxnet3_tx_start_all(sc); } if (event & (VMXNET3_EVENT_TQERROR | VMXNET3_EVENT_RQERROR)) { reset = 1; vmxnet3_read_cmd(sc, VMXNET3_CMD_GET_STATUS); ts = sc->vmx_txq[0].vxtxq_ts; if (ts->stopped != 0) device_printf(dev, "Tx queue error %#x\n", ts->error); rs = sc->vmx_rxq[0].vxrxq_rs; if (rs->stopped != 0) device_printf(dev, "Rx queue error %#x\n", rs->error); device_printf(dev, "Rx/Tx queue error event ... resetting\n"); } if (event & VMXNET3_EVENT_DIC) device_printf(dev, "device implementation change event\n"); if (event & VMXNET3_EVENT_DEBUG) device_printf(dev, "debug event\n"); if (reset != 0) { ifp->if_drv_flags &= ~IFF_DRV_RUNNING; vmxnet3_init_locked(sc); } VMXNET3_CORE_UNLOCK(sc); } static void vmxnet3_txq_eof(struct vmxnet3_txqueue *txq) { struct vmxnet3_softc *sc; struct ifnet *ifp; struct vmxnet3_txring *txr; struct vmxnet3_comp_ring *txc; struct vmxnet3_txcompdesc *txcd; struct vmxnet3_txbuf *txb; struct mbuf *m; u_int sop; sc = txq->vxtxq_sc; ifp = sc->vmx_ifp; txr = &txq->vxtxq_cmd_ring; txc = &txq->vxtxq_comp_ring; VMXNET3_TXQ_LOCK_ASSERT(txq); for (;;) { txcd = &txc->vxcr_u.txcd[txc->vxcr_next]; if (txcd->gen != txc->vxcr_gen) break; vmxnet3_barrier(sc, VMXNET3_BARRIER_RD); if (++txc->vxcr_next == txc->vxcr_ndesc) { txc->vxcr_next = 0; txc->vxcr_gen ^= 1; } sop = txr->vxtxr_next; txb = &txr->vxtxr_txbuf[sop]; if ((m = txb->vtxb_m) != NULL) { bus_dmamap_sync(txr->vxtxr_txtag, txb->vtxb_dmamap, BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(txr->vxtxr_txtag, txb->vtxb_dmamap); txq->vxtxq_stats.vmtxs_opackets++; txq->vxtxq_stats.vmtxs_obytes += m->m_pkthdr.len; if (m->m_flags & M_MCAST) txq->vxtxq_stats.vmtxs_omcasts++; m_freem(m); txb->vtxb_m = NULL; } txr->vxtxr_next = (txcd->eop_idx + 1) % txr->vxtxr_ndesc; } if (txr->vxtxr_head == txr->vxtxr_next) txq->vxtxq_watchdog = 0; } static int vmxnet3_newbuf(struct vmxnet3_softc *sc, struct vmxnet3_rxring *rxr) { struct ifnet *ifp; struct mbuf *m; struct vmxnet3_rxdesc *rxd; struct vmxnet3_rxbuf *rxb; bus_dma_tag_t tag; bus_dmamap_t dmap; bus_dma_segment_t segs[1]; int idx, clsize, btype, flags, nsegs, error; ifp = sc->vmx_ifp; tag = rxr->vxrxr_rxtag; dmap = rxr->vxrxr_spare_dmap; idx = rxr->vxrxr_fill; rxd = &rxr->vxrxr_rxd[idx]; rxb = &rxr->vxrxr_rxbuf[idx]; #ifdef VMXNET3_FAILPOINTS KFAIL_POINT_CODE(VMXNET3_FP, newbuf, return ENOBUFS); if (rxr->vxrxr_rid != 0) KFAIL_POINT_CODE(VMXNET3_FP, newbuf_body_only, return ENOBUFS); #endif if (rxr->vxrxr_rid == 0 && (idx % sc->vmx_rx_max_chain) == 0) { flags = M_PKTHDR; clsize = MCLBYTES; btype = VMXNET3_BTYPE_HEAD; } else { #if __FreeBSD_version < 902001 /* * These mbufs will never be used for the start of a frame. * Roughly prior to branching releng/9.2, the load_mbuf_sg() * required the mbuf to always be a packet header. Avoid * unnecessary mbuf initialization in newer versions where * that is not the case. */ flags = M_PKTHDR; #else flags = 0; #endif clsize = MJUMPAGESIZE; btype = VMXNET3_BTYPE_BODY; } m = m_getjcl(M_NOWAIT, MT_DATA, flags, clsize); if (m == NULL) { sc->vmx_stats.vmst_mgetcl_failed++; return (ENOBUFS); } if (btype == VMXNET3_BTYPE_HEAD) { m->m_len = m->m_pkthdr.len = clsize; m_adj(m, ETHER_ALIGN); } else m->m_len = clsize; error = bus_dmamap_load_mbuf_sg(tag, dmap, m, &segs[0], &nsegs, BUS_DMA_NOWAIT); if (error) { m_freem(m); sc->vmx_stats.vmst_mbuf_load_failed++; return (error); } KASSERT(nsegs == 1, ("%s: mbuf %p with too many segments %d", __func__, m, nsegs)); #if __FreeBSD_version < 902001 if (btype == VMXNET3_BTYPE_BODY) m->m_flags &= ~M_PKTHDR; #endif if (rxb->vrxb_m != NULL) { bus_dmamap_sync(tag, rxb->vrxb_dmamap, BUS_DMASYNC_POSTREAD); bus_dmamap_unload(tag, rxb->vrxb_dmamap); } rxr->vxrxr_spare_dmap = rxb->vrxb_dmamap; rxb->vrxb_dmamap = dmap; rxb->vrxb_m = m; rxd->addr = segs[0].ds_addr; rxd->len = segs[0].ds_len; rxd->btype = btype; rxd->gen = rxr->vxrxr_gen; vmxnet3_rxr_increment_fill(rxr); return (0); } static void vmxnet3_rxq_eof_discard(struct vmxnet3_rxqueue *rxq, struct vmxnet3_rxring *rxr, int idx) { struct vmxnet3_rxdesc *rxd; rxd = &rxr->vxrxr_rxd[idx]; rxd->gen = rxr->vxrxr_gen; vmxnet3_rxr_increment_fill(rxr); } static void vmxnet3_rxq_discard_chain(struct vmxnet3_rxqueue *rxq) { struct vmxnet3_softc *sc; struct vmxnet3_rxring *rxr; struct vmxnet3_comp_ring *rxc; struct vmxnet3_rxcompdesc *rxcd; int idx, eof; sc = rxq->vxrxq_sc; rxc = &rxq->vxrxq_comp_ring; do { rxcd = &rxc->vxcr_u.rxcd[rxc->vxcr_next]; if (rxcd->gen != rxc->vxcr_gen) break; /* Not expected. */ vmxnet3_barrier(sc, VMXNET3_BARRIER_RD); if (++rxc->vxcr_next == rxc->vxcr_ndesc) { rxc->vxcr_next = 0; rxc->vxcr_gen ^= 1; } idx = rxcd->rxd_idx; eof = rxcd->eop; if (rxcd->qid < sc->vmx_nrxqueues) rxr = &rxq->vxrxq_cmd_ring[0]; else rxr = &rxq->vxrxq_cmd_ring[1]; vmxnet3_rxq_eof_discard(rxq, rxr, idx); } while (!eof); } static void vmxnet3_rx_csum(struct vmxnet3_rxcompdesc *rxcd, struct mbuf *m) { if (rxcd->ipv4) { m->m_pkthdr.csum_flags |= CSUM_IP_CHECKED; if (rxcd->ipcsum_ok) m->m_pkthdr.csum_flags |= CSUM_IP_VALID; } if (!rxcd->fragment) { if (rxcd->csum_ok && (rxcd->tcp || rxcd->udp)) { m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR; m->m_pkthdr.csum_data = 0xFFFF; } } } static void vmxnet3_rxq_input(struct vmxnet3_rxqueue *rxq, struct vmxnet3_rxcompdesc *rxcd, struct mbuf *m) { struct vmxnet3_softc *sc; struct ifnet *ifp; sc = rxq->vxrxq_sc; ifp = sc->vmx_ifp; if (rxcd->error) { rxq->vxrxq_stats.vmrxs_ierrors++; m_freem(m); return; } #ifdef notyet switch (rxcd->rss_type) { case VMXNET3_RCD_RSS_TYPE_IPV4: m->m_pkthdr.flowid = rxcd->rss_hash; M_HASHTYPE_SET(m, M_HASHTYPE_RSS_IPV4); break; case VMXNET3_RCD_RSS_TYPE_TCPIPV4: m->m_pkthdr.flowid = rxcd->rss_hash; M_HASHTYPE_SET(m, M_HASHTYPE_RSS_TCP_IPV4); break; case VMXNET3_RCD_RSS_TYPE_IPV6: m->m_pkthdr.flowid = rxcd->rss_hash; M_HASHTYPE_SET(m, M_HASHTYPE_RSS_IPV6); break; case VMXNET3_RCD_RSS_TYPE_TCPIPV6: m->m_pkthdr.flowid = rxcd->rss_hash; M_HASHTYPE_SET(m, M_HASHTYPE_RSS_TCP_IPV6); break; default: /* VMXNET3_RCD_RSS_TYPE_NONE */ m->m_pkthdr.flowid = rxq->vxrxq_id; M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE); break; } #else m->m_pkthdr.flowid = rxq->vxrxq_id; m->m_flags |= M_FLOWID; #endif if (!rxcd->no_csum) vmxnet3_rx_csum(rxcd, m); if (rxcd->vlan) { m->m_flags |= M_VLANTAG; m->m_pkthdr.ether_vtag = rxcd->vtag; } rxq->vxrxq_stats.vmrxs_ipackets++; rxq->vxrxq_stats.vmrxs_ibytes += m->m_pkthdr.len; VMXNET3_RXQ_UNLOCK(rxq); (*ifp->if_input)(ifp, m); VMXNET3_RXQ_LOCK(rxq); } static void vmxnet3_rxq_eof(struct vmxnet3_rxqueue *rxq) { struct vmxnet3_softc *sc; struct ifnet *ifp; struct vmxnet3_rxring *rxr; struct vmxnet3_comp_ring *rxc; struct vmxnet3_rxdesc *rxd; struct vmxnet3_rxcompdesc *rxcd; struct mbuf *m, *m_head, *m_tail; int idx, length; sc = rxq->vxrxq_sc; ifp = sc->vmx_ifp; rxc = &rxq->vxrxq_comp_ring; VMXNET3_RXQ_LOCK_ASSERT(rxq); if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) return; m_head = rxq->vxrxq_mhead; rxq->vxrxq_mhead = NULL; m_tail = rxq->vxrxq_mtail; rxq->vxrxq_mtail = NULL; MPASS(m_head == NULL || m_tail != NULL); for (;;) { rxcd = &rxc->vxcr_u.rxcd[rxc->vxcr_next]; if (rxcd->gen != rxc->vxcr_gen) { rxq->vxrxq_mhead = m_head; rxq->vxrxq_mtail = m_tail; break; } vmxnet3_barrier(sc, VMXNET3_BARRIER_RD); if (++rxc->vxcr_next == rxc->vxcr_ndesc) { rxc->vxcr_next = 0; rxc->vxcr_gen ^= 1; } idx = rxcd->rxd_idx; length = rxcd->len; if (rxcd->qid < sc->vmx_nrxqueues) rxr = &rxq->vxrxq_cmd_ring[0]; else rxr = &rxq->vxrxq_cmd_ring[1]; rxd = &rxr->vxrxr_rxd[idx]; m = rxr->vxrxr_rxbuf[idx].vrxb_m; KASSERT(m != NULL, ("%s: queue %d idx %d without mbuf", __func__, rxcd->qid, idx)); /* * The host may skip descriptors. We detect this when this * descriptor does not match the previous fill index. Catch * up with the host now. */ if (__predict_false(rxr->vxrxr_fill != idx)) { while (rxr->vxrxr_fill != idx) { rxr->vxrxr_rxd[rxr->vxrxr_fill].gen = rxr->vxrxr_gen; vmxnet3_rxr_increment_fill(rxr); } } if (rxcd->sop) { KASSERT(rxd->btype == VMXNET3_BTYPE_HEAD, ("%s: start of frame w/o head buffer", __func__)); KASSERT(rxr == &rxq->vxrxq_cmd_ring[0], ("%s: start of frame not in ring 0", __func__)); KASSERT((idx % sc->vmx_rx_max_chain) == 0, ("%s: start of frame at unexcepted index %d (%d)", __func__, idx, sc->vmx_rx_max_chain)); KASSERT(m_head == NULL, ("%s: duplicate start of frame?", __func__)); if (length == 0) { /* Just ignore this descriptor. */ vmxnet3_rxq_eof_discard(rxq, rxr, idx); goto nextp; } if (vmxnet3_newbuf(sc, rxr) != 0) { rxq->vxrxq_stats.vmrxs_iqdrops++; vmxnet3_rxq_eof_discard(rxq, rxr, idx); if (!rxcd->eop) vmxnet3_rxq_discard_chain(rxq); goto nextp; } m->m_pkthdr.rcvif = ifp; m->m_pkthdr.len = m->m_len = length; m->m_pkthdr.csum_flags = 0; m_head = m_tail = m; } else { KASSERT(rxd->btype == VMXNET3_BTYPE_BODY, ("%s: non start of frame w/o body buffer", __func__)); KASSERT(m_head != NULL, ("%s: frame not started?", __func__)); if (vmxnet3_newbuf(sc, rxr) != 0) { rxq->vxrxq_stats.vmrxs_iqdrops++; vmxnet3_rxq_eof_discard(rxq, rxr, idx); if (!rxcd->eop) vmxnet3_rxq_discard_chain(rxq); m_freem(m_head); m_head = m_tail = NULL; goto nextp; } m->m_len = length; m_head->m_pkthdr.len += length; m_tail->m_next = m; m_tail = m; } if (rxcd->eop) { vmxnet3_rxq_input(rxq, rxcd, m_head); m_head = m_tail = NULL; /* Must recheck after dropping the Rx lock. */ if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) break; } nextp: if (__predict_false(rxq->vxrxq_rs->update_rxhead)) { int qid = rxcd->qid; bus_size_t r; idx = (idx + 1) % rxr->vxrxr_ndesc; if (qid >= sc->vmx_nrxqueues) { qid -= sc->vmx_nrxqueues; r = VMXNET3_BAR0_RXH2(qid); } else r = VMXNET3_BAR0_RXH1(qid); vmxnet3_write_bar0(sc, r, idx); } } } static void vmxnet3_legacy_intr(void *xsc) { struct vmxnet3_softc *sc; struct vmxnet3_rxqueue *rxq; struct vmxnet3_txqueue *txq; sc = xsc; rxq = &sc->vmx_rxq[0]; txq = &sc->vmx_txq[0]; if (sc->vmx_intr_type == VMXNET3_IT_LEGACY) { if (vmxnet3_read_bar1(sc, VMXNET3_BAR1_INTR) == 0) return; } if (sc->vmx_intr_mask_mode == VMXNET3_IMM_ACTIVE) vmxnet3_disable_all_intrs(sc); if (sc->vmx_ds->event != 0) vmxnet3_evintr(sc); VMXNET3_RXQ_LOCK(rxq); vmxnet3_rxq_eof(rxq); VMXNET3_RXQ_UNLOCK(rxq); VMXNET3_TXQ_LOCK(txq); vmxnet3_txq_eof(txq); vmxnet3_txq_start(txq); VMXNET3_TXQ_UNLOCK(txq); vmxnet3_enable_all_intrs(sc); } static void vmxnet3_txq_intr(void *xtxq) { struct vmxnet3_softc *sc; struct vmxnet3_txqueue *txq; txq = xtxq; sc = txq->vxtxq_sc; if (sc->vmx_intr_mask_mode == VMXNET3_IMM_ACTIVE) vmxnet3_disable_intr(sc, txq->vxtxq_intr_idx); VMXNET3_TXQ_LOCK(txq); vmxnet3_txq_eof(txq); vmxnet3_txq_start(txq); VMXNET3_TXQ_UNLOCK(txq); vmxnet3_enable_intr(sc, txq->vxtxq_intr_idx); } static void vmxnet3_rxq_intr(void *xrxq) { struct vmxnet3_softc *sc; struct vmxnet3_rxqueue *rxq; rxq = xrxq; sc = rxq->vxrxq_sc; if (sc->vmx_intr_mask_mode == VMXNET3_IMM_ACTIVE) vmxnet3_disable_intr(sc, rxq->vxrxq_intr_idx); VMXNET3_RXQ_LOCK(rxq); vmxnet3_rxq_eof(rxq); VMXNET3_RXQ_UNLOCK(rxq); vmxnet3_enable_intr(sc, rxq->vxrxq_intr_idx); } static void vmxnet3_event_intr(void *xsc) { struct vmxnet3_softc *sc; sc = xsc; if (sc->vmx_intr_mask_mode == VMXNET3_IMM_ACTIVE) vmxnet3_disable_intr(sc, sc->vmx_event_intr_idx); if (sc->vmx_ds->event != 0) vmxnet3_evintr(sc); vmxnet3_enable_intr(sc, sc->vmx_event_intr_idx); } static void vmxnet3_txstop(struct vmxnet3_softc *sc, struct vmxnet3_txqueue *txq) { struct vmxnet3_txring *txr; struct vmxnet3_txbuf *txb; int i; txr = &txq->vxtxq_cmd_ring; for (i = 0; i < txr->vxtxr_ndesc; i++) { txb = &txr->vxtxr_txbuf[i]; if (txb->vtxb_m == NULL) continue; bus_dmamap_sync(txr->vxtxr_txtag, txb->vtxb_dmamap, BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(txr->vxtxr_txtag, txb->vtxb_dmamap); m_freem(txb->vtxb_m); txb->vtxb_m = NULL; } } static void vmxnet3_rxstop(struct vmxnet3_softc *sc, struct vmxnet3_rxqueue *rxq) { struct vmxnet3_rxring *rxr; struct vmxnet3_rxbuf *rxb; int i, j; if (rxq->vxrxq_mhead != NULL) { m_freem(rxq->vxrxq_mhead); rxq->vxrxq_mhead = NULL; rxq->vxrxq_mtail = NULL; } for (i = 0; i < VMXNET3_RXRINGS_PERQ; i++) { rxr = &rxq->vxrxq_cmd_ring[i]; for (j = 0; j < rxr->vxrxr_ndesc; j++) { rxb = &rxr->vxrxr_rxbuf[j]; if (rxb->vrxb_m == NULL) continue; bus_dmamap_sync(rxr->vxrxr_rxtag, rxb->vrxb_dmamap, BUS_DMASYNC_POSTREAD); bus_dmamap_unload(rxr->vxrxr_rxtag, rxb->vrxb_dmamap); m_freem(rxb->vrxb_m); rxb->vrxb_m = NULL; } } } static void vmxnet3_stop_rendezvous(struct vmxnet3_softc *sc) { struct vmxnet3_rxqueue *rxq; struct vmxnet3_txqueue *txq; int i; for (i = 0; i < sc->vmx_nrxqueues; i++) { rxq = &sc->vmx_rxq[i]; VMXNET3_RXQ_LOCK(rxq); VMXNET3_RXQ_UNLOCK(rxq); } for (i = 0; i < sc->vmx_ntxqueues; i++) { txq = &sc->vmx_txq[i]; VMXNET3_TXQ_LOCK(txq); VMXNET3_TXQ_UNLOCK(txq); } } static void vmxnet3_stop(struct vmxnet3_softc *sc) { struct ifnet *ifp; int q; ifp = sc->vmx_ifp; VMXNET3_CORE_LOCK_ASSERT(sc); ifp->if_drv_flags &= ~IFF_DRV_RUNNING; sc->vmx_link_active = 0; callout_stop(&sc->vmx_tick); /* Disable interrupts. */ vmxnet3_disable_all_intrs(sc); vmxnet3_write_cmd(sc, VMXNET3_CMD_DISABLE); vmxnet3_stop_rendezvous(sc); for (q = 0; q < sc->vmx_ntxqueues; q++) vmxnet3_txstop(sc, &sc->vmx_txq[q]); for (q = 0; q < sc->vmx_nrxqueues; q++) vmxnet3_rxstop(sc, &sc->vmx_rxq[q]); vmxnet3_write_cmd(sc, VMXNET3_CMD_RESET); } static void vmxnet3_txinit(struct vmxnet3_softc *sc, struct vmxnet3_txqueue *txq) { struct vmxnet3_txring *txr; struct vmxnet3_comp_ring *txc; txr = &txq->vxtxq_cmd_ring; txr->vxtxr_head = 0; txr->vxtxr_next = 0; txr->vxtxr_gen = VMXNET3_INIT_GEN; bzero(txr->vxtxr_txd, txr->vxtxr_ndesc * sizeof(struct vmxnet3_txdesc)); txc = &txq->vxtxq_comp_ring; txc->vxcr_next = 0; txc->vxcr_gen = VMXNET3_INIT_GEN; bzero(txc->vxcr_u.txcd, txc->vxcr_ndesc * sizeof(struct vmxnet3_txcompdesc)); } static int vmxnet3_rxinit(struct vmxnet3_softc *sc, struct vmxnet3_rxqueue *rxq) { struct ifnet *ifp; struct vmxnet3_rxring *rxr; struct vmxnet3_comp_ring *rxc; int i, populate, idx, frame_size, error; ifp = sc->vmx_ifp; frame_size = ETHER_ALIGN + sizeof(struct ether_vlan_header) + ifp->if_mtu; /* * If the MTU causes us to exceed what a regular sized cluster can * handle, we allocate a second MJUMPAGESIZE cluster after it in * ring 0. If in use, ring 1 always contains MJUMPAGESIZE clusters. * * Keep rx_max_chain a divisor of the maximum Rx ring size to make * our life easier. We do not support changing the ring size after * the attach. */ if (frame_size <= MCLBYTES) sc->vmx_rx_max_chain = 1; else sc->vmx_rx_max_chain = 2; /* * Only populate ring 1 if the configuration will take advantage * of it. That is either when LRO is enabled or the frame size * exceeds what ring 0 can contain. */ if ((ifp->if_capenable & IFCAP_LRO) == 0 && frame_size <= MCLBYTES + MJUMPAGESIZE) populate = 1; else populate = VMXNET3_RXRINGS_PERQ; for (i = 0; i < populate; i++) { rxr = &rxq->vxrxq_cmd_ring[i]; rxr->vxrxr_fill = 0; rxr->vxrxr_gen = VMXNET3_INIT_GEN; bzero(rxr->vxrxr_rxd, rxr->vxrxr_ndesc * sizeof(struct vmxnet3_rxdesc)); for (idx = 0; idx < rxr->vxrxr_ndesc; idx++) { error = vmxnet3_newbuf(sc, rxr); if (error) return (error); } } for (/**/; i < VMXNET3_RXRINGS_PERQ; i++) { rxr = &rxq->vxrxq_cmd_ring[i]; rxr->vxrxr_fill = 0; rxr->vxrxr_gen = 0; bzero(rxr->vxrxr_rxd, rxr->vxrxr_ndesc * sizeof(struct vmxnet3_rxdesc)); } rxc = &rxq->vxrxq_comp_ring; rxc->vxcr_next = 0; rxc->vxcr_gen = VMXNET3_INIT_GEN; bzero(rxc->vxcr_u.rxcd, rxc->vxcr_ndesc * sizeof(struct vmxnet3_rxcompdesc)); return (0); } static int vmxnet3_reinit_queues(struct vmxnet3_softc *sc) { device_t dev; int q, error; dev = sc->vmx_dev; for (q = 0; q < sc->vmx_ntxqueues; q++) vmxnet3_txinit(sc, &sc->vmx_txq[q]); for (q = 0; q < sc->vmx_nrxqueues; q++) { error = vmxnet3_rxinit(sc, &sc->vmx_rxq[q]); if (error) { device_printf(dev, "cannot populate Rx queue %d\n", q); return (error); } } return (0); } static int vmxnet3_enable_device(struct vmxnet3_softc *sc) { int q; if (vmxnet3_read_cmd(sc, VMXNET3_CMD_ENABLE) != 0) { device_printf(sc->vmx_dev, "device enable command failed!\n"); return (1); } /* Reset the Rx queue heads. */ for (q = 0; q < sc->vmx_nrxqueues; q++) { vmxnet3_write_bar0(sc, VMXNET3_BAR0_RXH1(q), 0); vmxnet3_write_bar0(sc, VMXNET3_BAR0_RXH2(q), 0); } return (0); } static void vmxnet3_reinit_rxfilters(struct vmxnet3_softc *sc) { struct ifnet *ifp; ifp = sc->vmx_ifp; vmxnet3_set_rxfilter(sc); if (ifp->if_capenable & IFCAP_VLAN_HWFILTER) bcopy(sc->vmx_vlan_filter, sc->vmx_ds->vlan_filter, sizeof(sc->vmx_ds->vlan_filter)); else bzero(sc->vmx_ds->vlan_filter, sizeof(sc->vmx_ds->vlan_filter)); vmxnet3_write_cmd(sc, VMXNET3_CMD_VLAN_FILTER); } static int vmxnet3_reinit(struct vmxnet3_softc *sc) { vmxnet3_reinit_interface(sc); vmxnet3_reinit_shared_data(sc); if (vmxnet3_reinit_queues(sc) != 0) return (ENXIO); if (vmxnet3_enable_device(sc) != 0) return (ENXIO); vmxnet3_reinit_rxfilters(sc); return (0); } static void vmxnet3_init_locked(struct vmxnet3_softc *sc) { struct ifnet *ifp; ifp = sc->vmx_ifp; if (ifp->if_drv_flags & IFF_DRV_RUNNING) return; vmxnet3_stop(sc); if (vmxnet3_reinit(sc) != 0) { vmxnet3_stop(sc); return; } ifp->if_drv_flags |= IFF_DRV_RUNNING; vmxnet3_link_status(sc); vmxnet3_enable_all_intrs(sc); callout_reset(&sc->vmx_tick, hz, vmxnet3_tick, sc); } static void vmxnet3_init(void *xsc) { struct vmxnet3_softc *sc; sc = xsc; VMXNET3_CORE_LOCK(sc); vmxnet3_init_locked(sc); VMXNET3_CORE_UNLOCK(sc); } /* * BMV: Much of this can go away once we finally have offsets in * the mbuf packet header. Bug andre@. */ static int vmxnet3_txq_offload_ctx(struct vmxnet3_txqueue *txq, struct mbuf *m, int *etype, int *proto, int *start) { struct ether_vlan_header *evh; int offset; #if defined(INET) struct ip *ip = NULL; struct ip iphdr; #endif #if defined(INET6) struct ip6_hdr *ip6 = NULL; struct ip6_hdr ip6hdr; #endif evh = mtod(m, struct ether_vlan_header *); if (evh->evl_encap_proto == htons(ETHERTYPE_VLAN)) { /* BMV: We should handle nested VLAN tags too. */ *etype = ntohs(evh->evl_proto); offset = sizeof(struct ether_vlan_header); } else { *etype = ntohs(evh->evl_encap_proto); offset = sizeof(struct ether_header); } switch (*etype) { #if defined(INET) case ETHERTYPE_IP: if (__predict_false(m->m_len < offset + sizeof(struct ip))) { m_copydata(m, offset, sizeof(struct ip), (caddr_t) &iphdr); ip = &iphdr; } else ip = mtodo(m, offset); *proto = ip->ip_p; *start = offset + (ip->ip_hl << 2); break; #endif #if defined(INET6) case ETHERTYPE_IPV6: if (__predict_false(m->m_len < offset + sizeof(struct ip6_hdr))) { m_copydata(m, offset, sizeof(struct ip6_hdr), (caddr_t) &ip6hdr); ip6 = &ip6hdr; } else ip6 = mtodo(m, offset); *proto = -1; *start = ip6_lasthdr(m, offset, IPPROTO_IPV6, proto); /* Assert the network stack sent us a valid packet. */ KASSERT(*start > offset, ("%s: mbuf %p start %d offset %d proto %d", __func__, m, *start, offset, *proto)); break; #endif default: return (EINVAL); } if (m->m_pkthdr.csum_flags & CSUM_TSO) { struct tcphdr *tcp, tcphdr; uint16_t sum; if (__predict_false(*proto != IPPROTO_TCP)) { /* Likely failed to correctly parse the mbuf. */ return (EINVAL); } txq->vxtxq_stats.vmtxs_tso++; switch (*etype) { #if defined(INET) case ETHERTYPE_IP: sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(IPPROTO_TCP)); break; #endif #if defined(INET6) case ETHERTYPE_IPV6: sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0); break; #endif default: sum = 0; break; } if (m->m_len < *start + sizeof(struct tcphdr)) { m_copyback(m, *start + offsetof(struct tcphdr, th_sum), sizeof(uint16_t), (caddr_t) &sum); m_copydata(m, *start, sizeof(struct tcphdr), (caddr_t) &tcphdr); tcp = &tcphdr; } else { tcp = mtodo(m, *start); tcp->th_sum = sum; } /* * For TSO, the size of the protocol header is also * included in the descriptor header size. */ *start += (tcp->th_off << 2); } else txq->vxtxq_stats.vmtxs_csum++; return (0); } static int vmxnet3_txq_load_mbuf(struct vmxnet3_txqueue *txq, struct mbuf **m0, bus_dmamap_t dmap, bus_dma_segment_t segs[], int *nsegs) { struct vmxnet3_txring *txr; struct mbuf *m; bus_dma_tag_t tag; int error; txr = &txq->vxtxq_cmd_ring; m = *m0; tag = txr->vxtxr_txtag; error = bus_dmamap_load_mbuf_sg(tag, dmap, m, segs, nsegs, 0); if (error == 0 || error != EFBIG) return (error); m = m_defrag(m, M_NOWAIT); if (m != NULL) { *m0 = m; error = bus_dmamap_load_mbuf_sg(tag, dmap, m, segs, nsegs, 0); } else error = ENOBUFS; if (error) { m_freem(*m0); *m0 = NULL; txq->vxtxq_sc->vmx_stats.vmst_defrag_failed++; } else txq->vxtxq_sc->vmx_stats.vmst_defragged++; return (error); } static void vmxnet3_txq_unload_mbuf(struct vmxnet3_txqueue *txq, bus_dmamap_t dmap) { struct vmxnet3_txring *txr; txr = &txq->vxtxq_cmd_ring; bus_dmamap_unload(txr->vxtxr_txtag, dmap); } static int vmxnet3_txq_encap(struct vmxnet3_txqueue *txq, struct mbuf **m0) { struct vmxnet3_softc *sc; struct vmxnet3_txring *txr; struct vmxnet3_txdesc *txd, *sop; struct mbuf *m; bus_dmamap_t dmap; bus_dma_segment_t segs[VMXNET3_TX_MAXSEGS]; int i, gen, nsegs, etype, proto, start, error; sc = txq->vxtxq_sc; start = 0; txd = NULL; txr = &txq->vxtxq_cmd_ring; dmap = txr->vxtxr_txbuf[txr->vxtxr_head].vtxb_dmamap; error = vmxnet3_txq_load_mbuf(txq, m0, dmap, segs, &nsegs); if (error) return (error); m = *m0; M_ASSERTPKTHDR(m); KASSERT(nsegs <= VMXNET3_TX_MAXSEGS, ("%s: mbuf %p with too many segments %d", __func__, m, nsegs)); if (VMXNET3_TXRING_AVAIL(txr) < nsegs) { txq->vxtxq_stats.vmtxs_full++; vmxnet3_txq_unload_mbuf(txq, dmap); return (ENOSPC); } else if (m->m_pkthdr.csum_flags & VMXNET3_CSUM_ALL_OFFLOAD) { error = vmxnet3_txq_offload_ctx(txq, m, &etype, &proto, &start); if (error) { txq->vxtxq_stats.vmtxs_offload_failed++; vmxnet3_txq_unload_mbuf(txq, dmap); m_freem(m); *m0 = NULL; return (error); } } txr->vxtxr_txbuf[txr->vxtxr_head].vtxb_m = m; sop = &txr->vxtxr_txd[txr->vxtxr_head]; gen = txr->vxtxr_gen ^ 1; /* Owned by cpu (yet) */ for (i = 0; i < nsegs; i++) { txd = &txr->vxtxr_txd[txr->vxtxr_head]; txd->addr = segs[i].ds_addr; txd->len = segs[i].ds_len; txd->gen = gen; txd->dtype = 0; txd->offload_mode = VMXNET3_OM_NONE; txd->offload_pos = 0; txd->hlen = 0; txd->eop = 0; txd->compreq = 0; txd->vtag_mode = 0; txd->vtag = 0; if (++txr->vxtxr_head == txr->vxtxr_ndesc) { txr->vxtxr_head = 0; txr->vxtxr_gen ^= 1; } gen = txr->vxtxr_gen; } txd->eop = 1; txd->compreq = 1; if (m->m_flags & M_VLANTAG) { sop->vtag_mode = 1; sop->vtag = m->m_pkthdr.ether_vtag; } if (m->m_pkthdr.csum_flags & CSUM_TSO) { sop->offload_mode = VMXNET3_OM_TSO; sop->hlen = start; sop->offload_pos = m->m_pkthdr.tso_segsz; } else if (m->m_pkthdr.csum_flags & (VMXNET3_CSUM_OFFLOAD | VMXNET3_CSUM_OFFLOAD_IPV6)) { sop->offload_mode = VMXNET3_OM_CSUM; sop->hlen = start; sop->offload_pos = start + m->m_pkthdr.csum_data; } /* Finally, change the ownership. */ vmxnet3_barrier(sc, VMXNET3_BARRIER_WR); sop->gen ^= 1; txq->vxtxq_ts->npending += nsegs; if (txq->vxtxq_ts->npending >= txq->vxtxq_ts->intr_threshold) { txq->vxtxq_ts->npending = 0; vmxnet3_write_bar0(sc, VMXNET3_BAR0_TXH(txq->vxtxq_id), txr->vxtxr_head); } return (0); } #ifdef VMXNET3_LEGACY_TX static void vmxnet3_start_locked(struct ifnet *ifp) { struct vmxnet3_softc *sc; struct vmxnet3_txqueue *txq; struct vmxnet3_txring *txr; struct mbuf *m_head; int tx, avail; sc = ifp->if_softc; txq = &sc->vmx_txq[0]; txr = &txq->vxtxq_cmd_ring; tx = 0; VMXNET3_TXQ_LOCK_ASSERT(txq); if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || sc->vmx_link_active == 0) return; while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) { if ((avail = VMXNET3_TXRING_AVAIL(txr)) < 2) break; IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head); if (m_head == NULL) break; /* Assume worse case if this mbuf is the head of a chain. */ if (m_head->m_next != NULL && avail < VMXNET3_TX_MAXSEGS) { IFQ_DRV_PREPEND(&ifp->if_snd, m_head); break; } if (vmxnet3_txq_encap(txq, &m_head) != 0) { if (m_head != NULL) IFQ_DRV_PREPEND(&ifp->if_snd, m_head); break; } tx++; ETHER_BPF_MTAP(ifp, m_head); } if (tx > 0) txq->vxtxq_watchdog = VMXNET3_WATCHDOG_TIMEOUT; } static void vmxnet3_start(struct ifnet *ifp) { struct vmxnet3_softc *sc; struct vmxnet3_txqueue *txq; sc = ifp->if_softc; txq = &sc->vmx_txq[0]; VMXNET3_TXQ_LOCK(txq); vmxnet3_start_locked(ifp); VMXNET3_TXQ_UNLOCK(txq); } #else /* !VMXNET3_LEGACY_TX */ static int vmxnet3_txq_mq_start_locked(struct vmxnet3_txqueue *txq, struct mbuf *m) { struct vmxnet3_softc *sc; struct vmxnet3_txring *txr; struct buf_ring *br; struct ifnet *ifp; int tx, avail, error; sc = txq->vxtxq_sc; br = txq->vxtxq_br; ifp = sc->vmx_ifp; txr = &txq->vxtxq_cmd_ring; tx = 0; error = 0; VMXNET3_TXQ_LOCK_ASSERT(txq); if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || sc->vmx_link_active == 0) { if (m != NULL) error = drbr_enqueue(ifp, br, m); return (error); } if (m != NULL) { error = drbr_enqueue(ifp, br, m); if (error) return (error); } while ((avail = VMXNET3_TXRING_AVAIL(txr)) >= 2) { m = drbr_peek(ifp, br); if (m == NULL) break; /* Assume worse case if this mbuf is the head of a chain. */ if (m->m_next != NULL && avail < VMXNET3_TX_MAXSEGS) { drbr_putback(ifp, br, m); break; } if (vmxnet3_txq_encap(txq, &m) != 0) { if (m != NULL) drbr_putback(ifp, br, m); else drbr_advance(ifp, br); break; } drbr_advance(ifp, br); tx++; ETHER_BPF_MTAP(ifp, m); } if (tx > 0) txq->vxtxq_watchdog = VMXNET3_WATCHDOG_TIMEOUT; return (0); } static int vmxnet3_txq_mq_start(struct ifnet *ifp, struct mbuf *m) { struct vmxnet3_softc *sc; struct vmxnet3_txqueue *txq; int i, ntxq, error; sc = ifp->if_softc; ntxq = sc->vmx_ntxqueues; if (m->m_flags & M_FLOWID) i = m->m_pkthdr.flowid % ntxq; else i = curcpu % ntxq; txq = &sc->vmx_txq[i]; if (VMXNET3_TXQ_TRYLOCK(txq) != 0) { error = vmxnet3_txq_mq_start_locked(txq, m); VMXNET3_TXQ_UNLOCK(txq); } else { error = drbr_enqueue(ifp, txq->vxtxq_br, m); taskqueue_enqueue(sc->vmx_tq, &txq->vxtxq_defrtask); } return (error); } static void vmxnet3_txq_tq_deferred(void *xtxq, int pending) { struct vmxnet3_softc *sc; struct vmxnet3_txqueue *txq; txq = xtxq; sc = txq->vxtxq_sc; VMXNET3_TXQ_LOCK(txq); if (!drbr_empty(sc->vmx_ifp, txq->vxtxq_br)) vmxnet3_txq_mq_start_locked(txq, NULL); VMXNET3_TXQ_UNLOCK(txq); } #endif /* VMXNET3_LEGACY_TX */ static void vmxnet3_txq_start(struct vmxnet3_txqueue *txq) { struct vmxnet3_softc *sc; struct ifnet *ifp; sc = txq->vxtxq_sc; ifp = sc->vmx_ifp; #ifdef VMXNET3_LEGACY_TX if (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) vmxnet3_start_locked(ifp); #else if (!drbr_empty(ifp, txq->vxtxq_br)) vmxnet3_txq_mq_start_locked(txq, NULL); #endif } static void vmxnet3_tx_start_all(struct vmxnet3_softc *sc) { struct vmxnet3_txqueue *txq; int i; VMXNET3_CORE_LOCK_ASSERT(sc); for (i = 0; i < sc->vmx_ntxqueues; i++) { txq = &sc->vmx_txq[i]; VMXNET3_TXQ_LOCK(txq); vmxnet3_txq_start(txq); VMXNET3_TXQ_UNLOCK(txq); } } static void vmxnet3_update_vlan_filter(struct vmxnet3_softc *sc, int add, uint16_t tag) { struct ifnet *ifp; int idx, bit; ifp = sc->vmx_ifp; idx = (tag >> 5) & 0x7F; bit = tag & 0x1F; if (tag == 0 || tag > 4095) return; VMXNET3_CORE_LOCK(sc); /* Update our private VLAN bitvector. */ if (add) sc->vmx_vlan_filter[idx] |= (1 << bit); else sc->vmx_vlan_filter[idx] &= ~(1 << bit); if (ifp->if_capenable & IFCAP_VLAN_HWFILTER) { if (add) sc->vmx_ds->vlan_filter[idx] |= (1 << bit); else sc->vmx_ds->vlan_filter[idx] &= ~(1 << bit); vmxnet3_write_cmd(sc, VMXNET3_CMD_VLAN_FILTER); } VMXNET3_CORE_UNLOCK(sc); } static void vmxnet3_register_vlan(void *arg, struct ifnet *ifp, uint16_t tag) { if (ifp->if_softc == arg) vmxnet3_update_vlan_filter(arg, 1, tag); } static void vmxnet3_unregister_vlan(void *arg, struct ifnet *ifp, uint16_t tag) { if (ifp->if_softc == arg) vmxnet3_update_vlan_filter(arg, 0, tag); } static void vmxnet3_set_rxfilter(struct vmxnet3_softc *sc) { struct ifnet *ifp; struct vmxnet3_driver_shared *ds; struct ifmultiaddr *ifma; u_int mode; ifp = sc->vmx_ifp; ds = sc->vmx_ds; mode = VMXNET3_RXMODE_UCAST | VMXNET3_RXMODE_BCAST; if (ifp->if_flags & IFF_PROMISC) mode |= VMXNET3_RXMODE_PROMISC; if (ifp->if_flags & IFF_ALLMULTI) mode |= VMXNET3_RXMODE_ALLMULTI; else { int cnt = 0, overflow = 0; if_maddr_rlock(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_LINK) continue; else if (cnt == VMXNET3_MULTICAST_MAX) { overflow = 1; break; } bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr), &sc->vmx_mcast[cnt*ETHER_ADDR_LEN], ETHER_ADDR_LEN); cnt++; } if_maddr_runlock(ifp); if (overflow != 0) { cnt = 0; mode |= VMXNET3_RXMODE_ALLMULTI; } else if (cnt > 0) mode |= VMXNET3_RXMODE_MCAST; ds->mcast_tablelen = cnt * ETHER_ADDR_LEN; } ds->rxmode = mode; vmxnet3_write_cmd(sc, VMXNET3_CMD_SET_FILTER); vmxnet3_write_cmd(sc, VMXNET3_CMD_SET_RXMODE); } static int vmxnet3_change_mtu(struct vmxnet3_softc *sc, int mtu) { struct ifnet *ifp; ifp = sc->vmx_ifp; if (mtu < VMXNET3_MIN_MTU || mtu > VMXNET3_MAX_MTU) return (EINVAL); ifp->if_mtu = mtu; if (ifp->if_drv_flags & IFF_DRV_RUNNING) { ifp->if_drv_flags &= ~IFF_DRV_RUNNING; vmxnet3_init_locked(sc); } return (0); } static int vmxnet3_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct vmxnet3_softc *sc; struct ifreq *ifr; int reinit, mask, error; sc = ifp->if_softc; ifr = (struct ifreq *) data; error = 0; switch (cmd) { case SIOCSIFMTU: if (ifp->if_mtu != ifr->ifr_mtu) { VMXNET3_CORE_LOCK(sc); error = vmxnet3_change_mtu(sc, ifr->ifr_mtu); VMXNET3_CORE_UNLOCK(sc); } break; case SIOCSIFFLAGS: VMXNET3_CORE_LOCK(sc); if (ifp->if_flags & IFF_UP) { if ((ifp->if_drv_flags & IFF_DRV_RUNNING)) { if ((ifp->if_flags ^ sc->vmx_if_flags) & (IFF_PROMISC | IFF_ALLMULTI)) { vmxnet3_set_rxfilter(sc); } } else vmxnet3_init_locked(sc); } else { if (ifp->if_drv_flags & IFF_DRV_RUNNING) vmxnet3_stop(sc); } sc->vmx_if_flags = ifp->if_flags; VMXNET3_CORE_UNLOCK(sc); break; case SIOCADDMULTI: case SIOCDELMULTI: VMXNET3_CORE_LOCK(sc); if (ifp->if_drv_flags & IFF_DRV_RUNNING) vmxnet3_set_rxfilter(sc); VMXNET3_CORE_UNLOCK(sc); break; case SIOCSIFMEDIA: case SIOCGIFMEDIA: error = ifmedia_ioctl(ifp, ifr, &sc->vmx_media, cmd); break; case SIOCSIFCAP: VMXNET3_CORE_LOCK(sc); mask = ifr->ifr_reqcap ^ ifp->if_capenable; if (mask & IFCAP_TXCSUM) ifp->if_capenable ^= IFCAP_TXCSUM; if (mask & IFCAP_TXCSUM_IPV6) ifp->if_capenable ^= IFCAP_TXCSUM_IPV6; if (mask & IFCAP_TSO4) ifp->if_capenable ^= IFCAP_TSO4; if (mask & IFCAP_TSO6) ifp->if_capenable ^= IFCAP_TSO6; if (mask & (IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6 | IFCAP_LRO | IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWFILTER)) { /* Changing these features requires us to reinit. */ reinit = 1; if (mask & IFCAP_RXCSUM) ifp->if_capenable ^= IFCAP_RXCSUM; if (mask & IFCAP_RXCSUM_IPV6) ifp->if_capenable ^= IFCAP_RXCSUM_IPV6; if (mask & IFCAP_LRO) ifp->if_capenable ^= IFCAP_LRO; if (mask & IFCAP_VLAN_HWTAGGING) ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING; if (mask & IFCAP_VLAN_HWFILTER) ifp->if_capenable ^= IFCAP_VLAN_HWFILTER; } else reinit = 0; if (mask & IFCAP_VLAN_HWTSO) ifp->if_capenable ^= IFCAP_VLAN_HWTSO; if (reinit && (ifp->if_drv_flags & IFF_DRV_RUNNING)) { ifp->if_drv_flags &= ~IFF_DRV_RUNNING; vmxnet3_init_locked(sc); } VMXNET3_CORE_UNLOCK(sc); VLAN_CAPABILITIES(ifp); break; default: error = ether_ioctl(ifp, cmd, data); break; } VMXNET3_CORE_LOCK_ASSERT_NOTOWNED(sc); return (error); } #ifndef VMXNET3_LEGACY_TX static void vmxnet3_qflush(struct ifnet *ifp) { struct vmxnet3_softc *sc; struct vmxnet3_txqueue *txq; struct mbuf *m; int i; sc = ifp->if_softc; for (i = 0; i < sc->vmx_ntxqueues; i++) { txq = &sc->vmx_txq[i]; VMXNET3_TXQ_LOCK(txq); while ((m = buf_ring_dequeue_sc(txq->vxtxq_br)) != NULL) m_freem(m); VMXNET3_TXQ_UNLOCK(txq); } if_qflush(ifp); } #endif static int vmxnet3_watchdog(struct vmxnet3_txqueue *txq) { struct vmxnet3_softc *sc; sc = txq->vxtxq_sc; VMXNET3_TXQ_LOCK(txq); if (txq->vxtxq_watchdog == 0 || --txq->vxtxq_watchdog) { VMXNET3_TXQ_UNLOCK(txq); return (0); } VMXNET3_TXQ_UNLOCK(txq); if_printf(sc->vmx_ifp, "watchdog timeout on queue %d\n", txq->vxtxq_id); return (1); } static void vmxnet3_refresh_host_stats(struct vmxnet3_softc *sc) { vmxnet3_write_cmd(sc, VMXNET3_CMD_GET_STATS); } static void vmxnet3_txq_accum_stats(struct vmxnet3_txqueue *txq, struct vmxnet3_txq_stats *accum) { struct vmxnet3_txq_stats *st; st = &txq->vxtxq_stats; accum->vmtxs_opackets += st->vmtxs_opackets; accum->vmtxs_obytes += st->vmtxs_obytes; accum->vmtxs_omcasts += st->vmtxs_omcasts; accum->vmtxs_csum += st->vmtxs_csum; accum->vmtxs_tso += st->vmtxs_tso; accum->vmtxs_full += st->vmtxs_full; accum->vmtxs_offload_failed += st->vmtxs_offload_failed; } static void vmxnet3_rxq_accum_stats(struct vmxnet3_rxqueue *rxq, struct vmxnet3_rxq_stats *accum) { struct vmxnet3_rxq_stats *st; st = &rxq->vxrxq_stats; accum->vmrxs_ipackets += st->vmrxs_ipackets; accum->vmrxs_ibytes += st->vmrxs_ibytes; accum->vmrxs_iqdrops += st->vmrxs_iqdrops; accum->vmrxs_ierrors += st->vmrxs_ierrors; } static void vmxnet3_accumulate_stats(struct vmxnet3_softc *sc) { struct ifnet *ifp; struct vmxnet3_statistics *st; struct vmxnet3_txq_stats txaccum; struct vmxnet3_rxq_stats rxaccum; int i; ifp = sc->vmx_ifp; st = &sc->vmx_stats; bzero(&txaccum, sizeof(struct vmxnet3_txq_stats)); bzero(&rxaccum, sizeof(struct vmxnet3_rxq_stats)); for (i = 0; i < sc->vmx_ntxqueues; i++) vmxnet3_txq_accum_stats(&sc->vmx_txq[i], &txaccum); for (i = 0; i < sc->vmx_nrxqueues; i++) vmxnet3_rxq_accum_stats(&sc->vmx_rxq[i], &rxaccum); /* * With the exception of if_ierrors, these ifnet statistics are * only updated in the driver, so just set them to our accumulated * values. if_ierrors is updated in ether_input() for malformed * frames that we should have already discarded. */ ifp->if_ipackets = rxaccum.vmrxs_ipackets; ifp->if_iqdrops = rxaccum.vmrxs_iqdrops; ifp->if_ierrors = rxaccum.vmrxs_ierrors; ifp->if_opackets = txaccum.vmtxs_opackets; #ifndef VMXNET3_LEGACY_TX ifp->if_obytes = txaccum.vmtxs_obytes; ifp->if_omcasts = txaccum.vmtxs_omcasts; #endif } static void vmxnet3_tick(void *xsc) { struct vmxnet3_softc *sc; struct ifnet *ifp; int i, timedout; sc = xsc; ifp = sc->vmx_ifp; timedout = 0; VMXNET3_CORE_LOCK_ASSERT(sc); vmxnet3_accumulate_stats(sc); vmxnet3_refresh_host_stats(sc); for (i = 0; i < sc->vmx_ntxqueues; i++) timedout |= vmxnet3_watchdog(&sc->vmx_txq[i]); if (timedout != 0) { ifp->if_drv_flags &= ~IFF_DRV_RUNNING; vmxnet3_init_locked(sc); } else callout_reset(&sc->vmx_tick, hz, vmxnet3_tick, sc); } static int vmxnet3_link_is_up(struct vmxnet3_softc *sc) { uint32_t status; /* Also update the link speed while here. */ status = vmxnet3_read_cmd(sc, VMXNET3_CMD_GET_LINK); sc->vmx_link_speed = status >> 16; return !!(status & 0x1); } static void vmxnet3_link_status(struct vmxnet3_softc *sc) { struct ifnet *ifp; int link; ifp = sc->vmx_ifp; link = vmxnet3_link_is_up(sc); if (link != 0 && sc->vmx_link_active == 0) { sc->vmx_link_active = 1; if_link_state_change(ifp, LINK_STATE_UP); } else if (link == 0 && sc->vmx_link_active != 0) { sc->vmx_link_active = 0; if_link_state_change(ifp, LINK_STATE_DOWN); } } static void vmxnet3_media_status(struct ifnet *ifp, struct ifmediareq *ifmr) { struct vmxnet3_softc *sc; sc = ifp->if_softc; ifmr->ifm_active = IFM_ETHER | IFM_AUTO; ifmr->ifm_status = IFM_AVALID; VMXNET3_CORE_LOCK(sc); if (vmxnet3_link_is_up(sc) != 0) ifmr->ifm_status |= IFM_ACTIVE; else ifmr->ifm_status |= IFM_NONE; VMXNET3_CORE_UNLOCK(sc); } static int vmxnet3_media_change(struct ifnet *ifp) { /* Ignore. */ return (0); } static void vmxnet3_set_lladdr(struct vmxnet3_softc *sc) { uint32_t ml, mh; ml = sc->vmx_lladdr[0]; ml |= sc->vmx_lladdr[1] << 8; ml |= sc->vmx_lladdr[2] << 16; ml |= sc->vmx_lladdr[3] << 24; vmxnet3_write_bar1(sc, VMXNET3_BAR1_MACL, ml); mh = sc->vmx_lladdr[4]; mh |= sc->vmx_lladdr[5] << 8; vmxnet3_write_bar1(sc, VMXNET3_BAR1_MACH, mh); } static void vmxnet3_get_lladdr(struct vmxnet3_softc *sc) { uint32_t ml, mh; ml = vmxnet3_read_cmd(sc, VMXNET3_CMD_GET_MACL); mh = vmxnet3_read_cmd(sc, VMXNET3_CMD_GET_MACH); sc->vmx_lladdr[0] = ml; sc->vmx_lladdr[1] = ml >> 8; sc->vmx_lladdr[2] = ml >> 16; sc->vmx_lladdr[3] = ml >> 24; sc->vmx_lladdr[4] = mh; sc->vmx_lladdr[5] = mh >> 8; } static void vmxnet3_setup_txq_sysctl(struct vmxnet3_txqueue *txq, struct sysctl_ctx_list *ctx, struct sysctl_oid_list *child) { struct sysctl_oid *node, *txsnode; struct sysctl_oid_list *list, *txslist; struct vmxnet3_txq_stats *stats; struct UPT1_TxStats *txstats; char namebuf[16]; stats = &txq->vxtxq_stats; txstats = &txq->vxtxq_ts->stats; snprintf(namebuf, sizeof(namebuf), "txq%d", txq->vxtxq_id); node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, namebuf, CTLFLAG_RD, NULL, "Transmit Queue"); txq->vxtxq_sysctl = list = SYSCTL_CHILDREN(node); SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "opackets", CTLFLAG_RD, &stats->vmtxs_opackets, "Transmit packets"); SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "obytes", CTLFLAG_RD, &stats->vmtxs_obytes, "Transmit bytes"); SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "omcasts", CTLFLAG_RD, &stats->vmtxs_omcasts, "Transmit multicasts"); SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "csum", CTLFLAG_RD, &stats->vmtxs_csum, "Transmit checksum offloaded"); SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "tso", CTLFLAG_RD, &stats->vmtxs_tso, "Transmit TCP segmentation offloaded"); SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "ringfull", CTLFLAG_RD, &stats->vmtxs_full, "Transmit ring full"); SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "offload_failed", CTLFLAG_RD, &stats->vmtxs_offload_failed, "Transmit checksum offload failed"); /* * Add statistics reported by the host. These are updated once * per second. */ txsnode = SYSCTL_ADD_NODE(ctx, list, OID_AUTO, "hstats", CTLFLAG_RD, NULL, "Host Statistics"); txslist = SYSCTL_CHILDREN(txsnode); SYSCTL_ADD_UQUAD(ctx, txslist, OID_AUTO, "tso_packets", CTLFLAG_RD, &txstats->TSO_packets, "TSO packets"); SYSCTL_ADD_UQUAD(ctx, txslist, OID_AUTO, "tso_bytes", CTLFLAG_RD, &txstats->TSO_bytes, "TSO bytes"); SYSCTL_ADD_UQUAD(ctx, txslist, OID_AUTO, "ucast_packets", CTLFLAG_RD, &txstats->ucast_packets, "Unicast packets"); SYSCTL_ADD_UQUAD(ctx, txslist, OID_AUTO, "unicast_bytes", CTLFLAG_RD, &txstats->ucast_bytes, "Unicast bytes"); SYSCTL_ADD_UQUAD(ctx, txslist, OID_AUTO, "mcast_packets", CTLFLAG_RD, &txstats->mcast_packets, "Multicast packets"); SYSCTL_ADD_UQUAD(ctx, txslist, OID_AUTO, "mcast_bytes", CTLFLAG_RD, &txstats->mcast_bytes, "Multicast bytes"); SYSCTL_ADD_UQUAD(ctx, txslist, OID_AUTO, "error", CTLFLAG_RD, &txstats->error, "Errors"); SYSCTL_ADD_UQUAD(ctx, txslist, OID_AUTO, "discard", CTLFLAG_RD, &txstats->discard, "Discards"); } static void vmxnet3_setup_rxq_sysctl(struct vmxnet3_rxqueue *rxq, struct sysctl_ctx_list *ctx, struct sysctl_oid_list *child) { struct sysctl_oid *node, *rxsnode; struct sysctl_oid_list *list, *rxslist; struct vmxnet3_rxq_stats *stats; struct UPT1_RxStats *rxstats; char namebuf[16]; stats = &rxq->vxrxq_stats; rxstats = &rxq->vxrxq_rs->stats; snprintf(namebuf, sizeof(namebuf), "rxq%d", rxq->vxrxq_id); node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, namebuf, CTLFLAG_RD, NULL, "Receive Queue"); rxq->vxrxq_sysctl = list = SYSCTL_CHILDREN(node); SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "ipackets", CTLFLAG_RD, &stats->vmrxs_ipackets, "Receive packets"); SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "ibytes", CTLFLAG_RD, &stats->vmrxs_ibytes, "Receive bytes"); SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "iqdrops", CTLFLAG_RD, &stats->vmrxs_iqdrops, "Receive drops"); SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "ierrors", CTLFLAG_RD, &stats->vmrxs_ierrors, "Receive errors"); /* * Add statistics reported by the host. These are updated once * per second. */ rxsnode = SYSCTL_ADD_NODE(ctx, list, OID_AUTO, "hstats", CTLFLAG_RD, NULL, "Host Statistics"); rxslist = SYSCTL_CHILDREN(rxsnode); SYSCTL_ADD_UQUAD(ctx, rxslist, OID_AUTO, "lro_packets", CTLFLAG_RD, &rxstats->LRO_packets, "LRO packets"); SYSCTL_ADD_UQUAD(ctx, rxslist, OID_AUTO, "lro_bytes", CTLFLAG_RD, &rxstats->LRO_bytes, "LRO bytes"); SYSCTL_ADD_UQUAD(ctx, rxslist, OID_AUTO, "ucast_packets", CTLFLAG_RD, &rxstats->ucast_packets, "Unicast packets"); SYSCTL_ADD_UQUAD(ctx, rxslist, OID_AUTO, "unicast_bytes", CTLFLAG_RD, &rxstats->ucast_bytes, "Unicast bytes"); SYSCTL_ADD_UQUAD(ctx, rxslist, OID_AUTO, "mcast_packets", CTLFLAG_RD, &rxstats->mcast_packets, "Multicast packets"); SYSCTL_ADD_UQUAD(ctx, rxslist, OID_AUTO, "mcast_bytes", CTLFLAG_RD, &rxstats->mcast_bytes, "Multicast bytes"); SYSCTL_ADD_UQUAD(ctx, rxslist, OID_AUTO, "bcast_packets", CTLFLAG_RD, &rxstats->bcast_packets, "Broadcast packets"); SYSCTL_ADD_UQUAD(ctx, rxslist, OID_AUTO, "bcast_bytes", CTLFLAG_RD, &rxstats->bcast_bytes, "Broadcast bytes"); SYSCTL_ADD_UQUAD(ctx, rxslist, OID_AUTO, "nobuffer", CTLFLAG_RD, &rxstats->nobuffer, "No buffer"); SYSCTL_ADD_UQUAD(ctx, rxslist, OID_AUTO, "error", CTLFLAG_RD, &rxstats->error, "Errors"); } static void vmxnet3_setup_debug_sysctl(struct vmxnet3_softc *sc, struct sysctl_ctx_list *ctx, struct sysctl_oid_list *child) { struct sysctl_oid *node; struct sysctl_oid_list *list; int i; for (i = 0; i < sc->vmx_ntxqueues; i++) { struct vmxnet3_txqueue *txq = &sc->vmx_txq[i]; node = SYSCTL_ADD_NODE(ctx, txq->vxtxq_sysctl, OID_AUTO, "debug", CTLFLAG_RD, NULL, ""); list = SYSCTL_CHILDREN(node); SYSCTL_ADD_UINT(ctx, list, OID_AUTO, "cmd_head", CTLFLAG_RD, &txq->vxtxq_cmd_ring.vxtxr_head, 0, ""); SYSCTL_ADD_UINT(ctx, list, OID_AUTO, "cmd_next", CTLFLAG_RD, &txq->vxtxq_cmd_ring.vxtxr_next, 0, ""); SYSCTL_ADD_UINT(ctx, list, OID_AUTO, "cmd_ndesc", CTLFLAG_RD, &txq->vxtxq_cmd_ring.vxtxr_ndesc, 0, ""); SYSCTL_ADD_INT(ctx, list, OID_AUTO, "cmd_gen", CTLFLAG_RD, &txq->vxtxq_cmd_ring.vxtxr_gen, 0, ""); SYSCTL_ADD_UINT(ctx, list, OID_AUTO, "comp_next", CTLFLAG_RD, &txq->vxtxq_comp_ring.vxcr_next, 0, ""); SYSCTL_ADD_UINT(ctx, list, OID_AUTO, "comp_ndesc", CTLFLAG_RD, &txq->vxtxq_comp_ring.vxcr_ndesc, 0,""); SYSCTL_ADD_INT(ctx, list, OID_AUTO, "comp_gen", CTLFLAG_RD, &txq->vxtxq_comp_ring.vxcr_gen, 0, ""); } for (i = 0; i < sc->vmx_nrxqueues; i++) { struct vmxnet3_rxqueue *rxq = &sc->vmx_rxq[i]; node = SYSCTL_ADD_NODE(ctx, rxq->vxrxq_sysctl, OID_AUTO, "debug", CTLFLAG_RD, NULL, ""); list = SYSCTL_CHILDREN(node); SYSCTL_ADD_UINT(ctx, list, OID_AUTO, "cmd0_fill", CTLFLAG_RD, &rxq->vxrxq_cmd_ring[0].vxrxr_fill, 0, ""); SYSCTL_ADD_UINT(ctx, list, OID_AUTO, "cmd0_ndesc", CTLFLAG_RD, &rxq->vxrxq_cmd_ring[0].vxrxr_ndesc, 0, ""); SYSCTL_ADD_INT(ctx, list, OID_AUTO, "cmd0_gen", CTLFLAG_RD, &rxq->vxrxq_cmd_ring[0].vxrxr_gen, 0, ""); SYSCTL_ADD_UINT(ctx, list, OID_AUTO, "cmd1_fill", CTLFLAG_RD, &rxq->vxrxq_cmd_ring[1].vxrxr_fill, 0, ""); SYSCTL_ADD_UINT(ctx, list, OID_AUTO, "cmd1_ndesc", CTLFLAG_RD, &rxq->vxrxq_cmd_ring[1].vxrxr_ndesc, 0, ""); SYSCTL_ADD_INT(ctx, list, OID_AUTO, "cmd1_gen", CTLFLAG_RD, &rxq->vxrxq_cmd_ring[1].vxrxr_gen, 0, ""); SYSCTL_ADD_UINT(ctx, list, OID_AUTO, "comp_next", CTLFLAG_RD, &rxq->vxrxq_comp_ring.vxcr_next, 0, ""); SYSCTL_ADD_UINT(ctx, list, OID_AUTO, "comp_ndesc", CTLFLAG_RD, &rxq->vxrxq_comp_ring.vxcr_ndesc, 0,""); SYSCTL_ADD_INT(ctx, list, OID_AUTO, "comp_gen", CTLFLAG_RD, &rxq->vxrxq_comp_ring.vxcr_gen, 0, ""); } } static void vmxnet3_setup_queue_sysctl(struct vmxnet3_softc *sc, struct sysctl_ctx_list *ctx, struct sysctl_oid_list *child) { int i; for (i = 0; i < sc->vmx_ntxqueues; i++) vmxnet3_setup_txq_sysctl(&sc->vmx_txq[i], ctx, child); for (i = 0; i < sc->vmx_nrxqueues; i++) vmxnet3_setup_rxq_sysctl(&sc->vmx_rxq[i], ctx, child); vmxnet3_setup_debug_sysctl(sc, ctx, child); } static void vmxnet3_setup_sysctl(struct vmxnet3_softc *sc) { device_t dev; struct vmxnet3_statistics *stats; struct sysctl_ctx_list *ctx; struct sysctl_oid *tree; struct sysctl_oid_list *child; dev = sc->vmx_dev; ctx = device_get_sysctl_ctx(dev); tree = device_get_sysctl_tree(dev); child = SYSCTL_CHILDREN(tree); SYSCTL_ADD_INT(ctx, child, OID_AUTO, "max_ntxqueues", CTLFLAG_RD, &sc->vmx_max_ntxqueues, 0, "Maximum number of Tx queues"); SYSCTL_ADD_INT(ctx, child, OID_AUTO, "max_nrxqueues", CTLFLAG_RD, &sc->vmx_max_nrxqueues, 0, "Maximum number of Rx queues"); SYSCTL_ADD_INT(ctx, child, OID_AUTO, "ntxqueues", CTLFLAG_RD, &sc->vmx_ntxqueues, 0, "Number of Tx queues"); SYSCTL_ADD_INT(ctx, child, OID_AUTO, "nrxqueues", CTLFLAG_RD, &sc->vmx_nrxqueues, 0, "Number of Rx queues"); stats = &sc->vmx_stats; SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "defragged", CTLFLAG_RD, &stats->vmst_defragged, 0, "Tx mbuf chains defragged"); SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "defrag_failed", CTLFLAG_RD, &stats->vmst_defrag_failed, 0, "Tx mbuf dropped because defrag failed"); SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "mgetcl_failed", CTLFLAG_RD, &stats->vmst_mgetcl_failed, 0, "mbuf cluster allocation failed"); SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "mbuf_load_failed", CTLFLAG_RD, &stats->vmst_mbuf_load_failed, 0, "mbuf load segments failed"); vmxnet3_setup_queue_sysctl(sc, ctx, child); } static void vmxnet3_write_bar0(struct vmxnet3_softc *sc, bus_size_t r, uint32_t v) { bus_space_write_4(sc->vmx_iot0, sc->vmx_ioh0, r, v); } static uint32_t vmxnet3_read_bar1(struct vmxnet3_softc *sc, bus_size_t r) { return (bus_space_read_4(sc->vmx_iot1, sc->vmx_ioh1, r)); } static void vmxnet3_write_bar1(struct vmxnet3_softc *sc, bus_size_t r, uint32_t v) { bus_space_write_4(sc->vmx_iot1, sc->vmx_ioh1, r, v); } static void vmxnet3_write_cmd(struct vmxnet3_softc *sc, uint32_t cmd) { vmxnet3_write_bar1(sc, VMXNET3_BAR1_CMD, cmd); } static uint32_t vmxnet3_read_cmd(struct vmxnet3_softc *sc, uint32_t cmd) { vmxnet3_write_cmd(sc, cmd); bus_space_barrier(sc->vmx_iot1, sc->vmx_ioh1, 0, 0, BUS_SPACE_BARRIER_READ | BUS_SPACE_BARRIER_WRITE); return (vmxnet3_read_bar1(sc, VMXNET3_BAR1_CMD)); } static void vmxnet3_enable_intr(struct vmxnet3_softc *sc, int irq) { vmxnet3_write_bar0(sc, VMXNET3_BAR0_IMASK(irq), 0); } static void vmxnet3_disable_intr(struct vmxnet3_softc *sc, int irq) { vmxnet3_write_bar0(sc, VMXNET3_BAR0_IMASK(irq), 1); } static void vmxnet3_enable_all_intrs(struct vmxnet3_softc *sc) { int i; sc->vmx_ds->ictrl &= ~VMXNET3_ICTRL_DISABLE_ALL; for (i = 0; i < sc->vmx_nintrs; i++) vmxnet3_enable_intr(sc, i); } static void vmxnet3_disable_all_intrs(struct vmxnet3_softc *sc) { int i; sc->vmx_ds->ictrl |= VMXNET3_ICTRL_DISABLE_ALL; for (i = 0; i < sc->vmx_nintrs; i++) vmxnet3_disable_intr(sc, i); } static void vmxnet3_dmamap_cb(void *arg, bus_dma_segment_t *segs, int nseg, int error) { bus_addr_t *baddr = arg; if (error == 0) *baddr = segs->ds_addr; } static int vmxnet3_dma_malloc(struct vmxnet3_softc *sc, bus_size_t size, bus_size_t align, struct vmxnet3_dma_alloc *dma) { device_t dev; int error; dev = sc->vmx_dev; bzero(dma, sizeof(struct vmxnet3_dma_alloc)); error = bus_dma_tag_create(bus_get_dma_tag(dev), align, 0, /* alignment, bounds */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ size, /* maxsize */ 1, /* nsegments */ size, /* maxsegsize */ BUS_DMA_ALLOCNOW, /* flags */ NULL, /* lockfunc */ NULL, /* lockfuncarg */ &dma->dma_tag); if (error) { device_printf(dev, "bus_dma_tag_create failed: %d\n", error); goto fail; } error = bus_dmamem_alloc(dma->dma_tag, (void **)&dma->dma_vaddr, BUS_DMA_ZERO | BUS_DMA_NOWAIT, &dma->dma_map); if (error) { device_printf(dev, "bus_dmamem_alloc failed: %d\n", error); goto fail; } error = bus_dmamap_load(dma->dma_tag, dma->dma_map, dma->dma_vaddr, size, vmxnet3_dmamap_cb, &dma->dma_paddr, BUS_DMA_NOWAIT); if (error) { device_printf(dev, "bus_dmamap_load failed: %d\n", error); goto fail; } dma->dma_size = size; fail: if (error) vmxnet3_dma_free(sc, dma); return (error); } static void vmxnet3_dma_free(struct vmxnet3_softc *sc, struct vmxnet3_dma_alloc *dma) { if (dma->dma_tag != NULL) { if (dma->dma_paddr != 0) { bus_dmamap_sync(dma->dma_tag, dma->dma_map, BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(dma->dma_tag, dma->dma_map); } if (dma->dma_vaddr != NULL) { bus_dmamem_free(dma->dma_tag, dma->dma_vaddr, dma->dma_map); } bus_dma_tag_destroy(dma->dma_tag); } bzero(dma, sizeof(struct vmxnet3_dma_alloc)); } static int vmxnet3_tunable_int(struct vmxnet3_softc *sc, const char *knob, int def) { char path[64]; snprintf(path, sizeof(path), "hw.vmx.%d.%s", device_get_unit(sc->vmx_dev), knob); TUNABLE_INT_FETCH(path, &def); return (def); } /* * Since this is a purely paravirtualized device, we do not have * to worry about DMA coherency. But at times, we must make sure * both the compiler and CPU do not reorder memory operations. */ static inline void vmxnet3_barrier(struct vmxnet3_softc *sc, vmxnet3_barrier_t type) { switch (type) { case VMXNET3_BARRIER_RD: rmb(); break; case VMXNET3_BARRIER_WR: wmb(); break; case VMXNET3_BARRIER_RDWR: mb(); break; default: panic("%s: bad barrier type %d", __func__, type); } } Index: head/sys/dev/vmware/vmxnet3/if_vmxvar.h =================================================================== --- head/sys/dev/vmware/vmxnet3/if_vmxvar.h (revision 271550) +++ head/sys/dev/vmware/vmxnet3/if_vmxvar.h (revision 271551) @@ -1,346 +1,347 @@ /*- * Copyright (c) 2013 Tsubai Masanari * Copyright (c) 2013 Bryan Venteicher * * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. * * $FreeBSD$ */ #ifndef _IF_VMXVAR_H #define _IF_VMXVAR_H struct vmxnet3_softc; struct vmxnet3_dma_alloc { bus_addr_t dma_paddr; caddr_t dma_vaddr; bus_dma_tag_t dma_tag; bus_dmamap_t dma_map; bus_size_t dma_size; }; /* * The number of Rx/Tx queues this driver prefers. */ #define VMXNET3_DEF_RX_QUEUES 8 #define VMXNET3_DEF_TX_QUEUES 8 /* * The number of Rx rings in each Rx queue. */ #define VMXNET3_RXRINGS_PERQ 2 /* * The number of descriptors in each Rx/Tx ring. */ #define VMXNET3_DEF_TX_NDESC 512 #define VMXNET3_MAX_TX_NDESC 4096 #define VMXNET3_MIN_TX_NDESC 32 #define VMXNET3_MASK_TX_NDESC 0x1F #define VMXNET3_DEF_RX_NDESC 256 #define VMXNET3_MAX_RX_NDESC 2048 #define VMXNET3_MIN_RX_NDESC 32 #define VMXNET3_MASK_RX_NDESC 0x1F #define VMXNET3_MAX_TX_NCOMPDESC VMXNET3_MAX_TX_NDESC #define VMXNET3_MAX_RX_NCOMPDESC \ (VMXNET3_MAX_RX_NDESC * VMXNET3_RXRINGS_PERQ) struct vmxnet3_txbuf { bus_dmamap_t vtxb_dmamap; struct mbuf *vtxb_m; }; struct vmxnet3_txring { struct vmxnet3_txbuf *vxtxr_txbuf; u_int vxtxr_head; u_int vxtxr_next; u_int vxtxr_ndesc; int vxtxr_gen; bus_dma_tag_t vxtxr_txtag; struct vmxnet3_txdesc *vxtxr_txd; struct vmxnet3_dma_alloc vxtxr_dma; }; static inline int VMXNET3_TXRING_AVAIL(struct vmxnet3_txring *txr) { int avail = txr->vxtxr_next - txr->vxtxr_head - 1; return (avail < 0 ? txr->vxtxr_ndesc + avail : avail); } struct vmxnet3_rxbuf { bus_dmamap_t vrxb_dmamap; struct mbuf *vrxb_m; }; struct vmxnet3_rxring { struct vmxnet3_rxbuf *vxrxr_rxbuf; struct vmxnet3_rxdesc *vxrxr_rxd; u_int vxrxr_fill; u_int vxrxr_ndesc; int vxrxr_gen; int vxrxr_rid; bus_dma_tag_t vxrxr_rxtag; struct vmxnet3_dma_alloc vxrxr_dma; bus_dmamap_t vxrxr_spare_dmap; }; static inline void vmxnet3_rxr_increment_fill(struct vmxnet3_rxring *rxr) { if (++rxr->vxrxr_fill == rxr->vxrxr_ndesc) { rxr->vxrxr_fill = 0; rxr->vxrxr_gen ^= 1; } } struct vmxnet3_comp_ring { union { struct vmxnet3_txcompdesc *txcd; struct vmxnet3_rxcompdesc *rxcd; } vxcr_u; u_int vxcr_next; u_int vxcr_ndesc; int vxcr_gen; struct vmxnet3_dma_alloc vxcr_dma; }; struct vmxnet3_txq_stats { uint64_t vmtxs_opackets; /* if_opackets */ uint64_t vmtxs_obytes; /* if_obytes */ uint64_t vmtxs_omcasts; /* if_omcasts */ uint64_t vmtxs_csum; uint64_t vmtxs_tso; uint64_t vmtxs_full; uint64_t vmtxs_offload_failed; }; struct vmxnet3_txqueue { struct mtx vxtxq_mtx; struct vmxnet3_softc *vxtxq_sc; #ifndef VMXNET3_TX_LEGACY struct buf_ring *vxtxq_br; #endif int vxtxq_id; int vxtxq_intr_idx; int vxtxq_watchdog; struct vmxnet3_txring vxtxq_cmd_ring; struct vmxnet3_comp_ring vxtxq_comp_ring; struct vmxnet3_txq_stats vxtxq_stats; struct vmxnet3_txq_shared *vxtxq_ts; struct sysctl_oid_list *vxtxq_sysctl; #ifndef VMXNET3_TX_LEGACY struct task vxtxq_defrtask; #endif char vxtxq_name[16]; } __aligned(CACHE_LINE_SIZE); #define VMXNET3_TXQ_LOCK(_txq) mtx_lock(&(_txq)->vxtxq_mtx) #define VMXNET3_TXQ_TRYLOCK(_txq) mtx_trylock(&(_txq)->vxtxq_mtx) #define VMXNET3_TXQ_UNLOCK(_txq) mtx_unlock(&(_txq)->vxtxq_mtx) #define VMXNET3_TXQ_LOCK_ASSERT(_txq) \ mtx_assert(&(_txq)->vxtxq_mtx, MA_OWNED) #define VMXNET3_TXQ_LOCK_ASSERT_NOTOWNED(_txq) \ mtx_assert(&(_txq)->vxtxq_mtx, MA_NOTOWNED) struct vmxnet3_rxq_stats { uint64_t vmrxs_ipackets; /* if_ipackets */ uint64_t vmrxs_ibytes; /* if_ibytes */ uint64_t vmrxs_iqdrops; /* if_iqdrops */ uint64_t vmrxs_ierrors; /* if_ierrors */ }; struct vmxnet3_rxqueue { struct mtx vxrxq_mtx; struct vmxnet3_softc *vxrxq_sc; int vxrxq_id; int vxrxq_intr_idx; struct mbuf *vxrxq_mhead; struct mbuf *vxrxq_mtail; struct vmxnet3_rxring vxrxq_cmd_ring[VMXNET3_RXRINGS_PERQ]; struct vmxnet3_comp_ring vxrxq_comp_ring; struct vmxnet3_rxq_stats vxrxq_stats; struct vmxnet3_rxq_shared *vxrxq_rs; struct sysctl_oid_list *vxrxq_sysctl; char vxrxq_name[16]; } __aligned(CACHE_LINE_SIZE); #define VMXNET3_RXQ_LOCK(_rxq) mtx_lock(&(_rxq)->vxrxq_mtx) #define VMXNET3_RXQ_UNLOCK(_rxq) mtx_unlock(&(_rxq)->vxrxq_mtx) #define VMXNET3_RXQ_LOCK_ASSERT(_rxq) \ mtx_assert(&(_rxq)->vxrxq_mtx, MA_OWNED) #define VMXNET3_RXQ_LOCK_ASSERT_NOTOWNED(_rxq) \ mtx_assert(&(_rxq)->vxrxq_mtx, MA_NOTOWNED) struct vmxnet3_statistics { uint32_t vmst_defragged; uint32_t vmst_defrag_failed; uint32_t vmst_mgetcl_failed; uint32_t vmst_mbuf_load_failed; }; struct vmxnet3_interrupt { struct resource *vmxi_irq; int vmxi_rid; void *vmxi_handler; }; struct vmxnet3_softc { device_t vmx_dev; struct ifnet *vmx_ifp; struct vmxnet3_driver_shared *vmx_ds; uint32_t vmx_flags; #define VMXNET3_FLAG_NO_MSIX 0x0001 #define VMXNET3_FLAG_RSS 0x0002 struct vmxnet3_rxqueue *vmx_rxq; struct vmxnet3_txqueue *vmx_txq; struct resource *vmx_res0; bus_space_tag_t vmx_iot0; bus_space_handle_t vmx_ioh0; struct resource *vmx_res1; bus_space_tag_t vmx_iot1; bus_space_handle_t vmx_ioh1; struct resource *vmx_msix_res; int vmx_link_active; int vmx_link_speed; int vmx_if_flags; int vmx_ntxqueues; int vmx_nrxqueues; int vmx_ntxdescs; int vmx_nrxdescs; int vmx_max_rxsegs; int vmx_rx_max_chain; struct vmxnet3_statistics vmx_stats; int vmx_intr_type; int vmx_intr_mask_mode; int vmx_event_intr_idx; int vmx_nintrs; struct vmxnet3_interrupt vmx_intrs[VMXNET3_MAX_INTRS]; struct mtx vmx_mtx; #ifndef VMXNET3_LEGACY_TX struct taskqueue *vmx_tq; #endif uint8_t *vmx_mcast; void *vmx_qs; struct vmxnet3_rss_shared *vmx_rss; struct callout vmx_tick; struct vmxnet3_dma_alloc vmx_ds_dma; struct vmxnet3_dma_alloc vmx_qs_dma; struct vmxnet3_dma_alloc vmx_mcast_dma; struct vmxnet3_dma_alloc vmx_rss_dma; struct ifmedia vmx_media; int vmx_max_ntxqueues; int vmx_max_nrxqueues; eventhandler_tag vmx_vlan_attach; eventhandler_tag vmx_vlan_detach; uint32_t vmx_vlan_filter[4096/32]; uint8_t vmx_lladdr[ETHER_ADDR_LEN]; }; #define VMXNET3_CORE_LOCK_INIT(_sc, _name) \ mtx_init(&(_sc)->vmx_mtx, _name, "VMXNET3 Lock", MTX_DEF) #define VMXNET3_CORE_LOCK_DESTROY(_sc) mtx_destroy(&(_sc)->vmx_mtx) #define VMXNET3_CORE_LOCK(_sc) mtx_lock(&(_sc)->vmx_mtx) #define VMXNET3_CORE_UNLOCK(_sc) mtx_unlock(&(_sc)->vmx_mtx) #define VMXNET3_CORE_LOCK_ASSERT(_sc) mtx_assert(&(_sc)->vmx_mtx, MA_OWNED) #define VMXNET3_CORE_LOCK_ASSERT_NOTOWNED(_sc) \ mtx_assert(&(_sc)->vmx_mtx, MA_NOTOWNED) /* * Our driver version we report to the hypervisor; we just keep * this value constant. */ #define VMXNET3_DRIVER_VERSION 0x00010000 /* * Max descriptors per Tx packet. We must limit the size of the * any TSO packets based on the number of segments. */ #define VMXNET3_TX_MAXSEGS 32 #define VMXNET3_TX_MAXSIZE (VMXNET3_TX_MAXSEGS * MCLBYTES) +#define VMXNET3_TSO_MAXSIZE \ + (VMXNET3_TX_MAXSIZE - sizeof(struct ether_vlan_header)) /* * Maximum support Tx segments size. The length field in the * Tx descriptor is 14 bits. */ -#define VMXNET3_TX_MAXSEGSHIFT 14 -#define VMXNET3_TX_MAXSEGSIZE (1 << VMXNET3_TX_MAXSEGSHIFT) +#define VMXNET3_TX_MAXSEGSIZE (1 << 14) /* * The maximum number of Rx segments we accept. When LRO is enabled, * this allows us to receive the maximum sized frame with one MCLBYTES * cluster followed by 16 MJUMPAGESIZE clusters. */ #define VMXNET3_MAX_RX_SEGS 17 /* * Predetermined size of the multicast MACs filter table. If the * number of multicast addresses exceeds this size, then the * ALL_MULTI mode is use instead. */ #define VMXNET3_MULTICAST_MAX 32 /* * Our Tx watchdog timeout. */ #define VMXNET3_WATCHDOG_TIMEOUT 5 /* * Number of slots in the Tx bufrings. This value matches most other * multiqueue drivers. */ #define VMXNET3_DEF_BUFRING_SIZE 4096 /* * IP protocols that we can perform Tx checksum offloading of. */ #define VMXNET3_CSUM_OFFLOAD (CSUM_TCP | CSUM_UDP) #define VMXNET3_CSUM_OFFLOAD_IPV6 (CSUM_TCP_IPV6 | CSUM_UDP_IPV6) #define VMXNET3_CSUM_ALL_OFFLOAD \ (VMXNET3_CSUM_OFFLOAD | VMXNET3_CSUM_OFFLOAD_IPV6 | CSUM_TSO) /* * Compat macros to keep this driver compiling on old releases. */ #if !defined(SYSCTL_ADD_UQUAD) #define SYSCTL_ADD_UQUAD SYSCTL_ADD_QUAD #endif #if !defined(IFCAP_TXCSUM_IPV6) #define IFCAP_TXCSUM_IPV6 0 #endif #if !defined(IFCAP_RXCSUM_IPV6) #define IFCAP_RXCSUM_IPV6 0 #endif #if !defined(CSUM_TCP_IPV6) #define CSUM_TCP_IPV6 0 #endif #if !defined(CSUM_UDP_IPV6) #define CSUM_UDP_IPV6 0 #endif #endif /* _IF_VMXVAR_H */ Index: head/sys/dev/xen/netfront/netfront.c =================================================================== --- head/sys/dev/xen/netfront/netfront.c (revision 271550) +++ head/sys/dev/xen/netfront/netfront.c (revision 271551) @@ -1,2235 +1,2233 @@ /*- * Copyright (c) 2004-2006 Kip Macy * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if __FreeBSD_version >= 700000 #include #include #endif #include #include #include /* for DELAY */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "xenbus_if.h" /* Features supported by all backends. TSO and LRO can be negotiated */ #define XN_CSUM_FEATURES (CSUM_TCP | CSUM_UDP) #define NET_TX_RING_SIZE __RING_SIZE((netif_tx_sring_t *)0, PAGE_SIZE) #define NET_RX_RING_SIZE __RING_SIZE((netif_rx_sring_t *)0, PAGE_SIZE) #if __FreeBSD_version >= 700000 /* * Should the driver do LRO on the RX end * this can be toggled on the fly, but the * interface must be reset (down/up) for it * to take effect. */ static int xn_enable_lro = 1; TUNABLE_INT("hw.xn.enable_lro", &xn_enable_lro); #else #define IFCAP_TSO4 0 #define CSUM_TSO 0 #endif #ifdef CONFIG_XEN static int MODPARM_rx_copy = 0; module_param_named(rx_copy, MODPARM_rx_copy, bool, 0); MODULE_PARM_DESC(rx_copy, "Copy packets from network card (rather than flip)"); static int MODPARM_rx_flip = 0; module_param_named(rx_flip, MODPARM_rx_flip, bool, 0); MODULE_PARM_DESC(rx_flip, "Flip packets from network card (rather than copy)"); #else static const int MODPARM_rx_copy = 1; static const int MODPARM_rx_flip = 0; #endif /** * \brief The maximum allowed data fragments in a single transmit * request. * * This limit is imposed by the backend driver. We assume here that * we are dealing with a Linux driver domain and have set our limit * to mirror the Linux MAX_SKB_FRAGS constant. */ #define MAX_TX_REQ_FRAGS (65536 / PAGE_SIZE + 2) +#define NF_TSO_MAXBURST ((IP_MAXPACKET / PAGE_SIZE) * MCLBYTES) #define RX_COPY_THRESHOLD 256 #define net_ratelimit() 0 struct netfront_info; struct netfront_rx_info; static void xn_txeof(struct netfront_info *); static void xn_rxeof(struct netfront_info *); static void network_alloc_rx_buffers(struct netfront_info *); static void xn_tick_locked(struct netfront_info *); static void xn_tick(void *); static void xn_intr(void *); static inline int xn_count_frags(struct mbuf *m); static int xn_assemble_tx_request(struct netfront_info *sc, struct mbuf *m_head); static void xn_start_locked(struct ifnet *); static void xn_start(struct ifnet *); static int xn_ioctl(struct ifnet *, u_long, caddr_t); static void xn_ifinit_locked(struct netfront_info *); static void xn_ifinit(void *); static void xn_stop(struct netfront_info *); static void xn_query_features(struct netfront_info *np); static int xn_configure_features(struct netfront_info *np); #ifdef notyet static void xn_watchdog(struct ifnet *); #endif #ifdef notyet static void netfront_closing(device_t dev); #endif static void netif_free(struct netfront_info *info); static int netfront_detach(device_t dev); static int talk_to_backend(device_t dev, struct netfront_info *info); static int create_netdev(device_t dev); static void netif_disconnect_backend(struct netfront_info *info); static int setup_device(device_t dev, struct netfront_info *info); static void free_ring(int *ref, void *ring_ptr_ref); static int xn_ifmedia_upd(struct ifnet *ifp); static void xn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr); /* Xenolinux helper functions */ int network_connect(struct netfront_info *); static void xn_free_rx_ring(struct netfront_info *); static void xn_free_tx_ring(struct netfront_info *); static int xennet_get_responses(struct netfront_info *np, struct netfront_rx_info *rinfo, RING_IDX rp, RING_IDX *cons, struct mbuf **list, int *pages_flipped_p); #define virt_to_mfn(x) (vtomach(x) >> PAGE_SHIFT) #define INVALID_P2M_ENTRY (~0UL) /* * Mbuf pointers. We need these to keep track of the virtual addresses * of our mbuf chains since we can only convert from virtual to physical, * not the other way around. The size must track the free index arrays. */ struct xn_chain_data { struct mbuf *xn_tx_chain[NET_TX_RING_SIZE+1]; int xn_tx_chain_cnt; struct mbuf *xn_rx_chain[NET_RX_RING_SIZE+1]; }; struct net_device_stats { u_long rx_packets; /* total packets received */ u_long tx_packets; /* total packets transmitted */ u_long rx_bytes; /* total bytes received */ u_long tx_bytes; /* total bytes transmitted */ u_long rx_errors; /* bad packets received */ u_long tx_errors; /* packet transmit problems */ u_long rx_dropped; /* no space in linux buffers */ u_long tx_dropped; /* no space available in linux */ u_long multicast; /* multicast packets received */ u_long collisions; /* detailed rx_errors: */ u_long rx_length_errors; u_long rx_over_errors; /* receiver ring buff overflow */ u_long rx_crc_errors; /* recved pkt with crc error */ u_long rx_frame_errors; /* recv'd frame alignment error */ u_long rx_fifo_errors; /* recv'r fifo overrun */ u_long rx_missed_errors; /* receiver missed packet */ /* detailed tx_errors */ u_long tx_aborted_errors; u_long tx_carrier_errors; u_long tx_fifo_errors; u_long tx_heartbeat_errors; u_long tx_window_errors; /* for cslip etc */ u_long rx_compressed; u_long tx_compressed; }; struct netfront_info { struct ifnet *xn_ifp; #if __FreeBSD_version >= 700000 struct lro_ctrl xn_lro; #endif struct net_device_stats stats; u_int tx_full; netif_tx_front_ring_t tx; netif_rx_front_ring_t rx; struct mtx tx_lock; struct mtx rx_lock; struct mtx sc_lock; xen_intr_handle_t xen_intr_handle; u_int copying_receiver; u_int carrier; u_int maxfrags; /* Receive-ring batched refills. */ #define RX_MIN_TARGET 32 #define RX_MAX_TARGET NET_RX_RING_SIZE int rx_min_target; int rx_max_target; int rx_target; grant_ref_t gref_tx_head; grant_ref_t grant_tx_ref[NET_TX_RING_SIZE + 1]; grant_ref_t gref_rx_head; grant_ref_t grant_rx_ref[NET_TX_RING_SIZE + 1]; device_t xbdev; int tx_ring_ref; int rx_ring_ref; uint8_t mac[ETHER_ADDR_LEN]; struct xn_chain_data xn_cdata; /* mbufs */ struct mbuf_head xn_rx_batch; /* head of the batch queue */ int xn_if_flags; struct callout xn_stat_ch; u_long rx_pfn_array[NET_RX_RING_SIZE]; multicall_entry_t rx_mcl[NET_RX_RING_SIZE+1]; mmu_update_t rx_mmu[NET_RX_RING_SIZE]; struct ifmedia sc_media; }; #define rx_mbufs xn_cdata.xn_rx_chain #define tx_mbufs xn_cdata.xn_tx_chain #define XN_LOCK_INIT(_sc, _name) \ mtx_init(&(_sc)->tx_lock, #_name"_tx", "network transmit lock", MTX_DEF); \ mtx_init(&(_sc)->rx_lock, #_name"_rx", "network receive lock", MTX_DEF); \ mtx_init(&(_sc)->sc_lock, #_name"_sc", "netfront softc lock", MTX_DEF) #define XN_RX_LOCK(_sc) mtx_lock(&(_sc)->rx_lock) #define XN_RX_UNLOCK(_sc) mtx_unlock(&(_sc)->rx_lock) #define XN_TX_LOCK(_sc) mtx_lock(&(_sc)->tx_lock) #define XN_TX_UNLOCK(_sc) mtx_unlock(&(_sc)->tx_lock) #define XN_LOCK(_sc) mtx_lock(&(_sc)->sc_lock); #define XN_UNLOCK(_sc) mtx_unlock(&(_sc)->sc_lock); #define XN_LOCK_ASSERT(_sc) mtx_assert(&(_sc)->sc_lock, MA_OWNED); #define XN_RX_LOCK_ASSERT(_sc) mtx_assert(&(_sc)->rx_lock, MA_OWNED); #define XN_TX_LOCK_ASSERT(_sc) mtx_assert(&(_sc)->tx_lock, MA_OWNED); #define XN_LOCK_DESTROY(_sc) mtx_destroy(&(_sc)->rx_lock); \ mtx_destroy(&(_sc)->tx_lock); \ mtx_destroy(&(_sc)->sc_lock); struct netfront_rx_info { struct netif_rx_response rx; struct netif_extra_info extras[XEN_NETIF_EXTRA_TYPE_MAX - 1]; }; #define netfront_carrier_on(netif) ((netif)->carrier = 1) #define netfront_carrier_off(netif) ((netif)->carrier = 0) #define netfront_carrier_ok(netif) ((netif)->carrier) /* Access macros for acquiring freeing slots in xn_free_{tx,rx}_idxs[]. */ static inline void add_id_to_freelist(struct mbuf **list, uintptr_t id) { KASSERT(id != 0, ("%s: the head item (0) must always be free.", __func__)); list[id] = list[0]; list[0] = (struct mbuf *)id; } static inline unsigned short get_id_from_freelist(struct mbuf **list) { uintptr_t id; id = (uintptr_t)list[0]; KASSERT(id != 0, ("%s: the head item (0) must always remain free.", __func__)); list[0] = list[id]; return (id); } static inline int xennet_rxidx(RING_IDX idx) { return idx & (NET_RX_RING_SIZE - 1); } static inline struct mbuf * xennet_get_rx_mbuf(struct netfront_info *np, RING_IDX ri) { int i = xennet_rxidx(ri); struct mbuf *m; m = np->rx_mbufs[i]; np->rx_mbufs[i] = NULL; return (m); } static inline grant_ref_t xennet_get_rx_ref(struct netfront_info *np, RING_IDX ri) { int i = xennet_rxidx(ri); grant_ref_t ref = np->grant_rx_ref[i]; KASSERT(ref != GRANT_REF_INVALID, ("Invalid grant reference!\n")); np->grant_rx_ref[i] = GRANT_REF_INVALID; return ref; } #define IPRINTK(fmt, args...) \ printf("[XEN] " fmt, ##args) #ifdef INVARIANTS #define WPRINTK(fmt, args...) \ printf("[XEN] " fmt, ##args) #else #define WPRINTK(fmt, args...) #endif #ifdef DEBUG #define DPRINTK(fmt, args...) \ printf("[XEN] %s: " fmt, __func__, ##args) #else #define DPRINTK(fmt, args...) #endif /** * Read the 'mac' node at the given device's node in the store, and parse that * as colon-separated octets, placing result the given mac array. mac must be * a preallocated array of length ETH_ALEN (as declared in linux/if_ether.h). * Return 0 on success, or errno on error. */ static int xen_net_read_mac(device_t dev, uint8_t mac[]) { int error, i; char *s, *e, *macstr; const char *path; path = xenbus_get_node(dev); error = xs_read(XST_NIL, path, "mac", NULL, (void **) &macstr); if (error == ENOENT) { /* * Deal with missing mac XenStore nodes on devices with * HVM emulation (the 'ioemu' configuration attribute) * enabled. * * The HVM emulator may execute in a stub device model * domain which lacks the permission, only given to Dom0, * to update the guest's XenStore tree. For this reason, * the HVM emulator doesn't even attempt to write the * front-side mac node, even when operating in Dom0. * However, there should always be a mac listed in the * backend tree. Fallback to this version if our query * of the front side XenStore location doesn't find * anything. */ path = xenbus_get_otherend_path(dev); error = xs_read(XST_NIL, path, "mac", NULL, (void **) &macstr); } if (error != 0) { xenbus_dev_fatal(dev, error, "parsing %s/mac", path); return (error); } s = macstr; for (i = 0; i < ETHER_ADDR_LEN; i++) { mac[i] = strtoul(s, &e, 16); if (s == e || (e[0] != ':' && e[0] != 0)) { free(macstr, M_XENBUS); return (ENOENT); } s = &e[1]; } free(macstr, M_XENBUS); return (0); } /** * Entry point to this code when a new device is created. Allocate the basic * structures and the ring buffers for communication with the backend, and * inform the backend of the appropriate details for those. Switch to * Connected state. */ static int netfront_probe(device_t dev) { if (!strcmp(xenbus_get_type(dev), "vif")) { device_set_desc(dev, "Virtual Network Interface"); return (0); } return (ENXIO); } static int netfront_attach(device_t dev) { int err; err = create_netdev(dev); if (err) { xenbus_dev_fatal(dev, err, "creating netdev"); return (err); } #if __FreeBSD_version >= 700000 SYSCTL_ADD_INT(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "enable_lro", CTLTYPE_INT|CTLFLAG_RW, &xn_enable_lro, 0, "Large Receive Offload"); #endif return (0); } static int netfront_suspend(device_t dev) { struct netfront_info *info = device_get_softc(dev); XN_RX_LOCK(info); XN_TX_LOCK(info); netfront_carrier_off(info); XN_TX_UNLOCK(info); XN_RX_UNLOCK(info); return (0); } /** * We are reconnecting to the backend, due to a suspend/resume, or a backend * driver restart. We tear down our netif structure and recreate it, but * leave the device-layer structures intact so that this is transparent to the * rest of the kernel. */ static int netfront_resume(device_t dev) { struct netfront_info *info = device_get_softc(dev); netif_disconnect_backend(info); return (0); } /* Common code used when first setting up, and when resuming. */ static int talk_to_backend(device_t dev, struct netfront_info *info) { const char *message; struct xs_transaction xst; const char *node = xenbus_get_node(dev); int err; err = xen_net_read_mac(dev, info->mac); if (err) { xenbus_dev_fatal(dev, err, "parsing %s/mac", node); goto out; } /* Create shared ring, alloc event channel. */ err = setup_device(dev, info); if (err) goto out; again: err = xs_transaction_start(&xst); if (err) { xenbus_dev_fatal(dev, err, "starting transaction"); goto destroy_ring; } err = xs_printf(xst, node, "tx-ring-ref","%u", info->tx_ring_ref); if (err) { message = "writing tx ring-ref"; goto abort_transaction; } err = xs_printf(xst, node, "rx-ring-ref","%u", info->rx_ring_ref); if (err) { message = "writing rx ring-ref"; goto abort_transaction; } err = xs_printf(xst, node, "event-channel", "%u", xen_intr_port(info->xen_intr_handle)); if (err) { message = "writing event-channel"; goto abort_transaction; } err = xs_printf(xst, node, "request-rx-copy", "%u", info->copying_receiver); if (err) { message = "writing request-rx-copy"; goto abort_transaction; } err = xs_printf(xst, node, "feature-rx-notify", "%d", 1); if (err) { message = "writing feature-rx-notify"; goto abort_transaction; } err = xs_printf(xst, node, "feature-sg", "%d", 1); if (err) { message = "writing feature-sg"; goto abort_transaction; } #if __FreeBSD_version >= 700000 err = xs_printf(xst, node, "feature-gso-tcpv4", "%d", 1); if (err) { message = "writing feature-gso-tcpv4"; goto abort_transaction; } #endif err = xs_transaction_end(xst, 0); if (err) { if (err == EAGAIN) goto again; xenbus_dev_fatal(dev, err, "completing transaction"); goto destroy_ring; } return 0; abort_transaction: xs_transaction_end(xst, 1); xenbus_dev_fatal(dev, err, "%s", message); destroy_ring: netif_free(info); out: return err; } static int setup_device(device_t dev, struct netfront_info *info) { netif_tx_sring_t *txs; netif_rx_sring_t *rxs; int error; struct ifnet *ifp; ifp = info->xn_ifp; info->tx_ring_ref = GRANT_REF_INVALID; info->rx_ring_ref = GRANT_REF_INVALID; info->rx.sring = NULL; info->tx.sring = NULL; txs = (netif_tx_sring_t *)malloc(PAGE_SIZE, M_DEVBUF, M_NOWAIT|M_ZERO); if (!txs) { error = ENOMEM; xenbus_dev_fatal(dev, error, "allocating tx ring page"); goto fail; } SHARED_RING_INIT(txs); FRONT_RING_INIT(&info->tx, txs, PAGE_SIZE); error = xenbus_grant_ring(dev, virt_to_mfn(txs), &info->tx_ring_ref); if (error) goto fail; rxs = (netif_rx_sring_t *)malloc(PAGE_SIZE, M_DEVBUF, M_NOWAIT|M_ZERO); if (!rxs) { error = ENOMEM; xenbus_dev_fatal(dev, error, "allocating rx ring page"); goto fail; } SHARED_RING_INIT(rxs); FRONT_RING_INIT(&info->rx, rxs, PAGE_SIZE); error = xenbus_grant_ring(dev, virt_to_mfn(rxs), &info->rx_ring_ref); if (error) goto fail; error = xen_intr_alloc_and_bind_local_port(dev, xenbus_get_otherend_id(dev), /*filter*/NULL, xn_intr, info, INTR_TYPE_NET | INTR_MPSAFE | INTR_ENTROPY, &info->xen_intr_handle); if (error) { xenbus_dev_fatal(dev, error, "xen_intr_alloc_and_bind_local_port failed"); goto fail; } return (0); fail: netif_free(info); return (error); } #ifdef INET /** * If this interface has an ipv4 address, send an arp for it. This * helps to get the network going again after migrating hosts. */ static void netfront_send_fake_arp(device_t dev, struct netfront_info *info) { struct ifnet *ifp; struct ifaddr *ifa; ifp = info->xn_ifp; TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family == AF_INET) { arp_ifinit(ifp, ifa); } } } #endif /** * Callback received when the backend's state changes. */ static void netfront_backend_changed(device_t dev, XenbusState newstate) { struct netfront_info *sc = device_get_softc(dev); DPRINTK("newstate=%d\n", newstate); switch (newstate) { case XenbusStateInitialising: case XenbusStateInitialised: case XenbusStateConnected: case XenbusStateUnknown: case XenbusStateClosed: case XenbusStateReconfigured: case XenbusStateReconfiguring: break; case XenbusStateInitWait: if (xenbus_get_state(dev) != XenbusStateInitialising) break; if (network_connect(sc) != 0) break; xenbus_set_state(dev, XenbusStateConnected); #ifdef INET netfront_send_fake_arp(dev, sc); #endif break; case XenbusStateClosing: xenbus_set_state(dev, XenbusStateClosed); break; } } static void xn_free_rx_ring(struct netfront_info *sc) { #if 0 int i; for (i = 0; i < NET_RX_RING_SIZE; i++) { if (sc->xn_cdata.rx_mbufs[i] != NULL) { m_freem(sc->rx_mbufs[i]); sc->rx_mbufs[i] = NULL; } } sc->rx.rsp_cons = 0; sc->xn_rx_if->req_prod = 0; sc->xn_rx_if->event = sc->rx.rsp_cons ; #endif } static void xn_free_tx_ring(struct netfront_info *sc) { #if 0 int i; for (i = 0; i < NET_TX_RING_SIZE; i++) { if (sc->tx_mbufs[i] != NULL) { m_freem(sc->tx_mbufs[i]); sc->xn_cdata.xn_tx_chain[i] = NULL; } } return; #endif } /** * \brief Verify that there is sufficient space in the Tx ring * buffer for a maximally sized request to be enqueued. * * A transmit request requires a transmit descriptor for each packet * fragment, plus up to 2 entries for "options" (e.g. TSO). */ static inline int xn_tx_slot_available(struct netfront_info *np) { return (RING_FREE_REQUESTS(&np->tx) > (MAX_TX_REQ_FRAGS + 2)); } static void netif_release_tx_bufs(struct netfront_info *np) { int i; for (i = 1; i <= NET_TX_RING_SIZE; i++) { struct mbuf *m; m = np->tx_mbufs[i]; /* * We assume that no kernel addresses are * less than NET_TX_RING_SIZE. Any entry * in the table that is below this number * must be an index from free-list tracking. */ if (((uintptr_t)m) <= NET_TX_RING_SIZE) continue; gnttab_end_foreign_access_ref(np->grant_tx_ref[i]); gnttab_release_grant_reference(&np->gref_tx_head, np->grant_tx_ref[i]); np->grant_tx_ref[i] = GRANT_REF_INVALID; add_id_to_freelist(np->tx_mbufs, i); np->xn_cdata.xn_tx_chain_cnt--; if (np->xn_cdata.xn_tx_chain_cnt < 0) { panic("%s: tx_chain_cnt must be >= 0", __func__); } m_free(m); } } static void network_alloc_rx_buffers(struct netfront_info *sc) { int otherend_id = xenbus_get_otherend_id(sc->xbdev); unsigned short id; struct mbuf *m_new; int i, batch_target, notify; RING_IDX req_prod; struct xen_memory_reservation reservation; grant_ref_t ref; int nr_flips; netif_rx_request_t *req; vm_offset_t vaddr; u_long pfn; req_prod = sc->rx.req_prod_pvt; if (__predict_false(sc->carrier == 0)) return; /* * Allocate mbufs greedily, even though we batch updates to the * receive ring. This creates a less bursty demand on the memory * allocator, and so should reduce the chance of failed allocation * requests both for ourself and for other kernel subsystems. * * Here we attempt to maintain rx_target buffers in flight, counting * buffers that we have yet to process in the receive ring. */ batch_target = sc->rx_target - (req_prod - sc->rx.rsp_cons); for (i = mbufq_len(&sc->xn_rx_batch); i < batch_target; i++) { MGETHDR(m_new, M_NOWAIT, MT_DATA); if (m_new == NULL) { printf("%s: MGETHDR failed\n", __func__); goto no_mbuf; } m_cljget(m_new, M_NOWAIT, MJUMPAGESIZE); if ((m_new->m_flags & M_EXT) == 0) { printf("%s: m_cljget failed\n", __func__); m_freem(m_new); no_mbuf: if (i != 0) goto refill; /* * XXX set timer */ break; } m_new->m_len = m_new->m_pkthdr.len = MJUMPAGESIZE; /* queue the mbufs allocated */ mbufq_tail(&sc->xn_rx_batch, m_new); } /* * If we've allocated at least half of our target number of entries, * submit them to the backend - we have enough to make the overhead * of submission worthwhile. Otherwise wait for more mbufs and * request entries to become available. */ if (i < (sc->rx_target/2)) { if (req_prod >sc->rx.sring->req_prod) goto push; return; } /* * Double floating fill target if we risked having the backend * run out of empty buffers for receive traffic. We define "running * low" as having less than a fourth of our target buffers free * at the time we refilled the queue. */ if ((req_prod - sc->rx.sring->rsp_prod) < (sc->rx_target / 4)) { sc->rx_target *= 2; if (sc->rx_target > sc->rx_max_target) sc->rx_target = sc->rx_max_target; } refill: for (nr_flips = i = 0; ; i++) { if ((m_new = mbufq_dequeue(&sc->xn_rx_batch)) == NULL) break; m_new->m_ext.ext_arg1 = (vm_paddr_t *)(uintptr_t)( vtophys(m_new->m_ext.ext_buf) >> PAGE_SHIFT); id = xennet_rxidx(req_prod + i); KASSERT(sc->rx_mbufs[id] == NULL, ("non-NULL xm_rx_chain")); sc->rx_mbufs[id] = m_new; ref = gnttab_claim_grant_reference(&sc->gref_rx_head); KASSERT(ref != GNTTAB_LIST_END, ("reserved grant references exhuasted")); sc->grant_rx_ref[id] = ref; vaddr = mtod(m_new, vm_offset_t); pfn = vtophys(vaddr) >> PAGE_SHIFT; req = RING_GET_REQUEST(&sc->rx, req_prod + i); if (sc->copying_receiver == 0) { gnttab_grant_foreign_transfer_ref(ref, otherend_id, pfn); sc->rx_pfn_array[nr_flips] = PFNTOMFN(pfn); if (!xen_feature(XENFEAT_auto_translated_physmap)) { /* Remove this page before passing * back to Xen. */ set_phys_to_machine(pfn, INVALID_P2M_ENTRY); MULTI_update_va_mapping(&sc->rx_mcl[i], vaddr, 0, 0); } nr_flips++; } else { gnttab_grant_foreign_access_ref(ref, otherend_id, PFNTOMFN(pfn), 0); } req->id = id; req->gref = ref; sc->rx_pfn_array[i] = vtomach(mtod(m_new,vm_offset_t)) >> PAGE_SHIFT; } KASSERT(i, ("no mbufs processed")); /* should have returned earlier */ KASSERT(mbufq_len(&sc->xn_rx_batch) == 0, ("not all mbufs processed")); /* * We may have allocated buffers which have entries outstanding * in the page * update queue -- make sure we flush those first! */ PT_UPDATES_FLUSH(); if (nr_flips != 0) { #ifdef notyet /* Tell the ballon driver what is going on. */ balloon_update_driver_allowance(i); #endif set_xen_guest_handle(reservation.extent_start, sc->rx_pfn_array); reservation.nr_extents = i; reservation.extent_order = 0; reservation.address_bits = 0; reservation.domid = DOMID_SELF; if (!xen_feature(XENFEAT_auto_translated_physmap)) { /* After all PTEs have been zapped, flush the TLB. */ sc->rx_mcl[i-1].args[MULTI_UVMFLAGS_INDEX] = UVMF_TLB_FLUSH|UVMF_ALL; /* Give away a batch of pages. */ sc->rx_mcl[i].op = __HYPERVISOR_memory_op; sc->rx_mcl[i].args[0] = XENMEM_decrease_reservation; sc->rx_mcl[i].args[1] = (u_long)&reservation; /* Zap PTEs and give away pages in one big multicall. */ (void)HYPERVISOR_multicall(sc->rx_mcl, i+1); if (__predict_false(sc->rx_mcl[i].result != i || HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation) != i)) panic("%s: unable to reduce memory " "reservation\n", __func__); } } else { wmb(); } /* Above is a suitable barrier to ensure backend will see requests. */ sc->rx.req_prod_pvt = req_prod + i; push: RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&sc->rx, notify); if (notify) xen_intr_signal(sc->xen_intr_handle); } static void xn_rxeof(struct netfront_info *np) { struct ifnet *ifp; #if __FreeBSD_version >= 700000 && (defined(INET) || defined(INET6)) struct lro_ctrl *lro = &np->xn_lro; struct lro_entry *queued; #endif struct netfront_rx_info rinfo; struct netif_rx_response *rx = &rinfo.rx; struct netif_extra_info *extras = rinfo.extras; RING_IDX i, rp; multicall_entry_t *mcl; struct mbuf *m; struct mbuf_head rxq, errq; int err, pages_flipped = 0, work_to_do; do { XN_RX_LOCK_ASSERT(np); if (!netfront_carrier_ok(np)) return; mbufq_init(&errq); mbufq_init(&rxq); ifp = np->xn_ifp; rp = np->rx.sring->rsp_prod; rmb(); /* Ensure we see queued responses up to 'rp'. */ i = np->rx.rsp_cons; while ((i != rp)) { memcpy(rx, RING_GET_RESPONSE(&np->rx, i), sizeof(*rx)); memset(extras, 0, sizeof(rinfo.extras)); m = NULL; err = xennet_get_responses(np, &rinfo, rp, &i, &m, &pages_flipped); if (__predict_false(err)) { if (m) mbufq_tail(&errq, m); np->stats.rx_errors++; continue; } m->m_pkthdr.rcvif = ifp; if ( rx->flags & NETRXF_data_validated ) { /* Tell the stack the checksums are okay */ /* * XXX this isn't necessarily the case - need to add * check */ m->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR); m->m_pkthdr.csum_data = 0xffff; } np->stats.rx_packets++; np->stats.rx_bytes += m->m_pkthdr.len; mbufq_tail(&rxq, m); np->rx.rsp_cons = i; } if (pages_flipped) { /* Some pages are no longer absent... */ #ifdef notyet balloon_update_driver_allowance(-pages_flipped); #endif /* Do all the remapping work, and M->P updates, in one big * hypercall. */ if (!!xen_feature(XENFEAT_auto_translated_physmap)) { mcl = np->rx_mcl + pages_flipped; mcl->op = __HYPERVISOR_mmu_update; mcl->args[0] = (u_long)np->rx_mmu; mcl->args[1] = pages_flipped; mcl->args[2] = 0; mcl->args[3] = DOMID_SELF; (void)HYPERVISOR_multicall(np->rx_mcl, pages_flipped + 1); } } while ((m = mbufq_dequeue(&errq))) m_freem(m); /* * Process all the mbufs after the remapping is complete. * Break the mbuf chain first though. */ while ((m = mbufq_dequeue(&rxq)) != NULL) { ifp->if_ipackets++; /* * Do we really need to drop the rx lock? */ XN_RX_UNLOCK(np); #if __FreeBSD_version >= 700000 && (defined(INET) || defined(INET6)) /* Use LRO if possible */ if ((ifp->if_capenable & IFCAP_LRO) == 0 || lro->lro_cnt == 0 || tcp_lro_rx(lro, m, 0)) { /* * If LRO fails, pass up to the stack * directly. */ (*ifp->if_input)(ifp, m); } #else (*ifp->if_input)(ifp, m); #endif XN_RX_LOCK(np); } np->rx.rsp_cons = i; #if __FreeBSD_version >= 700000 && (defined(INET) || defined(INET6)) /* * Flush any outstanding LRO work */ while (!SLIST_EMPTY(&lro->lro_active)) { queued = SLIST_FIRST(&lro->lro_active); SLIST_REMOVE_HEAD(&lro->lro_active, next); tcp_lro_flush(lro, queued); } #endif #if 0 /* If we get a callback with very few responses, reduce fill target. */ /* NB. Note exponential increase, linear decrease. */ if (((np->rx.req_prod_pvt - np->rx.sring->rsp_prod) > ((3*np->rx_target) / 4)) && (--np->rx_target < np->rx_min_target)) np->rx_target = np->rx_min_target; #endif network_alloc_rx_buffers(np); RING_FINAL_CHECK_FOR_RESPONSES(&np->rx, work_to_do); } while (work_to_do); } static void xn_txeof(struct netfront_info *np) { RING_IDX i, prod; unsigned short id; struct ifnet *ifp; netif_tx_response_t *txr; struct mbuf *m; XN_TX_LOCK_ASSERT(np); if (!netfront_carrier_ok(np)) return; ifp = np->xn_ifp; do { prod = np->tx.sring->rsp_prod; rmb(); /* Ensure we see responses up to 'rp'. */ for (i = np->tx.rsp_cons; i != prod; i++) { txr = RING_GET_RESPONSE(&np->tx, i); if (txr->status == NETIF_RSP_NULL) continue; if (txr->status != NETIF_RSP_OKAY) { printf("%s: WARNING: response is %d!\n", __func__, txr->status); } id = txr->id; m = np->tx_mbufs[id]; KASSERT(m != NULL, ("mbuf not found in xn_tx_chain")); KASSERT((uintptr_t)m > NET_TX_RING_SIZE, ("mbuf already on the free list, but we're " "trying to free it again!")); M_ASSERTVALID(m); /* * Increment packet count if this is the last * mbuf of the chain. */ if (!m->m_next) ifp->if_opackets++; if (__predict_false(gnttab_query_foreign_access( np->grant_tx_ref[id]) != 0)) { panic("%s: grant id %u still in use by the " "backend", __func__, id); } gnttab_end_foreign_access_ref( np->grant_tx_ref[id]); gnttab_release_grant_reference( &np->gref_tx_head, np->grant_tx_ref[id]); np->grant_tx_ref[id] = GRANT_REF_INVALID; np->tx_mbufs[id] = NULL; add_id_to_freelist(np->tx_mbufs, id); np->xn_cdata.xn_tx_chain_cnt--; m_free(m); /* Only mark the queue active if we've freed up at least one slot to try */ ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; } np->tx.rsp_cons = prod; /* * Set a new event, then check for race with update of * tx_cons. Note that it is essential to schedule a * callback, no matter how few buffers are pending. Even if * there is space in the transmit ring, higher layers may * be blocked because too much data is outstanding: in such * cases notification from Xen is likely to be the only kick * that we'll get. */ np->tx.sring->rsp_event = prod + ((np->tx.sring->req_prod - prod) >> 1) + 1; mb(); } while (prod != np->tx.sring->rsp_prod); if (np->tx_full && ((np->tx.sring->req_prod - prod) < NET_TX_RING_SIZE)) { np->tx_full = 0; #if 0 if (np->user_state == UST_OPEN) netif_wake_queue(dev); #endif } } static void xn_intr(void *xsc) { struct netfront_info *np = xsc; struct ifnet *ifp = np->xn_ifp; #if 0 if (!(np->rx.rsp_cons != np->rx.sring->rsp_prod && likely(netfront_carrier_ok(np)) && ifp->if_drv_flags & IFF_DRV_RUNNING)) return; #endif if (RING_HAS_UNCONSUMED_RESPONSES(&np->tx)) { XN_TX_LOCK(np); xn_txeof(np); XN_TX_UNLOCK(np); } XN_RX_LOCK(np); xn_rxeof(np); XN_RX_UNLOCK(np); if (ifp->if_drv_flags & IFF_DRV_RUNNING && !IFQ_DRV_IS_EMPTY(&ifp->if_snd)) xn_start(ifp); } static void xennet_move_rx_slot(struct netfront_info *np, struct mbuf *m, grant_ref_t ref) { int new = xennet_rxidx(np->rx.req_prod_pvt); KASSERT(np->rx_mbufs[new] == NULL, ("rx_mbufs != NULL")); np->rx_mbufs[new] = m; np->grant_rx_ref[new] = ref; RING_GET_REQUEST(&np->rx, np->rx.req_prod_pvt)->id = new; RING_GET_REQUEST(&np->rx, np->rx.req_prod_pvt)->gref = ref; np->rx.req_prod_pvt++; } static int xennet_get_extras(struct netfront_info *np, struct netif_extra_info *extras, RING_IDX rp, RING_IDX *cons) { struct netif_extra_info *extra; int err = 0; do { struct mbuf *m; grant_ref_t ref; if (__predict_false(*cons + 1 == rp)) { #if 0 if (net_ratelimit()) WPRINTK("Missing extra info\n"); #endif err = EINVAL; break; } extra = (struct netif_extra_info *) RING_GET_RESPONSE(&np->rx, ++(*cons)); if (__predict_false(!extra->type || extra->type >= XEN_NETIF_EXTRA_TYPE_MAX)) { #if 0 if (net_ratelimit()) WPRINTK("Invalid extra type: %d\n", extra->type); #endif err = EINVAL; } else { memcpy(&extras[extra->type - 1], extra, sizeof(*extra)); } m = xennet_get_rx_mbuf(np, *cons); ref = xennet_get_rx_ref(np, *cons); xennet_move_rx_slot(np, m, ref); } while (extra->flags & XEN_NETIF_EXTRA_FLAG_MORE); return err; } static int xennet_get_responses(struct netfront_info *np, struct netfront_rx_info *rinfo, RING_IDX rp, RING_IDX *cons, struct mbuf **list, int *pages_flipped_p) { int pages_flipped = *pages_flipped_p; struct mmu_update *mmu; struct multicall_entry *mcl; struct netif_rx_response *rx = &rinfo->rx; struct netif_extra_info *extras = rinfo->extras; struct mbuf *m, *m0, *m_prev; grant_ref_t ref = xennet_get_rx_ref(np, *cons); RING_IDX ref_cons = *cons; int frags = 1; int err = 0; u_long ret; m0 = m = m_prev = xennet_get_rx_mbuf(np, *cons); if (rx->flags & NETRXF_extra_info) { err = xennet_get_extras(np, extras, rp, cons); } if (m0 != NULL) { m0->m_pkthdr.len = 0; m0->m_next = NULL; } for (;;) { u_long mfn; #if 0 DPRINTK("rx->status=%hd rx->offset=%hu frags=%u\n", rx->status, rx->offset, frags); #endif if (__predict_false(rx->status < 0 || rx->offset + rx->status > PAGE_SIZE)) { #if 0 if (net_ratelimit()) WPRINTK("rx->offset: %x, size: %u\n", rx->offset, rx->status); #endif xennet_move_rx_slot(np, m, ref); if (m0 == m) m0 = NULL; m = NULL; err = EINVAL; goto next_skip_queue; } /* * This definitely indicates a bug, either in this driver or in * the backend driver. In future this should flag the bad * situation to the system controller to reboot the backed. */ if (ref == GRANT_REF_INVALID) { #if 0 if (net_ratelimit()) WPRINTK("Bad rx response id %d.\n", rx->id); #endif printf("%s: Bad rx response id %d.\n", __func__,rx->id); err = EINVAL; goto next; } if (!np->copying_receiver) { /* Memory pressure, insufficient buffer * headroom, ... */ if (!(mfn = gnttab_end_foreign_transfer_ref(ref))) { WPRINTK("Unfulfilled rx req (id=%d, st=%d).\n", rx->id, rx->status); xennet_move_rx_slot(np, m, ref); err = ENOMEM; goto next; } if (!xen_feature( XENFEAT_auto_translated_physmap)) { /* Remap the page. */ void *vaddr = mtod(m, void *); uint32_t pfn; mcl = np->rx_mcl + pages_flipped; mmu = np->rx_mmu + pages_flipped; MULTI_update_va_mapping(mcl, (u_long)vaddr, (((vm_paddr_t)mfn) << PAGE_SHIFT) | PG_RW | PG_V | PG_M | PG_A, 0); pfn = (uintptr_t)m->m_ext.ext_arg1; mmu->ptr = ((vm_paddr_t)mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE; mmu->val = pfn; set_phys_to_machine(pfn, mfn); } pages_flipped++; } else { ret = gnttab_end_foreign_access_ref(ref); KASSERT(ret, ("ret != 0")); } gnttab_release_grant_reference(&np->gref_rx_head, ref); next: if (m == NULL) break; m->m_len = rx->status; m->m_data += rx->offset; m0->m_pkthdr.len += rx->status; next_skip_queue: if (!(rx->flags & NETRXF_more_data)) break; if (*cons + frags == rp) { if (net_ratelimit()) WPRINTK("Need more frags\n"); err = ENOENT; printf("%s: cons %u frags %u rp %u, not enough frags\n", __func__, *cons, frags, rp); break; } /* * Note that m can be NULL, if rx->status < 0 or if * rx->offset + rx->status > PAGE_SIZE above. */ m_prev = m; rx = RING_GET_RESPONSE(&np->rx, *cons + frags); m = xennet_get_rx_mbuf(np, *cons + frags); /* * m_prev == NULL can happen if rx->status < 0 or if * rx->offset + * rx->status > PAGE_SIZE above. */ if (m_prev != NULL) m_prev->m_next = m; /* * m0 can be NULL if rx->status < 0 or if * rx->offset + * rx->status > PAGE_SIZE above. */ if (m0 == NULL) m0 = m; m->m_next = NULL; ref = xennet_get_rx_ref(np, *cons + frags); ref_cons = *cons + frags; frags++; } *list = m0; *cons += frags; *pages_flipped_p = pages_flipped; return (err); } static void xn_tick_locked(struct netfront_info *sc) { XN_RX_LOCK_ASSERT(sc); callout_reset(&sc->xn_stat_ch, hz, xn_tick, sc); /* XXX placeholder for printing debug information */ } static void xn_tick(void *xsc) { struct netfront_info *sc; sc = xsc; XN_RX_LOCK(sc); xn_tick_locked(sc); XN_RX_UNLOCK(sc); } /** * \brief Count the number of fragments in an mbuf chain. * * Surprisingly, there isn't an M* macro for this. */ static inline int xn_count_frags(struct mbuf *m) { int nfrags; for (nfrags = 0; m != NULL; m = m->m_next) nfrags++; return (nfrags); } /** * Given an mbuf chain, make sure we have enough room and then push * it onto the transmit ring. */ static int xn_assemble_tx_request(struct netfront_info *sc, struct mbuf *m_head) { struct ifnet *ifp; struct mbuf *m; u_int nfrags; netif_extra_info_t *extra; int otherend_id; ifp = sc->xn_ifp; /** * Defragment the mbuf if necessary. */ nfrags = xn_count_frags(m_head); /* * Check to see whether this request is longer than netback * can handle, and try to defrag it. */ /** * It is a bit lame, but the netback driver in Linux can't * deal with nfrags > MAX_TX_REQ_FRAGS, which is a quirk of * the Linux network stack. */ if (nfrags > sc->maxfrags) { m = m_defrag(m_head, M_NOWAIT); if (!m) { /* * Defrag failed, so free the mbuf and * therefore drop the packet. */ m_freem(m_head); return (EMSGSIZE); } m_head = m; } /* Determine how many fragments now exist */ nfrags = xn_count_frags(m_head); /* * Check to see whether the defragmented packet has too many * segments for the Linux netback driver. */ /** * The FreeBSD TCP stack, with TSO enabled, can produce a chain * of mbufs longer than Linux can handle. Make sure we don't * pass a too-long chain over to the other side by dropping the * packet. It doesn't look like there is currently a way to * tell the TCP stack to generate a shorter chain of packets. */ if (nfrags > MAX_TX_REQ_FRAGS) { #ifdef DEBUG printf("%s: nfrags %d > MAX_TX_REQ_FRAGS %d, netback " "won't be able to handle it, dropping\n", __func__, nfrags, MAX_TX_REQ_FRAGS); #endif m_freem(m_head); return (EMSGSIZE); } /* * This check should be redundant. We've already verified that we * have enough slots in the ring to handle a packet of maximum * size, and that our packet is less than the maximum size. Keep * it in here as an assert for now just to make certain that * xn_tx_chain_cnt is accurate. */ KASSERT((sc->xn_cdata.xn_tx_chain_cnt + nfrags) <= NET_TX_RING_SIZE, ("%s: xn_tx_chain_cnt (%d) + nfrags (%d) > NET_TX_RING_SIZE " "(%d)!", __func__, (int) sc->xn_cdata.xn_tx_chain_cnt, (int) nfrags, (int) NET_TX_RING_SIZE)); /* * Start packing the mbufs in this chain into * the fragment pointers. Stop when we run out * of fragments or hit the end of the mbuf chain. */ m = m_head; extra = NULL; otherend_id = xenbus_get_otherend_id(sc->xbdev); for (m = m_head; m; m = m->m_next) { netif_tx_request_t *tx; uintptr_t id; grant_ref_t ref; u_long mfn; /* XXX Wrong type? */ tx = RING_GET_REQUEST(&sc->tx, sc->tx.req_prod_pvt); id = get_id_from_freelist(sc->tx_mbufs); if (id == 0) panic("%s: was allocated the freelist head!\n", __func__); sc->xn_cdata.xn_tx_chain_cnt++; if (sc->xn_cdata.xn_tx_chain_cnt > NET_TX_RING_SIZE) panic("%s: tx_chain_cnt must be <= NET_TX_RING_SIZE\n", __func__); sc->tx_mbufs[id] = m; tx->id = id; ref = gnttab_claim_grant_reference(&sc->gref_tx_head); KASSERT((short)ref >= 0, ("Negative ref")); mfn = virt_to_mfn(mtod(m, vm_offset_t)); gnttab_grant_foreign_access_ref(ref, otherend_id, mfn, GNTMAP_readonly); tx->gref = sc->grant_tx_ref[id] = ref; tx->offset = mtod(m, vm_offset_t) & (PAGE_SIZE - 1); tx->flags = 0; if (m == m_head) { /* * The first fragment has the entire packet * size, subsequent fragments have just the * fragment size. The backend works out the * true size of the first fragment by * subtracting the sizes of the other * fragments. */ tx->size = m->m_pkthdr.len; /* * The first fragment contains the checksum flags * and is optionally followed by extra data for * TSO etc. */ /** * CSUM_TSO requires checksum offloading. * Some versions of FreeBSD fail to * set CSUM_TCP in the CSUM_TSO case, * so we have to test for CSUM_TSO * explicitly. */ if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA | CSUM_TSO)) { tx->flags |= (NETTXF_csum_blank | NETTXF_data_validated); } #if __FreeBSD_version >= 700000 if (m->m_pkthdr.csum_flags & CSUM_TSO) { struct netif_extra_info *gso = (struct netif_extra_info *) RING_GET_REQUEST(&sc->tx, ++sc->tx.req_prod_pvt); tx->flags |= NETTXF_extra_info; gso->u.gso.size = m->m_pkthdr.tso_segsz; gso->u.gso.type = XEN_NETIF_GSO_TYPE_TCPV4; gso->u.gso.pad = 0; gso->u.gso.features = 0; gso->type = XEN_NETIF_EXTRA_TYPE_GSO; gso->flags = 0; } #endif } else { tx->size = m->m_len; } if (m->m_next) tx->flags |= NETTXF_more_data; sc->tx.req_prod_pvt++; } BPF_MTAP(ifp, m_head); sc->stats.tx_bytes += m_head->m_pkthdr.len; sc->stats.tx_packets++; return (0); } static void xn_start_locked(struct ifnet *ifp) { struct netfront_info *sc; struct mbuf *m_head; int notify; sc = ifp->if_softc; if (!netfront_carrier_ok(sc)) return; /* * While we have enough transmit slots available for at least one * maximum-sized packet, pull mbufs off the queue and put them on * the transmit ring. */ while (xn_tx_slot_available(sc)) { IF_DEQUEUE(&ifp->if_snd, m_head); if (m_head == NULL) break; if (xn_assemble_tx_request(sc, m_head) != 0) break; } RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&sc->tx, notify); if (notify) xen_intr_signal(sc->xen_intr_handle); if (RING_FULL(&sc->tx)) { sc->tx_full = 1; #if 0 netif_stop_queue(dev); #endif } } static void xn_start(struct ifnet *ifp) { struct netfront_info *sc; sc = ifp->if_softc; XN_TX_LOCK(sc); xn_start_locked(ifp); XN_TX_UNLOCK(sc); } /* equivalent of network_open() in Linux */ static void xn_ifinit_locked(struct netfront_info *sc) { struct ifnet *ifp; XN_LOCK_ASSERT(sc); ifp = sc->xn_ifp; if (ifp->if_drv_flags & IFF_DRV_RUNNING) return; xn_stop(sc); network_alloc_rx_buffers(sc); sc->rx.sring->rsp_event = sc->rx.rsp_cons + 1; ifp->if_drv_flags |= IFF_DRV_RUNNING; ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; if_link_state_change(ifp, LINK_STATE_UP); callout_reset(&sc->xn_stat_ch, hz, xn_tick, sc); } static void xn_ifinit(void *xsc) { struct netfront_info *sc = xsc; XN_LOCK(sc); xn_ifinit_locked(sc); XN_UNLOCK(sc); } static int xn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct netfront_info *sc = ifp->if_softc; struct ifreq *ifr = (struct ifreq *) data; #ifdef INET struct ifaddr *ifa = (struct ifaddr *)data; #endif int mask, error = 0; switch(cmd) { case SIOCSIFADDR: case SIOCGIFADDR: #ifdef INET XN_LOCK(sc); if (ifa->ifa_addr->sa_family == AF_INET) { ifp->if_flags |= IFF_UP; if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) xn_ifinit_locked(sc); arp_ifinit(ifp, ifa); XN_UNLOCK(sc); } else { XN_UNLOCK(sc); #endif error = ether_ioctl(ifp, cmd, data); #ifdef INET } #endif break; case SIOCSIFMTU: /* XXX can we alter the MTU on a VN ?*/ #ifdef notyet if (ifr->ifr_mtu > XN_JUMBO_MTU) error = EINVAL; else #endif { ifp->if_mtu = ifr->ifr_mtu; ifp->if_drv_flags &= ~IFF_DRV_RUNNING; xn_ifinit(sc); } break; case SIOCSIFFLAGS: XN_LOCK(sc); if (ifp->if_flags & IFF_UP) { /* * If only the state of the PROMISC flag changed, * then just use the 'set promisc mode' command * instead of reinitializing the entire NIC. Doing * a full re-init means reloading the firmware and * waiting for it to start up, which may take a * second or two. */ #ifdef notyet /* No promiscuous mode with Xen */ if (ifp->if_drv_flags & IFF_DRV_RUNNING && ifp->if_flags & IFF_PROMISC && !(sc->xn_if_flags & IFF_PROMISC)) { XN_SETBIT(sc, XN_RX_MODE, XN_RXMODE_RX_PROMISC); } else if (ifp->if_drv_flags & IFF_DRV_RUNNING && !(ifp->if_flags & IFF_PROMISC) && sc->xn_if_flags & IFF_PROMISC) { XN_CLRBIT(sc, XN_RX_MODE, XN_RXMODE_RX_PROMISC); } else #endif xn_ifinit_locked(sc); } else { if (ifp->if_drv_flags & IFF_DRV_RUNNING) { xn_stop(sc); } } sc->xn_if_flags = ifp->if_flags; XN_UNLOCK(sc); error = 0; break; case SIOCSIFCAP: mask = ifr->ifr_reqcap ^ ifp->if_capenable; if (mask & IFCAP_TXCSUM) { if (IFCAP_TXCSUM & ifp->if_capenable) { ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4); ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP | CSUM_IP | CSUM_TSO); } else { ifp->if_capenable |= IFCAP_TXCSUM; ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP | CSUM_IP); } } if (mask & IFCAP_RXCSUM) { ifp->if_capenable ^= IFCAP_RXCSUM; } #if __FreeBSD_version >= 700000 if (mask & IFCAP_TSO4) { if (IFCAP_TSO4 & ifp->if_capenable) { ifp->if_capenable &= ~IFCAP_TSO4; ifp->if_hwassist &= ~CSUM_TSO; } else if (IFCAP_TXCSUM & ifp->if_capenable) { ifp->if_capenable |= IFCAP_TSO4; ifp->if_hwassist |= CSUM_TSO; } else { IPRINTK("Xen requires tx checksum offload" " be enabled to use TSO\n"); error = EINVAL; } } if (mask & IFCAP_LRO) { ifp->if_capenable ^= IFCAP_LRO; } #endif error = 0; break; case SIOCADDMULTI: case SIOCDELMULTI: #ifdef notyet if (ifp->if_drv_flags & IFF_DRV_RUNNING) { XN_LOCK(sc); xn_setmulti(sc); XN_UNLOCK(sc); error = 0; } #endif /* FALLTHROUGH */ case SIOCSIFMEDIA: case SIOCGIFMEDIA: error = ifmedia_ioctl(ifp, ifr, &sc->sc_media, cmd); break; default: error = ether_ioctl(ifp, cmd, data); } return (error); } static void xn_stop(struct netfront_info *sc) { struct ifnet *ifp; XN_LOCK_ASSERT(sc); ifp = sc->xn_ifp; callout_stop(&sc->xn_stat_ch); xn_free_rx_ring(sc); xn_free_tx_ring(sc); ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE); if_link_state_change(ifp, LINK_STATE_DOWN); } /* START of Xenolinux helper functions adapted to FreeBSD */ int network_connect(struct netfront_info *np) { int i, requeue_idx, error; grant_ref_t ref; netif_rx_request_t *req; u_int feature_rx_copy, feature_rx_flip; error = xs_scanf(XST_NIL, xenbus_get_otherend_path(np->xbdev), "feature-rx-copy", NULL, "%u", &feature_rx_copy); if (error) feature_rx_copy = 0; error = xs_scanf(XST_NIL, xenbus_get_otherend_path(np->xbdev), "feature-rx-flip", NULL, "%u", &feature_rx_flip); if (error) feature_rx_flip = 1; /* * Copy packets on receive path if: * (a) This was requested by user, and the backend supports it; or * (b) Flipping was requested, but this is unsupported by the backend. */ np->copying_receiver = ((MODPARM_rx_copy && feature_rx_copy) || (MODPARM_rx_flip && !feature_rx_flip)); /* Recovery procedure: */ error = talk_to_backend(np->xbdev, np); if (error) return (error); /* Step 1: Reinitialise variables. */ xn_query_features(np); xn_configure_features(np); netif_release_tx_bufs(np); /* Step 2: Rebuild the RX buffer freelist and the RX ring itself. */ for (requeue_idx = 0, i = 0; i < NET_RX_RING_SIZE; i++) { struct mbuf *m; u_long pfn; if (np->rx_mbufs[i] == NULL) continue; m = np->rx_mbufs[requeue_idx] = xennet_get_rx_mbuf(np, i); ref = np->grant_rx_ref[requeue_idx] = xennet_get_rx_ref(np, i); req = RING_GET_REQUEST(&np->rx, requeue_idx); pfn = vtophys(mtod(m, vm_offset_t)) >> PAGE_SHIFT; if (!np->copying_receiver) { gnttab_grant_foreign_transfer_ref(ref, xenbus_get_otherend_id(np->xbdev), pfn); } else { gnttab_grant_foreign_access_ref(ref, xenbus_get_otherend_id(np->xbdev), PFNTOMFN(pfn), 0); } req->gref = ref; req->id = requeue_idx; requeue_idx++; } np->rx.req_prod_pvt = requeue_idx; /* Step 3: All public and private state should now be sane. Get * ready to start sending and receiving packets and give the driver * domain a kick because we've probably just requeued some * packets. */ netfront_carrier_on(np); xen_intr_signal(np->xen_intr_handle); XN_TX_LOCK(np); xn_txeof(np); XN_TX_UNLOCK(np); network_alloc_rx_buffers(np); return (0); } static void xn_query_features(struct netfront_info *np) { int val; device_printf(np->xbdev, "backend features:"); if (xs_scanf(XST_NIL, xenbus_get_otherend_path(np->xbdev), "feature-sg", NULL, "%d", &val) < 0) val = 0; np->maxfrags = 1; if (val) { np->maxfrags = MAX_TX_REQ_FRAGS; printf(" feature-sg"); } if (xs_scanf(XST_NIL, xenbus_get_otherend_path(np->xbdev), "feature-gso-tcpv4", NULL, "%d", &val) < 0) val = 0; np->xn_ifp->if_capabilities &= ~(IFCAP_TSO4|IFCAP_LRO); if (val) { np->xn_ifp->if_capabilities |= IFCAP_TSO4|IFCAP_LRO; printf(" feature-gso-tcp4"); } printf("\n"); } static int xn_configure_features(struct netfront_info *np) { int err; err = 0; #if __FreeBSD_version >= 700000 && (defined(INET) || defined(INET6)) if ((np->xn_ifp->if_capenable & IFCAP_LRO) != 0) tcp_lro_free(&np->xn_lro); #endif np->xn_ifp->if_capenable = np->xn_ifp->if_capabilities & ~(IFCAP_LRO|IFCAP_TSO4); np->xn_ifp->if_hwassist &= ~CSUM_TSO; #if __FreeBSD_version >= 700000 && (defined(INET) || defined(INET6)) if (xn_enable_lro && (np->xn_ifp->if_capabilities & IFCAP_LRO) != 0) { err = tcp_lro_init(&np->xn_lro); if (err) { device_printf(np->xbdev, "LRO initialization failed\n"); } else { np->xn_lro.ifp = np->xn_ifp; np->xn_ifp->if_capenable |= IFCAP_LRO; } } if ((np->xn_ifp->if_capabilities & IFCAP_TSO4) != 0) { np->xn_ifp->if_capenable |= IFCAP_TSO4; np->xn_ifp->if_hwassist |= CSUM_TSO; } #endif return (err); } /** * Create a network device. * @param dev Newbus device representing this virtual NIC. */ int create_netdev(device_t dev) { int i; struct netfront_info *np; int err; struct ifnet *ifp; np = device_get_softc(dev); np->xbdev = dev; XN_LOCK_INIT(np, xennetif); ifmedia_init(&np->sc_media, 0, xn_ifmedia_upd, xn_ifmedia_sts); ifmedia_add(&np->sc_media, IFM_ETHER|IFM_MANUAL, 0, NULL); ifmedia_set(&np->sc_media, IFM_ETHER|IFM_MANUAL); np->rx_target = RX_MIN_TARGET; np->rx_min_target = RX_MIN_TARGET; np->rx_max_target = RX_MAX_TARGET; /* Initialise {tx,rx}_skbs to be a free chain containing every entry. */ for (i = 0; i <= NET_TX_RING_SIZE; i++) { np->tx_mbufs[i] = (void *) ((u_long) i+1); np->grant_tx_ref[i] = GRANT_REF_INVALID; } np->tx_mbufs[NET_TX_RING_SIZE] = (void *)0; for (i = 0; i <= NET_RX_RING_SIZE; i++) { np->rx_mbufs[i] = NULL; np->grant_rx_ref[i] = GRANT_REF_INVALID; } /* A grant for every tx ring slot */ if (gnttab_alloc_grant_references(NET_TX_RING_SIZE, &np->gref_tx_head) != 0) { IPRINTK("#### netfront can't alloc tx grant refs\n"); err = ENOMEM; goto exit; } /* A grant for every rx ring slot */ if (gnttab_alloc_grant_references(RX_MAX_TARGET, &np->gref_rx_head) != 0) { WPRINTK("#### netfront can't alloc rx grant refs\n"); gnttab_free_grant_references(np->gref_tx_head); err = ENOMEM; goto exit; } err = xen_net_read_mac(dev, np->mac); if (err) goto out; /* Set up ifnet structure */ ifp = np->xn_ifp = if_alloc(IFT_ETHER); ifp->if_softc = np; if_initname(ifp, "xn", device_get_unit(dev)); ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; ifp->if_ioctl = xn_ioctl; ifp->if_output = ether_output; ifp->if_start = xn_start; #ifdef notyet ifp->if_watchdog = xn_watchdog; #endif ifp->if_init = xn_ifinit; ifp->if_snd.ifq_maxlen = NET_TX_RING_SIZE - 1; ifp->if_hwassist = XN_CSUM_FEATURES; ifp->if_capabilities = IFCAP_HWCSUM; - ifp->if_hw_tsomax = IF_HW_TSOMAX_BUILD_VALUE( - 65535 - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN) /* bytes */, - MAX_TX_REQ_FRAGS /* maximum frag count */, - PAGE_SHIFT /* PAGE_SIZE frag size */); + ifp->if_hw_tsomax = NF_TSO_MAXBURST; ether_ifattach(ifp, np->mac); callout_init(&np->xn_stat_ch, CALLOUT_MPSAFE); netfront_carrier_off(np); return (0); exit: gnttab_free_grant_references(np->gref_tx_head); out: return (err); } /** * Handle the change of state of the backend to Closing. We must delete our * device-layer structures now, to ensure that writes are flushed through to * the backend. Once is this done, we can switch to Closed in * acknowledgement. */ #if 0 static void netfront_closing(device_t dev) { #if 0 struct netfront_info *info = dev->dev_driver_data; DPRINTK("netfront_closing: %s removed\n", dev->nodename); close_netdev(info); #endif xenbus_switch_state(dev, XenbusStateClosed); } #endif static int netfront_detach(device_t dev) { struct netfront_info *info = device_get_softc(dev); DPRINTK("%s\n", xenbus_get_node(dev)); netif_free(info); return 0; } static void netif_free(struct netfront_info *info) { XN_LOCK(info); xn_stop(info); XN_UNLOCK(info); callout_drain(&info->xn_stat_ch); netif_disconnect_backend(info); if (info->xn_ifp != NULL) { ether_ifdetach(info->xn_ifp); if_free(info->xn_ifp); info->xn_ifp = NULL; } ifmedia_removeall(&info->sc_media); } static void netif_disconnect_backend(struct netfront_info *info) { XN_RX_LOCK(info); XN_TX_LOCK(info); netfront_carrier_off(info); XN_TX_UNLOCK(info); XN_RX_UNLOCK(info); free_ring(&info->tx_ring_ref, &info->tx.sring); free_ring(&info->rx_ring_ref, &info->rx.sring); xen_intr_unbind(&info->xen_intr_handle); } static void free_ring(int *ref, void *ring_ptr_ref) { void **ring_ptr_ptr = ring_ptr_ref; if (*ref != GRANT_REF_INVALID) { /* This API frees the associated storage. */ gnttab_end_foreign_access(*ref, *ring_ptr_ptr); *ref = GRANT_REF_INVALID; } *ring_ptr_ptr = NULL; } static int xn_ifmedia_upd(struct ifnet *ifp) { return (0); } static void xn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr) { ifmr->ifm_status = IFM_AVALID|IFM_ACTIVE; ifmr->ifm_active = IFM_ETHER|IFM_MANUAL; } /* ** Driver registration ** */ static device_method_t netfront_methods[] = { /* Device interface */ DEVMETHOD(device_probe, netfront_probe), DEVMETHOD(device_attach, netfront_attach), DEVMETHOD(device_detach, netfront_detach), DEVMETHOD(device_shutdown, bus_generic_shutdown), DEVMETHOD(device_suspend, netfront_suspend), DEVMETHOD(device_resume, netfront_resume), /* Xenbus interface */ DEVMETHOD(xenbus_otherend_changed, netfront_backend_changed), DEVMETHOD_END }; static driver_t netfront_driver = { "xn", netfront_methods, sizeof(struct netfront_info), }; devclass_t netfront_devclass; DRIVER_MODULE(xe, xenbusb_front, netfront_driver, netfront_devclass, NULL, NULL); Index: head/sys/net/if.c =================================================================== --- head/sys/net/if.c (revision 271550) +++ head/sys/net/if.c (revision 271551) @@ -1,4112 +1,4072 @@ /*- * Copyright (c) 1980, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)if.c 8.5 (Berkeley) 1/9/95 * $FreeBSD$ */ #include "opt_compat.h" #include "opt_inet6.h" #include "opt_inet.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(INET) || defined(INET6) #include #include #include #include #include #ifdef INET #include #endif /* INET */ #ifdef INET6 #include #include #endif /* INET6 */ #endif /* INET || INET6 */ #include #ifdef COMPAT_FREEBSD32 #include #include #endif struct ifindex_entry { struct ifnet *ife_ifnet; }; SYSCTL_NODE(_net, PF_LINK, link, CTLFLAG_RW, 0, "Link layers"); SYSCTL_NODE(_net_link, 0, generic, CTLFLAG_RW, 0, "Generic link-management"); SYSCTL_INT(_net_link, OID_AUTO, ifqmaxlen, CTLFLAG_RDTUN, &ifqmaxlen, 0, "max send queue size"); /* Log link state change events */ static int log_link_state_change = 1; SYSCTL_INT(_net_link, OID_AUTO, log_link_state_change, CTLFLAG_RW, &log_link_state_change, 0, "log interface link state change events"); /* Interface description */ static unsigned int ifdescr_maxlen = 1024; SYSCTL_UINT(_net, OID_AUTO, ifdescr_maxlen, CTLFLAG_RW, &ifdescr_maxlen, 0, "administrative maximum length for interface description"); static MALLOC_DEFINE(M_IFDESCR, "ifdescr", "ifnet descriptions"); /* global sx for non-critical path ifdescr */ static struct sx ifdescr_sx; SX_SYSINIT(ifdescr_sx, &ifdescr_sx, "ifnet descr"); void (*bridge_linkstate_p)(struct ifnet *ifp); void (*ng_ether_link_state_p)(struct ifnet *ifp, int state); void (*lagg_linkstate_p)(struct ifnet *ifp, int state); /* These are external hooks for CARP. */ void (*carp_linkstate_p)(struct ifnet *ifp); void (*carp_demote_adj_p)(int, char *); int (*carp_master_p)(struct ifaddr *); #if defined(INET) || defined(INET6) int (*carp_forus_p)(struct ifnet *ifp, u_char *dhost); int (*carp_output_p)(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *sa); int (*carp_ioctl_p)(struct ifreq *, u_long, struct thread *); int (*carp_attach_p)(struct ifaddr *, int); void (*carp_detach_p)(struct ifaddr *); #endif #ifdef INET int (*carp_iamatch_p)(struct ifaddr *, uint8_t **); #endif #ifdef INET6 struct ifaddr *(*carp_iamatch6_p)(struct ifnet *ifp, struct in6_addr *taddr6); caddr_t (*carp_macmatch6_p)(struct ifnet *ifp, struct mbuf *m, const struct in6_addr *taddr); #endif struct mbuf *(*tbr_dequeue_ptr)(struct ifaltq *, int) = NULL; /* * XXX: Style; these should be sorted alphabetically, and unprototyped * static functions should be prototyped. Currently they are sorted by * declaration order. */ static void if_attachdomain(void *); static void if_attachdomain1(struct ifnet *); static int ifconf(u_long, caddr_t); static void if_freemulti(struct ifmultiaddr *); static void if_init(void *); static void if_grow(void); static void if_route(struct ifnet *, int flag, int fam); static int if_setflag(struct ifnet *, int, int, int *, int); static int if_transmit(struct ifnet *ifp, struct mbuf *m); static void if_unroute(struct ifnet *, int flag, int fam); static void link_rtrequest(int, struct rtentry *, struct rt_addrinfo *); static int if_rtdel(struct radix_node *, void *); static int ifhwioctl(u_long, struct ifnet *, caddr_t, struct thread *); static int if_delmulti_locked(struct ifnet *, struct ifmultiaddr *, int); static void do_link_state_change(void *, int); static int if_getgroup(struct ifgroupreq *, struct ifnet *); static int if_getgroupmembers(struct ifgroupreq *); static void if_delgroups(struct ifnet *); static void if_attach_internal(struct ifnet *, int); static void if_detach_internal(struct ifnet *, int); #ifdef INET6 /* * XXX: declare here to avoid to include many inet6 related files.. * should be more generalized? */ extern void nd6_setmtu(struct ifnet *); #endif VNET_DEFINE(int, if_index); int ifqmaxlen = IFQ_MAXLEN; VNET_DEFINE(struct ifnethead, ifnet); /* depend on static init XXX */ VNET_DEFINE(struct ifgrouphead, ifg_head); static VNET_DEFINE(int, if_indexlim) = 8; /* Table of ifnet by index. */ VNET_DEFINE(struct ifindex_entry *, ifindex_table); #define V_if_indexlim VNET(if_indexlim) #define V_ifindex_table VNET(ifindex_table) /* * The global network interface list (V_ifnet) and related state (such as * if_index, if_indexlim, and ifindex_table) are protected by an sxlock and * an rwlock. Either may be acquired shared to stablize the list, but both * must be acquired writable to modify the list. This model allows us to * both stablize the interface list during interrupt thread processing, but * also to stablize it over long-running ioctls, without introducing priority * inversions and deadlocks. */ struct rwlock ifnet_rwlock; struct sx ifnet_sxlock; /* * The allocation of network interfaces is a rather non-atomic affair; we * need to select an index before we are ready to expose the interface for * use, so will use this pointer value to indicate reservation. */ #define IFNET_HOLD (void *)(uintptr_t)(-1) static if_com_alloc_t *if_com_alloc[256]; static if_com_free_t *if_com_free[256]; static MALLOC_DEFINE(M_IFNET, "ifnet", "interface internals"); MALLOC_DEFINE(M_IFADDR, "ifaddr", "interface address"); MALLOC_DEFINE(M_IFMADDR, "ether_multi", "link-level multicast address"); struct ifnet * ifnet_byindex_locked(u_short idx) { if (idx > V_if_index) return (NULL); if (V_ifindex_table[idx].ife_ifnet == IFNET_HOLD) return (NULL); return (V_ifindex_table[idx].ife_ifnet); } struct ifnet * ifnet_byindex(u_short idx) { struct ifnet *ifp; IFNET_RLOCK_NOSLEEP(); ifp = ifnet_byindex_locked(idx); IFNET_RUNLOCK_NOSLEEP(); return (ifp); } struct ifnet * ifnet_byindex_ref(u_short idx) { struct ifnet *ifp; IFNET_RLOCK_NOSLEEP(); ifp = ifnet_byindex_locked(idx); if (ifp == NULL || (ifp->if_flags & IFF_DYING)) { IFNET_RUNLOCK_NOSLEEP(); return (NULL); } if_ref(ifp); IFNET_RUNLOCK_NOSLEEP(); return (ifp); } /* * Allocate an ifindex array entry; return 0 on success or an error on * failure. */ static int ifindex_alloc_locked(u_short *idxp) { u_short idx; IFNET_WLOCK_ASSERT(); retry: /* * Try to find an empty slot below V_if_index. If we fail, take the * next slot. */ for (idx = 1; idx <= V_if_index; idx++) { if (V_ifindex_table[idx].ife_ifnet == NULL) break; } /* Catch if_index overflow. */ if (idx >= V_if_indexlim) { if_grow(); goto retry; } if (idx > V_if_index) V_if_index = idx; *idxp = idx; return (0); } static void ifindex_free_locked(u_short idx) { IFNET_WLOCK_ASSERT(); V_ifindex_table[idx].ife_ifnet = NULL; while (V_if_index > 0 && V_ifindex_table[V_if_index].ife_ifnet == NULL) V_if_index--; } static void ifindex_free(u_short idx) { IFNET_WLOCK(); ifindex_free_locked(idx); IFNET_WUNLOCK(); } static void ifnet_setbyindex_locked(u_short idx, struct ifnet *ifp) { IFNET_WLOCK_ASSERT(); V_ifindex_table[idx].ife_ifnet = ifp; } static void ifnet_setbyindex(u_short idx, struct ifnet *ifp) { IFNET_WLOCK(); ifnet_setbyindex_locked(idx, ifp); IFNET_WUNLOCK(); } struct ifaddr * ifaddr_byindex(u_short idx) { struct ifaddr *ifa; IFNET_RLOCK_NOSLEEP(); ifa = ifnet_byindex_locked(idx)->if_addr; if (ifa != NULL) ifa_ref(ifa); IFNET_RUNLOCK_NOSLEEP(); return (ifa); } /* * Network interface utility routines. * * Routines with ifa_ifwith* names take sockaddr *'s as * parameters. */ static void vnet_if_init(const void *unused __unused) { TAILQ_INIT(&V_ifnet); TAILQ_INIT(&V_ifg_head); IFNET_WLOCK(); if_grow(); /* create initial table */ IFNET_WUNLOCK(); vnet_if_clone_init(); } VNET_SYSINIT(vnet_if_init, SI_SUB_INIT_IF, SI_ORDER_SECOND, vnet_if_init, NULL); /* ARGSUSED*/ static void if_init(void *dummy __unused) { IFNET_LOCK_INIT(); if_clone_init(); } SYSINIT(interfaces, SI_SUB_INIT_IF, SI_ORDER_FIRST, if_init, NULL); #ifdef VIMAGE static void vnet_if_uninit(const void *unused __unused) { VNET_ASSERT(TAILQ_EMPTY(&V_ifnet), ("%s:%d tailq &V_ifnet=%p " "not empty", __func__, __LINE__, &V_ifnet)); VNET_ASSERT(TAILQ_EMPTY(&V_ifg_head), ("%s:%d tailq &V_ifg_head=%p " "not empty", __func__, __LINE__, &V_ifg_head)); free((caddr_t)V_ifindex_table, M_IFNET); } VNET_SYSUNINIT(vnet_if_uninit, SI_SUB_INIT_IF, SI_ORDER_FIRST, vnet_if_uninit, NULL); #endif static void if_grow(void) { int oldlim; u_int n; struct ifindex_entry *e; IFNET_WLOCK_ASSERT(); oldlim = V_if_indexlim; IFNET_WUNLOCK(); n = (oldlim << 1) * sizeof(*e); e = malloc(n, M_IFNET, M_WAITOK | M_ZERO); IFNET_WLOCK(); if (V_if_indexlim != oldlim) { free(e, M_IFNET); return; } if (V_ifindex_table != NULL) { memcpy((caddr_t)e, (caddr_t)V_ifindex_table, n/2); free((caddr_t)V_ifindex_table, M_IFNET); } V_if_indexlim <<= 1; V_ifindex_table = e; } /* - * Compute the least common value of two "if_hw_tsomax" values: - */ -u_int -if_hw_tsomax_common(u_int a, u_int b) -{ - u_int a_bytes = IF_HW_TSOMAX_GET_BYTES(a); - u_int a_frag_count = IF_HW_TSOMAX_GET_FRAG_COUNT(a); - u_int a_frag_size = IF_HW_TSOMAX_GET_FRAG_SIZE(a); - u_int b_bytes = IF_HW_TSOMAX_GET_BYTES(b); - u_int b_frag_count = IF_HW_TSOMAX_GET_FRAG_COUNT(b); - u_int b_frag_size = IF_HW_TSOMAX_GET_FRAG_SIZE(b); - - return (IF_HW_TSOMAX_BUILD_VALUE(min(a_bytes, b_bytes), - min(a_frag_count, b_frag_count), - min(a_frag_size, b_frag_size))); -} - -/* - * Range check the "if_hw_tsomax" value: - */ -u_int -if_hw_tsomax_range_check(u_int a) -{ - u_int a_bytes = IF_HW_TSOMAX_GET_BYTES(a); - u_int a_frag_count = IF_HW_TSOMAX_GET_FRAG_COUNT(a); - u_int a_frag_size = IF_HW_TSOMAX_GET_FRAG_SIZE(a); - - /* round down to nearest 4 bytes */ - a_bytes &= 0xFFFC; - - /* use default, if zero */ - if (a_bytes == 0) - a_bytes = IF_HW_TSOMAX_DEFAULT_BYTES; - - /* use default, if zero */ - if (a_frag_count == 0) - a_frag_count = IF_HW_TSOMAX_DEFAULT_FRAG_COUNT; - - /* use default, if zero */ - if (a_frag_size == 0) - a_frag_size = IF_HW_TSOMAX_DEFAULT_FRAG_SIZE; - - return (IF_HW_TSOMAX_BUILD_VALUE(a_bytes, a_frag_count, a_frag_size)); -} - -/* * Allocate a struct ifnet and an index for an interface. A layer 2 * common structure will also be allocated if an allocation routine is * registered for the passed type. */ struct ifnet * if_alloc(u_char type) { struct ifnet *ifp; u_short idx; ifp = malloc(sizeof(struct ifnet), M_IFNET, M_WAITOK|M_ZERO); IFNET_WLOCK(); if (ifindex_alloc_locked(&idx) != 0) { IFNET_WUNLOCK(); free(ifp, M_IFNET); return (NULL); } ifnet_setbyindex_locked(idx, IFNET_HOLD); IFNET_WUNLOCK(); ifp->if_index = idx; ifp->if_type = type; ifp->if_alloctype = type; - ifp->if_hw_tsomax = IF_HW_TSOMAX_DEFAULT_VALUE(); if (if_com_alloc[type] != NULL) { ifp->if_l2com = if_com_alloc[type](type, ifp); if (ifp->if_l2com == NULL) { free(ifp, M_IFNET); ifindex_free(idx); return (NULL); } } IF_ADDR_LOCK_INIT(ifp); TASK_INIT(&ifp->if_linktask, 0, do_link_state_change, ifp); ifp->if_afdata_initialized = 0; IF_AFDATA_LOCK_INIT(ifp); TAILQ_INIT(&ifp->if_addrhead); TAILQ_INIT(&ifp->if_multiaddrs); TAILQ_INIT(&ifp->if_groups); #ifdef MAC mac_ifnet_init(ifp); #endif ifq_init(&ifp->if_snd, ifp); refcount_init(&ifp->if_refcount, 1); /* Index reference. */ ifnet_setbyindex(ifp->if_index, ifp); return (ifp); } /* * Do the actual work of freeing a struct ifnet, and layer 2 common * structure. This call is made when the last reference to an * interface is released. */ static void if_free_internal(struct ifnet *ifp) { KASSERT((ifp->if_flags & IFF_DYING), ("if_free_internal: interface not dying")); if (if_com_free[ifp->if_alloctype] != NULL) if_com_free[ifp->if_alloctype](ifp->if_l2com, ifp->if_alloctype); #ifdef MAC mac_ifnet_destroy(ifp); #endif /* MAC */ if (ifp->if_description != NULL) free(ifp->if_description, M_IFDESCR); IF_AFDATA_DESTROY(ifp); IF_ADDR_LOCK_DESTROY(ifp); ifq_delete(&ifp->if_snd); free(ifp, M_IFNET); } /* * Deregister an interface and free the associated storage. */ void if_free(struct ifnet *ifp) { ifp->if_flags |= IFF_DYING; /* XXX: Locking */ CURVNET_SET_QUIET(ifp->if_vnet); IFNET_WLOCK(); KASSERT(ifp == ifnet_byindex_locked(ifp->if_index), ("%s: freeing unallocated ifnet", ifp->if_xname)); ifindex_free_locked(ifp->if_index); IFNET_WUNLOCK(); if (refcount_release(&ifp->if_refcount)) if_free_internal(ifp); CURVNET_RESTORE(); } /* * Interfaces to keep an ifnet type-stable despite the possibility of the * driver calling if_free(). If there are additional references, we defer * freeing the underlying data structure. */ void if_ref(struct ifnet *ifp) { /* We don't assert the ifnet list lock here, but arguably should. */ refcount_acquire(&ifp->if_refcount); } void if_rele(struct ifnet *ifp) { if (!refcount_release(&ifp->if_refcount)) return; if_free_internal(ifp); } void ifq_init(struct ifaltq *ifq, struct ifnet *ifp) { mtx_init(&ifq->ifq_mtx, ifp->if_xname, "if send queue", MTX_DEF); if (ifq->ifq_maxlen == 0) ifq->ifq_maxlen = ifqmaxlen; ifq->altq_type = 0; ifq->altq_disc = NULL; ifq->altq_flags &= ALTQF_CANTCHANGE; ifq->altq_tbr = NULL; ifq->altq_ifp = ifp; } void ifq_delete(struct ifaltq *ifq) { mtx_destroy(&ifq->ifq_mtx); } /* * Perform generic interface initalization tasks and attach the interface * to the list of "active" interfaces. If vmove flag is set on entry * to if_attach_internal(), perform only a limited subset of initialization * tasks, given that we are moving from one vnet to another an ifnet which * has already been fully initialized. * * XXX: * - The decision to return void and thus require this function to * succeed is questionable. * - We should probably do more sanity checking. For instance we don't * do anything to insure if_xname is unique or non-empty. */ void if_attach(struct ifnet *ifp) { if_attach_internal(ifp, 0); } static void if_attach_internal(struct ifnet *ifp, int vmove) { unsigned socksize, ifasize; int namelen, masklen; struct sockaddr_dl *sdl; struct ifaddr *ifa; if (ifp->if_index == 0 || ifp != ifnet_byindex(ifp->if_index)) panic ("%s: BUG: if_attach called without if_alloc'd input()\n", ifp->if_xname); #ifdef VIMAGE ifp->if_vnet = curvnet; if (ifp->if_home_vnet == NULL) ifp->if_home_vnet = curvnet; #endif if_addgroup(ifp, IFG_ALL); getmicrotime(&ifp->if_lastchange); ifp->if_epoch = time_uptime; KASSERT((ifp->if_transmit == NULL && ifp->if_qflush == NULL) || (ifp->if_transmit != NULL && ifp->if_qflush != NULL), ("transmit and qflush must both either be set or both be NULL")); if (ifp->if_transmit == NULL) { ifp->if_transmit = if_transmit; ifp->if_qflush = if_qflush; } if (ifp->if_get_counter == NULL) ifp->if_get_counter = if_get_counter_compat; if (!vmove) { #ifdef MAC mac_ifnet_create(ifp); #endif /* * Create a Link Level name for this device. */ namelen = strlen(ifp->if_xname); /* * Always save enough space for any possiable name so we * can do a rename in place later. */ masklen = offsetof(struct sockaddr_dl, sdl_data[0]) + IFNAMSIZ; socksize = masklen + ifp->if_addrlen; if (socksize < sizeof(*sdl)) socksize = sizeof(*sdl); socksize = roundup2(socksize, sizeof(long)); ifasize = sizeof(*ifa) + 2 * socksize; ifa = ifa_alloc(ifasize, M_WAITOK); sdl = (struct sockaddr_dl *)(ifa + 1); sdl->sdl_len = socksize; sdl->sdl_family = AF_LINK; bcopy(ifp->if_xname, sdl->sdl_data, namelen); sdl->sdl_nlen = namelen; sdl->sdl_index = ifp->if_index; sdl->sdl_type = ifp->if_type; ifp->if_addr = ifa; ifa->ifa_ifp = ifp; ifa->ifa_rtrequest = link_rtrequest; ifa->ifa_addr = (struct sockaddr *)sdl; sdl = (struct sockaddr_dl *)(socksize + (caddr_t)sdl); ifa->ifa_netmask = (struct sockaddr *)sdl; sdl->sdl_len = masklen; while (namelen != 0) sdl->sdl_data[--namelen] = 0xff; TAILQ_INSERT_HEAD(&ifp->if_addrhead, ifa, ifa_link); /* Reliably crash if used uninitialized. */ ifp->if_broadcastaddr = NULL; - /* range check TSO value */ - ifp->if_hw_tsomax = - if_hw_tsomax_range_check(ifp->if_hw_tsomax); + +#if defined(INET) || defined(INET6) + /* Initialize to max value. */ + if (ifp->if_hw_tsomax == 0) + ifp->if_hw_tsomax = min(IP_MAXPACKET, 32 * MCLBYTES - + (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN)); + KASSERT(ifp->if_hw_tsomax <= IP_MAXPACKET && + ifp->if_hw_tsomax >= IP_MAXPACKET / 8, + ("%s: tsomax outside of range", __func__)); +#endif } #ifdef VIMAGE else { /* * Update the interface index in the link layer address * of the interface. */ for (ifa = ifp->if_addr; ifa != NULL; ifa = TAILQ_NEXT(ifa, ifa_link)) { if (ifa->ifa_addr->sa_family == AF_LINK) { sdl = (struct sockaddr_dl *)ifa->ifa_addr; sdl->sdl_index = ifp->if_index; } } } #endif IFNET_WLOCK(); TAILQ_INSERT_TAIL(&V_ifnet, ifp, if_link); #ifdef VIMAGE curvnet->vnet_ifcnt++; #endif IFNET_WUNLOCK(); if (domain_init_status >= 2) if_attachdomain1(ifp); EVENTHANDLER_INVOKE(ifnet_arrival_event, ifp); if (IS_DEFAULT_VNET(curvnet)) devctl_notify("IFNET", ifp->if_xname, "ATTACH", NULL); /* Announce the interface. */ rt_ifannouncemsg(ifp, IFAN_ARRIVAL); } static void if_attachdomain(void *dummy) { struct ifnet *ifp; TAILQ_FOREACH(ifp, &V_ifnet, if_link) if_attachdomain1(ifp); } SYSINIT(domainifattach, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_SECOND, if_attachdomain, NULL); static void if_attachdomain1(struct ifnet *ifp) { struct domain *dp; /* * Since dp->dom_ifattach calls malloc() with M_WAITOK, we * cannot lock ifp->if_afdata initialization, entirely. */ if (IF_AFDATA_TRYLOCK(ifp) == 0) return; if (ifp->if_afdata_initialized >= domain_init_status) { IF_AFDATA_UNLOCK(ifp); log(LOG_WARNING, "%s called more than once on %s\n", __func__, ifp->if_xname); return; } ifp->if_afdata_initialized = domain_init_status; IF_AFDATA_UNLOCK(ifp); /* address family dependent data region */ bzero(ifp->if_afdata, sizeof(ifp->if_afdata)); for (dp = domains; dp; dp = dp->dom_next) { if (dp->dom_ifattach) ifp->if_afdata[dp->dom_family] = (*dp->dom_ifattach)(ifp); } } /* * Remove any unicast or broadcast network addresses from an interface. */ void if_purgeaddrs(struct ifnet *ifp) { struct ifaddr *ifa, *next; TAILQ_FOREACH_SAFE(ifa, &ifp->if_addrhead, ifa_link, next) { if (ifa->ifa_addr->sa_family == AF_LINK) continue; #ifdef INET /* XXX: Ugly!! ad hoc just for INET */ if (ifa->ifa_addr->sa_family == AF_INET) { struct ifaliasreq ifr; bzero(&ifr, sizeof(ifr)); ifr.ifra_addr = *ifa->ifa_addr; if (ifa->ifa_dstaddr) ifr.ifra_broadaddr = *ifa->ifa_dstaddr; if (in_control(NULL, SIOCDIFADDR, (caddr_t)&ifr, ifp, NULL) == 0) continue; } #endif /* INET */ #ifdef INET6 if (ifa->ifa_addr->sa_family == AF_INET6) { in6_purgeaddr(ifa); /* ifp_addrhead is already updated */ continue; } #endif /* INET6 */ TAILQ_REMOVE(&ifp->if_addrhead, ifa, ifa_link); ifa_free(ifa); } } /* * Remove any multicast network addresses from an interface when an ifnet * is going away. */ static void if_purgemaddrs(struct ifnet *ifp) { struct ifmultiaddr *ifma; struct ifmultiaddr *next; IF_ADDR_WLOCK(ifp); TAILQ_FOREACH_SAFE(ifma, &ifp->if_multiaddrs, ifma_link, next) if_delmulti_locked(ifp, ifma, 1); IF_ADDR_WUNLOCK(ifp); } /* * Detach an interface, removing it from the list of "active" interfaces. * If vmove flag is set on entry to if_detach_internal(), perform only a * limited subset of cleanup tasks, given that we are moving an ifnet from * one vnet to another, where it must be fully operational. * * XXXRW: There are some significant questions about event ordering, and * how to prevent things from starting to use the interface during detach. */ void if_detach(struct ifnet *ifp) { CURVNET_SET_QUIET(ifp->if_vnet); if_detach_internal(ifp, 0); CURVNET_RESTORE(); } static void if_detach_internal(struct ifnet *ifp, int vmove) { struct ifaddr *ifa; struct radix_node_head *rnh; int i, j; struct domain *dp; struct ifnet *iter; int found = 0; IFNET_WLOCK(); TAILQ_FOREACH(iter, &V_ifnet, if_link) if (iter == ifp) { TAILQ_REMOVE(&V_ifnet, ifp, if_link); found = 1; break; } #ifdef VIMAGE if (found) curvnet->vnet_ifcnt--; #endif IFNET_WUNLOCK(); if (!found) { if (vmove) panic("%s: ifp=%p not on the ifnet tailq %p", __func__, ifp, &V_ifnet); else return; /* XXX this should panic as well? */ } /* * Remove/wait for pending events. */ taskqueue_drain(taskqueue_swi, &ifp->if_linktask); /* * Remove routes and flush queues. */ if_down(ifp); #ifdef ALTQ if (ALTQ_IS_ENABLED(&ifp->if_snd)) altq_disable(&ifp->if_snd); if (ALTQ_IS_ATTACHED(&ifp->if_snd)) altq_detach(&ifp->if_snd); #endif if_purgeaddrs(ifp); #ifdef INET in_ifdetach(ifp); #endif #ifdef INET6 /* * Remove all IPv6 kernel structs related to ifp. This should be done * before removing routing entries below, since IPv6 interface direct * routes are expected to be removed by the IPv6-specific kernel API. * Otherwise, the kernel will detect some inconsistency and bark it. */ in6_ifdetach(ifp); #endif if_purgemaddrs(ifp); /* Announce that the interface is gone. */ rt_ifannouncemsg(ifp, IFAN_DEPARTURE); EVENTHANDLER_INVOKE(ifnet_departure_event, ifp); if (IS_DEFAULT_VNET(curvnet)) devctl_notify("IFNET", ifp->if_xname, "DETACH", NULL); if (!vmove) { /* * Prevent further calls into the device driver via ifnet. */ if_dead(ifp); /* * Remove link ifaddr pointer and maybe decrement if_index. * Clean up all addresses. */ ifp->if_addr = NULL; /* We can now free link ifaddr. */ if (!TAILQ_EMPTY(&ifp->if_addrhead)) { ifa = TAILQ_FIRST(&ifp->if_addrhead); TAILQ_REMOVE(&ifp->if_addrhead, ifa, ifa_link); ifa_free(ifa); } } /* * Delete all remaining routes using this interface * Unfortuneatly the only way to do this is to slog through * the entire routing table looking for routes which point * to this interface...oh well... */ for (i = 1; i <= AF_MAX; i++) { for (j = 0; j < rt_numfibs; j++) { rnh = rt_tables_get_rnh(j, i); if (rnh == NULL) continue; RADIX_NODE_HEAD_LOCK(rnh); (void) rnh->rnh_walktree(rnh, if_rtdel, ifp); RADIX_NODE_HEAD_UNLOCK(rnh); } } if_delgroups(ifp); /* * We cannot hold the lock over dom_ifdetach calls as they might * sleep, for example trying to drain a callout, thus open up the * theoretical race with re-attaching. */ IF_AFDATA_LOCK(ifp); i = ifp->if_afdata_initialized; ifp->if_afdata_initialized = 0; IF_AFDATA_UNLOCK(ifp); for (dp = domains; i > 0 && dp; dp = dp->dom_next) { if (dp->dom_ifdetach && ifp->if_afdata[dp->dom_family]) (*dp->dom_ifdetach)(ifp, ifp->if_afdata[dp->dom_family]); } } #ifdef VIMAGE /* * if_vmove() performs a limited version of if_detach() in current * vnet and if_attach()es the ifnet to the vnet specified as 2nd arg. * An attempt is made to shrink if_index in current vnet, find an * unused if_index in target vnet and calls if_grow() if necessary, * and finally find an unused if_xname for the target vnet. */ void if_vmove(struct ifnet *ifp, struct vnet *new_vnet) { u_short idx; /* * Detach from current vnet, but preserve LLADDR info, do not * mark as dead etc. so that the ifnet can be reattached later. */ if_detach_internal(ifp, 1); /* * Unlink the ifnet from ifindex_table[] in current vnet, and shrink * the if_index for that vnet if possible. * * NOTE: IFNET_WLOCK/IFNET_WUNLOCK() are assumed to be unvirtualized, * or we'd lock on one vnet and unlock on another. */ IFNET_WLOCK(); ifindex_free_locked(ifp->if_index); IFNET_WUNLOCK(); /* * Perform interface-specific reassignment tasks, if provided by * the driver. */ if (ifp->if_reassign != NULL) ifp->if_reassign(ifp, new_vnet, NULL); /* * Switch to the context of the target vnet. */ CURVNET_SET_QUIET(new_vnet); IFNET_WLOCK(); if (ifindex_alloc_locked(&idx) != 0) { IFNET_WUNLOCK(); panic("if_index overflow"); } ifp->if_index = idx; ifnet_setbyindex_locked(ifp->if_index, ifp); IFNET_WUNLOCK(); if_attach_internal(ifp, 1); CURVNET_RESTORE(); } /* * Move an ifnet to or from another child prison/vnet, specified by the jail id. */ static int if_vmove_loan(struct thread *td, struct ifnet *ifp, char *ifname, int jid) { struct prison *pr; struct ifnet *difp; /* Try to find the prison within our visibility. */ sx_slock(&allprison_lock); pr = prison_find_child(td->td_ucred->cr_prison, jid); sx_sunlock(&allprison_lock); if (pr == NULL) return (ENXIO); prison_hold_locked(pr); mtx_unlock(&pr->pr_mtx); /* Do not try to move the iface from and to the same prison. */ if (pr->pr_vnet == ifp->if_vnet) { prison_free(pr); return (EEXIST); } /* Make sure the named iface does not exists in the dst. prison/vnet. */ /* XXX Lock interfaces to avoid races. */ CURVNET_SET_QUIET(pr->pr_vnet); difp = ifunit(ifname); CURVNET_RESTORE(); if (difp != NULL) { prison_free(pr); return (EEXIST); } /* Move the interface into the child jail/vnet. */ if_vmove(ifp, pr->pr_vnet); /* Report the new if_xname back to the userland. */ sprintf(ifname, "%s", ifp->if_xname); prison_free(pr); return (0); } static int if_vmove_reclaim(struct thread *td, char *ifname, int jid) { struct prison *pr; struct vnet *vnet_dst; struct ifnet *ifp; /* Try to find the prison within our visibility. */ sx_slock(&allprison_lock); pr = prison_find_child(td->td_ucred->cr_prison, jid); sx_sunlock(&allprison_lock); if (pr == NULL) return (ENXIO); prison_hold_locked(pr); mtx_unlock(&pr->pr_mtx); /* Make sure the named iface exists in the source prison/vnet. */ CURVNET_SET(pr->pr_vnet); ifp = ifunit(ifname); /* XXX Lock to avoid races. */ if (ifp == NULL) { CURVNET_RESTORE(); prison_free(pr); return (ENXIO); } /* Do not try to move the iface from and to the same prison. */ vnet_dst = TD_TO_VNET(td); if (vnet_dst == ifp->if_vnet) { CURVNET_RESTORE(); prison_free(pr); return (EEXIST); } /* Get interface back from child jail/vnet. */ if_vmove(ifp, vnet_dst); CURVNET_RESTORE(); /* Report the new if_xname back to the userland. */ sprintf(ifname, "%s", ifp->if_xname); prison_free(pr); return (0); } #endif /* VIMAGE */ /* * Add a group to an interface */ int if_addgroup(struct ifnet *ifp, const char *groupname) { struct ifg_list *ifgl; struct ifg_group *ifg = NULL; struct ifg_member *ifgm; int new = 0; if (groupname[0] && groupname[strlen(groupname) - 1] >= '0' && groupname[strlen(groupname) - 1] <= '9') return (EINVAL); IFNET_WLOCK(); TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) if (!strcmp(ifgl->ifgl_group->ifg_group, groupname)) { IFNET_WUNLOCK(); return (EEXIST); } if ((ifgl = (struct ifg_list *)malloc(sizeof(struct ifg_list), M_TEMP, M_NOWAIT)) == NULL) { IFNET_WUNLOCK(); return (ENOMEM); } if ((ifgm = (struct ifg_member *)malloc(sizeof(struct ifg_member), M_TEMP, M_NOWAIT)) == NULL) { free(ifgl, M_TEMP); IFNET_WUNLOCK(); return (ENOMEM); } TAILQ_FOREACH(ifg, &V_ifg_head, ifg_next) if (!strcmp(ifg->ifg_group, groupname)) break; if (ifg == NULL) { if ((ifg = (struct ifg_group *)malloc(sizeof(struct ifg_group), M_TEMP, M_NOWAIT)) == NULL) { free(ifgl, M_TEMP); free(ifgm, M_TEMP); IFNET_WUNLOCK(); return (ENOMEM); } strlcpy(ifg->ifg_group, groupname, sizeof(ifg->ifg_group)); ifg->ifg_refcnt = 0; TAILQ_INIT(&ifg->ifg_members); TAILQ_INSERT_TAIL(&V_ifg_head, ifg, ifg_next); new = 1; } ifg->ifg_refcnt++; ifgl->ifgl_group = ifg; ifgm->ifgm_ifp = ifp; IF_ADDR_WLOCK(ifp); TAILQ_INSERT_TAIL(&ifg->ifg_members, ifgm, ifgm_next); TAILQ_INSERT_TAIL(&ifp->if_groups, ifgl, ifgl_next); IF_ADDR_WUNLOCK(ifp); IFNET_WUNLOCK(); if (new) EVENTHANDLER_INVOKE(group_attach_event, ifg); EVENTHANDLER_INVOKE(group_change_event, groupname); return (0); } /* * Remove a group from an interface */ int if_delgroup(struct ifnet *ifp, const char *groupname) { struct ifg_list *ifgl; struct ifg_member *ifgm; IFNET_WLOCK(); TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) if (!strcmp(ifgl->ifgl_group->ifg_group, groupname)) break; if (ifgl == NULL) { IFNET_WUNLOCK(); return (ENOENT); } IF_ADDR_WLOCK(ifp); TAILQ_REMOVE(&ifp->if_groups, ifgl, ifgl_next); IF_ADDR_WUNLOCK(ifp); TAILQ_FOREACH(ifgm, &ifgl->ifgl_group->ifg_members, ifgm_next) if (ifgm->ifgm_ifp == ifp) break; if (ifgm != NULL) { TAILQ_REMOVE(&ifgl->ifgl_group->ifg_members, ifgm, ifgm_next); free(ifgm, M_TEMP); } if (--ifgl->ifgl_group->ifg_refcnt == 0) { TAILQ_REMOVE(&V_ifg_head, ifgl->ifgl_group, ifg_next); IFNET_WUNLOCK(); EVENTHANDLER_INVOKE(group_detach_event, ifgl->ifgl_group); free(ifgl->ifgl_group, M_TEMP); } else IFNET_WUNLOCK(); free(ifgl, M_TEMP); EVENTHANDLER_INVOKE(group_change_event, groupname); return (0); } /* * Remove an interface from all groups */ static void if_delgroups(struct ifnet *ifp) { struct ifg_list *ifgl; struct ifg_member *ifgm; char groupname[IFNAMSIZ]; IFNET_WLOCK(); while (!TAILQ_EMPTY(&ifp->if_groups)) { ifgl = TAILQ_FIRST(&ifp->if_groups); strlcpy(groupname, ifgl->ifgl_group->ifg_group, IFNAMSIZ); IF_ADDR_WLOCK(ifp); TAILQ_REMOVE(&ifp->if_groups, ifgl, ifgl_next); IF_ADDR_WUNLOCK(ifp); TAILQ_FOREACH(ifgm, &ifgl->ifgl_group->ifg_members, ifgm_next) if (ifgm->ifgm_ifp == ifp) break; if (ifgm != NULL) { TAILQ_REMOVE(&ifgl->ifgl_group->ifg_members, ifgm, ifgm_next); free(ifgm, M_TEMP); } if (--ifgl->ifgl_group->ifg_refcnt == 0) { TAILQ_REMOVE(&V_ifg_head, ifgl->ifgl_group, ifg_next); IFNET_WUNLOCK(); EVENTHANDLER_INVOKE(group_detach_event, ifgl->ifgl_group); free(ifgl->ifgl_group, M_TEMP); } else IFNET_WUNLOCK(); free(ifgl, M_TEMP); EVENTHANDLER_INVOKE(group_change_event, groupname); IFNET_WLOCK(); } IFNET_WUNLOCK(); } /* * Stores all groups from an interface in memory pointed * to by data */ static int if_getgroup(struct ifgroupreq *data, struct ifnet *ifp) { int len, error; struct ifg_list *ifgl; struct ifg_req ifgrq, *ifgp; struct ifgroupreq *ifgr = data; if (ifgr->ifgr_len == 0) { IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) ifgr->ifgr_len += sizeof(struct ifg_req); IF_ADDR_RUNLOCK(ifp); return (0); } len = ifgr->ifgr_len; ifgp = ifgr->ifgr_groups; /* XXX: wire */ IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) { if (len < sizeof(ifgrq)) { IF_ADDR_RUNLOCK(ifp); return (EINVAL); } bzero(&ifgrq, sizeof ifgrq); strlcpy(ifgrq.ifgrq_group, ifgl->ifgl_group->ifg_group, sizeof(ifgrq.ifgrq_group)); if ((error = copyout(&ifgrq, ifgp, sizeof(struct ifg_req)))) { IF_ADDR_RUNLOCK(ifp); return (error); } len -= sizeof(ifgrq); ifgp++; } IF_ADDR_RUNLOCK(ifp); return (0); } /* * Stores all members of a group in memory pointed to by data */ static int if_getgroupmembers(struct ifgroupreq *data) { struct ifgroupreq *ifgr = data; struct ifg_group *ifg; struct ifg_member *ifgm; struct ifg_req ifgrq, *ifgp; int len, error; IFNET_RLOCK(); TAILQ_FOREACH(ifg, &V_ifg_head, ifg_next) if (!strcmp(ifg->ifg_group, ifgr->ifgr_name)) break; if (ifg == NULL) { IFNET_RUNLOCK(); return (ENOENT); } if (ifgr->ifgr_len == 0) { TAILQ_FOREACH(ifgm, &ifg->ifg_members, ifgm_next) ifgr->ifgr_len += sizeof(ifgrq); IFNET_RUNLOCK(); return (0); } len = ifgr->ifgr_len; ifgp = ifgr->ifgr_groups; TAILQ_FOREACH(ifgm, &ifg->ifg_members, ifgm_next) { if (len < sizeof(ifgrq)) { IFNET_RUNLOCK(); return (EINVAL); } bzero(&ifgrq, sizeof ifgrq); strlcpy(ifgrq.ifgrq_member, ifgm->ifgm_ifp->if_xname, sizeof(ifgrq.ifgrq_member)); if ((error = copyout(&ifgrq, ifgp, sizeof(struct ifg_req)))) { IFNET_RUNLOCK(); return (error); } len -= sizeof(ifgrq); ifgp++; } IFNET_RUNLOCK(); return (0); } /* * Delete Routes for a Network Interface * * Called for each routing entry via the rnh->rnh_walktree() call above * to delete all route entries referencing a detaching network interface. * * Arguments: * rn pointer to node in the routing table * arg argument passed to rnh->rnh_walktree() - detaching interface * * Returns: * 0 successful * errno failed - reason indicated * */ static int if_rtdel(struct radix_node *rn, void *arg) { struct rtentry *rt = (struct rtentry *)rn; struct ifnet *ifp = arg; int err; if (rt->rt_ifp == ifp) { /* * Protect (sorta) against walktree recursion problems * with cloned routes */ if ((rt->rt_flags & RTF_UP) == 0) return (0); err = rtrequest_fib(RTM_DELETE, rt_key(rt), rt->rt_gateway, rt_mask(rt), rt->rt_flags|RTF_RNH_LOCKED|RTF_PINNED, (struct rtentry **) NULL, rt->rt_fibnum); if (err) { log(LOG_WARNING, "if_rtdel: error %d\n", err); } } return (0); } /* * Return counter values from old racy non-pcpu counters. */ uint64_t if_get_counter_compat(struct ifnet *ifp, ifnet_counter cnt) { switch (cnt) { case IFCOUNTER_IPACKETS: return (ifp->if_ipackets); case IFCOUNTER_IERRORS: return (ifp->if_ierrors); case IFCOUNTER_OPACKETS: return (ifp->if_opackets); case IFCOUNTER_OERRORS: return (ifp->if_oerrors); case IFCOUNTER_COLLISIONS: return (ifp->if_collisions); case IFCOUNTER_IBYTES: return (ifp->if_ibytes); case IFCOUNTER_OBYTES: return (ifp->if_obytes); case IFCOUNTER_IMCASTS: return (ifp->if_imcasts); case IFCOUNTER_OMCASTS: return (ifp->if_omcasts); case IFCOUNTER_IQDROPS: return (ifp->if_iqdrops); case IFCOUNTER_OQDROPS: return (ifp->if_oqdrops); case IFCOUNTER_NOPROTO: return (ifp->if_noproto); } panic("%s: unknown counter %d", __func__, cnt); } /* * Copy data from ifnet to userland API structure if_data. */ void if_data_copy(struct ifnet *ifp, struct if_data *ifd) { ifd->ifi_type = ifp->if_type; ifd->ifi_physical = 0; ifd->ifi_addrlen = ifp->if_addrlen; ifd->ifi_hdrlen = ifp->if_hdrlen; ifd->ifi_link_state = ifp->if_link_state; ifd->ifi_vhid = 0; ifd->ifi_datalen = sizeof(struct if_data); ifd->ifi_mtu = ifp->if_mtu; ifd->ifi_metric = ifp->if_metric; ifd->ifi_baudrate = ifp->if_baudrate; ifd->ifi_hwassist = ifp->if_hwassist; ifd->ifi_epoch = ifp->if_epoch; ifd->ifi_lastchange = ifp->if_lastchange; ifd->ifi_ipackets = ifp->if_get_counter(ifp, IFCOUNTER_IPACKETS); ifd->ifi_ierrors = ifp->if_get_counter(ifp, IFCOUNTER_IERRORS); ifd->ifi_opackets = ifp->if_get_counter(ifp, IFCOUNTER_OPACKETS); ifd->ifi_oerrors = ifp->if_get_counter(ifp, IFCOUNTER_OERRORS); ifd->ifi_collisions = ifp->if_get_counter(ifp, IFCOUNTER_COLLISIONS); ifd->ifi_ibytes = ifp->if_get_counter(ifp, IFCOUNTER_IBYTES); ifd->ifi_obytes = ifp->if_get_counter(ifp, IFCOUNTER_OBYTES); ifd->ifi_imcasts = ifp->if_get_counter(ifp, IFCOUNTER_IMCASTS); ifd->ifi_omcasts = ifp->if_get_counter(ifp, IFCOUNTER_OMCASTS); ifd->ifi_iqdrops = ifp->if_get_counter(ifp, IFCOUNTER_IQDROPS); ifd->ifi_oqdrops = ifp->if_get_counter(ifp, IFCOUNTER_OQDROPS); ifd->ifi_noproto = ifp->if_get_counter(ifp, IFCOUNTER_NOPROTO); } /* * Wrapper functions for struct ifnet address list locking macros. These are * used by kernel modules to avoid encoding programming interface or binary * interface assumptions that may be violated when kernel-internal locking * approaches change. */ void if_addr_rlock(struct ifnet *ifp) { IF_ADDR_RLOCK(ifp); } void if_addr_runlock(struct ifnet *ifp) { IF_ADDR_RUNLOCK(ifp); } void if_maddr_rlock(if_t ifp) { IF_ADDR_RLOCK((struct ifnet *)ifp); } void if_maddr_runlock(if_t ifp) { IF_ADDR_RUNLOCK((struct ifnet *)ifp); } /* * Initialization, destruction and refcounting functions for ifaddrs. */ struct ifaddr * ifa_alloc(size_t size, int flags) { struct ifaddr *ifa; KASSERT(size >= sizeof(struct ifaddr), ("%s: invalid size %zu", __func__, size)); ifa = malloc(size, M_IFADDR, M_ZERO | flags); if (ifa == NULL) return (NULL); if ((ifa->ifa_opackets = counter_u64_alloc(flags)) == NULL) goto fail; if ((ifa->ifa_ipackets = counter_u64_alloc(flags)) == NULL) goto fail; if ((ifa->ifa_obytes = counter_u64_alloc(flags)) == NULL) goto fail; if ((ifa->ifa_ibytes = counter_u64_alloc(flags)) == NULL) goto fail; refcount_init(&ifa->ifa_refcnt, 1); return (ifa); fail: /* free(NULL) is okay */ counter_u64_free(ifa->ifa_opackets); counter_u64_free(ifa->ifa_ipackets); counter_u64_free(ifa->ifa_obytes); counter_u64_free(ifa->ifa_ibytes); free(ifa, M_IFADDR); return (NULL); } void ifa_ref(struct ifaddr *ifa) { refcount_acquire(&ifa->ifa_refcnt); } void ifa_free(struct ifaddr *ifa) { if (refcount_release(&ifa->ifa_refcnt)) { counter_u64_free(ifa->ifa_opackets); counter_u64_free(ifa->ifa_ipackets); counter_u64_free(ifa->ifa_obytes); counter_u64_free(ifa->ifa_ibytes); free(ifa, M_IFADDR); } } int ifa_add_loopback_route(struct ifaddr *ifa, struct sockaddr *ia) { int error = 0; struct rtentry *rt = NULL; struct rt_addrinfo info; static struct sockaddr_dl null_sdl = {sizeof(null_sdl), AF_LINK}; bzero(&info, sizeof(info)); info.rti_ifp = V_loif; info.rti_flags = ifa->ifa_flags | RTF_HOST | RTF_STATIC; info.rti_info[RTAX_DST] = ia; info.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&null_sdl; error = rtrequest1_fib(RTM_ADD, &info, &rt, ifa->ifa_ifp->if_fib); if (error == 0 && rt != NULL) { RT_LOCK(rt); ((struct sockaddr_dl *)rt->rt_gateway)->sdl_type = ifa->ifa_ifp->if_type; ((struct sockaddr_dl *)rt->rt_gateway)->sdl_index = ifa->ifa_ifp->if_index; RT_REMREF(rt); RT_UNLOCK(rt); } else if (error != 0) log(LOG_DEBUG, "%s: insertion failed: %u\n", __func__, error); return (error); } int ifa_del_loopback_route(struct ifaddr *ifa, struct sockaddr *ia) { int error = 0; struct rt_addrinfo info; struct sockaddr_dl null_sdl; bzero(&null_sdl, sizeof(null_sdl)); null_sdl.sdl_len = sizeof(null_sdl); null_sdl.sdl_family = AF_LINK; null_sdl.sdl_type = ifa->ifa_ifp->if_type; null_sdl.sdl_index = ifa->ifa_ifp->if_index; bzero(&info, sizeof(info)); info.rti_flags = ifa->ifa_flags | RTF_HOST | RTF_STATIC; info.rti_info[RTAX_DST] = ia; info.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&null_sdl; error = rtrequest1_fib(RTM_DELETE, &info, NULL, ifa->ifa_ifp->if_fib); if (error != 0) log(LOG_DEBUG, "%s: deletion failed: %u\n", __func__, error); return (error); } int ifa_switch_loopback_route(struct ifaddr *ifa, struct sockaddr *sa, int fib) { struct rtentry *rt; rt = rtalloc1_fib(sa, 0, 0, fib); if (rt == NULL) { log(LOG_DEBUG, "%s: fail", __func__); return (EHOSTUNREACH); } ((struct sockaddr_dl *)rt->rt_gateway)->sdl_type = ifa->ifa_ifp->if_type; ((struct sockaddr_dl *)rt->rt_gateway)->sdl_index = ifa->ifa_ifp->if_index; RTFREE_LOCKED(rt); return (0); } /* * XXX: Because sockaddr_dl has deeper structure than the sockaddr * structs used to represent other address families, it is necessary * to perform a different comparison. */ #define sa_dl_equal(a1, a2) \ ((((struct sockaddr_dl *)(a1))->sdl_len == \ ((struct sockaddr_dl *)(a2))->sdl_len) && \ (bcmp(LLADDR((struct sockaddr_dl *)(a1)), \ LLADDR((struct sockaddr_dl *)(a2)), \ ((struct sockaddr_dl *)(a1))->sdl_alen) == 0)) /* * Locate an interface based on a complete address. */ /*ARGSUSED*/ static struct ifaddr * ifa_ifwithaddr_internal(struct sockaddr *addr, int getref) { struct ifnet *ifp; struct ifaddr *ifa; IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != addr->sa_family) continue; if (sa_equal(addr, ifa->ifa_addr)) { if (getref) ifa_ref(ifa); IF_ADDR_RUNLOCK(ifp); goto done; } /* IP6 doesn't have broadcast */ if ((ifp->if_flags & IFF_BROADCAST) && ifa->ifa_broadaddr && ifa->ifa_broadaddr->sa_len != 0 && sa_equal(ifa->ifa_broadaddr, addr)) { if (getref) ifa_ref(ifa); IF_ADDR_RUNLOCK(ifp); goto done; } } IF_ADDR_RUNLOCK(ifp); } ifa = NULL; done: IFNET_RUNLOCK_NOSLEEP(); return (ifa); } struct ifaddr * ifa_ifwithaddr(struct sockaddr *addr) { return (ifa_ifwithaddr_internal(addr, 1)); } int ifa_ifwithaddr_check(struct sockaddr *addr) { return (ifa_ifwithaddr_internal(addr, 0) != NULL); } /* * Locate an interface based on the broadcast address. */ /* ARGSUSED */ struct ifaddr * ifa_ifwithbroadaddr(struct sockaddr *addr, int fibnum) { struct ifnet *ifp; struct ifaddr *ifa; IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { if ((fibnum != RT_ALL_FIBS) && (ifp->if_fib != fibnum)) continue; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != addr->sa_family) continue; if ((ifp->if_flags & IFF_BROADCAST) && ifa->ifa_broadaddr && ifa->ifa_broadaddr->sa_len != 0 && sa_equal(ifa->ifa_broadaddr, addr)) { ifa_ref(ifa); IF_ADDR_RUNLOCK(ifp); goto done; } } IF_ADDR_RUNLOCK(ifp); } ifa = NULL; done: IFNET_RUNLOCK_NOSLEEP(); return (ifa); } /* * Locate the point to point interface with a given destination address. */ /*ARGSUSED*/ struct ifaddr * ifa_ifwithdstaddr(struct sockaddr *addr, int fibnum) { struct ifnet *ifp; struct ifaddr *ifa; IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { if ((ifp->if_flags & IFF_POINTOPOINT) == 0) continue; if ((fibnum != RT_ALL_FIBS) && (ifp->if_fib != fibnum)) continue; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != addr->sa_family) continue; if (ifa->ifa_dstaddr != NULL && sa_equal(addr, ifa->ifa_dstaddr)) { ifa_ref(ifa); IF_ADDR_RUNLOCK(ifp); goto done; } } IF_ADDR_RUNLOCK(ifp); } ifa = NULL; done: IFNET_RUNLOCK_NOSLEEP(); return (ifa); } /* * Find an interface on a specific network. If many, choice * is most specific found. */ struct ifaddr * ifa_ifwithnet(struct sockaddr *addr, int ignore_ptp, int fibnum) { struct ifnet *ifp; struct ifaddr *ifa; struct ifaddr *ifa_maybe = NULL; u_int af = addr->sa_family; char *addr_data = addr->sa_data, *cplim; /* * AF_LINK addresses can be looked up directly by their index number, * so do that if we can. */ if (af == AF_LINK) { struct sockaddr_dl *sdl = (struct sockaddr_dl *)addr; if (sdl->sdl_index && sdl->sdl_index <= V_if_index) return (ifaddr_byindex(sdl->sdl_index)); } /* * Scan though each interface, looking for ones that have addresses * in this address family and the requested fib. Maintain a reference * on ifa_maybe once we find one, as we release the IF_ADDR_RLOCK() that * kept it stable when we move onto the next interface. */ IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { if ((fibnum != RT_ALL_FIBS) && (ifp->if_fib != fibnum)) continue; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { char *cp, *cp2, *cp3; if (ifa->ifa_addr->sa_family != af) next: continue; if (af == AF_INET && ifp->if_flags & IFF_POINTOPOINT && !ignore_ptp) { /* * This is a bit broken as it doesn't * take into account that the remote end may * be a single node in the network we are * looking for. * The trouble is that we don't know the * netmask for the remote end. */ if (ifa->ifa_dstaddr != NULL && sa_equal(addr, ifa->ifa_dstaddr)) { ifa_ref(ifa); IF_ADDR_RUNLOCK(ifp); goto done; } } else { /* * Scan all the bits in the ifa's address. * If a bit dissagrees with what we are * looking for, mask it with the netmask * to see if it really matters. * (A byte at a time) */ if (ifa->ifa_netmask == 0) continue; cp = addr_data; cp2 = ifa->ifa_addr->sa_data; cp3 = ifa->ifa_netmask->sa_data; cplim = ifa->ifa_netmask->sa_len + (char *)ifa->ifa_netmask; while (cp3 < cplim) if ((*cp++ ^ *cp2++) & *cp3++) goto next; /* next address! */ /* * If the netmask of what we just found * is more specific than what we had before * (if we had one), or if the virtual status * of new prefix is better than of the old one, * then remember the new one before continuing * to search for an even better one. */ if (ifa_maybe == NULL || ifa_preferred(ifa_maybe, ifa) || rn_refines((caddr_t)ifa->ifa_netmask, (caddr_t)ifa_maybe->ifa_netmask)) { if (ifa_maybe != NULL) ifa_free(ifa_maybe); ifa_maybe = ifa; ifa_ref(ifa_maybe); } } } IF_ADDR_RUNLOCK(ifp); } ifa = ifa_maybe; ifa_maybe = NULL; done: IFNET_RUNLOCK_NOSLEEP(); if (ifa_maybe != NULL) ifa_free(ifa_maybe); return (ifa); } /* * Find an interface address specific to an interface best matching * a given address. */ struct ifaddr * ifaof_ifpforaddr(struct sockaddr *addr, struct ifnet *ifp) { struct ifaddr *ifa; char *cp, *cp2, *cp3; char *cplim; struct ifaddr *ifa_maybe = NULL; u_int af = addr->sa_family; if (af >= AF_MAX) return (NULL); IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != af) continue; if (ifa_maybe == NULL) ifa_maybe = ifa; if (ifa->ifa_netmask == 0) { if (sa_equal(addr, ifa->ifa_addr) || (ifa->ifa_dstaddr && sa_equal(addr, ifa->ifa_dstaddr))) goto done; continue; } if (ifp->if_flags & IFF_POINTOPOINT) { if (sa_equal(addr, ifa->ifa_dstaddr)) goto done; } else { cp = addr->sa_data; cp2 = ifa->ifa_addr->sa_data; cp3 = ifa->ifa_netmask->sa_data; cplim = ifa->ifa_netmask->sa_len + (char *)ifa->ifa_netmask; for (; cp3 < cplim; cp3++) if ((*cp++ ^ *cp2++) & *cp3) break; if (cp3 == cplim) goto done; } } ifa = ifa_maybe; done: if (ifa != NULL) ifa_ref(ifa); IF_ADDR_RUNLOCK(ifp); return (ifa); } /* * See whether new ifa is better than current one: * 1) A non-virtual one is preferred over virtual. * 2) A virtual in master state preferred over any other state. * * Used in several address selecting functions. */ int ifa_preferred(struct ifaddr *cur, struct ifaddr *next) { return (cur->ifa_carp && (!next->ifa_carp || ((*carp_master_p)(next) && !(*carp_master_p)(cur)))); } #include /* * Default action when installing a route with a Link Level gateway. * Lookup an appropriate real ifa to point to. * This should be moved to /sys/net/link.c eventually. */ static void link_rtrequest(int cmd, struct rtentry *rt, struct rt_addrinfo *info) { struct ifaddr *ifa, *oifa; struct sockaddr *dst; struct ifnet *ifp; RT_LOCK_ASSERT(rt); if (cmd != RTM_ADD || ((ifa = rt->rt_ifa) == 0) || ((ifp = ifa->ifa_ifp) == 0) || ((dst = rt_key(rt)) == 0)) return; ifa = ifaof_ifpforaddr(dst, ifp); if (ifa) { oifa = rt->rt_ifa; rt->rt_ifa = ifa; ifa_free(oifa); if (ifa->ifa_rtrequest && ifa->ifa_rtrequest != link_rtrequest) ifa->ifa_rtrequest(cmd, rt, info); } } struct sockaddr_dl * link_alloc_sdl(size_t size, int flags) { return (malloc(size, M_TEMP, flags)); } void link_free_sdl(struct sockaddr *sa) { free(sa, M_TEMP); } /* * Fills in given sdl with interface basic info. * Returns pointer to filled sdl. */ struct sockaddr_dl * link_init_sdl(struct ifnet *ifp, struct sockaddr *paddr, u_char iftype) { struct sockaddr_dl *sdl; sdl = (struct sockaddr_dl *)paddr; memset(sdl, 0, sizeof(struct sockaddr_dl)); sdl->sdl_len = sizeof(struct sockaddr_dl); sdl->sdl_family = AF_LINK; sdl->sdl_index = ifp->if_index; sdl->sdl_type = iftype; return (sdl); } /* * Mark an interface down and notify protocols of * the transition. */ static void if_unroute(struct ifnet *ifp, int flag, int fam) { struct ifaddr *ifa; KASSERT(flag == IFF_UP, ("if_unroute: flag != IFF_UP")); ifp->if_flags &= ~flag; getmicrotime(&ifp->if_lastchange); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) if (fam == PF_UNSPEC || (fam == ifa->ifa_addr->sa_family)) pfctlinput(PRC_IFDOWN, ifa->ifa_addr); ifp->if_qflush(ifp); if (ifp->if_carp) (*carp_linkstate_p)(ifp); rt_ifmsg(ifp); } /* * Mark an interface up and notify protocols of * the transition. */ static void if_route(struct ifnet *ifp, int flag, int fam) { struct ifaddr *ifa; KASSERT(flag == IFF_UP, ("if_route: flag != IFF_UP")); ifp->if_flags |= flag; getmicrotime(&ifp->if_lastchange); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) if (fam == PF_UNSPEC || (fam == ifa->ifa_addr->sa_family)) pfctlinput(PRC_IFUP, ifa->ifa_addr); if (ifp->if_carp) (*carp_linkstate_p)(ifp); rt_ifmsg(ifp); #ifdef INET6 in6_if_up(ifp); #endif } void (*vlan_link_state_p)(struct ifnet *); /* XXX: private from if_vlan */ void (*vlan_trunk_cap_p)(struct ifnet *); /* XXX: private from if_vlan */ struct ifnet *(*vlan_trunkdev_p)(struct ifnet *); struct ifnet *(*vlan_devat_p)(struct ifnet *, uint16_t); int (*vlan_tag_p)(struct ifnet *, uint16_t *); int (*vlan_setcookie_p)(struct ifnet *, void *); void *(*vlan_cookie_p)(struct ifnet *); /* * Handle a change in the interface link state. To avoid LORs * between driver lock and upper layer locks, as well as possible * recursions, we post event to taskqueue, and all job * is done in static do_link_state_change(). */ void if_link_state_change(struct ifnet *ifp, int link_state) { /* Return if state hasn't changed. */ if (ifp->if_link_state == link_state) return; ifp->if_link_state = link_state; taskqueue_enqueue(taskqueue_swi, &ifp->if_linktask); } static void do_link_state_change(void *arg, int pending) { struct ifnet *ifp = (struct ifnet *)arg; int link_state = ifp->if_link_state; CURVNET_SET(ifp->if_vnet); /* Notify that the link state has changed. */ rt_ifmsg(ifp); if (ifp->if_vlantrunk != NULL) (*vlan_link_state_p)(ifp); if ((ifp->if_type == IFT_ETHER || ifp->if_type == IFT_L2VLAN) && IFP2AC(ifp)->ac_netgraph != NULL) (*ng_ether_link_state_p)(ifp, link_state); if (ifp->if_carp) (*carp_linkstate_p)(ifp); if (ifp->if_bridge) (*bridge_linkstate_p)(ifp); if (ifp->if_lagg) (*lagg_linkstate_p)(ifp, link_state); if (IS_DEFAULT_VNET(curvnet)) devctl_notify("IFNET", ifp->if_xname, (link_state == LINK_STATE_UP) ? "LINK_UP" : "LINK_DOWN", NULL); if (pending > 1) if_printf(ifp, "%d link states coalesced\n", pending); if (log_link_state_change) log(LOG_NOTICE, "%s: link state changed to %s\n", ifp->if_xname, (link_state == LINK_STATE_UP) ? "UP" : "DOWN" ); EVENTHANDLER_INVOKE(ifnet_link_event, ifp, ifp->if_link_state); CURVNET_RESTORE(); } /* * Mark an interface down and notify protocols of * the transition. */ void if_down(struct ifnet *ifp) { if_unroute(ifp, IFF_UP, AF_UNSPEC); } /* * Mark an interface up and notify protocols of * the transition. */ void if_up(struct ifnet *ifp) { if_route(ifp, IFF_UP, AF_UNSPEC); } /* * Flush an interface queue. */ void if_qflush(struct ifnet *ifp) { struct mbuf *m, *n; struct ifaltq *ifq; ifq = &ifp->if_snd; IFQ_LOCK(ifq); #ifdef ALTQ if (ALTQ_IS_ENABLED(ifq)) ALTQ_PURGE(ifq); #endif n = ifq->ifq_head; while ((m = n) != 0) { n = m->m_nextpkt; m_freem(m); } ifq->ifq_head = 0; ifq->ifq_tail = 0; ifq->ifq_len = 0; IFQ_UNLOCK(ifq); } /* * Map interface name to interface structure pointer, with or without * returning a reference. */ struct ifnet * ifunit_ref(const char *name) { struct ifnet *ifp; IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (strncmp(name, ifp->if_xname, IFNAMSIZ) == 0 && !(ifp->if_flags & IFF_DYING)) break; } if (ifp != NULL) if_ref(ifp); IFNET_RUNLOCK_NOSLEEP(); return (ifp); } struct ifnet * ifunit(const char *name) { struct ifnet *ifp; IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (strncmp(name, ifp->if_xname, IFNAMSIZ) == 0) break; } IFNET_RUNLOCK_NOSLEEP(); return (ifp); } /* * Hardware specific interface ioctls. */ static int ifhwioctl(u_long cmd, struct ifnet *ifp, caddr_t data, struct thread *td) { struct ifreq *ifr; int error = 0; int new_flags, temp_flags; size_t namelen, onamelen; size_t descrlen; char *descrbuf, *odescrbuf; char new_name[IFNAMSIZ]; struct ifaddr *ifa; struct sockaddr_dl *sdl; ifr = (struct ifreq *)data; switch (cmd) { case SIOCGIFINDEX: ifr->ifr_index = ifp->if_index; break; case SIOCGIFFLAGS: temp_flags = ifp->if_flags | ifp->if_drv_flags; ifr->ifr_flags = temp_flags & 0xffff; ifr->ifr_flagshigh = temp_flags >> 16; break; case SIOCGIFCAP: ifr->ifr_reqcap = ifp->if_capabilities; ifr->ifr_curcap = ifp->if_capenable; break; #ifdef MAC case SIOCGIFMAC: error = mac_ifnet_ioctl_get(td->td_ucred, ifr, ifp); break; #endif case SIOCGIFMETRIC: ifr->ifr_metric = ifp->if_metric; break; case SIOCGIFMTU: ifr->ifr_mtu = ifp->if_mtu; break; case SIOCGIFPHYS: /* XXXGL: did this ever worked? */ ifr->ifr_phys = 0; break; case SIOCGIFDESCR: error = 0; sx_slock(&ifdescr_sx); if (ifp->if_description == NULL) error = ENOMSG; else { /* space for terminating nul */ descrlen = strlen(ifp->if_description) + 1; if (ifr->ifr_buffer.length < descrlen) ifr->ifr_buffer.buffer = NULL; else error = copyout(ifp->if_description, ifr->ifr_buffer.buffer, descrlen); ifr->ifr_buffer.length = descrlen; } sx_sunlock(&ifdescr_sx); break; case SIOCSIFDESCR: error = priv_check(td, PRIV_NET_SETIFDESCR); if (error) return (error); /* * Copy only (length-1) bytes to make sure that * if_description is always nul terminated. The * length parameter is supposed to count the * terminating nul in. */ if (ifr->ifr_buffer.length > ifdescr_maxlen) return (ENAMETOOLONG); else if (ifr->ifr_buffer.length == 0) descrbuf = NULL; else { descrbuf = malloc(ifr->ifr_buffer.length, M_IFDESCR, M_WAITOK | M_ZERO); error = copyin(ifr->ifr_buffer.buffer, descrbuf, ifr->ifr_buffer.length - 1); if (error) { free(descrbuf, M_IFDESCR); break; } } sx_xlock(&ifdescr_sx); odescrbuf = ifp->if_description; ifp->if_description = descrbuf; sx_xunlock(&ifdescr_sx); getmicrotime(&ifp->if_lastchange); free(odescrbuf, M_IFDESCR); break; case SIOCGIFFIB: ifr->ifr_fib = ifp->if_fib; break; case SIOCSIFFIB: error = priv_check(td, PRIV_NET_SETIFFIB); if (error) return (error); if (ifr->ifr_fib >= rt_numfibs) return (EINVAL); ifp->if_fib = ifr->ifr_fib; break; case SIOCSIFFLAGS: error = priv_check(td, PRIV_NET_SETIFFLAGS); if (error) return (error); /* * Currently, no driver owned flags pass the IFF_CANTCHANGE * check, so we don't need special handling here yet. */ new_flags = (ifr->ifr_flags & 0xffff) | (ifr->ifr_flagshigh << 16); if (ifp->if_flags & IFF_UP && (new_flags & IFF_UP) == 0) { if_down(ifp); } else if (new_flags & IFF_UP && (ifp->if_flags & IFF_UP) == 0) { if_up(ifp); } /* See if permanently promiscuous mode bit is about to flip */ if ((ifp->if_flags ^ new_flags) & IFF_PPROMISC) { if (new_flags & IFF_PPROMISC) ifp->if_flags |= IFF_PROMISC; else if (ifp->if_pcount == 0) ifp->if_flags &= ~IFF_PROMISC; log(LOG_INFO, "%s: permanently promiscuous mode %s\n", ifp->if_xname, (new_flags & IFF_PPROMISC) ? "enabled" : "disabled"); } ifp->if_flags = (ifp->if_flags & IFF_CANTCHANGE) | (new_flags &~ IFF_CANTCHANGE); if (ifp->if_ioctl) { (void) (*ifp->if_ioctl)(ifp, cmd, data); } getmicrotime(&ifp->if_lastchange); break; case SIOCSIFCAP: error = priv_check(td, PRIV_NET_SETIFCAP); if (error) return (error); if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); if (ifr->ifr_reqcap & ~ifp->if_capabilities) return (EINVAL); error = (*ifp->if_ioctl)(ifp, cmd, data); if (error == 0) getmicrotime(&ifp->if_lastchange); break; #ifdef MAC case SIOCSIFMAC: error = mac_ifnet_ioctl_set(td->td_ucred, ifr, ifp); break; #endif case SIOCSIFNAME: error = priv_check(td, PRIV_NET_SETIFNAME); if (error) return (error); error = copyinstr(ifr->ifr_data, new_name, IFNAMSIZ, NULL); if (error != 0) return (error); if (new_name[0] == '\0') return (EINVAL); if (ifunit(new_name) != NULL) return (EEXIST); /* * XXX: Locking. Nothing else seems to lock if_flags, * and there are numerous other races with the * ifunit() checks not being atomic with namespace * changes (renames, vmoves, if_attach, etc). */ ifp->if_flags |= IFF_RENAMING; /* Announce the departure of the interface. */ rt_ifannouncemsg(ifp, IFAN_DEPARTURE); EVENTHANDLER_INVOKE(ifnet_departure_event, ifp); log(LOG_INFO, "%s: changing name to '%s'\n", ifp->if_xname, new_name); IF_ADDR_WLOCK(ifp); strlcpy(ifp->if_xname, new_name, sizeof(ifp->if_xname)); ifa = ifp->if_addr; sdl = (struct sockaddr_dl *)ifa->ifa_addr; namelen = strlen(new_name); onamelen = sdl->sdl_nlen; /* * Move the address if needed. This is safe because we * allocate space for a name of length IFNAMSIZ when we * create this in if_attach(). */ if (namelen != onamelen) { bcopy(sdl->sdl_data + onamelen, sdl->sdl_data + namelen, sdl->sdl_alen); } bcopy(new_name, sdl->sdl_data, namelen); sdl->sdl_nlen = namelen; sdl = (struct sockaddr_dl *)ifa->ifa_netmask; bzero(sdl->sdl_data, onamelen); while (namelen != 0) sdl->sdl_data[--namelen] = 0xff; IF_ADDR_WUNLOCK(ifp); EVENTHANDLER_INVOKE(ifnet_arrival_event, ifp); /* Announce the return of the interface. */ rt_ifannouncemsg(ifp, IFAN_ARRIVAL); ifp->if_flags &= ~IFF_RENAMING; break; #ifdef VIMAGE case SIOCSIFVNET: error = priv_check(td, PRIV_NET_SETIFVNET); if (error) return (error); error = if_vmove_loan(td, ifp, ifr->ifr_name, ifr->ifr_jid); break; #endif case SIOCSIFMETRIC: error = priv_check(td, PRIV_NET_SETIFMETRIC); if (error) return (error); ifp->if_metric = ifr->ifr_metric; getmicrotime(&ifp->if_lastchange); break; case SIOCSIFPHYS: error = priv_check(td, PRIV_NET_SETIFPHYS); if (error) return (error); if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); error = (*ifp->if_ioctl)(ifp, cmd, data); if (error == 0) getmicrotime(&ifp->if_lastchange); break; case SIOCSIFMTU: { u_long oldmtu = ifp->if_mtu; error = priv_check(td, PRIV_NET_SETIFMTU); if (error) return (error); if (ifr->ifr_mtu < IF_MINMTU || ifr->ifr_mtu > IF_MAXMTU) return (EINVAL); if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); error = (*ifp->if_ioctl)(ifp, cmd, data); if (error == 0) { getmicrotime(&ifp->if_lastchange); rt_ifmsg(ifp); } /* * If the link MTU changed, do network layer specific procedure. */ if (ifp->if_mtu != oldmtu) { #ifdef INET6 nd6_setmtu(ifp); #endif } break; } case SIOCADDMULTI: case SIOCDELMULTI: if (cmd == SIOCADDMULTI) error = priv_check(td, PRIV_NET_ADDMULTI); else error = priv_check(td, PRIV_NET_DELMULTI); if (error) return (error); /* Don't allow group membership on non-multicast interfaces. */ if ((ifp->if_flags & IFF_MULTICAST) == 0) return (EOPNOTSUPP); /* Don't let users screw up protocols' entries. */ if (ifr->ifr_addr.sa_family != AF_LINK) return (EINVAL); if (cmd == SIOCADDMULTI) { struct ifmultiaddr *ifma; /* * Userland is only permitted to join groups once * via the if_addmulti() KPI, because it cannot hold * struct ifmultiaddr * between calls. It may also * lose a race while we check if the membership * already exists. */ IF_ADDR_RLOCK(ifp); ifma = if_findmulti(ifp, &ifr->ifr_addr); IF_ADDR_RUNLOCK(ifp); if (ifma != NULL) error = EADDRINUSE; else error = if_addmulti(ifp, &ifr->ifr_addr, &ifma); } else { error = if_delmulti(ifp, &ifr->ifr_addr); } if (error == 0) getmicrotime(&ifp->if_lastchange); break; case SIOCSIFPHYADDR: case SIOCDIFPHYADDR: #ifdef INET6 case SIOCSIFPHYADDR_IN6: #endif case SIOCSIFMEDIA: case SIOCSIFGENERIC: error = priv_check(td, PRIV_NET_HWIOCTL); if (error) return (error); if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); error = (*ifp->if_ioctl)(ifp, cmd, data); if (error == 0) getmicrotime(&ifp->if_lastchange); break; case SIOCGIFSTATUS: case SIOCGIFPSRCADDR: case SIOCGIFPDSTADDR: case SIOCGIFMEDIA: case SIOCGIFGENERIC: if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); error = (*ifp->if_ioctl)(ifp, cmd, data); break; case SIOCSIFLLADDR: error = priv_check(td, PRIV_NET_SETLLADDR); if (error) return (error); error = if_setlladdr(ifp, ifr->ifr_addr.sa_data, ifr->ifr_addr.sa_len); EVENTHANDLER_INVOKE(iflladdr_event, ifp); break; case SIOCAIFGROUP: { struct ifgroupreq *ifgr = (struct ifgroupreq *)ifr; error = priv_check(td, PRIV_NET_ADDIFGROUP); if (error) return (error); if ((error = if_addgroup(ifp, ifgr->ifgr_group))) return (error); break; } case SIOCGIFGROUP: if ((error = if_getgroup((struct ifgroupreq *)ifr, ifp))) return (error); break; case SIOCDIFGROUP: { struct ifgroupreq *ifgr = (struct ifgroupreq *)ifr; error = priv_check(td, PRIV_NET_DELIFGROUP); if (error) return (error); if ((error = if_delgroup(ifp, ifgr->ifgr_group))) return (error); break; } default: error = ENOIOCTL; break; } return (error); } #ifdef COMPAT_FREEBSD32 struct ifconf32 { int32_t ifc_len; union { uint32_t ifcu_buf; uint32_t ifcu_req; } ifc_ifcu; }; #define SIOCGIFCONF32 _IOWR('i', 36, struct ifconf32) #endif /* * Interface ioctls. */ int ifioctl(struct socket *so, u_long cmd, caddr_t data, struct thread *td) { struct ifnet *ifp; struct ifreq *ifr; int error; int oif_flags; CURVNET_SET(so->so_vnet); switch (cmd) { case SIOCGIFCONF: error = ifconf(cmd, data); CURVNET_RESTORE(); return (error); #ifdef COMPAT_FREEBSD32 case SIOCGIFCONF32: { struct ifconf32 *ifc32; struct ifconf ifc; ifc32 = (struct ifconf32 *)data; ifc.ifc_len = ifc32->ifc_len; ifc.ifc_buf = PTRIN(ifc32->ifc_buf); error = ifconf(SIOCGIFCONF, (void *)&ifc); CURVNET_RESTORE(); if (error == 0) ifc32->ifc_len = ifc.ifc_len; return (error); } #endif } ifr = (struct ifreq *)data; switch (cmd) { #ifdef VIMAGE case SIOCSIFRVNET: error = priv_check(td, PRIV_NET_SETIFVNET); if (error == 0) error = if_vmove_reclaim(td, ifr->ifr_name, ifr->ifr_jid); CURVNET_RESTORE(); return (error); #endif case SIOCIFCREATE: case SIOCIFCREATE2: error = priv_check(td, PRIV_NET_IFCREATE); if (error == 0) error = if_clone_create(ifr->ifr_name, sizeof(ifr->ifr_name), cmd == SIOCIFCREATE2 ? ifr->ifr_data : NULL); CURVNET_RESTORE(); return (error); case SIOCIFDESTROY: error = priv_check(td, PRIV_NET_IFDESTROY); if (error == 0) error = if_clone_destroy(ifr->ifr_name); CURVNET_RESTORE(); return (error); case SIOCIFGCLONERS: error = if_clone_list((struct if_clonereq *)data); CURVNET_RESTORE(); return (error); case SIOCGIFGMEMB: error = if_getgroupmembers((struct ifgroupreq *)data); CURVNET_RESTORE(); return (error); #if defined(INET) || defined(INET6) case SIOCSVH: case SIOCGVH: if (carp_ioctl_p == NULL) error = EPROTONOSUPPORT; else error = (*carp_ioctl_p)(ifr, cmd, td); CURVNET_RESTORE(); return (error); #endif } ifp = ifunit_ref(ifr->ifr_name); if (ifp == NULL) { CURVNET_RESTORE(); return (ENXIO); } error = ifhwioctl(cmd, ifp, data, td); if (error != ENOIOCTL) { if_rele(ifp); CURVNET_RESTORE(); return (error); } oif_flags = ifp->if_flags; if (so->so_proto == NULL) { if_rele(ifp); CURVNET_RESTORE(); return (EOPNOTSUPP); } /* * Pass the request on to the socket control method, and if the * latter returns EOPNOTSUPP, directly to the interface. * * Make an exception for the legacy SIOCSIF* requests. Drivers * trust SIOCSIFADDR et al to come from an already privileged * layer, and do not perform any credentials checks or input * validation. */ error = ((*so->so_proto->pr_usrreqs->pru_control)(so, cmd, data, ifp, td)); if (error == EOPNOTSUPP && ifp != NULL && ifp->if_ioctl != NULL && cmd != SIOCSIFADDR && cmd != SIOCSIFBRDADDR && cmd != SIOCSIFDSTADDR && cmd != SIOCSIFNETMASK) error = (*ifp->if_ioctl)(ifp, cmd, data); if ((oif_flags ^ ifp->if_flags) & IFF_UP) { #ifdef INET6 if (ifp->if_flags & IFF_UP) in6_if_up(ifp); #endif } if_rele(ifp); CURVNET_RESTORE(); return (error); } /* * The code common to handling reference counted flags, * e.g., in ifpromisc() and if_allmulti(). * The "pflag" argument can specify a permanent mode flag to check, * such as IFF_PPROMISC for promiscuous mode; should be 0 if none. * * Only to be used on stack-owned flags, not driver-owned flags. */ static int if_setflag(struct ifnet *ifp, int flag, int pflag, int *refcount, int onswitch) { struct ifreq ifr; int error; int oldflags, oldcount; /* Sanity checks to catch programming errors */ KASSERT((flag & (IFF_DRV_OACTIVE|IFF_DRV_RUNNING)) == 0, ("%s: setting driver-owned flag %d", __func__, flag)); if (onswitch) KASSERT(*refcount >= 0, ("%s: increment negative refcount %d for flag %d", __func__, *refcount, flag)); else KASSERT(*refcount > 0, ("%s: decrement non-positive refcount %d for flag %d", __func__, *refcount, flag)); /* In case this mode is permanent, just touch refcount */ if (ifp->if_flags & pflag) { *refcount += onswitch ? 1 : -1; return (0); } /* Save ifnet parameters for if_ioctl() may fail */ oldcount = *refcount; oldflags = ifp->if_flags; /* * See if we aren't the only and touching refcount is enough. * Actually toggle interface flag if we are the first or last. */ if (onswitch) { if ((*refcount)++) return (0); ifp->if_flags |= flag; } else { if (--(*refcount)) return (0); ifp->if_flags &= ~flag; } /* Call down the driver since we've changed interface flags */ if (ifp->if_ioctl == NULL) { error = EOPNOTSUPP; goto recover; } ifr.ifr_flags = ifp->if_flags & 0xffff; ifr.ifr_flagshigh = ifp->if_flags >> 16; error = (*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t)&ifr); if (error) goto recover; /* Notify userland that interface flags have changed */ rt_ifmsg(ifp); return (0); recover: /* Recover after driver error */ *refcount = oldcount; ifp->if_flags = oldflags; return (error); } /* * Set/clear promiscuous mode on interface ifp based on the truth value * of pswitch. The calls are reference counted so that only the first * "on" request actually has an effect, as does the final "off" request. * Results are undefined if the "off" and "on" requests are not matched. */ int ifpromisc(struct ifnet *ifp, int pswitch) { int error; int oldflags = ifp->if_flags; error = if_setflag(ifp, IFF_PROMISC, IFF_PPROMISC, &ifp->if_pcount, pswitch); /* If promiscuous mode status has changed, log a message */ if (error == 0 && ((ifp->if_flags ^ oldflags) & IFF_PROMISC)) log(LOG_INFO, "%s: promiscuous mode %s\n", ifp->if_xname, (ifp->if_flags & IFF_PROMISC) ? "enabled" : "disabled"); return (error); } /* * Return interface configuration * of system. List may be used * in later ioctl's (above) to get * other information. */ /*ARGSUSED*/ static int ifconf(u_long cmd, caddr_t data) { struct ifconf *ifc = (struct ifconf *)data; struct ifnet *ifp; struct ifaddr *ifa; struct ifreq ifr; struct sbuf *sb; int error, full = 0, valid_len, max_len; /* Limit initial buffer size to MAXPHYS to avoid DoS from userspace. */ max_len = MAXPHYS - 1; /* Prevent hostile input from being able to crash the system */ if (ifc->ifc_len <= 0) return (EINVAL); again: if (ifc->ifc_len <= max_len) { max_len = ifc->ifc_len; full = 1; } sb = sbuf_new(NULL, NULL, max_len + 1, SBUF_FIXEDLEN); max_len = 0; valid_len = 0; IFNET_RLOCK(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { int addrs; /* * Zero the ifr_name buffer to make sure we don't * disclose the contents of the stack. */ memset(ifr.ifr_name, 0, sizeof(ifr.ifr_name)); if (strlcpy(ifr.ifr_name, ifp->if_xname, sizeof(ifr.ifr_name)) >= sizeof(ifr.ifr_name)) { sbuf_delete(sb); IFNET_RUNLOCK(); return (ENAMETOOLONG); } addrs = 0; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { struct sockaddr *sa = ifa->ifa_addr; if (prison_if(curthread->td_ucred, sa) != 0) continue; addrs++; if (sa->sa_len <= sizeof(*sa)) { ifr.ifr_addr = *sa; sbuf_bcat(sb, &ifr, sizeof(ifr)); max_len += sizeof(ifr); } else { sbuf_bcat(sb, &ifr, offsetof(struct ifreq, ifr_addr)); max_len += offsetof(struct ifreq, ifr_addr); sbuf_bcat(sb, sa, sa->sa_len); max_len += sa->sa_len; } if (sbuf_error(sb) == 0) valid_len = sbuf_len(sb); } IF_ADDR_RUNLOCK(ifp); if (addrs == 0) { bzero((caddr_t)&ifr.ifr_addr, sizeof(ifr.ifr_addr)); sbuf_bcat(sb, &ifr, sizeof(ifr)); max_len += sizeof(ifr); if (sbuf_error(sb) == 0) valid_len = sbuf_len(sb); } } IFNET_RUNLOCK(); /* * If we didn't allocate enough space (uncommon), try again. If * we have already allocated as much space as we are allowed, * return what we've got. */ if (valid_len != max_len && !full) { sbuf_delete(sb); goto again; } ifc->ifc_len = valid_len; sbuf_finish(sb); error = copyout(sbuf_data(sb), ifc->ifc_req, ifc->ifc_len); sbuf_delete(sb); return (error); } /* * Just like ifpromisc(), but for all-multicast-reception mode. */ int if_allmulti(struct ifnet *ifp, int onswitch) { return (if_setflag(ifp, IFF_ALLMULTI, 0, &ifp->if_amcount, onswitch)); } struct ifmultiaddr * if_findmulti(struct ifnet *ifp, struct sockaddr *sa) { struct ifmultiaddr *ifma; IF_ADDR_LOCK_ASSERT(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (sa->sa_family == AF_LINK) { if (sa_dl_equal(ifma->ifma_addr, sa)) break; } else { if (sa_equal(ifma->ifma_addr, sa)) break; } } return ifma; } /* * Allocate a new ifmultiaddr and initialize based on passed arguments. We * make copies of passed sockaddrs. The ifmultiaddr will not be added to * the ifnet multicast address list here, so the caller must do that and * other setup work (such as notifying the device driver). The reference * count is initialized to 1. */ static struct ifmultiaddr * if_allocmulti(struct ifnet *ifp, struct sockaddr *sa, struct sockaddr *llsa, int mflags) { struct ifmultiaddr *ifma; struct sockaddr *dupsa; ifma = malloc(sizeof *ifma, M_IFMADDR, mflags | M_ZERO); if (ifma == NULL) return (NULL); dupsa = malloc(sa->sa_len, M_IFMADDR, mflags); if (dupsa == NULL) { free(ifma, M_IFMADDR); return (NULL); } bcopy(sa, dupsa, sa->sa_len); ifma->ifma_addr = dupsa; ifma->ifma_ifp = ifp; ifma->ifma_refcount = 1; ifma->ifma_protospec = NULL; if (llsa == NULL) { ifma->ifma_lladdr = NULL; return (ifma); } dupsa = malloc(llsa->sa_len, M_IFMADDR, mflags); if (dupsa == NULL) { free(ifma->ifma_addr, M_IFMADDR); free(ifma, M_IFMADDR); return (NULL); } bcopy(llsa, dupsa, llsa->sa_len); ifma->ifma_lladdr = dupsa; return (ifma); } /* * if_freemulti: free ifmultiaddr structure and possibly attached related * addresses. The caller is responsible for implementing reference * counting, notifying the driver, handling routing messages, and releasing * any dependent link layer state. */ static void if_freemulti(struct ifmultiaddr *ifma) { KASSERT(ifma->ifma_refcount == 0, ("if_freemulti: refcount %d", ifma->ifma_refcount)); if (ifma->ifma_lladdr != NULL) free(ifma->ifma_lladdr, M_IFMADDR); free(ifma->ifma_addr, M_IFMADDR); free(ifma, M_IFMADDR); } /* * Register an additional multicast address with a network interface. * * - If the address is already present, bump the reference count on the * address and return. * - If the address is not link-layer, look up a link layer address. * - Allocate address structures for one or both addresses, and attach to the * multicast address list on the interface. If automatically adding a link * layer address, the protocol address will own a reference to the link * layer address, to be freed when it is freed. * - Notify the network device driver of an addition to the multicast address * list. * * 'sa' points to caller-owned memory with the desired multicast address. * * 'retifma' will be used to return a pointer to the resulting multicast * address reference, if desired. */ int if_addmulti(struct ifnet *ifp, struct sockaddr *sa, struct ifmultiaddr **retifma) { struct ifmultiaddr *ifma, *ll_ifma; struct sockaddr *llsa; struct sockaddr_dl sdl; int error; /* * If the address is already present, return a new reference to it; * otherwise, allocate storage and set up a new address. */ IF_ADDR_WLOCK(ifp); ifma = if_findmulti(ifp, sa); if (ifma != NULL) { ifma->ifma_refcount++; if (retifma != NULL) *retifma = ifma; IF_ADDR_WUNLOCK(ifp); return (0); } /* * The address isn't already present; resolve the protocol address * into a link layer address, and then look that up, bump its * refcount or allocate an ifma for that also. * Most link layer resolving functions returns address data which * fits inside default sockaddr_dl structure. However callback * can allocate another sockaddr structure, in that case we need to * free it later. */ llsa = NULL; ll_ifma = NULL; if (ifp->if_resolvemulti != NULL) { /* Provide called function with buffer size information */ sdl.sdl_len = sizeof(sdl); llsa = (struct sockaddr *)&sdl; error = ifp->if_resolvemulti(ifp, &llsa, sa); if (error) goto unlock_out; } /* * Allocate the new address. Don't hook it up yet, as we may also * need to allocate a link layer multicast address. */ ifma = if_allocmulti(ifp, sa, llsa, M_NOWAIT); if (ifma == NULL) { error = ENOMEM; goto free_llsa_out; } /* * If a link layer address is found, we'll need to see if it's * already present in the address list, or allocate is as well. * When this block finishes, the link layer address will be on the * list. */ if (llsa != NULL) { ll_ifma = if_findmulti(ifp, llsa); if (ll_ifma == NULL) { ll_ifma = if_allocmulti(ifp, llsa, NULL, M_NOWAIT); if (ll_ifma == NULL) { --ifma->ifma_refcount; if_freemulti(ifma); error = ENOMEM; goto free_llsa_out; } TAILQ_INSERT_HEAD(&ifp->if_multiaddrs, ll_ifma, ifma_link); } else ll_ifma->ifma_refcount++; ifma->ifma_llifma = ll_ifma; } /* * We now have a new multicast address, ifma, and possibly a new or * referenced link layer address. Add the primary address to the * ifnet address list. */ TAILQ_INSERT_HEAD(&ifp->if_multiaddrs, ifma, ifma_link); if (retifma != NULL) *retifma = ifma; /* * Must generate the message while holding the lock so that 'ifma' * pointer is still valid. */ rt_newmaddrmsg(RTM_NEWMADDR, ifma); IF_ADDR_WUNLOCK(ifp); /* * We are certain we have added something, so call down to the * interface to let them know about it. */ if (ifp->if_ioctl != NULL) { (void) (*ifp->if_ioctl)(ifp, SIOCADDMULTI, 0); } if ((llsa != NULL) && (llsa != (struct sockaddr *)&sdl)) link_free_sdl(llsa); return (0); free_llsa_out: if ((llsa != NULL) && (llsa != (struct sockaddr *)&sdl)) link_free_sdl(llsa); unlock_out: IF_ADDR_WUNLOCK(ifp); return (error); } /* * Delete a multicast group membership by network-layer group address. * * Returns ENOENT if the entry could not be found. If ifp no longer * exists, results are undefined. This entry point should only be used * from subsystems which do appropriate locking to hold ifp for the * duration of the call. * Network-layer protocol domains must use if_delmulti_ifma(). */ int if_delmulti(struct ifnet *ifp, struct sockaddr *sa) { struct ifmultiaddr *ifma; int lastref; #ifdef INVARIANTS struct ifnet *oifp; IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(oifp, &V_ifnet, if_link) if (ifp == oifp) break; if (ifp != oifp) ifp = NULL; IFNET_RUNLOCK_NOSLEEP(); KASSERT(ifp != NULL, ("%s: ifnet went away", __func__)); #endif if (ifp == NULL) return (ENOENT); IF_ADDR_WLOCK(ifp); lastref = 0; ifma = if_findmulti(ifp, sa); if (ifma != NULL) lastref = if_delmulti_locked(ifp, ifma, 0); IF_ADDR_WUNLOCK(ifp); if (ifma == NULL) return (ENOENT); if (lastref && ifp->if_ioctl != NULL) { (void)(*ifp->if_ioctl)(ifp, SIOCDELMULTI, 0); } return (0); } /* * Delete all multicast group membership for an interface. * Should be used to quickly flush all multicast filters. */ void if_delallmulti(struct ifnet *ifp) { struct ifmultiaddr *ifma; struct ifmultiaddr *next; IF_ADDR_WLOCK(ifp); TAILQ_FOREACH_SAFE(ifma, &ifp->if_multiaddrs, ifma_link, next) if_delmulti_locked(ifp, ifma, 0); IF_ADDR_WUNLOCK(ifp); } /* * Delete a multicast group membership by group membership pointer. * Network-layer protocol domains must use this routine. * * It is safe to call this routine if the ifp disappeared. */ void if_delmulti_ifma(struct ifmultiaddr *ifma) { struct ifnet *ifp; int lastref; ifp = ifma->ifma_ifp; #ifdef DIAGNOSTIC if (ifp == NULL) { printf("%s: ifma_ifp seems to be detached\n", __func__); } else { struct ifnet *oifp; IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(oifp, &V_ifnet, if_link) if (ifp == oifp) break; if (ifp != oifp) { printf("%s: ifnet %p disappeared\n", __func__, ifp); ifp = NULL; } IFNET_RUNLOCK_NOSLEEP(); } #endif /* * If and only if the ifnet instance exists: Acquire the address lock. */ if (ifp != NULL) IF_ADDR_WLOCK(ifp); lastref = if_delmulti_locked(ifp, ifma, 0); if (ifp != NULL) { /* * If and only if the ifnet instance exists: * Release the address lock. * If the group was left: update the hardware hash filter. */ IF_ADDR_WUNLOCK(ifp); if (lastref && ifp->if_ioctl != NULL) { (void)(*ifp->if_ioctl)(ifp, SIOCDELMULTI, 0); } } } /* * Perform deletion of network-layer and/or link-layer multicast address. * * Return 0 if the reference count was decremented. * Return 1 if the final reference was released, indicating that the * hardware hash filter should be reprogrammed. */ static int if_delmulti_locked(struct ifnet *ifp, struct ifmultiaddr *ifma, int detaching) { struct ifmultiaddr *ll_ifma; if (ifp != NULL && ifma->ifma_ifp != NULL) { KASSERT(ifma->ifma_ifp == ifp, ("%s: inconsistent ifp %p", __func__, ifp)); IF_ADDR_WLOCK_ASSERT(ifp); } ifp = ifma->ifma_ifp; /* * If the ifnet is detaching, null out references to ifnet, * so that upper protocol layers will notice, and not attempt * to obtain locks for an ifnet which no longer exists. The * routing socket announcement must happen before the ifnet * instance is detached from the system. */ if (detaching) { #ifdef DIAGNOSTIC printf("%s: detaching ifnet instance %p\n", __func__, ifp); #endif /* * ifp may already be nulled out if we are being reentered * to delete the ll_ifma. */ if (ifp != NULL) { rt_newmaddrmsg(RTM_DELMADDR, ifma); ifma->ifma_ifp = NULL; } } if (--ifma->ifma_refcount > 0) return 0; /* * If this ifma is a network-layer ifma, a link-layer ifma may * have been associated with it. Release it first if so. */ ll_ifma = ifma->ifma_llifma; if (ll_ifma != NULL) { KASSERT(ifma->ifma_lladdr != NULL, ("%s: llifma w/o lladdr", __func__)); if (detaching) ll_ifma->ifma_ifp = NULL; /* XXX */ if (--ll_ifma->ifma_refcount == 0) { if (ifp != NULL) { TAILQ_REMOVE(&ifp->if_multiaddrs, ll_ifma, ifma_link); } if_freemulti(ll_ifma); } } if (ifp != NULL) TAILQ_REMOVE(&ifp->if_multiaddrs, ifma, ifma_link); if_freemulti(ifma); /* * The last reference to this instance of struct ifmultiaddr * was released; the hardware should be notified of this change. */ return 1; } /* * Set the link layer address on an interface. * * At this time we only support certain types of interfaces, * and we don't allow the length of the address to change. */ int if_setlladdr(struct ifnet *ifp, const u_char *lladdr, int len) { struct sockaddr_dl *sdl; struct ifaddr *ifa; struct ifreq ifr; IF_ADDR_RLOCK(ifp); ifa = ifp->if_addr; if (ifa == NULL) { IF_ADDR_RUNLOCK(ifp); return (EINVAL); } ifa_ref(ifa); IF_ADDR_RUNLOCK(ifp); sdl = (struct sockaddr_dl *)ifa->ifa_addr; if (sdl == NULL) { ifa_free(ifa); return (EINVAL); } if (len != sdl->sdl_alen) { /* don't allow length to change */ ifa_free(ifa); return (EINVAL); } switch (ifp->if_type) { case IFT_ETHER: case IFT_FDDI: case IFT_XETHER: case IFT_ISO88025: case IFT_L2VLAN: case IFT_BRIDGE: case IFT_ARCNET: case IFT_IEEE8023ADLAG: case IFT_IEEE80211: bcopy(lladdr, LLADDR(sdl), len); ifa_free(ifa); break; default: ifa_free(ifa); return (ENODEV); } /* * If the interface is already up, we need * to re-init it in order to reprogram its * address filter. */ if ((ifp->if_flags & IFF_UP) != 0) { if (ifp->if_ioctl) { ifp->if_flags &= ~IFF_UP; ifr.ifr_flags = ifp->if_flags & 0xffff; ifr.ifr_flagshigh = ifp->if_flags >> 16; (*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t)&ifr); ifp->if_flags |= IFF_UP; ifr.ifr_flags = ifp->if_flags & 0xffff; ifr.ifr_flagshigh = ifp->if_flags >> 16; (*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t)&ifr); } #ifdef INET /* * Also send gratuitous ARPs to notify other nodes about * the address change. */ TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family == AF_INET) arp_ifinit(ifp, ifa); } #endif } return (0); } /* * The name argument must be a pointer to storage which will last as * long as the interface does. For physical devices, the result of * device_get_name(dev) is a good choice and for pseudo-devices a * static string works well. */ void if_initname(struct ifnet *ifp, const char *name, int unit) { ifp->if_dname = name; ifp->if_dunit = unit; if (unit != IF_DUNIT_NONE) snprintf(ifp->if_xname, IFNAMSIZ, "%s%d", name, unit); else strlcpy(ifp->if_xname, name, IFNAMSIZ); } int if_printf(struct ifnet *ifp, const char * fmt, ...) { va_list ap; int retval; retval = printf("%s: ", ifp->if_xname); va_start(ap, fmt); retval += vprintf(fmt, ap); va_end(ap); return (retval); } void if_start(struct ifnet *ifp) { (*(ifp)->if_start)(ifp); } /* * Backwards compatibility interface for drivers * that have not implemented it */ static int if_transmit(struct ifnet *ifp, struct mbuf *m) { int error; IFQ_HANDOFF(ifp, m, error); return (error); } int if_handoff(struct ifqueue *ifq, struct mbuf *m, struct ifnet *ifp, int adjust) { int active = 0; IF_LOCK(ifq); if (_IF_QFULL(ifq)) { _IF_DROP(ifq); IF_UNLOCK(ifq); m_freem(m); return (0); } if (ifp != NULL) { ifp->if_obytes += m->m_pkthdr.len + adjust; if (m->m_flags & (M_BCAST|M_MCAST)) ifp->if_omcasts++; active = ifp->if_drv_flags & IFF_DRV_OACTIVE; } _IF_ENQUEUE(ifq, m); IF_UNLOCK(ifq); if (ifp != NULL && !active) (*(ifp)->if_start)(ifp); return (1); } void if_register_com_alloc(u_char type, if_com_alloc_t *a, if_com_free_t *f) { KASSERT(if_com_alloc[type] == NULL, ("if_register_com_alloc: %d already registered", type)); KASSERT(if_com_free[type] == NULL, ("if_register_com_alloc: %d free already registered", type)); if_com_alloc[type] = a; if_com_free[type] = f; } void if_deregister_com_alloc(u_char type) { KASSERT(if_com_alloc[type] != NULL, ("if_deregister_com_alloc: %d not registered", type)); KASSERT(if_com_free[type] != NULL, ("if_deregister_com_alloc: %d free not registered", type)); if_com_alloc[type] = NULL; if_com_free[type] = NULL; } /* API for driver access to network stack owned ifnet.*/ uint64_t if_setbaudrate(struct ifnet *ifp, uint64_t baudrate) { uint64_t oldbrate; oldbrate = ifp->if_baudrate; ifp->if_baudrate = baudrate; return (oldbrate); } uint64_t if_getbaudrate(if_t ifp) { return (((struct ifnet *)ifp)->if_baudrate); } int if_setcapabilities(if_t ifp, int capabilities) { ((struct ifnet *)ifp)->if_capabilities = capabilities; return (0); } int if_setcapabilitiesbit(if_t ifp, int setbit, int clearbit) { ((struct ifnet *)ifp)->if_capabilities |= setbit; ((struct ifnet *)ifp)->if_capabilities &= ~clearbit; return (0); } int if_getcapabilities(if_t ifp) { return ((struct ifnet *)ifp)->if_capabilities; } int if_setcapenable(if_t ifp, int capabilities) { ((struct ifnet *)ifp)->if_capenable = capabilities; return (0); } int if_setcapenablebit(if_t ifp, int setcap, int clearcap) { if(setcap) ((struct ifnet *)ifp)->if_capenable |= setcap; if(clearcap) ((struct ifnet *)ifp)->if_capenable &= ~clearcap; return (0); } const char * if_getdname(if_t ifp) { return ((struct ifnet *)ifp)->if_dname; } int if_togglecapenable(if_t ifp, int togglecap) { ((struct ifnet *)ifp)->if_capenable ^= togglecap; return (0); } int if_getcapenable(if_t ifp) { return ((struct ifnet *)ifp)->if_capenable; } /* * This is largely undesirable because it ties ifnet to a device, but does * provide flexiblity for an embedded product vendor. Should be used with * the understanding that it violates the interface boundaries, and should be * a last resort only. */ int if_setdev(if_t ifp, void *dev) { return (0); } int if_setdrvflagbits(if_t ifp, int set_flags, int clear_flags) { ((struct ifnet *)ifp)->if_drv_flags |= set_flags; ((struct ifnet *)ifp)->if_drv_flags &= ~clear_flags; return (0); } int if_getdrvflags(if_t ifp) { return ((struct ifnet *)ifp)->if_drv_flags; } int if_setdrvflags(if_t ifp, int flags) { ((struct ifnet *)ifp)->if_drv_flags = flags; return (0); } int if_setflags(if_t ifp, int flags) { ((struct ifnet *)ifp)->if_flags = flags; return (0); } int if_setflagbits(if_t ifp, int set, int clear) { ((struct ifnet *)ifp)->if_flags |= set; ((struct ifnet *)ifp)->if_flags &= ~clear; return (0); } int if_getflags(if_t ifp) { return ((struct ifnet *)ifp)->if_flags; } int if_clearhwassist(if_t ifp) { ((struct ifnet *)ifp)->if_hwassist = 0; return (0); } int if_sethwassistbits(if_t ifp, int toset, int toclear) { ((struct ifnet *)ifp)->if_hwassist |= toset; ((struct ifnet *)ifp)->if_hwassist &= ~toclear; return (0); } int if_sethwassist(if_t ifp, int hwassist_bit) { ((struct ifnet *)ifp)->if_hwassist = hwassist_bit; return (0); } int if_gethwassist(if_t ifp) { return ((struct ifnet *)ifp)->if_hwassist; } int if_setmtu(if_t ifp, int mtu) { ((struct ifnet *)ifp)->if_mtu = mtu; return (0); } int if_getmtu(if_t ifp) { return ((struct ifnet *)ifp)->if_mtu; } int if_setsoftc(if_t ifp, void *softc) { ((struct ifnet *)ifp)->if_softc = softc; return (0); } void * if_getsoftc(if_t ifp) { return ((struct ifnet *)ifp)->if_softc; } void if_setrcvif(struct mbuf *m, if_t ifp) { m->m_pkthdr.rcvif = (struct ifnet *)ifp; } void if_setvtag(struct mbuf *m, uint16_t tag) { m->m_pkthdr.ether_vtag = tag; } uint16_t if_getvtag(struct mbuf *m) { return (m->m_pkthdr.ether_vtag); } /* Statistics */ int if_incipackets(if_t ifp, int pkts) { ((struct ifnet *)ifp)->if_ipackets += pkts; return (0); } int if_incopackets(if_t ifp, int pkts) { ((struct ifnet *)ifp)->if_opackets += pkts; return (0); } int if_incierrors(if_t ifp, int ierrors) { ((struct ifnet *)ifp)->if_ierrors += ierrors; return (0); } int if_setierrors(if_t ifp, int ierrors) { ((struct ifnet *)ifp)->if_ierrors = ierrors; return (0); } int if_setoerrors(if_t ifp, int oerrors) { ((struct ifnet *)ifp)->if_oerrors = oerrors; return (0); } int if_incoerrors(if_t ifp, int oerrors) { ((struct ifnet *)ifp)->if_oerrors += oerrors; return (0); } int if_inciqdrops(if_t ifp, int val) { ((struct ifnet *)ifp)->if_iqdrops += val; return (0); } int if_setcollisions(if_t ifp, int collisions) { ((struct ifnet *)ifp)->if_collisions = collisions; return (0); } int if_inccollisions(if_t ifp, int collisions) { ((struct ifnet *)ifp)->if_collisions += collisions; return (0); } int if_setipackets(if_t ifp, int pkts) { ((struct ifnet *)ifp)->if_ipackets = pkts; return (0); } int if_setopackets(if_t ifp, int pkts) { ((struct ifnet *)ifp)->if_opackets = pkts; return (0); } int if_incobytes(if_t ifp, int bytes) { ((struct ifnet *)ifp)->if_obytes += bytes; return (0); } int if_setibytes(if_t ifp, int bytes) { ((struct ifnet *)ifp)->if_ibytes = bytes; return (0); } int if_setobytes(if_t ifp, int bytes) { ((struct ifnet *)ifp)->if_obytes = bytes; return (0); } int if_sendq_empty(if_t ifp) { return IFQ_DRV_IS_EMPTY(&((struct ifnet *)ifp)->if_snd); } int if_getiqdrops(if_t ifp) { return ((struct ifnet *)ifp)->if_iqdrops; } int if_incimcasts(if_t ifp, int mcast) { ((struct ifnet *)ifp)->if_imcasts += mcast; return (0); } int if_incomcasts(if_t ifp, int mcast) { ((struct ifnet *)ifp)->if_omcasts += mcast; return (0); } int if_setimcasts(if_t ifp, int mcast) { ((struct ifnet *)ifp)->if_imcasts = mcast; return (0); } struct ifaddr * if_getifaddr(if_t ifp) { return ((struct ifnet *)ifp)->if_addr; } int if_getamcount(if_t ifp) { return ((struct ifnet *)ifp)->if_amcount; } int if_setsendqready(if_t ifp) { IFQ_SET_READY(&((struct ifnet *)ifp)->if_snd); return (0); } int if_setsendqlen(if_t ifp, int tx_desc_count) { IFQ_SET_MAXLEN(&((struct ifnet *)ifp)->if_snd, tx_desc_count); ((struct ifnet *)ifp)->if_snd.ifq_drv_maxlen = tx_desc_count; return (0); } int if_vlantrunkinuse(if_t ifp) { return ((struct ifnet *)ifp)->if_vlantrunk != NULL?1:0; } int if_input(if_t ifp, struct mbuf* sendmp) { (*((struct ifnet *)ifp)->if_input)((struct ifnet *)ifp, sendmp); return (0); } /* XXX */ #ifndef ETH_ADDR_LEN #define ETH_ADDR_LEN 6 #endif int if_setupmultiaddr(if_t ifp, void *mta, int *cnt, int max) { struct ifmultiaddr *ifma; uint8_t *lmta = (uint8_t *)mta; int mcnt = 0; TAILQ_FOREACH(ifma, &((struct ifnet *)ifp)->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_LINK) continue; if (mcnt == max) break; bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr), &lmta[mcnt * ETH_ADDR_LEN], ETH_ADDR_LEN); mcnt++; } *cnt = mcnt; return (0); } int if_multiaddr_array(if_t ifp, void *mta, int *cnt, int max) { int error; if_maddr_rlock(ifp); error = if_setupmultiaddr(ifp, mta, cnt, max); if_maddr_runlock(ifp); return (error); } int if_multiaddr_count(if_t ifp, int max) { struct ifmultiaddr *ifma; int count; count = 0; if_maddr_rlock(ifp); TAILQ_FOREACH(ifma, &((struct ifnet *)ifp)->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_LINK) continue; count++; if (count == max) break; } if_maddr_runlock(ifp); return (count); } struct mbuf * if_dequeue(if_t ifp) { struct mbuf *m; IFQ_DRV_DEQUEUE(&((struct ifnet *)ifp)->if_snd, m); return (m); } int if_sendq_prepend(if_t ifp, struct mbuf *m) { IFQ_DRV_PREPEND(&((struct ifnet *)ifp)->if_snd, m); return (0); } int if_setifheaderlen(if_t ifp, int len) { ((struct ifnet *)ifp)->if_hdrlen = len; return (0); } caddr_t if_getlladdr(if_t ifp) { return (IF_LLADDR((struct ifnet *)ifp)); } void * if_gethandle(u_char type) { return (if_alloc(type)); } void if_bpfmtap(if_t ifh, struct mbuf *m) { struct ifnet *ifp = (struct ifnet *)ifh; BPF_MTAP(ifp, m); } void if_etherbpfmtap(if_t ifh, struct mbuf *m) { struct ifnet *ifp = (struct ifnet *)ifh; ETHER_BPF_MTAP(ifp, m); } void if_vlancap(if_t ifh) { struct ifnet *ifp = (struct ifnet *)ifh; VLAN_CAPABILITIES(ifp); } void if_setinitfn(if_t ifp, void (*init_fn)(void *)) { ((struct ifnet *)ifp)->if_init = init_fn; } void if_setioctlfn(if_t ifp, int (*ioctl_fn)(if_t, u_long, caddr_t)) { ((struct ifnet *)ifp)->if_ioctl = (void *)ioctl_fn; } void if_setstartfn(if_t ifp, void (*start_fn)(if_t)) { ((struct ifnet *)ifp)->if_start = (void *)start_fn; } void if_settransmitfn(if_t ifp, if_transmit_fn_t start_fn) { ((struct ifnet *)ifp)->if_transmit = start_fn; } void if_setqflushfn(if_t ifp, if_qflush_fn_t flush_fn) { ((struct ifnet *)ifp)->if_qflush = flush_fn; } /* Revisit these - These are inline functions originally. */ int drbr_inuse_drv(if_t ifh, struct buf_ring *br) { return drbr_inuse_drv(ifh, br); } struct mbuf* drbr_dequeue_drv(if_t ifh, struct buf_ring *br) { return drbr_dequeue(ifh, br); } int drbr_needs_enqueue_drv(if_t ifh, struct buf_ring *br) { return drbr_needs_enqueue(ifh, br); } int drbr_enqueue_drv(if_t ifh, struct buf_ring *br, struct mbuf *m) { return drbr_enqueue(ifh, br, m); } Index: head/sys/net/if_lagg.c =================================================================== --- head/sys/net/if_lagg.c (revision 271550) +++ head/sys/net/if_lagg.c (revision 271551) @@ -1,2016 +1,2021 @@ /* $OpenBSD: if_trunk.c,v 1.30 2007/01/31 06:20:19 reyk Exp $ */ /* * Copyright (c) 2005, 2006 Reyk Floeter * Copyright (c) 2007 Andrew Thompson * * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(INET) || defined(INET6) #include #include #endif #ifdef INET #include #include #endif #ifdef INET6 #include #include #include #endif #include #include #include /* Special flags we should propagate to the lagg ports. */ static struct { int flag; int (*func)(struct ifnet *, int); } lagg_pflags[] = { {IFF_PROMISC, ifpromisc}, {IFF_ALLMULTI, if_allmulti}, {0, NULL} }; SLIST_HEAD(__trhead, lagg_softc) lagg_list; /* list of laggs */ static struct mtx lagg_list_mtx; eventhandler_tag lagg_detach_cookie = NULL; static int lagg_clone_create(struct if_clone *, int, caddr_t); static void lagg_clone_destroy(struct ifnet *); static struct if_clone *lagg_cloner; static const char laggname[] = "lagg"; static void lagg_lladdr(struct lagg_softc *, uint8_t *); static void lagg_capabilities(struct lagg_softc *); static void lagg_port_lladdr(struct lagg_port *, uint8_t *); static void lagg_port_setlladdr(void *, int); static int lagg_port_create(struct lagg_softc *, struct ifnet *); static int lagg_port_destroy(struct lagg_port *, int); static struct mbuf *lagg_input(struct ifnet *, struct mbuf *); static void lagg_linkstate(struct lagg_softc *); static void lagg_port_state(struct ifnet *, int); static int lagg_port_ioctl(struct ifnet *, u_long, caddr_t); static int lagg_port_output(struct ifnet *, struct mbuf *, const struct sockaddr *, struct route *); static void lagg_port_ifdetach(void *arg __unused, struct ifnet *); #ifdef LAGG_PORT_STACKING static int lagg_port_checkstacking(struct lagg_softc *); #endif static void lagg_port2req(struct lagg_port *, struct lagg_reqport *); static void lagg_init(void *); static void lagg_stop(struct lagg_softc *); static int lagg_ioctl(struct ifnet *, u_long, caddr_t); static int lagg_ether_setmulti(struct lagg_softc *); static int lagg_ether_cmdmulti(struct lagg_port *, int); static int lagg_setflag(struct lagg_port *, int, int, int (*func)(struct ifnet *, int)); static int lagg_setflags(struct lagg_port *, int status); static int lagg_transmit(struct ifnet *, struct mbuf *); static void lagg_qflush(struct ifnet *); static int lagg_media_change(struct ifnet *); static void lagg_media_status(struct ifnet *, struct ifmediareq *); static struct lagg_port *lagg_link_active(struct lagg_softc *, struct lagg_port *); static const void *lagg_gethdr(struct mbuf *, u_int, u_int, void *); static int lagg_sysctl_active(SYSCTL_HANDLER_ARGS); /* Simple round robin */ static int lagg_rr_attach(struct lagg_softc *); static int lagg_rr_detach(struct lagg_softc *); static int lagg_rr_start(struct lagg_softc *, struct mbuf *); static struct mbuf *lagg_rr_input(struct lagg_softc *, struct lagg_port *, struct mbuf *); /* Active failover */ static int lagg_fail_attach(struct lagg_softc *); static int lagg_fail_detach(struct lagg_softc *); static int lagg_fail_start(struct lagg_softc *, struct mbuf *); static struct mbuf *lagg_fail_input(struct lagg_softc *, struct lagg_port *, struct mbuf *); /* Loadbalancing */ static int lagg_lb_attach(struct lagg_softc *); static int lagg_lb_detach(struct lagg_softc *); static int lagg_lb_port_create(struct lagg_port *); static void lagg_lb_port_destroy(struct lagg_port *); static int lagg_lb_start(struct lagg_softc *, struct mbuf *); static struct mbuf *lagg_lb_input(struct lagg_softc *, struct lagg_port *, struct mbuf *); static int lagg_lb_porttable(struct lagg_softc *, struct lagg_port *); /* 802.3ad LACP */ static int lagg_lacp_attach(struct lagg_softc *); static int lagg_lacp_detach(struct lagg_softc *); static int lagg_lacp_start(struct lagg_softc *, struct mbuf *); static struct mbuf *lagg_lacp_input(struct lagg_softc *, struct lagg_port *, struct mbuf *); static void lagg_lacp_lladdr(struct lagg_softc *); static void lagg_callout(void *); /* lagg protocol table */ static const struct { int ti_proto; int (*ti_attach)(struct lagg_softc *); } lagg_protos[] = { { LAGG_PROTO_ROUNDROBIN, lagg_rr_attach }, { LAGG_PROTO_FAILOVER, lagg_fail_attach }, { LAGG_PROTO_LOADBALANCE, lagg_lb_attach }, { LAGG_PROTO_ETHERCHANNEL, lagg_lb_attach }, { LAGG_PROTO_LACP, lagg_lacp_attach }, { LAGG_PROTO_NONE, NULL } }; SYSCTL_DECL(_net_link); SYSCTL_NODE(_net_link, OID_AUTO, lagg, CTLFLAG_RW, 0, "Link Aggregation"); static int lagg_failover_rx_all = 0; /* Allow input on any failover links */ SYSCTL_INT(_net_link_lagg, OID_AUTO, failover_rx_all, CTLFLAG_RW, &lagg_failover_rx_all, 0, "Accept input from any interface in a failover lagg"); static int def_use_flowid = 1; /* Default value for using M_FLOWID */ SYSCTL_INT(_net_link_lagg, OID_AUTO, default_use_flowid, CTLFLAG_RWTUN, &def_use_flowid, 0, "Default setting for using flow id for load sharing"); static int def_flowid_shift = 16; /* Default value for using M_FLOWID */ SYSCTL_INT(_net_link_lagg, OID_AUTO, default_flowid_shift, CTLFLAG_RWTUN, &def_flowid_shift, 0, "Default setting for flowid shift for load sharing"); static int lagg_modevent(module_t mod, int type, void *data) { switch (type) { case MOD_LOAD: mtx_init(&lagg_list_mtx, "if_lagg list", NULL, MTX_DEF); SLIST_INIT(&lagg_list); lagg_cloner = if_clone_simple(laggname, lagg_clone_create, lagg_clone_destroy, 0); lagg_input_p = lagg_input; lagg_linkstate_p = lagg_port_state; lagg_detach_cookie = EVENTHANDLER_REGISTER( ifnet_departure_event, lagg_port_ifdetach, NULL, EVENTHANDLER_PRI_ANY); break; case MOD_UNLOAD: EVENTHANDLER_DEREGISTER(ifnet_departure_event, lagg_detach_cookie); if_clone_detach(lagg_cloner); lagg_input_p = NULL; lagg_linkstate_p = NULL; mtx_destroy(&lagg_list_mtx); break; default: return (EOPNOTSUPP); } return (0); } static moduledata_t lagg_mod = { "if_lagg", lagg_modevent, 0 }; DECLARE_MODULE(if_lagg, lagg_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); MODULE_VERSION(if_lagg, 1); /* * This routine is run via an vlan * config EVENT */ static void lagg_register_vlan(void *arg, struct ifnet *ifp, u_int16_t vtag) { struct lagg_softc *sc = ifp->if_softc; struct lagg_port *lp; struct rm_priotracker tracker; if (ifp->if_softc != arg) /* Not our event */ return; LAGG_RLOCK(sc, &tracker); if (!SLIST_EMPTY(&sc->sc_ports)) { SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) EVENTHANDLER_INVOKE(vlan_config, lp->lp_ifp, vtag); } LAGG_RUNLOCK(sc, &tracker); } /* * This routine is run via an vlan * unconfig EVENT */ static void lagg_unregister_vlan(void *arg, struct ifnet *ifp, u_int16_t vtag) { struct lagg_softc *sc = ifp->if_softc; struct lagg_port *lp; struct rm_priotracker tracker; if (ifp->if_softc != arg) /* Not our event */ return; LAGG_RLOCK(sc, &tracker); if (!SLIST_EMPTY(&sc->sc_ports)) { SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) EVENTHANDLER_INVOKE(vlan_unconfig, lp->lp_ifp, vtag); } LAGG_RUNLOCK(sc, &tracker); } static int lagg_clone_create(struct if_clone *ifc, int unit, caddr_t params) { struct lagg_softc *sc; struct ifnet *ifp; int i, error = 0; static const u_char eaddr[6]; /* 00:00:00:00:00:00 */ struct sysctl_oid *oid; char num[14]; /* sufficient for 32 bits */ sc = malloc(sizeof(*sc), M_DEVBUF, M_WAITOK|M_ZERO); ifp = sc->sc_ifp = if_alloc(IFT_ETHER); if (ifp == NULL) { free(sc, M_DEVBUF); return (ENOSPC); } sc->sc_ipackets = counter_u64_alloc(M_WAITOK); sc->sc_opackets = counter_u64_alloc(M_WAITOK); sc->sc_ibytes = counter_u64_alloc(M_WAITOK); sc->sc_obytes = counter_u64_alloc(M_WAITOK); sysctl_ctx_init(&sc->ctx); snprintf(num, sizeof(num), "%u", unit); sc->use_flowid = def_use_flowid; sc->flowid_shift = def_flowid_shift; sc->sc_oid = oid = SYSCTL_ADD_NODE(&sc->ctx, &SYSCTL_NODE_CHILDREN(_net_link, lagg), OID_AUTO, num, CTLFLAG_RD, NULL, ""); SYSCTL_ADD_INT(&sc->ctx, SYSCTL_CHILDREN(oid), OID_AUTO, "use_flowid", CTLTYPE_INT|CTLFLAG_RW, &sc->use_flowid, sc->use_flowid, "Use flow id for load sharing"); SYSCTL_ADD_INT(&sc->ctx, SYSCTL_CHILDREN(oid), OID_AUTO, "flowid_shift", CTLTYPE_INT|CTLFLAG_RW, &sc->flowid_shift, sc->flowid_shift, "Shift flowid bits to prevent multiqueue collisions"); SYSCTL_ADD_INT(&sc->ctx, SYSCTL_CHILDREN(oid), OID_AUTO, "count", CTLTYPE_INT|CTLFLAG_RD, &sc->sc_count, sc->sc_count, "Total number of ports"); SYSCTL_ADD_PROC(&sc->ctx, SYSCTL_CHILDREN(oid), OID_AUTO, "active", CTLTYPE_INT|CTLFLAG_RD, sc, 0, lagg_sysctl_active, "I", "Total number of active ports"); SYSCTL_ADD_INT(&sc->ctx, SYSCTL_CHILDREN(oid), OID_AUTO, "flapping", CTLTYPE_INT|CTLFLAG_RD, &sc->sc_flapping, sc->sc_flapping, "Total number of port change events"); /* Hash all layers by default */ sc->sc_flags = LAGG_F_HASHL2|LAGG_F_HASHL3|LAGG_F_HASHL4; sc->sc_proto = LAGG_PROTO_NONE; for (i = 0; lagg_protos[i].ti_proto != LAGG_PROTO_NONE; i++) { if (lagg_protos[i].ti_proto == LAGG_PROTO_DEFAULT) { sc->sc_proto = lagg_protos[i].ti_proto; if ((error = lagg_protos[i].ti_attach(sc)) != 0) { if_free(ifp); free(sc, M_DEVBUF); return (error); } break; } } LAGG_LOCK_INIT(sc); LAGG_CALLOUT_LOCK_INIT(sc); SLIST_INIT(&sc->sc_ports); TASK_INIT(&sc->sc_lladdr_task, 0, lagg_port_setlladdr, sc); /* * This uses the callout lock rather than the rmlock; one can't * hold said rmlock during SWI. */ callout_init_mtx(&sc->sc_callout, &sc->sc_call_mtx, 0); /* Initialise pseudo media types */ ifmedia_init(&sc->sc_media, 0, lagg_media_change, lagg_media_status); ifmedia_add(&sc->sc_media, IFM_ETHER | IFM_AUTO, 0, NULL); ifmedia_set(&sc->sc_media, IFM_ETHER | IFM_AUTO); if_initname(ifp, laggname, unit); ifp->if_softc = sc; ifp->if_transmit = lagg_transmit; ifp->if_qflush = lagg_qflush; ifp->if_init = lagg_init; ifp->if_ioctl = lagg_ioctl; ifp->if_flags = IFF_SIMPLEX | IFF_BROADCAST | IFF_MULTICAST; ifp->if_capenable = ifp->if_capabilities = IFCAP_HWSTATS; /* * Attach as an ordinary ethernet device, children will be attached * as special device IFT_IEEE8023ADLAG. */ ether_ifattach(ifp, eaddr); sc->vlan_attach = EVENTHANDLER_REGISTER(vlan_config, lagg_register_vlan, sc, EVENTHANDLER_PRI_FIRST); sc->vlan_detach = EVENTHANDLER_REGISTER(vlan_unconfig, lagg_unregister_vlan, sc, EVENTHANDLER_PRI_FIRST); /* Insert into the global list of laggs */ mtx_lock(&lagg_list_mtx); SLIST_INSERT_HEAD(&lagg_list, sc, sc_entries); mtx_unlock(&lagg_list_mtx); callout_reset(&sc->sc_callout, hz, lagg_callout, sc); return (0); } static void lagg_clone_destroy(struct ifnet *ifp) { struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc; struct lagg_port *lp; LAGG_WLOCK(sc); lagg_stop(sc); ifp->if_flags &= ~IFF_UP; EVENTHANDLER_DEREGISTER(vlan_config, sc->vlan_attach); EVENTHANDLER_DEREGISTER(vlan_unconfig, sc->vlan_detach); /* Shutdown and remove lagg ports */ while ((lp = SLIST_FIRST(&sc->sc_ports)) != NULL) lagg_port_destroy(lp, 1); /* Unhook the aggregation protocol */ if (sc->sc_detach != NULL) (*sc->sc_detach)(sc); LAGG_WUNLOCK(sc); sysctl_ctx_free(&sc->ctx); ifmedia_removeall(&sc->sc_media); ether_ifdetach(ifp); if_free(ifp); /* This grabs sc_callout_mtx, serialising it correctly */ callout_drain(&sc->sc_callout); /* At this point it's drained; we can free this */ counter_u64_free(sc->sc_ipackets); counter_u64_free(sc->sc_opackets); counter_u64_free(sc->sc_ibytes); counter_u64_free(sc->sc_obytes); mtx_lock(&lagg_list_mtx); SLIST_REMOVE(&lagg_list, sc, lagg_softc, sc_entries); mtx_unlock(&lagg_list_mtx); taskqueue_drain(taskqueue_swi, &sc->sc_lladdr_task); LAGG_LOCK_DESTROY(sc); LAGG_CALLOUT_LOCK_DESTROY(sc); free(sc, M_DEVBUF); } static void lagg_lladdr(struct lagg_softc *sc, uint8_t *lladdr) { struct ifnet *ifp = sc->sc_ifp; if (memcmp(lladdr, IF_LLADDR(ifp), ETHER_ADDR_LEN) == 0) return; bcopy(lladdr, IF_LLADDR(ifp), ETHER_ADDR_LEN); /* Let the protocol know the MAC has changed */ if (sc->sc_lladdr != NULL) (*sc->sc_lladdr)(sc); EVENTHANDLER_INVOKE(iflladdr_event, ifp); } static void lagg_capabilities(struct lagg_softc *sc) { struct lagg_port *lp; int cap = ~0, ena = ~0; u_long hwa = ~0UL; - u_int hw_tsomax = IF_HW_TSOMAX_DEFAULT_VALUE(); +#if defined(INET) || defined(INET6) + u_int hw_tsomax = IP_MAXPACKET; /* Initialize to the maximum value. */ +#else + u_int hw_tsomax = ~0; /* if_hw_tsomax is only for INET/INET6, but.. */ +#endif LAGG_WLOCK_ASSERT(sc); /* Get capabilities from the lagg ports */ SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { cap &= lp->lp_ifp->if_capabilities; ena &= lp->lp_ifp->if_capenable; hwa &= lp->lp_ifp->if_hwassist; - /* Set to the common value of the lagg ports. */ - hw_tsomax = if_hw_tsomax_common(hw_tsomax, - lp->lp_ifp->if_hw_tsomax); + /* Set to the minimum value of the lagg ports. */ + if (lp->lp_ifp->if_hw_tsomax < hw_tsomax && + lp->lp_ifp->if_hw_tsomax > 0) + hw_tsomax = lp->lp_ifp->if_hw_tsomax; } cap = (cap == ~0 ? 0 : cap); ena = (ena == ~0 ? 0 : ena); hwa = (hwa == ~0 ? 0 : hwa); if (sc->sc_ifp->if_capabilities != cap || sc->sc_ifp->if_capenable != ena || sc->sc_ifp->if_hwassist != hwa || sc->sc_ifp->if_hw_tsomax != hw_tsomax) { sc->sc_ifp->if_capabilities = cap; sc->sc_ifp->if_capenable = ena; sc->sc_ifp->if_hwassist = hwa; sc->sc_ifp->if_hw_tsomax = hw_tsomax; getmicrotime(&sc->sc_ifp->if_lastchange); if (sc->sc_ifflags & IFF_DEBUG) if_printf(sc->sc_ifp, "capabilities 0x%08x enabled 0x%08x\n", cap, ena); } } static void lagg_port_lladdr(struct lagg_port *lp, uint8_t *lladdr) { struct lagg_softc *sc = lp->lp_softc; struct ifnet *ifp = lp->lp_ifp; struct lagg_llq *llq; int pending = 0; LAGG_WLOCK_ASSERT(sc); if (lp->lp_detaching || memcmp(lladdr, IF_LLADDR(ifp), ETHER_ADDR_LEN) == 0) return; /* Check to make sure its not already queued to be changed */ SLIST_FOREACH(llq, &sc->sc_llq_head, llq_entries) { if (llq->llq_ifp == ifp) { pending = 1; break; } } if (!pending) { llq = malloc(sizeof(struct lagg_llq), M_DEVBUF, M_NOWAIT); if (llq == NULL) /* XXX what to do */ return; } /* Update the lladdr even if pending, it may have changed */ llq->llq_ifp = ifp; bcopy(lladdr, llq->llq_lladdr, ETHER_ADDR_LEN); if (!pending) SLIST_INSERT_HEAD(&sc->sc_llq_head, llq, llq_entries); taskqueue_enqueue(taskqueue_swi, &sc->sc_lladdr_task); } /* * Set the interface MAC address from a taskqueue to avoid a LOR. */ static void lagg_port_setlladdr(void *arg, int pending) { struct lagg_softc *sc = (struct lagg_softc *)arg; struct lagg_llq *llq, *head; struct ifnet *ifp; int error; /* Grab a local reference of the queue and remove it from the softc */ LAGG_WLOCK(sc); head = SLIST_FIRST(&sc->sc_llq_head); SLIST_FIRST(&sc->sc_llq_head) = NULL; LAGG_WUNLOCK(sc); /* * Traverse the queue and set the lladdr on each ifp. It is safe to do * unlocked as we have the only reference to it. */ for (llq = head; llq != NULL; llq = head) { ifp = llq->llq_ifp; /* Set the link layer address */ CURVNET_SET(ifp->if_vnet); error = if_setlladdr(ifp, llq->llq_lladdr, ETHER_ADDR_LEN); CURVNET_RESTORE(); if (error) printf("%s: setlladdr failed on %s\n", __func__, ifp->if_xname); head = SLIST_NEXT(llq, llq_entries); free(llq, M_DEVBUF); } } static int lagg_port_create(struct lagg_softc *sc, struct ifnet *ifp) { struct lagg_softc *sc_ptr; struct lagg_port *lp; int error = 0; LAGG_WLOCK_ASSERT(sc); /* Limit the maximal number of lagg ports */ if (sc->sc_count >= LAGG_MAX_PORTS) return (ENOSPC); /* Check if port has already been associated to a lagg */ if (ifp->if_lagg != NULL) { /* Port is already in the current lagg? */ lp = (struct lagg_port *)ifp->if_lagg; if (lp->lp_softc == sc) return (EEXIST); return (EBUSY); } /* XXX Disallow non-ethernet interfaces (this should be any of 802) */ if (ifp->if_type != IFT_ETHER) return (EPROTONOSUPPORT); #ifdef INET6 /* * The member interface should not have inet6 address because * two interfaces with a valid link-local scope zone must not be * merged in any form. This restriction is needed to * prevent violation of link-local scope zone. Attempts to * add a member interface which has inet6 addresses triggers * removal of all inet6 addresses on the member interface. */ SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { if (in6ifa_llaonifp(lp->lp_ifp)) { in6_ifdetach(lp->lp_ifp); if_printf(sc->sc_ifp, "IPv6 addresses on %s have been removed " "before adding it as a member to prevent " "IPv6 address scope violation.\n", lp->lp_ifp->if_xname); } } if (in6ifa_llaonifp(ifp)) { in6_ifdetach(ifp); if_printf(sc->sc_ifp, "IPv6 addresses on %s have been removed " "before adding it as a member to prevent " "IPv6 address scope violation.\n", ifp->if_xname); } #endif /* Allow the first Ethernet member to define the MTU */ if (SLIST_EMPTY(&sc->sc_ports)) sc->sc_ifp->if_mtu = ifp->if_mtu; else if (sc->sc_ifp->if_mtu != ifp->if_mtu) { if_printf(sc->sc_ifp, "invalid MTU for %s\n", ifp->if_xname); return (EINVAL); } if ((lp = malloc(sizeof(struct lagg_port), M_DEVBUF, M_NOWAIT|M_ZERO)) == NULL) return (ENOMEM); /* Check if port is a stacked lagg */ mtx_lock(&lagg_list_mtx); SLIST_FOREACH(sc_ptr, &lagg_list, sc_entries) { if (ifp == sc_ptr->sc_ifp) { mtx_unlock(&lagg_list_mtx); free(lp, M_DEVBUF); return (EINVAL); /* XXX disable stacking for the moment, its untested */ #ifdef LAGG_PORT_STACKING lp->lp_flags |= LAGG_PORT_STACK; if (lagg_port_checkstacking(sc_ptr) >= LAGG_MAX_STACKING) { mtx_unlock(&lagg_list_mtx); free(lp, M_DEVBUF); return (E2BIG); } #endif } } mtx_unlock(&lagg_list_mtx); /* Change the interface type */ lp->lp_iftype = ifp->if_type; ifp->if_type = IFT_IEEE8023ADLAG; ifp->if_lagg = lp; lp->lp_ioctl = ifp->if_ioctl; ifp->if_ioctl = lagg_port_ioctl; lp->lp_output = ifp->if_output; ifp->if_output = lagg_port_output; lp->lp_ifp = ifp; lp->lp_softc = sc; /* Save port link layer address */ bcopy(IF_LLADDR(ifp), lp->lp_lladdr, ETHER_ADDR_LEN); if (SLIST_EMPTY(&sc->sc_ports)) { sc->sc_primary = lp; lagg_lladdr(sc, IF_LLADDR(ifp)); } else { /* Update link layer address for this port */ lagg_port_lladdr(lp, IF_LLADDR(sc->sc_ifp)); } /* Insert into the list of ports */ SLIST_INSERT_HEAD(&sc->sc_ports, lp, lp_entries); sc->sc_count++; /* Update lagg capabilities */ lagg_capabilities(sc); lagg_linkstate(sc); /* Add multicast addresses and interface flags to this port */ lagg_ether_cmdmulti(lp, 1); lagg_setflags(lp, 1); if (sc->sc_port_create != NULL) error = (*sc->sc_port_create)(lp); if (error) { /* remove the port again, without calling sc_port_destroy */ lagg_port_destroy(lp, 0); return (error); } return (error); } #ifdef LAGG_PORT_STACKING static int lagg_port_checkstacking(struct lagg_softc *sc) { struct lagg_softc *sc_ptr; struct lagg_port *lp; int m = 0; LAGG_WLOCK_ASSERT(sc); SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { if (lp->lp_flags & LAGG_PORT_STACK) { sc_ptr = (struct lagg_softc *)lp->lp_ifp->if_softc; m = MAX(m, lagg_port_checkstacking(sc_ptr)); } } return (m + 1); } #endif static int lagg_port_destroy(struct lagg_port *lp, int runpd) { struct lagg_softc *sc = lp->lp_softc; struct lagg_port *lp_ptr; struct lagg_llq *llq; struct ifnet *ifp = lp->lp_ifp; LAGG_WLOCK_ASSERT(sc); if (runpd && sc->sc_port_destroy != NULL) (*sc->sc_port_destroy)(lp); /* * Remove multicast addresses and interface flags from this port and * reset the MAC address, skip if the interface is being detached. */ if (!lp->lp_detaching) { lagg_ether_cmdmulti(lp, 0); lagg_setflags(lp, 0); lagg_port_lladdr(lp, lp->lp_lladdr); } /* Restore interface */ ifp->if_type = lp->lp_iftype; ifp->if_ioctl = lp->lp_ioctl; ifp->if_output = lp->lp_output; ifp->if_lagg = NULL; /* Finally, remove the port from the lagg */ SLIST_REMOVE(&sc->sc_ports, lp, lagg_port, lp_entries); sc->sc_count--; /* Update the primary interface */ if (lp == sc->sc_primary) { uint8_t lladdr[ETHER_ADDR_LEN]; if ((lp_ptr = SLIST_FIRST(&sc->sc_ports)) == NULL) { bzero(&lladdr, ETHER_ADDR_LEN); } else { bcopy(lp_ptr->lp_lladdr, lladdr, ETHER_ADDR_LEN); } lagg_lladdr(sc, lladdr); sc->sc_primary = lp_ptr; /* Update link layer address for each port */ SLIST_FOREACH(lp_ptr, &sc->sc_ports, lp_entries) lagg_port_lladdr(lp_ptr, lladdr); } /* Remove any pending lladdr changes from the queue */ if (lp->lp_detaching) { SLIST_FOREACH(llq, &sc->sc_llq_head, llq_entries) { if (llq->llq_ifp == ifp) { SLIST_REMOVE(&sc->sc_llq_head, llq, lagg_llq, llq_entries); free(llq, M_DEVBUF); break; /* Only appears once */ } } } if (lp->lp_ifflags) if_printf(ifp, "%s: lp_ifflags unclean\n", __func__); free(lp, M_DEVBUF); /* Update lagg capabilities */ lagg_capabilities(sc); lagg_linkstate(sc); return (0); } static int lagg_port_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct lagg_reqport *rp = (struct lagg_reqport *)data; struct lagg_softc *sc; struct lagg_port *lp = NULL; int error = 0; struct rm_priotracker tracker; /* Should be checked by the caller */ if (ifp->if_type != IFT_IEEE8023ADLAG || (lp = ifp->if_lagg) == NULL || (sc = lp->lp_softc) == NULL) goto fallback; switch (cmd) { case SIOCGLAGGPORT: if (rp->rp_portname[0] == '\0' || ifunit(rp->rp_portname) != ifp) { error = EINVAL; break; } LAGG_RLOCK(sc, &tracker); if ((lp = ifp->if_lagg) == NULL || lp->lp_softc != sc) { error = ENOENT; LAGG_RUNLOCK(sc, &tracker); break; } lagg_port2req(lp, rp); LAGG_RUNLOCK(sc, &tracker); break; case SIOCSIFCAP: if (lp->lp_ioctl == NULL) { error = EINVAL; break; } error = (*lp->lp_ioctl)(ifp, cmd, data); if (error) break; /* Update lagg interface capabilities */ LAGG_WLOCK(sc); lagg_capabilities(sc); LAGG_WUNLOCK(sc); break; case SIOCSIFMTU: /* Do not allow the MTU to be changed once joined */ error = EINVAL; break; default: goto fallback; } return (error); fallback: if (lp->lp_ioctl != NULL) return ((*lp->lp_ioctl)(ifp, cmd, data)); return (EINVAL); } /* * For direct output to child ports. */ static int lagg_port_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro) { struct lagg_port *lp = ifp->if_lagg; switch (dst->sa_family) { case pseudo_AF_HDRCMPLT: case AF_UNSPEC: return ((*lp->lp_output)(ifp, m, dst, ro)); } /* drop any other frames */ m_freem(m); return (ENETDOWN); } static void lagg_port_ifdetach(void *arg __unused, struct ifnet *ifp) { struct lagg_port *lp; struct lagg_softc *sc; if ((lp = ifp->if_lagg) == NULL) return; /* If the ifnet is just being renamed, don't do anything. */ if (ifp->if_flags & IFF_RENAMING) return; sc = lp->lp_softc; LAGG_WLOCK(sc); lp->lp_detaching = 1; lagg_port_destroy(lp, 1); LAGG_WUNLOCK(sc); } static void lagg_port2req(struct lagg_port *lp, struct lagg_reqport *rp) { struct lagg_softc *sc = lp->lp_softc; strlcpy(rp->rp_ifname, sc->sc_ifname, sizeof(rp->rp_ifname)); strlcpy(rp->rp_portname, lp->lp_ifp->if_xname, sizeof(rp->rp_portname)); rp->rp_prio = lp->lp_prio; rp->rp_flags = lp->lp_flags; if (sc->sc_portreq != NULL) (*sc->sc_portreq)(lp, (caddr_t)&rp->rp_psc); /* Add protocol specific flags */ switch (sc->sc_proto) { case LAGG_PROTO_FAILOVER: if (lp == sc->sc_primary) rp->rp_flags |= LAGG_PORT_MASTER; if (lp == lagg_link_active(sc, sc->sc_primary)) rp->rp_flags |= LAGG_PORT_ACTIVE; break; case LAGG_PROTO_ROUNDROBIN: case LAGG_PROTO_LOADBALANCE: case LAGG_PROTO_ETHERCHANNEL: if (LAGG_PORTACTIVE(lp)) rp->rp_flags |= LAGG_PORT_ACTIVE; break; case LAGG_PROTO_LACP: /* LACP has a different definition of active */ if (lacp_isactive(lp)) rp->rp_flags |= LAGG_PORT_ACTIVE; if (lacp_iscollecting(lp)) rp->rp_flags |= LAGG_PORT_COLLECTING; if (lacp_isdistributing(lp)) rp->rp_flags |= LAGG_PORT_DISTRIBUTING; break; } } static void lagg_init(void *xsc) { struct lagg_softc *sc = (struct lagg_softc *)xsc; struct lagg_port *lp; struct ifnet *ifp = sc->sc_ifp; if (ifp->if_drv_flags & IFF_DRV_RUNNING) return; LAGG_WLOCK(sc); ifp->if_drv_flags |= IFF_DRV_RUNNING; /* Update the port lladdrs */ SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) lagg_port_lladdr(lp, IF_LLADDR(ifp)); if (sc->sc_init != NULL) (*sc->sc_init)(sc); LAGG_WUNLOCK(sc); } static void lagg_stop(struct lagg_softc *sc) { struct ifnet *ifp = sc->sc_ifp; LAGG_WLOCK_ASSERT(sc); if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) return; ifp->if_drv_flags &= ~IFF_DRV_RUNNING; if (sc->sc_stop != NULL) (*sc->sc_stop)(sc); } static int lagg_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc; struct lagg_reqall *ra = (struct lagg_reqall *)data; struct lagg_reqport *rp = (struct lagg_reqport *)data, rpbuf; struct lagg_reqflags *rf = (struct lagg_reqflags *)data; struct ifreq *ifr = (struct ifreq *)data; struct lagg_port *lp; struct ifnet *tpif; struct thread *td = curthread; char *buf, *outbuf; int count, buflen, len, error = 0; struct rm_priotracker tracker; bzero(&rpbuf, sizeof(rpbuf)); switch (cmd) { case SIOCGLAGG: LAGG_RLOCK(sc, &tracker); count = 0; SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) count++; buflen = count * sizeof(struct lagg_reqport); LAGG_RUNLOCK(sc, &tracker); outbuf = malloc(buflen, M_TEMP, M_WAITOK | M_ZERO); LAGG_RLOCK(sc, &tracker); ra->ra_proto = sc->sc_proto; if (sc->sc_req != NULL) (*sc->sc_req)(sc, (caddr_t)&ra->ra_psc); count = 0; buf = outbuf; len = min(ra->ra_size, buflen); SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { if (len < sizeof(rpbuf)) break; lagg_port2req(lp, &rpbuf); memcpy(buf, &rpbuf, sizeof(rpbuf)); count++; buf += sizeof(rpbuf); len -= sizeof(rpbuf); } LAGG_RUNLOCK(sc, &tracker); ra->ra_ports = count; ra->ra_size = count * sizeof(rpbuf); error = copyout(outbuf, ra->ra_port, ra->ra_size); free(outbuf, M_TEMP); break; case SIOCSLAGG: error = priv_check(td, PRIV_NET_LAGG); if (error) break; if (ra->ra_proto >= LAGG_PROTO_MAX) { error = EPROTONOSUPPORT; break; } LAGG_WLOCK(sc); if (sc->sc_proto != LAGG_PROTO_NONE) { /* Reset protocol first in case detach unlocks */ sc->sc_proto = LAGG_PROTO_NONE; error = sc->sc_detach(sc); sc->sc_detach = NULL; sc->sc_start = NULL; sc->sc_input = NULL; sc->sc_port_create = NULL; sc->sc_port_destroy = NULL; sc->sc_linkstate = NULL; sc->sc_init = NULL; sc->sc_stop = NULL; sc->sc_lladdr = NULL; sc->sc_req = NULL; sc->sc_portreq = NULL; } else if (sc->sc_input != NULL) { /* Still detaching */ error = EBUSY; } if (error != 0) { LAGG_WUNLOCK(sc); break; } for (int i = 0; i < (sizeof(lagg_protos) / sizeof(lagg_protos[0])); i++) { if (lagg_protos[i].ti_proto == ra->ra_proto) { if (sc->sc_ifflags & IFF_DEBUG) printf("%s: using proto %u\n", sc->sc_ifname, lagg_protos[i].ti_proto); sc->sc_proto = lagg_protos[i].ti_proto; if (sc->sc_proto != LAGG_PROTO_NONE) error = lagg_protos[i].ti_attach(sc); LAGG_WUNLOCK(sc); return (error); } } LAGG_WUNLOCK(sc); error = EPROTONOSUPPORT; break; case SIOCGLAGGFLAGS: rf->rf_flags = sc->sc_flags; break; case SIOCSLAGGHASH: error = priv_check(td, PRIV_NET_LAGG); if (error) break; if ((rf->rf_flags & LAGG_F_HASHMASK) == 0) { error = EINVAL; break; } LAGG_WLOCK(sc); sc->sc_flags &= ~LAGG_F_HASHMASK; sc->sc_flags |= rf->rf_flags & LAGG_F_HASHMASK; LAGG_WUNLOCK(sc); break; case SIOCGLAGGPORT: if (rp->rp_portname[0] == '\0' || (tpif = ifunit(rp->rp_portname)) == NULL) { error = EINVAL; break; } LAGG_RLOCK(sc, &tracker); if ((lp = (struct lagg_port *)tpif->if_lagg) == NULL || lp->lp_softc != sc) { error = ENOENT; LAGG_RUNLOCK(sc, &tracker); break; } lagg_port2req(lp, rp); LAGG_RUNLOCK(sc, &tracker); break; case SIOCSLAGGPORT: error = priv_check(td, PRIV_NET_LAGG); if (error) break; if (rp->rp_portname[0] == '\0' || (tpif = ifunit(rp->rp_portname)) == NULL) { error = EINVAL; break; } LAGG_WLOCK(sc); error = lagg_port_create(sc, tpif); LAGG_WUNLOCK(sc); break; case SIOCSLAGGDELPORT: error = priv_check(td, PRIV_NET_LAGG); if (error) break; if (rp->rp_portname[0] == '\0' || (tpif = ifunit(rp->rp_portname)) == NULL) { error = EINVAL; break; } LAGG_WLOCK(sc); if ((lp = (struct lagg_port *)tpif->if_lagg) == NULL || lp->lp_softc != sc) { error = ENOENT; LAGG_WUNLOCK(sc); break; } error = lagg_port_destroy(lp, 1); LAGG_WUNLOCK(sc); break; case SIOCSIFFLAGS: /* Set flags on ports too */ LAGG_WLOCK(sc); SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { lagg_setflags(lp, 1); } LAGG_WUNLOCK(sc); if (!(ifp->if_flags & IFF_UP) && (ifp->if_drv_flags & IFF_DRV_RUNNING)) { /* * If interface is marked down and it is running, * then stop and disable it. */ LAGG_WLOCK(sc); lagg_stop(sc); LAGG_WUNLOCK(sc); } else if ((ifp->if_flags & IFF_UP) && !(ifp->if_drv_flags & IFF_DRV_RUNNING)) { /* * If interface is marked up and it is stopped, then * start it. */ (*ifp->if_init)(sc); } break; case SIOCADDMULTI: case SIOCDELMULTI: LAGG_WLOCK(sc); error = lagg_ether_setmulti(sc); LAGG_WUNLOCK(sc); break; case SIOCSIFMEDIA: case SIOCGIFMEDIA: error = ifmedia_ioctl(ifp, ifr, &sc->sc_media, cmd); break; case SIOCSIFCAP: case SIOCSIFMTU: /* Do not allow the MTU or caps to be directly changed */ error = EINVAL; break; default: error = ether_ioctl(ifp, cmd, data); break; } return (error); } static int lagg_ether_setmulti(struct lagg_softc *sc) { struct lagg_port *lp; LAGG_WLOCK_ASSERT(sc); SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { /* First, remove any existing filter entries. */ lagg_ether_cmdmulti(lp, 0); /* copy all addresses from the lagg interface to the port */ lagg_ether_cmdmulti(lp, 1); } return (0); } static int lagg_ether_cmdmulti(struct lagg_port *lp, int set) { struct lagg_softc *sc = lp->lp_softc; struct ifnet *ifp = lp->lp_ifp; struct ifnet *scifp = sc->sc_ifp; struct lagg_mc *mc; struct ifmultiaddr *ifma; int error; LAGG_WLOCK_ASSERT(sc); if (set) { IF_ADDR_WLOCK(scifp); TAILQ_FOREACH(ifma, &scifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_LINK) continue; mc = malloc(sizeof(struct lagg_mc), M_DEVBUF, M_NOWAIT); if (mc == NULL) { IF_ADDR_WUNLOCK(scifp); return (ENOMEM); } bcopy(ifma->ifma_addr, &mc->mc_addr, ifma->ifma_addr->sa_len); mc->mc_addr.sdl_index = ifp->if_index; mc->mc_ifma = NULL; SLIST_INSERT_HEAD(&lp->lp_mc_head, mc, mc_entries); } IF_ADDR_WUNLOCK(scifp); SLIST_FOREACH (mc, &lp->lp_mc_head, mc_entries) { error = if_addmulti(ifp, (struct sockaddr *)&mc->mc_addr, &mc->mc_ifma); if (error) return (error); } } else { while ((mc = SLIST_FIRST(&lp->lp_mc_head)) != NULL) { SLIST_REMOVE(&lp->lp_mc_head, mc, lagg_mc, mc_entries); if (mc->mc_ifma && !lp->lp_detaching) if_delmulti_ifma(mc->mc_ifma); free(mc, M_DEVBUF); } } return (0); } /* Handle a ref counted flag that should be set on the lagg port as well */ static int lagg_setflag(struct lagg_port *lp, int flag, int status, int (*func)(struct ifnet *, int)) { struct lagg_softc *sc = lp->lp_softc; struct ifnet *scifp = sc->sc_ifp; struct ifnet *ifp = lp->lp_ifp; int error; LAGG_WLOCK_ASSERT(sc); status = status ? (scifp->if_flags & flag) : 0; /* Now "status" contains the flag value or 0 */ /* * See if recorded ports status is different from what * we want it to be. If it is, flip it. We record ports * status in lp_ifflags so that we won't clear ports flag * we haven't set. In fact, we don't clear or set ports * flags directly, but get or release references to them. * That's why we can be sure that recorded flags still are * in accord with actual ports flags. */ if (status != (lp->lp_ifflags & flag)) { error = (*func)(ifp, status); if (error) return (error); lp->lp_ifflags &= ~flag; lp->lp_ifflags |= status; } return (0); } /* * Handle IFF_* flags that require certain changes on the lagg port * if "status" is true, update ports flags respective to the lagg * if "status" is false, forcedly clear the flags set on port. */ static int lagg_setflags(struct lagg_port *lp, int status) { int error, i; for (i = 0; lagg_pflags[i].flag; i++) { error = lagg_setflag(lp, lagg_pflags[i].flag, status, lagg_pflags[i].func); if (error) return (error); } return (0); } static int lagg_transmit(struct ifnet *ifp, struct mbuf *m) { struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc; int error, len, mcast; struct rm_priotracker tracker; len = m->m_pkthdr.len; mcast = (m->m_flags & (M_MCAST | M_BCAST)) ? 1 : 0; LAGG_RLOCK(sc, &tracker); /* We need a Tx algorithm and at least one port */ if (sc->sc_proto == LAGG_PROTO_NONE || sc->sc_count == 0) { LAGG_RUNLOCK(sc, &tracker); m_freem(m); ifp->if_oerrors++; return (ENXIO); } ETHER_BPF_MTAP(ifp, m); error = (*sc->sc_start)(sc, m); LAGG_RUNLOCK(sc, &tracker); if (error == 0) { counter_u64_add(sc->sc_opackets, 1); counter_u64_add(sc->sc_obytes, len); ifp->if_omcasts += mcast; } else ifp->if_oerrors++; return (error); } /* * The ifp->if_qflush entry point for lagg(4) is no-op. */ static void lagg_qflush(struct ifnet *ifp __unused) { } static struct mbuf * lagg_input(struct ifnet *ifp, struct mbuf *m) { struct lagg_port *lp = ifp->if_lagg; struct lagg_softc *sc = lp->lp_softc; struct ifnet *scifp = sc->sc_ifp; struct rm_priotracker tracker; LAGG_RLOCK(sc, &tracker); if ((scifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || (lp->lp_flags & LAGG_PORT_DISABLED) || sc->sc_proto == LAGG_PROTO_NONE) { LAGG_RUNLOCK(sc, &tracker); m_freem(m); return (NULL); } ETHER_BPF_MTAP(scifp, m); m = (*sc->sc_input)(sc, lp, m); if (m != NULL) { counter_u64_add(sc->sc_ipackets, 1); counter_u64_add(sc->sc_ibytes, m->m_pkthdr.len); if (scifp->if_flags & IFF_MONITOR) { m_freem(m); m = NULL; } } LAGG_RUNLOCK(sc, &tracker); return (m); } static int lagg_media_change(struct ifnet *ifp) { struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc; if (sc->sc_ifflags & IFF_DEBUG) printf("%s\n", __func__); /* Ignore */ return (0); } static void lagg_media_status(struct ifnet *ifp, struct ifmediareq *imr) { struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc; struct lagg_port *lp; struct rm_priotracker tracker; imr->ifm_status = IFM_AVALID; imr->ifm_active = IFM_ETHER | IFM_AUTO; LAGG_RLOCK(sc, &tracker); SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { if (LAGG_PORTACTIVE(lp)) imr->ifm_status |= IFM_ACTIVE; } LAGG_RUNLOCK(sc, &tracker); } static void lagg_linkstate(struct lagg_softc *sc) { struct lagg_port *lp; int new_link = LINK_STATE_DOWN; uint64_t speed; /* Our link is considered up if at least one of our ports is active */ SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { if (lp->lp_link_state == LINK_STATE_UP) { new_link = LINK_STATE_UP; break; } } if_link_state_change(sc->sc_ifp, new_link); /* Update if_baudrate to reflect the max possible speed */ switch (sc->sc_proto) { case LAGG_PROTO_FAILOVER: sc->sc_ifp->if_baudrate = sc->sc_primary != NULL ? sc->sc_primary->lp_ifp->if_baudrate : 0; break; case LAGG_PROTO_ROUNDROBIN: case LAGG_PROTO_LOADBALANCE: case LAGG_PROTO_ETHERCHANNEL: speed = 0; SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) speed += lp->lp_ifp->if_baudrate; sc->sc_ifp->if_baudrate = speed; break; case LAGG_PROTO_LACP: /* LACP updates if_baudrate itself */ break; } } static void lagg_port_state(struct ifnet *ifp, int state) { struct lagg_port *lp = (struct lagg_port *)ifp->if_lagg; struct lagg_softc *sc = NULL; if (lp != NULL) sc = lp->lp_softc; if (sc == NULL) return; LAGG_WLOCK(sc); lagg_linkstate(sc); if (sc->sc_linkstate != NULL) (*sc->sc_linkstate)(lp); LAGG_WUNLOCK(sc); } struct lagg_port * lagg_link_active(struct lagg_softc *sc, struct lagg_port *lp) { struct lagg_port *lp_next, *rval = NULL; // int new_link = LINK_STATE_DOWN; LAGG_RLOCK_ASSERT(sc); /* * Search a port which reports an active link state. */ if (lp == NULL) goto search; if (LAGG_PORTACTIVE(lp)) { rval = lp; goto found; } if ((lp_next = SLIST_NEXT(lp, lp_entries)) != NULL && LAGG_PORTACTIVE(lp_next)) { rval = lp_next; goto found; } search: SLIST_FOREACH(lp_next, &sc->sc_ports, lp_entries) { if (LAGG_PORTACTIVE(lp_next)) { rval = lp_next; goto found; } } found: if (rval != NULL) { /* * The IEEE 802.1D standard assumes that a lagg with * multiple ports is always full duplex. This is valid * for load sharing laggs and if at least two links * are active. Unfortunately, checking the latter would * be too expensive at this point. XXX if ((sc->sc_capabilities & IFCAP_LAGG_FULLDUPLEX) && (sc->sc_count > 1)) new_link = LINK_STATE_FULL_DUPLEX; else new_link = rval->lp_link_state; */ } return (rval); } static const void * lagg_gethdr(struct mbuf *m, u_int off, u_int len, void *buf) { if (m->m_pkthdr.len < (off + len)) { return (NULL); } else if (m->m_len < (off + len)) { m_copydata(m, off, len, buf); return (buf); } return (mtod(m, char *) + off); } static int lagg_sysctl_active(SYSCTL_HANDLER_ARGS) { struct lagg_softc *sc = (struct lagg_softc *)arg1; struct lagg_port *lp; int error; /* LACP tracks active links automatically, the others do not */ if (sc->sc_proto != LAGG_PROTO_LACP) { sc->sc_active = 0; SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) sc->sc_active += LAGG_PORTACTIVE(lp); } error = sysctl_handle_int(oidp, &sc->sc_active, 0, req); if ((error) || (req->newptr == NULL)) return (error); return (0); } uint32_t lagg_hashmbuf(struct lagg_softc *sc, struct mbuf *m, uint32_t key) { uint16_t etype; uint32_t p = key; int off; struct ether_header *eh; const struct ether_vlan_header *vlan; #ifdef INET const struct ip *ip; const uint32_t *ports; int iphlen; #endif #ifdef INET6 const struct ip6_hdr *ip6; uint32_t flow; #endif union { #ifdef INET struct ip ip; #endif #ifdef INET6 struct ip6_hdr ip6; #endif struct ether_vlan_header vlan; uint32_t port; } buf; off = sizeof(*eh); if (m->m_len < off) goto out; eh = mtod(m, struct ether_header *); etype = ntohs(eh->ether_type); if (sc->sc_flags & LAGG_F_HASHL2) { p = hash32_buf(&eh->ether_shost, ETHER_ADDR_LEN, p); p = hash32_buf(&eh->ether_dhost, ETHER_ADDR_LEN, p); } /* Special handling for encapsulating VLAN frames */ if ((m->m_flags & M_VLANTAG) && (sc->sc_flags & LAGG_F_HASHL2)) { p = hash32_buf(&m->m_pkthdr.ether_vtag, sizeof(m->m_pkthdr.ether_vtag), p); } else if (etype == ETHERTYPE_VLAN) { vlan = lagg_gethdr(m, off, sizeof(*vlan), &buf); if (vlan == NULL) goto out; if (sc->sc_flags & LAGG_F_HASHL2) p = hash32_buf(&vlan->evl_tag, sizeof(vlan->evl_tag), p); etype = ntohs(vlan->evl_proto); off += sizeof(*vlan) - sizeof(*eh); } switch (etype) { #ifdef INET case ETHERTYPE_IP: ip = lagg_gethdr(m, off, sizeof(*ip), &buf); if (ip == NULL) goto out; if (sc->sc_flags & LAGG_F_HASHL3) { p = hash32_buf(&ip->ip_src, sizeof(struct in_addr), p); p = hash32_buf(&ip->ip_dst, sizeof(struct in_addr), p); } if (!(sc->sc_flags & LAGG_F_HASHL4)) break; switch (ip->ip_p) { case IPPROTO_TCP: case IPPROTO_UDP: case IPPROTO_SCTP: iphlen = ip->ip_hl << 2; if (iphlen < sizeof(*ip)) break; off += iphlen; ports = lagg_gethdr(m, off, sizeof(*ports), &buf); if (ports == NULL) break; p = hash32_buf(ports, sizeof(*ports), p); break; } break; #endif #ifdef INET6 case ETHERTYPE_IPV6: if (!(sc->sc_flags & LAGG_F_HASHL3)) break; ip6 = lagg_gethdr(m, off, sizeof(*ip6), &buf); if (ip6 == NULL) goto out; p = hash32_buf(&ip6->ip6_src, sizeof(struct in6_addr), p); p = hash32_buf(&ip6->ip6_dst, sizeof(struct in6_addr), p); flow = ip6->ip6_flow & IPV6_FLOWLABEL_MASK; p = hash32_buf(&flow, sizeof(flow), p); /* IPv6 flow label */ break; #endif } out: return (p); } int lagg_enqueue(struct ifnet *ifp, struct mbuf *m) { return (ifp->if_transmit)(ifp, m); } /* * Simple round robin aggregation */ static int lagg_rr_attach(struct lagg_softc *sc) { sc->sc_detach = lagg_rr_detach; sc->sc_start = lagg_rr_start; sc->sc_input = lagg_rr_input; sc->sc_port_create = NULL; sc->sc_capabilities = IFCAP_LAGG_FULLDUPLEX; sc->sc_seq = 0; return (0); } static int lagg_rr_detach(struct lagg_softc *sc) { return (0); } static int lagg_rr_start(struct lagg_softc *sc, struct mbuf *m) { struct lagg_port *lp; uint32_t p; p = atomic_fetchadd_32(&sc->sc_seq, 1); p %= sc->sc_count; lp = SLIST_FIRST(&sc->sc_ports); while (p--) lp = SLIST_NEXT(lp, lp_entries); /* * Check the port's link state. This will return the next active * port if the link is down or the port is NULL. */ if ((lp = lagg_link_active(sc, lp)) == NULL) { m_freem(m); return (ENETDOWN); } /* Send mbuf */ return (lagg_enqueue(lp->lp_ifp, m)); } static struct mbuf * lagg_rr_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m) { struct ifnet *ifp = sc->sc_ifp; /* Just pass in the packet to our lagg device */ m->m_pkthdr.rcvif = ifp; return (m); } /* * Active failover */ static int lagg_fail_attach(struct lagg_softc *sc) { sc->sc_detach = lagg_fail_detach; sc->sc_start = lagg_fail_start; sc->sc_input = lagg_fail_input; sc->sc_port_create = NULL; sc->sc_port_destroy = NULL; return (0); } static int lagg_fail_detach(struct lagg_softc *sc) { return (0); } static int lagg_fail_start(struct lagg_softc *sc, struct mbuf *m) { struct lagg_port *lp; /* Use the master port if active or the next available port */ if ((lp = lagg_link_active(sc, sc->sc_primary)) == NULL) { m_freem(m); return (ENETDOWN); } /* Send mbuf */ return (lagg_enqueue(lp->lp_ifp, m)); } static struct mbuf * lagg_fail_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m) { struct ifnet *ifp = sc->sc_ifp; struct lagg_port *tmp_tp; if (lp == sc->sc_primary || lagg_failover_rx_all) { m->m_pkthdr.rcvif = ifp; return (m); } if (!LAGG_PORTACTIVE(sc->sc_primary)) { tmp_tp = lagg_link_active(sc, sc->sc_primary); /* * If tmp_tp is null, we've recieved a packet when all * our links are down. Weird, but process it anyways. */ if ((tmp_tp == NULL || tmp_tp == lp)) { m->m_pkthdr.rcvif = ifp; return (m); } } m_freem(m); return (NULL); } /* * Loadbalancing */ static int lagg_lb_attach(struct lagg_softc *sc) { struct lagg_port *lp; struct lagg_lb *lb; if ((lb = (struct lagg_lb *)malloc(sizeof(struct lagg_lb), M_DEVBUF, M_NOWAIT|M_ZERO)) == NULL) return (ENOMEM); sc->sc_detach = lagg_lb_detach; sc->sc_start = lagg_lb_start; sc->sc_input = lagg_lb_input; sc->sc_port_create = lagg_lb_port_create; sc->sc_port_destroy = lagg_lb_port_destroy; sc->sc_capabilities = IFCAP_LAGG_FULLDUPLEX; lb->lb_key = arc4random(); sc->sc_psc = (caddr_t)lb; SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) lagg_lb_port_create(lp); return (0); } static int lagg_lb_detach(struct lagg_softc *sc) { struct lagg_lb *lb = (struct lagg_lb *)sc->sc_psc; if (lb != NULL) free(lb, M_DEVBUF); return (0); } static int lagg_lb_porttable(struct lagg_softc *sc, struct lagg_port *lp) { struct lagg_lb *lb = (struct lagg_lb *)sc->sc_psc; struct lagg_port *lp_next; int i = 0; bzero(&lb->lb_ports, sizeof(lb->lb_ports)); SLIST_FOREACH(lp_next, &sc->sc_ports, lp_entries) { if (lp_next == lp) continue; if (i >= LAGG_MAX_PORTS) return (EINVAL); if (sc->sc_ifflags & IFF_DEBUG) printf("%s: port %s at index %d\n", sc->sc_ifname, lp_next->lp_ifname, i); lb->lb_ports[i++] = lp_next; } return (0); } static int lagg_lb_port_create(struct lagg_port *lp) { struct lagg_softc *sc = lp->lp_softc; return (lagg_lb_porttable(sc, NULL)); } static void lagg_lb_port_destroy(struct lagg_port *lp) { struct lagg_softc *sc = lp->lp_softc; lagg_lb_porttable(sc, lp); } static int lagg_lb_start(struct lagg_softc *sc, struct mbuf *m) { struct lagg_lb *lb = (struct lagg_lb *)sc->sc_psc; struct lagg_port *lp = NULL; uint32_t p = 0; if (sc->use_flowid && (m->m_flags & M_FLOWID)) p = m->m_pkthdr.flowid >> sc->flowid_shift; else p = lagg_hashmbuf(sc, m, lb->lb_key); p %= sc->sc_count; lp = lb->lb_ports[p]; /* * Check the port's link state. This will return the next active * port if the link is down or the port is NULL. */ if ((lp = lagg_link_active(sc, lp)) == NULL) { m_freem(m); return (ENETDOWN); } /* Send mbuf */ return (lagg_enqueue(lp->lp_ifp, m)); } static struct mbuf * lagg_lb_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m) { struct ifnet *ifp = sc->sc_ifp; /* Just pass in the packet to our lagg device */ m->m_pkthdr.rcvif = ifp; return (m); } /* * 802.3ad LACP */ static int lagg_lacp_attach(struct lagg_softc *sc) { struct lagg_port *lp; int error; sc->sc_detach = lagg_lacp_detach; sc->sc_port_create = lacp_port_create; sc->sc_port_destroy = lacp_port_destroy; sc->sc_linkstate = lacp_linkstate; sc->sc_start = lagg_lacp_start; sc->sc_input = lagg_lacp_input; sc->sc_init = lacp_init; sc->sc_stop = lacp_stop; sc->sc_lladdr = lagg_lacp_lladdr; sc->sc_req = lacp_req; sc->sc_portreq = lacp_portreq; error = lacp_attach(sc); if (error) return (error); SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) lacp_port_create(lp); return (error); } static int lagg_lacp_detach(struct lagg_softc *sc) { struct lagg_port *lp; int error; SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) lacp_port_destroy(lp); /* unlocking is safe here */ LAGG_WUNLOCK(sc); error = lacp_detach(sc); LAGG_WLOCK(sc); return (error); } static void lagg_lacp_lladdr(struct lagg_softc *sc) { struct lagg_port *lp; /* purge all the lacp ports */ SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) lacp_port_destroy(lp); /* add them back in */ SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) lacp_port_create(lp); } static int lagg_lacp_start(struct lagg_softc *sc, struct mbuf *m) { struct lagg_port *lp; lp = lacp_select_tx_port(sc, m); if (lp == NULL) { m_freem(m); return (ENETDOWN); } /* Send mbuf */ return (lagg_enqueue(lp->lp_ifp, m)); } static struct mbuf * lagg_lacp_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m) { struct ifnet *ifp = sc->sc_ifp; struct ether_header *eh; u_short etype; eh = mtod(m, struct ether_header *); etype = ntohs(eh->ether_type); /* Tap off LACP control messages */ if ((m->m_flags & M_VLANTAG) == 0 && etype == ETHERTYPE_SLOW) { m = lacp_input(lp, m); if (m == NULL) return (NULL); } /* * If the port is not collecting or not in the active aggregator then * free and return. */ if (lacp_iscollecting(lp) == 0 || lacp_isactive(lp) == 0) { m_freem(m); return (NULL); } m->m_pkthdr.rcvif = ifp; return (m); } static void lagg_callout(void *arg) { struct lagg_softc *sc = (struct lagg_softc *)arg; struct ifnet *ifp = sc->sc_ifp; ifp->if_ipackets = counter_u64_fetch(sc->sc_ipackets); ifp->if_opackets = counter_u64_fetch(sc->sc_opackets); ifp->if_ibytes = counter_u64_fetch(sc->sc_ibytes); ifp->if_obytes = counter_u64_fetch(sc->sc_obytes); callout_reset(&sc->sc_callout, hz, lagg_callout, sc); } Index: head/sys/net/if_var.h =================================================================== --- head/sys/net/if_var.h (revision 271550) +++ head/sys/net/if_var.h (revision 271551) @@ -1,659 +1,619 @@ /*- * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * From: @(#)if.h 8.1 (Berkeley) 6/10/93 * $FreeBSD$ */ #ifndef _NET_IF_VAR_H_ #define _NET_IF_VAR_H_ /* * Structures defining a network interface, providing a packet * transport mechanism (ala level 0 of the PUP protocols). * * Each interface accepts output datagrams of a specified maximum * length, and provides higher level routines with input datagrams * received from its medium. * * Output occurs when the routine if_output is called, with three parameters: * (*ifp->if_output)(ifp, m, dst, rt) * Here m is the mbuf chain to be sent and dst is the destination address. * The output routine encapsulates the supplied datagram if necessary, * and then transmits it on its medium. * * On input, each interface unwraps the data received by it, and either * places it on the input queue of an internetwork datagram routine * and posts the associated software interrupt, or passes the datagram to a raw * packet input routine. * * Routines exist for locating interfaces by their addresses * or for locating an interface on a certain network, as well as more general * routing and gateway routines maintaining information used to locate * interfaces. These routines live in the files if.c and route.c */ struct rtentry; /* ifa_rtrequest */ struct rt_addrinfo; /* ifa_rtrequest */ struct socket; struct carp_if; struct carp_softc; struct ifvlantrunk; struct route; /* if_output */ struct vnet; struct ifmedia; struct netmap_adapter; #ifdef _KERNEL #include /* ifqueue only? */ #include #include #endif /* _KERNEL */ #include #include /* XXX */ #include /* struct ifqueue */ #include /* XXX */ #include /* XXX */ #include /* if_link_task */ #define IF_DUNIT_NONE -1 #include TAILQ_HEAD(ifnethead, ifnet); /* we use TAILQs so that the order of */ TAILQ_HEAD(ifaddrhead, ifaddr); /* instantiation is preserved in the list */ TAILQ_HEAD(ifmultihead, ifmultiaddr); TAILQ_HEAD(ifgrouphead, ifg_group); #ifdef _KERNEL VNET_DECLARE(struct pfil_head, link_pfil_hook); /* packet filter hooks */ #define V_link_pfil_hook VNET(link_pfil_hook) #endif /* _KERNEL */ typedef enum { IFCOUNTER_IPACKETS = 1, IFCOUNTER_IERRORS, IFCOUNTER_OPACKETS, IFCOUNTER_OERRORS, IFCOUNTER_COLLISIONS, IFCOUNTER_IBYTES, IFCOUNTER_OBYTES, IFCOUNTER_IMCASTS, IFCOUNTER_OMCASTS, IFCOUNTER_IQDROPS, IFCOUNTER_OQDROPS, IFCOUNTER_NOPROTO, } ifnet_counter; typedef struct ifnet * if_t; typedef void (*if_start_fn_t)(if_t); typedef int (*if_ioctl_fn_t)(if_t, u_long, caddr_t); typedef void (*if_init_fn_t)(void *); typedef void (*if_qflush_fn_t)(if_t); typedef int (*if_transmit_fn_t)(if_t, struct mbuf *); typedef uint64_t (*if_get_counter_t)(if_t, ifnet_counter); /* - * Macros defining how to decode the "if_hw_tsomax" field: - */ -#define IF_HW_TSOMAX_GET_BYTES(x) \ - ((uint16_t)(x)) /* 32..65535 */ - -#define IF_HW_TSOMAX_GET_FRAG_COUNT(x) \ - ((uint8_t)((x) >> 16)) /* 1..255 */ - -#define IF_HW_TSOMAX_GET_FRAG_SIZE(x) \ - ((uint8_t)((x) >> 24)) /* 12..16 */ - -/* - * The following macro defines how to build the "if_hw_tsomax" - * field. The "bytes" field has unit 1 bytes and declares the maximum - * number of bytes which can be transferred by a single transmit - * offload, TSO, job. The "bytes" field is rounded down to the neares - * 4 bytes to avoid having the hardware do unaligned memory - * accesses. The "frag_count" field has unit 1 fragment and declares - * the maximum number of fragments a TSO job can contain. The - * "frag_size" field has unit logarithm in base 2 of the actual value - * in bytes and declares the maximum size of a fragment. - */ -#define IF_HW_TSOMAX_BUILD_VALUE(bytes, frag_count, frag_size) \ - (((bytes) & 0xFFFC) | (((frag_count) & 0xFF) << 16) | \ - (((frag_size) & 0xFF) << 24)) - -#define IF_HW_TSOMAX_DEFAULT_BYTES (65536 - 4) -#define IF_HW_TSOMAX_DEFAULT_FRAG_COUNT 255 -#define IF_HW_TSOMAX_DEFAULT_FRAG_SIZE 16 - -#define IF_HW_TSOMAX_DEFAULT_VALUE() \ - IF_HW_TSOMAX_BUILD_VALUE( \ - IF_HW_TSOMAX_DEFAULT_BYTES, \ - IF_HW_TSOMAX_DEFAULT_FRAG_COUNT, \ - IF_HW_TSOMAX_DEFAULT_FRAG_SIZE) - -/* * Structure defining a network interface. * * Size ILP32: 592 (approx) * LP64: 1048 (approx) */ struct ifnet { /* General book keeping of interface lists. */ TAILQ_ENTRY(ifnet) if_link; /* all struct ifnets are chained */ LIST_ENTRY(ifnet) if_clones; /* interfaces of a cloner */ TAILQ_HEAD(, ifg_list) if_groups; /* linked list of groups per if */ /* protected by if_addr_lock */ u_char if_alloctype; /* if_type at time of allocation */ /* Driver and protocol specific information that remains stable. */ void *if_softc; /* pointer to driver state */ void *if_llsoftc; /* link layer softc */ void *if_l2com; /* pointer to protocol bits */ const char *if_dname; /* driver name */ int if_dunit; /* unit or IF_DUNIT_NONE */ u_short if_index; /* numeric abbreviation for this if */ short if_index_reserved; /* spare space to grow if_index */ char if_xname[IFNAMSIZ]; /* external name (name + unit) */ char *if_description; /* interface description */ /* Variable fields that are touched by the stack and drivers. */ int if_flags; /* up/down, broadcast, etc. */ int if_drv_flags; /* driver-managed status flags */ int if_capabilities; /* interface features & capabilities */ int if_capenable; /* enabled features & capabilities */ void *if_linkmib; /* link-type-specific MIB data */ size_t if_linkmiblen; /* length of above data */ u_int if_refcount; /* reference count */ /* These fields are shared with struct if_data. */ uint8_t if_type; /* ethernet, tokenring, etc */ uint8_t if_addrlen; /* media address length */ uint8_t if_hdrlen; /* media header length */ uint8_t if_link_state; /* current link state */ uint32_t if_mtu; /* maximum transmission unit */ uint32_t if_metric; /* routing metric (external only) */ uint64_t if_baudrate; /* linespeed */ uint64_t if_hwassist; /* HW offload capabilities, see IFCAP */ time_t if_epoch; /* uptime at attach or stat reset */ struct timeval if_lastchange; /* time of last administrative change */ struct ifaltq if_snd; /* output queue (includes altq) */ struct task if_linktask; /* task for link change events */ /* Addresses of different protocol families assigned to this if. */ struct rwlock if_addr_lock; /* lock to protect address lists */ /* * if_addrhead is the list of all addresses associated to * an interface. * Some code in the kernel assumes that first element * of the list has type AF_LINK, and contains sockaddr_dl * addresses which store the link-level address and the name * of the interface. * However, access to the AF_LINK address through this * field is deprecated. Use if_addr or ifaddr_byindex() instead. */ struct ifaddrhead if_addrhead; /* linked list of addresses per if */ struct ifmultihead if_multiaddrs; /* multicast addresses configured */ int if_amcount; /* number of all-multicast requests */ struct ifaddr *if_addr; /* pointer to link-level address */ const u_int8_t *if_broadcastaddr; /* linklevel broadcast bytestring */ struct rwlock if_afdata_lock; void *if_afdata[AF_MAX]; int if_afdata_initialized; /* Additional features hung off the interface. */ u_int if_fib; /* interface FIB */ struct vnet *if_vnet; /* pointer to network stack instance */ struct vnet *if_home_vnet; /* where this ifnet originates from */ struct ifvlantrunk *if_vlantrunk; /* pointer to 802.1q data */ struct bpf_if *if_bpf; /* packet filter structure */ int if_pcount; /* number of promiscuous listeners */ void *if_bridge; /* bridge glue */ void *if_lagg; /* lagg glue */ void *if_pf_kif; /* pf glue */ struct carp_if *if_carp; /* carp interface structure */ struct label *if_label; /* interface MAC label */ struct netmap_adapter *if_netmap; /* netmap(4) softc */ /* Various procedures of the layer2 encapsulation and drivers. */ int (*if_output) /* output routine (enqueue) */ (struct ifnet *, struct mbuf *, const struct sockaddr *, struct route *); void (*if_input) /* input routine (from h/w driver) */ (struct ifnet *, struct mbuf *); if_start_fn_t if_start; /* initiate output routine */ if_ioctl_fn_t if_ioctl; /* ioctl routine */ if_init_fn_t if_init; /* Init routine */ int (*if_resolvemulti) /* validate/resolve multicast */ (struct ifnet *, struct sockaddr **, struct sockaddr *); if_qflush_fn_t if_qflush; /* flush any queue */ if_transmit_fn_t if_transmit; /* initiate output routine */ void (*if_reassign) /* reassign to vnet routine */ (struct ifnet *, struct vnet *, char *); if_get_counter_t if_get_counter; /* get counter values */ /* Stuff that's only temporary and doesn't belong here. */ - u_int if_hw_tsomax; /* TSO burst length limits. + u_int if_hw_tsomax; /* tso burst length limit, the minimum + * is (IP_MAXPACKET / 8). * XXXAO: Have to find a better place * for it eventually. */ /* * Old, racy and expensive statistics, should not be used in * new drivers. */ uint64_t if_ipackets; /* packets received on interface */ uint64_t if_ierrors; /* input errors on interface */ uint64_t if_opackets; /* packets sent on interface */ uint64_t if_oerrors; /* output errors on interface */ uint64_t if_collisions; /* collisions on csma interfaces */ uint64_t if_ibytes; /* total number of octets received */ uint64_t if_obytes; /* total number of octets sent */ uint64_t if_imcasts; /* packets received via multicast */ uint64_t if_omcasts; /* packets sent via multicast */ uint64_t if_iqdrops; /* dropped on input */ uint64_t if_oqdrops; /* dropped on output */ uint64_t if_noproto; /* destined for unsupported protocol */ /* * Spare fields to be added before branching a stable branch, so * that structure can be enhanced without changing the kernel * binary interface. */ }; #include /* XXXAO: temporary unconditional include */ /* for compatibility with other BSDs */ #define if_addrlist if_addrhead #define if_list if_link #define if_name(ifp) ((ifp)->if_xname) /* * Locks for address lists on the network interface. */ #define IF_ADDR_LOCK_INIT(if) rw_init(&(if)->if_addr_lock, "if_addr_lock") #define IF_ADDR_LOCK_DESTROY(if) rw_destroy(&(if)->if_addr_lock) #define IF_ADDR_WLOCK(if) rw_wlock(&(if)->if_addr_lock) #define IF_ADDR_WUNLOCK(if) rw_wunlock(&(if)->if_addr_lock) #define IF_ADDR_RLOCK(if) rw_rlock(&(if)->if_addr_lock) #define IF_ADDR_RUNLOCK(if) rw_runlock(&(if)->if_addr_lock) #define IF_ADDR_LOCK_ASSERT(if) rw_assert(&(if)->if_addr_lock, RA_LOCKED) #define IF_ADDR_WLOCK_ASSERT(if) rw_assert(&(if)->if_addr_lock, RA_WLOCKED) /* * Function variations on locking macros intended to be used by loadable * kernel modules in order to divorce them from the internals of address list * locking. */ void if_addr_rlock(struct ifnet *ifp); /* if_addrhead */ void if_addr_runlock(struct ifnet *ifp); /* if_addrhead */ void if_maddr_rlock(if_t ifp); /* if_multiaddrs */ void if_maddr_runlock(if_t ifp); /* if_multiaddrs */ #ifdef _KERNEL #ifdef _SYS_EVENTHANDLER_H_ /* interface link layer address change event */ typedef void (*iflladdr_event_handler_t)(void *, struct ifnet *); EVENTHANDLER_DECLARE(iflladdr_event, iflladdr_event_handler_t); /* interface address change event */ typedef void (*ifaddr_event_handler_t)(void *, struct ifnet *); EVENTHANDLER_DECLARE(ifaddr_event, ifaddr_event_handler_t); /* new interface arrival event */ typedef void (*ifnet_arrival_event_handler_t)(void *, struct ifnet *); EVENTHANDLER_DECLARE(ifnet_arrival_event, ifnet_arrival_event_handler_t); /* interface departure event */ typedef void (*ifnet_departure_event_handler_t)(void *, struct ifnet *); EVENTHANDLER_DECLARE(ifnet_departure_event, ifnet_departure_event_handler_t); /* Interface link state change event */ typedef void (*ifnet_link_event_handler_t)(void *, struct ifnet *, int); EVENTHANDLER_DECLARE(ifnet_link_event, ifnet_link_event_handler_t); #endif /* _SYS_EVENTHANDLER_H_ */ /* * interface groups */ struct ifg_group { char ifg_group[IFNAMSIZ]; u_int ifg_refcnt; void *ifg_pf_kif; TAILQ_HEAD(, ifg_member) ifg_members; TAILQ_ENTRY(ifg_group) ifg_next; }; struct ifg_member { TAILQ_ENTRY(ifg_member) ifgm_next; struct ifnet *ifgm_ifp; }; struct ifg_list { struct ifg_group *ifgl_group; TAILQ_ENTRY(ifg_list) ifgl_next; }; #ifdef _SYS_EVENTHANDLER_H_ /* group attach event */ typedef void (*group_attach_event_handler_t)(void *, struct ifg_group *); EVENTHANDLER_DECLARE(group_attach_event, group_attach_event_handler_t); /* group detach event */ typedef void (*group_detach_event_handler_t)(void *, struct ifg_group *); EVENTHANDLER_DECLARE(group_detach_event, group_detach_event_handler_t); /* group change event */ typedef void (*group_change_event_handler_t)(void *, const char *); EVENTHANDLER_DECLARE(group_change_event, group_change_event_handler_t); #endif /* _SYS_EVENTHANDLER_H_ */ #define IF_AFDATA_LOCK_INIT(ifp) \ rw_init(&(ifp)->if_afdata_lock, "if_afdata") #define IF_AFDATA_WLOCK(ifp) rw_wlock(&(ifp)->if_afdata_lock) #define IF_AFDATA_RLOCK(ifp) rw_rlock(&(ifp)->if_afdata_lock) #define IF_AFDATA_WUNLOCK(ifp) rw_wunlock(&(ifp)->if_afdata_lock) #define IF_AFDATA_RUNLOCK(ifp) rw_runlock(&(ifp)->if_afdata_lock) #define IF_AFDATA_LOCK(ifp) IF_AFDATA_WLOCK(ifp) #define IF_AFDATA_UNLOCK(ifp) IF_AFDATA_WUNLOCK(ifp) #define IF_AFDATA_TRYLOCK(ifp) rw_try_wlock(&(ifp)->if_afdata_lock) #define IF_AFDATA_DESTROY(ifp) rw_destroy(&(ifp)->if_afdata_lock) #define IF_AFDATA_LOCK_ASSERT(ifp) rw_assert(&(ifp)->if_afdata_lock, RA_LOCKED) #define IF_AFDATA_RLOCK_ASSERT(ifp) rw_assert(&(ifp)->if_afdata_lock, RA_RLOCKED) #define IF_AFDATA_WLOCK_ASSERT(ifp) rw_assert(&(ifp)->if_afdata_lock, RA_WLOCKED) #define IF_AFDATA_UNLOCK_ASSERT(ifp) rw_assert(&(ifp)->if_afdata_lock, RA_UNLOCKED) /* * 72 was chosen below because it is the size of a TCP/IP * header (40) + the minimum mss (32). */ #define IF_MINMTU 72 #define IF_MAXMTU 65535 #define TOEDEV(ifp) ((ifp)->if_llsoftc) #endif /* _KERNEL */ /* * The ifaddr structure contains information about one address * of an interface. They are maintained by the different address families, * are allocated and attached when an address is set, and are linked * together so all addresses for an interface can be located. * * NOTE: a 'struct ifaddr' is always at the beginning of a larger * chunk of malloc'ed memory, where we store the three addresses * (ifa_addr, ifa_dstaddr and ifa_netmask) referenced here. */ #if defined(_KERNEL) || defined(_WANT_IFADDR) struct ifaddr { struct sockaddr *ifa_addr; /* address of interface */ struct sockaddr *ifa_dstaddr; /* other end of p-to-p link */ #define ifa_broadaddr ifa_dstaddr /* broadcast address interface */ struct sockaddr *ifa_netmask; /* used to determine subnet */ struct ifnet *ifa_ifp; /* back-pointer to interface */ struct carp_softc *ifa_carp; /* pointer to CARP data */ TAILQ_ENTRY(ifaddr) ifa_link; /* queue macro glue */ void (*ifa_rtrequest) /* check or clean routes (+ or -)'d */ (int, struct rtentry *, struct rt_addrinfo *); u_short ifa_flags; /* mostly rt_flags for cloning */ u_int ifa_refcnt; /* references to this structure */ counter_u64_t ifa_ipackets; counter_u64_t ifa_opackets; counter_u64_t ifa_ibytes; counter_u64_t ifa_obytes; }; #endif #ifdef _KERNEL #define IFA_ROUTE RTF_UP /* route installed */ #define IFA_RTSELF RTF_HOST /* loopback route to self installed */ /* For compatibility with other BSDs. SCTP uses it. */ #define ifa_list ifa_link struct ifaddr * ifa_alloc(size_t size, int flags); void ifa_free(struct ifaddr *ifa); void ifa_ref(struct ifaddr *ifa); #endif /* _KERNEL */ /* * Multicast address structure. This is analogous to the ifaddr * structure except that it keeps track of multicast addresses. */ struct ifmultiaddr { TAILQ_ENTRY(ifmultiaddr) ifma_link; /* queue macro glue */ struct sockaddr *ifma_addr; /* address this membership is for */ struct sockaddr *ifma_lladdr; /* link-layer translation, if any */ struct ifnet *ifma_ifp; /* back-pointer to interface */ u_int ifma_refcount; /* reference count */ void *ifma_protospec; /* protocol-specific state, if any */ struct ifmultiaddr *ifma_llifma; /* pointer to ifma for ifma_lladdr */ }; #ifdef _KERNEL extern struct rwlock ifnet_rwlock; extern struct sx ifnet_sxlock; #define IFNET_LOCK_INIT() do { \ rw_init_flags(&ifnet_rwlock, "ifnet_rw", RW_RECURSE); \ sx_init_flags(&ifnet_sxlock, "ifnet_sx", SX_RECURSE); \ } while(0) #define IFNET_WLOCK() do { \ sx_xlock(&ifnet_sxlock); \ rw_wlock(&ifnet_rwlock); \ } while (0) #define IFNET_WUNLOCK() do { \ rw_wunlock(&ifnet_rwlock); \ sx_xunlock(&ifnet_sxlock); \ } while (0) /* * To assert the ifnet lock, you must know not only whether it's for read or * write, but also whether it was acquired with sleep support or not. */ #define IFNET_RLOCK_ASSERT() sx_assert(&ifnet_sxlock, SA_SLOCKED) #define IFNET_RLOCK_NOSLEEP_ASSERT() rw_assert(&ifnet_rwlock, RA_RLOCKED) #define IFNET_WLOCK_ASSERT() do { \ sx_assert(&ifnet_sxlock, SA_XLOCKED); \ rw_assert(&ifnet_rwlock, RA_WLOCKED); \ } while (0) #define IFNET_RLOCK() sx_slock(&ifnet_sxlock) #define IFNET_RLOCK_NOSLEEP() rw_rlock(&ifnet_rwlock) #define IFNET_RUNLOCK() sx_sunlock(&ifnet_sxlock) #define IFNET_RUNLOCK_NOSLEEP() rw_runlock(&ifnet_rwlock) /* * Look up an ifnet given its index; the _ref variant also acquires a * reference that must be freed using if_rele(). It is almost always a bug * to call ifnet_byindex() instead if ifnet_byindex_ref(). */ struct ifnet *ifnet_byindex(u_short idx); struct ifnet *ifnet_byindex_locked(u_short idx); struct ifnet *ifnet_byindex_ref(u_short idx); /* * Given the index, ifaddr_byindex() returns the one and only * link-level ifaddr for the interface. You are not supposed to use * it to traverse the list of addresses associated to the interface. */ struct ifaddr *ifaddr_byindex(u_short idx); VNET_DECLARE(struct ifnethead, ifnet); VNET_DECLARE(struct ifgrouphead, ifg_head); VNET_DECLARE(int, if_index); VNET_DECLARE(struct ifnet *, loif); /* first loopback interface */ #define V_ifnet VNET(ifnet) #define V_ifg_head VNET(ifg_head) #define V_if_index VNET(if_index) #define V_loif VNET(loif) int if_addgroup(struct ifnet *, const char *); int if_delgroup(struct ifnet *, const char *); int if_addmulti(struct ifnet *, struct sockaddr *, struct ifmultiaddr **); int if_allmulti(struct ifnet *, int); struct ifnet* if_alloc(u_char); void if_attach(struct ifnet *); void if_dead(struct ifnet *); int if_delmulti(struct ifnet *, struct sockaddr *); void if_delmulti_ifma(struct ifmultiaddr *); void if_detach(struct ifnet *); void if_vmove(struct ifnet *, struct vnet *); void if_purgeaddrs(struct ifnet *); void if_delallmulti(struct ifnet *); void if_down(struct ifnet *); struct ifmultiaddr * if_findmulti(struct ifnet *, struct sockaddr *); void if_free(struct ifnet *); void if_initname(struct ifnet *, const char *, int); void if_link_state_change(struct ifnet *, int); int if_printf(struct ifnet *, const char *, ...) __printflike(2, 3); void if_ref(struct ifnet *); void if_rele(struct ifnet *); int if_setlladdr(struct ifnet *, const u_char *, int); void if_up(struct ifnet *); int ifioctl(struct socket *, u_long, caddr_t, struct thread *); int ifpromisc(struct ifnet *, int); struct ifnet *ifunit(const char *); struct ifnet *ifunit_ref(const char *); int ifa_add_loopback_route(struct ifaddr *, struct sockaddr *); int ifa_del_loopback_route(struct ifaddr *, struct sockaddr *); int ifa_switch_loopback_route(struct ifaddr *, struct sockaddr *, int fib); struct ifaddr *ifa_ifwithaddr(struct sockaddr *); int ifa_ifwithaddr_check(struct sockaddr *); struct ifaddr *ifa_ifwithbroadaddr(struct sockaddr *, int); struct ifaddr *ifa_ifwithdstaddr(struct sockaddr *, int); struct ifaddr *ifa_ifwithnet(struct sockaddr *, int, int); struct ifaddr *ifa_ifwithroute(int, struct sockaddr *, struct sockaddr *, u_int); struct ifaddr *ifaof_ifpforaddr(struct sockaddr *, struct ifnet *); int ifa_preferred(struct ifaddr *, struct ifaddr *); int if_simloop(struct ifnet *ifp, struct mbuf *m, int af, int hlen); typedef void *if_com_alloc_t(u_char type, struct ifnet *ifp); typedef void if_com_free_t(void *com, u_char type); void if_register_com_alloc(u_char type, if_com_alloc_t *a, if_com_free_t *f); void if_deregister_com_alloc(u_char type); void if_data_copy(struct ifnet *, struct if_data *); uint64_t if_get_counter_compat(struct ifnet *, ifnet_counter); #define IF_LLADDR(ifp) \ LLADDR((struct sockaddr_dl *)((ifp)->if_addr->ifa_addr)) uint64_t if_setbaudrate(if_t ifp, uint64_t baudrate); uint64_t if_getbaudrate(if_t ifp); int if_setcapabilities(if_t ifp, int capabilities); int if_setcapabilitiesbit(if_t ifp, int setbit, int clearbit); int if_getcapabilities(if_t ifp); int if_togglecapenable(if_t ifp, int togglecap); int if_setcapenable(if_t ifp, int capenable); int if_setcapenablebit(if_t ifp, int setcap, int clearcap); int if_getcapenable(if_t ifp); const char *if_getdname(if_t ifp); int if_setdev(if_t ifp, void *dev); int if_setdrvflagbits(if_t ifp, int if_setflags, int clear_flags); int if_getdrvflags(if_t ifp); int if_setdrvflags(if_t ifp, int flags); int if_clearhwassist(if_t ifp); int if_sethwassistbits(if_t ifp, int toset, int toclear); int if_sethwassist(if_t ifp, int hwassist_bit); int if_gethwassist(if_t ifp); int if_setsoftc(if_t ifp, void *softc); void *if_getsoftc(if_t ifp); int if_setflags(if_t ifp, int flags); int if_setmtu(if_t ifp, int mtu); int if_getmtu(if_t ifp); int if_setflagbits(if_t ifp, int set, int clear); int if_getflags(if_t ifp); int if_sendq_empty(if_t ifp); int if_setsendqready(if_t ifp); int if_setsendqlen(if_t ifp, int tx_desc_count); int if_input(if_t ifp, struct mbuf* sendmp); int if_sendq_prepend(if_t ifp, struct mbuf *m); struct mbuf *if_dequeue(if_t ifp); int if_setifheaderlen(if_t ifp, int len); void if_setrcvif(struct mbuf *m, if_t ifp); void if_setvtag(struct mbuf *m, u_int16_t tag); u_int16_t if_getvtag(struct mbuf *m); int if_vlantrunkinuse(if_t ifp); caddr_t if_getlladdr(if_t ifp); void *if_gethandle(u_char); void if_bpfmtap(if_t ifp, struct mbuf *m); void if_etherbpfmtap(if_t ifp, struct mbuf *m); void if_vlancap(if_t ifp); int if_setupmultiaddr(if_t ifp, void *mta, int *cnt, int max); int if_multiaddr_array(if_t ifp, void *mta, int *cnt, int max); int if_multiaddr_count(if_t ifp, int max); int if_getamcount(if_t ifp); struct ifaddr * if_getifaddr(if_t ifp); /* Statistics */ int if_incipackets(if_t ifp, int pkt); int if_incopackets(if_t ifp, int pkts); int if_incierrors(if_t ifp, int ierrors); int if_incoerrors(if_t ifp, int oerrors); int if_inciqdrops(if_t ifp, int val); int if_setierrors(if_t ifp, int ierrors); int if_setoerrors(if_t ifp, int oerrors); int if_setcollisions(if_t ifp, int collisions); int if_inccollisions(if_t ifp, int collisions); int if_incobytes(if_t ifp, int bytes); int if_getiqdrops(if_t ifp); int if_incimcasts(if_t ifp, int imcasts); int if_incomcasts(if_t ifp, int imcasts); int if_setipackets(if_t ifp, int pkts); int if_setopackets(if_t ifp, int pkts); int if_setibytes(if_t ifp, int bytes); int if_setobytes(if_t ifp, int bytes); int if_setimcasts(if_t ifp, int pkts); /* Functions */ void if_setinitfn(if_t ifp, void (*)(void *)); void if_setioctlfn(if_t ifp, int (*)(if_t, u_long, caddr_t)); void if_setstartfn(if_t ifp, void (*)(if_t)); void if_settransmitfn(if_t ifp, if_transmit_fn_t); void if_setqflushfn(if_t ifp, if_qflush_fn_t); - -/* "if_hw_tsomax" related functions */ -u_int if_hw_tsomax_common(u_int, u_int); -u_int if_hw_tsomax_range_check(u_int); /* Revisit the below. These are inline functions originally */ int drbr_inuse_drv(if_t ifp, struct buf_ring *br); struct mbuf* drbr_dequeue_drv(if_t ifp, struct buf_ring *br); int drbr_needs_enqueue_drv(if_t ifp, struct buf_ring *br); int drbr_enqueue_drv(if_t ifp, struct buf_ring *br, struct mbuf *m); #endif /* _KERNEL */ #endif /* !_NET_IF_VAR_H_ */ Index: head/sys/net/if_vlan.c =================================================================== --- head/sys/net/if_vlan.c (revision 271550) +++ head/sys/net/if_vlan.c (revision 271551) @@ -1,1776 +1,1776 @@ /*- * Copyright 1998 Massachusetts Institute of Technology * * Permission to use, copy, modify, and distribute this software and * its documentation for any purpose and without fee is hereby * granted, provided that both the above copyright notice and this * permission notice appear in all copies, that both the above * copyright notice and this permission notice appear in all * supporting documentation, and that the name of M.I.T. not be used * in advertising or publicity pertaining to distribution of the * software without specific, written prior permission. M.I.T. makes * no representations about the suitability of this software for any * purpose. It is provided "as is" without express or implied * warranty. * * THIS SOFTWARE IS PROVIDED BY M.I.T. ``AS IS''. M.I.T. DISCLAIMS * ALL EXPRESS OR IMPLIED WARRANTIES WITH REGARD TO THIS SOFTWARE, * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT * SHALL M.I.T. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * if_vlan.c - pseudo-device driver for IEEE 802.1Q virtual LANs. * Might be extended some day to also handle IEEE 802.1p priority * tagging. This is sort of sneaky in the implementation, since * we need to pretend to be enough of an Ethernet implementation * to make arp work. The way we do this is by telling everyone * that we are an Ethernet, and then catch the packets that * ether_output() sends to us via if_transmit(), rewrite them for * use by the real outgoing interface, and ask it to send them. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_vlan.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET #include #include #endif #define VLAN_DEF_HWIDTH 4 #define VLAN_IFFLAGS (IFF_BROADCAST | IFF_MULTICAST) #define UP_AND_RUNNING(ifp) \ ((ifp)->if_flags & IFF_UP && (ifp)->if_drv_flags & IFF_DRV_RUNNING) LIST_HEAD(ifvlanhead, ifvlan); struct ifvlantrunk { struct ifnet *parent; /* parent interface of this trunk */ struct rmlock lock; #ifdef VLAN_ARRAY #define VLAN_ARRAY_SIZE (EVL_VLID_MASK + 1) struct ifvlan *vlans[VLAN_ARRAY_SIZE]; /* static table */ #else struct ifvlanhead *hash; /* dynamic hash-list table */ uint16_t hmask; uint16_t hwidth; #endif int refcnt; }; struct vlan_mc_entry { struct sockaddr_dl mc_addr; SLIST_ENTRY(vlan_mc_entry) mc_entries; }; struct ifvlan { struct ifvlantrunk *ifv_trunk; struct ifnet *ifv_ifp; void *ifv_cookie; counter_u64_t ifv_ipackets; counter_u64_t ifv_ibytes; counter_u64_t ifv_opackets; counter_u64_t ifv_obytes; counter_u64_t ifv_omcasts; #define TRUNK(ifv) ((ifv)->ifv_trunk) #define PARENT(ifv) ((ifv)->ifv_trunk->parent) int ifv_pflags; /* special flags we have set on parent */ struct ifv_linkmib { int ifvm_encaplen; /* encapsulation length */ int ifvm_mtufudge; /* MTU fudged by this much */ int ifvm_mintu; /* min transmission unit */ uint16_t ifvm_proto; /* encapsulation ethertype */ uint16_t ifvm_tag; /* tag to apply on packets leaving if */ } ifv_mib; SLIST_HEAD(, vlan_mc_entry) vlan_mc_listhead; #ifndef VLAN_ARRAY LIST_ENTRY(ifvlan) ifv_list; #endif }; #define ifv_proto ifv_mib.ifvm_proto #define ifv_vid ifv_mib.ifvm_tag #define ifv_encaplen ifv_mib.ifvm_encaplen #define ifv_mtufudge ifv_mib.ifvm_mtufudge #define ifv_mintu ifv_mib.ifvm_mintu /* Special flags we should propagate to parent. */ static struct { int flag; int (*func)(struct ifnet *, int); } vlan_pflags[] = { {IFF_PROMISC, ifpromisc}, {IFF_ALLMULTI, if_allmulti}, {0, NULL} }; SYSCTL_DECL(_net_link); static SYSCTL_NODE(_net_link, IFT_L2VLAN, vlan, CTLFLAG_RW, 0, "IEEE 802.1Q VLAN"); static SYSCTL_NODE(_net_link_vlan, PF_LINK, link, CTLFLAG_RW, 0, "for consistency"); static int soft_pad = 0; SYSCTL_INT(_net_link_vlan, OID_AUTO, soft_pad, CTLFLAG_RW, &soft_pad, 0, "pad short frames before tagging"); static const char vlanname[] = "vlan"; static MALLOC_DEFINE(M_VLAN, vlanname, "802.1Q Virtual LAN Interface"); static eventhandler_tag ifdetach_tag; static eventhandler_tag iflladdr_tag; /* * We have a global mutex, that is used to serialize configuration * changes and isn't used in normal packet delivery. * * We also have a per-trunk rwlock, that is locked shared on packet * processing and exclusive when configuration is changed. * * The VLAN_ARRAY substitutes the dynamic hash with a static array * with 4096 entries. In theory this can give a boost in processing, * however on practice it does not. Probably this is because array * is too big to fit into CPU cache. */ static struct sx ifv_lock; #define VLAN_LOCK_INIT() sx_init(&ifv_lock, "vlan_global") #define VLAN_LOCK_DESTROY() sx_destroy(&ifv_lock) #define VLAN_LOCK_ASSERT() sx_assert(&ifv_lock, SA_LOCKED) #define VLAN_LOCK() sx_xlock(&ifv_lock) #define VLAN_UNLOCK() sx_xunlock(&ifv_lock) #define TRUNK_LOCK_INIT(trunk) rm_init(&(trunk)->lock, vlanname) #define TRUNK_LOCK_DESTROY(trunk) rm_destroy(&(trunk)->lock) #define TRUNK_LOCK(trunk) rm_wlock(&(trunk)->lock) #define TRUNK_UNLOCK(trunk) rm_wunlock(&(trunk)->lock) #define TRUNK_LOCK_ASSERT(trunk) rm_assert(&(trunk)->lock, RA_WLOCKED) #define TRUNK_RLOCK(trunk) rm_rlock(&(trunk)->lock, &tracker) #define TRUNK_RUNLOCK(trunk) rm_runlock(&(trunk)->lock, &tracker) #define TRUNK_LOCK_RASSERT(trunk) rm_assert(&(trunk)->lock, RA_RLOCKED) #define TRUNK_LOCK_READER struct rm_priotracker tracker #ifndef VLAN_ARRAY static void vlan_inithash(struct ifvlantrunk *trunk); static void vlan_freehash(struct ifvlantrunk *trunk); static int vlan_inshash(struct ifvlantrunk *trunk, struct ifvlan *ifv); static int vlan_remhash(struct ifvlantrunk *trunk, struct ifvlan *ifv); static void vlan_growhash(struct ifvlantrunk *trunk, int howmuch); static __inline struct ifvlan * vlan_gethash(struct ifvlantrunk *trunk, uint16_t vid); #endif static void trunk_destroy(struct ifvlantrunk *trunk); static void vlan_init(void *foo); static void vlan_input(struct ifnet *ifp, struct mbuf *m); static int vlan_ioctl(struct ifnet *ifp, u_long cmd, caddr_t addr); static void vlan_qflush(struct ifnet *ifp); static uint64_t vlan_get_counter(struct ifnet *ifp, ifnet_counter cnt); static int vlan_setflag(struct ifnet *ifp, int flag, int status, int (*func)(struct ifnet *, int)); static int vlan_setflags(struct ifnet *ifp, int status); static int vlan_setmulti(struct ifnet *ifp); static int vlan_transmit(struct ifnet *ifp, struct mbuf *m); static void vlan_unconfig(struct ifnet *ifp); static void vlan_unconfig_locked(struct ifnet *ifp, int departing); static int vlan_config(struct ifvlan *ifv, struct ifnet *p, uint16_t tag); static void vlan_link_state(struct ifnet *ifp); static void vlan_capabilities(struct ifvlan *ifv); static void vlan_trunk_capabilities(struct ifnet *ifp); static struct ifnet *vlan_clone_match_ethervid(struct if_clone *, const char *, int *); static int vlan_clone_match(struct if_clone *, const char *); static int vlan_clone_create(struct if_clone *, char *, size_t, caddr_t); static int vlan_clone_destroy(struct if_clone *, struct ifnet *); static void vlan_ifdetach(void *arg, struct ifnet *ifp); static void vlan_iflladdr(void *arg, struct ifnet *ifp); static struct if_clone *vlan_cloner; #ifdef VIMAGE static VNET_DEFINE(struct if_clone *, vlan_cloner); #define V_vlan_cloner VNET(vlan_cloner) #endif #ifndef VLAN_ARRAY #define HASH(n, m) ((((n) >> 8) ^ ((n) >> 4) ^ (n)) & (m)) static void vlan_inithash(struct ifvlantrunk *trunk) { int i, n; /* * The trunk must not be locked here since we call malloc(M_WAITOK). * It is OK in case this function is called before the trunk struct * gets hooked up and becomes visible from other threads. */ KASSERT(trunk->hwidth == 0 && trunk->hash == NULL, ("%s: hash already initialized", __func__)); trunk->hwidth = VLAN_DEF_HWIDTH; n = 1 << trunk->hwidth; trunk->hmask = n - 1; trunk->hash = malloc(sizeof(struct ifvlanhead) * n, M_VLAN, M_WAITOK); for (i = 0; i < n; i++) LIST_INIT(&trunk->hash[i]); } static void vlan_freehash(struct ifvlantrunk *trunk) { #ifdef INVARIANTS int i; KASSERT(trunk->hwidth > 0, ("%s: hwidth not positive", __func__)); for (i = 0; i < (1 << trunk->hwidth); i++) KASSERT(LIST_EMPTY(&trunk->hash[i]), ("%s: hash table not empty", __func__)); #endif free(trunk->hash, M_VLAN); trunk->hash = NULL; trunk->hwidth = trunk->hmask = 0; } static int vlan_inshash(struct ifvlantrunk *trunk, struct ifvlan *ifv) { int i, b; struct ifvlan *ifv2; TRUNK_LOCK_ASSERT(trunk); KASSERT(trunk->hwidth > 0, ("%s: hwidth not positive", __func__)); b = 1 << trunk->hwidth; i = HASH(ifv->ifv_vid, trunk->hmask); LIST_FOREACH(ifv2, &trunk->hash[i], ifv_list) if (ifv->ifv_vid == ifv2->ifv_vid) return (EEXIST); /* * Grow the hash when the number of vlans exceeds half of the number of * hash buckets squared. This will make the average linked-list length * buckets/2. */ if (trunk->refcnt > (b * b) / 2) { vlan_growhash(trunk, 1); i = HASH(ifv->ifv_vid, trunk->hmask); } LIST_INSERT_HEAD(&trunk->hash[i], ifv, ifv_list); trunk->refcnt++; return (0); } static int vlan_remhash(struct ifvlantrunk *trunk, struct ifvlan *ifv) { int i, b; struct ifvlan *ifv2; TRUNK_LOCK_ASSERT(trunk); KASSERT(trunk->hwidth > 0, ("%s: hwidth not positive", __func__)); b = 1 << trunk->hwidth; i = HASH(ifv->ifv_vid, trunk->hmask); LIST_FOREACH(ifv2, &trunk->hash[i], ifv_list) if (ifv2 == ifv) { trunk->refcnt--; LIST_REMOVE(ifv2, ifv_list); if (trunk->refcnt < (b * b) / 2) vlan_growhash(trunk, -1); return (0); } panic("%s: vlan not found\n", __func__); return (ENOENT); /*NOTREACHED*/ } /* * Grow the hash larger or smaller if memory permits. */ static void vlan_growhash(struct ifvlantrunk *trunk, int howmuch) { struct ifvlan *ifv; struct ifvlanhead *hash2; int hwidth2, i, j, n, n2; TRUNK_LOCK_ASSERT(trunk); KASSERT(trunk->hwidth > 0, ("%s: hwidth not positive", __func__)); if (howmuch == 0) { /* Harmless yet obvious coding error */ printf("%s: howmuch is 0\n", __func__); return; } hwidth2 = trunk->hwidth + howmuch; n = 1 << trunk->hwidth; n2 = 1 << hwidth2; /* Do not shrink the table below the default */ if (hwidth2 < VLAN_DEF_HWIDTH) return; /* M_NOWAIT because we're called with trunk mutex held */ hash2 = malloc(sizeof(struct ifvlanhead) * n2, M_VLAN, M_NOWAIT); if (hash2 == NULL) { printf("%s: out of memory -- hash size not changed\n", __func__); return; /* We can live with the old hash table */ } for (j = 0; j < n2; j++) LIST_INIT(&hash2[j]); for (i = 0; i < n; i++) while ((ifv = LIST_FIRST(&trunk->hash[i])) != NULL) { LIST_REMOVE(ifv, ifv_list); j = HASH(ifv->ifv_vid, n2 - 1); LIST_INSERT_HEAD(&hash2[j], ifv, ifv_list); } free(trunk->hash, M_VLAN); trunk->hash = hash2; trunk->hwidth = hwidth2; trunk->hmask = n2 - 1; if (bootverbose) if_printf(trunk->parent, "VLAN hash table resized from %d to %d buckets\n", n, n2); } static __inline struct ifvlan * vlan_gethash(struct ifvlantrunk *trunk, uint16_t vid) { struct ifvlan *ifv; TRUNK_LOCK_RASSERT(trunk); LIST_FOREACH(ifv, &trunk->hash[HASH(vid, trunk->hmask)], ifv_list) if (ifv->ifv_vid == vid) return (ifv); return (NULL); } #if 0 /* Debugging code to view the hashtables. */ static void vlan_dumphash(struct ifvlantrunk *trunk) { int i; struct ifvlan *ifv; for (i = 0; i < (1 << trunk->hwidth); i++) { printf("%d: ", i); LIST_FOREACH(ifv, &trunk->hash[i], ifv_list) printf("%s ", ifv->ifv_ifp->if_xname); printf("\n"); } } #endif /* 0 */ #else static __inline struct ifvlan * vlan_gethash(struct ifvlantrunk *trunk, uint16_t vid) { return trunk->vlans[vid]; } static __inline int vlan_inshash(struct ifvlantrunk *trunk, struct ifvlan *ifv) { if (trunk->vlans[ifv->ifv_vid] != NULL) return EEXIST; trunk->vlans[ifv->ifv_vid] = ifv; trunk->refcnt++; return (0); } static __inline int vlan_remhash(struct ifvlantrunk *trunk, struct ifvlan *ifv) { trunk->vlans[ifv->ifv_vid] = NULL; trunk->refcnt--; return (0); } static __inline void vlan_freehash(struct ifvlantrunk *trunk) { } static __inline void vlan_inithash(struct ifvlantrunk *trunk) { } #endif /* !VLAN_ARRAY */ static void trunk_destroy(struct ifvlantrunk *trunk) { VLAN_LOCK_ASSERT(); TRUNK_LOCK(trunk); vlan_freehash(trunk); trunk->parent->if_vlantrunk = NULL; TRUNK_UNLOCK(trunk); TRUNK_LOCK_DESTROY(trunk); free(trunk, M_VLAN); } /* * Program our multicast filter. What we're actually doing is * programming the multicast filter of the parent. This has the * side effect of causing the parent interface to receive multicast * traffic that it doesn't really want, which ends up being discarded * later by the upper protocol layers. Unfortunately, there's no way * to avoid this: there really is only one physical interface. */ static int vlan_setmulti(struct ifnet *ifp) { struct ifnet *ifp_p; struct ifmultiaddr *ifma; struct ifvlan *sc; struct vlan_mc_entry *mc; int error; /* Find the parent. */ sc = ifp->if_softc; TRUNK_LOCK_ASSERT(TRUNK(sc)); ifp_p = PARENT(sc); CURVNET_SET_QUIET(ifp_p->if_vnet); /* First, remove any existing filter entries. */ while ((mc = SLIST_FIRST(&sc->vlan_mc_listhead)) != NULL) { SLIST_REMOVE_HEAD(&sc->vlan_mc_listhead, mc_entries); (void)if_delmulti(ifp_p, (struct sockaddr *)&mc->mc_addr); free(mc, M_VLAN); } /* Now program new ones. */ IF_ADDR_WLOCK(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_LINK) continue; mc = malloc(sizeof(struct vlan_mc_entry), M_VLAN, M_NOWAIT); if (mc == NULL) { IF_ADDR_WUNLOCK(ifp); return (ENOMEM); } bcopy(ifma->ifma_addr, &mc->mc_addr, ifma->ifma_addr->sa_len); mc->mc_addr.sdl_index = ifp_p->if_index; SLIST_INSERT_HEAD(&sc->vlan_mc_listhead, mc, mc_entries); } IF_ADDR_WUNLOCK(ifp); SLIST_FOREACH (mc, &sc->vlan_mc_listhead, mc_entries) { error = if_addmulti(ifp_p, (struct sockaddr *)&mc->mc_addr, NULL); if (error) return (error); } CURVNET_RESTORE(); return (0); } /* * A handler for parent interface link layer address changes. * If the parent interface link layer address is changed we * should also change it on all children vlans. */ static void vlan_iflladdr(void *arg __unused, struct ifnet *ifp) { struct ifvlan *ifv; #ifndef VLAN_ARRAY struct ifvlan *next; #endif int i; /* * Check if it's a trunk interface first of all * to avoid needless locking. */ if (ifp->if_vlantrunk == NULL) return; VLAN_LOCK(); /* * OK, it's a trunk. Loop over and change all vlan's lladdrs on it. */ #ifdef VLAN_ARRAY for (i = 0; i < VLAN_ARRAY_SIZE; i++) if ((ifv = ifp->if_vlantrunk->vlans[i])) { #else /* VLAN_ARRAY */ for (i = 0; i < (1 << ifp->if_vlantrunk->hwidth); i++) LIST_FOREACH_SAFE(ifv, &ifp->if_vlantrunk->hash[i], ifv_list, next) { #endif /* VLAN_ARRAY */ VLAN_UNLOCK(); if_setlladdr(ifv->ifv_ifp, IF_LLADDR(ifp), ifp->if_addrlen); VLAN_LOCK(); } VLAN_UNLOCK(); } /* * A handler for network interface departure events. * Track departure of trunks here so that we don't access invalid * pointers or whatever if a trunk is ripped from under us, e.g., * by ejecting its hot-plug card. However, if an ifnet is simply * being renamed, then there's no need to tear down the state. */ static void vlan_ifdetach(void *arg __unused, struct ifnet *ifp) { struct ifvlan *ifv; int i; /* * Check if it's a trunk interface first of all * to avoid needless locking. */ if (ifp->if_vlantrunk == NULL) return; /* If the ifnet is just being renamed, don't do anything. */ if (ifp->if_flags & IFF_RENAMING) return; VLAN_LOCK(); /* * OK, it's a trunk. Loop over and detach all vlan's on it. * Check trunk pointer after each vlan_unconfig() as it will * free it and set to NULL after the last vlan was detached. */ #ifdef VLAN_ARRAY for (i = 0; i < VLAN_ARRAY_SIZE; i++) if ((ifv = ifp->if_vlantrunk->vlans[i])) { vlan_unconfig_locked(ifv->ifv_ifp, 1); if (ifp->if_vlantrunk == NULL) break; } #else /* VLAN_ARRAY */ restart: for (i = 0; i < (1 << ifp->if_vlantrunk->hwidth); i++) if ((ifv = LIST_FIRST(&ifp->if_vlantrunk->hash[i]))) { vlan_unconfig_locked(ifv->ifv_ifp, 1); if (ifp->if_vlantrunk) goto restart; /* trunk->hwidth can change */ else break; } #endif /* VLAN_ARRAY */ /* Trunk should have been destroyed in vlan_unconfig(). */ KASSERT(ifp->if_vlantrunk == NULL, ("%s: purge failed", __func__)); VLAN_UNLOCK(); } /* * Return the trunk device for a virtual interface. */ static struct ifnet * vlan_trunkdev(struct ifnet *ifp) { struct ifvlan *ifv; if (ifp->if_type != IFT_L2VLAN) return (NULL); ifv = ifp->if_softc; ifp = NULL; VLAN_LOCK(); if (ifv->ifv_trunk) ifp = PARENT(ifv); VLAN_UNLOCK(); return (ifp); } /* * Return the 12-bit VLAN VID for this interface, for use by external * components such as Infiniband. * * XXXRW: Note that the function name here is historical; it should be named * vlan_vid(). */ static int vlan_tag(struct ifnet *ifp, uint16_t *vidp) { struct ifvlan *ifv; if (ifp->if_type != IFT_L2VLAN) return (EINVAL); ifv = ifp->if_softc; *vidp = ifv->ifv_vid; return (0); } /* * Return a driver specific cookie for this interface. Synchronization * with setcookie must be provided by the driver. */ static void * vlan_cookie(struct ifnet *ifp) { struct ifvlan *ifv; if (ifp->if_type != IFT_L2VLAN) return (NULL); ifv = ifp->if_softc; return (ifv->ifv_cookie); } /* * Store a cookie in our softc that drivers can use to store driver * private per-instance data in. */ static int vlan_setcookie(struct ifnet *ifp, void *cookie) { struct ifvlan *ifv; if (ifp->if_type != IFT_L2VLAN) return (EINVAL); ifv = ifp->if_softc; ifv->ifv_cookie = cookie; return (0); } /* * Return the vlan device present at the specific VID. */ static struct ifnet * vlan_devat(struct ifnet *ifp, uint16_t vid) { struct ifvlantrunk *trunk; struct ifvlan *ifv; TRUNK_LOCK_READER; trunk = ifp->if_vlantrunk; if (trunk == NULL) return (NULL); ifp = NULL; TRUNK_RLOCK(trunk); ifv = vlan_gethash(trunk, vid); if (ifv) ifp = ifv->ifv_ifp; TRUNK_RUNLOCK(trunk); return (ifp); } /* * VLAN support can be loaded as a module. The only place in the * system that's intimately aware of this is ether_input. We hook * into this code through vlan_input_p which is defined there and * set here. Noone else in the system should be aware of this so * we use an explicit reference here. */ extern void (*vlan_input_p)(struct ifnet *, struct mbuf *); /* For if_link_state_change() eyes only... */ extern void (*vlan_link_state_p)(struct ifnet *); static int vlan_modevent(module_t mod, int type, void *data) { switch (type) { case MOD_LOAD: ifdetach_tag = EVENTHANDLER_REGISTER(ifnet_departure_event, vlan_ifdetach, NULL, EVENTHANDLER_PRI_ANY); if (ifdetach_tag == NULL) return (ENOMEM); iflladdr_tag = EVENTHANDLER_REGISTER(iflladdr_event, vlan_iflladdr, NULL, EVENTHANDLER_PRI_ANY); if (iflladdr_tag == NULL) return (ENOMEM); VLAN_LOCK_INIT(); vlan_input_p = vlan_input; vlan_link_state_p = vlan_link_state; vlan_trunk_cap_p = vlan_trunk_capabilities; vlan_trunkdev_p = vlan_trunkdev; vlan_cookie_p = vlan_cookie; vlan_setcookie_p = vlan_setcookie; vlan_tag_p = vlan_tag; vlan_devat_p = vlan_devat; #ifndef VIMAGE vlan_cloner = if_clone_advanced(vlanname, 0, vlan_clone_match, vlan_clone_create, vlan_clone_destroy); #endif if (bootverbose) printf("vlan: initialized, using " #ifdef VLAN_ARRAY "full-size arrays" #else "hash tables with chaining" #endif "\n"); break; case MOD_UNLOAD: #ifndef VIMAGE if_clone_detach(vlan_cloner); #endif EVENTHANDLER_DEREGISTER(ifnet_departure_event, ifdetach_tag); EVENTHANDLER_DEREGISTER(iflladdr_event, iflladdr_tag); vlan_input_p = NULL; vlan_link_state_p = NULL; vlan_trunk_cap_p = NULL; vlan_trunkdev_p = NULL; vlan_tag_p = NULL; vlan_cookie_p = NULL; vlan_setcookie_p = NULL; vlan_devat_p = NULL; VLAN_LOCK_DESTROY(); if (bootverbose) printf("vlan: unloaded\n"); break; default: return (EOPNOTSUPP); } return (0); } static moduledata_t vlan_mod = { "if_vlan", vlan_modevent, 0 }; DECLARE_MODULE(if_vlan, vlan_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); MODULE_VERSION(if_vlan, 3); #ifdef VIMAGE static void vnet_vlan_init(const void *unused __unused) { vlan_cloner = if_clone_advanced(vlanname, 0, vlan_clone_match, vlan_clone_create, vlan_clone_destroy); V_vlan_cloner = vlan_cloner; } VNET_SYSINIT(vnet_vlan_init, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, vnet_vlan_init, NULL); static void vnet_vlan_uninit(const void *unused __unused) { if_clone_detach(V_vlan_cloner); } VNET_SYSUNINIT(vnet_vlan_uninit, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_FIRST, vnet_vlan_uninit, NULL); #endif static struct ifnet * vlan_clone_match_ethervid(struct if_clone *ifc, const char *name, int *vidp) { const char *cp; struct ifnet *ifp; int vid; /* Check for . style interface names. */ IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { /* * We can handle non-ethernet hardware types as long as * they handle the tagging and headers themselves. */ if (ifp->if_type != IFT_ETHER && (ifp->if_capenable & IFCAP_VLAN_HWTAGGING) == 0) continue; if (strncmp(ifp->if_xname, name, strlen(ifp->if_xname)) != 0) continue; cp = name + strlen(ifp->if_xname); if (*cp++ != '.') continue; if (*cp == '\0') continue; vid = 0; for(; *cp >= '0' && *cp <= '9'; cp++) vid = (vid * 10) + (*cp - '0'); if (*cp != '\0') continue; if (vidp != NULL) *vidp = vid; break; } IFNET_RUNLOCK_NOSLEEP(); return (ifp); } static int vlan_clone_match(struct if_clone *ifc, const char *name) { const char *cp; if (vlan_clone_match_ethervid(ifc, name, NULL) != NULL) return (1); if (strncmp(vlanname, name, strlen(vlanname)) != 0) return (0); for (cp = name + 4; *cp != '\0'; cp++) { if (*cp < '0' || *cp > '9') return (0); } return (1); } static int vlan_clone_create(struct if_clone *ifc, char *name, size_t len, caddr_t params) { char *dp; int wildcard; int unit; int error; int vid; int ethertag; struct ifvlan *ifv; struct ifnet *ifp; struct ifnet *p; struct ifaddr *ifa; struct sockaddr_dl *sdl; struct vlanreq vlr; static const u_char eaddr[ETHER_ADDR_LEN]; /* 00:00:00:00:00:00 */ /* * There are 3 (ugh) ways to specify the cloned device: * o pass a parameter block with the clone request. * o specify parameters in the text of the clone device name * o specify no parameters and get an unattached device that * must be configured separately. * The first technique is preferred; the latter two are * supported for backwards compatibilty. * * XXXRW: Note historic use of the word "tag" here. New ioctls may be * called for. */ if (params) { error = copyin(params, &vlr, sizeof(vlr)); if (error) return error; p = ifunit(vlr.vlr_parent); if (p == NULL) return ENXIO; /* * Don't let the caller set up a VLAN VID with * anything except VLID bits. */ if (vlr.vlr_tag & ~EVL_VLID_MASK) return (EINVAL); error = ifc_name2unit(name, &unit); if (error != 0) return (error); ethertag = 1; vid = vlr.vlr_tag; wildcard = (unit < 0); } else if ((p = vlan_clone_match_ethervid(ifc, name, &vid)) != NULL) { ethertag = 1; unit = -1; wildcard = 0; /* * Don't let the caller set up a VLAN VID with * anything except VLID bits. */ if (vid & ~EVL_VLID_MASK) return (EINVAL); } else { ethertag = 0; error = ifc_name2unit(name, &unit); if (error != 0) return (error); wildcard = (unit < 0); } error = ifc_alloc_unit(ifc, &unit); if (error != 0) return (error); /* In the wildcard case, we need to update the name. */ if (wildcard) { for (dp = name; *dp != '\0'; dp++); if (snprintf(dp, len - (dp-name), "%d", unit) > len - (dp-name) - 1) { panic("%s: interface name too long", __func__); } } ifv = malloc(sizeof(struct ifvlan), M_VLAN, M_WAITOK | M_ZERO); ifp = ifv->ifv_ifp = if_alloc(IFT_ETHER); if (ifp == NULL) { ifc_free_unit(ifc, unit); free(ifv, M_VLAN); return (ENOSPC); } SLIST_INIT(&ifv->vlan_mc_listhead); /* Prepare pcpu counters */ ifv->ifv_ipackets = counter_u64_alloc(M_WAITOK); ifv->ifv_opackets = counter_u64_alloc(M_WAITOK); ifv->ifv_ibytes = counter_u64_alloc(M_WAITOK); ifv->ifv_obytes = counter_u64_alloc(M_WAITOK); ifv->ifv_omcasts = counter_u64_alloc(M_WAITOK); ifp->if_softc = ifv; /* * Set the name manually rather than using if_initname because * we don't conform to the default naming convention for interfaces. */ strlcpy(ifp->if_xname, name, IFNAMSIZ); ifp->if_dname = vlanname; ifp->if_dunit = unit; /* NB: flags are not set here */ ifp->if_linkmib = &ifv->ifv_mib; ifp->if_linkmiblen = sizeof(ifv->ifv_mib); /* NB: mtu is not set here */ ifp->if_init = vlan_init; ifp->if_transmit = vlan_transmit; ifp->if_qflush = vlan_qflush; ifp->if_ioctl = vlan_ioctl; ifp->if_flags = VLAN_IFFLAGS; ifp->if_get_counter = vlan_get_counter; ether_ifattach(ifp, eaddr); /* Now undo some of the damage... */ ifp->if_baudrate = 0; ifp->if_type = IFT_L2VLAN; ifp->if_hdrlen = ETHER_VLAN_ENCAP_LEN; ifa = ifp->if_addr; sdl = (struct sockaddr_dl *)ifa->ifa_addr; sdl->sdl_type = IFT_L2VLAN; if (ethertag) { error = vlan_config(ifv, p, vid); if (error != 0) { /* * Since we've partially failed, we need to back * out all the way, otherwise userland could get * confused. Thus, we destroy the interface. */ ether_ifdetach(ifp); vlan_unconfig(ifp); if_free(ifp); ifc_free_unit(ifc, unit); free(ifv, M_VLAN); return (error); } /* Update flags on the parent, if necessary. */ vlan_setflags(ifp, 1); } return (0); } static int vlan_clone_destroy(struct if_clone *ifc, struct ifnet *ifp) { struct ifvlan *ifv = ifp->if_softc; int unit = ifp->if_dunit; ether_ifdetach(ifp); /* first, remove it from system-wide lists */ vlan_unconfig(ifp); /* now it can be unconfigured and freed */ if_free(ifp); counter_u64_free(ifv->ifv_ipackets); counter_u64_free(ifv->ifv_opackets); counter_u64_free(ifv->ifv_ibytes); counter_u64_free(ifv->ifv_obytes); counter_u64_free(ifv->ifv_omcasts); free(ifv, M_VLAN); ifc_free_unit(ifc, unit); return (0); } /* * The ifp->if_init entry point for vlan(4) is a no-op. */ static void vlan_init(void *foo __unused) { } /* * The if_transmit method for vlan(4) interface. */ static int vlan_transmit(struct ifnet *ifp, struct mbuf *m) { struct ifvlan *ifv; struct ifnet *p; int error, len, mcast; ifv = ifp->if_softc; p = PARENT(ifv); len = m->m_pkthdr.len; mcast = (m->m_flags & (M_MCAST | M_BCAST)) ? 1 : 0; BPF_MTAP(ifp, m); /* * Do not run parent's if_transmit() if the parent is not up, * or parent's driver will cause a system crash. */ if (!UP_AND_RUNNING(p)) { m_freem(m); ifp->if_oerrors++; return (ENETDOWN); } /* * Pad the frame to the minimum size allowed if told to. * This option is in accord with IEEE Std 802.1Q, 2003 Ed., * paragraph C.4.4.3.b. It can help to work around buggy * bridges that violate paragraph C.4.4.3.a from the same * document, i.e., fail to pad short frames after untagging. * E.g., a tagged frame 66 bytes long (incl. FCS) is OK, but * untagging it will produce a 62-byte frame, which is a runt * and requires padding. There are VLAN-enabled network * devices that just discard such runts instead or mishandle * them somehow. */ if (soft_pad && p->if_type == IFT_ETHER) { static char pad[8]; /* just zeros */ int n; for (n = ETHERMIN + ETHER_HDR_LEN - m->m_pkthdr.len; n > 0; n -= sizeof(pad)) if (!m_append(m, min(n, sizeof(pad)), pad)) break; if (n > 0) { if_printf(ifp, "cannot pad short frame\n"); ifp->if_oerrors++; m_freem(m); return (0); } } /* * If underlying interface can do VLAN tag insertion itself, * just pass the packet along. However, we need some way to * tell the interface where the packet came from so that it * knows how to find the VLAN tag to use, so we attach a * packet tag that holds it. */ if (p->if_capenable & IFCAP_VLAN_HWTAGGING) { m->m_pkthdr.ether_vtag = ifv->ifv_vid; m->m_flags |= M_VLANTAG; } else { m = ether_vlanencap(m, ifv->ifv_vid); if (m == NULL) { if_printf(ifp, "unable to prepend VLAN header\n"); ifp->if_oerrors++; return (0); } } /* * Send it, precisely as ether_output() would have. */ error = (p->if_transmit)(p, m); if (error == 0) { counter_u64_add(ifv->ifv_opackets, 1); counter_u64_add(ifv->ifv_obytes, len); counter_u64_add(ifv->ifv_omcasts, 1); } else ifp->if_oerrors++; return (error); } static uint64_t vlan_get_counter(struct ifnet *ifp, ifnet_counter cnt) { struct ifvlan *ifv; ifv = ifp->if_softc; switch (cnt) { case IFCOUNTER_IPACKETS: return (counter_u64_fetch(ifv->ifv_ipackets)); case IFCOUNTER_OPACKETS: return (counter_u64_fetch(ifv->ifv_opackets)); case IFCOUNTER_IBYTES: return (counter_u64_fetch(ifv->ifv_ibytes)); case IFCOUNTER_OBYTES: return (counter_u64_fetch(ifv->ifv_obytes)); case IFCOUNTER_OMCASTS: return (counter_u64_fetch(ifv->ifv_omcasts)); default: return (if_get_counter_compat(ifp, cnt)); } /* NOTREACHED */ } /* * The ifp->if_qflush entry point for vlan(4) is a no-op. */ static void vlan_qflush(struct ifnet *ifp __unused) { } static void vlan_input(struct ifnet *ifp, struct mbuf *m) { struct ifvlantrunk *trunk = ifp->if_vlantrunk; struct ifvlan *ifv; TRUNK_LOCK_READER; uint16_t vid; KASSERT(trunk != NULL, ("%s: no trunk", __func__)); if (m->m_flags & M_VLANTAG) { /* * Packet is tagged, but m contains a normal * Ethernet frame; the tag is stored out-of-band. */ vid = EVL_VLANOFTAG(m->m_pkthdr.ether_vtag); m->m_flags &= ~M_VLANTAG; } else { struct ether_vlan_header *evl; /* * Packet is tagged in-band as specified by 802.1q. */ switch (ifp->if_type) { case IFT_ETHER: if (m->m_len < sizeof(*evl) && (m = m_pullup(m, sizeof(*evl))) == NULL) { if_printf(ifp, "cannot pullup VLAN header\n"); return; } evl = mtod(m, struct ether_vlan_header *); vid = EVL_VLANOFTAG(ntohs(evl->evl_tag)); /* * Remove the 802.1q header by copying the Ethernet * addresses over it and adjusting the beginning of * the data in the mbuf. The encapsulated Ethernet * type field is already in place. */ bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN, ETHER_HDR_LEN - ETHER_TYPE_LEN); m_adj(m, ETHER_VLAN_ENCAP_LEN); break; default: #ifdef INVARIANTS panic("%s: %s has unsupported if_type %u", __func__, ifp->if_xname, ifp->if_type); #endif m_freem(m); ifp->if_noproto++; return; } } TRUNK_RLOCK(trunk); ifv = vlan_gethash(trunk, vid); if (ifv == NULL || !UP_AND_RUNNING(ifv->ifv_ifp)) { TRUNK_RUNLOCK(trunk); m_freem(m); ifp->if_noproto++; return; } TRUNK_RUNLOCK(trunk); m->m_pkthdr.rcvif = ifv->ifv_ifp; counter_u64_add(ifv->ifv_ipackets, 1); counter_u64_add(ifv->ifv_ibytes, m->m_pkthdr.len); /* Pass it back through the parent's input routine. */ (*ifp->if_input)(ifv->ifv_ifp, m); } static int vlan_config(struct ifvlan *ifv, struct ifnet *p, uint16_t vid) { struct ifvlantrunk *trunk; struct ifnet *ifp; int error = 0; /* VID numbers 0x0 and 0xFFF are reserved */ if (vid == 0 || vid == 0xFFF) return (EINVAL); if (p->if_type != IFT_ETHER && (p->if_capenable & IFCAP_VLAN_HWTAGGING) == 0) return (EPROTONOSUPPORT); if ((p->if_flags & VLAN_IFFLAGS) != VLAN_IFFLAGS) return (EPROTONOSUPPORT); if (ifv->ifv_trunk) return (EBUSY); if (p->if_vlantrunk == NULL) { trunk = malloc(sizeof(struct ifvlantrunk), M_VLAN, M_WAITOK | M_ZERO); vlan_inithash(trunk); VLAN_LOCK(); if (p->if_vlantrunk != NULL) { /* A race that that is very unlikely to be hit. */ vlan_freehash(trunk); free(trunk, M_VLAN); goto exists; } TRUNK_LOCK_INIT(trunk); TRUNK_LOCK(trunk); p->if_vlantrunk = trunk; trunk->parent = p; } else { VLAN_LOCK(); exists: trunk = p->if_vlantrunk; TRUNK_LOCK(trunk); } ifv->ifv_vid = vid; /* must set this before vlan_inshash() */ error = vlan_inshash(trunk, ifv); if (error) goto done; ifv->ifv_proto = ETHERTYPE_VLAN; ifv->ifv_encaplen = ETHER_VLAN_ENCAP_LEN; ifv->ifv_mintu = ETHERMIN; ifv->ifv_pflags = 0; /* * If the parent supports the VLAN_MTU capability, * i.e. can Tx/Rx larger than ETHER_MAX_LEN frames, * use it. */ if (p->if_capenable & IFCAP_VLAN_MTU) { /* * No need to fudge the MTU since the parent can * handle extended frames. */ ifv->ifv_mtufudge = 0; } else { /* * Fudge the MTU by the encapsulation size. This * makes us incompatible with strictly compliant * 802.1Q implementations, but allows us to use * the feature with other NetBSD implementations, * which might still be useful. */ ifv->ifv_mtufudge = ifv->ifv_encaplen; } ifv->ifv_trunk = trunk; ifp = ifv->ifv_ifp; /* * Initialize fields from our parent. This duplicates some * work with ether_ifattach() but allows for non-ethernet * interfaces to also work. */ ifp->if_mtu = p->if_mtu - ifv->ifv_mtufudge; ifp->if_baudrate = p->if_baudrate; ifp->if_output = p->if_output; ifp->if_input = p->if_input; ifp->if_resolvemulti = p->if_resolvemulti; ifp->if_addrlen = p->if_addrlen; ifp->if_broadcastaddr = p->if_broadcastaddr; /* * Copy only a selected subset of flags from the parent. * Other flags are none of our business. */ #define VLAN_COPY_FLAGS (IFF_SIMPLEX) ifp->if_flags &= ~VLAN_COPY_FLAGS; ifp->if_flags |= p->if_flags & VLAN_COPY_FLAGS; #undef VLAN_COPY_FLAGS ifp->if_link_state = p->if_link_state; vlan_capabilities(ifv); /* * Set up our interface address to reflect the underlying * physical interface's. */ bcopy(IF_LLADDR(p), IF_LLADDR(ifp), p->if_addrlen); ((struct sockaddr_dl *)ifp->if_addr->ifa_addr)->sdl_alen = p->if_addrlen; /* * Configure multicast addresses that may already be * joined on the vlan device. */ (void)vlan_setmulti(ifp); /* XXX: VLAN lock held */ /* We are ready for operation now. */ ifp->if_drv_flags |= IFF_DRV_RUNNING; done: TRUNK_UNLOCK(trunk); if (error == 0) EVENTHANDLER_INVOKE(vlan_config, p, ifv->ifv_vid); VLAN_UNLOCK(); return (error); } static void vlan_unconfig(struct ifnet *ifp) { VLAN_LOCK(); vlan_unconfig_locked(ifp, 0); VLAN_UNLOCK(); } static void vlan_unconfig_locked(struct ifnet *ifp, int departing) { struct ifvlantrunk *trunk; struct vlan_mc_entry *mc; struct ifvlan *ifv; struct ifnet *parent; int error; VLAN_LOCK_ASSERT(); ifv = ifp->if_softc; trunk = ifv->ifv_trunk; parent = NULL; if (trunk != NULL) { TRUNK_LOCK(trunk); parent = trunk->parent; /* * Since the interface is being unconfigured, we need to * empty the list of multicast groups that we may have joined * while we were alive from the parent's list. */ while ((mc = SLIST_FIRST(&ifv->vlan_mc_listhead)) != NULL) { /* * If the parent interface is being detached, * all its multicast addresses have already * been removed. Warn about errors if * if_delmulti() does fail, but don't abort as * all callers expect vlan destruction to * succeed. */ if (!departing) { error = if_delmulti(parent, (struct sockaddr *)&mc->mc_addr); if (error) if_printf(ifp, "Failed to delete multicast address from parent: %d\n", error); } SLIST_REMOVE_HEAD(&ifv->vlan_mc_listhead, mc_entries); free(mc, M_VLAN); } vlan_setflags(ifp, 0); /* clear special flags on parent */ vlan_remhash(trunk, ifv); ifv->ifv_trunk = NULL; /* * Check if we were the last. */ if (trunk->refcnt == 0) { parent->if_vlantrunk = NULL; /* * XXXGL: If some ithread has already entered * vlan_input() and is now blocked on the trunk * lock, then it should preempt us right after * unlock and finish its work. Then we will acquire * lock again in trunk_destroy(). */ TRUNK_UNLOCK(trunk); trunk_destroy(trunk); } else TRUNK_UNLOCK(trunk); } /* Disconnect from parent. */ if (ifv->ifv_pflags) if_printf(ifp, "%s: ifv_pflags unclean\n", __func__); ifp->if_mtu = ETHERMTU; ifp->if_link_state = LINK_STATE_UNKNOWN; ifp->if_drv_flags &= ~IFF_DRV_RUNNING; /* * Only dispatch an event if vlan was * attached, otherwise there is nothing * to cleanup anyway. */ if (parent != NULL) EVENTHANDLER_INVOKE(vlan_unconfig, parent, ifv->ifv_vid); } /* Handle a reference counted flag that should be set on the parent as well */ static int vlan_setflag(struct ifnet *ifp, int flag, int status, int (*func)(struct ifnet *, int)) { struct ifvlan *ifv; int error; /* XXX VLAN_LOCK_ASSERT(); */ ifv = ifp->if_softc; status = status ? (ifp->if_flags & flag) : 0; /* Now "status" contains the flag value or 0 */ /* * See if recorded parent's status is different from what * we want it to be. If it is, flip it. We record parent's * status in ifv_pflags so that we won't clear parent's flag * we haven't set. In fact, we don't clear or set parent's * flags directly, but get or release references to them. * That's why we can be sure that recorded flags still are * in accord with actual parent's flags. */ if (status != (ifv->ifv_pflags & flag)) { error = (*func)(PARENT(ifv), status); if (error) return (error); ifv->ifv_pflags &= ~flag; ifv->ifv_pflags |= status; } return (0); } /* * Handle IFF_* flags that require certain changes on the parent: * if "status" is true, update parent's flags respective to our if_flags; * if "status" is false, forcedly clear the flags set on parent. */ static int vlan_setflags(struct ifnet *ifp, int status) { int error, i; for (i = 0; vlan_pflags[i].flag; i++) { error = vlan_setflag(ifp, vlan_pflags[i].flag, status, vlan_pflags[i].func); if (error) return (error); } return (0); } /* Inform all vlans that their parent has changed link state */ static void vlan_link_state(struct ifnet *ifp) { struct ifvlantrunk *trunk = ifp->if_vlantrunk; struct ifvlan *ifv; int i; TRUNK_LOCK(trunk); #ifdef VLAN_ARRAY for (i = 0; i < VLAN_ARRAY_SIZE; i++) if (trunk->vlans[i] != NULL) { ifv = trunk->vlans[i]; #else for (i = 0; i < (1 << trunk->hwidth); i++) LIST_FOREACH(ifv, &trunk->hash[i], ifv_list) { #endif ifv->ifv_ifp->if_baudrate = trunk->parent->if_baudrate; if_link_state_change(ifv->ifv_ifp, trunk->parent->if_link_state); } TRUNK_UNLOCK(trunk); } static void vlan_capabilities(struct ifvlan *ifv) { struct ifnet *p = PARENT(ifv); struct ifnet *ifp = ifv->ifv_ifp; TRUNK_LOCK_ASSERT(TRUNK(ifv)); /* * If the parent interface can do checksum offloading * on VLANs, then propagate its hardware-assisted * checksumming flags. Also assert that checksum * offloading requires hardware VLAN tagging. */ if (p->if_capabilities & IFCAP_VLAN_HWCSUM) ifp->if_capabilities = p->if_capabilities & IFCAP_HWCSUM; if (p->if_capenable & IFCAP_VLAN_HWCSUM && p->if_capenable & IFCAP_VLAN_HWTAGGING) { ifp->if_capenable = p->if_capenable & IFCAP_HWCSUM; ifp->if_hwassist = p->if_hwassist & (CSUM_IP | CSUM_TCP | CSUM_UDP | CSUM_SCTP); } else { ifp->if_capenable = 0; ifp->if_hwassist = 0; } /* * If the parent interface can do TSO on VLANs then * propagate the hardware-assisted flag. TSO on VLANs * does not necessarily require hardware VLAN tagging. */ - ifp->if_hw_tsomax = if_hw_tsomax_common(ifp->if_hw_tsomax, - p->if_hw_tsomax); + if (p->if_hw_tsomax > 0) + ifp->if_hw_tsomax = p->if_hw_tsomax; if (p->if_capabilities & IFCAP_VLAN_HWTSO) ifp->if_capabilities |= p->if_capabilities & IFCAP_TSO; if (p->if_capenable & IFCAP_VLAN_HWTSO) { ifp->if_capenable |= p->if_capenable & IFCAP_TSO; ifp->if_hwassist |= p->if_hwassist & CSUM_TSO; } else { ifp->if_capenable &= ~(p->if_capenable & IFCAP_TSO); ifp->if_hwassist &= ~(p->if_hwassist & CSUM_TSO); } /* * If the parent interface can offload TCP connections over VLANs then * propagate its TOE capability to the VLAN interface. * * All TOE drivers in the tree today can deal with VLANs. If this * changes then IFCAP_VLAN_TOE should be promoted to a full capability * with its own bit. */ #define IFCAP_VLAN_TOE IFCAP_TOE if (p->if_capabilities & IFCAP_VLAN_TOE) ifp->if_capabilities |= p->if_capabilities & IFCAP_TOE; if (p->if_capenable & IFCAP_VLAN_TOE) { TOEDEV(ifp) = TOEDEV(p); ifp->if_capenable |= p->if_capenable & IFCAP_TOE; } } static void vlan_trunk_capabilities(struct ifnet *ifp) { struct ifvlantrunk *trunk = ifp->if_vlantrunk; struct ifvlan *ifv; int i; TRUNK_LOCK(trunk); #ifdef VLAN_ARRAY for (i = 0; i < VLAN_ARRAY_SIZE; i++) if (trunk->vlans[i] != NULL) { ifv = trunk->vlans[i]; #else for (i = 0; i < (1 << trunk->hwidth); i++) { LIST_FOREACH(ifv, &trunk->hash[i], ifv_list) #endif vlan_capabilities(ifv); } TRUNK_UNLOCK(trunk); } static int vlan_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct ifnet *p; struct ifreq *ifr; struct ifaddr *ifa; struct ifvlan *ifv; struct ifvlantrunk *trunk; struct vlanreq vlr; int error = 0; ifr = (struct ifreq *)data; ifa = (struct ifaddr *) data; ifv = ifp->if_softc; switch (cmd) { case SIOCSIFADDR: ifp->if_flags |= IFF_UP; #ifdef INET if (ifa->ifa_addr->sa_family == AF_INET) arp_ifinit(ifp, ifa); #endif break; case SIOCGIFADDR: { struct sockaddr *sa; sa = (struct sockaddr *)&ifr->ifr_data; bcopy(IF_LLADDR(ifp), sa->sa_data, ifp->if_addrlen); } break; case SIOCGIFMEDIA: VLAN_LOCK(); if (TRUNK(ifv) != NULL) { p = PARENT(ifv); VLAN_UNLOCK(); error = (*p->if_ioctl)(p, SIOCGIFMEDIA, data); /* Limit the result to the parent's current config. */ if (error == 0) { struct ifmediareq *ifmr; ifmr = (struct ifmediareq *)data; if (ifmr->ifm_count >= 1 && ifmr->ifm_ulist) { ifmr->ifm_count = 1; error = copyout(&ifmr->ifm_current, ifmr->ifm_ulist, sizeof(int)); } } } else { VLAN_UNLOCK(); error = EINVAL; } break; case SIOCSIFMEDIA: error = EINVAL; break; case SIOCSIFMTU: /* * Set the interface MTU. */ VLAN_LOCK(); if (TRUNK(ifv) != NULL) { if (ifr->ifr_mtu > (PARENT(ifv)->if_mtu - ifv->ifv_mtufudge) || ifr->ifr_mtu < (ifv->ifv_mintu - ifv->ifv_mtufudge)) error = EINVAL; else ifp->if_mtu = ifr->ifr_mtu; } else error = EINVAL; VLAN_UNLOCK(); break; case SIOCSETVLAN: #ifdef VIMAGE /* * XXXRW/XXXBZ: The goal in these checks is to allow a VLAN * interface to be delegated to a jail without allowing the * jail to change what underlying interface/VID it is * associated with. We are not entirely convinced that this * is the right way to accomplish that policy goal. */ if (ifp->if_vnet != ifp->if_home_vnet) { error = EPERM; break; } #endif error = copyin(ifr->ifr_data, &vlr, sizeof(vlr)); if (error) break; if (vlr.vlr_parent[0] == '\0') { vlan_unconfig(ifp); break; } p = ifunit(vlr.vlr_parent); if (p == NULL) { error = ENOENT; break; } /* * Don't let the caller set up a VLAN VID with * anything except VLID bits. */ if (vlr.vlr_tag & ~EVL_VLID_MASK) { error = EINVAL; break; } error = vlan_config(ifv, p, vlr.vlr_tag); if (error) break; /* Update flags on the parent, if necessary. */ vlan_setflags(ifp, 1); break; case SIOCGETVLAN: #ifdef VIMAGE if (ifp->if_vnet != ifp->if_home_vnet) { error = EPERM; break; } #endif bzero(&vlr, sizeof(vlr)); VLAN_LOCK(); if (TRUNK(ifv) != NULL) { strlcpy(vlr.vlr_parent, PARENT(ifv)->if_xname, sizeof(vlr.vlr_parent)); vlr.vlr_tag = ifv->ifv_vid; } VLAN_UNLOCK(); error = copyout(&vlr, ifr->ifr_data, sizeof(vlr)); break; case SIOCSIFFLAGS: /* * We should propagate selected flags to the parent, * e.g., promiscuous mode. */ if (TRUNK(ifv) != NULL) error = vlan_setflags(ifp, 1); break; case SIOCADDMULTI: case SIOCDELMULTI: /* * If we don't have a parent, just remember the membership for * when we do. */ trunk = TRUNK(ifv); if (trunk != NULL) { TRUNK_LOCK(trunk); error = vlan_setmulti(ifp); TRUNK_UNLOCK(trunk); } break; default: error = EINVAL; break; } return (error); } Index: head/sys/netinet/tcp_output.c =================================================================== --- head/sys/netinet/tcp_output.c (revision 271550) +++ head/sys/netinet/tcp_output.c (revision 271551) @@ -1,1651 +1,1580 @@ /*- * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)tcp_output.c 8.4 (Berkeley) 5/24/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_tcpdebug.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include -#include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #include #endif #define TCPOUTFLAGS #include #include #include #include #include #ifdef TCPDEBUG #include #endif #ifdef TCP_OFFLOAD #include #endif #ifdef IPSEC #include #endif /*IPSEC*/ #include #include VNET_DEFINE(int, path_mtu_discovery) = 1; SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, path_mtu_discovery, CTLFLAG_RW, &VNET_NAME(path_mtu_discovery), 1, "Enable Path MTU Discovery"); VNET_DEFINE(int, tcp_do_tso) = 1; #define V_tcp_do_tso VNET(tcp_do_tso) SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, tso, CTLFLAG_RW, &VNET_NAME(tcp_do_tso), 0, "Enable TCP Segmentation Offload"); VNET_DEFINE(int, tcp_sendspace) = 1024*32; #define V_tcp_sendspace VNET(tcp_sendspace) SYSCTL_VNET_INT(_net_inet_tcp, TCPCTL_SENDSPACE, sendspace, CTLFLAG_RW, &VNET_NAME(tcp_sendspace), 0, "Initial send socket buffer size"); VNET_DEFINE(int, tcp_do_autosndbuf) = 1; #define V_tcp_do_autosndbuf VNET(tcp_do_autosndbuf) SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, sendbuf_auto, CTLFLAG_RW, &VNET_NAME(tcp_do_autosndbuf), 0, "Enable automatic send buffer sizing"); VNET_DEFINE(int, tcp_autosndbuf_inc) = 8*1024; #define V_tcp_autosndbuf_inc VNET(tcp_autosndbuf_inc) SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, sendbuf_inc, CTLFLAG_RW, &VNET_NAME(tcp_autosndbuf_inc), 0, "Incrementor step size of automatic send buffer"); VNET_DEFINE(int, tcp_autosndbuf_max) = 2*1024*1024; #define V_tcp_autosndbuf_max VNET(tcp_autosndbuf_max) SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, sendbuf_max, CTLFLAG_RW, &VNET_NAME(tcp_autosndbuf_max), 0, "Max size of automatic send buffer"); static void inline hhook_run_tcp_est_out(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to, long len, int tso); static void inline cc_after_idle(struct tcpcb *tp); /* * Wrapper for the TCP established output helper hook. */ static void inline hhook_run_tcp_est_out(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to, long len, int tso) { struct tcp_hhook_data hhook_data; if (V_tcp_hhh[HHOOK_TCP_EST_OUT]->hhh_nhooks > 0) { hhook_data.tp = tp; hhook_data.th = th; hhook_data.to = to; hhook_data.len = len; hhook_data.tso = tso; hhook_run_hooks(V_tcp_hhh[HHOOK_TCP_EST_OUT], &hhook_data, tp->osd); } } /* * CC wrapper hook functions */ static void inline cc_after_idle(struct tcpcb *tp) { INP_WLOCK_ASSERT(tp->t_inpcb); if (CC_ALGO(tp)->after_idle != NULL) CC_ALGO(tp)->after_idle(tp->ccv); } /* * Tcp output routine: figure out what should be sent and send it. */ int tcp_output(struct tcpcb *tp) { struct socket *so = tp->t_inpcb->inp_socket; long len, recwin, sendwin; int off, flags, error = 0; /* Keep compiler happy */ struct mbuf *m; struct ip *ip = NULL; struct ipovly *ipov = NULL; struct tcphdr *th; u_char opt[TCP_MAXOLEN]; unsigned ipoptlen, optlen, hdrlen; #ifdef IPSEC unsigned ipsec_optlen = 0; #endif int idle, sendalot; int sack_rxmit, sack_bytes_rxmt; struct sackhole *p; int tso, mtu; struct tcpopt to; #if 0 int maxburst = TCP_MAXBURST; #endif #ifdef INET6 struct ip6_hdr *ip6 = NULL; int isipv6; isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0; #endif INP_WLOCK_ASSERT(tp->t_inpcb); #ifdef TCP_OFFLOAD if (tp->t_flags & TF_TOE) return (tcp_offload_output(tp)); #endif /* * Determine length of data that should be transmitted, * and flags that will be used. * If there is some data or critical controls (SYN, RST) * to send, then transmit; otherwise, investigate further. */ idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una); if (idle && ticks - tp->t_rcvtime >= tp->t_rxtcur) cc_after_idle(tp); tp->t_flags &= ~TF_LASTIDLE; if (idle) { if (tp->t_flags & TF_MORETOCOME) { tp->t_flags |= TF_LASTIDLE; idle = 0; } } again: /* * If we've recently taken a timeout, snd_max will be greater than * snd_nxt. There may be SACK information that allows us to avoid * resending already delivered data. Adjust snd_nxt accordingly. */ if ((tp->t_flags & TF_SACK_PERMIT) && SEQ_LT(tp->snd_nxt, tp->snd_max)) tcp_sack_adjust(tp); sendalot = 0; tso = 0; mtu = 0; off = tp->snd_nxt - tp->snd_una; sendwin = min(tp->snd_wnd, tp->snd_cwnd); flags = tcp_outflags[tp->t_state]; /* * Send any SACK-generated retransmissions. If we're explicitly trying * to send out new data (when sendalot is 1), bypass this function. * If we retransmit in fast recovery mode, decrement snd_cwnd, since * we're replacing a (future) new transmission with a retransmission * now, and we previously incremented snd_cwnd in tcp_input(). */ /* * Still in sack recovery , reset rxmit flag to zero. */ sack_rxmit = 0; sack_bytes_rxmt = 0; len = 0; p = NULL; if ((tp->t_flags & TF_SACK_PERMIT) && IN_FASTRECOVERY(tp->t_flags) && (p = tcp_sack_output(tp, &sack_bytes_rxmt))) { long cwin; cwin = min(tp->snd_wnd, tp->snd_cwnd) - sack_bytes_rxmt; if (cwin < 0) cwin = 0; /* Do not retransmit SACK segments beyond snd_recover */ if (SEQ_GT(p->end, tp->snd_recover)) { /* * (At least) part of sack hole extends beyond * snd_recover. Check to see if we can rexmit data * for this hole. */ if (SEQ_GEQ(p->rxmit, tp->snd_recover)) { /* * Can't rexmit any more data for this hole. * That data will be rexmitted in the next * sack recovery episode, when snd_recover * moves past p->rxmit. */ p = NULL; goto after_sack_rexmit; } else /* Can rexmit part of the current hole */ len = ((long)ulmin(cwin, tp->snd_recover - p->rxmit)); } else len = ((long)ulmin(cwin, p->end - p->rxmit)); off = p->rxmit - tp->snd_una; KASSERT(off >= 0,("%s: sack block to the left of una : %d", __func__, off)); if (len > 0) { sack_rxmit = 1; sendalot = 1; TCPSTAT_INC(tcps_sack_rexmits); TCPSTAT_ADD(tcps_sack_rexmit_bytes, min(len, tp->t_maxseg)); } } after_sack_rexmit: /* * Get standard flags, and add SYN or FIN if requested by 'hidden' * state flags. */ if (tp->t_flags & TF_NEEDFIN) flags |= TH_FIN; if (tp->t_flags & TF_NEEDSYN) flags |= TH_SYN; SOCKBUF_LOCK(&so->so_snd); /* * If in persist timeout with window of 0, send 1 byte. * Otherwise, if window is small but nonzero * and timer expired, we will send what we can * and go to transmit state. */ if (tp->t_flags & TF_FORCEDATA) { if (sendwin == 0) { /* * If we still have some data to send, then * clear the FIN bit. Usually this would * happen below when it realizes that we * aren't sending all the data. However, * if we have exactly 1 byte of unsent data, * then it won't clear the FIN bit below, * and if we are in persist state, we wind * up sending the packet without recording * that we sent the FIN bit. * * We can't just blindly clear the FIN bit, * because if we don't have any more data * to send then the probe will be the FIN * itself. */ if (off < so->so_snd.sb_cc) flags &= ~TH_FIN; sendwin = 1; } else { tcp_timer_activate(tp, TT_PERSIST, 0); tp->t_rxtshift = 0; } } /* * If snd_nxt == snd_max and we have transmitted a FIN, the * offset will be > 0 even if so_snd.sb_cc is 0, resulting in * a negative length. This can also occur when TCP opens up * its congestion window while receiving additional duplicate * acks after fast-retransmit because TCP will reset snd_nxt * to snd_max after the fast-retransmit. * * In the normal retransmit-FIN-only case, however, snd_nxt will * be set to snd_una, the offset will be 0, and the length may * wind up 0. * * If sack_rxmit is true we are retransmitting from the scoreboard * in which case len is already set. */ if (sack_rxmit == 0) { if (sack_bytes_rxmt == 0) len = ((long)ulmin(so->so_snd.sb_cc, sendwin) - off); else { long cwin; /* * We are inside of a SACK recovery episode and are * sending new data, having retransmitted all the * data possible in the scoreboard. */ len = ((long)ulmin(so->so_snd.sb_cc, tp->snd_wnd) - off); /* * Don't remove this (len > 0) check ! * We explicitly check for len > 0 here (although it * isn't really necessary), to work around a gcc * optimization issue - to force gcc to compute * len above. Without this check, the computation * of len is bungled by the optimizer. */ if (len > 0) { cwin = tp->snd_cwnd - (tp->snd_nxt - tp->sack_newdata) - sack_bytes_rxmt; if (cwin < 0) cwin = 0; len = lmin(len, cwin); } } } /* * Lop off SYN bit if it has already been sent. However, if this * is SYN-SENT state and if segment contains data and if we don't * know that foreign host supports TAO, suppress sending segment. */ if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) { if (tp->t_state != TCPS_SYN_RECEIVED) flags &= ~TH_SYN; off--, len++; } /* * Be careful not to send data and/or FIN on SYN segments. * This measure is needed to prevent interoperability problems * with not fully conformant TCP implementations. */ if ((flags & TH_SYN) && (tp->t_flags & TF_NOOPT)) { len = 0; flags &= ~TH_FIN; } if (len < 0) { /* * If FIN has been sent but not acked, * but we haven't been called to retransmit, * len will be < 0. Otherwise, window shrank * after we sent into it. If window shrank to 0, * cancel pending retransmit, pull snd_nxt back * to (closed) window, and set the persist timer * if it isn't already going. If the window didn't * close completely, just wait for an ACK. */ len = 0; if (sendwin == 0) { tcp_timer_activate(tp, TT_REXMT, 0); tp->t_rxtshift = 0; tp->snd_nxt = tp->snd_una; if (!tcp_timer_active(tp, TT_PERSIST)) tcp_setpersist(tp); } } /* len will be >= 0 after this point. */ KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__)); /* * Automatic sizing of send socket buffer. Often the send buffer * size is not optimally adjusted to the actual network conditions * at hand (delay bandwidth product). Setting the buffer size too * small limits throughput on links with high bandwidth and high * delay (eg. trans-continental/oceanic links). Setting the * buffer size too big consumes too much real kernel memory, * especially with many connections on busy servers. * * The criteria to step up the send buffer one notch are: * 1. receive window of remote host is larger than send buffer * (with a fudge factor of 5/4th); * 2. send buffer is filled to 7/8th with data (so we actually * have data to make use of it); * 3. send buffer fill has not hit maximal automatic size; * 4. our send window (slow start and cogestion controlled) is * larger than sent but unacknowledged data in send buffer. * * The remote host receive window scaling factor may limit the * growing of the send buffer before it reaches its allowed * maximum. * * It scales directly with slow start or congestion window * and does at most one step per received ACK. This fast * scaling has the drawback of growing the send buffer beyond * what is strictly necessary to make full use of a given * delay*bandwith product. However testing has shown this not * to be much of an problem. At worst we are trading wasting * of available bandwith (the non-use of it) for wasting some * socket buffer memory. * * TODO: Shrink send buffer during idle periods together * with congestion window. Requires another timer. Has to * wait for upcoming tcp timer rewrite. */ if (V_tcp_do_autosndbuf && so->so_snd.sb_flags & SB_AUTOSIZE) { if ((tp->snd_wnd / 4 * 5) >= so->so_snd.sb_hiwat && so->so_snd.sb_cc >= (so->so_snd.sb_hiwat / 8 * 7) && so->so_snd.sb_cc < V_tcp_autosndbuf_max && sendwin >= (so->so_snd.sb_cc - (tp->snd_nxt - tp->snd_una))) { if (!sbreserve_locked(&so->so_snd, min(so->so_snd.sb_hiwat + V_tcp_autosndbuf_inc, V_tcp_autosndbuf_max), so, curthread)) so->so_snd.sb_flags &= ~SB_AUTOSIZE; } } /* * Decide if we can use TCP Segmentation Offloading (if supported by * hardware). * * TSO may only be used if we are in a pure bulk sending state. The * presence of TCP-MD5, SACK retransmits, SACK advertizements and * IP options prevent using TSO. With TSO the TCP header is the same * (except for the sequence number) for all generated packets. This * makes it impossible to transmit any options which vary per generated * segment or packet. */ #ifdef IPSEC /* * Pre-calculate here as we save another lookup into the darknesses * of IPsec that way and can actually decide if TSO is ok. */ ipsec_optlen = ipsec_hdrsiz_tcp(tp); #endif if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && len > tp->t_maxseg && ((tp->t_flags & TF_SIGNATURE) == 0) && tp->rcv_numsacks == 0 && sack_rxmit == 0 && #ifdef IPSEC ipsec_optlen == 0 && #endif tp->t_inpcb->inp_options == NULL && tp->t_inpcb->in6p_options == NULL) tso = 1; if (sack_rxmit) { if (SEQ_LT(p->rxmit + len, tp->snd_una + so->so_snd.sb_cc)) flags &= ~TH_FIN; } else { if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.sb_cc)) flags &= ~TH_FIN; } recwin = sbspace(&so->so_rcv); /* * Sender silly window avoidance. We transmit under the following * conditions when len is non-zero: * * - We have a full segment (or more with TSO) * - This is the last buffer in a write()/send() and we are * either idle or running NODELAY * - we've timed out (e.g. persist timer) * - we have more then 1/2 the maximum send window's worth of * data (receiver may be limited the window size) * - we need to retransmit */ if (len) { if (len >= tp->t_maxseg) goto send; /* * NOTE! on localhost connections an 'ack' from the remote * end may occur synchronously with the output and cause * us to flush a buffer queued with moretocome. XXX * * note: the len + off check is almost certainly unnecessary. */ if (!(tp->t_flags & TF_MORETOCOME) && /* normal case */ (idle || (tp->t_flags & TF_NODELAY)) && len + off >= so->so_snd.sb_cc && (tp->t_flags & TF_NOPUSH) == 0) { goto send; } if (tp->t_flags & TF_FORCEDATA) /* typ. timeout case */ goto send; if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) goto send; if (SEQ_LT(tp->snd_nxt, tp->snd_max)) /* retransmit case */ goto send; if (sack_rxmit) goto send; } /* * Sending of standalone window updates. * * Window updates are important when we close our window due to a * full socket buffer and are opening it again after the application * reads data from it. Once the window has opened again and the * remote end starts to send again the ACK clock takes over and * provides the most current window information. * * We must avoid the silly window syndrome whereas every read * from the receive buffer, no matter how small, causes a window * update to be sent. We also should avoid sending a flurry of * window updates when the socket buffer had queued a lot of data * and the application is doing small reads. * * Prevent a flurry of pointless window updates by only sending * an update when we can increase the advertized window by more * than 1/4th of the socket buffer capacity. When the buffer is * getting full or is very small be more aggressive and send an * update whenever we can increase by two mss sized segments. * In all other situations the ACK's to new incoming data will * carry further window increases. * * Don't send an independent window update if a delayed * ACK is pending (it will get piggy-backed on it) or the * remote side already has done a half-close and won't send * more data. Skip this if the connection is in T/TCP * half-open state. */ if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN) && !(tp->t_flags & TF_DELACK) && !TCPS_HAVERCVDFIN(tp->t_state)) { /* * "adv" is the amount we could increase the window, * taking into account that we are limited by * TCP_MAXWIN << tp->rcv_scale. */ long adv; int oldwin; adv = min(recwin, (long)TCP_MAXWIN << tp->rcv_scale); if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) { oldwin = (tp->rcv_adv - tp->rcv_nxt); adv -= oldwin; } else oldwin = 0; /* * If the new window size ends up being the same as the old * size when it is scaled, then don't force a window update. */ if (oldwin >> tp->rcv_scale == (adv + oldwin) >> tp->rcv_scale) goto dontupdate; if (adv >= (long)(2 * tp->t_maxseg) && (adv >= (long)(so->so_rcv.sb_hiwat / 4) || recwin <= (long)(so->so_rcv.sb_hiwat / 8) || so->so_rcv.sb_hiwat <= 8 * tp->t_maxseg)) goto send; } dontupdate: /* * Send if we owe the peer an ACK, RST, SYN, or urgent data. ACKNOW * is also a catch-all for the retransmit timer timeout case. */ if (tp->t_flags & TF_ACKNOW) goto send; if ((flags & TH_RST) || ((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0)) goto send; if (SEQ_GT(tp->snd_up, tp->snd_una)) goto send; /* * If our state indicates that FIN should be sent * and we have not yet done so, then we need to send. */ if (flags & TH_FIN && ((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una)) goto send; /* * In SACK, it is possible for tcp_output to fail to send a segment * after the retransmission timer has been turned off. Make sure * that the retransmission timer is set. */ if ((tp->t_flags & TF_SACK_PERMIT) && SEQ_GT(tp->snd_max, tp->snd_una) && !tcp_timer_active(tp, TT_REXMT) && !tcp_timer_active(tp, TT_PERSIST)) { tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur); goto just_return; } /* * TCP window updates are not reliable, rather a polling protocol * using ``persist'' packets is used to insure receipt of window * updates. The three ``states'' for the output side are: * idle not doing retransmits or persists * persisting to move a small or zero window * (re)transmitting and thereby not persisting * * tcp_timer_active(tp, TT_PERSIST) * is true when we are in persist state. * (tp->t_flags & TF_FORCEDATA) * is set when we are called to send a persist packet. * tcp_timer_active(tp, TT_REXMT) * is set when we are retransmitting * The output side is idle when both timers are zero. * * If send window is too small, there is data to transmit, and no * retransmit or persist is pending, then go to persist state. * If nothing happens soon, send when timer expires: * if window is nonzero, transmit what we can, * otherwise force out a byte. */ if (so->so_snd.sb_cc && !tcp_timer_active(tp, TT_REXMT) && !tcp_timer_active(tp, TT_PERSIST)) { tp->t_rxtshift = 0; tcp_setpersist(tp); } /* * No reason to send a segment, just return. */ just_return: SOCKBUF_UNLOCK(&so->so_snd); return (0); send: SOCKBUF_LOCK_ASSERT(&so->so_snd); /* * Before ESTABLISHED, force sending of initial options * unless TCP set not to do any options. * NOTE: we assume that the IP/TCP header plus TCP options * always fit in a single mbuf, leaving room for a maximum * link header, i.e. * max_linkhdr + sizeof (struct tcpiphdr) + optlen <= MCLBYTES */ optlen = 0; #ifdef INET6 if (isipv6) hdrlen = sizeof (struct ip6_hdr) + sizeof (struct tcphdr); else #endif hdrlen = sizeof (struct tcpiphdr); /* * Compute options for segment. * We only have to care about SYN and established connection * segments. Options for SYN-ACK segments are handled in TCP * syncache. */ if ((tp->t_flags & TF_NOOPT) == 0) { to.to_flags = 0; /* Maximum segment size. */ if (flags & TH_SYN) { tp->snd_nxt = tp->iss; to.to_mss = tcp_mssopt(&tp->t_inpcb->inp_inc); to.to_flags |= TOF_MSS; } /* Window scaling. */ if ((flags & TH_SYN) && (tp->t_flags & TF_REQ_SCALE)) { to.to_wscale = tp->request_r_scale; to.to_flags |= TOF_SCALE; } /* Timestamps. */ if ((tp->t_flags & TF_RCVD_TSTMP) || ((flags & TH_SYN) && (tp->t_flags & TF_REQ_TSTMP))) { to.to_tsval = tcp_ts_getticks() + tp->ts_offset; to.to_tsecr = tp->ts_recent; to.to_flags |= TOF_TS; /* Set receive buffer autosizing timestamp. */ if (tp->rfbuf_ts == 0 && (so->so_rcv.sb_flags & SB_AUTOSIZE)) tp->rfbuf_ts = tcp_ts_getticks(); } /* Selective ACK's. */ if (tp->t_flags & TF_SACK_PERMIT) { if (flags & TH_SYN) to.to_flags |= TOF_SACKPERM; else if (TCPS_HAVEESTABLISHED(tp->t_state) && (tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks > 0) { to.to_flags |= TOF_SACK; to.to_nsacks = tp->rcv_numsacks; to.to_sacks = (u_char *)tp->sackblks; } } #ifdef TCP_SIGNATURE /* TCP-MD5 (RFC2385). */ if (tp->t_flags & TF_SIGNATURE) to.to_flags |= TOF_SIGNATURE; #endif /* TCP_SIGNATURE */ /* Processing the options. */ hdrlen += optlen = tcp_addoptions(&to, opt); } #ifdef INET6 if (isipv6) ipoptlen = ip6_optlen(tp->t_inpcb); else #endif if (tp->t_inpcb->inp_options) ipoptlen = tp->t_inpcb->inp_options->m_len - offsetof(struct ipoption, ipopt_list); else ipoptlen = 0; #ifdef IPSEC ipoptlen += ipsec_optlen; #endif /* * Adjust data length if insertion of options will * bump the packet length beyond the t_maxopd length. * Clear the FIN bit because we cut off the tail of * the segment. */ if (len + optlen + ipoptlen > tp->t_maxopd) { flags &= ~TH_FIN; if (tso) { - u_int if_hw_tsomax_bytes; - u_int if_hw_tsomax_frag_count; - u_int if_hw_tsomax_frag_size; - struct mbuf *mb; - u_int moff; - int max_len; - - /* extract TSO information */ - if_hw_tsomax_bytes = - IF_HW_TSOMAX_GET_BYTES(tp->t_tsomax); - if_hw_tsomax_frag_count = - IF_HW_TSOMAX_GET_FRAG_COUNT(tp->t_tsomax); - if_hw_tsomax_frag_size = - IF_HW_TSOMAX_GET_FRAG_SIZE(tp->t_tsomax); - - /* compute maximum TSO length */ - max_len = (if_hw_tsomax_bytes - hdrlen); - - /* clamp maximum length value */ - if (max_len > IP_MAXPACKET) - max_len = IP_MAXPACKET; - else if (max_len < 0) - max_len = 0; - - /* get smallest length */ - if (len > (u_int)max_len) { - if (max_len != 0) - sendalot = 1; - len = (u_int)max_len; - } - KASSERT(ipoptlen == 0, ("%s: TSO can't do IP options", __func__)); - max_len = 0; - mb = sbsndptr(&so->so_snd, off, len, &moff); - - /* now make sure the number of fragments fit too */ - while (mb != NULL && (u_int)max_len < len) { - u_int cur_length; - u_int cur_frags; - - /* - * Get length of mbuf fragment and how - * many hardware frags, rounded up, it - * would use: - */ - cur_length = (mb->m_len - moff); - cur_frags = (cur_length + - (1 << if_hw_tsomax_frag_size) - 1) >> - if_hw_tsomax_frag_size; - - /* Handle special case: Zero Length Mbuf */ - if (cur_frags == 0) - cur_frags = 1; - - /* - * Check if the fragment limit will be - * reached or exceeded: - */ - if (cur_frags >= if_hw_tsomax_frag_count) { - max_len += min(cur_length, - if_hw_tsomax_frag_count << - if_hw_tsomax_frag_size); - break; - } - max_len += cur_length; - if_hw_tsomax_frag_count -= cur_frags; - moff = 0; - mb = mb->m_next; - } - /* * Limit a burst to t_tsomax minus IP, * TCP and options length to keep ip->ip_len * from overflowing or exceeding the maximum * length allowed by the network interface. */ - if (len > (u_int)max_len) { - if (max_len != 0) - sendalot = 1; - len = (u_int)max_len; + if (len > tp->t_tsomax - hdrlen) { + len = tp->t_tsomax - hdrlen; + sendalot = 1; } /* * Prevent the last segment from being * fractional unless the send sockbuf can * be emptied. */ if (sendalot && off + len < so->so_snd.sb_cc) { len -= len % (tp->t_maxopd - optlen); sendalot = 1; } /* * Send the FIN in a separate segment * after the bulk sending is done. * We don't trust the TSO implementations * to clear the FIN flag on all but the * last segment. */ if (tp->t_flags & TF_NEEDFIN) sendalot = 1; } else { len = tp->t_maxopd - optlen - ipoptlen; sendalot = 1; } } else tso = 0; KASSERT(len + hdrlen + ipoptlen <= IP_MAXPACKET, ("%s: len > IP_MAXPACKET", __func__)); /*#ifdef DIAGNOSTIC*/ #ifdef INET6 if (max_linkhdr + hdrlen > MCLBYTES) #else if (max_linkhdr + hdrlen > MHLEN) #endif panic("tcphdr too big"); /*#endif*/ /* * This KASSERT is here to catch edge cases at a well defined place. * Before, those had triggered (random) panic conditions further down. */ KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__)); /* * Grab a header mbuf, attaching a copy of data to * be transmitted, and initialize the header from * the template for sends on this connection. */ if (len) { struct mbuf *mb; u_int moff; if ((tp->t_flags & TF_FORCEDATA) && len == 1) TCPSTAT_INC(tcps_sndprobe); else if (SEQ_LT(tp->snd_nxt, tp->snd_max) || sack_rxmit) { tp->t_sndrexmitpack++; TCPSTAT_INC(tcps_sndrexmitpack); TCPSTAT_ADD(tcps_sndrexmitbyte, len); } else { TCPSTAT_INC(tcps_sndpack); TCPSTAT_ADD(tcps_sndbyte, len); } #ifdef INET6 if (MHLEN < hdrlen + max_linkhdr) m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); else #endif m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) { SOCKBUF_UNLOCK(&so->so_snd); error = ENOBUFS; sack_rxmit = 0; goto out; } m->m_data += max_linkhdr; m->m_len = hdrlen; /* * Start the m_copy functions from the closest mbuf * to the offset in the socket buffer chain. */ mb = sbsndptr(&so->so_snd, off, len, &moff); if (len <= MHLEN - hdrlen - max_linkhdr) { m_copydata(mb, moff, (int)len, mtod(m, caddr_t) + hdrlen); m->m_len += len; } else { m->m_next = m_copy(mb, moff, (int)len); if (m->m_next == NULL) { SOCKBUF_UNLOCK(&so->so_snd); (void) m_free(m); error = ENOBUFS; sack_rxmit = 0; goto out; } } /* * If we're sending everything we've got, set PUSH. * (This will keep happy those implementations which only * give data to the user when a buffer fills or * a PUSH comes in.) */ if (off + len == so->so_snd.sb_cc) flags |= TH_PUSH; SOCKBUF_UNLOCK(&so->so_snd); } else { SOCKBUF_UNLOCK(&so->so_snd); if (tp->t_flags & TF_ACKNOW) TCPSTAT_INC(tcps_sndacks); else if (flags & (TH_SYN|TH_FIN|TH_RST)) TCPSTAT_INC(tcps_sndctrl); else if (SEQ_GT(tp->snd_up, tp->snd_una)) TCPSTAT_INC(tcps_sndurg); else TCPSTAT_INC(tcps_sndwinup); m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) { error = ENOBUFS; sack_rxmit = 0; goto out; } #ifdef INET6 if (isipv6 && (MHLEN < hdrlen + max_linkhdr) && MHLEN >= hdrlen) { MH_ALIGN(m, hdrlen); } else #endif m->m_data += max_linkhdr; m->m_len = hdrlen; } SOCKBUF_UNLOCK_ASSERT(&so->so_snd); m->m_pkthdr.rcvif = (struct ifnet *)0; #ifdef MAC mac_inpcb_create_mbuf(tp->t_inpcb, m); #endif #ifdef INET6 if (isipv6) { ip6 = mtod(m, struct ip6_hdr *); th = (struct tcphdr *)(ip6 + 1); tcpip_fillheaders(tp->t_inpcb, ip6, th); } else #endif /* INET6 */ { ip = mtod(m, struct ip *); ipov = (struct ipovly *)ip; th = (struct tcphdr *)(ip + 1); tcpip_fillheaders(tp->t_inpcb, ip, th); } /* * Fill in fields, remembering maximum advertised * window for use in delaying messages about window sizes. * If resending a FIN, be sure not to use a new sequence number. */ if (flags & TH_FIN && tp->t_flags & TF_SENTFIN && tp->snd_nxt == tp->snd_max) tp->snd_nxt--; /* * If we are starting a connection, send ECN setup * SYN packet. If we are on a retransmit, we may * resend those bits a number of times as per * RFC 3168. */ if (tp->t_state == TCPS_SYN_SENT && V_tcp_do_ecn) { if (tp->t_rxtshift >= 1) { if (tp->t_rxtshift <= V_tcp_ecn_maxretries) flags |= TH_ECE|TH_CWR; } else flags |= TH_ECE|TH_CWR; } if (tp->t_state == TCPS_ESTABLISHED && (tp->t_flags & TF_ECN_PERMIT)) { /* * If the peer has ECN, mark data packets with * ECN capable transmission (ECT). * Ignore pure ack packets, retransmissions and window probes. */ if (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max) && !((tp->t_flags & TF_FORCEDATA) && len == 1)) { #ifdef INET6 if (isipv6) ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20); else #endif ip->ip_tos |= IPTOS_ECN_ECT0; TCPSTAT_INC(tcps_ecn_ect0); } /* * Reply with proper ECN notifications. */ if (tp->t_flags & TF_ECN_SND_CWR) { flags |= TH_CWR; tp->t_flags &= ~TF_ECN_SND_CWR; } if (tp->t_flags & TF_ECN_SND_ECE) flags |= TH_ECE; } /* * If we are doing retransmissions, then snd_nxt will * not reflect the first unsent octet. For ACK only * packets, we do not want the sequence number of the * retransmitted packet, we want the sequence number * of the next unsent octet. So, if there is no data * (and no SYN or FIN), use snd_max instead of snd_nxt * when filling in ti_seq. But if we are in persist * state, snd_max might reflect one byte beyond the * right edge of the window, so use snd_nxt in that * case, since we know we aren't doing a retransmission. * (retransmit and persist are mutually exclusive...) */ if (sack_rxmit == 0) { if (len || (flags & (TH_SYN|TH_FIN)) || tcp_timer_active(tp, TT_PERSIST)) th->th_seq = htonl(tp->snd_nxt); else th->th_seq = htonl(tp->snd_max); } else { th->th_seq = htonl(p->rxmit); p->rxmit += len; tp->sackhint.sack_bytes_rexmit += len; } th->th_ack = htonl(tp->rcv_nxt); if (optlen) { bcopy(opt, th + 1, optlen); th->th_off = (sizeof (struct tcphdr) + optlen) >> 2; } th->th_flags = flags; /* * Calculate receive window. Don't shrink window, * but avoid silly window syndrome. */ if (recwin < (long)(so->so_rcv.sb_hiwat / 4) && recwin < (long)tp->t_maxseg) recwin = 0; if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) && recwin < (long)(tp->rcv_adv - tp->rcv_nxt)) recwin = (long)(tp->rcv_adv - tp->rcv_nxt); if (recwin > (long)TCP_MAXWIN << tp->rcv_scale) recwin = (long)TCP_MAXWIN << tp->rcv_scale; /* * According to RFC1323 the window field in a SYN (i.e., a * or ) segment itself is never scaled. The * case is handled in syncache. */ if (flags & TH_SYN) th->th_win = htons((u_short) (min(sbspace(&so->so_rcv), TCP_MAXWIN))); else th->th_win = htons((u_short)(recwin >> tp->rcv_scale)); /* * Adjust the RXWIN0SENT flag - indicate that we have advertised * a 0 window. This may cause the remote transmitter to stall. This * flag tells soreceive() to disable delayed acknowledgements when * draining the buffer. This can occur if the receiver is attempting * to read more data than can be buffered prior to transmitting on * the connection. */ if (th->th_win == 0) { tp->t_sndzerowin++; tp->t_flags |= TF_RXWIN0SENT; } else tp->t_flags &= ~TF_RXWIN0SENT; if (SEQ_GT(tp->snd_up, tp->snd_nxt)) { th->th_urp = htons((u_short)(tp->snd_up - tp->snd_nxt)); th->th_flags |= TH_URG; } else /* * If no urgent pointer to send, then we pull * the urgent pointer to the left edge of the send window * so that it doesn't drift into the send window on sequence * number wraparound. */ tp->snd_up = tp->snd_una; /* drag it along */ #ifdef TCP_SIGNATURE if (tp->t_flags & TF_SIGNATURE) { int sigoff = to.to_signature - opt; tcp_signature_compute(m, 0, len, optlen, (u_char *)(th + 1) + sigoff, IPSEC_DIR_OUTBOUND); } #endif /* * Put TCP length in extended header, and then * checksum extended header and data. */ m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */ m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); #ifdef INET6 if (isipv6) { /* * ip6_plen is not need to be filled now, and will be filled * in ip6_output. */ m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; th->th_sum = in6_cksum_pseudo(ip6, sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP, 0); } #endif #if defined(INET6) && defined(INET) else #endif #ifdef INET { m->m_pkthdr.csum_flags = CSUM_TCP; th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) + IPPROTO_TCP + len + optlen)); /* IP version must be set here for ipv4/ipv6 checking later */ KASSERT(ip->ip_v == IPVERSION, ("%s: IP version incorrect: %d", __func__, ip->ip_v)); } #endif /* * Enable TSO and specify the size of the segments. * The TCP pseudo header checksum is always provided. * XXX: Fixme: This is currently not the case for IPv6. */ if (tso) { KASSERT(len > tp->t_maxopd - optlen, ("%s: len <= tso_segsz", __func__)); m->m_pkthdr.csum_flags |= CSUM_TSO; m->m_pkthdr.tso_segsz = tp->t_maxopd - optlen; } #ifdef IPSEC KASSERT(len + hdrlen + ipoptlen - ipsec_optlen == m_length(m, NULL), ("%s: mbuf chain shorter than expected: %ld + %u + %u - %u != %u", __func__, len, hdrlen, ipoptlen, ipsec_optlen, m_length(m, NULL))); #else KASSERT(len + hdrlen + ipoptlen == m_length(m, NULL), ("%s: mbuf chain shorter than expected: %ld + %u + %u != %u", __func__, len, hdrlen, ipoptlen, m_length(m, NULL))); #endif /* Run HHOOK_TCP_ESTABLISHED_OUT helper hooks. */ hhook_run_tcp_est_out(tp, th, &to, len, tso); #ifdef TCPDEBUG /* * Trace. */ if (so->so_options & SO_DEBUG) { u_short save = 0; #ifdef INET6 if (!isipv6) #endif { save = ipov->ih_len; ipov->ih_len = htons(m->m_pkthdr.len /* - hdrlen + (th->th_off << 2) */); } tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, void *), th, 0); #ifdef INET6 if (!isipv6) #endif ipov->ih_len = save; } #endif /* TCPDEBUG */ /* * Fill in IP length and desired time to live and * send to IP level. There should be a better way * to handle ttl and tos; we could keep them in * the template, but need a way to checksum without them. */ /* * m->m_pkthdr.len should have been set before checksum calculation, * because in6_cksum() need it. */ #ifdef INET6 if (isipv6) { struct route_in6 ro; bzero(&ro, sizeof(ro)); /* * we separately set hoplimit for every segment, since the * user might want to change the value via setsockopt. * Also, desired default hop limit might be changed via * Neighbor Discovery. */ ip6->ip6_hlim = in6_selecthlim(tp->t_inpcb, NULL); /* * Set the packet size here for the benefit of DTrace probes. * ip6_output() will set it properly; it's supposed to include * the option header lengths as well. */ ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6)); if (tp->t_state == TCPS_SYN_SENT) TCP_PROBE5(connect__request, NULL, tp, ip6, tp, th); TCP_PROBE5(send, NULL, tp, ip6, tp, th); /* TODO: IPv6 IP6TOS_ECT bit on */ error = ip6_output(m, tp->t_inpcb->in6p_outputopts, &ro, ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0), NULL, NULL, tp->t_inpcb); if (error == EMSGSIZE && ro.ro_rt != NULL) mtu = ro.ro_rt->rt_mtu; RO_RTFREE(&ro); } #endif /* INET6 */ #if defined(INET) && defined(INET6) else #endif #ifdef INET { struct route ro; bzero(&ro, sizeof(ro)); ip->ip_len = htons(m->m_pkthdr.len); #ifdef INET6 if (tp->t_inpcb->inp_vflag & INP_IPV6PROTO) ip->ip_ttl = in6_selecthlim(tp->t_inpcb, NULL); #endif /* INET6 */ /* * If we do path MTU discovery, then we set DF on every packet. * This might not be the best thing to do according to RFC3390 * Section 2. However the tcp hostcache migitates the problem * so it affects only the first tcp connection with a host. * * NB: Don't set DF on small MTU/MSS to have a safe fallback. */ if (V_path_mtu_discovery && tp->t_maxopd > V_tcp_minmss) ip->ip_off |= htons(IP_DF); if (tp->t_state == TCPS_SYN_SENT) TCP_PROBE5(connect__request, NULL, tp, ip, tp, th); TCP_PROBE5(send, NULL, tp, ip, tp, th); error = ip_output(m, tp->t_inpcb->inp_options, &ro, ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0), 0, tp->t_inpcb); if (error == EMSGSIZE && ro.ro_rt != NULL) mtu = ro.ro_rt->rt_mtu; RO_RTFREE(&ro); } #endif /* INET */ out: /* * In transmit state, time the transmission and arrange for * the retransmit. In persist state, just set snd_max. */ if ((tp->t_flags & TF_FORCEDATA) == 0 || !tcp_timer_active(tp, TT_PERSIST)) { tcp_seq startseq = tp->snd_nxt; /* * Advance snd_nxt over sequence space of this segment. */ if (flags & (TH_SYN|TH_FIN)) { if (flags & TH_SYN) tp->snd_nxt++; if (flags & TH_FIN) { tp->snd_nxt++; tp->t_flags |= TF_SENTFIN; } } if (sack_rxmit) goto timer; tp->snd_nxt += len; if (SEQ_GT(tp->snd_nxt, tp->snd_max)) { tp->snd_max = tp->snd_nxt; /* * Time this transmission if not a retransmission and * not currently timing anything. */ if (tp->t_rtttime == 0) { tp->t_rtttime = ticks; tp->t_rtseq = startseq; TCPSTAT_INC(tcps_segstimed); } } /* * Set retransmit timer if not currently set, * and not doing a pure ack or a keep-alive probe. * Initial value for retransmit timer is smoothed * round-trip time + 2 * round-trip time variance. * Initialize shift counter which is used for backoff * of retransmit time. */ timer: if (!tcp_timer_active(tp, TT_REXMT) && ((sack_rxmit && tp->snd_nxt != tp->snd_max) || (tp->snd_nxt != tp->snd_una))) { if (tcp_timer_active(tp, TT_PERSIST)) { tcp_timer_activate(tp, TT_PERSIST, 0); tp->t_rxtshift = 0; } tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur); } } else { /* * Persist case, update snd_max but since we are in * persist mode (no window) we do not update snd_nxt. */ int xlen = len; if (flags & TH_SYN) ++xlen; if (flags & TH_FIN) { ++xlen; tp->t_flags |= TF_SENTFIN; } if (SEQ_GT(tp->snd_nxt + xlen, tp->snd_max)) tp->snd_max = tp->snd_nxt + len; } if (error) { /* * We know that the packet was lost, so back out the * sequence number advance, if any. * * If the error is EPERM the packet got blocked by the * local firewall. Normally we should terminate the * connection but the blocking may have been spurious * due to a firewall reconfiguration cycle. So we treat * it like a packet loss and let the retransmit timer and * timeouts do their work over time. * XXX: It is a POLA question whether calling tcp_drop right * away would be the really correct behavior instead. */ if (((tp->t_flags & TF_FORCEDATA) == 0 || !tcp_timer_active(tp, TT_PERSIST)) && ((flags & TH_SYN) == 0) && (error != EPERM)) { if (sack_rxmit) { p->rxmit -= len; tp->sackhint.sack_bytes_rexmit -= len; KASSERT(tp->sackhint.sack_bytes_rexmit >= 0, ("sackhint bytes rtx >= 0")); } else tp->snd_nxt -= len; } SOCKBUF_UNLOCK_ASSERT(&so->so_snd); /* Check gotos. */ switch (error) { case EPERM: tp->t_softerror = error; return (error); case ENOBUFS: if (!tcp_timer_active(tp, TT_REXMT) && !tcp_timer_active(tp, TT_PERSIST)) tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur); tp->snd_cwnd = tp->t_maxseg; return (0); case EMSGSIZE: /* * For some reason the interface we used initially * to send segments changed to another or lowered * its MTU. * If TSO was active we either got an interface * without TSO capabilits or TSO was turned off. * If we obtained mtu from ip_output() then update * it and try again. */ if (tso) tp->t_flags &= ~TF_TSO; if (mtu != 0) { tcp_mss_update(tp, -1, mtu, NULL, NULL); goto again; } return (error); case EHOSTDOWN: case EHOSTUNREACH: case ENETDOWN: case ENETUNREACH: if (TCPS_HAVERCVDSYN(tp->t_state)) { tp->t_softerror = error; return (0); } /* FALLTHROUGH */ default: return (error); } } TCPSTAT_INC(tcps_sndtotal); /* * Data sent (as far as we can tell). * If this advertises a larger window than any other segment, * then remember the size of the advertised window. * Any pending ACK has now been sent. */ if (recwin >= 0 && SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv)) tp->rcv_adv = tp->rcv_nxt + recwin; tp->last_ack_sent = tp->rcv_nxt; tp->t_flags &= ~(TF_ACKNOW | TF_DELACK); if (tcp_timer_active(tp, TT_DELACK)) tcp_timer_activate(tp, TT_DELACK, 0); #if 0 /* * This completely breaks TCP if newreno is turned on. What happens * is that if delayed-acks are turned on on the receiver, this code * on the transmitter effectively destroys the TCP window, forcing * it to four packets (1.5Kx4 = 6K window). */ if (sendalot && --maxburst) goto again; #endif if (sendalot) goto again; return (0); } void tcp_setpersist(struct tcpcb *tp) { int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1; int tt; tp->t_flags &= ~TF_PREVVALID; if (tcp_timer_active(tp, TT_REXMT)) panic("tcp_setpersist: retransmit pending"); /* * Start/restart persistance timer. */ TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift], TCPTV_PERSMIN, TCPTV_PERSMAX); tcp_timer_activate(tp, TT_PERSIST, tt); if (tp->t_rxtshift < TCP_MAXRXTSHIFT) tp->t_rxtshift++; } /* * Insert TCP options according to the supplied parameters to the place * optp in a consistent way. Can handle unaligned destinations. * * The order of the option processing is crucial for optimal packing and * alignment for the scarce option space. * * The optimal order for a SYN/SYN-ACK segment is: * MSS (4) + NOP (1) + Window scale (3) + SACK permitted (2) + * Timestamp (10) + Signature (18) = 38 bytes out of a maximum of 40. * * The SACK options should be last. SACK blocks consume 8*n+2 bytes. * So a full size SACK blocks option is 34 bytes (with 4 SACK blocks). * At minimum we need 10 bytes (to generate 1 SACK block). If both * TCP Timestamps (12 bytes) and TCP Signatures (18 bytes) are present, * we only have 10 bytes for SACK options (40 - (12 + 18)). */ int tcp_addoptions(struct tcpopt *to, u_char *optp) { u_int mask, optlen = 0; for (mask = 1; mask < TOF_MAXOPT; mask <<= 1) { if ((to->to_flags & mask) != mask) continue; if (optlen == TCP_MAXOLEN) break; switch (to->to_flags & mask) { case TOF_MSS: while (optlen % 4) { optlen += TCPOLEN_NOP; *optp++ = TCPOPT_NOP; } if (TCP_MAXOLEN - optlen < TCPOLEN_MAXSEG) continue; optlen += TCPOLEN_MAXSEG; *optp++ = TCPOPT_MAXSEG; *optp++ = TCPOLEN_MAXSEG; to->to_mss = htons(to->to_mss); bcopy((u_char *)&to->to_mss, optp, sizeof(to->to_mss)); optp += sizeof(to->to_mss); break; case TOF_SCALE: while (!optlen || optlen % 2 != 1) { optlen += TCPOLEN_NOP; *optp++ = TCPOPT_NOP; } if (TCP_MAXOLEN - optlen < TCPOLEN_WINDOW) continue; optlen += TCPOLEN_WINDOW; *optp++ = TCPOPT_WINDOW; *optp++ = TCPOLEN_WINDOW; *optp++ = to->to_wscale; break; case TOF_SACKPERM: while (optlen % 2) { optlen += TCPOLEN_NOP; *optp++ = TCPOPT_NOP; } if (TCP_MAXOLEN - optlen < TCPOLEN_SACK_PERMITTED) continue; optlen += TCPOLEN_SACK_PERMITTED; *optp++ = TCPOPT_SACK_PERMITTED; *optp++ = TCPOLEN_SACK_PERMITTED; break; case TOF_TS: while (!optlen || optlen % 4 != 2) { optlen += TCPOLEN_NOP; *optp++ = TCPOPT_NOP; } if (TCP_MAXOLEN - optlen < TCPOLEN_TIMESTAMP) continue; optlen += TCPOLEN_TIMESTAMP; *optp++ = TCPOPT_TIMESTAMP; *optp++ = TCPOLEN_TIMESTAMP; to->to_tsval = htonl(to->to_tsval); to->to_tsecr = htonl(to->to_tsecr); bcopy((u_char *)&to->to_tsval, optp, sizeof(to->to_tsval)); optp += sizeof(to->to_tsval); bcopy((u_char *)&to->to_tsecr, optp, sizeof(to->to_tsecr)); optp += sizeof(to->to_tsecr); break; case TOF_SIGNATURE: { int siglen = TCPOLEN_SIGNATURE - 2; while (!optlen || optlen % 4 != 2) { optlen += TCPOLEN_NOP; *optp++ = TCPOPT_NOP; } if (TCP_MAXOLEN - optlen < TCPOLEN_SIGNATURE) continue; optlen += TCPOLEN_SIGNATURE; *optp++ = TCPOPT_SIGNATURE; *optp++ = TCPOLEN_SIGNATURE; to->to_signature = optp; while (siglen--) *optp++ = 0; break; } case TOF_SACK: { int sackblks = 0; struct sackblk *sack = (struct sackblk *)to->to_sacks; tcp_seq sack_seq; while (!optlen || optlen % 4 != 2) { optlen += TCPOLEN_NOP; *optp++ = TCPOPT_NOP; } if (TCP_MAXOLEN - optlen < TCPOLEN_SACKHDR + TCPOLEN_SACK) continue; optlen += TCPOLEN_SACKHDR; *optp++ = TCPOPT_SACK; sackblks = min(to->to_nsacks, (TCP_MAXOLEN - optlen) / TCPOLEN_SACK); *optp++ = TCPOLEN_SACKHDR + sackblks * TCPOLEN_SACK; while (sackblks--) { sack_seq = htonl(sack->start); bcopy((u_char *)&sack_seq, optp, sizeof(sack_seq)); optp += sizeof(sack_seq); sack_seq = htonl(sack->end); bcopy((u_char *)&sack_seq, optp, sizeof(sack_seq)); optp += sizeof(sack_seq); optlen += TCPOLEN_SACK; sack++; } TCPSTAT_INC(tcps_sack_send_blocks); break; } default: panic("%s: unknown TCP option type", __func__); break; } } /* Terminate and pad TCP options to a 4 byte boundary. */ if (optlen % 4) { optlen += TCPOLEN_EOL; *optp++ = TCPOPT_EOL; } /* * According to RFC 793 (STD0007): * "The content of the header beyond the End-of-Option option * must be header padding (i.e., zero)." * and later: "The padding is composed of zeros." */ while (optlen % 4) { optlen += TCPOLEN_PAD; *optp++ = TCPOPT_PAD; } KASSERT(optlen <= TCP_MAXOLEN, ("%s: TCP options too long", __func__)); return (optlen); } Index: head/sys/ofed/drivers/net/mlx4/en_netdev.c =================================================================== --- head/sys/ofed/drivers/net/mlx4/en_netdev.c (revision 271550) +++ head/sys/ofed/drivers/net/mlx4/en_netdev.c (revision 271551) @@ -1,1673 +1,1667 @@ /* * Copyright (c) 2007 Mellanox Technologies. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * */ #include "mlx4_en.h" #include #include #include #include #include #include #include #include #include static void mlx4_en_init_locked(struct mlx4_en_priv *priv); static void mlx4_en_sysctl_stat(struct mlx4_en_priv *priv); static void mlx4_en_vlan_rx_add_vid(void *arg, struct net_device *dev, u16 vid) { struct mlx4_en_priv *priv = netdev_priv(dev); int idx; u8 field; if (arg != priv) return; if ((vid == 0) || (vid > 4095)) /* Invalid */ return; en_dbg(HW, priv, "adding VLAN:%d\n", vid); idx = vid >> 5; field = 1 << (vid & 0x1f); spin_lock(&priv->vlan_lock); priv->vlgrp_modified = true; if (priv->vlan_unregister[idx] & field) priv->vlan_unregister[idx] &= ~field; else priv->vlan_register[idx] |= field; priv->vlans[idx] |= field; spin_unlock(&priv->vlan_lock); } static void mlx4_en_vlan_rx_kill_vid(void *arg, struct net_device *dev, u16 vid) { struct mlx4_en_priv *priv = netdev_priv(dev); int idx; u8 field; if (arg != priv) return; if ((vid == 0) || (vid > 4095)) /* Invalid */ return; en_dbg(HW, priv, "Killing VID:%d\n", vid); idx = vid >> 5; field = 1 << (vid & 0x1f); spin_lock(&priv->vlan_lock); priv->vlgrp_modified = true; if (priv->vlan_register[idx] & field) priv->vlan_register[idx] &= ~field; else priv->vlan_unregister[idx] |= field; priv->vlans[idx] &= ~field; spin_unlock(&priv->vlan_lock); } u64 mlx4_en_mac_to_u64(u8 *addr) { u64 mac = 0; int i; for (i = 0; i < ETHER_ADDR_LEN; i++) { mac <<= 8; mac |= addr[i]; } return mac; } static int mlx4_en_cache_mclist(struct net_device *dev, u64 **mcaddrp) { struct ifmultiaddr *ifma; u64 *mcaddr; int cnt; int i; *mcaddrp = NULL; restart: cnt = 0; if_maddr_rlock(dev); TAILQ_FOREACH(ifma, &dev->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_LINK) continue; if (((struct sockaddr_dl *)ifma->ifma_addr)->sdl_alen != ETHER_ADDR_LEN) continue; cnt++; } if_maddr_runlock(dev); if (cnt == 0) return (0); mcaddr = kmalloc(sizeof(u64) * cnt, GFP_KERNEL); if (mcaddr == NULL) return (0); i = 0; if_maddr_rlock(dev); TAILQ_FOREACH(ifma, &dev->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_LINK) continue; if (((struct sockaddr_dl *)ifma->ifma_addr)->sdl_alen != ETHER_ADDR_LEN) continue; /* Make sure the list didn't grow. */ if (i == cnt) { if_maddr_runlock(dev); kfree(mcaddr); goto restart; } mcaddr[i++] = mlx4_en_mac_to_u64( LLADDR((struct sockaddr_dl *)ifma->ifma_addr)); } if_maddr_runlock(dev); *mcaddrp = mcaddr; return (i); } static void mlx4_en_stop_port(struct net_device *dev) { struct mlx4_en_priv *priv = netdev_priv(dev); queue_work(priv->mdev->workqueue, &priv->stop_port_task); } static void mlx4_en_start_port(struct net_device *dev) { struct mlx4_en_priv *priv = netdev_priv(dev); queue_work(priv->mdev->workqueue, &priv->start_port_task); } static void mlx4_en_set_multicast(struct net_device *dev) { struct mlx4_en_priv *priv = netdev_priv(dev); if (!priv->port_up) return; queue_work(priv->mdev->workqueue, &priv->mcast_task); } static void mlx4_en_do_set_multicast(struct work_struct *work) { struct mlx4_en_priv *priv = container_of(work, struct mlx4_en_priv, mcast_task); struct net_device *dev = priv->dev; struct mlx4_en_dev *mdev = priv->mdev; int err; mutex_lock(&mdev->state_lock); if (!mdev->device_up) { en_dbg(HW, priv, "Card is not up, " "ignoring multicast change.\n"); goto out; } if (!priv->port_up) { en_dbg(HW, priv, "Port is down, " "ignoring multicast change.\n"); goto out; } /* * Promsicuous mode: disable all filters */ if (dev->if_flags & IFF_PROMISC) { if (!(priv->flags & MLX4_EN_FLAG_PROMISC)) { priv->flags |= MLX4_EN_FLAG_PROMISC; /* Enable promiscouos mode */ err = mlx4_SET_PORT_qpn_calc(mdev->dev, priv->port, priv->base_qpn, 1); if (err) en_err(priv, "Failed enabling " "promiscous mode\n"); /* Disable port multicast filter (unconditionally) */ err = mlx4_SET_MCAST_FLTR(mdev->dev, priv->port, 0, 0, MLX4_MCAST_DISABLE); if (err) en_err(priv, "Failed disabling " "multicast filter\n"); /* Disable port VLAN filter */ err = mlx4_SET_VLAN_FLTR(mdev->dev, priv->port, NULL); if (err) en_err(priv, "Failed disabling VLAN filter\n"); } goto out; } /* * Not in promiscous mode */ if (priv->flags & MLX4_EN_FLAG_PROMISC) { priv->flags &= ~MLX4_EN_FLAG_PROMISC; /* Disable promiscouos mode */ err = mlx4_SET_PORT_qpn_calc(mdev->dev, priv->port, priv->base_qpn, 0); if (err) en_err(priv, "Failed disabling promiscous mode\n"); /* Enable port VLAN filter */ err = mlx4_SET_VLAN_FLTR(mdev->dev, priv->port, priv->vlans); if (err) en_err(priv, "Failed enabling VLAN filter\n"); } /* Enable/disable the multicast filter according to IFF_ALLMULTI */ if (dev->if_flags & IFF_ALLMULTI) { err = mlx4_SET_MCAST_FLTR(mdev->dev, priv->port, 0, 0, MLX4_MCAST_DISABLE); if (err) en_err(priv, "Failed disabling multicast filter\n"); } else { u64 *mcaddr; int mccount; int i; err = mlx4_SET_MCAST_FLTR(mdev->dev, priv->port, 0, 0, MLX4_MCAST_DISABLE); if (err) en_err(priv, "Failed disabling multicast filter\n"); /* Flush mcast filter and init it with broadcast address */ mlx4_SET_MCAST_FLTR(mdev->dev, priv->port, ETH_BCAST, 1, MLX4_MCAST_CONFIG); /* Update multicast list - we cache all addresses so they won't * change while HW is updated holding the command semaphor */ mccount = mlx4_en_cache_mclist(dev, &mcaddr); for (i = 0; i < mccount; i++) mlx4_SET_MCAST_FLTR(mdev->dev, priv->port, mcaddr[i], 0, MLX4_MCAST_CONFIG); err = mlx4_SET_MCAST_FLTR(mdev->dev, priv->port, 0, 0, MLX4_MCAST_ENABLE); if (err) en_err(priv, "Failed enabling multicast filter\n"); kfree(mcaddr); } out: mutex_unlock(&mdev->state_lock); } #ifdef CONFIG_NET_POLL_CONTROLLER static void mlx4_en_netpoll(struct net_device *dev) { struct mlx4_en_priv *priv = netdev_priv(dev); struct mlx4_en_cq *cq; unsigned long flags; int i; for (i = 0; i < priv->rx_ring_num; i++) { cq = &priv->rx_cq[i]; spin_lock_irqsave(&cq->lock, flags); napi_synchronize(&cq->napi); mlx4_en_process_rx_cq(dev, cq, 0); spin_unlock_irqrestore(&cq->lock, flags); } } #endif static void mlx4_en_watchdog_timeout(void *arg) { struct mlx4_en_priv *priv = arg; struct mlx4_en_dev *mdev = priv->mdev; en_dbg(DRV, priv, "Scheduling watchdog\n"); queue_work(mdev->workqueue, &priv->watchdog_task); if (priv->port_up) callout_reset(&priv->watchdog_timer, MLX4_EN_WATCHDOG_TIMEOUT, mlx4_en_watchdog_timeout, priv); } /* XXX This clears user settings in too many cases. */ static void mlx4_en_set_default_moderation(struct mlx4_en_priv *priv) { struct mlx4_en_cq *cq; int i; /* If we haven't received a specific coalescing setting * (module param), we set the moderation paramters as follows: * - moder_cnt is set to the number of mtu sized packets to * satisfy our coelsing target. * - moder_time is set to a fixed value. */ priv->rx_frames = MLX4_EN_RX_COAL_TARGET / priv->dev->if_mtu + 1; priv->rx_usecs = MLX4_EN_RX_COAL_TIME; en_dbg(INTR, priv, "Default coalesing params for mtu:%u - " "rx_frames:%d rx_usecs:%d\n", priv->dev->if_mtu, priv->rx_frames, priv->rx_usecs); /* Setup cq moderation params */ for (i = 0; i < priv->rx_ring_num; i++) { cq = &priv->rx_cq[i]; cq->moder_cnt = priv->rx_frames; cq->moder_time = priv->rx_usecs; priv->last_moder_time[i] = MLX4_EN_AUTO_CONF; priv->last_moder_packets[i] = 0; priv->last_moder_bytes[i] = 0; } for (i = 0; i < priv->tx_ring_num; i++) { cq = &priv->tx_cq[i]; cq->moder_cnt = MLX4_EN_TX_COAL_PKTS; cq->moder_time = MLX4_EN_TX_COAL_TIME; } /* Reset auto-moderation params */ priv->pkt_rate_low = MLX4_EN_RX_RATE_LOW; priv->rx_usecs_low = MLX4_EN_RX_COAL_TIME_LOW; priv->pkt_rate_high = MLX4_EN_RX_RATE_HIGH; priv->rx_usecs_high = MLX4_EN_RX_COAL_TIME_HIGH; priv->sample_interval = MLX4_EN_SAMPLE_INTERVAL; priv->adaptive_rx_coal = 1; priv->last_moder_jiffies = 0; priv->last_moder_tx_packets = 0; } static void mlx4_en_auto_moderation(struct mlx4_en_priv *priv) { unsigned long period = (unsigned long) (jiffies - priv->last_moder_jiffies); struct mlx4_en_cq *cq; unsigned long packets; unsigned long rate; unsigned long avg_pkt_size; unsigned long rx_packets; unsigned long rx_bytes; unsigned long rx_pkt_diff; int moder_time; int ring, err; if (!priv->adaptive_rx_coal || period < priv->sample_interval * HZ) return; for (ring = 0; ring < priv->rx_ring_num; ring++) { spin_lock(&priv->stats_lock); rx_packets = priv->rx_ring[ring].packets; rx_bytes = priv->rx_ring[ring].bytes; spin_unlock(&priv->stats_lock); rx_pkt_diff = ((unsigned long) (rx_packets - priv->last_moder_packets[ring])); packets = rx_pkt_diff; rate = packets * HZ / period; avg_pkt_size = packets ? ((unsigned long) (rx_bytes - priv->last_moder_bytes[ring])) / packets : 0; /* Apply auto-moderation only when packet rate * exceeds a rate that it matters */ if (rate > (MLX4_EN_RX_RATE_THRESH / priv->rx_ring_num) && avg_pkt_size > MLX4_EN_AVG_PKT_SMALL) { if (rate < priv->pkt_rate_low || avg_pkt_size < MLX4_EN_AVG_PKT_SMALL) moder_time = priv->rx_usecs_low; else if (rate > priv->pkt_rate_high) moder_time = priv->rx_usecs_high; else moder_time = (rate - priv->pkt_rate_low) * (priv->rx_usecs_high - priv->rx_usecs_low) / (priv->pkt_rate_high - priv->pkt_rate_low) + priv->rx_usecs_low; } else { moder_time = priv->rx_usecs_low; } if (moder_time != priv->last_moder_time[ring]) { priv->last_moder_time[ring] = moder_time; cq = &priv->rx_cq[ring]; cq->moder_time = moder_time; err = mlx4_en_set_cq_moder(priv, cq); if (err) en_err(priv, "Failed modifying moderation " "for cq:%d\n", ring); } priv->last_moder_packets[ring] = rx_packets; priv->last_moder_bytes[ring] = rx_bytes; } priv->last_moder_jiffies = jiffies; } static void mlx4_en_handle_vlans(struct mlx4_en_priv *priv) { u8 vlan_register[VLAN_FLTR_SIZE]; u8 vlan_unregister[VLAN_FLTR_SIZE]; int i, j, idx; u16 vid; /* cache the vlan data for processing * done under lock to avoid changes during work */ spin_lock(&priv->vlan_lock); for (i = 0; i < VLAN_FLTR_SIZE; i++) { vlan_register[i] = priv->vlan_register[i]; priv->vlan_register[i] = 0; vlan_unregister[i] = priv->vlan_unregister[i]; priv->vlan_unregister[i] = 0; } priv->vlgrp_modified = false; spin_unlock(&priv->vlan_lock); /* Configure the vlan filter * The vlgrp is updated with all the vids that need to be allowed */ if (mlx4_SET_VLAN_FLTR(priv->mdev->dev, priv->port, priv->vlans)) en_err(priv, "Failed configuring VLAN filter\n"); /* Configure the VLAN table */ for (i = 0; i < VLAN_FLTR_SIZE; i++) { for (j = 0; j < 32; j++) { vid = (i << 5) + j; if (vlan_register[i] & (1 << j)) if (mlx4_register_vlan(priv->mdev->dev, priv->port, vid, &idx)) en_dbg(HW, priv, "failed registering vlan %d\n", vid); if (vlan_unregister[i] & (1 << j)) { if (!mlx4_find_cached_vlan(priv->mdev->dev, priv->port, vid, &idx)) mlx4_unregister_vlan(priv->mdev->dev, priv->port, idx); else en_dbg(HW, priv, "could not find vid %d in cache\n", vid); } } } } static void mlx4_en_do_get_stats(struct work_struct *work) { struct delayed_work *delay = to_delayed_work(work); struct mlx4_en_priv *priv = container_of(delay, struct mlx4_en_priv, stats_task); struct mlx4_en_dev *mdev = priv->mdev; int err; err = mlx4_en_DUMP_ETH_STATS(mdev, priv->port, 0); if (err) en_dbg(HW, priv, "Could not update stats \n"); mutex_lock(&mdev->state_lock); if (mdev->device_up) { if (priv->port_up) { if (priv->vlgrp_modified) mlx4_en_handle_vlans(priv); mlx4_en_auto_moderation(priv); } queue_delayed_work(mdev->workqueue, &priv->stats_task, STATS_DELAY); } mlx4_en_QUERY_PORT(priv->mdev, priv->port); mutex_unlock(&mdev->state_lock); } static void mlx4_en_linkstate(struct work_struct *work) { struct mlx4_en_priv *priv = container_of(work, struct mlx4_en_priv, linkstate_task); struct mlx4_en_dev *mdev = priv->mdev; int linkstate = priv->link_state; mutex_lock(&mdev->state_lock); /* If observable port state changed set carrier state and * report to system log */ if (priv->last_link_state != linkstate) { if (linkstate == MLX4_DEV_EVENT_PORT_DOWN) { if_link_state_change(priv->dev, LINK_STATE_DOWN); } else { en_info(priv, "Link Up\n"); if_link_state_change(priv->dev, LINK_STATE_UP); } } priv->last_link_state = linkstate; mutex_unlock(&mdev->state_lock); } static void mlx4_en_lock_and_stop_port(struct work_struct *work) { struct mlx4_en_priv *priv = container_of(work, struct mlx4_en_priv, stop_port_task); struct net_device *dev = priv->dev; struct mlx4_en_dev *mdev = priv->mdev; mutex_lock(&mdev->state_lock); mlx4_en_do_stop_port(dev); mutex_unlock(&mdev->state_lock); } static void mlx4_en_lock_and_start_port(struct work_struct *work) { struct mlx4_en_priv *priv = container_of(work, struct mlx4_en_priv, start_port_task); struct net_device *dev = priv->dev; struct mlx4_en_dev *mdev = priv->mdev; mutex_lock(&mdev->state_lock); mlx4_en_do_start_port(dev); mutex_unlock(&mdev->state_lock); } int mlx4_en_do_start_port(struct net_device *dev) { struct mlx4_en_priv *priv = netdev_priv(dev); struct mlx4_en_dev *mdev = priv->mdev; struct mlx4_en_cq *cq; struct mlx4_en_tx_ring *tx_ring; u64 config; int rx_index = 0; int tx_index = 0; int err = 0; int i; int j; if (priv->port_up) { en_dbg(DRV, priv, "start port called while port already up\n"); return 0; } /* Calculate Rx buf size */ dev->if_mtu = min(dev->if_mtu, priv->max_mtu); mlx4_en_calc_rx_buf(dev); en_dbg(DRV, priv, "Rx buf size:%d\n", priv->rx_mb_size); /* Configure rx cq's and rings */ err = mlx4_en_activate_rx_rings(priv); if (err) { en_err(priv, "Failed to activate RX rings\n"); return err; } for (i = 0; i < priv->rx_ring_num; i++) { cq = &priv->rx_cq[i]; err = mlx4_en_activate_cq(priv, cq); if (err) { en_err(priv, "Failed activating Rx CQ\n"); goto cq_err; } for (j = 0; j < cq->size; j++) cq->buf[j].owner_sr_opcode = MLX4_CQE_OWNER_MASK; err = mlx4_en_set_cq_moder(priv, cq); if (err) { en_err(priv, "Failed setting cq moderation parameters"); mlx4_en_deactivate_cq(priv, cq); goto cq_err; } mlx4_en_arm_cq(priv, cq); priv->rx_ring[i].cqn = cq->mcq.cqn; ++rx_index; } err = mlx4_en_config_rss_steer(priv); if (err) { en_err(priv, "Failed configuring rss steering\n"); goto cq_err; } /* Configure tx cq's and rings */ for (i = 0; i < priv->tx_ring_num; i++) { /* Configure cq */ cq = &priv->tx_cq[i]; err = mlx4_en_activate_cq(priv, cq); if (err) { en_err(priv, "Failed allocating Tx CQ\n"); goto tx_err; } err = mlx4_en_set_cq_moder(priv, cq); if (err) { en_err(priv, "Failed setting cq moderation parameters"); mlx4_en_deactivate_cq(priv, cq); goto tx_err; } en_dbg(DRV, priv, "Resetting index of collapsed CQ:%d to -1\n", i); cq->buf->wqe_index = cpu_to_be16(0xffff); /* Configure ring */ tx_ring = &priv->tx_ring[i]; err = mlx4_en_activate_tx_ring(priv, tx_ring, cq->mcq.cqn); if (err) { en_err(priv, "Failed allocating Tx ring\n"); mlx4_en_deactivate_cq(priv, cq); goto tx_err; } /* Set initial ownership of all Tx TXBBs to SW (1) */ for (j = 0; j < tx_ring->buf_size; j += STAMP_STRIDE) *((u32 *) (tx_ring->buf + j)) = 0xffffffff; ++tx_index; } /* Configure port */ err = mlx4_SET_PORT_general(mdev->dev, priv->port, priv->rx_mb_size + ETHER_CRC_LEN, priv->prof->tx_pause, priv->prof->tx_ppp, priv->prof->rx_pause, priv->prof->rx_ppp); if (err) { en_err(priv, "Failed setting port general configurations " "for port %d, with error %d\n", priv->port, err); goto tx_err; } /* Set default qp number */ err = mlx4_SET_PORT_qpn_calc(mdev->dev, priv->port, priv->base_qpn, 0); if (err) { en_err(priv, "Failed setting default qp numbers\n"); goto tx_err; } /* Set port mac number */ en_dbg(DRV, priv, "Setting mac for port %d\n", priv->port); err = mlx4_register_mac(mdev->dev, priv->port, mlx4_en_mac_to_u64(IF_LLADDR(dev))); if (err < 0) { en_err(priv, "Failed setting port mac err=%d\n", err); goto tx_err; } mdev->mac_removed[priv->port] = 0; /* Init port */ en_dbg(HW, priv, "Initializing port\n"); err = mlx4_INIT_PORT(mdev->dev, priv->port); if (err) { en_err(priv, "Failed Initializing port\n"); goto mac_err; } /* Set the various hardware offload abilities */ dev->if_hwassist = 0; if (dev->if_capenable & IFCAP_TSO4) dev->if_hwassist |= CSUM_TSO; if (dev->if_capenable & IFCAP_TXCSUM) dev->if_hwassist |= (CSUM_TCP | CSUM_UDP | CSUM_IP); if (dev->if_capenable & IFCAP_RXCSUM) priv->rx_csum = 1; else priv->rx_csum = 0; - /* set TSO limits so that we don't have to drop TX packets */ - dev->if_hw_tsomax = IF_HW_TSOMAX_BUILD_VALUE( - 65535 - sizeof(struct ether_vlan_header) /* bytes */, - 16 /* maximum frag count */, - 16 /* can do up to 4GByte */); - err = mlx4_wol_read(priv->mdev->dev, &config, priv->port); if (err) { en_err(priv, "Failed to get WoL info, unable to modify\n"); goto wol_err; } if (dev->if_capenable & IFCAP_WOL_MAGIC) { config |= MLX4_EN_WOL_DO_MODIFY | MLX4_EN_WOL_ENABLED | MLX4_EN_WOL_MAGIC; } else { config &= ~(MLX4_EN_WOL_ENABLED | MLX4_EN_WOL_MAGIC); config |= MLX4_EN_WOL_DO_MODIFY; } err = mlx4_wol_write(priv->mdev->dev, config, priv->port); if (err) { en_err(priv, "Failed to set WoL information\n"); goto wol_err; } priv->port_up = true; /* Populate multicast list */ mlx4_en_set_multicast(dev); /* Enable the queues. */ dev->if_drv_flags &= ~IFF_DRV_OACTIVE; dev->if_drv_flags |= IFF_DRV_RUNNING; callout_reset(&priv->watchdog_timer, MLX4_EN_WATCHDOG_TIMEOUT, mlx4_en_watchdog_timeout, priv); return 0; wol_err: /* close port*/ mlx4_CLOSE_PORT(mdev->dev, priv->port); mac_err: mlx4_unregister_mac(mdev->dev, priv->port, priv->mac); tx_err: while (tx_index--) { mlx4_en_deactivate_tx_ring(priv, &priv->tx_ring[tx_index]); mlx4_en_deactivate_cq(priv, &priv->tx_cq[tx_index]); } mlx4_en_release_rss_steer(priv); cq_err: while (rx_index--) mlx4_en_deactivate_cq(priv, &priv->rx_cq[rx_index]); for (i = 0; i < priv->rx_ring_num; i++) mlx4_en_deactivate_rx_ring(priv, &priv->rx_ring[i]); return err; /* need to close devices */ } void mlx4_en_do_stop_port(struct net_device *dev) { struct mlx4_en_priv *priv = netdev_priv(dev); struct mlx4_en_dev *mdev = priv->mdev; int i; if (!priv->port_up) { en_dbg(DRV, priv, "stop port called while port already down\n"); return; } /* Set port as not active */ priv->port_up = false; /* Unregister Mac address for the port */ mlx4_unregister_mac(mdev->dev, priv->port, priv->mac); mdev->mac_removed[priv->port] = 1; /* Free TX Rings */ for (i = 0; i < priv->tx_ring_num; i++) { mlx4_en_deactivate_tx_ring(priv, &priv->tx_ring[i]); mlx4_en_deactivate_cq(priv, &priv->tx_cq[i]); } msleep(10); for (i = 0; i < priv->tx_ring_num; i++) mlx4_en_free_tx_buf(dev, &priv->tx_ring[i]); /* Free RSS qps */ mlx4_en_release_rss_steer(priv); /* Free RX Rings */ for (i = 0; i < priv->rx_ring_num; i++) { mlx4_en_deactivate_rx_ring(priv, &priv->rx_ring[i]); mlx4_en_deactivate_cq(priv, &priv->rx_cq[i]); } /* close port*/ mlx4_CLOSE_PORT(mdev->dev, priv->port); callout_stop(&priv->watchdog_timer); dev->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE); } static void mlx4_en_restart(struct work_struct *work) { struct mlx4_en_priv *priv = container_of(work, struct mlx4_en_priv, watchdog_task); struct mlx4_en_dev *mdev = priv->mdev; struct net_device *dev = priv->dev; struct mlx4_en_tx_ring *ring; int i; if (priv->blocked == 0 || priv->port_up == 0) return; for (i = 0; i < priv->tx_ring_num; i++) { ring = &priv->tx_ring[i]; if (ring->blocked && ring->watchdog_time + MLX4_EN_WATCHDOG_TIMEOUT < ticks) goto reset; } return; reset: priv->port_stats.tx_timeout++; en_dbg(DRV, priv, "Watchdog task called for port %d\n", priv->port); mutex_lock(&mdev->state_lock); if (priv->port_up) { mlx4_en_do_stop_port(dev); if (mlx4_en_do_start_port(dev)) en_err(priv, "Failed restarting port %d\n", priv->port); } mutex_unlock(&mdev->state_lock); } static void mlx4_en_init(void *arg) { struct mlx4_en_priv *priv; struct mlx4_en_dev *mdev; priv = arg; mdev = priv->mdev; mutex_lock(&mdev->state_lock); mlx4_en_init_locked(priv); mutex_unlock(&mdev->state_lock); } static void mlx4_en_init_locked(struct mlx4_en_priv *priv) { struct mlx4_en_dev *mdev; struct ifnet *dev; int i; dev = priv->dev; mdev = priv->mdev; if (dev->if_drv_flags & IFF_DRV_RUNNING) mlx4_en_do_stop_port(dev); if (!mdev->device_up) { en_err(priv, "Cannot open - device down/disabled\n"); return; } /* Reset HW statistics and performance counters */ if (mlx4_en_DUMP_ETH_STATS(mdev, priv->port, 1)) en_dbg(HW, priv, "Failed dumping statistics\n"); memset(&priv->pstats, 0, sizeof(priv->pstats)); for (i = 0; i < priv->tx_ring_num; i++) { priv->tx_ring[i].bytes = 0; priv->tx_ring[i].packets = 0; } for (i = 0; i < priv->rx_ring_num; i++) { priv->rx_ring[i].bytes = 0; priv->rx_ring[i].packets = 0; } mlx4_en_set_default_moderation(priv); if (mlx4_en_do_start_port(dev)) en_err(priv, "Failed starting port:%d\n", priv->port); } void mlx4_en_free_resources(struct mlx4_en_priv *priv) { int i; for (i = 0; i < priv->tx_ring_num; i++) { if (priv->tx_ring[i].tx_info) mlx4_en_destroy_tx_ring(priv, &priv->tx_ring[i]); if (priv->tx_cq[i].buf) mlx4_en_destroy_cq(priv, &priv->tx_cq[i]); } for (i = 0; i < priv->rx_ring_num; i++) { if (priv->rx_ring[i].rx_info) mlx4_en_destroy_rx_ring(priv, &priv->rx_ring[i]); if (priv->rx_cq[i].buf) mlx4_en_destroy_cq(priv, &priv->rx_cq[i]); } /* Free the stats tree when we resize the rings. */ if (priv->sysctl) sysctl_ctx_free(&priv->stat_ctx); } int mlx4_en_alloc_resources(struct mlx4_en_priv *priv) { struct mlx4_en_port_profile *prof = priv->prof; int i; /* Create tx Rings */ for (i = 0; i < priv->tx_ring_num; i++) { if (mlx4_en_create_cq(priv, &priv->tx_cq[i], prof->tx_ring_size, i, TX)) goto err; if (mlx4_en_create_tx_ring(priv, &priv->tx_ring[i], prof->tx_ring_size, TXBB_SIZE)) goto err; } /* Create rx Rings */ for (i = 0; i < priv->rx_ring_num; i++) { if (mlx4_en_create_cq(priv, &priv->rx_cq[i], prof->rx_ring_size, i, RX)) goto err; if (mlx4_en_create_rx_ring(priv, &priv->rx_ring[i], prof->rx_ring_size)) goto err; } /* Re-create stat sysctls in case the number of rings changed. */ mlx4_en_sysctl_stat(priv); /* Populate Tx priority mappings */ mlx4_en_set_prio_map(priv, priv->tx_prio_map, priv->tx_ring_num - MLX4_EN_NUM_HASH_RINGS); return 0; err: en_err(priv, "Failed to allocate NIC resources\n"); return -ENOMEM; } void mlx4_en_destroy_netdev(struct net_device *dev) { struct mlx4_en_priv *priv = netdev_priv(dev); struct mlx4_en_dev *mdev = priv->mdev; en_dbg(DRV, priv, "Destroying netdev on port:%d\n", priv->port); if (priv->vlan_attach != NULL) EVENTHANDLER_DEREGISTER(vlan_config, priv->vlan_attach); if (priv->vlan_detach != NULL) EVENTHANDLER_DEREGISTER(vlan_unconfig, priv->vlan_detach); /* Unregister device - this will close the port if it was up */ if (priv->registered) ether_ifdetach(dev); if (priv->allocated) mlx4_free_hwq_res(mdev->dev, &priv->res, MLX4_EN_PAGE_SIZE); mutex_lock(&mdev->state_lock); mlx4_en_do_stop_port(dev); mutex_unlock(&mdev->state_lock); cancel_delayed_work(&priv->stats_task); /* flush any pending task for this netdev */ flush_workqueue(mdev->workqueue); callout_drain(&priv->watchdog_timer); /* Detach the netdev so tasks would not attempt to access it */ mutex_lock(&mdev->state_lock); mdev->pndev[priv->port] = NULL; mutex_unlock(&mdev->state_lock); mlx4_en_free_resources(priv); if (priv->sysctl) sysctl_ctx_free(&priv->conf_ctx); mtx_destroy(&priv->stats_lock.m); mtx_destroy(&priv->vlan_lock.m); kfree(priv); if_free(dev); } static int mlx4_en_change_mtu(struct net_device *dev, int new_mtu) { struct mlx4_en_priv *priv = netdev_priv(dev); struct mlx4_en_dev *mdev = priv->mdev; int err = 0; en_dbg(DRV, priv, "Change MTU called - current:%u new:%d\n", dev->if_mtu, new_mtu); if ((new_mtu < MLX4_EN_MIN_MTU) || (new_mtu > priv->max_mtu)) { en_err(priv, "Bad MTU size:%d.\n", new_mtu); return -EPERM; } mutex_lock(&mdev->state_lock); dev->if_mtu = new_mtu; if (dev->if_drv_flags & IFF_DRV_RUNNING) { if (!mdev->device_up) { /* NIC is probably restarting - let watchdog task reset * the port */ en_dbg(DRV, priv, "Change MTU called with card down!?\n"); } else { mlx4_en_do_stop_port(dev); mlx4_en_set_default_moderation(priv); err = mlx4_en_do_start_port(dev); if (err) { en_err(priv, "Failed restarting port:%d\n", priv->port); queue_work(mdev->workqueue, &priv->watchdog_task); } } } mutex_unlock(&mdev->state_lock); return 0; } static int mlx4_en_calc_media(struct mlx4_en_priv *priv) { int trans_type; int active; active = IFM_ETHER; if (priv->last_link_state == MLX4_DEV_EVENT_PORT_DOWN) return (active); /* * [ShaharK] mlx4_en_QUERY_PORT sleeps and cannot be called under a * non-sleepable lock. * I moved it to the periodic mlx4_en_do_get_stats. if (mlx4_en_QUERY_PORT(priv->mdev, priv->port)) return (active); */ active |= IFM_FDX; trans_type = priv->port_state.transciver; /* XXX I don't know all of the transceiver values. */ switch (priv->port_state.link_speed) { case 1000: active |= IFM_1000_T; break; case 10000: if (trans_type > 0 && trans_type <= 0xC) active |= IFM_10G_SR; else if (trans_type == 0x80 || trans_type == 0) active |= IFM_10G_CX4; break; case 40000: active |= IFM_40G_CR4; break; } if (priv->prof->tx_pause) active |= IFM_ETH_TXPAUSE; if (priv->prof->rx_pause) active |= IFM_ETH_RXPAUSE; return (active); } static void mlx4_en_media_status(struct ifnet *dev, struct ifmediareq *ifmr) { struct mlx4_en_priv *priv; priv = dev->if_softc; ifmr->ifm_status = IFM_AVALID; if (priv->last_link_state != MLX4_DEV_EVENT_PORT_DOWN) ifmr->ifm_status |= IFM_ACTIVE; ifmr->ifm_active = mlx4_en_calc_media(priv); return; } static int mlx4_en_media_change(struct ifnet *dev) { struct mlx4_en_priv *priv; struct ifmedia *ifm; int rxpause; int txpause; int error; priv = dev->if_softc; ifm = &priv->media; rxpause = txpause = 0; error = 0; if (IFM_TYPE(ifm->ifm_media) != IFM_ETHER) return (EINVAL); switch (IFM_SUBTYPE(ifm->ifm_media)) { case IFM_AUTO: break; case IFM_10G_SR: case IFM_10G_CX4: case IFM_1000_T: if (IFM_SUBTYPE(ifm->ifm_media) == IFM_SUBTYPE(mlx4_en_calc_media(priv)) && (ifm->ifm_media & IFM_FDX)) break; /* Fallthrough */ default: printf("%s: Only auto media type\n", if_name(dev)); return (EINVAL); } /* Allow user to set/clear pause */ if (IFM_OPTIONS(ifm->ifm_media) & IFM_ETH_RXPAUSE) rxpause = 1; if (IFM_OPTIONS(ifm->ifm_media) & IFM_ETH_TXPAUSE) txpause = 1; if (priv->prof->tx_pause != txpause || priv->prof->rx_pause != rxpause) { priv->prof->tx_pause = txpause; priv->prof->rx_pause = rxpause; error = -mlx4_SET_PORT_general(priv->mdev->dev, priv->port, priv->rx_mb_size + ETHER_CRC_LEN, priv->prof->tx_pause, priv->prof->tx_ppp, priv->prof->rx_pause, priv->prof->rx_ppp); } return (error); } static int mlx4_en_ioctl(struct ifnet *dev, u_long command, caddr_t data) { struct mlx4_en_priv *priv; struct mlx4_en_dev *mdev; struct ifreq *ifr; int error; int mask; error = 0; mask = 0; priv = dev->if_softc; mdev = priv->mdev; ifr = (struct ifreq *) data; switch (command) { case SIOCSIFMTU: error = -mlx4_en_change_mtu(dev, ifr->ifr_mtu); break; case SIOCSIFFLAGS: if (dev->if_flags & IFF_UP) { if ((dev->if_drv_flags & IFF_DRV_RUNNING) == 0) mlx4_en_start_port(dev); else mlx4_en_set_multicast(dev); } else { if (dev->if_drv_flags & IFF_DRV_RUNNING) { mlx4_en_stop_port(dev); if_link_state_change(dev, LINK_STATE_DOWN); /* * Since mlx4_en_stop_port is defered we * have to wait till it's finished. */ for (int count=0; count<10; count++) { if (dev->if_drv_flags & IFF_DRV_RUNNING) { DELAY(20000); } else { break; } } } } break; case SIOCADDMULTI: case SIOCDELMULTI: mlx4_en_set_multicast(dev); break; case SIOCSIFMEDIA: case SIOCGIFMEDIA: error = ifmedia_ioctl(dev, ifr, &priv->media, command); break; case SIOCSIFCAP: mutex_lock(&mdev->state_lock); mask = ifr->ifr_reqcap ^ dev->if_capenable; if (mask & IFCAP_HWCSUM) dev->if_capenable ^= IFCAP_HWCSUM; if (mask & IFCAP_TSO4) dev->if_capenable ^= IFCAP_TSO4; if (mask & IFCAP_LRO) dev->if_capenable ^= IFCAP_LRO; if (mask & IFCAP_VLAN_HWTAGGING) dev->if_capenable ^= IFCAP_VLAN_HWTAGGING; if (mask & IFCAP_VLAN_HWFILTER) dev->if_capenable ^= IFCAP_VLAN_HWFILTER; if (mask & IFCAP_WOL_MAGIC) dev->if_capenable ^= IFCAP_WOL_MAGIC; if (dev->if_drv_flags & IFF_DRV_RUNNING) mlx4_en_init_locked(priv); mutex_unlock(&mdev->state_lock); VLAN_CAPABILITIES(dev); break; default: error = ether_ioctl(dev, command, data); break; } return (error); } static int mlx4_en_set_ring_size(struct net_device *dev, int rx_size, int tx_size) { struct mlx4_en_priv *priv = netdev_priv(dev); struct mlx4_en_dev *mdev = priv->mdev; int port_up = 0; int err = 0; rx_size = roundup_pow_of_two(rx_size); rx_size = max_t(u32, rx_size, MLX4_EN_MIN_RX_SIZE); rx_size = min_t(u32, rx_size, MLX4_EN_MAX_RX_SIZE); tx_size = roundup_pow_of_two(tx_size); tx_size = max_t(u32, tx_size, MLX4_EN_MIN_TX_SIZE); tx_size = min_t(u32, tx_size, MLX4_EN_MAX_TX_SIZE); if (rx_size == (priv->port_up ? priv->rx_ring[0].actual_size : priv->rx_ring[0].size) && tx_size == priv->tx_ring[0].size) return 0; mutex_lock(&mdev->state_lock); if (priv->port_up) { port_up = 1; mlx4_en_do_stop_port(dev); } mlx4_en_free_resources(priv); priv->prof->tx_ring_size = tx_size; priv->prof->rx_ring_size = rx_size; err = mlx4_en_alloc_resources(priv); if (err) { en_err(priv, "Failed reallocating port resources\n"); goto out; } if (port_up) { err = mlx4_en_do_start_port(dev); if (err) en_err(priv, "Failed starting port\n"); } out: mutex_unlock(&mdev->state_lock); return err; } static int mlx4_en_set_rx_ring_size(SYSCTL_HANDLER_ARGS) { struct mlx4_en_priv *priv; int size; int error; priv = arg1; size = priv->prof->rx_ring_size; error = sysctl_handle_int(oidp, &size, 0, req); if (error || !req->newptr) return (error); error = -mlx4_en_set_ring_size(priv->dev, size, priv->prof->tx_ring_size); return (error); } static int mlx4_en_set_tx_ring_size(SYSCTL_HANDLER_ARGS) { struct mlx4_en_priv *priv; int size; int error; priv = arg1; size = priv->prof->tx_ring_size; error = sysctl_handle_int(oidp, &size, 0, req); if (error || !req->newptr) return (error); error = -mlx4_en_set_ring_size(priv->dev, priv->prof->rx_ring_size, size); return (error); } static int mlx4_en_set_tx_ppp(SYSCTL_HANDLER_ARGS) { struct mlx4_en_priv *priv; int ppp; int error; priv = arg1; ppp = priv->prof->tx_ppp; error = sysctl_handle_int(oidp, &ppp, 0, req); if (error || !req->newptr) return (error); if (ppp > 0xff || ppp < 0) return (-EINVAL); priv->prof->tx_ppp = ppp; error = -mlx4_SET_PORT_general(priv->mdev->dev, priv->port, priv->rx_mb_size + ETHER_CRC_LEN, priv->prof->tx_pause, priv->prof->tx_ppp, priv->prof->rx_pause, priv->prof->rx_ppp); return (error); } static int mlx4_en_set_rx_ppp(SYSCTL_HANDLER_ARGS) { struct mlx4_en_priv *priv; struct mlx4_en_dev *mdev; int tx_ring_num; int ppp; int error; int port_up; port_up = 0; priv = arg1; mdev = priv->mdev; ppp = priv->prof->rx_ppp; error = sysctl_handle_int(oidp, &ppp, 0, req); if (error || !req->newptr) return (error); if (ppp > 0xff || ppp < 0) return (-EINVAL); /* See if we have to change the number of tx queues. */ if (!ppp != !priv->prof->rx_ppp) { tx_ring_num = MLX4_EN_NUM_HASH_RINGS + 1 + (!!ppp) * MLX4_EN_NUM_PPP_RINGS; mutex_lock(&mdev->state_lock); if (priv->port_up) { port_up = 1; mlx4_en_do_stop_port(priv->dev); } mlx4_en_free_resources(priv); priv->tx_ring_num = tx_ring_num; priv->prof->rx_ppp = ppp; error = -mlx4_en_alloc_resources(priv); if (error) en_err(priv, "Failed reallocating port resources\n"); if (error == 0 && port_up) { error = -mlx4_en_do_start_port(priv->dev); if (error) en_err(priv, "Failed starting port\n"); } mutex_unlock(&mdev->state_lock); return (error); } priv->prof->rx_ppp = ppp; error = -mlx4_SET_PORT_general(priv->mdev->dev, priv->port, priv->rx_mb_size + ETHER_CRC_LEN, priv->prof->tx_pause, priv->prof->tx_ppp, priv->prof->rx_pause, priv->prof->rx_ppp); return (error); } static void mlx4_en_sysctl_conf(struct mlx4_en_priv *priv) { struct net_device *dev; struct sysctl_ctx_list *ctx; struct sysctl_oid *node; struct sysctl_oid_list *node_list; struct sysctl_oid *coal; struct sysctl_oid_list *coal_list; dev = priv->dev; ctx = &priv->conf_ctx; sysctl_ctx_init(ctx); priv->sysctl = SYSCTL_ADD_NODE(ctx, SYSCTL_STATIC_CHILDREN(_hw), OID_AUTO, dev->if_xname, CTLFLAG_RD, 0, "mlx4 10gig ethernet"); node = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(priv->sysctl), OID_AUTO, "conf", CTLFLAG_RD, NULL, "Configuration"); node_list = SYSCTL_CHILDREN(node); SYSCTL_ADD_UINT(ctx, node_list, OID_AUTO, "msg_enable", CTLFLAG_RW, &priv->msg_enable, 0, "Driver message enable bitfield"); SYSCTL_ADD_UINT(ctx, node_list, OID_AUTO, "rx_rings", CTLTYPE_INT | CTLFLAG_RD, &priv->rx_ring_num, 0, "Number of receive rings"); SYSCTL_ADD_UINT(ctx, node_list, OID_AUTO, "tx_rings", CTLTYPE_INT | CTLFLAG_RD, &priv->tx_ring_num, 0, "Number of transmit rings"); SYSCTL_ADD_PROC(ctx, node_list, OID_AUTO, "rx_size", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, priv, 0, mlx4_en_set_rx_ring_size, "I", "Receive ring size"); SYSCTL_ADD_PROC(ctx, node_list, OID_AUTO, "tx_size", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, priv, 0, mlx4_en_set_tx_ring_size, "I", "Transmit ring size"); SYSCTL_ADD_UINT(ctx, node_list, OID_AUTO, "ip_reasm", CTLFLAG_RW, &priv->ip_reasm, 0, "Allow reassembly of IP fragments."); SYSCTL_ADD_PROC(ctx, node_list, OID_AUTO, "tx_ppp", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, priv, 0, mlx4_en_set_tx_ppp, "I", "TX Per-priority pause"); SYSCTL_ADD_PROC(ctx, node_list, OID_AUTO, "rx_ppp", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, priv, 0, mlx4_en_set_rx_ppp, "I", "RX Per-priority pause"); /* Add coalescer configuration. */ coal = SYSCTL_ADD_NODE(ctx, node_list, OID_AUTO, "coalesce", CTLFLAG_RD, NULL, "Interrupt coalesce configuration"); coal_list = SYSCTL_CHILDREN(node); SYSCTL_ADD_UINT(ctx, coal_list, OID_AUTO, "pkt_rate_low", CTLFLAG_RW, &priv->pkt_rate_low, 0, "Packets per-second for minimum delay"); SYSCTL_ADD_UINT(ctx, coal_list, OID_AUTO, "rx_usecs_low", CTLFLAG_RW, &priv->rx_usecs_low, 0, "Minimum RX delay in micro-seconds"); SYSCTL_ADD_UINT(ctx, coal_list, OID_AUTO, "pkt_rate_high", CTLFLAG_RW, &priv->pkt_rate_high, 0, "Packets per-second for maximum delay"); SYSCTL_ADD_UINT(ctx, coal_list, OID_AUTO, "rx_usecs_high", CTLFLAG_RW, &priv->rx_usecs_high, 0, "Maximum RX delay in micro-seconds"); SYSCTL_ADD_UINT(ctx, coal_list, OID_AUTO, "sample_interval", CTLFLAG_RW, &priv->sample_interval, 0, "adaptive frequency in units of HZ ticks"); SYSCTL_ADD_UINT(ctx, coal_list, OID_AUTO, "adaptive_rx_coal", CTLFLAG_RW, &priv->adaptive_rx_coal, 0, "Enable adaptive rx coalescing"); } static void mlx4_en_sysctl_stat(struct mlx4_en_priv *priv) { struct net_device *dev; struct sysctl_ctx_list *ctx; struct sysctl_oid *node; struct sysctl_oid_list *node_list; struct sysctl_oid *ring_node; struct sysctl_oid_list *ring_list; struct mlx4_en_tx_ring *tx_ring; struct mlx4_en_rx_ring *rx_ring; char namebuf[128]; int i; dev = priv->dev; ctx = &priv->stat_ctx; sysctl_ctx_init(ctx); node = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(priv->sysctl), OID_AUTO, "stat", CTLFLAG_RD, NULL, "Statistics"); node_list = SYSCTL_CHILDREN(node); #ifdef MLX4_EN_PERF_STAT SYSCTL_ADD_UINT(ctx, node_list, OID_AUTO, "tx_poll", CTLFLAG_RD, &priv->pstats.tx_poll, "TX Poll calls"); SYSCTL_ADD_QUAD(ctx, node_list, OID_AUTO, "tx_pktsz_avg", CTLFLAG_RD, &priv->pstats.tx_pktsz_avg, "TX average packet size"); SYSCTL_ADD_UINT(ctx, node_list, OID_AUTO, "inflight_avg", CTLFLAG_RD, &priv->pstats.inflight_avg, "TX average packets in-flight"); SYSCTL_ADD_UINT(ctx, node_list, OID_AUTO, "tx_coal_avg", CTLFLAG_RD, &priv->pstats.tx_coal_avg, "TX average coalesced completions"); SYSCTL_ADD_UINT(ctx, node_list, OID_AUTO, "rx_coal_avg", CTLFLAG_RD, &priv->pstats.rx_coal_avg, "RX average coalesced completions"); #endif SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "tso_packets", CTLFLAG_RD, &priv->port_stats.tso_packets, "TSO packets sent"); SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "queue_stopped", CTLFLAG_RD, &priv->port_stats.queue_stopped, "Queue full"); SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "wake_queue", CTLFLAG_RD, &priv->port_stats.wake_queue, "Queue resumed after full"); SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "tx_timeout", CTLFLAG_RD, &priv->port_stats.tx_timeout, "Transmit timeouts"); SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_alloc_failed", CTLFLAG_RD, &priv->port_stats.rx_alloc_failed, "RX failed to allocate mbuf"); SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_chksum_good", CTLFLAG_RD, &priv->port_stats.rx_chksum_good, "RX checksum offload success"); SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_chksum_none", CTLFLAG_RD, &priv->port_stats.rx_chksum_none, "RX without checksum offload"); SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "tx_chksum_offload", CTLFLAG_RD, &priv->port_stats.tx_chksum_offload, "TX checksum offloads"); /* Could strdup the names and add in a loop. This is simpler. */ SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "broadcast", CTLFLAG_RD, &priv->pkstats.broadcast, "Broadcast packets"); SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "tx_prio0", CTLFLAG_RD, &priv->pkstats.tx_prio[0], "TX Priority 0 packets"); SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "tx_prio1", CTLFLAG_RD, &priv->pkstats.tx_prio[1], "TX Priority 1 packets"); SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "tx_prio2", CTLFLAG_RD, &priv->pkstats.tx_prio[2], "TX Priority 2 packets"); SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "tx_prio3", CTLFLAG_RD, &priv->pkstats.tx_prio[3], "TX Priority 3 packets"); SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "tx_prio4", CTLFLAG_RD, &priv->pkstats.tx_prio[4], "TX Priority 4 packets"); SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "tx_prio5", CTLFLAG_RD, &priv->pkstats.tx_prio[5], "TX Priority 5 packets"); SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "tx_prio6", CTLFLAG_RD, &priv->pkstats.tx_prio[6], "TX Priority 6 packets"); SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "tx_prio7", CTLFLAG_RD, &priv->pkstats.tx_prio[7], "TX Priority 7 packets"); SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_prio0", CTLFLAG_RD, &priv->pkstats.rx_prio[0], "RX Priority 0 packets"); SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_prio1", CTLFLAG_RD, &priv->pkstats.rx_prio[1], "RX Priority 1 packets"); SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_prio2", CTLFLAG_RD, &priv->pkstats.rx_prio[2], "RX Priority 2 packets"); SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_prio3", CTLFLAG_RD, &priv->pkstats.rx_prio[3], "RX Priority 3 packets"); SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_prio4", CTLFLAG_RD, &priv->pkstats.rx_prio[4], "RX Priority 4 packets"); SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_prio5", CTLFLAG_RD, &priv->pkstats.rx_prio[5], "RX Priority 5 packets"); SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_prio6", CTLFLAG_RD, &priv->pkstats.rx_prio[6], "RX Priority 6 packets"); SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_prio7", CTLFLAG_RD, &priv->pkstats.rx_prio[7], "RX Priority 7 packets"); for (i = 0; i < priv->tx_ring_num; i++) { tx_ring = &priv->tx_ring[i]; snprintf(namebuf, sizeof(namebuf), "tx_ring%d", i); ring_node = SYSCTL_ADD_NODE(ctx, node_list, OID_AUTO, namebuf, CTLFLAG_RD, NULL, "TX Ring"); ring_list = SYSCTL_CHILDREN(ring_node); SYSCTL_ADD_ULONG(ctx, ring_list, OID_AUTO, "packets", CTLFLAG_RD, &tx_ring->packets, "TX packets"); SYSCTL_ADD_ULONG(ctx, ring_list, OID_AUTO, "bytes", CTLFLAG_RD, &tx_ring->bytes, "TX bytes"); SYSCTL_ADD_ULONG(ctx, ring_list, OID_AUTO, "error", CTLFLAG_RD, &tx_ring->errors, "TX soft errors"); } for (i = 0; i < priv->rx_ring_num; i++) { rx_ring = &priv->rx_ring[i]; snprintf(namebuf, sizeof(namebuf), "rx_ring%d", i); ring_node = SYSCTL_ADD_NODE(ctx, node_list, OID_AUTO, namebuf, CTLFLAG_RD, NULL, "RX Ring"); ring_list = SYSCTL_CHILDREN(ring_node); SYSCTL_ADD_ULONG(ctx, ring_list, OID_AUTO, "packets", CTLFLAG_RD, &rx_ring->packets, "RX packets"); SYSCTL_ADD_ULONG(ctx, ring_list, OID_AUTO, "bytes", CTLFLAG_RD, &rx_ring->bytes, "RX bytes"); SYSCTL_ADD_ULONG(ctx, ring_list, OID_AUTO, "error", CTLFLAG_RD, &rx_ring->errors, "RX soft errors"); SYSCTL_ADD_UINT(ctx, ring_list, OID_AUTO, "lro_queued", CTLFLAG_RD, &rx_ring->lro.lro_queued, 0, "LRO Queued"); SYSCTL_ADD_UINT(ctx, ring_list, OID_AUTO, "lro_flushed", CTLFLAG_RD, &rx_ring->lro.lro_flushed, 0, "LRO Flushed"); } } int mlx4_en_init_netdev(struct mlx4_en_dev *mdev, int port, struct mlx4_en_port_profile *prof) { static volatile int mlx4_en_unit; struct net_device *dev; struct mlx4_en_priv *priv; uint8_t dev_addr[ETHER_ADDR_LEN]; int err; int i; priv = kzalloc(sizeof(*priv), GFP_KERNEL); dev = priv->dev = if_alloc(IFT_ETHER); if (dev == NULL) { mlx4_err(mdev, "Net device allocation failed\n"); kfree(priv); return -ENOMEM; } dev->if_softc = priv; if_initname(dev, "mlxen", atomic_fetchadd_int(&mlx4_en_unit, 1)); dev->if_mtu = ETHERMTU; dev->if_baudrate = 1000000000; dev->if_init = mlx4_en_init; dev->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; dev->if_ioctl = mlx4_en_ioctl; dev->if_transmit = mlx4_en_transmit; dev->if_qflush = mlx4_en_qflush; dev->if_snd.ifq_maxlen = prof->tx_ring_size; /* * Initialize driver private data */ priv->dev = dev; priv->mdev = mdev; priv->prof = prof; priv->port = port; priv->port_up = false; priv->rx_csum = 1; priv->flags = prof->flags; priv->tx_ring_num = prof->tx_ring_num; priv->rx_ring_num = prof->rx_ring_num; priv->mac_index = -1; priv->msg_enable = MLX4_EN_MSG_LEVEL; priv->ip_reasm = priv->mdev->profile.ip_reasm; mtx_init(&priv->stats_lock.m, "mlx4 stats", NULL, MTX_DEF); mtx_init(&priv->vlan_lock.m, "mlx4 vlan", NULL, MTX_DEF); INIT_WORK(&priv->start_port_task, mlx4_en_lock_and_start_port); INIT_WORK(&priv->stop_port_task, mlx4_en_lock_and_stop_port); INIT_WORK(&priv->mcast_task, mlx4_en_do_set_multicast); INIT_WORK(&priv->watchdog_task, mlx4_en_restart); INIT_WORK(&priv->linkstate_task, mlx4_en_linkstate); INIT_DELAYED_WORK(&priv->stats_task, mlx4_en_do_get_stats); callout_init(&priv->watchdog_timer, 1); /* Query for default mac and max mtu */ priv->max_mtu = mdev->dev->caps.eth_mtu_cap[priv->port]; priv->mac = mdev->dev->caps.def_mac[priv->port]; if (ILLEGAL_MAC(priv->mac)) { en_err(priv, "Port: %d, invalid mac burned: 0x%llx, quiting\n", priv->port, (long long)priv->mac); err = -EINVAL; goto out; } mlx4_en_sysctl_conf(priv); err = mlx4_en_alloc_resources(priv); if (err) goto out; /* Allocate page for receive rings */ err = mlx4_alloc_hwq_res(mdev->dev, &priv->res, MLX4_EN_PAGE_SIZE, MLX4_EN_PAGE_SIZE); if (err) { en_err(priv, "Failed to allocate page for rx qps\n"); goto out; } priv->allocated = 1; /* * Set driver features */ dev->if_capabilities |= IFCAP_RXCSUM | IFCAP_TXCSUM; dev->if_capabilities |= IFCAP_VLAN_MTU | IFCAP_VLAN_HWTAGGING; dev->if_capabilities |= IFCAP_VLAN_HWCSUM | IFCAP_VLAN_HWFILTER; dev->if_capabilities |= IFCAP_LINKSTATE | IFCAP_JUMBO_MTU; if (mdev->LSO_support) dev->if_capabilities |= IFCAP_TSO4 | IFCAP_VLAN_HWTSO; if (mdev->profile.num_lro) dev->if_capabilities |= IFCAP_LRO; dev->if_capenable = dev->if_capabilities; /* * Setup wake-on-lan. */ #if 0 if (priv->mdev->dev->caps.wol) { u64 config; if (mlx4_wol_read(priv->mdev->dev, &config, priv->port) == 0) { if (config & MLX4_EN_WOL_MAGIC) dev->if_capabilities |= IFCAP_WOL_MAGIC; if (config & MLX4_EN_WOL_ENABLED) dev->if_capenable |= IFCAP_WOL_MAGIC; } } #endif /* Register for VLAN events */ priv->vlan_attach = EVENTHANDLER_REGISTER(vlan_config, mlx4_en_vlan_rx_add_vid, priv, EVENTHANDLER_PRI_FIRST); priv->vlan_detach = EVENTHANDLER_REGISTER(vlan_unconfig, mlx4_en_vlan_rx_kill_vid, priv, EVENTHANDLER_PRI_FIRST); mdev->pndev[priv->port] = dev; priv->last_link_state = MLX4_DEV_EVENT_PORT_DOWN; if_link_state_change(dev, LINK_STATE_DOWN); /* Set default MAC */ for (i = 0; i < ETHER_ADDR_LEN; i++) dev_addr[ETHER_ADDR_LEN - 1 - i] = (u8) (priv->mac >> (8 * i)); ether_ifattach(dev, dev_addr); ifmedia_init(&priv->media, IFM_IMASK | IFM_ETH_FMASK, mlx4_en_media_change, mlx4_en_media_status); ifmedia_add(&priv->media, IFM_ETHER | IFM_FDX | IFM_1000_T, 0, NULL); ifmedia_add(&priv->media, IFM_ETHER | IFM_FDX | IFM_10G_SR, 0, NULL); ifmedia_add(&priv->media, IFM_ETHER | IFM_FDX | IFM_10G_CX4, 0, NULL); ifmedia_add(&priv->media, IFM_ETHER | IFM_AUTO, 0, NULL); ifmedia_set(&priv->media, IFM_ETHER | IFM_AUTO); en_warn(priv, "Using %d TX rings\n", prof->tx_ring_num); en_warn(priv, "Using %d RX rings\n", prof->rx_ring_num); priv->registered = 1; queue_delayed_work(mdev->workqueue, &priv->stats_task, STATS_DELAY); return 0; out: mlx4_en_destroy_netdev(dev); return err; }