diff --git a/sys/dev/netmap/if_ptnet.c b/sys/dev/netmap/if_ptnet.c index 468ebabe9d90..ccf008d4451b 100644 --- a/sys/dev/netmap/if_ptnet.c +++ b/sys/dev/netmap/if_ptnet.c @@ -1,2293 +1,1995 @@ /*- * Copyright (c) 2016, Vincenzo Maffione * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ /* Driver for ptnet paravirtualized network device. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #ifndef INET #error "INET not defined, cannot support offloadings" #endif #if __FreeBSD_version >= 1100000 static uint64_t ptnet_get_counter(if_t, ift_counter); #else typedef struct ifnet *if_t; #define if_getsoftc(_ifp) (_ifp)->if_softc #endif //#define PTNETMAP_STATS //#define DEBUG #ifdef DEBUG #define DBG(x) x #else /* !DEBUG */ #define DBG(x) #endif /* !DEBUG */ extern int ptnet_vnet_hdr; /* Tunable parameter */ struct ptnet_softc; struct ptnet_queue_stats { uint64_t packets; /* if_[io]packets */ uint64_t bytes; /* if_[io]bytes */ uint64_t errors; /* if_[io]errors */ uint64_t iqdrops; /* if_iqdrops */ uint64_t mcasts; /* if_[io]mcasts */ #ifdef PTNETMAP_STATS uint64_t intrs; uint64_t kicks; #endif /* PTNETMAP_STATS */ }; struct ptnet_queue { struct ptnet_softc *sc; struct resource *irq; void *cookie; int kring_id; struct nm_csb_atok *atok; struct nm_csb_ktoa *ktoa; unsigned int kick; struct mtx lock; struct buf_ring *bufring; /* for TX queues */ struct ptnet_queue_stats stats; #ifdef PTNETMAP_STATS struct ptnet_queue_stats last_stats; #endif /* PTNETMAP_STATS */ struct taskqueue *taskq; struct task task; char lock_name[16]; }; #define PTNET_Q_LOCK(_pq) mtx_lock(&(_pq)->lock) #define PTNET_Q_TRYLOCK(_pq) mtx_trylock(&(_pq)->lock) #define PTNET_Q_UNLOCK(_pq) mtx_unlock(&(_pq)->lock) struct ptnet_softc { device_t dev; if_t ifp; struct ifmedia media; struct mtx lock; char lock_name[16]; char hwaddr[ETHER_ADDR_LEN]; /* Mirror of PTFEAT register. */ uint32_t ptfeatures; unsigned int vnet_hdr_len; /* PCI BARs support. */ struct resource *iomem; struct resource *msix_mem; unsigned int num_rings; unsigned int num_tx_rings; struct ptnet_queue *queues; struct ptnet_queue *rxqueues; struct nm_csb_atok *csb_gh; struct nm_csb_ktoa *csb_hg; unsigned int min_tx_space; struct netmap_pt_guest_adapter *ptna; struct callout tick; #ifdef PTNETMAP_STATS struct timeval last_ts; #endif /* PTNETMAP_STATS */ }; #define PTNET_CORE_LOCK(_sc) mtx_lock(&(_sc)->lock) #define PTNET_CORE_UNLOCK(_sc) mtx_unlock(&(_sc)->lock) static int ptnet_probe(device_t); static int ptnet_attach(device_t); static int ptnet_detach(device_t); static int ptnet_suspend(device_t); static int ptnet_resume(device_t); static int ptnet_shutdown(device_t); static void ptnet_init(void *opaque); static int ptnet_ioctl(if_t ifp, u_long cmd, caddr_t data); static int ptnet_init_locked(struct ptnet_softc *sc); static int ptnet_stop(struct ptnet_softc *sc); static int ptnet_transmit(if_t ifp, struct mbuf *m); static int ptnet_drain_transmit_queue(struct ptnet_queue *pq, unsigned int budget, bool may_resched); static void ptnet_qflush(if_t ifp); static void ptnet_tx_task(void *context, int pending); static int ptnet_media_change(if_t ifp); static void ptnet_media_status(if_t ifp, struct ifmediareq *ifmr); #ifdef PTNETMAP_STATS static void ptnet_tick(void *opaque); #endif static int ptnet_irqs_init(struct ptnet_softc *sc); static void ptnet_irqs_fini(struct ptnet_softc *sc); static uint32_t ptnet_nm_ptctl(struct ptnet_softc *sc, uint32_t cmd); static int ptnet_nm_config(struct netmap_adapter *na, struct nm_config_info *info); static void ptnet_update_vnet_hdr(struct ptnet_softc *sc); static int ptnet_nm_register(struct netmap_adapter *na, int onoff); static int ptnet_nm_txsync(struct netmap_kring *kring, int flags); static int ptnet_nm_rxsync(struct netmap_kring *kring, int flags); static void ptnet_nm_intr(struct netmap_adapter *na, int onoff); static void ptnet_tx_intr(void *opaque); static void ptnet_rx_intr(void *opaque); static unsigned ptnet_rx_discard(struct netmap_kring *kring, unsigned int head); static int ptnet_rx_eof(struct ptnet_queue *pq, unsigned int budget, bool may_resched); static void ptnet_rx_task(void *context, int pending); #ifdef DEVICE_POLLING static poll_handler_t ptnet_poll; #endif static device_method_t ptnet_methods[] = { DEVMETHOD(device_probe, ptnet_probe), DEVMETHOD(device_attach, ptnet_attach), DEVMETHOD(device_detach, ptnet_detach), DEVMETHOD(device_suspend, ptnet_suspend), DEVMETHOD(device_resume, ptnet_resume), DEVMETHOD(device_shutdown, ptnet_shutdown), DEVMETHOD_END }; static driver_t ptnet_driver = { "ptnet", ptnet_methods, sizeof(struct ptnet_softc) }; /* We use (SI_ORDER_MIDDLE+2) here, see DEV_MODULE_ORDERED() invocation. */ static devclass_t ptnet_devclass; DRIVER_MODULE_ORDERED(ptnet, pci, ptnet_driver, ptnet_devclass, NULL, NULL, SI_ORDER_MIDDLE + 2); static int ptnet_probe(device_t dev) { if (pci_get_vendor(dev) != PTNETMAP_PCI_VENDOR_ID || pci_get_device(dev) != PTNETMAP_PCI_NETIF_ID) { return (ENXIO); } device_set_desc(dev, "ptnet network adapter"); return (BUS_PROBE_DEFAULT); } static inline void ptnet_kick(struct ptnet_queue *pq) { #ifdef PTNETMAP_STATS pq->stats.kicks ++; #endif /* PTNETMAP_STATS */ bus_write_4(pq->sc->iomem, pq->kick, 0); } #define PTNET_BUF_RING_SIZE 4096 #define PTNET_RX_BUDGET 512 #define PTNET_RX_BATCH 1 #define PTNET_TX_BUDGET 512 #define PTNET_TX_BATCH 64 #define PTNET_HDR_SIZE sizeof(struct virtio_net_hdr_mrg_rxbuf) #define PTNET_MAX_PKT_SIZE 65536 #define PTNET_CSUM_OFFLOAD (CSUM_TCP | CSUM_UDP) #define PTNET_CSUM_OFFLOAD_IPV6 (CSUM_TCP_IPV6 | CSUM_UDP_IPV6) #define PTNET_ALL_OFFLOAD (CSUM_TSO | PTNET_CSUM_OFFLOAD |\ PTNET_CSUM_OFFLOAD_IPV6) static int ptnet_attach(device_t dev) { uint32_t ptfeatures = 0; unsigned int num_rx_rings, num_tx_rings; struct netmap_adapter na_arg; unsigned int nifp_offset; struct ptnet_softc *sc; if_t ifp; uint32_t macreg; int err, rid; int i; sc = device_get_softc(dev); sc->dev = dev; /* Setup PCI resources. */ pci_enable_busmaster(dev); rid = PCIR_BAR(PTNETMAP_IO_PCI_BAR); sc->iomem = bus_alloc_resource_any(dev, SYS_RES_IOPORT, &rid, RF_ACTIVE); if (sc->iomem == NULL) { device_printf(dev, "Failed to map I/O BAR\n"); return (ENXIO); } /* Negotiate features with the hypervisor. */ if (ptnet_vnet_hdr) { ptfeatures |= PTNETMAP_F_VNET_HDR; } bus_write_4(sc->iomem, PTNET_IO_PTFEAT, ptfeatures); /* wanted */ ptfeatures = bus_read_4(sc->iomem, PTNET_IO_PTFEAT); /* acked */ sc->ptfeatures = ptfeatures; num_tx_rings = bus_read_4(sc->iomem, PTNET_IO_NUM_TX_RINGS); num_rx_rings = bus_read_4(sc->iomem, PTNET_IO_NUM_RX_RINGS); sc->num_rings = num_tx_rings + num_rx_rings; sc->num_tx_rings = num_tx_rings; if (sc->num_rings * sizeof(struct nm_csb_atok) > PAGE_SIZE) { device_printf(dev, "CSB cannot handle that many rings (%u)\n", sc->num_rings); err = ENOMEM; goto err_path; } /* Allocate CSB and carry out CSB allocation protocol. */ sc->csb_gh = contigmalloc(2*PAGE_SIZE, M_DEVBUF, M_NOWAIT | M_ZERO, (size_t)0, -1UL, PAGE_SIZE, 0); if (sc->csb_gh == NULL) { device_printf(dev, "Failed to allocate CSB\n"); err = ENOMEM; goto err_path; } sc->csb_hg = (struct nm_csb_ktoa *)(((char *)sc->csb_gh) + PAGE_SIZE); { /* * We use uint64_t rather than vm_paddr_t since we * need 64 bit addresses even on 32 bit platforms. */ uint64_t paddr = vtophys(sc->csb_gh); /* CSB allocation protocol: write to BAH first, then * to BAL (for both GH and HG sections). */ bus_write_4(sc->iomem, PTNET_IO_CSB_GH_BAH, (paddr >> 32) & 0xffffffff); bus_write_4(sc->iomem, PTNET_IO_CSB_GH_BAL, paddr & 0xffffffff); paddr = vtophys(sc->csb_hg); bus_write_4(sc->iomem, PTNET_IO_CSB_HG_BAH, (paddr >> 32) & 0xffffffff); bus_write_4(sc->iomem, PTNET_IO_CSB_HG_BAL, paddr & 0xffffffff); } /* Allocate and initialize per-queue data structures. */ sc->queues = malloc(sizeof(struct ptnet_queue) * sc->num_rings, M_DEVBUF, M_NOWAIT | M_ZERO); if (sc->queues == NULL) { err = ENOMEM; goto err_path; } sc->rxqueues = sc->queues + num_tx_rings; for (i = 0; i < sc->num_rings; i++) { struct ptnet_queue *pq = sc->queues + i; pq->sc = sc; pq->kring_id = i; pq->kick = PTNET_IO_KICK_BASE + 4 * i; pq->atok = sc->csb_gh + i; pq->ktoa = sc->csb_hg + i; snprintf(pq->lock_name, sizeof(pq->lock_name), "%s-%d", device_get_nameunit(dev), i); mtx_init(&pq->lock, pq->lock_name, NULL, MTX_DEF); if (i >= num_tx_rings) { /* RX queue: fix kring_id. */ pq->kring_id -= num_tx_rings; } else { /* TX queue: allocate buf_ring. */ pq->bufring = buf_ring_alloc(PTNET_BUF_RING_SIZE, M_DEVBUF, M_NOWAIT, &pq->lock); if (pq->bufring == NULL) { err = ENOMEM; goto err_path; } } } sc->min_tx_space = 64; /* Safe initial value. */ err = ptnet_irqs_init(sc); if (err) { goto err_path; } /* Setup Ethernet interface. */ sc->ifp = ifp = if_alloc(IFT_ETHER); if (ifp == NULL) { device_printf(dev, "Failed to allocate ifnet\n"); err = ENOMEM; goto err_path; } if_initname(ifp, device_get_name(dev), device_get_unit(dev)); ifp->if_baudrate = IF_Gbps(10); ifp->if_softc = sc; ifp->if_flags = IFF_BROADCAST | IFF_MULTICAST | IFF_SIMPLEX; ifp->if_init = ptnet_init; ifp->if_ioctl = ptnet_ioctl; #if __FreeBSD_version >= 1100000 ifp->if_get_counter = ptnet_get_counter; #endif ifp->if_transmit = ptnet_transmit; ifp->if_qflush = ptnet_qflush; ifmedia_init(&sc->media, IFM_IMASK, ptnet_media_change, ptnet_media_status); ifmedia_add(&sc->media, IFM_ETHER | IFM_10G_T | IFM_FDX, 0, NULL); ifmedia_set(&sc->media, IFM_ETHER | IFM_10G_T | IFM_FDX); macreg = bus_read_4(sc->iomem, PTNET_IO_MAC_HI); sc->hwaddr[0] = (macreg >> 8) & 0xff; sc->hwaddr[1] = macreg & 0xff; macreg = bus_read_4(sc->iomem, PTNET_IO_MAC_LO); sc->hwaddr[2] = (macreg >> 24) & 0xff; sc->hwaddr[3] = (macreg >> 16) & 0xff; sc->hwaddr[4] = (macreg >> 8) & 0xff; sc->hwaddr[5] = macreg & 0xff; ether_ifattach(ifp, sc->hwaddr); ifp->if_hdrlen = sizeof(struct ether_vlan_header); ifp->if_capabilities |= IFCAP_JUMBO_MTU | IFCAP_VLAN_MTU; if (sc->ptfeatures & PTNETMAP_F_VNET_HDR) { /* Similarly to what the vtnet driver does, we can emulate * VLAN offloadings by inserting and removing the 802.1Q * header during transmit and receive. We are then able * to do checksum offloading of VLAN frames. */ ifp->if_capabilities |= IFCAP_HWCSUM | IFCAP_HWCSUM_IPV6 | IFCAP_VLAN_HWCSUM | IFCAP_TSO | IFCAP_LRO | IFCAP_VLAN_HWTSO | IFCAP_VLAN_HWTAGGING; } ifp->if_capenable = ifp->if_capabilities; #ifdef DEVICE_POLLING /* Don't enable polling by default. */ ifp->if_capabilities |= IFCAP_POLLING; #endif snprintf(sc->lock_name, sizeof(sc->lock_name), "%s", device_get_nameunit(dev)); mtx_init(&sc->lock, sc->lock_name, "ptnet core lock", MTX_DEF); callout_init_mtx(&sc->tick, &sc->lock, 0); /* Prepare a netmap_adapter struct instance to do netmap_attach(). */ nifp_offset = bus_read_4(sc->iomem, PTNET_IO_NIFP_OFS); memset(&na_arg, 0, sizeof(na_arg)); na_arg.ifp = ifp; na_arg.num_tx_desc = bus_read_4(sc->iomem, PTNET_IO_NUM_TX_SLOTS); na_arg.num_rx_desc = bus_read_4(sc->iomem, PTNET_IO_NUM_RX_SLOTS); na_arg.num_tx_rings = num_tx_rings; na_arg.num_rx_rings = num_rx_rings; na_arg.nm_config = ptnet_nm_config; na_arg.nm_krings_create = ptnet_nm_krings_create; na_arg.nm_krings_delete = ptnet_nm_krings_delete; na_arg.nm_dtor = ptnet_nm_dtor; na_arg.nm_intr = ptnet_nm_intr; na_arg.nm_register = ptnet_nm_register; na_arg.nm_txsync = ptnet_nm_txsync; na_arg.nm_rxsync = ptnet_nm_rxsync; netmap_pt_guest_attach(&na_arg, nifp_offset, bus_read_4(sc->iomem, PTNET_IO_HOSTMEMID)); /* Now a netmap adapter for this ifp has been allocated, and it * can be accessed through NA(ifp). We also have to initialize the CSB * pointer. */ sc->ptna = (struct netmap_pt_guest_adapter *)NA(ifp); /* If virtio-net header was negotiated, set the virt_hdr_len field in * the netmap adapter, to inform users that this netmap adapter requires * the application to deal with the headers. */ ptnet_update_vnet_hdr(sc); device_printf(dev, "%s() completed\n", __func__); return (0); err_path: ptnet_detach(dev); return err; } /* Stop host sync-kloop if it was running. */ static void ptnet_device_shutdown(struct ptnet_softc *sc) { ptnet_nm_ptctl(sc, PTNETMAP_PTCTL_DELETE); bus_write_4(sc->iomem, PTNET_IO_CSB_GH_BAH, 0); bus_write_4(sc->iomem, PTNET_IO_CSB_GH_BAL, 0); bus_write_4(sc->iomem, PTNET_IO_CSB_HG_BAH, 0); bus_write_4(sc->iomem, PTNET_IO_CSB_HG_BAL, 0); } static int ptnet_detach(device_t dev) { struct ptnet_softc *sc = device_get_softc(dev); int i; ptnet_device_shutdown(sc); #ifdef DEVICE_POLLING if (sc->ifp->if_capenable & IFCAP_POLLING) { ether_poll_deregister(sc->ifp); } #endif callout_drain(&sc->tick); if (sc->queues) { /* Drain taskqueues before calling if_detach. */ for (i = 0; i < sc->num_rings; i++) { struct ptnet_queue *pq = sc->queues + i; if (pq->taskq) { taskqueue_drain(pq->taskq, &pq->task); } } } if (sc->ifp) { ether_ifdetach(sc->ifp); /* Uninitialize netmap adapters for this device. */ netmap_detach(sc->ifp); ifmedia_removeall(&sc->media); if_free(sc->ifp); sc->ifp = NULL; } ptnet_irqs_fini(sc); if (sc->csb_gh) { contigfree(sc->csb_gh, 2*PAGE_SIZE, M_DEVBUF); sc->csb_gh = NULL; sc->csb_hg = NULL; } if (sc->queues) { for (i = 0; i < sc->num_rings; i++) { struct ptnet_queue *pq = sc->queues + i; if (mtx_initialized(&pq->lock)) { mtx_destroy(&pq->lock); } if (pq->bufring != NULL) { buf_ring_free(pq->bufring, M_DEVBUF); } } free(sc->queues, M_DEVBUF); sc->queues = NULL; } if (sc->iomem) { bus_release_resource(dev, SYS_RES_IOPORT, PCIR_BAR(PTNETMAP_IO_PCI_BAR), sc->iomem); sc->iomem = NULL; } mtx_destroy(&sc->lock); device_printf(dev, "%s() completed\n", __func__); return (0); } static int ptnet_suspend(device_t dev) { struct ptnet_softc *sc = device_get_softc(dev); (void)sc; return (0); } static int ptnet_resume(device_t dev) { struct ptnet_softc *sc = device_get_softc(dev); (void)sc; return (0); } static int ptnet_shutdown(device_t dev) { struct ptnet_softc *sc = device_get_softc(dev); ptnet_device_shutdown(sc); return (0); } static int ptnet_irqs_init(struct ptnet_softc *sc) { int rid = PCIR_BAR(PTNETMAP_MSIX_PCI_BAR); int nvecs = sc->num_rings; device_t dev = sc->dev; int err = ENOSPC; int cpu_cur; int i; if (pci_find_cap(dev, PCIY_MSIX, NULL) != 0) { device_printf(dev, "Could not find MSI-X capability\n"); return (ENXIO); } sc->msix_mem = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid, RF_ACTIVE); if (sc->msix_mem == NULL) { device_printf(dev, "Failed to allocate MSIX PCI BAR\n"); return (ENXIO); } if (pci_msix_count(dev) < nvecs) { device_printf(dev, "Not enough MSI-X vectors\n"); goto err_path; } err = pci_alloc_msix(dev, &nvecs); if (err) { device_printf(dev, "Failed to allocate MSI-X vectors\n"); goto err_path; } for (i = 0; i < nvecs; i++) { struct ptnet_queue *pq = sc->queues + i; rid = i + 1; pq->irq = bus_alloc_resource_any(dev, SYS_RES_IRQ, &rid, RF_ACTIVE); if (pq->irq == NULL) { device_printf(dev, "Failed to allocate interrupt " "for queue #%d\n", i); err = ENOSPC; goto err_path; } } cpu_cur = CPU_FIRST(); for (i = 0; i < nvecs; i++) { struct ptnet_queue *pq = sc->queues + i; void (*handler)(void *) = ptnet_tx_intr; if (i >= sc->num_tx_rings) { handler = ptnet_rx_intr; } err = bus_setup_intr(dev, pq->irq, INTR_TYPE_NET | INTR_MPSAFE, NULL /* intr_filter */, handler, pq, &pq->cookie); if (err) { device_printf(dev, "Failed to register intr handler " "for queue #%d\n", i); goto err_path; } bus_describe_intr(dev, pq->irq, pq->cookie, "q%d", i); #if 0 bus_bind_intr(sc->dev, pq->irq, cpu_cur); #endif cpu_cur = CPU_NEXT(cpu_cur); } device_printf(dev, "Allocated %d MSI-X vectors\n", nvecs); cpu_cur = CPU_FIRST(); for (i = 0; i < nvecs; i++) { struct ptnet_queue *pq = sc->queues + i; static void (*handler)(void *context, int pending); handler = (i < sc->num_tx_rings) ? ptnet_tx_task : ptnet_rx_task; TASK_INIT(&pq->task, 0, handler, pq); pq->taskq = taskqueue_create_fast("ptnet_queue", M_NOWAIT, taskqueue_thread_enqueue, &pq->taskq); taskqueue_start_threads(&pq->taskq, 1, PI_NET, "%s-pq-%d", device_get_nameunit(sc->dev), cpu_cur); cpu_cur = CPU_NEXT(cpu_cur); } return 0; err_path: ptnet_irqs_fini(sc); return err; } static void ptnet_irqs_fini(struct ptnet_softc *sc) { device_t dev = sc->dev; int i; for (i = 0; i < sc->num_rings; i++) { struct ptnet_queue *pq = sc->queues + i; if (pq->taskq) { taskqueue_free(pq->taskq); pq->taskq = NULL; } if (pq->cookie) { bus_teardown_intr(dev, pq->irq, pq->cookie); pq->cookie = NULL; } if (pq->irq) { bus_release_resource(dev, SYS_RES_IRQ, i + 1, pq->irq); pq->irq = NULL; } } if (sc->msix_mem) { pci_release_msi(dev); bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BAR(PTNETMAP_MSIX_PCI_BAR), sc->msix_mem); sc->msix_mem = NULL; } } static void ptnet_init(void *opaque) { struct ptnet_softc *sc = opaque; PTNET_CORE_LOCK(sc); ptnet_init_locked(sc); PTNET_CORE_UNLOCK(sc); } static int ptnet_ioctl(if_t ifp, u_long cmd, caddr_t data) { struct ptnet_softc *sc = if_getsoftc(ifp); device_t dev = sc->dev; struct ifreq *ifr = (struct ifreq *)data; int mask __unused, err = 0; switch (cmd) { case SIOCSIFFLAGS: device_printf(dev, "SIOCSIFFLAGS %x\n", ifp->if_flags); PTNET_CORE_LOCK(sc); if (ifp->if_flags & IFF_UP) { /* Network stack wants the iff to be up. */ err = ptnet_init_locked(sc); } else { /* Network stack wants the iff to be down. */ err = ptnet_stop(sc); } /* We don't need to do nothing to support IFF_PROMISC, * since that is managed by the backend port. */ PTNET_CORE_UNLOCK(sc); break; case SIOCSIFCAP: device_printf(dev, "SIOCSIFCAP %x %x\n", ifr->ifr_reqcap, ifp->if_capenable); mask = ifr->ifr_reqcap ^ ifp->if_capenable; #ifdef DEVICE_POLLING if (mask & IFCAP_POLLING) { struct ptnet_queue *pq; int i; if (ifr->ifr_reqcap & IFCAP_POLLING) { err = ether_poll_register(ptnet_poll, ifp); if (err) { break; } /* Stop queues and sync with taskqueues. */ ifp->if_drv_flags &= ~IFF_DRV_RUNNING; for (i = 0; i < sc->num_rings; i++) { pq = sc-> queues + i; /* Make sure the worker sees the * IFF_DRV_RUNNING down. */ PTNET_Q_LOCK(pq); pq->atok->appl_need_kick = 0; PTNET_Q_UNLOCK(pq); /* Wait for rescheduling to finish. */ if (pq->taskq) { taskqueue_drain(pq->taskq, &pq->task); } } ifp->if_drv_flags |= IFF_DRV_RUNNING; } else { err = ether_poll_deregister(ifp); for (i = 0; i < sc->num_rings; i++) { pq = sc-> queues + i; PTNET_Q_LOCK(pq); pq->atok->appl_need_kick = 1; PTNET_Q_UNLOCK(pq); } } } #endif /* DEVICE_POLLING */ ifp->if_capenable = ifr->ifr_reqcap; break; case SIOCSIFMTU: /* We support any reasonable MTU. */ if (ifr->ifr_mtu < ETHERMIN || ifr->ifr_mtu > PTNET_MAX_PKT_SIZE) { err = EINVAL; } else { PTNET_CORE_LOCK(sc); ifp->if_mtu = ifr->ifr_mtu; PTNET_CORE_UNLOCK(sc); } break; case SIOCSIFMEDIA: case SIOCGIFMEDIA: err = ifmedia_ioctl(ifp, ifr, &sc->media, cmd); break; default: err = ether_ioctl(ifp, cmd, data); break; } return err; } static int ptnet_init_locked(struct ptnet_softc *sc) { if_t ifp = sc->ifp; struct netmap_adapter *na_dr = &sc->ptna->dr.up; struct netmap_adapter *na_nm = &sc->ptna->hwup.up; unsigned int nm_buf_size; int ret; if (ifp->if_drv_flags & IFF_DRV_RUNNING) { return 0; /* nothing to do */ } device_printf(sc->dev, "%s\n", __func__); /* Translate offload capabilities according to if_capenable. */ ifp->if_hwassist = 0; if (ifp->if_capenable & IFCAP_TXCSUM) ifp->if_hwassist |= PTNET_CSUM_OFFLOAD; if (ifp->if_capenable & IFCAP_TXCSUM_IPV6) ifp->if_hwassist |= PTNET_CSUM_OFFLOAD_IPV6; if (ifp->if_capenable & IFCAP_TSO4) ifp->if_hwassist |= CSUM_IP_TSO; if (ifp->if_capenable & IFCAP_TSO6) ifp->if_hwassist |= CSUM_IP6_TSO; /* * Prepare the interface for netmap mode access. */ netmap_update_config(na_dr); ret = netmap_mem_finalize(na_dr->nm_mem, na_dr); if (ret) { device_printf(sc->dev, "netmap_mem_finalize() failed\n"); return ret; } if (sc->ptna->backend_users == 0) { ret = ptnet_nm_krings_create(na_nm); if (ret) { device_printf(sc->dev, "ptnet_nm_krings_create() " "failed\n"); goto err_mem_finalize; } ret = netmap_mem_rings_create(na_dr); if (ret) { device_printf(sc->dev, "netmap_mem_rings_create() " "failed\n"); goto err_rings_create; } ret = netmap_mem_get_lut(na_dr->nm_mem, &na_dr->na_lut); if (ret) { device_printf(sc->dev, "netmap_mem_get_lut() " "failed\n"); goto err_get_lut; } } ret = ptnet_nm_register(na_dr, 1 /* on */); if (ret) { goto err_register; } nm_buf_size = NETMAP_BUF_SIZE(na_dr); KASSERT(nm_buf_size > 0, ("Invalid netmap buffer size")); sc->min_tx_space = PTNET_MAX_PKT_SIZE / nm_buf_size + 2; device_printf(sc->dev, "%s: min_tx_space = %u\n", __func__, sc->min_tx_space); #ifdef PTNETMAP_STATS callout_reset(&sc->tick, hz, ptnet_tick, sc); #endif ifp->if_drv_flags |= IFF_DRV_RUNNING; return 0; err_register: memset(&na_dr->na_lut, 0, sizeof(na_dr->na_lut)); err_get_lut: netmap_mem_rings_delete(na_dr); err_rings_create: ptnet_nm_krings_delete(na_nm); err_mem_finalize: netmap_mem_deref(na_dr->nm_mem, na_dr); return ret; } /* To be called under core lock. */ static int ptnet_stop(struct ptnet_softc *sc) { if_t ifp = sc->ifp; struct netmap_adapter *na_dr = &sc->ptna->dr.up; struct netmap_adapter *na_nm = &sc->ptna->hwup.up; int i; device_printf(sc->dev, "%s\n", __func__); if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) { return 0; /* nothing to do */ } /* Clear the driver-ready flag, and synchronize with all the queues, * so that after this loop we are sure nobody is working anymore with * the device. This scheme is taken from the vtnet driver. */ ifp->if_drv_flags &= ~IFF_DRV_RUNNING; callout_stop(&sc->tick); for (i = 0; i < sc->num_rings; i++) { PTNET_Q_LOCK(sc->queues + i); PTNET_Q_UNLOCK(sc->queues + i); } ptnet_nm_register(na_dr, 0 /* off */); if (sc->ptna->backend_users == 0) { netmap_mem_rings_delete(na_dr); ptnet_nm_krings_delete(na_nm); } netmap_mem_deref(na_dr->nm_mem, na_dr); return 0; } static void ptnet_qflush(if_t ifp) { struct ptnet_softc *sc = if_getsoftc(ifp); int i; /* Flush all the bufrings and do the interface flush. */ for (i = 0; i < sc->num_rings; i++) { struct ptnet_queue *pq = sc->queues + i; struct mbuf *m; PTNET_Q_LOCK(pq); if (pq->bufring) { while ((m = buf_ring_dequeue_sc(pq->bufring))) { m_freem(m); } } PTNET_Q_UNLOCK(pq); } if_qflush(ifp); } static int ptnet_media_change(if_t ifp) { struct ptnet_softc *sc = if_getsoftc(ifp); struct ifmedia *ifm = &sc->media; if (IFM_TYPE(ifm->ifm_media) != IFM_ETHER) { return EINVAL; } return 0; } #if __FreeBSD_version >= 1100000 static uint64_t ptnet_get_counter(if_t ifp, ift_counter cnt) { struct ptnet_softc *sc = if_getsoftc(ifp); struct ptnet_queue_stats stats[2]; int i; /* Accumulate statistics over the queues. */ memset(stats, 0, sizeof(stats)); for (i = 0; i < sc->num_rings; i++) { struct ptnet_queue *pq = sc->queues + i; int idx = (i < sc->num_tx_rings) ? 0 : 1; stats[idx].packets += pq->stats.packets; stats[idx].bytes += pq->stats.bytes; stats[idx].errors += pq->stats.errors; stats[idx].iqdrops += pq->stats.iqdrops; stats[idx].mcasts += pq->stats.mcasts; } switch (cnt) { case IFCOUNTER_IPACKETS: return (stats[1].packets); case IFCOUNTER_IQDROPS: return (stats[1].iqdrops); case IFCOUNTER_IERRORS: return (stats[1].errors); case IFCOUNTER_OPACKETS: return (stats[0].packets); case IFCOUNTER_OBYTES: return (stats[0].bytes); case IFCOUNTER_OMCASTS: return (stats[0].mcasts); default: return (if_get_counter_default(ifp, cnt)); } } #endif #ifdef PTNETMAP_STATS /* Called under core lock. */ static void ptnet_tick(void *opaque) { struct ptnet_softc *sc = opaque; int i; for (i = 0; i < sc->num_rings; i++) { struct ptnet_queue *pq = sc->queues + i; struct ptnet_queue_stats cur = pq->stats; struct timeval now; unsigned int delta; microtime(&now); delta = now.tv_usec - sc->last_ts.tv_usec + (now.tv_sec - sc->last_ts.tv_sec) * 1000000; delta /= 1000; /* in milliseconds */ if (delta == 0) continue; device_printf(sc->dev, "#%d[%u ms]:pkts %lu, kicks %lu, " "intr %lu\n", i, delta, (cur.packets - pq->last_stats.packets), (cur.kicks - pq->last_stats.kicks), (cur.intrs - pq->last_stats.intrs)); pq->last_stats = cur; } microtime(&sc->last_ts); callout_schedule(&sc->tick, hz); } #endif /* PTNETMAP_STATS */ static void ptnet_media_status(if_t ifp, struct ifmediareq *ifmr) { /* We are always active, as the backend netmap port is * always open in netmap mode. */ ifmr->ifm_status = IFM_AVALID | IFM_ACTIVE; ifmr->ifm_active = IFM_ETHER | IFM_10G_T | IFM_FDX; } static uint32_t ptnet_nm_ptctl(struct ptnet_softc *sc, uint32_t cmd) { /* * Write a command and read back error status, * with zero meaning success. */ bus_write_4(sc->iomem, PTNET_IO_PTCTL, cmd); return bus_read_4(sc->iomem, PTNET_IO_PTCTL); } static int ptnet_nm_config(struct netmap_adapter *na, struct nm_config_info *info) { struct ptnet_softc *sc = if_getsoftc(na->ifp); info->num_tx_rings = bus_read_4(sc->iomem, PTNET_IO_NUM_TX_RINGS); info->num_rx_rings = bus_read_4(sc->iomem, PTNET_IO_NUM_RX_RINGS); info->num_tx_descs = bus_read_4(sc->iomem, PTNET_IO_NUM_TX_SLOTS); info->num_rx_descs = bus_read_4(sc->iomem, PTNET_IO_NUM_RX_SLOTS); info->rx_buf_maxsize = NETMAP_BUF_SIZE(na); device_printf(sc->dev, "txr %u, rxr %u, txd %u, rxd %u, rxbufsz %u\n", info->num_tx_rings, info->num_rx_rings, info->num_tx_descs, info->num_rx_descs, info->rx_buf_maxsize); return 0; } static void ptnet_sync_from_csb(struct ptnet_softc *sc, struct netmap_adapter *na) { int i; /* Sync krings from the host, reading from * CSB. */ for (i = 0; i < sc->num_rings; i++) { struct nm_csb_atok *atok = sc->queues[i].atok; struct nm_csb_ktoa *ktoa = sc->queues[i].ktoa; struct netmap_kring *kring; if (i < na->num_tx_rings) { kring = na->tx_rings[i]; } else { kring = na->rx_rings[i - na->num_tx_rings]; } kring->rhead = kring->ring->head = atok->head; kring->rcur = kring->ring->cur = atok->cur; kring->nr_hwcur = ktoa->hwcur; kring->nr_hwtail = kring->rtail = kring->ring->tail = ktoa->hwtail; nm_prdis("%d,%d: csb {hc %u h %u c %u ht %u}", t, i, ktoa->hwcur, atok->head, atok->cur, ktoa->hwtail); nm_prdis("%d,%d: kring {hc %u rh %u rc %u h %u c %u ht %u rt %u t %u}", t, i, kring->nr_hwcur, kring->rhead, kring->rcur, kring->ring->head, kring->ring->cur, kring->nr_hwtail, kring->rtail, kring->ring->tail); } } static void ptnet_update_vnet_hdr(struct ptnet_softc *sc) { unsigned int wanted_hdr_len = ptnet_vnet_hdr ? PTNET_HDR_SIZE : 0; bus_write_4(sc->iomem, PTNET_IO_VNET_HDR_LEN, wanted_hdr_len); sc->vnet_hdr_len = bus_read_4(sc->iomem, PTNET_IO_VNET_HDR_LEN); sc->ptna->hwup.up.virt_hdr_len = sc->vnet_hdr_len; } static int ptnet_nm_register(struct netmap_adapter *na, int onoff) { /* device-specific */ if_t ifp = na->ifp; struct ptnet_softc *sc = if_getsoftc(ifp); int native = (na == &sc->ptna->hwup.up); struct ptnet_queue *pq; int ret = 0; int i; if (!onoff) { sc->ptna->backend_users--; } /* If this is the last netmap client, guest interrupt enable flags may * be in arbitrary state. Since these flags are going to be used also * by the netdevice driver, we have to make sure to start with * notifications enabled. Also, schedule NAPI to flush pending packets * in the RX rings, since we will not receive further interrupts * until these will be processed. */ if (native && !onoff && na->active_fds == 0) { nm_prinf("Exit netmap mode, re-enable interrupts"); for (i = 0; i < sc->num_rings; i++) { pq = sc->queues + i; pq->atok->appl_need_kick = 1; } } if (onoff) { if (sc->ptna->backend_users == 0) { /* Initialize notification enable fields in the CSB. */ for (i = 0; i < sc->num_rings; i++) { pq = sc->queues + i; pq->ktoa->kern_need_kick = 1; pq->atok->appl_need_kick = (!(ifp->if_capenable & IFCAP_POLLING) && i >= sc->num_tx_rings); } /* Set the virtio-net header length. */ ptnet_update_vnet_hdr(sc); /* Make sure the host adapter passed through is ready * for txsync/rxsync. */ ret = ptnet_nm_ptctl(sc, PTNETMAP_PTCTL_CREATE); if (ret) { return ret; } /* Align the guest krings and rings to the state stored * in the CSB. */ ptnet_sync_from_csb(sc, na); } /* If not native, don't call nm_set_native_flags, since we don't want * to replace if_transmit method, nor set NAF_NETMAP_ON */ if (native) { netmap_krings_mode_commit(na, onoff); nm_set_native_flags(na); } } else { if (native) { nm_clear_native_flags(na); netmap_krings_mode_commit(na, onoff); } if (sc->ptna->backend_users == 0) { ret = ptnet_nm_ptctl(sc, PTNETMAP_PTCTL_DELETE); } } if (onoff) { sc->ptna->backend_users++; } return ret; } static int ptnet_nm_txsync(struct netmap_kring *kring, int flags) { struct ptnet_softc *sc = if_getsoftc(kring->na->ifp); struct ptnet_queue *pq = sc->queues + kring->ring_id; bool notify; notify = netmap_pt_guest_txsync(pq->atok, pq->ktoa, kring, flags); if (notify) { ptnet_kick(pq); } return 0; } static int ptnet_nm_rxsync(struct netmap_kring *kring, int flags) { struct ptnet_softc *sc = if_getsoftc(kring->na->ifp); struct ptnet_queue *pq = sc->rxqueues + kring->ring_id; bool notify; notify = netmap_pt_guest_rxsync(pq->atok, pq->ktoa, kring, flags); if (notify) { ptnet_kick(pq); } return 0; } static void ptnet_nm_intr(struct netmap_adapter *na, int onoff) { struct ptnet_softc *sc = if_getsoftc(na->ifp); int i; for (i = 0; i < sc->num_rings; i++) { struct ptnet_queue *pq = sc->queues + i; pq->atok->appl_need_kick = onoff; } } static void ptnet_tx_intr(void *opaque) { struct ptnet_queue *pq = opaque; struct ptnet_softc *sc = pq->sc; DBG(device_printf(sc->dev, "Tx interrupt #%d\n", pq->kring_id)); #ifdef PTNETMAP_STATS pq->stats.intrs ++; #endif /* PTNETMAP_STATS */ if (netmap_tx_irq(sc->ifp, pq->kring_id) != NM_IRQ_PASS) { return; } /* Schedule the tasqueue to flush process transmissions requests. * However, vtnet, if_em and if_igb just call ptnet_transmit() here, * at least when using MSI-X interrupts. The if_em driver, instead * schedule taskqueue when using legacy interrupts. */ taskqueue_enqueue(pq->taskq, &pq->task); } static void ptnet_rx_intr(void *opaque) { struct ptnet_queue *pq = opaque; struct ptnet_softc *sc = pq->sc; unsigned int unused; DBG(device_printf(sc->dev, "Rx interrupt #%d\n", pq->kring_id)); #ifdef PTNETMAP_STATS pq->stats.intrs ++; #endif /* PTNETMAP_STATS */ if (netmap_rx_irq(sc->ifp, pq->kring_id, &unused) != NM_IRQ_PASS) { return; } /* Like vtnet, if_igb and if_em drivers when using MSI-X interrupts, * receive-side processing is executed directly in the interrupt * service routine. Alternatively, we may schedule the taskqueue. */ ptnet_rx_eof(pq, PTNET_RX_BUDGET, true); } -/* The following offloadings-related functions are taken from the vtnet - * driver, but the same functionality is required for the ptnet driver. - * As a temporary solution, I copied this code from vtnet and I started - * to generalize it (taking away driver-specific statistic accounting), - * making as little modifications as possible. - * In the future we need to share these functions between vtnet and ptnet. - */ -static int -ptnet_tx_offload_ctx(struct mbuf *m, int *etype, int *proto, int *start) -{ - struct ether_vlan_header *evh; - int offset; - - evh = mtod(m, struct ether_vlan_header *); - if (evh->evl_encap_proto == htons(ETHERTYPE_VLAN)) { - /* BMV: We should handle nested VLAN tags too. */ - *etype = ntohs(evh->evl_proto); - offset = sizeof(struct ether_vlan_header); - } else { - *etype = ntohs(evh->evl_encap_proto); - offset = sizeof(struct ether_header); - } - - switch (*etype) { -#if defined(INET) - case ETHERTYPE_IP: { - struct ip *ip, iphdr; - if (__predict_false(m->m_len < offset + sizeof(struct ip))) { - m_copydata(m, offset, sizeof(struct ip), - (caddr_t) &iphdr); - ip = &iphdr; - } else - ip = (struct ip *)(m->m_data + offset); - *proto = ip->ip_p; - *start = offset + (ip->ip_hl << 2); - break; - } -#endif -#if defined(INET6) - case ETHERTYPE_IPV6: - *proto = -1; - *start = ip6_lasthdr(m, offset, IPPROTO_IPV6, proto); - /* Assert the network stack sent us a valid packet. */ - KASSERT(*start > offset, - ("%s: mbuf %p start %d offset %d proto %d", __func__, m, - *start, offset, *proto)); - break; -#endif - default: - /* Here we should increment the tx_csum_bad_ethtype counter. */ - return (EINVAL); - } - - return (0); -} - -static int -ptnet_tx_offload_tso(if_t ifp, struct mbuf *m, int eth_type, - int offset, bool allow_ecn, struct virtio_net_hdr *hdr) -{ - static struct timeval lastecn; - static int curecn; - struct tcphdr *tcp, tcphdr; - - if (__predict_false(m->m_len < offset + sizeof(struct tcphdr))) { - m_copydata(m, offset, sizeof(struct tcphdr), (caddr_t) &tcphdr); - tcp = &tcphdr; - } else - tcp = (struct tcphdr *)(m->m_data + offset); - - hdr->hdr_len = offset + (tcp->th_off << 2); - hdr->gso_size = m->m_pkthdr.tso_segsz; - hdr->gso_type = eth_type == ETHERTYPE_IP ? VIRTIO_NET_HDR_GSO_TCPV4 : - VIRTIO_NET_HDR_GSO_TCPV6; - - if (tcp->th_flags & TH_CWR) { - /* - * Drop if VIRTIO_NET_F_HOST_ECN was not negotiated. In FreeBSD, - * ECN support is not on a per-interface basis, but globally via - * the net.inet.tcp.ecn.enable sysctl knob. The default is off. - */ - if (!allow_ecn) { - if (ppsratecheck(&lastecn, &curecn, 1)) - if_printf(ifp, - "TSO with ECN not negotiated with host\n"); - return (ENOTSUP); - } - hdr->gso_type |= VIRTIO_NET_HDR_GSO_ECN; - } - - /* Here we should increment tx_tso counter. */ - - return (0); -} - -static struct mbuf * -ptnet_tx_offload(if_t ifp, struct mbuf *m, bool allow_ecn, - struct virtio_net_hdr *hdr) -{ - int flags, etype, csum_start, proto, error; - - flags = m->m_pkthdr.csum_flags; - - error = ptnet_tx_offload_ctx(m, &etype, &proto, &csum_start); - if (error) - goto drop; - - if ((etype == ETHERTYPE_IP && flags & PTNET_CSUM_OFFLOAD) || - (etype == ETHERTYPE_IPV6 && flags & PTNET_CSUM_OFFLOAD_IPV6)) { - /* - * We could compare the IP protocol vs the CSUM_ flag too, - * but that really should not be necessary. - */ - hdr->flags |= VIRTIO_NET_HDR_F_NEEDS_CSUM; - hdr->csum_start = csum_start; - hdr->csum_offset = m->m_pkthdr.csum_data; - /* Here we should increment the tx_csum counter. */ - } - - if (flags & CSUM_TSO) { - if (__predict_false(proto != IPPROTO_TCP)) { - /* Likely failed to correctly parse the mbuf. - * Here we should increment the tx_tso_not_tcp - * counter. */ - goto drop; - } - - KASSERT(hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM, - ("%s: mbuf %p TSO without checksum offload %#x", - __func__, m, flags)); - - error = ptnet_tx_offload_tso(ifp, m, etype, csum_start, - allow_ecn, hdr); - if (error) - goto drop; - } - - return (m); - -drop: - m_freem(m); - return (NULL); -} - static void ptnet_vlan_tag_remove(struct mbuf *m) { struct ether_vlan_header *evh; evh = mtod(m, struct ether_vlan_header *); m->m_pkthdr.ether_vtag = ntohs(evh->evl_tag); m->m_flags |= M_VLANTAG; /* Strip the 802.1Q header. */ bcopy((char *) evh, (char *) evh + ETHER_VLAN_ENCAP_LEN, ETHER_HDR_LEN - ETHER_TYPE_LEN); m_adj(m, ETHER_VLAN_ENCAP_LEN); } -/* - * Use the checksum offset in the VirtIO header to set the - * correct CSUM_* flags. - */ -static int -ptnet_rx_csum_by_offset(struct mbuf *m, uint16_t eth_type, int ip_start, - struct virtio_net_hdr *hdr) -{ -#if defined(INET) || defined(INET6) - int offset = hdr->csum_start + hdr->csum_offset; -#endif - - /* Only do a basic sanity check on the offset. */ - switch (eth_type) { -#if defined(INET) - case ETHERTYPE_IP: - if (__predict_false(offset < ip_start + sizeof(struct ip))) - return (1); - break; -#endif -#if defined(INET6) - case ETHERTYPE_IPV6: - if (__predict_false(offset < ip_start + sizeof(struct ip6_hdr))) - return (1); - break; -#endif - default: - /* Here we should increment the rx_csum_bad_ethtype counter. */ - return (1); - } - - /* - * Use the offset to determine the appropriate CSUM_* flags. This is - * a bit dirty, but we can get by with it since the checksum offsets - * happen to be different. We assume the host host does not do IPv4 - * header checksum offloading. - */ - switch (hdr->csum_offset) { - case offsetof(struct udphdr, uh_sum): - case offsetof(struct tcphdr, th_sum): - m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR; - m->m_pkthdr.csum_data = 0xFFFF; - break; - default: - /* Here we should increment the rx_csum_bad_offset counter. */ - return (1); - } - - return (0); -} - -static int -ptnet_rx_csum_by_parse(struct mbuf *m, uint16_t eth_type, int ip_start, - struct virtio_net_hdr *hdr) -{ - int offset, proto; - - switch (eth_type) { -#if defined(INET) - case ETHERTYPE_IP: { - struct ip *ip; - if (__predict_false(m->m_len < ip_start + sizeof(struct ip))) - return (1); - ip = (struct ip *)(m->m_data + ip_start); - proto = ip->ip_p; - offset = ip_start + (ip->ip_hl << 2); - break; - } -#endif -#if defined(INET6) - case ETHERTYPE_IPV6: - if (__predict_false(m->m_len < ip_start + - sizeof(struct ip6_hdr))) - return (1); - offset = ip6_lasthdr(m, ip_start, IPPROTO_IPV6, &proto); - if (__predict_false(offset < 0)) - return (1); - break; -#endif - default: - /* Here we should increment the rx_csum_bad_ethtype counter. */ - return (1); - } - - switch (proto) { - case IPPROTO_TCP: - if (__predict_false(m->m_len < offset + sizeof(struct tcphdr))) - return (1); - m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR; - m->m_pkthdr.csum_data = 0xFFFF; - break; - case IPPROTO_UDP: - if (__predict_false(m->m_len < offset + sizeof(struct udphdr))) - return (1); - m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR; - m->m_pkthdr.csum_data = 0xFFFF; - break; - default: - /* - * For the remaining protocols, FreeBSD does not support - * checksum offloading, so the checksum will be recomputed. - */ -#if 0 - if_printf(ifp, "cksum offload of unsupported " - "protocol eth_type=%#x proto=%d csum_start=%d " - "csum_offset=%d\n", __func__, eth_type, proto, - hdr->csum_start, hdr->csum_offset); -#endif - break; - } - - return (0); -} - -/* - * Set the appropriate CSUM_* flags. Unfortunately, the information - * provided is not directly useful to us. The VirtIO header gives the - * offset of the checksum, which is all Linux needs, but this is not - * how FreeBSD does things. We are forced to peek inside the packet - * a bit. - * - * It would be nice if VirtIO gave us the L4 protocol or if FreeBSD - * could accept the offsets and let the stack figure it out. - */ -static int -ptnet_rx_csum(struct mbuf *m, struct virtio_net_hdr *hdr) -{ - struct ether_header *eh; - struct ether_vlan_header *evh; - uint16_t eth_type; - int offset, error; - - eh = mtod(m, struct ether_header *); - eth_type = ntohs(eh->ether_type); - if (eth_type == ETHERTYPE_VLAN) { - /* BMV: We should handle nested VLAN tags too. */ - evh = mtod(m, struct ether_vlan_header *); - eth_type = ntohs(evh->evl_proto); - offset = sizeof(struct ether_vlan_header); - } else - offset = sizeof(struct ether_header); - - if (hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) - error = ptnet_rx_csum_by_offset(m, eth_type, offset, hdr); - else - error = ptnet_rx_csum_by_parse(m, eth_type, offset, hdr); - - return (error); -} -/* End of offloading-related functions to be shared with vtnet. */ - static void ptnet_ring_update(struct ptnet_queue *pq, struct netmap_kring *kring, unsigned int head, unsigned int sync_flags) { struct netmap_ring *ring = kring->ring; struct nm_csb_atok *atok = pq->atok; struct nm_csb_ktoa *ktoa = pq->ktoa; /* Some packets have been pushed to the netmap ring. We have * to tell the host to process the new packets, updating cur * and head in the CSB. */ ring->head = ring->cur = head; /* Mimic nm_txsync_prologue/nm_rxsync_prologue. */ kring->rcur = kring->rhead = head; nm_sync_kloop_appl_write(atok, kring->rcur, kring->rhead); /* Kick the host if needed. */ if (NM_ACCESS_ONCE(ktoa->kern_need_kick)) { atok->sync_flags = sync_flags; ptnet_kick(pq); } } #define PTNET_TX_NOSPACE(_h, _k, _min) \ ((((_h) < (_k)->rtail) ? 0 : (_k)->nkr_num_slots) + \ (_k)->rtail - (_h)) < (_min) /* This function may be called by the network stack, or by * by the taskqueue thread. */ static int ptnet_drain_transmit_queue(struct ptnet_queue *pq, unsigned int budget, bool may_resched) { struct ptnet_softc *sc = pq->sc; bool have_vnet_hdr = sc->vnet_hdr_len; struct netmap_adapter *na = &sc->ptna->dr.up; if_t ifp = sc->ifp; unsigned int batch_count = 0; struct nm_csb_atok *atok; struct nm_csb_ktoa *ktoa; struct netmap_kring *kring; struct netmap_ring *ring; struct netmap_slot *slot; unsigned int count = 0; unsigned int minspace; unsigned int head; unsigned int lim; struct mbuf *mhead; struct mbuf *mf; int nmbuf_bytes; uint8_t *nmbuf; if (!PTNET_Q_TRYLOCK(pq)) { /* We failed to acquire the lock, schedule the taskqueue. */ nm_prlim(1, "Deferring TX work"); if (may_resched) { taskqueue_enqueue(pq->taskq, &pq->task); } return 0; } if (unlikely(!(ifp->if_drv_flags & IFF_DRV_RUNNING))) { PTNET_Q_UNLOCK(pq); nm_prlim(1, "Interface is down"); return ENETDOWN; } atok = pq->atok; ktoa = pq->ktoa; kring = na->tx_rings[pq->kring_id]; ring = kring->ring; lim = kring->nkr_num_slots - 1; head = ring->head; minspace = sc->min_tx_space; while (count < budget) { if (PTNET_TX_NOSPACE(head, kring, minspace)) { /* We ran out of slot, let's see if the host has * freed up some, by reading hwcur and hwtail from * the CSB. */ ptnet_sync_tail(ktoa, kring); if (PTNET_TX_NOSPACE(head, kring, minspace)) { /* Still no slots available. Reactivate the * interrupts so that we can be notified * when some free slots are made available by * the host. */ atok->appl_need_kick = 1; /* Double check. We need a full barrier to * prevent the store to atok->appl_need_kick * to be reordered with the load from * ktoa->hwcur and ktoa->hwtail (store-load * barrier). */ nm_stld_barrier(); ptnet_sync_tail(ktoa, kring); if (likely(PTNET_TX_NOSPACE(head, kring, minspace))) { break; } nm_prlim(1, "Found more slots by doublecheck"); /* More slots were freed before reactivating * the interrupts. */ atok->appl_need_kick = 0; } } mhead = drbr_peek(ifp, pq->bufring); if (!mhead) { break; } /* Initialize transmission state variables. */ slot = ring->slot + head; nmbuf = NMB(na, slot); nmbuf_bytes = 0; /* If needed, prepare the virtio-net header at the beginning * of the first slot. */ if (have_vnet_hdr) { struct virtio_net_hdr *vh = (struct virtio_net_hdr *)nmbuf; /* For performance, we could replace this memset() with * two 8-bytes-wide writes. */ memset(nmbuf, 0, PTNET_HDR_SIZE); if (mhead->m_pkthdr.csum_flags & PTNET_ALL_OFFLOAD) { - mhead = ptnet_tx_offload(ifp, mhead, false, + mhead = virtio_net_tx_offload(ifp, mhead, false, vh); if (unlikely(!mhead)) { /* Packet dropped because errors * occurred while preparing the vnet * header. Let's go ahead with the next * packet. */ pq->stats.errors ++; drbr_advance(ifp, pq->bufring); continue; } } nm_prdis(1, "%s: [csum_flags %lX] vnet hdr: flags %x " "csum_start %u csum_ofs %u hdr_len = %u " "gso_size %u gso_type %x", __func__, mhead->m_pkthdr.csum_flags, vh->flags, vh->csum_start, vh->csum_offset, vh->hdr_len, vh->gso_size, vh->gso_type); nmbuf += PTNET_HDR_SIZE; nmbuf_bytes += PTNET_HDR_SIZE; } for (mf = mhead; mf; mf = mf->m_next) { uint8_t *mdata = mf->m_data; int mlen = mf->m_len; for (;;) { int copy = NETMAP_BUF_SIZE(na) - nmbuf_bytes; if (mlen < copy) { copy = mlen; } memcpy(nmbuf, mdata, copy); mdata += copy; mlen -= copy; nmbuf += copy; nmbuf_bytes += copy; if (!mlen) { break; } slot->len = nmbuf_bytes; slot->flags = NS_MOREFRAG; head = nm_next(head, lim); KASSERT(head != ring->tail, ("Unexpectedly run out of TX space")); slot = ring->slot + head; nmbuf = NMB(na, slot); nmbuf_bytes = 0; } } /* Complete last slot and update head. */ slot->len = nmbuf_bytes; slot->flags = 0; head = nm_next(head, lim); /* Consume the packet just processed. */ drbr_advance(ifp, pq->bufring); /* Copy the packet to listeners. */ ETHER_BPF_MTAP(ifp, mhead); pq->stats.packets ++; pq->stats.bytes += mhead->m_pkthdr.len; if (mhead->m_flags & M_MCAST) { pq->stats.mcasts ++; } m_freem(mhead); count ++; if (++batch_count == PTNET_TX_BATCH) { ptnet_ring_update(pq, kring, head, NAF_FORCE_RECLAIM); batch_count = 0; } } if (batch_count) { ptnet_ring_update(pq, kring, head, NAF_FORCE_RECLAIM); } if (count >= budget && may_resched) { DBG(nm_prlim(1, "out of budget: resched, %d mbufs pending\n", drbr_inuse(ifp, pq->bufring))); taskqueue_enqueue(pq->taskq, &pq->task); } PTNET_Q_UNLOCK(pq); return count; } static int ptnet_transmit(if_t ifp, struct mbuf *m) { struct ptnet_softc *sc = if_getsoftc(ifp); struct ptnet_queue *pq; unsigned int queue_idx; int err; DBG(device_printf(sc->dev, "transmit %p\n", m)); /* Insert 802.1Q header if needed. */ if (m->m_flags & M_VLANTAG) { m = ether_vlanencap(m, m->m_pkthdr.ether_vtag); if (m == NULL) { return ENOBUFS; } m->m_flags &= ~M_VLANTAG; } /* Get the flow-id if available. */ queue_idx = (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) ? m->m_pkthdr.flowid : curcpu; if (unlikely(queue_idx >= sc->num_tx_rings)) { queue_idx %= sc->num_tx_rings; } pq = sc->queues + queue_idx; err = drbr_enqueue(ifp, pq->bufring, m); if (err) { /* ENOBUFS when the bufring is full */ nm_prlim(1, "%s: drbr_enqueue() failed %d\n", __func__, err); pq->stats.errors ++; return err; } if (ifp->if_capenable & IFCAP_POLLING) { /* If polling is on, the transmit queues will be * drained by the poller. */ return 0; } err = ptnet_drain_transmit_queue(pq, PTNET_TX_BUDGET, true); return (err < 0) ? err : 0; } static unsigned int ptnet_rx_discard(struct netmap_kring *kring, unsigned int head) { struct netmap_ring *ring = kring->ring; struct netmap_slot *slot = ring->slot + head; for (;;) { head = nm_next(head, kring->nkr_num_slots - 1); if (!(slot->flags & NS_MOREFRAG) || head == ring->tail) { break; } slot = ring->slot + head; } return head; } static inline struct mbuf * ptnet_rx_slot(struct mbuf *mtail, uint8_t *nmbuf, unsigned int nmbuf_len) { uint8_t *mdata = mtod(mtail, uint8_t *) + mtail->m_len; do { unsigned int copy; if (mtail->m_len == MCLBYTES) { struct mbuf *mf; mf = m_getcl(M_NOWAIT, MT_DATA, 0); if (unlikely(!mf)) { return NULL; } mtail->m_next = mf; mtail = mf; mdata = mtod(mtail, uint8_t *); mtail->m_len = 0; } copy = MCLBYTES - mtail->m_len; if (nmbuf_len < copy) { copy = nmbuf_len; } memcpy(mdata, nmbuf, copy); nmbuf += copy; nmbuf_len -= copy; mdata += copy; mtail->m_len += copy; } while (nmbuf_len); return mtail; } static int ptnet_rx_eof(struct ptnet_queue *pq, unsigned int budget, bool may_resched) { struct ptnet_softc *sc = pq->sc; bool have_vnet_hdr = sc->vnet_hdr_len; struct nm_csb_atok *atok = pq->atok; struct nm_csb_ktoa *ktoa = pq->ktoa; struct netmap_adapter *na = &sc->ptna->dr.up; struct netmap_kring *kring = na->rx_rings[pq->kring_id]; struct netmap_ring *ring = kring->ring; unsigned int const lim = kring->nkr_num_slots - 1; unsigned int batch_count = 0; if_t ifp = sc->ifp; unsigned int count = 0; uint32_t head; PTNET_Q_LOCK(pq); if (unlikely(!(ifp->if_drv_flags & IFF_DRV_RUNNING))) { goto unlock; } kring->nr_kflags &= ~NKR_PENDINTR; head = ring->head; while (count < budget) { uint32_t prev_head = head; struct mbuf *mhead, *mtail; struct virtio_net_hdr *vh; struct netmap_slot *slot; unsigned int nmbuf_len; uint8_t *nmbuf; int deliver = 1; /* the mbuf to the network stack. */ host_sync: if (head == ring->tail) { /* We ran out of slot, let's see if the host has * added some, by reading hwcur and hwtail from * the CSB. */ ptnet_sync_tail(ktoa, kring); if (head == ring->tail) { /* Still no slots available. Reactivate * interrupts as they were disabled by the * host thread right before issuing the * last interrupt. */ atok->appl_need_kick = 1; /* Double check for more completed RX slots. * We need a full barrier to prevent the store * to atok->appl_need_kick to be reordered with * the load from ktoa->hwcur and ktoa->hwtail * (store-load barrier). */ nm_stld_barrier(); ptnet_sync_tail(ktoa, kring); if (likely(head == ring->tail)) { break; } atok->appl_need_kick = 0; } } /* Initialize ring state variables, possibly grabbing the * virtio-net header. */ slot = ring->slot + head; nmbuf = NMB(na, slot); nmbuf_len = slot->len; vh = (struct virtio_net_hdr *)nmbuf; if (have_vnet_hdr) { if (unlikely(nmbuf_len < PTNET_HDR_SIZE)) { /* There is no good reason why host should * put the header in multiple netmap slots. * If this is the case, discard. */ nm_prlim(1, "Fragmented vnet-hdr: dropping"); head = ptnet_rx_discard(kring, head); pq->stats.iqdrops ++; deliver = 0; goto skip; } nm_prdis(1, "%s: vnet hdr: flags %x csum_start %u " "csum_ofs %u hdr_len = %u gso_size %u " "gso_type %x", __func__, vh->flags, vh->csum_start, vh->csum_offset, vh->hdr_len, vh->gso_size, vh->gso_type); nmbuf += PTNET_HDR_SIZE; nmbuf_len -= PTNET_HDR_SIZE; } /* Allocate the head of a new mbuf chain. * We use m_getcl() to allocate an mbuf with standard cluster * size (MCLBYTES). In the future we could use m_getjcl() * to choose different sizes. */ mhead = mtail = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); if (unlikely(mhead == NULL)) { device_printf(sc->dev, "%s: failed to allocate mbuf " "head\n", __func__); pq->stats.errors ++; break; } /* Initialize the mbuf state variables. */ mhead->m_pkthdr.len = nmbuf_len; mtail->m_len = 0; /* Scan all the netmap slots containing the current packet. */ for (;;) { DBG(device_printf(sc->dev, "%s: h %u t %u rcv frag " "len %u, flags %u\n", __func__, head, ring->tail, slot->len, slot->flags)); mtail = ptnet_rx_slot(mtail, nmbuf, nmbuf_len); if (unlikely(!mtail)) { /* Ouch. We ran out of memory while processing * a packet. We have to restore the previous * head position, free the mbuf chain, and * schedule the taskqueue to give the packet * another chance. */ device_printf(sc->dev, "%s: failed to allocate" " mbuf frag, reset head %u --> %u\n", __func__, head, prev_head); head = prev_head; m_freem(mhead); pq->stats.errors ++; if (may_resched) { taskqueue_enqueue(pq->taskq, &pq->task); } goto escape; } /* We have to increment head irrespective of the * NS_MOREFRAG being set or not. */ head = nm_next(head, lim); if (!(slot->flags & NS_MOREFRAG)) { break; } if (unlikely(head == ring->tail)) { /* The very last slot prepared by the host has * the NS_MOREFRAG set. Drop it and continue * the outer cycle (to do the double-check). */ nm_prlim(1, "Incomplete packet: dropping"); m_freem(mhead); pq->stats.iqdrops ++; goto host_sync; } slot = ring->slot + head; nmbuf = NMB(na, slot); nmbuf_len = slot->len; mhead->m_pkthdr.len += nmbuf_len; } mhead->m_pkthdr.rcvif = ifp; mhead->m_pkthdr.csum_flags = 0; /* Store the queue idx in the packet header. */ mhead->m_pkthdr.flowid = pq->kring_id; M_HASHTYPE_SET(mhead, M_HASHTYPE_OPAQUE); if (ifp->if_capenable & IFCAP_VLAN_HWTAGGING) { struct ether_header *eh; eh = mtod(mhead, struct ether_header *); if (eh->ether_type == htons(ETHERTYPE_VLAN)) { ptnet_vlan_tag_remove(mhead); /* * With the 802.1Q header removed, update the * checksum starting location accordingly. */ if (vh->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) vh->csum_start -= ETHER_VLAN_ENCAP_LEN; } } - if (have_vnet_hdr && (vh->flags & (VIRTIO_NET_HDR_F_NEEDS_CSUM - | VIRTIO_NET_HDR_F_DATA_VALID))) { - if (unlikely(ptnet_rx_csum(mhead, vh))) { - m_freem(mhead); - nm_prlim(1, "Csum offload error: dropping"); - pq->stats.iqdrops ++; - deliver = 0; - } + if (unlikely(have_vnet_hdr && virtio_net_rx_csum(mhead, vh))) { + m_freem(mhead); + nm_prlim(1, "Csum offload error: dropping"); + pq->stats.iqdrops ++; + deliver = 0; } skip: count ++; if (++batch_count >= PTNET_RX_BATCH) { /* Some packets have been (or will be) pushed to the network * stack. We need to update the CSB to tell the host about * the new ring->cur and ring->head (RX buffer refill). */ ptnet_ring_update(pq, kring, head, NAF_FORCE_READ); batch_count = 0; } if (likely(deliver)) { pq->stats.packets ++; pq->stats.bytes += mhead->m_pkthdr.len; PTNET_Q_UNLOCK(pq); (*ifp->if_input)(ifp, mhead); PTNET_Q_LOCK(pq); /* The ring->head index (and related indices) are * updated under pq lock by ptnet_ring_update(). * Since we dropped the lock to call if_input(), we * must reload ring->head and restart processing the * ring from there. */ head = ring->head; if (unlikely(!(ifp->if_drv_flags & IFF_DRV_RUNNING))) { /* The interface has gone down while we didn't * have the lock. Stop any processing and exit. */ goto unlock; } } } escape: if (batch_count) { ptnet_ring_update(pq, kring, head, NAF_FORCE_READ); } if (count >= budget && may_resched) { /* If we ran out of budget or the double-check found new * slots to process, schedule the taskqueue. */ DBG(nm_prlim(1, "out of budget: resched h %u t %u\n", head, ring->tail)); taskqueue_enqueue(pq->taskq, &pq->task); } unlock: PTNET_Q_UNLOCK(pq); return count; } static void ptnet_rx_task(void *context, int pending) { struct ptnet_queue *pq = context; DBG(nm_prlim(1, "%s: pq #%u\n", __func__, pq->kring_id)); ptnet_rx_eof(pq, PTNET_RX_BUDGET, true); } static void ptnet_tx_task(void *context, int pending) { struct ptnet_queue *pq = context; DBG(nm_prlim(1, "%s: pq #%u\n", __func__, pq->kring_id)); ptnet_drain_transmit_queue(pq, PTNET_TX_BUDGET, true); } #ifdef DEVICE_POLLING /* We don't need to handle differently POLL_AND_CHECK_STATUS and * POLL_ONLY, since we don't have an Interrupt Status Register. */ static int ptnet_poll(if_t ifp, enum poll_cmd cmd, int budget) { struct ptnet_softc *sc = if_getsoftc(ifp); unsigned int queue_budget; unsigned int count = 0; bool borrow = false; int i; KASSERT(sc->num_rings > 0, ("Found no queues in while polling ptnet")); queue_budget = MAX(budget / sc->num_rings, 1); nm_prlim(1, "Per-queue budget is %d", queue_budget); while (budget) { unsigned int rcnt = 0; for (i = 0; i < sc->num_rings; i++) { struct ptnet_queue *pq = sc->queues + i; if (borrow) { queue_budget = MIN(queue_budget, budget); if (queue_budget == 0) { break; } } if (i < sc->num_tx_rings) { rcnt += ptnet_drain_transmit_queue(pq, queue_budget, false); } else { rcnt += ptnet_rx_eof(pq, queue_budget, false); } } if (!rcnt) { /* A scan of the queues gave no result, we can * stop here. */ break; } if (rcnt > budget) { /* This may happen when initial budget < sc->num_rings, * since one packet budget is given to each queue * anyway. Just pretend we didn't eat "so much". */ rcnt = budget; } count += rcnt; budget -= rcnt; borrow = true; } return count; } #endif /* DEVICE_POLLING */ diff --git a/sys/dev/virtio/network/virtio_net.h b/sys/dev/virtio/network/virtio_net.h index b0c80b58c732..019193db1cc0 100644 --- a/sys/dev/virtio/network/virtio_net.h +++ b/sys/dev/virtio/network/virtio_net.h @@ -1,204 +1,497 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * This header is BSD licensed so anyone can use the definitions to implement * compatible drivers/servers. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of IBM nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL IBM OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _VIRTIO_NET_H #define _VIRTIO_NET_H /* The feature bitmap for virtio net */ #define VIRTIO_NET_F_CSUM 0x00001 /* Host handles pkts w/ partial csum */ #define VIRTIO_NET_F_GUEST_CSUM 0x00002 /* Guest handles pkts w/ partial csum*/ #define VIRTIO_NET_F_MAC 0x00020 /* Host has given MAC address. */ #define VIRTIO_NET_F_GSO 0x00040 /* Host handles pkts w/ any GSO type */ #define VIRTIO_NET_F_GUEST_TSO4 0x00080 /* Guest can handle TSOv4 in. */ #define VIRTIO_NET_F_GUEST_TSO6 0x00100 /* Guest can handle TSOv6 in. */ #define VIRTIO_NET_F_GUEST_ECN 0x00200 /* Guest can handle TSO[6] w/ ECN in.*/ #define VIRTIO_NET_F_GUEST_UFO 0x00400 /* Guest can handle UFO in. */ #define VIRTIO_NET_F_HOST_TSO4 0x00800 /* Host can handle TSOv4 in. */ #define VIRTIO_NET_F_HOST_TSO6 0x01000 /* Host can handle TSOv6 in. */ #define VIRTIO_NET_F_HOST_ECN 0x02000 /* Host can handle TSO[6] w/ ECN in. */ #define VIRTIO_NET_F_HOST_UFO 0x04000 /* Host can handle UFO in. */ #define VIRTIO_NET_F_MRG_RXBUF 0x08000 /* Host can merge receive buffers. */ #define VIRTIO_NET_F_STATUS 0x10000 /* virtio_net_config.status available*/ #define VIRTIO_NET_F_CTRL_VQ 0x20000 /* Control channel available */ #define VIRTIO_NET_F_CTRL_RX 0x40000 /* Control channel RX mode support */ #define VIRTIO_NET_F_CTRL_VLAN 0x80000 /* Control channel VLAN filtering */ #define VIRTIO_NET_F_CTRL_RX_EXTRA 0x100000 /* Extra RX mode control support */ #define VIRTIO_NET_F_GUEST_ANNOUNCE 0x200000 /* Announce device on network */ #define VIRTIO_NET_F_MQ 0x400000 /* Device supports RFS */ #define VIRTIO_NET_F_CTRL_MAC_ADDR 0x800000 /* Set MAC address */ #define VIRTIO_NET_S_LINK_UP 1 /* Link is up */ struct virtio_net_config { /* The config defining mac address (if VIRTIO_NET_F_MAC) */ uint8_t mac[ETHER_ADDR_LEN]; /* See VIRTIO_NET_F_STATUS and VIRTIO_NET_S_* above */ uint16_t status; /* Maximum number of each of transmit and receive queues; * see VIRTIO_NET_F_MQ and VIRTIO_NET_CTRL_MQ. * Legal values are between 1 and 0x8000. */ uint16_t max_virtqueue_pairs; } __packed; /* * This is the first element of the scatter-gather list. If you don't * specify GSO or CSUM features, you can simply ignore the header. */ struct virtio_net_hdr { #define VIRTIO_NET_HDR_F_NEEDS_CSUM 1 /* Use csum_start,csum_offset*/ #define VIRTIO_NET_HDR_F_DATA_VALID 2 /* Csum is valid */ uint8_t flags; #define VIRTIO_NET_HDR_GSO_NONE 0 /* Not a GSO frame */ #define VIRTIO_NET_HDR_GSO_TCPV4 1 /* GSO frame, IPv4 TCP (TSO) */ #define VIRTIO_NET_HDR_GSO_UDP 3 /* GSO frame, IPv4 UDP (UFO) */ #define VIRTIO_NET_HDR_GSO_TCPV6 4 /* GSO frame, IPv6 TCP */ #define VIRTIO_NET_HDR_GSO_ECN 0x80 /* TCP has ECN set */ uint8_t gso_type; uint16_t hdr_len; /* Ethernet + IP + tcp/udp hdrs */ uint16_t gso_size; /* Bytes to append to hdr_len per frame */ uint16_t csum_start; /* Position to start checksumming from */ uint16_t csum_offset; /* Offset after that to place checksum */ }; /* * This is the version of the header to use when the MRG_RXBUF * feature has been negotiated. */ struct virtio_net_hdr_mrg_rxbuf { struct virtio_net_hdr hdr; uint16_t num_buffers; /* Number of merged rx buffers */ }; /* * Control virtqueue data structures * * The control virtqueue expects a header in the first sg entry * and an ack/status response in the last entry. Data for the * command goes in between. */ struct virtio_net_ctrl_hdr { uint8_t class; uint8_t cmd; } __packed; #define VIRTIO_NET_OK 0 #define VIRTIO_NET_ERR 1 /* * Control the RX mode, ie. promiscuous, allmulti, etc... * All commands require an "out" sg entry containing a 1 byte * state value, zero = disable, non-zero = enable. Commands * 0 and 1 are supported with the VIRTIO_NET_F_CTRL_RX feature. * Commands 2-5 are added with VIRTIO_NET_F_CTRL_RX_EXTRA. */ #define VIRTIO_NET_CTRL_RX 0 #define VIRTIO_NET_CTRL_RX_PROMISC 0 #define VIRTIO_NET_CTRL_RX_ALLMULTI 1 #define VIRTIO_NET_CTRL_RX_ALLUNI 2 #define VIRTIO_NET_CTRL_RX_NOMULTI 3 #define VIRTIO_NET_CTRL_RX_NOUNI 4 #define VIRTIO_NET_CTRL_RX_NOBCAST 5 /* * Control the MAC filter table. * * The MAC filter table is managed by the hypervisor, the guest should * assume the size is infinite. Filtering should be considered * non-perfect, ie. based on hypervisor resources, the guest may * received packets from sources not specified in the filter list. * * In addition to the class/cmd header, the TABLE_SET command requires * two out scatterlists. Each contains a 4 byte count of entries followed * by a concatenated byte stream of the ETH_ALEN MAC addresses. The * first sg list contains unicast addresses, the second is for multicast. * This functionality is present if the VIRTIO_NET_F_CTRL_RX feature * is available. * * The ADDR_SET command requests one out scatterlist, it contains a * 6 bytes MAC address. This functionality is present if the * VIRTIO_NET_F_CTRL_MAC_ADDR feature is available. */ struct virtio_net_ctrl_mac { uint32_t entries; uint8_t macs[][ETHER_ADDR_LEN]; } __packed; #define VIRTIO_NET_CTRL_MAC 1 #define VIRTIO_NET_CTRL_MAC_TABLE_SET 0 #define VIRTIO_NET_CTRL_MAC_ADDR_SET 1 /* * Control VLAN filtering * * The VLAN filter table is controlled via a simple ADD/DEL interface. * VLAN IDs not added may be filtered by the hypervisor. Del is the * opposite of add. Both commands expect an out entry containing a 2 * byte VLAN ID. VLAN filtering is available with the * VIRTIO_NET_F_CTRL_VLAN feature bit. */ #define VIRTIO_NET_CTRL_VLAN 2 #define VIRTIO_NET_CTRL_VLAN_ADD 0 #define VIRTIO_NET_CTRL_VLAN_DEL 1 /* * Control link announce acknowledgement * * The command VIRTIO_NET_CTRL_ANNOUNCE_ACK is used to indicate that * driver has recevied the notification; device would clear the * VIRTIO_NET_S_ANNOUNCE bit in the status field after it receives * this command. */ #define VIRTIO_NET_CTRL_ANNOUNCE 3 #define VIRTIO_NET_CTRL_ANNOUNCE_ACK 0 /* * Control Receive Flow Steering * * The command VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET enables Receive Flow * Steering, specifying the number of the transmit and receive queues * that will be used. After the command is consumed and acked by the * device, the device will not steer new packets on receive virtqueues * other than specified nor read from transmit virtqueues other than * specified. Accordingly, driver should not transmit new packets on * virtqueues other than specified. */ struct virtio_net_ctrl_mq { uint16_t virtqueue_pairs; } __packed; #define VIRTIO_NET_CTRL_MQ 4 #define VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET 0 #define VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MIN 1 #define VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX 0x8000 +/* + * Use the checksum offset in the VirtIO header to set the + * correct CSUM_* flags. + */ +static inline int +virtio_net_rx_csum_by_offset(struct mbuf *m, uint16_t eth_type, int ip_start, + struct virtio_net_hdr *hdr) +{ +#if defined(INET) || defined(INET6) + int offset = hdr->csum_start + hdr->csum_offset; +#endif + + /* Only do a basic sanity check on the offset. */ + switch (eth_type) { +#if defined(INET) + case ETHERTYPE_IP: + if (__predict_false(offset < ip_start + sizeof(struct ip))) + return (1); + break; +#endif +#if defined(INET6) + case ETHERTYPE_IPV6: + if (__predict_false(offset < ip_start + sizeof(struct ip6_hdr))) + return (1); + break; +#endif + default: + /* Here we should increment the rx_csum_bad_ethtype counter. */ + return (1); + } + + /* + * Use the offset to determine the appropriate CSUM_* flags. This is + * a bit dirty, but we can get by with it since the checksum offsets + * happen to be different. We assume the host host does not do IPv4 + * header checksum offloading. + */ + switch (hdr->csum_offset) { + case offsetof(struct udphdr, uh_sum): + case offsetof(struct tcphdr, th_sum): + m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR; + m->m_pkthdr.csum_data = 0xFFFF; + break; + default: + /* Here we should increment the rx_csum_bad_offset counter. */ + return (1); + } + + return (0); +} + +static inline int +virtio_net_rx_csum_by_parse(struct mbuf *m, uint16_t eth_type, int ip_start, + struct virtio_net_hdr *hdr) +{ + int offset, proto; + + switch (eth_type) { +#if defined(INET) + case ETHERTYPE_IP: { + struct ip *ip; + if (__predict_false(m->m_len < ip_start + sizeof(struct ip))) + return (1); + ip = (struct ip *)(m->m_data + ip_start); + proto = ip->ip_p; + offset = ip_start + (ip->ip_hl << 2); + break; + } +#endif +#if defined(INET6) + case ETHERTYPE_IPV6: + if (__predict_false(m->m_len < ip_start + + sizeof(struct ip6_hdr))) + return (1); + offset = ip6_lasthdr(m, ip_start, IPPROTO_IPV6, &proto); + if (__predict_false(offset < 0)) + return (1); + break; +#endif + default: + /* Here we should increment the rx_csum_bad_ethtype counter. */ + return (1); + } + + switch (proto) { + case IPPROTO_TCP: + if (__predict_false(m->m_len < offset + sizeof(struct tcphdr))) + return (1); + m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR; + m->m_pkthdr.csum_data = 0xFFFF; + break; + case IPPROTO_UDP: + if (__predict_false(m->m_len < offset + sizeof(struct udphdr))) + return (1); + m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR; + m->m_pkthdr.csum_data = 0xFFFF; + break; + default: + /* + * For the remaining protocols, FreeBSD does not support + * checksum offloading, so the checksum will be recomputed. + */ +#if 0 + if_printf(ifp, "cksum offload of unsupported " + "protocol eth_type=%#x proto=%d csum_start=%d " + "csum_offset=%d\n", __func__, eth_type, proto, + hdr->csum_start, hdr->csum_offset); +#endif + break; + } + + return (0); +} + +/* + * Set the appropriate CSUM_* flags. Unfortunately, the information + * provided is not directly useful to us. The VirtIO header gives the + * offset of the checksum, which is all Linux needs, but this is not + * how FreeBSD does things. We are forced to peek inside the packet + * a bit. + * + * It would be nice if VirtIO gave us the L4 protocol or if FreeBSD + * could accept the offsets and let the stack figure it out. + */ +static inline int +virtio_net_rx_csum(struct mbuf *m, struct virtio_net_hdr *hdr) +{ + struct ether_header *eh; + struct ether_vlan_header *evh; + uint16_t eth_type; + int offset, error; + + if ((hdr->flags & (VIRTIO_NET_HDR_F_NEEDS_CSUM | + VIRTIO_NET_HDR_F_DATA_VALID)) == 0) { + return (0); + } + + eh = mtod(m, struct ether_header *); + eth_type = ntohs(eh->ether_type); + if (eth_type == ETHERTYPE_VLAN) { + /* BMV: We should handle nested VLAN tags too. */ + evh = mtod(m, struct ether_vlan_header *); + eth_type = ntohs(evh->evl_proto); + offset = sizeof(struct ether_vlan_header); + } else + offset = sizeof(struct ether_header); + + if (hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) + error = virtio_net_rx_csum_by_offset(m, eth_type, offset, hdr); + else + error = virtio_net_rx_csum_by_parse(m, eth_type, offset, hdr); + + return (error); +} + +static inline int +virtio_net_tx_offload_ctx(struct mbuf *m, int *etype, int *proto, int *start) +{ + struct ether_vlan_header *evh; + int offset; + + evh = mtod(m, struct ether_vlan_header *); + if (evh->evl_encap_proto == htons(ETHERTYPE_VLAN)) { + /* BMV: We should handle nested VLAN tags too. */ + *etype = ntohs(evh->evl_proto); + offset = sizeof(struct ether_vlan_header); + } else { + *etype = ntohs(evh->evl_encap_proto); + offset = sizeof(struct ether_header); + } + + switch (*etype) { +#if defined(INET) + case ETHERTYPE_IP: { + struct ip *ip, iphdr; + if (__predict_false(m->m_len < offset + sizeof(struct ip))) { + m_copydata(m, offset, sizeof(struct ip), + (caddr_t) &iphdr); + ip = &iphdr; + } else + ip = (struct ip *)(m->m_data + offset); + *proto = ip->ip_p; + *start = offset + (ip->ip_hl << 2); + break; + } +#endif +#if defined(INET6) + case ETHERTYPE_IPV6: + *proto = -1; + *start = ip6_lasthdr(m, offset, IPPROTO_IPV6, proto); + /* Assert the network stack sent us a valid packet. */ + KASSERT(*start > offset, + ("%s: mbuf %p start %d offset %d proto %d", __func__, m, + *start, offset, *proto)); + break; +#endif + default: + /* Here we should increment the tx_csum_bad_ethtype counter. */ + return (EINVAL); + } + + return (0); +} + +static inline int +virtio_net_tx_offload_tso(if_t ifp, struct mbuf *m, int eth_type, + int offset, bool allow_ecn, struct virtio_net_hdr *hdr) +{ + static struct timeval lastecn; + static int curecn; + struct tcphdr *tcp, tcphdr; + + if (__predict_false(m->m_len < offset + sizeof(struct tcphdr))) { + m_copydata(m, offset, sizeof(struct tcphdr), (caddr_t) &tcphdr); + tcp = &tcphdr; + } else + tcp = (struct tcphdr *)(m->m_data + offset); + + hdr->hdr_len = offset + (tcp->th_off << 2); + hdr->gso_size = m->m_pkthdr.tso_segsz; + hdr->gso_type = eth_type == ETHERTYPE_IP ? VIRTIO_NET_HDR_GSO_TCPV4 : + VIRTIO_NET_HDR_GSO_TCPV6; + + if (tcp->th_flags & TH_CWR) { + /* + * Drop if VIRTIO_NET_F_HOST_ECN was not negotiated. In FreeBSD, + * ECN support is not on a per-interface basis, but globally via + * the net.inet.tcp.ecn.enable sysctl knob. The default is off. + */ + if (!allow_ecn) { + if (ppsratecheck(&lastecn, &curecn, 1)) + if_printf(ifp, + "TSO with ECN not negotiated with host\n"); + return (ENOTSUP); + } + hdr->gso_type |= VIRTIO_NET_HDR_GSO_ECN; + } + + /* Here we should increment tx_tso counter. */ + + return (0); +} + +static inline struct mbuf * +virtio_net_tx_offload(if_t ifp, struct mbuf *m, bool allow_ecn, + struct virtio_net_hdr *hdr) +{ + int flags, etype, csum_start, proto, error; + + flags = m->m_pkthdr.csum_flags; + + error = virtio_net_tx_offload_ctx(m, &etype, &proto, &csum_start); + if (error) + goto drop; + + if ((etype == ETHERTYPE_IP && (flags & (CSUM_TCP | CSUM_UDP))) || + (etype == ETHERTYPE_IPV6 && + (flags & (CSUM_TCP_IPV6 | CSUM_UDP_IPV6)))) { + /* + * We could compare the IP protocol vs the CSUM_ flag too, + * but that really should not be necessary. + */ + hdr->flags |= VIRTIO_NET_HDR_F_NEEDS_CSUM; + hdr->csum_start = csum_start; + hdr->csum_offset = m->m_pkthdr.csum_data; + /* Here we should increment the tx_csum counter. */ + } + + if (flags & CSUM_TSO) { + if (__predict_false(proto != IPPROTO_TCP)) { + /* Likely failed to correctly parse the mbuf. + * Here we should increment the tx_tso_not_tcp + * counter. */ + goto drop; + } + + KASSERT(hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM, + ("%s: mbuf %p TSO without checksum offload %#x", + __func__, m, flags)); + + error = virtio_net_tx_offload_tso(ifp, m, etype, csum_start, + allow_ecn, hdr); + if (error) + goto drop; + } + + return (m); + +drop: + m_freem(m); + return (NULL); +} + #endif /* _VIRTIO_NET_H */ diff --git a/sys/net/if_tap.h b/sys/net/if_tap.h index 9718cee4e2d6..1c6f30d7730c 100644 --- a/sys/net/if_tap.h +++ b/sys/net/if_tap.h @@ -1,74 +1,76 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (C) 1999-2000 by Maksim Yevmenkin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * BASED ON: * ------------------------------------------------------------------------- * * Copyright (c) 1988, Julian Onions * Nottingham University 1987. */ /* * $FreeBSD$ * $Id: if_tap.h,v 0.7 2000/07/12 04:12:51 max Exp $ */ #ifndef _NET_IF_TAP_H_ #define _NET_IF_TAP_H_ #include /* maximum receive packet size (hard limit) */ -#define TAPMRU 16384 +#define TAPMRU 65535 #define tapinfo tuninfo /* * ioctl's for get/set debug; these are aliases of TUN* ioctls, see net/if_tun.h * for details. */ #define TAPSDEBUG TUNSDEBUG #define TAPGDEBUG TUNGDEBUG #define TAPSIFINFO TUNSIFINFO #define TAPGIFINFO TUNGIFINFO #define TAPGIFNAME TUNGIFNAME +#define TAPSVNETHDR _IOW('t', 91, int) +#define TAPGVNETHDR _IOR('t', 94, int) /* VMware ioctl's */ #define VMIO_SIOCSIFFLAGS _IOWINT('V', 0) #define VMIO_SIOCSKEEP _IO('V', 1) #define VMIO_SIOCSIFBR _IO('V', 2) #define VMIO_SIOCSLADRF _IO('V', 3) /* XXX -- unimplemented */ #define VMIO_SIOCSETMACADDR _IO('V', 4) /* XXX -- not used? */ #define VMIO_SIOCPORT _IO('V', 5) #define VMIO_SIOCBRIDGE _IO('V', 6) #define VMIO_SIOCNETIF _IO('V', 7) #endif /* !_NET_IF_TAP_H_ */ diff --git a/sys/net/if_tuntap.c b/sys/net/if_tuntap.c index e0a9978bd0f0..594db6a20158 100644 --- a/sys/net/if_tuntap.c +++ b/sys/net/if_tuntap.c @@ -1,1875 +1,2018 @@ /* $NetBSD: if_tun.c,v 1.14 1994/06/29 06:36:25 cgd Exp $ */ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (C) 1999-2000 by Maksim Yevmenkin * All rights reserved. * Copyright (c) 2019 Kyle Evans * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * BASED ON: * ------------------------------------------------------------------------- * * Copyright (c) 1988, Julian Onions * Nottingham University 1987. * * This source may be freely distributed, however I would be interested * in any changes that are made. * * This driver takes packets off the IP i/f and hands them up to a * user process to have its wicked way with. This driver has it's * roots in a similar driver written by Phil Cockcroft (formerly) at * UCL. This driver is based much more on read/write/poll mode of * operation though. * * $FreeBSD$ */ #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #ifdef INET #include +#include +#include +#include +#include +#include #endif #include #include #include +#include + #include #include #include struct tuntap_driver; /* * tun_list is protected by global tunmtx. Other mutable fields are * protected by tun->tun_mtx, or by their owning subsystem. tun_dev is * static for the duration of a tunnel interface. */ struct tuntap_softc { TAILQ_ENTRY(tuntap_softc) tun_list; struct cdev *tun_alias; struct cdev *tun_dev; u_short tun_flags; /* misc flags */ #define TUN_OPEN 0x0001 #define TUN_INITED 0x0002 #define TUN_IASET 0x0008 #define TUN_DSTADDR 0x0010 #define TUN_LMODE 0x0020 #define TUN_RWAIT 0x0040 #define TUN_ASYNC 0x0080 #define TUN_IFHEAD 0x0100 #define TUN_DYING 0x0200 #define TUN_L2 0x0400 #define TUN_VMNET 0x0800 #define TUN_DRIVER_IDENT_MASK (TUN_L2 | TUN_VMNET) #define TUN_READY (TUN_OPEN | TUN_INITED) pid_t tun_pid; /* owning pid */ struct ifnet *tun_ifp; /* the interface */ struct sigio *tun_sigio; /* async I/O info */ struct tuntap_driver *tun_drv; /* appropriate driver */ struct selinfo tun_rsel; /* read select */ struct mtx tun_mtx; /* softc field mutex */ struct cv tun_cv; /* for ref'd dev destroy */ struct ether_addr tun_ether; /* remote address */ int tun_busy; /* busy count */ + int tun_vhdrlen; /* virtio-net header length */ }; #define TUN2IFP(sc) ((sc)->tun_ifp) #define TUNDEBUG if (tundebug) if_printf #define TUN_LOCK(tp) mtx_lock(&(tp)->tun_mtx) #define TUN_UNLOCK(tp) mtx_unlock(&(tp)->tun_mtx) #define TUN_LOCK_ASSERT(tp) mtx_assert(&(tp)->tun_mtx, MA_OWNED); #define TUN_VMIO_FLAG_MASK 0x0fff +/* + * Interface capabilities of a tap device that supports the virtio-net + * header. + */ +#define TAP_VNET_HDR_CAPS (IFCAP_HWCSUM | IFCAP_HWCSUM_IPV6 \ + | IFCAP_VLAN_HWCSUM \ + | IFCAP_TSO | IFCAP_LRO \ + | IFCAP_VLAN_HWTSO) + +#define TAP_ALL_OFFLOAD (CSUM_TSO | CSUM_TCP | CSUM_UDP |\ + CSUM_TCP_IPV6 | CSUM_UDP_IPV6) + + /* * All mutable global variables in if_tun are locked using tunmtx, with * the exception of tundebug, which is used unlocked, and the drivers' *clones, * which are static after setup. */ static struct mtx tunmtx; static eventhandler_tag arrival_tag; static eventhandler_tag clone_tag; static const char tunname[] = "tun"; static const char tapname[] = "tap"; static const char vmnetname[] = "vmnet"; static MALLOC_DEFINE(M_TUN, tunname, "Tunnel Interface"); static int tundebug = 0; static int tundclone = 1; static int tap_allow_uopen = 0; /* allow user open() */ static int tapuponopen = 0; /* IFF_UP on open() */ static int tapdclone = 1; /* enable devfs cloning */ static TAILQ_HEAD(,tuntap_softc) tunhead = TAILQ_HEAD_INITIALIZER(tunhead); SYSCTL_INT(_debug, OID_AUTO, if_tun_debug, CTLFLAG_RW, &tundebug, 0, ""); static struct sx tun_ioctl_sx; SX_SYSINIT(tun_ioctl_sx, &tun_ioctl_sx, "tun_ioctl"); SYSCTL_DECL(_net_link); /* tun */ static SYSCTL_NODE(_net_link, OID_AUTO, tun, CTLFLAG_RW, 0, "IP tunnel software network interface"); SYSCTL_INT(_net_link_tun, OID_AUTO, devfs_cloning, CTLFLAG_RWTUN, &tundclone, 0, "Enable legacy devfs interface creation"); /* tap */ static SYSCTL_NODE(_net_link, OID_AUTO, tap, CTLFLAG_RW, 0, "Ethernet tunnel software network interface"); SYSCTL_INT(_net_link_tap, OID_AUTO, user_open, CTLFLAG_RW, &tap_allow_uopen, 0, "Allow user to open /dev/tap (based on node permissions)"); SYSCTL_INT(_net_link_tap, OID_AUTO, up_on_open, CTLFLAG_RW, &tapuponopen, 0, "Bring interface up when /dev/tap is opened"); SYSCTL_INT(_net_link_tap, OID_AUTO, devfs_cloning, CTLFLAG_RWTUN, &tapdclone, 0, "Enable legacy devfs interface creation"); SYSCTL_INT(_net_link_tap, OID_AUTO, debug, CTLFLAG_RW, &tundebug, 0, ""); static int tun_busy_locked(struct tuntap_softc *tp); static void tun_unbusy_locked(struct tuntap_softc *tp); static int tun_busy(struct tuntap_softc *tp); static void tun_unbusy(struct tuntap_softc *tp); static int tuntap_name2info(const char *name, int *unit, int *flags); static void tunclone(void *arg, struct ucred *cred, char *name, int namelen, struct cdev **dev); static void tuncreate(struct cdev *dev, struct tuntap_driver *); static void tunrename(void *arg, struct ifnet *ifp); static int tunifioctl(struct ifnet *, u_long, caddr_t); static void tuninit(struct ifnet *); static void tunifinit(void *xtp); static int tuntapmodevent(module_t, int, void *); static int tunoutput(struct ifnet *, struct mbuf *, const struct sockaddr *, struct route *ro); static void tunstart(struct ifnet *); static void tunstart_l2(struct ifnet *); static int tun_clone_match(struct if_clone *ifc, const char *name); static int tap_clone_match(struct if_clone *ifc, const char *name); static int vmnet_clone_match(struct if_clone *ifc, const char *name); static int tun_clone_create(struct if_clone *, char *, size_t, caddr_t); static int tun_clone_destroy(struct if_clone *, struct ifnet *); +static void tun_vnethdr_set(struct ifnet *ifp, int vhdrlen); static d_open_t tunopen; static d_close_t tunclose; static d_read_t tunread; static d_write_t tunwrite; static d_ioctl_t tunioctl; static d_poll_t tunpoll; static d_kqfilter_t tunkqfilter; static int tunkqread(struct knote *, long); static int tunkqwrite(struct knote *, long); static void tunkqdetach(struct knote *); static struct filterops tun_read_filterops = { .f_isfd = 1, .f_attach = NULL, .f_detach = tunkqdetach, .f_event = tunkqread, }; static struct filterops tun_write_filterops = { .f_isfd = 1, .f_attach = NULL, .f_detach = tunkqdetach, .f_event = tunkqwrite, }; static struct tuntap_driver { struct cdevsw cdevsw; int ident_flags; struct unrhdr *unrhdr; struct clonedevs *clones; ifc_match_t *clone_match_fn; ifc_create_t *clone_create_fn; ifc_destroy_t *clone_destroy_fn; } tuntap_drivers[] = { { .ident_flags = 0, .cdevsw = { .d_version = D_VERSION, .d_flags = D_NEEDMINOR, .d_open = tunopen, .d_close = tunclose, .d_read = tunread, .d_write = tunwrite, .d_ioctl = tunioctl, .d_poll = tunpoll, .d_kqfilter = tunkqfilter, .d_name = tunname, }, .clone_match_fn = tun_clone_match, .clone_create_fn = tun_clone_create, .clone_destroy_fn = tun_clone_destroy, }, { .ident_flags = TUN_L2, .cdevsw = { .d_version = D_VERSION, .d_flags = D_NEEDMINOR, .d_open = tunopen, .d_close = tunclose, .d_read = tunread, .d_write = tunwrite, .d_ioctl = tunioctl, .d_poll = tunpoll, .d_kqfilter = tunkqfilter, .d_name = tapname, }, .clone_match_fn = tap_clone_match, .clone_create_fn = tun_clone_create, .clone_destroy_fn = tun_clone_destroy, }, { .ident_flags = TUN_L2 | TUN_VMNET, .cdevsw = { .d_version = D_VERSION, .d_flags = D_NEEDMINOR, .d_open = tunopen, .d_close = tunclose, .d_read = tunread, .d_write = tunwrite, .d_ioctl = tunioctl, .d_poll = tunpoll, .d_kqfilter = tunkqfilter, .d_name = vmnetname, }, .clone_match_fn = vmnet_clone_match, .clone_create_fn = tun_clone_create, .clone_destroy_fn = tun_clone_destroy, }, }; struct tuntap_driver_cloner { SLIST_ENTRY(tuntap_driver_cloner) link; struct tuntap_driver *drv; struct if_clone *cloner; }; VNET_DEFINE_STATIC(SLIST_HEAD(, tuntap_driver_cloner), tuntap_driver_cloners) = SLIST_HEAD_INITIALIZER(tuntap_driver_cloners); #define V_tuntap_driver_cloners VNET(tuntap_driver_cloners) /* * Mechanism for marking a tunnel device as busy so that we can safely do some * orthogonal operations (such as operations on devices) without racing against * tun_destroy. tun_destroy will wait on the condvar if we're at all busy or * open, to be woken up when the condition is alleviated. */ static int tun_busy_locked(struct tuntap_softc *tp) { TUN_LOCK_ASSERT(tp); if ((tp->tun_flags & TUN_DYING) != 0) { /* * Perhaps unintuitive, but the device is busy going away. * Other interpretations of EBUSY from tun_busy make little * sense, since making a busy device even more busy doesn't * sound like a problem. */ return (EBUSY); } ++tp->tun_busy; return (0); } static void tun_unbusy_locked(struct tuntap_softc *tp) { TUN_LOCK_ASSERT(tp); KASSERT(tp->tun_busy != 0, ("tun_unbusy: called for non-busy tunnel")); --tp->tun_busy; /* Wake up anything that may be waiting on our busy tunnel. */ if (tp->tun_busy == 0) cv_broadcast(&tp->tun_cv); } static int tun_busy(struct tuntap_softc *tp) { int ret; TUN_LOCK(tp); ret = tun_busy_locked(tp); TUN_UNLOCK(tp); return (ret); } static void tun_unbusy(struct tuntap_softc *tp) { TUN_LOCK(tp); tun_unbusy_locked(tp); TUN_UNLOCK(tp); } /* * Sets unit and/or flags given the device name. Must be called with correct * vnet context. */ static int tuntap_name2info(const char *name, int *outunit, int *outflags) { struct tuntap_driver *drv; struct tuntap_driver_cloner *drvc; char *dname; int flags, unit; bool found; if (name == NULL) return (EINVAL); /* * Needed for dev_stdclone, but dev_stdclone will not modify, it just * wants to be able to pass back a char * through the second param. We * will always set that as NULL here, so we'll fake it. */ dname = __DECONST(char *, name); found = false; KASSERT(!SLIST_EMPTY(&V_tuntap_driver_cloners), ("tuntap_driver_cloners failed to initialize")); SLIST_FOREACH(drvc, &V_tuntap_driver_cloners, link) { KASSERT(drvc->drv != NULL, ("tuntap_driver_cloners entry not properly initialized")); drv = drvc->drv; if (strcmp(name, drv->cdevsw.d_name) == 0) { found = true; unit = -1; flags = drv->ident_flags; break; } if (dev_stdclone(dname, NULL, drv->cdevsw.d_name, &unit) == 1) { found = true; flags = drv->ident_flags; break; } } if (!found) return (ENXIO); if (outunit != NULL) *outunit = unit; if (outflags != NULL) *outflags = flags; return (0); } /* * Get driver information from a set of flags specified. Masks the identifying * part of the flags and compares it against all of the available * tuntap_drivers. Must be called with correct vnet context. */ static struct tuntap_driver * tuntap_driver_from_flags(int tun_flags) { struct tuntap_driver *drv; struct tuntap_driver_cloner *drvc; KASSERT(!SLIST_EMPTY(&V_tuntap_driver_cloners), ("tuntap_driver_cloners failed to initialize")); SLIST_FOREACH(drvc, &V_tuntap_driver_cloners, link) { KASSERT(drvc->drv != NULL, ("tuntap_driver_cloners entry not properly initialized")); drv = drvc->drv; if ((tun_flags & TUN_DRIVER_IDENT_MASK) == drv->ident_flags) return (drv); } return (NULL); } static int tun_clone_match(struct if_clone *ifc, const char *name) { int tunflags; if (tuntap_name2info(name, NULL, &tunflags) == 0) { if ((tunflags & TUN_L2) == 0) return (1); } return (0); } static int tap_clone_match(struct if_clone *ifc, const char *name) { int tunflags; if (tuntap_name2info(name, NULL, &tunflags) == 0) { if ((tunflags & (TUN_L2 | TUN_VMNET)) == TUN_L2) return (1); } return (0); } static int vmnet_clone_match(struct if_clone *ifc, const char *name) { int tunflags; if (tuntap_name2info(name, NULL, &tunflags) == 0) { if ((tunflags & TUN_VMNET) != 0) return (1); } return (0); } static int tun_clone_create(struct if_clone *ifc, char *name, size_t len, caddr_t params) { struct tuntap_driver *drv; struct cdev *dev; int err, i, tunflags, unit; tunflags = 0; /* The name here tells us exactly what we're creating */ err = tuntap_name2info(name, &unit, &tunflags); if (err != 0) return (err); drv = tuntap_driver_from_flags(tunflags); if (drv == NULL) return (ENXIO); if (unit != -1) { /* If this unit number is still available that's okay. */ if (alloc_unr_specific(drv->unrhdr, unit) == -1) return (EEXIST); } else { unit = alloc_unr(drv->unrhdr); } snprintf(name, IFNAMSIZ, "%s%d", drv->cdevsw.d_name, unit); /* find any existing device, or allocate new unit number */ i = clone_create(&drv->clones, &drv->cdevsw, &unit, &dev, 0); if (i) { /* No preexisting struct cdev *, create one */ dev = make_dev(&drv->cdevsw, unit, UID_UUCP, GID_DIALER, 0600, "%s%d", drv->cdevsw.d_name, unit); } tuncreate(dev, drv); return (0); } static void tunclone(void *arg, struct ucred *cred, char *name, int namelen, struct cdev **dev) { char devname[SPECNAMELEN + 1]; struct tuntap_driver *drv; int append_unit, i, u, tunflags; bool mayclone; if (*dev != NULL) return; tunflags = 0; CURVNET_SET(CRED_TO_VNET(cred)); if (tuntap_name2info(name, &u, &tunflags) != 0) goto out; /* Not recognized */ if (u != -1 && u > IF_MAXUNIT) goto out; /* Unit number too high */ mayclone = priv_check_cred(cred, PRIV_NET_IFCREATE) == 0; if ((tunflags & TUN_L2) != 0) { /* tap/vmnet allow user open with a sysctl */ mayclone = (mayclone || tap_allow_uopen) && tapdclone; } else { mayclone = mayclone && tundclone; } /* * If tun cloning is enabled, only the superuser can create an * interface. */ if (!mayclone) goto out; if (u == -1) append_unit = 1; else append_unit = 0; drv = tuntap_driver_from_flags(tunflags); if (drv == NULL) goto out; /* find any existing device, or allocate new unit number */ i = clone_create(&drv->clones, &drv->cdevsw, &u, dev, 0); if (i) { if (append_unit) { namelen = snprintf(devname, sizeof(devname), "%s%d", name, u); name = devname; } /* No preexisting struct cdev *, create one */ *dev = make_dev_credf(MAKEDEV_REF, &drv->cdevsw, u, cred, UID_UUCP, GID_DIALER, 0600, "%s", name); } if_clone_create(name, namelen, NULL); out: CURVNET_RESTORE(); } static void tun_destroy(struct tuntap_softc *tp) { TUN_LOCK(tp); tp->tun_flags |= TUN_DYING; if (tp->tun_busy != 0) cv_wait_unlock(&tp->tun_cv, &tp->tun_mtx); else TUN_UNLOCK(tp); CURVNET_SET(TUN2IFP(tp)->if_vnet); /* destroy_dev will take care of any alias. */ destroy_dev(tp->tun_dev); seldrain(&tp->tun_rsel); knlist_clear(&tp->tun_rsel.si_note, 0); knlist_destroy(&tp->tun_rsel.si_note); if ((tp->tun_flags & TUN_L2) != 0) { ether_ifdetach(TUN2IFP(tp)); } else { bpfdetach(TUN2IFP(tp)); if_detach(TUN2IFP(tp)); } sx_xlock(&tun_ioctl_sx); TUN2IFP(tp)->if_softc = NULL; sx_xunlock(&tun_ioctl_sx); free_unr(tp->tun_drv->unrhdr, TUN2IFP(tp)->if_dunit); if_free(TUN2IFP(tp)); mtx_destroy(&tp->tun_mtx); cv_destroy(&tp->tun_cv); free(tp, M_TUN); CURVNET_RESTORE(); } static int tun_clone_destroy(struct if_clone *ifc __unused, struct ifnet *ifp) { struct tuntap_softc *tp = ifp->if_softc; mtx_lock(&tunmtx); TAILQ_REMOVE(&tunhead, tp, tun_list); mtx_unlock(&tunmtx); tun_destroy(tp); return (0); } static void vnet_tun_init(const void *unused __unused) { struct tuntap_driver *drv; struct tuntap_driver_cloner *drvc; int i; for (i = 0; i < nitems(tuntap_drivers); ++i) { drv = &tuntap_drivers[i]; drvc = malloc(sizeof(*drvc), M_TUN, M_WAITOK | M_ZERO); drvc->drv = drv; drvc->cloner = if_clone_advanced(drv->cdevsw.d_name, 0, drv->clone_match_fn, drv->clone_create_fn, drv->clone_destroy_fn); SLIST_INSERT_HEAD(&V_tuntap_driver_cloners, drvc, link); }; } VNET_SYSINIT(vnet_tun_init, SI_SUB_PROTO_IF, SI_ORDER_ANY, vnet_tun_init, NULL); static void vnet_tun_uninit(const void *unused __unused) { struct tuntap_driver_cloner *drvc; while (!SLIST_EMPTY(&V_tuntap_driver_cloners)) { drvc = SLIST_FIRST(&V_tuntap_driver_cloners); SLIST_REMOVE_HEAD(&V_tuntap_driver_cloners, link); if_clone_detach(drvc->cloner); free(drvc, M_TUN); } } VNET_SYSUNINIT(vnet_tun_uninit, SI_SUB_PROTO_IF, SI_ORDER_ANY, vnet_tun_uninit, NULL); static void tun_uninit(const void *unused __unused) { struct tuntap_driver *drv; struct tuntap_softc *tp; int i; EVENTHANDLER_DEREGISTER(ifnet_arrival_event, arrival_tag); EVENTHANDLER_DEREGISTER(dev_clone, clone_tag); drain_dev_clone_events(); mtx_lock(&tunmtx); while ((tp = TAILQ_FIRST(&tunhead)) != NULL) { TAILQ_REMOVE(&tunhead, tp, tun_list); mtx_unlock(&tunmtx); tun_destroy(tp); mtx_lock(&tunmtx); } mtx_unlock(&tunmtx); for (i = 0; i < nitems(tuntap_drivers); ++i) { drv = &tuntap_drivers[i]; delete_unrhdr(drv->unrhdr); clone_cleanup(&drv->clones); } mtx_destroy(&tunmtx); } SYSUNINIT(tun_uninit, SI_SUB_PROTO_IF, SI_ORDER_ANY, tun_uninit, NULL); static struct tuntap_driver * tuntap_driver_from_ifnet(const struct ifnet *ifp) { struct tuntap_driver *drv; int i; if (ifp == NULL) return (NULL); for (i = 0; i < nitems(tuntap_drivers); ++i) { drv = &tuntap_drivers[i]; if (strcmp(ifp->if_dname, drv->cdevsw.d_name) == 0) return (drv); } return (NULL); } static int tuntapmodevent(module_t mod, int type, void *data) { struct tuntap_driver *drv; int i; switch (type) { case MOD_LOAD: mtx_init(&tunmtx, "tunmtx", NULL, MTX_DEF); for (i = 0; i < nitems(tuntap_drivers); ++i) { drv = &tuntap_drivers[i]; clone_setup(&drv->clones); drv->unrhdr = new_unrhdr(0, IF_MAXUNIT, &tunmtx); } arrival_tag = EVENTHANDLER_REGISTER(ifnet_arrival_event, tunrename, 0, 1000); if (arrival_tag == NULL) return (ENOMEM); clone_tag = EVENTHANDLER_REGISTER(dev_clone, tunclone, 0, 1000); if (clone_tag == NULL) return (ENOMEM); break; case MOD_UNLOAD: /* See tun_uninit, so it's done after the vnet_sysuninit() */ break; default: return EOPNOTSUPP; } return 0; } static moduledata_t tuntap_mod = { "if_tuntap", tuntapmodevent, 0 }; DECLARE_MODULE(if_tuntap, tuntap_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); MODULE_VERSION(if_tuntap, 1); MODULE_VERSION(if_tun, 1); MODULE_VERSION(if_tap, 1); static void tunstart(struct ifnet *ifp) { struct tuntap_softc *tp = ifp->if_softc; struct mbuf *m; TUNDEBUG(ifp, "starting\n"); if (ALTQ_IS_ENABLED(&ifp->if_snd)) { IFQ_LOCK(&ifp->if_snd); IFQ_POLL_NOLOCK(&ifp->if_snd, m); if (m == NULL) { IFQ_UNLOCK(&ifp->if_snd); return; } IFQ_UNLOCK(&ifp->if_snd); } TUN_LOCK(tp); if (tp->tun_flags & TUN_RWAIT) { tp->tun_flags &= ~TUN_RWAIT; wakeup(tp); } selwakeuppri(&tp->tun_rsel, PZERO + 1); KNOTE_LOCKED(&tp->tun_rsel.si_note, 0); if (tp->tun_flags & TUN_ASYNC && tp->tun_sigio) { TUN_UNLOCK(tp); pgsigio(&tp->tun_sigio, SIGIO, 0); } else TUN_UNLOCK(tp); } /* * tunstart_l2 * * queue packets from higher level ready to put out */ static void tunstart_l2(struct ifnet *ifp) { struct tuntap_softc *tp = ifp->if_softc; TUNDEBUG(ifp, "starting\n"); /* * do not junk pending output if we are in VMnet mode. * XXX: can this do any harm because of queue overflow? */ TUN_LOCK(tp); if (((tp->tun_flags & TUN_VMNET) == 0) && ((tp->tun_flags & TUN_READY) != TUN_READY)) { struct mbuf *m; /* Unlocked read. */ TUNDEBUG(ifp, "not ready, tun_flags = 0x%x\n", tp->tun_flags); for (;;) { IF_DEQUEUE(&ifp->if_snd, m); if (m != NULL) { m_freem(m); if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); } else break; } TUN_UNLOCK(tp); return; } ifp->if_drv_flags |= IFF_DRV_OACTIVE; if (!IFQ_IS_EMPTY(&ifp->if_snd)) { if (tp->tun_flags & TUN_RWAIT) { tp->tun_flags &= ~TUN_RWAIT; wakeup(tp); } if ((tp->tun_flags & TUN_ASYNC) && (tp->tun_sigio != NULL)) { TUN_UNLOCK(tp); pgsigio(&tp->tun_sigio, SIGIO, 0); TUN_LOCK(tp); } selwakeuppri(&tp->tun_rsel, PZERO+1); KNOTE_LOCKED(&tp->tun_rsel.si_note, 0); if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); /* obytes are counted in ether_output */ } ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; TUN_UNLOCK(tp); } /* tunstart_l2 */ /* XXX: should return an error code so it can fail. */ static void tuncreate(struct cdev *dev, struct tuntap_driver *drv) { struct tuntap_softc *sc; struct ifnet *ifp; struct ether_addr eaddr; int iflags; u_char type; sc = malloc(sizeof(*sc), M_TUN, M_WAITOK | M_ZERO); mtx_init(&sc->tun_mtx, "tun_mtx", NULL, MTX_DEF); cv_init(&sc->tun_cv, "tun_condvar"); sc->tun_flags = drv->ident_flags; sc->tun_dev = dev; sc->tun_drv = drv; mtx_lock(&tunmtx); TAILQ_INSERT_TAIL(&tunhead, sc, tun_list); mtx_unlock(&tunmtx); iflags = IFF_MULTICAST; if ((sc->tun_flags & TUN_L2) != 0) { type = IFT_ETHER; iflags |= IFF_BROADCAST | IFF_SIMPLEX; } else { type = IFT_PPP; iflags |= IFF_POINTOPOINT; } ifp = sc->tun_ifp = if_alloc(type); if (ifp == NULL) panic("%s%d: failed to if_alloc() interface.\n", drv->cdevsw.d_name, dev2unit(dev)); ifp->if_softc = sc; if_initname(ifp, drv->cdevsw.d_name, dev2unit(dev)); ifp->if_ioctl = tunifioctl; ifp->if_flags = iflags; IFQ_SET_MAXLEN(&ifp->if_snd, ifqmaxlen); knlist_init_mtx(&sc->tun_rsel.si_note, &sc->tun_mtx); ifp->if_capabilities |= IFCAP_LINKSTATE; ifp->if_capenable |= IFCAP_LINKSTATE; if ((sc->tun_flags & TUN_L2) != 0) { ifp->if_mtu = ETHERMTU; ifp->if_init = tunifinit; ifp->if_start = tunstart_l2; ether_gen_addr(ifp, &eaddr); ether_ifattach(ifp, eaddr.octet); } else { ifp->if_mtu = TUNMTU; ifp->if_start = tunstart; ifp->if_output = tunoutput; ifp->if_snd.ifq_drv_maxlen = 0; IFQ_SET_READY(&ifp->if_snd); if_attach(ifp); bpfattach(ifp, DLT_NULL, sizeof(u_int32_t)); } dev->si_drv1 = sc; TUN_LOCK(sc); sc->tun_flags |= TUN_INITED; TUN_UNLOCK(sc); TUNDEBUG(ifp, "interface %s is created, minor = %#x\n", ifp->if_xname, dev2unit(dev)); } static void tunrename(void *arg __unused, struct ifnet *ifp) { struct tuntap_softc *tp; int error; if ((ifp->if_flags & IFF_RENAMING) == 0) return; if (tuntap_driver_from_ifnet(ifp) == NULL) return; /* * We need to grab the ioctl sx long enough to make sure the softc is * still there. If it is, we can safely try to busy the tun device. * The busy may fail if the device is currently dying, in which case * we do nothing. If it doesn't fail, the busy count stops the device * from dying until we've created the alias (that will then be * subsequently destroyed). */ sx_xlock(&tun_ioctl_sx); tp = ifp->if_softc; if (tp == NULL) { sx_xunlock(&tun_ioctl_sx); return; } error = tun_busy(tp); sx_xunlock(&tun_ioctl_sx); if (error != 0) return; if (tp->tun_alias != NULL) { destroy_dev(tp->tun_alias); tp->tun_alias = NULL; } if (strcmp(ifp->if_xname, tp->tun_dev->si_name) == 0) goto out; /* * Failure's ok, aliases are created on a best effort basis. If a * tun user/consumer decides to rename the interface to conflict with * another device (non-ifnet) on the system, we will assume they know * what they are doing. make_dev_alias_p won't touch tun_alias on * failure, so we use it but ignore the return value. */ make_dev_alias_p(MAKEDEV_CHECKNAME, &tp->tun_alias, tp->tun_dev, "%s", ifp->if_xname); out: tun_unbusy(tp); } static int tunopen(struct cdev *dev, int flag, int mode, struct thread *td) { struct ifnet *ifp; struct tuntap_driver *drv; struct tuntap_softc *tp; int error, tunflags; tunflags = 0; CURVNET_SET(TD_TO_VNET(td)); error = tuntap_name2info(dev->si_name, NULL, &tunflags); if (error != 0) { CURVNET_RESTORE(); return (error); /* Shouldn't happen */ } if ((tunflags & TUN_L2) != 0) { /* Restrict? */ if (tap_allow_uopen == 0) { error = priv_check(td, PRIV_NET_TAP); if (error != 0) { CURVNET_RESTORE(); return (error); } } } /* * XXXRW: Non-atomic test and set of dev->si_drv1 requires * synchronization. */ tp = dev->si_drv1; if (!tp) { drv = tuntap_driver_from_flags(tunflags); if (drv == NULL) { CURVNET_RESTORE(); return (ENXIO); } tuncreate(dev, drv); tp = dev->si_drv1; } TUN_LOCK(tp); if ((tp->tun_flags & (TUN_OPEN | TUN_DYING)) != 0) { TUN_UNLOCK(tp); CURVNET_RESTORE(); return (EBUSY); } error = tun_busy_locked(tp); KASSERT(error == 0, ("Must be able to busy an unopen tunnel")); ifp = TUN2IFP(tp); if ((tp->tun_flags & TUN_L2) != 0) { bcopy(IF_LLADDR(ifp), tp->tun_ether.octet, sizeof(tp->tun_ether.octet)); ifp->if_drv_flags |= IFF_DRV_RUNNING; ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; if (tapuponopen) ifp->if_flags |= IFF_UP; } tp->tun_pid = td->td_proc->p_pid; tp->tun_flags |= TUN_OPEN; if_link_state_change(ifp, LINK_STATE_UP); TUNDEBUG(ifp, "open\n"); TUN_UNLOCK(tp); CURVNET_RESTORE(); return (0); } /* * tunclose - close the device - mark i/f down & delete * routing info */ static int tunclose(struct cdev *dev, int foo, int bar, struct thread *td) { struct proc *p; struct tuntap_softc *tp; struct ifnet *ifp; bool l2tun; p = td->td_proc; tp = dev->si_drv1; ifp = TUN2IFP(tp); TUN_LOCK(tp); /* * Realistically, we can't be obstinate here. This only means that the * tuntap device was closed out of order, and the last closer wasn't the * controller. These are still good to know about, though, as software * should avoid multiple processes with a tuntap device open and * ill-defined transfer of control (e.g., handoff, TUNSIFPID, close in * parent). */ if (p->p_pid != tp->tun_pid) { log(LOG_INFO, "pid %d (%s), %s: tun/tap protocol violation, non-controlling process closed last.\n", p->p_pid, p->p_comm, dev->si_name); } /* * junk all pending output */ CURVNET_SET(ifp->if_vnet); l2tun = false; if ((tp->tun_flags & TUN_L2) != 0) { l2tun = true; IF_DRAIN(&ifp->if_snd); } else { IFQ_PURGE(&ifp->if_snd); } /* For vmnet, we won't do most of the address/route bits */ if ((tp->tun_flags & TUN_VMNET) != 0 || (l2tun && (ifp->if_flags & IFF_LINK0) != 0)) goto out; if (ifp->if_flags & IFF_UP) { TUN_UNLOCK(tp); if_down(ifp); TUN_LOCK(tp); } /* Delete all addresses and routes which reference this interface. */ if (ifp->if_drv_flags & IFF_DRV_RUNNING) { struct ifaddr *ifa; ifp->if_drv_flags &= ~IFF_DRV_RUNNING; TUN_UNLOCK(tp); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { /* deal w/IPv4 PtP destination; unlocked read */ if (!l2tun && ifa->ifa_addr->sa_family == AF_INET) { rtinit(ifa, (int)RTM_DELETE, tp->tun_flags & TUN_DSTADDR ? RTF_HOST : 0); } else { rtinit(ifa, (int)RTM_DELETE, 0); } } if_purgeaddrs(ifp); TUN_LOCK(tp); } out: if_link_state_change(ifp, LINK_STATE_DOWN); CURVNET_RESTORE(); funsetown(&tp->tun_sigio); selwakeuppri(&tp->tun_rsel, PZERO + 1); KNOTE_LOCKED(&tp->tun_rsel.si_note, 0); TUNDEBUG (ifp, "closed\n"); tp->tun_flags &= ~TUN_OPEN; tp->tun_pid = 0; + tun_vnethdr_set(ifp, 0); tun_unbusy_locked(tp); TUN_UNLOCK(tp); return (0); } static void tuninit(struct ifnet *ifp) { struct tuntap_softc *tp = ifp->if_softc; #ifdef INET struct epoch_tracker et; struct ifaddr *ifa; #endif TUNDEBUG(ifp, "tuninit\n"); TUN_LOCK(tp); ifp->if_drv_flags |= IFF_DRV_RUNNING; if ((tp->tun_flags & TUN_L2) == 0) { ifp->if_flags |= IFF_UP; getmicrotime(&ifp->if_lastchange); #ifdef INET NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family == AF_INET) { struct sockaddr_in *si; si = (struct sockaddr_in *)ifa->ifa_addr; if (si->sin_addr.s_addr) tp->tun_flags |= TUN_IASET; si = (struct sockaddr_in *)ifa->ifa_dstaddr; if (si && si->sin_addr.s_addr) tp->tun_flags |= TUN_DSTADDR; } } NET_EPOCH_EXIT(et); #endif TUN_UNLOCK(tp); } else { ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; TUN_UNLOCK(tp); /* attempt to start output */ tunstart_l2(ifp); } } /* * Used only for l2 tunnel. */ static void tunifinit(void *xtp) { struct tuntap_softc *tp; tp = (struct tuntap_softc *)xtp; tuninit(tp->tun_ifp); } +/* + * To be called under TUN_LOCK. Update ifp->if_hwassist according to the + * current value of ifp->if_capenable. + */ +static void +tun_caps_changed(struct ifnet *ifp) +{ + uint64_t hwassist = 0; + + TUN_LOCK_ASSERT((struct tuntap_softc *)ifp->if_softc); + if (ifp->if_capenable & IFCAP_TXCSUM) + hwassist |= CSUM_TCP | CSUM_UDP; + if (ifp->if_capenable & IFCAP_TXCSUM_IPV6) + hwassist |= CSUM_TCP_IPV6 + | CSUM_UDP_IPV6; + if (ifp->if_capenable & IFCAP_TSO4) + hwassist |= CSUM_IP_TSO; + if (ifp->if_capenable & IFCAP_TSO6) + hwassist |= CSUM_IP6_TSO; + ifp->if_hwassist = hwassist; +} + +/* + * To be called under TUN_LOCK. Update tp->tun_vhdrlen and adjust + * if_capabilities and if_capenable as needed. + */ +static void +tun_vnethdr_set(struct ifnet *ifp, int vhdrlen) +{ + struct tuntap_softc *tp = ifp->if_softc; + + TUN_LOCK_ASSERT(tp); + + if (tp->tun_vhdrlen == vhdrlen) + return; + + /* + * Update if_capabilities to reflect the + * functionalities offered by the virtio-net + * header. + */ + if (vhdrlen != 0) + ifp->if_capabilities |= + TAP_VNET_HDR_CAPS; + else + ifp->if_capabilities &= + ~TAP_VNET_HDR_CAPS; + /* + * Disable any capabilities that we don't + * support anymore. + */ + ifp->if_capenable &= ifp->if_capabilities; + tun_caps_changed(ifp); + tp->tun_vhdrlen = vhdrlen; + + TUNDEBUG(ifp, "vnet_hdr_len=%d, if_capabilities=%x\n", + vhdrlen, ifp->if_capabilities); +} + /* * Process an ioctl request. */ static int tunifioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct ifreq *ifr = (struct ifreq *)data; struct tuntap_softc *tp; struct ifstat *ifs; struct ifmediareq *ifmr; int dummy, error = 0; bool l2tun; ifmr = NULL; sx_xlock(&tun_ioctl_sx); tp = ifp->if_softc; if (tp == NULL) { error = ENXIO; goto bad; } l2tun = (tp->tun_flags & TUN_L2) != 0; switch(cmd) { case SIOCGIFSTATUS: ifs = (struct ifstat *)data; TUN_LOCK(tp); if (tp->tun_pid) snprintf(ifs->ascii, sizeof(ifs->ascii), "\tOpened by PID %d\n", tp->tun_pid); else ifs->ascii[0] = '\0'; TUN_UNLOCK(tp); break; case SIOCSIFADDR: if (l2tun) error = ether_ioctl(ifp, cmd, data); else tuninit(ifp); if (error == 0) TUNDEBUG(ifp, "address set\n"); break; case SIOCSIFMTU: ifp->if_mtu = ifr->ifr_mtu; TUNDEBUG(ifp, "mtu set\n"); break; case SIOCSIFFLAGS: case SIOCADDMULTI: case SIOCDELMULTI: break; case SIOCGIFMEDIA: if (!l2tun) { error = EINVAL; break; } ifmr = (struct ifmediareq *)data; dummy = ifmr->ifm_count; ifmr->ifm_count = 1; ifmr->ifm_status = IFM_AVALID; ifmr->ifm_active = IFM_ETHER; if (tp->tun_flags & TUN_OPEN) ifmr->ifm_status |= IFM_ACTIVE; ifmr->ifm_current = ifmr->ifm_active; if (dummy >= 1) { int media = IFM_ETHER; error = copyout(&media, ifmr->ifm_ulist, sizeof(int)); } break; + case SIOCSIFCAP: + TUN_LOCK(tp); + ifp->if_capenable = ifr->ifr_reqcap; + tun_caps_changed(ifp); + TUN_UNLOCK(tp); + VLAN_CAPABILITIES(ifp); + break; default: if (l2tun) { error = ether_ioctl(ifp, cmd, data); } else { error = EINVAL; } } bad: sx_xunlock(&tun_ioctl_sx); return (error); } /* * tunoutput - queue packets from higher level ready to put out. */ static int tunoutput(struct ifnet *ifp, struct mbuf *m0, const struct sockaddr *dst, struct route *ro) { struct tuntap_softc *tp = ifp->if_softc; u_short cached_tun_flags; int error; u_int32_t af; TUNDEBUG (ifp, "tunoutput\n"); #ifdef MAC error = mac_ifnet_check_transmit(ifp, m0); if (error) { m_freem(m0); return (error); } #endif /* Could be unlocked read? */ TUN_LOCK(tp); cached_tun_flags = tp->tun_flags; TUN_UNLOCK(tp); if ((cached_tun_flags & TUN_READY) != TUN_READY) { TUNDEBUG (ifp, "not ready 0%o\n", tp->tun_flags); m_freem (m0); return (EHOSTDOWN); } if ((ifp->if_flags & IFF_UP) != IFF_UP) { m_freem (m0); return (EHOSTDOWN); } /* BPF writes need to be handled specially. */ if (dst->sa_family == AF_UNSPEC) bcopy(dst->sa_data, &af, sizeof(af)); else af = dst->sa_family; if (bpf_peers_present(ifp->if_bpf)) bpf_mtap2(ifp->if_bpf, &af, sizeof(af), m0); /* prepend sockaddr? this may abort if the mbuf allocation fails */ if (cached_tun_flags & TUN_LMODE) { /* allocate space for sockaddr */ M_PREPEND(m0, dst->sa_len, M_NOWAIT); /* if allocation failed drop packet */ if (m0 == NULL) { if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1); if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return (ENOBUFS); } else { bcopy(dst, m0->m_data, dst->sa_len); } } if (cached_tun_flags & TUN_IFHEAD) { /* Prepend the address family */ M_PREPEND(m0, 4, M_NOWAIT); /* if allocation failed drop packet */ if (m0 == NULL) { if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1); if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return (ENOBUFS); } else *(u_int32_t *)m0->m_data = htonl(af); } else { #ifdef INET if (af != AF_INET) #endif { m_freem(m0); return (EAFNOSUPPORT); } } error = (ifp->if_transmit)(ifp, m0); if (error) return (ENOBUFS); if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); return (0); } /* * the cdevsw interface is now pretty minimal. */ static int tunioctl(struct cdev *dev, u_long cmd, caddr_t data, int flag, struct thread *td) { struct ifreq ifr, *ifrp; struct tuntap_softc *tp = dev->si_drv1; + struct ifnet *ifp = TUN2IFP(tp); struct tuninfo *tunp; - int error, iflags; -#if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \ - defined(COMPAT_FREEBSD4) - int ival; -#endif + int error, iflags, ival; bool l2tun; l2tun = (tp->tun_flags & TUN_L2) != 0; if (l2tun) { /* tap specific ioctls */ switch(cmd) { /* VMware/VMnet port ioctl's */ #if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD4) case _IO('V', 0): ival = IOCPARM_IVAL(data); data = (caddr_t)&ival; /* FALLTHROUGH */ #endif case VMIO_SIOCSIFFLAGS: /* VMware/VMnet SIOCSIFFLAGS */ iflags = *(int *)data; iflags &= TUN_VMIO_FLAG_MASK; iflags &= ~IFF_CANTCHANGE; iflags |= IFF_UP; TUN_LOCK(tp); - TUN2IFP(tp)->if_flags = iflags | - (TUN2IFP(tp)->if_flags & IFF_CANTCHANGE); + ifp->if_flags = iflags | + (ifp->if_flags & IFF_CANTCHANGE); TUN_UNLOCK(tp); return (0); case SIOCGIFADDR: /* get MAC address of the remote side */ TUN_LOCK(tp); bcopy(&tp->tun_ether.octet, data, sizeof(tp->tun_ether.octet)); TUN_UNLOCK(tp); return (0); case SIOCSIFADDR: /* set MAC address of the remote side */ TUN_LOCK(tp); bcopy(data, &tp->tun_ether.octet, sizeof(tp->tun_ether.octet)); TUN_UNLOCK(tp); + return (0); + case TAPSVNETHDR: + ival = *(int *)data; + if (ival != 0 && + ival != sizeof(struct virtio_net_hdr) && + ival != sizeof(struct virtio_net_hdr_mrg_rxbuf)) { + return (EINVAL); + } + TUN_LOCK(tp); + tun_vnethdr_set(ifp, ival); + TUN_UNLOCK(tp); + + return (0); + case TAPGVNETHDR: + TUN_LOCK(tp); + *(int *)data = tp->tun_vhdrlen; + TUN_UNLOCK(tp); + return (0); } /* Fall through to the common ioctls if unhandled */ } else { switch (cmd) { case TUNSLMODE: TUN_LOCK(tp); if (*(int *)data) { tp->tun_flags |= TUN_LMODE; tp->tun_flags &= ~TUN_IFHEAD; } else tp->tun_flags &= ~TUN_LMODE; TUN_UNLOCK(tp); return (0); case TUNSIFHEAD: TUN_LOCK(tp); if (*(int *)data) { tp->tun_flags |= TUN_IFHEAD; tp->tun_flags &= ~TUN_LMODE; } else tp->tun_flags &= ~TUN_IFHEAD; TUN_UNLOCK(tp); return (0); case TUNGIFHEAD: TUN_LOCK(tp); *(int *)data = (tp->tun_flags & TUN_IFHEAD) ? 1 : 0; TUN_UNLOCK(tp); return (0); case TUNSIFMODE: /* deny this if UP */ if (TUN2IFP(tp)->if_flags & IFF_UP) return (EBUSY); switch (*(int *)data & ~IFF_MULTICAST) { case IFF_POINTOPOINT: case IFF_BROADCAST: TUN_LOCK(tp); TUN2IFP(tp)->if_flags &= ~(IFF_BROADCAST|IFF_POINTOPOINT|IFF_MULTICAST); TUN2IFP(tp)->if_flags |= *(int *)data; TUN_UNLOCK(tp); break; default: return (EINVAL); } return (0); case TUNSIFPID: TUN_LOCK(tp); tp->tun_pid = curthread->td_proc->p_pid; TUN_UNLOCK(tp); return (0); } /* Fall through to the common ioctls if unhandled */ } switch (cmd) { case TUNGIFNAME: ifrp = (struct ifreq *)data; strlcpy(ifrp->ifr_name, TUN2IFP(tp)->if_xname, IFNAMSIZ); return (0); case TUNSIFINFO: tunp = (struct tuninfo *)data; if (TUN2IFP(tp)->if_type != tunp->type) return (EPROTOTYPE); TUN_LOCK(tp); if (TUN2IFP(tp)->if_mtu != tunp->mtu) { strlcpy(ifr.ifr_name, if_name(TUN2IFP(tp)), IFNAMSIZ); ifr.ifr_mtu = tunp->mtu; CURVNET_SET(TUN2IFP(tp)->if_vnet); error = ifhwioctl(SIOCSIFMTU, TUN2IFP(tp), (caddr_t)&ifr, td); CURVNET_RESTORE(); if (error) { TUN_UNLOCK(tp); return (error); } } TUN2IFP(tp)->if_baudrate = tunp->baudrate; TUN_UNLOCK(tp); break; case TUNGIFINFO: tunp = (struct tuninfo *)data; TUN_LOCK(tp); tunp->mtu = TUN2IFP(tp)->if_mtu; tunp->type = TUN2IFP(tp)->if_type; tunp->baudrate = TUN2IFP(tp)->if_baudrate; TUN_UNLOCK(tp); break; case TUNSDEBUG: tundebug = *(int *)data; break; case TUNGDEBUG: *(int *)data = tundebug; break; case FIONBIO: break; case FIOASYNC: TUN_LOCK(tp); if (*(int *)data) tp->tun_flags |= TUN_ASYNC; else tp->tun_flags &= ~TUN_ASYNC; TUN_UNLOCK(tp); break; case FIONREAD: if (!IFQ_IS_EMPTY(&TUN2IFP(tp)->if_snd)) { struct mbuf *mb; IFQ_LOCK(&TUN2IFP(tp)->if_snd); IFQ_POLL_NOLOCK(&TUN2IFP(tp)->if_snd, mb); for (*(int *)data = 0; mb != NULL; mb = mb->m_next) *(int *)data += mb->m_len; IFQ_UNLOCK(&TUN2IFP(tp)->if_snd); } else *(int *)data = 0; break; case FIOSETOWN: return (fsetown(*(int *)data, &tp->tun_sigio)); case FIOGETOWN: *(int *)data = fgetown(&tp->tun_sigio); return (0); /* This is deprecated, FIOSETOWN should be used instead. */ case TIOCSPGRP: return (fsetown(-(*(int *)data), &tp->tun_sigio)); /* This is deprecated, FIOGETOWN should be used instead. */ case TIOCGPGRP: *(int *)data = -fgetown(&tp->tun_sigio); return (0); default: return (ENOTTY); } return (0); } /* * The cdevsw read interface - reads a packet at a time, or at * least as much of a packet as can be read. */ static int tunread(struct cdev *dev, struct uio *uio, int flag) { struct tuntap_softc *tp = dev->si_drv1; struct ifnet *ifp = TUN2IFP(tp); struct mbuf *m; - int error=0, len; + size_t len; + int error = 0; TUNDEBUG (ifp, "read\n"); TUN_LOCK(tp); if ((tp->tun_flags & TUN_READY) != TUN_READY) { TUN_UNLOCK(tp); TUNDEBUG (ifp, "not ready 0%o\n", tp->tun_flags); return (EHOSTDOWN); } tp->tun_flags &= ~TUN_RWAIT; for (;;) { IFQ_DEQUEUE(&ifp->if_snd, m); if (m != NULL) break; if (flag & O_NONBLOCK) { TUN_UNLOCK(tp); return (EWOULDBLOCK); } tp->tun_flags |= TUN_RWAIT; error = mtx_sleep(tp, &tp->tun_mtx, PCATCH | (PZERO + 1), "tunread", 0); if (error != 0) { TUN_UNLOCK(tp); return (error); } } TUN_UNLOCK(tp); if ((tp->tun_flags & TUN_L2) != 0) BPF_MTAP(ifp, m); + len = min(tp->tun_vhdrlen, uio->uio_resid); + if (len > 0) { + struct virtio_net_hdr_mrg_rxbuf vhdr; + + bzero(&vhdr, sizeof(vhdr)); + if (m->m_pkthdr.csum_flags & TAP_ALL_OFFLOAD) { + m = virtio_net_tx_offload(ifp, m, false, &vhdr.hdr); + } + + TUNDEBUG(ifp, "txvhdr: f %u, gt %u, hl %u, " + "gs %u, cs %u, co %u\n", vhdr.hdr.flags, + vhdr.hdr.gso_type, vhdr.hdr.hdr_len, + vhdr.hdr.gso_size, vhdr.hdr.csum_start, + vhdr.hdr.csum_offset); + error = uiomove(&vhdr, len, uio); + } + while (m && uio->uio_resid > 0 && error == 0) { len = min(uio->uio_resid, m->m_len); if (len != 0) error = uiomove(mtod(m, void *), len, uio); m = m_free(m); } if (m) { TUNDEBUG(ifp, "Dropping mbuf\n"); m_freem(m); } return (error); } static int -tunwrite_l2(struct tuntap_softc *tp, struct mbuf *m) +tunwrite_l2(struct tuntap_softc *tp, struct mbuf *m, + struct virtio_net_hdr_mrg_rxbuf *vhdr) { struct ether_header *eh; struct ifnet *ifp; ifp = TUN2IFP(tp); /* * Only pass a unicast frame to ether_input(), if it would * actually have been received by non-virtual hardware. */ if (m->m_len < sizeof(struct ether_header)) { m_freem(m); return (0); } eh = mtod(m, struct ether_header *); if (eh && (ifp->if_flags & IFF_PROMISC) == 0 && !ETHER_IS_MULTICAST(eh->ether_dhost) && bcmp(eh->ether_dhost, IF_LLADDR(ifp), ETHER_ADDR_LEN) != 0) { m_freem(m); return (0); } + if (vhdr != NULL && virtio_net_rx_csum(m, &vhdr->hdr)) { + m_freem(m); + return (0); + } + /* Pass packet up to parent. */ CURVNET_SET(ifp->if_vnet); (*ifp->if_input)(ifp, m); CURVNET_RESTORE(); /* ibytes are counted in parent */ if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); return (0); } static int tunwrite_l3(struct tuntap_softc *tp, struct mbuf *m) { struct epoch_tracker et; struct ifnet *ifp; int family, isr; ifp = TUN2IFP(tp); /* Could be unlocked read? */ TUN_LOCK(tp); if (tp->tun_flags & TUN_IFHEAD) { TUN_UNLOCK(tp); if (m->m_len < sizeof(family) && (m = m_pullup(m, sizeof(family))) == NULL) return (ENOBUFS); family = ntohl(*mtod(m, u_int32_t *)); m_adj(m, sizeof(family)); } else { TUN_UNLOCK(tp); family = AF_INET; } BPF_MTAP2(ifp, &family, sizeof(family), m); switch (family) { #ifdef INET case AF_INET: isr = NETISR_IP; break; #endif #ifdef INET6 case AF_INET6: isr = NETISR_IPV6; break; #endif default: m_freem(m); return (EAFNOSUPPORT); } random_harvest_queue(m, sizeof(*m), RANDOM_NET_TUN); if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len); if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); CURVNET_SET(ifp->if_vnet); M_SETFIB(m, ifp->if_fib); NET_EPOCH_ENTER(et); netisr_dispatch(isr, m); NET_EPOCH_EXIT(et); CURVNET_RESTORE(); return (0); } /* * the cdevsw write interface - an atomic write is a packet - or else! */ static int tunwrite(struct cdev *dev, struct uio *uio, int flag) { + struct virtio_net_hdr_mrg_rxbuf vhdr; struct tuntap_softc *tp; struct ifnet *ifp; struct mbuf *m; uint32_t mru; - int align; + int align, vhdrlen, error; bool l2tun; tp = dev->si_drv1; ifp = TUN2IFP(tp); TUNDEBUG(ifp, "tunwrite\n"); if ((ifp->if_flags & IFF_UP) != IFF_UP) /* ignore silently */ return (0); if (uio->uio_resid == 0) return (0); l2tun = (tp->tun_flags & TUN_L2) != 0; - align = 0; mru = l2tun ? TAPMRU : TUNMRU; - if (l2tun) + vhdrlen = tp->tun_vhdrlen; + align = 0; + if (l2tun) { align = ETHER_ALIGN; - else if ((tp->tun_flags & TUN_IFHEAD) != 0) + mru += vhdrlen; + } else if ((tp->tun_flags & TUN_IFHEAD) != 0) mru += sizeof(uint32_t); /* family */ if (uio->uio_resid < 0 || uio->uio_resid > mru) { TUNDEBUG(ifp, "len=%zd!\n", uio->uio_resid); return (EIO); } + if (vhdrlen > 0) { + error = uiomove(&vhdr, vhdrlen, uio); + if (error != 0) + return (error); + TUNDEBUG(ifp, "txvhdr: f %u, gt %u, hl %u, " + "gs %u, cs %u, co %u\n", vhdr.hdr.flags, + vhdr.hdr.gso_type, vhdr.hdr.hdr_len, + vhdr.hdr.gso_size, vhdr.hdr.csum_start, + vhdr.hdr.csum_offset); + } + if ((m = m_uiotombuf(uio, M_NOWAIT, 0, align, M_PKTHDR)) == NULL) { if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); return (ENOBUFS); } m->m_pkthdr.rcvif = ifp; #ifdef MAC mac_ifnet_create_mbuf(ifp, m); #endif if (l2tun) - return (tunwrite_l2(tp, m)); + return (tunwrite_l2(tp, m, vhdrlen > 0 ? &vhdr : NULL)); return (tunwrite_l3(tp, m)); } /* * tunpoll - the poll interface, this is only useful on reads * really. The write detect always returns true, write never blocks * anyway, it either accepts the packet or drops it. */ static int tunpoll(struct cdev *dev, int events, struct thread *td) { struct tuntap_softc *tp = dev->si_drv1; struct ifnet *ifp = TUN2IFP(tp); int revents = 0; TUNDEBUG(ifp, "tunpoll\n"); if (events & (POLLIN | POLLRDNORM)) { IFQ_LOCK(&ifp->if_snd); if (!IFQ_IS_EMPTY(&ifp->if_snd)) { TUNDEBUG(ifp, "tunpoll q=%d\n", ifp->if_snd.ifq_len); revents |= events & (POLLIN | POLLRDNORM); } else { TUNDEBUG(ifp, "tunpoll waiting\n"); selrecord(td, &tp->tun_rsel); } IFQ_UNLOCK(&ifp->if_snd); } revents |= events & (POLLOUT | POLLWRNORM); return (revents); } /* * tunkqfilter - support for the kevent() system call. */ static int tunkqfilter(struct cdev *dev, struct knote *kn) { struct tuntap_softc *tp = dev->si_drv1; struct ifnet *ifp = TUN2IFP(tp); switch(kn->kn_filter) { case EVFILT_READ: TUNDEBUG(ifp, "%s kqfilter: EVFILT_READ, minor = %#x\n", ifp->if_xname, dev2unit(dev)); kn->kn_fop = &tun_read_filterops; break; case EVFILT_WRITE: TUNDEBUG(ifp, "%s kqfilter: EVFILT_WRITE, minor = %#x\n", ifp->if_xname, dev2unit(dev)); kn->kn_fop = &tun_write_filterops; break; default: TUNDEBUG(ifp, "%s kqfilter: invalid filter, minor = %#x\n", ifp->if_xname, dev2unit(dev)); return(EINVAL); } kn->kn_hook = tp; knlist_add(&tp->tun_rsel.si_note, kn, 0); return (0); } /* * Return true of there is data in the interface queue. */ static int tunkqread(struct knote *kn, long hint) { int ret; struct tuntap_softc *tp = kn->kn_hook; struct cdev *dev = tp->tun_dev; struct ifnet *ifp = TUN2IFP(tp); if ((kn->kn_data = ifp->if_snd.ifq_len) > 0) { TUNDEBUG(ifp, "%s have data in the queue. Len = %d, minor = %#x\n", ifp->if_xname, ifp->if_snd.ifq_len, dev2unit(dev)); ret = 1; } else { TUNDEBUG(ifp, "%s waiting for data, minor = %#x\n", ifp->if_xname, dev2unit(dev)); ret = 0; } return (ret); } /* * Always can write, always return MTU in kn->data. */ static int tunkqwrite(struct knote *kn, long hint) { struct tuntap_softc *tp = kn->kn_hook; struct ifnet *ifp = TUN2IFP(tp); kn->kn_data = ifp->if_mtu; return (1); } static void tunkqdetach(struct knote *kn) { struct tuntap_softc *tp = kn->kn_hook; knlist_remove(&tp->tun_rsel.si_note, kn, 0); }