Index: head/sys/dev/netmap/netmap_freebsd.c =================================================================== --- head/sys/dev/netmap/netmap_freebsd.c (revision 344252) +++ head/sys/dev/netmap/netmap_freebsd.c (revision 344253) @@ -1,1603 +1,1618 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ /* $FreeBSD$ */ #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include /* POLLIN, POLLOUT */ #include /* types used in module initialization */ #include /* DEV_MODULE_ORDERED */ #include #include /* kern_ioctl() */ #include #include /* vtophys */ #include /* vtophys */ #include #include #include #include #include #include #include /* sockaddrs */ #include #include /* kthread_add() */ #include /* PROC_LOCK() */ #include /* RFNOWAIT */ #include /* sched_bind() */ #include /* mp_maxid */ #include /* taskqueue_enqueue(), taskqueue_create(), ... */ #include #include #include /* IFT_ETHER */ #include /* ether_ifdetach */ #include /* LLADDR */ #include /* bus_dmamap_* */ #include /* in6_cksum_pseudo() */ #include /* in_pseudo(), in_cksum_hdr() */ #include #include #include #include /* ======================== FREEBSD-SPECIFIC ROUTINES ================== */ static void nm_kqueue_notify(void *opaque, int pending) { struct nm_selinfo *si = opaque; /* We use a non-zero hint to distinguish this notification call * from the call done in kqueue_scan(), which uses hint=0. 
*/ KNOTE_UNLOCKED(&si->si.si_note, /*hint=*/0x100); } int nm_os_selinfo_init(NM_SELINFO_T *si, const char *name) { int err; TASK_INIT(&si->ntfytask, 0, nm_kqueue_notify, si); si->ntfytq = taskqueue_create(name, M_NOWAIT, taskqueue_thread_enqueue, &si->ntfytq); if (si->ntfytq == NULL) return -ENOMEM; err = taskqueue_start_threads(&si->ntfytq, 1, PI_NET, "tq %s", name); if (err) { taskqueue_free(si->ntfytq); si->ntfytq = NULL; return err; } snprintf(si->mtxname, sizeof(si->mtxname), "nmkl%s", name); mtx_init(&si->m, si->mtxname, NULL, MTX_DEF); knlist_init_mtx(&si->si.si_note, &si->m); + si->kqueue_users = 0; return (0); } void nm_os_selinfo_uninit(NM_SELINFO_T *si) { if (si->ntfytq == NULL) { return; /* si was not initialized */ } taskqueue_drain(si->ntfytq, &si->ntfytask); taskqueue_free(si->ntfytq); si->ntfytq = NULL; knlist_delete(&si->si.si_note, curthread, /*islocked=*/0); knlist_destroy(&si->si.si_note); /* now we don't need the mutex anymore */ mtx_destroy(&si->m); } void * nm_os_malloc(size_t size) { return malloc(size, M_DEVBUF, M_NOWAIT | M_ZERO); } void * nm_os_realloc(void *addr, size_t new_size, size_t old_size __unused) { return realloc(addr, new_size, M_DEVBUF, M_NOWAIT | M_ZERO); } void nm_os_free(void *addr) { free(addr, M_DEVBUF); } void nm_os_ifnet_lock(void) { IFNET_RLOCK(); } void nm_os_ifnet_unlock(void) { IFNET_RUNLOCK(); } static int netmap_use_count = 0; void nm_os_get_module(void) { netmap_use_count++; } void nm_os_put_module(void) { netmap_use_count--; } static void netmap_ifnet_arrival_handler(void *arg __unused, struct ifnet *ifp) { netmap_undo_zombie(ifp); } static void netmap_ifnet_departure_handler(void *arg __unused, struct ifnet *ifp) { netmap_make_zombie(ifp); } static eventhandler_tag nm_ifnet_ah_tag; static eventhandler_tag nm_ifnet_dh_tag; int nm_os_ifnet_init(void) { nm_ifnet_ah_tag = EVENTHANDLER_REGISTER(ifnet_arrival_event, netmap_ifnet_arrival_handler, NULL, EVENTHANDLER_PRI_ANY); nm_ifnet_dh_tag = 
EVENTHANDLER_REGISTER(ifnet_departure_event, netmap_ifnet_departure_handler, NULL, EVENTHANDLER_PRI_ANY); return 0; } void nm_os_ifnet_fini(void) { EVENTHANDLER_DEREGISTER(ifnet_arrival_event, nm_ifnet_ah_tag); EVENTHANDLER_DEREGISTER(ifnet_departure_event, nm_ifnet_dh_tag); } unsigned nm_os_ifnet_mtu(struct ifnet *ifp) { #if __FreeBSD_version < 1100030 return ifp->if_data.ifi_mtu; #else /* __FreeBSD_version >= 1100030 */ return ifp->if_mtu; #endif } rawsum_t nm_os_csum_raw(uint8_t *data, size_t len, rawsum_t cur_sum) { /* TODO XXX please use the FreeBSD implementation for this. */ uint16_t *words = (uint16_t *)data; int nw = len / 2; int i; for (i = 0; i < nw; i++) cur_sum += be16toh(words[i]); if (len & 1) cur_sum += (data[len-1] << 8); return cur_sum; } /* Fold a raw checksum: 'cur_sum' is in host byte order, while the * return value is in network byte order. */ uint16_t nm_os_csum_fold(rawsum_t cur_sum) { /* TODO XXX please use the FreeBSD implementation for this. */ while (cur_sum >> 16) cur_sum = (cur_sum & 0xFFFF) + (cur_sum >> 16); return htobe16((~cur_sum) & 0xFFFF); } uint16_t nm_os_csum_ipv4(struct nm_iphdr *iph) { #if 0 return in_cksum_hdr((void *)iph); #else return nm_os_csum_fold(nm_os_csum_raw((uint8_t*)iph, sizeof(struct nm_iphdr), 0)); #endif } void nm_os_csum_tcpudp_ipv4(struct nm_iphdr *iph, void *data, size_t datalen, uint16_t *check) { #ifdef INET uint16_t pseudolen = datalen + iph->protocol; /* Compute and insert the pseudo-header cheksum. */ *check = in_pseudo(iph->saddr, iph->daddr, htobe16(pseudolen)); /* Compute the checksum on TCP/UDP header + payload * (includes the pseudo-header). 
*/ *check = nm_os_csum_fold(nm_os_csum_raw(data, datalen, 0)); #else static int notsupported = 0; if (!notsupported) { notsupported = 1; nm_prerr("inet4 segmentation not supported"); } #endif } void nm_os_csum_tcpudp_ipv6(struct nm_ipv6hdr *ip6h, void *data, size_t datalen, uint16_t *check) { #ifdef INET6 *check = in6_cksum_pseudo((void*)ip6h, datalen, ip6h->nexthdr, 0); *check = nm_os_csum_fold(nm_os_csum_raw(data, datalen, 0)); #else static int notsupported = 0; if (!notsupported) { notsupported = 1; nm_prerr("inet6 segmentation not supported"); } #endif } /* on FreeBSD we send up one packet at a time */ void * nm_os_send_up(struct ifnet *ifp, struct mbuf *m, struct mbuf *prev) { NA(ifp)->if_input(ifp, m); return NULL; } int nm_os_mbuf_has_csum_offld(struct mbuf *m) { return m->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_UDP | CSUM_SCTP | CSUM_TCP_IPV6 | CSUM_UDP_IPV6 | CSUM_SCTP_IPV6); } int nm_os_mbuf_has_seg_offld(struct mbuf *m) { return m->m_pkthdr.csum_flags & CSUM_TSO; } static void freebsd_generic_rx_handler(struct ifnet *ifp, struct mbuf *m) { int stolen; if (unlikely(!NM_NA_VALID(ifp))) { nm_prlim(1, "Warning: RX packet intercepted, but no" " emulated adapter"); return; } stolen = generic_rx_handler(ifp, m); if (!stolen) { struct netmap_generic_adapter *gna = (struct netmap_generic_adapter *)NA(ifp); gna->save_if_input(ifp, m); } } /* * Intercept the rx routine in the standard device driver. 
* Second argument is non-zero to intercept, 0 to restore */ int nm_os_catch_rx(struct netmap_generic_adapter *gna, int intercept) { struct netmap_adapter *na = &gna->up.up; struct ifnet *ifp = na->ifp; int ret = 0; nm_os_ifnet_lock(); if (intercept) { if (gna->save_if_input) { nm_prerr("RX on %s already intercepted", na->name); ret = EBUSY; /* already set */ goto out; } gna->save_if_input = ifp->if_input; ifp->if_input = freebsd_generic_rx_handler; } else { if (!gna->save_if_input) { nm_prerr("Failed to undo RX intercept on %s", na->name); ret = EINVAL; /* not saved */ goto out; } ifp->if_input = gna->save_if_input; gna->save_if_input = NULL; } out: nm_os_ifnet_unlock(); return ret; } /* * Intercept the packet steering routine in the tx path, * so that we can decide which queue is used for an mbuf. * Second argument is non-zero to intercept, 0 to restore. * On freebsd we just intercept if_transmit. */ int nm_os_catch_tx(struct netmap_generic_adapter *gna, int intercept) { struct netmap_adapter *na = &gna->up.up; struct ifnet *ifp = netmap_generic_getifp(gna); nm_os_ifnet_lock(); if (intercept) { na->if_transmit = ifp->if_transmit; ifp->if_transmit = netmap_transmit; } else { ifp->if_transmit = na->if_transmit; } nm_os_ifnet_unlock(); return 0; } /* * Transmit routine used by generic_netmap_txsync(). Returns 0 on success * and non-zero on error (which may be packet drops or other errors). * addr and len identify the netmap buffer, m is the (preallocated) * mbuf to use for transmissions. * * We should add a reference to the mbuf so the m_freem() at the end * of the transmission does not consume resources. 
* * On FreeBSD, and on multiqueue cards, we can force the queue using * if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) * i = m->m_pkthdr.flowid % adapter->num_queues; * else * i = curcpu % adapter->num_queues; * */ int nm_os_generic_xmit_frame(struct nm_os_gen_arg *a) { int ret; u_int len = a->len; struct ifnet *ifp = a->ifp; struct mbuf *m = a->m; #if __FreeBSD_version < 1100000 /* * Old FreeBSD versions. The mbuf has a cluster attached, * we need to copy from the cluster to the netmap buffer. */ if (MBUF_REFCNT(m) != 1) { nm_prerr("invalid refcnt %d for %p", MBUF_REFCNT(m), m); panic("in generic_xmit_frame"); } if (m->m_ext.ext_size < len) { nm_prlim(2, "size %d < len %d", m->m_ext.ext_size, len); len = m->m_ext.ext_size; } bcopy(a->addr, m->m_data, len); #else /* __FreeBSD_version >= 1100000 */ /* New FreeBSD versions. Link the external storage to * the netmap buffer, so that no copy is necessary. */ m->m_ext.ext_buf = m->m_data = a->addr; m->m_ext.ext_size = len; #endif /* __FreeBSD_version >= 1100000 */ m->m_len = m->m_pkthdr.len = len; /* mbuf refcnt is not contended, no need to use atomic * (a memory barrier is enough). */ SET_MBUF_REFCNT(m, 2); M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE); m->m_pkthdr.flowid = a->ring_nr; m->m_pkthdr.rcvif = ifp; /* used for tx notification */ ret = NA(ifp)->if_transmit(ifp, m); return ret ? -1 : 0; } #if __FreeBSD_version >= 1100005 struct netmap_adapter * netmap_getna(if_t ifp) { return (NA((struct ifnet *)ifp)); } #endif /* __FreeBSD_version >= 1100005 */ /* * The following two functions are empty until we have a generic * way to extract the info from the ifp */ int nm_os_generic_find_num_desc(struct ifnet *ifp, unsigned int *tx, unsigned int *rx) { return 0; } void nm_os_generic_find_num_queues(struct ifnet *ifp, u_int *txq, u_int *rxq) { unsigned num_rings = netmap_generic_rings ? 
netmap_generic_rings : 1; *txq = num_rings; *rxq = num_rings; } void nm_os_generic_set_features(struct netmap_generic_adapter *gna) { gna->rxsg = 1; /* Supported through m_copydata. */ gna->txqdisc = 0; /* Not supported. */ } void nm_os_mitigation_init(struct nm_generic_mit *mit, int idx, struct netmap_adapter *na) { mit->mit_pending = 0; mit->mit_ring_idx = idx; mit->mit_na = na; } void nm_os_mitigation_start(struct nm_generic_mit *mit) { } void nm_os_mitigation_restart(struct nm_generic_mit *mit) { } int nm_os_mitigation_active(struct nm_generic_mit *mit) { return 0; } void nm_os_mitigation_cleanup(struct nm_generic_mit *mit) { } static int nm_vi_dummy(struct ifnet *ifp, u_long cmd, caddr_t addr) { return EINVAL; } static void nm_vi_start(struct ifnet *ifp) { panic("nm_vi_start() must not be called"); } /* * Index manager of persistent virtual interfaces. * It is used to decide the lowest byte of the MAC address. * We use the same algorithm with management of bridge port index. */ #define NM_VI_MAX 255 static struct { uint8_t index[NM_VI_MAX]; /* XXX just for a reasonable number */ uint8_t active; struct mtx lock; } nm_vi_indices; void nm_os_vi_init_index(void) { int i; for (i = 0; i < NM_VI_MAX; i++) nm_vi_indices.index[i] = i; nm_vi_indices.active = 0; mtx_init(&nm_vi_indices.lock, "nm_vi_indices_lock", NULL, MTX_DEF); } /* return -1 if no index available */ static int nm_vi_get_index(void) { int ret; mtx_lock(&nm_vi_indices.lock); ret = nm_vi_indices.active == NM_VI_MAX ? 
-1 : nm_vi_indices.index[nm_vi_indices.active++]; mtx_unlock(&nm_vi_indices.lock); return ret; } static void nm_vi_free_index(uint8_t val) { int i, lim; mtx_lock(&nm_vi_indices.lock); lim = nm_vi_indices.active; for (i = 0; i < lim; i++) { if (nm_vi_indices.index[i] == val) { /* swap index[lim-1] and j */ int tmp = nm_vi_indices.index[lim-1]; nm_vi_indices.index[lim-1] = val; nm_vi_indices.index[i] = tmp; nm_vi_indices.active--; break; } } if (lim == nm_vi_indices.active) nm_prerr("Index %u not found", val); mtx_unlock(&nm_vi_indices.lock); } #undef NM_VI_MAX /* * Implementation of a netmap-capable virtual interface that * registered to the system. * It is based on if_tap.c and ip_fw_log.c in FreeBSD 9. * * Note: Linux sets refcount to 0 on allocation of net_device, * then increments it on registration to the system. * FreeBSD sets refcount to 1 on if_alloc(), and does not * increment this refcount on if_attach(). */ int nm_os_vi_persist(const char *name, struct ifnet **ret) { struct ifnet *ifp; u_short macaddr_hi; uint32_t macaddr_mid; u_char eaddr[6]; int unit = nm_vi_get_index(); /* just to decide MAC address */ if (unit < 0) return EBUSY; /* * We use the same MAC address generation method with tap * except for the highest octet is 00:be instead of 00:bd */ macaddr_hi = htons(0x00be); /* XXX tap + 1 */ macaddr_mid = (uint32_t) ticks; bcopy(&macaddr_hi, eaddr, sizeof(short)); bcopy(&macaddr_mid, &eaddr[2], sizeof(uint32_t)); eaddr[5] = (uint8_t)unit; ifp = if_alloc(IFT_ETHER); if (ifp == NULL) { nm_prerr("if_alloc failed"); return ENOMEM; } if_initname(ifp, name, IF_DUNIT_NONE); ifp->if_mtu = 65536; ifp->if_flags = IFF_UP | IFF_SIMPLEX | IFF_MULTICAST; ifp->if_init = (void *)nm_vi_dummy; ifp->if_ioctl = nm_vi_dummy; ifp->if_start = nm_vi_start; ifp->if_mtu = ETHERMTU; IFQ_SET_MAXLEN(&ifp->if_snd, ifqmaxlen); ifp->if_capabilities |= IFCAP_LINKSTATE; ifp->if_capenable |= IFCAP_LINKSTATE; ether_ifattach(ifp, eaddr); *ret = ifp; return 0; } /* unregister from the 
system and drop the final refcount */ void nm_os_vi_detach(struct ifnet *ifp) { nm_vi_free_index(((char *)IF_LLADDR(ifp))[5]); ether_ifdetach(ifp); if_free(ifp); } #ifdef WITH_EXTMEM #include #include struct nm_os_extmem { vm_object_t obj; vm_offset_t kva; vm_offset_t size; uintptr_t scan; }; void nm_os_extmem_delete(struct nm_os_extmem *e) { nm_prinf("freeing %zx bytes", (size_t)e->size); vm_map_remove(kernel_map, e->kva, e->kva + e->size); nm_os_free(e); } char * nm_os_extmem_nextpage(struct nm_os_extmem *e) { char *rv = NULL; if (e->scan < e->kva + e->size) { rv = (char *)e->scan; e->scan += PAGE_SIZE; } return rv; } int nm_os_extmem_isequal(struct nm_os_extmem *e1, struct nm_os_extmem *e2) { return (e1->obj == e2->obj); } int nm_os_extmem_nr_pages(struct nm_os_extmem *e) { return e->size >> PAGE_SHIFT; } struct nm_os_extmem * nm_os_extmem_create(unsigned long p, struct nmreq_pools_info *pi, int *perror) { vm_map_t map; vm_map_entry_t entry; vm_object_t obj; vm_prot_t prot; vm_pindex_t index; boolean_t wired; struct nm_os_extmem *e = NULL; int rv, error = 0; e = nm_os_malloc(sizeof(*e)); if (e == NULL) { error = ENOMEM; goto out; } map = &curthread->td_proc->p_vmspace->vm_map; rv = vm_map_lookup(&map, p, VM_PROT_RW, &entry, &obj, &index, &prot, &wired); if (rv != KERN_SUCCESS) { nm_prerr("address %lx not found", p); goto out_free; } /* check that we are given the whole vm_object ? */ vm_map_lookup_done(map, entry); // XXX can we really use obj after releasing the map lock? 
e->obj = obj; vm_object_reference(obj); /* wire the memory and add the vm_object to the kernel map, * to make sure that it is not fred even if the processes that * are mmap()ing it all exit */ e->kva = vm_map_min(kernel_map); e->size = obj->size << PAGE_SHIFT; rv = vm_map_find(kernel_map, obj, 0, &e->kva, e->size, 0, VMFS_OPTIMAL_SPACE, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0); if (rv != KERN_SUCCESS) { nm_prerr("vm_map_find(%zx) failed", (size_t)e->size); goto out_rel; } rv = vm_map_wire(kernel_map, e->kva, e->kva + e->size, VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES); if (rv != KERN_SUCCESS) { nm_prerr("vm_map_wire failed"); goto out_rem; } e->scan = e->kva; return e; out_rem: vm_map_remove(kernel_map, e->kva, e->kva + e->size); e->obj = NULL; out_rel: vm_object_deallocate(e->obj); out_free: nm_os_free(e); out: if (perror) *perror = error; return NULL; } #endif /* WITH_EXTMEM */ /* ================== PTNETMAP GUEST SUPPORT ==================== */ #ifdef WITH_PTNETMAP #include #include #include /* bus_dmamap_* */ #include #include #include /* * ptnetmap memory device (memdev) for freebsd guest, * ssed to expose host netmap memory to the guest through a PCI BAR. */ /* * ptnetmap memdev private data structure */ struct ptnetmap_memdev { device_t dev; struct resource *pci_io; struct resource *pci_mem; struct netmap_mem_d *nm_mem; }; static int ptn_memdev_probe(device_t); static int ptn_memdev_attach(device_t); static int ptn_memdev_detach(device_t); static int ptn_memdev_shutdown(device_t); static device_method_t ptn_memdev_methods[] = { DEVMETHOD(device_probe, ptn_memdev_probe), DEVMETHOD(device_attach, ptn_memdev_attach), DEVMETHOD(device_detach, ptn_memdev_detach), DEVMETHOD(device_shutdown, ptn_memdev_shutdown), DEVMETHOD_END }; static driver_t ptn_memdev_driver = { PTNETMAP_MEMDEV_NAME, ptn_memdev_methods, sizeof(struct ptnetmap_memdev), }; /* We use (SI_ORDER_MIDDLE+1) here, see DEV_MODULE_ORDERED() invocation * below. 
*/
static devclass_t ptnetmap_devclass;
DRIVER_MODULE_ORDERED(ptn_memdev, pci, ptn_memdev_driver, ptnetmap_devclass,
		      NULL, NULL, SI_ORDER_MIDDLE + 1);

/*
 * Map host netmap memory through PCI-BAR in the guest OS,
 * returning physical (nm_paddr) and virtual (nm_addr) addresses
 * of the netmap memory mapped in the guest.
 *
 * The memory size is read from the device I/O registers and stored in
 * *mem_size. Returns 0 on success; if the memory BAR cannot be
 * allocated, *nm_paddr and *nm_addr are cleared and ENOMEM is returned.
 */
int
nm_os_pt_memdev_iomap(struct ptnetmap_memdev *ptn_dev, vm_paddr_t *nm_paddr,
		      void **nm_addr, uint64_t *mem_size)
{
	int rid;

	nm_prinf("ptn_memdev_driver iomap");

	rid = PCIR_BAR(PTNETMAP_MEM_PCI_BAR);
	/* The 64-bit size is exposed as two 32-bit registers: HI, then LO. */
	*mem_size = bus_read_4(ptn_dev->pci_io, PTNET_MDEV_IO_MEMSIZE_HI);
	*mem_size = bus_read_4(ptn_dev->pci_io, PTNET_MDEV_IO_MEMSIZE_LO) |
			(*mem_size << 32);

	/* map memory allocator */
	ptn_dev->pci_mem = bus_alloc_resource(ptn_dev->dev, SYS_RES_MEMORY,
			&rid, 0, ~0, *mem_size, RF_ACTIVE);
	if (ptn_dev->pci_mem == NULL) {
		*nm_paddr = 0;
		*nm_addr = NULL;
		return ENOMEM;
	}

	*nm_paddr = rman_get_start(ptn_dev->pci_mem);
	*nm_addr = rman_get_virtual(ptn_dev->pci_mem);

	nm_prinf("=== BAR %d start %lx len %lx mem_size %lx ===",
			PTNETMAP_MEM_PCI_BAR,
			(unsigned long)(*nm_paddr),
			(unsigned long)rman_get_size(ptn_dev->pci_mem),
			(unsigned long)*mem_size);
	return (0);
}

/* Read the 32-bit memdev I/O register at offset 'reg'. */
uint32_t
nm_os_pt_memdev_ioread(struct ptnetmap_memdev *ptn_dev, unsigned int reg)
{
	return bus_read_4(ptn_dev->pci_io, reg);
}

/* Unmap host netmap memory.
*/
void
nm_os_pt_memdev_iounmap(struct ptnetmap_memdev *ptn_dev)
{
	nm_prinf("ptn_memdev_driver iounmap");

	/* Nothing to release if the memory BAR is not currently mapped. */
	if (ptn_dev->pci_mem) {
		bus_release_resource(ptn_dev->dev, SYS_RES_MEMORY,
			PCIR_BAR(PTNETMAP_MEM_PCI_BAR), ptn_dev->pci_mem);
		ptn_dev->pci_mem = NULL;
	}
}

/* Device identification routine, return BUS_PROBE_DEFAULT on success,
 * positive on failure (ENXIO when the PCI vendor or device ID does not
 * match the ptnetmap memdev) */
static int
ptn_memdev_probe(device_t dev)
{
	char desc[256];

	if (pci_get_vendor(dev) != PTNETMAP_PCI_VENDOR_ID)
		return (ENXIO);
	if (pci_get_device(dev) != PTNETMAP_PCI_DEVICE_ID)
		return (ENXIO);

	snprintf(desc, sizeof(desc), "%s PCI adapter",
			PTNETMAP_MEMDEV_NAME);
	device_set_desc_copy(dev, desc);

	return (BUS_PROBE_DEFAULT);
}

/* Device initialization routine.
 * Returns 0 on success, ENXIO if the I/O BAR cannot be mapped and
 * ENOMEM if the guest allocator cannot be created. */
static int
ptn_memdev_attach(device_t dev)
{
	struct ptnetmap_memdev *ptn_dev;
	int rid;
	uint16_t mem_id;

	ptn_dev = device_get_softc(dev);
	ptn_dev->dev = dev;

	pci_enable_busmaster(dev);

	/* Map the I/O BAR: the device registers are read through it. */
	rid = PCIR_BAR(PTNETMAP_IO_PCI_BAR);
	ptn_dev->pci_io = bus_alloc_resource_any(dev, SYS_RES_IOPORT, &rid,
						 RF_ACTIVE);
	if (ptn_dev->pci_io == NULL) {
	        device_printf(dev, "cannot map I/O space\n");
	        return (ENXIO);
	}

	/* NOTE(review): the 32-bit register value is narrowed to 16 bits. */
	mem_id = bus_read_4(ptn_dev->pci_io, PTNET_MDEV_IO_MEMID);

	/* create guest allocator */
	ptn_dev->nm_mem = netmap_mem_pt_guest_attach(ptn_dev, mem_id);
	if (ptn_dev->nm_mem == NULL) {
		/* detach releases the I/O BAR mapped above */
		ptn_memdev_detach(dev);
	        return (ENOMEM);
	}
	/* paired with the netmap_mem_put() done in ptn_memdev_detach() */
	netmap_mem_get(ptn_dev->nm_mem);

	nm_prinf("ptnetmap memdev attached, host memid: %u", mem_id);

	return (0);
}

/* Device removal routine.
*/ static int ptn_memdev_detach(device_t dev) { struct ptnetmap_memdev *ptn_dev; ptn_dev = device_get_softc(dev); if (ptn_dev->nm_mem) { nm_prinf("ptnetmap memdev detached, host memid %u", netmap_mem_get_id(ptn_dev->nm_mem)); netmap_mem_put(ptn_dev->nm_mem); ptn_dev->nm_mem = NULL; } if (ptn_dev->pci_mem) { bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BAR(PTNETMAP_MEM_PCI_BAR), ptn_dev->pci_mem); ptn_dev->pci_mem = NULL; } if (ptn_dev->pci_io) { bus_release_resource(dev, SYS_RES_IOPORT, PCIR_BAR(PTNETMAP_IO_PCI_BAR), ptn_dev->pci_io); ptn_dev->pci_io = NULL; } return (0); } static int ptn_memdev_shutdown(device_t dev) { return bus_generic_shutdown(dev); } #endif /* WITH_PTNETMAP */ /* * In order to track whether pages are still mapped, we hook into * the standard cdev_pager and intercept the constructor and * destructor. */ struct netmap_vm_handle_t { struct cdev *dev; struct netmap_priv_d *priv; }; static int netmap_dev_pager_ctor(void *handle, vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t foff, struct ucred *cred, u_short *color) { struct netmap_vm_handle_t *vmh = handle; if (netmap_verbose) nm_prinf("handle %p size %jd prot %d foff %jd", handle, (intmax_t)size, prot, (intmax_t)foff); if (color) *color = 0; dev_ref(vmh->dev); return 0; } static void netmap_dev_pager_dtor(void *handle) { struct netmap_vm_handle_t *vmh = handle; struct cdev *dev = vmh->dev; struct netmap_priv_d *priv = vmh->priv; if (netmap_verbose) nm_prinf("handle %p", handle); netmap_dtor(priv); free(vmh, M_DEVBUF); dev_rel(dev); } static int netmap_dev_pager_fault(vm_object_t object, vm_ooffset_t offset, int prot, vm_page_t *mres) { struct netmap_vm_handle_t *vmh = object->handle; struct netmap_priv_d *priv = vmh->priv; struct netmap_adapter *na = priv->np_na; vm_paddr_t paddr; vm_page_t page; vm_memattr_t memattr; vm_pindex_t pidx; nm_prdis("object %p offset %jd prot %d mres %p", object, (intmax_t)offset, prot, mres); memattr = object->memattr; pidx = OFF_TO_IDX(offset); paddr = 
netmap_mem_ofstophys(na->nm_mem, offset); if (paddr == 0) return VM_PAGER_FAIL; if (((*mres)->flags & PG_FICTITIOUS) != 0) { /* * If the passed in result page is a fake page, update it with * the new physical address. */ page = *mres; vm_page_updatefake(page, paddr, memattr); } else { /* * Replace the passed in reqpage page with our own fake page and * free up the all of the original pages. */ #ifndef VM_OBJECT_WUNLOCK /* FreeBSD < 10.x */ #define VM_OBJECT_WUNLOCK VM_OBJECT_UNLOCK #define VM_OBJECT_WLOCK VM_OBJECT_LOCK #endif /* VM_OBJECT_WUNLOCK */ VM_OBJECT_WUNLOCK(object); page = vm_page_getfake(paddr, memattr); VM_OBJECT_WLOCK(object); vm_page_lock(*mres); vm_page_free(*mres); vm_page_unlock(*mres); *mres = page; vm_page_insert(page, object, pidx); } page->valid = VM_PAGE_BITS_ALL; return (VM_PAGER_OK); } static struct cdev_pager_ops netmap_cdev_pager_ops = { .cdev_pg_ctor = netmap_dev_pager_ctor, .cdev_pg_dtor = netmap_dev_pager_dtor, .cdev_pg_fault = netmap_dev_pager_fault, }; static int netmap_mmap_single(struct cdev *cdev, vm_ooffset_t *foff, vm_size_t objsize, vm_object_t *objp, int prot) { int error; struct netmap_vm_handle_t *vmh; struct netmap_priv_d *priv; vm_object_t obj; if (netmap_verbose) nm_prinf("cdev %p foff %jd size %jd objp %p prot %d", cdev, (intmax_t )*foff, (intmax_t )objsize, objp, prot); vmh = malloc(sizeof(struct netmap_vm_handle_t), M_DEVBUF, M_NOWAIT | M_ZERO); if (vmh == NULL) return ENOMEM; vmh->dev = cdev; NMG_LOCK(); error = devfs_get_cdevpriv((void**)&priv); if (error) goto err_unlock; if (priv->np_nifp == NULL) { error = EINVAL; goto err_unlock; } vmh->priv = priv; priv->np_refs++; NMG_UNLOCK(); obj = cdev_pager_allocate(vmh, OBJT_DEVICE, &netmap_cdev_pager_ops, objsize, prot, *foff, NULL); if (obj == NULL) { nm_prerr("cdev_pager_allocate failed"); error = EINVAL; goto err_deref; } *objp = obj; return 0; err_deref: NMG_LOCK(); priv->np_refs--; err_unlock: NMG_UNLOCK(); // err: free(vmh, M_DEVBUF); return error; } /* * On FreeBSD 
the close routine is only called on the last close on * the device (/dev/netmap) so we cannot do anything useful. * To track close() on individual file descriptors we pass netmap_dtor() to * devfs_set_cdevpriv() on open(). The FreeBSD kernel will call the destructor * when the last fd pointing to the device is closed. * * Note that FreeBSD does not even munmap() on close() so we also have * to track mmap() ourselves, and postpone the call to * netmap_dtor() is called when the process has no open fds and no active * memory maps on /dev/netmap, as in linux. */ static int netmap_close(struct cdev *dev, int fflag, int devtype, struct thread *td) { if (netmap_verbose) nm_prinf("dev %p fflag 0x%x devtype %d td %p", dev, fflag, devtype, td); return 0; } static int netmap_open(struct cdev *dev, int oflags, int devtype, struct thread *td) { struct netmap_priv_d *priv; int error; (void)dev; (void)oflags; (void)devtype; (void)td; NMG_LOCK(); priv = netmap_priv_new(); if (priv == NULL) { error = ENOMEM; goto out; } error = devfs_set_cdevpriv(priv, netmap_dtor); if (error) { netmap_priv_delete(priv); } out: NMG_UNLOCK(); return error; } /******************** kthread wrapper ****************/ #include u_int nm_os_ncpus(void) { return mp_maxid + 1; } struct nm_kctx_ctx { /* Userspace thread (kthread creator). 
*/ struct thread *user_td; /* worker function and parameter */ nm_kctx_worker_fn_t worker_fn; void *worker_private; struct nm_kctx *nmk; /* integer to manage multiple worker contexts (e.g., RX or TX on ptnetmap) */ long type; }; struct nm_kctx { struct thread *worker; struct mtx worker_lock; struct nm_kctx_ctx worker_ctx; int run; /* used to stop kthread */ int attach_user; /* kthread attached to user_process */ int affinity; }; static void nm_kctx_worker(void *data) { struct nm_kctx *nmk = data; struct nm_kctx_ctx *ctx = &nmk->worker_ctx; if (nmk->affinity >= 0) { thread_lock(curthread); sched_bind(curthread, nmk->affinity); thread_unlock(curthread); } while (nmk->run) { /* * check if the parent process dies * (when kthread is attached to user process) */ if (ctx->user_td) { PROC_LOCK(curproc); thread_suspend_check(0); PROC_UNLOCK(curproc); } else { kthread_suspend_check(); } /* Continuously execute worker process. */ ctx->worker_fn(ctx->worker_private); /* worker body */ } kthread_exit(); } void nm_os_kctx_worker_setaff(struct nm_kctx *nmk, int affinity) { nmk->affinity = affinity; } struct nm_kctx * nm_os_kctx_create(struct nm_kctx_cfg *cfg, void *opaque) { struct nm_kctx *nmk = NULL; nmk = malloc(sizeof(*nmk), M_DEVBUF, M_NOWAIT | M_ZERO); if (!nmk) return NULL; mtx_init(&nmk->worker_lock, "nm_kthread lock", NULL, MTX_DEF); nmk->worker_ctx.worker_fn = cfg->worker_fn; nmk->worker_ctx.worker_private = cfg->worker_private; nmk->worker_ctx.type = cfg->type; nmk->affinity = -1; /* attach kthread to user process (ptnetmap) */ nmk->attach_user = cfg->attach_user; return nmk; } int nm_os_kctx_worker_start(struct nm_kctx *nmk) { struct proc *p = NULL; int error = 0; /* Temporarily disable this function as it is currently broken * and causes kernel crashes. The failure can be triggered by * the "vale_polling_enable_disable" test in ctrl-api-test.c. 
*/ return EOPNOTSUPP; if (nmk->worker) return EBUSY; /* check if we want to attach kthread to user process */ if (nmk->attach_user) { nmk->worker_ctx.user_td = curthread; p = curthread->td_proc; } /* enable kthread main loop */ nmk->run = 1; /* create kthread */ if((error = kthread_add(nm_kctx_worker, nmk, p, &nmk->worker, RFNOWAIT /* to be checked */, 0, "nm-kthread-%ld", nmk->worker_ctx.type))) { goto err; } nm_prinf("nm_kthread started td %p", nmk->worker); return 0; err: nm_prerr("nm_kthread start failed err %d", error); nmk->worker = NULL; return error; } void nm_os_kctx_worker_stop(struct nm_kctx *nmk) { if (!nmk->worker) return; /* tell to kthread to exit from main loop */ nmk->run = 0; /* wake up kthread if it sleeps */ kthread_resume(nmk->worker); nmk->worker = NULL; } void nm_os_kctx_destroy(struct nm_kctx *nmk) { if (!nmk) return; if (nmk->worker) nm_os_kctx_worker_stop(nmk); free(nmk, M_DEVBUF); } /******************** kqueue support ****************/ /* * In addition to calling selwakeuppri(), nm_os_selwakeup() also * needs to call knote() to wake up kqueue listeners. * This operation is deferred to a taskqueue in order to avoid possible * lock order reversals; these may happen because knote() grabs a * private lock associated to the 'si' (see struct selinfo, * struct nm_selinfo, and nm_os_selinfo_init), and nm_os_selwakeup() * can be called while holding the lock associated to a different * 'si'. * When calling knote() we use a non-zero 'hint' argument to inform * the netmap_knrw() function that it is being called from * 'nm_os_selwakeup'; this is necessary because when netmap_knrw() is * called by the kevent subsystem (i.e. kevent_scan()) we also need to * call netmap_poll(). * * The netmap_kqfilter() function registers one or another f_event * depending on read or write mode. A pointer to the struct * 'netmap_priv_d' is stored into kn->kn_hook, so that it can later * be passed to netmap_poll(). 
We pass NULL as a third argument to * netmap_poll(), so that the latter only runs the txsync/rxsync * (if necessary), and skips the nm_os_selrecord() calls. */ void nm_os_selwakeup(struct nm_selinfo *si) { selwakeuppri(&si->si, PI_NET); - taskqueue_enqueue(si->ntfytq, &si->ntfytask); + if (si->kqueue_users > 0) { + taskqueue_enqueue(si->ntfytq, &si->ntfytask); + } } void nm_os_selrecord(struct thread *td, struct nm_selinfo *si) { selrecord(td, &si->si); } static void netmap_knrdetach(struct knote *kn) { struct netmap_priv_d *priv = (struct netmap_priv_d *)kn->kn_hook; - struct selinfo *si = &priv->np_si[NR_RX]->si; + struct nm_selinfo *si = priv->np_si[NR_RX]; - nm_prinf("remove selinfo %p", si); - knlist_remove(&si->si_note, kn, /*islocked=*/0); + knlist_remove(&si->si.si_note, kn, /*islocked=*/0); + NMG_LOCK(); + KASSERT(si->kqueue_users > 0, ("kqueue_user underflow on %s", + si->mtxname)); + si->kqueue_users--; + nm_prinf("kqueue users for %s: %d", si->mtxname, si->kqueue_users); + NMG_UNLOCK(); } static void netmap_knwdetach(struct knote *kn) { struct netmap_priv_d *priv = (struct netmap_priv_d *)kn->kn_hook; - struct selinfo *si = &priv->np_si[NR_TX]->si; + struct nm_selinfo *si = priv->np_si[NR_TX]; - nm_prinf("remove selinfo %p", si); - knlist_remove(&si->si_note, kn, /*islocked=*/0); + knlist_remove(&si->si.si_note, kn, /*islocked=*/0); + NMG_LOCK(); + si->kqueue_users--; + nm_prinf("kqueue users for %s: %d", si->mtxname, si->kqueue_users); + NMG_UNLOCK(); } /* * Callback triggered by netmap notifications (see netmap_notify()), * and by the application calling kevent(). In the former case we * just return 1 (events ready), since we are not able to do better. * In the latter case we use netmap_poll() to see which events are * ready. */ static int netmap_knrw(struct knote *kn, long hint, int events) { struct netmap_priv_d *priv; int revents; if (hint != 0) { /* Called from netmap_notify(), typically from a * thread different from the one issuing kevent(). 
* Assume we are ready. */ return 1; } /* Called from kevent(). */ priv = kn->kn_hook; revents = netmap_poll(priv, events, /*thread=*/NULL); return (events & revents) ? 1 : 0; } static int netmap_knread(struct knote *kn, long hint) { return netmap_knrw(kn, hint, POLLIN); } static int netmap_knwrite(struct knote *kn, long hint) { return netmap_knrw(kn, hint, POLLOUT); } static struct filterops netmap_rfiltops = { .f_isfd = 1, .f_detach = netmap_knrdetach, .f_event = netmap_knread, }; static struct filterops netmap_wfiltops = { .f_isfd = 1, .f_detach = netmap_knwdetach, .f_event = netmap_knwrite, }; /* * This is called when a thread invokes kevent() to record * a change in the configuration of the kqueue(). * The 'priv' is the one associated to the open netmap device. */ static int netmap_kqfilter(struct cdev *dev, struct knote *kn) { struct netmap_priv_d *priv; int error; struct netmap_adapter *na; struct nm_selinfo *si; int ev = kn->kn_filter; if (ev != EVFILT_READ && ev != EVFILT_WRITE) { nm_prerr("bad filter request %d", ev); return 1; } error = devfs_get_cdevpriv((void**)&priv); if (error) { nm_prerr("device not yet setup"); return 1; } na = priv->np_na; if (na == NULL) { nm_prerr("no netmap adapter for this file descriptor"); return 1; } /* the si is indicated in the priv */ si = priv->np_si[(ev == EVFILT_WRITE) ? NR_TX : NR_RX]; kn->kn_fop = (ev == EVFILT_WRITE) ? 
&netmap_wfiltops : &netmap_rfiltops; kn->kn_hook = priv; + NMG_LOCK(); + si->kqueue_users++; + nm_prinf("kqueue users for %s: %d", si->mtxname, si->kqueue_users); + NMG_UNLOCK(); knlist_add(&si->si.si_note, kn, /*islocked=*/0); return 0; } static int freebsd_netmap_poll(struct cdev *cdevi __unused, int events, struct thread *td) { struct netmap_priv_d *priv; if (devfs_get_cdevpriv((void **)&priv)) { return POLLERR; } return netmap_poll(priv, events, td); } static int freebsd_netmap_ioctl(struct cdev *dev __unused, u_long cmd, caddr_t data, int ffla __unused, struct thread *td) { int error; struct netmap_priv_d *priv; CURVNET_SET(TD_TO_VNET(td)); error = devfs_get_cdevpriv((void **)&priv); if (error) { /* XXX ENOENT should be impossible, since the priv * is now created in the open */ if (error == ENOENT) error = ENXIO; goto out; } error = netmap_ioctl(priv, cmd, data, td, /*nr_body_is_user=*/1); out: CURVNET_RESTORE(); return error; } void nm_os_onattach(struct ifnet *ifp) { ifp->if_capabilities |= IFCAP_NETMAP; } void nm_os_onenter(struct ifnet *ifp) { struct netmap_adapter *na = NA(ifp); na->if_transmit = ifp->if_transmit; ifp->if_transmit = netmap_transmit; ifp->if_capenable |= IFCAP_NETMAP; } void nm_os_onexit(struct ifnet *ifp) { struct netmap_adapter *na = NA(ifp); ifp->if_transmit = na->if_transmit; ifp->if_capenable &= ~IFCAP_NETMAP; } extern struct cdevsw netmap_cdevsw; /* XXX used in netmap.c, should go elsewhere */ struct cdevsw netmap_cdevsw = { .d_version = D_VERSION, .d_name = "netmap", .d_open = netmap_open, .d_mmap_single = netmap_mmap_single, .d_ioctl = freebsd_netmap_ioctl, .d_poll = freebsd_netmap_poll, .d_kqfilter = netmap_kqfilter, .d_close = netmap_close, }; /*--- end of kqueue support ----*/ /* * Kernel entry point. * * Initialize/finalize the module and return. * * Return 0 on success, errno on failure. 
*/ static int netmap_loader(__unused struct module *module, int event, __unused void *arg) { int error = 0; switch (event) { case MOD_LOAD: error = netmap_init(); break; case MOD_UNLOAD: /* * if some one is still using netmap, * then the module can not be unloaded. */ if (netmap_use_count) { nm_prerr("netmap module can not be unloaded - netmap_use_count: %d", netmap_use_count); error = EBUSY; break; } netmap_fini(); break; default: error = EOPNOTSUPP; break; } return (error); } #ifdef DEV_MODULE_ORDERED /* * The netmap module contains three drivers: (i) the netmap character device * driver; (ii) the ptnetmap memdev PCI device driver, (iii) the ptnet PCI * device driver. The attach() routines of both (ii) and (iii) need the * lock of the global allocator, and such lock is initialized in netmap_init(), * which is part of (i). * Therefore, we make sure that (i) is loaded before (ii) and (iii), using * the 'order' parameter of driver declaration macros. For (i), we specify * SI_ORDER_MIDDLE, while higher orders are used with the DRIVER_MODULE_ORDERED * macros for (ii) and (iii). */ DEV_MODULE_ORDERED(netmap, netmap_loader, NULL, SI_ORDER_MIDDLE); #else /* !DEV_MODULE_ORDERED */ DEV_MODULE(netmap, netmap_loader, NULL); #endif /* DEV_MODULE_ORDERED */ MODULE_DEPEND(netmap, pci, 1, 1, 1); MODULE_VERSION(netmap, 1); /* reduce conditional code */ // linux API, use for the knlist in FreeBSD /* use a private mutex for the knlist */ Index: head/sys/dev/netmap/netmap_kern.h =================================================================== --- head/sys/dev/netmap/netmap_kern.h (revision 344252) +++ head/sys/dev/netmap/netmap_kern.h (revision 344253) @@ -1,2405 +1,2408 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo * Copyright (C) 2013-2016 Universita` di Pisa * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * $FreeBSD$ * * The header contains the definitions of constants and function * prototypes used only in kernelspace. 
*/ #ifndef _NET_NETMAP_KERN_H_ #define _NET_NETMAP_KERN_H_ #if defined(linux) #if defined(CONFIG_NETMAP_EXTMEM) #define WITH_EXTMEM #endif #if defined(CONFIG_NETMAP_VALE) #define WITH_VALE #endif #if defined(CONFIG_NETMAP_PIPE) #define WITH_PIPES #endif #if defined(CONFIG_NETMAP_MONITOR) #define WITH_MONITOR #endif #if defined(CONFIG_NETMAP_GENERIC) #define WITH_GENERIC #endif #if defined(CONFIG_NETMAP_PTNETMAP) #define WITH_PTNETMAP #endif #if defined(CONFIG_NETMAP_SINK) #define WITH_SINK #endif #if defined(CONFIG_NETMAP_NULL) #define WITH_NMNULL #endif #elif defined (_WIN32) #define WITH_VALE // comment out to disable VALE support #define WITH_PIPES #define WITH_MONITOR #define WITH_GENERIC #define WITH_NMNULL #else /* neither linux nor windows */ #define WITH_VALE // comment out to disable VALE support #define WITH_PIPES #define WITH_MONITOR #define WITH_GENERIC #define WITH_PTNETMAP /* ptnetmap guest support */ #define WITH_EXTMEM #define WITH_NMNULL #endif #if defined(__FreeBSD__) #include #define likely(x) __builtin_expect((long)!!(x), 1L) #define unlikely(x) __builtin_expect((long)!!(x), 0L) #define __user #define NM_LOCK_T struct mtx /* low level spinlock, used to protect queues */ #define NM_MTX_T struct sx /* OS-specific mutex (sleepable) */ #define NM_MTX_INIT(m) sx_init(&(m), #m) #define NM_MTX_DESTROY(m) sx_destroy(&(m)) #define NM_MTX_LOCK(m) sx_xlock(&(m)) #define NM_MTX_SPINLOCK(m) while (!sx_try_xlock(&(m))) ; #define NM_MTX_UNLOCK(m) sx_xunlock(&(m)) #define NM_MTX_ASSERT(m) sx_assert(&(m), SA_XLOCKED) #define NM_SELINFO_T struct nm_selinfo #define NM_SELRECORD_T struct thread #define MBUF_LEN(m) ((m)->m_pkthdr.len) #define MBUF_TXQ(m) ((m)->m_pkthdr.flowid) #define MBUF_TRANSMIT(na, ifp, m) ((na)->if_transmit(ifp, m)) #define GEN_TX_MBUF_IFP(m) ((m)->m_pkthdr.rcvif) #define NM_ATOMIC_T volatile int /* required by atomic/bitops.h */ /* atomic operations */ #include #define NM_ATOMIC_TEST_AND_SET(p) (!atomic_cmpset_acq_int((p), 0, 1)) #define 
NM_ATOMIC_CLEAR(p) atomic_store_rel_int((p), 0) #if __FreeBSD_version >= 1100030 #define WNA(_ifp) (_ifp)->if_netmap #else /* older FreeBSD */ #define WNA(_ifp) (_ifp)->if_pspare[0] #endif /* older FreeBSD */ #if __FreeBSD_version >= 1100005 struct netmap_adapter *netmap_getna(if_t ifp); #endif #if __FreeBSD_version >= 1100027 #define MBUF_REFCNT(m) ((m)->m_ext.ext_count) #define SET_MBUF_REFCNT(m, x) (m)->m_ext.ext_count = x #else #define MBUF_REFCNT(m) ((m)->m_ext.ref_cnt ? *((m)->m_ext.ref_cnt) : -1) #define SET_MBUF_REFCNT(m, x) *((m)->m_ext.ref_cnt) = x #endif #define MBUF_QUEUED(m) 1 struct nm_selinfo { + /* Support for select(2) and poll(2). */ struct selinfo si; + /* Support for kqueue(9). See comments in netmap_freebsd.c */ struct taskqueue *ntfytq; struct task ntfytask; struct mtx m; char mtxname[32]; + int kqueue_users; }; struct hrtimer { /* Not used in FreeBSD. */ }; #define NM_BNS_GET(b) #define NM_BNS_PUT(b) #elif defined (linux) #define NM_LOCK_T safe_spinlock_t // see bsd_glue.h #define NM_SELINFO_T wait_queue_head_t #define MBUF_LEN(m) ((m)->len) #define MBUF_TRANSMIT(na, ifp, m) \ ({ \ /* Avoid infinite recursion with generic. */ \ m->priority = NM_MAGIC_PRIORITY_TX; \ (((struct net_device_ops *)(na)->if_transmit)->ndo_start_xmit(m, ifp)); \ 0; \ }) /* See explanation in nm_os_generic_xmit_frame. */ #define GEN_TX_MBUF_IFP(m) ((struct ifnet *)skb_shinfo(m)->destructor_arg) #define NM_ATOMIC_T volatile long unsigned int #define NM_MTX_T struct mutex /* OS-specific sleepable lock */ #define NM_MTX_INIT(m) mutex_init(&(m)) #define NM_MTX_DESTROY(m) do { (void)(m); } while (0) #define NM_MTX_LOCK(m) mutex_lock(&(m)) #define NM_MTX_UNLOCK(m) mutex_unlock(&(m)) #define NM_MTX_ASSERT(m) mutex_is_locked(&(m)) #ifndef DEV_NETMAP #define DEV_NETMAP #endif /* DEV_NETMAP */ #elif defined (__APPLE__) #warning apple support is incomplete. 
#define likely(x) __builtin_expect(!!(x), 1) #define unlikely(x) __builtin_expect(!!(x), 0) #define NM_LOCK_T IOLock * #define NM_SELINFO_T struct selinfo #define MBUF_LEN(m) ((m)->m_pkthdr.len) #elif defined (_WIN32) #include "../../../WINDOWS/win_glue.h" #define NM_SELRECORD_T IO_STACK_LOCATION #define NM_SELINFO_T win_SELINFO // see win_glue.h #define NM_LOCK_T win_spinlock_t // see win_glue.h #define NM_MTX_T KGUARDED_MUTEX /* OS-specific mutex (sleepable) */ #define NM_MTX_INIT(m) KeInitializeGuardedMutex(&m); #define NM_MTX_DESTROY(m) do { (void)(m); } while (0) #define NM_MTX_LOCK(m) KeAcquireGuardedMutex(&(m)) #define NM_MTX_UNLOCK(m) KeReleaseGuardedMutex(&(m)) #define NM_MTX_ASSERT(m) assert(&m.Count>0) //These linknames are for the NDIS driver #define NETMAP_NDIS_LINKNAME_STRING L"\\DosDevices\\NMAPNDIS" #define NETMAP_NDIS_NTDEVICE_STRING L"\\Device\\NMAPNDIS" //Definition of internal driver-to-driver ioctl codes #define NETMAP_KERNEL_XCHANGE_POINTERS _IO('i', 180) #define NETMAP_KERNEL_SEND_SHUTDOWN_SIGNAL _IO_direct('i', 195) typedef struct hrtimer{ KTIMER timer; BOOLEAN active; KDPC deferred_proc; }; /* MSVC does not have likely/unlikely support */ #ifdef _MSC_VER #define likely(x) (x) #define unlikely(x) (x) #else #define likely(x) __builtin_expect((long)!!(x), 1L) #define unlikely(x) __builtin_expect((long)!!(x), 0L) #endif //_MSC_VER #else #error unsupported platform #endif /* end - platform-specific code */ #ifndef _WIN32 /* support for emulated sysctl */ #define SYSBEGIN(x) #define SYSEND #endif /* _WIN32 */ #define NM_ACCESS_ONCE(x) (*(volatile __typeof__(x) *)&(x)) #define NMG_LOCK_T NM_MTX_T #define NMG_LOCK_INIT() NM_MTX_INIT(netmap_global_lock) #define NMG_LOCK_DESTROY() NM_MTX_DESTROY(netmap_global_lock) #define NMG_LOCK() NM_MTX_LOCK(netmap_global_lock) #define NMG_UNLOCK() NM_MTX_UNLOCK(netmap_global_lock) #define NMG_LOCK_ASSERT() NM_MTX_ASSERT(netmap_global_lock) #if defined(__FreeBSD__) #define nm_prerr_int printf #define nm_prinf_int 
printf #elif defined (_WIN32) #define nm_prerr_int DbgPrint #define nm_prinf_int DbgPrint #elif defined(linux) #define nm_prerr_int(fmt, arg...) printk(KERN_ERR fmt, ##arg) #define nm_prinf_int(fmt, arg...) printk(KERN_INFO fmt, ##arg) #endif #define nm_prinf(format, ...) \ do { \ struct timeval __xxts; \ microtime(&__xxts); \ nm_prinf_int("%03d.%06d [%4d] %-25s " format "\n",\ (int)__xxts.tv_sec % 1000, (int)__xxts.tv_usec, \ __LINE__, __FUNCTION__, ##__VA_ARGS__); \ } while (0) #define nm_prerr(format, ...) \ do { \ struct timeval __xxts; \ microtime(&__xxts); \ nm_prerr_int("%03d.%06d [%4d] %-25s " format "\n",\ (int)__xxts.tv_sec % 1000, (int)__xxts.tv_usec, \ __LINE__, __FUNCTION__, ##__VA_ARGS__); \ } while (0) /* Disabled printf (used to be nm_prdis). */ #define nm_prdis(format, ...) /* Rate limited, lps indicates how many per second. */ #define nm_prlim(lps, format, ...) \ do { \ static int t0, __cnt; \ if (t0 != time_second) { \ t0 = time_second; \ __cnt = 0; \ } \ if (__cnt++ < lps) \ nm_prinf(format, ##__VA_ARGS__); \ } while (0) struct netmap_adapter; struct nm_bdg_fwd; struct nm_bridge; struct netmap_priv_d; struct nm_bdg_args; /* os-specific NM_SELINFO_T initialization/destruction functions */ int nm_os_selinfo_init(NM_SELINFO_T *, const char *name); void nm_os_selinfo_uninit(NM_SELINFO_T *); const char *nm_dump_buf(char *p, int len, int lim, char *dst); void nm_os_selwakeup(NM_SELINFO_T *si); void nm_os_selrecord(NM_SELRECORD_T *sr, NM_SELINFO_T *si); int nm_os_ifnet_init(void); void nm_os_ifnet_fini(void); void nm_os_ifnet_lock(void); void nm_os_ifnet_unlock(void); unsigned nm_os_ifnet_mtu(struct ifnet *ifp); void nm_os_get_module(void); void nm_os_put_module(void); void netmap_make_zombie(struct ifnet *); void netmap_undo_zombie(struct ifnet *); /* os independent alloc/realloc/free */ void *nm_os_malloc(size_t); void *nm_os_vmalloc(size_t); void *nm_os_realloc(void *, size_t new_size, size_t old_size); void nm_os_free(void *); void nm_os_vfree(void 
*); /* os specific attach/detach enter/exit-netmap-mode routines */ void nm_os_onattach(struct ifnet *); void nm_os_ondetach(struct ifnet *); void nm_os_onenter(struct ifnet *); void nm_os_onexit(struct ifnet *); /* passes a packet up to the host stack. * If the packet is sent (or dropped) immediately it returns NULL, * otherwise it links the packet to prev and returns m. * In this case, a final call with m=NULL and prev != NULL will send up * the entire chain to the host stack. */ void *nm_os_send_up(struct ifnet *, struct mbuf *m, struct mbuf *prev); int nm_os_mbuf_has_seg_offld(struct mbuf *m); int nm_os_mbuf_has_csum_offld(struct mbuf *m); #include "netmap_mbq.h" extern NMG_LOCK_T netmap_global_lock; enum txrx { NR_RX = 0, NR_TX = 1, NR_TXRX }; static __inline const char* nm_txrx2str(enum txrx t) { return (t== NR_RX ? "RX" : "TX"); } static __inline enum txrx nm_txrx_swap(enum txrx t) { return (t== NR_RX ? NR_TX : NR_RX); } #define for_rx_tx(t) for ((t) = 0; (t) < NR_TXRX; (t)++) #ifdef WITH_MONITOR struct netmap_zmon_list { struct netmap_kring *next; struct netmap_kring *prev; }; #endif /* WITH_MONITOR */ /* * private, kernel view of a ring. Keeps track of the status of * a ring across system calls. * * nr_hwcur index of the next buffer to refill. * It corresponds to ring->head * at the time the system call returns. * * nr_hwtail index of the first buffer owned by the kernel. * On RX, hwcur->hwtail are receive buffers * not yet released. hwcur is advanced following * ring->head, hwtail is advanced on incoming packets, * and a wakeup is generated when hwtail passes ring->cur * On TX, hwcur->rcur have been filled by the sender * but not sent yet to the NIC; rcur->hwtail are available * for new transmissions, and hwtail->hwcur-1 are pending * transmissions not yet acknowledged. * * The indexes in the NIC and netmap rings are offset by nkr_hwofs slots. * This is so that, on a reset, buffers owned by userspace are not * modified by the kernel. 
In particular: * RX rings: the next empty buffer (hwtail + hwofs) coincides with * the next empty buffer as known by the hardware (next_to_check or so). * TX rings: hwcur + hwofs coincides with next_to_send * * The following fields are used to implement lock-free copy of packets * from input to output ports in VALE switch: * nkr_hwlease buffer after the last one being copied. * A writer in nm_bdg_flush reserves N buffers * from nr_hwlease, advances it, then does the * copy outside the lock. * In RX rings (used for VALE ports), * nkr_hwtail <= nkr_hwlease < nkr_hwcur+N-1 * In TX rings (used for NIC or host stack ports) * nkr_hwcur <= nkr_hwlease < nkr_hwtail * nkr_leases array of nkr_num_slots where writers can report * completion of their block. NR_NOSLOT (~0) indicates * that the writer has not finished yet * nkr_lease_idx index of next free slot in nr_leases, to be assigned * * The kring is manipulated by txsync/rxsync and generic netmap function. * * Concurrent rxsync or txsync on the same ring are prevented through * by nm_kr_(try)lock() which in turn uses nr_busy. This is all we need * for NIC rings, and for TX rings attached to the host stack. * * RX rings attached to the host stack use an mbq (rx_queue) on both * rxsync_from_host() and netmap_transmit(). The mbq is protected * by its internal lock. * * RX rings attached to the VALE switch are accessed by both senders * and receiver. They are protected through the q_lock on the RX ring. */ struct netmap_kring { struct netmap_ring *ring; uint32_t nr_hwcur; /* should be nr_hwhead */ uint32_t nr_hwtail; /* * Copies of values in user rings, so we do not need to look * at the ring (which could be modified). These are set in the * *sync_prologue()/finalize() routines. */ uint32_t rhead; uint32_t rcur; uint32_t rtail; uint32_t nr_kflags; /* private driver flags */ #define NKR_PENDINTR 0x1 // Pending interrupt. 
#define NKR_EXCLUSIVE 0x2 /* exclusive binding */ #define NKR_FORWARD 0x4 /* (host ring only) there are packets to forward */ #define NKR_NEEDRING 0x8 /* ring needed even if users==0 * (used internally by pipes and * by ptnetmap host ports) */ #define NKR_NOINTR 0x10 /* don't use interrupts on this ring */ #define NKR_FAKERING 0x20 /* don't allocate/free buffers */ uint32_t nr_mode; uint32_t nr_pending_mode; #define NKR_NETMAP_OFF 0x0 #define NKR_NETMAP_ON 0x1 uint32_t nkr_num_slots; /* * On a NIC reset, the NIC ring indexes may be reset but the * indexes in the netmap rings remain the same. nkr_hwofs * keeps track of the offset between the two. */ int32_t nkr_hwofs; /* last_reclaim is an opaque marker to help reduce the frequency * of operations such as reclaiming tx buffers. A possible use * is to set it to ticks and do the reclaim only once per tick. */ uint64_t last_reclaim; NM_SELINFO_T si; /* poll/select wait queue */ NM_LOCK_T q_lock; /* protects kring and ring. */ NM_ATOMIC_T nr_busy; /* prevent concurrent syscalls */ /* the adapter that owns this kring */ struct netmap_adapter *na; /* the adapter that wants to be notified when this kring has * new slots available. This is usually the same as the above, * but wrappers may let it point to themselves */ struct netmap_adapter *notify_na; /* The following fields are for VALE switch support */ struct nm_bdg_fwd *nkr_ft; uint32_t *nkr_leases; #define NR_NOSLOT ((uint32_t)~0) /* used in nkr_*lease* */ uint32_t nkr_hwlease; uint32_t nkr_lease_idx; /* while nkr_stopped is set, no new [tr]xsync operations can * be started on this kring. * This is used by netmap_disable_all_rings() * to find a synchronization point where critical data * structures pointed to by the kring can be added or removed */ volatile int nkr_stopped; /* Support for adapters without native netmap support. 
* On tx rings we preallocate an array of tx buffers * (same size as the netmap ring), on rx rings we * store incoming mbufs in a queue that is drained by * a rxsync. */ struct mbuf **tx_pool; struct mbuf *tx_event; /* TX event used as a notification */ NM_LOCK_T tx_event_lock; /* protects the tx_event mbuf */ struct mbq rx_queue; /* intercepted rx mbufs. */ uint32_t users; /* existing bindings for this ring */ uint32_t ring_id; /* kring identifier */ enum txrx tx; /* kind of ring (tx or rx) */ char name[64]; /* diagnostic */ /* [tx]sync callback for this kring. * The default nm_kring_create callback (netmap_krings_create) * sets the nm_sync callback of each hardware tx(rx) kring to * the corresponding nm_txsync(nm_rxsync) taken from the * netmap_adapter; moreover, it sets the sync callback * of the host tx(rx) ring to netmap_txsync_to_host * (netmap_rxsync_from_host). * * Overrides: the above configuration is not changed by * any of the nm_krings_create callbacks. */ int (*nm_sync)(struct netmap_kring *kring, int flags); int (*nm_notify)(struct netmap_kring *kring, int flags); #ifdef WITH_PIPES struct netmap_kring *pipe; /* if this is a pipe ring, * pointer to the other end */ uint32_t pipe_tail; /* hwtail updated by the other end */ #endif /* WITH_PIPES */ int (*save_notify)(struct netmap_kring *kring, int flags); #ifdef WITH_MONITOR /* array of krings that are monitoring this kring */ struct netmap_kring **monitors; uint32_t max_monitors; /* current size of the monitors array */ uint32_t n_monitors; /* next unused entry in the monitor array */ uint32_t mon_pos[NR_TXRX]; /* index of this ring in the monitored ring array */ uint32_t mon_tail; /* last seen slot on rx */ /* circular list of zero-copy monitors */ struct netmap_zmon_list zmon_list[NR_TXRX]; /* * Monitors work by intercepting the sync and notify callbacks of the * monitored krings. 
This is implemented by replacing the pointers * above and saving the previous ones in mon_* pointers below */ int (*mon_sync)(struct netmap_kring *kring, int flags); int (*mon_notify)(struct netmap_kring *kring, int flags); #endif } #ifdef _WIN32 __declspec(align(64)); #else __attribute__((__aligned__(64))); #endif /* return 1 iff the kring needs to be turned on */ static inline int nm_kring_pending_on(struct netmap_kring *kring) { return kring->nr_pending_mode == NKR_NETMAP_ON && kring->nr_mode == NKR_NETMAP_OFF; } /* return 1 iff the kring needs to be turned off */ static inline int nm_kring_pending_off(struct netmap_kring *kring) { return kring->nr_pending_mode == NKR_NETMAP_OFF && kring->nr_mode == NKR_NETMAP_ON; } /* return the next index, with wraparound */ static inline uint32_t nm_next(uint32_t i, uint32_t lim) { return unlikely (i == lim) ? 0 : i + 1; } /* return the previous index, with wraparound */ static inline uint32_t nm_prev(uint32_t i, uint32_t lim) { return unlikely (i == 0) ? lim : i - 1; } /* * * Here is the layout for the Rx and Tx rings. RxRING TxRING +-----------------+ +-----------------+ | | | | | free | | free | +-----------------+ +-----------------+ head->| owned by user |<-hwcur | not sent to nic |<-hwcur | | | yet | +-----------------+ | | cur->| available to | | | | user, not read | +-----------------+ | yet | cur->| (being | | | | prepared) | | | | | +-----------------+ + ------ + tail->| |<-hwtail | |<-hwlease | (being | ... | | ... | prepared) | ... | | ... +-----------------+ ... | | ... | |<-hwlease +-----------------+ | | tail->| |<-hwtail | | | | | | | | | | | | +-----------------+ +-----------------+ * The cur/tail (user view) and hwcur/hwtail (kernel view) * are used in the normal operation of the card. * * When a ring is the output of a switch port (Rx ring for * a VALE port, Tx ring for the host stack or NIC), slots * are reserved in blocks through 'hwlease' which points * to the next unused slot. 
* On an Rx ring, hwlease is always after hwtail, * and completions cause hwtail to advance. * On a Tx ring, hwlease is always between cur and hwtail, * and completions cause cur to advance. * * nm_kr_space() returns the maximum number of slots that * can be assigned. * nm_kr_lease() reserves the required number of buffers, * advances nkr_hwlease and also returns an entry in * a circular array where completions should be reported. */ struct lut_entry; #ifdef __FreeBSD__ #define plut_entry lut_entry #endif struct netmap_lut { struct lut_entry *lut; struct plut_entry *plut; uint32_t objtotal; /* max buffer index */ uint32_t objsize; /* buffer size */ }; struct netmap_vp_adapter; // forward struct nm_bridge; /* Struct to be filled by nm_config callbacks. */ struct nm_config_info { unsigned num_tx_rings; unsigned num_rx_rings; unsigned num_tx_descs; unsigned num_rx_descs; unsigned rx_buf_maxsize; }; /* * default type for the magic field. * May be overridden in glue code. */ #ifndef NM_OS_MAGIC #define NM_OS_MAGIC uint32_t #endif /* !NM_OS_MAGIC */ /* * The "struct netmap_adapter" extends the "struct adapter" * (or equivalent) device descriptor. * It contains all base fields needed to support netmap operation. * There are in fact different types of netmap adapters * (native, generic, VALE switch...) so a netmap_adapter is * just the first field in the derived type. */ struct netmap_adapter { /* * On linux we do not have a good way to tell if an interface * is netmap-capable. So we always use the following trick: * NA(ifp) points here, and the first entry (which hopefully * always exists and is at least 32 bits) contains a magic * value which we can use to detect that the interface is good. */ NM_OS_MAGIC magic; uint32_t na_flags; /* enabled, and other flags */ #define NAF_SKIP_INTR 1 /* use the regular interrupt handler. 
* useful during initialization */ #define NAF_SW_ONLY 2 /* forward packets only to sw adapter */ #define NAF_BDG_MAYSLEEP 4 /* the bridge is allowed to sleep when * forwarding packets coming from this * interface */ #define NAF_MEM_OWNER 8 /* the adapter uses its own memory area * that cannot be changed */ #define NAF_NATIVE 16 /* the adapter is native. * Virtual ports (non persistent vale ports, * pipes, monitors...) should never use * this flag. */ #define NAF_NETMAP_ON 32 /* netmap is active (either native or * emulated). Where possible (e.g. FreeBSD) * IFCAP_NETMAP also mirrors this flag. */ #define NAF_HOST_RINGS 64 /* the adapter supports the host rings */ #define NAF_FORCE_NATIVE 128 /* the adapter is always NATIVE */ /* free */ #define NAF_MOREFRAG 512 /* the adapter supports NS_MOREFRAG */ #define NAF_ZOMBIE (1U<<30) /* the nic driver has been unloaded */ #define NAF_BUSY (1U<<31) /* the adapter is used internally and * cannot be registered from userspace */ int active_fds; /* number of user-space descriptors using this interface, which is equal to the number of struct netmap_if objs in the mapped region. */ u_int num_rx_rings; /* number of adapter receive rings */ u_int num_tx_rings; /* number of adapter transmit rings */ u_int num_host_rx_rings; /* number of host receive rings */ u_int num_host_tx_rings; /* number of host transmit rings */ u_int num_tx_desc; /* number of descriptor in each queue */ u_int num_rx_desc; /* tx_rings and rx_rings are private but allocated as a * contiguous chunk of memory. Each array has N+K entries, * N for the hardware rings and K for the host rings. */ struct netmap_kring **tx_rings; /* array of TX rings. */ struct netmap_kring **rx_rings; /* array of RX rings. 
*/ void *tailroom; /* space below the rings array */ /* (used for leases) */ NM_SELINFO_T si[NR_TXRX]; /* global wait queues */ /* count users of the global wait queues */ int si_users[NR_TXRX]; void *pdev; /* used to store pci device */ /* copy of if_qflush and if_transmit pointers, to intercept * packets from the network stack when netmap is active. */ int (*if_transmit)(struct ifnet *, struct mbuf *); /* copy of if_input for netmap_send_up() */ void (*if_input)(struct ifnet *, struct mbuf *); /* Back reference to the parent ifnet struct. Used for * hardware ports (emulated netmap included). */ struct ifnet *ifp; /* adapter is ifp->if_softc */ /*---- callbacks for this netmap adapter -----*/ /* * nm_dtor() is the cleanup routine called when destroying * the adapter. * Called with NMG_LOCK held. * * nm_register() is called on NIOCREGIF and close() to enter * or exit netmap mode on the NIC * Called with NMG_LOCK held. * * nm_txsync() pushes packets to the underlying hw/switch * * nm_rxsync() collects packets from the underlying hw/switch * * nm_config() returns configuration information from the OS * Called with NMG_LOCK held. * * nm_krings_create() create and init the tx_rings and * rx_rings arrays of kring structures. In particular, * set the nm_sync callbacks for each ring. * There is no need to also allocate the corresponding * netmap_rings, since netmap_mem_rings_create() will always * be called to provide the missing ones. * Called with NMG_LOCK held. * * nm_krings_delete() cleanup and delete the tx_rings and rx_rings * arrays * Called with NMG_LOCK held. * * nm_notify() is used to act after data have become available * (or the stopped state of the ring has changed) * For hw devices this is typically a selwakeup(), * but for NIC/host ports attached to a switch (or vice-versa) * we also need to invoke the 'txsync' code downstream. * This callback pointer is actually used only to initialize * kring->nm_notify. 
* Return values are the same as for netmap_rx_irq(). */ void (*nm_dtor)(struct netmap_adapter *); int (*nm_register)(struct netmap_adapter *, int onoff); void (*nm_intr)(struct netmap_adapter *, int onoff); int (*nm_txsync)(struct netmap_kring *kring, int flags); int (*nm_rxsync)(struct netmap_kring *kring, int flags); int (*nm_notify)(struct netmap_kring *kring, int flags); #define NAF_FORCE_READ 1 #define NAF_FORCE_RECLAIM 2 #define NAF_CAN_FORWARD_DOWN 4 /* return configuration information */ int (*nm_config)(struct netmap_adapter *, struct nm_config_info *info); int (*nm_krings_create)(struct netmap_adapter *); void (*nm_krings_delete)(struct netmap_adapter *); /* * nm_bdg_attach() initializes the na_vp field to point * to an adapter that can be attached to a VALE switch. If the * current adapter is already a VALE port, na_vp is simply a cast; * otherwise, na_vp points to a netmap_bwrap_adapter. * If applicable, this callback also initializes na_hostvp, * that can be used to connect the adapter host rings to the * switch. * Called with NMG_LOCK held. * * nm_bdg_ctl() is called on the actual attach/detach to/from * the switch, to perform adapter-specific * initializations * Called with NMG_LOCK held. */ int (*nm_bdg_attach)(const char *bdg_name, struct netmap_adapter *, struct nm_bridge *); int (*nm_bdg_ctl)(struct nmreq_header *, struct netmap_adapter *); /* adapter used to attach this adapter to a VALE switch (if any) */ struct netmap_vp_adapter *na_vp; /* adapter used to attach the host rings of this adapter * to a VALE switch (if any) */ struct netmap_vp_adapter *na_hostvp; /* standard refcount to control the lifetime of the adapter * (it should be equal to the lifetime of the corresponding ifp) */ int na_refcount; /* memory allocator (opaque) * We also cache a pointer to the lut_entry for translating * buffer addresses, the total number of buffers and the buffer size. 
*/ struct netmap_mem_d *nm_mem; struct netmap_mem_d *nm_mem_prev; struct netmap_lut na_lut; /* additional information attached to this adapter * by other netmap subsystems. Currently used by * bwrap, LINUX/v1000 and ptnetmap */ void *na_private; /* array of pipes that have this adapter as a parent */ struct netmap_pipe_adapter **na_pipes; int na_next_pipe; /* next free slot in the array */ int na_max_pipes; /* size of the array */ /* Offset of ethernet header for each packet. */ u_int virt_hdr_len; /* Max number of bytes that the NIC can store in the buffer * referenced by each RX descriptor. This translates to the maximum * bytes that a single netmap slot can reference. Larger packets * require NS_MOREFRAG support. */ unsigned rx_buf_maxsize; char name[NETMAP_REQ_IFNAMSIZ]; /* used at least by pipes */ #ifdef WITH_MONITOR unsigned long monitor_id; /* debugging */ #endif }; static __inline u_int nma_get_ndesc(struct netmap_adapter *na, enum txrx t) { return (t == NR_TX ? na->num_tx_desc : na->num_rx_desc); } static __inline void nma_set_ndesc(struct netmap_adapter *na, enum txrx t, u_int v) { if (t == NR_TX) na->num_tx_desc = v; else na->num_rx_desc = v; } static __inline u_int nma_get_nrings(struct netmap_adapter *na, enum txrx t) { return (t == NR_TX ? na->num_tx_rings : na->num_rx_rings); } static __inline u_int nma_get_host_nrings(struct netmap_adapter *na, enum txrx t) { return (t == NR_TX ? na->num_host_tx_rings : na->num_host_rx_rings); } static __inline void nma_set_nrings(struct netmap_adapter *na, enum txrx t, u_int v) { if (t == NR_TX) na->num_tx_rings = v; else na->num_rx_rings = v; } static __inline void nma_set_host_nrings(struct netmap_adapter *na, enum txrx t, u_int v) { if (t == NR_TX) na->num_host_tx_rings = v; else na->num_host_rx_rings = v; } static __inline struct netmap_kring** NMR(struct netmap_adapter *na, enum txrx t) { return (t == NR_TX ? 
na->tx_rings : na->rx_rings); } int nma_intr_enable(struct netmap_adapter *na, int onoff); /* * If the NIC is owned by the kernel * (i.e., bridge), neither another bridge nor user can use it; * if the NIC is owned by a user, only users can share it. * Evaluation must be done under NMG_LOCK(). */ #define NETMAP_OWNED_BY_KERN(na) ((na)->na_flags & NAF_BUSY) #define NETMAP_OWNED_BY_ANY(na) \ (NETMAP_OWNED_BY_KERN(na) || ((na)->active_fds > 0)) /* * derived netmap adapters for various types of ports */ struct netmap_vp_adapter { /* VALE software port */ struct netmap_adapter up; /* * Bridge support: * * bdg_port is the port number used in the bridge; * na_bdg points to the bridge this NA is attached to. */ int bdg_port; struct nm_bridge *na_bdg; int retry; int autodelete; /* remove the ifp on last reference */ /* Maximum Frame Size, used in bdg_mismatch_datapath() */ u_int mfs; /* Last source MAC on this port */ uint64_t last_smac; }; struct netmap_hw_adapter { /* physical device */ struct netmap_adapter up; #ifdef linux struct net_device_ops nm_ndo; struct ethtool_ops nm_eto; #endif const struct ethtool_ops* save_ethtool; int (*nm_hw_register)(struct netmap_adapter *, int onoff); }; #ifdef WITH_GENERIC /* Mitigation support. */ struct nm_generic_mit { struct hrtimer mit_timer; int mit_pending; int mit_ring_idx; /* index of the ring being mitigated */ struct netmap_adapter *mit_na; /* backpointer */ }; struct netmap_generic_adapter { /* emulated device */ struct netmap_hw_adapter up; /* Pointer to a previously used netmap adapter. */ struct netmap_adapter *prev; /* Emulated netmap adapters support: * - save_if_input saves the if_input hook (FreeBSD); * - mit implements rx interrupt mitigation; */ void (*save_if_input)(struct ifnet *, struct mbuf *); struct nm_generic_mit *mit; #ifdef linux netdev_tx_t (*save_start_xmit)(struct mbuf *, struct ifnet *); #endif /* Is the adapter able to use multiple RX slots to scatter * each packet pushed up by the driver? 
*/ int rxsg; /* Is the transmission path controlled by a netmap-aware * device queue (i.e. qdisc on linux)? */ int txqdisc; }; #endif /* WITH_GENERIC */ static __inline u_int netmap_real_rings(struct netmap_adapter *na, enum txrx t) { return nma_get_nrings(na, t) + !!(na->na_flags & NAF_HOST_RINGS) * nma_get_host_nrings(na, t); } /* account for fake rings */ static __inline u_int netmap_all_rings(struct netmap_adapter *na, enum txrx t) { return max(nma_get_nrings(na, t) + 1, netmap_real_rings(na, t)); } int netmap_default_bdg_attach(const char *name, struct netmap_adapter *na, struct nm_bridge *); struct nm_bdg_polling_state; /* * Bridge wrapper for non VALE ports attached to a VALE switch. * * The real device must already have its own netmap adapter (hwna). * The bridge wrapper and the hwna adapter share the same set of * netmap rings and buffers, but they have two separate sets of * krings descriptors, with tx/rx meanings swapped: * * netmap * bwrap krings rings krings hwna * +------+ +------+ +-----+ +------+ +------+ * |tx_rings->| |\ /| |----| |<-tx_rings| * | | +------+ \ / +-----+ +------+ | | * | | X | | * | | / \ | | * | | +------+/ \+-----+ +------+ | | * |rx_rings->| | | |----| |<-rx_rings| * | | +------+ +-----+ +------+ | | * +------+ +------+ * * - packets coming from the bridge go to the brwap rx rings, * which are also the hwna tx rings. The bwrap notify callback * will then complete the hwna tx (see netmap_bwrap_notify). * * - packets coming from the outside go to the hwna rx rings, * which are also the bwrap tx rings. The (overwritten) hwna * notify method will then complete the bridge tx * (see netmap_bwrap_intr_notify). * * The bridge wrapper may optionally connect the hwna 'host' rings * to the bridge. This is done by using a second port in the * bridge and connecting it to the 'host' netmap_vp_adapter * contained in the netmap_bwrap_adapter. The brwap host adapter * cross-links the hwna host rings in the same way as shown above. 
* * - packets coming from the bridge and directed to the host stack * are handled by the bwrap host notify callback * (see netmap_bwrap_host_notify) * * - packets coming from the host stack are still handled by the * overwritten hwna notify callback (netmap_bwrap_intr_notify), * but are diverted to the host adapter depending on the ring number. * */ struct netmap_bwrap_adapter { struct netmap_vp_adapter up; struct netmap_vp_adapter host; /* for host rings */ struct netmap_adapter *hwna; /* the underlying device */ /* * When we attach a physical interface to the bridge, we * allow the controlling process to terminate, so we need * a place to store the n_detmap_priv_d data structure. * This is only done when physical interfaces * are attached to a bridge. */ struct netmap_priv_d *na_kpriv; struct nm_bdg_polling_state *na_polling_state; /* we overwrite the hwna->na_vp pointer, so we save * here its original value, to be restored at detach */ struct netmap_vp_adapter *saved_na_vp; }; int nm_bdg_polling(struct nmreq_header *hdr); #ifdef WITH_VALE int netmap_vale_attach(struct nmreq_header *hdr, void *auth_token); int netmap_vale_detach(struct nmreq_header *hdr, void *auth_token); int netmap_vale_list(struct nmreq_header *hdr); int netmap_vi_create(struct nmreq_header *hdr, int); int nm_vi_create(struct nmreq_header *); int nm_vi_destroy(const char *name); #else /* !WITH_VALE */ #define netmap_vi_create(hdr, a) (EOPNOTSUPP) #endif /* WITH_VALE */ #ifdef WITH_PIPES #define NM_MAXPIPES 64 /* max number of pipes per adapter */ struct netmap_pipe_adapter { /* pipe identifier is up.name */ struct netmap_adapter up; #define NM_PIPE_ROLE_MASTER 0x1 #define NM_PIPE_ROLE_SLAVE 0x2 int role; /* either NM_PIPE_ROLE_MASTER or NM_PIPE_ROLE_SLAVE */ struct netmap_adapter *parent; /* adapter that owns the memory */ struct netmap_pipe_adapter *peer; /* the other end of the pipe */ int peer_ref; /* 1 iff we are holding a ref to the peer */ struct ifnet *parent_ifp; /* maybe null */ u_int 
parent_slot; /* index in the parent pipe array */ }; #endif /* WITH_PIPES */ #ifdef WITH_NMNULL struct netmap_null_adapter { struct netmap_adapter up; }; #endif /* WITH_NMNULL */ /* return slots reserved to rx clients; used in drivers */ static inline uint32_t nm_kr_rxspace(struct netmap_kring *k) { int space = k->nr_hwtail - k->nr_hwcur; if (space < 0) space += k->nkr_num_slots; nm_prdis("preserving %d rx slots %d -> %d", space, k->nr_hwcur, k->nr_hwtail); return space; } /* return slots reserved to tx clients */ #define nm_kr_txspace(_k) nm_kr_rxspace(_k) /* True if no space in the tx ring, only valid after txsync_prologue */ static inline int nm_kr_txempty(struct netmap_kring *kring) { return kring->rhead == kring->nr_hwtail; } /* True if no more completed slots in the rx ring, only valid after * rxsync_prologue */ #define nm_kr_rxempty(_k) nm_kr_txempty(_k) /* True if the application needs to wait for more space on the ring * (more received packets or more free tx slots). * Only valid after *xsync_prologue. */ static inline int nm_kr_wouldblock(struct netmap_kring *kring) { return kring->rcur == kring->nr_hwtail; } /* * protect against multiple threads using the same ring. * also check that the ring has not been stopped or locked */ #define NM_KR_BUSY 1 /* some other thread is syncing the ring */ #define NM_KR_STOPPED 2 /* unbounded stop (ifconfig down or driver unload) */ #define NM_KR_LOCKED 3 /* bounded, brief stop for mutual exclusion */ /* release the previously acquired right to use the *sync() methods of the ring */ static __inline void nm_kr_put(struct netmap_kring *kr) { NM_ATOMIC_CLEAR(&kr->nr_busy); } /* true if the ifp that backed the adapter has disappeared (e.g., the * driver has been unloaded) */ static inline int nm_iszombie(struct netmap_adapter *na); /* try to obtain exclusive right to issue the *sync() operations on the ring. * The right is obtained and must be later relinquished via nm_kr_put() if and * only if nm_kr_tryget() returns 0. 
* If can_sleep is 1 there are only two other possible outcomes: * - the function returns NM_KR_BUSY * - the function returns NM_KR_STOPPED and sets the POLLERR bit in *perr * (if non-null) * In both cases the caller will typically skip the ring, possibly collecting * errors along the way. * If the calling context does not allow sleeping, the caller must pass 0 in can_sleep. * In the latter case, the function may also return NM_KR_LOCKED and leave *perr * untouched: ideally, the caller should try again at a later time. */ static __inline int nm_kr_tryget(struct netmap_kring *kr, int can_sleep, int *perr) { int busy = 1, stopped; /* check a first time without taking the lock * to avoid starvation for nm_kr_get() */ retry: stopped = kr->nkr_stopped; if (unlikely(stopped)) { goto stop; } busy = NM_ATOMIC_TEST_AND_SET(&kr->nr_busy); /* we should not return NM_KR_BUSY if the ring was * actually stopped, so check another time after * the barrier provided by the atomic operation */ stopped = kr->nkr_stopped; if (unlikely(stopped)) { goto stop; } if (unlikely(nm_iszombie(kr->na))) { stopped = NM_KR_STOPPED; goto stop; } return unlikely(busy) ? NM_KR_BUSY : 0; stop: if (!busy) nm_kr_put(kr); if (stopped == NM_KR_STOPPED) { /* if POLLERR is defined we want to use it to simplify netmap_poll(). * Otherwise, any non-zero value will do. */ #ifdef POLLERR #define NM_POLLERR POLLERR #else #define NM_POLLERR 1 #endif /* POLLERR */ if (perr) *perr |= NM_POLLERR; #undef NM_POLLERR } else if (can_sleep) { tsleep(kr, 0, "NM_KR_TRYGET", 4); goto retry; } return stopped; } /* put the ring in the 'stopped' state and wait for the current user (if any) to * notice. 
stopped must be either NM_KR_STOPPED or NM_KR_LOCKED */ static __inline void nm_kr_stop(struct netmap_kring *kr, int stopped) { kr->nkr_stopped = stopped; while (NM_ATOMIC_TEST_AND_SET(&kr->nr_busy)) tsleep(kr, 0, "NM_KR_GET", 4); } /* restart a ring after a stop */ static __inline void nm_kr_start(struct netmap_kring *kr) { kr->nkr_stopped = 0; nm_kr_put(kr); } /* * The following functions are used by individual drivers to * support netmap operation. * * netmap_attach() initializes a struct netmap_adapter, allocating the * struct netmap_ring's and the struct selinfo. * * netmap_detach() frees the memory allocated by netmap_attach(). * * netmap_transmit() replaces the if_transmit routine of the interface, * and is used to intercept packets coming from the stack. * * netmap_load_map/netmap_reload_map are helper routines to set/reset * the dmamap for a packet buffer * * netmap_reset() is a helper routine to be called in the hw driver * when reinitializing a ring. It should not be called by * virtual ports (vale, pipes, monitor) */ int netmap_attach(struct netmap_adapter *); int netmap_attach_ext(struct netmap_adapter *, size_t size, int override_reg); void netmap_detach(struct ifnet *); int netmap_transmit(struct ifnet *, struct mbuf *); struct netmap_slot *netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n, u_int new_cur); int netmap_ring_reinit(struct netmap_kring *); int netmap_rings_config_get(struct netmap_adapter *, struct nm_config_info *); /* Return codes for netmap_*x_irq. */ enum { /* Driver should do normal interrupt processing, e.g. because * the interface is not in netmap mode. */ NM_IRQ_PASS = 0, /* Port is in netmap mode, and the interrupt work has been * completed. The driver does not have to notify netmap * again before the next interrupt. */ NM_IRQ_COMPLETED = -1, /* Port is in netmap mode, but the interrupt work has not been * completed. 
The driver has to make sure netmap will be * notified again soon, even if no more interrupts come (e.g. * on Linux the driver should not call napi_complete()). */ NM_IRQ_RESCHED = -2, }; /* default functions to handle rx/tx interrupts */ int netmap_rx_irq(struct ifnet *, u_int, u_int *); #define netmap_tx_irq(_n, _q) netmap_rx_irq(_n, _q, NULL) int netmap_common_irq(struct netmap_adapter *, u_int, u_int *work_done); #ifdef WITH_VALE /* functions used by external modules to interface with VALE */ #define netmap_vp_to_ifp(_vp) ((_vp)->up.ifp) #define netmap_ifp_to_vp(_ifp) (NA(_ifp)->na_vp) #define netmap_ifp_to_host_vp(_ifp) (NA(_ifp)->na_hostvp) #define netmap_bdg_idx(_vp) ((_vp)->bdg_port) const char *netmap_bdg_name(struct netmap_vp_adapter *); #else /* !WITH_VALE */ #define netmap_vp_to_ifp(_vp) NULL #define netmap_ifp_to_vp(_ifp) NULL #define netmap_ifp_to_host_vp(_ifp) NULL #define netmap_bdg_idx(_vp) -1 #endif /* WITH_VALE */ static inline int nm_netmap_on(struct netmap_adapter *na) { return na && na->na_flags & NAF_NETMAP_ON; } static inline int nm_native_on(struct netmap_adapter *na) { return nm_netmap_on(na) && (na->na_flags & NAF_NATIVE); } static inline int nm_iszombie(struct netmap_adapter *na) { return na == NULL || (na->na_flags & NAF_ZOMBIE); } static inline void nm_update_hostrings_mode(struct netmap_adapter *na) { /* Process nr_mode and nr_pending_mode for host rings. */ na->tx_rings[na->num_tx_rings]->nr_mode = na->tx_rings[na->num_tx_rings]->nr_pending_mode; na->rx_rings[na->num_rx_rings]->nr_mode = na->rx_rings[na->num_rx_rings]->nr_pending_mode; } void nm_set_native_flags(struct netmap_adapter *); void nm_clear_native_flags(struct netmap_adapter *); void netmap_krings_mode_commit(struct netmap_adapter *na, int onoff); /* * nm_*sync_prologue() functions are used in ioctl/poll and ptnetmap * kthreads. * We need netmap_ring* parameter, because in ptnetmap it is decoupled * from host kring. 
* The user-space ring pointers (head/cur/tail) are shared through * CSB between host and guest. */ /* * validates parameters in the ring/kring, returns a value for head * If any error, returns ring_size to force a reinit. */ uint32_t nm_txsync_prologue(struct netmap_kring *, struct netmap_ring *); /* * validates parameters in the ring/kring, returns a value for head * If any error, returns ring_size lim to force a reinit. */ uint32_t nm_rxsync_prologue(struct netmap_kring *, struct netmap_ring *); /* check/fix address and len in tx rings */ #if 1 /* debug version */ #define NM_CHECK_ADDR_LEN(_na, _a, _l) do { \ if (_a == NETMAP_BUF_BASE(_na) || _l > NETMAP_BUF_SIZE(_na)) { \ nm_prlim(5, "bad addr/len ring %d slot %d idx %d len %d", \ kring->ring_id, nm_i, slot->buf_idx, len); \ if (_l > NETMAP_BUF_SIZE(_na)) \ _l = NETMAP_BUF_SIZE(_na); \ } } while (0) #else /* no debug version */ #define NM_CHECK_ADDR_LEN(_na, _a, _l) do { \ if (_l > NETMAP_BUF_SIZE(_na)) \ _l = NETMAP_BUF_SIZE(_na); \ } while (0) #endif /*---------------------------------------------------------------*/ /* * Support routines used by netmap subsystems * (native drivers, VALE, generic, pipes, monitors, ...) */ /* common routine for all functions that create a netmap adapter. It performs * two main tasks: * - if the na points to an ifp, mark the ifp as netmap capable * using na as its native adapter; * - provide defaults for the setup callbacks and the memory allocator */ int netmap_attach_common(struct netmap_adapter *); /* fill priv->np_[tr]xq{first,last} using the ringid and flags information * coming from a struct nmreq_register */ int netmap_interp_ringid(struct netmap_priv_d *priv, uint32_t nr_mode, uint16_t nr_ringid, uint64_t nr_flags); /* update the ring parameters (number and size of tx and rx rings). * It calls the nm_config callback, if available. */ int netmap_update_config(struct netmap_adapter *na); /* create and initialize the common fields of the krings array. 
* using the information that must be already available in the na. * tailroom can be used to request the allocation of additional * tailroom bytes after the krings array. This is used by * netmap_vp_adapter's (i.e., VALE ports) to make room for * leasing-related data structures */ int netmap_krings_create(struct netmap_adapter *na, u_int tailroom); /* deletes the kring array of the adapter. The array must have * been created using netmap_krings_create */ void netmap_krings_delete(struct netmap_adapter *na); int netmap_hw_krings_create(struct netmap_adapter *na); void netmap_hw_krings_delete(struct netmap_adapter *na); /* set the stopped/enabled status of ring * When stopping, they also wait for all current activity on the ring to * terminate. The status change is then notified using the na nm_notify * callback. */ void netmap_set_ring(struct netmap_adapter *, u_int ring_id, enum txrx, int stopped); /* set the stopped/enabled status of all rings of the adapter. */ void netmap_set_all_rings(struct netmap_adapter *, int stopped); /* convenience wrappers for netmap_set_all_rings */ void netmap_disable_all_rings(struct ifnet *); void netmap_enable_all_rings(struct ifnet *); int netmap_buf_size_validate(const struct netmap_adapter *na, unsigned mtu); int netmap_do_regif(struct netmap_priv_d *priv, struct netmap_adapter *na, uint32_t nr_mode, uint16_t nr_ringid, uint64_t nr_flags); void netmap_do_unregif(struct netmap_priv_d *priv); u_int nm_bound_var(u_int *v, u_int dflt, u_int lo, u_int hi, const char *msg); int netmap_get_na(struct nmreq_header *hdr, struct netmap_adapter **na, struct ifnet **ifp, struct netmap_mem_d *nmd, int create); void netmap_unget_na(struct netmap_adapter *na, struct ifnet *ifp); int netmap_get_hw_na(struct ifnet *ifp, struct netmap_mem_d *nmd, struct netmap_adapter **na); #ifdef WITH_VALE uint32_t netmap_vale_learning(struct nm_bdg_fwd *ft, uint8_t *dst_ring, struct netmap_vp_adapter *, void *private_data); /* these are redefined in case of no 
VALE support */ int netmap_get_vale_na(struct nmreq_header *hdr, struct netmap_adapter **na, struct netmap_mem_d *nmd, int create); void *netmap_vale_create(const char *bdg_name, int *return_status); int netmap_vale_destroy(const char *bdg_name, void *auth_token); #else /* !WITH_VALE */ #define netmap_bdg_learning(_1, _2, _3, _4) 0 #define netmap_get_vale_na(_1, _2, _3, _4) 0 #define netmap_bdg_create(_1, _2) NULL #define netmap_bdg_destroy(_1, _2) 0 #endif /* !WITH_VALE */ #ifdef WITH_PIPES /* max number of pipes per device */ #define NM_MAXPIPES 64 /* XXX this should probably be a sysctl */ void netmap_pipe_dealloc(struct netmap_adapter *); int netmap_get_pipe_na(struct nmreq_header *hdr, struct netmap_adapter **na, struct netmap_mem_d *nmd, int create); #else /* !WITH_PIPES */ #define NM_MAXPIPES 0 #define netmap_pipe_alloc(_1, _2) 0 #define netmap_pipe_dealloc(_1) #define netmap_get_pipe_na(hdr, _2, _3, _4) \ ((strchr(hdr->nr_name, '{') != NULL || strchr(hdr->nr_name, '}') != NULL) ? EOPNOTSUPP : 0) #endif #ifdef WITH_MONITOR int netmap_get_monitor_na(struct nmreq_header *hdr, struct netmap_adapter **na, struct netmap_mem_d *nmd, int create); void netmap_monitor_stop(struct netmap_adapter *na); #else #define netmap_get_monitor_na(hdr, _2, _3, _4) \ (((struct nmreq_register *)(uintptr_t)hdr->nr_body)->nr_flags & (NR_MONITOR_TX | NR_MONITOR_RX) ? EOPNOTSUPP : 0) #endif #ifdef WITH_NMNULL int netmap_get_null_na(struct nmreq_header *hdr, struct netmap_adapter **na, struct netmap_mem_d *nmd, int create); #else /* !WITH_NMNULL */ #define netmap_get_null_na(hdr, _2, _3, _4) \ (((struct nmreq_register *)(uintptr_t)hdr->nr_body)->nr_flags & (NR_MONITOR_TX | NR_MONITOR_RX) ? 
EOPNOTSUPP : 0) #endif /* WITH_NMNULL */ #ifdef CONFIG_NET_NS struct net *netmap_bns_get(void); void netmap_bns_put(struct net *); void netmap_bns_getbridges(struct nm_bridge **, u_int *); #else extern struct nm_bridge *nm_bridges; #define netmap_bns_get() #define netmap_bns_put(_1) #define netmap_bns_getbridges(b, n) \ do { *b = nm_bridges; *n = NM_BRIDGES; } while (0) #endif /* Various prototypes */ int netmap_poll(struct netmap_priv_d *, int events, NM_SELRECORD_T *td); int netmap_init(void); void netmap_fini(void); int netmap_get_memory(struct netmap_priv_d* p); void netmap_dtor(void *data); int netmap_ioctl(struct netmap_priv_d *priv, u_long cmd, caddr_t data, struct thread *, int nr_body_is_user); int netmap_ioctl_legacy(struct netmap_priv_d *priv, u_long cmd, caddr_t data, struct thread *td); size_t nmreq_size_by_type(uint16_t nr_reqtype); /* netmap_adapter creation/destruction */ // #define NM_DEBUG_PUTGET 1 #ifdef NM_DEBUG_PUTGET #define NM_DBG(f) __##f void __netmap_adapter_get(struct netmap_adapter *na); #define netmap_adapter_get(na) \ do { \ struct netmap_adapter *__na = na; \ nm_prinf("getting %p:%s (%d)", __na, (__na)->name, (__na)->na_refcount); \ __netmap_adapter_get(__na); \ } while (0) int __netmap_adapter_put(struct netmap_adapter *na); #define netmap_adapter_put(na) \ ({ \ struct netmap_adapter *__na = na; \ nm_prinf("putting %p:%s (%d)", __na, (__na)->name, (__na)->na_refcount); \ __netmap_adapter_put(__na); \ }) #else /* !NM_DEBUG_PUTGET */ #define NM_DBG(f) f void netmap_adapter_get(struct netmap_adapter *na); int netmap_adapter_put(struct netmap_adapter *na); #endif /* !NM_DEBUG_PUTGET */ /* * module variables */ #define NETMAP_BUF_BASE(_na) ((_na)->na_lut.lut[0].vaddr) #define NETMAP_BUF_SIZE(_na) ((_na)->na_lut.objsize) extern int netmap_no_pendintr; extern int netmap_mitigate; extern int netmap_verbose; #ifdef CONFIG_NETMAP_DEBUG extern int netmap_debug; /* for debugging */ #else /* !CONFIG_NETMAP_DEBUG */ #define netmap_debug (0) #endif 
/* !CONFIG_NETMAP_DEBUG */ enum { /* debug flags */ NM_DEBUG_ON = 1, /* generic debug messsages */ NM_DEBUG_HOST = 0x2, /* debug host stack */ NM_DEBUG_RXSYNC = 0x10, /* debug on rxsync/txsync */ NM_DEBUG_TXSYNC = 0x20, NM_DEBUG_RXINTR = 0x100, /* debug on rx/tx intr (driver) */ NM_DEBUG_TXINTR = 0x200, NM_DEBUG_NIC_RXSYNC = 0x1000, /* debug on rx/tx intr (driver) */ NM_DEBUG_NIC_TXSYNC = 0x2000, NM_DEBUG_MEM = 0x4000, /* verbose memory allocations/deallocations */ NM_DEBUG_VALE = 0x8000, /* debug messages from memory allocators */ NM_DEBUG_BDG = NM_DEBUG_VALE, }; extern int netmap_txsync_retry; extern int netmap_flags; extern int netmap_generic_hwcsum; extern int netmap_generic_mit; extern int netmap_generic_ringsize; extern int netmap_generic_rings; #ifdef linux extern int netmap_generic_txqdisc; #endif /* * NA returns a pointer to the struct netmap adapter from the ifp. * WNA is os-specific and must be defined in glue code. */ #define NA(_ifp) ((struct netmap_adapter *)WNA(_ifp)) /* * we provide a default implementation of NM_ATTACH_NA/NM_DETACH_NA * based on the WNA field. * Glue code may override this by defining its own NM_ATTACH_NA */ #ifndef NM_ATTACH_NA /* * On old versions of FreeBSD, NA(ifp) is a pspare. On linux we * overload another pointer in the netdev. * * We check if NA(ifp) is set and its first element has a related * magic value. The capenable is within the struct netmap_adapter. 
*/ #define NETMAP_MAGIC 0x52697a7a #define NM_NA_VALID(ifp) (NA(ifp) && \ ((uint32_t)(uintptr_t)NA(ifp) ^ NA(ifp)->magic) == NETMAP_MAGIC ) #define NM_ATTACH_NA(ifp, na) do { \ WNA(ifp) = na; \ if (NA(ifp)) \ NA(ifp)->magic = \ ((uint32_t)(uintptr_t)NA(ifp)) ^ NETMAP_MAGIC; \ } while(0) #define NM_RESTORE_NA(ifp, na) WNA(ifp) = na; #define NM_DETACH_NA(ifp) do { WNA(ifp) = NULL; } while (0) #define NM_NA_CLASH(ifp) (NA(ifp) && !NM_NA_VALID(ifp)) #endif /* !NM_ATTACH_NA */ #define NM_IS_NATIVE(ifp) (NM_NA_VALID(ifp) && NA(ifp)->nm_dtor == netmap_hw_dtor) #if defined(__FreeBSD__) /* Assigns the device IOMMU domain to an allocator. * Returns -ENOMEM in case the domain is different */ #define nm_iommu_group_id(dev) (0) /* Callback invoked by the dma machinery after a successful dmamap_load */ static void netmap_dmamap_cb(__unused void *arg, __unused bus_dma_segment_t * segs, __unused int nseg, __unused int error) { } /* bus_dmamap_load wrapper: call aforementioned function if map != NULL. * XXX can we do it without a callback ? */ static inline int netmap_load_map(struct netmap_adapter *na, bus_dma_tag_t tag, bus_dmamap_t map, void *buf) { if (map) bus_dmamap_load(tag, map, buf, NETMAP_BUF_SIZE(na), netmap_dmamap_cb, NULL, BUS_DMA_NOWAIT); return 0; } static inline void netmap_unload_map(struct netmap_adapter *na, bus_dma_tag_t tag, bus_dmamap_t map) { if (map) bus_dmamap_unload(tag, map); } #define netmap_sync_map(na, tag, map, sz, t) /* update the map when a buffer changes. 
*/ static inline void netmap_reload_map(struct netmap_adapter *na, bus_dma_tag_t tag, bus_dmamap_t map, void *buf) { if (map) { bus_dmamap_unload(tag, map); bus_dmamap_load(tag, map, buf, NETMAP_BUF_SIZE(na), netmap_dmamap_cb, NULL, BUS_DMA_NOWAIT); } } #elif defined(_WIN32) #else /* linux */ int nm_iommu_group_id(bus_dma_tag_t dev); #include /* * on linux we need * dma_map_single(&pdev->dev, virt_addr, len, direction) * dma_unmap_single(&adapter->pdev->dev, phys_addr, len, direction) */ #if 0 struct e1000_buffer *buffer_info = &tx_ring->buffer_info[l]; /* set time_stamp *before* dma to help avoid a possible race */ buffer_info->time_stamp = jiffies; buffer_info->mapped_as_page = false; buffer_info->length = len; //buffer_info->next_to_watch = l; /* reload dma map */ dma_unmap_single(&adapter->pdev->dev, buffer_info->dma, NETMAP_BUF_SIZE, DMA_TO_DEVICE); buffer_info->dma = dma_map_single(&adapter->pdev->dev, addr, NETMAP_BUF_SIZE, DMA_TO_DEVICE); if (dma_mapping_error(&adapter->pdev->dev, buffer_info->dma)) { nm_prerr("dma mapping error"); /* goto dma_error; See e1000_put_txbuf() */ /* XXX reset */ } tx_desc->buffer_addr = htole64(buffer_info->dma); //XXX #endif static inline int netmap_load_map(struct netmap_adapter *na, bus_dma_tag_t tag, bus_dmamap_t map, void *buf, u_int size) { if (map) { *map = dma_map_single(na->pdev, buf, size, DMA_BIDIRECTIONAL); if (dma_mapping_error(na->pdev, *map)) { *map = 0; return ENOMEM; } } return 0; } static inline void netmap_unload_map(struct netmap_adapter *na, bus_dma_tag_t tag, bus_dmamap_t map, u_int sz) { if (*map) { dma_unmap_single(na->pdev, *map, sz, DMA_BIDIRECTIONAL); } } #ifdef NETMAP_LINUX_HAVE_DMASYNC static inline void netmap_sync_map_cpu(struct netmap_adapter *na, bus_dma_tag_t tag, bus_dmamap_t map, u_int sz, enum txrx t) { if (*map) { dma_sync_single_for_cpu(na->pdev, *map, sz, (t == NR_TX ? 
DMA_TO_DEVICE : DMA_FROM_DEVICE)); } } static inline void netmap_sync_map_dev(struct netmap_adapter *na, bus_dma_tag_t tag, bus_dmamap_t map, u_int sz, enum txrx t) { if (*map) { dma_sync_single_for_device(na->pdev, *map, sz, (t == NR_TX ? DMA_TO_DEVICE : DMA_FROM_DEVICE)); } } static inline void netmap_reload_map(struct netmap_adapter *na, bus_dma_tag_t tag, bus_dmamap_t map, void *buf) { u_int sz = NETMAP_BUF_SIZE(na); if (*map) { dma_unmap_single(na->pdev, *map, sz, DMA_BIDIRECTIONAL); } *map = dma_map_single(na->pdev, buf, sz, DMA_BIDIRECTIONAL); } #else /* !NETMAP_LINUX_HAVE_DMASYNC */ #define netmap_sync_map_cpu(na, tag, map, sz, t) #define netmap_sync_map_dev(na, tag, map, sz, t) #endif /* NETMAP_LINUX_HAVE_DMASYNC */ #endif /* linux */ /* * functions to map NIC to KRING indexes (n2k) and vice versa (k2n) */ static inline int netmap_idx_n2k(struct netmap_kring *kr, int idx) { int n = kr->nkr_num_slots; if (likely(kr->nkr_hwofs == 0)) { return idx; } idx += kr->nkr_hwofs; if (idx < 0) return idx + n; else if (idx < n) return idx; else return idx - n; } static inline int netmap_idx_k2n(struct netmap_kring *kr, int idx) { int n = kr->nkr_num_slots; if (likely(kr->nkr_hwofs == 0)) { return idx; } idx -= kr->nkr_hwofs; if (idx < 0) return idx + n; else if (idx < n) return idx; else return idx - n; } /* Entries of the look-up table. */ #ifdef __FreeBSD__ struct lut_entry { void *vaddr; /* virtual address. */ vm_paddr_t paddr; /* physical address. */ }; #else /* linux & _WIN32 */ /* dma-mapping in linux can assign a buffer a different address * depending on the device, so we need to have a separate * physical-address look-up table for each na. * We can still share the vaddrs, though, therefore we split * the lut_entry structure. */ struct lut_entry { void *vaddr; /* virtual address. */ }; struct plut_entry { vm_paddr_t paddr; /* physical address. 
*/ }; #endif /* linux & _WIN32 */ struct netmap_obj_pool; /* * NMB return the virtual address of a buffer (buffer 0 on bad index) * PNMB also fills the physical address */ static inline void * NMB(struct netmap_adapter *na, struct netmap_slot *slot) { struct lut_entry *lut = na->na_lut.lut; uint32_t i = slot->buf_idx; return (unlikely(i >= na->na_lut.objtotal)) ? lut[0].vaddr : lut[i].vaddr; } static inline void * PNMB(struct netmap_adapter *na, struct netmap_slot *slot, uint64_t *pp) { uint32_t i = slot->buf_idx; struct lut_entry *lut = na->na_lut.lut; struct plut_entry *plut = na->na_lut.plut; void *ret = (i >= na->na_lut.objtotal) ? lut[0].vaddr : lut[i].vaddr; #ifdef _WIN32 *pp = (i >= na->na_lut.objtotal) ? (uint64_t)plut[0].paddr.QuadPart : (uint64_t)plut[i].paddr.QuadPart; #else *pp = (i >= na->na_lut.objtotal) ? plut[0].paddr : plut[i].paddr; #endif return ret; } /* * Structure associated to each netmap file descriptor. * It is created on open and left unbound (np_nifp == NULL). * A successful NIOCREGIF will set np_nifp and the first few fields; * this is protected by a global lock (NMG_LOCK) due to low contention. * * np_refs counts the number of references to the structure: one for the fd, * plus (on FreeBSD) one for each active mmap which we track ourselves * (linux automatically tracks them, but FreeBSD does not). * np_refs is protected by NMG_LOCK. * * Read access to the structure is lock free, because ni_nifp once set * can only go to 0 when nobody is using the entry anymore. Readers * must check that np_nifp != NULL before using the other fields. */ struct netmap_priv_d { struct netmap_if * volatile np_nifp; /* netmap if descriptor. 
*/ struct netmap_adapter *np_na; struct ifnet *np_ifp; uint32_t np_flags; /* from the ioctl */ u_int np_qfirst[NR_TXRX], np_qlast[NR_TXRX]; /* range of tx/rx rings to scan */ uint16_t np_txpoll; uint16_t np_kloop_state; /* use with NMG_LOCK held */ #define NM_SYNC_KLOOP_RUNNING (1 << 0) #define NM_SYNC_KLOOP_STOPPING (1 << 1) int np_sync_flags; /* to be passed to nm_sync */ int np_refs; /* use with NMG_LOCK held */ /* pointers to the selinfo to be used for selrecord. * Either the local or the global one depending on the * number of rings. */ NM_SELINFO_T *np_si[NR_TXRX]; /* In the optional CSB mode, the user must specify the start address * of two arrays of Communication Status Block (CSB) entries, for the * two directions (kernel read application write, and kernel write * application read). * The number of entries must agree with the number of rings bound to * the netmap file descriptor. The entries corresponding to the TX * rings are laid out before the ones corresponding to the RX rings. * * Array of CSB entries for application --> kernel communication * (N entries). */ struct nm_csb_atok *np_csb_atok_base; /* Array of CSB entries for kernel --> application communication * (N entries). 
*/ struct nm_csb_ktoa *np_csb_ktoa_base; #ifdef linux struct file *np_filp; /* used by sync kloop */ #endif /* linux */ }; struct netmap_priv_d *netmap_priv_new(void); void netmap_priv_delete(struct netmap_priv_d *); static inline int nm_kring_pending(struct netmap_priv_d *np) { struct netmap_adapter *na = np->np_na; enum txrx t; int i; for_rx_tx(t) { for (i = np->np_qfirst[t]; i < np->np_qlast[t]; i++) { struct netmap_kring *kring = NMR(na, t)[i]; if (kring->nr_mode != kring->nr_pending_mode) { return 1; } } } return 0; } /* call with NMG_LOCK held */ static __inline int nm_si_user(struct netmap_priv_d *priv, enum txrx t) { return (priv->np_na != NULL && (priv->np_qlast[t] - priv->np_qfirst[t] > 1)); } #ifdef WITH_PIPES int netmap_pipe_txsync(struct netmap_kring *txkring, int flags); int netmap_pipe_rxsync(struct netmap_kring *rxkring, int flags); int netmap_pipe_krings_create_both(struct netmap_adapter *na, struct netmap_adapter *ona); void netmap_pipe_krings_delete_both(struct netmap_adapter *na, struct netmap_adapter *ona); int netmap_pipe_reg_both(struct netmap_adapter *na, struct netmap_adapter *ona); #endif /* WITH_PIPES */ #ifdef WITH_MONITOR struct netmap_monitor_adapter { struct netmap_adapter up; struct netmap_priv_d priv; uint32_t flags; }; #endif /* WITH_MONITOR */ #ifdef WITH_GENERIC /* * generic netmap emulation for devices that do not have * native netmap support. */ int generic_netmap_attach(struct ifnet *ifp); int generic_rx_handler(struct ifnet *ifp, struct mbuf *m);; int nm_os_catch_rx(struct netmap_generic_adapter *gna, int intercept); int nm_os_catch_tx(struct netmap_generic_adapter *gna, int intercept); int na_is_generic(struct netmap_adapter *na); /* * the generic transmit routine is passed a structure to optionally * build a queue of descriptors, in an OS-specific way. * The payload is at addr, if non-null, and the routine should send or queue * the packet, returning 0 if successful, 1 on failure. 
* * At the end, if head is non-null, there will be an additional call * to the function with addr = NULL; this should tell the OS-specific * routine to send the queue and free any resources. Failure is ignored. */ struct nm_os_gen_arg { struct ifnet *ifp; void *m; /* os-specific mbuf-like object */ void *head, *tail; /* tailq, if the OS-specific routine needs to build one */ void *addr; /* payload of current packet */ u_int len; /* packet length */ u_int ring_nr; /* packet length */ u_int qevent; /* in txqdisc mode, place an event on this mbuf */ }; int nm_os_generic_xmit_frame(struct nm_os_gen_arg *); int nm_os_generic_find_num_desc(struct ifnet *ifp, u_int *tx, u_int *rx); void nm_os_generic_find_num_queues(struct ifnet *ifp, u_int *txq, u_int *rxq); void nm_os_generic_set_features(struct netmap_generic_adapter *gna); static inline struct ifnet* netmap_generic_getifp(struct netmap_generic_adapter *gna) { if (gna->prev) return gna->prev->ifp; return gna->up.up.ifp; } void netmap_generic_irq(struct netmap_adapter *na, u_int q, u_int *work_done); //#define RATE_GENERIC /* Enables communication statistics for generic. */ #ifdef RATE_GENERIC void generic_rate(int txp, int txs, int txi, int rxp, int rxs, int rxi); #else #define generic_rate(txp, txs, txi, rxp, rxs, rxi) #endif /* * netmap_mitigation API. This is used by the generic adapter * to reduce the number of interrupt requests/selwakeup * to clients on incoming packets. */ void nm_os_mitigation_init(struct nm_generic_mit *mit, int idx, struct netmap_adapter *na); void nm_os_mitigation_start(struct nm_generic_mit *mit); void nm_os_mitigation_restart(struct nm_generic_mit *mit); int nm_os_mitigation_active(struct nm_generic_mit *mit); void nm_os_mitigation_cleanup(struct nm_generic_mit *mit); #else /* !WITH_GENERIC */ #define generic_netmap_attach(ifp) (EOPNOTSUPP) #define na_is_generic(na) (0) #endif /* WITH_GENERIC */ /* Shared declarations for the VALE switch. 
*/

/*
 * Each transmit queue accumulates a batch of packets into
 * a structure before forwarding. Packets to the same
 * destination are put in a list using ft_next as a link field.
 * ft_frags and ft_next are valid only on the first fragment.
 */
struct nm_bdg_fwd {	/* forwarding entry for a bridge */
	void *ft_buf;		/* netmap or indirect buffer */
	uint8_t ft_frags;	/* how many fragments (only on 1st frag) */
	uint16_t ft_offset;	/* dst port (unused) */
	uint16_t ft_flags;	/* flags, e.g. indirect */
	uint16_t ft_len;	/* src fragment len */
	uint16_t ft_next;	/* next packet to same destination */
};

/*
 * struct 'virtio_net_hdr' from linux.
 * The constants for the 'flags' and 'gso_type' fields are defined right
 * before the field they apply to.
 */
struct nm_vnet_hdr {
#define VIRTIO_NET_HDR_F_NEEDS_CSUM     1	/* Use csum_start, csum_offset */
#define VIRTIO_NET_HDR_F_DATA_VALID    2	/* Csum is valid */
	uint8_t flags;
#define VIRTIO_NET_HDR_GSO_NONE         0       /* Not a GSO frame */
#define VIRTIO_NET_HDR_GSO_TCPV4        1       /* GSO frame, IPv4 TCP (TSO) */
#define VIRTIO_NET_HDR_GSO_UDP          3       /* GSO frame, IPv4 UDP (UFO) */
#define VIRTIO_NET_HDR_GSO_TCPV6        4       /* GSO frame, IPv6 TCP */
#define VIRTIO_NET_HDR_GSO_ECN          0x80    /* TCP has ECN set */
	uint8_t gso_type;
	uint16_t hdr_len;
	uint16_t gso_size;
	uint16_t csum_start;
	uint16_t csum_offset;
};

/* NOTE(review): presumably Ethernet (14) + IPv6 (40) + largest TCP
 * header (60) -- confirm. */
#define WORST_CASE_GSO_HEADER	(14+40+60)  /* IPv6 + TCP */

/* Private definitions for IPv4, IPv6, UDP and TCP headers. */

/* IPv4 header, fixed part only. */
struct nm_iphdr {
	uint8_t		version_ihl;
	uint8_t		tos;
	uint16_t	tot_len;
	uint16_t	id;
	uint16_t	frag_off;
	uint8_t		ttl;
	uint8_t		protocol;
	uint16_t	check;
	uint32_t	saddr;
	uint32_t	daddr;
	/* The options start here. */
};

/* TCP header, without options. */
struct nm_tcphdr {
	uint16_t	source;
	uint16_t	dest;
	uint32_t	seq;
	uint32_t	ack_seq;
	uint8_t		doff;  /* Data offset + Reserved */
	uint8_t		flags;
	uint16_t	window;
	uint16_t	check;
	uint16_t	urg_ptr;
};

/* UDP header. */
struct nm_udphdr {
	uint16_t	source;
	uint16_t	dest;
	uint16_t	len;
	uint16_t	check;
};

/* IPv6 header, fixed part only. */
struct nm_ipv6hdr {
	uint8_t		priority_version;
	uint8_t		flow_lbl[3];

	uint16_t	payload_len;
	uint8_t		nexthdr;
	uint8_t		hop_limit;

	uint8_t		saddr[16];
	uint8_t		daddr[16];
};

/* Type used to store a checksum (in host byte order) that hasn't been
 * folded yet.
 */
#define rawsum_t uint32_t

/* Checksum helpers, implemented by the OS-specific code. */
rawsum_t nm_os_csum_raw(uint8_t *data, size_t len, rawsum_t cur_sum);
uint16_t nm_os_csum_ipv4(struct nm_iphdr *iph);
void nm_os_csum_tcpudp_ipv4(struct nm_iphdr *iph, void *data,
		      size_t datalen, uint16_t *check);
void nm_os_csum_tcpudp_ipv6(struct nm_ipv6hdr *ip6h, void *data,
		      size_t datalen, uint16_t *check);
uint16_t nm_os_csum_fold(rawsum_t cur_sum);

void bdg_mismatch_datapath(struct netmap_vp_adapter *na,
			   struct netmap_vp_adapter *dst_na,
			   const struct nm_bdg_fwd *ft_p,
			   struct netmap_ring *dst_ring,
			   u_int *j, u_int lim, u_int *howmany);

/* persistent virtual port routines */
int nm_os_vi_persist(const char *, struct ifnet **);
void nm_os_vi_detach(struct ifnet *);
void nm_os_vi_init_index(void);

/*
 * kernel thread routines
 */
struct nm_kctx; /* OS-specific kernel context - opaque */
/* Signature of the worker function run by a kernel context. */
typedef void (*nm_kctx_worker_fn_t)(void *data);

/* kthread configuration */
struct nm_kctx_cfg {
	long			type;		/* kthread type/identifier */
	nm_kctx_worker_fn_t	worker_fn;	/* worker function */
	void			*worker_private;/* worker parameter */
	int			attach_user;	/* attach kthread to user process */
};
/* kernel context routines (the context is built from a nm_kctx_cfg) */
struct nm_kctx *nm_os_kctx_create(struct nm_kctx_cfg *cfg,
					void *opaque);
int nm_os_kctx_worker_start(struct nm_kctx *);
void nm_os_kctx_worker_stop(struct nm_kctx *);
void nm_os_kctx_destroy(struct nm_kctx *);
void nm_os_kctx_worker_setaff(struct nm_kctx *, int);
u_int nm_os_ncpus(void);

int
netmap_sync_kloop(struct netmap_priv_d *priv, struct nmreq_header *hdr); int netmap_sync_kloop_stop(struct netmap_priv_d *priv); #ifdef WITH_PTNETMAP /* ptnetmap guest routines */ /* * ptnetmap_memdev routines used to talk with ptnetmap_memdev device driver */ struct ptnetmap_memdev; int nm_os_pt_memdev_iomap(struct ptnetmap_memdev *, vm_paddr_t *, void **, uint64_t *); void nm_os_pt_memdev_iounmap(struct ptnetmap_memdev *); uint32_t nm_os_pt_memdev_ioread(struct ptnetmap_memdev *, unsigned int); /* * netmap adapter for guest ptnetmap ports */ struct netmap_pt_guest_adapter { /* The netmap adapter to be used by netmap applications. * This field must be the first, to allow upcast. */ struct netmap_hw_adapter hwup; /* The netmap adapter to be used by the driver. */ struct netmap_hw_adapter dr; /* Reference counter to track users of backend netmap port: the * network stack and netmap clients. * Used to decide when we need (de)allocate krings/rings and * start (stop) ptnetmap kthreads. */ int backend_users; }; int netmap_pt_guest_attach(struct netmap_adapter *na, unsigned int nifp_offset, unsigned int memid); bool netmap_pt_guest_txsync(struct nm_csb_atok *atok, struct nm_csb_ktoa *ktoa, struct netmap_kring *kring, int flags); bool netmap_pt_guest_rxsync(struct nm_csb_atok *atok, struct nm_csb_ktoa *ktoa, struct netmap_kring *kring, int flags); int ptnet_nm_krings_create(struct netmap_adapter *na); void ptnet_nm_krings_delete(struct netmap_adapter *na); void ptnet_nm_dtor(struct netmap_adapter *na); /* Helper function wrapping nm_sync_kloop_appl_read(). */ static inline void ptnet_sync_tail(struct nm_csb_ktoa *ktoa, struct netmap_kring *kring) { struct netmap_ring *ring = kring->ring; /* Update hwcur and hwtail as known by the host. 
*/ nm_sync_kloop_appl_read(ktoa, &kring->nr_hwtail, &kring->nr_hwcur); /* nm_sync_finalize */ ring->tail = kring->rtail = kring->nr_hwtail; } #endif /* WITH_PTNETMAP */ #ifdef __FreeBSD__ /* * FreeBSD mbuf allocator/deallocator in emulation mode: */ #if __FreeBSD_version < 1100000 /* * For older versions of FreeBSD: * * We allocate EXT_PACKET mbuf+clusters, but need to set M_NOFREE * so that the destructor, if invoked, will not free the packet. * In principle we should set the destructor only on demand, * but since there might be a race we better do it on allocation. * As a consequence, we also need to set the destructor or we * would leak buffers. */ /* mbuf destructor, also need to change the type to EXT_EXTREF, * add an M_NOFREE flag, and then clear the flag and * chain into uma_zfree(zone_pack, mf) * (or reinstall the buffer ?) */ #define SET_MBUF_DESTRUCTOR(m, fn) do { \ (m)->m_ext.ext_free = (void *)fn; \ (m)->m_ext.ext_type = EXT_EXTREF; \ } while (0) static int void_mbuf_dtor(struct mbuf *m, void *arg1, void *arg2) { /* restore original mbuf */ m->m_ext.ext_buf = m->m_data = m->m_ext.ext_arg1; m->m_ext.ext_arg1 = NULL; m->m_ext.ext_type = EXT_PACKET; m->m_ext.ext_free = NULL; if (MBUF_REFCNT(m) == 0) SET_MBUF_REFCNT(m, 1); uma_zfree(zone_pack, m); return 0; } static inline struct mbuf * nm_os_get_mbuf(struct ifnet *ifp, int len) { struct mbuf *m; (void)ifp; m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); if (m) { /* m_getcl() (mb_ctor_mbuf) has an assert that checks that * M_NOFREE flag is not specified as third argument, * so we have to set M_NOFREE after m_getcl(). */ m->m_flags |= M_NOFREE; m->m_ext.ext_arg1 = m->m_ext.ext_buf; // XXX save m->m_ext.ext_free = (void *)void_mbuf_dtor; m->m_ext.ext_type = EXT_EXTREF; nm_prdis(5, "create m %p refcnt %d", m, MBUF_REFCNT(m)); } return m; } #else /* __FreeBSD_version >= 1100000 */ /* * Newer versions of FreeBSD, using a straightforward scheme. 
* * We allocate mbufs with m_gethdr(), since the mbuf header is needed * by the driver. We also attach a customly-provided external storage, * which in this case is a netmap buffer. When calling m_extadd(), however * we pass a NULL address, since the real address (and length) will be * filled in by nm_os_generic_xmit_frame() right before calling * if_transmit(). * * The dtor function does nothing, however we need it since mb_free_ext() * has a KASSERT(), checking that the mbuf dtor function is not NULL. */ #if __FreeBSD_version <= 1200050 static void void_mbuf_dtor(struct mbuf *m, void *arg1, void *arg2) { } #else /* __FreeBSD_version >= 1200051 */ /* The arg1 and arg2 pointers argument were removed by r324446, which * in included since version 1200051. */ static void void_mbuf_dtor(struct mbuf *m) { } #endif /* __FreeBSD_version >= 1200051 */ #define SET_MBUF_DESTRUCTOR(m, fn) do { \ (m)->m_ext.ext_free = (fn != NULL) ? \ (void *)fn : (void *)void_mbuf_dtor; \ } while (0) static inline struct mbuf * nm_os_get_mbuf(struct ifnet *ifp, int len) { struct mbuf *m; (void)ifp; (void)len; m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) { return m; } m_extadd(m, NULL /* buf */, 0 /* size */, void_mbuf_dtor, NULL, NULL, 0, EXT_NET_DRV); return m; } #endif /* __FreeBSD_version >= 1100000 */ #endif /* __FreeBSD__ */ struct nmreq_option * nmreq_findoption(struct nmreq_option *, uint16_t); int nmreq_checkduplicate(struct nmreq_option *); int netmap_init_bridges(void); void netmap_uninit_bridges(void); /* Functions to read and write CSB fields from the kernel. */ #if defined (linux) #define CSB_READ(csb, field, r) (get_user(r, &csb->field)) #define CSB_WRITE(csb, field, v) (put_user(v, &csb->field)) #else /* ! linux */ #define CSB_READ(csb, field, r) (r = fuword32(&csb->field)) #define CSB_WRITE(csb, field, v) (suword32(&csb->field, v)) #endif /* ! linux */ #endif /* _NET_NETMAP_KERN_H_ */