Index: stable/11/sys/dev/netmap/netmap_kern.h =================================================================== --- stable/11/sys/dev/netmap/netmap_kern.h (revision 343831) +++ stable/11/sys/dev/netmap/netmap_kern.h (revision 343832) @@ -1,2435 +1,2388 @@ /* * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo * Copyright (C) 2013-2016 Universita` di Pisa * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * $FreeBSD$ * * The header contains the definitions of constants and function * prototypes used only in kernelspace. 
*/ #ifndef _NET_NETMAP_KERN_H_ #define _NET_NETMAP_KERN_H_ #if defined(linux) #if defined(CONFIG_NETMAP_EXTMEM) #define WITH_EXTMEM #endif #if defined(CONFIG_NETMAP_VALE) #define WITH_VALE #endif #if defined(CONFIG_NETMAP_PIPE) #define WITH_PIPES #endif #if defined(CONFIG_NETMAP_MONITOR) #define WITH_MONITOR #endif #if defined(CONFIG_NETMAP_GENERIC) #define WITH_GENERIC #endif #if defined(CONFIG_NETMAP_PTNETMAP) #define WITH_PTNETMAP #endif #if defined(CONFIG_NETMAP_SINK) #define WITH_SINK #endif #if defined(CONFIG_NETMAP_NULL) #define WITH_NMNULL #endif #elif defined (_WIN32) #define WITH_VALE // comment out to disable VALE support #define WITH_PIPES #define WITH_MONITOR #define WITH_GENERIC #define WITH_NMNULL #else /* neither linux nor windows */ #define WITH_VALE // comment out to disable VALE support #define WITH_PIPES #define WITH_MONITOR #define WITH_GENERIC #define WITH_PTNETMAP /* ptnetmap guest support */ #define WITH_EXTMEM #define WITH_NMNULL #endif #if defined(__FreeBSD__) #include /* NOTE(review): include target missing (apparently lost in extraction) - restore from the repository */ #define likely(x) __builtin_expect((long)!!(x), 1L) #define unlikely(x) __builtin_expect((long)!!(x), 0L) #define __user #define NM_LOCK_T struct mtx /* low level spinlock, used to protect queues */ #define NM_MTX_T struct sx /* OS-specific mutex (sleepable) */ #define NM_MTX_INIT(m) sx_init(&(m), #m) #define NM_MTX_DESTROY(m) sx_destroy(&(m)) #define NM_MTX_LOCK(m) sx_xlock(&(m)) #define NM_MTX_SPINLOCK(m) while (!sx_try_xlock(&(m))) ; /* NOTE(review): expansion already ends in ';' - unsafe in an unbraced if/else */ #define NM_MTX_UNLOCK(m) sx_xunlock(&(m)) #define NM_MTX_ASSERT(m) sx_assert(&(m), SA_XLOCKED) #define NM_SELINFO_T struct nm_selinfo #define NM_SELRECORD_T struct thread #define MBUF_LEN(m) ((m)->m_pkthdr.len) #define MBUF_TXQ(m) ((m)->m_pkthdr.flowid) #define MBUF_TRANSMIT(na, ifp, m) ((na)->if_transmit(ifp, m)) #define GEN_TX_MBUF_IFP(m) ((m)->m_pkthdr.rcvif) #define NM_ATOMIC_T volatile int /* required by atomic/bitops.h */ /* atomic operations */ #include /* NOTE(review): include target missing here too */ #define NM_ATOMIC_TEST_AND_SET(p) (!atomic_cmpset_acq_int((p), 0, 1)) #define
NM_ATOMIC_CLEAR(p) atomic_store_rel_int((p), 0) #if __FreeBSD_version >= 1100030 #define WNA(_ifp) (_ifp)->if_netmap #else /* older FreeBSD */ #define WNA(_ifp) (_ifp)->if_pspare[0] #endif /* older FreeBSD */ #if __FreeBSD_version >= 1100005 struct netmap_adapter *netmap_getna(if_t ifp); #endif #if __FreeBSD_version >= 1100027 #define MBUF_REFCNT(m) ((m)->m_ext.ext_count) #define SET_MBUF_REFCNT(m, x) (m)->m_ext.ext_count = x /* NOTE(review): x and the whole expansion are unparenthesized */ #else #define MBUF_REFCNT(m) ((m)->m_ext.ref_cnt ? *((m)->m_ext.ref_cnt) : -1) #define SET_MBUF_REFCNT(m, x) *((m)->m_ext.ref_cnt) = x #endif #define MBUF_QUEUED(m) 1 struct nm_selinfo { struct selinfo si; struct mtx m; }; struct hrtimer { /* Not used in FreeBSD. */ }; #define NM_BNS_GET(b) #define NM_BNS_PUT(b) #elif defined (linux) #define NM_LOCK_T safe_spinlock_t // see bsd_glue.h #define NM_SELINFO_T wait_queue_head_t #define MBUF_LEN(m) ((m)->len) #define MBUF_TRANSMIT(na, ifp, m) \ ({ \ /* Avoid infinite recursion with generic. */ \ m->priority = NM_MAGIC_PRIORITY_TX; \ (((struct net_device_ops *)(na)->if_transmit)->ndo_start_xmit(m, ifp)); \ 0; \ }) /* See explanation in nm_os_generic_xmit_frame. */ #define GEN_TX_MBUF_IFP(m) ((struct ifnet *)skb_shinfo(m)->destructor_arg) #define NM_ATOMIC_T volatile long unsigned int #define NM_MTX_T struct mutex /* OS-specific sleepable lock */ #define NM_MTX_INIT(m) mutex_init(&(m)) #define NM_MTX_DESTROY(m) do { (void)(m); } while (0) #define NM_MTX_LOCK(m) mutex_lock(&(m)) #define NM_MTX_UNLOCK(m) mutex_unlock(&(m)) #define NM_MTX_ASSERT(m) mutex_is_locked(&(m)) /* NOTE(review): only evaluates the predicate, asserts nothing */ #ifndef DEV_NETMAP #define DEV_NETMAP #endif /* DEV_NETMAP */ #elif defined (__APPLE__) #warning apple support is incomplete.
#define likely(x) __builtin_expect(!!(x), 1) #define unlikely(x) __builtin_expect(!!(x), 0) #define NM_LOCK_T IOLock * #define NM_SELINFO_T struct selinfo #define MBUF_LEN(m) ((m)->m_pkthdr.len) #elif defined (_WIN32) #include "../../../WINDOWS/win_glue.h" #define NM_SELRECORD_T IO_STACK_LOCATION #define NM_SELINFO_T win_SELINFO // see win_glue.h #define NM_LOCK_T win_spinlock_t // see win_glue.h #define NM_MTX_T KGUARDED_MUTEX /* OS-specific mutex (sleepable) */ #define NM_MTX_INIT(m) KeInitializeGuardedMutex(&m); #define NM_MTX_DESTROY(m) do { (void)(m); } while (0) #define NM_MTX_LOCK(m) KeAcquireGuardedMutex(&(m)) #define NM_MTX_UNLOCK(m) KeReleaseGuardedMutex(&(m)) #define NM_MTX_ASSERT(m) assert(&m.Count>0) //These linknames are for the NDIS driver #define NETMAP_NDIS_LINKNAME_STRING L"\\DosDevices\\NMAPNDIS" #define NETMAP_NDIS_NTDEVICE_STRING L"\\Device\\NMAPNDIS" //Definition of internal driver-to-driver ioctl codes #define NETMAP_KERNEL_XCHANGE_POINTERS _IO('i', 180) #define NETMAP_KERNEL_SEND_SHUTDOWN_SIGNAL _IO_direct('i', 195) typedef struct hrtimer{ KTIMER timer; BOOLEAN active; KDPC deferred_proc; }; /* MSVC does not have likely/unlikely support */ #ifdef _MSC_VER #define likely(x) (x) #define unlikely(x) (x) #else #define likely(x) __builtin_expect((long)!!(x), 1L) #define unlikely(x) __builtin_expect((long)!!(x), 0L) #endif //_MSC_VER #else #error unsupported platform #endif /* end - platform-specific code */ #ifndef _WIN32 /* support for emulated sysctl */ #define SYSBEGIN(x) #define SYSEND #endif /* _WIN32 */ #define NM_ACCESS_ONCE(x) (*(volatile __typeof__(x) *)&(x)) #define NMG_LOCK_T NM_MTX_T #define NMG_LOCK_INIT() NM_MTX_INIT(netmap_global_lock) #define NMG_LOCK_DESTROY() NM_MTX_DESTROY(netmap_global_lock) #define NMG_LOCK() NM_MTX_LOCK(netmap_global_lock) #define NMG_UNLOCK() NM_MTX_UNLOCK(netmap_global_lock) #define NMG_LOCK_ASSERT() NM_MTX_ASSERT(netmap_global_lock) #if defined(__FreeBSD__) #define nm_prerr_int printf #define nm_prinf_int 
printf #elif defined (_WIN32) #define nm_prerr_int DbgPrint #define nm_prinf_int DbgPrint #elif defined(linux) #define nm_prerr_int(fmt, arg...) printk(KERN_ERR fmt, ##arg) #define nm_prinf_int(fmt, arg...) printk(KERN_INFO fmt, ##arg) #endif #define nm_prinf(format, ...) \ do { \ struct timeval __xxts; \ microtime(&__xxts); \ nm_prinf_int("%03d.%06d [%4d] %-25s " format "\n",\ (int)__xxts.tv_sec % 1000, (int)__xxts.tv_usec, \ __LINE__, __FUNCTION__, ##__VA_ARGS__); \ } while (0) #define nm_prerr(format, ...) \ do { \ struct timeval __xxts; \ microtime(&__xxts); \ nm_prerr_int("%03d.%06d [%4d] %-25s " format "\n",\ (int)__xxts.tv_sec % 1000, (int)__xxts.tv_usec, \ __LINE__, __FUNCTION__, ##__VA_ARGS__); \ } while (0) /* Disabled printf (used to be ND). */ #define nm_prdis(format, ...) /* Rate limited, lps indicates how many per second. */ #define nm_prlim(lps, format, ...) \ do { \ static int t0, __cnt; \ if (t0 != time_second) { \ t0 = time_second; \ __cnt = 0; \ } \ if (__cnt++ < lps) \ nm_prinf(format, ##__VA_ARGS__); \ } while (0) /* Old macros. 
*/ #define ND nm_prdis #define D nm_prerr #define RD nm_prlim struct netmap_adapter; struct nm_bdg_fwd; struct nm_bridge; struct netmap_priv_d; struct nm_bdg_args; /* os-specific NM_SELINFO_T initialization/destruction functions */ void nm_os_selinfo_init(NM_SELINFO_T *); void nm_os_selinfo_uninit(NM_SELINFO_T *); const char *nm_dump_buf(char *p, int len, int lim, char *dst); void nm_os_selwakeup(NM_SELINFO_T *si); void nm_os_selrecord(NM_SELRECORD_T *sr, NM_SELINFO_T *si); int nm_os_ifnet_init(void); void nm_os_ifnet_fini(void); void nm_os_ifnet_lock(void); void nm_os_ifnet_unlock(void); unsigned nm_os_ifnet_mtu(struct ifnet *ifp); void nm_os_get_module(void); void nm_os_put_module(void); void netmap_make_zombie(struct ifnet *); void netmap_undo_zombie(struct ifnet *); /* os independent alloc/realloc/free */ void *nm_os_malloc(size_t); void *nm_os_vmalloc(size_t); void *nm_os_realloc(void *, size_t new_size, size_t old_size); void nm_os_free(void *); void nm_os_vfree(void *); /* os specific attach/detach enter/exit-netmap-mode routines */ void nm_os_onattach(struct ifnet *); void nm_os_ondetach(struct ifnet *); void nm_os_onenter(struct ifnet *); void nm_os_onexit(struct ifnet *); /* passes a packet up to the host stack. * If the packet is sent (or dropped) immediately it returns NULL, * otherwise it links the packet to prev and returns m. * In this case, a final call with m=NULL and prev != NULL will send up * the entire chain to the host stack. */ void *nm_os_send_up(struct ifnet *, struct mbuf *m, struct mbuf *prev); int nm_os_mbuf_has_seg_offld(struct mbuf *m); int nm_os_mbuf_has_csum_offld(struct mbuf *m); #include "netmap_mbq.h" extern NMG_LOCK_T netmap_global_lock; /* ring direction; NR_TXRX is the number of directions (loop bound in for_rx_tx, array size) */ enum txrx { NR_RX = 0, NR_TX = 1, NR_TXRX }; /* printable name of a direction: "RX" for NR_RX, "TX" for anything else */ static __inline const char* nm_txrx2str(enum txrx t) { return (t== NR_RX ? "RX" : "TX"); } /* opposite direction: NR_TX for NR_RX, NR_RX for anything else */ static __inline enum txrx nm_txrx_swap(enum txrx t) { return (t== NR_RX ?
NR_TX : NR_RX); } /* iterate t over all ring directions */ #define for_rx_tx(t) for ((t) = 0; (t) < NR_TXRX; (t)++) #ifdef WITH_MONITOR struct netmap_zmon_list { struct netmap_kring *next; struct netmap_kring *prev; }; #endif /* WITH_MONITOR */ /* * private, kernel view of a ring. Keeps track of the status of * a ring across system calls. * * nr_hwcur index of the next buffer to refill. * It corresponds to ring->head * at the time the system call returns. * * nr_hwtail index of the first buffer owned by the kernel. * On RX, hwcur->hwtail are receive buffers * not yet released. hwcur is advanced following * ring->head, hwtail is advanced on incoming packets, * and a wakeup is generated when hwtail passes ring->cur * On TX, hwcur->rcur have been filled by the sender * but not sent yet to the NIC; rcur->hwtail are available * for new transmissions, and hwtail->hwcur-1 are pending * transmissions not yet acknowledged. * * The indexes in the NIC and netmap rings are offset by nkr_hwofs slots. * This is so that, on a reset, buffers owned by userspace are not * modified by the kernel. In particular: * RX rings: the next empty buffer (hwtail + hwofs) coincides with * the next empty buffer as known by the hardware (next_to_check or so). * TX rings: hwcur + hwofs coincides with next_to_send * * The following fields are used to implement lock-free copy of packets * from input to output ports in VALE switch: * nkr_hwlease buffer after the last one being copied. * A writer in nm_bdg_flush reserves N buffers * from nkr_hwlease, advances it, then does the * copy outside the lock. * In RX rings (used for VALE ports), * nr_hwtail <= nkr_hwlease < nr_hwcur+N-1 * In TX rings (used for NIC or host stack ports) * nr_hwcur <= nkr_hwlease < nr_hwtail * nkr_leases array of nkr_num_slots where writers can report * completion of their block.
NR_NOSLOT (~0) indicates * that the writer has not finished yet * nkr_lease_idx index of next free slot in nkr_leases, to be assigned * * The kring is manipulated by txsync/rxsync and generic netmap functions. * * Concurrent rxsync or txsync on the same ring are prevented * by nm_kr_(try)lock() which in turn uses nr_busy. This is all we need * for NIC rings, and for TX rings attached to the host stack. * * RX rings attached to the host stack use an mbq (rx_queue) on both * rxsync_from_host() and netmap_transmit(). The mbq is protected * by its internal lock. * * RX rings attached to the VALE switch are accessed by both senders * and receiver. They are protected through the q_lock on the RX ring. */ struct netmap_kring { struct netmap_ring *ring; uint32_t nr_hwcur; /* should be nr_hwhead */ uint32_t nr_hwtail; /* * Copies of values in user rings, so we do not need to look * at the ring (which could be modified). These are set in the * *sync_prologue()/finalize() routines. */ uint32_t rhead; uint32_t rcur; uint32_t rtail; uint32_t nr_kflags; /* private driver flags */ #define NKR_PENDINTR 0x1 // Pending interrupt. #define NKR_EXCLUSIVE 0x2 /* exclusive binding */ #define NKR_FORWARD 0x4 /* (host ring only) there are packets to forward */ #define NKR_NEEDRING 0x8 /* ring needed even if users==0 * (used internally by pipes and * by ptnetmap host ports) */ #define NKR_NOINTR 0x10 /* don't use interrupts on this ring */ #define NKR_FAKERING 0x20 /* don't allocate/free buffers */ uint32_t nr_mode; uint32_t nr_pending_mode; #define NKR_NETMAP_OFF 0x0 #define NKR_NETMAP_ON 0x1 uint32_t nkr_num_slots; /* * On a NIC reset, the NIC ring indexes may be reset but the * indexes in the netmap rings remain the same. nkr_hwofs * keeps track of the offset between the two. */ int32_t nkr_hwofs; /* last_reclaim is opaque marker to help reduce the frequency * of operations such as reclaiming tx buffers. A possible use * is set it to ticks and do the reclaim only once per tick.
*/ uint64_t last_reclaim; NM_SELINFO_T si; /* poll/select wait queue */ NM_LOCK_T q_lock; /* protects kring and ring. */ NM_ATOMIC_T nr_busy; /* prevent concurrent syscalls */ /* the adapter that owns this kring */ struct netmap_adapter *na; /* the adapter that wants to be notified when this kring has * new slots available. This is usually the same as the above, * but wrappers may let it point to themselves */ struct netmap_adapter *notify_na; /* The following fields are for VALE switch support */ struct nm_bdg_fwd *nkr_ft; uint32_t *nkr_leases; #define NR_NOSLOT ((uint32_t)~0) /* used in nkr_*lease* */ uint32_t nkr_hwlease; uint32_t nkr_lease_idx; /* while nkr_stopped is set, no new [tr]xsync operations can * be started on this kring. * This is used by netmap_disable_all_rings() * to find a synchronization point where critical data * structures pointed to by the kring can be added or removed */ volatile int nkr_stopped; /* Support for adapters without native netmap support. * On tx rings we preallocate an array of tx buffers * (same size as the netmap ring), on rx rings we * store incoming mbufs in a queue that is drained by * a rxsync. */ struct mbuf **tx_pool; struct mbuf *tx_event; /* TX event used as a notification */ NM_LOCK_T tx_event_lock; /* protects the tx_event mbuf */ struct mbq rx_queue; /* intercepted rx mbufs. */ uint32_t users; /* existing bindings for this ring */ uint32_t ring_id; /* kring identifier */ enum txrx tx; /* kind of ring (tx or rx) */ char name[64]; /* diagnostic */ /* [tr]xsync callback for this kring. * The default nm_kring_create callback (netmap_krings_create) * sets the nm_sync callback of each hardware tx(rx) kring to * the corresponding nm_txsync(nm_rxsync) taken from the * netmap_adapter; moreover, it sets the sync callback * of the host tx(rx) ring to netmap_txsync_to_host * (netmap_rxsync_from_host). * * Overrides: the above configuration is not changed by * any of the nm_krings_create callbacks.
*/ int (*nm_sync)(struct netmap_kring *kring, int flags); int (*nm_notify)(struct netmap_kring *kring, int flags); #ifdef WITH_PIPES struct netmap_kring *pipe; /* if this is a pipe ring, * pointer to the other end */ uint32_t pipe_tail; /* hwtail updated by the other end */ #endif /* WITH_PIPES */ int (*save_notify)(struct netmap_kring *kring, int flags); #ifdef WITH_MONITOR /* array of krings that are monitoring this kring */ struct netmap_kring **monitors; uint32_t max_monitors; /* current size of the monitors array */ uint32_t n_monitors; /* next unused entry in the monitor array */ uint32_t mon_pos[NR_TXRX]; /* index of this ring in the monitored ring array */ uint32_t mon_tail; /* last seen slot on rx */ /* circular list of zero-copy monitors */ struct netmap_zmon_list zmon_list[NR_TXRX]; /* * Monitors work by intercepting the sync and notify callbacks of the * monitored krings. This is implemented by replacing the pointers * above and saving the previous ones in mon_* pointers below */ int (*mon_sync)(struct netmap_kring *kring, int flags); int (*mon_notify)(struct netmap_kring *kring, int flags); #endif } #ifdef _WIN32 __declspec(align(64)); #else __attribute__((__aligned__(64))); #endif /* return 1 iff the kring needs to be turned on */ static inline int nm_kring_pending_on(struct netmap_kring *kring) { return kring->nr_pending_mode == NKR_NETMAP_ON && kring->nr_mode == NKR_NETMAP_OFF; } /* return 1 iff the kring needs to be turned off */ static inline int nm_kring_pending_off(struct netmap_kring *kring) { return kring->nr_pending_mode == NKR_NETMAP_OFF && kring->nr_mode == NKR_NETMAP_ON; } /* return the next index, with wraparound (lim is the last valid index) */ static inline uint32_t nm_next(uint32_t i, uint32_t lim) { return unlikely (i == lim) ? 0 : i + 1; } /* return the previous index, with wraparound (lim is the last valid index) */ static inline uint32_t nm_prev(uint32_t i, uint32_t lim) { return unlikely (i == 0) ? lim : i - 1; } /* * * Here is the layout for the Rx and Tx rings.
RxRING TxRING +-----------------+ +-----------------+ | | | | | free | | free | +-----------------+ +-----------------+ head->| owned by user |<-hwcur | not sent to nic |<-hwcur | | | yet | +-----------------+ | | cur->| available to | | | | user, not read | +-----------------+ | yet | cur->| (being | | | | prepared) | | | | | +-----------------+ + ------ + tail->| |<-hwtail | |<-hwlease | (being | ... | | ... | prepared) | ... | | ... +-----------------+ ... | | ... | |<-hwlease +-----------------+ | | tail->| |<-hwtail | | | | | | | | | | | | +-----------------+ +-----------------+ * The cur/tail (user view) and hwcur/hwtail (kernel view) * are used in the normal operation of the card. * * When a ring is the output of a switch port (Rx ring for * a VALE port, Tx ring for the host stack or NIC), slots * are reserved in blocks through 'hwlease' which points * to the next unused slot. * On an Rx ring, hwlease is always after hwtail, * and completions cause hwtail to advance. * On a Tx ring, hwlease is always between cur and hwtail, * and completions cause cur to advance. * * nm_kr_space() returns the maximum number of slots that * can be assigned. * nm_kr_lease() reserves the required number of buffers, * advances nkr_hwlease and also returns an entry in * a circular array where completions should be reported. */ struct lut_entry; #ifdef __FreeBSD__ #define plut_entry lut_entry #endif struct netmap_lut { struct lut_entry *lut; struct plut_entry *plut; uint32_t objtotal; /* max buffer index */ uint32_t objsize; /* buffer size */ }; struct netmap_vp_adapter; // forward struct nm_bridge; /* Struct to be filled by nm_config callbacks. */ struct nm_config_info { unsigned num_tx_rings; unsigned num_rx_rings; unsigned num_tx_descs; unsigned num_rx_descs; unsigned rx_buf_maxsize; }; /* * default type for the magic field. * May be overridden in glue code.
*/ #ifndef NM_OS_MAGIC #define NM_OS_MAGIC uint32_t #endif /* !NM_OS_MAGIC */ /* * The "struct netmap_adapter" extends the "struct adapter" * (or equivalent) device descriptor. * It contains all base fields needed to support netmap operation. * There are in fact different types of netmap adapters * (native, generic, VALE switch...) so a netmap_adapter is * just the first field in the derived type. */ struct netmap_adapter { /* * On linux we do not have a good way to tell if an interface * is netmap-capable. So we always use the following trick: * NA(ifp) points here, and the first entry (which hopefully * always exists and is at least 32 bits) contains a magic * value which we can use to detect that the interface is good. */ NM_OS_MAGIC magic; uint32_t na_flags; /* enabled, and other flags */ #define NAF_SKIP_INTR 1 /* use the regular interrupt handler. * useful during initialization */ #define NAF_SW_ONLY 2 /* forward packets only to sw adapter */ #define NAF_BDG_MAYSLEEP 4 /* the bridge is allowed to sleep when * forwarding packets coming from this * interface */ #define NAF_MEM_OWNER 8 /* the adapter uses its own memory area * that cannot be changed */ #define NAF_NATIVE 16 /* the adapter is native. * Virtual ports (non persistent vale ports, * pipes, monitors...) should never use * this flag. */ #define NAF_NETMAP_ON 32 /* netmap is active (either native or * emulated). Where possible (e.g. FreeBSD) * IFCAP_NETMAP also mirrors this flag. */ #define NAF_HOST_RINGS 64 /* the adapter supports the host rings */ #define NAF_FORCE_NATIVE 128 /* the adapter is always NATIVE */ /* 256 is free */ #define NAF_MOREFRAG 512 /* the adapter supports NS_MOREFRAG */ #define NAF_ZOMBIE (1U<<30) /* the nic driver has been unloaded */ #define NAF_BUSY (1U<<31) /* the adapter is used internally and * cannot be registered from userspace */ int active_fds; /* number of user-space descriptors using this interface, which is equal to the number of struct netmap_if objs in the mapped region.
*/ u_int num_rx_rings; /* number of adapter receive rings */ u_int num_tx_rings; /* number of adapter transmit rings */ u_int num_host_rx_rings; /* number of host receive rings */ u_int num_host_tx_rings; /* number of host transmit rings */ u_int num_tx_desc; /* number of descriptors in each queue */ u_int num_rx_desc; /* tx_rings and rx_rings are private but allocated as a * contiguous chunk of memory. Each array has N+K entries, * N for the hardware rings and K for the host rings. */ struct netmap_kring **tx_rings; /* array of TX rings. */ struct netmap_kring **rx_rings; /* array of RX rings. */ void *tailroom; /* space below the rings array */ /* (used for leases) */ NM_SELINFO_T si[NR_TXRX]; /* global wait queues */ /* count users of the global wait queues */ int si_users[NR_TXRX]; void *pdev; /* used to store pci device */ /* copy of if_qflush and if_transmit pointers, to intercept * packets from the network stack when netmap is active. */ int (*if_transmit)(struct ifnet *, struct mbuf *); /* copy of if_input for netmap_send_up() */ void (*if_input)(struct ifnet *, struct mbuf *); /* Back reference to the parent ifnet struct. Used for * hardware ports (emulated netmap included). */ struct ifnet *ifp; /* adapter is ifp->if_softc */ /*---- callbacks for this netmap adapter -----*/ /* * nm_dtor() is the cleanup routine called when destroying * the adapter. * Called with NMG_LOCK held. * * nm_register() is called on NIOCREGIF and close() to enter * or exit netmap mode on the NIC * Called with NMG_LOCK held. * * nm_txsync() pushes packets to the underlying hw/switch * * nm_rxsync() collects packets from the underlying hw/switch * * nm_config() returns configuration information from the OS * Called with NMG_LOCK held. * * nm_krings_create() create and init the tx_rings and * rx_rings arrays of kring structures. In particular, * set the nm_sync callbacks for each ring.
* There is no need to also allocate the corresponding * netmap_rings, since netmap_mem_rings_create() will always * be called to provide the missing ones. * Called with NMG_LOCK held. * * nm_krings_delete() cleanup and delete the tx_rings and rx_rings * arrays * Called with NMG_LOCK held. * * nm_notify() is used to act after data have become available * (or the stopped state of the ring has changed) * For hw devices this is typically a selwakeup(), * but for NIC/host ports attached to a switch (or vice-versa) * we also need to invoke the 'txsync' code downstream. * This callback pointer is actually used only to initialize * kring->nm_notify. * Return values are the same as for netmap_rx_irq(). */ void (*nm_dtor)(struct netmap_adapter *); int (*nm_register)(struct netmap_adapter *, int onoff); void (*nm_intr)(struct netmap_adapter *, int onoff); int (*nm_txsync)(struct netmap_kring *kring, int flags); int (*nm_rxsync)(struct netmap_kring *kring, int flags); int (*nm_notify)(struct netmap_kring *kring, int flags); #define NAF_FORCE_READ 1 #define NAF_FORCE_RECLAIM 2 #define NAF_CAN_FORWARD_DOWN 4 /* return configuration information */ int (*nm_config)(struct netmap_adapter *, struct nm_config_info *info); int (*nm_krings_create)(struct netmap_adapter *); void (*nm_krings_delete)(struct netmap_adapter *); /* * nm_bdg_attach() initializes the na_vp field to point * to an adapter that can be attached to a VALE switch. If the * current adapter is already a VALE port, na_vp is simply a cast; * otherwise, na_vp points to a netmap_bwrap_adapter. * If applicable, this callback also initializes na_hostvp, * that can be used to connect the adapter host rings to the * switch. * Called with NMG_LOCK held. * * nm_bdg_ctl() is called on the actual attach/detach to/from * the switch, to perform adapter-specific * initializations * Called with NMG_LOCK held.
*/ int (*nm_bdg_attach)(const char *bdg_name, struct netmap_adapter *, struct nm_bridge *); int (*nm_bdg_ctl)(struct nmreq_header *, struct netmap_adapter *); /* adapter used to attach this adapter to a VALE switch (if any) */ struct netmap_vp_adapter *na_vp; /* adapter used to attach the host rings of this adapter * to a VALE switch (if any) */ struct netmap_vp_adapter *na_hostvp; /* standard refcount to control the lifetime of the adapter * (it should be equal to the lifetime of the corresponding ifp) */ int na_refcount; /* memory allocator (opaque) * We also cache a pointer to the lut_entry for translating * buffer addresses, the total number of buffers and the buffer size. */ struct netmap_mem_d *nm_mem; struct netmap_mem_d *nm_mem_prev; struct netmap_lut na_lut; /* additional information attached to this adapter * by other netmap subsystems. Currently used by * bwrap, LINUX/v1000 and ptnetmap */ void *na_private; /* array of pipes that have this adapter as a parent */ struct netmap_pipe_adapter **na_pipes; int na_next_pipe; /* next free slot in the array */ int na_max_pipes; /* size of the array */ /* Offset of ethernet header for each packet. */ u_int virt_hdr_len; /* Max number of bytes that the NIC can store in the buffer * referenced by each RX descriptor. This translates to the maximum * bytes that a single netmap slot can reference. Larger packets * require NS_MOREFRAG support. */ unsigned rx_buf_maxsize; char name[NETMAP_REQ_IFNAMSIZ]; /* used at least by pipes */ #ifdef WITH_MONITOR unsigned long monitor_id; /* debugging */ #endif }; /* Per-direction accessors: NR_TX selects the tx field, any other value the rx field. */ static __inline u_int nma_get_ndesc(struct netmap_adapter *na, enum txrx t) { return (t == NR_TX ? na->num_tx_desc : na->num_rx_desc); } static __inline void nma_set_ndesc(struct netmap_adapter *na, enum txrx t, u_int v) { if (t == NR_TX) na->num_tx_desc = v; else na->num_rx_desc = v; } static __inline u_int nma_get_nrings(struct netmap_adapter *na, enum txrx t) { return (t == NR_TX ?
na->num_tx_rings : na->num_rx_rings); } static __inline u_int nma_get_host_nrings(struct netmap_adapter *na, enum txrx t) { return (t == NR_TX ? na->num_host_tx_rings : na->num_host_rx_rings); } static __inline void nma_set_nrings(struct netmap_adapter *na, enum txrx t, u_int v) { if (t == NR_TX) na->num_tx_rings = v; else na->num_rx_rings = v; } static __inline void nma_set_host_nrings(struct netmap_adapter *na, enum txrx t, u_int v) { if (t == NR_TX) na->num_host_tx_rings = v; else na->num_host_rx_rings = v; } /* kring array for direction t: tx_rings for NR_TX, rx_rings otherwise */ static __inline struct netmap_kring** NMR(struct netmap_adapter *na, enum txrx t) { return (t == NR_TX ? na->tx_rings : na->rx_rings); } int nma_intr_enable(struct netmap_adapter *na, int onoff); /* * If the NIC is owned by the kernel * (i.e., bridge), neither another bridge nor user can use it; * if the NIC is owned by a user, only users can share it. * Evaluation must be done under NMG_LOCK(). */ #define NETMAP_OWNED_BY_KERN(na) ((na)->na_flags & NAF_BUSY) #define NETMAP_OWNED_BY_ANY(na) \ (NETMAP_OWNED_BY_KERN(na) || ((na)->active_fds > 0)) /* * derived netmap adapters for various types of ports */ struct netmap_vp_adapter { /* VALE software port */ struct netmap_adapter up; /* * Bridge support: * * bdg_port is the port number used in the bridge; * na_bdg points to the bridge this NA is attached to. */ int bdg_port; struct nm_bridge *na_bdg; int retry; int autodelete; /* remove the ifp on last reference */ /* Maximum Frame Size, used in bdg_mismatch_datapath() */ u_int mfs; /* Last source MAC on this port */ uint64_t last_smac; }; struct netmap_hw_adapter { /* physical device */ struct netmap_adapter up; #ifdef linux struct net_device_ops nm_ndo; struct ethtool_ops nm_eto; #endif const struct ethtool_ops* save_ethtool; int (*nm_hw_register)(struct netmap_adapter *, int onoff); }; #ifdef WITH_GENERIC /* Mitigation support.
*/ struct nm_generic_mit { struct hrtimer mit_timer; int mit_pending; int mit_ring_idx; /* index of the ring being mitigated */ struct netmap_adapter *mit_na; /* backpointer */ }; struct netmap_generic_adapter { /* emulated device */ struct netmap_hw_adapter up; /* Pointer to a previously used netmap adapter. */ struct netmap_adapter *prev; /* Emulated netmap adapters support: * - save_if_input saves the if_input hook (FreeBSD); * - mit implements rx interrupt mitigation; */ void (*save_if_input)(struct ifnet *, struct mbuf *); struct nm_generic_mit *mit; #ifdef linux netdev_tx_t (*save_start_xmit)(struct mbuf *, struct ifnet *); #endif /* Is the adapter able to use multiple RX slots to scatter * each packet pushed up by the driver? */ int rxsg; /* Is the transmission path controlled by a netmap-aware * device queue (i.e. qdisc on linux)? */ int txqdisc; }; #endif /* WITH_GENERIC */ /* hardware rings plus, when NAF_HOST_RINGS is set, the host rings */ static __inline u_int netmap_real_rings(struct netmap_adapter *na, enum txrx t) { return nma_get_nrings(na, t) + !!(na->na_flags & NAF_HOST_RINGS) * nma_get_host_nrings(na, t); } /* account for fake rings: never less than the hardware rings plus one */ static __inline u_int netmap_all_rings(struct netmap_adapter *na, enum txrx t) { return max(nma_get_nrings(na, t) + 1, netmap_real_rings(na, t)); } int netmap_default_bdg_attach(const char *name, struct netmap_adapter *na, struct nm_bridge *); struct nm_bdg_polling_state; /* * Bridge wrapper for non VALE ports attached to a VALE switch. * * The real device must already have its own netmap adapter (hwna).
* The bridge wrapper and the hwna adapter share the same set of * netmap rings and buffers, but they have two separate sets of * krings descriptors, with tx/rx meanings swapped: * * netmap * bwrap krings rings krings hwna * +------+ +------+ +-----+ +------+ +------+ * |tx_rings->| |\ /| |----| |<-tx_rings| * | | +------+ \ / +-----+ +------+ | | * | | X | | * | | / \ | | * | | +------+/ \+-----+ +------+ | | * |rx_rings->| | | |----| |<-rx_rings| * | | +------+ +-----+ +------+ | | * +------+ +------+ * * - packets coming from the bridge go to the bwrap rx rings, * which are also the hwna tx rings. The bwrap notify callback * will then complete the hwna tx (see netmap_bwrap_notify). * * - packets coming from the outside go to the hwna rx rings, * which are also the bwrap tx rings. The (overwritten) hwna * notify method will then complete the bridge tx * (see netmap_bwrap_intr_notify). * * The bridge wrapper may optionally connect the hwna 'host' rings * to the bridge. This is done by using a second port in the * bridge and connecting it to the 'host' netmap_vp_adapter * contained in the netmap_bwrap_adapter. The bwrap host adapter * cross-links the hwna host rings in the same way as shown above. * * - packets coming from the bridge and directed to the host stack * are handled by the bwrap host notify callback * (see netmap_bwrap_host_notify) * * - packets coming from the host stack are still handled by the * overwritten hwna notify callback (netmap_bwrap_intr_notify), * but are diverted to the host adapter depending on the ring number. * */ struct netmap_bwrap_adapter { struct netmap_vp_adapter up; struct netmap_vp_adapter host; /* for host rings */ struct netmap_adapter *hwna; /* the underlying device */ /* * When we attach a physical interface to the bridge, we * allow the controlling process to terminate, so we need * a place to store the netmap_priv_d data structure. * This is only done when physical interfaces * are attached to a bridge.
*/ struct netmap_priv_d *na_kpriv; struct nm_bdg_polling_state *na_polling_state; /* we overwrite the hwna->na_vp pointer, so we save * here its original value, to be restored at detach */ struct netmap_vp_adapter *saved_na_vp; }; int nm_bdg_polling(struct nmreq_header *hdr); #ifdef WITH_VALE int netmap_vale_attach(struct nmreq_header *hdr, void *auth_token); int netmap_vale_detach(struct nmreq_header *hdr, void *auth_token); int netmap_vale_list(struct nmreq_header *hdr); int netmap_vi_create(struct nmreq_header *hdr, int); int nm_vi_create(struct nmreq_header *); int nm_vi_destroy(const char *name); #else /* !WITH_VALE */ #define netmap_vi_create(hdr, a) (EOPNOTSUPP) #endif /* WITH_VALE */ #ifdef WITH_PIPES #define NM_MAXPIPES 64 /* max number of pipes per adapter */ struct netmap_pipe_adapter { /* pipe identifier is up.name */ struct netmap_adapter up; #define NM_PIPE_ROLE_MASTER 0x1 #define NM_PIPE_ROLE_SLAVE 0x2 int role; /* either NM_PIPE_ROLE_MASTER or NM_PIPE_ROLE_SLAVE */ struct netmap_adapter *parent; /* adapter that owns the memory */ struct netmap_pipe_adapter *peer; /* the other end of the pipe */ int peer_ref; /* 1 iff we are holding a ref to the peer */ struct ifnet *parent_ifp; /* maybe null */ u_int parent_slot; /* index in the parent pipe array */ }; #endif /* WITH_PIPES */ #ifdef WITH_NMNULL struct netmap_null_adapter { struct netmap_adapter up; }; #endif /* WITH_NMNULL */ /* return slots reserved to rx clients (distance from nr_hwcur to nr_hwtail, * wrapped by nkr_num_slots); used in drivers */ static inline uint32_t nm_kr_rxspace(struct netmap_kring *k) { int space = k->nr_hwtail - k->nr_hwcur; if (space < 0) space += k->nkr_num_slots; ND("preserving %d rx slots %d -> %d", space, k->nr_hwcur, k->nr_hwtail); return space; } /* return slots reserved to tx clients */ #define nm_kr_txspace(_k) nm_kr_rxspace(_k) /* True if no space in the tx ring, only valid after txsync_prologue */ static inline int nm_kr_txempty(struct netmap_kring *kring) { - return kring->rcur == kring->nr_hwtail; + return kring->rhead ==
kring->nr_hwtail; } /* True if no more completed slots in the rx ring, only valid after * rxsync_prologue */ #define nm_kr_rxempty(_k) nm_kr_txempty(_k) /* * protect against multiple threads using the same ring. * also check that the ring has not been stopped or locked */ #define NM_KR_BUSY 1 /* some other thread is syncing the ring */ #define NM_KR_STOPPED 2 /* unbounded stop (ifconfig down or driver unload) */ #define NM_KR_LOCKED 3 /* bounded, brief stop for mutual exclusion */ /* release the previously acquired right to use the *sync() methods of the ring */ static __inline void nm_kr_put(struct netmap_kring *kr) { NM_ATOMIC_CLEAR(&kr->nr_busy); } /* true if the ifp that backed the adapter has disappeared (e.g., the * driver has been unloaded) */ static inline int nm_iszombie(struct netmap_adapter *na); /* try to obtain exclusive right to issue the *sync() operations on the ring. * The right is obtained and must be later relinquished via nm_kr_put() if and * only if nm_kr_tryget() returns 0. * If can_sleep is 1 there are only two other possible outcomes: * - the function returns NM_KR_BUSY * - the function returns NM_KR_STOPPED and sets the POLLERR bit in *perr * (if non-null) * In both cases the caller will typically skip the ring, possibly collecting * errors along the way. * If the calling context does not allow sleeping, the caller must pass 0 in can_sleep. * In the latter case, the function may also return NM_KR_LOCKED and leave *perr * untouched: ideally, the caller should try again at a later time.
*/ static __inline int nm_kr_tryget(struct netmap_kring *kr, int can_sleep, int *perr) { int busy = 1, stopped; /* check a first time without taking the lock * to avoid starvation for nm_kr_get() */ retry: stopped = kr->nkr_stopped; if (unlikely(stopped)) { goto stop; } busy = NM_ATOMIC_TEST_AND_SET(&kr->nr_busy); /* we should not return NM_KR_BUSY if the ring was * actually stopped, so check another time after * the barrier provided by the atomic operation */ stopped = kr->nkr_stopped; if (unlikely(stopped)) { goto stop; } if (unlikely(nm_iszombie(kr->na))) { stopped = NM_KR_STOPPED; goto stop; } return unlikely(busy) ? NM_KR_BUSY : 0; stop: if (!busy) nm_kr_put(kr); if (stopped == NM_KR_STOPPED) { /* if POLLERR is defined we want to use it to simplify netmap_poll(). * Otherwise, any non-zero value will do. */ #ifdef POLLERR #define NM_POLLERR POLLERR #else #define NM_POLLERR 1 #endif /* POLLERR */ if (perr) *perr |= NM_POLLERR; #undef NM_POLLERR } else if (can_sleep) { tsleep(kr, 0, "NM_KR_TRYGET", 4); goto retry; } return stopped; } /* put the ring in the 'stopped' state and wait for the current user (if any) to * notice. stopped must be either NM_KR_STOPPED or NM_KR_LOCKED */ static __inline void nm_kr_stop(struct netmap_kring *kr, int stopped) { kr->nkr_stopped = stopped; while (NM_ATOMIC_TEST_AND_SET(&kr->nr_busy)) tsleep(kr, 0, "NM_KR_GET", 4); } /* restart a ring after a stop */ static __inline void nm_kr_start(struct netmap_kring *kr) { kr->nkr_stopped = 0; nm_kr_put(kr); } /* * The following functions are used by individual drivers to * support netmap operation. * * netmap_attach() initializes a struct netmap_adapter, allocating the * struct netmap_ring's and the struct selinfo. * * netmap_detach() frees the memory allocated by netmap_attach(). * * netmap_transmit() replaces the if_transmit routine of the interface, * and is used to intercept packets coming from the stack. 
* * netmap_load_map/netmap_reload_map are helper routines to set/reset * the dmamap for a packet buffer * * netmap_reset() is a helper routine to be called in the hw driver * when reinitializing a ring. It should not be called by * virtual ports (vale, pipes, monitor) */ int netmap_attach(struct netmap_adapter *); int netmap_attach_ext(struct netmap_adapter *, size_t size, int override_reg); void netmap_detach(struct ifnet *); int netmap_transmit(struct ifnet *, struct mbuf *); struct netmap_slot *netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n, u_int new_cur); int netmap_ring_reinit(struct netmap_kring *); int netmap_rings_config_get(struct netmap_adapter *, struct nm_config_info *); /* Return codes for netmap_*x_irq. */ enum { /* Driver should do normal interrupt processing, e.g. because * the interface is not in netmap mode. */ NM_IRQ_PASS = 0, /* Port is in netmap mode, and the interrupt work has been * completed. The driver does not have to notify netmap * again before the next interrupt. */ NM_IRQ_COMPLETED = -1, /* Port is in netmap mode, but the interrupt work has not been * completed. The driver has to make sure netmap will be * notified again soon, even if no more interrupts come (e.g. * on Linux the driver should not call napi_complete()). 
*/ NM_IRQ_RESCHED = -2, }; /* default functions to handle rx/tx interrupts */ int netmap_rx_irq(struct ifnet *, u_int, u_int *); #define netmap_tx_irq(_n, _q) netmap_rx_irq(_n, _q, NULL) int netmap_common_irq(struct netmap_adapter *, u_int, u_int *work_done); #ifdef WITH_VALE /* functions used by external modules to interface with VALE */ #define netmap_vp_to_ifp(_vp) ((_vp)->up.ifp) #define netmap_ifp_to_vp(_ifp) (NA(_ifp)->na_vp) #define netmap_ifp_to_host_vp(_ifp) (NA(_ifp)->na_hostvp) #define netmap_bdg_idx(_vp) ((_vp)->bdg_port) const char *netmap_bdg_name(struct netmap_vp_adapter *); #else /* !WITH_VALE */ #define netmap_vp_to_ifp(_vp) NULL #define netmap_ifp_to_vp(_ifp) NULL #define netmap_ifp_to_host_vp(_ifp) NULL #define netmap_bdg_idx(_vp) -1 #endif /* WITH_VALE */ static inline int nm_netmap_on(struct netmap_adapter *na) { return na && na->na_flags & NAF_NETMAP_ON; } static inline int nm_native_on(struct netmap_adapter *na) { return nm_netmap_on(na) && (na->na_flags & NAF_NATIVE); } static inline int nm_iszombie(struct netmap_adapter *na) { return na == NULL || (na->na_flags & NAF_ZOMBIE); } static inline void nm_update_hostrings_mode(struct netmap_adapter *na) { /* Process nr_mode and nr_pending_mode for host rings. */ na->tx_rings[na->num_tx_rings]->nr_mode = na->tx_rings[na->num_tx_rings]->nr_pending_mode; na->rx_rings[na->num_rx_rings]->nr_mode = na->rx_rings[na->num_rx_rings]->nr_pending_mode; } void nm_set_native_flags(struct netmap_adapter *); void nm_clear_native_flags(struct netmap_adapter *); /* * nm_*sync_prologue() functions are used in ioctl/poll and ptnetmap * kthreads. * We need netmap_ring* parameter, because in ptnetmap it is decoupled * from host kring. * The user-space ring pointers (head/cur/tail) are shared through * CSB between host and guest. */ /* * validates parameters in the ring/kring, returns a value for head * If any error, returns ring_size to force a reinit. 
*/ uint32_t nm_txsync_prologue(struct netmap_kring *, struct netmap_ring *); /* * validates parameters in the ring/kring, returns a value for head * If any error, returns ring_size lim to force a reinit. */ uint32_t nm_rxsync_prologue(struct netmap_kring *, struct netmap_ring *); /* check/fix address and len in tx rings */ #if 1 /* debug version */ #define NM_CHECK_ADDR_LEN(_na, _a, _l) do { \ if (_a == NETMAP_BUF_BASE(_na) || _l > NETMAP_BUF_SIZE(_na)) { \ RD(5, "bad addr/len ring %d slot %d idx %d len %d", \ kring->ring_id, nm_i, slot->buf_idx, len); \ if (_l > NETMAP_BUF_SIZE(_na)) \ _l = NETMAP_BUF_SIZE(_na); \ } } while (0) #else /* no debug version */ #define NM_CHECK_ADDR_LEN(_na, _a, _l) do { \ if (_l > NETMAP_BUF_SIZE(_na)) \ _l = NETMAP_BUF_SIZE(_na); \ } while (0) #endif /*---------------------------------------------------------------*/ /* * Support routines used by netmap subsystems * (native drivers, VALE, generic, pipes, monitors, ...) */ /* common routine for all functions that create a netmap adapter. It performs * two main tasks: * - if the na points to an ifp, mark the ifp as netmap capable * using na as its native adapter; * - provide defaults for the setup callbacks and the memory allocator */ int netmap_attach_common(struct netmap_adapter *); /* fill priv->np_[tr]xq{first,last} using the ringid and flags information * coming from a struct nmreq_register */ int netmap_interp_ringid(struct netmap_priv_d *priv, uint32_t nr_mode, uint16_t nr_ringid, uint64_t nr_flags); /* update the ring parameters (number and size of tx and rx rings). * It calls the nm_config callback, if available. */ int netmap_update_config(struct netmap_adapter *na); /* create and initialize the common fields of the krings array. * using the information that must be already available in the na. * tailroom can be used to request the allocation of additional * tailroom bytes after the krings array. 
This is used by * netmap_vp_adapter's (i.e., VALE ports) to make room for * leasing-related data structures */ int netmap_krings_create(struct netmap_adapter *na, u_int tailroom); /* deletes the kring array of the adapter. The array must have * been created using netmap_krings_create */ void netmap_krings_delete(struct netmap_adapter *na); int netmap_hw_krings_create(struct netmap_adapter *na); void netmap_hw_krings_delete(struct netmap_adapter *na); /* set the stopped/enabled status of ring * When stopping, they also wait for all current activity on the ring to * terminate. The status change is then notified using the na nm_notify * callback. */ void netmap_set_ring(struct netmap_adapter *, u_int ring_id, enum txrx, int stopped); /* set the stopped/enabled status of all rings of the adapter. */ void netmap_set_all_rings(struct netmap_adapter *, int stopped); /* convenience wrappers for netmap_set_all_rings */ void netmap_disable_all_rings(struct ifnet *); void netmap_enable_all_rings(struct ifnet *); int netmap_buf_size_validate(const struct netmap_adapter *na, unsigned mtu); int netmap_do_regif(struct netmap_priv_d *priv, struct netmap_adapter *na, uint32_t nr_mode, uint16_t nr_ringid, uint64_t nr_flags); void netmap_do_unregif(struct netmap_priv_d *priv); u_int nm_bound_var(u_int *v, u_int dflt, u_int lo, u_int hi, const char *msg); int netmap_get_na(struct nmreq_header *hdr, struct netmap_adapter **na, struct ifnet **ifp, struct netmap_mem_d *nmd, int create); void netmap_unget_na(struct netmap_adapter *na, struct ifnet *ifp); int netmap_get_hw_na(struct ifnet *ifp, struct netmap_mem_d *nmd, struct netmap_adapter **na); #ifdef WITH_VALE uint32_t netmap_vale_learning(struct nm_bdg_fwd *ft, uint8_t *dst_ring, struct netmap_vp_adapter *, void *private_data); /* these are redefined in case of no VALE support */ int netmap_get_vale_na(struct nmreq_header *hdr, struct netmap_adapter **na, struct netmap_mem_d *nmd, int create); void *netmap_vale_create(const char 
*bdg_name, int *return_status); int netmap_vale_destroy(const char *bdg_name, void *auth_token); #else /* !WITH_VALE */ #define netmap_bdg_learning(_1, _2, _3, _4) 0 #define netmap_get_vale_na(_1, _2, _3, _4) 0 #define netmap_bdg_create(_1, _2) NULL #define netmap_bdg_destroy(_1, _2) 0 #endif /* !WITH_VALE */ #ifdef WITH_PIPES /* max number of pipes per device */ #define NM_MAXPIPES 64 /* XXX this should probably be a sysctl */ void netmap_pipe_dealloc(struct netmap_adapter *); int netmap_get_pipe_na(struct nmreq_header *hdr, struct netmap_adapter **na, struct netmap_mem_d *nmd, int create); #else /* !WITH_PIPES */ #define NM_MAXPIPES 0 #define netmap_pipe_alloc(_1, _2) 0 #define netmap_pipe_dealloc(_1) #define netmap_get_pipe_na(hdr, _2, _3, _4) \ ((strchr(hdr->nr_name, '{') != NULL || strchr(hdr->nr_name, '}') != NULL) ? EOPNOTSUPP : 0) #endif #ifdef WITH_MONITOR int netmap_get_monitor_na(struct nmreq_header *hdr, struct netmap_adapter **na, struct netmap_mem_d *nmd, int create); void netmap_monitor_stop(struct netmap_adapter *na); #else #define netmap_get_monitor_na(hdr, _2, _3, _4) \ (((struct nmreq_register *)(uintptr_t)hdr->nr_body)->nr_flags & (NR_MONITOR_TX | NR_MONITOR_RX) ? EOPNOTSUPP : 0) #endif #ifdef WITH_NMNULL int netmap_get_null_na(struct nmreq_header *hdr, struct netmap_adapter **na, struct netmap_mem_d *nmd, int create); #else /* !WITH_NMNULL */ #define netmap_get_null_na(hdr, _2, _3, _4) \ (((struct nmreq_register *)(uintptr_t)hdr->nr_body)->nr_flags & (NR_MONITOR_TX | NR_MONITOR_RX) ? 
EOPNOTSUPP : 0) #endif /* WITH_NMNULL */ #ifdef CONFIG_NET_NS struct net *netmap_bns_get(void); void netmap_bns_put(struct net *); void netmap_bns_getbridges(struct nm_bridge **, u_int *); #else extern struct nm_bridge *nm_bridges; #define netmap_bns_get() #define netmap_bns_put(_1) #define netmap_bns_getbridges(b, n) \ do { *b = nm_bridges; *n = NM_BRIDGES; } while (0) #endif /* Various prototypes */ int netmap_poll(struct netmap_priv_d *, int events, NM_SELRECORD_T *td); int netmap_init(void); void netmap_fini(void); int netmap_get_memory(struct netmap_priv_d* p); void netmap_dtor(void *data); int netmap_ioctl(struct netmap_priv_d *priv, u_long cmd, caddr_t data, struct thread *, int nr_body_is_user); int netmap_ioctl_legacy(struct netmap_priv_d *priv, u_long cmd, caddr_t data, struct thread *td); size_t nmreq_size_by_type(uint16_t nr_reqtype); /* netmap_adapter creation/destruction */ // #define NM_DEBUG_PUTGET 1 #ifdef NM_DEBUG_PUTGET #define NM_DBG(f) __##f void __netmap_adapter_get(struct netmap_adapter *na); #define netmap_adapter_get(na) \ do { \ struct netmap_adapter *__na = na; \ D("getting %p:%s (%d)", __na, (__na)->name, (__na)->na_refcount); \ __netmap_adapter_get(__na); \ } while (0) int __netmap_adapter_put(struct netmap_adapter *na); #define netmap_adapter_put(na) \ ({ \ struct netmap_adapter *__na = na; \ D("putting %p:%s (%d)", __na, (__na)->name, (__na)->na_refcount); \ __netmap_adapter_put(__na); \ }) #else /* !NM_DEBUG_PUTGET */ #define NM_DBG(f) f void netmap_adapter_get(struct netmap_adapter *na); int netmap_adapter_put(struct netmap_adapter *na); #endif /* !NM_DEBUG_PUTGET */ /* * module variables */ #define NETMAP_BUF_BASE(_na) ((_na)->na_lut.lut[0].vaddr) #define NETMAP_BUF_SIZE(_na) ((_na)->na_lut.objsize) extern int netmap_no_pendintr; extern int netmap_mitigate; extern int netmap_verbose; #ifdef CONFIG_NETMAP_DEBUG extern int netmap_debug; /* for debugging */ #else /* !CONFIG_NETMAP_DEBUG */ #define netmap_debug (0) #endif /* 
!CONFIG_NETMAP_DEBUG */ enum { /* debug flags */ NM_DEBUG_ON = 1, /* generic debug messsages */ NM_DEBUG_HOST = 0x2, /* debug host stack */ NM_DEBUG_RXSYNC = 0x10, /* debug on rxsync/txsync */ NM_DEBUG_TXSYNC = 0x20, NM_DEBUG_RXINTR = 0x100, /* debug on rx/tx intr (driver) */ NM_DEBUG_TXINTR = 0x200, NM_DEBUG_NIC_RXSYNC = 0x1000, /* debug on rx/tx intr (driver) */ NM_DEBUG_NIC_TXSYNC = 0x2000, NM_DEBUG_MEM = 0x4000, /* verbose memory allocations/deallocations */ NM_DEBUG_VALE = 0x8000, /* debug messages from memory allocators */ NM_DEBUG_BDG = NM_DEBUG_VALE, }; extern int netmap_txsync_retry; extern int netmap_flags; extern int netmap_generic_hwcsum; extern int netmap_generic_mit; extern int netmap_generic_ringsize; extern int netmap_generic_rings; #ifdef linux extern int netmap_generic_txqdisc; #endif /* * NA returns a pointer to the struct netmap adapter from the ifp. * WNA is os-specific and must be defined in glue code. */ #define NA(_ifp) ((struct netmap_adapter *)WNA(_ifp)) /* * we provide a default implementation of NM_ATTACH_NA/NM_DETACH_NA * based on the WNA field. * Glue code may override this by defining its own NM_ATTACH_NA */ #ifndef NM_ATTACH_NA /* * On old versions of FreeBSD, NA(ifp) is a pspare. On linux we * overload another pointer in the netdev. * * We check if NA(ifp) is set and its first element has a related * magic value. The capenable is within the struct netmap_adapter. 
*/ #define NETMAP_MAGIC 0x52697a7a #define NM_NA_VALID(ifp) (NA(ifp) && \ ((uint32_t)(uintptr_t)NA(ifp) ^ NA(ifp)->magic) == NETMAP_MAGIC ) #define NM_ATTACH_NA(ifp, na) do { \ WNA(ifp) = na; \ if (NA(ifp)) \ NA(ifp)->magic = \ ((uint32_t)(uintptr_t)NA(ifp)) ^ NETMAP_MAGIC; \ } while(0) #define NM_RESTORE_NA(ifp, na) WNA(ifp) = na; #define NM_DETACH_NA(ifp) do { WNA(ifp) = NULL; } while (0) #define NM_NA_CLASH(ifp) (NA(ifp) && !NM_NA_VALID(ifp)) #endif /* !NM_ATTACH_NA */ #define NM_IS_NATIVE(ifp) (NM_NA_VALID(ifp) && NA(ifp)->nm_dtor == netmap_hw_dtor) #if defined(__FreeBSD__) /* Assigns the device IOMMU domain to an allocator. * Returns -ENOMEM in case the domain is different */ #define nm_iommu_group_id(dev) (0) /* Callback invoked by the dma machinery after a successful dmamap_load */ static void netmap_dmamap_cb(__unused void *arg, __unused bus_dma_segment_t * segs, __unused int nseg, __unused int error) { } /* bus_dmamap_load wrapper: call aforementioned function if map != NULL. * XXX can we do it without a callback ? */ static inline int netmap_load_map(struct netmap_adapter *na, bus_dma_tag_t tag, bus_dmamap_t map, void *buf) { if (map) bus_dmamap_load(tag, map, buf, NETMAP_BUF_SIZE(na), netmap_dmamap_cb, NULL, BUS_DMA_NOWAIT); return 0; } static inline void netmap_unload_map(struct netmap_adapter *na, bus_dma_tag_t tag, bus_dmamap_t map) { if (map) bus_dmamap_unload(tag, map); } #define netmap_sync_map(na, tag, map, sz, t) /* update the map when a buffer changes. 
*/ static inline void netmap_reload_map(struct netmap_adapter *na, bus_dma_tag_t tag, bus_dmamap_t map, void *buf) { if (map) { bus_dmamap_unload(tag, map); bus_dmamap_load(tag, map, buf, NETMAP_BUF_SIZE(na), netmap_dmamap_cb, NULL, BUS_DMA_NOWAIT); } } #elif defined(_WIN32) #else /* linux */ int nm_iommu_group_id(bus_dma_tag_t dev); #include /* * on linux we need * dma_map_single(&pdev->dev, virt_addr, len, direction) * dma_unmap_single(&adapter->pdev->dev, phys_addr, len, direction) */ #if 0 struct e1000_buffer *buffer_info = &tx_ring->buffer_info[l]; /* set time_stamp *before* dma to help avoid a possible race */ buffer_info->time_stamp = jiffies; buffer_info->mapped_as_page = false; buffer_info->length = len; //buffer_info->next_to_watch = l; /* reload dma map */ dma_unmap_single(&adapter->pdev->dev, buffer_info->dma, NETMAP_BUF_SIZE, DMA_TO_DEVICE); buffer_info->dma = dma_map_single(&adapter->pdev->dev, addr, NETMAP_BUF_SIZE, DMA_TO_DEVICE); if (dma_mapping_error(&adapter->pdev->dev, buffer_info->dma)) { D("dma mapping error"); /* goto dma_error; See e1000_put_txbuf() */ /* XXX reset */ } tx_desc->buffer_addr = htole64(buffer_info->dma); //XXX #endif static inline int netmap_load_map(struct netmap_adapter *na, bus_dma_tag_t tag, bus_dmamap_t map, void *buf, u_int size) { if (map) { *map = dma_map_single(na->pdev, buf, size, DMA_BIDIRECTIONAL); if (dma_mapping_error(na->pdev, *map)) { *map = 0; return ENOMEM; } } return 0; } static inline void netmap_unload_map(struct netmap_adapter *na, bus_dma_tag_t tag, bus_dmamap_t map, u_int sz) { if (*map) { dma_unmap_single(na->pdev, *map, sz, DMA_BIDIRECTIONAL); } } #ifdef NETMAP_LINUX_HAVE_DMASYNC static inline void netmap_sync_map_cpu(struct netmap_adapter *na, bus_dma_tag_t tag, bus_dmamap_t map, u_int sz, enum txrx t) { if (*map) { dma_sync_single_for_cpu(na->pdev, *map, sz, (t == NR_TX ? 
DMA_TO_DEVICE : DMA_FROM_DEVICE)); } } static inline void netmap_sync_map_dev(struct netmap_adapter *na, bus_dma_tag_t tag, bus_dmamap_t map, u_int sz, enum txrx t) { if (*map) { dma_sync_single_for_device(na->pdev, *map, sz, (t == NR_TX ? DMA_TO_DEVICE : DMA_FROM_DEVICE)); } } static inline void netmap_reload_map(struct netmap_adapter *na, bus_dma_tag_t tag, bus_dmamap_t map, void *buf) { u_int sz = NETMAP_BUF_SIZE(na); if (*map) { dma_unmap_single(na->pdev, *map, sz, DMA_BIDIRECTIONAL); } *map = dma_map_single(na->pdev, buf, sz, DMA_BIDIRECTIONAL); } #else /* !NETMAP_LINUX_HAVE_DMASYNC */ #define netmap_sync_map_cpu(na, tag, map, sz, t) #define netmap_sync_map_dev(na, tag, map, sz, t) #endif /* NETMAP_LINUX_HAVE_DMASYNC */ #endif /* linux */ /* * functions to map NIC to KRING indexes (n2k) and vice versa (k2n) */ static inline int netmap_idx_n2k(struct netmap_kring *kr, int idx) { int n = kr->nkr_num_slots; if (likely(kr->nkr_hwofs == 0)) { return idx; } idx += kr->nkr_hwofs; if (idx < 0) return idx + n; else if (idx < n) return idx; else return idx - n; } static inline int netmap_idx_k2n(struct netmap_kring *kr, int idx) { int n = kr->nkr_num_slots; if (likely(kr->nkr_hwofs == 0)) { return idx; } idx -= kr->nkr_hwofs; if (idx < 0) return idx + n; else if (idx < n) return idx; else return idx - n; } /* Entries of the look-up table. */ #ifdef __FreeBSD__ struct lut_entry { void *vaddr; /* virtual address. */ vm_paddr_t paddr; /* physical address. */ }; #else /* linux & _WIN32 */ /* dma-mapping in linux can assign a buffer a different address * depending on the device, so we need to have a separate * physical-address look-up table for each na. * We can still share the vaddrs, though, therefore we split * the lut_entry structure. */ struct lut_entry { void *vaddr; /* virtual address. */ }; struct plut_entry { vm_paddr_t paddr; /* physical address. 
*/ }; #endif /* linux & _WIN32 */ struct netmap_obj_pool; /* * NMB return the virtual address of a buffer (buffer 0 on bad index) * PNMB also fills the physical address */ static inline void * NMB(struct netmap_adapter *na, struct netmap_slot *slot) { struct lut_entry *lut = na->na_lut.lut; uint32_t i = slot->buf_idx; return (unlikely(i >= na->na_lut.objtotal)) ? lut[0].vaddr : lut[i].vaddr; } static inline void * PNMB(struct netmap_adapter *na, struct netmap_slot *slot, uint64_t *pp) { uint32_t i = slot->buf_idx; struct lut_entry *lut = na->na_lut.lut; struct plut_entry *plut = na->na_lut.plut; void *ret = (i >= na->na_lut.objtotal) ? lut[0].vaddr : lut[i].vaddr; #ifdef _WIN32 *pp = (i >= na->na_lut.objtotal) ? (uint64_t)plut[0].paddr.QuadPart : (uint64_t)plut[i].paddr.QuadPart; #else *pp = (i >= na->na_lut.objtotal) ? plut[0].paddr : plut[i].paddr; #endif return ret; } /* * Structure associated to each netmap file descriptor. * It is created on open and left unbound (np_nifp == NULL). * A successful NIOCREGIF will set np_nifp and the first few fields; * this is protected by a global lock (NMG_LOCK) due to low contention. * * np_refs counts the number of references to the structure: one for the fd, * plus (on FreeBSD) one for each active mmap which we track ourselves * (linux automatically tracks them, but FreeBSD does not). * np_refs is protected by NMG_LOCK. * * Read access to the structure is lock free, because ni_nifp once set * can only go to 0 when nobody is using the entry anymore. Readers * must check that np_nifp != NULL before using the other fields. */ struct netmap_priv_d { struct netmap_if * volatile np_nifp; /* netmap if descriptor. 
*/ struct netmap_adapter *np_na; struct ifnet *np_ifp; uint32_t np_flags; /* from the ioctl */ u_int np_qfirst[NR_TXRX], np_qlast[NR_TXRX]; /* range of tx/rx rings to scan */ uint16_t np_txpoll; uint16_t np_kloop_state; /* use with NMG_LOCK held */ #define NM_SYNC_KLOOP_RUNNING (1 << 0) #define NM_SYNC_KLOOP_STOPPING (1 << 1) int np_sync_flags; /* to be passed to nm_sync */ int np_refs; /* use with NMG_LOCK held */ /* pointers to the selinfo to be used for selrecord. * Either the local or the global one depending on the * number of rings. */ NM_SELINFO_T *np_si[NR_TXRX]; /* In the optional CSB mode, the user must specify the start address * of two arrays of Communication Status Block (CSB) entries, for the * two directions (kernel read application write, and kernel write * application read). * The number of entries must agree with the number of rings bound to * the netmap file descriptor. The entries corresponding to the TX * rings are laid out before the ones corresponding to the RX rings. * * Array of CSB entries for application --> kernel communication * (N entries). */ struct nm_csb_atok *np_csb_atok_base; /* Array of CSB entries for kernel --> application communication * (N entries). 
*/ struct nm_csb_ktoa *np_csb_ktoa_base; #ifdef linux struct file *np_filp; /* used by sync kloop */ #endif /* linux */ }; struct netmap_priv_d *netmap_priv_new(void); void netmap_priv_delete(struct netmap_priv_d *); static inline int nm_kring_pending(struct netmap_priv_d *np) { struct netmap_adapter *na = np->np_na; enum txrx t; int i; for_rx_tx(t) { for (i = np->np_qfirst[t]; i < np->np_qlast[t]; i++) { struct netmap_kring *kring = NMR(na, t)[i]; if (kring->nr_mode != kring->nr_pending_mode) { return 1; } } } return 0; } /* call with NMG_LOCK held */ static __inline int nm_si_user(struct netmap_priv_d *priv, enum txrx t) { return (priv->np_na != NULL && (priv->np_qlast[t] - priv->np_qfirst[t] > 1)); } #ifdef WITH_PIPES int netmap_pipe_txsync(struct netmap_kring *txkring, int flags); int netmap_pipe_rxsync(struct netmap_kring *rxkring, int flags); #endif /* WITH_PIPES */ #ifdef WITH_MONITOR struct netmap_monitor_adapter { struct netmap_adapter up; struct netmap_priv_d priv; uint32_t flags; }; #endif /* WITH_MONITOR */ #ifdef WITH_GENERIC /* * generic netmap emulation for devices that do not have * native netmap support. */ int generic_netmap_attach(struct ifnet *ifp); int generic_rx_handler(struct ifnet *ifp, struct mbuf *m);; int nm_os_catch_rx(struct netmap_generic_adapter *gna, int intercept); int nm_os_catch_tx(struct netmap_generic_adapter *gna, int intercept); int na_is_generic(struct netmap_adapter *na); /* * the generic transmit routine is passed a structure to optionally * build a queue of descriptors, in an OS-specific way. * The payload is at addr, if non-null, and the routine should send or queue * the packet, returning 0 if successful, 1 on failure. * * At the end, if head is non-null, there will be an additional call * to the function with addr = NULL; this should tell the OS-specific * routine to send the queue and free any resources. Failure is ignored. 
*/ struct nm_os_gen_arg { struct ifnet *ifp; void *m; /* os-specific mbuf-like object */ void *head, *tail; /* tailq, if the OS-specific routine needs to build one */ void *addr; /* payload of current packet */ u_int len; /* packet length */ u_int ring_nr; /* packet length */ u_int qevent; /* in txqdisc mode, place an event on this mbuf */ }; int nm_os_generic_xmit_frame(struct nm_os_gen_arg *); int nm_os_generic_find_num_desc(struct ifnet *ifp, u_int *tx, u_int *rx); void nm_os_generic_find_num_queues(struct ifnet *ifp, u_int *txq, u_int *rxq); void nm_os_generic_set_features(struct netmap_generic_adapter *gna); static inline struct ifnet* netmap_generic_getifp(struct netmap_generic_adapter *gna) { if (gna->prev) return gna->prev->ifp; return gna->up.up.ifp; } void netmap_generic_irq(struct netmap_adapter *na, u_int q, u_int *work_done); //#define RATE_GENERIC /* Enables communication statistics for generic. */ #ifdef RATE_GENERIC void generic_rate(int txp, int txs, int txi, int rxp, int rxs, int rxi); #else #define generic_rate(txp, txs, txi, rxp, rxs, rxi) #endif /* * netmap_mitigation API. This is used by the generic adapter * to reduce the number of interrupt requests/selwakeup * to clients on incoming packets. */ void nm_os_mitigation_init(struct nm_generic_mit *mit, int idx, struct netmap_adapter *na); void nm_os_mitigation_start(struct nm_generic_mit *mit); void nm_os_mitigation_restart(struct nm_generic_mit *mit); int nm_os_mitigation_active(struct nm_generic_mit *mit); void nm_os_mitigation_cleanup(struct nm_generic_mit *mit); #else /* !WITH_GENERIC */ #define generic_netmap_attach(ifp) (EOPNOTSUPP) #define na_is_generic(na) (0) #endif /* WITH_GENERIC */ /* Shared declarations for the VALE switch. */ /* * Each transmit queue accumulates a batch of packets into * a structure before forwarding. Packets to the same * destination are put in a list using ft_next as a link field. * ft_frags and ft_next are valid only on the first fragment. 
*/ struct nm_bdg_fwd { /* forwarding entry for a bridge */ void *ft_buf; /* netmap or indirect buffer */ uint8_t ft_frags; /* how many fragments (only on 1st frag) */ uint16_t ft_offset; /* dst port (unused) */ uint16_t ft_flags; /* flags, e.g. indirect */ uint16_t ft_len; /* src fragment len */ uint16_t ft_next; /* next packet to same destination */ }; /* struct 'virtio_net_hdr' from linux. */ struct nm_vnet_hdr { #define VIRTIO_NET_HDR_F_NEEDS_CSUM 1 /* Use csum_start, csum_offset */ #define VIRTIO_NET_HDR_F_DATA_VALID 2 /* Csum is valid */ uint8_t flags; #define VIRTIO_NET_HDR_GSO_NONE 0 /* Not a GSO frame */ #define VIRTIO_NET_HDR_GSO_TCPV4 1 /* GSO frame, IPv4 TCP (TSO) */ #define VIRTIO_NET_HDR_GSO_UDP 3 /* GSO frame, IPv4 UDP (UFO) */ #define VIRTIO_NET_HDR_GSO_TCPV6 4 /* GSO frame, IPv6 TCP */ #define VIRTIO_NET_HDR_GSO_ECN 0x80 /* TCP has ECN set */ uint8_t gso_type; uint16_t hdr_len; uint16_t gso_size; uint16_t csum_start; uint16_t csum_offset; }; #define WORST_CASE_GSO_HEADER (14+40+60) /* IPv6 + TCP */ /* Private definitions for IPv4, IPv6, UDP and TCP headers. */ struct nm_iphdr { uint8_t version_ihl; uint8_t tos; uint16_t tot_len; uint16_t id; uint16_t frag_off; uint8_t ttl; uint8_t protocol; uint16_t check; uint32_t saddr; uint32_t daddr; /*The options start here. */ }; struct nm_tcphdr { uint16_t source; uint16_t dest; uint32_t seq; uint32_t ack_seq; uint8_t doff; /* Data offset + Reserved */ uint8_t flags; uint16_t window; uint16_t check; uint16_t urg_ptr; }; struct nm_udphdr { uint16_t source; uint16_t dest; uint16_t len; uint16_t check; }; struct nm_ipv6hdr { uint8_t priority_version; uint8_t flow_lbl[3]; uint16_t payload_len; uint8_t nexthdr; uint8_t hop_limit; uint8_t saddr[16]; uint8_t daddr[16]; }; /* Type used to store a checksum (in host byte order) that hasn't been * folded yet. 
*/ #define rawsum_t uint32_t rawsum_t nm_os_csum_raw(uint8_t *data, size_t len, rawsum_t cur_sum); uint16_t nm_os_csum_ipv4(struct nm_iphdr *iph); void nm_os_csum_tcpudp_ipv4(struct nm_iphdr *iph, void *data, size_t datalen, uint16_t *check); void nm_os_csum_tcpudp_ipv6(struct nm_ipv6hdr *ip6h, void *data, size_t datalen, uint16_t *check); uint16_t nm_os_csum_fold(rawsum_t cur_sum); void bdg_mismatch_datapath(struct netmap_vp_adapter *na, struct netmap_vp_adapter *dst_na, const struct nm_bdg_fwd *ft_p, struct netmap_ring *dst_ring, u_int *j, u_int lim, u_int *howmany); /* persistent virtual port routines */ int nm_os_vi_persist(const char *, struct ifnet **); void nm_os_vi_detach(struct ifnet *); void nm_os_vi_init_index(void); /* * kernel thread routines */ struct nm_kctx; /* OS-specific kernel context - opaque */ typedef void (*nm_kctx_worker_fn_t)(void *data); /* kthread configuration */ struct nm_kctx_cfg { long type; /* kthread type/identifier */ nm_kctx_worker_fn_t worker_fn; /* worker function */ void *worker_private;/* worker parameter */ int attach_user; /* attach kthread to user process */ }; /* kthread configuration */ struct nm_kctx *nm_os_kctx_create(struct nm_kctx_cfg *cfg, void *opaque); int nm_os_kctx_worker_start(struct nm_kctx *); void nm_os_kctx_worker_stop(struct nm_kctx *); void nm_os_kctx_destroy(struct nm_kctx *); void nm_os_kctx_worker_setaff(struct nm_kctx *, int); u_int nm_os_ncpus(void); int netmap_sync_kloop(struct netmap_priv_d *priv, struct nmreq_header *hdr); int netmap_sync_kloop_stop(struct netmap_priv_d *priv); #ifdef WITH_PTNETMAP /* ptnetmap guest routines */ /* * ptnetmap_memdev routines used to talk with ptnetmap_memdev device driver */ struct ptnetmap_memdev; int nm_os_pt_memdev_iomap(struct ptnetmap_memdev *, vm_paddr_t *, void **, uint64_t *); void nm_os_pt_memdev_iounmap(struct ptnetmap_memdev *); uint32_t nm_os_pt_memdev_ioread(struct ptnetmap_memdev *, unsigned int); /* * netmap adapter for guest ptnetmap ports */ struct 
netmap_pt_guest_adapter { /* The netmap adapter to be used by netmap applications. * This field must be the first, to allow upcast. */ struct netmap_hw_adapter hwup; /* The netmap adapter to be used by the driver. */ struct netmap_hw_adapter dr; /* Reference counter to track users of backend netmap port: the * network stack and netmap clients. * Used to decide when we need (de)allocate krings/rings and * start (stop) ptnetmap kthreads. */ int backend_users; }; int netmap_pt_guest_attach(struct netmap_adapter *na, unsigned int nifp_offset, unsigned int memid); bool netmap_pt_guest_txsync(struct nm_csb_atok *atok, struct nm_csb_ktoa *ktoa, struct netmap_kring *kring, int flags); bool netmap_pt_guest_rxsync(struct nm_csb_atok *atok, struct nm_csb_ktoa *ktoa, struct netmap_kring *kring, int flags); int ptnet_nm_krings_create(struct netmap_adapter *na); void ptnet_nm_krings_delete(struct netmap_adapter *na); void ptnet_nm_dtor(struct netmap_adapter *na); -/* Guest driver: Write kring pointers (cur, head) to the CSB. - * This routine is coupled with ptnetmap_host_read_kring_csb(). */ +/* Helper function wrapping nm_sync_kloop_appl_read(). */ static inline void -ptnetmap_guest_write_kring_csb(struct nm_csb_atok *atok, uint32_t cur, - uint32_t head) -{ - /* - * We need to write cur and head to the CSB but we cannot do it atomically. - * There is no way we can prevent the host from reading the updated value - * of one of the two and the old value of the other. However, if we make - * sure that the host never reads a value of head more recent than the - * value of cur we are safe. We can allow the host to read a value of cur - * more recent than the value of head, since in the netmap ring cur can be - * ahead of head and cur cannot wrap around head because it must be behind - * tail. Inverting the order of writes below could instead result into the - * host to think head went ahead of cur, which would cause the sync - * prologue to fail. 
- * - * The following memory barrier scheme is used to make this happen: - * - * Guest Host - * - * STORE(cur) LOAD(head) - * mb() <-----------> mb() - * STORE(head) LOAD(cur) - */ - atok->cur = cur; - nm_stst_barrier(); - atok->head = head; -} - -/* Guest driver: Read kring pointers (hwcur, hwtail) from the CSB. - * This routine is coupled with ptnetmap_host_write_kring_csb(). */ -static inline void -ptnetmap_guest_read_kring_csb(struct nm_csb_ktoa *ktoa, - struct netmap_kring *kring) -{ - /* - * We place a memory barrier to make sure that the update of hwtail never - * overtakes the update of hwcur. - * (see explanation in ptnetmap_host_write_kring_csb). - */ - kring->nr_hwtail = ktoa->hwtail; - nm_stst_barrier(); - kring->nr_hwcur = ktoa->hwcur; -} - -/* Helper function wrapping ptnetmap_guest_read_kring_csb(). */ -static inline void ptnet_sync_tail(struct nm_csb_ktoa *ktoa, struct netmap_kring *kring) { struct netmap_ring *ring = kring->ring; /* Update hwcur and hwtail as known by the host. */ - ptnetmap_guest_read_kring_csb(ktoa, kring); + nm_sync_kloop_appl_read(ktoa, &kring->nr_hwtail, &kring->nr_hwcur); /* nm_sync_finalize */ ring->tail = kring->rtail = kring->nr_hwtail; } #endif /* WITH_PTNETMAP */ #ifdef __FreeBSD__ /* * FreeBSD mbuf allocator/deallocator in emulation mode: */ #if __FreeBSD_version < 1100000 /* * For older versions of FreeBSD: * * We allocate EXT_PACKET mbuf+clusters, but need to set M_NOFREE * so that the destructor, if invoked, will not free the packet. * In principle we should set the destructor only on demand, * but since there might be a race we better do it on allocation. * As a consequence, we also need to set the destructor or we * would leak buffers. */ /* mbuf destructor, also need to change the type to EXT_EXTREF, * add an M_NOFREE flag, and then clear the flag and * chain into uma_zfree(zone_pack, mf) * (or reinstall the buffer ?) 
*/ #define SET_MBUF_DESTRUCTOR(m, fn) do { \ (m)->m_ext.ext_free = (void *)fn; \ (m)->m_ext.ext_type = EXT_EXTREF; \ } while (0) static int void_mbuf_dtor(struct mbuf *m, void *arg1, void *arg2) { /* restore original mbuf */ m->m_ext.ext_buf = m->m_data = m->m_ext.ext_arg1; m->m_ext.ext_arg1 = NULL; m->m_ext.ext_type = EXT_PACKET; m->m_ext.ext_free = NULL; if (MBUF_REFCNT(m) == 0) SET_MBUF_REFCNT(m, 1); uma_zfree(zone_pack, m); return 0; } static inline struct mbuf * nm_os_get_mbuf(struct ifnet *ifp, int len) { struct mbuf *m; (void)ifp; m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); if (m) { /* m_getcl() (mb_ctor_mbuf) has an assert that checks that * M_NOFREE flag is not specified as third argument, * so we have to set M_NOFREE after m_getcl(). */ m->m_flags |= M_NOFREE; m->m_ext.ext_arg1 = m->m_ext.ext_buf; // XXX save m->m_ext.ext_free = (void *)void_mbuf_dtor; m->m_ext.ext_type = EXT_EXTREF; ND(5, "create m %p refcnt %d", m, MBUF_REFCNT(m)); } return m; } #else /* __FreeBSD_version >= 1100000 */ /* * Newer versions of FreeBSD, using a straightforward scheme. * * We allocate mbufs with m_gethdr(), since the mbuf header is needed * by the driver. We also attach a customly-provided external storage, * which in this case is a netmap buffer. When calling m_extadd(), however * we pass a NULL address, since the real address (and length) will be * filled in by nm_os_generic_xmit_frame() right before calling * if_transmit(). * * The dtor function does nothing, however we need it since mb_free_ext() * has a KASSERT(), checking that the mbuf dtor function is not NULL. */ #if __FreeBSD_version <= 1200050 static void void_mbuf_dtor(struct mbuf *m, void *arg1, void *arg2) { } #else /* __FreeBSD_version >= 1200051 */ /* The arg1 and arg2 pointers argument were removed by r324446, which * in included since version 1200051. 
*/ static void void_mbuf_dtor(struct mbuf *m) { } #endif /* __FreeBSD_version >= 1200051 */ #define SET_MBUF_DESTRUCTOR(m, fn) do { \ (m)->m_ext.ext_free = (fn != NULL) ? \ (void *)fn : (void *)void_mbuf_dtor; \ } while (0) static inline struct mbuf * nm_os_get_mbuf(struct ifnet *ifp, int len) { struct mbuf *m; (void)ifp; (void)len; m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) { return m; } m_extadd(m, NULL /* buf */, 0 /* size */, void_mbuf_dtor, NULL, NULL, 0, EXT_NET_DRV); return m; } #endif /* __FreeBSD_version >= 1100000 */ #endif /* __FreeBSD__ */ struct nmreq_option * nmreq_findoption(struct nmreq_option *, uint16_t); int nmreq_checkduplicate(struct nmreq_option *); int netmap_init_bridges(void); void netmap_uninit_bridges(void); /* Functions to read and write CSB fields from the kernel. */ #if defined (linux) #define CSB_READ(csb, field, r) (get_user(r, &csb->field)) #define CSB_WRITE(csb, field, v) (put_user(v, &csb->field)) #else /* ! linux */ #define CSB_READ(csb, field, r) (r = fuword32(&csb->field)) #define CSB_WRITE(csb, field, v) (suword32(&csb->field, v)) #endif /* ! linux */ #endif /* _NET_NETMAP_KERN_H_ */ Index: stable/11/sys/dev/netmap/netmap_kloop.c =================================================================== --- stable/11/sys/dev/netmap/netmap_kloop.c (revision 343831) +++ stable/11/sys/dev/netmap/netmap_kloop.c (revision 343832) @@ -1,918 +1,962 @@ /* * Copyright (C) 2016-2018 Vincenzo Maffione * Copyright (C) 2015 Stefano Garzarella * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ /* * common headers */ #if defined(__FreeBSD__) #include #include #include #include #include #include #include #include #include #define usleep_range(_1, _2) \ pause_sbt("sync-kloop-sleep", SBT_1US * _1, SBT_1US * 1, C_ABSOLUTE) #elif defined(linux) #include #include #include #endif #include #include #include #include /* Support for eventfd-based notifications. */ #if defined(linux) #define SYNC_KLOOP_POLL #endif /* Write kring pointers (hwcur, hwtail) to the CSB. * This routine is coupled with nm_sync_kloop_appl_read(). */ static inline void sync_kloop_kernel_write(struct nm_csb_ktoa __user *ptr, uint32_t hwcur, uint32_t hwtail) { + /* Issue a first store-store barrier to make sure writes to the + * netmap ring are not reordered after updates of ktoa->hwcur and ktoa->hwtail. */ + nm_stst_barrier(); + /* - * The same scheme used in ptnetmap_guest_write_kring_csb() applies here. + * The same scheme used in nm_sync_kloop_appl_write() applies here. 
* We allow the application to read a value of hwcur more recent than the value * of hwtail, since this would anyway result in a consistent view of the * ring state (and hwcur can never wraparound hwtail, since hwcur must be * behind head). * * The following memory barrier scheme is used to make this happen: * - * Application Kernel + * Application Kernel * - * STORE(hwcur) LOAD(hwtail) - * mb() <-------------> mb() - * STORE(hwtail) LOAD(hwcur) + * STORE(hwcur) LOAD(hwtail) + * wmb() <-------------> rmb() + * STORE(hwtail) LOAD(hwcur) */ CSB_WRITE(ptr, hwcur, hwcur); nm_stst_barrier(); CSB_WRITE(ptr, hwtail, hwtail); } /* Read kring pointers (head, cur, sync_flags) from the CSB. * This routine is coupled with nm_sync_kloop_appl_write(). */ static inline void sync_kloop_kernel_read(struct nm_csb_atok __user *ptr, struct netmap_ring *shadow_ring, uint32_t num_slots) { /* * We place a memory barrier to make sure that the update of head never * overtakes the update of cur. - * (see explanation in ptnetmap_guest_write_kring_csb). + * (see explanation in sync_kloop_kernel_write). */ CSB_READ(ptr, head, shadow_ring->head); - nm_stst_barrier(); + nm_ldld_barrier(); CSB_READ(ptr, cur, shadow_ring->cur); CSB_READ(ptr, sync_flags, shadow_ring->flags); + + /* Make sure that loads from atok->head and atok->cur are not delayed + * after the loads from the netmap ring. */ + nm_ldld_barrier(); } /* Enable or disable application --> kernel kicks. */ static inline void csb_ktoa_kick_enable(struct nm_csb_ktoa __user *csb_ktoa, uint32_t val) { CSB_WRITE(csb_ktoa, kern_need_kick, val); } #ifdef SYNC_KLOOP_POLL /* Are application interrupts enabled or disabled? 
*/ static inline uint32_t csb_atok_intr_enabled(struct nm_csb_atok __user *csb_atok) { uint32_t v; CSB_READ(csb_atok, appl_need_kick, v); return v; } #endif /* SYNC_KLOOP_POLL */ static inline void sync_kloop_kring_dump(const char *title, const struct netmap_kring *kring) { - nm_prinf("%s - name: %s hwcur: %d hwtail: %d " - "rhead: %d rcur: %d rtail: %d", - title, kring->name, kring->nr_hwcur, kring->nr_hwtail, - kring->rhead, kring->rcur, kring->rtail); + nm_prinf("%s, kring %s, hwcur %d, rhead %d, " + "rcur %d, rtail %d, hwtail %d", + title, kring->name, kring->nr_hwcur, kring->rhead, + kring->rcur, kring->rtail, kring->nr_hwtail); } struct sync_kloop_ring_args { struct netmap_kring *kring; struct nm_csb_atok *csb_atok; struct nm_csb_ktoa *csb_ktoa; #ifdef SYNC_KLOOP_POLL struct eventfd_ctx *irq_ctx; #endif /* SYNC_KLOOP_POLL */ }; static void netmap_sync_kloop_tx_ring(const struct sync_kloop_ring_args *a) { struct netmap_kring *kring = a->kring; struct nm_csb_atok *csb_atok = a->csb_atok; struct nm_csb_ktoa *csb_ktoa = a->csb_ktoa; struct netmap_ring shadow_ring; /* shadow copy of the netmap_ring */ bool more_txspace = false; uint32_t num_slots; int batch; num_slots = kring->nkr_num_slots; /* Disable application --> kernel notifications. */ csb_ktoa_kick_enable(csb_ktoa, 0); /* Copy the application kring pointers from the CSB */ sync_kloop_kernel_read(csb_atok, &shadow_ring, num_slots); for (;;) { batch = shadow_ring.head - kring->nr_hwcur; if (batch < 0) batch += num_slots; #ifdef PTN_TX_BATCH_LIM if (batch > PTN_TX_BATCH_LIM(num_slots)) { /* If application moves ahead too fast, let's cut the move so * that we don't exceed our batch limit. 
*/ uint32_t head_lim = kring->nr_hwcur + PTN_TX_BATCH_LIM(num_slots); if (head_lim >= num_slots) head_lim -= num_slots; nm_prdis(1, "batch: %d head: %d head_lim: %d", batch, shadow_ring.head, head_lim); shadow_ring.head = head_lim; batch = PTN_TX_BATCH_LIM(num_slots); } #endif /* PTN_TX_BATCH_LIM */ if (nm_kr_txspace(kring) <= (num_slots >> 1)) { shadow_ring.flags |= NAF_FORCE_RECLAIM; } /* Netmap prologue */ shadow_ring.tail = kring->rtail; if (unlikely(nm_txsync_prologue(kring, &shadow_ring) >= num_slots)) { /* Reinit ring and enable notifications. */ netmap_ring_reinit(kring); csb_ktoa_kick_enable(csb_ktoa, 1); break; } if (unlikely(netmap_debug & NM_DEBUG_TXSYNC)) { sync_kloop_kring_dump("pre txsync", kring); } if (unlikely(kring->nm_sync(kring, shadow_ring.flags))) { /* Reenable notifications. */ csb_ktoa_kick_enable(csb_ktoa, 1); nm_prerr("txsync() failed"); break; } /* * Finalize * Copy kernel hwcur and hwtail into the CSB for the application sync(), and * do the nm_sync_finalize. */ sync_kloop_kernel_write(csb_ktoa, kring->nr_hwcur, kring->nr_hwtail); if (kring->rtail != kring->nr_hwtail) { /* Some more room available in the parent adapter. */ kring->rtail = kring->nr_hwtail; more_txspace = true; } if (unlikely(netmap_debug & NM_DEBUG_TXSYNC)) { sync_kloop_kring_dump("post txsync", kring); } /* Interrupt the application if needed. */ #ifdef SYNC_KLOOP_POLL if (a->irq_ctx && more_txspace && csb_atok_intr_enabled(csb_atok)) { /* Disable application kick to avoid sending unnecessary kicks */ eventfd_signal(a->irq_ctx, 1); more_txspace = false; } #endif /* SYNC_KLOOP_POLL */ /* Read CSB to see if there is more work to do. */ sync_kloop_kernel_read(csb_atok, &shadow_ring, num_slots); if (shadow_ring.head == kring->rhead) { /* * No more packets to transmit. We enable notifications and * go to sleep, waiting for a kick from the application when new * slots are ready for transmission. */ /* Reenable notifications. 
*/ csb_ktoa_kick_enable(csb_ktoa, 1); - /* Doublecheck. */ + /* Double check, with store-load memory barrier. */ + nm_stld_barrier(); sync_kloop_kernel_read(csb_atok, &shadow_ring, num_slots); if (shadow_ring.head != kring->rhead) { /* We won the race condition, there are more packets to * transmit. Disable notifications and do another cycle */ csb_ktoa_kick_enable(csb_ktoa, 0); continue; } break; } if (nm_kr_txempty(kring)) { /* No more available TX slots. We stop waiting for a notification * from the backend (netmap_tx_irq). */ nm_prdis(1, "TX ring"); break; } } #ifdef SYNC_KLOOP_POLL if (a->irq_ctx && more_txspace && csb_atok_intr_enabled(csb_atok)) { eventfd_signal(a->irq_ctx, 1); } #endif /* SYNC_KLOOP_POLL */ } /* RX cycle without receive any packets */ #define SYNC_LOOP_RX_DRY_CYCLES_MAX 2 static inline int sync_kloop_norxslots(struct netmap_kring *kring, uint32_t g_head) { return (NM_ACCESS_ONCE(kring->nr_hwtail) == nm_prev(g_head, kring->nkr_num_slots - 1)); } static void netmap_sync_kloop_rx_ring(const struct sync_kloop_ring_args *a) { struct netmap_kring *kring = a->kring; struct nm_csb_atok *csb_atok = a->csb_atok; struct nm_csb_ktoa *csb_ktoa = a->csb_ktoa; struct netmap_ring shadow_ring; /* shadow copy of the netmap_ring */ int dry_cycles = 0; bool some_recvd = false; uint32_t num_slots; num_slots = kring->nkr_num_slots; /* Get RX csb_atok and csb_ktoa pointers from the CSB. */ num_slots = kring->nkr_num_slots; /* Disable notifications. */ csb_ktoa_kick_enable(csb_ktoa, 0); /* Copy the application kring pointers from the CSB */ sync_kloop_kernel_read(csb_atok, &shadow_ring, num_slots); for (;;) { uint32_t hwtail; /* Netmap prologue */ shadow_ring.tail = kring->rtail; if (unlikely(nm_rxsync_prologue(kring, &shadow_ring) >= num_slots)) { /* Reinit ring and enable notifications. 
*/ netmap_ring_reinit(kring); csb_ktoa_kick_enable(csb_ktoa, 1); break; } if (unlikely(netmap_debug & NM_DEBUG_RXSYNC)) { sync_kloop_kring_dump("pre rxsync", kring); } if (unlikely(kring->nm_sync(kring, shadow_ring.flags))) { /* Reenable notifications. */ csb_ktoa_kick_enable(csb_ktoa, 1); nm_prerr("rxsync() failed"); break; } /* * Finalize * Copy kernel hwcur and hwtail into the CSB for the application sync() */ hwtail = NM_ACCESS_ONCE(kring->nr_hwtail); sync_kloop_kernel_write(csb_ktoa, kring->nr_hwcur, hwtail); if (kring->rtail != hwtail) { kring->rtail = hwtail; some_recvd = true; dry_cycles = 0; } else { dry_cycles++; } if (unlikely(netmap_debug & NM_DEBUG_RXSYNC)) { sync_kloop_kring_dump("post rxsync", kring); } #ifdef SYNC_KLOOP_POLL /* Interrupt the application if needed. */ if (a->irq_ctx && some_recvd && csb_atok_intr_enabled(csb_atok)) { /* Disable application kick to avoid sending unnecessary kicks */ eventfd_signal(a->irq_ctx, 1); some_recvd = false; } #endif /* SYNC_KLOOP_POLL */ /* Read CSB to see if there is more work to do. */ sync_kloop_kernel_read(csb_atok, &shadow_ring, num_slots); if (sync_kloop_norxslots(kring, shadow_ring.head)) { /* * No more slots available for reception. We enable notification and * go to sleep, waiting for a kick from the application when new receive * slots are available. */ /* Reenable notifications. */ csb_ktoa_kick_enable(csb_ktoa, 1); - /* Doublecheck. */ + /* Double check, with store-load memory barrier. */ + nm_stld_barrier(); sync_kloop_kernel_read(csb_atok, &shadow_ring, num_slots); if (!sync_kloop_norxslots(kring, shadow_ring.head)) { /* We won the race condition, more slots are available. Disable * notifications and do another cycle. */ csb_ktoa_kick_enable(csb_ktoa, 0); continue; } break; } hwtail = NM_ACCESS_ONCE(kring->nr_hwtail); if (unlikely(hwtail == kring->rhead || dry_cycles >= SYNC_LOOP_RX_DRY_CYCLES_MAX)) { /* No more packets to be read from the backend. 
We stop and * wait for a notification from the backend (netmap_rx_irq). */ nm_prdis(1, "nr_hwtail: %d rhead: %d dry_cycles: %d", hwtail, kring->rhead, dry_cycles); break; } } nm_kr_put(kring); #ifdef SYNC_KLOOP_POLL /* Interrupt the application if needed. */ if (a->irq_ctx && some_recvd && csb_atok_intr_enabled(csb_atok)) { eventfd_signal(a->irq_ctx, 1); } #endif /* SYNC_KLOOP_POLL */ } #ifdef SYNC_KLOOP_POLL struct sync_kloop_poll_entry { /* Support for receiving notifications from * a netmap ring or from the application. */ struct file *filp; wait_queue_t wait; wait_queue_head_t *wqh; /* Support for sending notifications to the application. */ struct eventfd_ctx *irq_ctx; struct file *irq_filp; }; struct sync_kloop_poll_ctx { poll_table wait_table; unsigned int next_entry; unsigned int num_entries; struct sync_kloop_poll_entry entries[0]; }; static void sync_kloop_poll_table_queue_proc(struct file *file, wait_queue_head_t *wqh, poll_table *pt) { struct sync_kloop_poll_ctx *poll_ctx = container_of(pt, struct sync_kloop_poll_ctx, wait_table); struct sync_kloop_poll_entry *entry = poll_ctx->entries + poll_ctx->next_entry; BUG_ON(poll_ctx->next_entry >= poll_ctx->num_entries); entry->wqh = wqh; entry->filp = file; /* Use the default wake up function. 
*/ init_waitqueue_entry(&entry->wait, current); add_wait_queue(wqh, &entry->wait); poll_ctx->next_entry++; } #endif /* SYNC_KLOOP_POLL */ int netmap_sync_kloop(struct netmap_priv_d *priv, struct nmreq_header *hdr) { struct nmreq_sync_kloop_start *req = (struct nmreq_sync_kloop_start *)(uintptr_t)hdr->nr_body; struct nmreq_opt_sync_kloop_eventfds *eventfds_opt = NULL; #ifdef SYNC_KLOOP_POLL struct sync_kloop_poll_ctx *poll_ctx = NULL; #endif /* SYNC_KLOOP_POLL */ int num_rx_rings, num_tx_rings, num_rings; + struct sync_kloop_ring_args *args = NULL; uint32_t sleep_us = req->sleep_us; struct nm_csb_atok* csb_atok_base; struct nm_csb_ktoa* csb_ktoa_base; struct netmap_adapter *na; struct nmreq_option *opt; int err = 0; int i; if (sleep_us > 1000000) { /* We do not accept sleeping for more than a second. */ return EINVAL; } if (priv->np_nifp == NULL) { return ENXIO; } mb(); /* make sure following reads are not from cache */ na = priv->np_na; if (!nm_netmap_on(na)) { return ENXIO; } NMG_LOCK(); /* Make sure the application is working in CSB mode. */ if (!priv->np_csb_atok_base || !priv->np_csb_ktoa_base) { NMG_UNLOCK(); nm_prerr("sync-kloop on %s requires " "NETMAP_REQ_OPT_CSB option", na->name); return EINVAL; } csb_atok_base = priv->np_csb_atok_base; csb_ktoa_base = priv->np_csb_ktoa_base; /* Make sure that no kloop is currently running. */ if (priv->np_kloop_state & NM_SYNC_KLOOP_RUNNING) { err = EBUSY; } priv->np_kloop_state |= NM_SYNC_KLOOP_RUNNING; NMG_UNLOCK(); if (err) { return err; } num_rx_rings = priv->np_qlast[NR_RX] - priv->np_qfirst[NR_RX]; num_tx_rings = priv->np_qlast[NR_TX] - priv->np_qfirst[NR_TX]; num_rings = num_tx_rings + num_rx_rings; + args = nm_os_malloc(num_rings * sizeof(args[0])); + if (!args) { + err = ENOMEM; + goto out; + } + /* Validate notification options. 
*/ opt = nmreq_findoption((struct nmreq_option *)(uintptr_t)hdr->nr_options, NETMAP_REQ_OPT_SYNC_KLOOP_EVENTFDS); if (opt != NULL) { err = nmreq_checkduplicate(opt); if (err) { opt->nro_status = err; goto out; } if (opt->nro_size != sizeof(*eventfds_opt) + sizeof(eventfds_opt->eventfds[0]) * num_rings) { /* Option size not consistent with the number of * entries. */ opt->nro_status = err = EINVAL; goto out; } #ifdef SYNC_KLOOP_POLL eventfds_opt = (struct nmreq_opt_sync_kloop_eventfds *)opt; opt->nro_status = 0; /* We need 2 poll entries for TX and RX notifications coming * from the netmap adapter, plus one entries per ring for the * notifications coming from the application. */ poll_ctx = nm_os_malloc(sizeof(*poll_ctx) + (2 + num_rings) * sizeof(poll_ctx->entries[0])); init_poll_funcptr(&poll_ctx->wait_table, sync_kloop_poll_table_queue_proc); poll_ctx->num_entries = 2 + num_rings; poll_ctx->next_entry = 0; /* Poll for notifications coming from the applications through * eventfds . */ for (i = 0; i < num_rings; i++) { struct eventfd_ctx *irq; struct file *filp; unsigned long mask; filp = eventfd_fget(eventfds_opt->eventfds[i].ioeventfd); if (IS_ERR(filp)) { err = PTR_ERR(filp); goto out; } mask = filp->f_op->poll(filp, &poll_ctx->wait_table); if (mask & POLLERR) { err = EINVAL; goto out; } filp = eventfd_fget(eventfds_opt->eventfds[i].irqfd); if (IS_ERR(filp)) { err = PTR_ERR(filp); goto out; } poll_ctx->entries[i].irq_filp = filp; irq = eventfd_ctx_fileget(filp); if (IS_ERR(irq)) { err = PTR_ERR(irq); goto out; } poll_ctx->entries[i].irq_ctx = irq; } /* Poll for notifications coming from the netmap rings bound to * this file descriptor. */ { NM_SELINFO_T *si[NR_TXRX]; NMG_LOCK(); si[NR_RX] = nm_si_user(priv, NR_RX) ? &na->si[NR_RX] : &na->rx_rings[priv->np_qfirst[NR_RX]]->si; si[NR_TX] = nm_si_user(priv, NR_TX) ? 
&na->si[NR_TX] : &na->tx_rings[priv->np_qfirst[NR_TX]]->si; NMG_UNLOCK(); - poll_wait(priv->np_filp, si[NR_RX], &poll_ctx->wait_table); poll_wait(priv->np_filp, si[NR_TX], &poll_ctx->wait_table); + poll_wait(priv->np_filp, si[NR_RX], &poll_ctx->wait_table); } #else /* SYNC_KLOOP_POLL */ opt->nro_status = EOPNOTSUPP; goto out; #endif /* SYNC_KLOOP_POLL */ } + /* Prepare the arguments for netmap_sync_kloop_tx_ring() + * and netmap_sync_kloop_rx_ring(). */ + for (i = 0; i < num_tx_rings; i++) { + struct sync_kloop_ring_args *a = args + i; + + a->kring = NMR(na, NR_TX)[i + priv->np_qfirst[NR_TX]]; + a->csb_atok = csb_atok_base + i; + a->csb_ktoa = csb_ktoa_base + i; +#ifdef SYNC_KLOOP_POLL + if (poll_ctx) + a->irq_ctx = poll_ctx->entries[i].irq_ctx; +#endif /* SYNC_KLOOP_POLL */ + } + for (i = 0; i < num_rx_rings; i++) { + struct sync_kloop_ring_args *a = args + num_tx_rings + i; + + a->kring = NMR(na, NR_RX)[i + priv->np_qfirst[NR_RX]]; + a->csb_atok = csb_atok_base + num_tx_rings + i; + a->csb_ktoa = csb_ktoa_base + num_tx_rings + i; +#ifdef SYNC_KLOOP_POLL + if (poll_ctx) + a->irq_ctx = poll_ctx->entries[num_tx_rings + i].irq_ctx; +#endif /* SYNC_KLOOP_POLL */ + } + /* Main loop. */ for (;;) { if (unlikely(NM_ACCESS_ONCE(priv->np_kloop_state) & NM_SYNC_KLOOP_STOPPING)) { break; } #ifdef SYNC_KLOOP_POLL - if (poll_ctx) - __set_current_state(TASK_INTERRUPTIBLE); + if (poll_ctx) { + /* It is important to set the task state as + * interruptible before processing any TX/RX ring, + * so that if a notification on ring Y comes after + * we have processed ring Y, but before we call + * schedule(), we don't miss it. This is true because + * the wake up function will change the task state, + * and therefore the schedule_timeout() call below + * will observe the change. + */ + set_current_state(TASK_INTERRUPTIBLE); + } #endif /* SYNC_KLOOP_POLL */ /* Process all the TX rings bound to this file descriptor. 
*/ for (i = 0; i < num_tx_rings; i++) { - struct sync_kloop_ring_args a = { - .kring = NMR(na, NR_TX)[i + priv->np_qfirst[NR_TX]], - .csb_atok = csb_atok_base + i, - .csb_ktoa = csb_ktoa_base + i, - }; + struct sync_kloop_ring_args *a = args + i; -#ifdef SYNC_KLOOP_POLL - if (poll_ctx) - a.irq_ctx = poll_ctx->entries[i].irq_ctx; -#endif /* SYNC_KLOOP_POLL */ - if (unlikely(nm_kr_tryget(a.kring, 1, NULL))) { + if (unlikely(nm_kr_tryget(a->kring, 1, NULL))) { continue; } - netmap_sync_kloop_tx_ring(&a); - nm_kr_put(a.kring); + netmap_sync_kloop_tx_ring(a); + nm_kr_put(a->kring); } /* Process all the RX rings bound to this file descriptor. */ for (i = 0; i < num_rx_rings; i++) { - struct sync_kloop_ring_args a = { - .kring = NMR(na, NR_RX)[i + priv->np_qfirst[NR_RX]], - .csb_atok = csb_atok_base + num_tx_rings + i, - .csb_ktoa = csb_ktoa_base + num_tx_rings + i, - }; + struct sync_kloop_ring_args *a = args + num_tx_rings + i; -#ifdef SYNC_KLOOP_POLL - if (poll_ctx) - a.irq_ctx = poll_ctx->entries[num_tx_rings + i].irq_ctx; -#endif /* SYNC_KLOOP_POLL */ - - if (unlikely(nm_kr_tryget(a.kring, 1, NULL))) { + if (unlikely(nm_kr_tryget(a->kring, 1, NULL))) { continue; } - netmap_sync_kloop_rx_ring(&a); - nm_kr_put(a.kring); + netmap_sync_kloop_rx_ring(a); + nm_kr_put(a->kring); } #ifdef SYNC_KLOOP_POLL if (poll_ctx) { /* If a poll context is present, yield to the scheduler * waiting for a notification to come either from * netmap or the application. */ - schedule_timeout_interruptible(msecs_to_jiffies(1000)); + schedule_timeout(msecs_to_jiffies(20000)); } else #endif /* SYNC_KLOOP_POLL */ { /* Default synchronization method: sleep for a while. */ usleep_range(sleep_us, sleep_us); } } out: #ifdef SYNC_KLOOP_POLL if (poll_ctx) { /* Stop polling from netmap and the eventfds, and deallocate * the poll context. 
*/ __set_current_state(TASK_RUNNING); for (i = 0; i < poll_ctx->next_entry; i++) { struct sync_kloop_poll_entry *entry = poll_ctx->entries + i; if (entry->wqh) remove_wait_queue(entry->wqh, &entry->wait); /* We did get a reference to the eventfds, so release * it here; don't do that on netmap file descriptors (since * a reference was not taken). */ if (entry->filp && entry->filp != priv->np_filp) fput(entry->filp); if (entry->irq_ctx) eventfd_ctx_put(entry->irq_ctx); if (entry->irq_filp) fput(entry->irq_filp); } nm_os_free(poll_ctx); poll_ctx = NULL; } #endif /* SYNC_KLOOP_POLL */ + if (args) { + nm_os_free(args); + args = NULL; + } + /* Reset the kloop state. */ NMG_LOCK(); priv->np_kloop_state = 0; NMG_UNLOCK(); return err; } int netmap_sync_kloop_stop(struct netmap_priv_d *priv) { bool running = true; int err = 0; NMG_LOCK(); priv->np_kloop_state |= NM_SYNC_KLOOP_STOPPING; NMG_UNLOCK(); while (running) { usleep_range(1000, 1500); NMG_LOCK(); running = (NM_ACCESS_ONCE(priv->np_kloop_state) & NM_SYNC_KLOOP_RUNNING); NMG_UNLOCK(); } return err; } #ifdef WITH_PTNETMAP /* * Guest ptnetmap txsync()/rxsync() routines, used in ptnet device drivers. * These routines are reused across the different operating systems supported * by netmap. */ /* * Reconcile host and guest views of the transmit ring. * * Guest user wants to transmit packets up to the one before ring->head, * and guest kernel knows tx_ring->hwcur is the first packet unsent * by the host kernel. * * We push out as many packets as possible, and possibly * reclaim buffers from previously completed transmission. * * Notifications from the host are enabled only if the user guest would * block (no space in the ring). */ bool netmap_pt_guest_txsync(struct nm_csb_atok *atok, struct nm_csb_ktoa *ktoa, struct netmap_kring *kring, int flags) { bool notify = false; /* Disable notifications */ atok->appl_need_kick = 0; /* * First part: tell the host (updating the CSB) to process the new * packets. 
*/ kring->nr_hwcur = ktoa->hwcur; - ptnetmap_guest_write_kring_csb(atok, kring->rcur, kring->rhead); + nm_sync_kloop_appl_write(atok, kring->rcur, kring->rhead); /* Ask for a kick from a guest to the host if needed. */ if (((kring->rhead != kring->nr_hwcur || nm_kr_txempty(kring)) && NM_ACCESS_ONCE(ktoa->kern_need_kick)) || (flags & NAF_FORCE_RECLAIM)) { atok->sync_flags = flags; notify = true; } /* * Second part: reclaim buffers for completed transmissions. */ if (nm_kr_txempty(kring) || (flags & NAF_FORCE_RECLAIM)) { - ptnetmap_guest_read_kring_csb(ktoa, kring); + nm_sync_kloop_appl_read(ktoa, &kring->nr_hwtail, + &kring->nr_hwcur); } /* * No more room in the ring for new transmissions. The user thread will * go to sleep and we need to be notified by the host when more free * space is available. */ if (nm_kr_txempty(kring) && !(kring->nr_kflags & NKR_NOINTR)) { /* Reenable notifications. */ atok->appl_need_kick = 1; - /* Double check */ - ptnetmap_guest_read_kring_csb(ktoa, kring); + /* Double check, with store-load memory barrier. */ + nm_stld_barrier(); + nm_sync_kloop_appl_read(ktoa, &kring->nr_hwtail, + &kring->nr_hwcur); /* If there is new free space, disable notifications */ if (unlikely(!nm_kr_txempty(kring))) { atok->appl_need_kick = 0; } } nm_prdis(1, "%s CSB(head:%u cur:%u hwtail:%u) KRING(head:%u cur:%u tail:%u)", kring->name, atok->head, atok->cur, ktoa->hwtail, kring->rhead, kring->rcur, kring->nr_hwtail); return notify; } /* * Reconcile host and guest view of the receive ring. * * Update hwcur/hwtail from host (reading from CSB). * * If guest user has released buffers up to the one before ring->head, we * also give them to the host. * * Notifications from the host are enabled only if the user guest would * block (no more completed slots in the ring). 
*/ bool netmap_pt_guest_rxsync(struct nm_csb_atok *atok, struct nm_csb_ktoa *ktoa, struct netmap_kring *kring, int flags) { bool notify = false; /* Disable notifications */ atok->appl_need_kick = 0; /* * First part: import newly received packets, by updating the kring * hwtail to the hwtail known from the host (read from the CSB). * This also updates the kring hwcur. */ - ptnetmap_guest_read_kring_csb(ktoa, kring); + nm_sync_kloop_appl_read(ktoa, &kring->nr_hwtail, &kring->nr_hwcur); kring->nr_kflags &= ~NKR_PENDINTR; /* * Second part: tell the host about the slots that guest user has * released, by updating cur and head in the CSB. */ if (kring->rhead != kring->nr_hwcur) { - ptnetmap_guest_write_kring_csb(atok, kring->rcur, - kring->rhead); + nm_sync_kloop_appl_write(atok, kring->rcur, kring->rhead); /* Ask for a kick from the guest to the host if needed. */ if (NM_ACCESS_ONCE(ktoa->kern_need_kick)) { atok->sync_flags = flags; notify = true; } } /* * No more completed RX slots. The user thread will go to sleep and * we need to be notified by the host when more RX slots have been * completed. */ if (nm_kr_rxempty(kring) && !(kring->nr_kflags & NKR_NOINTR)) { /* Reenable notifications. */ atok->appl_need_kick = 1; - /* Double check */ - ptnetmap_guest_read_kring_csb(ktoa, kring); + /* Double check, with store-load memory barrier. */ + nm_stld_barrier(); + nm_sync_kloop_appl_read(ktoa, &kring->nr_hwtail, + &kring->nr_hwcur); /* If there are new slots, disable notifications. */ if (!nm_kr_rxempty(kring)) { atok->appl_need_kick = 0; } } nm_prdis(1, "%s CSB(head:%u cur:%u hwtail:%u) KRING(head:%u cur:%u tail:%u)", kring->name, atok->head, atok->cur, ktoa->hwtail, kring->rhead, kring->rcur, kring->nr_hwtail); return notify; } /* * Callbacks for ptnet drivers: nm_krings_create, nm_krings_delete, nm_dtor. */ int ptnet_nm_krings_create(struct netmap_adapter *na) { struct netmap_pt_guest_adapter *ptna = (struct netmap_pt_guest_adapter *)na; /* Upcast. 
*/ struct netmap_adapter *na_nm = &ptna->hwup.up; struct netmap_adapter *na_dr = &ptna->dr.up; int ret; if (ptna->backend_users) { return 0; } /* Create krings on the public netmap adapter. */ ret = netmap_hw_krings_create(na_nm); if (ret) { return ret; } /* Copy krings into the netmap adapter private to the driver. */ na_dr->tx_rings = na_nm->tx_rings; na_dr->rx_rings = na_nm->rx_rings; return 0; } void ptnet_nm_krings_delete(struct netmap_adapter *na) { struct netmap_pt_guest_adapter *ptna = (struct netmap_pt_guest_adapter *)na; /* Upcast. */ struct netmap_adapter *na_nm = &ptna->hwup.up; struct netmap_adapter *na_dr = &ptna->dr.up; if (ptna->backend_users) { return; } na_dr->tx_rings = NULL; na_dr->rx_rings = NULL; netmap_hw_krings_delete(na_nm); } void ptnet_nm_dtor(struct netmap_adapter *na) { struct netmap_pt_guest_adapter *ptna = (struct netmap_pt_guest_adapter *)na; netmap_mem_put(ptna->dr.up.nm_mem); memset(&ptna->dr, 0, sizeof(ptna->dr)); netmap_mem_pt_guest_ifp_del(na->nm_mem, na->ifp); } int netmap_pt_guest_attach(struct netmap_adapter *arg, unsigned int nifp_offset, unsigned int memid) { struct netmap_pt_guest_adapter *ptna; struct ifnet *ifp = arg ? arg->ifp : NULL; int error; /* get allocator */ arg->nm_mem = netmap_mem_pt_guest_new(ifp, nifp_offset, memid); if (arg->nm_mem == NULL) return ENOMEM; arg->na_flags |= NAF_MEM_OWNER; error = netmap_attach_ext(arg, sizeof(struct netmap_pt_guest_adapter), 1); if (error) return error; /* get the netmap_pt_guest_adapter */ ptna = (struct netmap_pt_guest_adapter *) NA(ifp); /* Initialize a separate pass-through netmap adapter that is going to * be used by the ptnet driver only, and so never exposed to netmap * applications. We only need a subset of the available fields. 
*/ memset(&ptna->dr, 0, sizeof(ptna->dr)); ptna->dr.up.ifp = ifp; ptna->dr.up.nm_mem = netmap_mem_get(ptna->hwup.up.nm_mem); ptna->dr.up.nm_config = ptna->hwup.up.nm_config; ptna->backend_users = 0; return 0; } #endif /* WITH_PTNETMAP */ Index: stable/11/sys/net/netmap.h =================================================================== --- stable/11/sys/net/netmap.h (revision 343831) +++ stable/11/sys/net/netmap.h (revision 343832) @@ -1,882 +1,905 @@ /* * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``S IS''AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * $FreeBSD$ * * Definitions of constants and the structures used by the netmap * framework, for the part visible to both kernel and userspace. 
* Detailed info on netmap is available with "man netmap" or at * * http://info.iet.unipi.it/~luigi/netmap/ * * This API is also used to communicate with the VALE software switch */ #ifndef _NET_NETMAP_H_ #define _NET_NETMAP_H_ #define NETMAP_API 13 /* current API version */ #define NETMAP_MIN_API 13 /* min and max versions accepted */ #define NETMAP_MAX_API 15 /* * Some fields should be cache-aligned to reduce contention. * The alignment is architecture and OS dependent, but rather than * digging into OS headers to find the exact value we use an estimate * that should cover most architectures. */ #define NM_CACHE_ALIGN 128 /* * --- Netmap data structures --- * * The userspace data structures used by netmap are shown below. * They are allocated by the kernel and mmap()ed by userspace threads. * Pointers are implemented as memory offsets or indexes, * so that they can be easily dereferenced in kernel and userspace. KERNEL (opaque, obviously) ==================================================================== | USERSPACE | struct netmap_ring +---->+---------------+ / | head,cur,tail | struct netmap_if (nifp, 1 per fd) / | buf_ofs | +---------------+ / | other fields | | ni_tx_rings | / +===============+ | ni_rx_rings | / | buf_idx, len | slot[0] | | / | flags, ptr | | | / +---------------+ +===============+ / | buf_idx, len | slot[1] | txring_ofs[0] | (rel.to nifp)--' | flags, ptr | | txring_ofs[1] | +---------------+ (tx+1 entries) (num_slots entries) | txring_ofs[t] | | buf_idx, len | slot[n-1] +---------------+ | flags, ptr | | rxring_ofs[0] | +---------------+ | rxring_ofs[1] | (rx+1 entries) | rxring_ofs[r] | +---------------+ * For each "interface" (NIC, host stack, PIPE, VALE switch port) bound to * a file descriptor, the mmap()ed region contains a (logically readonly) * struct netmap_if pointing to struct netmap_ring's. 
* * There is one netmap_ring per physical NIC ring, plus one tx/rx ring * pair attached to the host stack (this pair is unused for non-NIC ports). * * All physical/host stack ports share the same memory region, * so that zero-copy can be implemented between them. * VALE switch ports instead have separate memory regions. * * The netmap_ring is the userspace-visible replica of the NIC ring. * Each slot has the index of a buffer (MTU-sized and residing in the * mmapped region), its length and some flags. An extra 64-bit pointer * is provided for user-supplied buffers in the tx path. * * In user space, the buffer address is computed as * (char *)ring + buf_ofs + index * NETMAP_BUF_SIZE * * Added in NETMAP_API 11: * * + NIOCREGIF can request the allocation of extra spare buffers from * the same memory pool. The desired number of buffers must be in * nr_arg3. The ioctl may return fewer buffers, depending on memory * availability. nr_arg3 will return the actual value, and, once * mapped, nifp->ni_bufs_head will be the index of the first buffer. * * The buffers are linked to each other using the first uint32_t * as the index. On close, ni_bufs_head must point to the list of * buffers to be released. * * + NIOCREGIF can request space for extra rings (and buffers) * allocated in the same memory space. The number of extra rings * is in nr_arg1, and is advisory. This is a no-op on NICs where * the size of the memory space is fixed. * * + NIOCREGIF can attach to PIPE rings sharing the same memory * space with a parent device. The ifname indicates the parent device, * which must already exist. Flags in nr_flags indicate if we want to * bind the master or slave side, the index (from nr_ringid) * is just a cookie and does not need to be sequential. * * + NIOCREGIF can also attach to 'monitor' rings that replicate * the content of specific rings, also from the same memory space. * * Extra flags in nr_flags support the above functions. 
* Application libraries may use the following naming scheme: * netmap:foo all NIC ring pairs * netmap:foo^ only host ring pair * netmap:foo+ all NIC ring + host ring pairs * netmap:foo-k the k-th NIC ring pair * netmap:foo{k PIPE ring pair k, master side * netmap:foo}k PIPE ring pair k, slave side * * Some notes about host rings: * * + The RX host ring is used to store those packets that the host network * stack is trying to transmit through a NIC queue, but only if that queue * is currently in netmap mode. Netmap will not intercept host stack mbufs * designated to NIC queues that are not in netmap mode. As a consequence, * registering a netmap port with netmap:foo^ is not enough to intercept * mbufs in the RX host ring; the netmap port should be registered with * netmap:foo*, or another registration should be done to open at least a * NIC TX queue in netmap mode. * * + Netmap is not currently able to deal with intercepted transmit mbufs which * require offloadings like TSO, UFO, checksumming offloadings, etc. It is the * responsibility of the user to disable those offloadings (e.g. using * ifconfig on FreeBSD or ethtool -K on Linux) for an interface that is being * used in netmap mode. If the offloadings are not disabled, GSO and/or * unchecksummed packets may be dropped immediately or end up in the host RX * ring, and will be dropped as soon as the packet reaches another netmap * adapter. */ /* * struct netmap_slot is a buffer descriptor */ struct netmap_slot { uint32_t buf_idx; /* buffer index */ uint16_t len; /* length for this slot */ uint16_t flags; /* buf changed, etc. */ uint64_t ptr; /* pointer for indirect buffers */ }; /* * The following flags control how the slot is used */ #define NS_BUF_CHANGED 0x0001 /* buf_idx changed */ /* * must be set whenever buf_idx is changed (as it might be * necessary to recompute the physical address and mapping) * * It is also set by the kernel whenever the buf_idx is * changed internally (e.g., by pipes).
Applications may * use this information to know when they can reuse the * contents of previously prepared buffers. */ #define NS_REPORT 0x0002 /* ask the hardware to report results */ /* * Request notification when slot is used by the hardware. * Normally transmit completions are handled lazily and * may be unreported. This flag lets us know when a slot * has been sent (e.g. to terminate the sender). */ #define NS_FORWARD 0x0004 /* pass packet 'forward' */ /* * (Only for physical ports, rx rings with NR_FORWARD set). * Slot released to the kernel (i.e. before ring->head) with * this flag set are passed to the peer ring (host/NIC), * thus restoring the host-NIC connection for these slots. * This supports efficient traffic monitoring or firewalling. */ #define NS_NO_LEARN 0x0008 /* disable bridge learning */ /* * On a VALE switch, do not 'learn' the source port for * this buffer. */ #define NS_INDIRECT 0x0010 /* userspace buffer */ /* * (VALE tx rings only) data is in a userspace buffer, * whose address is in the 'ptr' field in the slot. */ #define NS_MOREFRAG 0x0020 /* packet has more fragments */ /* * (VALE ports, ptnetmap ports and some NIC ports, e.g. * ixgbe and i40e on Linux) * Set on all but the last slot of a multi-segment packet. * The 'len' field refers to the individual fragment. */ #define NS_PORT_SHIFT 8 #define NS_PORT_MASK (0xff << NS_PORT_SHIFT) /* * The high 8 bits of the flag, if not zero, indicate the * destination port for the VALE switch, overriding * the lookup table. */ #define NS_RFRAGS(_slot) ( ((_slot)->flags >> 8) & 0xff) /* * (VALE rx rings only) the high 8 bits * are the number of fragments. */ #define NETMAP_MAX_FRAGS 64 /* max number of fragments */ /* * struct netmap_ring * * Netmap representation of a TX or RX ring (also known as "queue"). * This is a queue implemented as a fixed-size circular array. * At the software level the important fields are: head, cur, tail. * * In TX rings: * * head first slot available for transmission. 
* cur wakeup point. select() and poll() will unblock * when 'tail' moves past 'cur' * tail (readonly) first slot reserved to the kernel * * [head .. tail-1] can be used for new packets to send; * 'head' and 'cur' must be incremented as slots are filled * with new packets to be sent; * 'cur' can be moved further ahead if we need more space * for new transmissions. XXX todo (2014-03-12) * * In RX rings: * * head first valid received packet * cur wakeup point. select() and poll() will unblock * when 'tail' moves past 'cur' * tail (readonly) first slot reserved to the kernel * * [head .. tail-1] contain received packets; * 'head' and 'cur' must be incremented as slots are consumed * and can be returned to the kernel; * 'cur' can be moved further ahead if we want to wait for * new packets without returning the previous ones. * * DATA OWNERSHIP/LOCKING: * The netmap_ring, and all slots and buffers in the range * [head .. tail-1] are owned by the user program; * the kernel only accesses them during a netmap system call * and in the user thread context. * * Other slots and buffers are reserved for use by the kernel */ struct netmap_ring { /* * buf_ofs is meant to be used through macros. * It contains the offset of the buffer region from this * descriptor. */ const int64_t buf_ofs; const uint32_t num_slots; /* number of slots in the ring. */ const uint32_t nr_buf_size; const uint16_t ringid; const uint16_t dir; /* 0: tx, 1: rx */ uint32_t head; /* (u) first user slot */ uint32_t cur; /* (u) wakeup point */ uint32_t tail; /* (k) first kernel slot */ uint32_t flags; struct timeval ts; /* (k) time of last *sync() */ /* opaque room for a mutex or similar object */ #if !defined(_WIN32) || defined(__CYGWIN__) uint8_t __attribute__((__aligned__(NM_CACHE_ALIGN))) sem[128]; #else uint8_t __declspec(align(NM_CACHE_ALIGN)) sem[128]; #endif /* the slots follow. This struct has variable size */ struct netmap_slot slot[0]; /* array of slots. 
*/ }; /* * RING FLAGS */ #define NR_TIMESTAMP 0x0002 /* set timestamp on *sync() */ /* * updates the 'ts' field on each netmap syscall. This saves * a separate gettimeofday(), and is not much worse than * software timestamps generated in the interrupt handler. */ #define NR_FORWARD 0x0004 /* enable NS_FORWARD for ring */ /* * Enables the NS_FORWARD slot flag for the ring. */ /* * Helper functions for kernel and userspace */ /* * Check if space is available in the ring. We use ring->head, which * points to the next netmap slot to be published to netmap. It is * possible that the application moves ring->cur ahead of ring->tail * (e.g., by setting ring->cur <== ring->tail), if it wants more slots * than the ones currently available, and it wants to be notified when * more arrive. See netmap(4) for more details and examples. */ static inline int nm_ring_empty(struct netmap_ring *ring) { return (ring->head == ring->tail); } /* * Netmap representation of an interface and its queue(s). * This is initialized by the kernel when binding a file * descriptor to a port, and should be considered as readonly * by user programs. The kernel never uses it. * * There is one netmap_if for each file descriptor on which we want * to select/poll. * select/poll operates on one or all pairs depending on the value of * nmr_queueid passed on the ioctl. */ struct netmap_if { char ni_name[IFNAMSIZ]; /* name of the interface. */ const uint32_t ni_version; /* API version, currently unused */ const uint32_t ni_flags; /* properties */ #define NI_PRIV_MEM 0x1 /* private memory region */ /* * The number of packet rings available in netmap mode. * Physical NICs can have different numbers of tx and rx rings. * Physical NICs also have a 'host' ring pair. * Additionally, clients can request additional ring pairs to * be used for internal communication.
*/ const uint32_t ni_tx_rings; /* number of HW tx rings */ const uint32_t ni_rx_rings; /* number of HW rx rings */ uint32_t ni_bufs_head; /* head index for extra bufs */ uint32_t ni_spare1[5]; /* * The following array contains the offset of each netmap ring * from this structure, in the following order: * NIC tx rings (ni_tx_rings); host tx ring (1); extra tx rings; * NIC rx rings (ni_rx_rings); host rx ring (1); extra rx rings. * * The area is filled up by the kernel on NIOCREGIF, * and then only read by userspace code. */ const ssize_t ring_ofs[0]; }; /* Legacy interface to interact with a netmap control device. * Included for backward compatibility. The user should not include this * file directly. */ #include "netmap_legacy.h" /* * New API to control netmap control devices. New applications should only use * nmreq_xyz structs with the NIOCCTRL ioctl() command. * * NIOCCTRL takes a nmreq_header struct, which contains the required * API version, the name of a netmap port, a command type, and pointers * to request body and options. * * nr_name (in) * The name of the port (em0, valeXXX:YYY, eth0{pn1 etc.) * * nr_version (in/out) * Must match NETMAP_API as used in the kernel, error otherwise. * Always returns the desired value on output. * * nr_reqtype (in) * One of the NETMAP_REQ_* command types below * * nr_body (in) * Pointer to a command-specific struct, described by one * of the struct nmreq_xyz below. * * nr_options (in) * Command specific options, if any. * * A NETMAP_REQ_REGISTER command activates netmap mode on the netmap * port (e.g. physical interface) specified by nmreq_header.nr_name. * The request body (struct nmreq_register) has several arguments to * specify how the port is to be registered. * * nr_tx_slots, nr_rx_slots, nr_tx_rings, nr_rx_rings (in/out) * On input, non-zero values may be used to reconfigure the port * according to the requested values, but this is not guaranteed. * On output the actual values in use are reported.
* * nr_mode (in) * Indicate what set of rings must be bound to the netmap * device (e.g. all NIC rings, host rings only, NIC and * host rings, ...). Values are in NR_REG_*. * * nr_ringid (in) * If nr_mode == NR_REG_ONE_NIC (only a single couple of TX/RX * rings), indicate which NIC TX and/or RX ring is to be bound * (0..nr_*x_rings-1). * * nr_flags (in) * Indicate special options for how to open the port. * * NR_NO_TX_POLL can be OR-ed to make select()/poll() push * packets on tx rings only if POLLOUT is set. * The default is to push any pending packet. * * NR_DO_RX_POLL can be OR-ed to make select()/poll() release * packets on rx rings also when POLLIN is NOT set. * The default is to touch the rx ring only with POLLIN. * Note that this is the opposite of TX because it * reflects the common usage. * * Other options are NR_MONITOR_TX, NR_MONITOR_RX, NR_ZCOPY_MON, * NR_EXCLUSIVE, NR_RX_RINGS_ONLY, NR_TX_RINGS_ONLY and * NR_ACCEPT_VNET_HDR. * * nr_mem_id (in/out) * The identity of the memory region used. * On input, 0 means the system decides autonomously, * other values may try to select a specific region. * On return the actual value is reported. * Region '1' is the global allocator, normally shared * by all interfaces. Other values are private regions. * If two ports share the same region, zero-copy is possible. * * nr_extra_bufs (in/out) * Number of extra buffers to be allocated. * * The other NETMAP_REQ_* commands are described below. * */ /* maximum size of a request, including all options */ #define NETMAP_REQ_MAXSIZE 4096 /* Header common to all request options. */ struct nmreq_option { /* Pointer to the next option. */ uint64_t nro_next; /* Option type. */ uint32_t nro_reqtype; /* (out) status of the option: * 0: recognized and processed * !=0: errno value */ uint32_t nro_status; /* Option size, used only for options that can have variable size * (e.g. because they contain arrays). For fixed-size options this * field should be set to zero.
*/ uint64_t nro_size; }; /* Header common to all requests. Do not reorder these fields, as we need * the second one (nr_reqtype) to know how much to copy from/to userspace. */ struct nmreq_header { uint16_t nr_version; /* API version */ uint16_t nr_reqtype; /* nmreq type (NETMAP_REQ_*) */ uint32_t nr_reserved; /* must be zero */ #define NETMAP_REQ_IFNAMSIZ 64 char nr_name[NETMAP_REQ_IFNAMSIZ]; /* port name */ uint64_t nr_options; /* command-specific options */ uint64_t nr_body; /* ptr to nmreq_xyz struct */ }; enum { /* Register a netmap port with the device. */ NETMAP_REQ_REGISTER = 1, /* Get information from a netmap port. */ NETMAP_REQ_PORT_INFO_GET, /* Attach a netmap port to a VALE switch. */ NETMAP_REQ_VALE_ATTACH, /* Detach a netmap port from a VALE switch. */ NETMAP_REQ_VALE_DETACH, /* List the ports attached to a VALE switch. */ NETMAP_REQ_VALE_LIST, /* Set the port header length (was virtio-net header length). */ NETMAP_REQ_PORT_HDR_SET, /* Get the port header length (was virtio-net header length). */ NETMAP_REQ_PORT_HDR_GET, /* Create a new persistent VALE port. */ NETMAP_REQ_VALE_NEWIF, /* Delete a persistent VALE port. */ NETMAP_REQ_VALE_DELIF, /* Enable polling kernel thread(s) on an attached VALE port. */ NETMAP_REQ_VALE_POLLING_ENABLE, /* Disable polling kernel thread(s) on an attached VALE port. */ NETMAP_REQ_VALE_POLLING_DISABLE, /* Get info about the pools of a memory allocator. */ NETMAP_REQ_POOLS_INFO_GET, /* Start an in-kernel loop that syncs the rings periodically or * on notifications. The loop runs in the context of the ioctl * syscall, and only stops on NETMAP_REQ_SYNC_KLOOP_STOP. */ NETMAP_REQ_SYNC_KLOOP_START, /* Stops the thread executing the in-kernel loop. The thread * returns from the ioctl syscall. */ NETMAP_REQ_SYNC_KLOOP_STOP, /* Enable CSB mode on a registered netmap control device. */ NETMAP_REQ_CSB_ENABLE, }; enum { /* On NETMAP_REQ_REGISTER, ask netmap to use memory allocated * from user-space allocated memory pools (e.g. 
hugepages). */ NETMAP_REQ_OPT_EXTMEM = 1, /* ON NETMAP_REQ_SYNC_KLOOP_START, ask netmap to use eventfd-based * notifications to synchronize the kernel loop with the application. */ NETMAP_REQ_OPT_SYNC_KLOOP_EVENTFDS, /* On NETMAP_REQ_REGISTER, ask netmap to work in CSB mode, where * head, cur and tail pointers are not exchanged through the * struct netmap_ring header, but rather using an user-provided * memory area (see struct nm_csb_atok and struct nm_csb_ktoa). */ NETMAP_REQ_OPT_CSB, }; /* * nr_reqtype: NETMAP_REQ_REGISTER * Bind (register) a netmap port to this control device. */ struct nmreq_register { uint64_t nr_offset; /* nifp offset in the shared region */ uint64_t nr_memsize; /* size of the shared region */ uint32_t nr_tx_slots; /* slots in tx rings */ uint32_t nr_rx_slots; /* slots in rx rings */ uint16_t nr_tx_rings; /* number of tx rings */ uint16_t nr_rx_rings; /* number of rx rings */ uint16_t nr_mem_id; /* id of the memory allocator */ uint16_t nr_ringid; /* ring(s) we care about */ uint32_t nr_mode; /* specify NR_REG_* modes */ uint32_t nr_extra_bufs; /* number of requested extra buffers */ uint64_t nr_flags; /* additional flags (see below) */ /* monitors use nr_ringid and nr_mode to select the rings to monitor */ #define NR_MONITOR_TX 0x100 #define NR_MONITOR_RX 0x200 #define NR_ZCOPY_MON 0x400 /* request exclusive access to the selected rings */ #define NR_EXCLUSIVE 0x800 /* 0x1000 unused */ #define NR_RX_RINGS_ONLY 0x2000 #define NR_TX_RINGS_ONLY 0x4000 /* Applications set this flag if they are able to deal with virtio-net headers, * that is send/receive frames that start with a virtio-net header. * If not set, NIOCREGIF will fail with netmap ports that require applications * to use those headers. If the flag is set, the application can use the * NETMAP_VNET_HDR_GET command to figure out the header length. */ #define NR_ACCEPT_VNET_HDR 0x8000 /* The following two have the same meaning of NETMAP_NO_TX_POLL and * NETMAP_DO_RX_POLL. 
*/ #define NR_DO_RX_POLL 0x10000 #define NR_NO_TX_POLL 0x20000 }; /* Valid values for nmreq_register.nr_mode (see above). */ enum { NR_REG_DEFAULT = 0, /* backward compat, should not be used. */ NR_REG_ALL_NIC = 1, NR_REG_SW = 2, NR_REG_NIC_SW = 3, NR_REG_ONE_NIC = 4, NR_REG_PIPE_MASTER = 5, /* deprecated, use "x{y" port name syntax */ NR_REG_PIPE_SLAVE = 6, /* deprecated, use "x}y" port name syntax */ NR_REG_NULL = 7, }; /* A single ioctl number is shared by all the new API command. * Demultiplexing is done using the hdr.nr_reqtype field. * FreeBSD uses the size value embedded in the _IOWR to determine * how much to copy in/out, so we define the ioctl() command * specifying only nmreq_header, and copyin/copyout the rest. */ #define NIOCCTRL _IOWR('i', 151, struct nmreq_header) /* The ioctl commands to sync TX/RX netmap rings. * NIOCTXSYNC, NIOCRXSYNC synchronize tx or rx queues, * whose identity is set in NIOCREGIF through nr_ringid. * These are non blocking and take no argument. */ #define NIOCTXSYNC _IO('i', 148) /* sync tx queues */ #define NIOCRXSYNC _IO('i', 149) /* sync rx queues */ /* * nr_reqtype: NETMAP_REQ_PORT_INFO_GET * Get information about a netmap port, including number of rings. * slots per ring, id of the memory allocator, etc. The netmap * control device used for this operation does not need to be bound * to a netmap port. */ struct nmreq_port_info_get { uint64_t nr_memsize; /* size of the shared region */ uint32_t nr_tx_slots; /* slots in tx rings */ uint32_t nr_rx_slots; /* slots in rx rings */ uint16_t nr_tx_rings; /* number of tx rings */ uint16_t nr_rx_rings; /* number of rx rings */ uint16_t nr_mem_id; /* memory allocator id (in/out) */ uint16_t pad1; }; #define NM_BDG_NAME "vale" /* prefix for bridge port name */ /* * nr_reqtype: NETMAP_REQ_VALE_ATTACH * Attach a netmap port to a VALE switch. Both the name of the netmap * port and the VALE switch are specified through the nr_name argument. 
* The attach operation could need to register a port, so at least * the same arguments are available. * port_index will contain the index where the port has been attached. */ struct nmreq_vale_attach { struct nmreq_register reg; uint32_t port_index; uint32_t pad1; }; /* * nr_reqtype: NETMAP_REQ_VALE_DETACH * Detach a netmap port from a VALE switch. Both the name of the netmap * port and the VALE switch are specified through the nr_name argument. * port_index will contain the index where the port was attached. */ struct nmreq_vale_detach { uint32_t port_index; uint32_t pad1; }; /* * nr_reqtype: NETMAP_REQ_VALE_LIST * List the ports of a VALE switch. */ struct nmreq_vale_list { /* Name of the VALE port (valeXXX:YYY) or empty. */ uint16_t nr_bridge_idx; uint16_t pad1; uint32_t nr_port_idx; }; /* * nr_reqtype: NETMAP_REQ_PORT_HDR_SET or NETMAP_REQ_PORT_HDR_GET * Set or get the port header length of the port identified by hdr.nr_name. * The control device does not need to be bound to a netmap port. */ struct nmreq_port_hdr { uint32_t nr_hdr_len; uint32_t pad1; }; /* * nr_reqtype: NETMAP_REQ_VALE_NEWIF * Create a new persistent VALE port. */ struct nmreq_vale_newif { uint32_t nr_tx_slots; /* slots in tx rings */ uint32_t nr_rx_slots; /* slots in rx rings */ uint16_t nr_tx_rings; /* number of tx rings */ uint16_t nr_rx_rings; /* number of rx rings */ uint16_t nr_mem_id; /* id of the memory allocator */ uint16_t pad1; }; /* * nr_reqtype: NETMAP_REQ_VALE_POLLING_ENABLE or NETMAP_REQ_VALE_POLLING_DISABLE * Enable or disable polling kthreads on a VALE port. */ struct nmreq_vale_polling { uint32_t nr_mode; #define NETMAP_POLLING_MODE_SINGLE_CPU 1 #define NETMAP_POLLING_MODE_MULTI_CPU 2 uint32_t nr_first_cpu_id; uint32_t nr_num_polling_cpus; uint32_t pad1; }; /* * nr_reqtype: NETMAP_REQ_POOLS_INFO_GET * Get info about the pools of the memory allocator of the netmap * port specified by hdr.nr_name and nr_mem_id. 
The netmap control * device used for this operation does not need to be bound to a netmap * port. */ struct nmreq_pools_info { uint64_t nr_memsize; uint16_t nr_mem_id; /* in/out argument */ uint16_t pad1[3]; uint64_t nr_if_pool_offset; uint32_t nr_if_pool_objtotal; uint32_t nr_if_pool_objsize; uint64_t nr_ring_pool_offset; uint32_t nr_ring_pool_objtotal; uint32_t nr_ring_pool_objsize; uint64_t nr_buf_pool_offset; uint32_t nr_buf_pool_objtotal; uint32_t nr_buf_pool_objsize; }; /* * nr_reqtype: NETMAP_REQ_SYNC_KLOOP_START * Start an in-kernel loop that syncs the rings periodically or on * notifications. The loop runs in the context of the ioctl syscall, * and only stops on NETMAP_REQ_SYNC_KLOOP_STOP. * The registered netmap port must be open in CSB mode. */ struct nmreq_sync_kloop_start { /* Sleeping is the default synchronization method for the kloop. * The 'sleep_us' field specifies how many microseconds to sleep for * when there is no work to do, before doing another kloop iteration. */ uint32_t sleep_us; uint32_t pad1; }; /* A CSB entry for the application --> kernel direction. */ struct nm_csb_atok { uint32_t head; /* AW+ KR+ the head of the appl netmap_ring */ uint32_t cur; /* AW+ KR+ the cur of the appl netmap_ring */ uint32_t appl_need_kick; /* AW+ KR+ kern --> appl notification enable */ uint32_t sync_flags; /* AW+ KR+ the flags of the appl [tx|rx]sync() */ uint32_t pad[12]; /* pad to a 64 bytes cacheline */ }; /* A CSB entry for the application <-- kernel direction.
*/ struct nm_csb_ktoa { uint32_t hwcur; /* AR+ KW+ the hwcur of the kern netmap_kring */ uint32_t hwtail; /* AR+ KW+ the hwtail of the kern netmap_kring */ uint32_t kern_need_kick; /* AR+ KW+ appl-->kern notification enable */ uint32_t pad[13]; }; #ifdef __linux__ #ifdef __KERNEL__ #define nm_stst_barrier smp_wmb +#define nm_ldld_barrier smp_rmb +#define nm_stld_barrier smp_mb #else /* !__KERNEL__ */ static inline void nm_stst_barrier(void) { /* A memory barrier with release semantic has the combined * effect of a store-store barrier and a load-store barrier, * which is fine for us. */ __atomic_thread_fence(__ATOMIC_RELEASE); } +static inline void nm_ldld_barrier(void) +{ + /* A memory barrier with acquire semantic has the combined + * effect of a load-load barrier and a load-store barrier, + * which is fine for us. */ + __atomic_thread_fence(__ATOMIC_ACQUIRE); +} #endif /* !__KERNEL__ */ #elif defined(__FreeBSD__) #ifdef _KERNEL #define nm_stst_barrier atomic_thread_fence_rel +#define nm_ldld_barrier atomic_thread_fence_acq +#define nm_stld_barrier atomic_thread_fence_seq_cst #else /* !_KERNEL */ #include <stdatomic.h> static inline void nm_stst_barrier(void) { atomic_thread_fence(memory_order_release); } +static inline void nm_ldld_barrier(void) +{ + atomic_thread_fence(memory_order_acquire); +} #endif /* !_KERNEL */ #else /* !__linux__ && !__FreeBSD__ */ #error "OS not supported" #endif /* !__linux__ && !__FreeBSD__ */ /* Application side of sync-kloop: Write ring pointers (cur, head) to the CSB. * This routine is coupled with sync_kloop_kernel_read(). */ static inline void nm_sync_kloop_appl_write(struct nm_csb_atok *atok, uint32_t cur, uint32_t head) { + /* Issue a first store-store barrier to make sure writes to the + * netmap ring are not reordered after updates on atok->cur and atok->head. */ + nm_stst_barrier(); + /* * We need to write cur and head to the CSB but we cannot do it atomically.
* There is no way we can prevent the host from reading the updated value * of one of the two and the old value of the other. However, if we make * sure that the host never reads a value of head more recent than the * value of cur we are safe. We can allow the host to read a value of cur * more recent than the value of head, since in the netmap ring cur can be * ahead of head and cur cannot wrap around head because it must be behind * tail. Inverting the order of writes below could instead result into the * host to think head went ahead of cur, which would cause the sync * prologue to fail. * * The following memory barrier scheme is used to make this happen: * - * Guest Host + * Guest Host * - * STORE(cur) LOAD(head) - * mb() <-----------> mb() - * STORE(head) LOAD(cur) + * STORE(cur) LOAD(head) + * wmb() <-----------> rmb() + * STORE(head) LOAD(cur) * */ atok->cur = cur; nm_stst_barrier(); atok->head = head; } /* Application side of sync-kloop: Read kring pointers (hwcur, hwtail) from * the CSB. This routine is coupled with sync_kloop_kernel_write(). */ static inline void nm_sync_kloop_appl_read(struct nm_csb_ktoa *ktoa, uint32_t *hwtail, uint32_t *hwcur) { /* * We place a memory barrier to make sure that the update of hwtail never * overtakes the update of hwcur. * (see explanation in sync_kloop_kernel_write). */ *hwtail = ktoa->hwtail; - nm_stst_barrier(); + nm_ldld_barrier(); *hwcur = ktoa->hwcur; + + /* Make sure that loads from ktoa->hwtail and ktoa->hwcur are not delayed + * after the loads from the netmap ring. */ + nm_ldld_barrier(); } /* * data for NETMAP_REQ_OPT_* options */ struct nmreq_opt_sync_kloop_eventfds { struct nmreq_option nro_opt; /* common header */ /* An array of N entries for bidirectional notifications between * the kernel loop and the application. The number of entries and * their order must agree with the CSB arrays passed in the * NETMAP_REQ_OPT_CSB option. Each entry contains a file descriptor * backed by an eventfd. 
*/ struct { /* Notifier for the application --> kernel loop direction. */ int32_t ioeventfd; /* Notifier for the kernel loop --> application direction. */ int32_t irqfd; } eventfds[0]; }; struct nmreq_opt_extmem { struct nmreq_option nro_opt; /* common header */ uint64_t nro_usrptr; /* (in) ptr to usr memory */ struct nmreq_pools_info nro_info; /* (in/out) */ }; struct nmreq_opt_csb { struct nmreq_option nro_opt; /* Array of CSB entries for application --> kernel communication * (N entries). */ uint64_t csb_atok; /* Array of CSB entries for kernel --> application communication * (N entries). */ uint64_t csb_ktoa; }; #endif /* _NET_NETMAP_H_ */ Index: stable/11 =================================================================== --- stable/11 (revision 343831) +++ stable/11 (revision 343832) Property changes on: stable/11 ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head:r343346