Index: stable/9/share/man/man4/netmap.4 =================================================================== --- stable/9/share/man/man4/netmap.4 (revision 262152) +++ stable/9/share/man/man4/netmap.4 (revision 262153) @@ -1,299 +1,1075 @@ -.\" Copyright (c) 2011 Matteo Landi, Luigi Rizzo, Universita` di Pisa +.\" Copyright (c) 2011-2014 Matteo Landi, Luigi Rizzo, Universita` di Pisa .\" All rights reserved. .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice, this list of conditions and the following disclaimer. .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice, this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. .\" .\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE .\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" .\" This document is derived in part from the enet man page (enet.4) .\" distributed with 4.3BSD Unix. .\" .\" $FreeBSD$ -.\" $Id: netmap.4 9662 2011-11-16 13:18:06Z luigi $: stable/8/share/man/man4/bpf.4 181694 2008-08-13 17:45:06Z ed $ .\" -.Dd November 16, 2011 +.Dd February 13, 2014 .Dt NETMAP 4 .Os .Sh NAME .Nm netmap .Nd a framework for fast packet I/O +.br +.Nm VALE +.Nd a fast VirtuAl Local Ethernet using the netmap API +.br +.Nm netmap pipes +.Nd a shared memory packet transport channel .Sh SYNOPSIS .Cd device netmap .Sh DESCRIPTION .Nm -is a framework for fast and safe access to network devices -(reaching 14.88 Mpps at less than 1 GHz). +is a framework for extremely fast and efficient packet I/O +for both userspace and kernel clients. +It runs on FreeBSD and Linux, +and includes +.Nm VALE , +a very fast and modular in-kernel software switch/dataplane, +and +.Nm netmap pipes , +a shared memory packet transport channel. +All these are accessed interchangeably with the same API. +.Pp +.Nm , VALE +and +.Nm netmap pipes +are at least one order of magnitude faster than +standard OS mechanisms +(sockets, bpf, tun/tap interfaces, native switches, pipes), +reaching 14.88 million packets per second (Mpps) +with much less than one core on a 10 Gbit NIC, +about 20 Mpps per core for VALE ports, +and over 100 Mpps for netmap pipes. +.Pp +Userspace clients can dynamically switch NICs into .Nm -uses memory mapped buffers and metadata -(buffer indexes and lengths) to communicate with the kernel, -which is in charge of validating information through -.Pa ioctl() +mode and send and receive raw packets through +memory mapped buffers. 
+Similarly,
+.Nm VALE
+switch instances and ports, and
+.Nm netmap pipes
+can be created dynamically,
+providing high speed packet I/O between processes,
+virtual machines, NICs and the host stack.
+.Pp
+.Nm
+supports both non-blocking I/O, through
+.Xr ioctl 2
+commands, and synchronization and blocking I/O, through a file descriptor
+and standard OS mechanisms such as
+.Xr select 2 ,
+.Xr poll 2 ,
+.Xr epoll 2 ,
+.Xr kqueue 2 .
+.Nm VALE
and
-.Pa select()/poll() .
+.Nm netmap pipes
+are implemented by a single kernel module, which also emulates the
.Nm
-can exploit the parallelism in multiqueue devices and
-multicore systems.
+API over standard drivers for devices without native
+.Nm
+support.
+For best performance,
+.Nm
+requires explicit support in device drivers.
.Pp
+In the rest of this (long) manual page we document
+various aspects of the
+.Nm
+and
+.Nm VALE
+architecture, features and usage.
.Pp
+.Sh ARCHITECTURE
.Nm
-requires explicit support in device drivers.
-For a list of supported devices, see the end of this manual page.
-.Sh OPERATION
+supports raw packet I/O through a
+.Em port ,
+which can be connected to a physical interface
+.Em ( NIC ) ,
+to the host stack,
+or to a
+.Nm VALE
+switch.
+Ports use preallocated circular queues of buffers
+.Em ( rings )
+residing in an mmapped region.
+There is one ring for each transmit/receive queue of a
+NIC or virtual port.
+An additional ring pair connects to the host stack.
+.Pp
+After binding a file descriptor to a port, a
.Nm
-clients must first open the
-.Pa open("/dev/netmap") ,
-and then issue an
-.Pa ioctl(...,NIOCREGIF,...)
-to bind the file descriptor to a network device.
+client can send or receive packets in batches through
+the rings, and possibly implement zero-copy forwarding
+between ports.
.Pp
-When a device is put in
+All NICs operating in
.Nm
-mode, its data path is disconnected from the host stack.
-The processes owning the file descriptor
-can exchange packets with the device, or with the host stack,
-through an mmapped memory region that contains pre-allocated
-buffers and metadata.
+mode use the same memory region,
+accessible to all processes that own
+.Pa /dev/netmap
+file descriptors bound to NICs.
+Independent
+.Nm VALE
+and
+.Nm netmap pipe
+ports
+by default use separate memory regions,
+but can be independently configured to share memory.
.Pp
+.Sh ENTERING AND EXITING NETMAP MODE
+The following section describes the system calls to create
+and control
+.Nm netmap
+ports (including
+.Nm VALE
+and
+.Nm netmap pipe
+ports).
+Simpler, higher level functions are described in the
+.Sx LIBRARIES
+section.
+.Pp
+Ports and rings are created and controlled through a file descriptor,
+created by opening a special device
+.Dl fd = open("/dev/netmap", O_RDWR);
+and then bound to a specific port with an
+.Dl ioctl(fd, NIOCREGIF, (struct nmreq *)arg);
+.Pp
+.Nm
+has multiple modes of operation controlled by the
+.Vt struct nmreq
+argument.
+.Va arg.nr_name
+specifies the port name, as follows:
+.Bl -tag -width XXXX
+.It Dv OS network interface name (e.g. 'em0', 'eth1', ... )
+the data path of the NIC is disconnected from the host stack,
+and the file descriptor is bound to the NIC (one or all queues),
+or to the host stack;
+.It Dv valeXXX:YYY (arbitrary XXX and YYY)
+the file descriptor is bound to port YYY of a VALE switch called XXX,
+both dynamically created if necessary.
+The string cannot exceed IFNAMSIZ characters, and YYY cannot
+be the name of any existing OS network interface.
+.El
+.Pp
+On return,
+.Va arg
+indicates the size of the shared memory region,
+and the number, size and location of all the
+.Nm
+data structures, which can be accessed by mmapping the memory
+.Dl char *mem = mmap(0, arg.nr_memsize, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
+.Pp
Non blocking I/O is done with special
-.Pa ioctl()'s ,
-whereas the file descriptor can be passed to
-.Pa select()/poll()
-to be notified about incoming packet or available transmit buffers.
-.Ss Data structures
-All data structures for all devices in
+.Xr ioctl 2
+commands, whereas
+.Xr select 2
+and
+.Xr poll 2
+on the file descriptor permit blocking I/O.
+.Xr epoll 2
+and
+.Xr kqueue 2
+are not supported on
.Nm
-mode are in a memory
-region shared by the kernel and all processes
-who open
-.Pa /dev/netmap
-(NOTE: visibility may be restricted in future implementations).
-All references between the shared data structure
-are relative (offsets or indexes). Some macros help converting
-them into actual pointers.
+file descriptors.
.Pp
-The data structures in shared memory are the following:
+While a NIC is in
+.Nm
+mode, the OS will still believe the interface is up and running.
+OS-generated packets for that NIC end up in a
+.Nm
+ring, and another ring is used to send packets into the OS network stack.
+A
+.Xr close 2
+on the file descriptor removes the binding,
+and returns the NIC to normal mode (reconnecting the data path
+to the host stack), or destroys the virtual port.
.Pp
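+For illustration, here is a minimal sketch of the whole binding
+sequence (the port name is just an example, and error handling
+is omitted):
+.Bd -literal
+	struct nmreq nmr;
+	void *mem;
+	int fd;
+
+	fd = open("/dev/netmap", O_RDWR);
+	bzero(&nmr, sizeof(nmr));
+	strncpy(nmr.nr_name, "em0", sizeof(nmr.nr_name));
+	nmr.nr_version = NETMAP_API;
+	ioctl(fd, NIOCREGIF, &nmr);	/* em0 is now in netmap mode */
+	mem = mmap(0, nmr.nr_memsize, PROT_READ|PROT_WRITE,
+	    MAP_SHARED, fd, 0);		/* rings and buffers live here */
+.Ed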
+.Sh DATA STRUCTURES
+The data structures in the mmapped memory region are detailed in
+.In net/netmap.h ,
+which is the ultimate reference for the
+.Nm
+API. The main structures and fields are indicated below:
.Bl -tag -width XXX
.It Dv struct netmap_if (one per interface)
-indicates the number of rings supported by an interface, their
-sizes, and the offsets of the
-.Pa netmap_rings
-associated to the interface.
-The offset of a
-.Pa struct netmap_if
-in the shared memory region is indicated by the
-.Pa nr_offset
-field in the structure returned by the
-.Pa NIOCREGIF
-(see below).
.Bd -literal
struct netmap_if {
-	char ni_name[IFNAMSIZ]; /* name of the interface. */
-	const u_int ni_num_queues; /* number of hw ring pairs */
-	const ssize_t ring_ofs[]; /* offset of tx and rx rings */
+	...
+	const uint32_t ni_flags;	/* properties */
+	...
+	const uint32_t ni_tx_rings;	/* NIC tx rings */
+	const uint32_t ni_rx_rings;	/* NIC rx rings */
+	uint32_t ni_bufs_head;		/* head of extra bufs list */
+	...
};
.Ed
+.Pp
+Indicates the number of available rings
+.Pa ( struct netmap_rings )
+and their position in the mmapped region.
+The number of tx and rx rings
+.Pa ( ni_tx_rings , ni_rx_rings )
+normally depends on the hardware.
+NICs also have an extra tx/rx ring pair connected to the host stack.
+.Em NIOCREGIF
+can also request additional unbound buffers in the same memory space,
+to be used as temporary storage for packets.
+.Pa ni_bufs_head
+contains the index of the first of these free buffers,
+which are connected in a list (the first uint32_t of each
+buffer being the index of the next buffer in the list).
+A 0 indicates the end of the list.
+.Pp
.It Dv struct netmap_ring (one per ring)
-contains the index of the current read or write slot (cur),
-the number of slots available for reception or transmission (avail),
-and an array of
-.Pa slots
-describing the buffers.
-There is one ring pair for each of the N hardware ring pairs
-supported by the card (numbered 0..N-1), plus
-one ring pair (numbered N) for packets from/to the host stack.
.Bd -literal
struct netmap_ring {
-	const ssize_t buf_ofs;
-	const uint32_t num_slots; /* number of slots in the ring. */
-	uint32_t avail; /* number of usable slots */
-	uint32_t cur; /* 'current' index for the user side */
-
-	const uint16_t nr_buf_size;
-	uint16_t flags;
-	struct netmap_slot slot[0]; /* array of slots. */
+	...
+	const uint32_t num_slots;	/* slots in each ring */
+	const uint32_t nr_buf_size;	/* size of each buffer */
+	...
+	uint32_t head;		/* (u) first buf owned by user */
+	uint32_t cur;		/* (u) wakeup position */
+	const uint32_t tail;	/* (k) first buf owned by kernel */
+	...
+	uint32_t flags;
+	struct timeval ts;	/* (k) time of last rxsync() */
+	...
+	struct netmap_slot slot[0]; /* array of slots */
}
.Ed
-.It Dv struct netmap_slot (one per packet)
-contains the metadata for a packet: a buffer index (buf_idx),
-a buffer length (len), and some flags.
+.Pp
+Implements transmit and receive rings, with read/write
+pointers, metadata and an array of
+.Pa slots
+describing the buffers.
+.Pp
+.It Dv struct netmap_slot (one per buffer)
.Bd -literal
struct netmap_slot {
-	uint32_t buf_idx; /* buffer index */
-	uint16_t len; /* packet length */
-	uint16_t flags; /* buf changed, etc. */
-#define NS_BUF_CHANGED 0x0001 /* must resync, buffer changed */
-#define NS_REPORT 0x0002 /* tell hw to report results
-                          * e.g. by generating an interrupt
-                          */
+	uint32_t buf_idx;	/* buffer index */
+	uint16_t len;		/* packet length */
+	uint16_t flags;		/* buf changed, etc. */
+	uint64_t ptr;		/* address for indirect buffers */
};
.Ed
+.Pp
+Describes a packet buffer, which normally is identified by
+an index and resides in the mmapped region.
.It Dv packet buffers
-are fixed size (approximately 2k) buffers allocated by the kernel
-that contain packet data. Buffers addresses are computed through
-macros.
+Fixed size (normally 2 KB) packet buffers allocated by the kernel.
.El
.Pp
-Some macros support the access to objects in the shared memory
-region. In particular:
+The offset of the
+.Pa struct netmap_if
+in the mmapped region is indicated by the
+.Pa nr_offset
+field in the structure returned by
+.Pa NIOCREGIF .
+From there, all other objects are reachable through
+relative references (offsets or indexes).
+Macros and functions in
+.In net/netmap_user.h
+help converting them into actual pointers:
+.Pp
+.Dl struct netmap_if *nifp = NETMAP_IF(mem, arg.nr_offset);
+.Dl struct netmap_ring *txr = NETMAP_TXRING(nifp, ring_index);
+.Dl struct netmap_ring *rxr = NETMAP_RXRING(nifp, ring_index);
+.Pp
+.Dl char *buf = NETMAP_BUF(ring, buffer_index);
+.Sh RINGS, BUFFERS AND DATA I/O
+.Va Rings
+are circular queues of packets with three indexes/pointers
+.Va ( head , cur , tail ) ;
+one slot is always kept empty.
+The ring size
+.Va ( num_slots )
+should not be assumed to be a power of two.
+.br
+(NOTE: older versions of netmap used head/count format to indicate
+the content of a ring).
+.Pp
+.Va head
+is the first slot available to userspace;
+.br
+.Va cur
+is the wakeup point:
+select/poll will unblock when
+.Va tail
+passes
+.Va cur ;
+.br
+.Va tail
+is the first slot reserved to the kernel.
+.Pp
+Slot indexes MUST only move forward;
+for convenience, the function
+.Dl nm_ring_next(ring, index)
+returns the next index modulo the ring size.
+.Pp
+.Va head
+and
+.Va cur
+are only modified by the user program;
+.Va tail
+is only modified by the kernel.
+The kernel only reads/writes the
+.Vt struct netmap_ring
+slots and buffers
+during the execution of a netmap-related system call.
+The only exceptions are slots (and buffers) in the range
+.Va tail\ . . . head-1 ,
+that are explicitly assigned to the kernel.
+.Pp
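+For illustration, a function with the same semantics as
+.Va nm_ring_next()
+could be written as follows (a sketch, not the actual implementation):
+.Bd -literal
+	static inline uint32_t
+	ring_next(struct netmap_ring *r, uint32_t i)
+	{
+		/* wrap around at the end of the ring */
+		return (i + 1 == r->num_slots) ? 0 : i + 1;
+	}
+.Ed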
+.Ss TRANSMIT RINGS
+On transmit rings, after a
+.Nm
+system call, slots in the range
+.Va head\ . . . tail-1
+are available for transmission.
+User code should fill the slots sequentially
+and advance
+.Va head
+and
+.Va cur
+past slots ready to transmit.
+.Va cur
+may be moved further ahead if the user code needs
+more slots before further transmissions (see
+.Sx SCATTER GATHER I/O ) .
+.Pp
+At the next NIOCTXSYNC/select()/poll(),
+slots up to
+.Va head-1
+are pushed to the port, and
+.Va tail
+may advance if further slots have become available.
+Below is an example of the evolution of a TX ring:
+.Pp
.Bd -literal
-struct netmap_if *nifp;
-struct netmap_ring *txring = NETMAP_TXRING(nifp, i);
-struct netmap_ring *rxring = NETMAP_RXRING(nifp, i);
-int i = txring->slot[txring->cur].buf_idx;
-char *buf = NETMAP_BUF(txring, i);
+ after the syscall, slots between cur and tail are (a)vailable
+          head=cur     tail
+              |          |
+              v          v
+    TX  [.....aaaaaaaaaaa.............]
+
+ user creates new packets to (T)ransmit
+             head=cur  tail
+                   |     |
+                   v     v
+    TX  [.....TTTTTaaaaaa.............]
+
+ NIOCTXSYNC/poll()/select() sends packets and reports new slots
+             head=cur       tail
+                   |          |
+                   v          v
+    TX  [..........aaaaaaaaaaa........]
.Ed
-.Ss IOCTLS
.Pp
+select() and poll() will block if there is no space in the ring, i.e.
+.Dl ring->cur == ring->tail
+and return when new slots have become available.
+.Pp
+High speed applications may want to amortize the cost of system calls
+by preparing as many packets as possible before issuing them.
+.Pp
+A transmit ring with pending transmissions has
+.Dl ring->head != ring->tail + 1 (modulo the ring size).
+The function
+.Va int nm_tx_pending(ring)
+implements this test.
+.Pp
+.Ss RECEIVE RINGS
+On receive rings, after a
.Nm
-supports some ioctl() to synchronize the state of the rings
-between the kernel and the user processes, plus some
-to query and configure the interface.
-The former do not require any argument, whereas the latter
-use a
-.Pa struct netmap_req
-defined as follows:
+system call, the slots in the range
+.Va head\ . . . tail-1
+contain received packets.
+User code should process them and advance
+.Va head
+and
+.Va cur
+past slots it wants to return to the kernel.
+.Va cur
+may be moved further ahead if the user code wants to
+wait for more packets
+without returning all the previous slots to the kernel.
+.Pp
+At the next NIOCRXSYNC/select()/poll(),
+slots up to
+.Va head-1
+are returned to the kernel for further receives, and
+.Va tail
+may advance to report new incoming packets.
+.br
+Below is an example of the evolution of an RX ring:
.Bd -literal
+ after the syscall, there are some (h)eld and some (R)eceived slots
+         head   cur     tail
+           |     |       |
+           v     v       v
+    RX  [..hhhhhhRRRRRRRR..........]
+
+ user advances head and cur, releasing some slots and holding others
+              head cur   tail
+                |  |     |
+                v  v     v
+    RX  [..*****hhhRRRRRR...........]
+
+ NIOCRXSYNC/poll()/select() recovers slots and reports new packets
+              head cur         tail
+                |  |           |
+                v  v           v
+    RX  [.......hhhRRRRRRRRRRRR....]
+.Ed
+.Pp
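+As a sketch, the canonical loop to drain a receive ring with the
+native API looks like the following (assuming fd and nifp were
+obtained as shown earlier; packet processing is left abstract):
+.Bd -literal
+	struct netmap_ring *rxr = NETMAP_RXRING(nifp, 0);
+	uint32_t i;
+	char *buf;
+
+	ioctl(fd, NIOCRXSYNC, NULL);	/* or poll() with POLLIN */
+	while (rxr->head != rxr->tail) {	/* ring not empty */
+		i = rxr->head;
+		buf = NETMAP_BUF(rxr, rxr->slot[i].buf_idx);
+		/* ... process rxr->slot[i].len bytes at buf ... */
+		rxr->head = rxr->cur = nm_ring_next(rxr, i);
+	}
+.Ed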
+.Sh SLOTS AND PACKET BUFFERS
+Normally, packets should be stored in the netmap-allocated buffers
+assigned to slots when ports are bound to a file descriptor.
+One packet is fully contained in a single buffer.
+.Pp
+The following flags affect slot and buffer processing:
+.Bl -tag -width XXX
+.It NS_BUF_CHANGED
+It MUST be set whenever the buf_idx in the slot is changed.
+This can be used to implement
+zero-copy forwarding, see
+.Sx ZERO-COPY FORWARDING .
+.Pp
+.It NS_REPORT
+reports when this buffer has been transmitted.
+Normally,
+.Nm
+notifies transmit completions in batches, hence signals
+can be delayed indefinitely. This flag helps detect
+when packets have been sent and a file descriptor can be closed.
+.It NS_FORWARD
+When a ring is in 'transparent' mode (see
+.Sx TRANSPARENT MODE ) ,
+packets marked with this flag are forwarded to the other endpoint
+at the next system call, thus restoring (in a selective way)
+the connection between a NIC and the host stack.
+.It NS_NO_LEARN
+tells the forwarding code that the SRC MAC address for this
+packet must not be used in the learning bridge code.
+.It NS_INDIRECT
+indicates that the packet's payload is in a user-supplied buffer,
+whose user virtual address is in the 'ptr' field of the slot.
+The size can reach 65535 bytes.
+.br
+This is only supported on the transmit ring of
+.Nm VALE
+ports, and it helps reduce data copies in the interconnection
+of virtual machines.
+.It NS_MOREFRAG
+indicates that the packet continues with subsequent buffers;
+the last buffer in a packet must have the flag clear.
+.El
+.Sh SCATTER GATHER I/O
+Packets can span multiple slots if the
+.Va NS_MOREFRAG
+flag is set in all but the last slot.
+The maximum length of a chain is 64 buffers.
+This is normally used with
+.Nm VALE
+ports when connecting virtual machines, as they generate large
+TSO segments that are not split unless they reach a physical device.
+.Pp
+NOTE: The length field always refers to the individual
+fragment; the total length of a packet is not stored anywhere.
+.Pp
+On receive rings the macro
+.Va NS_RFRAGS(slot)
+indicates the remaining number of slots for this packet,
+including the current one.
+Slots with a value greater than 1 also have NS_MOREFRAG set.
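+.Pp
+For illustration, a two-fragment packet could be queued on the
+transmit ring of a
+.Nm VALE
+port as follows (txr is obtained as shown earlier, and the
+fragment lengths are placeholders):
+.Bd -literal
+	uint32_t i = txr->head;
+
+	txr->slot[i].len = first_frag_len;
+	txr->slot[i].flags = NS_MOREFRAG;	/* more fragments follow */
+	i = nm_ring_next(txr, i);
+	txr->slot[i].len = last_frag_len;
+	txr->slot[i].flags = 0;			/* last fragment */
+	txr->head = txr->cur = nm_ring_next(txr, i);
+.Ed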
+.Sh IOCTLS
+.Nm
+uses two ioctls (NIOCTXSYNC, NIOCRXSYNC)
+for non-blocking I/O. They take no argument.
+Two more ioctls (NIOCGINFO, NIOCREGIF) are used
+to query and configure ports, with the following argument:
+.Bd -literal
struct nmreq {
-	char nr_name[IFNAMSIZ];
-	uint32_t nr_offset;	/* nifp offset in the shared region */
-	uint32_t nr_memsize;	/* size of the shared region */
-	uint32_t nr_numdescs;	/* descriptors per queue */
-	uint16_t nr_numqueues;
-	uint16_t nr_ringid;	/* ring(s) we care about */
-#define NETMAP_HW_RING 0x4000	/* low bits indicate one hw ring */
-#define NETMAP_SW_RING 0x2000	/* we process the sw ring */
-#define NETMAP_NO_TX_POLL 0x1000 /* no gratuitous txsync on poll */
-#define NETMAP_RING_MASK 0xfff	/* the actual ring number */
+	char nr_name[IFNAMSIZ];	/* (i) port name */
+	uint32_t nr_version;	/* (i) API version */
+	uint32_t nr_offset;	/* (o) nifp offset in mmap region */
+	uint32_t nr_memsize;	/* (o) size of the mmap region */
+	uint32_t nr_tx_slots;	/* (i/o) slots in tx rings */
+	uint32_t nr_rx_slots;	/* (i/o) slots in rx rings */
+	uint16_t nr_tx_rings;	/* (i/o) number of tx rings */
+	uint16_t nr_rx_rings;	/* (i/o) number of rx rings */
+	uint16_t nr_ringid;	/* (i/o) ring(s) we care about */
+	uint16_t nr_cmd;	/* (i) special command */
+	uint16_t nr_arg1;	/* (i/o) extra arguments */
+	uint16_t nr_arg2;	/* (i/o) extra arguments */
+	uint32_t nr_arg3;	/* (i/o) extra arguments */
+	uint32_t nr_flags;	/* (i/o) open mode */
+	...
};
-
.Ed
+.Pp
-A device descriptor obtained through
+A file descriptor obtained through
.Pa /dev/netmap
-also supports the ioctl supported by network devices.
+also supports the ioctls supported by network devices, see
+.Xr netintro 4 .
.Pp
-The netmap-specific
-.Xr ioctl 2
-command codes below are defined in
-.In net/netmap.h
-and are:
.Bl -tag -width XXXX
.It Dv NIOCGINFO
-returns information about the interface named in nr_name.
-On return, nr_memsize indicates the size of the shared netmap
-memory region (this is device-independent),
-nr_numslots indicates how many buffers are in a ring,
-nr_numrings indicates the number of rings supported by the hardware.
+returns EINVAL if the named port does not support netmap.
+Otherwise, it returns 0 and (advisory) information
+about the port.
+Note that all the information below can change before the
+interface is actually put in netmap mode.
.Pp
-If the device does not support netmap, the ioctl returns EINVAL.
+.Bl -tag -width XX
+.It Pa nr_memsize
+indicates the size of the
+.Nm
+memory region. NICs in
+.Nm
+mode all share the same memory region,
+whereas
+.Nm VALE
+ports have independent regions for each port.
+.It Pa nr_tx_slots , nr_rx_slots
+indicate the size of transmit and receive rings.
+.It Pa nr_tx_rings , nr_rx_rings
+indicate the number of transmit
+and receive rings.
+Both the number of rings and their sizes may be configured at runtime
+using interface-specific functions (e.g.
+.Xr ethtool
+).
+.El
.It Dv NIOCREGIF
-puts the interface named in nr_name into netmap mode, disconnecting
-it from the host stack, and/or defines which rings are controlled
-through this file descriptor.
-On return, it gives the same info as NIOCGINFO, and nr_ringid
-indicates the identity of the rings controlled through the file
+binds the port named in
+.Va nr_name
+to the file descriptor. For a physical device this also switches it into
+.Nm
+mode, disconnecting
+it from the host stack.
+Multiple file descriptors can be bound to the same port,
+with proper synchronization left to the user.
+.Pp
+.Dv NIOCREGIF can also bind a file descriptor to one endpoint of a
+.Em netmap pipe ,
+consisting of two netmap ports with a crossover connection.
+A netmap pipe shares the same memory space as the parent port,
+and is meant to enable configurations where a master process acts
+as a dispatcher towards slave processes.
+.Pp
+To enable this function, the
+.Pa nr_arg1
+field of the structure can be used as a hint to the kernel to
+indicate how many pipes we expect to use, and reserve extra space
+in the memory region.
+.Pp
+On return, it gives the same info as NIOCGINFO,
+with
+.Pa nr_ringid
+and
+.Pa nr_flags
+indicating the identity of the rings controlled through the file
descriptor.
.Pp
-Possible values for nr_ringid are
+.Va nr_flags
+and
+.Va nr_ringid
+select which rings are controlled through this file descriptor.
+Possible values of
+.Pa nr_flags
+are indicated below, together with the naming schemes
+that application libraries (such as the
+.Nm nm_open
+function indicated below) can use to indicate the specific set of rings.
+In the examples below, "netmap:foo" is any valid netmap port name.
+.Pp
.Bl -tag -width XXXXX
-.It 0
-default, all hardware rings
-.It NETMAP_SW_RING
-the ``host rings'' connecting to the host stack
-.It NETMAP_HW_RING + i
-the i-th hardware ring
+.It NR_REG_ALL_NIC "netmap:foo"
+(default) all hardware ring pairs
+.It NR_REG_SW "netmap:foo^"
+the ``host rings'', connecting to the host stack.
+.It NR_REG_NIC_SW "netmap:foo+"
+all hardware rings and the host rings
+.It NR_REG_ONE_NIC "netmap:foo-i"
+only the i-th hardware ring pair, where the number is in
+.Pa nr_ringid ;
+.It NR_REG_PIPE_MASTER "netmap:foo{i"
+the master side of the netmap pipe whose identifier (i) is in
+.Pa nr_ringid ;
+.It NR_REG_PIPE_SLAVE "netmap:foo}i"
+the slave side of the netmap pipe whose identifier (i) is in
+.Pa nr_ringid .
+.Pp
+The identifier of a pipe must be thought of as part of the pipe name,
+and does not need to be sequential. On return the pipe
+will only have a single ring pair with index 0,
+irrespective of the value of i.
.El
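+.Pp
+For example, the following sketch binds only the third hardware
+ring pair of a NIC (the port name is illustrative and error
+handling is omitted); in the
+.Va nm_open()
+naming scheme described below this corresponds to "netmap:em0-2":
+.Bd -literal
+	struct nmreq nmr;
+
+	bzero(&nmr, sizeof(nmr));
+	strncpy(nmr.nr_name, "em0", sizeof(nmr.nr_name));
+	nmr.nr_version = NETMAP_API;
+	nmr.nr_flags = NR_REG_ONE_NIC;	/* a single ring pair */
+	nmr.nr_ringid = 2;		/* the third ring pair */
+	ioctl(fd, NIOCREGIF, &nmr);
+.Ed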
+.Pp
By default, a
-.Nm poll
+.Xr poll 2
or
-.Nm select
+.Xr select 2
call pushes out any pending
packets on the transmit ring, even if no write events are
specified.
The feature can be disabled by or-ing
-.Nm NETMAP_NO_TX_SYNC
-to nr_ringid.
-But normally you should keep this feature unless you are using
-separate file descriptors for the send and receive rings, because
-otherwise packets are pushed out only if NETMAP_TXSYNC is called,
-or the send queue is full.
+.Va NETMAP_NO_TX_POLL
+to the value written to
+.Va nr_ringid .
+When this feature is used,
+packets are transmitted only when
+.Va ioctl(NIOCTXSYNC)
+or select()/poll() are called with a write event (POLLOUT/wfdset),
+or when the transmit ring is full.
.Pp
-.Pa NIOCREGIF
-can be used multiple times to change the association of a
-file descriptor to a ring pair, always within the same device.
-.It Dv NIOCUNREGIF
-brings an interface back to normal mode.
+When registering a virtual interface that is dynamically created on a
+.Xr vale 4
+switch, we can specify the desired number of rings (1 by default,
+and currently up to 16) using the nr_tx_rings and nr_rx_rings fields.
.It Dv NIOCTXSYNC
tells the hardware of new packets to transmit, and updates the
number of slots available for transmission.
.It Dv NIOCRXSYNC
tells the hardware of consumed packets, and asks for newly available
packets.
.El
-.Ss SYSTEM CALLS
+.Sh SELECT, POLL, EPOLL, KQUEUE
+.Xr select 2
+and
+.Xr poll 2
+on a
.Nm
+file descriptor process rings as indicated in
+.Sx TRANSMIT RINGS
+and
+.Sx RECEIVE RINGS ,
+respectively when write (POLLOUT) and read (POLLIN) events are requested.
+Both block if no slots are available in the ring
+.Va ( ring->cur == ring->tail ) .
+Depending on the platform,
+.Xr epoll 2
+and
+.Xr kqueue 2
+are supported too.
+.Pp
+Packets in transmit rings are normally pushed out
+(and buffers reclaimed) even without
+requesting write events. Passing the NETMAP_NO_TX_POLL flag to
+.Em NIOCREGIF
+disables this feature.
+By default, receive rings are processed only if read
+events are requested. Passing the NETMAP_DO_RX_POLL flag to
+.Em NIOCREGIF
+updates receive rings even without read events.
+Note that on epoll and kqueue, NETMAP_NO_TX_POLL and NETMAP_DO_RX_POLL
+only have an effect when some event is posted for the file descriptor.
+.Sh LIBRARIES
+The
+.Nm
+API is meant to be used directly, both because of its simplicity and
+for efficient integration with applications.
+.Pp
+For convenience, the
+.In net/netmap_user.h
+header provides a few macros and functions to ease creating
+a file descriptor and doing I/O with a
+.Nm
+port. These are loosely modeled after the
+.Xr pcap 3
+API, to ease porting of libpcap-based applications to
+.Nm .
+To use these extra functions, programs should
+.Dl #define NETMAP_WITH_LIBS
+before
+.Dl #include <net/netmap_user.h>
+.Pp
+The following functions are available:
+.Bl -tag -width XXXXX
+.It Va struct nm_desc * nm_open(const char *ifname, const struct nmreq *req, uint64_t flags, const struct nm_desc *arg)
+similar to
+.Xr pcap_open ,
+binds a file descriptor to a port.
+.Bl -tag -width XX
+.It Va ifname
+is a port name, in the form "netmap:XXX" for a NIC and "valeXXX:YYY" for a
+.Nm VALE
+port.
+.It Va req
+provides the initial values for the argument to the NIOCREGIF ioctl.
+The nr_flags and nr_ringid values are overwritten by parsing
+ifname and flags, and other fields can be overridden through
+the other two arguments.
+.It Va arg
+points to a struct nm_desc containing arguments (e.g. from a previously
+opened file descriptor) that should override the defaults.
+The fields are used as described below.
+.It Va flags
+can be set to a combination of the following flags:
+.Va NETMAP_NO_TX_POLL ,
+.Va NETMAP_DO_RX_POLL
+(copied into nr_ringid);
+.Va NM_OPEN_NO_MMAP (if arg points to the same memory region,
+avoids the mmap and uses the values from it);
+.Va NM_OPEN_IFNAME (ignores ifname and uses the values in arg);
+.Va NM_OPEN_ARG1 ,
+.Va NM_OPEN_ARG2 ,
+.Va NM_OPEN_ARG3 (uses the fields from arg);
+.Va NM_OPEN_RING_CFG (uses the ring number and sizes from arg).
+.El
+.It Va int nm_close(struct nm_desc *d)
+closes the file descriptor, unmaps memory, frees resources.
+.It Va int nm_inject(struct nm_desc *d, const void *buf, size_t size)
+similar to pcap_inject(), pushes a packet to a ring, returns the size
+of the packet if successful, or 0 on error;
+.It Va int nm_dispatch(struct nm_desc *d, int cnt, nm_cb_t cb, u_char *arg)
+similar to pcap_dispatch(), applies a callback to incoming packets.
+.It Va u_char * nm_nextpkt(struct nm_desc *d, struct nm_pkthdr *hdr)
+similar to pcap_next(), fetches the next packet.
+.El
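+.Pp
+As a sketch, a callback-based receive loop built on these helpers
+could look like the following (the callback name is illustrative):
+.Bd -literal
+	static void
+	my_cb(u_char *arg, const struct nm_pkthdr *h, const u_char *buf)
+	{
+		/* process h->len bytes starting at buf */
+	}
+	...
+	struct nm_desc *d = nm_open("netmap:em0", NULL, 0, 0);
+	struct pollfd fds = { .fd = NETMAP_FD(d), .events = POLLIN };
+	for (;;) {
+		poll(&fds, 1, -1);
+		nm_dispatch(d, -1, my_cb, NULL); /* all pending packets */
+	}
+.Ed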
+.Sh SUPPORTED DEVICES
+.Nm
+natively supports the following devices:
+.Pp
+On FreeBSD:
+.Xr em 4 ,
+.Xr igb 4 ,
+.Xr ixgbe 4 ,
+.Xr lem 4 ,
+.Xr re 4 .
+.Pp
+On Linux:
+.Xr e1000 4 ,
+.Xr e1000e 4 ,
+.Xr igb 4 ,
+.Xr ixgbe 4 ,
+.Xr mlx4 4 ,
+.Xr forcedeth 4 ,
+.Xr r8169 4 .
+.Pp
+NICs without native support can still be used in
+.Nm
+mode through emulation. Performance is inferior to native netmap
+mode but still significantly higher than sockets, and approaching
+that of in-kernel solutions such as Linux's
+.Xr pktgen .
+.Pp
+Emulation is also available for devices with native netmap support,
+which can be used for testing or performance comparison.
+The sysctl variable
+.Va dev.netmap.admode
+globally controls how netmap mode is implemented.
+.Sh SYSCTL VARIABLES AND MODULE PARAMETERS
+Some aspects of the operation of
+.Nm
+are controlled through sysctl variables on FreeBSD
+.Em ( dev.netmap.* )
+and module parameters on Linux
+.Em ( /sys/module/netmap_lin/parameters/* ) :
+.Pp
+.Bl -tag -width indent
+.It Va dev.netmap.admode: 0
+Controls the use of native or emulated adapter mode.
+0 uses the best available option, 1 forces native and
+fails if not available, 2 forces emulated hence never fails.
+.It Va dev.netmap.generic_ringsize: 1024
+Ring size used for emulated netmap mode
+.It Va dev.netmap.generic_mit: 100000
+Controls interrupt moderation for emulated mode
+.It Va dev.netmap.mmap_unreg: 0
+.It Va dev.netmap.fwd: 0
+Forces NS_FORWARD mode
+.It Va dev.netmap.flags: 0
+.It Va dev.netmap.txsync_retry: 2
+.It Va dev.netmap.no_pendintr: 1
+Forces recovery of transmit buffers on system calls
+.It Va dev.netmap.mitigate: 1
+Propagates interrupt mitigation to user processes
+.It Va dev.netmap.no_timestamp: 0
+Disables the update of the timestamp in the netmap ring
+.It Va dev.netmap.verbose: 0
+Verbose kernel messages
+.It Va dev.netmap.buf_num: 163840
+.It Va dev.netmap.buf_size: 2048
+.It Va dev.netmap.ring_num: 200
+.It Va dev.netmap.ring_size: 36864
+.It Va dev.netmap.if_num: 100
+.It Va dev.netmap.if_size: 1024
+Sizes and number of objects (netmap_if, netmap_ring, buffers)
+for the global memory region. The only parameter worth modifying is
+.Va dev.netmap.buf_num
+as it impacts the total amount of memory used by netmap.
+.It Va dev.netmap.buf_curr_num: 0
+.It Va dev.netmap.buf_curr_size: 0
+.It Va dev.netmap.ring_curr_num: 0
+.It Va dev.netmap.ring_curr_size: 0
+.It Va dev.netmap.if_curr_num: 0
+.It Va dev.netmap.if_curr_size: 0
+Actual values in use.
+.It Va dev.netmap.bridge_batch: 1024
+Batch size used when moving packets across a
+.Nm VALE
+switch. Values above 64 generally guarantee good
+performance.
+.El
+.Sh SYSTEM CALLS
.Nm
uses
-.Nm select
+.Xr select 2 ,
+.Xr poll 2 ,
+.Xr epoll
and
-.Nm poll
+.Xr kqueue
-to wake up processes when significant events occur.
+to wake up processes when significant events occur, and
+.Xr mmap 2
+to map memory.
+.Xr ioctl 2
+is used to configure ports and
+.Nm VALE switches .
+.Pp
+Applications may need to create threads and bind them to
+specific cores to improve performance, using standard
+OS primitives, see
+.Xr pthread 3 .
+In particular,
+.Xr pthread_setaffinity_np 3
+may be of use.
+.Sh CAVEATS
+No matter how fast the CPU and OS are,
+achieving line rate on 10G and faster interfaces
+requires hardware with sufficient performance.
+Several NICs are unable to sustain line rate with
+small packet sizes. Insufficient PCIe or memory bandwidth
+can also cause reduced performance.
+.Pp
+Another frequent reason for low performance is the use
+of flow control on the link: a slow receiver can limit
+the transmit speed.
+Be sure to disable flow control when running high
+speed experiments.
+.Pp
+.Ss SPECIAL NIC FEATURES
+.Nm
+is orthogonal to some NIC features such as
+multiqueue, schedulers, packet filters.
+.Pp
+Multiple transmit and receive rings are supported natively
+and can be configured with ordinary OS tools,
+such as
+.Xr ethtool
+or
+device-specific sysctl variables.
+The same goes for Receive Packet Steering (RPS)
+and filtering of incoming traffic.
+.Pp
+.Nm
+.Em does not use
+features such as
+.Em checksum offloading , TCP segmentation offloading ,
+.Em encryption , VLAN encapsulation/decapsulation ,
+etc.
+When using netmap to exchange packets with the host stack,
+make sure to disable these features.
.Sh EXAMPLES
+.Ss TEST PROGRAMS
+.Nm
+comes with a few programs that can be used for testing or
+simple applications.
+See the
+.Va examples/
+directory in
+.Nm
+distributions, or the
+.Va tools/tools/netmap/
+directory in FreeBSD distributions.
+.Pp
+.Xr pkt-gen
+is a general purpose traffic source/sink.
+.Pp
+As an example
+.Dl pkt-gen -i ix0 -f tx -l 60
+can generate an infinite stream of minimum size packets, and
+.Dl pkt-gen -i ix0 -f rx
+is a traffic sink.
+Both print traffic statistics, to help monitor
+how the system performs.
+.Pp
+.Xr pkt-gen
+has many options that can be used to set packet sizes, addresses,
+rates, and to use multiple send/receive threads and cores.
+.Pp
+.Xr bridge
+is another test program which interconnects two
+.Nm
+ports. It can be used for transparent forwarding between
+interfaces, as in
+.Dl bridge -i ix0 -i ix1
+or even to connect the NIC to the host stack using netmap
+.Dl bridge -i ix0 -i ix0
+.Ss USING THE NATIVE API
The following code implements a traffic generator
.Pp
.Bd -literal -compact
-#include <net/netmap.h>
#include <net/netmap_user.h>
-struct netmap_if *nifp;
-struct netmap_ring *ring;
-struct netmap_request nmr;
+...
+void sender(void)
+{
+	struct netmap_if *nifp;
+	struct netmap_ring *ring;
+	struct nmreq nmr;
+	struct pollfd fds;
+	char *p, *buf;
+	int fd, i;

-fd = open("/dev/netmap", O_RDWR);
-bzero(&nmr, sizeof(nmr));
-strcpy(nmr.nm_name, "ix0");
-ioctl(fd, NIOCREG, &nmr);
-p = mmap(0, nmr.memsize, fd);
-nifp = NETMAP_IF(p, nmr.offset);
-ring = NETMAP_TXRING(nifp, 0);
-fds.fd = fd;
-fds.events = POLLOUT;
-for (;;) {
-    poll(list, 1, -1);
-    while (ring->avail-- > 0) {
-        i = ring->cur;
-        buf = NETMAP_BUF(ring, ring->slot[i].buf_index);
-        ... prepare packet in buf ...
-        ring->slot[i].len = ... packet length ...
-        ring->cur = NETMAP_RING_NEXT(ring, i);
+	fd = open("/dev/netmap", O_RDWR);
+	bzero(&nmr, sizeof(nmr));
+	strcpy(nmr.nr_name, "ix0");
+	nmr.nr_version = NETMAP_API;
+	ioctl(fd, NIOCREGIF, &nmr);
+	p = mmap(0, nmr.nr_memsize, PROT_READ|PROT_WRITE,
+	    MAP_SHARED, fd, 0);
+	nifp = NETMAP_IF(p, nmr.nr_offset);
+	ring = NETMAP_TXRING(nifp, 0);
+	fds.fd = fd;
+	fds.events = POLLOUT;
+	for (;;) {
+	    poll(&fds, 1, -1);
+	    while (!nm_ring_empty(ring)) {
+		i = ring->cur;
+		buf = NETMAP_BUF(ring, ring->slot[i].buf_idx);
+		... prepare packet in buf ...
+		ring->slot[i].len = ... packet length ...
+		ring->head = ring->cur = nm_ring_next(ring, i);
+	    }
	}
}
.Ed
-.Sh SUPPORTED INTERFACES
+.Ss HELPER FUNCTIONS
+A simple receiver can be implemented using the helper functions
+.Bd -literal -compact
+#define NETMAP_WITH_LIBS
+#include <net/netmap_user.h>
+...
+void receiver(void)
+{
+	struct nm_desc *d;
+	struct pollfd fds;
+	u_char *buf;
+	struct nm_pkthdr h;
+	...
+	d = nm_open("netmap:ix0", NULL, 0, 0);
+	fds.fd = NETMAP_FD(d);
+	fds.events = POLLIN;
+	for (;;) {
+	    poll(&fds, 1, -1);
+	    while ( (buf = nm_nextpkt(d, &h)) )
+		consume_pkt(buf, h.len);
+	}
+	nm_close(d);
+}
+.Ed
+.Ss ZERO-COPY FORWARDING
+Since physical interfaces share the same memory region,
+it is possible to do packet forwarding between ports
+by swapping buffers. The buffer from the transmit ring is used
+to replenish the receive ring:
+.Bd -literal -compact
+	uint32_t tmp;
+	struct netmap_slot *src, *dst;
+	...
+	src = &rxr->slot[rxr->cur];
+	dst = &txr->slot[txr->cur];
+	tmp = dst->buf_idx;
+	dst->buf_idx = src->buf_idx;
+	dst->len = src->len;
+	dst->flags = NS_BUF_CHANGED;
+	src->buf_idx = tmp;
+	src->flags = NS_BUF_CHANGED;
+	rxr->head = rxr->cur = nm_ring_next(rxr, rxr->cur);
+	txr->head = txr->cur = nm_ring_next(txr, txr->cur);
+	...
+.Ed
+.Ss ACCESSING THE HOST STACK
+The host stack is for all practical purposes just a regular ring pair,
+which you can access with the netmap API (e.g. with
+.Dl nm_open("netmap:eth0^", ...);
+All packets that the host would send to an interface in
.Nm
-supports the following interfaces:
-.Xr em 4 ,
-.Xr ixgbe 4 ,
-.Xr re 4 ,
+mode end up in the RX ring, whereas all packets queued to the
+TX ring are sent up to the host stack.
+.Ss VALE SWITCH
+A simple way to test the performance of a
+.Nm VALE
+switch is to attach a sender and a receiver to it,
+e.g. running the following in two different terminals:
+.Dl pkt-gen -i vale1:a -f rx # receiver
+.Dl pkt-gen -i vale1:b -f tx # sender
+The same example can be used to test netmap pipes, by simply
+changing port names, e.g.
+.Dl pkt-gen -i vale:x{3 -f rx # receiver on the master side
+.Dl pkt-gen -i vale:x}3 -f tx # sender on the slave side
+.Pp
+The following command attaches an interface and the host stack
+to a switch:
+.Dl vale-ctl -h vale2:em0
+Other
+.Nm
+clients attached to the same switch can now communicate
+with the network card or the host.
+.Pp
+.Sh SEE ALSO
+.Pp
+http://info.iet.unipi.it/~luigi/netmap/
+.Pp
+Luigi Rizzo, Revisiting network I/O APIs: the netmap framework,
+Communications of the ACM, 55 (3), pp.45-51, March 2012
+.Pp
+Luigi Rizzo, netmap: a novel framework for fast packet I/O,
+Usenix ATC'12, June 2012, Boston
+.Pp
+Luigi Rizzo, Giuseppe Lettieri,
+VALE, a switched ethernet for virtual machines,
+ACM CoNEXT'12, December 2012, Nice
+.Pp
+Luigi Rizzo, Giuseppe Lettieri, Vincenzo Maffione,
+Speeding up packet I/O in virtual machines,
+ACM/IEEE ANCS'13, October 2013, San Jose
.Sh AUTHORS
+.An -nosplit
The
.Nm
-framework has been designed and implemented by
-.An Luigi Rizzo
+framework was originally designed and implemented at the
+Universita` di Pisa in 2011 by
+.An Luigi Rizzo ,
+and further extended with help from
+.An Matteo Landi ,
+.An Gaetano Catalli ,
+.An Giuseppe Lettieri ,
+.An Vincenzo Maffione .
+.Pp
+.Nm
and
-.An Matteo Landi
-in 2011 at the Universita` di Pisa.
+.Nm VALE
+have been funded by the European Commission within FP7 Projects
+CHANGE (257422) and OPENLAB (287581).
Index: stable/9/sys/conf/files
===================================================================
--- stable/9/sys/conf/files	(revision 262152)
+++ stable/9/sys/conf/files	(revision 262153)
@@ -1,3639 +1,3645 @@
# $FreeBSD$
#
# The long compile-with and dependency lines are required because of
# limitations in config: backslash-newline doesn't work in strings, and
# dependency lines other than the first are silently ignored.
# acpi_quirks.h optional acpi \ dependency "$S/tools/acpi_quirks2h.awk $S/dev/acpica/acpi_quirks" \ compile-with "${AWK} -f $S/tools/acpi_quirks2h.awk $S/dev/acpica/acpi_quirks" \ no-obj no-implicit-rule before-depend \ clean "acpi_quirks.h" aicasm optional ahc | ahd \ dependency "$S/dev/aic7xxx/aicasm/*.[chyl]" \ compile-with "CC='${CC}' ${MAKE} -f $S/dev/aic7xxx/aicasm/Makefile MAKESRCPATH=$S/dev/aic7xxx/aicasm" \ no-obj no-implicit-rule \ clean "aicasm* y.tab.h" aic7xxx_seq.h optional ahc \ compile-with "./aicasm ${INCLUDES} -I$S/cam/scsi -I$S/dev/aic7xxx -o aic7xxx_seq.h -r aic7xxx_reg.h -p aic7xxx_reg_print.c -i $S/dev/aic7xxx/aic7xxx_osm.h $S/dev/aic7xxx/aic7xxx.seq" \ no-obj no-implicit-rule before-depend local \ clean "aic7xxx_seq.h" \ dependency "$S/dev/aic7xxx/aic7xxx.{reg,seq} $S/cam/scsi/scsi_message.h aicasm" aic7xxx_reg.h optional ahc \ compile-with "./aicasm ${INCLUDES} -I$S/cam/scsi -I$S/dev/aic7xxx -o aic7xxx_seq.h -r aic7xxx_reg.h -p aic7xxx_reg_print.c -i $S/dev/aic7xxx/aic7xxx_osm.h $S/dev/aic7xxx/aic7xxx.seq" \ no-obj no-implicit-rule before-depend local \ clean "aic7xxx_reg.h" \ dependency "$S/dev/aic7xxx/aic7xxx.{reg,seq} $S/cam/scsi/scsi_message.h aicasm" aic7xxx_reg_print.c optional ahc \ compile-with "./aicasm ${INCLUDES} -I$S/cam/scsi -I$S/dev/aic7xxx -o aic7xxx_seq.h -r aic7xxx_reg.h -p aic7xxx_reg_print.c -i $S/dev/aic7xxx/aic7xxx_osm.h $S/dev/aic7xxx/aic7xxx.seq" \ no-obj no-implicit-rule local \ clean "aic7xxx_reg_print.c" \ dependency "$S/dev/aic7xxx/aic7xxx.{reg,seq} $S/cam/scsi/scsi_message.h aicasm" aic7xxx_reg_print.o optional ahc ahc_reg_pretty_print \ compile-with "${NORMAL_C}" \ no-implicit-rule local aic79xx_seq.h optional ahd pci \ compile-with "./aicasm ${INCLUDES} -I$S/cam/scsi -I$S/dev/aic7xxx -o aic79xx_seq.h -r aic79xx_reg.h -p aic79xx_reg_print.c -i $S/dev/aic7xxx/aic79xx_osm.h $S/dev/aic7xxx/aic79xx.seq" \ no-obj no-implicit-rule before-depend local \ clean "aic79xx_seq.h" \ dependency "$S/dev/aic7xxx/aic79xx.{reg,seq} $S/cam/scsi/scsi_message.h aicasm" aic79xx_reg.h optional ahd pci \ compile-with "./aicasm ${INCLUDES} -I$S/cam/scsi -I$S/dev/aic7xxx -o aic79xx_seq.h -r aic79xx_reg.h -p aic79xx_reg_print.c -i $S/dev/aic7xxx/aic79xx_osm.h $S/dev/aic7xxx/aic79xx.seq" \ no-obj no-implicit-rule before-depend local \ clean "aic79xx_reg.h" \ dependency "$S/dev/aic7xxx/aic79xx.{reg,seq} $S/cam/scsi/scsi_message.h aicasm" aic79xx_reg_print.c optional ahd pci \ compile-with "./aicasm ${INCLUDES} -I$S/cam/scsi -I$S/dev/aic7xxx -o aic79xx_seq.h -r aic79xx_reg.h -p aic79xx_reg_print.c -i $S/dev/aic7xxx/aic79xx_osm.h $S/dev/aic7xxx/aic79xx.seq" \ no-obj no-implicit-rule local \ clean "aic79xx_reg_print.c" \ dependency "$S/dev/aic7xxx/aic79xx.{reg,seq} $S/cam/scsi/scsi_message.h aicasm" aic79xx_reg_print.o optional ahd pci ahd_reg_pretty_print \ compile-with "${NORMAL_C}" \ no-implicit-rule local # # The 'fdt_dtb_file' target covers an actual DTB file name, which is derived # from the specified source (DTS) file: .dts -> .dtb # fdt_dtb_file optional fdt \ compile-with "if [ -f $S/boot/fdt/dts/${FDT_DTS_FILE} ]; then dtc -O dtb -o `echo ${FDT_DTS_FILE} | cut -d. -f1`.dtb -b 0 -p 1024 $S/boot/fdt/dts/${FDT_DTS_FILE}; fi" \ no-obj no-implicit-rule before-depend \ clean "`echo ${FDT_DTS_FILE} | cut -d. -f1`.dtb" fdt_static_dtb.h optional fdt fdt_dtb_static \ compile-with "sh $S/tools/fdt/make_dtbh.sh ${FDT_DTS_FILE} ." 
\ no-obj no-implicit-rule before-depend \ clean "fdt_static_dtb.h" feeder_eq_gen.h optional sound \ dependency "$S/tools/sound/feeder_eq_mkfilter.awk" \ compile-with "${AWK} -f $S/tools/sound/feeder_eq_mkfilter.awk -- ${FEEDER_EQ_PRESETS} > feeder_eq_gen.h" \ no-obj no-implicit-rule before-depend \ clean "feeder_eq_gen.h" feeder_rate_gen.h optional sound \ dependency "$S/tools/sound/feeder_rate_mkfilter.awk" \ compile-with "${AWK} -f $S/tools/sound/feeder_rate_mkfilter.awk -- ${FEEDER_RATE_PRESETS} > feeder_rate_gen.h" \ no-obj no-implicit-rule before-depend \ clean "feeder_rate_gen.h" snd_fxdiv_gen.h optional sound \ dependency "$S/tools/sound/snd_fxdiv_gen.awk" \ compile-with "${AWK} -f $S/tools/sound/snd_fxdiv_gen.awk -- > snd_fxdiv_gen.h" \ no-obj no-implicit-rule before-depend \ clean "snd_fxdiv_gen.h" miidevs.h optional miibus | mii \ dependency "$S/tools/miidevs2h.awk $S/dev/mii/miidevs" \ compile-with "${AWK} -f $S/tools/miidevs2h.awk $S/dev/mii/miidevs" \ no-obj no-implicit-rule before-depend \ clean "miidevs.h" pccarddevs.h standard \ dependency "$S/tools/pccarddevs2h.awk $S/dev/pccard/pccarddevs" \ compile-with "${AWK} -f $S/tools/pccarddevs2h.awk $S/dev/pccard/pccarddevs" \ no-obj no-implicit-rule before-depend \ clean "pccarddevs.h" teken_state.h optional sc \ dependency "$S/teken/gensequences $S/teken/sequences" \ compile-with "${AWK} -f $S/teken/gensequences $S/teken/sequences > teken_state.h" \ no-obj no-implicit-rule before-depend \ clean "teken_state.h" usbdevs.h optional usb \ dependency "$S/tools/usbdevs2h.awk $S/dev/usb/usbdevs" \ compile-with "${AWK} -f $S/tools/usbdevs2h.awk $S/dev/usb/usbdevs -h" \ no-obj no-implicit-rule before-depend \ clean "usbdevs.h" usbdevs_data.h optional usb \ dependency "$S/tools/usbdevs2h.awk $S/dev/usb/usbdevs" \ compile-with "${AWK} -f $S/tools/usbdevs2h.awk $S/dev/usb/usbdevs -d" \ no-obj no-implicit-rule before-depend \ clean "usbdevs_data.h" cam/cam.c optional scbus cam/cam_compat.c optional scbus cam/cam_periph.c optional scbus cam/cam_queue.c optional scbus cam/cam_sim.c optional scbus cam/cam_xpt.c optional scbus cam/ata/ata_all.c optional scbus cam/ata/ata_xpt.c optional scbus cam/ata/ata_pmp.c optional scbus cam/scsi/scsi_xpt.c optional scbus cam/scsi/scsi_all.c optional scbus cam/scsi/scsi_cd.c optional cd cam/scsi/scsi_ch.c optional ch cam/ata/ata_da.c optional ada | da cam/ctl/ctl.c optional ctl cam/ctl/ctl_backend.c optional ctl cam/ctl/ctl_backend_block.c optional ctl cam/ctl/ctl_backend_ramdisk.c optional ctl cam/ctl/ctl_cmd_table.c optional ctl cam/ctl/ctl_frontend.c optional ctl cam/ctl/ctl_frontend_cam_sim.c optional ctl cam/ctl/ctl_frontend_internal.c optional ctl cam/ctl/ctl_mem_pool.c optional ctl cam/ctl/ctl_scsi_all.c optional ctl cam/ctl/ctl_error.c optional ctl cam/ctl/ctl_util.c optional ctl cam/ctl/scsi_ctl.c optional ctl cam/scsi/scsi_da.c optional da cam/scsi/scsi_low.c optional ct | ncv | nsp | stg cam/scsi/scsi_low_pisa.c optional ct | ncv | nsp | stg cam/scsi/scsi_pass.c optional pass cam/scsi/scsi_pt.c optional pt cam/scsi/scsi_sa.c optional sa cam/scsi/scsi_enc.c optional ses cam/scsi/scsi_enc_ses.c optional ses cam/scsi/scsi_enc_safte.c optional ses cam/scsi/scsi_sg.c optional sg cam/scsi/scsi_targ_bh.c optional targbh cam/scsi/scsi_target.c optional targ cam/scsi/smp_all.c optional scbus contrib/altq/altq/altq_cbq.c optional altq \ compile-with "${NORMAL_C} -I$S/contrib/pf" contrib/altq/altq/altq_cdnr.c optional altq contrib/altq/altq/altq_hfsc.c optional altq \ compile-with "${NORMAL_C} 
-I$S/contrib/pf" contrib/altq/altq/altq_priq.c optional altq \ compile-with "${NORMAL_C} -I$S/contrib/pf" contrib/altq/altq/altq_red.c optional altq \ compile-with "${NORMAL_C} -I$S/contrib/pf" contrib/altq/altq/altq_rio.c optional altq \ compile-with "${NORMAL_C} -I$S/contrib/pf" contrib/altq/altq/altq_rmclass.c optional altq contrib/altq/altq/altq_subr.c optional altq \ compile-with "${NORMAL_C} -I$S/contrib/pf" contrib/dev/acpica/debugger/dbcmds.c optional acpi acpi_debug contrib/dev/acpica/debugger/dbdisply.c optional acpi acpi_debug contrib/dev/acpica/debugger/dbexec.c optional acpi acpi_debug contrib/dev/acpica/debugger/dbfileio.c optional acpi acpi_debug contrib/dev/acpica/debugger/dbhistry.c optional acpi acpi_debug contrib/dev/acpica/debugger/dbinput.c optional acpi acpi_debug contrib/dev/acpica/debugger/dbmethod.c optional acpi acpi_debug contrib/dev/acpica/debugger/dbnames.c optional acpi acpi_debug contrib/dev/acpica/debugger/dbstats.c optional acpi acpi_debug contrib/dev/acpica/debugger/dbutils.c optional acpi acpi_debug contrib/dev/acpica/debugger/dbxface.c optional acpi acpi_debug contrib/dev/acpica/disassembler/dmbuffer.c optional acpi acpi_debug contrib/dev/acpica/disassembler/dmnames.c optional acpi acpi_debug contrib/dev/acpica/disassembler/dmopcode.c optional acpi acpi_debug contrib/dev/acpica/disassembler/dmobject.c optional acpi acpi_debug contrib/dev/acpica/disassembler/dmresrc.c optional acpi acpi_debug contrib/dev/acpica/disassembler/dmresrcl.c optional acpi acpi_debug contrib/dev/acpica/disassembler/dmresrcs.c optional acpi acpi_debug contrib/dev/acpica/disassembler/dmutils.c optional acpi acpi_debug contrib/dev/acpica/disassembler/dmwalk.c optional acpi acpi_debug contrib/dev/acpica/dispatcher/dsargs.c optional acpi contrib/dev/acpica/dispatcher/dscontrol.c optional acpi contrib/dev/acpica/dispatcher/dsfield.c optional acpi contrib/dev/acpica/dispatcher/dsinit.c optional acpi contrib/dev/acpica/dispatcher/dsmethod.c optional acpi contrib/dev/acpica/dispatcher/dsmthdat.c optional acpi contrib/dev/acpica/dispatcher/dsobject.c optional acpi contrib/dev/acpica/dispatcher/dsopcode.c optional acpi contrib/dev/acpica/dispatcher/dsutils.c optional acpi contrib/dev/acpica/dispatcher/dswexec.c optional acpi contrib/dev/acpica/dispatcher/dswload.c optional acpi contrib/dev/acpica/dispatcher/dswload2.c optional acpi contrib/dev/acpica/dispatcher/dswscope.c optional acpi contrib/dev/acpica/dispatcher/dswstate.c optional acpi contrib/dev/acpica/events/evevent.c optional acpi contrib/dev/acpica/events/evglock.c optional acpi contrib/dev/acpica/events/evgpe.c optional acpi contrib/dev/acpica/events/evgpeblk.c optional acpi contrib/dev/acpica/events/evgpeinit.c optional acpi contrib/dev/acpica/events/evgpeutil.c optional acpi contrib/dev/acpica/events/evmisc.c optional acpi contrib/dev/acpica/events/evregion.c optional acpi contrib/dev/acpica/events/evrgnini.c optional acpi contrib/dev/acpica/events/evsci.c optional acpi contrib/dev/acpica/events/evxface.c optional acpi contrib/dev/acpica/events/evxfevnt.c optional acpi contrib/dev/acpica/events/evxfgpe.c optional acpi contrib/dev/acpica/events/evxfregn.c optional acpi contrib/dev/acpica/executer/exconfig.c optional acpi contrib/dev/acpica/executer/exconvrt.c optional acpi contrib/dev/acpica/executer/excreate.c optional acpi contrib/dev/acpica/executer/exdebug.c optional acpi contrib/dev/acpica/executer/exdump.c optional acpi contrib/dev/acpica/executer/exfield.c optional acpi contrib/dev/acpica/executer/exfldio.c optional acpi 
contrib/dev/acpica/executer/exmisc.c optional acpi contrib/dev/acpica/executer/exmutex.c optional acpi contrib/dev/acpica/executer/exnames.c optional acpi contrib/dev/acpica/executer/exoparg1.c optional acpi contrib/dev/acpica/executer/exoparg2.c optional acpi contrib/dev/acpica/executer/exoparg3.c optional acpi contrib/dev/acpica/executer/exoparg6.c optional acpi contrib/dev/acpica/executer/exprep.c optional acpi contrib/dev/acpica/executer/exregion.c optional acpi contrib/dev/acpica/executer/exresnte.c optional acpi contrib/dev/acpica/executer/exresolv.c optional acpi contrib/dev/acpica/executer/exresop.c optional acpi contrib/dev/acpica/executer/exstore.c optional acpi contrib/dev/acpica/executer/exstoren.c optional acpi contrib/dev/acpica/executer/exstorob.c optional acpi contrib/dev/acpica/executer/exsystem.c optional acpi contrib/dev/acpica/executer/exutils.c optional acpi contrib/dev/acpica/hardware/hwacpi.c optional acpi contrib/dev/acpica/hardware/hwgpe.c optional acpi contrib/dev/acpica/hardware/hwpci.c optional acpi contrib/dev/acpica/hardware/hwregs.c optional acpi contrib/dev/acpica/hardware/hwsleep.c optional acpi contrib/dev/acpica/hardware/hwtimer.c optional acpi contrib/dev/acpica/hardware/hwvalid.c optional acpi contrib/dev/acpica/hardware/hwxface.c optional acpi contrib/dev/acpica/namespace/nsaccess.c optional acpi contrib/dev/acpica/namespace/nsalloc.c optional acpi contrib/dev/acpica/namespace/nsdump.c optional acpi contrib/dev/acpica/namespace/nseval.c optional acpi contrib/dev/acpica/namespace/nsinit.c optional acpi contrib/dev/acpica/namespace/nsload.c optional acpi contrib/dev/acpica/namespace/nsnames.c optional acpi contrib/dev/acpica/namespace/nsobject.c optional acpi contrib/dev/acpica/namespace/nsparse.c optional acpi contrib/dev/acpica/namespace/nspredef.c optional acpi contrib/dev/acpica/namespace/nsrepair.c optional acpi contrib/dev/acpica/namespace/nsrepair2.c optional acpi contrib/dev/acpica/namespace/nssearch.c optional acpi contrib/dev/acpica/namespace/nsutils.c optional acpi contrib/dev/acpica/namespace/nswalk.c optional acpi contrib/dev/acpica/namespace/nsxfeval.c optional acpi contrib/dev/acpica/namespace/nsxfname.c optional acpi contrib/dev/acpica/namespace/nsxfobj.c optional acpi contrib/dev/acpica/parser/psargs.c optional acpi contrib/dev/acpica/parser/psloop.c optional acpi contrib/dev/acpica/parser/psopcode.c optional acpi contrib/dev/acpica/parser/psparse.c optional acpi contrib/dev/acpica/parser/psscope.c optional acpi contrib/dev/acpica/parser/pstree.c optional acpi contrib/dev/acpica/parser/psutils.c optional acpi contrib/dev/acpica/parser/pswalk.c optional acpi contrib/dev/acpica/parser/psxface.c optional acpi contrib/dev/acpica/resources/rsaddr.c optional acpi contrib/dev/acpica/resources/rscalc.c optional acpi contrib/dev/acpica/resources/rscreate.c optional acpi contrib/dev/acpica/resources/rsdump.c optional acpi contrib/dev/acpica/resources/rsinfo.c optional acpi contrib/dev/acpica/resources/rsio.c optional acpi contrib/dev/acpica/resources/rsirq.c optional acpi contrib/dev/acpica/resources/rslist.c optional acpi contrib/dev/acpica/resources/rsmemory.c optional acpi contrib/dev/acpica/resources/rsmisc.c optional acpi contrib/dev/acpica/resources/rsutils.c optional acpi contrib/dev/acpica/resources/rsxface.c optional acpi contrib/dev/acpica/tables/tbfadt.c optional acpi contrib/dev/acpica/tables/tbfind.c optional acpi contrib/dev/acpica/tables/tbinstal.c optional acpi contrib/dev/acpica/tables/tbutils.c optional acpi 
contrib/dev/acpica/tables/tbxface.c	optional acpi
contrib/dev/acpica/tables/tbxfroot.c	optional acpi
contrib/dev/acpica/utilities/utalloc.c	optional acpi
contrib/dev/acpica/utilities/utcache.c	optional acpi
contrib/dev/acpica/utilities/utcopy.c	optional acpi
contrib/dev/acpica/utilities/utdebug.c	optional acpi
contrib/dev/acpica/utilities/utdecode.c	optional acpi
contrib/dev/acpica/utilities/utdelete.c	optional acpi
contrib/dev/acpica/utilities/uteval.c	optional acpi
contrib/dev/acpica/utilities/utglobal.c	optional acpi
contrib/dev/acpica/utilities/utids.c	optional acpi
contrib/dev/acpica/utilities/utinit.c	optional acpi
contrib/dev/acpica/utilities/utlock.c	optional acpi
contrib/dev/acpica/utilities/utmath.c	optional acpi
contrib/dev/acpica/utilities/utmisc.c	optional acpi
contrib/dev/acpica/utilities/utmutex.c	optional acpi
contrib/dev/acpica/utilities/utobject.c	optional acpi
contrib/dev/acpica/utilities/utosi.c	optional acpi
contrib/dev/acpica/utilities/utresrc.c	optional acpi
contrib/dev/acpica/utilities/utstate.c	optional acpi
contrib/dev/acpica/utilities/utxface.c	optional acpi
contrib/dev/acpica/utilities/utxferror.c	optional acpi
contrib/ipfilter/netinet/fil.c	optional ipfilter inet \
	compile-with "${NORMAL_C} ${NO_WSELF_ASSIGN} -I$S/contrib/ipfilter"
contrib/ipfilter/netinet/ip_auth.c	optional ipfilter inet \
	compile-with "${NORMAL_C} -I$S/contrib/ipfilter"
contrib/ipfilter/netinet/ip_fil_freebsd.c	optional ipfilter inet \
	compile-with "${NORMAL_C} -I$S/contrib/ipfilter"
contrib/ipfilter/netinet/ip_frag.c	optional ipfilter inet \
	compile-with "${NORMAL_C} -I$S/contrib/ipfilter"
contrib/ipfilter/netinet/ip_log.c	optional ipfilter inet \
	compile-with "${NORMAL_C} -I$S/contrib/ipfilter"
contrib/ipfilter/netinet/ip_nat.c	optional ipfilter inet \
	compile-with "${NORMAL_C} -I$S/contrib/ipfilter"
contrib/ipfilter/netinet/ip_proxy.c	optional ipfilter inet \
	compile-with "${NORMAL_C} ${NO_WSELF_ASSIGN} -I$S/contrib/ipfilter"
contrib/ipfilter/netinet/ip_state.c	optional ipfilter inet \
	compile-with "${NORMAL_C} -I$S/contrib/ipfilter"
contrib/ipfilter/netinet/ip_lookup.c	optional ipfilter inet \
	compile-with "${NORMAL_C} ${NO_WSELF_ASSIGN} -Wno-error -I$S/contrib/ipfilter"
contrib/ipfilter/netinet/ip_pool.c	optional ipfilter inet \
	compile-with "${NORMAL_C} -I$S/contrib/ipfilter"
contrib/ipfilter/netinet/ip_htable.c	optional ipfilter inet \
	compile-with "${NORMAL_C} -I$S/contrib/ipfilter"
contrib/ipfilter/netinet/ip_sync.c	optional ipfilter inet \
	compile-with "${NORMAL_C} -I$S/contrib/ipfilter"
contrib/ipfilter/netinet/mlfk_ipl.c	optional ipfilter inet \
	compile-with "${NORMAL_C} -I$S/contrib/ipfilter"
contrib/libfdt/fdt.c	optional fdt
contrib/libfdt/fdt_ro.c	optional fdt
contrib/libfdt/fdt_rw.c	optional fdt
contrib/libfdt/fdt_strerror.c	optional fdt
contrib/libfdt/fdt_sw.c	optional fdt
contrib/libfdt/fdt_wip.c	optional fdt
contrib/ngatm/netnatm/api/cc_conn.c	optional ngatm_ccatm \
	compile-with "${NORMAL_C_NOWERROR} -I$S/contrib/ngatm"
contrib/ngatm/netnatm/api/cc_data.c	optional ngatm_ccatm \
	compile-with "${NORMAL_C} -I$S/contrib/ngatm"
contrib/ngatm/netnatm/api/cc_dump.c	optional ngatm_ccatm \
	compile-with "${NORMAL_C} -I$S/contrib/ngatm"
contrib/ngatm/netnatm/api/cc_port.c	optional ngatm_ccatm \
	compile-with "${NORMAL_C} -I$S/contrib/ngatm"
contrib/ngatm/netnatm/api/cc_sig.c	optional ngatm_ccatm \
	compile-with "${NORMAL_C} -I$S/contrib/ngatm"
contrib/ngatm/netnatm/api/cc_user.c	optional ngatm_ccatm \
	compile-with "${NORMAL_C} -I$S/contrib/ngatm"
contrib/ngatm/netnatm/api/unisap.c	optional ngatm_ccatm \
	compile-with "${NORMAL_C} -I$S/contrib/ngatm"
contrib/ngatm/netnatm/misc/straddr.c	optional ngatm_atmbase \
	compile-with "${NORMAL_C} -I$S/contrib/ngatm"
contrib/ngatm/netnatm/misc/unimsg_common.c	optional ngatm_atmbase \
	compile-with "${NORMAL_C} -I$S/contrib/ngatm"
contrib/ngatm/netnatm/msg/traffic.c	optional ngatm_atmbase \
	compile-with "${NORMAL_C} -I$S/contrib/ngatm"
contrib/ngatm/netnatm/msg/uni_ie.c	optional ngatm_atmbase \
	compile-with "${NORMAL_C} -I$S/contrib/ngatm"
contrib/ngatm/netnatm/msg/uni_msg.c	optional ngatm_atmbase \
	compile-with "${NORMAL_C} -I$S/contrib/ngatm"
contrib/ngatm/netnatm/saal/saal_sscfu.c	optional ngatm_sscfu \
	compile-with "${NORMAL_C} -I$S/contrib/ngatm"
contrib/ngatm/netnatm/saal/saal_sscop.c	optional ngatm_sscop \
	compile-with "${NORMAL_C} -I$S/contrib/ngatm"
contrib/ngatm/netnatm/sig/sig_call.c	optional ngatm_uni \
	compile-with "${NORMAL_C} -I$S/contrib/ngatm"
contrib/ngatm/netnatm/sig/sig_coord.c	optional ngatm_uni \
	compile-with "${NORMAL_C} -I$S/contrib/ngatm"
contrib/ngatm/netnatm/sig/sig_party.c	optional ngatm_uni \
	compile-with "${NORMAL_C} -I$S/contrib/ngatm"
contrib/ngatm/netnatm/sig/sig_print.c	optional ngatm_uni \
	compile-with "${NORMAL_C} -I$S/contrib/ngatm"
contrib/ngatm/netnatm/sig/sig_reset.c	optional ngatm_uni \
	compile-with "${NORMAL_C} -I$S/contrib/ngatm"
contrib/ngatm/netnatm/sig/sig_uni.c	optional ngatm_uni \
	compile-with "${NORMAL_C} -I$S/contrib/ngatm"
contrib/ngatm/netnatm/sig/sig_unimsgcpy.c	optional ngatm_uni \
	compile-with "${NORMAL_C} -I$S/contrib/ngatm"
contrib/ngatm/netnatm/sig/sig_verify.c	optional ngatm_uni \
	compile-with "${NORMAL_C} -I$S/contrib/ngatm"
contrib/pf/net/if_pflog.c	optional pflog pf inet \
	compile-with "${NORMAL_C} -I$S/contrib/pf"
contrib/pf/net/if_pfsync.c	optional pfsync pf inet \
	compile-with "${NORMAL_C} -I$S/contrib/pf"
contrib/pf/net/pf.c	optional pf inet \
	compile-with "${NORMAL_C} -I$S/contrib/pf"
contrib/pf/net/pf_if.c	optional pf inet \
	compile-with "${NORMAL_C} -I$S/contrib/pf"
contrib/pf/net/pf_ioctl.c	optional pf inet \
	compile-with "${NORMAL_C} -I$S/contrib/pf"
contrib/pf/net/pf_lb.c	optional pf inet \
	compile-with "${NORMAL_C} -I$S/contrib/pf"
contrib/pf/net/pf_norm.c	optional pf inet \
	compile-with "${NORMAL_C} -I$S/contrib/pf"
contrib/pf/net/pf_osfp.c	optional pf inet \
	compile-with "${NORMAL_C} -I$S/contrib/pf"
contrib/pf/net/pf_ruleset.c	optional pf inet \
	compile-with "${NORMAL_C} -I$S/contrib/pf"
contrib/pf/net/pf_table.c	optional pf inet \
	compile-with "${NORMAL_C} -I$S/contrib/pf"
contrib/pf/netinet/in4_cksum.c	optional pf inet
crypto/blowfish/bf_ecb.c	optional ipsec
crypto/blowfish/bf_skey.c	optional crypto | ipsec
crypto/camellia/camellia.c	optional crypto | ipsec
crypto/camellia/camellia-api.c	optional crypto | ipsec
crypto/des/des_ecb.c	optional crypto | ipsec | netsmb
crypto/des/des_setkey.c	optional crypto | ipsec | netsmb
crypto/rc4/rc4.c	optional netgraph_mppc_encryption | kgssapi
crypto/rijndael/rijndael-alg-fst.c	optional crypto | geom_bde | \
	ipsec | random | wlan_ccmp
crypto/rijndael/rijndael-api-fst.c	optional geom_bde | random
crypto/rijndael/rijndael-api.c	optional crypto | ipsec | wlan_ccmp
crypto/sha1.c	optional carp | crypto | ipsec | \
	netgraph_mppc_encryption | sctp
crypto/sha2/sha2.c	optional crypto | geom_bde | ipsec | random | \
	sctp
ddb/db_access.c	optional ddb
ddb/db_break.c	optional ddb
ddb/db_capture.c	optional ddb
ddb/db_command.c	optional ddb
ddb/db_examine.c	optional ddb
ddb/db_expr.c	optional ddb
ddb/db_input.c	optional ddb
ddb/db_lex.c	optional ddb
ddb/db_main.c	optional ddb
ddb/db_output.c	optional ddb
ddb/db_print.c	optional ddb
ddb/db_ps.c	optional ddb
ddb/db_run.c	optional ddb
ddb/db_script.c	optional ddb
ddb/db_sym.c	optional ddb
ddb/db_thread.c	optional ddb
ddb/db_textdump.c	optional ddb
ddb/db_variables.c	optional ddb
ddb/db_watch.c	optional ddb
ddb/db_write_cmd.c	optional ddb
#dev/dpt/dpt_control.c	optional dpt
dev/aac/aac.c	optional aac
dev/aac/aac_cam.c	optional aacp aac
dev/aac/aac_debug.c	optional aac
dev/aac/aac_disk.c	optional aac
dev/aac/aac_linux.c	optional aac compat_linux
dev/aac/aac_pci.c	optional aac pci
dev/aacraid/aacraid.c	optional aacraid
dev/aacraid/aacraid_cam.c	optional aacraid scbus
dev/aacraid/aacraid_debug.c	optional aacraid
dev/aacraid/aacraid_linux.c	optional aacraid compat_linux
dev/aacraid/aacraid_pci.c	optional aacraid pci
dev/acpi_support/acpi_wmi.c	optional acpi_wmi acpi
dev/acpi_support/acpi_asus.c	optional acpi_asus acpi
dev/acpi_support/acpi_fujitsu.c	optional acpi_fujitsu acpi
dev/acpi_support/acpi_hp.c	optional acpi_hp acpi
dev/acpi_support/acpi_ibm.c	optional acpi_ibm acpi
dev/acpi_support/acpi_panasonic.c	optional acpi_panasonic acpi
dev/acpi_support/acpi_sony.c	optional acpi_sony acpi
dev/acpi_support/acpi_toshiba.c	optional acpi_toshiba acpi
dev/acpi_support/atk0110.c	optional aibs acpi
dev/acpica/Osd/OsdDebug.c	optional acpi
dev/acpica/Osd/OsdHardware.c	optional acpi
dev/acpica/Osd/OsdInterrupt.c	optional acpi
dev/acpica/Osd/OsdMemory.c	optional acpi
dev/acpica/Osd/OsdSchedule.c	optional acpi
dev/acpica/Osd/OsdStream.c	optional acpi
dev/acpica/Osd/OsdSynch.c	optional acpi
dev/acpica/Osd/OsdTable.c	optional acpi
dev/acpica/acpi.c	optional acpi
dev/acpica/acpi_acad.c	optional acpi
dev/acpica/acpi_battery.c	optional acpi
dev/acpica/acpi_button.c	optional acpi
dev/acpica/acpi_cmbat.c	optional acpi
dev/acpica/acpi_cpu.c	optional acpi
dev/acpica/acpi_ec.c	optional acpi
dev/acpica/acpi_hpet.c	optional acpi
dev/acpica/acpi_isab.c	optional acpi isa
dev/acpica/acpi_lid.c	optional acpi
dev/acpica/acpi_package.c	optional acpi
dev/acpica/acpi_pci.c	optional acpi pci
dev/acpica/acpi_pci_link.c	optional acpi pci
dev/acpica/acpi_pcib.c	optional acpi pci
dev/acpica/acpi_pcib_acpi.c	optional acpi pci
dev/acpica/acpi_pcib_pci.c	optional acpi pci
dev/acpica/acpi_perf.c	optional acpi
dev/acpica/acpi_powerres.c	optional acpi
dev/acpica/acpi_quirk.c	optional acpi
dev/acpica/acpi_resource.c	optional acpi
dev/acpica/acpi_smbat.c	optional acpi
dev/acpica/acpi_thermal.c	optional acpi
dev/acpica/acpi_throttle.c	optional acpi
dev/acpica/acpi_timer.c	optional acpi
dev/acpica/acpi_video.c	optional acpi_video acpi
dev/acpica/acpi_dock.c	optional acpi_dock acpi
dev/adlink/adlink.c	optional adlink
dev/advansys/adv_eisa.c	optional adv eisa
dev/advansys/adv_pci.c	optional adv pci
dev/advansys/advansys.c	optional adv
dev/advansys/advlib.c	optional adv
dev/advansys/advmcode.c	optional adv
dev/advansys/adw_pci.c	optional adw pci
dev/advansys/adwcam.c	optional adw
dev/advansys/adwlib.c	optional adw
dev/advansys/adwmcode.c	optional adw
dev/ae/if_ae.c	optional ae pci
dev/age/if_age.c	optional age pci
dev/agp/agp.c	optional agp pci
dev/agp/agp_if.m	optional agp pci
dev/aha/aha.c	optional aha
dev/aha/aha_isa.c	optional aha isa
dev/aha/aha_mca.c	optional aha mca
dev/ahb/ahb.c	optional ahb eisa
dev/ahci/ahci.c	optional ahci pci
dev/aic/aic.c	optional aic
dev/aic/aic_pccard.c	optional aic pccard
dev/aic7xxx/ahc_eisa.c	optional ahc eisa
dev/aic7xxx/ahc_isa.c	optional ahc isa
dev/aic7xxx/ahc_pci.c	optional ahc pci \
	compile-with "${NORMAL_C} ${NO_WCONSTANT_CONVERSION}"
dev/aic7xxx/ahd_pci.c	optional ahd pci \
	compile-with "${NORMAL_C} ${NO_WCONSTANT_CONVERSION}"
dev/aic7xxx/aic7770.c	optional ahc
dev/aic7xxx/aic79xx.c	optional ahd pci
dev/aic7xxx/aic79xx_osm.c	optional ahd pci
dev/aic7xxx/aic79xx_pci.c	optional ahd pci
dev/aic7xxx/aic7xxx.c	optional ahc
dev/aic7xxx/aic7xxx_93cx6.c	optional ahc
dev/aic7xxx/aic7xxx_osm.c	optional ahc
dev/aic7xxx/aic7xxx_pci.c	optional ahc pci
dev/alc/if_alc.c	optional alc pci
dev/ale/if_ale.c	optional ale pci
dev/amd/amd.c	optional amd
dev/amr/amr.c	optional amr
dev/amr/amr_cam.c	optional amrp amr
dev/amr/amr_disk.c	optional amr
dev/amr/amr_linux.c	optional amr compat_linux
dev/amr/amr_pci.c	optional amr pci
dev/an/if_an.c	optional an
dev/an/if_an_isa.c	optional an isa
dev/an/if_an_pccard.c	optional an pccard
dev/an/if_an_pci.c	optional an pci
dev/asr/asr.c	optional asr pci \
	compile-with "${NORMAL_C} ${NO_WARRAY_BOUNDS}"
#
dev/ata/ata_if.m	optional ata | atacore
dev/ata/ata-all.c	optional ata | atacore
dev/ata/ata-dma.c	optional ata | atacore
dev/ata/ata-lowlevel.c	optional ata | atacore
dev/ata/ata-queue.c	optional ata | atacore
dev/ata/ata-sata.c	optional ata | atacore
dev/ata/ata-card.c	optional ata pccard | atapccard
dev/ata/ata-cbus.c	optional ata pc98 | atapc98
dev/ata/ata-isa.c	optional ata isa | ataisa
dev/ata/ata-pci.c	optional ata pci | atapci
dev/ata/chipsets/ata-ahci.c	optional ata pci | ataahci | ataacerlabs | \
	ataati | ataintel | atajmicron | \
	atavia | atanvidia
dev/ata/chipsets/ata-acard.c	optional ata pci | ataacard
dev/ata/chipsets/ata-acerlabs.c	optional ata pci | ataacerlabs
dev/ata/chipsets/ata-adaptec.c	optional ata pci | ataadaptec
dev/ata/chipsets/ata-amd.c	optional ata pci | ataamd
dev/ata/chipsets/ata-ati.c	optional ata pci | ataati
dev/ata/chipsets/ata-cenatek.c	optional ata pci | atacenatek
dev/ata/chipsets/ata-cypress.c	optional ata pci | atacypress
dev/ata/chipsets/ata-cyrix.c	optional ata pci | atacyrix
dev/ata/chipsets/ata-highpoint.c	optional ata pci | atahighpoint
dev/ata/chipsets/ata-intel.c	optional ata pci | ataintel
dev/ata/chipsets/ata-ite.c	optional ata pci | ataite
dev/ata/chipsets/ata-jmicron.c	optional ata pci | atajmicron
dev/ata/chipsets/ata-marvell.c	optional ata pci | atamarvell | ataadaptec
dev/ata/chipsets/ata-micron.c	optional ata pci | atamicron
dev/ata/chipsets/ata-national.c	optional ata pci | atanational
dev/ata/chipsets/ata-netcell.c	optional ata pci | atanetcell
dev/ata/chipsets/ata-nvidia.c	optional ata pci | atanvidia
dev/ata/chipsets/ata-promise.c	optional ata pci | atapromise
dev/ata/chipsets/ata-serverworks.c	optional ata pci | ataserverworks
dev/ata/chipsets/ata-siliconimage.c	optional ata pci | atasiliconimage | ataati
dev/ata/chipsets/ata-sis.c	optional ata pci | atasis
dev/ata/chipsets/ata-via.c	optional ata pci | atavia
dev/ata/ata-disk.c	optional atadisk
dev/ata/ata-raid.c	optional ataraid
dev/ata/atapi-cd.c	optional atapicd
dev/ata/atapi-fd.c	optional atapifd
dev/ata/atapi-tape.c	optional atapist
dev/ata/atapi-cam.c	optional atapicam
#
dev/ath/if_ath_pci.c	optional ath_pci pci \
	compile-with "${NORMAL_C} -I$S/dev/ath"
#
dev/ath/if_ath_ahb.c	optional ath_ahb \
	compile-with "${NORMAL_C} -I$S/dev/ath"
#
# XXX Work around clang warning, until maintainer approves fix.
dev/ath/if_ath.c	optional ath \
	compile-with "${NORMAL_C} ${NO_WSOMETIMES_UNINITIALIZED} -I$S/dev/ath"
dev/ath/if_ath_debug.c	optional ath \
	compile-with "${NORMAL_C} -I$S/dev/ath"
dev/ath/if_ath_keycache.c	optional ath \
	compile-with "${NORMAL_C} -I$S/dev/ath"
dev/ath/if_ath_tx.c	optional ath \
	compile-with "${NORMAL_C} -I$S/dev/ath"
dev/ath/if_ath_tx_ht.c	optional ath \
	compile-with "${NORMAL_C} -I$S/dev/ath"
dev/ath/if_ath_sysctl.c	optional ath \
	compile-with "${NORMAL_C} -I$S/dev/ath"
dev/ath/ah_osdep.c	optional ath \
	compile-with "${NORMAL_C} -I$S/dev/ath"
#
dev/ath/ath_hal/ah.c	optional ath \
	compile-with "${NORMAL_C} -I$S/dev/ath"
dev/ath/ath_hal/ah_eeprom_v1.c	optional ath_hal | ath_ar5210 \
	compile-with "${NORMAL_C} -I$S/dev/ath"
dev/ath/ath_hal/ah_eeprom_v3.c	optional ath_hal | ath_ar5211 | ath_ar5212 \
	compile-with "${NORMAL_C} -I$S/dev/ath"
dev/ath/ath_hal/ah_eeprom_v14.c \
	optional ath_hal | ath_ar5416 | ath_ar9160 | ath_ar9280 \
	compile-with "${NORMAL_C} -I$S/dev/ath"
dev/ath/ath_hal/ah_eeprom_v4k.c \
	optional ath_hal | ath_ar9285 \
	compile-with "${NORMAL_C} -I$S/dev/ath"
# XXX Work around clang warning, until maintainer approves fix.
dev/ath/ath_hal/ah_eeprom_9287.c \
	optional ath_hal | ath_ar9287 \
	compile-with "${NORMAL_C} ${NO_WSOMETIMES_UNINITIALIZED} -I$S/dev/ath"
dev/ath/ath_hal/ah_regdomain.c	optional ath \
	compile-with "${NORMAL_C} ${NO_WSHIFT_COUNT_NEGATIVE} ${NO_WSHIFT_COUNT_OVERFLOW} -I$S/dev/ath"
# ar5210
dev/ath/ath_hal/ar5210/ar5210_attach.c	optional ath_hal | ath_ar5210 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5210/ar5210_beacon.c	optional ath_hal | ath_ar5210 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5210/ar5210_interrupts.c	optional ath_hal | ath_ar5210 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5210/ar5210_keycache.c	optional ath_hal | ath_ar5210 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5210/ar5210_misc.c	optional ath_hal | ath_ar5210 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5210/ar5210_phy.c	optional ath_hal | ath_ar5210 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5210/ar5210_power.c	optional ath_hal | ath_ar5210 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5210/ar5210_recv.c	optional ath_hal | ath_ar5210 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5210/ar5210_reset.c	optional ath_hal | ath_ar5210 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5210/ar5210_xmit.c	optional ath_hal | ath_ar5210 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
# ar5211
dev/ath/ath_hal/ar5211/ar5211_attach.c	optional ath_hal | ath_ar5211 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5211/ar5211_beacon.c	optional ath_hal | ath_ar5211 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5211/ar5211_interrupts.c	optional ath_hal | ath_ar5211 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5211/ar5211_keycache.c	optional ath_hal | ath_ar5211 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5211/ar5211_misc.c	optional ath_hal | ath_ar5211 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5211/ar5211_phy.c	optional ath_hal | ath_ar5211 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5211/ar5211_power.c	optional ath_hal | ath_ar5211 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5211/ar5211_recv.c	optional ath_hal | ath_ar5211 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5211/ar5211_reset.c	optional ath_hal | ath_ar5211 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5211/ar5211_xmit.c	optional ath_hal | ath_ar5211 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
# ar5212
dev/ath/ath_hal/ar5212/ar5212_ani.c \
	optional ath_hal | ath_ar5212 | ath_ar5416 | ath_ar9160 | ath_ar9280 | \
	ath_ar9285 ath_ar9287 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5212/ar5212_attach.c \
	optional ath_hal | ath_ar5212 | ath_ar5416 | ath_ar9160 | ath_ar9280 | \
	ath_ar9285 ath_ar9287 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5212/ar5212_beacon.c \
	optional ath_hal | ath_ar5212 | ath_ar5416 | ath_ar9160 | ath_ar9280 | \
	ath_ar9285 ath_ar9287 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5212/ar5212_eeprom.c \
	optional ath_hal | ath_ar5212 | ath_ar5416 | ath_ar9160 | ath_ar9280 | \
	ath_ar9285 ath_ar9287 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5212/ar5212_gpio.c \
	optional ath_hal | ath_ar5212 | ath_ar5416 | ath_ar9160 | ath_ar9280 | \
	ath_ar9285 ath_ar9287 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5212/ar5212_interrupts.c \
	optional ath_hal | ath_ar5212 | ath_ar5416 | ath_ar9160 | ath_ar9280 | \
	ath_ar9285 ath_ar9287 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5212/ar5212_keycache.c \
	optional ath_hal | ath_ar5212 | ath_ar5416 | ath_ar9160 | ath_ar9280 | \
	ath_ar9285 ath_ar9287 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5212/ar5212_misc.c \
	optional ath_hal | ath_ar5212 | ath_ar5416 | ath_ar9160 | ath_ar9280 | \
	ath_ar9285 ath_ar9287 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5212/ar5212_phy.c \
	optional ath_hal | ath_ar5212 | ath_ar5416 | ath_ar9160 | ath_ar9280 | \
	ath_ar9285 ath_ar9287 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5212/ar5212_power.c \
	optional ath_hal | ath_ar5212 | ath_ar5416 | ath_ar9160 | ath_ar9280 | \
	ath_ar9285 ath_ar9287 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5212/ar5212_recv.c \
	optional ath_hal | ath_ar5212 | ath_ar5416 | ath_ar9160 | ath_ar9280 | \
	ath_ar9285 ath_ar9287 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5212/ar5212_reset.c \
	optional ath_hal | ath_ar5212 | ath_ar5416 | ath_ar9160 | ath_ar9280 | \
	ath_ar9285 ath_ar9287 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5212/ar5212_rfgain.c \
	optional ath_hal | ath_ar5212 | ath_ar5416 | ath_ar9160 | ath_ar9280 | \
	ath_ar9285 ath_ar9287 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5212/ar5212_xmit.c \
	optional ath_hal | ath_ar5212 | ath_ar5416 | ath_ar9160 | ath_ar9280 | \
	ath_ar9285 ath_ar9287 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
# ar5416 (depends on ar5212)
dev/ath/ath_hal/ar5416/ar5416_ani.c \
	optional ath_hal | ath_ar5416 | ath_ar9160 | ath_ar9280 | ath_ar9285 | \
	ath_ar9287 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5416/ar5416_attach.c \
	optional ath_hal | ath_ar5416 | ath_ar9160 | ath_ar9280 | ath_ar9285 | \
	ath_ar9287 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5416/ar5416_beacon.c \
	optional ath_hal | ath_ar5416 | ath_ar9160 | ath_ar9280 | ath_ar9285 | \
	ath_ar9287 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5416/ar5416_cal.c \
	optional ath_hal | ath_ar5416 | ath_ar9160 | ath_ar9280 | ath_ar9285 | \
	ath_ar9287 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5416/ar5416_cal_iq.c \
	optional ath_hal | ath_ar5416 | ath_ar9160 | ath_ar9280 | ath_ar9285 | \
	ath_ar9287 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5416/ar5416_cal_adcgain.c \
	optional ath_hal | ath_ar5416 | ath_ar9160 | ath_ar9280 | ath_ar9285 | \
	ath_ar9287 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5416/ar5416_cal_adcdc.c \
	optional ath_hal | ath_ar5416 | ath_ar9160 | ath_ar9280 | ath_ar9285 | \
	ath_ar9287 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5416/ar5416_eeprom.c \
	optional ath_hal | ath_ar5416 | ath_ar9160 | ath_ar9280 | ath_ar9285 | \
	ath_ar9287 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5416/ar5416_gpio.c \
	optional ath_hal | ath_ar5416 | ath_ar9160 | ath_ar9280 | ath_ar9285 | \
	ath_ar9287 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5416/ar5416_interrupts.c \
	optional ath_hal | ath_ar5416 | ath_ar9160 | ath_ar9280 | ath_ar9285 | \
	ath_ar9287 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5416/ar5416_keycache.c \
	optional ath_hal | ath_ar5416 | ath_ar9160 | ath_ar9280 | ath_ar9285 | \
	ath_ar9287 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5416/ar5416_misc.c \
	optional ath_hal | ath_ar5416 | ath_ar9160 | ath_ar9280 | ath_ar9285 | \
	ath_ar9287 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5416/ar5416_phy.c \
	optional ath_hal | ath_ar5416 | ath_ar9160 | ath_ar9280 | ath_ar9285 | \
	ath_ar9287 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5416/ar5416_power.c \
	optional ath_hal | ath_ar5416 | ath_ar9160 | ath_ar9280 | ath_ar9285 | \
	ath_ar9287 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5416/ar5416_recv.c \
	optional ath_hal | ath_ar5416 | ath_ar9160 | ath_ar9280 | ath_ar9285 | \
	ath_ar9287 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5416/ar5416_reset.c \
	optional ath_hal | ath_ar5416 | ath_ar9160 | ath_ar9280 | ath_ar9285 | \
	ath_ar9287 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5416/ar5416_xmit.c \
	optional ath_hal | ath_ar5416 | ath_ar9160 | ath_ar9280 | ath_ar9285 | \
	ath_ar9287 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
# ar9130 (depends upon ar5416) - also requires AH_SUPPORT_AR9130
dev/ath/ath_hal/ar9001/ar9130_attach.c	optional ath_hal | ath_ar9130 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar9001/ar9130_phy.c	optional ath_hal | ath_ar9130 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar9001/ar9130_eeprom.c	optional ath_hal | ath_ar9130 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
# ar9160 (depends on ar5416)
dev/ath/ath_hal/ar9001/ar9160_attach.c	optional ath_hal | ath_ar9160 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
# ar9280 (depends on ar5416)
dev/ath/ath_hal/ar9002/ar9280_attach.c	optional ath_hal | ath_ar9280 | \
	ath_ar9285 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar9002/ar9280_olc.c	optional ath_hal | ath_ar9280 | \
	ath_ar9285 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
# ar9285 (depends on ar5416 and ar9280)
dev/ath/ath_hal/ar9002/ar9285_attach.c	optional ath_hal | ath_ar9285 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar9002/ar9285_reset.c	optional ath_hal | ath_ar9285 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar9002/ar9285_cal.c	optional ath_hal | ath_ar9285 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar9002/ar9285_phy.c	optional ath_hal | ath_ar9285 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar9002/ar9285_diversity.c	optional ath_hal | ath_ar9285 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
# ar9287 (depends on ar5416)
dev/ath/ath_hal/ar9002/ar9287_attach.c	optional ath_hal | ath_ar9287 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar9002/ar9287_reset.c	optional ath_hal | ath_ar9287 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar9002/ar9287_cal.c	optional ath_hal | ath_ar9287 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar9002/ar9287_olc.c	optional ath_hal | ath_ar9287 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
# rf backends
dev/ath/ath_hal/ar5212/ar2316.c	optional ath_rf2316 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5212/ar2317.c	optional ath_rf2317 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5212/ar2413.c	optional ath_hal | ath_rf2413 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5212/ar2425.c	optional ath_hal | ath_rf2425 | ath_rf2417 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5212/ar5111.c	optional ath_hal | ath_rf5111 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5212/ar5112.c	optional ath_hal | ath_rf5112 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5212/ar5413.c	optional ath_hal | ath_rf5413 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar5416/ar2133.c	optional ath_hal | ath_ar5416 | ath_ar9160 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar9002/ar9280.c	optional ath_hal | ath_ar9280 | ath_ar9285 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar9002/ar9285.c	optional ath_hal | ath_ar9285 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
dev/ath/ath_hal/ar9002/ar9287.c	optional ath_hal | ath_ar9287 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
# ath rate control algorithms
dev/ath/ath_rate/amrr/amrr.c	optional ath_rate_amrr \
	compile-with "${NORMAL_C} -I$S/dev/ath"
dev/ath/ath_rate/onoe/onoe.c	optional ath_rate_onoe \
	compile-with "${NORMAL_C} -I$S/dev/ath"
dev/ath/ath_rate/sample/sample.c	optional ath_rate_sample \
	compile-with "${NORMAL_C} -I$S/dev/ath"
# ath DFS modules
dev/ath/ath_dfs/null/dfs_null.c	optional ath \
	compile-with "${NORMAL_C} -I$S/dev/ath"
#
dev/bce/if_bce.c	optional bce
dev/bfe/if_bfe.c	optional bfe
dev/bge/if_bge.c	optional bge
dev/bktr/bktr_audio.c	optional bktr pci
dev/bktr/bktr_card.c	optional bktr pci
dev/bktr/bktr_core.c	optional bktr pci
dev/bktr/bktr_i2c.c	optional bktr pci smbus
dev/bktr/bktr_os.c	optional bktr pci
dev/bktr/bktr_tuner.c	optional bktr pci
dev/bktr/msp34xx.c	optional bktr pci
dev/buslogic/bt.c	optional bt
dev/buslogic/bt_eisa.c	optional bt eisa
dev/buslogic/bt_isa.c	optional bt isa
dev/buslogic/bt_mca.c	optional bt mca
dev/buslogic/bt_pci.c	optional bt pci
dev/bwi/bwimac.c	optional bwi
dev/bwi/bwiphy.c	optional bwi
dev/bwi/bwirf.c	optional bwi
dev/bwi/if_bwi.c	optional bwi
dev/bwi/if_bwi_pci.c	optional bwi pci
# XXX Work around clang warning, until maintainer approves fix.
dev/bwn/if_bwn.c	optional bwn siba_bwn \
	compile-with "${NORMAL_C} ${NO_WSOMETIMES_UNINITIALIZED}"
dev/cardbus/cardbus.c	optional cardbus
dev/cardbus/cardbus_cis.c	optional cardbus
dev/cardbus/cardbus_device.c	optional cardbus
dev/cas/if_cas.c	optional cas
dev/cfi/cfi_core.c	optional cfi
dev/cfi/cfi_dev.c	optional cfi
dev/cfi/cfi_disk.c	optional cfid
dev/ciss/ciss.c	optional ciss
dev/cm/smc90cx6.c	optional cm
dev/cmx/cmx.c	optional cmx
dev/cmx/cmx_pccard.c	optional cmx pccard
dev/cpufreq/ichss.c	optional cpufreq
dev/cs/if_cs.c	optional cs
dev/cs/if_cs_isa.c	optional cs isa
dev/cs/if_cs_pccard.c	optional cs pccard
dev/cxgb/cxgb_main.c	optional cxgb pci \
	compile-with "${NORMAL_C} -I$S/dev/cxgb"
dev/cxgb/cxgb_sge.c	optional cxgb pci \
	compile-with "${NORMAL_C} -I$S/dev/cxgb"
dev/cxgb/common/cxgb_mc5.c	optional cxgb pci \
	compile-with "${NORMAL_C} -I$S/dev/cxgb"
dev/cxgb/common/cxgb_vsc7323.c	optional cxgb pci \
	compile-with "${NORMAL_C} -I$S/dev/cxgb"
dev/cxgb/common/cxgb_vsc8211.c	optional cxgb pci \
	compile-with "${NORMAL_C} -I$S/dev/cxgb"
dev/cxgb/common/cxgb_ael1002.c	optional cxgb pci \
	compile-with "${NORMAL_C} -I$S/dev/cxgb"
dev/cxgb/common/cxgb_aq100x.c	optional cxgb pci \
	compile-with "${NORMAL_C} -I$S/dev/cxgb"
dev/cxgb/common/cxgb_mv88e1xxx.c	optional cxgb pci \
	compile-with "${NORMAL_C} -I$S/dev/cxgb"
dev/cxgb/common/cxgb_xgmac.c	optional cxgb pci \
	compile-with "${NORMAL_C} -I$S/dev/cxgb"
dev/cxgb/common/cxgb_t3_hw.c	optional cxgb pci \
	compile-with "${NORMAL_C} -I$S/dev/cxgb"
dev/cxgb/common/cxgb_tn1010.c	optional cxgb pci \
	compile-with "${NORMAL_C} -I$S/dev/cxgb"
dev/cxgb/sys/uipc_mvec.c	optional cxgb pci \
	compile-with "${NORMAL_C} -I$S/dev/cxgb"
dev/cxgb/cxgb_t3fw.c	optional cxgb cxgb_t3fw \
	compile-with "${NORMAL_C} -I$S/dev/cxgb"
dev/cxgbe/t4_main.c	optional cxgbe pci \
	compile-with "${NORMAL_C} -I$S/dev/cxgbe"
dev/cxgbe/t4_sge.c	optional cxgbe pci \
	compile-with "${NORMAL_C} -I$S/dev/cxgbe"
dev/cxgbe/t4_l2t.c	optional cxgbe pci \
	compile-with "${NORMAL_C} -I$S/dev/cxgbe"
dev/cxgbe/common/t4_hw.c	optional cxgbe pci \
	compile-with "${NORMAL_C} -I$S/dev/cxgbe"
t4fw_cfg.c	optional cxgbe \
	compile-with "${AWK} -f $S/tools/fw_stub.awk t4fw_cfg.fw:t4fw_cfg t4fw_cfg_uwire.fw:t4fw_cfg_uwire t4fw.fw:t4fw -mt4fw_cfg -c${.TARGET}" \
	no-implicit-rule before-depend local \
	clean "t4fw_cfg.c"
t4fw_cfg.fwo	optional cxgbe \
	dependency "t4fw_cfg.fw" \
	compile-with "${NORMAL_FWO}" \
	no-implicit-rule \
	clean "t4fw_cfg.fwo"
t4fw_cfg.fw	optional cxgbe \
	dependency "$S/dev/cxgbe/firmware/t4fw_cfg.txt" \
	compile-with "${CP} ${.ALLSRC} ${.TARGET}" \
	no-obj no-implicit-rule \
	clean "t4fw_cfg.fw"
t4fw_cfg_uwire.fwo	optional cxgbe \
	dependency "t4fw_cfg_uwire.fw" \
	compile-with "${NORMAL_FWO}" \
	no-implicit-rule \
	clean "t4fw_cfg_uwire.fwo"
t4fw_cfg_uwire.fw	optional cxgbe \
	dependency "$S/dev/cxgbe/firmware/t4fw_cfg_uwire.txt" \
	compile-with "${CP} ${.ALLSRC} ${.TARGET}" \
	no-obj no-implicit-rule \
	clean "t4fw_cfg_uwire.fw"
t4fw.fwo	optional cxgbe \
	dependency "t4fw.fw" \
	compile-with "${NORMAL_FWO}" \
	no-implicit-rule \
	clean "t4fw.fwo"
t4fw.fw	optional cxgbe \
	dependency "$S/dev/cxgbe/firmware/t4fw-1.8.11.0.bin.uu" \
	compile-with "${NORMAL_FW}" \
	no-obj no-implicit-rule \
	clean "t4fw.fw"
t5fw_cfg.c	optional cxgbe \
	compile-with "${AWK} -f $S/tools/fw_stub.awk t5fw_cfg.fw:t5fw_cfg t5fw.fw:t5fw -mt5fw_cfg -c${.TARGET}" \
	no-implicit-rule before-depend local \
	clean "t5fw_cfg.c"
t5fw_cfg.fwo	optional cxgbe \
	dependency "t5fw_cfg.fw" \
	compile-with "${NORMAL_FWO}" \
	no-implicit-rule \
	clean "t5fw_cfg.fwo"
t5fw_cfg.fw	optional cxgbe \
	dependency "$S/dev/cxgbe/firmware/t5fw_cfg.txt" \
	compile-with "${CP} ${.ALLSRC} ${.TARGET}" \
	no-obj no-implicit-rule \
	clean "t5fw_cfg.fw"
t5fw.fwo	optional cxgbe \
	dependency "t5fw.fw" \
	compile-with "${NORMAL_FWO}" \
	no-implicit-rule \
	clean "t5fw.fwo"
t5fw.fw	optional cxgbe \
	dependency "$S/dev/cxgbe/firmware/t5fw-1.8.22.0.bin.uu" \
	compile-with "${NORMAL_FW}" \
	no-obj no-implicit-rule \
	clean "t5fw.fw"
dev/cy/cy.c	optional cy
dev/cy/cy_isa.c	optional cy isa
dev/cy/cy_pci.c	optional cy pci
dev/dc/if_dc.c	optional dc pci
dev/dc/dcphy.c	optional dc pci
dev/dc/pnphy.c	optional dc pci
dev/dcons/dcons.c	optional dcons
dev/dcons/dcons_crom.c	optional dcons_crom
dev/dcons/dcons_os.c	optional dcons
dev/de/if_de.c	optional de pci
dev/digi/CX.c	optional digi_CX
dev/digi/CX_PCI.c	optional digi_CX_PCI
dev/digi/EPCX.c	optional digi_EPCX
dev/digi/EPCX_PCI.c	optional digi_EPCX_PCI
dev/digi/Xe.c	optional digi_Xe
dev/digi/Xem.c	optional digi_Xem
dev/digi/Xr.c	optional digi_Xr
dev/digi/digi.c	optional digi
dev/digi/digi_isa.c	optional digi isa
dev/digi/digi_pci.c	optional digi pci
dev/dpt/dpt_eisa.c	optional dpt eisa
dev/dpt/dpt_pci.c	optional dpt pci
dev/dpt/dpt_scsi.c	optional dpt
dev/drm/ati_pcigart.c	optional drm
dev/drm/drm_agpsupport.c	optional drm
dev/drm/drm_auth.c	optional drm
dev/drm/drm_bufs.c	optional drm
dev/drm/drm_context.c	optional drm
dev/drm/drm_dma.c	optional drm
dev/drm/drm_drawable.c	optional drm
dev/drm/drm_drv.c	optional drm
dev/drm/drm_fops.c	optional drm
dev/drm/drm_hashtab.c	optional drm
dev/drm/drm_ioctl.c	optional drm
dev/drm/drm_irq.c	optional drm
dev/drm/drm_lock.c	optional drm
dev/drm/drm_memory.c	optional drm
dev/drm/drm_mm.c	optional drm
dev/drm/drm_pci.c	optional drm
dev/drm/drm_scatter.c	optional drm
dev/drm/drm_sman.c	optional drm
dev/drm/drm_sysctl.c	optional drm
dev/drm/drm_vm.c	optional drm
dev/drm/i915_dma.c	optional i915drm
dev/drm/i915_drv.c	optional i915drm
dev/drm/i915_irq.c	optional i915drm
dev/drm/i915_mem.c	optional i915drm
dev/drm/i915_suspend.c	optional i915drm
dev/drm/mach64_dma.c	optional mach64drm
dev/drm/mach64_drv.c	optional mach64drm
dev/drm/mach64_irq.c	optional mach64drm
dev/drm/mach64_state.c	optional mach64drm
dev/drm/mga_dma.c	optional mgadrm
dev/drm/mga_drv.c	optional mgadrm
dev/drm/mga_irq.c	optional mgadrm
dev/drm/mga_state.c	optional mgadrm \
	compile-with "${NORMAL_C} -finline-limit=13500"
dev/drm/mga_warp.c	optional mgadrm
dev/drm/r128_cce.c	optional r128drm \
	compile-with "${NORMAL_C} ${NO_WUNUSED_VALUE} ${NO_WCONSTANT_CONVERSION}"
dev/drm/r128_drv.c	optional r128drm
dev/drm/r128_irq.c	optional r128drm
dev/drm/r128_state.c	optional r128drm \
	compile-with "${NORMAL_C} ${NO_WUNUSED_VALUE} -finline-limit=13500"
dev/drm/r300_cmdbuf.c	optional radeondrm
dev/drm/r600_blit.c	optional radeondrm
dev/drm/r600_cp.c	optional radeondrm \
	compile-with "${NORMAL_C} ${NO_WUNUSED_VALUE} ${NO_WCONSTANT_CONVERSION}"
dev/drm/radeon_cp.c	optional radeondrm \
	compile-with "${NORMAL_C} ${NO_WUNUSED_VALUE} ${NO_WCONSTANT_CONVERSION}"
dev/drm/radeon_cs.c	optional radeondrm
dev/drm/radeon_drv.c	optional radeondrm
dev/drm/radeon_irq.c	optional radeondrm
dev/drm/radeon_mem.c	optional radeondrm
dev/drm/radeon_state.c	optional radeondrm
dev/drm/savage_bci.c	optional savagedrm
dev/drm/savage_drv.c	optional savagedrm
dev/drm/savage_state.c	optional savagedrm
dev/drm/sis_drv.c	optional sisdrm
dev/drm/sis_ds.c	optional sisdrm
dev/drm/sis_mm.c	optional sisdrm
dev/drm/tdfx_drv.c	optional tdfxdrm
dev/drm/via_dma.c	optional viadrm
dev/drm/via_dmablit.c	optional viadrm
dev/drm/via_drv.c	optional viadrm
dev/drm/via_irq.c	optional viadrm
dev/drm/via_map.c	optional viadrm
dev/drm/via_mm.c	optional viadrm
dev/drm/via_verifier.c	optional viadrm
dev/drm/via_video.c	optional viadrm
dev/ed/if_ed.c	optional ed
dev/ed/if_ed_novell.c	optional ed
dev/ed/if_ed_rtl80x9.c	optional ed
dev/ed/if_ed_pccard.c	optional ed pccard
dev/ed/if_ed_pci.c	optional ed pci
dev/eisa/eisa_if.m	standard
dev/eisa/eisaconf.c	optional eisa
dev/e1000/if_em.c	optional em \
	compile-with "${NORMAL_C} -I$S/dev/e1000"
dev/e1000/if_lem.c	optional em \
	compile-with "${NORMAL_C} -I$S/dev/e1000"
dev/e1000/if_igb.c	optional igb \
	compile-with "${NORMAL_C} -I$S/dev/e1000"
dev/e1000/e1000_80003es2lan.c	optional em | igb \
	compile-with "${NORMAL_C} -I$S/dev/e1000"
dev/e1000/e1000_82540.c	optional em | igb \
	compile-with "${NORMAL_C} -I$S/dev/e1000"
dev/e1000/e1000_82541.c	optional em | igb \
	compile-with "${NORMAL_C} -I$S/dev/e1000"
dev/e1000/e1000_82542.c	optional em | igb \
	compile-with "${NORMAL_C} -I$S/dev/e1000"
dev/e1000/e1000_82543.c	optional em | igb \
	compile-with "${NORMAL_C} -I$S/dev/e1000"
dev/e1000/e1000_82571.c	optional em | igb \
	compile-with "${NORMAL_C} -I$S/dev/e1000"
dev/e1000/e1000_82575.c	optional em | igb \
	compile-with "${NORMAL_C} -I$S/dev/e1000"
dev/e1000/e1000_ich8lan.c	optional em | igb \
	compile-with "${NORMAL_C} -I$S/dev/e1000"
dev/e1000/e1000_i210.c	optional em | igb \
	compile-with "${NORMAL_C} -I$S/dev/e1000"
dev/e1000/e1000_api.c	optional em | igb \
	compile-with "${NORMAL_C} -I$S/dev/e1000"
dev/e1000/e1000_mac.c	optional em | igb \
	compile-with "${NORMAL_C} -I$S/dev/e1000"
dev/e1000/e1000_manage.c	optional em | igb \
	compile-with "${NORMAL_C} -I$S/dev/e1000"
dev/e1000/e1000_nvm.c	optional em | igb \
	compile-with "${NORMAL_C} -I$S/dev/e1000"
dev/e1000/e1000_phy.c	optional em | igb \
	compile-with "${NORMAL_C} -I$S/dev/e1000"
dev/e1000/e1000_vf.c	optional em | igb \
	compile-with "${NORMAL_C} -I$S/dev/e1000"
dev/e1000/e1000_mbx.c	optional em | igb \
	compile-with "${NORMAL_C} -I$S/dev/e1000"
dev/e1000/e1000_osdep.c	optional em | igb \
	compile-with "${NORMAL_C} -I$S/dev/e1000"
dev/et/if_et.c	optional et
dev/en/if_en_pci.c	optional en pci
dev/en/midway.c	optional en
dev/ep/if_ep.c	optional ep
dev/ep/if_ep_eisa.c	optional ep eisa
dev/ep/if_ep_isa.c	optional ep isa
dev/ep/if_ep_mca.c	optional ep mca
dev/ep/if_ep_pccard.c	optional ep pccard
dev/esp/esp_pci.c	optional esp pci
dev/esp/ncr53c9x.c	optional esp
dev/ex/if_ex.c	optional ex
dev/ex/if_ex_isa.c	optional ex isa
dev/ex/if_ex_pccard.c	optional ex pccard
dev/exca/exca.c	optional cbb
dev/fatm/if_fatm.c	optional fatm pci
dev/fb/splash.c	optional splash
dev/fdt/fdt_common.c	optional fdt
dev/fdt/fdt_pci.c	optional fdt pci
dev/fdt/fdt_static_dtb.S	optional fdt fdt_dtb_static
dev/fdt/fdtbus.c	optional fdt
dev/fdt/simplebus.c	optional fdt
dev/fe/if_fe.c	optional fe
dev/fe/if_fe_pccard.c	optional fe pccard
dev/firewire/firewire.c	optional firewire
dev/firewire/fwcrom.c	optional firewire
dev/firewire/fwdev.c	optional firewire
dev/firewire/fwdma.c	optional firewire
dev/firewire/fwmem.c	optional firewire
dev/firewire/fwohci.c	optional firewire
dev/firewire/fwohci_pci.c	optional firewire pci
dev/firewire/if_fwe.c	optional fwe
dev/firewire/if_fwip.c	optional fwip
dev/firewire/sbp.c	optional sbp
dev/firewire/sbp_targ.c	optional sbp_targ
dev/flash/at45d.c	optional at45d
dev/flash/mx25l.c	optional mx25l
dev/fxp/if_fxp.c	optional fxp
dev/fxp/inphy.c	optional fxp
dev/gem/if_gem.c	optional gem
dev/gem/if_gem_pci.c	optional gem pci
dev/gem/if_gem_sbus.c	optional gem sbus
dev/gpio/gpiobus.c	optional gpio \
	dependency "gpiobus_if.h"
dev/gpio/gpioc.c	optional gpio \
	dependency "gpio_if.h"
dev/gpio/gpioiic.c	optional gpioiic
dev/gpio/gpioled.c	optional gpioled
dev/gpio/gpio_if.m	optional gpio
dev/gpio/gpiobus_if.m	optional gpio
dev/hatm/if_hatm.c	optional hatm pci
dev/hatm/if_hatm_intr.c	optional hatm pci
dev/hatm/if_hatm_ioctl.c	optional hatm pci
dev/hatm/if_hatm_rx.c	optional hatm pci
dev/hatm/if_hatm_tx.c	optional hatm pci
dev/hifn/hifn7751.c	optional hifn
dev/hme/if_hme.c	optional hme
dev/hme/if_hme_pci.c	optional hme pci
dev/hme/if_hme_sbus.c	optional hme sbus
dev/hptiop/hptiop.c	optional hptiop scbus
dev/hwpmc/hwpmc_logging.c	optional hwpmc
dev/hwpmc/hwpmc_mod.c	optional hwpmc
dev/hwpmc/hwpmc_soft.c	optional hwpmc
dev/ichsmb/ichsmb.c	optional ichsmb
dev/ichsmb/ichsmb_pci.c	optional ichsmb pci
dev/ida/ida.c	optional ida
dev/ida/ida_disk.c	optional ida
dev/ida/ida_eisa.c	optional ida eisa
dev/ida/ida_pci.c	optional ida pci
dev/ie/if_ie.c	optional ie isa nowerror
dev/ie/if_ie_isa.c	optional ie isa
dev/ieee488/ibfoo.c	optional pcii | tnt4882
dev/ieee488/pcii.c	optional pcii
dev/ieee488/tnt4882.c	optional tnt4882
dev/ieee488/upd7210.c	optional pcii | tnt4882
dev/iicbus/ad7418.c	optional ad7418
dev/iicbus/ds133x.c	optional ds133x
dev/iicbus/ds1672.c	optional ds1672
dev/iicbus/icee.c	optional icee
dev/iicbus/if_ic.c	optional ic
dev/iicbus/iic.c	optional iic
dev/iicbus/iicbb.c	optional iicbb
dev/iicbus/iicbb_if.m	optional iicbb
dev/iicbus/iicbus.c	optional iicbus
dev/iicbus/iicbus_if.m	optional iicbus
dev/iicbus/iiconf.c	optional iicbus
dev/iicbus/iicsmb.c	optional iicsmb \
	dependency "iicbus_if.h"
dev/iicbus/pcf8563.c	optional pcf8563
dev/iir/iir.c	optional iir
dev/iir/iir_ctrl.c	optional iir
dev/iir/iir_pci.c	optional iir pci
# XXX Work around clang warning, until maintainer approves fix.
dev/ips/ips.c	optional ips \
	compile-with "${NORMAL_C} ${NO_WSOMETIMES_UNINITIALIZED}"
dev/ips/ips_commands.c	optional ips
dev/ips/ips_disk.c	optional ips
dev/ips/ips_ioctl.c	optional ips
dev/ips/ips_pci.c	optional ips pci
dev/ipw/if_ipw.c	optional ipw
ipwbssfw.c	optional ipwbssfw | ipwfw \
	compile-with "${AWK} -f $S/tools/fw_stub.awk ipw_bss.fw:ipw_bss:130 -lintel_ipw -mipw_bss -c${.TARGET}" \
	no-implicit-rule before-depend local \
	clean "ipwbssfw.c"
ipw_bss.fwo	optional ipwbssfw | ipwfw \
	dependency "ipw_bss.fw" \
	compile-with "${NORMAL_FWO}" \
	no-implicit-rule \
	clean "ipw_bss.fwo"
ipw_bss.fw	optional ipwbssfw | ipwfw \
	dependency "$S/contrib/dev/ipw/ipw2100-1.3.fw.uu" \
	compile-with "${NORMAL_FW}" \
	no-obj no-implicit-rule \
	clean "ipw_bss.fw"
ipwibssfw.c	optional ipwibssfw | ipwfw \
	compile-with "${AWK} -f $S/tools/fw_stub.awk ipw_ibss.fw:ipw_ibss:130 -lintel_ipw -mipw_ibss -c${.TARGET}" \
	no-implicit-rule before-depend local \
	clean "ipwibssfw.c"
ipw_ibss.fwo	optional ipwibssfw | ipwfw \
	dependency "ipw_ibss.fw" \
	compile-with "${NORMAL_FWO}" \
	no-implicit-rule \
	clean "ipw_ibss.fwo"
ipw_ibss.fw	optional ipwibssfw | ipwfw \
	dependency "$S/contrib/dev/ipw/ipw2100-1.3-i.fw.uu" \
	compile-with "${NORMAL_FW}" \
	no-obj no-implicit-rule \
	clean "ipw_ibss.fw"
ipwmonitorfw.c	optional ipwmonitorfw | ipwfw \
	compile-with "${AWK} -f $S/tools/fw_stub.awk ipw_monitor.fw:ipw_monitor:130 -lintel_ipw -mipw_monitor -c${.TARGET}" \
	no-implicit-rule before-depend local \
	clean "ipwmonitorfw.c"
ipw_monitor.fwo	optional ipwmonitorfw | ipwfw \
	dependency "ipw_monitor.fw" \
	compile-with "${NORMAL_FWO}" \
	no-implicit-rule \
	clean "ipw_monitor.fwo"
ipw_monitor.fw	optional ipwmonitorfw | ipwfw \
	dependency "$S/contrib/dev/ipw/ipw2100-1.3-p.fw.uu" \
	compile-with "${NORMAL_FW}" \
	no-obj no-implicit-rule \
	clean "ipw_monitor.fw"
dev/iscsi/initiator/iscsi.c	optional iscsi_initiator scbus
dev/iscsi/initiator/iscsi_subr.c	optional iscsi_initiator scbus
dev/iscsi/initiator/isc_cam.c	optional iscsi_initiator scbus
dev/iscsi/initiator/isc_soc.c	optional iscsi_initiator scbus
dev/iscsi/initiator/isc_sm.c	optional iscsi_initiator scbus
dev/iscsi/initiator/isc_subr.c	optional iscsi_initiator scbus
dev/isp/isp.c	optional isp
dev/isp/isp_freebsd.c	optional isp
dev/isp/isp_library.c	optional isp
dev/isp/isp_pci.c	optional isp pci
dev/isp/isp_sbus.c	optional isp sbus
dev/isp/isp_target.c	optional isp
dev/ispfw/ispfw.c	optional ispfw
dev/iwi/if_iwi.c	optional iwi
iwibssfw.c	optional iwibssfw | iwifw \
	compile-with "${AWK} -f $S/tools/fw_stub.awk iwi_bss.fw:iwi_bss:300 -lintel_iwi -miwi_bss -c${.TARGET}" \
	no-implicit-rule before-depend local \
	clean "iwibssfw.c"
iwi_bss.fwo	optional iwibssfw | iwifw \
	dependency "iwi_bss.fw" \
	compile-with "${NORMAL_FWO}" \
	no-implicit-rule \
	clean "iwi_bss.fwo"
iwi_bss.fw	optional iwibssfw | iwifw \
	dependency "$S/contrib/dev/iwi/ipw2200-bss.fw.uu" \
	compile-with "${NORMAL_FW}" \
	no-obj no-implicit-rule \
	clean "iwi_bss.fw"
iwiibssfw.c	optional iwiibssfw | iwifw \
	compile-with "${AWK} -f $S/tools/fw_stub.awk iwi_ibss.fw:iwi_ibss:300 -lintel_iwi -miwi_ibss -c${.TARGET}" \
	no-implicit-rule before-depend local \
	clean "iwiibssfw.c"
iwi_ibss.fwo	optional iwiibssfw | iwifw \
	dependency "iwi_ibss.fw" \
	compile-with "${NORMAL_FWO}" \
	no-implicit-rule \
	clean "iwi_ibss.fwo"
iwi_ibss.fw	optional iwiibssfw | iwifw \
	dependency "$S/contrib/dev/iwi/ipw2200-ibss.fw.uu" \
	compile-with "${NORMAL_FW}" \
	no-obj no-implicit-rule \
	clean "iwi_ibss.fw"
iwimonitorfw.c	optional iwimonitorfw | iwifw \
	compile-with "${AWK} -f $S/tools/fw_stub.awk iwi_monitor.fw:iwi_monitor:300 -lintel_iwi -miwi_monitor -c${.TARGET}" \
	no-implicit-rule before-depend local \
	clean "iwimonitorfw.c"
iwi_monitor.fwo	optional iwimonitorfw | iwifw \
	dependency "iwi_monitor.fw" \
	compile-with "${NORMAL_FWO}" \
	no-implicit-rule \
	clean "iwi_monitor.fwo"
iwi_monitor.fw	optional iwimonitorfw | iwifw \
	dependency "$S/contrib/dev/iwi/ipw2200-sniffer.fw.uu" \
	compile-with "${NORMAL_FW}" \
	no-obj no-implicit-rule \
	clean "iwi_monitor.fw"
dev/iwn/if_iwn.c	optional iwn
iwn1000fw.c	optional iwn1000fw | iwnfw \
	compile-with "${AWK} -f $S/tools/fw_stub.awk iwn1000.fw:iwn1000fw -miwn1000fw -c${.TARGET}" \
	no-implicit-rule before-depend local \
	clean "iwn1000fw.c"
iwn1000fw.fwo	optional iwn1000fw | iwnfw \
	dependency "iwn1000.fw" \
	compile-with "${NORMAL_FWO}" \
	no-implicit-rule \
	clean "iwn1000fw.fwo"
iwn1000.fw	optional iwn1000fw | iwnfw \
	dependency "$S/contrib/dev/iwn/iwlwifi-1000-39.31.5.1.fw.uu" \
	compile-with "${NORMAL_FW}" \
	no-obj no-implicit-rule \
	clean "iwn1000.fw"
iwn4965fw.c	optional iwn4965fw | iwnfw \
	compile-with "${AWK} -f $S/tools/fw_stub.awk iwn4965.fw:iwn4965fw -miwn4965fw -c${.TARGET}" \
	no-implicit-rule before-depend local \
	clean "iwn4965fw.c"
iwn4965fw.fwo	optional iwn4965fw | iwnfw \
	dependency "iwn4965.fw" \
	compile-with "${NORMAL_FWO}" \
	no-implicit-rule \
	clean "iwn4965fw.fwo"
iwn4965.fw	optional iwn4965fw | iwnfw \
	dependency "$S/contrib/dev/iwn/iwlwifi-4965-228.61.2.24.fw.uu" \
	compile-with "${NORMAL_FW}" \
	no-obj no-implicit-rule \
	clean "iwn4965.fw"
iwn5000fw.c	optional iwn5000fw | iwnfw \
	compile-with "${AWK} -f $S/tools/fw_stub.awk iwn5000.fw:iwn5000fw -miwn5000fw -c${.TARGET}" \
	no-implicit-rule before-depend local \
	clean "iwn5000fw.c"
iwn5000fw.fwo	optional iwn5000fw | iwnfw \
	dependency "iwn5000.fw" \
	compile-with "${NORMAL_FWO}" \
	no-implicit-rule \
	clean "iwn5000fw.fwo"
iwn5000.fw	optional iwn5000fw | iwnfw \
	dependency "$S/contrib/dev/iwn/iwlwifi-5000-8.83.5.1.fw.uu" \
	compile-with "${NORMAL_FW}" \
	no-obj no-implicit-rule \
	clean "iwn5000.fw"
iwn5150fw.c	optional iwn5150fw | iwnfw \
	compile-with "${AWK} -f $S/tools/fw_stub.awk iwn5150.fw:iwn5150fw -miwn5150fw -c${.TARGET}" \
	no-implicit-rule before-depend local \
	clean "iwn5150fw.c"
iwn5150fw.fwo	optional iwn5150fw | iwnfw \
	dependency "iwn5150.fw" \
	compile-with "${NORMAL_FWO}" \
	no-implicit-rule \
	clean "iwn5150fw.fwo"
iwn5150.fw	optional iwn5150fw | iwnfw \
	dependency "$S/contrib/dev/iwn/iwlwifi-5150-8.24.2.2.fw.uu" \
	compile-with "${NORMAL_FW}" \
	no-obj no-implicit-rule \
	clean "iwn5150.fw"
iwn6000fw.c	optional iwn6000fw | iwnfw \
	compile-with "${AWK} -f $S/tools/fw_stub.awk iwn6000.fw:iwn6000fw -miwn6000fw -c${.TARGET}" \
	no-implicit-rule before-depend local \
	clean "iwn6000fw.c"
iwn6000fw.fwo	optional iwn6000fw | iwnfw \
	dependency "iwn6000.fw" \
	compile-with "${NORMAL_FWO}" \
	no-implicit-rule \
	clean "iwn6000fw.fwo"
iwn6000.fw	optional iwn6000fw | iwnfw \
	dependency "$S/contrib/dev/iwn/iwlwifi-6000-9.221.4.1.fw.uu" \
	compile-with "${NORMAL_FW}" \
	no-obj no-implicit-rule \
	clean "iwn6000.fw"
iwn6000g2afw.c	optional iwn6000g2afw | iwnfw \
	compile-with "${AWK} -f $S/tools/fw_stub.awk iwn6000g2a.fw:iwn6000g2afw -miwn6000g2afw -c${.TARGET}" \
	no-implicit-rule before-depend local \
	clean "iwn6000g2afw.c"
iwn6000g2afw.fwo	optional iwn6000g2afw | iwnfw \
	dependency "iwn6000g2a.fw" \
	compile-with "${NORMAL_FWO}" \
	no-implicit-rule \
	clean "iwn6000g2afw.fwo"
iwn6000g2a.fw	optional iwn6000g2afw | iwnfw \
	dependency "$S/contrib/dev/iwn/iwlwifi-6000g2a-17.168.5.2.fw.uu" \
	compile-with "${NORMAL_FW}" \
	no-obj no-implicit-rule \
	clean "iwn6000g2a.fw"
iwn6000g2bfw.c	optional iwn6000g2bfw | iwnfw \
	compile-with "${AWK} -f $S/tools/fw_stub.awk iwn6000g2b.fw:iwn6000g2bfw -miwn6000g2bfw -c${.TARGET}" \
	no-implicit-rule before-depend local \
	clean "iwn6000g2bfw.c"
iwn6000g2bfw.fwo	optional iwn6000g2bfw | iwnfw \
	dependency "iwn6000g2b.fw" \
	compile-with "${NORMAL_FWO}" \
	no-implicit-rule \
	clean "iwn6000g2bfw.fwo"
iwn6000g2b.fw	optional iwn6000g2bfw | iwnfw \
	dependency "$S/contrib/dev/iwn/iwlwifi-6000g2b-17.168.5.2.fw.uu" \
	compile-with "${NORMAL_FW}" \
	no-obj no-implicit-rule \
	clean "iwn6000g2b.fw"
iwn6050fw.c	optional iwn6050fw | iwnfw \
	compile-with "${AWK} -f $S/tools/fw_stub.awk iwn6050.fw:iwn6050fw -miwn6050fw -c${.TARGET}" \
	no-implicit-rule before-depend local \
	clean "iwn6050fw.c"
iwn6050fw.fwo	optional iwn6050fw | iwnfw \
	dependency "iwn6050.fw" \
	compile-with "${NORMAL_FWO}" \
	no-implicit-rule \
	clean "iwn6050fw.fwo"
iwn6050.fw	optional iwn6050fw | iwnfw \
	dependency "$S/contrib/dev/iwn/iwlwifi-6050-41.28.5.1.fw.uu" \
	compile-with "${NORMAL_FW}" \
	no-obj no-implicit-rule \
	clean "iwn6050.fw"
dev/ixgb/if_ixgb.c	optional ixgb
dev/ixgb/ixgb_ee.c	optional ixgb
dev/ixgb/ixgb_hw.c	optional ixgb
dev/ixgbe/ixgbe.c	optional ixgbe inet \
	compile-with "${NORMAL_C} -I$S/dev/ixgbe"
dev/ixgbe/ixv.c	optional ixgbe inet \
	compile-with "${NORMAL_C} -I$S/dev/ixgbe"
dev/ixgbe/ixgbe_phy.c	optional ixgbe inet \
	compile-with "${NORMAL_C} -I$S/dev/ixgbe"
dev/ixgbe/ixgbe_api.c	optional ixgbe inet \
	compile-with "${NORMAL_C} -I$S/dev/ixgbe"
dev/ixgbe/ixgbe_common.c	optional ixgbe inet \
	compile-with "${NORMAL_C} -I$S/dev/ixgbe"
dev/ixgbe/ixgbe_mbx.c	optional ixgbe inet \
	compile-with "${NORMAL_C} -I$S/dev/ixgbe"
dev/ixgbe/ixgbe_vf.c	optional ixgbe inet \
	compile-with "${NORMAL_C} -I$S/dev/ixgbe"
dev/ixgbe/ixgbe_82598.c	optional ixgbe inet \
	compile-with "${NORMAL_C} -I$S/dev/ixgbe"
dev/ixgbe/ixgbe_82599.c	optional ixgbe inet \
	compile-with "${NORMAL_C} -I$S/dev/ixgbe"
dev/ixgbe/ixgbe_x540.c	optional ixgbe inet \
	compile-with "${NORMAL_C} -I$S/dev/ixgbe"
dev/ixgbe/ixgbe_dcb.c	optional ixgbe inet \
	compile-with "${NORMAL_C} -I$S/dev/ixgbe"
dev/ixgbe/ixgbe_dcb_82598.c	optional ixgbe inet \
	compile-with "${NORMAL_C} -I$S/dev/ixgbe"
dev/ixgbe/ixgbe_dcb_82599.c	optional ixgbe inet \
	compile-with "${NORMAL_C} -I$S/dev/ixgbe"
dev/jme/if_jme.c	optional jme pci
dev/joy/joy.c	optional joy
dev/joy/joy_isa.c	optional joy isa
dev/joy/joy_pccard.c	optional joy pccard
dev/kbdmux/kbdmux.c	optional kbdmux
dev/ksyms/ksyms.c	optional ksyms
dev/le/am7990.c	optional le
dev/le/am79900.c	optional le
dev/le/if_le_pci.c	optional le pci
dev/le/lance.c	optional le
dev/led/led.c	standard
dev/lge/if_lge.c	optional lge
dev/lmc/if_lmc.c	optional lmc
dev/malo/if_malo.c	optional malo
dev/malo/if_malohal.c	optional malo
dev/malo/if_malo_pci.c	optional malo pci
dev/mc146818/mc146818.c	optional mc146818
dev/mca/mca_bus.c	optional mca
dev/mcd/mcd.c	optional mcd isa nowerror
dev/mcd/mcd_isa.c	optional mcd isa nowerror
dev/md/md.c	optional md
dev/mem/memdev.c	optional mem
dev/mem/memutil.c	optional mem
dev/mfi/mfi.c	optional mfi
dev/mfi/mfi_debug.c	optional mfi
dev/mfi/mfi_pci.c	optional mfi pci
dev/mfi/mfi_disk.c	optional mfi
dev/mfi/mfi_syspd.c	optional mfi
dev/mfi/mfi_tbolt.c	optional mfi
dev/mfi/mfi_linux.c	optional mfi compat_linux
dev/mfi/mfi_cam.c	optional mfip scbus
dev/mii/acphy.c	optional miibus | acphy
dev/mii/amphy.c	optional miibus | amphy
dev/mii/atphy.c	optional miibus | atphy
dev/mii/axphy.c	optional miibus | axphy
dev/mii/bmtphy.c	optional miibus | bmtphy
dev/mii/brgphy.c	optional miibus | brgphy
dev/mii/ciphy.c	optional miibus | ciphy
dev/mii/e1000phy.c	optional miibus | e1000phy
dev/mii/gentbi.c	optional miibus | gentbi
dev/mii/icsphy.c	optional miibus | icsphy
dev/mii/ip1000phy.c	optional miibus | ip1000phy
dev/mii/jmphy.c	optional miibus | jmphy
dev/mii/lxtphy.c	optional miibus | lxtphy
dev/mii/mii.c	optional miibus | mii
dev/mii/mii_bitbang.c	optional miibus | mii_bitbang
dev/mii/mii_physubr.c	optional miibus | mii
dev/mii/miibus_if.m	optional miibus | mii
dev/mii/mlphy.c	optional miibus | mlphy
dev/mii/nsgphy.c	optional miibus | nsgphy
dev/mii/nsphy.c	optional miibus | nsphy
dev/mii/nsphyter.c	optional miibus | nsphyter
dev/mii/pnaphy.c	optional miibus | pnaphy
dev/mii/qsphy.c	optional miibus | qsphy
dev/mii/rdcphy.c	optional miibus | rdcphy
dev/mii/rgephy.c	optional miibus | rgephy
dev/mii/rlphy.c	optional miibus | rlphy
dev/mii/rlswitch.c	optional rlswitch
dev/mii/smcphy.c	optional miibus | smcphy
dev/mii/tdkphy.c	optional miibus | tdkphy
dev/mii/tlphy.c	optional miibus | tlphy
dev/mii/truephy.c	optional miibus | truephy
dev/mii/ukphy.c	optional miibus | mii
dev/mii/ukphy_subr.c	optional miibus | mii
dev/mii/xmphy.c	optional miibus | xmphy
dev/mk48txx/mk48txx.c	optional mk48txx
dev/mlx/mlx.c	optional mlx
dev/mlx/mlx_disk.c	optional mlx
dev/mlx/mlx_pci.c	optional mlx pci
dev/mly/mly.c	optional mly
dev/mmc/mmc.c	optional mmc
dev/mmc/mmcbr_if.m	standard
dev/mmc/mmcbus_if.m	standard
dev/mmc/mmcsd.c	optional mmcsd
dev/mn/if_mn.c	optional mn pci
dev/mps/mps.c	optional mps
dev/mps/mps_config.c	optional mps
# XXX Work around clang warning, until maintainer approves fix.
dev/mps/mps_mapping.c	optional mps \
	compile-with "${NORMAL_C} ${NO_WSOMETIMES_UNINITIALIZED}"
dev/mps/mps_pci.c	optional mps pci
dev/mps/mps_sas.c	optional mps \
	compile-with "${NORMAL_C} ${NO_WUNNEEDED_INTERNAL_DECL}"
dev/mps/mps_sas_lsi.c	optional mps
dev/mps/mps_table.c	optional mps
dev/mps/mps_user.c	optional mps
dev/mpt/mpt.c	optional mpt
dev/mpt/mpt_cam.c	optional mpt
dev/mpt/mpt_debug.c	optional mpt
dev/mpt/mpt_pci.c	optional mpt pci
dev/mpt/mpt_raid.c	optional mpt
dev/mpt/mpt_user.c	optional mpt
dev/msk/if_msk.c	optional msk
dev/mvs/mvs.c	optional mvs
dev/mvs/mvs_if.m	optional mvs
dev/mvs/mvs_pci.c	optional mvs pci
dev/mwl/if_mwl.c	optional mwl
dev/mwl/if_mwl_pci.c	optional mwl pci
dev/mwl/mwlhal.c	optional mwl
mwlfw.c	optional mwlfw \
	compile-with "${AWK} -f $S/tools/fw_stub.awk mw88W8363.fw:mw88W8363fw mwlboot.fw:mwlboot -mmwl -c${.TARGET}" \
	no-implicit-rule before-depend local \
	clean "mwlfw.c"
mw88W8363.fwo	optional mwlfw \
	dependency "mw88W8363.fw" \
	compile-with "${NORMAL_FWO}" \
	no-implicit-rule \
	clean "mw88W8363.fwo"
mw88W8363.fw	optional mwlfw \
	dependency "$S/contrib/dev/mwl/mw88W8363.fw.uu" \
	compile-with "${NORMAL_FW}" \
	no-obj no-implicit-rule \
	clean "mw88W8363.fw"
mwlboot.fwo	optional mwlfw \
	dependency "mwlboot.fw" \
	compile-with "${NORMAL_FWO}" \
	no-implicit-rule \
	clean "mwlboot.fwo"
mwlboot.fw	optional mwlfw \
	dependency "$S/contrib/dev/mwl/mwlboot.fw.uu" \
	compile-with "${NORMAL_FW}" \
	no-obj no-implicit-rule \
	clean "mwlboot.fw"
dev/mxge/if_mxge.c	optional mxge pci
dev/mxge/mxge_eth_z8e.c	optional mxge pci
dev/mxge/mxge_ethp_z8e.c	optional mxge pci
dev/mxge/mxge_rss_eth_z8e.c	optional mxge pci
dev/mxge/mxge_rss_ethp_z8e.c	optional mxge pci
dev/my/if_my.c	optional my
dev/ncv/ncr53c500.c	optional ncv
dev/ncv/ncr53c500_pccard.c	optional ncv pccard
dev/netmap/netmap.c	optional netmap
+dev/netmap/netmap_freebsd.c	optional netmap
+dev/netmap/netmap_generic.c	optional netmap
+dev/netmap/netmap_mbq.c	optional netmap
dev/netmap/netmap_mem2.c	optional netmap
+dev/netmap/netmap_offloadings.c	optional netmap
+dev/netmap/netmap_pipe.c	optional netmap
+dev/netmap/netmap_vale.c	optional netmap
dev/nge/if_nge.c	optional nge
dev/nxge/if_nxge.c	optional nxge \
	compile-with "${NORMAL_C} ${NO_WSELF_ASSIGN}"
dev/nxge/xgehal/xgehal-device.c	optional nxge \
	compile-with "${NORMAL_C} ${NO_WSELF_ASSIGN}"
dev/nxge/xgehal/xgehal-mm.c	optional nxge
dev/nxge/xgehal/xge-queue.c	optional nxge
dev/nxge/xgehal/xgehal-driver.c	optional nxge \
	compile-with "${NORMAL_C} ${NO_WSELF_ASSIGN}"
dev/nxge/xgehal/xgehal-ring.c	optional nxge \
	compile-with "${NORMAL_C} ${NO_WSELF_ASSIGN}"
dev/nxge/xgehal/xgehal-channel.c	optional nxge \
	compile-with "${NORMAL_C} ${NO_WSELF_ASSIGN}"
dev/nxge/xgehal/xgehal-fifo.c	optional nxge \
	compile-with "${NORMAL_C} ${NO_WSELF_ASSIGN}"
dev/nxge/xgehal/xgehal-stats.c	optional nxge \
	compile-with "${NORMAL_C} ${NO_WSELF_ASSIGN}"
dev/nxge/xgehal/xgehal-config.c	optional nxge
dev/nxge/xgehal/xgehal-mgmt.c	optional nxge \
	compile-with "${NORMAL_C} ${NO_WSELF_ASSIGN}"
dev/nmdm/nmdm.c	optional nmdm
dev/nsp/nsp.c	optional nsp
dev/nsp/nsp_pccard.c	optional nsp pccard
dev/null/null.c	standard
dev/oce/oce_hw.c	optional oce pci
dev/oce/oce_if.c	optional oce pci
dev/oce/oce_mbox.c	optional oce pci
dev/oce/oce_queue.c	optional oce pci
dev/oce/oce_sysctl.c	optional oce pci
dev/oce/oce_util.c	optional oce pci
dev/patm/if_patm.c	optional patm pci
dev/patm/if_patm_attach.c	optional patm pci
dev/patm/if_patm_intr.c	optional patm pci
dev/patm/if_patm_ioctl.c	optional patm pci
dev/patm/if_patm_rtables.c	optional patm pci
dev/patm/if_patm_rx.c	optional patm pci
dev/patm/if_patm_tx.c	optional patm pci
dev/pbio/pbio.c	optional pbio isa
dev/pccard/card_if.m	standard
dev/pccard/pccard.c	optional pccard
dev/pccard/pccard_cis.c	optional pccard
dev/pccard/pccard_cis_quirks.c	optional pccard
dev/pccard/pccard_device.c	optional pccard
dev/pccard/power_if.m	standard
dev/pccbb/pccbb.c	optional cbb
dev/pccbb/pccbb_isa.c	optional cbb isa
dev/pccbb/pccbb_pci.c	optional cbb pci
dev/pcf/pcf.c	optional pcf
dev/pci/eisa_pci.c	optional pci eisa
dev/pci/fixup_pci.c	optional pci
dev/pci/hostb_pci.c	optional pci
dev/pci/ignore_pci.c	optional pci
dev/pci/isa_pci.c	optional pci isa
dev/pci/pci.c	optional pci
dev/pci/pci_if.m	standard
dev/pci/pci_pci.c	optional pci
dev/pci/pci_subr.c	optional pci
dev/pci/pci_user.c	optional pci
dev/pci/pcib_if.m	standard
dev/pci/vga_pci.c	optional pci
dev/pcn/if_pcn.c	optional pcn pci
dev/pdq/if_fea.c	optional fea eisa
dev/pdq/if_fpa.c	optional fpa pci
dev/pdq/pdq.c	optional nowerror fea eisa | fpa pci
dev/pdq/pdq_ifsubr.c	optional nowerror fea eisa | fpa pci
dev/ppbus/if_plip.c	optional plip
dev/ppbus/immio.c	optional vpo
dev/ppbus/lpbb.c	optional lpbb
dev/ppbus/lpt.c	optional lpt
dev/ppbus/pcfclock.c	optional pcfclock
dev/ppbus/ppb_1284.c	optional ppbus
dev/ppbus/ppb_base.c	optional ppbus
dev/ppbus/ppb_msq.c	optional ppbus
dev/ppbus/ppbconf.c	optional ppbus
dev/ppbus/ppbus_if.m	optional ppbus
dev/ppbus/ppi.c	optional ppi
dev/ppbus/pps.c	optional pps
dev/ppbus/vpo.c	optional vpo
dev/ppbus/vpoio.c	optional vpo
dev/ppc/ppc.c	optional ppc
dev/ppc/ppc_acpi.c	optional ppc acpi
dev/ppc/ppc_isa.c	optional ppc isa
dev/ppc/ppc_pci.c	optional ppc pci
dev/ppc/ppc_puc.c	optional ppc puc
dev/pst/pst-iop.c	optional pst
dev/pst/pst-pci.c	optional pst pci
dev/pst/pst-raid.c	optional pst
dev/pty/pty.c	optional pty
dev/puc/puc.c	optional puc
dev/puc/puc_cfg.c	optional puc
dev/puc/puc_pccard.c	optional puc pccard
dev/puc/puc_pci.c	optional puc pci
dev/puc/pucdata.c	optional puc pci
dev/quicc/quicc_core.c	optional quicc
dev/ral/rt2560.c	optional ral
dev/ral/rt2661.c	optional ral
dev/ral/rt2860.c	optional ral
dev/ral/if_ral_pci.c	optional ral pci
rt2561fw.c	optional rt2561fw | ralfw \
	compile-with "${AWK} -f $S/tools/fw_stub.awk rt2561.fw:rt2561fw -mrt2561 -c${.TARGET}" \
	no-implicit-rule before-depend local \
	clean "rt2561fw.c"
rt2561fw.fwo	optional rt2561fw | ralfw \
	dependency "rt2561.fw" \
	compile-with "${NORMAL_FWO}" \
	no-implicit-rule \
	clean "rt2561fw.fwo"
rt2561.fw	optional rt2561fw | ralfw \
	dependency "$S/contrib/dev/ral/rt2561.fw.uu" \
	compile-with "${NORMAL_FW}" \
	no-obj no-implicit-rule \
	clean "rt2561.fw"
rt2561sfw.c	optional rt2561sfw | ralfw \
	compile-with "${AWK} -f $S/tools/fw_stub.awk rt2561s.fw:rt2561sfw -mrt2561s -c${.TARGET}" \
	no-implicit-rule before-depend local \
	clean "rt2561sfw.c"
rt2561sfw.fwo	optional rt2561sfw | ralfw \
	dependency "rt2561s.fw" \
	compile-with "${NORMAL_FWO}" \
	no-implicit-rule \
	clean "rt2561sfw.fwo"
rt2561s.fw	optional rt2561sfw | ralfw \
	dependency "$S/contrib/dev/ral/rt2561s.fw.uu" \
	compile-with "${NORMAL_FW}" \
	no-obj no-implicit-rule \
	clean "rt2561s.fw"
rt2661fw.c	optional rt2661fw | ralfw \
	compile-with "${AWK} -f $S/tools/fw_stub.awk rt2661.fw:rt2661fw -mrt2661 -c${.TARGET}" \
	no-implicit-rule before-depend local \
	clean "rt2661fw.c"
rt2661fw.fwo	optional rt2661fw | ralfw \
	dependency "rt2661.fw" \
	compile-with "${NORMAL_FWO}" \
	no-implicit-rule \
	clean "rt2661fw.fwo"
rt2661.fw	optional rt2661fw | ralfw \
	dependency "$S/contrib/dev/ral/rt2661.fw.uu" \
	compile-with "${NORMAL_FW}" \
	no-obj no-implicit-rule \
	clean "rt2661.fw"
rt2860fw.c	optional rt2860fw | ralfw \
	compile-with "${AWK} -f $S/tools/fw_stub.awk rt2860.fw:rt2860fw -mrt2860 -c${.TARGET}" \
	no-implicit-rule before-depend local \
	clean "rt2860fw.c"
rt2860fw.fwo	optional rt2860fw | ralfw \
	dependency "rt2860.fw" \
	compile-with "${NORMAL_FWO}" \
	no-implicit-rule \
	clean "rt2860fw.fwo"
rt2860.fw	optional rt2860fw | ralfw \
	dependency "$S/contrib/dev/ral/rt2860.fw.uu" \
	compile-with "${NORMAL_FW}" \
	no-obj no-implicit-rule \
	clean "rt2860.fw"
dev/random/harvest.c	standard
dev/random/hash.c	optional random
dev/random/probe.c	optional random
dev/random/randomdev.c	optional random
dev/random/randomdev_soft.c	optional random
dev/random/yarrow.c	optional random
dev/rc/rc.c	optional rc
dev/re/if_re.c	optional re
dev/rndtest/rndtest.c	optional rndtest
dev/rp/rp.c	optional rp
dev/rp/rp_isa.c	optional rp isa
dev/rp/rp_pci.c	optional rp pci
dev/safe/safe.c	optional safe
dev/scc/scc_if.m	optional scc
dev/scc/scc_bfe_ebus.c	optional scc ebus
dev/scc/scc_bfe_quicc.c	optional scc quicc
dev/scc/scc_bfe_sbus.c	optional scc fhc | scc sbus
dev/scc/scc_core.c	optional scc
dev/scc/scc_dev_quicc.c	optional scc quicc
dev/scc/scc_dev_sab82532.c	optional scc
dev/scc/scc_dev_z8530.c	optional scc
dev/scd/scd.c	optional scd isa
dev/scd/scd_isa.c	optional scd isa
dev/sdhci/sdhci.c	optional sdhci pci
dev/sf/if_sf.c	optional sf pci
dev/sge/if_sge.c	optional sge pci
dev/si/si.c	optional si
dev/si/si2_z280.c	optional si
dev/si/si3_t225.c	optional si
dev/si/si_eisa.c	optional si eisa
dev/si/si_isa.c	optional si isa
dev/si/si_pci.c	optional si pci
dev/siba/siba_bwn.c	optional siba_bwn pci
dev/siba/siba_core.c	optional siba_bwn pci
dev/siis/siis.c	optional siis pci
dev/sis/if_sis.c	optional sis pci
dev/sk/if_sk.c	optional sk pci
dev/smbus/smb.c	optional smb
dev/smbus/smbconf.c	optional smbus
dev/smbus/smbus.c	optional smbus
dev/smbus/smbus_if.m	optional smbus
dev/smc/if_smc.c	optional smc
dev/sn/if_sn.c	optional sn
dev/sn/if_sn_isa.c	optional sn isa
dev/sn/if_sn_pccard.c	optional sn pccard
dev/snp/snp.c	optional snp
dev/sound/clone.c	optional sound
dev/sound/unit.c	optional sound
dev/sound/isa/ad1816.c	optional snd_ad1816 isa
dev/sound/isa/ess.c	optional snd_ess isa
dev/sound/isa/gusc.c	optional snd_gusc isa
dev/sound/isa/mss.c	optional snd_mss isa
dev/sound/isa/sb16.c	optional snd_sb16 isa
dev/sound/isa/sb8.c	optional snd_sb8 isa
dev/sound/isa/sbc.c	optional snd_sbc isa
dev/sound/isa/sndbuf_dma.c	optional sound isa
dev/sound/pci/als4000.c	optional snd_als4000 pci
dev/sound/pci/atiixp.c	optional snd_atiixp pci
dev/sound/pci/cmi.c	optional snd_cmi pci
dev/sound/pci/cs4281.c	optional snd_cs4281 pci
dev/sound/pci/csa.c	optional snd_csa pci
dev/sound/pci/csapcm.c	optional snd_csa pci
dev/sound/pci/ds1.c	optional snd_ds1 pci
dev/sound/pci/emu10k1.c	optional snd_emu10k1 pci
dev/sound/pci/emu10kx.c	optional snd_emu10kx pci
dev/sound/pci/emu10kx-pcm.c	optional snd_emu10kx pci
dev/sound/pci/emu10kx-midi.c	optional snd_emu10kx pci
dev/sound/pci/envy24.c	optional snd_envy24 pci
dev/sound/pci/envy24ht.c	optional snd_envy24ht pci
dev/sound/pci/es137x.c	optional snd_es137x pci
dev/sound/pci/fm801.c	optional snd_fm801 pci
dev/sound/pci/ich.c	optional snd_ich pci
dev/sound/pci/maestro.c	optional snd_maestro pci
dev/sound/pci/maestro3.c	optional snd_maestro3 pci
dev/sound/pci/neomagic.c	optional snd_neomagic pci
dev/sound/pci/solo.c	optional snd_solo pci
dev/sound/pci/spicds.c	optional snd_spicds pci
dev/sound/pci/t4dwave.c	optional snd_t4dwave pci
dev/sound/pci/via8233.c	optional snd_via8233 pci
dev/sound/pci/via82c686.c	optional snd_via82c686 pci
dev/sound/pci/vibes.c	optional snd_vibes pci
dev/sound/pci/hda/hdaa.c	optional snd_hda pci
dev/sound/pci/hda/hdaa_patches.c	optional snd_hda pci
dev/sound/pci/hda/hdac.c	optional snd_hda pci
dev/sound/pci/hda/hdac_if.m	optional snd_hda pci
dev/sound/pci/hda/hdacc.c	optional snd_hda pci
dev/sound/pci/hdspe.c	optional snd_hdspe pci
dev/sound/pci/hdspe-pcm.c	optional snd_hdspe pci
dev/sound/pcm/ac97.c	optional sound
dev/sound/pcm/ac97_if.m	optional sound
dev/sound/pcm/ac97_patch.c	optional sound
dev/sound/pcm/buffer.c	optional sound \
	dependency "snd_fxdiv_gen.h"
dev/sound/pcm/channel.c	optional sound
dev/sound/pcm/channel_if.m	optional sound
dev/sound/pcm/dsp.c	optional sound
dev/sound/pcm/feeder.c	optional sound
dev/sound/pcm/feeder_chain.c	optional sound
dev/sound/pcm/feeder_eq.c	optional sound \
	dependency "feeder_eq_gen.h" \
	dependency "snd_fxdiv_gen.h"
dev/sound/pcm/feeder_if.m	optional sound
dev/sound/pcm/feeder_format.c	optional sound \
	dependency "snd_fxdiv_gen.h"
dev/sound/pcm/feeder_matrix.c	optional sound \
	dependency "snd_fxdiv_gen.h"
dev/sound/pcm/feeder_mixer.c	optional sound \
	dependency "snd_fxdiv_gen.h"
dev/sound/pcm/feeder_rate.c	optional sound \
	dependency "feeder_rate_gen.h" \
	dependency "snd_fxdiv_gen.h"
dev/sound/pcm/feeder_volume.c	optional sound \
	dependency "snd_fxdiv_gen.h"
dev/sound/pcm/mixer.c	optional sound
dev/sound/pcm/mixer_if.m	optional sound
dev/sound/pcm/sndstat.c	optional sound
dev/sound/pcm/sound.c	optional sound
dev/sound/pcm/vchan.c	optional sound
dev/sound/usb/uaudio.c	optional snd_uaudio usb
dev/sound/usb/uaudio_pcm.c	optional snd_uaudio usb
dev/sound/midi/midi.c	optional sound
dev/sound/midi/mpu401.c	optional sound
dev/sound/midi/mpu_if.m	optional sound
dev/sound/midi/mpufoi_if.m	optional sound
dev/sound/midi/sequencer.c	optional sound
dev/sound/midi/synth_if.m	optional sound
dev/spibus/spibus.c	optional spibus \
	dependency "spibus_if.h"
dev/spibus/spibus_if.m	optional spibus
dev/ste/if_ste.c	optional ste pci
dev/stg/tmc18c30.c	optional stg
dev/stg/tmc18c30_isa.c	optional stg isa
dev/stg/tmc18c30_pccard.c	optional stg pccard
dev/stg/tmc18c30_pci.c	optional stg pci
dev/stg/tmc18c30_subr.c	optional stg
dev/stge/if_stge.c	optional stge
dev/streams/streams.c	optional streams
dev/sym/sym_hipd.c	optional sym \
	dependency "$S/dev/sym/sym_{conf,defs}.h"
dev/syscons/blank/blank_saver.c	optional blank_saver
dev/syscons/daemon/daemon_saver.c	optional daemon_saver
dev/syscons/dragon/dragon_saver.c	optional dragon_saver
dev/syscons/fade/fade_saver.c	optional fade_saver
dev/syscons/fire/fire_saver.c	optional fire_saver
dev/syscons/green/green_saver.c	optional green_saver
dev/syscons/logo/logo.c	optional logo_saver
dev/syscons/logo/logo_saver.c	optional logo_saver
dev/syscons/rain/rain_saver.c	optional rain_saver
dev/syscons/schistory.c	optional sc
dev/syscons/scmouse.c	optional sc
dev/syscons/scterm.c	optional sc
dev/syscons/scvidctl.c	optional sc
dev/syscons/snake/snake_saver.c	optional snake_saver
dev/syscons/star/star_saver.c	optional star_saver
dev/syscons/syscons.c	optional sc
dev/syscons/sysmouse.c	optional sc
dev/syscons/warp/warp_saver.c	optional warp_saver
dev/tdfx/tdfx_linux.c	optional tdfx_linux tdfx compat_linux
dev/tdfx/tdfx_pci.c	optional tdfx pci
dev/ti/if_ti.c	optional ti pci
dev/tl/if_tl.c	optional tl pci
dev/trm/trm.c	optional trm
dev/twa/tw_cl_init.c	optional twa \
	compile-with "${NORMAL_C} -I$S/dev/twa"
dev/twa/tw_cl_intr.c	optional twa \
	compile-with "${NORMAL_C} -I$S/dev/twa"
dev/twa/tw_cl_io.c	optional twa \
	compile-with "${NORMAL_C} -I$S/dev/twa"
dev/twa/tw_cl_misc.c	optional twa \
	compile-with "${NORMAL_C} -I$S/dev/twa"
dev/twa/tw_osl_cam.c	optional twa \
	compile-with "${NORMAL_C} -I$S/dev/twa"
dev/twa/tw_osl_freebsd.c	optional twa \
	compile-with "${NORMAL_C} -I$S/dev/twa"
dev/twe/twe.c	optional twe
dev/twe/twe_freebsd.c	optional twe
dev/tws/tws.c	optional tws
dev/tws/tws_cam.c	optional tws
dev/tws/tws_hdm.c	optional tws
dev/tws/tws_services.c	optional tws
dev/tws/tws_user.c	optional tws
dev/tx/if_tx.c	optional tx
dev/txp/if_txp.c	optional txp
dev/uart/uart_bus_acpi.c	optional uart acpi
#dev/uart/uart_bus_cbus.c	optional uart cbus
dev/uart/uart_bus_ebus.c	optional uart ebus
dev/uart/uart_bus_fdt.c	optional uart fdt
dev/uart/uart_bus_isa.c	optional uart isa
dev/uart/uart_bus_pccard.c	optional uart pccard
dev/uart/uart_bus_pci.c	optional uart pci
dev/uart/uart_bus_puc.c	optional uart puc
dev/uart/uart_bus_scc.c	optional uart scc
dev/uart/uart_core.c	optional uart
dev/uart/uart_dbg.c	optional uart gdb
dev/uart/uart_dev_ns8250.c	optional uart uart_ns8250
dev/uart/uart_dev_quicc.c	optional uart quicc
dev/uart/uart_dev_sab82532.c	optional uart uart_sab82532
dev/uart/uart_dev_sab82532.c	optional uart scc
dev/uart/uart_dev_z8530.c	optional uart uart_z8530
dev/uart/uart_dev_z8530.c	optional uart scc
dev/uart/uart_if.m	optional uart
dev/uart/uart_subr.c	optional uart
dev/uart/uart_tty.c	optional uart
dev/ubsec/ubsec.c	optional ubsec
#
# USB controller drivers
#
dev/usb/controller/at91dci.c	optional at91dci
dev/usb/controller/at91dci_atmelarm.c	optional at91dci at91rm9200
dev/usb/controller/musb_otg.c	optional musb
dev/usb/controller/musb_otg_atmelarm.c	optional musb at91rm9200
dev/usb/controller/ehci.c	optional ehci
dev/usb/controller/ehci_pci.c	optional ehci pci
dev/usb/controller/ohci.c	optional ohci
dev/usb/controller/ohci_atmelarm.c	optional ohci at91rm9200
dev/usb/controller/ohci_pci.c	optional ohci pci
dev/usb/controller/uhci.c	optional uhci
dev/usb/controller/uhci_pci.c	optional uhci pci
dev/usb/controller/xhci.c	optional xhci
dev/usb/controller/xhci_pci.c	optional xhci pci
dev/usb/controller/uss820dci.c	optional uss820dci
dev/usb/controller/uss820dci_atmelarm.c	optional uss820dci at91rm9200
dev/usb/controller/usb_controller.c	optional usb
#
# USB storage drivers
#
dev/usb/storage/umass.c	optional umass
dev/usb/storage/urio.c	optional urio
dev/usb/storage/ustorage_fs.c	optional usfs
#
# USB core
#
dev/usb/usb_busdma.c	optional usb
dev/usb/usb_compat_linux.c	optional usb
dev/usb/usb_core.c	optional usb
dev/usb/usb_debug.c	optional usb
dev/usb/usb_dev.c	optional usb
dev/usb/usb_device.c	optional usb
dev/usb/usb_dynamic.c	optional usb
dev/usb/usb_error.c	optional usb
dev/usb/usb_generic.c	optional usb
dev/usb/usb_handle_request.c	optional usb
dev/usb/usb_hid.c	optional usb
dev/usb/usb_hub.c	optional usb
dev/usb/usb_if.m	optional usb
dev/usb/usb_lookup.c	optional usb
dev/usb/usb_mbuf.c	optional usb
dev/usb/usb_msctest.c	optional usb
dev/usb/usb_parse.c	optional usb
dev/usb/usb_pf.c	optional usb
dev/usb/usb_process.c	optional usb
dev/usb/usb_request.c	optional usb
dev/usb/usb_transfer.c	optional usb
dev/usb/usb_util.c	optional usb
#
# USB network drivers
#
dev/usb/net/if_aue.c	optional aue
dev/usb/net/if_axe.c	optional axe
dev/usb/net/if_axge.c	optional axge
dev/usb/net/if_cdce.c	optional cdce
dev/usb/net/if_cue.c	optional cue
dev/usb/net/if_ipheth.c	optional ipheth
dev/usb/net/if_kue.c	optional kue
dev/usb/net/if_mos.c	optional mos
dev/usb/net/if_rue.c	optional rue
dev/usb/net/if_udav.c	optional udav
dev/usb/net/if_usie.c	optional usie
dev/usb/net/ruephy.c	optional rue
dev/usb/net/usb_ethernet.c	optional aue | axe | axge | cdce | cue | kue | \
	mos | rue | udav | ipheth
dev/usb/net/uhso.c	optional uhso
#
# USB WLAN drivers
#
dev/usb/wlan/if_rum.c	optional rum
dev/usb/wlan/if_run.c	optional run
runfw.c	optional runfw \
	compile-with "${AWK} -f $S/tools/fw_stub.awk run.fw:runfw -mrunfw -c${.TARGET}" \
	no-implicit-rule before-depend local \
	clean "runfw.c"
runfw.fwo	optional runfw \
	dependency "run.fw" \
	compile-with "${NORMAL_FWO}" \
	no-implicit-rule \
	clean "runfw.fwo"
run.fw	optional runfw \
	dependency "$S/contrib/dev/run/rt2870.fw.uu" \
	compile-with "${NORMAL_FW}" \
	no-obj no-implicit-rule \
	clean "run.fw"
dev/usb/wlan/if_uath.c	optional uath
dev/usb/wlan/if_upgt.c	optional upgt
dev/usb/wlan/if_ural.c	optional ural
dev/usb/wlan/if_urtw.c	optional urtw
dev/usb/wlan/if_zyd.c	optional zyd
#
# USB serial and parallel port drivers
#
dev/usb/serial/u3g.c	optional u3g
dev/usb/serial/uark.c	optional uark
dev/usb/serial/ubsa.c	optional ubsa
dev/usb/serial/ubser.c	optional ubser
dev/usb/serial/uchcom.c	optional uchcom
dev/usb/serial/ucycom.c	optional ucycom
dev/usb/serial/ufoma.c	optional ufoma
dev/usb/serial/uftdi.c	optional uftdi
dev/usb/serial/ugensa.c	optional ugensa
dev/usb/serial/uipaq.c	optional uipaq
dev/usb/serial/ulpt.c	optional ulpt
dev/usb/serial/umcs.c	optional umcs
dev/usb/serial/umct.c	optional umct
dev/usb/serial/umodem.c	optional umodem
dev/usb/serial/umoscom.c	optional umoscom
dev/usb/serial/uplcom.c	optional uplcom
dev/usb/serial/uslcom.c	optional uslcom
dev/usb/serial/uvisor.c	optional uvisor
dev/usb/serial/uvscom.c	optional uvscom
dev/usb/serial/usb_serial.c	optional ucom | u3g | uark | ubsa | ubser | \
	uchcom | ucycom | ufoma | uftdi | \
	ugensa | uipaq | umcs | umct | \
	umodem | umoscom | uplcom | usie | \
	uslcom | uvisor | uvscom
#
# USB misc drivers
#
dev/usb/misc/ufm.c	optional ufm
dev/usb/misc/udbp.c	optional udbp
#
# USB input drivers
#
dev/usb/input/atp.c	optional atp
dev/usb/input/uep.c	optional uep
dev/usb/input/uhid.c	optional uhid
dev/usb/input/ukbd.c	optional ukbd
dev/usb/input/ums.c	optional ums
dev/usb/input/wsp.c	optional wsp
#
# USB quirks
#
dev/usb/quirk/usb_quirk.c	optional usb
#
# USB templates
#
dev/usb/template/usb_template.c	optional usb_template
dev/usb/template/usb_template_audio.c	optional usb_template
dev/usb/template/usb_template_cdce.c	optional usb_template
dev/usb/template/usb_template_kbd.c	optional usb_template
dev/usb/template/usb_template_modem.c	optional usb_template
dev/usb/template/usb_template_mouse.c	optional usb_template
dev/usb/template/usb_template_msc.c	optional usb_template
dev/usb/template/usb_template_mtp.c	optional usb_template
#
# USB END
#
dev/utopia/idtphy.c	optional utopia
dev/utopia/suni.c	optional utopia
dev/utopia/utopia.c	optional utopia
dev/vge/if_vge.c	optional vge
dev/vkbd/vkbd.c	optional vkbd
dev/vr/if_vr.c	optional vr pci
dev/vte/if_vte.c	optional vte pci
dev/vx/if_vx.c	optional vx
dev/vx/if_vx_eisa.c	optional vx eisa
dev/vx/if_vx_pci.c	optional vx pci
dev/vxge/vxge.c	optional vxge
dev/vxge/vxgehal/vxgehal-ifmsg.c	optional vxge
dev/vxge/vxgehal/vxgehal-mrpcim.c	optional vxge
dev/vxge/vxgehal/vxge-queue.c	optional vxge
dev/vxge/vxgehal/vxgehal-ring.c	optional vxge
dev/vxge/vxgehal/vxgehal-swapper.c	optional vxge
dev/vxge/vxgehal/vxgehal-mgmt.c	optional vxge
dev/vxge/vxgehal/vxgehal-srpcim.c	optional vxge
dev/vxge/vxgehal/vxgehal-config.c	optional vxge
dev/vxge/vxgehal/vxgehal-blockpool.c	optional vxge
dev/vxge/vxgehal/vxgehal-doorbells.c	optional vxge
dev/vxge/vxgehal/vxgehal-mgmtaux.c	optional vxge
dev/vxge/vxgehal/vxgehal-device.c	optional vxge
dev/vxge/vxgehal/vxgehal-mm.c	optional vxge
dev/vxge/vxgehal/vxgehal-driver.c	optional vxge
dev/vxge/vxgehal/vxgehal-virtualpath.c	optional vxge
dev/vxge/vxgehal/vxgehal-channel.c	optional vxge
dev/vxge/vxgehal/vxgehal-fifo.c	optional vxge
dev/watchdog/watchdog.c	standard
dev/wb/if_wb.c	optional wb pci
dev/wds/wd7000.c	optional wds isa
dev/wi/if_wi.c	optional wi
dev/wi/if_wi_pccard.c	optional wi pccard
dev/wi/if_wi_pci.c	optional wi pci
dev/wl/if_wl.c	optional wl isa
dev/wpi/if_wpi.c	optional wpi pci
wpifw.c	optional wpifw \
	compile-with "${AWK} -f $S/tools/fw_stub.awk wpi.fw:wpifw:153229 -mwpi -c${.TARGET}" \
	no-implicit-rule before-depend local \
	clean "wpifw.c"
wpifw.fwo	optional wpifw \
	dependency "wpi.fw" \
	compile-with "${NORMAL_FWO}" \
	no-implicit-rule \
	clean "wpifw.fwo"
wpi.fw	optional wpifw \
	dependency "$S/contrib/dev/wpi/iwlwifi-3945-15.32.2.9.fw.uu" \
	compile-with "${NORMAL_FW}" \
	no-obj no-implicit-rule \
	clean "wpi.fw"
dev/xe/if_xe.c	optional xe
dev/xe/if_xe_pccard.c	optional xe pccard
dev/xl/if_xl.c	optional xl pci
dev/xl/xlphy.c	optional xl pci
fs/coda/coda_fbsd.c	optional vcoda
fs/coda/coda_psdev.c	optional vcoda
fs/coda/coda_subr.c	optional vcoda
fs/coda/coda_venus.c	optional vcoda
fs/coda/coda_vfsops.c	optional vcoda
fs/coda/coda_vnops.c	optional vcoda
fs/deadfs/dead_vnops.c	standard
fs/devfs/devfs_devs.c	standard
fs/devfs/devfs_dir.c	standard
fs/devfs/devfs_rule.c	standard
fs/devfs/devfs_vfsops.c	standard
fs/devfs/devfs_vnops.c	standard
fs/fdescfs/fdesc_vfsops.c	optional fdescfs
fs/fdescfs/fdesc_vnops.c	optional fdescfs
fs/fifofs/fifo_vnops.c	standard
fs/hpfs/hpfs_alsubr.c	optional hpfs
fs/hpfs/hpfs_lookup.c	optional hpfs
fs/hpfs/hpfs_subr.c	optional hpfs
fs/hpfs/hpfs_vfsops.c	optional hpfs
fs/hpfs/hpfs_vnops.c	optional hpfs
fs/msdosfs/msdosfs_conv.c	optional msdosfs
fs/msdosfs/msdosfs_denode.c	optional msdosfs
fs/msdosfs/msdosfs_fat.c	optional msdosfs
fs/msdosfs/msdosfs_fileno.c	optional msdosfs
fs/msdosfs/msdosfs_iconv.c	optional msdosfs_iconv
fs/msdosfs/msdosfs_lookup.c	optional msdosfs
fs/msdosfs/msdosfs_vfsops.c	optional msdosfs
fs/msdosfs/msdosfs_vnops.c	optional msdosfs
fs/nfs/nfs_commonkrpc.c	optional nfscl | nfsd
fs/nfs/nfs_commonsubs.c	optional nfscl | nfsd
fs/nfs/nfs_commonport.c	optional nfscl | nfsd
fs/nfs/nfs_commonacl.c	optional nfscl | nfsd
fs/nfsclient/nfs_clcomsubs.c	optional nfscl
fs/nfsclient/nfs_clsubs.c	optional nfscl
fs/nfsclient/nfs_clstate.c	optional nfscl
fs/nfsclient/nfs_clkrpc.c	optional nfscl
fs/nfsclient/nfs_clrpcops.c	optional nfscl
fs/nfsclient/nfs_clvnops.c	optional nfscl
fs/nfsclient/nfs_clnode.c	optional nfscl
fs/nfsclient/nfs_clvfsops.c	optional nfscl
fs/nfsclient/nfs_clport.c	optional nfscl
fs/nfsclient/nfs_clbio.c	optional nfscl
fs/nfsclient/nfs_clnfsiod.c	optional nfscl
fs/nfsserver/nfs_fha_new.c	optional nfsd inet
fs/nfsserver/nfs_nfsdsocket.c	optional nfsd inet
fs/nfsserver/nfs_nfsdsubs.c	optional nfsd inet
fs/nfsserver/nfs_nfsdstate.c	optional nfsd inet
fs/nfsserver/nfs_nfsdkrpc.c	optional nfsd inet
fs/nfsserver/nfs_nfsdserv.c	optional nfsd inet
fs/nfsserver/nfs_nfsdport.c	optional nfsd inet
fs/nfsserver/nfs_nfsdcache.c	optional nfsd inet
fs/ntfs/ntfs_compr.c	optional ntfs
fs/ntfs/ntfs_iconv.c	optional ntfs_iconv
fs/ntfs/ntfs_ihash.c	optional ntfs
fs/ntfs/ntfs_subr.c	optional ntfs
fs/ntfs/ntfs_vfsops.c	optional ntfs
fs/ntfs/ntfs_vnops.c	optional ntfs
fs/nullfs/null_subr.c	optional nullfs
fs/nullfs/null_vfsops.c	optional nullfs
fs/nullfs/null_vnops.c	optional nullfs
fs/nwfs/nwfs_io.c	optional nwfs
fs/nwfs/nwfs_ioctl.c	optional nwfs
fs/nwfs/nwfs_node.c	optional nwfs
fs/nwfs/nwfs_subr.c	optional nwfs
fs/nwfs/nwfs_vfsops.c	optional nwfs
fs/nwfs/nwfs_vnops.c	optional nwfs
fs/portalfs/portal_vfsops.c	optional portalfs
fs/portalfs/portal_vnops.c	optional portalfs
fs/procfs/procfs.c	optional procfs
fs/procfs/procfs_ctl.c	optional procfs
fs/procfs/procfs_dbregs.c	optional procfs
fs/procfs/procfs_fpregs.c	optional procfs
fs/procfs/procfs_ioctl.c	optional procfs
fs/procfs/procfs_map.c	optional procfs
fs/procfs/procfs_mem.c	optional procfs
fs/procfs/procfs_note.c	optional procfs
fs/procfs/procfs_osrel.c	optional procfs
fs/procfs/procfs_regs.c	optional procfs
fs/procfs/procfs_rlimit.c	optional procfs
fs/procfs/procfs_status.c	optional procfs
fs/procfs/procfs_type.c	optional procfs
fs/pseudofs/pseudofs.c	optional pseudofs
fs/pseudofs/pseudofs_fileno.c	optional pseudofs
fs/pseudofs/pseudofs_vncache.c	optional pseudofs
fs/pseudofs/pseudofs_vnops.c	optional pseudofs
fs/smbfs/smbfs_io.c	optional smbfs
fs/smbfs/smbfs_node.c	optional smbfs
fs/smbfs/smbfs_smb.c	optional smbfs
fs/smbfs/smbfs_subr.c	optional smbfs
fs/smbfs/smbfs_vfsops.c	optional smbfs
fs/smbfs/smbfs_vnops.c	optional smbfs
fs/udf/osta.c	optional udf
fs/udf/udf_iconv.c	optional udf_iconv
fs/udf/udf_vfsops.c	optional udf
fs/udf/udf_vnops.c	optional udf
fs/unionfs/union_subr.c	optional unionfs
fs/unionfs/union_vfsops.c	optional unionfs
fs/unionfs/union_vnops.c	optional unionfs
fs/tmpfs/tmpfs_vnops.c	optional tmpfs
fs/tmpfs/tmpfs_fifoops.c	optional tmpfs
fs/tmpfs/tmpfs_vfsops.c	optional tmpfs
fs/tmpfs/tmpfs_subr.c	optional tmpfs
gdb/gdb_cons.c	optional gdb
gdb/gdb_main.c	optional gdb
gdb/gdb_packet.c	optional gdb
geom/bde/g_bde.c	optional geom_bde
geom/bde/g_bde_crypt.c	optional geom_bde
geom/bde/g_bde_lock.c	optional geom_bde
geom/bde/g_bde_work.c	optional geom_bde
geom/cache/g_cache.c	optional geom_cache
geom/concat/g_concat.c	optional geom_concat
geom/eli/g_eli.c	optional geom_eli
geom/eli/g_eli_crypto.c	optional geom_eli
geom/eli/g_eli_ctl.c	optional geom_eli
geom/eli/g_eli_integrity.c	optional geom_eli
geom/eli/g_eli_key.c	optional geom_eli
geom/eli/g_eli_key_cache.c	optional geom_eli
geom/eli/g_eli_privacy.c	optional geom_eli
geom/eli/pkcs5v2.c	optional geom_eli
geom/gate/g_gate.c	optional geom_gate
geom/geom_aes.c	optional geom_aes
geom/geom_bsd.c	optional geom_bsd
geom/geom_bsd_enc.c	optional geom_bsd
geom/geom_ccd.c	optional ccd | geom_ccd
geom/geom_ctl.c	standard
geom/geom_dev.c	standard
geom/geom_disk.c	standard
geom/geom_dump.c	standard
geom/geom_event.c	standard
geom/geom_fox.c	optional geom_fox
geom/geom_io.c	standard
geom/geom_kern.c	standard
geom/geom_map.c	optional geom_map
geom/geom_mbr.c	optional geom_mbr
geom/geom_mbr_enc.c	optional geom_mbr
geom/geom_pc98.c	optional geom_pc98
geom/geom_pc98_enc.c	optional geom_pc98
geom/geom_redboot.c	optional geom_redboot
geom/geom_slice.c	standard
geom/geom_subr.c	standard
geom/geom_sunlabel.c	optional geom_sunlabel
geom/geom_sunlabel_enc.c	optional geom_sunlabel
geom/geom_vfs.c	standard
geom/geom_vol_ffs.c	optional geom_vol
geom/journal/g_journal.c	optional geom_journal
geom/journal/g_journal_ufs.c	optional geom_journal
geom/label/g_label.c	optional geom_label
geom/label/g_label_ext2fs.c	optional geom_label
geom/label/g_label_iso9660.c	optional geom_label
geom/label/g_label_msdosfs.c	optional geom_label
geom/label/g_label_ntfs.c	optional geom_label
geom/label/g_label_reiserfs.c	optional geom_label
geom/label/g_label_ufs.c	optional geom_label
geom/label/g_label_gpt.c	optional geom_label
geom/linux_lvm/g_linux_lvm.c	optional geom_linux_lvm
geom/mirror/g_mirror.c	optional geom_mirror
geom/mirror/g_mirror_ctl.c	optional geom_mirror
geom/mountver/g_mountver.c	optional geom_mountver
geom/multipath/g_multipath.c	optional geom_multipath
geom/nop/g_nop.c	optional geom_nop
geom/part/g_part.c	standard
geom/part/g_part_if.m	standard
geom/part/g_part_apm.c	optional geom_part_apm
geom/part/g_part_bsd.c	optional geom_part_bsd
geom/part/g_part_ebr.c	optional geom_part_ebr
geom/part/g_part_gpt.c	optional geom_part_gpt
geom/part/g_part_ldm.c	optional geom_part_ldm
geom/part/g_part_mbr.c	optional geom_part_mbr
geom/part/g_part_pc98.c	optional geom_part_pc98
geom/part/g_part_vtoc8.c	optional geom_part_vtoc8
geom/raid/g_raid.c	optional geom_raid
geom/raid/g_raid_ctl.c	optional geom_raid
geom/raid/g_raid_md_if.m	optional geom_raid
geom/raid/g_raid_tr_if.m	optional geom_raid
geom/raid/md_ddf.c	optional geom_raid
geom/raid/md_intel.c	optional geom_raid
geom/raid/md_jmicron.c	optional geom_raid
geom/raid/md_nvidia.c	optional geom_raid
geom/raid/md_promise.c	optional geom_raid
geom/raid/md_sii.c	optional geom_raid
geom/raid/tr_concat.c	optional geom_raid
geom/raid/tr_raid0.c	optional geom_raid
geom/raid/tr_raid1.c	optional geom_raid
geom/raid/tr_raid1e.c	optional geom_raid
geom/raid/tr_raid5.c	optional geom_raid
geom/raid3/g_raid3.c	optional geom_raid3
geom/raid3/g_raid3_ctl.c	optional geom_raid3
geom/shsec/g_shsec.c	optional geom_shsec
geom/stripe/g_stripe.c	optional geom_stripe
geom/uncompress/g_uncompress.c	optional geom_uncompress
contrib/xz-embedded/freebsd/xz_malloc.c \
	optional xz_embedded | geom_uncompress \
	compile-with "${NORMAL_C} -I$S/contrib/xz-embedded/freebsd/ -I$S/contrib/xz-embedded/linux/lib/xz/ -I$S/contrib/xz-embedded/linux/include/linux/"
contrib/xz-embedded/linux/lib/xz/xz_crc32.c \
	optional xz_embedded | geom_uncompress \
	compile-with "${NORMAL_C} -I$S/contrib/xz-embedded/freebsd/ -I$S/contrib/xz-embedded/linux/lib/xz/ -I$S/contrib/xz-embedded/linux/include/linux/"
contrib/xz-embedded/linux/lib/xz/xz_dec_bcj.c \
	optional xz_embedded | geom_uncompress \
	compile-with "${NORMAL_C} -I$S/contrib/xz-embedded/freebsd/ -I$S/contrib/xz-embedded/linux/lib/xz/ -I$S/contrib/xz-embedded/linux/include/linux/"
contrib/xz-embedded/linux/lib/xz/xz_dec_lzma2.c \
	optional xz_embedded | geom_uncompress \
	compile-with "${NORMAL_C} -I$S/contrib/xz-embedded/freebsd/ -I$S/contrib/xz-embedded/linux/lib/xz/ -I$S/contrib/xz-embedded/linux/include/linux/"
contrib/xz-embedded/linux/lib/xz/xz_dec_stream.c \
	optional xz_embedded | geom_uncompress \
	compile-with "${NORMAL_C} -I$S/contrib/xz-embedded/freebsd/ -I$S/contrib/xz-embedded/linux/lib/xz/ -I$S/contrib/xz-embedded/linux/include/linux/"
geom/uzip/g_uzip.c	optional geom_uzip
geom/virstor/binstream.c	optional geom_virstor
geom/virstor/g_virstor.c	optional geom_virstor
geom/virstor/g_virstor_md.c	optional geom_virstor
geom/zero/g_zero.c	optional geom_zero
fs/ext2fs/ext2_alloc.c	optional ext2fs
fs/ext2fs/ext2_balloc.c	optional ext2fs
fs/ext2fs/ext2_bmap.c	optional ext2fs
fs/ext2fs/ext2_extents.c	optional ext2fs
fs/ext2fs/ext2_inode.c	optional ext2fs
fs/ext2fs/ext2_inode_cnv.c	optional ext2fs
fs/ext2fs/ext2_hash.c	optional ext2fs
fs/ext2fs/ext2_htree.c	optional ext2fs
fs/ext2fs/ext2_lookup.c	optional ext2fs
fs/ext2fs/ext2_subr.c	optional ext2fs
fs/ext2fs/ext2_vfsops.c	optional ext2fs
fs/ext2fs/ext2_vnops.c	optional ext2fs
gnu/fs/reiserfs/reiserfs_hashes.c	optional reiserfs \
	warning "kernel contains GPL contaminated ReiserFS filesystem"
gnu/fs/reiserfs/reiserfs_inode.c	optional reiserfs
gnu/fs/reiserfs/reiserfs_item_ops.c	optional reiserfs
gnu/fs/reiserfs/reiserfs_namei.c	optional reiserfs
gnu/fs/reiserfs/reiserfs_prints.c	optional reiserfs
gnu/fs/reiserfs/reiserfs_stree.c	optional reiserfs
gnu/fs/reiserfs/reiserfs_vfsops.c	optional reiserfs
gnu/fs/reiserfs/reiserfs_vnops.c	optional reiserfs
#
isa/isa_if.m	standard
isa/isa_common.c	optional isa
isa/isahint.c	optional isa
isa/pnp.c	optional isa isapnp
isa/pnpparse.c	optional isa isapnp
fs/cd9660/cd9660_bmap.c	optional cd9660
fs/cd9660/cd9660_lookup.c	optional cd9660
fs/cd9660/cd9660_node.c	optional cd9660
fs/cd9660/cd9660_rrip.c	optional cd9660
fs/cd9660/cd9660_util.c	optional cd9660
fs/cd9660/cd9660_vfsops.c	optional cd9660
fs/cd9660/cd9660_vnops.c	optional cd9660
fs/cd9660/cd9660_iconv.c	optional cd9660_iconv
kern/bus_if.m	standard
kern/clock_if.m	standard
kern/cpufreq_if.m	standard
kern/device_if.m	standard
kern/imgact_elf.c	standard
kern/imgact_shell.c	standard
kern/inflate.c	optional gzip
kern/init_main.c	standard
kern/init_sysent.c	standard
kern/ksched.c	optional _kposix_priority_scheduling
kern/kern_acct.c	standard
kern/kern_alq.c	optional alq
kern/kern_clock.c	standard
kern/kern_condvar.c	standard
kern/kern_conf.c	standard
kern/kern_cons.c	standard
kern/kern_cpu.c	standard
kern/kern_cpuset.c	standard
kern/kern_context.c	standard
kern/kern_descrip.c	standard
kern/kern_dtrace.c	optional kdtrace_hooks
kern/kern_environment.c	standard
kern/kern_et.c	standard
kern/kern_event.c	standard
kern/kern_exec.c	standard
kern/kern_exit.c	standard
kern/kern_fail.c	standard
kern/kern_fork.c	standard
kern/kern_gzio.c	optional gzio
kern/kern_hhook.c	standard
kern/kern_idle.c	standard
kern/kern_intr.c	standard
kern/kern_jail.c	standard
kern/kern_khelp.c	standard
kern/kern_kthread.c	standard
kern/kern_ktr.c	optional ktr
kern/kern_ktrace.c	standard
kern/kern_linker.c	standard
kern/kern_lock.c	standard
kern/kern_lockf.c	standard
kern/kern_lockstat.c	optional kdtrace_hooks
kern/kern_loginclass.c	standard
kern/kern_malloc.c	standard
kern/kern_mbuf.c	standard
kern/kern_mib.c	standard
kern/kern_module.c	standard
kern/kern_mtxpool.c	standard
kern/kern_mutex.c	standard
kern/kern_ntptime.c	standard
kern/kern_osd.c	standard
kern/kern_physio.c	standard
kern/kern_pmc.c	standard
kern/kern_poll.c	optional device_polling
kern/kern_priv.c	standard
kern/kern_proc.c	standard
kern/kern_prot.c	standard
kern/kern_racct.c	standard
kern/kern_rangelock.c	standard
kern/kern_rctl.c	standard
kern/kern_resource.c	standard
kern/kern_rmlock.c	standard
kern/kern_rwlock.c	standard
kern/kern_sdt.c	optional kdtrace_hooks
kern/kern_sema.c	standard
kern/kern_sharedpage.c	standard
kern/kern_shutdown.c	standard
kern/kern_sig.c	standard
kern/kern_switch.c	standard
kern/kern_sx.c	standard
kern/kern_synch.c	standard
kern/kern_syscalls.c	standard
kern/kern_sysctl.c	standard
kern/kern_tc.c	standard
kern/kern_thr.c	standard
kern/kern_thread.c	standard
kern/kern_time.c	standard
kern/kern_timeout.c	standard
kern/kern_umtx.c	standard
kern/kern_uuid.c	standard
kern/kern_xxx.c	standard
kern/link_elf.c	standard
kern/linker_if.m	standard
kern/md4c.c	optional netsmb
kern/md5c.c	standard
kern/p1003_1b.c	standard
kern/posix4_mib.c	standard
kern/sched_4bsd.c	optional sched_4bsd
kern/sched_ule.c	optional sched_ule
kern/serdev_if.m	standard
kern/stack_protector.c	standard \
	compile-with "${NORMAL_C:N-fstack-protector*}"
kern/subr_acl_nfs4.c	standard
kern/subr_acl_posix1e.c	standard
kern/subr_autoconf.c	standard
kern/subr_blist.c	standard
kern/subr_bus.c	standard
kern/subr_bus_dma.c	standard
kern/subr_bufring.c	standard
kern/subr_clock.c	standard
kern/subr_devstat.c	standard
kern/subr_disk.c	standard
kern/subr_eventhandler.c	standard
kern/subr_fattime.c	standard
kern/subr_firmware.c	optional firmware
kern/subr_hash.c	standard
kern/subr_hints.c	standard
kern/subr_kdb.c	standard
kern/subr_kobj.c	standard
kern/subr_lock.c	standard
kern/subr_log.c	standard
kern/subr_mbpool.c	optional libmbpool
kern/subr_mchain.c	optional libmchain
kern/subr_module.c	standard
kern/subr_msgbuf.c	standard
kern/subr_param.c	standard
kern/subr_pcpu.c	standard
kern/subr_power.c	standard
kern/subr_prf.c	standard
kern/subr_prof.c	standard
kern/subr_rman.c	standard
kern/subr_rtc.c	standard
kern/subr_sbuf.c	standard
kern/subr_scanf.c	standard
kern/subr_sglist.c	standard
kern/subr_sleepqueue.c	standard
kern/subr_smp.c	standard
kern/subr_stack.c	optional ddb | stack | ktr
kern/subr_taskqueue.c	standard
kern/subr_trap.c	standard
kern/subr_turnstile.c	standard
kern/subr_uio.c	standard
kern/subr_unit.c	standard
kern/subr_witness.c	optional witness
kern/sys_capability.c	standard
kern/sys_generic.c	standard
kern/sys_pipe.c	standard
kern/sys_procdesc.c	standard
kern/sys_process.c	standard
kern/sys_socket.c	standard
kern/syscalls.c	standard
kern/sysv_ipc.c	standard
kern/sysv_msg.c	optional sysvmsg
kern/sysv_sem.c	optional sysvsem
kern/sysv_shm.c	optional sysvshm
kern/tty.c	standard
kern/tty_compat.c	optional compat_43tty
kern/tty_info.c	standard
kern/tty_inq.c	standard
kern/tty_outq.c	standard
kern/tty_pts.c	standard
kern/tty_tty.c	standard
kern/tty_ttydisc.c	standard
kern/uipc_accf.c	optional inet
kern/uipc_cow.c	optional zero_copy_sockets
kern/uipc_debug.c	optional ddb
kern/uipc_domain.c	standard
kern/uipc_mbuf.c	standard
kern/uipc_mbuf2.c	standard
kern/uipc_mqueue.c	optional p1003_1b_mqueue
kern/uipc_sem.c	optional p1003_1b_semaphores
kern/uipc_shm.c	standard
kern/uipc_sockbuf.c	standard
kern/uipc_socket.c	standard
kern/uipc_syscalls.c	standard
kern/uipc_usrreq.c	standard
kern/vfs_acl.c	standard
kern/vfs_aio.c	optional vfs_aio
kern/vfs_bio.c	standard
kern/vfs_cache.c	standard
kern/vfs_cluster.c	standard
kern/vfs_default.c	standard
kern/vfs_export.c	standard
kern/vfs_extattr.c	standard
kern/vfs_hash.c	standard
kern/vfs_init.c	standard
kern/vfs_lookup.c	standard
kern/vfs_mount.c	standard
kern/vfs_mountroot.c	standard
kern/vfs_subr.c	standard
kern/vfs_syscalls.c	standard
kern/vfs_vnops.c	standard
#
# Kernel GSS-API
#
gssd.h	optional kgssapi \
	dependency "$S/kgssapi/gssd.x" \
	compile-with "RPCGEN_CPP='${CPP}' rpcgen -hM $S/kgssapi/gssd.x | grep -v pthread.h > gssd.h" \
	no-obj no-implicit-rule before-depend local \
	clean "gssd.h"
gssd_xdr.c	optional kgssapi \
	dependency "$S/kgssapi/gssd.x gssd.h" \
	compile-with "RPCGEN_CPP='${CPP}' rpcgen -c $S/kgssapi/gssd.x -o gssd_xdr.c" \
	no-implicit-rule before-depend local \
	clean "gssd_xdr.c"
gssd_clnt.c	optional kgssapi \
	dependency "$S/kgssapi/gssd.x gssd.h" \
	compile-with "RPCGEN_CPP='${CPP}' rpcgen -lM $S/kgssapi/gssd.x | grep -v string.h > gssd_clnt.c" \
	no-implicit-rule before-depend local \
	clean "gssd_clnt.c"
kgssapi/gss_accept_sec_context.c	optional kgssapi
kgssapi/gss_add_oid_set_member.c	optional kgssapi
kgssapi/gss_acquire_cred.c	optional kgssapi
kgssapi/gss_canonicalize_name.c	optional kgssapi
kgssapi/gss_create_empty_oid_set.c	optional kgssapi
kgssapi/gss_delete_sec_context.c	optional kgssapi
kgssapi/gss_display_status.c	optional kgssapi
kgssapi/gss_export_name.c	optional kgssapi
kgssapi/gss_get_mic.c	optional kgssapi
kgssapi/gss_init_sec_context.c	optional kgssapi
kgssapi/gss_impl.c	optional kgssapi
kgssapi/gss_import_name.c	optional kgssapi
kgssapi/gss_names.c	optional kgssapi
kgssapi/gss_pname_to_uid.c	optional kgssapi
kgssapi/gss_release_buffer.c	optional kgssapi
kgssapi/gss_release_cred.c	optional kgssapi
kgssapi/gss_release_name.c	optional kgssapi
kgssapi/gss_release_oid_set.c	optional kgssapi
kgssapi/gss_set_cred_option.c	optional kgssapi
kgssapi/gss_test_oid_set_member.c	optional kgssapi
kgssapi/gss_unwrap.c	optional kgssapi
kgssapi/gss_verify_mic.c	optional kgssapi
kgssapi/gss_wrap.c	optional kgssapi
kgssapi/gss_wrap_size_limit.c	optional kgssapi
kgssapi/gssd_prot.c	optional kgssapi
kgssapi/krb5/krb5_mech.c	optional kgssapi
kgssapi/krb5/kcrypto.c	optional kgssapi
kgssapi/krb5/kcrypto_aes.c	optional kgssapi
kgssapi/krb5/kcrypto_arcfour.c	optional kgssapi
kgssapi/krb5/kcrypto_des.c	optional kgssapi
kgssapi/krb5/kcrypto_des3.c	optional kgssapi
kgssapi/kgss_if.m	optional kgssapi
kgssapi/gsstest.c	optional kgssapi_debug
# These files in libkern/ are those needed by all architectures. Some
# of the files in libkern/ are only needed on some architectures, e.g.,
# libkern/divdi3.c is needed by i386 but not alpha. Also, some of these
# routines may be optimized for a particular platform. In either case,
# the file should be moved to conf/files.<arch> from here.
#
libkern/arc4random.c	standard
libkern/bcd.c	standard
libkern/bsearch.c	standard
libkern/crc32.c	standard
libkern/fnmatch.c	standard
libkern/iconv.c	optional libiconv
libkern/iconv_converter_if.m	optional libiconv
libkern/iconv_ucs.c	optional libiconv
libkern/iconv_xlat.c	optional libiconv
libkern/iconv_xlat16.c	optional libiconv
libkern/index.c	standard
libkern/inet_aton.c	standard
libkern/inet_ntoa.c	standard
libkern/inet_ntop.c	standard
libkern/inet_pton.c	standard
libkern/mcount.c	optional profiling-routine
libkern/memcmp.c	standard
libkern/qsort.c	standard
libkern/qsort_r.c	standard
libkern/random.c	standard
libkern/rindex.c	standard
libkern/scanc.c	standard
libkern/skpc.c	standard
libkern/strcasecmp.c	standard
libkern/strcat.c	standard
libkern/strcmp.c	standard
libkern/strcpy.c	standard
libkern/strcspn.c	standard
libkern/strdup.c	standard
libkern/strlcat.c	standard
libkern/strlcpy.c	standard
libkern/strlen.c	standard
libkern/strncmp.c	standard
libkern/strncpy.c	standard
libkern/strnlen.c	standard
libkern/strsep.c	standard
libkern/strspn.c	standard
libkern/strstr.c	standard
libkern/strtol.c	standard
libkern/strtoq.c	standard
libkern/strtoul.c	standard
libkern/strtouq.c	standard
libkern/strvalid.c	standard
net/bpf.c	standard
net/bpf_buffer.c	optional bpf
net/bpf_jitter.c	optional bpf_jitter
net/bpf_filter.c	optional bpf | netgraph_bpf
net/bpf_zerocopy.c	optional bpf
net/bridgestp.c	optional bridge | if_bridge
net/flowtable.c	optional flowtable inet | flowtable inet6
net/ieee8023ad_lacp.c	optional lagg
net/if.c	standard
net/if_arcsubr.c	optional arcnet
net/if_atmsubr.c	optional atm
net/if_bridge.c	optional bridge inet | if_bridge inet
net/if_clone.c	standard
net/if_dead.c	standard
net/if_debug.c	optional ddb
net/if_disc.c	optional disc
net/if_edsc.c	optional edsc
net/if_ef.c	optional ef
net/if_enc.c	optional enc ipsec inet | enc ipsec inet6
net/if_epair.c	optional epair
net/if_ethersubr.c	optional ether \
	compile-with "${NORMAL_C} -I$S/contrib/pf"
net/if_faith.c	optional faith
net/if_fddisubr.c	optional fddi
net/if_fwsubr.c	optional fwip
net/if_gif.c	optional gif | netgraph_gif
net/if_gre.c	optional gre inet
net/if_iso88025subr.c	optional token
net/if_lagg.c	optional lagg
net/if_loop.c	optional loop
net/if_llatbl.c	standard
net/if_media.c	standard
net/if_mib.c	standard
net/if_spppfr.c	optional sppp | netgraph_sppp
net/if_spppsubr.c	optional sppp | netgraph_sppp
net/if_stf.c	optional stf inet inet6
net/if_tun.c	optional tun
net/if_tap.c	optional tap
net/if_vlan.c	optional vlan
net/mppcc.c	optional netgraph_mppc_compression
net/mppcd.c	optional netgraph_mppc_compression
net/netisr.c	standard
net/pfil.c	optional ether | inet
net/radix.c	standard
net/radix_mpath.c	standard
net/raw_cb.c	standard
net/raw_usrreq.c	standard
net/route.c	standard
net/rtsock.c	standard
net/slcompress.c	optional netgraph_vjc | sppp | \
	netgraph_sppp
net/vnet.c	optional vimage
net/zlib.c	optional crypto | geom_uzip | ipsec | \
	mxge | netgraph_deflate | \
	ddb_ctf | gzio | geom_uncompress
net80211/ieee80211.c	optional wlan
net80211/ieee80211_acl.c	optional wlan wlan_acl
net80211/ieee80211_action.c	optional wlan
net80211/ieee80211_ageq.c	optional wlan
net80211/ieee80211_adhoc.c	optional wlan \
	compile-with "${NORMAL_C} -Wno-unused-function"
net80211/ieee80211_ageq.c	optional wlan
net80211/ieee80211_amrr.c	optional wlan | wlan_amrr
net80211/ieee80211_crypto.c	optional wlan \
	compile-with "${NORMAL_C} -Wno-unused-function"
net80211/ieee80211_crypto_ccmp.c	optional wlan wlan_ccmp
net80211/ieee80211_crypto_none.c	optional wlan
net80211/ieee80211_crypto_tkip.c	optional wlan wlan_tkip
net80211/ieee80211_crypto_wep.c	optional wlan wlan_wep
net80211/ieee80211_ddb.c	optional wlan ddb
net80211/ieee80211_dfs.c	optional wlan
net80211/ieee80211_freebsd.c	optional wlan
net80211/ieee80211_hostap.c	optional wlan \
	compile-with "${NORMAL_C} -Wno-unused-function"
net80211/ieee80211_ht.c	optional wlan
net80211/ieee80211_hwmp.c	optional wlan ieee80211_support_mesh
net80211/ieee80211_input.c	optional wlan
net80211/ieee80211_ioctl.c	optional wlan
net80211/ieee80211_mesh.c	optional wlan ieee80211_support_mesh \
	compile-with "${NORMAL_C} -Wno-unused-function"
net80211/ieee80211_monitor.c	optional wlan
net80211/ieee80211_node.c	optional wlan
net80211/ieee80211_output.c	optional wlan
net80211/ieee80211_phy.c	optional wlan
net80211/ieee80211_power.c	optional wlan
net80211/ieee80211_proto.c	optional wlan
net80211/ieee80211_radiotap.c	optional wlan
net80211/ieee80211_ratectl.c	optional wlan
net80211/ieee80211_ratectl_none.c	optional wlan
net80211/ieee80211_regdomain.c	optional wlan
net80211/ieee80211_rssadapt.c	optional wlan wlan_rssadapt
net80211/ieee80211_scan.c	optional wlan
net80211/ieee80211_scan_sta.c	optional wlan
net80211/ieee80211_sta.c	optional wlan \
	compile-with "${NORMAL_C} -Wno-unused-function"
net80211/ieee80211_superg.c	optional wlan ieee80211_support_superg
net80211/ieee80211_tdma.c	optional wlan ieee80211_support_tdma
net80211/ieee80211_wds.c	optional wlan
net80211/ieee80211_xauth.c	optional wlan wlan_xauth
net80211/ieee80211_alq.c	optional wlan ieee80211_alq
netatalk/aarp.c	optional netatalk
netatalk/at_control.c	optional netatalk
netatalk/at_proto.c	optional netatalk
netatalk/at_rmx.c	optional netatalk
netatalk/ddp_input.c	optional netatalk
netatalk/ddp_output.c	optional netatalk
netatalk/ddp_pcb.c	optional netatalk
netatalk/ddp_usrreq.c	optional netatalk
netgraph/atm/ccatm/ng_ccatm.c	optional ngatm_ccatm \
	compile-with "${NORMAL_C} -I$S/contrib/ngatm"
netgraph/atm/ng_atm.c	optional ngatm_atm
netgraph/atm/ngatmbase.c	optional ngatm_atmbase \
	compile-with "${NORMAL_C} -I$S/contrib/ngatm"
netgraph/atm/sscfu/ng_sscfu.c	optional ngatm_sscfu \
	compile-with "${NORMAL_C} -I$S/contrib/ngatm"
netgraph/atm/sscop/ng_sscop.c	optional ngatm_sscop \
	compile-with "${NORMAL_C} -I$S/contrib/ngatm"
netgraph/atm/uni/ng_uni.c	optional ngatm_uni \
	compile-with "${NORMAL_C} -I$S/contrib/ngatm"
netgraph/bluetooth/common/ng_bluetooth.c	optional netgraph_bluetooth
netgraph/bluetooth/drivers/bt3c/ng_bt3c_pccard.c	optional netgraph_bluetooth_bt3c
netgraph/bluetooth/drivers/h4/ng_h4.c	optional netgraph_bluetooth_h4
netgraph/bluetooth/drivers/ubt/ng_ubt.c	optional netgraph_bluetooth_ubt usb
netgraph/bluetooth/drivers/ubtbcmfw/ubtbcmfw.c	optional netgraph_bluetooth_ubtbcmfw usb
netgraph/bluetooth/hci/ng_hci_cmds.c	optional netgraph_bluetooth_hci
netgraph/bluetooth/hci/ng_hci_evnt.c	optional netgraph_bluetooth_hci
netgraph/bluetooth/hci/ng_hci_main.c	optional netgraph_bluetooth_hci
netgraph/bluetooth/hci/ng_hci_misc.c	optional netgraph_bluetooth_hci
netgraph/bluetooth/hci/ng_hci_ulpi.c	optional netgraph_bluetooth_hci
netgraph/bluetooth/l2cap/ng_l2cap_cmds.c	optional netgraph_bluetooth_l2cap
netgraph/bluetooth/l2cap/ng_l2cap_evnt.c	optional netgraph_bluetooth_l2cap
netgraph/bluetooth/l2cap/ng_l2cap_llpi.c	optional netgraph_bluetooth_l2cap
netgraph/bluetooth/l2cap/ng_l2cap_main.c	optional netgraph_bluetooth_l2cap
netgraph/bluetooth/l2cap/ng_l2cap_misc.c	optional netgraph_bluetooth_l2cap
netgraph/bluetooth/l2cap/ng_l2cap_ulpi.c	optional netgraph_bluetooth_l2cap
netgraph/bluetooth/socket/ng_btsocket.c	optional netgraph_bluetooth_socket
netgraph/bluetooth/socket/ng_btsocket_hci_raw.c	optional netgraph_bluetooth_socket
netgraph/bluetooth/socket/ng_btsocket_l2cap.c	optional netgraph_bluetooth_socket
netgraph/bluetooth/socket/ng_btsocket_l2cap_raw.c	optional netgraph_bluetooth_socket
netgraph/bluetooth/socket/ng_btsocket_rfcomm.c	optional netgraph_bluetooth_socket
netgraph/bluetooth/socket/ng_btsocket_sco.c	optional netgraph_bluetooth_socket
netgraph/netflow/netflow.c	optional netgraph_netflow
netgraph/netflow/netflow_v9.c	optional netgraph_netflow
netgraph/netflow/ng_netflow.c	optional netgraph_netflow
netgraph/ng_UI.c	optional netgraph_UI
netgraph/ng_async.c	optional netgraph_async
netgraph/ng_atmllc.c	optional netgraph_atmllc
netgraph/ng_base.c	optional netgraph
netgraph/ng_bpf.c	optional netgraph_bpf
netgraph/ng_bridge.c	optional netgraph_bridge
netgraph/ng_car.c	optional netgraph_car
netgraph/ng_cisco.c	optional netgraph_cisco
netgraph/ng_deflate.c	optional netgraph_deflate
netgraph/ng_device.c	optional netgraph_device
netgraph/ng_echo.c	optional netgraph_echo
netgraph/ng_eiface.c	optional netgraph_eiface
netgraph/ng_ether.c	optional netgraph_ether
netgraph/ng_ether_echo.c	optional netgraph_ether_echo
netgraph/ng_fec.c	optional netgraph_fec
netgraph/ng_frame_relay.c	optional netgraph_frame_relay
netgraph/ng_gif.c	optional netgraph_gif
netgraph/ng_gif_demux.c	optional netgraph_gif_demux
netgraph/ng_hole.c	optional netgraph_hole
netgraph/ng_iface.c	optional netgraph_iface
netgraph/ng_ip_input.c	optional netgraph_ip_input
netgraph/ng_ipfw.c	optional netgraph_ipfw inet ipfirewall
netgraph/ng_ksocket.c	optional netgraph_ksocket
netgraph/ng_l2tp.c	optional netgraph_l2tp
netgraph/ng_lmi.c	optional netgraph_lmi
netgraph/ng_mppc.c	optional netgraph_mppc_compression | \
	netgraph_mppc_encryption
netgraph/ng_nat.c	optional netgraph_nat inet libalias
netgraph/ng_one2many.c	optional netgraph_one2many
netgraph/ng_parse.c	optional netgraph
netgraph/ng_patch.c	optional netgraph_patch
netgraph/ng_pipe.c	optional netgraph_pipe
netgraph/ng_ppp.c	optional netgraph_ppp
netgraph/ng_pppoe.c	optional netgraph_pppoe
netgraph/ng_pptpgre.c	optional netgraph_pptpgre
netgraph/ng_pred1.c	optional netgraph_pred1
netgraph/ng_rfc1490.c	optional netgraph_rfc1490
netgraph/ng_socket.c	optional netgraph_socket
netgraph/ng_split.c	optional netgraph_split
netgraph/ng_sppp.c	optional netgraph_sppp
netgraph/ng_tag.c	optional netgraph_tag
netgraph/ng_tcpmss.c	optional netgraph_tcpmss
netgraph/ng_tee.c	optional netgraph_tee
netgraph/ng_tty.c	optional netgraph_tty
netgraph/ng_vjc.c	optional netgraph_vjc
netgraph/ng_vlan.c	optional netgraph_vlan
netinet/accf_data.c	optional accept_filter_data inet
netinet/accf_dns.c	optional accept_filter_dns inet
netinet/accf_http.c	optional accept_filter_http inet
netinet/if_atm.c	optional atm
netinet/if_ether.c	optional inet ether
netinet/igmp.c	optional inet
netinet/in.c	optional inet
netinet/in_debug.c	optional inet ddb
netinet/ip_carp.c	optional inet carp | inet6 carp
netinet/in_gif.c	optional gif inet | netgraph_gif inet
netinet/ip_gre.c	optional gre inet
netinet/ip_id.c	optional inet
netinet/in_mcast.c	optional inet
netinet/in_pcb.c	optional inet | inet6
netinet/in_pcbgroup.c	optional inet pcbgroup | inet6 pcbgroup
netinet/in_proto.c	optional inet | inet6 \
	compile-with "${NORMAL_C} -I$S/contrib/pf"
netinet/in_rmx.c	optional inet
netinet/ip_divert.c	optional inet ipdivert ipfirewall
netinet/ip_ecn.c	optional inet | inet6
netinet/ip_encap.c	optional inet | inet6
netinet/ip_fastfwd.c	optional inet
netinet/ip_icmp.c	optional inet | inet6
netinet/ip_input.c	optional inet
netinet/ip_ipsec.c	optional inet ipsec
netinet/ip_mroute.c	optional mrouting inet
netinet/ip_options.c	optional inet
netinet/ip_output.c	optional inet
netinet/raw_ip.c	optional inet | inet6
netinet/cc/cc.c	optional inet | inet6
netinet/cc/cc_newreno.c	optional inet | inet6
netinet/sctp_asconf.c	optional inet sctp | inet6 sctp
netinet/sctp_auth.c	optional inet sctp | inet6 sctp
netinet/sctp_bsd_addr.c	optional inet sctp | inet6 sctp
netinet/sctp_cc_functions.c	optional inet sctp | inet6 sctp
netinet/sctp_crc32.c	optional inet sctp | inet6 sctp
netinet/sctp_indata.c	optional inet sctp | inet6 sctp
netinet/sctp_input.c	optional inet sctp | inet6 sctp
netinet/sctp_output.c	optional inet sctp | inet6 sctp
netinet/sctp_pcb.c	optional inet sctp | inet6 sctp
netinet/sctp_peeloff.c	optional inet sctp | inet6 sctp
netinet/sctp_ss_functions.c	optional inet sctp | inet6 sctp
netinet/sctp_sysctl.c	optional inet sctp | inet6 sctp
netinet/sctp_timer.c	optional inet sctp | inet6 sctp
netinet/sctp_usrreq.c	optional inet sctp | inet6 sctp
netinet/sctputil.c	optional inet sctp | inet6 sctp
netinet/tcp_debug.c	optional tcpdebug
netinet/tcp_hostcache.c	optional inet | inet6
netinet/tcp_input.c	optional inet | inet6
netinet/tcp_lro.c	optional inet | inet6
netinet/tcp_output.c	optional inet | inet6
netinet/tcp_offload.c	optional tcp_offload inet | tcp_offload inet6
netinet/tcp_reass.c	optional inet | inet6
netinet/tcp_sack.c	optional inet | inet6
netinet/tcp_subr.c	optional inet | inet6
netinet/tcp_syncache.c	optional inet | inet6
netinet/tcp_timer.c	optional inet | inet6
netinet/tcp_timewait.c	optional inet | inet6
netinet/tcp_usrreq.c	optional inet | inet6
netinet/udp_usrreq.c	optional inet | inet6
netinet/libalias/alias.c	optional libalias inet | netgraph_nat inet
netinet/libalias/alias_db.c	optional libalias inet | netgraph_nat inet
netinet/libalias/alias_mod.c	optional libalias | netgraph_nat
netinet/libalias/alias_proxy.c	optional libalias inet | netgraph_nat inet
netinet/libalias/alias_util.c	optional libalias inet | netgraph_nat inet
netinet/libalias/alias_sctp.c	optional libalias inet | netgraph_nat inet
netinet6/dest6.c	optional inet6
netinet6/frag6.c	optional inet6
netinet6/icmp6.c	optional inet6
netinet6/in6.c	optional inet6
netinet6/in6_cksum.c	optional inet6
netinet6/in6_gif.c	optional gif inet6 | netgraph_gif inet6
netinet6/in6_ifattach.c	optional inet6
netinet6/in6_mcast.c	optional inet6
netinet6/in6_pcb.c	optional inet6
netinet6/in6_pcbgroup.c	optional inet6 pcbgroup
netinet6/in6_proto.c	optional inet6
netinet6/in6_rmx.c	optional inet6
netinet6/in6_src.c	optional inet6
netinet6/ip6_forward.c	optional inet6
netinet6/ip6_id.c	optional inet6
netinet6/ip6_input.c	optional inet6
netinet6/ip6_mroute.c	optional mrouting inet6
netinet6/ip6_output.c	optional inet6
netinet6/ip6_ipsec.c	optional inet6 ipsec
netinet6/mld6.c	optional inet6
netinet6/nd6.c	optional inet6
netinet6/nd6_nbr.c	optional inet6
netinet6/nd6_rtr.c	optional inet6
netinet6/raw_ip6.c	optional inet6
netinet6/route6.c	optional inet6
netinet6/scope6.c	optional inet6
netinet6/sctp6_usrreq.c	optional inet6 sctp
netinet6/udp6_usrreq.c	optional inet6
netipsec/ipsec.c	optional ipsec inet | ipsec inet6
netipsec/ipsec_input.c	optional ipsec inet | ipsec inet6
netipsec/ipsec_mbuf.c	optional ipsec inet | ipsec inet6
netipsec/ipsec_output.c	optional ipsec inet | ipsec inet6
netipsec/key.c	optional ipsec inet | ipsec inet6
netipsec/key_debug.c	optional ipsec inet | ipsec inet6
netipsec/keysock.c	optional ipsec inet | ipsec inet6
netipsec/xform_ah.c	optional ipsec inet | ipsec inet6
netipsec/xform_esp.c	optional ipsec inet | ipsec inet6
netipsec/xform_ipcomp.c	optional ipsec inet | ipsec inet6
netipsec/xform_ipip.c	optional ipsec inet | ipsec inet6
netipsec/xform_tcp.c	optional ipsec inet tcp_signature | \
	ipsec inet6 tcp_signature
netipx/ipx.c	optional ipx
netipx/ipx_cksum.c	optional ipx
netipx/ipx_input.c	optional ipx
netipx/ipx_outputfl.c	optional ipx
netipx/ipx_pcb.c	optional ipx
netipx/ipx_proto.c	optional ipx
netipx/ipx_usrreq.c	optional ipx
netipx/spx_debug.c	optional ipx
netipx/spx_reass.c	optional ipx
netipx/spx_usrreq.c	optional ipx
netnatm/natm.c	optional natm
netnatm/natm_pcb.c	optional natm
netnatm/natm_proto.c	optional natm
netncp/ncp_conn.c	optional ncp
netncp/ncp_crypt.c	optional ncp
netncp/ncp_login.c	optional ncp
netncp/ncp_mod.c	optional ncp
netncp/ncp_ncp.c	optional ncp
netncp/ncp_nls.c	optional ncp
netncp/ncp_rq.c	optional ncp
netncp/ncp_sock.c	optional ncp
netncp/ncp_subr.c	optional ncp
netpfil/ipfw/dn_heap.c	optional inet dummynet
netpfil/ipfw/dn_sched_fifo.c	optional inet dummynet
netpfil/ipfw/dn_sched_prio.c	optional inet dummynet
netpfil/ipfw/dn_sched_qfq.c	optional inet dummynet
netpfil/ipfw/dn_sched_rr.c	optional inet dummynet
netpfil/ipfw/dn_sched_wf2q.c	optional inet dummynet
netpfil/ipfw/ip_dummynet.c	optional inet dummynet
netpfil/ipfw/ip_dn_io.c	optional inet dummynet
netpfil/ipfw/ip_dn_glue.c	optional inet dummynet
netpfil/ipfw/ip_fw2.c	optional inet ipfirewall \
	compile-with "${NORMAL_C} -I$S/contrib/pf"
netpfil/ipfw/ip_fw_dynamic.c	optional inet ipfirewall
netpfil/ipfw/ip_fw_log.c	optional inet ipfirewall
netpfil/ipfw/ip_fw_pfil.c	optional inet ipfirewall
netpfil/ipfw/ip_fw_sockopt.c	optional inet ipfirewall
netpfil/ipfw/ip_fw_table.c	optional inet ipfirewall
netpfil/ipfw/ip_fw_nat.c	optional inet ipfirewall_nat
netsmb/smb_conn.c	optional netsmb
netsmb/smb_crypt.c	optional netsmb
netsmb/smb_dev.c	optional netsmb
netsmb/smb_iod.c	optional netsmb
netsmb/smb_rq.c	optional netsmb
netsmb/smb_smb.c	optional netsmb
netsmb/smb_subr.c	optional netsmb
netsmb/smb_trantcp.c	optional netsmb
netsmb/smb_usr.c	optional netsmb
nfs/bootp_subr.c	optional bootp nfsclient | bootp nfscl
nfs/krpc_subr.c	optional bootp nfsclient | bootp nfscl
nfs/nfs_common.c	optional nfsclient | nfsserver
nfs/nfs_diskless.c	optional nfsclient nfs_root | nfscl nfs_root
nfs/nfs_fha.c	optional nfsserver | nfsd
nfs/nfs_lock.c	optional nfsclient | nfscl | nfslockd | nfsd
nfsclient/nfs_bio.c	optional nfsclient
nfsclient/nfs_node.c	optional nfsclient
nfsclient/nfs_krpc.c	optional nfsclient
nfsclient/nfs_subs.c	optional nfsclient
nfsclient/nfs_nfsiod.c	optional nfsclient
nfsclient/nfs_vfsops.c	optional nfsclient
nfsclient/nfs_vnops.c	optional nfsclient
nfsserver/nfs_fha_old.c	optional nfsserver
nfsserver/nfs_serv.c	optional nfsserver
nfsserver/nfs_srvkrpc.c	optional nfsserver
nfsserver/nfs_srvsubs.c	optional nfsserver
nfs/nfs_nfssvc.c	optional nfsserver | nfscl | nfsd
nlm/nlm_advlock.c	optional nfslockd | nfsd
nlm/nlm_prot_clnt.c	optional nfslockd | nfsd
nlm/nlm_prot_impl.c	optional nfslockd | nfsd
nlm/nlm_prot_server.c	optional nfslockd | nfsd
nlm/nlm_prot_svc.c	optional nfslockd | nfsd
nlm/nlm_prot_xdr.c	optional nfslockd | nfsd
nlm/sm_inter_xdr.c	optional nfslockd | nfsd
# OpenFabrics Enterprise Distribution (Infiniband)
ofed/include/linux/linux_compat.c	optional ofed \
	no-depend compile-with "${OFED_C}"
"${OFED_C}" ofed/include/linux/linux_idr.c optional ofed \ no-depend compile-with "${OFED_C}" ofed/include/linux/linux_radix.c optional ofed \ no-depend compile-with "${OFED_C}" ofed/drivers/infiniband/core/addr.c optional ofed \ no-depend \ compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/core/" ofed/drivers/infiniband/core/agent.c optional ofed \ no-depend \ compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/core/" ofed/drivers/infiniband/core/cache.c optional ofed \ no-depend \ compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/core/" # XXX Mad.c must be ordered before cm.c for sysinit sets to occur in # the correct order. ofed/drivers/infiniband/core/mad.c optional ofed \ no-depend \ compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/core/" ofed/drivers/infiniband/core/cm.c optional ofed \ no-depend \ compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/core/ -Wno-unused-function" ofed/drivers/infiniband/core/cma.c optional ofed \ no-depend \ compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/core/" ofed/drivers/infiniband/core/device.c optional ofed \ no-depend \ compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/core/" ofed/drivers/infiniband/core/fmr_pool.c optional ofed \ no-depend \ compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/core/" ofed/drivers/infiniband/core/iwcm.c optional ofed \ no-depend \ compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/core/" ofed/drivers/infiniband/core/local_sa.c optional ofed \ no-depend \ compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/core/" ofed/drivers/infiniband/core/mad_rmpp.c optional ofed \ no-depend \ compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/core/" ofed/drivers/infiniband/core/multicast.c optional ofed \ no-depend \ compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/core/" ofed/drivers/infiniband/core/notice.c optional ofed \ no-depend \ compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/core/" ofed/drivers/infiniband/core/packer.c optional ofed \ no-depend \ compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/core/" ofed/drivers/infiniband/core/sa_query.c optional ofed \ no-depend \ compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/core/" ofed/drivers/infiniband/core/smi.c optional ofed \ no-depend \ compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/core/" ofed/drivers/infiniband/core/sysfs.c optional ofed \ no-depend \ compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/core/" ofed/drivers/infiniband/core/ucm.c optional ofed \ no-depend \ compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/core/" ofed/drivers/infiniband/core/ucma.c optional ofed \ no-depend \ compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/core/" ofed/drivers/infiniband/core/ud_header.c optional ofed \ no-depend \ compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/core/" ofed/drivers/infiniband/core/umem.c optional ofed \ no-depend \ compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/core/" ofed/drivers/infiniband/core/user_mad.c optional ofed \ no-depend \ compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/core/" ofed/drivers/infiniband/core/uverbs_cmd.c optional ofed \ no-depend \ compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/core/" ofed/drivers/infiniband/core/uverbs_main.c optional ofed \ no-depend \ compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/core/" ofed/drivers/infiniband/core/uverbs_marshall.c optional ofed \ no-depend \ compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/core/" ofed/drivers/infiniband/core/verbs.c optional ofed \ no-depend \ compile-with "${OFED_C} 
-I$S/ofed/drivers/infiniband/core/" ofed/drivers/infiniband/ulp/ipoib/ipoib_cm.c optional ipoib \ no-depend \ compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/ulp/ipoib/" #ofed/drivers/infiniband/ulp/ipoib/ipoib_fs.c optional ipoib \ # no-depend \ # compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/ulp/ipoib/" ofed/drivers/infiniband/ulp/ipoib/ipoib_ib.c optional ipoib \ no-depend \ compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/ulp/ipoib/" ofed/drivers/infiniband/ulp/ipoib/ipoib_main.c optional ipoib \ no-depend \ compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/ulp/ipoib/" ofed/drivers/infiniband/ulp/ipoib/ipoib_multicast.c optional ipoib \ no-depend \ compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/ulp/ipoib/" ofed/drivers/infiniband/ulp/ipoib/ipoib_verbs.c optional ipoib \ no-depend \ compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/ulp/ipoib/" #ofed/drivers/infiniband/ulp/ipoib/ipoib_vlan.c optional ipoib \ # no-depend \ # compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/ulp/ipoib/" ofed/drivers/infiniband/ulp/sdp/sdp_bcopy.c optional sdp inet \ no-depend \ compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/ulp/sdp/" ofed/drivers/infiniband/ulp/sdp/sdp_main.c optional sdp inet \ no-depend \ compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/ulp/sdp/" ofed/drivers/infiniband/ulp/sdp/sdp_rx.c optional sdp inet \ no-depend \ compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/ulp/sdp/" ofed/drivers/infiniband/ulp/sdp/sdp_cma.c optional sdp inet \ no-depend \ compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/ulp/sdp/" ofed/drivers/infiniband/ulp/sdp/sdp_tx.c optional sdp inet \ no-depend \ compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/ulp/sdp/" ofed/drivers/infiniband/hw/mlx4/ah.c optional mlx4ib \ no-depend obj-prefix "mlx4ib_" \ compile-with "${OFED_C_NOIMP} -I$S/ofed/drivers/infiniband/hw/mlx4/" ofed/drivers/infiniband/hw/mlx4/cq.c optional mlx4ib \ no-depend obj-prefix "mlx4ib_" \ compile-with "${OFED_C_NOIMP} -I$S/ofed/drivers/infiniband/hw/mlx4/" ofed/drivers/infiniband/hw/mlx4/doorbell.c optional mlx4ib \ no-depend obj-prefix "mlx4ib_" \ compile-with "${OFED_C_NOIMP} -I$S/ofed/drivers/infiniband/hw/mlx4/" ofed/drivers/infiniband/hw/mlx4/mad.c optional mlx4ib \ no-depend obj-prefix "mlx4ib_" \ compile-with "${OFED_C_NOIMP} -I$S/ofed/drivers/infiniband/hw/mlx4/" ofed/drivers/infiniband/hw/mlx4/main.c optional mlx4ib \ no-depend obj-prefix "mlx4ib_" \ compile-with "${OFED_C_NOIMP} -I$S/ofed/drivers/infiniband/hw/mlx4/" ofed/drivers/infiniband/hw/mlx4/mr.c optional mlx4ib \ no-depend obj-prefix "mlx4ib_" \ compile-with "${OFED_C_NOIMP} -I$S/ofed/drivers/infiniband/hw/mlx4/" ofed/drivers/infiniband/hw/mlx4/qp.c optional mlx4ib \ no-depend obj-prefix "mlx4ib_" \ compile-with "${OFED_C_NOIMP} -I$S/ofed/drivers/infiniband/hw/mlx4/" ofed/drivers/infiniband/hw/mlx4/srq.c optional mlx4ib \ no-depend obj-prefix "mlx4ib_" \ compile-with "${OFED_C_NOIMP} -I$S/ofed/drivers/infiniband/hw/mlx4/" ofed/drivers/infiniband/hw/mlx4/wc.c optional mlx4ib \ no-depend obj-prefix "mlx4ib_" \ compile-with "${OFED_C_NOIMP} -I$S/ofed/drivers/infiniband/hw/mlx4/" ofed/drivers/net/mlx4/alloc.c optional mlx4ib | mlxen \ no-depend obj-prefix "mlx4_" \ compile-with "${OFED_C_NOIMP} -I$S/ofed/drivers/net/mlx4/" ofed/drivers/net/mlx4/catas.c optional mlx4ib | mlxen \ no-depend obj-prefix "mlx4_" \ compile-with "${OFED_C_NOIMP} -I$S/ofed/drivers/net/mlx4/" ofed/drivers/net/mlx4/cmd.c optional mlx4ib | mlxen \ no-depend obj-prefix "mlx4_" \ compile-with "${OFED_C_NOIMP} 
-I$S/ofed/drivers/net/mlx4/" ofed/drivers/net/mlx4/cq.c optional mlx4ib | mlxen \ no-depend obj-prefix "mlx4_" \ compile-with "${OFED_C_NOIMP} -I$S/ofed/drivers/net/mlx4/" ofed/drivers/net/mlx4/eq.c optional mlx4ib | mlxen \ no-depend obj-prefix "mlx4_" \ compile-with "${OFED_C_NOIMP} -I$S/ofed/drivers/net/mlx4/" ofed/drivers/net/mlx4/fw.c optional mlx4ib | mlxen \ no-depend obj-prefix "mlx4_" \ compile-with "${OFED_C_NOIMP} -I$S/ofed/drivers/net/mlx4/" ofed/drivers/net/mlx4/icm.c optional mlx4ib | mlxen \ no-depend obj-prefix "mlx4_" \ compile-with "${OFED_C_NOIMP} -I$S/ofed/drivers/net/mlx4/" ofed/drivers/net/mlx4/intf.c optional mlx4ib | mlxen \ no-depend obj-prefix "mlx4_" \ compile-with "${OFED_C_NOIMP} -I$S/ofed/drivers/net/mlx4/" ofed/drivers/net/mlx4/main.c optional mlx4ib | mlxen \ no-depend obj-prefix "mlx4_" \ compile-with "${OFED_C_NOIMP} -I$S/ofed/drivers/net/mlx4/" ofed/drivers/net/mlx4/mcg.c optional mlx4ib | mlxen \ no-depend obj-prefix "mlx4_" \ compile-with "${OFED_C_NOIMP} -I$S/ofed/drivers/net/mlx4/ -Wno-unused" ofed/drivers/net/mlx4/mr.c optional mlx4ib | mlxen \ no-depend obj-prefix "mlx4_" \ compile-with "${OFED_C_NOIMP} -I$S/ofed/drivers/net/mlx4/" ofed/drivers/net/mlx4/pd.c optional mlx4ib | mlxen \ no-depend obj-prefix "mlx4_" \ compile-with "${OFED_C_NOIMP} -I$S/ofed/drivers/net/mlx4/" ofed/drivers/net/mlx4/port.c optional mlx4ib | mlxen \ no-depend obj-prefix "mlx4_" \ compile-with "${OFED_C_NOIMP} -I$S/ofed/drivers/net/mlx4/" ofed/drivers/net/mlx4/profile.c optional mlx4ib | mlxen \ no-depend obj-prefix "mlx4_" \ compile-with "${OFED_C_NOIMP} -I$S/ofed/drivers/net/mlx4/" ofed/drivers/net/mlx4/qp.c optional mlx4ib | mlxen \ no-depend obj-prefix "mlx4_" \ compile-with "${OFED_C_NOIMP} -I$S/ofed/drivers/net/mlx4/" ofed/drivers/net/mlx4/reset.c optional mlx4ib | mlxen \ no-depend obj-prefix "mlx4_" \ compile-with "${OFED_C_NOIMP} -I$S/ofed/drivers/net/mlx4/" ofed/drivers/net/mlx4/sense.c optional mlx4ib | mlxen \ no-depend obj-prefix "mlx4_" \ compile-with "${OFED_C_NOIMP} -I$S/ofed/drivers/net/mlx4/" ofed/drivers/net/mlx4/srq.c optional mlx4ib | mlxen \ no-depend obj-prefix "mlx4_" \ compile-with "${OFED_C_NOIMP} -I$S/ofed/drivers/net/mlx4/" ofed/drivers/net/mlx4/xrcd.c optional mlx4ib | mlxen \ no-depend obj-prefix "mlx4_" \ compile-with "${OFED_C_NOIMP} -I$S/ofed/drivers/net/mlx4/" ofed/drivers/net/mlx4/en_cq.c optional mlxen \ no-depend obj-prefix "mlx4_" \ compile-with "${OFED_C_NOIMP} -I$S/ofed/drivers/net/mlx4/" ofed/drivers/net/mlx4/en_frag.c optional mlxen \ no-depend obj-prefix "mlx4_" \ compile-with "${OFED_C_NOIMP} -I$S/ofed/drivers/net/mlx4/" ofed/drivers/net/mlx4/en_main.c optional mlxen \ no-depend obj-prefix "mlx4_" \ compile-with "${OFED_C_NOIMP} -I$S/ofed/drivers/net/mlx4/" ofed/drivers/net/mlx4/en_netdev.c optional mlxen \ no-depend obj-prefix "mlx4_" \ compile-with "${OFED_C_NOIMP} -I$S/ofed/drivers/net/mlx4/" ofed/drivers/net/mlx4/en_port.c optional mlxen \ no-depend obj-prefix "mlx4_" \ compile-with "${OFED_C_NOIMP} -I$S/ofed/drivers/net/mlx4/" ofed/drivers/net/mlx4/en_resources.c optional mlxen \ no-depend obj-prefix "mlx4_" \ compile-with "${OFED_C_NOIMP} -I$S/ofed/drivers/net/mlx4/" ofed/drivers/net/mlx4/en_rx.c optional mlxen \ no-depend obj-prefix "mlx4_" \ compile-with "${OFED_C_NOIMP} -I$S/ofed/drivers/net/mlx4/" ofed/drivers/net/mlx4/en_tx.c optional mlxen \ no-depend obj-prefix "mlx4_" \ compile-with "${OFED_C_NOIMP} -I$S/ofed/drivers/net/mlx4/" ofed/drivers/infiniband/hw/mthca/mthca_allocator.c optional mthca \ no-depend compile-with 
"${OFED_C}" ofed/drivers/infiniband/hw/mthca/mthca_av.c optional mthca \ no-depend compile-with "${OFED_C}" ofed/drivers/infiniband/hw/mthca/mthca_catas.c optional mthca \ no-depend compile-with "${OFED_C}" ofed/drivers/infiniband/hw/mthca/mthca_cmd.c optional mthca \ no-depend compile-with "${OFED_C}" ofed/drivers/infiniband/hw/mthca/mthca_cq.c optional mthca \ no-depend compile-with "${OFED_C}" ofed/drivers/infiniband/hw/mthca/mthca_eq.c optional mthca \ no-depend compile-with "${OFED_C}" ofed/drivers/infiniband/hw/mthca/mthca_mad.c optional mthca \ no-depend compile-with "${OFED_C}" ofed/drivers/infiniband/hw/mthca/mthca_main.c optional mthca \ no-depend compile-with "${OFED_C}" ofed/drivers/infiniband/hw/mthca/mthca_mcg.c optional mthca \ no-depend compile-with "${OFED_C}" ofed/drivers/infiniband/hw/mthca/mthca_memfree.c optional mthca \ no-depend compile-with "${OFED_C}" ofed/drivers/infiniband/hw/mthca/mthca_mr.c optional mthca \ no-depend compile-with "${OFED_C}" ofed/drivers/infiniband/hw/mthca/mthca_pd.c optional mthca \ no-depend compile-with "${OFED_C}" ofed/drivers/infiniband/hw/mthca/mthca_profile.c optional mthca \ no-depend compile-with "${OFED_C}" ofed/drivers/infiniband/hw/mthca/mthca_provider.c optional mthca \ no-depend compile-with "${OFED_C}" ofed/drivers/infiniband/hw/mthca/mthca_qp.c optional mthca \ no-depend compile-with "${OFED_C}" ofed/drivers/infiniband/hw/mthca/mthca_reset.c optional mthca \ no-depend compile-with "${OFED_C}" ofed/drivers/infiniband/hw/mthca/mthca_srq.c optional mthca \ no-depend compile-with "${OFED_C}" ofed/drivers/infiniband/hw/mthca/mthca_uar.c optional mthca \ no-depend compile-with "${OFED_C}" # crypto support opencrypto/cast.c optional crypto | ipsec opencrypto/criov.c optional crypto opencrypto/crypto.c optional crypto opencrypto/cryptodev.c optional cryptodev opencrypto/cryptodev_if.m optional crypto opencrypto/cryptosoft.c optional crypto opencrypto/deflate.c optional crypto opencrypto/rmd160.c optional crypto | ipsec opencrypto/skipjack.c optional crypto opencrypto/xform.c optional crypto pci/alpm.c optional alpm pci pci/amdpm.c optional amdpm pci | nfpm pci pci/amdsmb.c optional amdsmb pci pci/if_rl.c optional rl pci pci/intpm.c optional intpm pci pci/ncr.c optional ncr pci \ compile-with "${NORMAL_C} -Wno-unused" pci/nfsmb.c optional nfsmb pci pci/viapm.c optional viapm pci rpc/auth_none.c optional krpc | nfslockd | nfsclient | nfsserver | nfscl | nfsd rpc/auth_unix.c optional krpc | nfslockd | nfsclient | nfscl | nfsd rpc/authunix_prot.c optional krpc | nfslockd | nfsclient | nfsserver | nfscl | nfsd rpc/clnt_dg.c optional krpc | nfslockd | nfsclient | nfscl | nfsd rpc/clnt_rc.c optional krpc | nfslockd | nfsclient | nfscl | nfsd rpc/clnt_vc.c optional krpc | nfslockd | nfsclient | nfsserver | nfscl | nfsd rpc/getnetconfig.c optional krpc | nfslockd | nfsclient | nfsserver | nfscl | nfsd rpc/replay.c optional krpc | nfslockd | nfsserver | nfscl | nfsd rpc/rpc_callmsg.c optional krpc | nfslockd | nfsclient | nfsserver | nfscl | nfsd rpc/rpc_generic.c optional krpc | nfslockd | nfsclient | nfsserver | nfscl | nfsd rpc/rpc_prot.c optional krpc | nfslockd | nfsclient | nfsserver | nfscl | nfsd rpc/rpcb_clnt.c optional krpc | nfslockd | nfsclient | nfsserver | nfscl | nfsd rpc/rpcb_prot.c optional krpc | nfslockd | nfsclient | nfsserver | nfscl | nfsd rpc/svc.c optional krpc | nfslockd | nfsserver | nfscl | nfsd rpc/svc_auth.c optional krpc | nfslockd | nfsserver | nfscl | nfsd rpc/svc_auth_unix.c optional krpc | nfslockd | nfsserver | 
rpc/auth_none.c	optional krpc | nfslockd | nfsclient | nfsserver | nfscl | nfsd
rpc/auth_unix.c	optional krpc | nfslockd | nfsclient | nfscl | nfsd
rpc/authunix_prot.c	optional krpc | nfslockd | nfsclient | nfsserver | nfscl | nfsd
rpc/clnt_dg.c	optional krpc | nfslockd | nfsclient | nfscl | nfsd
rpc/clnt_rc.c	optional krpc | nfslockd | nfsclient | nfscl | nfsd
rpc/clnt_vc.c	optional krpc | nfslockd | nfsclient | nfsserver | nfscl | nfsd
rpc/getnetconfig.c	optional krpc | nfslockd | nfsclient | nfsserver | nfscl | nfsd
rpc/replay.c	optional krpc | nfslockd | nfsserver | nfscl | nfsd
rpc/rpc_callmsg.c	optional krpc | nfslockd | nfsclient | nfsserver | nfscl | nfsd
rpc/rpc_generic.c	optional krpc | nfslockd | nfsclient | nfsserver | nfscl | nfsd
rpc/rpc_prot.c	optional krpc | nfslockd | nfsclient | nfsserver | nfscl | nfsd
rpc/rpcb_clnt.c	optional krpc | nfslockd | nfsclient | nfsserver | nfscl | nfsd
rpc/rpcb_prot.c	optional krpc | nfslockd | nfsclient | nfsserver | nfscl | nfsd
rpc/svc.c	optional krpc | nfslockd | nfsserver | nfscl | nfsd
rpc/svc_auth.c	optional krpc | nfslockd | nfsserver | nfscl | nfsd
rpc/svc_auth_unix.c	optional krpc | nfslockd | nfsserver | nfscl | nfsd
rpc/svc_dg.c	optional krpc | nfslockd | nfsserver | nfscl | nfsd
rpc/svc_generic.c	optional krpc | nfslockd | nfsserver | nfscl | nfsd
rpc/svc_vc.c	optional krpc | nfslockd | nfsserver | nfscl | nfsd
rpc/rpcsec_gss/rpcsec_gss.c	optional krpc kgssapi | nfslockd kgssapi | nfscl kgssapi | nfsd kgssapi
rpc/rpcsec_gss/rpcsec_gss_conf.c	optional krpc kgssapi | nfslockd kgssapi | nfscl kgssapi | nfsd kgssapi
rpc/rpcsec_gss/rpcsec_gss_misc.c	optional krpc kgssapi | nfslockd kgssapi | nfscl kgssapi | nfsd kgssapi
rpc/rpcsec_gss/rpcsec_gss_prot.c	optional krpc kgssapi | nfslockd kgssapi | nfscl kgssapi | nfsd kgssapi
rpc/rpcsec_gss/svc_rpcsec_gss.c	optional krpc kgssapi | nfslockd kgssapi | nfscl kgssapi | nfsd kgssapi
security/audit/audit.c	optional audit
security/audit/audit_arg.c	optional audit
security/audit/audit_bsm.c	optional audit
security/audit/audit_bsm_domain.c	optional audit
security/audit/audit_bsm_errno.c	optional audit
security/audit/audit_bsm_fcntl.c	optional audit
security/audit/audit_bsm_klib.c	optional audit
security/audit/audit_bsm_socket_type.c	optional audit
security/audit/audit_bsm_token.c	optional audit
security/audit/audit_pipe.c	optional audit
security/audit/audit_syscalls.c	standard
security/audit/audit_trigger.c	optional audit
security/audit/audit_worker.c	optional audit
security/mac/mac_atalk.c	optional mac netatalk
security/mac/mac_audit.c	optional mac audit
security/mac/mac_cred.c	optional mac
security/mac/mac_framework.c	optional mac
security/mac/mac_inet.c	optional mac inet | mac inet6
security/mac/mac_inet6.c	optional mac inet6
security/mac/mac_label.c	optional mac
security/mac/mac_net.c	optional mac
security/mac/mac_pipe.c	optional mac
security/mac/mac_posix_sem.c	optional mac
security/mac/mac_posix_shm.c	optional mac
security/mac/mac_priv.c	optional mac
security/mac/mac_process.c	optional mac
security/mac/mac_socket.c	optional mac
security/mac/mac_syscalls.c	standard
security/mac/mac_system.c	optional mac
security/mac/mac_sysv_msg.c	optional mac
security/mac/mac_sysv_sem.c	optional mac
security/mac/mac_sysv_shm.c	optional mac
security/mac/mac_vfs.c	optional mac
security/mac_biba/mac_biba.c	optional mac_biba
security/mac_bsdextended/mac_bsdextended.c	optional mac_bsdextended
security/mac_bsdextended/ugidfw_system.c	optional mac_bsdextended
security/mac_bsdextended/ugidfw_vnode.c	optional mac_bsdextended
security/mac_ifoff/mac_ifoff.c	optional mac_ifoff
security/mac_lomac/mac_lomac.c	optional mac_lomac
security/mac_mls/mac_mls.c	optional mac_mls
security/mac_none/mac_none.c	optional mac_none
security/mac_partition/mac_partition.c	optional mac_partition
security/mac_portacl/mac_portacl.c	optional mac_portacl
security/mac_seeotheruids/mac_seeotheruids.c	optional mac_seeotheruids
security/mac_stub/mac_stub.c	optional mac_stub
security/mac_test/mac_test.c	optional mac_test
teken/teken.c	optional sc
ufs/ffs/ffs_alloc.c	optional ffs
ufs/ffs/ffs_balloc.c	optional ffs
ufs/ffs/ffs_inode.c	optional ffs
ufs/ffs/ffs_snapshot.c	optional ffs
ufs/ffs/ffs_softdep.c	optional ffs
ufs/ffs/ffs_subr.c	optional ffs
ufs/ffs/ffs_tables.c	optional ffs
ufs/ffs/ffs_vfsops.c	optional ffs
ufs/ffs/ffs_vnops.c	optional ffs
ufs/ffs/ffs_rawread.c	optional directio
ufs/ffs/ffs_suspend.c	optional ffs
ufs/ufs/ufs_acl.c	optional ffs
ufs/ufs/ufs_bmap.c	optional ffs
ufs/ufs/ufs_dirhash.c	optional ffs
ufs/ufs/ufs_extattr.c	optional ffs
ufs/ufs/ufs_gjournal.c	optional ffs UFS_GJOURNAL
ufs/ufs/ufs_inode.c	optional ffs
ufs/ufs/ufs_lookup.c	optional ffs
ufs/ufs/ufs_quota.c	optional ffs
ufs/ufs/ufs_vfsops.c	optional ffs
ufs/ufs/ufs_vnops.c	optional ffs
vm/default_pager.c	standard
vm/device_pager.c	standard
vm/phys_pager.c	standard
vm/redzone.c	optional DEBUG_REDZONE
vm/sg_pager.c	standard
vm/swap_pager.c	standard
vm/uma_core.c	standard
vm/uma_dbg.c	standard
vm/vm_contig.c	standard
vm/memguard.c	optional DEBUG_MEMGUARD
vm/vm_fault.c	standard
vm/vm_glue.c	standard
vm/vm_init.c	standard
vm/vm_kern.c	standard
vm/vm_map.c	standard
vm/vm_meter.c	standard
vm/vm_mmap.c	standard
vm/vm_object.c	standard
vm/vm_page.c	standard
vm/vm_pageout.c	standard
vm/vm_pager.c	standard
vm/vm_phys.c	standard
vm/vm_reserv.c	standard
vm/vm_unix.c	standard
vm/vm_zeroidle.c	standard
vm/vnode_pager.c	standard
xdr/xdr.c	optional krpc | nfslockd | nfsclient | nfsserver | nfscl | nfsd
xdr/xdr_array.c	optional krpc | nfslockd | nfsclient | nfsserver | nfscl | nfsd
xdr/xdr_mbuf.c	optional krpc | nfslockd | nfsclient | nfsserver | nfscl | nfsd
xdr/xdr_mem.c	optional krpc | nfslockd | nfsclient | nfsserver | nfscl | nfsd
xdr/xdr_reference.c	optional krpc | nfslockd | nfsclient | nfsserver | nfscl | nfsd
xdr/xdr_sizeof.c	optional krpc | nfslockd | nfsclient | nfsserver | nfscl | nfsd
#
gnu/fs/xfs/xfs_alloc.c	optional xfs \
	compile-with "${NORMAL_C} -I$S/gnu/fs/xfs/FreeBSD -I$S/gnu/fs/xfs/FreeBSD/support -I$S/gnu/fs/xfs" \
	warning "kernel contains GPL contaminated xfs filesystem"
gnu/fs/xfs/xfs_alloc_btree.c	optional xfs \
	compile-with "${NORMAL_C} -I$S/gnu/fs/xfs/FreeBSD -I$S/gnu/fs/xfs/FreeBSD/support -I$S/gnu/fs/xfs"
gnu/fs/xfs/xfs_bit.c	optional xfs \
	compile-with "${NORMAL_C} -I$S/gnu/fs/xfs/FreeBSD -I$S/gnu/fs/xfs/FreeBSD/support -I$S/gnu/fs/xfs"
gnu/fs/xfs/xfs_bmap.c	optional xfs \
	compile-with "${NORMAL_C} -I$S/gnu/fs/xfs/FreeBSD -I$S/gnu/fs/xfs/FreeBSD/support -I$S/gnu/fs/xfs"
gnu/fs/xfs/xfs_bmap_btree.c	optional xfs \
	compile-with "${NORMAL_C} -I$S/gnu/fs/xfs/FreeBSD -I$S/gnu/fs/xfs/FreeBSD/support -I$S/gnu/fs/xfs"
gnu/fs/xfs/xfs_btree.c	optional xfs \
	compile-with "${NORMAL_C} -I$S/gnu/fs/xfs/FreeBSD -I$S/gnu/fs/xfs/FreeBSD/support -I$S/gnu/fs/xfs"
gnu/fs/xfs/xfs_buf_item.c	optional xfs \
	compile-with "${NORMAL_C} -I$S/gnu/fs/xfs/FreeBSD -I$S/gnu/fs/xfs/FreeBSD/support -I$S/gnu/fs/xfs"
gnu/fs/xfs/xfs_da_btree.c	optional xfs \
	compile-with "${NORMAL_C} -I$S/gnu/fs/xfs/FreeBSD -I$S/gnu/fs/xfs/FreeBSD/support -I$S/gnu/fs/xfs"
gnu/fs/xfs/xfs_dir.c	optional xfs \
	compile-with "${NORMAL_C} -I$S/gnu/fs/xfs/FreeBSD -I$S/gnu/fs/xfs/FreeBSD/support -I$S/gnu/fs/xfs"
gnu/fs/xfs/xfs_dir2.c	optional xfs \
	compile-with "${NORMAL_C} -I$S/gnu/fs/xfs/FreeBSD -I$S/gnu/fs/xfs/FreeBSD/support -I$S/gnu/fs/xfs"
gnu/fs/xfs/xfs_dir2_block.c	optional xfs \
	compile-with "${NORMAL_C} ${NO_WARRAY_BOUNDS} -I$S/gnu/fs/xfs/FreeBSD -I$S/gnu/fs/xfs/FreeBSD/support -I$S/gnu/fs/xfs"
gnu/fs/xfs/xfs_dir2_data.c	optional xfs \
	compile-with "${NORMAL_C} -I$S/gnu/fs/xfs/FreeBSD -I$S/gnu/fs/xfs/FreeBSD/support -I$S/gnu/fs/xfs"
gnu/fs/xfs/xfs_dir2_leaf.c	optional xfs \
	compile-with "${NORMAL_C} -I$S/gnu/fs/xfs/FreeBSD -I$S/gnu/fs/xfs/FreeBSD/support -I$S/gnu/fs/xfs"
gnu/fs/xfs/xfs_dir2_node.c	optional xfs \
	compile-with "${NORMAL_C} -I$S/gnu/fs/xfs/FreeBSD -I$S/gnu/fs/xfs/FreeBSD/support -I$S/gnu/fs/xfs"
gnu/fs/xfs/xfs_dir2_sf.c	optional xfs \
	compile-with "${NORMAL_C} ${NO_WARRAY_BOUNDS} -I$S/gnu/fs/xfs/FreeBSD -I$S/gnu/fs/xfs/FreeBSD/support -I$S/gnu/fs/xfs"
gnu/fs/xfs/xfs_dir2_trace.c	optional xfs \
	compile-with "${NORMAL_C} -I$S/gnu/fs/xfs/FreeBSD -I$S/gnu/fs/xfs/FreeBSD/support -I$S/gnu/fs/xfs"
gnu/fs/xfs/xfs_dir_leaf.c	optional xfs \
	compile-with "${NORMAL_C} -I$S/gnu/fs/xfs/FreeBSD -I$S/gnu/fs/xfs/FreeBSD/support -I$S/gnu/fs/xfs"
gnu/fs/xfs/xfs_error.c	optional xfs \
	compile-with "${NORMAL_C} -I$S/gnu/fs/xfs/FreeBSD -I$S/gnu/fs/xfs/FreeBSD/support -I$S/gnu/fs/xfs"
gnu/fs/xfs/xfs_extfree_item.c	optional xfs \
	compile-with "${NORMAL_C} -I$S/gnu/fs/xfs/FreeBSD -I$S/gnu/fs/xfs/FreeBSD/support -I$S/gnu/fs/xfs"
gnu/fs/xfs/xfs_fsops.c	optional xfs \
	compile-with "${NORMAL_C} -I$S/gnu/fs/xfs/FreeBSD -I$S/gnu/fs/xfs/FreeBSD/support -I$S/gnu/fs/xfs"
gnu/fs/xfs/xfs_ialloc.c	optional xfs \
	compile-with "${NORMAL_C} -I$S/gnu/fs/xfs/FreeBSD -I$S/gnu/fs/xfs/FreeBSD/support -I$S/gnu/fs/xfs"
gnu/fs/xfs/xfs_ialloc_btree.c	optional xfs \
	compile-with "${NORMAL_C} -I$S/gnu/fs/xfs/FreeBSD -I$S/gnu/fs/xfs/FreeBSD/support -I$S/gnu/fs/xfs"
gnu/fs/xfs/xfs_inode.c	optional xfs \
	compile-with "${NORMAL_C} -I$S/gnu/fs/xfs/FreeBSD -I$S/gnu/fs/xfs/FreeBSD/support -I$S/gnu/fs/xfs"
gnu/fs/xfs/xfs_inode_item.c	optional xfs \
	compile-with "${NORMAL_C} -I$S/gnu/fs/xfs/FreeBSD -I$S/gnu/fs/xfs/FreeBSD/support -I$S/gnu/fs/xfs"
gnu/fs/xfs/xfs_iocore.c	optional xfs \
	compile-with "${NORMAL_C} -I$S/gnu/fs/xfs/FreeBSD -I$S/gnu/fs/xfs/FreeBSD/support -I$S/gnu/fs/xfs"
gnu/fs/xfs/xfs_itable.c	optional xfs \
	compile-with "${NORMAL_C} -I$S/gnu/fs/xfs/FreeBSD -I$S/gnu/fs/xfs/FreeBSD/support -I$S/gnu/fs/xfs"
gnu/fs/xfs/xfs_dfrag.c	optional xfs \
	compile-with "${NORMAL_C} -I$S/gnu/fs/xfs/FreeBSD -I$S/gnu/fs/xfs/FreeBSD/support -I$S/gnu/fs/xfs"
gnu/fs/xfs/xfs_log.c	optional xfs \
	compile-with "${NORMAL_C} -I$S/gnu/fs/xfs/FreeBSD -I$S/gnu/fs/xfs/FreeBSD/support -I$S/gnu/fs/xfs"
gnu/fs/xfs/xfs_log_recover.c	optional xfs \
	compile-with "${NORMAL_C} -I$S/gnu/fs/xfs/FreeBSD -I$S/gnu/fs/xfs/FreeBSD/support -I$S/gnu/fs/xfs"
gnu/fs/xfs/xfs_mount.c	optional xfs \
	compile-with "${NORMAL_C} -I$S/gnu/fs/xfs/FreeBSD -I$S/gnu/fs/xfs/FreeBSD/support -I$S/gnu/fs/xfs"
gnu/fs/xfs/xfs_rename.c	optional xfs \
	compile-with "${NORMAL_C} -I$S/gnu/fs/xfs/FreeBSD -I$S/gnu/fs/xfs/FreeBSD/support -I$S/gnu/fs/xfs"
gnu/fs/xfs/xfs_trans.c	optional xfs \
	compile-with "${NORMAL_C} -I$S/gnu/fs/xfs/FreeBSD -I$S/gnu/fs/xfs/FreeBSD/support -I$S/gnu/fs/xfs"
gnu/fs/xfs/xfs_trans_ail.c	optional xfs \
	compile-with "${NORMAL_C} -I$S/gnu/fs/xfs/FreeBSD -I$S/gnu/fs/xfs/FreeBSD/support -I$S/gnu/fs/xfs"
gnu/fs/xfs/xfs_trans_buf.c	optional xfs \
	compile-with "${NORMAL_C} -I$S/gnu/fs/xfs/FreeBSD -I$S/gnu/fs/xfs/FreeBSD/support -I$S/gnu/fs/xfs"
gnu/fs/xfs/xfs_trans_extfree.c	optional xfs \
	compile-with "${NORMAL_C} -I$S/gnu/fs/xfs/FreeBSD -I$S/gnu/fs/xfs/FreeBSD/support -I$S/gnu/fs/xfs"
gnu/fs/xfs/xfs_trans_inode.c	optional xfs \
	compile-with "${NORMAL_C} -I$S/gnu/fs/xfs/FreeBSD -I$S/gnu/fs/xfs/FreeBSD/support -I$S/gnu/fs/xfs"
gnu/fs/xfs/xfs_trans_item.c	optional xfs \
	compile-with "${NORMAL_C} -I$S/gnu/fs/xfs/FreeBSD -I$S/gnu/fs/xfs/FreeBSD/support -I$S/gnu/fs/xfs"
gnu/fs/xfs/xfs_utils.c	optional xfs \
	compile-with "${NORMAL_C} -I$S/gnu/fs/xfs/FreeBSD -I$S/gnu/fs/xfs/FreeBSD/support -I$S/gnu/fs/xfs"
gnu/fs/xfs/xfs_vfsops.c	optional xfs \
	compile-with "${NORMAL_C} -I$S/gnu/fs/xfs/FreeBSD -I$S/gnu/fs/xfs/FreeBSD/support -I$S/gnu/fs/xfs"
gnu/fs/xfs/xfs_vnodeops.c	optional xfs \
	compile-with "${NORMAL_C} -I$S/gnu/fs/xfs/FreeBSD -I$S/gnu/fs/xfs/FreeBSD/support -I$S/gnu/fs/xfs"
gnu/fs/xfs/xfs_rw.c	optional xfs \
	compile-with "${NORMAL_C} -I$S/gnu/fs/xfs/FreeBSD -I$S/gnu/fs/xfs/FreeBSD/support -I$S/gnu/fs/xfs"
gnu/fs/xfs/xfs_attr_leaf.c	optional xfs \
	compile-with "${NORMAL_C} -I$S/gnu/fs/xfs/FreeBSD -I$S/gnu/fs/xfs/FreeBSD/support -I$S/gnu/fs/xfs"
gnu/fs/xfs/xfs_attr.c	optional xfs \
	compile-with "${NORMAL_C} -I$S/gnu/fs/xfs/FreeBSD -I$S/gnu/fs/xfs/FreeBSD/support -I$S/gnu/fs/xfs"
gnu/fs/xfs/xfs_dmops.c	optional xfs \
	compile-with "${NORMAL_C} -I$S/gnu/fs/xfs/FreeBSD -I$S/gnu/fs/xfs/FreeBSD/support -I$S/gnu/fs/xfs"
gnu/fs/xfs/xfs_qmops.c	optional xfs \
	compile-with "${NORMAL_C} -I$S/gnu/fs/xfs/FreeBSD -I$S/gnu/fs/xfs/FreeBSD/support -I$S/gnu/fs/xfs"
gnu/fs/xfs/xfs_iget.c	optional xfs \
	compile-with "${NORMAL_C} -I$S/gnu/fs/xfs/FreeBSD -I$S/gnu/fs/xfs/FreeBSD/support -I$S/gnu/fs/xfs"
gnu/fs/xfs/FreeBSD/xfs_freebsd_iget.c	optional xfs \
	compile-with "${NORMAL_C} -I$S/gnu/fs/xfs/FreeBSD -I$S/gnu/fs/xfs/FreeBSD/support -I$S/gnu/fs/xfs"
gnu/fs/xfs/FreeBSD/xfs_mountops.c	optional xfs \
	compile-with "${NORMAL_C} -I$S/gnu/fs/xfs/FreeBSD -I$S/gnu/fs/xfs/FreeBSD/support -I$S/gnu/fs/xfs"
gnu/fs/xfs/FreeBSD/xfs_vnops.c	optional xfs \
	compile-with "${NORMAL_C} -I$S/gnu/fs/xfs/FreeBSD -I$S/gnu/fs/xfs/FreeBSD/support -I$S/gnu/fs/xfs"
gnu/fs/xfs/FreeBSD/xfs_frw.c	optional xfs \
	compile-with "${NORMAL_C} -I$S/gnu/fs/xfs/FreeBSD -I$S/gnu/fs/xfs/FreeBSD/support -I$S/gnu/fs/xfs"
gnu/fs/xfs/FreeBSD/xfs_buf.c	optional xfs \
	compile-with "${NORMAL_C} -I$S/gnu/fs/xfs/FreeBSD -I$S/gnu/fs/xfs/FreeBSD/support -I$S/gnu/fs/xfs"
gnu/fs/xfs/FreeBSD/xfs_globals.c	optional xfs \
	compile-with "${NORMAL_C} -I$S/gnu/fs/xfs/FreeBSD -I$S/gnu/fs/xfs/FreeBSD/support -I$S/gnu/fs/xfs"
gnu/fs/xfs/FreeBSD/xfs_dmistubs.c	optional xfs \
	compile-with "${NORMAL_C} -I$S/gnu/fs/xfs/FreeBSD -I$S/gnu/fs/xfs/FreeBSD/support -I$S/gnu/fs/xfs"
gnu/fs/xfs/FreeBSD/xfs_super.c	optional xfs \
	compile-with "${NORMAL_C} -I$S/gnu/fs/xfs/FreeBSD -I$S/gnu/fs/xfs/FreeBSD/support -I$S/gnu/fs/xfs"
gnu/fs/xfs/FreeBSD/xfs_stats.c	optional xfs \
	compile-with "${NORMAL_C} -I$S/gnu/fs/xfs/FreeBSD -I$S/gnu/fs/xfs/FreeBSD/support -I$S/gnu/fs/xfs"
gnu/fs/xfs/FreeBSD/xfs_vfs.c	optional xfs \
	compile-with "${NORMAL_C} -I$S/gnu/fs/xfs/FreeBSD -I$S/gnu/fs/xfs/FreeBSD/support -I$S/gnu/fs/xfs"
gnu/fs/xfs/FreeBSD/xfs_vnode.c	optional xfs \
	compile-with "${NORMAL_C} -I$S/gnu/fs/xfs/FreeBSD -I$S/gnu/fs/xfs/FreeBSD/support -I$S/gnu/fs/xfs"
gnu/fs/xfs/FreeBSD/xfs_sysctl.c	optional xfs \
	compile-with "${NORMAL_C} -I$S/gnu/fs/xfs/FreeBSD -I$S/gnu/fs/xfs/FreeBSD/support -I$S/gnu/fs/xfs"
gnu/fs/xfs/FreeBSD/xfs_fs_subr.c	optional xfs \
	compile-with "${NORMAL_C} -I$S/gnu/fs/xfs/FreeBSD -I$S/gnu/fs/xfs/FreeBSD/support -I$S/gnu/fs/xfs"
gnu/fs/xfs/FreeBSD/xfs_ioctl.c	optional xfs \
	compile-with "${NORMAL_C} -I$S/gnu/fs/xfs/FreeBSD -I$S/gnu/fs/xfs/FreeBSD/support -I$S/gnu/fs/xfs"
gnu/fs/xfs/FreeBSD/support/debug.c	optional xfs \
	compile-with "${NORMAL_C} -I$S/gnu/fs/xfs/FreeBSD -I$S/gnu/fs/xfs/FreeBSD/support -I$S/gnu/fs/xfs"
gnu/fs/xfs/FreeBSD/support/ktrace.c	optional xfs \
	compile-with "${NORMAL_C} -I$S/gnu/fs/xfs/FreeBSD -I$S/gnu/fs/xfs/FreeBSD/support -I$S/gnu/fs/xfs"
gnu/fs/xfs/FreeBSD/support/mrlock.c	optional xfs \
	compile-with "${NORMAL_C} -I$S/gnu/fs/xfs/FreeBSD -I$S/gnu/fs/xfs/FreeBSD/support -I$S/gnu/fs/xfs"
gnu/fs/xfs/FreeBSD/support/uuid.c	optional xfs \
	compile-with "${NORMAL_C} -I$S/gnu/fs/xfs/FreeBSD -I$S/gnu/fs/xfs/FreeBSD/support -I$S/gnu/fs/xfs"
gnu/fs/xfs/FreeBSD/support/kmem.c	optional xfs \
	compile-with "${NORMAL_C} -I$S/gnu/fs/xfs/FreeBSD -I$S/gnu/fs/xfs/FreeBSD/support -I$S/gnu/fs/xfs"
gnu/fs/xfs/xfs_iomap.c	optional xfs \
	compile-with "${NORMAL_C} -I$S/gnu/fs/xfs/FreeBSD -I$S/gnu/fs/xfs/FreeBSD/support -I$S/gnu/fs/xfs"
gnu/fs/xfs/xfs_behavior.c	optional xfs \
	compile-with "${NORMAL_C} -I$S/gnu/fs/xfs/FreeBSD -I$S/gnu/fs/xfs/FreeBSD/support -I$S/gnu/fs/xfs"
xen/gnttab.c	optional xen | xenhvm
xen/features.c	optional xen | xenhvm
xen/evtchn/evtchn.c	optional xen
xen/evtchn/evtchn_dev.c	optional xen | xenhvm
xen/xenbus/xenbus_if.m	optional xen | xenhvm
xen/xenbus/xenbus.c	optional xen | xenhvm
xen/xenbus/xenbusb_if.m	optional xen | xenhvm
xen/xenbus/xenbusb.c	optional xen | xenhvm
xen/xenbus/xenbusb_front.c	optional xen | xenhvm
xen/xenbus/xenbusb_back.c	optional xen | xenhvm
xen/xenstore/xenstore.c	optional xen | xenhvm
xen/xenstore/xenstore_dev.c	optional xen | xenhvm
dev/xen/balloon/balloon.c	optional xen | xenhvm
dev/xen/blkfront/blkfront.c	optional xen | xenhvm
dev/xen/blkback/blkback.c	optional xen | xenhvm
dev/xen/console/console.c	optional xen
dev/xen/console/xencons_ring.c	optional xen
dev/xen/control/control.c	optional xen | xenhvm
dev/xen/netback/netback.c	optional xen | xenhvm
dev/xen/netfront/netfront.c	optional xen | xenhvm
dev/xen/xenpci/xenpci.c	optional xenpci
dev/xen/xenpci/evtchn.c	optional xenpci
Index: stable/9/sys/dev/e1000/if_em.c
===================================================================
--- stable/9/sys/dev/e1000/if_em.c	(revision 262152)
+++ stable/9/sys/dev/e1000/if_em.c	(revision 262153)
@@ -1,5798 +1,5798 @@
/******************************************************************************

  Copyright (c) 2001-2013, Intel Corporation
  All rights reserved.

  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions are met:

   1. Redistributions of source code must retain the above copyright notice,
      this list of conditions and the following disclaimer.

   2. Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.

   3. Neither the name of the Intel Corporation nor the names of its
      contributors may be used to endorse or promote products derived from
      this software without specific prior written permission.

  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  THE POSSIBILITY OF
  SUCH DAMAGE.

******************************************************************************/
/*$FreeBSD$*/

#include "opt_inet.h"
#include "opt_inet6.h"

#ifdef HAVE_KERNEL_OPTION_HEADERS
#include "opt_device_polling.h"
#endif

#include
#include
#if __FreeBSD_version >= 800000
#include
#endif
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

#include "e1000_api.h"
#include "e1000_82571.h"
#include "if_em.h"

/*********************************************************************
 *  Set this to one to display debug statistics
 *********************************************************************/
int	em_display_debug_stats = 0;

/*********************************************************************
 *  Driver version:
 *********************************************************************/
char em_driver_version[] = "7.3.8";

/*********************************************************************
 *  PCI Device ID Table
 *
 *  Used by probe to select devices to load on
 *  Last field stores an index into e1000_strings
 *  Last entry must be all 0s
 *
 *  { Vendor ID, Device ID, SubVendor ID, SubDevice ID, String Index }
 *********************************************************************/

static em_vendor_info_t em_vendor_info_array[] =
{
	/* Intel(R) PRO/1000 Network Connection */
	{ 0x8086, E1000_DEV_ID_82571EB_COPPER,	PCI_ANY_ID, PCI_ANY_ID, 0},
	{ 0x8086, E1000_DEV_ID_82571EB_FIBER,	PCI_ANY_ID, PCI_ANY_ID, 0},
	{ 0x8086, E1000_DEV_ID_82571EB_SERDES,	PCI_ANY_ID, PCI_ANY_ID, 0},
	{ 0x8086, E1000_DEV_ID_82571EB_SERDES_DUAL, PCI_ANY_ID, PCI_ANY_ID, 0},
	{ 0x8086, E1000_DEV_ID_82571EB_SERDES_QUAD, PCI_ANY_ID, PCI_ANY_ID, 0},
	{ 0x8086, E1000_DEV_ID_82571EB_QUAD_COPPER, PCI_ANY_ID, PCI_ANY_ID, 0},
	{ 0x8086, E1000_DEV_ID_82571EB_QUAD_COPPER_LP, PCI_ANY_ID, PCI_ANY_ID, 0},
	{ 0x8086, E1000_DEV_ID_82571EB_QUAD_FIBER, PCI_ANY_ID, PCI_ANY_ID, 0},
	{ 0x8086, E1000_DEV_ID_82571PT_QUAD_COPPER, PCI_ANY_ID, PCI_ANY_ID, 0},
	{ 0x8086, E1000_DEV_ID_82572EI_COPPER,	PCI_ANY_ID, PCI_ANY_ID, 0},
	{ 0x8086, E1000_DEV_ID_82572EI_FIBER,	PCI_ANY_ID, PCI_ANY_ID, 0},
	{ 0x8086, E1000_DEV_ID_82572EI_SERDES,	PCI_ANY_ID, PCI_ANY_ID, 0},
	{ 0x8086, E1000_DEV_ID_82572EI,		PCI_ANY_ID, PCI_ANY_ID, 0},
	{ 0x8086, E1000_DEV_ID_82573E,		PCI_ANY_ID, PCI_ANY_ID, 0},
	{ 0x8086, E1000_DEV_ID_82573E_IAMT,	PCI_ANY_ID, PCI_ANY_ID, 0},
	{ 0x8086, E1000_DEV_ID_82573L,		PCI_ANY_ID, PCI_ANY_ID, 0},
	{ 0x8086, E1000_DEV_ID_82583V,		PCI_ANY_ID, PCI_ANY_ID, 0},
	{ 0x8086, E1000_DEV_ID_80003ES2LAN_COPPER_SPT, PCI_ANY_ID, PCI_ANY_ID, 0},
	{ 0x8086, E1000_DEV_ID_80003ES2LAN_SERDES_SPT, PCI_ANY_ID, PCI_ANY_ID, 0},
	{ 0x8086, E1000_DEV_ID_80003ES2LAN_COPPER_DPT, PCI_ANY_ID, PCI_ANY_ID, 0},
	{ 0x8086, E1000_DEV_ID_80003ES2LAN_SERDES_DPT, PCI_ANY_ID, PCI_ANY_ID, 0},
	{ 0x8086, E1000_DEV_ID_ICH8_IGP_M_AMT,	PCI_ANY_ID, PCI_ANY_ID, 0},
	{ 0x8086, E1000_DEV_ID_ICH8_IGP_AMT,	PCI_ANY_ID, PCI_ANY_ID, 0},
	{ 0x8086, E1000_DEV_ID_ICH8_IGP_C,	PCI_ANY_ID, PCI_ANY_ID, 0},
	{ 0x8086, E1000_DEV_ID_ICH8_IFE,	PCI_ANY_ID, PCI_ANY_ID, 0},
	{ 0x8086, E1000_DEV_ID_ICH8_IFE_GT,	PCI_ANY_ID, PCI_ANY_ID, 0},
	{ 0x8086, E1000_DEV_ID_ICH8_IFE_G,	PCI_ANY_ID, PCI_ANY_ID, 0},
	{ 0x8086, E1000_DEV_ID_ICH8_IGP_M,	PCI_ANY_ID, PCI_ANY_ID, 0},
	{ 0x8086, E1000_DEV_ID_ICH8_82567V_3,	PCI_ANY_ID, PCI_ANY_ID, 0},
	{ 0x8086, E1000_DEV_ID_ICH9_IGP_M_AMT,	PCI_ANY_ID, PCI_ANY_ID, 0},
	{ 0x8086, E1000_DEV_ID_ICH9_IGP_AMT,	PCI_ANY_ID, PCI_ANY_ID, 0},
	{ 0x8086, E1000_DEV_ID_ICH9_IGP_C,	PCI_ANY_ID, PCI_ANY_ID, 0},
	{ 0x8086, E1000_DEV_ID_ICH9_IGP_M,	PCI_ANY_ID, PCI_ANY_ID, 0},
	{ 0x8086, E1000_DEV_ID_ICH9_IGP_M_V,	PCI_ANY_ID, PCI_ANY_ID, 0},
	{ 0x8086, E1000_DEV_ID_ICH9_IFE,	PCI_ANY_ID, PCI_ANY_ID, 0},
	{ 0x8086, E1000_DEV_ID_ICH9_IFE_GT,	PCI_ANY_ID, PCI_ANY_ID, 0},
	{ 0x8086, E1000_DEV_ID_ICH9_IFE_G,	PCI_ANY_ID, PCI_ANY_ID, 0},
	{ 0x8086, E1000_DEV_ID_ICH9_BM,		PCI_ANY_ID, PCI_ANY_ID, 0},
	{ 0x8086, E1000_DEV_ID_82574L,		PCI_ANY_ID, PCI_ANY_ID, 0},
	{ 0x8086, E1000_DEV_ID_82574LA,		PCI_ANY_ID, PCI_ANY_ID, 0},
	{ 0x8086, E1000_DEV_ID_ICH10_R_BM_LM,	PCI_ANY_ID, PCI_ANY_ID, 0},
	{ 0x8086, E1000_DEV_ID_ICH10_R_BM_LF,	PCI_ANY_ID, PCI_ANY_ID, 0},
	{ 0x8086, E1000_DEV_ID_ICH10_R_BM_V,	PCI_ANY_ID, PCI_ANY_ID, 0},
	{ 0x8086, E1000_DEV_ID_ICH10_D_BM_LM,	PCI_ANY_ID, PCI_ANY_ID, 0},
	{ 0x8086, E1000_DEV_ID_ICH10_D_BM_LF,	PCI_ANY_ID, PCI_ANY_ID, 0},
	{ 0x8086, E1000_DEV_ID_ICH10_D_BM_V,	PCI_ANY_ID, PCI_ANY_ID, 0},
	{ 0x8086, E1000_DEV_ID_PCH_M_HV_LM,	PCI_ANY_ID, PCI_ANY_ID, 0},
	{ 0x8086, E1000_DEV_ID_PCH_M_HV_LC,	PCI_ANY_ID, PCI_ANY_ID, 0},
	{ 0x8086, E1000_DEV_ID_PCH_D_HV_DM,	PCI_ANY_ID, PCI_ANY_ID, 0},
	{ 0x8086, E1000_DEV_ID_PCH_D_HV_DC,	PCI_ANY_ID, PCI_ANY_ID, 0},
	{ 0x8086, E1000_DEV_ID_PCH2_LV_LM,	PCI_ANY_ID, PCI_ANY_ID, 0},
	{ 0x8086, E1000_DEV_ID_PCH2_LV_V,	PCI_ANY_ID, PCI_ANY_ID, 0},
	{ 0x8086, E1000_DEV_ID_PCH_LPT_I217_LM,	PCI_ANY_ID, PCI_ANY_ID, 0},
	{ 0x8086, E1000_DEV_ID_PCH_LPT_I217_V,	PCI_ANY_ID, PCI_ANY_ID, 0},
	{ 0x8086, E1000_DEV_ID_PCH_LPTLP_I218_LM, PCI_ANY_ID, PCI_ANY_ID, 0},
	{ 0x8086, E1000_DEV_ID_PCH_LPTLP_I218_V, PCI_ANY_ID, PCI_ANY_ID, 0},
	/* required last entry */
	{ 0, 0, 0, 0, 0}
};

/*********************************************************************
 *  Table of branding strings for all supported NICs.
 *********************************************************************/

static char *em_strings[] = {
	"Intel(R) PRO/1000 Network Connection"
};

/*********************************************************************
 *  Function prototypes
 *********************************************************************/
static int	em_probe(device_t);
static int	em_attach(device_t);
static int	em_detach(device_t);
static int	em_shutdown(device_t);
static int	em_suspend(device_t);
static int	em_resume(device_t);
#ifdef EM_MULTIQUEUE
static int	em_mq_start(struct ifnet *, struct mbuf *);
static int	em_mq_start_locked(struct ifnet *, struct tx_ring *,
		    struct mbuf *);
static void	em_qflush(struct ifnet *);
#else
static void	em_start(struct ifnet *);
static void	em_start_locked(struct ifnet *, struct tx_ring *);
#endif
static int	em_ioctl(struct ifnet *, u_long, caddr_t);
static void	em_init(void *);
static void	em_init_locked(struct adapter *);
static void	em_stop(void *);
static void	em_media_status(struct ifnet *, struct ifmediareq *);
static int	em_media_change(struct ifnet *);
static void	em_identify_hardware(struct adapter *);
static int	em_allocate_pci_resources(struct adapter *);
static int	em_allocate_legacy(struct adapter *);
static int	em_allocate_msix(struct adapter *);
static int	em_allocate_queues(struct adapter *);
static int	em_setup_msix(struct adapter *);
static void	em_free_pci_resources(struct adapter *);
static void	em_local_timer(void *);
static void	em_reset(struct adapter *);
static int	em_setup_interface(device_t, struct adapter *);
static void	em_setup_transmit_structures(struct adapter *);
static void	em_initialize_transmit_unit(struct adapter *);
static int	em_allocate_transmit_buffers(struct tx_ring *);
static void	em_free_transmit_structures(struct adapter *);
static void	em_free_transmit_buffers(struct tx_ring *);
static int	em_setup_receive_structures(struct adapter *);
static int	em_allocate_receive_buffers(struct rx_ring *);
static void	em_initialize_receive_unit(struct adapter *);
static void	em_free_receive_structures(struct adapter *);
static void	em_free_receive_buffers(struct rx_ring *);
static void	em_enable_intr(struct adapter *);
static void	em_disable_intr(struct adapter *);
static void	em_update_stats_counters(struct adapter *);
static void	em_add_hw_stats(struct adapter *adapter);
static void	em_txeof(struct tx_ring *);
static bool	em_rxeof(struct rx_ring *, int, int *);
#ifndef __NO_STRICT_ALIGNMENT
static int	em_fixup_rx(struct rx_ring *);
#endif
static void	em_receive_checksum(struct e1000_rx_desc *, struct mbuf *);
static void	em_transmit_checksum_setup(struct tx_ring *, struct mbuf *,
		    int, struct ip *, u32 *, u32 *);
static void	em_tso_setup(struct tx_ring *, struct mbuf *, int,
		    struct ip *, struct tcphdr *, u32 *, u32 *);
static void	em_set_promisc(struct adapter *);
static void	em_disable_promisc(struct adapter *);
static void	em_set_multi(struct adapter *);
static void	em_update_link_status(struct adapter *);
static void	em_refresh_mbufs(struct rx_ring *, int);
static void	em_register_vlan(void *, struct ifnet *, u16);
static void	em_unregister_vlan(void *, struct ifnet *, u16);
static void	em_setup_vlan_hw_support(struct adapter *);
static int	em_xmit(struct tx_ring *, struct mbuf **);
static int	em_dma_malloc(struct adapter *, bus_size_t,
		    struct em_dma_alloc *, int);
static void	em_dma_free(struct adapter *, struct em_dma_alloc *);
static int	em_sysctl_nvm_info(SYSCTL_HANDLER_ARGS);
static void	em_print_nvm_info(struct adapter *);
static int	em_sysctl_debug_info(SYSCTL_HANDLER_ARGS);
static void	em_print_debug_info(struct adapter *);
static int 	em_is_valid_ether_addr(u8 *);
static int	em_sysctl_int_delay(SYSCTL_HANDLER_ARGS);
static void	em_add_int_delay_sysctl(struct adapter *, const char *,
		    const char *, struct em_int_delay_info *, int, int);
/* Management and WOL Support */
static void	em_init_manageability(struct adapter *);
static void	em_release_manageability(struct adapter *);
static void	em_get_hw_control(struct adapter *);
static void	em_release_hw_control(struct adapter *);
static void	em_get_wakeup(device_t);
static void	em_enable_wakeup(device_t);
static int	em_enable_phy_wakeup(struct adapter *);
static void	em_led_func(void *, int);
static void	em_disable_aspm(struct adapter *);

static int	em_irq_fast(void *);

/* MSIX handlers */
static void	em_msix_tx(void *);
static void	em_msix_rx(void *);
static void	em_msix_link(void *);
static void	em_handle_tx(void *context, int pending);
static void	em_handle_rx(void *context, int pending);
static void	em_handle_link(void *context, int pending);

static void	em_set_sysctl_value(struct adapter *, const char *,
		    const char *, int *, int);
static int	em_set_flowcntl(SYSCTL_HANDLER_ARGS);
static int	em_sysctl_eee(SYSCTL_HANDLER_ARGS);

static __inline void em_rx_discard(struct rx_ring *, int);

#ifdef DEVICE_POLLING
static poll_handler_t em_poll;
#endif /* POLLING */

/*********************************************************************
 *  FreeBSD Device Interface Entry Points
 *********************************************************************/

static device_method_t em_methods[] = {
	/* Device interface */
	DEVMETHOD(device_probe, em_probe),
	DEVMETHOD(device_attach, em_attach),
	DEVMETHOD(device_detach, em_detach),
	DEVMETHOD(device_shutdown, em_shutdown),
	DEVMETHOD(device_suspend, em_suspend),
	DEVMETHOD(device_resume, em_resume),
	DEVMETHOD_END
};

static driver_t em_driver = {
	"em", em_methods, sizeof(struct adapter),
};

devclass_t em_devclass;
DRIVER_MODULE(em, pci, em_driver, em_devclass, 0, 0);
MODULE_DEPEND(em, pci, 1, 1, 1);
MODULE_DEPEND(em, ether, 1, 1, 1);
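/*
 * DRIVER_MODULE() above registers the em driver with newbus as a child
 * of the pci bus, and the MODULE_DEPEND() lines tell the kernel linker
 * to load (and keep loaded) the pci and ether modules whenever if_em
 * is loaded.
 */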
/*********************************************************************
 *  Tunable default values.
 *********************************************************************/

#define EM_TICKS_TO_USECS(ticks)	((1024 * (ticks) + 500) / 1000)
#define EM_USECS_TO_TICKS(usecs)	((1000 * (usecs) + 512) / 1024)
#define M_TSO_LEN			66

#define MAX_INTS_PER_SEC	8000
#define DEFAULT_ITR		(1000000000/(MAX_INTS_PER_SEC * 256))
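/*
 * The interrupt-delay registers count in 1.024 us ticks, hence the
 * conversions above: for example, EM_TICKS_TO_USECS(64) =
 * (1024 * 64 + 500) / 1000 = 66 us, with the "+ 500" rounding to the
 * nearest microsecond.  DEFAULT_ITR evaluates to
 * 1000000000 / (8000 * 256) = 488, the throttling value that caps the
 * device at roughly MAX_INTS_PER_SEC interrupts per second.
 */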
/* Allow common code without TSO */
#ifndef CSUM_TSO
#define CSUM_TSO	0
#endif

static SYSCTL_NODE(_hw, OID_AUTO, em, CTLFLAG_RD, 0, "EM driver parameters");

static int em_tx_int_delay_dflt = EM_TICKS_TO_USECS(EM_TIDV);
static int em_rx_int_delay_dflt = EM_TICKS_TO_USECS(EM_RDTR);
TUNABLE_INT("hw.em.tx_int_delay", &em_tx_int_delay_dflt);
TUNABLE_INT("hw.em.rx_int_delay", &em_rx_int_delay_dflt);
SYSCTL_INT(_hw_em, OID_AUTO, tx_int_delay, CTLFLAG_RDTUN,
    &em_tx_int_delay_dflt, 0, "Default transmit interrupt delay in usecs");
SYSCTL_INT(_hw_em, OID_AUTO, rx_int_delay, CTLFLAG_RDTUN,
    &em_rx_int_delay_dflt, 0, "Default receive interrupt delay in usecs");

static int em_tx_abs_int_delay_dflt = EM_TICKS_TO_USECS(EM_TADV);
static int em_rx_abs_int_delay_dflt = EM_TICKS_TO_USECS(EM_RADV);
TUNABLE_INT("hw.em.tx_abs_int_delay", &em_tx_abs_int_delay_dflt);
TUNABLE_INT("hw.em.rx_abs_int_delay", &em_rx_abs_int_delay_dflt);
SYSCTL_INT(_hw_em, OID_AUTO, tx_abs_int_delay, CTLFLAG_RDTUN,
    &em_tx_abs_int_delay_dflt, 0,
    "Default transmit interrupt delay limit in usecs");
SYSCTL_INT(_hw_em, OID_AUTO, rx_abs_int_delay, CTLFLAG_RDTUN,
    &em_rx_abs_int_delay_dflt, 0,
    "Default receive interrupt delay limit in usecs");

static int em_rxd = EM_DEFAULT_RXD;
static int em_txd = EM_DEFAULT_TXD;
TUNABLE_INT("hw.em.rxd", &em_rxd);
TUNABLE_INT("hw.em.txd", &em_txd);
SYSCTL_INT(_hw_em, OID_AUTO, rxd, CTLFLAG_RDTUN, &em_rxd, 0,
    "Number of receive descriptors per queue");
SYSCTL_INT(_hw_em, OID_AUTO, txd, CTLFLAG_RDTUN, &em_txd, 0,
    "Number of transmit descriptors per queue");

static int em_smart_pwr_down = FALSE;
TUNABLE_INT("hw.em.smart_pwr_down", &em_smart_pwr_down);
SYSCTL_INT(_hw_em, OID_AUTO, smart_pwr_down, CTLFLAG_RDTUN,
    &em_smart_pwr_down, 0,
    "Set to true to leave smart power down enabled on newer adapters");

/* Controls whether promiscuous also shows bad packets */
static int em_debug_sbp = FALSE;
TUNABLE_INT("hw.em.sbp", &em_debug_sbp);
SYSCTL_INT(_hw_em, OID_AUTO, sbp, CTLFLAG_RDTUN, &em_debug_sbp, 0,
    "Show bad packets in promiscuous mode");

static int em_enable_msix = TRUE;
TUNABLE_INT("hw.em.enable_msix", &em_enable_msix);
SYSCTL_INT(_hw_em, OID_AUTO, enable_msix, CTLFLAG_RDTUN, &em_enable_msix, 0,
    "Enable MSI-X interrupts");

/* How many packets rxeof tries to clean at a time */
static int em_rx_process_limit = 100;
TUNABLE_INT("hw.em.rx_process_limit", &em_rx_process_limit);
SYSCTL_INT(_hw_em, OID_AUTO, rx_process_limit, CTLFLAG_RDTUN,
    &em_rx_process_limit, 0,
    "Maximum number of received packets to process "
    "at a time, -1 means unlimited");

/* Energy efficient ethernet - default to OFF */
static int eee_setting = 1;
TUNABLE_INT("hw.em.eee_setting", &eee_setting);
SYSCTL_INT(_hw_em, OID_AUTO, eee_setting, CTLFLAG_RDTUN, &eee_setting, 0,
    "Enable Energy Efficient Ethernet");

/* Global used in WOL setup with multiport cards */
static int global_quad_port_a = 0;

#ifdef DEV_NETMAP	/* see ixgbe.c for details */
#include
#endif /* DEV_NETMAP */

/*********************************************************************
 *  Device identification routine
 *
 *  em_probe determines if the driver should be loaded on
 *  adapter based on PCI vendor/device id of the adapter.
 *
 *  return BUS_PROBE_DEFAULT on success, positive on failure
 *********************************************************************/

static int
em_probe(device_t dev)
{
	char		adapter_name[60];
	u16		pci_vendor_id = 0;
	u16		pci_device_id = 0;
	u16		pci_subvendor_id = 0;
	u16		pci_subdevice_id = 0;
	em_vendor_info_t *ent;

	INIT_DEBUGOUT("em_probe: begin");

	pci_vendor_id = pci_get_vendor(dev);
	if (pci_vendor_id != EM_VENDOR_ID)
		return (ENXIO);

	pci_device_id = pci_get_device(dev);
	pci_subvendor_id = pci_get_subvendor(dev);
	pci_subdevice_id = pci_get_subdevice(dev);

	ent = em_vendor_info_array;
	while (ent->vendor_id != 0) {
		if ((pci_vendor_id == ent->vendor_id) &&
		    (pci_device_id == ent->device_id) &&
		    ((pci_subvendor_id == ent->subvendor_id) ||
		    (ent->subvendor_id == PCI_ANY_ID)) &&
		    ((pci_subdevice_id == ent->subdevice_id) ||
		    (ent->subdevice_id == PCI_ANY_ID))) {
			sprintf(adapter_name, "%s %s",
			    em_strings[ent->index], em_driver_version);
			device_set_desc_copy(dev, adapter_name);
			return (BUS_PROBE_DEFAULT);
		}
		ent++;
	}

	return (ENXIO);
}
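/*
 * Note on the return values above (standard newbus behavior):
 * BUS_PROBE_DEFAULT allows a driver with a more specific probe to
 * outbid this one for the device, while ENXIO tells the bus that this
 * driver does not handle the device at all.
 */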
/*********************************************************************
 *  Device initialization routine
 *
 *  The attach entry point is called when the driver is being loaded.
 *  This routine identifies the type of hardware, allocates all resources
 *  and initializes the hardware.
 *
 *  return 0 on success, positive on failure
 *********************************************************************/

static int
em_attach(device_t dev)
{
	struct adapter	*adapter;
	struct e1000_hw	*hw;
	int		error = 0;

	INIT_DEBUGOUT("em_attach: begin");

	if (resource_disabled("em", device_get_unit(dev))) {
		device_printf(dev, "Disabled by device hint\n");
		return (ENXIO);
	}

	adapter = device_get_softc(dev);
	adapter->dev = adapter->osdep.dev = dev;
	hw = &adapter->hw;
	EM_CORE_LOCK_INIT(adapter, device_get_nameunit(dev));

	/* SYSCTL stuff */
	SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev),
	    SYSCTL_CHILDREN(device_get_sysctl_tree(dev)),
	    OID_AUTO, "nvm", CTLTYPE_INT|CTLFLAG_RW, adapter, 0,
	    em_sysctl_nvm_info, "I", "NVM Information");

	SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev),
	    SYSCTL_CHILDREN(device_get_sysctl_tree(dev)),
	    OID_AUTO, "debug", CTLTYPE_INT|CTLFLAG_RW, adapter, 0,
	    em_sysctl_debug_info, "I", "Debug Information");

	SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev),
	    SYSCTL_CHILDREN(device_get_sysctl_tree(dev)),
	    OID_AUTO, "fc", CTLTYPE_INT|CTLFLAG_RW, adapter, 0,
	    em_set_flowcntl, "I", "Flow Control");

	callout_init_mtx(&adapter->timer, &adapter->core_mtx, 0);

	/* Determine hardware and mac info */
	em_identify_hardware(adapter);

	/* Setup PCI resources */
	if (em_allocate_pci_resources(adapter)) {
		device_printf(dev, "Allocation of PCI resources failed\n");
		error = ENXIO;
		goto err_pci;
	}

	/*
	** For ICH8 and family we need to
	** map the flash memory, and this
	** must happen after the MAC is
	** identified
	*/
	if ((hw->mac.type == e1000_ich8lan) ||
	    (hw->mac.type == e1000_ich9lan) ||
	    (hw->mac.type == e1000_ich10lan) ||
	    (hw->mac.type == e1000_pchlan) ||
	    (hw->mac.type == e1000_pch2lan) ||
	    (hw->mac.type == e1000_pch_lpt)) {
		int rid = EM_BAR_TYPE_FLASH;
		adapter->flash = bus_alloc_resource_any(dev,
		    SYS_RES_MEMORY, &rid, RF_ACTIVE);
		if (adapter->flash == NULL) {
			device_printf(dev, "Mapping of Flash failed\n");
			error = ENXIO;
			goto err_pci;
		}
		/* This is used in the shared code */
		hw->flash_address = (u8 *)adapter->flash;
		adapter->osdep.flash_bus_space_tag =
		    rman_get_bustag(adapter->flash);
		adapter->osdep.flash_bus_space_handle =
		    rman_get_bushandle(adapter->flash);
	}

	/* Do Shared Code initialization */
	if (e1000_setup_init_funcs(hw, TRUE)) {
		device_printf(dev, "Setup of Shared code failed\n");
		error = ENXIO;
		goto err_pci;
	}

	e1000_get_bus_info(hw);

	/* Set up some sysctls for the tunable interrupt delays */
	em_add_int_delay_sysctl(adapter, "rx_int_delay",
	    "receive interrupt delay in usecs", &adapter->rx_int_delay,
	    E1000_REGISTER(hw, E1000_RDTR), em_rx_int_delay_dflt);
	em_add_int_delay_sysctl(adapter, "tx_int_delay",
	    "transmit interrupt delay in usecs", &adapter->tx_int_delay,
	    E1000_REGISTER(hw, E1000_TIDV), em_tx_int_delay_dflt);
	em_add_int_delay_sysctl(adapter, "rx_abs_int_delay",
	    "receive interrupt delay limit in usecs",
	    &adapter->rx_abs_int_delay,
	    E1000_REGISTER(hw, E1000_RADV), em_rx_abs_int_delay_dflt);
	em_add_int_delay_sysctl(adapter, "tx_abs_int_delay",
	    "transmit interrupt delay limit in usecs",
	    &adapter->tx_abs_int_delay,
	    E1000_REGISTER(hw, E1000_TADV), em_tx_abs_int_delay_dflt);
	em_add_int_delay_sysctl(adapter, "itr",
	    "interrupt delay limit in usecs/4",
	    &adapter->tx_itr,
	    E1000_REGISTER(hw, E1000_ITR), DEFAULT_ITR);

	/* Sysctl for limiting the amount of work done in the taskqueue */
	em_set_sysctl_value(adapter, "rx_processing_limit",
	    "max number of rx packets to process", &adapter->rx_process_limit,
	    em_rx_process_limit);

	/*
	 * Validate number of transmit and receive descriptors. It
	 * must not exceed hardware maximum, and must be multiple
	 * of E1000_DBA_ALIGN.
	 */
	if (((em_txd * sizeof(struct e1000_tx_desc)) % EM_DBA_ALIGN) != 0 ||
	    (em_txd > EM_MAX_TXD) || (em_txd < EM_MIN_TXD)) {
		device_printf(dev, "Using %d TX descriptors instead of %d!\n",
		    EM_DEFAULT_TXD, em_txd);
		adapter->num_tx_desc = EM_DEFAULT_TXD;
	} else
		adapter->num_tx_desc = em_txd;

	if (((em_rxd * sizeof(struct e1000_rx_desc)) % EM_DBA_ALIGN) != 0 ||
	    (em_rxd > EM_MAX_RXD) || (em_rxd < EM_MIN_RXD)) {
		device_printf(dev, "Using %d RX descriptors instead of %d!\n",
		    EM_DEFAULT_RXD, em_rxd);
		adapter->num_rx_desc = EM_DEFAULT_RXD;
	} else
		adapter->num_rx_desc = em_rxd;

	hw->mac.autoneg = DO_AUTO_NEG;
	hw->phy.autoneg_wait_to_complete = FALSE;
	hw->phy.autoneg_advertised = AUTONEG_ADV_DEFAULT;

	/* Copper options */
	if (hw->phy.media_type == e1000_media_type_copper) {
		hw->phy.mdix = AUTO_ALL_MODES;
		hw->phy.disable_polarity_correction = FALSE;
		hw->phy.ms_type = EM_MASTER_SLAVE;
	}

	/*
	 * Set the frame limits assuming
	 * standard ethernet sized frames.
	 */
	adapter->hw.mac.max_frame_size =
	    ETHERMTU + ETHER_HDR_LEN + ETHERNET_FCS_SIZE;

	/*
	 * This controls when hardware reports transmit completion
	 * status.
	 */
	hw->mac.report_tx_early = 1;

	/*
	** Get queue/ring memory
	*/
	if (em_allocate_queues(adapter)) {
		error = ENOMEM;
		goto err_pci;
	}

	/* Allocate multicast array memory. */
	adapter->mta = malloc(sizeof(u8) * ETH_ADDR_LEN *
	    MAX_NUM_MULTICAST_ADDRESSES, M_DEVBUF, M_NOWAIT);
	if (adapter->mta == NULL) {
		device_printf(dev, "Can not allocate multicast setup array\n");
		error = ENOMEM;
		goto err_late;
	}

	/* Check SOL/IDER usage */
	if (e1000_check_reset_block(hw))
		device_printf(dev, "PHY reset is blocked"
		    " due to SOL/IDER session.\n");

	/* Sysctl for setting Energy Efficient Ethernet */
	hw->dev_spec.ich8lan.eee_disable = eee_setting;
	SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev),
	    SYSCTL_CHILDREN(device_get_sysctl_tree(dev)),
	    OID_AUTO, "eee_control", CTLTYPE_INT|CTLFLAG_RW,
	    adapter, 0, em_sysctl_eee, "I",
	    "Disable Energy Efficient Ethernet");

	/*
	** Start from a known state, this is
	** important in reading the nvm and
	** mac from that.
	*/
	e1000_reset_hw(hw);

	/* Make sure we have a good EEPROM before we read from it */
	if (e1000_validate_nvm_checksum(hw) < 0) {
		/*
		** Some PCI-E parts fail the first check due to
		** the link being in sleep state, call it again,
		** if it fails a second time it's a real issue.
		*/
		if (e1000_validate_nvm_checksum(hw) < 0) {
			device_printf(dev,
			    "The EEPROM Checksum Is Not Valid\n");
			error = EIO;
			goto err_late;
		}
	}

	/* Copy the permanent MAC address out of the EEPROM */
	if (e1000_read_mac_addr(hw) < 0) {
		device_printf(dev, "EEPROM read error while reading MAC"
		    " address\n");
		error = EIO;
		goto err_late;
	}

	if (!em_is_valid_ether_addr(hw->mac.addr)) {
		device_printf(dev, "Invalid MAC address\n");
		error = EIO;
		goto err_late;
	}

	/*
	**  Do interrupt configuration
	*/
	if (adapter->msix > 1) /* Do MSIX */
		error = em_allocate_msix(adapter);
	else  /* MSI or Legacy */
		error = em_allocate_legacy(adapter);
	if (error)
		goto err_late;

	/*
	 * Get Wake-on-Lan and Management info for later use
	 */
	em_get_wakeup(dev);

	/* Setup OS specific network interface */
	if (em_setup_interface(dev, adapter) != 0)
		goto err_late;

	em_reset(adapter);

	/* Initialize statistics */
	em_update_stats_counters(adapter);

	hw->mac.get_link_status = 1;
	em_update_link_status(adapter);

	/* Register for VLAN events */
	adapter->vlan_attach = EVENTHANDLER_REGISTER(vlan_config,
	    em_register_vlan, adapter, EVENTHANDLER_PRI_FIRST);
	adapter->vlan_detach = EVENTHANDLER_REGISTER(vlan_unconfig,
	    em_unregister_vlan, adapter, EVENTHANDLER_PRI_FIRST);

	em_add_hw_stats(adapter);

	/* Non-AMT based hardware can now take control from firmware */
	if (adapter->has_manage && !adapter->has_amt)
		em_get_hw_control(adapter);

	/* Tell the stack that the interface is not active */
	adapter->ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
	adapter->ifp->if_drv_flags |= IFF_DRV_OACTIVE;

	adapter->led_dev = led_create(em_led_func, adapter,
	    device_get_nameunit(dev));
#ifdef DEV_NETMAP
	em_netmap_attach(adapter);
#endif /* DEV_NETMAP */

	INIT_DEBUGOUT("em_attach: end");

	return (0);

err_late:
	em_free_transmit_structures(adapter);
	em_free_receive_structures(adapter);
	em_release_hw_control(adapter);
	if (adapter->ifp != NULL)
		if_free(adapter->ifp);
err_pci:
	em_free_pci_resources(adapter);
	free(adapter->mta, M_DEVBUF);
	EM_CORE_LOCK_DESTROY(adapter);

	return (error);
}
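/*
 * The err_late/err_pci labels above unwind em_attach(): a failure at
 * any stage jumps to the label that releases only the resources
 * acquired before the failing step.
 */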
/*********************************************************************
 *  Device removal routine
 *
 *  The detach entry point is called when the driver is being removed.
 *  This routine stops the adapter and deallocates all the resources
 *  that were allocated for driver operation.
 *
 *  return 0 on success, positive on failure
 *********************************************************************/

static int
em_detach(device_t dev)
{
	struct adapter	*adapter = device_get_softc(dev);
	struct ifnet	*ifp = adapter->ifp;

	INIT_DEBUGOUT("em_detach: begin");

	/* Make sure VLANS are not using driver */
	if (adapter->ifp->if_vlantrunk != NULL) {
		device_printf(dev,"Vlan in use, detach first\n");
		return (EBUSY);
	}

#ifdef DEVICE_POLLING
	if (ifp->if_capenable & IFCAP_POLLING)
		ether_poll_deregister(ifp);
#endif

	if (adapter->led_dev != NULL)
		led_destroy(adapter->led_dev);

	EM_CORE_LOCK(adapter);
	adapter->in_detach = 1;
	em_stop(adapter);
	EM_CORE_UNLOCK(adapter);
	EM_CORE_LOCK_DESTROY(adapter);

	e1000_phy_hw_reset(&adapter->hw);

	em_release_manageability(adapter);
	em_release_hw_control(adapter);

	/* Unregister VLAN events */
	if (adapter->vlan_attach != NULL)
		EVENTHANDLER_DEREGISTER(vlan_config, adapter->vlan_attach);
	if (adapter->vlan_detach != NULL)
		EVENTHANDLER_DEREGISTER(vlan_unconfig, adapter->vlan_detach);

	ether_ifdetach(adapter->ifp);
	callout_drain(&adapter->timer);

#ifdef DEV_NETMAP
	netmap_detach(ifp);
#endif /* DEV_NETMAP */

	em_free_pci_resources(adapter);
	bus_generic_detach(dev);
	if_free(ifp);

	em_free_transmit_structures(adapter);
	em_free_receive_structures(adapter);

	em_release_hw_control(adapter);
	free(adapter->mta, M_DEVBUF);

	return (0);
}

/*********************************************************************
 *
 *  Shutdown entry point
 *
 **********************************************************************/

static int
em_shutdown(device_t dev)
{
	return em_suspend(dev);
}

/*
 * Suspend/resume device methods.
 */
static int
em_suspend(device_t dev)
{
	struct adapter *adapter = device_get_softc(dev);

	EM_CORE_LOCK(adapter);

	em_release_manageability(adapter);
	em_release_hw_control(adapter);
	em_enable_wakeup(dev);

	EM_CORE_UNLOCK(adapter);

	return bus_generic_suspend(dev);
}

static int
em_resume(device_t dev)
{
	struct adapter *adapter = device_get_softc(dev);
	struct tx_ring	*txr = adapter->tx_rings;
	struct ifnet *ifp = adapter->ifp;

	EM_CORE_LOCK(adapter);
	if (adapter->hw.mac.type == e1000_pch2lan)
		e1000_resume_workarounds_pchlan(&adapter->hw);
	em_init_locked(adapter);
	em_init_manageability(adapter);

	if ((ifp->if_flags & IFF_UP) &&
	    (ifp->if_drv_flags & IFF_DRV_RUNNING) && adapter->link_active) {
		for (int i = 0; i < adapter->num_queues; i++, txr++) {
			EM_TX_LOCK(txr);
#ifdef EM_MULTIQUEUE
			if (!drbr_empty(ifp, txr->br))
				em_mq_start_locked(ifp, txr, NULL);
#else
			if (!IFQ_DRV_IS_EMPTY(&ifp->if_snd))
				em_start_locked(ifp, txr);
#endif
			EM_TX_UNLOCK(txr);
		}
	}
	EM_CORE_UNLOCK(adapter);

	return bus_generic_resume(dev);
}

#ifdef EM_MULTIQUEUE
/*********************************************************************
 *  Multiqueue Transmit routines
 *
 *  em_mq_start is called by the stack to initiate a transmit;
 *  however, if busy the driver can queue the request rather
 *  than do an immediate send.  It is this that is an advantage
 *  in this driver, rather than also having multiple tx queues.
 **********************************************************************/
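/*
 * In em_mq_start_locked() below, a packet stays in the buf_ring while
 * it is being handed to the hardware: drbr_peek() reads the head entry
 * without consuming it, drbr_advance() retires it once em_xmit()
 * accepts it, and drbr_putback() re-installs a possibly modified mbuf
 * at the head when the transmit ring runs out of descriptors.
 */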
static int
em_mq_start_locked(struct ifnet *ifp, struct tx_ring *txr, struct mbuf *m)
{
	struct adapter	*adapter = txr->adapter;
	struct mbuf	*next;
	int		err = 0, enq = 0;

	if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
	    IFF_DRV_RUNNING || adapter->link_active == 0) {
		if (m != NULL)
			err = drbr_enqueue(ifp, txr->br, m);
		return (err);
	}

	enq = 0;
	if (m != NULL) {
		err = drbr_enqueue(ifp, txr->br, m);
		if (err)
			return (err);
	}

	/* Process the queue */
	while ((next = drbr_peek(ifp, txr->br)) != NULL) {
		if ((err = em_xmit(txr, &next)) != 0) {
			if (next == NULL)
				drbr_advance(ifp, txr->br);
			else
				drbr_putback(ifp, txr->br, next);
			break;
		}
		drbr_advance(ifp, txr->br);
		enq++;
		ifp->if_obytes += next->m_pkthdr.len;
		if (next->m_flags & M_MCAST)
			ifp->if_omcasts++;
		ETHER_BPF_MTAP(ifp, next);
		if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
			break;
	}

	if (enq > 0) {
		/* Set the watchdog */
		txr->queue_status = EM_QUEUE_WORKING;
		txr->watchdog_time = ticks;
	}

	if (txr->tx_avail < EM_MAX_SCATTER)
		em_txeof(txr);
	if (txr->tx_avail < EM_MAX_SCATTER)
		ifp->if_drv_flags |= IFF_DRV_OACTIVE;
	return (err);
}

/*
** Multiqueue capable stack interface
*/
static int
em_mq_start(struct ifnet *ifp, struct mbuf *m)
{
	struct adapter	*adapter = ifp->if_softc;
	struct tx_ring	*txr = adapter->tx_rings;
	int		error;

	if (EM_TX_TRYLOCK(txr)) {
		error = em_mq_start_locked(ifp, txr, m);
		EM_TX_UNLOCK(txr);
	} else
		error = drbr_enqueue(ifp, txr->br, m);

	return (error);
}

/*
** Flush all ring buffers
*/
static void
em_qflush(struct ifnet *ifp)
{
	struct adapter	*adapter = ifp->if_softc;
	struct tx_ring	*txr = adapter->tx_rings;
	struct mbuf	*m;

	for (int i = 0; i < adapter->num_queues; i++, txr++) {
		EM_TX_LOCK(txr);
		while ((m = buf_ring_dequeue_sc(txr->br)) != NULL)
			m_freem(m);
		EM_TX_UNLOCK(txr);
	}
	if_qflush(ifp);
}

#else  /* !EM_MULTIQUEUE */

static void
em_start_locked(struct ifnet *ifp, struct tx_ring *txr)
{
	struct adapter	*adapter = ifp->if_softc;
	struct mbuf	*m_head;

	EM_TX_LOCK_ASSERT(txr);

	if ((ifp->if_drv_flags & (IFF_DRV_RUNNING|IFF_DRV_OACTIVE)) !=
	    IFF_DRV_RUNNING)
		return;

	if (!adapter->link_active)
		return;

	while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) {
		/* Call cleanup if number of TX descriptors low */
		if (txr->tx_avail <= EM_TX_CLEANUP_THRESHOLD)
			em_txeof(txr);
		if (txr->tx_avail < EM_MAX_SCATTER) {
			ifp->if_drv_flags |= IFF_DRV_OACTIVE;
			break;
		}
		IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head);
		if (m_head == NULL)
			break;
		/*
		 *  Encapsulation can modify our pointer, and/or make it
		 *  NULL on failure.  In that event, we can't requeue.
		 */
		if (em_xmit(txr, &m_head)) {
			if (m_head == NULL)
				break;
			IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
			break;
		}

		/* Send a copy of the frame to the BPF listener */
		ETHER_BPF_MTAP(ifp, m_head);

		/* Set timeout in case hardware has problems transmitting. */
		txr->watchdog_time = ticks;
		txr->queue_status = EM_QUEUE_WORKING;
	}

	return;
}

static void
em_start(struct ifnet *ifp)
{
	struct adapter	*adapter = ifp->if_softc;
	struct tx_ring	*txr = adapter->tx_rings;

	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
		EM_TX_LOCK(txr);
		em_start_locked(ifp, txr);
		EM_TX_UNLOCK(txr);
	}
	return;
}
#endif /* EM_MULTIQUEUE */

/*********************************************************************
 *  Ioctl entry point
 *
 *  em_ioctl is called when the user wants to configure the
 *  interface.
 *
 *  return 0 on success, positive on failure
 **********************************************************************/

static int
em_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
{
	struct adapter	*adapter = ifp->if_softc;
	struct ifreq	*ifr = (struct ifreq *)data;
#if defined(INET) || defined(INET6)
	struct ifaddr	*ifa = (struct ifaddr *)data;
#endif
	bool		avoid_reset = FALSE;
	int		error = 0;

	if (adapter->in_detach)
		return (error);

	switch (command) {
	case SIOCSIFADDR:
#ifdef INET
		if (ifa->ifa_addr->sa_family == AF_INET)
			avoid_reset = TRUE;
#endif
#ifdef INET6
		if (ifa->ifa_addr->sa_family == AF_INET6)
			avoid_reset = TRUE;
#endif
		/*
		** Calling init results in link renegotiation,
		** so we avoid doing it when possible.
		*/
		if (avoid_reset) {
			ifp->if_flags |= IFF_UP;
			if (!(ifp->if_drv_flags & IFF_DRV_RUNNING))
				em_init(adapter);
#ifdef INET
			if (!(ifp->if_flags & IFF_NOARP))
				arp_ifinit(ifp, ifa);
#endif
		} else
			error = ether_ioctl(ifp, command, data);
		break;
	case SIOCSIFMTU:
	    {
		int max_frame_size;

		IOCTL_DEBUGOUT("ioctl rcv'd: SIOCSIFMTU (Set Interface MTU)");

		EM_CORE_LOCK(adapter);
		switch (adapter->hw.mac.type) {
		case e1000_82571:
		case e1000_82572:
		case e1000_ich9lan:
		case e1000_ich10lan:
		case e1000_pch2lan:
		case e1000_pch_lpt:
		case e1000_82574:
		case e1000_82583:
		case e1000_80003es2lan:	/* 9K Jumbo Frame size */
			max_frame_size = 9234;
			break;
		case e1000_pchlan:
			max_frame_size = 4096;
			break;
			/* Adapters that do not support jumbo frames */
		case e1000_ich8lan:
			max_frame_size = ETHER_MAX_LEN;
			break;
		default:
			max_frame_size = MAX_JUMBO_FRAME_SIZE;
		}
		if (ifr->ifr_mtu > max_frame_size - ETHER_HDR_LEN -
		    ETHER_CRC_LEN) {
			EM_CORE_UNLOCK(adapter);
			error = EINVAL;
			break;
		}

		ifp->if_mtu = ifr->ifr_mtu;
		adapter->hw.mac.max_frame_size =
		    ifp->if_mtu + ETHER_HDR_LEN + ETHER_CRC_LEN;
		em_init_locked(adapter);
		EM_CORE_UNLOCK(adapter);
		break;
	    }
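	/*
	 * Example of the MTU check above: on an 82574, max_frame_size
	 * is 9234, so the largest MTU accepted is
	 * 9234 - ETHER_HDR_LEN (14) - ETHER_CRC_LEN (4) = 9216 bytes.
	 */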
	case SIOCSIFFLAGS:
		IOCTL_DEBUGOUT("ioctl rcv'd:\
		    SIOCSIFFLAGS (Set Interface Flags)");
		EM_CORE_LOCK(adapter);
		if (ifp->if_flags & IFF_UP) {
			if ((ifp->if_drv_flags & IFF_DRV_RUNNING)) {
				if ((ifp->if_flags ^ adapter->if_flags) &
				    (IFF_PROMISC | IFF_ALLMULTI)) {
					em_disable_promisc(adapter);
					em_set_promisc(adapter);
				}
			} else
				em_init_locked(adapter);
		} else if (ifp->if_drv_flags & IFF_DRV_RUNNING)
			em_stop(adapter);
		adapter->if_flags = ifp->if_flags;
		EM_CORE_UNLOCK(adapter);
		break;
	case SIOCADDMULTI:
	case SIOCDELMULTI:
		IOCTL_DEBUGOUT("ioctl rcv'd: SIOC(ADD|DEL)MULTI");
		if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
			EM_CORE_LOCK(adapter);
			em_disable_intr(adapter);
			em_set_multi(adapter);
#ifdef DEVICE_POLLING
			if (!(ifp->if_capenable & IFCAP_POLLING))
#endif
				em_enable_intr(adapter);
			EM_CORE_UNLOCK(adapter);
		}
		break;
	case SIOCSIFMEDIA:
		/* Check SOL/IDER usage */
		EM_CORE_LOCK(adapter);
		if (e1000_check_reset_block(&adapter->hw)) {
			EM_CORE_UNLOCK(adapter);
			device_printf(adapter->dev, "Media change is"
			    " blocked due to SOL/IDER session.\n");
			break;
		}
		EM_CORE_UNLOCK(adapter);
		/* falls thru */
	case SIOCGIFMEDIA:
		IOCTL_DEBUGOUT("ioctl rcv'd: \
		    SIOCxIFMEDIA (Get/Set Interface Media)");
		error = ifmedia_ioctl(ifp, ifr, &adapter->media, command);
		break;
	case SIOCSIFCAP:
	    {
		int mask, reinit;

		IOCTL_DEBUGOUT("ioctl rcv'd: SIOCSIFCAP (Set Capabilities)");
		reinit = 0;
		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
#ifdef DEVICE_POLLING
		if (mask & IFCAP_POLLING) {
			if (ifr->ifr_reqcap & IFCAP_POLLING) {
				error = ether_poll_register(em_poll, ifp);
				if (error)
					return (error);
				EM_CORE_LOCK(adapter);
				em_disable_intr(adapter);
				ifp->if_capenable |= IFCAP_POLLING;
				EM_CORE_UNLOCK(adapter);
			} else {
				error = ether_poll_deregister(ifp);
				/* Enable interrupt even in error case */
				EM_CORE_LOCK(adapter);
				em_enable_intr(adapter);
				ifp->if_capenable &= ~IFCAP_POLLING;
				EM_CORE_UNLOCK(adapter);
			}
		}
#endif
		if (mask & IFCAP_HWCSUM) {
			ifp->if_capenable ^= IFCAP_HWCSUM;
			reinit = 1;
		}
		if (mask & IFCAP_TSO4) {
			ifp->if_capenable ^= IFCAP_TSO4;
			reinit = 1;
		}
		if (mask & IFCAP_VLAN_HWTAGGING) {
			ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
			reinit = 1;
		}
		if (mask & IFCAP_VLAN_HWFILTER) {
			ifp->if_capenable ^= IFCAP_VLAN_HWFILTER;
			reinit = 1;
		}
		if (mask & IFCAP_VLAN_HWTSO) {
			ifp->if_capenable ^= IFCAP_VLAN_HWTSO;
			reinit = 1;
		}
		if ((mask & IFCAP_WOL) &&
		    (ifp->if_capabilities & IFCAP_WOL) != 0) {
			if (mask & IFCAP_WOL_MCAST)
				ifp->if_capenable ^= IFCAP_WOL_MCAST;
			if (mask & IFCAP_WOL_MAGIC)
				ifp->if_capenable ^= IFCAP_WOL_MAGIC;
		}
		if (reinit && (ifp->if_drv_flags & IFF_DRV_RUNNING))
			em_init(adapter);
		VLAN_CAPABILITIES(ifp);
		break;
	    }

	default:
		error = ether_ioctl(ifp, command, data);
		break;
	}

	return (error);
}

/*********************************************************************
 *  Init entry point
 *
 *  This routine is used in two ways. It is used by the stack as
 *  init entry point in network interface structure. It is also used
 *  by the driver as a hw/sw initialization routine to get to a
 *  consistent state.
 *
 *  return 0 on success, positive on failure
 **********************************************************************/

static void
em_init_locked(struct adapter *adapter)
{
	struct ifnet	*ifp = adapter->ifp;
	device_t	dev = adapter->dev;

	INIT_DEBUGOUT("em_init: begin");

	EM_CORE_LOCK_ASSERT(adapter);

	em_disable_intr(adapter);
	callout_stop(&adapter->timer);

	/* Get the latest mac address, User can use a LAA */
	bcopy(IF_LLADDR(adapter->ifp), adapter->hw.mac.addr,
	    ETHER_ADDR_LEN);

	/* Put the address into the Receive Address Array */
	e1000_rar_set(&adapter->hw, adapter->hw.mac.addr, 0);

	/*
	 * With the 82571 adapter, RAR[0] may be overwritten
	 * when the other port is reset, we make a duplicate
	 * in RAR[14] for that eventuality, this assures
	 * the interface continues to function.
	 */
	if (adapter->hw.mac.type == e1000_82571) {
		e1000_set_laa_state_82571(&adapter->hw, TRUE);
		e1000_rar_set(&adapter->hw, adapter->hw.mac.addr,
		    E1000_RAR_ENTRIES - 1);
	}

	/* Initialize the hardware */
	em_reset(adapter);
	em_update_link_status(adapter);

	/* Setup VLAN support, basic and offload if available */
	E1000_WRITE_REG(&adapter->hw, E1000_VET, ETHERTYPE_VLAN);

	/* Set hardware offload abilities */
	ifp->if_hwassist = 0;
	if (ifp->if_capenable & IFCAP_TXCSUM)
		ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP);
	if (ifp->if_capenable & IFCAP_TSO4)
		ifp->if_hwassist |= CSUM_TSO;

	/* Configure for OS presence */
	em_init_manageability(adapter);

	/* Prepare transmit descriptors and buffers */
	em_setup_transmit_structures(adapter);
	em_initialize_transmit_unit(adapter);

	/* Setup Multicast table */
	em_set_multi(adapter);

	/*
	** Figure out the desired mbuf
	** pool for doing jumbos
	*/
	if (adapter->hw.mac.max_frame_size <= 2048)
		adapter->rx_mbuf_sz = MCLBYTES;
	else if (adapter->hw.mac.max_frame_size <= 4096)
		adapter->rx_mbuf_sz = MJUMPAGESIZE;
	else
		adapter->rx_mbuf_sz = MJUM9BYTES;

	/* Prepare receive descriptors and buffers */
	if (em_setup_receive_structures(adapter)) {
		device_printf(dev, "Could not setup receive structures\n");
		em_stop(adapter);
		return;
	}
	em_initialize_receive_unit(adapter);

	/* Use real VLAN Filter support? */
*/ if (ifp->if_capenable & IFCAP_VLAN_HWTAGGING) { if (ifp->if_capenable & IFCAP_VLAN_HWFILTER) /* Use real VLAN Filter support */ em_setup_vlan_hw_support(adapter); else { u32 ctrl; ctrl = E1000_READ_REG(&adapter->hw, E1000_CTRL); ctrl |= E1000_CTRL_VME; E1000_WRITE_REG(&adapter->hw, E1000_CTRL, ctrl); } } /* Don't lose promiscuous settings */ em_set_promisc(adapter); /* Set the interface as ACTIVE */ ifp->if_drv_flags |= IFF_DRV_RUNNING; ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; callout_reset(&adapter->timer, hz, em_local_timer, adapter); e1000_clear_hw_cntrs_base_generic(&adapter->hw); /* MSI/X configuration for 82574 */ if (adapter->hw.mac.type == e1000_82574) { int tmp; tmp = E1000_READ_REG(&adapter->hw, E1000_CTRL_EXT); tmp |= E1000_CTRL_EXT_PBA_CLR; E1000_WRITE_REG(&adapter->hw, E1000_CTRL_EXT, tmp); /* Set the IVAR - interrupt vector routing. */ E1000_WRITE_REG(&adapter->hw, E1000_IVAR, adapter->ivars); } #ifdef DEVICE_POLLING /* * Only enable interrupts if we are not polling, make sure * they are off otherwise. */ if (ifp->if_capenable & IFCAP_POLLING) em_disable_intr(adapter); else #endif /* DEVICE_POLLING */ em_enable_intr(adapter); /* AMT based hardware can now take control from firmware */ if (adapter->has_manage && adapter->has_amt) em_get_hw_control(adapter); } static void em_init(void *arg) { struct adapter *adapter = arg; EM_CORE_LOCK(adapter); em_init_locked(adapter); EM_CORE_UNLOCK(adapter); } #ifdef DEVICE_POLLING /********************************************************************* * * Legacy polling routine: note this only works with single queue * *********************************************************************/ static int em_poll(struct ifnet *ifp, enum poll_cmd cmd, int count) { struct adapter *adapter = ifp->if_softc; struct tx_ring *txr = adapter->tx_rings; struct rx_ring *rxr = adapter->rx_rings; u32 reg_icr; int rx_done; EM_CORE_LOCK(adapter); if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) { EM_CORE_UNLOCK(adapter); return (0); } if (cmd == POLL_AND_CHECK_STATUS) { reg_icr = E1000_READ_REG(&adapter->hw, E1000_ICR); if (reg_icr & (E1000_ICR_RXSEQ | E1000_ICR_LSC)) { callout_stop(&adapter->timer); adapter->hw.mac.get_link_status = 1; em_update_link_status(adapter); callout_reset(&adapter->timer, hz, em_local_timer, adapter); } } EM_CORE_UNLOCK(adapter); em_rxeof(rxr, count, &rx_done); EM_TX_LOCK(txr); em_txeof(txr); #ifdef EM_MULTIQUEUE if (!drbr_empty(ifp, txr->br)) em_mq_start_locked(ifp, txr, NULL); #else if (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) em_start_locked(ifp, txr); #endif EM_TX_UNLOCK(txr); return (rx_done); } #endif /* DEVICE_POLLING */ /********************************************************************* * * Fast Legacy/MSI Combined Interrupt Service routine * *********************************************************************/ static int em_irq_fast(void *arg) { struct adapter *adapter = arg; struct ifnet *ifp; u32 reg_icr; ifp = adapter->ifp; reg_icr = E1000_READ_REG(&adapter->hw, E1000_ICR); /* Hot eject? */ if (reg_icr == 0xffffffff) return FILTER_STRAY; /* Definitely not our interrupt. */ if (reg_icr == 0x0) return FILTER_STRAY; /* * Starting with the 82571 chip, bit 31 should be used to * determine whether the interrupt belongs to us. 
*/ if (adapter->hw.mac.type >= e1000_82571 && (reg_icr & E1000_ICR_INT_ASSERTED) == 0) return FILTER_STRAY; em_disable_intr(adapter); taskqueue_enqueue(adapter->tq, &adapter->que_task); /* Link status change */ if (reg_icr & (E1000_ICR_RXSEQ | E1000_ICR_LSC)) { adapter->hw.mac.get_link_status = 1; taskqueue_enqueue(taskqueue_fast, &adapter->link_task); } if (reg_icr & E1000_ICR_RXO) adapter->rx_overruns++; return FILTER_HANDLED; } /* Combined RX/TX handler, used by Legacy and MSI */ static void em_handle_que(void *context, int pending) { struct adapter *adapter = context; struct ifnet *ifp = adapter->ifp; struct tx_ring *txr = adapter->tx_rings; struct rx_ring *rxr = adapter->rx_rings; if (ifp->if_drv_flags & IFF_DRV_RUNNING) { bool more = em_rxeof(rxr, adapter->rx_process_limit, NULL); EM_TX_LOCK(txr); em_txeof(txr); #ifdef EM_MULTIQUEUE if (!drbr_empty(ifp, txr->br)) em_mq_start_locked(ifp, txr, NULL); #else if (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) em_start_locked(ifp, txr); #endif EM_TX_UNLOCK(txr); if (more) { taskqueue_enqueue(adapter->tq, &adapter->que_task); return; } } em_enable_intr(adapter); return; } /********************************************************************* * * MSIX Interrupt Service Routines * **********************************************************************/ static void em_msix_tx(void *arg) { struct tx_ring *txr = arg; struct adapter *adapter = txr->adapter; struct ifnet *ifp = adapter->ifp; ++txr->tx_irq; EM_TX_LOCK(txr); em_txeof(txr); #ifdef EM_MULTIQUEUE if (!drbr_empty(ifp, txr->br)) em_mq_start_locked(ifp, txr, NULL); #else if (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) em_start_locked(ifp, txr); #endif /* Reenable this interrupt */ E1000_WRITE_REG(&adapter->hw, E1000_IMS, txr->ims); EM_TX_UNLOCK(txr); return; } /********************************************************************* * * MSIX RX Interrupt Service routine * **********************************************************************/ static void em_msix_rx(void *arg) { struct rx_ring *rxr = arg; struct adapter *adapter = rxr->adapter; bool more; ++rxr->rx_irq; if (!(adapter->ifp->if_drv_flags & IFF_DRV_RUNNING)) return; more = em_rxeof(rxr, adapter->rx_process_limit, NULL); if (more) taskqueue_enqueue(rxr->tq, &rxr->rx_task); else /* Reenable this interrupt */ E1000_WRITE_REG(&adapter->hw, E1000_IMS, rxr->ims); return; } /********************************************************************* * * MSIX Link Fast Interrupt Service routine * **********************************************************************/ static void em_msix_link(void *arg) { struct adapter *adapter = arg; u32 reg_icr; ++adapter->link_irq; reg_icr = E1000_READ_REG(&adapter->hw, E1000_ICR); if (reg_icr & (E1000_ICR_RXSEQ | E1000_ICR_LSC)) { adapter->hw.mac.get_link_status = 1; em_handle_link(adapter, 0); } else E1000_WRITE_REG(&adapter->hw, E1000_IMS, EM_MSIX_LINK | E1000_IMS_LSC); return; } static void em_handle_rx(void *context, int pending) { struct rx_ring *rxr = context; struct adapter *adapter = rxr->adapter; bool more; more = em_rxeof(rxr, adapter->rx_process_limit, NULL); if (more) taskqueue_enqueue(rxr->tq, &rxr->rx_task); else /* Reenable this interrupt */ E1000_WRITE_REG(&adapter->hw, E1000_IMS, rxr->ims); } static void em_handle_tx(void *context, int pending) { struct tx_ring *txr = context; struct adapter *adapter = txr->adapter; struct ifnet *ifp = adapter->ifp; EM_TX_LOCK(txr); em_txeof(txr); #ifdef EM_MULTIQUEUE if (!drbr_empty(ifp, txr->br)) em_mq_start_locked(ifp, txr, NULL); #else if (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) 
em_start_locked(ifp, txr); #endif E1000_WRITE_REG(&adapter->hw, E1000_IMS, txr->ims); EM_TX_UNLOCK(txr); } static void em_handle_link(void *context, int pending) { struct adapter *adapter = context; struct tx_ring *txr = adapter->tx_rings; struct ifnet *ifp = adapter->ifp; if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) return; EM_CORE_LOCK(adapter); callout_stop(&adapter->timer); em_update_link_status(adapter); callout_reset(&adapter->timer, hz, em_local_timer, adapter); E1000_WRITE_REG(&adapter->hw, E1000_IMS, EM_MSIX_LINK | E1000_IMS_LSC); if (adapter->link_active) { for (int i = 0; i < adapter->num_queues; i++, txr++) { EM_TX_LOCK(txr); #ifdef EM_MULTIQUEUE if (!drbr_empty(ifp, txr->br)) em_mq_start_locked(ifp, txr, NULL); #else if (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) em_start_locked(ifp, txr); #endif EM_TX_UNLOCK(txr); } } EM_CORE_UNLOCK(adapter); } /********************************************************************* * * Media Ioctl callback * * This routine is called whenever the user queries the status of * the interface using ifconfig. * **********************************************************************/ static void em_media_status(struct ifnet *ifp, struct ifmediareq *ifmr) { struct adapter *adapter = ifp->if_softc; u_char fiber_type = IFM_1000_SX; INIT_DEBUGOUT("em_media_status: begin"); EM_CORE_LOCK(adapter); em_update_link_status(adapter); ifmr->ifm_status = IFM_AVALID; ifmr->ifm_active = IFM_ETHER; if (!adapter->link_active) { EM_CORE_UNLOCK(adapter); return; } ifmr->ifm_status |= IFM_ACTIVE; if ((adapter->hw.phy.media_type == e1000_media_type_fiber) || (adapter->hw.phy.media_type == e1000_media_type_internal_serdes)) { ifmr->ifm_active |= fiber_type | IFM_FDX; } else { switch (adapter->link_speed) { case 10: ifmr->ifm_active |= IFM_10_T; break; case 100: ifmr->ifm_active |= IFM_100_TX; break; case 1000: ifmr->ifm_active |= IFM_1000_T; break; } if (adapter->link_duplex == FULL_DUPLEX) ifmr->ifm_active |= IFM_FDX; else ifmr->ifm_active |= IFM_HDX; } EM_CORE_UNLOCK(adapter); } /********************************************************************* * * Media Ioctl callback * * This routine is called when the user changes speed/duplex using * media/mediopt option with ifconfig. 
 *
 **********************************************************************/
static int
em_media_change(struct ifnet *ifp)
{
	struct adapter	*adapter = ifp->if_softc;
	struct ifmedia	*ifm = &adapter->media;

	INIT_DEBUGOUT("em_media_change: begin");

	if (IFM_TYPE(ifm->ifm_media) != IFM_ETHER)
		return (EINVAL);

	EM_CORE_LOCK(adapter);
	switch (IFM_SUBTYPE(ifm->ifm_media)) {
	case IFM_AUTO:
		adapter->hw.mac.autoneg = DO_AUTO_NEG;
		adapter->hw.phy.autoneg_advertised = AUTONEG_ADV_DEFAULT;
		break;
	case IFM_1000_LX:
	case IFM_1000_SX:
	case IFM_1000_T:
		adapter->hw.mac.autoneg = DO_AUTO_NEG;
		adapter->hw.phy.autoneg_advertised = ADVERTISE_1000_FULL;
		break;
	case IFM_100_TX:
		adapter->hw.mac.autoneg = FALSE;
		adapter->hw.phy.autoneg_advertised = 0;
		if ((ifm->ifm_media & IFM_GMASK) == IFM_FDX)
			adapter->hw.mac.forced_speed_duplex = ADVERTISE_100_FULL;
		else
			adapter->hw.mac.forced_speed_duplex = ADVERTISE_100_HALF;
		break;
	case IFM_10_T:
		adapter->hw.mac.autoneg = FALSE;
		adapter->hw.phy.autoneg_advertised = 0;
		if ((ifm->ifm_media & IFM_GMASK) == IFM_FDX)
			adapter->hw.mac.forced_speed_duplex = ADVERTISE_10_FULL;
		else
			adapter->hw.mac.forced_speed_duplex = ADVERTISE_10_HALF;
		break;
	default:
		device_printf(adapter->dev, "Unsupported media type\n");
	}

	em_init_locked(adapter);
	EM_CORE_UNLOCK(adapter);

	return (0);
}

/*********************************************************************
 *
 *  This routine maps the mbufs to tx descriptors.
 *
 *  return 0 on success, positive on failure
 **********************************************************************/
static int
em_xmit(struct tx_ring *txr, struct mbuf **m_headp)
{
	struct adapter		*adapter = txr->adapter;
	bus_dma_segment_t	segs[EM_MAX_SCATTER];
	bus_dmamap_t		map;
	struct em_buffer	*tx_buffer, *tx_buffer_mapped;
	struct e1000_tx_desc	*ctxd = NULL;
	struct mbuf		*m_head;
	struct ether_header	*eh;
	struct ip		*ip = NULL;
	struct tcphdr		*tp = NULL;
	u32			txd_upper, txd_lower, txd_used, txd_saved;
	int			ip_off, poff;
	int			nsegs, i, j, first, last = 0;
	int			error, do_tso, tso_desc = 0, remap = 1;

retry:
	m_head = *m_headp;
	txd_upper = txd_lower = txd_used = txd_saved = 0;
	do_tso = ((m_head->m_pkthdr.csum_flags & CSUM_TSO) != 0);
	ip_off = poff = 0;

	/*
	 * Intel recommends entire IP/TCP header length reside in a single
	 * buffer. If multiple descriptors are used to describe the IP and
	 * TCP header, each descriptor should describe one or more
	 * complete headers; descriptors referencing only parts of headers
	 * are not supported. If all layer headers are not coalesced into
	 * a single buffer, each buffer should not cross a 4KB boundary,
	 * or be larger than the maximum read request size.
	 * The controller also requires modifying the IP/TCP header to make
	 * TSO work, so we first get a writable mbuf chain and then coalesce
	 * the ethernet/IP/TCP header into a single buffer to meet the
	 * requirement of the controller.
	 * This also simplifies IP/TCP/UDP checksum offloading, which has
	 * similar restrictions.
	 */
	if (do_tso || m_head->m_pkthdr.csum_flags & CSUM_OFFLOAD) {
		if (do_tso || (m_head->m_next != NULL &&
		    m_head->m_pkthdr.csum_flags & CSUM_OFFLOAD)) {
			if (M_WRITABLE(*m_headp) == 0) {
				m_head = m_dup(*m_headp, M_NOWAIT);
				m_freem(*m_headp);
				if (m_head == NULL) {
					*m_headp = NULL;
					return (ENOBUFS);
				}
				*m_headp = m_head;
			}
		}
		/*
		 * XXX
		 * Assume IPv4, we don't have TSO/checksum offload support
		 * for IPv6 yet.
		 */
		ip_off = sizeof(struct ether_header);
		m_head = m_pullup(m_head, ip_off);
		if (m_head == NULL) {
			*m_headp = NULL;
			return (ENOBUFS);
		}
		eh = mtod(m_head, struct ether_header *);
		if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
			ip_off = sizeof(struct ether_vlan_header);
			m_head = m_pullup(m_head, ip_off);
			if (m_head == NULL) {
				*m_headp = NULL;
				return (ENOBUFS);
			}
		}
		m_head = m_pullup(m_head, ip_off + sizeof(struct ip));
		if (m_head == NULL) {
			*m_headp = NULL;
			return (ENOBUFS);
		}
		ip = (struct ip *)(mtod(m_head, char *) + ip_off);
		poff = ip_off + (ip->ip_hl << 2);
		if (do_tso) {
			m_head = m_pullup(m_head, poff + sizeof(struct tcphdr));
			if (m_head == NULL) {
				*m_headp = NULL;
				return (ENOBUFS);
			}
			tp = (struct tcphdr *)(mtod(m_head, char *) + poff);
			/*
			 * TSO workaround:
			 *   pull 4 more bytes of data into it.
			 */
			m_head = m_pullup(m_head, poff + (tp->th_off << 2) + 4);
			if (m_head == NULL) {
				*m_headp = NULL;
				return (ENOBUFS);
			}
			ip = (struct ip *)(mtod(m_head, char *) + ip_off);
			ip->ip_len = 0;
			ip->ip_sum = 0;
			/*
			 * The TCP pseudo checksum does not include the TCP
			 * payload length, so the driver should recompute the
			 * checksum here to match what the hardware expects
			 * to see.  This follows Microsoft's Large Send
			 * specification.
			 */
			tp = (struct tcphdr *)(mtod(m_head, char *) + poff);
			tp->th_sum = in_pseudo(ip->ip_src.s_addr,
			    ip->ip_dst.s_addr, htons(IPPROTO_TCP));
		} else if (m_head->m_pkthdr.csum_flags & CSUM_TCP) {
			m_head = m_pullup(m_head, poff + sizeof(struct tcphdr));
			if (m_head == NULL) {
				*m_headp = NULL;
				return (ENOBUFS);
			}
			tp = (struct tcphdr *)(mtod(m_head, char *) + poff);
			m_head = m_pullup(m_head, poff + (tp->th_off << 2));
			if (m_head == NULL) {
				*m_headp = NULL;
				return (ENOBUFS);
			}
			ip = (struct ip *)(mtod(m_head, char *) + ip_off);
			tp = (struct tcphdr *)(mtod(m_head, char *) + poff);
		} else if (m_head->m_pkthdr.csum_flags & CSUM_UDP) {
			m_head = m_pullup(m_head, poff + sizeof(struct udphdr));
			if (m_head == NULL) {
				*m_headp = NULL;
				return (ENOBUFS);
			}
			ip = (struct ip *)(mtod(m_head, char *) + ip_off);
		}
		*m_headp = m_head;
	}

	/*
	 * Map the packet for DMA
	 *
	 * Capture the first descriptor index,
	 * this descriptor will have the index
	 * of the EOP which is the only one that
	 * now gets a DONE bit writeback.
	 */
	first = txr->next_avail_desc;
	tx_buffer = &txr->tx_buffers[first];
	tx_buffer_mapped = tx_buffer;
	map = tx_buffer->map;

	error = bus_dmamap_load_mbuf_sg(txr->txtag, map,
	    *m_headp, segs, &nsegs, BUS_DMA_NOWAIT);

	/*
	 * There are two types of errors we can (try) to handle:
	 * - EFBIG means the mbuf chain was too long and bus_dma ran
	 *   out of segments.  Defragment the mbuf chain and try again.
	 * - ENOMEM means bus_dma could not obtain enough bounce buffers
	 *   at this point in time.  Defer sending and try again later.
	 * All other errors, in particular EINVAL, are fatal and prevent the
	 * mbuf chain from ever going through.  Drop it and report error.
	 */
	if (error == EFBIG && remap) {
		struct mbuf *m;

		m = m_defrag(*m_headp, M_NOWAIT);
		if (m == NULL) {
			adapter->mbuf_alloc_failed++;
			m_freem(*m_headp);
			*m_headp = NULL;
			return (ENOBUFS);
		}
		*m_headp = m;

		/* Try it again, but only once */
		remap = 0;
		goto retry;
	} else if (error == ENOMEM) {
		adapter->no_tx_dma_setup++;
		return (error);
	} else if (error != 0) {
		adapter->no_tx_dma_setup++;
		m_freem(*m_headp);
		*m_headp = NULL;
		return (error);
	}

	/*
	 * TSO Hardware workaround, if this packet is not
	 * TSO, and is only a single descriptor long, and
	 * it follows a TSO burst, then we need to add a
	 * sentinel descriptor to prevent premature writeback.
*/ if ((do_tso == 0) && (txr->tx_tso == TRUE)) { if (nsegs == 1) tso_desc = TRUE; txr->tx_tso = FALSE; } if (nsegs > (txr->tx_avail - 2)) { txr->no_desc_avail++; bus_dmamap_unload(txr->txtag, map); return (ENOBUFS); } m_head = *m_headp; /* Do hardware assists */ if (m_head->m_pkthdr.csum_flags & CSUM_TSO) { em_tso_setup(txr, m_head, ip_off, ip, tp, &txd_upper, &txd_lower); /* we need to make a final sentinel transmit desc */ tso_desc = TRUE; } else if (m_head->m_pkthdr.csum_flags & CSUM_OFFLOAD) em_transmit_checksum_setup(txr, m_head, ip_off, ip, &txd_upper, &txd_lower); if (m_head->m_flags & M_VLANTAG) { /* Set the vlan id. */ txd_upper |= (htole16(m_head->m_pkthdr.ether_vtag) << 16); /* Tell hardware to add tag */ txd_lower |= htole32(E1000_TXD_CMD_VLE); } i = txr->next_avail_desc; /* Set up our transmit descriptors */ for (j = 0; j < nsegs; j++) { bus_size_t seg_len; bus_addr_t seg_addr; tx_buffer = &txr->tx_buffers[i]; ctxd = &txr->tx_base[i]; seg_addr = segs[j].ds_addr; seg_len = segs[j].ds_len; /* ** TSO Workaround: ** If this is the last descriptor, we want to ** split it so we have a small final sentinel */ if (tso_desc && (j == (nsegs -1)) && (seg_len > 8)) { seg_len -= 4; ctxd->buffer_addr = htole64(seg_addr); ctxd->lower.data = htole32( adapter->txd_cmd | txd_lower | seg_len); ctxd->upper.data = htole32(txd_upper); if (++i == adapter->num_tx_desc) i = 0; /* Now make the sentinel */ ++txd_used; /* using an extra txd */ ctxd = &txr->tx_base[i]; tx_buffer = &txr->tx_buffers[i]; ctxd->buffer_addr = htole64(seg_addr + seg_len); ctxd->lower.data = htole32( adapter->txd_cmd | txd_lower | 4); ctxd->upper.data = htole32(txd_upper); last = i; if (++i == adapter->num_tx_desc) i = 0; } else { ctxd->buffer_addr = htole64(seg_addr); ctxd->lower.data = htole32( adapter->txd_cmd | txd_lower | seg_len); ctxd->upper.data = htole32(txd_upper); last = i; if (++i == adapter->num_tx_desc) i = 0; } tx_buffer->m_head = NULL; tx_buffer->next_eop = -1; } txr->next_avail_desc = i; txr->tx_avail -= nsegs; if (tso_desc) /* TSO used an extra for sentinel */ txr->tx_avail -= txd_used; tx_buffer->m_head = m_head; /* ** Here we swap the map so the last descriptor, ** which gets the completion interrupt has the ** real map, and the first descriptor gets the ** unused map from this descriptor. */ tx_buffer_mapped->map = tx_buffer->map; tx_buffer->map = map; bus_dmamap_sync(txr->txtag, map, BUS_DMASYNC_PREWRITE); /* * Last Descriptor of Packet * needs End Of Packet (EOP) * and Report Status (RS) */ ctxd->lower.data |= htole32(E1000_TXD_CMD_EOP | E1000_TXD_CMD_RS); /* * Keep track in the first buffer which * descriptor will be written back */ tx_buffer = &txr->tx_buffers[first]; tx_buffer->next_eop = last; /* Update the watchdog time early and often */ txr->watchdog_time = ticks; /* * Advance the Transmit Descriptor Tail (TDT), this tells the E1000 * that this frame is available to transmit. 
*/ bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); E1000_WRITE_REG(&adapter->hw, E1000_TDT(txr->me), i); return (0); } static void em_set_promisc(struct adapter *adapter) { struct ifnet *ifp = adapter->ifp; u32 reg_rctl; reg_rctl = E1000_READ_REG(&adapter->hw, E1000_RCTL); if (ifp->if_flags & IFF_PROMISC) { reg_rctl |= (E1000_RCTL_UPE | E1000_RCTL_MPE); /* Turn this on if you want to see bad packets */ if (em_debug_sbp) reg_rctl |= E1000_RCTL_SBP; E1000_WRITE_REG(&adapter->hw, E1000_RCTL, reg_rctl); } else if (ifp->if_flags & IFF_ALLMULTI) { reg_rctl |= E1000_RCTL_MPE; reg_rctl &= ~E1000_RCTL_UPE; E1000_WRITE_REG(&adapter->hw, E1000_RCTL, reg_rctl); } } static void em_disable_promisc(struct adapter *adapter) { struct ifnet *ifp = adapter->ifp; u32 reg_rctl; int mcnt = 0; reg_rctl = E1000_READ_REG(&adapter->hw, E1000_RCTL); reg_rctl &= (~E1000_RCTL_UPE); if (ifp->if_flags & IFF_ALLMULTI) mcnt = MAX_NUM_MULTICAST_ADDRESSES; else { struct ifmultiaddr *ifma; #if __FreeBSD_version < 800000 IF_ADDR_LOCK(ifp); #else if_maddr_rlock(ifp); #endif TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_LINK) continue; if (mcnt == MAX_NUM_MULTICAST_ADDRESSES) break; mcnt++; } #if __FreeBSD_version < 800000 IF_ADDR_UNLOCK(ifp); #else if_maddr_runlock(ifp); #endif } /* Don't disable if in MAX groups */ if (mcnt < MAX_NUM_MULTICAST_ADDRESSES) reg_rctl &= (~E1000_RCTL_MPE); reg_rctl &= (~E1000_RCTL_SBP); E1000_WRITE_REG(&adapter->hw, E1000_RCTL, reg_rctl); } /********************************************************************* * Multicast Update * * This routine is called whenever multicast address list is updated. * **********************************************************************/ static void em_set_multi(struct adapter *adapter) { struct ifnet *ifp = adapter->ifp; struct ifmultiaddr *ifma; u32 reg_rctl = 0; u8 *mta; /* Multicast array memory */ int mcnt = 0; IOCTL_DEBUGOUT("em_set_multi: begin"); mta = adapter->mta; bzero(mta, sizeof(u8) * ETH_ADDR_LEN * MAX_NUM_MULTICAST_ADDRESSES); if (adapter->hw.mac.type == e1000_82542 && adapter->hw.revision_id == E1000_REVISION_2) { reg_rctl = E1000_READ_REG(&adapter->hw, E1000_RCTL); if (adapter->hw.bus.pci_cmd_word & CMD_MEM_WRT_INVALIDATE) e1000_pci_clear_mwi(&adapter->hw); reg_rctl |= E1000_RCTL_RST; E1000_WRITE_REG(&adapter->hw, E1000_RCTL, reg_rctl); msec_delay(5); } #if __FreeBSD_version < 800000 IF_ADDR_LOCK(ifp); #else if_maddr_rlock(ifp); #endif TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_LINK) continue; if (mcnt == MAX_NUM_MULTICAST_ADDRESSES) break; bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr), &mta[mcnt * ETH_ADDR_LEN], ETH_ADDR_LEN); mcnt++; } #if __FreeBSD_version < 800000 IF_ADDR_UNLOCK(ifp); #else if_maddr_runlock(ifp); #endif if (mcnt >= MAX_NUM_MULTICAST_ADDRESSES) { reg_rctl = E1000_READ_REG(&adapter->hw, E1000_RCTL); reg_rctl |= E1000_RCTL_MPE; E1000_WRITE_REG(&adapter->hw, E1000_RCTL, reg_rctl); } else e1000_update_mc_addr_list(&adapter->hw, mta, mcnt); if (adapter->hw.mac.type == e1000_82542 && adapter->hw.revision_id == E1000_REVISION_2) { reg_rctl = E1000_READ_REG(&adapter->hw, E1000_RCTL); reg_rctl &= ~E1000_RCTL_RST; E1000_WRITE_REG(&adapter->hw, E1000_RCTL, reg_rctl); msec_delay(5); if (adapter->hw.bus.pci_cmd_word & CMD_MEM_WRT_INVALIDATE) e1000_pci_set_mwi(&adapter->hw); } } /********************************************************************* * Timer routine * * This routine checks for 
link status and updates statistics. * **********************************************************************/ static void em_local_timer(void *arg) { struct adapter *adapter = arg; struct ifnet *ifp = adapter->ifp; struct tx_ring *txr = adapter->tx_rings; struct rx_ring *rxr = adapter->rx_rings; u32 trigger; EM_CORE_LOCK_ASSERT(adapter); em_update_link_status(adapter); em_update_stats_counters(adapter); /* Reset LAA into RAR[0] on 82571 */ if ((adapter->hw.mac.type == e1000_82571) && e1000_get_laa_state_82571(&adapter->hw)) e1000_rar_set(&adapter->hw, adapter->hw.mac.addr, 0); /* Mask to use in the irq trigger */ if (adapter->msix_mem) trigger = rxr->ims; else trigger = E1000_ICS_RXDMT0; /* ** Check on the state of the TX queue(s), this ** can be done without the lock because its RO ** and the HUNG state will be static if set. */ for (int i = 0; i < adapter->num_queues; i++, txr++) { if ((txr->queue_status == EM_QUEUE_HUNG) && (adapter->pause_frames == 0)) goto hung; /* Schedule a TX tasklet if needed */ if (txr->tx_avail <= EM_MAX_SCATTER) taskqueue_enqueue(txr->tq, &txr->tx_task); } adapter->pause_frames = 0; callout_reset(&adapter->timer, hz, em_local_timer, adapter); #ifndef DEVICE_POLLING /* Trigger an RX interrupt to guarantee mbuf refresh */ E1000_WRITE_REG(&adapter->hw, E1000_ICS, trigger); #endif return; hung: /* Looks like we're hung */ device_printf(adapter->dev, "Watchdog timeout -- resetting\n"); device_printf(adapter->dev, "Queue(%d) tdh = %d, hw tdt = %d\n", txr->me, E1000_READ_REG(&adapter->hw, E1000_TDH(txr->me)), E1000_READ_REG(&adapter->hw, E1000_TDT(txr->me))); device_printf(adapter->dev,"TX(%d) desc avail = %d," "Next TX to Clean = %d\n", txr->me, txr->tx_avail, txr->next_to_clean); ifp->if_drv_flags &= ~IFF_DRV_RUNNING; adapter->watchdog_events++; adapter->pause_frames = 0; em_init_locked(adapter); } static void em_update_link_status(struct adapter *adapter) { struct e1000_hw *hw = &adapter->hw; struct ifnet *ifp = adapter->ifp; device_t dev = adapter->dev; struct tx_ring *txr = adapter->tx_rings; u32 link_check = 0; /* Get the cached link value or read phy for real */ switch (hw->phy.media_type) { case e1000_media_type_copper: if (hw->mac.get_link_status) { /* Do the work to read phy */ e1000_check_for_link(hw); link_check = !hw->mac.get_link_status; if (link_check) /* ESB2 fix */ e1000_cfg_on_link_up(hw); } else link_check = TRUE; break; case e1000_media_type_fiber: e1000_check_for_link(hw); link_check = (E1000_READ_REG(hw, E1000_STATUS) & E1000_STATUS_LU); break; case e1000_media_type_internal_serdes: e1000_check_for_link(hw); link_check = adapter->hw.mac.serdes_has_link; break; default: case e1000_media_type_unknown: break; } /* Now check for a transition */ if (link_check && (adapter->link_active == 0)) { e1000_get_speed_and_duplex(hw, &adapter->link_speed, &adapter->link_duplex); /* Check if we must disable SPEED_MODE bit on PCI-E */ if ((adapter->link_speed != SPEED_1000) && ((hw->mac.type == e1000_82571) || (hw->mac.type == e1000_82572))) { int tarc0; tarc0 = E1000_READ_REG(hw, E1000_TARC(0)); tarc0 &= ~SPEED_MODE_BIT; E1000_WRITE_REG(hw, E1000_TARC(0), tarc0); } if (bootverbose) device_printf(dev, "Link is up %d Mbps %s\n", adapter->link_speed, ((adapter->link_duplex == FULL_DUPLEX) ? 
"Full Duplex" : "Half Duplex")); adapter->link_active = 1; adapter->smartspeed = 0; ifp->if_baudrate = adapter->link_speed * 1000000; if_link_state_change(ifp, LINK_STATE_UP); } else if (!link_check && (adapter->link_active == 1)) { ifp->if_baudrate = adapter->link_speed = 0; adapter->link_duplex = 0; if (bootverbose) device_printf(dev, "Link is Down\n"); adapter->link_active = 0; /* Link down, disable watchdog */ for (int i = 0; i < adapter->num_queues; i++, txr++) txr->queue_status = EM_QUEUE_IDLE; if_link_state_change(ifp, LINK_STATE_DOWN); } } /********************************************************************* * * This routine disables all traffic on the adapter by issuing a * global reset on the MAC and deallocates TX/RX buffers. * * This routine should always be called with BOTH the CORE * and TX locks. **********************************************************************/ static void em_stop(void *arg) { struct adapter *adapter = arg; struct ifnet *ifp = adapter->ifp; struct tx_ring *txr = adapter->tx_rings; EM_CORE_LOCK_ASSERT(adapter); INIT_DEBUGOUT("em_stop: begin"); em_disable_intr(adapter); callout_stop(&adapter->timer); /* Tell the stack that the interface is no longer active */ ifp->if_drv_flags &= ~IFF_DRV_RUNNING; ifp->if_drv_flags |= IFF_DRV_OACTIVE; /* Unarm watchdog timer. */ for (int i = 0; i < adapter->num_queues; i++, txr++) { EM_TX_LOCK(txr); txr->queue_status = EM_QUEUE_IDLE; EM_TX_UNLOCK(txr); } e1000_reset_hw(&adapter->hw); E1000_WRITE_REG(&adapter->hw, E1000_WUC, 0); e1000_led_off(&adapter->hw); e1000_cleanup_led(&adapter->hw); } /********************************************************************* * * Determine hardware revision. * **********************************************************************/ static void em_identify_hardware(struct adapter *adapter) { device_t dev = adapter->dev; /* Make sure our PCI config space has the necessary stuff set */ pci_enable_busmaster(dev); adapter->hw.bus.pci_cmd_word = pci_read_config(dev, PCIR_COMMAND, 2); /* Save off the information about this board */ adapter->hw.vendor_id = pci_get_vendor(dev); adapter->hw.device_id = pci_get_device(dev); adapter->hw.revision_id = pci_read_config(dev, PCIR_REVID, 1); adapter->hw.subsystem_vendor_id = pci_read_config(dev, PCIR_SUBVEND_0, 2); adapter->hw.subsystem_device_id = pci_read_config(dev, PCIR_SUBDEV_0, 2); /* Do Shared Code Init and Setup */ if (e1000_set_mac_type(&adapter->hw)) { device_printf(dev, "Setup init failure\n"); return; } } static int em_allocate_pci_resources(struct adapter *adapter) { device_t dev = adapter->dev; int rid; rid = PCIR_BAR(0); adapter->memory = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid, RF_ACTIVE); if (adapter->memory == NULL) { device_printf(dev, "Unable to allocate bus resource: memory\n"); return (ENXIO); } adapter->osdep.mem_bus_space_tag = rman_get_bustag(adapter->memory); adapter->osdep.mem_bus_space_handle = rman_get_bushandle(adapter->memory); adapter->hw.hw_addr = (u8 *)&adapter->osdep.mem_bus_space_handle; /* Default to a single queue */ adapter->num_queues = 1; /* * Setup MSI/X or MSI if PCI Express */ adapter->msix = em_setup_msix(adapter); adapter->hw.back = &adapter->osdep; return (0); } /********************************************************************* * * Setup the Legacy or MSI Interrupt handler * **********************************************************************/ int em_allocate_legacy(struct adapter *adapter) { device_t dev = adapter->dev; struct tx_ring *txr = adapter->tx_rings; int error, rid = 0; /* Manually 
turn off all interrupts */ E1000_WRITE_REG(&adapter->hw, E1000_IMC, 0xffffffff); if (adapter->msix == 1) /* using MSI */ rid = 1; /* We allocate a single interrupt resource */ adapter->res = bus_alloc_resource_any(dev, SYS_RES_IRQ, &rid, RF_SHAREABLE | RF_ACTIVE); if (adapter->res == NULL) { device_printf(dev, "Unable to allocate bus resource: " "interrupt\n"); return (ENXIO); } /* * Allocate a fast interrupt and the associated * deferred processing contexts. */ TASK_INIT(&adapter->que_task, 0, em_handle_que, adapter); adapter->tq = taskqueue_create_fast("em_taskq", M_NOWAIT, taskqueue_thread_enqueue, &adapter->tq); taskqueue_start_threads(&adapter->tq, 1, PI_NET, "%s que", device_get_nameunit(adapter->dev)); /* Use a TX only tasklet for local timer */ TASK_INIT(&txr->tx_task, 0, em_handle_tx, txr); txr->tq = taskqueue_create_fast("em_txq", M_NOWAIT, taskqueue_thread_enqueue, &txr->tq); taskqueue_start_threads(&txr->tq, 1, PI_NET, "%s txq", device_get_nameunit(adapter->dev)); TASK_INIT(&adapter->link_task, 0, em_handle_link, adapter); if ((error = bus_setup_intr(dev, adapter->res, INTR_TYPE_NET, em_irq_fast, NULL, adapter, &adapter->tag)) != 0) { device_printf(dev, "Failed to register fast interrupt " "handler: %d\n", error); taskqueue_free(adapter->tq); adapter->tq = NULL; return (error); } return (0); } /********************************************************************* * * Setup the MSIX Interrupt handlers * This is not really Multiqueue, rather * its just seperate interrupt vectors * for TX, RX, and Link. * **********************************************************************/ int em_allocate_msix(struct adapter *adapter) { device_t dev = adapter->dev; struct tx_ring *txr = adapter->tx_rings; struct rx_ring *rxr = adapter->rx_rings; int error, rid, vector = 0; /* Make sure all interrupts are disabled */ E1000_WRITE_REG(&adapter->hw, E1000_IMC, 0xffffffff); /* First set up ring resources */ for (int i = 0; i < adapter->num_queues; i++, txr++, rxr++) { /* RX ring */ rid = vector + 1; rxr->res = bus_alloc_resource_any(dev, SYS_RES_IRQ, &rid, RF_ACTIVE); if (rxr->res == NULL) { device_printf(dev, "Unable to allocate bus resource: " "RX MSIX Interrupt %d\n", i); return (ENXIO); } if ((error = bus_setup_intr(dev, rxr->res, INTR_TYPE_NET | INTR_MPSAFE, NULL, em_msix_rx, rxr, &rxr->tag)) != 0) { device_printf(dev, "Failed to register RX handler"); return (error); } #if __FreeBSD_version >= 800504 bus_describe_intr(dev, rxr->res, rxr->tag, "rx %d", i); #endif rxr->msix = vector++; /* NOTE increment vector for TX */ TASK_INIT(&rxr->rx_task, 0, em_handle_rx, rxr); rxr->tq = taskqueue_create_fast("em_rxq", M_NOWAIT, taskqueue_thread_enqueue, &rxr->tq); taskqueue_start_threads(&rxr->tq, 1, PI_NET, "%s rxq", device_get_nameunit(adapter->dev)); /* ** Set the bit to enable interrupt ** in E1000_IMS -- bits 20 and 21 ** are for RX0 and RX1, note this has ** NOTHING to do with the MSIX vector */ rxr->ims = 1 << (20 + i); adapter->ivars |= (8 | rxr->msix) << (i * 4); /* TX ring */ rid = vector + 1; txr->res = bus_alloc_resource_any(dev, SYS_RES_IRQ, &rid, RF_ACTIVE); if (txr->res == NULL) { device_printf(dev, "Unable to allocate bus resource: " "TX MSIX Interrupt %d\n", i); return (ENXIO); } if ((error = bus_setup_intr(dev, txr->res, INTR_TYPE_NET | INTR_MPSAFE, NULL, em_msix_tx, txr, &txr->tag)) != 0) { device_printf(dev, "Failed to register TX handler"); return (error); } #if __FreeBSD_version >= 800504 bus_describe_intr(dev, txr->res, txr->tag, "tx %d", i); #endif txr->msix = vector++; /* 
Increment vector for next pass */ TASK_INIT(&txr->tx_task, 0, em_handle_tx, txr); txr->tq = taskqueue_create_fast("em_txq", M_NOWAIT, taskqueue_thread_enqueue, &txr->tq); taskqueue_start_threads(&txr->tq, 1, PI_NET, "%s txq", device_get_nameunit(adapter->dev)); /* ** Set the bit to enable interrupt ** in E1000_IMS -- bits 22 and 23 ** are for TX0 and TX1, note this has ** NOTHING to do with the MSIX vector */ txr->ims = 1 << (22 + i); adapter->ivars |= (8 | txr->msix) << (8 + (i * 4)); } /* Link interrupt */ ++rid; adapter->res = bus_alloc_resource_any(dev, SYS_RES_IRQ, &rid, RF_ACTIVE); if (!adapter->res) { device_printf(dev,"Unable to allocate " "bus resource: Link interrupt [%d]\n", rid); return (ENXIO); } /* Set the link handler function */ error = bus_setup_intr(dev, adapter->res, INTR_TYPE_NET | INTR_MPSAFE, NULL, em_msix_link, adapter, &adapter->tag); if (error) { adapter->res = NULL; device_printf(dev, "Failed to register LINK handler"); return (error); } #if __FreeBSD_version >= 800504 bus_describe_intr(dev, adapter->res, adapter->tag, "link"); #endif adapter->linkvec = vector; adapter->ivars |= (8 | vector) << 16; adapter->ivars |= 0x80000000; return (0); } static void em_free_pci_resources(struct adapter *adapter) { device_t dev = adapter->dev; struct tx_ring *txr; struct rx_ring *rxr; int rid; /* ** Release all the queue interrupt resources: */ for (int i = 0; i < adapter->num_queues; i++) { txr = &adapter->tx_rings[i]; rxr = &adapter->rx_rings[i]; /* an early abort? */ if ((txr == NULL) || (rxr == NULL)) break; rid = txr->msix +1; if (txr->tag != NULL) { bus_teardown_intr(dev, txr->res, txr->tag); txr->tag = NULL; } if (txr->res != NULL) bus_release_resource(dev, SYS_RES_IRQ, rid, txr->res); rid = rxr->msix +1; if (rxr->tag != NULL) { bus_teardown_intr(dev, rxr->res, rxr->tag); rxr->tag = NULL; } if (rxr->res != NULL) bus_release_resource(dev, SYS_RES_IRQ, rid, rxr->res); } if (adapter->linkvec) /* we are doing MSIX */ rid = adapter->linkvec + 1; else (adapter->msix != 0) ? (rid = 1):(rid = 0); if (adapter->tag != NULL) { bus_teardown_intr(dev, adapter->res, adapter->tag); adapter->tag = NULL; } if (adapter->res != NULL) bus_release_resource(dev, SYS_RES_IRQ, rid, adapter->res); if (adapter->msix) pci_release_msi(dev); if (adapter->msix_mem != NULL) bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BAR(EM_MSIX_BAR), adapter->msix_mem); if (adapter->memory != NULL) bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BAR(0), adapter->memory); if (adapter->flash != NULL) bus_release_resource(dev, SYS_RES_MEMORY, EM_FLASH, adapter->flash); } /* * Setup MSI or MSI/X */ static int em_setup_msix(struct adapter *adapter) { device_t dev = adapter->dev; int val; /* ** Setup MSI/X for Hartwell: tests have shown ** use of two queues to be unstable, and to ** provide no great gain anyway, so we simply ** seperate the interrupts and use a single queue. 
*/ if ((adapter->hw.mac.type == e1000_82574) && (em_enable_msix == TRUE)) { /* Map the MSIX BAR */ int rid = PCIR_BAR(EM_MSIX_BAR); adapter->msix_mem = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid, RF_ACTIVE); if (adapter->msix_mem == NULL) { /* May not be enabled */ device_printf(adapter->dev, "Unable to map MSIX table \n"); goto msi; } val = pci_msix_count(dev); /* We only need/want 3 vectors */ if (val >= 3) val = 3; else { device_printf(adapter->dev, "MSIX: insufficient vectors, using MSI\n"); goto msi; } if ((pci_alloc_msix(dev, &val) == 0) && (val == 3)) { device_printf(adapter->dev, "Using MSIX interrupts " "with %d vectors\n", val); return (val); } /* ** If MSIX alloc failed or provided us with ** less than needed, free and fall through to MSI */ pci_release_msi(dev); } msi: if (adapter->msix_mem != NULL) { bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BAR(EM_MSIX_BAR), adapter->msix_mem); adapter->msix_mem = NULL; } val = 1; if (pci_alloc_msi(dev, &val) == 0) { device_printf(adapter->dev,"Using an MSI interrupt\n"); return (val); } /* Should only happen due to manual configuration */ device_printf(adapter->dev,"No MSI/MSIX using a Legacy IRQ\n"); return (0); } /********************************************************************* * * Initialize the hardware to a configuration * as specified by the adapter structure. * **********************************************************************/ static void em_reset(struct adapter *adapter) { device_t dev = adapter->dev; struct ifnet *ifp = adapter->ifp; struct e1000_hw *hw = &adapter->hw; u16 rx_buffer_size; u32 pba; INIT_DEBUGOUT("em_reset: begin"); /* Set up smart power down as default off on newer adapters. */ if (!em_smart_pwr_down && (hw->mac.type == e1000_82571 || hw->mac.type == e1000_82572)) { u16 phy_tmp = 0; /* Speed up time to link by disabling smart power down. */ e1000_read_phy_reg(hw, IGP02E1000_PHY_POWER_MGMT, &phy_tmp); phy_tmp &= ~IGP02E1000_PM_SPD; e1000_write_phy_reg(hw, IGP02E1000_PHY_POWER_MGMT, phy_tmp); } /* * Packet Buffer Allocation (PBA) * Writing PBA sets the receive portion of the buffer * the remainder is used for the transmit buffer. */ switch (hw->mac.type) { /* Total Packet Buffer on these is 48K */ case e1000_82571: case e1000_82572: case e1000_80003es2lan: pba = E1000_PBA_32K; /* 32K for Rx, 16K for Tx */ break; case e1000_82573: /* 82573: Total Packet Buffer is 32K */ pba = E1000_PBA_12K; /* 12K for Rx, 20K for Tx */ break; case e1000_82574: case e1000_82583: pba = E1000_PBA_20K; /* 20K for Rx, 20K for Tx */ break; case e1000_ich8lan: pba = E1000_PBA_8K; break; case e1000_ich9lan: case e1000_ich10lan: /* Boost Receive side for jumbo frames */ if (adapter->hw.mac.max_frame_size > 4096) pba = E1000_PBA_14K; else pba = E1000_PBA_10K; break; case e1000_pchlan: case e1000_pch2lan: case e1000_pch_lpt: pba = E1000_PBA_26K; break; default: if (adapter->hw.mac.max_frame_size > 8192) pba = E1000_PBA_40K; /* 40K for Rx, 24K for Tx */ else pba = E1000_PBA_48K; /* 48K for Rx, 16K for Tx */ } E1000_WRITE_REG(&adapter->hw, E1000_PBA, pba); /* * These parameters control the automatic generation (Tx) and * response (Rx) to Ethernet PAUSE frames. * - High water mark should allow for at least two frames to be * received after sending an XOFF. * - Low water mark works best when it is very near the high water mark. * This allows the receiver to restart by sending XON when it has * drained a bit. Here we use an arbitary value of 1500 which will * restart after one full frame is pulled from the buffer. 
There * could be several smaller frames in the buffer and if so they will * not trigger the XON until their total number reduces the buffer * by 1500. * - The pause time is fairly large at 1000 x 512ns = 512 usec. */ rx_buffer_size = ((E1000_READ_REG(hw, E1000_PBA) & 0xffff) << 10 ); hw->fc.high_water = rx_buffer_size - roundup2(adapter->hw.mac.max_frame_size, 1024); hw->fc.low_water = hw->fc.high_water - 1500; if (adapter->fc) /* locally set flow control value? */ hw->fc.requested_mode = adapter->fc; else hw->fc.requested_mode = e1000_fc_full; if (hw->mac.type == e1000_80003es2lan) hw->fc.pause_time = 0xFFFF; else hw->fc.pause_time = EM_FC_PAUSE_TIME; hw->fc.send_xon = TRUE; /* Device specific overrides/settings */ switch (hw->mac.type) { case e1000_pchlan: /* Workaround: no TX flow ctrl for PCH */ hw->fc.requested_mode = e1000_fc_rx_pause; hw->fc.pause_time = 0xFFFF; /* override */ if (ifp->if_mtu > ETHERMTU) { hw->fc.high_water = 0x3500; hw->fc.low_water = 0x1500; } else { hw->fc.high_water = 0x5000; hw->fc.low_water = 0x3000; } hw->fc.refresh_time = 0x1000; break; case e1000_pch2lan: case e1000_pch_lpt: hw->fc.high_water = 0x5C20; hw->fc.low_water = 0x5048; hw->fc.pause_time = 0x0650; hw->fc.refresh_time = 0x0400; /* Jumbos need adjusted PBA */ if (ifp->if_mtu > ETHERMTU) E1000_WRITE_REG(hw, E1000_PBA, 12); else E1000_WRITE_REG(hw, E1000_PBA, 26); break; case e1000_ich9lan: case e1000_ich10lan: if (ifp->if_mtu > ETHERMTU) { hw->fc.high_water = 0x2800; hw->fc.low_water = hw->fc.high_water - 8; break; } /* else fall thru */ default: if (hw->mac.type == e1000_80003es2lan) hw->fc.pause_time = 0xFFFF; break; } /* Issue a global reset */ e1000_reset_hw(hw); E1000_WRITE_REG(hw, E1000_WUC, 0); em_disable_aspm(adapter); /* and a re-init */ if (e1000_init_hw(hw) < 0) { device_printf(dev, "Hardware Initialization Failed\n"); return; } E1000_WRITE_REG(hw, E1000_VET, ETHERTYPE_VLAN); e1000_get_phy_info(hw); e1000_check_for_link(hw); return; } /********************************************************************* * * Setup networking device structure and register an interface. * **********************************************************************/ static int em_setup_interface(device_t dev, struct adapter *adapter) { struct ifnet *ifp; INIT_DEBUGOUT("em_setup_interface: begin"); ifp = adapter->ifp = if_alloc(IFT_ETHER); if (ifp == NULL) { device_printf(dev, "can not allocate ifnet structure\n"); return (-1); } if_initname(ifp, device_get_name(dev), device_get_unit(dev)); ifp->if_init = em_init; ifp->if_softc = adapter; ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; ifp->if_ioctl = em_ioctl; #ifdef EM_MULTIQUEUE /* Multiqueue stack interface */ ifp->if_transmit = em_mq_start; ifp->if_qflush = em_qflush; #else ifp->if_start = em_start; IFQ_SET_MAXLEN(&ifp->if_snd, adapter->num_tx_desc - 1); ifp->if_snd.ifq_drv_maxlen = adapter->num_tx_desc - 1; IFQ_SET_READY(&ifp->if_snd); #endif ether_ifattach(ifp, adapter->hw.mac.addr); ifp->if_capabilities = ifp->if_capenable = 0; ifp->if_capabilities |= IFCAP_HWCSUM | IFCAP_VLAN_HWCSUM; ifp->if_capabilities |= IFCAP_TSO4; /* * Tell the upper layer(s) we * support full VLAN capability */ ifp->if_data.ifi_hdrlen = sizeof(struct ether_vlan_header); ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWTSO | IFCAP_VLAN_MTU; ifp->if_capenable = ifp->if_capabilities; /* ** Don't turn this on by default, if vlans are ** created on another pseudo device (eg. 
lagg) ** then vlan events are not passed thru, breaking ** operation, but with HW FILTER off it works. If ** using vlans directly on the em driver you can ** enable this and get full hardware tag filtering. */ ifp->if_capabilities |= IFCAP_VLAN_HWFILTER; #ifdef DEVICE_POLLING ifp->if_capabilities |= IFCAP_POLLING; #endif /* Enable only WOL MAGIC by default */ if (adapter->wol) { ifp->if_capabilities |= IFCAP_WOL; ifp->if_capenable |= IFCAP_WOL_MAGIC; } /* * Specify the media types supported by this adapter and register * callbacks to update media and link information */ ifmedia_init(&adapter->media, IFM_IMASK, em_media_change, em_media_status); if ((adapter->hw.phy.media_type == e1000_media_type_fiber) || (adapter->hw.phy.media_type == e1000_media_type_internal_serdes)) { u_char fiber_type = IFM_1000_SX; /* default type */ ifmedia_add(&adapter->media, IFM_ETHER | fiber_type | IFM_FDX, 0, NULL); ifmedia_add(&adapter->media, IFM_ETHER | fiber_type, 0, NULL); } else { ifmedia_add(&adapter->media, IFM_ETHER | IFM_10_T, 0, NULL); ifmedia_add(&adapter->media, IFM_ETHER | IFM_10_T | IFM_FDX, 0, NULL); ifmedia_add(&adapter->media, IFM_ETHER | IFM_100_TX, 0, NULL); ifmedia_add(&adapter->media, IFM_ETHER | IFM_100_TX | IFM_FDX, 0, NULL); if (adapter->hw.phy.type != e1000_phy_ife) { ifmedia_add(&adapter->media, IFM_ETHER | IFM_1000_T | IFM_FDX, 0, NULL); ifmedia_add(&adapter->media, IFM_ETHER | IFM_1000_T, 0, NULL); } } ifmedia_add(&adapter->media, IFM_ETHER | IFM_AUTO, 0, NULL); ifmedia_set(&adapter->media, IFM_ETHER | IFM_AUTO); return (0); } /* * Manage DMA'able memory. */ static void em_dmamap_cb(void *arg, bus_dma_segment_t *segs, int nseg, int error) { if (error) return; *(bus_addr_t *) arg = segs[0].ds_addr; } static int em_dma_malloc(struct adapter *adapter, bus_size_t size, struct em_dma_alloc *dma, int mapflags) { int error; error = bus_dma_tag_create(bus_get_dma_tag(adapter->dev), /* parent */ EM_DBA_ALIGN, 0, /* alignment, bounds */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ size, /* maxsize */ 1, /* nsegments */ size, /* maxsegsize */ 0, /* flags */ NULL, /* lockfunc */ NULL, /* lockarg */ &dma->dma_tag); if (error) { device_printf(adapter->dev, "%s: bus_dma_tag_create failed: %d\n", __func__, error); goto fail_0; } error = bus_dmamem_alloc(dma->dma_tag, (void**) &dma->dma_vaddr, BUS_DMA_NOWAIT | BUS_DMA_COHERENT, &dma->dma_map); if (error) { device_printf(adapter->dev, "%s: bus_dmamem_alloc(%ju) failed: %d\n", __func__, (uintmax_t)size, error); goto fail_2; } dma->dma_paddr = 0; error = bus_dmamap_load(dma->dma_tag, dma->dma_map, dma->dma_vaddr, size, em_dmamap_cb, &dma->dma_paddr, mapflags | BUS_DMA_NOWAIT); if (error || dma->dma_paddr == 0) { device_printf(adapter->dev, "%s: bus_dmamap_load failed: %d\n", __func__, error); goto fail_3; } return (0); fail_3: bus_dmamap_unload(dma->dma_tag, dma->dma_map); fail_2: bus_dmamem_free(dma->dma_tag, dma->dma_vaddr, dma->dma_map); bus_dma_tag_destroy(dma->dma_tag); fail_0: dma->dma_map = NULL; dma->dma_tag = NULL; return (error); } static void em_dma_free(struct adapter *adapter, struct em_dma_alloc *dma) { if (dma->dma_tag == NULL) return; if (dma->dma_map != NULL) { bus_dmamap_sync(dma->dma_tag, dma->dma_map, BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(dma->dma_tag, dma->dma_map); bus_dmamem_free(dma->dma_tag, dma->dma_vaddr, dma->dma_map); dma->dma_map = NULL; } bus_dma_tag_destroy(dma->dma_tag); dma->dma_tag = NULL; } 
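/*
 * Note: the helpers above wrap the usual busdma sequence --
 * bus_dma_tag_create() -> bus_dmamem_alloc() -> bus_dmamap_load() --
 * with em_dmamap_cb() recording the single physical segment address;
 * em_dma_free() tears the same state down in reverse order.  The ring
 * allocation code below relies on this pairing to unwind cleanly on a
 * mid-course allocation failure.
 */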
/********************************************************************* * * Allocate memory for the transmit and receive rings, and then * the descriptors associated with each, called only once at attach. * **********************************************************************/ static int em_allocate_queues(struct adapter *adapter) { device_t dev = adapter->dev; struct tx_ring *txr = NULL; struct rx_ring *rxr = NULL; int rsize, tsize, error = E1000_SUCCESS; int txconf = 0, rxconf = 0; /* Allocate the TX ring struct memory */ if (!(adapter->tx_rings = (struct tx_ring *) malloc(sizeof(struct tx_ring) * adapter->num_queues, M_DEVBUF, M_NOWAIT | M_ZERO))) { device_printf(dev, "Unable to allocate TX ring memory\n"); error = ENOMEM; goto fail; } /* Now allocate the RX */ if (!(adapter->rx_rings = (struct rx_ring *) malloc(sizeof(struct rx_ring) * adapter->num_queues, M_DEVBUF, M_NOWAIT | M_ZERO))) { device_printf(dev, "Unable to allocate RX ring memory\n"); error = ENOMEM; goto rx_fail; } tsize = roundup2(adapter->num_tx_desc * sizeof(struct e1000_tx_desc), EM_DBA_ALIGN); /* * Now set up the TX queues, txconf is needed to handle the * possibility that things fail midcourse and we need to * undo memory gracefully */ for (int i = 0; i < adapter->num_queues; i++, txconf++) { /* Set up some basics */ txr = &adapter->tx_rings[i]; txr->adapter = adapter; txr->me = i; /* Initialize the TX lock */ snprintf(txr->mtx_name, sizeof(txr->mtx_name), "%s:tx(%d)", device_get_nameunit(dev), txr->me); mtx_init(&txr->tx_mtx, txr->mtx_name, NULL, MTX_DEF); if (em_dma_malloc(adapter, tsize, &txr->txdma, BUS_DMA_NOWAIT)) { device_printf(dev, "Unable to allocate TX Descriptor memory\n"); error = ENOMEM; goto err_tx_desc; } txr->tx_base = (struct e1000_tx_desc *)txr->txdma.dma_vaddr; bzero((void *)txr->tx_base, tsize); if (em_allocate_transmit_buffers(txr)) { device_printf(dev, "Critical Failure setting up transmit buffers\n"); error = ENOMEM; goto err_tx_desc; } #if __FreeBSD_version >= 800000 /* Allocate a buf ring */ txr->br = buf_ring_alloc(4096, M_DEVBUF, M_WAITOK, &txr->tx_mtx); #endif } /* * Next the RX queues... */ rsize = roundup2(adapter->num_rx_desc * sizeof(struct e1000_rx_desc), EM_DBA_ALIGN); for (int i = 0; i < adapter->num_queues; i++, rxconf++) { rxr = &adapter->rx_rings[i]; rxr->adapter = adapter; rxr->me = i; /* Initialize the RX lock */ snprintf(rxr->mtx_name, sizeof(rxr->mtx_name), "%s:rx(%d)", device_get_nameunit(dev), txr->me); mtx_init(&rxr->rx_mtx, rxr->mtx_name, NULL, MTX_DEF); if (em_dma_malloc(adapter, rsize, &rxr->rxdma, BUS_DMA_NOWAIT)) { device_printf(dev, "Unable to allocate RxDescriptor memory\n"); error = ENOMEM; goto err_rx_desc; } rxr->rx_base = (struct e1000_rx_desc *)rxr->rxdma.dma_vaddr; bzero((void *)rxr->rx_base, rsize); /* Allocate receive buffers for the ring*/ if (em_allocate_receive_buffers(rxr)) { device_printf(dev, "Critical Failure setting up receive buffers\n"); error = ENOMEM; goto err_rx_desc; } } return (0); err_rx_desc: for (rxr = adapter->rx_rings; rxconf > 0; rxr++, rxconf--) em_dma_free(adapter, &rxr->rxdma); err_tx_desc: for (txr = adapter->tx_rings; txconf > 0; txr++, txconf--) em_dma_free(adapter, &txr->txdma); free(adapter->rx_rings, M_DEVBUF); rx_fail: #if __FreeBSD_version >= 800000 buf_ring_free(txr->br, M_DEVBUF); #endif free(adapter->tx_rings, M_DEVBUF); fail: return (error); } /********************************************************************* * * Allocate memory for tx_buffer structures. 
The tx_buffer stores all * the information needed to transmit a packet on the wire. This is * called only once at attach, setup is done every reset. * **********************************************************************/ static int em_allocate_transmit_buffers(struct tx_ring *txr) { struct adapter *adapter = txr->adapter; device_t dev = adapter->dev; struct em_buffer *txbuf; int error, i; /* * Setup DMA descriptor areas. */ if ((error = bus_dma_tag_create(bus_get_dma_tag(dev), 1, 0, /* alignment, bounds */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ EM_TSO_SIZE, /* maxsize */ EM_MAX_SCATTER, /* nsegments */ PAGE_SIZE, /* maxsegsize */ 0, /* flags */ NULL, /* lockfunc */ NULL, /* lockfuncarg */ &txr->txtag))) { device_printf(dev,"Unable to allocate TX DMA tag\n"); goto fail; } if (!(txr->tx_buffers = (struct em_buffer *) malloc(sizeof(struct em_buffer) * adapter->num_tx_desc, M_DEVBUF, M_NOWAIT | M_ZERO))) { device_printf(dev, "Unable to allocate tx_buffer memory\n"); error = ENOMEM; goto fail; } /* Create the descriptor buffer dma maps */ txbuf = txr->tx_buffers; for (i = 0; i < adapter->num_tx_desc; i++, txbuf++) { error = bus_dmamap_create(txr->txtag, 0, &txbuf->map); if (error != 0) { device_printf(dev, "Unable to create TX DMA map\n"); goto fail; } } return 0; fail: /* We free all, it handles case where we are in the middle */ em_free_transmit_structures(adapter); return (error); } /********************************************************************* * * Initialize a transmit ring. * **********************************************************************/ static void em_setup_transmit_ring(struct tx_ring *txr) { struct adapter *adapter = txr->adapter; struct em_buffer *txbuf; int i; #ifdef DEV_NETMAP struct netmap_adapter *na = NA(adapter->ifp); struct netmap_slot *slot; #endif /* DEV_NETMAP */ /* Clear the old descriptor contents */ EM_TX_LOCK(txr); #ifdef DEV_NETMAP slot = netmap_reset(na, NR_TX, txr->me, 0); #endif /* DEV_NETMAP */ bzero((void *)txr->tx_base, (sizeof(struct e1000_tx_desc)) * adapter->num_tx_desc); /* Reset indices */ txr->next_avail_desc = 0; txr->next_to_clean = 0; /* Free any existing tx buffers. */ txbuf = txr->tx_buffers; for (i = 0; i < adapter->num_tx_desc; i++, txbuf++) { if (txbuf->m_head != NULL) { bus_dmamap_sync(txr->txtag, txbuf->map, BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(txr->txtag, txbuf->map); m_freem(txbuf->m_head); txbuf->m_head = NULL; } #ifdef DEV_NETMAP if (slot) { int si = netmap_idx_n2k(&na->tx_rings[txr->me], i); uint64_t paddr; void *addr; addr = PNMB(slot + si, &paddr); txr->tx_base[i].buffer_addr = htole64(paddr); /* reload the map for netmap mode */ netmap_load_map(txr->txtag, txbuf->map, addr); } #endif /* DEV_NETMAP */ /* clear the watch index */ txbuf->next_eop = -1; } /* Set number of descriptors available */ txr->tx_avail = adapter->num_tx_desc; txr->queue_status = EM_QUEUE_IDLE; /* Clear checksum offload context. */ txr->last_hw_offload = 0; txr->last_hw_ipcss = 0; txr->last_hw_ipcso = 0; txr->last_hw_tucss = 0; txr->last_hw_tucso = 0; bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); EM_TX_UNLOCK(txr); } /********************************************************************* * * Initialize all transmit rings. 
* **********************************************************************/ static void em_setup_transmit_structures(struct adapter *adapter) { struct tx_ring *txr = adapter->tx_rings; for (int i = 0; i < adapter->num_queues; i++, txr++) em_setup_transmit_ring(txr); return; } /********************************************************************* * * Enable transmit unit. * **********************************************************************/ static void em_initialize_transmit_unit(struct adapter *adapter) { struct tx_ring *txr = adapter->tx_rings; struct e1000_hw *hw = &adapter->hw; u32 tctl, tarc, tipg = 0; INIT_DEBUGOUT("em_initialize_transmit_unit: begin"); for (int i = 0; i < adapter->num_queues; i++, txr++) { u64 bus_addr = txr->txdma.dma_paddr; /* Base and Len of TX Ring */ E1000_WRITE_REG(hw, E1000_TDLEN(i), adapter->num_tx_desc * sizeof(struct e1000_tx_desc)); E1000_WRITE_REG(hw, E1000_TDBAH(i), (u32)(bus_addr >> 32)); E1000_WRITE_REG(hw, E1000_TDBAL(i), (u32)bus_addr); /* Init the HEAD/TAIL indices */ E1000_WRITE_REG(hw, E1000_TDT(i), 0); E1000_WRITE_REG(hw, E1000_TDH(i), 0); HW_DEBUGOUT2("Base = %x, Length = %x\n", E1000_READ_REG(&adapter->hw, E1000_TDBAL(i)), E1000_READ_REG(&adapter->hw, E1000_TDLEN(i))); txr->queue_status = EM_QUEUE_IDLE; } /* Set the default values for the Tx Inter Packet Gap timer */ switch (adapter->hw.mac.type) { case e1000_80003es2lan: tipg = DEFAULT_82543_TIPG_IPGR1; tipg |= DEFAULT_80003ES2LAN_TIPG_IPGR2 << E1000_TIPG_IPGR2_SHIFT; break; default: if ((adapter->hw.phy.media_type == e1000_media_type_fiber) || (adapter->hw.phy.media_type == e1000_media_type_internal_serdes)) tipg = DEFAULT_82543_TIPG_IPGT_FIBER; else tipg = DEFAULT_82543_TIPG_IPGT_COPPER; tipg |= DEFAULT_82543_TIPG_IPGR1 << E1000_TIPG_IPGR1_SHIFT; tipg |= DEFAULT_82543_TIPG_IPGR2 << E1000_TIPG_IPGR2_SHIFT; } E1000_WRITE_REG(&adapter->hw, E1000_TIPG, tipg); E1000_WRITE_REG(&adapter->hw, E1000_TIDV, adapter->tx_int_delay.value); if(adapter->hw.mac.type >= e1000_82540) E1000_WRITE_REG(&adapter->hw, E1000_TADV, adapter->tx_abs_int_delay.value); if ((adapter->hw.mac.type == e1000_82571) || (adapter->hw.mac.type == e1000_82572)) { tarc = E1000_READ_REG(&adapter->hw, E1000_TARC(0)); tarc |= SPEED_MODE_BIT; E1000_WRITE_REG(&adapter->hw, E1000_TARC(0), tarc); } else if (adapter->hw.mac.type == e1000_80003es2lan) { tarc = E1000_READ_REG(&adapter->hw, E1000_TARC(0)); tarc |= 1; E1000_WRITE_REG(&adapter->hw, E1000_TARC(0), tarc); tarc = E1000_READ_REG(&adapter->hw, E1000_TARC(1)); tarc |= 1; E1000_WRITE_REG(&adapter->hw, E1000_TARC(1), tarc); } adapter->txd_cmd = E1000_TXD_CMD_IFCS; if (adapter->tx_int_delay.value > 0) adapter->txd_cmd |= E1000_TXD_CMD_IDE; /* Program the Transmit Control Register */ tctl = E1000_READ_REG(&adapter->hw, E1000_TCTL); tctl &= ~E1000_TCTL_CT; tctl |= (E1000_TCTL_PSP | E1000_TCTL_RTLC | E1000_TCTL_EN | (E1000_COLLISION_THRESHOLD << E1000_CT_SHIFT)); if (adapter->hw.mac.type >= e1000_82571) tctl |= E1000_TCTL_MULR; /* This write will effectively turn on the transmit unit. */ E1000_WRITE_REG(&adapter->hw, E1000_TCTL, tctl); } /********************************************************************* * * Free all transmit rings. 
* **********************************************************************/ static void em_free_transmit_structures(struct adapter *adapter) { struct tx_ring *txr = adapter->tx_rings; for (int i = 0; i < adapter->num_queues; i++, txr++) { EM_TX_LOCK(txr); em_free_transmit_buffers(txr); em_dma_free(adapter, &txr->txdma); EM_TX_UNLOCK(txr); EM_TX_LOCK_DESTROY(txr); } free(adapter->tx_rings, M_DEVBUF); } /********************************************************************* * * Free transmit ring related data structures. * **********************************************************************/ static void em_free_transmit_buffers(struct tx_ring *txr) { struct adapter *adapter = txr->adapter; struct em_buffer *txbuf; INIT_DEBUGOUT("free_transmit_ring: begin"); if (txr->tx_buffers == NULL) return; for (int i = 0; i < adapter->num_tx_desc; i++) { txbuf = &txr->tx_buffers[i]; if (txbuf->m_head != NULL) { bus_dmamap_sync(txr->txtag, txbuf->map, BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(txr->txtag, txbuf->map); m_freem(txbuf->m_head); txbuf->m_head = NULL; if (txbuf->map != NULL) { bus_dmamap_destroy(txr->txtag, txbuf->map); txbuf->map = NULL; } } else if (txbuf->map != NULL) { bus_dmamap_unload(txr->txtag, txbuf->map); bus_dmamap_destroy(txr->txtag, txbuf->map); txbuf->map = NULL; } } #if __FreeBSD_version >= 800000 if (txr->br != NULL) buf_ring_free(txr->br, M_DEVBUF); #endif if (txr->tx_buffers != NULL) { free(txr->tx_buffers, M_DEVBUF); txr->tx_buffers = NULL; } if (txr->txtag != NULL) { bus_dma_tag_destroy(txr->txtag); txr->txtag = NULL; } return; } /********************************************************************* * The offload context is protocol specific (TCP/UDP) and thus * only needs to be set when the protocol changes. The occasion * of a context change can be a performance detriment, and * might be better just disabled. The reason arises in the way * in which the controller supports pipelined requests from the * Tx data DMA. Up to four requests can be pipelined, and they may * belong to the same packet or to multiple packets. However all * requests for one packet are issued before a request is issued * for a subsequent packet and if a request for the next packet * requires a context change, that request will be stalled * until the previous request completes. This means setting up * a new context effectively disables pipelined Tx data DMA which * in turn greatly slow down performance to send small sized * frames. **********************************************************************/ static void em_transmit_checksum_setup(struct tx_ring *txr, struct mbuf *mp, int ip_off, struct ip *ip, u32 *txd_upper, u32 *txd_lower) { struct adapter *adapter = txr->adapter; struct e1000_context_desc *TXD = NULL; struct em_buffer *tx_buffer; int cur, hdr_len; u32 cmd = 0; u16 offload = 0; u8 ipcso, ipcss, tucso, tucss; ipcss = ipcso = tucss = tucso = 0; hdr_len = ip_off + (ip->ip_hl << 2); cur = txr->next_avail_desc; /* Setup of IP header checksum. */ if (mp->m_pkthdr.csum_flags & CSUM_IP) { *txd_upper |= E1000_TXD_POPTS_IXSM << 8; offload |= CSUM_IP; ipcss = ip_off; ipcso = ip_off + offsetof(struct ip, ip_sum); /* * Start offset for header checksum calculation. * End offset for header checksum calculation. * Offset of place to put the checksum. 
		 */
		TXD = (struct e1000_context_desc *)&txr->tx_base[cur];
		TXD->lower_setup.ip_fields.ipcss = ipcss;
		TXD->lower_setup.ip_fields.ipcse = htole16(hdr_len);
		TXD->lower_setup.ip_fields.ipcso = ipcso;
		cmd |= E1000_TXD_CMD_IP;
	}

	if (mp->m_pkthdr.csum_flags & CSUM_TCP) {
		*txd_lower = E1000_TXD_CMD_DEXT | E1000_TXD_DTYP_D;
		*txd_upper |= E1000_TXD_POPTS_TXSM << 8;
		offload |= CSUM_TCP;
		tucss = hdr_len;
		tucso = hdr_len + offsetof(struct tcphdr, th_sum);
		/*
		 * Setting up a new checksum offload context for every
		 * frame takes a lot of processing time for hardware.
		 * This also reduces performance a lot for small-sized
		 * frames, so avoid it if the driver can use a previously
		 * configured checksum offload context.
		 */
		if (txr->last_hw_offload == offload) {
			if (offload & CSUM_IP) {
				if (txr->last_hw_ipcss == ipcss &&
				    txr->last_hw_ipcso == ipcso &&
				    txr->last_hw_tucss == tucss &&
				    txr->last_hw_tucso == tucso)
					return;
			} else {
				if (txr->last_hw_tucss == tucss &&
				    txr->last_hw_tucso == tucso)
					return;
			}
		}
		txr->last_hw_offload = offload;
		txr->last_hw_tucss = tucss;
		txr->last_hw_tucso = tucso;
		/*
		 * Start offset for payload checksum calculation.
		 * End offset for payload checksum calculation.
		 * Offset of place to put the checksum.
		 */
		TXD = (struct e1000_context_desc *)&txr->tx_base[cur];
		TXD->upper_setup.tcp_fields.tucss = hdr_len;
		TXD->upper_setup.tcp_fields.tucse = htole16(0);
		TXD->upper_setup.tcp_fields.tucso = tucso;
		cmd |= E1000_TXD_CMD_TCP;
	} else if (mp->m_pkthdr.csum_flags & CSUM_UDP) {
		*txd_lower = E1000_TXD_CMD_DEXT | E1000_TXD_DTYP_D;
		*txd_upper |= E1000_TXD_POPTS_TXSM << 8;
		tucss = hdr_len;
		tucso = hdr_len + offsetof(struct udphdr, uh_sum);
		/*
		 * Setting up a new checksum offload context for every
		 * frame takes a lot of processing time for hardware.
		 * This also reduces performance a lot for small-sized
		 * frames, so avoid it if the driver can use a previously
		 * configured checksum offload context.
		 */
		if (txr->last_hw_offload == offload) {
			if (offload & CSUM_IP) {
				if (txr->last_hw_ipcss == ipcss &&
				    txr->last_hw_ipcso == ipcso &&
				    txr->last_hw_tucss == tucss &&
				    txr->last_hw_tucso == tucso)
					return;
			} else {
				if (txr->last_hw_tucss == tucss &&
				    txr->last_hw_tucso == tucso)
					return;
			}
		}
		txr->last_hw_offload = offload;
		txr->last_hw_tucss = tucss;
		txr->last_hw_tucso = tucso;
		/*
		 * Start offset for payload checksum calculation.
		 * End offset for payload checksum calculation.
		 * Offset of place to put the checksum.
		 */
		TXD = (struct e1000_context_desc *)&txr->tx_base[cur];
		TXD->upper_setup.tcp_fields.tucss = tucss;
		TXD->upper_setup.tcp_fields.tucse = htole16(0);
		TXD->upper_setup.tcp_fields.tucso = tucso;
	}

	if (offload & CSUM_IP) {
		txr->last_hw_ipcss = ipcss;
		txr->last_hw_ipcso = ipcso;
	}

	TXD->tcp_seg_setup.data = htole32(0);
	TXD->cmd_and_length =
	    htole32(adapter->txd_cmd | E1000_TXD_CMD_DEXT | cmd);
	tx_buffer = &txr->tx_buffers[cur];
	tx_buffer->m_head = NULL;
	tx_buffer->next_eop = -1;

	if (++cur == adapter->num_tx_desc)
		cur = 0;

	txr->tx_avail--;
	txr->next_avail_desc = cur;
}

/**********************************************************************
 *
 *  Setup work for hardware segmentation offload (TSO)
 *
 **********************************************************************/
static void
em_tso_setup(struct tx_ring *txr, struct mbuf *mp, int ip_off,
    struct ip *ip, struct tcphdr *tp, u32 *txd_upper, u32 *txd_lower)
{
	struct adapter			*adapter = txr->adapter;
	struct e1000_context_desc	*TXD;
	struct em_buffer		*tx_buffer;
	int cur, hdr_len;

	/*
	 * In theory we can use the same TSO context if and only if the
	 * frame is the same type (IP/TCP) and has the same MSS.
	 * However, checking whether a frame has the same IP/TCP structure
	 * is a hard thing, so just ignore that and always re-establish a
	 * new TSO context.
	 */
	hdr_len = ip_off + (ip->ip_hl << 2) + (tp->th_off << 2);
	*txd_lower = (E1000_TXD_CMD_DEXT |	/* Extended descr type */
		      E1000_TXD_DTYP_D |	/* Data descr type */
		      E1000_TXD_CMD_TSE);	/* Do TSE on this packet */

	/* IP and/or TCP header checksum calculation and insertion. */
	*txd_upper = (E1000_TXD_POPTS_IXSM | E1000_TXD_POPTS_TXSM) << 8;

	cur = txr->next_avail_desc;
	tx_buffer = &txr->tx_buffers[cur];
	TXD = (struct e1000_context_desc *) &txr->tx_base[cur];

	/*
	 * Start offset for header checksum calculation.
	 * End offset for header checksum calculation.
	 * Offset of place to put the checksum.
	 */
	TXD->lower_setup.ip_fields.ipcss = ip_off;
	TXD->lower_setup.ip_fields.ipcse =
	    htole16(ip_off + (ip->ip_hl << 2) - 1);
	TXD->lower_setup.ip_fields.ipcso = ip_off + offsetof(struct ip, ip_sum);
	/*
	 * Start offset for payload checksum calculation.
	 * End offset for payload checksum calculation.
	 * Offset of place to put the checksum.
	 */
	TXD->upper_setup.tcp_fields.tucss = ip_off + (ip->ip_hl << 2);
	TXD->upper_setup.tcp_fields.tucse = 0;
	TXD->upper_setup.tcp_fields.tucso =
	    ip_off + (ip->ip_hl << 2) + offsetof(struct tcphdr, th_sum);
	/*
	 * Payload size per packet w/o any headers.
	 * Length of all headers up to payload.
	 */
	TXD->tcp_seg_setup.fields.mss = htole16(mp->m_pkthdr.tso_segsz);
	TXD->tcp_seg_setup.fields.hdr_len = hdr_len;

	TXD->cmd_and_length = htole32(adapter->txd_cmd |
				E1000_TXD_CMD_DEXT |	/* Extended descr */
				E1000_TXD_CMD_TSE |	/* TSE context */
				E1000_TXD_CMD_IP |	/* Do IP csum */
				E1000_TXD_CMD_TCP |	/* Do TCP checksum */
				(mp->m_pkthdr.len - (hdr_len))); /* Total len */

	tx_buffer->m_head = NULL;
	tx_buffer->next_eop = -1;

	if (++cur == adapter->num_tx_desc)
		cur = 0;

	txr->tx_avail--;
	txr->next_avail_desc = cur;
	txr->tx_tso = TRUE;
}

/**********************************************************************
 *
 *  Examine each tx_buffer in the used queue. If the hardware is done
 *  processing the packet then free associated resources. The
 *  tx_buffer is put back on the free queue.
 *
 **********************************************************************/
static void
em_txeof(struct tx_ring *txr)
{
	struct adapter	*adapter = txr->adapter;
	int first, last, done, processed;
	struct em_buffer *tx_buffer;
	struct e1000_tx_desc   *tx_desc, *eop_desc;
	struct ifnet   *ifp = adapter->ifp;

	EM_TX_LOCK_ASSERT(txr);
#ifdef DEV_NETMAP
	if (netmap_tx_irq(ifp, txr->me))
		return;
#endif /* DEV_NETMAP */

	/* No work, make sure watchdog is off */
	if (txr->tx_avail == adapter->num_tx_desc) {
		txr->queue_status = EM_QUEUE_IDLE;
		return;
	}

	processed = 0;
	first = txr->next_to_clean;
	tx_desc = &txr->tx_base[first];
	tx_buffer = &txr->tx_buffers[first];
	last = tx_buffer->next_eop;
	eop_desc = &txr->tx_base[last];

	/*
	 * What this does is get the index of the
	 * first descriptor AFTER the EOP of the
	 * first packet, that way we can do the
	 * simple comparison on the inner while loop.
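	 */

/*
 * [Editor's aside -- illustrative sketch, not driver code.]  The ring
 * arithmetic used here in isolation: "one past" an index wraps modulo
 * the descriptor count, so the cleanup loop can run while (first != done).
 * The helper name is hypothetical:
 */
#if 0
static inline int
em_ring_next(int idx, int count)
{
	return ((++idx == count) ? 0 : idx);
}
/* done = em_ring_next(tx_buffer->next_eop, adapter->num_tx_desc); */
#endif

/*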
*/ if (++last == adapter->num_tx_desc) last = 0; done = last; bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map, BUS_DMASYNC_POSTREAD); while (eop_desc->upper.fields.status & E1000_TXD_STAT_DD) { /* We clean the range of the packet */ while (first != done) { tx_desc->upper.data = 0; tx_desc->lower.data = 0; tx_desc->buffer_addr = 0; ++txr->tx_avail; ++processed; if (tx_buffer->m_head) { bus_dmamap_sync(txr->txtag, tx_buffer->map, BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(txr->txtag, tx_buffer->map); m_freem(tx_buffer->m_head); tx_buffer->m_head = NULL; } tx_buffer->next_eop = -1; txr->watchdog_time = ticks; if (++first == adapter->num_tx_desc) first = 0; tx_buffer = &txr->tx_buffers[first]; tx_desc = &txr->tx_base[first]; } ++ifp->if_opackets; /* See if we can continue to the next packet */ last = tx_buffer->next_eop; if (last != -1) { eop_desc = &txr->tx_base[last]; /* Get new done point */ if (++last == adapter->num_tx_desc) last = 0; done = last; } else break; } bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); txr->next_to_clean = first; /* ** Watchdog calculation, we know there's ** work outstanding or the first return ** would have been taken, so none processed ** for too long indicates a hang. local timer ** will examine this and do a reset if needed. */ if ((!processed) && ((ticks - txr->watchdog_time) > EM_WATCHDOG)) txr->queue_status = EM_QUEUE_HUNG; /* * If we have a minimum free, clear IFF_DRV_OACTIVE * to tell the stack that it is OK to send packets. * Notice that all writes of OACTIVE happen under the * TX lock which, with a single queue, guarantees * sanity. */ if (txr->tx_avail >= EM_MAX_SCATTER) ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; /* Disable watchdog if all clean */ if (txr->tx_avail == adapter->num_tx_desc) { txr->queue_status = EM_QUEUE_IDLE; } } /********************************************************************* * * Refresh RX descriptor mbufs from system mbuf buffer pool. * **********************************************************************/ static void em_refresh_mbufs(struct rx_ring *rxr, int limit) { struct adapter *adapter = rxr->adapter; struct mbuf *m; bus_dma_segment_t segs[1]; struct em_buffer *rxbuf; int i, j, error, nsegs; bool cleaned = FALSE; i = j = rxr->next_to_refresh; /* ** Get one descriptor beyond ** our work mark to control ** the loop. */ if (++j == adapter->num_rx_desc) j = 0; while (j != limit) { rxbuf = &rxr->rx_buffers[i]; if (rxbuf->m_head == NULL) { m = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, adapter->rx_mbuf_sz); /* ** If we have a temporary resource shortage ** that causes a failure, just abort refresh ** for now, we will return to this point when ** reinvoked from em_rxeof. 
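	*/

/*
 * [Editor's aside -- a distilled, uncompiled view of this refresh loop.]
 * 'i' is the slot being refilled while 'j' runs one slot ahead, so the
 * loop stops before overtaking 'limit'; an allocation failure simply
 * ends the pass early and a later em_rxeof() call retries:
 */
#if 0
	i = j = rxr->next_to_refresh;
	if (++j == adapter->num_rx_desc)	/* one beyond the work mark */
		j = 0;
	while (j != limit) {
		/* ... refill slot i, break out on m_getjcl() failure ... */
		i = j;
		rxr->next_to_refresh = i;
		if (++j == adapter->num_rx_desc)
			j = 0;
	}
#endif

/*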
*/ if (m == NULL) goto update; } else m = rxbuf->m_head; m->m_len = m->m_pkthdr.len = adapter->rx_mbuf_sz; m->m_flags |= M_PKTHDR; m->m_data = m->m_ext.ext_buf; /* Use bus_dma machinery to setup the memory mapping */ error = bus_dmamap_load_mbuf_sg(rxr->rxtag, rxbuf->map, m, segs, &nsegs, BUS_DMA_NOWAIT); if (error != 0) { printf("Refresh mbufs: hdr dmamap load" " failure - %d\n", error); m_free(m); rxbuf->m_head = NULL; goto update; } rxbuf->m_head = m; bus_dmamap_sync(rxr->rxtag, rxbuf->map, BUS_DMASYNC_PREREAD); rxr->rx_base[i].buffer_addr = htole64(segs[0].ds_addr); cleaned = TRUE; i = j; /* Next is precalulated for us */ rxr->next_to_refresh = i; /* Calculate next controlling index */ if (++j == adapter->num_rx_desc) j = 0; } update: /* ** Update the tail pointer only if, ** and as far as we have refreshed. */ if (cleaned) E1000_WRITE_REG(&adapter->hw, E1000_RDT(rxr->me), rxr->next_to_refresh); return; } /********************************************************************* * * Allocate memory for rx_buffer structures. Since we use one * rx_buffer per received packet, the maximum number of rx_buffer's * that we'll need is equal to the number of receive descriptors * that we've allocated. * **********************************************************************/ static int em_allocate_receive_buffers(struct rx_ring *rxr) { struct adapter *adapter = rxr->adapter; device_t dev = adapter->dev; struct em_buffer *rxbuf; int error; rxr->rx_buffers = malloc(sizeof(struct em_buffer) * adapter->num_rx_desc, M_DEVBUF, M_NOWAIT | M_ZERO); if (rxr->rx_buffers == NULL) { device_printf(dev, "Unable to allocate rx_buffer memory\n"); return (ENOMEM); } error = bus_dma_tag_create(bus_get_dma_tag(dev), /* parent */ 1, 0, /* alignment, bounds */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ MJUM9BYTES, /* maxsize */ 1, /* nsegments */ MJUM9BYTES, /* maxsegsize */ 0, /* flags */ NULL, /* lockfunc */ NULL, /* lockarg */ &rxr->rxtag); if (error) { device_printf(dev, "%s: bus_dma_tag_create failed %d\n", __func__, error); goto fail; } rxbuf = rxr->rx_buffers; for (int i = 0; i < adapter->num_rx_desc; i++, rxbuf++) { rxbuf = &rxr->rx_buffers[i]; error = bus_dmamap_create(rxr->rxtag, BUS_DMA_NOWAIT, &rxbuf->map); if (error) { device_printf(dev, "%s: bus_dmamap_create failed: %d\n", __func__, error); goto fail; } } return (0); fail: em_free_receive_structures(adapter); return (error); } /********************************************************************* * * Initialize a receive ring and its buffers. 
* **********************************************************************/ static int em_setup_receive_ring(struct rx_ring *rxr) { struct adapter *adapter = rxr->adapter; struct em_buffer *rxbuf; bus_dma_segment_t seg[1]; int rsize, nsegs, error = 0; #ifdef DEV_NETMAP struct netmap_adapter *na = NA(adapter->ifp); struct netmap_slot *slot; #endif /* Clear the ring contents */ EM_RX_LOCK(rxr); rsize = roundup2(adapter->num_rx_desc * sizeof(struct e1000_rx_desc), EM_DBA_ALIGN); bzero((void *)rxr->rx_base, rsize); #ifdef DEV_NETMAP slot = netmap_reset(na, NR_RX, 0, 0); #endif /* ** Free current RX buffer structs and their mbufs */ for (int i = 0; i < adapter->num_rx_desc; i++) { rxbuf = &rxr->rx_buffers[i]; if (rxbuf->m_head != NULL) { bus_dmamap_sync(rxr->rxtag, rxbuf->map, BUS_DMASYNC_POSTREAD); bus_dmamap_unload(rxr->rxtag, rxbuf->map); m_freem(rxbuf->m_head); rxbuf->m_head = NULL; /* mark as freed */ } } /* Now replenish the mbufs */ for (int j = 0; j != adapter->num_rx_desc; ++j) { rxbuf = &rxr->rx_buffers[j]; #ifdef DEV_NETMAP if (slot) { int si = netmap_idx_n2k(&na->rx_rings[rxr->me], j); uint64_t paddr; void *addr; addr = PNMB(slot + si, &paddr); netmap_load_map(rxr->rxtag, rxbuf->map, addr); /* Update descriptor */ rxr->rx_base[j].buffer_addr = htole64(paddr); continue; } #endif /* DEV_NETMAP */ rxbuf->m_head = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, adapter->rx_mbuf_sz); if (rxbuf->m_head == NULL) { error = ENOBUFS; goto fail; } rxbuf->m_head->m_len = adapter->rx_mbuf_sz; rxbuf->m_head->m_flags &= ~M_HASFCS; /* we strip it */ rxbuf->m_head->m_pkthdr.len = adapter->rx_mbuf_sz; /* Get the memory mapping */ error = bus_dmamap_load_mbuf_sg(rxr->rxtag, rxbuf->map, rxbuf->m_head, seg, &nsegs, BUS_DMA_NOWAIT); if (error != 0) { m_freem(rxbuf->m_head); rxbuf->m_head = NULL; goto fail; } bus_dmamap_sync(rxr->rxtag, rxbuf->map, BUS_DMASYNC_PREREAD); /* Update descriptor */ rxr->rx_base[j].buffer_addr = htole64(seg[0].ds_addr); } rxr->next_to_check = 0; rxr->next_to_refresh = 0; bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); fail: EM_RX_UNLOCK(rxr); return (error); } /********************************************************************* * * Initialize all receive rings. * **********************************************************************/ static int em_setup_receive_structures(struct adapter *adapter) { struct rx_ring *rxr = adapter->rx_rings; int q; for (q = 0; q < adapter->num_queues; q++, rxr++) if (em_setup_receive_ring(rxr)) goto fail; return (0); fail: /* * Free RX buffers allocated so far, we will only handle * the rings that completed, the failing case will have * cleaned up for itself. 'q' failed, so its the terminus. */ for (int i = 0; i < q; ++i) { rxr = &adapter->rx_rings[i]; for (int n = 0; n < adapter->num_rx_desc; n++) { struct em_buffer *rxbuf; rxbuf = &rxr->rx_buffers[n]; if (rxbuf->m_head != NULL) { bus_dmamap_sync(rxr->rxtag, rxbuf->map, BUS_DMASYNC_POSTREAD); bus_dmamap_unload(rxr->rxtag, rxbuf->map); m_freem(rxbuf->m_head); rxbuf->m_head = NULL; } } rxr->next_to_check = 0; rxr->next_to_refresh = 0; } return (ENOBUFS); } /********************************************************************* * * Free all receive rings. 
* **********************************************************************/ static void em_free_receive_structures(struct adapter *adapter) { struct rx_ring *rxr = adapter->rx_rings; for (int i = 0; i < adapter->num_queues; i++, rxr++) { em_free_receive_buffers(rxr); /* Free the ring memory as well */ em_dma_free(adapter, &rxr->rxdma); EM_RX_LOCK_DESTROY(rxr); } free(adapter->rx_rings, M_DEVBUF); } /********************************************************************* * * Free receive ring data structures * **********************************************************************/ static void em_free_receive_buffers(struct rx_ring *rxr) { struct adapter *adapter = rxr->adapter; struct em_buffer *rxbuf = NULL; INIT_DEBUGOUT("free_receive_buffers: begin"); if (rxr->rx_buffers != NULL) { for (int i = 0; i < adapter->num_rx_desc; i++) { rxbuf = &rxr->rx_buffers[i]; if (rxbuf->map != NULL) { bus_dmamap_sync(rxr->rxtag, rxbuf->map, BUS_DMASYNC_POSTREAD); bus_dmamap_unload(rxr->rxtag, rxbuf->map); bus_dmamap_destroy(rxr->rxtag, rxbuf->map); } if (rxbuf->m_head != NULL) { m_freem(rxbuf->m_head); rxbuf->m_head = NULL; } } free(rxr->rx_buffers, M_DEVBUF); rxr->rx_buffers = NULL; rxr->next_to_check = 0; rxr->next_to_refresh = 0; } if (rxr->rxtag != NULL) { bus_dma_tag_destroy(rxr->rxtag); rxr->rxtag = NULL; } return; } /********************************************************************* * * Enable receive unit. * **********************************************************************/ static void em_initialize_receive_unit(struct adapter *adapter) { struct rx_ring *rxr = adapter->rx_rings; struct ifnet *ifp = adapter->ifp; struct e1000_hw *hw = &adapter->hw; u64 bus_addr; u32 rctl, rxcsum; INIT_DEBUGOUT("em_initialize_receive_units: begin"); /* * Make sure receives are disabled while setting * up the descriptor ring */ rctl = E1000_READ_REG(hw, E1000_RCTL); /* Do not disable if ever enabled on this hardware */ if ((hw->mac.type != e1000_82574) && (hw->mac.type != e1000_82583)) E1000_WRITE_REG(hw, E1000_RCTL, rctl & ~E1000_RCTL_EN); E1000_WRITE_REG(&adapter->hw, E1000_RADV, adapter->rx_abs_int_delay.value); /* * Set the interrupt throttling rate. Value is calculated * as DEFAULT_ITR = 1/(MAX_INTS_PER_SEC * 256ns) */ E1000_WRITE_REG(hw, E1000_ITR, DEFAULT_ITR); /* ** When using MSIX interrupts we need to throttle ** using the EITR register (82574 only) */ if (hw->mac.type == e1000_82574) { for (int i = 0; i < 4; i++) E1000_WRITE_REG(hw, E1000_EITR_82574(i), DEFAULT_ITR); /* Disable accelerated acknowledge */ E1000_WRITE_REG(hw, E1000_RFCTL, E1000_RFCTL_ACK_DIS); } rxcsum = E1000_READ_REG(hw, E1000_RXCSUM); if (ifp->if_capenable & IFCAP_RXCSUM) rxcsum |= E1000_RXCSUM_TUOFL; else rxcsum &= ~E1000_RXCSUM_TUOFL; E1000_WRITE_REG(hw, E1000_RXCSUM, rxcsum); /* ** XXX TEMPORARY WORKAROUND: on some systems with 82573 ** long latencies are observed, like Lenovo X60. This ** change eliminates the problem, but since having positive ** values in RDTR is a known source of problems on other ** platforms another solution is being sought. 
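	*/

/*
 * [Editor's worked example for the ITR formula quoted above; the
 * 8000 ints/sec figure is an assumed example rate, not necessarily the
 * driver's default.]  With the register counting in 256 ns units:
 *
 *	ITR = 1 / (MAX_INTS_PER_SEC * 256e-9)
 *	    = 1 / (8000 * 256e-9) ~= 488 (0x1e8)
 */

/*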
	*/
	if (hw->mac.type == e1000_82573)
		E1000_WRITE_REG(hw, E1000_RDTR, 0x20);

	for (int i = 0; i < adapter->num_queues; i++, rxr++) {
		/* Setup the Base and Length of the Rx Descriptor Ring */
		u32 rdt = adapter->num_rx_desc - 1; /* default */

		bus_addr = rxr->rxdma.dma_paddr;
		E1000_WRITE_REG(hw, E1000_RDLEN(i),
		    adapter->num_rx_desc * sizeof(struct e1000_rx_desc));
		E1000_WRITE_REG(hw, E1000_RDBAH(i), (u32)(bus_addr >> 32));
		E1000_WRITE_REG(hw, E1000_RDBAL(i), (u32)bus_addr);
		/* Setup the Head and Tail Descriptor Pointers */
		E1000_WRITE_REG(hw, E1000_RDH(i), 0);
#ifdef DEV_NETMAP
		/*
		 * an init() while a netmap client is active must
		 * preserve the rx buffers passed to userspace.
		 */
		if (ifp->if_capenable & IFCAP_NETMAP)
-			rdt -= NA(adapter->ifp)->rx_rings[i].nr_hwavail;
+			rdt -= nm_kr_rxspace(&NA(adapter->ifp)->rx_rings[i]);
#endif /* DEV_NETMAP */
		E1000_WRITE_REG(hw, E1000_RDT(i), rdt);
	}

	/* Set PTHRESH for improved jumbo performance */
	if (((adapter->hw.mac.type == e1000_ich9lan) ||
	    (adapter->hw.mac.type == e1000_pch2lan) ||
	    (adapter->hw.mac.type == e1000_ich10lan)) &&
	    (ifp->if_mtu > ETHERMTU)) {
		u32 rxdctl = E1000_READ_REG(hw, E1000_RXDCTL(0));
		E1000_WRITE_REG(hw, E1000_RXDCTL(0), rxdctl | 3);
	}

	if (adapter->hw.mac.type >= e1000_pch2lan) {
		if (ifp->if_mtu > ETHERMTU)
			e1000_lv_jumbo_workaround_ich8lan(hw, TRUE);
		else
			e1000_lv_jumbo_workaround_ich8lan(hw, FALSE);
	}

	/* Setup the Receive Control Register */
	rctl &= ~(3 << E1000_RCTL_MO_SHIFT);
	rctl |= E1000_RCTL_EN | E1000_RCTL_BAM | E1000_RCTL_LBM_NO |
	    E1000_RCTL_RDMTS_HALF |
	    (hw->mac.mc_filter_type << E1000_RCTL_MO_SHIFT);

	/* Strip the CRC */
	rctl |= E1000_RCTL_SECRC;

	/* Make sure VLAN Filters are off */
	rctl &= ~E1000_RCTL_VFE;
	rctl &= ~E1000_RCTL_SBP;

	if (adapter->rx_mbuf_sz == MCLBYTES)
		rctl |= E1000_RCTL_SZ_2048;
	else if (adapter->rx_mbuf_sz == MJUMPAGESIZE)
		rctl |= E1000_RCTL_SZ_4096 | E1000_RCTL_BSEX;
	else if (adapter->rx_mbuf_sz > MJUMPAGESIZE)
		rctl |= E1000_RCTL_SZ_8192 | E1000_RCTL_BSEX;

	if (ifp->if_mtu > ETHERMTU)
		rctl |= E1000_RCTL_LPE;
	else
		rctl &= ~E1000_RCTL_LPE;

	/* Write out the settings */
	E1000_WRITE_REG(hw, E1000_RCTL, rctl);

	return;
}

/*********************************************************************
 *
 *  This routine executes in interrupt context. It replenishes
 *  the mbufs in the descriptor and sends data which has been
 *  dma'ed into host memory to upper layer.
 *
 *  We loop at most count times if count is > 0, or until done if
 *  count < 0.
* * For polling we also now return the number of cleaned packets *********************************************************************/ static bool em_rxeof(struct rx_ring *rxr, int count, int *done) { struct adapter *adapter = rxr->adapter; struct ifnet *ifp = adapter->ifp; struct mbuf *mp, *sendmp; u8 status = 0; u16 len; int i, processed, rxdone = 0; bool eop; struct e1000_rx_desc *cur; EM_RX_LOCK(rxr); #ifdef DEV_NETMAP if (netmap_rx_irq(ifp, rxr->me, &processed)) { EM_RX_UNLOCK(rxr); return (FALSE); } #endif /* DEV_NETMAP */ for (i = rxr->next_to_check, processed = 0; count != 0;) { if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) break; bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map, BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); cur = &rxr->rx_base[i]; status = cur->status; mp = sendmp = NULL; if ((status & E1000_RXD_STAT_DD) == 0) break; len = le16toh(cur->length); eop = (status & E1000_RXD_STAT_EOP) != 0; if ((cur->errors & E1000_RXD_ERR_FRAME_ERR_MASK) || (rxr->discard == TRUE)) { adapter->dropped_pkts++; ++rxr->rx_discarded; if (!eop) /* Catch subsequent segs */ rxr->discard = TRUE; else rxr->discard = FALSE; em_rx_discard(rxr, i); goto next_desc; } /* Assign correct length to the current fragment */ mp = rxr->rx_buffers[i].m_head; mp->m_len = len; /* Trigger for refresh */ rxr->rx_buffers[i].m_head = NULL; /* First segment? */ if (rxr->fmp == NULL) { mp->m_pkthdr.len = len; rxr->fmp = rxr->lmp = mp; } else { /* Chain mbuf's together */ mp->m_flags &= ~M_PKTHDR; rxr->lmp->m_next = mp; rxr->lmp = mp; rxr->fmp->m_pkthdr.len += len; } if (eop) { --count; sendmp = rxr->fmp; sendmp->m_pkthdr.rcvif = ifp; ifp->if_ipackets++; em_receive_checksum(cur, sendmp); #ifndef __NO_STRICT_ALIGNMENT if (adapter->hw.mac.max_frame_size > (MCLBYTES - ETHER_ALIGN) && em_fixup_rx(rxr) != 0) goto skip; #endif if (status & E1000_RXD_STAT_VP) { sendmp->m_pkthdr.ether_vtag = le16toh(cur->special); sendmp->m_flags |= M_VLANTAG; } #ifndef __NO_STRICT_ALIGNMENT skip: #endif rxr->fmp = rxr->lmp = NULL; } next_desc: /* Zero out the receive descriptors status. */ cur->status = 0; ++rxdone; /* cumulative for POLL */ ++processed; /* Advance our pointers to the next descriptor. */ if (++i == adapter->num_rx_desc) i = 0; /* Send to the stack */ if (sendmp != NULL) { rxr->next_to_check = i; EM_RX_UNLOCK(rxr); (*ifp->if_input)(ifp, sendmp); EM_RX_LOCK(rxr); i = rxr->next_to_check; } /* Only refresh mbufs every 8 descriptors */ if (processed == 8) { em_refresh_mbufs(rxr, i); processed = 0; } } /* Catch any remaining refresh work */ if (e1000_rx_unrefreshed(rxr)) em_refresh_mbufs(rxr, i); rxr->next_to_check = i; if (done != NULL) *done = rxdone; EM_RX_UNLOCK(rxr); return ((status & E1000_RXD_STAT_DD) ? TRUE : FALSE); } static __inline void em_rx_discard(struct rx_ring *rxr, int i) { struct em_buffer *rbuf; rbuf = &rxr->rx_buffers[i]; /* Free any previous pieces */ if (rxr->fmp != NULL) { rxr->fmp->m_flags |= M_PKTHDR; m_freem(rxr->fmp); rxr->fmp = NULL; rxr->lmp = NULL; } /* ** Free buffer and allow em_refresh_mbufs() ** to clean up and recharge buffer. */ if (rbuf->m_head) { m_free(rbuf->m_head); rbuf->m_head = NULL; } return; } #ifndef __NO_STRICT_ALIGNMENT /* * When jumbo frames are enabled we should realign entire payload on * architecures with strict alignment. This is serious design mistake of 8254x * as it nullifies DMA operations. 8254x just allows RX buffer size to be * 2048/4096/8192/16384. What we really want is 2048 - ETHER_ALIGN to align its * payload. 
 * On architectures without strict alignment restrictions the 8254x still
 * performs unaligned memory access, which reduces performance as well.
 * To avoid copying over an entire frame to align, we allocate a new mbuf and
 * copy the Ethernet header to the new mbuf. The new mbuf is prepended to the
 * existing mbuf chain.
 *
 * Be aware, best performance of the 8254x is achieved only when jumbo
 * frames are not used at all on architectures with strict alignment.
 */
static int
em_fixup_rx(struct rx_ring *rxr)
{
	struct adapter *adapter = rxr->adapter;
	struct mbuf *m, *n;
	int error;

	error = 0;
	m = rxr->fmp;
	if (m->m_len <= (MCLBYTES - ETHER_HDR_LEN)) {
		bcopy(m->m_data, m->m_data + ETHER_HDR_LEN, m->m_len);
		m->m_data += ETHER_HDR_LEN;
	} else {
		MGETHDR(n, M_NOWAIT, MT_DATA);
		if (n != NULL) {
			bcopy(m->m_data, n->m_data, ETHER_HDR_LEN);
			m->m_data += ETHER_HDR_LEN;
			m->m_len -= ETHER_HDR_LEN;
			n->m_len = ETHER_HDR_LEN;
			M_MOVE_PKTHDR(n, m);
			n->m_next = m;
			rxr->fmp = n;
		} else {
			adapter->dropped_pkts++;
			m_freem(rxr->fmp);
			rxr->fmp = NULL;
			error = ENOMEM;
		}
	}

	return (error);
}
#endif

/*********************************************************************
 *
 *  Verify that the hardware indicated that the checksum is valid.
 *  Inform the stack about the status of checksum so that stack
 *  doesn't spend time verifying the checksum.
 *
 *********************************************************************/
static void
em_receive_checksum(struct e1000_rx_desc *rx_desc, struct mbuf *mp)
{
	mp->m_pkthdr.csum_flags = 0;

	/* Ignore Checksum bit is set */
	if (rx_desc->status & E1000_RXD_STAT_IXSM)
		return;

	if (rx_desc->errors & (E1000_RXD_ERR_TCPE | E1000_RXD_ERR_IPE))
		return;

	/* IP Checksum Good? */
	if (rx_desc->status & E1000_RXD_STAT_IPCS)
		mp->m_pkthdr.csum_flags = (CSUM_IP_CHECKED | CSUM_IP_VALID);

	/* TCP or UDP checksum */
	if (rx_desc->status & (E1000_RXD_STAT_TCPCS | E1000_RXD_STAT_UDPCS)) {
		mp->m_pkthdr.csum_flags |= (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
		mp->m_pkthdr.csum_data = htons(0xffff);
	}
}

/*
 * This routine is run via a VLAN
 * config EVENT
 */
static void
em_register_vlan(void *arg, struct ifnet *ifp, u16 vtag)
{
	struct adapter	*adapter = ifp->if_softc;
	u32		index, bit;

	if (ifp->if_softc != arg)   /* Not our event */
		return;

	if ((vtag == 0) || (vtag > 4095))       /* Invalid ID */
		return;

	EM_CORE_LOCK(adapter);
	index = (vtag >> 5) & 0x7F;
	bit = vtag & 0x1F;
	adapter->shadow_vfta[index] |= (1 << bit);
	++adapter->num_vlans;
	/* Re-init to load the changes */
	if (ifp->if_capenable & IFCAP_VLAN_HWFILTER)
		em_init_locked(adapter);
	EM_CORE_UNLOCK(adapter);
}

/*
 * This routine is run via a VLAN
 * unconfig EVENT
 */
static void
em_unregister_vlan(void *arg, struct ifnet *ifp, u16 vtag)
{
	struct adapter	*adapter = ifp->if_softc;
	u32		index, bit;

	if (ifp->if_softc != arg)
		return;

	if ((vtag == 0) || (vtag > 4095))       /* Invalid */
		return;

	EM_CORE_LOCK(adapter);
	index = (vtag >> 5) & 0x7F;
	bit = vtag & 0x1F;
	adapter->shadow_vfta[index] &= ~(1 << bit);
	--adapter->num_vlans;
	/* Re-init to load the changes */
	if (ifp->if_capenable & IFCAP_VLAN_HWFILTER)
		em_init_locked(adapter);
	EM_CORE_UNLOCK(adapter);
}

static void
em_setup_vlan_hw_support(struct adapter *adapter)
{
	struct e1000_hw *hw = &adapter->hw;
	u32             reg;

	/*
	 * We get here thru init_locked, meaning
	 * a soft reset, this has already cleared
	 * the VFTA and other state, so if there
	 * have been no VLANs registered do nothing.
	 */
	if (adapter->num_vlans == 0)
		return;

	/*
	 * A soft reset zeroes out the VFTA, so
	 * we need to repopulate it now.
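	 */

/*
 * [Editor's worked example.]  A VLAN tag indexes the 128 x 32-bit VFTA
 * exactly as in em_register_vlan() above:
 *
 *	index = (vtag >> 5) & 0x7F;	selects one of 128 words
 *	bit   =  vtag & 0x1F;		selects one of 32 bits in it
 *
 * e.g. vtag 100: index = 100 >> 5 = 3, bit = 100 & 31 = 4, so the tag
 * lives at shadow_vfta[3], bit 4.
 */

/*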
*/ for (int i = 0; i < EM_VFTA_SIZE; i++) if (adapter->shadow_vfta[i] != 0) E1000_WRITE_REG_ARRAY(hw, E1000_VFTA, i, adapter->shadow_vfta[i]); reg = E1000_READ_REG(hw, E1000_CTRL); reg |= E1000_CTRL_VME; E1000_WRITE_REG(hw, E1000_CTRL, reg); /* Enable the Filter Table */ reg = E1000_READ_REG(hw, E1000_RCTL); reg &= ~E1000_RCTL_CFIEN; reg |= E1000_RCTL_VFE; E1000_WRITE_REG(hw, E1000_RCTL, reg); } static void em_enable_intr(struct adapter *adapter) { struct e1000_hw *hw = &adapter->hw; u32 ims_mask = IMS_ENABLE_MASK; if (hw->mac.type == e1000_82574) { E1000_WRITE_REG(hw, EM_EIAC, EM_MSIX_MASK); ims_mask |= EM_MSIX_MASK; } E1000_WRITE_REG(hw, E1000_IMS, ims_mask); } static void em_disable_intr(struct adapter *adapter) { struct e1000_hw *hw = &adapter->hw; if (hw->mac.type == e1000_82574) E1000_WRITE_REG(hw, EM_EIAC, 0); E1000_WRITE_REG(&adapter->hw, E1000_IMC, 0xffffffff); } /* * Bit of a misnomer, what this really means is * to enable OS management of the system... aka * to disable special hardware management features */ static void em_init_manageability(struct adapter *adapter) { /* A shared code workaround */ #define E1000_82542_MANC2H E1000_MANC2H if (adapter->has_manage) { int manc2h = E1000_READ_REG(&adapter->hw, E1000_MANC2H); int manc = E1000_READ_REG(&adapter->hw, E1000_MANC); /* disable hardware interception of ARP */ manc &= ~(E1000_MANC_ARP_EN); /* enable receiving management packets to the host */ manc |= E1000_MANC_EN_MNG2HOST; #define E1000_MNG2HOST_PORT_623 (1 << 5) #define E1000_MNG2HOST_PORT_664 (1 << 6) manc2h |= E1000_MNG2HOST_PORT_623; manc2h |= E1000_MNG2HOST_PORT_664; E1000_WRITE_REG(&adapter->hw, E1000_MANC2H, manc2h); E1000_WRITE_REG(&adapter->hw, E1000_MANC, manc); } } /* * Give control back to hardware management * controller if there is one. */ static void em_release_manageability(struct adapter *adapter) { if (adapter->has_manage) { int manc = E1000_READ_REG(&adapter->hw, E1000_MANC); /* re-enable hardware interception of ARP */ manc |= E1000_MANC_ARP_EN; manc &= ~E1000_MANC_EN_MNG2HOST; E1000_WRITE_REG(&adapter->hw, E1000_MANC, manc); } } /* * em_get_hw_control sets the {CTRL_EXT|FWSM}:DRV_LOAD bit. * For ASF and Pass Through versions of f/w this means * that the driver is loaded. For AMT version type f/w * this means that the network i/f is open. */ static void em_get_hw_control(struct adapter *adapter) { u32 ctrl_ext, swsm; if (adapter->hw.mac.type == e1000_82573) { swsm = E1000_READ_REG(&adapter->hw, E1000_SWSM); E1000_WRITE_REG(&adapter->hw, E1000_SWSM, swsm | E1000_SWSM_DRV_LOAD); return; } /* else */ ctrl_ext = E1000_READ_REG(&adapter->hw, E1000_CTRL_EXT); E1000_WRITE_REG(&adapter->hw, E1000_CTRL_EXT, ctrl_ext | E1000_CTRL_EXT_DRV_LOAD); return; } /* * em_release_hw_control resets {CTRL_EXT|FWSM}:DRV_LOAD bit. * For ASF and Pass Through versions of f/w this means that * the driver is no longer loaded. For AMT versions of the * f/w this means that the network i/f is closed. 
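 */

/*
 * [Editor's aside -- illustrative only.]  The two DRV_LOAD helpers are
 * used as a bracket around the driver's ownership of the hardware,
 * roughly:
 */
#if 0
	em_get_hw_control(adapter);	/* attach/init: driver owns the NIC */
	/* ... normal operation ... */
	em_release_hw_control(adapter);	/* detach/suspend: f/w may take over */
#endif

/*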
*/ static void em_release_hw_control(struct adapter *adapter) { u32 ctrl_ext, swsm; if (!adapter->has_manage) return; if (adapter->hw.mac.type == e1000_82573) { swsm = E1000_READ_REG(&adapter->hw, E1000_SWSM); E1000_WRITE_REG(&adapter->hw, E1000_SWSM, swsm & ~E1000_SWSM_DRV_LOAD); return; } /* else */ ctrl_ext = E1000_READ_REG(&adapter->hw, E1000_CTRL_EXT); E1000_WRITE_REG(&adapter->hw, E1000_CTRL_EXT, ctrl_ext & ~E1000_CTRL_EXT_DRV_LOAD); return; } static int em_is_valid_ether_addr(u8 *addr) { char zero_addr[6] = { 0, 0, 0, 0, 0, 0 }; if ((addr[0] & 1) || (!bcmp(addr, zero_addr, ETHER_ADDR_LEN))) { return (FALSE); } return (TRUE); } /* ** Parse the interface capabilities with regard ** to both system management and wake-on-lan for ** later use. */ static void em_get_wakeup(device_t dev) { struct adapter *adapter = device_get_softc(dev); u16 eeprom_data = 0, device_id, apme_mask; adapter->has_manage = e1000_enable_mng_pass_thru(&adapter->hw); apme_mask = EM_EEPROM_APME; switch (adapter->hw.mac.type) { case e1000_82573: case e1000_82583: adapter->has_amt = TRUE; /* Falls thru */ case e1000_82571: case e1000_82572: case e1000_80003es2lan: if (adapter->hw.bus.func == 1) { e1000_read_nvm(&adapter->hw, NVM_INIT_CONTROL3_PORT_B, 1, &eeprom_data); break; } else e1000_read_nvm(&adapter->hw, NVM_INIT_CONTROL3_PORT_A, 1, &eeprom_data); break; case e1000_ich8lan: case e1000_ich9lan: case e1000_ich10lan: case e1000_pchlan: case e1000_pch2lan: apme_mask = E1000_WUC_APME; adapter->has_amt = TRUE; eeprom_data = E1000_READ_REG(&adapter->hw, E1000_WUC); break; default: e1000_read_nvm(&adapter->hw, NVM_INIT_CONTROL3_PORT_A, 1, &eeprom_data); break; } if (eeprom_data & apme_mask) adapter->wol = (E1000_WUFC_MAG | E1000_WUFC_MC); /* * We have the eeprom settings, now apply the special cases * where the eeprom may be wrong or the board won't support * wake on lan on a particular port */ device_id = pci_get_device(dev); switch (device_id) { case E1000_DEV_ID_82571EB_FIBER: /* Wake events only supported on port A for dual fiber * regardless of eeprom setting */ if (E1000_READ_REG(&adapter->hw, E1000_STATUS) & E1000_STATUS_FUNC_1) adapter->wol = 0; break; case E1000_DEV_ID_82571EB_QUAD_COPPER: case E1000_DEV_ID_82571EB_QUAD_FIBER: case E1000_DEV_ID_82571EB_QUAD_COPPER_LP: /* if quad port adapter, disable WoL on all but port A */ if (global_quad_port_a != 0) adapter->wol = 0; /* Reset for multiple quad port adapters */ if (++global_quad_port_a == 4) global_quad_port_a = 0; break; } return; } /* * Enable PCI Wake On Lan capability */ static void em_enable_wakeup(device_t dev) { struct adapter *adapter = device_get_softc(dev); struct ifnet *ifp = adapter->ifp; u32 pmc, ctrl, ctrl_ext, rctl; u16 status; if ((pci_find_cap(dev, PCIY_PMG, &pmc) != 0)) return; /* Advertise the wakeup capability */ ctrl = E1000_READ_REG(&adapter->hw, E1000_CTRL); ctrl |= (E1000_CTRL_SWDPIN2 | E1000_CTRL_SWDPIN3); E1000_WRITE_REG(&adapter->hw, E1000_CTRL, ctrl); E1000_WRITE_REG(&adapter->hw, E1000_WUC, E1000_WUC_PME_EN); if ((adapter->hw.mac.type == e1000_ich8lan) || (adapter->hw.mac.type == e1000_pchlan) || (adapter->hw.mac.type == e1000_ich9lan) || (adapter->hw.mac.type == e1000_ich10lan)) e1000_suspend_workarounds_ich8lan(&adapter->hw); /* Keep the laser running on Fiber adapters */ if (adapter->hw.phy.media_type == e1000_media_type_fiber || adapter->hw.phy.media_type == e1000_media_type_internal_serdes) { ctrl_ext = E1000_READ_REG(&adapter->hw, E1000_CTRL_EXT); ctrl_ext |= E1000_CTRL_EXT_SDP3_DATA; E1000_WRITE_REG(&adapter->hw, 
E1000_CTRL_EXT, ctrl_ext); } /* ** Determine type of Wakeup: note that wol ** is set with all bits on by default. */ if ((ifp->if_capenable & IFCAP_WOL_MAGIC) == 0) adapter->wol &= ~E1000_WUFC_MAG; if ((ifp->if_capenable & IFCAP_WOL_MCAST) == 0) adapter->wol &= ~E1000_WUFC_MC; else { rctl = E1000_READ_REG(&adapter->hw, E1000_RCTL); rctl |= E1000_RCTL_MPE; E1000_WRITE_REG(&adapter->hw, E1000_RCTL, rctl); } if ((adapter->hw.mac.type == e1000_pchlan) || (adapter->hw.mac.type == e1000_pch2lan)) { if (em_enable_phy_wakeup(adapter)) return; } else { E1000_WRITE_REG(&adapter->hw, E1000_WUC, E1000_WUC_PME_EN); E1000_WRITE_REG(&adapter->hw, E1000_WUFC, adapter->wol); } if (adapter->hw.phy.type == e1000_phy_igp_3) e1000_igp3_phy_powerdown_workaround_ich8lan(&adapter->hw); /* Request PME */ status = pci_read_config(dev, pmc + PCIR_POWER_STATUS, 2); status &= ~(PCIM_PSTAT_PME | PCIM_PSTAT_PMEENABLE); if (ifp->if_capenable & IFCAP_WOL) status |= PCIM_PSTAT_PME | PCIM_PSTAT_PMEENABLE; pci_write_config(dev, pmc + PCIR_POWER_STATUS, status, 2); return; } /* ** WOL in the newer chipset interfaces (pchlan) ** require thing to be copied into the phy */ static int em_enable_phy_wakeup(struct adapter *adapter) { struct e1000_hw *hw = &adapter->hw; u32 mreg, ret = 0; u16 preg; /* copy MAC RARs to PHY RARs */ e1000_copy_rx_addrs_to_phy_ich8lan(hw); /* copy MAC MTA to PHY MTA */ for (int i = 0; i < adapter->hw.mac.mta_reg_count; i++) { mreg = E1000_READ_REG_ARRAY(hw, E1000_MTA, i); e1000_write_phy_reg(hw, BM_MTA(i), (u16)(mreg & 0xFFFF)); e1000_write_phy_reg(hw, BM_MTA(i) + 1, (u16)((mreg >> 16) & 0xFFFF)); } /* configure PHY Rx Control register */ e1000_read_phy_reg(&adapter->hw, BM_RCTL, &preg); mreg = E1000_READ_REG(hw, E1000_RCTL); if (mreg & E1000_RCTL_UPE) preg |= BM_RCTL_UPE; if (mreg & E1000_RCTL_MPE) preg |= BM_RCTL_MPE; preg &= ~(BM_RCTL_MO_MASK); if (mreg & E1000_RCTL_MO_3) preg |= (((mreg & E1000_RCTL_MO_3) >> E1000_RCTL_MO_SHIFT) << BM_RCTL_MO_SHIFT); if (mreg & E1000_RCTL_BAM) preg |= BM_RCTL_BAM; if (mreg & E1000_RCTL_PMCF) preg |= BM_RCTL_PMCF; mreg = E1000_READ_REG(hw, E1000_CTRL); if (mreg & E1000_CTRL_RFCE) preg |= BM_RCTL_RFCE; e1000_write_phy_reg(&adapter->hw, BM_RCTL, preg); /* enable PHY wakeup in MAC register */ E1000_WRITE_REG(hw, E1000_WUC, E1000_WUC_PHY_WAKE | E1000_WUC_PME_EN); E1000_WRITE_REG(hw, E1000_WUFC, adapter->wol); /* configure and enable PHY wakeup in PHY registers */ e1000_write_phy_reg(&adapter->hw, BM_WUFC, adapter->wol); e1000_write_phy_reg(&adapter->hw, BM_WUC, E1000_WUC_PME_EN); /* activate PHY wakeup */ ret = hw->phy.ops.acquire(hw); if (ret) { printf("Could not acquire PHY\n"); return ret; } e1000_write_phy_reg_mdic(hw, IGP01E1000_PHY_PAGE_SELECT, (BM_WUC_ENABLE_PAGE << IGP_PAGE_SHIFT)); ret = e1000_read_phy_reg_mdic(hw, BM_WUC_ENABLE_REG, &preg); if (ret) { printf("Could not read PHY page 769\n"); goto out; } preg |= BM_WUC_ENABLE_BIT | BM_WUC_HOST_WU_BIT; ret = e1000_write_phy_reg_mdic(hw, BM_WUC_ENABLE_REG, preg); if (ret) printf("Could not set PHY Host Wakeup bit\n"); out: hw->phy.ops.release(hw); return ret; } static void em_led_func(void *arg, int onoff) { struct adapter *adapter = arg; EM_CORE_LOCK(adapter); if (onoff) { e1000_setup_led(&adapter->hw); e1000_led_on(&adapter->hw); } else { e1000_led_off(&adapter->hw); e1000_cleanup_led(&adapter->hw); } EM_CORE_UNLOCK(adapter); } /* ** Disable the L0S and L1 LINK states */ static void em_disable_aspm(struct adapter *adapter) { int base, reg; u16 link_cap,link_ctrl; device_t dev = adapter->dev; switch 
(adapter->hw.mac.type) { case e1000_82573: case e1000_82574: case e1000_82583: break; default: return; } if (pci_find_cap(dev, PCIY_EXPRESS, &base) != 0) return; reg = base + PCIER_LINK_CAP; link_cap = pci_read_config(dev, reg, 2); if ((link_cap & PCIEM_LINK_CAP_ASPM) == 0) return; reg = base + PCIER_LINK_CTL; link_ctrl = pci_read_config(dev, reg, 2); link_ctrl &= ~PCIEM_LINK_CTL_ASPMC; pci_write_config(dev, reg, link_ctrl, 2); return; } /********************************************************************** * * Update the board statistics counters. * **********************************************************************/ static void em_update_stats_counters(struct adapter *adapter) { struct ifnet *ifp; if(adapter->hw.phy.media_type == e1000_media_type_copper || (E1000_READ_REG(&adapter->hw, E1000_STATUS) & E1000_STATUS_LU)) { adapter->stats.symerrs += E1000_READ_REG(&adapter->hw, E1000_SYMERRS); adapter->stats.sec += E1000_READ_REG(&adapter->hw, E1000_SEC); } adapter->stats.crcerrs += E1000_READ_REG(&adapter->hw, E1000_CRCERRS); adapter->stats.mpc += E1000_READ_REG(&adapter->hw, E1000_MPC); adapter->stats.scc += E1000_READ_REG(&adapter->hw, E1000_SCC); adapter->stats.ecol += E1000_READ_REG(&adapter->hw, E1000_ECOL); adapter->stats.mcc += E1000_READ_REG(&adapter->hw, E1000_MCC); adapter->stats.latecol += E1000_READ_REG(&adapter->hw, E1000_LATECOL); adapter->stats.colc += E1000_READ_REG(&adapter->hw, E1000_COLC); adapter->stats.dc += E1000_READ_REG(&adapter->hw, E1000_DC); adapter->stats.rlec += E1000_READ_REG(&adapter->hw, E1000_RLEC); adapter->stats.xonrxc += E1000_READ_REG(&adapter->hw, E1000_XONRXC); adapter->stats.xontxc += E1000_READ_REG(&adapter->hw, E1000_XONTXC); /* ** For watchdog management we need to know if we have been ** paused during the last interval, so capture that here. */ adapter->pause_frames = E1000_READ_REG(&adapter->hw, E1000_XOFFRXC); adapter->stats.xoffrxc += adapter->pause_frames; adapter->stats.xofftxc += E1000_READ_REG(&adapter->hw, E1000_XOFFTXC); adapter->stats.fcruc += E1000_READ_REG(&adapter->hw, E1000_FCRUC); adapter->stats.prc64 += E1000_READ_REG(&adapter->hw, E1000_PRC64); adapter->stats.prc127 += E1000_READ_REG(&adapter->hw, E1000_PRC127); adapter->stats.prc255 += E1000_READ_REG(&adapter->hw, E1000_PRC255); adapter->stats.prc511 += E1000_READ_REG(&adapter->hw, E1000_PRC511); adapter->stats.prc1023 += E1000_READ_REG(&adapter->hw, E1000_PRC1023); adapter->stats.prc1522 += E1000_READ_REG(&adapter->hw, E1000_PRC1522); adapter->stats.gprc += E1000_READ_REG(&adapter->hw, E1000_GPRC); adapter->stats.bprc += E1000_READ_REG(&adapter->hw, E1000_BPRC); adapter->stats.mprc += E1000_READ_REG(&adapter->hw, E1000_MPRC); adapter->stats.gptc += E1000_READ_REG(&adapter->hw, E1000_GPTC); /* For the 64-bit byte counters the low dword must be read first. 
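 */

/*
 * [Editor's aside -- illustrative only.]  Per the notes just above and
 * below, the low half is read first and the read of the high half clears
 * the pair, giving one consistent 64-bit sample:
 */
#if 0
	u64 gorc = E1000_READ_REG(&adapter->hw, E1000_GORCL) +
	    ((u64)E1000_READ_REG(&adapter->hw, E1000_GORCH) << 32);
#endif

/*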
*/ /* Both registers clear on the read of the high dword */ adapter->stats.gorc += E1000_READ_REG(&adapter->hw, E1000_GORCL) + ((u64)E1000_READ_REG(&adapter->hw, E1000_GORCH) << 32); adapter->stats.gotc += E1000_READ_REG(&adapter->hw, E1000_GOTCL) + ((u64)E1000_READ_REG(&adapter->hw, E1000_GOTCH) << 32); adapter->stats.rnbc += E1000_READ_REG(&adapter->hw, E1000_RNBC); adapter->stats.ruc += E1000_READ_REG(&adapter->hw, E1000_RUC); adapter->stats.rfc += E1000_READ_REG(&adapter->hw, E1000_RFC); adapter->stats.roc += E1000_READ_REG(&adapter->hw, E1000_ROC); adapter->stats.rjc += E1000_READ_REG(&adapter->hw, E1000_RJC); adapter->stats.tor += E1000_READ_REG(&adapter->hw, E1000_TORH); adapter->stats.tot += E1000_READ_REG(&adapter->hw, E1000_TOTH); adapter->stats.tpr += E1000_READ_REG(&adapter->hw, E1000_TPR); adapter->stats.tpt += E1000_READ_REG(&adapter->hw, E1000_TPT); adapter->stats.ptc64 += E1000_READ_REG(&adapter->hw, E1000_PTC64); adapter->stats.ptc127 += E1000_READ_REG(&adapter->hw, E1000_PTC127); adapter->stats.ptc255 += E1000_READ_REG(&adapter->hw, E1000_PTC255); adapter->stats.ptc511 += E1000_READ_REG(&adapter->hw, E1000_PTC511); adapter->stats.ptc1023 += E1000_READ_REG(&adapter->hw, E1000_PTC1023); adapter->stats.ptc1522 += E1000_READ_REG(&adapter->hw, E1000_PTC1522); adapter->stats.mptc += E1000_READ_REG(&adapter->hw, E1000_MPTC); adapter->stats.bptc += E1000_READ_REG(&adapter->hw, E1000_BPTC); /* Interrupt Counts */ adapter->stats.iac += E1000_READ_REG(&adapter->hw, E1000_IAC); adapter->stats.icrxptc += E1000_READ_REG(&adapter->hw, E1000_ICRXPTC); adapter->stats.icrxatc += E1000_READ_REG(&adapter->hw, E1000_ICRXATC); adapter->stats.ictxptc += E1000_READ_REG(&adapter->hw, E1000_ICTXPTC); adapter->stats.ictxatc += E1000_READ_REG(&adapter->hw, E1000_ICTXATC); adapter->stats.ictxqec += E1000_READ_REG(&adapter->hw, E1000_ICTXQEC); adapter->stats.ictxqmtc += E1000_READ_REG(&adapter->hw, E1000_ICTXQMTC); adapter->stats.icrxdmtc += E1000_READ_REG(&adapter->hw, E1000_ICRXDMTC); adapter->stats.icrxoc += E1000_READ_REG(&adapter->hw, E1000_ICRXOC); if (adapter->hw.mac.type >= e1000_82543) { adapter->stats.algnerrc += E1000_READ_REG(&adapter->hw, E1000_ALGNERRC); adapter->stats.rxerrc += E1000_READ_REG(&adapter->hw, E1000_RXERRC); adapter->stats.tncrs += E1000_READ_REG(&adapter->hw, E1000_TNCRS); adapter->stats.cexterr += E1000_READ_REG(&adapter->hw, E1000_CEXTERR); adapter->stats.tsctc += E1000_READ_REG(&adapter->hw, E1000_TSCTC); adapter->stats.tsctfc += E1000_READ_REG(&adapter->hw, E1000_TSCTFC); } ifp = adapter->ifp; ifp->if_collisions = adapter->stats.colc; /* Rx Errors */ ifp->if_ierrors = adapter->dropped_pkts + adapter->stats.rxerrc + adapter->stats.crcerrs + adapter->stats.algnerrc + adapter->stats.ruc + adapter->stats.roc + adapter->stats.mpc + adapter->stats.cexterr; /* Tx Errors */ ifp->if_oerrors = adapter->stats.ecol + adapter->stats.latecol + adapter->watchdog_events; } /* Export a single 32-bit register via a read-only sysctl. */ static int em_sysctl_reg_handler(SYSCTL_HANDLER_ARGS) { struct adapter *adapter; u_int val; adapter = oidp->oid_arg1; val = E1000_READ_REG(&adapter->hw, oidp->oid_arg2); return (sysctl_handle_int(oidp, &val, 0, req)); } /* * Add sysctl variables, one per statistic, to the system. 
*/ static void em_add_hw_stats(struct adapter *adapter) { device_t dev = adapter->dev; struct tx_ring *txr = adapter->tx_rings; struct rx_ring *rxr = adapter->rx_rings; struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(dev); struct sysctl_oid *tree = device_get_sysctl_tree(dev); struct sysctl_oid_list *child = SYSCTL_CHILDREN(tree); struct e1000_hw_stats *stats = &adapter->stats; struct sysctl_oid *stat_node, *queue_node, *int_node; struct sysctl_oid_list *stat_list, *queue_list, *int_list; #define QUEUE_NAME_LEN 32 char namebuf[QUEUE_NAME_LEN]; /* Driver Statistics */ SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "link_irq", CTLFLAG_RD, &adapter->link_irq, "Link MSIX IRQ Handled"); SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "mbuf_alloc_fail", CTLFLAG_RD, &adapter->mbuf_alloc_failed, "Std mbuf failed"); SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "cluster_alloc_fail", CTLFLAG_RD, &adapter->mbuf_cluster_failed, "Std mbuf cluster failed"); SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "dropped", CTLFLAG_RD, &adapter->dropped_pkts, "Driver dropped packets"); SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "tx_dma_fail", CTLFLAG_RD, &adapter->no_tx_dma_setup, "Driver tx dma failure in xmit"); SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "rx_overruns", CTLFLAG_RD, &adapter->rx_overruns, "RX overruns"); SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "watchdog_timeouts", CTLFLAG_RD, &adapter->watchdog_events, "Watchdog timeouts"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "device_control", CTLTYPE_UINT | CTLFLAG_RD, adapter, E1000_CTRL, em_sysctl_reg_handler, "IU", "Device Control Register"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_control", CTLTYPE_UINT | CTLFLAG_RD, adapter, E1000_RCTL, em_sysctl_reg_handler, "IU", "Receiver Control Register"); SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "fc_high_water", CTLFLAG_RD, &adapter->hw.fc.high_water, 0, "Flow Control High Watermark"); SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "fc_low_water", CTLFLAG_RD, &adapter->hw.fc.low_water, 0, "Flow Control Low Watermark"); for (int i = 0; i < adapter->num_queues; i++, rxr++, txr++) { snprintf(namebuf, QUEUE_NAME_LEN, "queue%d", i); queue_node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, namebuf, CTLFLAG_RD, NULL, "Queue Name"); queue_list = SYSCTL_CHILDREN(queue_node); SYSCTL_ADD_PROC(ctx, queue_list, OID_AUTO, "txd_head", CTLTYPE_UINT | CTLFLAG_RD, adapter, E1000_TDH(txr->me), em_sysctl_reg_handler, "IU", "Transmit Descriptor Head"); SYSCTL_ADD_PROC(ctx, queue_list, OID_AUTO, "txd_tail", CTLTYPE_UINT | CTLFLAG_RD, adapter, E1000_TDT(txr->me), em_sysctl_reg_handler, "IU", "Transmit Descriptor Tail"); SYSCTL_ADD_ULONG(ctx, queue_list, OID_AUTO, "tx_irq", CTLFLAG_RD, &txr->tx_irq, "Queue MSI-X Transmit Interrupts"); SYSCTL_ADD_ULONG(ctx, queue_list, OID_AUTO, "no_desc_avail", CTLFLAG_RD, &txr->no_desc_avail, "Queue No Descriptor Available"); SYSCTL_ADD_PROC(ctx, queue_list, OID_AUTO, "rxd_head", CTLTYPE_UINT | CTLFLAG_RD, adapter, E1000_RDH(rxr->me), em_sysctl_reg_handler, "IU", "Receive Descriptor Head"); SYSCTL_ADD_PROC(ctx, queue_list, OID_AUTO, "rxd_tail", CTLTYPE_UINT | CTLFLAG_RD, adapter, E1000_RDT(rxr->me), em_sysctl_reg_handler, "IU", "Receive Descriptor Tail"); SYSCTL_ADD_ULONG(ctx, queue_list, OID_AUTO, "rx_irq", CTLFLAG_RD, &rxr->rx_irq, "Queue MSI-X Receive Interrupts"); } /* MAC stats get their own sub node */ stat_node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "mac_stats", CTLFLAG_RD, NULL, "Statistics"); stat_list = SYSCTL_CHILDREN(stat_node); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "excess_coll", CTLFLAG_RD, &stats->ecol, "Excessive collisions"); 
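
/*
 * [Editor's aside -- a userland sketch, not kernel code.]  Any counter
 * registered here can be read with sysctlbyname(3); the node name below
 * assumes unit 0 of the driver:
 */
#if 0
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t ecol;
	size_t len = sizeof(ecol);

	if (sysctlbyname("dev.em.0.mac_stats.excess_coll",
	    &ecol, &len, NULL, 0) == -1) {
		perror("sysctlbyname");
		return (1);
	}
	printf("excessive collisions: %ju\n", (uintmax_t)ecol);
	return (0);
}
#endif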
	SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "single_coll",
	    CTLFLAG_RD, &stats->scc, "Single collisions");
	SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "multiple_coll",
	    CTLFLAG_RD, &stats->mcc, "Multiple collisions");
	SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "late_coll",
	    CTLFLAG_RD, &stats->latecol, "Late collisions");
	SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "collision_count",
	    CTLFLAG_RD, &stats->colc, "Collision Count");
	SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "symbol_errors",
	    CTLFLAG_RD, &adapter->stats.symerrs, "Symbol Errors");
	SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "sequence_errors",
	    CTLFLAG_RD, &adapter->stats.sec, "Sequence Errors");
	SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "defer_count",
	    CTLFLAG_RD, &adapter->stats.dc, "Defer Count");
	SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "missed_packets",
	    CTLFLAG_RD, &adapter->stats.mpc, "Missed Packets");
	SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "recv_no_buff",
	    CTLFLAG_RD, &adapter->stats.rnbc, "Receive No Buffers");
	SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "recv_undersize",
	    CTLFLAG_RD, &adapter->stats.ruc, "Receive Undersize");
	SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "recv_fragmented",
	    CTLFLAG_RD, &adapter->stats.rfc, "Fragmented Packets Received");
	SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "recv_oversize",
	    CTLFLAG_RD, &adapter->stats.roc, "Oversized Packets Received");
	SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "recv_jabber",
	    CTLFLAG_RD, &adapter->stats.rjc, "Received Jabber");
	SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "recv_errs",
	    CTLFLAG_RD, &adapter->stats.rxerrc, "Receive Errors");
	SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "crc_errs",
	    CTLFLAG_RD, &adapter->stats.crcerrs, "CRC errors");
	SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "alignment_errs",
	    CTLFLAG_RD, &adapter->stats.algnerrc, "Alignment Errors");
	/* On 82575 these are collision counts */
	SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "coll_ext_errs",
	    CTLFLAG_RD, &adapter->stats.cexterr,
	    "Collision/Carrier extension errors");
	SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "xon_recvd",
	    CTLFLAG_RD, &adapter->stats.xonrxc, "XON Received");
	SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "xon_txd",
	    CTLFLAG_RD, &adapter->stats.xontxc, "XON Transmitted");
	SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "xoff_recvd",
	    CTLFLAG_RD, &adapter->stats.xoffrxc, "XOFF Received");
	SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "xoff_txd",
	    CTLFLAG_RD, &adapter->stats.xofftxc, "XOFF Transmitted");

	/* Packet Reception Stats */
	SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "total_pkts_recvd",
	    CTLFLAG_RD, &adapter->stats.tpr, "Total Packets Received");
	SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "good_pkts_recvd",
	    CTLFLAG_RD, &adapter->stats.gprc, "Good Packets Received");
	SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "bcast_pkts_recvd",
	    CTLFLAG_RD, &adapter->stats.bprc, "Broadcast Packets Received");
	SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "mcast_pkts_recvd",
	    CTLFLAG_RD, &adapter->stats.mprc, "Multicast Packets Received");
	SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "rx_frames_64",
	    CTLFLAG_RD, &adapter->stats.prc64, "64 byte frames received");
	SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "rx_frames_65_127",
	    CTLFLAG_RD, &adapter->stats.prc127, "65-127 byte frames received");
	SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "rx_frames_128_255",
	    CTLFLAG_RD, &adapter->stats.prc255,
	    "128-255 byte frames received");
	SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "rx_frames_256_511",
	    CTLFLAG_RD, &adapter->stats.prc511,
	    "256-511 byte frames received");
	SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "rx_frames_512_1023",
	    CTLFLAG_RD, &adapter->stats.prc1023,
	    "512-1023 byte frames received");
	SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "rx_frames_1024_1522",
	    CTLFLAG_RD, &adapter->stats.prc1522,
	    "1024-1522 byte frames received");
	SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "good_octets_recvd",
	    CTLFLAG_RD, &adapter->stats.gorc, "Good Octets Received");

	/* Packet Transmission Stats */
	SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "good_octets_txd",
	    CTLFLAG_RD, &adapter->stats.gotc, "Good Octets Transmitted");
	SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "total_pkts_txd",
	    CTLFLAG_RD, &adapter->stats.tpt, "Total Packets Transmitted");
	SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "good_pkts_txd",
	    CTLFLAG_RD, &adapter->stats.gptc, "Good Packets Transmitted");
	SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "bcast_pkts_txd",
	    CTLFLAG_RD, &adapter->stats.bptc, "Broadcast Packets Transmitted");
	SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "mcast_pkts_txd",
	    CTLFLAG_RD, &adapter->stats.mptc, "Multicast Packets Transmitted");
	SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "tx_frames_64",
	    CTLFLAG_RD, &adapter->stats.ptc64, "64 byte frames transmitted");
	SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "tx_frames_65_127",
	    CTLFLAG_RD, &adapter->stats.ptc127,
	    "65-127 byte frames transmitted");
	SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "tx_frames_128_255",
	    CTLFLAG_RD, &adapter->stats.ptc255,
	    "128-255 byte frames transmitted");
	SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "tx_frames_256_511",
	    CTLFLAG_RD, &adapter->stats.ptc511,
	    "256-511 byte frames transmitted");
	SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "tx_frames_512_1023",
	    CTLFLAG_RD, &adapter->stats.ptc1023,
	    "512-1023 byte frames transmitted");
	SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "tx_frames_1024_1522",
	    CTLFLAG_RD, &adapter->stats.ptc1522,
	    "1024-1522 byte frames transmitted");
	SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "tso_txd",
	    CTLFLAG_RD, &adapter->stats.tsctc, "TSO Contexts Transmitted");
	SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "tso_ctx_fail",
	    CTLFLAG_RD, &adapter->stats.tsctfc, "TSO Contexts Failed");

	/* Interrupt Stats */
	int_node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "interrupts",
	    CTLFLAG_RD, NULL, "Interrupt Statistics");
	int_list = SYSCTL_CHILDREN(int_node);

	SYSCTL_ADD_UQUAD(ctx, int_list, OID_AUTO, "asserts",
	    CTLFLAG_RD, &adapter->stats.iac, "Interrupt Assertion Count");
	SYSCTL_ADD_UQUAD(ctx, int_list, OID_AUTO, "rx_pkt_timer",
	    CTLFLAG_RD, &adapter->stats.icrxptc,
	    "Interrupt Cause Rx Pkt Timer Expire Count");
	SYSCTL_ADD_UQUAD(ctx, int_list, OID_AUTO, "rx_abs_timer",
	    CTLFLAG_RD, &adapter->stats.icrxatc,
	    "Interrupt Cause Rx Abs Timer Expire Count");
	SYSCTL_ADD_UQUAD(ctx, int_list, OID_AUTO, "tx_pkt_timer",
	    CTLFLAG_RD, &adapter->stats.ictxptc,
	    "Interrupt Cause Tx Pkt Timer Expire Count");
	SYSCTL_ADD_UQUAD(ctx, int_list, OID_AUTO, "tx_abs_timer",
	    CTLFLAG_RD, &adapter->stats.ictxatc,
	    "Interrupt Cause Tx Abs Timer Expire Count");
	SYSCTL_ADD_UQUAD(ctx, int_list, OID_AUTO, "tx_queue_empty",
	    CTLFLAG_RD, &adapter->stats.ictxqec,
	    "Interrupt Cause Tx Queue Empty Count");
	SYSCTL_ADD_UQUAD(ctx, int_list, OID_AUTO, "tx_queue_min_thresh",
	    CTLFLAG_RD, &adapter->stats.ictxqmtc,
	    "Interrupt Cause Tx Queue Min Thresh Count");
	SYSCTL_ADD_UQUAD(ctx, int_list, OID_AUTO, "rx_desc_min_thresh",
	    CTLFLAG_RD, &adapter->stats.icrxdmtc,
	    "Interrupt Cause Rx Desc Min Thresh Count");
	SYSCTL_ADD_UQUAD(ctx, int_list, OID_AUTO, "rx_overrun",
	    CTLFLAG_RD, &adapter->stats.icrxoc,
	    "Interrupt Cause Receiver Overrun Count");
}

/**********************************************************************
 *
 *  This routine
 *  provides a way to dump out the adapter EEPROM,
 *  often a useful debug/service tool. This only dumps the first
 *  32 words; everything that matters is within that extent.
 *
 **********************************************************************/
static int
em_sysctl_nvm_info(SYSCTL_HANDLER_ARGS)
{
	struct adapter *adapter = (struct adapter *)arg1;
	int error;
	int result;

	result = -1;
	error = sysctl_handle_int(oidp, &result, 0, req);

	if (error || !req->newptr)
		return (error);

	/*
	 * This value will cause a hex dump of the
	 * first 32 16-bit words of the EEPROM to
	 * the screen.
	 */
	if (result == 1)
		em_print_nvm_info(adapter);

	return (error);
}

static void
em_print_nvm_info(struct adapter *adapter)
{
	u16	eeprom_data;
	int	i, j, row = 0;

	/* It's a bit crude, but it gets the job done */
	printf("\nInterface EEPROM Dump:\n");
	printf("Offset\n0x0000 ");
	for (i = 0, j = 0; i < 32; i++, j++) {
		if (j == 8) { /* Make the offset block */
			j = 0; ++row;
			printf("\n0x00%x0 ",row);
		}
		e1000_read_nvm(&adapter->hw, i, 1, &eeprom_data);
		printf("%04x ", eeprom_data);
	}
	printf("\n");
}

static int
em_sysctl_int_delay(SYSCTL_HANDLER_ARGS)
{
	struct em_int_delay_info *info;
	struct adapter *adapter;
	u32 regval;
	int error, usecs, ticks;

	info = (struct em_int_delay_info *)arg1;
	usecs = info->value;
	error = sysctl_handle_int(oidp, &usecs, 0, req);
	if (error != 0 || req->newptr == NULL)
		return (error);
	if (usecs < 0 || usecs > EM_TICKS_TO_USECS(65535))
		return (EINVAL);
	info->value = usecs;
	ticks = EM_USECS_TO_TICKS(usecs);
	if (info->offset == E1000_ITR)	/* units are 256ns here */
		ticks *= 4;

	adapter = info->adapter;

	EM_CORE_LOCK(adapter);
	regval = E1000_READ_OFFSET(&adapter->hw, info->offset);
	regval = (regval & ~0xffff) | (ticks & 0xffff);
	/* Handle a few special cases. */
	switch (info->offset) {
	case E1000_RDTR:
		break;
	case E1000_TIDV:
		if (ticks == 0) {
			adapter->txd_cmd &= ~E1000_TXD_CMD_IDE;
			/* Don't write 0 into the TIDV register. */
			regval++;
		} else
			adapter->txd_cmd |= E1000_TXD_CMD_IDE;
		break;
	}
	E1000_WRITE_OFFSET(&adapter->hw, info->offset, regval);
	EM_CORE_UNLOCK(adapter);
	return (0);
}

static void
em_add_int_delay_sysctl(struct adapter *adapter, const char *name,
	const char *description, struct em_int_delay_info *info,
	int offset, int value)
{
	info->adapter = adapter;
	info->offset = offset;
	info->value = value;
	SYSCTL_ADD_PROC(device_get_sysctl_ctx(adapter->dev),
	    SYSCTL_CHILDREN(device_get_sysctl_tree(adapter->dev)),
	    OID_AUTO, name, CTLTYPE_INT|CTLFLAG_RW,
	    info, 0, em_sysctl_int_delay, "I", description);
}

static void
em_set_sysctl_value(struct adapter *adapter, const char *name,
	const char *description, int *limit, int value)
{
	*limit = value;
	SYSCTL_ADD_INT(device_get_sysctl_ctx(adapter->dev),
	    SYSCTL_CHILDREN(device_get_sysctl_tree(adapter->dev)),
	    OID_AUTO, name, CTLTYPE_INT|CTLFLAG_RW, limit, value, description);
}


/*
 * Set flow control using sysctl:
 * Flow control values:
 *	0 - off
 *	1 - rx pause
 *	2 - tx pause
 *	3 - full
 */
static int
em_set_flowcntl(SYSCTL_HANDLER_ARGS)
{
	int		error;
	static int	input = 3; /* default is full */
	struct adapter	*adapter = (struct adapter *) arg1;

	error = sysctl_handle_int(oidp, &input, 0, req);

	if ((error) || (req->newptr == NULL))
		return (error);

	if (input == adapter->fc) /* no change?
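 */

/*
 * [Editor's aside.]  From userland this handler is reached through the
 * driver's flow-control sysctl (node name assumed), e.g.:
 *
 *	sysctl dev.em.0.fc=3
 *
 * where the values are those listed above: 0 off, 1 rx pause,
 * 2 tx pause, 3 full.
 */

/*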
*/ return (error); switch (input) { case e1000_fc_rx_pause: case e1000_fc_tx_pause: case e1000_fc_full: case e1000_fc_none: adapter->hw.fc.requested_mode = input; adapter->fc = input; break; default: /* Do nothing */ return (error); } adapter->hw.fc.current_mode = adapter->hw.fc.requested_mode; e1000_force_mac_fc(&adapter->hw); return (error); } /* ** Manage Energy Efficient Ethernet: ** Control values: ** 0/1 - enabled/disabled */ static int em_sysctl_eee(SYSCTL_HANDLER_ARGS) { struct adapter *adapter = (struct adapter *) arg1; int error, value; value = adapter->hw.dev_spec.ich8lan.eee_disable; error = sysctl_handle_int(oidp, &value, 0, req); if (error || req->newptr == NULL) return (error); EM_CORE_LOCK(adapter); adapter->hw.dev_spec.ich8lan.eee_disable = (value != 0); em_init_locked(adapter); EM_CORE_UNLOCK(adapter); return (0); } static int em_sysctl_debug_info(SYSCTL_HANDLER_ARGS) { struct adapter *adapter; int error; int result; result = -1; error = sysctl_handle_int(oidp, &result, 0, req); if (error || !req->newptr) return (error); if (result == 1) { adapter = (struct adapter *)arg1; em_print_debug_info(adapter); } return (error); } /* ** This routine is meant to be fluid, add whatever is ** needed for debugging a problem. -jfv */ static void em_print_debug_info(struct adapter *adapter) { device_t dev = adapter->dev; struct tx_ring *txr = adapter->tx_rings; struct rx_ring *rxr = adapter->rx_rings; if (adapter->ifp->if_drv_flags & IFF_DRV_RUNNING) printf("Interface is RUNNING "); else printf("Interface is NOT RUNNING\n"); if (adapter->ifp->if_drv_flags & IFF_DRV_OACTIVE) printf("and INACTIVE\n"); else printf("and ACTIVE\n"); device_printf(dev, "hw tdh = %d, hw tdt = %d\n", E1000_READ_REG(&adapter->hw, E1000_TDH(0)), E1000_READ_REG(&adapter->hw, E1000_TDT(0))); device_printf(dev, "hw rdh = %d, hw rdt = %d\n", E1000_READ_REG(&adapter->hw, E1000_RDH(0)), E1000_READ_REG(&adapter->hw, E1000_RDT(0))); device_printf(dev, "Tx Queue Status = %d\n", txr->queue_status); device_printf(dev, "TX descriptors avail = %d\n", txr->tx_avail); device_printf(dev, "Tx Descriptors avail failure = %ld\n", txr->no_desc_avail); device_printf(dev, "RX discarded packets = %ld\n", rxr->rx_discarded); device_printf(dev, "RX Next to Check = %d\n", rxr->next_to_check); device_printf(dev, "RX Next to Refresh = %d\n", rxr->next_to_refresh); } Index: stable/9/sys/dev/e1000/if_igb.c =================================================================== --- stable/9/sys/dev/e1000/if_igb.c (revision 262152) +++ stable/9/sys/dev/e1000/if_igb.c (revision 262153) @@ -1,6063 +1,6063 @@ /****************************************************************************** Copyright (c) 2001-2013, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ******************************************************************************/ /*$FreeBSD$*/ #include "opt_inet.h" #include "opt_inet6.h" #ifdef HAVE_KERNEL_OPTION_HEADERS #include "opt_device_polling.h" #include "opt_altq.h" #endif #include #include #ifndef IGB_LEGACY_TX #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "e1000_api.h" #include "e1000_82575.h" #include "if_igb.h" /********************************************************************* * Set this to one to display debug statistics *********************************************************************/ int igb_display_debug_stats = 0; /********************************************************************* * Driver version: *********************************************************************/ char igb_driver_version[] = "version - 2.3.10"; /********************************************************************* * PCI Device ID Table * * Used by probe to select devices to load on * Last field stores an index into e1000_strings * Last entry must be all 0s * * { Vendor ID, Device ID, SubVendor ID, SubDevice ID, String Index } *********************************************************************/ static igb_vendor_info_t igb_vendor_info_array[] = { { 0x8086, E1000_DEV_ID_82575EB_COPPER, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_82575EB_FIBER_SERDES, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_82575GB_QUAD_COPPER, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_82576, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_82576_NS, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_82576_NS_SERDES, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_82576_FIBER, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_82576_SERDES, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_82576_SERDES_QUAD, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_82576_QUAD_COPPER, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_82576_QUAD_COPPER_ET2, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_82576_VF, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_82580_COPPER, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_82580_FIBER, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_82580_SERDES, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_82580_SGMII, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_82580_COPPER_DUAL, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_82580_QUAD_FIBER, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_DH89XXCC_SERDES, PCI_ANY_ID, 
PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_DH89XXCC_SGMII, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_DH89XXCC_SFP, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_DH89XXCC_BACKPLANE, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_I350_COPPER, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_I350_FIBER, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_I350_SERDES, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_I350_SGMII, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_I350_VF, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_I210_COPPER, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_I210_COPPER_IT, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_I210_COPPER_OEM1, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_I210_FIBER, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_I210_SERDES, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_I210_SGMII, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_I211_COPPER, PCI_ANY_ID, PCI_ANY_ID, 0}, /* required last entry */ { 0, 0, 0, 0, 0} }; /********************************************************************* * Table of branding strings for all supported NICs. *********************************************************************/ static char *igb_strings[] = { "Intel(R) PRO/1000 Network Connection" }; /********************************************************************* * Function prototypes *********************************************************************/ static int igb_probe(device_t); static int igb_attach(device_t); static int igb_detach(device_t); static int igb_shutdown(device_t); static int igb_suspend(device_t); static int igb_resume(device_t); #ifndef IGB_LEGACY_TX static int igb_mq_start(struct ifnet *, struct mbuf *); static int igb_mq_start_locked(struct ifnet *, struct tx_ring *); static void igb_qflush(struct ifnet *); static void igb_deferred_mq_start(void *, int); #else static void igb_start(struct ifnet *); static void igb_start_locked(struct tx_ring *, struct ifnet *ifp); #endif static int igb_ioctl(struct ifnet *, u_long, caddr_t); static void igb_init(void *); static void igb_init_locked(struct adapter *); static void igb_stop(void *); static void igb_media_status(struct ifnet *, struct ifmediareq *); static int igb_media_change(struct ifnet *); static void igb_identify_hardware(struct adapter *); static int igb_allocate_pci_resources(struct adapter *); static int igb_allocate_msix(struct adapter *); static int igb_allocate_legacy(struct adapter *); static int igb_setup_msix(struct adapter *); static void igb_free_pci_resources(struct adapter *); static void igb_local_timer(void *); static void igb_reset(struct adapter *); static int igb_setup_interface(device_t, struct adapter *); static int igb_allocate_queues(struct adapter *); static void igb_configure_queues(struct adapter *); static int igb_allocate_transmit_buffers(struct tx_ring *); static void igb_setup_transmit_structures(struct adapter *); static void igb_setup_transmit_ring(struct tx_ring *); static void igb_initialize_transmit_units(struct adapter *); static void igb_free_transmit_structures(struct adapter *); static void igb_free_transmit_buffers(struct tx_ring *); static int igb_allocate_receive_buffers(struct rx_ring *); static int igb_setup_receive_structures(struct adapter *); static int igb_setup_receive_ring(struct rx_ring *); static void igb_initialize_receive_units(struct adapter *); static void igb_free_receive_structures(struct adapter *); static void igb_free_receive_buffers(struct rx_ring *); static void 
igb_free_receive_ring(struct rx_ring *); static void igb_enable_intr(struct adapter *); static void igb_disable_intr(struct adapter *); static void igb_update_stats_counters(struct adapter *); static bool igb_txeof(struct tx_ring *); static __inline void igb_rx_discard(struct rx_ring *, int); static __inline void igb_rx_input(struct rx_ring *, struct ifnet *, struct mbuf *, u32); static bool igb_rxeof(struct igb_queue *, int, int *); static void igb_rx_checksum(u32, struct mbuf *, u32); static bool igb_tx_ctx_setup(struct tx_ring *, struct mbuf *); static bool igb_tso_setup(struct tx_ring *, struct mbuf *, int, struct ip *, struct tcphdr *); static void igb_set_promisc(struct adapter *); static void igb_disable_promisc(struct adapter *); static void igb_set_multi(struct adapter *); static void igb_update_link_status(struct adapter *); static void igb_refresh_mbufs(struct rx_ring *, int); static void igb_register_vlan(void *, struct ifnet *, u16); static void igb_unregister_vlan(void *, struct ifnet *, u16); static void igb_setup_vlan_hw_support(struct adapter *); static int igb_xmit(struct tx_ring *, struct mbuf **); static int igb_dma_malloc(struct adapter *, bus_size_t, struct igb_dma_alloc *, int); static void igb_dma_free(struct adapter *, struct igb_dma_alloc *); static int igb_sysctl_nvm_info(SYSCTL_HANDLER_ARGS); static void igb_print_nvm_info(struct adapter *); static int igb_is_valid_ether_addr(u8 *); static void igb_add_hw_stats(struct adapter *); static void igb_vf_init_stats(struct adapter *); static void igb_update_vf_stats_counters(struct adapter *); /* Management and WOL Support */ static void igb_init_manageability(struct adapter *); static void igb_release_manageability(struct adapter *); static void igb_get_hw_control(struct adapter *); static void igb_release_hw_control(struct adapter *); static void igb_enable_wakeup(device_t); static void igb_led_func(void *, int); static int igb_irq_fast(void *); static void igb_msix_que(void *); static void igb_msix_link(void *); static void igb_handle_que(void *context, int pending); static void igb_handle_link(void *context, int pending); static void igb_handle_link_locked(struct adapter *); static void igb_set_sysctl_value(struct adapter *, const char *, const char *, int *, int); static int igb_set_flowcntl(SYSCTL_HANDLER_ARGS); static int igb_sysctl_dmac(SYSCTL_HANDLER_ARGS); static int igb_sysctl_eee(SYSCTL_HANDLER_ARGS); #ifdef DEVICE_POLLING static poll_handler_t igb_poll; #endif /* POLLING */ /********************************************************************* * FreeBSD Device Interface Entry Points *********************************************************************/ static device_method_t igb_methods[] = { /* Device interface */ DEVMETHOD(device_probe, igb_probe), DEVMETHOD(device_attach, igb_attach), DEVMETHOD(device_detach, igb_detach), DEVMETHOD(device_shutdown, igb_shutdown), DEVMETHOD(device_suspend, igb_suspend), DEVMETHOD(device_resume, igb_resume), DEVMETHOD_END }; static driver_t igb_driver = { "igb", igb_methods, sizeof(struct adapter), }; static devclass_t igb_devclass; DRIVER_MODULE(igb, pci, igb_driver, igb_devclass, 0, 0); MODULE_DEPEND(igb, pci, 1, 1, 1); MODULE_DEPEND(igb, ether, 1, 1, 1); /********************************************************************* * Tunable default values. 
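* (Editorial note, not from the source: the CTLFLAG_RDTUN knobs below
* are loader tunables, normally set in /boot/loader.conf before the
* module loads, e.g.
*	hw.igb.rxd=2048
*	hw.igb.enable_msix=0
* while CTLFLAG_RW entries such as hw.igb.enable_aim may also be
* changed at runtime with sysctl(8).)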
*********************************************************************/ static SYSCTL_NODE(_hw, OID_AUTO, igb, CTLFLAG_RD, 0, "IGB driver parameters"); /* Descriptor defaults */ static int igb_rxd = IGB_DEFAULT_RXD; static int igb_txd = IGB_DEFAULT_TXD; TUNABLE_INT("hw.igb.rxd", &igb_rxd); TUNABLE_INT("hw.igb.txd", &igb_txd); SYSCTL_INT(_hw_igb, OID_AUTO, rxd, CTLFLAG_RDTUN, &igb_rxd, 0, "Number of receive descriptors per queue"); SYSCTL_INT(_hw_igb, OID_AUTO, txd, CTLFLAG_RDTUN, &igb_txd, 0, "Number of transmit descriptors per queue"); /* ** AIM: Adaptive Interrupt Moderation ** which means that the interrupt rate ** is varied over time based on the ** traffic for that interrupt vector */ static int igb_enable_aim = TRUE; TUNABLE_INT("hw.igb.enable_aim", &igb_enable_aim); SYSCTL_INT(_hw_igb, OID_AUTO, enable_aim, CTLFLAG_RW, &igb_enable_aim, 0, "Enable adaptive interrupt moderation"); /* * MSIX should be the default for best performance, * but this allows it to be forced off for testing. */ static int igb_enable_msix = 1; TUNABLE_INT("hw.igb.enable_msix", &igb_enable_msix); SYSCTL_INT(_hw_igb, OID_AUTO, enable_msix, CTLFLAG_RDTUN, &igb_enable_msix, 0, "Enable MSI-X interrupts"); /* ** Tunable interrupt rate */ static int igb_max_interrupt_rate = 8000; TUNABLE_INT("hw.igb.max_interrupt_rate", &igb_max_interrupt_rate); SYSCTL_INT(_hw_igb, OID_AUTO, max_interrupt_rate, CTLFLAG_RDTUN, &igb_max_interrupt_rate, 0, "Maximum interrupts per second"); #if __FreeBSD_version >= 800000 /* ** Tunable number of buffers in the buf-ring (drbr_xxx) */ static int igb_buf_ring_size = IGB_BR_SIZE; TUNABLE_INT("hw.igb.buf_ring_size", &igb_buf_ring_size); SYSCTL_INT(_hw_igb, OID_AUTO, buf_ring_size, CTLFLAG_RDTUN, &igb_buf_ring_size, 0, "Size of the bufring"); #endif /* ** Header split causes the packet header to ** be DMA'd to a separate mbuf from the payload. ** This can have memory alignment benefits, and ** small packets often fit entirely in the header ** mbuf and thus need no cluster. It's a very ** workload-dependent feature. */ static int igb_header_split = FALSE; TUNABLE_INT("hw.igb.hdr_split", &igb_header_split); SYSCTL_INT(_hw_igb, OID_AUTO, header_split, CTLFLAG_RDTUN, &igb_header_split, 0, "Enable receive mbuf header split"); /* ** This will autoconfigure based on the ** number of CPUs and max supported ** MSIX messages if left at 0. */ static int igb_num_queues = 0; TUNABLE_INT("hw.igb.num_queues", &igb_num_queues); SYSCTL_INT(_hw_igb, OID_AUTO, num_queues, CTLFLAG_RDTUN, &igb_num_queues, 0, "Number of queues to configure, 0 indicates autoconfigure"); /* ** Global variable to store last used CPU when binding queues ** to CPUs in igb_allocate_msix. Starts at CPU_FIRST and increments when a ** queue is bound to a cpu. */ static int igb_last_bind_cpu = -1; /* How many packets rxeof tries to clean at a time */ static int igb_rx_process_limit = 100; TUNABLE_INT("hw.igb.rx_process_limit", &igb_rx_process_limit); SYSCTL_INT(_hw_igb, OID_AUTO, rx_process_limit, CTLFLAG_RDTUN, &igb_rx_process_limit, 0, "Maximum number of received packets to process at a time, -1 means unlimited"); #ifdef DEV_NETMAP /* see ixgbe.c for details */ #include <dev/netmap/if_igb_netmap.h> #endif /* DEV_NETMAP */ /********************************************************************* * Device identification routine * * igb_probe determines if the driver should be loaded on an * adapter based on the PCI vendor/device id of the adapter.
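* Entries in igb_vendor_info_array may carry PCI_ANY_ID in their
* subvendor/subdevice fields, which the match loop below treats as
* a wildcard.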
* * return BUS_PROBE_DEFAULT on success, positive on failure *********************************************************************/ static int igb_probe(device_t dev) { char adapter_name[60]; uint16_t pci_vendor_id = 0; uint16_t pci_device_id = 0; uint16_t pci_subvendor_id = 0; uint16_t pci_subdevice_id = 0; igb_vendor_info_t *ent; INIT_DEBUGOUT("igb_probe: begin"); pci_vendor_id = pci_get_vendor(dev); if (pci_vendor_id != IGB_VENDOR_ID) return (ENXIO); pci_device_id = pci_get_device(dev); pci_subvendor_id = pci_get_subvendor(dev); pci_subdevice_id = pci_get_subdevice(dev); ent = igb_vendor_info_array; while (ent->vendor_id != 0) { if ((pci_vendor_id == ent->vendor_id) && (pci_device_id == ent->device_id) && ((pci_subvendor_id == ent->subvendor_id) || (ent->subvendor_id == PCI_ANY_ID)) && ((pci_subdevice_id == ent->subdevice_id) || (ent->subdevice_id == PCI_ANY_ID))) { sprintf(adapter_name, "%s %s", igb_strings[ent->index], igb_driver_version); device_set_desc_copy(dev, adapter_name); return (BUS_PROBE_DEFAULT); } ent++; } return (ENXIO); } /********************************************************************* * Device initialization routine * * The attach entry point is called when the driver is being loaded. * This routine identifies the type of hardware, allocates all resources * and initializes the hardware. * * return 0 on success, positive on failure *********************************************************************/ static int igb_attach(device_t dev) { struct adapter *adapter; int error = 0; u16 eeprom_data; INIT_DEBUGOUT("igb_attach: begin"); if (resource_disabled("igb", device_get_unit(dev))) { device_printf(dev, "Disabled by device hint\n"); return (ENXIO); } adapter = device_get_softc(dev); adapter->dev = adapter->osdep.dev = dev; IGB_CORE_LOCK_INIT(adapter, device_get_nameunit(dev)); /* SYSCTL stuff */ SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "nvm", CTLTYPE_INT|CTLFLAG_RW, adapter, 0, igb_sysctl_nvm_info, "I", "NVM Information"); igb_set_sysctl_value(adapter, "enable_aim", "Interrupt Moderation", &adapter->enable_aim, igb_enable_aim); SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "fc", CTLTYPE_INT|CTLFLAG_RW, adapter, 0, igb_set_flowcntl, "I", "Flow Control"); callout_init_mtx(&adapter->timer, &adapter->core_mtx, 0); /* Determine hardware and mac info */ igb_identify_hardware(adapter); /* Setup PCI resources */ if (igb_allocate_pci_resources(adapter)) { device_printf(dev, "Allocation of PCI resources failed\n"); error = ENXIO; goto err_pci; } /* Do Shared Code initialization */ if (e1000_setup_init_funcs(&adapter->hw, TRUE)) { device_printf(dev, "Setup of Shared code failed\n"); error = ENXIO; goto err_pci; } e1000_get_bus_info(&adapter->hw); /* Sysctl for limiting the amount of work done in the taskqueue */ igb_set_sysctl_value(adapter, "rx_processing_limit", "max number of rx packets to process", &adapter->rx_process_limit, igb_rx_process_limit); /* * Validate number of transmit and receive descriptors. It * must not exceed hardware maximum, and must be multiple * of E1000_DBA_ALIGN. 
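* (Worked example, assuming the stock definitions IGB_DBA_ALIGN = 128,
* IGB_DEFAULT_TXD = 1024 and a 16-byte legacy descriptor: a request of
* hw.igb.txd=1000 passes, since 1000 * 16 = 16000 is a multiple of 128,
* but hw.igb.txd=1023 yields 16368 bytes, which is not, so the driver
* would fall back to IGB_DEFAULT_TXD.)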
*/ if (((igb_txd * sizeof(struct e1000_tx_desc)) % IGB_DBA_ALIGN) != 0 || (igb_txd > IGB_MAX_TXD) || (igb_txd < IGB_MIN_TXD)) { device_printf(dev, "Using %d TX descriptors instead of %d!\n", IGB_DEFAULT_TXD, igb_txd); adapter->num_tx_desc = IGB_DEFAULT_TXD; } else adapter->num_tx_desc = igb_txd; if (((igb_rxd * sizeof(struct e1000_rx_desc)) % IGB_DBA_ALIGN) != 0 || (igb_rxd > IGB_MAX_RXD) || (igb_rxd < IGB_MIN_RXD)) { device_printf(dev, "Using %d RX descriptors instead of %d!\n", IGB_DEFAULT_RXD, igb_rxd); adapter->num_rx_desc = IGB_DEFAULT_RXD; } else adapter->num_rx_desc = igb_rxd; adapter->hw.mac.autoneg = DO_AUTO_NEG; adapter->hw.phy.autoneg_wait_to_complete = FALSE; adapter->hw.phy.autoneg_advertised = AUTONEG_ADV_DEFAULT; /* Copper options */ if (adapter->hw.phy.media_type == e1000_media_type_copper) { adapter->hw.phy.mdix = AUTO_ALL_MODES; adapter->hw.phy.disable_polarity_correction = FALSE; adapter->hw.phy.ms_type = IGB_MASTER_SLAVE; } /* * Set the frame limits assuming * standard ethernet sized frames. */ adapter->max_frame_size = ETHERMTU + ETHER_HDR_LEN + ETHERNET_FCS_SIZE; adapter->min_frame_size = ETH_ZLEN + ETHERNET_FCS_SIZE; /* ** Allocate and Setup Queues */ if (igb_allocate_queues(adapter)) { error = ENOMEM; goto err_pci; } /* Allocate the appropriate stats memory */ if (adapter->vf_ifp) { adapter->stats = (struct e1000_vf_stats *)malloc(sizeof \ (struct e1000_vf_stats), M_DEVBUF, M_NOWAIT | M_ZERO); igb_vf_init_stats(adapter); } else adapter->stats = (struct e1000_hw_stats *)malloc(sizeof \ (struct e1000_hw_stats), M_DEVBUF, M_NOWAIT | M_ZERO); if (adapter->stats == NULL) { device_printf(dev, "Can not allocate stats memory\n"); error = ENOMEM; goto err_late; } /* Allocate multicast array memory. */ adapter->mta = malloc(sizeof(u8) * ETH_ADDR_LEN * MAX_NUM_MULTICAST_ADDRESSES, M_DEVBUF, M_NOWAIT); if (adapter->mta == NULL) { device_printf(dev, "Can not allocate multicast setup array\n"); error = ENOMEM; goto err_late; } /* Some adapter-specific advanced features */ if (adapter->hw.mac.type >= e1000_i350) { SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "dmac", CTLTYPE_INT|CTLFLAG_RW, adapter, 0, igb_sysctl_dmac, "I", "DMA Coalesce"); SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "eee_disabled", CTLTYPE_INT|CTLFLAG_RW, adapter, 0, igb_sysctl_eee, "I", "Disable Energy Efficient Ethernet"); if (adapter->hw.phy.media_type == e1000_media_type_copper) e1000_set_eee_i350(&adapter->hw); } /* ** Start from a known state, this is ** important in reading the nvm and ** mac from that. */ e1000_reset_hw(&adapter->hw); /* Make sure we have a good EEPROM before we read from it */ if (((adapter->hw.mac.type != e1000_i210) && (adapter->hw.mac.type != e1000_i211)) && (e1000_validate_nvm_checksum(&adapter->hw) < 0)) { /* ** Some PCI-E parts fail the first check due to ** the link being in sleep state, call it again, ** if it fails a second time its a real issue. 
*/ if (e1000_validate_nvm_checksum(&adapter->hw) < 0) { device_printf(dev, "The EEPROM Checksum Is Not Valid\n"); error = EIO; goto err_late; } } /* ** Copy the permanent MAC address out of the EEPROM */ if (e1000_read_mac_addr(&adapter->hw) < 0) { device_printf(dev, "EEPROM read error while reading MAC" " address\n"); error = EIO; goto err_late; } /* Check its sanity */ if (!igb_is_valid_ether_addr(adapter->hw.mac.addr)) { device_printf(dev, "Invalid MAC address\n"); error = EIO; goto err_late; } /* Setup OS specific network interface */ if (igb_setup_interface(dev, adapter) != 0) goto err_late; /* Now get a good starting state */ igb_reset(adapter); /* Initialize statistics */ igb_update_stats_counters(adapter); adapter->hw.mac.get_link_status = 1; igb_update_link_status(adapter); /* Indicate SOL/IDER usage */ if (e1000_check_reset_block(&adapter->hw)) device_printf(dev, "PHY reset is blocked due to SOL/IDER session.\n"); /* Determine if we have to control management hardware */ adapter->has_manage = e1000_enable_mng_pass_thru(&adapter->hw); /* * Setup Wake-on-Lan */ /* APME bit in EEPROM is mapped to WUC.APME */ eeprom_data = E1000_READ_REG(&adapter->hw, E1000_WUC) & E1000_WUC_APME; if (eeprom_data) adapter->wol = E1000_WUFC_MAG; /* Register for VLAN events */ adapter->vlan_attach = EVENTHANDLER_REGISTER(vlan_config, igb_register_vlan, adapter, EVENTHANDLER_PRI_FIRST); adapter->vlan_detach = EVENTHANDLER_REGISTER(vlan_unconfig, igb_unregister_vlan, adapter, EVENTHANDLER_PRI_FIRST); igb_add_hw_stats(adapter); /* Tell the stack that the interface is not active */ adapter->ifp->if_drv_flags &= ~IFF_DRV_RUNNING; adapter->ifp->if_drv_flags |= IFF_DRV_OACTIVE; adapter->led_dev = led_create(igb_led_func, adapter, device_get_nameunit(dev)); /* ** Configure Interrupts */ if ((adapter->msix > 1) && (igb_enable_msix)) error = igb_allocate_msix(adapter); else /* MSI or Legacy */ error = igb_allocate_legacy(adapter); if (error) goto err_late; #ifdef DEV_NETMAP igb_netmap_attach(adapter); #endif /* DEV_NETMAP */ INIT_DEBUGOUT("igb_attach: end"); return (0); err_late: igb_detach(dev); igb_free_transmit_structures(adapter); igb_free_receive_structures(adapter); igb_release_hw_control(adapter); err_pci: igb_free_pci_resources(adapter); if (adapter->ifp != NULL) if_free(adapter->ifp); free(adapter->mta, M_DEVBUF); IGB_CORE_LOCK_DESTROY(adapter); return (error); } /********************************************************************* * Device removal routine * * The detach entry point is called when the driver is being removed. * This routine stops the adapter and deallocates all the resources * that were allocated for driver operation. 
* * return 0 on success, positive on failure *********************************************************************/ static int igb_detach(device_t dev) { struct adapter *adapter = device_get_softc(dev); struct ifnet *ifp = adapter->ifp; INIT_DEBUGOUT("igb_detach: begin"); /* Make sure VLANS are not using driver */ if (adapter->ifp->if_vlantrunk != NULL) { device_printf(dev,"Vlan in use, detach first\n"); return (EBUSY); } ether_ifdetach(adapter->ifp); if (adapter->led_dev != NULL) led_destroy(adapter->led_dev); #ifdef DEVICE_POLLING if (ifp->if_capenable & IFCAP_POLLING) ether_poll_deregister(ifp); #endif IGB_CORE_LOCK(adapter); adapter->in_detach = 1; igb_stop(adapter); IGB_CORE_UNLOCK(adapter); e1000_phy_hw_reset(&adapter->hw); /* Give control back to firmware */ igb_release_manageability(adapter); igb_release_hw_control(adapter); if (adapter->wol) { E1000_WRITE_REG(&adapter->hw, E1000_WUC, E1000_WUC_PME_EN); E1000_WRITE_REG(&adapter->hw, E1000_WUFC, adapter->wol); igb_enable_wakeup(dev); } /* Unregister VLAN events */ if (adapter->vlan_attach != NULL) EVENTHANDLER_DEREGISTER(vlan_config, adapter->vlan_attach); if (adapter->vlan_detach != NULL) EVENTHANDLER_DEREGISTER(vlan_unconfig, adapter->vlan_detach); callout_drain(&adapter->timer); #ifdef DEV_NETMAP netmap_detach(adapter->ifp); #endif /* DEV_NETMAP */ igb_free_pci_resources(adapter); bus_generic_detach(dev); if_free(ifp); igb_free_transmit_structures(adapter); igb_free_receive_structures(adapter); if (adapter->mta != NULL) free(adapter->mta, M_DEVBUF); IGB_CORE_LOCK_DESTROY(adapter); return (0); } /********************************************************************* * * Shutdown entry point * **********************************************************************/ static int igb_shutdown(device_t dev) { return igb_suspend(dev); } /* * Suspend/resume device methods. */ static int igb_suspend(device_t dev) { struct adapter *adapter = device_get_softc(dev); IGB_CORE_LOCK(adapter); igb_stop(adapter); igb_release_manageability(adapter); igb_release_hw_control(adapter); if (adapter->wol) { E1000_WRITE_REG(&adapter->hw, E1000_WUC, E1000_WUC_PME_EN); E1000_WRITE_REG(&adapter->hw, E1000_WUFC, adapter->wol); igb_enable_wakeup(dev); } IGB_CORE_UNLOCK(adapter); return bus_generic_suspend(dev); } static int igb_resume(device_t dev) { struct adapter *adapter = device_get_softc(dev); struct tx_ring *txr = adapter->tx_rings; struct ifnet *ifp = adapter->ifp; IGB_CORE_LOCK(adapter); igb_init_locked(adapter); igb_init_manageability(adapter); if ((ifp->if_flags & IFF_UP) && (ifp->if_drv_flags & IFF_DRV_RUNNING) && adapter->link_active) { for (int i = 0; i < adapter->num_queues; i++, txr++) { IGB_TX_LOCK(txr); #ifndef IGB_LEGACY_TX /* Process the stack queue only if not depleted */ if (((txr->queue_status & IGB_QUEUE_DEPLETED) == 0) && !drbr_empty(ifp, txr->br)) igb_mq_start_locked(ifp, txr); #else if (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) igb_start_locked(txr, ifp); #endif IGB_TX_UNLOCK(txr); } } IGB_CORE_UNLOCK(adapter); return bus_generic_resume(dev); } #ifdef IGB_LEGACY_TX /********************************************************************* * Transmit entry point * * igb_start is called by the stack to initiate a transmit. * The driver will remain in this routine as long as there are * packets to transmit and transmit resources are available. * In case resources are not available stack is notified and * the packet is requeued. 
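*
* (Editorial aside: stripped of locking, BPF taps and watchdog
* bookkeeping, the routine below follows the pattern sketched here;
* example_igb_drain() is a hypothetical name, not part of the driver.)
**********************************************************************/
static inline void
example_igb_drain(struct ifnet *ifp, struct tx_ring *txr)
{
	struct mbuf *m;

	while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) {
		if (txr->tx_avail <= IGB_MAX_SCATTER)
			break;				/* ring exhausted */
		IFQ_DRV_DEQUEUE(&ifp->if_snd, m);
		if (m == NULL)
			break;
		if (igb_xmit(txr, &m)) {		/* encap failed */
			if (m != NULL)			/* mbuf intact: requeue */
				IFQ_DRV_PREPEND(&ifp->if_snd, m);
			break;
		}
	}
}
/*********************************************************************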
**********************************************************************/ static void igb_start_locked(struct tx_ring *txr, struct ifnet *ifp) { struct adapter *adapter = ifp->if_softc; struct mbuf *m_head; IGB_TX_LOCK_ASSERT(txr); if ((ifp->if_drv_flags & (IFF_DRV_RUNNING|IFF_DRV_OACTIVE)) != IFF_DRV_RUNNING) return; if (!adapter->link_active) return; /* Call cleanup if number of TX descriptors low */ if (txr->tx_avail <= IGB_TX_CLEANUP_THRESHOLD) igb_txeof(txr); while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) { if (txr->tx_avail <= IGB_MAX_SCATTER) { txr->queue_status |= IGB_QUEUE_DEPLETED; break; } IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head); if (m_head == NULL) break; /* * Encapsulation can modify our pointer, and or make it * NULL on failure. In that event, we can't requeue. */ if (igb_xmit(txr, &m_head)) { if (m_head != NULL) IFQ_DRV_PREPEND(&ifp->if_snd, m_head); if (txr->tx_avail <= IGB_MAX_SCATTER) txr->queue_status |= IGB_QUEUE_DEPLETED; break; } /* Send a copy of the frame to the BPF listener */ ETHER_BPF_MTAP(ifp, m_head); /* Set watchdog on */ txr->watchdog_time = ticks; txr->queue_status |= IGB_QUEUE_WORKING; } } /* * Legacy TX driver routine, called from the * stack, always uses tx[0], and spins for it. * Should not be used with multiqueue tx */ static void igb_start(struct ifnet *ifp) { struct adapter *adapter = ifp->if_softc; struct tx_ring *txr = adapter->tx_rings; if (ifp->if_drv_flags & IFF_DRV_RUNNING) { IGB_TX_LOCK(txr); igb_start_locked(txr, ifp); IGB_TX_UNLOCK(txr); } return; } #else /* ~IGB_LEGACY_TX */ /* ** Multiqueue Transmit Entry: ** quick turnaround to the stack ** */ static int igb_mq_start(struct ifnet *ifp, struct mbuf *m) { struct adapter *adapter = ifp->if_softc; struct igb_queue *que; struct tx_ring *txr; int i, err = 0; /* Which queue to use */ if ((m->m_flags & M_FLOWID) != 0) i = m->m_pkthdr.flowid % adapter->num_queues; else i = curcpu % adapter->num_queues; txr = &adapter->tx_rings[i]; que = &adapter->queues[i]; err = drbr_enqueue(ifp, txr->br, m); if (err) return (err); if (IGB_TX_TRYLOCK(txr)) { err = igb_mq_start_locked(ifp, txr); IGB_TX_UNLOCK(txr); } else taskqueue_enqueue(que->tq, &txr->txq_task); return (err); } static int igb_mq_start_locked(struct ifnet *ifp, struct tx_ring *txr) { struct adapter *adapter = txr->adapter; struct mbuf *next; int err = 0, enq; IGB_TX_LOCK_ASSERT(txr); if (((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) || adapter->link_active == 0) return (ENETDOWN); enq = 0; /* Process the queue */ while ((next = drbr_peek(ifp, txr->br)) != NULL) { if ((err = igb_xmit(txr, &next)) != 0) { if (next == NULL) { /* It was freed, move forward */ drbr_advance(ifp, txr->br); } else { /* * Still have one left, it may not be * the same since the transmit function * may have changed it. */ drbr_putback(ifp, txr->br, next); } break; } drbr_advance(ifp, txr->br); enq++; ifp->if_obytes += next->m_pkthdr.len; if (next->m_flags & M_MCAST) ifp->if_omcasts++; ETHER_BPF_MTAP(ifp, next); if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) break; } if (enq > 0) { /* Set the watchdog */ txr->queue_status |= IGB_QUEUE_WORKING; txr->watchdog_time = ticks; } if (txr->tx_avail <= IGB_TX_CLEANUP_THRESHOLD) igb_txeof(txr); if (txr->tx_avail <= IGB_MAX_SCATTER) txr->queue_status |= IGB_QUEUE_DEPLETED; return (err); } /* * Called from a taskqueue to drain queued transmit packets. 
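*
* (Editorial aside: the queue-selection rule at the top of
* igb_mq_start() above reduces to the sketch below; the name
* example_igb_pick_queue() is hypothetical, not part of the driver.)
*/
static inline int
example_igb_pick_queue(struct mbuf *m, int num_queues)
{
	/* A flow-affine mapping keeps every packet of a flow on the
	 * same ring, preserving per-flow ordering; lacking a flowid,
	 * fall back to the current CPU for cache locality. */
	if (m->m_flags & M_FLOWID)
		return (m->m_pkthdr.flowid % num_queues);
	return (curcpu % num_queues);
}
/*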
*/ static void igb_deferred_mq_start(void *arg, int pending) { struct tx_ring *txr = arg; struct adapter *adapter = txr->adapter; struct ifnet *ifp = adapter->ifp; IGB_TX_LOCK(txr); if (!drbr_empty(ifp, txr->br)) igb_mq_start_locked(ifp, txr); IGB_TX_UNLOCK(txr); } /* ** Flush all ring buffers */ static void igb_qflush(struct ifnet *ifp) { struct adapter *adapter = ifp->if_softc; struct tx_ring *txr = adapter->tx_rings; struct mbuf *m; for (int i = 0; i < adapter->num_queues; i++, txr++) { IGB_TX_LOCK(txr); while ((m = buf_ring_dequeue_sc(txr->br)) != NULL) m_freem(m); IGB_TX_UNLOCK(txr); } if_qflush(ifp); } #endif /* ~IGB_LEGACY_TX */ /********************************************************************* * Ioctl entry point * * igb_ioctl is called when the user wants to configure the * interface. * * return 0 on success, positive on failure **********************************************************************/ static int igb_ioctl(struct ifnet *ifp, u_long command, caddr_t data) { struct adapter *adapter = ifp->if_softc; struct ifreq *ifr = (struct ifreq *)data; #if defined(INET) || defined(INET6) struct ifaddr *ifa = (struct ifaddr *)data; #endif bool avoid_reset = FALSE; int error = 0; if (adapter->in_detach) return (error); switch (command) { case SIOCSIFADDR: #ifdef INET if (ifa->ifa_addr->sa_family == AF_INET) avoid_reset = TRUE; #endif #ifdef INET6 if (ifa->ifa_addr->sa_family == AF_INET6) avoid_reset = TRUE; #endif /* ** Calling init results in link renegotiation, ** so we avoid doing it when possible. */ if (avoid_reset) { ifp->if_flags |= IFF_UP; if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) igb_init(adapter); #ifdef INET if (!(ifp->if_flags & IFF_NOARP)) arp_ifinit(ifp, ifa); #endif } else error = ether_ioctl(ifp, command, data); break; case SIOCSIFMTU: { int max_frame_size; IOCTL_DEBUGOUT("ioctl rcv'd: SIOCSIFMTU (Set Interface MTU)"); IGB_CORE_LOCK(adapter); max_frame_size = 9234; if (ifr->ifr_mtu > max_frame_size - ETHER_HDR_LEN - ETHER_CRC_LEN) { IGB_CORE_UNLOCK(adapter); error = EINVAL; break; } ifp->if_mtu = ifr->ifr_mtu; adapter->max_frame_size = ifp->if_mtu + ETHER_HDR_LEN + ETHER_CRC_LEN; igb_init_locked(adapter); IGB_CORE_UNLOCK(adapter); break; } case SIOCSIFFLAGS: IOCTL_DEBUGOUT("ioctl rcv'd:\ SIOCSIFFLAGS (Set Interface Flags)"); IGB_CORE_LOCK(adapter); if (ifp->if_flags & IFF_UP) { if ((ifp->if_drv_flags & IFF_DRV_RUNNING)) { if ((ifp->if_flags ^ adapter->if_flags) & (IFF_PROMISC | IFF_ALLMULTI)) { igb_disable_promisc(adapter); igb_set_promisc(adapter); } } else igb_init_locked(adapter); } else if (ifp->if_drv_flags & IFF_DRV_RUNNING) igb_stop(adapter); adapter->if_flags = ifp->if_flags; IGB_CORE_UNLOCK(adapter); break; case SIOCADDMULTI: case SIOCDELMULTI: IOCTL_DEBUGOUT("ioctl rcv'd: SIOC(ADD|DEL)MULTI"); if (ifp->if_drv_flags & IFF_DRV_RUNNING) { IGB_CORE_LOCK(adapter); igb_disable_intr(adapter); igb_set_multi(adapter); #ifdef DEVICE_POLLING if (!(ifp->if_capenable & IFCAP_POLLING)) #endif igb_enable_intr(adapter); IGB_CORE_UNLOCK(adapter); } break; case SIOCSIFMEDIA: /* Check SOL/IDER usage */ IGB_CORE_LOCK(adapter); if (e1000_check_reset_block(&adapter->hw)) { IGB_CORE_UNLOCK(adapter); device_printf(adapter->dev, "Media change is" " blocked due to SOL/IDER session.\n"); break; } IGB_CORE_UNLOCK(adapter); case SIOCGIFMEDIA: IOCTL_DEBUGOUT("ioctl rcv'd: \ SIOCxIFMEDIA (Get/Set Interface Media)"); error = ifmedia_ioctl(ifp, ifr, &adapter->media, command); break; case SIOCSIFCAP: { int mask, reinit; IOCTL_DEBUGOUT("ioctl rcv'd: SIOCSIFCAP (Set Capabilities)"); 
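/*
 * (added note) XOR-ing the requested capability set against the
 * currently enabled one leaves set exactly the bits the caller is
 * asking to toggle; each block below flips one such bit and records
 * whether the change requires reinitializing the interface.
 */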
reinit = 0; mask = ifr->ifr_reqcap ^ ifp->if_capenable; #ifdef DEVICE_POLLING if (mask & IFCAP_POLLING) { if (ifr->ifr_reqcap & IFCAP_POLLING) { error = ether_poll_register(igb_poll, ifp); if (error) return (error); IGB_CORE_LOCK(adapter); igb_disable_intr(adapter); ifp->if_capenable |= IFCAP_POLLING; IGB_CORE_UNLOCK(adapter); } else { error = ether_poll_deregister(ifp); /* Enable interrupt even in error case */ IGB_CORE_LOCK(adapter); igb_enable_intr(adapter); ifp->if_capenable &= ~IFCAP_POLLING; IGB_CORE_UNLOCK(adapter); } } #endif if (mask & IFCAP_HWCSUM) { ifp->if_capenable ^= IFCAP_HWCSUM; reinit = 1; } if (mask & IFCAP_TSO4) { ifp->if_capenable ^= IFCAP_TSO4; reinit = 1; } if (mask & IFCAP_VLAN_HWTAGGING) { ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING; reinit = 1; } if (mask & IFCAP_VLAN_HWFILTER) { ifp->if_capenable ^= IFCAP_VLAN_HWFILTER; reinit = 1; } if (mask & IFCAP_VLAN_HWTSO) { ifp->if_capenable ^= IFCAP_VLAN_HWTSO; reinit = 1; } if (mask & IFCAP_LRO) { ifp->if_capenable ^= IFCAP_LRO; reinit = 1; } if (reinit && (ifp->if_drv_flags & IFF_DRV_RUNNING)) igb_init(adapter); VLAN_CAPABILITIES(ifp); break; } default: error = ether_ioctl(ifp, command, data); break; } return (error); } /********************************************************************* * Init entry point * * This routine is used in two ways. It is used by the stack as * init entry point in network interface structure. It is also used * by the driver as a hw/sw initialization routine to get to a * consistent state. * * return 0 on success, positive on failure **********************************************************************/ static void igb_init_locked(struct adapter *adapter) { struct ifnet *ifp = adapter->ifp; device_t dev = adapter->dev; INIT_DEBUGOUT("igb_init: begin"); IGB_CORE_LOCK_ASSERT(adapter); igb_disable_intr(adapter); callout_stop(&adapter->timer); /* Get the latest mac address, User can use a LAA */ bcopy(IF_LLADDR(adapter->ifp), adapter->hw.mac.addr, ETHER_ADDR_LEN); /* Put the address into the Receive Address Array */ e1000_rar_set(&adapter->hw, adapter->hw.mac.addr, 0); igb_reset(adapter); igb_update_link_status(adapter); E1000_WRITE_REG(&adapter->hw, E1000_VET, ETHERTYPE_VLAN); /* Set hardware offload abilities */ ifp->if_hwassist = 0; if (ifp->if_capenable & IFCAP_TXCSUM) { ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP); #if __FreeBSD_version >= 800000 if (adapter->hw.mac.type == e1000_82576) ifp->if_hwassist |= CSUM_SCTP; #endif } if (ifp->if_capenable & IFCAP_TSO4) ifp->if_hwassist |= CSUM_TSO; /* Configure for OS presence */ igb_init_manageability(adapter); /* Prepare transmit descriptors and buffers */ igb_setup_transmit_structures(adapter); igb_initialize_transmit_units(adapter); /* Setup Multicast table */ igb_set_multi(adapter); /* ** Figure out the desired mbuf pool ** for doing jumbo/packetsplit */ if (adapter->max_frame_size <= 2048) adapter->rx_mbuf_sz = MCLBYTES; else if (adapter->max_frame_size <= 4096) adapter->rx_mbuf_sz = MJUMPAGESIZE; else adapter->rx_mbuf_sz = MJUM9BYTES; /* Prepare receive descriptors and buffers */ if (igb_setup_receive_structures(adapter)) { device_printf(dev, "Could not setup receive structures\n"); return; } igb_initialize_receive_units(adapter); /* Enable VLAN support */ if (ifp->if_capenable & IFCAP_VLAN_HWTAGGING) igb_setup_vlan_hw_support(adapter); /* Don't lose promiscuous settings */ igb_set_promisc(adapter); ifp->if_drv_flags |= IFF_DRV_RUNNING; ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; callout_reset(&adapter->timer, hz, igb_local_timer, adapter); 
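/* Read-to-clear the MAC statistics registers for a fresh baseline */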
e1000_clear_hw_cntrs_base_generic(&adapter->hw); if (adapter->msix > 1) /* Set up queue routing */ igb_configure_queues(adapter); /* this clears any pending interrupts */ E1000_READ_REG(&adapter->hw, E1000_ICR); #ifdef DEVICE_POLLING /* * Only enable interrupts if we are not polling, make sure * they are off otherwise. */ if (ifp->if_capenable & IFCAP_POLLING) igb_disable_intr(adapter); else #endif /* DEVICE_POLLING */ { igb_enable_intr(adapter); E1000_WRITE_REG(&adapter->hw, E1000_ICS, E1000_ICS_LSC); } /* Set Energy Efficient Ethernet */ if (adapter->hw.phy.media_type == e1000_media_type_copper) e1000_set_eee_i350(&adapter->hw); } static void igb_init(void *arg) { struct adapter *adapter = arg; IGB_CORE_LOCK(adapter); igb_init_locked(adapter); IGB_CORE_UNLOCK(adapter); } static void igb_handle_que(void *context, int pending) { struct igb_queue *que = context; struct adapter *adapter = que->adapter; struct tx_ring *txr = que->txr; struct ifnet *ifp = adapter->ifp; if (ifp->if_drv_flags & IFF_DRV_RUNNING) { bool more; more = igb_rxeof(que, adapter->rx_process_limit, NULL); IGB_TX_LOCK(txr); igb_txeof(txr); #ifndef IGB_LEGACY_TX /* Process the stack queue only if not depleted */ if (((txr->queue_status & IGB_QUEUE_DEPLETED) == 0) && !drbr_empty(ifp, txr->br)) igb_mq_start_locked(ifp, txr); #else if (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) igb_start_locked(txr, ifp); #endif IGB_TX_UNLOCK(txr); /* Do we need another? */ if (more) { taskqueue_enqueue(que->tq, &que->que_task); return; } } #ifdef DEVICE_POLLING if (ifp->if_capenable & IFCAP_POLLING) return; #endif /* Reenable this interrupt */ if (que->eims) E1000_WRITE_REG(&adapter->hw, E1000_EIMS, que->eims); else igb_enable_intr(adapter); } /* Deal with link in a sleepable context */ static void igb_handle_link(void *context, int pending) { struct adapter *adapter = context; IGB_CORE_LOCK(adapter); igb_handle_link_locked(adapter); IGB_CORE_UNLOCK(adapter); } static void igb_handle_link_locked(struct adapter *adapter) { struct tx_ring *txr = adapter->tx_rings; struct ifnet *ifp = adapter->ifp; IGB_CORE_LOCK_ASSERT(adapter); adapter->hw.mac.get_link_status = 1; igb_update_link_status(adapter); if ((ifp->if_drv_flags & IFF_DRV_RUNNING) && adapter->link_active) { for (int i = 0; i < adapter->num_queues; i++, txr++) { IGB_TX_LOCK(txr); #ifndef IGB_LEGACY_TX /* Process the stack queue only if not depleted */ if (((txr->queue_status & IGB_QUEUE_DEPLETED) == 0) && !drbr_empty(ifp, txr->br)) igb_mq_start_locked(ifp, txr); #else if (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) igb_start_locked(txr, ifp); #endif IGB_TX_UNLOCK(txr); } } } /********************************************************************* * * MSI/Legacy Deferred * Interrupt Service routine * *********************************************************************/ static int igb_irq_fast(void *arg) { struct adapter *adapter = arg; struct igb_queue *que = adapter->queues; u32 reg_icr; reg_icr = E1000_READ_REG(&adapter->hw, E1000_ICR); /* Hot eject? */ if (reg_icr == 0xffffffff) return FILTER_STRAY; /* Definitely not our interrupt. */ if (reg_icr == 0x0) return FILTER_STRAY; if ((reg_icr & E1000_ICR_INT_ASSERTED) == 0) return FILTER_STRAY; /* * Mask interrupts until the taskqueue is finished running. This is * cheap, just assume that it is needed. This also works around the * MSI message reordering errata on certain systems. 
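* Interrupts are re-enabled from igb_handle_que() once the
* taskqueue has drained the pending work.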
*/ igb_disable_intr(adapter); taskqueue_enqueue(que->tq, &que->que_task); /* Link status change */ if (reg_icr & (E1000_ICR_RXSEQ | E1000_ICR_LSC)) taskqueue_enqueue(que->tq, &adapter->link_task); if (reg_icr & E1000_ICR_RXO) adapter->rx_overruns++; return FILTER_HANDLED; } #ifdef DEVICE_POLLING #if __FreeBSD_version >= 800000 #define POLL_RETURN_COUNT(a) (a) static int #else #define POLL_RETURN_COUNT(a) static void #endif igb_poll(struct ifnet *ifp, enum poll_cmd cmd, int count) { struct adapter *adapter = ifp->if_softc; struct igb_queue *que; struct tx_ring *txr; u32 reg_icr, rx_done = 0; u32 loop = IGB_MAX_LOOP; bool more; IGB_CORE_LOCK(adapter); if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) { IGB_CORE_UNLOCK(adapter); return POLL_RETURN_COUNT(rx_done); } if (cmd == POLL_AND_CHECK_STATUS) { reg_icr = E1000_READ_REG(&adapter->hw, E1000_ICR); /* Link status change */ if (reg_icr & (E1000_ICR_RXSEQ | E1000_ICR_LSC)) igb_handle_link_locked(adapter); if (reg_icr & E1000_ICR_RXO) adapter->rx_overruns++; } IGB_CORE_UNLOCK(adapter); for (int i = 0; i < adapter->num_queues; i++) { que = &adapter->queues[i]; txr = que->txr; igb_rxeof(que, count, &rx_done); IGB_TX_LOCK(txr); do { more = igb_txeof(txr); } while (loop-- && more); #ifndef IGB_LEGACY_TX if (!drbr_empty(ifp, txr->br)) igb_mq_start_locked(ifp, txr); #else if (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) igb_start_locked(txr, ifp); #endif IGB_TX_UNLOCK(txr); } return POLL_RETURN_COUNT(rx_done); } #endif /* DEVICE_POLLING */ /********************************************************************* * * MSIX Que Interrupt Service routine * **********************************************************************/ static void igb_msix_que(void *arg) { struct igb_queue *que = arg; struct adapter *adapter = que->adapter; struct ifnet *ifp = adapter->ifp; struct tx_ring *txr = que->txr; struct rx_ring *rxr = que->rxr; u32 newitr = 0; bool more_rx; /* Ignore spurious interrupts */ if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) return; E1000_WRITE_REG(&adapter->hw, E1000_EIMC, que->eims); ++que->irqs; IGB_TX_LOCK(txr); igb_txeof(txr); #ifndef IGB_LEGACY_TX /* Process the stack queue only if not depleted */ if (((txr->queue_status & IGB_QUEUE_DEPLETED) == 0) && !drbr_empty(ifp, txr->br)) igb_mq_start_locked(ifp, txr); #else if (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) igb_start_locked(txr, ifp); #endif IGB_TX_UNLOCK(txr); more_rx = igb_rxeof(que, adapter->rx_process_limit, NULL); if (adapter->enable_aim == FALSE) goto no_calc; /* ** Do Adaptive Interrupt Moderation: ** - Write out last calculated setting ** - Calculate based on average size over ** the last interval. 
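** (In effect the new EITR interval tracks the mean frame size seen
** since the last write: the larger of the TX and RX bytes/packets
** averages, plus 24 for frame overhead, capped at 3000. Bulk traffic
** with large frames thus earns a longer interval and fewer
** interrupts, while small latency-sensitive frames keep it short.)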
*/ if (que->eitr_setting) E1000_WRITE_REG(&adapter->hw, E1000_EITR(que->msix), que->eitr_setting); que->eitr_setting = 0; /* Idle, do nothing */ if ((txr->bytes == 0) && (rxr->bytes == 0)) goto no_calc; /* Used half Default if sub-gig */ if (adapter->link_speed != 1000) newitr = IGB_DEFAULT_ITR / 2; else { if ((txr->bytes) && (txr->packets)) newitr = txr->bytes/txr->packets; if ((rxr->bytes) && (rxr->packets)) newitr = max(newitr, (rxr->bytes / rxr->packets)); newitr += 24; /* account for hardware frame, crc */ /* set an upper boundary */ newitr = min(newitr, 3000); /* Be nice to the mid range */ if ((newitr > 300) && (newitr < 1200)) newitr = (newitr / 3); else newitr = (newitr / 2); } newitr &= 0x7FFC; /* Mask invalid bits */ if (adapter->hw.mac.type == e1000_82575) newitr |= newitr << 16; else newitr |= E1000_EITR_CNT_IGNR; /* save for next interrupt */ que->eitr_setting = newitr; /* Reset state */ txr->bytes = 0; txr->packets = 0; rxr->bytes = 0; rxr->packets = 0; no_calc: /* Schedule a clean task if needed*/ if (more_rx) taskqueue_enqueue(que->tq, &que->que_task); else /* Reenable this interrupt */ E1000_WRITE_REG(&adapter->hw, E1000_EIMS, que->eims); return; } /********************************************************************* * * MSIX Link Interrupt Service routine * **********************************************************************/ static void igb_msix_link(void *arg) { struct adapter *adapter = arg; u32 icr; ++adapter->link_irq; icr = E1000_READ_REG(&adapter->hw, E1000_ICR); if (!(icr & E1000_ICR_LSC)) goto spurious; igb_handle_link(adapter, 0); spurious: /* Rearm */ E1000_WRITE_REG(&adapter->hw, E1000_IMS, E1000_IMS_LSC); E1000_WRITE_REG(&adapter->hw, E1000_EIMS, adapter->link_mask); return; } /********************************************************************* * * Media Ioctl callback * * This routine is called whenever the user queries the status of * the interface using ifconfig. * **********************************************************************/ static void igb_media_status(struct ifnet *ifp, struct ifmediareq *ifmr) { struct adapter *adapter = ifp->if_softc; INIT_DEBUGOUT("igb_media_status: begin"); IGB_CORE_LOCK(adapter); igb_update_link_status(adapter); ifmr->ifm_status = IFM_AVALID; ifmr->ifm_active = IFM_ETHER; if (!adapter->link_active) { IGB_CORE_UNLOCK(adapter); return; } ifmr->ifm_status |= IFM_ACTIVE; switch (adapter->link_speed) { case 10: ifmr->ifm_active |= IFM_10_T; break; case 100: /* ** Support for 100Mb SFP - these are Fiber ** but the media type appears as serdes */ if (adapter->hw.phy.media_type == e1000_media_type_internal_serdes) ifmr->ifm_active |= IFM_100_FX; else ifmr->ifm_active |= IFM_100_TX; break; case 1000: ifmr->ifm_active |= IFM_1000_T; break; } if (adapter->link_duplex == FULL_DUPLEX) ifmr->ifm_active |= IFM_FDX; else ifmr->ifm_active |= IFM_HDX; IGB_CORE_UNLOCK(adapter); } /********************************************************************* * * Media Ioctl callback * * This routine is called when the user changes speed/duplex using * media/mediopt option with ifconfig. 
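* e.g. "ifconfig igb0 media 100baseTX mediaopt full-duplex"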
* **********************************************************************/ static int igb_media_change(struct ifnet *ifp) { struct adapter *adapter = ifp->if_softc; struct ifmedia *ifm = &adapter->media; INIT_DEBUGOUT("igb_media_change: begin"); if (IFM_TYPE(ifm->ifm_media) != IFM_ETHER) return (EINVAL); IGB_CORE_LOCK(adapter); switch (IFM_SUBTYPE(ifm->ifm_media)) { case IFM_AUTO: adapter->hw.mac.autoneg = DO_AUTO_NEG; adapter->hw.phy.autoneg_advertised = AUTONEG_ADV_DEFAULT; break; case IFM_1000_LX: case IFM_1000_SX: case IFM_1000_T: adapter->hw.mac.autoneg = DO_AUTO_NEG; adapter->hw.phy.autoneg_advertised = ADVERTISE_1000_FULL; break; case IFM_100_TX: adapter->hw.mac.autoneg = FALSE; adapter->hw.phy.autoneg_advertised = 0; if ((ifm->ifm_media & IFM_GMASK) == IFM_FDX) adapter->hw.mac.forced_speed_duplex = ADVERTISE_100_FULL; else adapter->hw.mac.forced_speed_duplex = ADVERTISE_100_HALF; break; case IFM_10_T: adapter->hw.mac.autoneg = FALSE; adapter->hw.phy.autoneg_advertised = 0; if ((ifm->ifm_media & IFM_GMASK) == IFM_FDX) adapter->hw.mac.forced_speed_duplex = ADVERTISE_10_FULL; else adapter->hw.mac.forced_speed_duplex = ADVERTISE_10_HALF; break; default: device_printf(adapter->dev, "Unsupported media type\n"); } igb_init_locked(adapter); IGB_CORE_UNLOCK(adapter); return (0); } /********************************************************************* * * This routine maps the mbufs to Advanced TX descriptors. * **********************************************************************/ static int igb_xmit(struct tx_ring *txr, struct mbuf **m_headp) { struct adapter *adapter = txr->adapter; bus_dma_segment_t segs[IGB_MAX_SCATTER]; bus_dmamap_t map; struct igb_tx_buffer *tx_buffer, *tx_buffer_mapped; union e1000_adv_tx_desc *txd = NULL; struct mbuf *m_head = *m_headp; struct ether_vlan_header *eh = NULL; struct ip *ip = NULL; struct tcphdr *th = NULL; u32 hdrlen, cmd_type_len, olinfo_status = 0; int ehdrlen, poff; int nsegs, i, first, last = 0; int error, do_tso, remap = 1; /* Set basic descriptor constants */ cmd_type_len = E1000_ADVTXD_DTYP_DATA; cmd_type_len |= E1000_ADVTXD_DCMD_IFCS | E1000_ADVTXD_DCMD_DEXT; if (m_head->m_flags & M_VLANTAG) cmd_type_len |= E1000_ADVTXD_DCMD_VLE; retry: m_head = *m_headp; do_tso = ((m_head->m_pkthdr.csum_flags & CSUM_TSO) != 0); hdrlen = ehdrlen = poff = 0; /* * Intel recommends entire IP/TCP header length reside in a single * buffer. If multiple descriptors are used to describe the IP and * TCP header, each descriptor should describe one or more * complete headers; descriptors referencing only parts of headers * are not supported. If all layer headers are not coalesced into * a single buffer, each buffer should not cross a 4KB boundary, * or be larger than the maximum read request size. * Controller also requires modifing IP/TCP header to make TSO work * so we firstly get a writable mbuf chain then coalesce ethernet/ * IP/TCP header into a single buffer to meet the requirement of * controller. This also simplifies IP/TCP/UDP checksum offloading * which also has similiar restrictions. */ if (do_tso || m_head->m_pkthdr.csum_flags & CSUM_OFFLOAD) { if (do_tso || (m_head->m_next != NULL && m_head->m_pkthdr.csum_flags & CSUM_OFFLOAD)) { if (M_WRITABLE(*m_headp) == 0) { m_head = m_dup(*m_headp, M_NOWAIT); m_freem(*m_headp); if (m_head == NULL) { *m_headp = NULL; return (ENOBUFS); } *m_headp = m_head; } } /* * Assume IPv4, we don't have TSO/checksum offload support * for IPv6 yet. 
*/ ehdrlen = sizeof(struct ether_header); m_head = m_pullup(m_head, ehdrlen); if (m_head == NULL) { *m_headp = NULL; return (ENOBUFS); } eh = mtod(m_head, struct ether_vlan_header *); if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) { ehdrlen = sizeof(struct ether_vlan_header); m_head = m_pullup(m_head, ehdrlen); if (m_head == NULL) { *m_headp = NULL; return (ENOBUFS); } } m_head = m_pullup(m_head, ehdrlen + sizeof(struct ip)); if (m_head == NULL) { *m_headp = NULL; return (ENOBUFS); } ip = (struct ip *)(mtod(m_head, char *) + ehdrlen); poff = ehdrlen + (ip->ip_hl << 2); if (do_tso) { m_head = m_pullup(m_head, poff + sizeof(struct tcphdr)); if (m_head == NULL) { *m_headp = NULL; return (ENOBUFS); } /* * The pseudo TCP checksum does not include TCP payload * length so driver should recompute the checksum here * what hardware expect to see. This is adherence of * Microsoft's Large Send specification. */ th = (struct tcphdr *)(mtod(m_head, char *) + poff); th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(IPPROTO_TCP)); /* Keep track of the full header length */ hdrlen = poff + (th->th_off << 2); } else if (m_head->m_pkthdr.csum_flags & CSUM_TCP) { m_head = m_pullup(m_head, poff + sizeof(struct tcphdr)); if (m_head == NULL) { *m_headp = NULL; return (ENOBUFS); } th = (struct tcphdr *)(mtod(m_head, char *) + poff); m_head = m_pullup(m_head, poff + (th->th_off << 2)); if (m_head == NULL) { *m_headp = NULL; return (ENOBUFS); } ip = (struct ip *)(mtod(m_head, char *) + ehdrlen); th = (struct tcphdr *)(mtod(m_head, char *) + poff); } else if (m_head->m_pkthdr.csum_flags & CSUM_UDP) { m_head = m_pullup(m_head, poff + sizeof(struct udphdr)); if (m_head == NULL) { *m_headp = NULL; return (ENOBUFS); } ip = (struct ip *)(mtod(m_head, char *) + ehdrlen); } *m_headp = m_head; } /* * Map the packet for DMA * * Capture the first descriptor index, * this descriptor will have the index * of the EOP which is the only one that * now gets a DONE bit writeback. */ first = txr->next_avail_desc; tx_buffer = &txr->tx_buffers[first]; tx_buffer_mapped = tx_buffer; map = tx_buffer->map; error = bus_dmamap_load_mbuf_sg(txr->txtag, map, *m_headp, segs, &nsegs, BUS_DMA_NOWAIT); /* * There are two types of errors we can (try) to handle: * - EFBIG means the mbuf chain was too long and bus_dma ran * out of segments. Defragment the mbuf chain and try again. * - ENOMEM means bus_dma could not obtain enough bounce buffers * at this point in time. Defer sending and try again later. * All other errors, in particular EINVAL, are fatal and prevent the * mbuf chain from ever going through. Drop it and report error. */ if (error == EFBIG && remap) { struct mbuf *m; m = m_defrag(*m_headp, M_NOWAIT); if (m == NULL) { adapter->mbuf_defrag_failed++; m_freem(*m_headp); *m_headp = NULL; return (ENOBUFS); } *m_headp = m; /* Try it again, but only once */ remap = 0; goto retry; } else if (error == ENOMEM) { adapter->no_tx_dma_setup++; return (error); } else if (error != 0) { adapter->no_tx_dma_setup++; m_freem(*m_headp); *m_headp = NULL; return (error); } /* ** Make sure we don't overrun the ring, ** we need nsegs descriptors and one for ** the context descriptor used for the ** offloads. */ if ((nsegs + 1) > (txr->tx_avail - 2)) { txr->no_desc_avail++; bus_dmamap_unload(txr->txtag, map); return (ENOBUFS); } m_head = *m_headp; /* Do hardware assists: * Set up the context descriptor, used * when any hardware offload is done. * This includes CSUM, VLAN, and TSO. * It will use the first descriptor. 
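* (This is why the ring-space check above reserved nsegs + 1 slots:
* one context descriptor plus one data descriptor per DMA segment.)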
*/ if (m_head->m_pkthdr.csum_flags & CSUM_TSO) { if (igb_tso_setup(txr, m_head, ehdrlen, ip, th)) { cmd_type_len |= E1000_ADVTXD_DCMD_TSE; olinfo_status |= E1000_TXD_POPTS_IXSM << 8; olinfo_status |= E1000_TXD_POPTS_TXSM << 8; } else return (ENXIO); } else if (igb_tx_ctx_setup(txr, m_head)) olinfo_status |= E1000_TXD_POPTS_TXSM << 8; /* Calculate payload length */ olinfo_status |= ((m_head->m_pkthdr.len - hdrlen) << E1000_ADVTXD_PAYLEN_SHIFT); /* 82575 needs the queue index added */ if (adapter->hw.mac.type == e1000_82575) olinfo_status |= txr->me << 4; /* Set up our transmit descriptors */ i = txr->next_avail_desc; for (int j = 0; j < nsegs; j++) { bus_size_t seg_len; bus_addr_t seg_addr; tx_buffer = &txr->tx_buffers[i]; txd = (union e1000_adv_tx_desc *)&txr->tx_base[i]; seg_addr = segs[j].ds_addr; seg_len = segs[j].ds_len; txd->read.buffer_addr = htole64(seg_addr); txd->read.cmd_type_len = htole32(cmd_type_len | seg_len); txd->read.olinfo_status = htole32(olinfo_status); last = i; if (++i == adapter->num_tx_desc) i = 0; tx_buffer->m_head = NULL; tx_buffer->next_eop = -1; } txr->next_avail_desc = i; txr->tx_avail -= nsegs; tx_buffer->m_head = m_head; /* ** Here we swap the map so the last descriptor, ** which gets the completion interrupt has the ** real map, and the first descriptor gets the ** unused map from this descriptor. */ tx_buffer_mapped->map = tx_buffer->map; tx_buffer->map = map; bus_dmamap_sync(txr->txtag, map, BUS_DMASYNC_PREWRITE); /* * Last Descriptor of Packet * needs End Of Packet (EOP) * and Report Status (RS) */ txd->read.cmd_type_len |= htole32(E1000_ADVTXD_DCMD_EOP | E1000_ADVTXD_DCMD_RS); /* * Keep track in the first buffer which * descriptor will be written back */ tx_buffer = &txr->tx_buffers[first]; tx_buffer->next_eop = last; /* Update the watchdog time early and often */ txr->watchdog_time = ticks; /* * Advance the Transmit Descriptor Tail (TDT), this tells the E1000 * that this frame is available to transmit. 
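* The descriptor ring's DMA map is synced first, so the device is
* guaranteed to observe the completed descriptors before it sees the
* doorbell write to TDT.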
*/ bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); E1000_WRITE_REG(&adapter->hw, E1000_TDT(txr->me), i); ++txr->tx_packets; return (0); } static void igb_set_promisc(struct adapter *adapter) { struct ifnet *ifp = adapter->ifp; struct e1000_hw *hw = &adapter->hw; u32 reg; if (adapter->vf_ifp) { e1000_promisc_set_vf(hw, e1000_promisc_enabled); return; } reg = E1000_READ_REG(hw, E1000_RCTL); if (ifp->if_flags & IFF_PROMISC) { reg |= (E1000_RCTL_UPE | E1000_RCTL_MPE); E1000_WRITE_REG(hw, E1000_RCTL, reg); } else if (ifp->if_flags & IFF_ALLMULTI) { reg |= E1000_RCTL_MPE; reg &= ~E1000_RCTL_UPE; E1000_WRITE_REG(hw, E1000_RCTL, reg); } } static void igb_disable_promisc(struct adapter *adapter) { struct e1000_hw *hw = &adapter->hw; struct ifnet *ifp = adapter->ifp; u32 reg; int mcnt = 0; if (adapter->vf_ifp) { e1000_promisc_set_vf(hw, e1000_promisc_disabled); return; } reg = E1000_READ_REG(hw, E1000_RCTL); reg &= (~E1000_RCTL_UPE); if (ifp->if_flags & IFF_ALLMULTI) mcnt = MAX_NUM_MULTICAST_ADDRESSES; else { struct ifmultiaddr *ifma; #if __FreeBSD_version < 800000 IF_ADDR_LOCK(ifp); #else if_maddr_rlock(ifp); #endif TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_LINK) continue; if (mcnt == MAX_NUM_MULTICAST_ADDRESSES) break; mcnt++; } #if __FreeBSD_version < 800000 IF_ADDR_UNLOCK(ifp); #else if_maddr_runlock(ifp); #endif } /* Don't disable if in MAX groups */ if (mcnt < MAX_NUM_MULTICAST_ADDRESSES) reg &= (~E1000_RCTL_MPE); E1000_WRITE_REG(hw, E1000_RCTL, reg); } /********************************************************************* * Multicast Update * * This routine is called whenever multicast address list is updated. * **********************************************************************/ static void igb_set_multi(struct adapter *adapter) { struct ifnet *ifp = adapter->ifp; struct ifmultiaddr *ifma; u32 reg_rctl = 0; u8 *mta; int mcnt = 0; IOCTL_DEBUGOUT("igb_set_multi: begin"); mta = adapter->mta; bzero(mta, sizeof(uint8_t) * ETH_ADDR_LEN * MAX_NUM_MULTICAST_ADDRESSES); #if __FreeBSD_version < 800000 IF_ADDR_LOCK(ifp); #else if_maddr_rlock(ifp); #endif TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_LINK) continue; if (mcnt == MAX_NUM_MULTICAST_ADDRESSES) break; bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr), &mta[mcnt * ETH_ADDR_LEN], ETH_ADDR_LEN); mcnt++; } #if __FreeBSD_version < 800000 IF_ADDR_UNLOCK(ifp); #else if_maddr_runlock(ifp); #endif if (mcnt >= MAX_NUM_MULTICAST_ADDRESSES) { reg_rctl = E1000_READ_REG(&adapter->hw, E1000_RCTL); reg_rctl |= E1000_RCTL_MPE; E1000_WRITE_REG(&adapter->hw, E1000_RCTL, reg_rctl); } else e1000_update_mc_addr_list(&adapter->hw, mta, mcnt); } /********************************************************************* * Timer routine: * This routine checks for link status, * updates statistics, and does the watchdog. 
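 *  This is the standard self-rearming callout(9) pattern: the
 *  routine does its once-per-second work and then reschedules
 *  itself via callout_reset(&adapter->timer, hz, igb_local_timer,
 *  adapter), exactly as the body below does.  In outline (a sketch
 *  of the shape, not new logic):
 *
 *	static void
 *	igb_local_timer(void *arg)
 *	{
 *		struct adapter *adapter = arg;
 *
 *		// link, statistics and watchdog checks ...
 *		callout_reset(&adapter->timer, hz,
 *		    igb_local_timer, adapter);
 *	}
 *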
* **********************************************************************/ static void igb_local_timer(void *arg) { struct adapter *adapter = arg; device_t dev = adapter->dev; struct ifnet *ifp = adapter->ifp; struct tx_ring *txr = adapter->tx_rings; struct igb_queue *que = adapter->queues; int hung = 0, busy = 0; IGB_CORE_LOCK_ASSERT(adapter); igb_update_link_status(adapter); igb_update_stats_counters(adapter); /* ** Check the TX queues status ** - central locked handling of OACTIVE ** - watchdog only if all queues show hung */ for (int i = 0; i < adapter->num_queues; i++, que++, txr++) { if ((txr->queue_status & IGB_QUEUE_HUNG) && (adapter->pause_frames == 0)) ++hung; if (txr->queue_status & IGB_QUEUE_DEPLETED) ++busy; if ((txr->queue_status & IGB_QUEUE_IDLE) == 0) taskqueue_enqueue(que->tq, &que->que_task); } if (hung == adapter->num_queues) goto timeout; if (busy == adapter->num_queues) ifp->if_drv_flags |= IFF_DRV_OACTIVE; else if ((ifp->if_drv_flags & IFF_DRV_OACTIVE) && (busy < adapter->num_queues)) ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; adapter->pause_frames = 0; callout_reset(&adapter->timer, hz, igb_local_timer, adapter); #ifndef DEVICE_POLLING /* Schedule all queue interrupts - deadlock protection */ E1000_WRITE_REG(&adapter->hw, E1000_EICS, adapter->que_mask); #endif return; timeout: device_printf(adapter->dev, "Watchdog timeout -- resetting\n"); device_printf(dev,"Queue(%d) tdh = %d, hw tdt = %d\n", txr->me, E1000_READ_REG(&adapter->hw, E1000_TDH(txr->me)), E1000_READ_REG(&adapter->hw, E1000_TDT(txr->me))); device_printf(dev,"TX(%d) desc avail = %d," "Next TX to Clean = %d\n", txr->me, txr->tx_avail, txr->next_to_clean); adapter->ifp->if_drv_flags &= ~IFF_DRV_RUNNING; adapter->watchdog_events++; igb_init_locked(adapter); } static void igb_update_link_status(struct adapter *adapter) { struct e1000_hw *hw = &adapter->hw; struct e1000_fc_info *fc = &hw->fc; struct ifnet *ifp = adapter->ifp; device_t dev = adapter->dev; struct tx_ring *txr = adapter->tx_rings; u32 link_check, thstat, ctrl; char *flowctl = NULL; link_check = thstat = ctrl = 0; /* Get the cached link value or read for real */ switch (hw->phy.media_type) { case e1000_media_type_copper: if (hw->mac.get_link_status) { /* Do the work to read phy */ e1000_check_for_link(hw); link_check = !hw->mac.get_link_status; } else link_check = TRUE; break; case e1000_media_type_fiber: e1000_check_for_link(hw); link_check = (E1000_READ_REG(hw, E1000_STATUS) & E1000_STATUS_LU); break; case e1000_media_type_internal_serdes: e1000_check_for_link(hw); link_check = adapter->hw.mac.serdes_has_link; break; /* VF device is type_unknown */ case e1000_media_type_unknown: e1000_check_for_link(hw); link_check = !hw->mac.get_link_status; /* Fall thru */ default: break; } /* Check for thermal downshift or shutdown */ if (hw->mac.type == e1000_i350) { thstat = E1000_READ_REG(hw, E1000_THSTAT); ctrl = E1000_READ_REG(hw, E1000_CTRL_EXT); } /* Get the flow control for display */ switch (fc->current_mode) { case e1000_fc_rx_pause: flowctl = "RX"; break; case e1000_fc_tx_pause: flowctl = "TX"; break; case e1000_fc_full: flowctl = "Full"; break; case e1000_fc_none: default: flowctl = "None"; break; } /* Now we check if a transition has happened */ if (link_check && (adapter->link_active == 0)) { e1000_get_speed_and_duplex(&adapter->hw, &adapter->link_speed, &adapter->link_duplex); if (bootverbose) device_printf(dev, "Link is up %d Mbps %s," " Flow Control: %s\n", adapter->link_speed, ((adapter->link_duplex == FULL_DUPLEX) ? 
"Full Duplex" : "Half Duplex"), flowctl); adapter->link_active = 1; ifp->if_baudrate = adapter->link_speed * 1000000; if ((ctrl & E1000_CTRL_EXT_LINK_MODE_GMII) && (thstat & E1000_THSTAT_LINK_THROTTLE)) device_printf(dev, "Link: thermal downshift\n"); /* This can sleep */ if_link_state_change(ifp, LINK_STATE_UP); } else if (!link_check && (adapter->link_active == 1)) { ifp->if_baudrate = adapter->link_speed = 0; adapter->link_duplex = 0; if (bootverbose) device_printf(dev, "Link is Down\n"); if ((ctrl & E1000_CTRL_EXT_LINK_MODE_GMII) && (thstat & E1000_THSTAT_PWR_DOWN)) device_printf(dev, "Link: thermal shutdown\n"); adapter->link_active = 0; /* This can sleep */ if_link_state_change(ifp, LINK_STATE_DOWN); /* Reset queue state */ for (int i = 0; i < adapter->num_queues; i++, txr++) txr->queue_status = IGB_QUEUE_IDLE; } } /********************************************************************* * * This routine disables all traffic on the adapter by issuing a * global reset on the MAC and deallocates TX/RX buffers. * **********************************************************************/ static void igb_stop(void *arg) { struct adapter *adapter = arg; struct ifnet *ifp = adapter->ifp; struct tx_ring *txr = adapter->tx_rings; IGB_CORE_LOCK_ASSERT(adapter); INIT_DEBUGOUT("igb_stop: begin"); igb_disable_intr(adapter); callout_stop(&adapter->timer); /* Tell the stack that the interface is no longer active */ ifp->if_drv_flags &= ~IFF_DRV_RUNNING; ifp->if_drv_flags |= IFF_DRV_OACTIVE; /* Disarm watchdog timer. */ for (int i = 0; i < adapter->num_queues; i++, txr++) { IGB_TX_LOCK(txr); txr->queue_status = IGB_QUEUE_IDLE; IGB_TX_UNLOCK(txr); } e1000_reset_hw(&adapter->hw); E1000_WRITE_REG(&adapter->hw, E1000_WUC, 0); e1000_led_off(&adapter->hw); e1000_cleanup_led(&adapter->hw); } /********************************************************************* * * Determine hardware revision. * **********************************************************************/ static void igb_identify_hardware(struct adapter *adapter) { device_t dev = adapter->dev; /* Make sure our PCI config space has the necessary stuff set */ pci_enable_busmaster(dev); adapter->hw.bus.pci_cmd_word = pci_read_config(dev, PCIR_COMMAND, 2); /* Save off the information about this board */ adapter->hw.vendor_id = pci_get_vendor(dev); adapter->hw.device_id = pci_get_device(dev); adapter->hw.revision_id = pci_read_config(dev, PCIR_REVID, 1); adapter->hw.subsystem_vendor_id = pci_read_config(dev, PCIR_SUBVEND_0, 2); adapter->hw.subsystem_device_id = pci_read_config(dev, PCIR_SUBDEV_0, 2); /* Set MAC type early for PCI setup */ e1000_set_mac_type(&adapter->hw); /* Are we a VF device? 
*/ if ((adapter->hw.mac.type == e1000_vfadapt) || (adapter->hw.mac.type == e1000_vfadapt_i350)) adapter->vf_ifp = 1; else adapter->vf_ifp = 0; } static int igb_allocate_pci_resources(struct adapter *adapter) { device_t dev = adapter->dev; int rid; rid = PCIR_BAR(0); adapter->pci_mem = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid, RF_ACTIVE); if (adapter->pci_mem == NULL) { device_printf(dev, "Unable to allocate bus resource: memory\n"); return (ENXIO); } adapter->osdep.mem_bus_space_tag = rman_get_bustag(adapter->pci_mem); adapter->osdep.mem_bus_space_handle = rman_get_bushandle(adapter->pci_mem); adapter->hw.hw_addr = (u8 *)&adapter->osdep.mem_bus_space_handle; adapter->num_queues = 1; /* Defaults for Legacy or MSI */ /* This will setup either MSI/X or MSI */ adapter->msix = igb_setup_msix(adapter); adapter->hw.back = &adapter->osdep; return (0); } /********************************************************************* * * Setup the Legacy or MSI Interrupt handler * **********************************************************************/ static int igb_allocate_legacy(struct adapter *adapter) { device_t dev = adapter->dev; struct igb_queue *que = adapter->queues; int error, rid = 0; /* Turn off all interrupts */ E1000_WRITE_REG(&adapter->hw, E1000_IMC, 0xffffffff); /* MSI RID is 1 */ if (adapter->msix == 1) rid = 1; /* We allocate a single interrupt resource */ adapter->res = bus_alloc_resource_any(dev, SYS_RES_IRQ, &rid, RF_SHAREABLE | RF_ACTIVE); if (adapter->res == NULL) { device_printf(dev, "Unable to allocate bus resource: " "interrupt\n"); return (ENXIO); } #ifndef IGB_LEGACY_TX TASK_INIT(&que->txr->txq_task, 0, igb_deferred_mq_start, que->txr); #endif /* * Try allocating a fast interrupt and the associated deferred * processing contexts. */ TASK_INIT(&que->que_task, 0, igb_handle_que, que); /* Make tasklet for deferred link handling */ TASK_INIT(&adapter->link_task, 0, igb_handle_link, adapter); que->tq = taskqueue_create_fast("igb_taskq", M_NOWAIT, taskqueue_thread_enqueue, &que->tq); taskqueue_start_threads(&que->tq, 1, PI_NET, "%s taskq", device_get_nameunit(adapter->dev)); if ((error = bus_setup_intr(dev, adapter->res, INTR_TYPE_NET | INTR_MPSAFE, igb_irq_fast, NULL, adapter, &adapter->tag)) != 0) { device_printf(dev, "Failed to register fast interrupt " "handler: %d\n", error); taskqueue_free(que->tq); que->tq = NULL; return (error); } return (0); } /********************************************************************* * * Setup the MSIX Queue Interrupt handlers: * **********************************************************************/ static int igb_allocate_msix(struct adapter *adapter) { device_t dev = adapter->dev; struct igb_queue *que = adapter->queues; int error, rid, vector = 0; /* Be sure to start with all interrupts disabled */ E1000_WRITE_REG(&adapter->hw, E1000_IMC, ~0); E1000_WRITE_FLUSH(&adapter->hw); for (int i = 0; i < adapter->num_queues; i++, vector++, que++) { rid = vector +1; que->res = bus_alloc_resource_any(dev, SYS_RES_IRQ, &rid, RF_SHAREABLE | RF_ACTIVE); if (que->res == NULL) { device_printf(dev, "Unable to allocate bus resource: " "MSIX Queue Interrupt\n"); return (ENXIO); } error = bus_setup_intr(dev, que->res, INTR_TYPE_NET | INTR_MPSAFE, NULL, igb_msix_que, que, &que->tag); if (error) { que->res = NULL; device_printf(dev, "Failed to register Queue handler"); return (error); } #if __FreeBSD_version >= 800504 bus_describe_intr(dev, que->res, que->tag, "que %d", i); #endif que->msix = vector; if (adapter->hw.mac.type == e1000_82575) que->eims = 
E1000_EICR_TX_QUEUE0 << i; else que->eims = 1 << vector; /* ** Bind the msix vector, and thus the ** rings to the corresponding cpu. */ if (adapter->num_queues > 1) { if (igb_last_bind_cpu < 0) igb_last_bind_cpu = CPU_FIRST(); bus_bind_intr(dev, que->res, igb_last_bind_cpu); device_printf(dev, "Bound queue %d to cpu %d\n", i,igb_last_bind_cpu); igb_last_bind_cpu = CPU_NEXT(igb_last_bind_cpu); } #ifndef IGB_LEGACY_TX TASK_INIT(&que->txr->txq_task, 0, igb_deferred_mq_start, que->txr); #endif /* Make tasklet for deferred handling */ TASK_INIT(&que->que_task, 0, igb_handle_que, que); que->tq = taskqueue_create("igb_que", M_NOWAIT, taskqueue_thread_enqueue, &que->tq); taskqueue_start_threads(&que->tq, 1, PI_NET, "%s que", device_get_nameunit(adapter->dev)); } /* And Link */ rid = vector + 1; adapter->res = bus_alloc_resource_any(dev, SYS_RES_IRQ, &rid, RF_SHAREABLE | RF_ACTIVE); if (adapter->res == NULL) { device_printf(dev, "Unable to allocate bus resource: " "MSIX Link Interrupt\n"); return (ENXIO); } if ((error = bus_setup_intr(dev, adapter->res, INTR_TYPE_NET | INTR_MPSAFE, NULL, igb_msix_link, adapter, &adapter->tag)) != 0) { device_printf(dev, "Failed to register Link handler"); return (error); } #if __FreeBSD_version >= 800504 bus_describe_intr(dev, adapter->res, adapter->tag, "link"); #endif adapter->linkvec = vector; return (0); } static void igb_configure_queues(struct adapter *adapter) { struct e1000_hw *hw = &adapter->hw; struct igb_queue *que; u32 tmp, ivar = 0, newitr = 0; /* First turn on RSS capability */ if (adapter->hw.mac.type != e1000_82575) E1000_WRITE_REG(hw, E1000_GPIE, E1000_GPIE_MSIX_MODE | E1000_GPIE_EIAME | E1000_GPIE_PBA | E1000_GPIE_NSICR); /* Turn on MSIX */ switch (adapter->hw.mac.type) { case e1000_82580: case e1000_i350: case e1000_i210: case e1000_i211: case e1000_vfadapt: case e1000_vfadapt_i350: /* RX entries */ for (int i = 0; i < adapter->num_queues; i++) { u32 index = i >> 1; ivar = E1000_READ_REG_ARRAY(hw, E1000_IVAR0, index); que = &adapter->queues[i]; if (i & 1) { ivar &= 0xFF00FFFF; ivar |= (que->msix | E1000_IVAR_VALID) << 16; } else { ivar &= 0xFFFFFF00; ivar |= que->msix | E1000_IVAR_VALID; } E1000_WRITE_REG_ARRAY(hw, E1000_IVAR0, index, ivar); } /* TX entries */ for (int i = 0; i < adapter->num_queues; i++) { u32 index = i >> 1; ivar = E1000_READ_REG_ARRAY(hw, E1000_IVAR0, index); que = &adapter->queues[i]; if (i & 1) { ivar &= 0x00FFFFFF; ivar |= (que->msix | E1000_IVAR_VALID) << 24; } else { ivar &= 0xFFFF00FF; ivar |= (que->msix | E1000_IVAR_VALID) << 8; } E1000_WRITE_REG_ARRAY(hw, E1000_IVAR0, index, ivar); adapter->que_mask |= que->eims; } /* And for the link interrupt */ ivar = (adapter->linkvec | E1000_IVAR_VALID) << 8; adapter->link_mask = 1 << adapter->linkvec; E1000_WRITE_REG(hw, E1000_IVAR_MISC, ivar); break; case e1000_82576: /* RX entries */ for (int i = 0; i < adapter->num_queues; i++) { u32 index = i & 0x7; /* Each IVAR has two entries */ ivar = E1000_READ_REG_ARRAY(hw, E1000_IVAR0, index); que = &adapter->queues[i]; if (i < 8) { ivar &= 0xFFFFFF00; ivar |= que->msix | E1000_IVAR_VALID; } else { ivar &= 0xFF00FFFF; ivar |= (que->msix | E1000_IVAR_VALID) << 16; } E1000_WRITE_REG_ARRAY(hw, E1000_IVAR0, index, ivar); adapter->que_mask |= que->eims; } /* TX entries */ for (int i = 0; i < adapter->num_queues; i++) { u32 index = i & 0x7; /* Each IVAR has two entries */ ivar = E1000_READ_REG_ARRAY(hw, E1000_IVAR0, index); que = &adapter->queues[i]; if (i < 8) { ivar &= 0xFFFF00FF; ivar |= (que->msix | E1000_IVAR_VALID) << 8; } else { ivar 
&= 0x00FFFFFF; ivar |= (que->msix | E1000_IVAR_VALID) << 24; } E1000_WRITE_REG_ARRAY(hw, E1000_IVAR0, index, ivar); adapter->que_mask |= que->eims; } /* And for the link interrupt */ ivar = (adapter->linkvec | E1000_IVAR_VALID) << 8; adapter->link_mask = 1 << adapter->linkvec; E1000_WRITE_REG(hw, E1000_IVAR_MISC, ivar); break; case e1000_82575: /* enable MSI-X support*/ tmp = E1000_READ_REG(hw, E1000_CTRL_EXT); tmp |= E1000_CTRL_EXT_PBA_CLR; /* Auto-Mask interrupts upon ICR read. */ tmp |= E1000_CTRL_EXT_EIAME; tmp |= E1000_CTRL_EXT_IRCA; E1000_WRITE_REG(hw, E1000_CTRL_EXT, tmp); /* Queues */ for (int i = 0; i < adapter->num_queues; i++) { que = &adapter->queues[i]; tmp = E1000_EICR_RX_QUEUE0 << i; tmp |= E1000_EICR_TX_QUEUE0 << i; que->eims = tmp; E1000_WRITE_REG_ARRAY(hw, E1000_MSIXBM(0), i, que->eims); adapter->que_mask |= que->eims; } /* Link */ E1000_WRITE_REG(hw, E1000_MSIXBM(adapter->linkvec), E1000_EIMS_OTHER); adapter->link_mask |= E1000_EIMS_OTHER; default: break; } /* Set the starting interrupt rate */ if (igb_max_interrupt_rate > 0) newitr = (4000000 / igb_max_interrupt_rate) & 0x7FFC; if (hw->mac.type == e1000_82575) newitr |= newitr << 16; else newitr |= E1000_EITR_CNT_IGNR; for (int i = 0; i < adapter->num_queues; i++) { que = &adapter->queues[i]; E1000_WRITE_REG(hw, E1000_EITR(que->msix), newitr); } return; } static void igb_free_pci_resources(struct adapter *adapter) { struct igb_queue *que = adapter->queues; device_t dev = adapter->dev; int rid; /* ** There is a slight possibility of a failure mode ** in attach that will result in entering this function ** before interrupt resources have been initialized, and ** in that case we do not want to execute the loops below ** We can detect this reliably by the state of the adapter ** res pointer. */ if (adapter->res == NULL) goto mem; /* * First release all the interrupt resources: */ for (int i = 0; i < adapter->num_queues; i++, que++) { rid = que->msix + 1; if (que->tag != NULL) { bus_teardown_intr(dev, que->res, que->tag); que->tag = NULL; } if (que->res != NULL) bus_release_resource(dev, SYS_RES_IRQ, rid, que->res); } /* Clean the Legacy or Link interrupt last */ if (adapter->linkvec) /* we are doing MSIX */ rid = adapter->linkvec + 1; else (adapter->msix != 0) ? 
		(rid = 1):(rid = 0);

	que = adapter->queues;
	if (adapter->tag != NULL) {
		taskqueue_drain(que->tq, &adapter->link_task);
		bus_teardown_intr(dev, adapter->res, adapter->tag);
		adapter->tag = NULL;
	}
	if (adapter->res != NULL)
		bus_release_resource(dev, SYS_RES_IRQ, rid, adapter->res);

	for (int i = 0; i < adapter->num_queues; i++, que++) {
		if (que->tq != NULL) {
#ifndef IGB_LEGACY_TX
			taskqueue_drain(que->tq, &que->txr->txq_task);
#endif
			taskqueue_drain(que->tq, &que->que_task);
			taskqueue_free(que->tq);
		}
	}
mem:
	if (adapter->msix)
		pci_release_msi(dev);

	if (adapter->msix_mem != NULL)
		bus_release_resource(dev, SYS_RES_MEMORY,
		    PCIR_BAR(IGB_MSIX_BAR), adapter->msix_mem);

	if (adapter->pci_mem != NULL)
		bus_release_resource(dev, SYS_RES_MEMORY,
		    PCIR_BAR(0), adapter->pci_mem);
}

/*
 * Setup Either MSI/X or MSI
 */
static int
igb_setup_msix(struct adapter *adapter)
{
	device_t dev = adapter->dev;
	int rid, want, queues, msgs, maxqueues;

	/* tuneable override */
	if (igb_enable_msix == 0)
		goto msi;

	/* First try MSI/X */
	msgs = pci_msix_count(dev);
	if (msgs == 0)
		goto msi;
	rid = PCIR_BAR(IGB_MSIX_BAR);
	adapter->msix_mem = bus_alloc_resource_any(dev,
	    SYS_RES_MEMORY, &rid, RF_ACTIVE);
	if (adapter->msix_mem == NULL) {
		/* May not be enabled */
		device_printf(adapter->dev,
		    "Unable to map MSIX table\n");
		goto msi;
	}

	/* Figure out a reasonable auto config value */
	queues = (mp_ncpus > (msgs-1)) ? (msgs-1) : mp_ncpus;

	/* Manual override */
	if (igb_num_queues != 0)
		queues = igb_num_queues;

	/* Sanity check based on HW */
	switch (adapter->hw.mac.type) {
	case e1000_82575:
		maxqueues = 4;
		break;
	case e1000_82576:
	case e1000_82580:
	case e1000_i350:
		maxqueues = 8;
		break;
	case e1000_i210:
		maxqueues = 4;
		break;
	case e1000_i211:
		maxqueues = 2;
		break;
	default:	/* VF interfaces */
		maxqueues = 1;
		break;
	}
	if (queues > maxqueues)
		queues = maxqueues;

	/* reflect correct sysctl value */
	igb_num_queues = queues;

	/*
	** One vector (RX/TX pair) per queue
	** plus an additional for Link interrupt
	*/
	want = queues + 1;
	if (msgs >= want)
		msgs = want;
	else {
		device_printf(adapter->dev,
		    "MSIX Configuration Problem, "
		    "%d vectors configured, but %d queues wanted!\n",
		    msgs, want);
		goto msi;
	}
	if ((pci_alloc_msix(dev, &msgs) == 0) && (msgs == want)) {
		device_printf(adapter->dev,
		    "Using MSIX interrupts with %d vectors\n", msgs);
		adapter->num_queues = queues;
		return (msgs);
	}
	/*
	** If MSIX alloc failed or provided us with
	** less than needed, free and fall through to MSI
	*/
	pci_release_msi(dev);
msi:
	if (adapter->msix_mem != NULL) {
		bus_release_resource(dev, SYS_RES_MEMORY,
		    PCIR_BAR(IGB_MSIX_BAR), adapter->msix_mem);
		adapter->msix_mem = NULL;
	}
	msgs = 1;
	if (pci_alloc_msi(dev, &msgs) == 0) {
		device_printf(adapter->dev, " Using an MSI interrupt\n");
		return (msgs);
	}
	device_printf(adapter->dev, " Using a Legacy interrupt\n");
	return (0);
}

/*********************************************************************
 *
 *  Set up a fresh starting state
 *
 **********************************************************************/
static void
igb_reset(struct adapter *adapter)
{
	device_t	dev = adapter->dev;
	struct e1000_hw *hw = &adapter->hw;
	struct e1000_fc_info *fc = &hw->fc;
	struct ifnet	*ifp = adapter->ifp;
	u32		pba = 0;
	u16		hwm;

	INIT_DEBUGOUT("igb_reset: begin");

	/* Let the firmware know the OS is in control */
	igb_get_hw_control(adapter);

	/*
	 * Packet Buffer Allocation (PBA)
	 * Writing PBA sets the receive portion of the buffer;
	 * the remainder is used for the transmit buffer.
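	 *
	 * A worked example (illustration only): pba is in KB, so with
	 * pba = 34 (the i210/i211 value below) and a 1518-byte max
	 * frame, the flow-control high water mark computed further
	 * down becomes
	 *
	 *	hwm = min((34 << 10) * 9 / 10, (34 << 10) - 2 * 1518)
	 *	    = min(31334, 31780) = 31334
	 *
	 * i.e. roughly 90% of the RX packet buffer, with the low
	 * water mark trailing by one 8- or 16-byte granule.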
*/ switch (hw->mac.type) { case e1000_82575: pba = E1000_PBA_32K; break; case e1000_82576: case e1000_vfadapt: pba = E1000_READ_REG(hw, E1000_RXPBS); pba &= E1000_RXPBS_SIZE_MASK_82576; break; case e1000_82580: case e1000_i350: case e1000_vfadapt_i350: pba = E1000_READ_REG(hw, E1000_RXPBS); pba = e1000_rxpbs_adjust_82580(pba); break; case e1000_i210: case e1000_i211: pba = E1000_PBA_34K; default: break; } /* Special needs in case of Jumbo frames */ if ((hw->mac.type == e1000_82575) && (ifp->if_mtu > ETHERMTU)) { u32 tx_space, min_tx, min_rx; pba = E1000_READ_REG(hw, E1000_PBA); tx_space = pba >> 16; pba &= 0xffff; min_tx = (adapter->max_frame_size + sizeof(struct e1000_tx_desc) - ETHERNET_FCS_SIZE) * 2; min_tx = roundup2(min_tx, 1024); min_tx >>= 10; min_rx = adapter->max_frame_size; min_rx = roundup2(min_rx, 1024); min_rx >>= 10; if (tx_space < min_tx && ((min_tx - tx_space) < pba)) { pba = pba - (min_tx - tx_space); /* * if short on rx space, rx wins * and must trump tx adjustment */ if (pba < min_rx) pba = min_rx; } E1000_WRITE_REG(hw, E1000_PBA, pba); } INIT_DEBUGOUT1("igb_init: pba=%dK",pba); /* * These parameters control the automatic generation (Tx) and * response (Rx) to Ethernet PAUSE frames. * - High water mark should allow for at least two frames to be * received after sending an XOFF. * - Low water mark works best when it is very near the high water mark. * This allows the receiver to restart by sending XON when it has * drained a bit. */ hwm = min(((pba << 10) * 9 / 10), ((pba << 10) - 2 * adapter->max_frame_size)); if (hw->mac.type < e1000_82576) { fc->high_water = hwm & 0xFFF8; /* 8-byte granularity */ fc->low_water = fc->high_water - 8; } else { fc->high_water = hwm & 0xFFF0; /* 16-byte granularity */ fc->low_water = fc->high_water - 16; } fc->pause_time = IGB_FC_PAUSE_TIME; fc->send_xon = TRUE; if (adapter->fc) fc->requested_mode = adapter->fc; else fc->requested_mode = e1000_fc_default; /* Issue a global reset */ e1000_reset_hw(hw); E1000_WRITE_REG(hw, E1000_WUC, 0); if (e1000_init_hw(hw) < 0) device_printf(dev, "Hardware Initialization Failed\n"); /* Setup DMA Coalescing */ if ((hw->mac.type > e1000_82580) && (hw->mac.type != e1000_i211)) { u32 dmac; u32 reg = ~E1000_DMACR_DMAC_EN; if (adapter->dmac == 0) { /* Disabling it */ E1000_WRITE_REG(hw, E1000_DMACR, reg); goto reset_out; } /* Set starting thresholds */ E1000_WRITE_REG(hw, E1000_DMCTXTH, 0); E1000_WRITE_REG(hw, E1000_DMCRTRH, 0); hwm = 64 * pba - adapter->max_frame_size / 16; if (hwm < 64 * (pba - 6)) hwm = 64 * (pba - 6); reg = E1000_READ_REG(hw, E1000_FCRTC); reg &= ~E1000_FCRTC_RTH_COAL_MASK; reg |= ((hwm << E1000_FCRTC_RTH_COAL_SHIFT) & E1000_FCRTC_RTH_COAL_MASK); E1000_WRITE_REG(hw, E1000_FCRTC, reg); dmac = pba - adapter->max_frame_size / 512; if (dmac < pba - 10) dmac = pba - 10; reg = E1000_READ_REG(hw, E1000_DMACR); reg &= ~E1000_DMACR_DMACTHR_MASK; reg = ((dmac << E1000_DMACR_DMACTHR_SHIFT) & E1000_DMACR_DMACTHR_MASK); /* transition to L0x or L1 if available..*/ reg |= (E1000_DMACR_DMAC_EN | E1000_DMACR_DMAC_LX_MASK); /* timer = value in adapter->dmac in 32usec intervals */ reg |= (adapter->dmac >> 5); E1000_WRITE_REG(hw, E1000_DMACR, reg); /* Set the interval before transition */ reg = E1000_READ_REG(hw, E1000_DMCTLX); reg |= 0x80000004; E1000_WRITE_REG(hw, E1000_DMCTLX, reg); /* free space in tx packet buffer to wake from DMA coal */ E1000_WRITE_REG(hw, E1000_DMCTXTH, (20480 - (2 * adapter->max_frame_size)) >> 6); /* make low power state decision controlled by DMA coal */ reg = E1000_READ_REG(hw, 
E1000_PCIEMISC); reg &= ~E1000_PCIEMISC_LX_DECISION; E1000_WRITE_REG(hw, E1000_PCIEMISC, reg); device_printf(dev, "DMA Coalescing enabled\n"); } else if (hw->mac.type == e1000_82580) { u32 reg = E1000_READ_REG(hw, E1000_PCIEMISC); E1000_WRITE_REG(hw, E1000_DMACR, 0); E1000_WRITE_REG(hw, E1000_PCIEMISC, reg & ~E1000_PCIEMISC_LX_DECISION); } reset_out: E1000_WRITE_REG(&adapter->hw, E1000_VET, ETHERTYPE_VLAN); e1000_get_phy_info(hw); e1000_check_for_link(hw); return; } /********************************************************************* * * Setup networking device structure and register an interface. * **********************************************************************/ static int igb_setup_interface(device_t dev, struct adapter *adapter) { struct ifnet *ifp; INIT_DEBUGOUT("igb_setup_interface: begin"); ifp = adapter->ifp = if_alloc(IFT_ETHER); if (ifp == NULL) { device_printf(dev, "can not allocate ifnet structure\n"); return (-1); } if_initname(ifp, device_get_name(dev), device_get_unit(dev)); ifp->if_init = igb_init; ifp->if_softc = adapter; ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; ifp->if_ioctl = igb_ioctl; #ifndef IGB_LEGACY_TX ifp->if_transmit = igb_mq_start; ifp->if_qflush = igb_qflush; #else ifp->if_start = igb_start; IFQ_SET_MAXLEN(&ifp->if_snd, adapter->num_tx_desc - 1); ifp->if_snd.ifq_drv_maxlen = adapter->num_tx_desc - 1; IFQ_SET_READY(&ifp->if_snd); #endif ether_ifattach(ifp, adapter->hw.mac.addr); ifp->if_capabilities = ifp->if_capenable = 0; ifp->if_capabilities = IFCAP_HWCSUM | IFCAP_VLAN_HWCSUM; ifp->if_capabilities |= IFCAP_TSO4; ifp->if_capabilities |= IFCAP_JUMBO_MTU; ifp->if_capenable = ifp->if_capabilities; /* Don't enable LRO by default */ ifp->if_capabilities |= IFCAP_LRO; #ifdef DEVICE_POLLING ifp->if_capabilities |= IFCAP_POLLING; #endif /* * Tell the upper layer(s) we * support full VLAN capability. */ ifp->if_data.ifi_hdrlen = sizeof(struct ether_vlan_header); ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWTSO | IFCAP_VLAN_MTU; ifp->if_capenable |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWTSO | IFCAP_VLAN_MTU; /* ** Don't turn this on by default, if vlans are ** created on another pseudo device (eg. lagg) ** then vlan events are not passed thru, breaking ** operation, but with HW FILTER off it works. If ** using vlans directly on the igb driver you can ** enable this and get full hardware tag filtering. */ ifp->if_capabilities |= IFCAP_VLAN_HWFILTER; /* * Specify the media types supported by this adapter and register * callbacks to update media and link information */ ifmedia_init(&adapter->media, IFM_IMASK, igb_media_change, igb_media_status); if ((adapter->hw.phy.media_type == e1000_media_type_fiber) || (adapter->hw.phy.media_type == e1000_media_type_internal_serdes)) { ifmedia_add(&adapter->media, IFM_ETHER | IFM_1000_SX | IFM_FDX, 0, NULL); ifmedia_add(&adapter->media, IFM_ETHER | IFM_1000_SX, 0, NULL); } else { ifmedia_add(&adapter->media, IFM_ETHER | IFM_10_T, 0, NULL); ifmedia_add(&adapter->media, IFM_ETHER | IFM_10_T | IFM_FDX, 0, NULL); ifmedia_add(&adapter->media, IFM_ETHER | IFM_100_TX, 0, NULL); ifmedia_add(&adapter->media, IFM_ETHER | IFM_100_TX | IFM_FDX, 0, NULL); if (adapter->hw.phy.type != e1000_phy_ife) { ifmedia_add(&adapter->media, IFM_ETHER | IFM_1000_T | IFM_FDX, 0, NULL); ifmedia_add(&adapter->media, IFM_ETHER | IFM_1000_T, 0, NULL); } } ifmedia_add(&adapter->media, IFM_ETHER | IFM_AUTO, 0, NULL); ifmedia_set(&adapter->media, IFM_ETHER | IFM_AUTO); return (0); } /* * Manage DMA'able memory. 
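 *
 * igb_dma_malloc() below wraps the canonical busdma(9) triple --
 * bus_dma_tag_create(), bus_dmamem_alloc(), bus_dmamap_load() --
 * with the bus address delivered through the igb_dmamap_cb()
 * callback.  Typical use for a descriptor ring (a sketch of the
 * call pattern used elsewhere in this file, not new logic):
 *
 *	if (igb_dma_malloc(adapter, tsize, &txr->txdma,
 *	    BUS_DMA_NOWAIT) == 0) {
 *		txr->tx_base = (struct e1000_tx_desc *)
 *		    txr->txdma.dma_vaddr;	// CPU view
 *		// txr->txdma.dma_paddr is the device view
 *	}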
*/ static void igb_dmamap_cb(void *arg, bus_dma_segment_t *segs, int nseg, int error) { if (error) return; *(bus_addr_t *) arg = segs[0].ds_addr; } static int igb_dma_malloc(struct adapter *adapter, bus_size_t size, struct igb_dma_alloc *dma, int mapflags) { int error; error = bus_dma_tag_create(bus_get_dma_tag(adapter->dev), /* parent */ IGB_DBA_ALIGN, 0, /* alignment, bounds */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ size, /* maxsize */ 1, /* nsegments */ size, /* maxsegsize */ 0, /* flags */ NULL, /* lockfunc */ NULL, /* lockarg */ &dma->dma_tag); if (error) { device_printf(adapter->dev, "%s: bus_dma_tag_create failed: %d\n", __func__, error); goto fail_0; } error = bus_dmamem_alloc(dma->dma_tag, (void**) &dma->dma_vaddr, BUS_DMA_NOWAIT | BUS_DMA_COHERENT, &dma->dma_map); if (error) { device_printf(adapter->dev, "%s: bus_dmamem_alloc(%ju) failed: %d\n", __func__, (uintmax_t)size, error); goto fail_2; } dma->dma_paddr = 0; error = bus_dmamap_load(dma->dma_tag, dma->dma_map, dma->dma_vaddr, size, igb_dmamap_cb, &dma->dma_paddr, mapflags | BUS_DMA_NOWAIT); if (error || dma->dma_paddr == 0) { device_printf(adapter->dev, "%s: bus_dmamap_load failed: %d\n", __func__, error); goto fail_3; } return (0); fail_3: bus_dmamap_unload(dma->dma_tag, dma->dma_map); fail_2: bus_dmamem_free(dma->dma_tag, dma->dma_vaddr, dma->dma_map); bus_dma_tag_destroy(dma->dma_tag); fail_0: dma->dma_map = NULL; dma->dma_tag = NULL; return (error); } static void igb_dma_free(struct adapter *adapter, struct igb_dma_alloc *dma) { if (dma->dma_tag == NULL) return; if (dma->dma_map != NULL) { bus_dmamap_sync(dma->dma_tag, dma->dma_map, BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(dma->dma_tag, dma->dma_map); bus_dmamem_free(dma->dma_tag, dma->dma_vaddr, dma->dma_map); dma->dma_map = NULL; } bus_dma_tag_destroy(dma->dma_tag); dma->dma_tag = NULL; } /********************************************************************* * * Allocate memory for the transmit and receive rings, and then * the descriptors associated with each, called only once at attach. 
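 *
 *  Allocation proceeds queues -> TX rings -> RX rings, and failure
 *  unwinds in reverse through the goto labels (err_rx_desc,
 *  err_tx_desc, rx_fail, tx_fail, fail); txconf and rxconf count
 *  how many per-ring DMA areas were set up and therefore must be
 *  torn down, e.g.:
 *
 *	for (txr = adapter->tx_rings; txconf > 0; txr++, txconf--)
 *		igb_dma_free(adapter, &txr->txdma);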
* **********************************************************************/ static int igb_allocate_queues(struct adapter *adapter) { device_t dev = adapter->dev; struct igb_queue *que = NULL; struct tx_ring *txr = NULL; struct rx_ring *rxr = NULL; int rsize, tsize, error = E1000_SUCCESS; int txconf = 0, rxconf = 0; /* First allocate the top level queue structs */ if (!(adapter->queues = (struct igb_queue *) malloc(sizeof(struct igb_queue) * adapter->num_queues, M_DEVBUF, M_NOWAIT | M_ZERO))) { device_printf(dev, "Unable to allocate queue memory\n"); error = ENOMEM; goto fail; } /* Next allocate the TX ring struct memory */ if (!(adapter->tx_rings = (struct tx_ring *) malloc(sizeof(struct tx_ring) * adapter->num_queues, M_DEVBUF, M_NOWAIT | M_ZERO))) { device_printf(dev, "Unable to allocate TX ring memory\n"); error = ENOMEM; goto tx_fail; } /* Now allocate the RX */ if (!(adapter->rx_rings = (struct rx_ring *) malloc(sizeof(struct rx_ring) * adapter->num_queues, M_DEVBUF, M_NOWAIT | M_ZERO))) { device_printf(dev, "Unable to allocate RX ring memory\n"); error = ENOMEM; goto rx_fail; } tsize = roundup2(adapter->num_tx_desc * sizeof(union e1000_adv_tx_desc), IGB_DBA_ALIGN); /* * Now set up the TX queues, txconf is needed to handle the * possibility that things fail midcourse and we need to * undo memory gracefully */ for (int i = 0; i < adapter->num_queues; i++, txconf++) { /* Set up some basics */ txr = &adapter->tx_rings[i]; txr->adapter = adapter; txr->me = i; /* Initialize the TX lock */ snprintf(txr->mtx_name, sizeof(txr->mtx_name), "%s:tx(%d)", device_get_nameunit(dev), txr->me); mtx_init(&txr->tx_mtx, txr->mtx_name, NULL, MTX_DEF); if (igb_dma_malloc(adapter, tsize, &txr->txdma, BUS_DMA_NOWAIT)) { device_printf(dev, "Unable to allocate TX Descriptor memory\n"); error = ENOMEM; goto err_tx_desc; } txr->tx_base = (struct e1000_tx_desc *)txr->txdma.dma_vaddr; bzero((void *)txr->tx_base, tsize); /* Now allocate transmit buffers for the ring */ if (igb_allocate_transmit_buffers(txr)) { device_printf(dev, "Critical Failure setting up transmit buffers\n"); error = ENOMEM; goto err_tx_desc; } #ifndef IGB_LEGACY_TX /* Allocate a buf ring */ txr->br = buf_ring_alloc(igb_buf_ring_size, M_DEVBUF, M_WAITOK, &txr->tx_mtx); #endif } /* * Next the RX queues... 
 */
	rsize = roundup2(adapter->num_rx_desc *
	    sizeof(union e1000_adv_rx_desc), IGB_DBA_ALIGN);
	for (int i = 0; i < adapter->num_queues; i++, rxconf++) {
		rxr = &adapter->rx_rings[i];
		rxr->adapter = adapter;
		rxr->me = i;

		/* Initialize the RX lock */
		snprintf(rxr->mtx_name, sizeof(rxr->mtx_name), "%s:rx(%d)",
		    device_get_nameunit(dev), rxr->me);
		mtx_init(&rxr->rx_mtx, rxr->mtx_name, NULL, MTX_DEF);

		if (igb_dma_malloc(adapter, rsize,
		    &rxr->rxdma, BUS_DMA_NOWAIT)) {
			device_printf(dev,
			    "Unable to allocate RX Descriptor memory\n");
			error = ENOMEM;
			goto err_rx_desc;
		}
		rxr->rx_base = (union e1000_adv_rx_desc *)rxr->rxdma.dma_vaddr;
		bzero((void *)rxr->rx_base, rsize);

		/* Allocate receive buffers for the ring */
		if (igb_allocate_receive_buffers(rxr)) {
			device_printf(dev,
			    "Critical Failure setting up receive buffers\n");
			error = ENOMEM;
			goto err_rx_desc;
		}
	}

	/*
	** Finally set up the queue holding structs
	*/
	for (int i = 0; i < adapter->num_queues; i++) {
		que = &adapter->queues[i];
		que->adapter = adapter;
		que->txr = &adapter->tx_rings[i];
		que->rxr = &adapter->rx_rings[i];
	}

	return (0);

err_rx_desc:
	for (rxr = adapter->rx_rings; rxconf > 0; rxr++, rxconf--)
		igb_dma_free(adapter, &rxr->rxdma);
err_tx_desc:
	for (txr = adapter->tx_rings; txconf > 0; txr++, txconf--)
		igb_dma_free(adapter, &txr->txdma);
	free(adapter->rx_rings, M_DEVBUF);
rx_fail:
#ifndef IGB_LEGACY_TX
	buf_ring_free(txr->br, M_DEVBUF);
#endif
	free(adapter->tx_rings, M_DEVBUF);
tx_fail:
	free(adapter->queues, M_DEVBUF);
fail:
	return (error);
}

/*********************************************************************
 *
 *  Allocate memory for tx_buffer structures. The tx_buffer stores all
 *  the information needed to transmit a packet on the wire. This is
 *  called only once at attach, setup is done every reset.
 *
 **********************************************************************/
static int
igb_allocate_transmit_buffers(struct tx_ring *txr)
{
	struct adapter *adapter = txr->adapter;
	device_t dev = adapter->dev;
	struct igb_tx_buffer *txbuf;
	int error, i;

	/*
	 * Setup DMA descriptor areas.
	 */
	if ((error = bus_dma_tag_create(bus_get_dma_tag(dev),
			       1, 0,			/* alignment, bounds */
			       BUS_SPACE_MAXADDR,	/* lowaddr */
			       BUS_SPACE_MAXADDR,	/* highaddr */
			       NULL, NULL,		/* filter, filterarg */
			       IGB_TSO_SIZE,		/* maxsize */
			       IGB_MAX_SCATTER,		/* nsegments */
			       PAGE_SIZE,		/* maxsegsize */
			       0,			/* flags */
			       NULL,			/* lockfunc */
			       NULL,			/* lockfuncarg */
			       &txr->txtag))) {
		device_printf(dev, "Unable to allocate TX DMA tag\n");
		goto fail;
	}

	if (!(txr->tx_buffers =
	    (struct igb_tx_buffer *) malloc(sizeof(struct igb_tx_buffer) *
	    adapter->num_tx_desc, M_DEVBUF, M_NOWAIT | M_ZERO))) {
		device_printf(dev, "Unable to allocate tx_buffer memory\n");
		error = ENOMEM;
		goto fail;
	}

	/* Create the descriptor buffer dma maps */
	txbuf = txr->tx_buffers;
	for (i = 0; i < adapter->num_tx_desc; i++, txbuf++) {
		error = bus_dmamap_create(txr->txtag, 0, &txbuf->map);
		if (error != 0) {
			device_printf(dev, "Unable to create TX DMA map\n");
			goto fail;
		}
	}

	return 0;
fail:
	/* We free all, it handles case where we are in the middle */
	igb_free_transmit_structures(adapter);
	return (error);
}

/*********************************************************************
 *
 *  Initialize a transmit ring.
 *
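 *  When the interface is in netmap(4) mode, netmap_reset() hands
 *  back this ring's slot array and the buffers are pre-mapped onto
 *  netmap's own pool instead of mbufs.  netmap_idx_n2k() translates
 *  a NIC ring index into the matching netmap ring index;
 *  conceptually (a simplified sketch -- the real helper also
 *  handles wraparound):
 *
 *	si = i + kring->nkr_hwofs;	// netmap slot for NIC slot i
 *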
* **********************************************************************/ static void igb_setup_transmit_ring(struct tx_ring *txr) { struct adapter *adapter = txr->adapter; struct igb_tx_buffer *txbuf; int i; #ifdef DEV_NETMAP struct netmap_adapter *na = NA(adapter->ifp); struct netmap_slot *slot; #endif /* DEV_NETMAP */ /* Clear the old descriptor contents */ IGB_TX_LOCK(txr); #ifdef DEV_NETMAP slot = netmap_reset(na, NR_TX, txr->me, 0); #endif /* DEV_NETMAP */ bzero((void *)txr->tx_base, (sizeof(union e1000_adv_tx_desc)) * adapter->num_tx_desc); /* Reset indices */ txr->next_avail_desc = 0; txr->next_to_clean = 0; /* Free any existing tx buffers. */ txbuf = txr->tx_buffers; for (i = 0; i < adapter->num_tx_desc; i++, txbuf++) { if (txbuf->m_head != NULL) { bus_dmamap_sync(txr->txtag, txbuf->map, BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(txr->txtag, txbuf->map); m_freem(txbuf->m_head); txbuf->m_head = NULL; } #ifdef DEV_NETMAP if (slot) { int si = netmap_idx_n2k(&na->tx_rings[txr->me], i); /* no need to set the address */ netmap_load_map(txr->txtag, txbuf->map, NMB(slot + si)); } #endif /* DEV_NETMAP */ /* clear the watch index */ txbuf->next_eop = -1; } /* Set number of descriptors available */ txr->tx_avail = adapter->num_tx_desc; bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); IGB_TX_UNLOCK(txr); } /********************************************************************* * * Initialize all transmit rings. * **********************************************************************/ static void igb_setup_transmit_structures(struct adapter *adapter) { struct tx_ring *txr = adapter->tx_rings; for (int i = 0; i < adapter->num_queues; i++, txr++) igb_setup_transmit_ring(txr); return; } /********************************************************************* * * Enable transmit unit. * **********************************************************************/ static void igb_initialize_transmit_units(struct adapter *adapter) { struct tx_ring *txr = adapter->tx_rings; struct e1000_hw *hw = &adapter->hw; u32 tctl, txdctl; INIT_DEBUGOUT("igb_initialize_transmit_units: begin"); tctl = txdctl = 0; /* Setup the Tx Descriptor Rings */ for (int i = 0; i < adapter->num_queues; i++, txr++) { u64 bus_addr = txr->txdma.dma_paddr; E1000_WRITE_REG(hw, E1000_TDLEN(i), adapter->num_tx_desc * sizeof(struct e1000_tx_desc)); E1000_WRITE_REG(hw, E1000_TDBAH(i), (uint32_t)(bus_addr >> 32)); E1000_WRITE_REG(hw, E1000_TDBAL(i), (uint32_t)bus_addr); /* Setup the HW Tx Head and Tail descriptor pointers */ E1000_WRITE_REG(hw, E1000_TDT(i), 0); E1000_WRITE_REG(hw, E1000_TDH(i), 0); HW_DEBUGOUT2("Base = %x, Length = %x\n", E1000_READ_REG(hw, E1000_TDBAL(i)), E1000_READ_REG(hw, E1000_TDLEN(i))); txr->queue_status = IGB_QUEUE_IDLE; txdctl |= IGB_TX_PTHRESH; txdctl |= IGB_TX_HTHRESH << 8; txdctl |= IGB_TX_WTHRESH << 16; txdctl |= E1000_TXDCTL_QUEUE_ENABLE; E1000_WRITE_REG(hw, E1000_TXDCTL(i), txdctl); } if (adapter->vf_ifp) return; e1000_config_collision_dist(hw); /* Program the Transmit Control Register */ tctl = E1000_READ_REG(hw, E1000_TCTL); tctl &= ~E1000_TCTL_CT; tctl |= (E1000_TCTL_PSP | E1000_TCTL_RTLC | E1000_TCTL_EN | (E1000_COLLISION_THRESHOLD << E1000_CT_SHIFT)); /* This write will effectively turn on the transmit unit. */ E1000_WRITE_REG(hw, E1000_TCTL, tctl); } /********************************************************************* * * Free all transmit rings. 
* **********************************************************************/ static void igb_free_transmit_structures(struct adapter *adapter) { struct tx_ring *txr = adapter->tx_rings; for (int i = 0; i < adapter->num_queues; i++, txr++) { IGB_TX_LOCK(txr); igb_free_transmit_buffers(txr); igb_dma_free(adapter, &txr->txdma); IGB_TX_UNLOCK(txr); IGB_TX_LOCK_DESTROY(txr); } free(adapter->tx_rings, M_DEVBUF); } /********************************************************************* * * Free transmit ring related data structures. * **********************************************************************/ static void igb_free_transmit_buffers(struct tx_ring *txr) { struct adapter *adapter = txr->adapter; struct igb_tx_buffer *tx_buffer; int i; INIT_DEBUGOUT("free_transmit_ring: begin"); if (txr->tx_buffers == NULL) return; tx_buffer = txr->tx_buffers; for (i = 0; i < adapter->num_tx_desc; i++, tx_buffer++) { if (tx_buffer->m_head != NULL) { bus_dmamap_sync(txr->txtag, tx_buffer->map, BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(txr->txtag, tx_buffer->map); m_freem(tx_buffer->m_head); tx_buffer->m_head = NULL; if (tx_buffer->map != NULL) { bus_dmamap_destroy(txr->txtag, tx_buffer->map); tx_buffer->map = NULL; } } else if (tx_buffer->map != NULL) { bus_dmamap_unload(txr->txtag, tx_buffer->map); bus_dmamap_destroy(txr->txtag, tx_buffer->map); tx_buffer->map = NULL; } } #ifndef IGB_LEGACY_TX if (txr->br != NULL) buf_ring_free(txr->br, M_DEVBUF); #endif if (txr->tx_buffers != NULL) { free(txr->tx_buffers, M_DEVBUF); txr->tx_buffers = NULL; } if (txr->txtag != NULL) { bus_dma_tag_destroy(txr->txtag); txr->txtag = NULL; } return; } /********************************************************************** * * Setup work for hardware segmentation offload (TSO) * **********************************************************************/ static bool igb_tso_setup(struct tx_ring *txr, struct mbuf *mp, int ehdrlen, struct ip *ip, struct tcphdr *th) { struct adapter *adapter = txr->adapter; struct e1000_adv_tx_context_desc *TXD; struct igb_tx_buffer *tx_buffer; u32 vlan_macip_lens = 0, type_tucmd_mlhl = 0; u32 mss_l4len_idx = 0; u16 vtag = 0; int ctxd, ip_hlen, tcp_hlen; ctxd = txr->next_avail_desc; tx_buffer = &txr->tx_buffers[ctxd]; TXD = (struct e1000_adv_tx_context_desc *) &txr->tx_base[ctxd]; ip->ip_sum = 0; ip_hlen = ip->ip_hl << 2; tcp_hlen = th->th_off << 2; /* VLAN MACLEN IPLEN */ if (mp->m_flags & M_VLANTAG) { vtag = htole16(mp->m_pkthdr.ether_vtag); vlan_macip_lens |= (vtag << E1000_ADVTXD_VLAN_SHIFT); } vlan_macip_lens |= (ehdrlen << E1000_ADVTXD_MACLEN_SHIFT); vlan_macip_lens |= ip_hlen; TXD->vlan_macip_lens |= htole32(vlan_macip_lens); /* ADV DTYPE TUCMD */ type_tucmd_mlhl |= E1000_ADVTXD_DCMD_DEXT | E1000_ADVTXD_DTYP_CTXT; type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_L4T_TCP; type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_IPV4; TXD->type_tucmd_mlhl |= htole32(type_tucmd_mlhl); /* MSS L4LEN IDX */ mss_l4len_idx |= (mp->m_pkthdr.tso_segsz << E1000_ADVTXD_MSS_SHIFT); mss_l4len_idx |= (tcp_hlen << E1000_ADVTXD_L4LEN_SHIFT); /* 82575 needs the queue index added */ if (adapter->hw.mac.type == e1000_82575) mss_l4len_idx |= txr->me << 4; TXD->mss_l4len_idx = htole32(mss_l4len_idx); TXD->seqnum_seed = htole32(0); tx_buffer->m_head = NULL; tx_buffer->next_eop = -1; if (++ctxd == adapter->num_tx_desc) ctxd = 0; txr->tx_avail--; txr->next_avail_desc = ctxd; return TRUE; } /********************************************************************* * * Context Descriptor setup for VLAN or CSUM * 
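 * A worked example (illustration only): vlan_macip_lens packs three
 * fields into one 32-bit word -- VLAN tag, MAC header length and IP
 * header length.  Assuming the usual advanced-descriptor shifts
 * (VLAN at bit 16, MACLEN at bit 9), an untagged IPv4 frame with no
 * options packs as
 *
 *	vlan_macip_lens = (0 << 16) | (14 << 9) | 20;	// == 0x1c14
 *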
**********************************************************************/ static bool igb_tx_ctx_setup(struct tx_ring *txr, struct mbuf *mp) { struct adapter *adapter = txr->adapter; struct e1000_adv_tx_context_desc *TXD; struct igb_tx_buffer *tx_buffer; u32 vlan_macip_lens, type_tucmd_mlhl, mss_l4len_idx; struct ether_vlan_header *eh; struct ip *ip = NULL; struct ip6_hdr *ip6; int ehdrlen, ctxd, ip_hlen = 0; u16 etype, vtag = 0; u8 ipproto = 0; bool offload = TRUE; if ((mp->m_pkthdr.csum_flags & CSUM_OFFLOAD) == 0) offload = FALSE; vlan_macip_lens = type_tucmd_mlhl = mss_l4len_idx = 0; ctxd = txr->next_avail_desc; tx_buffer = &txr->tx_buffers[ctxd]; TXD = (struct e1000_adv_tx_context_desc *) &txr->tx_base[ctxd]; /* ** In advanced descriptors the vlan tag must ** be placed into the context descriptor, thus ** we need to be here just for that setup. */ if (mp->m_flags & M_VLANTAG) { vtag = htole16(mp->m_pkthdr.ether_vtag); vlan_macip_lens |= (vtag << E1000_ADVTXD_VLAN_SHIFT); } else if (offload == FALSE) return FALSE; /* * Determine where frame payload starts. * Jump over vlan headers if already present, * helpful for QinQ too. */ eh = mtod(mp, struct ether_vlan_header *); if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) { etype = ntohs(eh->evl_proto); ehdrlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN; } else { etype = ntohs(eh->evl_encap_proto); ehdrlen = ETHER_HDR_LEN; } /* Set the ether header length */ vlan_macip_lens |= ehdrlen << E1000_ADVTXD_MACLEN_SHIFT; switch (etype) { case ETHERTYPE_IP: ip = (struct ip *)(mp->m_data + ehdrlen); ip_hlen = ip->ip_hl << 2; if (mp->m_len < ehdrlen + ip_hlen) { offload = FALSE; break; } ipproto = ip->ip_p; type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_IPV4; break; case ETHERTYPE_IPV6: ip6 = (struct ip6_hdr *)(mp->m_data + ehdrlen); ip_hlen = sizeof(struct ip6_hdr); ipproto = ip6->ip6_nxt; type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_IPV6; break; default: offload = FALSE; break; } vlan_macip_lens |= ip_hlen; type_tucmd_mlhl |= E1000_ADVTXD_DCMD_DEXT | E1000_ADVTXD_DTYP_CTXT; switch (ipproto) { case IPPROTO_TCP: if (mp->m_pkthdr.csum_flags & CSUM_TCP) type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_L4T_TCP; break; case IPPROTO_UDP: if (mp->m_pkthdr.csum_flags & CSUM_UDP) type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_L4T_UDP; break; #if __FreeBSD_version >= 800000 case IPPROTO_SCTP: if (mp->m_pkthdr.csum_flags & CSUM_SCTP) type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_L4T_SCTP; break; #endif default: offload = FALSE; break; } /* 82575 needs the queue index added */ if (adapter->hw.mac.type == e1000_82575) mss_l4len_idx = txr->me << 4; /* Now copy bits into descriptor */ TXD->vlan_macip_lens |= htole32(vlan_macip_lens); TXD->type_tucmd_mlhl |= htole32(type_tucmd_mlhl); TXD->seqnum_seed = htole32(0); TXD->mss_l4len_idx = htole32(mss_l4len_idx); tx_buffer->m_head = NULL; tx_buffer->next_eop = -1; /* We've consumed the first desc, adjust counters */ if (++ctxd == adapter->num_tx_desc) ctxd = 0; txr->next_avail_desc = ctxd; --txr->tx_avail; return (offload); } /********************************************************************** * * Examine each tx_buffer in the used queue. If the hardware is done * processing the packet then free associated resources. The * tx_buffer is put back on the free queue. * * TRUE return means there's work in the ring to clean, FALSE its empty. 
**********************************************************************/ static bool igb_txeof(struct tx_ring *txr) { struct adapter *adapter = txr->adapter; int first, last, done, processed; struct igb_tx_buffer *tx_buffer; struct e1000_tx_desc *tx_desc, *eop_desc; struct ifnet *ifp = adapter->ifp; IGB_TX_LOCK_ASSERT(txr); #ifdef DEV_NETMAP if (netmap_tx_irq(ifp, txr->me )) return (FALSE); #endif /* DEV_NETMAP */ if (txr->tx_avail == adapter->num_tx_desc) { txr->queue_status = IGB_QUEUE_IDLE; return FALSE; } processed = 0; first = txr->next_to_clean; tx_desc = &txr->tx_base[first]; tx_buffer = &txr->tx_buffers[first]; last = tx_buffer->next_eop; eop_desc = &txr->tx_base[last]; /* * What this does is get the index of the * first descriptor AFTER the EOP of the * first packet, that way we can do the * simple comparison on the inner while loop. */ if (++last == adapter->num_tx_desc) last = 0; done = last; bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map, BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); while (eop_desc->upper.fields.status & E1000_TXD_STAT_DD) { /* We clean the range of the packet */ while (first != done) { tx_desc->upper.data = 0; tx_desc->lower.data = 0; tx_desc->buffer_addr = 0; ++txr->tx_avail; ++processed; if (tx_buffer->m_head) { txr->bytes += tx_buffer->m_head->m_pkthdr.len; bus_dmamap_sync(txr->txtag, tx_buffer->map, BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(txr->txtag, tx_buffer->map); m_freem(tx_buffer->m_head); tx_buffer->m_head = NULL; } tx_buffer->next_eop = -1; txr->watchdog_time = ticks; if (++first == adapter->num_tx_desc) first = 0; tx_buffer = &txr->tx_buffers[first]; tx_desc = &txr->tx_base[first]; } ++txr->packets; ++ifp->if_opackets; /* See if we can continue to the next packet */ last = tx_buffer->next_eop; if (last != -1) { eop_desc = &txr->tx_base[last]; /* Get new done point */ if (++last == adapter->num_tx_desc) last = 0; done = last; } else break; } bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); txr->next_to_clean = first; /* ** Watchdog calculation, we know there's ** work outstanding or the first return ** would have been taken, so none processed ** for too long indicates a hang. */ if ((!processed) && ((ticks - txr->watchdog_time) > IGB_WATCHDOG)) txr->queue_status |= IGB_QUEUE_HUNG; /* * If we have a minimum free, * clear depleted state bit */ if (txr->tx_avail >= IGB_QUEUE_THRESHOLD) txr->queue_status &= ~IGB_QUEUE_DEPLETED; /* All clean, turn off the watchdog */ if (txr->tx_avail == adapter->num_tx_desc) { txr->queue_status = IGB_QUEUE_IDLE; return (FALSE); } return (TRUE); } /********************************************************************* * * Refresh mbuf buffers for RX descriptor rings * - now keeps its own state so discards due to resource * exhaustion are unnecessary, if an mbuf cannot be obtained * it just returns, keeping its placeholder, thus it can simply * be recalled to try again. * **********************************************************************/ static void igb_refresh_mbufs(struct rx_ring *rxr, int limit) { struct adapter *adapter = rxr->adapter; bus_dma_segment_t hseg[1]; bus_dma_segment_t pseg[1]; struct igb_rx_buf *rxbuf; struct mbuf *mh, *mp; int i, j, nsegs, error; bool refreshed = FALSE; i = j = rxr->next_to_refresh; /* ** Get one descriptor beyond ** our work mark to control ** the loop. 
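**
** A worked example (illustration only): with num_rx_desc = 8,
** next_to_refresh = 5 and limit = 2, j starts at 6 and the loop
** below refreshes slots 5, 6, 7 and 0, finishing with
** next_to_refresh = 1, which is both the next starting point and
** the value written to RDT.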
*/ if (++j == adapter->num_rx_desc) j = 0; while (j != limit) { rxbuf = &rxr->rx_buffers[i]; /* No hdr mbuf used with header split off */ if (rxr->hdr_split == FALSE) goto no_split; if (rxbuf->m_head == NULL) { mh = m_gethdr(M_NOWAIT, MT_DATA); if (mh == NULL) goto update; } else mh = rxbuf->m_head; mh->m_pkthdr.len = mh->m_len = MHLEN; mh->m_len = MHLEN; mh->m_flags |= M_PKTHDR; /* Get the memory mapping */ error = bus_dmamap_load_mbuf_sg(rxr->htag, rxbuf->hmap, mh, hseg, &nsegs, BUS_DMA_NOWAIT); if (error != 0) { printf("Refresh mbufs: hdr dmamap load" " failure - %d\n", error); m_free(mh); rxbuf->m_head = NULL; goto update; } rxbuf->m_head = mh; bus_dmamap_sync(rxr->htag, rxbuf->hmap, BUS_DMASYNC_PREREAD); rxr->rx_base[i].read.hdr_addr = htole64(hseg[0].ds_addr); no_split: if (rxbuf->m_pack == NULL) { mp = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, adapter->rx_mbuf_sz); if (mp == NULL) goto update; } else mp = rxbuf->m_pack; mp->m_pkthdr.len = mp->m_len = adapter->rx_mbuf_sz; /* Get the memory mapping */ error = bus_dmamap_load_mbuf_sg(rxr->ptag, rxbuf->pmap, mp, pseg, &nsegs, BUS_DMA_NOWAIT); if (error != 0) { printf("Refresh mbufs: payload dmamap load" " failure - %d\n", error); m_free(mp); rxbuf->m_pack = NULL; goto update; } rxbuf->m_pack = mp; bus_dmamap_sync(rxr->ptag, rxbuf->pmap, BUS_DMASYNC_PREREAD); rxr->rx_base[i].read.pkt_addr = htole64(pseg[0].ds_addr); refreshed = TRUE; /* I feel wefreshed :) */ i = j; /* our next is precalculated */ rxr->next_to_refresh = i; if (++j == adapter->num_rx_desc) j = 0; } update: if (refreshed) /* update tail */ E1000_WRITE_REG(&adapter->hw, E1000_RDT(rxr->me), rxr->next_to_refresh); return; } /********************************************************************* * * Allocate memory for rx_buffer structures. Since we use one * rx_buffer per received packet, the maximum number of rx_buffer's * that we'll need is equal to the number of receive descriptors * that we've allocated. 
* **********************************************************************/ static int igb_allocate_receive_buffers(struct rx_ring *rxr) { struct adapter *adapter = rxr->adapter; device_t dev = adapter->dev; struct igb_rx_buf *rxbuf; int i, bsize, error; bsize = sizeof(struct igb_rx_buf) * adapter->num_rx_desc; if (!(rxr->rx_buffers = (struct igb_rx_buf *) malloc(bsize, M_DEVBUF, M_NOWAIT | M_ZERO))) { device_printf(dev, "Unable to allocate rx_buffer memory\n"); error = ENOMEM; goto fail; } if ((error = bus_dma_tag_create(bus_get_dma_tag(dev), 1, 0, /* alignment, bounds */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ MSIZE, /* maxsize */ 1, /* nsegments */ MSIZE, /* maxsegsize */ 0, /* flags */ NULL, /* lockfunc */ NULL, /* lockfuncarg */ &rxr->htag))) { device_printf(dev, "Unable to create RX DMA tag\n"); goto fail; } if ((error = bus_dma_tag_create(bus_get_dma_tag(dev), 1, 0, /* alignment, bounds */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ MJUM9BYTES, /* maxsize */ 1, /* nsegments */ MJUM9BYTES, /* maxsegsize */ 0, /* flags */ NULL, /* lockfunc */ NULL, /* lockfuncarg */ &rxr->ptag))) { device_printf(dev, "Unable to create RX payload DMA tag\n"); goto fail; } for (i = 0; i < adapter->num_rx_desc; i++) { rxbuf = &rxr->rx_buffers[i]; error = bus_dmamap_create(rxr->htag, BUS_DMA_NOWAIT, &rxbuf->hmap); if (error) { device_printf(dev, "Unable to create RX head DMA maps\n"); goto fail; } error = bus_dmamap_create(rxr->ptag, BUS_DMA_NOWAIT, &rxbuf->pmap); if (error) { device_printf(dev, "Unable to create RX packet DMA maps\n"); goto fail; } } return (0); fail: /* Frees all, but can handle partial completion */ igb_free_receive_structures(adapter); return (error); } static void igb_free_receive_ring(struct rx_ring *rxr) { struct adapter *adapter = rxr->adapter; struct igb_rx_buf *rxbuf; for (int i = 0; i < adapter->num_rx_desc; i++) { rxbuf = &rxr->rx_buffers[i]; if (rxbuf->m_head != NULL) { bus_dmamap_sync(rxr->htag, rxbuf->hmap, BUS_DMASYNC_POSTREAD); bus_dmamap_unload(rxr->htag, rxbuf->hmap); rxbuf->m_head->m_flags |= M_PKTHDR; m_freem(rxbuf->m_head); } if (rxbuf->m_pack != NULL) { bus_dmamap_sync(rxr->ptag, rxbuf->pmap, BUS_DMASYNC_POSTREAD); bus_dmamap_unload(rxr->ptag, rxbuf->pmap); rxbuf->m_pack->m_flags |= M_PKTHDR; m_freem(rxbuf->m_pack); } rxbuf->m_head = NULL; rxbuf->m_pack = NULL; } } /********************************************************************* * * Initialize a receive ring and its buffers. * **********************************************************************/ static int igb_setup_receive_ring(struct rx_ring *rxr) { struct adapter *adapter; struct ifnet *ifp; device_t dev; struct igb_rx_buf *rxbuf; bus_dma_segment_t pseg[1], hseg[1]; struct lro_ctrl *lro = &rxr->lro; int rsize, nsegs, error = 0; #ifdef DEV_NETMAP struct netmap_adapter *na = NA(rxr->adapter->ifp); struct netmap_slot *slot; #endif /* DEV_NETMAP */ adapter = rxr->adapter; dev = adapter->dev; ifp = adapter->ifp; /* Clear the ring contents */ IGB_RX_LOCK(rxr); #ifdef DEV_NETMAP slot = netmap_reset(na, NR_RX, rxr->me, 0); #endif /* DEV_NETMAP */ rsize = roundup2(adapter->num_rx_desc * sizeof(union e1000_adv_rx_desc), IGB_DBA_ALIGN); bzero((void *)rxr->rx_base, rsize); /* ** Free current RX buffer structures and their mbufs */ igb_free_receive_ring(rxr); /* Configure for header split? 
 */
	if (igb_header_split)
		rxr->hdr_split = TRUE;

	/* Now replenish the ring mbufs */
	for (int j = 0; j < adapter->num_rx_desc; ++j) {
		struct mbuf *mh, *mp;

		rxbuf = &rxr->rx_buffers[j];
#ifdef DEV_NETMAP
		if (slot) {
			/* slot sj is mapped to the j-th NIC-ring entry */
			int sj = netmap_idx_n2k(&na->rx_rings[rxr->me], j);
			uint64_t paddr;
			void *addr;

			addr = PNMB(slot + sj, &paddr);
			netmap_load_map(rxr->ptag, rxbuf->pmap, addr);
			/* Update descriptor */
			rxr->rx_base[j].read.pkt_addr = htole64(paddr);
			continue;
		}
#endif /* DEV_NETMAP */
		if (rxr->hdr_split == FALSE)
			goto skip_head;

		/* First the header */
		rxbuf->m_head = m_gethdr(M_NOWAIT, MT_DATA);
		if (rxbuf->m_head == NULL) {
			error = ENOBUFS;
			goto fail;
		}
		m_adj(rxbuf->m_head, ETHER_ALIGN);
		mh = rxbuf->m_head;
		mh->m_len = mh->m_pkthdr.len = MHLEN;
		mh->m_flags |= M_PKTHDR;
		/* Get the memory mapping */
		error = bus_dmamap_load_mbuf_sg(rxr->htag,
		    rxbuf->hmap, rxbuf->m_head, hseg,
		    &nsegs, BUS_DMA_NOWAIT);
		if (error != 0) /* Nothing elegant to do here */
			goto fail;
		bus_dmamap_sync(rxr->htag,
		    rxbuf->hmap, BUS_DMASYNC_PREREAD);
		/* Update descriptor */
		rxr->rx_base[j].read.hdr_addr = htole64(hseg[0].ds_addr);

skip_head:
		/* Now the payload cluster */
		rxbuf->m_pack = m_getjcl(M_NOWAIT, MT_DATA,
		    M_PKTHDR, adapter->rx_mbuf_sz);
		if (rxbuf->m_pack == NULL) {
			error = ENOBUFS;
			goto fail;
		}
		mp = rxbuf->m_pack;
		mp->m_pkthdr.len = mp->m_len = adapter->rx_mbuf_sz;
		/* Get the memory mapping */
		error = bus_dmamap_load_mbuf_sg(rxr->ptag,
		    rxbuf->pmap, mp, pseg,
		    &nsegs, BUS_DMA_NOWAIT);
		if (error != 0)
			goto fail;
		bus_dmamap_sync(rxr->ptag,
		    rxbuf->pmap, BUS_DMASYNC_PREREAD);
		/* Update descriptor */
		rxr->rx_base[j].read.pkt_addr = htole64(pseg[0].ds_addr);
	}

	/* Setup our descriptor indices */
	rxr->next_to_check = 0;
	rxr->next_to_refresh = adapter->num_rx_desc - 1;
	rxr->lro_enabled = FALSE;
	rxr->rx_split_packets = 0;
	rxr->rx_bytes = 0;

	rxr->fmp = NULL;
	rxr->lmp = NULL;
	rxr->discard = FALSE;

	bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map,
	    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);

	/*
	** Now set up the LRO interface; we also
	** do header split only when LRO is enabled,
	** since it is so often undesirable in
	** similar setups.
	*/
	if (ifp->if_capenable & IFCAP_LRO) {
		error = tcp_lro_init(lro);
		if (error) {
			device_printf(dev, "LRO Initialization failed!\n");
			goto fail;
		}
		INIT_DEBUGOUT("RX LRO Initialized\n");
		rxr->lro_enabled = TRUE;
		lro->ifp = adapter->ifp;
	}

	IGB_RX_UNLOCK(rxr);
	return (0);

fail:
	igb_free_receive_ring(rxr);
	IGB_RX_UNLOCK(rxr);
	return (error);
}

/*********************************************************************
 *
 *  Initialize all receive rings.
 *
 **********************************************************************/
static int
igb_setup_receive_structures(struct adapter *adapter)
{
	struct rx_ring *rxr = adapter->rx_rings;
	int i;

	for (i = 0; i < adapter->num_queues; i++, rxr++)
		if (igb_setup_receive_ring(rxr))
			goto fail;

	return (0);
fail:
	/*
	 * Free RX buffers allocated so far, we will only handle
	 * the rings that completed, the failing case will have
	 * cleaned up for itself. 'i' is the endpoint.
	 */
	for (int j = 0; j < i; ++j) {
		rxr = &adapter->rx_rings[j];
		IGB_RX_LOCK(rxr);
		igb_free_receive_ring(rxr);
		IGB_RX_UNLOCK(rxr);
	}

	return (ENOBUFS);
}

/*********************************************************************
 *
 *  Enable receive unit.
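 *
 *  SRRCTL.BSIZEPKT is programmed in 1 KB units; assuming
 *  E1000_SRRCTL_BSIZEPKT_SHIFT is 10 (as on these parts), the
 *  jumbo path below computes, for example:
 *
 *	srrctl |= 4096 >> E1000_SRRCTL_BSIZEPKT_SHIFT;	// 4 -> 4 KB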
* **********************************************************************/ static void igb_initialize_receive_units(struct adapter *adapter) { struct rx_ring *rxr = adapter->rx_rings; struct ifnet *ifp = adapter->ifp; struct e1000_hw *hw = &adapter->hw; u32 rctl, rxcsum, psize, srrctl = 0; INIT_DEBUGOUT("igb_initialize_receive_unit: begin"); /* * Make sure receives are disabled while setting * up the descriptor ring */ rctl = E1000_READ_REG(hw, E1000_RCTL); E1000_WRITE_REG(hw, E1000_RCTL, rctl & ~E1000_RCTL_EN); /* ** Set up for header split */ if (igb_header_split) { /* Use a standard mbuf for the header */ srrctl |= IGB_HDR_BUF << E1000_SRRCTL_BSIZEHDRSIZE_SHIFT; srrctl |= E1000_SRRCTL_DESCTYPE_HDR_SPLIT_ALWAYS; } else srrctl |= E1000_SRRCTL_DESCTYPE_ADV_ONEBUF; /* ** Set up for jumbo frames */ if (ifp->if_mtu > ETHERMTU) { rctl |= E1000_RCTL_LPE; if (adapter->rx_mbuf_sz == MJUMPAGESIZE) { srrctl |= 4096 >> E1000_SRRCTL_BSIZEPKT_SHIFT; rctl |= E1000_RCTL_SZ_4096 | E1000_RCTL_BSEX; } else if (adapter->rx_mbuf_sz > MJUMPAGESIZE) { srrctl |= 8192 >> E1000_SRRCTL_BSIZEPKT_SHIFT; rctl |= E1000_RCTL_SZ_8192 | E1000_RCTL_BSEX; } /* Set maximum packet len */ psize = adapter->max_frame_size; /* are we on a vlan? */ if (adapter->ifp->if_vlantrunk != NULL) psize += VLAN_TAG_SIZE; E1000_WRITE_REG(&adapter->hw, E1000_RLPML, psize); } else { rctl &= ~E1000_RCTL_LPE; srrctl |= 2048 >> E1000_SRRCTL_BSIZEPKT_SHIFT; rctl |= E1000_RCTL_SZ_2048; } /* Setup the Base and Length of the Rx Descriptor Rings */ for (int i = 0; i < adapter->num_queues; i++, rxr++) { u64 bus_addr = rxr->rxdma.dma_paddr; u32 rxdctl; E1000_WRITE_REG(hw, E1000_RDLEN(i), adapter->num_rx_desc * sizeof(struct e1000_rx_desc)); E1000_WRITE_REG(hw, E1000_RDBAH(i), (uint32_t)(bus_addr >> 32)); E1000_WRITE_REG(hw, E1000_RDBAL(i), (uint32_t)bus_addr); E1000_WRITE_REG(hw, E1000_SRRCTL(i), srrctl); /* Enable this Queue */ rxdctl = E1000_READ_REG(hw, E1000_RXDCTL(i)); rxdctl |= E1000_RXDCTL_QUEUE_ENABLE; rxdctl &= 0xFFF00000; rxdctl |= IGB_RX_PTHRESH; rxdctl |= IGB_RX_HTHRESH << 8; rxdctl |= IGB_RX_WTHRESH << 16; E1000_WRITE_REG(hw, E1000_RXDCTL(i), rxdctl); } /* ** Setup for RX MultiQueue */ rxcsum = E1000_READ_REG(hw, E1000_RXCSUM); if (adapter->num_queues >1) { u32 random[10], mrqc, shift = 0; union igb_reta { u32 dword; u8 bytes[4]; } reta; arc4rand(&random, sizeof(random), 0); if (adapter->hw.mac.type == e1000_82575) shift = 6; /* Warning FM follows */ for (int i = 0; i < 128; i++) { reta.bytes[i & 3] = (i % adapter->num_queues) << shift; if ((i & 3) == 3) E1000_WRITE_REG(hw, E1000_RETA(i >> 2), reta.dword); } /* Now fill in hash table */ mrqc = E1000_MRQC_ENABLE_RSS_4Q; for (int i = 0; i < 10; i++) E1000_WRITE_REG_ARRAY(hw, E1000_RSSRK(0), i, random[i]); mrqc |= (E1000_MRQC_RSS_FIELD_IPV4 | E1000_MRQC_RSS_FIELD_IPV4_TCP); mrqc |= (E1000_MRQC_RSS_FIELD_IPV6 | E1000_MRQC_RSS_FIELD_IPV6_TCP); mrqc |=( E1000_MRQC_RSS_FIELD_IPV4_UDP | E1000_MRQC_RSS_FIELD_IPV6_UDP); mrqc |=( E1000_MRQC_RSS_FIELD_IPV6_UDP_EX | E1000_MRQC_RSS_FIELD_IPV6_TCP_EX); E1000_WRITE_REG(hw, E1000_MRQC, mrqc); /* ** NOTE: Receive Full-Packet Checksum Offload ** is mutually exclusive with Multiqueue. However ** this is not the same as TCP/IP checksums which ** still work. 
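** (NB: the RXCSUM.PCSD bit set just below is what makes the two exclusive; on these controllers it repurposes the descriptor's raw packet-checksum field to report the RSS hash, a datasheet-level detail stated here as background, while the per-protocol IP/TCP/UDP checksum status bits keep working.)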
*/ rxcsum |= E1000_RXCSUM_PCSD; #if __FreeBSD_version >= 800000 /* For SCTP Offload */ if ((hw->mac.type == e1000_82576) && (ifp->if_capenable & IFCAP_RXCSUM)) rxcsum |= E1000_RXCSUM_CRCOFL; #endif } else { /* Non RSS setup */ if (ifp->if_capenable & IFCAP_RXCSUM) { rxcsum |= E1000_RXCSUM_IPPCSE; #if __FreeBSD_version >= 800000 if (adapter->hw.mac.type == e1000_82576) rxcsum |= E1000_RXCSUM_CRCOFL; #endif } else rxcsum &= ~E1000_RXCSUM_TUOFL; } E1000_WRITE_REG(hw, E1000_RXCSUM, rxcsum); /* Setup the Receive Control Register */ rctl &= ~(3 << E1000_RCTL_MO_SHIFT); rctl |= E1000_RCTL_EN | E1000_RCTL_BAM | E1000_RCTL_LBM_NO | E1000_RCTL_RDMTS_HALF | (hw->mac.mc_filter_type << E1000_RCTL_MO_SHIFT); /* Strip CRC bytes. */ rctl |= E1000_RCTL_SECRC; /* Make sure VLAN Filters are off */ rctl &= ~E1000_RCTL_VFE; /* Don't store bad packets */ rctl &= ~E1000_RCTL_SBP; /* Enable Receives */ E1000_WRITE_REG(hw, E1000_RCTL, rctl); /* * Setup the HW Rx Head and Tail Descriptor Pointers * - needs to be after enable */ for (int i = 0; i < adapter->num_queues; i++) { rxr = &adapter->rx_rings[i]; E1000_WRITE_REG(hw, E1000_RDH(i), rxr->next_to_check); #ifdef DEV_NETMAP /* * an init() while a netmap client is active must * preserve the rx buffers passed to userspace. * In this driver it means we adjust RDT to - * somthing different from next_to_refresh + * something different from next_to_refresh * (which is not used in netmap mode). */ if (ifp->if_capenable & IFCAP_NETMAP) { struct netmap_adapter *na = NA(adapter->ifp); struct netmap_kring *kring = &na->rx_rings[i]; - int t = rxr->next_to_refresh - kring->nr_hwavail; + int t = rxr->next_to_refresh - nm_kr_rxspace(kring); if (t >= adapter->num_rx_desc) t -= adapter->num_rx_desc; else if (t < 0) t += adapter->num_rx_desc; E1000_WRITE_REG(hw, E1000_RDT(i), t); } else #endif /* DEV_NETMAP */ E1000_WRITE_REG(hw, E1000_RDT(i), rxr->next_to_refresh); } return; } /********************************************************************* * * Free receive rings. * **********************************************************************/ static void igb_free_receive_structures(struct adapter *adapter) { struct rx_ring *rxr = adapter->rx_rings; for (int i = 0; i < adapter->num_queues; i++, rxr++) { struct lro_ctrl *lro = &rxr->lro; igb_free_receive_buffers(rxr); tcp_lro_free(lro); igb_dma_free(adapter, &rxr->rxdma); } free(adapter->rx_rings, M_DEVBUF); } /********************************************************************* * * Free receive ring data structures. 
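* NB: unlike igb_free_receive_ring() above, which only frees the mbufs so a ring can be re-armed by igb_setup_receive_ring(), this routine also destroys the per-buffer DMA maps and the htag/ptag DMA tags.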
* **********************************************************************/ static void igb_free_receive_buffers(struct rx_ring *rxr) { struct adapter *adapter = rxr->adapter; struct igb_rx_buf *rxbuf; int i; INIT_DEBUGOUT("free_receive_structures: begin"); /* Cleanup any existing buffers */ if (rxr->rx_buffers != NULL) { for (i = 0; i < adapter->num_rx_desc; i++) { rxbuf = &rxr->rx_buffers[i]; if (rxbuf->m_head != NULL) { bus_dmamap_sync(rxr->htag, rxbuf->hmap, BUS_DMASYNC_POSTREAD); bus_dmamap_unload(rxr->htag, rxbuf->hmap); rxbuf->m_head->m_flags |= M_PKTHDR; m_freem(rxbuf->m_head); } if (rxbuf->m_pack != NULL) { bus_dmamap_sync(rxr->ptag, rxbuf->pmap, BUS_DMASYNC_POSTREAD); bus_dmamap_unload(rxr->ptag, rxbuf->pmap); rxbuf->m_pack->m_flags |= M_PKTHDR; m_freem(rxbuf->m_pack); } rxbuf->m_head = NULL; rxbuf->m_pack = NULL; if (rxbuf->hmap != NULL) { bus_dmamap_destroy(rxr->htag, rxbuf->hmap); rxbuf->hmap = NULL; } if (rxbuf->pmap != NULL) { bus_dmamap_destroy(rxr->ptag, rxbuf->pmap); rxbuf->pmap = NULL; } } if (rxr->rx_buffers != NULL) { free(rxr->rx_buffers, M_DEVBUF); rxr->rx_buffers = NULL; } } if (rxr->htag != NULL) { bus_dma_tag_destroy(rxr->htag); rxr->htag = NULL; } if (rxr->ptag != NULL) { bus_dma_tag_destroy(rxr->ptag); rxr->ptag = NULL; } } static __inline void igb_rx_discard(struct rx_ring *rxr, int i) { struct igb_rx_buf *rbuf; rbuf = &rxr->rx_buffers[i]; /* Partially received? Free the chain */ if (rxr->fmp != NULL) { rxr->fmp->m_flags |= M_PKTHDR; m_freem(rxr->fmp); rxr->fmp = NULL; rxr->lmp = NULL; } /* ** With advanced descriptors the writeback ** clobbers the buffer addrs, so it's easier ** to just free the existing mbufs and take ** the normal refresh path to get new buffers ** and mapping. */ if (rbuf->m_head) { m_free(rbuf->m_head); rbuf->m_head = NULL; } if (rbuf->m_pack) { m_free(rbuf->m_pack); rbuf->m_pack = NULL; } return; } static __inline void igb_rx_input(struct rx_ring *rxr, struct ifnet *ifp, struct mbuf *m, u32 ptype) { /* * ATM LRO is only for IPv4/TCP packets and TCP checksum of the packet * should be computed by hardware. Also it should not have VLAN tag in * ethernet header. */ if (rxr->lro_enabled && (ifp->if_capenable & IFCAP_VLAN_HWTAGGING) != 0 && (ptype & E1000_RXDADV_PKTTYPE_ETQF) == 0 && (ptype & (E1000_RXDADV_PKTTYPE_IPV4 | E1000_RXDADV_PKTTYPE_TCP)) == (E1000_RXDADV_PKTTYPE_IPV4 | E1000_RXDADV_PKTTYPE_TCP) && (m->m_pkthdr.csum_flags & (CSUM_DATA_VALID | CSUM_PSEUDO_HDR)) == (CSUM_DATA_VALID | CSUM_PSEUDO_HDR)) { /* * Send to the stack if: ** - LRO not enabled, or ** - no LRO resources, or ** - lro enqueue fails */ if (rxr->lro.lro_cnt != 0) if (tcp_lro_rx(&rxr->lro, m, 0) == 0) return; } IGB_RX_UNLOCK(rxr); (*ifp->if_input)(ifp, m); IGB_RX_LOCK(rxr); } /********************************************************************* * * This routine executes in interrupt context. It replenishes * the mbufs in the descriptor and sends data which has been * dma'ed into host memory to upper layer. * * We loop at most count times if count is > 0, or until done if * count < 0. * * Return TRUE if more to clean, FALSE otherwise *********************************************************************/ static bool igb_rxeof(struct igb_queue *que, int count, int *done) { struct adapter *adapter = que->adapter; struct rx_ring *rxr = que->rxr; struct ifnet *ifp = adapter->ifp; struct lro_ctrl *lro = &rxr->lro; struct lro_entry *queued; int i, processed = 0, rxdone = 0; u32 ptype, staterr = 0; union e1000_adv_rx_desc *cur; IGB_RX_LOCK(rxr); /* Sync the ring. 
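NB: the POSTREAD|POSTWRITE sync below publishes the device's descriptor writebacks to the CPU before the DD status bits are tested; the matching PREREAD|PREWRITE sync is issued per descriptor at the next_desc label further down.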
*/ bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map, BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); #ifdef DEV_NETMAP if (netmap_rx_irq(ifp, rxr->me, &processed)) { IGB_RX_UNLOCK(rxr); return (FALSE); } #endif /* DEV_NETMAP */ /* Main clean loop */ for (i = rxr->next_to_check; count != 0;) { struct mbuf *sendmp, *mh, *mp; struct igb_rx_buf *rxbuf; u16 hlen, plen, hdr, vtag; bool eop = FALSE; cur = &rxr->rx_base[i]; staterr = le32toh(cur->wb.upper.status_error); if ((staterr & E1000_RXD_STAT_DD) == 0) break; if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) break; count--; sendmp = mh = mp = NULL; cur->wb.upper.status_error = 0; rxbuf = &rxr->rx_buffers[i]; plen = le16toh(cur->wb.upper.length); ptype = le32toh(cur->wb.lower.lo_dword.data) & IGB_PKTTYPE_MASK; if ((adapter->hw.mac.type == e1000_i350) && (staterr & E1000_RXDEXT_STATERR_LB)) vtag = be16toh(cur->wb.upper.vlan); else vtag = le16toh(cur->wb.upper.vlan); hdr = le16toh(cur->wb.lower.lo_dword.hs_rss.hdr_info); eop = ((staterr & E1000_RXD_STAT_EOP) == E1000_RXD_STAT_EOP); /* Make sure all segments of a bad packet are discarded */ if (((staterr & E1000_RXDEXT_ERR_FRAME_ERR_MASK) != 0) || (rxr->discard)) { adapter->dropped_pkts++; ++rxr->rx_discarded; if (!eop) /* Catch subsequent segs */ rxr->discard = TRUE; else rxr->discard = FALSE; igb_rx_discard(rxr, i); goto next_desc; } /* ** The way the hardware is configured to ** split, it will ONLY use the header buffer ** when header split is enabled, otherwise we ** get normal behavior, ie, both header and ** payload are DMA'd into the payload buffer. ** ** The fmp test is to catch the case where a ** packet spans multiple descriptors, in that ** case only the first header is valid. */ if (rxr->hdr_split && rxr->fmp == NULL) { hlen = (hdr & E1000_RXDADV_HDRBUFLEN_MASK) >> E1000_RXDADV_HDRBUFLEN_SHIFT; if (hlen > IGB_HDR_BUF) hlen = IGB_HDR_BUF; mh = rxr->rx_buffers[i].m_head; mh->m_len = hlen; /* clear buf pointer for refresh */ rxbuf->m_head = NULL; /* ** Get the payload length, this ** could be zero if its a small ** packet. */ if (plen > 0) { mp = rxr->rx_buffers[i].m_pack; mp->m_len = plen; mh->m_next = mp; /* clear buf pointer */ rxbuf->m_pack = NULL; rxr->rx_split_packets++; } } else { /* ** Either no header split, or a ** secondary piece of a fragmented ** split packet. */ mh = rxr->rx_buffers[i].m_pack; mh->m_len = plen; /* clear buf info for refresh */ rxbuf->m_pack = NULL; } ++processed; /* So we know when to refresh */ /* Initial frame - setup */ if (rxr->fmp == NULL) { mh->m_pkthdr.len = mh->m_len; /* Save the head of the chain */ rxr->fmp = mh; rxr->lmp = mh; if (mp != NULL) { /* Add payload if split */ mh->m_pkthdr.len += mp->m_len; rxr->lmp = mh->m_next; } } else { /* Chain mbuf's together */ rxr->lmp->m_next = mh; rxr->lmp = rxr->lmp->m_next; rxr->fmp->m_pkthdr.len += mh->m_len; } if (eop) { rxr->fmp->m_pkthdr.rcvif = ifp; ifp->if_ipackets++; rxr->rx_packets++; /* capture data for AIM */ rxr->packets++; rxr->bytes += rxr->fmp->m_pkthdr.len; rxr->rx_bytes += rxr->fmp->m_pkthdr.len; if ((ifp->if_capenable & IFCAP_RXCSUM) != 0) igb_rx_checksum(staterr, rxr->fmp, ptype); if ((ifp->if_capenable & IFCAP_VLAN_HWTAGGING) != 0 && (staterr & E1000_RXD_STAT_VP) != 0) { rxr->fmp->m_pkthdr.ether_vtag = vtag; rxr->fmp->m_flags |= M_VLANTAG; } #ifndef IGB_LEGACY_TX rxr->fmp->m_pkthdr.flowid = que->msix; rxr->fmp->m_flags |= M_FLOWID; #endif sendmp = rxr->fmp; /* Make sure to set M_PKTHDR. 
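NB: the pkthdr fields filled in above (len, rcvif, csum_flags, ether_vtag, flowid) are only valid on an mbuf with M_PKTHDR set, so the flag is forced on the head of the chain before it is passed up.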
*/ sendmp->m_flags |= M_PKTHDR; rxr->fmp = NULL; rxr->lmp = NULL; } next_desc: bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); /* Advance our pointers to the next descriptor. */ if (++i == adapter->num_rx_desc) i = 0; /* ** Send to the stack or LRO */ if (sendmp != NULL) { rxr->next_to_check = i; igb_rx_input(rxr, ifp, sendmp, ptype); i = rxr->next_to_check; rxdone++; } /* Every 8 descriptors we go to refresh mbufs */ if (processed == 8) { igb_refresh_mbufs(rxr, i); processed = 0; } } /* Catch any remainders */ if (igb_rx_unrefreshed(rxr)) igb_refresh_mbufs(rxr, i); rxr->next_to_check = i; /* * Flush any outstanding LRO work */ while ((queued = SLIST_FIRST(&lro->lro_active)) != NULL) { SLIST_REMOVE_HEAD(&lro->lro_active, next); tcp_lro_flush(lro, queued); } if (done != NULL) *done += rxdone; IGB_RX_UNLOCK(rxr); return ((staterr & E1000_RXD_STAT_DD) ? TRUE : FALSE); } /********************************************************************* * * Verify that the hardware indicated that the checksum is valid. * Inform the stack about the status of checksum so that stack * doesn't spend time verifying the checksum. * *********************************************************************/ static void igb_rx_checksum(u32 staterr, struct mbuf *mp, u32 ptype) { u16 status = (u16)staterr; u8 errors = (u8) (staterr >> 24); int sctp; /* Ignore Checksum bit is set */ if (status & E1000_RXD_STAT_IXSM) { mp->m_pkthdr.csum_flags = 0; return; } if ((ptype & E1000_RXDADV_PKTTYPE_ETQF) == 0 && (ptype & E1000_RXDADV_PKTTYPE_SCTP) != 0) sctp = 1; else sctp = 0; if (status & E1000_RXD_STAT_IPCS) { /* Did it pass? */ if (!(errors & E1000_RXD_ERR_IPE)) { /* IP Checksum Good */ mp->m_pkthdr.csum_flags = CSUM_IP_CHECKED; mp->m_pkthdr.csum_flags |= CSUM_IP_VALID; } else mp->m_pkthdr.csum_flags = 0; } if (status & (E1000_RXD_STAT_TCPCS | E1000_RXD_STAT_UDPCS)) { u16 type = (CSUM_DATA_VALID | CSUM_PSEUDO_HDR); #if __FreeBSD_version >= 800000 if (sctp) /* reassign */ type = CSUM_SCTP_VALID; #endif /* Did it pass? 
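NB: when no TCP/UDP error is flagged, csum_data is set to 0xffff alongside CSUM_DATA_VALID|CSUM_PSEUDO_HDR, the FreeBSD convention for a fully verified pseudo-header checksum, so the stack skips software verification; the SCTP case uses CSUM_SCTP_VALID and leaves csum_data alone.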
*/ if (!(errors & E1000_RXD_ERR_TCPE)) { mp->m_pkthdr.csum_flags |= type; if (sctp == 0) mp->m_pkthdr.csum_data = htons(0xffff); } } return; } /* * This routine is run via a vlan * config EVENT */ static void igb_register_vlan(void *arg, struct ifnet *ifp, u16 vtag) { struct adapter *adapter = ifp->if_softc; u32 index, bit; if (ifp->if_softc != arg) /* Not our event */ return; if ((vtag == 0) || (vtag > 4095)) /* Invalid */ return; IGB_CORE_LOCK(adapter); index = (vtag >> 5) & 0x7F; bit = vtag & 0x1F; adapter->shadow_vfta[index] |= (1 << bit); ++adapter->num_vlans; /* Change hw filter setting */ if (ifp->if_capenable & IFCAP_VLAN_HWFILTER) igb_setup_vlan_hw_support(adapter); IGB_CORE_UNLOCK(adapter); } /* * This routine is run via a vlan * unconfig EVENT */ static void igb_unregister_vlan(void *arg, struct ifnet *ifp, u16 vtag) { struct adapter *adapter = ifp->if_softc; u32 index, bit; if (ifp->if_softc != arg) return; if ((vtag == 0) || (vtag > 4095)) /* Invalid */ return; IGB_CORE_LOCK(adapter); index = (vtag >> 5) & 0x7F; bit = vtag & 0x1F; adapter->shadow_vfta[index] &= ~(1 << bit); --adapter->num_vlans; /* Change hw filter setting */ if (ifp->if_capenable & IFCAP_VLAN_HWFILTER) igb_setup_vlan_hw_support(adapter); IGB_CORE_UNLOCK(adapter); } static void igb_setup_vlan_hw_support(struct adapter *adapter) { struct e1000_hw *hw = &adapter->hw; struct ifnet *ifp = adapter->ifp; u32 reg; if (adapter->vf_ifp) { e1000_rlpml_set_vf(hw, adapter->max_frame_size + VLAN_TAG_SIZE); return; } reg = E1000_READ_REG(hw, E1000_CTRL); reg |= E1000_CTRL_VME; E1000_WRITE_REG(hw, E1000_CTRL, reg); /* Enable the Filter Table */ if (ifp->if_capenable & IFCAP_VLAN_HWFILTER) { reg = E1000_READ_REG(hw, E1000_RCTL); reg &= ~E1000_RCTL_CFIEN; reg |= E1000_RCTL_VFE; E1000_WRITE_REG(hw, E1000_RCTL, reg); } /* Update the frame size */ E1000_WRITE_REG(&adapter->hw, E1000_RLPML, adapter->max_frame_size + VLAN_TAG_SIZE); /* Don't bother with table if no vlans */ if ((adapter->num_vlans == 0) || ((ifp->if_capenable & IFCAP_VLAN_HWFILTER) == 0)) return; /* ** A soft reset zeroes out the VFTA, so ** we need to repopulate it now. */ for (int i = 0; i < IGB_VFTA_SIZE; i++) if (adapter->shadow_vfta[i] != 0) { if (adapter->vf_ifp) e1000_vfta_set_vf(hw, adapter->shadow_vfta[i], TRUE); else e1000_write_vfta(hw, i, adapter->shadow_vfta[i]); } } static void igb_enable_intr(struct adapter *adapter) { /* With RSS set up what to auto clear */ if (adapter->msix_mem) { u32 mask = (adapter->que_mask | adapter->link_mask); E1000_WRITE_REG(&adapter->hw, E1000_EIAC, mask); E1000_WRITE_REG(&adapter->hw, E1000_EIAM, mask); E1000_WRITE_REG(&adapter->hw, E1000_EIMS, mask); E1000_WRITE_REG(&adapter->hw, E1000_IMS, E1000_IMS_LSC); } else { E1000_WRITE_REG(&adapter->hw, E1000_IMS, IMS_ENABLE_MASK); } E1000_WRITE_FLUSH(&adapter->hw); return; } static void igb_disable_intr(struct adapter *adapter) { if (adapter->msix_mem) { E1000_WRITE_REG(&adapter->hw, E1000_EIMC, ~0); E1000_WRITE_REG(&adapter->hw, E1000_EIAC, 0); } E1000_WRITE_REG(&adapter->hw, E1000_IMC, ~0); E1000_WRITE_FLUSH(&adapter->hw); return; } /* * Bit of a misnomer; what this really means is * to enable OS management of the system... 
aka * to disable special hardware management features */ static void igb_init_manageability(struct adapter *adapter) { if (adapter->has_manage) { int manc2h = E1000_READ_REG(&adapter->hw, E1000_MANC2H); int manc = E1000_READ_REG(&adapter->hw, E1000_MANC); /* disable hardware interception of ARP */ manc &= ~(E1000_MANC_ARP_EN); /* enable receiving management packets to the host */ manc |= E1000_MANC_EN_MNG2HOST; manc2h |= 1 << 5; /* Mng Port 623 */ manc2h |= 1 << 6; /* Mng Port 664 */ E1000_WRITE_REG(&adapter->hw, E1000_MANC2H, manc2h); E1000_WRITE_REG(&adapter->hw, E1000_MANC, manc); } } /* * Give control back to hardware management * controller if there is one. */ static void igb_release_manageability(struct adapter *adapter) { if (adapter->has_manage) { int manc = E1000_READ_REG(&adapter->hw, E1000_MANC); /* re-enable hardware interception of ARP */ manc |= E1000_MANC_ARP_EN; manc &= ~E1000_MANC_EN_MNG2HOST; E1000_WRITE_REG(&adapter->hw, E1000_MANC, manc); } } /* * igb_get_hw_control sets CTRL_EXT:DRV_LOAD bit. * For ASF and Pass Through versions of f/w this means that * the driver is loaded. * */ static void igb_get_hw_control(struct adapter *adapter) { u32 ctrl_ext; if (adapter->vf_ifp) return; /* Let firmware know the driver has taken over */ ctrl_ext = E1000_READ_REG(&adapter->hw, E1000_CTRL_EXT); E1000_WRITE_REG(&adapter->hw, E1000_CTRL_EXT, ctrl_ext | E1000_CTRL_EXT_DRV_LOAD); } /* * igb_release_hw_control resets CTRL_EXT:DRV_LOAD bit. * For ASF and Pass Through versions of f/w this means that the * driver is no longer loaded. * */ static void igb_release_hw_control(struct adapter *adapter) { u32 ctrl_ext; if (adapter->vf_ifp) return; /* Let firmware take over control of h/w */ ctrl_ext = E1000_READ_REG(&adapter->hw, E1000_CTRL_EXT); E1000_WRITE_REG(&adapter->hw, E1000_CTRL_EXT, ctrl_ext & ~E1000_CTRL_EXT_DRV_LOAD); } static int igb_is_valid_ether_addr(uint8_t *addr) { char zero_addr[6] = { 0, 0, 0, 0, 0, 0 }; if ((addr[0] & 1) || (!bcmp(addr, zero_addr, ETHER_ADDR_LEN))) { return (FALSE); } return (TRUE); } /* * Enable PCI Wake On LAN capability */ static void igb_enable_wakeup(device_t dev) { u16 cap, status; u8 id; /* First find the capabilities pointer */ cap = pci_read_config(dev, PCIR_CAP_PTR, 2); /* Read the PM Capabilities */ id = pci_read_config(dev, cap, 1); if (id != PCIY_PMG) /* Something wrong */ return; /* OK, we have the power capabilities, so now get the status register */ cap += PCIR_POWER_STATUS; status = pci_read_config(dev, cap, 2); status |= PCIM_PSTAT_PME | PCIM_PSTAT_PMEENABLE; pci_write_config(dev, cap, status, 2); return; } static void igb_led_func(void *arg, int onoff) { struct adapter *adapter = arg; IGB_CORE_LOCK(adapter); if (onoff) { e1000_setup_led(&adapter->hw); e1000_led_on(&adapter->hw); } else { e1000_led_off(&adapter->hw); e1000_cleanup_led(&adapter->hw); } IGB_CORE_UNLOCK(adapter); } /********************************************************************** * * Update the board statistics counters. * **********************************************************************/ static void igb_update_stats_counters(struct adapter *adapter) { struct ifnet *ifp; struct e1000_hw *hw = &adapter->hw; struct e1000_hw_stats *stats; /* ** The virtual function adapter has only a ** small controlled set of stats; do only ** those and return. 
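** NB: those counters are maintained by igb_update_vf_stats_counters() below, which folds the free-running VFGPRC/VFGORC/VFGPTC/VFGOTC/VFMPRC registers in through the UPDATE_VF_REG() delta macro against the last_* snapshots seeded by igb_vf_init_stats().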
*/ if (adapter->vf_ifp) { igb_update_vf_stats_counters(adapter); return; } stats = (struct e1000_hw_stats *)adapter->stats; if(adapter->hw.phy.media_type == e1000_media_type_copper || (E1000_READ_REG(hw, E1000_STATUS) & E1000_STATUS_LU)) { stats->symerrs += E1000_READ_REG(hw,E1000_SYMERRS); stats->sec += E1000_READ_REG(hw, E1000_SEC); } stats->crcerrs += E1000_READ_REG(hw, E1000_CRCERRS); stats->mpc += E1000_READ_REG(hw, E1000_MPC); stats->scc += E1000_READ_REG(hw, E1000_SCC); stats->ecol += E1000_READ_REG(hw, E1000_ECOL); stats->mcc += E1000_READ_REG(hw, E1000_MCC); stats->latecol += E1000_READ_REG(hw, E1000_LATECOL); stats->colc += E1000_READ_REG(hw, E1000_COLC); stats->dc += E1000_READ_REG(hw, E1000_DC); stats->rlec += E1000_READ_REG(hw, E1000_RLEC); stats->xonrxc += E1000_READ_REG(hw, E1000_XONRXC); stats->xontxc += E1000_READ_REG(hw, E1000_XONTXC); /* ** For watchdog management we need to know if we have been ** paused during the last interval, so capture that here. */ adapter->pause_frames = E1000_READ_REG(&adapter->hw, E1000_XOFFRXC); stats->xoffrxc += adapter->pause_frames; stats->xofftxc += E1000_READ_REG(hw, E1000_XOFFTXC); stats->fcruc += E1000_READ_REG(hw, E1000_FCRUC); stats->prc64 += E1000_READ_REG(hw, E1000_PRC64); stats->prc127 += E1000_READ_REG(hw, E1000_PRC127); stats->prc255 += E1000_READ_REG(hw, E1000_PRC255); stats->prc511 += E1000_READ_REG(hw, E1000_PRC511); stats->prc1023 += E1000_READ_REG(hw, E1000_PRC1023); stats->prc1522 += E1000_READ_REG(hw, E1000_PRC1522); stats->gprc += E1000_READ_REG(hw, E1000_GPRC); stats->bprc += E1000_READ_REG(hw, E1000_BPRC); stats->mprc += E1000_READ_REG(hw, E1000_MPRC); stats->gptc += E1000_READ_REG(hw, E1000_GPTC); /* For the 64-bit byte counters the low dword must be read first. */ /* Both registers clear on the read of the high dword */ stats->gorc += E1000_READ_REG(hw, E1000_GORCL) + ((u64)E1000_READ_REG(hw, E1000_GORCH) << 32); stats->gotc += E1000_READ_REG(hw, E1000_GOTCL) + ((u64)E1000_READ_REG(hw, E1000_GOTCH) << 32); stats->rnbc += E1000_READ_REG(hw, E1000_RNBC); stats->ruc += E1000_READ_REG(hw, E1000_RUC); stats->rfc += E1000_READ_REG(hw, E1000_RFC); stats->roc += E1000_READ_REG(hw, E1000_ROC); stats->rjc += E1000_READ_REG(hw, E1000_RJC); stats->tor += E1000_READ_REG(hw, E1000_TORH); stats->tot += E1000_READ_REG(hw, E1000_TOTH); stats->tpr += E1000_READ_REG(hw, E1000_TPR); stats->tpt += E1000_READ_REG(hw, E1000_TPT); stats->ptc64 += E1000_READ_REG(hw, E1000_PTC64); stats->ptc127 += E1000_READ_REG(hw, E1000_PTC127); stats->ptc255 += E1000_READ_REG(hw, E1000_PTC255); stats->ptc511 += E1000_READ_REG(hw, E1000_PTC511); stats->ptc1023 += E1000_READ_REG(hw, E1000_PTC1023); stats->ptc1522 += E1000_READ_REG(hw, E1000_PTC1522); stats->mptc += E1000_READ_REG(hw, E1000_MPTC); stats->bptc += E1000_READ_REG(hw, E1000_BPTC); /* Interrupt Counts */ stats->iac += E1000_READ_REG(hw, E1000_IAC); stats->icrxptc += E1000_READ_REG(hw, E1000_ICRXPTC); stats->icrxatc += E1000_READ_REG(hw, E1000_ICRXATC); stats->ictxptc += E1000_READ_REG(hw, E1000_ICTXPTC); stats->ictxatc += E1000_READ_REG(hw, E1000_ICTXATC); stats->ictxqec += E1000_READ_REG(hw, E1000_ICTXQEC); stats->ictxqmtc += E1000_READ_REG(hw, E1000_ICTXQMTC); stats->icrxdmtc += E1000_READ_REG(hw, E1000_ICRXDMTC); stats->icrxoc += E1000_READ_REG(hw, E1000_ICRXOC); /* Host to Card Statistics */ stats->cbtmpc += E1000_READ_REG(hw, E1000_CBTMPC); stats->htdpmc += E1000_READ_REG(hw, E1000_HTDPMC); stats->cbrdpc += E1000_READ_REG(hw, E1000_CBRDPC); stats->cbrmpc += E1000_READ_REG(hw, E1000_CBRMPC); 
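/* NB: as with GORC/GOTC above, the host-to-card octet counters below (HGORC/HGOTC) span a low/high register pair and the low dword must be read first. */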
stats->rpthc += E1000_READ_REG(hw, E1000_RPTHC); stats->hgptc += E1000_READ_REG(hw, E1000_HGPTC); stats->htcbdpc += E1000_READ_REG(hw, E1000_HTCBDPC); stats->hgorc += (E1000_READ_REG(hw, E1000_HGORCL) + ((u64)E1000_READ_REG(hw, E1000_HGORCH) << 32)); stats->hgotc += (E1000_READ_REG(hw, E1000_HGOTCL) + ((u64)E1000_READ_REG(hw, E1000_HGOTCH) << 32)); stats->lenerrs += E1000_READ_REG(hw, E1000_LENERRS); stats->scvpc += E1000_READ_REG(hw, E1000_SCVPC); stats->hrmpc += E1000_READ_REG(hw, E1000_HRMPC); stats->algnerrc += E1000_READ_REG(hw, E1000_ALGNERRC); stats->rxerrc += E1000_READ_REG(hw, E1000_RXERRC); stats->tncrs += E1000_READ_REG(hw, E1000_TNCRS); stats->cexterr += E1000_READ_REG(hw, E1000_CEXTERR); stats->tsctc += E1000_READ_REG(hw, E1000_TSCTC); stats->tsctfc += E1000_READ_REG(hw, E1000_TSCTFC); ifp = adapter->ifp; ifp->if_collisions = stats->colc; /* Rx Errors */ ifp->if_ierrors = adapter->dropped_pkts + stats->rxerrc + stats->crcerrs + stats->algnerrc + stats->ruc + stats->roc + stats->mpc + stats->cexterr; /* Tx Errors */ ifp->if_oerrors = stats->ecol + stats->latecol + adapter->watchdog_events; /* Driver specific counters */ adapter->device_control = E1000_READ_REG(hw, E1000_CTRL); adapter->rx_control = E1000_READ_REG(hw, E1000_RCTL); adapter->int_mask = E1000_READ_REG(hw, E1000_IMS); adapter->eint_mask = E1000_READ_REG(hw, E1000_EIMS); adapter->packet_buf_alloc_tx = ((E1000_READ_REG(hw, E1000_PBA) & 0xffff0000) >> 16); adapter->packet_buf_alloc_rx = (E1000_READ_REG(hw, E1000_PBA) & 0xffff); } /********************************************************************** * * Initialize the VF board statistics counters. * **********************************************************************/ static void igb_vf_init_stats(struct adapter *adapter) { struct e1000_hw *hw = &adapter->hw; struct e1000_vf_stats *stats; stats = (struct e1000_vf_stats *)adapter->stats; if (stats == NULL) return; stats->last_gprc = E1000_READ_REG(hw, E1000_VFGPRC); stats->last_gorc = E1000_READ_REG(hw, E1000_VFGORC); stats->last_gptc = E1000_READ_REG(hw, E1000_VFGPTC); stats->last_gotc = E1000_READ_REG(hw, E1000_VFGOTC); stats->last_mprc = E1000_READ_REG(hw, E1000_VFMPRC); } /********************************************************************** * * Update the VF board statistics counters. * **********************************************************************/ static void igb_update_vf_stats_counters(struct adapter *adapter) { struct e1000_hw *hw = &adapter->hw; struct e1000_vf_stats *stats; if (adapter->link_speed == 0) return; stats = (struct e1000_vf_stats *)adapter->stats; UPDATE_VF_REG(E1000_VFGPRC, stats->last_gprc, stats->gprc); UPDATE_VF_REG(E1000_VFGORC, stats->last_gorc, stats->gorc); UPDATE_VF_REG(E1000_VFGPTC, stats->last_gptc, stats->gptc); UPDATE_VF_REG(E1000_VFGOTC, stats->last_gotc, stats->gotc); UPDATE_VF_REG(E1000_VFMPRC, stats->last_mprc, stats->mprc); } /* Export a single 32-bit register via a read-only sysctl. 
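NB: oid_arg1 carries the adapter softc and oid_arg2 the register offset, so this one handler serves all of the per-queue txd_head/txd_tail/rxd_head/rxd_tail sysctls registered from igb_add_hw_stats().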
*/ static int igb_sysctl_reg_handler(SYSCTL_HANDLER_ARGS) { struct adapter *adapter; u_int val; adapter = oidp->oid_arg1; val = E1000_READ_REG(&adapter->hw, oidp->oid_arg2); return (sysctl_handle_int(oidp, &val, 0, req)); } /* ** Tuneable interrupt rate handler */ static int igb_sysctl_interrupt_rate_handler(SYSCTL_HANDLER_ARGS) { struct igb_queue *que = ((struct igb_queue *)oidp->oid_arg1); int error; u32 reg, usec, rate; reg = E1000_READ_REG(&que->adapter->hw, E1000_EITR(que->msix)); usec = ((reg & 0x7FFC) >> 2); if (usec > 0) rate = 1000000 / usec; else rate = 0; error = sysctl_handle_int(oidp, &rate, 0, req); if (error || !req->newptr) return error; return 0; } /* * Add sysctl variables, one per statistic, to the system. */ static void igb_add_hw_stats(struct adapter *adapter) { device_t dev = adapter->dev; struct tx_ring *txr = adapter->tx_rings; struct rx_ring *rxr = adapter->rx_rings; struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(dev); struct sysctl_oid *tree = device_get_sysctl_tree(dev); struct sysctl_oid_list *child = SYSCTL_CHILDREN(tree); struct e1000_hw_stats *stats = adapter->stats; struct sysctl_oid *stat_node, *queue_node, *int_node, *host_node; struct sysctl_oid_list *stat_list, *queue_list, *int_list, *host_list; #define QUEUE_NAME_LEN 32 char namebuf[QUEUE_NAME_LEN]; /* Driver Statistics */ SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "link_irq", CTLFLAG_RD, &adapter->link_irq, 0, "Link MSIX IRQ Handled"); SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "dropped", CTLFLAG_RD, &adapter->dropped_pkts, "Driver dropped packets"); SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "tx_dma_fail", CTLFLAG_RD, &adapter->no_tx_dma_setup, "Driver tx dma failure in xmit"); SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "rx_overruns", CTLFLAG_RD, &adapter->rx_overruns, "RX overruns"); SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "watchdog_timeouts", CTLFLAG_RD, &adapter->watchdog_events, "Watchdog timeouts"); SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "device_control", CTLFLAG_RD, &adapter->device_control, "Device Control Register"); SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "rx_control", CTLFLAG_RD, &adapter->rx_control, "Receiver Control Register"); SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "interrupt_mask", CTLFLAG_RD, &adapter->int_mask, "Interrupt Mask"); SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "extended_int_mask", CTLFLAG_RD, &adapter->eint_mask, "Extended Interrupt Mask"); SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "tx_buf_alloc", CTLFLAG_RD, &adapter->packet_buf_alloc_tx, "Transmit Buffer Packet Allocation"); SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "rx_buf_alloc", CTLFLAG_RD, &adapter->packet_buf_alloc_rx, "Receive Buffer Packet Allocation"); SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "fc_high_water", CTLFLAG_RD, &adapter->hw.fc.high_water, 0, "Flow Control High Watermark"); SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "fc_low_water", CTLFLAG_RD, &adapter->hw.fc.low_water, 0, "Flow Control Low Watermark"); for (int i = 0; i < adapter->num_queues; i++, rxr++, txr++) { struct lro_ctrl *lro = &rxr->lro; snprintf(namebuf, QUEUE_NAME_LEN, "queue%d", i); queue_node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, namebuf, CTLFLAG_RD, NULL, "Queue Name"); queue_list = SYSCTL_CHILDREN(queue_node); SYSCTL_ADD_PROC(ctx, queue_list, OID_AUTO, "interrupt_rate", CTLFLAG_RD, &adapter->queues[i], sizeof(&adapter->queues[i]), igb_sysctl_interrupt_rate_handler, "IU", "Interrupt Rate"); SYSCTL_ADD_PROC(ctx, queue_list, OID_AUTO, "txd_head", CTLFLAG_RD, adapter, E1000_TDH(txr->me), igb_sysctl_reg_handler, "IU", "Transmit Descriptor Head"); SYSCTL_ADD_PROC(ctx, 
queue_list, OID_AUTO, "txd_tail", CTLFLAG_RD, adapter, E1000_TDT(txr->me), igb_sysctl_reg_handler, "IU", "Transmit Descriptor Tail"); SYSCTL_ADD_QUAD(ctx, queue_list, OID_AUTO, "no_desc_avail", CTLFLAG_RD, &txr->no_desc_avail, "Queue No Descriptor Available"); SYSCTL_ADD_QUAD(ctx, queue_list, OID_AUTO, "tx_packets", CTLFLAG_RD, &txr->tx_packets, "Queue Packets Transmitted"); SYSCTL_ADD_PROC(ctx, queue_list, OID_AUTO, "rxd_head", CTLFLAG_RD, adapter, E1000_RDH(rxr->me), igb_sysctl_reg_handler, "IU", "Receive Descriptor Head"); SYSCTL_ADD_PROC(ctx, queue_list, OID_AUTO, "rxd_tail", CTLFLAG_RD, adapter, E1000_RDT(rxr->me), igb_sysctl_reg_handler, "IU", "Receive Descriptor Tail"); SYSCTL_ADD_QUAD(ctx, queue_list, OID_AUTO, "rx_packets", CTLFLAG_RD, &rxr->rx_packets, "Queue Packets Received"); SYSCTL_ADD_QUAD(ctx, queue_list, OID_AUTO, "rx_bytes", CTLFLAG_RD, &rxr->rx_bytes, "Queue Bytes Received"); SYSCTL_ADD_UINT(ctx, queue_list, OID_AUTO, "lro_queued", CTLFLAG_RD, &lro->lro_queued, 0, "LRO Queued"); SYSCTL_ADD_UINT(ctx, queue_list, OID_AUTO, "lro_flushed", CTLFLAG_RD, &lro->lro_flushed, 0, "LRO Flushed"); } /* MAC stats get their own sub node */ stat_node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "mac_stats", CTLFLAG_RD, NULL, "MAC Statistics"); stat_list = SYSCTL_CHILDREN(stat_node); /* ** VF adapter has a very limited set of stats ** since it's not managing the metal, so to speak. */ if (adapter->vf_ifp) { SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "good_pkts_recvd", CTLFLAG_RD, &stats->gprc, "Good Packets Received"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "good_pkts_txd", CTLFLAG_RD, &stats->gptc, "Good Packets Transmitted"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "good_octets_recvd", CTLFLAG_RD, &stats->gorc, "Good Octets Received"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "good_octets_txd", CTLFLAG_RD, &stats->gotc, "Good Octets Transmitted"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "mcast_pkts_recvd", CTLFLAG_RD, &stats->mprc, "Multicast Packets Received"); return; } SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "excess_coll", CTLFLAG_RD, &stats->ecol, "Excessive collisions"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "single_coll", CTLFLAG_RD, &stats->scc, "Single collisions"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "multiple_coll", CTLFLAG_RD, &stats->mcc, "Multiple collisions"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "late_coll", CTLFLAG_RD, &stats->latecol, "Late collisions"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "collision_count", CTLFLAG_RD, &stats->colc, "Collision Count"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "symbol_errors", CTLFLAG_RD, &stats->symerrs, "Symbol Errors"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "sequence_errors", CTLFLAG_RD, &stats->sec, "Sequence Errors"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "defer_count", CTLFLAG_RD, &stats->dc, "Defer Count"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "missed_packets", CTLFLAG_RD, &stats->mpc, "Missed Packets"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "recv_no_buff", CTLFLAG_RD, &stats->rnbc, "Receive No Buffers"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "recv_undersize", CTLFLAG_RD, &stats->ruc, "Receive Undersize"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "recv_fragmented", CTLFLAG_RD, &stats->rfc, "Fragmented Packets Received "); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "recv_oversize", CTLFLAG_RD, &stats->roc, "Oversized Packets Received"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "recv_jabber", CTLFLAG_RD, &stats->rjc, "Received Jabber"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, 
"recv_errs", CTLFLAG_RD, &stats->rxerrc, "Receive Errors"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "crc_errs", CTLFLAG_RD, &stats->crcerrs, "CRC errors"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "alignment_errs", CTLFLAG_RD, &stats->algnerrc, "Alignment Errors"); /* On 82575 these are collision counts */ SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "coll_ext_errs", CTLFLAG_RD, &stats->cexterr, "Collision/Carrier extension errors"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "xon_recvd", CTLFLAG_RD, &stats->xonrxc, "XON Received"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "xon_txd", CTLFLAG_RD, &stats->xontxc, "XON Transmitted"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "xoff_recvd", CTLFLAG_RD, &stats->xoffrxc, "XOFF Received"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "xoff_txd", CTLFLAG_RD, &stats->xofftxc, "XOFF Transmitted"); /* Packet Reception Stats */ SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "total_pkts_recvd", CTLFLAG_RD, &stats->tpr, "Total Packets Received "); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "good_pkts_recvd", CTLFLAG_RD, &stats->gprc, "Good Packets Received"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "bcast_pkts_recvd", CTLFLAG_RD, &stats->bprc, "Broadcast Packets Received"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "mcast_pkts_recvd", CTLFLAG_RD, &stats->mprc, "Multicast Packets Received"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "rx_frames_64", CTLFLAG_RD, &stats->prc64, "64 byte frames received "); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "rx_frames_65_127", CTLFLAG_RD, &stats->prc127, "65-127 byte frames received"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "rx_frames_128_255", CTLFLAG_RD, &stats->prc255, "128-255 byte frames received"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "rx_frames_256_511", CTLFLAG_RD, &stats->prc511, "256-511 byte frames received"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "rx_frames_512_1023", CTLFLAG_RD, &stats->prc1023, "512-1023 byte frames received"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "rx_frames_1024_1522", CTLFLAG_RD, &stats->prc1522, "1023-1522 byte frames received"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "good_octets_recvd", CTLFLAG_RD, &stats->gorc, "Good Octets Received"); /* Packet Transmission Stats */ SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "good_octets_txd", CTLFLAG_RD, &stats->gotc, "Good Octets Transmitted"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "total_pkts_txd", CTLFLAG_RD, &stats->tpt, "Total Packets Transmitted"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "good_pkts_txd", CTLFLAG_RD, &stats->gptc, "Good Packets Transmitted"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "bcast_pkts_txd", CTLFLAG_RD, &stats->bptc, "Broadcast Packets Transmitted"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "mcast_pkts_txd", CTLFLAG_RD, &stats->mptc, "Multicast Packets Transmitted"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "tx_frames_64", CTLFLAG_RD, &stats->ptc64, "64 byte frames transmitted "); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "tx_frames_65_127", CTLFLAG_RD, &stats->ptc127, "65-127 byte frames transmitted"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "tx_frames_128_255", CTLFLAG_RD, &stats->ptc255, "128-255 byte frames transmitted"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "tx_frames_256_511", CTLFLAG_RD, &stats->ptc511, "256-511 byte frames transmitted"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "tx_frames_512_1023", CTLFLAG_RD, &stats->ptc1023, "512-1023 byte frames transmitted"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "tx_frames_1024_1522", CTLFLAG_RD, &stats->ptc1522, "1024-1522 byte frames 
transmitted"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "tso_txd", CTLFLAG_RD, &stats->tsctc, "TSO Contexts Transmitted"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "tso_ctx_fail", CTLFLAG_RD, &stats->tsctfc, "TSO Contexts Failed"); /* Interrupt Stats */ int_node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "interrupts", CTLFLAG_RD, NULL, "Interrupt Statistics"); int_list = SYSCTL_CHILDREN(int_node); SYSCTL_ADD_QUAD(ctx, int_list, OID_AUTO, "asserts", CTLFLAG_RD, &stats->iac, "Interrupt Assertion Count"); SYSCTL_ADD_QUAD(ctx, int_list, OID_AUTO, "rx_pkt_timer", CTLFLAG_RD, &stats->icrxptc, "Interrupt Cause Rx Pkt Timer Expire Count"); SYSCTL_ADD_QUAD(ctx, int_list, OID_AUTO, "rx_abs_timer", CTLFLAG_RD, &stats->icrxatc, "Interrupt Cause Rx Abs Timer Expire Count"); SYSCTL_ADD_QUAD(ctx, int_list, OID_AUTO, "tx_pkt_timer", CTLFLAG_RD, &stats->ictxptc, "Interrupt Cause Tx Pkt Timer Expire Count"); SYSCTL_ADD_QUAD(ctx, int_list, OID_AUTO, "tx_abs_timer", CTLFLAG_RD, &stats->ictxatc, "Interrupt Cause Tx Abs Timer Expire Count"); SYSCTL_ADD_QUAD(ctx, int_list, OID_AUTO, "tx_queue_empty", CTLFLAG_RD, &stats->ictxqec, "Interrupt Cause Tx Queue Empty Count"); SYSCTL_ADD_QUAD(ctx, int_list, OID_AUTO, "tx_queue_min_thresh", CTLFLAG_RD, &stats->ictxqmtc, "Interrupt Cause Tx Queue Min Thresh Count"); SYSCTL_ADD_QUAD(ctx, int_list, OID_AUTO, "rx_desc_min_thresh", CTLFLAG_RD, &stats->icrxdmtc, "Interrupt Cause Rx Desc Min Thresh Count"); SYSCTL_ADD_QUAD(ctx, int_list, OID_AUTO, "rx_overrun", CTLFLAG_RD, &stats->icrxoc, "Interrupt Cause Receiver Overrun Count"); /* Host to Card Stats */ host_node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "host", CTLFLAG_RD, NULL, "Host to Card Statistics"); host_list = SYSCTL_CHILDREN(host_node); SYSCTL_ADD_QUAD(ctx, host_list, OID_AUTO, "breaker_tx_pkt", CTLFLAG_RD, &stats->cbtmpc, "Circuit Breaker Tx Packet Count"); SYSCTL_ADD_QUAD(ctx, host_list, OID_AUTO, "host_tx_pkt_discard", CTLFLAG_RD, &stats->htdpmc, "Host Transmit Discarded Packets"); SYSCTL_ADD_QUAD(ctx, host_list, OID_AUTO, "rx_pkt", CTLFLAG_RD, &stats->rpthc, "Rx Packets To Host"); SYSCTL_ADD_QUAD(ctx, host_list, OID_AUTO, "breaker_rx_pkts", CTLFLAG_RD, &stats->cbrmpc, "Circuit Breaker Rx Packet Count"); SYSCTL_ADD_QUAD(ctx, host_list, OID_AUTO, "breaker_rx_pkt_drop", CTLFLAG_RD, &stats->cbrdpc, "Circuit Breaker Rx Dropped Count"); SYSCTL_ADD_QUAD(ctx, host_list, OID_AUTO, "tx_good_pkt", CTLFLAG_RD, &stats->hgptc, "Host Good Packets Tx Count"); SYSCTL_ADD_QUAD(ctx, host_list, OID_AUTO, "breaker_tx_pkt_drop", CTLFLAG_RD, &stats->htcbdpc, "Host Tx Circuit Breaker Dropped Count"); SYSCTL_ADD_QUAD(ctx, host_list, OID_AUTO, "rx_good_bytes", CTLFLAG_RD, &stats->hgorc, "Host Good Octets Received Count"); SYSCTL_ADD_QUAD(ctx, host_list, OID_AUTO, "tx_good_bytes", CTLFLAG_RD, &stats->hgotc, "Host Good Octets Transmit Count"); SYSCTL_ADD_QUAD(ctx, host_list, OID_AUTO, "length_errors", CTLFLAG_RD, &stats->lenerrs, "Length Errors"); SYSCTL_ADD_QUAD(ctx, host_list, OID_AUTO, "serdes_violation_pkt", CTLFLAG_RD, &stats->scvpc, "SerDes/SGMII Code Violation Pkt Count"); SYSCTL_ADD_QUAD(ctx, host_list, OID_AUTO, "header_redir_missed", CTLFLAG_RD, &stats->hrmpc, "Header Redirection Missed Packet Count"); } /********************************************************************** * * This routine provides a way to dump out the adapter eeprom, * often a useful debug/service tool. This only dumps the first * 32 words, stuff that matters is in that extent. 
* **********************************************************************/ static int igb_sysctl_nvm_info(SYSCTL_HANDLER_ARGS) { struct adapter *adapter; int error; int result; result = -1; error = sysctl_handle_int(oidp, &result, 0, req); if (error || !req->newptr) return (error); /* * This value will cause a hex dump of the * first 32 16-bit words of the EEPROM to * the screen. */ if (result == 1) { adapter = (struct adapter *)arg1; igb_print_nvm_info(adapter); } return (error); } static void igb_print_nvm_info(struct adapter *adapter) { u16 eeprom_data; int i, j, row = 0; /* It's a bit crude, but it gets the job done */ printf("\nInterface EEPROM Dump:\n"); printf("Offset\n0x0000 "); for (i = 0, j = 0; i < 32; i++, j++) { if (j == 8) { /* Make the offset block */ j = 0; ++row; printf("\n0x00%x0 ",row); } e1000_read_nvm(&adapter->hw, i, 1, &eeprom_data); printf("%04x ", eeprom_data); } printf("\n"); } static void igb_set_sysctl_value(struct adapter *adapter, const char *name, const char *description, int *limit, int value) { *limit = value; SYSCTL_ADD_INT(device_get_sysctl_ctx(adapter->dev), SYSCTL_CHILDREN(device_get_sysctl_tree(adapter->dev)), OID_AUTO, name, CTLTYPE_INT|CTLFLAG_RW, limit, value, description); } /* ** Set flow control using sysctl: ** Flow control values: ** 0 - off ** 1 - rx pause ** 2 - tx pause ** 3 - full */ static int igb_set_flowcntl(SYSCTL_HANDLER_ARGS) { int error; static int input = 3; /* default is full */ struct adapter *adapter = (struct adapter *) arg1; error = sysctl_handle_int(oidp, &input, 0, req); if ((error) || (req->newptr == NULL)) return (error); switch (input) { case e1000_fc_rx_pause: case e1000_fc_tx_pause: case e1000_fc_full: case e1000_fc_none: adapter->hw.fc.requested_mode = input; adapter->fc = input; break; default: /* Do nothing */ return (error); } adapter->hw.fc.current_mode = adapter->hw.fc.requested_mode; e1000_force_mac_fc(&adapter->hw); return (error); } /* ** Manage DMA Coalesce: ** Control values: ** 0/1 - off/on ** Legal timer values are: ** 250,500,1000-10000 in thousands */ static int igb_sysctl_dmac(SYSCTL_HANDLER_ARGS) { struct adapter *adapter = (struct adapter *) arg1; int error; error = sysctl_handle_int(oidp, &adapter->dmac, 0, req); if ((error) || (req->newptr == NULL)) return (error); switch (adapter->dmac) { case 0: /* Disabling */ break; case 1: /* Just enable and use default */ adapter->dmac = 1000; break; case 250: case 500: case 1000: case 2000: case 3000: case 4000: case 5000: case 6000: case 7000: case 8000: case 9000: case 10000: /* Legal values - allow */ break; default: /* Do nothing, illegal value */ adapter->dmac = 0; return (error); } /* Reinit the interface */ igb_init(adapter); return (error); } /* ** Manage Energy Efficient Ethernet: ** Control values: ** 0/1 - enabled/disabled */ static int igb_sysctl_eee(SYSCTL_HANDLER_ARGS) { struct adapter *adapter = (struct adapter *) arg1; int error, value; value = adapter->hw.dev_spec._82575.eee_disable; error = sysctl_handle_int(oidp, &value, 0, req); if (error || req->newptr == NULL) return (error); IGB_CORE_LOCK(adapter); adapter->hw.dev_spec._82575.eee_disable = (value != 0); igb_init_locked(adapter); IGB_CORE_UNLOCK(adapter); return (0); } Index: stable/9/sys/dev/e1000/if_lem.c =================================================================== --- stable/9/sys/dev/e1000/if_lem.c (revision 262152) +++ stable/9/sys/dev/e1000/if_lem.c (revision 262153) @@ -1,4662 +1,4662 @@ /****************************************************************************** Copyright 
(c) 2001-2012, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ******************************************************************************/ /*$FreeBSD$*/ #include "opt_inet.h" #include "opt_inet6.h" #ifdef HAVE_KERNEL_OPTION_HEADERS #include "opt_device_polling.h" #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "e1000_api.h" #include "if_lem.h" /********************************************************************* * Legacy Em Driver version: *********************************************************************/ char lem_driver_version[] = "1.0.6"; /********************************************************************* * PCI Device ID Table * * Used by probe to select devices to load on * Last field stores an index into e1000_strings * Last entry must be all 0s * * { Vendor ID, Device ID, SubVendor ID, SubDevice ID, String Index } *********************************************************************/ static em_vendor_info_t lem_vendor_info_array[] = { /* Intel(R) PRO/1000 Network Connection */ { 0x8086, E1000_DEV_ID_82540EM, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_82540EM_LOM, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_82540EP, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_82540EP_LOM, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_82540EP_LP, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_82541EI, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_82541ER, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_82541ER_LOM, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_82541EI_MOBILE, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_82541GI, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_82541GI_LF, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_82541GI_MOBILE, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_82542, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, 
E1000_DEV_ID_82543GC_FIBER, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_82543GC_COPPER, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_82544EI_COPPER, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_82544EI_FIBER, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_82544GC_COPPER, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_82544GC_LOM, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_82545EM_COPPER, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_82545EM_FIBER, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_82545GM_COPPER, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_82545GM_FIBER, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_82545GM_SERDES, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_82546EB_COPPER, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_82546EB_FIBER, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_82546EB_QUAD_COPPER, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_82546GB_COPPER, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_82546GB_FIBER, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_82546GB_SERDES, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_82546GB_PCIE, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_82546GB_QUAD_COPPER, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_82546GB_QUAD_COPPER_KSP3, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_82547EI, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_82547EI_MOBILE, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_82547GI, PCI_ANY_ID, PCI_ANY_ID, 0}, /* required last entry */ { 0, 0, 0, 0, 0} }; /********************************************************************* * Table of branding strings for all supported NICs. *********************************************************************/ static char *lem_strings[] = { "Intel(R) PRO/1000 Legacy Network Connection" }; /********************************************************************* * Function prototypes *********************************************************************/ static int lem_probe(device_t); static int lem_attach(device_t); static int lem_detach(device_t); static int lem_shutdown(device_t); static int lem_suspend(device_t); static int lem_resume(device_t); static void lem_start(struct ifnet *); static void lem_start_locked(struct ifnet *ifp); static int lem_ioctl(struct ifnet *, u_long, caddr_t); static void lem_init(void *); static void lem_init_locked(struct adapter *); static void lem_stop(void *); static void lem_media_status(struct ifnet *, struct ifmediareq *); static int lem_media_change(struct ifnet *); static void lem_identify_hardware(struct adapter *); static int lem_allocate_pci_resources(struct adapter *); static int lem_allocate_irq(struct adapter *adapter); static void lem_free_pci_resources(struct adapter *); static void lem_local_timer(void *); static int lem_hardware_init(struct adapter *); static int lem_setup_interface(device_t, struct adapter *); static void lem_setup_transmit_structures(struct adapter *); static void lem_initialize_transmit_unit(struct adapter *); static int lem_setup_receive_structures(struct adapter *); static void lem_initialize_receive_unit(struct adapter *); static void lem_enable_intr(struct adapter *); static void lem_disable_intr(struct adapter *); static void lem_free_transmit_structures(struct adapter *); static void lem_free_receive_structures(struct adapter *); static void lem_update_stats_counters(struct adapter *); static void lem_add_hw_stats(struct adapter *adapter); static void lem_txeof(struct adapter *); static void 
lem_tx_purge(struct adapter *); static int lem_allocate_receive_structures(struct adapter *); static int lem_allocate_transmit_structures(struct adapter *); static bool lem_rxeof(struct adapter *, int, int *); #ifndef __NO_STRICT_ALIGNMENT static int lem_fixup_rx(struct adapter *); #endif static void lem_receive_checksum(struct adapter *, struct e1000_rx_desc *, struct mbuf *); static void lem_transmit_checksum_setup(struct adapter *, struct mbuf *, u32 *, u32 *); static void lem_set_promisc(struct adapter *); static void lem_disable_promisc(struct adapter *); static void lem_set_multi(struct adapter *); static void lem_update_link_status(struct adapter *); static int lem_get_buf(struct adapter *, int); static void lem_register_vlan(void *, struct ifnet *, u16); static void lem_unregister_vlan(void *, struct ifnet *, u16); static void lem_setup_vlan_hw_support(struct adapter *); static int lem_xmit(struct adapter *, struct mbuf **); static void lem_smartspeed(struct adapter *); static int lem_82547_fifo_workaround(struct adapter *, int); static void lem_82547_update_fifo_head(struct adapter *, int); static int lem_82547_tx_fifo_reset(struct adapter *); static void lem_82547_move_tail(void *); static int lem_dma_malloc(struct adapter *, bus_size_t, struct em_dma_alloc *, int); static void lem_dma_free(struct adapter *, struct em_dma_alloc *); static int lem_sysctl_nvm_info(SYSCTL_HANDLER_ARGS); static void lem_print_nvm_info(struct adapter *); static int lem_is_valid_ether_addr(u8 *); static u32 lem_fill_descriptors (bus_addr_t address, u32 length, PDESC_ARRAY desc_array); static int lem_sysctl_int_delay(SYSCTL_HANDLER_ARGS); static void lem_add_int_delay_sysctl(struct adapter *, const char *, const char *, struct em_int_delay_info *, int, int); static void lem_set_flow_cntrl(struct adapter *, const char *, const char *, int *, int); /* Management and WOL Support */ static void lem_init_manageability(struct adapter *); static void lem_release_manageability(struct adapter *); static void lem_get_hw_control(struct adapter *); static void lem_release_hw_control(struct adapter *); static void lem_get_wakeup(device_t); static void lem_enable_wakeup(device_t); static int lem_enable_phy_wakeup(struct adapter *); static void lem_led_func(void *, int); static void lem_intr(void *); static int lem_irq_fast(void *); static void lem_handle_rxtx(void *context, int pending); static void lem_handle_link(void *context, int pending); static void lem_add_rx_process_limit(struct adapter *, const char *, const char *, int *, int); #ifdef DEVICE_POLLING static poll_handler_t lem_poll; #endif /* POLLING */ /********************************************************************* * FreeBSD Device Interface Entry Points *********************************************************************/ static device_method_t lem_methods[] = { /* Device interface */ DEVMETHOD(device_probe, lem_probe), DEVMETHOD(device_attach, lem_attach), DEVMETHOD(device_detach, lem_detach), DEVMETHOD(device_shutdown, lem_shutdown), DEVMETHOD(device_suspend, lem_suspend), DEVMETHOD(device_resume, lem_resume), DEVMETHOD_END }; static driver_t lem_driver = { "em", lem_methods, sizeof(struct adapter), }; extern devclass_t em_devclass; DRIVER_MODULE(lem, pci, lem_driver, em_devclass, 0, 0); MODULE_DEPEND(lem, pci, 1, 1, 1); MODULE_DEPEND(lem, ether, 1, 1, 1); /********************************************************************* * Tunable default values. 
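* NB, worked example: the EM_TICKS_TO_USECS()/EM_USECS_TO_TICKS() macros below convert, with rounding, between microseconds and the hardware's 1.024 us delay-timer ticks, and DEFAULT_ITR = 1000000000 / (MAX_INTS_PER_SEC * 256) = 1000000000 / (8000 * 256) = 488, i.e. the 1/8000 s interrupt interval expressed in the ITR register's units (256 ns, assuming the usual e1000 ITR granularity).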
*********************************************************************/ #define EM_TICKS_TO_USECS(ticks) ((1024 * (ticks) + 500) / 1000) #define EM_USECS_TO_TICKS(usecs) ((1000 * (usecs) + 512) / 1024) #define MAX_INTS_PER_SEC 8000 #define DEFAULT_ITR (1000000000/(MAX_INTS_PER_SEC * 256)) static int lem_tx_int_delay_dflt = EM_TICKS_TO_USECS(EM_TIDV); static int lem_rx_int_delay_dflt = EM_TICKS_TO_USECS(EM_RDTR); static int lem_tx_abs_int_delay_dflt = EM_TICKS_TO_USECS(EM_TADV); static int lem_rx_abs_int_delay_dflt = EM_TICKS_TO_USECS(EM_RADV); static int lem_rxd = EM_DEFAULT_RXD; static int lem_txd = EM_DEFAULT_TXD; static int lem_smart_pwr_down = FALSE; /* Controls whether promiscuous also shows bad packets */ static int lem_debug_sbp = FALSE; TUNABLE_INT("hw.em.tx_int_delay", &lem_tx_int_delay_dflt); TUNABLE_INT("hw.em.rx_int_delay", &lem_rx_int_delay_dflt); TUNABLE_INT("hw.em.tx_abs_int_delay", &lem_tx_abs_int_delay_dflt); TUNABLE_INT("hw.em.rx_abs_int_delay", &lem_rx_abs_int_delay_dflt); TUNABLE_INT("hw.em.rxd", &lem_rxd); TUNABLE_INT("hw.em.txd", &lem_txd); TUNABLE_INT("hw.em.smart_pwr_down", &lem_smart_pwr_down); TUNABLE_INT("hw.em.sbp", &lem_debug_sbp); /* Interrupt style - default to fast */ static int lem_use_legacy_irq = 0; TUNABLE_INT("hw.em.use_legacy_irq", &lem_use_legacy_irq); /* How many packets rxeof tries to clean at a time */ static int lem_rx_process_limit = 100; TUNABLE_INT("hw.em.rx_process_limit", &lem_rx_process_limit); /* Flow control setting - default to FULL */ static int lem_fc_setting = e1000_fc_full; TUNABLE_INT("hw.em.fc_setting", &lem_fc_setting); /* Global used in WOL setup with multiport cards */ static int global_quad_port_a = 0; #ifdef DEV_NETMAP /* see ixgbe.c for details */ #include #endif /* DEV_NETMAP */ /********************************************************************* * Device identification routine * * em_probe determines if the driver should be loaded on * adapter based on PCI vendor/device id of the adapter. * * return BUS_PROBE_DEFAULT on success, positive on failure *********************************************************************/ static int lem_probe(device_t dev) { char adapter_name[60]; u16 pci_vendor_id = 0; u16 pci_device_id = 0; u16 pci_subvendor_id = 0; u16 pci_subdevice_id = 0; em_vendor_info_t *ent; INIT_DEBUGOUT("em_probe: begin"); pci_vendor_id = pci_get_vendor(dev); if (pci_vendor_id != EM_VENDOR_ID) return (ENXIO); pci_device_id = pci_get_device(dev); pci_subvendor_id = pci_get_subvendor(dev); pci_subdevice_id = pci_get_subdevice(dev); ent = lem_vendor_info_array; while (ent->vendor_id != 0) { if ((pci_vendor_id == ent->vendor_id) && (pci_device_id == ent->device_id) && ((pci_subvendor_id == ent->subvendor_id) || (ent->subvendor_id == PCI_ANY_ID)) && ((pci_subdevice_id == ent->subdevice_id) || (ent->subdevice_id == PCI_ANY_ID))) { sprintf(adapter_name, "%s %s", lem_strings[ent->index], lem_driver_version); device_set_desc_copy(dev, adapter_name); return (BUS_PROBE_DEFAULT); } ent++; } return (ENXIO); } /********************************************************************* * Device initialization routine * * The attach entry point is called when the driver is being loaded. * This routine identifies the type of hardware, allocates all resources * and initializes the hardware. 
* * return 0 on success, positive on failure *********************************************************************/ static int lem_attach(device_t dev) { struct adapter *adapter; int tsize, rsize; int error = 0; INIT_DEBUGOUT("lem_attach: begin"); adapter = device_get_softc(dev); adapter->dev = adapter->osdep.dev = dev; EM_CORE_LOCK_INIT(adapter, device_get_nameunit(dev)); EM_TX_LOCK_INIT(adapter, device_get_nameunit(dev)); EM_RX_LOCK_INIT(adapter, device_get_nameunit(dev)); /* SYSCTL stuff */ SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "nvm", CTLTYPE_INT|CTLFLAG_RW, adapter, 0, lem_sysctl_nvm_info, "I", "NVM Information"); callout_init_mtx(&adapter->timer, &adapter->core_mtx, 0); callout_init_mtx(&adapter->tx_fifo_timer, &adapter->tx_mtx, 0); /* Determine hardware and mac info */ lem_identify_hardware(adapter); /* Setup PCI resources */ if (lem_allocate_pci_resources(adapter)) { device_printf(dev, "Allocation of PCI resources failed\n"); error = ENXIO; goto err_pci; } /* Do Shared Code initialization */ if (e1000_setup_init_funcs(&adapter->hw, TRUE)) { device_printf(dev, "Setup of Shared code failed\n"); error = ENXIO; goto err_pci; } e1000_get_bus_info(&adapter->hw); /* Set up some sysctls for the tunable interrupt delays */ lem_add_int_delay_sysctl(adapter, "rx_int_delay", "receive interrupt delay in usecs", &adapter->rx_int_delay, E1000_REGISTER(&adapter->hw, E1000_RDTR), lem_rx_int_delay_dflt); lem_add_int_delay_sysctl(adapter, "tx_int_delay", "transmit interrupt delay in usecs", &adapter->tx_int_delay, E1000_REGISTER(&adapter->hw, E1000_TIDV), lem_tx_int_delay_dflt); if (adapter->hw.mac.type >= e1000_82540) { lem_add_int_delay_sysctl(adapter, "rx_abs_int_delay", "receive interrupt delay limit in usecs", &adapter->rx_abs_int_delay, E1000_REGISTER(&adapter->hw, E1000_RADV), lem_rx_abs_int_delay_dflt); lem_add_int_delay_sysctl(adapter, "tx_abs_int_delay", "transmit interrupt delay limit in usecs", &adapter->tx_abs_int_delay, E1000_REGISTER(&adapter->hw, E1000_TADV), lem_tx_abs_int_delay_dflt); lem_add_int_delay_sysctl(adapter, "itr", "interrupt delay limit in usecs/4", &adapter->tx_itr, E1000_REGISTER(&adapter->hw, E1000_ITR), DEFAULT_ITR); } /* Sysctls for limiting the amount of work done in the taskqueue */ lem_add_rx_process_limit(adapter, "rx_processing_limit", "max number of rx packets to process", &adapter->rx_process_limit, lem_rx_process_limit); /* Sysctl for setting the interface flow control */ lem_set_flow_cntrl(adapter, "flow_control", "flow control setting", &adapter->fc_setting, lem_fc_setting); /* * Validate number of transmit and receive descriptors. It * must not exceed hardware maximum, and must be multiple * of E1000_DBA_ALIGN. 
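	 * An out-of-range tunable is not fatal here: the code below
	 * logs a warning and falls back to the EM_DEFAULT_TXD /
	 * EM_DEFAULT_RXD defaults instead of failing the attach.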
	 */
	if (((lem_txd * sizeof(struct e1000_tx_desc)) % EM_DBA_ALIGN) != 0 ||
	    (adapter->hw.mac.type >= e1000_82544 && lem_txd > EM_MAX_TXD) ||
	    (adapter->hw.mac.type < e1000_82544 && lem_txd > EM_MAX_TXD_82543) ||
	    (lem_txd < EM_MIN_TXD)) {
		device_printf(dev, "Using %d TX descriptors instead of %d!\n",
		    EM_DEFAULT_TXD, lem_txd);
		adapter->num_tx_desc = EM_DEFAULT_TXD;
	} else
		adapter->num_tx_desc = lem_txd;
	if (((lem_rxd * sizeof(struct e1000_rx_desc)) % EM_DBA_ALIGN) != 0 ||
	    (adapter->hw.mac.type >= e1000_82544 && lem_rxd > EM_MAX_RXD) ||
	    (adapter->hw.mac.type < e1000_82544 && lem_rxd > EM_MAX_RXD_82543) ||
	    (lem_rxd < EM_MIN_RXD)) {
		device_printf(dev, "Using %d RX descriptors instead of %d!\n",
		    EM_DEFAULT_RXD, lem_rxd);
		adapter->num_rx_desc = EM_DEFAULT_RXD;
	} else
		adapter->num_rx_desc = lem_rxd;

	adapter->hw.mac.autoneg = DO_AUTO_NEG;
	adapter->hw.phy.autoneg_wait_to_complete = FALSE;
	adapter->hw.phy.autoneg_advertised = AUTONEG_ADV_DEFAULT;
	adapter->rx_buffer_len = 2048;

	e1000_init_script_state_82541(&adapter->hw, TRUE);
	e1000_set_tbi_compatibility_82543(&adapter->hw, TRUE);

	/* Copper options */
	if (adapter->hw.phy.media_type == e1000_media_type_copper) {
		adapter->hw.phy.mdix = AUTO_ALL_MODES;
		adapter->hw.phy.disable_polarity_correction = FALSE;
		adapter->hw.phy.ms_type = EM_MASTER_SLAVE;
	}

	/*
	 * Set the frame limits assuming
	 * standard ethernet sized frames.
	 */
	adapter->max_frame_size = ETHERMTU + ETHER_HDR_LEN + ETHERNET_FCS_SIZE;
	adapter->min_frame_size = ETH_ZLEN + ETHERNET_FCS_SIZE;

	/*
	 * This controls when hardware reports transmit completion
	 * status.
	 */
	adapter->hw.mac.report_tx_early = 1;

	tsize = roundup2(adapter->num_tx_desc * sizeof(struct e1000_tx_desc),
	    EM_DBA_ALIGN);

	/* Allocate Transmit Descriptor ring */
	if (lem_dma_malloc(adapter, tsize, &adapter->txdma, BUS_DMA_NOWAIT)) {
		device_printf(dev, "Unable to allocate tx_desc memory\n");
		error = ENOMEM;
		goto err_tx_desc;
	}
	adapter->tx_desc_base =
	    (struct e1000_tx_desc *)adapter->txdma.dma_vaddr;

	rsize = roundup2(adapter->num_rx_desc * sizeof(struct e1000_rx_desc),
	    EM_DBA_ALIGN);

	/* Allocate Receive Descriptor ring */
	if (lem_dma_malloc(adapter, rsize, &adapter->rxdma, BUS_DMA_NOWAIT)) {
		device_printf(dev, "Unable to allocate rx_desc memory\n");
		error = ENOMEM;
		goto err_rx_desc;
	}
	adapter->rx_desc_base =
	    (struct e1000_rx_desc *)adapter->rxdma.dma_vaddr;

	/* Allocate multicast array memory. */
	adapter->mta = malloc(sizeof(u8) * ETH_ADDR_LEN *
	    MAX_NUM_MULTICAST_ADDRESSES, M_DEVBUF, M_NOWAIT);
	if (adapter->mta == NULL) {
		device_printf(dev, "Cannot allocate multicast setup array\n");
		error = ENOMEM;
		goto err_hw_init;
	}

	/*
	** Start from a known state, this is
	** important in reading the nvm and
	** mac from that.
	*/
	e1000_reset_hw(&adapter->hw);

	/* Make sure we have a good EEPROM before we read from it */
	if (e1000_validate_nvm_checksum(&adapter->hw) < 0) {
		/*
		** Some PCI-E parts fail the first check due to
		** the link being in sleep state, call it again,
		** if it fails a second time it's a real issue.
*/ if (e1000_validate_nvm_checksum(&adapter->hw) < 0) { device_printf(dev, "The EEPROM Checksum Is Not Valid\n"); error = EIO; goto err_hw_init; } } /* Copy the permanent MAC address out of the EEPROM */ if (e1000_read_mac_addr(&adapter->hw) < 0) { device_printf(dev, "EEPROM read error while reading MAC" " address\n"); error = EIO; goto err_hw_init; } if (!lem_is_valid_ether_addr(adapter->hw.mac.addr)) { device_printf(dev, "Invalid MAC address\n"); error = EIO; goto err_hw_init; } /* Initialize the hardware */ if (lem_hardware_init(adapter)) { device_printf(dev, "Unable to initialize the hardware\n"); error = EIO; goto err_hw_init; } /* Allocate transmit descriptors and buffers */ if (lem_allocate_transmit_structures(adapter)) { device_printf(dev, "Could not setup transmit structures\n"); error = ENOMEM; goto err_tx_struct; } /* Allocate receive descriptors and buffers */ if (lem_allocate_receive_structures(adapter)) { device_printf(dev, "Could not setup receive structures\n"); error = ENOMEM; goto err_rx_struct; } /* ** Do interrupt configuration */ error = lem_allocate_irq(adapter); if (error) goto err_rx_struct; /* * Get Wake-on-Lan and Management info for later use */ lem_get_wakeup(dev); /* Setup OS specific network interface */ if (lem_setup_interface(dev, adapter) != 0) goto err_rx_struct; /* Initialize statistics */ lem_update_stats_counters(adapter); adapter->hw.mac.get_link_status = 1; lem_update_link_status(adapter); /* Indicate SOL/IDER usage */ if (e1000_check_reset_block(&adapter->hw)) device_printf(dev, "PHY reset is blocked due to SOL/IDER session.\n"); /* Do we need workaround for 82544 PCI-X adapter? */ if (adapter->hw.bus.type == e1000_bus_type_pcix && adapter->hw.mac.type == e1000_82544) adapter->pcix_82544 = TRUE; else adapter->pcix_82544 = FALSE; /* Register for VLAN events */ adapter->vlan_attach = EVENTHANDLER_REGISTER(vlan_config, lem_register_vlan, adapter, EVENTHANDLER_PRI_FIRST); adapter->vlan_detach = EVENTHANDLER_REGISTER(vlan_unconfig, lem_unregister_vlan, adapter, EVENTHANDLER_PRI_FIRST); lem_add_hw_stats(adapter); /* Non-AMT based hardware can now take control from firmware */ if (adapter->has_manage && !adapter->has_amt) lem_get_hw_control(adapter); /* Tell the stack that the interface is not active */ adapter->ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE); adapter->led_dev = led_create(lem_led_func, adapter, device_get_nameunit(dev)); #ifdef DEV_NETMAP lem_netmap_attach(adapter); #endif /* DEV_NETMAP */ INIT_DEBUGOUT("lem_attach: end"); return (0); err_rx_struct: lem_free_transmit_structures(adapter); err_tx_struct: err_hw_init: lem_release_hw_control(adapter); lem_dma_free(adapter, &adapter->rxdma); err_rx_desc: lem_dma_free(adapter, &adapter->txdma); err_tx_desc: err_pci: if (adapter->ifp != NULL) if_free(adapter->ifp); lem_free_pci_resources(adapter); free(adapter->mta, M_DEVBUF); EM_TX_LOCK_DESTROY(adapter); EM_RX_LOCK_DESTROY(adapter); EM_CORE_LOCK_DESTROY(adapter); return (error); } /********************************************************************* * Device removal routine * * The detach entry point is called when the driver is being removed. * This routine stops the adapter and deallocates all the resources * that were allocated for driver operation. 
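 *  (When built with DEV_NETMAP, netmap_detach() is called below
 *  before the descriptor rings and DMA buffers are freed.)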
* * return 0 on success, positive on failure *********************************************************************/ static int lem_detach(device_t dev) { struct adapter *adapter = device_get_softc(dev); struct ifnet *ifp = adapter->ifp; INIT_DEBUGOUT("em_detach: begin"); /* Make sure VLANS are not using driver */ if (adapter->ifp->if_vlantrunk != NULL) { device_printf(dev,"Vlan in use, detach first\n"); return (EBUSY); } #ifdef DEVICE_POLLING if (ifp->if_capenable & IFCAP_POLLING) ether_poll_deregister(ifp); #endif if (adapter->led_dev != NULL) led_destroy(adapter->led_dev); EM_CORE_LOCK(adapter); EM_TX_LOCK(adapter); adapter->in_detach = 1; lem_stop(adapter); e1000_phy_hw_reset(&adapter->hw); lem_release_manageability(adapter); EM_TX_UNLOCK(adapter); EM_CORE_UNLOCK(adapter); /* Unregister VLAN events */ if (adapter->vlan_attach != NULL) EVENTHANDLER_DEREGISTER(vlan_config, adapter->vlan_attach); if (adapter->vlan_detach != NULL) EVENTHANDLER_DEREGISTER(vlan_unconfig, adapter->vlan_detach); ether_ifdetach(adapter->ifp); callout_drain(&adapter->timer); callout_drain(&adapter->tx_fifo_timer); #ifdef DEV_NETMAP netmap_detach(ifp); #endif /* DEV_NETMAP */ lem_free_pci_resources(adapter); bus_generic_detach(dev); if_free(ifp); lem_free_transmit_structures(adapter); lem_free_receive_structures(adapter); /* Free Transmit Descriptor ring */ if (adapter->tx_desc_base) { lem_dma_free(adapter, &adapter->txdma); adapter->tx_desc_base = NULL; } /* Free Receive Descriptor ring */ if (adapter->rx_desc_base) { lem_dma_free(adapter, &adapter->rxdma); adapter->rx_desc_base = NULL; } lem_release_hw_control(adapter); free(adapter->mta, M_DEVBUF); EM_TX_LOCK_DESTROY(adapter); EM_RX_LOCK_DESTROY(adapter); EM_CORE_LOCK_DESTROY(adapter); return (0); } /********************************************************************* * * Shutdown entry point * **********************************************************************/ static int lem_shutdown(device_t dev) { return lem_suspend(dev); } /* * Suspend/resume device methods. */ static int lem_suspend(device_t dev) { struct adapter *adapter = device_get_softc(dev); EM_CORE_LOCK(adapter); lem_release_manageability(adapter); lem_release_hw_control(adapter); lem_enable_wakeup(dev); EM_CORE_UNLOCK(adapter); return bus_generic_suspend(dev); } static int lem_resume(device_t dev) { struct adapter *adapter = device_get_softc(dev); struct ifnet *ifp = adapter->ifp; EM_CORE_LOCK(adapter); lem_init_locked(adapter); lem_init_manageability(adapter); EM_CORE_UNLOCK(adapter); lem_start(ifp); return bus_generic_resume(dev); } static void lem_start_locked(struct ifnet *ifp) { struct adapter *adapter = ifp->if_softc; struct mbuf *m_head; EM_TX_LOCK_ASSERT(adapter); if ((ifp->if_drv_flags & (IFF_DRV_RUNNING|IFF_DRV_OACTIVE)) != IFF_DRV_RUNNING) return; if (!adapter->link_active) return; /* * Force a cleanup if number of TX descriptors * available hits the threshold */ if (adapter->num_tx_desc_avail <= EM_TX_CLEANUP_THRESHOLD) { lem_txeof(adapter); /* Now do we at least have a minimal? */ if (adapter->num_tx_desc_avail <= EM_TX_OP_THRESHOLD) { adapter->no_tx_desc_avail1++; return; } } while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) { IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head); if (m_head == NULL) break; /* * Encapsulation can modify our pointer, and or make it * NULL on failure. In that event, we can't requeue. 
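		 * (A NULL m_head here means lem_xmit() already freed
		 * the chain; otherwise the mbuf is prepended back onto
		 * the send queue, OACTIVE is set, and transmission
		 * stops for now.)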
*/ if (lem_xmit(adapter, &m_head)) { if (m_head == NULL) break; ifp->if_drv_flags |= IFF_DRV_OACTIVE; IFQ_DRV_PREPEND(&ifp->if_snd, m_head); break; } /* Send a copy of the frame to the BPF listener */ ETHER_BPF_MTAP(ifp, m_head); /* Set timeout in case hardware has problems transmitting. */ adapter->watchdog_check = TRUE; adapter->watchdog_time = ticks; } if (adapter->num_tx_desc_avail <= EM_TX_OP_THRESHOLD) ifp->if_drv_flags |= IFF_DRV_OACTIVE; return; } static void lem_start(struct ifnet *ifp) { struct adapter *adapter = ifp->if_softc; EM_TX_LOCK(adapter); if (ifp->if_drv_flags & IFF_DRV_RUNNING) lem_start_locked(ifp); EM_TX_UNLOCK(adapter); } /********************************************************************* * Ioctl entry point * * em_ioctl is called when the user wants to configure the * interface. * * return 0 on success, positive on failure **********************************************************************/ static int lem_ioctl(struct ifnet *ifp, u_long command, caddr_t data) { struct adapter *adapter = ifp->if_softc; struct ifreq *ifr = (struct ifreq *)data; #if defined(INET) || defined(INET6) struct ifaddr *ifa = (struct ifaddr *)data; #endif bool avoid_reset = FALSE; int error = 0; if (adapter->in_detach) return (error); switch (command) { case SIOCSIFADDR: #ifdef INET if (ifa->ifa_addr->sa_family == AF_INET) avoid_reset = TRUE; #endif #ifdef INET6 if (ifa->ifa_addr->sa_family == AF_INET6) avoid_reset = TRUE; #endif /* ** Calling init results in link renegotiation, ** so we avoid doing it when possible. */ if (avoid_reset) { ifp->if_flags |= IFF_UP; if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) lem_init(adapter); #ifdef INET if (!(ifp->if_flags & IFF_NOARP)) arp_ifinit(ifp, ifa); #endif } else error = ether_ioctl(ifp, command, data); break; case SIOCSIFMTU: { int max_frame_size; IOCTL_DEBUGOUT("ioctl rcv'd: SIOCSIFMTU (Set Interface MTU)"); EM_CORE_LOCK(adapter); switch (adapter->hw.mac.type) { case e1000_82542: max_frame_size = ETHER_MAX_LEN; break; default: max_frame_size = MAX_JUMBO_FRAME_SIZE; } if (ifr->ifr_mtu > max_frame_size - ETHER_HDR_LEN - ETHER_CRC_LEN) { EM_CORE_UNLOCK(adapter); error = EINVAL; break; } ifp->if_mtu = ifr->ifr_mtu; adapter->max_frame_size = ifp->if_mtu + ETHER_HDR_LEN + ETHER_CRC_LEN; lem_init_locked(adapter); EM_CORE_UNLOCK(adapter); break; } case SIOCSIFFLAGS: IOCTL_DEBUGOUT("ioctl rcv'd:\ SIOCSIFFLAGS (Set Interface Flags)"); EM_CORE_LOCK(adapter); if (ifp->if_flags & IFF_UP) { if ((ifp->if_drv_flags & IFF_DRV_RUNNING)) { if ((ifp->if_flags ^ adapter->if_flags) & (IFF_PROMISC | IFF_ALLMULTI)) { lem_disable_promisc(adapter); lem_set_promisc(adapter); } } else lem_init_locked(adapter); } else if (ifp->if_drv_flags & IFF_DRV_RUNNING) { EM_TX_LOCK(adapter); lem_stop(adapter); EM_TX_UNLOCK(adapter); } adapter->if_flags = ifp->if_flags; EM_CORE_UNLOCK(adapter); break; case SIOCADDMULTI: case SIOCDELMULTI: IOCTL_DEBUGOUT("ioctl rcv'd: SIOC(ADD|DEL)MULTI"); if (ifp->if_drv_flags & IFF_DRV_RUNNING) { EM_CORE_LOCK(adapter); lem_disable_intr(adapter); lem_set_multi(adapter); if (adapter->hw.mac.type == e1000_82542 && adapter->hw.revision_id == E1000_REVISION_2) { lem_initialize_receive_unit(adapter); } #ifdef DEVICE_POLLING if (!(ifp->if_capenable & IFCAP_POLLING)) #endif lem_enable_intr(adapter); EM_CORE_UNLOCK(adapter); } break; case SIOCSIFMEDIA: /* Check SOL/IDER usage */ EM_CORE_LOCK(adapter); if (e1000_check_reset_block(&adapter->hw)) { EM_CORE_UNLOCK(adapter); device_printf(adapter->dev, "Media change is" " blocked due to SOL/IDER session.\n"); 
break; } EM_CORE_UNLOCK(adapter); case SIOCGIFMEDIA: IOCTL_DEBUGOUT("ioctl rcv'd: \ SIOCxIFMEDIA (Get/Set Interface Media)"); error = ifmedia_ioctl(ifp, ifr, &adapter->media, command); break; case SIOCSIFCAP: { int mask, reinit; IOCTL_DEBUGOUT("ioctl rcv'd: SIOCSIFCAP (Set Capabilities)"); reinit = 0; mask = ifr->ifr_reqcap ^ ifp->if_capenable; #ifdef DEVICE_POLLING if (mask & IFCAP_POLLING) { if (ifr->ifr_reqcap & IFCAP_POLLING) { error = ether_poll_register(lem_poll, ifp); if (error) return (error); EM_CORE_LOCK(adapter); lem_disable_intr(adapter); ifp->if_capenable |= IFCAP_POLLING; EM_CORE_UNLOCK(adapter); } else { error = ether_poll_deregister(ifp); /* Enable interrupt even in error case */ EM_CORE_LOCK(adapter); lem_enable_intr(adapter); ifp->if_capenable &= ~IFCAP_POLLING; EM_CORE_UNLOCK(adapter); } } #endif if (mask & IFCAP_HWCSUM) { ifp->if_capenable ^= IFCAP_HWCSUM; reinit = 1; } if (mask & IFCAP_VLAN_HWTAGGING) { ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING; reinit = 1; } if ((mask & IFCAP_WOL) && (ifp->if_capabilities & IFCAP_WOL) != 0) { if (mask & IFCAP_WOL_MCAST) ifp->if_capenable ^= IFCAP_WOL_MCAST; if (mask & IFCAP_WOL_MAGIC) ifp->if_capenable ^= IFCAP_WOL_MAGIC; } if (reinit && (ifp->if_drv_flags & IFF_DRV_RUNNING)) lem_init(adapter); VLAN_CAPABILITIES(ifp); break; } default: error = ether_ioctl(ifp, command, data); break; } return (error); } /********************************************************************* * Init entry point * * This routine is used in two ways. It is used by the stack as * init entry point in network interface structure. It is also used * by the driver as a hw/sw initialization routine to get to a * consistent state. * * return 0 on success, positive on failure **********************************************************************/ static void lem_init_locked(struct adapter *adapter) { struct ifnet *ifp = adapter->ifp; device_t dev = adapter->dev; u32 pba; INIT_DEBUGOUT("lem_init: begin"); EM_CORE_LOCK_ASSERT(adapter); EM_TX_LOCK(adapter); lem_stop(adapter); EM_TX_UNLOCK(adapter); /* * Packet Buffer Allocation (PBA) * Writing PBA sets the receive portion of the buffer * the remainder is used for the transmit buffer. * * Devices before the 82547 had a Packet Buffer of 64K. * Default allocation: PBA=48K for Rx, leaving 16K for Tx. * After the 82547 the buffer was reduced to 40K. * Default allocation: PBA=30K for Rx, leaving 10K for Tx. * Note: default does not leave enough room for Jumbo Frame >10k. */ switch (adapter->hw.mac.type) { case e1000_82547: case e1000_82547_rev_2: /* 82547: Total Packet Buffer is 40K */ if (adapter->max_frame_size > 8192) pba = E1000_PBA_22K; /* 22K for Rx, 18K for Tx */ else pba = E1000_PBA_30K; /* 30K for Rx, 10K for Tx */ adapter->tx_fifo_head = 0; adapter->tx_head_addr = pba << EM_TX_HEAD_ADDR_SHIFT; adapter->tx_fifo_size = (E1000_PBA_40K - pba) << EM_PBA_BYTES_SHIFT; break; default: /* Devices before 82547 had a Packet Buffer of 64K. 
*/ if (adapter->max_frame_size > 8192) pba = E1000_PBA_40K; /* 40K for Rx, 24K for Tx */ else pba = E1000_PBA_48K; /* 48K for Rx, 16K for Tx */ } INIT_DEBUGOUT1("lem_init: pba=%dK",pba); E1000_WRITE_REG(&adapter->hw, E1000_PBA, pba); /* Get the latest mac address, User can use a LAA */ bcopy(IF_LLADDR(adapter->ifp), adapter->hw.mac.addr, ETHER_ADDR_LEN); /* Put the address into the Receive Address Array */ e1000_rar_set(&adapter->hw, adapter->hw.mac.addr, 0); /* Initialize the hardware */ if (lem_hardware_init(adapter)) { device_printf(dev, "Unable to initialize the hardware\n"); return; } lem_update_link_status(adapter); /* Setup VLAN support, basic and offload if available */ E1000_WRITE_REG(&adapter->hw, E1000_VET, ETHERTYPE_VLAN); /* Set hardware offload abilities */ ifp->if_hwassist = 0; if (adapter->hw.mac.type >= e1000_82543) { if (ifp->if_capenable & IFCAP_TXCSUM) ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP); } /* Configure for OS presence */ lem_init_manageability(adapter); /* Prepare transmit descriptors and buffers */ lem_setup_transmit_structures(adapter); lem_initialize_transmit_unit(adapter); /* Setup Multicast table */ lem_set_multi(adapter); /* Prepare receive descriptors and buffers */ if (lem_setup_receive_structures(adapter)) { device_printf(dev, "Could not setup receive structures\n"); EM_TX_LOCK(adapter); lem_stop(adapter); EM_TX_UNLOCK(adapter); return; } lem_initialize_receive_unit(adapter); /* Use real VLAN Filter support? */ if (ifp->if_capenable & IFCAP_VLAN_HWTAGGING) { if (ifp->if_capenable & IFCAP_VLAN_HWFILTER) /* Use real VLAN Filter support */ lem_setup_vlan_hw_support(adapter); else { u32 ctrl; ctrl = E1000_READ_REG(&adapter->hw, E1000_CTRL); ctrl |= E1000_CTRL_VME; E1000_WRITE_REG(&adapter->hw, E1000_CTRL, ctrl); } } /* Don't lose promiscuous settings */ lem_set_promisc(adapter); ifp->if_drv_flags |= IFF_DRV_RUNNING; ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; callout_reset(&adapter->timer, hz, lem_local_timer, adapter); e1000_clear_hw_cntrs_base_generic(&adapter->hw); #ifdef DEVICE_POLLING /* * Only enable interrupts if we are not polling, make sure * they are off otherwise. 
*/ if (ifp->if_capenable & IFCAP_POLLING) lem_disable_intr(adapter); else #endif /* DEVICE_POLLING */ lem_enable_intr(adapter); /* AMT based hardware can now take control from firmware */ if (adapter->has_manage && adapter->has_amt) lem_get_hw_control(adapter); } static void lem_init(void *arg) { struct adapter *adapter = arg; EM_CORE_LOCK(adapter); lem_init_locked(adapter); EM_CORE_UNLOCK(adapter); } #ifdef DEVICE_POLLING /********************************************************************* * * Legacy polling routine * *********************************************************************/ static int lem_poll(struct ifnet *ifp, enum poll_cmd cmd, int count) { struct adapter *adapter = ifp->if_softc; u32 reg_icr, rx_done = 0; EM_CORE_LOCK(adapter); if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) { EM_CORE_UNLOCK(adapter); return (rx_done); } if (cmd == POLL_AND_CHECK_STATUS) { reg_icr = E1000_READ_REG(&adapter->hw, E1000_ICR); if (reg_icr & (E1000_ICR_RXSEQ | E1000_ICR_LSC)) { callout_stop(&adapter->timer); adapter->hw.mac.get_link_status = 1; lem_update_link_status(adapter); callout_reset(&adapter->timer, hz, lem_local_timer, adapter); } } EM_CORE_UNLOCK(adapter); lem_rxeof(adapter, count, &rx_done); EM_TX_LOCK(adapter); lem_txeof(adapter); if (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) lem_start_locked(ifp); EM_TX_UNLOCK(adapter); return (rx_done); } #endif /* DEVICE_POLLING */ /********************************************************************* * * Legacy Interrupt Service routine * *********************************************************************/ static void lem_intr(void *arg) { struct adapter *adapter = arg; struct ifnet *ifp = adapter->ifp; u32 reg_icr; if ((ifp->if_capenable & IFCAP_POLLING) || ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)) return; EM_CORE_LOCK(adapter); reg_icr = E1000_READ_REG(&adapter->hw, E1000_ICR); if (reg_icr & E1000_ICR_RXO) adapter->rx_overruns++; if ((reg_icr == 0xffffffff) || (reg_icr == 0)) { EM_CORE_UNLOCK(adapter); return; } if (reg_icr & (E1000_ICR_RXSEQ | E1000_ICR_LSC)) { callout_stop(&adapter->timer); adapter->hw.mac.get_link_status = 1; lem_update_link_status(adapter); /* Deal with TX cruft when link lost */ lem_tx_purge(adapter); callout_reset(&adapter->timer, hz, lem_local_timer, adapter); EM_CORE_UNLOCK(adapter); return; } EM_CORE_UNLOCK(adapter); lem_rxeof(adapter, -1, NULL); EM_TX_LOCK(adapter); lem_txeof(adapter); if (ifp->if_drv_flags & IFF_DRV_RUNNING && !IFQ_DRV_IS_EMPTY(&ifp->if_snd)) lem_start_locked(ifp); EM_TX_UNLOCK(adapter); return; } static void lem_handle_link(void *context, int pending) { struct adapter *adapter = context; struct ifnet *ifp = adapter->ifp; if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) return; EM_CORE_LOCK(adapter); callout_stop(&adapter->timer); lem_update_link_status(adapter); /* Deal with TX cruft when link lost */ lem_tx_purge(adapter); callout_reset(&adapter->timer, hz, lem_local_timer, adapter); EM_CORE_UNLOCK(adapter); } /* Combined RX/TX handler, used by Legacy and MSI */ static void lem_handle_rxtx(void *context, int pending) { struct adapter *adapter = context; struct ifnet *ifp = adapter->ifp; if (ifp->if_drv_flags & IFF_DRV_RUNNING) { bool more = lem_rxeof(adapter, adapter->rx_process_limit, NULL); EM_TX_LOCK(adapter); lem_txeof(adapter); if (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) lem_start_locked(ifp); EM_TX_UNLOCK(adapter); if (more) { taskqueue_enqueue(adapter->tq, &adapter->rxtx_task); return; } } if (ifp->if_drv_flags & IFF_DRV_RUNNING) lem_enable_intr(adapter); } 
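/*
 * Note on the deferred-processing scheme: lem_irq_fast() below masks
 * the interrupt and enqueues rxtx_task; lem_handle_rxtx() above then
 * re-enqueues itself while lem_rxeof() reports more work pending, and
 * only re-enables the interrupt once the rings are drained.
 */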
/*********************************************************************
 *
 *  Fast Legacy/MSI Combined Interrupt Service routine
 *
 *********************************************************************/
static int
lem_irq_fast(void *arg)
{
	struct adapter	*adapter = arg;
	struct ifnet	*ifp;
	u32		reg_icr;

	ifp = adapter->ifp;

	reg_icr = E1000_READ_REG(&adapter->hw, E1000_ICR);

	/* Hot eject? */
	if (reg_icr == 0xffffffff)
		return FILTER_STRAY;

	/* Definitely not our interrupt. */
	if (reg_icr == 0x0)
		return FILTER_STRAY;

	/*
	 * Mask interrupts until the taskqueue is finished running. This is
	 * cheap, just assume that it is needed. This also works around the
	 * MSI message reordering errata on certain systems.
	 */
	lem_disable_intr(adapter);
	taskqueue_enqueue(adapter->tq, &adapter->rxtx_task);

	/* Link status change */
	if (reg_icr & (E1000_ICR_RXSEQ | E1000_ICR_LSC)) {
		adapter->hw.mac.get_link_status = 1;
		taskqueue_enqueue(taskqueue_fast, &adapter->link_task);
	}

	if (reg_icr & E1000_ICR_RXO)
		adapter->rx_overruns++;
	return FILTER_HANDLED;
}

/*********************************************************************
 *
 *  Media Ioctl callback
 *
 *  This routine is called whenever the user queries the status of
 *  the interface using ifconfig.
 *
 **********************************************************************/
static void
lem_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
{
	struct adapter *adapter = ifp->if_softc;
	u_char fiber_type = IFM_1000_SX;

	INIT_DEBUGOUT("lem_media_status: begin");

	EM_CORE_LOCK(adapter);
	lem_update_link_status(adapter);

	ifmr->ifm_status = IFM_AVALID;
	ifmr->ifm_active = IFM_ETHER;

	if (!adapter->link_active) {
		EM_CORE_UNLOCK(adapter);
		return;
	}

	ifmr->ifm_status |= IFM_ACTIVE;

	if ((adapter->hw.phy.media_type == e1000_media_type_fiber) ||
	    (adapter->hw.phy.media_type == e1000_media_type_internal_serdes)) {
		if (adapter->hw.mac.type == e1000_82545)
			fiber_type = IFM_1000_LX;
		ifmr->ifm_active |= fiber_type | IFM_FDX;
	} else {
		switch (adapter->link_speed) {
		case 10:
			ifmr->ifm_active |= IFM_10_T;
			break;
		case 100:
			ifmr->ifm_active |= IFM_100_TX;
			break;
		case 1000:
			ifmr->ifm_active |= IFM_1000_T;
			break;
		}
		if (adapter->link_duplex == FULL_DUPLEX)
			ifmr->ifm_active |= IFM_FDX;
		else
			ifmr->ifm_active |= IFM_HDX;
	}
	EM_CORE_UNLOCK(adapter);
}

/*********************************************************************
 *
 *  Media Ioctl callback
 *
 *  This routine is called when the user changes speed/duplex using
 *  the media/mediaopt option with ifconfig.
* **********************************************************************/ static int lem_media_change(struct ifnet *ifp) { struct adapter *adapter = ifp->if_softc; struct ifmedia *ifm = &adapter->media; INIT_DEBUGOUT("lem_media_change: begin"); if (IFM_TYPE(ifm->ifm_media) != IFM_ETHER) return (EINVAL); EM_CORE_LOCK(adapter); switch (IFM_SUBTYPE(ifm->ifm_media)) { case IFM_AUTO: adapter->hw.mac.autoneg = DO_AUTO_NEG; adapter->hw.phy.autoneg_advertised = AUTONEG_ADV_DEFAULT; break; case IFM_1000_LX: case IFM_1000_SX: case IFM_1000_T: adapter->hw.mac.autoneg = DO_AUTO_NEG; adapter->hw.phy.autoneg_advertised = ADVERTISE_1000_FULL; break; case IFM_100_TX: adapter->hw.mac.autoneg = FALSE; adapter->hw.phy.autoneg_advertised = 0; if ((ifm->ifm_media & IFM_GMASK) == IFM_FDX) adapter->hw.mac.forced_speed_duplex = ADVERTISE_100_FULL; else adapter->hw.mac.forced_speed_duplex = ADVERTISE_100_HALF; break; case IFM_10_T: adapter->hw.mac.autoneg = FALSE; adapter->hw.phy.autoneg_advertised = 0; if ((ifm->ifm_media & IFM_GMASK) == IFM_FDX) adapter->hw.mac.forced_speed_duplex = ADVERTISE_10_FULL; else adapter->hw.mac.forced_speed_duplex = ADVERTISE_10_HALF; break; default: device_printf(adapter->dev, "Unsupported media type\n"); } lem_init_locked(adapter); EM_CORE_UNLOCK(adapter); return (0); } /********************************************************************* * * This routine maps the mbufs to tx descriptors. * * return 0 on success, positive on failure **********************************************************************/ static int lem_xmit(struct adapter *adapter, struct mbuf **m_headp) { bus_dma_segment_t segs[EM_MAX_SCATTER]; bus_dmamap_t map; struct em_buffer *tx_buffer, *tx_buffer_mapped; struct e1000_tx_desc *ctxd = NULL; struct mbuf *m_head; u32 txd_upper, txd_lower, txd_used, txd_saved; int error, nsegs, i, j, first, last = 0; m_head = *m_headp; txd_upper = txd_lower = txd_used = txd_saved = 0; /* ** When doing checksum offload, it is critical to ** make sure the first mbuf has more than header, ** because that routine expects data to be present. */ if ((m_head->m_pkthdr.csum_flags & CSUM_OFFLOAD) && (m_head->m_len < ETHER_HDR_LEN + sizeof(struct ip))) { m_head = m_pullup(m_head, ETHER_HDR_LEN + sizeof(struct ip)); *m_headp = m_head; if (m_head == NULL) return (ENOBUFS); } /* * Map the packet for DMA * * Capture the first descriptor index, * this descriptor will have the index * of the EOP which is the only one that * now gets a DONE bit writeback. */ first = adapter->next_avail_tx_desc; tx_buffer = &adapter->tx_buffer_area[first]; tx_buffer_mapped = tx_buffer; map = tx_buffer->map; error = bus_dmamap_load_mbuf_sg(adapter->txtag, map, *m_headp, segs, &nsegs, BUS_DMA_NOWAIT); /* * There are two types of errors we can (try) to handle: * - EFBIG means the mbuf chain was too long and bus_dma ran * out of segments. Defragment the mbuf chain and try again. * - ENOMEM means bus_dma could not obtain enough bounce buffers * at this point in time. Defer sending and try again later. * All other errors, in particular EINVAL, are fatal and prevent the * mbuf chain from ever going through. Drop it and report error. 
*/ if (error == EFBIG) { struct mbuf *m; m = m_defrag(*m_headp, M_NOWAIT); if (m == NULL) { adapter->mbuf_alloc_failed++; m_freem(*m_headp); *m_headp = NULL; return (ENOBUFS); } *m_headp = m; /* Try it again */ error = bus_dmamap_load_mbuf_sg(adapter->txtag, map, *m_headp, segs, &nsegs, BUS_DMA_NOWAIT); if (error) { adapter->no_tx_dma_setup++; m_freem(*m_headp); *m_headp = NULL; return (error); } } else if (error != 0) { adapter->no_tx_dma_setup++; return (error); } if (nsegs > (adapter->num_tx_desc_avail - 2)) { adapter->no_tx_desc_avail2++; bus_dmamap_unload(adapter->txtag, map); return (ENOBUFS); } m_head = *m_headp; /* Do hardware assists */ if (m_head->m_pkthdr.csum_flags & CSUM_OFFLOAD) lem_transmit_checksum_setup(adapter, m_head, &txd_upper, &txd_lower); i = adapter->next_avail_tx_desc; if (adapter->pcix_82544) txd_saved = i; /* Set up our transmit descriptors */ for (j = 0; j < nsegs; j++) { bus_size_t seg_len; bus_addr_t seg_addr; /* If adapter is 82544 and on PCIX bus */ if(adapter->pcix_82544) { DESC_ARRAY desc_array; u32 array_elements, counter; /* * Check the Address and Length combination and * split the data accordingly */ array_elements = lem_fill_descriptors(segs[j].ds_addr, segs[j].ds_len, &desc_array); for (counter = 0; counter < array_elements; counter++) { if (txd_used == adapter->num_tx_desc_avail) { adapter->next_avail_tx_desc = txd_saved; adapter->no_tx_desc_avail2++; bus_dmamap_unload(adapter->txtag, map); return (ENOBUFS); } tx_buffer = &adapter->tx_buffer_area[i]; ctxd = &adapter->tx_desc_base[i]; ctxd->buffer_addr = htole64( desc_array.descriptor[counter].address); ctxd->lower.data = htole32( (adapter->txd_cmd | txd_lower | (u16) desc_array.descriptor[counter].length)); ctxd->upper.data = htole32((txd_upper)); last = i; if (++i == adapter->num_tx_desc) i = 0; tx_buffer->m_head = NULL; tx_buffer->next_eop = -1; txd_used++; } } else { tx_buffer = &adapter->tx_buffer_area[i]; ctxd = &adapter->tx_desc_base[i]; seg_addr = segs[j].ds_addr; seg_len = segs[j].ds_len; ctxd->buffer_addr = htole64(seg_addr); ctxd->lower.data = htole32( adapter->txd_cmd | txd_lower | seg_len); ctxd->upper.data = htole32(txd_upper); last = i; if (++i == adapter->num_tx_desc) i = 0; tx_buffer->m_head = NULL; tx_buffer->next_eop = -1; } } adapter->next_avail_tx_desc = i; if (adapter->pcix_82544) adapter->num_tx_desc_avail -= txd_used; else adapter->num_tx_desc_avail -= nsegs; if (m_head->m_flags & M_VLANTAG) { /* Set the vlan id. */ ctxd->upper.fields.special = htole16(m_head->m_pkthdr.ether_vtag); /* Tell hardware to add tag */ ctxd->lower.data |= htole32(E1000_TXD_CMD_VLE); } tx_buffer->m_head = m_head; tx_buffer_mapped->map = tx_buffer->map; tx_buffer->map = map; bus_dmamap_sync(adapter->txtag, map, BUS_DMASYNC_PREWRITE); /* * Last Descriptor of Packet * needs End Of Packet (EOP) * and Report Status (RS) */ ctxd->lower.data |= htole32(E1000_TXD_CMD_EOP | E1000_TXD_CMD_RS); /* * Keep track in the first buffer which * descriptor will be written back */ tx_buffer = &adapter->tx_buffer_area[first]; tx_buffer->next_eop = last; adapter->watchdog_time = ticks; /* * Advance the Transmit Descriptor Tail (TDT), this tells the E1000 * that this frame is available to transmit. 
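	 * (On the 82547 in half-duplex the tail update is instead
	 * deferred through lem_82547_move_tail(), which enforces the
	 * TX FIFO workaround before exposing new descriptors.)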
*/ bus_dmamap_sync(adapter->txdma.dma_tag, adapter->txdma.dma_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); if (adapter->hw.mac.type == e1000_82547 && adapter->link_duplex == HALF_DUPLEX) lem_82547_move_tail(adapter); else { E1000_WRITE_REG(&adapter->hw, E1000_TDT(0), i); if (adapter->hw.mac.type == e1000_82547) lem_82547_update_fifo_head(adapter, m_head->m_pkthdr.len); } return (0); } /********************************************************************* * * 82547 workaround to avoid controller hang in half-duplex environment. * The workaround is to avoid queuing a large packet that would span * the internal Tx FIFO ring boundary. We need to reset the FIFO pointers * in this case. We do that only when FIFO is quiescent. * **********************************************************************/ static void lem_82547_move_tail(void *arg) { struct adapter *adapter = arg; struct e1000_tx_desc *tx_desc; u16 hw_tdt, sw_tdt, length = 0; bool eop = 0; EM_TX_LOCK_ASSERT(adapter); hw_tdt = E1000_READ_REG(&adapter->hw, E1000_TDT(0)); sw_tdt = adapter->next_avail_tx_desc; while (hw_tdt != sw_tdt) { tx_desc = &adapter->tx_desc_base[hw_tdt]; length += tx_desc->lower.flags.length; eop = tx_desc->lower.data & E1000_TXD_CMD_EOP; if (++hw_tdt == adapter->num_tx_desc) hw_tdt = 0; if (eop) { if (lem_82547_fifo_workaround(adapter, length)) { adapter->tx_fifo_wrk_cnt++; callout_reset(&adapter->tx_fifo_timer, 1, lem_82547_move_tail, adapter); break; } E1000_WRITE_REG(&adapter->hw, E1000_TDT(0), hw_tdt); lem_82547_update_fifo_head(adapter, length); length = 0; } } } static int lem_82547_fifo_workaround(struct adapter *adapter, int len) { int fifo_space, fifo_pkt_len; fifo_pkt_len = roundup2(len + EM_FIFO_HDR, EM_FIFO_HDR); if (adapter->link_duplex == HALF_DUPLEX) { fifo_space = adapter->tx_fifo_size - adapter->tx_fifo_head; if (fifo_pkt_len >= (EM_82547_PKT_THRESH + fifo_space)) { if (lem_82547_tx_fifo_reset(adapter)) return (0); else return (1); } } return (0); } static void lem_82547_update_fifo_head(struct adapter *adapter, int len) { int fifo_pkt_len = roundup2(len + EM_FIFO_HDR, EM_FIFO_HDR); /* tx_fifo_head is always 16 byte aligned */ adapter->tx_fifo_head += fifo_pkt_len; if (adapter->tx_fifo_head >= adapter->tx_fifo_size) { adapter->tx_fifo_head -= adapter->tx_fifo_size; } } static int lem_82547_tx_fifo_reset(struct adapter *adapter) { u32 tctl; if ((E1000_READ_REG(&adapter->hw, E1000_TDT(0)) == E1000_READ_REG(&adapter->hw, E1000_TDH(0))) && (E1000_READ_REG(&adapter->hw, E1000_TDFT) == E1000_READ_REG(&adapter->hw, E1000_TDFH)) && (E1000_READ_REG(&adapter->hw, E1000_TDFTS) == E1000_READ_REG(&adapter->hw, E1000_TDFHS)) && (E1000_READ_REG(&adapter->hw, E1000_TDFPC) == 0)) { /* Disable TX unit */ tctl = E1000_READ_REG(&adapter->hw, E1000_TCTL); E1000_WRITE_REG(&adapter->hw, E1000_TCTL, tctl & ~E1000_TCTL_EN); /* Reset FIFO pointers */ E1000_WRITE_REG(&adapter->hw, E1000_TDFT, adapter->tx_head_addr); E1000_WRITE_REG(&adapter->hw, E1000_TDFH, adapter->tx_head_addr); E1000_WRITE_REG(&adapter->hw, E1000_TDFTS, adapter->tx_head_addr); E1000_WRITE_REG(&adapter->hw, E1000_TDFHS, adapter->tx_head_addr); /* Re-enable TX unit */ E1000_WRITE_REG(&adapter->hw, E1000_TCTL, tctl); E1000_WRITE_FLUSH(&adapter->hw); adapter->tx_fifo_head = 0; adapter->tx_fifo_reset_cnt++; return (TRUE); } else { return (FALSE); } } static void lem_set_promisc(struct adapter *adapter) { struct ifnet *ifp = adapter->ifp; u32 reg_rctl; reg_rctl = E1000_READ_REG(&adapter->hw, E1000_RCTL); if (ifp->if_flags & IFF_PROMISC) { reg_rctl |= 
(E1000_RCTL_UPE | E1000_RCTL_MPE); /* Turn this on if you want to see bad packets */ if (lem_debug_sbp) reg_rctl |= E1000_RCTL_SBP; E1000_WRITE_REG(&adapter->hw, E1000_RCTL, reg_rctl); } else if (ifp->if_flags & IFF_ALLMULTI) { reg_rctl |= E1000_RCTL_MPE; reg_rctl &= ~E1000_RCTL_UPE; E1000_WRITE_REG(&adapter->hw, E1000_RCTL, reg_rctl); } } static void lem_disable_promisc(struct adapter *adapter) { struct ifnet *ifp = adapter->ifp; u32 reg_rctl; int mcnt = 0; reg_rctl = E1000_READ_REG(&adapter->hw, E1000_RCTL); reg_rctl &= (~E1000_RCTL_UPE); if (ifp->if_flags & IFF_ALLMULTI) mcnt = MAX_NUM_MULTICAST_ADDRESSES; else { struct ifmultiaddr *ifma; #if __FreeBSD_version < 800000 IF_ADDR_LOCK(ifp); #else if_maddr_rlock(ifp); #endif TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_LINK) continue; if (mcnt == MAX_NUM_MULTICAST_ADDRESSES) break; mcnt++; } #if __FreeBSD_version < 800000 IF_ADDR_UNLOCK(ifp); #else if_maddr_runlock(ifp); #endif } /* Don't disable if in MAX groups */ if (mcnt < MAX_NUM_MULTICAST_ADDRESSES) reg_rctl &= (~E1000_RCTL_MPE); reg_rctl &= (~E1000_RCTL_SBP); E1000_WRITE_REG(&adapter->hw, E1000_RCTL, reg_rctl); } /********************************************************************* * Multicast Update * * This routine is called whenever multicast address list is updated. * **********************************************************************/ static void lem_set_multi(struct adapter *adapter) { struct ifnet *ifp = adapter->ifp; struct ifmultiaddr *ifma; u32 reg_rctl = 0; u8 *mta; /* Multicast array memory */ int mcnt = 0; IOCTL_DEBUGOUT("lem_set_multi: begin"); mta = adapter->mta; bzero(mta, sizeof(u8) * ETH_ADDR_LEN * MAX_NUM_MULTICAST_ADDRESSES); if (adapter->hw.mac.type == e1000_82542 && adapter->hw.revision_id == E1000_REVISION_2) { reg_rctl = E1000_READ_REG(&adapter->hw, E1000_RCTL); if (adapter->hw.bus.pci_cmd_word & CMD_MEM_WRT_INVALIDATE) e1000_pci_clear_mwi(&adapter->hw); reg_rctl |= E1000_RCTL_RST; E1000_WRITE_REG(&adapter->hw, E1000_RCTL, reg_rctl); msec_delay(5); } #if __FreeBSD_version < 800000 IF_ADDR_LOCK(ifp); #else if_maddr_rlock(ifp); #endif TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_LINK) continue; if (mcnt == MAX_NUM_MULTICAST_ADDRESSES) break; bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr), &mta[mcnt * ETH_ADDR_LEN], ETH_ADDR_LEN); mcnt++; } #if __FreeBSD_version < 800000 IF_ADDR_UNLOCK(ifp); #else if_maddr_runlock(ifp); #endif if (mcnt >= MAX_NUM_MULTICAST_ADDRESSES) { reg_rctl = E1000_READ_REG(&adapter->hw, E1000_RCTL); reg_rctl |= E1000_RCTL_MPE; E1000_WRITE_REG(&adapter->hw, E1000_RCTL, reg_rctl); } else e1000_update_mc_addr_list(&adapter->hw, mta, mcnt); if (adapter->hw.mac.type == e1000_82542 && adapter->hw.revision_id == E1000_REVISION_2) { reg_rctl = E1000_READ_REG(&adapter->hw, E1000_RCTL); reg_rctl &= ~E1000_RCTL_RST; E1000_WRITE_REG(&adapter->hw, E1000_RCTL, reg_rctl); msec_delay(5); if (adapter->hw.bus.pci_cmd_word & CMD_MEM_WRT_INVALIDATE) e1000_pci_set_mwi(&adapter->hw); } } /********************************************************************* * Timer routine * * This routine checks for link status and updates statistics. 
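 *  It also services the TX watchdog: if the watchdog is armed and no
 *  TX descriptor has been cleaned within EM_WATCHDOG ticks, the
 *  interface is reset via lem_init_locked().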
* **********************************************************************/ static void lem_local_timer(void *arg) { struct adapter *adapter = arg; EM_CORE_LOCK_ASSERT(adapter); lem_update_link_status(adapter); lem_update_stats_counters(adapter); lem_smartspeed(adapter); /* * We check the watchdog: the time since * the last TX descriptor was cleaned. * This implies a functional TX engine. */ if ((adapter->watchdog_check == TRUE) && (ticks - adapter->watchdog_time > EM_WATCHDOG)) goto hung; callout_reset(&adapter->timer, hz, lem_local_timer, adapter); return; hung: device_printf(adapter->dev, "Watchdog timeout -- resetting\n"); adapter->ifp->if_drv_flags &= ~IFF_DRV_RUNNING; adapter->watchdog_events++; lem_init_locked(adapter); } static void lem_update_link_status(struct adapter *adapter) { struct e1000_hw *hw = &adapter->hw; struct ifnet *ifp = adapter->ifp; device_t dev = adapter->dev; u32 link_check = 0; /* Get the cached link value or read phy for real */ switch (hw->phy.media_type) { case e1000_media_type_copper: if (hw->mac.get_link_status) { /* Do the work to read phy */ e1000_check_for_link(hw); link_check = !hw->mac.get_link_status; if (link_check) /* ESB2 fix */ e1000_cfg_on_link_up(hw); } else link_check = TRUE; break; case e1000_media_type_fiber: e1000_check_for_link(hw); link_check = (E1000_READ_REG(hw, E1000_STATUS) & E1000_STATUS_LU); break; case e1000_media_type_internal_serdes: e1000_check_for_link(hw); link_check = adapter->hw.mac.serdes_has_link; break; default: case e1000_media_type_unknown: break; } /* Now check for a transition */ if (link_check && (adapter->link_active == 0)) { e1000_get_speed_and_duplex(hw, &adapter->link_speed, &adapter->link_duplex); if (bootverbose) device_printf(dev, "Link is up %d Mbps %s\n", adapter->link_speed, ((adapter->link_duplex == FULL_DUPLEX) ? "Full Duplex" : "Half Duplex")); adapter->link_active = 1; adapter->smartspeed = 0; ifp->if_baudrate = adapter->link_speed * 1000000; if_link_state_change(ifp, LINK_STATE_UP); } else if (!link_check && (adapter->link_active == 1)) { ifp->if_baudrate = adapter->link_speed = 0; adapter->link_duplex = 0; if (bootverbose) device_printf(dev, "Link is Down\n"); adapter->link_active = 0; /* Link down, disable watchdog */ adapter->watchdog_check = FALSE; if_link_state_change(ifp, LINK_STATE_DOWN); } } /********************************************************************* * * This routine disables all traffic on the adapter by issuing a * global reset on the MAC and deallocates TX/RX buffers. * * This routine should always be called with BOTH the CORE * and TX locks. **********************************************************************/ static void lem_stop(void *arg) { struct adapter *adapter = arg; struct ifnet *ifp = adapter->ifp; EM_CORE_LOCK_ASSERT(adapter); EM_TX_LOCK_ASSERT(adapter); INIT_DEBUGOUT("lem_stop: begin"); lem_disable_intr(adapter); callout_stop(&adapter->timer); callout_stop(&adapter->tx_fifo_timer); /* Tell the stack that the interface is no longer active */ ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE); e1000_reset_hw(&adapter->hw); if (adapter->hw.mac.type >= e1000_82544) E1000_WRITE_REG(&adapter->hw, E1000_WUC, 0); e1000_led_off(&adapter->hw); e1000_cleanup_led(&adapter->hw); } /********************************************************************* * * Determine hardware revision. 
 *
 **********************************************************************/
static void
lem_identify_hardware(struct adapter *adapter)
{
	device_t dev = adapter->dev;

	/* Make sure our PCI config space has the necessary stuff set */
	pci_enable_busmaster(dev);
	adapter->hw.bus.pci_cmd_word = pci_read_config(dev, PCIR_COMMAND, 2);

	/* Save off the information about this board */
	adapter->hw.vendor_id = pci_get_vendor(dev);
	adapter->hw.device_id = pci_get_device(dev);
	adapter->hw.revision_id = pci_read_config(dev, PCIR_REVID, 1);
	adapter->hw.subsystem_vendor_id =
	    pci_read_config(dev, PCIR_SUBVEND_0, 2);
	adapter->hw.subsystem_device_id =
	    pci_read_config(dev, PCIR_SUBDEV_0, 2);

	/* Do Shared Code Init and Setup */
	if (e1000_set_mac_type(&adapter->hw)) {
		device_printf(dev, "Setup init failure\n");
		return;
	}
}

static int
lem_allocate_pci_resources(struct adapter *adapter)
{
	device_t	dev = adapter->dev;
	int		val, rid, error = E1000_SUCCESS;

	rid = PCIR_BAR(0);
	adapter->memory = bus_alloc_resource_any(dev, SYS_RES_MEMORY,
	    &rid, RF_ACTIVE);
	if (adapter->memory == NULL) {
		device_printf(dev, "Unable to allocate bus resource: memory\n");
		return (ENXIO);
	}
	adapter->osdep.mem_bus_space_tag =
	    rman_get_bustag(adapter->memory);
	adapter->osdep.mem_bus_space_handle =
	    rman_get_bushandle(adapter->memory);
	adapter->hw.hw_addr = (u8 *)&adapter->osdep.mem_bus_space_handle;

	/* Only older adapters use IO mapping */
	if (adapter->hw.mac.type > e1000_82543) {
		/* Figure out where our IO BAR is */
		for (rid = PCIR_BAR(0); rid < PCIR_CIS;) {
			val = pci_read_config(dev, rid, 4);
			if (EM_BAR_TYPE(val) == EM_BAR_TYPE_IO) {
				adapter->io_rid = rid;
				break;
			}
			rid += 4;
			/* check for 64bit BAR */
			if (EM_BAR_MEM_TYPE(val) == EM_BAR_MEM_TYPE_64BIT)
				rid += 4;
		}
		if (rid >= PCIR_CIS) {
			device_printf(dev, "Unable to locate IO BAR\n");
			return (ENXIO);
		}
		adapter->ioport = bus_alloc_resource_any(dev,
		    SYS_RES_IOPORT, &adapter->io_rid, RF_ACTIVE);
		if (adapter->ioport == NULL) {
			device_printf(dev, "Unable to allocate bus resource: "
			    "ioport\n");
			return (ENXIO);
		}
		adapter->hw.io_base = 0;
		adapter->osdep.io_bus_space_tag =
		    rman_get_bustag(adapter->ioport);
		adapter->osdep.io_bus_space_handle =
		    rman_get_bushandle(adapter->ioport);
	}

	adapter->hw.back = &adapter->osdep;

	return (error);
}

/*********************************************************************
 *
 *  Setup the Legacy or MSI Interrupt handler
 *
 **********************************************************************/
int
lem_allocate_irq(struct adapter *adapter)
{
	device_t dev = adapter->dev;
	int error, rid = 0;

	/* Manually turn off all interrupts */
	E1000_WRITE_REG(&adapter->hw, E1000_IMC, 0xffffffff);

	/* We allocate a single interrupt resource */
	adapter->res[0] = bus_alloc_resource_any(dev,
	    SYS_RES_IRQ, &rid, RF_SHAREABLE | RF_ACTIVE);
	if (adapter->res[0] == NULL) {
		device_printf(dev, "Unable to allocate bus resource: "
		    "interrupt\n");
		return (ENXIO);
	}

	/* Do Legacy setup? */
	if (lem_use_legacy_irq) {
		if ((error = bus_setup_intr(dev, adapter->res[0],
		    INTR_TYPE_NET | INTR_MPSAFE, NULL, lem_intr, adapter,
		    &adapter->tag[0])) != 0) {
			device_printf(dev,
			    "Failed to register interrupt handler");
			return (error);
		}
		return (0);
	}

	/*
	 * Use a Fast interrupt and the associated
	 * deferred processing contexts.
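	 * (lem_irq_fast() runs as an interrupt filter; the real RX/TX
	 * and link work happens in lem_handle_rxtx()/lem_handle_link()
	 * on the taskqueue created below.)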
	 */
	TASK_INIT(&adapter->rxtx_task, 0, lem_handle_rxtx, adapter);
	TASK_INIT(&adapter->link_task, 0, lem_handle_link, adapter);
	adapter->tq = taskqueue_create_fast("lem_taskq", M_NOWAIT,
	    taskqueue_thread_enqueue, &adapter->tq);
	taskqueue_start_threads(&adapter->tq, 1, PI_NET, "%s taskq",
	    device_get_nameunit(adapter->dev));
	if ((error = bus_setup_intr(dev, adapter->res[0],
	    INTR_TYPE_NET, lem_irq_fast, NULL, adapter,
	    &adapter->tag[0])) != 0) {
		device_printf(dev, "Failed to register fast interrupt "
		    "handler: %d\n", error);
		taskqueue_free(adapter->tq);
		adapter->tq = NULL;
		return (error);
	}

	return (0);
}

static void
lem_free_pci_resources(struct adapter *adapter)
{
	device_t dev = adapter->dev;

	if (adapter->tag[0] != NULL) {
		bus_teardown_intr(dev, adapter->res[0],
		    adapter->tag[0]);
		adapter->tag[0] = NULL;
	}

	if (adapter->res[0] != NULL) {
		bus_release_resource(dev, SYS_RES_IRQ,
		    0, adapter->res[0]);
	}

	if (adapter->memory != NULL)
		bus_release_resource(dev, SYS_RES_MEMORY,
		    PCIR_BAR(0), adapter->memory);

	if (adapter->ioport != NULL)
		bus_release_resource(dev, SYS_RES_IOPORT,
		    adapter->io_rid, adapter->ioport);
}

/*********************************************************************
 *
 *  Initialize the hardware to a configuration
 *  as specified by the adapter structure.
 *
 **********************************************************************/
static int
lem_hardware_init(struct adapter *adapter)
{
	device_t dev = adapter->dev;
	u16	rx_buffer_size;

	INIT_DEBUGOUT("lem_hardware_init: begin");

	/* Issue a global reset */
	e1000_reset_hw(&adapter->hw);

	/* When hardware is reset, fifo_head is also reset */
	adapter->tx_fifo_head = 0;

	/*
	 * These parameters control the automatic generation (Tx) and
	 * response (Rx) to Ethernet PAUSE frames.
	 * - High water mark should allow for at least two frames to be
	 *   received after sending an XOFF.
	 * - Low water mark works best when it is very near the high water mark.
	 *   This allows the receiver to restart by sending XON when it has
	 *   drained a bit.  Here we use an arbitrary value of 1500 which will
	 *   restart after one full frame is pulled from the buffer.  There
	 *   could be several smaller frames in the buffer and if so they will
	 *   not trigger the XON until their total number reduces the buffer
	 *   by 1500.
	 * - The pause time is fairly large at 1000 x 512ns = 512 usec.
	 */
	rx_buffer_size = ((E1000_READ_REG(&adapter->hw, E1000_PBA) &
	    0xffff) << 10);

	adapter->hw.fc.high_water = rx_buffer_size -
	    roundup2(adapter->max_frame_size, 1024);
	adapter->hw.fc.low_water = adapter->hw.fc.high_water - 1500;

	adapter->hw.fc.pause_time = EM_FC_PAUSE_TIME;
	adapter->hw.fc.send_xon = TRUE;

	/* Set Flow control, use the tunable location if sane */
	if ((lem_fc_setting >= 0) && (lem_fc_setting < 4))
		adapter->hw.fc.requested_mode = lem_fc_setting;
	else
		adapter->hw.fc.requested_mode = e1000_fc_none;

	if (e1000_init_hw(&adapter->hw) < 0) {
		device_printf(dev, "Hardware Initialization Failed\n");
		return (EIO);
	}

	e1000_check_for_link(&adapter->hw);

	return (0);
}

/*********************************************************************
 *
 *  Setup networking device structure and register an interface.
 *
 **********************************************************************/
static int
lem_setup_interface(device_t dev, struct adapter *adapter)
{
	struct ifnet   *ifp;

	INIT_DEBUGOUT("lem_setup_interface: begin");

	ifp = adapter->ifp = if_alloc(IFT_ETHER);
	if (ifp == NULL) {
		device_printf(dev, "can not allocate ifnet structure\n");
		return (-1);
	}
	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
	ifp->if_init =  lem_init;
	ifp->if_softc = adapter;
	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
	ifp->if_ioctl = lem_ioctl;
	ifp->if_start = lem_start;
	IFQ_SET_MAXLEN(&ifp->if_snd, adapter->num_tx_desc - 1);
	ifp->if_snd.ifq_drv_maxlen = adapter->num_tx_desc - 1;
	IFQ_SET_READY(&ifp->if_snd);

	ether_ifattach(ifp, adapter->hw.mac.addr);

	ifp->if_capabilities = ifp->if_capenable = 0;

	if (adapter->hw.mac.type >= e1000_82543) {
		ifp->if_capabilities |= IFCAP_HWCSUM | IFCAP_VLAN_HWCSUM;
		ifp->if_capenable |= IFCAP_HWCSUM | IFCAP_VLAN_HWCSUM;
	}

	/*
	 * Tell the upper layer(s) we support long frames.
	 */
	ifp->if_data.ifi_hdrlen = sizeof(struct ether_vlan_header);
	ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU;
	ifp->if_capenable |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU;

	/*
	** Don't turn this on by default: if vlans are
	** created on another pseudo device (eg. lagg)
	** then vlan events are not passed through, breaking
	** operation, but with HW FILTER off it works.  If
	** using vlans directly on the em driver you can
	** enable this and get full hardware tag filtering.
	*/
	ifp->if_capabilities |= IFCAP_VLAN_HWFILTER;

#ifdef DEVICE_POLLING
	ifp->if_capabilities |= IFCAP_POLLING;
#endif

	/* Enable only WOL MAGIC by default */
	if (adapter->wol) {
		ifp->if_capabilities |= IFCAP_WOL;
		ifp->if_capenable |= IFCAP_WOL_MAGIC;
	}

	/*
	 * Specify the media types supported by this adapter and register
	 * callbacks to update media and link information
	 */
	ifmedia_init(&adapter->media, IFM_IMASK,
	    lem_media_change, lem_media_status);
	if ((adapter->hw.phy.media_type == e1000_media_type_fiber) ||
	    (adapter->hw.phy.media_type == e1000_media_type_internal_serdes)) {
		u_char fiber_type = IFM_1000_SX;	/* default type */

		if (adapter->hw.mac.type == e1000_82545)
			fiber_type = IFM_1000_LX;
		ifmedia_add(&adapter->media, IFM_ETHER | fiber_type | IFM_FDX,
			    0, NULL);
		ifmedia_add(&adapter->media, IFM_ETHER | fiber_type, 0, NULL);
	} else {
		ifmedia_add(&adapter->media, IFM_ETHER | IFM_10_T, 0, NULL);
		ifmedia_add(&adapter->media, IFM_ETHER | IFM_10_T | IFM_FDX,
			    0, NULL);
		ifmedia_add(&adapter->media, IFM_ETHER | IFM_100_TX,
			    0, NULL);
		ifmedia_add(&adapter->media, IFM_ETHER | IFM_100_TX | IFM_FDX,
			    0, NULL);
		if (adapter->hw.phy.type != e1000_phy_ife) {
			ifmedia_add(&adapter->media,
				IFM_ETHER | IFM_1000_T | IFM_FDX, 0, NULL);
			ifmedia_add(&adapter->media,
				IFM_ETHER | IFM_1000_T, 0, NULL);
		}
	}
	ifmedia_add(&adapter->media, IFM_ETHER | IFM_AUTO, 0, NULL);
	ifmedia_set(&adapter->media, IFM_ETHER | IFM_AUTO);
	return (0);
}

/*********************************************************************
 *
 *  Workaround for SmartSpeed on 82541 and 82547 controllers
 *
 **********************************************************************/
static void
lem_smartspeed(struct adapter *adapter)
{
	u16 phy_tmp;

	if (adapter->link_active || (adapter->hw.phy.type != e1000_phy_igp) ||
	    adapter->hw.mac.autoneg == 0 ||
	    (adapter->hw.phy.autoneg_advertised & ADVERTISE_1000_FULL) == 0)
		return;

	if (adapter->smartspeed == 0) {
		/*
		 * If Master/Slave config fault is asserted twice,
		 * we assume back-to-back
		 */
		e1000_read_phy_reg(&adapter->hw, PHY_1000T_STATUS, &phy_tmp);
		if (!(phy_tmp &
SR_1000T_MS_CONFIG_FAULT)) return; e1000_read_phy_reg(&adapter->hw, PHY_1000T_STATUS, &phy_tmp); if (phy_tmp & SR_1000T_MS_CONFIG_FAULT) { e1000_read_phy_reg(&adapter->hw, PHY_1000T_CTRL, &phy_tmp); if(phy_tmp & CR_1000T_MS_ENABLE) { phy_tmp &= ~CR_1000T_MS_ENABLE; e1000_write_phy_reg(&adapter->hw, PHY_1000T_CTRL, phy_tmp); adapter->smartspeed++; if(adapter->hw.mac.autoneg && !e1000_copper_link_autoneg(&adapter->hw) && !e1000_read_phy_reg(&adapter->hw, PHY_CONTROL, &phy_tmp)) { phy_tmp |= (MII_CR_AUTO_NEG_EN | MII_CR_RESTART_AUTO_NEG); e1000_write_phy_reg(&adapter->hw, PHY_CONTROL, phy_tmp); } } } return; } else if(adapter->smartspeed == EM_SMARTSPEED_DOWNSHIFT) { /* If still no link, perhaps using 2/3 pair cable */ e1000_read_phy_reg(&adapter->hw, PHY_1000T_CTRL, &phy_tmp); phy_tmp |= CR_1000T_MS_ENABLE; e1000_write_phy_reg(&adapter->hw, PHY_1000T_CTRL, phy_tmp); if(adapter->hw.mac.autoneg && !e1000_copper_link_autoneg(&adapter->hw) && !e1000_read_phy_reg(&adapter->hw, PHY_CONTROL, &phy_tmp)) { phy_tmp |= (MII_CR_AUTO_NEG_EN | MII_CR_RESTART_AUTO_NEG); e1000_write_phy_reg(&adapter->hw, PHY_CONTROL, phy_tmp); } } /* Restart process after EM_SMARTSPEED_MAX iterations */ if(adapter->smartspeed++ == EM_SMARTSPEED_MAX) adapter->smartspeed = 0; } /* * Manage DMA'able memory. */ static void lem_dmamap_cb(void *arg, bus_dma_segment_t *segs, int nseg, int error) { if (error) return; *(bus_addr_t *) arg = segs[0].ds_addr; } static int lem_dma_malloc(struct adapter *adapter, bus_size_t size, struct em_dma_alloc *dma, int mapflags) { int error; error = bus_dma_tag_create(bus_get_dma_tag(adapter->dev), /* parent */ EM_DBA_ALIGN, 0, /* alignment, bounds */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ size, /* maxsize */ 1, /* nsegments */ size, /* maxsegsize */ 0, /* flags */ NULL, /* lockfunc */ NULL, /* lockarg */ &dma->dma_tag); if (error) { device_printf(adapter->dev, "%s: bus_dma_tag_create failed: %d\n", __func__, error); goto fail_0; } error = bus_dmamem_alloc(dma->dma_tag, (void**) &dma->dma_vaddr, BUS_DMA_NOWAIT | BUS_DMA_COHERENT, &dma->dma_map); if (error) { device_printf(adapter->dev, "%s: bus_dmamem_alloc(%ju) failed: %d\n", __func__, (uintmax_t)size, error); goto fail_2; } dma->dma_paddr = 0; error = bus_dmamap_load(dma->dma_tag, dma->dma_map, dma->dma_vaddr, size, lem_dmamap_cb, &dma->dma_paddr, mapflags | BUS_DMA_NOWAIT); if (error || dma->dma_paddr == 0) { device_printf(adapter->dev, "%s: bus_dmamap_load failed: %d\n", __func__, error); goto fail_3; } return (0); fail_3: bus_dmamap_unload(dma->dma_tag, dma->dma_map); fail_2: bus_dmamem_free(dma->dma_tag, dma->dma_vaddr, dma->dma_map); bus_dma_tag_destroy(dma->dma_tag); fail_0: dma->dma_map = NULL; dma->dma_tag = NULL; return (error); } static void lem_dma_free(struct adapter *adapter, struct em_dma_alloc *dma) { if (dma->dma_tag == NULL) return; if (dma->dma_map != NULL) { bus_dmamap_sync(dma->dma_tag, dma->dma_map, BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(dma->dma_tag, dma->dma_map); bus_dmamem_free(dma->dma_tag, dma->dma_vaddr, dma->dma_map); dma->dma_map = NULL; } bus_dma_tag_destroy(dma->dma_tag); dma->dma_tag = NULL; } /********************************************************************* * * Allocate memory for tx_buffer structures. The tx_buffer stores all * the information needed to transmit a packet on the wire. 
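 *  (Each em_buffer carries the mbuf chain pointer, its bus_dma map,
 *  and the index of the EOP descriptor used for completion tracking.)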
* **********************************************************************/ static int lem_allocate_transmit_structures(struct adapter *adapter) { device_t dev = adapter->dev; struct em_buffer *tx_buffer; int error; /* * Create DMA tags for tx descriptors */ if ((error = bus_dma_tag_create(bus_get_dma_tag(dev), /* parent */ 1, 0, /* alignment, bounds */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ MCLBYTES * EM_MAX_SCATTER, /* maxsize */ EM_MAX_SCATTER, /* nsegments */ MCLBYTES, /* maxsegsize */ 0, /* flags */ NULL, /* lockfunc */ NULL, /* lockarg */ &adapter->txtag)) != 0) { device_printf(dev, "Unable to allocate TX DMA tag\n"); goto fail; } adapter->tx_buffer_area = malloc(sizeof(struct em_buffer) * adapter->num_tx_desc, M_DEVBUF, M_NOWAIT | M_ZERO); if (adapter->tx_buffer_area == NULL) { device_printf(dev, "Unable to allocate tx_buffer memory\n"); error = ENOMEM; goto fail; } /* Create the descriptor buffer dma maps */ for (int i = 0; i < adapter->num_tx_desc; i++) { tx_buffer = &adapter->tx_buffer_area[i]; error = bus_dmamap_create(adapter->txtag, 0, &tx_buffer->map); if (error != 0) { device_printf(dev, "Unable to create TX DMA map\n"); goto fail; } tx_buffer->next_eop = -1; } return (0); fail: lem_free_transmit_structures(adapter); return (error); } /********************************************************************* * * (Re)Initialize transmit structures. * **********************************************************************/ static void lem_setup_transmit_structures(struct adapter *adapter) { struct em_buffer *tx_buffer; #ifdef DEV_NETMAP /* we are already locked */ struct netmap_adapter *na = NA(adapter->ifp); struct netmap_slot *slot = netmap_reset(na, NR_TX, 0, 0); #endif /* DEV_NETMAP */ /* Clear the old ring contents */ bzero(adapter->tx_desc_base, (sizeof(struct e1000_tx_desc)) * adapter->num_tx_desc); /* Free any existing TX buffers */ for (int i = 0; i < adapter->num_tx_desc; i++, tx_buffer++) { tx_buffer = &adapter->tx_buffer_area[i]; bus_dmamap_sync(adapter->txtag, tx_buffer->map, BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(adapter->txtag, tx_buffer->map); m_freem(tx_buffer->m_head); tx_buffer->m_head = NULL; #ifdef DEV_NETMAP if (slot) { /* the i-th NIC entry goes to slot si */ int si = netmap_idx_n2k(&na->tx_rings[0], i); uint64_t paddr; void *addr; addr = PNMB(slot + si, &paddr); adapter->tx_desc_base[i].buffer_addr = htole64(paddr); /* reload the map for netmap mode */ netmap_load_map(adapter->txtag, tx_buffer->map, addr); } #endif /* DEV_NETMAP */ tx_buffer->next_eop = -1; } /* Reset state */ adapter->last_hw_offload = 0; adapter->next_avail_tx_desc = 0; adapter->next_tx_to_clean = 0; adapter->num_tx_desc_avail = adapter->num_tx_desc; bus_dmamap_sync(adapter->txdma.dma_tag, adapter->txdma.dma_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); return; } /********************************************************************* * * Enable transmit unit. 
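 *  (Programs the TX ring base and length (TDBAL/TDBAH/TDLEN), the
 *  head/tail pointers (TDH/TDT), the inter-packet gap (TIPG), and
 *  finally enables the unit through TCTL.)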
* **********************************************************************/ static void lem_initialize_transmit_unit(struct adapter *adapter) { u32 tctl, tipg = 0; u64 bus_addr; INIT_DEBUGOUT("lem_initialize_transmit_unit: begin"); /* Setup the Base and Length of the Tx Descriptor Ring */ bus_addr = adapter->txdma.dma_paddr; E1000_WRITE_REG(&adapter->hw, E1000_TDLEN(0), adapter->num_tx_desc * sizeof(struct e1000_tx_desc)); E1000_WRITE_REG(&adapter->hw, E1000_TDBAH(0), (u32)(bus_addr >> 32)); E1000_WRITE_REG(&adapter->hw, E1000_TDBAL(0), (u32)bus_addr); /* Setup the HW Tx Head and Tail descriptor pointers */ E1000_WRITE_REG(&adapter->hw, E1000_TDT(0), 0); E1000_WRITE_REG(&adapter->hw, E1000_TDH(0), 0); HW_DEBUGOUT2("Base = %x, Length = %x\n", E1000_READ_REG(&adapter->hw, E1000_TDBAL(0)), E1000_READ_REG(&adapter->hw, E1000_TDLEN(0))); /* Set the default values for the Tx Inter Packet Gap timer */ switch (adapter->hw.mac.type) { case e1000_82542: tipg = DEFAULT_82542_TIPG_IPGT; tipg |= DEFAULT_82542_TIPG_IPGR1 << E1000_TIPG_IPGR1_SHIFT; tipg |= DEFAULT_82542_TIPG_IPGR2 << E1000_TIPG_IPGR2_SHIFT; break; default: if ((adapter->hw.phy.media_type == e1000_media_type_fiber) || (adapter->hw.phy.media_type == e1000_media_type_internal_serdes)) tipg = DEFAULT_82543_TIPG_IPGT_FIBER; else tipg = DEFAULT_82543_TIPG_IPGT_COPPER; tipg |= DEFAULT_82543_TIPG_IPGR1 << E1000_TIPG_IPGR1_SHIFT; tipg |= DEFAULT_82543_TIPG_IPGR2 << E1000_TIPG_IPGR2_SHIFT; } E1000_WRITE_REG(&adapter->hw, E1000_TIPG, tipg); E1000_WRITE_REG(&adapter->hw, E1000_TIDV, adapter->tx_int_delay.value); if(adapter->hw.mac.type >= e1000_82540) E1000_WRITE_REG(&adapter->hw, E1000_TADV, adapter->tx_abs_int_delay.value); /* Program the Transmit Control Register */ tctl = E1000_READ_REG(&adapter->hw, E1000_TCTL); tctl &= ~E1000_TCTL_CT; tctl |= (E1000_TCTL_PSP | E1000_TCTL_RTLC | E1000_TCTL_EN | (E1000_COLLISION_THRESHOLD << E1000_CT_SHIFT)); /* This write will effectively turn on the transmit unit. */ E1000_WRITE_REG(&adapter->hw, E1000_TCTL, tctl); /* Setup Transmit Descriptor Base Settings */ adapter->txd_cmd = E1000_TXD_CMD_IFCS; if (adapter->tx_int_delay.value > 0) adapter->txd_cmd |= E1000_TXD_CMD_IDE; } /********************************************************************* * * Free all transmit related data structures. 
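 *
 *  Also used as the error-unwind path by
 *  lem_allocate_transmit_structures(), so every pointer is checked
 *  before it is unloaded, destroyed or freed.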
* **********************************************************************/ static void lem_free_transmit_structures(struct adapter *adapter) { struct em_buffer *tx_buffer; INIT_DEBUGOUT("free_transmit_structures: begin"); if (adapter->tx_buffer_area != NULL) { for (int i = 0; i < adapter->num_tx_desc; i++) { tx_buffer = &adapter->tx_buffer_area[i]; if (tx_buffer->m_head != NULL) { bus_dmamap_sync(adapter->txtag, tx_buffer->map, BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(adapter->txtag, tx_buffer->map); m_freem(tx_buffer->m_head); tx_buffer->m_head = NULL; } else if (tx_buffer->map != NULL) bus_dmamap_unload(adapter->txtag, tx_buffer->map); if (tx_buffer->map != NULL) { bus_dmamap_destroy(adapter->txtag, tx_buffer->map); tx_buffer->map = NULL; } } } if (adapter->tx_buffer_area != NULL) { free(adapter->tx_buffer_area, M_DEVBUF); adapter->tx_buffer_area = NULL; } if (adapter->txtag != NULL) { bus_dma_tag_destroy(adapter->txtag); adapter->txtag = NULL; } #if __FreeBSD_version >= 800000 if (adapter->br != NULL) buf_ring_free(adapter->br, M_DEVBUF); #endif } /********************************************************************* * * The offload context needs to be set when we transfer the first * packet of a particular protocol (TCP/UDP). This routine has been * enhanced to deal with inserted VLAN headers, and IPV6 (not complete) * * Added back the old method of keeping the current context type * and not setting if unnecessary, as this is reported to be a * big performance win. -jfv **********************************************************************/ static void lem_transmit_checksum_setup(struct adapter *adapter, struct mbuf *mp, u32 *txd_upper, u32 *txd_lower) { struct e1000_context_desc *TXD = NULL; struct em_buffer *tx_buffer; struct ether_vlan_header *eh; struct ip *ip = NULL; struct ip6_hdr *ip6; int curr_txd, ehdrlen; u32 cmd, hdr_len, ip_hlen; u16 etype; u8 ipproto; cmd = hdr_len = ipproto = 0; *txd_upper = *txd_lower = 0; curr_txd = adapter->next_avail_tx_desc; /* * Determine where frame payload starts. * Jump over vlan headers if already present, * helpful for QinQ too. */ eh = mtod(mp, struct ether_vlan_header *); if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) { etype = ntohs(eh->evl_proto); ehdrlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN; } else { etype = ntohs(eh->evl_encap_proto); ehdrlen = ETHER_HDR_LEN; } /* * We only support TCP/UDP for IPv4 and IPv6 for the moment. * TODO: Support SCTP too when it hits the tree. */ switch (etype) { case ETHERTYPE_IP: ip = (struct ip *)(mp->m_data + ehdrlen); ip_hlen = ip->ip_hl << 2; /* Setup of IP header checksum. */ if (mp->m_pkthdr.csum_flags & CSUM_IP) { /* * Start offset for header checksum calculation. * End offset for header checksum calculation. * Offset of place to put the checksum. */ TXD = (struct e1000_context_desc *) &adapter->tx_desc_base[curr_txd]; TXD->lower_setup.ip_fields.ipcss = ehdrlen; TXD->lower_setup.ip_fields.ipcse = htole16(ehdrlen + ip_hlen); TXD->lower_setup.ip_fields.ipcso = ehdrlen + offsetof(struct ip, ip_sum); cmd |= E1000_TXD_CMD_IP; *txd_upper |= E1000_TXD_POPTS_IXSM << 8; } hdr_len = ehdrlen + ip_hlen; ipproto = ip->ip_p; break; case ETHERTYPE_IPV6: ip6 = (struct ip6_hdr *)(mp->m_data + ehdrlen); ip_hlen = sizeof(struct ip6_hdr); /* XXX: No header stacking. */ /* IPv6 doesn't have a header checksum. 
*/ hdr_len = ehdrlen + ip_hlen; ipproto = ip6->ip6_nxt; break; default: return; } switch (ipproto) { case IPPROTO_TCP: if (mp->m_pkthdr.csum_flags & CSUM_TCP) { *txd_lower = E1000_TXD_CMD_DEXT | E1000_TXD_DTYP_D; *txd_upper |= E1000_TXD_POPTS_TXSM << 8; /* no need for context if already set */ if (adapter->last_hw_offload == CSUM_TCP) return; adapter->last_hw_offload = CSUM_TCP; /* * Start offset for payload checksum calculation. * End offset for payload checksum calculation. * Offset of place to put the checksum. */ TXD = (struct e1000_context_desc *) &adapter->tx_desc_base[curr_txd]; TXD->upper_setup.tcp_fields.tucss = hdr_len; TXD->upper_setup.tcp_fields.tucse = htole16(0); TXD->upper_setup.tcp_fields.tucso = hdr_len + offsetof(struct tcphdr, th_sum); cmd |= E1000_TXD_CMD_TCP; } break; case IPPROTO_UDP: { if (mp->m_pkthdr.csum_flags & CSUM_UDP) { *txd_lower = E1000_TXD_CMD_DEXT | E1000_TXD_DTYP_D; *txd_upper |= E1000_TXD_POPTS_TXSM << 8; /* no need for context if already set */ if (adapter->last_hw_offload == CSUM_UDP) return; adapter->last_hw_offload = CSUM_UDP; /* * Start offset for header checksum calculation. * End offset for header checksum calculation. * Offset of place to put the checksum. */ TXD = (struct e1000_context_desc *) &adapter->tx_desc_base[curr_txd]; TXD->upper_setup.tcp_fields.tucss = hdr_len; TXD->upper_setup.tcp_fields.tucse = htole16(0); TXD->upper_setup.tcp_fields.tucso = hdr_len + offsetof(struct udphdr, uh_sum); } /* Fall Thru */ } default: break; } if (TXD == NULL) return; TXD->tcp_seg_setup.data = htole32(0); TXD->cmd_and_length = htole32(adapter->txd_cmd | E1000_TXD_CMD_DEXT | cmd); tx_buffer = &adapter->tx_buffer_area[curr_txd]; tx_buffer->m_head = NULL; tx_buffer->next_eop = -1; if (++curr_txd == adapter->num_tx_desc) curr_txd = 0; adapter->num_tx_desc_avail--; adapter->next_avail_tx_desc = curr_txd; } /********************************************************************** * * Examine each tx_buffer in the used queue. If the hardware is done * processing the packet then free associated resources. The * tx_buffer is put back on the free queue. * **********************************************************************/ static void lem_txeof(struct adapter *adapter) { int first, last, done, num_avail; struct em_buffer *tx_buffer; struct e1000_tx_desc *tx_desc, *eop_desc; struct ifnet *ifp = adapter->ifp; EM_TX_LOCK_ASSERT(adapter); #ifdef DEV_NETMAP if (netmap_tx_irq(ifp, 0)) return; #endif /* DEV_NETMAP */ if (adapter->num_tx_desc_avail == adapter->num_tx_desc) return; num_avail = adapter->num_tx_desc_avail; first = adapter->next_tx_to_clean; tx_desc = &adapter->tx_desc_base[first]; tx_buffer = &adapter->tx_buffer_area[first]; last = tx_buffer->next_eop; eop_desc = &adapter->tx_desc_base[last]; /* * What this does is get the index of the * first descriptor AFTER the EOP of the * first packet, that way we can do the * simple comparison on the inner while loop. 
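 * For example, if the first packet occupies descriptors 3..5 then
 * next_eop is 5 and "done" becomes 6; the inner loop cleans 3, 4 and 5
 * and then checks whether descriptor 6 starts another completed packet.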
*/ if (++last == adapter->num_tx_desc) last = 0; done = last; bus_dmamap_sync(adapter->txdma.dma_tag, adapter->txdma.dma_map, BUS_DMASYNC_POSTREAD); while (eop_desc->upper.fields.status & E1000_TXD_STAT_DD) { /* We clean the range of the packet */ while (first != done) { tx_desc->upper.data = 0; tx_desc->lower.data = 0; tx_desc->buffer_addr = 0; ++num_avail; if (tx_buffer->m_head) { ifp->if_opackets++; bus_dmamap_sync(adapter->txtag, tx_buffer->map, BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(adapter->txtag, tx_buffer->map); m_freem(tx_buffer->m_head); tx_buffer->m_head = NULL; } tx_buffer->next_eop = -1; adapter->watchdog_time = ticks; if (++first == adapter->num_tx_desc) first = 0; tx_buffer = &adapter->tx_buffer_area[first]; tx_desc = &adapter->tx_desc_base[first]; } /* See if we can continue to the next packet */ last = tx_buffer->next_eop; if (last != -1) { eop_desc = &adapter->tx_desc_base[last]; /* Get new done point */ if (++last == adapter->num_tx_desc) last = 0; done = last; } else break; } bus_dmamap_sync(adapter->txdma.dma_tag, adapter->txdma.dma_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); adapter->next_tx_to_clean = first; adapter->num_tx_desc_avail = num_avail; /* * If we have enough room, clear IFF_DRV_OACTIVE to * tell the stack that it is OK to send packets. * If there are no pending descriptors, clear the watchdog. */ if (adapter->num_tx_desc_avail > EM_TX_CLEANUP_THRESHOLD) { ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; if (adapter->num_tx_desc_avail == adapter->num_tx_desc) { adapter->watchdog_check = FALSE; return; } } } /********************************************************************* * * When link is lost there is sometimes work still in the TX ring * which may result in a watchdog; rather than allow that, we do an * attempted cleanup and then reinit here. Note that this has been * seen mostly with fiber adapters. * **********************************************************************/ static void lem_tx_purge(struct adapter *adapter) { if ((!adapter->link_active) && (adapter->watchdog_check)) { EM_TX_LOCK(adapter); lem_txeof(adapter); EM_TX_UNLOCK(adapter); if (adapter->watchdog_check) /* Still outstanding? */ lem_init_locked(adapter); } } /********************************************************************* * * Get a buffer from the system mbuf buffer pool. * **********************************************************************/ static int lem_get_buf(struct adapter *adapter, int i) { struct mbuf *m; bus_dma_segment_t segs[1]; bus_dmamap_t map; struct em_buffer *rx_buffer; int error, nsegs; m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); if (m == NULL) { adapter->mbuf_cluster_failed++; return (ENOBUFS); } m->m_len = m->m_pkthdr.len = MCLBYTES; if (adapter->max_frame_size <= (MCLBYTES - ETHER_ALIGN)) m_adj(m, ETHER_ALIGN); /* * Using memory from the mbuf cluster pool, invoke the * bus_dma machinery to arrange the memory mapping. */ error = bus_dmamap_load_mbuf_sg(adapter->rxtag, adapter->rx_sparemap, m, segs, &nsegs, BUS_DMA_NOWAIT); if (error != 0) { m_free(m); return (error); } /* If nsegs is wrong then the stack is corrupt.
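 * The RX DMA tag is created with nsegments = 1 and maxsize = MCLBYTES,
 * so a single cluster must always map to exactly one segment.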
*/ KASSERT(nsegs == 1, ("Too many segments returned!")); rx_buffer = &adapter->rx_buffer_area[i]; if (rx_buffer->m_head != NULL) bus_dmamap_unload(adapter->rxtag, rx_buffer->map); map = rx_buffer->map; rx_buffer->map = adapter->rx_sparemap; adapter->rx_sparemap = map; bus_dmamap_sync(adapter->rxtag, rx_buffer->map, BUS_DMASYNC_PREREAD); rx_buffer->m_head = m; adapter->rx_desc_base[i].buffer_addr = htole64(segs[0].ds_addr); return (0); } /********************************************************************* * * Allocate memory for rx_buffer structures. Since we use one * rx_buffer per received packet, the maximum number of rx_buffer's * that we'll need is equal to the number of receive descriptors * that we've allocated. * **********************************************************************/ static int lem_allocate_receive_structures(struct adapter *adapter) { device_t dev = adapter->dev; struct em_buffer *rx_buffer; int i, error; adapter->rx_buffer_area = malloc(sizeof(struct em_buffer) * adapter->num_rx_desc, M_DEVBUF, M_NOWAIT | M_ZERO); if (adapter->rx_buffer_area == NULL) { device_printf(dev, "Unable to allocate rx_buffer memory\n"); return (ENOMEM); } error = bus_dma_tag_create(bus_get_dma_tag(dev), /* parent */ 1, 0, /* alignment, bounds */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ MCLBYTES, /* maxsize */ 1, /* nsegments */ MCLBYTES, /* maxsegsize */ 0, /* flags */ NULL, /* lockfunc */ NULL, /* lockarg */ &adapter->rxtag); if (error) { device_printf(dev, "%s: bus_dma_tag_create failed %d\n", __func__, error); goto fail; } /* Create the spare map (used by getbuf) */ error = bus_dmamap_create(adapter->rxtag, BUS_DMA_NOWAIT, &adapter->rx_sparemap); if (error) { device_printf(dev, "%s: bus_dmamap_create failed: %d\n", __func__, error); goto fail; } rx_buffer = adapter->rx_buffer_area; for (i = 0; i < adapter->num_rx_desc; i++, rx_buffer++) { error = bus_dmamap_create(adapter->rxtag, BUS_DMA_NOWAIT, &rx_buffer->map); if (error) { device_printf(dev, "%s: bus_dmamap_create failed: %d\n", __func__, error); goto fail; } } return (0); fail: lem_free_receive_structures(adapter); return (error); } /********************************************************************* * * (Re)initialize receive structures. * **********************************************************************/ static int lem_setup_receive_structures(struct adapter *adapter) { struct em_buffer *rx_buffer; int i, error; #ifdef DEV_NETMAP /* we are already under lock */ struct netmap_adapter *na = NA(adapter->ifp); struct netmap_slot *slot = netmap_reset(na, NR_RX, 0, 0); #endif /* Reset descriptor ring */ bzero(adapter->rx_desc_base, (sizeof(struct e1000_rx_desc)) * adapter->num_rx_desc); /* Free current RX buffers. */ rx_buffer = adapter->rx_buffer_area; for (i = 0; i < adapter->num_rx_desc; i++, rx_buffer++) { if (rx_buffer->m_head != NULL) { bus_dmamap_sync(adapter->rxtag, rx_buffer->map, BUS_DMASYNC_POSTREAD); bus_dmamap_unload(adapter->rxtag, rx_buffer->map); m_freem(rx_buffer->m_head); rx_buffer->m_head = NULL; } } /* Allocate new ones. 
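 * In netmap mode the loop below instead points each descriptor at a
 * netmap-supplied buffer and skips the mbuf allocation entirely.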
*/ for (i = 0; i < adapter->num_rx_desc; i++) { #ifdef DEV_NETMAP if (slot) { /* the i-th NIC entry goes to slot si */ int si = netmap_idx_n2k(&na->rx_rings[0], i); uint64_t paddr; void *addr; addr = PNMB(slot + si, &paddr); netmap_load_map(adapter->rxtag, rx_buffer->map, addr); /* Update descriptor */ adapter->rx_desc_base[i].buffer_addr = htole64(paddr); continue; } #endif /* DEV_NETMAP */ error = lem_get_buf(adapter, i); if (error) return (error); } /* Setup our descriptor pointers */ adapter->next_rx_desc_to_check = 0; bus_dmamap_sync(adapter->rxdma.dma_tag, adapter->rxdma.dma_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); return (0); } /********************************************************************* * * Enable receive unit. * **********************************************************************/ static void lem_initialize_receive_unit(struct adapter *adapter) { struct ifnet *ifp = adapter->ifp; u64 bus_addr; u32 rctl, rxcsum; INIT_DEBUGOUT("lem_initialize_receive_unit: begin"); /* * Make sure receives are disabled while setting * up the descriptor ring */ rctl = E1000_READ_REG(&adapter->hw, E1000_RCTL); E1000_WRITE_REG(&adapter->hw, E1000_RCTL, rctl & ~E1000_RCTL_EN); if (adapter->hw.mac.type >= e1000_82540) { E1000_WRITE_REG(&adapter->hw, E1000_RADV, adapter->rx_abs_int_delay.value); /* * Set the interrupt throttling rate. Value is calculated * as DEFAULT_ITR = 1/(MAX_INTS_PER_SEC * 256ns) */ E1000_WRITE_REG(&adapter->hw, E1000_ITR, DEFAULT_ITR); } /* Setup the Base and Length of the Rx Descriptor Ring */ bus_addr = adapter->rxdma.dma_paddr; E1000_WRITE_REG(&adapter->hw, E1000_RDLEN(0), adapter->num_rx_desc * sizeof(struct e1000_rx_desc)); E1000_WRITE_REG(&adapter->hw, E1000_RDBAH(0), (u32)(bus_addr >> 32)); E1000_WRITE_REG(&adapter->hw, E1000_RDBAL(0), (u32)bus_addr); /* Setup the Receive Control Register */ rctl &= ~(3 << E1000_RCTL_MO_SHIFT); rctl |= E1000_RCTL_EN | E1000_RCTL_BAM | E1000_RCTL_LBM_NO | E1000_RCTL_RDMTS_HALF | (adapter->hw.mac.mc_filter_type << E1000_RCTL_MO_SHIFT); /* Make sure VLAN Filters are off */ rctl &= ~E1000_RCTL_VFE; if (e1000_tbi_sbp_enabled_82543(&adapter->hw)) rctl |= E1000_RCTL_SBP; else rctl &= ~E1000_RCTL_SBP; switch (adapter->rx_buffer_len) { default: case 2048: rctl |= E1000_RCTL_SZ_2048; break; case 4096: rctl |= E1000_RCTL_SZ_4096 | E1000_RCTL_BSEX | E1000_RCTL_LPE; break; case 8192: rctl |= E1000_RCTL_SZ_8192 | E1000_RCTL_BSEX | E1000_RCTL_LPE; break; case 16384: rctl |= E1000_RCTL_SZ_16384 | E1000_RCTL_BSEX | E1000_RCTL_LPE; break; } if (ifp->if_mtu > ETHERMTU) rctl |= E1000_RCTL_LPE; else rctl &= ~E1000_RCTL_LPE; /* Enable 82543 Receive Checksum Offload for TCP and UDP */ if ((adapter->hw.mac.type >= e1000_82543) && (ifp->if_capenable & IFCAP_RXCSUM)) { rxcsum = E1000_READ_REG(&adapter->hw, E1000_RXCSUM); rxcsum |= (E1000_RXCSUM_IPOFL | E1000_RXCSUM_TUOFL); E1000_WRITE_REG(&adapter->hw, E1000_RXCSUM, rxcsum); } /* Enable Receives */ E1000_WRITE_REG(&adapter->hw, E1000_RCTL, rctl); /* * Setup the HW Rx Head and * Tail Descriptor Pointers */ E1000_WRITE_REG(&adapter->hw, E1000_RDH(0), 0); rctl = adapter->num_rx_desc - 1; /* default RDT value */ #ifdef DEV_NETMAP /* preserve buffers already made available to clients */ if (ifp->if_capenable & IFCAP_NETMAP) - rctl -= NA(adapter->ifp)->rx_rings[0].nr_hwavail; + rctl -= nm_kr_rxspace(&NA(adapter->ifp)->rx_rings[0]); #endif /* DEV_NETMAP */ E1000_WRITE_REG(&adapter->hw, E1000_RDT(0), rctl); return; } /********************************************************************* * * Free receive 
related data structures. * **********************************************************************/ static void lem_free_receive_structures(struct adapter *adapter) { struct em_buffer *rx_buffer; int i; INIT_DEBUGOUT("free_receive_structures: begin"); if (adapter->rx_sparemap) { bus_dmamap_destroy(adapter->rxtag, adapter->rx_sparemap); adapter->rx_sparemap = NULL; } /* Cleanup any existing buffers */ if (adapter->rx_buffer_area != NULL) { rx_buffer = adapter->rx_buffer_area; for (i = 0; i < adapter->num_rx_desc; i++, rx_buffer++) { if (rx_buffer->m_head != NULL) { bus_dmamap_sync(adapter->rxtag, rx_buffer->map, BUS_DMASYNC_POSTREAD); bus_dmamap_unload(adapter->rxtag, rx_buffer->map); m_freem(rx_buffer->m_head); rx_buffer->m_head = NULL; } else if (rx_buffer->map != NULL) bus_dmamap_unload(adapter->rxtag, rx_buffer->map); if (rx_buffer->map != NULL) { bus_dmamap_destroy(adapter->rxtag, rx_buffer->map); rx_buffer->map = NULL; } } } if (adapter->rx_buffer_area != NULL) { free(adapter->rx_buffer_area, M_DEVBUF); adapter->rx_buffer_area = NULL; } if (adapter->rxtag != NULL) { bus_dma_tag_destroy(adapter->rxtag); adapter->rxtag = NULL; } } /********************************************************************* * * This routine executes in interrupt context. It replenishes * the mbufs in the descriptor and sends data which has been * dma'ed into host memory to upper layer. * * We loop at most count times if count is > 0, or until done if * count < 0. * * For polling we also now return the number of cleaned packets *********************************************************************/ static bool lem_rxeof(struct adapter *adapter, int count, int *done) { struct ifnet *ifp = adapter->ifp; struct mbuf *mp; u8 status = 0, accept_frame = 0, eop = 0; u16 len, desc_len, prev_len_adj; int i, rx_sent = 0; struct e1000_rx_desc *current_desc; EM_RX_LOCK(adapter); i = adapter->next_rx_desc_to_check; current_desc = &adapter->rx_desc_base[i]; bus_dmamap_sync(adapter->rxdma.dma_tag, adapter->rxdma.dma_map, BUS_DMASYNC_POSTREAD); #ifdef DEV_NETMAP if (netmap_rx_irq(ifp, 0, &rx_sent)) { EM_RX_UNLOCK(adapter); return (FALSE); } #endif /* DEV_NETMAP */ if (!((current_desc->status) & E1000_RXD_STAT_DD)) { if (done != NULL) *done = rx_sent; EM_RX_UNLOCK(adapter); return (FALSE); } while (count != 0 && ifp->if_drv_flags & IFF_DRV_RUNNING) { struct mbuf *m = NULL; status = current_desc->status; if ((status & E1000_RXD_STAT_DD) == 0) break; mp = adapter->rx_buffer_area[i].m_head; /* * Can't defer bus_dmamap_sync(9) because TBI_ACCEPT * needs to access the last received byte in the mbuf. 
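 * (The TBI workaround reads that final byte to decide whether a frame
 * flagged with an error may still be accepted and counted.)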
*/ bus_dmamap_sync(adapter->rxtag, adapter->rx_buffer_area[i].map, BUS_DMASYNC_POSTREAD); accept_frame = 1; prev_len_adj = 0; desc_len = le16toh(current_desc->length); if (status & E1000_RXD_STAT_EOP) { count--; eop = 1; if (desc_len < ETHER_CRC_LEN) { len = 0; prev_len_adj = ETHER_CRC_LEN - desc_len; } else len = desc_len - ETHER_CRC_LEN; } else { eop = 0; len = desc_len; } if (current_desc->errors & E1000_RXD_ERR_FRAME_ERR_MASK) { u8 last_byte; u32 pkt_len = desc_len; if (adapter->fmp != NULL) pkt_len += adapter->fmp->m_pkthdr.len; last_byte = *(mtod(mp, caddr_t) + desc_len - 1); if (TBI_ACCEPT(&adapter->hw, status, current_desc->errors, pkt_len, last_byte, adapter->min_frame_size, adapter->max_frame_size)) { e1000_tbi_adjust_stats_82543(&adapter->hw, &adapter->stats, pkt_len, adapter->hw.mac.addr, adapter->max_frame_size); if (len > 0) len--; } else accept_frame = 0; } if (accept_frame) { if (lem_get_buf(adapter, i) != 0) { ifp->if_iqdrops++; goto discard; } /* Assign correct length to the current fragment */ mp->m_len = len; if (adapter->fmp == NULL) { mp->m_pkthdr.len = len; adapter->fmp = mp; /* Store the first mbuf */ adapter->lmp = mp; } else { /* Chain mbuf's together */ mp->m_flags &= ~M_PKTHDR; /* * Adjust length of previous mbuf in chain if * we received less than 4 bytes in the last * descriptor. */ if (prev_len_adj > 0) { adapter->lmp->m_len -= prev_len_adj; adapter->fmp->m_pkthdr.len -= prev_len_adj; } adapter->lmp->m_next = mp; adapter->lmp = adapter->lmp->m_next; adapter->fmp->m_pkthdr.len += len; } if (eop) { adapter->fmp->m_pkthdr.rcvif = ifp; ifp->if_ipackets++; lem_receive_checksum(adapter, current_desc, adapter->fmp); #ifndef __NO_STRICT_ALIGNMENT if (adapter->max_frame_size > (MCLBYTES - ETHER_ALIGN) && lem_fixup_rx(adapter) != 0) goto skip; #endif if (status & E1000_RXD_STAT_VP) { adapter->fmp->m_pkthdr.ether_vtag = le16toh(current_desc->special); adapter->fmp->m_flags |= M_VLANTAG; } #ifndef __NO_STRICT_ALIGNMENT skip: #endif m = adapter->fmp; adapter->fmp = NULL; adapter->lmp = NULL; } } else { adapter->dropped_pkts++; discard: /* Reuse loaded DMA map and just update mbuf chain */ mp = adapter->rx_buffer_area[i].m_head; mp->m_len = mp->m_pkthdr.len = MCLBYTES; mp->m_data = mp->m_ext.ext_buf; mp->m_next = NULL; if (adapter->max_frame_size <= (MCLBYTES - ETHER_ALIGN)) m_adj(mp, ETHER_ALIGN); if (adapter->fmp != NULL) { m_freem(adapter->fmp); adapter->fmp = NULL; adapter->lmp = NULL; } m = NULL; } /* Zero out the receive descriptors status. */ current_desc->status = 0; bus_dmamap_sync(adapter->rxdma.dma_tag, adapter->rxdma.dma_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); /* Advance our pointers to the next descriptor. */ if (++i == adapter->num_rx_desc) i = 0; /* Call into the stack */ if (m != NULL) { adapter->next_rx_desc_to_check = i; EM_RX_UNLOCK(adapter); (*ifp->if_input)(ifp, m); EM_RX_LOCK(adapter); rx_sent++; i = adapter->next_rx_desc_to_check; } current_desc = &adapter->rx_desc_base[i]; } adapter->next_rx_desc_to_check = i; /* Advance the E1000's Receive Queue #0 "Tail Pointer". */ if (--i < 0) i = adapter->num_rx_desc - 1; E1000_WRITE_REG(&adapter->hw, E1000_RDT(0), i); if (done != NULL) *done = rx_sent; EM_RX_UNLOCK(adapter); return ((status & E1000_RXD_STAT_DD) ? TRUE : FALSE); } #ifndef __NO_STRICT_ALIGNMENT /* * When jumbo frames are enabled we should realign the entire payload on * architectures with strict alignment. This is a serious design mistake of the * 8254x, as it nullifies DMA operations. The 8254x just allows the RX buffer size * to be 2048/4096/8192/16384.
What we really want is 2048 - ETHER_ALIGN to align its * payload. On architectures without strict alignment restrictions the 8254x still * performs unaligned memory accesses, which reduce performance as well. * To avoid copying over an entire frame to align, we allocate a new mbuf and * copy the ethernet header to the new mbuf. The new mbuf is prepended to the * existing mbuf chain. * * Be aware, best performance of the 8254x is achieved only when jumbo frames are * not used at all on architectures with strict alignment. */ static int lem_fixup_rx(struct adapter *adapter) { struct mbuf *m, *n; int error; error = 0; m = adapter->fmp; if (m->m_len <= (MCLBYTES - ETHER_HDR_LEN)) { bcopy(m->m_data, m->m_data + ETHER_HDR_LEN, m->m_len); m->m_data += ETHER_HDR_LEN; } else { MGETHDR(n, M_NOWAIT, MT_DATA); if (n != NULL) { bcopy(m->m_data, n->m_data, ETHER_HDR_LEN); m->m_data += ETHER_HDR_LEN; m->m_len -= ETHER_HDR_LEN; n->m_len = ETHER_HDR_LEN; M_MOVE_PKTHDR(n, m); n->m_next = m; adapter->fmp = n; } else { adapter->dropped_pkts++; m_freem(adapter->fmp); adapter->fmp = NULL; error = ENOMEM; } } return (error); } #endif /********************************************************************* * * Verify that the hardware indicated that the checksum is valid. * Inform the stack about the status of the checksum so that the stack * doesn't spend time verifying it. * *********************************************************************/ static void lem_receive_checksum(struct adapter *adapter, struct e1000_rx_desc *rx_desc, struct mbuf *mp) { /* 82543 or newer only */ if ((adapter->hw.mac.type < e1000_82543) || /* Ignore Checksum bit is set */ (rx_desc->status & E1000_RXD_STAT_IXSM)) { mp->m_pkthdr.csum_flags = 0; return; } if (rx_desc->status & E1000_RXD_STAT_IPCS) { /* Did it pass? */ if (!(rx_desc->errors & E1000_RXD_ERR_IPE)) { /* IP Checksum Good */ mp->m_pkthdr.csum_flags = CSUM_IP_CHECKED; mp->m_pkthdr.csum_flags |= CSUM_IP_VALID; } else { mp->m_pkthdr.csum_flags = 0; } } if (rx_desc->status & E1000_RXD_STAT_TCPCS) { /* Did it pass?
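 * A set E1000_RXD_ERR_TCPE error bit means the hardware saw a bad
 * checksum; in that case no CSUM_DATA_VALID flag is set below and the
 * stack verifies the checksum itself.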
*/ if (!(rx_desc->errors & E1000_RXD_ERR_TCPE)) { mp->m_pkthdr.csum_flags |= (CSUM_DATA_VALID | CSUM_PSEUDO_HDR); mp->m_pkthdr.csum_data = htons(0xffff); } } } /* * This routine is run via a vlan * config EVENT */ static void lem_register_vlan(void *arg, struct ifnet *ifp, u16 vtag) { struct adapter *adapter = ifp->if_softc; u32 index, bit; if (ifp->if_softc != arg) /* Not our event */ return; if ((vtag == 0) || (vtag > 4095)) /* Invalid ID */ return; EM_CORE_LOCK(adapter); index = (vtag >> 5) & 0x7F; bit = vtag & 0x1F; adapter->shadow_vfta[index] |= (1 << bit); ++adapter->num_vlans; /* Re-init to load the changes */ if (ifp->if_capenable & IFCAP_VLAN_HWFILTER) lem_init_locked(adapter); EM_CORE_UNLOCK(adapter); } /* * This routine is run via a vlan * unconfig EVENT */ static void lem_unregister_vlan(void *arg, struct ifnet *ifp, u16 vtag) { struct adapter *adapter = ifp->if_softc; u32 index, bit; if (ifp->if_softc != arg) return; if ((vtag == 0) || (vtag > 4095)) /* Invalid */ return; EM_CORE_LOCK(adapter); index = (vtag >> 5) & 0x7F; bit = vtag & 0x1F; adapter->shadow_vfta[index] &= ~(1 << bit); --adapter->num_vlans; /* Re-init to load the changes */ if (ifp->if_capenable & IFCAP_VLAN_HWFILTER) lem_init_locked(adapter); EM_CORE_UNLOCK(adapter); } static void lem_setup_vlan_hw_support(struct adapter *adapter) { struct e1000_hw *hw = &adapter->hw; u32 reg; /* ** We get here thru init_locked, meaning ** a soft reset; this has already cleared ** the VFTA and other state, so if there ** have been no vlans registered, do nothing. */ if (adapter->num_vlans == 0) return; /* ** A soft reset zeroes out the VFTA, so ** we need to repopulate it now. */ for (int i = 0; i < EM_VFTA_SIZE; i++) if (adapter->shadow_vfta[i] != 0) E1000_WRITE_REG_ARRAY(hw, E1000_VFTA, i, adapter->shadow_vfta[i]); reg = E1000_READ_REG(hw, E1000_CTRL); reg |= E1000_CTRL_VME; E1000_WRITE_REG(hw, E1000_CTRL, reg); /* Enable the Filter Table */ reg = E1000_READ_REG(hw, E1000_RCTL); reg &= ~E1000_RCTL_CFIEN; reg |= E1000_RCTL_VFE; E1000_WRITE_REG(hw, E1000_RCTL, reg); } static void lem_enable_intr(struct adapter *adapter) { struct e1000_hw *hw = &adapter->hw; u32 ims_mask = IMS_ENABLE_MASK; E1000_WRITE_REG(hw, E1000_IMS, ims_mask); } static void lem_disable_intr(struct adapter *adapter) { struct e1000_hw *hw = &adapter->hw; E1000_WRITE_REG(hw, E1000_IMC, 0xffffffff); } /* * Bit of a misnomer: what this really means is * to enable OS management of the system... aka * to disable special hardware management features */ static void lem_init_manageability(struct adapter *adapter) { /* A shared code workaround */ if (adapter->has_manage) { int manc = E1000_READ_REG(&adapter->hw, E1000_MANC); /* disable hardware interception of ARP */ manc &= ~(E1000_MANC_ARP_EN); E1000_WRITE_REG(&adapter->hw, E1000_MANC, manc); } } /* * Give control back to hardware management * controller if there is one. */ static void lem_release_manageability(struct adapter *adapter) { if (adapter->has_manage) { int manc = E1000_READ_REG(&adapter->hw, E1000_MANC); /* re-enable hardware interception of ARP */ manc |= E1000_MANC_ARP_EN; E1000_WRITE_REG(&adapter->hw, E1000_MANC, manc); } } /* * lem_get_hw_control sets the {CTRL_EXT|FWSM}:DRV_LOAD bit. * For ASF and Pass Through versions of f/w this means * that the driver is loaded. For AMT versions of the f/w * this means that the network i/f is open.
*/ static void lem_get_hw_control(struct adapter *adapter) { u32 ctrl_ext; ctrl_ext = E1000_READ_REG(&adapter->hw, E1000_CTRL_EXT); E1000_WRITE_REG(&adapter->hw, E1000_CTRL_EXT, ctrl_ext | E1000_CTRL_EXT_DRV_LOAD); return; } /* * lem_release_hw_control resets {CTRL_EXT|FWSM}:DRV_LOAD bit. * For ASF and Pass Through versions of f/w this means that * the driver is no longer loaded. For AMT versions of the * f/w this means that the network i/f is closed. */ static void lem_release_hw_control(struct adapter *adapter) { u32 ctrl_ext; if (!adapter->has_manage) return; ctrl_ext = E1000_READ_REG(&adapter->hw, E1000_CTRL_EXT); E1000_WRITE_REG(&adapter->hw, E1000_CTRL_EXT, ctrl_ext & ~E1000_CTRL_EXT_DRV_LOAD); return; } static int lem_is_valid_ether_addr(u8 *addr) { char zero_addr[6] = { 0, 0, 0, 0, 0, 0 }; if ((addr[0] & 1) || (!bcmp(addr, zero_addr, ETHER_ADDR_LEN))) { return (FALSE); } return (TRUE); } /* ** Parse the interface capabilities with regard ** to both system management and wake-on-lan for ** later use. */ static void lem_get_wakeup(device_t dev) { struct adapter *adapter = device_get_softc(dev); u16 eeprom_data = 0, device_id, apme_mask; adapter->has_manage = e1000_enable_mng_pass_thru(&adapter->hw); apme_mask = EM_EEPROM_APME; switch (adapter->hw.mac.type) { case e1000_82542: case e1000_82543: break; case e1000_82544: e1000_read_nvm(&adapter->hw, NVM_INIT_CONTROL2_REG, 1, &eeprom_data); apme_mask = EM_82544_APME; break; case e1000_82546: case e1000_82546_rev_3: if (adapter->hw.bus.func == 1) { e1000_read_nvm(&adapter->hw, NVM_INIT_CONTROL3_PORT_B, 1, &eeprom_data); break; } else e1000_read_nvm(&adapter->hw, NVM_INIT_CONTROL3_PORT_A, 1, &eeprom_data); break; default: e1000_read_nvm(&adapter->hw, NVM_INIT_CONTROL3_PORT_A, 1, &eeprom_data); break; } if (eeprom_data & apme_mask) adapter->wol = (E1000_WUFC_MAG | E1000_WUFC_MC); /* * We have the eeprom settings, now apply the special cases * where the eeprom may be wrong or the board won't support * wake on lan on a particular port */ device_id = pci_get_device(dev); switch (device_id) { case E1000_DEV_ID_82546GB_PCIE: adapter->wol = 0; break; case E1000_DEV_ID_82546EB_FIBER: case E1000_DEV_ID_82546GB_FIBER: /* Wake events only supported on port A for dual fiber * regardless of eeprom setting */ if (E1000_READ_REG(&adapter->hw, E1000_STATUS) & E1000_STATUS_FUNC_1) adapter->wol = 0; break; case E1000_DEV_ID_82546GB_QUAD_COPPER_KSP3: /* if quad port adapter, disable WoL on all but port A */ if (global_quad_port_a != 0) adapter->wol = 0; /* Reset for multiple quad port adapters */ if (++global_quad_port_a == 4) global_quad_port_a = 0; break; } return; } /* * Enable PCI Wake On Lan capability */ static void lem_enable_wakeup(device_t dev) { struct adapter *adapter = device_get_softc(dev); struct ifnet *ifp = adapter->ifp; u32 pmc, ctrl, ctrl_ext, rctl; u16 status; if ((pci_find_cap(dev, PCIY_PMG, &pmc) != 0)) return; /* Advertise the wakeup capability */ ctrl = E1000_READ_REG(&adapter->hw, E1000_CTRL); ctrl |= (E1000_CTRL_SWDPIN2 | E1000_CTRL_SWDPIN3); E1000_WRITE_REG(&adapter->hw, E1000_CTRL, ctrl); E1000_WRITE_REG(&adapter->hw, E1000_WUC, E1000_WUC_PME_EN); /* Keep the laser running on Fiber adapters */ if (adapter->hw.phy.media_type == e1000_media_type_fiber || adapter->hw.phy.media_type == e1000_media_type_internal_serdes) { ctrl_ext = E1000_READ_REG(&adapter->hw, E1000_CTRL_EXT); ctrl_ext |= E1000_CTRL_EXT_SDP3_DATA; E1000_WRITE_REG(&adapter->hw, E1000_CTRL_EXT, ctrl_ext); } /* ** Determine type of Wakeup: note that wol ** is set with all 
bits on by default. */ if ((ifp->if_capenable & IFCAP_WOL_MAGIC) == 0) adapter->wol &= ~E1000_WUFC_MAG; if ((ifp->if_capenable & IFCAP_WOL_MCAST) == 0) adapter->wol &= ~E1000_WUFC_MC; else { rctl = E1000_READ_REG(&adapter->hw, E1000_RCTL); rctl |= E1000_RCTL_MPE; E1000_WRITE_REG(&adapter->hw, E1000_RCTL, rctl); } if (adapter->hw.mac.type == e1000_pchlan) { if (lem_enable_phy_wakeup(adapter)) return; } else { E1000_WRITE_REG(&adapter->hw, E1000_WUC, E1000_WUC_PME_EN); E1000_WRITE_REG(&adapter->hw, E1000_WUFC, adapter->wol); } /* Request PME */ status = pci_read_config(dev, pmc + PCIR_POWER_STATUS, 2); status &= ~(PCIM_PSTAT_PME | PCIM_PSTAT_PMEENABLE); if (ifp->if_capenable & IFCAP_WOL) status |= PCIM_PSTAT_PME | PCIM_PSTAT_PMEENABLE; pci_write_config(dev, pmc + PCIR_POWER_STATUS, status, 2); return; } /* ** WOL in the newer chipset interfaces (pchlan) ** requires things to be copied into the PHY */ static int lem_enable_phy_wakeup(struct adapter *adapter) { struct e1000_hw *hw = &adapter->hw; u32 mreg, ret = 0; u16 preg; /* copy MAC RARs to PHY RARs */ for (int i = 0; i < adapter->hw.mac.rar_entry_count; i++) { mreg = E1000_READ_REG(hw, E1000_RAL(i)); e1000_write_phy_reg(hw, BM_RAR_L(i), (u16)(mreg & 0xFFFF)); e1000_write_phy_reg(hw, BM_RAR_M(i), (u16)((mreg >> 16) & 0xFFFF)); mreg = E1000_READ_REG(hw, E1000_RAH(i)); e1000_write_phy_reg(hw, BM_RAR_H(i), (u16)(mreg & 0xFFFF)); e1000_write_phy_reg(hw, BM_RAR_CTRL(i), (u16)((mreg >> 16) & 0xFFFF)); } /* copy MAC MTA to PHY MTA */ for (int i = 0; i < adapter->hw.mac.mta_reg_count; i++) { mreg = E1000_READ_REG_ARRAY(hw, E1000_MTA, i); e1000_write_phy_reg(hw, BM_MTA(i), (u16)(mreg & 0xFFFF)); e1000_write_phy_reg(hw, BM_MTA(i) + 1, (u16)((mreg >> 16) & 0xFFFF)); } /* configure PHY Rx Control register */ e1000_read_phy_reg(&adapter->hw, BM_RCTL, &preg); mreg = E1000_READ_REG(hw, E1000_RCTL); if (mreg & E1000_RCTL_UPE) preg |= BM_RCTL_UPE; if (mreg & E1000_RCTL_MPE) preg |= BM_RCTL_MPE; preg &= ~(BM_RCTL_MO_MASK); if (mreg & E1000_RCTL_MO_3) preg |= (((mreg & E1000_RCTL_MO_3) >> E1000_RCTL_MO_SHIFT) << BM_RCTL_MO_SHIFT); if (mreg & E1000_RCTL_BAM) preg |= BM_RCTL_BAM; if (mreg & E1000_RCTL_PMCF) preg |= BM_RCTL_PMCF; mreg = E1000_READ_REG(hw, E1000_CTRL); if (mreg & E1000_CTRL_RFCE) preg |= BM_RCTL_RFCE; e1000_write_phy_reg(&adapter->hw, BM_RCTL, preg); /* enable PHY wakeup in MAC register */ E1000_WRITE_REG(hw, E1000_WUC, E1000_WUC_PHY_WAKE | E1000_WUC_PME_EN); E1000_WRITE_REG(hw, E1000_WUFC, adapter->wol); /* configure and enable PHY wakeup in PHY registers */ e1000_write_phy_reg(&adapter->hw, BM_WUFC, adapter->wol); e1000_write_phy_reg(&adapter->hw, BM_WUC, E1000_WUC_PME_EN); /* activate PHY wakeup */ ret = hw->phy.ops.acquire(hw); if (ret) { printf("Could not acquire PHY\n"); return ret; } e1000_write_phy_reg_mdic(hw, IGP01E1000_PHY_PAGE_SELECT, (BM_WUC_ENABLE_PAGE << IGP_PAGE_SHIFT)); ret = e1000_read_phy_reg_mdic(hw, BM_WUC_ENABLE_REG, &preg); if (ret) { printf("Could not read PHY page 769\n"); goto out; } preg |= BM_WUC_ENABLE_BIT | BM_WUC_HOST_WU_BIT; ret = e1000_write_phy_reg_mdic(hw, BM_WUC_ENABLE_REG, preg); if (ret) printf("Could not set PHY Host Wakeup bit\n"); out: hw->phy.ops.release(hw); return ret; } static void lem_led_func(void *arg, int onoff) { struct adapter *adapter = arg; EM_CORE_LOCK(adapter); if (onoff) { e1000_setup_led(&adapter->hw); e1000_led_on(&adapter->hw); } else { e1000_led_off(&adapter->hw); e1000_cleanup_led(&adapter->hw); } EM_CORE_UNLOCK(adapter); }
/********************************************************************* * 82544 Coexistence issue workaround. * There are two issues. * 1. Transmit Hang issue. * To detect this issue, the following equation can be used: * SIZE[3:0] + ADDR[2:0] = SUM[3:0]. * If SUM[3:0] is between 1 and 4, we will have this issue. * * 2. DAC issue. * To detect this issue, the following equation can be used: * SIZE[3:0] + ADDR[2:0] = SUM[3:0]. * If SUM[3:0] is between 9 and c, we will have this issue. * * * WORKAROUND: * Make sure we do not have an ending address * of 1,2,3,4 (Hang) or 9,a,b,c (DAC) * *************************************************************************/ static u32 lem_fill_descriptors (bus_addr_t address, u32 length, PDESC_ARRAY desc_array) { u32 safe_terminator; /* Since the issue is sensitive to both length and address, */ /* let us first check the length... */ if (length <= 4) { desc_array->descriptor[0].address = address; desc_array->descriptor[0].length = length; desc_array->elements = 1; return (desc_array->elements); } safe_terminator = (u32)((((u32)address & 0x7) + (length & 0xF)) & 0xF); /* If it does not fall within 0x1-0x4 or 0x9-0xC, a single descriptor is safe */ if (safe_terminator == 0 || (safe_terminator > 4 && safe_terminator < 9) || (safe_terminator > 0xC && safe_terminator <= 0xF)) { desc_array->descriptor[0].address = address; desc_array->descriptor[0].length = length; desc_array->elements = 1; return (desc_array->elements); } desc_array->descriptor[0].address = address; desc_array->descriptor[0].length = length - 4; desc_array->descriptor[1].address = address + (length - 4); desc_array->descriptor[1].length = 4; desc_array->elements = 2; return (desc_array->elements); } /********************************************************************** * * Update the board statistics counters. 
* **********************************************************************/ static void lem_update_stats_counters(struct adapter *adapter) { struct ifnet *ifp; if(adapter->hw.phy.media_type == e1000_media_type_copper || (E1000_READ_REG(&adapter->hw, E1000_STATUS) & E1000_STATUS_LU)) { adapter->stats.symerrs += E1000_READ_REG(&adapter->hw, E1000_SYMERRS); adapter->stats.sec += E1000_READ_REG(&adapter->hw, E1000_SEC); } adapter->stats.crcerrs += E1000_READ_REG(&adapter->hw, E1000_CRCERRS); adapter->stats.mpc += E1000_READ_REG(&adapter->hw, E1000_MPC); adapter->stats.scc += E1000_READ_REG(&adapter->hw, E1000_SCC); adapter->stats.ecol += E1000_READ_REG(&adapter->hw, E1000_ECOL); adapter->stats.mcc += E1000_READ_REG(&adapter->hw, E1000_MCC); adapter->stats.latecol += E1000_READ_REG(&adapter->hw, E1000_LATECOL); adapter->stats.colc += E1000_READ_REG(&adapter->hw, E1000_COLC); adapter->stats.dc += E1000_READ_REG(&adapter->hw, E1000_DC); adapter->stats.rlec += E1000_READ_REG(&adapter->hw, E1000_RLEC); adapter->stats.xonrxc += E1000_READ_REG(&adapter->hw, E1000_XONRXC); adapter->stats.xontxc += E1000_READ_REG(&adapter->hw, E1000_XONTXC); adapter->stats.xoffrxc += E1000_READ_REG(&adapter->hw, E1000_XOFFRXC); adapter->stats.xofftxc += E1000_READ_REG(&adapter->hw, E1000_XOFFTXC); adapter->stats.fcruc += E1000_READ_REG(&adapter->hw, E1000_FCRUC); adapter->stats.prc64 += E1000_READ_REG(&adapter->hw, E1000_PRC64); adapter->stats.prc127 += E1000_READ_REG(&adapter->hw, E1000_PRC127); adapter->stats.prc255 += E1000_READ_REG(&adapter->hw, E1000_PRC255); adapter->stats.prc511 += E1000_READ_REG(&adapter->hw, E1000_PRC511); adapter->stats.prc1023 += E1000_READ_REG(&adapter->hw, E1000_PRC1023); adapter->stats.prc1522 += E1000_READ_REG(&adapter->hw, E1000_PRC1522); adapter->stats.gprc += E1000_READ_REG(&adapter->hw, E1000_GPRC); adapter->stats.bprc += E1000_READ_REG(&adapter->hw, E1000_BPRC); adapter->stats.mprc += E1000_READ_REG(&adapter->hw, E1000_MPRC); adapter->stats.gptc += E1000_READ_REG(&adapter->hw, E1000_GPTC); /* For the 64-bit byte counters the low dword must be read first. 
*/ /* Both registers clear on the read of the high dword */ adapter->stats.gorc += E1000_READ_REG(&adapter->hw, E1000_GORCL) + ((u64)E1000_READ_REG(&adapter->hw, E1000_GORCH) << 32); adapter->stats.gotc += E1000_READ_REG(&adapter->hw, E1000_GOTCL) + ((u64)E1000_READ_REG(&adapter->hw, E1000_GOTCH) << 32); adapter->stats.rnbc += E1000_READ_REG(&adapter->hw, E1000_RNBC); adapter->stats.ruc += E1000_READ_REG(&adapter->hw, E1000_RUC); adapter->stats.rfc += E1000_READ_REG(&adapter->hw, E1000_RFC); adapter->stats.roc += E1000_READ_REG(&adapter->hw, E1000_ROC); adapter->stats.rjc += E1000_READ_REG(&adapter->hw, E1000_RJC); adapter->stats.tor += E1000_READ_REG(&adapter->hw, E1000_TORH); adapter->stats.tot += E1000_READ_REG(&adapter->hw, E1000_TOTH); adapter->stats.tpr += E1000_READ_REG(&adapter->hw, E1000_TPR); adapter->stats.tpt += E1000_READ_REG(&adapter->hw, E1000_TPT); adapter->stats.ptc64 += E1000_READ_REG(&adapter->hw, E1000_PTC64); adapter->stats.ptc127 += E1000_READ_REG(&adapter->hw, E1000_PTC127); adapter->stats.ptc255 += E1000_READ_REG(&adapter->hw, E1000_PTC255); adapter->stats.ptc511 += E1000_READ_REG(&adapter->hw, E1000_PTC511); adapter->stats.ptc1023 += E1000_READ_REG(&adapter->hw, E1000_PTC1023); adapter->stats.ptc1522 += E1000_READ_REG(&adapter->hw, E1000_PTC1522); adapter->stats.mptc += E1000_READ_REG(&adapter->hw, E1000_MPTC); adapter->stats.bptc += E1000_READ_REG(&adapter->hw, E1000_BPTC); if (adapter->hw.mac.type >= e1000_82543) { adapter->stats.algnerrc += E1000_READ_REG(&adapter->hw, E1000_ALGNERRC); adapter->stats.rxerrc += E1000_READ_REG(&adapter->hw, E1000_RXERRC); adapter->stats.tncrs += E1000_READ_REG(&adapter->hw, E1000_TNCRS); adapter->stats.cexterr += E1000_READ_REG(&adapter->hw, E1000_CEXTERR); adapter->stats.tsctc += E1000_READ_REG(&adapter->hw, E1000_TSCTC); adapter->stats.tsctfc += E1000_READ_REG(&adapter->hw, E1000_TSCTFC); } ifp = adapter->ifp; ifp->if_collisions = adapter->stats.colc; /* Rx Errors */ ifp->if_ierrors = adapter->dropped_pkts + adapter->stats.rxerrc + adapter->stats.crcerrs + adapter->stats.algnerrc + adapter->stats.ruc + adapter->stats.roc + adapter->stats.mpc + adapter->stats.cexterr; /* Tx Errors */ ifp->if_oerrors = adapter->stats.ecol + adapter->stats.latecol + adapter->watchdog_events; } /* Export a single 32-bit register via a read-only sysctl. */ static int lem_sysctl_reg_handler(SYSCTL_HANDLER_ARGS) { struct adapter *adapter; u_int val; adapter = oidp->oid_arg1; val = E1000_READ_REG(&adapter->hw, oidp->oid_arg2); return (sysctl_handle_int(oidp, &val, 0, req)); } /* * Add sysctl variables, one per statistic, to the system. 
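 * They show up under the adapter's device sysctl tree, so the
 * good-packets counter registered below would typically be read as
 * dev.em.0.mac_stats.good_pkts_recvd on a first adapter.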
*/ static void lem_add_hw_stats(struct adapter *adapter) { device_t dev = adapter->dev; struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(dev); struct sysctl_oid *tree = device_get_sysctl_tree(dev); struct sysctl_oid_list *child = SYSCTL_CHILDREN(tree); struct e1000_hw_stats *stats = &adapter->stats; struct sysctl_oid *stat_node; struct sysctl_oid_list *stat_list; /* Driver Statistics */ SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "mbuf_alloc_fail", CTLFLAG_RD, &adapter->mbuf_alloc_failed, "Std mbuf failed"); SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "cluster_alloc_fail", CTLFLAG_RD, &adapter->mbuf_cluster_failed, "Std mbuf cluster failed"); SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "dropped", CTLFLAG_RD, &adapter->dropped_pkts, "Driver dropped packets"); SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "tx_dma_fail", CTLFLAG_RD, &adapter->no_tx_dma_setup, "Driver tx dma failure in xmit"); SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "tx_desc_fail1", CTLFLAG_RD, &adapter->no_tx_desc_avail1, "Not enough tx descriptors failure in xmit"); SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "tx_desc_fail2", CTLFLAG_RD, &adapter->no_tx_desc_avail2, "Not enough tx descriptors failure in xmit"); SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "rx_overruns", CTLFLAG_RD, &adapter->rx_overruns, "RX overruns"); SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "watchdog_timeouts", CTLFLAG_RD, &adapter->watchdog_events, "Watchdog timeouts"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "device_control", CTLTYPE_UINT | CTLFLAG_RD, adapter, E1000_CTRL, lem_sysctl_reg_handler, "IU", "Device Control Register"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_control", CTLTYPE_UINT | CTLFLAG_RD, adapter, E1000_RCTL, lem_sysctl_reg_handler, "IU", "Receiver Control Register"); SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "fc_high_water", CTLFLAG_RD, &adapter->hw.fc.high_water, 0, "Flow Control High Watermark"); SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "fc_low_water", CTLFLAG_RD, &adapter->hw.fc.low_water, 0, "Flow Control Low Watermark"); SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "fifo_workaround", CTLFLAG_RD, &adapter->tx_fifo_wrk_cnt, "TX FIFO workaround events"); SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "fifo_reset", CTLFLAG_RD, &adapter->tx_fifo_reset_cnt, "TX FIFO resets"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txd_head", CTLTYPE_UINT | CTLFLAG_RD, adapter, E1000_TDH(0), lem_sysctl_reg_handler, "IU", "Transmit Descriptor Head"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txd_tail", CTLTYPE_UINT | CTLFLAG_RD, adapter, E1000_TDT(0), lem_sysctl_reg_handler, "IU", "Transmit Descriptor Tail"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxd_head", CTLTYPE_UINT | CTLFLAG_RD, adapter, E1000_RDH(0), lem_sysctl_reg_handler, "IU", "Receive Descriptor Head"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxd_tail", CTLTYPE_UINT | CTLFLAG_RD, adapter, E1000_RDT(0), lem_sysctl_reg_handler, "IU", "Receive Descriptor Tail"); /* MAC stats get their own sub node */ stat_node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "mac_stats", CTLFLAG_RD, NULL, "Statistics"); stat_list = SYSCTL_CHILDREN(stat_node); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "excess_coll", CTLFLAG_RD, &stats->ecol, "Excessive collisions"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "single_coll", CTLFLAG_RD, &stats->scc, "Single collisions"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "multiple_coll", CTLFLAG_RD, &stats->mcc, "Multiple collisions"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "late_coll", CTLFLAG_RD, &stats->latecol, "Late collisions"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "collision_count", CTLFLAG_RD, &stats->colc, "Collision 
Count"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "symbol_errors", CTLFLAG_RD, &adapter->stats.symerrs, "Symbol Errors"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "sequence_errors", CTLFLAG_RD, &adapter->stats.sec, "Sequence Errors"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "defer_count", CTLFLAG_RD, &adapter->stats.dc, "Defer Count"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "missed_packets", CTLFLAG_RD, &adapter->stats.mpc, "Missed Packets"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "recv_no_buff", CTLFLAG_RD, &adapter->stats.rnbc, "Receive No Buffers"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "recv_undersize", CTLFLAG_RD, &adapter->stats.ruc, "Receive Undersize"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "recv_fragmented", CTLFLAG_RD, &adapter->stats.rfc, "Fragmented Packets Received "); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "recv_oversize", CTLFLAG_RD, &adapter->stats.roc, "Oversized Packets Received"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "recv_jabber", CTLFLAG_RD, &adapter->stats.rjc, "Recevied Jabber"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "recv_errs", CTLFLAG_RD, &adapter->stats.rxerrc, "Receive Errors"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "crc_errs", CTLFLAG_RD, &adapter->stats.crcerrs, "CRC errors"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "alignment_errs", CTLFLAG_RD, &adapter->stats.algnerrc, "Alignment Errors"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "coll_ext_errs", CTLFLAG_RD, &adapter->stats.cexterr, "Collision/Carrier extension errors"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "xon_recvd", CTLFLAG_RD, &adapter->stats.xonrxc, "XON Received"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "xon_txd", CTLFLAG_RD, &adapter->stats.xontxc, "XON Transmitted"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "xoff_recvd", CTLFLAG_RD, &adapter->stats.xoffrxc, "XOFF Received"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "xoff_txd", CTLFLAG_RD, &adapter->stats.xofftxc, "XOFF Transmitted"); /* Packet Reception Stats */ SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "total_pkts_recvd", CTLFLAG_RD, &adapter->stats.tpr, "Total Packets Received "); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "good_pkts_recvd", CTLFLAG_RD, &adapter->stats.gprc, "Good Packets Received"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "bcast_pkts_recvd", CTLFLAG_RD, &adapter->stats.bprc, "Broadcast Packets Received"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "mcast_pkts_recvd", CTLFLAG_RD, &adapter->stats.mprc, "Multicast Packets Received"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "rx_frames_64", CTLFLAG_RD, &adapter->stats.prc64, "64 byte frames received "); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "rx_frames_65_127", CTLFLAG_RD, &adapter->stats.prc127, "65-127 byte frames received"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "rx_frames_128_255", CTLFLAG_RD, &adapter->stats.prc255, "128-255 byte frames received"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "rx_frames_256_511", CTLFLAG_RD, &adapter->stats.prc511, "256-511 byte frames received"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "rx_frames_512_1023", CTLFLAG_RD, &adapter->stats.prc1023, "512-1023 byte frames received"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "rx_frames_1024_1522", CTLFLAG_RD, &adapter->stats.prc1522, "1023-1522 byte frames received"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "good_octets_recvd", CTLFLAG_RD, &adapter->stats.gorc, "Good Octets Received"); /* Packet Transmission Stats */ SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "good_octets_txd", CTLFLAG_RD, &adapter->stats.gotc, "Good 
Octets Transmitted"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "total_pkts_txd", CTLFLAG_RD, &adapter->stats.tpt, "Total Packets Transmitted"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "good_pkts_txd", CTLFLAG_RD, &adapter->stats.gptc, "Good Packets Transmitted"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "bcast_pkts_txd", CTLFLAG_RD, &adapter->stats.bptc, "Broadcast Packets Transmitted"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "mcast_pkts_txd", CTLFLAG_RD, &adapter->stats.mptc, "Multicast Packets Transmitted"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "tx_frames_64", CTLFLAG_RD, &adapter->stats.ptc64, "64 byte frames transmitted "); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "tx_frames_65_127", CTLFLAG_RD, &adapter->stats.ptc127, "65-127 byte frames transmitted"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "tx_frames_128_255", CTLFLAG_RD, &adapter->stats.ptc255, "128-255 byte frames transmitted"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "tx_frames_256_511", CTLFLAG_RD, &adapter->stats.ptc511, "256-511 byte frames transmitted"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "tx_frames_512_1023", CTLFLAG_RD, &adapter->stats.ptc1023, "512-1023 byte frames transmitted"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "tx_frames_1024_1522", CTLFLAG_RD, &adapter->stats.ptc1522, "1024-1522 byte frames transmitted"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "tso_txd", CTLFLAG_RD, &adapter->stats.tsctc, "TSO Contexts Transmitted"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "tso_ctx_fail", CTLFLAG_RD, &adapter->stats.tsctfc, "TSO Contexts Failed"); } /********************************************************************** * * This routine provides a way to dump out the adapter eeprom, * often a useful debug/service tool. This only dumps the first * 32 words, stuff that matters is in that extent. * **********************************************************************/ static int lem_sysctl_nvm_info(SYSCTL_HANDLER_ARGS) { struct adapter *adapter; int error; int result; result = -1; error = sysctl_handle_int(oidp, &result, 0, req); if (error || !req->newptr) return (error); /* * This value will cause a hex dump of the * first 32 16-bit words of the EEPROM to * the screen. */ if (result == 1) { adapter = (struct adapter *)arg1; lem_print_nvm_info(adapter); } return (error); } static void lem_print_nvm_info(struct adapter *adapter) { u16 eeprom_data; int i, j, row = 0; /* Its a bit crude, but it gets the job done */ printf("\nInterface EEPROM Dump:\n"); printf("Offset\n0x0000 "); for (i = 0, j = 0; i < 32; i++, j++) { if (j == 8) { /* Make the offset block */ j = 0; ++row; printf("\n0x00%x0 ",row); } e1000_read_nvm(&adapter->hw, i, 1, &eeprom_data); printf("%04x ", eeprom_data); } printf("\n"); } static int lem_sysctl_int_delay(SYSCTL_HANDLER_ARGS) { struct em_int_delay_info *info; struct adapter *adapter; u32 regval; int error; int usecs; int ticks; info = (struct em_int_delay_info *)arg1; usecs = info->value; error = sysctl_handle_int(oidp, &usecs, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (usecs < 0 || usecs > EM_TICKS_TO_USECS(65535)) return (EINVAL); info->value = usecs; ticks = EM_USECS_TO_TICKS(usecs); if (info->offset == E1000_ITR) /* units are 256ns here */ ticks *= 4; adapter = info->adapter; EM_CORE_LOCK(adapter); regval = E1000_READ_OFFSET(&adapter->hw, info->offset); regval = (regval & ~0xffff) | (ticks & 0xffff); /* Handle a few special cases. 
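 * TIDV must never be written as zero (the register value is bumped to 1
 * instead), and a zero setting also clears E1000_TXD_CMD_IDE so
 * transmit descriptors stop requesting delayed interrupts.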
*/ switch (info->offset) { case E1000_RDTR: break; case E1000_TIDV: if (ticks == 0) { adapter->txd_cmd &= ~E1000_TXD_CMD_IDE; /* Don't write 0 into the TIDV register. */ regval++; } else adapter->txd_cmd |= E1000_TXD_CMD_IDE; break; } E1000_WRITE_OFFSET(&adapter->hw, info->offset, regval); EM_CORE_UNLOCK(adapter); return (0); } static void lem_add_int_delay_sysctl(struct adapter *adapter, const char *name, const char *description, struct em_int_delay_info *info, int offset, int value) { info->adapter = adapter; info->offset = offset; info->value = value; SYSCTL_ADD_PROC(device_get_sysctl_ctx(adapter->dev), SYSCTL_CHILDREN(device_get_sysctl_tree(adapter->dev)), OID_AUTO, name, CTLTYPE_INT|CTLFLAG_RW, info, 0, lem_sysctl_int_delay, "I", description); } static void lem_set_flow_cntrl(struct adapter *adapter, const char *name, const char *description, int *limit, int value) { *limit = value; SYSCTL_ADD_INT(device_get_sysctl_ctx(adapter->dev), SYSCTL_CHILDREN(device_get_sysctl_tree(adapter->dev)), OID_AUTO, name, CTLTYPE_INT|CTLFLAG_RW, limit, value, description); } static void lem_add_rx_process_limit(struct adapter *adapter, const char *name, const char *description, int *limit, int value) { *limit = value; SYSCTL_ADD_INT(device_get_sysctl_ctx(adapter->dev), SYSCTL_CHILDREN(device_get_sysctl_tree(adapter->dev)), OID_AUTO, name, CTLTYPE_INT|CTLFLAG_RW, limit, value, description); } Index: stable/9/sys/dev/ixgbe/ixgbe.c =================================================================== --- stable/9/sys/dev/ixgbe/ixgbe.c (revision 262152) +++ stable/9/sys/dev/ixgbe/ixgbe.c (revision 262153) @@ -1,5828 +1,5828 @@ /****************************************************************************** Copyright (c) 2001-2013, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
******************************************************************************/ /*$FreeBSD$*/ #include "opt_inet.h" #include "opt_inet6.h" #include "ixgbe.h" /********************************************************************* * Set this to one to display debug statistics *********************************************************************/ int ixgbe_display_debug_stats = 0; /********************************************************************* * Driver version *********************************************************************/ char ixgbe_driver_version[] = "2.5.15"; /********************************************************************* * PCI Device ID Table * * Used by probe to select devices to load on * Last field stores an index into ixgbe_strings * Last entry must be all 0s * * { Vendor ID, Device ID, SubVendor ID, SubDevice ID, String Index } *********************************************************************/ static ixgbe_vendor_info_t ixgbe_vendor_info_array[] = { {IXGBE_INTEL_VENDOR_ID, IXGBE_DEV_ID_82598AF_DUAL_PORT, 0, 0, 0}, {IXGBE_INTEL_VENDOR_ID, IXGBE_DEV_ID_82598AF_SINGLE_PORT, 0, 0, 0}, {IXGBE_INTEL_VENDOR_ID, IXGBE_DEV_ID_82598EB_CX4, 0, 0, 0}, {IXGBE_INTEL_VENDOR_ID, IXGBE_DEV_ID_82598AT, 0, 0, 0}, {IXGBE_INTEL_VENDOR_ID, IXGBE_DEV_ID_82598AT2, 0, 0, 0}, {IXGBE_INTEL_VENDOR_ID, IXGBE_DEV_ID_82598, 0, 0, 0}, {IXGBE_INTEL_VENDOR_ID, IXGBE_DEV_ID_82598_DA_DUAL_PORT, 0, 0, 0}, {IXGBE_INTEL_VENDOR_ID, IXGBE_DEV_ID_82598_CX4_DUAL_PORT, 0, 0, 0}, {IXGBE_INTEL_VENDOR_ID, IXGBE_DEV_ID_82598EB_XF_LR, 0, 0, 0}, {IXGBE_INTEL_VENDOR_ID, IXGBE_DEV_ID_82598_SR_DUAL_PORT_EM, 0, 0, 0}, {IXGBE_INTEL_VENDOR_ID, IXGBE_DEV_ID_82598EB_SFP_LOM, 0, 0, 0}, {IXGBE_INTEL_VENDOR_ID, IXGBE_DEV_ID_82599_KX4, 0, 0, 0}, {IXGBE_INTEL_VENDOR_ID, IXGBE_DEV_ID_82599_KX4_MEZZ, 0, 0, 0}, {IXGBE_INTEL_VENDOR_ID, IXGBE_DEV_ID_82599_SFP, 0, 0, 0}, {IXGBE_INTEL_VENDOR_ID, IXGBE_DEV_ID_82599_XAUI_LOM, 0, 0, 0}, {IXGBE_INTEL_VENDOR_ID, IXGBE_DEV_ID_82599_CX4, 0, 0, 0}, {IXGBE_INTEL_VENDOR_ID, IXGBE_DEV_ID_82599_T3_LOM, 0, 0, 0}, {IXGBE_INTEL_VENDOR_ID, IXGBE_DEV_ID_82599_COMBO_BACKPLANE, 0, 0, 0}, {IXGBE_INTEL_VENDOR_ID, IXGBE_DEV_ID_82599_BACKPLANE_FCOE, 0, 0, 0}, {IXGBE_INTEL_VENDOR_ID, IXGBE_DEV_ID_82599_SFP_SF2, 0, 0, 0}, {IXGBE_INTEL_VENDOR_ID, IXGBE_DEV_ID_82599_SFP_FCOE, 0, 0, 0}, {IXGBE_INTEL_VENDOR_ID, IXGBE_DEV_ID_82599EN_SFP, 0, 0, 0}, {IXGBE_INTEL_VENDOR_ID, IXGBE_DEV_ID_82599_SFP_SF_QP, 0, 0, 0}, {IXGBE_INTEL_VENDOR_ID, IXGBE_DEV_ID_X540T, 0, 0, 0}, /* required last entry */ {0, 0, 0, 0, 0} }; /********************************************************************* * Table of branding strings *********************************************************************/ static char *ixgbe_strings[] = { "Intel(R) PRO/10GbE PCI-Express Network Driver" }; /********************************************************************* * Function prototypes *********************************************************************/ static int ixgbe_probe(device_t); static int ixgbe_attach(device_t); static int ixgbe_detach(device_t); static int ixgbe_shutdown(device_t); #ifdef IXGBE_LEGACY_TX static void ixgbe_start(struct ifnet *); static void ixgbe_start_locked(struct tx_ring *, struct ifnet *); #else /* ! 
IXGBE_LEGACY_TX */ static int ixgbe_mq_start(struct ifnet *, struct mbuf *); static int ixgbe_mq_start_locked(struct ifnet *, struct tx_ring *); static void ixgbe_qflush(struct ifnet *); static void ixgbe_deferred_mq_start(void *, int); #endif /* IXGBE_LEGACY_TX */ static int ixgbe_ioctl(struct ifnet *, u_long, caddr_t); static void ixgbe_init(void *); static void ixgbe_init_locked(struct adapter *); static void ixgbe_stop(void *); static void ixgbe_media_status(struct ifnet *, struct ifmediareq *); static int ixgbe_media_change(struct ifnet *); static void ixgbe_identify_hardware(struct adapter *); static int ixgbe_allocate_pci_resources(struct adapter *); static void ixgbe_get_slot_info(struct ixgbe_hw *); static int ixgbe_allocate_msix(struct adapter *); static int ixgbe_allocate_legacy(struct adapter *); static int ixgbe_allocate_queues(struct adapter *); static int ixgbe_setup_msix(struct adapter *); static void ixgbe_free_pci_resources(struct adapter *); static void ixgbe_local_timer(void *); static int ixgbe_setup_interface(device_t, struct adapter *); static void ixgbe_config_link(struct adapter *); static int ixgbe_allocate_transmit_buffers(struct tx_ring *); static int ixgbe_setup_transmit_structures(struct adapter *); static void ixgbe_setup_transmit_ring(struct tx_ring *); static void ixgbe_initialize_transmit_units(struct adapter *); static void ixgbe_free_transmit_structures(struct adapter *); static void ixgbe_free_transmit_buffers(struct tx_ring *); static int ixgbe_allocate_receive_buffers(struct rx_ring *); static int ixgbe_setup_receive_structures(struct adapter *); static int ixgbe_setup_receive_ring(struct rx_ring *); static void ixgbe_initialize_receive_units(struct adapter *); static void ixgbe_free_receive_structures(struct adapter *); static void ixgbe_free_receive_buffers(struct rx_ring *); static void ixgbe_setup_hw_rsc(struct rx_ring *); static void ixgbe_enable_intr(struct adapter *); static void ixgbe_disable_intr(struct adapter *); static void ixgbe_update_stats_counters(struct adapter *); static void ixgbe_txeof(struct tx_ring *); static bool ixgbe_rxeof(struct ix_queue *); static void ixgbe_rx_checksum(u32, struct mbuf *, u32); static void ixgbe_set_promisc(struct adapter *); static void ixgbe_set_multi(struct adapter *); static void ixgbe_update_link_status(struct adapter *); static void ixgbe_refresh_mbufs(struct rx_ring *, int); static int ixgbe_xmit(struct tx_ring *, struct mbuf **); static int ixgbe_set_flowcntl(SYSCTL_HANDLER_ARGS); static int ixgbe_set_advertise(SYSCTL_HANDLER_ARGS); static int ixgbe_set_thermal_test(SYSCTL_HANDLER_ARGS); static int ixgbe_dma_malloc(struct adapter *, bus_size_t, struct ixgbe_dma_alloc *, int); static void ixgbe_dma_free(struct adapter *, struct ixgbe_dma_alloc *); static int ixgbe_tx_ctx_setup(struct tx_ring *, struct mbuf *, u32 *, u32 *); static int ixgbe_tso_setup(struct tx_ring *, struct mbuf *, u32 *, u32 *); static void ixgbe_set_ivar(struct adapter *, u8, u8, s8); static void ixgbe_configure_ivars(struct adapter *); static u8 * ixgbe_mc_array_itr(struct ixgbe_hw *, u8 **, u32 *); static void ixgbe_setup_vlan_hw_support(struct adapter *); static void ixgbe_register_vlan(void *, struct ifnet *, u16); static void ixgbe_unregister_vlan(void *, struct ifnet *, u16); static void ixgbe_add_hw_stats(struct adapter *adapter); static __inline void ixgbe_rx_discard(struct rx_ring *, int); static __inline void ixgbe_rx_input(struct rx_ring *, struct ifnet *, struct mbuf *, u32); static void ixgbe_enable_rx_drop(struct 
adapter *); static void ixgbe_disable_rx_drop(struct adapter *); /* Support for pluggable optic modules */ static bool ixgbe_sfp_probe(struct adapter *); static void ixgbe_setup_optics(struct adapter *); /* Legacy (single vector interrupt handler */ static void ixgbe_legacy_irq(void *); /* The MSI/X Interrupt handlers */ static void ixgbe_msix_que(void *); static void ixgbe_msix_link(void *); /* Deferred interrupt tasklets */ static void ixgbe_handle_que(void *, int); static void ixgbe_handle_link(void *, int); static void ixgbe_handle_msf(void *, int); static void ixgbe_handle_mod(void *, int); #ifdef IXGBE_FDIR static void ixgbe_atr(struct tx_ring *, struct mbuf *); static void ixgbe_reinit_fdir(void *, int); #endif /* Missing shared code prototype */ extern void ixgbe_stop_mac_link_on_d3_82599(struct ixgbe_hw *hw); /********************************************************************* * FreeBSD Device Interface Entry Points *********************************************************************/ static device_method_t ixgbe_methods[] = { /* Device interface */ DEVMETHOD(device_probe, ixgbe_probe), DEVMETHOD(device_attach, ixgbe_attach), DEVMETHOD(device_detach, ixgbe_detach), DEVMETHOD(device_shutdown, ixgbe_shutdown), DEVMETHOD_END }; static driver_t ixgbe_driver = { "ix", ixgbe_methods, sizeof(struct adapter), }; devclass_t ixgbe_devclass; DRIVER_MODULE(ixgbe, pci, ixgbe_driver, ixgbe_devclass, 0, 0); MODULE_DEPEND(ixgbe, pci, 1, 1, 1); MODULE_DEPEND(ixgbe, ether, 1, 1, 1); /* ** TUNEABLE PARAMETERS: */ static SYSCTL_NODE(_hw, OID_AUTO, ix, CTLFLAG_RD, 0, "IXGBE driver parameters"); /* ** AIM: Adaptive Interrupt Moderation ** which means that the interrupt rate ** is varied over time based on the ** traffic for that interrupt vector */ static int ixgbe_enable_aim = TRUE; TUNABLE_INT("hw.ixgbe.enable_aim", &ixgbe_enable_aim); SYSCTL_INT(_hw_ix, OID_AUTO, enable_aim, CTLFLAG_RW, &ixgbe_enable_aim, 0, "Enable adaptive interrupt moderation"); static int ixgbe_max_interrupt_rate = (4000000 / IXGBE_LOW_LATENCY); TUNABLE_INT("hw.ixgbe.max_interrupt_rate", &ixgbe_max_interrupt_rate); SYSCTL_INT(_hw_ix, OID_AUTO, max_interrupt_rate, CTLFLAG_RDTUN, &ixgbe_max_interrupt_rate, 0, "Maximum interrupts per second"); /* How many packets rxeof tries to clean at a time */ static int ixgbe_rx_process_limit = 256; TUNABLE_INT("hw.ixgbe.rx_process_limit", &ixgbe_rx_process_limit); SYSCTL_INT(_hw_ix, OID_AUTO, rx_process_limit, CTLFLAG_RDTUN, &ixgbe_rx_process_limit, 0, "Maximum number of received packets to process at a time," "-1 means unlimited"); /* How many packets txeof tries to clean at a time */ static int ixgbe_tx_process_limit = 256; TUNABLE_INT("hw.ixgbe.tx_process_limit", &ixgbe_tx_process_limit); SYSCTL_INT(_hw_ix, OID_AUTO, tx_process_limit, CTLFLAG_RDTUN, &ixgbe_tx_process_limit, 0, "Maximum number of sent packets to process at a time," "-1 means unlimited"); /* ** Smart speed setting, default to on ** this only works as a compile option ** right now as its during attach, set ** this to 'ixgbe_smart_speed_off' to ** disable. */ static int ixgbe_smart_speed = ixgbe_smart_speed_on; /* * MSIX should be the default for best performance, * but this allows it to be forced off for testing. */ static int ixgbe_enable_msix = 1; TUNABLE_INT("hw.ixgbe.enable_msix", &ixgbe_enable_msix); SYSCTL_INT(_hw_ix, OID_AUTO, enable_msix, CTLFLAG_RDTUN, &ixgbe_enable_msix, 0, "Enable MSI-X interrupts"); /* * Number of Queues, can be set to 0, * it then autoconfigures based on the * number of cpus with a max of 8. 
This * can be overridden manually here. */ static int ixgbe_num_queues = 0; TUNABLE_INT("hw.ixgbe.num_queues", &ixgbe_num_queues); SYSCTL_INT(_hw_ix, OID_AUTO, num_queues, CTLFLAG_RDTUN, &ixgbe_num_queues, 0, "Number of queues to configure, 0 indicates autoconfigure"); /* ** Number of TX descriptors per ring, ** setting higher than RX as this seems ** the better performing choice. */ static int ixgbe_txd = PERFORM_TXD; TUNABLE_INT("hw.ixgbe.txd", &ixgbe_txd); SYSCTL_INT(_hw_ix, OID_AUTO, txd, CTLFLAG_RDTUN, &ixgbe_txd, 0, "Number of transmit descriptors per queue"); /* Number of RX descriptors per ring */ static int ixgbe_rxd = PERFORM_RXD; TUNABLE_INT("hw.ixgbe.rxd", &ixgbe_rxd); SYSCTL_INT(_hw_ix, OID_AUTO, rxd, CTLFLAG_RDTUN, &ixgbe_rxd, 0, "Number of receive descriptors per queue"); /* ** Defining this on will allow the use ** of unsupported SFP+ modules, note that ** doing so you are on your own :) */ static int allow_unsupported_sfp = FALSE; TUNABLE_INT("hw.ixgbe.unsupported_sfp", &allow_unsupported_sfp); /* ** HW RSC control: ** this feature only works with ** IPv4, and only on 82599 and later. ** Also this will cause IP forwarding to ** fail and that can't be controlled by ** the stack as LRO can. For all these ** reasons I've deemed it best to leave ** this off and not bother with a tuneable ** interface, this would need to be compiled ** to enable. */ static bool ixgbe_rsc_enable = FALSE; /* Keep a running tab on them for sanity check */ static int ixgbe_total_ports; #ifdef IXGBE_FDIR /* ** For Flow Director: this is the ** number of TX packets we sample ** for the filter pool, this means ** every 20th packet will be probed. ** ** This feature can be disabled by ** setting this to 0. */ static int atr_sample_rate = 20; /* ** Flow Director actually 'steals' ** part of the packet buffer as its ** filter pool, this variable controls ** how much it uses: ** 0 = 64K, 1 = 128K, 2 = 256K */ static int fdir_pballoc = 1; #endif #ifdef DEV_NETMAP /* * The #ifdef DEV_NETMAP / #endif blocks in this file are meant to * be a reference on how to implement netmap support in a driver. * Additional comments are in ixgbe_netmap.h . * * <dev/netmap/ixgbe_netmap.h> contains functions for netmap support * that extend the standard driver. */ #include <dev/netmap/ixgbe_netmap.h> #endif /* DEV_NETMAP */ /********************************************************************* * Device identification routine * * ixgbe_probe determines if the driver should be loaded on * the adapter based on the PCI vendor/device id of the adapter.
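 * A note on the table above: an entry whose subvendor or subdevice id is 0 acts as a wildcard for that field, which is why the entries in ixgbe_vendor_info_array match on vendor and device id alone.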
* * return BUS_PROBE_DEFAULT on success, positive on failure *********************************************************************/ static int ixgbe_probe(device_t dev) { ixgbe_vendor_info_t *ent; u16 pci_vendor_id = 0; u16 pci_device_id = 0; u16 pci_subvendor_id = 0; u16 pci_subdevice_id = 0; char adapter_name[256]; INIT_DEBUGOUT("ixgbe_probe: begin"); pci_vendor_id = pci_get_vendor(dev); if (pci_vendor_id != IXGBE_INTEL_VENDOR_ID) return (ENXIO); pci_device_id = pci_get_device(dev); pci_subvendor_id = pci_get_subvendor(dev); pci_subdevice_id = pci_get_subdevice(dev); ent = ixgbe_vendor_info_array; while (ent->vendor_id != 0) { if ((pci_vendor_id == ent->vendor_id) && (pci_device_id == ent->device_id) && ((pci_subvendor_id == ent->subvendor_id) || (ent->subvendor_id == 0)) && ((pci_subdevice_id == ent->subdevice_id) || (ent->subdevice_id == 0))) { sprintf(adapter_name, "%s, Version - %s", ixgbe_strings[ent->index], ixgbe_driver_version); device_set_desc_copy(dev, adapter_name); ++ixgbe_total_ports; return (BUS_PROBE_DEFAULT); } ent++; } return (ENXIO); } /********************************************************************* * Device initialization routine * * The attach entry point is called when the driver is being loaded. * This routine identifies the type of hardware, allocates all resources * and initializes the hardware. * * return 0 on success, positive on failure *********************************************************************/ static int ixgbe_attach(device_t dev) { struct adapter *adapter; struct ixgbe_hw *hw; int error = 0; u16 csum; u32 ctrl_ext; INIT_DEBUGOUT("ixgbe_attach: begin"); /* Allocate, clear, and link in our adapter structure */ adapter = device_get_softc(dev); adapter->dev = adapter->osdep.dev = dev; hw = &adapter->hw; /* Core Lock Init*/ IXGBE_CORE_LOCK_INIT(adapter, device_get_nameunit(dev)); /* SYSCTL APIs */ SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "fc", CTLTYPE_INT | CTLFLAG_RW, adapter, 0, ixgbe_set_flowcntl, "I", "Flow Control"); SYSCTL_ADD_INT(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "enable_aim", CTLTYPE_INT|CTLFLAG_RW, &ixgbe_enable_aim, 1, "Interrupt Moderation"); /* ** Allow a kind of speed control by forcing the autoneg ** advertised speed list to only a certain value, this ** supports 1G on 82599 devices, and 100Mb on x540. */ SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "advertise_speed", CTLTYPE_INT | CTLFLAG_RW, adapter, 0, ixgbe_set_advertise, "I", "Link Speed"); SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "ts", CTLTYPE_INT | CTLFLAG_RW, adapter, 0, ixgbe_set_thermal_test, "I", "Thermal Test"); /* Set up the timer callout */ callout_init_mtx(&adapter->timer, &adapter->core_mtx, 0); /* Determine hardware revision */ ixgbe_identify_hardware(adapter); /* Do base PCI setup - map BAR0 */ if (ixgbe_allocate_pci_resources(adapter)) { device_printf(dev, "Allocation of PCI resources failed\n"); error = ENXIO; goto err_out; } /* Do descriptor calc and sanity checks */ if (((ixgbe_txd * sizeof(union ixgbe_adv_tx_desc)) % DBA_ALIGN) != 0 || ixgbe_txd < MIN_TXD || ixgbe_txd > MAX_TXD) { device_printf(dev, "TXD config issue, using default!\n"); adapter->num_tx_desc = DEFAULT_TXD; } else adapter->num_tx_desc = ixgbe_txd; /* ** With many RX rings it is easy to exceed the ** system mbuf allocation. Tuning nmbclusters ** can alleviate this. 
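 ** The estimate used below is simply ixgbe_rxd * num_queues * total ports ** compared against nmbclusters; when the requested rings would not fit, ** the driver falls back to DEFAULT_RXD rather than failing the attach.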
*/ if (nmbclusters > 0 ) { int s; s = (ixgbe_rxd * adapter->num_queues) * ixgbe_total_ports; if (s > nmbclusters) { device_printf(dev, "RX Descriptors exceed " "system mbuf max, using default instead!\n"); ixgbe_rxd = DEFAULT_RXD; } } if (((ixgbe_rxd * sizeof(union ixgbe_adv_rx_desc)) % DBA_ALIGN) != 0 || ixgbe_rxd < MIN_TXD || ixgbe_rxd > MAX_TXD) { device_printf(dev, "RXD config issue, using default!\n"); adapter->num_rx_desc = DEFAULT_RXD; } else adapter->num_rx_desc = ixgbe_rxd; /* Allocate our TX/RX Queues */ if (ixgbe_allocate_queues(adapter)) { error = ENOMEM; goto err_out; } /* Allocate multicast array memory. */ adapter->mta = malloc(sizeof(u8) * IXGBE_ETH_LENGTH_OF_ADDRESS * MAX_NUM_MULTICAST_ADDRESSES, M_DEVBUF, M_NOWAIT); if (adapter->mta == NULL) { device_printf(dev, "Can not allocate multicast setup array\n"); error = ENOMEM; goto err_late; } /* Initialize the shared code */ hw->allow_unsupported_sfp = allow_unsupported_sfp; error = ixgbe_init_shared_code(hw); if (error == IXGBE_ERR_SFP_NOT_PRESENT) { /* ** No optics in this port, set up ** so the timer routine will probe ** for later insertion. */ adapter->sfp_probe = TRUE; error = 0; } else if (error == IXGBE_ERR_SFP_NOT_SUPPORTED) { device_printf(dev,"Unsupported SFP+ module detected!\n"); error = EIO; goto err_late; } else if (error) { device_printf(dev,"Unable to initialize the shared code\n"); error = EIO; goto err_late; } /* Make sure we have a good EEPROM before we read from it */ if (ixgbe_validate_eeprom_checksum(&adapter->hw, &csum) < 0) { device_printf(dev,"The EEPROM Checksum Is Not Valid\n"); error = EIO; goto err_late; } error = ixgbe_init_hw(hw); switch (error) { case IXGBE_ERR_EEPROM_VERSION: device_printf(dev, "This device is a pre-production adapter/" "LOM. Please be aware there may be issues associated " "with your hardware.\n If you are experiencing problems " "please contact your Intel or hardware representative " "who provided you with this hardware.\n"); break; case IXGBE_ERR_SFP_NOT_SUPPORTED: device_printf(dev,"Unsupported SFP+ Module\n"); error = EIO; goto err_late; case IXGBE_ERR_SFP_NOT_PRESENT: device_printf(dev,"No SFP+ Module found\n"); /* falls thru */ default: break; } /* Detect and set physical type */ ixgbe_setup_optics(adapter); if ((adapter->msix > 1) && (ixgbe_enable_msix)) error = ixgbe_allocate_msix(adapter); else error = ixgbe_allocate_legacy(adapter); if (error) goto err_late; /* Setup OS specific network interface */ if (ixgbe_setup_interface(dev, adapter) != 0) goto err_late; /* Initialize statistics */ ixgbe_update_stats_counters(adapter); /* Register for VLAN events */ adapter->vlan_attach = EVENTHANDLER_REGISTER(vlan_config, ixgbe_register_vlan, adapter, EVENTHANDLER_PRI_FIRST); adapter->vlan_detach = EVENTHANDLER_REGISTER(vlan_unconfig, ixgbe_unregister_vlan, adapter, EVENTHANDLER_PRI_FIRST); /* ** Check PCIE slot type/speed/width */ ixgbe_get_slot_info(hw); /* Set an initial default flow control value */ adapter->fc = ixgbe_fc_full; /* let hardware know driver is loaded */ ctrl_ext = IXGBE_READ_REG(hw, IXGBE_CTRL_EXT); ctrl_ext |= IXGBE_CTRL_EXT_DRV_LOAD; IXGBE_WRITE_REG(hw, IXGBE_CTRL_EXT, ctrl_ext); ixgbe_add_hw_stats(adapter); #ifdef DEV_NETMAP ixgbe_netmap_attach(adapter); #endif /* DEV_NETMAP */ INIT_DEBUGOUT("ixgbe_attach: end"); return (0); err_late: ixgbe_free_transmit_structures(adapter); ixgbe_free_receive_structures(adapter); err_out: if (adapter->ifp != NULL) if_free(adapter->ifp); ixgbe_free_pci_resources(adapter); free(adapter->mta, M_DEVBUF); return (error); } 
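/*
 * Editorial aside, not driver code: a minimal sketch of the ring-size
 * validation rule ixgbe_attach() applies to both ixgbe_txd and ixgbe_rxd
 * above, assuming DBA_ALIGN and the MIN/MAX descriptor bounds come from
 * ixgbe.h as in the checks above. A requested count is usable only when
 * the ring's byte size is a multiple of DBA_ALIGN and the count lies
 * within the supported range; otherwise the driver warns and substitutes
 * the compiled-in default.
 */
static __inline int
ixgbe_ring_size_ok(int ndesc, size_t desc_size, int min_nd, int max_nd)
{
	if (((ndesc * desc_size) % DBA_ALIGN) != 0)
		return (0);	/* ring byte size must be DBA_ALIGN aligned */
	return (ndesc >= min_nd && ndesc <= max_nd);
}
/* e.g. ixgbe_ring_size_ok(ixgbe_txd, sizeof(union ixgbe_adv_tx_desc), MIN_TXD, MAX_TXD) */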
/********************************************************************* * Device removal routine * * The detach entry point is called when the driver is being removed. * This routine stops the adapter and deallocates all the resources * that were allocated for driver operation. * * return 0 on success, positive on failure *********************************************************************/ static int ixgbe_detach(device_t dev) { struct adapter *adapter = device_get_softc(dev); struct ix_queue *que = adapter->queues; struct tx_ring *txr = adapter->tx_rings; u32 ctrl_ext; INIT_DEBUGOUT("ixgbe_detach: begin"); /* Make sure VLANS are not using driver */ if (adapter->ifp->if_vlantrunk != NULL) { device_printf(dev,"Vlan in use, detach first\n"); return (EBUSY); } IXGBE_CORE_LOCK(adapter); ixgbe_stop(adapter); IXGBE_CORE_UNLOCK(adapter); for (int i = 0; i < adapter->num_queues; i++, que++, txr++) { if (que->tq) { #ifndef IXGBE_LEGACY_TX taskqueue_drain(que->tq, &txr->txq_task); #endif taskqueue_drain(que->tq, &que->que_task); taskqueue_free(que->tq); } } /* Drain the Link queue */ if (adapter->tq) { taskqueue_drain(adapter->tq, &adapter->link_task); taskqueue_drain(adapter->tq, &adapter->mod_task); taskqueue_drain(adapter->tq, &adapter->msf_task); #ifdef IXGBE_FDIR taskqueue_drain(adapter->tq, &adapter->fdir_task); #endif taskqueue_free(adapter->tq); } /* let hardware know driver is unloading */ ctrl_ext = IXGBE_READ_REG(&adapter->hw, IXGBE_CTRL_EXT); ctrl_ext &= ~IXGBE_CTRL_EXT_DRV_LOAD; IXGBE_WRITE_REG(&adapter->hw, IXGBE_CTRL_EXT, ctrl_ext); /* Unregister VLAN events */ if (adapter->vlan_attach != NULL) EVENTHANDLER_DEREGISTER(vlan_config, adapter->vlan_attach); if (adapter->vlan_detach != NULL) EVENTHANDLER_DEREGISTER(vlan_unconfig, adapter->vlan_detach); ether_ifdetach(adapter->ifp); callout_drain(&adapter->timer); #ifdef DEV_NETMAP netmap_detach(adapter->ifp); #endif /* DEV_NETMAP */ ixgbe_free_pci_resources(adapter); bus_generic_detach(dev); if_free(adapter->ifp); ixgbe_free_transmit_structures(adapter); ixgbe_free_receive_structures(adapter); free(adapter->mta, M_DEVBUF); IXGBE_CORE_LOCK_DESTROY(adapter); return (0); } /********************************************************************* * * Shutdown entry point * **********************************************************************/ static int ixgbe_shutdown(device_t dev) { struct adapter *adapter = device_get_softc(dev); IXGBE_CORE_LOCK(adapter); ixgbe_stop(adapter); IXGBE_CORE_UNLOCK(adapter); return (0); } #ifdef IXGBE_LEGACY_TX /********************************************************************* * Transmit entry point * * ixgbe_start is called by the stack to initiate a transmit. * The driver will remain in this routine as long as there are * packets to transmit and transmit resources are available. * In case resources are not available stack is notified and * the packet is requeued. 
**********************************************************************/ static void ixgbe_start_locked(struct tx_ring *txr, struct ifnet * ifp) { struct mbuf *m_head; struct adapter *adapter = txr->adapter; IXGBE_TX_LOCK_ASSERT(txr); if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) return; if (!adapter->link_active) return; while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) { if (txr->tx_avail <= IXGBE_QUEUE_MIN_FREE) break; IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head); if (m_head == NULL) break; if (ixgbe_xmit(txr, &m_head)) { if (m_head != NULL) IFQ_DRV_PREPEND(&ifp->if_snd, m_head); break; } /* Send a copy of the frame to the BPF listener */ ETHER_BPF_MTAP(ifp, m_head); /* Set watchdog on */ txr->watchdog_time = ticks; txr->queue_status = IXGBE_QUEUE_WORKING; } return; } /* * Legacy TX start - called by the stack, this * always uses the first tx ring, and should * not be used with multiqueue tx enabled. */ static void ixgbe_start(struct ifnet *ifp) { struct adapter *adapter = ifp->if_softc; struct tx_ring *txr = adapter->tx_rings; if (ifp->if_drv_flags & IFF_DRV_RUNNING) { IXGBE_TX_LOCK(txr); ixgbe_start_locked(txr, ifp); IXGBE_TX_UNLOCK(txr); } return; } #else /* ! IXGBE_LEGACY_TX */ /* ** Multiqueue Transmit driver ** */ static int ixgbe_mq_start(struct ifnet *ifp, struct mbuf *m) { struct adapter *adapter = ifp->if_softc; struct ix_queue *que; struct tx_ring *txr; int i, err = 0; /* Which queue to use */ if ((m->m_flags & M_FLOWID) != 0) i = m->m_pkthdr.flowid % adapter->num_queues; else i = curcpu % adapter->num_queues; txr = &adapter->tx_rings[i]; que = &adapter->queues[i]; err = drbr_enqueue(ifp, txr->br, m); if (err) return (err); if (IXGBE_TX_TRYLOCK(txr)) { err = ixgbe_mq_start_locked(ifp, txr); IXGBE_TX_UNLOCK(txr); } else taskqueue_enqueue(que->tq, &txr->txq_task); return (err); } static int ixgbe_mq_start_locked(struct ifnet *ifp, struct tx_ring *txr) { struct adapter *adapter = txr->adapter; struct mbuf *next; int enqueued = 0, err = 0; if (((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) || adapter->link_active == 0) return (ENETDOWN); /* Process the queue */ #if __FreeBSD_version < 901504 next = drbr_dequeue(ifp, txr->br); while (next != NULL) { if ((err = ixgbe_xmit(txr, &next)) != 0) { if (next != NULL) err = drbr_enqueue(ifp, txr->br, next); #else while ((next = drbr_peek(ifp, txr->br)) != NULL) { if ((err = ixgbe_xmit(txr, &next)) != 0) { if (next == NULL) { drbr_advance(ifp, txr->br); } else { drbr_putback(ifp, txr->br, next); } #endif break; } #if __FreeBSD_version >= 901504 drbr_advance(ifp, txr->br); #endif enqueued++; /* Send a copy of the frame to the BPF listener */ ETHER_BPF_MTAP(ifp, next); if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) break; #if __FreeBSD_version < 901504 next = drbr_dequeue(ifp, txr->br); #endif } if (enqueued > 0) { /* Set watchdog on */ txr->queue_status = IXGBE_QUEUE_WORKING; txr->watchdog_time = ticks; } if (txr->tx_avail < IXGBE_TX_CLEANUP_THRESHOLD) ixgbe_txeof(txr); return (err); } /* * Called from a taskqueue to drain queued transmit packets. 
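 * (A note on ixgbe_mq_start_locked() above: on __FreeBSD_version >= 901504 the loop peeks at the buf_ring and only drbr_advance()s past an mbuf once ixgbe_xmit() has accepted it, using drbr_putback() on a transient failure; the older dequeue/re-enqueue variant could reorder frames whenever a packet had to be requeued.)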
*/ static void ixgbe_deferred_mq_start(void *arg, int pending) { struct tx_ring *txr = arg; struct adapter *adapter = txr->adapter; struct ifnet *ifp = adapter->ifp; IXGBE_TX_LOCK(txr); if (!drbr_empty(ifp, txr->br)) ixgbe_mq_start_locked(ifp, txr); IXGBE_TX_UNLOCK(txr); } /* ** Flush all ring buffers */ static void ixgbe_qflush(struct ifnet *ifp) { struct adapter *adapter = ifp->if_softc; struct tx_ring *txr = adapter->tx_rings; struct mbuf *m; for (int i = 0; i < adapter->num_queues; i++, txr++) { IXGBE_TX_LOCK(txr); while ((m = buf_ring_dequeue_sc(txr->br)) != NULL) m_freem(m); IXGBE_TX_UNLOCK(txr); } if_qflush(ifp); } #endif /* IXGBE_LEGACY_TX */ /********************************************************************* * Ioctl entry point * * ixgbe_ioctl is called when the user wants to configure the * interface. * * return 0 on success, positive on failure **********************************************************************/ static int ixgbe_ioctl(struct ifnet * ifp, u_long command, caddr_t data) { struct adapter *adapter = ifp->if_softc; struct ixgbe_hw *hw = &adapter->hw; struct ifreq *ifr = (struct ifreq *) data; #if defined(INET) || defined(INET6) struct ifaddr *ifa = (struct ifaddr *)data; bool avoid_reset = FALSE; #endif int error = 0; switch (command) { case SIOCSIFADDR: #ifdef INET if (ifa->ifa_addr->sa_family == AF_INET) avoid_reset = TRUE; #endif #ifdef INET6 if (ifa->ifa_addr->sa_family == AF_INET6) avoid_reset = TRUE; #endif #if defined(INET) || defined(INET6) /* ** Calling init results in link renegotiation, ** so we avoid doing it when possible. */ if (avoid_reset) { ifp->if_flags |= IFF_UP; if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) ixgbe_init(adapter); if (!(ifp->if_flags & IFF_NOARP)) arp_ifinit(ifp, ifa); } else error = ether_ioctl(ifp, command, data); #endif break; case SIOCSIFMTU: IOCTL_DEBUGOUT("ioctl: SIOCSIFMTU (Set Interface MTU)"); if (ifr->ifr_mtu > IXGBE_MAX_FRAME_SIZE - ETHER_HDR_LEN) { error = EINVAL; } else { IXGBE_CORE_LOCK(adapter); ifp->if_mtu = ifr->ifr_mtu; adapter->max_frame_size = ifp->if_mtu + ETHER_HDR_LEN + ETHER_CRC_LEN; ixgbe_init_locked(adapter); IXGBE_CORE_UNLOCK(adapter); } break; case SIOCSIFFLAGS: IOCTL_DEBUGOUT("ioctl: SIOCSIFFLAGS (Set Interface Flags)"); IXGBE_CORE_LOCK(adapter); if (ifp->if_flags & IFF_UP) { if ((ifp->if_drv_flags & IFF_DRV_RUNNING)) { if ((ifp->if_flags ^ adapter->if_flags) & (IFF_PROMISC | IFF_ALLMULTI)) { ixgbe_set_promisc(adapter); } } else ixgbe_init_locked(adapter); } else if (ifp->if_drv_flags & IFF_DRV_RUNNING) ixgbe_stop(adapter); adapter->if_flags = ifp->if_flags; IXGBE_CORE_UNLOCK(adapter); break; case SIOCADDMULTI: case SIOCDELMULTI: IOCTL_DEBUGOUT("ioctl: SIOC(ADD|DEL)MULTI"); if (ifp->if_drv_flags & IFF_DRV_RUNNING) { IXGBE_CORE_LOCK(adapter); ixgbe_disable_intr(adapter); ixgbe_set_multi(adapter); ixgbe_enable_intr(adapter); IXGBE_CORE_UNLOCK(adapter); } break; case SIOCSIFMEDIA: case SIOCGIFMEDIA: IOCTL_DEBUGOUT("ioctl: SIOCxIFMEDIA (Get/Set Interface Media)"); error = ifmedia_ioctl(ifp, ifr, &adapter->media, command); break; case SIOCSIFCAP: { int mask = ifr->ifr_reqcap ^ ifp->if_capenable; IOCTL_DEBUGOUT("ioctl: SIOCSIFCAP (Set Capabilities)"); if (mask & IFCAP_HWCSUM) ifp->if_capenable ^= IFCAP_HWCSUM; if (mask & IFCAP_TSO4) ifp->if_capenable ^= IFCAP_TSO4; if (mask & IFCAP_TSO6) ifp->if_capenable ^= IFCAP_TSO6; if (mask & IFCAP_LRO) ifp->if_capenable ^= IFCAP_LRO; if (mask & IFCAP_VLAN_HWTAGGING) ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING; if (mask & IFCAP_VLAN_HWFILTER) ifp->if_capenable ^= 
IFCAP_VLAN_HWFILTER; if (mask & IFCAP_VLAN_HWTSO) ifp->if_capenable ^= IFCAP_VLAN_HWTSO; if (ifp->if_drv_flags & IFF_DRV_RUNNING) { IXGBE_CORE_LOCK(adapter); ixgbe_init_locked(adapter); IXGBE_CORE_UNLOCK(adapter); } VLAN_CAPABILITIES(ifp); break; } case SIOCGI2C: { struct ixgbe_i2c_req i2c; IOCTL_DEBUGOUT("ioctl: SIOCGI2C (Get I2C Data)"); error = copyin(ifr->ifr_data, &i2c, sizeof(i2c)); if (error) break; if ((i2c.dev_addr != 0xA0) && (i2c.dev_addr != 0xA2)) { error = EINVAL; break; } hw->phy.ops.read_i2c_byte(hw, i2c.offset, i2c.dev_addr, i2c.data); error = copyout(&i2c, ifr->ifr_data, sizeof(i2c)); break; } default: IOCTL_DEBUGOUT1("ioctl: UNKNOWN (0x%X)\n", (int)command); error = ether_ioctl(ifp, command, data); break; } return (error); } /********************************************************************* * Init entry point * * This routine is used in two ways. It is used by the stack as * init entry point in network interface structure. It is also used * by the driver as a hw/sw initialization routine to get to a * consistent state. * * return 0 on success, positive on failure **********************************************************************/ #define IXGBE_MHADD_MFS_SHIFT 16 static void ixgbe_init_locked(struct adapter *adapter) { struct ifnet *ifp = adapter->ifp; device_t dev = adapter->dev; struct ixgbe_hw *hw = &adapter->hw; u32 k, txdctl, mhadd, gpie; u32 rxdctl, rxctrl; mtx_assert(&adapter->core_mtx, MA_OWNED); INIT_DEBUGOUT("ixgbe_init_locked: begin"); hw->adapter_stopped = FALSE; ixgbe_stop_adapter(hw); callout_stop(&adapter->timer); /* reprogram the RAR[0] in case user changed it. */ ixgbe_set_rar(hw, 0, adapter->hw.mac.addr, 0, IXGBE_RAH_AV); /* Get the latest mac address, user can use a LAA */ bcopy(IF_LLADDR(adapter->ifp), hw->mac.addr, IXGBE_ETH_LENGTH_OF_ADDRESS); ixgbe_set_rar(hw, 0, hw->mac.addr, 0, 1); hw->addr_ctrl.rar_used_count = 1; /* Set the various hardware offload abilities */ ifp->if_hwassist = 0; if (ifp->if_capenable & IFCAP_TSO) ifp->if_hwassist |= CSUM_TSO; if (ifp->if_capenable & IFCAP_TXCSUM) { ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP); #if __FreeBSD_version >= 800000 if (hw->mac.type != ixgbe_mac_82598EB) ifp->if_hwassist |= CSUM_SCTP; #endif } /* Prepare transmit descriptors and buffers */ if (ixgbe_setup_transmit_structures(adapter)) { device_printf(dev,"Could not setup transmit structures\n"); ixgbe_stop(adapter); return; } ixgbe_init_hw(hw); ixgbe_initialize_transmit_units(adapter); /* Setup Multicast table */ ixgbe_set_multi(adapter); /* ** Determine the correct mbuf pool ** for doing jumbo frames */ if (adapter->max_frame_size <= 2048) adapter->rx_mbuf_sz = MCLBYTES; else if (adapter->max_frame_size <= 4096) adapter->rx_mbuf_sz = MJUMPAGESIZE; else if (adapter->max_frame_size <= 9216) adapter->rx_mbuf_sz = MJUM9BYTES; else adapter->rx_mbuf_sz = MJUM16BYTES; /* Prepare receive descriptors and buffers */ if (ixgbe_setup_receive_structures(adapter)) { device_printf(dev,"Could not setup receive structures\n"); ixgbe_stop(adapter); return; } /* Configure RX settings */ ixgbe_initialize_receive_units(adapter); gpie = IXGBE_READ_REG(&adapter->hw, IXGBE_GPIE); /* Enable Fan Failure Interrupt */ gpie |= IXGBE_SDP1_GPIEN; /* Add for Module detection */ if (hw->mac.type == ixgbe_mac_82599EB) gpie |= IXGBE_SDP2_GPIEN; /* Thermal Failure Detection */ if (hw->mac.type == ixgbe_mac_X540) gpie |= IXGBE_SDP0_GPIEN; if (adapter->msix > 1) { /* Enable Enhanced MSIX mode */ gpie |= IXGBE_GPIE_MSIX_MODE; gpie |= IXGBE_GPIE_EIAME | IXGBE_GPIE_PBA_SUPPORT |
IXGBE_GPIE_OCD; } IXGBE_WRITE_REG(hw, IXGBE_GPIE, gpie); /* Set MTU size */ if (ifp->if_mtu > ETHERMTU) { mhadd = IXGBE_READ_REG(hw, IXGBE_MHADD); mhadd &= ~IXGBE_MHADD_MFS_MASK; mhadd |= adapter->max_frame_size << IXGBE_MHADD_MFS_SHIFT; IXGBE_WRITE_REG(hw, IXGBE_MHADD, mhadd); } /* Now enable all the queues */ for (int i = 0; i < adapter->num_queues; i++) { txdctl = IXGBE_READ_REG(hw, IXGBE_TXDCTL(i)); txdctl |= IXGBE_TXDCTL_ENABLE; /* Set WTHRESH to 8, burst writeback */ txdctl |= (8 << 16); /* * When the internal queue falls below PTHRESH (32), * start prefetching as long as there are at least * HTHRESH (1) buffers ready. The values are taken * from the Intel linux driver 3.8.21. * Prefetching enables tx line rate even with 1 queue. */ txdctl |= (32 << 0) | (1 << 8); IXGBE_WRITE_REG(hw, IXGBE_TXDCTL(i), txdctl); } for (int i = 0; i < adapter->num_queues; i++) { rxdctl = IXGBE_READ_REG(hw, IXGBE_RXDCTL(i)); if (hw->mac.type == ixgbe_mac_82598EB) { /* ** PTHRESH = 21 ** HTHRESH = 4 ** WTHRESH = 8 */ rxdctl &= ~0x3FFFFF; rxdctl |= 0x080420; } rxdctl |= IXGBE_RXDCTL_ENABLE; IXGBE_WRITE_REG(hw, IXGBE_RXDCTL(i), rxdctl); for (k = 0; k < 10; k++) { if (IXGBE_READ_REG(hw, IXGBE_RXDCTL(i)) & IXGBE_RXDCTL_ENABLE) break; else msec_delay(1); } wmb(); #ifdef DEV_NETMAP /* * In netmap mode, we must preserve the buffers made * available to userspace before the if_init() * (this is true by default on the TX side, because * init makes all buffers available to userspace). * * netmap_reset() and the device specific routines * (e.g. ixgbe_setup_receive_rings()) map these * buffers at the end of the NIC ring, so here we * must set the RDT (tail) register to make sure * they are not overwritten. * * In this driver the NIC ring starts at RDH = 0, * RDT points to the last slot available for reception (?), * so RDT = num_rx_desc - 1 means the whole ring is available. 
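 * (Editorial note: nm_kr_rxspace(), used below since the netmap API update this change tracks, is the helper from netmap_kern.h that returns the number of slots between kring->nr_hwcur and kring->nr_hwtail, i.e. received buffers still in the hands of userspace; it replaces the old kring->nr_hwavail counter, and subtracting it keeps RDT from handing those slots back to the NIC.)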
*/ if (ifp->if_capenable & IFCAP_NETMAP) { struct netmap_adapter *na = NA(adapter->ifp); struct netmap_kring *kring = &na->rx_rings[i]; - int t = na->num_rx_desc - 1 - kring->nr_hwavail; + int t = na->num_rx_desc - 1 - nm_kr_rxspace(kring); IXGBE_WRITE_REG(hw, IXGBE_RDT(i), t); } else #endif /* DEV_NETMAP */ IXGBE_WRITE_REG(hw, IXGBE_RDT(i), adapter->num_rx_desc - 1); } /* Set up VLAN support and filter */ ixgbe_setup_vlan_hw_support(adapter); /* Enable Receive engine */ rxctrl = IXGBE_READ_REG(hw, IXGBE_RXCTRL); if (hw->mac.type == ixgbe_mac_82598EB) rxctrl |= IXGBE_RXCTRL_DMBYPS; rxctrl |= IXGBE_RXCTRL_RXEN; ixgbe_enable_rx_dma(hw, rxctrl); callout_reset(&adapter->timer, hz, ixgbe_local_timer, adapter); /* Set up MSI/X routing */ if (ixgbe_enable_msix) { ixgbe_configure_ivars(adapter); /* Set up auto-mask */ if (hw->mac.type == ixgbe_mac_82598EB) IXGBE_WRITE_REG(hw, IXGBE_EIAM, IXGBE_EICS_RTX_QUEUE); else { IXGBE_WRITE_REG(hw, IXGBE_EIAM_EX(0), 0xFFFFFFFF); IXGBE_WRITE_REG(hw, IXGBE_EIAM_EX(1), 0xFFFFFFFF); } } else { /* Simple settings for Legacy/MSI */ ixgbe_set_ivar(adapter, 0, 0, 0); ixgbe_set_ivar(adapter, 0, 0, 1); IXGBE_WRITE_REG(hw, IXGBE_EIAM, IXGBE_EICS_RTX_QUEUE); } #ifdef IXGBE_FDIR /* Init Flow director */ if (hw->mac.type != ixgbe_mac_82598EB) { u32 hdrm = 32 << fdir_pballoc; hw->mac.ops.setup_rxpba(hw, 0, hdrm, PBA_STRATEGY_EQUAL); ixgbe_init_fdir_signature_82599(&adapter->hw, fdir_pballoc); } #endif /* ** Check on any SFP devices that ** need to be kick-started */ if (hw->phy.type == ixgbe_phy_none) { int err = hw->phy.ops.identify(hw); if (err == IXGBE_ERR_SFP_NOT_SUPPORTED) { device_printf(dev, "Unsupported SFP+ module type was detected.\n"); return; } } /* Set moderation on the Link interrupt */ IXGBE_WRITE_REG(hw, IXGBE_EITR(adapter->linkvec), IXGBE_LINK_ITR); /* Config/Enable Link */ ixgbe_config_link(adapter); /* Hardware Packet Buffer & Flow Control setup */ { u32 rxpb, frame, size, tmp; frame = adapter->max_frame_size; /* Calculate High Water */ if (hw->mac.type == ixgbe_mac_X540) tmp = IXGBE_DV_X540(frame, frame); else tmp = IXGBE_DV(frame, frame); size = IXGBE_BT2KB(tmp); rxpb = IXGBE_READ_REG(hw, IXGBE_RXPBSIZE(0)) >> 10; hw->fc.high_water[0] = rxpb - size; /* Now calculate Low Water */ if (hw->mac.type == ixgbe_mac_X540) tmp = IXGBE_LOW_DV_X540(frame); else tmp = IXGBE_LOW_DV(frame); hw->fc.low_water[0] = IXGBE_BT2KB(tmp); hw->fc.requested_mode = adapter->fc; hw->fc.pause_time = IXGBE_FC_PAUSE; hw->fc.send_xon = TRUE; } /* Initialize the FC settings */ ixgbe_start_hw(hw); /* And now turn on interrupts */ ixgbe_enable_intr(adapter); /* Now inform the stack we're ready */ ifp->if_drv_flags |= IFF_DRV_RUNNING; return; } static void ixgbe_init(void *arg) { struct adapter *adapter = arg; IXGBE_CORE_LOCK(adapter); ixgbe_init_locked(adapter); IXGBE_CORE_UNLOCK(adapter); return; } /* ** ** MSIX Interrupt Handlers and Tasklets ** */ static inline void ixgbe_enable_queue(struct adapter *adapter, u32 vector) { struct ixgbe_hw *hw = &adapter->hw; u64 queue = (u64)(1 << vector); u32 mask; if (hw->mac.type == ixgbe_mac_82598EB) { mask = (IXGBE_EIMS_RTX_QUEUE & queue); IXGBE_WRITE_REG(hw, IXGBE_EIMS, mask); } else { mask = (queue & 0xFFFFFFFF); if (mask) IXGBE_WRITE_REG(hw, IXGBE_EIMS_EX(0), mask); mask = (queue >> 32); if (mask) IXGBE_WRITE_REG(hw, IXGBE_EIMS_EX(1), mask); } } static inline void ixgbe_disable_queue(struct adapter *adapter, u32 vector) { struct ixgbe_hw *hw = &adapter->hw; u64 queue = (u64)(1 << vector); u32 mask; if (hw->mac.type == ixgbe_mac_82598EB) { 
mask = (IXGBE_EIMS_RTX_QUEUE & queue); IXGBE_WRITE_REG(hw, IXGBE_EIMC, mask); } else { mask = (queue & 0xFFFFFFFF); if (mask) IXGBE_WRITE_REG(hw, IXGBE_EIMC_EX(0), mask); mask = (queue >> 32); if (mask) IXGBE_WRITE_REG(hw, IXGBE_EIMC_EX(1), mask); } } static void ixgbe_handle_que(void *context, int pending) { struct ix_queue *que = context; struct adapter *adapter = que->adapter; struct tx_ring *txr = que->txr; struct ifnet *ifp = adapter->ifp; bool more; if (ifp->if_drv_flags & IFF_DRV_RUNNING) { more = ixgbe_rxeof(que); IXGBE_TX_LOCK(txr); ixgbe_txeof(txr); #ifndef IXGBE_LEGACY_TX if (!drbr_empty(ifp, txr->br)) ixgbe_mq_start_locked(ifp, txr); #else if (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) ixgbe_start_locked(txr, ifp); #endif IXGBE_TX_UNLOCK(txr); } /* Reenable this interrupt */ if (que->res != NULL) ixgbe_enable_queue(adapter, que->msix); else ixgbe_enable_intr(adapter); return; } /********************************************************************* * * Legacy Interrupt Service routine * **********************************************************************/ static void ixgbe_legacy_irq(void *arg) { struct ix_queue *que = arg; struct adapter *adapter = que->adapter; struct ixgbe_hw *hw = &adapter->hw; struct ifnet *ifp = adapter->ifp; struct tx_ring *txr = adapter->tx_rings; bool more; u32 reg_eicr; reg_eicr = IXGBE_READ_REG(hw, IXGBE_EICR); ++que->irqs; if (reg_eicr == 0) { ixgbe_enable_intr(adapter); return; } more = ixgbe_rxeof(que); IXGBE_TX_LOCK(txr); ixgbe_txeof(txr); #ifdef IXGBE_LEGACY_TX if (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) ixgbe_start_locked(txr, ifp); #else if (!drbr_empty(ifp, txr->br)) ixgbe_mq_start_locked(ifp, txr); #endif IXGBE_TX_UNLOCK(txr); /* Check for fan failure */ if ((hw->phy.media_type == ixgbe_media_type_copper) && (reg_eicr & IXGBE_EICR_GPI_SDP1)) { device_printf(adapter->dev, "\nCRITICAL: FAN FAILURE!! " "REPLACE IMMEDIATELY!!\n"); IXGBE_WRITE_REG(hw, IXGBE_EIMS, IXGBE_EICR_GPI_SDP1); } /* Link status change */ if (reg_eicr & IXGBE_EICR_LSC) taskqueue_enqueue(adapter->tq, &adapter->link_task); if (more) taskqueue_enqueue(que->tq, &que->que_task); else ixgbe_enable_intr(adapter); return; } /********************************************************************* * * MSIX Queue Interrupt Service routine * **********************************************************************/ void ixgbe_msix_que(void *arg) { struct ix_queue *que = arg; struct adapter *adapter = que->adapter; struct ifnet *ifp = adapter->ifp; struct tx_ring *txr = que->txr; struct rx_ring *rxr = que->rxr; bool more; u32 newitr = 0; /* Protect against spurious interrupts */ if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) return; ixgbe_disable_queue(adapter, que->msix); ++que->irqs; more = ixgbe_rxeof(que); IXGBE_TX_LOCK(txr); ixgbe_txeof(txr); #ifdef IXGBE_LEGACY_TX if (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) ixgbe_start_locked(txr, ifp); #else if (!drbr_empty(ifp, txr->br)) ixgbe_mq_start_locked(ifp, txr); #endif IXGBE_TX_UNLOCK(txr); /* Do AIM now? */ if (ixgbe_enable_aim == FALSE) goto no_calc; /* ** Do Adaptive Interrupt Moderation: ** - Write out last calculated setting ** - Calculate based on average size over ** the last interval.
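 ** Worked example for the code below: with 300000 rx bytes in 200 rx ** packets since the last write, newitr = 300000/200 + 24 = 1524; that is ** within the 3000 cap but not in the (300, 1200) mid range, so it is ** halved to 762 and saved in que->eitr_setting for the next interrupt.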
*/ if (que->eitr_setting) IXGBE_WRITE_REG(&adapter->hw, IXGBE_EITR(que->msix), que->eitr_setting); que->eitr_setting = 0; /* Idle, do nothing */ if ((txr->bytes == 0) && (rxr->bytes == 0)) goto no_calc; if ((txr->bytes) && (txr->packets)) newitr = txr->bytes/txr->packets; if ((rxr->bytes) && (rxr->packets)) newitr = max(newitr, (rxr->bytes / rxr->packets)); newitr += 24; /* account for hardware frame, crc */ /* set an upper boundary */ newitr = min(newitr, 3000); /* Be nice to the mid range */ if ((newitr > 300) && (newitr < 1200)) newitr = (newitr / 3); else newitr = (newitr / 2); if (adapter->hw.mac.type == ixgbe_mac_82598EB) newitr |= newitr << 16; else newitr |= IXGBE_EITR_CNT_WDIS; /* save for next interrupt */ que->eitr_setting = newitr; /* Reset state */ txr->bytes = 0; txr->packets = 0; rxr->bytes = 0; rxr->packets = 0; no_calc: if (more) taskqueue_enqueue(que->tq, &que->que_task); else ixgbe_enable_queue(adapter, que->msix); return; } static void ixgbe_msix_link(void *arg) { struct adapter *adapter = arg; struct ixgbe_hw *hw = &adapter->hw; u32 reg_eicr; ++adapter->link_irq; /* First get the cause */ reg_eicr = IXGBE_READ_REG(hw, IXGBE_EICS); /* Be sure the queue bits are not cleared */ reg_eicr &= ~IXGBE_EICR_RTX_QUEUE; /* Clear interrupt with write */ IXGBE_WRITE_REG(hw, IXGBE_EICR, reg_eicr); /* Link status change */ if (reg_eicr & IXGBE_EICR_LSC) taskqueue_enqueue(adapter->tq, &adapter->link_task); if (adapter->hw.mac.type != ixgbe_mac_82598EB) { #ifdef IXGBE_FDIR if (reg_eicr & IXGBE_EICR_FLOW_DIR) { /* This is probably overkill :) */ if (!atomic_cmpset_int(&adapter->fdir_reinit, 0, 1)) return; /* Disable the interrupt */ IXGBE_WRITE_REG(hw, IXGBE_EIMC, IXGBE_EICR_FLOW_DIR); taskqueue_enqueue(adapter->tq, &adapter->fdir_task); } else #endif if (reg_eicr & IXGBE_EICR_ECC) { device_printf(adapter->dev, "\nCRITICAL: ECC ERROR!! " "Please Reboot!!\n"); IXGBE_WRITE_REG(hw, IXGBE_EICR, IXGBE_EICR_ECC); } else if (reg_eicr & IXGBE_EICR_GPI_SDP1) { /* Clear the interrupt */ IXGBE_WRITE_REG(hw, IXGBE_EICR, IXGBE_EICR_GPI_SDP1); taskqueue_enqueue(adapter->tq, &adapter->msf_task); } else if (reg_eicr & IXGBE_EICR_GPI_SDP2) { /* Clear the interrupt */ IXGBE_WRITE_REG(hw, IXGBE_EICR, IXGBE_EICR_GPI_SDP2); taskqueue_enqueue(adapter->tq, &adapter->mod_task); } } /* Check for fan failure */ if ((hw->device_id == IXGBE_DEV_ID_82598AT) && (reg_eicr & IXGBE_EICR_GPI_SDP1)) { device_printf(adapter->dev, "\nCRITICAL: FAN FAILURE!! " "REPLACE IMMEDIATELY!!\n"); IXGBE_WRITE_REG(hw, IXGBE_EICR, IXGBE_EICR_GPI_SDP1); } /* Check for over temp condition */ if ((hw->mac.type == ixgbe_mac_X540) && (reg_eicr & IXGBE_EICR_TS)) { device_printf(adapter->dev, "\nCRITICAL: OVER TEMP!! " "PHY IS SHUT DOWN!!\n"); device_printf(adapter->dev, "System shutdown required\n"); IXGBE_WRITE_REG(hw, IXGBE_EICR, IXGBE_EICR_TS); } IXGBE_WRITE_REG(&adapter->hw, IXGBE_EIMS, IXGBE_EIMS_OTHER); return; } /********************************************************************* * * Media Ioctl callback * * This routine is called whenever the user queries the status of * the interface using ifconfig. 
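 * For 10Gb links the reported subtype is whatever adapter->optics was set * to by ixgbe_setup_optics() at attach time; 100Mb and 1Gb links are * reported as IFM_100_TX and IFM_1000_SX respectively.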
* **********************************************************************/ static void ixgbe_media_status(struct ifnet * ifp, struct ifmediareq * ifmr) { struct adapter *adapter = ifp->if_softc; INIT_DEBUGOUT("ixgbe_media_status: begin"); IXGBE_CORE_LOCK(adapter); ixgbe_update_link_status(adapter); ifmr->ifm_status = IFM_AVALID; ifmr->ifm_active = IFM_ETHER; if (!adapter->link_active) { IXGBE_CORE_UNLOCK(adapter); return; } ifmr->ifm_status |= IFM_ACTIVE; switch (adapter->link_speed) { case IXGBE_LINK_SPEED_100_FULL: ifmr->ifm_active |= IFM_100_TX | IFM_FDX; break; case IXGBE_LINK_SPEED_1GB_FULL: ifmr->ifm_active |= IFM_1000_SX | IFM_FDX; break; case IXGBE_LINK_SPEED_10GB_FULL: ifmr->ifm_active |= adapter->optics | IFM_FDX; break; } IXGBE_CORE_UNLOCK(adapter); return; } /********************************************************************* * * Media Ioctl callback * * This routine is called when the user changes speed/duplex using * media/mediopt option with ifconfig. * **********************************************************************/ static int ixgbe_media_change(struct ifnet * ifp) { struct adapter *adapter = ifp->if_softc; struct ifmedia *ifm = &adapter->media; INIT_DEBUGOUT("ixgbe_media_change: begin"); if (IFM_TYPE(ifm->ifm_media) != IFM_ETHER) return (EINVAL); switch (IFM_SUBTYPE(ifm->ifm_media)) { case IFM_AUTO: adapter->hw.phy.autoneg_advertised = IXGBE_LINK_SPEED_100_FULL | IXGBE_LINK_SPEED_1GB_FULL | IXGBE_LINK_SPEED_10GB_FULL; break; default: device_printf(adapter->dev, "Only auto media type\n"); return (EINVAL); } return (0); } /********************************************************************* * * This routine maps the mbufs to tx descriptors, allowing the * TX engine to transmit the packets. * - return 0 on success, positive on failure * **********************************************************************/ static int ixgbe_xmit(struct tx_ring *txr, struct mbuf **m_headp) { struct adapter *adapter = txr->adapter; u32 olinfo_status = 0, cmd_type_len; int i, j, error, nsegs; int first; bool remap = TRUE; struct mbuf *m_head; bus_dma_segment_t segs[adapter->num_segs]; bus_dmamap_t map; struct ixgbe_tx_buf *txbuf; union ixgbe_adv_tx_desc *txd = NULL; m_head = *m_headp; /* Basic descriptor defines */ cmd_type_len = (IXGBE_ADVTXD_DTYP_DATA | IXGBE_ADVTXD_DCMD_IFCS | IXGBE_ADVTXD_DCMD_DEXT); if (m_head->m_flags & M_VLANTAG) cmd_type_len |= IXGBE_ADVTXD_DCMD_VLE; /* * Important to capture the first descriptor * used because it will contain the index of * the one we tell the hardware to report back */ first = txr->next_avail_desc; txbuf = &txr->tx_buffers[first]; map = txbuf->map; /* * Map the packet for DMA. */ retry: error = bus_dmamap_load_mbuf_sg(txr->txtag, map, *m_headp, segs, &nsegs, BUS_DMA_NOWAIT); if (__predict_false(error)) { struct mbuf *m; switch (error) { case EFBIG: /* Try it again? 
- one try */ if (remap == TRUE) { remap = FALSE; m = m_defrag(*m_headp, M_NOWAIT); if (m == NULL) { adapter->mbuf_defrag_failed++; m_freem(*m_headp); *m_headp = NULL; return (ENOBUFS); } *m_headp = m; goto retry; } else return (error); case ENOMEM: txr->no_tx_dma_setup++; return (error); default: txr->no_tx_dma_setup++; m_freem(*m_headp); *m_headp = NULL; return (error); } } /* Make certain there are enough descriptors */ if (nsegs > txr->tx_avail - 2) { txr->no_desc_avail++; bus_dmamap_unload(txr->txtag, map); return (ENOBUFS); } m_head = *m_headp; /* ** Set up the appropriate offload context ** this will consume the first descriptor */ error = ixgbe_tx_ctx_setup(txr, m_head, &cmd_type_len, &olinfo_status); if (__predict_false(error)) { if (error == ENOBUFS) *m_headp = NULL; return (error); } #ifdef IXGBE_FDIR /* Do the flow director magic */ if ((txr->atr_sample) && (!adapter->fdir_reinit)) { ++txr->atr_count; if (txr->atr_count >= atr_sample_rate) { ixgbe_atr(txr, m_head); txr->atr_count = 0; } } #endif i = txr->next_avail_desc; for (j = 0; j < nsegs; j++) { bus_size_t seglen; bus_addr_t segaddr; txbuf = &txr->tx_buffers[i]; txd = &txr->tx_base[i]; seglen = segs[j].ds_len; segaddr = htole64(segs[j].ds_addr); txd->read.buffer_addr = segaddr; txd->read.cmd_type_len = htole32(txr->txd_cmd | cmd_type_len |seglen); txd->read.olinfo_status = htole32(olinfo_status); if (++i == txr->num_desc) i = 0; } txd->read.cmd_type_len |= htole32(IXGBE_TXD_CMD_EOP | IXGBE_TXD_CMD_RS); txr->tx_avail -= nsegs; txr->next_avail_desc = i; txbuf->m_head = m_head; /* ** Here we swap the map so the last descriptor, ** which gets the completion interrupt has the ** real map, and the first descriptor gets the ** unused map from this descriptor. */ txr->tx_buffers[first].map = txbuf->map; txbuf->map = map; bus_dmamap_sync(txr->txtag, map, BUS_DMASYNC_PREWRITE); /* Set the EOP descriptor that will be marked done */ txbuf = &txr->tx_buffers[first]; txbuf->eop = txd; bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); /* * Advance the Transmit Descriptor Tail (Tdt), this tells the * hardware that this frame is available to transmit. */ ++txr->total_packets; IXGBE_WRITE_REG(&adapter->hw, IXGBE_TDT(txr->me), i); return (0); } static void ixgbe_set_promisc(struct adapter *adapter) { u_int32_t reg_rctl; struct ifnet *ifp = adapter->ifp; int mcnt = 0; reg_rctl = IXGBE_READ_REG(&adapter->hw, IXGBE_FCTRL); reg_rctl &= (~IXGBE_FCTRL_UPE); if (ifp->if_flags & IFF_ALLMULTI) mcnt = MAX_NUM_MULTICAST_ADDRESSES; else { struct ifmultiaddr *ifma; #if __FreeBSD_version < 800000 IF_ADDR_LOCK(ifp); #else if_maddr_rlock(ifp); #endif TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_LINK) continue; if (mcnt == MAX_NUM_MULTICAST_ADDRESSES) break; mcnt++; } #if __FreeBSD_version < 800000 IF_ADDR_UNLOCK(ifp); #else if_maddr_runlock(ifp); #endif } if (mcnt < MAX_NUM_MULTICAST_ADDRESSES) reg_rctl &= (~IXGBE_FCTRL_MPE); IXGBE_WRITE_REG(&adapter->hw, IXGBE_FCTRL, reg_rctl); if (ifp->if_flags & IFF_PROMISC) { reg_rctl |= (IXGBE_FCTRL_UPE | IXGBE_FCTRL_MPE); IXGBE_WRITE_REG(&adapter->hw, IXGBE_FCTRL, reg_rctl); } else if (ifp->if_flags & IFF_ALLMULTI) { reg_rctl |= IXGBE_FCTRL_MPE; reg_rctl &= ~IXGBE_FCTRL_UPE; IXGBE_WRITE_REG(&adapter->hw, IXGBE_FCTRL, reg_rctl); } return; } /********************************************************************* * Multicast Update * * This routine is called whenever multicast address list is updated. 
* **********************************************************************/ #define IXGBE_RAR_ENTRIES 16 static void ixgbe_set_multi(struct adapter *adapter) { u32 fctrl; u8 *mta; u8 *update_ptr; struct ifmultiaddr *ifma; int mcnt = 0; struct ifnet *ifp = adapter->ifp; IOCTL_DEBUGOUT("ixgbe_set_multi: begin"); mta = adapter->mta; bzero(mta, sizeof(u8) * IXGBE_ETH_LENGTH_OF_ADDRESS * MAX_NUM_MULTICAST_ADDRESSES); #if __FreeBSD_version < 800000 IF_ADDR_LOCK(ifp); #else if_maddr_rlock(ifp); #endif TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_LINK) continue; if (mcnt == MAX_NUM_MULTICAST_ADDRESSES) break; bcopy(LLADDR((struct sockaddr_dl *) ifma->ifma_addr), &mta[mcnt * IXGBE_ETH_LENGTH_OF_ADDRESS], IXGBE_ETH_LENGTH_OF_ADDRESS); mcnt++; } #if __FreeBSD_version < 800000 IF_ADDR_UNLOCK(ifp); #else if_maddr_runlock(ifp); #endif fctrl = IXGBE_READ_REG(&adapter->hw, IXGBE_FCTRL); fctrl |= (IXGBE_FCTRL_UPE | IXGBE_FCTRL_MPE); if (ifp->if_flags & IFF_PROMISC) fctrl |= (IXGBE_FCTRL_UPE | IXGBE_FCTRL_MPE); else if (mcnt >= MAX_NUM_MULTICAST_ADDRESSES || ifp->if_flags & IFF_ALLMULTI) { fctrl |= IXGBE_FCTRL_MPE; fctrl &= ~IXGBE_FCTRL_UPE; } else fctrl &= ~(IXGBE_FCTRL_UPE | IXGBE_FCTRL_MPE); IXGBE_WRITE_REG(&adapter->hw, IXGBE_FCTRL, fctrl); if (mcnt < MAX_NUM_MULTICAST_ADDRESSES) { update_ptr = mta; ixgbe_update_mc_addr_list(&adapter->hw, update_ptr, mcnt, ixgbe_mc_array_itr, TRUE); } return; } /* * This is an iterator function now needed by the multicast * shared code. It simply feeds the shared code routine the * addresses in the array of ixgbe_set_multi() one by one. */ static u8 * ixgbe_mc_array_itr(struct ixgbe_hw *hw, u8 **update_ptr, u32 *vmdq) { u8 *addr = *update_ptr; u8 *newptr; *vmdq = 0; newptr = addr + IXGBE_ETH_LENGTH_OF_ADDRESS; *update_ptr = newptr; return addr; } /********************************************************************* * Timer routine * * This routine checks for link status,updates statistics, * and runs the watchdog check. 
* **********************************************************************/ static void ixgbe_local_timer(void *arg) { struct adapter *adapter = arg; device_t dev = adapter->dev; struct ix_queue *que = adapter->queues; struct tx_ring *txr = adapter->tx_rings; int hung = 0, paused = 0; mtx_assert(&adapter->core_mtx, MA_OWNED); /* Check for pluggable optics */ if (adapter->sfp_probe) if (!ixgbe_sfp_probe(adapter)) goto out; /* Nothing to do */ ixgbe_update_link_status(adapter); ixgbe_update_stats_counters(adapter); /* * If the interface has been paused * then don't do the watchdog check */ if (IXGBE_READ_REG(&adapter->hw, IXGBE_TFCS) & IXGBE_TFCS_TXOFF) paused = 1; /* ** Check the TX queues status ** - watchdog only if all queues show hung */ for (int i = 0; i < adapter->num_queues; i++, que++, txr++) { if ((txr->queue_status == IXGBE_QUEUE_HUNG) && (paused == 0)) ++hung; else if (txr->queue_status == IXGBE_QUEUE_WORKING) taskqueue_enqueue(que->tq, &txr->txq_task); } /* Only truely watchdog if all queues show hung */ if (hung == adapter->num_queues) goto watchdog; out: callout_reset(&adapter->timer, hz, ixgbe_local_timer, adapter); return; watchdog: device_printf(adapter->dev, "Watchdog timeout -- resetting\n"); device_printf(dev,"Queue(%d) tdh = %d, hw tdt = %d\n", txr->me, IXGBE_READ_REG(&adapter->hw, IXGBE_TDH(txr->me)), IXGBE_READ_REG(&adapter->hw, IXGBE_TDT(txr->me))); device_printf(dev,"TX(%d) desc avail = %d," "Next TX to Clean = %d\n", txr->me, txr->tx_avail, txr->next_to_clean); adapter->ifp->if_drv_flags &= ~IFF_DRV_RUNNING; adapter->watchdog_events++; ixgbe_init_locked(adapter); } /* ** Note: this routine updates the OS on the link state ** the real check of the hardware only happens with ** a link interrupt. */ static void ixgbe_update_link_status(struct adapter *adapter) { struct ifnet *ifp = adapter->ifp; device_t dev = adapter->dev; if (adapter->link_up){ if (adapter->link_active == FALSE) { if (bootverbose) device_printf(dev,"Link is up %d Gbps %s \n", ((adapter->link_speed == 128)? 10:1), "Full Duplex"); adapter->link_active = TRUE; /* Update any Flow Control changes */ ixgbe_fc_enable(&adapter->hw); if_link_state_change(ifp, LINK_STATE_UP); } } else { /* Link down */ if (adapter->link_active == TRUE) { if (bootverbose) device_printf(dev,"Link is Down\n"); if_link_state_change(ifp, LINK_STATE_DOWN); adapter->link_active = FALSE; } } return; } /********************************************************************* * * This routine disables all traffic on the adapter by issuing a * global reset on the MAC and deallocates TX/RX buffers. * **********************************************************************/ static void ixgbe_stop(void *arg) { struct ifnet *ifp; struct adapter *adapter = arg; struct ixgbe_hw *hw = &adapter->hw; ifp = adapter->ifp; mtx_assert(&adapter->core_mtx, MA_OWNED); INIT_DEBUGOUT("ixgbe_stop: begin\n"); ixgbe_disable_intr(adapter); callout_stop(&adapter->timer); /* Let the stack know...*/ ifp->if_drv_flags &= ~IFF_DRV_RUNNING; ixgbe_reset_hw(hw); hw->adapter_stopped = FALSE; ixgbe_stop_adapter(hw); if (hw->mac.type == ixgbe_mac_82599EB) ixgbe_stop_mac_link_on_d3_82599(hw); /* Turn off the laser - noop with no optics */ ixgbe_disable_tx_laser(hw); /* Update the stack */ adapter->link_up = FALSE; ixgbe_update_link_status(adapter); /* reprogram the RAR[0] in case user changed it. 
*/ ixgbe_set_rar(&adapter->hw, 0, adapter->hw.mac.addr, 0, IXGBE_RAH_AV); return; } /********************************************************************* * * Determine hardware revision. * **********************************************************************/ static void ixgbe_identify_hardware(struct adapter *adapter) { device_t dev = adapter->dev; struct ixgbe_hw *hw = &adapter->hw; /* Save off the information about this board */ hw->vendor_id = pci_get_vendor(dev); hw->device_id = pci_get_device(dev); hw->revision_id = pci_read_config(dev, PCIR_REVID, 1); hw->subsystem_vendor_id = pci_read_config(dev, PCIR_SUBVEND_0, 2); hw->subsystem_device_id = pci_read_config(dev, PCIR_SUBDEV_0, 2); /* We need this here to set the num_segs below */ ixgbe_set_mac_type(hw); /* Pick up the 82599 and VF settings */ if (hw->mac.type != ixgbe_mac_82598EB) { hw->phy.smart_speed = ixgbe_smart_speed; adapter->num_segs = IXGBE_82599_SCATTER; } else adapter->num_segs = IXGBE_82598_SCATTER; return; } /********************************************************************* * * Determine optic type * **********************************************************************/ static void ixgbe_setup_optics(struct adapter *adapter) { struct ixgbe_hw *hw = &adapter->hw; int layer; layer = ixgbe_get_supported_physical_layer(hw); if (layer & IXGBE_PHYSICAL_LAYER_10GBASE_T) { adapter->optics = IFM_10G_T; return; } if (layer & IXGBE_PHYSICAL_LAYER_1000BASE_T) { adapter->optics = IFM_1000_T; return; } if (layer & IXGBE_PHYSICAL_LAYER_1000BASE_SX) { adapter->optics = IFM_1000_SX; return; } if (layer & (IXGBE_PHYSICAL_LAYER_10GBASE_LR | IXGBE_PHYSICAL_LAYER_10GBASE_LRM)) { adapter->optics = IFM_10G_LR; return; } if (layer & IXGBE_PHYSICAL_LAYER_10GBASE_SR) { adapter->optics = IFM_10G_SR; return; } if (layer & IXGBE_PHYSICAL_LAYER_SFP_PLUS_CU) { adapter->optics = IFM_10G_TWINAX; return; } if (layer & (IXGBE_PHYSICAL_LAYER_10GBASE_KX4 | IXGBE_PHYSICAL_LAYER_10GBASE_CX4)) { adapter->optics = IFM_10G_CX4; return; } /* If we get here just set the default */ adapter->optics = IFM_ETHER | IFM_AUTO; return; } /********************************************************************* * * Setup the Legacy or MSI Interrupt handler * **********************************************************************/ static int ixgbe_allocate_legacy(struct adapter *adapter) { device_t dev = adapter->dev; struct ix_queue *que = adapter->queues; #ifndef IXGBE_LEGACY_TX struct tx_ring *txr = adapter->tx_rings; #endif int error, rid = 0; /* MSI RID at 1 */ if (adapter->msix == 1) rid = 1; /* We allocate a single interrupt resource */ adapter->res = bus_alloc_resource_any(dev, SYS_RES_IRQ, &rid, RF_SHAREABLE | RF_ACTIVE); if (adapter->res == NULL) { device_printf(dev, "Unable to allocate bus resource: " "interrupt\n"); return (ENXIO); } /* * Try allocating a fast interrupt and the associated deferred * processing contexts. 
*/ #ifndef IXGBE_LEGACY_TX TASK_INIT(&txr->txq_task, 0, ixgbe_deferred_mq_start, txr); #endif TASK_INIT(&que->que_task, 0, ixgbe_handle_que, que); que->tq = taskqueue_create_fast("ixgbe_que", M_NOWAIT, taskqueue_thread_enqueue, &que->tq); taskqueue_start_threads(&que->tq, 1, PI_NET, "%s ixq", device_get_nameunit(adapter->dev)); /* Tasklets for Link, SFP and Multispeed Fiber */ TASK_INIT(&adapter->link_task, 0, ixgbe_handle_link, adapter); TASK_INIT(&adapter->mod_task, 0, ixgbe_handle_mod, adapter); TASK_INIT(&adapter->msf_task, 0, ixgbe_handle_msf, adapter); #ifdef IXGBE_FDIR TASK_INIT(&adapter->fdir_task, 0, ixgbe_reinit_fdir, adapter); #endif adapter->tq = taskqueue_create_fast("ixgbe_link", M_NOWAIT, taskqueue_thread_enqueue, &adapter->tq); taskqueue_start_threads(&adapter->tq, 1, PI_NET, "%s linkq", device_get_nameunit(adapter->dev)); if ((error = bus_setup_intr(dev, adapter->res, INTR_TYPE_NET | INTR_MPSAFE, NULL, ixgbe_legacy_irq, que, &adapter->tag)) != 0) { device_printf(dev, "Failed to register fast interrupt " "handler: %d\n", error); taskqueue_free(que->tq); taskqueue_free(adapter->tq); que->tq = NULL; adapter->tq = NULL; return (error); } /* For simplicity in the handlers */ adapter->que_mask = IXGBE_EIMS_ENABLE_MASK; return (0); } /********************************************************************* * * Setup MSIX Interrupt resources and handlers * **********************************************************************/ static int ixgbe_allocate_msix(struct adapter *adapter) { device_t dev = adapter->dev; struct ix_queue *que = adapter->queues; struct tx_ring *txr = adapter->tx_rings; int error, rid, vector = 0; for (int i = 0; i < adapter->num_queues; i++, vector++, que++, txr++) { rid = vector + 1; que->res = bus_alloc_resource_any(dev, SYS_RES_IRQ, &rid, RF_SHAREABLE | RF_ACTIVE); if (que->res == NULL) { device_printf(dev,"Unable to allocate" " bus resource: que interrupt [%d]\n", vector); return (ENXIO); } /* Set the handler function */ error = bus_setup_intr(dev, que->res, INTR_TYPE_NET | INTR_MPSAFE, NULL, ixgbe_msix_que, que, &que->tag); if (error) { que->res = NULL; device_printf(dev, "Failed to register QUE handler"); return (error); } #if __FreeBSD_version >= 800504 bus_describe_intr(dev, que->res, que->tag, "que %d", i); #endif que->msix = vector; adapter->que_mask |= (u64)(1 << que->msix); /* ** Bind the msix vector, and thus the ** ring to the corresponding cpu. 
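	** Pinning the vector keeps a queue's interrupt, its taskqueue
	** work and its rings on the same CPU for cache locality; this
	** is only done when more than one queue is configured.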
*/ if (adapter->num_queues > 1) bus_bind_intr(dev, que->res, i); #ifndef IXGBE_LEGACY_TX TASK_INIT(&txr->txq_task, 0, ixgbe_deferred_mq_start, txr); #endif TASK_INIT(&que->que_task, 0, ixgbe_handle_que, que); que->tq = taskqueue_create_fast("ixgbe_que", M_NOWAIT, taskqueue_thread_enqueue, &que->tq); taskqueue_start_threads(&que->tq, 1, PI_NET, "%s que", device_get_nameunit(adapter->dev)); } /* and Link */ rid = vector + 1; adapter->res = bus_alloc_resource_any(dev, SYS_RES_IRQ, &rid, RF_SHAREABLE | RF_ACTIVE); if (!adapter->res) { device_printf(dev,"Unable to allocate" " bus resource: Link interrupt [%d]\n", rid); return (ENXIO); } /* Set the link handler function */ error = bus_setup_intr(dev, adapter->res, INTR_TYPE_NET | INTR_MPSAFE, NULL, ixgbe_msix_link, adapter, &adapter->tag); if (error) { adapter->res = NULL; device_printf(dev, "Failed to register LINK handler"); return (error); } #if __FreeBSD_version >= 800504 bus_describe_intr(dev, adapter->res, adapter->tag, "link"); #endif adapter->linkvec = vector; /* Tasklets for Link, SFP and Multispeed Fiber */ TASK_INIT(&adapter->link_task, 0, ixgbe_handle_link, adapter); TASK_INIT(&adapter->mod_task, 0, ixgbe_handle_mod, adapter); TASK_INIT(&adapter->msf_task, 0, ixgbe_handle_msf, adapter); #ifdef IXGBE_FDIR TASK_INIT(&adapter->fdir_task, 0, ixgbe_reinit_fdir, adapter); #endif adapter->tq = taskqueue_create_fast("ixgbe_link", M_NOWAIT, taskqueue_thread_enqueue, &adapter->tq); taskqueue_start_threads(&adapter->tq, 1, PI_NET, "%s linkq", device_get_nameunit(adapter->dev)); return (0); } /* * Setup Either MSI/X or MSI */ static int ixgbe_setup_msix(struct adapter *adapter) { device_t dev = adapter->dev; int rid, want, queues, msgs; /* Override by tuneable */ if (ixgbe_enable_msix == 0) goto msi; /* First try MSI/X */ msgs = pci_msix_count(dev); if (msgs == 0) goto msi; rid = PCIR_BAR(MSIX_82598_BAR); adapter->msix_mem = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid, RF_ACTIVE); if (adapter->msix_mem == NULL) { rid += 4; /* 82599 maps in higher BAR */ adapter->msix_mem = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid, RF_ACTIVE); } if (adapter->msix_mem == NULL) { /* May not be enabled */ device_printf(adapter->dev, "Unable to map MSIX table \n"); goto msi; } /* Figure out a reasonable auto config value */ queues = (mp_ncpus > (msgs-1)) ? (msgs-1) : mp_ncpus; if (ixgbe_num_queues != 0) queues = ixgbe_num_queues; /* Set max queues to 8 when autoconfiguring */ else if ((ixgbe_num_queues == 0) && (queues > 8)) queues = 8; /* reflect correct sysctl value */ ixgbe_num_queues = queues; /* ** Want one vector (RX/TX pair) per queue ** plus an additional for Link. 
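	** E.g. with the autoconfig cap of 8 queues this requests 9
	** vectors; if the device advertises fewer messages than that,
	** the allocation is released and we fall back to MSI below.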
*/ want = queues + 1; if (msgs >= want) msgs = want; else { device_printf(adapter->dev, "MSIX Configuration Problem, " "%d vectors but %d queues wanted!\n", msgs, want); goto msi; } if ((pci_alloc_msix(dev, &msgs) == 0) && (msgs == want)) { device_printf(adapter->dev, "Using MSIX interrupts with %d vectors\n", msgs); adapter->num_queues = queues; return (msgs); } /* ** If MSIX alloc failed or provided us with ** less than needed, free and fall through to MSI */ pci_release_msi(dev); msi: if (adapter->msix_mem != NULL) { bus_release_resource(dev, SYS_RES_MEMORY, rid, adapter->msix_mem); adapter->msix_mem = NULL; } msgs = 1; if (pci_alloc_msi(dev, &msgs) == 0) { device_printf(adapter->dev,"Using an MSI interrupt\n"); return (msgs); } device_printf(adapter->dev,"Using a Legacy interrupt\n"); return (0); } static int ixgbe_allocate_pci_resources(struct adapter *adapter) { int rid; device_t dev = adapter->dev; rid = PCIR_BAR(0); adapter->pci_mem = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid, RF_ACTIVE); if (!(adapter->pci_mem)) { device_printf(dev,"Unable to allocate bus resource: memory\n"); return (ENXIO); } adapter->osdep.mem_bus_space_tag = rman_get_bustag(adapter->pci_mem); adapter->osdep.mem_bus_space_handle = rman_get_bushandle(adapter->pci_mem); adapter->hw.hw_addr = (u8 *) &adapter->osdep.mem_bus_space_handle; /* Legacy defaults */ adapter->num_queues = 1; adapter->hw.back = &adapter->osdep; /* ** Now setup MSI or MSI/X, should ** return us the number of supported ** vectors. (Will be 1 for MSI) */ adapter->msix = ixgbe_setup_msix(adapter); return (0); } static void ixgbe_free_pci_resources(struct adapter * adapter) { struct ix_queue *que = adapter->queues; device_t dev = adapter->dev; int rid, memrid; if (adapter->hw.mac.type == ixgbe_mac_82598EB) memrid = PCIR_BAR(MSIX_82598_BAR); else memrid = PCIR_BAR(MSIX_82599_BAR); /* ** There is a slight possibility of a failure mode ** in attach that will result in entering this function ** before interrupt resources have been initialized, and ** in that case we do not want to execute the loops below ** We can detect this reliably by the state of the adapter ** res pointer. */ if (adapter->res == NULL) goto mem; /* ** Release all msix queue resources: */ for (int i = 0; i < adapter->num_queues; i++, que++) { rid = que->msix + 1; if (que->tag != NULL) { bus_teardown_intr(dev, que->res, que->tag); que->tag = NULL; } if (que->res != NULL) bus_release_resource(dev, SYS_RES_IRQ, rid, que->res); } /* Clean the Legacy or Link interrupt last */ if (adapter->linkvec) /* we are doing MSIX */ rid = adapter->linkvec + 1; else (adapter->msix != 0) ? (rid = 1):(rid = 0); if (adapter->tag != NULL) { bus_teardown_intr(dev, adapter->res, adapter->tag); adapter->tag = NULL; } if (adapter->res != NULL) bus_release_resource(dev, SYS_RES_IRQ, rid, adapter->res); mem: if (adapter->msix) pci_release_msi(dev); if (adapter->msix_mem != NULL) bus_release_resource(dev, SYS_RES_MEMORY, memrid, adapter->msix_mem); if (adapter->pci_mem != NULL) bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BAR(0), adapter->pci_mem); return; } /********************************************************************* * * Setup networking device structure and register an interface. 
* **********************************************************************/ static int ixgbe_setup_interface(device_t dev, struct adapter *adapter) { struct ixgbe_hw *hw = &adapter->hw; struct ifnet *ifp; INIT_DEBUGOUT("ixgbe_setup_interface: begin"); ifp = adapter->ifp = if_alloc(IFT_ETHER); if (ifp == NULL) { device_printf(dev, "can not allocate ifnet structure\n"); return (-1); } if_initname(ifp, device_get_name(dev), device_get_unit(dev)); #if __FreeBSD_version < 1000025 ifp->if_baudrate = 1000000000; #else if_initbaudrate(ifp, IF_Gbps(10)); #endif ifp->if_init = ixgbe_init; ifp->if_softc = adapter; ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; ifp->if_ioctl = ixgbe_ioctl; #ifndef IXGBE_LEGACY_TX ifp->if_transmit = ixgbe_mq_start; ifp->if_qflush = ixgbe_qflush; #else ifp->if_start = ixgbe_start; IFQ_SET_MAXLEN(&ifp->if_snd, adapter->num_tx_desc - 2); ifp->if_snd.ifq_drv_maxlen = adapter->num_tx_desc - 2; IFQ_SET_READY(&ifp->if_snd); #endif ether_ifattach(ifp, adapter->hw.mac.addr); adapter->max_frame_size = ifp->if_mtu + ETHER_HDR_LEN + ETHER_CRC_LEN; /* * Tell the upper layer(s) we support long frames. */ ifp->if_data.ifi_hdrlen = sizeof(struct ether_vlan_header); ifp->if_capabilities |= IFCAP_HWCSUM | IFCAP_TSO | IFCAP_VLAN_HWCSUM; ifp->if_capabilities |= IFCAP_JUMBO_MTU; ifp->if_capabilities |= IFCAP_LRO; ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWTSO | IFCAP_VLAN_MTU; ifp->if_capenable = ifp->if_capabilities; /* ** Don't turn this on by default, if vlans are ** created on another pseudo device (eg. lagg) ** then vlan events are not passed thru, breaking ** operation, but with HW FILTER off it works. If ** using vlans directly on the ixgbe driver you can ** enable this and get full hardware tag filtering. */ ifp->if_capabilities |= IFCAP_VLAN_HWFILTER; /* * Specify the media types supported by this adapter and register * callbacks to update media and link information */ ifmedia_init(&adapter->media, IFM_IMASK, ixgbe_media_change, ixgbe_media_status); ifmedia_add(&adapter->media, IFM_ETHER | adapter->optics, 0, NULL); ifmedia_set(&adapter->media, IFM_ETHER | adapter->optics); if (hw->device_id == IXGBE_DEV_ID_82598AT) { ifmedia_add(&adapter->media, IFM_ETHER | IFM_1000_T | IFM_FDX, 0, NULL); ifmedia_add(&adapter->media, IFM_ETHER | IFM_1000_T, 0, NULL); } ifmedia_add(&adapter->media, IFM_ETHER | IFM_AUTO, 0, NULL); ifmedia_set(&adapter->media, IFM_ETHER | IFM_AUTO); return (0); } static void ixgbe_config_link(struct adapter *adapter) { struct ixgbe_hw *hw = &adapter->hw; u32 autoneg, err = 0; bool sfp, negotiate; sfp = ixgbe_is_sfp(hw); if (sfp) { if (hw->phy.multispeed_fiber) { hw->mac.ops.setup_sfp(hw); ixgbe_enable_tx_laser(hw); taskqueue_enqueue(adapter->tq, &adapter->msf_task); } else taskqueue_enqueue(adapter->tq, &adapter->mod_task); } else { if (hw->mac.ops.check_link) err = ixgbe_check_link(hw, &adapter->link_speed, &adapter->link_up, FALSE); if (err) goto out; autoneg = hw->phy.autoneg_advertised; if ((!autoneg) && (hw->mac.ops.get_link_capabilities)) err = hw->mac.ops.get_link_capabilities(hw, &autoneg, &negotiate); if (err) goto out; if (hw->mac.ops.setup_link) err = hw->mac.ops.setup_link(hw, autoneg, adapter->link_up); } out: return; } /******************************************************************** * Manage DMA'able memory. 
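 *
 *  ixgbe_dmamap_cb() is the callback handed to bus_dmamap_load(); it
 *  records the bus address of the lone segment so ixgbe_dma_malloc()
 *  can hand it back through dma->dma_paddr.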
*******************************************************************/ static void ixgbe_dmamap_cb(void *arg, bus_dma_segment_t * segs, int nseg, int error) { if (error) return; *(bus_addr_t *) arg = segs->ds_addr; return; } static int ixgbe_dma_malloc(struct adapter *adapter, bus_size_t size, struct ixgbe_dma_alloc *dma, int mapflags) { device_t dev = adapter->dev; int r; r = bus_dma_tag_create(bus_get_dma_tag(adapter->dev), /* parent */ DBA_ALIGN, 0, /* alignment, bounds */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ size, /* maxsize */ 1, /* nsegments */ size, /* maxsegsize */ BUS_DMA_ALLOCNOW, /* flags */ NULL, /* lockfunc */ NULL, /* lockfuncarg */ &dma->dma_tag); if (r != 0) { device_printf(dev,"ixgbe_dma_malloc: bus_dma_tag_create failed; " "error %u\n", r); goto fail_0; } r = bus_dmamem_alloc(dma->dma_tag, (void **)&dma->dma_vaddr, BUS_DMA_NOWAIT, &dma->dma_map); if (r != 0) { device_printf(dev,"ixgbe_dma_malloc: bus_dmamem_alloc failed; " "error %u\n", r); goto fail_1; } r = bus_dmamap_load(dma->dma_tag, dma->dma_map, dma->dma_vaddr, size, ixgbe_dmamap_cb, &dma->dma_paddr, mapflags | BUS_DMA_NOWAIT); if (r != 0) { device_printf(dev,"ixgbe_dma_malloc: bus_dmamap_load failed; " "error %u\n", r); goto fail_2; } dma->dma_size = size; return (0); fail_2: bus_dmamem_free(dma->dma_tag, dma->dma_vaddr, dma->dma_map); fail_1: bus_dma_tag_destroy(dma->dma_tag); fail_0: dma->dma_map = NULL; dma->dma_tag = NULL; return (r); } static void ixgbe_dma_free(struct adapter *adapter, struct ixgbe_dma_alloc *dma) { bus_dmamap_sync(dma->dma_tag, dma->dma_map, BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(dma->dma_tag, dma->dma_map); bus_dmamem_free(dma->dma_tag, dma->dma_vaddr, dma->dma_map); bus_dma_tag_destroy(dma->dma_tag); } /********************************************************************* * * Allocate memory for the transmit and receive rings, and then * the descriptors associated with each, called only once at attach. 
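 *
 *  Descriptor areas are rounded up to DBA_ALIGN, and the txconf/rxconf
 *  counters below let the error paths unwind exactly what was set up.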
* **********************************************************************/ static int ixgbe_allocate_queues(struct adapter *adapter) { device_t dev = adapter->dev; struct ix_queue *que; struct tx_ring *txr; struct rx_ring *rxr; int rsize, tsize, error = IXGBE_SUCCESS; int txconf = 0, rxconf = 0; /* First allocate the top level queue structs */ if (!(adapter->queues = (struct ix_queue *) malloc(sizeof(struct ix_queue) * adapter->num_queues, M_DEVBUF, M_NOWAIT | M_ZERO))) { device_printf(dev, "Unable to allocate queue memory\n"); error = ENOMEM; goto fail; } /* First allocate the TX ring struct memory */ if (!(adapter->tx_rings = (struct tx_ring *) malloc(sizeof(struct tx_ring) * adapter->num_queues, M_DEVBUF, M_NOWAIT | M_ZERO))) { device_printf(dev, "Unable to allocate TX ring memory\n"); error = ENOMEM; goto tx_fail; } /* Next allocate the RX */ if (!(adapter->rx_rings = (struct rx_ring *) malloc(sizeof(struct rx_ring) * adapter->num_queues, M_DEVBUF, M_NOWAIT | M_ZERO))) { device_printf(dev, "Unable to allocate RX ring memory\n"); error = ENOMEM; goto rx_fail; } /* For the ring itself */ tsize = roundup2(adapter->num_tx_desc * sizeof(union ixgbe_adv_tx_desc), DBA_ALIGN); /* * Now set up the TX queues, txconf is needed to handle the * possibility that things fail midcourse and we need to * undo memory gracefully */ for (int i = 0; i < adapter->num_queues; i++, txconf++) { /* Set up some basics */ txr = &adapter->tx_rings[i]; txr->adapter = adapter; txr->me = i; txr->num_desc = adapter->num_tx_desc; /* Initialize the TX side lock */ snprintf(txr->mtx_name, sizeof(txr->mtx_name), "%s:tx(%d)", device_get_nameunit(dev), txr->me); mtx_init(&txr->tx_mtx, txr->mtx_name, NULL, MTX_DEF); if (ixgbe_dma_malloc(adapter, tsize, &txr->txdma, BUS_DMA_NOWAIT)) { device_printf(dev, "Unable to allocate TX Descriptor memory\n"); error = ENOMEM; goto err_tx_desc; } txr->tx_base = (union ixgbe_adv_tx_desc *)txr->txdma.dma_vaddr; bzero((void *)txr->tx_base, tsize); /* Now allocate transmit buffers for the ring */ if (ixgbe_allocate_transmit_buffers(txr)) { device_printf(dev, "Critical Failure setting up transmit buffers\n"); error = ENOMEM; goto err_tx_desc; } #ifndef IXGBE_LEGACY_TX /* Allocate a buf ring */ txr->br = buf_ring_alloc(IXGBE_BR_SIZE, M_DEVBUF, M_WAITOK, &txr->tx_mtx); if (txr->br == NULL) { device_printf(dev, "Critical Failure setting up buf ring\n"); error = ENOMEM; goto err_tx_desc; } #endif } /* * Next the RX queues... 
*/ rsize = roundup2(adapter->num_rx_desc * sizeof(union ixgbe_adv_rx_desc), DBA_ALIGN); for (int i = 0; i < adapter->num_queues; i++, rxconf++) { rxr = &adapter->rx_rings[i]; /* Set up some basics */ rxr->adapter = adapter; rxr->me = i; rxr->num_desc = adapter->num_rx_desc; /* Initialize the RX side lock */ snprintf(rxr->mtx_name, sizeof(rxr->mtx_name), "%s:rx(%d)", device_get_nameunit(dev), rxr->me); mtx_init(&rxr->rx_mtx, rxr->mtx_name, NULL, MTX_DEF); if (ixgbe_dma_malloc(adapter, rsize, &rxr->rxdma, BUS_DMA_NOWAIT)) { device_printf(dev, "Unable to allocate RxDescriptor memory\n"); error = ENOMEM; goto err_rx_desc; } rxr->rx_base = (union ixgbe_adv_rx_desc *)rxr->rxdma.dma_vaddr; bzero((void *)rxr->rx_base, rsize); /* Allocate receive buffers for the ring*/ if (ixgbe_allocate_receive_buffers(rxr)) { device_printf(dev, "Critical Failure setting up receive buffers\n"); error = ENOMEM; goto err_rx_desc; } } /* ** Finally set up the queue holding structs */ for (int i = 0; i < adapter->num_queues; i++) { que = &adapter->queues[i]; que->adapter = adapter; que->txr = &adapter->tx_rings[i]; que->rxr = &adapter->rx_rings[i]; } return (0); err_rx_desc: for (rxr = adapter->rx_rings; rxconf > 0; rxr++, rxconf--) ixgbe_dma_free(adapter, &rxr->rxdma); err_tx_desc: for (txr = adapter->tx_rings; txconf > 0; txr++, txconf--) ixgbe_dma_free(adapter, &txr->txdma); free(adapter->rx_rings, M_DEVBUF); rx_fail: free(adapter->tx_rings, M_DEVBUF); tx_fail: free(adapter->queues, M_DEVBUF); fail: return (error); } /********************************************************************* * * Allocate memory for tx_buffer structures. The tx_buffer stores all * the information needed to transmit a packet on the wire. This is * called only once at attach, setup is done every reset. * **********************************************************************/ static int ixgbe_allocate_transmit_buffers(struct tx_ring *txr) { struct adapter *adapter = txr->adapter; device_t dev = adapter->dev; struct ixgbe_tx_buf *txbuf; int error, i; /* * Setup DMA descriptor areas. */ if ((error = bus_dma_tag_create( bus_get_dma_tag(adapter->dev), /* parent */ 1, 0, /* alignment, bounds */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ IXGBE_TSO_SIZE, /* maxsize */ adapter->num_segs, /* nsegments */ PAGE_SIZE, /* maxsegsize */ 0, /* flags */ NULL, /* lockfunc */ NULL, /* lockfuncarg */ &txr->txtag))) { device_printf(dev,"Unable to allocate TX DMA tag\n"); goto fail; } if (!(txr->tx_buffers = (struct ixgbe_tx_buf *) malloc(sizeof(struct ixgbe_tx_buf) * adapter->num_tx_desc, M_DEVBUF, M_NOWAIT | M_ZERO))) { device_printf(dev, "Unable to allocate tx_buffer memory\n"); error = ENOMEM; goto fail; } /* Create the descriptor buffer dma maps */ txbuf = txr->tx_buffers; for (i = 0; i < adapter->num_tx_desc; i++, txbuf++) { error = bus_dmamap_create(txr->txtag, 0, &txbuf->map); if (error != 0) { device_printf(dev, "Unable to create TX DMA map\n"); goto fail; } } return 0; fail: /* We free all, it handles case where we are in the middle */ ixgbe_free_transmit_structures(adapter); return (error); } /********************************************************************* * * Initialize a transmit ring. 
* **********************************************************************/ static void ixgbe_setup_transmit_ring(struct tx_ring *txr) { struct adapter *adapter = txr->adapter; struct ixgbe_tx_buf *txbuf; int i; #ifdef DEV_NETMAP struct netmap_adapter *na = NA(adapter->ifp); struct netmap_slot *slot; #endif /* DEV_NETMAP */ /* Clear the old ring contents */ IXGBE_TX_LOCK(txr); #ifdef DEV_NETMAP /* * (under lock): if in netmap mode, do some consistency * checks and set slot to entry 0 of the netmap ring. */ slot = netmap_reset(na, NR_TX, txr->me, 0); #endif /* DEV_NETMAP */ bzero((void *)txr->tx_base, (sizeof(union ixgbe_adv_tx_desc)) * adapter->num_tx_desc); /* Reset indices */ txr->next_avail_desc = 0; txr->next_to_clean = 0; /* Free any existing tx buffers. */ txbuf = txr->tx_buffers; for (i = 0; i < txr->num_desc; i++, txbuf++) { if (txbuf->m_head != NULL) { bus_dmamap_sync(txr->txtag, txbuf->map, BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(txr->txtag, txbuf->map); m_freem(txbuf->m_head); txbuf->m_head = NULL; } #ifdef DEV_NETMAP /* * In netmap mode, set the map for the packet buffer. * NOTE: Some drivers (not this one) also need to set * the physical buffer address in the NIC ring. * Slots in the netmap ring (indexed by "si") are * kring->nkr_hwofs positions "ahead" wrt the * corresponding slot in the NIC ring. In some drivers * (not here) nkr_hwofs can be negative. Function * netmap_idx_n2k() handles wraparounds properly. */ if (slot) { int si = netmap_idx_n2k(&na->tx_rings[txr->me], i); netmap_load_map(txr->txtag, txbuf->map, NMB(slot + si)); } #endif /* DEV_NETMAP */ /* Clear the EOP descriptor pointer */ txbuf->eop = NULL; } #ifdef IXGBE_FDIR /* Set the rate at which we sample packets */ if (adapter->hw.mac.type != ixgbe_mac_82598EB) txr->atr_sample = atr_sample_rate; #endif /* Set number of descriptors available */ txr->tx_avail = adapter->num_tx_desc; bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); IXGBE_TX_UNLOCK(txr); } /********************************************************************* * * Initialize all transmit rings. * **********************************************************************/ static int ixgbe_setup_transmit_structures(struct adapter *adapter) { struct tx_ring *txr = adapter->tx_rings; for (int i = 0; i < adapter->num_queues; i++, txr++) ixgbe_setup_transmit_ring(txr); return (0); } /********************************************************************* * * Enable transmit unit. 
* **********************************************************************/ static void ixgbe_initialize_transmit_units(struct adapter *adapter) { struct tx_ring *txr = adapter->tx_rings; struct ixgbe_hw *hw = &adapter->hw; /* Setup the Base and Length of the Tx Descriptor Ring */ for (int i = 0; i < adapter->num_queues; i++, txr++) { u64 tdba = txr->txdma.dma_paddr; u32 txctrl; IXGBE_WRITE_REG(hw, IXGBE_TDBAL(i), (tdba & 0x00000000ffffffffULL)); IXGBE_WRITE_REG(hw, IXGBE_TDBAH(i), (tdba >> 32)); IXGBE_WRITE_REG(hw, IXGBE_TDLEN(i), adapter->num_tx_desc * sizeof(union ixgbe_adv_tx_desc)); /* Setup the HW Tx Head and Tail descriptor pointers */ IXGBE_WRITE_REG(hw, IXGBE_TDH(i), 0); IXGBE_WRITE_REG(hw, IXGBE_TDT(i), 0); /* Setup Transmit Descriptor Cmd Settings */ txr->txd_cmd = IXGBE_TXD_CMD_IFCS; txr->queue_status = IXGBE_QUEUE_IDLE; /* Set the processing limit */ txr->process_limit = ixgbe_tx_process_limit; /* Disable Head Writeback */ switch (hw->mac.type) { case ixgbe_mac_82598EB: txctrl = IXGBE_READ_REG(hw, IXGBE_DCA_TXCTRL(i)); break; case ixgbe_mac_82599EB: case ixgbe_mac_X540: default: txctrl = IXGBE_READ_REG(hw, IXGBE_DCA_TXCTRL_82599(i)); break; } txctrl &= ~IXGBE_DCA_TXCTRL_DESC_WRO_EN; switch (hw->mac.type) { case ixgbe_mac_82598EB: IXGBE_WRITE_REG(hw, IXGBE_DCA_TXCTRL(i), txctrl); break; case ixgbe_mac_82599EB: case ixgbe_mac_X540: default: IXGBE_WRITE_REG(hw, IXGBE_DCA_TXCTRL_82599(i), txctrl); break; } } if (hw->mac.type != ixgbe_mac_82598EB) { u32 dmatxctl, rttdcs; dmatxctl = IXGBE_READ_REG(hw, IXGBE_DMATXCTL); dmatxctl |= IXGBE_DMATXCTL_TE; IXGBE_WRITE_REG(hw, IXGBE_DMATXCTL, dmatxctl); /* Disable arbiter to set MTQC */ rttdcs = IXGBE_READ_REG(hw, IXGBE_RTTDCS); rttdcs |= IXGBE_RTTDCS_ARBDIS; IXGBE_WRITE_REG(hw, IXGBE_RTTDCS, rttdcs); IXGBE_WRITE_REG(hw, IXGBE_MTQC, IXGBE_MTQC_64Q_1PB); rttdcs &= ~IXGBE_RTTDCS_ARBDIS; IXGBE_WRITE_REG(hw, IXGBE_RTTDCS, rttdcs); } return; } /********************************************************************* * * Free all transmit rings. * **********************************************************************/ static void ixgbe_free_transmit_structures(struct adapter *adapter) { struct tx_ring *txr = adapter->tx_rings; for (int i = 0; i < adapter->num_queues; i++, txr++) { IXGBE_TX_LOCK(txr); ixgbe_free_transmit_buffers(txr); ixgbe_dma_free(adapter, &txr->txdma); IXGBE_TX_UNLOCK(txr); IXGBE_TX_LOCK_DESTROY(txr); } free(adapter->tx_rings, M_DEVBUF); } /********************************************************************* * * Free transmit ring related data structures. 
 *
 **********************************************************************/
static void
ixgbe_free_transmit_buffers(struct tx_ring *txr)
{
	struct adapter *adapter = txr->adapter;
	struct ixgbe_tx_buf *tx_buffer;
	int i;

	INIT_DEBUGOUT("ixgbe_free_transmit_ring: begin");

	if (txr->tx_buffers == NULL)
		return;

	tx_buffer = txr->tx_buffers;
	for (i = 0; i < adapter->num_tx_desc; i++, tx_buffer++) {
		if (tx_buffer->m_head != NULL) {
			bus_dmamap_sync(txr->txtag, tx_buffer->map,
			    BUS_DMASYNC_POSTWRITE);
			bus_dmamap_unload(txr->txtag,
			    tx_buffer->map);
			m_freem(tx_buffer->m_head);
			tx_buffer->m_head = NULL;
			if (tx_buffer->map != NULL) {
				bus_dmamap_destroy(txr->txtag,
				    tx_buffer->map);
				tx_buffer->map = NULL;
			}
		} else if (tx_buffer->map != NULL) {
			bus_dmamap_unload(txr->txtag,
			    tx_buffer->map);
			bus_dmamap_destroy(txr->txtag,
			    tx_buffer->map);
			tx_buffer->map = NULL;
		}
	}
#ifndef IXGBE_LEGACY_TX	/* matches the #ifndef allocation in ixgbe_allocate_queues() */
	if (txr->br != NULL)
		buf_ring_free(txr->br, M_DEVBUF);
#endif
	if (txr->tx_buffers != NULL) {
		free(txr->tx_buffers, M_DEVBUF);
		txr->tx_buffers = NULL;
	}
	if (txr->txtag != NULL) {
		bus_dma_tag_destroy(txr->txtag);
		txr->txtag = NULL;
	}
	return;
}

/*********************************************************************
 *
 *  Advanced Context Descriptor setup for VLAN, CSUM or TSO
 *
 **********************************************************************/

static int
ixgbe_tx_ctx_setup(struct tx_ring *txr, struct mbuf *mp,
    u32 *cmd_type_len, u32 *olinfo_status)
{
	struct ixgbe_adv_tx_context_desc *TXD;
	struct ether_vlan_header *eh;
	struct ip *ip;
	struct ip6_hdr *ip6;
	u32 vlan_macip_lens = 0, type_tucmd_mlhl = 0;
	int	ehdrlen, ip_hlen = 0;
	u16	etype;
	u8	ipproto = 0;
	int	offload = TRUE;
	int	ctxd = txr->next_avail_desc;
	u16	vtag = 0;

	/* First check if TSO is to be used */
	if (mp->m_pkthdr.csum_flags & CSUM_TSO)
		return (ixgbe_tso_setup(txr, mp, cmd_type_len, olinfo_status));

	if ((mp->m_pkthdr.csum_flags & CSUM_OFFLOAD) == 0)
		offload = FALSE;

	/* Indicate the whole packet as payload when not doing TSO */
	*olinfo_status |= mp->m_pkthdr.len << IXGBE_ADVTXD_PAYLEN_SHIFT;

	/* Now ready a context descriptor */
	TXD = (struct ixgbe_adv_tx_context_desc *) &txr->tx_base[ctxd];

	/*
	** In advanced descriptors the vlan tag must
	** be placed into the context descriptor. Hence
	** we need to make one even if not doing offloads.
	*/
	if (mp->m_flags & M_VLANTAG) {
		vtag = htole16(mp->m_pkthdr.ether_vtag);
		vlan_macip_lens |= (vtag << IXGBE_ADVTXD_VLAN_SHIFT);
	} else if (offload == FALSE) /* ... no offload to do */
		return (0);

	/*
	 * Determine where frame payload starts.
	 * Jump over vlan headers if already present,
	 * helpful for QinQ too.
	 */
	eh = mtod(mp, struct ether_vlan_header *);
	if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) {
		etype = ntohs(eh->evl_proto);
		ehdrlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
	} else {
		etype = ntohs(eh->evl_encap_proto);
		ehdrlen = ETHER_HDR_LEN;
	}

	/* Set the ether header length */
	vlan_macip_lens |= ehdrlen << IXGBE_ADVTXD_MACLEN_SHIFT;

	switch (etype) {
		case ETHERTYPE_IP:
			ip = (struct ip *)(mp->m_data + ehdrlen);
			ip_hlen = ip->ip_hl << 2;
			ipproto = ip->ip_p;
			type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_IPV4;
			break;
		case ETHERTYPE_IPV6:
			ip6 = (struct ip6_hdr *)(mp->m_data + ehdrlen);
			ip_hlen = sizeof(struct ip6_hdr);
			/* XXX-BZ this will go badly in case of ext hdrs.
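			 * (ip_hlen is fixed at sizeof(struct ip6_hdr) here,
			 * so extension headers would throw off both ip_hlen
			 * and the ip6_nxt protocol read below.)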
*/ ipproto = ip6->ip6_nxt; type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_IPV6; break; default: offload = FALSE; break; } vlan_macip_lens |= ip_hlen; type_tucmd_mlhl |= IXGBE_ADVTXD_DCMD_DEXT | IXGBE_ADVTXD_DTYP_CTXT; switch (ipproto) { case IPPROTO_TCP: if (mp->m_pkthdr.csum_flags & CSUM_TCP) type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_L4T_TCP; break; case IPPROTO_UDP: if (mp->m_pkthdr.csum_flags & CSUM_UDP) type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_L4T_UDP; break; #if __FreeBSD_version >= 800000 case IPPROTO_SCTP: if (mp->m_pkthdr.csum_flags & CSUM_SCTP) type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_L4T_SCTP; break; #endif default: offload = FALSE; break; } if (offload) /* For the TX descriptor setup */ *olinfo_status |= IXGBE_TXD_POPTS_TXSM << 8; /* Now copy bits into descriptor */ TXD->vlan_macip_lens = htole32(vlan_macip_lens); TXD->type_tucmd_mlhl = htole32(type_tucmd_mlhl); TXD->seqnum_seed = htole32(0); TXD->mss_l4len_idx = htole32(0); /* We've consumed the first desc, adjust counters */ if (++ctxd == txr->num_desc) ctxd = 0; txr->next_avail_desc = ctxd; --txr->tx_avail; return (0); } /********************************************************************** * * Setup work for hardware segmentation offload (TSO) on * adapters using advanced tx descriptors * **********************************************************************/ static int ixgbe_tso_setup(struct tx_ring *txr, struct mbuf *mp, u32 *cmd_type_len, u32 *olinfo_status) { struct ixgbe_adv_tx_context_desc *TXD; u32 vlan_macip_lens = 0, type_tucmd_mlhl = 0; u32 mss_l4len_idx = 0, paylen; u16 vtag = 0, eh_type; int ctxd, ehdrlen, ip_hlen, tcp_hlen; struct ether_vlan_header *eh; #ifdef INET6 struct ip6_hdr *ip6; #endif #ifdef INET struct ip *ip; #endif struct tcphdr *th; /* * Determine where frame payload starts. * Jump over vlan headers if already present */ eh = mtod(mp, struct ether_vlan_header *); if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) { ehdrlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN; eh_type = eh->evl_proto; } else { ehdrlen = ETHER_HDR_LEN; eh_type = eh->evl_encap_proto; } switch (ntohs(eh_type)) { #ifdef INET6 case ETHERTYPE_IPV6: ip6 = (struct ip6_hdr *)(mp->m_data + ehdrlen); /* XXX-BZ For now we do not pretend to support ext. hdrs. */ if (ip6->ip6_nxt != IPPROTO_TCP) return (ENXIO); ip_hlen = sizeof(struct ip6_hdr); ip6 = (struct ip6_hdr *)(mp->m_data + ehdrlen); th = (struct tcphdr *)((caddr_t)ip6 + ip_hlen); th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0); type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_IPV6; break; #endif #ifdef INET case ETHERTYPE_IP: ip = (struct ip *)(mp->m_data + ehdrlen); if (ip->ip_p != IPPROTO_TCP) return (ENXIO); ip->ip_sum = 0; ip_hlen = ip->ip_hl << 2; th = (struct tcphdr *)((caddr_t)ip + ip_hlen); th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(IPPROTO_TCP)); type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_IPV4; /* Tell transmit desc to also do IPv4 checksum. 
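	 * (needed because ip->ip_sum was zeroed above when the
	 * pseudo-header checksum was stored for the hardware)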
*/ *olinfo_status |= IXGBE_TXD_POPTS_IXSM << 8; break; #endif default: panic("%s: CSUM_TSO but no supported IP version (0x%04x)", __func__, ntohs(eh_type)); break; } ctxd = txr->next_avail_desc; TXD = (struct ixgbe_adv_tx_context_desc *) &txr->tx_base[ctxd]; tcp_hlen = th->th_off << 2; /* This is used in the transmit desc in encap */ paylen = mp->m_pkthdr.len - ehdrlen - ip_hlen - tcp_hlen; /* VLAN MACLEN IPLEN */ if (mp->m_flags & M_VLANTAG) { vtag = htole16(mp->m_pkthdr.ether_vtag); vlan_macip_lens |= (vtag << IXGBE_ADVTXD_VLAN_SHIFT); } vlan_macip_lens |= ehdrlen << IXGBE_ADVTXD_MACLEN_SHIFT; vlan_macip_lens |= ip_hlen; TXD->vlan_macip_lens = htole32(vlan_macip_lens); /* ADV DTYPE TUCMD */ type_tucmd_mlhl |= IXGBE_ADVTXD_DCMD_DEXT | IXGBE_ADVTXD_DTYP_CTXT; type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_L4T_TCP; TXD->type_tucmd_mlhl = htole32(type_tucmd_mlhl); /* MSS L4LEN IDX */ mss_l4len_idx |= (mp->m_pkthdr.tso_segsz << IXGBE_ADVTXD_MSS_SHIFT); mss_l4len_idx |= (tcp_hlen << IXGBE_ADVTXD_L4LEN_SHIFT); TXD->mss_l4len_idx = htole32(mss_l4len_idx); TXD->seqnum_seed = htole32(0); if (++ctxd == txr->num_desc) ctxd = 0; txr->tx_avail--; txr->next_avail_desc = ctxd; *cmd_type_len |= IXGBE_ADVTXD_DCMD_TSE; *olinfo_status |= IXGBE_TXD_POPTS_TXSM << 8; *olinfo_status |= paylen << IXGBE_ADVTXD_PAYLEN_SHIFT; ++txr->tso_tx; return (0); } #ifdef IXGBE_FDIR /* ** This routine parses packet headers so that Flow ** Director can make a hashed filter table entry ** allowing traffic flows to be identified and kept ** on the same cpu. This would be a performance ** hit, but we only do it at IXGBE_FDIR_RATE of ** packets. */ static void ixgbe_atr(struct tx_ring *txr, struct mbuf *mp) { struct adapter *adapter = txr->adapter; struct ix_queue *que; struct ip *ip; struct tcphdr *th; struct udphdr *uh; struct ether_vlan_header *eh; union ixgbe_atr_hash_dword input = {.dword = 0}; union ixgbe_atr_hash_dword common = {.dword = 0}; int ehdrlen, ip_hlen; u16 etype; eh = mtod(mp, struct ether_vlan_header *); if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) { ehdrlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN; etype = eh->evl_proto; } else { ehdrlen = ETHER_HDR_LEN; etype = eh->evl_encap_proto; } /* Only handling IPv4 */ if (etype != htons(ETHERTYPE_IP)) return; ip = (struct ip *)(mp->m_data + ehdrlen); ip_hlen = ip->ip_hl << 2; /* check if we're UDP or TCP */ switch (ip->ip_p) { case IPPROTO_TCP: th = (struct tcphdr *)((caddr_t)ip + ip_hlen); /* src and dst are inverted */ common.port.dst ^= th->th_sport; common.port.src ^= th->th_dport; input.formatted.flow_type ^= IXGBE_ATR_FLOW_TYPE_TCPV4; break; case IPPROTO_UDP: uh = (struct udphdr *)((caddr_t)ip + ip_hlen); /* src and dst are inverted */ common.port.dst ^= uh->uh_sport; common.port.src ^= uh->uh_dport; input.formatted.flow_type ^= IXGBE_ATR_FLOW_TYPE_UDPV4; break; default: return; } input.formatted.vlan_id = htobe16(mp->m_pkthdr.ether_vtag); if (mp->m_pkthdr.ether_vtag) common.flex_bytes ^= htons(ETHERTYPE_VLAN); else common.flex_bytes ^= etype; common.ip ^= ip->ip_src.s_addr ^ ip->ip_dst.s_addr; que = &adapter->queues[txr->me]; /* ** This assumes the Rx queue and Tx ** queue are bound to the same CPU */ ixgbe_fdir_add_signature_filter_82599(&adapter->hw, input, common, que->msix); } #endif /* IXGBE_FDIR */ /********************************************************************** * * Examine each tx_buffer in the used queue. If the hardware is done * processing the packet then free associated resources. The * tx_buffer is put back on the free queue. 
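 *
 *  Note the loop below biases its ring index negative (work -=
 *  num_desc, the distance to the ring end) so that the wrap test
 *  is a cheap comparison against zero.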
* **********************************************************************/ static void ixgbe_txeof(struct tx_ring *txr) { struct adapter *adapter = txr->adapter; struct ifnet *ifp = adapter->ifp; u32 work, processed = 0; u16 limit = txr->process_limit; struct ixgbe_tx_buf *buf; union ixgbe_adv_tx_desc *txd; mtx_assert(&txr->tx_mtx, MA_OWNED); #ifdef DEV_NETMAP if (ifp->if_capenable & IFCAP_NETMAP) { struct netmap_adapter *na = NA(ifp); struct netmap_kring *kring = &na->tx_rings[txr->me]; txd = txr->tx_base; bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map, BUS_DMASYNC_POSTREAD); /* * In netmap mode, all the work is done in the context * of the client thread. Interrupt handlers only wake up * clients, which may be sleeping on individual rings * or on a global resource for all rings. * To implement tx interrupt mitigation, we wake up the client * thread roughly every half ring, even if the NIC interrupts * more frequently. This is implemented as follows: * - ixgbe_txsync() sets kring->nr_kflags with the index of * the slot that should wake up the thread (nkr_num_slots * means the user thread should not be woken up); * - the driver ignores tx interrupts unless netmap_mitigate=0 * or the slot has the DD bit set. */ if (!netmap_mitigate || (kring->nr_kflags < kring->nkr_num_slots && txd[kring->nr_kflags].wb.status & IXGBE_TXD_STAT_DD)) { netmap_tx_irq(ifp, txr->me); } return; } #endif /* DEV_NETMAP */ if (txr->tx_avail == txr->num_desc) { txr->queue_status = IXGBE_QUEUE_IDLE; return; } /* Get work starting point */ work = txr->next_to_clean; buf = &txr->tx_buffers[work]; txd = &txr->tx_base[work]; work -= txr->num_desc; /* The distance to ring end */ bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map, BUS_DMASYNC_POSTREAD); do { union ixgbe_adv_tx_desc *eop= buf->eop; if (eop == NULL) /* No work */ break; if ((eop->wb.status & IXGBE_TXD_STAT_DD) == 0) break; /* I/O not complete */ if (buf->m_head) { txr->bytes += buf->m_head->m_pkthdr.len; bus_dmamap_sync(txr->txtag, buf->map, BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(txr->txtag, buf->map); m_freem(buf->m_head); buf->m_head = NULL; buf->map = NULL; } buf->eop = NULL; ++txr->tx_avail; /* We clean the range if multi segment */ while (txd != eop) { ++txd; ++buf; ++work; /* wrap the ring? */ if (__predict_false(!work)) { work -= txr->num_desc; buf = txr->tx_buffers; txd = txr->tx_base; } if (buf->m_head) { txr->bytes += buf->m_head->m_pkthdr.len; bus_dmamap_sync(txr->txtag, buf->map, BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(txr->txtag, buf->map); m_freem(buf->m_head); buf->m_head = NULL; buf->map = NULL; } ++txr->tx_avail; buf->eop = NULL; } ++txr->packets; ++processed; ++ifp->if_opackets; txr->watchdog_time = ticks; /* Try the next packet */ ++txd; ++buf; ++work; /* reset with a wrap */ if (__predict_false(!work)) { work -= txr->num_desc; buf = txr->tx_buffers; txd = txr->tx_base; } prefetch(txd); } while (__predict_true(--limit)); bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); work += txr->num_desc; txr->next_to_clean = work; /* ** Watchdog calculation, we know there's ** work outstanding or the first return ** would have been taken, so none processed ** for too long indicates a hang. 
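	** (A queue that completes nothing for IXGBE_WATCHDOG ticks is
	** marked HUNG; ixgbe_local_timer() resets the adapter only
	** once every queue reports HUNG.)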
*/ if ((!processed) && ((ticks - txr->watchdog_time) > IXGBE_WATCHDOG)) txr->queue_status = IXGBE_QUEUE_HUNG; if (txr->tx_avail == txr->num_desc) txr->queue_status = IXGBE_QUEUE_IDLE; return; } /********************************************************************* * * Refresh mbuf buffers for RX descriptor rings * - now keeps its own state so discards due to resource * exhaustion are unnecessary, if an mbuf cannot be obtained * it just returns, keeping its placeholder, thus it can simply * be recalled to try again. * **********************************************************************/ static void ixgbe_refresh_mbufs(struct rx_ring *rxr, int limit) { struct adapter *adapter = rxr->adapter; bus_dma_segment_t seg[1]; struct ixgbe_rx_buf *rxbuf; struct mbuf *mp; int i, j, nsegs, error; bool refreshed = FALSE; i = j = rxr->next_to_refresh; /* Control the loop with one beyond */ if (++j == rxr->num_desc) j = 0; while (j != limit) { rxbuf = &rxr->rx_buffers[i]; if (rxbuf->buf == NULL) { mp = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, rxr->mbuf_sz); if (mp == NULL) goto update; if (adapter->max_frame_size <= (MCLBYTES - ETHER_ALIGN)) m_adj(mp, ETHER_ALIGN); } else mp = rxbuf->buf; mp->m_pkthdr.len = mp->m_len = rxr->mbuf_sz; /* If we're dealing with an mbuf that was copied rather * than replaced, there's no need to go through busdma. */ if ((rxbuf->flags & IXGBE_RX_COPY) == 0) { /* Get the memory mapping */ error = bus_dmamap_load_mbuf_sg(rxr->ptag, rxbuf->pmap, mp, seg, &nsegs, BUS_DMA_NOWAIT); if (error != 0) { printf("Refresh mbufs: payload dmamap load" " failure - %d\n", error); m_free(mp); rxbuf->buf = NULL; goto update; } rxbuf->buf = mp; bus_dmamap_sync(rxr->ptag, rxbuf->pmap, BUS_DMASYNC_PREREAD); rxbuf->addr = rxr->rx_base[i].read.pkt_addr = htole64(seg[0].ds_addr); } else { rxr->rx_base[i].read.pkt_addr = rxbuf->addr; rxbuf->flags &= ~IXGBE_RX_COPY; } refreshed = TRUE; /* Next is precalculated */ i = j; rxr->next_to_refresh = i; if (++j == rxr->num_desc) j = 0; } update: if (refreshed) /* Update hardware tail index */ IXGBE_WRITE_REG(&adapter->hw, IXGBE_RDT(rxr->me), rxr->next_to_refresh); return; } /********************************************************************* * * Allocate memory for rx_buffer structures. Since we use one * rx_buffer per received packet, the maximum number of rx_buffer's * that we'll need is equal to the number of receive descriptors * that we've allocated. 
* **********************************************************************/ static int ixgbe_allocate_receive_buffers(struct rx_ring *rxr) { struct adapter *adapter = rxr->adapter; device_t dev = adapter->dev; struct ixgbe_rx_buf *rxbuf; int i, bsize, error; bsize = sizeof(struct ixgbe_rx_buf) * rxr->num_desc; if (!(rxr->rx_buffers = (struct ixgbe_rx_buf *) malloc(bsize, M_DEVBUF, M_NOWAIT | M_ZERO))) { device_printf(dev, "Unable to allocate rx_buffer memory\n"); error = ENOMEM; goto fail; } if ((error = bus_dma_tag_create(bus_get_dma_tag(dev), /* parent */ 1, 0, /* alignment, bounds */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ MJUM16BYTES, /* maxsize */ 1, /* nsegments */ MJUM16BYTES, /* maxsegsize */ 0, /* flags */ NULL, /* lockfunc */ NULL, /* lockfuncarg */ &rxr->ptag))) { device_printf(dev, "Unable to create RX DMA tag\n"); goto fail; } for (i = 0; i < rxr->num_desc; i++, rxbuf++) { rxbuf = &rxr->rx_buffers[i]; error = bus_dmamap_create(rxr->ptag, BUS_DMA_NOWAIT, &rxbuf->pmap); if (error) { device_printf(dev, "Unable to create RX dma map\n"); goto fail; } } return (0); fail: /* Frees all, but can handle partial completion */ ixgbe_free_receive_structures(adapter); return (error); } /* ** Used to detect a descriptor that has ** been merged by Hardware RSC. */ static inline u32 ixgbe_rsc_count(union ixgbe_adv_rx_desc *rx) { return (le32toh(rx->wb.lower.lo_dword.data) & IXGBE_RXDADV_RSCCNT_MASK) >> IXGBE_RXDADV_RSCCNT_SHIFT; } /********************************************************************* * * Initialize Hardware RSC (LRO) feature on 82599 * for an RX ring, this is toggled by the LRO capability * even though it is transparent to the stack. * * NOTE: since this HW feature only works with IPV4 and * our testing has shown soft LRO to be as effective * I have decided to disable this by default. 
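 *
 *  It can be re-enabled with the ixgbe_rsc_enable knob, which
 *  ixgbe_setup_receive_ring() checks before falling back to the
 *  software tcp_lro path.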
 *
 **********************************************************************/
static void
ixgbe_setup_hw_rsc(struct rx_ring *rxr)
{
	struct	adapter		*adapter = rxr->adapter;
	struct	ixgbe_hw	*hw = &adapter->hw;
	u32			rscctrl, rdrxctl;

	/* If turning LRO/RSC off we need to disable it */
	if ((adapter->ifp->if_capenable & IFCAP_LRO) == 0) {
		rscctrl = IXGBE_READ_REG(hw, IXGBE_RSCCTL(rxr->me));
		rscctrl &= ~IXGBE_RSCCTL_RSCEN;
		/* Write the cleared enable bit back; the
		** read-modify above is otherwise discarded. */
		IXGBE_WRITE_REG(hw, IXGBE_RSCCTL(rxr->me), rscctrl);
		return;
	}

	rdrxctl = IXGBE_READ_REG(hw, IXGBE_RDRXCTL);
	rdrxctl &= ~IXGBE_RDRXCTL_RSCFRSTSIZE;
#ifdef DEV_NETMAP /* crcstrip is optional in netmap */
	if (adapter->ifp->if_capenable & IFCAP_NETMAP && !ix_crcstrip)
#endif /* DEV_NETMAP */
	rdrxctl |= IXGBE_RDRXCTL_CRCSTRIP;
	rdrxctl |= IXGBE_RDRXCTL_RSCACKC;
	IXGBE_WRITE_REG(hw, IXGBE_RDRXCTL, rdrxctl);

	rscctrl = IXGBE_READ_REG(hw, IXGBE_RSCCTL(rxr->me));
	rscctrl |= IXGBE_RSCCTL_RSCEN;
	/*
	** Limit the total number of descriptors that
	** can be combined, so it does not exceed 64K
	*/
	if (rxr->mbuf_sz == MCLBYTES)
		rscctrl |= IXGBE_RSCCTL_MAXDESC_16;
	else if (rxr->mbuf_sz == MJUMPAGESIZE)
		rscctrl |= IXGBE_RSCCTL_MAXDESC_8;
	else if (rxr->mbuf_sz == MJUM9BYTES)
		rscctrl |= IXGBE_RSCCTL_MAXDESC_4;
	else  /* Using 16K cluster */
		rscctrl |= IXGBE_RSCCTL_MAXDESC_1;

	IXGBE_WRITE_REG(hw, IXGBE_RSCCTL(rxr->me), rscctrl);

	/* Enable TCP header recognition */
	IXGBE_WRITE_REG(hw, IXGBE_PSRTYPE(0),
	    (IXGBE_READ_REG(hw, IXGBE_PSRTYPE(0)) |
	    IXGBE_PSRTYPE_TCPHDR));

	/* Disable RSC for ACK packets */
	IXGBE_WRITE_REG(hw, IXGBE_RSCDBU,
	    (IXGBE_RSCDBU_RSCACKDIS | IXGBE_READ_REG(hw, IXGBE_RSCDBU)));

	rxr->hw_rsc = TRUE;
}

static void
ixgbe_free_receive_ring(struct rx_ring *rxr)
{
	struct ixgbe_rx_buf	*rxbuf;
	int i;

	for (i = 0; i < rxr->num_desc; i++) {
		rxbuf = &rxr->rx_buffers[i];
		if (rxbuf->buf != NULL) {
			bus_dmamap_sync(rxr->ptag, rxbuf->pmap,
			    BUS_DMASYNC_POSTREAD);
			bus_dmamap_unload(rxr->ptag, rxbuf->pmap);
			rxbuf->buf->m_flags |= M_PKTHDR;
			m_freem(rxbuf->buf);
			rxbuf->buf = NULL;
			rxbuf->flags = 0;
		}
	}
}

/*********************************************************************
 *
 *  Initialize a receive ring and its buffers.
 *
 **********************************************************************/
static int
ixgbe_setup_receive_ring(struct rx_ring *rxr)
{
	struct	adapter		*adapter;
	struct ifnet		*ifp;
	device_t		dev;
	struct ixgbe_rx_buf	*rxbuf;
	bus_dma_segment_t	seg[1];
	struct lro_ctrl		*lro = &rxr->lro;
	int			rsize, nsegs, error = 0;
#ifdef DEV_NETMAP
	struct netmap_adapter *na = NA(rxr->adapter->ifp);
	struct netmap_slot *slot;
#endif /* DEV_NETMAP */

	adapter = rxr->adapter;
	ifp = adapter->ifp;
	dev = adapter->dev;

	/* Clear the ring contents */
	IXGBE_RX_LOCK(rxr);
#ifdef DEV_NETMAP
	/* same as in ixgbe_setup_transmit_ring() */
	slot = netmap_reset(na, NR_RX, rxr->me, 0);
#endif /* DEV_NETMAP */
	rsize = roundup2(adapter->num_rx_desc *
	    sizeof(union ixgbe_adv_rx_desc), DBA_ALIGN);
	bzero((void *)rxr->rx_base, rsize);
	/* Cache the size */
	rxr->mbuf_sz = adapter->rx_mbuf_sz;

	/* Free current RX buffer structs and their mbufs */
	ixgbe_free_receive_ring(rxr);

	/* Now replenish the mbufs */
	for (int j = 0; j != rxr->num_desc; ++j) {
		struct mbuf	*mp;

		rxbuf = &rxr->rx_buffers[j];
#ifdef DEV_NETMAP
		/*
		 * In netmap mode, fill the map and set the buffer
		 * address in the NIC ring, considering the offset
		 * between the netmap and NIC rings (see comment in
		 * ixgbe_setup_transmit_ring() ).
		 * No need to allocate an mbuf, so end the block with
		 * a continue;
		 */
		if (slot) {
			int sj = netmap_idx_n2k(&na->rx_rings[rxr->me], j);
			uint64_t paddr;
			void *addr;

			addr = PNMB(slot + sj, &paddr);
			netmap_load_map(rxr->ptag, rxbuf->pmap, addr);
			/* Update descriptor and the cached value */
			rxr->rx_base[j].read.pkt_addr = htole64(paddr);
			rxbuf->addr = htole64(paddr);
			continue;
		}
#endif /* DEV_NETMAP */
		rxbuf->flags = 0;
		rxbuf->buf = m_getjcl(M_NOWAIT, MT_DATA,
		    M_PKTHDR, adapter->rx_mbuf_sz);
		if (rxbuf->buf == NULL) {
			error = ENOBUFS;
			goto fail;
		}
		mp = rxbuf->buf;
		mp->m_pkthdr.len = mp->m_len = rxr->mbuf_sz;
		/* Get the memory mapping */
		error = bus_dmamap_load_mbuf_sg(rxr->ptag,
		    rxbuf->pmap, mp, seg, &nsegs, BUS_DMA_NOWAIT);
		if (error != 0)
			goto fail;
		bus_dmamap_sync(rxr->ptag,
		    rxbuf->pmap, BUS_DMASYNC_PREREAD);
		/* Update the descriptor and the cached value */
		rxr->rx_base[j].read.pkt_addr = htole64(seg[0].ds_addr);
		rxbuf->addr = htole64(seg[0].ds_addr);
	}

	/* Setup our descriptor indices */
	rxr->next_to_check = 0;
	rxr->next_to_refresh = 0;
	rxr->lro_enabled = FALSE;
	rxr->rx_copies = 0;
	rxr->rx_bytes = 0;
	rxr->discard = FALSE;
	rxr->vtag_strip = FALSE;

	bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map,
	    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);

	/*
	** Now set up the LRO interface:
	*/
	if (ixgbe_rsc_enable)
		ixgbe_setup_hw_rsc(rxr);
	else if (ifp->if_capenable & IFCAP_LRO) {
		int err = tcp_lro_init(lro);
		if (err) {
			device_printf(dev, "LRO Initialization failed!\n");
			goto fail;
		}
		INIT_DEBUGOUT("RX Soft LRO Initialized\n");
		rxr->lro_enabled = TRUE;
		lro->ifp = adapter->ifp;
	}

	IXGBE_RX_UNLOCK(rxr);
	return (0);

fail:
	ixgbe_free_receive_ring(rxr);
	IXGBE_RX_UNLOCK(rxr);
	return (error);
}

/*********************************************************************
 *
 *  Initialize all receive rings.
 *
 **********************************************************************/
static int
ixgbe_setup_receive_structures(struct adapter *adapter)
{
	struct rx_ring *rxr = adapter->rx_rings;
	int j;

	for (j = 0; j < adapter->num_queues; j++, rxr++)
		if (ixgbe_setup_receive_ring(rxr))
			goto fail;

	return (0);
fail:
	/*
	 * Free RX buffers allocated so far, we will only handle
	 * the rings that completed, the failing case will have
	 * cleaned up for itself. 'j' failed, so it's the terminus.
	 */
	for (int i = 0; i < j; ++i) {
		rxr = &adapter->rx_rings[i];
		ixgbe_free_receive_ring(rxr);
	}

	return (ENOBUFS);
}

/*********************************************************************
 *
 *  Setup receive registers and features.
 *
 **********************************************************************/
#define IXGBE_SRRCTL_BSIZEHDRSIZE_SHIFT 2
#define BSIZEPKT_ROUNDUP ((1 << IXGBE_SRRCTL_BSIZEPKT_SHIFT) - 1)

static void
ixgbe_initialize_receive_units(struct adapter *adapter)
{
	struct	rx_ring	*rxr = adapter->rx_rings;
	struct ixgbe_hw	*hw = &adapter->hw;
	struct ifnet   *ifp = adapter->ifp;
	u32		bufsz, rxctrl, fctrl, srrctl, rxcsum;
	u32		reta, mrqc = 0, hlreg, random[10];

	/*
	 * Make sure receives are disabled while
	 * setting up the descriptor ring
	 */
	rxctrl = IXGBE_READ_REG(hw, IXGBE_RXCTRL);
	IXGBE_WRITE_REG(hw, IXGBE_RXCTRL,
	    rxctrl & ~IXGBE_RXCTRL_RXEN);

	/* Enable broadcasts */
	fctrl = IXGBE_READ_REG(hw, IXGBE_FCTRL);
	fctrl |= IXGBE_FCTRL_BAM;
	fctrl |= IXGBE_FCTRL_DPF;
	fctrl |= IXGBE_FCTRL_PMCF;
	IXGBE_WRITE_REG(hw, IXGBE_FCTRL, fctrl);

	/* Set for Jumbo Frames? */
	hlreg = IXGBE_READ_REG(hw, IXGBE_HLREG0);
	if (ifp->if_mtu > ETHERMTU)
		hlreg |= IXGBE_HLREG0_JUMBOEN;
	else
		hlreg &= ~IXGBE_HLREG0_JUMBOEN;
#ifdef DEV_NETMAP
	/* crcstrip is conditional in netmap (in RDRXCTL too ?)
*/ if (ifp->if_capenable & IFCAP_NETMAP && !ix_crcstrip) hlreg &= ~IXGBE_HLREG0_RXCRCSTRP; else hlreg |= IXGBE_HLREG0_RXCRCSTRP; #endif /* DEV_NETMAP */ IXGBE_WRITE_REG(hw, IXGBE_HLREG0, hlreg); bufsz = (adapter->rx_mbuf_sz + BSIZEPKT_ROUNDUP) >> IXGBE_SRRCTL_BSIZEPKT_SHIFT; for (int i = 0; i < adapter->num_queues; i++, rxr++) { u64 rdba = rxr->rxdma.dma_paddr; /* Setup the Base and Length of the Rx Descriptor Ring */ IXGBE_WRITE_REG(hw, IXGBE_RDBAL(i), (rdba & 0x00000000ffffffffULL)); IXGBE_WRITE_REG(hw, IXGBE_RDBAH(i), (rdba >> 32)); IXGBE_WRITE_REG(hw, IXGBE_RDLEN(i), adapter->num_rx_desc * sizeof(union ixgbe_adv_rx_desc)); /* Set up the SRRCTL register */ srrctl = IXGBE_READ_REG(hw, IXGBE_SRRCTL(i)); srrctl &= ~IXGBE_SRRCTL_BSIZEHDR_MASK; srrctl &= ~IXGBE_SRRCTL_BSIZEPKT_MASK; srrctl |= bufsz; srrctl |= IXGBE_SRRCTL_DESCTYPE_ADV_ONEBUF; IXGBE_WRITE_REG(hw, IXGBE_SRRCTL(i), srrctl); /* Setup the HW Rx Head and Tail Descriptor Pointers */ IXGBE_WRITE_REG(hw, IXGBE_RDH(i), 0); IXGBE_WRITE_REG(hw, IXGBE_RDT(i), 0); /* Set the processing limit */ rxr->process_limit = ixgbe_rx_process_limit; } if (adapter->hw.mac.type != ixgbe_mac_82598EB) { u32 psrtype = IXGBE_PSRTYPE_TCPHDR | IXGBE_PSRTYPE_UDPHDR | IXGBE_PSRTYPE_IPV4HDR | IXGBE_PSRTYPE_IPV6HDR; IXGBE_WRITE_REG(hw, IXGBE_PSRTYPE(0), psrtype); } rxcsum = IXGBE_READ_REG(hw, IXGBE_RXCSUM); /* Setup RSS */ if (adapter->num_queues > 1) { int i, j; reta = 0; /* set up random bits */ arc4rand(&random, sizeof(random), 0); /* Set up the redirection table */ for (i = 0, j = 0; i < 128; i++, j++) { if (j == adapter->num_queues) j = 0; reta = (reta << 8) | (j * 0x11); if ((i & 3) == 3) IXGBE_WRITE_REG(hw, IXGBE_RETA(i >> 2), reta); } /* Now fill our hash function seeds */ for (int i = 0; i < 10; i++) IXGBE_WRITE_REG(hw, IXGBE_RSSRK(i), random[i]); /* Perform hash on these packet types */ mrqc = IXGBE_MRQC_RSSEN | IXGBE_MRQC_RSS_FIELD_IPV4 | IXGBE_MRQC_RSS_FIELD_IPV4_TCP | IXGBE_MRQC_RSS_FIELD_IPV4_UDP | IXGBE_MRQC_RSS_FIELD_IPV6_EX_TCP | IXGBE_MRQC_RSS_FIELD_IPV6_EX | IXGBE_MRQC_RSS_FIELD_IPV6 | IXGBE_MRQC_RSS_FIELD_IPV6_TCP | IXGBE_MRQC_RSS_FIELD_IPV6_UDP | IXGBE_MRQC_RSS_FIELD_IPV6_EX_UDP; IXGBE_WRITE_REG(hw, IXGBE_MRQC, mrqc); /* RSS and RX IPP Checksum are mutually exclusive */ rxcsum |= IXGBE_RXCSUM_PCSD; } if (ifp->if_capenable & IFCAP_RXCSUM) rxcsum |= IXGBE_RXCSUM_PCSD; if (!(rxcsum & IXGBE_RXCSUM_PCSD)) rxcsum |= IXGBE_RXCSUM_IPPCSE; IXGBE_WRITE_REG(hw, IXGBE_RXCSUM, rxcsum); return; } /********************************************************************* * * Free all receive rings. 
* **********************************************************************/ static void ixgbe_free_receive_structures(struct adapter *adapter) { struct rx_ring *rxr = adapter->rx_rings; INIT_DEBUGOUT("ixgbe_free_receive_structures: begin"); for (int i = 0; i < adapter->num_queues; i++, rxr++) { struct lro_ctrl *lro = &rxr->lro; ixgbe_free_receive_buffers(rxr); /* Free LRO memory */ tcp_lro_free(lro); /* Free the ring memory as well */ ixgbe_dma_free(adapter, &rxr->rxdma); } free(adapter->rx_rings, M_DEVBUF); } /********************************************************************* * * Free receive ring data structures * **********************************************************************/ static void ixgbe_free_receive_buffers(struct rx_ring *rxr) { struct adapter *adapter = rxr->adapter; struct ixgbe_rx_buf *rxbuf; INIT_DEBUGOUT("ixgbe_free_receive_buffers: begin"); /* Cleanup any existing buffers */ if (rxr->rx_buffers != NULL) { for (int i = 0; i < adapter->num_rx_desc; i++) { rxbuf = &rxr->rx_buffers[i]; if (rxbuf->buf != NULL) { bus_dmamap_sync(rxr->ptag, rxbuf->pmap, BUS_DMASYNC_POSTREAD); bus_dmamap_unload(rxr->ptag, rxbuf->pmap); rxbuf->buf->m_flags |= M_PKTHDR; m_freem(rxbuf->buf); } rxbuf->buf = NULL; if (rxbuf->pmap != NULL) { bus_dmamap_destroy(rxr->ptag, rxbuf->pmap); rxbuf->pmap = NULL; } } if (rxr->rx_buffers != NULL) { free(rxr->rx_buffers, M_DEVBUF); rxr->rx_buffers = NULL; } } if (rxr->ptag != NULL) { bus_dma_tag_destroy(rxr->ptag); rxr->ptag = NULL; } return; } static __inline void ixgbe_rx_input(struct rx_ring *rxr, struct ifnet *ifp, struct mbuf *m, u32 ptype) { /* * ATM LRO is only for IP/TCP packets and TCP checksum of the packet * should be computed by hardware. Also it should not have VLAN tag in * ethernet header. In case of IPv6 we do not yet support ext. hdrs. */ if (rxr->lro_enabled && (ifp->if_capenable & IFCAP_VLAN_HWTAGGING) != 0 && (ptype & IXGBE_RXDADV_PKTTYPE_ETQF) == 0 && ((ptype & (IXGBE_RXDADV_PKTTYPE_IPV4 | IXGBE_RXDADV_PKTTYPE_TCP)) == (IXGBE_RXDADV_PKTTYPE_IPV4 | IXGBE_RXDADV_PKTTYPE_TCP) || (ptype & (IXGBE_RXDADV_PKTTYPE_IPV6 | IXGBE_RXDADV_PKTTYPE_TCP)) == (IXGBE_RXDADV_PKTTYPE_IPV6 | IXGBE_RXDADV_PKTTYPE_TCP)) && (m->m_pkthdr.csum_flags & (CSUM_DATA_VALID | CSUM_PSEUDO_HDR)) == (CSUM_DATA_VALID | CSUM_PSEUDO_HDR)) { /* * Send to the stack if: ** - LRO not enabled, or ** - no LRO resources, or ** - lro enqueue fails */ if (rxr->lro.lro_cnt != 0) if (tcp_lro_rx(&rxr->lro, m, 0) == 0) return; } IXGBE_RX_UNLOCK(rxr); (*ifp->if_input)(ifp, m); IXGBE_RX_LOCK(rxr); } static __inline void ixgbe_rx_discard(struct rx_ring *rxr, int i) { struct ixgbe_rx_buf *rbuf; rbuf = &rxr->rx_buffers[i]; if (rbuf->fmp != NULL) {/* Partial chain ? */ rbuf->fmp->m_flags |= M_PKTHDR; m_freem(rbuf->fmp); rbuf->fmp = NULL; } /* ** With advanced descriptors the writeback ** clobbers the buffer addrs, so its easier ** to just free the existing mbufs and take ** the normal refresh path to get new buffers ** and mapping. */ if (rbuf->buf) { m_free(rbuf->buf); rbuf->buf = NULL; } rbuf->flags = 0; return; } /********************************************************************* * * This routine executes in interrupt context. It replenishes * the mbufs in the descriptor and sends data which has been * dma'ed into host memory to upper layer. * * We loop at most count times if count is > 0, or until done if * count < 0. * * Return TRUE for more work, FALSE for all clean. 
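 *
 *  (count is loaded from rxr->process_limit and is unsigned here,
 *  so in practice each call cleans at most process_limit
 *  descriptors.)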
 *********************************************************************/
static bool
ixgbe_rxeof(struct ix_queue *que)
{
	struct adapter		*adapter = que->adapter;
	struct rx_ring		*rxr = que->rxr;
	struct ifnet		*ifp = adapter->ifp;
	struct lro_ctrl		*lro = &rxr->lro;
	struct lro_entry	*queued;
	int			i, nextp, processed = 0;
	u32			staterr = 0;
	u16			count = rxr->process_limit;
	union ixgbe_adv_rx_desc	*cur;
	struct ixgbe_rx_buf	*rbuf, *nbuf;

	IXGBE_RX_LOCK(rxr);

#ifdef DEV_NETMAP
	/* Same as the txeof routine: wakeup clients on intr. */
	if (netmap_rx_irq(ifp, rxr->me, &processed)) {
		IXGBE_RX_UNLOCK(rxr);
		return (FALSE);
	}
#endif /* DEV_NETMAP */

	for (i = rxr->next_to_check; count != 0;) {
		struct mbuf	*sendmp, *mp;
		u32		rsc, ptype;
		u16		len;
		u16		vtag = 0;
		bool		eop;

		/* Sync the ring. */
		bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map,
		    BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);

		cur = &rxr->rx_base[i];
		staterr = le32toh(cur->wb.upper.status_error);

		if ((staterr & IXGBE_RXD_STAT_DD) == 0)
			break;
		if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
			break;

		count--;
		sendmp = NULL;
		nbuf = NULL;
		rsc = 0;
		cur->wb.upper.status_error = 0;
		rbuf = &rxr->rx_buffers[i];
		mp = rbuf->buf;

		len = le16toh(cur->wb.upper.length);
		ptype = le32toh(cur->wb.lower.lo_dword.data) &
		    IXGBE_RXDADV_PKTTYPE_MASK;
		eop = ((staterr & IXGBE_RXD_STAT_EOP) != 0);

		/* Make sure bad packets are discarded */
		if (((staterr & IXGBE_RXDADV_ERR_FRAME_ERR_MASK) != 0) ||
		    (rxr->discard)) {
			rxr->rx_discarded++;
			if (eop)
				rxr->discard = FALSE;
			else
				rxr->discard = TRUE;
			ixgbe_rx_discard(rxr, i);
			goto next_desc;
		}

		/*
		** On 82599 which supports a hardware
		** LRO (called HW RSC), packets need
		** not be fragmented across sequential
		** descriptors, rather the next descriptor
		** is indicated in bits of the descriptor.
		** This also means that we might process
		** more than one packet at a time, something
		** that has never been true before; it
		** required eliminating global chain pointers
		** in favor of what we are doing here. -jfv
		*/
		if (!eop) {
			/*
			** Figure out the next descriptor
			** of this frame.
			*/
			if (rxr->hw_rsc == TRUE) {
				rsc = ixgbe_rsc_count(cur);
				rxr->rsc_num += (rsc - 1);
			}
			if (rsc) { /* Get hardware index */
				nextp = ((staterr &
				    IXGBE_RXDADV_NEXTP_MASK) >>
				    IXGBE_RXDADV_NEXTP_SHIFT);
			} else { /* Just sequential */
				nextp = i + 1;
				if (nextp == adapter->num_rx_desc)
					nextp = 0;
			}
			nbuf = &rxr->rx_buffers[nextp];
			prefetch(nbuf);
		}
		/*
		** Rather than using the fmp/lmp global pointers
		** we now keep the head of a packet chain in the
		** buffer struct and pass this along from one
		** descriptor to the next, until we get EOP.
		*/
		mp->m_len = len;
		/*
		** See if there is a stored head
		** that determines what we are
		*/
		sendmp = rbuf->fmp;
		if (sendmp != NULL) {  /* secondary frag */
			rbuf->buf = rbuf->fmp = NULL;
			mp->m_flags &= ~M_PKTHDR;
			sendmp->m_pkthdr.len += mp->m_len;
		} else {
			/*
			 * Optimize.  This might be a small packet,
			 * maybe just a TCP ACK.  Do a fast copy that
			 * is cache aligned into a new mbuf, and
			 * leave the old mbuf+cluster for re-use.
*/ if (eop && len <= IXGBE_RX_COPY_LEN) { sendmp = m_gethdr(M_NOWAIT, MT_DATA); if (sendmp != NULL) { sendmp->m_data += IXGBE_RX_COPY_ALIGN; ixgbe_bcopy(mp->m_data, sendmp->m_data, len); sendmp->m_len = len; rxr->rx_copies++; rbuf->flags |= IXGBE_RX_COPY; } } if (sendmp == NULL) { rbuf->buf = rbuf->fmp = NULL; sendmp = mp; } /* first desc of a non-ps chain */ sendmp->m_flags |= M_PKTHDR; sendmp->m_pkthdr.len = mp->m_len; } ++processed; /* Pass the head pointer on */ if (eop == 0) { nbuf->fmp = sendmp; sendmp = NULL; mp->m_next = nbuf->buf; } else { /* Sending this frame */ sendmp->m_pkthdr.rcvif = ifp; ifp->if_ipackets++; rxr->rx_packets++; /* capture data for AIM */ rxr->bytes += sendmp->m_pkthdr.len; rxr->rx_bytes += sendmp->m_pkthdr.len; /* Process vlan info */ if ((rxr->vtag_strip) && (staterr & IXGBE_RXD_STAT_VP)) vtag = le16toh(cur->wb.upper.vlan); if (vtag) { sendmp->m_pkthdr.ether_vtag = vtag; sendmp->m_flags |= M_VLANTAG; } if ((ifp->if_capenable & IFCAP_RXCSUM) != 0) ixgbe_rx_checksum(staterr, sendmp, ptype); #if __FreeBSD_version >= 800000 sendmp->m_pkthdr.flowid = que->msix; sendmp->m_flags |= M_FLOWID; #endif } next_desc: bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); /* Advance our pointers to the next descriptor. */ if (++i == rxr->num_desc) i = 0; /* Now send to the stack or do LRO */ if (sendmp != NULL) { rxr->next_to_check = i; ixgbe_rx_input(rxr, ifp, sendmp, ptype); i = rxr->next_to_check; } /* Every 8 descriptors we go to refresh mbufs */ if (processed == 8) { ixgbe_refresh_mbufs(rxr, i); processed = 0; } } /* Refresh any remaining buf structs */ if (ixgbe_rx_unrefreshed(rxr)) ixgbe_refresh_mbufs(rxr, i); rxr->next_to_check = i; /* * Flush any outstanding LRO work */ while ((queued = SLIST_FIRST(&lro->lro_active)) != NULL) { SLIST_REMOVE_HEAD(&lro->lro_active, next); tcp_lro_flush(lro, queued); } IXGBE_RX_UNLOCK(rxr); /* ** Still have cleaning to do? */ if ((staterr & IXGBE_RXD_STAT_DD) != 0) return (TRUE); else return (FALSE); } /********************************************************************* * * Verify that the hardware indicated that the checksum is valid. * Inform the stack about the status of checksum so that the stack * doesn't spend time verifying the checksum. * *********************************************************************/ static void ixgbe_rx_checksum(u32 staterr, struct mbuf * mp, u32 ptype) { u16 status = (u16) staterr; u8 errors = (u8) (staterr >> 24); bool sctp = FALSE; if ((ptype & IXGBE_RXDADV_PKTTYPE_ETQF) == 0 && (ptype & IXGBE_RXDADV_PKTTYPE_SCTP) != 0) sctp = TRUE; if (status & IXGBE_RXD_STAT_IPCS) { if (!(errors & IXGBE_RXD_ERR_IPE)) { /* IP Checksum Good */ mp->m_pkthdr.csum_flags = CSUM_IP_CHECKED; mp->m_pkthdr.csum_flags |= CSUM_IP_VALID; } else mp->m_pkthdr.csum_flags = 0; } if (status & IXGBE_RXD_STAT_L4CS) { u16 type = (CSUM_DATA_VALID | CSUM_PSEUDO_HDR); #if __FreeBSD_version >= 800000 if (sctp) type = CSUM_SCTP_VALID; #endif if (!(errors & IXGBE_RXD_ERR_TCPE)) { mp->m_pkthdr.csum_flags |= type; if (!sctp) mp->m_pkthdr.csum_data = htons(0xffff); } } return; } /* ** This routine is run via a vlan config EVENT, ** it enables us to use the HW Filter table since ** we can get the vlan id. This just creates the ** entry in the soft version of the VFTA, init will ** repopulate the real table.
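** For example, vlan id 100 lands in shadow_vfta[3] bit 4 ** (index = 100 >> 5 = 3, bit = 100 & 0x1F = 4).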
*/ static void ixgbe_register_vlan(void *arg, struct ifnet *ifp, u16 vtag) { struct adapter *adapter = ifp->if_softc; u16 index, bit; if (ifp->if_softc != arg) /* Not our event */ return; if ((vtag == 0) || (vtag > 4095)) /* Invalid */ return; IXGBE_CORE_LOCK(adapter); index = (vtag >> 5) & 0x7F; bit = vtag & 0x1F; adapter->shadow_vfta[index] |= (1 << bit); ++adapter->num_vlans; ixgbe_init_locked(adapter); IXGBE_CORE_UNLOCK(adapter); } /* ** This routine is run via a vlan ** unconfig EVENT, remove our entry ** in the soft vfta. */ static void ixgbe_unregister_vlan(void *arg, struct ifnet *ifp, u16 vtag) { struct adapter *adapter = ifp->if_softc; u16 index, bit; if (ifp->if_softc != arg) return; if ((vtag == 0) || (vtag > 4095)) /* Invalid */ return; IXGBE_CORE_LOCK(adapter); index = (vtag >> 5) & 0x7F; bit = vtag & 0x1F; adapter->shadow_vfta[index] &= ~(1 << bit); --adapter->num_vlans; /* Re-init to load the changes */ ixgbe_init_locked(adapter); IXGBE_CORE_UNLOCK(adapter); } static void ixgbe_setup_vlan_hw_support(struct adapter *adapter) { struct ifnet *ifp = adapter->ifp; struct ixgbe_hw *hw = &adapter->hw; struct rx_ring *rxr; u32 ctrl; /* ** We get here thru init_locked, meaning ** a soft reset; this has already cleared ** the VFTA and other state, so if there ** have been no vlans registered do nothing. */ if (adapter->num_vlans == 0) return; /* ** A soft reset zeroes out the VFTA, so ** we need to repopulate it now. */ for (int i = 0; i < IXGBE_VFTA_SIZE; i++) if (adapter->shadow_vfta[i] != 0) IXGBE_WRITE_REG(hw, IXGBE_VFTA(i), adapter->shadow_vfta[i]); ctrl = IXGBE_READ_REG(hw, IXGBE_VLNCTRL); /* Enable the Filter Table if enabled */ if (ifp->if_capenable & IFCAP_VLAN_HWFILTER) { ctrl &= ~IXGBE_VLNCTRL_CFIEN; ctrl |= IXGBE_VLNCTRL_VFE; } if (hw->mac.type == ixgbe_mac_82598EB) ctrl |= IXGBE_VLNCTRL_VME; IXGBE_WRITE_REG(hw, IXGBE_VLNCTRL, ctrl); /* Setup the queues for vlans */ for (int i = 0; i < adapter->num_queues; i++) { rxr = &adapter->rx_rings[i]; /* On 82599 the VLAN enable is per/queue in RXDCTL */ if (hw->mac.type != ixgbe_mac_82598EB) { ctrl = IXGBE_READ_REG(hw, IXGBE_RXDCTL(i)); ctrl |= IXGBE_RXDCTL_VME; IXGBE_WRITE_REG(hw, IXGBE_RXDCTL(i), ctrl); } rxr->vtag_strip = TRUE; } } static void ixgbe_enable_intr(struct adapter *adapter) { struct ixgbe_hw *hw = &adapter->hw; struct ix_queue *que = adapter->queues; u32 mask, fwsm; mask = (IXGBE_EIMS_ENABLE_MASK & ~IXGBE_EIMS_RTX_QUEUE); /* Enable Fan Failure detection */ if (hw->device_id == IXGBE_DEV_ID_82598AT) mask |= IXGBE_EIMS_GPI_SDP1; switch (adapter->hw.mac.type) { case ixgbe_mac_82599EB: mask |= IXGBE_EIMS_ECC; mask |= IXGBE_EIMS_GPI_SDP0; mask |= IXGBE_EIMS_GPI_SDP1; mask |= IXGBE_EIMS_GPI_SDP2; #ifdef IXGBE_FDIR mask |= IXGBE_EIMS_FLOW_DIR; #endif break; case ixgbe_mac_X540: mask |= IXGBE_EIMS_ECC; /* Detect if Thermal Sensor is enabled */ fwsm = IXGBE_READ_REG(hw, IXGBE_FWSM); if (fwsm & IXGBE_FWSM_TS_ENABLED) mask |= IXGBE_EIMS_TS; #ifdef IXGBE_FDIR mask |= IXGBE_EIMS_FLOW_DIR; #endif /* falls through */ default: break; } IXGBE_WRITE_REG(hw, IXGBE_EIMS, mask); /* With RSS we use auto clear */ if (adapter->msix_mem) { mask = IXGBE_EIMS_ENABLE_MASK; /* Don't autoclear Link */ mask &= ~IXGBE_EIMS_OTHER; mask &= ~IXGBE_EIMS_LSC; IXGBE_WRITE_REG(hw, IXGBE_EIAC, mask); } /* ** Now enable all queues, this is done separately to ** allow for handling the extended (beyond 32) MSIX ** vectors that can be used by 82599 */ for (int i = 0; i < adapter->num_queues; i++, que++) ixgbe_enable_queue(adapter, que->msix);
IXGBE_WRITE_FLUSH(hw); return; } static void ixgbe_disable_intr(struct adapter *adapter) { if (adapter->msix_mem) IXGBE_WRITE_REG(&adapter->hw, IXGBE_EIAC, 0); if (adapter->hw.mac.type == ixgbe_mac_82598EB) { IXGBE_WRITE_REG(&adapter->hw, IXGBE_EIMC, ~0); } else { IXGBE_WRITE_REG(&adapter->hw, IXGBE_EIMC, 0xFFFF0000); IXGBE_WRITE_REG(&adapter->hw, IXGBE_EIMC_EX(0), ~0); IXGBE_WRITE_REG(&adapter->hw, IXGBE_EIMC_EX(1), ~0); } IXGBE_WRITE_FLUSH(&adapter->hw); return; } u16 ixgbe_read_pci_cfg(struct ixgbe_hw *hw, u32 reg) { u16 value; value = pci_read_config(((struct ixgbe_osdep *)hw->back)->dev, reg, 2); return (value); } void ixgbe_write_pci_cfg(struct ixgbe_hw *hw, u32 reg, u16 value) { pci_write_config(((struct ixgbe_osdep *)hw->back)->dev, reg, value, 2); return; } /* ** Get the width and transaction speed of ** the slot this adapter is plugged into. */ static void ixgbe_get_slot_info(struct ixgbe_hw *hw) { device_t dev = ((struct ixgbe_osdep *)hw->back)->dev; struct ixgbe_mac_info *mac = &hw->mac; u16 link; u32 offset; /* For most devices simply call the shared code routine */ if (hw->device_id != IXGBE_DEV_ID_82599_SFP_SF_QP) { ixgbe_get_bus_info(hw); goto display; } /* ** For the Quad port adapter we need to parse back ** up the PCI tree to find the speed of the expansion ** slot into which this adapter is plugged. A bit more work. */ dev = device_get_parent(device_get_parent(dev)); #ifdef IXGBE_DEBUG device_printf(dev, "parent pcib = %x,%x,%x\n", pci_get_bus(dev), pci_get_slot(dev), pci_get_function(dev)); #endif dev = device_get_parent(device_get_parent(dev)); #ifdef IXGBE_DEBUG device_printf(dev, "slot pcib = %x,%x,%x\n", pci_get_bus(dev), pci_get_slot(dev), pci_get_function(dev)); #endif /* Now get the PCI Express Capabilities offset */ pci_find_cap(dev, PCIY_EXPRESS, &offset); /* ...and read the Link Status Register */ link = pci_read_config(dev, offset + PCIER_LINK_STA, 2); switch (link & IXGBE_PCI_LINK_WIDTH) { case IXGBE_PCI_LINK_WIDTH_1: hw->bus.width = ixgbe_bus_width_pcie_x1; break; case IXGBE_PCI_LINK_WIDTH_2: hw->bus.width = ixgbe_bus_width_pcie_x2; break; case IXGBE_PCI_LINK_WIDTH_4: hw->bus.width = ixgbe_bus_width_pcie_x4; break; case IXGBE_PCI_LINK_WIDTH_8: hw->bus.width = ixgbe_bus_width_pcie_x8; break; default: hw->bus.width = ixgbe_bus_width_unknown; break; } switch (link & IXGBE_PCI_LINK_SPEED) { case IXGBE_PCI_LINK_SPEED_2500: hw->bus.speed = ixgbe_bus_speed_2500; break; case IXGBE_PCI_LINK_SPEED_5000: hw->bus.speed = ixgbe_bus_speed_5000; break; case IXGBE_PCI_LINK_SPEED_8000: hw->bus.speed = ixgbe_bus_speed_8000; break; default: hw->bus.speed = ixgbe_bus_speed_unknown; break; } mac->ops.set_lan_id(hw); display: device_printf(dev,"PCI Express Bus: Speed %s %s\n", ((hw->bus.speed == ixgbe_bus_speed_8000) ? "8.0GT/s": (hw->bus.speed == ixgbe_bus_speed_5000) ? "5.0GT/s": (hw->bus.speed == ixgbe_bus_speed_2500) ? "2.5GT/s":"Unknown"), (hw->bus.width == ixgbe_bus_width_pcie_x8) ? "Width x8" : (hw->bus.width == ixgbe_bus_width_pcie_x4) ? "Width x4" : (hw->bus.width == ixgbe_bus_width_pcie_x1) ? 
"Width x1" : ("Unknown")); if ((hw->device_id != IXGBE_DEV_ID_82599_SFP_SF_QP) && ((hw->bus.width <= ixgbe_bus_width_pcie_x4) && (hw->bus.speed == ixgbe_bus_speed_2500))) { device_printf(dev, "PCI-Express bandwidth available" " for this card\n is not sufficient for" " optimal performance.\n"); device_printf(dev, "For optimal performance a x8 " "PCIE, or x4 PCIE Gen2 slot is required.\n"); } if ((hw->device_id == IXGBE_DEV_ID_82599_SFP_SF_QP) && ((hw->bus.width <= ixgbe_bus_width_pcie_x8) && (hw->bus.speed < ixgbe_bus_speed_8000))) { device_printf(dev, "PCI-Express bandwidth available" " for this card\n is not sufficient for" " optimal performance.\n"); device_printf(dev, "For optimal performance a x8 " "PCIE Gen3 slot is required.\n"); } return; } /* ** Setup the correct IVAR register for a particular MSIX interrupt ** (yes this is all very magic and confusing :) ** - entry is the register array entry ** - vector is the MSIX vector for this queue ** - type is RX/TX/MISC */ static void ixgbe_set_ivar(struct adapter *adapter, u8 entry, u8 vector, s8 type) { struct ixgbe_hw *hw = &adapter->hw; u32 ivar, index; vector |= IXGBE_IVAR_ALLOC_VAL; switch (hw->mac.type) { case ixgbe_mac_82598EB: if (type == -1) entry = IXGBE_IVAR_OTHER_CAUSES_INDEX; else entry += (type * 64); index = (entry >> 2) & 0x1F; ivar = IXGBE_READ_REG(hw, IXGBE_IVAR(index)); ivar &= ~(0xFF << (8 * (entry & 0x3))); ivar |= (vector << (8 * (entry & 0x3))); IXGBE_WRITE_REG(&adapter->hw, IXGBE_IVAR(index), ivar); break; case ixgbe_mac_82599EB: case ixgbe_mac_X540: if (type == -1) { /* MISC IVAR */ index = (entry & 1) * 8; ivar = IXGBE_READ_REG(hw, IXGBE_IVAR_MISC); ivar &= ~(0xFF << index); ivar |= (vector << index); IXGBE_WRITE_REG(hw, IXGBE_IVAR_MISC, ivar); } else { /* RX/TX IVARS */ index = (16 * (entry & 1)) + (8 * type); ivar = IXGBE_READ_REG(hw, IXGBE_IVAR(entry >> 1)); ivar &= ~(0xFF << index); ivar |= (vector << index); IXGBE_WRITE_REG(hw, IXGBE_IVAR(entry >> 1), ivar); } default: break; } } static void ixgbe_configure_ivars(struct adapter *adapter) { struct ix_queue *que = adapter->queues; u32 newitr; if (ixgbe_max_interrupt_rate > 0) newitr = (4000000 / ixgbe_max_interrupt_rate) & 0x0FF8; else newitr = 0; for (int i = 0; i < adapter->num_queues; i++, que++) { /* First the RX queue entry */ ixgbe_set_ivar(adapter, i, que->msix, 0); /* ... and the TX */ ixgbe_set_ivar(adapter, i, que->msix, 1); /* Set an Initial EITR value */ IXGBE_WRITE_REG(&adapter->hw, IXGBE_EITR(que->msix), newitr); } /* For the Link interrupt */ ixgbe_set_ivar(adapter, 1, adapter->linkvec, -1); } /* ** ixgbe_sfp_probe - called in the local timer to ** determine if a port had optics inserted. 
*/ static bool ixgbe_sfp_probe(struct adapter *adapter) { struct ixgbe_hw *hw = &adapter->hw; device_t dev = adapter->dev; bool result = FALSE; if ((hw->phy.type == ixgbe_phy_nl) && (hw->phy.sfp_type == ixgbe_sfp_type_not_present)) { s32 ret = hw->phy.ops.identify_sfp(hw); if (ret) goto out; ret = hw->phy.ops.reset(hw); if (ret == IXGBE_ERR_SFP_NOT_SUPPORTED) { device_printf(dev,"Unsupported SFP+ module detected!"); printf(" Reload driver with supported module.\n"); adapter->sfp_probe = FALSE; goto out; } else device_printf(dev,"SFP+ module detected!\n"); /* We now have supported optics */ adapter->sfp_probe = FALSE; /* Set the optics type so system reports correctly */ ixgbe_setup_optics(adapter); result = TRUE; } out: return (result); } /* ** Tasklet handler for MSIX Link interrupts ** - do outside interrupt since it might sleep */ static void ixgbe_handle_link(void *context, int pending) { struct adapter *adapter = context; ixgbe_check_link(&adapter->hw, &adapter->link_speed, &adapter->link_up, 0); ixgbe_update_link_status(adapter); } /* ** Tasklet for handling SFP module interrupts */ static void ixgbe_handle_mod(void *context, int pending) { struct adapter *adapter = context; struct ixgbe_hw *hw = &adapter->hw; device_t dev = adapter->dev; u32 err; err = hw->phy.ops.identify_sfp(hw); if (err == IXGBE_ERR_SFP_NOT_SUPPORTED) { device_printf(dev, "Unsupported SFP+ module type was detected.\n"); return; } err = hw->mac.ops.setup_sfp(hw); if (err == IXGBE_ERR_SFP_NOT_SUPPORTED) { device_printf(dev, "Setup failure - unsupported SFP+ module type.\n"); return; } taskqueue_enqueue(adapter->tq, &adapter->msf_task); return; } /* ** Tasklet for handling MSF (multispeed fiber) interrupts */ static void ixgbe_handle_msf(void *context, int pending) { struct adapter *adapter = context; struct ixgbe_hw *hw = &adapter->hw; u32 autoneg; bool negotiate; autoneg = hw->phy.autoneg_advertised; if ((!autoneg) && (hw->mac.ops.get_link_capabilities)) hw->mac.ops.get_link_capabilities(hw, &autoneg, &negotiate); if (hw->mac.ops.setup_link) hw->mac.ops.setup_link(hw, autoneg, TRUE); return; } #ifdef IXGBE_FDIR /* ** Tasklet for reinitializing the Flow Director filter table */ static void ixgbe_reinit_fdir(void *context, int pending) { struct adapter *adapter = context; struct ifnet *ifp = adapter->ifp; if (adapter->fdir_reinit != 1) /* Shouldn't happen */ return; ixgbe_reinit_fdir_tables_82599(&adapter->hw); adapter->fdir_reinit = 0; /* re-enable flow director interrupts */ IXGBE_WRITE_REG(&adapter->hw, IXGBE_EIMS, IXGBE_EIMS_FLOW_DIR); /* Restart the interface */ ifp->if_drv_flags |= IFF_DRV_RUNNING; return; } #endif /********************************************************************** * * Update the board statistics counters. * **********************************************************************/ static void ixgbe_update_stats_counters(struct adapter *adapter) { struct ifnet *ifp = adapter->ifp; struct ixgbe_hw *hw = &adapter->hw; u32 missed_rx = 0, bprc, lxon, lxoff, total; u64 total_missed_rx = 0; adapter->stats.crcerrs += IXGBE_READ_REG(hw, IXGBE_CRCERRS); adapter->stats.illerrc += IXGBE_READ_REG(hw, IXGBE_ILLERRC); adapter->stats.errbc += IXGBE_READ_REG(hw, IXGBE_ERRBC); adapter->stats.mspdc += IXGBE_READ_REG(hw, IXGBE_MSPDC); /* ** Note: these are for the 8 possible traffic classes, ** which in the current implementation are unused, ** therefore only 0 should read real data.
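** All eight register sets are still read, so the running ** totals stay consistent should more classes be enabled later.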
*/ for (int i = 0; i < 8; i++) { u32 mp; mp = IXGBE_READ_REG(hw, IXGBE_MPC(i)); /* missed_rx tallies misses for the gprc workaround */ missed_rx += mp; /* global total per queue */ adapter->stats.mpc[i] += mp; /* Running comprehensive total for stats display */ total_missed_rx += adapter->stats.mpc[i]; if (hw->mac.type == ixgbe_mac_82598EB) { adapter->stats.rnbc[i] += IXGBE_READ_REG(hw, IXGBE_RNBC(i)); adapter->stats.qbtc[i] += IXGBE_READ_REG(hw, IXGBE_QBTC(i)); adapter->stats.qbrc[i] += IXGBE_READ_REG(hw, IXGBE_QBRC(i)); adapter->stats.pxonrxc[i] += IXGBE_READ_REG(hw, IXGBE_PXONRXC(i)); } else adapter->stats.pxonrxc[i] += IXGBE_READ_REG(hw, IXGBE_PXONRXCNT(i)); adapter->stats.pxontxc[i] += IXGBE_READ_REG(hw, IXGBE_PXONTXC(i)); adapter->stats.pxofftxc[i] += IXGBE_READ_REG(hw, IXGBE_PXOFFTXC(i)); adapter->stats.pxoffrxc[i] += IXGBE_READ_REG(hw, IXGBE_PXOFFRXC(i)); adapter->stats.pxon2offc[i] += IXGBE_READ_REG(hw, IXGBE_PXON2OFFCNT(i)); } for (int i = 0; i < 16; i++) { adapter->stats.qprc[i] += IXGBE_READ_REG(hw, IXGBE_QPRC(i)); adapter->stats.qptc[i] += IXGBE_READ_REG(hw, IXGBE_QPTC(i)); adapter->stats.qprdc[i] += IXGBE_READ_REG(hw, IXGBE_QPRDC(i)); } adapter->stats.mlfc += IXGBE_READ_REG(hw, IXGBE_MLFC); adapter->stats.mrfc += IXGBE_READ_REG(hw, IXGBE_MRFC); adapter->stats.rlec += IXGBE_READ_REG(hw, IXGBE_RLEC); /* Hardware workaround, gprc counts missed packets */ adapter->stats.gprc += IXGBE_READ_REG(hw, IXGBE_GPRC); adapter->stats.gprc -= missed_rx; if (hw->mac.type != ixgbe_mac_82598EB) { adapter->stats.gorc += IXGBE_READ_REG(hw, IXGBE_GORCL) + ((u64)IXGBE_READ_REG(hw, IXGBE_GORCH) << 32); adapter->stats.gotc += IXGBE_READ_REG(hw, IXGBE_GOTCL) + ((u64)IXGBE_READ_REG(hw, IXGBE_GOTCH) << 32); adapter->stats.tor += IXGBE_READ_REG(hw, IXGBE_TORL) + ((u64)IXGBE_READ_REG(hw, IXGBE_TORH) << 32); adapter->stats.lxonrxc += IXGBE_READ_REG(hw, IXGBE_LXONRXCNT); adapter->stats.lxoffrxc += IXGBE_READ_REG(hw, IXGBE_LXOFFRXCNT); } else { adapter->stats.lxonrxc += IXGBE_READ_REG(hw, IXGBE_LXONRXC); adapter->stats.lxoffrxc += IXGBE_READ_REG(hw, IXGBE_LXOFFRXC); /* 82598 only has a counter in the high register */ adapter->stats.gorc += IXGBE_READ_REG(hw, IXGBE_GORCH); adapter->stats.gotc += IXGBE_READ_REG(hw, IXGBE_GOTCH); adapter->stats.tor += IXGBE_READ_REG(hw, IXGBE_TORH); } /* * Workaround: mprc hardware is incorrectly counting * broadcasts, so for now we subtract those. 
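* Only the 82598 needs this; see the mac.type check below.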
*/ bprc = IXGBE_READ_REG(hw, IXGBE_BPRC); adapter->stats.bprc += bprc; adapter->stats.mprc += IXGBE_READ_REG(hw, IXGBE_MPRC); if (hw->mac.type == ixgbe_mac_82598EB) adapter->stats.mprc -= bprc; adapter->stats.prc64 += IXGBE_READ_REG(hw, IXGBE_PRC64); adapter->stats.prc127 += IXGBE_READ_REG(hw, IXGBE_PRC127); adapter->stats.prc255 += IXGBE_READ_REG(hw, IXGBE_PRC255); adapter->stats.prc511 += IXGBE_READ_REG(hw, IXGBE_PRC511); adapter->stats.prc1023 += IXGBE_READ_REG(hw, IXGBE_PRC1023); adapter->stats.prc1522 += IXGBE_READ_REG(hw, IXGBE_PRC1522); lxon = IXGBE_READ_REG(hw, IXGBE_LXONTXC); adapter->stats.lxontxc += lxon; lxoff = IXGBE_READ_REG(hw, IXGBE_LXOFFTXC); adapter->stats.lxofftxc += lxoff; total = lxon + lxoff; adapter->stats.gptc += IXGBE_READ_REG(hw, IXGBE_GPTC); adapter->stats.mptc += IXGBE_READ_REG(hw, IXGBE_MPTC); adapter->stats.ptc64 += IXGBE_READ_REG(hw, IXGBE_PTC64); adapter->stats.gptc -= total; adapter->stats.mptc -= total; adapter->stats.ptc64 -= total; adapter->stats.gotc -= total * ETHER_MIN_LEN; adapter->stats.ruc += IXGBE_READ_REG(hw, IXGBE_RUC); adapter->stats.rfc += IXGBE_READ_REG(hw, IXGBE_RFC); adapter->stats.roc += IXGBE_READ_REG(hw, IXGBE_ROC); adapter->stats.rjc += IXGBE_READ_REG(hw, IXGBE_RJC); adapter->stats.mngprc += IXGBE_READ_REG(hw, IXGBE_MNGPRC); adapter->stats.mngpdc += IXGBE_READ_REG(hw, IXGBE_MNGPDC); adapter->stats.mngptc += IXGBE_READ_REG(hw, IXGBE_MNGPTC); adapter->stats.tpr += IXGBE_READ_REG(hw, IXGBE_TPR); adapter->stats.tpt += IXGBE_READ_REG(hw, IXGBE_TPT); adapter->stats.ptc127 += IXGBE_READ_REG(hw, IXGBE_PTC127); adapter->stats.ptc255 += IXGBE_READ_REG(hw, IXGBE_PTC255); adapter->stats.ptc511 += IXGBE_READ_REG(hw, IXGBE_PTC511); adapter->stats.ptc1023 += IXGBE_READ_REG(hw, IXGBE_PTC1023); adapter->stats.ptc1522 += IXGBE_READ_REG(hw, IXGBE_PTC1522); adapter->stats.bptc += IXGBE_READ_REG(hw, IXGBE_BPTC); adapter->stats.xec += IXGBE_READ_REG(hw, IXGBE_XEC); adapter->stats.fccrc += IXGBE_READ_REG(hw, IXGBE_FCCRC); adapter->stats.fclast += IXGBE_READ_REG(hw, IXGBE_FCLAST); /* Only read FCOE on 82599 */ if (hw->mac.type != ixgbe_mac_82598EB) { adapter->stats.fcoerpdc += IXGBE_READ_REG(hw, IXGBE_FCOERPDC); adapter->stats.fcoeprc += IXGBE_READ_REG(hw, IXGBE_FCOEPRC); adapter->stats.fcoeptc += IXGBE_READ_REG(hw, IXGBE_FCOEPTC); adapter->stats.fcoedwrc += IXGBE_READ_REG(hw, IXGBE_FCOEDWRC); adapter->stats.fcoedwtc += IXGBE_READ_REG(hw, IXGBE_FCOEDWTC); } /* Fill out the OS statistics structure */ ifp->if_ipackets = adapter->stats.gprc; ifp->if_opackets = adapter->stats.gptc; ifp->if_ibytes = adapter->stats.gorc; ifp->if_obytes = adapter->stats.gotc; ifp->if_imcasts = adapter->stats.mprc; ifp->if_omcasts = adapter->stats.mptc; ifp->if_collisions = 0; /* Rx Errors */ ifp->if_iqdrops = total_missed_rx; ifp->if_ierrors = adapter->stats.crcerrs + adapter->stats.rlec; } /** ixgbe_sysctl_tdh_handler - Handler function * Retrieves the TDH value from the hardware */ static int ixgbe_sysctl_tdh_handler(SYSCTL_HANDLER_ARGS) { int error; struct tx_ring *txr = ((struct tx_ring *)oidp->oid_arg1); if (!txr) return 0; unsigned val = IXGBE_READ_REG(&txr->adapter->hw, IXGBE_TDH(txr->me)); error = sysctl_handle_int(oidp, &val, 0, req); if (error || !req->newptr) return error; return 0; } /** ixgbe_sysctl_tdt_handler - Handler function * Retrieves the TDT value from the hardware */ static int ixgbe_sysctl_tdt_handler(SYSCTL_HANDLER_ARGS) { int error; struct tx_ring *txr = ((struct tx_ring *)oidp->oid_arg1); if (!txr) return 0; unsigned val = IXGBE_READ_REG(&txr->adapter->hw, 
IXGBE_TDT(txr->me)); error = sysctl_handle_int(oidp, &val, 0, req); if (error || !req->newptr) return error; return 0; } /** ixgbe_sysctl_rdh_handler - Handler function * Retrieves the RDH value from the hardware */ static int ixgbe_sysctl_rdh_handler(SYSCTL_HANDLER_ARGS) { int error; struct rx_ring *rxr = ((struct rx_ring *)oidp->oid_arg1); if (!rxr) return 0; unsigned val = IXGBE_READ_REG(&rxr->adapter->hw, IXGBE_RDH(rxr->me)); error = sysctl_handle_int(oidp, &val, 0, req); if (error || !req->newptr) return error; return 0; } /** ixgbe_sysctl_rdt_handler - Handler function * Retrieves the RDT value from the hardware */ static int ixgbe_sysctl_rdt_handler(SYSCTL_HANDLER_ARGS) { int error; struct rx_ring *rxr = ((struct rx_ring *)oidp->oid_arg1); if (!rxr) return 0; unsigned val = IXGBE_READ_REG(&rxr->adapter->hw, IXGBE_RDT(rxr->me)); error = sysctl_handle_int(oidp, &val, 0, req); if (error || !req->newptr) return error; return 0; } static int ixgbe_sysctl_interrupt_rate_handler(SYSCTL_HANDLER_ARGS) { int error; struct ix_queue *que = ((struct ix_queue *)oidp->oid_arg1); unsigned int reg, usec, rate; reg = IXGBE_READ_REG(&que->adapter->hw, IXGBE_EITR(que->msix)); usec = ((reg & 0x0FF8) >> 3); if (usec > 0) rate = 500000 / usec; else rate = 0; error = sysctl_handle_int(oidp, &rate, 0, req); if (error || !req->newptr) return error; reg &= ~0xfff; /* default, no limitation */ ixgbe_max_interrupt_rate = 0; if (rate > 0 && rate < 500000) { if (rate < 1000) rate = 1000; ixgbe_max_interrupt_rate = rate; reg |= ((4000000/rate) & 0xff8 ); } IXGBE_WRITE_REG(&que->adapter->hw, IXGBE_EITR(que->msix), reg); return 0; } /* * Add sysctl variables, one per statistic, to the system. */ static void ixgbe_add_hw_stats(struct adapter *adapter) { device_t dev = adapter->dev; struct tx_ring *txr = adapter->tx_rings; struct rx_ring *rxr = adapter->rx_rings; struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(dev); struct sysctl_oid *tree = device_get_sysctl_tree(dev); struct sysctl_oid_list *child = SYSCTL_CHILDREN(tree); struct ixgbe_hw_stats *stats = &adapter->stats; struct sysctl_oid *stat_node, *queue_node; struct sysctl_oid_list *stat_list, *queue_list; #define QUEUE_NAME_LEN 32 char namebuf[QUEUE_NAME_LEN]; /* Driver Statistics */ SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "dropped", CTLFLAG_RD, &adapter->dropped_pkts, "Driver dropped packets"); SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "mbuf_defrag_failed", CTLFLAG_RD, &adapter->mbuf_defrag_failed, "m_defrag() failed"); SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "watchdog_events", CTLFLAG_RD, &adapter->watchdog_events, "Watchdog timeouts"); SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "link_irq", CTLFLAG_RD, &adapter->link_irq, "Link MSIX IRQ Handled"); for (int i = 0; i < adapter->num_queues; i++, txr++) { snprintf(namebuf, QUEUE_NAME_LEN, "queue%d", i); queue_node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, namebuf, CTLFLAG_RD, NULL, "Queue Name"); queue_list = SYSCTL_CHILDREN(queue_node); SYSCTL_ADD_PROC(ctx, queue_list, OID_AUTO, "interrupt_rate", CTLTYPE_UINT | CTLFLAG_RW, &adapter->queues[i], sizeof(&adapter->queues[i]), ixgbe_sysctl_interrupt_rate_handler, "IU", "Interrupt Rate"); SYSCTL_ADD_UQUAD(ctx, queue_list, OID_AUTO, "irqs", CTLFLAG_RD, &(adapter->queues[i].irqs), "irqs on this queue"); SYSCTL_ADD_PROC(ctx, queue_list, OID_AUTO, "txd_head", CTLTYPE_UINT | CTLFLAG_RD, txr, sizeof(txr), ixgbe_sysctl_tdh_handler, "IU", "Transmit Descriptor Head"); SYSCTL_ADD_PROC(ctx, queue_list, OID_AUTO, "txd_tail", CTLTYPE_UINT | CTLFLAG_RD, txr, sizeof(txr), 
ixgbe_sysctl_tdt_handler, "IU", "Transmit Descriptor Tail"); SYSCTL_ADD_ULONG(ctx, queue_list, OID_AUTO, "tso_tx", CTLFLAG_RD, &txr->tso_tx, "TSO"); SYSCTL_ADD_ULONG(ctx, queue_list, OID_AUTO, "no_tx_dma_setup", CTLFLAG_RD, &txr->no_tx_dma_setup, "Driver tx dma failure in xmit"); SYSCTL_ADD_UQUAD(ctx, queue_list, OID_AUTO, "no_desc_avail", CTLFLAG_RD, &txr->no_desc_avail, "Queue No Descriptor Available"); SYSCTL_ADD_UQUAD(ctx, queue_list, OID_AUTO, "tx_packets", CTLFLAG_RD, &txr->total_packets, "Queue Packets Transmitted"); } for (int i = 0; i < adapter->num_queues; i++, rxr++) { struct lro_ctrl *lro = &rxr->lro; snprintf(namebuf, QUEUE_NAME_LEN, "queue%d", i); queue_node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, namebuf, CTLFLAG_RD, NULL, "Queue Name"); queue_list = SYSCTL_CHILDREN(queue_node); SYSCTL_ADD_PROC(ctx, queue_list, OID_AUTO, "rxd_head", CTLTYPE_UINT | CTLFLAG_RD, rxr, sizeof(rxr), ixgbe_sysctl_rdh_handler, "IU", "Receive Descriptor Head"); SYSCTL_ADD_PROC(ctx, queue_list, OID_AUTO, "rxd_tail", CTLTYPE_UINT | CTLFLAG_RD, rxr, sizeof(rxr), ixgbe_sysctl_rdt_handler, "IU", "Receive Descriptor Tail"); SYSCTL_ADD_UQUAD(ctx, queue_list, OID_AUTO, "rx_packets", CTLFLAG_RD, &rxr->rx_packets, "Queue Packets Received"); SYSCTL_ADD_UQUAD(ctx, queue_list, OID_AUTO, "rx_bytes", CTLFLAG_RD, &rxr->rx_bytes, "Queue Bytes Received"); SYSCTL_ADD_UQUAD(ctx, queue_list, OID_AUTO, "rx_copies", CTLFLAG_RD, &rxr->rx_copies, "Copied RX Frames"); SYSCTL_ADD_INT(ctx, queue_list, OID_AUTO, "lro_queued", CTLFLAG_RD, &lro->lro_queued, 0, "LRO Queued"); SYSCTL_ADD_INT(ctx, queue_list, OID_AUTO, "lro_flushed", CTLFLAG_RD, &lro->lro_flushed, 0, "LRO Flushed"); } /* MAC stats get their own sub node */ stat_node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "mac_stats", CTLFLAG_RD, NULL, "MAC Statistics"); stat_list = SYSCTL_CHILDREN(stat_node); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "crc_errs", CTLFLAG_RD, &stats->crcerrs, "CRC Errors"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "ill_errs", CTLFLAG_RD, &stats->illerrc, "Illegal Byte Errors"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "byte_errs", CTLFLAG_RD, &stats->errbc, "Byte Errors"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "short_discards", CTLFLAG_RD, &stats->mspdc, "MAC Short Packets Discarded"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "local_faults", CTLFLAG_RD, &stats->mlfc, "MAC Local Faults"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "remote_faults", CTLFLAG_RD, &stats->mrfc, "MAC Remote Faults"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "rec_len_errs", CTLFLAG_RD, &stats->rlec, "Receive Length Errors"); /* Flow Control stats */ SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "xon_txd", CTLFLAG_RD, &stats->lxontxc, "Link XON Transmitted"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "xon_recvd", CTLFLAG_RD, &stats->lxonrxc, "Link XON Received"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "xoff_txd", CTLFLAG_RD, &stats->lxofftxc, "Link XOFF Transmitted"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "xoff_recvd", CTLFLAG_RD, &stats->lxoffrxc, "Link XOFF Received"); /* Packet Reception Stats */ SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "total_octets_rcvd", CTLFLAG_RD, &stats->tor, "Total Octets Received"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "good_octets_rcvd", CTLFLAG_RD, &stats->gorc, "Good Octets Received"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO,
"total_pkts_rcvd", CTLFLAG_RD, &stats->tpr, "Total Packets Received"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "good_pkts_rcvd", CTLFLAG_RD, &stats->gprc, "Good Packets Received"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "mcast_pkts_rcvd", CTLFLAG_RD, &stats->mprc, "Multicast Packets Received"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "bcast_pkts_rcvd", CTLFLAG_RD, &stats->bprc, "Broadcast Packets Received"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "rx_frames_64", CTLFLAG_RD, &stats->prc64, "64 byte frames received "); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "rx_frames_65_127", CTLFLAG_RD, &stats->prc127, "65-127 byte frames received"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "rx_frames_128_255", CTLFLAG_RD, &stats->prc255, "128-255 byte frames received"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "rx_frames_256_511", CTLFLAG_RD, &stats->prc511, "256-511 byte frames received"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "rx_frames_512_1023", CTLFLAG_RD, &stats->prc1023, "512-1023 byte frames received"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "rx_frames_1024_1522", CTLFLAG_RD, &stats->prc1522, "1023-1522 byte frames received"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "recv_undersized", CTLFLAG_RD, &stats->ruc, "Receive Undersized"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "recv_fragmented", CTLFLAG_RD, &stats->rfc, "Fragmented Packets Received "); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "recv_oversized", CTLFLAG_RD, &stats->roc, "Oversized Packets Received"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "recv_jabberd", CTLFLAG_RD, &stats->rjc, "Received Jabber"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "management_pkts_rcvd", CTLFLAG_RD, &stats->mngprc, "Management Packets Received"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "management_pkts_drpd", CTLFLAG_RD, &stats->mngptc, "Management Packets Dropped"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "checksum_errs", CTLFLAG_RD, &stats->xec, "Checksum Errors"); /* Packet Transmission Stats */ SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "good_octets_txd", CTLFLAG_RD, &stats->gotc, "Good Octets Transmitted"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "total_pkts_txd", CTLFLAG_RD, &stats->tpt, "Total Packets Transmitted"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "good_pkts_txd", CTLFLAG_RD, &stats->gptc, "Good Packets Transmitted"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "bcast_pkts_txd", CTLFLAG_RD, &stats->bptc, "Broadcast Packets Transmitted"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "mcast_pkts_txd", CTLFLAG_RD, &stats->mptc, "Multicast Packets Transmitted"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "management_pkts_txd", CTLFLAG_RD, &stats->mngptc, "Management Packets Transmitted"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "tx_frames_64", CTLFLAG_RD, &stats->ptc64, "64 byte frames transmitted "); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "tx_frames_65_127", CTLFLAG_RD, &stats->ptc127, "65-127 byte frames transmitted"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "tx_frames_128_255", CTLFLAG_RD, &stats->ptc255, "128-255 byte frames transmitted"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "tx_frames_256_511", CTLFLAG_RD, &stats->ptc511, "256-511 byte frames transmitted"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "tx_frames_512_1023", CTLFLAG_RD, &stats->ptc1023, "512-1023 byte frames transmitted"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "tx_frames_1024_1522", CTLFLAG_RD, &stats->ptc1522, "1024-1522 byte frames transmitted"); } /* ** Set flow control using sysctl: ** Flow control values: 
** 0 - off ** 1 - rx pause ** 2 - tx pause ** 3 - full */ static int ixgbe_set_flowcntl(SYSCTL_HANDLER_ARGS) { int error, last; struct adapter *adapter = (struct adapter *) arg1; last = adapter->fc; error = sysctl_handle_int(oidp, &adapter->fc, 0, req); if ((error) || (req->newptr == NULL)) return (error); /* Don't bother if it's not changed */ if (adapter->fc == last) return (0); switch (adapter->fc) { case ixgbe_fc_rx_pause: case ixgbe_fc_tx_pause: case ixgbe_fc_full: adapter->hw.fc.requested_mode = adapter->fc; if (adapter->num_queues > 1) ixgbe_disable_rx_drop(adapter); break; case ixgbe_fc_none: adapter->hw.fc.requested_mode = ixgbe_fc_none; if (adapter->num_queues > 1) ixgbe_enable_rx_drop(adapter); break; default: adapter->fc = last; return (EINVAL); } /* Don't autoneg if forcing a value */ adapter->hw.fc.disable_fc_autoneg = TRUE; ixgbe_fc_enable(&adapter->hw); return error; } /* ** Control link advertise speed: ** 1 - advertise only 1G ** 2 - advertise 100Mb ** 3 - advertise normal */ static int ixgbe_set_advertise(SYSCTL_HANDLER_ARGS) { int error = 0; struct adapter *adapter; device_t dev; struct ixgbe_hw *hw; ixgbe_link_speed speed, last; adapter = (struct adapter *) arg1; dev = adapter->dev; hw = &adapter->hw; last = adapter->advertise; error = sysctl_handle_int(oidp, &adapter->advertise, 0, req); if ((error) || (req->newptr == NULL)) return (error); if (adapter->advertise == last) /* no change */ return (0); if (!((hw->phy.media_type == ixgbe_media_type_copper) || (hw->phy.multispeed_fiber))) return (EINVAL); if ((adapter->advertise == 2) && (hw->mac.type != ixgbe_mac_X540)) { device_printf(dev, "Set Advertise: 100Mb on X540 only\n"); return (EINVAL); } if (adapter->advertise == 1) speed = IXGBE_LINK_SPEED_1GB_FULL; else if (adapter->advertise == 2) speed = IXGBE_LINK_SPEED_100_FULL; else if (adapter->advertise == 3) speed = IXGBE_LINK_SPEED_1GB_FULL | IXGBE_LINK_SPEED_10GB_FULL; else { /* bogus value */ adapter->advertise = last; return (EINVAL); } hw->mac.autotry_restart = TRUE; hw->mac.ops.setup_link(hw, speed, TRUE); return (error); } /* ** Thermal Shutdown Trigger ** - cause a Thermal Overtemp IRQ ** - this now requires firmware enabling */ static int ixgbe_set_thermal_test(SYSCTL_HANDLER_ARGS) { int error, fire = 0; struct adapter *adapter = (struct adapter *) arg1; struct ixgbe_hw *hw = &adapter->hw; if (hw->mac.type != ixgbe_mac_X540) return (0); error = sysctl_handle_int(oidp, &fire, 0, req); if ((error) || (req->newptr == NULL)) return (error); if (fire) { u32 reg = IXGBE_READ_REG(hw, IXGBE_EICS); reg |= IXGBE_EICR_TS; IXGBE_WRITE_REG(hw, IXGBE_EICS, reg); } return (0); } /* ** Enable the hardware to drop packets when the buffer is ** full. This is useful when multiqueue, so that no single ** queue being full stalls the entire RX engine. We only ** enable this when Multiqueue AND when Flow Control is ** disabled.
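** The knob is the per-queue SRRCTL_DROP_EN bit, set and cleared ** in the two helpers below.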
*/ static void ixgbe_enable_rx_drop(struct adapter *adapter) { struct ixgbe_hw *hw = &adapter->hw; for (int i = 0; i < adapter->num_queues; i++) { u32 srrctl = IXGBE_READ_REG(hw, IXGBE_SRRCTL(i)); srrctl |= IXGBE_SRRCTL_DROP_EN; IXGBE_WRITE_REG(hw, IXGBE_SRRCTL(i), srrctl); } } static void ixgbe_disable_rx_drop(struct adapter *adapter) { struct ixgbe_hw *hw = &adapter->hw; for (int i = 0; i < adapter->num_queues; i++) { u32 srrctl = IXGBE_READ_REG(hw, IXGBE_SRRCTL(i)); srrctl &= ~IXGBE_SRRCTL_DROP_EN; IXGBE_WRITE_REG(hw, IXGBE_SRRCTL(i), srrctl); } } Index: stable/9/sys/dev/netmap/if_em_netmap.h =================================================================== --- stable/9/sys/dev/netmap/if_em_netmap.h (revision 262152) +++ stable/9/sys/dev/netmap/if_em_netmap.h (revision 262153) @@ -1,348 +1,334 @@ /* - * Copyright (C) 2011 Matteo Landi, Luigi Rizzo. All rights reserved. + * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * $FreeBSD$ * - * netmap support for em. + * netmap support for: em. * * For more details on netmap support please see ixgbe_netmap.h */ #include <net/netmap.h> #include <sys/selinfo.h> #include <vm/vm.h> #include <vm/pmap.h> /* vtophys ? */ #include <dev/netmap/netmap_kern.h> -static void em_netmap_block_tasks(struct adapter *); -static void em_netmap_unblock_tasks(struct adapter *); - - // XXX do we need to block/unblock the tasks ? static void em_netmap_block_tasks(struct adapter *adapter) { if (adapter->msix > 1) { /* MSIX */ int i; struct tx_ring *txr = adapter->tx_rings; struct rx_ring *rxr = adapter->rx_rings; for (i = 0; i < adapter->num_queues; i++, txr++, rxr++) { taskqueue_block(txr->tq); taskqueue_drain(txr->tq, &txr->tx_task); taskqueue_block(rxr->tq); taskqueue_drain(rxr->tq, &rxr->rx_task); } } else { /* legacy */ taskqueue_block(adapter->tq); taskqueue_drain(adapter->tq, &adapter->link_task); taskqueue_drain(adapter->tq, &adapter->que_task); } } static void em_netmap_unblock_tasks(struct adapter *adapter) { if (adapter->msix > 1) { struct tx_ring *txr = adapter->tx_rings; struct rx_ring *rxr = adapter->rx_rings; int i; for (i = 0; i < adapter->num_queues; i++) { taskqueue_unblock(txr->tq); taskqueue_unblock(rxr->tq); } } else { /* legacy */ taskqueue_unblock(adapter->tq); } } /* - * Register/unregister routine + * Register/unregister. We are already under netmap lock.
*/ static int -em_netmap_reg(struct ifnet *ifp, int onoff) +em_netmap_reg(struct netmap_adapter *na, int onoff) { + struct ifnet *ifp = na->ifp; struct adapter *adapter = ifp->if_softc; - struct netmap_adapter *na = NA(ifp); - int error = 0; - if (na == NULL) - return EINVAL; /* no netmap support here */ - + EM_CORE_LOCK(adapter); em_disable_intr(adapter); /* Tell the stack that the interface is no longer active */ ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE); em_netmap_block_tasks(adapter); - + /* enable or disable flags and callbacks in na and ifp */ if (onoff) { - ifp->if_capenable |= IFCAP_NETMAP; - - na->if_transmit = ifp->if_transmit; - ifp->if_transmit = netmap_transmit; - - em_init_locked(adapter); - if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) == 0) { - error = ENOMEM; - goto fail; - } + nm_set_native_flags(na); } else { -fail: - /* return to non-netmap mode */ - ifp->if_transmit = na->if_transmit; - ifp->if_capenable &= ~IFCAP_NETMAP; - em_init_locked(adapter); /* also enable intr */ + nm_clear_native_flags(na); } + em_init_locked(adapter); /* also enable intr */ em_netmap_unblock_tasks(adapter); - return (error); + EM_CORE_UNLOCK(adapter); + return (ifp->if_drv_flags & IFF_DRV_RUNNING ? 0 : 1); } /* * Reconcile kernel and user view of the transmit ring. */ static int -em_netmap_txsync(struct ifnet *ifp, u_int ring_nr, int flags) +em_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) { - struct adapter *adapter = ifp->if_softc; - struct tx_ring *txr = &adapter->tx_rings[ring_nr]; - struct netmap_adapter *na = NA(ifp); + struct ifnet *ifp = na->ifp; struct netmap_kring *kring = &na->tx_rings[ring_nr]; struct netmap_ring *ring = kring->ring; - u_int j, k, l, n = 0, lim = kring->nkr_num_slots - 1; - + u_int nm_i; /* index into the netmap ring */ + u_int nic_i; /* index into the NIC ring */ + u_int n; + u_int const lim = kring->nkr_num_slots - 1; + u_int const head = kring->rhead; /* generate an interrupt approximately every half ring */ u_int report_frequency = kring->nkr_num_slots >> 1; - k = ring->cur; - if (k > lim) - return netmap_ring_reinit(kring); + /* device-specific */ + struct adapter *adapter = ifp->if_softc; + struct tx_ring *txr = &adapter->tx_rings[ring_nr]; bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map, BUS_DMASYNC_POSTREAD); /* - * Process new packets to send. j is the current index in the - * netmap ring, l is the corresponding index in the NIC ring. + * First part: process new packets to send. */ - j = kring->nr_hwcur; - if (j != k) { /* we have new packets to send */ - l = netmap_idx_k2n(kring, j); - for (n = 0; j != k; n++) { - /* slot is the current slot in the netmap ring */ - struct netmap_slot *slot = &ring->slot[j]; - /* curr is the current slot in the nic ring */ - struct e1000_tx_desc *curr = &txr->tx_base[l]; - struct em_buffer *txbuf = &txr->tx_buffers[l]; - int flags = ((slot->flags & NS_REPORT) || - j == 0 || j == report_frequency) ? 
- E1000_TXD_CMD_RS : 0; + + nm_i = kring->nr_hwcur; + if (nm_i != head) { /* we have new packets to send */ + nic_i = netmap_idx_k2n(kring, nm_i); + for (n = 0; nm_i != head; n++) { + struct netmap_slot *slot = &ring->slot[nm_i]; + u_int len = slot->len; uint64_t paddr; void *addr = PNMB(slot, &paddr); - u_int len = slot->len; - if (addr == netmap_buffer_base || len > NETMAP_BUF_SIZE) { - return netmap_ring_reinit(kring); - } + /* device-specific */ + struct e1000_tx_desc *curr = &txr->tx_base[nic_i]; + struct em_buffer *txbuf = &txr->tx_buffers[nic_i]; + int flags = (slot->flags & NS_REPORT || + nic_i == 0 || nic_i == report_frequency) ? + E1000_TXD_CMD_RS : 0; - slot->flags &= ~NS_REPORT; + NM_CHECK_ADDR_LEN(addr, len); + if (slot->flags & NS_BUF_CHANGED) { curr->buffer_addr = htole64(paddr); /* buffer has changed, reload map */ netmap_reload_map(txr->txtag, txbuf->map, addr); - slot->flags &= ~NS_BUF_CHANGED; } + slot->flags &= ~(NS_REPORT | NS_BUF_CHANGED); + + /* Fill the slot in the NIC ring. */ curr->upper.data = 0; curr->lower.data = htole32(adapter->txd_cmd | len | (E1000_TXD_CMD_EOP | flags) ); bus_dmamap_sync(txr->txtag, txbuf->map, BUS_DMASYNC_PREWRITE); - j = (j == lim) ? 0 : j + 1; - l = (l == lim) ? 0 : l + 1; + + nm_i = nm_next(nm_i, lim); + nic_i = nm_next(nic_i, lim); } - kring->nr_hwcur = k; /* the saved ring->cur */ - kring->nr_hwavail -= n; + kring->nr_hwcur = head; + /* synchronize the NIC ring */ bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map, - BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); + BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); - E1000_WRITE_REG(&adapter->hw, E1000_TDT(txr->me), l); + /* (re)start the tx unit up to slot nic_i (excluded) */ + E1000_WRITE_REG(&adapter->hw, E1000_TDT(txr->me), nic_i); } - if (n == 0 || kring->nr_hwavail < 1) { - int delta; - + /* + * Second part: reclaim buffers for completed transmissions. + */ + if (flags & NAF_FORCE_RECLAIM || nm_kr_txempty(kring)) { /* record completed transmissions using TDH */ - l = E1000_READ_REG(&adapter->hw, E1000_TDH(ring_nr)); - if (l >= kring->nkr_num_slots) { /* XXX can it happen ? */ - D("TDH wrap %d", l); - l -= kring->nkr_num_slots; + nic_i = E1000_READ_REG(&adapter->hw, E1000_TDH(ring_nr)); + if (nic_i >= kring->nkr_num_slots) { /* XXX can it happen ? */ + D("TDH wrap %d", nic_i); + nic_i -= kring->nkr_num_slots; } - delta = l - txr->next_to_clean; - if (delta) { - /* some completed, increment hwavail. */ - if (delta < 0) - delta += kring->nkr_num_slots; - txr->next_to_clean = l; - kring->nr_hwavail += delta; + if (nic_i != txr->next_to_clean) { + txr->next_to_clean = nic_i; + kring->nr_hwtail = nm_prev(netmap_idx_n2k(kring, nic_i), lim); } } - /* update avail to what the kernel knows */ - ring->avail = kring->nr_hwavail; + nm_txsync_finalize(kring); + return 0; } /* * Reconcile kernel and user view of the receive ring. 
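 * The first loop below publishes newly received frames to the netmap * ring (advancing nr_hwtail); the second returns the slots userspace * has released to the NIC (advancing nr_hwcur and the RDT register).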
*/ static int -em_netmap_rxsync(struct ifnet *ifp, u_int ring_nr, int flags) +em_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) { - struct adapter *adapter = ifp->if_softc; - struct rx_ring *rxr = &adapter->rx_rings[ring_nr]; - struct netmap_adapter *na = NA(ifp); + struct ifnet *ifp = na->ifp; struct netmap_kring *kring = &na->rx_rings[ring_nr]; struct netmap_ring *ring = kring->ring; - u_int j, l, n, lim = kring->nkr_num_slots - 1; + u_int nm_i; /* index into the netmap ring */ + u_int nic_i; /* index into the NIC ring */ + u_int n; + u_int const lim = kring->nkr_num_slots - 1; + u_int const head = nm_rxsync_prologue(kring); int force_update = (flags & NAF_FORCE_READ) || kring->nr_kflags & NKR_PENDINTR; - u_int k = ring->cur, resvd = ring->reserved; - k = ring->cur; - if (k > lim) + /* device-specific */ + struct adapter *adapter = ifp->if_softc; + struct rx_ring *rxr = &adapter->rx_rings[ring_nr]; + + if (head > lim) return netmap_ring_reinit(kring); /* XXX check sync modes */ bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map, BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); /* - * Import newly received packets into the netmap ring. - * j is an index in the netmap ring, l in the NIC ring. + * First part: import newly received packets. */ - l = rxr->next_to_check; - j = netmap_idx_n2k(kring, l); if (netmap_no_pendintr || force_update) { uint16_t slot_flags = kring->nkr_slot_flags; - for (n = 0; ; n++) { - struct e1000_rx_desc *curr = &rxr->rx_base[l]; + nic_i = rxr->next_to_check; + nm_i = netmap_idx_n2k(kring, nic_i); + + for (n = 0; ; n++) { // XXX no need to count + struct e1000_rx_desc *curr = &rxr->rx_base[nic_i]; uint32_t staterr = le32toh(curr->status); if ((staterr & E1000_RXD_STAT_DD) == 0) break; - ring->slot[j].len = le16toh(curr->length); - ring->slot[j].flags = slot_flags; - bus_dmamap_sync(rxr->rxtag, rxr->rx_buffers[l].map, + ring->slot[nm_i].len = le16toh(curr->length); + ring->slot[nm_i].flags = slot_flags; + bus_dmamap_sync(rxr->rxtag, rxr->rx_buffers[nic_i].map, BUS_DMASYNC_POSTREAD); - j = (j == lim) ? 0 : j + 1; + nm_i = nm_next(nm_i, lim); /* make sure next_to_refresh follows next_to_check */ - rxr->next_to_refresh = l; // XXX - l = (l == lim) ? 0 : l + 1; + rxr->next_to_refresh = nic_i; // XXX + nic_i = nm_next(nic_i, lim); } if (n) { /* update the state variables */ - rxr->next_to_check = l; - kring->nr_hwavail += n; + rxr->next_to_check = nic_i; + kring->nr_hwtail = nm_i; } kring->nr_kflags &= ~NKR_PENDINTR; } - /* skip past packets that userspace has released */ - j = kring->nr_hwcur; /* netmap ring index */ - if (resvd > 0) { - if (resvd + ring->avail >= lim + 1) { - D("XXX invalid reserve/avail %d %d", resvd, ring->avail); - ring->reserved = resvd = 0; // XXX panic... - } - k = (k >= resvd) ? k - resvd : k + lim + 1 - resvd; - } - if (j != k) { /* userspace has released some packets. */ - l = netmap_idx_k2n(kring, j); /* NIC ring index */ - for (n = 0; j != k; n++) { - struct netmap_slot *slot = &ring->slot[j]; - struct e1000_rx_desc *curr = &rxr->rx_base[l]; - struct em_buffer *rxbuf = &rxr->rx_buffers[l]; + /* + * Second part: skip past packets that userspace has released. 
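+ * Each released slot is rebound to its NIC descriptor (reloading + * the DMA map if NS_BUF_CHANGED is set) before RDT is finally + * advanced to hand the slots back to the hardware.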
+ */ + nm_i = kring->nr_hwcur; + if (nm_i != head) { + nic_i = netmap_idx_k2n(kring, nm_i); + for (n = 0; nm_i != head; n++) { + struct netmap_slot *slot = &ring->slot[nm_i]; uint64_t paddr; void *addr = PNMB(slot, &paddr); - if (addr == netmap_buffer_base) { /* bad buf */ - return netmap_ring_reinit(kring); - } + struct e1000_rx_desc *curr = &rxr->rx_base[nic_i]; + struct em_buffer *rxbuf = &rxr->rx_buffers[nic_i]; + if (addr == netmap_buffer_base) /* bad buf */ + goto ring_reset; + if (slot->flags & NS_BUF_CHANGED) { - curr->buffer_addr = htole64(paddr); /* buffer has changed, reload map */ + curr->buffer_addr = htole64(paddr); netmap_reload_map(rxr->rxtag, rxbuf->map, addr); slot->flags &= ~NS_BUF_CHANGED; } curr->status = 0; bus_dmamap_sync(rxr->rxtag, rxbuf->map, BUS_DMASYNC_PREREAD); - j = (j == lim) ? 0 : j + 1; - l = (l == lim) ? 0 : l + 1; + nm_i = nm_next(nm_i, lim); + nic_i = nm_next(nic_i, lim); } - kring->nr_hwavail -= n; - kring->nr_hwcur = k; + kring->nr_hwcur = head; + bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); /* * IMPORTANT: we must leave one free slot in the ring, - * so move l back by one unit + * so move nic_i back by one unit */ - l = (l == 0) ? lim : l - 1; - E1000_WRITE_REG(&adapter->hw, E1000_RDT(rxr->me), l); + nic_i = nm_prev(nic_i, lim); + E1000_WRITE_REG(&adapter->hw, E1000_RDT(rxr->me), nic_i); } - /* tell userspace that there are new packets */ - ring->avail = kring->nr_hwavail - resvd; + + /* tell userspace that there might be new packets */ + nm_rxsync_finalize(kring); + return 0; + +ring_reset: + return netmap_ring_reinit(kring); } static void em_netmap_attach(struct adapter *adapter) { struct netmap_adapter na; bzero(&na, sizeof(na)); na.ifp = adapter->ifp; na.na_flags = NAF_BDG_MAYSLEEP; na.num_tx_desc = adapter->num_tx_desc; na.num_rx_desc = adapter->num_rx_desc; na.nm_txsync = em_netmap_txsync; na.nm_rxsync = em_netmap_rxsync; na.nm_register = em_netmap_reg; - netmap_attach(&na, adapter->num_queues); + na.num_tx_rings = na.num_rx_rings = adapter->num_queues; + netmap_attach(&na); } /* end of file */ Index: stable/9/sys/dev/netmap/if_igb_netmap.h =================================================================== --- stable/9/sys/dev/netmap/if_igb_netmap.h (revision 262152) +++ stable/9/sys/dev/netmap/if_igb_netmap.h (revision 262153) @@ -1,319 +1,314 @@ /* - * Copyright (C) 2011 Universita` di Pisa. All rights reserved. + * Copyright (C) 2011-2014 Universita` di Pisa. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * $FreeBSD$ * * Netmap support for igb, partly contributed by Ahmed Kooli * For details on netmap support please see ixgbe_netmap.h */ #include <net/netmap.h> #include <sys/selinfo.h> #include <vm/vm.h> #include <vm/pmap.h> /* vtophys ? */ #include <dev/netmap/netmap_kern.h> +/* * Adaptation to different versions of the driver. */ +#ifndef IGB_MEDIA_RESET +/* at the same time as IGB_MEDIA_RESET was defined, the + * tx buffer descriptor was renamed, so use this to revert + * back to the old name. + */ +#define igb_tx_buf igb_tx_buffer +#endif + + /* - * register-unregister routine + * Register/unregister. We are already under netmap lock. */ static int -igb_netmap_reg(struct ifnet *ifp, int onoff) +igb_netmap_reg(struct netmap_adapter *na, int onoff) { + struct ifnet *ifp = na->ifp; struct adapter *adapter = ifp->if_softc; - struct netmap_adapter *na = NA(ifp); - int error = 0; - if (na == NULL) - return EINVAL; /* no netmap support here */ - + IGB_CORE_LOCK(adapter); igb_disable_intr(adapter); /* Tell the stack that the interface is no longer active */ ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE); + /* enable or disable flags and callbacks in na and ifp */ if (onoff) { - ifp->if_capenable |= IFCAP_NETMAP; - - na->if_transmit = ifp->if_transmit; - ifp->if_transmit = netmap_transmit; - - igb_init_locked(adapter); - if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) == 0) { - error = ENOMEM; - goto fail; - } + nm_set_native_flags(na); } else { -fail: - /* restore if_transmit */ - ifp->if_transmit = na->if_transmit; - ifp->if_capenable &= ~IFCAP_NETMAP; - igb_init_locked(adapter); /* also enable intr */ + nm_clear_native_flags(na); } - return (error); + igb_init_locked(adapter); /* also enable intr */ + IGB_CORE_UNLOCK(adapter); + return (ifp->if_drv_flags & IFF_DRV_RUNNING ? 0 : 1); } /* * Reconcile kernel and user view of the transmit ring. */ static int -igb_netmap_txsync(struct ifnet *ifp, u_int ring_nr, int flags) +igb_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) { - struct adapter *adapter = ifp->if_softc; - struct tx_ring *txr = &adapter->tx_rings[ring_nr]; - struct netmap_adapter *na = NA(ifp); + struct ifnet *ifp = na->ifp; struct netmap_kring *kring = &na->tx_rings[ring_nr]; struct netmap_ring *ring = kring->ring; - u_int j, k, l, n = 0, lim = kring->nkr_num_slots - 1; - + u_int nm_i; /* index into the netmap ring */ + u_int nic_i; /* index into the NIC ring */ + u_int n; + u_int const lim = kring->nkr_num_slots - 1; + u_int const head = kring->rhead; /* generate an interrupt approximately every half ring */ u_int report_frequency = kring->nkr_num_slots >> 1; - k = ring->cur; - if (k > lim) - return netmap_ring_reinit(kring); + /* device-specific */ + struct adapter *adapter = ifp->if_softc; + struct tx_ring *txr = &adapter->tx_rings[ring_nr]; + /* 82575 needs the queue index added */ + u32 olinfo_status = + (adapter->hw.mac.type == e1000_82575) ?
(txr->me << 4) : 0; bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map, - BUS_DMASYNC_POSTREAD); + BUS_DMASYNC_POSTREAD); - /* check for new packets to send. - * j indexes the netmap ring, l indexes the nic ring, and - * j = kring->nr_hwcur, l = E1000_TDT (not tracked), - * j == (l + kring->nkr_hwofs) % ring_size + /* + * First part: process new packets to send. */ - j = kring->nr_hwcur; - if (j != k) { /* we have new packets to send */ - /* 82575 needs the queue index added */ - u32 olinfo_status = - (adapter->hw.mac.type == e1000_82575) ? (txr->me << 4) : 0; - l = netmap_idx_k2n(kring, j); - for (n = 0; j != k; n++) { - /* slot is the current slot in the netmap ring */ - struct netmap_slot *slot = &ring->slot[j]; - /* curr is the current slot in the nic ring */ - union e1000_adv_tx_desc *curr = - (union e1000_adv_tx_desc *)&txr->tx_base[l]; -#ifndef IGB_MEDIA_RESET -/* at the same time as IGB_MEDIA_RESET was defined, the - * tx buffer descriptor was renamed, so use this to revert - * back to the old name. - */ -#define igb_tx_buf igb_tx_buffer -#endif - struct igb_tx_buf *txbuf = &txr->tx_buffers[l]; - int flags = ((slot->flags & NS_REPORT) || - j == 0 || j == report_frequency) ? - E1000_ADVTXD_DCMD_RS : 0; + nm_i = kring->nr_hwcur; + if (nm_i != head) { /* we have new packets to send */ + nic_i = netmap_idx_k2n(kring, nm_i); + for (n = 0; nm_i != head; n++) { + struct netmap_slot *slot = &ring->slot[nm_i]; + u_int len = slot->len; uint64_t paddr; void *addr = PNMB(slot, &paddr); - u_int len = slot->len; - if (addr == netmap_buffer_base || len > NETMAP_BUF_SIZE) { - return netmap_ring_reinit(kring); - } + /* device-specific */ + union e1000_adv_tx_desc *curr = + (union e1000_adv_tx_desc *)&txr->tx_base[nic_i]; + struct igb_tx_buf *txbuf = &txr->tx_buffers[nic_i]; + int flags = (slot->flags & NS_REPORT || + nic_i == 0 || nic_i == report_frequency) ? + E1000_ADVTXD_DCMD_RS : 0; - slot->flags &= ~NS_REPORT; + NM_CHECK_ADDR_LEN(addr, len); + if (slot->flags & NS_BUF_CHANGED) { /* buffer has changed, reload map */ netmap_reload_map(txr->txtag, txbuf->map, addr); - slot->flags &= ~NS_BUF_CHANGED; } + slot->flags &= ~(NS_REPORT | NS_BUF_CHANGED); + + /* Fill the slot in the NIC ring. */ curr->read.buffer_addr = htole64(paddr); // XXX check olinfo and cmd_type_len curr->read.olinfo_status = htole32(olinfo_status | (len<< E1000_ADVTXD_PAYLEN_SHIFT)); curr->read.cmd_type_len = htole32(len | E1000_ADVTXD_DTYP_DATA | - E1000_ADVTXD_DCMD_IFCS | - E1000_ADVTXD_DCMD_DEXT | - E1000_ADVTXD_DCMD_EOP | flags); + E1000_ADVTXD_DCMD_IFCS | + E1000_ADVTXD_DCMD_DEXT | + E1000_ADVTXD_DCMD_EOP | flags); + /* make sure changes to the buffer are synced */ bus_dmamap_sync(txr->txtag, txbuf->map, BUS_DMASYNC_PREWRITE); - j = (j == lim) ? 0 : j + 1; - l = (l == lim) ? 0 : l + 1; + + nm_i = nm_next(nm_i, lim); + nic_i = nm_next(nic_i, lim); } - kring->nr_hwcur = k; /* the saved ring->cur */ - kring->nr_hwavail -= n; + kring->nr_hwcur = head; /* Set the watchdog XXX ? */ txr->queue_status = IGB_QUEUE_WORKING; txr->watchdog_time = ticks; + /* synchronize the NIC ring */ bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map, - BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); + BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); - E1000_WRITE_REG(&adapter->hw, E1000_TDT(txr->me), l); + /* (re)start the tx unit up to slot nic_i (excluded) */ + E1000_WRITE_REG(&adapter->hw, E1000_TDT(txr->me), nic_i); } - if (n == 0 || kring->nr_hwavail < 1) { - int delta; - + /* + * Second part: reclaim buffers for completed transmissions. 
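+ * TDH reports how far the hardware has progressed; nr_hwtail is + * advanced to match so the reclaimed slots become visible to + * userspace.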
+ */ + if (flags & NAF_FORCE_RECLAIM || nm_kr_txempty(kring)) { /* record completed transmissions using TDH */ - l = E1000_READ_REG(&adapter->hw, E1000_TDH(ring_nr)); - if (l >= kring->nkr_num_slots) { /* XXX can it happen ? */ - D("TDH wrap %d", l); - l -= kring->nkr_num_slots; + nic_i = E1000_READ_REG(&adapter->hw, E1000_TDH(ring_nr)); + if (nic_i >= kring->nkr_num_slots) { /* XXX can it happen ? */ + D("TDH wrap %d", nic_i); + nic_i -= kring->nkr_num_slots; } - delta = l - txr->next_to_clean; - if (delta) { - /* some completed, increment hwavail. */ - if (delta < 0) - delta += kring->nkr_num_slots; - txr->next_to_clean = l; - kring->nr_hwavail += delta; - } + txr->next_to_clean = nic_i; + kring->nr_hwtail = nm_prev(netmap_idx_n2k(kring, nic_i), lim); } - /* update avail to what the kernel knows */ - ring->avail = kring->nr_hwavail; + nm_txsync_finalize(kring); + return 0; } /* * Reconcile kernel and user view of the receive ring. */ static int -igb_netmap_rxsync(struct ifnet *ifp, u_int ring_nr, int flags) +igb_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) { - struct adapter *adapter = ifp->if_softc; - struct rx_ring *rxr = &adapter->rx_rings[ring_nr]; - struct netmap_adapter *na = NA(ifp); + struct ifnet *ifp = na->ifp; struct netmap_kring *kring = &na->rx_rings[ring_nr]; struct netmap_ring *ring = kring->ring; - u_int j, l, n, lim = kring->nkr_num_slots - 1; + u_int nm_i; /* index into the netmap ring */ + u_int nic_i; /* index into the NIC ring */ + u_int n; + u_int const lim = kring->nkr_num_slots - 1; + u_int const head = nm_rxsync_prologue(kring); int force_update = (flags & NAF_FORCE_READ) || kring->nr_kflags & NKR_PENDINTR; - u_int k = ring->cur, resvd = ring->reserved; - k = ring->cur; - if (k > lim) + /* device-specific */ + struct adapter *adapter = ifp->if_softc; + struct rx_ring *rxr = &adapter->rx_rings[ring_nr]; + + if (head > lim) return netmap_ring_reinit(kring); /* XXX check sync modes */ bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map, - BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); + BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); /* - * import newly received packets into the netmap ring. - * j is an index in the netmap ring, l in the NIC ring. + * First part: import newly received packets. */ - l = rxr->next_to_check; - j = netmap_idx_n2k(kring, l); if (netmap_no_pendintr || force_update) { uint16_t slot_flags = kring->nkr_slot_flags; + nic_i = rxr->next_to_check; + nm_i = netmap_idx_n2k(kring, nic_i); + for (n = 0; ; n++) { - union e1000_adv_rx_desc *curr = &rxr->rx_base[l]; + union e1000_adv_rx_desc *curr = &rxr->rx_base[nic_i]; uint32_t staterr = le32toh(curr->wb.upper.status_error); if ((staterr & E1000_RXD_STAT_DD) == 0) break; - ring->slot[j].len = le16toh(curr->wb.upper.length); - ring->slot[j].flags = slot_flags; + ring->slot[nm_i].len = le16toh(curr->wb.upper.length); + ring->slot[nm_i].flags = slot_flags; bus_dmamap_sync(rxr->ptag, - rxr->rx_buffers[l].pmap, BUS_DMASYNC_POSTREAD); - j = (j == lim) ? 0 : j + 1; - l = (l == lim) ? 
0 : l + 1; + rxr->rx_buffers[nic_i].pmap, BUS_DMASYNC_POSTREAD); + nm_i = nm_next(nm_i, lim); + nic_i = nm_next(nic_i, lim); } if (n) { /* update the state variables */ - rxr->next_to_check = l; - kring->nr_hwavail += n; + rxr->next_to_check = nic_i; + kring->nr_hwtail = nm_i; } kring->nr_kflags &= ~NKR_PENDINTR; } - /* skip past packets that userspace has released */ - j = kring->nr_hwcur; /* netmap ring index */ - if (resvd > 0) { - if (resvd + ring->avail >= lim + 1) { - D("XXX invalid reserve/avail %d %d", resvd, ring->avail); - ring->reserved = resvd = 0; // XXX panic... - } - k = (k >= resvd) ? k - resvd : k + lim + 1 - resvd; - } - if (j != k) { /* userspace has released some packets. */ - l = netmap_idx_k2n(kring, j); - for (n = 0; j != k; n++) { - struct netmap_slot *slot = ring->slot + j; - union e1000_adv_rx_desc *curr = &rxr->rx_base[l]; - struct igb_rx_buf *rxbuf = rxr->rx_buffers + l; + /* + * Second part: skip past packets that userspace has released. + */ + nm_i = kring->nr_hwcur; + if (nm_i != head) { + nic_i = netmap_idx_k2n(kring, nm_i); + for (n = 0; nm_i != head; n++) { + struct netmap_slot *slot = &ring->slot[nm_i]; uint64_t paddr; void *addr = PNMB(slot, &paddr); - if (addr == netmap_buffer_base) { /* bad buf */ - return netmap_ring_reinit(kring); - } + union e1000_adv_rx_desc *curr = &rxr->rx_base[nic_i]; + struct igb_rx_buf *rxbuf = &rxr->rx_buffers[nic_i]; + if (addr == netmap_buffer_base) /* bad buf */ + goto ring_reset; + if (slot->flags & NS_BUF_CHANGED) { + /* buffer has changed, reload map */ netmap_reload_map(rxr->ptag, rxbuf->pmap, addr); slot->flags &= ~NS_BUF_CHANGED; } - curr->read.pkt_addr = htole64(paddr); curr->wb.upper.status_error = 0; + curr->read.pkt_addr = htole64(paddr); bus_dmamap_sync(rxr->ptag, rxbuf->pmap, - BUS_DMASYNC_PREREAD); - j = (j == lim) ? 0 : j + 1; - l = (l == lim) ? 0 : l + 1; + BUS_DMASYNC_PREREAD); + nm_i = nm_next(nm_i, lim); + nic_i = nm_next(nic_i, lim); } - kring->nr_hwavail -= n; - kring->nr_hwcur = k; + kring->nr_hwcur = head; + bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map, - BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); + BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); /* * IMPORTANT: we must leave one free slot in the ring, - * so move l back by one unit + * so move nic_i back by one unit */ - l = (l == 0) ? lim : l - 1; - E1000_WRITE_REG(&adapter->hw, E1000_RDT(rxr->me), l); + nic_i = nm_prev(nic_i, lim); + E1000_WRITE_REG(&adapter->hw, E1000_RDT(rxr->me), nic_i); } - /* tell userspace that there are new packets */ - ring->avail = kring->nr_hwavail - resvd; + + /* tell userspace that there might be new packets */ + nm_rxsync_finalize(kring); + return 0; + +ring_reset: + return netmap_ring_reinit(kring); } static void igb_netmap_attach(struct adapter *adapter) { struct netmap_adapter na; bzero(&na, sizeof(na)); na.ifp = adapter->ifp; na.na_flags = NAF_BDG_MAYSLEEP; na.num_tx_desc = adapter->num_tx_desc; na.num_rx_desc = adapter->num_rx_desc; na.nm_txsync = igb_netmap_txsync; na.nm_rxsync = igb_netmap_rxsync; na.nm_register = igb_netmap_reg; - netmap_attach(&na, adapter->num_queues); -} + na.num_tx_rings = na.num_rx_rings = adapter->num_queues; + netmap_attach(&na); +} + /* end of file */ Index: stable/9/sys/dev/netmap/if_lem_netmap.h =================================================================== --- stable/9/sys/dev/netmap/if_lem_netmap.h (revision 262152) +++ stable/9/sys/dev/netmap/if_lem_netmap.h (revision 262153) @@ -1,341 +1,311 @@ /* - * Copyright (C) 2011 Matteo Landi, Luigi Rizzo. All rights reserved. 
+ * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * $FreeBSD$ * - * netmap support for "lem" + * netmap support for: lem * * For details on netmap support please see ixgbe_netmap.h */ + #include <net/netmap.h> #include <sys/selinfo.h> #include <vm/vm.h> #include <vm/pmap.h> /* vtophys ? */ #include <dev/netmap/netmap_kern.h> /* - * Register/unregister + * Register/unregister. We are already under netmap lock. */ static int -lem_netmap_reg(struct ifnet *ifp, int onoff) +lem_netmap_reg(struct netmap_adapter *na, int onoff) { + struct ifnet *ifp = na->ifp; struct adapter *adapter = ifp->if_softc; - struct netmap_adapter *na = NA(ifp); - int error = 0; - if (na == NULL) - return EINVAL; - EM_CORE_LOCK(adapter); lem_disable_intr(adapter); /* Tell the stack that the interface is no longer active */ ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE); #ifndef EM_LEGACY_IRQ // XXX do we need this ? taskqueue_block(adapter->tq); taskqueue_drain(adapter->tq, &adapter->rxtx_task); taskqueue_drain(adapter->tq, &adapter->link_task); #endif /* !EM_LEGACY_IRQ */ - if (onoff) { - ifp->if_capenable |= IFCAP_NETMAP; - na->if_transmit = ifp->if_transmit; - ifp->if_transmit = netmap_transmit; - - lem_init_locked(adapter); - if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) == 0) { - error = ENOMEM; - goto fail; - } + /* enable or disable flags and callbacks in na and ifp */ + if (onoff) { + nm_set_native_flags(na); } else { -fail: - /* return to non-netmap mode */ - ifp->if_transmit = na->if_transmit; - ifp->if_capenable &= ~IFCAP_NETMAP; - lem_init_locked(adapter); /* also enable intr */ + nm_clear_native_flags(na); } + lem_init_locked(adapter); /* also enable intr */ #ifndef EM_LEGACY_IRQ taskqueue_unblock(adapter->tq); // XXX do we need this ? #endif /* !EM_LEGACY_IRQ */ EM_CORE_UNLOCK(adapter); - return (error); + return (ifp->if_drv_flags & IFF_DRV_RUNNING ? 0 : 1); } /* * Reconcile kernel and user view of the transmit ring.
*/ static int -lem_netmap_txsync(struct ifnet *ifp, u_int ring_nr, int flags) +lem_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) { - struct adapter *adapter = ifp->if_softc; - struct netmap_adapter *na = NA(ifp); + struct ifnet *ifp = na->ifp; struct netmap_kring *kring = &na->tx_rings[ring_nr]; struct netmap_ring *ring = kring->ring; - u_int j, k, l, n = 0, lim = kring->nkr_num_slots - 1; - + u_int nm_i; /* index into the netmap ring */ + u_int nic_i; /* index into the NIC ring */ + u_int const lim = kring->nkr_num_slots - 1; + u_int const head = kring->rhead; /* generate an interrupt approximately every half ring */ - int report_frequency = kring->nkr_num_slots >> 1; + u_int report_frequency = kring->nkr_num_slots >> 1; - ND("%s: hwofs %d, hwcur %d hwavail %d lease %d cur %d avail %d", - ifp->if_xname, - kring->nkr_hwofs, kring->nr_hwcur, kring->nr_hwavail, - kring->nkr_hwlease, - ring->cur, ring->avail); - /* take a copy of ring->cur now, and never read it again */ - k = ring->cur; - if (k > lim) - return netmap_ring_reinit(kring); + /* device-specific */ + struct adapter *adapter = ifp->if_softc; bus_dmamap_sync(adapter->txdma.dma_tag, adapter->txdma.dma_map, BUS_DMASYNC_POSTREAD); + /* - * Process new packets to send. j is the current index in the - * netmap ring, l is the corresponding index in the NIC ring. + * First part: process new packets to send. */ - j = kring->nr_hwcur; - if (netmap_verbose > 255) - RD(5, "device %s send %d->%d", ifp->if_xname, j, k); - if (j != k) { /* we have new packets to send */ - l = netmap_idx_k2n(kring, j); - for (n = 0; j != k; n++) { - /* slot is the current slot in the netmap ring */ - struct netmap_slot *slot = &ring->slot[j]; - /* curr is the current slot in the nic ring */ - struct e1000_tx_desc *curr = &adapter->tx_desc_base[l]; - struct em_buffer *txbuf = &adapter->tx_buffer_area[l]; - int flags = ((slot->flags & NS_REPORT) || - j == 0 || j == report_frequency) ? - E1000_TXD_CMD_RS : 0; + + nm_i = kring->nr_hwcur; + if (nm_i != head) { /* we have new packets to send */ + nic_i = netmap_idx_k2n(kring, nm_i); + while (nm_i != head) { + struct netmap_slot *slot = &ring->slot[nm_i]; + u_int len = slot->len; uint64_t paddr; void *addr = PNMB(slot, &paddr); - u_int len = slot->len; - if (addr == netmap_buffer_base || len > NETMAP_BUF_SIZE) { - return netmap_ring_reinit(kring); - } - ND("slot %d NIC %d %s", j, l, nm_dump_buf(addr, len, 128, NULL)); + /* device-specific */ + struct e1000_tx_desc *curr = &adapter->tx_desc_base[nic_i]; + struct em_buffer *txbuf = &adapter->tx_buffer_area[nic_i]; + int flags = (slot->flags & NS_REPORT || + nic_i == 0 || nic_i == report_frequency) ? + E1000_TXD_CMD_RS : 0; - slot->flags &= ~NS_REPORT; - if (1 || slot->flags & NS_BUF_CHANGED) { + NM_CHECK_ADDR_LEN(addr, len); + + if (slot->flags & NS_BUF_CHANGED) { /* buffer has changed, reload map */ - netmap_reload_map(adapter->txtag, txbuf->map, addr); curr->buffer_addr = htole64(paddr); - slot->flags &= ~NS_BUF_CHANGED; + netmap_reload_map(adapter->txtag, txbuf->map, addr); } + slot->flags &= ~(NS_REPORT | NS_BUF_CHANGED); + + /* Fill the slot in the NIC ring. */ curr->upper.data = 0; - curr->lower.data = - htole32( adapter->txd_cmd | len | + curr->lower.data = htole32(adapter->txd_cmd | len | (E1000_TXD_CMD_EOP | flags) ); - - ND("len %d kring %d nic %d", len, j, l); bus_dmamap_sync(adapter->txtag, txbuf->map, - BUS_DMASYNC_PREWRITE); - j = (j == lim) ? 0 : j + 1; - l = (l == lim) ? 
0 : l + 1; + BUS_DMASYNC_PREWRITE); + + nm_i = nm_next(nm_i, lim); + nic_i = nm_next(nic_i, lim); } - ND("sent %d packets from %d, TDT now %d", n, kring->nr_hwcur, l); - kring->nr_hwcur = k; /* the saved ring->cur */ - kring->nr_hwavail -= n; + kring->nr_hwcur = head; + /* synchronize the NIC ring */ bus_dmamap_sync(adapter->txdma.dma_tag, adapter->txdma.dma_map, - BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); + BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); - E1000_WRITE_REG(&adapter->hw, E1000_TDT(0), l); + /* (re)start the tx unit up to slot nic_i (excluded) */ + E1000_WRITE_REG(&adapter->hw, E1000_TDT(0), nic_i); } - if (n == 0 || kring->nr_hwavail < 1) { - int delta; - + /* + * Second part: reclaim buffers for completed transmissions. + */ + if (ticks != kring->last_reclaim || flags & NAF_FORCE_RECLAIM || nm_kr_txempty(kring)) { + kring->last_reclaim = ticks; /* record completed transmissions using TDH */ - l = E1000_READ_REG(&adapter->hw, E1000_TDH(0)); - ND("tdh is now %d", l); - if (l >= kring->nkr_num_slots) { /* XXX can it happen ? */ - D("bad TDH %d", l); - l -= kring->nkr_num_slots; + nic_i = E1000_READ_REG(&adapter->hw, E1000_TDH(0)); + if (nic_i >= kring->nkr_num_slots) { /* XXX can it happen ? */ + D("TDH wrap %d", nic_i); + nic_i -= kring->nkr_num_slots; } - delta = l - adapter->next_tx_to_clean; - if (delta) { - /* some tx completed, increment hwavail. */ - if (delta < 0) - delta += kring->nkr_num_slots; - if (netmap_verbose > 255) - RD(5, "%s tx recover %d bufs", - ifp->if_xname, delta); - adapter->next_tx_to_clean = l; - kring->nr_hwavail += delta; - } + adapter->next_tx_to_clean = nic_i; + kring->nr_hwtail = nm_prev(netmap_idx_n2k(kring, nic_i), lim); } - /* update avail to what the kernel knows */ - ring->avail = kring->nr_hwavail; + nm_txsync_finalize(kring); + return 0; } /* * Reconcile kernel and user view of the receive ring. */ static int -lem_netmap_rxsync(struct ifnet *ifp, u_int ring_nr, int flags) +lem_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) { - struct adapter *adapter = ifp->if_softc; - struct netmap_adapter *na = NA(ifp); + struct ifnet *ifp = na->ifp; struct netmap_kring *kring = &na->rx_rings[ring_nr]; struct netmap_ring *ring = kring->ring; - int j, l, n, lim = kring->nkr_num_slots - 1; + u_int nm_i; /* index into the netmap ring */ + u_int nic_i; /* index into the NIC ring */ + u_int n; + u_int const lim = kring->nkr_num_slots - 1; + u_int const head = nm_rxsync_prologue(kring); int force_update = (flags & NAF_FORCE_READ) || kring->nr_kflags & NKR_PENDINTR; - u_int k = ring->cur, resvd = ring->reserved; - if (k > lim) + /* device-specific */ + struct adapter *adapter = ifp->if_softc; + + if (head > lim) return netmap_ring_reinit(kring); - /* XXX check sync modes */ bus_dmamap_sync(adapter->rxdma.dma_tag, adapter->rxdma.dma_map, BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); /* - * Import newly received packets into the netmap ring. - * j is an index in the netmap ring, l in the NIC ring. + * First part: import newly received packets. 
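+ * Packets imported here show up to userspace as the slots between
+ * ring->head and ring->tail. The matching receive loop, sketched
+ * against the public netmap_user.h API with a bound descriptor 'fd'
+ * and a hypothetical consume() callback:
+ *
+ *	struct netmap_ring *rxring = NETMAP_RXRING(nifp, 0);
+ *	ioctl(fd, NIOCRXSYNC, NULL);	/* or poll() with POLLIN */
+ *	while (!nm_ring_empty(rxring)) {
+ *		uint32_t i = rxring->cur;
+ *		consume(NETMAP_BUF(rxring, rxring->slot[i].buf_idx),
+ *		    rxring->slot[i].len);
+ *		rxring->head = rxring->cur = nm_ring_next(rxring, i);
+ *	}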
*/ - l = adapter->next_rx_desc_to_check; - j = netmap_idx_n2k(kring, l); - ND("%s: next NIC %d kring %d (ofs %d), hwcur %d hwavail %d cur %d avail %d", - ifp->if_xname, - l, j, kring->nkr_hwofs, kring->nr_hwcur, kring->nr_hwavail, - ring->cur, ring->avail); if (netmap_no_pendintr || force_update) { uint16_t slot_flags = kring->nkr_slot_flags; + nic_i = adapter->next_rx_desc_to_check; + nm_i = netmap_idx_n2k(kring, nic_i); + for (n = 0; ; n++) { - struct e1000_rx_desc *curr = &adapter->rx_desc_base[l]; + struct e1000_rx_desc *curr = &adapter->rx_desc_base[nic_i]; uint32_t staterr = le32toh(curr->status); int len; if ((staterr & E1000_RXD_STAT_DD) == 0) break; len = le16toh(curr->length) - 4; // CRC if (len < 0) { - D("bogus pkt size at %d", j); + D("bogus pkt size %d nic idx %d", len, nic_i); len = 0; } - ND("\n%s", nm_dump_buf(NMB(&ring->slot[j]), - len, 128, NULL)); - ring->slot[j].len = len; - ring->slot[j].flags = slot_flags; + ring->slot[nm_i].len = len; + ring->slot[nm_i].flags = slot_flags; bus_dmamap_sync(adapter->rxtag, - adapter->rx_buffer_area[l].map, - BUS_DMASYNC_POSTREAD); - j = (j == lim) ? 0 : j + 1; - l = (l == lim) ? 0 : l + 1; + adapter->rx_buffer_area[nic_i].map, + BUS_DMASYNC_POSTREAD); + nm_i = nm_next(nm_i, lim); + nic_i = nm_next(nic_i, lim); } if (n) { /* update the state variables */ - adapter->next_rx_desc_to_check = l; - kring->nr_hwavail += n; + ND("%d new packets at nic %d nm %d tail %d", + n, + adapter->next_rx_desc_to_check, + netmap_idx_n2k(kring, adapter->next_rx_desc_to_check), + kring->nr_hwtail); + adapter->next_rx_desc_to_check = nic_i; + // ifp->if_ipackets += n; + kring->nr_hwtail = nm_i; } kring->nr_kflags &= ~NKR_PENDINTR; } - /* skip past packets that userspace has released */ - j = kring->nr_hwcur; /* netmap ring index */ - if (resvd > 0) { - if (resvd + ring->avail >= lim + 1) { - D("XXX invalid reserve/avail %d %d", resvd, ring->avail); - ring->reserved = resvd = 0; // XXX panic... - } - k = (k >= resvd) ? k - resvd : k + lim + 1 - resvd; - } - if (j != k) { /* userspace has released some packets. */ - l = netmap_idx_k2n(kring, j); /* NIC ring index */ - for (n = 0; j != k; n++) { - struct netmap_slot *slot = &ring->slot[j]; - struct e1000_rx_desc *curr = &adapter->rx_desc_base[l]; - struct em_buffer *rxbuf = &adapter->rx_buffer_area[l]; + /* + * Second part: skip past packets that userspace has released. + */ + nm_i = kring->nr_hwcur; + if (nm_i != head) { + nic_i = netmap_idx_k2n(kring, nm_i); + for (n = 0; nm_i != head; n++) { + struct netmap_slot *slot = &ring->slot[nm_i]; uint64_t paddr; void *addr = PNMB(slot, &paddr); - if (addr == netmap_buffer_base) { /* bad buf */ - return netmap_ring_reinit(kring); - } + struct e1000_rx_desc *curr = &adapter->rx_desc_base[nic_i]; + struct em_buffer *rxbuf = &adapter->rx_buffer_area[nic_i]; + if (addr == netmap_buffer_base) /* bad buf */ + goto ring_reset; + if (slot->flags & NS_BUF_CHANGED) { /* buffer has changed, reload map */ - netmap_reload_map(adapter->rxtag, rxbuf->map, addr); curr->buffer_addr = htole64(paddr); + netmap_reload_map(adapter->rxtag, rxbuf->map, addr); slot->flags &= ~NS_BUF_CHANGED; } curr->status = 0; - bus_dmamap_sync(adapter->rxtag, rxbuf->map, BUS_DMASYNC_PREREAD); - - j = (j == lim) ? 0 : j + 1; - l = (l == lim) ? 
0 : l + 1; + nm_i = nm_next(nm_i, lim); + nic_i = nm_next(nic_i, lim); } - kring->nr_hwavail -= n; - kring->nr_hwcur = k; + kring->nr_hwcur = head; bus_dmamap_sync(adapter->rxdma.dma_tag, adapter->rxdma.dma_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); /* * IMPORTANT: we must leave one free slot in the ring, - * so move l back by one unit + * so move nic_i back by one unit */ - l = (l == 0) ? lim : l - 1; - E1000_WRITE_REG(&adapter->hw, E1000_RDT(0), l); + nic_i = nm_prev(nic_i, lim); + E1000_WRITE_REG(&adapter->hw, E1000_RDT(0), nic_i); } - /* tell userspace that there are new packets */ - ring->avail = kring->nr_hwavail - resvd; + + /* tell userspace that there might be new packets */ + nm_rxsync_finalize(kring); + return 0; + +ring_reset: + return netmap_ring_reinit(kring); } static void lem_netmap_attach(struct adapter *adapter) { struct netmap_adapter na; bzero(&na, sizeof(na)); na.ifp = adapter->ifp; na.na_flags = NAF_BDG_MAYSLEEP; na.num_tx_desc = adapter->num_tx_desc; na.num_rx_desc = adapter->num_rx_desc; na.nm_txsync = lem_netmap_txsync; na.nm_rxsync = lem_netmap_rxsync; na.nm_register = lem_netmap_reg; - netmap_attach(&na, 1); + na.num_tx_rings = na.num_rx_rings = 1; + netmap_attach(&na); } /* end of file */ Index: stable/9/sys/dev/netmap/if_re_netmap.h =================================================================== --- stable/9/sys/dev/netmap/if_re_netmap.h (revision 262152) +++ stable/9/sys/dev/netmap/if_re_netmap.h (revision 262153) @@ -1,382 +1,383 @@ /* - * Copyright (C) 2011 Luigi Rizzo. All rights reserved. + * Copyright (C) 2011-2014 Luigi Rizzo. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * $FreeBSD$ * - * netmap support for "re" - * For details on netmap support please see ixgbe_netmap.h + * netmap support for: re + * + * For more details on netmap support please see ixgbe_netmap.h */ #include <net/netmap.h> #include <sys/selinfo.h> #include <vm/vm.h> #include <vm/pmap.h> /* vtophys ? */ #include <dev/netmap/netmap_kern.h> /* - * support for netmap register/unregisted. We are already under core lock. - * only called on the first register or the last unregister. + * Register/unregister. We are already under netmap lock.
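+ * nm_set_native_flags() and nm_clear_native_flags() centralize what
+ * each driver's reg routine used to do by hand, as the deleted lines
+ * below show: save ifp->if_transmit and point it to netmap_transmit(),
+ * and set or clear IFCAP_NETMAP in ifp->if_capenable.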
*/ static int -re_netmap_reg(struct ifnet *ifp, int onoff) +re_netmap_reg(struct netmap_adapter *na, int onoff) { + struct ifnet *ifp = na->ifp; struct rl_softc *adapter = ifp->if_softc; - struct netmap_adapter *na = NA(ifp); - int error = 0; - if (na == NULL) - return EINVAL; - /* Tell the stack that the interface is no longer active */ - ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE); - - re_stop(adapter); - + RL_LOCK(adapter); + re_stop(adapter); /* also clears IFF_DRV_RUNNING */ if (onoff) { - ifp->if_capenable |= IFCAP_NETMAP; - - /* save if_transmit to restore it later */ - na->if_transmit = ifp->if_transmit; - ifp->if_transmit = netmap_transmit; - - re_init_locked(adapter); - - if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) == 0) { - error = ENOMEM; - goto fail; - } + nm_set_native_flags(na); } else { -fail: - /* restore if_transmit */ - ifp->if_transmit = na->if_transmit; - ifp->if_capenable &= ~IFCAP_NETMAP; - re_init_locked(adapter); /* also enables intr */ + nm_clear_native_flags(na); } - return (error); + re_init_locked(adapter); /* also enables intr */ + RL_UNLOCK(adapter); + return (ifp->if_drv_flags & IFF_DRV_RUNNING ? 0 : 1); } /* * Reconcile kernel and user view of the transmit ring. */ static int -re_netmap_txsync(struct ifnet *ifp, u_int ring_nr, int flags) +re_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) { - struct rl_softc *sc = ifp->if_softc; - struct rl_txdesc *txd = sc->rl_ldata.rl_tx_desc; - struct netmap_adapter *na = NA(sc->rl_ifp); + struct ifnet *ifp = na->ifp; struct netmap_kring *kring = &na->tx_rings[ring_nr]; struct netmap_ring *ring = kring->ring; - int j, k, l, n, lim = kring->nkr_num_slots - 1; + u_int nm_i; /* index into the netmap ring */ + u_int nic_i; /* index into the NIC ring */ + u_int n; + u_int const lim = kring->nkr_num_slots - 1; + u_int const head = kring->rhead; - k = ring->cur; - if (k > lim) - return netmap_ring_reinit(kring); + /* device-specific */ + struct rl_softc *sc = ifp->if_softc; + struct rl_txdesc *txd = sc->rl_ldata.rl_tx_desc; - /* Sync the TX descriptor list */ bus_dmamap_sync(sc->rl_ldata.rl_tx_list_tag, - sc->rl_ldata.rl_tx_list_map, - BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); + sc->rl_ldata.rl_tx_list_map, + BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); // XXX extra postwrite ? - /* XXX move after the transmissions */ - /* record completed transmissions */ - for (n = 0, l = sc->rl_ldata.rl_tx_considx; - l != sc->rl_ldata.rl_tx_prodidx; - n++, l = RL_TX_DESC_NXT(sc, l)) { - uint32_t cmdstat = - le32toh(sc->rl_ldata.rl_tx_list[l].rl_cmdstat); - if (cmdstat & RL_TDESC_STAT_OWN) - break; - } - if (n > 0) { - sc->rl_ldata.rl_tx_considx = l; - sc->rl_ldata.rl_tx_free += n; - kring->nr_hwavail += n; - } + /* + * First part: process new packets to send. 
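+ * The nm_i/nic_i translation used throughout these drivers is a
+ * plain offset modulo the ring size; netmap_kern.h implements it
+ * roughly as (sketch, exact helper style may differ):
+ *
+ *	static inline int
+ *	netmap_idx_n2k(struct netmap_kring *kr, int idx)
+ *	{
+ *		int n = kr->nkr_num_slots;
+ *		idx += kr->nkr_hwofs;
+ *		return idx < 0 ? idx + n : idx < n ? idx : idx - n;
+ *	}
+ *
+ * netmap_idx_k2n() is the same with nkr_hwofs subtracted instead
+ * of added.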
+ */ + nm_i = kring->nr_hwcur; + if (nm_i != head) { /* we have new packets to send */ + nic_i = sc->rl_ldata.rl_tx_prodidx; + // XXX or netmap_idx_k2n(kring, nm_i); - /* update avail to what the kernel knows */ - ring->avail = kring->nr_hwavail; + for (n = 0; nm_i != head; n++) { + struct netmap_slot *slot = &ring->slot[nm_i]; + u_int len = slot->len; + uint64_t paddr; + void *addr = PNMB(slot, &paddr); - j = kring->nr_hwcur; - if (j != k) { /* we have new packets to send */ - l = sc->rl_ldata.rl_tx_prodidx; - for (n = 0; j != k; n++) { - struct netmap_slot *slot = &ring->slot[j]; - struct rl_desc *desc = &sc->rl_ldata.rl_tx_list[l]; + /* device-specific */ + struct rl_desc *desc = &sc->rl_ldata.rl_tx_list[nic_i]; int cmd = slot->len | RL_TDESC_CMD_EOF | RL_TDESC_CMD_OWN | RL_TDESC_CMD_SOF ; - uint64_t paddr; - void *addr = PNMB(slot, &paddr); - int len = slot->len; - if (addr == netmap_buffer_base || len > NETMAP_BUF_SIZE) { - // XXX what about prodidx ? - return netmap_ring_reinit(kring); - } + NM_CHECK_ADDR_LEN(addr, len); - if (l == lim) /* mark end of ring */ + if (nic_i == lim) /* mark end of ring */ cmd |= RL_TDESC_CMD_EOR; if (slot->flags & NS_BUF_CHANGED) { + /* buffer has changed, reload map */ desc->rl_bufaddr_lo = htole32(RL_ADDR_LO(paddr)); desc->rl_bufaddr_hi = htole32(RL_ADDR_HI(paddr)); - /* buffer has changed, unload and reload map */ netmap_reload_map(sc->rl_ldata.rl_tx_mtag, - txd[l].tx_dmamap, addr); - slot->flags &= ~NS_BUF_CHANGED; + txd[nic_i].tx_dmamap, addr); } - slot->flags &= ~NS_REPORT; + slot->flags &= ~(NS_REPORT | NS_BUF_CHANGED); + + /* Fill the slot in the NIC ring. */ desc->rl_cmdstat = htole32(cmd); + + /* make sure changes to the buffer are synced */ bus_dmamap_sync(sc->rl_ldata.rl_tx_mtag, - txd[l].tx_dmamap, BUS_DMASYNC_PREWRITE); - j = (j == lim) ? 0 : j + 1; - l = (l == lim) ? 0 : l + 1; + txd[nic_i].tx_dmamap, + BUS_DMASYNC_PREWRITE); + + nm_i = nm_next(nm_i, lim); + nic_i = nm_next(nic_i, lim); } - sc->rl_ldata.rl_tx_prodidx = l; - kring->nr_hwcur = k; /* the saved ring->cur */ - ring->avail -= n; // XXX see others - kring->nr_hwavail = ring->avail; + sc->rl_ldata.rl_tx_prodidx = nic_i; + kring->nr_hwcur = head; + /* synchronize the NIC ring */ bus_dmamap_sync(sc->rl_ldata.rl_tx_list_tag, - sc->rl_ldata.rl_tx_list_map, - BUS_DMASYNC_PREWRITE|BUS_DMASYNC_PREREAD); + sc->rl_ldata.rl_tx_list_map, + BUS_DMASYNC_PREREAD|BUS_DMASYNC_PREWRITE); /* start ? */ CSR_WRITE_1(sc, sc->rl_txstart, RL_TXSTART_START); } + + /* + * Second part: reclaim buffers for completed transmissions. + */ + if (flags & NAF_FORCE_RECLAIM || nm_kr_txempty(kring)) { + nic_i = sc->rl_ldata.rl_tx_considx; + for (n = 0; nic_i != sc->rl_ldata.rl_tx_prodidx; + n++, nic_i = RL_TX_DESC_NXT(sc, nic_i)) { + uint32_t cmdstat = + le32toh(sc->rl_ldata.rl_tx_list[nic_i].rl_cmdstat); + if (cmdstat & RL_TDESC_STAT_OWN) + break; + } + if (n > 0) { + sc->rl_ldata.rl_tx_considx = nic_i; + sc->rl_ldata.rl_tx_free += n; + kring->nr_hwtail = nm_prev(netmap_idx_n2k(kring, nic_i), lim); + } + } + + nm_txsync_finalize(kring); + return 0; } /* * Reconcile kernel and user view of the receive ring. 
*/ static int -re_netmap_rxsync(struct ifnet *ifp, u_int ring_nr, int flags) +re_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) { - struct rl_softc *sc = ifp->if_softc; - struct rl_rxdesc *rxd = sc->rl_ldata.rl_rx_desc; - struct netmap_adapter *na = NA(sc->rl_ifp); + struct ifnet *ifp = na->ifp; struct netmap_kring *kring = &na->rx_rings[ring_nr]; struct netmap_ring *ring = kring->ring; - int j, l, n, lim = kring->nkr_num_slots - 1; + u_int nm_i; /* index into the netmap ring */ + u_int nic_i; /* index into the NIC ring */ + u_int n; + u_int const lim = kring->nkr_num_slots - 1; + u_int const head = nm_rxsync_prologue(kring); int force_update = (flags & NAF_FORCE_READ) || kring->nr_kflags & NKR_PENDINTR; - u_int k = ring->cur, resvd = ring->reserved; - k = ring->cur; - if (k > lim) + /* device-specific */ + struct rl_softc *sc = ifp->if_softc; + struct rl_rxdesc *rxd = sc->rl_ldata.rl_rx_desc; + + if (head > lim) return netmap_ring_reinit(kring); - /* XXX check sync modes */ bus_dmamap_sync(sc->rl_ldata.rl_rx_list_tag, - sc->rl_ldata.rl_rx_list_map, - BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); + sc->rl_ldata.rl_rx_list_map, + BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); /* - * Import newly received packets into the netmap ring. - * j is an index in the netmap ring, l in the NIC ring. + * First part: import newly received packets. * - * The device uses all the buffers in the ring, so we need + * This device uses all the buffers in the ring, so we need * another termination condition in addition to RL_RDESC_STAT_OWN - * cleared (all buffers could have it cleared. The easiest one - * is to limit the amount of data reported up to 'lim' + * cleared (all buffers could have it cleared). The easiest one + * is to stop right before nm_hwcur. */ - l = sc->rl_ldata.rl_rx_prodidx; /* next pkt to check */ - j = netmap_idx_n2k(kring, l); /* the kring index */ if (netmap_no_pendintr || force_update) { uint16_t slot_flags = kring->nkr_slot_flags; + uint32_t stop_i = nm_prev(kring->nr_hwcur, lim); - for (n = kring->nr_hwavail; n < lim ; n++) { - struct rl_desc *cur_rx = &sc->rl_ldata.rl_rx_list[l]; + nic_i = sc->rl_ldata.rl_rx_prodidx; /* next pkt to check */ + nm_i = netmap_idx_n2k(kring, nic_i); + + while (nm_i != stop_i) { + struct rl_desc *cur_rx = &sc->rl_ldata.rl_rx_list[nic_i]; uint32_t rxstat = le32toh(cur_rx->rl_cmdstat); uint32_t total_len; if ((rxstat & RL_RDESC_STAT_OWN) != 0) break; total_len = rxstat & sc->rl_rxlenmask; /* XXX subtract crc */ total_len = (total_len < 4) ? 0 : total_len - 4; - kring->ring->slot[j].len = total_len; - kring->ring->slot[j].flags = slot_flags; + ring->slot[nm_i].len = total_len; + ring->slot[nm_i].flags = slot_flags; /* sync was in re_newbuf() */ bus_dmamap_sync(sc->rl_ldata.rl_rx_mtag, - rxd[l].rx_dmamap, BUS_DMASYNC_POSTREAD); - j = (j == lim) ? 0 : j + 1; - l = (l == lim) ? 0 : l + 1; + rxd[nic_i].rx_dmamap, BUS_DMASYNC_POSTREAD); + // sc->rl_ifp->if_ipackets++; + nm_i = nm_next(nm_i, lim); + nic_i = nm_next(nic_i, lim); } - if (n != kring->nr_hwavail) { - sc->rl_ldata.rl_rx_prodidx = l; - sc->rl_ifp->if_ipackets += n - kring->nr_hwavail; - kring->nr_hwavail = n; - } + sc->rl_ldata.rl_rx_prodidx = nic_i; + kring->nr_hwtail = nm_i; kring->nr_kflags &= ~NKR_PENDINTR; } - /* skip past packets that userspace has released */ - j = kring->nr_hwcur; - if (resvd > 0) { - if (resvd + ring->avail >= lim + 1) { - D("XXX invalid reserve/avail %d %d", resvd, ring->avail); - ring->reserved = resvd = 0; // XXX panic... - } - k = (k >= resvd) ? 
k - resvd : k + lim + 1 - resvd; - } - if (j != k) { /* userspace has released some packets. */ - l = netmap_idx_k2n(kring, j); /* the NIC index */ - for (n = 0; j != k; n++) { - struct netmap_slot *slot = ring->slot + j; - struct rl_desc *desc = &sc->rl_ldata.rl_rx_list[l]; - int cmd = NETMAP_BUF_SIZE | RL_RDESC_CMD_OWN; + /* + * Second part: skip past packets that userspace has released. + */ + nm_i = kring->nr_hwcur; + if (nm_i != head) { + nic_i = netmap_idx_k2n(kring, nm_i); + for (n = 0; nm_i != head; n++) { + struct netmap_slot *slot = &ring->slot[nm_i]; uint64_t paddr; void *addr = PNMB(slot, &paddr); - if (addr == netmap_buffer_base) { /* bad buf */ - return netmap_ring_reinit(kring); - } + struct rl_desc *desc = &sc->rl_ldata.rl_rx_list[nic_i]; + int cmd = NETMAP_BUF_SIZE | RL_RDESC_CMD_OWN; - if (l == lim) /* mark end of ring */ + if (addr == netmap_buffer_base) /* bad buf */ + goto ring_reset; + + if (nic_i == lim) /* mark end of ring */ cmd |= RL_RDESC_CMD_EOR; - slot->flags &= ~NS_REPORT; if (slot->flags & NS_BUF_CHANGED) { - netmap_reload_map(sc->rl_ldata.rl_rx_mtag, - rxd[l].rx_dmamap, addr); + /* buffer has changed, reload map */ desc->rl_bufaddr_lo = htole32(RL_ADDR_LO(paddr)); desc->rl_bufaddr_hi = htole32(RL_ADDR_HI(paddr)); + netmap_reload_map(sc->rl_ldata.rl_rx_mtag, + rxd[nic_i].rx_dmamap, addr); slot->flags &= ~NS_BUF_CHANGED; } desc->rl_cmdstat = htole32(cmd); bus_dmamap_sync(sc->rl_ldata.rl_rx_mtag, - rxd[l].rx_dmamap, BUS_DMASYNC_PREREAD); - j = (j == lim) ? 0 : j + 1; - l = (l == lim) ? 0 : l + 1; + rxd[nic_i].rx_dmamap, + BUS_DMASYNC_PREREAD); + nm_i = nm_next(nm_i, lim); + nic_i = nm_next(nic_i, lim); } - kring->nr_hwavail -= n; - kring->nr_hwcur = k; - /* Flush the RX DMA ring */ + kring->nr_hwcur = head; bus_dmamap_sync(sc->rl_ldata.rl_rx_list_tag, sc->rl_ldata.rl_rx_list_map, - BUS_DMASYNC_PREWRITE|BUS_DMASYNC_PREREAD); + BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); } - /* tell userspace that there are new packets */ - ring->avail = kring->nr_hwavail - resvd; + + /* tell userspace that there might be new packets */ + nm_rxsync_finalize(kring); + return 0; + +ring_reset: + return netmap_ring_reinit(kring); } + /* * Additional routines to init the tx and rx rings. * In other drivers we do that inline in the main code. 
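 * re_netmap_rx_init() below must not hand every buffer back to the
 * NIC: slots still owned by userspace keep their data across the
 * reset. Worked example: with 256 slots and 10 packets not yet
 * released by the application, nm_kr_rxspace() returns 10, so only
 * 256 - 1 - 10 = 245 descriptors get RL_RDESC_CMD_OWN, and one slot
 * stays empty as the ring protocol requires.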
*/ static void re_netmap_tx_init(struct rl_softc *sc) { struct rl_txdesc *txd; struct rl_desc *desc; int i, n; struct netmap_adapter *na = NA(sc->rl_ifp); - struct netmap_slot *slot = netmap_reset(na, NR_TX, 0, 0); + struct netmap_slot *slot; + if (!na || !(na->na_flags & NAF_NATIVE_ON)) { + return; + } + + slot = netmap_reset(na, NR_TX, 0, 0); /* slot is NULL if we are not in netmap mode */ if (!slot) - return; + return; // XXX cannot happen /* in netmap mode, overwrite addresses and maps */ txd = sc->rl_ldata.rl_tx_desc; desc = sc->rl_ldata.rl_tx_list; n = sc->rl_ldata.rl_tx_desc_cnt; /* l points in the netmap ring, i points in the NIC ring */ for (i = 0; i < n; i++) { uint64_t paddr; int l = netmap_idx_n2k(&na->tx_rings[0], i); void *addr = PNMB(slot + l, &paddr); desc[i].rl_bufaddr_lo = htole32(RL_ADDR_LO(paddr)); desc[i].rl_bufaddr_hi = htole32(RL_ADDR_HI(paddr)); netmap_load_map(sc->rl_ldata.rl_tx_mtag, txd[i].tx_dmamap, addr); } } static void re_netmap_rx_init(struct rl_softc *sc) { struct netmap_adapter *na = NA(sc->rl_ifp); struct netmap_slot *slot = netmap_reset(na, NR_RX, 0, 0); struct rl_desc *desc = sc->rl_ldata.rl_rx_list; uint32_t cmdstat; - int i, n, max_avail; + uint32_t nic_i, max_avail; + uint32_t const n = sc->rl_ldata.rl_rx_desc_cnt; if (!slot) return; - n = sc->rl_ldata.rl_rx_desc_cnt; /* - * Userspace owned hwavail packets before the reset, - * so the NIC that last hwavail descriptors of the ring - * are still owned by the driver (and keep one empty). + * Do not release the slots owned by userspace, + * and also keep one empty. */ - max_avail = n - 1 - na->rx_rings[0].nr_hwavail; - for (i = 0; i < n; i++) { + max_avail = n - 1 - nm_kr_rxspace(&na->rx_rings[0]); + for (nic_i = 0; nic_i < n; nic_i++) { void *addr; uint64_t paddr; - int l = netmap_idx_n2k(&na->rx_rings[0], i); + uint32_t nm_i = netmap_idx_n2k(&na->rx_rings[0], nic_i); - addr = PNMB(slot + l, &paddr); + addr = PNMB(slot + nm_i, &paddr); netmap_reload_map(sc->rl_ldata.rl_rx_mtag, - sc->rl_ldata.rl_rx_desc[i].rx_dmamap, addr); + sc->rl_ldata.rl_rx_desc[nic_i].rx_dmamap, addr); bus_dmamap_sync(sc->rl_ldata.rl_rx_mtag, - sc->rl_ldata.rl_rx_desc[i].rx_dmamap, BUS_DMASYNC_PREREAD); - desc[i].rl_bufaddr_lo = htole32(RL_ADDR_LO(paddr)); - desc[i].rl_bufaddr_hi = htole32(RL_ADDR_HI(paddr)); + sc->rl_ldata.rl_rx_desc[nic_i].rx_dmamap, BUS_DMASYNC_PREREAD); + desc[nic_i].rl_bufaddr_lo = htole32(RL_ADDR_LO(paddr)); + desc[nic_i].rl_bufaddr_hi = htole32(RL_ADDR_HI(paddr)); cmdstat = NETMAP_BUF_SIZE; - if (i == n - 1) /* mark the end of ring */ + if (nic_i == n - 1) /* mark the end of ring */ cmdstat |= RL_RDESC_CMD_EOR; - if (i < max_avail) + if (nic_i < max_avail) cmdstat |= RL_RDESC_CMD_OWN; - desc[i].rl_cmdstat = htole32(cmdstat); + desc[nic_i].rl_cmdstat = htole32(cmdstat); } } static void re_netmap_attach(struct rl_softc *sc) { struct netmap_adapter na; bzero(&na, sizeof(na)); na.ifp = sc->rl_ifp; na.na_flags = NAF_BDG_MAYSLEEP; na.num_tx_desc = sc->rl_ldata.rl_tx_desc_cnt; na.num_rx_desc = sc->rl_ldata.rl_rx_desc_cnt; na.nm_txsync = re_netmap_txsync; na.nm_rxsync = re_netmap_rxsync; na.nm_register = re_netmap_reg; - netmap_attach(&na, 1); + na.num_tx_rings = na.num_rx_rings = 1; + netmap_attach(&na); } + /* end of file */ Index: stable/9/sys/dev/netmap/ixgbe_netmap.h =================================================================== --- stable/9/sys/dev/netmap/ixgbe_netmap.h (revision 262152) +++ stable/9/sys/dev/netmap/ixgbe_netmap.h (revision 262153) @@ -1,568 +1,495 @@ /* - * Copyright (C) 2011 Matteo 
Landi, Luigi Rizzo. All rights reserved. + * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * $FreeBSD$ * - * netmap modifications for ixgbe + * netmap support for: ixgbe * * This file is meant to be a reference on how to implement * netmap support for a network driver. - * This file contains code but only static or inline functions - * that are used by a single driver. To avoid replication of - * code we just #include it near the beginning of the - * standard driver. + * This file contains code but only static or inline functions used + * by a single driver. To avoid replication of code we just #include + * it near the beginning of the standard driver. */ + #include <net/netmap.h> #include <sys/selinfo.h> /* * Some drivers may need the following headers. Others * already include them by default #include <vm/vm.h> #include <vm/pmap.h> */ #include <dev/netmap/netmap_kern.h> + /* + * device-specific sysctl variables: + * * ix_crcstrip: 0: keep CRC in rx frames (default), 1: strip it. * During regular operations the CRC is stripped, but on some * hardware reception of frames not multiple of 64 is slower, * so using crcstrip=0 helps in benchmarks. * * ix_rx_miss, ix_rx_miss_bufs: * count packets that might be missed due to lost interrupts. - * - * ix_use_dd - * use the dd bit for completed tx transmissions. - * This is tricky, much better to use TDH for now. */ SYSCTL_DECL(_dev_netmap); -static int ix_rx_miss, ix_rx_miss_bufs, ix_use_dd, ix_crcstrip; +static int ix_rx_miss, ix_rx_miss_bufs, ix_crcstrip; SYSCTL_INT(_dev_netmap, OID_AUTO, ix_crcstrip, CTLFLAG_RW, &ix_crcstrip, 0, "strip CRC on rx frames"); -SYSCTL_INT(_dev_netmap, OID_AUTO, ix_use_dd, - CTLFLAG_RW, &ix_use_dd, 0, "use dd instead of tdh to detect tx frames"); SYSCTL_INT(_dev_netmap, OID_AUTO, ix_rx_miss, CTLFLAG_RW, &ix_rx_miss, 0, "potentially missed rx intr"); SYSCTL_INT(_dev_netmap, OID_AUTO, ix_rx_miss_bufs, CTLFLAG_RW, &ix_rx_miss_bufs, 0, "potentially missed rx intr bufs"); static void set_crcstrip(struct ixgbe_hw *hw, int onoff) { /* crc stripping is set in two places: * IXGBE_HLREG0 (modified on init_locked and hw reset) * IXGBE_RDRXCTL (set by the original driver in * ixgbe_setup_hw_rsc() called in init_locked. * We disable the setting when netmap is compiled in).
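 * The knob lives under dev.netmap; e.g. a benchmark that wants the
 * CRC left in place would run "sysctl dev.netmap.ix_crcstrip=0"
 * before switching the NIC to netmap mode.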
* We update the values here, but also in ixgbe.c because * init_locked sometimes is called outside our control. */ uint32_t hl, rxc; hl = IXGBE_READ_REG(hw, IXGBE_HLREG0); rxc = IXGBE_READ_REG(hw, IXGBE_RDRXCTL); if (netmap_verbose) D("%s read HLREG 0x%x rxc 0x%x", onoff ? "enter" : "exit", hl, rxc); /* hw requirements ... */ rxc &= ~IXGBE_RDRXCTL_RSCFRSTSIZE; rxc |= IXGBE_RDRXCTL_RSCACKC; if (onoff && !ix_crcstrip) { /* keep the crc. Fast rx */ hl &= ~IXGBE_HLREG0_RXCRCSTRP; rxc &= ~IXGBE_RDRXCTL_CRCSTRIP; } else { /* reset default mode */ hl |= IXGBE_HLREG0_RXCRCSTRP; rxc |= IXGBE_RDRXCTL_CRCSTRIP; } if (netmap_verbose) D("%s write HLREG 0x%x rxc 0x%x", onoff ? "enter" : "exit", hl, rxc); IXGBE_WRITE_REG(hw, IXGBE_HLREG0, hl); IXGBE_WRITE_REG(hw, IXGBE_RDRXCTL, rxc); } + /* - * Register/unregister. We are already under core lock. + * Register/unregister. We are already under netmap lock. * Only called on the first register or the last unregister. */ static int -ixgbe_netmap_reg(struct ifnet *ifp, int onoff) +ixgbe_netmap_reg(struct netmap_adapter *na, int onoff) { + struct ifnet *ifp = na->ifp; struct adapter *adapter = ifp->if_softc; - struct netmap_adapter *na = NA(ifp); - int error = 0; - if (na == NULL) - return EINVAL; /* no netmap support here */ - IXGBE_CORE_LOCK(adapter); - ixgbe_disable_intr(adapter); + ixgbe_disable_intr(adapter); // XXX maybe ixgbe_stop ? /* Tell the stack that the interface is no longer active */ ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE); set_crcstrip(&adapter->hw, onoff); - if (onoff) { /* enable netmap mode */ - ifp->if_capenable |= IFCAP_NETMAP; - - /* save if_transmit and replace with our routine */ - na->if_transmit = ifp->if_transmit; - ifp->if_transmit = netmap_transmit; - - /* - * reinitialize the adapter, now with netmap flag set, - * so the rings will be set accordingly. - */ - ixgbe_init_locked(adapter); - if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) == 0) { - error = ENOMEM; - goto fail; - } - } else { /* reset normal mode (explicit request or netmap failed) */ -fail: - /* restore if_transmit */ - ifp->if_transmit = na->if_transmit; - ifp->if_capenable &= ~IFCAP_NETMAP; - /* initialize the card, this time in standard mode */ - ixgbe_init_locked(adapter); /* also enables intr */ + /* enable or disable flags and callbacks in na and ifp */ + if (onoff) { + nm_set_native_flags(na); + } else { + nm_clear_native_flags(na); } - set_crcstrip(&adapter->hw, onoff); + ixgbe_init_locked(adapter); /* also enables intr */ + set_crcstrip(&adapter->hw, onoff); // XXX why twice ? IXGBE_CORE_UNLOCK(adapter); - return (error); + return (ifp->if_drv_flags & IFF_DRV_RUNNING ? 0 : 1); } /* * Reconcile kernel and user view of the transmit ring. - * This routine might be called frequently so it must be efficient. * - * ring->cur holds the userspace view of the current ring index. Userspace - * has filled the tx slots from the previous call's ring->cur up to but not - * including ring->cur for this call. In this function the kernel updates - * kring->nr_hwcur to ring->cur, thus slots [kring->nr_hwcur, ring->cur) are - * now ready to transmit. At the last interrupt kring->nr_hwavail slots were - * available. + * All information is in the kring. + * Userspace wants to send packets up to the one before kring->rhead, + * kernel knows kring->nr_hwcur is the first unsent packet. * - * This function runs under lock (acquired from the caller or internally). 
- * It must first update ring->avail to what the kernel knows, - * subtract the newly used slots (ring->cur - kring->nr_hwcur) - * from both avail and nr_hwavail, and set ring->nr_hwcur = ring->cur - * issuing a dmamap_sync on all slots. + * Here we push packets out (as many as possible), and possibly + * reclaim buffers from previously completed transmission. * - * Since ring comes from userspace, its content must be read only once, - * and validated before being used to update the kernel's structures. - * (this is also true for every use of ring in the kernel). - * - * ring->avail is never used, only checked for bogus values. - * - * I flags & FORCE_RECLAIM, reclaim transmitted - * buffers irrespective of interrupt mitigation. + * The caller (netmap) guarantees that there is only one instance + * running at any time. Any interference with other driver + * methods should be handled by the individual drivers. */ static int -ixgbe_netmap_txsync(struct ifnet *ifp, u_int ring_nr, int flags) +ixgbe_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) { - struct adapter *adapter = ifp->if_softc; - struct tx_ring *txr = &adapter->tx_rings[ring_nr]; - struct netmap_adapter *na = NA(adapter->ifp); + struct ifnet *ifp = na->ifp; struct netmap_kring *kring = &na->tx_rings[ring_nr]; struct netmap_ring *ring = kring->ring; - u_int j, l, n = 0; - u_int const k = ring->cur, lim = kring->nkr_num_slots - 1; - + u_int nm_i; /* index into the netmap ring */ + u_int nic_i; /* index into the NIC ring */ + u_int n; + u_int const lim = kring->nkr_num_slots - 1; + u_int const head = kring->rhead; /* - * ixgbe can generate an interrupt on every tx packet, but it - * seems very expensive, so we interrupt once every half ring, - * or when requested with NS_REPORT + * interrupts on every tx packet are expensive so request + * them every half ring, or where NS_REPORT is set */ u_int report_frequency = kring->nkr_num_slots >> 1; - if (k > lim) - return netmap_ring_reinit(kring); + /* device-specific */ + struct adapter *adapter = ifp->if_softc; + struct tx_ring *txr = &adapter->tx_rings[ring_nr]; + int reclaim_tx; bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map, BUS_DMASYNC_POSTREAD); /* - * Process new packets to send. j is the current index in the - * netmap ring, l is the corresponding index in the NIC ring. + * First part: process new packets to send. + * nm_i is the current index in the netmap ring, + * nic_i is the corresponding index in the NIC ring. * The two numbers differ because upon a *_init() we reset * the NIC ring but leave the netmap ring unchanged. * For the transmit ring, we have * - * j = kring->nr_hwcur - * l = IXGBE_TDT (not tracked in the driver) + * nm_i = kring->nr_hwcur + * nic_i = IXGBE_TDT (not tracked in the driver) * and - * j == (l + kring->nkr_hwofs) % ring_size + * nm_i == (nic_i + kring->nkr_hwofs) % ring_size * * In this driver kring->nkr_hwofs >= 0, but for other * drivers it might be negative as well. */ - j = kring->nr_hwcur; - if (j != k) { /* we have new packets to send */ - prefetch(&ring->slot[j]); - l = netmap_idx_k2n(kring, j); /* NIC index */ - prefetch(&txr->tx_buffers[l]); - for (n = 0; j != k; n++) { - /* - * Collect per-slot info. - * Note that txbuf and curr are indexed by l. - * - * In this driver we collect the buffer address - * (using the PNMB() macro) because we always - * need to rewrite it into the NIC ring. - * Many other drivers preserve the address, so - * we only need to access it if NS_BUF_CHANGED - * is set. 
- * XXX note, on this device the dmamap* calls are - * not necessary because tag is 0, however just accessing - * the per-packet tag kills 1Mpps at 900 MHz. - */ - struct netmap_slot *slot = &ring->slot[j]; - union ixgbe_adv_tx_desc *curr = &txr->tx_base[l]; - struct ixgbe_tx_buf *txbuf = &txr->tx_buffers[l]; - uint64_t paddr; - // XXX type for flags and len ? - int flags = ((slot->flags & NS_REPORT) || - j == 0 || j == report_frequency) ? - IXGBE_TXD_CMD_RS : 0; + + /* + * If we have packets to send (kring->nr_hwcur != kring->rhead) + * iterate over the netmap ring, fetch length and update + * the corresponding slot in the NIC ring. Some drivers also + * need to update the buffer's physical address in the NIC slot + * even if NS_BUF_CHANGED is not set (PNMB computes the addresses). + * + * The netmap_reload_map() call is especially expensive, + * even when (as in this case) the tag is 0, so do it only + * when the buffer has actually changed. + * + * If possible do not set the report/intr bit on all slots, + * but only a few times per ring or when NS_REPORT is set. + * + * Finally, on 10G and faster drivers, it might be useful + * to prefetch the next slot and txr entry. + */ + + nm_i = kring->nr_hwcur; + if (nm_i != head) { /* we have new packets to send */ + nic_i = netmap_idx_k2n(kring, nm_i); + + __builtin_prefetch(&ring->slot[nm_i]); + __builtin_prefetch(&txr->tx_buffers[nic_i]); + + for (n = 0; nm_i != head; n++) { + struct netmap_slot *slot = &ring->slot[nm_i]; u_int len = slot->len; + uint64_t paddr; void *addr = PNMB(slot, &paddr); - j = (j == lim) ? 0 : j + 1; - l = (l == lim) ? 0 : l + 1; - prefetch(&ring->slot[j]); - prefetch(&txr->tx_buffers[l]); + /* device-specific */ + union ixgbe_adv_tx_desc *curr = &txr->tx_base[nic_i]; + struct ixgbe_tx_buf *txbuf = &txr->tx_buffers[nic_i]; + int flags = (slot->flags & NS_REPORT || + nic_i == 0 || nic_i == report_frequency) ? + IXGBE_TXD_CMD_RS : 0; - /* - * Quick check for valid addr and len. - * NMB() returns netmap_buffer_base for invalid - * buffer indexes (but the address is still a - * valid one to be used in a ring). slot->len is - * unsigned so no need to check for negative values. - */ - if (addr == netmap_buffer_base || len > NETMAP_BUF_SIZE) { -ring_reset: - return netmap_ring_reinit(kring); - } + /* prefetch for next round */ + __builtin_prefetch(&ring->slot[nm_i + 1]); + __builtin_prefetch(&txr->tx_buffers[nic_i + 1]); + NM_CHECK_ADDR_LEN(addr, len); + if (slot->flags & NS_BUF_CHANGED) { - /* buffer has changed, unload and reload map */ + /* buffer has changed, reload map */ netmap_reload_map(txr->txtag, txbuf->map, addr); - slot->flags &= ~NS_BUF_CHANGED; } - slot->flags &= ~NS_REPORT; - /* - * Fill the slot in the NIC ring. - * In this driver we need to rewrite the buffer - * address in the NIC ring. Other drivers do not - * need this. - * Use legacy descriptor, it is faster. - */ + slot->flags &= ~(NS_REPORT | NS_BUF_CHANGED); + + /* Fill the slot in the NIC ring. */ + /* Use legacy descriptors, they are faster?
*/ curr->read.buffer_addr = htole64(paddr); curr->read.olinfo_status = 0; curr->read.cmd_type_len = htole32(len | flags | IXGBE_ADVTXD_DCMD_IFCS | IXGBE_TXD_CMD_EOP); /* make sure changes to the buffer are synced */ - bus_dmamap_sync(txr->txtag, txbuf->map, BUS_DMASYNC_PREWRITE); + bus_dmamap_sync(txr->txtag, txbuf->map, + BUS_DMASYNC_PREWRITE); + + nm_i = nm_next(nm_i, lim); + nic_i = nm_next(nic_i, lim); } - kring->nr_hwcur = k; /* the saved ring->cur */ - /* decrease avail by number of packets sent */ - kring->nr_hwavail -= n; + kring->nr_hwcur = head; /* synchronize the NIC ring */ bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); - /* (re)start the transmitter up to slot l (excluded) */ - IXGBE_WRITE_REG(&adapter->hw, IXGBE_TDT(txr->me), l); + + /* (re)start the tx unit up to slot nic_i (excluded) */ + IXGBE_WRITE_REG(&adapter->hw, IXGBE_TDT(txr->me), nic_i); } /* - * Reclaim buffers for completed transmissions. + * Second part: reclaim buffers for completed transmissions. * Because this is expensive (we read a NIC register etc.) * we only do it in specific cases (see below). - * In all cases kring->nr_kflags indicates which slot will be - * checked upon a tx interrupt (nkr_num_slots means none). */ if (flags & NAF_FORCE_RECLAIM) { - j = 1; /* forced reclaim, ignore interrupts */ - kring->nr_kflags = kring->nkr_num_slots; - } else if (kring->nr_hwavail > 0) { - j = 0; /* buffers still available: no reclaim, ignore intr. */ - kring->nr_kflags = kring->nkr_num_slots; + reclaim_tx = 1; /* forced reclaim */ + } else if (!nm_kr_txempty(kring)) { + reclaim_tx = 0; /* have buffers, no reclaim */ } else { /* - * no buffers available, locate a slot for which we request - * ReportStatus (approximately half ring after next_to_clean) - * and record it in kring->nr_kflags. - * If the slot has DD set, do the reclaim looking at TDH, - * otherwise we go to sleep (in netmap_poll()) and will be - * woken up when slot nr_kflags will be ready. + * No buffers available. Locate previous slot with + * REPORT_STATUS set. + * If the slot has DD set, we can reclaim space, + * otherwise wait for the next interrupt. + * This enables interrupt moderation on the tx + * side though it might reduce throughput. */ struct ixgbe_legacy_tx_desc *txd = (struct ixgbe_legacy_tx_desc *)txr->tx_base; - j = txr->next_to_clean + kring->nkr_num_slots/2; - if (j >= kring->nkr_num_slots) - j -= kring->nkr_num_slots; + nic_i = txr->next_to_clean + report_frequency; + if (nic_i > lim) + nic_i -= lim + 1; // round to the closest with dd set - j= (j < kring->nkr_num_slots / 4 || j >= kring->nkr_num_slots*3/4) ? + nic_i = (nic_i < kring->nkr_num_slots / 4 || + nic_i >= kring->nkr_num_slots*3/4) ? 0 : report_frequency; - kring->nr_kflags = j; /* the slot to check */ - j = txd[j].upper.fields.status & IXGBE_TXD_STAT_DD; // XXX cpu_to_le32 ? + reclaim_tx = txd[nic_i].upper.fields.status & IXGBE_TXD_STAT_DD; // XXX cpu_to_le32 ? } - if (j) { - int delta; - + if (reclaim_tx) { /* * Record completed transmissions. * We (re)use the driver's txr->next_to_clean to keep * track of the most recently completed transmission. * - * The datasheet discourages the use of TDH to find out the - * number of sent packets. We should rather check the DD - * status bit in a packet descriptor. However, we only set - * the "report status" bit for some descriptors (a kind of - * interrupt mitigation), so we can only check on those. 
- For the time being we use TDH, as we do it infrequently - * enough not to pose performance problems. + * The datasheet discourages the use of TDH to find + * out the number of sent packets, but we only set + * REPORT_STATUS in a few slots so TDH is the only + * good way. */ - if (ix_use_dd) { - struct ixgbe_legacy_tx_desc *txd = - (struct ixgbe_legacy_tx_desc *)txr->tx_base; - u_int k1 = netmap_idx_k2n(kring, kring->nr_hwcur); - l = txr->next_to_clean; - delta = 0; - while (l != k1 && - txd[l].upper.fields.status & IXGBE_TXD_STAT_DD) { - delta++; - l = (l == lim) ? 0 : l + 1; + nic_i = IXGBE_READ_REG(&adapter->hw, IXGBE_TDH(ring_nr)); + if (nic_i >= kring->nkr_num_slots) { /* XXX can it happen ? */ + D("TDH wrap %d", nic_i); + nic_i -= kring->nkr_num_slots; } - } else { - l = IXGBE_READ_REG(&adapter->hw, IXGBE_TDH(ring_nr)); - if (l >= kring->nkr_num_slots) { /* XXX can happen */ - D("TDH wrap %d", l); - l -= kring->nkr_num_slots; - } - delta = l - txr->next_to_clean; - } - if (delta) { + if (nic_i != txr->next_to_clean) { /* some tx completed, increment avail */ - if (delta < 0) - delta += kring->nkr_num_slots; - txr->next_to_clean = l; - kring->nr_hwavail += delta; - if (kring->nr_hwavail > lim) - goto ring_reset; + txr->next_to_clean = nic_i; + kring->nr_hwtail = nm_prev(netmap_idx_n2k(kring, nic_i), lim); } } - /* update avail to what the kernel knows */ - ring->avail = kring->nr_hwavail; + nm_txsync_finalize(kring); + return 0; } /* * Reconcile kernel and user view of the receive ring. - * Same as for the txsync, this routine must be efficient and - * avoid races in accessing the shared regions. + * Same as for the txsync, this routine must be efficient. + * The caller guarantees a single invocation, but races against + * the rest of the driver should be handled here. * - * When called, userspace has read data from slots kring->nr_hwcur - * up to ring->cur (excluded). + * On call, kring->rhead is the first packet that userspace wants + * to keep, and kring->rcur is the wakeup point. + * The kernel has previously reported packets up to kring->rtail. * - * The last interrupt reported kring->nr_hwavail slots available - * after kring->nr_hwcur. - * We must subtract the newly consumed slots (cur - nr_hwcur) - * from nr_hwavail, make the descriptors available for the next reads, - * and set kring->nr_hwcur = ring->cur and ring->avail = kring->nr_hwavail. - * * If (flags & NAF_FORCE_READ) also check for incoming packets irrespective * of whether or not we received an interrupt.
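 * rcur may run ahead of rhead: a process can advance ring->cur to be
 * woken up only when more packets are present, while holding
 * ring->head back so the buffers it is still using are not recycled.
 * Sketch, with the usual netmap_user.h macros and a bound 'fd':
 *
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN };
 *	rxring->cur = nm_ring_next(rxring, rxring->cur); /* wakeup point */
 *	/* rxring->head unchanged: those slots stay ours */
 *	poll(&pfd, 1, -1);
 *
 * Releasing the buffers later is just rxring->head = rxring->cur.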
*/ static int -ixgbe_netmap_rxsync(struct ifnet *ifp, u_int ring_nr, int flags) +ixgbe_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) { - struct adapter *adapter = ifp->if_softc; - struct rx_ring *rxr = &adapter->rx_rings[ring_nr]; - struct netmap_adapter *na = NA(adapter->ifp); + struct ifnet *ifp = na->ifp; struct netmap_kring *kring = &na->rx_rings[ring_nr]; struct netmap_ring *ring = kring->ring; - u_int j, l, n, lim = kring->nkr_num_slots - 1; + u_int nm_i; /* index into the netmap ring */ + u_int nic_i; /* index into the NIC ring */ + u_int n; + u_int const lim = kring->nkr_num_slots - 1; + u_int const head = nm_rxsync_prologue(kring); int force_update = (flags & NAF_FORCE_READ) || kring->nr_kflags & NKR_PENDINTR; - u_int k = ring->cur, resvd = ring->reserved; - if (k > lim) + /* device-specific */ + struct adapter *adapter = ifp->if_softc; + struct rx_ring *rxr = &adapter->rx_rings[ring_nr]; + + if (head > lim) return netmap_ring_reinit(kring); /* XXX check sync modes */ bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map, BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); /* - * First part, import newly received packets into the netmap ring. + * First part: import newly received packets. * - * j is the index of the next free slot in the netmap ring, - * and l is the index of the next received packet in the NIC ring, + * nm_i is the index of the next free slot in the netmap ring, + * nic_i is the index of the next received packet in the NIC ring, * and they may differ in case if_init() has been called while * in netmap mode. For the receive ring we have * - * j = (kring->nr_hwcur + kring->nr_hwavail) % ring_size - * l = rxr->next_to_check; + * nic_i = rxr->next_to_check; + * nm_i = kring->nr_hwtail (previous) * and - * j == (l + kring->nkr_hwofs) % ring_size + * nm_i == (nic_i + kring->nkr_hwofs) % ring_size * * rxr->next_to_check is set to 0 on a ring reinit */ if (netmap_no_pendintr || force_update) { int crclen = ix_crcstrip ? 0 : 4; uint16_t slot_flags = kring->nkr_slot_flags; - l = rxr->next_to_check; - j = netmap_idx_n2k(kring, l); + nic_i = rxr->next_to_check; // or also k2n(kring->nr_hwtail) + nm_i = netmap_idx_n2k(kring, nic_i); for (n = 0; ; n++) { - union ixgbe_adv_rx_desc *curr = &rxr->rx_base[l]; + union ixgbe_adv_rx_desc *curr = &rxr->rx_base[nic_i]; uint32_t staterr = le32toh(curr->wb.upper.status_error); if ((staterr & IXGBE_RXD_STAT_DD) == 0) break; - ring->slot[j].len = le16toh(curr->wb.upper.length) - crclen; - ring->slot[j].flags = slot_flags; + ring->slot[nm_i].len = le16toh(curr->wb.upper.length) - crclen; + ring->slot[nm_i].flags = slot_flags; bus_dmamap_sync(rxr->ptag, - rxr->rx_buffers[l].pmap, BUS_DMASYNC_POSTREAD); - j = (j == lim) ? 0 : j + 1; - l = (l == lim) ? 0 : l + 1; + rxr->rx_buffers[nic_i].pmap, BUS_DMASYNC_POSTREAD); + nm_i = nm_next(nm_i, lim); + nic_i = nm_next(nic_i, lim); } if (n) { /* update the state variables */ if (netmap_no_pendintr && !force_update) { /* diagnostics */ ix_rx_miss ++; ix_rx_miss_bufs += n; } - rxr->next_to_check = l; - kring->nr_hwavail += n; + rxr->next_to_check = nic_i; + kring->nr_hwtail = nm_i; } kring->nr_kflags &= ~NKR_PENDINTR; } /* - * Skip past packets that userspace has released - * (from kring->nr_hwcur to ring->cur - ring->reserved excluded), + * Second part: skip past packets that userspace has released. + * (kring->nr_hwcur to kring->rhead excluded), * and make the buffers available for reception. 
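The NIC/netmap index relation stated above (nm_i == (nic_i + kring->nkr_hwofs) % ring_size) is easy to model in isolation. A minimal userspace sketch, assuming netmap_idx_n2k() and netmap_idx_k2n() are inverses implementing exactly that relation; idx_n2k/idx_k2n are local stand-ins, not the kernel helpers:

	#include <stdio.h>

	static int
	idx_n2k(int nic_i, int hwofs, int num_slots)
	{
		return (nic_i + hwofs + num_slots) % num_slots; /* +num_slots guards hwofs < 0 */
	}

	static int
	idx_k2n(int nm_i, int hwofs, int num_slots)
	{
		return (nm_i - hwofs + num_slots) % num_slots;
	}

	int
	main(void)
	{
		int num_slots = 512, hwofs = 3;	/* hwofs != 0 after an if_init() in netmap mode */
		int nic_i = 510;
		int nm_i = idx_n2k(nic_i, hwofs, num_slots);

		printf("nic %d -> nm %d -> nic %d\n",
		    nic_i, nm_i, idx_k2n(nm_i, hwofs, num_slots));
		return 0;
	}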
- * As usual j is the index in the netmap ring, l is the index - * in the NIC ring, and j == (l + kring->nkr_hwofs) % ring_size + * As usual nm_i is the index in the netmap ring, + * nic_i is the index in the NIC ring, and + * nm_i == (nic_i + kring->nkr_hwofs) % ring_size */ - j = kring->nr_hwcur; - if (resvd > 0) { - if (resvd + ring->avail >= lim + 1) { - D("XXX invalid reserve/avail %d %d", resvd, ring->avail); - ring->reserved = resvd = 0; // XXX panic... - } - k = (k >= resvd) ? k - resvd : k + lim + 1 - resvd; - } - if (j != k) { /* userspace has released some packets. */ - l = netmap_idx_k2n(kring, j); - for (n = 0; j != k; n++) { - /* collect per-slot info, with similar validations - * and flag handling as in the txsync code. - * - * NOTE curr and rxbuf are indexed by l. - * Also, this driver needs to update the physical - * address in the NIC ring, but other drivers - * may not have this requirement. - */ - struct netmap_slot *slot = &ring->slot[j]; - union ixgbe_adv_rx_desc *curr = &rxr->rx_base[l]; - struct ixgbe_rx_buf *rxbuf = &rxr->rx_buffers[l]; + nm_i = kring->nr_hwcur; + if (nm_i != head) { + nic_i = netmap_idx_k2n(kring, nm_i); + for (n = 0; nm_i != head; n++) { + struct netmap_slot *slot = &ring->slot[nm_i]; uint64_t paddr; void *addr = PNMB(slot, &paddr); + union ixgbe_adv_rx_desc *curr = &rxr->rx_base[nic_i]; + struct ixgbe_rx_buf *rxbuf = &rxr->rx_buffers[nic_i]; + if (addr == netmap_buffer_base) /* bad buf */ goto ring_reset; if (slot->flags & NS_BUF_CHANGED) { + /* buffer has changed, reload map */ netmap_reload_map(rxr->ptag, rxbuf->pmap, addr); slot->flags &= ~NS_BUF_CHANGED; } curr->wb.upper.status_error = 0; curr->read.pkt_addr = htole64(paddr); bus_dmamap_sync(rxr->ptag, rxbuf->pmap, BUS_DMASYNC_PREREAD); - j = (j == lim) ? 0 : j + 1; - l = (l == lim) ? 0 : l + 1; + nm_i = nm_next(nm_i, lim); + nic_i = nm_next(nic_i, lim); } - kring->nr_hwavail -= n; - kring->nr_hwcur = k; + kring->nr_hwcur = head; + bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); - /* IMPORTANT: we must leave one free slot in the ring, - * so move l back by one unit + /* + * IMPORTANT: we must leave one free slot in the ring, + * so move nic_i back by one unit */ - l = (l == 0) ? lim : l - 1; - IXGBE_WRITE_REG(&adapter->hw, IXGBE_RDT(rxr->me), l); + nic_i = nm_prev(nic_i, lim); + IXGBE_WRITE_REG(&adapter->hw, IXGBE_RDT(rxr->me), nic_i); } - /* tell userspace that there are new packets */ - ring->avail = kring->nr_hwavail - resvd; + /* tell userspace that there might be new packets */ + nm_rxsync_finalize(kring); + return 0; ring_reset: return netmap_ring_reinit(kring); } /* * The attach routine, called near the end of ixgbe_attach(), * fills the parameters for netmap_attach() and calls it. * It cannot fail, in the worst case (such as no memory) * netmap mode will be disabled and the driver will only * operate in standard mode. 
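The "leave one free slot" rule for RDT above is a one-liner once the ring-walk helpers are spelled out. A hedged sketch, assuming nm_next()/nm_prev() behave as single-step wrap-around helpers over lim = nkr_num_slots - 1, which is how the code above uses them:

	#include <stdio.h>

	static unsigned nm_next(unsigned i, unsigned lim) { return (i == lim) ? 0 : i + 1; }
	static unsigned nm_prev(unsigned i, unsigned lim) { return (i == 0) ? lim : i - 1; }

	int
	main(void)
	{
		unsigned lim = 511;	/* a 512-slot ring */
		unsigned nic_i = 0;	/* one past the last slot we refilled */

		/* the NIC may fill up to one slot behind us, so producer and
		 * consumer can never collide */
		printf("RDT <- %u\n", nm_prev(nic_i, lim));	/* 511, one behind slot 0 */
		printf("next(%u) = %u\n", lim, nm_next(lim, lim));	/* wraps to 0 */
		return 0;
	}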
*/ static void ixgbe_netmap_attach(struct adapter *adapter) { struct netmap_adapter na; bzero(&na, sizeof(na)); na.ifp = adapter->ifp; na.na_flags = NAF_BDG_MAYSLEEP; na.num_tx_desc = adapter->num_tx_desc; na.num_rx_desc = adapter->num_rx_desc; na.nm_txsync = ixgbe_netmap_txsync; na.nm_rxsync = ixgbe_netmap_rxsync; na.nm_register = ixgbe_netmap_reg; - netmap_attach(&na, adapter->num_queues); -} + na.num_tx_rings = na.num_rx_rings = adapter->num_queues; + netmap_attach(&na); +} /* end of file */ Index: stable/9/sys/dev/netmap/netmap.c =================================================================== --- stable/9/sys/dev/netmap/netmap.c (revision 262152) +++ stable/9/sys/dev/netmap/netmap.c (revision 262153) @@ -1,4073 +1,2615 @@ /* - * Copyright (C) 2011-2013 Matteo Landi, Luigi Rizzo. All rights reserved. + * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. + * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* + * $FreeBSD$ + * * This module supports memory mapped access to network devices, * see netmap(4). * * The module uses a large, memory pool allocated by the kernel * and accessible as mmapped memory by multiple userspace threads/processes. * The memory pool contains packet buffers and "netmap rings", * i.e. user-accessible copies of the interface's queues. * * Access to the network card works like this: * 1. a process/thread issues one or more open() on /dev/netmap, to create * select()able file descriptor on which events are reported. * 2. on each descriptor, the process issues an ioctl() to identify * the interface that should report events to the file descriptor. * 3. on each descriptor, the process issues an mmap() request to * map the shared memory region within the process' address space. * The list of interesting queues is indicated by a location in * the shared memory region. * 4. using the functions in the netmap(4) userspace API, a process * can look up the occupation state of a queue, access memory buffers, * and retrieve received packets or enqueue packets to transmit. * 5. using some ioctl()s the process can synchronize the userspace view * of the queue with the actual status in the kernel. 
This includes both * receiving the notification of new packets, and transmitting new * packets on the output interface. * 6. select() or poll() can be used to wait for events on individual * transmit or receive queues (or all queues for a given interface). SYNCHRONIZATION (USER) The netmap rings and data structures may be shared among multiple user threads or even independent processes. Any synchronization among those threads/processes is delegated to the threads themselves. Only one thread at a time can be in a system call on the same netmap ring. The OS does not enforce this and only guarantees against system crashes in case of invalid usage. LOCKING (INTERNAL) Within the kernel, access to the netmap rings is protected as follows: - a spinlock on each ring, to handle producer/consumer races on RX rings attached to the host stack (against multiple host threads writing from the host stack to the same ring), and on 'destination' rings attached to a VALE switch (i.e. RX rings in VALE ports, and TX rings in NIC/host ports), protecting multiple active senders for the same destination - an atomic variable to guarantee that there is at most one instance of *_*xsync() on the ring at any time. For rings connected to user file descriptors, an atomic_test_and_set() protects this, and the lock on the ring is not actually used. For NIC RX rings connected to a VALE switch, an atomic_test_and_set() is also used to prevent multiple executions (the driver might indeed already guarantee this). For NIC TX rings connected to a VALE switch, the lock arbitrates access to the queue (both when allocating buffers and when pushing them out). - *xsync() should be protected against initializations of the card. On FreeBSD most devices have the reset routine protected by a RING lock (ixgbe, igb, em) or core lock (re). lem is missing the RING protection on rx_reset(), this should be added. On linux there is an external lock on the tx path, which probably also arbitrates access to the reset routine. XXX to be revised - a per-interface core_lock protecting access from the host stack while interfaces may be detached from netmap mode. XXX there should be no need for this lock if we detach the interfaces only while they are down. --- VALE SWITCH --- NMG_LOCK() serializes all modifications to switches and ports. A switch cannot be deleted until all ports are gone. For each switch, an SX lock (RWlock on linux) protects deletion of ports. When configuring or deleting a port, the lock is acquired in exclusive mode (after holding NMG_LOCK). When forwarding, the lock is acquired in shared mode (without NMG_LOCK). The lock is held throughout the entire forwarding cycle, during which the thread may incur a page fault. Hence it is important that sleepable shared locks are used. On the rx ring, the per-port lock is grabbed initially to reserve a number of slots in the ring, then the lock is released, packets are copied from source to destination, and then the lock is acquired again and the receive ring is updated. (A similar thing is done on the tx ring for NIC and host stack ports attached to the switch.) */ /* * OS-specific code that is used only within this file.
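As a companion to steps 1-6 above, here is a minimal userspace sketch using the public netmap headers of the same era. "em0" is a placeholder interface name and all error handling is omitted; this is an illustration, not a complete client:

	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <sys/mman.h>
	#include <net/if.h>
	#include <net/netmap.h>
	#include <net/netmap_user.h>

	int
	main(void)
	{
		struct nmreq req;
		int fd = open("/dev/netmap", O_RDWR);		/* step 1 */

		memset(&req, 0, sizeof(req));
		req.nr_version = NETMAP_API;
		strncpy(req.nr_name, "em0", sizeof(req.nr_name) - 1);
		ioctl(fd, NIOCREGIF, &req);			/* step 2 */

		void *mem = mmap(NULL, req.nr_memsize,
		    PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);	/* step 3 */
		struct netmap_if *nifp = NETMAP_IF(mem, req.nr_offset);
		struct netmap_ring *ring = NETMAP_TXRING(nifp, 0);	/* step 4 */

		ioctl(fd, NIOCTXSYNC, NULL);			/* step 5 */
		printf("%u tx slots\n", ring->num_slots);
		close(fd);
		return 0;
	}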
* Other OS-specific code that must be accessed by drivers * is present in netmap_kern.h */ #if defined(__FreeBSD__) #include /* prerequisite */ -__FBSDID("$FreeBSD$"); - #include -#include #include #include /* defines used in kernel.h */ -#include #include /* types used in module initialization */ -#include /* cdevsw struct */ -#include /* uio struct */ +#include /* cdevsw struct, UID, GID */ +#include /* FIONBIO */ #include #include /* struct socket */ #include -#include /* PROT_EXEC */ #include -#include #include -#include /* vtophys */ -#include /* vtophys */ -#include -#include -#include -#include -#include #include /* sockaddrs */ #include #include +#include +#include #include #include #include /* BIOCIMMEDIATE */ -#include #include /* bus_dmamap_* */ #include #include -#define prefetch(x) __builtin_prefetch(x) -#define BDG_RWLOCK_T struct rwlock // struct rwlock +/* reduce conditional code */ +// linux API, use for the knlist in FreeBSD +#define init_waitqueue_head(x) knlist_init_mtx(&(x)->si_note, NULL) -#define BDG_RWINIT(b) \ - rw_init_flags(&(b)->bdg_lock, "bdg lock", RW_NOWITNESS) -#define BDG_WLOCK(b) rw_wlock(&(b)->bdg_lock) -#define BDG_WUNLOCK(b) rw_wunlock(&(b)->bdg_lock) -#define BDG_RLOCK(b) rw_rlock(&(b)->bdg_lock) -#define BDG_RTRYLOCK(b) rw_try_rlock(&(b)->bdg_lock) -#define BDG_RUNLOCK(b) rw_runlock(&(b)->bdg_lock) -#define BDG_RWDESTROY(b) rw_destroy(&(b)->bdg_lock) +void freebsd_selwakeup(struct selinfo *si, int pri); +#define OS_selwakeup(a, b) freebsd_selwakeup(a, b) - -/* netmap global lock. - * normally called within the user thread (upon a system call) - * or when a file descriptor or process is terminated - * (last close or last munmap) - */ - -#define NMG_LOCK_T struct mtx -#define NMG_LOCK_INIT() mtx_init(&netmap_global_lock, "netmap global lock", NULL, MTX_DEF) -#define NMG_LOCK_DESTROY() mtx_destroy(&netmap_global_lock) -#define NMG_LOCK() mtx_lock(&netmap_global_lock) -#define NMG_UNLOCK() mtx_unlock(&netmap_global_lock) -#define NMG_LOCK_ASSERT() mtx_assert(&netmap_global_lock, MA_OWNED) - - -/* atomic operations */ -#include -#define NM_ATOMIC_TEST_AND_SET(p) (!atomic_cmpset_acq_int((p), 0, 1)) -#define NM_ATOMIC_CLEAR(p) atomic_store_rel_int((p), 0) - - #elif defined(linux) #include "bsd_glue.h" -static netdev_tx_t linux_netmap_start_xmit(struct sk_buff *, struct net_device *); -static struct device_driver* -linux_netmap_find_driver(struct device *dev) -{ - struct device_driver *dd; - while ( (dd = dev->driver) == NULL ) { - if ( (dev = dev->parent) == NULL ) - return NULL; - } - return dd; -} - -static struct net_device* -ifunit_ref(const char *name) -{ - struct net_device *ifp = dev_get_by_name(&init_net, name); - struct device_driver *dd; - - if (ifp == NULL) - return NULL; - - if ( (dd = linux_netmap_find_driver(&ifp->dev)) == NULL ) - goto error; - - if (!try_module_get(dd->owner)) - goto error; - - return ifp; -error: - dev_put(ifp); - return NULL; -} - -static void -if_rele(struct net_device *ifp) -{ - struct device_driver *dd; - dd = linux_netmap_find_driver(&ifp->dev); - dev_put(ifp); - if (dd) - module_put(dd->owner); -} - -// XXX a mtx would suffice here too 20130404 gl -#define NMG_LOCK_T struct semaphore -#define NMG_LOCK_INIT() sema_init(&netmap_global_lock, 1) -#define NMG_LOCK_DESTROY() -#define NMG_LOCK() down(&netmap_global_lock) -#define NMG_UNLOCK() up(&netmap_global_lock) -#define NMG_LOCK_ASSERT() // XXX to be completed - - #elif defined(__APPLE__) #warning OSX support is only partial #include "osx_glue.h" #else #error Unsupported 
platform #endif /* unsupported */ /* * common headers */ #include #include #include MALLOC_DEFINE(M_NETMAP, "netmap", "Network memory map"); /* * The following variables are used by the drivers and replicate * fields in the global memory pool. They only refer to buffers * used by physical interfaces. */ u_int netmap_total_buffers; u_int netmap_buf_size; char *netmap_buffer_base; /* also address of an invalid buffer */ /* user-controlled variables */ int netmap_verbose; static int netmap_no_timestamp; /* don't timestamp on rxsync */ SYSCTL_NODE(_dev, OID_AUTO, netmap, CTLFLAG_RW, 0, "Netmap args"); SYSCTL_INT(_dev_netmap, OID_AUTO, verbose, CTLFLAG_RW, &netmap_verbose, 0, "Verbose mode"); SYSCTL_INT(_dev_netmap, OID_AUTO, no_timestamp, CTLFLAG_RW, &netmap_no_timestamp, 0, "no_timestamp"); int netmap_mitigate = 1; SYSCTL_INT(_dev_netmap, OID_AUTO, mitigate, CTLFLAG_RW, &netmap_mitigate, 0, ""); int netmap_no_pendintr = 1; SYSCTL_INT(_dev_netmap, OID_AUTO, no_pendintr, CTLFLAG_RW, &netmap_no_pendintr, 0, "Always look for new received packets."); int netmap_txsync_retry = 2; SYSCTL_INT(_dev_netmap, OID_AUTO, txsync_retry, CTLFLAG_RW, &netmap_txsync_retry, 0 , "Number of txsync loops in bridge's flush."); -int netmap_drop = 0; /* debugging */ int netmap_flags = 0; /* debug flags */ int netmap_fwd = 0; /* force transparent mode */ int netmap_mmap_unreg = 0; /* allow mmap of unregistered fds */ -SYSCTL_INT(_dev_netmap, OID_AUTO, drop, CTLFLAG_RW, &netmap_drop, 0 , ""); +/* + * netmap_admode selects the netmap mode to use. + * Invalid values are reset to NETMAP_ADMODE_BEST + */ +enum { NETMAP_ADMODE_BEST = 0, /* use native, fallback to generic */ + NETMAP_ADMODE_NATIVE, /* either native or none */ + NETMAP_ADMODE_GENERIC, /* force generic */ + NETMAP_ADMODE_LAST }; +static int netmap_admode = NETMAP_ADMODE_BEST; + +int netmap_generic_mit = 100*1000; /* Generic mitigation interval in nanoseconds. */ +int netmap_generic_ringsize = 1024; /* Generic ringsize. */ +int netmap_generic_rings = 1; /* number of queues in generic. */ + SYSCTL_INT(_dev_netmap, OID_AUTO, flags, CTLFLAG_RW, &netmap_flags, 0 , ""); SYSCTL_INT(_dev_netmap, OID_AUTO, fwd, CTLFLAG_RW, &netmap_fwd, 0 , ""); SYSCTL_INT(_dev_netmap, OID_AUTO, mmap_unreg, CTLFLAG_RW, &netmap_mmap_unreg, 0, ""); +SYSCTL_INT(_dev_netmap, OID_AUTO, admode, CTLFLAG_RW, &netmap_admode, 0 , ""); +SYSCTL_INT(_dev_netmap, OID_AUTO, generic_mit, CTLFLAG_RW, &netmap_generic_mit, 0 , ""); +SYSCTL_INT(_dev_netmap, OID_AUTO, generic_ringsize, CTLFLAG_RW, &netmap_generic_ringsize, 0 , ""); +SYSCTL_INT(_dev_netmap, OID_AUTO, generic_rings, CTLFLAG_RW, &netmap_generic_rings, 0 , ""); NMG_LOCK_T netmap_global_lock; -/* - * protect against multiple threads using the same ring. - * also check that the ring has not been stopped. 
- */ -#define NM_KR_BUSY 1 -#define NM_KR_STOPPED 2 -static void nm_kr_put(struct netmap_kring *kr); -static __inline int nm_kr_tryget(struct netmap_kring *kr) -{ - /* check a first time without taking the lock - * to avoid starvation for nm_kr_get() - */ - if (unlikely(kr->nkr_stopped)) { - ND("ring %p stopped (%d)", kr, kr->nkr_stopped); - return NM_KR_STOPPED; - } - if (unlikely(NM_ATOMIC_TEST_AND_SET(&kr->nr_busy))) - return NM_KR_BUSY; - /* check a second time with lock held */ - if (unlikely(kr->nkr_stopped)) { - ND("ring %p stopped (%d)", kr, kr->nkr_stopped); - nm_kr_put(kr); - return NM_KR_STOPPED; - } - return 0; -} -static __inline void nm_kr_put(struct netmap_kring *kr) +static void +nm_kr_get(struct netmap_kring *kr) { - NM_ATOMIC_CLEAR(&kr->nr_busy); -} - -static void nm_kr_get(struct netmap_kring *kr) -{ while (NM_ATOMIC_TEST_AND_SET(&kr->nr_busy)) tsleep(kr, 0, "NM_KR_GET", 4); } -static void nm_disable_ring(struct netmap_kring *kr) + +/* + * mark the ring as stopped, and run through the locks + * to make sure other users get to see it. + */ +void +netmap_disable_ring(struct netmap_kring *kr) { kr->nkr_stopped = 1; nm_kr_get(kr); mtx_lock(&kr->q_lock); mtx_unlock(&kr->q_lock); nm_kr_put(kr); } -void netmap_disable_all_rings(struct ifnet *ifp) + +static void +netmap_set_all_rings(struct ifnet *ifp, int stopped) { struct netmap_adapter *na; int i; + u_int ntx, nrx; if (!(ifp->if_capenable & IFCAP_NETMAP)) return; na = NA(ifp); - for (i = 0; i < na->num_tx_rings + 1; i++) { - nm_disable_ring(na->tx_rings + i); - selwakeuppri(&na->tx_rings[i].si, PI_NET); + ntx = netmap_real_tx_rings(na); + nrx = netmap_real_rx_rings(na); + + for (i = 0; i < ntx; i++) { + if (stopped) + netmap_disable_ring(na->tx_rings + i); + else + na->tx_rings[i].nkr_stopped = 0; + na->nm_notify(na, i, NR_TX, NAF_DISABLE_NOTIFY); } - for (i = 0; i < na->num_rx_rings + 1; i++) { - nm_disable_ring(na->rx_rings + i); - selwakeuppri(&na->rx_rings[i].si, PI_NET); + + for (i = 0; i < nrx; i++) { + if (stopped) + netmap_disable_ring(na->rx_rings + i); + else + na->rx_rings[i].nkr_stopped = 0; + na->nm_notify(na, i, NR_RX, NAF_DISABLE_NOTIFY); } - selwakeuppri(&na->tx_si, PI_NET); - selwakeuppri(&na->rx_si, PI_NET); } -void netmap_enable_all_rings(struct ifnet *ifp) + +void +netmap_disable_all_rings(struct ifnet *ifp) { - struct netmap_adapter *na; - int i; + netmap_set_all_rings(ifp, 1 /* stopped */); +} - if (!(ifp->if_capenable & IFCAP_NETMAP)) - return; - na = NA(ifp); - for (i = 0; i < na->num_tx_rings + 1; i++) { - D("enabling %p", na->tx_rings + i); - na->tx_rings[i].nkr_stopped = 0; - } - for (i = 0; i < na->num_rx_rings + 1; i++) { - D("enabling %p", na->rx_rings + i); - na->rx_rings[i].nkr_stopped = 0; - } +void +netmap_enable_all_rings(struct ifnet *ifp) +{ + netmap_set_all_rings(ifp, 0 /* enabled */); } /* * generic bound_checking function */ u_int nm_bound_var(u_int *v, u_int dflt, u_int lo, u_int hi, const char *msg) { u_int oldv = *v; const char *op = NULL; if (dflt < lo) dflt = lo; if (dflt > hi) dflt = hi; if (oldv < lo) { *v = dflt; op = "Bump"; } else if (oldv > hi) { *v = hi; op = "Clamp"; } if (op && msg) printf("%s %s to %d (was %d)\n", op, msg, *v, oldv); return *v; } + /* * packet-dump function, user-supplied or static buffer. 
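The nr_busy guard used by nm_kr_get()/nm_kr_put() above can be modeled with C11 atomics in place of the kernel's NM_ATOMIC_TEST_AND_SET/NM_ATOMIC_CLEAR. A hedged userspace sketch; unlike the kernel version, tryget here fails instead of sleeping, and kr_tryget/kr_put are local names:

	#include <stdatomic.h>
	#include <stdio.h>

	struct kr {
		atomic_uint	busy;		/* models kr->nr_busy */
		int		stopped;	/* models kr->nkr_stopped */
	};

	static int
	kr_tryget(struct kr *kr)
	{
		if (kr->stopped)			/* cheap check, avoids starving nm_kr_get() */
			return -1;
		if (atomic_exchange(&kr->busy, 1))	/* test-and-set; nonzero means busy */
			return -1;
		if (kr->stopped) {			/* recheck now that we own the ring */
			atomic_store(&kr->busy, 0);
			return -1;
		}
		return 0;
	}

	static void
	kr_put(struct kr *kr)
	{
		atomic_store(&kr->busy, 0);
	}

	int
	main(void)
	{
		struct kr kr = { 0, 0 };

		if (kr_tryget(&kr) == 0) {
			printf("ring acquired\n");
			kr_put(&kr);
		}
		return 0;
	}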
* The destination buffer must be at least 30+4*len */ const char * nm_dump_buf(char *p, int len, int lim, char *dst) { static char _dst[8192]; - int i, j, i0; + int i, j, i0; static char hex[] ="0123456789abcdef"; char *o; /* output position */ #define P_HI(x) hex[((x) & 0xf0)>>4] #define P_LO(x) hex[((x) & 0xf)] #define P_C(x) ((x) >= 0x20 && (x) <= 0x7e ? (x) : '.') if (!dst) dst = _dst; if (lim <= 0 || lim > len) lim = len; o = dst; sprintf(o, "buf 0x%p len %d lim %d\n", p, len, lim); o += strlen(o); /* hexdump routine */ for (i = 0; i < lim; ) { sprintf(o, "%5d: ", i); o += strlen(o); memset(o, ' ', 48); i0 = i; for (j=0; j < 16 && i < lim; i++, j++) { o[j*3] = P_HI(p[i]); o[j*3+1] = P_LO(p[i]); } i = i0; for (j=0; j < 16 && i < lim; i++, j++) o[j + 48] = P_C(p[i]); o[j+48] = '\n'; o += j+49; } *o = '\0'; #undef P_HI #undef P_LO #undef P_C return dst; } -/* - * system parameters (most of them in netmap_kern.h) - * NM_NAME prefix for switch port names, default "vale" - * NM_BDG_MAXPORTS number of ports - * NM_BRIDGES max number of switches in the system. - * XXX should become a sysctl or tunable - * - * Switch ports are named valeX:Y where X is the switch name and Y - * is the port. If Y matches a physical interface name, the port is - * connected to a physical device. - * - * Unlike physical interfaces, switch ports use their own memory region - * for rings and buffers. - * The virtual interfaces use per-queue lock instead of core lock. - * In the tx loop, we aggregate traffic in batches to make all operations - * faster. The batch size is bridge_batch. - */ -#define NM_BDG_MAXRINGS 16 /* XXX unclear how many. */ -#define NM_BDG_MAXSLOTS 4096 /* XXX same as above */ -#define NM_BRIDGE_RINGSIZE 1024 /* in the device */ -#define NM_BDG_HASH 1024 /* forwarding table entries */ -#define NM_BDG_BATCH 1024 /* entries in the forwarding buffer */ -#define NM_MULTISEG 64 /* max size of a chain of bufs */ -/* actual size of the tables */ -#define NM_BDG_BATCH_MAX (NM_BDG_BATCH + NM_MULTISEG) -/* NM_FT_NULL terminates a list of slots in the ft */ -#define NM_FT_NULL NM_BDG_BATCH_MAX -#define NM_BRIDGES 8 /* number of bridges */ - /* - * bridge_batch is set via sysctl to the max batch size to be - * used in the bridge. The actual value may be larger as the - * last packet in the block may overflow the size. + * Fetch configuration from the device, to cope with dynamic + * reconfigurations after loading the module. */ -int bridge_batch = NM_BDG_BATCH; /* bridge batch size */ -SYSCTL_INT(_dev_netmap, OID_AUTO, bridge_batch, CTLFLAG_RW, &bridge_batch, 0 , ""); +int +netmap_update_config(struct netmap_adapter *na) +{ + struct ifnet *ifp = na->ifp; + u_int txr, txd, rxr, rxd; + txr = txd = rxr = rxd = 0; + if (na->nm_config) { + na->nm_config(na, &txr, &txd, &rxr, &rxd); + } else { + /* take whatever we had at init time */ + txr = na->num_tx_rings; + txd = na->num_tx_desc; + rxr = na->num_rx_rings; + rxd = na->num_rx_desc; + } -/* - * These are used to handle reference counters for bridge ports. 
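The valeX:Y naming convention described above (and parsed by nm_find_bridge() further down) amounts to splitting the port name at the first ':' past the "vale" prefix. A simplified, hedged userspace model; split_vale_name is a local stand-in and skips the kernel's extra length checks:

	#include <stdio.h>
	#include <string.h>

	#define NM_NAME	"vale"		/* prefix for switch port names */

	static int
	split_vale_name(const char *name, char *bridge, size_t blen)
	{
		const char *colon;
		size_t n;

		if (strncmp(name, NM_NAME, strlen(NM_NAME)) != 0)
			return -1;		/* not a VALE port name */
		colon = strchr(name + strlen(NM_NAME), ':');
		/* the bridge name runs up to the ':' if present, else to the end */
		n = colon ? (size_t)(colon - name) : strlen(name);
		if (n >= blen)
			n = blen - 1;
		memcpy(bridge, name, n);
		bridge[n] = '\0';
		return 0;
	}

	int
	main(void)
	{
		char bridge[16];

		if (split_vale_name("vale0:em1", bridge, sizeof(bridge)) == 0)
			printf("bridge '%s'\n", bridge);	/* prints vale0 */
		return 0;
	}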
- */ -#define ADD_BDG_REF(ifp) refcount_acquire(&NA(ifp)->na_bdg_refcount) -#define DROP_BDG_REF(ifp) refcount_release(&NA(ifp)->na_bdg_refcount) + if (na->num_tx_rings == txr && na->num_tx_desc == txd && + na->num_rx_rings == rxr && na->num_rx_desc == rxd) + return 0; /* nothing changed */ + if (netmap_verbose || na->active_fds > 0) { + D("stored config %s: txring %d x %d, rxring %d x %d", + NM_IFPNAME(ifp), + na->num_tx_rings, na->num_tx_desc, + na->num_rx_rings, na->num_rx_desc); + D("new config %s: txring %d x %d, rxring %d x %d", + NM_IFPNAME(ifp), txr, txd, rxr, rxd); + } + if (na->active_fds == 0) { + D("configuration changed (but fine)"); + na->num_tx_rings = txr; + na->num_tx_desc = txd; + na->num_rx_rings = rxr; + na->num_rx_desc = rxd; + return 0; + } + D("configuration changed while active, this is bad..."); + return 1; +} -/* The bridge references the buffers using the device specific look up table */ -static inline void * -BDG_NMB(struct netmap_mem_d *nmd, struct netmap_slot *slot) +static int +netmap_txsync_compat(struct netmap_kring *kring, int flags) { - struct lut_entry *lut = nmd->pools[NETMAP_BUF_POOL].lut; - uint32_t i = slot->buf_idx; - return (unlikely(i >= nmd->pools[NETMAP_BUF_POOL].objtotal)) ? lut[0].vaddr : lut[i].vaddr; + struct netmap_adapter *na = kring->na; + return na->nm_txsync(na, kring->ring_id, flags); } -static int bdg_netmap_attach(struct netmap_adapter *); -static int bdg_netmap_reg(struct ifnet *ifp, int onoff); -int kern_netmap_regif(struct nmreq *nmr); - -/* - * Each transmit queue accumulates a batch of packets into - * a structure before forwarding. Packets to the same - * destination are put in a list using ft_next as a link field. - * ft_frags and ft_next are valid only on the first fragment. - */ -struct nm_bdg_fwd { /* forwarding entry for a bridge */ - void *ft_buf; /* netmap or indirect buffer */ - uint8_t ft_frags; /* how many fragments (only on 1st frag) */ - uint8_t _ft_port; /* dst port (unused) */ - uint16_t ft_flags; /* flags, e.g. indirect */ - uint16_t ft_len; /* src fragment len */ - uint16_t ft_next; /* next packet to same destination */ -}; - -/* - * For each output interface, nm_bdg_q is used to construct a list. - * bq_len is the number of output buffers (we can have coalescing - * during the copy). - */ -struct nm_bdg_q { - uint16_t bq_head; - uint16_t bq_tail; - uint32_t bq_len; /* number of buffers */ -}; - -/* XXX revise this */ -struct nm_hash_ent { - uint64_t mac; /* the top 2 bytes are the epoch */ - uint64_t ports; -}; - -/* - * nm_bridge is a descriptor for a VALE switch. - * Interfaces for a bridge are all in bdg_ports[]. - * The array has fixed size, an empty entry does not terminate - * the search, but lookups only occur on attach/detach so we - * don't mind if they are slow. - * - * The bridge is non blocking on the transmit ports: excess - * packets are dropped if there is no room on the output port. - * - * bdg_lock protects accesses to the bdg_ports array. - * This is a rw lock (or equivalent). - */ -struct nm_bridge { - /* XXX what is the proper alignment/layout ? */ - BDG_RWLOCK_T bdg_lock; /* protects bdg_ports */ - int bdg_namelen; - uint32_t bdg_active_ports; /* 0 means free */ - char bdg_basename[IFNAMSIZ]; - - /* Indexes of active ports (up to active_ports) - * and all other remaining ports. - */ - uint8_t bdg_port_index[NM_BDG_MAXPORTS]; - - struct netmap_adapter *bdg_ports[NM_BDG_MAXPORTS]; - - - /* - * The function to decide the destination port. 
- * It returns either of an index of the destination port, - * NM_BDG_BROADCAST to broadcast this packet, or NM_BDG_NOPORT not to - * forward this packet. ring_nr is the source ring index, and the - * function may overwrite this value to forward this packet to a - * different ring index. - * This function must be set by netmap_bdgctl(). - */ - bdg_lookup_fn_t nm_bdg_lookup; - - /* the forwarding table, MAC+ports. - * XXX should be changed to an argument to be passed to - * the lookup function, and allocated on attach - */ - struct nm_hash_ent ht[NM_BDG_HASH]; -}; - - -/* - * XXX in principle nm_bridges could be created dynamically - * Right now we have a static array and deletions are protected - * by an exclusive lock. - */ -struct nm_bridge nm_bridges[NM_BRIDGES]; - - -/* - * A few function to tell which kind of port are we using. - * XXX should we hold a lock ? - * - * nma_is_vp() virtual port - * nma_is_host() port connected to the host stack - * nma_is_hw() port connected to a NIC - */ -int nma_is_vp(struct netmap_adapter *na); -int -nma_is_vp(struct netmap_adapter *na) +static int +netmap_rxsync_compat(struct netmap_kring *kring, int flags) { - return na->nm_register == bdg_netmap_reg; + struct netmap_adapter *na = kring->na; + return na->nm_rxsync(na, kring->ring_id, flags); } -static __inline int -nma_is_host(struct netmap_adapter *na) +static int +netmap_txsync_to_host_compat(struct netmap_kring *kring, int flags) { - return na->nm_register == NULL; + (void)flags; + netmap_txsync_to_host(kring->na); + return 0; } -static __inline int -nma_is_hw(struct netmap_adapter *na) +static int +netmap_rxsync_from_host_compat(struct netmap_kring *kring, int flags) { - /* In case of sw adapter, nm_register is NULL */ - return !nma_is_vp(na) && !nma_is_host(na); + (void)flags; + netmap_rxsync_from_host(kring->na, NULL, NULL); + return 0; } -/* - * If the NIC is owned by the kernel - * (i.e., bridge), neither another bridge nor user can use it; - * if the NIC is owned by a user, only users can share it. - * Evaluation must be done under NMG_LOCK(). - */ -#define NETMAP_OWNED_BY_KERN(ifp) (!nma_is_vp(NA(ifp)) && NA(ifp)->na_bdg) -#define NETMAP_OWNED_BY_ANY(ifp) \ - (NETMAP_OWNED_BY_KERN(ifp) || (NA(ifp)->refcount > 0)) -/* - * NA(ifp)->bdg_port port index - */ - - -/* - * this is a slightly optimized copy routine which rounds - * to multiple of 64 bytes and is often faster than dealing - * with other odd sizes. We assume there is enough room - * in the source and destination buffers. +/* create the krings array and initialize the fields common to all adapters. + * The array layout is this: * - * XXX only for multiples of 64 bytes, non overlapped. - */ -static inline void -pkt_copy(void *_src, void *_dst, int l) -{ - uint64_t *src = _src; - uint64_t *dst = _dst; - if (unlikely(l >= 1024)) { - memcpy(dst, src, l); - return; - } - for (; likely(l > 0); l-=64) { - *dst++ = *src++; - *dst++ = *src++; - *dst++ = *src++; - *dst++ = *src++; - *dst++ = *src++; - *dst++ = *src++; - *dst++ = *src++; - *dst++ = *src++; - } -} - - -/* - * locate a bridge among the existing ones. - * MUST BE CALLED WITH NMG_LOCK() + * +----------+ + * na->tx_rings ----->| | \ + * | | } na->num_tx_ring + * | | / + * +----------+ + * | | host tx kring + * na->rx_rings ----> +----------+ + * | | \ + * | | } na->num_rx_rings + * | | / + * +----------+ + * | | host rx kring + * +----------+ + * na->tailroom ----->| | \ + * | | } tailroom bytes + * | | / + * +----------+ * - * a ':' in the name terminates the bridge name. 
Otherwise, just NM_NAME. - * We assume that this is called with a name of at least NM_NAME chars. + * Note: for compatibility, host krings are created even when not needed. + * The tailroom space is currently used by vale ports for allocating leases. */ -static struct nm_bridge * -nm_find_bridge(const char *name, int create) +int +netmap_krings_create(struct netmap_adapter *na, u_int tailroom) { - int i, l, namelen; - struct nm_bridge *b = NULL; + u_int i, len, ndesc; + struct netmap_kring *kring; + u_int ntx, nrx; - NMG_LOCK_ASSERT(); + /* account for the (possibly fake) host rings */ + ntx = na->num_tx_rings + 1; + nrx = na->num_rx_rings + 1; - namelen = strlen(NM_NAME); /* base length */ - l = name ? strlen(name) : 0; /* actual length */ - if (l < namelen) { - D("invalid bridge name %s", name ? name : NULL); - return NULL; + len = (ntx + nrx) * sizeof(struct netmap_kring) + tailroom; + + na->tx_rings = malloc((size_t)len, M_DEVBUF, M_NOWAIT | M_ZERO); + if (na->tx_rings == NULL) { + D("Cannot allocate krings"); + return ENOMEM; } - for (i = namelen + 1; i < l; i++) { - if (name[i] == ':') { - namelen = i; - break; + na->rx_rings = na->tx_rings + ntx; + + /* + * All fields in krings are 0 except the one initialized below. + * but better be explicit on important kring fields. + */ + ndesc = na->num_tx_desc; + for (i = 0; i < ntx; i++) { /* Transmit rings */ + kring = &na->tx_rings[i]; + bzero(kring, sizeof(*kring)); + kring->na = na; + kring->ring_id = i; + kring->nkr_num_slots = ndesc; + if (i < na->num_tx_rings) { + kring->nm_sync = netmap_txsync_compat; // XXX + } else if (i == na->num_tx_rings) { + kring->nm_sync = netmap_txsync_to_host_compat; } + /* + * IMPORTANT: Always keep one slot empty. + */ + kring->rhead = kring->rcur = kring->nr_hwcur = 0; + kring->rtail = kring->nr_hwtail = ndesc - 1; + snprintf(kring->name, sizeof(kring->name) - 1, "%s TX%d", NM_IFPNAME(na->ifp), i); + ND("ktx %s h %d c %d t %d", + kring->name, kring->rhead, kring->rcur, kring->rtail); + mtx_init(&kring->q_lock, "nm_txq_lock", NULL, MTX_DEF); + init_waitqueue_head(&kring->si); } - if (namelen >= IFNAMSIZ) - namelen = IFNAMSIZ; - ND("--- prefix is '%.*s' ---", namelen, name); - /* lookup the name, remember empty slot if there is one */ - for (i = 0; i < NM_BRIDGES; i++) { - struct nm_bridge *x = nm_bridges + i; - - if (x->bdg_active_ports == 0) { - if (create && b == NULL) - b = x; /* record empty slot */ - } else if (x->bdg_namelen != namelen) { - continue; - } else if (strncmp(name, x->bdg_basename, namelen) == 0) { - ND("found '%.*s' at %d", namelen, name, i); - b = x; - break; + ndesc = na->num_rx_desc; + for (i = 0; i < nrx; i++) { /* Receive rings */ + kring = &na->rx_rings[i]; + bzero(kring, sizeof(*kring)); + kring->na = na; + kring->ring_id = i; + kring->nkr_num_slots = ndesc; + if (i < na->num_rx_rings) { + kring->nm_sync = netmap_rxsync_compat; // XXX + } else if (i == na->num_rx_rings) { + kring->nm_sync = netmap_rxsync_from_host_compat; } + kring->rhead = kring->rcur = kring->nr_hwcur = 0; + kring->rtail = kring->nr_hwtail = 0; + snprintf(kring->name, sizeof(kring->name) - 1, "%s RX%d", NM_IFPNAME(na->ifp), i); + ND("krx %s h %d c %d t %d", + kring->name, kring->rhead, kring->rcur, kring->rtail); + mtx_init(&kring->q_lock, "nm_rxq_lock", NULL, MTX_DEF); + init_waitqueue_head(&kring->si); } - if (i == NM_BRIDGES && b) { /* name not found, can create entry */ - /* initialize the bridge */ - strncpy(b->bdg_basename, name, namelen); - ND("create new bridge %s with ports %d", b->bdg_basename, - 
b->bdg_active_ports); - b->bdg_namelen = namelen; - b->bdg_active_ports = 0; - for (i = 0; i < NM_BDG_MAXPORTS; i++) - b->bdg_port_index[i] = i; - /* set the default function */ - b->nm_bdg_lookup = netmap_bdg_learning; - /* reset the MAC address table */ - bzero(b->ht, sizeof(struct nm_hash_ent) * NM_BDG_HASH); - } - return b; -} + init_waitqueue_head(&na->tx_si); + init_waitqueue_head(&na->rx_si); + na->tailroom = na->rx_rings + nrx; -/* - * Free the forwarding tables for rings attached to switch ports. - */ -static void -nm_free_bdgfwd(struct netmap_adapter *na) -{ - int nrings, i; - struct netmap_kring *kring; - - NMG_LOCK_ASSERT(); - nrings = nma_is_vp(na) ? na->num_tx_rings : na->num_rx_rings; - kring = nma_is_vp(na) ? na->tx_rings : na->rx_rings; - for (i = 0; i < nrings; i++) { - if (kring[i].nkr_ft) { - free(kring[i].nkr_ft, M_DEVBUF); - kring[i].nkr_ft = NULL; /* protect from freeing twice */ - } - } - if (nma_is_hw(na)) - nm_free_bdgfwd(SWNA(na->ifp)); + return 0; } -/* - * Allocate the forwarding tables for the rings attached to the bridge ports. - */ -static int -nm_alloc_bdgfwd(struct netmap_adapter *na) +/* undo the actions performed by netmap_krings_create */ +void +netmap_krings_delete(struct netmap_adapter *na) { - int nrings, l, i, num_dstq; - struct netmap_kring *kring; + struct netmap_kring *kring = na->tx_rings; - NMG_LOCK_ASSERT(); - /* all port:rings + broadcast */ - num_dstq = NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1; - l = sizeof(struct nm_bdg_fwd) * NM_BDG_BATCH_MAX; - l += sizeof(struct nm_bdg_q) * num_dstq; - l += sizeof(uint16_t) * NM_BDG_BATCH_MAX; - - nrings = nma_is_vp(na) ? na->num_tx_rings : na->num_rx_rings; - kring = nma_is_vp(na) ? na->tx_rings : na->rx_rings; - for (i = 0; i < nrings; i++) { - struct nm_bdg_fwd *ft; - struct nm_bdg_q *dstq; - int j; - - ft = malloc(l, M_DEVBUF, M_NOWAIT | M_ZERO); - if (!ft) { - nm_free_bdgfwd(na); - return ENOMEM; - } - dstq = (struct nm_bdg_q *)(ft + NM_BDG_BATCH_MAX); - for (j = 0; j < num_dstq; j++) { - dstq[j].bq_head = dstq[j].bq_tail = NM_FT_NULL; - dstq[j].bq_len = 0; - } - kring[i].nkr_ft = ft; + /* we rely on the krings layout described above */ + for ( ; kring != na->tailroom; kring++) { + mtx_destroy(&kring->q_lock); } - if (nma_is_hw(na)) - nm_alloc_bdgfwd(SWNA(na->ifp)); - return 0; + free(na->tx_rings, M_DEVBUF); + na->tx_rings = na->rx_rings = na->tailroom = NULL; } /* - * Fetch configuration from the device, to cope with dynamic - * reconfigurations after loading the module. + * Destructor for NIC ports. They also have an mbuf queue + * on the rings connected to the host so we need to purge + * them first. 
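The single-allocation layout built by netmap_krings_create() above (tx krings, then rx krings, each with one extra host kring, then tailroom) can be shown with plain pointer arithmetic. A hedged userspace model; struct kring here is a dummy and the sizes are arbitrary:

	#include <stdio.h>
	#include <stdlib.h>

	struct kring { unsigned ring_id; };

	int
	main(void)
	{
		unsigned num_tx = 4, num_rx = 4, tailroom = 128;
		unsigned ntx = num_tx + 1, nrx = num_rx + 1;	/* +1 for the host rings */
		size_t len = (ntx + nrx) * sizeof(struct kring) + tailroom;

		struct kring *tx_rings = calloc(1, len);
		if (tx_rings == NULL)
			return 1;
		struct kring *rx_rings = tx_rings + ntx;	/* rx follows tx */
		char *tail = (char *)(rx_rings + nrx);		/* leases live here */

		printf("tx %p rx %p tailroom %p (%zu bytes total)\n",
		    (void *)tx_rings, (void *)rx_rings, (void *)tail, len);
		free(tx_rings);
		return 0;
	}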
*/ -static int -netmap_update_config(struct netmap_adapter *na) +static void +netmap_hw_krings_delete(struct netmap_adapter *na) { - struct ifnet *ifp = na->ifp; - u_int txr, txd, rxr, rxd; + struct mbq *q = &na->rx_rings[na->num_rx_rings].rx_queue; - txr = txd = rxr = rxd = 0; - if (na->nm_config) { - na->nm_config(ifp, &txr, &txd, &rxr, &rxd); - } else { - /* take whatever we had at init time */ - txr = na->num_tx_rings; - txd = na->num_tx_desc; - rxr = na->num_rx_rings; - rxd = na->num_rx_desc; - } - - if (na->num_tx_rings == txr && na->num_tx_desc == txd && - na->num_rx_rings == rxr && na->num_rx_desc == rxd) - return 0; /* nothing changed */ - if (netmap_verbose || na->refcount > 0) { - D("stored config %s: txring %d x %d, rxring %d x %d", - ifp->if_xname, - na->num_tx_rings, na->num_tx_desc, - na->num_rx_rings, na->num_rx_desc); - D("new config %s: txring %d x %d, rxring %d x %d", - ifp->if_xname, txr, txd, rxr, rxd); - } - if (na->refcount == 0) { - D("configuration changed (but fine)"); - na->num_tx_rings = txr; - na->num_tx_desc = txd; - na->num_rx_rings = rxr; - na->num_rx_desc = rxd; - return 0; - } - D("configuration changed while active, this is bad..."); - return 1; + ND("destroy sw mbq with len %d", mbq_len(q)); + mbq_purge(q); + mbq_safe_destroy(q); + netmap_krings_delete(na); } -static struct netmap_if * + +static struct netmap_if* netmap_if_new(const char *ifname, struct netmap_adapter *na) { + struct netmap_if *nifp; + if (netmap_update_config(na)) { /* configuration mismatch, report and fail */ return NULL; } - return netmap_mem_if_new(ifname, na); -} + if (na->active_fds) + goto final; -/* Structure associated to each thread which registered an interface. - * - * The first 4 fields of this structure are written by NIOCREGIF and - * read by poll() and NIOC?XSYNC. - * There is low contention among writers (actually, a correct user program - * should have no contention among writers) and among writers and readers, - * so we use a single global lock to protect the structure initialization. - * Since initialization involves the allocation of memory, we reuse the memory - * allocator lock. - * Read access to the structure is lock free. Readers must check that - * np_nifp is not NULL before using the other fields. - * If np_nifp is NULL initialization has not been performed, so they should - * return an error to userlevel. - * - * The ref_done field is used to regulate access to the refcount in the - * memory allocator. The refcount must be incremented at most once for - * each open("/dev/netmap"). The increment is performed by the first - * function that calls netmap_get_memory() (currently called by - * mmap(), NIOCGINFO and NIOCREGIF). - * If the refcount is incremented, it is then decremented when the - * private structure is destroyed. - */ -struct netmap_priv_d { - struct netmap_if * volatile np_nifp; /* netmap if descriptor. */ + if (na->nm_krings_create(na)) + goto cleanup; - struct ifnet *np_ifp; /* device for which we hold a ref. 
*/ - int np_ringid; /* from the ioctl */ - u_int np_qfirst, np_qlast; /* range of rings to scan */ - uint16_t np_txpoll; + if (netmap_mem_rings_create(na)) + goto cleanup; - struct netmap_mem_d *np_mref; /* use with NMG_LOCK held */ -#ifdef __FreeBSD__ - int np_refcount; /* use with NMG_LOCK held */ -#endif /* __FreeBSD__ */ -}; +final: + nifp = netmap_mem_if_new(ifname, na); + if (nifp == NULL) + goto cleanup; + + return (nifp); + +cleanup: + + if (na->active_fds == 0) { + netmap_mem_rings_delete(na); + na->nm_krings_delete(na); + } + + return NULL; +} + + /* grab a reference to the memory allocator, if we don't have one already. The * reference is taken from the netmap_adapter registered with the priv. * */ static int netmap_get_memory_locked(struct netmap_priv_d* p) { struct netmap_mem_d *nmd; int error = 0; - if (p->np_ifp == NULL) { + if (p->np_na == NULL) { if (!netmap_mmap_unreg) return ENODEV; /* for compatibility with older versions of the API * we use the global allocator when no interface has been * registered */ nmd = &nm_mem; } else { - nmd = NA(p->np_ifp)->nm_mem; + nmd = p->np_na->nm_mem; } if (p->np_mref == NULL) { error = netmap_mem_finalize(nmd); if (!error) p->np_mref = nmd; } else if (p->np_mref != nmd) { /* a virtual port has been registered, but previous * syscalls already used the global allocator. * We cannot continue */ error = ENODEV; } return error; } -static int + +int netmap_get_memory(struct netmap_priv_d* p) { int error; NMG_LOCK(); error = netmap_get_memory_locked(p); NMG_UNLOCK(); return error; } + static int netmap_have_memory_locked(struct netmap_priv_d* p) { return p->np_mref != NULL; } + static void netmap_drop_memory_locked(struct netmap_priv_d* p) { if (p->np_mref) { netmap_mem_deref(p->np_mref); p->np_mref = NULL; } } + /* * File descriptor's private data destructor. * * Call nm_register(ifp,0) to stop netmap mode on the interface and - * revert to normal operation. We expect that np_ifp has not gone. + * revert to normal operation. We expect that np_na->ifp has not gone. * The second argument is the nifp to work on. In some cases it is * not attached yet to the netmap_priv_d so we need to pass it as * a separate argument. */ /* call with NMG_LOCK held */ static void netmap_do_unregif(struct netmap_priv_d *priv, struct netmap_if *nifp) { - struct ifnet *ifp = priv->np_ifp; - struct netmap_adapter *na = NA(ifp); + struct netmap_adapter *na = priv->np_na; + struct ifnet *ifp = na->ifp; NMG_LOCK_ASSERT(); - na->refcount--; - if (na->refcount <= 0) { /* last instance */ - u_int i; + na->active_fds--; + if (na->active_fds <= 0) { /* last instance */ if (netmap_verbose) - D("deleting last instance for %s", ifp->if_xname); + D("deleting last instance for %s", NM_IFPNAME(ifp)); /* * (TO CHECK) This function is only called * when the last reference to this file descriptor goes * away. This means we cannot have any pending poll() * or interrupt routine operating on the structure. * XXX The file may be closed in a thread while * another thread is using it. * Linux keeps the file opened until the last reference * by any outstanding ioctl/poll or mmap is gone. * FreeBSD does not track mmap()s (but we do) and * wakes up any sleeping poll(). Need to check what * happens if the close() occurs while a concurrent * syscall is running. */ - na->nm_register(ifp, 0); /* off, clear IFCAP_NETMAP */ + if (ifp) + na->nm_register(na, 0); /* off, clear flags */ /* Wake up any sleeping threads. 
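The active_fds logic in netmap_if_new() and netmap_do_unregif() above follows a first-user/last-user pattern: only the first file descriptor bound to a port creates krings and rings, and only the last one tears them down. A hedged, single-threaded model (the kernel does this under NMG_LOCK; port_register/port_unregister are local names):

	#include <stdio.h>

	static int active_fds;

	static void
	port_register(void)
	{
		if (active_fds == 0)		/* first user builds the shared state */
			printf("create krings+rings\n");
		active_fds++;
	}

	static void
	port_unregister(void)
	{
		if (--active_fds == 0)		/* last user frees it */
			printf("delete rings+krings\n");
	}

	int
	main(void)
	{
		port_register();
		port_register();	/* second fd shares the existing rings */
		port_unregister();
		port_unregister();	/* last close frees everything */
		return 0;
	}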
netmap_poll will * then return POLLERR * XXX The wake up now must happen during *_down(), when * we order all activities to stop. -gl */ - nm_free_bdgfwd(na); - for (i = 0; i < na->num_tx_rings + 1; i++) { - mtx_destroy(&na->tx_rings[i].q_lock); - } - for (i = 0; i < na->num_rx_rings + 1; i++) { - mtx_destroy(&na->rx_rings[i].q_lock); - } /* XXX kqueue(9) needed; these will mirror knlist_init. */ /* knlist_destroy(&na->tx_si.si_note); */ /* knlist_destroy(&na->rx_si.si_note); */ - if (nma_is_hw(na)) - SWNA(ifp)->tx_rings = SWNA(ifp)->rx_rings = NULL; + + /* delete rings and buffers */ + netmap_mem_rings_delete(na); + na->nm_krings_delete(na); } - /* - * netmap_mem_if_delete() deletes the nifp, and if this is - * the last instance also buffers, rings and krings. - */ + /* delete the nifp */ netmap_mem_if_delete(na, nifp); } - -/* we assume netmap adapter exists - * Called with NMG_LOCK held - */ -static void -nm_if_rele(struct ifnet *ifp) +static __inline int +nm_tx_si_user(struct netmap_priv_d *priv) { - int i, is_hw, hw, sw, lim; - struct nm_bridge *b; - struct netmap_adapter *na; - uint8_t tmp[NM_BDG_MAXPORTS]; + return (priv->np_na != NULL && + (priv->np_txqlast - priv->np_txqfirst > 1)); +} - NMG_LOCK_ASSERT(); - /* I can be called not only for get_ifp()-ed references where netmap's - * capability is guaranteed, but also for non-netmap-capable NICs. - */ - if (!NETMAP_CAPABLE(ifp) || !NA(ifp)->na_bdg) { - if_rele(ifp); - return; - } - na = NA(ifp); - b = na->na_bdg; - is_hw = nma_is_hw(na); - - ND("%s has %d references", ifp->if_xname, NA(ifp)->na_bdg_refcount); - - if (!DROP_BDG_REF(ifp)) - return; - - /* - New algorithm: - make a copy of bdg_port_index; - lookup NA(ifp)->bdg_port and SWNA(ifp)->bdg_port - in the array of bdg_port_index, replacing them with - entries from the bottom of the array; - decrement bdg_active_ports; - acquire BDG_WLOCK() and copy back the array. - */ - - hw = NA(ifp)->bdg_port; - sw = (is_hw && SWNA(ifp)->na_bdg) ? SWNA(ifp)->bdg_port : -1; - lim = b->bdg_active_ports; - - ND("detach %d and %d (lim %d)", hw, sw, lim); - /* make a copy of the list of active ports, update it, - * and then copy back within BDG_WLOCK(). - */ - memcpy(tmp, b->bdg_port_index, sizeof(tmp)); - for (i = 0; (hw >= 0 || sw >= 0) && i < lim; ) { - if (hw >= 0 && tmp[i] == hw) { - ND("detach hw %d at %d", hw, i); - lim--; /* point to last active port */ - tmp[i] = tmp[lim]; /* swap with i */ - tmp[lim] = hw; /* now this is inactive */ - hw = -1; - } else if (sw >= 0 && tmp[i] == sw) { - ND("detach sw %d at %d", sw, i); - lim--; - tmp[i] = tmp[lim]; - tmp[lim] = sw; - sw = -1; - } else { - i++; - } - } - if (hw >= 0 || sw >= 0) { - D("XXX delete failed hw %d sw %d, should panic...", hw, sw); - } - hw = NA(ifp)->bdg_port; - sw = (is_hw && SWNA(ifp)->na_bdg) ? 
SWNA(ifp)->bdg_port : -1; - - BDG_WLOCK(b); - b->bdg_ports[hw] = NULL; - na->na_bdg = NULL; - if (sw >= 0) { - b->bdg_ports[sw] = NULL; - SWNA(ifp)->na_bdg = NULL; - } - memcpy(b->bdg_port_index, tmp, sizeof(tmp)); - b->bdg_active_ports = lim; - BDG_WUNLOCK(b); - - ND("now %d active ports", lim); - if (lim == 0) { - ND("marking bridge %s as free", b->bdg_basename); - b->nm_bdg_lookup = NULL; - } - - if (is_hw) { - if_rele(ifp); - } else { - if (na->na_flags & NAF_MEM_OWNER) - netmap_mem_private_delete(na->nm_mem); - bzero(na, sizeof(*na)); - free(na, M_DEVBUF); - bzero(ifp, sizeof(*ifp)); - free(ifp, M_DEVBUF); - } +static __inline int +nm_rx_si_user(struct netmap_priv_d *priv) +{ + return (priv->np_na != NULL && + (priv->np_rxqlast - priv->np_rxqfirst > 1)); } /* * returns 1 if this is the last instance and we can free priv */ -static int +int netmap_dtor_locked(struct netmap_priv_d *priv) { - struct ifnet *ifp = priv->np_ifp; + struct netmap_adapter *na = priv->np_na; #ifdef __FreeBSD__ /* * np_refcount is the number of active mmaps on * this file descriptor */ if (--priv->np_refcount > 0) { return 0; } #endif /* __FreeBSD__ */ - if (ifp) { - netmap_do_unregif(priv, priv->np_nifp); + if (!na) { + return 1; //XXX is it correct? } + netmap_do_unregif(priv, priv->np_nifp); + priv->np_nifp = NULL; netmap_drop_memory_locked(priv); - if (ifp) { - nm_if_rele(ifp); /* might also destroy *na */ + if (priv->np_na) { + if (nm_tx_si_user(priv)) + na->tx_si_users--; + if (nm_rx_si_user(priv)) + na->rx_si_users--; + netmap_adapter_put(na); + priv->np_na = NULL; } return 1; } -static void + +void netmap_dtor(void *data) { struct netmap_priv_d *priv = data; int last_instance; NMG_LOCK(); last_instance = netmap_dtor_locked(priv); NMG_UNLOCK(); if (last_instance) { bzero(priv, sizeof(*priv)); /* for safety */ free(priv, M_DEVBUF); } } -#ifdef __FreeBSD__ -/* - * In order to track whether pages are still mapped, we hook into - * the standard cdev_pager and intercept the constructor and - * destructor. - */ -struct netmap_vm_handle_t { - struct cdev *dev; - struct netmap_priv_d *priv; -}; - -static int -netmap_dev_pager_ctor(void *handle, vm_ooffset_t size, vm_prot_t prot, - vm_ooffset_t foff, struct ucred *cred, u_short *color) -{ - struct netmap_vm_handle_t *vmh = handle; - D("handle %p size %jd prot %d foff %jd", - handle, (intmax_t)size, prot, (intmax_t)foff); - dev_ref(vmh->dev); - return 0; -} - - -static void -netmap_dev_pager_dtor(void *handle) -{ - struct netmap_vm_handle_t *vmh = handle; - struct cdev *dev = vmh->dev; - struct netmap_priv_d *priv = vmh->priv; - D("handle %p", handle); - netmap_dtor(priv); - free(vmh, M_DEVBUF); - dev_rel(dev); -} - -static int -netmap_dev_pager_fault(vm_object_t object, vm_ooffset_t offset, - int prot, vm_page_t *mres) -{ - struct netmap_vm_handle_t *vmh = object->handle; - struct netmap_priv_d *priv = vmh->priv; - vm_paddr_t paddr; - vm_page_t page; - vm_memattr_t memattr; - vm_pindex_t pidx; - - ND("object %p offset %jd prot %d mres %p", - object, (intmax_t)offset, prot, mres); - memattr = object->memattr; - pidx = OFF_TO_IDX(offset); - paddr = netmap_mem_ofstophys(priv->np_mref, offset); - if (paddr == 0) - return VM_PAGER_FAIL; - - if (((*mres)->flags & PG_FICTITIOUS) != 0) { - /* - * If the passed in result page is a fake page, update it with - * the new physical address. - */ - page = *mres; - vm_page_updatefake(page, paddr, memattr); - } else { - /* - * Replace the passed in reqpage page with our own fake page and - * free up the all of the original pages. 
- */ -#ifndef VM_OBJECT_WUNLOCK /* FreeBSD < 10.x */ -#define VM_OBJECT_WUNLOCK VM_OBJECT_UNLOCK -#define VM_OBJECT_WLOCK VM_OBJECT_LOCK -#endif /* VM_OBJECT_WUNLOCK */ - - VM_OBJECT_WUNLOCK(object); - page = vm_page_getfake(paddr, memattr); - VM_OBJECT_WLOCK(object); - vm_page_lock(*mres); - vm_page_free(*mres); - vm_page_unlock(*mres); - *mres = page; - vm_page_insert(page, object, pidx); - } - page->valid = VM_PAGE_BITS_ALL; - return (VM_PAGER_OK); -} - - -static struct cdev_pager_ops netmap_cdev_pager_ops = { - .cdev_pg_ctor = netmap_dev_pager_ctor, - .cdev_pg_dtor = netmap_dev_pager_dtor, - .cdev_pg_fault = netmap_dev_pager_fault, -}; - - -static int -netmap_mmap_single(struct cdev *cdev, vm_ooffset_t *foff, - vm_size_t objsize, vm_object_t *objp, int prot) -{ - int error; - struct netmap_vm_handle_t *vmh; - struct netmap_priv_d *priv; - vm_object_t obj; - - D("cdev %p foff %jd size %jd objp %p prot %d", cdev, - (intmax_t )*foff, (intmax_t )objsize, objp, prot); - - vmh = malloc(sizeof(struct netmap_vm_handle_t), M_DEVBUF, - M_NOWAIT | M_ZERO); - if (vmh == NULL) - return ENOMEM; - vmh->dev = cdev; - - NMG_LOCK(); - error = devfs_get_cdevpriv((void**)&priv); - if (error) - goto err_unlock; - vmh->priv = priv; - priv->np_refcount++; - NMG_UNLOCK(); - - error = netmap_get_memory(priv); - if (error) - goto err_deref; - - obj = cdev_pager_allocate(vmh, OBJT_DEVICE, - &netmap_cdev_pager_ops, objsize, prot, - *foff, NULL); - if (obj == NULL) { - D("cdev_pager_allocate failed"); - error = EINVAL; - goto err_deref; - } - - *objp = obj; - return 0; - -err_deref: - NMG_LOCK(); - priv->np_refcount--; -err_unlock: - NMG_UNLOCK(); -// err: - free(vmh, M_DEVBUF); - return error; -} - - -// XXX can we remove this ? -static int -netmap_close(struct cdev *dev, int fflag, int devtype, struct thread *td) -{ - if (netmap_verbose) - D("dev %p fflag 0x%x devtype %d td %p", - dev, fflag, devtype, td); - return 0; -} - - -static int -netmap_open(struct cdev *dev, int oflags, int devtype, struct thread *td) -{ - struct netmap_priv_d *priv; - int error; - - (void)dev; - (void)oflags; - (void)devtype; - (void)td; - - // XXX wait or nowait ? - priv = malloc(sizeof(struct netmap_priv_d), M_DEVBUF, - M_NOWAIT | M_ZERO); - if (priv == NULL) - return ENOMEM; - - error = devfs_set_cdevpriv(priv, netmap_dtor); - if (error) - return error; - - priv->np_refcount = 1; - - return 0; -} -#endif /* __FreeBSD__ */ - - /* * Handlers for synchronization of the queues from/to the host. * Netmap has two operating modes: * - in the default mode, the rings connected to the host stack are * just another ring pair managed by userspace; * - in transparent mode (XXX to be defined) incoming packets * (from the host or the NIC) are marked as NS_FORWARD upon * arrival, and the user application has a chance to reset the * flag for packets that should be dropped. * On the RXSYNC or poll(), packets in RX rings between * kring->nr_kcur and ring->cur with NS_FORWARD still set are moved * to the other side. * The transfer NIC --> host is relatively easy, just encapsulate * into mbufs and we are done. The host --> NIC side is slightly * harder because there might not be room in the tx ring so it * might take a while before releasing the buffer. */ /* * pass a chain of buffers to the host stack as coming from 'dst' + * We do not need to lock because the queue is private. 
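netmap_send_up() below drains an mbq, the small FIFO that replaces the old open-coded struct mbq {head, tail, count}. A hedged userspace model of the queue discipline, with plain pointers instead of mbufs and no lock, matching the "queue is private" comment; mbq_init/mbq_enqueue/mbq_dequeue here are local reimplementations:

	#include <stdio.h>

	struct pkt { struct pkt *next; int len; };
	struct mbq { struct pkt *head, *tail; int count; };

	static void
	mbq_init(struct mbq *q)
	{
		q->head = q->tail = NULL;
		q->count = 0;
	}

	static void
	mbq_enqueue(struct mbq *q, struct pkt *p)
	{
		p->next = NULL;
		if (q->tail)
			q->tail->next = p;
		else
			q->head = p;
		q->tail = p;
		q->count++;
	}

	static struct pkt *
	mbq_dequeue(struct mbq *q)
	{
		struct pkt *p = q->head;

		if (p) {
			q->head = p->next;
			if (q->head == NULL)
				q->tail = NULL;
			q->count--;
		}
		return p;
	}

	int
	main(void)
	{
		struct mbq q;
		struct pkt a = { NULL, 60 }, b = { NULL, 1514 };
		struct pkt *p;

		mbq_init(&q);
		mbq_enqueue(&q, &a);
		mbq_enqueue(&q, &b);
		while ((p = mbq_dequeue(&q)) != NULL)	/* FIFO: 60 then 1514 */
			printf("len %d\n", p->len);
		return 0;
	}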
*/ static void -netmap_send_up(struct ifnet *dst, struct mbuf *head) +netmap_send_up(struct ifnet *dst, struct mbq *q) { struct mbuf *m; /* send packets up, outside the lock */ - while ((m = head) != NULL) { - head = head->m_nextpkt; - m->m_nextpkt = NULL; + while ((m = mbq_dequeue(q)) != NULL) { if (netmap_verbose & NM_VERB_HOST) D("sending up pkt %p size %d", m, MBUF_LEN(m)); NM_SEND_UP(dst, m); } + mbq_destroy(q); } -struct mbq { - struct mbuf *head; - struct mbuf *tail; - int count; -}; - /* * put a copy of the buffers marked NS_FORWARD into an mbuf chain. - * Run from hwcur to cur - reserved + * Take packets from hwcur to ring->head marked NS_FORWARD (or forced) + * and pass them up. Drop remaining packets in the unlikely event + * of an mbuf shortage. */ static void netmap_grab_packets(struct netmap_kring *kring, struct mbq *q, int force) { - /* Take packets from hwcur to cur-reserved and pass them up. - * In case of no buffers we give up. At the end of the loop, - * the queue is drained in all cases. - * XXX handle reserved - */ - u_int lim = kring->nkr_num_slots - 1; - struct mbuf *m, *tail = q->tail; - u_int k = kring->ring->cur, n = kring->ring->reserved; - struct netmap_mem_d *nmd = kring->na->nm_mem; + u_int const lim = kring->nkr_num_slots - 1; + u_int const head = kring->ring->head; + u_int n; + struct netmap_adapter *na = kring->na; - /* compute the final position, ring->cur - ring->reserved */ - if (n > 0) { - if (k < n) - k += kring->nkr_num_slots; - k += n; - } - for (n = kring->nr_hwcur; n != k;) { + for (n = kring->nr_hwcur; n != head; n = nm_next(n, lim)) { + struct mbuf *m; struct netmap_slot *slot = &kring->ring->slot[n]; - n = nm_next(n, lim); if ((slot->flags & NS_FORWARD) == 0 && !force) continue; - if (slot->len < 14 || slot->len > NETMAP_BDG_BUF_SIZE(nmd)) { - D("bad pkt at %d len %d", n, slot->len); + if (slot->len < 14 || slot->len > NETMAP_BDG_BUF_SIZE(na->nm_mem)) { + RD(5, "bad pkt at %d len %d", n, slot->len); continue; } slot->flags &= ~NS_FORWARD; // XXX needed ? - /* XXX adapt to the case of a multisegment packet */ - m = m_devget(BDG_NMB(nmd, slot), slot->len, 0, kring->na->ifp, NULL); + /* XXX TODO: adapt to the case of a multisegment packet */ + m = m_devget(BDG_NMB(na, slot), slot->len, 0, na->ifp, NULL); if (m == NULL) break; - if (tail) - tail->m_nextpkt = m; - else - q->head = m; - tail = m; - q->count++; - m->m_nextpkt = NULL; + mbq_enqueue(q, m); } - q->tail = tail; } /* - * The host ring has packets from nr_hwcur to (cur - reserved) - * to be sent down to the NIC. - * We need to use the queue lock on the source (host RX ring) - * to protect against netmap_transmit. - * If the user is well behaved we do not need to acquire locks - * on the destination(s), - * so we only need to make sure that there are no panics because - * of user errors. - * XXX verify - * - * We scan the tx rings, which have just been - * flushed so nr_hwcur == cur. Pushing packets down means - * increment cur and decrement avail. 
- * XXX to be verified + * Send to the NIC rings packets marked NS_FORWARD between + * kring->nr_hwcur and kring->rhead + * Called under kring->rx_queue.lock on the sw rx ring, */ -static void +static u_int netmap_sw_to_nic(struct netmap_adapter *na) { struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings]; - struct netmap_kring *k1 = &na->tx_rings[0]; - u_int i, howmany, src_lim, dst_lim; + struct netmap_slot *rxslot = kring->ring->slot; + u_int i, rxcur = kring->nr_hwcur; + u_int const head = kring->rhead; + u_int const src_lim = kring->nkr_num_slots - 1; + u_int sent = 0; - /* XXX we should also check that the carrier is on */ - if (kring->nkr_stopped) - return; + /* scan rings to find space, then fill as much as possible */ + for (i = 0; i < na->num_tx_rings; i++) { + struct netmap_kring *kdst = &na->tx_rings[i]; + struct netmap_ring *rdst = kdst->ring; + u_int const dst_lim = kdst->nkr_num_slots - 1; - mtx_lock(&kring->q_lock); + /* XXX do we trust ring or kring->rcur,rtail ? */ + for (; rxcur != head && !nm_ring_empty(rdst); + rxcur = nm_next(rxcur, src_lim) ) { + struct netmap_slot *src, *dst, tmp; + u_int dst_cur = rdst->cur; - if (kring->nkr_stopped) - goto out; + src = &rxslot[rxcur]; + if ((src->flags & NS_FORWARD) == 0 && !netmap_fwd) + continue; - howmany = kring->nr_hwavail; /* XXX otherwise cur - reserved - nr_hwcur */ + sent++; - src_lim = kring->nkr_num_slots - 1; - for (i = 0; howmany > 0 && i < na->num_tx_rings; i++, k1++) { - ND("%d packets left to ring %d (space %d)", howmany, i, k1->nr_hwavail); - dst_lim = k1->nkr_num_slots - 1; - while (howmany > 0 && k1->ring->avail > 0) { - struct netmap_slot *src, *dst, tmp; - src = &kring->ring->slot[kring->nr_hwcur]; - dst = &k1->ring->slot[k1->ring->cur]; + dst = &rdst->slot[dst_cur]; + tmp = *src; + src->buf_idx = dst->buf_idx; src->flags = NS_BUF_CHANGED; dst->buf_idx = tmp.buf_idx; dst->len = tmp.len; dst->flags = NS_BUF_CHANGED; - ND("out len %d buf %d from %d to %d", - dst->len, dst->buf_idx, - kring->nr_hwcur, k1->ring->cur); - kring->nr_hwcur = nm_next(kring->nr_hwcur, src_lim); - howmany--; - kring->nr_hwavail--; - k1->ring->cur = nm_next(k1->ring->cur, dst_lim); - k1->ring->avail--; + rdst->cur = nm_next(dst_cur, dst_lim); } - kring->ring->cur = kring->nr_hwcur; // XXX - k1++; // XXX why? + /* if (sent) XXX txsync ? */ } -out: - mtx_unlock(&kring->q_lock); + return sent; } /* * netmap_txsync_to_host() passes packets up. We are called from a * system call in user process context, and the only contention * can be among multiple user threads erroneously calling * this routine concurrently. */ -static void +void netmap_txsync_to_host(struct netmap_adapter *na) { struct netmap_kring *kring = &na->tx_rings[na->num_tx_rings]; struct netmap_ring *ring = kring->ring; - u_int k, lim = kring->nkr_num_slots - 1; - struct mbq q = { NULL, NULL, 0 }; + u_int const lim = kring->nkr_num_slots - 1; + u_int const head = kring->rhead; + struct mbq q; - if (nm_kr_tryget(kring)) { - D("ring %p busy (user error)", kring); - return; - } - k = ring->cur; - if (k > lim) { - D("invalid ring index in stack TX kring %p", kring); - netmap_ring_reinit(kring); - nm_kr_put(kring); - return; - } - - /* Take packets from hwcur to cur and pass them up. + /* Take packets from hwcur to head and pass them up. + * force head = cur since netmap_grab_packets() stops at head * In case of no buffers we give up. At the end of the loop, * the queue is drained in all cases. 
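 *
 * Worked example of the wraparound arithmetic below (illustrative):
 * with nkr_num_slots = 1024 (so lim = 1023) and head = 1000,
 * nr_hwtail = 1000 + 1023 = 2023, which exceeds lim and wraps to
 * 2023 - 1024 = 999, i.e. the slot just before head: the whole
 * ring is reported free to userspace again.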
*/ - netmap_grab_packets(kring, &q, 1); - kring->nr_hwcur = k; - kring->nr_hwavail = ring->avail = lim; + mbq_init(&q); + ring->cur = head; + netmap_grab_packets(kring, &q, 1 /* force */); + ND("have %d pkts in queue", mbq_len(&q)); + kring->nr_hwcur = head; + kring->nr_hwtail = head + lim; + if (kring->nr_hwtail > lim) + kring->nr_hwtail -= lim + 1; + nm_txsync_finalize(kring); - nm_kr_put(kring); - netmap_send_up(na->ifp, q.head); + netmap_send_up(na->ifp, &q); } /* - * This is the 'txsync' handler to send from a software ring to the - * host stack. - */ -/* SWNA(ifp)->txrings[0] is always NA(ifp)->txrings[NA(ifp)->num_txrings] */ -static int -netmap_bdg_to_host(struct ifnet *ifp, u_int ring_nr, int flags) -{ - (void)ring_nr; - (void)flags; - if (netmap_verbose > 255) - RD(5, "sync to host %s ring %d", ifp->if_xname, ring_nr); - netmap_txsync_to_host(NA(ifp)); - return 0; -} - - -/* * rxsync backend for packets coming from the host stack. - * They have been put in the queue by netmap_transmit() so we - * need to protect access to the kring using a lock. + * They have been put in kring->rx_queue by netmap_transmit(). + * We protect access to the kring using kring->rx_queue.lock * * This routine also does the selrecord if called from the poll handler * (we know because td != NULL). * * NOTE: on linux, selrecord() is defined as a macro and uses pwait * as an additional hidden argument. + * returns the number of packets delivered to tx queues in + * transparent mode, or a negative value if error */ -static void +int netmap_rxsync_from_host(struct netmap_adapter *na, struct thread *td, void *pwait) { struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings]; struct netmap_ring *ring = kring->ring; - u_int j, n, lim = kring->nkr_num_slots; - u_int k = ring->cur, resvd = ring->reserved; + u_int nm_i, n; + u_int const lim = kring->nkr_num_slots - 1; + u_int const head = kring->rhead; + int ret = 0; + struct mbq *q = &kring->rx_queue; (void)pwait; /* disable unused warnings */ + (void)td; - if (kring->nkr_stopped) /* check a first time without lock */ - return; + mtx_lock(&q->lock); - /* XXX as an optimization we could reuse na->core_lock */ - mtx_lock(&kring->q_lock); + /* First part: import newly received packets */ + n = mbq_len(q); + if (n) { /* grab packets from the queue */ + struct mbuf *m; + uint32_t stop_i; - if (kring->nkr_stopped) /* check again with lock held */ - goto unlock_out; + nm_i = kring->nr_hwtail; + stop_i = nm_prev(nm_i, lim); + while ( nm_i != stop_i && (m = mbq_dequeue(q)) != NULL ) { + int len = MBUF_LEN(m); + struct netmap_slot *slot = &ring->slot[nm_i]; - if (k >= lim) { - netmap_ring_reinit(kring); - goto unlock_out; - } - /* new packets are already set in nr_hwavail */ - /* skip past packets that userspace has released */ - j = kring->nr_hwcur; - if (resvd > 0) { - if (resvd + ring->avail >= lim + 1) { - D("XXX invalid reserve/avail %d %d", resvd, ring->avail); - ring->reserved = resvd = 0; // XXX panic... + m_copydata(m, 0, len, BDG_NMB(na, slot)); + ND("nm %d len %d", nm_i, len); + if (netmap_verbose) + D("%s", nm_dump_buf(BDG_NMB(na, slot),len, 128, NULL)); + + slot->len = len; + slot->flags = kring->nkr_slot_flags; + nm_i = nm_next(nm_i, lim); } - k = (k >= resvd) ? k - resvd : k + lim - resvd; - } - if (j != k) { - n = k >= j ? 
k - j : k + lim - j; - kring->nr_hwavail -= n; - kring->nr_hwcur = k; + kring->nr_hwtail = nm_i; } - k = ring->avail = kring->nr_hwavail - resvd; - if (k == 0 && td) + + /* + * Second part: skip past packets that userspace has released. + */ + nm_i = kring->nr_hwcur; + if (nm_i != head) { /* something was released */ + if (netmap_fwd || kring->ring->flags & NR_FORWARD) + ret = netmap_sw_to_nic(na); + kring->nr_hwcur = head; + } + + nm_rxsync_finalize(kring); + + /* access copies of cur,tail in the kring */ + if (kring->rcur == kring->rtail && td) /* no bufs available */ selrecord(td, &kring->si); - if (k && (netmap_verbose & NM_VERB_HOST)) - D("%d pkts from stack", k); -unlock_out: - mtx_unlock(&kring->q_lock); + mtx_unlock(&q->lock); + return ret; } +/* Get a netmap adapter for the port. + * + * If it is possible to satisfy the request, return 0 + * with *na containing the netmap adapter found. + * Otherwise return an error code, with *na containing NULL. + * + * When the port is attached to a bridge, we always return + * EBUSY. + * Otherwise, if the port is already bound to a file descriptor, + * then we unconditionally return the existing adapter into *na. + * In all the other cases, we return (into *na) either native, + * generic or NULL, according to the following table: + * + * native_support + * active_fds dev.netmap.admode YES NO + * ------------------------------------------------------- + * >0 * NA(ifp) NA(ifp) + * + * 0 NETMAP_ADMODE_BEST NATIVE GENERIC + * 0 NETMAP_ADMODE_NATIVE NATIVE NULL + * 0 NETMAP_ADMODE_GENERIC GENERIC GENERIC + * + */ + +int +netmap_get_hw_na(struct ifnet *ifp, struct netmap_adapter **na) +{ + /* generic support */ + int i = netmap_admode; /* Take a snapshot. */ + int error = 0; + struct netmap_adapter *prev_na; + struct netmap_generic_adapter *gna; + + *na = NULL; /* default */ + + /* reset in case of invalid value */ + if (i < NETMAP_ADMODE_BEST || i >= NETMAP_ADMODE_LAST) + i = netmap_admode = NETMAP_ADMODE_BEST; + + if (NETMAP_CAPABLE(ifp)) { + /* If an adapter already exists, but is + * attached to a vale port, we report that the + * port is busy. + */ + if (NETMAP_OWNED_BY_KERN(NA(ifp))) + return EBUSY; + + /* If an adapter already exists, return it if + * there are active file descriptors or if + * netmap is not forced to use generic + * adapters. + */ + if (NA(ifp)->active_fds > 0 || + i != NETMAP_ADMODE_GENERIC) { + *na = NA(ifp); + return 0; + } + } + + /* If there isn't native support and netmap is not allowed + * to use generic adapters, we cannot satisfy the request. + */ + if (!NETMAP_CAPABLE(ifp) && i == NETMAP_ADMODE_NATIVE) + return EOPNOTSUPP; + + /* Otherwise, create a generic adapter and return it, + * saving the previously used netmap adapter, if any. + * + * Note that here 'prev_na', if not NULL, MUST be a + * native adapter, and CANNOT be a generic one. This is + * true because generic adapters are created on demand, and + * destroyed when not used anymore. Therefore, if the adapter + * currently attached to an interface 'ifp' is generic, it + * must be that + * (NA(ifp)->active_fds > 0 || NETMAP_OWNED_BY_KERN(NA(ifp))). + * Consequently, if NA(ifp) is generic, we will enter one of + * the branches above. This ensures that we never override + * a generic adapter with another generic adapter. 
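+ *
+ * Example (an illustrative reading of the table above): with
+ * dev.netmap.admode = NETMAP_ADMODE_BEST and a driver with native
+ * support, the first bind returns NA(ifp); forcing
+ * NETMAP_ADMODE_GENERIC instead creates a generic adapter whose
+ * gna->prev keeps a reference to the native one, so it can be
+ * restored later.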
+ */ + prev_na = NA(ifp); + error = generic_netmap_attach(ifp); + if (error) + return error; + + *na = NA(ifp); + gna = (struct netmap_generic_adapter*)NA(ifp); + gna->prev = prev_na; /* save old na */ + if (prev_na != NULL) { + ifunit_ref(ifp->if_xname); + // XXX add a refcount ? + netmap_adapter_get(prev_na); + } + ND("Created generic NA %p (prev %p)", gna, gna->prev); + + return 0; +} + + /* * MUST BE CALLED UNDER NMG_LOCK() * - * get a refcounted reference to an interface. + * Get a refcounted reference to a netmap adapter attached + * to the interface specified by nmr. * This is always called in the execution of an ioctl(). * - * Return ENXIO if the interface does not exist, EINVAL if netmap - * is not supported by the interface. - * If successful, hold a reference. + * Return ENXIO if the interface specified by the request does + * not exist, ENOTSUP if netmap is not supported by the interface, + * EBUSY if the interface is already attached to a bridge, + * EINVAL if parameters are invalid, ENOMEM if needed resources + * could not be allocated. + * If successful, hold a reference to the netmap adapter. * - * When the NIC is attached to a bridge, reference is managed - * at na->na_bdg_refcount using ADD/DROP_BDG_REF() as well as - * virtual ports. Hence, on the final DROP_BDG_REF(), the NIC - * is detached from the bridge, then ifp's refcount is dropped (this - * is equivalent to that ifp is destroyed in case of virtual ports. - * - * This function uses if_rele() when we want to prevent the NIC from - * being detached from the bridge in error handling. But once refcount - * is acquired by this function, it must be released using nm_if_rele(). + * No reference is kept on the real interface, which may then + * disappear at any time. */ -static int -get_ifp(struct nmreq *nmr, struct ifnet **ifp, int create) +int +netmap_get_na(struct nmreq *nmr, struct netmap_adapter **na, int create) { - const char *name = nmr->nr_name; - int namelen = strlen(name); - struct ifnet *iter = NULL; - int no_prefix = 0; + struct ifnet *ifp = NULL; + int error = 0; + struct netmap_adapter *ret = NULL; - /* first try to see if this is a bridge port. */ - struct nm_bridge *b; - struct netmap_adapter *na; - int i, j, cand = -1, cand2 = -1; - int needed; + *na = NULL; /* default return value */ + /* first try to see if this is a bridge port. */ NMG_LOCK_ASSERT(); - *ifp = NULL; /* default */ - if (strncmp(name, NM_NAME, sizeof(NM_NAME) - 1)) { - no_prefix = 1; /* no VALE prefix */ - goto no_bridge_port; + + error = netmap_get_pipe_na(nmr, na, create); + if (error || *na != NULL) + return error; + + error = netmap_get_bdg_na(nmr, na, create); + if (error) + return error; + + if (*na != NULL) /* valid match in netmap_get_bdg_na() */ + goto pipes; + + ifp = ifunit_ref(nmr->nr_name); + if (ifp == NULL) { + return ENXIO; } - b = nm_find_bridge(name, create); - if (b == NULL) { - D("no bridges available for '%s'", name); - return (ENXIO); + error = netmap_get_hw_na(ifp, &ret); + if (error) + goto out; + + /* Users cannot use the NIC attached to a bridge directly */ + if (NETMAP_OWNED_BY_KERN(ret)) { + error = EBUSY; + goto out; } + *na = ret; + netmap_adapter_get(ret); - /* Now we are sure that name starts with the bridge's name, - * lookup the port in the bridge. We need to scan the entire - * list. It is not important to hold a WLOCK on the bridge - * during the search because NMG_LOCK already guarantees - * that there are no other possible writers. 
+pipes: + error = netmap_pipe_alloc(*na, nmr); + +out: + if (error && ret != NULL) + netmap_adapter_put(ret); + + if (ifp) + if_rele(ifp); + + return error; +} + + +/* + * validate parameters on entry for *_txsync() + * Returns ring->head if ok, or something >= kring->nkr_num_slots + * in case of error. + * + * rhead, rcur and rtail=hwtail are stored from previous round. + * hwcur is the next packet to send to the ring. + * + * We want + * hwcur <= *rhead <= head <= cur <= tail = *rtail <= hwtail + * + * hwcur, rhead, rtail and hwtail are reliable + */ +u_int +nm_txsync_prologue(struct netmap_kring *kring) +{ + struct netmap_ring *ring = kring->ring; + u_int head = ring->head; /* read only once */ + u_int cur = ring->cur; /* read only once */ + u_int n = kring->nkr_num_slots; + + ND(5, "%s kcur %d ktail %d head %d cur %d tail %d", + kring->name, + kring->nr_hwcur, kring->nr_hwtail, + ring->head, ring->cur, ring->tail); +#if 1 /* kernel sanity checks; but we can trust the kring. */ + if (kring->nr_hwcur >= n || kring->rhead >= n || + kring->rtail >= n || kring->nr_hwtail >= n) + goto error; +#endif /* kernel sanity checks */ + /* + * user sanity checks. We only use 'cur', + * A, B, ... are possible positions for cur: + * + * 0 A cur B tail C n-1 + * 0 D tail E cur F n-1 + * + * B, F, D are valid. A, C, E are wrong */ + if (kring->rtail >= kring->rhead) { + /* want rhead <= head <= rtail */ + if (head < kring->rhead || head > kring->rtail) + goto error; + /* and also head <= cur <= rtail */ + if (cur < head || cur > kring->rtail) + goto error; + } else { /* here rtail < rhead */ + /* we need head outside rtail .. rhead */ + if (head > kring->rtail && head < kring->rhead) + goto error; - /* lookup in the local list of ports */ - for (j = 0; j < b->bdg_active_ports; j++) { - i = b->bdg_port_index[j]; - na = b->bdg_ports[i]; - // KASSERT(na != NULL); - iter = na->ifp; - /* XXX make sure the name only contains one : */ - if (!strcmp(iter->if_xname, name) /* virtual port */ || - (namelen > b->bdg_namelen && !strcmp(iter->if_xname, - name + b->bdg_namelen + 1)) /* NIC */) { - ADD_BDG_REF(iter); - ND("found existing if %s refs %d", name, - NA(iter)->na_bdg_refcount); - *ifp = iter; - /* we are done, this is surely netmap capable */ - return 0; + /* two cases now: head <= rtail or head >= rhead */ + if (head <= kring->rtail) { + /* want head <= cur <= rtail */ + if (cur < head || cur > kring->rtail) + goto error; + } else { /* head >= rhead */ + /* cur must be outside rtail..head */ + if (cur > kring->rtail && cur < head) + goto error; } } - /* not found, should we create it?
*/ - if (!create) - return ENXIO; - /* yes we should, see if we have space to attach entries */ - needed = 2; /* in some cases we only need 1 */ - if (b->bdg_active_ports + needed >= NM_BDG_MAXPORTS) { - D("bridge full %d, cannot create new port", b->bdg_active_ports); - return EINVAL; + if (ring->tail != kring->rtail) { + RD(5, "tail overwritten was %d need %d", + ring->tail, kring->rtail); + ring->tail = kring->rtail; } - /* record the next two ports available, but do not allocate yet */ - cand = b->bdg_port_index[b->bdg_active_ports]; - cand2 = b->bdg_port_index[b->bdg_active_ports + 1]; - ND("+++ bridge %s port %s used %d avail %d %d", - b->bdg_basename, name, b->bdg_active_ports, cand, cand2); + kring->rhead = head; + kring->rcur = cur; + return head; - /* - * try see if there is a matching NIC with this name - * (after the bridge's name) - */ - iter = ifunit_ref(name + b->bdg_namelen + 1); - if (!iter) { /* this is a virtual port */ - /* Create a temporary NA with arguments, then - * bdg_netmap_attach() will allocate the real one - * and attach it to the ifp - */ - struct netmap_adapter tmp_na; - int error; +error: + RD(5, "%s kring error: hwcur %d rcur %d hwtail %d cur %d tail %d", + kring->name, + kring->nr_hwcur, + kring->rcur, kring->nr_hwtail, + cur, ring->tail); + return n; +} - if (nmr->nr_cmd) { - /* nr_cmd must be 0 for a virtual port */ - return EINVAL; - } - bzero(&tmp_na, sizeof(tmp_na)); - /* bound checking */ - tmp_na.num_tx_rings = nmr->nr_tx_rings; - nm_bound_var(&tmp_na.num_tx_rings, 1, 1, NM_BDG_MAXRINGS, NULL); - nmr->nr_tx_rings = tmp_na.num_tx_rings; // write back - tmp_na.num_rx_rings = nmr->nr_rx_rings; - nm_bound_var(&tmp_na.num_rx_rings, 1, 1, NM_BDG_MAXRINGS, NULL); - nmr->nr_rx_rings = tmp_na.num_rx_rings; // write back - nm_bound_var(&nmr->nr_tx_slots, NM_BRIDGE_RINGSIZE, - 1, NM_BDG_MAXSLOTS, NULL); - tmp_na.num_tx_desc = nmr->nr_tx_slots; - nm_bound_var(&nmr->nr_rx_slots, NM_BRIDGE_RINGSIZE, - 1, NM_BDG_MAXSLOTS, NULL); - tmp_na.num_rx_desc = nmr->nr_rx_slots; - /* create a struct ifnet for the new port. - * need M_NOWAIT as we are under nma_lock - */ - iter = malloc(sizeof(*iter), M_DEVBUF, M_NOWAIT | M_ZERO); - if (!iter) - return ENOMEM; +/* + * validate parameters on entry for *_rxsync() + * Returns ring->head if ok, kring->nkr_num_slots on error. + * + * For a valid configuration, + * hwcur <= head <= cur <= tail <= hwtail + * + * We only consider head and cur. + * hwcur and hwtail are reliable. + * + */ +u_int +nm_rxsync_prologue(struct netmap_kring *kring) +{ + struct netmap_ring *ring = kring->ring; + uint32_t const n = kring->nkr_num_slots; + uint32_t head, cur; - strcpy(iter->if_xname, name); - tmp_na.ifp = iter; - /* bdg_netmap_attach creates a struct netmap_adapter */ - error = bdg_netmap_attach(&tmp_na); - if (error) { - D("error %d", error); - free(iter, M_DEVBUF); - return error; + ND("%s kc %d kt %d h %d c %d t %d", + kring->name, + kring->nr_hwcur, kring->nr_hwtail, + ring->head, ring->cur, ring->tail); + /* + * Before storing the new values, we should check they do not + * move backwards. 
However: + * - head is not an issue because the previous value is hwcur; + * - cur could in principle go back, however it does not matter + * because we are processing a brand new rxsync() + */ + cur = kring->rcur = ring->cur; /* read only once */ + head = kring->rhead = ring->head; /* read only once */ +#if 1 /* kernel sanity checks */ + if (kring->nr_hwcur >= n || kring->nr_hwtail >= n) + goto error; +#endif /* kernel sanity checks */ + /* user sanity checks */ + if (kring->nr_hwtail >= kring->nr_hwcur) { + /* want hwcur <= rhead <= hwtail */ + if (head < kring->nr_hwcur || head > kring->nr_hwtail) + goto error; + /* and also rhead <= rcur <= hwtail */ + if (cur < head || cur > kring->nr_hwtail) + goto error; + } else { + /* we need rhead outside hwtail..hwcur */ + if (head < kring->nr_hwcur && head > kring->nr_hwtail) + goto error; + /* two cases now: head <= hwtail or head >= hwcur */ + if (head <= kring->nr_hwtail) { + /* want head <= cur <= hwtail */ + if (cur < head || cur > kring->nr_hwtail) + goto error; + } else { + /* cur must be outside hwtail..head */ + if (cur < head && cur > kring->nr_hwtail) + goto error; } - cand2 = -1; /* only need one port */ - } else if (NETMAP_CAPABLE(iter)) { /* this is a NIC */ - /* make sure the NIC is not already in use */ - if (NETMAP_OWNED_BY_ANY(iter)) { - D("NIC %s busy, cannot attach to bridge", - iter->if_xname); - if_rele(iter); /* don't detach from bridge */ - return EINVAL; - } - if (nmr->nr_arg1 != NETMAP_BDG_HOST) - cand2 = -1; /* only need one port */ - } else { /* not a netmap-capable NIC */ - if_rele(iter); /* don't detach from bridge */ - return EINVAL; } - na = NA(iter); - - BDG_WLOCK(b); - na->bdg_port = cand; - ND("NIC %p to bridge port %d", NA(iter), cand); - /* bind the port to the bridge (virtual ports are not active) */ - b->bdg_ports[cand] = na; - na->na_bdg = b; - b->bdg_active_ports++; - if (cand2 >= 0) { - /* also bind the host stack to the bridge */ - b->bdg_ports[cand2] = SWNA(iter); - SWNA(iter)->bdg_port = cand2; - SWNA(iter)->na_bdg = b; - b->bdg_active_ports++; - ND("host %p to bridge port %d", SWNA(iter), cand2); + if (ring->tail != kring->rtail) { + RD(5, "%s tail overwritten was %d need %d", + kring->name, + ring->tail, kring->rtail); + ring->tail = kring->rtail; } - ADD_BDG_REF(iter); // XXX one or two ? - ND("if %s refs %d", name, NA(iter)->na_bdg_refcount); - BDG_WUNLOCK(b); - *ifp = iter; - return 0; + return head; -no_bridge_port: - *ifp = iter; - if (! *ifp) - *ifp = ifunit_ref(name); - if (*ifp == NULL) - return (ENXIO); - - if (NETMAP_CAPABLE(*ifp)) { - /* Users cannot use the NIC attached to a bridge directly */ - if (no_prefix && NETMAP_OWNED_BY_KERN(*ifp)) { - if_rele(*ifp); /* don't detach from bridge */ - return EINVAL; - } else - return 0; /* valid pointer, we hold the refcount */ - } - nm_if_rele(*ifp); - return EINVAL; // not NETMAP capable +error: + RD(5, "kring error: hwcur %d rcur %d hwtail %d head %d cur %d tail %d", + kring->nr_hwcur, + kring->rcur, kring->nr_hwtail, + kring->rhead, kring->rcur, ring->tail); + return n; } /* * Error routine called when txsync/rxsync detects an error. - * Can't do much more than resetting cur = hwcur, avail = hwavail. + * Can't do much more than resetting head =cur = hwcur, tail = hwtail * Return 1 on reinit. * * This routine is only called by the upper half of the kernel. 
* It only reads hwcur (which is changed only by the upper half, too) - * and hwavail (which may be changed by the lower half, but only on + * and hwtail (which may be changed by the lower half, but only on * a tx ring and only to increase it, so any error will be recovered * on the next call). For the above, we don't strictly need to call * it under lock. */ int netmap_ring_reinit(struct netmap_kring *kring) { struct netmap_ring *ring = kring->ring; u_int i, lim = kring->nkr_num_slots - 1; int errors = 0; // XXX KASSERT nm_kr_tryget - RD(10, "called for %s", kring->na->ifp->if_xname); + RD(10, "called for %s", NM_IFPNAME(kring->na->ifp)); + // XXX probably wrong to trust userspace + kring->rhead = ring->head; + kring->rcur = ring->cur; + kring->rtail = ring->tail; + if (ring->cur > lim) errors++; + if (ring->head > lim) + errors++; + if (ring->tail > lim) + errors++; for (i = 0; i <= lim; i++) { u_int idx = ring->slot[i].buf_idx; u_int len = ring->slot[i].len; if (idx < 2 || idx >= netmap_total_buffers) { - if (!errors++) - D("bad buffer at slot %d idx %d len %d ", i, idx, len); + RD(5, "bad index at slot %d idx %d len %d ", i, idx, len); ring->slot[i].buf_idx = 0; ring->slot[i].len = 0; } else if (len > NETMAP_BDG_BUF_SIZE(kring->na->nm_mem)) { ring->slot[i].len = 0; - if (!errors++) - D("bad len %d at slot %d idx %d", - len, i, idx); + RD(5, "bad len at slot %d idx %d len %d", i, idx, len); } } if (errors) { - int pos = kring - kring->na->tx_rings; - int n = kring->na->num_tx_rings + 1; - RD(10, "total %d errors", errors); - errors++; - RD(10, "%s %s[%d] reinit, cur %d -> %d avail %d -> %d", - kring->na->ifp->if_xname, - pos < n ? "TX" : "RX", pos < n ? pos : pos - n, + RD(10, "%s reinit, cur %d -> %d tail %d -> %d", + kring->name, ring->cur, kring->nr_hwcur, - ring->avail, kring->nr_hwavail); - ring->cur = kring->nr_hwcur; - ring->avail = kring->nr_hwavail; + ring->tail, kring->nr_hwtail); + ring->head = kring->rhead = kring->nr_hwcur; + ring->cur = kring->rcur = kring->nr_hwcur; + ring->tail = kring->rtail = kring->nr_hwtail; } return (errors ? 1 : 0); } /* * Set the ring ID. For devices with a single queue, a request * for all rings is the same as a single ring. 
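 *
 * Examples of bindings resolved below (illustrative; see the
 * NR_REG_* cases in the switch):
 *	nr_flags = NR_REG_ALL_NIC		all hw tx/rx rings
 *	nr_flags = NR_REG_SW			host (sw) rings only
 *	nr_flags = NR_REG_ONE_NIC, id = 2	hw ring pair 2 only
 *	legacy NETMAP_SW_RING in nr_ringid	converted to NR_REG_SW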
*/ static int -netmap_set_ringid(struct netmap_priv_d *priv, u_int ringid) +netmap_set_ringid(struct netmap_priv_d *priv, uint16_t ringid, uint32_t flags) { - struct ifnet *ifp = priv->np_ifp; - struct netmap_adapter *na = NA(ifp); - u_int i = ringid & NETMAP_RING_MASK; - /* initially (np_qfirst == np_qlast) we don't want to lock */ - u_int lim = na->num_rx_rings; + struct netmap_adapter *na = priv->np_na; + u_int j, i = ringid & NETMAP_RING_MASK; + u_int reg = flags & NR_REG_MASK; - if (na->num_tx_rings > lim) - lim = na->num_tx_rings; - if ( (ringid & NETMAP_HW_RING) && i >= lim) { - D("invalid ring id %d", i); - return (EINVAL); + if (reg == NR_REG_DEFAULT) { + /* convert from old ringid to flags */ + if (ringid & NETMAP_SW_RING) { + reg = NR_REG_SW; + } else if (ringid & NETMAP_HW_RING) { + reg = NR_REG_ONE_NIC; + } else { + reg = NR_REG_ALL_NIC; + } + D("deprecated API, old ringid 0x%x -> ringid %x reg %d", ringid, i, reg); } - priv->np_ringid = ringid; - if (ringid & NETMAP_SW_RING) { - priv->np_qfirst = NETMAP_SW_RING; - priv->np_qlast = 0; - } else if (ringid & NETMAP_HW_RING) { - priv->np_qfirst = i; - priv->np_qlast = i + 1; - } else { - priv->np_qfirst = 0; - priv->np_qlast = NETMAP_HW_RING ; + switch (reg) { + case NR_REG_ALL_NIC: + case NR_REG_PIPE_MASTER: + case NR_REG_PIPE_SLAVE: + priv->np_txqfirst = 0; + priv->np_txqlast = na->num_tx_rings; + priv->np_rxqfirst = 0; + priv->np_rxqlast = na->num_rx_rings; + ND("%s %d %d", "ALL/PIPE", + priv->np_rxqfirst, priv->np_rxqlast); + break; + case NR_REG_SW: + case NR_REG_NIC_SW: + if (!(na->na_flags & NAF_HOST_RINGS)) { + D("host rings not supported"); + return EINVAL; + } + priv->np_txqfirst = (reg == NR_REG_SW ? + na->num_tx_rings : 0); + priv->np_txqlast = na->num_tx_rings + 1; + priv->np_rxqfirst = (reg == NR_REG_SW ? + na->num_rx_rings : 0); + priv->np_rxqlast = na->num_rx_rings + 1; + ND("%s %d %d", reg == NR_REG_SW ? "SW" : "NIC+SW", + priv->np_rxqfirst, priv->np_rxqlast); + break; + case NR_REG_ONE_NIC: + if (i >= na->num_tx_rings && i >= na->num_rx_rings) { + D("invalid ring id %d", i); + return EINVAL; + } + /* if not enough rings, use the first one */ + j = i; + if (j >= na->num_tx_rings) + j = 0; + priv->np_txqfirst = j; + priv->np_txqlast = j + 1; + j = i; + if (j >= na->num_rx_rings) + j = 0; + priv->np_rxqfirst = j; + priv->np_rxqlast = j + 1; + break; + default: + D("invalid regif type %d", reg); + return EINVAL; } priv->np_txpoll = (ringid & NETMAP_NO_TX_POLL) ? 0 : 1; - if (netmap_verbose) { - if (ringid & NETMAP_SW_RING) - D("ringid %s set to SW RING", ifp->if_xname); - else if (ringid & NETMAP_HW_RING) - D("ringid %s set to HW RING %d", ifp->if_xname, - priv->np_qfirst); - else - D("ringid %s set to all %d HW RINGS", ifp->if_xname, lim); - } + priv->np_flags = (flags & ~NR_REG_MASK) | reg; + if (nm_tx_si_user(priv)) + na->tx_si_users++; + if (nm_rx_si_user(priv)) + na->rx_si_users++; + if (netmap_verbose) { + D("%s: tx [%d,%d) rx [%d,%d) id %d", + NM_IFPNAME(na->ifp), + priv->np_txqfirst, + priv->np_txqlast, + priv->np_rxqfirst, + priv->np_rxqlast, + i); + } return 0; } - /* * possibly move the interface to netmap-mode. * On success it returns a pointer to netmap_if, otherwise NULL. * This must be called with NMG_LOCK held.
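 *
 * Typical caller sequence (sketch, as in the NIOCREGIF handler
 * further down):
 *	NMG_LOCK();
 *	error = netmap_get_na(nmr, &na, 1 /* create */);
 *	nifp = netmap_do_regif(priv, na, nmr->nr_ringid, nmr->nr_flags, &error);
 *	NMG_UNLOCK();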
*/ -static struct netmap_if * -netmap_do_regif(struct netmap_priv_d *priv, struct ifnet *ifp, - uint16_t ringid, int *err) +struct netmap_if * +netmap_do_regif(struct netmap_priv_d *priv, struct netmap_adapter *na, + uint16_t ringid, uint32_t flags, int *err) { - struct netmap_adapter *na = NA(ifp); + struct ifnet *ifp = na->ifp; struct netmap_if *nifp = NULL; - int error, need_mem; + int error, need_mem = 0; NMG_LOCK_ASSERT(); /* ring configuration may have changed, fetch from the card */ netmap_update_config(na); - priv->np_ifp = ifp; /* store the reference */ - error = netmap_set_ringid(priv, ringid); + priv->np_na = na; /* store the reference */ + error = netmap_set_ringid(priv, ringid, flags); if (error) goto out; /* ensure allocators are ready */ need_mem = !netmap_have_memory_locked(priv); if (need_mem) { error = netmap_get_memory_locked(priv); ND("get_memory returned %d", error); if (error) goto out; } - nifp = netmap_if_new(ifp->if_xname, na); + nifp = netmap_if_new(NM_IFPNAME(ifp), na); if (nifp == NULL) { /* allocation failed */ /* we should drop the allocator, but only * if we were the ones who grabbed it */ - if (need_mem) - netmap_drop_memory_locked(priv); error = ENOMEM; goto out; } - na->refcount++; + na->active_fds++; if (ifp->if_capenable & IFCAP_NETMAP) { /* was already set */ } else { - u_int i; /* Otherwise set the card in netmap mode * and make it use the shared buffers. * - * If the interface is attached to a bridge, lock it. - */ - if (NETMAP_OWNED_BY_KERN(ifp)) - BDG_WLOCK(NA(ifp)->na_bdg); - for (i = 0 ; i < na->num_tx_rings + 1; i++) - mtx_init(&na->tx_rings[i].q_lock, "nm_txq_lock", - NULL, MTX_DEF); - for (i = 0 ; i < na->num_rx_rings + 1; i++) { - mtx_init(&na->rx_rings[i].q_lock, "nm_rxq_lock", - NULL, MTX_DEF); - } - if (nma_is_hw(na)) { - SWNA(ifp)->tx_rings = &na->tx_rings[na->num_tx_rings]; - SWNA(ifp)->rx_rings = &na->rx_rings[na->num_rx_rings]; - } - /* * do not core lock because the race is harmless here, * there cannot be any traffic to netmap_transmit() */ - error = na->nm_register(ifp, 1); /* mode on */ - // XXX do we need to nm_alloc_bdgfwd() in all cases ? - if (!error) - error = nm_alloc_bdgfwd(na); + na->na_lut = na->nm_mem->pools[NETMAP_BUF_POOL].lut; + ND("%p->na_lut == %p", na, na->na_lut); + na->na_lut_objtotal = na->nm_mem->pools[NETMAP_BUF_POOL].objtotal; + error = na->nm_register(na, 1); /* mode on */ if (error) { netmap_do_unregif(priv, nifp); nifp = NULL; } - if (NETMAP_OWNED_BY_KERN(ifp)) - BDG_WUNLOCK(NA(ifp)->na_bdg); - } + } out: *err = error; + if (error) { + priv->np_na = NULL; + if (need_mem) + netmap_drop_memory_locked(priv); + } if (nifp != NULL) { /* * advertise that the interface is ready by setting ni_nifp. * The barrier is needed because readers (poll and *SYNC) * check for priv->np_nifp != NULL without locking */ wmb(); /* make sure previous writes are visible to all CPUs */ priv->np_nifp = nifp; } return nifp; } -/* Process NETMAP_BDG_ATTACH and NETMAP_BDG_DETACH */ -static int -nm_bdg_attach(struct nmreq *nmr) -{ - struct ifnet *ifp; - struct netmap_if *nifp; - struct netmap_priv_d *npriv; - int error; - npriv = malloc(sizeof(*npriv), M_DEVBUF, M_NOWAIT|M_ZERO); - if (npriv == NULL) - return ENOMEM; - NMG_LOCK(); - error = get_ifp(nmr, &ifp, 1 /* create if not exists */); - if (error) /* no device, or another bridge or user owns the device */ - goto unlock_exit; - /* get_ifp() sets na_bdg if this is a physical interface - * that we can attach to a switch.
- */ - if (!NETMAP_OWNED_BY_KERN(ifp)) { - /* got reference to a virtual port or direct access to a NIC. - * perhaps specified no bridge prefix or wrong NIC name - */ - error = EINVAL; - goto unref_exit; - } - if (NA(ifp)->refcount > 0) { /* already registered */ - error = EBUSY; - DROP_BDG_REF(ifp); - goto unlock_exit; - } - - nifp = netmap_do_regif(npriv, ifp, nmr->nr_ringid, &error); - if (!nifp) { - goto unref_exit; - } - - NA(ifp)->na_kpriv = npriv; - NMG_UNLOCK(); - ND("registered %s to netmap-mode", ifp->if_xname); - return 0; - -unref_exit: - nm_if_rele(ifp); -unlock_exit: - NMG_UNLOCK(); - bzero(npriv, sizeof(*npriv)); - free(npriv, M_DEVBUF); - return error; -} - -static int -nm_bdg_detach(struct nmreq *nmr) -{ - struct ifnet *ifp; - int error; - int last_instance; - - NMG_LOCK(); - error = get_ifp(nmr, &ifp, 0 /* don't create */); - if (error) { /* no device, or another bridge or user owns the device */ - goto unlock_exit; - } - /* XXX do we need to check this ? */ - if (!NETMAP_OWNED_BY_KERN(ifp)) { - /* got reference to a virtual port or direct access to a NIC. - * perhaps specified no bridge's prefix or wrong NIC's name - */ - error = EINVAL; - goto unref_exit; - } - - if (NA(ifp)->refcount == 0) { /* not registered */ - error = EINVAL; - goto unref_exit; - } - - DROP_BDG_REF(ifp); /* the one from get_ifp */ - last_instance = netmap_dtor_locked(NA(ifp)->na_kpriv); /* unregister */ - NMG_UNLOCK(); - if (!last_instance) { - D("--- error, trying to detach an entry with active mmaps"); - error = EINVAL; - } else { - struct netmap_priv_d *npriv = NA(ifp)->na_kpriv; - NA(ifp)->na_kpriv = NULL; - - bzero(npriv, sizeof(*npriv)); - free(npriv, M_DEVBUF); - } - return error; - -unref_exit: - nm_if_rele(ifp); -unlock_exit: - NMG_UNLOCK(); - return error; -} - - -/* Initialize necessary fields of sw adapter located in right after hw's - * one. sw adapter attaches a pair of sw rings of the netmap-mode NIC. - * It is always activated and deactivated at the same tie with the hw's one. - * Thus we don't need refcounting on the sw adapter. - * Regardless of NIC's feature we use separate lock so that anybody can lock - * me independently from the hw adapter. - * Make sure nm_register is NULL to be handled as FALSE in nma_is_hw - */ -static void -netmap_attach_sw(struct ifnet *ifp) -{ - struct netmap_adapter *hw_na = NA(ifp); - struct netmap_adapter *na = SWNA(ifp); - - na->ifp = ifp; - na->num_rx_rings = na->num_tx_rings = 1; - na->num_tx_desc = hw_na->num_tx_desc; - na->num_rx_desc = hw_na->num_rx_desc; - na->nm_txsync = netmap_bdg_to_host; - /* we use the same memory allocator as the - * the hw adapter */ - na->nm_mem = hw_na->nm_mem; -} - - -/* exported to kernel callers, e.g. OVS ? - * Entry point. - * Called without NMG_LOCK. 
- */ -int -netmap_bdg_ctl(struct nmreq *nmr, bdg_lookup_fn_t func) -{ - struct nm_bridge *b; - struct netmap_adapter *na; - struct ifnet *iter; - char *name = nmr->nr_name; - int cmd = nmr->nr_cmd, namelen = strlen(name); - int error = 0, i, j; - - switch (cmd) { - case NETMAP_BDG_ATTACH: - error = nm_bdg_attach(nmr); - break; - - case NETMAP_BDG_DETACH: - error = nm_bdg_detach(nmr); - break; - - case NETMAP_BDG_LIST: - /* this is used to enumerate bridges and ports */ - if (namelen) { /* look up indexes of bridge and port */ - if (strncmp(name, NM_NAME, strlen(NM_NAME))) { - error = EINVAL; - break; - } - NMG_LOCK(); - b = nm_find_bridge(name, 0 /* don't create */); - if (!b) { - error = ENOENT; - NMG_UNLOCK(); - break; - } - - error = ENOENT; - for (j = 0; j < b->bdg_active_ports; j++) { - i = b->bdg_port_index[j]; - na = b->bdg_ports[i]; - if (na == NULL) { - D("---AAAAAAAAARGH-------"); - continue; - } - iter = na->ifp; - /* the former and the latter identify a - * virtual port and a NIC, respectively - */ - if (!strcmp(iter->if_xname, name) || - (namelen > b->bdg_namelen && - !strcmp(iter->if_xname, - name + b->bdg_namelen + 1))) { - /* bridge index */ - nmr->nr_arg1 = b - nm_bridges; - nmr->nr_arg2 = i; /* port index */ - error = 0; - break; - } - } - NMG_UNLOCK(); - } else { - /* return the first non-empty entry starting from - * bridge nr_arg1 and port nr_arg2. - * - * Users can detect the end of the same bridge by - * seeing the new and old value of nr_arg1, and can - * detect the end of all the bridge by error != 0 - */ - i = nmr->nr_arg1; - j = nmr->nr_arg2; - - NMG_LOCK(); - for (error = ENOENT; i < NM_BRIDGES; i++) { - b = nm_bridges + i; - if (j >= b->bdg_active_ports) { - j = 0; /* following bridges scan from 0 */ - continue; - } - nmr->nr_arg1 = i; - nmr->nr_arg2 = j; - j = b->bdg_port_index[j]; - na = b->bdg_ports[j]; - iter = na->ifp; - strncpy(name, iter->if_xname, (size_t)IFNAMSIZ); - error = 0; - break; - } - NMG_UNLOCK(); - } - break; - - case NETMAP_BDG_LOOKUP_REG: - /* register a lookup function to the given bridge. - * nmr->nr_name may be just bridge's name (including ':' - * if it is not just NM_NAME). - */ - if (!func) { - error = EINVAL; - break; - } - NMG_LOCK(); - b = nm_find_bridge(name, 0 /* don't create */); - if (!b) { - error = EINVAL; - } else { - b->nm_bdg_lookup = func; - } - NMG_UNLOCK(); - break; - - default: - D("invalid cmd (nmr->nr_cmd) (0x%x)", cmd); - error = EINVAL; - break; - } - return error; -} - - /* * ioctl(2) support for the "netmap" device. * * Following a list of accepted commands: * - NIOCGINFO * - SIOCGIFADDR just for convenience * - NIOCREGIF - * - NIOCUNREGIF * - NIOCTXSYNC * - NIOCRXSYNC * * Return 0 on success, errno otherwise. */ -static int +int netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag, struct thread *td) { struct netmap_priv_d *priv = NULL; struct ifnet *ifp = NULL; struct nmreq *nmr = (struct nmreq *) data; struct netmap_adapter *na = NULL; int error; - u_int i, lim; + u_int i, qfirst, qlast; struct netmap_if *nifp; struct netmap_kring *krings; (void)dev; /* UNUSED */ (void)fflag; /* UNUSED */ -#ifdef linux -#define devfs_get_cdevpriv(pp) \ - ({ *(struct netmap_priv_d **)pp = ((struct file *)td)->private_data; \ - (*pp ? 0 : ENOENT); }) -/* devfs_set_cdevpriv cannot fail on linux */ -#define devfs_set_cdevpriv(p, fn) \ - ({ ((struct file *)td)->private_data = p; (p ? 
0 : EINVAL); }) - - -#define devfs_clear_cdevpriv() do { \ - netmap_dtor(priv); ((struct file *)td)->private_data = 0; \ - } while (0) -#endif /* linux */ - + if (cmd == NIOCGINFO || cmd == NIOCREGIF) { + /* truncate name */ + nmr->nr_name[sizeof(nmr->nr_name) - 1] = '\0'; + if (nmr->nr_version != NETMAP_API) { + D("API mismatch for %s got %d need %d", + nmr->nr_name, + nmr->nr_version, NETMAP_API); + nmr->nr_version = NETMAP_API; + } + if (nmr->nr_version < NETMAP_MIN_API || + nmr->nr_version > NETMAP_MAX_API) { + return EINVAL; + } + } CURVNET_SET(TD_TO_VNET(td)); error = devfs_get_cdevpriv((void **)&priv); if (error) { CURVNET_RESTORE(); /* XXX ENOENT should be impossible, since the priv * is now created in the open */ return (error == ENOENT ? ENXIO : error); } - nmr->nr_name[sizeof(nmr->nr_name) - 1] = '\0'; /* truncate name */ switch (cmd) { case NIOCGINFO: /* return capabilities etc */ - if (nmr->nr_version != NETMAP_API) { - D("API mismatch got %d have %d", - nmr->nr_version, NETMAP_API); - nmr->nr_version = NETMAP_API; - error = EINVAL; - break; - } if (nmr->nr_cmd == NETMAP_BDG_LIST) { error = netmap_bdg_ctl(nmr, NULL); break; } NMG_LOCK(); do { /* memsize is always valid */ struct netmap_mem_d *nmd = &nm_mem; u_int memflags; if (nmr->nr_name[0] != '\0') { /* get a refcount */ - error = get_ifp(nmr, &ifp, 1 /* create */); + error = netmap_get_na(nmr, &na, 1 /* create */); if (error) break; - na = NA(ifp); /* retrieve the netmap adapter */ - nmd = na->nm_mem; /* and its memory allocator */ + nmd = na->nm_mem; /* get memory allocator */ } - - error = netmap_mem_get_info(nmd, &nmr->nr_memsize, &memflags); + + error = netmap_mem_get_info(nmd, &nmr->nr_memsize, &memflags, + &nmr->nr_arg2); if (error) break; if (na == NULL) /* only memory info */ break; nmr->nr_offset = 0; nmr->nr_rx_slots = nmr->nr_tx_slots = 0; netmap_update_config(na); nmr->nr_rx_rings = na->num_rx_rings; nmr->nr_tx_rings = na->num_tx_rings; nmr->nr_rx_slots = na->num_rx_desc; nmr->nr_tx_slots = na->num_tx_desc; - if (memflags & NETMAP_MEM_PRIVATE) - nmr->nr_ringid |= NETMAP_PRIV_MEM; + netmap_adapter_put(na); } while (0); - if (ifp) - nm_if_rele(ifp); /* return the refcount */ NMG_UNLOCK(); break; case NIOCREGIF: - if (nmr->nr_version != NETMAP_API) { - nmr->nr_version = NETMAP_API; - error = EINVAL; - break; - } /* possibly attach/detach NIC and VALE switch */ i = nmr->nr_cmd; - if (i == NETMAP_BDG_ATTACH || i == NETMAP_BDG_DETACH) { + if (i == NETMAP_BDG_ATTACH || i == NETMAP_BDG_DETACH + || i == NETMAP_BDG_VNET_HDR) { error = netmap_bdg_ctl(nmr, NULL); break; } else if (i != 0) { D("nr_cmd must be 0 not %d", i); error = EINVAL; break; } /* protect access to priv from concurrent NIOCREGIF */ NMG_LOCK(); do { u_int memflags; - if (priv->np_ifp != NULL) { /* thread already registered */ - error = netmap_set_ringid(priv, nmr->nr_ringid); + if (priv->np_na != NULL) { /* thread already registered */ + error = EBUSY; break; } /* find the interface and a reference */ - error = get_ifp(nmr, &ifp, 1 /* create */); /* keep reference */ + error = netmap_get_na(nmr, &na, 1 /* create */); /* keep reference */ if (error) break; - if (NETMAP_OWNED_BY_KERN(ifp)) { - nm_if_rele(ifp); + ifp = na->ifp; + if (NETMAP_OWNED_BY_KERN(na)) { + netmap_adapter_put(na); error = EBUSY; break; } - nifp = netmap_do_regif(priv, ifp, nmr->nr_ringid, &error); + nifp = netmap_do_regif(priv, na, nmr->nr_ringid, nmr->nr_flags, &error); if (!nifp) { /* reg. 
failed, release priv and ref */ - nm_if_rele(ifp); /* return the refcount */ - priv->np_ifp = NULL; + netmap_adapter_put(na); priv->np_nifp = NULL; break; } + priv->np_td = td; // XXX kqueue, debugging only /* return the offset of the netmap_if object */ - na = NA(ifp); /* retrieve netmap adapter */ nmr->nr_rx_rings = na->num_rx_rings; nmr->nr_tx_rings = na->num_tx_rings; nmr->nr_rx_slots = na->num_rx_desc; nmr->nr_tx_slots = na->num_tx_desc; - error = netmap_mem_get_info(na->nm_mem, &nmr->nr_memsize, &memflags); + error = netmap_mem_get_info(na->nm_mem, &nmr->nr_memsize, &memflags, + &nmr->nr_arg2); if (error) { - nm_if_rele(ifp); + netmap_adapter_put(na); break; } if (memflags & NETMAP_MEM_PRIVATE) { - nmr->nr_ringid |= NETMAP_PRIV_MEM; *(uint32_t *)(uintptr_t)&nifp->ni_flags |= NI_PRIV_MEM; } + priv->np_txsi = (priv->np_txqlast - priv->np_txqfirst > 1) ? + &na->tx_si : &na->tx_rings[priv->np_txqfirst].si; + priv->np_rxsi = (priv->np_rxqlast - priv->np_rxqfirst > 1) ? + &na->rx_si : &na->rx_rings[priv->np_rxqfirst].si; + + if (nmr->nr_arg3) { + D("requested %d extra buffers", nmr->nr_arg3); + nmr->nr_arg3 = netmap_extra_alloc(na, + &nifp->ni_bufs_head, nmr->nr_arg3); + D("got %d extra buffers", nmr->nr_arg3); + } nmr->nr_offset = netmap_mem_if_offset(na->nm_mem, nifp); } while (0); NMG_UNLOCK(); break; - case NIOCUNREGIF: - // XXX we have no data here ? - D("deprecated, data is %p", nmr); - error = EINVAL; - break; - case NIOCTXSYNC: case NIOCRXSYNC: nifp = priv->np_nifp; if (nifp == NULL) { error = ENXIO; break; } rmb(); /* make sure following reads are not from cache */ - ifp = priv->np_ifp; /* we have a reference */ + na = priv->np_na; /* we have a reference */ - if (ifp == NULL) { - D("Internal error: nifp != NULL && ifp == NULL"); + if (na == NULL) { + D("Internal error: nifp != NULL && na == NULL"); error = ENXIO; break; } - na = NA(ifp); /* retrieve netmap adapter */ - if (priv->np_qfirst == NETMAP_SW_RING) { /* host rings */ - if (cmd == NIOCTXSYNC) - netmap_txsync_to_host(na); - else - netmap_rxsync_from_host(na, NULL, NULL); + ifp = na->ifp; + if (ifp == NULL) { + RD(1, "the ifp is gone"); + error = ENXIO; break; } - /* find the last ring to scan */ - lim = priv->np_qlast; - if (lim == NETMAP_HW_RING) - lim = (cmd == NIOCTXSYNC) ? - na->num_tx_rings : na->num_rx_rings; - krings = (cmd == NIOCTXSYNC) ? 
na->tx_rings : na->rx_rings; - for (i = priv->np_qfirst; i < lim; i++) { + if (cmd == NIOCTXSYNC) { + krings = na->tx_rings; + qfirst = priv->np_txqfirst; + qlast = priv->np_txqlast; + } else { + krings = na->rx_rings; + qfirst = priv->np_rxqfirst; + qlast = priv->np_rxqlast; + } + + for (i = qfirst; i < qlast; i++) { struct netmap_kring *kring = krings + i; if (nm_kr_tryget(kring)) { error = EBUSY; goto out; } if (cmd == NIOCTXSYNC) { if (netmap_verbose & NM_VERB_TXSYNC) D("pre txsync ring %d cur %d hwcur %d", i, kring->ring->cur, kring->nr_hwcur); - na->nm_txsync(ifp, i, NAF_FORCE_RECLAIM); + if (nm_txsync_prologue(kring) >= kring->nkr_num_slots) { + netmap_ring_reinit(kring); + } else { + kring->nm_sync(kring, NAF_FORCE_RECLAIM); + } if (netmap_verbose & NM_VERB_TXSYNC) D("post txsync ring %d cur %d hwcur %d", i, kring->ring->cur, kring->nr_hwcur); } else { - na->nm_rxsync(ifp, i, NAF_FORCE_READ); + kring->nm_sync(kring, NAF_FORCE_READ); microtime(&na->rx_rings[i].ring->ts); } nm_kr_put(kring); } break; #ifdef __FreeBSD__ + case FIONBIO: + case FIOASYNC: + ND("FIONBIO/FIOASYNC are no-ops"); + break; + case BIOCIMMEDIATE: case BIOCGHDRCMPLT: case BIOCSHDRCMPLT: case BIOCSSEESENT: D("ignore BIOCIMMEDIATE/BIOCGHDRCMPLT/BIOCSHDRCMPLT/BIOCSSEESENT"); break; default: /* allow device-specific ioctls */ { struct socket so; bzero(&so, sizeof(so)); NMG_LOCK(); - error = get_ifp(nmr, &ifp, 0 /* don't create */); /* keep reference */ + error = netmap_get_na(nmr, &na, 0 /* don't create */); /* keep reference */ if (error) { + netmap_adapter_put(na); NMG_UNLOCK(); break; } + ifp = na->ifp; so.so_vnet = ifp->if_vnet; // so->so_proto not null. error = ifioctl(&so, cmd, data, td); - nm_if_rele(ifp); + netmap_adapter_put(na); NMG_UNLOCK(); break; } #else /* linux */ default: error = EOPNOTSUPP; #endif /* linux */ } out: CURVNET_RESTORE(); return (error); } /* * select(2) and poll(2) handlers for the "netmap" device. * * Can be called for one or more queues. * Return the event mask corresponding to ready events. * If there are no ready events, do a selrecord on either individual * selinfo or on the global one. * Device-dependent parts (locking and sync of tx/rx rings) * are done through callbacks. * * On linux, arguments are really pwait, the poll table, and 'td' is struct file * * The first one is remapped to pwait as selrecord() uses the name as a * hidden argument. */ -static int +int netmap_poll(struct cdev *dev, int events, struct thread *td) { struct netmap_priv_d *priv = NULL; struct netmap_adapter *na; struct ifnet *ifp; struct netmap_kring *kring; u_int i, check_all_tx, check_all_rx, want_tx, want_rx, revents = 0; - u_int lim_tx, lim_rx, host_forwarded = 0; - struct mbq q = { NULL, NULL, 0 }; + struct mbq q; /* packets from hw queues to host stack */ void *pwait = dev; /* linux compatibility */ + int is_kevent = 0; - int retry_tx = 1; + /* + * In order to avoid nested locks, we need to "double check" + * txsync and rxsync if we decide to do a selrecord(). + * retry_tx (and retry_rx, later) prevent looping forever. + */ + int retry_tx = 1, retry_rx = 1; (void)pwait; + mbq_init(&q); - if (devfs_get_cdevpriv((void **)&priv) != 0 || priv == NULL) + /* + * XXX kevent has curthread->td_fpop == NULL, + * so devfs_get_cdevpriv() fails. We circumvent this by passing + * priv as the first argument, which is also useful to avoid + * the selrecord(), which is not necessary in that case.
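+ *
+ * (Illustrative: a kqueue(2) client registers in the usual way,
+ *	EV_SET(&kev, fd, EVFILT_READ, EV_ADD, 0, 0, NULL);
+ *	kevent(kq, &kev, 1, NULL, 0, NULL);
+ * and is then woken up through the same notify path as poll/select.)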
+ */ + if (devfs_get_cdevpriv((void **)&priv) != 0) { + is_kevent = 1; + if (netmap_verbose) + D("called from kevent"); + priv = (struct netmap_priv_d *)dev; + } + if (priv == NULL) return POLLERR; if (priv->np_nifp == NULL) { D("No if registered"); return POLLERR; } rmb(); /* make sure following reads are not from cache */ - ifp = priv->np_ifp; - // XXX check for deleting() ? + na = priv->np_na; + ifp = na->ifp; + // check for deleted + if (ifp == NULL) { + RD(1, "the ifp is gone"); + return POLLERR; + } + if ( (ifp->if_capenable & IFCAP_NETMAP) == 0) return POLLERR; if (netmap_verbose & 0x8000) - D("device %s events 0x%x", ifp->if_xname, events); + D("device %s events 0x%x", NM_IFPNAME(ifp), events); want_tx = events & (POLLOUT | POLLWRNORM); want_rx = events & (POLLIN | POLLRDNORM); - na = NA(ifp); /* retrieve netmap adapter */ - lim_tx = na->num_tx_rings; - lim_rx = na->num_rx_rings; - - if (priv->np_qfirst == NETMAP_SW_RING) { - /* handle the host stack ring */ - if (priv->np_txpoll || want_tx) { - /* push any packets up, then we are always ready */ - netmap_txsync_to_host(na); - revents |= want_tx; - } - if (want_rx) { - kring = &na->rx_rings[lim_rx]; - if (kring->ring->avail == 0) - netmap_rxsync_from_host(na, td, dev); - if (kring->ring->avail > 0) { - revents |= want_rx; - } - } - return (revents); - } - - /* if we are in transparent mode, check also the host rx ring */ - kring = &na->rx_rings[lim_rx]; - if ( (priv->np_qlast == NETMAP_HW_RING) // XXX check_all - && want_rx - && (netmap_fwd || kring->ring->flags & NR_FORWARD) ) { - if (kring->ring->avail == 0) - netmap_rxsync_from_host(na, td, dev); - if (kring->ring->avail > 0) - revents |= want_rx; - } - /* - * check_all is set if the card has more than one queue AND - * the client is polling all of them. If true, we sleep on + * check_all_{tx|rx} are set if the card has more than one queue AND + * the file descriptor is bound to all of them. If so, we sleep on * the "global" selinfo, otherwise we sleep on individual selinfo * (FreeBSD only allows two selinfo's per file descriptor). * The interrupt routine in the driver wake one or the other * (or both) depending on which clients are active. * * rxsync() is only called if we run out of buffers on a POLLIN. * txsync() is called if we run out of buffers on POLLOUT, or * there are pending packets to send. The latter can be disabled * passing NETMAP_NO_TX_POLL in the NIOCREG call. */ - check_all_tx = (priv->np_qlast == NETMAP_HW_RING) && (lim_tx > 1); - check_all_rx = (priv->np_qlast == NETMAP_HW_RING) && (lim_rx > 1); + check_all_tx = nm_tx_si_user(priv); + check_all_rx = nm_rx_si_user(priv); - if (priv->np_qlast != NETMAP_HW_RING) { - lim_tx = lim_rx = priv->np_qlast; - } - /* - * We start with a lock free round which is good if we have - * data available. If this fails, then lock and call the sync + * We start with a lock free round which is cheap if we have + * slots available. If this fails, then lock and call the sync * routines. 
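+ *
+ * The lock free test is just a comparison of the shared ring
+ * pointers, nm_ring_empty(ring), i.e. (sketch) ring->cur ==
+ * ring->tail; this needs no lock because cur is advanced only
+ * by the owner of the file descriptor and tail only by the
+ * sync routines.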
*/ - for (i = priv->np_qfirst; want_rx && i < lim_rx; i++) { + for (i = priv->np_rxqfirst; want_rx && i < priv->np_rxqlast; i++) { kring = &na->rx_rings[i]; - if (kring->ring->avail > 0) { + /* XXX compare ring->cur and kring->tail */ + if (!nm_ring_empty(kring->ring)) { revents |= want_rx; want_rx = 0; /* also breaks the loop */ } } - for (i = priv->np_qfirst; want_tx && i < lim_tx; i++) { + for (i = priv->np_txqfirst; want_tx && i < priv->np_txqlast; i++) { kring = &na->tx_rings[i]; - if (kring->ring->avail > 0) { + /* XXX compare ring->cur and kring->tail */ + if (!nm_ring_empty(kring->ring)) { revents |= want_tx; want_tx = 0; /* also breaks the loop */ } } /* - * If we to push packets out (priv->np_txpoll) or want_tx is - * still set, we do need to run the txsync calls (on all rings, - * to avoid that the tx rings stall). + * If we want to push packets out (priv->np_txpoll) or + * want_tx is still set, we must issue txsync calls + * (on all rings, to avoid that the tx rings stall). + * XXX should also check cur != hwcur on the tx rings. + * Fortunately, normal tx mode has np_txpoll set. */ if (priv->np_txpoll || want_tx) { - /* If we really want to be woken up (want_tx), - * do a selrecord, either on the global or on - * the private structure. Then issue the txsync - * so there is no race in the selrecord/selwait + /* + * The first round checks if anyone is ready, if not + * do a selrecord and another round to handle races. + * want_tx goes to 0 if any space is found, and is + * used to skip rings with no pending transmissions. */ flush_tx: - for (i = priv->np_qfirst; i < lim_tx; i++) { + for (i = priv->np_txqfirst; i < priv->np_txqlast; i++) { + int found = 0; + kring = &na->tx_rings[i]; - /* - * Skip this ring if want_tx == 0 - * (we have already done a successful sync on - * a previous ring) AND kring->cur == kring->hwcur - * (there are no pending transmissions for this ring). - */ if (!want_tx && kring->ring->cur == kring->nr_hwcur) continue; - /* make sure only one user thread is doing this */ + /* only one thread does txsync */ if (nm_kr_tryget(kring)) { - ND("ring %p busy is %d", kring, (int)kring->nr_busy); - revents |= POLLERR; - goto out; + if (netmap_verbose) + RD(2, "%p lost race on txring %d, ok", + priv, i); + continue; } - - if (netmap_verbose & NM_VERB_TXSYNC) - D("send %d on %s %d", - kring->ring->cur, ifp->if_xname, i); - if (na->nm_txsync(ifp, i, 0)) + if (nm_txsync_prologue(kring) >= kring->nkr_num_slots) { + netmap_ring_reinit(kring); revents |= POLLERR; - - /* Check avail/call selrecord only if called with POLLOUT */ - if (want_tx) { - if (kring->ring->avail > 0) { - /* stop at the first ring. We don't risk - * starvation. - */ - revents |= want_tx; - want_tx = 0; - } + } else { + if (kring->nm_sync(kring, 0)) + revents |= POLLERR; } + + /* + * If we found new slots, notify potential + * listeners on the same ring. + * Since we just did a txsync, look at the copies + * of cur,tail in the kring. + */ + found = kring->rcur != kring->rtail; nm_kr_put(kring); + if (found) { /* notify other listeners */ + revents |= want_tx; + want_tx = 0; + na->nm_notify(na, i, NR_TX, 0); + } } - if (want_tx && retry_tx) { + if (want_tx && retry_tx && !is_kevent) { selrecord(td, check_all_tx ? - &na->tx_si : &na->tx_rings[priv->np_qfirst].si); + &na->tx_si : &na->tx_rings[priv->np_txqfirst].si); retry_tx = 0; goto flush_tx; } } /* - * now if want_rx is still set we need to lock and rxsync. + * If want_rx is still set scan receive rings. 
* Do it on all rings because otherwise we starve. */ if (want_rx) { - int retry_rx = 1; + int send_down = 0; /* transparent mode */ + /* two rounds here for race avoidance */ do_retry_rx: - for (i = priv->np_qfirst; i < lim_rx; i++) { + for (i = priv->np_rxqfirst; i < priv->np_rxqlast; i++) { + int found = 0; + kring = &na->rx_rings[i]; if (nm_kr_tryget(kring)) { - revents |= POLLERR; - goto out; + if (netmap_verbose) + RD(2, "%p lost race on rxring %d, ok", + priv, i); + continue; } - /* XXX NR_FORWARD should only be read on + /* + * transparent mode support: collect packets + * from the rxring(s). + * XXX NR_FORWARD should only be read on * physical or NIC ports */ if (netmap_fwd || kring->ring->flags & NR_FORWARD) { ND(10, "forwarding some buffers up %d to %d", kring->nr_hwcur, kring->ring->cur); netmap_grab_packets(kring, &q, netmap_fwd); } - if (na->nm_rxsync(ifp, i, 0)) + if (kring->nm_sync(kring, 0)) revents |= POLLERR; if (netmap_no_timestamp == 0 || kring->ring->flags & NR_TIMESTAMP) { microtime(&kring->ring->ts); } - - if (kring->ring->avail > 0) { + /* after an rxsync we can use kring->rcur, rtail */ + found = kring->rcur != kring->rtail; + nm_kr_put(kring); + if (found) { revents |= want_rx; retry_rx = 0; + na->nm_notify(na, i, NR_RX, 0); } - nm_kr_put(kring); } - if (retry_rx) { - retry_rx = 0; + + /* transparent mode XXX only during first pass ? */ + if (na->na_flags & NAF_HOST_RINGS) { + kring = &na->rx_rings[na->num_rx_rings]; + if (check_all_rx + && (netmap_fwd || kring->ring->flags & NR_FORWARD)) { + /* XXX fix to use kring fields */ + if (nm_ring_empty(kring->ring)) + send_down = netmap_rxsync_from_host(na, td, dev); + if (!nm_ring_empty(kring->ring)) + revents |= want_rx; + } + } + + if (retry_rx && !is_kevent) selrecord(td, check_all_rx ? - &na->rx_si : &na->rx_rings[priv->np_qfirst].si); - goto do_retry_rx; + &na->rx_si : &na->rx_rings[priv->np_rxqfirst].si); + if (send_down > 0 || retry_rx) { + retry_rx = 0; + if (send_down) + goto flush_tx; /* and retry_rx */ + else + goto do_retry_rx; } } - /* forward host to the netmap ring. - * I am accessing nr_hwavail without lock, but netmap_transmit - * can only increment it, so the operation is safe. + /* + * Transparent mode: marked bufs on rx rings between + * kring->nr_hwcur and ring->head + * are passed to the other endpoint. + * + * In this mode we also scan the sw rxring, which in + * turn passes packets up. + * + * XXX Transparent mode at the moment requires binding all + * rings to a single file descriptor.
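+ *
+ * Illustrative setup (assuming the sysctl name exported by this
+ * module): set dev.netmap.fwd=1 to forward every released slot,
+ * or set NR_FORWARD in ring->flags and leave NS_FORWARD set only
+ * on the slots that should cross to the other endpoint.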
*/ - kring = &na->rx_rings[lim_rx]; - if ( (priv->np_qlast == NETMAP_HW_RING) // XXX check_all - && (netmap_fwd || kring->ring->flags & NR_FORWARD) - && kring->nr_hwavail > 0 && !host_forwarded) { - netmap_sw_to_nic(na); - host_forwarded = 1; /* prevent another pass */ - want_rx = 0; - goto flush_tx; - } if (q.head) - netmap_send_up(na->ifp, q.head); + netmap_send_up(na->ifp, &q); -out: - return (revents); } -/*------- driver support routines ------*/ +/*-------------------- driver support routines -------------------*/ +static int netmap_hw_krings_create(struct netmap_adapter *); + +static int +netmap_notify(struct netmap_adapter *na, u_int n_ring, + enum txrx tx, int flags) +{ + struct netmap_kring *kring; + + if (tx == NR_TX) { + kring = na->tx_rings + n_ring; + OS_selwakeup(&kring->si, PI_NET); + if (na->tx_si_users > 0) + OS_selwakeup(&na->tx_si, PI_NET); + } else { + kring = na->rx_rings + n_ring; + OS_selwakeup(&kring->si, PI_NET); + if (na->rx_si_users > 0) + OS_selwakeup(&na->rx_si, PI_NET); + } + return 0; +} + + +// XXX check handling of failures +int +netmap_attach_common(struct netmap_adapter *na) +{ + struct ifnet *ifp = na->ifp; + + if (na->num_tx_rings == 0 || na->num_rx_rings == 0) { + D("%s: invalid rings tx %d rx %d", + ifp->if_xname, na->num_tx_rings, na->num_rx_rings); + return EINVAL; + } + WNA(ifp) = na; + + /* the following is only needed for na that use the host port. + * XXX do we have something similar for linux ? + */ +#ifdef __FreeBSD__ + na->if_input = ifp->if_input; /* for netmap_send_up */ +#endif /* __FreeBSD__ */ + + NETMAP_SET_CAPABLE(ifp); + if (na->nm_krings_create == NULL) { + na->nm_krings_create = netmap_hw_krings_create; + na->nm_krings_delete = netmap_hw_krings_delete; + } + if (na->nm_notify == NULL) + na->nm_notify = netmap_notify; + na->active_fds = 0; + + if (na->nm_mem == NULL) + na->nm_mem = &nm_mem; + return 0; +} + + +void +netmap_detach_common(struct netmap_adapter *na) +{ + if (na->ifp) + WNA(na->ifp) = NULL; /* XXX do we need this? */ + + if (na->tx_rings) { /* XXX should not happen */ + D("freeing leftover tx_rings"); + na->nm_krings_delete(na); + } + netmap_pipe_dealloc(na); + if (na->na_flags & NAF_MEM_OWNER) + netmap_mem_private_delete(na->nm_mem); + bzero(na, sizeof(*na)); + free(na, M_DEVBUF); +} + + /* * Initialize a ``netmap_adapter`` object created by driver on attach. * We allocate a block of memory with room for a struct netmap_adapter * plus two sets of N+2 struct netmap_kring (where N is the number * of hardware rings): * krings 0..N-1 are for the hardware queues. * kring N is for the host stack queue - * kring N+1 is only used for the selinfo for all queues. + * kring N+1 is only used for the selinfo for all queues. // XXX still true ? * Return 0 on success, ENOMEM otherwise. - * - * By default the receive and transmit adapter ring counts are both initialized - * to num_queues. na->num_tx_rings can be set for cards with different tx/rx - * setups. */ int -netmap_attach(struct netmap_adapter *arg, u_int num_queues) +netmap_attach(struct netmap_adapter *arg) { - struct netmap_adapter *na = NULL; + struct netmap_hw_adapter *hwna = NULL; + // XXX when is arg == NULL ? struct ifnet *ifp = arg ? arg->ifp : NULL; - size_t len; if (arg == NULL || ifp == NULL) goto fail; - /* a VALE port uses two endpoints */ - len = nma_is_vp(arg) ? 
sizeof(*na) : sizeof(*na) * 2; - na = malloc(len, M_DEVBUF, M_NOWAIT | M_ZERO); - if (na == NULL) + hwna = malloc(sizeof(*hwna), M_DEVBUF, M_NOWAIT | M_ZERO); + if (hwna == NULL) goto fail; - WNA(ifp) = na; - *na = *arg; /* copy everything, trust the driver to not pass junk */ - NETMAP_SET_CAPABLE(ifp); - if (na->num_tx_rings == 0) - na->num_tx_rings = num_queues; - na->num_rx_rings = num_queues; - na->refcount = na->na_single = na->na_multi = 0; - /* Core lock initialized here, others after netmap_if_new. */ - mtx_init(&na->core_lock, "netmap core lock", MTX_NETWORK_LOCK, MTX_DEF); + hwna->up = *arg; + hwna->up.na_flags |= NAF_HOST_RINGS; + if (netmap_attach_common(&hwna->up)) { + free(hwna, M_DEVBUF); + goto fail; + } + netmap_adapter_get(&hwna->up); + #ifdef linux if (ifp->netdev_ops) { - ND("netdev_ops %p", ifp->netdev_ops); /* prepare a clone of the netdev ops */ #if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 28) - na->nm_ndo.ndo_start_xmit = ifp->netdev_ops; + hwna->nm_ndo.ndo_start_xmit = ifp->netdev_ops; #else - na->nm_ndo = *ifp->netdev_ops; + hwna->nm_ndo = *ifp->netdev_ops; #endif } - na->nm_ndo.ndo_start_xmit = linux_netmap_start_xmit; + hwna->nm_ndo.ndo_start_xmit = linux_netmap_start_xmit; #endif /* linux */ - na->nm_mem = arg->nm_mem ? arg->nm_mem : &nm_mem; - if (!nma_is_vp(arg)) - netmap_attach_sw(ifp); - D("success for %s", ifp->if_xname); + + D("success for %s", NM_IFPNAME(ifp)); return 0; fail: - D("fail, arg %p ifp %p na %p", arg, ifp, na); + D("fail, arg %p ifp %p na %p", arg, ifp, hwna); netmap_detach(ifp); - return (na ? EINVAL : ENOMEM); + return (hwna ? EINVAL : ENOMEM); } +void +NM_DBG(netmap_adapter_get)(struct netmap_adapter *na) +{ + if (!na) { + return; + } + + refcount_acquire(&na->na_refcount); +} + + +/* returns 1 iff the netmap_adapter is destroyed */ +int +NM_DBG(netmap_adapter_put)(struct netmap_adapter *na) +{ + if (!na) + return 1; + + if (!refcount_release(&na->na_refcount)) + return 0; + + if (na->nm_dtor) + na->nm_dtor(na); + + netmap_detach_common(na); + + return 1; +} + +int +netmap_hw_krings_create(struct netmap_adapter *na) +{ + int ret = netmap_krings_create(na, 0); + if (ret == 0) { + /* initialize the mbq for the sw rx ring */ + mbq_safe_init(&na->rx_rings[na->num_rx_rings].rx_queue); + ND("initialized sw rx queue %d", na->num_rx_rings); + } + return ret; +} + + + /* * Free the allocated memory linked to the given ``netmap_adapter`` * object. */ void netmap_detach(struct ifnet *ifp) { struct netmap_adapter *na = NA(ifp); if (!na) return; - mtx_destroy(&na->core_lock); - - if (na->tx_rings) { /* XXX should not happen */ - D("freeing leftover tx_rings"); - free(na->tx_rings, M_DEVBUF); + NMG_LOCK(); + netmap_disable_all_rings(ifp); + if (!netmap_adapter_put(na)) { + /* someone is still using the adapter, + * tell them that the interface is gone + */ + na->ifp = NULL; + /* give them a chance to notice */ + netmap_enable_all_rings(ifp); } - if (na->na_flags & NAF_MEM_OWNER) - netmap_mem_private_delete(na->nm_mem); - bzero(na, sizeof(*na)); - WNA(ifp) = NULL; - free(na, M_DEVBUF); + NMG_UNLOCK(); } -int -nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, - struct netmap_adapter *na, u_int ring_nr); - - /* * Intercept packets from the network stack and pass them * to netmap as incoming packets on the 'software' ring. + * + * We only store packets in a bounded mbq and then copy them + * in the relevant rxsync routine. + * * We rely on the OS to make sure that the ifp and na do not go * away (typically the caller checks for IFF_DRV_RUNNING or the like). 
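+ *
+ * A sketch of the producer/consumer split (assuming the mbq
+ * helpers from mbq.h behave as their names suggest): this
+ * function only does
+ *
+ *	mbq_enqueue(&kring->rx_queue, m);
+ *
+ * while the rxsync of the host rx ring later drains the queue,
+ * roughly as in
+ *
+ *	while ((m = mbq_dequeue(q)) != NULL) {
+ *		... copy m into the next free ring slot ...
+ *		m_freem(m);
+ *	}
+ *
+ * so the copy runs in the context of the receiving process.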
* In nm_register() or whenever there is a reinitialization, - * we make sure to access the core lock and per-ring locks - * so that IFCAP_NETMAP is visible here. + * we make sure to make the mode change visible here. */ int netmap_transmit(struct ifnet *ifp, struct mbuf *m) { struct netmap_adapter *na = NA(ifp); struct netmap_kring *kring; - u_int i, len = MBUF_LEN(m); - u_int error = EBUSY, lim; - struct netmap_slot *slot; + u_int len = MBUF_LEN(m); + u_int error = ENOBUFS; + struct mbq *q; + int space; // XXX [Linux] we do not need this lock // if we follow the down/configure/up protocol -gl // mtx_lock(&na->core_lock); + if ( (ifp->if_capenable & IFCAP_NETMAP) == 0) { - /* interface not in netmap mode anymore */ + D("%s not in netmap mode anymore", NM_IFPNAME(ifp)); error = ENXIO; goto done; } kring = &na->rx_rings[na->num_rx_rings]; - lim = kring->nkr_num_slots - 1; - if (netmap_verbose & NM_VERB_HOST) - D("%s packet %d len %d from the stack", ifp->if_xname, - kring->nr_hwcur + kring->nr_hwavail, len); + q = &kring->rx_queue; + // XXX reconsider long packets if we handle fragments if (len > NETMAP_BDG_BUF_SIZE(na->nm_mem)) { /* too long for us */ - D("%s from_host, drop packet size %d > %d", ifp->if_xname, + D("%s from_host, drop packet size %d > %d", NM_IFPNAME(ifp), len, NETMAP_BDG_BUF_SIZE(na->nm_mem)); goto done; } - if (SWNA(ifp)->na_bdg) { - struct nm_bdg_fwd *ft; - char *dst; - na = SWNA(ifp); /* we operate on the host port */ - ft = na->rx_rings[0].nkr_ft; - dst = BDG_NMB(na->nm_mem, &na->rx_rings[0].ring->slot[0]); - - /* use slot 0 in the ft, there is nothing queued here */ - /* XXX we can save the copy calling m_copydata in nm_bdg_flush, - * need a special flag for this. - */ - m_copydata(m, 0, (int)len, dst); - ft->ft_flags = 0; - ft->ft_len = len; - ft->ft_buf = dst; - ft->ft_next = NM_FT_NULL; - ft->ft_frags = 1; - if (netmap_verbose & NM_VERB_HOST) - RD(5, "pkt %p size %d to bridge port %d", - dst, len, na->bdg_port); - nm_bdg_flush(ft, 1, na, 0); - na = NA(ifp); /* back to the regular object/lock */ - error = 0; - goto done; - } - - /* protect against other instances of netmap_transmit, - * and userspace invocations of rxsync(). - * XXX could reuse core_lock + /* protect against rxsync_from_host(), netmap_sw_to_nic() + * and maybe other instances of netmap_transmit (the latter + * not possible on Linux). + * Also avoid overflowing the queue. 
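+	 * A worked example of the occupancy check below: with
+	 * nkr_num_slots = 256, nr_hwcur = 250 and nr_hwtail = 10,
+	 * space = 10 - 250 + 256 = 16 packets are already pending
+	 * in the ring, so new mbufs are queued only while
+	 * 16 + mbq_len(q) < 255, and dropped afterwards.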
*/ - // XXX [Linux] there can be no other instances of netmap_transmit - // on this same ring, but we still need this lock to protect - // concurrent access from netmap_sw_to_nic() -gl - mtx_lock(&kring->q_lock); - if (kring->nr_hwavail >= lim) { - if (netmap_verbose) - D("stack ring %s full\n", ifp->if_xname); + mtx_lock(&q->lock); + + space = kring->nr_hwtail - kring->nr_hwcur; + if (space < 0) + space += kring->nkr_num_slots; + if (space + mbq_len(q) >= kring->nkr_num_slots - 1) { // XXX + RD(10, "%s full hwcur %d hwtail %d qlen %d len %d m %p", + NM_IFPNAME(ifp), kring->nr_hwcur, kring->nr_hwtail, mbq_len(q), + len, m); } else { - /* compute the insert position */ - i = nm_kr_rxpos(kring); - slot = &kring->ring->slot[i]; - m_copydata(m, 0, (int)len, BDG_NMB(na->nm_mem, slot)); - slot->len = len; - slot->flags = kring->nkr_slot_flags; - kring->nr_hwavail++; - if (netmap_verbose & NM_VERB_HOST) - D("wake up host ring %s %d", na->ifp->if_xname, na->num_rx_rings); - selwakeuppri(&kring->si, PI_NET); + mbq_enqueue(q, m); + ND(10, "%s %d bufs in queue len %d m %p", + NM_IFPNAME(ifp), mbq_len(q), len, m); + /* notify outside the lock */ + m = NULL; error = 0; } - mtx_unlock(&kring->q_lock); + mtx_unlock(&q->lock); done: - // mtx_unlock(&na->core_lock); + if (m) + m_freem(m); + /* unconditionally wake up listeners */ + na->nm_notify(na, na->num_rx_rings, NR_RX, 0); - /* release the mbuf in either cases of success or failure. As an - * alternative, put the mbuf in a free list and free the list - * only when really necessary. - */ - m_freem(m); - return (error); } /* * netmap_reset() is called by the driver routines when reinitializing * a ring. The driver is in charge of locking to protect the kring. - * If netmap mode is not set just return NULL. + * If native netmap mode is not set just return NULL. */ struct netmap_slot * netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n, u_int new_cur) { struct netmap_kring *kring; int new_hwofs, lim; if (na == NULL) { D("NULL na, should not happen"); return NULL; /* no netmap support here */ } if (!(na->ifp->if_capenable & IFCAP_NETMAP)) { ND("interface not in netmap mode"); return NULL; /* nothing to reinitialize */ } /* XXX note- in the new scheme, we are not guaranteed to be * under lock (e.g. when called on a device reset). * In this case, we should set a flag and do not trust too * much the values. In practice: TODO * - set a RESET flag somewhere in the kring * - do the processing in a conservative way * - let the *sync() fixup at the end. */ if (tx == NR_TX) { if (n >= na->num_tx_rings) return NULL; kring = na->tx_rings + n; + // XXX check whether we should use hwcur or rcur new_hwofs = kring->nr_hwcur - new_cur; } else { if (n >= na->num_rx_rings) return NULL; kring = na->rx_rings + n; - new_hwofs = kring->nr_hwcur + kring->nr_hwavail - new_cur; + new_hwofs = kring->nr_hwtail - new_cur; } lim = kring->nkr_num_slots - 1; if (new_hwofs > lim) new_hwofs -= lim + 1; /* Always set the new offset value and realign the ring. */ - D("%s hwofs %d -> %d, hwavail %d -> %d", - tx == NR_TX ? "TX" : "RX", + if (netmap_verbose) + D("%s %s%d hwofs %d -> %d, hwtail %d -> %d", + NM_IFPNAME(na->ifp), + tx == NR_TX ? "TX" : "RX", n, kring->nkr_hwofs, new_hwofs, - kring->nr_hwavail, - tx == NR_TX ? lim : kring->nr_hwavail); + kring->nr_hwtail, + tx == NR_TX ? 
lim : kring->nr_hwtail); kring->nkr_hwofs = new_hwofs; - if (tx == NR_TX) - kring->nr_hwavail = lim; + if (tx == NR_TX) { + kring->nr_hwtail = kring->nr_hwcur + lim; + if (kring->nr_hwtail > lim) + kring->nr_hwtail -= lim + 1; + } #if 0 // def linux /* XXX check that the mappings are correct */ /* need ring_nr, adapter->pdev, direction */ buffer_info->dma = dma_map_single(&pdev->dev, addr, adapter->rx_buffer_len, DMA_FROM_DEVICE); if (dma_mapping_error(&adapter->pdev->dev, buffer_info->dma)) { D("error mapping rx netmap buffer %d", i); // XXX fix error handling } #endif /* linux */ /* * Wakeup on the individual and global selwait * We do the wakeup here, but the ring is not yet reconfigured. * However, we are under lock so there are no races. */ - selwakeuppri(&kring->si, PI_NET); - selwakeuppri(tx == NR_TX ? &na->tx_si : &na->rx_si, PI_NET); + na->nm_notify(na, n, tx, 0); return kring->ring->slot; } /* - * Grab packets from a kring, move them into the ft structure - * associated to the tx (input) port. Max one instance per port, - * filtered on input (ioctl, poll or XXX). - * Returns the next position in the ring. - */ -static int -nm_bdg_preflush(struct netmap_adapter *na, u_int ring_nr, - struct netmap_kring *kring, u_int end) -{ - struct netmap_ring *ring = kring->ring; - struct nm_bdg_fwd *ft; - u_int j = kring->nr_hwcur, lim = kring->nkr_num_slots - 1; - u_int ft_i = 0; /* start from 0 */ - u_int frags = 1; /* how many frags ? */ - struct nm_bridge *b = na->na_bdg; - - /* To protect against modifications to the bridge we acquire a - * shared lock, waiting if we can sleep (if the source port is - * attached to a user process) or with a trylock otherwise (NICs). - */ - ND("wait rlock for %d packets", ((j > end ? lim+1 : 0) + end) - j); - if (na->na_flags & NAF_BDG_MAYSLEEP) - BDG_RLOCK(b); - else if (!BDG_RTRYLOCK(b)) - return 0; - ND(5, "rlock acquired for %d packets", ((j > end ? lim+1 : 0) + end) - j); - ft = kring->nkr_ft; - - for (; likely(j != end); j = nm_next(j, lim)) { - struct netmap_slot *slot = &ring->slot[j]; - char *buf; - - ft[ft_i].ft_len = slot->len; - ft[ft_i].ft_flags = slot->flags; - - ND("flags is 0x%x", slot->flags); - /* this slot goes into a list so initialize the link field */ - ft[ft_i].ft_next = NM_FT_NULL; - buf = ft[ft_i].ft_buf = (slot->flags & NS_INDIRECT) ? - (void *)(uintptr_t)slot->ptr : BDG_NMB(na->nm_mem, slot); - prefetch(buf); - ++ft_i; - if (slot->flags & NS_MOREFRAG) { - frags++; - continue; - } - if (unlikely(netmap_verbose && frags > 1)) - RD(5, "%d frags at %d", frags, ft_i - frags); - ft[ft_i - frags].ft_frags = frags; - frags = 1; - if (unlikely((int)ft_i >= bridge_batch)) - ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr); - } - if (frags > 1) { - D("truncate incomplete fragment at %d (%d frags)", ft_i, frags); - // ft_i > 0, ft[ft_i-1].flags has NS_MOREFRAG - ft[ft_i - 1].ft_frags &= ~NS_MOREFRAG; - ft[ft_i - frags].ft_frags = frags - 1; - } - if (ft_i) - ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr); - BDG_RUNLOCK(b); - return j; -} - - -/* - * Pass packets from nic to the bridge. - * XXX TODO check locking: this is called from the interrupt - * handler so we should make sure that the interface is not - * disconnected while passing down an interrupt. + * Dispatch rx/tx interrupts to the netmap rings. * - * Note, no user process can access this NIC so we can ignore - * the info in the 'ring'. + * "work_done" is non-null on the RX path, NULL for the TX path. 
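+ *
+ * A sketch of the intended call site, with hypothetical driver
+ * names (adapter, rxr->me), is the rx interrupt handler of a
+ * NIC driver, going through the netmap_rx_irq() wrapper below:
+ *
+ *	if (netmap_rx_irq(adapter->ifp, rxr->me, &work_done))
+ *		return;	// netmap consumed the interrupt
+ *	... otherwise fall through to regular rx processing ...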
+ * We rely on the OS to make sure that there is only one active + * instance per queue, and that there is appropriate locking. + * + * The 'notify' routine depends on what the ring is attached to. + * - for a netmap file descriptor, do a selwakeup on the individual + * waitqueue, plus one on the global one if needed + * - for a switch, call the proper forwarding routine + * - XXX more ? */ -static void -netmap_nic_to_bdg(struct ifnet *ifp, u_int ring_nr) +void +netmap_common_irq(struct ifnet *ifp, u_int q, u_int *work_done) { struct netmap_adapter *na = NA(ifp); - struct netmap_kring *kring = &na->rx_rings[ring_nr]; - struct netmap_ring *ring = kring->ring; - u_int j, k; + struct netmap_kring *kring; - /* make sure that only one thread is ever in here, - * after which we can unlock. Probably unnecessary XXX. - */ - if (nm_kr_tryget(kring)) - return; - /* fetch packets that have arrived. - * XXX maybe do this in a loop ? - */ - if (na->nm_rxsync(ifp, ring_nr, 0)) - goto put_out; - if (kring->nr_hwavail == 0 && netmap_verbose) { - D("how strange, interrupt with no packets on %s", - ifp->if_xname); - goto put_out; + q &= NETMAP_RING_MASK; + + if (netmap_verbose) { + RD(5, "received %s queue %d", work_done ? "RX" : "TX" , q); } - k = nm_kr_rxpos(kring); - j = nm_bdg_preflush(na, ring_nr, kring, k); - - /* we consume everything, but we cannot update kring directly - * because the nic may have destroyed the info in the NIC ring. - * So we need to call rxsync again to restore it. - */ - ring->cur = j; - ring->avail = 0; - na->nm_rxsync(ifp, ring_nr, 0); - -put_out: - nm_kr_put(kring); - return; + if (work_done) { /* RX path */ + if (q >= na->num_rx_rings) + return; // not a physical queue + kring = na->rx_rings + q; + kring->nr_kflags |= NKR_PENDINTR; // XXX atomic ? + na->nm_notify(na, q, NR_RX, 0); + *work_done = 1; /* do not fire napi again */ + } else { /* TX path */ + if (q >= na->num_tx_rings) + return; // not a physical queue + kring = na->tx_rings + q; + na->nm_notify(na, q, NR_TX, 0); + } } /* * Default functions to handle rx/tx interrupts from a physical device. * "work_done" is non-null on the RX path, NULL for the TX path. - * We rely on the OS to make sure that there is only one active - * instance per queue, and that there is appropriate locking. * * If the card is not in netmap mode, simply return 0, * so that the caller proceeds with regular processing. + * Otherwise call netmap_common_irq() and return 1. * * If the card is connected to a netmap file descriptor, * do a selwakeup on the individual queue, plus one on the global one * if needed (multiqueue card _and_ there are multiqueue listeners), * and return 1. * * Finally, if called on rx from an interface connected to a switch, * calls the proper forwarding routine, and return 1. */ int netmap_rx_irq(struct ifnet *ifp, u_int q, u_int *work_done) { - struct netmap_adapter *na; - struct netmap_kring *kring; - + // XXX could we check NAF_NATIVE_ON ? if (!(ifp->if_capenable & IFCAP_NETMAP)) return 0; - q &= NETMAP_RING_MASK; - - if (netmap_verbose) - RD(5, "received %s queue %d", work_done ? "RX" : "TX" , q); - na = NA(ifp); - if (na->na_flags & NAF_SKIP_INTR) { + if (NA(ifp)->na_flags & NAF_SKIP_INTR) { ND("use regular interrupt"); return 0; } - if (work_done) { /* RX path */ - if (q >= na->num_rx_rings) - return 0; // not a physical queue - kring = na->rx_rings + q; - kring->nr_kflags |= NKR_PENDINTR; // XXX atomic ? 
- if (na->na_bdg != NULL) { - netmap_nic_to_bdg(ifp, q); - } else { - selwakeuppri(&kring->si, PI_NET); - if (na->num_rx_rings > 1 /* or multiple listeners */ ) - selwakeuppri(&na->rx_si, PI_NET); - } - *work_done = 1; /* do not fire napi again */ - } else { /* TX path */ - if (q >= na->num_tx_rings) - return 0; // not a physical queue - kring = na->tx_rings + q; - selwakeuppri(&kring->si, PI_NET); - if (na->num_tx_rings > 1 /* or multiple listeners */ ) - selwakeuppri(&na->tx_si, PI_NET); - } + netmap_common_irq(ifp, q, work_done); return 1; } -#ifdef linux /* linux-specific routines */ - - /* - * Remap linux arguments into the FreeBSD call. - * - pwait is the poll table, passed as 'dev'; - * If pwait == NULL someone else already woke up before. We can report - * events but they are filtered upstream. - * If pwait != NULL, then pwait->key contains the list of events. - * - events is computed from pwait as above. - * - file is passed as 'td'; - */ -static u_int -linux_netmap_poll(struct file * file, struct poll_table_struct *pwait) -{ -#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28) - int events = POLLIN | POLLOUT; /* XXX maybe... */ -#elif LINUX_VERSION_CODE < KERNEL_VERSION(3,4,0) - int events = pwait ? pwait->key : POLLIN | POLLOUT; -#else /* in 3.4.0 field 'key' was renamed to '_key' */ - int events = pwait ? pwait->_key : POLLIN | POLLOUT; -#endif - return netmap_poll((void *)pwait, events, (void *)file); -} - - -static int -linux_netmap_mmap(struct file *f, struct vm_area_struct *vma) -{ - int error = 0; - unsigned long off, va; - vm_ooffset_t pa; - struct netmap_priv_d *priv = f->private_data; - /* - * vma->vm_start: start of mapping user address space - * vma->vm_end: end of the mapping user address space - * vma->vm_pfoff: offset of first page in the device - */ - - // XXX security checks - - error = netmap_get_memory(priv); - ND("get_memory returned %d", error); - if (error) - return -error; - - if ((vma->vm_start & ~PAGE_MASK) || (vma->vm_end & ~PAGE_MASK)) { - ND("vm_start = %lx vm_end = %lx", vma->vm_start, vma->vm_end); - return -EINVAL; - } - - for (va = vma->vm_start, off = vma->vm_pgoff; - va < vma->vm_end; - va += PAGE_SIZE, off++) - { - pa = netmap_mem_ofstophys(priv->np_mref, off << PAGE_SHIFT); - if (pa == 0) - return -EINVAL; - - ND("va %lx pa %p", va, pa); - error = remap_pfn_range(vma, va, pa >> PAGE_SHIFT, PAGE_SIZE, vma->vm_page_prot); - if (error) - return error; - } - return 0; -} - - -/* - * This one is probably already protected by the netif lock XXX - */ -static netdev_tx_t -linux_netmap_start_xmit(struct sk_buff *skb, struct net_device *dev) -{ - netmap_transmit(dev, skb); - return (NETDEV_TX_OK); -} - - -#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,36) // XXX was 37 -#define LIN_IOCTL_NAME .ioctl -int -linux_netmap_ioctl(struct inode *inode, struct file *file, u_int cmd, u_long data /* arg */) -#else -#define LIN_IOCTL_NAME .unlocked_ioctl -long -linux_netmap_ioctl(struct file *file, u_int cmd, u_long data /* arg */) -#endif -{ - int ret; - struct nmreq nmr; - bzero(&nmr, sizeof(nmr)); - - if (cmd == NIOCTXSYNC || cmd == NIOCRXSYNC) { - data = 0; /* no argument required here */ - } - if (data && copy_from_user(&nmr, (void *)data, sizeof(nmr) ) != 0) - return -EFAULT; - ret = netmap_ioctl(NULL, cmd, (caddr_t)&nmr, 0, (void *)file); - if (data && copy_to_user((void*)data, &nmr, sizeof(nmr) ) != 0) - return -EFAULT; - return -ret; -} - - -static int -netmap_release(struct inode *inode, struct file *file) -{ - (void)inode; /* UNUSED */ - if (file->private_data) 
- netmap_dtor(file->private_data); - return (0); -} - - -static int -linux_netmap_open(struct inode *inode, struct file *file) -{ - struct netmap_priv_d *priv; - (void)inode; /* UNUSED */ - - priv = malloc(sizeof(struct netmap_priv_d), M_DEVBUF, - M_NOWAIT | M_ZERO); - if (priv == NULL) - return -ENOMEM; - - file->private_data = priv; - - return (0); -} - - -static struct file_operations netmap_fops = { - .owner = THIS_MODULE, - .open = linux_netmap_open, - .mmap = linux_netmap_mmap, - LIN_IOCTL_NAME = linux_netmap_ioctl, - .poll = linux_netmap_poll, - .release = netmap_release, -}; - - -static struct miscdevice netmap_cdevsw = { /* same name as FreeBSD */ - MISC_DYNAMIC_MINOR, - "netmap", - &netmap_fops, -}; - -static int netmap_init(void); -static void netmap_fini(void); - - -/* Errors have negative values on linux */ -static int linux_netmap_init(void) -{ - return -netmap_init(); -} - -module_init(linux_netmap_init); -module_exit(netmap_fini); -/* export certain symbols to other modules */ -EXPORT_SYMBOL(netmap_attach); // driver attach routines -EXPORT_SYMBOL(netmap_detach); // driver detach routines -EXPORT_SYMBOL(netmap_ring_reinit); // ring init on error -EXPORT_SYMBOL(netmap_buffer_lut); -EXPORT_SYMBOL(netmap_total_buffers); // index check -EXPORT_SYMBOL(netmap_buffer_base); -EXPORT_SYMBOL(netmap_reset); // ring init routines -EXPORT_SYMBOL(netmap_buf_size); -EXPORT_SYMBOL(netmap_rx_irq); // default irq handler -EXPORT_SYMBOL(netmap_no_pendintr); // XXX mitigation - should go away -EXPORT_SYMBOL(netmap_bdg_ctl); // bridge configuration routine -EXPORT_SYMBOL(netmap_bdg_learning); // the default lookup function -EXPORT_SYMBOL(netmap_disable_all_rings); -EXPORT_SYMBOL(netmap_enable_all_rings); - - -MODULE_AUTHOR("http://info.iet.unipi.it/~luigi/netmap/"); -MODULE_DESCRIPTION("The netmap packet I/O framework"); -MODULE_LICENSE("Dual BSD/GPL"); /* the code here is all BSD. */ - -#else /* __FreeBSD__ */ - - -static struct cdevsw netmap_cdevsw = { - .d_version = D_VERSION, - .d_name = "netmap", - .d_open = netmap_open, - .d_mmap_single = netmap_mmap_single, - .d_ioctl = netmap_ioctl, - .d_poll = netmap_poll, - .d_close = netmap_close, -}; -#endif /* __FreeBSD__ */ - -/* - *---- support for virtual bridge ----- - */ - -/* ----- FreeBSD if_bridge hash function ------- */ - -/* - * The following hash function is adapted from "Hash Functions" by Bob Jenkins - * ("Algorithm Alley", Dr. Dobbs Journal, September 1997). + * Module loader and unloader * - * http://www.burtleburtle.net/bob/hash/spooky.html + * netmap_init() creates the /dev/netmap device and initializes + * all global variables. Returns 0 on success, errno on failure + * (but there is no chance) + * + * netmap_fini() destroys everything. */ -#define mix(a, b, c) \ -do { \ - a -= b; a -= c; a ^= (c >> 13); \ - b -= c; b -= a; b ^= (a << 8); \ - c -= a; c -= b; c ^= (b >> 13); \ - a -= b; a -= c; a ^= (c >> 12); \ - b -= c; b -= a; b ^= (a << 16); \ - c -= a; c -= b; c ^= (b >> 5); \ - a -= b; a -= c; a ^= (c >> 3); \ - b -= c; b -= a; b ^= (a << 10); \ - c -= a; c -= b; c ^= (b >> 15); \ -} while (/*CONSTCOND*/0) -static __inline uint32_t -nm_bridge_rthash(const uint8_t *addr) -{ - uint32_t a = 0x9e3779b9, b = 0x9e3779b9, c = 0; // hask key +static struct cdev *netmap_dev; /* /dev/netmap character device. 
*/ +extern struct cdevsw netmap_cdevsw; - b += addr[5] << 8; - b += addr[4]; - a += addr[3] << 24; - a += addr[2] << 16; - a += addr[1] << 8; - a += addr[0]; - mix(a, b, c); -#define BRIDGE_RTHASH_MASK (NM_BDG_HASH-1) - return (c & BRIDGE_RTHASH_MASK); -} - -#undef mix - - -static int -bdg_netmap_reg(struct ifnet *ifp, int onoff) +void +netmap_fini(void) { - /* the interface is already attached to the bridge, - * so we only need to toggle IFCAP_NETMAP. - */ - if (onoff) { - ifp->if_capenable |= IFCAP_NETMAP; - } else { - ifp->if_capenable &= ~IFCAP_NETMAP; - } - return 0; + // XXX destroy_bridges() ? + if (netmap_dev) + destroy_dev(netmap_dev); + netmap_mem_fini(); + NMG_LOCK_DESTROY(); + printf("netmap: unloaded module.\n"); } -/* - * Lookup function for a learning bridge. - * Update the hash table with the source address, - * and then returns the destination port index, and the - * ring in *dst_ring (at the moment, always use ring 0) - */ -u_int -netmap_bdg_learning(char *buf, u_int buf_len, uint8_t *dst_ring, - struct netmap_adapter *na) -{ - struct nm_hash_ent *ht = na->na_bdg->ht; - uint32_t sh, dh; - u_int dst, mysrc = na->bdg_port; - uint64_t smac, dmac; - - if (buf_len < 14) { - D("invalid buf length %d", buf_len); - return NM_BDG_NOPORT; - } - dmac = le64toh(*(uint64_t *)(buf)) & 0xffffffffffff; - smac = le64toh(*(uint64_t *)(buf + 4)); - smac >>= 16; - - /* - * The hash is somewhat expensive, there might be some - * worthwhile optimizations here. - */ - if ((buf[6] & 1) == 0) { /* valid src */ - uint8_t *s = buf+6; - sh = nm_bridge_rthash(s); // XXX hash of source - /* update source port forwarding entry */ - ht[sh].mac = smac; /* XXX expire ? */ - ht[sh].ports = mysrc; - if (netmap_verbose) - D("src %02x:%02x:%02x:%02x:%02x:%02x on port %d", - s[0], s[1], s[2], s[3], s[4], s[5], mysrc); - } - dst = NM_BDG_BROADCAST; - if ((buf[0] & 1) == 0) { /* unicast */ - dh = nm_bridge_rthash(buf); // XXX hash of dst - if (ht[dh].mac == dmac) { /* found dst */ - dst = ht[dh].ports; - } - /* XXX otherwise return NM_BDG_UNKNOWN ? */ - } - *dst_ring = 0; - return dst; -} - - -/* - * This flush routine supports only unicast and broadcast but a large - * number of ports, and lets us replace the learn and dispatch functions. - */ int -nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_adapter *na, - u_int ring_nr) -{ - struct nm_bdg_q *dst_ents, *brddst; - uint16_t num_dsts = 0, *dsts; - struct nm_bridge *b = na->na_bdg; - u_int i, j, me = na->bdg_port; - - /* - * The work area (pointed by ft) is followed by an array of - * pointers to queues , dst_ents; there are NM_BDG_MAXRINGS - * queues per port plus one for the broadcast traffic. - * Then we have an array of destination indexes. 
- */ - dst_ents = (struct nm_bdg_q *)(ft + NM_BDG_BATCH_MAX); - dsts = (uint16_t *)(dst_ents + NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1); - - /* first pass: find a destination for each packet in the batch */ - for (i = 0; likely(i < n); i += ft[i].ft_frags) { - uint8_t dst_ring = ring_nr; /* default, same ring as origin */ - uint16_t dst_port, d_i; - struct nm_bdg_q *d; - - ND("slot %d frags %d", i, ft[i].ft_frags); - dst_port = b->nm_bdg_lookup(ft[i].ft_buf, ft[i].ft_len, - &dst_ring, na); - if (netmap_verbose > 255) - RD(5, "slot %d port %d -> %d", i, me, dst_port); - if (dst_port == NM_BDG_NOPORT) - continue; /* this packet is identified to be dropped */ - else if (unlikely(dst_port > NM_BDG_MAXPORTS)) - continue; - else if (dst_port == NM_BDG_BROADCAST) - dst_ring = 0; /* broadcasts always go to ring 0 */ - else if (unlikely(dst_port == me || - !b->bdg_ports[dst_port])) - continue; - - /* get a position in the scratch pad */ - d_i = dst_port * NM_BDG_MAXRINGS + dst_ring; - d = dst_ents + d_i; - - /* append the first fragment to the list */ - if (d->bq_head == NM_FT_NULL) { /* new destination */ - d->bq_head = d->bq_tail = i; - /* remember this position to be scanned later */ - if (dst_port != NM_BDG_BROADCAST) - dsts[num_dsts++] = d_i; - } else { - ft[d->bq_tail].ft_next = i; - d->bq_tail = i; - } - d->bq_len += ft[i].ft_frags; - } - - /* - * Broadcast traffic goes to ring 0 on all destinations. - * So we need to add these rings to the list of ports to scan. - * XXX at the moment we scan all NM_BDG_MAXPORTS ports, which is - * expensive. We should keep a compact list of active destinations - * so we could shorten this loop. - */ - brddst = dst_ents + NM_BDG_BROADCAST * NM_BDG_MAXRINGS; - if (brddst->bq_head != NM_FT_NULL) { - for (j = 0; likely(j < b->bdg_active_ports); j++) { - uint16_t d_i; - i = b->bdg_port_index[j]; - if (unlikely(i == me)) - continue; - d_i = i * NM_BDG_MAXRINGS; - if (dst_ents[d_i].bq_head == NM_FT_NULL) - dsts[num_dsts++] = d_i; - } - } - - ND(5, "pass 1 done %d pkts %d dsts", n, num_dsts); - /* second pass: scan destinations (XXX will be modular somehow) */ - for (i = 0; i < num_dsts; i++) { - struct ifnet *dst_ifp; - struct netmap_adapter *dst_na; - struct netmap_kring *kring; - struct netmap_ring *ring; - u_int dst_nr, is_vp, lim, j, sent = 0, d_i, next, brd_next; - u_int needed, howmany; - int retry = netmap_txsync_retry; - struct nm_bdg_q *d; - uint32_t my_start = 0, lease_idx = 0; - int nrings; - - d_i = dsts[i]; - ND("second pass %d port %d", i, d_i); - d = dst_ents + d_i; - // XXX fix the division - dst_na = b->bdg_ports[d_i/NM_BDG_MAXRINGS]; - /* protect from the lookup function returning an inactive - * destination port - */ - if (unlikely(dst_na == NULL)) - goto cleanup; - if (dst_na->na_flags & NAF_SW_ONLY) - goto cleanup; - dst_ifp = dst_na->ifp; - /* - * The interface may be in !netmap mode in two cases: - * - when na is attached but not activated yet; - * - when na is being deactivated but is still attached. - */ - if (unlikely(!(dst_ifp->if_capenable & IFCAP_NETMAP))) { - ND("not in netmap mode!"); - goto cleanup; - } - - /* there is at least one either unicast or broadcast packet */ - brd_next = brddst->bq_head; - next = d->bq_head; - /* we need to reserve this many slots. If fewer are - * available, some packets will be dropped. - * Packets may have multiple fragments, so we may not use - * there is a chance that we may not use all of the slots - * we have claimed, so we will need to handle the leftover - * ones when we regain the lock. 
- */ - needed = d->bq_len + brddst->bq_len; - - is_vp = nma_is_vp(dst_na); - ND(5, "pass 2 dst %d is %x %s", - i, d_i, is_vp ? "virtual" : "nic/host"); - dst_nr = d_i & (NM_BDG_MAXRINGS-1); - if (is_vp) { /* virtual port */ - nrings = dst_na->num_rx_rings; - } else { - nrings = dst_na->num_tx_rings; - } - if (dst_nr >= nrings) - dst_nr = dst_nr % nrings; - kring = is_vp ? &dst_na->rx_rings[dst_nr] : - &dst_na->tx_rings[dst_nr]; - ring = kring->ring; - lim = kring->nkr_num_slots - 1; - -retry: - - /* reserve the buffers in the queue and an entry - * to report completion, and drop lock. - * XXX this might become a helper function. - */ - mtx_lock(&kring->q_lock); - if (kring->nkr_stopped) { - mtx_unlock(&kring->q_lock); - goto cleanup; - } - /* on physical interfaces, do a txsync to recover - * slots for packets already transmitted. - * XXX maybe we could be optimistic and rely on a retry - * in case of failure. - */ - if (nma_is_hw(dst_na)) { - dst_na->nm_txsync(dst_ifp, dst_nr, 0); - } - my_start = j = kring->nkr_hwlease; - howmany = nm_kr_space(kring, is_vp); - if (needed < howmany) - howmany = needed; - lease_idx = nm_kr_lease(kring, howmany, is_vp); - mtx_unlock(&kring->q_lock); - - /* only retry if we need more than available slots */ - if (retry && needed <= howmany) - retry = 0; - - /* copy to the destination queue */ - while (howmany > 0) { - struct netmap_slot *slot; - struct nm_bdg_fwd *ft_p, *ft_end; - u_int cnt; - - /* find the queue from which we pick next packet. - * NM_FT_NULL is always higher than valid indexes - * so we never dereference it if the other list - * has packets (and if both are empty we never - * get here). - */ - if (next < brd_next) { - ft_p = ft + next; - next = ft_p->ft_next; - } else { /* insert broadcast */ - ft_p = ft + brd_next; - brd_next = ft_p->ft_next; - } - cnt = ft_p->ft_frags; // cnt > 0 - if (unlikely(cnt > howmany)) - break; /* no more space */ - howmany -= cnt; - if (netmap_verbose && cnt > 1) - RD(5, "rx %d frags to %d", cnt, j); - ft_end = ft_p + cnt; - do { - void *dst, *src = ft_p->ft_buf; - size_t len = (ft_p->ft_len + 63) & ~63; - - slot = &ring->slot[j]; - dst = BDG_NMB(dst_na->nm_mem, slot); - /* round to a multiple of 64 */ - - ND("send %d %d bytes at %s:%d", - i, ft_p->ft_len, dst_ifp->if_xname, j); - if (ft_p->ft_flags & NS_INDIRECT) { - if (copyin(src, dst, len)) { - // invalid user pointer, pretend len is 0 - ft_p->ft_len = 0; - } - } else { - //memcpy(dst, src, len); - pkt_copy(src, dst, (int)len); - } - slot->len = ft_p->ft_len; - slot->flags = (cnt << 8)| NS_MOREFRAG; - j = nm_next(j, lim); - ft_p++; - sent++; - } while (ft_p != ft_end); - slot->flags = (cnt << 8); /* clear flag on last entry */ - /* are we done ? */ - if (next == NM_FT_NULL && brd_next == NM_FT_NULL) - break; - } - { - /* current position */ - uint32_t *p = kring->nkr_leases; /* shorthand */ - uint32_t update_pos; - int still_locked = 1; - - mtx_lock(&kring->q_lock); - if (unlikely(howmany > 0)) { - /* not used all bufs. If i am the last one - * i can recover the slots, otherwise must - * fill them with 0 to mark empty packets. - */ - ND("leftover %d bufs", howmany); - if (nm_next(lease_idx, lim) == kring->nkr_lease_idx) { - /* yes i am the last one */ - ND("roll back nkr_hwlease to %d", j); - kring->nkr_hwlease = j; - } else { - while (howmany-- > 0) { - ring->slot[j].len = 0; - ring->slot[j].flags = 0; - j = nm_next(j, lim); - } - } - } - p[lease_idx] = j; /* report I am done */ - - update_pos = is_vp ? 
nm_kr_rxpos(kring) : ring->cur; - - if (my_start == update_pos) { - /* all slots before my_start have been reported, - * so scan subsequent leases to see if other ranges - * have been completed, and to a selwakeup or txsync. - */ - while (lease_idx != kring->nkr_lease_idx && - p[lease_idx] != NR_NOSLOT) { - j = p[lease_idx]; - p[lease_idx] = NR_NOSLOT; - lease_idx = nm_next(lease_idx, lim); - } - /* j is the new 'write' position. j != my_start - * means there are new buffers to report - */ - if (likely(j != my_start)) { - if (is_vp) { - uint32_t old_avail = kring->nr_hwavail; - - kring->nr_hwavail = (j >= kring->nr_hwcur) ? - j - kring->nr_hwcur : - j + lim + 1 - kring->nr_hwcur; - if (kring->nr_hwavail < old_avail) { - D("avail shrink %d -> %d", - old_avail, kring->nr_hwavail); - } - still_locked = 0; - mtx_unlock(&kring->q_lock); - selwakeuppri(&kring->si, PI_NET); - } else { - ring->cur = j; - /* XXX update avail ? */ - still_locked = 0; - dst_na->nm_txsync(dst_ifp, dst_nr, 0); - mtx_unlock(&kring->q_lock); - - /* retry to send more packets */ - if (nma_is_hw(dst_na) && retry--) - goto retry; - } - } - } - if (still_locked) - mtx_unlock(&kring->q_lock); - } -cleanup: - d->bq_head = d->bq_tail = NM_FT_NULL; /* cleanup */ - d->bq_len = 0; - } - brddst->bq_head = brddst->bq_tail = NM_FT_NULL; /* cleanup */ - brddst->bq_len = 0; - return 0; -} - - -/* - * main dispatch routine for the bridge. - * We already know that only one thread is running this. - * we must run nm_bdg_preflush without lock. - */ -static int -bdg_netmap_txsync(struct ifnet *ifp, u_int ring_nr, int flags) -{ - struct netmap_adapter *na = NA(ifp); - struct netmap_kring *kring = &na->tx_rings[ring_nr]; - struct netmap_ring *ring = kring->ring; - u_int j, k, lim = kring->nkr_num_slots - 1; - - k = ring->cur; - if (k > lim) - return netmap_ring_reinit(kring); - - if (bridge_batch <= 0) { /* testing only */ - j = k; // used all - goto done; - } - if (bridge_batch > NM_BDG_BATCH) - bridge_batch = NM_BDG_BATCH; - - j = nm_bdg_preflush(na, ring_nr, kring, k); - if (j != k) - D("early break at %d/ %d, avail %d", j, k, kring->nr_hwavail); - /* k-j modulo ring size is the number of slots processed */ - if (k < j) - k += kring->nkr_num_slots; - kring->nr_hwavail = lim - (k - j); - -done: - kring->nr_hwcur = j; - ring->avail = kring->nr_hwavail; - if (netmap_verbose) - D("%s ring %d flags %d", ifp->if_xname, ring_nr, flags); - return 0; -} - - -/* - * user process reading from a VALE switch. - * Already protected against concurrent calls from userspace, - * but we must acquire the queue's lock to protect against - * writers on the same queue. - */ -static int -bdg_netmap_rxsync(struct ifnet *ifp, u_int ring_nr, int flags) -{ - struct netmap_adapter *na = NA(ifp); - struct netmap_kring *kring = &na->rx_rings[ring_nr]; - struct netmap_ring *ring = kring->ring; - u_int j, lim = kring->nkr_num_slots - 1; - u_int k = ring->cur, resvd = ring->reserved; - int n; - - mtx_lock(&kring->q_lock); - if (k > lim) { - D("ouch dangerous reset!!!"); - n = netmap_ring_reinit(kring); - goto done; - } - - /* skip past packets that userspace has released */ - j = kring->nr_hwcur; /* netmap ring index */ - if (resvd > 0) { - if (resvd + ring->avail >= lim + 1) { - D("XXX invalid reserve/avail %d %d", resvd, ring->avail); - ring->reserved = resvd = 0; // XXX panic... - } - k = (k >= resvd) ? k - resvd : k + lim + 1 - resvd; - } - - if (j != k) { /* userspace has released some packets. 
*/ - n = k - j; - if (n < 0) - n += kring->nkr_num_slots; - ND("userspace releases %d packets", n); - for (n = 0; likely(j != k); n++) { - struct netmap_slot *slot = &ring->slot[j]; - void *addr = BDG_NMB(na->nm_mem, slot); - - if (addr == netmap_buffer_base) { /* bad buf */ - D("bad buffer index %d, ignore ?", - slot->buf_idx); - } - slot->flags &= ~NS_BUF_CHANGED; - j = nm_next(j, lim); - } - kring->nr_hwavail -= n; - kring->nr_hwcur = k; - } - /* tell userspace that there are new packets */ - ring->avail = kring->nr_hwavail - resvd; - n = 0; -done: - mtx_unlock(&kring->q_lock); - return n; -} - - -static int -bdg_netmap_attach(struct netmap_adapter *arg) -{ - struct netmap_adapter na; - - ND("attaching virtual bridge"); - bzero(&na, sizeof(na)); - - na.ifp = arg->ifp; - na.na_flags = NAF_BDG_MAYSLEEP | NAF_MEM_OWNER; - na.num_tx_rings = arg->num_tx_rings; - na.num_rx_rings = arg->num_rx_rings; - na.num_tx_desc = arg->num_tx_desc; - na.num_rx_desc = arg->num_rx_desc; - na.nm_txsync = bdg_netmap_txsync; - na.nm_rxsync = bdg_netmap_rxsync; - na.nm_register = bdg_netmap_reg; - na.nm_mem = netmap_mem_private_new(arg->ifp->if_xname, - na.num_tx_rings, na.num_tx_desc, - na.num_rx_rings, na.num_rx_desc); - return netmap_attach(&na, na.num_tx_rings); -} - - -static struct cdev *netmap_dev; /* /dev/netmap character device. */ - - -/* - * Module loader. - * - * Create the /dev/netmap device and initialize all global - * variables. - * - * Return 0 on success, errno on failure. - */ -static int netmap_init(void) { - int i, error; + int error; NMG_LOCK_INIT(); error = netmap_mem_init(); - if (error != 0) { - printf("netmap: unable to initialize the memory allocator.\n"); - return (error); - } - printf("netmap: loaded module\n"); + if (error != 0) + goto fail; + /* XXX could use make_dev_credv() to get error number */ netmap_dev = make_dev(&netmap_cdevsw, 0, UID_ROOT, GID_WHEEL, 0660, "netmap"); + if (!netmap_dev) + goto fail; - bzero(nm_bridges, sizeof(struct nm_bridge) * NM_BRIDGES); /* safety */ - for (i = 0; i < NM_BRIDGES; i++) - BDG_RWINIT(&nm_bridges[i]); - return (error); + netmap_init_bridges(); + printf("netmap: loaded module\n"); + return (0); +fail: + netmap_fini(); + return (EINVAL); /* may be incorrect */ } - - -/* - * Module unloader. - * - * Free all the memory, and destroy the ``/dev/netmap`` device. - */ -static void -netmap_fini(void) -{ - destroy_dev(netmap_dev); - netmap_mem_fini(); - NMG_LOCK_DESTROY(); - printf("netmap: unloaded module.\n"); -} - - -#ifdef __FreeBSD__ -/* - * Kernel entry point. - * - * Initialize/finalize the module and return. - * - * Return 0 on success, errno on failure. - */ -static int -netmap_loader(__unused struct module *module, int event, __unused void *arg) -{ - int error = 0; - - switch (event) { - case MOD_LOAD: - error = netmap_init(); - break; - - case MOD_UNLOAD: - netmap_fini(); - break; - - default: - error = EOPNOTSUPP; - break; - } - - return (error); -} - - -DEV_MODULE(netmap, netmap_loader, NULL); -#endif /* __FreeBSD__ */ Index: stable/9/sys/dev/netmap/netmap_freebsd.c =================================================================== --- stable/9/sys/dev/netmap/netmap_freebsd.c (nonexistent) +++ stable/9/sys/dev/netmap/netmap_freebsd.c (revision 262153) @@ -0,0 +1,655 @@ +/* + * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */

+/* $FreeBSD$ */

+#include
+#include
+#include
+#include	/* defines used in kernel.h */
+#include	/* POLLIN, POLLOUT */
+#include	/* types used in module initialization */
+#include	/* DEV_MODULE */
+#include
+
+#include
+
+#include	/* vtophys */
+#include	/* vtophys */
+#include
+#include
+#include
+#include
+#include
+
+
+#include
+#include	/* sockaddrs */
+#include
+#include
+#include
+#include	/* bus_dmamap_* */
+#include	/* in6_cksum_pseudo() */
+#include	/* in_pseudo(), in_cksum_hdr() */
+
+#include
+#include
+#include
+
+
+/* ======================== FREEBSD-SPECIFIC ROUTINES ================== */
+
+rawsum_t nm_csum_raw(uint8_t *data, size_t len, rawsum_t cur_sum)
+{
+	/* TODO XXX please use the FreeBSD implementation for this. */
+	uint16_t *words = (uint16_t *)data;
+	int nw = len / 2;
+	int i;
+
+	for (i = 0; i < nw; i++)
+		cur_sum += be16toh(words[i]);
+
+	if (len & 1)
+		cur_sum += (data[len-1] << 8);
+
+	return cur_sum;
+}
+
+/* Fold a raw checksum: 'cur_sum' is in host byte order, while the
+ * return value is in network byte order.
+ */
+uint16_t nm_csum_fold(rawsum_t cur_sum)
+{
+	/* TODO XXX please use the FreeBSD implementation for this. */
+	while (cur_sum >> 16)
+		cur_sum = (cur_sum & 0xFFFF) + (cur_sum >> 16);
+
+	return htobe16((~cur_sum) & 0xFFFF);
+}
+
+uint16_t nm_csum_ipv4(struct nm_iphdr *iph)
+{
+#if 0
+	return in_cksum_hdr((void *)iph);
+#else
+	return nm_csum_fold(nm_csum_raw((uint8_t*)iph, sizeof(struct nm_iphdr), 0));
+#endif
+}
+
+void nm_csum_tcpudp_ipv4(struct nm_iphdr *iph, void *data,
+					size_t datalen, uint16_t *check)
+{
+	uint16_t pseudolen = datalen + iph->protocol;
+
+	/* Compute and insert the pseudo-header checksum. */
+	*check = in_pseudo(iph->saddr, iph->daddr,
+				 htobe16(pseudolen));
+	/* Compute the checksum on TCP/UDP header + payload
+	 * (includes the pseudo-header).
+	 */
+	*check = nm_csum_fold(nm_csum_raw(data, datalen, 0));
+}
+
+void nm_csum_tcpudp_ipv6(struct nm_ipv6hdr *ip6h, void *data,
+					size_t datalen, uint16_t *check)
+{
+#ifdef INET6
+	*check = in6_cksum_pseudo((void*)ip6h, datalen, ip6h->nexthdr, 0);
+	*check = nm_csum_fold(nm_csum_raw(data, datalen, 0));
+#else
+	static int notsupported = 0;
+	if (!notsupported) {
+		notsupported = 1;
+		D("inet6 segmentation not supported");
+	}
+#endif
+}
+
+
+/*
+ * Intercept the rx routine in the standard device driver.
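+ * (A sketch of the expected usage, as in generic_netmap_register()
+ * in netmap_generic.c later in this patch:
+ *
+ *	error = netmap_catch_rx(na, 1);	// start intercepting
+ *	...
+ *	netmap_catch_rx(na, 0);		// restore the driver input
+ *
+ * with the original if_input saved in gna->save_if_input.)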
+ * Second argument is non-zero to intercept, 0 to restore + */ +int +netmap_catch_rx(struct netmap_adapter *na, int intercept) +{ + struct netmap_generic_adapter *gna = (struct netmap_generic_adapter *)na; + struct ifnet *ifp = na->ifp; + + if (intercept) { + if (gna->save_if_input) { + D("cannot intercept again"); + return EINVAL; /* already set */ + } + gna->save_if_input = ifp->if_input; + ifp->if_input = generic_rx_handler; + } else { + if (!gna->save_if_input){ + D("cannot restore"); + return EINVAL; /* not saved */ + } + ifp->if_input = gna->save_if_input; + gna->save_if_input = NULL; + } + + return 0; +} + + +/* + * Intercept the packet steering routine in the tx path, + * so that we can decide which queue is used for an mbuf. + * Second argument is non-zero to intercept, 0 to restore. + * On freebsd we just intercept if_transmit. + */ +void +netmap_catch_tx(struct netmap_generic_adapter *gna, int enable) +{ + struct netmap_adapter *na = &gna->up.up; + struct ifnet *ifp = na->ifp; + + if (enable) { + na->if_transmit = ifp->if_transmit; + ifp->if_transmit = netmap_transmit; + } else { + ifp->if_transmit = na->if_transmit; + } +} + + +/* + * Transmit routine used by generic_netmap_txsync(). Returns 0 on success + * and non-zero on error (which may be packet drops or other errors). + * addr and len identify the netmap buffer, m is the (preallocated) + * mbuf to use for transmissions. + * + * We should add a reference to the mbuf so the m_freem() at the end + * of the transmission does not consume resources. + * + * On FreeBSD, and on multiqueue cards, we can force the queue using + * if ((m->m_flags & M_FLOWID) != 0) + * i = m->m_pkthdr.flowid % adapter->num_queues; + * else + * i = curcpu % adapter->num_queues; + * + */ +int +generic_xmit_frame(struct ifnet *ifp, struct mbuf *m, + void *addr, u_int len, u_int ring_nr) +{ + int ret; + + m->m_len = m->m_pkthdr.len = 0; + + // copy data to the mbuf + m_copyback(m, 0, len, addr); + // inc refcount. We are alone, so we can skip the atomic + atomic_fetchadd_int(m->m_ext.ref_cnt, 1); + m->m_flags |= M_FLOWID; + m->m_pkthdr.flowid = ring_nr; + m->m_pkthdr.rcvif = ifp; /* used for tx notification */ + ret = NA(ifp)->if_transmit(ifp, m); + return ret; +} + + +/* + * The following two functions are empty until we have a generic + * way to extract the info from the ifp + */ +int +generic_find_num_desc(struct ifnet *ifp, unsigned int *tx, unsigned int *rx) +{ + D("called"); + return 0; +} + + +void +generic_find_num_queues(struct ifnet *ifp, u_int *txq, u_int *rxq) +{ + D("called"); + *txq = netmap_generic_rings; + *rxq = netmap_generic_rings; +} + + +void netmap_mitigation_init(struct nm_generic_mit *mit, struct netmap_adapter *na) +{ + ND("called"); + mit->mit_pending = 0; + mit->mit_na = na; +} + + +void netmap_mitigation_start(struct nm_generic_mit *mit) +{ + ND("called"); +} + + +void netmap_mitigation_restart(struct nm_generic_mit *mit) +{ + ND("called"); +} + + +int netmap_mitigation_active(struct nm_generic_mit *mit) +{ + ND("called"); + return 0; +} + + +void netmap_mitigation_cleanup(struct nm_generic_mit *mit) +{ + ND("called"); +} + + +/* + * In order to track whether pages are still mapped, we hook into + * the standard cdev_pager and intercept the constructor and + * destructor. 
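+ *
+ * Lifecycle sketch, following the three hooks installed below:
+ *
+ *	mmap()      -> netmap_mmap_single() -> cdev_pager_allocate()
+ *	                 -> netmap_dev_pager_ctor()	(dev_ref)
+ *	page fault  -> netmap_dev_pager_fault()
+ *	                 (netmap_mem_ofstophys(), fake page)
+ *	last unmap  -> netmap_dev_pager_dtor()
+ *	                 (netmap_dtor(), dev_rel)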
+ */

+struct netmap_vm_handle_t {
+	struct cdev 		*dev;
+	struct netmap_priv_d	*priv;
+};


+static int
+netmap_dev_pager_ctor(void *handle, vm_ooffset_t size, vm_prot_t prot,
+    vm_ooffset_t foff, struct ucred *cred, u_short *color)
+{
+	struct netmap_vm_handle_t *vmh = handle;
+
+	if (netmap_verbose)
+		D("handle %p size %jd prot %d foff %jd",
+			handle, (intmax_t)size, prot, (intmax_t)foff);
+	dev_ref(vmh->dev);
+	return 0;
+}


+static void
+netmap_dev_pager_dtor(void *handle)
+{
+	struct netmap_vm_handle_t *vmh = handle;
+	struct cdev *dev = vmh->dev;
+	struct netmap_priv_d *priv = vmh->priv;
+
+	if (netmap_verbose)
+		D("handle %p", handle);
+	netmap_dtor(priv);
+	free(vmh, M_DEVBUF);
+	dev_rel(dev);
+}


+static int
+netmap_dev_pager_fault(vm_object_t object, vm_ooffset_t offset,
+	int prot, vm_page_t *mres)
+{
+	struct netmap_vm_handle_t *vmh = object->handle;
+	struct netmap_priv_d *priv = vmh->priv;
+	vm_paddr_t paddr;
+	vm_page_t page;
+	vm_memattr_t memattr;
+	vm_pindex_t pidx;
+
+	ND("object %p offset %jd prot %d mres %p",
+			object, (intmax_t)offset, prot, mres);
+	memattr = object->memattr;
+	pidx = OFF_TO_IDX(offset);
+	paddr = netmap_mem_ofstophys(priv->np_mref, offset);
+	if (paddr == 0)
+		return VM_PAGER_FAIL;
+
+	if (((*mres)->flags & PG_FICTITIOUS) != 0) {
+		/*
+		 * If the passed in result page is a fake page, update it with
+		 * the new physical address.
+		 */
+		page = *mres;
+		vm_page_updatefake(page, paddr, memattr);
+	} else {
+		/*
+		 * Replace the passed in reqpage with our own fake page and
+		 * free up all of the original pages.
+		 */
+#ifndef VM_OBJECT_WUNLOCK	/* FreeBSD < 10.x */
+#define VM_OBJECT_WUNLOCK VM_OBJECT_UNLOCK
+#define VM_OBJECT_WLOCK	VM_OBJECT_LOCK
+#endif /* VM_OBJECT_WUNLOCK */
+
+		VM_OBJECT_WUNLOCK(object);
+		page = vm_page_getfake(paddr, memattr);
+		VM_OBJECT_WLOCK(object);
+		vm_page_lock(*mres);
+		vm_page_free(*mres);
+		vm_page_unlock(*mres);
+		*mres = page;
+		vm_page_insert(page, object, pidx);
+	}
+	page->valid = VM_PAGE_BITS_ALL;
+	return (VM_PAGER_OK);
+}


+static struct cdev_pager_ops netmap_cdev_pager_ops = {
+	.cdev_pg_ctor = netmap_dev_pager_ctor,
+	.cdev_pg_dtor = netmap_dev_pager_dtor,
+	.cdev_pg_fault = netmap_dev_pager_fault,
+};


+static int
+netmap_mmap_single(struct cdev *cdev, vm_ooffset_t *foff,
+	vm_size_t objsize,  vm_object_t *objp, int prot)
+{
+	int error;
+	struct netmap_vm_handle_t *vmh;
+	struct netmap_priv_d *priv;
+	vm_object_t obj;
+
+	if (netmap_verbose)
+		D("cdev %p foff %jd size %jd objp %p prot %d", cdev,
+		    (intmax_t )*foff, (intmax_t )objsize, objp, prot);
+
+	vmh = malloc(sizeof(struct netmap_vm_handle_t), M_DEVBUF,
+			      M_NOWAIT | M_ZERO);
+	if (vmh == NULL)
+		return ENOMEM;
+	vmh->dev = cdev;
+
+	NMG_LOCK();
+	error = devfs_get_cdevpriv((void**)&priv);
+	if (error)
+		goto err_unlock;
+	vmh->priv = priv;
+	priv->np_refcount++;
+	NMG_UNLOCK();
+
+	error = netmap_get_memory(priv);
+	if (error)
+		goto err_deref;
+
+	obj = cdev_pager_allocate(vmh, OBJT_DEVICE,
+		&netmap_cdev_pager_ops, objsize, prot,
+		*foff, NULL);
+	if (obj == NULL) {
+		D("cdev_pager_allocate failed");
+		error = EINVAL;
+		goto err_deref;
+	}
+
+	*objp = obj;
+	return 0;
+
+err_deref:
+	NMG_LOCK();
+	priv->np_refcount--;
+err_unlock:
+	NMG_UNLOCK();
+// err:
+	free(vmh, M_DEVBUF);
+	return error;
+}


+// XXX can we remove this ?
+static int +netmap_close(struct cdev *dev, int fflag, int devtype, struct thread *td) +{ + if (netmap_verbose) + D("dev %p fflag 0x%x devtype %d td %p", + dev, fflag, devtype, td); + return 0; +} + + +static int +netmap_open(struct cdev *dev, int oflags, int devtype, struct thread *td) +{ + struct netmap_priv_d *priv; + int error; + + (void)dev; + (void)oflags; + (void)devtype; + (void)td; + + // XXX wait or nowait ? + priv = malloc(sizeof(struct netmap_priv_d), M_DEVBUF, + M_NOWAIT | M_ZERO); + if (priv == NULL) + return ENOMEM; + + error = devfs_set_cdevpriv(priv, netmap_dtor); + if (error) + return error; + + priv->np_refcount = 1; + + return 0; +} + +/******************** kqueue support ****************/ + +/* + * The OS_selwakeup also needs to issue a KNOTE_UNLOCKED. + * We use a non-zero argument to distinguish the call from the one + * in kevent_scan() which instead also needs to run netmap_poll(). + * The knote uses a global mutex for the time being. We might + * try to reuse the one in the si, but it is not allocated + * permanently so it might be a bit tricky. + * + * The *kqfilter function registers one or another f_event + * depending on read or write mode. + * In the call to f_event() td_fpop is NULL so any child function + * calling devfs_get_cdevpriv() would fail - and we need it in + * netmap_poll(). As a workaround we store priv into kn->kn_hook + * and pass it as first argument to netmap_poll(), which then + * uses the failure to tell that we are called from f_event() + * and do not need the selrecord(). + */ + +void freebsd_selwakeup(struct selinfo *si, int pri); + +void +freebsd_selwakeup(struct selinfo *si, int pri) +{ + if (netmap_verbose) + D("on knote %p", &si->si_note); + selwakeuppri(si, pri); + /* use a non-zero hint to tell the notification from the + * call done in kqueue_scan() which uses 0 + */ + KNOTE_UNLOCKED(&si->si_note, 0x100 /* notification */); +} + +static void +netmap_knrdetach(struct knote *kn) +{ + struct netmap_priv_d *priv = (struct netmap_priv_d *)kn->kn_hook; + struct selinfo *si = priv->np_rxsi; + + D("remove selinfo %p", si); + knlist_remove(&si->si_note, kn, 0); +} + +static void +netmap_knwdetach(struct knote *kn) +{ + struct netmap_priv_d *priv = (struct netmap_priv_d *)kn->kn_hook; + struct selinfo *si = priv->np_txsi; + + D("remove selinfo %p", si); + knlist_remove(&si->si_note, kn, 0); +} + +/* + * callback from notifies (generated externally) and our + * calls to kevent(). The former we just return 1 (ready) + * since we do not know better. + * In the latter we call netmap_poll and return 0/1 accordingly. + */ +static int +netmap_knrw(struct knote *kn, long hint, int events) +{ + struct netmap_priv_d *priv; + int revents; + + if (hint != 0) { + ND(5, "call from notify"); + return 1; /* assume we are ready */ + } + priv = kn->kn_hook; + /* the notification may come from an external thread, + * in which case we do not want to run the netmap_poll + * This should be filtered above, but check just in case. + */ + if (curthread != priv->np_td) { /* should not happen */ + RD(5, "curthread changed %p %p", curthread, priv->np_td); + return 1; + } else { + revents = netmap_poll((void *)priv, events, curthread); + return (events & revents) ? 
1 : 0; + } +} + +static int +netmap_knread(struct knote *kn, long hint) +{ + return netmap_knrw(kn, hint, POLLIN); +} + +static int +netmap_knwrite(struct knote *kn, long hint) +{ + return netmap_knrw(kn, hint, POLLOUT); +} + +static struct filterops netmap_rfiltops = { + .f_isfd = 1, + .f_detach = netmap_knrdetach, + .f_event = netmap_knread, +}; + +static struct filterops netmap_wfiltops = { + .f_isfd = 1, + .f_detach = netmap_knwdetach, + .f_event = netmap_knwrite, +}; + + +/* + * This is called when a thread invokes kevent() to record + * a change in the configuration of the kqueue(). + * The 'priv' should be the same as in the netmap device. + */ +static int +netmap_kqfilter(struct cdev *dev, struct knote *kn) +{ + struct netmap_priv_d *priv; + int error; + struct netmap_adapter *na; + struct selinfo *si; + int ev = kn->kn_filter; + + if (ev != EVFILT_READ && ev != EVFILT_WRITE) { + D("bad filter request %d", ev); + return 1; + } + error = devfs_get_cdevpriv((void**)&priv); + if (error) { + D("device not yet setup"); + return 1; + } + na = priv->np_na; + if (na == NULL) { + D("no netmap adapter for this file descriptor"); + return 1; + } + /* the si is indicated in the priv */ + si = (ev == EVFILT_WRITE) ? priv->np_txsi : priv->np_rxsi; + // XXX lock(priv) ? + kn->kn_fop = (ev == EVFILT_WRITE) ? + &netmap_wfiltops : &netmap_rfiltops; + kn->kn_hook = priv; + knlist_add(&si->si_note, kn, 1); + // XXX unlock(priv) + ND("register %p %s td %p priv %p kn %p np_nifp %p kn_fp/fpop %s", + na, na->ifp->if_xname, curthread, priv, kn, + priv->np_nifp, + kn->kn_fp == curthread->td_fpop ? "match" : "MISMATCH"); + return 0; +} + +struct cdevsw netmap_cdevsw = { + .d_version = D_VERSION, + .d_name = "netmap", + .d_open = netmap_open, + .d_mmap_single = netmap_mmap_single, + .d_ioctl = netmap_ioctl, + .d_poll = netmap_poll, + .d_kqfilter = netmap_kqfilter, + .d_close = netmap_close, +}; +/*--- end of kqueue support ----*/ + +/* + * Kernel entry point. + * + * Initialize/finalize the module and return. + * + * Return 0 on success, errno on failure. + */ +static int +netmap_loader(__unused struct module *module, int event, __unused void *arg) +{ + int error = 0; + + switch (event) { + case MOD_LOAD: + error = netmap_init(); + break; + + case MOD_UNLOAD: + netmap_fini(); + break; + + default: + error = EOPNOTSUPP; + break; + } + + return (error); +} + + +DEV_MODULE(netmap, netmap_loader, NULL); Property changes on: stable/9/sys/dev/netmap/netmap_freebsd.c ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Index: stable/9/sys/dev/netmap/netmap_generic.c =================================================================== --- stable/9/sys/dev/netmap/netmap_generic.c (nonexistent) +++ stable/9/sys/dev/netmap/netmap_generic.c (revision 262153) @@ -0,0 +1,806 @@ +/* + * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * This module implements netmap support on top of standard, + * unmodified device drivers. + * + * A NIOCREGIF request is handled here if the device does not + * have native support. TX and RX rings are emulated as follows: + * + * NIOCREGIF + * We preallocate a block of TX mbufs (roughly as many as + * tx descriptors; the number is not critical) to speed up + * operation during transmissions. The refcount on most of + * these buffers is artificially bumped up so we can recycle + * them more easily. Also, the destructor is intercepted + * so we use it as an interrupt notification to wake up + * processes blocked on a poll(). + * + * For each receive ring we allocate one "struct mbq" + * (an mbuf tailq plus a spinlock). We intercept packets + * (through if_input) + * on the receive path and put them in the mbq from which + * netmap receive routines can grab them. + * + * TX: + * in the generic_txsync() routine, netmap buffers are copied + * (or linked, in a future) to the preallocated mbufs + * and pushed to the transmit queue. Some of these mbufs + * (those with NS_REPORT, or otherwise every half ring) + * have the refcount=1, others have refcount=2. + * When the destructor is invoked, we take that as + * a notification that all mbufs up to that one in + * the specific ring have been completed, and generate + * the equivalent of a transmit interrupt. + * + * RX: + * + */ + +#ifdef __FreeBSD__ + +#include /* prerequisite */ +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include /* PROT_EXEC */ +#include +#include /* sockaddrs */ +#include +#include +#include +#include /* bus_dmamap_* in netmap_kern.h */ + +// XXX temporary - D() defined here +#include +#include +#include + +#define rtnl_lock() D("rtnl_lock called"); +#define rtnl_unlock() D("rtnl_unlock called"); +#define MBUF_TXQ(m) ((m)->m_pkthdr.flowid) +#define MBUF_RXQ(m) ((m)->m_pkthdr.flowid) +#define smp_mb() + +/* + * mbuf wrappers + */ + +/* + * we allocate an EXT_PACKET + */ +#define netmap_get_mbuf(len) m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR|M_NOFREE) + +/* mbuf destructor, also need to change the type to EXT_EXTREF, + * add an M_NOFREE flag, and then clear the flag and + * chain into uma_zfree(zone_pack, mf) + * (or reinstall the buffer ?) + */ +#define SET_MBUF_DESTRUCTOR(m, fn) do { \ + (m)->m_ext.ext_free = (void *)fn; \ + (m)->m_ext.ext_type = EXT_EXTREF; \ +} while (0) + + +#define GET_MBUF_REFCNT(m) ((m)->m_ext.ref_cnt ? *(m)->m_ext.ref_cnt : -1) + + + +#else /* linux */ + +#include "bsd_glue.h" + +#include /* rtnl_[un]lock() */ +#include /* struct ethtool_ops, get_ringparam */ +#include + +//#define RATE /* Enables communication statistics. */ + +//#define REG_RESET + +#endif /* linux */ + + +/* Common headers. 
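+ *
+ * (A note on the tx-completion trick described in the header
+ * comment above: the txsync path marks selected mbufs, those
+ * with NS_REPORT or one every half ring, with
+ *
+ *	SET_MBUF_DESTRUCTOR(m, generic_mbuf_destructor);
+ *
+ * so that when the driver finally frees such an mbuf the
+ * destructor fires netmap_generic_irq() and wakes any process
+ * sleeping in poll(), emulating a tx interrupt.)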
 */
+#include <net/netmap.h>
+#include <dev/netmap/netmap_kern.h>
+#include <dev/netmap/netmap_mem2.h>
+
+
+
+/* ======================== usage stats =========================== */
+
+#ifdef RATE
+#define IFRATE(x) x
+struct rate_stats {
+	unsigned long txpkt;
+	unsigned long txsync;
+	unsigned long txirq;
+	unsigned long rxpkt;
+	unsigned long rxirq;
+	unsigned long rxsync;
+};
+
+struct rate_context {
+	unsigned refcount;
+	struct timer_list timer;
+	struct rate_stats new;
+	struct rate_stats old;
+};
+
+#define RATE_PRINTK(_NAME_) \
+	printk( #_NAME_ " = %lu Hz\n", (cur._NAME_ - ctx->old._NAME_)/RATE_PERIOD);
+#define RATE_PERIOD  2
+static void rate_callback(unsigned long arg)
+{
+	struct rate_context * ctx = (struct rate_context *)arg;
+	struct rate_stats cur = ctx->new;
+	int r;
+
+	RATE_PRINTK(txpkt);
+	RATE_PRINTK(txsync);
+	RATE_PRINTK(txirq);
+	RATE_PRINTK(rxpkt);
+	RATE_PRINTK(rxsync);
+	RATE_PRINTK(rxirq);
+	printk("\n");
+
+	ctx->old = cur;
+	r = mod_timer(&ctx->timer, jiffies +
+			msecs_to_jiffies(RATE_PERIOD * 1000));
+	if (unlikely(r))
+		D("[v1000] Error: mod_timer()");
+}
+
+static struct rate_context rate_ctx;
+
+#else /* !RATE */
+#define IFRATE(x)
+#endif /* !RATE */
+
+
+/* =============== GENERIC NETMAP ADAPTER SUPPORT ================= */
+#define GENERIC_BUF_SIZE	netmap_buf_size	/* Size of the mbufs in the Tx pool. */
+
+/*
+ * Wrapper used by the generic adapter layer to notify
+ * the poller threads. Unlike netmap_rx_irq(), we check
+ * only IFCAP_NETMAP instead of NAF_NATIVE_ON to enable the irq.
+ */
+static void
+netmap_generic_irq(struct ifnet *ifp, u_int q, u_int *work_done)
+{
+	if (unlikely(!(ifp->if_capenable & IFCAP_NETMAP)))
+		return;
+
+	netmap_common_irq(ifp, q, work_done);
+}
+
+
+/* Enable/disable netmap mode for a generic network interface. */
+static int
+generic_netmap_register(struct netmap_adapter *na, int enable)
+{
+	struct ifnet *ifp = na->ifp;
+	struct netmap_generic_adapter *gna = (struct netmap_generic_adapter *)na;
+	struct mbuf *m;
+	int error;
+	int i, r;
+
+	if (!na)
+		return EINVAL;
+
+#ifdef REG_RESET
+	error = ifp->netdev_ops->ndo_stop(ifp);
+	if (error) {
+		return error;
+	}
+#endif /* REG_RESET */
+
+	if (enable) { /* Enable netmap mode. */
+		/* Init the mitigation support. */
+		gna->mit = malloc(na->num_rx_rings * sizeof(struct nm_generic_mit),
+					M_DEVBUF, M_NOWAIT | M_ZERO);
+		if (!gna->mit) {
+			D("mitigation allocation failed");
+			error = ENOMEM;
+			goto out;
+		}
+		for (r=0; r<na->num_rx_rings; r++)
+			netmap_mitigation_init(&gna->mit[r], na);
+
+		/* Initialize the rx queue, as generic_rx_handler() can
+		 * be called as soon as netmap_catch_rx() returns.
+		 */
+		for (r=0; r<na->num_rx_rings; r++) {
+			mbq_safe_init(&na->rx_rings[r].rx_queue);
+		}
+
+		/*
+		 * Preallocate packet buffers for the tx rings.
+		 */
+		for (r=0; r<na->num_tx_rings; r++)
+			na->tx_rings[r].tx_pool = NULL;
+		for (r=0; r<na->num_tx_rings; r++) {
+			na->tx_rings[r].tx_pool = malloc(na->num_tx_desc * sizeof(struct mbuf *),
+					M_DEVBUF, M_NOWAIT | M_ZERO);
+			if (!na->tx_rings[r].tx_pool) {
+				D("tx_pool allocation failed");
+				error = ENOMEM;
+				goto free_tx_pools;
+			}
+			for (i=0; i<na->num_tx_desc; i++)
+				na->tx_rings[r].tx_pool[i] = NULL;
+			for (i=0; i<na->num_tx_desc; i++) {
+				m = netmap_get_mbuf(GENERIC_BUF_SIZE);
+				if (!m) {
+					D("tx_pool[%d] allocation failed", i);
+					error = ENOMEM;
+					goto free_tx_pools;
+				}
+				na->tx_rings[r].tx_pool[i] = m;
+			}
+		}
+		rtnl_lock();
+		/* Prepare to intercept incoming traffic.
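+		 * On FreeBSD, netmap_catch_rx() saves and overrides the
+		 * interface if_input hook so that every mbuf the driver
+		 * passes up is diverted to generic_rx_handler() below;
+		 * on Linux the equivalent is netdev_rx_handler_register().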
		 */
+		error = netmap_catch_rx(na, 1);
+		if (error) {
+			D("netdev_rx_handler_register() failed (%d)", error);
+			goto register_handler;
+		}
+		ifp->if_capenable |= IFCAP_NETMAP;
+
+		/* Make netmap control the packet steering. */
+		netmap_catch_tx(gna, 1);
+
+		rtnl_unlock();
+
+#ifdef RATE
+		if (rate_ctx.refcount == 0) {
+			D("setup_timer()");
+			memset(&rate_ctx, 0, sizeof(rate_ctx));
+			setup_timer(&rate_ctx.timer, &rate_callback, (unsigned long)&rate_ctx);
+			if (mod_timer(&rate_ctx.timer, jiffies + msecs_to_jiffies(1500))) {
+				D("Error: mod_timer()");
+			}
+		}
+		rate_ctx.refcount++;
+#endif /* RATE */
+
+	} else if (na->tx_rings[0].tx_pool) {
+		/* Disable netmap mode. We enter here only if the previous
+		   generic_netmap_register(na, 1) was successful.
+		   If it was not, na->tx_rings[0].tx_pool was set to NULL by the
+		   error handling code below. */
+		rtnl_lock();
+
+		ifp->if_capenable &= ~IFCAP_NETMAP;
+
+		/* Release packet steering control. */
+		netmap_catch_tx(gna, 0);
+
+		/* Do not intercept packets on the rx path. */
+		netmap_catch_rx(na, 0);
+
+		rtnl_unlock();
+
+		/* Free the mbufs going to the netmap rings */
+		for (r=0; r<na->num_rx_rings; r++) {
+			mbq_safe_purge(&na->rx_rings[r].rx_queue);
+			mbq_safe_destroy(&na->rx_rings[r].rx_queue);
+		}
+
+		for (r=0; r<na->num_rx_rings; r++)
+			netmap_mitigation_cleanup(&gna->mit[r]);
+		free(gna->mit, M_DEVBUF);
+
+		for (r=0; r<na->num_tx_rings; r++) {
+			for (i=0; i<na->num_tx_desc; i++) {
+				m_freem(na->tx_rings[r].tx_pool[i]);
+			}
+			free(na->tx_rings[r].tx_pool, M_DEVBUF);
+		}
+
+#ifdef RATE
+		if (--rate_ctx.refcount == 0) {
+			D("del_timer()");
+			del_timer(&rate_ctx.timer);
+		}
+#endif
+	}
+
+#ifdef REG_RESET
+	error = ifp->netdev_ops->ndo_open(ifp);
+	if (error) {
+		goto free_tx_pools;
+	}
+#endif
+
+	return 0;
+
+register_handler:
+	rtnl_unlock();
+free_tx_pools:
+	for (r=0; r<na->num_tx_rings; r++) {
+		if (na->tx_rings[r].tx_pool == NULL)
+			continue;
+		for (i=0; i<na->num_tx_desc; i++)
+			if (na->tx_rings[r].tx_pool[i])
+				m_freem(na->tx_rings[r].tx_pool[i]);
+		free(na->tx_rings[r].tx_pool, M_DEVBUF);
+		na->tx_rings[r].tx_pool = NULL;
+	}
+	for (r=0; r<na->num_rx_rings; r++) {
+		netmap_mitigation_cleanup(&gna->mit[r]);
+		mbq_safe_destroy(&na->rx_rings[r].rx_queue);
+	}
+	free(gna->mit, M_DEVBUF);
+out:
+
+	return error;
+}
+
+/*
+ * Callback invoked when the device driver frees an mbuf used
+ * by netmap to transmit a packet. This usually happens when
+ * the NIC notifies the driver that transmission is completed.
+ */
+static void
+generic_mbuf_destructor(struct mbuf *m)
+{
+	if (netmap_verbose)
+		D("Tx irq (%p) queue %d", m, MBUF_TXQ(m));
+	netmap_generic_irq(MBUF_IFP(m), MBUF_TXQ(m), NULL);
+#ifdef __FreeBSD__
+	m->m_ext.ext_type = EXT_PACKET;
+	m->m_ext.ext_free = NULL;
+	if (*(m->m_ext.ref_cnt) == 0)
+		*(m->m_ext.ref_cnt) = 1;
+	uma_zfree(zone_pack, m);
+#endif /* __FreeBSD__ */
+	IFRATE(rate_ctx.new.txirq++);
+}
+
+/* Record completed transmissions and update hwtail.
+ *
+ * The oldest tx buffer not yet completed is at nr_hwtail + 1,
+ * nr_hwcur is the first unsent buffer.
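+ * A buffer is known to be completed when its entry in tx_pool
+ * is NULL (an event was attached to it and the destructor has
+ * already fired) or when its mbuf refcount has dropped back to 1.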
+ */
+static u_int
+generic_netmap_tx_clean(struct netmap_kring *kring)
+{
+	u_int const lim = kring->nkr_num_slots - 1;
+	u_int nm_i = nm_next(kring->nr_hwtail, lim);
+	u_int hwcur = kring->nr_hwcur;
+	u_int n = 0;
+	struct mbuf **tx_pool = kring->tx_pool;
+
+	while (nm_i != hwcur) { /* buffers not completed */
+		struct mbuf *m = tx_pool[nm_i];
+
+		if (unlikely(m == NULL)) {
+			/* this is done, try to replenish the entry */
+			tx_pool[nm_i] = m = netmap_get_mbuf(GENERIC_BUF_SIZE);
+			if (unlikely(m == NULL)) {
+				D("mbuf allocation failed, XXX error");
+				// XXX how do we proceed ? break ?
+				return -ENOMEM;
+			}
+		} else if (GET_MBUF_REFCNT(m) != 1) {
+			break; /* This mbuf is still busy: its refcnt is 2. */
+		}
+		n++;
+		nm_i = nm_next(nm_i, lim);
+	}
+	kring->nr_hwtail = nm_prev(nm_i, lim);
+	ND("tx completed [%d] -> hwtail %d", n, kring->nr_hwtail);
+
+	return n;
+}
+
+
+/*
+ * We have pending packets in the driver between nr_hwtail +1 and hwcur.
+ * Compute a position in the middle, to be used to generate
+ * a notification.
+ */
+static inline u_int
+generic_tx_event_middle(struct netmap_kring *kring, u_int hwcur)
+{
+	u_int n = kring->nkr_num_slots;
+	u_int ntc = nm_next(kring->nr_hwtail, n-1);
+	u_int e;
+
+	if (hwcur >= ntc) {
+		e = (hwcur + ntc) / 2;
+	} else { /* wrap around */
+		e = (hwcur + n + ntc) / 2;
+		if (e >= n) {
+			e -= n;
+		}
+	}
+
+	if (unlikely(e >= n)) {
+		D("This cannot happen");
+		e = 0;
+	}
+
+	return e;
+}
+
+/*
+ * We have pending packets in the driver between nr_hwtail+1 and hwcur.
+ * Schedule a notification approximately in the middle of the two.
+ * There is a race but this is only called within txsync which does
+ * a double check.
+ */
+static void
+generic_set_tx_event(struct netmap_kring *kring, u_int hwcur)
+{
+	struct mbuf *m;
+	u_int e;
+
+	if (nm_next(kring->nr_hwtail, kring->nkr_num_slots -1) == hwcur) {
+		return; /* all buffers are free */
+	}
+	e = generic_tx_event_middle(kring, hwcur);
+
+	m = kring->tx_pool[e];
+	if (m == NULL) {
+		/* This can happen if there is already an event on the netmap
+		   slot 'e': There is nothing to do. */
+		return;
+	}
+	ND("Event at %d mbuf %p refcnt %d", e, m, GET_MBUF_REFCNT(m));
+	kring->tx_pool[e] = NULL;
+	SET_MBUF_DESTRUCTOR(m, generic_mbuf_destructor);
+
+	// XXX wmb() ?
+	/* Decrement the refcount and free it if we have the last one. */
+	m_freem(m);
+	smp_mb();
+}
+
+
+/*
+ * generic_netmap_txsync() transforms netmap buffers into mbufs
+ * and passes them to the standard device driver
+ * (ndo_start_xmit() or ifp->if_transmit() ).
+ * On linux this is not done directly, but using dev_queue_xmit(),
+ * since it implements the TX flow control (and takes some locks).
+ */
+static int
+generic_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
+{
+	struct ifnet *ifp = na->ifp;
+	struct netmap_kring *kring = &na->tx_rings[ring_nr];
+	struct netmap_ring *ring = kring->ring;
+	u_int nm_i;	/* index into the netmap ring */ // j
+	u_int const lim = kring->nkr_num_slots - 1;
+	u_int const head = kring->rhead;
+
+	IFRATE(rate_ctx.new.txsync++);
+
+	// TODO: handle the case of mbuf allocation failure
+
+	rmb();
+
+	/*
+	 * First part: process new packets to send.
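+	 * Slots between nr_hwcur and rhead have been filled by the
+	 * sender: each one is copied into a preallocated mbuf and
+	 * handed to the driver through generic_xmit_frame().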
+	 */
+	nm_i = kring->nr_hwcur;
+	if (nm_i != head) {	/* we have new packets to send */
+		while (nm_i != head) {
+			struct netmap_slot *slot = &ring->slot[nm_i];
+			u_int len = slot->len;
+			void *addr = NMB(slot);
+
+			/* device-specific */
+			struct mbuf *m;
+			int tx_ret;
+
+			NM_CHECK_ADDR_LEN(addr, len);
+
+			/* Take an mbuf from the tx pool and copy in the user packet. */
+			m = kring->tx_pool[nm_i];
+			if (unlikely(!m)) {
+				RD(5, "This should never happen");
+				kring->tx_pool[nm_i] = m = netmap_get_mbuf(GENERIC_BUF_SIZE);
+				if (unlikely(m == NULL)) {
+					D("mbuf allocation failed");
+					break;
+				}
+			}
+			/* XXX we should ask notifications when NS_REPORT is set,
+			 * or roughly every half frame. We can optimize this
+			 * by lazily requesting notifications only when a
+			 * transmission fails. Probably the best way is to
+			 * break on failures and set notifications when
+			 * ring->cur == ring->tail || nm_i != cur
+			 */
+			tx_ret = generic_xmit_frame(ifp, m, addr, len, ring_nr);
+			if (unlikely(tx_ret)) {
+				RD(5, "start_xmit failed: err %d [nm_i %u, head %u, hwtail %u]",
+						tx_ret, nm_i, head, kring->nr_hwtail);
+				/*
+				 * No room for this mbuf in the device driver.
+				 * Request a notification FOR A PREVIOUS MBUF,
+				 * then call generic_netmap_tx_clean(kring) to do the
+				 * double check and see if we can free more buffers.
+				 * If there is space continue, else break;
+				 * NOTE: the double check is necessary if the problem
+				 * occurs in the txsync call after selrecord().
+				 * Also, we need some way to tell the caller that not
+				 * all buffers were queued onto the device (this was
+				 * not a problem with native netmap driver where space
+				 * is preallocated). The bridge has a similar problem
+				 * and we solve it there by dropping the excess packets.
+				 */
+				generic_set_tx_event(kring, nm_i);
+				if (generic_netmap_tx_clean(kring)) { /* space now available */
+					continue;
+				} else {
+					break;
+				}
+			}
+			slot->flags &= ~(NS_REPORT | NS_BUF_CHANGED);
+			nm_i = nm_next(nm_i, lim);
+			IFRATE(rate_ctx.new.txpkt ++);
+		}
+
+		/* Update hwcur to the next slot to transmit. */
+		kring->nr_hwcur = nm_i; /* not head, we could break early */
+	}
+
+	/*
+	 * Second, reclaim completed buffers
+	 */
+	if (flags & NAF_FORCE_RECLAIM || nm_kr_txempty(kring)) {
+		/* No more available slots? Set a notification event
+		 * on a netmap slot that will be cleaned in the future.
+		 * No doublecheck is performed, since txsync() will be
+		 * called twice by netmap_poll().
+		 */
+		generic_set_tx_event(kring, nm_i);
+	}
+	ND("tx #%d, hwtail = %d", n, kring->nr_hwtail);
+
+	generic_netmap_tx_clean(kring);
+
+	nm_txsync_finalize(kring);
+
+	return 0;
+}
+
+
+/*
+ * This handler is registered (through netmap_catch_rx())
+ * within the attached network interface
+ * in the RX subsystem, so that every mbuf passed up by
+ * the driver can be stolen before it reaches the network stack.
+ * Stolen packets are put in a queue where the
+ * generic_netmap_rxsync() callback can extract them.
+ */
+void
+generic_rx_handler(struct ifnet *ifp, struct mbuf *m)
+{
+	struct netmap_adapter *na = NA(ifp);
+	struct netmap_generic_adapter *gna = (struct netmap_generic_adapter *)na;
+	u_int work_done;
+	u_int rr = MBUF_RXQ(m); // receive ring number
+
+	if (rr >= na->num_rx_rings) {
+		rr = rr % na->num_rx_rings; // XXX expensive...
+	}
+
+	/* limit the size of the queue */
+	if (unlikely(mbq_len(&na->rx_rings[rr].rx_queue) > 1024)) {
+		m_freem(m);
+	} else {
+		mbq_safe_enqueue(&na->rx_rings[rr].rx_queue, m);
+	}
+
+	if (netmap_generic_mit < 32768) {
+		/* no rx mitigation, pass notification up */
+		netmap_generic_irq(na->ifp, rr, &work_done);
+		IFRATE(rate_ctx.new.rxirq++);
+	} else {
+		/* same as send combining, filter notification if there is a
+		 * pending timer, otherwise pass it up and start a timer.
+		 */
+		if (likely(netmap_mitigation_active(&gna->mit[rr]))) {
+			/* Record that there is some pending work. */
+			gna->mit[rr].mit_pending = 1;
+		} else {
+			netmap_generic_irq(na->ifp, rr, &work_done);
+			IFRATE(rate_ctx.new.rxirq++);
+			netmap_mitigation_start(&gna->mit[rr]);
+		}
+	}
+}
+
+/*
+ * generic_netmap_rxsync() extracts mbufs from the queue filled by
+ * generic_rx_handler() and puts their content in the netmap
+ * receive ring.
+ * Access must be protected because the rx handler is asynchronous.
+ */
+static int
+generic_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
+{
+	struct netmap_kring *kring = &na->rx_rings[ring_nr];
+	struct netmap_ring *ring = kring->ring;
+	u_int nm_i;	/* index into the netmap ring */ //j,
+	u_int n;
+	u_int const lim = kring->nkr_num_slots - 1;
+	u_int const head = nm_rxsync_prologue(kring);
+	int force_update = (flags & NAF_FORCE_READ) || kring->nr_kflags & NKR_PENDINTR;
+
+	if (head > lim)
+		return netmap_ring_reinit(kring);
+
+	/*
+	 * First part: import newly received packets.
+	 */
+	if (netmap_no_pendintr || force_update) {
+		/* extract buffers from the rx queue, stop at most one
+		 * slot before nr_hwcur (stop_i)
+		 */
+		uint16_t slot_flags = kring->nkr_slot_flags;
+		u_int stop_i = nm_prev(kring->nr_hwcur, lim);
+
+		nm_i = kring->nr_hwtail; /* first empty slot in the receive ring */
+		for (n = 0; nm_i != stop_i; n++) {
+			int len;
+			void *addr = NMB(&ring->slot[nm_i]);
+			struct mbuf *m;
+
+			/* we only check the address here on generic rx rings */
+			if (addr == netmap_buffer_base) { /* Bad buffer */
+				return netmap_ring_reinit(kring);
+			}
+			/*
+			 * Call the locked version of the function.
+			 * XXX Ideally we could grab a batch of mbufs at once
+			 * and save some locking overhead.
+			 */
+			m = mbq_safe_dequeue(&kring->rx_queue);
+			if (!m)	/* no more data */
+				break;
+			len = MBUF_LEN(m);
+			m_copydata(m, 0, len, addr);
+			ring->slot[nm_i].len = len;
+			ring->slot[nm_i].flags = slot_flags;
+			m_freem(m);
+			nm_i = nm_next(nm_i, lim);
+		}
+		if (n) {
+			kring->nr_hwtail = nm_i;
+			IFRATE(rate_ctx.new.rxpkt += n);
+		}
+		kring->nr_kflags &= ~NKR_PENDINTR;
+	}
+
+	// XXX should we invert the order ?
+	/*
+	 * Second part: skip past packets that userspace has released.
+	 */
+	nm_i = kring->nr_hwcur;
+	if (nm_i != head) {
+		/* Userspace has released some packets. */
+		for (n = 0; nm_i != head; n++) {
+			struct netmap_slot *slot = &ring->slot[nm_i];
+
+			slot->flags &= ~NS_BUF_CHANGED;
+			nm_i = nm_next(nm_i, lim);
+		}
+		kring->nr_hwcur = head;
+	}
+	/* tell userspace that there might be new packets.
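+	 * (nm_rxsync_finalize() publishes nr_hwtail into ring->tail,
+	 * which is what a blocked poll()er is woken up to check)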
*/ + nm_rxsync_finalize(kring); + IFRATE(rate_ctx.new.rxsync++); + + return 0; +} + +static void +generic_netmap_dtor(struct netmap_adapter *na) +{ + struct ifnet *ifp = na->ifp; + struct netmap_generic_adapter *gna = (struct netmap_generic_adapter*)na; + struct netmap_adapter *prev_na = gna->prev; + + if (prev_na != NULL) { + D("Released generic NA %p", gna); + if_rele(na->ifp); + netmap_adapter_put(prev_na); + } + if (ifp != NULL) { + WNA(ifp) = prev_na; + D("Restored native NA %p", prev_na); + na->ifp = NULL; + } +} + +/* + * generic_netmap_attach() makes it possible to use netmap on + * a device without native netmap support. + * This is less performant than native support but potentially + * faster than raw sockets or similar schemes. + * + * In this "emulated" mode, netmap rings do not necessarily + * have the same size as those in the NIC. We use a default + * value and possibly override it if the OS has ways to fetch the + * actual configuration. + */ +int +generic_netmap_attach(struct ifnet *ifp) +{ + struct netmap_adapter *na; + struct netmap_generic_adapter *gna; + int retval; + u_int num_tx_desc, num_rx_desc; + + num_tx_desc = num_rx_desc = netmap_generic_ringsize; /* starting point */ + + generic_find_num_desc(ifp, &num_tx_desc, &num_rx_desc); + ND("Netmap ring size: TX = %d, RX = %d", num_tx_desc, num_rx_desc); + + gna = malloc(sizeof(*gna), M_DEVBUF, M_NOWAIT | M_ZERO); + if (gna == NULL) { + D("no memory on attach, give up"); + return ENOMEM; + } + na = (struct netmap_adapter *)gna; + na->ifp = ifp; + na->num_tx_desc = num_tx_desc; + na->num_rx_desc = num_rx_desc; + na->nm_register = &generic_netmap_register; + na->nm_txsync = &generic_netmap_txsync; + na->nm_rxsync = &generic_netmap_rxsync; + na->nm_dtor = &generic_netmap_dtor; + /* when using generic, IFCAP_NETMAP is set so we force + * NAF_SKIP_INTR to use the regular interrupt handler + */ + na->na_flags = NAF_SKIP_INTR | NAF_HOST_RINGS; + + ND("[GNA] num_tx_queues(%d), real_num_tx_queues(%d), len(%lu)", + ifp->num_tx_queues, ifp->real_num_tx_queues, + ifp->tx_queue_len); + ND("[GNA] num_rx_queues(%d), real_num_rx_queues(%d)", + ifp->num_rx_queues, ifp->real_num_rx_queues); + + generic_find_num_queues(ifp, &na->num_tx_rings, &na->num_rx_rings); + + retval = netmap_attach_common(na); + if (retval) { + free(gna, M_DEVBUF); + } + + return retval; +} Property changes on: stable/9/sys/dev/netmap/netmap_generic.c ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Index: stable/9/sys/dev/netmap/netmap_kern.h =================================================================== --- stable/9/sys/dev/netmap/netmap_kern.h (revision 262152) +++ stable/9/sys/dev/netmap/netmap_kern.h (revision 262153) @@ -1,710 +1,1380 @@ /* - * Copyright (C) 2011-2013 Matteo Landi, Luigi Rizzo. All rights reserved. + * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved. + * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * $FreeBSD$
 *
 * The header contains the definitions of constants and function
 * prototypes used only in kernelspace.
 */

#ifndef _NET_NETMAP_KERN_H_
#define _NET_NETMAP_KERN_H_

+#define WITH_VALE	// comment out to disable VALE support
+#define WITH_PIPES
+
 #if defined(__FreeBSD__)

 #define likely(x)	__builtin_expect((long)!!(x), 1L)
 #define unlikely(x)	__builtin_expect((long)!!(x), 0L)

 #define	NM_LOCK_T	struct mtx
+#define	NMG_LOCK_T	struct mtx
+#define NMG_LOCK_INIT()	mtx_init(&netmap_global_lock, \
+				"netmap global lock", NULL, MTX_DEF)
+#define NMG_LOCK_DESTROY()	mtx_destroy(&netmap_global_lock)
+#define NMG_LOCK()	mtx_lock(&netmap_global_lock)
+#define NMG_UNLOCK()	mtx_unlock(&netmap_global_lock)
+#define NMG_LOCK_ASSERT()	mtx_assert(&netmap_global_lock, MA_OWNED)
+
 #define	NM_SELINFO_T	struct selinfo
 #define	MBUF_LEN(m)	((m)->m_pkthdr.len)
+#define	MBUF_IFP(m)	((m)->m_pkthdr.rcvif)
-#define	NM_SEND_UP(ifp, m)	((ifp)->if_input)(ifp, m)
+#define	NM_SEND_UP(ifp, m)	((NA(ifp))->if_input)(ifp, m)

-#define NM_ATOMIC_T	volatile int
+#define NM_ATOMIC_T	volatile int	// XXX ?
+/* atomic operations */
+#include <machine/atomic.h>
+#define NM_ATOMIC_TEST_AND_SET(p)	(!atomic_cmpset_acq_int((p), 0, 1))
+#define NM_ATOMIC_CLEAR(p)		atomic_store_rel_int((p), 0)
+
+MALLOC_DECLARE(M_NETMAP);
+
+// XXX linux struct, not used in FreeBSD
+struct net_device_ops {
+};
+struct hrtimer {
+};
+
 #elif defined (linux)

 #define	NM_LOCK_T	safe_spinlock_t	// see bsd_glue.h
 #define	NM_SELINFO_T	wait_queue_head_t
 #define	MBUF_LEN(m)	((m)->len)
+#define	MBUF_IFP(m)	((m)->dev)
-#define	NM_SEND_UP(ifp, m)	netif_rx(m)
+#define	NM_SEND_UP(ifp, m)  \
+                        do { \
+                            m->priority = NM_MAGIC_PRIORITY; \
+                            netif_rx(m); \
+                        } while (0)

 #define NM_ATOMIC_T	volatile long unsigned int

+// XXX a mtx would suffice here too 20130404 gl
+#define NMG_LOCK_T		struct semaphore
+#define NMG_LOCK_INIT()		sema_init(&netmap_global_lock, 1)
+#define NMG_LOCK_DESTROY()
+#define NMG_LOCK()		down(&netmap_global_lock)
+#define NMG_UNLOCK()		up(&netmap_global_lock)
+#define NMG_LOCK_ASSERT()	//	XXX to be completed
+
 #ifndef DEV_NETMAP
 #define DEV_NETMAP
 #endif /* DEV_NETMAP */

 /*
  * IFCAP_NETMAP goes into net_device's priv_flags (if_capenable).
  * This was 16 bits up to linux 2.6.36, so we need a 16 bit value on older
  * platforms and tolerate the clash with IFF_DYNAMIC and IFF_BRIDGE_PORT.
  * For the 32-bit value, 0x100000 has no clashes until at least 3.5.1
  */
 #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,37)
 #define IFCAP_NETMAP	0x8000
 #else
 #define IFCAP_NETMAP	0x200000
 #endif

 #elif defined (__APPLE__)

 #warning apple support is incomplete.
#define likely(x) __builtin_expect(!!(x), 1) #define unlikely(x) __builtin_expect(!!(x), 0) #define NM_LOCK_T IOLock * #define NM_SELINFO_T struct selinfo #define MBUF_LEN(m) ((m)->m_pkthdr.len) #define NM_SEND_UP(ifp, m) ((ifp)->if_input)(ifp, m) #else #error unsupported platform #endif /* end - platform-specific code */ #define ND(format, ...) #define D(format, ...) \ do { \ struct timeval __xxts; \ microtime(&__xxts); \ - printf("%03d.%06d %s [%d] " format "\n", \ + printf("%03d.%06d [%4d] %-25s " format "\n", \ (int)__xxts.tv_sec % 1000, (int)__xxts.tv_usec, \ - __FUNCTION__, __LINE__, ##__VA_ARGS__); \ + __LINE__, __FUNCTION__, ##__VA_ARGS__); \ } while (0) /* rate limited, lps indicates how many per second */ #define RD(lps, format, ...) \ do { \ static int t0, __cnt; \ if (t0 != time_second) { \ t0 = time_second; \ __cnt = 0; \ } \ if (__cnt++ < lps) \ D(format, ##__VA_ARGS__); \ } while (0) struct netmap_adapter; struct nm_bdg_fwd; struct nm_bridge; struct netmap_priv_d; const char *nm_dump_buf(char *p, int len, int lim, char *dst); +#include "netmap_mbq.h" + +extern NMG_LOCK_T netmap_global_lock; + /* * private, kernel view of a ring. Keeps track of the status of * a ring across system calls. * * nr_hwcur index of the next buffer to refill. - * It corresponds to ring->cur - ring->reserved + * It corresponds to ring->head + * at the time the system call returns. * - * nr_hwavail the number of slots "owned" by userspace. - * nr_hwavail =:= ring->avail + ring->reserved + * nr_hwtail index of the first buffer owned by the kernel. + * On RX, hwcur->hwtail are receive buffers + * not yet released. hwcur is advanced following + * ring->head, hwtail is advanced on incoming packets, + * and a wakeup is generated when hwtail passes ring->cur + * On TX, hwcur->rcur have been filled by the sender + * but not sent yet to the NIC; rcur->hwtail are available + * for new transmissions, and hwtail->hwcur-1 are pending + * transmissions not yet acknowledged. * * The indexes in the NIC and netmap rings are offset by nkr_hwofs slots. * This is so that, on a reset, buffers owned by userspace are not * modified by the kernel. In particular: - * RX rings: the next empty buffer (hwcur + hwavail + hwofs) coincides with + * RX rings: the next empty buffer (hwtail + hwofs) coincides with * the next empty buffer as known by the hardware (next_to_check or so). * TX rings: hwcur + hwofs coincides with next_to_send * * Clients cannot issue concurrent syscall on a ring. The system * detects this and reports an error using two flags, * NKR_WBUSY and NKR_RBUSY * For received packets, slot->flags is set to nkr_slot_flags * so we can provide a proper initial value (e.g. set NS_FORWARD * when operating in 'transparent' mode). * * The following fields are used to implement lock-free copy of packets * from input to output ports in VALE switch: * nkr_hwlease buffer after the last one being copied. * A writer in nm_bdg_flush reserves N buffers * from nr_hwlease, advances it, then does the * copy outside the lock. * In RX rings (used for VALE ports), - * nkr_hwcur + nkr_hwavail <= nkr_hwlease < nkr_hwcur+N-1 + * nkr_hwtail <= nkr_hwlease < nkr_hwcur+N-1 * In TX rings (used for NIC or host stack ports) - * nkr_hwcur <= nkr_hwlease < nkr_hwcur+ nkr_hwavail + * nkr_hwcur <= nkr_hwlease < nkr_hwtail * nkr_leases array of nkr_num_slots where writers can report * completion of their block. 
NR_NOSLOT (~0) indicates
+ *	that the writer has not finished yet
- * nkr_lease_idx	index of next free slot in nr_leases, to be assigned
+ * nkr_lease_idx	index of next free slot in nr_leases, to be assigned
 *
 * The kring is manipulated by txsync/rxsync and generic netmap functions.
- * q_lock is used to arbitrate access to the kring from within the netmap
- * code, and this and other protections guarantee that there is never
- * more than 1 concurrent call to txsync or rxsync. So we are free
- * to manipulate the kring from within txsync/rxsync without any extra
- * locks.
+ *
+ * Concurrent rxsync or txsync on the same ring are prevented
+ * by nm_kr_lock() which in turn uses nr_busy. This is all we need
+ * for NIC rings, and for TX rings attached to the host stack.
+ *
+ * RX rings attached to the host stack use an mbq (rx_queue) on both
+ * rxsync_from_host() and netmap_transmit(). The mbq is protected
+ * by its internal lock.
+ *
+ * RX rings attached to the VALE switch are accessed by both sender
+ * and receiver. They are protected through the q_lock on the RX ring.
 */
struct netmap_kring {
-	struct netmap_ring *ring;
-	uint32_t nr_hwcur;
-	uint32_t nr_hwavail;
-	uint32_t nr_kflags;	/* private driver flags */
-#define NKR_PENDINTR	0x1	// Pending interrupt.
-	uint32_t nkr_num_slots;
-	int32_t	nkr_hwofs;	/* offset between NIC and netmap ring */
+	struct netmap_ring	*ring;
+	uint32_t	nr_hwcur;
+	uint32_t	nr_hwtail;
+
+	/*
+	 * Copies of values in user rings, so we do not need to look
+	 * at the ring (which could be modified). These are set in the
+	 * *sync_prologue()/finalize() routines.
+	 */
+	uint32_t	rhead;
+	uint32_t	rcur;
+	uint32_t	rtail;
+
+	uint32_t	nr_kflags;	/* private driver flags */
+#define NKR_PENDINTR	0x1	// Pending interrupt.
+	uint32_t	nkr_num_slots;
+
+	/*
+	 * On a NIC reset, the NIC ring indexes may be reset but the
+	 * indexes in the netmap rings remain the same. nkr_hwofs
+	 * keeps track of the offset between the two.
+	 */
+	int32_t		nkr_hwofs;
+	uint16_t	nkr_slot_flags;	/* initial value for flags */
+
+	/* last_reclaim is an opaque marker to help reduce the frequency
+	 * of operations such as reclaiming tx buffers. A possible use
+	 * is to set it to ticks and do the reclaim only once per tick.
+	 */
+	uint64_t	last_reclaim;
+
+
+	NM_SELINFO_T	si;		/* poll/select wait queue */
+	NM_LOCK_T	q_lock;		/* protects kring and ring. */
+	NM_ATOMIC_T	nr_busy;	/* prevent concurrent syscalls */
+	struct netmap_adapter *na;
+
+	/* The following fields are for VALE switch support */
 	struct nm_bdg_fwd *nkr_ft;
-	uint32_t *nkr_leases;
-#define NR_NOSLOT ((uint32_t)~0)
-	uint32_t nkr_hwlease;
-	uint32_t nkr_lease_idx;
+	uint32_t	*nkr_leases;
+#define NR_NOSLOT ((uint32_t)~0)	/* used in nkr_*lease* */
+	uint32_t	nkr_hwlease;
+	uint32_t	nkr_lease_idx;

-	NM_SELINFO_T si;	/* poll/select wait queue */
-	NM_LOCK_T q_lock;	/* protects kring and ring. */
-	NM_ATOMIC_T nr_busy;	/* prevent concurrent syscalls */
+	volatile int nkr_stopped;	// XXX what for ?

-	volatile int nkr_stopped;
+	/* Support for adapters without native netmap support.
+	 * On tx rings we preallocate an array of tx buffers
+	 * (same size as the netmap ring), on rx rings we
+	 * store incoming mbufs in a queue that is drained by
+	 * a rxsync.
+	 */
+	struct mbuf **tx_pool;
+	// u_int nr_ntc;	/* Emulation of a next-to-clean RX ring pointer. */
+	struct mbq rx_queue;	/* intercepted rx mbufs.
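+				 * (filled by generic_rx_handler(),
+				 * drained by generic_netmap_rxsync())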
*/ + + uint32_t ring_id; /* debugging */ + char name[64]; /* diagnostic */ + + int (*nm_sync)(struct netmap_kring *kring, int flags); + +#ifdef WITH_PIPES + struct netmap_kring *pipe; + struct netmap_ring *save_ring; +#endif /* WITH_PIPES */ + } __attribute__((__aligned__(64))); /* return the next index, with wraparound */ static inline uint32_t nm_next(uint32_t i, uint32_t lim) { return unlikely (i == lim) ? 0 : i + 1; } + +/* return the previous index, with wraparound */ +static inline uint32_t +nm_prev(uint32_t i, uint32_t lim) +{ + return unlikely (i == 0) ? lim : i - 1; +} + + /* * * Here is the layout for the Rx and Tx rings. RxRING TxRING +-----------------+ +-----------------+ | | | | |XXX free slot XXX| |XXX free slot XXX| +-----------------+ +-----------------+ - | |<-hwcur | |<-hwcur - | reserved h | | (ready | - +----------- w -+ | to be | - cur->| a | | sent) h | - | v | +---------- w | - | a | cur->| (being a | - | i | | prepared) v | - | avail l | | a | - +-----------------+ + a ------ i + - | | ... | v l |<-hwlease - | (being | ... | a | ... - | prepared) | ... | i | ... - +-----------------+ ... | l | ... +head->| owned by user |<-hwcur | not sent to nic |<-hwcur + | | | yet | + +-----------------+ | | + cur->| available to | | | + | user, not read | +-----------------+ + | yet | cur->| (being | + | | | prepared) | + | | | | + +-----------------+ + ------ + +tail->| |<-hwtail | |<-hwlease + | (being | ... | | ... + | prepared) | ... | | ... + +-----------------+ ... | | ... | |<-hwlease +-----------------+ + | | tail->| |<-hwtail | | | | | | | | | | | | - | | | | +-----------------+ +-----------------+ - * The cur/avail (user view) and hwcur/hwavail (kernel view) + * The cur/tail (user view) and hwcur/hwtail (kernel view) * are used in the normal operation of the card. * * When a ring is the output of a switch port (Rx ring for * a VALE port, Tx ring for the host stack or NIC), slots * are reserved in blocks through 'hwlease' which points * to the next unused slot. - * On an Rx ring, hwlease is always after hwavail, - * and completions cause avail to advance. - * On a Tx ring, hwlease is always between cur and hwavail, + * On an Rx ring, hwlease is always after hwtail, + * and completions cause hwtail to advance. + * On a Tx ring, hwlease is always between cur and hwtail, * and completions cause cur to advance. * * nm_kr_space() returns the maximum number of slots that * can be assigned. * nm_kr_lease() reserves the required number of buffers, * advances nkr_hwlease and also returns an entry in * a circular array where completions should be reported. */ +enum txrx { NR_RX = 0, NR_TX = 1 }; - /* - * This struct extends the 'struct adapter' (or - * equivalent) device descriptor. It contains all fields needed to - * support netmap operation. + * The "struct netmap_adapter" extends the "struct adapter" + * (or equivalent) device descriptor. + * It contains all base fields needed to support netmap operation. + * There are in fact different types of netmap adapters + * (native, generic, VALE switch...) so a netmap_adapter is + * just the first field in the derived type. */ struct netmap_adapter { /* * On linux we do not have a good way to tell if an interface - * is netmap-capable. So we use the following trick: + * is netmap-capable. So we always use the following trick: * NA(ifp) points here, and the first entry (which hopefully * always exists and is at least 32 bits) contains a magic * value which we can use to detect that the interface is good. 
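 * (the actual check, and the way the magic value is computed,
 * are in the NETMAP_CAPABLE()/NETMAP_SET_CAPABLE() macros below)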
*/ uint32_t magic; - uint32_t na_flags; /* future place for IFCAP_NETMAP */ + uint32_t na_flags; /* enabled, and other flags */ #define NAF_SKIP_INTR 1 /* use the regular interrupt handler. * useful during initialization */ #define NAF_SW_ONLY 2 /* forward packets only to sw adapter */ #define NAF_BDG_MAYSLEEP 4 /* the bridge is allowed to sleep when * forwarding packets coming from this * interface */ #define NAF_MEM_OWNER 8 /* the adapter is responsible for the * deallocation of the memory allocator */ - int refcount; /* number of user-space descriptors using this +#define NAF_NATIVE_ON 16 /* the adapter is native and the attached + * interface is in netmap mode + */ +#define NAF_NETMAP_ON 32 /* netmap is active (either native or + * emulated. Where possible (e.g. FreeBSD) + * IFCAP_NETMAP also mirrors this flag. + */ +#define NAF_HOST_RINGS 64 /* the adapter supports the host rings */ + int active_fds; /* number of user-space descriptors using this interface, which is equal to the number of struct netmap_if objs in the mapped region. */ - /* - * The selwakeup in the interrupt thread can use per-ring - * and/or global wait queues. We track how many clients - * of each type we have so we can optimize the drivers, - * and especially avoid huge contention on the locks. - */ - int na_single; /* threads attached to a single hw queue */ - int na_multi; /* threads attached to multiple hw queues */ u_int num_rx_rings; /* number of adapter receive rings */ u_int num_tx_rings; /* number of adapter transmit rings */ u_int num_tx_desc; /* number of descriptor in each queue */ u_int num_rx_desc; /* tx_rings and rx_rings are private but allocated * as a contiguous chunk of memory. Each array has * N+1 entries, for the adapter queues and for the host queue. */ struct netmap_kring *tx_rings; /* array of TX rings. */ struct netmap_kring *rx_rings; /* array of RX rings. */ + void *tailroom; /* space below the rings array */ + /* (used for leases) */ + + NM_SELINFO_T tx_si, rx_si; /* global wait queues */ + /* count users of the global wait queues */ + int tx_si_users, rx_si_users; + /* copy of if_qflush and if_transmit pointers, to intercept * packets from the network stack when netmap is active. */ int (*if_transmit)(struct ifnet *, struct mbuf *); + /* copy of if_input for netmap_send_up() */ + void (*if_input)(struct ifnet *, struct mbuf *); + /* references to the ifnet and device routines, used by * the generic netmap functions. */ struct ifnet *ifp; /* adapter is ifp->if_softc */ - NM_LOCK_T core_lock; /* used if no device lock available */ + /*---- callbacks for this netmap adapter -----*/ + /* + * nm_dtor() is the cleanup routine called when destroying + * the adapter. + * + * nm_register() is called on NIOCREGIF and close() to enter + * or exit netmap mode on the NIC + * + * nm_txsync() pushes packets to the underlying hw/switch + * + * nm_rxsync() collects packets from the underlying hw/switch + * + * nm_config() returns configuration information from the OS + * + * nm_krings_create() create and init the krings array + * (the array layout must conform to the description + * found above the definition of netmap_krings_create) + * + * nm_krings_delete() cleanup and delete the kring array + * + * nm_notify() is used to act after data have become available. + * For hw devices this is typically a selwakeup(), + * but for NIC/host ports attached to a switch (or vice-versa) + * we also need to invoke the 'txsync' code downstream. 
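+ *
+ * As a sketch (not taken from any real driver; the "foo" names
+ * and the "adapter" softc fields are made up), a native driver
+ * would typically fill these in at attach time as follows:
+ *
+ *	struct netmap_adapter na;
+ *
+ *	bzero(&na, sizeof(na));
+ *	na.ifp = ifp;
+ *	na.num_tx_desc = adapter->num_tx_desc;
+ *	na.num_rx_desc = adapter->num_rx_desc;
+ *	na.num_tx_rings = na.num_rx_rings = adapter->num_queues;
+ *	na.nm_register = foo_netmap_reg;
+ *	na.nm_txsync = foo_netmap_txsync;
+ *	na.nm_rxsync = foo_netmap_rxsync;
+ *	netmap_attach(&na);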
+ */ - int (*nm_register)(struct ifnet *, int onoff); + /* private cleanup */ + void (*nm_dtor)(struct netmap_adapter *); - int (*nm_txsync)(struct ifnet *, u_int ring, int flags); - int (*nm_rxsync)(struct ifnet *, u_int ring, int flags); + int (*nm_register)(struct netmap_adapter *, int onoff); + + int (*nm_txsync)(struct netmap_adapter *, u_int ring, int flags); + int (*nm_rxsync)(struct netmap_adapter *, u_int ring, int flags); #define NAF_FORCE_READ 1 #define NAF_FORCE_RECLAIM 2 /* return configuration information */ - int (*nm_config)(struct ifnet *, u_int *txr, u_int *txd, - u_int *rxr, u_int *rxd); + int (*nm_config)(struct netmap_adapter *, + u_int *txr, u_int *txd, u_int *rxr, u_int *rxd); + int (*nm_krings_create)(struct netmap_adapter *); + void (*nm_krings_delete)(struct netmap_adapter *); + int (*nm_notify)(struct netmap_adapter *, + u_int ring, enum txrx, int flags); +#define NAF_DISABLE_NOTIFY 8 + /* standard refcount to control the lifetime of the adapter + * (it should be equal to the lifetime of the corresponding ifp) + */ + int na_refcount; + + /* memory allocator (opaque) + * We also cache a pointer to the lut_entry for translating + * buffer addresses, and the total number of buffers. + */ + struct netmap_mem_d *nm_mem; + struct lut_entry *na_lut; + uint32_t na_lut_objtotal; /* max buffer index */ + + /* used internally. If non-null, the interface cannot be bound + * from userspace + */ + void *na_private; + +#ifdef WITH_PIPES + struct netmap_pipe_adapter **na_pipes; + int na_next_pipe; + int na_max_pipes; +#endif /* WITH_PIPES */ +}; + + +/* + * If the NIC is owned by the kernel + * (i.e., bridge), neither another bridge nor user can use it; + * if the NIC is owned by a user, only users can share it. + * Evaluation must be done under NMG_LOCK(). + */ +#define NETMAP_OWNED_BY_KERN(na) (na->na_private) +#define NETMAP_OWNED_BY_ANY(na) \ + (NETMAP_OWNED_BY_KERN(na) || (na->active_fds > 0)) + + +/* + * derived netmap adapters for various types of ports + */ +struct netmap_vp_adapter { /* VALE software port */ + struct netmap_adapter up; + /* * Bridge support: * * bdg_port is the port number used in the bridge; - * na_bdg_refcount is a refcount used for bridge ports, - * when it goes to 0 we can detach+free this port - * (a bridge port is always attached if it exists; - * it is not always registered) * na_bdg points to the bridge this NA is attached to. */ int bdg_port; - int na_bdg_refcount; struct nm_bridge *na_bdg; - /* When we attach a physical interface to the bridge, we - * allow the controlling process to terminate, so we need - * a place to store the netmap_priv_d data structure. - * This is only done when physical interfaces are attached to a bridge. + int retry; + + /* Offset of ethernet header for each packet. */ + u_int virt_hdr_len; + /* Maximum Frame Size, used in bdg_mismatch_datapath() */ + u_int mfs; +}; + + +struct netmap_hw_adapter { /* physical device */ + struct netmap_adapter up; + + struct net_device_ops nm_ndo; // XXX linux only +}; + +/* Mitigation support. */ +struct nm_generic_mit { + struct hrtimer mit_timer; + int mit_pending; + struct netmap_adapter *mit_na; /* backpointer */ +}; + +struct netmap_generic_adapter { /* emulated device */ + struct netmap_hw_adapter up; + + /* Pointer to a previously used netmap adapter. 
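+	 * (i.e. the native adapter this generic one shadows, if any;
+	 * generic_netmap_dtor() puts it back into WNA(ifp))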
+	 */
+	struct netmap_adapter *prev;
+
+	/* generic netmap adapters support:
+	 * a net_device_ops struct overrides ndo_select_queue(),
+	 * save_if_input saves the if_input hook (FreeBSD),
+	 * mit implements rx interrupt mitigation,
 	 */
-	struct netmap_priv_d *na_kpriv;
+	struct net_device_ops generic_ndo;
+	void (*save_if_input)(struct ifnet *, struct mbuf *);

-	/* memory allocator */
-	struct netmap_mem_d *nm_mem;
+	struct nm_generic_mit *mit;
 #ifdef linux
-	struct net_device_ops nm_ndo;
-#endif /* linux */
+	netdev_tx_t (*save_start_xmit)(struct mbuf *, struct ifnet *);
+#endif
 };

+static __inline int
+netmap_real_tx_rings(struct netmap_adapter *na)
+{
+	return na->num_tx_rings + !!(na->na_flags & NAF_HOST_RINGS);
+}
+
+static __inline int
+netmap_real_rx_rings(struct netmap_adapter *na)
+{
+	return na->num_rx_rings + !!(na->na_flags & NAF_HOST_RINGS);
+}
+
+#ifdef WITH_VALE
+
 /*
- * Available space in the ring.
+ * Bridge wrapper for non-VALE ports attached to a VALE switch.
+ *
+ * The real device must already have its own netmap adapter (hwna).
+ * The bridge wrapper and the hwna adapter share the same set of
+ * netmap rings and buffers, but they have two separate sets of
+ * krings descriptors, with tx/rx meanings swapped:
+ *
+ *                                 netmap
+ *          bwrap     krings       rings       krings      hwna
+ *        +------+   +------+     +-----+     +------+   +------+
+ *        |tx_rings->|      |\   /|     |----|      |<-tx_rings|
+ *        |      |   +------+ \ / +-----+     +------+   |      |
+ *        |      |             X                         |      |
+ *        |      |            / \                        |      |
+ *        |      |   +------+/   \+-----+     +------+   |      |
+ *        |rx_rings->|      |     |     |----|      |<-rx_rings|
+ *        |      |   +------+     +-----+     +------+   |      |
+ *        +------+                                       +------+
+ *
+ * - packets coming from the bridge go to the bwrap rx rings,
+ *   which are also the hwna tx rings. The bwrap notify callback
+ *   will then complete the hwna tx (see netmap_bwrap_notify).
+ *
+ * - packets coming from the outside go to the hwna rx rings,
+ *   which are also the bwrap tx rings. The (overwritten) hwna
+ *   notify method will then complete the bridge tx
+ *   (see netmap_bwrap_intr_notify).
+ *
+ * The bridge wrapper may optionally connect the hwna 'host' rings
+ * to the bridge. This is done by using a second port in the
+ * bridge and connecting it to the 'host' netmap_vp_adapter
+ * contained in the netmap_bwrap_adapter. The bwrap host adapter
+ * cross-links the hwna host rings in the same way as shown above.
+ *
+ * - packets coming from the bridge and directed to the host stack
+ *   are handled by the bwrap host notify callback
+ *   (see netmap_bwrap_host_notify)
+ *
+ * - packets coming from the host stack are still handled by the
+ *   overwritten hwna notify callback (netmap_bwrap_intr_notify),
+ *   but are diverted to the host adapter depending on the ring number.
+ *
 */
+struct netmap_bwrap_adapter {
+	struct netmap_vp_adapter up;
+	struct netmap_vp_adapter host;  /* for host rings */
+	struct netmap_adapter *hwna;	/* the underlying device */
+
+	/* backup of the hwna notify callback */
+	int (*save_notify)(struct netmap_adapter *,
+			u_int ring, enum txrx, int flags);
+
+	/*
+	 * When we attach a physical interface to the bridge, we
+	 * allow the controlling process to terminate, so we need
+	 * a place to store the netmap_priv_d data structure.
+	 * This is only done when physical interfaces
+	 * are attached to a bridge.
+ */ + struct netmap_priv_d *na_kpriv; +}; + + +#endif /* WITH_VALE */ + +#ifdef WITH_PIPES + +#define NM_MAXPIPES 64 /* max number of pipes per adapter */ + +struct netmap_pipe_adapter { + struct netmap_adapter up; + + u_int id; /* pipe identifier */ + int role; /* either NR_REG_PIPE_MASTER or NR_REG_PIPE_SLAVE */ + + struct netmap_adapter *parent; /* adapter that owns the memory */ + struct netmap_pipe_adapter *peer; /* the other end of the pipe */ + int peer_ref; /* 1 iff we are holding a ref to the peer */ + + u_int parent_slot; /* index in the parent pipe array */ +}; + +#endif /* WITH_PIPES */ + + +/* return slots reserved to rx clients; used in drivers */ static inline uint32_t -nm_kr_space(struct netmap_kring *k, int is_rx) +nm_kr_rxspace(struct netmap_kring *k) { - int space; + int space = k->nr_hwtail - k->nr_hwcur; + if (space < 0) + space += k->nkr_num_slots; + ND("preserving %d rx slots %d -> %d", space, k->nr_hwcur, k->nr_hwtail); - if (is_rx) { - int busy = k->nkr_hwlease - k->nr_hwcur; - if (busy < 0) - busy += k->nkr_num_slots; - space = k->nkr_num_slots - 1 - busy; - } else { - space = k->nr_hwcur + k->nr_hwavail - k->nkr_hwlease; - if (space < 0) - space += k->nkr_num_slots; - } -#if 0 - // sanity check - if (k->nkr_hwlease >= k->nkr_num_slots || - k->nr_hwcur >= k->nkr_num_slots || - k->nr_hwavail >= k->nkr_num_slots || - busy < 0 || - busy >= k->nkr_num_slots) { - D("invalid kring, cur %d avail %d lease %d lease_idx %d lim %d", k->nr_hwcur, k->nr_hwavail, k->nkr_hwlease, - k->nkr_lease_idx, k->nkr_num_slots); - } -#endif return space; } -/* return update position */ -static inline uint32_t -nm_kr_rxpos(struct netmap_kring *k) +/* True if no space in the tx ring. only valid after txsync_prologue */ +static inline int +nm_kr_txempty(struct netmap_kring *kring) { - uint32_t pos = k->nr_hwcur + k->nr_hwavail; - if (pos >= k->nkr_num_slots) - pos -= k->nkr_num_slots; -#if 0 - if (pos >= k->nkr_num_slots || - k->nkr_hwlease >= k->nkr_num_slots || - k->nr_hwcur >= k->nkr_num_slots || - k->nr_hwavail >= k->nkr_num_slots || - k->nkr_lease_idx >= k->nkr_num_slots) { - D("invalid kring, cur %d avail %d lease %d lease_idx %d lim %d", k->nr_hwcur, k->nr_hwavail, k->nkr_hwlease, - k->nkr_lease_idx, k->nkr_num_slots); - } -#endif - return pos; + return kring->rcur == kring->nr_hwtail; } -/* make a lease on the kring for N positions. return the - * lease index +/* + * protect against multiple threads using the same ring. + * also check that the ring has not been stopped. + * We only care for 0 or !=0 as a return code. 
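+ *
+ * A typical usage pattern (a sketch, not verbatim from the code
+ * below) is:
+ *
+ *	if (nm_kr_tryget(kring) != 0)
+ *		return;		// ring stopped or busy
+ *	... access kring->ring ...
+ *	nm_kr_put(kring);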
*/ -static inline uint32_t -nm_kr_lease(struct netmap_kring *k, u_int n, int is_rx) +#define NM_KR_BUSY 1 +#define NM_KR_STOPPED 2 + + +static __inline void nm_kr_put(struct netmap_kring *kr) { - uint32_t lim = k->nkr_num_slots - 1; - uint32_t lease_idx = k->nkr_lease_idx; + NM_ATOMIC_CLEAR(&kr->nr_busy); +} - k->nkr_leases[lease_idx] = NR_NOSLOT; - k->nkr_lease_idx = nm_next(lease_idx, lim); - if (n > nm_kr_space(k, is_rx)) { - D("invalid request for %d slots", n); - panic("x"); +static __inline int nm_kr_tryget(struct netmap_kring *kr) +{ + /* check a first time without taking the lock + * to avoid starvation for nm_kr_get() + */ + if (unlikely(kr->nkr_stopped)) { + ND("ring %p stopped (%d)", kr, kr->nkr_stopped); + return NM_KR_STOPPED; } - /* XXX verify that there are n slots */ - k->nkr_hwlease += n; - if (k->nkr_hwlease > lim) - k->nkr_hwlease -= lim + 1; - - if (k->nkr_hwlease >= k->nkr_num_slots || - k->nr_hwcur >= k->nkr_num_slots || - k->nr_hwavail >= k->nkr_num_slots || - k->nkr_lease_idx >= k->nkr_num_slots) { - D("invalid kring %s, cur %d avail %d lease %d lease_idx %d lim %d", - k->na->ifp->if_xname, - k->nr_hwcur, k->nr_hwavail, k->nkr_hwlease, - k->nkr_lease_idx, k->nkr_num_slots); + if (unlikely(NM_ATOMIC_TEST_AND_SET(&kr->nr_busy))) + return NM_KR_BUSY; + /* check a second time with lock held */ + if (unlikely(kr->nkr_stopped)) { + ND("ring %p stopped (%d)", kr, kr->nkr_stopped); + nm_kr_put(kr); + return NM_KR_STOPPED; } - return lease_idx; + return 0; } /* - * XXX NETMAP_DELETING() is unused - * - * The combination of "enable" (ifp->if_capenable & IFCAP_NETMAP) - * and refcount gives the status of the interface, namely: - * - * enable refcount Status - * - * FALSE 0 normal operation - * FALSE != 0 -- (impossible) - * TRUE 1 netmap mode - * TRUE 0 being deleted. - */ - -#define NETMAP_DELETING(_na) ( ((_na)->refcount == 0) && \ - ( (_na)->ifp->if_capenable & IFCAP_NETMAP) ) - - -/* - * The following are support routines used by individual drivers to + * The following functions are used by individual drivers to * support netmap operation. * * netmap_attach() initializes a struct netmap_adapter, allocating the * struct netmap_ring's and the struct selinfo. * * netmap_detach() frees the memory allocated by netmap_attach(). * * netmap_transmit() replaces the if_transmit routine of the interface, * and is used to intercept packets coming from the stack. * * netmap_load_map/netmap_reload_map are helper routines to set/reset * the dmamap for a packet buffer * * netmap_reset() is a helper routine to be called in the driver * when reinitializing a ring. 
*/ -int netmap_attach(struct netmap_adapter *, u_int); +int netmap_attach(struct netmap_adapter *); +int netmap_attach_common(struct netmap_adapter *); +void netmap_detach_common(struct netmap_adapter *na); void netmap_detach(struct ifnet *); int netmap_transmit(struct ifnet *, struct mbuf *); -enum txrx { NR_RX = 0, NR_TX = 1 }; struct netmap_slot *netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n, u_int new_cur); int netmap_ring_reinit(struct netmap_kring *); +/* default functions to handle rx/tx interrupts */ +int netmap_rx_irq(struct ifnet *, u_int, u_int *); +#define netmap_tx_irq(_n, _q) netmap_rx_irq(_n, _q, NULL) +void netmap_common_irq(struct ifnet *, u_int, u_int *work_done); + +void netmap_disable_all_rings(struct ifnet *); +void netmap_enable_all_rings(struct ifnet *); +void netmap_disable_ring(struct netmap_kring *kr); + + +/* set/clear native flags and if_transmit/netdev_ops */ +static inline void +nm_set_native_flags(struct netmap_adapter *na) +{ + struct ifnet *ifp = na->ifp; + + na->na_flags |= (NAF_NATIVE_ON | NAF_NETMAP_ON); +#ifdef IFCAP_NETMAP /* or FreeBSD ? */ + ifp->if_capenable |= IFCAP_NETMAP; +#endif +#ifdef __FreeBSD__ + na->if_transmit = ifp->if_transmit; + ifp->if_transmit = netmap_transmit; +#else + na->if_transmit = (void *)ifp->netdev_ops; + ifp->netdev_ops = &((struct netmap_hw_adapter *)na)->nm_ndo; +#endif +} + + +static inline void +nm_clear_native_flags(struct netmap_adapter *na) +{ + struct ifnet *ifp = na->ifp; + +#ifdef __FreeBSD__ + ifp->if_transmit = na->if_transmit; +#else + ifp->netdev_ops = (void *)na->if_transmit; +#endif + na->na_flags &= ~(NAF_NATIVE_ON | NAF_NETMAP_ON); +#ifdef IFCAP_NETMAP /* or FreeBSD ? */ + ifp->if_capenable &= ~IFCAP_NETMAP; +#endif +} + + +/* + * validates parameters in the ring/kring, returns a value for head + * If any error, returns ring_size to force a reinit. + */ +uint32_t nm_txsync_prologue(struct netmap_kring *); + + +/* + * validates parameters in the ring/kring, returns a value for head, + * and the 'reserved' value in the argument. + * If any error, returns ring_size lim to force a reinit. + */ +uint32_t nm_rxsync_prologue(struct netmap_kring *); + + +/* + * update kring and ring at the end of txsync. 
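+ * (i.e. publish nr_hwtail into ring->tail, so that userspace
+ * sees the slots just reclaimed from the NIC)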
+ */ +static inline void +nm_txsync_finalize(struct netmap_kring *kring) +{ + /* update ring tail to what the kernel knows */ + kring->ring->tail = kring->rtail = kring->nr_hwtail; + + /* note, head/rhead/hwcur might be behind cur/rcur + * if no carrier + */ + ND(5, "%s now hwcur %d hwtail %d head %d cur %d tail %d", + kring->name, kring->nr_hwcur, kring->nr_hwtail, + kring->rhead, kring->rcur, kring->rtail); +} + + +/* + * update kring and ring at the end of rxsync + */ +static inline void +nm_rxsync_finalize(struct netmap_kring *kring) +{ + /* tell userspace that there might be new packets */ + //struct netmap_ring *ring = kring->ring; + ND("head %d cur %d tail %d -> %d", ring->head, ring->cur, ring->tail, + kring->nr_hwtail); + kring->ring->tail = kring->rtail = kring->nr_hwtail; + /* make a copy of the state for next round */ + kring->rhead = kring->ring->head; + kring->rcur = kring->ring->cur; +} + + +/* check/fix address and len in tx rings */ +#if 1 /* debug version */ +#define NM_CHECK_ADDR_LEN(_a, _l) do { \ + if (_a == netmap_buffer_base || _l > NETMAP_BUF_SIZE) { \ + RD(5, "bad addr/len ring %d slot %d idx %d len %d", \ + ring_nr, nm_i, slot->buf_idx, len); \ + if (_l > NETMAP_BUF_SIZE) \ + _l = NETMAP_BUF_SIZE; \ + } } while (0) +#else /* no debug version */ +#define NM_CHECK_ADDR_LEN(_a, _l) do { \ + if (_l > NETMAP_BUF_SIZE) \ + _l = NETMAP_BUF_SIZE; \ + } while (0) +#endif + + +/*---------------------------------------------------------------*/ +/* + * Support routines to be used with the VALE switch + */ +int netmap_update_config(struct netmap_adapter *na); +int netmap_krings_create(struct netmap_adapter *na, u_int tailroom); +void netmap_krings_delete(struct netmap_adapter *na); +int netmap_rxsync_from_host(struct netmap_adapter *na, struct thread *td, void *pwait); + + +struct netmap_if * +netmap_do_regif(struct netmap_priv_d *priv, struct netmap_adapter *na, + uint16_t ringid, uint32_t flags, int *err); + + + u_int nm_bound_var(u_int *v, u_int dflt, u_int lo, u_int hi, const char *msg); +int netmap_get_na(struct nmreq *nmr, struct netmap_adapter **na, int create); +int netmap_get_hw_na(struct ifnet *ifp, struct netmap_adapter **na); + +#ifdef WITH_VALE /* - * The following bridge-related interfaces are used by other kernel modules - * In the version that only supports unicast or broadcast, the lookup + * The following bridge-related functions are used by other + * kernel modules. + * + * VALE only supports unicast or broadcast. The lookup * function can return 0 .. NM_BDG_MAXPORTS-1 for regular ports, * NM_BDG_MAXPORTS for broadcast, NM_BDG_MAXPORTS+1 for unknown. * XXX in practice "unknown" might be handled same as broadcast. 
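 *
 * A custom lookup function with the bdg_lookup_fn_t signature
 * could look like this (a sketch; "my_lookup" is a made-up name):
 *
 *	static u_int
 *	my_lookup(char *buf, u_int len, uint8_t *ring_nr,
 *		struct netmap_vp_adapter *vpna)
 *	{
 *		return NM_BDG_BROADCAST;	// flood all ports
 *	}
 *
 * and would be installed with netmap_bdg_ctl(nmr, my_lookup).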
*/ -typedef u_int (*bdg_lookup_fn_t)(char *buf, u_int len, uint8_t *ring_nr, - struct netmap_adapter *); -int netmap_bdg_ctl(struct nmreq *nmr, bdg_lookup_fn_t func); -u_int netmap_bdg_learning(char *, u_int, uint8_t *, struct netmap_adapter *); -#define NM_NAME "vale" /* prefix for the bridge port name */ -#define NM_BDG_MAXPORTS 254 /* up to 32 for bitmap, 254 ok otherwise */ +typedef u_int (*bdg_lookup_fn_t)(char *buf, u_int len, + uint8_t *ring_nr, struct netmap_vp_adapter *); +u_int netmap_bdg_learning(char *, u_int, uint8_t *, + struct netmap_vp_adapter *); + +#define NM_BDG_MAXPORTS 254 /* up to 254 */ #define NM_BDG_BROADCAST NM_BDG_MAXPORTS #define NM_BDG_NOPORT (NM_BDG_MAXPORTS+1) +#define NM_NAME "vale" /* prefix for bridge port name */ + + +/* these are redefined in case of no VALE support */ +int netmap_get_bdg_na(struct nmreq *nmr, struct netmap_adapter **na, int create); +void netmap_init_bridges(void); +int netmap_bdg_ctl(struct nmreq *nmr, bdg_lookup_fn_t func); + +#else /* !WITH_VALE */ +#define netmap_get_bdg_na(_1, _2, _3) 0 +#define netmap_init_bridges(_1) +#define netmap_bdg_ctl(_1, _2) EINVAL +#endif /* !WITH_VALE */ + +#ifdef WITH_PIPES +/* max number of pipes per device */ +#define NM_MAXPIPES 64 /* XXX how many? */ +/* in case of no error, returns the actual number of pipes in nmr->nr_arg1 */ +int netmap_pipe_alloc(struct netmap_adapter *, struct nmreq *nmr); +void netmap_pipe_dealloc(struct netmap_adapter *); +int netmap_get_pipe_na(struct nmreq *nmr, struct netmap_adapter **na, int create); +#else /* !WITH_PIPES */ +#define NM_MAXPIPES 0 +#define netmap_pipe_alloc(_1, _2) EOPNOTSUPP +#define netmap_pipe_dealloc(_1) +#define netmap_get_pipe_na(_1, _2, _3) 0 +#endif + +/* Various prototypes */ +int netmap_poll(struct cdev *dev, int events, struct thread *td); +int netmap_init(void); +void netmap_fini(void); +int netmap_get_memory(struct netmap_priv_d* p); +void netmap_dtor(void *data); +int netmap_dtor_locked(struct netmap_priv_d *priv); + +int netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag, struct thread *td); + +/* netmap_adapter creation/destruction */ +#define NM_IFPNAME(ifp) ((ifp) ? 
(ifp)->if_xname : "zombie") + +// #define NM_DEBUG_PUTGET 1 + +#ifdef NM_DEBUG_PUTGET + +#define NM_DBG(f) __##f + +void __netmap_adapter_get(struct netmap_adapter *na); + +#define netmap_adapter_get(na) \ + do { \ + struct netmap_adapter *__na = na; \ + D("getting %p:%s (%d)", __na, NM_IFPNAME(__na->ifp), __na->na_refcount); \ + __netmap_adapter_get(__na); \ + } while (0) + +int __netmap_adapter_put(struct netmap_adapter *na); + +#define netmap_adapter_put(na) \ + ({ \ + struct netmap_adapter *__na = na; \ + D("putting %p:%s (%d)", __na, NM_IFPNAME(__na->ifp), __na->na_refcount); \ + __netmap_adapter_put(__na); \ + }) + +#else /* !NM_DEBUG_PUTGET */ + +#define NM_DBG(f) f +void netmap_adapter_get(struct netmap_adapter *na); +int netmap_adapter_put(struct netmap_adapter *na); + +#endif /* !NM_DEBUG_PUTGET */ + + +/* + * module variables + */ extern u_int netmap_buf_size; #define NETMAP_BUF_SIZE netmap_buf_size // XXX remove -extern int netmap_mitigate; +extern int netmap_mitigate; // XXX not really used extern int netmap_no_pendintr; -extern u_int netmap_total_buffers; -extern char *netmap_buffer_base; +extern u_int netmap_total_buffers; // global allocator +extern char *netmap_buffer_base; // global allocator extern int netmap_verbose; // XXX debugging enum { /* verbose flags */ NM_VERB_ON = 1, /* generic verbose */ NM_VERB_HOST = 0x2, /* verbose host stack */ NM_VERB_RXSYNC = 0x10, /* verbose on rxsync/txsync */ NM_VERB_TXSYNC = 0x20, NM_VERB_RXINTR = 0x100, /* verbose on rx/tx intr (driver) */ NM_VERB_TXINTR = 0x200, NM_VERB_NIC_RXSYNC = 0x1000, /* verbose on rx/tx intr (driver) */ NM_VERB_NIC_TXSYNC = 0x2000, }; +extern int netmap_txsync_retry; +extern int netmap_generic_mit; +extern int netmap_generic_ringsize; +extern int netmap_generic_rings; + /* * NA returns a pointer to the struct netmap adapter from the ifp, * WNA is used to write it. - * SWNA() is used for the "host stack" endpoint associated - * to an interface. It is allocated together with the main NA(), - * as an array of two objects. */ #ifndef WNA #define WNA(_ifp) (_ifp)->if_pspare[0] #endif #define NA(_ifp) ((struct netmap_adapter *)WNA(_ifp)) -#define SWNA(_ifp) (NA(_ifp) + 1) /* * Macros to determine if an interface is netmap capable or netmap enabled. * See the magic field in struct netmap_adapter. */ #ifdef __FreeBSD__ /* * on FreeBSD just use if_capabilities and if_capenable. */ #define NETMAP_CAPABLE(ifp) (NA(ifp) && \ (ifp)->if_capabilities & IFCAP_NETMAP ) #define NETMAP_SET_CAPABLE(ifp) \ (ifp)->if_capabilities |= IFCAP_NETMAP #else /* linux */ /* * on linux: * we check if NA(ifp) is set and its first element has a related * magic value. The capenable is within the struct netmap_adapter. */ #define NETMAP_MAGIC 0x52697a7a #define NETMAP_CAPABLE(ifp) (NA(ifp) && \ ((uint32_t)(uintptr_t)NA(ifp) ^ NA(ifp)->magic) == NETMAP_MAGIC ) #define NETMAP_SET_CAPABLE(ifp) \ NA(ifp)->magic = ((uint32_t)(uintptr_t)NA(ifp)) ^ NETMAP_MAGIC #endif /* linux */ #ifdef __FreeBSD__ -/* Callback invoked by the dma machinery after a successfull dmamap_load */ + +/* Callback invoked by the dma machinery after a successful dmamap_load */ static void netmap_dmamap_cb(__unused void *arg, __unused bus_dma_segment_t * segs, __unused int nseg, __unused int error) { } /* bus_dmamap_load wrapper: call aforementioned function if map != NULL. * XXX can we do it without a callback ? 
*/
static inline void
netmap_load_map(bus_dma_tag_t tag, bus_dmamap_t map, void *buf)
{
	if (map)
		bus_dmamap_load(tag, map, buf, NETMAP_BUF_SIZE,
		    netmap_dmamap_cb, NULL, BUS_DMA_NOWAIT);
}

/* update the map when a buffer changes. */
static inline void
netmap_reload_map(bus_dma_tag_t tag, bus_dmamap_t map, void *buf)
{
	if (map) {
		bus_dmamap_unload(tag, map);
		bus_dmamap_load(tag, map, buf, NETMAP_BUF_SIZE,
		    netmap_dmamap_cb, NULL, BUS_DMA_NOWAIT);
	}
}
+
#else /* linux */

/*
 * XXX How do we redefine these functions:
 *
 * on linux we need
 *	dma_map_single(&pdev->dev, virt_addr, len, direction)
 *	dma_unmap_single(&adapter->pdev->dev, phys_addr, len, direction)
 * The len can be implicit (on netmap it is NETMAP_BUF_SIZE)
 * unfortunately the direction is not, so we need to change
 * something to have a cross API
 */
#define netmap_load_map(_t, _m, _b)
#define netmap_reload_map(_t, _m, _b)
#if 0
	struct e1000_buffer *buffer_info = &tx_ring->buffer_info[l];
	/* set time_stamp *before* dma to help avoid a possible race */
	buffer_info->time_stamp = jiffies;
	buffer_info->mapped_as_page = false;
	buffer_info->length = len;
	//buffer_info->next_to_watch = l;
	/* reload dma map */
	dma_unmap_single(&adapter->pdev->dev, buffer_info->dma,
			NETMAP_BUF_SIZE, DMA_TO_DEVICE);
	buffer_info->dma = dma_map_single(&adapter->pdev->dev,
			addr, NETMAP_BUF_SIZE, DMA_TO_DEVICE);

	if (dma_mapping_error(&adapter->pdev->dev, buffer_info->dma)) {
		D("dma mapping error");
		/* goto dma_error; See e1000_put_txbuf() */
		/* XXX reset */
	}
	tx_desc->buffer_addr = htole64(buffer_info->dma); //XXX
#endif

/*
 * The bus_dmamap_sync() can be one of wmb() or rmb() depending on direction.
 */
#define bus_dmamap_sync(_a, _b, _c)

#endif /* linux */


/*
 * functions to map NIC to KRING indexes (n2k) and vice versa (k2n)
 */
static inline int
netmap_idx_n2k(struct netmap_kring *kr, int idx)
{
	int n = kr->nkr_num_slots;
	idx += kr->nkr_hwofs;
	if (idx < 0)
		return idx + n;
	else if (idx < n)
		return idx;
	else
		return idx - n;
}


static inline int
netmap_idx_k2n(struct netmap_kring *kr, int idx)
{
	int n = kr->nkr_num_slots;
	idx -= kr->nkr_hwofs;
	if (idx < 0)
		return idx + n;
	else if (idx < n)
		return idx;
	else
		return idx - n;
}


/* Entries of the look-up table. */
struct lut_entry {
	void *vaddr;		/* virtual address. */
	vm_paddr_t paddr;	/* physical address. */
};

struct netmap_obj_pool;
extern struct lut_entry *netmap_buffer_lut;
#define NMB_VA(i)	(netmap_buffer_lut[i].vaddr)
#define NMB_PA(i)	(netmap_buffer_lut[i].paddr)

/*
 * NMB returns the virtual address of a buffer (buffer 0 on bad index)
 * PNMB also fills the physical address
 */
static inline void *
NMB(struct netmap_slot *slot)
{
	uint32_t i = slot->buf_idx;
	return (unlikely(i >= netmap_total_buffers)) ? NMB_VA(0) : NMB_VA(i);
}

static inline void *
PNMB(struct netmap_slot *slot, uint64_t *pp)
{
	uint32_t i = slot->buf_idx;
	void *ret = (i >= netmap_total_buffers) ? NMB_VA(0) : NMB_VA(i);

	*pp = (i >= netmap_total_buffers) ? NMB_PA(0) : NMB_PA(i);
	return ret;
}

-/* default functions to handle rx/tx interrupts */
-int netmap_rx_irq(struct ifnet *, u_int, u_int *);
-#define netmap_tx_irq(_n, _q) netmap_rx_irq(_n, _q, NULL)
+/* Generic version of NMB, which uses device-specific memory. */
+static inline void *
+BDG_NMB(struct netmap_adapter *na, struct netmap_slot *slot)
+{
+	struct lut_entry *lut = na->na_lut;
+	uint32_t i = slot->buf_idx;
+	return (unlikely(i >= na->na_lut_objtotal)) ?
+		lut[0].vaddr : lut[i].vaddr;
+}

-#ifdef __FreeBSD__
-MALLOC_DECLARE(M_NETMAP);
-#endif /* __FreeBSD__ */
-void netmap_disable_all_rings(struct ifnet *);
-void netmap_enable_all_rings(struct ifnet *);
+void netmap_txsync_to_host(struct netmap_adapter *na);
+
+
+/*
+ * Structure associated with each thread that registered an interface.
+ *
+ * The first 4 fields of this structure are written by NIOCREGIF and
+ * read by poll() and NIOC?XSYNC.
+ *
+ * There is low contention among writers (a correct user program
+ * should have none) and among writers and readers, so we use a
+ * single global lock to protect the structure initialization;
+ * since initialization involves the allocation of memory,
+ * we reuse the memory allocator lock.
+ *
+ * Read access to the structure is lock-free. Readers must check that
+ * np_nifp is not NULL before using the other fields.
+ * If np_nifp is NULL initialization has not been performed,
+ * so they should return an error to userspace.
+ *
+ * The ref_done field is used to regulate access to the refcount in the
+ * memory allocator. The refcount must be incremented at most once for
+ * each open("/dev/netmap"). The increment is performed by the first
+ * function that calls netmap_get_memory() (currently called by
+ * mmap(), NIOCGINFO and NIOCREGIF).
+ * If the refcount is incremented, it is then decremented when the
+ * private structure is destroyed.
+ */
+struct netmap_priv_d {
+	struct netmap_if * volatile np_nifp;	/* netmap if descriptor. */
+
+	struct netmap_adapter	*np_na;
+	uint32_t	np_flags;	/* from the ioctl */
+	u_int		np_txqfirst, np_txqlast; /* range of tx rings to scan */
+	u_int		np_rxqfirst, np_rxqlast; /* range of rx rings to scan */
+	uint16_t	np_txpoll;	/* XXX and also np_rxpoll ? */
+
+	struct netmap_mem_d	*np_mref;	/* use with NMG_LOCK held */
+	/* np_refcount is only used on FreeBSD */
+	int		np_refcount;	/* use with NMG_LOCK held */
+
+	/* pointers to the selinfo to be used for selrecord.
+	 * Either the local or the global one depending on the
+	 * number of rings.
+	 */
+	NM_SELINFO_T *np_rxsi, *np_txsi;
+	struct thread	*np_td;		/* kqueue, just debugging */
+};
+
+
+/*
+ * generic netmap emulation for devices that do not have
+ * native netmap support.
+ */
+int generic_netmap_attach(struct ifnet *ifp);
+
+int netmap_catch_rx(struct netmap_adapter *na, int intercept);
+void generic_rx_handler(struct ifnet *ifp, struct mbuf *m);
+void netmap_catch_tx(struct netmap_generic_adapter *na, int enable);
+int generic_xmit_frame(struct ifnet *ifp, struct mbuf *m, void *addr, u_int len, u_int ring_nr);
+int generic_find_num_desc(struct ifnet *ifp, u_int *tx, u_int *rx);
+void generic_find_num_queues(struct ifnet *ifp, u_int *txq, u_int *rxq);
+
+/*
+ * netmap_mitigation API. This is used by the generic adapter
+ * to reduce the number of interrupt requests/selwakeup
+ * to clients on incoming packets.
+ */
+void netmap_mitigation_init(struct nm_generic_mit *mit, struct netmap_adapter *na);
+void netmap_mitigation_start(struct nm_generic_mit *mit);
+void netmap_mitigation_restart(struct nm_generic_mit *mit);
+int netmap_mitigation_active(struct nm_generic_mit *mit);
+void netmap_mitigation_cleanup(struct nm_generic_mit *mit);
+
+
+
+/* Shared declarations for the VALE switch. */
+
+/*
+ * Each transmit queue accumulates a batch of packets into
+ * a structure before forwarding. Packets to the same
+ * destination are put in a list using ft_next as a link field.
+ * ft_frags and ft_next are valid only on the first fragment.
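+ *
+ * (Illustrative sketch, not part of the data structure: a consumer
+ * typically walks a per-destination chain built with ft_next, e.g.
+ *
+ *	for (i = d_head; i != end_marker; i = ft[i].ft_next)
+ *		send(ft[i].ft_buf, ft[i].ft_len);
+ *
+ * where ft[] is the batch array and d_head/end_marker are hypothetical
+ * names for the chain head and the end-of-list sentinel.)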
+ */ +struct nm_bdg_fwd { /* forwarding entry for a bridge */ + void *ft_buf; /* netmap or indirect buffer */ + uint8_t ft_frags; /* how many fragments (only on 1st frag) */ + uint8_t _ft_port; /* dst port (unused) */ + uint16_t ft_flags; /* flags, e.g. indirect */ + uint16_t ft_len; /* src fragment len */ + uint16_t ft_next; /* next packet to same destination */ +}; + +/* struct 'virtio_net_hdr' from linux. */ +struct nm_vnet_hdr { +#define VIRTIO_NET_HDR_F_NEEDS_CSUM 1 /* Use csum_start, csum_offset */ +#define VIRTIO_NET_HDR_F_DATA_VALID 2 /* Csum is valid */ + uint8_t flags; +#define VIRTIO_NET_HDR_GSO_NONE 0 /* Not a GSO frame */ +#define VIRTIO_NET_HDR_GSO_TCPV4 1 /* GSO frame, IPv4 TCP (TSO) */ +#define VIRTIO_NET_HDR_GSO_UDP 3 /* GSO frame, IPv4 UDP (UFO) */ +#define VIRTIO_NET_HDR_GSO_TCPV6 4 /* GSO frame, IPv6 TCP */ +#define VIRTIO_NET_HDR_GSO_ECN 0x80 /* TCP has ECN set */ + uint8_t gso_type; + uint16_t hdr_len; + uint16_t gso_size; + uint16_t csum_start; + uint16_t csum_offset; +}; + +#define WORST_CASE_GSO_HEADER (14+40+60) /* IPv6 + TCP */ + +/* Private definitions for IPv4, IPv6, UDP and TCP headers. */ + +struct nm_iphdr { + uint8_t version_ihl; + uint8_t tos; + uint16_t tot_len; + uint16_t id; + uint16_t frag_off; + uint8_t ttl; + uint8_t protocol; + uint16_t check; + uint32_t saddr; + uint32_t daddr; + /*The options start here. */ +}; + +struct nm_tcphdr { + uint16_t source; + uint16_t dest; + uint32_t seq; + uint32_t ack_seq; + uint8_t doff; /* Data offset + Reserved */ + uint8_t flags; + uint16_t window; + uint16_t check; + uint16_t urg_ptr; +}; + +struct nm_udphdr { + uint16_t source; + uint16_t dest; + uint16_t len; + uint16_t check; +}; + +struct nm_ipv6hdr { + uint8_t priority_version; + uint8_t flow_lbl[3]; + + uint16_t payload_len; + uint8_t nexthdr; + uint8_t hop_limit; + + uint8_t saddr[16]; + uint8_t daddr[16]; +}; + +/* Type used to store a checksum (in host byte order) that hasn't been + * folded yet. + */ +#define rawsum_t uint32_t + +rawsum_t nm_csum_raw(uint8_t *data, size_t len, rawsum_t cur_sum); +uint16_t nm_csum_ipv4(struct nm_iphdr *iph); +void nm_csum_tcpudp_ipv4(struct nm_iphdr *iph, void *data, + size_t datalen, uint16_t *check); +void nm_csum_tcpudp_ipv6(struct nm_ipv6hdr *ip6h, void *data, + size_t datalen, uint16_t *check); +uint16_t nm_csum_fold(rawsum_t cur_sum); + +void bdg_mismatch_datapath(struct netmap_vp_adapter *na, + struct netmap_vp_adapter *dst_na, + struct nm_bdg_fwd *ft_p, struct netmap_ring *ring, + u_int *j, u_int lim, u_int *howmany); #endif /* _NET_NETMAP_KERN_H_ */ Index: stable/9/sys/dev/netmap/netmap_mbq.c =================================================================== --- stable/9/sys/dev/netmap/netmap_mbq.c (nonexistent) +++ stable/9/sys/dev/netmap/netmap_mbq.c (revision 262153) @@ -0,0 +1,163 @@ +/* + * Copyright (C) 2013-2014 Vincenzo Maffione. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * $FreeBSD$
+ */
+
+
+#ifdef linux
+#include "bsd_glue.h"
+#else	/* __FreeBSD__ */
+#include <sys/param.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/systm.h>
+#include <sys/mbuf.h>
+#endif	/* __FreeBSD__ */
+
+#include "netmap_mbq.h"
+
+
+static inline void __mbq_init(struct mbq *q)
+{
+    q->head = q->tail = NULL;
+    q->count = 0;
+}
+
+
+void mbq_safe_init(struct mbq *q)
+{
+    mtx_init(&q->lock, "mbq", NULL, MTX_SPIN);
+    __mbq_init(q);
+}
+
+
+void mbq_init(struct mbq *q)
+{
+    __mbq_init(q);
+}
+
+
+static inline void __mbq_enqueue(struct mbq *q, struct mbuf *m)
+{
+    m->m_nextpkt = NULL;
+    if (q->tail) {
+        q->tail->m_nextpkt = m;
+        q->tail = m;
+    } else {
+        q->head = q->tail = m;
+    }
+    q->count++;
+}
+
+
+void mbq_safe_enqueue(struct mbq *q, struct mbuf *m)
+{
+    mtx_lock(&q->lock);
+    __mbq_enqueue(q, m);
+    mtx_unlock(&q->lock);
+}
+
+
+void mbq_enqueue(struct mbq *q, struct mbuf *m)
+{
+    __mbq_enqueue(q, m);
+}
+
+
+static inline struct mbuf *__mbq_dequeue(struct mbq *q)
+{
+    struct mbuf *ret = NULL;
+
+    if (q->head) {
+        ret = q->head;
+        q->head = ret->m_nextpkt;
+        if (q->head == NULL) {
+            q->tail = NULL;
+        }
+        q->count--;
+        ret->m_nextpkt = NULL;
+    }
+
+    return ret;
+}
+
+
+struct mbuf *mbq_safe_dequeue(struct mbq *q)
+{
+    struct mbuf *ret;
+
+    mtx_lock(&q->lock);
+    ret = __mbq_dequeue(q);
+    mtx_unlock(&q->lock);
+
+    return ret;
+}
+
+
+struct mbuf *mbq_dequeue(struct mbq *q)
+{
+    return __mbq_dequeue(q);
+}
+
+
+/* XXX seems pointless to have a generic purge */
+static void __mbq_purge(struct mbq *q, int safe)
+{
+    struct mbuf *m;
+
+    for (;;) {
+        m = safe ? mbq_safe_dequeue(q) : mbq_dequeue(q);
+        if (m) {
+            m_freem(m);
+        } else {
+            break;
+        }
+    }
+}
+
+
+void mbq_purge(struct mbq *q)
+{
+    __mbq_purge(q, 0);
+}
+
+
+void mbq_safe_purge(struct mbq *q)
+{
+    __mbq_purge(q, 1);
+}
+
+
+void mbq_safe_destroy(struct mbq *q)
+{
+    mtx_destroy(&q->lock);
+}
+
+
+void mbq_destroy(struct mbq *q)
+{
+}

Property changes on: stable/9/sys/dev/netmap/netmap_mbq.c
___________________________________________________________________
Added: svn:keywords
## -0,0 +1 ##
+FreeBSD=%H
\ No newline at end of property
Index: stable/9/sys/dev/netmap/netmap_mbq.h
===================================================================
--- stable/9/sys/dev/netmap/netmap_mbq.h	(nonexistent)
+++ stable/9/sys/dev/netmap/netmap_mbq.h	(revision 262153)
@@ -0,0 +1,78 @@
+/*
+ * Copyright (C) 2013-2014 Vincenzo Maffione. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * $FreeBSD$
+ */
+
+
+#ifndef __NETMAP_MBQ_H__
+#define __NETMAP_MBQ_H__
+
+/*
+ * These functions implement an mbuf tailq with an optional lock.
+ * The base functions act ONLY ON THE QUEUE, whereas the "safe"
+ * variants (mbq_safe_*) also handle the lock.
+ */
+
+/* XXX probably rely on a previous definition of SPINLOCK_T */
+#ifdef linux
+#define SPINLOCK_T  safe_spinlock_t
+#else
+#define SPINLOCK_T  struct mtx
+#endif
+
+/* A FIFO queue of mbufs with an optional lock. */
+struct mbq {
+    struct mbuf *head;
+    struct mbuf *tail;
+    int count;
+    SPINLOCK_T lock;
+};
+
+/* XXX "destroy" does not match "init" as a name.
+ * We should also clarify whether init can be used while
+ * holding a lock, and whether mbq_safe_destroy() is a NOP.
+ */
+void mbq_init(struct mbq *q);
+void mbq_destroy(struct mbq *q);
+void mbq_enqueue(struct mbq *q, struct mbuf *m);
+struct mbuf *mbq_dequeue(struct mbq *q);
+void mbq_purge(struct mbq *q);
+
+/* XXX missing mbq_lock() and mbq_unlock() */
+
+void mbq_safe_init(struct mbq *q);
+void mbq_safe_destroy(struct mbq *q);
+void mbq_safe_enqueue(struct mbq *q, struct mbuf *m);
+struct mbuf *mbq_safe_dequeue(struct mbq *q);
+void mbq_safe_purge(struct mbq *q);
+
+static inline unsigned int mbq_len(struct mbq *q)
+{
+    return q->count;
+}
+
+#endif /* __NETMAP_MBQ_H__ */

Property changes on: stable/9/sys/dev/netmap/netmap_mbq.h
___________________________________________________________________
Added: svn:keywords
## -0,0 +1 ##
+FreeBSD=%H
\ No newline at end of property
Index: stable/9/sys/dev/netmap/netmap_mem2.c
===================================================================
--- stable/9/sys/dev/netmap/netmap_mem2.c	(revision 262152)
+++ stable/9/sys/dev/netmap/netmap_mem2.c	(revision 262153)
@@ -1,1190 +1,1377 @@
/*
- * Copyright (C) 2012-2013 Matteo Landi, Luigi Rizzo, Giuseppe Lettieri. All rights reserved.
+ * Copyright (C) 2012-2014 Matteo Landi, Luigi Rizzo, Giuseppe Lettieri. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
+ *    documentation and/or other materials provided with the distribution. 
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#ifdef linux
#include "bsd_glue.h"
#endif /* linux */

#ifdef __APPLE__
#include "osx_glue.h"
#endif /* __APPLE__ */

#ifdef __FreeBSD__
#include <sys/cdefs.h> /* prerequisite */
__FBSDID("$FreeBSD$");

#include <sys/types.h>
#include <sys/malloc.h>
#include <vm/vm.h>	/* vtophys */
#include <vm/pmap.h>	/* vtophys */
#include <sys/socket.h> /* sockaddrs */
#include <sys/selinfo.h>
#include <sys/sysctl.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/vnet.h>
#include <machine/bus.h>	/* bus_dmamap_* */
#endif /* __FreeBSD__ */

#include <net/netmap.h>
#include <dev/netmap/netmap_kern.h>
#include "netmap_mem2.h"

#ifdef linux
#define NMA_LOCK_INIT(n)	sema_init(&(n)->nm_mtx, 1)
#define NMA_LOCK_DESTROY(n)
#define NMA_LOCK(n)		down(&(n)->nm_mtx)
#define NMA_UNLOCK(n)		up(&(n)->nm_mtx)
#else /* !linux */
#define NMA_LOCK_INIT(n)	mtx_init(&(n)->nm_mtx, "netmap memory allocator lock", NULL, MTX_DEF)
#define NMA_LOCK_DESTROY(n)	mtx_destroy(&(n)->nm_mtx)
#define NMA_LOCK(n)		mtx_lock(&(n)->nm_mtx)
#define NMA_UNLOCK(n)		mtx_unlock(&(n)->nm_mtx)
#endif /* linux */

struct netmap_obj_params netmap_params[NETMAP_POOLS_NR] = {
	[NETMAP_IF_POOL] = {
		.size = 1024,
		.num  = 100,
	},
	[NETMAP_RING_POOL] = {
		.size = 9*PAGE_SIZE,
		.num  = 200,
	},
	[NETMAP_BUF_POOL] = {
		.size = 2048,
		.num  = NETMAP_BUF_MAX_NUM,
	},
};

+struct netmap_obj_params netmap_min_priv_params[NETMAP_POOLS_NR] = {
+	[NETMAP_IF_POOL] = {
+		.size = 1024,
+		.num  = 1,
+	},
+	[NETMAP_RING_POOL] = {
+		.size = 5*PAGE_SIZE,
+		.num  = 4,
+	},
+	[NETMAP_BUF_POOL] = {
+		.size = 2048,
+		.num  = 4098,
+	},
+};
+

/*
 * nm_mem is the memory allocator used for all physical interfaces
 * running in netmap mode.
 * Virtual (VALE) ports will each have their own allocator.
 */
static int netmap_mem_global_config(struct netmap_mem_d *nmd);
static int netmap_mem_global_finalize(struct netmap_mem_d *nmd);
static void netmap_mem_global_deref(struct netmap_mem_d *nmd);

struct netmap_mem_d nm_mem = {	/* Our memory allocator. */
	.pools = {
		[NETMAP_IF_POOL] = {
			.name	= "netmap_if",
			.objminsize = sizeof(struct netmap_if),
			.objmaxsize = 4096,
			.nummin     = 10,	/* don't be stingy */
			.nummax	    = 10000,	/* XXX very large */
		},
		[NETMAP_RING_POOL] = {
			.name	= "netmap_ring",
			.objminsize = sizeof(struct netmap_ring),
			.objmaxsize = 32*PAGE_SIZE,
			.nummin     = 2,
			.nummax	    = 1024,
		},
		[NETMAP_BUF_POOL] = {
			.name	= "netmap_buf",
			.objminsize = 64,
			.objmaxsize = 65536,
			.nummin     = 4,
			.nummax	    = 1000000, /* one million!
*/ }, }, .config = netmap_mem_global_config, .finalize = netmap_mem_global_finalize, .deref = netmap_mem_global_deref, + + .nm_id = 1, + + .prev = &nm_mem, + .next = &nm_mem, }; +struct netmap_mem_d *netmap_last_mem_d = &nm_mem; + // XXX logically belongs to nm_mem struct lut_entry *netmap_buffer_lut; /* exported */ /* blueprint for the private memory allocators */ static int netmap_mem_private_config(struct netmap_mem_d *nmd); static int netmap_mem_private_finalize(struct netmap_mem_d *nmd); static void netmap_mem_private_deref(struct netmap_mem_d *nmd); const struct netmap_mem_d nm_blueprint = { .pools = { [NETMAP_IF_POOL] = { .name = "%s_if", .objminsize = sizeof(struct netmap_if), .objmaxsize = 4096, .nummin = 1, - .nummax = 10, + .nummax = 100, }, [NETMAP_RING_POOL] = { .name = "%s_ring", .objminsize = sizeof(struct netmap_ring), .objmaxsize = 32*PAGE_SIZE, .nummin = 2, .nummax = 1024, }, [NETMAP_BUF_POOL] = { .name = "%s_buf", .objminsize = 64, .objmaxsize = 65536, .nummin = 4, .nummax = 1000000, /* one million! */ }, }, .config = netmap_mem_private_config, .finalize = netmap_mem_private_finalize, .deref = netmap_mem_private_deref, .flags = NETMAP_MEM_PRIVATE, }; /* memory allocator related sysctls */ #define STRINGIFY(x) #x #define DECLARE_SYSCTLS(id, name) \ SYSCTL_INT(_dev_netmap, OID_AUTO, name##_size, \ CTLFLAG_RW, &netmap_params[id].size, 0, "Requested size of netmap " STRINGIFY(name) "s"); \ - SYSCTL_INT(_dev_netmap, OID_AUTO, name##_curr_size, \ - CTLFLAG_RD, &nm_mem.pools[id]._objsize, 0, "Current size of netmap " STRINGIFY(name) "s"); \ - SYSCTL_INT(_dev_netmap, OID_AUTO, name##_num, \ - CTLFLAG_RW, &netmap_params[id].num, 0, "Requested number of netmap " STRINGIFY(name) "s"); \ - SYSCTL_INT(_dev_netmap, OID_AUTO, name##_curr_num, \ - CTLFLAG_RD, &nm_mem.pools[id].objtotal, 0, "Current number of netmap " STRINGIFY(name) "s") + SYSCTL_INT(_dev_netmap, OID_AUTO, name##_curr_size, \ + CTLFLAG_RD, &nm_mem.pools[id]._objsize, 0, "Current size of netmap " STRINGIFY(name) "s"); \ + SYSCTL_INT(_dev_netmap, OID_AUTO, name##_num, \ + CTLFLAG_RW, &netmap_params[id].num, 0, "Requested number of netmap " STRINGIFY(name) "s"); \ + SYSCTL_INT(_dev_netmap, OID_AUTO, name##_curr_num, \ + CTLFLAG_RD, &nm_mem.pools[id].objtotal, 0, "Current number of netmap " STRINGIFY(name) "s"); \ + SYSCTL_INT(_dev_netmap, OID_AUTO, priv_##name##_size, \ + CTLFLAG_RW, &netmap_min_priv_params[id].size, 0, \ + "Default size of private netmap " STRINGIFY(name) "s"); \ + SYSCTL_INT(_dev_netmap, OID_AUTO, priv_##name##_num, \ + CTLFLAG_RW, &netmap_min_priv_params[id].num, 0, \ + "Default number of private netmap " STRINGIFY(name) "s") SYSCTL_DECL(_dev_netmap); DECLARE_SYSCTLS(NETMAP_IF_POOL, if); DECLARE_SYSCTLS(NETMAP_RING_POOL, ring); DECLARE_SYSCTLS(NETMAP_BUF_POOL, buf); +static int +nm_mem_assign_id(struct netmap_mem_d *nmd) +{ + nm_memid_t id; + struct netmap_mem_d *scan = netmap_last_mem_d; + int error = ENOMEM; + + NMA_LOCK(&nm_mem); + + do { + /* we rely on unsigned wrap around */ + id = scan->nm_id + 1; + if (id == 0) /* reserve 0 as error value */ + id = 1; + scan = scan->next; + if (id != scan->nm_id) { + nmd->nm_id = id; + nmd->prev = scan->prev; + nmd->next = scan; + scan->prev->next = nmd; + scan->prev = nmd; + netmap_last_mem_d = nmd; + error = 0; + break; + } + } while (scan != netmap_last_mem_d); + + NMA_UNLOCK(&nm_mem); + return error; +} + +static void +nm_mem_release_id(struct netmap_mem_d *nmd) +{ + NMA_LOCK(&nm_mem); + + nmd->prev->next = nmd->next; + nmd->next->prev = nmd->prev; + + if 
(netmap_last_mem_d == nmd) + netmap_last_mem_d = nmd->prev; + + nmd->prev = nmd->next = NULL; + + NMA_UNLOCK(&nm_mem); +} + + /* * First, find the allocator that contains the requested offset, * then locate the cluster through a lookup table. */ vm_paddr_t netmap_mem_ofstophys(struct netmap_mem_d* nmd, vm_ooffset_t offset) { int i; vm_ooffset_t o = offset; vm_paddr_t pa; struct netmap_obj_pool *p; NMA_LOCK(nmd); p = nmd->pools; for (i = 0; i < NETMAP_POOLS_NR; offset -= p[i].memtotal, i++) { if (offset >= p[i].memtotal) continue; // now lookup the cluster's address pa = p[i].lut[offset / p[i]._objsize].paddr + offset % p[i]._objsize; NMA_UNLOCK(nmd); return pa; } /* this is only in case of errors */ D("invalid ofs 0x%x out of 0x%x 0x%x 0x%x", (u_int)o, p[NETMAP_IF_POOL].memtotal, p[NETMAP_IF_POOL].memtotal + p[NETMAP_RING_POOL].memtotal, p[NETMAP_IF_POOL].memtotal + p[NETMAP_RING_POOL].memtotal + p[NETMAP_BUF_POOL].memtotal); NMA_UNLOCK(nmd); return 0; // XXX bad address } int -netmap_mem_get_info(struct netmap_mem_d* nmd, u_int* size, u_int *memflags) +netmap_mem_get_info(struct netmap_mem_d* nmd, u_int* size, u_int *memflags, + nm_memid_t *id) { int error = 0; NMA_LOCK(nmd); error = nmd->config(nmd); if (error) goto out; if (nmd->flags & NETMAP_MEM_FINALIZED) { *size = nmd->nm_totalsize; } else { int i; *size = 0; for (i = 0; i < NETMAP_POOLS_NR; i++) { struct netmap_obj_pool *p = nmd->pools + i; *size += (p->_numclusters * p->_clustsize); } } *memflags = nmd->flags; + *id = nmd->nm_id; out: NMA_UNLOCK(nmd); return error; } /* * we store objects by kernel address, need to find the offset * within the pool to export the value to userspace. * Algorithm: scan until we find the cluster, then add the * actual offset in the cluster */ static ssize_t netmap_obj_offset(struct netmap_obj_pool *p, const void *vaddr) { int i, k = p->_clustentries, n = p->objtotal; ssize_t ofs = 0; for (i = 0; i < n; i += k, ofs += p->_clustsize) { const char *base = p->lut[i].vaddr; ssize_t relofs = (const char *) vaddr - base; if (relofs < 0 || relofs >= p->_clustsize) continue; ofs = ofs + relofs; ND("%s: return offset %d (cluster %d) for pointer %p", p->name, ofs, i, vaddr); return ofs; } D("address %p is not contained inside any cluster (%s)", vaddr, p->name); return 0; /* An error occurred */ } /* Helper functions which convert virtual addresses to offsets */ #define netmap_if_offset(n, v) \ netmap_obj_offset(&(n)->pools[NETMAP_IF_POOL], (v)) #define netmap_ring_offset(n, v) \ ((n)->pools[NETMAP_IF_POOL].memtotal + \ netmap_obj_offset(&(n)->pools[NETMAP_RING_POOL], (v))) #define netmap_buf_offset(n, v) \ ((n)->pools[NETMAP_IF_POOL].memtotal + \ (n)->pools[NETMAP_RING_POOL].memtotal + \ netmap_obj_offset(&(n)->pools[NETMAP_BUF_POOL], (v))) ssize_t netmap_mem_if_offset(struct netmap_mem_d *nmd, const void *addr) { ssize_t v; NMA_LOCK(nmd); v = netmap_if_offset(nmd, addr); NMA_UNLOCK(nmd); return v; } /* * report the index, and use start position as a hint, * otherwise buffer allocation becomes terribly expensive. 
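 * (Example of the intended calling pattern, with pos/index being the
 * caller's cursor variables:
 *
 *	uint32_t pos = 0, index;
 *	void *buf = netmap_obj_malloc(p, size, &pos, &index);
 *
 * successive calls resume the bitmap scan from *pos instead of
 * rescanning from slot 0.)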
*/ static void * netmap_obj_malloc(struct netmap_obj_pool *p, u_int len, uint32_t *start, uint32_t *index) { uint32_t i = 0; /* index in the bitmap */ uint32_t mask, j; /* slot counter */ void *vaddr = NULL; if (len > p->_objsize) { D("%s request size %d too large", p->name, len); // XXX cannot reduce the size return NULL; } if (p->objfree == 0) { - D("%s allocator: run out of memory", p->name); + D("no more %s objects", p->name); return NULL; } if (start) i = *start; /* termination is guaranteed by p->free, but better check bounds on i */ while (vaddr == NULL && i < p->bitmap_slots) { uint32_t cur = p->bitmap[i]; if (cur == 0) { /* bitmask is fully used */ i++; continue; } /* locate a slot */ for (j = 0, mask = 1; (cur & mask) == 0; j++, mask <<= 1) ; p->bitmap[i] &= ~mask; /* mark object as in use */ p->objfree--; vaddr = p->lut[i * 32 + j].vaddr; if (index) *index = i * 32 + j; } ND("%s allocator: allocated object @ [%d][%d]: vaddr %p", i, j, vaddr); if (start) *start = i; return vaddr; } /* - * free by index, not by address. This is slow, but is only used - * for a small number of objects (rings, nifp) + * free by index, not by address. + * XXX should we also cleanup the content ? */ -static void +static int netmap_obj_free(struct netmap_obj_pool *p, uint32_t j) { + uint32_t *ptr, mask; + if (j >= p->objtotal) { D("invalid index %u, max %u", j, p->objtotal); - return; + return 1; } - p->bitmap[j / 32] |= (1 << (j % 32)); - p->objfree++; - return; + ptr = &p->bitmap[j / 32]; + mask = (1 << (j % 32)); + if (*ptr & mask) { + D("ouch, double free on buffer %d", j); + return 1; + } else { + *ptr |= mask; + p->objfree++; + return 0; + } } +/* + * free by address. This is slow but is only used for a few + * objects (rings, nifp) + */ static void netmap_obj_free_va(struct netmap_obj_pool *p, void *vaddr) { u_int i, j, n = p->numclusters; for (i = 0, j = 0; i < n; i++, j += p->_clustentries) { void *base = p->lut[i * p->_clustentries].vaddr; ssize_t relofs = (ssize_t) vaddr - (ssize_t) base; /* Given address, is out of the scope of the current cluster.*/ if (vaddr < base || relofs >= p->_clustsize) continue; j = j + relofs / p->_objsize; /* KASSERT(j != 0, ("Cannot free object 0")); */ netmap_obj_free(p, j); return; } D("address %p is not contained inside any cluster (%s)", vaddr, p->name); } #define netmap_if_malloc(n, len) netmap_obj_malloc(&(n)->pools[NETMAP_IF_POOL], len, NULL, NULL) #define netmap_if_free(n, v) netmap_obj_free_va(&(n)->pools[NETMAP_IF_POOL], (v)) #define netmap_ring_malloc(n, len) netmap_obj_malloc(&(n)->pools[NETMAP_RING_POOL], len, NULL, NULL) #define netmap_ring_free(n, v) netmap_obj_free_va(&(n)->pools[NETMAP_RING_POOL], (v)) #define netmap_buf_malloc(n, _pos, _index) \ netmap_obj_malloc(&(n)->pools[NETMAP_BUF_POOL], NETMAP_BDG_BUF_SIZE(n), _pos, _index) +#if 0 // XXX unused /* Return the index associated to the given packet buffer */ #define netmap_buf_index(n, v) \ (netmap_obj_offset(&(n)->pools[NETMAP_BUF_POOL], (v)) / NETMAP_BDG_BUF_SIZE(n)) +#endif +/* + * allocate extra buffers in a linked list. + * returns the actual number. 
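+ * The list is threaded through the buffers themselves: the first
+ * uint32_t of each extra buffer holds the index of the next one,
+ * and 0 terminates the list. (Sketch of a consumer walking the chain,
+ * with lut being the adapter's buffer lookup table and use_buffer()
+ * a placeholder:
+ *
+ *	for (i = head; i >= 2; i = *(uint32_t *)lut[i].vaddr)
+ *		use_buffer(i);
+ *
+ * indices 0 and 1 are reserved, so they never appear in the chain.)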
+ */ +uint32_t +netmap_extra_alloc(struct netmap_adapter *na, uint32_t *head, uint32_t n) +{ + struct netmap_mem_d *nmd = na->nm_mem; + uint32_t i, pos = 0; /* opaque, scan position in the bitmap */ + NMA_LOCK(nmd); + + *head = 0; /* default, 'null' index ie empty list */ + for (i = 0 ; i < n; i++) { + uint32_t cur = *head; /* save current head */ + uint32_t *p = netmap_buf_malloc(nmd, &pos, head); + if (p == NULL) { + D("no more buffers after %d of %d", i, n); + *head = cur; /* restore */ + break; + } + RD(5, "allocate buffer %d -> %d", *head, cur); + *p = cur; /* link to previous head */ + } + + NMA_UNLOCK(nmd); + + return i; +} + +static void +netmap_extra_free(struct netmap_adapter *na, uint32_t head) +{ + struct lut_entry *lut = na->na_lut; + struct netmap_mem_d *nmd = na->nm_mem; + struct netmap_obj_pool *p = &nmd->pools[NETMAP_BUF_POOL]; + uint32_t i, cur, *buf; + + D("freeing the extra list"); + for (i = 0; head >=2 && head < p->objtotal; i++) { + cur = head; + buf = lut[head].vaddr; + head = *buf; + *buf = 0; + if (netmap_obj_free(p, cur)) + break; + } + if (head != 0) + D("breaking with head %d", head); + D("freed %d buffers", i); +} + + /* Return nonzero on error */ static int -netmap_new_bufs(struct netmap_mem_d *nmd, struct netmap_if *nifp, - struct netmap_slot *slot, u_int n) +netmap_new_bufs(struct netmap_mem_d *nmd, struct netmap_slot *slot, u_int n) { struct netmap_obj_pool *p = &nmd->pools[NETMAP_BUF_POOL]; u_int i = 0; /* slot counter */ uint32_t pos = 0; /* slot in p->bitmap */ uint32_t index = 0; /* buffer index */ - (void)nifp; /* UNUSED */ for (i = 0; i < n; i++) { void *vaddr = netmap_buf_malloc(nmd, &pos, &index); if (vaddr == NULL) { - D("unable to locate empty packet buffer"); + D("no more buffers after %d of %d", i, n); goto cleanup; } slot[i].buf_idx = index; slot[i].len = p->_objsize; - /* XXX setting flags=NS_BUF_CHANGED forces a pointer reload - * in the NIC ring. This is a hack that hides missing - * initializations in the drivers, and should go away. 
- */ - // slot[i].flags = NS_BUF_CHANGED; + slot[i].flags = 0; } ND("allocated %d buffers, %d available, first at %d", n, p->objfree, pos); return (0); cleanup: while (i > 0) { i--; netmap_obj_free(p, slot[i].buf_idx); } bzero(slot, n * sizeof(slot[0])); return (ENOMEM); } +static void +netmap_mem_set_ring(struct netmap_mem_d *nmd, struct netmap_slot *slot, u_int n, uint32_t index) +{ + struct netmap_obj_pool *p = &nmd->pools[NETMAP_BUF_POOL]; + u_int i; + for (i = 0; i < n; i++) { + slot[i].buf_idx = index; + slot[i].len = p->_objsize; + slot[i].flags = 0; + } +} + + static void -netmap_free_buf(struct netmap_mem_d *nmd, struct netmap_if *nifp, uint32_t i) +netmap_free_buf(struct netmap_mem_d *nmd, uint32_t i) { struct netmap_obj_pool *p = &nmd->pools[NETMAP_BUF_POOL]; - (void)nifp; if (i < 2 || i >= p->objtotal) { D("Cannot free buf#%d: should be in [2, %d[", i, p->objtotal); return; } netmap_obj_free(p, i); } + static void +netmap_free_bufs(struct netmap_mem_d *nmd, struct netmap_slot *slot, u_int n) +{ + u_int i; + + for (i = 0; i < n; i++) { + if (slot[i].buf_idx > 2) + netmap_free_buf(nmd, slot[i].buf_idx); + } +} + +static void netmap_reset_obj_allocator(struct netmap_obj_pool *p) { if (p == NULL) return; if (p->bitmap) free(p->bitmap, M_NETMAP); p->bitmap = NULL; if (p->lut) { u_int i; size_t sz = p->_clustsize; for (i = 0; i < p->objtotal; i += p->_clustentries) { if (p->lut[i].vaddr) contigfree(p->lut[i].vaddr, sz, M_NETMAP); } bzero(p->lut, sizeof(struct lut_entry) * p->objtotal); #ifdef linux vfree(p->lut); #else free(p->lut, M_NETMAP); #endif } p->lut = NULL; p->objtotal = 0; p->memtotal = 0; p->numclusters = 0; p->objfree = 0; } /* * Free all resources related to an allocator. */ static void netmap_destroy_obj_allocator(struct netmap_obj_pool *p) { if (p == NULL) return; netmap_reset_obj_allocator(p); } /* * We receive a request for objtotal objects, of size objsize each. * Internally we may round up both numbers, as we allocate objects * in small clusters multiple of the page size. * We need to keep track of objtotal and clustentries, * as they are needed when freeing memory. * * XXX note -- userspace needs the buffers to be contiguous, * so we cannot afford gaps at the end of a cluster. */ /* call with NMA_LOCK held */ static int netmap_config_obj_allocator(struct netmap_obj_pool *p, u_int objtotal, u_int objsize) { int i; u_int clustsize; /* the cluster size, multiple of page size */ u_int clustentries; /* how many objects per entry */ /* we store the current request, so we can * detect configuration changes later */ p->r_objtotal = objtotal; p->r_objsize = objsize; #define MAX_CLUSTSIZE (1<<17) -#define LINE_ROUND 64 +#define LINE_ROUND NM_CACHE_ALIGN // 64 if (objsize >= MAX_CLUSTSIZE) { /* we could do it but there is no point */ D("unsupported allocation for %d bytes", objsize); return EINVAL; } /* make sure objsize is a multiple of LINE_ROUND */ i = (objsize & (LINE_ROUND - 1)); if (i) { D("XXX aligning object by %d bytes", LINE_ROUND - i); objsize += LINE_ROUND - i; } if (objsize < p->objminsize || objsize > p->objmaxsize) { D("requested objsize %d out of range [%d, %d]", objsize, p->objminsize, p->objmaxsize); return EINVAL; } if (objtotal < p->nummin || objtotal > p->nummax) { D("requested objtotal %d out of range [%d, %d]", objtotal, p->nummin, p->nummax); return EINVAL; } /* * Compute number of objects using a brute-force approach: * given a max cluster size, * we try to fill it with objects keeping track of the * wasted space to the next page boundary. 
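 * (Worked example: objsize 2048 fits exactly two objects per 4 KB page,
 * so the loop stops with an exact solution at clustentries = 2; an
 * objsize of 2112 never reaches an exact page multiple within
 * MAX_CLUSTSIZE, so the loop keeps the count whose tail
 * used % PAGE_SIZE is largest, i.e. the least waste.)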
*/ for (clustentries = 0, i = 1;; i++) { u_int delta, used = i * objsize; if (used > MAX_CLUSTSIZE) break; delta = used % PAGE_SIZE; if (delta == 0) { // exact solution clustentries = i; break; } if (delta > ( (clustentries*objsize) % PAGE_SIZE) ) clustentries = i; } // D("XXX --- ouch, delta %d (bad for buffers)", delta); /* compute clustsize and round to the next page */ clustsize = clustentries * objsize; i = (clustsize & (PAGE_SIZE - 1)); if (i) clustsize += PAGE_SIZE - i; if (netmap_verbose) D("objsize %d clustsize %d objects %d", objsize, clustsize, clustentries); /* * The number of clusters is n = ceil(objtotal/clustentries) * objtotal' = n * clustentries */ p->_clustentries = clustentries; p->_clustsize = clustsize; p->_numclusters = (objtotal + clustentries - 1) / clustentries; /* actual values (may be larger than requested) */ p->_objsize = objsize; p->_objtotal = p->_numclusters * clustentries; return 0; } /* call with NMA_LOCK held */ static int netmap_finalize_obj_allocator(struct netmap_obj_pool *p) { int i; /* must be signed */ size_t n; /* optimistically assume we have enough memory */ p->numclusters = p->_numclusters; p->objtotal = p->_objtotal; n = sizeof(struct lut_entry) * p->objtotal; #ifdef linux p->lut = vmalloc(n); #else p->lut = malloc(n, M_NETMAP, M_NOWAIT | M_ZERO); #endif if (p->lut == NULL) { D("Unable to create lookup table (%d bytes) for '%s'", (int)n, p->name); goto clean; } /* Allocate the bitmap */ n = (p->objtotal + 31) / 32; p->bitmap = malloc(sizeof(uint32_t) * n, M_NETMAP, M_NOWAIT | M_ZERO); if (p->bitmap == NULL) { D("Unable to create bitmap (%d entries) for allocator '%s'", (int)n, p->name); goto clean; } p->bitmap_slots = n; /* * Allocate clusters, init pointers and bitmap */ n = p->_clustsize; for (i = 0; i < (int)p->objtotal;) { int lim = i + p->_clustentries; char *clust; clust = contigmalloc(n, M_NETMAP, M_NOWAIT | M_ZERO, (size_t)0, -1UL, PAGE_SIZE, 0); if (clust == NULL) { /* * If we get here, there is a severe memory shortage, * so halve the allocated memory to reclaim some. 
*/ D("Unable to create cluster at %d for '%s' allocator", i, p->name); if (i < 2) /* nothing to halve */ goto out; lim = i / 2; for (i--; i >= lim; i--) { p->bitmap[ (i>>5) ] &= ~( 1 << (i & 31) ); if (i % p->_clustentries == 0 && p->lut[i].vaddr) contigfree(p->lut[i].vaddr, n, M_NETMAP); } out: p->objtotal = i; /* we may have stopped in the middle of a cluster */ p->numclusters = (i + p->_clustentries - 1) / p->_clustentries; break; } for (; i < lim; i++, clust += p->_objsize) { p->bitmap[ (i>>5) ] |= ( 1 << (i & 31) ); p->lut[i].vaddr = clust; p->lut[i].paddr = vtophys(clust); } } p->objfree = p->objtotal; p->memtotal = p->numclusters * p->_clustsize; if (p->objfree == 0) goto clean; if (netmap_verbose) D("Pre-allocated %d clusters (%d/%dKB) for '%s'", p->numclusters, p->_clustsize >> 10, p->memtotal >> 10, p->name); return 0; clean: netmap_reset_obj_allocator(p); return ENOMEM; } /* call with lock held */ static int netmap_memory_config_changed(struct netmap_mem_d *nmd) { int i; for (i = 0; i < NETMAP_POOLS_NR; i++) { if (nmd->pools[i].r_objsize != netmap_params[i].size || nmd->pools[i].r_objtotal != netmap_params[i].num) return 1; } return 0; } static void netmap_mem_reset_all(struct netmap_mem_d *nmd) { int i; - D("resetting %p", nmd); + + if (netmap_verbose) + D("resetting %p", nmd); for (i = 0; i < NETMAP_POOLS_NR; i++) { netmap_reset_obj_allocator(&nmd->pools[i]); } nmd->flags &= ~NETMAP_MEM_FINALIZED; } static int netmap_mem_finalize_all(struct netmap_mem_d *nmd) { int i; if (nmd->flags & NETMAP_MEM_FINALIZED) return 0; nmd->lasterr = 0; nmd->nm_totalsize = 0; for (i = 0; i < NETMAP_POOLS_NR; i++) { nmd->lasterr = netmap_finalize_obj_allocator(&nmd->pools[i]); if (nmd->lasterr) goto error; nmd->nm_totalsize += nmd->pools[i].memtotal; } /* buffers 0 and 1 are reserved */ nmd->pools[NETMAP_BUF_POOL].objfree -= 2; nmd->pools[NETMAP_BUF_POOL].bitmap[0] = ~3; nmd->flags |= NETMAP_MEM_FINALIZED; - D("Have %d KB for interfaces, %d KB for rings and %d MB for buffers", - nmd->pools[NETMAP_IF_POOL].memtotal >> 10, - nmd->pools[NETMAP_RING_POOL].memtotal >> 10, - nmd->pools[NETMAP_BUF_POOL].memtotal >> 20); + if (netmap_verbose) + D("interfaces %d KB, rings %d KB, buffers %d MB", + nmd->pools[NETMAP_IF_POOL].memtotal >> 10, + nmd->pools[NETMAP_RING_POOL].memtotal >> 10, + nmd->pools[NETMAP_BUF_POOL].memtotal >> 20); - D("Free buffers: %d", nmd->pools[NETMAP_BUF_POOL].objfree); + if (netmap_verbose) + D("Free buffers: %d", nmd->pools[NETMAP_BUF_POOL].objfree); return 0; error: netmap_mem_reset_all(nmd); return nmd->lasterr; } void netmap_mem_private_delete(struct netmap_mem_d *nmd) { if (nmd == NULL) return; - D("deleting %p", nmd); + if (netmap_verbose) + D("deleting %p", nmd); if (nmd->refcount > 0) D("bug: deleting mem allocator with refcount=%d!", nmd->refcount); - D("done deleting %p", nmd); + nm_mem_release_id(nmd); + if (netmap_verbose) + D("done deleting %p", nmd); NMA_LOCK_DESTROY(nmd); free(nmd, M_DEVBUF); } static int netmap_mem_private_config(struct netmap_mem_d *nmd) { /* nothing to do, we are configured on creation * and configuration never changes thereafter */ return 0; } static int netmap_mem_private_finalize(struct netmap_mem_d *nmd) { int err; NMA_LOCK(nmd); nmd->refcount++; err = netmap_mem_finalize_all(nmd); NMA_UNLOCK(nmd); return err; } -static void netmap_mem_private_deref(struct netmap_mem_d *nmd) +static void +netmap_mem_private_deref(struct netmap_mem_d *nmd) { NMA_LOCK(nmd); if (--nmd->refcount <= 0) netmap_mem_reset_all(nmd); NMA_UNLOCK(nmd); } + +/* + * allocator 
for private memory + */ struct netmap_mem_d * -netmap_mem_private_new(const char *name, u_int txr, u_int txd, u_int rxr, u_int rxd) +netmap_mem_private_new(const char *name, u_int txr, u_int txd, + u_int rxr, u_int rxd, u_int extra_bufs, u_int npipes, int *perr) { struct netmap_mem_d *d = NULL; struct netmap_obj_params p[NETMAP_POOLS_NR]; - int i; - u_int maxd; + int i, err; + u_int v, maxd; d = malloc(sizeof(struct netmap_mem_d), M_DEVBUF, M_NOWAIT | M_ZERO); - if (d == NULL) - return NULL; + if (d == NULL) { + err = ENOMEM; + goto error; + } *d = nm_blueprint; - /* XXX the rest of the code assumes the stack rings are alwasy present */ + err = nm_mem_assign_id(d); + if (err) + goto error; + + /* account for the fake host rings */ txr++; rxr++; - p[NETMAP_IF_POOL].size = sizeof(struct netmap_if) + - sizeof(ssize_t) * (txr + rxr); - p[NETMAP_IF_POOL].num = 2; + + /* copy the min values */ + for (i = 0; i < NETMAP_POOLS_NR; i++) { + p[i] = netmap_min_priv_params[i]; + } + + /* possibly increase them to fit user request */ + v = sizeof(struct netmap_if) + sizeof(ssize_t) * (txr + rxr); + if (p[NETMAP_IF_POOL].size < v) + p[NETMAP_IF_POOL].size = v; + v = 2 + 4 * npipes; + if (p[NETMAP_IF_POOL].num < v) + p[NETMAP_IF_POOL].num = v; maxd = (txd > rxd) ? txd : rxd; - p[NETMAP_RING_POOL].size = sizeof(struct netmap_ring) + - sizeof(struct netmap_slot) * maxd; - p[NETMAP_RING_POOL].num = txr + rxr; - p[NETMAP_BUF_POOL].size = 2048; /* XXX find a way to let the user choose this */ - p[NETMAP_BUF_POOL].num = rxr * (rxd + 2) + txr * (txd + 2); + v = sizeof(struct netmap_ring) + sizeof(struct netmap_slot) * maxd; + if (p[NETMAP_RING_POOL].size < v) + p[NETMAP_RING_POOL].size = v; + /* each pipe endpoint needs two tx rings (1 normal + 1 host, fake) + * and two rx rings (again, 1 normal and 1 fake host) + */ + v = txr + rxr + 8 * npipes; + if (p[NETMAP_RING_POOL].num < v) + p[NETMAP_RING_POOL].num = v; + /* for each pipe we only need the buffers for the 4 "real" rings. + * On the other end, the pipe ring dimension may be different from + * the parent port ring dimension. 
As a compromise, we allocate twice the + * space actually needed if the pipe rings were the same size as the parent rings + */ + v = (4 * npipes + rxr) * rxd + (4 * npipes + txr) * txd + 2 + extra_bufs; + /* the +2 is for the tx and rx fake buffers (indices 0 and 1) */ + if (p[NETMAP_BUF_POOL].num < v) + p[NETMAP_BUF_POOL].num = v; - D("req if %d*%d ring %d*%d buf %d*%d", + if (netmap_verbose) + D("req if %d*%d ring %d*%d buf %d*%d", p[NETMAP_IF_POOL].num, p[NETMAP_IF_POOL].size, p[NETMAP_RING_POOL].num, p[NETMAP_RING_POOL].size, p[NETMAP_BUF_POOL].num, p[NETMAP_BUF_POOL].size); for (i = 0; i < NETMAP_POOLS_NR; i++) { snprintf(d->pools[i].name, NETMAP_POOL_MAX_NAMSZ, nm_blueprint.pools[i].name, name); - if (netmap_config_obj_allocator(&d->pools[i], - p[i].num, p[i].size)) + err = netmap_config_obj_allocator(&d->pools[i], + p[i].num, p[i].size); + if (err) goto error; } d->flags &= ~NETMAP_MEM_FINALIZED; NMA_LOCK_INIT(d); return d; error: netmap_mem_private_delete(d); + if (perr) + *perr = err; return NULL; } /* call with lock held */ static int netmap_mem_global_config(struct netmap_mem_d *nmd) { int i; if (nmd->refcount) /* already in use, we cannot change the configuration */ goto out; if (!netmap_memory_config_changed(nmd)) goto out; D("reconfiguring"); if (nmd->flags & NETMAP_MEM_FINALIZED) { /* reset previous allocation */ for (i = 0; i < NETMAP_POOLS_NR; i++) { netmap_reset_obj_allocator(&nmd->pools[i]); } nmd->flags &= ~NETMAP_MEM_FINALIZED; - } + } for (i = 0; i < NETMAP_POOLS_NR; i++) { nmd->lasterr = netmap_config_obj_allocator(&nmd->pools[i], netmap_params[i].num, netmap_params[i].size); if (nmd->lasterr) goto out; } out: return nmd->lasterr; } static int netmap_mem_global_finalize(struct netmap_mem_d *nmd) { int err; NMA_LOCK(nmd); /* update configuration if changed */ if (netmap_mem_global_config(nmd)) goto out; nmd->refcount++; if (nmd->flags & NETMAP_MEM_FINALIZED) { /* may happen if config is not changed */ ND("nothing to do"); goto out; } if (netmap_mem_finalize_all(nmd)) goto out; /* backward compatibility */ netmap_buf_size = nmd->pools[NETMAP_BUF_POOL]._objsize; netmap_total_buffers = nmd->pools[NETMAP_BUF_POOL].objtotal; netmap_buffer_lut = nmd->pools[NETMAP_BUF_POOL].lut; netmap_buffer_base = nmd->pools[NETMAP_BUF_POOL].lut[0].vaddr; nmd->lasterr = 0; out: if (nmd->lasterr) nmd->refcount--; err = nmd->lasterr; NMA_UNLOCK(nmd); return err; } int netmap_mem_init(void) { NMA_LOCK_INIT(&nm_mem); return (0); } void netmap_mem_fini(void) { int i; for (i = 0; i < NETMAP_POOLS_NR; i++) { netmap_destroy_obj_allocator(&nm_mem.pools[i]); } NMA_LOCK_DESTROY(&nm_mem); } static void netmap_free_rings(struct netmap_adapter *na) { - u_int i; + struct netmap_kring *kring; + struct netmap_ring *ring; if (!na->tx_rings) return; - for (i = 0; i < na->num_tx_rings + 1; i++) { - if (na->tx_rings[i].ring) { - netmap_ring_free(na->nm_mem, na->tx_rings[i].ring); - na->tx_rings[i].ring = NULL; - } + for (kring = na->tx_rings; kring != na->rx_rings; kring++) { + ring = kring->ring; + if (ring == NULL) + continue; + netmap_free_bufs(na->nm_mem, ring->slot, kring->nkr_num_slots); + netmap_ring_free(na->nm_mem, ring); + kring->ring = NULL; } - for (i = 0; i < na->num_rx_rings + 1; i++) { - if (na->rx_rings[i].ring) { - netmap_ring_free(na->nm_mem, na->rx_rings[i].ring); - na->rx_rings[i].ring = NULL; - } + for (/* cont'd from above */; kring != na->tailroom; kring++) { + ring = kring->ring; + if (ring == NULL) + continue; + netmap_free_bufs(na->nm_mem, ring->slot, kring->nkr_num_slots); + 
netmap_ring_free(na->nm_mem, ring); + kring->ring = NULL; } - free(na->tx_rings, M_DEVBUF); - na->tx_rings = na->rx_rings = NULL; } - - -/* call with NMA_LOCK held */ -/* - * Allocate the per-fd structure netmap_if. - * If this is the first instance, also allocate the krings, rings etc. +/* call with NMA_LOCK held * * - * We assume that the configuration stored in na - * (number of tx/rx rings and descs) does not change while - * the interface is in netmap mode. + * Allocate netmap rings and buffers for this card + * The rings are contiguous, but have variable size. + * The kring array must follow the layout described + * in netmap_krings_create(). */ -extern int nma_is_vp(struct netmap_adapter *na); -struct netmap_if * -netmap_mem_if_new(const char *ifname, struct netmap_adapter *na) +int +netmap_mem_rings_create(struct netmap_adapter *na) { - struct netmap_if *nifp; struct netmap_ring *ring; - ssize_t base; /* handy for relative offsets between rings and nifp */ - u_int i, len, ndesc, ntx, nrx; + u_int len, ndesc; struct netmap_kring *kring; - uint32_t *tx_leases = NULL, *rx_leases = NULL; + u_int i; - /* - * verify whether virtual port need the stack ring - */ - ntx = na->num_tx_rings + 1; /* shorthand, include stack ring */ - nrx = na->num_rx_rings + 1; /* shorthand, include stack ring */ - /* - * the descriptor is followed inline by an array of offsets - * to the tx and rx rings in the shared memory region. - * For virtual rx rings we also allocate an array of - * pointers to assign to nkr_leases. - */ - NMA_LOCK(na->nm_mem); - len = sizeof(struct netmap_if) + (nrx + ntx) * sizeof(ssize_t); - nifp = netmap_if_malloc(na->nm_mem, len); - if (nifp == NULL) { - NMA_UNLOCK(na->nm_mem); - return NULL; - } - - /* initialize base fields -- override const */ - *(u_int *)(uintptr_t)&nifp->ni_tx_rings = na->num_tx_rings; - *(u_int *)(uintptr_t)&nifp->ni_rx_rings = na->num_rx_rings; - strncpy(nifp->ni_name, ifname, (size_t)IFNAMSIZ); - - if (na->refcount) { /* already setup, we are done */ - goto final; - } - - len = (ntx + nrx) * sizeof(struct netmap_kring); - /* - * Leases are attached to TX rings on NIC/host ports, - * and to RX rings on VALE ports. - */ - if (nma_is_vp(na)) { - len += sizeof(uint32_t) * na->num_rx_desc * na->num_rx_rings; - } else { - len += sizeof(uint32_t) * na->num_tx_desc * ntx; - } - - na->tx_rings = malloc((size_t)len, M_DEVBUF, M_NOWAIT | M_ZERO); - if (na->tx_rings == NULL) { - D("Cannot allocate krings for %s", ifname); - goto cleanup; - } - na->rx_rings = na->tx_rings + ntx; - - if (nma_is_vp(na)) { - rx_leases = (uint32_t *)(na->rx_rings + nrx); - } else { - tx_leases = (uint32_t *)(na->rx_rings + nrx); - } - - /* - * First instance, allocate netmap rings and buffers for this card - * The rings are contiguous, but have variable size. 
- */ - for (i = 0; i < ntx; i++) { /* Transmit rings */ - kring = &na->tx_rings[i]; - ndesc = na->num_tx_desc; - bzero(kring, sizeof(*kring)); + /* transmit rings */ + for (i =0, kring = na->tx_rings; kring != na->rx_rings; kring++, i++) { + if (kring->ring) { + ND("%s %ld already created", kring->name, kring - na->tx_rings); + continue; /* already created by somebody else */ + } + ndesc = kring->nkr_num_slots; len = sizeof(struct netmap_ring) + ndesc * sizeof(struct netmap_slot); ring = netmap_ring_malloc(na->nm_mem, len); if (ring == NULL) { - D("Cannot allocate tx_ring[%d] for %s", i, ifname); + D("Cannot allocate tx_ring"); goto cleanup; } - ND("txring[%d] at %p ofs %d", i, ring); - kring->na = na; + ND("txring at %p", ring); kring->ring = ring; - if (tx_leases) { - kring->nkr_leases = tx_leases; - tx_leases += ndesc; - } - *(uint32_t *)(uintptr_t)&ring->num_slots = kring->nkr_num_slots = ndesc; - *(ssize_t *)(uintptr_t)&ring->buf_ofs = + *(uint32_t *)(uintptr_t)&ring->num_slots = ndesc; + *(int64_t *)(uintptr_t)&ring->buf_ofs = (na->nm_mem->pools[NETMAP_IF_POOL].memtotal + na->nm_mem->pools[NETMAP_RING_POOL].memtotal) - netmap_ring_offset(na->nm_mem, ring); - /* - * IMPORTANT: - * Always keep one slot empty, so we can detect new - * transmissions comparing cur and nr_hwcur (they are - * the same only if there are no new transmissions). - */ - ring->avail = kring->nr_hwavail = ndesc - 1; - ring->cur = kring->nr_hwcur = 0; + /* copy values from kring */ + ring->head = kring->rhead; + ring->cur = kring->rcur; + ring->tail = kring->rtail; *(uint16_t *)(uintptr_t)&ring->nr_buf_size = NETMAP_BDG_BUF_SIZE(na->nm_mem); - ND("initializing slots for txring[%d]", i); - if (netmap_new_bufs(na->nm_mem, nifp, ring->slot, ndesc)) { - D("Cannot allocate buffers for tx_ring[%d] for %s", i, ifname); - goto cleanup; + ND("%s h %d c %d t %d", kring->name, + ring->head, ring->cur, ring->tail); + ND("initializing slots for txring"); + if (i != na->num_tx_rings || (na->na_flags & NAF_HOST_RINGS)) { + /* this is a real ring */ + if (netmap_new_bufs(na->nm_mem, ring->slot, ndesc)) { + D("Cannot allocate buffers for tx_ring"); + goto cleanup; + } + } else { + /* this is a fake tx ring, set all indices to 0 */ + netmap_mem_set_ring(na->nm_mem, ring->slot, ndesc, 0); } } - for (i = 0; i < nrx; i++) { /* Receive rings */ - kring = &na->rx_rings[i]; - ndesc = na->num_rx_desc; - bzero(kring, sizeof(*kring)); + /* receive rings */ + for ( i = 0 /* kring cont'd from above */ ; kring != na->tailroom; kring++, i++) { + if (kring->ring) { + ND("%s %ld already created", kring->name, kring - na->rx_rings); + continue; /* already created by somebody else */ + } + ndesc = kring->nkr_num_slots; len = sizeof(struct netmap_ring) + ndesc * sizeof(struct netmap_slot); ring = netmap_ring_malloc(na->nm_mem, len); if (ring == NULL) { - D("Cannot allocate rx_ring[%d] for %s", i, ifname); + D("Cannot allocate rx_ring"); goto cleanup; } - ND("rxring[%d] at %p ofs %d", i, ring); - - kring->na = na; + ND("rxring at %p", ring); kring->ring = ring; - if (rx_leases && i < na->num_rx_rings) { - kring->nkr_leases = rx_leases; - rx_leases += ndesc; - } - *(uint32_t *)(uintptr_t)&ring->num_slots = kring->nkr_num_slots = ndesc; - *(ssize_t *)(uintptr_t)&ring->buf_ofs = + *(uint32_t *)(uintptr_t)&ring->num_slots = ndesc; + *(int64_t *)(uintptr_t)&ring->buf_ofs = (na->nm_mem->pools[NETMAP_IF_POOL].memtotal + na->nm_mem->pools[NETMAP_RING_POOL].memtotal) - netmap_ring_offset(na->nm_mem, ring); - ring->cur = kring->nr_hwcur = 0; - ring->avail = 
kring->nr_hwavail = 0; /* empty */ + /* copy values from kring */ + ring->head = kring->rhead; + ring->cur = kring->rcur; + ring->tail = kring->rtail; *(int *)(uintptr_t)&ring->nr_buf_size = NETMAP_BDG_BUF_SIZE(na->nm_mem); - ND("initializing slots for rxring[%d]", i); - if (netmap_new_bufs(na->nm_mem, nifp, ring->slot, ndesc)) { - D("Cannot allocate buffers for rx_ring[%d] for %s", i, ifname); - goto cleanup; + ND("%s h %d c %d t %d", kring->name, + ring->head, ring->cur, ring->tail); + ND("initializing slots for rxring %p", ring); + if (i != na->num_rx_rings || (na->na_flags & NAF_HOST_RINGS)) { + /* this is a real ring */ + if (netmap_new_bufs(na->nm_mem, ring->slot, ndesc)) { + D("Cannot allocate buffers for rx_ring"); + goto cleanup; + } + } else { + /* this is a fake rx ring, set all indices to 1 */ + netmap_mem_set_ring(na->nm_mem, ring->slot, ndesc, 1); } } -#ifdef linux - // XXX initialize the selrecord structs. - for (i = 0; i < ntx; i++) - init_waitqueue_head(&na->tx_rings[i].si); - for (i = 0; i < nrx; i++) - init_waitqueue_head(&na->rx_rings[i].si); - init_waitqueue_head(&na->tx_si); - init_waitqueue_head(&na->rx_si); -#endif -final: + + NMA_UNLOCK(na->nm_mem); + + return 0; + +cleanup: + netmap_free_rings(na); + + NMA_UNLOCK(na->nm_mem); + + return ENOMEM; +} + +void +netmap_mem_rings_delete(struct netmap_adapter *na) +{ + /* last instance, release bufs and rings */ + NMA_LOCK(na->nm_mem); + + netmap_free_rings(na); + + NMA_UNLOCK(na->nm_mem); +} + + +/* call with NMA_LOCK held */ +/* + * Allocate the per-fd structure netmap_if. + * + * We assume that the configuration stored in na + * (number of tx/rx rings and descs) does not change while + * the interface is in netmap mode. + */ +struct netmap_if * +netmap_mem_if_new(const char *ifname, struct netmap_adapter *na) +{ + struct netmap_if *nifp; + ssize_t base; /* handy for relative offsets between rings and nifp */ + u_int i, len, ntx, nrx; + + /* account for the (eventually fake) host rings */ + ntx = na->num_tx_rings + 1; + nrx = na->num_rx_rings + 1; /* + * the descriptor is followed inline by an array of offsets + * to the tx and rx rings in the shared memory region. + */ + + NMA_LOCK(na->nm_mem); + + len = sizeof(struct netmap_if) + (nrx + ntx) * sizeof(ssize_t); + nifp = netmap_if_malloc(na->nm_mem, len); + if (nifp == NULL) { + NMA_UNLOCK(na->nm_mem); + return NULL; + } + + /* initialize base fields -- override const */ + *(u_int *)(uintptr_t)&nifp->ni_tx_rings = na->num_tx_rings; + *(u_int *)(uintptr_t)&nifp->ni_rx_rings = na->num_rx_rings; + strncpy(nifp->ni_name, ifname, (size_t)IFNAMSIZ); + + /* * fill the slots for the rx and tx rings. They contain the offset * between the ring and nifp, so the information is usable in * userspace to reach the ring from the nifp. 
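 * (Illustration: a userspace client recovers ring i essentially as
 *
 *	struct netmap_ring *ring =
 *		(struct netmap_ring *)((char *)nifp + nifp->ring_ofs[i]);
 *
 * which is what the NETMAP_TXRING()/NETMAP_RXRING() helpers in
 * net/netmap_user.h do.)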
*/ base = netmap_if_offset(na->nm_mem, nifp); for (i = 0; i < ntx; i++) { *(ssize_t *)(uintptr_t)&nifp->ring_ofs[i] = netmap_ring_offset(na->nm_mem, na->tx_rings[i].ring) - base; } for (i = 0; i < nrx; i++) { *(ssize_t *)(uintptr_t)&nifp->ring_ofs[i+ntx] = netmap_ring_offset(na->nm_mem, na->rx_rings[i].ring) - base; } NMA_UNLOCK(na->nm_mem); return (nifp); -cleanup: - netmap_free_rings(na); - netmap_if_free(na->nm_mem, nifp); - - NMA_UNLOCK(na->nm_mem); - - return NULL; } void netmap_mem_if_delete(struct netmap_adapter *na, struct netmap_if *nifp) { if (nifp == NULL) /* nothing to do */ return; NMA_LOCK(na->nm_mem); - - if (na->refcount <= 0) { - /* last instance, release bufs and rings */ - u_int i, j, lim; - struct netmap_ring *ring; - - for (i = 0; i < na->num_tx_rings + 1; i++) { - ring = na->tx_rings[i].ring; - lim = na->tx_rings[i].nkr_num_slots; - for (j = 0; j < lim; j++) - netmap_free_buf(na->nm_mem, nifp, ring->slot[j].buf_idx); - } - for (i = 0; i < na->num_rx_rings + 1; i++) { - ring = na->rx_rings[i].ring; - lim = na->rx_rings[i].nkr_num_slots; - for (j = 0; j < lim; j++) - netmap_free_buf(na->nm_mem, nifp, ring->slot[j].buf_idx); - } - netmap_free_rings(na); - } + if (nifp->ni_bufs_head) + netmap_extra_free(na, nifp->ni_bufs_head); netmap_if_free(na->nm_mem, nifp); NMA_UNLOCK(na->nm_mem); } static void netmap_mem_global_deref(struct netmap_mem_d *nmd) { NMA_LOCK(nmd); nmd->refcount--; if (netmap_verbose) D("refcount = %d", nmd->refcount); NMA_UNLOCK(nmd); } -int netmap_mem_finalize(struct netmap_mem_d *nmd) +int +netmap_mem_finalize(struct netmap_mem_d *nmd) { return nmd->finalize(nmd); } -void netmap_mem_deref(struct netmap_mem_d *nmd) +void +netmap_mem_deref(struct netmap_mem_d *nmd) { return nmd->deref(nmd); } Index: stable/9/sys/dev/netmap/netmap_mem2.h =================================================================== --- stable/9/sys/dev/netmap/netmap_mem2.h (revision 262152) +++ stable/9/sys/dev/netmap/netmap_mem2.h (revision 262153) @@ -1,216 +1,227 @@ /* - * Copyright (C) 2012-2013 Matteo Landi, Luigi Rizzo, Giuseppe Lettieri. All rights reserved. + * Copyright (C) 2012-2014 Matteo Landi, Luigi Rizzo, Giuseppe Lettieri. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
 */

/*
 * $FreeBSD$
 *
 * (New) memory allocator for netmap
 */

/*
 * This allocator creates three memory pools:
 *	nm_if_pool	for the struct netmap_if
 *	nm_ring_pool	for the struct netmap_ring
 *	nm_buf_pool	for the packet buffers.
 *
 * that contain netmap objects. Each pool is made of a number of clusters,
 * multiple of a page size, each containing an integer number of objects.
 * The clusters are contiguous in user space but not in the kernel.
 * Only nm_buf_pool needs to be DMA-able,
 * but for convenience use the same type of allocator for all.
 *
 * Once mapped, the three pools are exported to userspace
 * as a contiguous block, starting from nm_if_pool. Each
 * cluster (and pool) is an integral number of pages.
 *   [ . . . ][ . . . . . .][ . . . . . . . . . .]
 *    nm_if     nm_ring            nm_buf
 *
 * The userspace areas contain offsets of the objects in userspace.
 * When (at init time) we write these offsets, we find out the index
 * of the object, and from there locate the offset from the beginning
 * of the region.
 *
 * The individual allocators manage a pool of memory for objects of
 * the same size.
 * The pool is split into smaller clusters, whose size is a
 * multiple of the page size. The cluster size is chosen
 * to minimize the waste for a given max cluster size
 * (we do it by brute force, as we have relatively few objects
 * per cluster).
 *
 * Objects are aligned to the cache line (64 bytes) rounding up object
 * sizes when needed. A bitmap contains the state of each object.
 * Allocation scans the bitmap; this is done only on attach, so we are not
 * too worried about performance.
 *
 * For each allocator we can define (through sysctl) the size and
 * number of each object. Memory is allocated at the first use of a
 * netmap file descriptor, and can be freed when all such descriptors
 * have been released (including unmapping the memory).
 * If memory is scarce, the system tries to get as much as possible
 * and the sysctls reflect the actual allocation.
 * Together with the desired values, the sysctls also export absolute
 * minimum and maximum values that cannot be overridden.
 *
 * struct netmap_if:
 *	variable size, max 16 bytes per ring pair plus some fixed amount.
 *	1024 bytes should be large enough in practice.
 *
 *	In the worst case we have one netmap_if per ring in the system.
 *
 * struct netmap_ring
 *	variable size, 8 bytes per slot plus some fixed amount.
 *	Rings can be large (e.g. 4k slots, or >32Kbytes).
 *	We default to 36 KB (9 pages), and a few hundred rings.
 *
 * struct netmap_buffer
 *	The more the better, both because fast interfaces tend to have
 *	many slots, and because we may want to use buffers to store
 *	packets in userspace avoiding copies.
 *	Must contain a full frame (e.g. 1518, or more for vlans, jumbo
 *	frames etc.) plus be nicely aligned, plus some NICs restrict
 *	the size to a multiple of 1K or so. Defaults to 2K.
 */
#ifndef _NET_NETMAP_MEM2_H_
#define _NET_NETMAP_MEM2_H_

#define NETMAP_BUF_MAX_NUM	20*4096*2	/* large machine */

#define NETMAP_POOL_MAX_NAMSZ	32

enum {
	NETMAP_IF_POOL = 0,
	NETMAP_RING_POOL,
	NETMAP_BUF_POOL,
	NETMAP_POOLS_NR
};

struct netmap_obj_params {
	u_int size;
	u_int num;
};

struct netmap_obj_pool {
	char name[NETMAP_POOL_MAX_NAMSZ];	/* name of the allocator */

	/* ---------------------------------------------------*/
	/* these are only meaningful if the pool is finalized */
	/* (see 'finalized' field in netmap_mem_d)            */
	u_int objtotal;		/* actual total number of objects.
 */
	u_int memtotal;		/* actual total memory space */
	u_int numclusters;	/* actual number of clusters */

	u_int objfree;		/* number of free objects. */

	struct lut_entry *lut;	/* virt,phys addresses, objtotal entries */
	uint32_t *bitmap;	/* one bit per buffer, 1 means free */
	uint32_t bitmap_slots;	/* number of uint32 entries in bitmap */
	/* ---------------------------------------------------*/

	/* limits */
	u_int objminsize;	/* minimum object size */
	u_int objmaxsize;	/* maximum object size */
	u_int nummin;		/* minimum number of objects */
	u_int nummax;		/* maximum number of objects */

	/* these are changed only by config */
	u_int _objtotal;	/* total number of objects */
	u_int _objsize;		/* object size */
	u_int _clustsize;	/* cluster size */
	u_int _clustentries;	/* objects per cluster */
	u_int _numclusters;	/* number of clusters */

	/* requested values */
	u_int r_objtotal;
	u_int r_objsize;
};

#ifdef linux
// XXX a mtx would suffice here 20130415 lr
#define NMA_LOCK_T		struct semaphore
#else /* !linux */
#define NMA_LOCK_T		struct mtx
#endif /* linux */

typedef int (*netmap_mem_config_t)(struct netmap_mem_d*);
typedef int (*netmap_mem_finalize_t)(struct netmap_mem_d*);
typedef void (*netmap_mem_deref_t)(struct netmap_mem_d*);

+typedef uint16_t nm_memid_t;

/* We implement two kinds of netmap_mem_d structures:
 *
 * - global: used by hardware NICs;
 *
 * - private: used by VALE ports.
 *
 * In both cases, the netmap_mem_d structure has the same lifetime as the
 * netmap_adapter of the corresponding NIC or port. It is the responsibility of
 * the client code to delete the private allocator when the associated
 * netmap_adapter is freed (this is implemented by the NAF_MEM_OWNER flag in
 * netmap.c). The 'refcount' field counts the number of active users of the
 * structure. The global allocator uses this information to prevent/allow
 * reconfiguration. The private allocators release all their memory when there
 * are no active users. By 'active user' we mean an existing netmap_priv
 * structure holding a reference to the allocator.
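 *
 * A minimal usage sketch for a private allocator, using only the
 * functions declared below (error handling omitted; variable names
 * are illustrative):
 *
 *	int error;
 *	struct netmap_mem_d *nmd = netmap_mem_private_new(name,
 *		txr, txd, rxr, rxd, extra_bufs, npipes, &error);
 *	na->nm_mem = nmd;		(the adapter holds a reference)
 *	...
 *	netmap_mem_private_delete(nmd);	(on adapter teardown)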
*/ struct netmap_mem_d { NMA_LOCK_T nm_mtx; /* protect the allocator */ u_int nm_totalsize; /* shorthand */ u_int flags; #define NETMAP_MEM_FINALIZED 0x1 /* preallocation done */ #define NETMAP_MEM_PRIVATE 0x2 /* uses private address space */ int lasterr; /* last error for curr config */ int refcount; /* existing priv structures */ /* the three allocators */ struct netmap_obj_pool pools[NETMAP_POOLS_NR]; - netmap_mem_config_t config; + netmap_mem_config_t config; netmap_mem_finalize_t finalize; netmap_mem_deref_t deref; + + nm_memid_t nm_id; /* allocator identifier */ + + /* list of all existing allocators, sorted by nm_id */ + struct netmap_mem_d *prev, *next; }; extern struct netmap_mem_d nm_mem; vm_paddr_t netmap_mem_ofstophys(struct netmap_mem_d *, vm_ooffset_t); int netmap_mem_finalize(struct netmap_mem_d *); int netmap_mem_init(void); void netmap_mem_fini(void); -struct netmap_if * netmap_mem_if_new(const char *, struct netmap_adapter *); -void netmap_mem_if_delete(struct netmap_adapter *na, struct netmap_if *nifp); +struct netmap_if * + netmap_mem_if_new(const char *, struct netmap_adapter *); +void netmap_mem_if_delete(struct netmap_adapter *, struct netmap_if *); +int netmap_mem_rings_create(struct netmap_adapter *); +void netmap_mem_rings_delete(struct netmap_adapter *); void netmap_mem_deref(struct netmap_mem_d *); -int netmap_mem_get_info(struct netmap_mem_d *nm_mem, u_int *size, u_int *memflags); -ssize_t netmap_mem_if_offset(struct netmap_mem_d *nm_mem, const void *vaddr); -struct netmap_mem_d* - netmap_mem_private_new(const char *name, u_int txr, u_int txd, u_int rxr, u_int rxd); -void netmap_mem_private_delete(struct netmap_mem_d *nm_mem); +int netmap_mem_get_info(struct netmap_mem_d *, u_int *size, u_int *memflags, uint16_t *id); +ssize_t netmap_mem_if_offset(struct netmap_mem_d *, const void *vaddr); +struct netmap_mem_d* netmap_mem_private_new(const char *name, + u_int txr, u_int txd, u_int rxr, u_int rxd, u_int extra_bufs, u_int npipes, + int* error); +void netmap_mem_private_delete(struct netmap_mem_d *); #define NETMAP_BDG_BUF_SIZE(n) ((n)->pools[NETMAP_BUF_POOL]._objsize) +uint32_t netmap_extra_alloc(struct netmap_adapter *, uint32_t *, uint32_t n); #endif Property changes on: stable/9/sys/dev/netmap/netmap_mem2.h ___________________________________________________________________ Deleted: svn:eol-style ## -1 +0,0 ## -native \ No newline at end of property Deleted: svn:mime-type ## -1 +0,0 ## -text/plain \ No newline at end of property Index: stable/9/sys/dev/netmap/netmap_offloadings.c =================================================================== --- stable/9/sys/dev/netmap/netmap_offloadings.c (nonexistent) +++ stable/9/sys/dev/netmap/netmap_offloadings.c (revision 262153) @@ -0,0 +1,401 @@ +/* + * Copyright (C) 2014 Vincenzo Maffione. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/* $FreeBSD$ */
+
+#if defined(__FreeBSD__)
+#include /* prerequisite */
+
+#include
+#include
+#include /* defines used in kernel.h */
+#include /* types used in module initialization */
+#include
+#include /* struct socket */
+#include /* sockaddrs */
+#include
+#include
+#include /* bus_dmamap_* */
+#include
+
+#elif defined(linux)
+
+#include "bsd_glue.h"
+
+#elif defined(__APPLE__)
+
+#warning OSX support is only partial
+#include "osx_glue.h"
+
+#else
+
+#error Unsupported platform
+
+#endif /* unsupported */
+
+#include
+#include
+
+
+
+/* This routine is called by bdg_mismatch_datapath() when it finishes
+ * accumulating bytes for a segment, in order to fix some fields in the
+ * segment headers (which still contain the same content as the header
+ * of the original GSO packet). 'buf' points to the beginning (e.g.
+ * the ethernet header) of the segment, and 'len' is its length.
+ */
+static void gso_fix_segment(uint8_t *buf, size_t len, u_int idx,
+			    u_int segmented_bytes, u_int last_segment,
+			    u_int tcp, u_int iphlen)
+{
+	struct nm_iphdr *iph = (struct nm_iphdr *)(buf + 14);
+	struct nm_ipv6hdr *ip6h = (struct nm_ipv6hdr *)(buf + 14);
+	uint16_t *check = NULL;
+	uint8_t *check_data = NULL;
+
+	if (iphlen == 20) {
+		/* Set the IPv4 "Total Length" field. */
+		iph->tot_len = htobe16(len-14);
+		ND("ip total length %u", be16toh(iph->tot_len));
+
+		/* Set the IPv4 "Identification" field. */
+		iph->id = htobe16(be16toh(iph->id) + idx);
+		ND("ip identification %u", be16toh(iph->id));
+
+		/* Compute and insert the IPv4 header checksum. */
+		iph->check = 0;
+		iph->check = nm_csum_ipv4(iph);
+		ND("IP csum %x", be16toh(iph->check));
+	} else { /* if (iphlen == 40) */
+		/* Set the IPv6 "Payload Len" field. */
+		ip6h->payload_len = htobe16(len-14-iphlen);
+	}
+
+	if (tcp) {
+		struct nm_tcphdr *tcph = (struct nm_tcphdr *)(buf + 14 + iphlen);
+
+		/* Set the TCP sequence number. */
+		tcph->seq = htobe32(be32toh(tcph->seq) + segmented_bytes);
+		ND("tcp seq %u", be32toh(tcph->seq));
+
+		/* Zero the PSH and FIN TCP flags if this is not the last
+		   segment. */
+		if (!last_segment)
+			tcph->flags &= ~(0x8 | 0x1);
+		ND("last_segment %u", last_segment);
+
+		check = &tcph->check;
+		check_data = (uint8_t *)tcph;
+	} else { /* UDP */
+		struct nm_udphdr *udph = (struct nm_udphdr *)(buf + 14 + iphlen);
+
+		/* Set the UDP 'Length' field. */
+		udph->len = htobe16(len-14-iphlen);
+
+		check = &udph->check;
+		check_data = (uint8_t *)udph;
+	}
+
+	/* Compute and insert TCP/UDP checksum.
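+	 * The checksum field is zeroed first so that it does not
+	 * contribute to the sum; nm_csum_tcpudp_ipv{4,6} also cover
+	 * the pseudo-header for the corresponding IP version.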
+	 */
+	*check = 0;
+	if (iphlen == 20)
+		nm_csum_tcpudp_ipv4(iph, check_data, len-14-iphlen, check);
+	else
+		nm_csum_tcpudp_ipv6(ip6h, check_data, len-14-iphlen, check);
+
+	ND("TCP/UDP csum %x", be16toh(*check));
+}
+
+
+/* The VALE mismatch datapath implementation. */
+void bdg_mismatch_datapath(struct netmap_vp_adapter *na,
+			   struct netmap_vp_adapter *dst_na,
+			   struct nm_bdg_fwd *ft_p, struct netmap_ring *ring,
+			   u_int *j, u_int lim, u_int *howmany)
+{
+	struct netmap_slot *slot = NULL;
+	struct nm_vnet_hdr *vh = NULL;
+	/* Number of source slots to process. */
+	u_int frags = ft_p->ft_frags;
+	struct nm_bdg_fwd *ft_end = ft_p + frags;
+
+	/* Source and destination pointers. */
+	uint8_t *dst, *src;
+	size_t src_len, dst_len;
+
+	u_int j_start = *j;
+	u_int dst_slots = 0;
+
+	/* If the source port uses offloadings while the destination does
+	 * not, we grab the source virtio-net header and do the offloadings
+	 * here.
+	 */
+	if (na->virt_hdr_len && !dst_na->virt_hdr_len) {
+		vh = (struct nm_vnet_hdr *)ft_p->ft_buf;
+	}
+
+	/* Init source and dest pointers. */
+	src = ft_p->ft_buf;
+	src_len = ft_p->ft_len;
+	slot = &ring->slot[*j];
+	dst = BDG_NMB(&dst_na->up, slot);
+	dst_len = src_len;
+
+	/* We are processing the first input slot and there is a mismatch
+	 * between source and destination virt_hdr_len (SHL and DHL).
+	 * When a client is using virtio-net headers, the header length
+	 * can be:
+	 *    - 10: the header corresponds to the struct nm_vnet_hdr
+	 *    - 12: the first 10 bytes correspond to the struct
+	 *          virtio_net_hdr, and the last 2 bytes store the
+	 *          "mergeable buffers" info, which is an optional
+	 *          hint that can be zeroed for compatibility
+	 *
+	 * The destination header is therefore built according to the
+	 * following table:
+	 *
+	 * SHL | DHL | destination header
+	 * -----------------------------
+	 *   0 |  10 | zero
+	 *   0 |  12 | zero
+	 *  10 |   0 | doesn't exist
+	 *  10 |  12 | first 10 bytes are copied from source header, last 2 are zero
+	 *  12 |   0 | doesn't exist
+	 *  12 |  10 | copied from the first 10 bytes of source header
+	 */
+	bzero(dst, dst_na->virt_hdr_len);
+	if (na->virt_hdr_len && dst_na->virt_hdr_len)
+		memcpy(dst, src, sizeof(struct nm_vnet_hdr));
+	/* Skip the virtio-net headers. */
+	src += na->virt_hdr_len;
+	src_len -= na->virt_hdr_len;
+	dst += dst_na->virt_hdr_len;
+	dst_len = dst_na->virt_hdr_len + src_len;
+
+	/* Here we could have dst_len == 0 (which implies src_len == 0),
+	 * so we avoid passing a zero-length fragment.
+	 */
+	if (dst_len == 0) {
+		ft_p++;
+		src = ft_p->ft_buf;
+		src_len = ft_p->ft_len;
+		dst_len = src_len;
+	}
+
+	if (vh && vh->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
+		u_int gso_bytes = 0;
+		/* Length of the GSO packet header. */
+		u_int gso_hdr_len = 0;
+		/* Pointer to the GSO packet header. Assume it is in a single fragment. */
+		uint8_t *gso_hdr = NULL;
+		/* Index of the current segment. */
+		u_int gso_idx = 0;
+		/* Payload data bytes segmented so far (e.g. TCP data bytes). */
+		u_int segmented_bytes = 0;
+		/* Length of the IP header (20 if IPv4, 40 if IPv6). */
+		u_int iphlen = 0;
+		/* Is this a TCP or a UDP GSO packet? */
+		u_int tcp = ((vh->gso_type & ~VIRTIO_NET_HDR_GSO_ECN)
+				== VIRTIO_NET_HDR_GSO_UDP) ? 0 : 1;
+
+		/* Segment the GSO packet contained in the input slots (frags). */
+		while (ft_p != ft_end) {
+			size_t copy;
+
+			/* Grab the GSO header if we don't have it. */
+			if (!gso_hdr) {
+				uint16_t ethertype;
+
+				gso_hdr = src;
+
+				/* Look at the 'Ethertype' field to see if this packet
+				 * is IPv4 or IPv6.
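+				 * (0x0800 is the IPv4 Ethertype; anything
+				 * else is assumed to be IPv6, 0x86DD.)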
+				 */
+				ethertype = be16toh(*((uint16_t *)(gso_hdr + 12)));
+				if (ethertype == 0x0800)
+					iphlen = 20;
+				else /* if (ethertype == 0x86DD) */
+					iphlen = 40;
+				ND(3, "type=%04x", ethertype);
+
+				/* Compute gso_hdr_len. For TCP we need to read the
+				 * content of the 'Data Offset' field.
+				 */
+				if (tcp) {
+					struct nm_tcphdr *tcph =
+						(struct nm_tcphdr *)&gso_hdr[14+iphlen];
+
+					gso_hdr_len = 14 + iphlen + 4*(tcph->doff >> 4);
+				} else
+					gso_hdr_len = 14 + iphlen + 8; /* UDP */
+
+				ND(3, "gso_hdr_len %u gso_mtu %d", gso_hdr_len,
+				   dst_na->mfs);
+
+				/* Advance source pointers. */
+				src += gso_hdr_len;
+				src_len -= gso_hdr_len;
+				if (src_len == 0) {
+					ft_p++;
+					if (ft_p == ft_end)
+						break;
+					src = ft_p->ft_buf;
+					src_len = ft_p->ft_len;
+					continue;
+				}
+			}
+
+			/* Fill in the header of the current segment. */
+			if (gso_bytes == 0) {
+				memcpy(dst, gso_hdr, gso_hdr_len);
+				gso_bytes = gso_hdr_len;
+			}
+
+			/* Fill in data and update source and dest pointers. */
+			copy = src_len;
+			if (gso_bytes + copy > dst_na->mfs)
+				copy = dst_na->mfs - gso_bytes;
+			memcpy(dst + gso_bytes, src, copy);
+			gso_bytes += copy;
+			src += copy;
+			src_len -= copy;
+
+			/* A segment is complete or we have processed all the
+			   GSO payload bytes. */
+			if (gso_bytes >= dst_na->mfs ||
+				(src_len == 0 && ft_p + 1 == ft_end)) {
+				/* After raw segmentation, we must fix some header
+				 * fields and compute checksums, in a protocol-dependent
+				 * way. */
+				gso_fix_segment(dst, gso_bytes, gso_idx,
+						segmented_bytes,
+						src_len == 0 && ft_p + 1 == ft_end,
+						tcp, iphlen);
+
+				ND("frame %u completed with %d bytes", gso_idx, (int)gso_bytes);
+				slot->len = gso_bytes;
+				slot->flags = 0;
+				segmented_bytes += gso_bytes - gso_hdr_len;
+
+				dst_slots++;
+
+				/* Next destination slot. */
+				*j = nm_next(*j, lim);
+				slot = &ring->slot[*j];
+				dst = BDG_NMB(&dst_na->up, slot);
+
+				gso_bytes = 0;
+				gso_idx++;
+			}
+
+			/* Next input slot. */
+			if (src_len == 0) {
+				ft_p++;
+				if (ft_p == ft_end)
+					break;
+				src = ft_p->ft_buf;
+				src_len = ft_p->ft_len;
+			}
+		}
+		ND(3, "%d bytes segmented", segmented_bytes);
+
+	} else {
+		/* Address of a checksum field into a destination slot. */
+		uint16_t *check = NULL;
+		/* Accumulator for an unfolded checksum. */
+		rawsum_t csum = 0;
+
+		/* Process a non-GSO packet. */
+
+		/* Init 'check' if necessary. */
+		if (vh && (vh->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM)) {
+			if (unlikely(vh->csum_offset + vh->csum_start > src_len))
+				D("invalid checksum request");
+			else
+				check = (uint16_t *)(dst + vh->csum_start +
+						vh->csum_offset);
+		}
+
+		while (ft_p != ft_end) {
+			/* Init/update the packet checksum if needed. */
+			if (vh && (vh->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM)) {
+				if (!dst_slots)
+					csum = nm_csum_raw(src + vh->csum_start,
+							src_len - vh->csum_start, 0);
+				else
+					csum = nm_csum_raw(src, src_len, csum);
+			}
+
+			/* Round to a multiple of 64 */
+			src_len = (src_len + 63) & ~63;
+
+			if (ft_p->ft_flags & NS_INDIRECT) {
+				if (copyin(src, dst, src_len)) {
+					/* Invalid user pointer, pretend len is 0. */
+					dst_len = 0;
+				}
+			} else {
+				memcpy(dst, src, (int)src_len);
+			}
+			slot->len = dst_len;
+
+			dst_slots++;
+
+			/* Next destination slot. */
+			*j = nm_next(*j, lim);
+			slot = &ring->slot[*j];
+			dst = BDG_NMB(&dst_na->up, slot);
+
+			/* Next source slot. */
+			ft_p++;
+			src = ft_p->ft_buf;
+			dst_len = src_len = ft_p->ft_len;
+
+		}
+
+		/* Finalize (fold) the checksum if needed.
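+		 * Folding reduces the 32-bit accumulator built by
+		 * nm_csum_raw() to the final 16-bit one's-complement
+		 * value expected in the packet.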
+		 */
+		if (check && vh && (vh->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM)) {
+			*check = nm_csum_fold(csum);
+		}
+		ND(3, "using %u dst_slots", dst_slots);
+
+		/* A second pass on the destination slots to set the slot flags,
+		 * using the right number of destination slots.
+		 */
+		while (j_start != *j) {
+			slot = &ring->slot[j_start];
+			slot->flags = (dst_slots << 8) | NS_MOREFRAG;
+			j_start = nm_next(j_start, lim);
+		}
+		/* Clear NS_MOREFRAG flag on last entry. */
+		slot->flags = (dst_slots << 8);
+	}
+
+	/* Update howmany. */
+	if (unlikely(dst_slots > *howmany)) {
+		dst_slots = *howmany;
+		D("Slot allocation error: Should never happen");
+	}
+	*howmany -= dst_slots;
+}
Property changes on: stable/9/sys/dev/netmap/netmap_offloadings.c
___________________________________________________________________
Added: svn:keywords
## -0,0 +1 ##
+FreeBSD=%H
\ No newline at end of property
Index: stable/9/sys/dev/netmap/netmap_pipe.c
===================================================================
--- stable/9/sys/dev/netmap/netmap_pipe.c	(nonexistent)
+++ stable/9/sys/dev/netmap/netmap_pipe.c	(revision 262153)
@@ -0,0 +1,711 @@
+/*
+ * Copyright (C) 2014 Giuseppe Lettieri. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/* $FreeBSD$ */
+
+#if defined(__FreeBSD__)
+#include /* prerequisite */
+
+#include
+#include
+#include /* defines used in kernel.h */
+#include /* types used in module initialization */
+#include
+#include
+#include
+#include
+#include
+#include
+#include /* sockaddrs */
+#include
+#include
+#include /* bus_dmamap_* */
+#include
+
+
+#elif defined(linux)
+
+#include "bsd_glue.h"
+
+#elif defined(__APPLE__)
+
+#warning OSX support is only partial
+#include "osx_glue.h"
+
+#else
+
+#error Unsupported platform
+
+#endif /* unsupported */
+
+/*
+ * common headers
+ */
+
+#include
+#include
+#include
+
+#ifdef WITH_PIPES
+
+#define NM_PIPE_MAXSLOTS 4096
+
+int netmap_default_pipes = 0; /* default number of pipes for each nic */
+SYSCTL_DECL(_dev_netmap);
+SYSCTL_INT(_dev_netmap, OID_AUTO, default_pipes, CTLFLAG_RW, &netmap_default_pipes, 0 , "");
+
+/* allocate the pipe array in the parent adapter */
+int
+netmap_pipe_alloc(struct netmap_adapter *na, struct nmreq *nmr)
+{
+	size_t len;
+	int mode = nmr->nr_flags & NR_REG_MASK;
+	u_int npipes;
+
+	if (mode == NR_REG_PIPE_MASTER || mode == NR_REG_PIPE_SLAVE) {
+		/* this is for our parent, not for us */
+		return 0;
+	}
+
+	/* TODO: we can resize the array if the new
+	 * request can accommodate the already existing pipes
+	 */
+	if (na->na_pipes) {
+		nmr->nr_arg1 = na->na_max_pipes;
+		return 0;
+	}
+
+	npipes = nmr->nr_arg1;
+	if (npipes == 0)
+		npipes = netmap_default_pipes;
+	nm_bound_var(&npipes, 0, 0, NM_MAXPIPES, NULL);
+
+	if (npipes == 0) {
+		/* really zero, nothing to alloc */
+		goto out;
+	}
+
+	len = sizeof(struct netmap_pipe_adapter *) * npipes;
+	na->na_pipes = malloc(len, M_DEVBUF, M_NOWAIT | M_ZERO);
+	if (na->na_pipes == NULL)
+		return ENOMEM;
+
+	na->na_max_pipes = npipes;
+	na->na_next_pipe = 0;
+
+out:
+	nmr->nr_arg1 = npipes;
+
+	return 0;
+}
+
+/* deallocate the parent array in the parent adapter */
+void
+netmap_pipe_dealloc(struct netmap_adapter *na)
+{
+	if (na->na_pipes) {
+		ND("freeing pipes for %s", NM_IFPNAME(na->ifp));
+		free(na->na_pipes, M_DEVBUF);
+		na->na_pipes = NULL;
+		na->na_max_pipes = 0;
+		na->na_next_pipe = 0;
+	}
+}
+
+/* find a pipe endpoint with the given id among the parent's pipes */
+static struct netmap_pipe_adapter *
+netmap_pipe_find(struct netmap_adapter *parent, u_int pipe_id)
+{
+	int i;
+	struct netmap_pipe_adapter *na;
+
+	for (i = 0; i < parent->na_next_pipe; i++) {
+		na = parent->na_pipes[i];
+		if (na->id == pipe_id) {
+			return na;
+		}
+	}
+	return NULL;
+}
+
+/* add a new pipe endpoint to the parent array */
+static int
+netmap_pipe_add(struct netmap_adapter *parent, struct netmap_pipe_adapter *na)
+{
+	if (parent->na_next_pipe >= parent->na_max_pipes) {
+		D("%s: no space left for pipes", NM_IFPNAME(parent->ifp));
+		return ENOMEM;
+	}
+
+	parent->na_pipes[parent->na_next_pipe] = na;
+	na->parent_slot = parent->na_next_pipe;
+	parent->na_next_pipe++;
+	return 0;
+}
+
+/* remove the given pipe endpoint from the parent array */
+static void
+netmap_pipe_remove(struct netmap_adapter *parent, struct netmap_pipe_adapter *na)
+{
+	u_int n;
+	n = --parent->na_next_pipe;
+	if (n != na->parent_slot) {
+		parent->na_pipes[na->parent_slot] =
+			parent->na_pipes[n];
+	}
+	parent->na_pipes[n] = NULL;
+}
+
+static int
+netmap_pipe_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
+{
+	struct netmap_kring *txkring = na->tx_rings + ring_nr,
+		*rxkring = txkring->pipe;
+	u_int limit; /* slots to transfer */
+	u_int j, k, lim_tx = txkring->nkr_num_slots - 1,
+		lim_rx = rxkring->nkr_num_slots - 1;
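+	/* m first counts the newly submitted slots, then the free
+	 * space on the destination; busy counts slots not yet released
+	 * by the receiver. Both wrap modulo the ring size.
+	 */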
+	int m, busy;
+
+	ND("%p: %s %x -> %s", txkring, txkring->name, flags, rxkring->name);
+	ND(2, "before: hwcur %d hwtail %d cur %d head %d tail %d", txkring->nr_hwcur, txkring->nr_hwtail,
+		txkring->rcur, txkring->rhead, txkring->rtail);
+
+	j = rxkring->nr_hwtail; /* RX */
+	k = txkring->nr_hwcur;  /* TX */
+	m = txkring->rhead - txkring->nr_hwcur; /* new slots */
+	if (m < 0)
+		m += txkring->nkr_num_slots;
+	limit = m;
+	m = rxkring->nkr_num_slots - 1; /* max avail space on destination */
+	busy = j - rxkring->nr_hwcur; /* busy slots */
+	if (busy < 0)
+		busy += txkring->nkr_num_slots;
+	m -= busy; /* subtract busy slots */
+	ND(2, "m %d limit %d", m, limit);
+	if (m < limit)
+		limit = m;
+
+	if (limit == 0) {
+		/* either the rxring is full, or nothing to send */
+		nm_txsync_finalize(txkring); /* actually useless */
+		return 0;
+	}
+
+	while (limit-- > 0) {
+		struct netmap_slot *rs = &rxkring->save_ring->slot[j];
+		struct netmap_slot *ts = &txkring->ring->slot[k];
+		struct netmap_slot tmp;
+
+		/* swap the slots */
+		tmp = *rs;
+		*rs = *ts;
+		*ts = tmp;
+
+		/* no need to report the buffer change */
+
+		j = nm_next(j, lim_rx);
+		k = nm_next(k, lim_tx);
+	}
+
+	wmb(); /* make sure the slots are updated before publishing them */
+	rxkring->nr_hwtail = j;
+	txkring->nr_hwcur = k;
+	txkring->nr_hwtail = nm_prev(k, lim_tx);
+
+	nm_txsync_finalize(txkring);
+	ND(2, "after: hwcur %d hwtail %d cur %d head %d tail %d j %d", txkring->nr_hwcur, txkring->nr_hwtail,
+		txkring->rcur, txkring->rhead, txkring->rtail, j);
+
+	wmb(); /* make sure rxkring->nr_hwtail is updated before notifying */
+	rxkring->na->nm_notify(rxkring->na, rxkring->ring_id, NR_RX, 0);
+
+	return 0;
+}
+
+static int
+netmap_pipe_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
+{
+	struct netmap_kring *rxkring = na->rx_rings + ring_nr,
+		*txkring = rxkring->pipe;
+	uint32_t oldhwcur = rxkring->nr_hwcur;
+
+	ND("%s %x <- %s", rxkring->name, flags, txkring->name);
+	rxkring->nr_hwcur = rxkring->rhead; /* recover user-released slots */
+	ND(5, "hwcur %d hwtail %d cur %d head %d tail %d", rxkring->nr_hwcur, rxkring->nr_hwtail,
+		rxkring->rcur, rxkring->rhead, rxkring->rtail);
+	rmb(); /* paired with the first wmb() in txsync */
+	nm_rxsync_finalize(rxkring);
+
+	if (oldhwcur != rxkring->nr_hwcur) {
+		/* we have released some slots, notify the other end */
+		wmb(); /* make sure nr_hwcur is updated before notifying */
+		txkring->na->nm_notify(txkring->na, txkring->ring_id, NR_TX, 0);
+	}
+	return 0;
+}
+
+/* Pipe endpoints are created and destroyed together, so that endpoints do not
+ * have to check for the existence of their peer at each ?xsync.
+ *
+ * To play well with the existing netmap infrastructure (refcounts etc.), we
+ * adopt the following strategy:
+ *
+ * 1) The first endpoint that is created also creates the other endpoint and
+ * grabs a reference to it.
+ *
+ *    state A)  user1 --> endpoint1 --> endpoint2
+ *
+ * 2) If, starting from state A, endpoint2 is then registered, endpoint1 gives
+ * its reference to the user:
+ *
+ *    state B)  user1 --> endpoint1     endpoint2 <--- user2
+ *
+ * 3) Assume that, starting from state B endpoint2 is closed. In the unregister
+ * callback endpoint2 notes that endpoint1 is still active and adds a reference
+ * from endpoint1 to itself. When user2 then releases her own reference,
+ * endpoint2 is not destroyed and we are back to state A. A symmetrical state
+ * would be reached if endpoint1 were released instead.
+ *
+ * 4) If, starting from state A, endpoint1 is closed, the destructor notes that
+ * it owns a reference to endpoint2 and releases it.
+ *
+ * Something similar goes on for the creation and destruction of the krings.
+ */
+
+
+/* netmap_pipe_krings_create.
+ *
+ * There are two cases:
+ *
+ * 1) state is
+ *
+ *        usr1 --> e1 --> e2
+ *
+ *    and we are e1. We have to create both sets
+ *    of krings.
+ *
+ * 2) state is
+ *
+ *        usr1 --> e1 --> e2
+ *
+ *    and we are e2. e1 is certainly registered and our
+ *    krings already exist, but they may be hidden.
+ */
+static int
+netmap_pipe_krings_create(struct netmap_adapter *na)
+{
+	struct netmap_pipe_adapter *pna =
+		(struct netmap_pipe_adapter *)na;
+	struct netmap_adapter *ona = &pna->peer->up;
+	int error = 0;
+	if (pna->peer_ref) {
+		int i;
+
+		/* case 1) above */
+		D("%p: case 1, create everything", na);
+		error = netmap_krings_create(na, 0);
+		if (error)
+			goto err;
+
+		/* we also create all the rings, since we need to
+		 * update the save_ring pointers.
+		 * netmap_mem_rings_create (called by our caller)
+		 * will not create the rings again
+		 */
+
+		error = netmap_mem_rings_create(na);
+		if (error)
+			goto del_krings1;
+
+		/* update our hidden ring pointers */
+		for (i = 0; i < na->num_tx_rings + 1; i++)
+			na->tx_rings[i].save_ring = na->tx_rings[i].ring;
+		for (i = 0; i < na->num_rx_rings + 1; i++)
+			na->rx_rings[i].save_ring = na->rx_rings[i].ring;
+
+		/* now, create krings and rings of the other end */
+		error = netmap_krings_create(ona, 0);
+		if (error)
+			goto del_rings1;
+
+		error = netmap_mem_rings_create(ona);
+		if (error)
+			goto del_krings2;
+
+		for (i = 0; i < ona->num_tx_rings + 1; i++)
+			ona->tx_rings[i].save_ring = ona->tx_rings[i].ring;
+		for (i = 0; i < ona->num_rx_rings + 1; i++)
+			ona->rx_rings[i].save_ring = ona->rx_rings[i].ring;
+
+		/* cross link the krings */
+		for (i = 0; i < na->num_tx_rings; i++) {
+			na->tx_rings[i].pipe = pna->peer->up.rx_rings + i;
+			na->rx_rings[i].pipe = pna->peer->up.tx_rings + i;
+			pna->peer->up.tx_rings[i].pipe = na->rx_rings + i;
+			pna->peer->up.rx_rings[i].pipe = na->tx_rings + i;
+		}
+	} else {
+		int i;
+		/* case 2) above */
+		/* recover the hidden rings */
+		ND("%p: case 2, hidden rings", na);
+		for (i = 0; i < na->num_tx_rings + 1; i++)
+			na->tx_rings[i].ring = na->tx_rings[i].save_ring;
+		for (i = 0; i < na->num_rx_rings + 1; i++)
+			na->rx_rings[i].ring = na->rx_rings[i].save_ring;
+	}
+	return 0;
+
+del_krings2:
+	netmap_krings_delete(ona);
+del_rings1:
+	netmap_mem_rings_delete(na);
+del_krings1:
+	netmap_krings_delete(na);
+err:
+	return error;
+}
+
+/* netmap_pipe_reg.
+ *
+ * There are two cases on registration (onoff==1)
+ *
+ * 1.a) state is
+ *
+ *        usr1 --> e1 --> e2
+ *
+ *      and we are e1. Nothing special to do.
+ *
+ * 1.b) state is
+ *
+ *        usr1 --> e1 --> e2 <-- usr2
+ *
+ *      and we are e2. Drop the ref e1 is holding.
+ *
+ * There are two additional cases on unregister (onoff==0)
+ *
+ * 2.a) state is
+ *
+ *        usr1 --> e1 --> e2
+ *
+ *      and we are e1. Nothing special to do, e2 will
+ *      be cleaned up by the destructor of e1.
+ *
+ * 2.b) state is
+ *
+ *        usr1 --> e1     e2 <-- usr2
+ *
+ *      and we are either e1 or e2. Add a ref from the
+ *      other end and hide our rings.
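+ *
+ * (In every state exactly one mechanism keeps an otherwise
+ * unregistered endpoint alive: either a user registration or the
+ * peer_ref held by the other endpoint.)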
+ */
+static int
+netmap_pipe_reg(struct netmap_adapter *na, int onoff)
+{
+	struct netmap_pipe_adapter *pna =
+		(struct netmap_pipe_adapter *)na;
+	struct ifnet *ifp = na->ifp;
+	ND("%p: onoff %d", na, onoff);
+	if (onoff) {
+		ifp->if_capenable |= IFCAP_NETMAP;
+	} else {
+		ifp->if_capenable &= ~IFCAP_NETMAP;
+	}
+	if (pna->peer_ref) {
+		ND("%p: case 1.a or 2.a, nothing to do", na);
+		return 0;
+	}
+	if (onoff) {
+		ND("%p: case 1.b, drop peer", na);
+		pna->peer->peer_ref = 0;
+		netmap_adapter_put(na);
+	} else {
+		int i;
+		ND("%p: case 2.b, grab peer", na);
+		netmap_adapter_get(na);
+		pna->peer->peer_ref = 1;
+		/* hide our rings from netmap_mem_rings_delete */
+		for (i = 0; i < na->num_tx_rings + 1; i++) {
+			na->tx_rings[i].ring = NULL;
+		}
+		for (i = 0; i < na->num_rx_rings + 1; i++) {
+			na->rx_rings[i].ring = NULL;
+		}
+	}
+	return 0;
+}
+
+/* netmap_pipe_krings_delete.
+ *
+ * There are two cases:
+ *
+ * 1) state is
+ *
+ *        usr1 --> e1 --> e2
+ *
+ *    and we are e1 (e2 is not registered, so krings_delete cannot be
+ *    called on it);
+ *
+ * 2) state is
+ *
+ *        usr1 --> e1     e2 <-- usr2
+ *
+ *    and we are either e1 or e2.
+ *
+ * In the former case we have to also delete the krings of e2;
+ * in the latter case we do nothing (note that our krings
+ * have already been hidden in the unregister callback).
+ */
+static void
+netmap_pipe_krings_delete(struct netmap_adapter *na)
+{
+	struct netmap_pipe_adapter *pna =
+		(struct netmap_pipe_adapter *)na;
+	struct netmap_adapter *ona; /* na of the other end */
+	int i;
+
+	if (!pna->peer_ref) {
+		ND("%p: case 2, kept alive by peer", na);
+		return;
+	}
+	/* case 1) above */
+	ND("%p: case 1, deleting everything", na);
+	netmap_krings_delete(na); /* also zeroes tx_rings etc. */
+	/* restore the ring to be deleted on the peer */
+	ona = &pna->peer->up;
+	if (ona->tx_rings == NULL) {
+		/* already deleted, we must be on a
+		 * cleanup-after-error path */
+		return;
+	}
+	for (i = 0; i < ona->num_tx_rings + 1; i++)
+		ona->tx_rings[i].ring = ona->tx_rings[i].save_ring;
+	for (i = 0; i < ona->num_rx_rings + 1; i++)
+		ona->rx_rings[i].ring = ona->rx_rings[i].save_ring;
+	netmap_mem_rings_delete(ona);
+	netmap_krings_delete(ona);
+}
+
+
+static void
+netmap_pipe_dtor(struct netmap_adapter *na)
+{
+	struct netmap_pipe_adapter *pna =
+		(struct netmap_pipe_adapter *)na;
+	ND("%p", na);
+	if (pna->peer_ref) {
+		ND("%p: clean up peer", na);
+		pna->peer_ref = 0;
+		netmap_adapter_put(&pna->peer->up);
+	}
+	if (pna->role == NR_REG_PIPE_MASTER)
+		netmap_pipe_remove(pna->parent, pna);
+	netmap_adapter_put(pna->parent);
+	free(na->ifp, M_DEVBUF);
+	na->ifp = NULL;
+	pna->parent = NULL;
+}
+
+int
+netmap_get_pipe_na(struct nmreq *nmr, struct netmap_adapter **na, int create)
+{
+	struct nmreq pnmr;
+	struct netmap_adapter *pna; /* parent adapter */
+	struct netmap_pipe_adapter *mna, *sna, *req;
+	struct ifnet *ifp, *ifp2;
+	u_int pipe_id;
+	int role = nmr->nr_flags & NR_REG_MASK;
+	int error;
+
+	ND("flags %x", nmr->nr_flags);
+
+	if (role != NR_REG_PIPE_MASTER && role != NR_REG_PIPE_SLAVE) {
+		ND("not a pipe");
+		return 0;
+	}
+	role = nmr->nr_flags & NR_REG_MASK;
+
+	/* first, try to find the parent adapter */
+	bzero(&pnmr, sizeof(pnmr));
+	memcpy(&pnmr.nr_name, nmr->nr_name, IFNAMSIZ);
+	/* pass to parent the requested number of pipes */
+	pnmr.nr_arg1 = nmr->nr_arg1;
+	error = netmap_get_na(&pnmr, &pna, create);
+	if (error) {
+		ND("parent lookup failed: %d", error);
+		return error;
+	}
+	ND("found parent: %s", NM_IFPNAME(pna->ifp));
+
+	if (NETMAP_OWNED_BY_KERN(pna)) {
+		ND("parent busy");
+		error = EBUSY;
+		goto put_out;
+	}
+
+	/* next, lookup the pipe id in the parent list */
+	req = NULL;
+	pipe_id = nmr->nr_ringid & NETMAP_RING_MASK;
+	mna = netmap_pipe_find(pna, pipe_id);
+	if (mna) {
+		if (mna->role == role) {
+			ND("found %d directly at %d", pipe_id, mna->parent_slot);
+			req = mna;
+		} else {
+			ND("found %d indirectly at %d", pipe_id, mna->parent_slot);
+			req = mna->peer;
+		}
+		/* the pipe we have found already holds a ref to the parent,
+		 * so we need to drop the one we got from netmap_get_na()
+		 */
+		netmap_adapter_put(pna);
+		goto found;
+	}
+	ND("pipe %d not found, create %d", pipe_id, create);
+	if (!create) {
+		error = ENODEV;
+		goto put_out;
+	}
+	/* we create both master and slave.
+	 * The endpoint we were asked for holds a reference to
+	 * the other one.
+	 */
+	ifp = malloc(sizeof(*ifp), M_DEVBUF, M_NOWAIT | M_ZERO);
+	if (!ifp) {
+		error = ENOMEM;
+		goto put_out;
+	}
+	strcpy(ifp->if_xname, NM_IFPNAME(pna->ifp));
+
+	mna = malloc(sizeof(*mna), M_DEVBUF, M_NOWAIT | M_ZERO);
+	if (mna == NULL) {
+		error = ENOMEM;
+		goto free_ifp;
+	}
+	mna->up.ifp = ifp;
+
+	mna->id = pipe_id;
+	mna->role = NR_REG_PIPE_MASTER;
+	mna->parent = pna;
+
+	mna->up.nm_txsync = netmap_pipe_txsync;
+	mna->up.nm_rxsync = netmap_pipe_rxsync;
+	mna->up.nm_register = netmap_pipe_reg;
+	mna->up.nm_dtor = netmap_pipe_dtor;
+	mna->up.nm_krings_create = netmap_pipe_krings_create;
+	mna->up.nm_krings_delete = netmap_pipe_krings_delete;
+	mna->up.nm_mem = pna->nm_mem;
+	mna->up.na_lut = pna->na_lut;
+	mna->up.na_lut_objtotal = pna->na_lut_objtotal;
+
+	mna->up.num_tx_rings = 1;
+	mna->up.num_rx_rings = 1;
+	mna->up.num_tx_desc = nmr->nr_tx_slots;
+	nm_bound_var(&mna->up.num_tx_desc, pna->num_tx_desc,
+			1, NM_PIPE_MAXSLOTS, NULL);
+	mna->up.num_rx_desc = nmr->nr_rx_slots;
+	nm_bound_var(&mna->up.num_rx_desc, pna->num_rx_desc,
+			1, NM_PIPE_MAXSLOTS, NULL);
+	error = netmap_attach_common(&mna->up);
+	if (error)
+		goto free_ifp;
+	/* register the master with the parent */
+	error = netmap_pipe_add(pna, mna);
+	if (error)
+		goto free_mna;
+
+	/* create the slave */
+	ifp2 = malloc(sizeof(*ifp2), M_DEVBUF, M_NOWAIT | M_ZERO);
+	if (!ifp2) {
+		error = ENOMEM;
+		goto free_mna;
+	}
+	strcpy(ifp2->if_xname, NM_IFPNAME(pna->ifp));
+
+	sna = malloc(sizeof(*sna), M_DEVBUF, M_NOWAIT | M_ZERO);
+	if (sna == NULL) {
+		error = ENOMEM;
+		goto free_ifp2;
+	}
+	/* most fields are the same, copy from master and then fix */
+	*sna = *mna;
+	sna->up.ifp = ifp2;
+	sna->role = NR_REG_PIPE_SLAVE;
+	error = netmap_attach_common(&sna->up);
+	if (error)
+		goto free_sna;
+
+	/* join the two endpoints */
+	mna->peer = sna;
+	sna->peer = mna;
+
+	/* we already have a reference to the parent, but we
+	 * need another one for the other endpoint we created
+	 */
+	netmap_adapter_get(pna);
+
+	if (role == NR_REG_PIPE_MASTER) {
+		req = mna;
+		mna->peer_ref = 1;
+		netmap_adapter_get(&sna->up);
+	} else {
+		req = sna;
+		sna->peer_ref = 1;
+		netmap_adapter_get(&mna->up);
+	}
+	ND("created master %p and slave %p", mna, sna);
+found:
+
+	ND("pipe %d %s at %p", pipe_id,
+		(req->role == NR_REG_PIPE_MASTER ? "master" : "slave"), req);
+	*na = &req->up;
+	netmap_adapter_get(*na);
+
+	/* write the configuration back */
+	nmr->nr_tx_rings = req->up.num_tx_rings;
+	nmr->nr_rx_rings = req->up.num_rx_rings;
+	nmr->nr_tx_slots = req->up.num_tx_desc;
+	nmr->nr_rx_slots = req->up.num_rx_desc;
+
+	/* keep the reference to the parent.
+	 * It will be released by the req destructor
+	 */
+
+	return 0;
+
+free_sna:
+	free(sna, M_DEVBUF);
+free_ifp2:
+	free(ifp2, M_DEVBUF);
+free_mna:
+	free(mna, M_DEVBUF);
+free_ifp:
+	free(ifp, M_DEVBUF);
+put_out:
+	netmap_adapter_put(pna);
+	return error;
+}
+
+
+#endif /* WITH_PIPES */
Property changes on: stable/9/sys/dev/netmap/netmap_pipe.c
___________________________________________________________________
Added: svn:keywords
## -0,0 +1 ##
+FreeBSD=%H
\ No newline at end of property
Index: stable/9/sys/dev/netmap/netmap_vale.c
===================================================================
--- stable/9/sys/dev/netmap/netmap_vale.c	(nonexistent)
+++ stable/9/sys/dev/netmap/netmap_vale.c	(revision 262153)
@@ -0,0 +1,2103 @@
+/*
+ * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+
+/*
+ * This module implements the VALE switch for netmap
+
+--- VALE SWITCH ---
+
+NMG_LOCK() serializes all modifications to switches and ports.
+A switch cannot be deleted until all ports are gone.
+
+For each switch, an SX lock (RWlock on linux) protects
+deletion of ports. When configuring or deleting a port, the
+lock is acquired in exclusive mode (after holding NMG_LOCK).
+When forwarding, the lock is acquired in shared mode (without NMG_LOCK).
+The lock is held throughout the entire forwarding cycle,
+during which the thread may incur a page fault.
+Hence it is important that sleepable shared locks are used.
+
+On the rx ring, the per-port lock is grabbed initially to reserve
+a number of slots in the ring, then the lock is released,
+packets are copied from source to destination, and then
+the lock is acquired again and the receive ring is updated.
+(A similar thing is done on the tx ring for NIC and host stack
+ports attached to the switch)
+
+ */
+
+/*
+ * OS-specific code that is used only within this file.
+ * Other OS-specific code that must be accessed by drivers + * is present in netmap_kern.h + */ + +#if defined(__FreeBSD__) +#include /* prerequisite */ +__FBSDID("$FreeBSD$"); + +#include +#include +#include /* defines used in kernel.h */ +#include /* types used in module initialization */ +#include /* cdevsw struct, UID, GID */ +#include +#include /* struct socket */ +#include +#include +#include +#include /* sockaddrs */ +#include +#include +#include +#include +#include /* BIOCIMMEDIATE */ +#include /* bus_dmamap_* */ +#include +#include + + +#define BDG_RWLOCK_T struct rwlock // struct rwlock + +#define BDG_RWINIT(b) \ + rw_init_flags(&(b)->bdg_lock, "bdg lock", RW_NOWITNESS) +#define BDG_WLOCK(b) rw_wlock(&(b)->bdg_lock) +#define BDG_WUNLOCK(b) rw_wunlock(&(b)->bdg_lock) +#define BDG_RLOCK(b) rw_rlock(&(b)->bdg_lock) +#define BDG_RTRYLOCK(b) rw_try_rlock(&(b)->bdg_lock) +#define BDG_RUNLOCK(b) rw_runlock(&(b)->bdg_lock) +#define BDG_RWDESTROY(b) rw_destroy(&(b)->bdg_lock) + + +#elif defined(linux) + +#include "bsd_glue.h" + +#elif defined(__APPLE__) + +#warning OSX support is only partial +#include "osx_glue.h" + +#else + +#error Unsupported platform + +#endif /* unsupported */ + +/* + * common headers + */ + +#include +#include +#include + +#ifdef WITH_VALE + +/* + * system parameters (most of them in netmap_kern.h) + * NM_NAME prefix for switch port names, default "vale" + * NM_BDG_MAXPORTS number of ports + * NM_BRIDGES max number of switches in the system. + * XXX should become a sysctl or tunable + * + * Switch ports are named valeX:Y where X is the switch name and Y + * is the port. If Y matches a physical interface name, the port is + * connected to a physical device. + * + * Unlike physical interfaces, switch ports use their own memory region + * for rings and buffers. + * The virtual interfaces use per-queue lock instead of core lock. + * In the tx loop, we aggregate traffic in batches to make all operations + * faster. The batch size is bridge_batch. + */ +#define NM_BDG_MAXRINGS 16 /* XXX unclear how many. */ +#define NM_BDG_MAXSLOTS 4096 /* XXX same as above */ +#define NM_BRIDGE_RINGSIZE 1024 /* in the device */ +#define NM_BDG_HASH 1024 /* forwarding table entries */ +#define NM_BDG_BATCH 1024 /* entries in the forwarding buffer */ +#define NM_MULTISEG 64 /* max size of a chain of bufs */ +/* actual size of the tables */ +#define NM_BDG_BATCH_MAX (NM_BDG_BATCH + NM_MULTISEG) +/* NM_FT_NULL terminates a list of slots in the ft */ +#define NM_FT_NULL NM_BDG_BATCH_MAX +#define NM_BRIDGES 8 /* number of bridges */ + + +/* + * bridge_batch is set via sysctl to the max batch size to be + * used in the bridge. The actual value may be larger as the + * last packet in the block may overflow the size. + */ +int bridge_batch = NM_BDG_BATCH; /* bridge batch size */ +SYSCTL_DECL(_dev_netmap); +SYSCTL_INT(_dev_netmap, OID_AUTO, bridge_batch, CTLFLAG_RW, &bridge_batch, 0 , ""); + + +static int bdg_netmap_attach(struct nmreq *nmr, struct ifnet *ifp); +static int bdg_netmap_reg(struct netmap_adapter *na, int onoff); +static int netmap_bwrap_attach(struct ifnet *, struct ifnet *); +static int netmap_bwrap_register(struct netmap_adapter *, int onoff); +int kern_netmap_regif(struct nmreq *nmr); + +/* + * For each output interface, nm_bdg_q is used to construct a list. + * bq_len is the number of output buffers (we can have coalescing + * during the copy). 
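+ * The list is threaded through the ft_next fields of the
+ * nm_bdg_fwd entries, with NM_FT_NULL as the terminator.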
+ */
+struct nm_bdg_q {
+	uint16_t bq_head;
+	uint16_t bq_tail;
+	uint32_t bq_len;	/* number of buffers */
+};
+
+/* XXX revise this */
+struct nm_hash_ent {
+	uint64_t	mac;	/* the top 2 bytes are the epoch */
+	uint64_t	ports;
+};
+
+/*
+ * nm_bridge is a descriptor for a VALE switch.
+ * Interfaces for a bridge are all in bdg_ports[].
+ * The array has fixed size, an empty entry does not terminate
+ * the search, but lookups only occur on attach/detach so we
+ * don't mind if they are slow.
+ *
+ * The bridge is non blocking on the transmit ports: excess
+ * packets are dropped if there is no room on the output port.
+ *
+ * bdg_lock protects accesses to the bdg_ports array.
+ * This is a rw lock (or equivalent).
+ */
+struct nm_bridge {
+	/* XXX what is the proper alignment/layout ? */
+	BDG_RWLOCK_T	bdg_lock;	/* protects bdg_ports */
+	int		bdg_namelen;
+	uint32_t	bdg_active_ports; /* 0 means free */
+	char		bdg_basename[IFNAMSIZ];
+
+	/* Indexes of active ports (up to active_ports)
+	 * and all other remaining ports.
+	 */
+	uint8_t		bdg_port_index[NM_BDG_MAXPORTS];
+
+	struct netmap_vp_adapter *bdg_ports[NM_BDG_MAXPORTS];
+
+
+	/*
+	 * The function to decide the destination port.
+	 * It returns either an index of the destination port,
+	 * NM_BDG_BROADCAST to broadcast this packet, or NM_BDG_NOPORT not to
+	 * forward this packet. ring_nr is the source ring index, and the
+	 * function may overwrite this value to forward this packet to a
+	 * different ring index.
+	 * This function must be set by netmap_bdg_ctl().
+	 */
+	bdg_lookup_fn_t nm_bdg_lookup;
+
+	/* the forwarding table, MAC+ports.
+	 * XXX should be changed to an argument to be passed to
+	 * the lookup function, and allocated on attach
+	 */
+	struct nm_hash_ent ht[NM_BDG_HASH];
+};
+
+
+/*
+ * XXX in principle nm_bridges could be created dynamically
+ * Right now we have a static array and deletions are protected
+ * by an exclusive lock.
+ */
+struct nm_bridge nm_bridges[NM_BRIDGES];
+
+
+/*
+ * this is a slightly optimized copy routine which rounds
+ * to multiple of 64 bytes and is often faster than dealing
+ * with other odd sizes. We assume there is enough room
+ * in the source and destination buffers.
+ *
+ * XXX only for multiples of 64 bytes, non overlapped.
+ */
+static inline void
+pkt_copy(void *_src, void *_dst, int l)
+{
+	uint64_t *src = _src;
+	uint64_t *dst = _dst;
+	if (unlikely(l >= 1024)) {
+		memcpy(dst, src, l);
+		return;
+	}
+	for (; likely(l > 0); l-=64) {
+		*dst++ = *src++;
+		*dst++ = *src++;
+		*dst++ = *src++;
+		*dst++ = *src++;
+		*dst++ = *src++;
+		*dst++ = *src++;
+		*dst++ = *src++;
+		*dst++ = *src++;
+	}
+}
+
+
+/*
+ * locate a bridge among the existing ones.
+ * MUST BE CALLED WITH NMG_LOCK()
+ *
+ * a ':' in the name terminates the bridge name. Otherwise, just NM_NAME.
+ * We assume that this is called with a name of at least NM_NAME chars.
+ */
+static struct nm_bridge *
+nm_find_bridge(const char *name, int create)
+{
+	int i, l, namelen;
+	struct nm_bridge *b = NULL;
+
+	NMG_LOCK_ASSERT();
+
+	namelen = strlen(NM_NAME);	/* base length */
+	l = name ? strlen(name) : 0;	/* actual length */
+	if (l < namelen) {
+		D("invalid bridge name %s", name ?
name : NULL); + return NULL; + } + for (i = namelen + 1; i < l; i++) { + if (name[i] == ':') { + namelen = i; + break; + } + } + if (namelen >= IFNAMSIZ) + namelen = IFNAMSIZ; + ND("--- prefix is '%.*s' ---", namelen, name); + + /* lookup the name, remember empty slot if there is one */ + for (i = 0; i < NM_BRIDGES; i++) { + struct nm_bridge *x = nm_bridges + i; + + if (x->bdg_active_ports == 0) { + if (create && b == NULL) + b = x; /* record empty slot */ + } else if (x->bdg_namelen != namelen) { + continue; + } else if (strncmp(name, x->bdg_basename, namelen) == 0) { + ND("found '%.*s' at %d", namelen, name, i); + b = x; + break; + } + } + if (i == NM_BRIDGES && b) { /* name not found, can create entry */ + /* initialize the bridge */ + strncpy(b->bdg_basename, name, namelen); + ND("create new bridge %s with ports %d", b->bdg_basename, + b->bdg_active_ports); + b->bdg_namelen = namelen; + b->bdg_active_ports = 0; + for (i = 0; i < NM_BDG_MAXPORTS; i++) + b->bdg_port_index[i] = i; + /* set the default function */ + b->nm_bdg_lookup = netmap_bdg_learning; + /* reset the MAC address table */ + bzero(b->ht, sizeof(struct nm_hash_ent) * NM_BDG_HASH); + } + return b; +} + + +/* + * Free the forwarding tables for rings attached to switch ports. + */ +static void +nm_free_bdgfwd(struct netmap_adapter *na) +{ + int nrings, i; + struct netmap_kring *kring; + + NMG_LOCK_ASSERT(); + nrings = na->num_tx_rings; + kring = na->tx_rings; + for (i = 0; i < nrings; i++) { + if (kring[i].nkr_ft) { + free(kring[i].nkr_ft, M_DEVBUF); + kring[i].nkr_ft = NULL; /* protect from freeing twice */ + } + } +} + + +/* + * Allocate the forwarding tables for the rings attached to the bridge ports. + */ +static int +nm_alloc_bdgfwd(struct netmap_adapter *na) +{ + int nrings, l, i, num_dstq; + struct netmap_kring *kring; + + NMG_LOCK_ASSERT(); + /* all port:rings + broadcast */ + num_dstq = NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1; + l = sizeof(struct nm_bdg_fwd) * NM_BDG_BATCH_MAX; + l += sizeof(struct nm_bdg_q) * num_dstq; + l += sizeof(uint16_t) * NM_BDG_BATCH_MAX; + + nrings = netmap_real_tx_rings(na); + kring = na->tx_rings; + for (i = 0; i < nrings; i++) { + struct nm_bdg_fwd *ft; + struct nm_bdg_q *dstq; + int j; + + ft = malloc(l, M_DEVBUF, M_NOWAIT | M_ZERO); + if (!ft) { + nm_free_bdgfwd(na); + return ENOMEM; + } + dstq = (struct nm_bdg_q *)(ft + NM_BDG_BATCH_MAX); + for (j = 0; j < num_dstq; j++) { + dstq[j].bq_head = dstq[j].bq_tail = NM_FT_NULL; + dstq[j].bq_len = 0; + } + kring[i].nkr_ft = ft; + } + return 0; +} + + +static void +netmap_bdg_detach_common(struct nm_bridge *b, int hw, int sw) +{ + int s_hw = hw, s_sw = sw; + int i, lim =b->bdg_active_ports; + uint8_t tmp[NM_BDG_MAXPORTS]; + + /* + New algorithm: + make a copy of bdg_port_index; + lookup NA(ifp)->bdg_port and SWNA(ifp)->bdg_port + in the array of bdg_port_index, replacing them with + entries from the bottom of the array; + decrement bdg_active_ports; + acquire BDG_WLOCK() and copy back the array. + */ + + if (netmap_verbose) + D("detach %d and %d (lim %d)", hw, sw, lim); + /* make a copy of the list of active ports, update it, + * and then copy back within BDG_WLOCK(). 
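+	 * Example: with bdg_port_index = [2 5 7 3], lim = 4 and
+	 * hw = 5, the swap below leaves [2 3 7 | 5] with lim = 3,
+	 * so entry 5 ends up outside the active region.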
+ */ + memcpy(tmp, b->bdg_port_index, sizeof(tmp)); + for (i = 0; (hw >= 0 || sw >= 0) && i < lim; ) { + if (hw >= 0 && tmp[i] == hw) { + ND("detach hw %d at %d", hw, i); + lim--; /* point to last active port */ + tmp[i] = tmp[lim]; /* swap with i */ + tmp[lim] = hw; /* now this is inactive */ + hw = -1; + } else if (sw >= 0 && tmp[i] == sw) { + ND("detach sw %d at %d", sw, i); + lim--; + tmp[i] = tmp[lim]; + tmp[lim] = sw; + sw = -1; + } else { + i++; + } + } + if (hw >= 0 || sw >= 0) { + D("XXX delete failed hw %d sw %d, should panic...", hw, sw); + } + + BDG_WLOCK(b); + b->bdg_ports[s_hw] = NULL; + if (s_sw >= 0) { + b->bdg_ports[s_sw] = NULL; + } + memcpy(b->bdg_port_index, tmp, sizeof(tmp)); + b->bdg_active_ports = lim; + BDG_WUNLOCK(b); + + ND("now %d active ports", lim); + if (lim == 0) { + ND("marking bridge %s as free", b->bdg_basename); + b->nm_bdg_lookup = NULL; + } +} + + +static void +netmap_adapter_vp_dtor(struct netmap_adapter *na) +{ + struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter*)na; + struct nm_bridge *b = vpna->na_bdg; + struct ifnet *ifp = na->ifp; + + ND("%s has %d references", NM_IFPNAME(ifp), na->na_refcount); + + if (b) { + netmap_bdg_detach_common(b, vpna->bdg_port, -1); + } + + bzero(ifp, sizeof(*ifp)); + free(ifp, M_DEVBUF); + na->ifp = NULL; +} + + +/* Try to get a reference to a netmap adapter attached to a VALE switch. + * If the adapter is found (or is created), this function returns 0, a + * non NULL pointer is returned into *na, and the caller holds a + * reference to the adapter. + * If an adapter is not found, then no reference is grabbed and the + * function returns an error code, or 0 if there is just a VALE prefix + * mismatch. Therefore the caller holds a reference when + * (*na != NULL && return == 0). + */ +int +netmap_get_bdg_na(struct nmreq *nmr, struct netmap_adapter **na, int create) +{ + const char *name = nmr->nr_name; + struct ifnet *ifp; + int error = 0; + struct netmap_adapter *ret; + struct netmap_vp_adapter *vpna; + struct nm_bridge *b; + int i, j, cand = -1, cand2 = -1; + int needed; + + *na = NULL; /* default return value */ + + /* first try to see if this is a bridge port. */ + NMG_LOCK_ASSERT(); + if (strncmp(name, NM_NAME, sizeof(NM_NAME) - 1)) { + return 0; /* no error, but no VALE prefix */ + } + + b = nm_find_bridge(name, create); + if (b == NULL) { + D("no bridges available for '%s'", name); + return (create ? ENOMEM : ENXIO); + } + + /* Now we are sure that name starts with the bridge's name, + * lookup the port in the bridge. We need to scan the entire + * list. It is not important to hold a WLOCK on the bridge + * during the search because NMG_LOCK already guarantees + * that there are no other possible writers. + */ + + /* lookup in the local list of ports */ + for (j = 0; j < b->bdg_active_ports; j++) { + i = b->bdg_port_index[j]; + vpna = b->bdg_ports[i]; + // KASSERT(na != NULL); + ifp = vpna->up.ifp; + /* XXX make sure the name only contains one : */ + if (!strcmp(NM_IFPNAME(ifp), name)) { + netmap_adapter_get(&vpna->up); + ND("found existing if %s refs %d", name, + vpna->na_bdg_refcount); + *na = (struct netmap_adapter *)vpna; + return 0; + } + } + /* not found, should we create it? 
	 */
+	if (!create)
+		return ENXIO;
+	/* yes we should, see if we have space to attach entries */
+	needed = 2; /* in some cases we only need 1 */
+	if (b->bdg_active_ports + needed >= NM_BDG_MAXPORTS) {
+		D("bridge full %d, cannot create new port", b->bdg_active_ports);
+		return ENOMEM;
+	}
+	/* record the next two ports available, but do not allocate yet */
+	cand = b->bdg_port_index[b->bdg_active_ports];
+	cand2 = b->bdg_port_index[b->bdg_active_ports + 1];
+	ND("+++ bridge %s port %s used %d avail %d %d",
+		b->bdg_basename, name, b->bdg_active_ports, cand, cand2);
+
+	/*
+	 * try to see if there is a matching NIC with this name
+	 * (after the bridge's name)
+	 */
+	ifp = ifunit_ref(name + b->bdg_namelen + 1);
+	if (!ifp) { /* this is a virtual port */
+		if (nmr->nr_cmd) {
+			/* nr_cmd must be 0 for a virtual port */
+			return EINVAL;
+		}
+
+		/* create a struct ifnet for the new port.
+		 * need M_NOWAIT as we are under nma_lock
+		 */
+		ifp = malloc(sizeof(*ifp), M_DEVBUF, M_NOWAIT | M_ZERO);
+		if (!ifp)
+			return ENOMEM;
+
+		strcpy(ifp->if_xname, name);
+		/* bdg_netmap_attach creates a struct netmap_adapter */
+		error = bdg_netmap_attach(nmr, ifp);
+		if (error) {
+			D("error %d", error);
+			free(ifp, M_DEVBUF);
+			return error;
+		}
+		ret = NA(ifp);
+		cand2 = -1; /* only need one port */
+	} else { /* this is a NIC */
+		struct ifnet *fake_ifp;
+
+		error = netmap_get_hw_na(ifp, &ret);
+		if (error || ret == NULL)
+			goto out;
+
+		/* make sure the NIC is not already in use */
+		if (NETMAP_OWNED_BY_ANY(ret)) {
+			D("NIC %s busy, cannot attach to bridge",
+				NM_IFPNAME(ifp));
+			error = EBUSY;
+			goto out;
+		}
+		/* create a fake interface */
+		fake_ifp = malloc(sizeof(*ifp), M_DEVBUF, M_NOWAIT | M_ZERO);
+		if (!fake_ifp) {
+			error = ENOMEM;
+			goto out;
+		}
+		strcpy(fake_ifp->if_xname, name);
+		error = netmap_bwrap_attach(fake_ifp, ifp);
+		if (error) {
+			free(fake_ifp, M_DEVBUF);
+			goto out;
+		}
+		ret = NA(fake_ifp);
+		if (nmr->nr_arg1 != NETMAP_BDG_HOST)
+			cand2 = -1; /* only need one port */
+		if_rele(ifp);
+	}
+	vpna = (struct netmap_vp_adapter *)ret;
+
+	BDG_WLOCK(b);
+	vpna->bdg_port = cand;
+	ND("NIC %p to bridge port %d", vpna, cand);
+	/* bind the port to the bridge (virtual ports are not active) */
+	b->bdg_ports[cand] = vpna;
+	vpna->na_bdg = b;
+	b->bdg_active_ports++;
+	if (cand2 >= 0) {
+		struct netmap_vp_adapter *hostna = vpna + 1;
+		/* also bind the host stack to the bridge */
+		b->bdg_ports[cand2] = hostna;
+		hostna->bdg_port = cand2;
+		hostna->na_bdg = b;
+		b->bdg_active_ports++;
+		ND("host %p to bridge port %d", hostna, cand2);
+	}
+	ND("if %s refs %d", name, vpna->up.na_refcount);
+	BDG_WUNLOCK(b);
+	*na = ret;
+	netmap_adapter_get(ret);
+	return 0;
+
+out:
+	if_rele(ifp);
+
+	return error;
+}
+
+
+/* Process NETMAP_BDG_ATTACH and NETMAP_BDG_DETACH */
+static int
+nm_bdg_attach(struct nmreq *nmr)
+{
+	struct netmap_adapter *na;
+	struct netmap_if *nifp;
+	struct netmap_priv_d *npriv;
+	struct netmap_bwrap_adapter *bna;
+	int error;
+
+	npriv = malloc(sizeof(*npriv), M_DEVBUF, M_NOWAIT|M_ZERO);
+	if (npriv == NULL)
+		return ENOMEM;
+
+	NMG_LOCK();
+
+	error = netmap_get_bdg_na(nmr, &na, 1 /* create if not exists */);
+	if (error) /* no device, or another bridge or user owns the device */
+		goto unlock_exit;
+
+	if (na == NULL) { /* VALE prefix missing */
+		error = EINVAL;
+		goto unlock_exit;
+	}
+
+	if (na->active_fds > 0) { /* already registered */
+		error = EBUSY;
+		goto unref_exit;
+	}
+
+	nifp = netmap_do_regif(npriv, na, nmr->nr_ringid, nmr->nr_flags, &error);
+	if (!nifp) {
unref_exit; + } + + bna = (struct netmap_bwrap_adapter*)na; + bna->na_kpriv = npriv; + NMG_UNLOCK(); + ND("registered %s to netmap-mode", NM_IFPNAME(na->ifp)); + return 0; + +unref_exit: + netmap_adapter_put(na); +unlock_exit: + NMG_UNLOCK(); + bzero(npriv, sizeof(*npriv)); + free(npriv, M_DEVBUF); + return error; +} + + +static int +nm_bdg_detach(struct nmreq *nmr) +{ + struct netmap_adapter *na; + int error; + struct netmap_bwrap_adapter *bna; + int last_instance; + + NMG_LOCK(); + error = netmap_get_bdg_na(nmr, &na, 0 /* don't create */); + if (error) { /* no device, or another bridge or user owns the device */ + goto unlock_exit; + } + + if (na == NULL) { /* VALE prefix missing */ + error = EINVAL; + goto unlock_exit; + } + + bna = (struct netmap_bwrap_adapter *)na; + + if (na->active_fds == 0) { /* not registered */ + error = EINVAL; + goto unref_exit; + } + + last_instance = netmap_dtor_locked(bna->na_kpriv); /* unregister */ + if (!last_instance) { + D("--- error, trying to detach an entry with active mmaps"); + error = EINVAL; + } else { + struct netmap_priv_d *npriv = bna->na_kpriv; + + bna->na_kpriv = NULL; + D("deleting priv"); + + bzero(npriv, sizeof(*npriv)); + free(npriv, M_DEVBUF); + } + +unref_exit: + netmap_adapter_put(na); +unlock_exit: + NMG_UNLOCK(); + return error; + +} + + +/* exported to kernel callers, e.g. OVS ? + * Entry point. + * Called without NMG_LOCK. + */ +int +netmap_bdg_ctl(struct nmreq *nmr, bdg_lookup_fn_t func) +{ + struct nm_bridge *b; + struct netmap_adapter *na; + struct netmap_vp_adapter *vpna; + struct ifnet *iter; + char *name = nmr->nr_name; + int cmd = nmr->nr_cmd, namelen = strlen(name); + int error = 0, i, j; + + switch (cmd) { + case NETMAP_BDG_ATTACH: + error = nm_bdg_attach(nmr); + break; + + case NETMAP_BDG_DETACH: + error = nm_bdg_detach(nmr); + break; + + case NETMAP_BDG_LIST: + /* this is used to enumerate bridges and ports */ + if (namelen) { /* look up indexes of bridge and port */ + if (strncmp(name, NM_NAME, strlen(NM_NAME))) { + error = EINVAL; + break; + } + NMG_LOCK(); + b = nm_find_bridge(name, 0 /* don't create */); + if (!b) { + error = ENOENT; + NMG_UNLOCK(); + break; + } + + error = ENOENT; + for (j = 0; j < b->bdg_active_ports; j++) { + i = b->bdg_port_index[j]; + vpna = b->bdg_ports[i]; + if (vpna == NULL) { + D("---AAAAAAAAARGH-------"); + continue; + } + iter = vpna->up.ifp; + /* the former and the latter identify a + * virtual port and a NIC, respectively + */ + if (!strcmp(iter->if_xname, name)) { + /* bridge index */ + nmr->nr_arg1 = b - nm_bridges; + nmr->nr_arg2 = i; /* port index */ + error = 0; + break; + } + } + NMG_UNLOCK(); + } else { + /* return the first non-empty entry starting from + * bridge nr_arg1 and port nr_arg2. + * + * Users can detect the end of the same bridge by + * seeing the new and old value of nr_arg1, and can + * detect the end of all the bridge by error != 0 + */ + i = nmr->nr_arg1; + j = nmr->nr_arg2; + + NMG_LOCK(); + for (error = ENOENT; i < NM_BRIDGES; i++) { + b = nm_bridges + i; + if (j >= b->bdg_active_ports) { + j = 0; /* following bridges scan from 0 */ + continue; + } + nmr->nr_arg1 = i; + nmr->nr_arg2 = j; + j = b->bdg_port_index[j]; + vpna = b->bdg_ports[j]; + iter = vpna->up.ifp; + strncpy(name, iter->if_xname, (size_t)IFNAMSIZ); + error = 0; + break; + } + NMG_UNLOCK(); + } + break; + + case NETMAP_BDG_LOOKUP_REG: + /* register a lookup function to the given bridge. + * nmr->nr_name may be just bridge's name (including ':' + * if it is not just NM_NAME). 
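+	 * The registered function replaces the default learning
+	 * bridge lookup (netmap_bdg_learning) for this switch.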
+ */ + if (!func) { + error = EINVAL; + break; + } + NMG_LOCK(); + b = nm_find_bridge(name, 0 /* don't create */); + if (!b) { + error = EINVAL; + } else { + b->nm_bdg_lookup = func; + } + NMG_UNLOCK(); + break; + + case NETMAP_BDG_VNET_HDR: + /* Valid lengths for the virtio-net header are 0 (no header), + 10 and 12. */ + if (nmr->nr_arg1 != 0 && + nmr->nr_arg1 != sizeof(struct nm_vnet_hdr) && + nmr->nr_arg1 != 12) { + error = EINVAL; + break; + } + NMG_LOCK(); + error = netmap_get_bdg_na(nmr, &na, 0); + if (na && !error) { + vpna = (struct netmap_vp_adapter *)na; + vpna->virt_hdr_len = nmr->nr_arg1; + if (vpna->virt_hdr_len) + vpna->mfs = NETMAP_BDG_BUF_SIZE(na->nm_mem); + D("Using vnet_hdr_len %d for %p", vpna->virt_hdr_len, vpna); + netmap_adapter_put(na); + } + NMG_UNLOCK(); + break; + + default: + D("invalid cmd (nmr->nr_cmd) (0x%x)", cmd); + error = EINVAL; + break; + } + return error; +} + +static int +netmap_vp_krings_create(struct netmap_adapter *na) +{ + u_int tailroom; + int error, i; + uint32_t *leases; + u_int nrx = netmap_real_rx_rings(na); + + /* + * Leases are attached to RX rings on vale ports + */ + tailroom = sizeof(uint32_t) * na->num_rx_desc * nrx; + + error = netmap_krings_create(na, tailroom); + if (error) + return error; + + leases = na->tailroom; + + for (i = 0; i < nrx; i++) { /* Receive rings */ + na->rx_rings[i].nkr_leases = leases; + leases += na->num_rx_desc; + } + + error = nm_alloc_bdgfwd(na); + if (error) { + netmap_krings_delete(na); + return error; + } + + return 0; +} + + +static void +netmap_vp_krings_delete(struct netmap_adapter *na) +{ + nm_free_bdgfwd(na); + netmap_krings_delete(na); +} + + +static int +nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, + struct netmap_vp_adapter *na, u_int ring_nr); + + +/* + * Grab packets from a kring, move them into the ft structure + * associated to the tx (input) port. Max one instance per port, + * filtered on input (ioctl, poll or XXX). + * Returns the next position in the ring. + */ +static int +nm_bdg_preflush(struct netmap_vp_adapter *na, u_int ring_nr, + struct netmap_kring *kring, u_int end) +{ + struct netmap_ring *ring = kring->ring; + struct nm_bdg_fwd *ft; + u_int j = kring->nr_hwcur, lim = kring->nkr_num_slots - 1; + u_int ft_i = 0; /* start from 0 */ + u_int frags = 1; /* how many frags ? */ + struct nm_bridge *b = na->na_bdg; + + /* To protect against modifications to the bridge we acquire a + * shared lock, waiting if we can sleep (if the source port is + * attached to a user process) or with a trylock otherwise (NICs). + */ + ND("wait rlock for %d packets", ((j > end ? lim+1 : 0) + end) - j); + if (na->up.na_flags & NAF_BDG_MAYSLEEP) + BDG_RLOCK(b); + else if (!BDG_RTRYLOCK(b)) + return 0; + ND(5, "rlock acquired for %d packets", ((j > end ? lim+1 : 0) + end) - j); + ft = kring->nkr_ft; + + for (; likely(j != end); j = nm_next(j, lim)) { + struct netmap_slot *slot = &ring->slot[j]; + char *buf; + + ft[ft_i].ft_len = slot->len; + ft[ft_i].ft_flags = slot->flags; + + ND("flags is 0x%x", slot->flags); + /* this slot goes into a list so initialize the link field */ + ft[ft_i].ft_next = NM_FT_NULL; + buf = ft[ft_i].ft_buf = (slot->flags & NS_INDIRECT) ? 
+			(void *)(uintptr_t)slot->ptr : BDG_NMB(&na->up, slot);
+		__builtin_prefetch(buf);
+		++ft_i;
+		if (slot->flags & NS_MOREFRAG) {
+			frags++;
+			continue;
+		}
+		if (unlikely(netmap_verbose && frags > 1))
+			RD(5, "%d frags at %d", frags, ft_i - frags);
+		ft[ft_i - frags].ft_frags = frags;
+		frags = 1;
+		if (unlikely((int)ft_i >= bridge_batch))
+			ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr);
+	}
+	if (frags > 1) {
+		D("truncate incomplete fragment at %d (%d frags)", ft_i, frags);
+		// ft_i > 0, ft[ft_i-1].flags has NS_MOREFRAG
+		ft[ft_i - 1].ft_frags &= ~NS_MOREFRAG;
+		ft[ft_i - frags].ft_frags = frags - 1;
+	}
+	if (ft_i)
+		ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr);
+	BDG_RUNLOCK(b);
+	return j;
+}
+
+
+/* ----- FreeBSD if_bridge hash function ------- */
+
+/*
+ * The following hash function is adapted from "Hash Functions" by Bob Jenkins
+ * ("Algorithm Alley", Dr. Dobbs Journal, September 1997).
+ *
+ * http://www.burtleburtle.net/bob/hash/spooky.html
+ */
+#define mix(a, b, c)							\
+do {									\
+	a -= b; a -= c; a ^= (c >> 13);					\
+	b -= c; b -= a; b ^= (a << 8);					\
+	c -= a; c -= b; c ^= (b >> 13);					\
+	a -= b; a -= c; a ^= (c >> 12);					\
+	b -= c; b -= a; b ^= (a << 16);					\
+	c -= a; c -= b; c ^= (b >> 5);					\
+	a -= b; a -= c; a ^= (c >> 3);					\
+	b -= c; b -= a; b ^= (a << 10);					\
+	c -= a; c -= b; c ^= (b >> 15);					\
+} while (/*CONSTCOND*/0)
+
+
+static __inline uint32_t
+nm_bridge_rthash(const uint8_t *addr)
+{
+	uint32_t a = 0x9e3779b9, b = 0x9e3779b9, c = 0; // hash key
+
+	b += addr[5] << 8;
+	b += addr[4];
+	a += addr[3] << 24;
+	a += addr[2] << 16;
+	a += addr[1] << 8;
+	a += addr[0];
+
+	mix(a, b, c);
+#define BRIDGE_RTHASH_MASK	(NM_BDG_HASH-1)
+	return (c & BRIDGE_RTHASH_MASK);
+}
+
+#undef mix
+
+
+static int
+bdg_netmap_reg(struct netmap_adapter *na, int onoff)
+{
+	struct netmap_vp_adapter *vpna =
+		(struct netmap_vp_adapter*)na;
+	struct ifnet *ifp = na->ifp;
+
+	/* the interface is already attached to the bridge,
+	 * so we only need to toggle IFCAP_NETMAP.
+	 */
+	BDG_WLOCK(vpna->na_bdg);
+	if (onoff) {
+		ifp->if_capenable |= IFCAP_NETMAP;
+	} else {
+		ifp->if_capenable &= ~IFCAP_NETMAP;
+	}
+	BDG_WUNLOCK(vpna->na_bdg);
+	return 0;
+}
+
+
+/*
+ * Lookup function for a learning bridge.
+ * Update the hash table with the source address,
+ * and then return the destination port index and the
+ * ring in *dst_ring (at the moment we always use ring 0).
+ */
+u_int
+netmap_bdg_learning(char *buf, u_int buf_len, uint8_t *dst_ring,
+		struct netmap_vp_adapter *na)
+{
+	struct nm_hash_ent *ht = na->na_bdg->ht;
+	uint32_t sh, dh;
+	u_int dst, mysrc = na->bdg_port;
+	uint64_t smac, dmac;
+
+	if (buf_len < 14) {
+		D("invalid buf length %d", buf_len);
+		return NM_BDG_NOPORT;
+	}
+	dmac = le64toh(*(uint64_t *)(buf)) & 0xffffffffffff;
+	smac = le64toh(*(uint64_t *)(buf + 4));
+	smac >>= 16;
+
+	/*
+	 * The hash is somewhat expensive, there might be some
+	 * worthwhile optimizations here.
+	 */
+	if ((buf[6] & 1) == 0) { /* valid src */
+		uint8_t *s = buf+6;
+		sh = nm_bridge_rthash(s); // XXX hash of source
+		/* update source port forwarding entry */
+		ht[sh].mac = smac;	/* XXX expire ? */
+		ht[sh].ports = mysrc;
+		if (netmap_verbose)
+		    D("src %02x:%02x:%02x:%02x:%02x:%02x on port %d",
+			s[0], s[1], s[2], s[3], s[4], s[5], mysrc);
+	}
+	dst = NM_BDG_BROADCAST;
+	if ((buf[0] & 1) == 0) { /* unicast */
+		dh = nm_bridge_rthash(buf); // XXX hash of dst
+		if (ht[dh].mac == dmac) {	/* found dst */
+			dst = ht[dh].ports;
+		}
+		/* XXX otherwise return NM_BDG_UNKNOWN ?
*/ + } + *dst_ring = 0; + return dst; +} + + +/* + * Available space in the ring. Only used in VALE code + * and only with is_rx = 1 + */ +static inline uint32_t +nm_kr_space(struct netmap_kring *k, int is_rx) +{ + int space; + + if (is_rx) { + int busy = k->nkr_hwlease - k->nr_hwcur; + if (busy < 0) + busy += k->nkr_num_slots; + space = k->nkr_num_slots - 1 - busy; + } else { + /* XXX never used in this branch */ + space = k->nr_hwtail - k->nkr_hwlease; + if (space < 0) + space += k->nkr_num_slots; + } +#if 0 + // sanity check + if (k->nkr_hwlease >= k->nkr_num_slots || + k->nr_hwcur >= k->nkr_num_slots || + k->nr_tail >= k->nkr_num_slots || + busy < 0 || + busy >= k->nkr_num_slots) { + D("invalid kring, cur %d tail %d lease %d lease_idx %d lim %d", k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease, + k->nkr_lease_idx, k->nkr_num_slots); + } +#endif + return space; +} + + + + +/* make a lease on the kring for N positions. return the + * lease index + * XXX only used in VALE code and with is_rx = 1 + */ +static inline uint32_t +nm_kr_lease(struct netmap_kring *k, u_int n, int is_rx) +{ + uint32_t lim = k->nkr_num_slots - 1; + uint32_t lease_idx = k->nkr_lease_idx; + + k->nkr_leases[lease_idx] = NR_NOSLOT; + k->nkr_lease_idx = nm_next(lease_idx, lim); + + if (n > nm_kr_space(k, is_rx)) { + D("invalid request for %d slots", n); + panic("x"); + } + /* XXX verify that there are n slots */ + k->nkr_hwlease += n; + if (k->nkr_hwlease > lim) + k->nkr_hwlease -= lim + 1; + + if (k->nkr_hwlease >= k->nkr_num_slots || + k->nr_hwcur >= k->nkr_num_slots || + k->nr_hwtail >= k->nkr_num_slots || + k->nkr_lease_idx >= k->nkr_num_slots) { + D("invalid kring %s, cur %d tail %d lease %d lease_idx %d lim %d", + k->na->ifp->if_xname, + k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease, + k->nkr_lease_idx, k->nkr_num_slots); + } + return lease_idx; +} + +/* + * This flush routine supports only unicast and broadcast but a large + * number of ports, and lets us replace the learn and dispatch functions. + */ +int +nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_vp_adapter *na, + u_int ring_nr) +{ + struct nm_bdg_q *dst_ents, *brddst; + uint16_t num_dsts = 0, *dsts; + struct nm_bridge *b = na->na_bdg; + u_int i, j, me = na->bdg_port; + + /* + * The work area (pointed by ft) is followed by an array of + * pointers to queues , dst_ents; there are NM_BDG_MAXRINGS + * queues per port plus one for the broadcast traffic. + * Then we have an array of destination indexes. + */ + dst_ents = (struct nm_bdg_q *)(ft + NM_BDG_BATCH_MAX); + dsts = (uint16_t *)(dst_ents + NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1); + + /* first pass: find a destination for each packet in the batch */ + for (i = 0; likely(i < n); i += ft[i].ft_frags) { + uint8_t dst_ring = ring_nr; /* default, same ring as origin */ + uint16_t dst_port, d_i; + struct nm_bdg_q *d; + uint8_t *buf = ft[i].ft_buf; + u_int len = ft[i].ft_len; + + ND("slot %d frags %d", i, ft[i].ft_frags); + /* Drop the packet if the virtio-net header is not into the first + fragment nor at the very beginning of the second. 
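+		   (That is, the first fragment must contain at least the whole
+		   header; if it holds nothing else, the forwarding decision is
+		   taken on the second fragment.)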
 */
+		if (unlikely(na->virt_hdr_len > len))
+			continue;
+		if (len == na->virt_hdr_len) {
+			buf = ft[i+1].ft_buf;
+			len = ft[i+1].ft_len;
+		} else {
+			buf += na->virt_hdr_len;
+			len -= na->virt_hdr_len;
+		}
+		dst_port = b->nm_bdg_lookup(buf, len, &dst_ring, na);
+		if (netmap_verbose > 255)
+			RD(5, "slot %d port %d -> %d", i, me, dst_port);
+		if (dst_port == NM_BDG_NOPORT)
+			continue; /* this packet is to be dropped */
+		else if (unlikely(dst_port > NM_BDG_MAXPORTS))
+			continue;
+		else if (dst_port == NM_BDG_BROADCAST)
+			dst_ring = 0; /* broadcasts always go to ring 0 */
+		else if (unlikely(dst_port == me ||
+		    !b->bdg_ports[dst_port]))
+			continue;
+
+		/* get a position in the scratch pad */
+		d_i = dst_port * NM_BDG_MAXRINGS + dst_ring;
+		d = dst_ents + d_i;
+
+		/* append the first fragment to the list */
+		if (d->bq_head == NM_FT_NULL) { /* new destination */
+			d->bq_head = d->bq_tail = i;
+			/* remember this position to be scanned later */
+			if (dst_port != NM_BDG_BROADCAST)
+				dsts[num_dsts++] = d_i;
+		} else {
+			ft[d->bq_tail].ft_next = i;
+			d->bq_tail = i;
+		}
+		d->bq_len += ft[i].ft_frags;
+	}
+
+	/*
+	 * Broadcast traffic goes to ring 0 on all destinations.
+	 * So we need to add these rings to the list of ports to scan.
+	 * XXX at the moment we scan all NM_BDG_MAXPORTS ports, which is
+	 * expensive. We should keep a compact list of active destinations
+	 * so we could shorten this loop.
+	 */
+	brddst = dst_ents + NM_BDG_BROADCAST * NM_BDG_MAXRINGS;
+	if (brddst->bq_head != NM_FT_NULL) {
+		for (j = 0; likely(j < b->bdg_active_ports); j++) {
+			uint16_t d_i;
+			i = b->bdg_port_index[j];
+			if (unlikely(i == me))
+				continue;
+			d_i = i * NM_BDG_MAXRINGS;
+			if (dst_ents[d_i].bq_head == NM_FT_NULL)
+				dsts[num_dsts++] = d_i;
+		}
+	}
+
+	ND(5, "pass 1 done %d pkts %d dsts", n, num_dsts);
+	/* second pass: scan destinations (XXX will be modular somehow) */
+	for (i = 0; i < num_dsts; i++) {
+		struct ifnet *dst_ifp;
+		struct netmap_vp_adapter *dst_na;
+		struct netmap_kring *kring;
+		struct netmap_ring *ring;
+		u_int dst_nr, lim, j, d_i, next, brd_next;
+		u_int needed, howmany;
+		int retry = netmap_txsync_retry;
+		struct nm_bdg_q *d;
+		uint32_t my_start = 0, lease_idx = 0;
+		int nrings;
+		int virt_hdr_mismatch = 0;
+
+		d_i = dsts[i];
+		ND("second pass %d port %d", i, d_i);
+		d = dst_ents + d_i;
+		// XXX fix the division
+		dst_na = b->bdg_ports[d_i/NM_BDG_MAXRINGS];
+		/* protect from the lookup function returning an inactive
+		 * destination port
+		 */
+		if (unlikely(dst_na == NULL))
+			goto cleanup;
+		if (dst_na->up.na_flags & NAF_SW_ONLY)
+			goto cleanup;
+		dst_ifp = dst_na->up.ifp;
+		/*
+		 * The interface may be in !netmap mode in two cases:
+		 * - when na is attached but not activated yet;
+		 * - when na is being deactivated but is still attached.
+		 */
+		if (unlikely(!(dst_ifp->if_capenable & IFCAP_NETMAP))) {
+			ND("not in netmap mode!");
+			goto cleanup;
+		}
+
+		/* there is at least one either unicast or broadcast packet */
+		brd_next = brddst->bq_head;
+		next = d->bq_head;
+		/* we need to reserve this many slots. If fewer are
+		 * available, some packets will be dropped.
+		 * Packets may have multiple fragments, so there is a chance
+		 * that we may not use all of the slots we have claimed,
+		 * and we will need to handle the leftover ones when we
+		 * regain the lock.
+		 */
+		needed = d->bq_len + brddst->bq_len;
+
+		if (unlikely(dst_na->virt_hdr_len != na->virt_hdr_len)) {
+			/* There is a virtio-net header/offloadings mismatch between
+			 * source and destination.
The slower mismatch datapath will + * be used to cope with all the mismatches. + */ + virt_hdr_mismatch = 1; + if (dst_na->mfs < na->mfs) { + /* We may need to do segmentation offloadings, and so + * we may need a number of destination slots greater + * than the number of input slots ('needed'). + * We look for the smallest integer 'x' which satisfies: + * needed * na->mfs + x * H <= x * na->mfs + * where 'H' is the length of the longest header that may + * be replicated in the segmentation process (e.g. for + * TCPv4 we must account for ethernet header, IP header + * and TCPv4 header). + */ + needed = (needed * na->mfs) / + (dst_na->mfs - WORST_CASE_GSO_HEADER) + 1; + ND(3, "srcmtu=%u, dstmtu=%u, x=%u", na->mfs, dst_na->mfs, needed); + } + } + + ND(5, "pass 2 dst %d is %x %s", + i, d_i, is_vp ? "virtual" : "nic/host"); + dst_nr = d_i & (NM_BDG_MAXRINGS-1); + nrings = dst_na->up.num_rx_rings; + if (dst_nr >= nrings) + dst_nr = dst_nr % nrings; + kring = &dst_na->up.rx_rings[dst_nr]; + ring = kring->ring; + lim = kring->nkr_num_slots - 1; + +retry: + + if (dst_na->retry && retry) { + /* try to get some free slot from the previous run */ + dst_na->up.nm_notify(&dst_na->up, dst_nr, NR_RX, 0); + } + /* reserve the buffers in the queue and an entry + * to report completion, and drop lock. + * XXX this might become a helper function. + */ + mtx_lock(&kring->q_lock); + if (kring->nkr_stopped) { + mtx_unlock(&kring->q_lock); + goto cleanup; + } + my_start = j = kring->nkr_hwlease; + howmany = nm_kr_space(kring, 1); + if (needed < howmany) + howmany = needed; + lease_idx = nm_kr_lease(kring, howmany, 1); + mtx_unlock(&kring->q_lock); + + /* only retry if we need more than available slots */ + if (retry && needed <= howmany) + retry = 0; + + /* copy to the destination queue */ + while (howmany > 0) { + struct netmap_slot *slot; + struct nm_bdg_fwd *ft_p, *ft_end; + u_int cnt; + + /* find the queue from which we pick next packet. + * NM_FT_NULL is always higher than valid indexes + * so we never dereference it if the other list + * has packets (and if both are empty we never + * get here). + */ + if (next < brd_next) { + ft_p = ft + next; + next = ft_p->ft_next; + } else { /* insert broadcast */ + ft_p = ft + brd_next; + brd_next = ft_p->ft_next; + } + cnt = ft_p->ft_frags; // cnt > 0 + if (unlikely(cnt > howmany)) + break; /* no more space */ + if (netmap_verbose && cnt > 1) + RD(5, "rx %d frags to %d", cnt, j); + ft_end = ft_p + cnt; + if (unlikely(virt_hdr_mismatch)) { + bdg_mismatch_datapath(na, dst_na, ft_p, ring, &j, lim, &howmany); + } else { + howmany -= cnt; + do { + char *dst, *src = ft_p->ft_buf; + size_t copy_len = ft_p->ft_len, dst_len = copy_len; + + slot = &ring->slot[j]; + dst = BDG_NMB(&dst_na->up, slot); + + ND("send [%d] %d(%d) bytes at %s:%d", + i, (int)copy_len, (int)dst_len, + NM_IFPNAME(dst_ifp), j); + /* round to a multiple of 64 */ + copy_len = (copy_len + 63) & ~63; + + if (ft_p->ft_flags & NS_INDIRECT) { + if (copyin(src, dst, copy_len)) { + // invalid user pointer, pretend len is 0 + dst_len = 0; + } + } else { + //memcpy(dst, src, copy_len); + pkt_copy(src, dst, (int)copy_len); + } + slot->len = dst_len; + slot->flags = (cnt << 8)| NS_MOREFRAG; + j = nm_next(j, lim); + needed--; + ft_p++; + } while (ft_p != ft_end); + slot->flags = (cnt << 8); /* clear flag on last entry */ + } + /* are we done ? 
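+			 * (i.e. both the unicast and the broadcast
+			 * lists are exhausted)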
 */
+			if (next == NM_FT_NULL && brd_next == NM_FT_NULL)
+				break;
+		}
+		{
+		    /* current position */
+		    uint32_t *p = kring->nkr_leases; /* shorthand */
+		    uint32_t update_pos;
+		    int still_locked = 1;
+
+		    mtx_lock(&kring->q_lock);
+		    if (unlikely(howmany > 0)) {
+			/* we have not used all the bufs. If I am the last one
+			 * I can recover the slots, otherwise I must
+			 * fill them with 0 to mark empty packets.
+			 */
+			ND("leftover %d bufs", howmany);
+			if (nm_next(lease_idx, lim) == kring->nkr_lease_idx) {
+			    /* yes, I am the last one */
+			    ND("roll back nkr_hwlease to %d", j);
+			    kring->nkr_hwlease = j;
+			} else {
+			    while (howmany-- > 0) {
+				ring->slot[j].len = 0;
+				ring->slot[j].flags = 0;
+				j = nm_next(j, lim);
+			    }
+			}
+		    }
+		    p[lease_idx] = j; /* report I am done */
+
+		    update_pos = kring->nr_hwtail;
+
+		    if (my_start == update_pos) {
+			/* all slots before my_start have been reported,
+			 * so scan subsequent leases to see if other ranges
+			 * have been completed, and do a selwakeup or txsync.
+			 */
+			while (lease_idx != kring->nkr_lease_idx &&
+			    p[lease_idx] != NR_NOSLOT) {
+			    j = p[lease_idx];
+			    p[lease_idx] = NR_NOSLOT;
+			    lease_idx = nm_next(lease_idx, lim);
+			}
+			/* j is the new 'write' position. j != my_start
+			 * means there are new buffers to report
+			 */
+			if (likely(j != my_start)) {
+			    kring->nr_hwtail = j;
+			    still_locked = 0;
+			    mtx_unlock(&kring->q_lock);
+			    dst_na->up.nm_notify(&dst_na->up, dst_nr, NR_RX, 0);
+			    if (dst_na->retry && retry--)
+				goto retry;
+			}
+		    }
+		    if (still_locked)
+			mtx_unlock(&kring->q_lock);
+		}
+cleanup:
+		d->bq_head = d->bq_tail = NM_FT_NULL; /* cleanup */
+		d->bq_len = 0;
+	}
+	brddst->bq_head = brddst->bq_tail = NM_FT_NULL; /* cleanup */
+	brddst->bq_len = 0;
+	return 0;
+}
+
+
+static int
+netmap_vp_txsync(struct netmap_vp_adapter *na, u_int ring_nr, int flags)
+{
+	struct netmap_kring *kring = &na->up.tx_rings[ring_nr];
+	u_int done;
+	u_int const lim = kring->nkr_num_slots - 1;
+	u_int const cur = kring->rcur;
+
+	if (bridge_batch <= 0) { /* testing only */
+		done = cur; // used all
+		goto done;
+	}
+	if (bridge_batch > NM_BDG_BATCH)
+		bridge_batch = NM_BDG_BATCH;
+
+	done = nm_bdg_preflush(na, ring_nr, kring, cur);
+done:
+	if (done != cur)
+		D("early break at %d / %d, tail %d", done, cur, kring->nr_hwtail);
+	/*
+	 * packets between 'done' and 'cur' are left unsent.
+	 */
+	kring->nr_hwcur = done;
+	kring->nr_hwtail = nm_prev(done, lim);
+	nm_txsync_finalize(kring);
+	if (netmap_verbose)
+		D("%s ring %d flags %d", NM_IFPNAME(na->up.ifp), ring_nr, flags);
+	return 0;
+}
+
+
+/*
+ * main dispatch routine for the bridge.
+ * We already know that only one thread is running this.
+ * We must run nm_bdg_preflush without the lock.
+ */
+static int
+bdg_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
+{
+	struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter*)na;
+	return netmap_vp_txsync(vpna, ring_nr, flags);
+}
+
+static int
+netmap_vp_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
+{
+	struct netmap_kring *kring = &na->rx_rings[ring_nr];
+	struct netmap_ring *ring = kring->ring;
+	u_int nm_i, lim = kring->nkr_num_slots - 1;
+	u_int head = nm_rxsync_prologue(kring);
+	int n;
+
+	if (head > lim) {
+		D("ouch dangerous reset!!!");
+		n = netmap_ring_reinit(kring);
+		goto done;
+	}
+
+	/* First part, import newly received packets. */
+	/* actually nothing to do here, they are already in the kring */
+
+	/* Second part, skip past packets that userspace has released.
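+	 * (i.e. make their buffers available again to the switch
+	 * by advancing nr_hwcur up to 'head')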
*/ + nm_i = kring->nr_hwcur; + if (nm_i != head) { + /* consistency check, but nothing really important here */ + for (n = 0; likely(nm_i != head); n++) { + struct netmap_slot *slot = &ring->slot[nm_i]; + void *addr = BDG_NMB(na, slot); + + if (addr == netmap_buffer_base) { /* bad buf */ + D("bad buffer index %d, ignore ?", + slot->buf_idx); + } + slot->flags &= ~NS_BUF_CHANGED; + nm_i = nm_next(nm_i, lim); + } + kring->nr_hwcur = head; + } + + /* tell userspace that there are new packets */ + nm_rxsync_finalize(kring); + n = 0; +done: + return n; +} + +/* + * user process reading from a VALE switch. + * Already protected against concurrent calls from userspace, + * but we must acquire the queue's lock to protect against + * writers on the same queue. + */ +static int +bdg_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) +{ + struct netmap_kring *kring = &na->rx_rings[ring_nr]; + int n; + + mtx_lock(&kring->q_lock); + n = netmap_vp_rxsync(na, ring_nr, flags); + mtx_unlock(&kring->q_lock); + return n; +} + + +static int +bdg_netmap_attach(struct nmreq *nmr, struct ifnet *ifp) +{ + struct netmap_vp_adapter *vpna; + struct netmap_adapter *na; + int error; + u_int npipes = 0; + + vpna = malloc(sizeof(*vpna), M_DEVBUF, M_NOWAIT | M_ZERO); + if (vpna == NULL) + return ENOMEM; + + na = &vpna->up; + + na->ifp = ifp; + + /* bound checking */ + na->num_tx_rings = nmr->nr_tx_rings; + nm_bound_var(&na->num_tx_rings, 1, 1, NM_BDG_MAXRINGS, NULL); + nmr->nr_tx_rings = na->num_tx_rings; // write back + na->num_rx_rings = nmr->nr_rx_rings; + nm_bound_var(&na->num_rx_rings, 1, 1, NM_BDG_MAXRINGS, NULL); + nmr->nr_rx_rings = na->num_rx_rings; // write back + nm_bound_var(&nmr->nr_tx_slots, NM_BRIDGE_RINGSIZE, + 1, NM_BDG_MAXSLOTS, NULL); + na->num_tx_desc = nmr->nr_tx_slots; + nm_bound_var(&nmr->nr_rx_slots, NM_BRIDGE_RINGSIZE, + 1, NM_BDG_MAXSLOTS, NULL); + /* validate number of pipes. We want at least 1, + * but probably can do with some more. + * So let's use 2 as default (when 0 is supplied) + */ + npipes = nmr->nr_arg1; + nm_bound_var(&npipes, 2, 1, NM_MAXPIPES, NULL); + nmr->nr_arg1 = npipes; /* write back */ + /* validate extra bufs */ + nm_bound_var(&nmr->nr_arg3, 0, 0, + 128*NM_BDG_MAXSLOTS, NULL); + na->num_rx_desc = nmr->nr_rx_slots; + vpna->virt_hdr_len = 0; + vpna->mfs = 1514; + /*if (vpna->mfs > netmap_buf_size) TODO netmap_buf_size is zero?? 
+ vpna->mfs = netmap_buf_size; */ + if (netmap_verbose) + D("max frame size %u", vpna->mfs); + + na->na_flags |= NAF_BDG_MAYSLEEP | NAF_MEM_OWNER; + na->nm_txsync = bdg_netmap_txsync; + na->nm_rxsync = bdg_netmap_rxsync; + na->nm_register = bdg_netmap_reg; + na->nm_dtor = netmap_adapter_vp_dtor; + na->nm_krings_create = netmap_vp_krings_create; + na->nm_krings_delete = netmap_vp_krings_delete; + na->nm_mem = netmap_mem_private_new(NM_IFPNAME(na->ifp), + na->num_tx_rings, na->num_tx_desc, + na->num_rx_rings, na->num_rx_desc, + nmr->nr_arg3, npipes, &error); + if (na->nm_mem == NULL) + goto err; + /* other nmd fields are set in the common routine */ + error = netmap_attach_common(na); + if (error) + goto err; + return 0; + +err: + if (na->nm_mem != NULL) + netmap_mem_private_delete(na->nm_mem); + free(vpna, M_DEVBUF); + return error; +} + + +static void +netmap_bwrap_dtor(struct netmap_adapter *na) +{ + struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter*)na; + struct netmap_adapter *hwna = bna->hwna; + struct nm_bridge *b = bna->up.na_bdg, + *bh = bna->host.na_bdg; + struct ifnet *ifp = na->ifp; + + ND("na %p", na); + + if (b) { + netmap_bdg_detach_common(b, bna->up.bdg_port, + (bh ? bna->host.bdg_port : -1)); + } + + hwna->na_private = NULL; + netmap_adapter_put(hwna); + + bzero(ifp, sizeof(*ifp)); + free(ifp, M_DEVBUF); + na->ifp = NULL; + +} + + +/* + * Intr callback for NICs connected to a bridge. + * Simply ignore tx interrupts (maybe we could try to recover space ?) + * and pass received packets from nic to the bridge. + * + * XXX TODO check locking: this is called from the interrupt + * handler so we should make sure that the interface is not + * disconnected while passing down an interrupt. + * + * Note, no user process can access this NIC or the host stack. + * The only part of the ring that is significant are the slots, + * and head/cur/tail are set from the kring as needed + * (part as a receive ring, part as a transmit ring). + * + * callback that overwrites the hwna notify callback. + * Packets come from the outside or from the host stack and are put on an hwna rx ring. + * The bridge wrapper then sends the packets through the bridge. + */ +static int +netmap_bwrap_intr_notify(struct netmap_adapter *na, u_int ring_nr, enum txrx tx, int flags) +{ + struct ifnet *ifp = na->ifp; + struct netmap_bwrap_adapter *bna = na->na_private; + struct netmap_vp_adapter *hostna = &bna->host; + struct netmap_kring *kring, *bkring; + struct netmap_ring *ring; + int is_host_ring = ring_nr == na->num_rx_rings; + struct netmap_vp_adapter *vpna = &bna->up; + int error = 0; + + if (netmap_verbose) + D("%s %s%d 0x%x", NM_IFPNAME(ifp), + (tx == NR_TX ? "TX" : "RX"), ring_nr, flags); + + if (flags & NAF_DISABLE_NOTIFY) { + kring = tx == NR_TX ? na->tx_rings : na->rx_rings; + bkring = tx == NR_TX ? 
vpna->up.rx_rings : vpna->up.tx_rings; + if (kring[ring_nr].nkr_stopped) + netmap_disable_ring(&bkring[ring_nr]); + else + bkring[ring_nr].nkr_stopped = 0; + return 0; + } + + if (ifp == NULL || !(ifp->if_capenable & IFCAP_NETMAP)) + return 0; + + /* we only care about receive interrupts */ + if (tx == NR_TX) + return 0; + + kring = &na->rx_rings[ring_nr]; + ring = kring->ring; + + /* make sure the ring is not disabled */ + if (nm_kr_tryget(kring)) + return 0; + + if (is_host_ring && hostna->na_bdg == NULL) { + error = bna->save_notify(na, ring_nr, tx, flags); + goto put_out; + } + + /* Here we expect ring->head = ring->cur = ring->tail + * because everything has been released from the previous round. + * However the ring is shared and we might have info from + * the wrong side (the tx ring). Hence we overwrite with + * the info from the rx kring. + */ + if (netmap_verbose) + D("%s head %d cur %d tail %d (kring %d %d %d)", NM_IFPNAME(ifp), + ring->head, ring->cur, ring->tail, + kring->rhead, kring->rcur, kring->rtail); + + ring->head = kring->rhead; + ring->cur = kring->rcur; + ring->tail = kring->rtail; + + if (is_host_ring) { + vpna = hostna; + ring_nr = 0; + } + /* simulate a user wakeup on the rx ring */ + /* fetch packets that have arrived. + * XXX maybe do this in a loop ? + */ + error = kring->nm_sync(kring, 0); + if (error) + goto put_out; + if (kring->nr_hwcur == kring->nr_hwtail && netmap_verbose) { + D("how strange, interrupt with no packets on %s", + NM_IFPNAME(ifp)); + goto put_out; + } + + /* new packets are ring->cur to ring->tail, and the bkring + * had hwcur == ring->cur. So advance ring->cur to ring->tail + * to push all packets out. + */ + ring->head = ring->cur = ring->tail; + + /* also set tail to what the bwrap expects */ + bkring = &vpna->up.tx_rings[ring_nr]; + ring->tail = bkring->nr_hwtail; // rtail too ? + + /* pass packets to the switch */ + nm_txsync_prologue(bkring); // XXX error checking ? + netmap_vp_txsync(vpna, ring_nr, flags); + + /* mark all buffers as released on this ring */ + ring->head = ring->cur = kring->nr_hwtail; + ring->tail = kring->rtail; + /* another call to actually release the buffers */ + if (!is_host_ring) { + error = kring->nm_sync(kring, 0); + } else { + /* mark all packets as released, as in the + * second part of netmap_rxsync_from_host() + */ + kring->nr_hwcur = kring->nr_hwtail; + nm_rxsync_finalize(kring); + } + +put_out: + nm_kr_put(kring); + return error; +} + + +static int +netmap_bwrap_register(struct netmap_adapter *na, int onoff) +{ + struct netmap_bwrap_adapter *bna = + (struct netmap_bwrap_adapter *)na; + struct netmap_adapter *hwna = bna->hwna; + struct netmap_vp_adapter *hostna = &bna->host; + int error; + + ND("%s %s", NM_IFPNAME(na->ifp), onoff ? "on" : "off"); + + if (onoff) { + int i; + + hwna->na_lut = na->na_lut; + hwna->na_lut_objtotal = na->na_lut_objtotal; + + if (hostna->na_bdg) { + hostna->up.na_lut = na->na_lut; + hostna->up.na_lut_objtotal = na->na_lut_objtotal; + } + + /* cross-link the netmap rings + * The original number of rings comes from hwna, + * rx rings on one side equals tx rings on the other. 
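+	 * After the loops below the two adapters share the same
+	 * netmap_ring structures (and hence slots and buffers).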
+	 */
+		for (i = 0; i < na->num_rx_rings + 1; i++) {
+			hwna->tx_rings[i].nkr_num_slots = na->rx_rings[i].nkr_num_slots;
+			hwna->tx_rings[i].ring = na->rx_rings[i].ring;
+		}
+		for (i = 0; i < na->num_tx_rings + 1; i++) {
+			hwna->rx_rings[i].nkr_num_slots = na->tx_rings[i].nkr_num_slots;
+			hwna->rx_rings[i].ring = na->tx_rings[i].ring;
+		}
+	}
+
+	if (hwna->ifp) {
+		error = hwna->nm_register(hwna, onoff);
+		if (error)
+			return error;
+	}
+
+	bdg_netmap_reg(na, onoff);
+
+	if (onoff) {
+		bna->save_notify = hwna->nm_notify;
+		hwna->nm_notify = netmap_bwrap_intr_notify;
+	} else {
+		hwna->nm_notify = bna->save_notify;
+		hwna->na_lut = NULL;
+		hwna->na_lut_objtotal = 0;
+	}
+
+	return 0;
+}
+
+
+static int
+netmap_bwrap_config(struct netmap_adapter *na, u_int *txr, u_int *txd,
+				    u_int *rxr, u_int *rxd)
+{
+	struct netmap_bwrap_adapter *bna =
+		(struct netmap_bwrap_adapter *)na;
+	struct netmap_adapter *hwna = bna->hwna;
+
+	/* forward the request */
+	netmap_update_config(hwna);
+	/* swap the results */
+	*txr = hwna->num_rx_rings;
+	*txd = hwna->num_rx_desc;
+	*rxr = hwna->num_tx_rings;
+	*rxd = hwna->num_tx_desc;
+
+	return 0;
+}
+
+
+static int
+netmap_bwrap_krings_create(struct netmap_adapter *na)
+{
+	struct netmap_bwrap_adapter *bna =
+		(struct netmap_bwrap_adapter *)na;
+	struct netmap_adapter *hwna = bna->hwna;
+	struct netmap_adapter *hostna = &bna->host.up;
+	int error;
+
+	ND("%s", NM_IFPNAME(na->ifp));
+
+	error = netmap_vp_krings_create(na);
+	if (error)
+		return error;
+
+	error = hwna->nm_krings_create(hwna);
+	if (error) {
+		netmap_vp_krings_delete(na);
+		return error;
+	}
+
+	if (na->na_flags & NAF_HOST_RINGS) {
+		hostna->tx_rings = na->tx_rings + na->num_tx_rings;
+		hostna->rx_rings = na->rx_rings + na->num_rx_rings;
+	}
+
+	return 0;
+}
+
+
+static void
+netmap_bwrap_krings_delete(struct netmap_adapter *na)
+{
+	struct netmap_bwrap_adapter *bna =
+		(struct netmap_bwrap_adapter *)na;
+	struct netmap_adapter *hwna = bna->hwna;
+
+	ND("%s", NM_IFPNAME(na->ifp));
+
+	hwna->nm_krings_delete(hwna);
+	netmap_vp_krings_delete(na);
+}
+
+
+/* notify method for the bridge-->hwna direction */
+static int
+netmap_bwrap_notify(struct netmap_adapter *na, u_int ring_n, enum txrx tx, int flags)
+{
+	struct netmap_bwrap_adapter *bna =
+		(struct netmap_bwrap_adapter *)na;
+	struct netmap_adapter *hwna = bna->hwna;
+	struct netmap_kring *kring, *hw_kring;
+	struct netmap_ring *ring;
+	u_int lim;
+	int error = 0;
+
+	if (tx == NR_TX)
+		return EINVAL;
+
+	kring = &na->rx_rings[ring_n];
+	hw_kring = &hwna->tx_rings[ring_n];
+	ring = kring->ring;
+	lim = kring->nkr_num_slots - 1;
+
+	if (hwna->ifp == NULL || !(hwna->ifp->if_capenable & IFCAP_NETMAP))
+		return 0;
+	mtx_lock(&kring->q_lock);
+	/* first step: simulate a user wakeup on the rx ring */
+	netmap_vp_rxsync(na, ring_n, flags);
+	ND("%s[%d] PRE rx(c%3d t%3d l%3d) ring(h%3d c%3d t%3d) tx(c%3d ht%3d t%3d)",
+		NM_IFPNAME(na->ifp), ring_n,
+		kring->nr_hwcur, kring->nr_hwtail, kring->nkr_hwlease,
+		ring->head, ring->cur, ring->tail,
+		hw_kring->nr_hwcur, hw_kring->nr_hwtail, hw_kring->rtail);
+	/* second step: the simulated user consumes all new packets */
+	ring->head = ring->cur = ring->tail;
+
+	/* third step: the new packets are sent on the tx ring
+	 * (which is actually the same ring)
+	 */
+	/* set tail to what the hw expects */
+	ring->tail = hw_kring->rtail;
+	nm_txsync_prologue(&hwna->tx_rings[ring_n]);	// XXX error checking ?
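+	/* hand the new slots to the underlying device's own txsync */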
+ error = hw_kring->nm_sync(hw_kring, flags); + + /* fourth step: now we are back the rx ring */ + /* claim ownership on all hw owned bufs */ + ring->head = nm_next(ring->tail, lim); /* skip past reserved slot */ + ring->tail = kring->rtail; /* restore saved value of tail, for safety */ + + /* fifth step: the user goes to sleep again, causing another rxsync */ + netmap_vp_rxsync(na, ring_n, flags); + ND("%s[%d] PST rx(c%3d t%3d l%3d) ring(h%3d c%3d t%3d) tx(c%3d ht%3d t%3d)", + NM_IFPNAME(na->ifp), ring_n, + kring->nr_hwcur, kring->nr_hwtail, kring->nkr_hwlease, + ring->head, ring->cur, ring->tail, + hw_kring->nr_hwcur, hw_kring->nr_hwtail, hw_kring->rtail); + mtx_unlock(&kring->q_lock); + return error; +} + + +static int +netmap_bwrap_host_notify(struct netmap_adapter *na, u_int ring_n, enum txrx tx, int flags) +{ + struct netmap_bwrap_adapter *bna = na->na_private; + struct netmap_adapter *port_na = &bna->up.up; + if (tx == NR_TX || ring_n != 0) + return EINVAL; + return netmap_bwrap_notify(port_na, port_na->num_rx_rings, NR_RX, flags); +} + + +/* attach a bridge wrapper to the 'real' device */ +static int +netmap_bwrap_attach(struct ifnet *fake, struct ifnet *real) +{ + struct netmap_bwrap_adapter *bna; + struct netmap_adapter *na; + struct netmap_adapter *hwna = NA(real); + struct netmap_adapter *hostna; + int error; + + + bna = malloc(sizeof(*bna), M_DEVBUF, M_NOWAIT | M_ZERO); + if (bna == NULL) + return ENOMEM; + + na = &bna->up.up; + na->ifp = fake; + /* fill the ring data for the bwrap adapter with rx/tx meanings + * swapped. The real cross-linking will be done during register, + * when all the krings will have been created. + */ + na->num_rx_rings = hwna->num_tx_rings; + na->num_tx_rings = hwna->num_rx_rings; + na->num_tx_desc = hwna->num_rx_desc; + na->num_rx_desc = hwna->num_tx_desc; + na->nm_dtor = netmap_bwrap_dtor; + na->nm_register = netmap_bwrap_register; + // na->nm_txsync = netmap_bwrap_txsync; + // na->nm_rxsync = netmap_bwrap_rxsync; + na->nm_config = netmap_bwrap_config; + na->nm_krings_create = netmap_bwrap_krings_create; + na->nm_krings_delete = netmap_bwrap_krings_delete; + na->nm_notify = netmap_bwrap_notify; + na->nm_mem = hwna->nm_mem; + na->na_private = na; /* prevent NIOCREGIF */ + bna->up.retry = 1; /* XXX maybe this should depend on the hwna */ + + bna->hwna = hwna; + netmap_adapter_get(hwna); + hwna->na_private = bna; /* weak reference */ + + if (hwna->na_flags & NAF_HOST_RINGS) { + na->na_flags |= NAF_HOST_RINGS; + hostna = &bna->host.up; + hostna->ifp = hwna->ifp; + hostna->num_tx_rings = 1; + hostna->num_tx_desc = hwna->num_rx_desc; + hostna->num_rx_rings = 1; + hostna->num_rx_desc = hwna->num_tx_desc; + // hostna->nm_txsync = netmap_bwrap_host_txsync; + // hostna->nm_rxsync = netmap_bwrap_host_rxsync; + hostna->nm_notify = netmap_bwrap_host_notify; + hostna->nm_mem = na->nm_mem; + hostna->na_private = bna; + } + + ND("%s<->%s txr %d txd %d rxr %d rxd %d", + fake->if_xname, real->if_xname, + na->num_tx_rings, na->num_tx_desc, + na->num_rx_rings, na->num_rx_desc); + + error = netmap_attach_common(na); + if (error) { + netmap_adapter_put(hwna); + free(bna, M_DEVBUF); + return error; + } + return 0; +} + + +void +netmap_init_bridges(void) +{ + int i; + bzero(nm_bridges, sizeof(struct nm_bridge) * NM_BRIDGES); /* safety */ + for (i = 0; i < NM_BRIDGES; i++) + BDG_RWINIT(&nm_bridges[i]); +} +#endif /* WITH_VALE */ Property changes on: stable/9/sys/dev/netmap/netmap_vale.c ___________________________________________________________________ Added: svn:keywords 
## -0,0 +1 ##
+FreeBSD=%H
\ No newline at end of property
Index: stable/9/sys/modules/netmap/Makefile
===================================================================
--- stable/9/sys/modules/netmap/Makefile	(revision 262152)
+++ stable/9/sys/modules/netmap/Makefile	(revision 262153)
@@ -1,14 +1,20 @@
 # $FreeBSD$
 #
 # Compile netmap as a module, useful if you want a netmap bridge
 # or loadable drivers.
 
 .PATH: ${.CURDIR}/../../dev/netmap
 .PATH.h: ${.CURDIR}/../../net
+CFLAGS += -I${.CURDIR}/../../
 KMOD	= netmap
 SRCS	= device_if.h bus_if.h opt_netmap.h
 SRCS	+= netmap.c netmap.h netmap_kern.h
-
-netmap.o: netmap_mem2.c
+SRCS	+= netmap_mem2.c netmap_mem2.h
+SRCS	+= netmap_generic.c
+SRCS	+= netmap_mbq.c netmap_mbq.h
+SRCS	+= netmap_vale.c
+SRCS	+= netmap_freebsd.c
+SRCS	+= netmap_offloadings.c
+SRCS	+= netmap_pipe.c
 
 .include <bsd.kmod.mk>
Index: stable/9/sys/net/netmap.h
===================================================================
--- stable/9/sys/net/netmap.h	(revision 262152)
+++ stable/9/sys/net/netmap.h	(revision 262153)
@@ -1,372 +1,550 @@
 /*
- * Copyright (C) 2011-2013 Matteo Landi, Luigi Rizzo. All rights reserved.
- *
+ * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved.
+ *
  * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are
- * met:
- *
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
- *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the
- *    distribution.
- *
- * 3. Neither the name of the authors nor the names of their contributors
- *    may be used to endorse or promote products derived from this
- *    software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY MATTEO LANDI AND CONTRIBUTORS "AS IS" AND
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL MATTEO LANDI OR CONTRIBUTORS
- * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
- * THE POSSIBILITY OF SUCH DAMAGE.
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. */ /* * $FreeBSD$ * * Definitions of constants and the structures used by the netmap * framework, for the part visible to both kernel and userspace. * Detailed info on netmap is available with "man netmap" or at - * + * * http://info.iet.unipi.it/~luigi/netmap/ * * This API is also used to communicate with the VALE software switch */ #ifndef _NET_NETMAP_H_ #define _NET_NETMAP_H_ +#define NETMAP_API 11 /* current API version */ + +#define NETMAP_MIN_API 11 /* min and max versions accepted */ +#define NETMAP_MAX_API 15 /* + * Some fields should be cache-aligned to reduce contention. + * The alignment is architecture and OS dependent, but rather than + * digging into OS headers to find the exact value we use an estimate + * that should cover most architectures. + */ +#define NM_CACHE_ALIGN 128 + +/* * --- Netmap data structures --- * * The userspace data structures used by netmap are shown below. * They are allocated by the kernel and mmap()ed by userspace threads. * Pointers are implemented as memory offsets or indexes, * so that they can be easily dereferenced in kernel and userspace. KERNEL (opaque, obviously) ==================================================================== | USERSPACE | struct netmap_ring - +---->+--------------+ - / | cur | - struct netmap_if (nifp, 1 per fd) / | avail | - +---------------+ / | buf_ofs | - | ni_tx_rings | / +==============+ - | ni_rx_rings | / | buf_idx, len | slot[0] - | | / | flags, ptr | - | | / +--------------+ - +===============+ / | buf_idx, len | slot[1] - | txring_ofs[0] | (rel.to nifp)--' | flags, ptr | - | txring_ofs[1] | +--------------+ - (ni_tx_rings+1 entries) (num_slots entries) - | txring_ofs[t] | | buf_idx, len | slot[n-1] - +---------------+ | flags, ptr | - | rxring_ofs[0] | +--------------+ + +---->+---------------+ + / | head,cur,tail | + struct netmap_if (nifp, 1 per fd) / | buf_ofs | + +---------------+ / | other fields | + | ni_tx_rings | / +===============+ + | ni_rx_rings | / | buf_idx, len | slot[0] + | | / | flags, ptr | + | | / +---------------+ + +===============+ / | buf_idx, len | slot[1] + | txring_ofs[0] | (rel.to nifp)--' | flags, ptr | + | txring_ofs[1] | +---------------+ + (tx+1 entries) (num_slots entries) + | txring_ofs[t] | | buf_idx, len | slot[n-1] + +---------------+ | flags, ptr | + | rxring_ofs[0] | +---------------+ | rxring_ofs[1] | - (ni_rx_rings+1 entries) + (rx+1 entries) | rxring_ofs[r] | +---------------+ - * For each "interface" (NIC, host stack, VALE switch port) attached to a - * file descriptor, the mmap()ed region contains a (logically readonly) + * For each "interface" (NIC, host stack, PIPE, VALE switch port) bound to + * a file descriptor, the mmap()ed region contains a (logically readonly) * struct netmap_if pointing to struct netmap_ring's. + * * There is one netmap_ring per physical NIC ring, plus one tx/rx ring - * pair attached to the host stack (this pair is unused for VALE ports). + * pair attached to the host stack (this pair is unused for non-NIC ports). 
 *
 * All physical/host stack ports share the same memory region,
 * so that zero-copy can be implemented between them.
 * VALE switch ports instead have separate memory regions.
 *
 * The netmap_ring is the userspace-visible replica of the NIC ring.
 * Each slot has the index of a buffer (MTU-sized and residing in the
 * mmapped region), its length and some flags. An extra 64-bit pointer
 * is provided for user-supplied buffers in the tx path.
 *
 * In user space, the buffer address is computed as
- *  (char *)ring + buf_ofs + index*NETMAP_BUF_SIZE
- */
-
-/*
- * struct netmap_slot is a buffer descriptor
+ *  (char *)ring + buf_ofs + index * NETMAP_BUF_SIZE
 *
- * buf_idx	the index of the buffer associated to the slot.
- * len		the length of the payload
- * flags	control operation on the slot, as defined below
+ * Added in NETMAP_API 11:
 *
- * NS_BUF_CHANGED	must be set whenever userspace wants
- *		to change buf_idx (it might be necessary to
- *		reprogram the NIC)
+ * + NIOCREGIF can request the allocation of extra spare buffers from
+ *   the same memory pool. The desired number of buffers must be in
+ *   nr_arg3. The ioctl may return fewer buffers, depending on memory
+ *   availability. nr_arg3 will return the actual value, and, once
+ *   mapped, nifp->ni_bufs_head will be the index of the first buffer.
 *
- * NS_REPORT	must be set if we want the NIC to generate an interrupt
- *		when this slot is used. Leaving it to 0 improves
- *		performance.
+ *   The buffers are linked to each other using the first uint32_t
+ *   as the index. On close, ni_bufs_head must point to the list of
+ *   buffers to be released.
 *
- * NS_FORWARD	if set on a receive ring, and the device is in
- *		transparent mode, buffers released with the flag set
- *		will be forwarded to the 'other' side (host stack
- *		or NIC, respectively) on the next select() or ioctl()
+ * + NIOCREGIF can request space for extra rings (and buffers)
+ *   allocated in the same memory space. The number of extra rings
+ *   is in nr_arg1, and is advisory. This is a no-op on NICs where
+ *   the size of the memory space is fixed.
 *
- * NS_NO_LEARN	on a VALE switch, do not 'learn' the source port for
- *		this packet.
+ * + NIOCREGIF can attach to PIPE rings sharing the same memory
+ *   space with a parent device. The ifname indicates the parent device,
+ *   which must already exist. Flags in nr_flags indicate if we want to
+ *   bind the master or slave side, the index (from nr_ringid)
+ *   is just a cookie and does not need to be sequential.
 *
- * NS_INDIRECT	(tx rings only) data is in a userspace buffer pointed
- *		by the ptr field in the slot.
+ * + NIOCREGIF can also attach to 'monitor' rings that replicate
+ *   the content of specific rings, also from the same memory space.
 *
- * NS_MOREFRAG	Part of a multi-segment frame. The last (or only)
- *		segment must not have this flag.
- *		Only supported on VALE ports.
- *
- * NS_PORT_MASK	the high 8 bits of the flag, if not zero, indicate the
- *		destination port for the VALE switch, overriding
- *		the lookup table.
+ * Extra flags in nr_flags support the above functions.
+ * Application libraries may use the following naming scheme: + * netmap:foo all NIC ring pairs + * netmap:foo^ only host ring pair + * netmap:foo+ all NIC ring + host ring pairs + * netmap:foo-k the k-th NIC ring pair + * netmap:foo{k PIPE ring pair k, master side + * netmap:foo}k PIPE ring pair k, slave side */ +/* + * struct netmap_slot is a buffer descriptor + */ struct netmap_slot { uint32_t buf_idx; /* buffer index */ - uint16_t len; /* packet length */ + uint16_t len; /* length for this slot */ uint16_t flags; /* buf changed, etc. */ + uint64_t ptr; /* pointer for indirect buffers */ +}; + +/* + * The following flags control how the slot is used + */ + #define NS_BUF_CHANGED 0x0001 /* buf_idx changed */ -#define NS_REPORT 0x0002 /* ask the hardware to report results - * e.g. by generating an interrupt - */ -#define NS_FORWARD 0x0004 /* pass packet to the other endpoint - * (host stack or device) - */ -#define NS_NO_LEARN 0x0008 -#define NS_INDIRECT 0x0010 -#define NS_MOREFRAG 0x0020 + /* + * must be set whenever buf_idx is changed (as it might be + * necessary to recompute the physical address and mapping) + */ + +#define NS_REPORT 0x0002 /* ask the hardware to report results */ + /* + * Request notification when slot is used by the hardware. + * Normally transmit completions are handled lazily and + * may be unreported. This flag lets us know when a slot + * has been sent (e.g. to terminate the sender). + */ + +#define NS_FORWARD 0x0004 /* pass packet 'forward' */ + /* + * (Only for physical ports, rx rings with NR_FORWARD set). + * Slot released to the kernel (i.e. before ring->head) with + * this flag set are passed to the peer ring (host/NIC), + * thus restoring the host-NIC connection for these slots. + * This supports efficient traffic monitoring or firewalling. + */ + +#define NS_NO_LEARN 0x0008 /* disable bridge learning */ + /* + * On a VALE switch, do not 'learn' the source port for + * this buffer. + */ + +#define NS_INDIRECT 0x0010 /* userspace buffer */ + /* + * (VALE tx rings only) data is in a userspace buffer, + * whose address is in the 'ptr' field in the slot. + */ + +#define NS_MOREFRAG 0x0020 /* packet has more fragments */ + /* + * (VALE ports only) + * Set on all but the last slot of a multi-segment packet. + * The 'len' field refers to the individual fragment. + */ + #define NS_PORT_SHIFT 8 #define NS_PORT_MASK (0xff << NS_PORT_SHIFT) - /* - * in rx rings, the high 8 bits - * are the number of fragments. - */ + /* + * The high 8 bits of the flag, if not zero, indicate the + * destination port for the VALE switch, overriding + * the lookup table. + */ + #define NS_RFRAGS(_slot) ( ((_slot)->flags >> 8) & 0xff) - uint64_t ptr; /* pointer for indirect buffers */ -}; + /* + * (VALE rx rings only) the high 8 bits + * are the number of fragments. + */ + /* * struct netmap_ring * * Netmap representation of a TX or RX ring (also known as "queue"). * This is a queue implemented as a fixed-size circular array. - * At the software level, two fields are important: avail and cur. + * At the software level the important fields are: head, cur, tail. * * In TX rings: * - * avail tells how many slots are available for transmission. - * It is updated by the kernel in each netmap system call. - * It MUST BE decremented by the user when it - * adds a new packet to send. + * head first slot available for transmission. + * cur wakeup point. 
select() and poll() will unblock + * when 'tail' moves past 'cur' + * tail (readonly) first slot reserved to the kernel * - * cur indicates the slot to use for the next packet - * to send (i.e. the "tail" of the queue). - * It MUST BE incremented by the user before - * netmap system calls to reflect the number of newly - * sent packets. - * It is checked by the kernel on netmap system calls - * (normally unmodified by the kernel unless invalid). + * [head .. tail-1] can be used for new packets to send; + * 'head' and 'cur' must be incremented as slots are filled + * with new packets to be sent; + * 'cur' can be moved further ahead if we need more space + * for new transmissions. * * In RX rings: * - * avail is the number of packets available (possibly 0). - * It is updated by the kernel in each netmap system call. - * It MUST BE decremented by the user when it - * consumes a packet. + * head first valid received packet + * cur wakeup point. select() and poll() will unblock + * when 'tail' moves past 'cur' + * tail (readonly) first slot reserved to the kernel * - * cur indicates the first slot that contains a packet not - * yet processed (the "head" of the queue). - * It MUST BE incremented by the user when it consumes - * a packet. + * [head .. tail-1] contain received packets; + * 'head' and 'cur' must be incremented as slots are consumed + * and can be returned to the kernel; + * 'cur' can be moved further ahead if we want to wait for + * new packets without returning the previous ones. * - * reserved indicates the number of buffers before 'cur' - * that the user has not released yet. Normally 0, - * it MUST BE incremented by the user when it - * does not return the buffer immediately, and decremented - * when the buffer is finally freed. - * - * * DATA OWNERSHIP/LOCKING: - * The netmap_ring, all slots, and buffers in the range - * [reserved-cur , cur+avail[ are owned by the user program, - * and the kernel only touches them in the same thread context - * during a system call. - * Other buffers are reserved for use by the NIC's DMA engines. + * The netmap_ring, and all slots and buffers in the range + * [head .. tail-1] are owned by the user program; + * the kernel only accesses them during a netmap system call + * and in the user thread context. * - * FLAGS - * NR_TIMESTAMP updates the 'ts' field on each syscall. This is - * a global timestamp for all packets. - * NR_RX_TSTMP if set, the last 64 byte in each buffer will - * contain a timestamp for the frame supplied by - * the hardware (if supported) - * NR_FORWARD if set, the NS_FORWARD flag in each slot of the - * RX ring is checked, and if set the packet is - * passed to the other side (host stack or device, - * respectively). This permits bpf-like behaviour - * or transparency for selected packets. + * Other slots and buffers are reserved for use by the kernel */ struct netmap_ring { /* * buf_ofs is meant to be used through macros. * It contains the offset of the buffer region from this * descriptor. */ - const ssize_t buf_ofs; + const int64_t buf_ofs; const uint32_t num_slots; /* number of slots in the ring. 
 */
-	uint32_t        avail;		/* number of usable slots */
-	uint32_t        cur;		/* 'current' r/w position */
-	uint32_t	reserved;	/* not refilled before current */
+	const uint32_t	nr_buf_size;
+	const uint16_t	ringid;
+	const uint16_t	dir;		/* 0: tx, 1: rx */
 
-	const uint16_t	nr_buf_size;
-	uint16_t	flags;
-#define	NR_TIMESTAMP	0x0002		/* set timestamp on *sync() */
-#define	NR_FORWARD	0x0004		/* enable NS_FORWARD for ring */
-#define	NR_RX_TSTMP	0x0008		/* set rx timestamp in slots */
+	uint32_t        head;		/* (u) first user slot */
+	uint32_t        cur;		/* (u) wakeup point */
+	uint32_t	tail;		/* (k) first kernel slot */
 
-	struct timeval	ts;		/* time of last *sync() */
+	uint32_t	flags;
+	struct timeval	ts;		/* (k) time of last *sync() */
+
+	/* opaque room for a mutex or similar object */
+	uint8_t		sem[128] __attribute__((__aligned__(NM_CACHE_ALIGN)));
+
 	/* the slots follow. This struct has variable size */
 	struct netmap_slot slot[0];	/* array of slots. */
 };
 
 /*
+ * RING FLAGS
+ */
+#define	NR_TIMESTAMP	0x0002		/* set timestamp on *sync() */
+	/*
+	 * updates the 'ts' field on each netmap syscall. This saves
+	 * a separate gettimeofday(), and is not much worse than
+	 * software timestamps generated in the interrupt handler.
+	 */
+
+#define	NR_FORWARD	0x0004		/* enable NS_FORWARD for ring */
+	/*
+	 * Enables the NS_FORWARD slot flag for the ring.
+	 */
+
+
+/*
 * Netmap representation of an interface and its queue(s).
 * This is initialized by the kernel when binding a file
 * descriptor to a port, and should be considered as readonly
 * by user programs. The kernel never uses it.
 *
 * There is one netmap_if for each file descriptor on which we want
 * to select/poll.
 * select/poll operates on one or all pairs depending on the value of
 * nmr_queueid passed on the ioctl.
 */
struct netmap_if {
	char		ni_name[IFNAMSIZ]; /* name of the interface. */
	const uint32_t	ni_version;	/* API version, currently unused */
	const uint32_t	ni_flags;	/* properties */
#define	NI_PRIV_MEM	0x1		/* private memory region */
-	const uint32_t	ni_rx_rings;	/* number of rx rings */
-	const uint32_t	ni_tx_rings;	/* number of tx rings */
	/*
+	 * The number of packet rings available in netmap mode.
+	 * Physical NICs can have different numbers of tx and rx rings.
+	 * Physical NICs also have a 'host' ring pair.
+	 * Additionally, clients can request additional ring pairs to
+	 * be used for internal communication.
+	 */
+	const uint32_t	ni_tx_rings;	/* number of HW tx rings */
+	const uint32_t	ni_rx_rings;	/* number of HW rx rings */
+
+	uint32_t	ni_bufs_head;	/* head index for extra bufs */
+	uint32_t	ni_spare1[5];
+	/*
	 * The following array contains the offset of each netmap ring
-	 * from this structure. The first ni_tx_rings+1 entries refer
-	 * to the tx rings, the next ni_rx_rings+1 refer to the rx rings
-	 * (the last entry in each block refers to the host stack rings).
+	 * from this structure, in the following order:
+	 * NIC tx rings (ni_tx_rings); host tx ring (1); extra tx rings;
+	 * NIC rx rings (ni_rx_rings); host rx ring (1); extra rx rings.
+	 *
	 * The area is filled up by the kernel on NIOCREGIF,
	 * and then only read by userspace code.
	 */
	const ssize_t	ring_ofs[0];
};
 
-#ifndef NIOCREGIF
+
+#ifndef NIOCREGIF
/*
 * ioctl names and related fields
 *
+ * NIOCTXSYNC, NIOCRXSYNC synchronize tx or rx queues,
+ *	whose identity is set in NIOCREGIF through nr_ringid.
+ *	These are non blocking and take no argument.
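+ *
+ *	For example, a receiver can drain a ring with a sketch like
+ *	the following (illustrative only; error handling and payload
+ *	processing omitted, NETMAP_RXRING/NETMAP_BUF are the helper
+ *	macros from net/netmap_user.h):
+ *
+ *		struct netmap_ring *ring = NETMAP_RXRING(nifp, 0);
+ *
+ *		ioctl(fd, NIOCRXSYNC, NULL);	// fetch new packets
+ *		while (ring->head != ring->tail) {
+ *			struct netmap_slot *slot = &ring->slot[ring->head];
+ *			char *payload = NETMAP_BUF(ring, slot->buf_idx);
+ *			// ... use payload[0 .. slot->len - 1] ...
+ *			ring->head = ring->cur = (ring->head + 1 ==
+ *			    ring->num_slots) ? 0 : ring->head + 1;
+ *		}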
+ * * NIOCGINFO takes a struct ifreq, the interface name is the input, * the outputs are number of queues and number of descriptors * for each queue (useful to set number of threads etc.). * The info returned is only advisory and may change before * the interface is bound to a file descriptor. * - * NIOCREGIF takes an interface name within a struct ifreq, + * NIOCREGIF takes an interface name within a struct nmreq, * and activates netmap mode on the interface (if possible). * - * nr_name is the name of the interface + * The argument to NIOCGINFO/NIOCREGIF overlays struct ifreq so we + * can pass it down to other NIC-related ioctls. * - * nr_tx_slots, nr_tx_slots, nr_tx_rings, nr_rx_rings - * indicate the configuration of the port on return. + * The actual argument (struct nmreq) has a number of options to request + * different functions. + * The following are used in NIOCREGIF when nr_cmd == 0: * - * On input, non-zero values for nr_tx_rings, nr_tx_slots and the - * rx counterparts may be used to reconfigure the port according - * to the requested values, but this is not guaranteed. - * The actual values are returned on completion of the ioctl(). + * nr_name (in) + * The name of the port (em0, valeXXX:YYY, etc.) + * limited to IFNAMSIZ for backward compatibility. * - * nr_ringid - * indicates how rings should be bound to the file descriptors. - * The default (0) means all physical rings of a NIC are bound. - * NETMAP_HW_RING plus a ring number lets you bind just - * a single ring pair. - * NETMAP_SW_RING binds only the host tx/rx rings - * NETMAP_NO_TX_POLL prevents select()/poll() from pushing - * out packets on the tx ring unless POLLOUT is specified. + * nr_version (in/out) + * Must match NETMAP_API as used in the kernel, error otherwise. + * Always returns the desired value on output. * - * NETMAP_PRIV_MEM is a return value used to indicate that - * this ring is in a private memory region hence buffer - * swapping cannot be used - * - * nr_cmd is used to configure NICs attached to a VALE switch, - * or to dump the configuration of a VALE switch. - * - * nr_cmd = NETMAP_BDG_ATTACH and nr_name = vale*:ifname - * attaches the NIC to the switch, with nr_ringid specifying - * which rings to use + * nr_tx_slots, nr_rx_slots, nr_tx_rings, nr_rx_rings (in/out) + * On input, non-zero values may be used to reconfigure the port + * according to the requested values, but this is not guaranteed. + * On output the actual values in use are reported. * - * nr_cmd = NETMAP_BDG_DETACH and nr_name = vale*:ifname - * disconnects a previously attached NIC + * nr_ringid (in) + * Indicates how rings should be bound to the file descriptors. + * If nr_flags != 0, then the low bits (in NETMAP_RING_MASK) + * are used to indicate the ring number, and nr_flags specifies + * the actual rings to bind. NETMAP_NO_TX_POLL is unaffected. * - * nr_cmd = NETMAP_BDG_LIST is used to list the configuration - * of VALE switches, with additional arguments. + * NOTE: THE FOLLOWING (nr_flags == 0) IS DEPRECATED: + * If nr_flags == 0, NETMAP_HW_RING and NETMAP_SW_RING control + * the binding as follows: + * 0 (default) binds all physical rings + * NETMAP_HW_RING | ring number binds a single ring pair + * NETMAP_SW_RING binds only the host tx/rx rings * - * NIOCTXSYNC, NIOCRXSYNC synchronize tx or rx queues, - * whose identity is set in NIOCREGIF through nr_ringid + * NETMAP_NO_TX_POLL can be OR-ed to make select()/poll() push + * packets on tx rings only if POLLOUT is set. + * The default is to push any pending packet.
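The registration sequence just described, in code form. This is an illustrative sketch, not part of the committed change: bind_port is a hypothetical helper, "em0" would be a caller-supplied port name, and error handling is minimal.

#include <fcntl.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <net/netmap.h>
#include <net/netmap_user.h>

static struct netmap_if *
bind_port(const char *name, int *fdp)
{
	struct nmreq req;
	void *mem;
	int fd = open("/dev/netmap", O_RDWR);

	if (fd < 0)
		return NULL;
	memset(&req, 0, sizeof(req));
	strncpy(req.nr_name, name, sizeof(req.nr_name) - 1);
	req.nr_version = NETMAP_API;		/* must match the kernel */
	req.nr_flags = NR_REG_ALL_NIC;		/* bind all NIC rings, see nr_flags */
	if (ioctl(fd, NIOCREGIF, &req) < 0 ||
	    (mem = mmap(NULL, req.nr_memsize, PROT_READ | PROT_WRITE,
	    MAP_SHARED, fd, 0)) == MAP_FAILED) {
		close(fd);
		return NULL;
	}
	*fdp = fd;
	return NETMAP_IF(mem, req.nr_offset);
}

A caller would then use NETMAP_TXRING()/NETMAP_RXRING() on the returned netmap_if to locate the individual rings.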
* - * NETMAP_API is the API version. + * NETMAP_DO_RX_POLL can be OR-ed to make select()/poll() release + * packets on rx rings also when POLLIN is NOT set. + * The default is to touch the rx ring only with POLLIN. + * Note that this is the opposite of TX because it + * reflects the common usage. + * + * NOTE: NETMAP_PRIV_MEM IS DEPRECATED, use nr_arg2 instead. + * NETMAP_PRIV_MEM is set on return for ports that do not use + * the global memory allocator. + * This information is not significant and applications + * should look at the region id in nr_arg2 + * + * nr_flags is the recommended mode to indicate which rings should + * be bound to a file descriptor. Values are NR_REG_* + * + * nr_arg1 (in) The number of extra rings to be reserved. + * Especially when allocating a VALE port the system only + * allocates the amount of memory needed for the port. + * If more shared memory rings are desired (e.g. for pipes), + * the first invocation for the same basename/allocator + * should specify a suitable number. Memory cannot be + * extended after the first allocation without closing + * all ports on the same region. + * + * nr_arg2 (in/out) The identity of the memory region used. + * On input, 0 means the system decides autonomously, + * other values may try to select a specific region. + * On return the actual value is reported. + * Region '1' is the global allocator, normally shared + * by all interfaces. Other values are private regions. + * If two ports use the same region, zero-copy is possible. + * + * nr_arg3 (in/out) number of extra buffers to be allocated. + * + * + * + * nr_cmd (in) if non-zero indicates a special command: + * NETMAP_BDG_ATTACH and nr_name = vale*:ifname + * attaches the NIC to the switch; nr_ringid specifies + * which rings to use. Used by vale-ctl -a ... + * nr_arg1 = NETMAP_BDG_HOST also attaches the host port + * as in vale-ctl -h ... + * + * NETMAP_BDG_DETACH and nr_name = vale*:ifname + * disconnects a previously attached NIC. + * Used by vale-ctl -d ... + * + * NETMAP_BDG_LIST + * list the configuration of VALE switches. + * + * NETMAP_BDG_VNET_HDR + * Set the virtio-net header length used by the client + * of a VALE switch port. + * + * nr_arg1, nr_arg2, nr_arg3 (in/out) command specific + * + * + * */ + /* - * struct nmreq overlays a struct ifreq + * struct nmreq overlays a struct ifreq (just the name) + * + * On input, nr_ringid indicates which rings we are requesting, + * with the low bits for the specific ring number.
+ * selection FLAGS RING INDEX + * + * all the NIC rings 0x0000 - + * only HOST ring 0x2000 ring index + * single NIC ring 0x4000 - + * all the NIC+HOST rings 0x6000 - + * one pipe ring, master 0x8000 ring index + * *** INVALID 0xA000 + * one pipe ring, slave 0xC000 ring index + * *** INVALID 0xE000 + * */ struct nmreq { char nr_name[IFNAMSIZ]; uint32_t nr_version; /* API version */ -#define NETMAP_API 5 /* current version */ uint32_t nr_offset; /* nifp offset in the shared region */ uint32_t nr_memsize; /* size of the shared region */ uint32_t nr_tx_slots; /* slots in tx rings */ uint32_t nr_rx_slots; /* slots in rx rings */ uint16_t nr_tx_rings; /* number of tx rings */ uint16_t nr_rx_rings; /* number of rx rings */ + uint16_t nr_ringid; /* ring(s) we care about */ -#define NETMAP_PRIV_MEM 0x8000 /* rings use private memory */ -#define NETMAP_HW_RING 0x4000 /* low bits indicate one hw ring */ -#define NETMAP_SW_RING 0x2000 /* process the sw ring */ +#define NETMAP_HW_RING 0x4000 /* single NIC ring pair */ +#define NETMAP_SW_RING 0x2000 /* only host ring pair */ + +#define NETMAP_RING_MASK 0x0fff /* the ring number */ + #define NETMAP_NO_TX_POLL 0x1000 /* no automatic txsync on poll */ -#define NETMAP_RING_MASK 0xfff /* the ring number */ + +#define NETMAP_DO_RX_POLL 0x8000 /* DO automatic rxsync on poll */ + uint16_t nr_cmd; #define NETMAP_BDG_ATTACH 1 /* attach the NIC */ #define NETMAP_BDG_DETACH 2 /* detach the NIC */ #define NETMAP_BDG_LOOKUP_REG 3 /* register lookup function */ #define NETMAP_BDG_LIST 4 /* get bridge's info */ - uint16_t nr_arg1; +#define NETMAP_BDG_VNET_HDR 5 /* set the port virtio-net-hdr length */ +#define NETMAP_BDG_OFFSET NETMAP_BDG_VNET_HDR /* deprecated alias */ + + uint16_t nr_arg1; /* reserve extra rings in NIOCREGIF */ #define NETMAP_BDG_HOST 1 /* attach the host stack on ATTACH */ + uint16_t nr_arg2; - uint32_t spare2[3]; + uint32_t nr_arg3; /* req. extra buffers in NIOCREGIF */ + uint32_t nr_flags; + /* various modes, extends nr_ringid */ + uint32_t spare2[1]; }; +#define NR_REG_MASK 0xf /* values for nr_flags */ +enum { NR_REG_DEFAULT = 0, /* backward compat, should not be used. */ + NR_REG_ALL_NIC = 1, + NR_REG_SW = 2, + NR_REG_NIC_SW = 3, + NR_REG_ONE_NIC = 4, + NR_REG_PIPE_MASTER = 5, + NR_REG_PIPE_SLAVE = 6, +}; +/* a monitor uses the NR_REG_* value to select the rings to monitor */ +#define NR_MONITOR_TX 0x100 +#define NR_MONITOR_RX 0x200 + + /* * FreeBSD uses the size value embedded in the _IOWR to determine * how much to copy in/out. So we need it to match the actual * data structure we pass. We put some spares in the structure * to ease compatibility with other versions */ #define NIOCGINFO _IOWR('i', 145, struct nmreq) /* return IF info */ #define NIOCREGIF _IOWR('i', 146, struct nmreq) /* interface register */ -#define NIOCUNREGIF _IO('i', 147) /* deprecated. Was interface unregister */ #define NIOCTXSYNC _IO('i', 148) /* sync tx queues */ #define NIOCRXSYNC _IO('i', 149) /* sync rx queues */ #endif /* !NIOCREGIF */ + + +/* + * Helper functions for kernel and userspace + */ + +/* + * return 1 if the ring is empty (cur == tail), i.e. no slots + * left to process. + */ +static inline int +nm_ring_empty(struct netmap_ring *ring) +{ + return (ring->cur == ring->tail); +} #endif /* _NET_NETMAP_H_ */ Index: stable/9/sys/net/netmap_user.h =================================================================== --- stable/9/sys/net/netmap_user.h (revision 262152) +++ stable/9/sys/net/netmap_user.h (revision 262153) @@ -1,95 +1,677 @@ /* - * Copyright (C) 2011 Matteo Landi, Luigi Rizzo.
All rights reserved. - * + * Copyright (C) 2011-2014 Universita` di Pisa. All rights reserved. + * * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: - * + * modification, are permitted provided that the following conditions + * are met: + * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. - * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the - * distribution. - * - * 3. Neither the name of the authors nor the names of their contributors - * may be used to endorse or promote products derived from this - * software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY MATTEO LANDI AND CONTRIBUTORS "AS IS" AND + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL MATTEO LANDI OR CONTRIBUTORS - * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF - * THE POSSIBILITY OF SUCH DAMAGE. + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. */ /* * $FreeBSD$ * - * This header contains the macros used to manipulate netmap structures - * and packets in userspace. See netmap(4) for more information. + * Functions and macros to manipulate netmap structures and packets + * in userspace. See netmap(4) for more information. * * The address of the struct netmap_if, say nifp, is computed from the * value returned from ioctl(.., NIOCREGIF, ...) and the mmap region: * ioctl(fd, NIOCREGIF, &req); * mem = mmap(0, ...
); * nifp = NETMAP_IF(mem, req.nr_offset); * (so simple, we could just do it manually) * * From there: * struct netmap_ring *NETMAP_TXRING(nifp, index) * struct netmap_ring *NETMAP_RXRING(nifp, index) * we can access ring->head, ring->cur, ring->tail * * ring->slot[i] gives us the i-th slot (we can access - * directly plen, flags, bufindex) + * directly len, flags, buf_idx) * * char *buf = NETMAP_BUF(ring, x) returns a pointer to * the buffer numbered x * - * Since rings are circular, we have macros to compute the next index - * i = NETMAP_RING_NEXT(ring, i); + * All ring indexes (head, cur, tail) should always move forward. + * To compute the next index in a circular ring you can use + * i = nm_ring_next(ring, i); + * + * To ease porting apps from pcap to netmap we supply a few functions + * that can be called to open, close, read and write on netmap in a way + * similar to libpcap. Note that the read/write functions depend on + * an ioctl()/select()/poll() being issued to refill rings or push + * packets out. + * + * To use these, #define NETMAP_WITH_LIBS + * in the source file that invokes these functions. */ #ifndef _NET_NETMAP_USER_H_ #define _NET_NETMAP_USER_H_ +#include <stdint.h> +#include <sys/socket.h> /* apple needs sockaddr */ +#include <net/if.h> /* IFNAMSIZ */ + +#ifndef likely +#define likely(x) __builtin_expect(!!(x), 1) +#define unlikely(x) __builtin_expect(!!(x), 0) +#endif /* likely and unlikely */ + +#include <net/netmap.h> + +/* helper macro */ #define _NETMAP_OFFSET(type, ptr, offset) \ ((type)(void *)((char *)(ptr) + (offset))) -#define NETMAP_IF(b, o) _NETMAP_OFFSET(struct netmap_if *, b, o) +#define NETMAP_IF(_base, _ofs) _NETMAP_OFFSET(struct netmap_if *, _base, _ofs) #define NETMAP_TXRING(nifp, index) _NETMAP_OFFSET(struct netmap_ring *, \ nifp, (nifp)->ring_ofs[index] ) #define NETMAP_RXRING(nifp, index) _NETMAP_OFFSET(struct netmap_ring *, \ nifp, (nifp)->ring_ofs[index + (nifp)->ni_tx_rings + 1] ) #define NETMAP_BUF(ring, index) \ ((char *)(ring) + (ring)->buf_ofs + ((index)*(ring)->nr_buf_size)) #define NETMAP_BUF_IDX(ring, buf) \ ( ((char *)(buf) - ((char *)(ring) + (ring)->buf_ofs) ) / \ - (ring)->nr_buf_size ) + (ring)->nr_buf_size ) -#define NETMAP_RING_NEXT(r, i) \ - ((i)+1 == (r)->num_slots ? 0 : (i) + 1 ) -#define NETMAP_RING_FIRST_RESERVED(r) \ - ( (r)->cur < (r)->reserved ? \ - (r)->cur + (r)->num_slots - (r)->reserved : \ - (r)->cur - (r)->reserved ) +static inline uint32_t +nm_ring_next(struct netmap_ring *r, uint32_t i) +{ + return ( unlikely(i + 1 == r->num_slots) ? 0 : i + 1); +} + /* - * Return 1 if the given tx ring is empty. + * Return 1 if we have pending transmissions in the tx ring. + * When everything is complete ring->head = ring->tail + 1 (modulo ring size) */ -#define NETMAP_TX_RING_EMPTY(r) ((r)->avail >= (r)->num_slots - 1) +static inline int +nm_tx_pending(struct netmap_ring *r) +{ + return nm_ring_next(r, r->tail) != r->head; +} + + +static inline uint32_t +nm_ring_space(struct netmap_ring *ring) +{ + int ret = ring->tail - ring->cur; + if (ret < 0) + ret += ring->num_slots; + return ret; +} + + +#ifdef NETMAP_WITH_LIBS +/* + * Support for simple I/O libraries. + * Include other system headers required for compiling this. + */ + +#ifndef HAVE_NETMAP_WITH_LIBS +#define HAVE_NETMAP_WITH_LIBS + +#include <stdio.h> +#include <sys/time.h> +#include <string.h> /* memset */ +#include <sys/mman.h> +#include <errno.h> /* EINVAL */ +#include <fcntl.h> /* O_RDWR */ +#include <unistd.h> /* close() */ +#include <sys/ioctl.h> +#include <stdlib.h> + +#ifndef ND /* debug macros */ +/* debug support */ +#define ND(_fmt, ...) do {} while(0) +#define D(_fmt, ...)
\ + do { \ + struct timeval t0; \ + gettimeofday(&t0, NULL); \ + fprintf(stderr, "%03d.%06d %s [%d] " _fmt "\n", \ + (int)(t0.tv_sec % 1000), (int)t0.tv_usec, \ + __FUNCTION__, __LINE__, ##__VA_ARGS__); \ + } while (0) + +/* Rate limited version of "D", lps indicates how many per second */ +#define RD(lps, format, ...) \ + do { \ + static int t0, __cnt; \ + struct timeval __xxts; \ + gettimeofday(&__xxts, NULL); \ + if (t0 != __xxts.tv_sec) { \ + t0 = __xxts.tv_sec; \ + __cnt = 0; \ + } \ + if (__cnt++ < lps) { \ + D(format, ##__VA_ARGS__); \ + } \ + } while (0) +#endif + +struct nm_pkthdr { /* same as pcap_pkthdr */ + struct timeval ts; + uint32_t caplen; + uint32_t len; +}; + +struct nm_stat { /* same as pcap_stat */ + u_int ps_recv; + u_int ps_drop; + u_int ps_ifdrop; +#ifdef WIN32 + u_int bs_capt; +#endif /* WIN32 */ +}; + +#define NM_ERRBUF_SIZE 512 + +struct nm_desc { + struct nm_desc *self; /* point to self if netmap. */ + int fd; + void *mem; + int memsize; + int done_mmap; /* set if mem is the result of mmap */ + struct netmap_if * const nifp; + uint16_t first_tx_ring, last_tx_ring, cur_tx_ring; + uint16_t first_rx_ring, last_rx_ring, cur_rx_ring; + struct nmreq req; /* also contains the nr_name = ifname */ + struct nm_pkthdr hdr; + + /* + * The memory contains netmap_if, rings and then buffers. + * Given a pointer (e.g. to nm_inject) we can compare with + * mem/buf_start/buf_end to tell if it is a buffer or + * some other descriptor in our region. + * We also store a pointer to some ring as it helps in the + * translation from buffer indexes to addresses. + */ + struct netmap_ring * const some_ring; + void * const buf_start; + void * const buf_end; + /* parameters from pcap_open_live */ + int snaplen; + int promisc; + int to_ms; + char *errbuf; + + /* save flags so we can restore them on close */ + uint32_t if_flags; + uint32_t if_reqcap; + uint32_t if_curcap; + + struct nm_stat st; + char msg[NM_ERRBUF_SIZE]; +}; + +/* + * when the descriptor is open correctly, d->self == d + * Eventually we should also use some magic number. + */ +#define P2NMD(p) ((struct nm_desc *)(p)) +#define IS_NETMAP_DESC(d) ((d) && P2NMD(d)->self == P2NMD(d)) +#define NETMAP_FD(d) (P2NMD(d)->fd) + + +/* + * this is a slightly optimized copy routine which rounds + * to multiple of 64 bytes and is often faster than dealing + * with other odd sizes. We assume there is enough room + * in the source and destination buffers. + * + * XXX only for multiples of 64 bytes, non overlapped. + */ +static inline void +nm_pkt_copy(const void *_src, void *_dst, int l) +{ + const uint64_t *src = (const uint64_t *)_src; + uint64_t *dst = (uint64_t *)_dst; + + if (unlikely(l >= 1024)) { + memcpy(dst, src, l); + return; + } + for (; likely(l > 0); l-=64) { + *dst++ = *src++; + *dst++ = *src++; + *dst++ = *src++; + *dst++ = *src++; + *dst++ = *src++; + *dst++ = *src++; + *dst++ = *src++; + *dst++ = *src++; + } +} + + +/* + * The callback, invoked on each received packet. Same as libpcap + */ +typedef void (*nm_cb_t)(u_char *, const struct nm_pkthdr *, const u_char *d); + +/* + *--- the pcap-like API --- + * + * nm_open() opens a file descriptor, binds to a port and maps memory. 
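For instance, a minimal pcap-style receiver built on these functions looks like the following. This is an illustrative sketch, not part of the committed change: rx_handler is a hypothetical callback name, "netmap:em0" is a placeholder port name, and the loop is bounded only to keep the example finite.

#define NETMAP_WITH_LIBS
#include <net/netmap_user.h>
#include <poll.h>

static void
rx_handler(u_char *arg, const struct nm_pkthdr *h, const u_char *buf)
{
	(void)arg; (void)buf;
	D("received %d bytes", h->len);	/* D() is available with NETMAP_WITH_LIBS */
}

int
main(void)
{
	struct nm_desc *d = nm_open("netmap:em0", NULL, 0, NULL);
	struct pollfd pfd;
	int i;

	if (d == NULL)
		return 1;
	pfd.fd = NETMAP_FD(d);
	pfd.events = POLLIN;
	for (i = 0; i < 1000; i++) {		/* bounded loop for the example */
		if (poll(&pfd, 1, 1000) > 0)
			nm_dispatch(d, 0, rx_handler, NULL); /* 0 == all pending */
	}
	return nm_close(d);
}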
+ * + * ifname (netmap:foo or vale:foo) is the port name + * a suffix can indicate the following: + * ^ bind the host (sw) ring pair + * * bind host and NIC ring pairs (transparent) + * -NN bind individual NIC ring pair + * {NN bind master side of pipe NN + * }NN bind slave side of pipe NN + * + * req provides the initial values of nmreq before parsing ifname. + * Remember that the ifname parsing will override the ring + * number in nr_ringid, and part of nr_flags; + * flags special functions, normally 0 + * indicates which fields of *arg are significant + * arg special functions, normally NULL + * if passed a struct nm_desc with mem != NULL, + * use that memory instead of mmap. + */ + +static struct nm_desc *nm_open(const char *ifname, const struct nmreq *req, + uint64_t flags, const struct nm_desc *arg); + +/* + * nm_open can import some fields from the parent descriptor. + * These flags control which ones. + * Also in flags you can specify NETMAP_NO_TX_POLL and NETMAP_DO_RX_POLL, + * which set the initial value for these flags. + * Note that the 16 low bits of the flags are reserved for data + * that may go into the nmreq. + */ +enum { + NM_OPEN_NO_MMAP = 0x040000, /* reuse mmap from parent */ + NM_OPEN_IFNAME = 0x080000, /* nr_name, nr_ringid, nr_flags */ + NM_OPEN_ARG1 = 0x100000, + NM_OPEN_ARG2 = 0x200000, + NM_OPEN_ARG3 = 0x400000, + NM_OPEN_RING_CFG = 0x800000, /* tx|rx rings|slots */ +}; + + +/* + * nm_close() closes and restores the port to its previous state + */ + +static int nm_close(struct nm_desc *); + +/* + * nm_inject() is the same as pcap_inject() + * nm_dispatch() is the same as pcap_dispatch() + * nm_nextpkt() is the same as pcap_next() + */ + +static int nm_inject(struct nm_desc *, const void *, size_t); +static int nm_dispatch(struct nm_desc *, int, nm_cb_t, u_char *); +static u_char *nm_nextpkt(struct nm_desc *, struct nm_pkthdr *); + + +/* + * Try to open, return descriptor if successful, NULL otherwise. + * An invalid netmap name will set errno = 0; + * You can pass a pointer to a pre-filled nm_desc to add special + * parameters. Flags is used as follows + * NM_OPEN_NO_MMAP use the memory from arg, only + * if the nr_arg2 (memory block) matches.
+ * NM_OPEN_ARG1 use req.nr_arg1 from arg + * NM_OPEN_ARG2 use req.nr_arg2 from arg + * NM_OPEN_RING_CFG use ring config from arg + */ +static struct nm_desc * +nm_open(const char *ifname, const struct nmreq *req, + uint64_t new_flags, const struct nm_desc *arg) +{ + struct nm_desc *d = NULL; + const struct nm_desc *parent = arg; + u_int namelen; + uint32_t nr_ringid = 0, nr_flags; + const char *port = NULL; + const char *errmsg = NULL; + + if (strncmp(ifname, "netmap:", 7) && strncmp(ifname, "vale", 4)) { + errno = 0; /* name not recognised, not an error */ + return NULL; + } + if (ifname[0] == 'n') + ifname += 7; + /* scan for a separator */ + for (port = ifname; *port && !index("-*^{}", *port); port++) + ; + namelen = port - ifname; + if (namelen >= sizeof(d->req.nr_name)) { + errmsg = "name too long"; + goto fail; + } + switch (*port) { + default: /* '\0', no suffix */ + nr_flags = NR_REG_ALL_NIC; + break; + case '-': /* one NIC */ + nr_flags = NR_REG_ONE_NIC; + nr_ringid = atoi(port + 1); + break; + case '*': /* NIC and SW, ignore port */ + nr_flags = NR_REG_NIC_SW; + if (port[1]) { + errmsg = "invalid port for nic+sw"; + goto fail; + } + break; + case '^': /* only sw ring */ + nr_flags = NR_REG_SW; + if (port[1]) { + errmsg = "invalid port for sw ring"; + goto fail; + } + break; + case '{': + nr_flags = NR_REG_PIPE_MASTER; + nr_ringid = atoi(port + 1); + break; + case '}': + nr_flags = NR_REG_PIPE_SLAVE; + nr_ringid = atoi(port + 1); + break; + } + + if (nr_ringid >= NETMAP_RING_MASK) { + errmsg = "invalid ringid"; + goto fail; + } + /* add the *XPOLL flags */ + nr_ringid |= new_flags & (NETMAP_NO_TX_POLL | NETMAP_DO_RX_POLL); + + d = (struct nm_desc *)calloc(1, sizeof(*d)); + if (d == NULL) { + errmsg = "nm_desc alloc failure"; + errno = ENOMEM; + return NULL; + } + d->self = d; /* set this early so nm_close() works */ + d->fd = open("/dev/netmap", O_RDWR); + if (d->fd < 0) { + errmsg = "cannot open /dev/netmap"; + goto fail; + } + + if (req) + d->req = *req; + d->req.nr_version = NETMAP_API; + d->req.nr_ringid &= ~NETMAP_RING_MASK; + + /* these fields are overridden by ifname and flags processing */ + d->req.nr_ringid |= nr_ringid; + d->req.nr_flags = nr_flags; + memcpy(d->req.nr_name, ifname, namelen); + d->req.nr_name[namelen] = '\0'; + /* optionally import info from parent */ + if (IS_NETMAP_DESC(parent) && new_flags) { + if (new_flags & NM_OPEN_ARG1) + D("overriding ARG1 %d", parent->req.nr_arg1); + d->req.nr_arg1 = new_flags & NM_OPEN_ARG1 ? + parent->req.nr_arg1 : 4; + if (new_flags & NM_OPEN_ARG2) + D("overriding ARG2 %d", parent->req.nr_arg2); + d->req.nr_arg2 = new_flags & NM_OPEN_ARG2 ? + parent->req.nr_arg2 : 0; + if (new_flags & NM_OPEN_ARG3) + D("overriding ARG3 %d", parent->req.nr_arg3); + d->req.nr_arg3 = new_flags & NM_OPEN_ARG3 ?
+ parent->req.nr_arg3 : 0; + if (new_flags & NM_OPEN_RING_CFG) { + D("overriding RING_CFG"); + d->req.nr_tx_slots = parent->req.nr_tx_slots; + d->req.nr_rx_slots = parent->req.nr_rx_slots; + d->req.nr_tx_rings = parent->req.nr_tx_rings; + d->req.nr_rx_rings = parent->req.nr_rx_rings; + } + if (new_flags & NM_OPEN_IFNAME) { + D("overriding ifname %s ringid 0x%x flags 0x%x", + parent->req.nr_name, parent->req.nr_ringid, + parent->req.nr_flags); + memcpy(d->req.nr_name, parent->req.nr_name, + sizeof(d->req.nr_name)); + d->req.nr_ringid = parent->req.nr_ringid; + d->req.nr_flags = parent->req.nr_flags; + } + } + if (ioctl(d->fd, NIOCREGIF, &d->req)) { + errmsg = "NIOCREGIF failed"; + goto fail; + } + + if (IS_NETMAP_DESC(parent) && parent->mem && + parent->req.nr_arg2 == d->req.nr_arg2) { + /* do not mmap, inherit from parent */ + d->memsize = parent->memsize; + d->mem = parent->mem; + } else { + d->memsize = d->req.nr_memsize; + d->mem = mmap(0, d->memsize, PROT_WRITE | PROT_READ, MAP_SHARED, + d->fd, 0); + if (d->mem == MAP_FAILED) { + errmsg = "mmap failed"; + goto fail; + } + d->done_mmap = 1; + } + { + struct netmap_if *nifp = NETMAP_IF(d->mem, d->req.nr_offset); + struct netmap_ring *r = NETMAP_RXRING(nifp, 0); + + *(struct netmap_if **)(uintptr_t)&(d->nifp) = nifp; + *(struct netmap_ring **)(uintptr_t)&d->some_ring = r; + *(void **)(uintptr_t)&d->buf_start = NETMAP_BUF(r, 0); + *(void **)(uintptr_t)&d->buf_end = + (char *)d->mem + d->memsize; + } + + if (nr_flags == NR_REG_SW) { /* host stack */ + d->first_tx_ring = d->last_tx_ring = d->req.nr_tx_rings; + d->first_rx_ring = d->last_rx_ring = d->req.nr_rx_rings; + } else if (nr_flags == NR_REG_ALL_NIC) { /* only nic */ + d->first_tx_ring = 0; + d->first_rx_ring = 0; + d->last_tx_ring = d->req.nr_tx_rings - 1; + d->last_rx_ring = d->req.nr_rx_rings - 1; + } else if (nr_flags == NR_REG_NIC_SW) { + d->first_tx_ring = 0; + d->first_rx_ring = 0; + d->last_tx_ring = d->req.nr_tx_rings; + d->last_rx_ring = d->req.nr_rx_rings; + } else if (nr_flags == NR_REG_ONE_NIC) { + /* XXX check validity */ + d->first_tx_ring = d->last_tx_ring = + d->first_rx_ring = d->last_rx_ring = nr_ringid; + } else { /* pipes */ + d->first_tx_ring = d->last_tx_ring = 0; + d->first_rx_ring = d->last_rx_ring = 0; + } + +#ifdef DEBUG_NETMAP_USER + { /* debugging code */ + int i; + + D("%s tx %d .. %d %d rx %d .. %d %d", ifname, + d->first_tx_ring, d->last_tx_ring, d->req.nr_tx_rings, + d->first_rx_ring, d->last_rx_ring, d->req.nr_rx_rings); + for (i = 0; i <= d->req.nr_tx_rings; i++) { + struct netmap_ring *r = NETMAP_TXRING(d->nifp, i); + D("TX%d %p h %d c %d t %d", i, r, r->head, r->cur, r->tail); + } + for (i = 0; i <= d->req.nr_rx_rings; i++) { + struct netmap_ring *r = NETMAP_RXRING(d->nifp, i); + D("RX%d %p h %d c %d t %d", i, r, r->head, r->cur, r->tail); + } + } +#endif /* debugging */ + + d->cur_tx_ring = d->first_tx_ring; + d->cur_rx_ring = d->first_rx_ring; + return d; + +fail: + nm_close(d); + if (errmsg) + D("%s %s", errmsg, ifname); + errno = EINVAL; + return NULL; +} + + +static int +nm_close(struct nm_desc *d) +{ + /* + * ugly trick to avoid unused warnings + */ + static void *__xxzt[] __attribute__ ((unused)) = + { (void *)nm_open, (void *)nm_inject, + (void *)nm_dispatch, (void *)nm_nextpkt } ; + + if (d == NULL || d->self != d) + return EINVAL; + if (d->done_mmap && d->mem) + munmap(d->mem, d->memsize); + if (d->fd != -1) + close(d->fd); + bzero(d, sizeof(*d)); + free(d); + return 0; +} + + +/* + * Same prototype as pcap_inject(), only need to cast.
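+ * Illustrative usage (a sketch; error handling and rate limiting omitted):
+ *
+ *	while (nm_inject(d, pkt, len) == 0)
+ *		ioctl(NETMAP_FD(d), NIOCTXSYNC, NULL);
+ *
+ * nm_inject() returns 0 when no tx slot is available, so the NIOCTXSYNC
+ * lets the kernel reclaim completed slots before retrying.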
+ */ +static int +nm_inject(struct nm_desc *d, const void *buf, size_t size) +{ + u_int c, n = d->last_tx_ring - d->first_tx_ring + 1; + + for (c = 0; c < n ; c++) { + /* compute current ring to use */ + struct netmap_ring *ring; + uint32_t i, idx; + uint32_t ri = d->cur_tx_ring + c; + + if (ri > d->last_tx_ring) + ri = d->first_tx_ring; + ring = NETMAP_TXRING(d->nifp, ri); + if (nm_ring_empty(ring)) { + continue; + } + i = ring->cur; + idx = ring->slot[i].buf_idx; + ring->slot[i].len = size; + nm_pkt_copy(buf, NETMAP_BUF(ring, idx), size); + d->cur_tx_ring = ri; + ring->head = ring->cur = nm_ring_next(ring, i); + return size; + } + return 0; /* fail */ +} + + +/* + * Same prototype as pcap_dispatch(), only need to cast. + */ +static int +nm_dispatch(struct nm_desc *d, int cnt, nm_cb_t cb, u_char *arg) +{ + int n = d->last_rx_ring - d->first_rx_ring + 1; + int c, got = 0, ri = d->cur_rx_ring; + + if (cnt == 0) + cnt = -1; + /* cnt == -1 means infinite, but rings have a finite amount + * of buffers and the int is large enough that we never wrap, + * so we can omit checking for -1 + */ + for (c=0; c < n && cnt != got; c++) { + /* compute current ring to use */ + struct netmap_ring *ring; + + ri = d->cur_rx_ring + c; + if (ri > d->last_rx_ring) + ri = d->first_rx_ring; + ring = NETMAP_RXRING(d->nifp, ri); + for ( ; !nm_ring_empty(ring) && cnt != got; got++) { + u_int i = ring->cur; + u_int idx = ring->slot[i].buf_idx; + u_char *buf = (u_char *)NETMAP_BUF(ring, idx); + + // __builtin_prefetch(buf); + d->hdr.len = d->hdr.caplen = ring->slot[i].len; + d->hdr.ts = ring->ts; + cb(arg, &d->hdr, buf); + ring->head = ring->cur = nm_ring_next(ring, i); + } + } + d->cur_rx_ring = ri; + return got; +} + +static u_char * +nm_nextpkt(struct nm_desc *d, struct nm_pkthdr *hdr) +{ + int ri = d->cur_rx_ring; + + do { + /* compute current ring to use */ + struct netmap_ring *ring = NETMAP_RXRING(d->nifp, ri); + if (!nm_ring_empty(ring)) { + u_int i = ring->cur; + u_int idx = ring->slot[i].buf_idx; + u_char *buf = (u_char *)NETMAP_BUF(ring, idx); + + // __builtin_prefetch(buf); + hdr->ts = ring->ts; + hdr->len = hdr->caplen = ring->slot[i].len; + ring->cur = nm_ring_next(ring, i); + /* we could postpone advancing head if we want + * to hold the buffer. This can be supported in + * the future. + */ + ring->head = ring->cur; + d->cur_rx_ring = ri; + return buf; + } + ri++; + if (ri > d->last_rx_ring) + ri = d->first_rx_ring; + } while (ri != d->cur_rx_ring); + return NULL; /* nothing found */ +} + +#endif /* !HAVE_NETMAP_WITH_LIBS */ + +#endif /* NETMAP_WITH_LIBS */ #endif /* _NET_NETMAP_USER_H_ */ Index: stable/9/tools/tools/netmap/pcap.c =================================================================== --- stable/9/tools/tools/netmap/pcap.c (revision 262152) +++ stable/9/tools/tools/netmap/pcap.c (nonexistent) @@ -1,654 +0,0 @@ -/* - * (C) 2011-2012 Luigi Rizzo - * - * BSD license - * - * A simple library that maps some pcap functions onto netmap - * This is not 100% complete but enough to let tcpdump, trafshow - * and other apps work. - * - * $FreeBSD$ - */ - -#define MY_PCAP -#include "nm_util.h" - -char *version = "$Id$"; -int verbose = 0; - -/* - * We redefine here a number of structures that are in pcap.h - * so we can compile this file without the system header. - */ -#ifndef PCAP_ERRBUF_SIZE -#define PCAP_ERRBUF_SIZE 128 -/* - * Each packet is accompanied by a header including the timestamp, - * captured size and actual size. 
- */ -struct pcap_pkthdr { - struct timeval ts; /* time stamp */ - uint32_t caplen; /* length of portion present */ - uint32_t len; /* length this packet (off wire) */ -}; - -typedef struct pcap_if pcap_if_t; - -/* - * Representation of an interface address. - */ -struct pcap_addr { - struct pcap_addr *next; - struct sockaddr *addr; /* address */ - struct sockaddr *netmask; /* netmask for the above */ - struct sockaddr *broadaddr; /* broadcast addr for the above */ - struct sockaddr *dstaddr; /* P2P dest. address for the above */ -}; - -struct pcap_if { - struct pcap_if *next; - char *name; /* name to hand to "pcap_open_live()" */ - char *description; /* textual description of interface, or NULL */ - struct pcap_addr *addresses; - uint32_t flags; /* PCAP_IF_ interface flags */ -}; - -/* - * We do not support stats (yet) - */ -struct pcap_stat { - u_int ps_recv; /* number of packets received */ - u_int ps_drop; /* number of packets dropped */ - u_int ps_ifdrop; /* drops by interface XXX not yet supported */ -#ifdef WIN32 - u_int bs_capt; /* number of packets that reach the app. */ -#endif /* WIN32 */ -}; - -typedef void pcap_t; -typedef enum { - PCAP_D_INOUT = 0, - PCAP_D_IN, - PCAP_D_OUT -} pcap_direction_t; - - - -typedef void (*pcap_handler)(u_char *user, - const struct pcap_pkthdr *h, const u_char *bytes); - -char errbuf[PCAP_ERRBUF_SIZE]; - -pcap_t *pcap_open_live(const char *device, int snaplen, - int promisc, int to_ms, char *errbuf); - -int pcap_findalldevs(pcap_if_t **alldevsp, char *errbuf); -void pcap_close(pcap_t *p); -int pcap_get_selectable_fd(pcap_t *p); -int pcap_dispatch(pcap_t *p, int cnt, pcap_handler callback, u_char *user); -int pcap_setnonblock(pcap_t *p, int nonblock, char *errbuf); -int pcap_setdirection(pcap_t *p, pcap_direction_t d); -char *pcap_lookupdev(char *errbuf); -int pcap_inject(pcap_t *p, const void *buf, size_t size); -int pcap_fileno(pcap_t *p); -const char *pcap_lib_version(void); - - -struct eproto { - const char *s; - u_short p; -}; -#endif /* !PCAP_ERRBUF_SIZE */ - -#ifndef TEST -/* - * build as a shared library - */ - -char pcap_version[] = "libnetmap version 0.3"; - -/* - * Our equivalent of pcap_t - */ -struct pcap_ring { - struct my_ring me; -#if 0 - const char *ifname; - - //struct nmreq nmr; - - int fd; - char *mem; /* userspace mmap address */ - u_int memsize; - u_int queueid; - u_int begin, end; /* first..last+1 rings to check */ - struct netmap_if *nifp; - - uint32_t if_flags; - uint32_t if_reqcap; - uint32_t if_curcap; -#endif - int snaplen; - char *errbuf; - int promisc; - int to_ms; - - struct pcap_pkthdr hdr; - - - struct pcap_stat st; - - char msg[PCAP_ERRBUF_SIZE]; -}; - - - -/* - * There is a set of functions that tcpdump expects even if probably - * not used - */ -struct eproto eproto_db[] = { - { "ip", ETHERTYPE_IP }, - { "arp", ETHERTYPE_ARP }, - { (char *)0, 0 } -}; - - -const char *pcap_lib_version(void) -{ - return pcap_version; -} - -int -pcap_findalldevs(pcap_if_t **alldevsp, char *errbuf) -{ - pcap_if_t *top = NULL; -#ifndef linux - struct ifaddrs *i_head, *i; - pcap_if_t *cur; - struct pcap_addr *tail = NULL; - int l; - - D("listing all devs"); - *alldevsp = NULL; - i_head = NULL; - - if (getifaddrs(&i_head)) { - D("cannot get if addresses"); - return -1; - } - for (i = i_head; i; i = i->ifa_next) { - //struct ifaddrs *ifa; - struct pcap_addr *pca; - //struct sockaddr *sa; - - D("got interface %s", i->ifa_name); - if (!top || strcmp(top->name, i->ifa_name)) { - /* new interface */ - l = sizeof(*top) + strlen(i->ifa_name) + 
1; - cur = calloc(1, l); - if (cur == NULL) { - D("no space for if descriptor"); - continue; - } - cur->name = (char *)(cur + 1); - //cur->flags = i->ifa_flags; - strcpy(cur->name, i->ifa_name); - cur->description = NULL; - cur->next = top; - top = cur; - tail = NULL; - } - /* now deal with addresses */ - D("%s addr family %d len %d %s %s", - top->name, - i->ifa_addr->sa_family, i->ifa_addr->sa_len, - i->ifa_netmask ? "Netmask" : "", - i->ifa_broadaddr ? "Broadcast" : ""); - l = sizeof(struct pcap_addr) + - (i->ifa_addr ? i->ifa_addr->sa_len:0) + - (i->ifa_netmask ? i->ifa_netmask->sa_len:0) + - (i->ifa_broadaddr? i->ifa_broadaddr->sa_len:0); - pca = calloc(1, l); - if (pca == NULL) { - D("no space for if addr"); - continue; - } -#define SA_NEXT(x) ((struct sockaddr *)((char *)(x) + (x)->sa_len)) - pca->addr = (struct sockaddr *)(pca + 1); - pkt_copy(i->ifa_addr, pca->addr, i->ifa_addr->sa_len); - if (i->ifa_netmask) { - pca->netmask = SA_NEXT(pca->addr); - bcopy(i->ifa_netmask, pca->netmask, i->ifa_netmask->sa_len); - if (i->ifa_broadaddr) { - pca->broadaddr = SA_NEXT(pca->netmask); - bcopy(i->ifa_broadaddr, pca->broadaddr, i->ifa_broadaddr->sa_len); - } - } - if (tail == NULL) { - top->addresses = pca; - } else { - tail->next = pca; - } - tail = pca; - - } - freeifaddrs(i_head); -#endif /* !linux */ - (void)errbuf; /* UNUSED */ - *alldevsp = top; - return 0; -} - -void pcap_freealldevs(pcap_if_t *alldevs) -{ - (void)alldevs; /* UNUSED */ - D("unimplemented"); -} - -char * -pcap_lookupdev(char *buf) -{ - D("%s", buf); - strcpy(buf, "/dev/netmap"); - return buf; -} - -pcap_t * -pcap_create(const char *source, char *errbuf) -{ - D("src %s (call open liveted)", source); - return pcap_open_live(source, 0, 1, 100, errbuf); -} - -int -pcap_activate(pcap_t *p) -{ - D("pcap %p running", p); - return 0; -} - -int -pcap_can_set_rfmon(pcap_t *p) -{ - (void)p; /* UNUSED */ - D(""); - return 0; /* no we can't */ -} - -int -pcap_set_snaplen(pcap_t *p, int snaplen) -{ - struct pcap_ring *me = p; - - D("len %d", snaplen); - me->snaplen = snaplen; - return 0; -} - -int -pcap_snapshot(pcap_t *p) -{ - struct pcap_ring *me = p; - - D("len %d", me->snaplen); - return me->snaplen; -} - -int -pcap_lookupnet(const char *device, uint32_t *netp, - uint32_t *maskp, char *errbuf) -{ - - (void)errbuf; /* UNUSED */ - D("device %s", device); - inet_aton("10.0.0.255", (struct in_addr *)netp); - inet_aton("255.255.255.0",(struct in_addr *) maskp); - return 0; -} - -int -pcap_set_promisc(pcap_t *p, int promisc) -{ - struct pcap_ring *me = p; - - D("promisc %d", promisc); - if (nm_do_ioctl(&me->me, SIOCGIFFLAGS, 0)) - D("SIOCGIFFLAGS failed"); - if (promisc) { - me->me.if_flags |= IFF_PPROMISC; - } else { - me->me.if_flags &= ~IFF_PPROMISC; - } - if (nm_do_ioctl(&me->me, SIOCSIFFLAGS, 0)) - D("SIOCSIFFLAGS failed"); - return 0; -} - -int -pcap_set_timeout(pcap_t *p, int to_ms) -{ - struct pcap_ring *me = p; - - D("%d ms", to_ms); - me->to_ms = to_ms; - return 0; -} - -struct bpf_program; - -int -pcap_compile(pcap_t *p, struct bpf_program *fp, - const char *str, int optimize, uint32_t netmask) -{ - (void)p; /* UNUSED */ - (void)fp; /* UNUSED */ - (void)optimize; /* UNUSED */ - (void)netmask; /* UNUSED */ - D("%s", str); - return 0; -} - -int -pcap_setfilter(pcap_t *p, struct bpf_program *fp) -{ - (void)p; /* UNUSED */ - (void)fp; /* UNUSED */ - D(""); - return 0; -} - -int -pcap_datalink(pcap_t *p) -{ - (void)p; /* UNUSED */ - D("returns 1"); - return 1; // ethernet -} - -const char * -pcap_datalink_val_to_name(int dlt) -{ 
- D("%d returns DLT_EN10MB", dlt); - return "DLT_EN10MB"; -} - -const char * -pcap_datalink_val_to_description(int dlt) -{ - D("%d returns Ethernet link", dlt); - return "Ethernet link"; -} - -struct pcap_stat; -int -pcap_stats(pcap_t *p, struct pcap_stat *ps) -{ - struct pcap_ring *me = p; - ND(""); - - *ps = me->st; - return 0; /* accumulate from pcap_dispatch() */ -}; - -char * -pcap_geterr(pcap_t *p) -{ - struct pcap_ring *me = p; - - D(""); - return me->msg; -} - -pcap_t * -pcap_open_live(const char *device, int snaplen, - int promisc, int to_ms, char *errbuf) -{ - struct pcap_ring *me; - int l; - - (void)snaplen; /* UNUSED */ - (void)errbuf; /* UNUSED */ - if (!device) { - D("missing device name"); - return NULL; - } - - l = strlen(device) + 1; - D("request to open %s snaplen %d promisc %d timeout %dms", - device, snaplen, promisc, to_ms); - me = calloc(1, sizeof(*me) + l); - if (me == NULL) { - D("failed to allocate struct for %s", device); - return NULL; - } - me->me.ifname = (char *)(me + 1); - strcpy((char *)me->me.ifname, device); - if (netmap_open(&me->me, 0, promisc)) { - D("error opening %s", device); - free(me); - return NULL; - } - me->to_ms = to_ms; - - return (pcap_t *)me; -} - -void -pcap_close(pcap_t *p) -{ - struct my_ring *me = p; - - D(""); - if (!me) - return; - if (me->mem) - munmap(me->mem, me->memsize); - /* restore original flags ? */ - close(me->fd); - bzero(me, sizeof(*me)); - free(me); -} - -int -pcap_fileno(pcap_t *p) -{ - struct my_ring *me = p; - D("returns %d", me->fd); - return me->fd; -} - -int -pcap_get_selectable_fd(pcap_t *p) -{ - struct my_ring *me = p; - - ND(""); - return me->fd; -} - -int -pcap_setnonblock(pcap_t *p, int nonblock, char *errbuf) -{ - (void)p; /* UNUSED */ - (void)errbuf; /* UNUSED */ - D("mode is %d", nonblock); - return 0; /* ignore */ -} - -int -pcap_setdirection(pcap_t *p, pcap_direction_t d) -{ - (void)p; /* UNUSED */ - (void)d; /* UNUSED */ - D(""); - return 0; /* ignore */ -}; - -int -pcap_dispatch(pcap_t *p, int cnt, pcap_handler callback, u_char *user) -{ - struct pcap_ring *pme = p; - struct my_ring *me = &pme->me; - int got = 0; - u_int si; - - ND("cnt %d", cnt); - if (cnt == 0) - cnt = -1; - /* scan all rings */ - for (si = me->begin; si < me->end; si++) { - struct netmap_ring *ring = NETMAP_RXRING(me->nifp, si); - ND("ring has %d pkts", ring->avail); - if (ring->avail == 0) - continue; - pme->hdr.ts = ring->ts; - /* - * XXX a proper prefetch should be done as - * prefetch(i); callback(i-1); ... 
- */ - while ((cnt == -1 || cnt != got) && ring->avail > 0) { - u_int i = ring->cur; - u_int idx = ring->slot[i].buf_idx; - if (idx < 2) { - D("%s bogus RX index %d at offset %d", - me->nifp->ni_name, idx, i); - sleep(2); - } - u_char *buf = (u_char *)NETMAP_BUF(ring, idx); - prefetch(buf); - pme->hdr.len = pme->hdr.caplen = ring->slot[i].len; - // D("call %p len %d", p, me->hdr.len); - callback(user, &pme->hdr, buf); - ring->cur = NETMAP_RING_NEXT(ring, i); - ring->avail--; - got++; - } - } - pme->st.ps_recv += got; - return got; -} - -int -pcap_inject(pcap_t *p, const void *buf, size_t size) -{ - struct my_ring *me = p; - u_int si; - - ND("cnt %d", cnt); - /* scan all rings */ - for (si = me->begin; si < me->end; si++) { - struct netmap_ring *ring = NETMAP_TXRING(me->nifp, si); - - ND("ring has %d pkts", ring->avail); - if (ring->avail == 0) - continue; - u_int i = ring->cur; - u_int idx = ring->slot[i].buf_idx; - if (idx < 2) { - D("%s bogus TX index %d at offset %d", - me->nifp->ni_name, idx, i); - sleep(2); - } - u_char *dst = (u_char *)NETMAP_BUF(ring, idx); - ring->slot[i].len = size; - pkt_copy(buf, dst, size); - ring->cur = NETMAP_RING_NEXT(ring, i); - ring->avail--; - // if (ring->avail == 0) ioctl(me->fd, NIOCTXSYNC, NULL); - return size; - } - errno = ENOBUFS; - return -1; -} - -int -pcap_loop(pcap_t *p, int cnt, pcap_handler callback, u_char *user) -{ - struct pcap_ring *me = p; - struct pollfd fds[1]; - int i; - - ND("cnt %d", cnt); - memset(fds, 0, sizeof(fds)); - fds[0].fd = me->me.fd; - fds[0].events = (POLLIN); - - while (cnt == -1 || cnt > 0) { - if (poll(fds, 1, me->to_ms) <= 0) { - D("poll error/timeout"); - continue; - } - i = pcap_dispatch(p, cnt, callback, user); - if (cnt > 0) - cnt -= i; - } - return 0; -} - -#endif /* !TEST */ - -#ifdef TEST /* build test code */ -void do_send(u_char *user, const struct pcap_pkthdr *h, const u_char *buf) -{ - pcap_inject((pcap_t *)user, buf, h->caplen); -} - -/* - * a simple pcap test program, bridge between two interfaces. - */ -int -main(int argc, char **argv) -{ - pcap_t *p0, *p1; - int burst = 1024; - struct pollfd pollfd[2]; - - fprintf(stderr, "%s %s built %s %s\n", - argv[0], version, __DATE__, __TIME__); - - while (argc > 1 && !strcmp(argv[1], "-v")) { - verbose++; - argv++; - argc--; - } - - if (argc < 3 || argc > 4 || !strcmp(argv[1], argv[2])) { - D("Usage: %s IFNAME1 IFNAME2 [BURST]", argv[0]); - return (1); - } - if (argc > 3) - burst = atoi(argv[3]); - - p0 = pcap_open_live(argv[1], 0, 1, 100, NULL); - p1 = pcap_open_live(argv[2], 0, 1, 100, NULL); - D("%s", version); - D("open returns %p %p", p0, p1); - if (!p0 || !p1) - return(1); - bzero(pollfd, sizeof(pollfd)); - pollfd[0].fd = pcap_fileno(p0); - pollfd[1].fd = pcap_fileno(p1); - pollfd[0].events = pollfd[1].events = POLLIN; - for (;;) { - /* do i need to reset ? */ - pollfd[0].revents = pollfd[1].revents = 0; - int ret = poll(pollfd, 2, 1000); - if (ret <= 0 || verbose) - D("poll %s [0] ev %x %x [1] ev %x %x", - ret <= 0 ? 
"timeout" : "ok", - pollfd[0].events, - pollfd[0].revents, - pollfd[1].events, - pollfd[1].revents); - if (ret < 0) - continue; - if (pollfd[0].revents & POLLIN) - pcap_dispatch(p0, burst, do_send, p1); - if (pollfd[1].revents & POLLIN) - pcap_dispatch(p1, burst, do_send, p0); - } - - return (0); -} -#endif /* TEST */ Property changes on: stable/9/tools/tools/netmap/pcap.c ___________________________________________________________________ Deleted: svn:keywords ## -1 +0,0 ## -FreeBSD=%H \ No newline at end of property Index: stable/9/tools/tools/netmap/click-test.cfg =================================================================== --- stable/9/tools/tools/netmap/click-test.cfg (revision 262152) +++ stable/9/tools/tools/netmap/click-test.cfg (nonexistent) @@ -1,19 +0,0 @@ -// -// $FreeBSD$ -// -// A sample test configuration for click -// -// -// create a switch - -myswitch :: EtherSwitch; - -// two input devices - -c0 :: FromDevice(ix0, PROMISC true); -c1 :: FromDevice(ix1, PROMISC true); - -// and now pass packets around - -c0[0] -> [0]sw[0] -> Queue(10000) -> ToDevice(ix0); -c1[0] -> [1]sw[1] -> Queue(10000) -> ToDevice(ix1); Property changes on: stable/9/tools/tools/netmap/click-test.cfg ___________________________________________________________________ Deleted: svn:keywords ## -1 +0,0 ## -FreeBSD=%H \ No newline at end of property Index: stable/9/tools/tools/netmap/nm_util.c =================================================================== --- stable/9/tools/tools/netmap/nm_util.c (revision 262152) +++ stable/9/tools/tools/netmap/nm_util.c (nonexistent) @@ -1,244 +0,0 @@ -/* - * Copyright (C) 2012-2013 Luigi Rizzo. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -/* - * $FreeBSD$ - * $Id$ - * - * utilities to use netmap devices. 
- * This does the basic functions of opening a device and issuing - * ioctls() - */ - -#include "nm_util.h" - -extern int verbose; - -int -nm_do_ioctl(struct my_ring *me, u_long what, int subcmd) -{ - struct ifreq ifr; - int error; -#if defined( __FreeBSD__ ) || defined (__APPLE__) - int fd = me->fd; -#endif -#ifdef linux - struct ethtool_value eval; - int fd; - fd = socket(AF_INET, SOCK_DGRAM, 0); - if (fd < 0) { - printf("Error: cannot get device control socket.\n"); - return -1; - } -#endif /* linux */ - - (void)subcmd; // unused - bzero(&ifr, sizeof(ifr)); - strncpy(ifr.ifr_name, me->ifname, sizeof(ifr.ifr_name)); - switch (what) { - case SIOCSIFFLAGS: -#ifndef __APPLE__ - ifr.ifr_flagshigh = me->if_flags >> 16; -#endif - ifr.ifr_flags = me->if_flags & 0xffff; - break; - -#if defined( __FreeBSD__ ) - case SIOCSIFCAP: - ifr.ifr_reqcap = me->if_reqcap; - ifr.ifr_curcap = me->if_curcap; - break; -#endif -#ifdef linux - case SIOCETHTOOL: - eval.cmd = subcmd; - eval.data = 0; - ifr.ifr_data = (caddr_t)&eval; - break; -#endif /* linux */ - } - error = ioctl(fd, what, &ifr); - if (error) - goto done; - switch (what) { - case SIOCGIFFLAGS: -#ifndef __APPLE__ - me->if_flags = (ifr.ifr_flagshigh << 16) | - (0xffff & ifr.ifr_flags); -#endif - if (verbose) - D("flags are 0x%x", me->if_flags); - break; - -#if defined( __FreeBSD__ ) - case SIOCGIFCAP: - me->if_reqcap = ifr.ifr_reqcap; - me->if_curcap = ifr.ifr_curcap; - if (verbose) - D("curcap are 0x%x", me->if_curcap); - break; -#endif /* __FreeBSD__ */ - } -done: -#ifdef linux - close(fd); -#endif - if (error) - D("ioctl error %d %lu", error, what); - return error; -} - -/* - * open a device. if me->mem is null then do an mmap. - * Returns the file descriptor. - * The extra flag checks configures promisc mode. - */ -int -netmap_open(struct my_ring *me, int ringid, int promisc) -{ - int fd, err, l; - struct nmreq req; - - me->fd = fd = open("/dev/netmap", O_RDWR); - if (fd < 0) { - D("Unable to open /dev/netmap"); - return (-1); - } - bzero(&req, sizeof(req)); - req.nr_version = NETMAP_API; - strncpy(req.nr_name, me->ifname, sizeof(req.nr_name)); - req.nr_ringid = ringid; - err = ioctl(fd, NIOCREGIF, &req); - if (err) { - D("Unable to register %s", me->ifname); - goto error; - } - me->memsize = l = req.nr_memsize; - if (verbose) - D("memsize is %d MB", l>>20); - - if (me->mem == NULL) { - me->mem = mmap(0, l, PROT_WRITE | PROT_READ, MAP_SHARED, fd, 0); - if (me->mem == MAP_FAILED) { - D("Unable to mmap"); - me->mem = NULL; - goto error; - } - } - - - /* Set the operating mode. */ - if (ringid != NETMAP_SW_RING) { - nm_do_ioctl(me, SIOCGIFFLAGS, 0); - if ((me[0].if_flags & IFF_UP) == 0) { - D("%s is down, bringing up...", me[0].ifname); - me[0].if_flags |= IFF_UP; - } - if (promisc) { - me[0].if_flags |= IFF_PPROMISC; - nm_do_ioctl(me, SIOCSIFFLAGS, 0); - } - -#ifdef __FreeBSD__ - /* also disable checksums etc. */ - nm_do_ioctl(me, SIOCGIFCAP, 0); - me[0].if_reqcap = me[0].if_curcap; - me[0].if_reqcap &= ~(IFCAP_HWCSUM | IFCAP_TSO | IFCAP_TOE); - nm_do_ioctl(me+0, SIOCSIFCAP, 0); -#endif -#ifdef linux - /* disable: - * - generic-segmentation-offload - * - tcp-segmentation-offload - * - rx-checksumming - * - tx-checksumming - * XXX check how to set back the caps. 
- */ - nm_do_ioctl(me, SIOCETHTOOL, ETHTOOL_SGSO); - nm_do_ioctl(me, SIOCETHTOOL, ETHTOOL_STSO); - nm_do_ioctl(me, SIOCETHTOOL, ETHTOOL_SRXCSUM); - nm_do_ioctl(me, SIOCETHTOOL, ETHTOOL_STXCSUM); -#endif /* linux */ - } - - me->nifp = NETMAP_IF(me->mem, req.nr_offset); - me->queueid = ringid; - if (ringid & NETMAP_SW_RING) { - me->begin = req.nr_rx_rings; - me->end = me->begin + 1; - me->tx = NETMAP_TXRING(me->nifp, req.nr_tx_rings); - me->rx = NETMAP_RXRING(me->nifp, req.nr_rx_rings); - } else if (ringid & NETMAP_HW_RING) { - D("XXX check multiple threads"); - me->begin = ringid & NETMAP_RING_MASK; - me->end = me->begin + 1; - me->tx = NETMAP_TXRING(me->nifp, me->begin); - me->rx = NETMAP_RXRING(me->nifp, me->begin); - } else { - me->begin = 0; - me->end = req.nr_rx_rings; // XXX max of the two - me->tx = NETMAP_TXRING(me->nifp, 0); - me->rx = NETMAP_RXRING(me->nifp, 0); - } - return (0); -error: - close(me->fd); - return -1; -} - - -int -netmap_close(struct my_ring *me) -{ - D(""); - if (me->mem) - munmap(me->mem, me->memsize); - close(me->fd); - return (0); -} - - -/* - * how many packets on this set of queues ? - */ -int -pkt_queued(struct my_ring *me, int tx) -{ - u_int i, tot = 0; - - ND("me %p begin %d end %d", me, me->begin, me->end); - for (i = me->begin; i < me->end; i++) { - struct netmap_ring *ring = tx ? - NETMAP_TXRING(me->nifp, i) : NETMAP_RXRING(me->nifp, i); - tot += ring->avail; - } - if (0 && verbose && tot && !tx) - D("ring %s %s %s has %d avail at %d", - me->ifname, tx ? "tx": "rx", - me->end >= me->nifp->ni_tx_rings ? // XXX who comes first ? - "host":"net", - tot, NETMAP_TXRING(me->nifp, me->begin)->cur); - return tot; -} Property changes on: stable/9/tools/tools/netmap/nm_util.c ___________________________________________________________________ Deleted: svn:keywords ## -1 +0,0 ## -FreeBSD=%H \ No newline at end of property Index: stable/9/tools/tools/netmap/nm_util.h =================================================================== --- stable/9/tools/tools/netmap/nm_util.h (revision 262152) +++ stable/9/tools/tools/netmap/nm_util.h (nonexistent) @@ -1,183 +0,0 @@ -/* - * Copyright (C) 2012 Luigi Rizzo. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. 
- */ - -/* - * $FreeBSD$ - * $Id$ - * - * Some utilities to build netmap-based programs. - */ - -#ifndef _NM_UTIL_H -#define _NM_UTIL_H -#include -#include /* signal */ -#include -#include -#include /* PRI* macros */ -#include /* strcmp */ -#include /* open */ -#include /* close */ -#include /* getifaddrs */ - -#include /* PROT_* */ -#include /* ioctl */ -#include -#include /* sockaddr.. */ -#include /* ntohs */ -#include -#include /* sysctl */ -#include /* timersub */ - -#include -#include /* ifreq */ - -#include -#include -#include - -#include -#include - -#ifndef MY_PCAP /* use the system's pcap if available */ - -#ifdef NO_PCAP -#define PCAP_ERRBUF_SIZE 512 -typedef void pcap_t; -struct pcap_pkthdr; -#define pcap_inject(a,b,c) ((void)a, (void)b, (void)c, -1) -#define pcap_dispatch(a, b, c, d) (void)c -#define pcap_open_live(a, b, c, d, e) ((void)e, NULL) -#else /* !NO_PCAP */ -#include // XXX do we need it ? -#endif /* !NO_PCAP */ - -#endif // XXX hack - -#include /* pthread_* */ - -#ifdef linux -#define ifr_flagshigh ifr_flags -#define ifr_curcap ifr_flags -#define ifr_reqcap ifr_flags -#define IFF_PPROMISC IFF_PROMISC -#include -#include - -#define CLOCK_REALTIME_PRECISE CLOCK_REALTIME -#include /* ether_aton */ -#include /* sockaddr_ll */ -#endif /* linux */ - -#ifdef __FreeBSD__ -#include /* le64toh */ -#include - -#include /* pthread w/ affinity */ -#include /* cpu_set */ -#include /* LLADDR */ -#endif /* __FreeBSD__ */ - -#ifdef __APPLE__ -#define ifr_flagshigh ifr_flags // XXX -#define IFF_PPROMISC IFF_PROMISC -#include /* LLADDR */ -#define clock_gettime(a,b) \ - do {struct timespec t0 = {0,0}; *(b) = t0; } while (0) -#endif /* __APPLE__ */ - -static inline int min(int a, int b) { return a < b ? a : b; } -extern int time_second; - -/* debug support */ -#define ND(format, ...) do {} while(0) -#define D(format, ...) \ - fprintf(stderr, "%s [%d] " format "\n", \ - __FUNCTION__, __LINE__, ##__VA_ARGS__) - -#define RD(lps, format, ...) \ - do { \ - static int t0, cnt; \ - if (t0 != time_second) { \ - t0 = time_second; \ - cnt = 0; \ - } \ - if (cnt++ < lps) \ - D(format, ##__VA_ARGS__); \ - } while (0) - - - -// XXX does it work on 32-bit machines ? -static inline void prefetch (const void *x) -{ - __asm volatile("prefetcht0 %0" :: "m" (*(const unsigned long *)x)); -} - -// XXX only for multiples of 64 bytes, non overlapped. 
-static inline void -pkt_copy(const void *_src, void *_dst, int l) -{ - const uint64_t *src = _src; - uint64_t *dst = _dst; -#define likely(x) __builtin_expect(!!(x), 1) -#define unlikely(x) __builtin_expect(!!(x), 0) - if (unlikely(l >= 1024)) { - bcopy(src, dst, l); - return; - } - for (; l > 0; l-=64) { - *dst++ = *src++; - *dst++ = *src++; - *dst++ = *src++; - *dst++ = *src++; - *dst++ = *src++; - *dst++ = *src++; - *dst++ = *src++; - *dst++ = *src++; - } -} - -/* - * info on a ring we handle - */ -struct my_ring { - const char *ifname; - int fd; - char *mem; /* userspace mmap address */ - u_int memsize; - u_int queueid; - u_int begin, end; /* first..last+1 rings to check */ - struct netmap_if *nifp; - struct netmap_ring *tx, *rx; /* shortcuts */ - - uint32_t if_flags; - uint32_t if_reqcap; - uint32_t if_curcap; -}; -int netmap_open(struct my_ring *me, int ringid, int promisc); -int netmap_close(struct my_ring *me); -int nm_do_ioctl(struct my_ring *me, u_long what, int subcmd); -#endif /* _NM_UTIL_H */ Property changes on: stable/9/tools/tools/netmap/nm_util.h ___________________________________________________________________ Deleted: svn:keywords ## -1 +0,0 ## -FreeBSD=%H \ No newline at end of property Index: stable/9/tools/tools/netmap/Makefile =================================================================== --- stable/9/tools/tools/netmap/Makefile (revision 262152) +++ stable/9/tools/tools/netmap/Makefile (revision 262153) @@ -1,28 +1,32 @@ # # $FreeBSD$ # # For multiple programs using a single source file each, # we can just define 'progs' and create custom targets. -PROGS = pkt-gen bridge vale-ctl testpcap libnetmap.so +PROGS = pkt-gen bridge vale-ctl -CLEANFILES = $(PROGS) pcap.o nm_util.o +CLEANFILES = $(PROGS) *.o NO_MAN= -CFLAGS += -Werror -Wall -nostdinc -I/usr/include -I../../../sys +CFLAGS += -Werror -Wall # -nostdinc -I/usr/include -I../../../sys CFLAGS += -Wextra -LDFLAGS += -lpthread -lpcap +LDFLAGS += -lpthread +.ifdef WITHOUT_PCAP +CFLAGS += -DNO_PCAP +.else +LDFLAGS += -lpcap +.endif .include .include all: $(PROGS) -pkt-gen bridge: nm_util.o - $(CC) $(CFLAGS) -o ${.TARGET} ${.TARGET:=.c} nm_util.o $(LDFLAGS) +pkt-gen: pkt-gen.o + $(CC) $(CFLAGS) -o pkt-gen pkt-gen.o $(LDFLAGS) -testpcap: pcap.c libnetmap.so - $(CC) $(CFLAGS) -DTEST -L. -lnetmap -o ${.TARGET} pcap.c - -libnetmap.so: pcap.c nm_util.c - $(CC) $(CFLAGS) -fpic -c ${.ALLSRC} - $(CC) -shared -o ${.TARGET} ${.ALLSRC:.c=.o} +bridge: bridge.o + $(CC) $(CFLAGS) -o bridge bridge.o + +vale-ctl: vale-ctl.o + $(CC) $(CFLAGS) -o vale-ctl vale-ctl.o Index: stable/9/tools/tools/netmap/README =================================================================== --- stable/9/tools/tools/netmap/README (revision 262152) +++ stable/9/tools/tools/netmap/README (revision 262153) @@ -1,24 +1,9 @@ $FreeBSD$ This directory contains examples that use netmap pkt-gen a packet sink/source using the netmap API bridge a two-port jumper wire, also using the native API - testpcap a jumper wire using libnetmap (or libpcap) - - click* various click examples - ------------------------------------------------------------- -Some performance data as of may 2012 for applications using libpcap. 
-Throughput is generally in Mpps computed with the 64-byte frames, -using 1 core on a 2.9GHz CPU and 10Gbit/s interface - -Libpcap version -- Application --------------------- -BSD netmap ---------------------------------------------------- - 0.77 3.82 ports/trafshow (version 5) - 0.94 7.7 net-mgmt/ipcad (ip accounting daemon) - 0.9 5.0 net-mgmt/darkstat (ip accounting + graphing) - 0.83 2.45 net-mgmt/iftop (curses traffic display) + vale-ctl the program to control VALE bridges Index: stable/9/tools/tools/netmap/bridge.c =================================================================== --- stable/9/tools/tools/netmap/bridge.c (revision 262152) +++ stable/9/tools/tools/netmap/bridge.c (revision 262153) @@ -1,314 +1,317 @@ /* - * (C) 2011 Luigi Rizzo, Matteo Landi + * (C) 2011-2014 Luigi Rizzo, Matteo Landi * * BSD license * * A netmap client to bridge two network interfaces * (or one interface and the host stack). * * $FreeBSD$ */ -#include "nm_util.h" +#include +#define NETMAP_WITH_LIBS +#include +#include - int verbose = 0; -char *version = "$Id$"; - static int do_abort = 0; +static int zerocopy = 1; /* enable zerocopy if possible */ static void sigint_h(int sig) { (void)sig; /* UNUSED */ do_abort = 1; signal(SIGINT, SIG_DFL); } /* + * how many packets on this set of queues ? + */ +int +pkt_queued(struct nm_desc *d, int tx) +{ + u_int i, tot = 0; + + if (tx) { + for (i = d->first_tx_ring; i <= d->last_tx_ring; i++) { + tot += nm_ring_space(NETMAP_TXRING(d->nifp, i)); + } + } else { + for (i = d->first_rx_ring; i <= d->last_rx_ring; i++) { + tot += nm_ring_space(NETMAP_RXRING(d->nifp, i)); + } + } + return tot; +} + +/* * move up to 'limit' pkts from rxring to txring swapping buffers. */ static int process_rings(struct netmap_ring *rxring, struct netmap_ring *txring, u_int limit, const char *msg) { u_int j, k, m = 0; /* print a warning if any of the ring flags is set (e.g. NM_REINIT) */ if (rxring->flags || txring->flags) D("%s rxflags %x txflags %x", msg, rxring->flags, txring->flags); j = rxring->cur; /* RX */ k = txring->cur; /* TX */ - if (rxring->avail < limit) - limit = rxring->avail; - if (txring->avail < limit) - limit = txring->avail; + m = nm_ring_space(rxring); + if (m < limit) + limit = m; + m = nm_ring_space(txring); + if (m < limit) + limit = m; m = limit; while (limit-- > 0) { struct netmap_slot *rs = &rxring->slot[j]; struct netmap_slot *ts = &txring->slot[k]; -#ifdef NO_SWAP - char *rxbuf = NETMAP_BUF(rxring, rs->buf_idx); - char *txbuf = NETMAP_BUF(txring, ts->buf_idx); -#else - uint32_t pkt; -#endif /* swap packets */ if (ts->buf_idx < 2 || rs->buf_idx < 2) { D("wrong index rx[%d] = %d -> tx[%d] = %d", j, rs->buf_idx, k, ts->buf_idx); sleep(2); } -#ifndef NO_SWAP - pkt = ts->buf_idx; - ts->buf_idx = rs->buf_idx; - rs->buf_idx = pkt; -#endif /* copy the packet length. */ - if (rs->len < 14 || rs->len > 2048) + if (rs->len > 2048) { D("wrong len %d rx[%d] -> tx[%d]", rs->len, j, k); - else if (verbose > 1) + rs->len = 0; + } else if (verbose > 1) { D("%s send len %d rx[%d] -> tx[%d]", msg, rs->len, j, k); + } ts->len = rs->len; -#ifdef NO_SWAP - pkt_copy(rxbuf, txbuf, ts->len); -#else - /* report the buffer change. */ - ts->flags |= NS_BUF_CHANGED; - rs->flags |= NS_BUF_CHANGED; -#endif /* NO_SWAP */ - j = NETMAP_RING_NEXT(rxring, j); - k = NETMAP_RING_NEXT(txring, k); + if (zerocopy) { + uint32_t pkt = ts->buf_idx; + ts->buf_idx = rs->buf_idx; + rs->buf_idx = pkt; + /* report the buffer change. 
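For reference, the helpers used above in place of the removed avail field are, modulo likely()/unlikely() annotations, the inline functions of <net/netmap_user.h>; the slots [cur, tail) belong to userspace, so the available "space" is just the circular distance between the two indexes:

static inline int
nm_ring_empty(struct netmap_ring *r)
{
	return (r->cur == r->tail);	/* nothing left for userspace */
}

static inline uint32_t
nm_ring_next(struct netmap_ring *r, uint32_t i)
{
	return (i + 1 == r->num_slots ? 0 : i + 1);	/* circular advance */
}

static inline uint32_t
nm_ring_space(struct netmap_ring *ring)
{
	int ret = ring->tail - ring->cur;

	if (ret < 0)
		ret += ring->num_slots;	/* the window wrapped around */
	return (uint32_t)ret;
}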
*/ + ts->flags |= NS_BUF_CHANGED; + rs->flags |= NS_BUF_CHANGED; + } else { + char *rxbuf = NETMAP_BUF(rxring, rs->buf_idx); + char *txbuf = NETMAP_BUF(txring, ts->buf_idx); + nm_pkt_copy(rxbuf, txbuf, ts->len); + } + j = nm_ring_next(rxring, j); + k = nm_ring_next(txring, k); } - rxring->avail -= m; - txring->avail -= m; - rxring->cur = j; - txring->cur = k; + rxring->head = rxring->cur = j; + txring->head = txring->cur = k; if (verbose && m > 0) D("%s sent %d packets to %p", msg, m, txring); return (m); } /* move packts from src to destination */ static int -move(struct my_ring *src, struct my_ring *dst, u_int limit) +move(struct nm_desc *src, struct nm_desc *dst, u_int limit) { struct netmap_ring *txring, *rxring; - u_int m = 0, si = src->begin, di = dst->begin; - const char *msg = (src->queueid & NETMAP_SW_RING) ? + u_int m = 0, si = src->first_rx_ring, di = dst->first_tx_ring; + const char *msg = (src->req.nr_ringid & NETMAP_SW_RING) ? "host->net" : "net->host"; - while (si < src->end && di < dst->end) { + while (si <= src->last_rx_ring && di <= dst->last_tx_ring) { rxring = NETMAP_RXRING(src->nifp, si); txring = NETMAP_TXRING(dst->nifp, di); ND("txring %p rxring %p", txring, rxring); - if (rxring->avail == 0) { + if (nm_ring_empty(rxring)) { si++; continue; } - if (txring->avail == 0) { + if (nm_ring_empty(txring)) { di++; continue; } m += process_rings(rxring, txring, limit, msg); } return (m); } -/* - * how many packets on this set of queues ? - */ -static int -pkt_queued(struct my_ring *me, int tx) -{ - u_int i, tot = 0; - ND("me %p begin %d end %d", me, me->begin, me->end); - for (i = me->begin; i < me->end; i++) { - struct netmap_ring *ring = tx ? - NETMAP_TXRING(me->nifp, i) : NETMAP_RXRING(me->nifp, i); - tot += ring->avail; - } - if (0 && verbose && tot && !tx) - D("ring %s %s %s has %d avail at %d", - me->ifname, tx ? "tx": "rx", - me->end >= me->nifp->ni_tx_rings ? // XXX who comes first ? - "host":"net", - tot, NETMAP_TXRING(me->nifp, me->begin)->cur); - return tot; -} - static void usage(void) { fprintf(stderr, "usage: bridge [-v] [-i ifa] [-i ifb] [-b burst] [-w wait_time] [iface]\n"); exit(1); } /* * bridge [-v] if1 [if2] * * If only one name, or the two interfaces are the same, * bridges userland and the adapter. Otherwise bridge * two intefaces. 
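Putting the new head/cur/tail protocol together, a canonical consumer of one rx ring looks like the sketch below; consume() stands for a hypothetical application callback, and returning slots to the kernel is just a matter of advancing head (and cur) past them, exactly as process_rings() does above:

/* assumes the nm_ring_* helpers from <net/netmap_user.h> */
static void
drain_ring(struct netmap_ring *ring)
{
	while (!nm_ring_empty(ring)) {
		struct netmap_slot *slot = &ring->slot[ring->cur];

		/* consume() is a hypothetical application callback */
		consume(NETMAP_BUF(ring, slot->buf_idx), slot->len);
		/* give the slot back to the kernel */
		ring->head = ring->cur = nm_ring_next(ring, ring->cur);
	}
}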
*/ int main(int argc, char **argv) { struct pollfd pollfd[2]; - int i, ch; + int ch; u_int burst = 1024, wait_link = 4; - struct my_ring me[2]; + struct nm_desc *pa = NULL, *pb = NULL; char *ifa = NULL, *ifb = NULL; + char ifabuf[64] = { 0 }; - fprintf(stderr, "%s %s built %s %s\n", - argv[0], version, __DATE__, __TIME__); + fprintf(stderr, "%s built %s %s\n", + argv[0], __DATE__, __TIME__); - bzero(me, sizeof(me)); - - while ( (ch = getopt(argc, argv, "b:i:vw:")) != -1) { + while ( (ch = getopt(argc, argv, "b:ci:vw:")) != -1) { switch (ch) { default: D("bad option %c %s", ch, optarg); usage(); break; case 'b': /* burst */ burst = atoi(optarg); break; case 'i': /* interface */ if (ifa == NULL) ifa = optarg; else if (ifb == NULL) ifb = optarg; else D("%s ignored, already have 2 interfaces", optarg); break; + case 'c': + zerocopy = 0; /* do not zerocopy */ + break; case 'v': verbose++; break; case 'w': wait_link = atoi(optarg); break; } } argc -= optind; argv += optind; if (argc > 1) ifa = argv[1]; if (argc > 2) ifb = argv[2]; if (argc > 3) burst = atoi(argv[3]); if (!ifb) ifb = ifa; if (!ifa) { D("missing interface"); usage(); } if (burst < 1 || burst > 8192) { D("invalid burst %d, set to 1024", burst); burst = 1024; } if (wait_link > 100) { D("invalid wait_link %d, set to 4", wait_link); wait_link = 4; } - /* setup netmap interface #1. */ - me[0].ifname = ifa; - me[1].ifname = ifb; if (!strcmp(ifa, ifb)) { D("same interface, endpoint 0 goes to host"); - i = NETMAP_SW_RING; + snprintf(ifabuf, sizeof(ifabuf) - 1, "%s^", ifa); + ifa = ifabuf; } else { /* two different interfaces. Take all rings on if1 */ - i = 0; // all hw rings } - if (netmap_open(me, i, 1)) + pa = nm_open(ifa, NULL, 0, NULL); + if (pa == NULL) { + D("cannot open %s", ifa); return (1); - me[1].mem = me[0].mem; /* copy the pointer, so only one mmap */ - if (netmap_open(me+1, 0, 1)) + } + // XXX use a single mmap ? + pb = nm_open(ifb, NULL, NM_OPEN_NO_MMAP, pa); + if (pb == NULL) { + D("cannot open %s", ifb); + nm_close(pa); return (1); + } + zerocopy = zerocopy && (pa->mem == pb->mem); + D("------- zerocopy %ssupported", zerocopy ? "" : "NOT "); /* setup poll(2) variables. */ memset(pollfd, 0, sizeof(pollfd)); - for (i = 0; i < 2; i++) { - pollfd[i].fd = me[i].fd; - pollfd[i].events = (POLLIN); - } + pollfd[0].fd = pa->fd; + pollfd[1].fd = pb->fd; D("Wait %d secs for link to come up...", wait_link); sleep(wait_link); D("Ready to go, %s 0x%x/%d <-> %s 0x%x/%d.", - me[0].ifname, me[0].queueid, me[0].nifp->ni_rx_rings, - me[1].ifname, me[1].queueid, me[1].nifp->ni_rx_rings); + pa->req.nr_name, pa->first_rx_ring, pa->req.nr_rx_rings, + pb->req.nr_name, pb->first_rx_ring, pb->req.nr_rx_rings); /* main loop */ signal(SIGINT, sigint_h); while (!do_abort) { int n0, n1, ret; pollfd[0].events = pollfd[1].events = 0; pollfd[0].revents = pollfd[1].revents = 0; - n0 = pkt_queued(me, 0); - n1 = pkt_queued(me + 1, 0); + n0 = pkt_queued(pa, 0); + n1 = pkt_queued(pb, 0); if (n0) pollfd[1].events |= POLLOUT; else pollfd[0].events |= POLLIN; if (n1) pollfd[0].events |= POLLOUT; else pollfd[1].events |= POLLIN; ret = poll(pollfd, 2, 2500); if (ret <= 0 || verbose) D("poll %s [0] ev %x %x rx %d@%d tx %d," " [1] ev %x %x rx %d@%d tx %d", ret <= 0 ? 
"timeout" : "ok", pollfd[0].events, pollfd[0].revents, - pkt_queued(me, 0), - me[0].rx->cur, - pkt_queued(me, 1), + pkt_queued(pa, 0), + NETMAP_RXRING(pa->nifp, pa->cur_rx_ring)->cur, + pkt_queued(pa, 1), pollfd[1].events, pollfd[1].revents, - pkt_queued(me+1, 0), - me[1].rx->cur, - pkt_queued(me+1, 1) + pkt_queued(pb, 0), + NETMAP_RXRING(pb->nifp, pb->cur_rx_ring)->cur, + pkt_queued(pb, 1) ); if (ret < 0) continue; if (pollfd[0].revents & POLLERR) { - D("error on fd0, rxcur %d@%d", - me[0].rx->avail, me[0].rx->cur); + struct netmap_ring *rx = NETMAP_RXRING(pa->nifp, pa->cur_rx_ring); + D("error on fd0, rx [%d,%d,%d)", + rx->head, rx->cur, rx->tail); } if (pollfd[1].revents & POLLERR) { - D("error on fd1, rxcur %d@%d", - me[1].rx->avail, me[1].rx->cur); + struct netmap_ring *rx = NETMAP_RXRING(pb->nifp, pb->cur_rx_ring); + D("error on fd1, rx [%d,%d,%d)", + rx->head, rx->cur, rx->tail); } if (pollfd[0].revents & POLLOUT) { - move(me + 1, me, burst); + move(pb, pa, burst); // XXX we don't need the ioctl */ // ioctl(me[0].fd, NIOCTXSYNC, NULL); } if (pollfd[1].revents & POLLOUT) { - move(me, me + 1, burst); + move(pa, pb, burst); // XXX we don't need the ioctl */ // ioctl(me[1].fd, NIOCTXSYNC, NULL); } } D("exiting"); - netmap_close(me + 1); - netmap_close(me + 0); + nm_close(pb); + nm_close(pa); return (0); } Index: stable/9/tools/tools/netmap/pkt-gen.c =================================================================== --- stable/9/tools/tools/netmap/pkt-gen.c (revision 262152) +++ stable/9/tools/tools/netmap/pkt-gen.c (revision 262153) @@ -1,1813 +1,1897 @@ /* - * Copyright (C) 2011-2012 Matteo Landi, Luigi Rizzo. All rights reserved. + * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved. + * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * $FreeBSD$ - * $Id: pkt-gen.c 12024 2013-01-25 05:41:51Z luigi $ + * $Id: pkt-gen.c 12346 2013-06-12 17:36:25Z luigi $ * * Example program to show how to build a multithreaded packet * source/sink using the netmap device. * * In this example we create a programmable number of threads * to take care of all the queues of the interface used to * send or receive traffic. 
* */ -#include "nm_util.h" +#define _GNU_SOURCE /* for CPU_SET() */ +#include <stdio.h> +#define NETMAP_WITH_LIBS +#include <net/netmap_user.h> + #include <ctype.h> // isprint() +#include <unistd.h> // sysconf() +#include +#include <arpa/inet.h> /* ntohs */ +#include <sys/sysctl.h> /* sysctl */ +#include <ifaddrs.h> /* getifaddrs */ +#include +#include +#include +#include +#include + +#ifndef NO_PCAP +#include <pcap.h> +#endif + +#ifdef linux + +#define cpuset_t cpu_set_t + +#define ifr_flagshigh ifr_flags /* only the low 16 bits here */ +#define IFF_PPROMISC IFF_PROMISC /* IFF_PPROMISC does not exist */ +#include +#include + +#define CLOCK_REALTIME_PRECISE CLOCK_REALTIME +#include <netinet/ether.h> /* ether_aton */ +#include <linux/if_packet.h> /* sockaddr_ll */ +#endif /* linux */ + +#ifdef __FreeBSD__ +#include <sys/endian.h> /* le64toh */ +#include + +#include <pthread_np.h> /* pthread w/ affinity */ +#include <sys/cpuset.h> /* cpu_set */ +#include <net/if_dl.h> /* LLADDR */ +#endif /* __FreeBSD__ */ + +#ifdef __APPLE__ + +#define cpuset_t uint64_t // XXX +static inline void CPU_ZERO(cpuset_t *p) +{ + *p = 0; +} + +static inline void CPU_SET(uint32_t i, cpuset_t *p) +{ + *p |= 1<< (i & 0x3f); +} + +#define pthread_setaffinity_np(a, b, c) ((void)a, 0) + +#define ifr_flagshigh ifr_flags // XXX +#define IFF_PPROMISC IFF_PROMISC +#include <net/if_dl.h> /* LLADDR */ +#define clock_gettime(a,b) \ + do {struct timespec t0 = {0,0}; *(b) = t0; } while (0) +#endif /* __APPLE__ */ + const char *default_payload="netmap pkt-gen DIRECT payload\n" "http://info.iet.unipi.it/~luigi/netmap/ "; const char *indirect_payload="netmap pkt-gen indirect payload\n" "http://info.iet.unipi.it/~luigi/netmap/ "; -int time_second; // support for RD() debugging macro - int verbose = 0; -#define SKIP_PAYLOAD 1 /* do not check payload. */ +#define SKIP_PAYLOAD 1 /* do not check payload. XXX unused */ + +#define VIRT_HDR_1 10 /* length of a base vnet-hdr */ +#define VIRT_HDR_2 12 /* length of the extended vnet-hdr */ +#define VIRT_HDR_MAX VIRT_HDR_2 +struct virt_header { + uint8_t fields[VIRT_HDR_MAX]; +}; + struct pkt { + struct virt_header vh; struct ether_header eh; struct ip ip; struct udphdr udp; uint8_t body[2048]; // XXX hardwired } __attribute__((__packed__)); struct ip_range { char *name; uint32_t start, end; /* same as struct in_addr */ uint16_t port0, port1; }; struct mac_range { char *name; struct ether_addr start, end; }; +/* ifname can be netmap:foo-xxxx */ +#define MAX_IFNAMELEN 64 /* our buffer for ifname */ /* * global arguments for all threads */ struct glob_arg { struct ip_range src_ip; struct ip_range dst_ip; struct mac_range dst_mac; struct mac_range src_mac; int pkt_size; int burst; int forever; int npackets; /* total packets to send */ int frags; /* fragments per packet */ int nthreads; int cpus; int options; /* testing */ #define OPT_PREFETCH 1 #define OPT_ACCESS 2 #define OPT_COPY 4 #define OPT_MEMCPY 8 #define OPT_TS 16 /* add a timestamp */ #define OPT_INDIRECT 32 /* use indirect buffers, tx only */ #define OPT_DUMP 64 /* dump rx/tx traffic */ int dev_type; +#ifndef NO_PCAP pcap_t *p; +#endif int tx_rate; struct timespec tx_period; int affinity; int main_fd; - int report_interval; + struct nm_desc *nmd; + uint64_t nmd_flags; + int report_interval; /* milliseconds between prints */ void *(*td_body)(void *); void *mmap_addr; - int mmap_size; - char *ifname; + char ifname[MAX_IFNAMELEN]; char *nmr_config; int dummy_send; + int virt_header; /* send also the virt_header */ + int extra_bufs; /* goes in nr_arg3 */ }; enum dev_type { DEV_NONE, DEV_NETMAP, DEV_PCAP, DEV_TAP }; /* * Arguments for a new thread.
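The VIRT_HDR_1/VIRT_HDR_2 lengths defined above match the virtio specification: the base struct virtio_net_hdr is 10 bytes, and the variant used with mergeable rx buffers appends a 16-bit num_buffers field for a total of 12. A sketch of the layout that the opaque fields[] array stands in for (field names from the virtio spec, not from this program):

struct virt_header_layout {	/* illustration only */
	uint8_t  flags;
	uint8_t  gso_type;
	uint16_t hdr_len;
	uint16_t gso_size;
	uint16_t csum_start;
	uint16_t csum_offset;	/* 10 bytes up to here: VIRT_HDR_1 */
	uint16_t num_buffers;	/* 12-byte variant only: VIRT_HDR_2 */
};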
The same structure is used by * the source and the sink */ struct targ { struct glob_arg *g; int used; int completed; int cancel; int fd; - struct nmreq nmr; - struct netmap_if *nifp; - uint16_t qfirst, qlast; /* range of queues to scan */ + struct nm_desc *nmd; volatile uint64_t count; struct timespec tic, toc; int me; pthread_t thread; int affinity; struct pkt pkt; }; /* * extract the extremes from a range of ipv4 addresses. * addr_lo[-addr_hi][:port_lo[-port_hi]] */ static void extract_ip_range(struct ip_range *r) { char *ap, *pp; struct in_addr a; - D("extract IP range from %s", r->name); + if (verbose) + D("extract IP range from %s", r->name); r->port0 = r->port1 = 0; r->start = r->end = 0; /* the first - splits start/end of range */ ap = index(r->name, '-'); /* do we have ports ? */ if (ap) { *ap++ = '\0'; } /* grab the initial values (mandatory) */ pp = index(r->name, ':'); if (pp) { *pp++ = '\0'; r->port0 = r->port1 = strtol(pp, NULL, 0); }; inet_aton(r->name, &a); r->start = r->end = ntohl(a.s_addr); if (ap) { pp = index(ap, ':'); if (pp) { *pp++ = '\0'; - if (*pp) + if (*pp) r->port1 = strtol(pp, NULL, 0); } if (*ap) { inet_aton(ap, &a); r->end = ntohl(a.s_addr); } } if (r->port0 > r->port1) { uint16_t tmp = r->port0; r->port0 = r->port1; r->port1 = tmp; } if (r->start > r->end) { uint32_t tmp = r->start; r->start = r->end; r->end = tmp; } { struct in_addr a; char buf1[16]; // one ip address a.s_addr = htonl(r->end); strncpy(buf1, inet_ntoa(a), sizeof(buf1)); a.s_addr = htonl(r->start); - D("range is %s:%d to %s:%d", + if (1) + D("range is %s:%d to %s:%d", inet_ntoa(a), r->port0, buf1, r->port1); } } static void extract_mac_range(struct mac_range *r) { - D("extract MAC range from %s", r->name); + if (verbose) + D("extract MAC range from %s", r->name); bcopy(ether_aton(r->name), &r->start, 6); bcopy(ether_aton(r->name), &r->end, 6); #if 0 bcopy(targ->src_mac, eh->ether_shost, 6); p = index(targ->g->src_mac, '-'); if (p) targ->src_mac_range = atoi(p+1); bcopy(ether_aton(targ->g->dst_mac), targ->dst_mac, 6); bcopy(targ->dst_mac, eh->ether_dhost, 6); p = index(targ->g->dst_mac, '-'); if (p) targ->dst_mac_range = atoi(p+1); #endif - D("%s starts at %s", r->name, ether_ntoa(&r->start)); + if (verbose) + D("%s starts at %s", r->name, ether_ntoa(&r->start)); } static struct targ *targs; static int global_nthreads; /* control-C handler */ static void sigint_h(int sig) { int i; (void)sig; /* UNUSED */ for (i = 0; i < global_nthreads; i++) { targs[i].cancel = 1; } signal(SIGINT, SIG_DFL); } /* sysctl wrapper to return the number of active CPUs */ static int system_ncpus(void) { -#ifdef __FreeBSD__ - int mib[2], ncpus; - size_t len; - - mib[0] = CTL_HW; - mib[1] = HW_NCPU; - len = sizeof(mib); + int ncpus; +#if defined (__FreeBSD__) + int mib[2] = { CTL_HW, HW_NCPU }; + size_t len = sizeof(mib); sysctl(mib, 2, &ncpus, &len, NULL, 0); - +#elif defined(linux) + ncpus = sysconf(_SC_NPROCESSORS_ONLN); +#else /* others */ + ncpus = 1; +#endif /* others */ return (ncpus); -#else - return 1; -#endif /* !__FreeBSD__ */ } #ifdef __linux__ #define sockaddr_dl sockaddr_ll #define sdl_family sll_family #define AF_LINK AF_PACKET #define LLADDR(s) s->sll_addr; #include #define TAP_CLONEDEV "/dev/net/tun" #endif /* __linux__ */ #ifdef __FreeBSD__ #include #define TAP_CLONEDEV "/dev/tap" #endif /* __FreeBSD */ #ifdef __APPLE__ // #warning TAP not supported on apple ? #include #define TAP_CLONEDEV "/dev/tap" #endif /* __APPLE__ */ /* * parse the vale configuration in conf and put it in nmr. 
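Examples of the range syntax accepted above: "-s 10.0.0.1" is a single source address, "-s 10.0.0.1:4000" pins the port as well, "-s 10.0.0.1-10.0.0.254" sweeps the addresses, and "-d 10.1.0.1:2000-10.1.0.10:3000" sweeps both addresses and ports. Reversed bounds are swapped into order, and update_addresses() later walks the resulting tuple space starting from the least significant field.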
+ * Return the flag set if necessary. * The configuration may consist of 0 to 4 numbers separated - * by commas: #tx-slots,#rx-slots,#tx-rinzgs,#rx-rings. + * by commas: #tx-slots,#rx-slots,#tx-rings,#rx-rings. * Missing numbers or zeroes stand for default values. * As an additional convenience, if exactly one number - * is specified, then this is assigned to bot #tx-slots and #rx-slots. - * If there is no 4th number, then the 3rd is assigned to bot #tx-rings + * is specified, then this is assigned to both #tx-slots and #rx-slots. + * If there is no 4th number, then the 3rd is assigned to both #tx-rings * and #rx-rings. */ -void parse_nmr_config(const char* conf, struct nmreq *nmr) +int +parse_nmr_config(const char* conf, struct nmreq *nmr) { char *w, *tok; int i, v; nmr->nr_tx_rings = nmr->nr_rx_rings = 0; nmr->nr_tx_slots = nmr->nr_rx_slots = 0; if (conf == NULL || ! *conf) - return; + return 0; w = strdup(conf); for (i = 0, tok = strtok(w, ","); tok; i++, tok = strtok(NULL, ",")) { v = atoi(tok); switch (i) { case 0: nmr->nr_tx_slots = nmr->nr_rx_slots = v; break; case 1: nmr->nr_rx_slots = v; break; case 2: nmr->nr_tx_rings = nmr->nr_rx_rings = v; break; case 3: nmr->nr_rx_rings = v; break; default: D("ignored config: %s", tok); break; } } D("txr %d txd %d rxr %d rxd %d", nmr->nr_tx_rings, nmr->nr_tx_slots, nmr->nr_rx_rings, nmr->nr_rx_slots); free(w); + return (nmr->nr_tx_rings || nmr->nr_tx_slots || + nmr->nr_rx_rings || nmr->nr_rx_slots) ? + NM_OPEN_RING_CFG : 0; } /* * locate the src mac address for our interface, put it * into the user-supplied buffer. return 0 if ok, -1 on error. */ static int source_hwaddr(const char *ifname, char *buf) { struct ifaddrs *ifaphead, *ifap; int l = sizeof(ifap->ifa_name); if (getifaddrs(&ifaphead) != 0) { D("getifaddrs %s failed", ifname); return (-1); } for (ifap = ifaphead; ifap; ifap = ifap->ifa_next) { struct sockaddr_dl *sdl = (struct sockaddr_dl *)ifap->ifa_addr; uint8_t *mac; if (!sdl || sdl->sdl_family != AF_LINK) continue; if (strncmp(ifap->ifa_name, ifname, l) != 0) continue; mac = (uint8_t *)LLADDR(sdl); sprintf(buf, "%02x:%02x:%02x:%02x:%02x:%02x", mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]); if (verbose) D("source hwaddr %s", buf); break; } freeifaddrs(ifaphead); return ifap ? 0 : 1; } /* set the thread affinity. */ static int setaffinity(pthread_t me, int i) { -#ifdef __FreeBSD__ cpuset_t cpumask; if (i == -1) return 0; /* Set thread affinity affinity.*/ CPU_ZERO(&cpumask); CPU_SET(i, &cpumask); if (pthread_setaffinity_np(me, sizeof(cpuset_t), &cpumask) != 0) { - D("Unable to set affinity"); + D("Unable to set affinity: %s", strerror(errno)); return 1; } -#else - (void)me; /* suppress 'unused' warnings */ - (void)i; -#endif /* __FreeBSD__ */ return 0; } /* Compute the checksum of the given ip header. */ static uint16_t checksum(const void *data, uint16_t len, uint32_t sum) { const uint8_t *addr = data; uint32_t i; /* Checksum all the pairs of bytes first... */ for (i = 0; i < (len & ~1U); i += 2) { sum += (u_int16_t)ntohs(*((u_int16_t *)(addr + i))); if (sum > 0xFFFF) sum -= 0xFFFF; } /* * If there's a single byte left over, checksum it, too. * Network byte order is big-endian, so the remaining byte is * the high byte. */ if (i < len) { sum += addr[i] << 8; if (sum > 0xFFFF) sum -= 0xFFFF; } return sum; } static u_int16_t wrapsum(u_int32_t sum) { sum = ~sum & 0xFFFF; return (htons(sum)); } /* Check the payload of the packet for errors (use it for debug). * Look for consecutive ascii representations of the size of the packet. 
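Worked examples for the -C option parsed above: "-C 2048" requests 2048 slots on both tx and rx rings, "-C 2048,1024" requests 2048 tx and 1024 rx slots, "-C 0,0,4" requests 4 tx and 4 rx rings with default slot counts, and "-C 0,0,4,2" requests 4 tx and 2 rx rings. Zero fields keep the driver defaults; any nonzero field makes the function return NM_OPEN_RING_CFG so that nm_open() honors the geometry stored in the template request.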
*/ static void dump_payload(char *p, int len, struct netmap_ring *ring, int cur) { char buf[128]; int i, j, i0; /* get the length in ASCII of the length of the packet. */ - + printf("ring %p cur %5d [buf %6d flags 0x%04x len %5d]\n", ring, cur, ring->slot[cur].buf_idx, ring->slot[cur].flags, len); /* hexdump routine */ for (i = 0; i < len; ) { memset(buf, sizeof(buf), ' '); sprintf(buf, "%5d: ", i); i0 = i; for (j=0; j < 16 && i < len; i++, j++) sprintf(buf+7+j*3, "%02x ", (uint8_t)(p[i])); i = i0; for (j=0; j < 16 && i < len; i++, j++) sprintf(buf+7+j + 48, "%c", isprint(p[i]) ? p[i] : '.'); printf("%s\n", buf); } } /* * Fill a packet with some payload. * We create a UDP packet so the payload starts at * 14+20+8 = 42 bytes. */ #ifdef __linux__ #define uh_sport source #define uh_dport dest #define uh_ulen len #define uh_sum check #endif /* linux */ /* * increment the addressed in the packet, * starting from the least significant field. * DST_IP DST_PORT SRC_IP SRC_PORT */ static void update_addresses(struct pkt *pkt, struct glob_arg *g) { uint32_t a; uint16_t p; struct ip *ip = &pkt->ip; struct udphdr *udp = &pkt->udp; + do { p = ntohs(udp->uh_sport); if (p < g->src_ip.port1) { /* just inc, no wrap */ udp->uh_sport = htons(p + 1); - return; + break; } udp->uh_sport = htons(g->src_ip.port0); a = ntohl(ip->ip_src.s_addr); if (a < g->src_ip.end) { /* just inc, no wrap */ ip->ip_src.s_addr = htonl(a + 1); - return; + break; } ip->ip_src.s_addr = htonl(g->src_ip.start); udp->uh_sport = htons(g->src_ip.port0); p = ntohs(udp->uh_dport); if (p < g->dst_ip.port1) { /* just inc, no wrap */ udp->uh_dport = htons(p + 1); - return; + break; } udp->uh_dport = htons(g->dst_ip.port0); a = ntohl(ip->ip_dst.s_addr); if (a < g->dst_ip.end) { /* just inc, no wrap */ ip->ip_dst.s_addr = htonl(a + 1); - return; + break; } ip->ip_dst.s_addr = htonl(g->dst_ip.start); - + } while (0); + // update checksum } /* * initialize one packet and prepare for the next one. * The copy could be done better instead of repeating it each time. */ static void initialize_packet(struct targ *targ) { struct pkt *pkt = &targ->pkt; struct ether_header *eh; struct ip *ip; struct udphdr *udp; uint16_t paylen = targ->g->pkt_size - sizeof(*eh) - sizeof(struct ip); const char *payload = targ->g->options & OPT_INDIRECT ? 
indirect_payload : default_payload; - int i, l, l0 = strlen(payload); + int i, l0 = strlen(payload); /* create a nice NUL-terminated string */ - for (i = 0; i < paylen;) { - l = min(l0, paylen - i); - bcopy(payload, pkt->body + i, l); - i += l; + for (i = 0; i < paylen; i += l0) { + if (l0 > paylen - i) + l0 = paylen - i; // last round + bcopy(payload, pkt->body + i, l0); } pkt->body[i-1] = '\0'; ip = &pkt->ip; /* prepare the headers */ ip->ip_v = IPVERSION; ip->ip_hl = 5; ip->ip_id = 0; ip->ip_tos = IPTOS_LOWDELAY; ip->ip_len = ntohs(targ->g->pkt_size - sizeof(*eh)); ip->ip_id = 0; ip->ip_off = htons(IP_DF); /* Don't fragment */ ip->ip_ttl = IPDEFTTL; ip->ip_p = IPPROTO_UDP; ip->ip_dst.s_addr = htonl(targ->g->dst_ip.start); ip->ip_src.s_addr = htonl(targ->g->src_ip.start); ip->ip_sum = wrapsum(checksum(ip, sizeof(*ip), 0)); udp = &pkt->udp; udp->uh_sport = htons(targ->g->src_ip.port0); udp->uh_dport = htons(targ->g->dst_ip.port0); udp->uh_ulen = htons(paylen); /* Magic: taken from sbin/dhclient/packet.c */ udp->uh_sum = wrapsum(checksum(udp, sizeof(*udp), checksum(pkt->body, paylen - sizeof(*udp), checksum(&ip->ip_src, 2 * sizeof(ip->ip_src), IPPROTO_UDP + (u_int32_t)ntohs(udp->uh_ulen) ) ) )); eh = &pkt->eh; bcopy(&targ->g->src_mac.start, eh->ether_shost, 6); bcopy(&targ->g->dst_mac.start, eh->ether_dhost, 6); eh->ether_type = htons(ETHERTYPE_IP); + + bzero(&pkt->vh, sizeof(pkt->vh)); // dump_payload((void *)pkt, targ->g->pkt_size, NULL, 0); } /* * create and enqueue a batch of packets on a ring. * On the last one set NS_REPORT to tell the driver to generate * an interrupt when done. */ static int -send_packets(struct netmap_ring *ring, struct pkt *pkt, - struct glob_arg *g, u_int count, int options, u_int nfrags) +send_packets(struct netmap_ring *ring, struct pkt *pkt, void *frame, + int size, struct glob_arg *g, u_int count, int options, + u_int nfrags) { - u_int sent, cur = ring->cur; - int fcnt; - int size = g->pkt_size; + u_int n, sent, cur = ring->cur; + u_int fcnt; - if (ring->avail < count) - count = ring->avail; + n = nm_ring_space(ring); + if (n < count) + count = n; if (count < nfrags) { D("truncating packet, no room for frags %d %d", - count, nfrags); + count, nfrags); } #if 0 if (options & (OPT_COPY | OPT_PREFETCH) ) { for (sent = 0; sent < count; sent++) { struct netmap_slot *slot = &ring->slot[cur]; char *p = NETMAP_BUF(ring, slot->buf_idx); - prefetch(p); - cur = NETMAP_RING_NEXT(ring, cur); + __builtin_prefetch(p); + cur = nm_ring_next(ring, cur); } cur = ring->cur; } #endif for (fcnt = nfrags, sent = 0; sent < count; sent++) { struct netmap_slot *slot = &ring->slot[cur]; char *p = NETMAP_BUF(ring, slot->buf_idx); slot->flags = 0; if (options & OPT_INDIRECT) { slot->flags |= NS_INDIRECT; - slot->ptr = (uint64_t)pkt; + slot->ptr = (uint64_t)frame; } else if (options & OPT_COPY) { - pkt_copy(pkt, p, size); - if (fcnt == 1) + nm_pkt_copy(frame, p, size); + if (fcnt == nfrags) update_addresses(pkt, g); } else if (options & OPT_MEMCPY) { - memcpy(p, pkt, size); - if (fcnt == 1) + memcpy(p, frame, size); + if (fcnt == nfrags) update_addresses(pkt, g); } else if (options & OPT_PREFETCH) { - prefetch(p); + __builtin_prefetch(p); } if (options & OPT_DUMP) dump_payload(p, size, ring, cur); slot->len = size; if (--fcnt > 0) slot->flags |= NS_MOREFRAG; else fcnt = nfrags; if (sent == count - 1) { slot->flags &= ~NS_MOREFRAG; slot->flags |= NS_REPORT; } - cur = NETMAP_RING_NEXT(ring, cur); + cur = nm_ring_next(ring, cur); } - ring->avail -= sent; - ring->cur = cur; + ring->head = 
ring->cur = cur; return (sent); } /* * Send a packet, and wait for a response. * The payload (after UDP header, ofs 42) has a 4-byte sequence * followed by a struct timeval (or bintime?) */ #define PAY_OFS 42 /* where in the pkt... */ static void * pinger_body(void *data) { struct targ *targ = (struct targ *) data; - struct pollfd fds[1]; - struct netmap_if *nifp = targ->nifp; + struct pollfd pfd = { .fd = targ->fd, .events = POLLIN }; + struct netmap_if *nifp = targ->nmd->nifp; int i, rx = 0, n = targ->g->npackets; - - fds[0].fd = targ->fd; - fds[0].events = (POLLIN); - static uint32_t sent; + void *frame; + int size; + uint32_t sent = 0; struct timespec ts, now, last_print; uint32_t count = 0, min = 1000000000, av = 0; + frame = &targ->pkt; + frame += sizeof(targ->pkt.vh) - targ->g->virt_header; + size = targ->g->pkt_size + targ->g->virt_header; + + if (targ->g->nthreads > 1) { D("can only ping with 1 thread"); return NULL; } clock_gettime(CLOCK_REALTIME_PRECISE, &last_print); + now = last_print; while (n == 0 || (int)sent < n) { struct netmap_ring *ring = NETMAP_TXRING(nifp, 0); struct netmap_slot *slot; char *p; - for (i = 0; i < 1; i++) { + for (i = 0; i < 1; i++) { /* XXX why the loop for 1 pkt ? */ slot = &ring->slot[ring->cur]; - slot->len = targ->g->pkt_size; + slot->len = size; p = NETMAP_BUF(ring, slot->buf_idx); - if (ring->avail == 0) { + if (nm_ring_empty(ring)) { D("-- ouch, cannot send"); } else { - pkt_copy(&targ->pkt, p, targ->g->pkt_size); + nm_pkt_copy(frame, p, size); clock_gettime(CLOCK_REALTIME_PRECISE, &ts); bcopy(&sent, p+42, sizeof(sent)); bcopy(&ts, p+46, sizeof(ts)); sent++; - ring->cur = NETMAP_RING_NEXT(ring, ring->cur); - ring->avail--; + ring->head = ring->cur = nm_ring_next(ring, ring->cur); } } /* should use a parameter to decide how often to send */ - if (poll(fds, 1, 3000) <= 0) { - D("poll error/timeout on queue %d", targ->me); + if (poll(&pfd, 1, 3000) <= 0) { + D("poll error/timeout on queue %d: %s", targ->me, + strerror(errno)); continue; } /* see what we got back */ - for (i = targ->qfirst; i < targ->qlast; i++) { + for (i = targ->nmd->first_tx_ring; + i <= targ->nmd->last_tx_ring; i++) { ring = NETMAP_RXRING(nifp, i); - while (ring->avail > 0) { + while (!nm_ring_empty(ring)) { uint32_t seq; slot = &ring->slot[ring->cur]; p = NETMAP_BUF(ring, slot->buf_idx); clock_gettime(CLOCK_REALTIME_PRECISE, &now); bcopy(p+42, &seq, sizeof(seq)); bcopy(p+46, &ts, sizeof(ts)); ts.tv_sec = now.tv_sec - ts.tv_sec; ts.tv_nsec = now.tv_nsec - ts.tv_nsec; if (ts.tv_nsec < 0) { ts.tv_nsec += 1000000000; ts.tv_sec--; } if (1) D("seq %d/%d delta %d.%09d", seq, sent, (int)ts.tv_sec, (int)ts.tv_nsec); if (ts.tv_nsec < (int)min) min = ts.tv_nsec; count ++; av += ts.tv_nsec; - ring->avail--; - ring->cur = NETMAP_RING_NEXT(ring, ring->cur); + ring->head = ring->cur = nm_ring_next(ring, ring->cur); rx++; } } //D("tx %d rx %d", sent, rx); //usleep(100000); ts.tv_sec = now.tv_sec - last_print.tv_sec; ts.tv_nsec = now.tv_nsec - last_print.tv_nsec; if (ts.tv_nsec < 0) { ts.tv_nsec += 1000000000; ts.tv_sec--; } if (ts.tv_sec >= 1) { D("count %d min %d av %d", count, min, av/count); count = 0; av = 0; min = 100000000; last_print = now; } } return NULL; } /* * reply to ping requests */ static void * ponger_body(void *data) { struct targ *targ = (struct targ *) data; - struct pollfd fds[1]; - struct netmap_if *nifp = targ->nifp; + struct pollfd pfd = { .fd = targ->fd, .events = POLLIN }; + struct netmap_if *nifp = targ->nmd->nifp; struct netmap_ring *txring, *rxring; int i, rx = 0, 
sent = 0, n = targ->g->npackets; - fds[0].fd = targ->fd; - fds[0].events = (POLLIN); if (targ->g->nthreads > 1) { D("can only reply ping with 1 thread"); return NULL; } D("understood ponger %d but don't know how to do it", n); while (n == 0 || sent < n) { uint32_t txcur, txavail; //#define BUSYWAIT #ifdef BUSYWAIT - ioctl(fds[0].fd, NIOCRXSYNC, NULL); + ioctl(pfd.fd, NIOCRXSYNC, NULL); #else - if (poll(fds, 1, 1000) <= 0) { - D("poll error/timeout on queue %d", targ->me); + if (poll(&pfd, 1, 1000) <= 0) { + D("poll error/timeout on queue %d: %s", targ->me, + strerror(errno)); continue; } #endif txring = NETMAP_TXRING(nifp, 0); txcur = txring->cur; - txavail = txring->avail; + txavail = nm_ring_space(txring); /* see what we got back */ - for (i = targ->qfirst; i < targ->qlast; i++) { + for (i = targ->nmd->first_rx_ring; i <= targ->nmd->last_rx_ring; i++) { rxring = NETMAP_RXRING(nifp, i); - while (rxring->avail > 0) { + while (!nm_ring_empty(rxring)) { uint16_t *spkt, *dpkt; uint32_t cur = rxring->cur; struct netmap_slot *slot = &rxring->slot[cur]; char *src, *dst; src = NETMAP_BUF(rxring, slot->buf_idx); //D("got pkt %p of size %d", src, slot->len); - rxring->avail--; - rxring->cur = NETMAP_RING_NEXT(rxring, cur); + rxring->head = rxring->cur = nm_ring_next(rxring, cur); rx++; if (txavail == 0) continue; dst = NETMAP_BUF(txring, txring->slot[txcur].buf_idx); /* copy... */ dpkt = (uint16_t *)dst; spkt = (uint16_t *)src; - pkt_copy(src, dst, slot->len); + nm_pkt_copy(src, dst, slot->len); dpkt[0] = spkt[3]; dpkt[1] = spkt[4]; dpkt[2] = spkt[5]; dpkt[3] = spkt[0]; dpkt[4] = spkt[1]; dpkt[5] = spkt[2]; txring->slot[txcur].len = slot->len; /* XXX swap src dst mac */ - txcur = NETMAP_RING_NEXT(txring, txcur); + txcur = nm_ring_next(txring, txcur); txavail--; sent++; } } - txring->cur = txcur; - txring->avail = txavail; + txring->head = txring->cur = txcur; targ->count = sent; #ifdef BUSYWAIT - ioctl(fds[0].fd, NIOCTXSYNC, NULL); + ioctl(pfd.fd, NIOCTXSYNC, NULL); #endif //D("tx %d rx %d", sent, rx); } return NULL; } static __inline int timespec_ge(const struct timespec *a, const struct timespec *b) { if (a->tv_sec > b->tv_sec) return (1); if (a->tv_sec < b->tv_sec) return (0); if (a->tv_nsec >= b->tv_nsec) return (1); return (0); } static __inline struct timespec timeval2spec(const struct timeval *a) { struct timespec ts = { .tv_sec = a->tv_sec, .tv_nsec = a->tv_usec * 1000 }; return ts; } static __inline struct timeval timespec2val(const struct timespec *a) { struct timeval tv = { .tv_sec = a->tv_sec, .tv_usec = a->tv_nsec / 1000 }; return tv; } -static int -wait_time(struct timespec ts, struct timespec *wakeup_ts, long long *waited) +static __inline struct timespec +timespec_add(struct timespec a, struct timespec b) { - struct timespec curtime; - - curtime.tv_sec = 0; - curtime.tv_nsec = 0; - - if (clock_gettime(CLOCK_REALTIME_PRECISE, &curtime) == -1) { - D("clock_gettime: %s", strerror(errno)); - return (-1); + struct timespec ret = { a.tv_sec + b.tv_sec, a.tv_nsec + b.tv_nsec }; + if (ret.tv_nsec >= 1000000000) { + ret.tv_sec++; + ret.tv_nsec -= 1000000000; } - while (timespec_ge(&ts, &curtime)) { - if (waited != NULL) - (*waited)++; - if (clock_gettime(CLOCK_REALTIME_PRECISE, &curtime) == -1) { - D("clock_gettime"); - return (-1); - } - } - if (wakeup_ts != NULL) - *wakeup_ts = curtime; - return (0); + return ret; } -static __inline void -timespec_add(struct timespec *tsa, struct timespec *tsb) +static __inline struct timespec +timespec_sub(struct timespec a, struct timespec b) { - 
tsa->tv_sec += tsb->tv_sec; - tsa->tv_nsec += tsb->tv_nsec; - if (tsa->tv_nsec >= 1000000000) { - tsa->tv_sec++; - tsa->tv_nsec -= 1000000000; + struct timespec ret = { a.tv_sec - b.tv_sec, a.tv_nsec - b.tv_nsec }; + if (ret.tv_nsec < 0) { + ret.tv_sec--; + ret.tv_nsec += 1000000000; } + return ret; } +/* + * wait until ts, either busy or sleeping if more than 1ms. + * Return wakeup time. + */ +static struct timespec +wait_time(struct timespec ts) +{ + for (;;) { + struct timespec w, cur; + clock_gettime(CLOCK_REALTIME_PRECISE, &cur); + w = timespec_sub(ts, cur); + if (w.tv_sec < 0) + return cur; + else if (w.tv_sec > 0 || w.tv_nsec > 1000000) + poll(NULL, 0, 1); + } +} + static void * sender_body(void *data) { struct targ *targ = (struct targ *) data; - - struct pollfd fds[1]; - struct netmap_if *nifp = targ->nifp; + struct pollfd pfd = { .fd = targ->fd, .events = POLLOUT }; + struct netmap_if *nifp = targ->nmd->nifp; struct netmap_ring *txring; - int i, n = targ->g->npackets / targ->g->nthreads, sent = 0; + int i, n = targ->g->npackets / targ->g->nthreads; + int64_t sent = 0; int options = targ->g->options | OPT_COPY; - struct timespec tmptime, nexttime = { 0, 0}; // XXX silence compiler + struct timespec nexttime = { 0, 0}; // XXX silence compiler int rate_limit = targ->g->tx_rate; - long long waited = 0; + struct pkt *pkt = &targ->pkt; + void *frame; + int size; + frame = pkt; + frame += sizeof(pkt->vh) - targ->g->virt_header; + size = targ->g->pkt_size + targ->g->virt_header; + D("start"); if (setaffinity(targ->thread, targ->affinity)) goto quit; - /* setup poll(2) mechanism. */ - memset(fds, 0, sizeof(fds)); - fds[0].fd = targ->fd; - fds[0].events = (POLLOUT); /* main loop.*/ clock_gettime(CLOCK_REALTIME_PRECISE, &targ->tic); if (rate_limit) { - tmptime.tv_sec = 2; - tmptime.tv_nsec = 0; - timespec_add(&targ->tic, &tmptime); + targ->tic = timespec_add(targ->tic, (struct timespec){2,0}); targ->tic.tv_nsec = 0; - if (wait_time(targ->tic, NULL, NULL) == -1) { - D("wait_time: %s", strerror(errno)); - goto quit; - } + wait_time(targ->tic); nexttime = targ->tic; } - if (targ->g->dev_type == DEV_PCAP) { - int size = targ->g->pkt_size; - void *pkt = &targ->pkt; - pcap_t *p = targ->g->p; + if (targ->g->dev_type == DEV_TAP) { + D("writing to file desc %d", targ->g->main_fd); for (i = 0; !targ->cancel && (n == 0 || sent < n); i++) { - if (pcap_inject(p, pkt, size) != -1) + if (write(targ->g->main_fd, frame, size) != -1) sent++; update_addresses(pkt, targ->g); if (i > 10000) { targ->count = sent; i = 0; } } - } else if (targ->g->dev_type == DEV_TAP) { /* tap */ - int size = targ->g->pkt_size; - void *pkt = &targ->pkt; - D("writing to file desc %d", targ->g->main_fd); +#ifndef NO_PCAP + } else if (targ->g->dev_type == DEV_PCAP) { + pcap_t *p = targ->g->p; for (i = 0; !targ->cancel && (n == 0 || sent < n); i++) { - if (write(targ->g->main_fd, pkt, size) != -1) + if (pcap_inject(p, frame, size) != -1) sent++; update_addresses(pkt, targ->g); if (i > 10000) { targ->count = sent; i = 0; } } +#endif /* NO_PCAP */ } else { int tosend = 0; int frags = targ->g->frags; while (!targ->cancel && (n == 0 || sent < n)) { if (rate_limit && tosend <= 0) { tosend = targ->g->burst; - timespec_add(&nexttime, &targ->g->tx_period); - if (wait_time(nexttime, &tmptime, &waited) == -1) { - D("wait_time"); - goto quit; - } + nexttime = timespec_add(nexttime, targ->g->tx_period); + wait_time(nexttime); } /* * wait for available room in the send queue(s) */ - if (poll(fds, 1, 2000) <= 0) { + if (poll(&pfd, 1, 2000) <= 0) { 
if (targ->cancel) break; - D("poll error/timeout on queue %d", targ->me); + D("poll error/timeout on queue %d: %s", targ->me, + strerror(errno)); + // goto quit; + } + if (pfd.revents & POLLERR) { + D("poll error"); goto quit; } /* * scan our queues and send on those with room */ if (options & OPT_COPY && sent > 100000 && !(targ->g->options & OPT_COPY) ) { D("drop copy"); options &= ~OPT_COPY; } - for (i = targ->qfirst; i < targ->qlast; i++) { + for (i = targ->nmd->first_tx_ring; i <= targ->nmd->last_tx_ring; i++) { int m, limit = rate_limit ? tosend : targ->g->burst; if (n > 0 && n - sent < limit) limit = n - sent; txring = NETMAP_TXRING(nifp, i); - if (txring->avail == 0) + if (nm_ring_empty(txring)) continue; if (frags > 1) limit = ((limit + frags - 1) / frags) * frags; - - m = send_packets(txring, &targ->pkt, targ->g, + + m = send_packets(txring, pkt, frame, size, targ->g, limit, options, frags); - ND("limit %d avail %d frags %d m %d", - limit, txring->avail, frags, m); + ND("limit %d tail %d frags %d m %d", + limit, txring->tail, frags, m); sent += m; targ->count = sent; if (rate_limit) { tosend -= m; if (tosend <= 0) break; } } } /* flush any remaining packets */ - ioctl(fds[0].fd, NIOCTXSYNC, NULL); + ioctl(pfd.fd, NIOCTXSYNC, NULL); /* final part: wait all the TX queues to be empty. */ - for (i = targ->qfirst; i < targ->qlast; i++) { + for (i = targ->nmd->first_tx_ring; i <= targ->nmd->last_tx_ring; i++) { txring = NETMAP_TXRING(nifp, i); - while (!NETMAP_TX_RING_EMPTY(txring)) { - ioctl(fds[0].fd, NIOCTXSYNC, NULL); + while (nm_tx_pending(txring)) { + ioctl(pfd.fd, NIOCTXSYNC, NULL); usleep(1); /* wait 1 tick */ } } - } + } /* end DEV_NETMAP */ clock_gettime(CLOCK_REALTIME_PRECISE, &targ->toc); targ->completed = 1; targ->count = sent; quit: /* reset the ``used`` flag. */ targ->used = 0; return (NULL); } +#ifndef NO_PCAP static void receive_pcap(u_char *user, const struct pcap_pkthdr * h, const u_char * bytes) { int *count = (int *)user; (void)h; /* UNUSED */ (void)bytes; /* UNUSED */ (*count)++; } +#endif /* !NO_PCAP */ static int receive_packets(struct netmap_ring *ring, u_int limit, int dump) { - u_int cur, rx; + u_int cur, rx, n; cur = ring->cur; - if (ring->avail < limit) - limit = ring->avail; + n = nm_ring_space(ring); + if (n < limit) + limit = n; for (rx = 0; rx < limit; rx++) { struct netmap_slot *slot = &ring->slot[cur]; char *p = NETMAP_BUF(ring, slot->buf_idx); if (dump) dump_payload(p, slot->len, ring, cur); - cur = NETMAP_RING_NEXT(ring, cur); + cur = nm_ring_next(ring, cur); } - ring->avail -= rx; - ring->cur = cur; + ring->head = ring->cur = cur; return (rx); } static void * receiver_body(void *data) { struct targ *targ = (struct targ *) data; - struct pollfd fds[1]; - struct netmap_if *nifp = targ->nifp; + struct pollfd pfd = { .fd = targ->fd, .events = POLLIN }; + struct netmap_if *nifp = targ->nmd->nifp; struct netmap_ring *rxring; int i; uint64_t received = 0; if (setaffinity(targ->thread, targ->affinity)) goto quit; - /* setup poll(2) mechanism. */ - memset(fds, 0, sizeof(fds)); - fds[0].fd = targ->fd; - fds[0].events = (POLLIN); - /* unbounded wait for the first packet. 
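A worked example of the sender's rate limiter above: with -R 1000000 the burst cap computed in main() below is 1000000/300 = 3333, so the default burst of 512 is kept and tx_period becomes 512 * 10^9 / 10^6 = 512000 ns. Each round then sends one 512-packet burst and calls wait_time() on the accumulated deadline; since less than 1 ms remains, the wait spins on clock_gettime() instead of sleeping in poll(NULL, 0, 1).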
*/ for (;;) { - i = poll(fds, 1, 1000); - if (i > 0 && !(fds[0].revents & POLLERR)) + i = poll(&pfd, 1, 1000); + if (i > 0 && !(pfd.revents & POLLERR)) break; - D("waiting for initial packets, poll returns %d %d", i, fds[0].revents); + RD(1, "waiting for initial packets, poll returns %d %d", + i, pfd.revents); } /* main loop, exit after 1s silence */ clock_gettime(CLOCK_REALTIME_PRECISE, &targ->tic); - if (targ->g->dev_type == DEV_PCAP) { - while (!targ->cancel) { - /* XXX should we poll ? */ - pcap_dispatch(targ->g->p, targ->g->burst, receive_pcap, NULL); - } - } else if (targ->g->dev_type == DEV_TAP) { + if (targ->g->dev_type == DEV_TAP) { D("reading from %s fd %d", targ->g->ifname, targ->g->main_fd); while (!targ->cancel) { char buf[2048]; /* XXX should we poll ? */ if (read(targ->g->main_fd, buf, sizeof(buf)) > 0) targ->count++; } +#ifndef NO_PCAP + } else if (targ->g->dev_type == DEV_PCAP) { + while (!targ->cancel) { + /* XXX should we poll ? */ + pcap_dispatch(targ->g->p, targ->g->burst, receive_pcap, NULL); + } +#endif /* !NO_PCAP */ } else { int dump = targ->g->options & OPT_DUMP; while (!targ->cancel) { /* Once we started to receive packets, wait at most 1 seconds before quitting. */ - if (poll(fds, 1, 1 * 1000) <= 0 && !targ->g->forever) { + if (poll(&pfd, 1, 1 * 1000) <= 0 && !targ->g->forever) { clock_gettime(CLOCK_REALTIME_PRECISE, &targ->toc); targ->toc.tv_sec -= 1; /* Subtract timeout time. */ - break; + goto out; } - for (i = targ->qfirst; i < targ->qlast; i++) { + if (pfd.revents & POLLERR) { + D("poll err"); + goto quit; + } + + for (i = targ->nmd->first_rx_ring; i <= targ->nmd->last_rx_ring; i++) { int m; rxring = NETMAP_RXRING(nifp, i); - if (rxring->avail == 0) + if (nm_ring_empty(rxring)) continue; m = receive_packets(rxring, targ->g->burst, dump); received += m; } targ->count = received; - - // tell the card we have read the data - //ioctl(fds[0].fd, NIOCRXSYNC, NULL); } } + clock_gettime(CLOCK_REALTIME_PRECISE, &targ->toc); + +out: targ->completed = 1; targ->count = received; quit: /* reset the ``used`` flag. */ targ->used = 0; return (NULL); } /* very crude code to print a number in normalized form. * Caller has to make sure that the buffer is large enough. 
*/ static const char * norm(char *buf, double val) { - char *units[] = { "", "K", "M", "G" }; + char *units[] = { "", "K", "M", "G", "T" }; u_int i; - for (i = 0; val >=1000 && i < sizeof(units)/sizeof(char *); i++) + for (i = 0; val >=1000 && i < sizeof(units)/sizeof(char *) - 1; i++) val /= 1000; sprintf(buf, "%.2f %s", val, units[i]); return buf; } static void tx_output(uint64_t sent, int size, double delta) { double bw, raw_bw, pps; char b1[40], b2[80], b3[80]; - printf("Sent %" PRIu64 " packets, %d bytes each, in %.2f seconds.\n", - sent, size, delta); + printf("Sent %llu packets, %d bytes each, in %.2f seconds.\n", + (unsigned long long)sent, size, delta); if (delta == 0) delta = 1e-6; if (size < 60) /* correct for min packet size */ size = 60; pps = sent / delta; bw = (8.0 * size * sent) / delta; /* raw packets have4 bytes crc + 20 bytes framing */ raw_bw = (8.0 * (size + 24) * sent) / delta; printf("Speed: %spps Bandwidth: %sbps (raw %sbps)\n", norm(b1, pps), norm(b2, bw), norm(b3, raw_bw) ); } static void rx_output(uint64_t received, double delta) { double pps; char b1[40]; - printf("Received %" PRIu64 " packets, in %.2f seconds.\n", received, delta); + printf("Received %llu packets, in %.2f seconds.\n", + (unsigned long long) received, delta); if (delta == 0) delta = 1e-6; pps = received / delta; printf("Speed: %spps\n", norm(b1, pps)); } static void usage(void) { const char *cmd = "pkt-gen"; fprintf(stderr, "Usage:\n" "%s arguments\n" "\t-i interface interface name\n" "\t-f function tx rx ping pong\n" "\t-n count number of iterations (can be 0)\n" "\t-t pkts_to_send also forces tx mode\n" "\t-r pkts_to_receive also forces rx mode\n" "\t-l pkt_size in bytes excluding CRC\n" "\t-d dst_ip[:port[-dst_ip:port]] single or range\n" "\t-s src_ip[:port[-src_ip:port]] single or range\n" "\t-D dst-mac\n" "\t-S src-mac\n" "\t-a cpu_id use setaffinity\n" "\t-b burst size testing, mostly\n" "\t-c cores cores to use\n" "\t-p threads processes/threads to use\n" "\t-T report_ms milliseconds between reports\n" "\t-P use libpcap instead of netmap\n" "\t-w wait_for_link_time in seconds\n" "\t-R rate in packets per second\n" "\t-X dump payload\n" + "\t-H len add empty virtio-net-header with size 'len'\n" "", cmd); exit(0); } static void start_threads(struct glob_arg *g) { int i; targs = calloc(g->nthreads, sizeof(*targs)); /* * Now create the desired number of threads, each one * using a single descriptor. */ for (i = 0; i < g->nthreads; i++) { - bzero(&targs[i], sizeof(targs[i])); - targs[i].fd = -1; /* default, with pcap */ - targs[i].g = g; + struct targ *t = &targs[i]; + bzero(t, sizeof(*t)); + t->fd = -1; /* default, with pcap */ + t->g = g; + if (g->dev_type == DEV_NETMAP) { - struct nmreq tifreq; - int tfd; + struct nm_desc nmd = *g->nmd; /* copy, we overwrite ringid */ - /* register interface. */ - tfd = open("/dev/netmap", O_RDWR); - if (tfd == -1) { - D("Unable to open /dev/netmap"); - continue; + if (g->nthreads > 1) { + if (nmd.req.nr_flags != NR_REG_ALL_NIC) { + D("invalid nthreads mode %d", nmd.req.nr_flags); + continue; + } + nmd.req.nr_flags = NR_REG_ONE_NIC; + nmd.req.nr_ringid = i; } - targs[i].fd = tfd; + /* Only touch one of the rings (rx is already ok) */ + if (g->td_body == receiver_body) + nmd.req.nr_ringid |= NETMAP_NO_TX_POLL; - bzero(&tifreq, sizeof(tifreq)); - strncpy(tifreq.nr_name, g->ifname, sizeof(tifreq.nr_name)); - tifreq.nr_version = NETMAP_API; - tifreq.nr_ringid = (g->nthreads > 1) ? 
(i | NETMAP_HW_RING) : 0; - parse_nmr_config(g->nmr_config, &tifreq); + /* register interface. Override ifname and ringid etc. */ - /* - * if we are acting as a receiver only, do not touch the transmit ring. - * This is not the default because many apps may use the interface - * in both directions, but a pure receiver does not. - */ - if (g->td_body == receiver_body) { - tifreq.nr_ringid |= NETMAP_NO_TX_POLL; - } - - if ((ioctl(tfd, NIOCREGIF, &tifreq)) == -1) { - D("Unable to register %s", g->ifname); + t->nmd = nm_open(t->g->ifname, NULL, g->nmd_flags | + NM_OPEN_IFNAME | NM_OPEN_NO_MMAP, g->nmd); + if (t->nmd == NULL) { + D("Unable to open %s: %s", + t->g->ifname, strerror(errno)); continue; } - D("memsize is %d MB", tifreq.nr_memsize >> 20); - targs[i].nmr = tifreq; - targs[i].nifp = NETMAP_IF(g->mmap_addr, tifreq.nr_offset); - D("nifp flags 0x%x", targs[i].nifp->ni_flags); - /* start threads. */ - targs[i].qfirst = (g->nthreads > 1) ? i : 0; - targs[i].qlast = (g->nthreads > 1) ? i+1 : - (g->td_body == receiver_body ? tifreq.nr_rx_rings : tifreq.nr_tx_rings); + t->fd = t->nmd->fd; + } else { targs[i].fd = g->main_fd; } - targs[i].used = 1; - targs[i].me = i; + t->used = 1; + t->me = i; if (g->affinity >= 0) { if (g->affinity < g->cpus) - targs[i].affinity = g->affinity; + t->affinity = g->affinity; else - targs[i].affinity = i % g->cpus; - } else - targs[i].affinity = -1; + t->affinity = i % g->cpus; + } else { + t->affinity = -1; + } /* default, init packets */ - initialize_packet(&targs[i]); + initialize_packet(t); - if (pthread_create(&targs[i].thread, NULL, g->td_body, - &targs[i]) == -1) { - D("Unable to create thread %d", i); - targs[i].used = 0; + if (pthread_create(&t->thread, NULL, g->td_body, t) == -1) { + D("Unable to create thread %d: %s", i, strerror(errno)); + t->used = 0; } } } static void main_thread(struct glob_arg *g) { int i; uint64_t prev = 0; uint64_t count = 0; double delta_t; struct timeval tic, toc; gettimeofday(&toc, NULL); for (;;) { struct timeval now, delta; uint64_t pps, usec, my_count, npkts; int done = 0; delta.tv_sec = g->report_interval/1000; delta.tv_usec = (g->report_interval%1000)*1000; select(0, NULL, NULL, NULL, &delta); gettimeofday(&now, NULL); - time_second = now.tv_sec; timersub(&now, &toc, &toc); my_count = 0; for (i = 0; i < g->nthreads; i++) { my_count += targs[i].count; if (targs[i].used == 0) done++; } usec = toc.tv_sec* 1000000 + toc.tv_usec; if (usec < 10000) continue; npkts = my_count - prev; pps = (npkts*1000000 + usec/2) / usec; - D("%" PRIu64 " pps (%" PRIu64 " pkts in %" PRIu64 " usec)", - pps, npkts, usec); + D("%llu pps (%llu pkts in %llu usec)", + (unsigned long long)pps, + (unsigned long long)npkts, + (unsigned long long)usec); prev = my_count; toc = now; if (done == g->nthreads) break; } timerclear(&tic); timerclear(&toc); for (i = 0; i < g->nthreads; i++) { struct timespec t_tic, t_toc; /* * Join active threads, unregister interfaces and close * file descriptors. */ if (targs[i].used) pthread_join(targs[i].thread, NULL); close(targs[i].fd); if (targs[i].completed == 0) D("ouch, thread %d exited with error", i); /* * Collect threads output and extract information about * how long it took to send all the packets. */ count += targs[i].count; t_tic = timeval2spec(&tic); t_toc = timeval2spec(&toc); if (!timerisset(&tic) || timespec_ge(&targs[i].tic, &t_tic)) tic = timespec2val(&targs[i].tic); if (!timerisset(&toc) || timespec_ge(&targs[i].toc, &t_toc)) toc = timespec2val(&targs[i].toc); } /* print output. 
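As a sanity check on the figures printed here: with the default -l 60, each frame occupies 60 + 4 (CRC) + 20 (preamble plus inter-frame gap) = 84 bytes on the wire, so a 10 Gbit/s link tops out at 10^10 / (84 * 8) = 14.88 Mpps; that is the raw bandwidth figure tx_output() reports at line rate.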
*/ timersub(&toc, &tic, &toc); delta_t = toc.tv_sec + 1e-6* toc.tv_usec; if (g->td_body == sender_body) tx_output(count, g->pkt_size, delta_t); else rx_output(count, delta_t); if (g->dev_type == DEV_NETMAP) { - munmap(g->mmap_addr, g->mmap_size); + munmap(g->nmd->mem, g->nmd->req.nr_memsize); close(g->main_fd); } } struct sf { char *key; void *f; }; static struct sf func[] = { { "tx", sender_body }, { "rx", receiver_body }, { "ping", pinger_body }, { "pong", ponger_body }, { NULL, NULL } }; static int tap_alloc(char *dev) { struct ifreq ifr; int fd, err; char *clonedev = TAP_CLONEDEV; (void)err; (void)dev; /* Arguments taken by the function: * * char *dev: the name of an interface (or '\0'). MUST have enough * space to hold the interface name if '\0' is passed * int flags: interface flags (eg, IFF_TUN etc.) */ #ifdef __FreeBSD__ if (dev[3]) { /* tapSomething */ static char buf[128]; snprintf(buf, sizeof(buf), "/dev/%s", dev); clonedev = buf; } #endif /* open the device */ if( (fd = open(clonedev, O_RDWR)) < 0 ) { return fd; } D("%s open successful", clonedev); /* preparation of the struct ifr, of type "struct ifreq" */ memset(&ifr, 0, sizeof(ifr)); #ifdef linux ifr.ifr_flags = IFF_TAP | IFF_NO_PI; if (*dev) { /* if a device name was specified, put it in the structure; otherwise, * the kernel will try to allocate the "next" device of the * specified type */ strncpy(ifr.ifr_name, dev, IFNAMSIZ); } /* try to create the device */ if( (err = ioctl(fd, TUNSETIFF, (void *) &ifr)) < 0 ) { - D("failed to to a TUNSETIFF"); + D("failed to do a TUNSETIFF: %s", strerror(errno)); close(fd); return err; } /* if the operation was successful, write back the name of the * interface to the variable "dev", so the caller can know * it. Note that the caller MUST reserve space in *dev (see calling * code below) */ strcpy(dev, ifr.ifr_name); D("new name is %s", dev); #endif /* linux */ /* this is the special file descriptor that the caller will use to talk * with the virtual interface */ return fd; } int main(int arc, char **argv) { int i; struct glob_arg g; - struct nmreq nmr; int ch; int wait_link = 2; int devqueues = 1; /* how many device queues */ bzero(&g, sizeof(g)); g.main_fd = -1; g.td_body = receiver_body; g.report_interval = 1000; /* report interval */ g.affinity = -1; /* ip addresses can also be a range x.x.x.x-x.x.x.y */ g.src_ip.name = "10.0.0.1"; g.dst_ip.name = "10.1.0.1"; g.dst_mac.name = "ff:ff:ff:ff:ff:ff"; g.src_mac.name = NULL; g.pkt_size = 60; g.burst = 512; // default g.nthreads = 1; g.cpus = 1; g.forever = 1; g.tx_rate = 0; g.frags = 1; g.nmr_config = ""; + g.virt_header = 0; while ( (ch = getopt(arc, argv, - "a:f:F:n:i:It:r:l:d:s:D:S:b:c:o:p:PT:w:WvR:XC:")) != -1) { + "a:f:F:n:i:Il:d:s:D:S:b:c:o:p:T:w:WvR:XC:H:e:")) != -1) { struct sf *fn; switch(ch) { default: D("bad option %c %s", ch, optarg); usage(); break; case 'n': g.npackets = atoi(optarg); break; case 'F': i = atoi(optarg); if (i < 1 || i > 63) { D("invalid frags %d [1..63], ignore", i); break; } g.frags = i; break; case 'f': for (fn = func; fn->key; fn++) { if (!strcmp(fn->key, optarg)) break; } if (fn->key) g.td_body = fn->f; else D("unrecognised function %s", optarg); break; case 'o': /* data generation options */ g.options = atoi(optarg); break; case 'a': /* force affinity */ g.affinity = atoi(optarg); break; case 'i': /* interface */ - g.ifname = optarg; - if (!strncmp(optarg, "tap", 3)) - g.dev_type = DEV_TAP; - else + /* a prefix of tap: netmap: or pcap: forces the mode.
+ * otherwise we guess + */ + D("interface is %s", optarg); + if (strlen(optarg) > MAX_IFNAMELEN - 8) { + D("ifname too long %s", optarg); + break; + } + strcpy(g.ifname, optarg); + if (!strcmp(optarg, "null")) { g.dev_type = DEV_NETMAP; - if (!strcmp(g.ifname, "null")) g.dummy_send = 1; + } else if (!strncmp(optarg, "tap:", 4)) { + g.dev_type = DEV_TAP; + strcpy(g.ifname, optarg + 4); + } else if (!strncmp(optarg, "pcap:", 5)) { + g.dev_type = DEV_PCAP; + strcpy(g.ifname, optarg + 5); + } else if (!strncmp(optarg, "netmap:", 7) || + !strncmp(optarg, "vale", 4)) { + g.dev_type = DEV_NETMAP; + } else if (!strncmp(optarg, "tap", 3)) { + g.dev_type = DEV_TAP; + } else { /* prepend netmap: */ + g.dev_type = DEV_NETMAP; + sprintf(g.ifname, "netmap:%s", optarg); + } break; case 'I': g.options |= OPT_INDIRECT; /* XXX use indirect buffer */ break; - case 't': /* send, deprecated */ - D("-t deprecated, please use -f tx -n %s", optarg); - g.td_body = sender_body; - g.npackets = atoi(optarg); - break; - - case 'r': /* receive */ - D("-r deprecated, please use -f rx -n %s", optarg); - g.td_body = receiver_body; - g.npackets = atoi(optarg); - break; - case 'l': /* pkt_size */ g.pkt_size = atoi(optarg); break; case 'd': g.dst_ip.name = optarg; break; case 's': g.src_ip.name = optarg; break; case 'T': /* report interval */ g.report_interval = atoi(optarg); break; case 'w': wait_link = atoi(optarg); break; case 'W': /* XXX changed default */ g.forever = 0; /* do not exit rx even with no traffic */ break; case 'b': /* burst */ g.burst = atoi(optarg); break; case 'c': g.cpus = atoi(optarg); break; case 'p': g.nthreads = atoi(optarg); break; - case 'P': - g.dev_type = DEV_PCAP; - break; - case 'D': /* destination mac */ g.dst_mac.name = optarg; break; case 'S': /* source mac */ g.src_mac.name = optarg; break; case 'v': verbose++; break; case 'R': g.tx_rate = atoi(optarg); break; case 'X': g.options |= OPT_DUMP; break; case 'C': g.nmr_config = strdup(optarg); + break; + case 'H': + g.virt_header = atoi(optarg); + break; + case 'e': /* extra bufs */ + g.extra_bufs = atoi(optarg); + break; } } if (g.ifname == NULL) { D("missing ifname"); usage(); } i = system_ncpus(); if (g.cpus < 0 || g.cpus > i) { D("%d cpus is too high, have only %d cpus", g.cpus, i); usage(); } if (g.cpus == 0) g.cpus = i; if (g.pkt_size < 16 || g.pkt_size > 1536) { D("bad pktsize %d\n", g.pkt_size); usage(); } if (g.src_mac.name == NULL) { static char mybuf[20] = "00:00:00:00:00:00"; /* retrieve source mac address. 
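Examples of how the -i argument is interpreted by the code above: "-i eth1" becomes "netmap:eth1" (the prefix is prepended), "-i vale0:p1" is passed through as a VALE port, "-i tap:tap0" selects tap mode on tap0, "-i pcap:em1" falls back to libpcap, and "-i null" stays in netmap mode but installs the dummy send routine.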
		if (source_hwaddr(g.ifname, mybuf) == -1) {
			D("Unable to retrieve source mac");
			// continue, fail later
		}
		g.src_mac.name = mybuf;
	}
	/* extract address ranges */
	extract_ip_range(&g.src_ip);
	extract_ip_range(&g.dst_ip);
	extract_mac_range(&g.src_mac);
	extract_mac_range(&g.dst_mac);

+	if (g.src_ip.start != g.src_ip.end ||
+	    g.src_ip.port0 != g.src_ip.port1 ||
+	    g.dst_ip.start != g.dst_ip.end ||
+	    g.dst_ip.port0 != g.dst_ip.port1)
+		g.options |= OPT_COPY;
+
+	if (g.virt_header != 0 && g.virt_header != VIRT_HDR_1
+			&& g.virt_header != VIRT_HDR_2) {
+		D("bad virtio-net-header length");
+		usage();
+	}
+
	if (g.dev_type == DEV_TAP) {
		D("want to use tap %s", g.ifname);
		g.main_fd = tap_alloc(g.ifname);
		if (g.main_fd < 0) {
			D("cannot open tap %s", g.ifname);
			usage();
		}
-	} else if (g.dev_type > DEV_NETMAP) {
+#ifndef NO_PCAP
+	} else if (g.dev_type == DEV_PCAP) {
		char pcap_errbuf[PCAP_ERRBUF_SIZE];

		D("using pcap on %s", g.ifname);
		pcap_errbuf[0] = '\0'; // init the buffer
		g.p = pcap_open_live(g.ifname, 0, 1, 100, pcap_errbuf);
		if (g.p == NULL) {
			D("cannot open pcap on %s", g.ifname);
			usage();
		}
-	} else if (g.dummy_send) {
+#endif /* !NO_PCAP */
+	} else if (g.dummy_send) { /* but DEV_NETMAP */
		D("using a dummy send routine");
	} else {
-		bzero(&nmr, sizeof(nmr));
-		nmr.nr_version = NETMAP_API;
+		struct nm_desc base_nmd;
+
+		bzero(&base_nmd, sizeof(base_nmd));
+
+		g.nmd_flags = 0;
+		g.nmd_flags |= parse_nmr_config(g.nmr_config, &base_nmd.req);
+		if (g.extra_bufs) {
+			base_nmd.req.nr_arg3 = g.extra_bufs;
+			g.nmd_flags |= NM_OPEN_ARG3;
+		}
+
		/*
-		 * Open the netmap device to fetch the number of queues of our
-		 * interface.
+		 * Open the netmap device using nm_open().
		 *
-		 * The first NIOCREGIF also detaches the card from the
		 * protocol stack and may cause a reset of the card,
		 * which in turn may take some time for the PHY to
-		 * reconfigure.
+		 * reconfigure. We do the open here to have time to reset.
		 */
-		g.main_fd = open("/dev/netmap", O_RDWR);
-		if (g.main_fd == -1) {
-			D("Unable to open /dev/netmap");
-			// fail later
+		g.nmd = nm_open(g.ifname, NULL, g.nmd_flags, &base_nmd);
+		if (g.nmd == NULL) {
+			D("Unable to open %s: %s", g.ifname, strerror(errno));
+			goto out;
		}
-
-		/*
-		 * Register the interface on the netmap device: from now on,
-		 * we can operate on the network interface without any
-		 * interference from the legacy network stack.
-		 *
-		 * We decide to put the first interface registration here to
-		 * give time to cards that take a long time to reset the PHY.
-		 */
-		bzero(&nmr, sizeof(nmr));
-		nmr.nr_version = NETMAP_API;
-		strncpy(nmr.nr_name, g.ifname, sizeof(nmr.nr_name));
-		nmr.nr_version = NETMAP_API;
-		parse_nmr_config(g.nmr_config, &nmr);
-		if (ioctl(g.main_fd, NIOCREGIF, &nmr) == -1) {
-			D("Unable to register interface %s", g.ifname);
-			//continue, fail later
-		}
-		ND("%s: txr %d txd %d rxr %d rxd %d", g.ifname,
-				nmr.nr_tx_rings, nmr.nr_tx_slots,
-				nmr.nr_rx_rings, nmr.nr_rx_slots);
-		//if ((ioctl(g.main_fd, NIOCGINFO, &nmr)) == -1) {
-		//	D("Unable to get if info without name");
-		//} else {
-		//	D("map size is %d Kb", nmr.nr_memsize >> 10);
-		//}
-		if ((ioctl(g.main_fd, NIOCGINFO, &nmr)) == -1) {
-			D("Unable to get if info for %s", g.ifname);
-		}
-		devqueues = nmr.nr_rx_rings;
+		g.main_fd = g.nmd->fd;
+		D("mapped %dKB at %p", g.nmd->req.nr_memsize>>10, g.nmd->mem);
+		devqueues = g.nmd->req.nr_rx_rings;
+
		/* validate provided nthreads. */
		if (g.nthreads < 1 || g.nthreads > devqueues) {
			D("bad nthreads %d, have %d queues", g.nthreads, devqueues);
			// continue, fail later
		}

-		/*
-		 * Map the netmap shared memory: instead of issuing mmap()
-		 * inside the body of the threads, we prefer to keep this
-		 * operation here to simplify the thread logic.
-		 */
-		D("mapping %d Kbytes", nmr.nr_memsize>>10);
-		g.mmap_size = nmr.nr_memsize;
-		g.mmap_addr = (struct netmap_d *) mmap(0, nmr.nr_memsize,
-					    PROT_WRITE | PROT_READ,
-					    MAP_SHARED, g.main_fd, 0);
-		if (g.mmap_addr == MAP_FAILED) {
-			D("Unable to mmap %d KB", nmr.nr_memsize >> 10);
-			// continue, fail later
+		if (verbose) {
+			struct netmap_if *nifp = g.nmd->nifp;
+			struct nmreq *req = &g.nmd->req;
+
+			D("nifp at offset %d, %d tx %d rx region %d",
+			    req->nr_offset, req->nr_tx_rings, req->nr_rx_rings,
+			    req->nr_arg2);
+			for (i = 0; i <= req->nr_tx_rings; i++) {
+				D("   TX%d at 0x%lx", i,
+				    (char *)NETMAP_TXRING(nifp, i) - (char *)nifp);
+			}
+			for (i = 0; i <= req->nr_rx_rings; i++) {
+				D("   RX%d at 0x%lx", i,
+				    (char *)NETMAP_RXRING(nifp, i) - (char *)nifp);
+			}
		}
-
-		/* Print some debug information. */
		fprintf(stdout,
			"%s %s: %d queues, %d threads and %d cpus.\n",
			(g.td_body == sender_body) ? "Sending on" : "Receiving from",
			g.ifname,
			devqueues,
			g.nthreads,
			g.cpus);
		if (g.td_body == sender_body) {
			fprintf(stdout, "%s -> %s (%s -> %s)\n",
				g.src_ip.name, g.dst_ip.name,
				g.src_mac.name, g.dst_mac.name);
		}
-
+
+out:
		/* Exit if something went wrong. */
		if (g.main_fd < 0) {
			D("aborting");
			usage();
		}
	}
-
+
	if (g.options) {
		D("--- SPECIAL OPTIONS:%s%s%s%s%s\n",
			g.options & OPT_PREFETCH ? " prefetch" : "",
			g.options & OPT_ACCESS ? " access" : "",
			g.options & OPT_MEMCPY ? " memcpy" : "",
			g.options & OPT_INDIRECT ? " indirect" : "",
			g.options & OPT_COPY ? " copy" : "");
	}

	g.tx_period.tv_sec = g.tx_period.tv_nsec = 0;
	if (g.tx_rate > 0) {
		/* try to have at least something every second,
-		 * reducing the burst size to 0.5s worth of data
+		 * reducing the burst size to a few milliseconds' worth of data
		 * (but no less than one full set of fragments) */
-		if (g.burst > g.tx_rate/2)
-			g.burst = g.tx_rate/2;
+		uint64_t x;
+		int lim = (g.tx_rate)/300;
+		if (g.burst > lim)
+			g.burst = lim;
		if (g.burst < g.frags)
			g.burst = g.frags;
-		g.tx_period.tv_nsec = (1e9 / g.tx_rate) * g.burst;
+		x = ((uint64_t)1000000000 * (uint64_t)g.burst) / (uint64_t) g.tx_rate;
+		g.tx_period.tv_nsec = x;
		g.tx_period.tv_sec = g.tx_period.tv_nsec / 1000000000;
		g.tx_period.tv_nsec = g.tx_period.tv_nsec % 1000000000;
	}
	if (g.td_body == sender_body)
		D("Sending %d packets every %ld.%09ld s",
			g.burst, g.tx_period.tv_sec, g.tx_period.tv_nsec);
	/* Wait for PHY reset. */
	D("Wait %d secs for phy reset", wait_link);
	sleep(wait_link);
	D("Ready...");

	/* Install ^C handler. */
	global_nthreads = g.nthreads;
	signal(SIGINT, sigint_h);

-#if 0 // XXX this is not needed, i believe
-	if (g.dev_type > DEV_NETMAP) {
-		g.p = pcap_open_live(g.ifname, 0, 1, 100, NULL);
-		if (g.p == NULL) {
-			D("cannot open pcap on %s", g.ifname);
-			usage();
-		} else
-			D("using pcap %p on %s", g.p, g.ifname);
-	}
-#endif // XXX
	start_threads(&g);
	main_thread(&g);
	return 0;
}

/* end of file */
Index: stable/9/tools/tools/netmap/vale-ctl.c
===================================================================
--- stable/9/tools/tools/netmap/vale-ctl.c	(revision 262152)
+++ stable/9/tools/tools/netmap/vale-ctl.c	(revision 262153)
@@ -1,163 +1,172 @@
 /*
- * Copyright (C) 2013 Michio Honda. All rights reserved.
+ * Copyright (C) 2013-2014 Michio Honda. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/* $FreeBSD$ */

#include <errno.h>
#include <stdio.h>
#include <inttypes.h>	/* PRI* macros */
#include <string.h>	/* strcmp */
#include <fcntl.h>	/* open */
#include <unistd.h>	/* close */
#include <sys/ioctl.h>	/* ioctl */
#include <sys/param.h>
+#include <sys/socket.h>	/* apple needs sockaddr */
#include <net/if.h>	/* ifreq */
#include <net/netmap.h>
#include <net/netmap_user.h>
#include <libgen.h>	/* basename */

/* debug support */
#define ND(format, ...)	do {} while(0)
#define D(format, ...)					\
	fprintf(stderr, "%s [%d] " format "\n",		\
	__FUNCTION__, __LINE__, ##__VA_ARGS__)

static int
bdg_ctl(const char *name, int nr_cmd, int nr_arg)
{
	struct nmreq nmr;
	int error = 0;
	int fd = open("/dev/netmap", O_RDWR);

	if (fd == -1) {
		D("Unable to open /dev/netmap");
		return -1;
	}

	bzero(&nmr, sizeof(nmr));
	nmr.nr_version = NETMAP_API;
	if (name != NULL) /* might be NULL */
		strncpy(nmr.nr_name, name, sizeof(nmr.nr_name));
	nmr.nr_cmd = nr_cmd;

	switch (nr_cmd) {
	case NETMAP_BDG_ATTACH:
	case NETMAP_BDG_DETACH:
		if (nr_arg && nr_arg != NETMAP_BDG_HOST)
			nr_arg = 0;
		nmr.nr_arg1 = nr_arg;
		error = ioctl(fd, NIOCREGIF, &nmr);
-		if (error == -1)
-			D("Unable to %s %s to the bridge", nr_cmd ==
+		if (error == -1) {
+			ND("Unable to %s %s to the bridge", nr_cmd ==
			    NETMAP_BDG_DETACH?"detach":"attach", name);
-		else
-			D("Success to %s %s to the bridge\n", nr_cmd ==
+			perror(name);
+		} else
+			ND("Successfully %sed %s to the bridge", nr_cmd ==
			    NETMAP_BDG_DETACH?"detach":"attach", name);
		break;

	case NETMAP_BDG_LIST:
		if (strlen(nmr.nr_name)) { /* name to bridge/port info */
			error = ioctl(fd, NIOCGINFO, &nmr);
-			if (error)
-				D("Unable to obtain info for %s", name);
-			else
+			if (error) {
+				ND("Unable to obtain info for %s", name);
+				perror(name);
+			} else
				D("%s at bridge:%d port:%d", name, nmr.nr_arg1,
				    nmr.nr_arg2);
			break;
		}

		/* scan all the bridges and ports */
		nmr.nr_arg1 = nmr.nr_arg2 = 0;
		for (; !ioctl(fd, NIOCGINFO, &nmr); nmr.nr_arg2++) {
			D("bridge:%d port:%d %s", nmr.nr_arg1, nmr.nr_arg2,
			    nmr.nr_name);
			nmr.nr_name[0] = '\0';
		}
		break;

	default: /* GINFO */
		nmr.nr_cmd = nmr.nr_arg1 = nmr.nr_arg2 = 0;
		error = ioctl(fd, NIOCGINFO, &nmr);
-		if (error)
-			D("Unable to get if info for %s", name);
-		else
+		if (error) {
+			ND("Unable to get if info for %s", name);
+			perror(name);
+		} else
			D("%s: %d queues.", name, nmr.nr_rx_rings);
		break;
	}
	close(fd);
	return error;
}

int
main(int argc, char *argv[])
{
	int ch, nr_cmd = 0, nr_arg = 0;
	const char *command = basename(argv[0]);
	char *name = NULL;

-	if (argc != 3 && argc != 1 /* list all */ ) {
+	if (argc > 3) {
usage:
		fprintf(stderr,
			"Usage:\n"
			"%s arguments\n"
			"\t-g interface	interface name to get info\n"
			"\t-d interface	interface name to be detached\n"
			"\t-a interface	interface name to be attached\n"
			"\t-h interface	interface name to be attached with the host stack\n"
-			"\t-l list all or specified bridge's interfaces\n"
+			"\t-l list all or specified bridge's interfaces (default)\n"
			"", command);
		return 0;
	}

-	while ((ch = getopt(argc, argv, "d:a:h:g:l:")) != -1) {
+	while ((ch = getopt(argc, argv, "d:a:h:g:l")) != -1) {
+		name = optarg; /* default */
		switch (ch) {
		default:
			fprintf(stderr, "bad option %c %s", ch, optarg);
			goto usage;
		case 'd':
			nr_cmd = NETMAP_BDG_DETACH;
			break;
		case 'a':
			nr_cmd = NETMAP_BDG_ATTACH;
			break;
		case 'h':
			nr_cmd = NETMAP_BDG_ATTACH;
			nr_arg = NETMAP_BDG_HOST;
			break;
		case 'g':
			nr_cmd = 0;
			break;
		case 'l':
			nr_cmd = NETMAP_BDG_LIST;
+			if (optind < argc && argv[optind][0] == '-')
+				name = NULL;
			break;
		}
-		name = optarg;
+		if (optind != argc) {
+			// fprintf(stderr, "optind %d argc %d\n", optind, argc);
+			goto usage;
+		}
	}
	if (argc == 1)
		nr_cmd = NETMAP_BDG_LIST;
-	bdg_ctl(name, nr_cmd, nr_arg);
-	return 0;
+	return bdg_ctl(name, nr_cmd, nr_arg) ? 1 : 0;
}
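
The pkt-gen.c rework above replaces the explicit open("/dev/netmap") + NIOCREGIF + mmap() sequence with a single call to nm_open() from <net/netmap_user.h>, which performs all three steps and records the file descriptor, the mapped region and the request in a struct nm_desc. As a rough illustration of the helper API pkt-gen now relies on, a minimal standalone receiver might look like the sketch below. This is an editor's example, not part of the commit: the interface name netmap:em0 is a placeholder, and the NETMAP_WITH_LIBS guard plus the nm_nextpkt()/nm_close() helpers are assumed to come from the same header that provides nm_open().

	/* Minimal netmap receive loop built on the nm_open() helpers.
	 * Sketch only: error handling beyond the open is omitted. */
	#include <stdio.h>
	#include <poll.h>
	#define NETMAP_WITH_LIBS	/* pull in the inline helper functions */
	#include <net/netmap_user.h>

	int
	main(void)
	{
		struct nm_desc *d;
		struct pollfd pfd;

		/* open + register + mmap in one call, as pkt-gen now does */
		d = nm_open("netmap:em0", NULL, 0, NULL);
		if (d == NULL)
			return 1;
		pfd.fd = d->fd;		/* the descriptor pkt-gen saves in g.main_fd */
		pfd.events = POLLIN;
		for (;;) {
			struct nm_pkthdr h;
			unsigned char *buf;

			poll(&pfd, 1, 1000);	/* block until packets arrive */
			while ((buf = nm_nextpkt(d, &h)) != NULL)
				printf("received %u bytes\n", h.len);
		}
		nm_close(d);	/* not reached: munmap()s and close()s for us */
		return 0;
	}

nm_close() performs the same munmap()/close() pair that the patched main_thread() now spells out as munmap(g->nmd->mem, g->nmd->req.nr_memsize) followed by close(g->main_fd).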
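
On the vale-ctl side, the rework makes listing the default action (plain "vale-ctl", or "vale-ctl -l", walks all bridges and ports, since -l no longer takes an argument), reports ioctl failures through perror() instead of a generic message, and propagates the result as the exit status (0 on success, 1 on failure). A typical session, with hypothetical port names, would be "vale-ctl -a vale0:em0" to attach em0 to the switch vale0, "vale-ctl -h vale0:em1" to attach em1 together with its host stack rings, and "vale-ctl -d vale0:em0" to detach it again.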