Index: share/man/man4/em.4
===================================================================
--- share/man/man4/em.4
+++ share/man/man4/em.4
@@ -45,6 +45,14 @@
 .Cd "device em"
 .Ed
 .Pp
+Optional multiqueue support is available via the following kernel
+compile option:
+.Bd -ragged -offset indent
+.Cd "options EM_MULTIQUEUE"
+.Ed
+.Pp
+Note: The EM_MULTIQUEUE option is not supported by Intel.
+.Pp
 Alternatively, to load the driver as a
 module at boot time, place the following line in
 .Xr loader.conf 5 :
@@ -197,6 +205,14 @@
 prompt before booting the kernel or stored in
 .Xr loader.conf 5 .
 .Bl -tag -width indent
+.It Va hw.em.eee_setting
+Disable or enable Energy Efficient Ethernet. Default is 1 (disabled).
+.It Va hw.em.msix
+Enable or disable MSI-X style interrupts. Default is 1 (enabled).
+.It Va hw.em.smart_pwr_down
+Enable or disable smart power down features on newer adapters. Default is 0 (disabled).
+.It Va hw.em.sbp
+Show bad packets when in promiscuous mode. Default is 0 (off).
 .It Va hw.em.rxd
 Number of receive descriptors allocated by the driver.
 The default value is 1024 for adapters newer than 82547,
@@ -228,6 +244,10 @@
 .Va hw.em.tx_int_delay
 is non-zero, this tunable limits the maximum
 delay in which a transmit interrupt is generated.
+.It Va hw.em.num_queues
+Number of hardware queues used on this adapter; the maximum is 2 and the default is 1.
+Only valid with the kernel configuration option
+.Cd "options EM_MULTIQUEUE" .
 .El
 .Sh FILES
 .Bl -tag -width /dev/led/em*
Index: sys/conf/NOTES
===================================================================
--- sys/conf/NOTES
+++ sys/conf/NOTES
@@ -2980,6 +2980,9 @@
 # Module to enable execution of application via emulators like QEMU
 options 	IMAGACT_BINMISC
 
+# Intel em(4) driver
+options 	EM_MULTIQUEUE	# Activate multiqueue features/disable MSI-X
+
 # zlib I/O stream support
 # This enables support for compressed core dumps.
 options 	GZIO
Index: sys/conf/options
===================================================================
--- sys/conf/options
+++ sys/conf/options
@@ -940,3 +940,6 @@
 RANDOM_YARROW	opt_random.h
 RANDOM_FORTUNA	opt_random.h
 RANDOM_DEBUG	opt_random.h
+
+# Intel em(4) driver
+EM_MULTIQUEUE	opt_em.h
Index: sys/dev/e1000/e1000_defines.h
===================================================================
--- sys/dev/e1000/e1000_defines.h
+++ sys/dev/e1000/e1000_defines.h
@@ -158,10 +158,12 @@
 			E1000_RXDEXT_STATERR_CXE | \
 			E1000_RXDEXT_STATERR_RXE)
 
+#define E1000_MRQC_RSS_ENABLE_2Q	0x00000001
 #define E1000_MRQC_RSS_FIELD_MASK	0xFFFF0000
 #define E1000_MRQC_RSS_FIELD_IPV4_TCP	0x00010000
 #define E1000_MRQC_RSS_FIELD_IPV4	0x00020000
 #define E1000_MRQC_RSS_FIELD_IPV6_TCP_EX 0x00040000
+#define E1000_MRQC_RSS_FIELD_IPV6_EX	0x00080000
 #define E1000_MRQC_RSS_FIELD_IPV6	0x00100000
 #define E1000_MRQC_RSS_FIELD_IPV6_TCP	0x00200000
 
Index: sys/dev/e1000/if_em.h
===================================================================
--- sys/dev/e1000/if_em.h
+++ sys/dev/e1000/if_em.h
@@ -53,7 +53,11 @@
  */
 #define EM_MIN_TXD		80
 #define EM_MAX_TXD		4096
+#ifdef EM_MULTIQUEUE
+#define EM_DEFAULT_TXD		4096
+#else
 #define EM_DEFAULT_TXD		1024
+#endif
 
 /*
  * EM_RXD - Maximum number of receive Descriptors
@@ -70,7 +74,11 @@
  */
 #define EM_MIN_RXD		80
 #define EM_MAX_RXD		4096
+#ifdef EM_MULTIQUEUE
+#define EM_DEFAULT_RXD		4096
+#else
 #define EM_DEFAULT_RXD		1024
+#endif
 
 /*
  * EM_TIDV - Transmit Interrupt Delay Value
@@ -117,7 +125,11 @@
  * restoring the network connection. To eliminate the potential
 * for the hang ensure that EM_RDTR is set to 0.
*/ +#ifdef EM_MULTIQUEUE +#define EM_RDTR 64 +#else #define EM_RDTR 0 +#endif /* * Receive Interrupt Absolute Delay Timer (Not valid for 82542/82543/82544) @@ -130,7 +142,11 @@ * along with EM_RDTR, may improve traffic throughput in specific network * conditions. */ +#ifdef EM_MULTIQUEUE +#define EM_RADV 128 +#else #define EM_RADV 64 +#endif /* * This parameter controls the max duration of transmit watchdog. @@ -209,7 +225,15 @@ */ #define EM_DBA_ALIGN 128 -#define SPEED_MODE_BIT (1<<21) /* On PCI-E MACs only */ +/* + * See Intel 82574 Driver Programming Interface Manual, Section 10.2.6.9 + */ +#define TARC_COMPENSATION_MODE (1 << 7) /* Compensation Mode */ +#define TARC_SPEED_MODE_BIT (1 << 21) /* On PCI-E MACs only */ +#define TARC_MQ_FIX (1 << 23) | \ + (1 << 24) | \ + (1 << 25) /* Handle errata in MQ mode */ +#define TARC_ERRATA_BIT (1 << 26) /* Note from errata on 82574 */ /* PCI Config defines */ #define EM_BAR_TYPE(v) ((v) & EM_BAR_TYPE_MASK) @@ -259,6 +283,14 @@ * solve it just using this define. */ #define EM_EIAC 0x000DC +/* + * 82574 only reports 3 MSI-X vectors by default; + * defines assisting with making it report 5 are + * located here. + */ +#define EM_NVM_PCIE_CTRL 0x1B +#define EM_NVM_MSIX_N_MASK (0x7 << EM_NVM_MSIX_N_SHIFT) +#define EM_NVM_MSIX_N_SHIFT 7 /* * Bus dma allocation structure used by @@ -391,7 +423,7 @@ eventhandler_tag vlan_detach; u16 num_vlans; - u16 num_queues; + u8 num_queues; /* * Transmit rings: Index: sys/dev/e1000/if_em.c =================================================================== --- sys/dev/e1000/if_em.c +++ sys/dev/e1000/if_em.c @@ -32,6 +32,8 @@ ******************************************************************************/ /*$FreeBSD$*/ +#include "opt_em.h" +#include "opt_ddb.h" #include "opt_inet.h" #include "opt_inet6.h" @@ -41,6 +43,10 @@ #include #include +#ifdef DDB +#include +#include +#endif #if __FreeBSD_version >= 800000 #include #endif @@ -52,6 +58,7 @@ #include #include #include +#include #include #include #include @@ -208,7 +215,7 @@ #ifdef EM_MULTIQUEUE static int em_mq_start(if_t, struct mbuf *); static int em_mq_start_locked(if_t, - struct tx_ring *, struct mbuf *); + struct tx_ring *); static void em_qflush(if_t); #else static void em_start(if_t); @@ -299,6 +306,10 @@ static void em_handle_rx(void *context, int pending); static void em_handle_link(void *context, int pending); +#ifdef EM_MULTIQUEUE +static void em_enable_vectors_82574(struct adapter *); +#endif + static void em_set_sysctl_value(struct adapter *, const char *, const char *, int *, int); static int em_set_flowcntl(SYSCTL_HANDLER_ARGS); @@ -388,6 +399,19 @@ SYSCTL_INT(_hw_em, OID_AUTO, enable_msix, CTLFLAG_RDTUN, &em_enable_msix, 0, "Enable MSI-X interrupts"); +#ifdef EM_MULTIQUEUE +static int em_num_queues = 1; +SYSCTL_INT(_hw_em, OID_AUTO, num_queues, CTLFLAG_RDTUN, &em_num_queues, 0, + "82574 only: Number of queues to configure, 0 indicates autoconfigure"); +#endif + +/* +** Global variable to store last used CPU when binding queues +** to CPUs in igb_allocate_msix. Starts at CPU_FIRST and increments when a +** queue is bound to a cpu. 
+*/ +static int em_last_bind_cpu = -1; + /* How many packets rxeof tries to clean at a time */ static int em_rx_process_limit = 100; SYSCTL_INT(_hw_em, OID_AUTO, rx_process_limit, CTLFLAG_RDTUN, @@ -420,10 +444,10 @@ em_probe(device_t dev) { char adapter_name[60]; - u16 pci_vendor_id = 0; - u16 pci_device_id = 0; - u16 pci_subvendor_id = 0; - u16 pci_subdevice_id = 0; + uint16_t pci_vendor_id = 0; + uint16_t pci_device_id = 0; + uint16_t pci_subvendor_id = 0; + uint16_t pci_subdevice_id = 0; em_vendor_info_t *ent; INIT_DEBUGOUT("em_probe: begin"); @@ -550,6 +574,11 @@ goto err_pci; } + /* + * Setup MSI/X or MSI if PCI Express + */ + adapter->msix = em_setup_msix(adapter); + e1000_get_bus_info(hw); /* Set up some sysctls for the tunable interrupt delays */ @@ -880,7 +909,7 @@ EM_TX_LOCK(txr); #ifdef EM_MULTIQUEUE if (!drbr_empty(ifp, txr->br)) - em_mq_start_locked(ifp, txr, NULL); + em_mq_start_locked(ifp, txr); #else if (!if_sendq_empty(ifp)) em_start_locked(ifp, txr); @@ -894,105 +923,7 @@ } -#ifdef EM_MULTIQUEUE -/********************************************************************* - * Multiqueue Transmit routines - * - * em_mq_start is called by the stack to initiate a transmit. - * however, if busy the driver can queue the request rather - * than do an immediate send. It is this that is an advantage - * in this driver, rather than also having multiple tx queues. - **********************************************************************/ -static int -em_mq_start_locked(if_t ifp, struct tx_ring *txr, struct mbuf *m) -{ - struct adapter *adapter = txr->adapter; - struct mbuf *next; - int err = 0, enq = 0; - - if ((if_getdrvflags(ifp) & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) != - IFF_DRV_RUNNING || adapter->link_active == 0) { - if (m != NULL) - err = drbr_enqueue(ifp, txr->br, m); - return (err); - } - - enq = 0; - if (m != NULL) { - err = drbr_enqueue(ifp, txr->br, m); - if (err) - return (err); - } - - /* Process the queue */ - while ((next = drbr_peek(ifp, txr->br)) != NULL) { - if ((err = em_xmit(txr, &next)) != 0) { - if (next == NULL) - drbr_advance(ifp, txr->br); - else - drbr_putback(ifp, txr->br, next); - break; - } - drbr_advance(ifp, txr->br); - enq++; - if_inc_counter(ifp, IFCOUNTER_OBYTES, next->m_pkthdr.len); - if (next->m_flags & M_MCAST) - if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1); - if_etherbpfmtap(ifp, next); - if ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0) - break; - } - - /* Mark the queue as having work */ - if ((enq > 0) && (txr->busy == EM_TX_IDLE)) - txr->busy = EM_TX_BUSY; - - if (txr->tx_avail < EM_MAX_SCATTER) - em_txeof(txr); - if (txr->tx_avail < EM_MAX_SCATTER) - if_setdrvflagbits(ifp, IFF_DRV_OACTIVE,0); - return (err); -} - -/* -** Multiqueue capable stack interface -*/ -static int -em_mq_start(if_t ifp, struct mbuf *m) -{ - struct adapter *adapter = if_getsoftc(ifp); - struct tx_ring *txr = adapter->tx_rings; - int error; - - if (EM_TX_TRYLOCK(txr)) { - error = em_mq_start_locked(ifp, txr, m); - EM_TX_UNLOCK(txr); - } else - error = drbr_enqueue(ifp, txr->br, m); - - return (error); -} - -/* -** Flush all ring buffers -*/ -static void -em_qflush(if_t ifp) -{ - struct adapter *adapter = if_getsoftc(ifp); - struct tx_ring *txr = adapter->tx_rings; - struct mbuf *m; - - for (int i = 0; i < adapter->num_queues; i++, txr++) { - EM_TX_LOCK(txr); - while ((m = buf_ring_dequeue_sc(txr->br)) != NULL) - m_freem(m); - EM_TX_UNLOCK(txr); - } - if_qflush(ifp); -} -#else /* !EM_MULTIQUEUE */ - +#ifndef EM_MULTIQUEUE static void em_start_locked(if_t ifp, struct tx_ring 
*txr) { @@ -1035,7 +966,8 @@ txr->busy = EM_TX_BUSY; /* Send a copy of the frame to the BPF listener */ - if_etherbpfmtap(ifp, m_head); + ETHER_BPF_MTAP(ifp, m_head); + } return; @@ -1054,6 +986,115 @@ } return; } +#else /* EM_MULTIQUEUE */ +/********************************************************************* + * Multiqueue Transmit routines + * + * em_mq_start is called by the stack to initiate a transmit. + * however, if busy the driver can queue the request rather + * than do an immediate send. It is this that is an advantage + * in this driver, rather than also having multiple tx queues. + **********************************************************************/ +/* +** Multiqueue capable stack interface +*/ +static int +em_mq_start(if_t ifp, struct mbuf *m) +{ + struct adapter *adapter = if_getsoftc(ifp); + struct tx_ring *txr = adapter->tx_rings; + unsigned int i, error; + + if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) + i = m->m_pkthdr.flowid % adapter->num_queues; + else + i = curcpu % adapter->num_queues; + + txr = &adapter->tx_rings[i]; + + error = drbr_enqueue(ifp, txr->br, m); + if (error) + return (error); + + if (EM_TX_TRYLOCK(txr)) { + em_mq_start_locked(ifp, txr); + EM_TX_UNLOCK(txr); + } else + taskqueue_enqueue(txr->tq, &txr->tx_task); + + return (0); +} + +static int +em_mq_start_locked(if_t ifp, struct tx_ring *txr) +{ + struct adapter *adapter = txr->adapter; + struct mbuf *next; + int err = 0, enq = 0; + + EM_TX_LOCK_ASSERT(txr); + + if (((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0) || + adapter->link_active == 0) { + return (ENETDOWN); + } + + /* Process the queue */ + while ((next = drbr_peek(ifp, txr->br)) != NULL) { + if ((err = em_xmit(txr, &next)) != 0) { + if (next == NULL) { + /* It was freed, move forward */ + drbr_advance(ifp, txr->br); + } else { + /* + * Still have one left, it may not be + * the same since the transmit function + * may have changed it. 
+ */ + drbr_putback(ifp, txr->br, next); + } + break; + } + drbr_advance(ifp, txr->br); + enq++; + if_inc_counter(ifp, IFCOUNTER_OBYTES, next->m_pkthdr.len); + if (next->m_flags & M_MCAST) + if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1); + ETHER_BPF_MTAP(ifp, next); + if ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0) + break; + } + + /* Mark the queue as having work */ + if ((enq > 0) && (txr->busy == EM_TX_IDLE)) + txr->busy = EM_TX_BUSY; + + if (txr->tx_avail < EM_MAX_SCATTER) + em_txeof(txr); + if (txr->tx_avail < EM_MAX_SCATTER) { + if_setdrvflagbits(ifp, IFF_DRV_OACTIVE,0); + } + return (err); +} + +/* +** Flush all ring buffers +*/ +static void +em_qflush(if_t ifp) +{ + struct adapter *adapter = if_getsoftc(ifp); + struct tx_ring *txr = adapter->tx_rings; + struct mbuf *m; + + for (int i = 0; i < adapter->num_queues; i++, txr++) { + EM_TX_LOCK(txr); + while ((m = buf_ring_dequeue_sc(txr->br)) != NULL) + m_freem(m); + EM_TX_UNLOCK(txr); + } + if_qflush(ifp); +} #endif /* EM_MULTIQUEUE */ /********************************************************************* @@ -1449,7 +1490,7 @@ em_txeof(txr); #ifdef EM_MULTIQUEUE if (!drbr_empty(ifp, txr->br)) - em_mq_start_locked(ifp, txr, NULL); + em_mq_start_locked(ifp, txr); #else if (!if_sendq_empty(ifp)) em_start_locked(ifp, txr); @@ -1516,14 +1557,14 @@ struct tx_ring *txr = adapter->tx_rings; struct rx_ring *rxr = adapter->rx_rings; - if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) { bool more = em_rxeof(rxr, adapter->rx_process_limit, NULL); + EM_TX_LOCK(txr); em_txeof(txr); #ifdef EM_MULTIQUEUE if (!drbr_empty(ifp, txr->br)) - em_mq_start_locked(ifp, txr, NULL); + em_mq_start_locked(ifp, txr); #else if (!if_sendq_empty(ifp)) em_start_locked(ifp, txr); @@ -1557,11 +1598,12 @@ em_txeof(txr); #ifdef EM_MULTIQUEUE if (!drbr_empty(ifp, txr->br)) - em_mq_start_locked(ifp, txr, NULL); + em_mq_start_locked(ifp, txr); #else if (!if_sendq_empty(ifp)) em_start_locked(ifp, txr); #endif + /* Reenable this interrupt */ E1000_WRITE_REG(&adapter->hw, E1000_IMS, txr->ims); EM_TX_UNLOCK(txr); @@ -1587,9 +1629,10 @@ more = em_rxeof(rxr, adapter->rx_process_limit, NULL); if (more) taskqueue_enqueue(rxr->tq, &rxr->rx_task); - else + else { /* Reenable this interrupt */ E1000_WRITE_REG(&adapter->hw, E1000_IMS, rxr->ims); + } return; } @@ -1616,6 +1659,16 @@ } else E1000_WRITE_REG(&adapter->hw, E1000_IMS, EM_MSIX_LINK | E1000_IMS_LSC); + /* + ** Because we must read the ICR for this interrupt + ** it may clear other causes using autoclear, for + ** this reason we simply create a soft interrupt + ** for all these vectors. 
+ */ + if (reg_icr) { + E1000_WRITE_REG(&adapter->hw, + E1000_ICS, adapter->ims); + } return; } @@ -1629,9 +1682,10 @@ more = em_rxeof(rxr, adapter->rx_process_limit, NULL); if (more) taskqueue_enqueue(rxr->tq, &rxr->rx_task); - else + else { /* Reenable this interrupt */ E1000_WRITE_REG(&adapter->hw, E1000_IMS, rxr->ims); + } } static void @@ -1645,7 +1699,7 @@ em_txeof(txr); #ifdef EM_MULTIQUEUE if (!drbr_empty(ifp, txr->br)) - em_mq_start_locked(ifp, txr, NULL); + em_mq_start_locked(ifp, txr); #else if (!if_sendq_empty(ifp)) em_start_locked(ifp, txr); @@ -1675,7 +1729,7 @@ EM_TX_LOCK(txr); #ifdef EM_MULTIQUEUE if (!drbr_empty(ifp, txr->br)) - em_mq_start_locked(ifp, txr, NULL); + em_mq_start_locked(ifp, txr); #else if (if_sendq_empty(ifp)) em_start_locked(ifp, txr); @@ -2219,7 +2273,7 @@ if_t ifp = adapter->ifp; struct tx_ring *txr = adapter->tx_rings; struct rx_ring *rxr = adapter->rx_rings; - u32 trigger; + u32 trigger = 0; EM_CORE_LOCK_ASSERT(adapter); @@ -2232,9 +2286,11 @@ e1000_rar_set(&adapter->hw, adapter->hw.mac.addr, 0); /* Mask to use in the irq trigger */ - if (adapter->msix_mem) - trigger = rxr->ims; - else + if (adapter->msix_mem) { + for (int i = 0; i < adapter->num_queues; i++, rxr++) + trigger |= rxr->ims; + rxr = adapter->rx_rings; + } else trigger = E1000_ICS_RXDMT0; /* @@ -2243,7 +2299,6 @@ ** and the HUNG state will be static if set. */ for (int i = 0; i < adapter->num_queues; i++, txr++) { - /* Last cycle a queue was declared hung */ if (txr->busy == EM_TX_HUNG) goto hung; if (txr->busy >= EM_TX_MAXTRIES) @@ -2261,14 +2316,9 @@ return; hung: /* Looks like we're hung */ - device_printf(adapter->dev, "Watchdog timeout -- resetting\n"); - device_printf(adapter->dev, - "Queue(%d) tdh = %d, hw tdt = %d\n", txr->me, - E1000_READ_REG(&adapter->hw, E1000_TDH(txr->me)), - E1000_READ_REG(&adapter->hw, E1000_TDT(txr->me))); - device_printf(adapter->dev,"TX(%d) desc avail = %d," - "Next TX to Clean = %d\n", - txr->me, txr->tx_avail, txr->next_to_clean); + device_printf(adapter->dev, "Watchdog timeout Queue[%d]-- resetting\n", + txr->me); + em_print_debug_info(adapter); if_setdrvflagbits(ifp, 0, IFF_DRV_RUNNING); adapter->watchdog_events++; em_init_locked(adapter); @@ -2320,7 +2370,7 @@ (hw->mac.type == e1000_82572))) { int tarc0; tarc0 = E1000_READ_REG(hw, E1000_TARC(0)); - tarc0 &= ~SPEED_MODE_BIT; + tarc0 &= ~TARC_SPEED_MODE_BIT; E1000_WRITE_REG(hw, E1000_TARC(0), tarc0); } if (bootverbose) @@ -2436,14 +2486,6 @@ rman_get_bushandle(adapter->memory); adapter->hw.hw_addr = (u8 *)&adapter->osdep.mem_bus_space_handle; - /* Default to a single queue */ - adapter->num_queues = 1; - - /* - * Setup MSI/X or MSI if PCI Express - */ - adapter->msix = em_setup_msix(adapter); - adapter->hw.back = &adapter->osdep; return (0); @@ -2518,13 +2560,14 @@ struct tx_ring *txr = adapter->tx_rings; struct rx_ring *rxr = adapter->rx_rings; int error, rid, vector = 0; + int cpu_id = 0; /* Make sure all interrupts are disabled */ E1000_WRITE_REG(&adapter->hw, E1000_IMC, 0xffffffff); /* First set up ring resources */ - for (int i = 0; i < adapter->num_queues; i++, txr++, rxr++) { + for (int i = 0; i < adapter->num_queues; i++, rxr++, vector++) { /* RX ring */ rid = vector + 1; @@ -2544,14 +2587,20 @@ return (error); } #if __FreeBSD_version >= 800504 - bus_describe_intr(dev, rxr->res, rxr->tag, "rx %d", i); + bus_describe_intr(dev, rxr->res, rxr->tag, "rx%d", i); #endif - rxr->msix = vector++; /* NOTE increment vector for TX */ + rxr->msix = vector; + + if (em_last_bind_cpu < 0) + em_last_bind_cpu = 
CPU_FIRST(); + cpu_id = em_last_bind_cpu; + bus_bind_intr(dev, rxr->res, cpu_id); + TASK_INIT(&rxr->rx_task, 0, em_handle_rx, rxr); rxr->tq = taskqueue_create_fast("em_rxq", M_NOWAIT, taskqueue_thread_enqueue, &rxr->tq); - taskqueue_start_threads(&rxr->tq, 1, PI_NET, "%s rxq", - device_get_nameunit(adapter->dev)); + taskqueue_start_threads(&rxr->tq, 1, PI_NET, "%s rxq (cpuid %d)", + device_get_nameunit(adapter->dev), cpu_id); /* ** Set the bit to enable interrupt ** in E1000_IMS -- bits 20 and 21 @@ -2559,8 +2608,13 @@ ** NOTHING to do with the MSIX vector */ rxr->ims = 1 << (20 + i); + adapter->ims |= rxr->ims; adapter->ivars |= (8 | rxr->msix) << (i * 4); + em_last_bind_cpu = CPU_NEXT(em_last_bind_cpu); + } + + for (int i = 0; i < adapter->num_queues; i++, txr++, vector++) { /* TX ring */ rid = vector + 1; txr->res = bus_alloc_resource_any(dev, @@ -2578,14 +2632,20 @@ return (error); } #if __FreeBSD_version >= 800504 - bus_describe_intr(dev, txr->res, txr->tag, "tx %d", i); + bus_describe_intr(dev, txr->res, txr->tag, "tx%d", i); #endif - txr->msix = vector++; /* Increment vector for next pass */ + txr->msix = vector; + + if (em_last_bind_cpu < 0) + em_last_bind_cpu = CPU_FIRST(); + cpu_id = em_last_bind_cpu; + bus_bind_intr(dev, txr->res, cpu_id); + TASK_INIT(&txr->tx_task, 0, em_handle_tx, txr); txr->tq = taskqueue_create_fast("em_txq", M_NOWAIT, taskqueue_thread_enqueue, &txr->tq); - taskqueue_start_threads(&txr->tq, 1, PI_NET, "%s txq", - device_get_nameunit(adapter->dev)); + taskqueue_start_threads(&txr->tq, 1, PI_NET, "%s txq (cpuid %d)", + device_get_nameunit(adapter->dev), cpu_id); /* ** Set the bit to enable interrupt ** in E1000_IMS -- bits 22 and 23 @@ -2593,13 +2653,16 @@ ** NOTHING to do with the MSIX vector */ txr->ims = 1 << (22 + i); + adapter->ims |= txr->ims; adapter->ivars |= (8 | txr->msix) << (8 + (i * 4)); + + em_last_bind_cpu = CPU_NEXT(em_last_bind_cpu); } /* Link interrupt */ - ++rid; + rid = vector + 1; adapter->res = bus_alloc_resource_any(dev, - SYS_RES_IRQ, &rid, RF_ACTIVE); + SYS_RES_IRQ, &rid, RF_SHAREABLE | RF_ACTIVE); if (!adapter->res) { device_printf(dev,"Unable to allocate " "bus resource: Link interrupt [%d]\n", rid); @@ -2615,7 +2678,7 @@ return (error); } #if __FreeBSD_version >= 800504 - bus_describe_intr(dev, adapter->res, adapter->tag, "link"); + bus_describe_intr(dev, adapter->res, adapter->tag, "link"); #endif adapter->linkvec = vector; adapter->ivars |= (8 | vector) << 16; @@ -2639,9 +2702,8 @@ */ for (int i = 0; i < adapter->num_queues; i++) { txr = &adapter->tx_rings[i]; - rxr = &adapter->rx_rings[i]; /* an early abort? */ - if ((txr == NULL) || (rxr == NULL)) + if (txr == NULL) break; rid = txr->msix +1; if (txr->tag != NULL) { @@ -2651,6 +2713,11 @@ if (txr->res != NULL) bus_release_resource(dev, SYS_RES_IRQ, rid, txr->res); + + rxr = &adapter->rx_rings[i]; + /* an early abort? */ + if (rxr == NULL) + break; rid = rxr->msix +1; if (rxr->tag != NULL) { bus_teardown_intr(dev, rxr->res, rxr->tag); @@ -2700,14 +2767,19 @@ device_t dev = adapter->dev; int val; + /* Nearly always going to use one queue */ + adapter->num_queues = 1; + /* - ** Setup MSI/X for Hartwell: tests have shown - ** use of two queues to be unstable, and to - ** provide no great gain anyway, so we simply - ** seperate the interrupts and use a single queue. + ** Try using MSI-X for Hartwell adapters */ if ((adapter->hw.mac.type == e1000_82574) && (em_enable_msix == TRUE)) { +#ifdef EM_MULTIQUEUE + adapter->num_queues = (em_num_queues == 1) ? 
1 : 2; + if (adapter->num_queues > 1) + em_enable_vectors_82574(adapter); +#endif /* Map the MSIX BAR */ int rid = PCIR_BAR(EM_MSIX_BAR); adapter->msix_mem = bus_alloc_resource_any(dev, @@ -2719,16 +2791,34 @@ goto msi; } val = pci_msix_count(dev); - /* We only need/want 3 vectors */ - if (val >= 3) - val = 3; - else { - device_printf(adapter->dev, - "MSIX: insufficient vectors, using MSI\n"); - goto msi; + +#ifdef EM_MULTIQUEUE + /* We need 5 vectors in the multiqueue case */ + if (adapter->num_queues > 1 ) { + if (val >= 5) + val = 5; + else { + adapter->num_queues = 1; + device_printf(adapter->dev, + "Insufficient MSIX vectors for >1 queue, " + "using single queue...\n"); + goto msix_one; + } + } else { +msix_one: +#endif + if (val >= 3) + val = 3; + else { + device_printf(adapter->dev, + "Insufficient MSIX vectors, using MSI\n"); + goto msi; + } +#ifdef EM_MULTIQUEUE } +#endif - if ((pci_alloc_msix(dev, &val) == 0) && (val == 3)) { + if ((pci_alloc_msix(dev, &val) == 0)) { device_printf(adapter->dev, "Using MSIX interrupts " "with %d vectors\n", val); @@ -2749,7 +2839,7 @@ } val = 1; if (pci_alloc_msi(dev, &val) == 0) { - device_printf(adapter->dev,"Using an MSI interrupt\n"); + device_printf(adapter->dev, "Using an MSI interrupt\n"); return (val); } /* Should only happen due to manual configuration */ @@ -3394,7 +3484,7 @@ { struct tx_ring *txr = adapter->tx_rings; struct e1000_hw *hw = &adapter->hw; - u32 tctl, tarc, tipg = 0; + u32 tctl, txdctl = 0, tarc, tipg = 0; INIT_DEBUGOUT("em_initialize_transmit_unit: begin"); @@ -3416,6 +3506,15 @@ E1000_READ_REG(&adapter->hw, E1000_TDLEN(i))); txr->busy = EM_TX_IDLE; + txdctl = 0; /* clear txdctl */ + txdctl |= 0x1f; /* PTHRESH */ + txdctl |= 1 << 8; /* HTHRESH */ + txdctl |= 1 << 16;/* WTHRESH */ + txdctl |= 1 << 22; /* Reserved bit 22 must always be 1 */ + txdctl |= E1000_TXDCTL_GRAN; + txdctl |= 1 << 25; /* LWTHRESH */ + + E1000_WRITE_REG(hw, E1000_TXDCTL(i), txdctl); } /* Set the default values for the Tx Inter Packet Gap timer */ @@ -3446,15 +3545,25 @@ if ((adapter->hw.mac.type == e1000_82571) || (adapter->hw.mac.type == e1000_82572)) { tarc = E1000_READ_REG(&adapter->hw, E1000_TARC(0)); - tarc |= SPEED_MODE_BIT; + tarc |= TARC_SPEED_MODE_BIT; E1000_WRITE_REG(&adapter->hw, E1000_TARC(0), tarc); } else if (adapter->hw.mac.type == e1000_80003es2lan) { + /* errata: program both queues to unweighted RR */ tarc = E1000_READ_REG(&adapter->hw, E1000_TARC(0)); tarc |= 1; E1000_WRITE_REG(&adapter->hw, E1000_TARC(0), tarc); tarc = E1000_READ_REG(&adapter->hw, E1000_TARC(1)); tarc |= 1; E1000_WRITE_REG(&adapter->hw, E1000_TARC(1), tarc); + } else if (adapter->hw.mac.type == e1000_82574) { + tarc = E1000_READ_REG(&adapter->hw, E1000_TARC(0)); + tarc |= TARC_ERRATA_BIT; + if ( adapter->num_queues > 1) { + tarc |= (TARC_COMPENSATION_MODE | TARC_MQ_FIX); + E1000_WRITE_REG(&adapter->hw, E1000_TARC(0), tarc); + E1000_WRITE_REG(&adapter->hw, E1000_TARC(1), tarc); + } else + E1000_WRITE_REG(&adapter->hw, E1000_TARC(0), tarc); } adapter->txd_cmd = E1000_TXD_CMD_IFCS; @@ -3885,8 +3994,9 @@ * TX lock which, with a single queue, guarantees * sanity. 
*/ - if (txr->tx_avail >= EM_MAX_SCATTER) + if (txr->tx_avail >= EM_MAX_SCATTER) { if_setdrvflagbits(ifp, 0, IFF_DRV_OACTIVE); + } /* Disable hang detection if all clean */ if (txr->tx_avail == adapter->num_tx_desc) @@ -4258,6 +4368,9 @@ E1000_WRITE_REG(&adapter->hw, E1000_RADV, adapter->rx_abs_int_delay.value); + + E1000_WRITE_REG(&adapter->hw, E1000_RDTR, + adapter->rx_int_delay.value); /* * Set the interrupt throttling rate. Value is calculated * as DEFAULT_ITR = 1/(MAX_INTS_PER_SEC * 256ns) @@ -4269,20 +4382,65 @@ ** using the EITR register (82574 only) */ if (hw->mac.type == e1000_82574) { + u32 rfctl; for (int i = 0; i < 4; i++) E1000_WRITE_REG(hw, E1000_EITR_82574(i), DEFAULT_ITR); /* Disable accelerated acknowledge */ - E1000_WRITE_REG(hw, E1000_RFCTL, E1000_RFCTL_ACK_DIS); + rfctl = E1000_READ_REG(hw, E1000_RFCTL); + rfctl |= E1000_RFCTL_ACK_DIS; + E1000_WRITE_REG(hw, E1000_RFCTL, rfctl); } rxcsum = E1000_READ_REG(hw, E1000_RXCSUM); - if (if_getcapenable(ifp) & IFCAP_RXCSUM) + if (if_getcapenable(ifp) & IFCAP_RXCSUM) { +#ifdef EM_MULTIQUEUE + rxcsum |= E1000_RXCSUM_TUOFL | + E1000_RXCSUM_IPOFL | + E1000_RXCSUM_PCSD; +#else rxcsum |= E1000_RXCSUM_TUOFL; - else +#endif + } else rxcsum &= ~E1000_RXCSUM_TUOFL; + E1000_WRITE_REG(hw, E1000_RXCSUM, rxcsum); +#ifdef EM_MULTIQUEUE + if (adapter->num_queues > 1) { + uint32_t rss_key[10]; + uint32_t reta; + int i; + + /* + * Configure RSS key + */ + arc4rand(rss_key, sizeof(rss_key), 0); + for (i = 0; i < 10; ++i) + E1000_WRITE_REG_ARRAY(hw,E1000_RSSRK(0), i, rss_key[i]); + + /* + * Configure RSS redirect table in following fashion: + * (hash & ring_cnt_mask) == rdr_table[(hash & rdr_table_mask)] + */ + reta = 0; + for (i = 0; i < 4; ++i) { + uint32_t q; + q = (i % adapter->num_queues) << 7; + reta |= q << (8 * i); + } + for (i = 0; i < 32; ++i) + E1000_WRITE_REG(hw, E1000_RETA(i), reta); + + E1000_WRITE_REG(hw, E1000_MRQC, E1000_MRQC_RSS_ENABLE_2Q | + E1000_MRQC_RSS_FIELD_IPV4_TCP | + E1000_MRQC_RSS_FIELD_IPV4 | + E1000_MRQC_RSS_FIELD_IPV6_TCP_EX | + E1000_MRQC_RSS_FIELD_IPV6_EX | + E1000_MRQC_RSS_FIELD_IPV6 | + E1000_MRQC_RSS_FIELD_IPV6_TCP); + } +#endif /* ** XXX TEMPORARY WORKAROUND: on some systems with 82573 ** long latencies are observed, like Lenovo X60. This @@ -4317,13 +4475,30 @@ E1000_WRITE_REG(hw, E1000_RDT(i), rdt); } - /* Set PTHRESH for improved jumbo performance */ + /* + * Set PTHRESH for improved jumbo performance + * According to 10.2.5.11 of Intel 82574 Datasheet, + * RXDCTL(1) is written whenever RXDCTL(0) is written. + * Only write to RXDCTL(1) if there is a need for different + * settings. 
+ */ if (((adapter->hw.mac.type == e1000_ich9lan) || (adapter->hw.mac.type == e1000_pch2lan) || (adapter->hw.mac.type == e1000_ich10lan)) && (if_getmtu(ifp) > ETHERMTU)) { u32 rxdctl = E1000_READ_REG(hw, E1000_RXDCTL(0)); E1000_WRITE_REG(hw, E1000_RXDCTL(0), rxdctl | 3); + } else if ((adapter->hw.mac.type == e1000_82574) && + (if_getmtu(ifp) > ETHERMTU)) { + for (int i = 0; i < adapter->num_queues; i++) { + u32 rxdctl = E1000_READ_REG(hw, E1000_RXDCTL(i)); + + rxdctl |= 0x20; /* PTHRESH */ + rxdctl |= 4 << 8; /* HTHRESH */ + rxdctl |= 4 << 16;/* WTHRESH */ + rxdctl |= 1 << 24; /* Switch to granularity */ + E1000_WRITE_REG(hw, E1000_RXDCTL(i), rxdctl); + } } if (adapter->hw.mac.type >= e1000_pch2lan) { @@ -4390,6 +4565,11 @@ EM_RX_LOCK(rxr); + /* Sync the ring */ + bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map, + BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); + + #ifdef DEV_NETMAP if (netmap_rx_irq(ifp, rxr->me, &processed)) { EM_RX_UNLOCK(rxr); @@ -4402,9 +4582,6 @@ if ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0) break; - bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map, - BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); - cur = &rxr->rx_base[i]; status = cur->status; mp = sendmp = NULL; @@ -4470,6 +4647,10 @@ rxr->fmp = rxr->lmp = NULL; } next_desc: + /* Sync the ring */ + bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map, + BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); + /* Zero out the receive descriptors status. */ cur->status = 0; ++rxdone; /* cumulative for POLL */ @@ -5291,10 +5472,10 @@ CTLFLAG_RD, &adapter->hw.fc.low_water, 0, "Flow Control Low Watermark"); - for (int i = 0; i < adapter->num_queues; i++, rxr++, txr++) { - snprintf(namebuf, QUEUE_NAME_LEN, "queue%d", i); + for (int i = 0; i < adapter->num_queues; i++, txr++, rxr++) { + snprintf(namebuf, QUEUE_NAME_LEN, "queue_tx_%d", i); queue_node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, namebuf, - CTLFLAG_RD, NULL, "Queue Name"); + CTLFLAG_RD, NULL, "TX Queue Name"); queue_list = SYSCTL_CHILDREN(queue_node); SYSCTL_ADD_PROC(ctx, queue_list, OID_AUTO, "txd_head", @@ -5313,7 +5494,12 @@ SYSCTL_ADD_ULONG(ctx, queue_list, OID_AUTO, "no_desc_avail", CTLFLAG_RD, &txr->no_desc_avail, "Queue No Descriptor Available"); - + + snprintf(namebuf, QUEUE_NAME_LEN, "queue_rx_%d", i); + queue_node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, namebuf, + CTLFLAG_RD, NULL, "RX Queue Name"); + queue_list = SYSCTL_CHILDREN(queue_node); + SYSCTL_ADD_PROC(ctx, queue_list, OID_AUTO, "rxd_head", CTLTYPE_UINT | CTLFLAG_RD, adapter, E1000_RDH(rxr->me), @@ -5747,19 +5933,86 @@ else printf("and ACTIVE\n"); - device_printf(dev, "hw tdh = %d, hw tdt = %d\n", - E1000_READ_REG(&adapter->hw, E1000_TDH(0)), - E1000_READ_REG(&adapter->hw, E1000_TDT(0))); - device_printf(dev, "hw rdh = %d, hw rdt = %d\n", - E1000_READ_REG(&adapter->hw, E1000_RDH(0)), - E1000_READ_REG(&adapter->hw, E1000_RDT(0))); - device_printf(dev, "Tx Queue Status = %d\n", txr->busy); - device_printf(dev, "TX descriptors avail = %d\n", - txr->tx_avail); - device_printf(dev, "Tx Descriptors avail failure = %ld\n", - txr->no_desc_avail); - device_printf(dev, "RX discarded packets = %ld\n", - rxr->rx_discarded); - device_printf(dev, "RX Next to Check = %d\n", rxr->next_to_check); - device_printf(dev, "RX Next to Refresh = %d\n", rxr->next_to_refresh); + for (int i = 0; i < adapter->num_queues; i++, txr++, rxr++) { + device_printf(dev, "TX Queue %d ------\n", i); + device_printf(dev, "hw tdh = %d, hw tdt = %d\n", + E1000_READ_REG(&adapter->hw, E1000_TDH(i)), + 
E1000_READ_REG(&adapter->hw, E1000_TDT(i))); + device_printf(dev, "Tx Queue Status = %d\n", txr->busy); + device_printf(dev, "TX descriptors avail = %d\n", + txr->tx_avail); + device_printf(dev, "Tx Descriptors avail failure = %ld\n", + txr->no_desc_avail); + device_printf(dev, "RX Queue %d ------\n", i); + device_printf(dev, "hw rdh = %d, hw rdt = %d\n", + E1000_READ_REG(&adapter->hw, E1000_RDH(i)), + E1000_READ_REG(&adapter->hw, E1000_RDT(i))); + device_printf(dev, "RX discarded packets = %ld\n", + rxr->rx_discarded); + device_printf(dev, "RX Next to Check = %d\n", rxr->next_to_check); + device_printf(dev, "RX Next to Refresh = %d\n", rxr->next_to_refresh); + } } + +#ifdef EM_MULTIQUEUE +/* + * 82574 only: + * Write a new value to the EEPROM increasing the number of MSIX + * vectors from 3 to 5, for proper multiqueue support. + */ +static void +em_enable_vectors_82574(struct adapter *adapter) +{ + struct e1000_hw *hw = &adapter->hw; + device_t dev = adapter->dev; + u16 edata; + + e1000_read_nvm(hw, EM_NVM_PCIE_CTRL, 1, &edata); + printf("Current cap: %#06x\n", edata); + if (((edata & EM_NVM_MSIX_N_MASK) >> EM_NVM_MSIX_N_SHIFT) != 4) { + device_printf(dev, "Writing to eeprom: increasing " + "reported MSIX vectors from 3 to 5...\n"); + edata &= ~(EM_NVM_MSIX_N_MASK); + edata |= 4 << EM_NVM_MSIX_N_SHIFT; + e1000_write_nvm(hw, EM_NVM_PCIE_CTRL, 1, &edata); + e1000_update_nvm_checksum(hw); + device_printf(dev, "Writing to eeprom: done\n"); + } +} +#endif + +#ifdef DDB +DB_COMMAND(em_reset_dev, em_ddb_reset_dev) +{ + devclass_t dc; + int max_em; + + dc = devclass_find("em"); + max_em = devclass_get_maxunit(dc); + + for (int index = 0; index < (max_em - 1); index++) { + device_t dev; + dev = devclass_get_device(dc, index); + if (device_get_driver(dev) == &em_driver) { + struct adapter *adapter = device_get_softc(dev); + em_init_locked(adapter); + } + } +} +DB_COMMAND(em_dump_queue, em_ddb_dump_queue) +{ + devclass_t dc; + int max_em; + + dc = devclass_find("em"); + max_em = devclass_get_maxunit(dc); + + for (int index = 0; index < (max_em - 1); index++) { + device_t dev; + dev = devclass_get_device(dc, index); + if (device_get_driver(dev) == &em_driver) + em_print_debug_info(device_get_softc(dev)); + } + +} +#endif Index: sys/dev/netmap/if_em_netmap.h =================================================================== --- sys/dev/netmap/if_em_netmap.h +++ sys/dev/netmap/if_em_netmap.h @@ -70,7 +70,7 @@ struct rx_ring *rxr = adapter->rx_rings; int i; - for (i = 0; i < adapter->num_queues; i++) { + for (i = 0; i < adapter->num_queues; i++, txr++, rxr++) { taskqueue_unblock(txr->tq); taskqueue_unblock(rxr->tq); } Index: sys/modules/em/Makefile =================================================================== --- sys/modules/em/Makefile +++ sys/modules/em/Makefile @@ -2,7 +2,7 @@ .PATH: ${.CURDIR}/../../dev/e1000 KMOD = if_em -SRCS = device_if.h bus_if.h pci_if.h opt_inet.h opt_inet6.h +SRCS = device_if.h bus_if.h pci_if.h opt_ddb.h opt_inet.h opt_inet6.h SRCS += $(CORE_SRC) $(LEGACY_SRC) SRCS += $(COMMON_SHARED) $(LEGACY_SHARED) $(PCIE_SHARED) CORE_SRC = if_em.c e1000_osdep.c
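
A minimal usage sketch for the feature added by this patch, not taken from the patch itself; the kernel configuration name MYKERNEL and the interface em0 below are hypothetical. The multiqueue code paths are only compiled in when the kernel is built with the new option:

    # MYKERNEL -- custom kernel configuration (hypothetical name)
    include         GENERIC
    ident           MYKERNEL
    options         EM_MULTIQUEUE   # optional em(4) multiqueue support

With such a kernel, a second queue pair on an 82574 adapter can then be requested at boot time through the tunable introduced above, set in loader.conf(5):

    hw.em.num_queues="2"

If fewer than five MSI-X vectors are available the driver falls back to a single queue, and on non-82574 adapters the tunable has no effect. Once the driver attaches with two queues, the per-queue statistics should appear under the renamed sysctl nodes, e.g. dev.em.0.queue_tx_0 and dev.em.0.queue_rx_1 for interface em0.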