Index: sys/dev/ixgbe/if_ix.c
===================================================================
--- sys/dev/ixgbe/if_ix.c
+++ sys/dev/ixgbe/if_ix.c
@@ -349,7 +349,7 @@
  * is varied over time based on the
  * traffic for that interrupt vector
  */
-static int ixgbe_enable_aim = false;
+static int ixgbe_enable_aim = 1;
 SYSCTL_INT(_hw_ix, OID_AUTO, enable_aim, CTLFLAG_RWTUN, &ixgbe_enable_aim, 0,
     "Enable adaptive interrupt moderation");
@@ -1671,8 +1671,8 @@
         queue_list = SYSCTL_CHILDREN(queue_node);
 
         SYSCTL_ADD_PROC(ctx, queue_list, OID_AUTO, "interrupt_rate",
-            CTLTYPE_UINT | CTLFLAG_RW,
-            &sc->rx_queues[i], 0,
+            CTLTYPE_UINT | CTLFLAG_RD,
+            rx_que, false,
             ixgbe_sysctl_interrupt_rate_handler, "IU", "Interrupt Rate");
         SYSCTL_ADD_UQUAD(ctx, queue_list, OID_AUTO, "irqs",
@@ -2178,12 +2178,201 @@
         return (error);
 } /* ixgbe_if_msix_intr_assign */
 
+static inline void
+ixgbe_neweitr(struct ixgbe_softc *sc, struct ix_rx_queue *que)
+{
+#define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d))
+#define IXGBE_ITR_ADAPTIVE_MIN_INC   2
+#define IXGBE_ITR_ADAPTIVE_MIN_USECS 10
+#define IXGBE_ITR_ADAPTIVE_MAX_USECS 126
+#define IXGBE_ITR_ADAPTIVE_LATENCY   0x80
+#define IXGBE_ITR_ADAPTIVE_BULK      0x00
+
+        struct rx_ring *rxr = &que->rxr;
+        struct tx_ring *txr = &sc->tx_queues[que->msix].txr;
+        uint32_t newitr = IXGBE_ITR_ADAPTIVE_MIN_USECS |
+            IXGBE_ITR_ADAPTIVE_LATENCY;
+        /* Initialize so an idle ring cannot feed garbage into the math. */
+        uint32_t bytes_packets = 0, packets = 0, bytes = 0;
+
+        /* We have no packets to actually measure against. This means
+         * we are a Tx queue doing TSO with too high of an interrupt rate.
+         *
+         * When this occurs just tick up our delay by the minimum value
+         * and hope that this extra delay will prevent us from being called
+         * again without any work on our queue.
+         */
+        if (!txr->packets && !rxr->packets && txr->bytes) {
+                newitr = (que->eitr_setting >> 2) +
+                    IXGBE_ITR_ADAPTIVE_MIN_INC;
+                if (newitr > IXGBE_ITR_ADAPTIVE_MAX_USECS)
+                        newitr = IXGBE_ITR_ADAPTIVE_MAX_USECS;
+                goto ixgbe_set_eitr;
+        }
+
+        /* Get the largest values from the associated tx and rx ring */
+        if (txr->bytes && txr->packets) {
+                bytes = txr->bytes;
+                bytes_packets = txr->bytes / txr->packets;
+                packets = txr->packets;
+        }
+        if (rxr->bytes && rxr->packets) {
+                bytes = max(bytes, rxr->bytes);
+                bytes_packets = max(bytes_packets, rxr->bytes / rxr->packets);
+                packets = max(packets, rxr->packets);
+        }
+
+        /* If packets are less than 4 or bytes are less than 9000 assume
+         * insufficient data to use bulk rate limiting approach. We are
+         * likely latency driven.
+         */
+        if (packets < 4 && bytes < 9000) {
+                newitr = IXGBE_ITR_ADAPTIVE_LATENCY;
+                goto ixgbe_adjust_by_size;
+        }
+
+        /* Between 4 and 48 we can assume that our current interrupt delay
+         * is only slightly too low. As such we should increase it by a small
+         * fixed amount.
+         */
+        if (packets < 48) {
+                newitr = (que->eitr_setting >> 2) +
+                    IXGBE_ITR_ADAPTIVE_MIN_INC;
+                if (newitr > IXGBE_ITR_ADAPTIVE_MAX_USECS)
+                        newitr = IXGBE_ITR_ADAPTIVE_MAX_USECS;
+                goto ixgbe_set_eitr;
+        }
+
+        /* Between 48 and 96 is our "goldilocks" zone where we are working
+         * out "just right". Just report that our current ITR is good for us.
+         */
+        if (packets < 96) {
+                newitr = que->eitr_setting >> 2;
+                goto ixgbe_set_eitr;
+        }
+
+        /* If packet count is 96 or greater we are likely looking at a slight
+         * overrun of the delay we want. Try halving our delay to see if that
+         * will cut the number of packets in half per interrupt.
+         */
+        if (packets < 256) {
+                newitr = que->eitr_setting >> 3;
+                if (newitr < IXGBE_ITR_ADAPTIVE_MIN_USECS)
+                        newitr = IXGBE_ITR_ADAPTIVE_MIN_USECS;
+                goto ixgbe_set_eitr;
+        }
+
+        /* The paths below assume we are dealing with a bulk ITR since number
+         * of packets is 256 or greater. We are just going to have to compute
+         * a value and try to bring the count under control, though for smaller
+         * packet sizes there isn't much we can do as NAPI polling will likely
+         * be kicking in sooner rather than later.
+         */
+        newitr = IXGBE_ITR_ADAPTIVE_BULK;
+
+ixgbe_adjust_by_size:
+        /* If packet counts are 256 or greater we can assume we have a gross
+         * overestimation of what the rate should be. Instead of trying to
+         * fine-tune it, just use the formula below to dial in an exact value
+         * given the current packet size of the frame.
+         */
+
+        /* The following is a crude approximation of:
+         *  wmem_default / (size + overhead) = desired_pkts_per_int
+         *  rate / bits_per_byte / (size + ethernet overhead) = pkt_rate
+         *  (desired_pkt_rate / pkt_rate) * usecs_per_sec = ITR value
+         *
+         * Assuming wmem_default is 212992 and overhead is 640 bytes per
+         * packet, (256 skb, 64 headroom, 320 shared info), we can reduce the
+         * formula down to
+         *
+         *  (170 * (size + 24)) / (size + 640) = ITR
+         *
+         * We first do some math on the packet size and then finally bitshift
+         * by 8 after rounding up. We also have to account for PCIe link speed
+         * difference as ITR scales based on this.
+         */
+        if (bytes_packets <= 60) {
+                /* Start at 50k ints/sec */
+                bytes_packets = 5120;
+        } else if (bytes_packets <= 316) {
+                /* 50K ints/sec to 16K ints/sec */
+                bytes_packets *= 40;
+                bytes_packets += 2720;
+        } else if (bytes_packets <= 1084) {
+                /* 16K ints/sec to 9.2K ints/sec */
+                bytes_packets *= 15;
+                bytes_packets += 11452;
+        } else if (bytes_packets < 1968) {
+                /* 9.2K ints/sec to 8K ints/sec */
+                bytes_packets *= 5;
+                bytes_packets += 22420;
+        } else {
+                /* plateau at a limit of 8K ints/sec */
+                bytes_packets = 32256;
+        }
+
+        /* If we are in low latency mode, halve our delay, which doubles the
+         * rate to somewhere between 100K and 16K ints/sec.
+         */
+        if (newitr & IXGBE_ITR_ADAPTIVE_LATENCY)
+                bytes_packets >>= 1;
+
+        /* Resultant value is 256 times larger than it needs to be. This
+         * gives us room to adjust the value as needed to either increase
+         * or decrease the value based on link speeds of 10G, 2.5G, 1G, etc.
+         *
+         * Use addition as we have already recorded the new latency flag
+         * for the ITR value.
+         */
+        switch (sc->link_speed) {
+        case IXGBE_LINK_SPEED_10GB_FULL:
+        case IXGBE_LINK_SPEED_5GB_FULL:
+        case IXGBE_LINK_SPEED_2_5GB_FULL:
+        case IXGBE_LINK_SPEED_1GB_FULL:
+                newitr += DIV_ROUND_UP(bytes_packets,
+                    IXGBE_ITR_ADAPTIVE_MIN_INC * 256) *
+                    IXGBE_ITR_ADAPTIVE_MIN_INC;
+                break;
+        case IXGBE_LINK_SPEED_100_FULL:
+        case IXGBE_LINK_SPEED_10_FULL:
+        default:
+                if (bytes_packets > 8064)
+                        bytes_packets = 8064;
+                newitr += DIV_ROUND_UP(bytes_packets,
+                    IXGBE_ITR_ADAPTIVE_MIN_INC * 64) *
+                    IXGBE_ITR_ADAPTIVE_MIN_INC;
+                break;
+        }
+
+ixgbe_set_eitr:
+        /* Clear latency flag if set, shift into correct position */
+        newitr &= ~IXGBE_ITR_ADAPTIVE_LATENCY;
+        newitr <<= 2;
+
+        if (newitr != que->eitr_setting) {
+                que->eitr_setting = newitr;
+
+                /* 82598 mirrors the interval into the upper half of EITR;
+                 * newer MACs need IXGBE_EITR_CNT_WDIS so the write does not
+                 * disturb the interrupt counter. Neither belongs in the saved
+                 * eitr_setting, so OR them in only for the register write.
+                 */
+                if (sc->hw.mac.type == ixgbe_mac_82598EB)
+                        newitr |= newitr << 16;
+                else
+                        newitr |= IXGBE_EITR_CNT_WDIS;
+
+                IXGBE_WRITE_REG(&sc->hw, IXGBE_EITR(que->msix), newitr);
+        }
+
+        /* Reset state */
+        txr->bytes = 0;
+        txr->packets = 0;
+        rxr->bytes = 0;
+        rxr->packets = 0;
+}
+
 static inline void
 ixgbe_perform_aim(struct ixgbe_softc *sc, struct ix_rx_queue *que)
 {
         uint32_t newitr = 0;
         struct rx_ring *rxr = &que->rxr;
-        /* FIXME struct tx_ring *txr = ... ->txr; */
+        struct tx_ring *txr = &sc->tx_queues[que->msix].txr;
 
         /*
          * Do Adaptive Interrupt Moderation:
@@ -2197,20 +2386,15 @@
         }
         que->eitr_setting = 0;
+        /* Idle, do nothing */
-        if (rxr->bytes == 0) {
-                /* FIXME && txr->bytes == 0 */
+        if (txr->bytes == 0 && rxr->bytes == 0)
                 return;
-        }
 
+        if ((txr->bytes) && (txr->packets))
+                newitr = txr->bytes / txr->packets;
         if ((rxr->bytes) && (rxr->packets))
-                newitr = rxr->bytes / rxr->packets;
-        /* FIXME for transmit accounting
-         * if ((txr->bytes) && (txr->packets))
-         *      newitr = txr->bytes/txr->packets;
-         * if ((rxr->bytes) && (rxr->packets))
-         *      newitr = max(newitr, (rxr->bytes / rxr->packets));
-         */
+                newitr = max(newitr, rxr->bytes / rxr->packets);
         newitr += 24; /* account for hardware frame, crc */
         /* set an upper boundary */
@@ -2223,18 +2407,17 @@
                 newitr = (newitr / 2);
         }
 
-        if (sc->hw.mac.type == ixgbe_mac_82598EB) {
+        if (sc->hw.mac.type == ixgbe_mac_82598EB)
                 newitr |= newitr << 16;
-        } else {
+        else
                 newitr |= IXGBE_EITR_CNT_WDIS;
-        }
 
         /* save for next interrupt */
         que->eitr_setting = newitr;
 
         /* Reset state */
-        /* FIXME txr->bytes = 0; */
-        /* FIXME txr->packets = 0; */
+        txr->bytes = 0;
+        txr->packets = 0;
         rxr->bytes = 0;
         rxr->packets = 0;
 
@@ -2251,6 +2434,8 @@
         struct ixgbe_softc *sc = que->sc;
         if_t ifp = iflib_get_ifp(que->sc->ctx);
 
+        uint32_t newitr = 0;
+
         /* Protect against spurious interrupts */
         if ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0)
                 return (FILTER_HANDLED);
@@ -2259,8 +2444,17 @@
         ++que->irqs;
 
         /* Check for AIM */
-        if (sc->enable_aim) {
+        if (sc->enable_aim == 1) {
+                ixgbe_neweitr(sc, que);
+        } else if (sc->enable_aim == 2) {
                 ixgbe_perform_aim(sc, que);
+        } else if (ixgbe_max_interrupt_rate > 0) {
+                /* AIM disabled: program the static rate. The check above
+                 * also keeps a zero tunable from causing a divide by zero.
+                 */
+                newitr = IXGBE_INTS_TO_EITR(ixgbe_max_interrupt_rate);
+                if (sc->hw.mac.type == ixgbe_mac_82598EB)
+                        newitr |= newitr << 16;
+                else
+                        newitr |= IXGBE_EITR_CNT_WDIS;
+                IXGBE_WRITE_REG(&sc->hw, IXGBE_EITR(que->msix), newitr);
+        }
 
         return (FILTER_SCHEDULE_THREAD);
@@ -2688,31 +2882,35 @@
 static int
 ixgbe_sysctl_interrupt_rate_handler(SYSCTL_HANDLER_ARGS)
 {
-        struct ix_rx_queue *que = ((struct ix_rx_queue *)oidp->oid_arg1);
+        struct ix_rx_queue *rque;
+        struct ix_tx_queue *tque;
         int error;
         unsigned int reg, usec, rate;
 
-        if (atomic_load_acq_int(&que->sc->recovery_mode))
-                return (EPERM);
+        bool tx = oidp->oid_arg2;
 
-        reg = IXGBE_READ_REG(&que->sc->hw,
-            IXGBE_EITR(que->msix));
-        usec = ((reg & 0x0FF8) >> 3);
+        if (tx) {
+                tque = oidp->oid_arg1;
+                if (atomic_load_acq_int(&tque->sc->recovery_mode))
+                        return (EPERM);
+                reg = IXGBE_READ_REG(&tque->sc->hw, IXGBE_EITR(tque->txr.me));
+        } else {
+                rque = oidp->oid_arg1;
+                if (atomic_load_acq_int(&rque->sc->recovery_mode))
+                        return (EPERM);
+                reg = IXGBE_READ_REG(&rque->sc->hw, IXGBE_EITR(rque->msix));
+        }
+
+        usec = (reg & IXGBE_QVECTOR_MASK) >> 3;
         if (usec > 0)
                 rate = 500000 / usec;
         else
                 rate = 0;
+
         error = sysctl_handle_int(oidp, &rate, 0, req);
         if (error || !req->newptr)
                 return error;
-        reg &= ~0xfff; /* default, no limitation */
-        ixgbe_max_interrupt_rate = 0;
-        if (rate > 0 && rate < 500000) {
-                if (rate < 1000)
-                        rate = 1000;
-                ixgbe_max_interrupt_rate = rate;
-                reg |= ((4000000/rate) & 0xff8);
-        }
-        IXGBE_WRITE_REG(&que->sc->hw, IXGBE_EITR(que->msix), reg);
         return (0);
 } /* ixgbe_sysctl_interrupt_rate_handler */
@@ -3314,7 +3512,7 @@
         u32 newitr;
 
         if (ixgbe_max_interrupt_rate > 0)
-                newitr = (4000000 / ixgbe_max_interrupt_rate) & 0x0FF8;
+                newitr = IXGBE_INTS_TO_EITR(ixgbe_max_interrupt_rate);
         else {
                 /*
                  * Disable DMA coalescing if interrupt moderation is
@@ -3324,6 +3522,11 @@
                 newitr = 0;
         }
 
+        if (sc->hw.mac.type == ixgbe_mac_82598EB)
+                newitr |= newitr << 16;
+        else
+                newitr |= IXGBE_EITR_CNT_WDIS;
+
         for (int i = 0; i < sc->num_rx_queues; i++, rx_que++) {
                 struct rx_ring *rxr = &rx_que->rxr;
Index: sys/dev/ixgbe/ix_txrx.c
===================================================================
--- sys/dev/ixgbe/ix_txrx.c
+++ sys/dev/ixgbe/ix_txrx.c
@@ -235,6 +235,7 @@
         }
 
         txd->read.cmd_type_len |= htole32(IXGBE_TXD_CMD_EOP | flags);
+        ++txr->packets;
         txr->bytes += pi->ipi_len;
         pi->ipi_new_pidx = i;
Index: sys/dev/ixgbe/ixgbe.h
===================================================================
--- sys/dev/ixgbe/ixgbe.h
+++ sys/dev/ixgbe/ixgbe.h
@@ -204,6 +204,11 @@
 #define DEVMETHOD_END { NULL, NULL }
 #endif
 
+#define IXGBE_EITR_DIVIDEND     4000000
+#define IXGBE_QVECTOR_MASK      0x0FF8
+#define IXGBE_INTS_TO_EITR(i)   ((IXGBE_EITR_DIVIDEND / (i)) & \
+                                    IXGBE_QVECTOR_MASK)
+
 /*
  * Interrupt Moderation parameters
  */
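
The piecewise size-to-ITR mapping in ixgbe_neweitr() can be sanity-checked outside the kernel. The standalone sketch below is not part of the patch: the constants are copied from the hunk above, and the helper name itr_usecs() is made up for illustration. It prints the interval and the approximate interrupt rate the bulk path would pick for a few average packet sizes on a 1 Gb/s or faster link.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Constants copied from the ixgbe_neweitr() hunk above. */
#define DIV_ROUND_UP(n, d)              (((n) + (d) - 1) / (d))
#define IXGBE_ITR_ADAPTIVE_MIN_INC      2

/*
 * Map an average wire size (bytes per packet) to the EITR interval in
 * microseconds the way the bulk path of ixgbe_neweitr() does for links
 * of 1 Gb/s and above.  "latency" mirrors IXGBE_ITR_ADAPTIVE_LATENCY.
 */
static uint32_t
itr_usecs(uint32_t avg_size, int latency)
{
        uint32_t scaled;

        if (avg_size <= 60)
                scaled = 5120;                  /* ~50K ints/sec floor */
        else if (avg_size <= 316)
                scaled = avg_size * 40 + 2720;  /* 50K..16K ints/sec */
        else if (avg_size <= 1084)
                scaled = avg_size * 15 + 11452; /* 16K..9.2K ints/sec */
        else if (avg_size < 1968)
                scaled = avg_size * 5 + 22420;  /* 9.2K..8K ints/sec */
        else
                scaled = 32256;                 /* ~8K ints/sec plateau */

        if (latency)
                scaled >>= 1;   /* latency mode doubles the rate */

        /* Value is 256x too large; round up to a MIN_INC multiple. */
        return (DIV_ROUND_UP(scaled, IXGBE_ITR_ADAPTIVE_MIN_INC * 256) *
            IXGBE_ITR_ADAPTIVE_MIN_INC);
}

int
main(void)
{
        const uint32_t sizes[] = { 60, 316, 1084, 1500, 9000 };

        for (size_t i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++) {
                uint32_t us = itr_usecs(sizes[i], 0);

                printf("%5u bytes/pkt -> %3u us (~%u ints/sec)\n",
                    sizes[i], us, us ? 1000000 / us : 0);
        }
        return (0);
}

For 60, 316, 1500, and 9000 bytes per packet this yields roughly 20, 60, 118, and 126 us, i.e. about 50K, 16.7K, 8.5K, and 8K interrupts per second, matching the comments in the patch. With the patch applied, the moderation mode is chosen per interrupt from sc->enable_aim: 1 selects ixgbe_neweitr(), 2 selects the older ixgbe_perform_aim() heuristic, and any other value programs the static ixgbe_max_interrupt_rate via IXGBE_INTS_TO_EITR().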