diff --git a/sys/dev/e1000/em_txrx.c b/sys/dev/e1000/em_txrx.c
--- a/sys/dev/e1000/em_txrx.c
+++ b/sys/dev/e1000/em_txrx.c
@@ -455,6 +455,10 @@
 	    "tx_buffers[%d]->eop = %d ipi_new_pidx=%d\n", first, pidx_last, i);
 	pi->ipi_new_pidx = i;
 
+	/* Sent data accounting for AIM */
+	txr->tx_bytes += pi->ipi_len;
+	++txr->tx_packets;
+
 	return (0);
 }
 
@@ -669,6 +673,7 @@
 
 		len = le16toh(rxd->length);
 		ri->iri_len += len;
+		rxr->rx_bytes += len;	/* per-fragment; iri_len is cumulative and would recount earlier fragments of multi-descriptor packets */
 
 		eop = (status & E1000_RXD_STAT_EOP) != 0;
 
@@ -690,6 +695,8 @@
 		i++;
 	} while (!eop);
 
+	rxr->rx_packets++;	/* Received packet accounting for AIM */
+
 	if (scctx->isc_capenable & IFCAP_RXCSUM)
 		em_receive_checksum(status, errors, ri);
 
@@ -732,6 +739,7 @@
 
 		len = le16toh(rxd->wb.upper.length);
 		ri->iri_len += len;
+		rxr->rx_bytes += len;	/* per-fragment; iri_len is cumulative and would recount earlier fragments of multi-descriptor packets */
 
 		eop = (staterr & E1000_RXD_STAT_EOP) != 0;
 
@@ -752,6 +760,8 @@
 		i++;
 	} while (!eop);
 
+	rxr->rx_packets++;	/* Received packet accounting for AIM */
+
 	if (scctx->isc_capenable & IFCAP_RXCSUM)
 		em_receive_checksum(staterr, staterr >> 24, ri);
 
diff --git a/sys/dev/e1000/if_em.h b/sys/dev/e1000/if_em.h
--- a/sys/dev/e1000/if_em.h
+++ b/sys/dev/e1000/if_em.h
@@ -248,11 +248,11 @@
 #define EM_INTS_MULTIPLIER 256
 #define EM_ITR_DIVIDEND 1000000000
 #define EM_INTS_TO_ITR(i) (EM_ITR_DIVIDEND/(i * EM_INTS_MULTIPLIER))
-#define IGB_ITR_DIVIDEND 1000000
-#define IGB_ITR_SHIFT 2
+#define IGB_EITR_DIVIDEND 1000000
+#define IGB_EITR_SHIFT 2
 #define IGB_QVECTOR_MASK 0x7FFC
-#define IGB_INTS_TO_EITR(i) (((IGB_ITR_DIVIDEND/i) & IGB_QVECTOR_MASK) << \
-    IGB_ITR_SHIFT)
+#define IGB_INTS_TO_EITR(i) (((IGB_EITR_DIVIDEND/i) & IGB_QVECTOR_MASK) << \
+    IGB_EITR_SHIFT)
 #define IGB_LINK_ITR 2000
 #define I210_LINK_DELAY 1000
 
@@ -390,7 +390,11 @@
 	/* Interrupt resources */
 	void *tag;
 	struct resource *res;
+
+	/* Soft stats */
 	unsigned long tx_irq;
+	unsigned long tx_packets;
+	unsigned long tx_bytes;
 
 	/* Saved csum offloading context information */
 	int csum_flags;
@@ -441,6 +445,7 @@
 	u32 me;
 	u32 msix;
 	u32 eims;
+	u32 eitr_setting;	/* AIM: next (E)ITR value to program on this vector */
 	struct rx_ring rxr;
 	u64 irqs;
 	struct if_irq que_irq;
@@ -489,6 +494,7 @@
 	u32
rx_mbuf_sz;
+	int enable_aim;	/* per-interface AIM mode (seeded from hw.em.enable_aim) */
 
 	/* Management and WOL features */
 	u32 wol;
 	bool has_manage;
diff --git a/sys/dev/e1000/if_em.c b/sys/dev/e1000/if_em.c
--- a/sys/dev/e1000/if_em.c
+++ b/sys/dev/e1000/if_em.c
@@ -545,6 +545,15 @@
 SYSCTL_INT(_hw_em, OID_AUTO, eee_setting, CTLFLAG_RDTUN, &eee_setting, 0,
     "Enable Energy Efficient Ethernet");
 
+/*
+ * AIM: Adaptive Interrupt Moderation
+ * which means that the interrupt rate is varied over time based on the
+ * traffic for that interrupt vector
+ */
+static int em_enable_aim = 1;
+SYSCTL_INT(_hw_em, OID_AUTO, enable_aim, CTLFLAG_RWTUN, &em_enable_aim,
+    1, "Enable adaptive interrupt moderation (1=normal, 2=lowlatency)");
+
 /*
 ** Tuneable Interrupt rate
 */
@@ -832,6 +841,10 @@
 	    CTLTYPE_INT | CTLFLAG_RW, sc, 0,
 	    em_sysctl_nvm_info, "I", "NVM Information");
 
+	SYSCTL_ADD_INT(ctx_list, child, OID_AUTO, "enable_aim",
+	    CTLFLAG_RW, &sc->enable_aim, em_enable_aim,
+	    "Interrupt Moderation (1=normal, 2=lowlatency)");
+
 	SYSCTL_ADD_PROC(ctx_list, child, OID_AUTO, "fw_version",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
 	    em_sysctl_print_fw_version, "A",
@@ -1051,6 +1064,8 @@
 		    EM_INTS_TO_ITR(em_max_interrupt_rate));
 	}
 
+	sc->enable_aim = em_enable_aim;	/* seed per-interface AIM setting from the global tunable */
+
 	hw->mac.autoneg = DO_AUTO_NEG;
 	hw->phy.autoneg_wait_to_complete = false;
 	hw->phy.autoneg_advertised = AUTONEG_ADV_DEFAULT;
@@ -1437,6 +1452,52 @@
 	}
 }
 
+/*********************************************************************
+ *
+ *  Helper to calculate next (E)ITR value for AIM
+ *
+ *********************************************************************/
+static u32
+em_newitr(struct e1000_softc *sc, struct tx_ring *txr, struct rx_ring *rxr)
+{
+	struct e1000_hw *hw = &sc->hw;
+	u32 newitr = 0;
+	u32 defaultitr;
+
+	if (hw->mac.type < igb_mac_min)
+		defaultitr = EM_INTS_TO_ITR(EM_INTS_PER_SEC);
+	else
+		defaultitr = IGB_INTS_TO_EITR(EM_INTS_PER_SEC);	/* NOTE(review): this is a register encoding, while the 1G branch below computes bytes/packet -- confirm the mixed units are intended */
+
+	/* Use half defaultitr if sub-gig */
+	if (sc->link_speed != 1000)
+		newitr = defaultitr / 2;
+	else {
+		if ((txr->tx_bytes) && (txr->tx_packets))
+
newitr = txr->tx_bytes/txr->tx_packets;	/* avg bytes per TX packet since the last interrupt */
+		if ((rxr->rx_bytes) && (rxr->rx_packets))
+			newitr = max(newitr, (rxr->rx_bytes / rxr->rx_packets));
+		newitr += 24;	/* account for hardware frame, crc */
+		/* set an upper boundary */
+		newitr = min(newitr, 3000);
+		/* Be nice to the mid range */
+		if ((newitr > 300) && (newitr < 1200))
+			newitr = (newitr / 3);	/* NOTE(review): the sysctl advertises 2=lowlatency but enable_aim==2 is never consulted here -- confirm */
+		else
+			newitr = (newitr / 2);
+	}
+
+	if (hw->mac.type >= igb_mac_min) {
+		newitr &= IGB_QVECTOR_MASK;	/* Mask invalid bits */
+		if (hw->mac.type == e1000_82575)
+			newitr |= newitr << 16;
+		else
+			newitr |= E1000_EITR_CNT_IGNR;	/* NOTE(review): callers shift this value left by IGB_EITR_SHIFT when writing EITR, which discards this bit (31) -- confirm intended register encoding */
+	}
+
+	return newitr;
+}
+
 /*********************************************************************
  *
  *  Fast Legacy/MSI Combined Interrupt Service routine
  *
@@ -1446,10 +1507,14 @@
 em_intr(void *arg)
 {
 	struct e1000_softc *sc = arg;
+	struct e1000_hw *hw = &sc->hw;
+	struct em_rx_queue *que = &sc->rx_queues[0];	/* legacy/MSI path services queue 0 */
+	struct tx_ring *txr = &sc->tx_queues[0].txr;
+	struct rx_ring *rxr = &que->rxr;
 	if_ctx_t ctx = sc->ctx;
 	u32 reg_icr;
 
-	reg_icr = E1000_READ_REG(&sc->hw, E1000_ICR);
+	reg_icr = E1000_READ_REG(hw, E1000_ICR);
 
 	/* Hot eject? */
 	if (reg_icr == 0xffffffff)
@@ -1463,7 +1528,7 @@
 	 * Starting with the 82571 chip, bit 31 should be used to
 	 * determine whether the interrupt belongs to us.
 	 */
-	if (sc->hw.mac.type >= e1000_82571 &&
+	if (hw->mac.type >= e1000_82571 &&
 	    (reg_icr & E1000_ICR_INT_ASSERTED) == 0)
 		return FILTER_STRAY;
 
@@ -1482,6 +1547,38 @@
 	if (reg_icr & E1000_ICR_RXO)
 		sc->rx_overruns++;
 
+	if (sc->enable_aim && hw->mac.type >= e1000_82540) {	/* NOTE(review): assumes the ITR register exists on every >= 82540 part -- confirm for the oldest lem devices */
+		/*
+		 * Do Adaptive Interrupt Moderation:
+		 *  - Write out last calculated setting
+		 *  - Calculate based on average size over the last interval.
+		 */
+		if (que->eitr_setting) {
+			/* igb always uses the EITR register, including non msi-x */
+			if (hw->mac.type >= igb_mac_min)
+				E1000_WRITE_REG(hw, E1000_EITR(0),
+				    que->eitr_setting << IGB_EITR_SHIFT);
+			else
+				E1000_WRITE_REG(hw, E1000_ITR, que->eitr_setting);
+		}
+
+		que->eitr_setting = 0;
+
+		/* Idle, do nothing */
+		if ((txr->tx_bytes == 0) && (rxr->rx_bytes == 0))
+			goto no_calc_intr;
+
+		/* save for next interrupt */
+		que->eitr_setting = em_newitr(sc, txr, rxr);
+
+		/* Reset state */
+		txr->tx_bytes = 0;
+		txr->tx_packets = 0;
+		rxr->rx_bytes = 0;
+		rxr->rx_packets = 0;
+	}
+
+no_calc_intr:
 	return (FILTER_SCHEDULE_THREAD);
 }
 
@@ -1534,9 +1631,46 @@
 em_msix_que(void *arg)
 {
 	struct em_rx_queue *que = arg;
+	struct e1000_softc *sc = que->sc;
+	struct e1000_hw *hw = &sc->hw;
+	struct tx_ring *txr = &sc->tx_queues[que->msix].txr;	/* NOTE(review): assumes a TX queue exists for every RX vector -- confirm when ntxq < nrxq */
+	struct rx_ring *rxr = &que->rxr;
 
 	++que->irqs;
 
+	if (sc->enable_aim) {
+		/*
+		 * Do Adaptive Interrupt Moderation:
+		 *  - Write out last calculated setting
+		 *  - Calculate based on average size over the last interval.
+		 */
+		if (que->eitr_setting) {
+			if (hw->mac.type >= igb_mac_min)
+				E1000_WRITE_REG(hw,
+				    E1000_EITR(que->msix),
+				    que->eitr_setting << IGB_EITR_SHIFT);
+			else if (hw->mac.type == e1000_82574)
+				E1000_WRITE_REG(hw,
+				    E1000_EITR_82574(que->msix), que->eitr_setting);
+		}
+
+		que->eitr_setting = 0;
+
+		/* Idle, do nothing */
+		if ((txr->tx_bytes == 0) && (rxr->rx_bytes == 0))
+			goto no_calc_msix;
+
+		/* save for next interrupt */
+		que->eitr_setting = em_newitr(sc, txr, rxr);
+
+		/* Reset state */
+		txr->tx_bytes = 0;
+		txr->tx_packets = 0;
+		rxr->rx_bytes = 0;
+		rxr->rx_packets = 0;
+	}
+
+no_calc_msix:
 	return (FILTER_SCHEDULE_THREAD);
 }
 
@@ -4410,6 +4544,57 @@
 	return (sysctl_handle_int(oidp, &val, 0, req));
 }
 
+/* Per-queue interrupt rate sysctl: decodes the live (E)ITR register to ints/sec */
+static int
+em_sysctl_interrupt_rate_handler(SYSCTL_HANDLER_ARGS)
+{
+	struct em_rx_queue *rque;
+	struct em_tx_queue *tque;
+	struct e1000_hw *hw;
+	int error;
+	u32 reg, usec, rate;
+
+	bool tx = oidp->oid_arg2;	/* arg2: true = TX queue, false = RX queue */
+
+	if (tx) {
+		tque = oidp->oid_arg1;
+		hw = &tque->sc->hw;
+		if (hw->mac.type >= igb_mac_min)
+			reg = E1000_READ_REG(hw, E1000_EITR(tque->me));
+		else if (hw->mac.type == e1000_82574 && tque->msix)
+			reg = E1000_READ_REG(hw, E1000_EITR_82574(tque->me));
+		else
+			reg = E1000_READ_REG(hw, E1000_ITR);
+	} else {
+		rque = oidp->oid_arg1;
+		hw = &rque->sc->hw;
+		if (hw->mac.type >= igb_mac_min)
+			reg = E1000_READ_REG(hw, E1000_EITR(rque->msix));
+		else if (hw->mac.type == e1000_82574 && rque->msix)
+			reg = E1000_READ_REG(hw, E1000_EITR_82574(rque->msix));
+		else
+			reg = E1000_READ_REG(hw, E1000_ITR);
+	}
+
+	if (hw->mac.type < igb_mac_min) {
+		if (reg > 0)
+			rate = EM_INTS_TO_ITR(reg);	/* ITR is in 256 ns units, so this conversion is its own inverse */
+		else
+			rate = 0;
+	} else {
+		usec = ((reg & IGB_QVECTOR_MASK) >> IGB_EITR_SHIFT);
+		if (usec > 0)
+			rate = IGB_EITR_DIVIDEND / usec;	/* was IGB_INTS_TO_EITR(usec), which yields a masked/shifted register encoding, not a rate */
+		else
+			rate = 0;
+	}
+
+	error = sysctl_handle_int(oidp, &rate, 0, req);
+	if (error || !req->newptr)
+		return error;
+	return 0;
+}
+
 /*
  * Add sysctl variables, one per statistic, to the system.
 */
@@ -4466,6 +4651,11 @@
 	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "TX Queue Name");
 	queue_list = SYSCTL_CHILDREN(queue_node);
 
+	SYSCTL_ADD_PROC(ctx, queue_list, OID_AUTO, "interrupt_rate",
+	    CTLTYPE_UINT | CTLFLAG_RD, tx_que,
+	    true, em_sysctl_interrupt_rate_handler,	/* arg2=true: report this TX queue's (E)ITR-derived rate */
+	    "IU", "Interrupt Rate");
+
 	SYSCTL_ADD_PROC(ctx, queue_list, OID_AUTO, "txd_head",
 	    CTLTYPE_UINT | CTLFLAG_RD, sc, E1000_TDH(txr->me),
 	    em_sysctl_reg_handler, "IU",
@@ -4486,6 +4676,11 @@
 	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "RX Queue Name");
 	queue_list = SYSCTL_CHILDREN(queue_node);
 
+	SYSCTL_ADD_PROC(ctx, queue_list, OID_AUTO, "interrupt_rate",
+	    CTLTYPE_UINT | CTLFLAG_RD, rx_que,
+	    false, em_sysctl_interrupt_rate_handler,	/* arg2=false: report this RX queue's (E)ITR-derived rate */
+	    "IU", "Interrupt Rate");
+
 	SYSCTL_ADD_PROC(ctx, queue_list, OID_AUTO, "rxd_head",
 	    CTLTYPE_UINT | CTLFLAG_RD, sc, E1000_RDH(rxr->me),
 	    em_sysctl_reg_handler, "IU",
diff --git a/sys/dev/e1000/igb_txrx.c b/sys/dev/e1000/igb_txrx.c
--- a/sys/dev/e1000/igb_txrx.c
+++ b/sys/dev/e1000/igb_txrx.c
@@ -292,6 +292,10 @@
 	txd->read.cmd_type_len |= htole32(E1000_TXD_CMD_EOP | txd_flags);
 	pi->ipi_new_pidx = i;
 
+	/* Sent data accounting for AIM */
+	txr->tx_bytes += pi->ipi_len;
+	++txr->tx_packets;
+
 	return (0);
 }