diff --git a/sys/dev/gve/gve.h b/sys/dev/gve/gve.h --- a/sys/dev/gve/gve.h +++ b/sys/dev/gve/gve.h @@ -47,6 +47,21 @@ #define GVE_TX_MAX_DESCS 4 #define GVE_TX_BUFRING_ENTRIES 4096 +#define GVE_TX_TIMEOUT_PKT_SEC 5 +#define GVE_TX_TIMEOUT_CHECK_CADENCE_SEC 5 +/* + * If the driver finds timed out packets on a tx queue it first kicks it and + * records the time. If the driver again finds a timeout on the same queue + * before the end of the cooldown period, only then will it reset. Thus, for a + * reset to be able to occur at all, the cooldown must be at least as long + * as the tx timeout checking cadence multiplied by the number of queues. + */ +#define GVE_TX_TIMEOUT_MAX_TX_QUEUES 16 +#define GVE_TX_TIMEOUT_KICK_COOLDOWN_SEC \ + (2 * GVE_TX_TIMEOUT_CHECK_CADENCE_SEC * GVE_TX_TIMEOUT_MAX_TX_QUEUES) + +#define GVE_TIMESTAMP_INVALID -1 + #define ADMINQ_SIZE PAGE_SIZE #define GVE_DEFAULT_RX_BUFFER_SIZE 2048 @@ -337,6 +352,14 @@ struct gve_tx_buffer_state { struct mbuf *mbuf; + + /* + * Time at which the xmit tq places descriptors for mbuf's payload on a + * tx queue. This timestamp is invalidated when the mbuf is freed and + * must be checked for validity when read. + */ + int64_t enqueue_time_sec; + struct gve_tx_iovec iov[GVE_TX_MAX_DESCS]; }; @@ -357,12 +380,21 @@ counter_u64_t tx_mbuf_defrag_err; counter_u64_t tx_mbuf_dmamap_enomem_err; counter_u64_t tx_mbuf_dmamap_err; + counter_u64_t tx_timeout; }; #define NUM_TX_STATS (sizeof(struct gve_txq_stats) / sizeof(counter_u64_t)) struct gve_tx_pending_pkt_dqo { struct mbuf *mbuf; + + /* + * Time at which the xmit tq places descriptors for mbuf's payload on a + * tx queue. This timestamp is invalidated when the mbuf is freed and + * must be checked for validity when read. 
+ */ + int64_t enqueue_time_sec; + union { /* RDA */ bus_dmamap_t dmamap; @@ -396,6 +428,8 @@ uint32_t req; /* free-running total number of packets written to the nic */ uint32_t done; /* free-running total number of completed packets */ + int64_t last_kicked; /* always-valid timestamp in seconds for the last queue kick */ + union { /* GQI specific stuff */ struct { @@ -594,6 +628,11 @@ struct gve_state_flags state_flags; struct sx gve_iface_lock; + + struct callout tx_timeout_service; + /* The index of tx queue that the timer service will check on its next invocation */ + uint16_t check_tx_queue_idx; + }; static inline bool @@ -652,6 +691,7 @@ void gve_free_tx_rings(struct gve_priv *priv, uint16_t start_idx, uint16_t stop_idx); int gve_create_tx_rings(struct gve_priv *priv); int gve_destroy_tx_rings(struct gve_priv *priv); +int gve_check_tx_timeout_gqi(struct gve_priv *priv, struct gve_tx_ring *tx); int gve_tx_intr(void *arg); int gve_xmit_ifp(if_t ifp, struct mbuf *mbuf); void gve_qflush(if_t ifp); @@ -662,6 +702,7 @@ int gve_tx_alloc_ring_dqo(struct gve_priv *priv, int i); void gve_tx_free_ring_dqo(struct gve_priv *priv, int i); void gve_clear_tx_ring_dqo(struct gve_priv *priv, int i); +int gve_check_tx_timeout_dqo(struct gve_priv *priv, struct gve_tx_ring *tx); int gve_tx_intr_dqo(void *arg); int gve_xmit_dqo(struct gve_tx_ring *tx, struct mbuf **mbuf_ptr); int gve_xmit_dqo_qpl(struct gve_tx_ring *tx, struct mbuf *mbuf); @@ -697,6 +738,12 @@ void gve_unmask_all_queue_irqs(struct gve_priv *priv); void gve_mask_all_queue_irqs(struct gve_priv *priv); +/* Miscellaneous functions defined in gve_utils.c */ +void gve_invalidate_timestamp(int64_t *timestamp_sec); +int64_t gve_seconds_since(int64_t *timestamp_sec); +void gve_set_timestamp(int64_t *timestamp_sec); +bool gve_timestamp_valid(int64_t *timestamp_sec); + /* Systcl functions defined in gve_sysctl.c */ extern bool gve_disable_hw_lro; extern char gve_queue_format[8]; diff --git a/sys/dev/gve/gve_main.c 
b/sys/dev/gve/gve_main.c --- a/sys/dev/gve/gve_main.c +++ b/sys/dev/gve/gve_main.c @@ -32,10 +32,10 @@ #include "gve_adminq.h" #include "gve_dqo.h" -#define GVE_DRIVER_VERSION "GVE-FBSD-1.3.3\n" +#define GVE_DRIVER_VERSION "GVE-FBSD-1.3.4\n" #define GVE_VERSION_MAJOR 1 #define GVE_VERSION_MINOR 3 -#define GVE_VERSION_SUB 3 +#define GVE_VERSION_SUB 4 #define GVE_DEFAULT_RX_COPYBREAK 256 @@ -50,6 +50,9 @@ struct sx gve_global_lock; +static void gve_start_tx_timeout_service(struct gve_priv *priv); +static void gve_stop_tx_timeout_service(struct gve_priv *priv); + static int gve_verify_driver_compatibility(struct gve_priv *priv) { @@ -99,6 +102,73 @@ return (err); } +static void +gve_handle_tx_timeout(struct gve_priv *priv, struct gve_tx_ring *tx, + int num_timeout_pkts) +{ + int64_t time_since_last_kick; + + counter_u64_add(tx->stats.tx_timeout, 1); + + /* last_kicked is never GVE_TIMESTAMP_INVALID so we can skip checking */ + time_since_last_kick = gve_seconds_since(&tx->last_kicked); + + /* Try kicking first in case the timeout is due to a missed interrupt */ + if (time_since_last_kick > GVE_TX_TIMEOUT_KICK_COOLDOWN_SEC) { + device_printf(priv->dev, + "Found %d timed out packet(s) on txq%d, kicking it for completions\n", + num_timeout_pkts, tx->com.id); + gve_set_timestamp(&tx->last_kicked); + taskqueue_enqueue(tx->com.cleanup_tq, &tx->com.cleanup_task); + } else { + device_printf(priv->dev, + "Found %d timed out packet(s) on txq%d with its last kick %jd sec ago which is less than the cooldown period %d. Resetting device\n", + num_timeout_pkts, tx->com.id, (intmax_t)time_since_last_kick, + GVE_TX_TIMEOUT_KICK_COOLDOWN_SEC); + gve_schedule_reset(priv); + } +} + +static void +gve_tx_timeout_service_callback(void *data) +{ + struct gve_priv *priv = (struct gve_priv *)data; + struct gve_tx_ring *tx; + uint16_t num_timeout_pkts; + + tx = &priv->tx[priv->check_tx_queue_idx]; + + num_timeout_pkts = gve_is_gqi(priv) + ? 
gve_check_tx_timeout_gqi(priv, tx) : gve_check_tx_timeout_dqo(priv, tx); + if (num_timeout_pkts) + gve_handle_tx_timeout(priv, tx, num_timeout_pkts); + + priv->check_tx_queue_idx = (priv->check_tx_queue_idx + 1) + % priv->tx_cfg.num_queues; + callout_reset_sbt(&priv->tx_timeout_service, SBT_1S * + GVE_TX_TIMEOUT_CHECK_CADENCE_SEC, + SBT_1S * GVE_TX_TIMEOUT_CHECK_CADENCE_SEC, + gve_tx_timeout_service_callback, (void *)priv, 0); +} + +static void +gve_start_tx_timeout_service(struct gve_priv *priv) +{ + priv->check_tx_queue_idx = 0; + callout_init(&priv->tx_timeout_service, true); + callout_reset_sbt(&priv->tx_timeout_service, SBT_1S * + GVE_TX_TIMEOUT_CHECK_CADENCE_SEC, + SBT_1S * GVE_TX_TIMEOUT_CHECK_CADENCE_SEC, + gve_tx_timeout_service_callback, (void *)priv, 0); +} + +static void +gve_stop_tx_timeout_service(struct gve_priv *priv) +{ + callout_drain(&priv->tx_timeout_service); +} + static int gve_up(struct gve_priv *priv) { @@ -149,6 +219,9 @@ gve_unmask_all_queue_irqs(priv); gve_set_state_flag(priv, GVE_STATE_FLAG_QUEUES_UP); priv->interface_up_cnt++; + + gve_start_tx_timeout_service(priv); + return (0); reset: @@ -164,6 +237,8 @@ if (!gve_get_state_flag(priv, GVE_STATE_FLAG_QUEUES_UP)) return; + gve_stop_tx_timeout_service(priv); + if (gve_get_state_flag(priv, GVE_STATE_FLAG_LINK_UP)) { if_link_state_change(priv->ifp, LINK_STATE_DOWN); gve_clear_state_flag(priv, GVE_STATE_FLAG_LINK_UP); diff --git a/sys/dev/gve/gve_sysctl.c b/sys/dev/gve/gve_sysctl.c --- a/sys/dev/gve/gve_sysctl.c +++ b/sys/dev/gve/gve_sysctl.c @@ -168,7 +168,7 @@ &stats->tx_delayed_pkt_tsoerr, "TSO packets delayed due to err in prep errors"); SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, - "tx_mbuf_collpase", CTLFLAG_RD, + "tx_mbuf_collapse", CTLFLAG_RD, &stats->tx_mbuf_collapse, "tx mbufs that had to be collapsed"); SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, @@ -187,6 +187,10 @@ "tx_mbuf_dmamap_err", CTLFLAG_RD, &stats->tx_mbuf_dmamap_err, "tx mbufs that 
could not be dma-mapped"); + SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, + "tx_timeout", CTLFLAG_RD, + &stats->tx_timeout, + "detections of timed out packets on tx queues"); } static void diff --git a/sys/dev/gve/gve_tx.c b/sys/dev/gve/gve_tx.c --- a/sys/dev/gve/gve_tx.c +++ b/sys/dev/gve/gve_tx.c @@ -173,6 +173,8 @@ } com->q_resources = com->q_resources_mem.cpu_addr; + tx->last_kicked = 0; + return (0); abort: @@ -218,6 +220,7 @@ for (i = 0; i < com->priv->tx_desc_cnt; i++) { tx->desc_ring[i] = (union gve_tx_desc){}; tx->info[i] = (struct gve_tx_buffer_state){}; + gve_invalidate_timestamp(&tx->info[i].enqueue_time_sec); } bus_dmamap_sync(tx->desc_ring_mem.tag, tx->desc_ring_mem.map, @@ -344,6 +347,30 @@ return (0); } +int +gve_check_tx_timeout_gqi(struct gve_priv *priv, struct gve_tx_ring *tx) +{ + struct gve_tx_buffer_state *info; + uint32_t pkt_idx; + int num_timeouts; + + num_timeouts = 0; + + for (pkt_idx = 0; pkt_idx < priv->tx_desc_cnt; pkt_idx++) { + info = &tx->info[pkt_idx]; + + if (!gve_timestamp_valid(&info->enqueue_time_sec)) + continue; + + if (__predict_false( + gve_seconds_since(&info->enqueue_time_sec) > + GVE_TX_TIMEOUT_PKT_SEC)) + num_timeouts += 1; + } + + return (num_timeouts); +} + int gve_tx_intr(void *arg) { @@ -396,7 +423,10 @@ if (mbuf == NULL) continue; + gve_invalidate_timestamp(&info->enqueue_time_sec); + info->mbuf = NULL; + counter_enter(); counter_u64_add_protected(tx->stats.tbytes, mbuf->m_pkthdr.len); counter_u64_add_protected(tx->stats.tpackets, 1); @@ -408,6 +438,7 @@ info->iov[i].iov_len = 0; info->iov[i].iov_padding = 0; } + } gve_tx_free_fifo(&tx->fifo, space_freed); @@ -685,6 +716,8 @@ /* So that the cleanup taskqueue can free the mbuf eventually. */ info->mbuf = mbuf; + gve_set_timestamp(&info->enqueue_time_sec); + /* * We don't want to split the header, so if necessary, pad to the end * of the fifo and then put the header at the beginning of the fifo. 
diff --git a/sys/dev/gve/gve_tx_dqo.c b/sys/dev/gve/gve_tx_dqo.c --- a/sys/dev/gve/gve_tx_dqo.c +++ b/sys/dev/gve/gve_tx_dqo.c @@ -527,6 +527,8 @@ tx->dqo.free_pending_pkts_csm = pending_pkt->next; pending_pkt->state = GVE_PACKET_STATE_PENDING_DATA_COMPL; + gve_set_timestamp(&pending_pkt->enqueue_time_sec); + return (pending_pkt); } @@ -539,6 +541,8 @@ pending_pkt->state = GVE_PACKET_STATE_FREE; + gve_invalidate_timestamp(&pending_pkt->enqueue_time_sec); + /* Add pending_pkt to the producer list */ while (true) { old_head = atomic_load_acq_32(&tx->dqo.free_pending_pkts_prd); @@ -939,6 +943,29 @@ return (pkt_len); } +int +gve_check_tx_timeout_dqo(struct gve_priv *priv, struct gve_tx_ring *tx) +{ + struct gve_tx_pending_pkt_dqo *pending_pkt; + int num_timeouts; + uint16_t pkt_idx; + + num_timeouts = 0; + for (pkt_idx = 0; pkt_idx < tx->dqo.num_pending_pkts; pkt_idx++) { + pending_pkt = &tx->dqo.pending_pkts[pkt_idx]; + + if (!gve_timestamp_valid(&pending_pkt->enqueue_time_sec)) + continue; + + if (__predict_false( + gve_seconds_since(&pending_pkt->enqueue_time_sec) > + GVE_TX_TIMEOUT_PKT_SEC)) + num_timeouts += 1; + } + + return (num_timeouts); +} + int gve_tx_intr_dqo(void *arg) { @@ -960,8 +987,11 @@ struct gve_ring_com *com = &tx->com; int i; - for (i = 0; i < com->priv->tx_desc_cnt; i++) + for (i = 0; i < com->priv->tx_desc_cnt; i++) { tx->dqo.desc_ring[i] = (union gve_tx_desc_dqo){}; + gve_invalidate_timestamp( + &tx->dqo.pending_pkts[i].enqueue_time_sec); + } bus_dmamap_sync(tx->desc_ring_mem.tag, tx->desc_ring_mem.map, BUS_DMASYNC_PREWRITE); diff --git a/sys/dev/gve/gve_utils.c b/sys/dev/gve/gve_utils.c --- a/sys/dev/gve/gve_utils.c +++ b/sys/dev/gve/gve_utils.c @@ -439,3 +439,45 @@ gve_db_bar_write_4(priv, rx->com.irq_db_offset, GVE_IRQ_MASK); } } + +/* + * In some cases, such as tracking timeout events, we must mark a timestamp as + * invalid when we do not want to consider its value. Such timestamps must be + * checked for validity before reading them. 
+ */ +inline void +gve_invalidate_timestamp(int64_t *timestamp_sec) +{ + atomic_store_64(timestamp_sec, GVE_TIMESTAMP_INVALID); +} + +/* + * Returns 0 if the timestamp is invalid, otherwise returns the elapsed seconds + * since the timestamp was set. + */ +inline int64_t +gve_seconds_since(int64_t *timestamp_sec) +{ + struct bintime curr_time; + int64_t enqueued_time; + + getbintime(&curr_time); + enqueued_time = atomic_load_64(timestamp_sec); + if (enqueued_time == GVE_TIMESTAMP_INVALID) + return (0); + return ((int64_t)(curr_time.sec - enqueued_time)); +} + +inline void +gve_set_timestamp(int64_t *timestamp_sec) +{ + struct bintime curr_time; + + getbintime(&curr_time); + atomic_store_64(timestamp_sec, curr_time.sec); +} + +inline bool +gve_timestamp_valid(int64_t *timestamp_sec) { + return (atomic_load_64(timestamp_sec) != GVE_TIMESTAMP_INVALID); +}