diff --git a/sys/dev/ena/ena.h b/sys/dev/ena/ena.h --- a/sys/dev/ena/ena.h +++ b/sys/dev/ena/ena.h @@ -325,6 +325,7 @@ }; uint8_t first_interrupt; + uint8_t cleanup_running; uint16_t no_interrupt_event_cnt; struct ena_com_rx_buf_info ena_bufs[ENA_PKT_MAX_BUFS]; @@ -582,21 +583,27 @@ } static inline void -ena_trigger_reset(struct ena_adapter *adapter, - enum ena_regs_reset_reason_types reset_reason) +ena_increment_reset_counter(struct ena_adapter *adapter) { - if (likely(!ENA_FLAG_ISSET(ENA_FLAG_TRIGGER_RESET, adapter))) { - const struct ena_reset_stats_offset *ena_reset_stats_offset = - &resets_to_stats_offset_map[reset_reason]; + enum ena_regs_reset_reason_types reset_reason = adapter->reset_reason; + const struct ena_reset_stats_offset *ena_reset_stats_offset = + &resets_to_stats_offset_map[reset_reason]; - if (ena_reset_stats_offset->has_counter) { - uint64_t *stat_ptr = (uint64_t *)&adapter->dev_stats + - ena_reset_stats_offset->stat_offset; + if (ena_reset_stats_offset->has_counter) { + uint64_t *stat_ptr = (uint64_t *)&adapter->dev_stats + + ena_reset_stats_offset->stat_offset; - counter_u64_add((counter_u64_t)(*stat_ptr), 1); - } + counter_u64_add((counter_u64_t)(*stat_ptr), 1); + } + + counter_u64_add(adapter->dev_stats.total_resets, 1); +} - counter_u64_add(adapter->dev_stats.total_resets, 1); +static inline void +ena_trigger_reset(struct ena_adapter *adapter, + enum ena_regs_reset_reason_types reset_reason) +{ + if (likely(!ENA_FLAG_ISSET(ENA_FLAG_TRIGGER_RESET, adapter))) { adapter->reset_reason = reset_reason; ENA_FLAG_SET_ATOMIC(ENA_FLAG_TRIGGER_RESET, adapter); } diff --git a/sys/dev/ena/ena.c b/sys/dev/ena/ena.c --- a/sys/dev/ena/ena.c +++ b/sys/dev/ena/ena.c @@ -169,6 +169,9 @@ static int ena_copy_srd_metrics(struct ena_adapter *); static int ena_copy_customer_metrics(struct ena_adapter *); static void ena_timer_service(void *); +static enum ena_regs_reset_reason_types check_cdesc_in_tx_cq(struct ena_adapter *, + struct ena_ring *); + static char ena_version[] = ENA_DEVICE_NAME ENA_DRV_MODULE_NAME " v" ENA_DRV_MODULE_VERSION; @@ -3088,6 +3091,31 @@ return (0); } +static enum ena_regs_reset_reason_types +check_cdesc_in_tx_cq(struct ena_adapter *adapter, + struct ena_ring *tx_ring) +{ + device_t pdev = adapter->pdev; + int rc; + u16 req_id; + + rc = ena_com_tx_comp_req_id_get(tx_ring->ena_com_io_cq, &req_id); + /* TX CQ is empty */ + if (rc == ENA_COM_TRY_AGAIN) { + ena_log(pdev, ERR, + "No completion descriptors found in CQ %d\n", + tx_ring->qid); + return ENA_REGS_RESET_MISS_TX_CMPL; + } + + /* TX CQ has cdescs */ + ena_log(pdev, ERR, + "Completion descriptors found in CQ %d", + tx_ring->qid); + + return ENA_REGS_RESET_MISS_INTERRUPT; +} + static int check_missing_comp_in_tx_queue(struct ena_adapter *adapter, struct ena_ring *tx_ring) @@ -3100,6 +3128,8 @@ int missing_tx_comp_to; sbintime_t time_offset; int i, rc = 0; + enum ena_regs_reset_reason_types reset_reason = ENA_REGS_RESET_MISS_TX_CMPL; + bool cleanup_scheduled, cleanup_running; getbinuptime(&curtime); @@ -3155,7 +3185,19 @@ "The number of lost tx completion is above the threshold " "(%d > %d). Reset the device\n", missed_tx, adapter->missing_tx_threshold); - ena_trigger_reset(adapter, ENA_REGS_RESET_MISS_TX_CMPL); + /* Set the reset flag to prevent ena_cleanup() from running */ + ENA_FLAG_SET_ATOMIC(ENA_FLAG_TRIGGER_RESET, adapter); + /* Need to make sure that ENA_FLAG_TRIGGER_RESET is visible to ena_cleanup() and + * that cleanup_running is visible to check_missing_comp_in_tx_queue() to + * prevent the case of accessing CQ concurrently with check_cdesc_in_tx_cq() + */ + mb(); + cleanup_scheduled = !!(atomic_load_16(&tx_ring->que->cleanup_task.ta_pending)); + cleanup_running = !!(atomic_load_8((&tx_ring->cleanup_running))); + if (!(cleanup_scheduled || cleanup_running)) + reset_reason = check_cdesc_in_tx_cq(adapter, tx_ring); + + adapter->reset_reason = reset_reason; rc = EIO; } /* Add the newly discovered missing TX completions */ @@ -3618,6 +3660,7 @@ ENA_LOCK_LOCK(); if (likely(ENA_FLAG_ISSET(ENA_FLAG_TRIGGER_RESET, adapter))) { + ena_increment_reset_counter(adapter); ena_destroy_device(adapter, false); ena_restore_device(adapter); diff --git a/sys/dev/ena/ena_datapath.c b/sys/dev/ena/ena_datapath.c --- a/sys/dev/ena/ena_datapath.c +++ b/sys/dev/ena/ena_datapath.c @@ -77,17 +77,24 @@ int qid, ena_qid; int txc, rxc, i; - if (unlikely((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0)) - return; - - ena_log_io(adapter->pdev, DBG, "MSI-X TX/RX routine\n"); - tx_ring = que->tx_ring; rx_ring = que->rx_ring; qid = que->id; ena_qid = ENA_IO_TXQ_IDX(qid); io_cq = &adapter->ena_dev->io_cq_queues[ena_qid]; + atomic_store_8(&tx_ring->cleanup_running, 1); + /* Need to make sure that ENA_FLAG_TRIGGER_RESET is visible to ena_cleanup() and + * that cleanup_running is visible to check_missing_comp_in_tx_queue() to + * prevent the case of accessing CQ concurrently with check_cdesc_in_tx_cq() + */ + mb(); + if (unlikely(((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0) || + (ENA_FLAG_ISSET(ENA_FLAG_TRIGGER_RESET, adapter)))) + return; + + ena_log_io(adapter->pdev, DBG, "MSI-X TX/RX routine\n"); + atomic_store_8(&tx_ring->first_interrupt, 1); atomic_store_8(&rx_ring->first_interrupt, 1); @@ -95,7 +102,8 @@ rxc = ena_rx_cleanup(rx_ring); txc = ena_tx_cleanup(tx_ring); - if (unlikely((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0)) + if (unlikely(((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0) || + (ENA_FLAG_ISSET(ENA_FLAG_TRIGGER_RESET, adapter)))) return; if ((txc != ENA_TX_BUDGET) && (rxc != ENA_RX_BUDGET)) @@ -107,6 +115,7 @@ ENA_TX_IRQ_INTERVAL, true, false); counter_u64_add(tx_ring->tx_stats.unmask_interrupt_num, 1); ena_com_unmask_intr(io_cq, &intr_reg); + atomic_store_8(&tx_ring->cleanup_running, 0); } void