ena: add sysctl-configurable CPU affinity for IO IRQs

Adds an "irq_affinity" node with "base_cpu" and "cpu_stride" leaves to the
device sysctl tree. Once a base CPU is set, the IRQ of IO queue i is bound to
CPU (base_cpu + i * cpu_stride) modulo mp_ncpus. The cpu/cpu_mask fields and
the bus_bind_intr() call are no longer compiled only under the RSS kernel
option; binding happens when RSS is enabled or a base CPU is configured.

diff --git a/sys/dev/ena/ena.h b/sys/dev/ena/ena.h
--- a/sys/dev/ena/ena.h
+++ b/sys/dev/ena/ena.h
@@ -69,6 +69,9 @@
 #define ENA_DEFAULT_RING_SIZE		1024
 #define ENA_MIN_RING_SIZE		256
 
+/* Value of irq_cpu_base when no base CPU has been configured. */
+#define ENA_BASE_CPU_UNSPECIFIED	(-1)
+
 /*
  * Refill Rx queue when number of required descriptors is above
  * QUEUE_SIZE / ENA_RX_REFILL_THRESH_DIVIDER or ENA_RX_REFILL_THRESH_PACKET
@@ -201,9 +204,7 @@
 	void *cookie;
 	unsigned int vector;
 	bool requested;
-#ifdef RSS
 	int cpu;
-#endif
 	char name[ENA_IRQNAME_SIZE];
 };
 
@@ -216,10 +217,8 @@
 	struct taskqueue *cleanup_tq;
 
 	uint32_t id;
-#ifdef RSS
 	int cpu;
 	cpuset_t cpu_mask;
-#endif
 	int domain;
 	struct sysctl_oid *oid;
 };
@@ -448,6 +447,12 @@
 
 	ena_state_t flags;
 
+	/* IRQ CPU affinity */
+	int irq_cpu_base;
+	uint32_t irq_cpu_stride;
+
+	uint8_t rss_enabled;
+
 	/* Queue will represent one TX and one RX ring */
 	struct ena_que que[ENA_MAX_NUM_IO_QUEUES]
 	    __aligned(CACHE_LINE_SIZE);
@@ -524,6 +529,8 @@
 int ena_update_queue_size(struct ena_adapter *adapter, uint32_t new_tx_size,
     uint32_t new_rx_size);
 int ena_update_io_queue_nb(struct ena_adapter *adapter, uint32_t new_num);
+int ena_update_base_cpu(struct ena_adapter *adapter, int new_num);
+int ena_update_cpu_stride(struct ena_adapter *adapter, uint32_t new_num);
 
 static inline int
 ena_mbuf_count(struct mbuf *mbuf)
diff --git a/sys/dev/ena/ena.c b/sys/dev/ena/ena.c
--- a/sys/dev/ena/ena.c
+++ b/sys/dev/ena/ena.c
@@ -1237,6 +1237,108 @@
 	ena_init_io_rings(adapter);
 }
 
+/*
+ * Set the base CPU used when assigning IO IRQs to CPUs.
+ *
+ * The device is brought down, the new value is stored and, if the device was
+ * up before the call, it is brought up again. If that fails, the previous
+ * value is restored and ena_up() is retried; if the retry fails as well, a
+ * device reset is triggered.
+ *
+ * Returns the result of the last ena_up() call, or 0 if the device was not
+ * up. Note that a successful revert therefore also returns 0.
+ * new_num is not validated here; the sysctl handler does that.
+ */
+int
+ena_update_base_cpu(struct ena_adapter *adapter, int new_num)
+{
+	int old_num;
+	int rc = 0;
+	bool dev_was_up;
+
+	dev_was_up = ENA_FLAG_ISSET(ENA_FLAG_DEV_UP, adapter);
+	old_num = adapter->irq_cpu_base;
+
+	ena_down(adapter);
+
+	adapter->irq_cpu_base = new_num;
+
+	if (dev_was_up) {
+		rc = ena_up(adapter);
+		if (unlikely(rc != 0)) {
+			ena_log(adapter->pdev, ERR,
+			    "Failed to configure device %d IRQ base CPU. "
+			    "Reverting to previous value: %d\n",
+			    new_num, old_num);
+
+			adapter->irq_cpu_base = old_num;
+
+			rc = ena_up(adapter);
+			if (unlikely(rc != 0)) {
+				ena_log(adapter->pdev, ERR,
+				    "Failed to revert to previous setup. "
+				    "Triggering device reset.\n");
+				ENA_FLAG_SET_ATOMIC(
+				    ENA_FLAG_DEV_UP_BEFORE_RESET, adapter);
+				ena_trigger_reset(adapter,
+				    ENA_REGS_RESET_OS_TRIGGER);
+			}
+		}
+	}
+	return (rc);
+}
+
+/*
+ * Set the CPU stride, i.e. the distance between the CPUs of consecutive IO
+ * IRQs, counted from the base CPU.
+ *
+ * The device is brought down, the new value is stored and, if the device was
+ * up before the call, it is brought up again. If that fails, the previous
+ * value is restored and ena_up() is retried; if the retry fails as well, a
+ * device reset is triggered.
+ *
+ * Returns the result of the last ena_up() call, or 0 if the device was not
+ * up. Note that a successful revert therefore also returns 0.
+ */
+int
+ena_update_cpu_stride(struct ena_adapter *adapter, uint32_t new_num)
+{
+	uint32_t old_num;
+	int rc = 0;
+	bool dev_was_up;
+
+	dev_was_up = ENA_FLAG_ISSET(ENA_FLAG_DEV_UP, adapter);
+	old_num = adapter->irq_cpu_stride;
+
+	ena_down(adapter);
+
+	adapter->irq_cpu_stride = new_num;
+
+	if (dev_was_up) {
+		rc = ena_up(adapter);
+		if (unlikely(rc != 0)) {
+			ena_log(adapter->pdev, ERR,
+			    "Failed to configure device %u IRQ CPU stride. "
+			    "Reverting to previous value: %u\n",
+			    new_num, old_num);
+
+			adapter->irq_cpu_stride = old_num;
+
+			rc = ena_up(adapter);
+			if (unlikely(rc != 0)) {
+				ena_log(adapter->pdev, ERR,
+				    "Failed to revert to previous setup. "
+				    "Triggering device reset.\n");
+				ENA_FLAG_SET_ATOMIC(
+				    ENA_FLAG_DEV_UP_BEFORE_RESET, adapter);
+				ena_trigger_reset(adapter,
+				    ENA_REGS_RESET_OS_TRIGGER);
+			}
+		}
+	}
+	return (rc);
+}
+
 /* Caller should sanitize new_num */
 int
 ena_update_io_queue_nb(struct ena_adapter *adapter, uint32_t new_num)
@@ -1683,6 +1785,18 @@
 		ena_log(adapter->pdev, DBG, "ena_setup_io_intr vector: %d\n",
 		    adapter->msix_entries[irq_idx].vector);
 
+		/*
+		 * With a base CPU configured, queue i is assigned to CPU
+		 * (base + i * stride) modulo the number of CPUs. With the
+		 * RSS kernel option the assignment below overrides the CPU.
+		 */
+		if (adapter->irq_cpu_base > ENA_BASE_CPU_UNSPECIFIED) {
+			adapter->que[i].cpu = adapter->irq_tbl[irq_idx].cpu =
+			    (unsigned)(adapter->irq_cpu_base +
+			    i * adapter->irq_cpu_stride) % (unsigned)mp_ncpus;
+			CPU_SETOF(adapter->que[i].cpu, &adapter->que[i].cpu_mask);
+		}
+
 #ifdef RSS
 		adapter->que[i].cpu = adapter->irq_tbl[irq_idx].cpu =
 		    rss_getcpu(cur_bind);
@@ -1790,18 +1904,19 @@
 		}
 		irq->requested = true;
 
-#ifdef RSS
-		rc = bus_bind_intr(adapter->pdev, irq->res, irq->cpu);
-		if (unlikely(rc != 0)) {
-			ena_log(pdev, ERR,
-			    "failed to bind interrupt handler for irq %ju to cpu %d: %d\n",
-			    rman_get_start(irq->res), irq->cpu, rc);
-			goto err;
-		}
+		if (adapter->rss_enabled ||
+		    adapter->irq_cpu_base > ENA_BASE_CPU_UNSPECIFIED) {
+			rc = bus_bind_intr(adapter->pdev, irq->res, irq->cpu);
+			if (unlikely(rc != 0)) {
+				ena_log(pdev, ERR,
+				    "failed to bind interrupt handler for irq %ju to cpu %d: %d\n",
+				    rman_get_start(irq->res), irq->cpu, rc);
+				goto err;
+			}
 
-		ena_log(pdev, INFO, "queue %d - cpu %d\n",
-		    i - ENA_IO_IRQ_FIRST_IDX, irq->cpu);
-#endif
+			ena_log(pdev, INFO, "queue %d - cpu %d\n",
+			    i - ENA_IO_IRQ_FIRST_IDX, irq->cpu);
+		}
 	}
 
 	return (rc);
@@ -1814,13 +1929,14 @@
 
 		/* Once we entered err: section and irq->requested is true we
 		   free both intr and resources */
-		if (irq->requested)
+		if (irq->requested) {
 			rcc = bus_teardown_intr(adapter->pdev, irq->res,
 			    irq->cookie);
-		if (unlikely(rcc != 0))
-			ena_log(pdev, ERR,
-			    "could not release irq: %d, error: %d\n",
-			    irq->vector, rcc);
+			if (unlikely(rcc != 0))
+				ena_log(pdev, ERR,
+				    "could not release irq: %d, error: %d\n",
+				    irq->vector, rcc);
+		}
 
 		/* If we entered err: section without irq->requested set we know
 		   it was bus_alloc_resource_any() that needs cleanup, provided
@@ -3523,6 +3639,13 @@
 	adapter->missing_tx_max_queues = ENA_DEFAULT_TX_MONITORED_QUEUES;
 	adapter->missing_tx_threshold = ENA_DEFAULT_TX_CMP_THRESHOLD;
 
+	adapter->irq_cpu_base = ENA_BASE_CPU_UNSPECIFIED;
+	adapter->irq_cpu_stride = 0;
+
+#ifdef RSS
+	adapter->rss_enabled = 1;
+#endif
+
 	if (version_printed++ == 0)
 		ena_log(pdev, INFO, "%s\n", ena_version);
 
diff --git a/sys/dev/ena/ena_sysctl.c b/sys/dev/ena/ena_sysctl.c
--- a/sys/dev/ena/ena_sysctl.c
+++ b/sys/dev/ena/ena_sysctl.c
@@ -38,6 +38,7 @@
 static void ena_sysctl_add_stats(struct ena_adapter *);
 static void ena_sysctl_add_eni_metrics(struct ena_adapter *);
 static void ena_sysctl_add_tuneables(struct ena_adapter *);
+static void ena_sysctl_add_irq_affinity(struct ena_adapter *);
 /* Kernel option RSS prevents manipulation of key hash and indirection table. */
 #ifndef RSS
 static void ena_sysctl_add_rss(struct ena_adapter *);
@@ -45,6 +46,8 @@
 static int ena_sysctl_buf_ring_size(SYSCTL_HANDLER_ARGS);
 static int ena_sysctl_rx_queue_size(SYSCTL_HANDLER_ARGS);
 static int ena_sysctl_io_queues_nb(SYSCTL_HANDLER_ARGS);
+static int ena_sysctl_irq_base_cpu(SYSCTL_HANDLER_ARGS);
+static int ena_sysctl_irq_cpu_stride(SYSCTL_HANDLER_ARGS);
 static int ena_sysctl_eni_metrics_interval(SYSCTL_HANDLER_ARGS);
 #ifndef RSS
 static int ena_sysctl_rss_key(SYSCTL_HANDLER_ARGS);
@@ -102,6 +105,7 @@
 	ena_sysctl_add_stats(adapter);
 	ena_sysctl_add_eni_metrics(adapter);
 	ena_sysctl_add_tuneables(adapter);
+	ena_sysctl_add_irq_affinity(adapter);
 #ifndef RSS
 	ena_sysctl_add_rss(adapter);
 #endif
@@ -448,6 +452,40 @@
 }
 #endif /* RSS */
 
+/*
+ * Register the "irq_affinity" sysctl node together with its "base_cpu" and
+ * "cpu_stride" leaves.
+ */
+static void
+ena_sysctl_add_irq_affinity(struct ena_adapter *adapter)
+{
+	device_t dev;
+
+	struct sysctl_ctx_list *ctx;
+	struct sysctl_oid *tree;
+	struct sysctl_oid_list *child;
+
+	dev = adapter->pdev;
+
+	ctx = device_get_sysctl_ctx(dev);
+	tree = device_get_sysctl_tree(dev);
+	child = SYSCTL_CHILDREN(tree);
+
+	tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "irq_affinity",
+	    CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, "Decide base CPU and stride for irqs affinity.");
+	child = SYSCTL_CHILDREN(tree);
+
+	/* Add base cpu leaf */
+	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "base_cpu",
+	    CTLTYPE_S32 | CTLFLAG_RW | CTLFLAG_MPSAFE, adapter, 0,
+	    ena_sysctl_irq_base_cpu, "I", "Base cpu index for setting irq affinity.");
+
+	/* Add cpu stride leaf */
+	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "cpu_stride",
+	    CTLTYPE_S32 | CTLFLAG_RW | CTLFLAG_MPSAFE, adapter, 0,
+	    ena_sysctl_irq_cpu_stride, "I", "Distance between irqs when setting affinity.");
+}
+
 
 /*
  * ena_sysctl_update_queue_node_nb - Register/unregister sysctl queue nodes.
@@ -707,6 +745,130 @@
 	return (0);
 }
 
+/*
+ * Sysctl handler for the "base_cpu" leaf.
+ *
+ * Reports the current base CPU and, on write, validates the new value (it
+ * must be non-negative and lower than mp_ncpus) before handing it to
+ * ena_update_base_cpu(). Runs with ENA_LOCK held and fails with ENODEV when
+ * the device is not running.
+ */
+static int
+ena_sysctl_irq_base_cpu(SYSCTL_HANDLER_ARGS)
+{
+	struct ena_adapter *adapter = arg1;
+	int irq_base_cpu = 0;
+	int error;
+
+	ENA_LOCK_LOCK();
+	if (unlikely(!ENA_FLAG_ISSET(ENA_FLAG_DEVICE_RUNNING, adapter))) {
+		error = ENODEV;
+		goto unlock;
+	}
+
+	error = sysctl_wire_old_buffer(req, sizeof(irq_base_cpu));
+	if (error == 0) {
+		irq_base_cpu = adapter->irq_cpu_base;
+		error = sysctl_handle_int(oidp, &irq_base_cpu, 0, req);
+	}
+	if (error != 0 || req->newptr == NULL)
+		goto unlock;
+
+	if (irq_base_cpu <= ENA_BASE_CPU_UNSPECIFIED) {
+		ena_log(adapter->pdev, ERR,
+		    "Requested base CPU is less than zero.\n");
+		error = EINVAL;
+		goto unlock;
+	}
+
+	/* Valid CPU indexes are 0 .. mp_ncpus - 1, so mp_ncpus is rejected. */
+	if (irq_base_cpu >= mp_ncpus) {
+		ena_log(adapter->pdev, INFO,
+		    "Requested base CPU must be lower than the number of available CPUs.\n");
+		error = EINVAL;
+		goto unlock;
+	}
+
+	if (irq_base_cpu == adapter->irq_cpu_base) {
+		ena_log(adapter->pdev, INFO,
+		    "Requested IRQ base CPU is equal to current value "
+		    "(%d)\n",
+		    adapter->irq_cpu_base);
+		goto unlock;
+	}
+
+	ena_log(adapter->pdev, INFO,
+	    "Requested new IRQ base CPU: %d, current value: %d\n",
+	    irq_base_cpu, adapter->irq_cpu_base);
+
+	error = ena_update_base_cpu(adapter, irq_base_cpu);
+
+unlock:
+	ENA_LOCK_UNLOCK();
+
+	return (error);
+}
+
+/*
+ * Sysctl handler for the "cpu_stride" leaf.
+ *
+ * Reports the current stride and, on write, rejects negative values before
+ * handing the new stride to ena_update_cpu_stride(). A stride larger than
+ * mp_ncpus is accepted but logged. Runs with ENA_LOCK held and fails with
+ * ENODEV when the device is not running.
+ */
+static int
+ena_sysctl_irq_cpu_stride(SYSCTL_HANDLER_ARGS)
+{
+	struct ena_adapter *adapter = arg1;
+	int32_t irq_cpu_stride = 0;
+	int error;
+
+	ENA_LOCK_LOCK();
+	if (unlikely(!ENA_FLAG_ISSET(ENA_FLAG_DEVICE_RUNNING, adapter))) {
+		error = ENODEV;
+		goto unlock;
+	}
+
+	error = sysctl_wire_old_buffer(req, sizeof(irq_cpu_stride));
+	if (error == 0) {
+		irq_cpu_stride = adapter->irq_cpu_stride;
+		error = sysctl_handle_int(oidp, &irq_cpu_stride, 0, req);
+	}
+	if (error != 0 || req->newptr == NULL)
+		goto unlock;
+
+	if (irq_cpu_stride < 0) {
+		ena_log(adapter->pdev, ERR,
+		    "Requested IRQ stride is less than zero.\n");
+		error = EINVAL;
+		goto unlock;
+	}
+
+	if (irq_cpu_stride > mp_ncpus) {
+		ena_log(adapter->pdev, INFO,
+		    "Warning: Requested IRQ stride is larger than the number of available CPUs.\n");
+	}
+
+	if (irq_cpu_stride == adapter->irq_cpu_stride) {
+		ena_log(adapter->pdev, INFO,
+		    "Requested IRQ CPU stride is equal to current value "
+		    "(%u)\n",
+		    adapter->irq_cpu_stride);
+		goto unlock;
+	}
+
+	ena_log(adapter->pdev, INFO,
+	    "Requested new IRQ CPU stride: %u, current value: %u\n",
+	    irq_cpu_stride, adapter->irq_cpu_stride);
+
+	error = ena_update_cpu_stride(adapter, irq_cpu_stride);
+
+unlock:
+	ENA_LOCK_UNLOCK();
+
+	return (error);
+}
+
 #ifndef RSS
 /*
  * Change the Receive Side Scaling hash key.