diff --git a/sys/dev/mlx5/driver.h b/sys/dev/mlx5/driver.h
--- a/sys/dev/mlx5/driver.h
+++ b/sys/dev/mlx5/driver.h
@@ -29,8 +29,6 @@
 #ifndef MLX5_DRIVER_H
 #define MLX5_DRIVER_H
 
-#include "opt_ratelimit.h"
-
 #include
 #include
 #include
@@ -532,23 +530,19 @@
 	struct radix_tree_root tree;
 };
 
-#ifdef RATELIMIT
 struct mlx5_rl_entry {
-	u32 rate;
+	u64 rate;	/* in bytes/s */
+	u32 qos_handle;	/* schedule queue handle */
 	u16 burst;
 	u16 index;
-	u32 qos_handle; /* schedule queue handle */
-	u32 refcount;
 };
 
 struct mlx5_rl_table {
-	struct mutex rl_lock;
 	u16 max_size;
-	u32 max_rate;
-	u32 min_rate;
+	u64 max_rate;	/* in bytes/s */
+	u64 min_rate;	/* in bytes/s */
 	struct mlx5_rl_entry *rl_entry;
 };
-#endif
 
 struct mlx5_pme_stats {
 	u64 status_counters[MLX5_MODULE_STATUS_NUM];
@@ -600,9 +594,9 @@
 	struct list_head ctx_list;
 	spinlock_t ctx_lock;
 	unsigned long pci_dev_data;
-#ifdef RATELIMIT
+
 	struct mlx5_rl_table rl_table;
-#endif
+
 	struct mlx5_pme_stats pme_stats;
 
 	struct mlx5_eswitch *eswitch;
@@ -1187,13 +1181,13 @@
 {
 	return !(dev->priv.pci_dev_data & MLX5_PCI_DEV_IS_VF);
 }
-#ifdef RATELIMIT
+
+extern bool mlx5_use_multi_sq;
 int mlx5_init_rl_table(struct mlx5_core_dev *dev);
+int mlx5_load_rl_table(struct mlx5_core_dev *dev);
 void mlx5_cleanup_rl_table(struct mlx5_core_dev *dev);
-int mlx5_rl_add_rate(struct mlx5_core_dev *dev, u32 rate, u32 burst, u16 *index);
-void mlx5_rl_remove_rate(struct mlx5_core_dev *dev, u32 rate, u32 burst);
-bool mlx5_rl_is_in_range(const struct mlx5_core_dev *dev, u32 rate, u32 burst);
-int mlx5e_query_rate_limit_cmd(struct mlx5_core_dev *dev, u16 index, u32 *scq_handle);
+void mlx5_rl_find_rate_index(struct mlx5_core_dev *dev, u64 rate, u16 *index);
+bool mlx5_rl_is_in_range(const struct mlx5_core_dev *dev, u64 rate, u32 burst);
 static inline u32
 mlx5_rl_get_scq_handle(struct mlx5_core_dev *dev, uint16_t index)
 {
@@ -1207,7 +1201,6 @@
 {
 	return !!(dev->priv.rl_table.max_size);
 }
-#endif
 
 void mlx5_disable_interrupts(struct mlx5_core_dev *);
 void mlx5_poll_interrupts(struct mlx5_core_dev *);
diff --git a/sys/dev/mlx5/mlx5_core/mlx5_alloc.c b/sys/dev/mlx5/mlx5_core/mlx5_alloc.c
--- a/sys/dev/mlx5/mlx5_core/mlx5_alloc.c
+++ b/sys/dev/mlx5/mlx5_core/mlx5_alloc.c
@@ -26,7 +26,6 @@
  */
 
 #include "opt_rss.h"
-#include "opt_ratelimit.h"
 
 #include
 #include
diff --git a/sys/dev/mlx5/mlx5_core/mlx5_cmd.c b/sys/dev/mlx5/mlx5_core/mlx5_cmd.c
--- a/sys/dev/mlx5/mlx5_core/mlx5_cmd.c
+++ b/sys/dev/mlx5/mlx5_core/mlx5_cmd.c
@@ -26,7 +26,6 @@
  */
 
 #include "opt_rss.h"
-#include "opt_ratelimit.h"
 
 #include
 #include
diff --git a/sys/dev/mlx5/mlx5_core/mlx5_cq.c b/sys/dev/mlx5/mlx5_core/mlx5_cq.c
--- a/sys/dev/mlx5/mlx5_core/mlx5_cq.c
+++ b/sys/dev/mlx5/mlx5_core/mlx5_cq.c
@@ -26,7 +26,6 @@
  */
 
 #include "opt_rss.h"
-#include "opt_ratelimit.h"
 
 #include
 #include
diff --git a/sys/dev/mlx5/mlx5_core/mlx5_diagnostics.c b/sys/dev/mlx5/mlx5_core/mlx5_diagnostics.c
--- a/sys/dev/mlx5/mlx5_core/mlx5_diagnostics.c
+++ b/sys/dev/mlx5/mlx5_core/mlx5_diagnostics.c
@@ -26,7 +26,6 @@
  */
 
 #include "opt_rss.h"
-#include "opt_ratelimit.h"
 
 #include
 #include
diff --git a/sys/dev/mlx5/mlx5_core/mlx5_eq.c b/sys/dev/mlx5/mlx5_core/mlx5_eq.c
--- a/sys/dev/mlx5/mlx5_core/mlx5_eq.c
+++ b/sys/dev/mlx5/mlx5_core/mlx5_eq.c
@@ -26,7 +26,6 @@
  */
 
 #include "opt_rss.h"
-#include "opt_ratelimit.h"
 
 #include
 #include
diff --git a/sys/dev/mlx5/mlx5_core/mlx5_eswitch.c b/sys/dev/mlx5/mlx5_core/mlx5_eswitch.c
--- a/sys/dev/mlx5/mlx5_core/mlx5_eswitch.c
+++ b/sys/dev/mlx5/mlx5_core/mlx5_eswitch.c
@@ -26,7 +26,6 @@
  */
 
 #include "opt_rss.h"
-#include "opt_ratelimit.h" #include #include diff --git a/sys/dev/mlx5/mlx5_core/mlx5_fs_cmd.c b/sys/dev/mlx5/mlx5_core/mlx5_fs_cmd.c --- a/sys/dev/mlx5/mlx5_core/mlx5_fs_cmd.c +++ b/sys/dev/mlx5/mlx5_core/mlx5_fs_cmd.c @@ -26,7 +26,6 @@ */ #include "opt_rss.h" -#include "opt_ratelimit.h" #include #include diff --git a/sys/dev/mlx5/mlx5_core/mlx5_fs_tree.c b/sys/dev/mlx5/mlx5_core/mlx5_fs_tree.c --- a/sys/dev/mlx5/mlx5_core/mlx5_fs_tree.c +++ b/sys/dev/mlx5/mlx5_core/mlx5_fs_tree.c @@ -26,7 +26,6 @@ */ #include "opt_rss.h" -#include "opt_ratelimit.h" #include #include diff --git a/sys/dev/mlx5/mlx5_core/mlx5_fw.c b/sys/dev/mlx5/mlx5_core/mlx5_fw.c --- a/sys/dev/mlx5/mlx5_core/mlx5_fw.c +++ b/sys/dev/mlx5/mlx5_core/mlx5_fw.c @@ -26,7 +26,6 @@ */ #include "opt_rss.h" -#include "opt_ratelimit.h" #include #include diff --git a/sys/dev/mlx5/mlx5_core/mlx5_fwdump.c b/sys/dev/mlx5/mlx5_core/mlx5_fwdump.c --- a/sys/dev/mlx5/mlx5_core/mlx5_fwdump.c +++ b/sys/dev/mlx5/mlx5_core/mlx5_fwdump.c @@ -24,7 +24,6 @@ */ #include "opt_rss.h" -#include "opt_ratelimit.h" #include __FBSDID("$FreeBSD$"); diff --git a/sys/dev/mlx5/mlx5_core/mlx5_health.c b/sys/dev/mlx5/mlx5_core/mlx5_health.c --- a/sys/dev/mlx5/mlx5_core/mlx5_health.c +++ b/sys/dev/mlx5/mlx5_core/mlx5_health.c @@ -26,7 +26,6 @@ */ #include "opt_rss.h" -#include "opt_ratelimit.h" #include #include diff --git a/sys/dev/mlx5/mlx5_core/mlx5_mad.c b/sys/dev/mlx5/mlx5_core/mlx5_mad.c --- a/sys/dev/mlx5/mlx5_core/mlx5_mad.c +++ b/sys/dev/mlx5/mlx5_core/mlx5_mad.c @@ -26,7 +26,6 @@ */ #include "opt_rss.h" -#include "opt_ratelimit.h" #include #include diff --git a/sys/dev/mlx5/mlx5_core/mlx5_main.c b/sys/dev/mlx5/mlx5_core/mlx5_main.c --- a/sys/dev/mlx5/mlx5_core/mlx5_main.c +++ b/sys/dev/mlx5/mlx5_core/mlx5_main.c @@ -1217,6 +1217,12 @@ goto err_fpga; } + err = mlx5_load_rl_table(dev); + if (err) { + mlx5_core_err(dev, "Failed to load rate limiting table\n"); + goto err_diag_cnt; + } + err = mlx5_register_device(dev); if (err) { mlx5_core_err(dev, "mlx5_register_device failed %d\n", err); diff --git a/sys/dev/mlx5/mlx5_core/mlx5_mcg.c b/sys/dev/mlx5/mlx5_core/mlx5_mcg.c --- a/sys/dev/mlx5/mlx5_core/mlx5_mcg.c +++ b/sys/dev/mlx5/mlx5_core/mlx5_mcg.c @@ -26,7 +26,6 @@ */ #include "opt_rss.h" -#include "opt_ratelimit.h" #include #include diff --git a/sys/dev/mlx5/mlx5_core/mlx5_mpfs.c b/sys/dev/mlx5/mlx5_core/mlx5_mpfs.c --- a/sys/dev/mlx5/mlx5_core/mlx5_mpfs.c +++ b/sys/dev/mlx5/mlx5_core/mlx5_mpfs.c @@ -26,7 +26,6 @@ */ #include "opt_rss.h" -#include "opt_ratelimit.h" #include #include diff --git a/sys/dev/mlx5/mlx5_core/mlx5_mr.c b/sys/dev/mlx5/mlx5_core/mlx5_mr.c --- a/sys/dev/mlx5/mlx5_core/mlx5_mr.c +++ b/sys/dev/mlx5/mlx5_core/mlx5_mr.c @@ -26,7 +26,6 @@ */ #include "opt_rss.h" -#include "opt_ratelimit.h" #include #include diff --git a/sys/dev/mlx5/mlx5_core/mlx5_pagealloc.c b/sys/dev/mlx5/mlx5_core/mlx5_pagealloc.c --- a/sys/dev/mlx5/mlx5_core/mlx5_pagealloc.c +++ b/sys/dev/mlx5/mlx5_core/mlx5_pagealloc.c @@ -26,7 +26,6 @@ */ #include "opt_rss.h" -#include "opt_ratelimit.h" #include #include diff --git a/sys/dev/mlx5/mlx5_core/mlx5_pd.c b/sys/dev/mlx5/mlx5_core/mlx5_pd.c --- a/sys/dev/mlx5/mlx5_core/mlx5_pd.c +++ b/sys/dev/mlx5/mlx5_core/mlx5_pd.c @@ -26,7 +26,6 @@ */ #include "opt_rss.h" -#include "opt_ratelimit.h" #include #include diff --git a/sys/dev/mlx5/mlx5_core/mlx5_port.c b/sys/dev/mlx5/mlx5_core/mlx5_port.c --- a/sys/dev/mlx5/mlx5_core/mlx5_port.c +++ b/sys/dev/mlx5/mlx5_core/mlx5_port.c @@ -26,7 +26,6 @@ */ #include "opt_rss.h" -#include 
"opt_ratelimit.h" #include #include diff --git a/sys/dev/mlx5/mlx5_core/mlx5_qp.c b/sys/dev/mlx5/mlx5_core/mlx5_qp.c --- a/sys/dev/mlx5/mlx5_core/mlx5_qp.c +++ b/sys/dev/mlx5/mlx5_core/mlx5_qp.c @@ -26,7 +26,6 @@ */ #include "opt_rss.h" -#include "opt_ratelimit.h" #include #include diff --git a/sys/dev/mlx5/mlx5_core/mlx5_rl.c b/sys/dev/mlx5/mlx5_core/mlx5_rl.c --- a/sys/dev/mlx5/mlx5_core/mlx5_rl.c +++ b/sys/dev/mlx5/mlx5_core/mlx5_rl.c @@ -1,5 +1,6 @@ /*- - * Copyright (c) 2013-2017, Mellanox Technologies, Ltd. All rights reserved. + * Copyright (c) 2013-2017, Mellanox Technologies, Ltd. + * Copyright (c) 2022 NVIDIA corporation & affiliates. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -21,216 +22,331 @@ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. - * - * $FreeBSD$ */ #include "opt_rss.h" -#include "opt_ratelimit.h" #include #include + #include #include -#ifdef RATELIMIT - -/* Finds an entry where we can register the given rate - * If the rate already exists, return the entry where it is registered, - * otherwise return the first available entry. - * If the table is full, return NULL - */ -static struct mlx5_rl_entry *find_rl_entry(struct mlx5_rl_table *table, - u32 rate, u16 burst) -{ - struct mlx5_rl_entry *ret_entry = NULL; - struct mlx5_rl_entry *entry; - u16 i; - - for (i = 0; i < table->max_size; i++) { - entry = table->rl_entry + i; - if (entry->rate == rate && entry->burst == burst) - return entry; - if (ret_entry == NULL && entry->rate == 0) - ret_entry = entry; - } - - return ret_entry; -} - -static int mlx5_set_rate_limit_cmd(struct mlx5_core_dev *dev, - u32 rate, u32 burst, u16 index) +static int +mlx5_set_rate_limit_cmd(struct mlx5_core_dev *dev, u32 rate, u32 burst, u16 index) { u32 in[MLX5_ST_SZ_DW(set_rate_limit_in)] = {}; u32 out[MLX5_ST_SZ_DW(set_rate_limit_out)] = {}; MLX5_SET(set_rate_limit_in, in, opcode, MLX5_CMD_OP_SET_RATE_LIMIT); MLX5_SET(set_rate_limit_in, in, rate_limit_index, index); - MLX5_SET(set_rate_limit_in, in, rate_limit, rate); + MLX5_SET(set_rate_limit_in, in, rate_limit, rate); /* Kbit/s */ MLX5_SET(set_rate_limit_in, in, burst_upper_bound, burst); MLX5_SET(set_rate_limit_in, in, typical_packet_size, 0 /* use MTU */); - return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + return (mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out))); } -int mlx5e_query_rate_limit_cmd(struct mlx5_core_dev *dev, - u16 index, u32 *scq_handle) +static int +mlx5_query_rate_limit_cmd(struct mlx5_core_dev *dev, u16 index, u32 *scq_handle) { - int err; u32 in[MLX5_ST_SZ_DW(query_pp_rate_limit_in)] = {}; u32 out[MLX5_ST_SZ_DW(query_pp_rate_limit_out)] = {}; + int err; MLX5_SET(query_pp_rate_limit_in, in, opcode, MLX5_CMD_OP_QUERY_RATE_LIMIT); MLX5_SET(query_pp_rate_limit_in, in, rate_limit_index, index); err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); if (err) - return err; + return (err); *scq_handle = MLX5_GET(query_pp_rate_limit_out, out, pp_context.qos_handle); - return 0; + return (0); } -bool mlx5_rl_is_in_range(const struct mlx5_core_dev *dev, u32 rate, u32 burst) +bool +mlx5_rl_is_in_range(const struct mlx5_core_dev *dev, u64 rate, u32 burst) { const struct mlx5_rl_table *table = &dev->priv.rl_table; return (rate <= table->max_rate && rate >= table->min_rate && burst <= 65535); } -EXPORT_SYMBOL(mlx5_rl_is_in_range); -int 
+void
+mlx5_rl_find_rate_index(struct mlx5_core_dev *dev, u64 rate, u16 *index)
 {
 	struct mlx5_rl_table *table = &dev->priv.rl_table;
-	struct mlx5_rl_entry *entry;
-	int err = 0;
-
-	mutex_lock(&table->rl_lock);
+	u16 i;
 
-	if (!rate || !mlx5_rl_is_in_range(dev, rate, burst)) {
-		mlx5_core_err(dev, "Invalid rate: %u, should be %u to %u\n",
-		    rate, table->min_rate, table->max_rate);
-		err = -ERANGE;
-		goto out;
+	if (unlikely(table->max_size == 0)) {
+		*index = 0;
+		return;
 	}
 
-	entry = find_rl_entry(table, rate, burst);
-	if (!entry) {
-		mlx5_core_err(dev, "Max number of %u rates reached\n",
-		    table->max_size);
-		err = -ENOSPC;
-		goto out;
-	}
-	if (entry->refcount == 0xFFFFFFFFU) {
-		/* out of refcounts */
-		err = -ENOMEM;
-		goto out;
-	} else if (entry->refcount != 0) {
-		/* rate already configured */
-		entry->refcount++;
-	} else {
-		/* new rate limit */
-		err = mlx5_set_rate_limit_cmd(dev, rate, burst, entry->index);
-		if (err) {
-			mlx5_core_err(dev, "Failed configuring rate: %u (%d)\n",
-			    rate, err);
-			goto out;
+	for (i = 0;; i++) {
+		if (i == table->max_size) {
+			*index = table->rl_entry[--i].index;
+			break;
+		} else if (table->rl_entry[i].rate >= rate) {
+			*index = table->rl_entry[i].index;
+			break;
 		}
-		entry->rate = rate;
-		entry->burst = burst;
-		entry->refcount = 1;
+	}
+}
 
-		if (MLX5_CAP_QOS(dev, qos_remap_pp)) {
-			err = mlx5e_query_rate_limit_cmd(dev, entry->index, &entry->qos_handle);
-			if (err) {
-				mlx5_core_err(dev, "Failed retrieving schedule queue handle for"
-				    "SQ remap: rate: %u error:(%d)\n", rate, err);
-				entry->qos_handle = MLX5_INVALID_QUEUE_HANDLE;
+static void
+mlx5_rl_rates_sort(uint32_t *ptr, size_t num)
+{
+	size_t x;
+	size_t y;
+
+	for (x = 0; x != num; x += 2) {
+		for (y = x + 2; y != num; y += 2) {
+			if (ptr[x] > ptr[y]) {
+				uint32_t temp[2] = { ptr[y], ptr[y + 1] };
+				ptr[y] = ptr[x];
+				ptr[y + 1] = ptr[x + 1];
+				ptr[x] = temp[0];
+				ptr[x + 1] = temp[1];
 			}
-		} else
-			entry->qos_handle = MLX5_INVALID_QUEUE_HANDLE;
+		}
 	}
-	*index = entry->index;
-
-out:
-	mutex_unlock(&table->rl_lock);
-	return err;
 }
-EXPORT_SYMBOL(mlx5_rl_add_rate);
 
-void mlx5_rl_remove_rate(struct mlx5_core_dev *dev, u32 rate, u32 burst)
+#define	MLX5_RL_BURST_DEF (4 * 1600)	/* bytes */
+#define	MLX5_RL_BURST_MAX 65535		/* bytes */
+
+static SYSCTL_NODE(_hw_mlx5, OID_AUTO, rates, CTLFLAG_RW | CTLFLAG_MPSAFE,
+    0, "Rate and burst tables");
+
+bool mlx5_use_multi_sq;
+
+SYSCTL_BOOL(_hw_mlx5_rates, OID_AUTO, use_multi_sq, CTLFLAG_RDTUN | CTLFLAG_MPSAFE,
+    &mlx5_use_multi_sq, 0, "Set to force use of one SQ per available rate");
+
+/* CX-4 cards, rate and burst, logarithmic table */
+static uint32_t mlx5_adapter_byte_rates_cx4_mlx[] = {
+	135375, MLX5_RL_BURST_DEF,	/* 1,083,000 */
+	180500, MLX5_RL_BURST_DEF,	/* 1,444,000 */
+	270750, MLX5_RL_BURST_DEF,	/* 2,166,000 */
+	361000, MLX5_RL_BURST_DEF,	/* 2,888,000 */
+	541500, MLX5_RL_BURST_DEF,	/* 4,332,000 */
+	721875, MLX5_RL_BURST_DEF,	/* 5,775,000 */
+	1082875, MLX5_RL_BURST_DEF,	/* 8,663,000 */
+	1443875, MLX5_RL_BURST_DEF,	/* 11,551,000 */
+	2165750, MLX5_RL_BURST_DEF,	/* 17,326,000 */
+	2887750, MLX5_RL_BURST_DEF,	/* 23,102,000 */
+	4331625, MLX5_RL_BURST_DEF,	/* 34,653,000 */
+	5775500, MLX5_RL_BURST_DEF,	/* 46,204,000 */
+	8663125, MLX5_RL_BURST_DEF,	/* 69,305,000 */
+	0, 0				/* END */
+};
+
+static int
+mlx5_sysctl_rates_cx4(SYSCTL_HANDLER_ARGS)
 {
-	struct mlx5_rl_table *table = &dev->priv.rl_table;
-	struct mlx5e_rl_entry *entry = NULL;
+	const size_t size = sizeof(mlx5_adapter_byte_rates_cx4_mlx);
+	size_t i;
+	size_t j;
+	int err;
 
-	/* 0 is a reserved value for unlimited rate */
-	if (rate == 0)
-		return;
+	err = SYSCTL_OUT(req, mlx5_adapter_byte_rates_cx4_mlx, size);
+	if (err || !req->newptr)
+		goto done;
+
+	err = SYSCTL_IN(req, mlx5_adapter_byte_rates_cx4_mlx, size);
 
-	mutex_lock(&table->rl_lock);
-	entry = find_rl_entry(table, rate, burst);
-	if (!entry || !entry->refcount) {
-		mlx5_core_warn(dev, "Rate %u is not configured\n", rate);
-		goto out;
+	/* always zero pad two last entries */
+	mlx5_adapter_byte_rates_cx4_mlx[ARRAY_SIZE(mlx5_adapter_byte_rates_cx4_mlx) - 2] = 0;
+	mlx5_adapter_byte_rates_cx4_mlx[ARRAY_SIZE(mlx5_adapter_byte_rates_cx4_mlx) - 1] = 0;
+
+	/* find end of rates */
+	for (i = 0; i != ARRAY_SIZE(mlx5_adapter_byte_rates_cx4_mlx); i += 2) {
+		if (mlx5_adapter_byte_rates_cx4_mlx[i] == 0) {
+			/* zero rest of array, if any */
+			for (j = i + 2; j != ARRAY_SIZE(mlx5_adapter_byte_rates_cx4_mlx); j += 2) {
+				mlx5_adapter_byte_rates_cx4_mlx[j] = 0;
+				mlx5_adapter_byte_rates_cx4_mlx[j + 1] = 0;
+			}
+			break;
+		}
 	}
 
-	entry->refcount--;
-	if (!entry->refcount) {
-		/* need to remove rate */
-		mlx5_set_rate_limit_cmd(dev, 0, 0, entry->index);
-		entry->rate = 0;
-		entry->burst = 0;
+	/* sort rates */
+	mlx5_rl_rates_sort(mlx5_adapter_byte_rates_cx4_mlx, i);
+done:
+	return (err);
+}
+SYSCTL_PROC(_hw_mlx5_rates, OID_AUTO, cx4, CTLTYPE_U32 | CTLFLAG_RDTUN | CTLFLAG_MPSAFE,
+    NULL, 0, mlx5_sysctl_rates_cx4, "IU",
+    "Rate and burst table in bytes[/s] for CX-4, zero terminated");
+
+/* CX-5 cards and newer, rate and burst, logarithmic table */
+static uint32_t mlx5_adapter_byte_rates_cx5_mlx[] = {
+	135375, MLX5_RL_BURST_DEF,	/* 1,083,000 bits/s */
+	191447, MLX5_RL_BURST_DEF,	/* 1,531,576 bits/s */
+	270745, MLX5_RL_BURST_DEF,	/* 2,165,960 bits/s */
+	382888, MLX5_RL_BURST_DEF,	/* 3,063,104 bits/s */
+	541481, MLX5_RL_BURST_DEF,	/* 4,331,848 bits/s */
+	765764, MLX5_RL_BURST_DEF,	/* 6,126,112 bits/s */
+	1082945, MLX5_RL_BURST_DEF,	/* 8,663,560 bits/s */
+	1531503, MLX5_RL_BURST_DEF,	/* 12,252,024 bits/s */
+	2165854, MLX5_RL_BURST_DEF,	/* 17,326,832 bits/s */
+	3062954, MLX5_RL_BURST_DEF,	/* 24,503,632 bits/s */
+	4331635, MLX5_RL_BURST_DEF,	/* 34,653,080 bits/s */
+	6125805, MLX5_RL_BURST_DEF,	/* 49,006,440 bits/s */
+	8663125, MLX5_RL_BURST_DEF,	/* 69,305,000 bits/s */
+	12251405, MLX5_RL_BURST_DEF,	/* 98,011,240 bits/s */
+	17325958, MLX5_RL_BURST_DEF,	/* 138,607,664 bits/s */
+	24502399, MLX5_RL_BURST_DEF,	/* 196,019,192 bits/s */
+	34651333, MLX5_RL_BURST_DEF,	/* 277,210,664 bits/s */
+	49003973, MLX5_RL_BURST_DEF,	/* 392,031,784 bits/s */
+	69301500, MLX5_RL_BURST_DEF,	/* 554,412,000 bits/s */
+	98006296, MLX5_RL_BURST_DEF,	/* 784,050,368 bits/s */
+	138600667, MLX5_RL_BURST_DEF,	/* 1,108,805,336 bits/s */
+	196009293, MLX5_RL_BURST_DEF,	/* 1,568,074,344 bits/s */
+	277196668, MLX5_RL_BURST_DEF,	/* 2,217,573,344 bits/s */
+	392011989, MLX5_RL_BURST_DEF,	/* 3,136,095,912 bits/s */
+	554384005, MLX5_RL_BURST_DEF,	/* 4,435,072,040 bits/s */
+	784010780, MLX5_RL_BURST_DEF,	/* 6,272,086,240 bits/s */
+	1108749347, MLX5_RL_BURST_DEF,	/* 8,869,994,776 bits/s */
+	1567995167, MLX5_RL_BURST_DEF,	/* 12,543,961,336 bits/s */
+	2217461368, MLX5_RL_BURST_DEF,	/* 17,739,690,944 bits/s */
+	3135937547, MLX5_RL_BURST_DEF,	/* 25,087,500,376 bits/s */
+	0, 0				/* END */
+};
+
+static int
+mlx5_sysctl_rates_cx5(SYSCTL_HANDLER_ARGS)
+{
+	const size_t size = sizeof(mlx5_adapter_byte_rates_cx5_mlx);
+	size_t i;
+	size_t j;
+	int err;
+
+	err = SYSCTL_OUT(req, mlx5_adapter_byte_rates_cx5_mlx, size);
+	if (err || !req->newptr)
+		goto done;
+
+	err = SYSCTL_IN(req, mlx5_adapter_byte_rates_cx5_mlx, size);
+
+	/* always zero pad two last entries */
+	mlx5_adapter_byte_rates_cx5_mlx[ARRAY_SIZE(mlx5_adapter_byte_rates_cx5_mlx) - 2] = 0;
+	mlx5_adapter_byte_rates_cx5_mlx[ARRAY_SIZE(mlx5_adapter_byte_rates_cx5_mlx) - 1] = 0;
+
+	/* find end of rates */
+	for (i = 0; i != ARRAY_SIZE(mlx5_adapter_byte_rates_cx5_mlx); i += 2) {
+		if (mlx5_adapter_byte_rates_cx5_mlx[i] == 0) {
+			/* zero rest of array, if any */
+			for (j = i + 2; j != ARRAY_SIZE(mlx5_adapter_byte_rates_cx5_mlx); j += 2) {
+				mlx5_adapter_byte_rates_cx5_mlx[j] = 0;
+				mlx5_adapter_byte_rates_cx5_mlx[j + 1] = 0;
+			}
+			break;
+		}
 	}
 
-out:
-	mutex_unlock(&table->rl_lock);
+	/* sort rates */
+	mlx5_rl_rates_sort(mlx5_adapter_byte_rates_cx5_mlx, i);
+done:
+	return (err);
 }
-EXPORT_SYMBOL(mlx5_rl_remove_rate);
+SYSCTL_PROC(_hw_mlx5_rates, OID_AUTO, cx5, CTLTYPE_U32 | CTLFLAG_RDTUN | CTLFLAG_MPSAFE,
+    NULL, 0, mlx5_sysctl_rates_cx5, "IU",
+    "Rate and burst table in bytes[/s] for CX-5 and newer, zero terminated");
 
-int mlx5_init_rl_table(struct mlx5_core_dev *dev)
+int
+mlx5_init_rl_table(struct mlx5_core_dev *dev)
 {
 	struct mlx5_rl_table *table = &dev->priv.rl_table;
-	int i;
+	uint32_t *prate;
+	u16 num;
+	u16 i;
 
-	mutex_init(&table->rl_lock);
-	if (!MLX5_CAP_GEN(dev, qos) || !MLX5_CAP_QOS(dev, packet_pacing)) {
-		table->max_size = 0;
-		return 0;
-	}
+	if (!MLX5_CAP_GEN(dev, qos) || !MLX5_CAP_QOS(dev, packet_pacing) ||
+	    MLX5_CAP_QOS(dev, packet_pacing_rate_table_size) < 14)
+		return (0);
+
+	/* one entry is reserved for unlimited traffic */
+	num = MLX5_CAP_QOS(dev, packet_pacing_rate_table_size) - 1;
+	if (num == 13)
+		prate = mlx5_adapter_byte_rates_cx4_mlx;
+	else
+		prate = mlx5_adapter_byte_rates_cx5_mlx;
 
-	/* First entry is reserved for unlimited rate */
-	table->max_size = MLX5_CAP_QOS(dev, packet_pacing_rate_table_size) - 1;
-	table->max_rate = MLX5_CAP_QOS(dev, packet_pacing_max_rate);
-	table->min_rate = MLX5_CAP_QOS(dev, packet_pacing_min_rate);
+	/* get maximum and minimum rates in bytes/s */
+	table->max_rate = MLX5_CAP_QOS(dev, packet_pacing_max_rate) * (1000ULL / 8ULL);
+	table->min_rate = MLX5_CAP_QOS(dev, packet_pacing_min_rate) * (1000ULL / 8ULL);
 
-	table->rl_entry = kcalloc(table->max_size, sizeof(struct mlx5_rl_entry),
-	    GFP_KERNEL);
-	if (!table->rl_entry)
-		return -ENOMEM;
+	/* allocate full sized table */
+	table->rl_entry = kcalloc(num, sizeof(struct mlx5_rl_entry), GFP_KERNEL);
+	if (table->rl_entry == NULL)
+		return (-ENOMEM);
 
-	/* The index represents the index in HW rate limit table
-	 * Index 0 is reserved for unlimited rate
-	 */
-	for (i = 0; i < table->max_size; i++)
+	/* skip too small rates, if any */
+	while (*prate != 0 && *prate < table->min_rate)
+		prate += 2;
+
+	/* fill and configure entries until end is reached */
+	for (i = 0; *prate != 0 && *prate <= table->max_rate && i != num; i++, prate += 2) {
 		table->rl_entry[i].index = i + 1;
+		table->rl_entry[i].rate = prate[0];
+		if (prate[1] > MLX5_RL_BURST_MAX)
+			table->rl_entry[i].burst = MLX5_RL_BURST_MAX;
+		else
+			table->rl_entry[i].burst = prate[1];
+	}
 
-	return 0;
+	table->max_size = i;
+	return (0);
 }
 
-void mlx5_cleanup_rl_table(struct mlx5_core_dev *dev)
+int
+mlx5_load_rl_table(struct mlx5_core_dev *dev)
 {
 	struct mlx5_rl_table *table = &dev->priv.rl_table;
-	int i;
+	int err;
+	u16 i;
 
-	/* Clear all configured rates */
-	for (i = 0; i < table->max_size; i++)
-		if (table->rl_entry[i].rate)
-			mlx5_set_rate_limit_cmd(dev, 0, 0,
-			    table->rl_entry[i].index);
+	for (i = 0; i != table->max_size; i++) {
+		err = mlx5_set_rate_limit_cmd(dev,
+		    table->rl_entry[i].rate / (1000ULL / 8ULL),
+		    table->rl_entry[i].burst, i + 1);
+		if (err)
+			goto error;
 
-	kfree(dev->priv.rl_table.rl_entry);
+		if (MLX5_CAP_QOS(dev, qos_remap_pp)) {
+			err = mlx5_query_rate_limit_cmd(dev, i + 1,
+			    &table->rl_entry[i].qos_handle);
+			if (err)
+				goto error;
+		} else {
+			table->rl_entry[i].qos_handle =
+			    MLX5_INVALID_QUEUE_HANDLE;
+		}
+	}
+	return (0);
+
+error:
+	return (err);
 }
-#endif
+
+void
+mlx5_cleanup_rl_table(struct mlx5_core_dev *dev)
+{
+	struct mlx5_rl_table *table = &dev->priv.rl_table;
+	u16 i;
+
+	/* Clear all configured rates, if any. */
+	for (i = 0; i != table->max_size; i++) {
+		if (table->rl_entry[i].rate == 0)
+			continue;
+		mlx5_set_rate_limit_cmd(dev, 0, 0,
+		    table->rl_entry[i].index);
+	}
+
+	kfree(dev->priv.rl_table.rl_entry);
+}
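The two SYSCTL_PROC handlers above publish each rate table as an array of 32-bit words holding zero-terminated {rate in bytes/s, burst in bytes} pairs, re-sorted on every update. Below is a minimal userspace sketch (not part of the patch) for dumping the active CX-5 table; it assumes only that the OID name hw.mlx5.rates.cx5 follows from the SYSCTL_PROC declaration above. Since the OID is created with CTLFLAG_RDTUN, runtime writes are refused and changes must instead be staged as a loader tunable, for example in /boot/loader.conf, before the driver attaches.

/* rates_dump.c: print the mlx5 CX-5 rate/burst table (illustrative sketch) */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <err.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
	uint32_t *tbl;
	size_t len = 0;
	size_t i;

	/* first query returns the table size in bytes */
	if (sysctlbyname("hw.mlx5.rates.cx5", NULL, &len, NULL, 0) != 0)
		err(1, "hw.mlx5.rates.cx5");
	if ((tbl = malloc(len)) == NULL)
		err(1, "malloc");
	if (sysctlbyname("hw.mlx5.rates.cx5", tbl, &len, NULL, 0) != 0)
		err(1, "hw.mlx5.rates.cx5");

	/* entries are {bytes/s, burst} pairs; a zero rate terminates the list */
	for (i = 0; i + 1 < len / sizeof(uint32_t) && tbl[i] != 0; i += 2)
		printf("%zu: %u bytes/s, burst %u bytes\n", i / 2, tbl[i], tbl[i + 1]);
	free(tbl);
	return (0);
}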
diff --git a/sys/dev/mlx5/mlx5_core/mlx5_srq.c b/sys/dev/mlx5/mlx5_core/mlx5_srq.c
--- a/sys/dev/mlx5/mlx5_core/mlx5_srq.c
+++ b/sys/dev/mlx5/mlx5_core/mlx5_srq.c
@@ -26,7 +26,6 @@
  */
 
 #include "opt_rss.h"
-#include "opt_ratelimit.h"
 
 #include
 #include
diff --git a/sys/dev/mlx5/mlx5_core/mlx5_tls.c b/sys/dev/mlx5/mlx5_core/mlx5_tls.c
--- a/sys/dev/mlx5/mlx5_core/mlx5_tls.c
+++ b/sys/dev/mlx5/mlx5_core/mlx5_tls.c
@@ -26,7 +26,6 @@
  */
 
 #include "opt_rss.h"
-#include "opt_ratelimit.h"
 
 #include
 #include
diff --git a/sys/dev/mlx5/mlx5_core/mlx5_transobj.c b/sys/dev/mlx5/mlx5_core/mlx5_transobj.c
--- a/sys/dev/mlx5/mlx5_core/mlx5_transobj.c
+++ b/sys/dev/mlx5/mlx5_core/mlx5_transobj.c
@@ -26,7 +26,6 @@
  */
 
 #include "opt_rss.h"
-#include "opt_ratelimit.h"
 
 #include
 #include
diff --git a/sys/dev/mlx5/mlx5_core/mlx5_uar.c b/sys/dev/mlx5/mlx5_core/mlx5_uar.c
--- a/sys/dev/mlx5/mlx5_core/mlx5_uar.c
+++ b/sys/dev/mlx5/mlx5_core/mlx5_uar.c
@@ -26,7 +26,6 @@
  */
 
 #include "opt_rss.h"
-#include "opt_ratelimit.h"
 
 #include
 #include
diff --git a/sys/dev/mlx5/mlx5_core/mlx5_vport.c b/sys/dev/mlx5/mlx5_core/mlx5_vport.c
--- a/sys/dev/mlx5/mlx5_core/mlx5_vport.c
+++ b/sys/dev/mlx5/mlx5_core/mlx5_vport.c
@@ -26,7 +26,6 @@
  */
 
 #include "opt_rss.h"
-#include "opt_ratelimit.h"
 
 #include
 #include
diff --git a/sys/dev/mlx5/mlx5_core/mlx5_vsc.c b/sys/dev/mlx5/mlx5_core/mlx5_vsc.c
--- a/sys/dev/mlx5/mlx5_core/mlx5_vsc.c
+++ b/sys/dev/mlx5/mlx5_core/mlx5_vsc.c
@@ -26,7 +26,6 @@
  */
 
 #include "opt_rss.h"
-#include "opt_ratelimit.h"
 
 #include
 #include
diff --git a/sys/dev/mlx5/mlx5_core/mlx5_wq.c b/sys/dev/mlx5/mlx5_core/mlx5_wq.c
--- a/sys/dev/mlx5/mlx5_core/mlx5_wq.c
+++ b/sys/dev/mlx5/mlx5_core/mlx5_wq.c
@@ -26,7 +26,6 @@
  */
 
 #include "opt_rss.h"
-#include "opt_ratelimit.h"
 
 #include
 #include
diff --git a/sys/dev/mlx5/mlx5_en/en.h b/sys/dev/mlx5/mlx5_en/en.h
--- a/sys/dev/mlx5/mlx5_en/en.h
+++ b/sys/dev/mlx5/mlx5_en/en.h
@@ -885,7 +885,6 @@
 	u16 max_inline;
 	u8 min_inline_mode;
 	u8 min_insert_caps;
-	u32 queue_handle; /* SQ remap support */
 #define	MLX5E_INSERT_VLAN 1
 #define	MLX5E_INSERT_NON_VLAN 2
 
@@ -896,7 +895,7 @@
 } __aligned(MLX5E_CACHELINE_SIZE);
 
 static inline bool
-mlx5e_sq_has_room_for(struct mlx5e_sq *sq, u16 n)
+mlx5e_sq_has_room_for(const struct mlx5e_sq *sq, u16 n)
 {
 	u16 cc = sq->cc;
 	u16 pc = sq->pc;
@@ -904,8 +903,14 @@
 	return ((sq->wq.sz_m1 & (cc - pc)) >= n || cc == pc);
 }
 
+static inline bool
+mlx5e_sq_is_empty(const struct mlx5e_sq *sq)
+{
+	return (sq->cc == sq->pc);
+}
+
 static inline u32
-mlx5e_sq_queue_level(struct mlx5e_sq *sq)
+mlx5e_sq_queue_level(const struct mlx5e_sq *sq)
 {
 	u16 cc;
 	u16 pc;
@@ -1264,6 +1269,17 @@
 	mlx5_cq_arm(mcq, MLX5_CQ_DB_REQ_NOT, mcq->uar->map, dblock, cq->wq.cc);
 }
 
+static inline void
+mlx5e_accumulate_sq_stats(const struct mlx5e_sq_stats *src, struct mlx5e_sq_stats *dst)
+{
+	dst->tso_packets += src->tso_packets;
+	dst->tso_bytes += src->tso_bytes;
+	dst->dropped += src->dropped;
+	dst->enobuf += src->enobuf;
+	dst->defragged += src->defragged;
+	dst->csum_offload_none += src->csum_offload_none;
+}
+
 #define	mlx5e_dbg(_IGN, _priv, ...) mlx5_core_dbg((_priv)->mdev, __VA_ARGS__)
 
 extern const struct ethtool_ops mlx5e_ethtool_ops;
diff --git a/sys/dev/mlx5/mlx5_en/en_rl.h b/sys/dev/mlx5/mlx5_en/en_rl.h
--- a/sys/dev/mlx5/mlx5_en/en_rl.h
+++ b/sys/dev/mlx5/mlx5_en/en_rl.h
@@ -1,5 +1,6 @@
 /*-
- * Copyright (c) 2016 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2016 Mellanox Technologies.
+ * Copyright (c) 2022 NVIDIA corporation & affiliates.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -39,24 +40,22 @@
 
 #include
 
-#define	MLX5E_RL_MAX_WORKERS 128	/* limited by Toeplitz hash */
-#define	MLX5E_RL_MAX_TX_RATES (64 * 1024)	/* software limit */
-#define	MLX5E_RL_DEF_SQ_PER_WORKER (12 * 1024)	/* software limit */
-#define	MLX5E_RL_MAX_SQS (120 * 1024)	/* software limit */
-
 #define	MLX5E_RL_TX_COAL_USEC_DEFAULT	32
 #define	MLX5E_RL_TX_COAL_PKTS_DEFAULT	4
 #define	MLX5E_RL_TX_COAL_MODE_DEFAULT	0
 #define	MLX5E_RL_TX_COMP_FACT_DEFAULT	1
 
-#define	MLX5E_RL_WORKER_LOCK(rlw)	mtx_lock(&(rlw)->mtx)
-#define	MLX5E_RL_WORKER_UNLOCK(rlw)	mtx_unlock(&(rlw)->mtx)
+#define	MLX5E_RL_MAX_RATES(_rl) \
+	container_of(_rl, struct mlx5e_priv, rl)->mdev->priv.rl_table.max_size
+
+#define	MLX5E_RL_HEAD_LOCK(head)	mtx_lock(&(head)->mtx)
+#define	MLX5E_RL_HEAD_UNLOCK(head)	mtx_unlock(&(head)->mtx)
 
-#define	MLX5E_RL_RLOCK(rl) sx_slock(&(rl)->rl_sxlock)
-#define	MLX5E_RL_RUNLOCK(rl) sx_sunlock(&(rl)->rl_sxlock)
+#define	MLX5E_RL_CHANNEL_LOCK(chan)	mtx_lock(&(chan)->mtx)
+#define	MLX5E_RL_CHANNEL_UNLOCK(chan)	mtx_unlock(&(chan)->mtx)
 
-#define	MLX5E_RL_WLOCK(rl) sx_xlock(&(rl)->rl_sxlock)
-#define	MLX5E_RL_WUNLOCK(rl) sx_xunlock(&(rl)->rl_sxlock)
+#define	MLX5E_RL_PRIV_LOCK(rl)		sx_xlock(&(rl)->rl_sxlock)
+#define	MLX5E_RL_PRIV_UNLOCK(rl)	sx_xunlock(&(rl)->rl_sxlock)
 
 #define	MLX5E_RL_PARAMS(m) \
   m(+1, u64, tx_queue_size, "tx_queue_size", "Default send queue size") \
@@ -64,52 +63,25 @@
   m(+1, u64, tx_coalesce_pkts, "tx_coalesce_pkts", "Maximum number of TX packets to join") \
   m(+1, u64, tx_coalesce_mode, "tx_coalesce_mode", "0: EQE mode 1: CQE mode") \
   m(+1, u64, tx_completion_fact, "tx_completion_fact", "1..MAX: Completion event ratio") \
-  m(+1, u64, tx_completion_fact_max, "tx_completion_fact_max", "Maximum completion event ratio") \
-  m(+1, u64, tx_worker_threads_max, "tx_worker_threads_max", "Max number of TX worker threads") \
-  m(+1, u64, tx_worker_threads_def, "tx_worker_threads_def", "Default number of TX worker threads") \
-  m(+1, u64, tx_channels_per_worker_max, "tx_channels_per_worker_max", "Max number of TX channels per worker") \
-  m(+1, u64, tx_channels_per_worker_def, "tx_channels_per_worker_def", "Default number of TX channels per worker") \
-  m(+1, u64, tx_rates_max, "tx_rates_max", "Max number of TX rates") \
-  m(+1, u64, tx_rates_def, "tx_rates_def", "Default number of TX rates") \
-  m(+1, u64, tx_limit_min, "tx_limit_min", "Minimum TX rate in bits/s") \
-  m(+1, u64, tx_limit_max, "tx_limit_max", "Maximum TX rate in bits/s") \
-  m(+1, u64, tx_burst_size, "tx_burst_size", "Current burst size in number of packets. A value of zero means use firmware default.") \
-  m(+1, u64, tx_burst_size_max, "tx_burst_size_max", "Maximum burst size in number of packets") \
-  m(+1, u64, tx_burst_size_min, "tx_burst_size_min", "Minimum burst size in number of packets")
+  m(+1, u64, tx_completion_fact_max, "tx_completion_fact_max", "Maximum completion event ratio")
 
 #define	MLX5E_RL_PARAMS_NUM (0 MLX5E_RL_PARAMS(MLX5E_STATS_COUNT))
 
 #define	MLX5E_RL_STATS(m) \
   m(+1, u64, tx_allocate_resource_failure, "tx_allocate_resource_failure", "Number of times firmware resource allocation failed") \
-  m(+1, u64, tx_add_new_rate_failure, "tx_add_new_rate_failure", "Number of times adding a new firmware rate failed") \
   m(+1, u64, tx_modify_rate_failure, "tx_modify_rate_failure", "Number of times modifying a firmware rate failed") \
   m(+1, u64, tx_active_connections, "tx_active_connections", "Number of active connections") \
   m(+1, u64, tx_open_queues, "tx_open_queues", "Number of open TX queues") \
-  m(+1, u64, tx_available_resource_failure, "tx_available_resource_failure", "Number of times TX resources were not available")
+  m(+1, u64, tx_available_resource_failure, "tx_available_resource_failure", "Number of times TX resources were not available") \
+  m(+1, u64, tx_rate_changed_too_quickly, "tx_rate_changed_too_quickly", "Number of times TX rate was changed too quickly")
 
 #define	MLX5E_RL_STATS_NUM (0 MLX5E_RL_STATS(MLX5E_STATS_COUNT))
 
-#define	MLX5E_RL_TABLE_PARAMS(m) \
-  m(+1, u64, tx_limit_add, "tx_limit_add", "Add TX rate limit in bits/s to empty slot") \
-  m(+1, u64, tx_limit_clr, "tx_limit_clr", "Clear all TX rates in table") \
-  m(+1, u64, tx_allowed_deviation, "tx_allowed_deviation", "Relative rate deviation allowed in 1/1000") \
-  m(+1, u64, tx_allowed_deviation_min, "tx_allowed_deviation_min", "Minimum allowed rate deviation in 1/1000") \
-  m(+1, u64, tx_allowed_deviation_max, "tx_allowed_deviation_max", "Maximum allowed rate deviation in 1/1000")
-
-#define	MLX5E_RL_TABLE_PARAMS_NUM (0 MLX5E_RL_TABLE_PARAMS(MLX5E_STATS_COUNT))
-
 #define	MLX5E_RL_PARAMS_INDEX(n) \
     (__offsetof(struct mlx5e_rl_params, n) / sizeof(uint64_t))
 
 struct mlx5e_priv;
-
-/* Indicates channel's state */
-enum {
-	MLX5E_RL_ST_FREE,
-	MLX5E_RL_ST_USED,
-	MLX5E_RL_ST_MODIFY,
-	MLX5E_RL_ST_DESTROY,
-};
+struct mlx5e_rl_rate;
 
 struct mlx5e_rl_stats {
 	u64 arg [0];
@@ -119,8 +91,6 @@
 struct mlx5e_rl_params {
 	u64 arg [0];
 	MLX5E_RL_PARAMS(MLX5E_STATS_VAR)
-	u64 table_arg [0];
-	MLX5E_RL_TABLE_PARAMS(MLX5E_STATS_VAR)
 };
 
 struct mlx5e_rl_channel_param {
@@ -128,45 +98,69 @@
 	struct mlx5e_cq_param cq;
 };
 
-struct mlx5e_rl_channel {
-	struct m_snd_tag tag;
-	STAILQ_ENTRY(mlx5e_rl_channel) entry;
-	struct mlx5e_sq * volatile sq;
-	struct mlx5e_rl_worker *worker;
-	uint64_t new_rate;
-	uint64_t init_rate;
-	uint64_t last_rate;
-	uint32_t refcount;
-	uint16_t last_burst;
-	uint16_t state;
+struct mlx5e_rl_sq {
+	struct mlx5e_sq sq;
+	TAILQ_ENTRY(mlx5e_rl_sq) entry;
+	uint32_t cq_modify_flags;
+#define	MLX5_CQ_MODIFY_DESTROY -1U
+	uint32_t queue_handle;	/* SQ remap support */
+	struct mlx5e_sq *sq_next;
 };
 
-struct mlx5e_rl_worker {
+struct mlx5e_rl_head {
 	struct mtx mtx;
-	struct cv cv;
-	STAILQ_HEAD(, mlx5e_rl_channel) index_list_head;
-	STAILQ_HEAD(, mlx5e_rl_channel) process_head;
+#define	MLX5E_RL_HEAD_USED 0
+#define	MLX5E_RL_HEAD_WORK 1
+#define	MLX5E_RL_HEAD_FREE 2
+#define	MLX5E_RL_HEAD_MAX 3
+	TAILQ_HEAD(, mlx5e_rl_sq) head[MLX5E_RL_HEAD_MAX];
+	uint32_t count[MLX5E_RL_HEAD_MAX];
+	uint32_t level;
+};
+
+struct mlx5e_rl_channel {
+	struct m_snd_tag tag;
+	struct mtx mtx;
+	struct mlx5e_rl_sq *curr_sq;
+	struct mlx5e_rl_sq *next_sq;
 	struct mlx5e_priv *priv;
-	struct mlx5e_rl_channel *channels;
-	unsigned worker_done;
+	uint16_t curr_rate_index;
+	uint16_t next_rate_index;
+	uint16_t last_rate_index;
+	uint8_t irq_index;
+#define	MLX5E_RL_IRQ_INDEX_MAX 128	/* limit is exclusive, 0..127 */
 };
 
+#define	MLX5E_RL_CHAN_TO_SQ_HEAD(channel, rate_index) \
+	((channel)->priv->rl.sq_rate_head[(rate_index) - 1] + \
+	 (channel)->irq_index)
+
 struct mlx5e_rl_priv_data {
 	struct sx rl_sxlock;
+	struct mtx rl_mtx;
+	struct cv rl_cv;
 	struct sysctl_ctx_list ctx;
 	struct mlx5e_rl_channel_param chan_param;
 	struct mlx5e_rl_params param;
 	struct mlx5e_rl_stats stats;
-	struct mlx5e_rl_worker *workers;
-	struct mlx5e_priv *priv;
+	struct mlx5e_sq_stats sq_stats;
+	struct mlx5e_rl_head *sq_raw_head;
+	struct mlx5e_rl_head **sq_rate_head;
 	uint64_t *rate_limit_table;
-	unsigned opened;
+	unsigned worker_opened;
+	unsigned worker_done;
+	unsigned worker_pending;
 	uint32_t tisn;
+	uint32_t use_multi_sq;
 };
 
-int mlx5e_rl_init(struct mlx5e_priv *priv);
-void mlx5e_rl_cleanup(struct mlx5e_priv *priv);
-void mlx5e_rl_refresh_sq_inline(struct mlx5e_rl_priv_data *rl);
+int mlx5e_rl_init(struct mlx5e_priv *);
+void mlx5e_rl_cleanup(struct mlx5e_priv *);
+void mlx5e_rl_refresh_sq_inline(struct mlx5e_priv *);
+void mlx5e_rl_refresh_channel_params(struct mlx5e_priv *, uint32_t cq_modify_flags);
+void mlx5e_rl_query(struct ifnet *, struct if_ratelimit_query_results *);
+struct mlx5e_sq *mlx5e_rl_get_current_sq(struct mlx5e_rl_channel *);
+void mlx5e_rl_accumulate_sq_stats(struct mlx5e_rl_priv_data *, struct mlx5e_sq_stats *);
 if_snd_tag_alloc_t mlx5e_rl_snd_tag_alloc;
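For orientation, the sketch below (illustrative only, not part of the patch) glues together the pieces declared above: a requested rate in bytes/s is mapped onto the firmware rate table by mlx5_rl_find_rate_index() from driver.h, which selects the first table entry at or above the request, or the highest entry when the request exceeds all of them, and the resulting index plus the channel's IRQ vector then picks one of the per-rate SQ list heads via MLX5E_RL_CHAN_TO_SQ_HEAD(). Locking and error handling are omitted.

/*
 * Illustrative helper (not part of the patch): resolve a rate in
 * bytes/s to the SQ list head serving that rate on this channel.
 */
static struct mlx5e_rl_head *
example_rate_to_sq_head(struct mlx5e_rl_channel *ch, uint64_t rate_bytes_s)
{
	uint16_t index;

	/* first table rate >= request, else the highest configured rate */
	mlx5_rl_find_rate_index(ch->priv->mdev, rate_bytes_s, &index);

	/* index 0 is reserved for unlimited traffic (no pacing table) */
	if (index == 0)
		return (NULL);

	/* one list head per (rate index, IRQ vector) combination */
	return (MLX5E_RL_CHAN_TO_SQ_HEAD(ch, index));
}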
diff --git a/sys/dev/mlx5/mlx5_en/mlx5_en_ethtool.c b/sys/dev/mlx5/mlx5_en/mlx5_en_ethtool.c
--- a/sys/dev/mlx5/mlx5_en/mlx5_en_ethtool.c
+++ b/sys/dev/mlx5/mlx5_en/mlx5_en_ethtool.c
@@ -685,8 +685,10 @@
 
 	/* update inline mode */
 	mlx5e_refresh_sq_inline(priv);
+
 #ifdef RATELIMIT
-	mlx5e_rl_refresh_sq_inline(&priv->rl);
+	/* update inline mode */
+	mlx5e_rl_refresh_sq_inline(priv);
 #endif
 done:
 	PRIV_UNLOCK(priv);
@@ -1076,6 +1078,10 @@
 		/* restart network interface, if any */
 		if (was_opened)
 			mlx5e_open_locked(priv->ifp);
+#ifdef RATELIMIT
+		/* need to re-create all SQ's */
+		mlx5e_rl_refresh_channel_params(priv, MLX5_CQ_MODIFY_DESTROY);
+#endif
 		break;
 
 	case MLX5_PARAM_OFFSET(rx_coalesce_mode):
diff --git a/sys/dev/mlx5/mlx5_en/mlx5_en_iq.c b/sys/dev/mlx5/mlx5_en/mlx5_en_iq.c
--- a/sys/dev/mlx5/mlx5_en/mlx5_en_iq.c
+++ b/sys/dev/mlx5/mlx5_en/mlx5_en_iq.c
@@ -269,11 +269,12 @@
 	/* SQ remap support requires reg_umr privileges level */
 	if (MLX5_CAP_QOS(iq->priv->mdev, qos_remap_pp)) {
 		MLX5_SET(sqc, sqc, qos_remap_en, 1);
-		if (MLX5_CAP_ETH(iq->priv->mdev, reg_umr_sq))
+		if (MLX5_CAP_ETH(iq->priv->mdev, reg_umr_sq)) {
 			MLX5_SET(sqc, sqc, reg_umr, 1);
-		else
-			mlx5_en_err(iq->priv->ifp,
-			    "No reg umr SQ capability, SQ remap disabled\n");
+		} else {
+			err = -EOPNOTSUPP;
+			goto done;
+		}
 	}
 
 	MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_CYCLIC);
@@ -286,7 +287,7 @@
 	    (__be64 *) MLX5_ADDR_OF(wq, wq, pas));
 
 	err = mlx5_core_create_sq(iq->priv->mdev, in, inlen, &iq->sqn);
-
+done:
 	kvfree(in);
 
 	return (err);
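Nothing in this patch changes how applications request pacing in the first place; the kernel still allocates a rate-limit send tag behind the usual socket option. As a reminder of the entry point, a hedged sketch follows: on FreeBSD SO_MAX_PACING_RATE takes the rate in bytes per second, and the driver then maps it onto its hardware rate table as shown above.

/* request hardware pacing on a connected TCP socket (illustrative) */
#include <sys/types.h>
#include <sys/socket.h>
#include <err.h>

static void
request_pacing(int fd)
{
	/* roughly 12.5 Mbytes/s; rounded onto the hardware rate table */
	unsigned int rate = 12500000;

	if (setsockopt(fd, SOL_SOCKET, SO_MAX_PACING_RATE,
	    &rate, sizeof(rate)) != 0)
		warn("SO_MAX_PACING_RATE");
}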
diff --git a/sys/dev/mlx5/mlx5_en/mlx5_en_main.c b/sys/dev/mlx5/mlx5_en/mlx5_en_main.c
--- a/sys/dev/mlx5/mlx5_en/mlx5_en_main.c
+++ b/sys/dev/mlx5/mlx5_en/mlx5_en_main.c
@@ -1,5 +1,5 @@
 /*-
- * Copyright (c) 2015-2021 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2015-2021 Mellanox Technologies.
  * Copyright (c) 2022 NVIDIA corporation & affiliates.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -811,15 +811,10 @@
 {
 	struct mlx5_core_dev *mdev = priv->mdev;
 	struct mlx5e_vport_stats *s = &priv->stats.vport;
-	struct mlx5e_sq_stats *sq_stats;
+	struct mlx5e_sq_stats sq_stats = {};
 	u32 in[MLX5_ST_SZ_DW(query_vport_counter_in)];
 	u32 *out;
 	int outlen = MLX5_ST_SZ_BYTES(query_vport_counter_out);
-	u64 tso_packets = 0;
-	u64 tso_bytes = 0;
-	u64 tx_queue_dropped = 0;
-	u64 tx_defragged = 0;
-	u64 tx_offload_none = 0;
 	u64 lro_packets = 0;
 	u64 lro_bytes = 0;
 	u64 sw_lro_queued = 0;
@@ -859,47 +854,19 @@
 		rx_decrypted_error += rq_stats->decrypted_error_packets;
 		rx_decrypted_ok += rq_stats->decrypted_ok_packets;
 
-		for (j = 0; j < priv->num_tc; j++) {
-			sq_stats = &pch->sq[j].stats;
-
-			tso_packets += sq_stats->tso_packets;
-			tso_bytes += sq_stats->tso_bytes;
-			tx_queue_dropped += sq_stats->dropped;
-			tx_queue_dropped += sq_stats->enobuf;
-			tx_defragged += sq_stats->defragged;
-			tx_offload_none += sq_stats->csum_offload_none;
-		}
+		for (j = 0; j < priv->num_tc; j++)
+			mlx5e_accumulate_sq_stats(&pch->sq[j].stats, &sq_stats);
 	}
 #ifdef RATELIMIT
-	/* Collect statistics from all rate-limit queues */
-	for (j = 0; j < priv->rl.param.tx_worker_threads_def; j++) {
-		struct mlx5e_rl_worker *rlw = priv->rl.workers + j;
-
-		for (i = 0; i < priv->rl.param.tx_channels_per_worker_def; i++) {
-			struct mlx5e_rl_channel *channel = rlw->channels + i;
-			struct mlx5e_sq *sq = channel->sq;
-
-			if (sq == NULL)
-				continue;
-
-			sq_stats = &sq->stats;
-
-			tso_packets += sq_stats->tso_packets;
-			tso_bytes += sq_stats->tso_bytes;
-			tx_queue_dropped += sq_stats->dropped;
-			tx_queue_dropped += sq_stats->enobuf;
-			tx_defragged += sq_stats->defragged;
-			tx_offload_none += sq_stats->csum_offload_none;
-		}
-	}
+	mlx5e_rl_accumulate_sq_stats(&priv->rl, &sq_stats);
 #endif
 
 	/* update counters */
-	s->tso_packets = tso_packets;
-	s->tso_bytes = tso_bytes;
-	s->tx_queue_dropped = tx_queue_dropped;
-	s->tx_defragged = tx_defragged;
+	s->tso_packets = sq_stats.tso_packets;
+	s->tso_bytes = sq_stats.tso_bytes;
+	s->tx_queue_dropped = sq_stats.dropped + sq_stats.enobuf;
+	s->tx_defragged = sq_stats.defragged;
 	s->lro_packets = lro_packets;
 	s->lro_bytes = lro_bytes;
 	s->sw_lro_queued = sw_lro_queued;
@@ -977,7 +944,7 @@
 	    s->tx_broadcast_bytes;
 
 	/* Update calculated offload counters */
-	s->tx_csum_offload = s->tx_packets - tx_offload_none;
+	s->tx_csum_offload = s->tx_packets - sq_stats.csum_offload_none;
 	s->rx_csum_good = s->rx_packets - s->rx_csum_none;
 }
 
@@ -4396,50 +4363,6 @@
 	}
 }
 
-#ifdef RATELIMIT
-#define	NUM_HDWR_RATES_MLX 13
-static const uint64_t adapter_rates_mlx[NUM_HDWR_RATES_MLX] = {
-	135375,		/* 1,083,000 */
-	180500,		/* 1,444,000 */
-	270750,		/* 2,166,000 */
-	361000,		/* 2,888,000 */
-	541500,		/* 4,332,000 */
-	721875,		/* 5,775,000 */
-	1082875,	/* 8,663,000 */
-	1443875,	/* 11,551,000 */
-	2165750,	/* 17,326,000 */
-	2887750,	/* 23,102,000 */
-	4331625,	/* 34,653,000 */
-	5775500,	/* 46,204,000 */
-	8663125		/* 69,305,000 */
-};
-
-static void
-mlx5e_ratelimit_query(if_t ifp __unused, struct if_ratelimit_query_results *q)
-{
-	/*
-	 * This function needs updating by the driver maintainer!
-	 * For the MLX card there are currently (ConectX-4?) 13
-	 * pre-set rates and others i.e. ConnectX-5, 6, 7??
-	 *
-	 * This will change based on later adapters
-	 * and this code should be updated to look at ifp
-	 * and figure out the specific adapter type
-	 * settings i.e. how many rates as well
-	 * as if they are fixed (as is shown here) or
-	 * if they are dynamic (example chelsio t4). Also if there
-	 * is a maximum number of flows that the adapter
-	 * can handle that too needs to be updated in
-	 * the max_flows field.
-	 */
-	q->rate_table = adapter_rates_mlx;
-	q->flags = RT_IS_FIXED_TABLE;
-	q->max_flows = 0;	/* mlx has no limit */
-	q->number_of_rates = NUM_HDWR_RATES_MLX;
-	q->min_segment_burst = 1;
-}
-#endif
-
 static void
 mlx5e_ifm_add(struct mlx5e_priv *priv, int type)
 {
@@ -4528,7 +4451,7 @@
 	    IFCAP2_BIT(IFCAP2_RXTLS6), 0);
 	if_setsndtagallocfn(ifp, mlx5e_snd_tag_alloc);
 #ifdef RATELIMIT
-	if_setratelimitqueryfn(ifp, mlx5e_ratelimit_query);
+	if_setratelimitqueryfn(ifp, mlx5e_rl_query);
 #endif
 	/* set TSO limits so that we don't have to drop TX packets */
 	if_sethwtsomax(ifp, MLX5E_MAX_TX_PAYLOAD_SIZE - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN));
diff --git a/sys/dev/mlx5/mlx5_en/mlx5_en_rl.c b/sys/dev/mlx5/mlx5_en/mlx5_en_rl.c
--- a/sys/dev/mlx5/mlx5_en/mlx5_en_rl.c
+++ b/sys/dev/mlx5/mlx5_en/mlx5_en_rl.c
@@ -1,5 +1,6 @@
 /*-
- * Copyright (c) 2016-2020 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2016-2020 Mellanox Technologies.
+ * Copyright (c) 2022 NVIDIA corporation & affiliates.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -32,15 +33,14 @@
 
 #ifdef RATELIMIT
 
-static int mlx5e_rl_open_workers(struct mlx5e_priv *);
-static void mlx5e_rl_close_workers(struct mlx5e_priv *);
-static int mlx5e_rl_sysctl_show_rate_table(SYSCTL_HANDLER_ARGS);
+static void mlx5e_rl_open_worker(struct mlx5e_priv *);
+static void mlx5e_rl_close_worker(struct mlx5e_priv *);
 static void mlx5e_rl_sysctl_add_u64_oid(struct mlx5e_rl_priv_data *, unsigned x,
     struct sysctl_oid *, const char *name, const char *desc);
 static void mlx5e_rl_sysctl_add_stats_u64_oid(struct mlx5e_rl_priv_data *rl, unsigned x,
     struct sysctl_oid *node, const char *name, const char *desc);
-static int mlx5e_rl_tx_limit_add(struct mlx5e_rl_priv_data *, uint64_t value);
-static int mlx5e_rl_tx_limit_clr(struct mlx5e_rl_priv_data *, uint64_t value);
+static void mlx5e_rl_worker_wakeup(struct mlx5e_rl_priv_data *);
+static int mlx5e_rl_modify_sq(struct mlx5e_sq *, uint16_t rl_index);
 static if_snd_tag_modify_t mlx5e_rl_snd_tag_modify;
 static if_snd_tag_query_t mlx5e_rl_snd_tag_query;
 static if_snd_tag_free_t mlx5e_rl_snd_tag_free;
@@ -53,38 +53,38 @@
 };
 
 static void
-mlx5e_rl_build_sq_param(struct mlx5e_rl_priv_data *rl,
+mlx5e_rl_build_sq_param(struct mlx5e_priv *priv,
     struct mlx5e_sq_param *param)
 {
 	void *sqc = param->sqc;
 	void *wq = MLX5_ADDR_OF(sqc, sqc, wq);
-	uint8_t log_sq_size = order_base_2(rl->param.tx_queue_size);
+	uint8_t log_sq_size = order_base_2(priv->rl.param.tx_queue_size);
 
 	MLX5_SET(wq, wq, log_wq_sz, log_sq_size);
 	MLX5_SET(wq, wq, log_wq_stride, ilog2(MLX5_SEND_WQE_BB));
-	MLX5_SET(wq, wq, pd, rl->priv->pdn);
+	MLX5_SET(wq, wq, pd, priv->pdn);
 
 	param->wq.linear = 1;
 }
 
 static void
-mlx5e_rl_build_cq_param(struct mlx5e_rl_priv_data *rl,
+mlx5e_rl_build_cq_param(struct mlx5e_priv *priv,
     struct mlx5e_cq_param *param)
 {
 	void *cqc = param->cqc;
-	uint8_t log_sq_size = order_base_2(rl->param.tx_queue_size);
+	uint8_t log_sq_size = order_base_2(priv->rl.param.tx_queue_size);
 
 	MLX5_SET(cqc, cqc, log_cq_size, log_sq_size);
-	MLX5_SET(cqc, cqc, cq_period, rl->param.tx_coalesce_usecs);
-	MLX5_SET(cqc, cqc, cq_max_count, rl->param.tx_coalesce_pkts);
-	MLX5_SET(cqc, cqc, uar_page, rl->priv->mdev->priv.uar->index);
+	MLX5_SET(cqc, cqc, cq_period, priv->rl.param.tx_coalesce_usecs);
+	MLX5_SET(cqc, cqc, cq_max_count, priv->rl.param.tx_coalesce_pkts);
+	MLX5_SET(cqc, cqc, uar_page, priv->mdev->priv.uar->index);
 
-	switch (rl->param.tx_coalesce_mode) {
+	switch (priv->rl.param.tx_coalesce_mode) {
 	case 0:
 		MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE);
 		break;
 	default:
-		if (MLX5_CAP_GEN(rl->priv->mdev, cq_period_start_from_cqe))
+		if (MLX5_CAP_GEN(priv->mdev, cq_period_start_from_cqe))
 			MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_CQE);
 		else
 			MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE);
@@ -93,13 +93,13 @@
 }
 
 static void
-mlx5e_rl_build_channel_param(struct mlx5e_rl_priv_data *rl,
+mlx5e_rl_build_channel_param(struct mlx5e_priv *priv,
     struct mlx5e_rl_channel_param *cparam)
 {
 	memset(cparam, 0, sizeof(*cparam));
 
-	mlx5e_rl_build_sq_param(rl, &cparam->sq);
-	mlx5e_rl_build_cq_param(rl, &cparam->cq);
+	mlx5e_rl_build_sq_param(priv, &cparam->sq);
+	mlx5e_rl_build_cq_param(priv, &cparam->cq);
 }
 
 static int
@@ -157,111 +157,132 @@
 static void
 mlx5e_rl_destroy_sq(struct mlx5e_sq *sq)
 {
-	mlx5e_free_sq_db(sq);
 	mlx5_wq_destroy(&sq->wq_ctrl);
 	bus_dma_tag_destroy(sq->dma_tag);
 }
 
 static int
-mlx5e_rl_query_sq(struct mlx5e_sq *sq)
+mlx5e_rl_query_sq(struct mlx5e_rl_sq *sq)
 {
 	void *out;
-	int inlen;
-	int err;
-
-	inlen = MLX5_ST_SZ_BYTES(query_sq_out);
-	out = mlx5_vzalloc(inlen);
-	if (!out)
-		return -ENOMEM;
+	int iolen;
+	int err;
 
-	err = mlx5_core_query_sq(sq->priv->mdev, sq->sqn, out);
-	if (err)
-		goto out;
+	iolen = MLX5_ST_SZ_BYTES(query_sq_out);
+	out = mlx5_vzalloc(iolen);
+	if (!out)
+		return (-ENOMEM);
 
-	sq->queue_handle = MLX5_GET(query_sq_out, out, sq_context.queue_handle);
+	err = mlx5_core_query_sq(sq->sq.priv->mdev, sq->sq.sqn, out);
+	if (err)
+		goto out;
 
+	sq->queue_handle = MLX5_GET(query_sq_out, out, sq_context.queue_handle);
 out:
-	kvfree(out);
-	return err;
+	kvfree(out);
+	return (err);
 }
 
 static int
-mlx5e_rl_open_sq(struct mlx5e_priv *priv, struct mlx5e_sq *sq,
-    struct mlx5e_sq_param *param, int ix)
+mlx5e_rl_open_sq(struct mlx5e_priv *priv, struct mlx5e_rl_sq *sq,
+    struct mlx5e_sq_param *param, int ix, int rl_index)
 {
 	int err;
 
-	err = mlx5e_rl_create_sq(priv, sq, param, ix);
+	err = mlx5e_rl_create_sq(priv, &sq->sq, param, ix);
 	if (err)
 		return (err);
 
-	err = mlx5e_enable_sq(sq, param, &priv->channel[ix].bfreg, priv->rl.tisn);
+	err = mlx5e_enable_sq(&sq->sq, param, &priv->channel[ix].bfreg, priv->rl.tisn);
 	if (err)
 		goto err_destroy_sq;
 
-	err = mlx5e_modify_sq(sq, MLX5_SQC_STATE_RST, MLX5_SQC_STATE_RDY);
+	err = mlx5e_modify_sq(&sq->sq, MLX5_SQC_STATE_RST, MLX5_SQC_STATE_RDY);
 	if (err)
 		goto err_disable_sq;
 
-	if (MLX5_CAP_QOS(priv->mdev, qos_remap_pp)) {
+	if (priv->rl.use_multi_sq == 0) {
 		err = mlx5e_rl_query_sq(sq);
-		if (err) {
-			mlx5_en_err(priv->ifp, "Failed retrieving send queue handle for"
-			    "SQ remap - sqn=%u, err=(%d)\n", sq->sqn, err);
-			sq->queue_handle = MLX5_INVALID_QUEUE_HANDLE;
-		}
-	} else
+		if (err)
+			goto err_query_sq;
+	} else {
 		sq->queue_handle = MLX5_INVALID_QUEUE_HANDLE;
+	}
+
+	err = mlx5e_rl_modify_sq(&sq->sq, rl_index);
+	if (err)
+		goto err_query_sq;
 
-	WRITE_ONCE(sq->running, 1);
+	WRITE_ONCE(sq->sq.running, 1);
 
 	return (0);
 
+err_query_sq:
+	mlx5e_modify_sq(&sq->sq, MLX5_SQC_STATE_RDY, MLX5_SQC_STATE_ERR);
 err_disable_sq:
-	mlx5e_disable_sq(sq);
+	mlx5e_disable_sq(&sq->sq);
 err_destroy_sq:
-	mlx5e_rl_destroy_sq(sq);
+	mlx5e_rl_destroy_sq(&sq->sq);
 
 	return (err);
 }
 
 static void
-mlx5e_rl_chan_mtx_init(struct mlx5e_priv *priv, struct mlx5e_sq *sq)
+mlx5e_rl_tx_cq_multi_comp(struct mlx5_core_cq *mcq, struct mlx5_eqe *eqe __unused)
 {
-	mtx_init(&sq->lock, "mlx5tx-rl", NULL, MTX_DEF);
-	mtx_init(&sq->comp_lock, "mlx5comp-rl", NULL, MTX_DEF);
+	struct mlx5e_rl_sq *rl_sq;
+	struct mlx5e_sq *sq_next;
 
-	callout_init_mtx(&sq->cev_callout, &sq->lock, 0);
+	mlx5e_tx_cq_comp(mcq, eqe);
 
-	sq->cev_factor = priv->rl.param.tx_completion_fact;
+	rl_sq = container_of(mcq, struct mlx5e_rl_sq, sq.cq.mcq);
 
-	/* ensure the TX completion event factor is not zero */
-	if (sq->cev_factor == 0)
-		sq->cev_factor = 1;
+	mtx_lock(&rl_sq->sq.comp_lock);
+	if (mlx5e_sq_is_empty(&rl_sq->sq)) {
+		sq_next = rl_sq->sq_next;
+		rl_sq->sq_next = NULL;
+	} else {
+		sq_next = NULL;
+	}
+	mtx_unlock(&rl_sq->sq.comp_lock);
+
+	if (unlikely(sq_next != NULL)) {
+		mtx_lock(&sq_next->lock);
+		mlx5e_tx_notify_hw(sq_next, true);
+		mtx_unlock(&sq_next->lock);
+	}
 }
 
 static int
-mlx5e_rl_open_channel(struct mlx5e_rl_worker *rlw, int eq_ix,
+mlx5e_rl_alloc_sq(struct mlx5e_priv *priv, int eq_ix, int rl_index,
     struct mlx5e_rl_channel_param *cparam,
-    struct mlx5e_sq *volatile *ppsq)
+    struct mlx5e_rl_sq * volatile *ppsq)
 {
-	struct mlx5e_priv *priv = rlw->priv;
-	struct mlx5e_sq *sq;
+	struct mlx5e_rl_sq *sq;
 	int err;
 
 	sq = malloc(sizeof(*sq), M_MLX5EN, M_WAITOK | M_ZERO);
 
 	/* init mutexes */
-	mlx5e_rl_chan_mtx_init(priv, sq);
+	mtx_init(&sq->sq.lock, "mlx5e-rl-tx", NULL, MTX_DEF);
+	mtx_init(&sq->sq.comp_lock, "mlx5e-rl-comp", NULL, MTX_DEF);
+
+	callout_init_mtx(&sq->sq.cev_callout, &sq->sq.lock, 0);
+
+	sq->sq.cev_factor = priv->rl.param.tx_completion_fact;
+
+	/* ensure the TX completion event factor is not zero */
+	if (sq->sq.cev_factor == 0)
+		sq->sq.cev_factor = 1;
 
 	/* open TX completion queue */
-	err = mlx5e_open_cq(priv, &cparam->cq, &sq->cq,
-	    &mlx5e_tx_cq_comp, eq_ix);
+	err = mlx5e_open_cq(priv, &cparam->cq, &sq->sq.cq,
+	    priv->rl.use_multi_sq ? &mlx5e_rl_tx_cq_multi_comp : &mlx5e_tx_cq_comp, eq_ix);
 	if (err)
 		goto err_free;
 
-	err = mlx5e_rl_open_sq(priv, sq, &cparam->sq, eq_ix);
+	err = mlx5e_rl_open_sq(priv, sq, &cparam->sq, eq_ix, rl_index);
 	if (err)
 		goto err_close_tx_cq;
 
@@ -269,44 +290,51 @@
 	*ppsq = sq;
 
 	/* poll TX queue initially */
-	sq->cq.mcq.comp(&sq->cq.mcq, NULL);
+	sq->sq.cq.mcq.comp(&sq->sq.cq.mcq, NULL);
+
+	atomic_add_64(&priv->rl.stats.tx_open_queues, 1ULL);
 
 	return (0);
 
 err_close_tx_cq:
-	mlx5e_close_cq(&sq->cq);
+	mlx5e_close_cq(&sq->sq.cq);
 
 err_free:
 	/* destroy mutexes */
-	mtx_destroy(&sq->lock);
-	mtx_destroy(&sq->comp_lock);
+	mtx_destroy(&sq->sq.lock);
+	mtx_destroy(&sq->sq.comp_lock);
 	free(sq, M_MLX5EN);
 	atomic_add_64(&priv->rl.stats.tx_allocate_resource_failure, 1ULL);
 	return (err);
 }
 
 static void
-mlx5e_rl_close_channel(struct mlx5e_sq *volatile *ppsq)
+mlx5e_rl_reset_sq_stats(struct mlx5e_rl_sq *sq)
 {
-	struct mlx5e_sq *sq = *ppsq;
+	/* store statistics for future */
+	mlx5e_accumulate_sq_stats(&sq->sq.stats, &sq->sq.priv->rl.sq_stats);
 
-	/* check if channel is already closed */
-	if (sq == NULL)
-		return;
-	/* ensure channel pointer is no longer used */
-	*ppsq = NULL;
+	/* clear statistics */
+	memset(&sq->sq.stats, 0, sizeof(sq->sq.stats));
+}
 
+static void
+mlx5e_rl_free_sq(struct mlx5e_rl_sq *sq)
+{
 	/* teardown and destroy SQ */
-	mlx5e_drain_sq(sq);
-	mlx5e_disable_sq(sq);
-	mlx5e_rl_destroy_sq(sq);
+	mlx5e_drain_sq(&sq->sq);
+	mlx5e_disable_sq(&sq->sq);
+	mlx5e_rl_destroy_sq(&sq->sq);
 
 	/* close CQ */
-	mlx5e_close_cq(&sq->cq);
+	mlx5e_close_cq(&sq->sq.cq);
 
 	/* destroy mutexes */
-	mtx_destroy(&sq->lock);
-	mtx_destroy(&sq->comp_lock);
+	mtx_destroy(&sq->sq.lock);
+	mtx_destroy(&sq->sq.comp_lock);
+
+	atomic_add_64(&sq->sq.priv->rl.stats.tx_open_queues, -1ULL);
+
+	mlx5e_rl_reset_sq_stats(sq);
 
 	free(sq, M_MLX5EN);
 }
@@ -378,57 +406,23 @@
 	return (err);
 }
 
-/*
- * This function will search the configured rate limit table for the
- * best match to avoid that a single socket based application can
- * allocate all the available hardware rates. If the user selected
- * rate deviates too much from the closes rate available in the rate
- * limit table, unlimited rate will be selected.
- */ -static uint64_t -mlx5e_rl_find_best_rate_locked(struct mlx5e_rl_priv_data *rl, uint64_t user_rate) -{ - uint64_t distance = -1ULL; - uint64_t diff; - uint64_t retval = 0; /* unlimited */ - uint64_t x; - - /* search for closest rate */ - for (x = 0; x != rl->param.tx_rates_def; x++) { - uint64_t rate = rl->rate_limit_table[x]; - if (rate == 0) - continue; - - if (rate > user_rate) - diff = rate - user_rate; - else - diff = user_rate - rate; - - /* check if distance is smaller than previous rate */ - if (diff < distance) { - distance = diff; - retval = rate; - } - } - - /* range check for multiplication below */ - if (user_rate > rl->param.tx_limit_max) - user_rate = rl->param.tx_limit_max; +#define MLX5E_RL_POST_SQ_REMAP_DS_CNT \ + DIV_ROUND_UP(sizeof(struct mlx5e_tx_qos_remap_wqe), MLX5_SEND_WQE_DS) +#define MLX5E_RL_POST_SQ_REMAP_WQEBB_CNT \ + DIV_ROUND_UP(MLX5E_RL_POST_SQ_REMAP_DS_CNT, MLX5_SEND_WQEBB_NUM_DS) - /* fallback to unlimited, if rate deviates too much */ - if (distance > howmany(user_rate * - rl->param.tx_allowed_deviation, 1000ULL)) - retval = 0; +CTASSERT(MLX5E_RL_POST_SQ_REMAP_WQEBB_CNT == 1); - return (retval); +static void +mlx5e_rl_post_sq_remap_wqe_callback(void *arg) +{ + m_snd_tag_rele(arg); } static int -mlx5e_rl_post_sq_remap_wqe(struct mlx5e_iq *iq, u32 scq_handle, u32 sq_handle, - struct mlx5e_rl_channel *sq_channel) +mlx5e_rl_post_sq_remap_wqe(struct mlx5e_iq *iq, struct mlx5e_rl_channel *channel, + u32 scq_handle, u32 sq_handle) { - const u32 ds_cnt = DIV_ROUND_UP(sizeof(struct mlx5e_tx_qos_remap_wqe), - MLX5_SEND_WQE_DS); struct mlx5e_tx_qos_remap_wqe *wqe; int pi; @@ -447,16 +441,19 @@ wqe->ctrl.opmod_idx_opcode = cpu_to_be32((iq->pc << 8) | MLX5_OPCODE_QOS_REMAP); - wqe->ctrl.qpn_ds = cpu_to_be32((iq->sqn << 8) | ds_cnt); + wqe->ctrl.qpn_ds = cpu_to_be32((iq->sqn << 8) | + MLX5E_RL_POST_SQ_REMAP_DS_CNT); wqe->ctrl.imm = cpu_to_be32(iq->priv->tisn[0] << 8); wqe->ctrl.fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE | MLX5_FENCE_MODE_INITIATOR_SMALL; /* copy data for doorbell */ memcpy(iq->doorbell.d32, &wqe->ctrl, sizeof(iq->doorbell.d32)); - iq->data[pi].num_wqebbs = DIV_ROUND_UP(ds_cnt, MLX5_SEND_WQEBB_NUM_DS); - iq->data[pi].p_refcount = &sq_channel->refcount; - atomic_add_int(iq->data[pi].p_refcount, 1); + iq->data[pi].num_wqebbs = MLX5E_RL_POST_SQ_REMAP_WQEBB_CNT; + iq->data[pi].callback = &mlx5e_rl_post_sq_remap_wqe_callback; + iq->data[pi].arg = m_snd_tag_ref(&channel->tag); + iq->data[pi].p_refcount = NULL; + iq->pc += iq->data[pi].num_wqebbs; mlx5e_iq_notify_hw(iq); @@ -467,139 +464,125 @@ } static int -mlx5e_rl_remap_sq(struct mlx5e_sq *sq, uint16_t index, - struct mlx5e_rl_channel *sq_channel) +mlx5e_rl_remap_sq(struct mlx5e_rl_sq *sq, struct mlx5e_rl_channel *channel, uint16_t index) { struct mlx5e_channel *iq_channel; + struct mlx5e_priv *priv; u32 scq_handle; u32 sq_handle; int error; + priv = sq->sq.priv; + /* Specific SQ remap operations should be handled by same IQ */ - iq_channel = &sq->priv->channel[sq->sqn % sq->priv->params.num_channels]; + iq_channel = &priv->channel[channel->irq_index % priv->params.num_channels]; sq_handle = sq->queue_handle; - scq_handle = mlx5_rl_get_scq_handle(sq->priv->mdev, index); + scq_handle = mlx5_rl_get_scq_handle(priv->mdev, index); - if (sq_handle == MLX5_INVALID_QUEUE_HANDLE || - scq_handle == MLX5_INVALID_QUEUE_HANDLE) + if (unlikely(sq_handle == MLX5_INVALID_QUEUE_HANDLE || + scq_handle == MLX5_INVALID_QUEUE_HANDLE)) { error = -1; - else - error = mlx5e_rl_post_sq_remap_wqe(&iq_channel->iq, scq_handle, - 
sq_handle, sq_channel); - + } else { + error = mlx5e_rl_post_sq_remap_wqe(&iq_channel->iq, + channel, scq_handle, sq_handle); + } return (error); } -/* - * This function sets the requested rate for a rate limit channel, in - * bits per second. The requested rate will be filtered through the - * find best rate function above. - */ -static int -mlx5e_rlw_channel_set_rate_locked(struct mlx5e_rl_worker *rlw, - struct mlx5e_rl_channel *channel, uint64_t rate) +static struct mlx5e_rl_sq * +mlx5e_rl_dequeue_sq(struct mlx5e_rl_head *phead, unsigned which) { - struct mlx5e_rl_priv_data *rl = &rlw->priv->rl; - struct mlx5e_sq *sq; - uint64_t temp; - uint16_t index; - uint16_t burst; - int error; - bool use_sq_remap; - - if (rate != 0) { - MLX5E_RL_WORKER_UNLOCK(rlw); - - MLX5E_RL_RLOCK(rl); - - /* get current burst size in bytes */ - temp = rl->param.tx_burst_size * - MLX5E_SW2HW_MTU(if_getmtu(rlw->priv->ifp)); - - /* limit burst size to 64K currently */ - if (temp > 65535) - temp = 65535; - burst = temp; - - /* find best rate */ - rate = mlx5e_rl_find_best_rate_locked(rl, rate); + struct mlx5e_rl_sq *sq; + + MLX5E_RL_HEAD_LOCK(phead); + sq = TAILQ_FIRST(&phead->head[which]); + if (likely(sq != NULL)) { + TAILQ_REMOVE(&phead->head[which], sq, entry); + MPASS(phead->count[which] != 0); + phead->count[which]--; + } + MLX5E_RL_HEAD_UNLOCK(phead); + return (sq); +} - MLX5E_RL_RUNLOCK(rl); +static void +mlx5e_rl_enqueue_sq(struct mlx5e_rl_head *phead, unsigned which, struct mlx5e_rl_sq *sq) +{ + MLX5E_RL_HEAD_LOCK(phead); + TAILQ_INSERT_TAIL(&phead->head[which], sq, entry); + phead->count[which]++; + MPASS(phead->count[which] != 0); + MLX5E_RL_HEAD_UNLOCK(phead); +} - if (rate == 0) { - /* rate doesn't exist, fallback to unlimited */ - index = 0; - rate = 0; - atomic_add_64(&rlw->priv->rl.stats.tx_modify_rate_failure, 1ULL); - } else { - /* get a reference on the new rate */ - error = -mlx5_rl_add_rate(rlw->priv->mdev, - howmany(rate, 1000), burst, &index); - - if (error != 0) { - /* adding rate failed, fallback to unlimited */ - index = 0; - rate = 0; - atomic_add_64(&rlw->priv->rl.stats.tx_add_new_rate_failure, 1ULL); - } - } - MLX5E_RL_WORKER_LOCK(rlw); - } else { - index = 0; - burst = 0; /* default */ - } +static struct mlx5e_rl_sq * +mlx5e_rl_requeue_sq(struct mlx5e_rl_head *phead, unsigned from, unsigned to, + struct mlx5e_rl_priv_data *rl, struct mlx5e_rl_sq *sq) +{ + bool level; - /* paced <--> non-paced transitions must go via FW */ - use_sq_remap = MLX5_CAP_QOS(rlw->priv->mdev, qos_remap_pp) && - channel->last_rate != 0 && rate != 0; - - /* atomically swap rates */ - temp = channel->last_rate; - channel->last_rate = rate; - rate = temp; - - /* atomically swap burst size */ - temp = channel->last_burst; - channel->last_burst = burst; - burst = temp; - - MLX5E_RL_WORKER_UNLOCK(rlw); - /* put reference on the old rate, if any */ - if (rate != 0) { - mlx5_rl_remove_rate(rlw->priv->mdev, - howmany(rate, 1000), burst); + MLX5E_RL_HEAD_LOCK(phead); + if (likely(sq == NULL)) { + sq = TAILQ_FIRST(&phead->head[from]); + if (unlikely(sq == NULL)) + goto done; } + TAILQ_REMOVE(&phead->head[from], sq, entry); + MPASS(phead->count[from] != 0); + phead->count[from]--; + TAILQ_INSERT_TAIL(&phead->head[to], sq, entry); + phead->count[to]++; + MPASS(phead->count[to] != 0); +done: + if (likely(from == MLX5E_RL_HEAD_FREE)) + level = (phead->count[from] < phead->level); + else + level = (phead->count[to] >= phead->level); + MLX5E_RL_HEAD_UNLOCK(phead); - /* set new rate, if SQ is running */ - sq = channel->sq; - 
+static struct mlx5e_rl_sq *
+mlx5e_rl_requeue_sq(struct mlx5e_rl_head *phead, unsigned from, unsigned to,
+    struct mlx5e_rl_priv_data *rl, struct mlx5e_rl_sq *sq)
+{
+	bool level;
 
-	/* paced <--> non-paced transitions must go via FW */
-	use_sq_remap = MLX5_CAP_QOS(rlw->priv->mdev, qos_remap_pp) &&
-	    channel->last_rate != 0 && rate != 0;
-
-	/* atomically swap rates */
-	temp = channel->last_rate;
-	channel->last_rate = rate;
-	rate = temp;
-
-	/* atomically swap burst size */
-	temp = channel->last_burst;
-	channel->last_burst = burst;
-	burst = temp;
-
-	MLX5E_RL_WORKER_UNLOCK(rlw);
-	/* put reference on the old rate, if any */
-	if (rate != 0) {
-		mlx5_rl_remove_rate(rlw->priv->mdev,
-		    howmany(rate, 1000), burst);
+	MLX5E_RL_HEAD_LOCK(phead);
+	if (likely(sq == NULL)) {
+		sq = TAILQ_FIRST(&phead->head[from]);
+		if (unlikely(sq == NULL))
+			goto done;
 	}
+	TAILQ_REMOVE(&phead->head[from], sq, entry);
+	MPASS(phead->count[from] != 0);
+	phead->count[from]--;
+	TAILQ_INSERT_TAIL(&phead->head[to], sq, entry);
+	phead->count[to]++;
+	MPASS(phead->count[to] != 0);
+done:
+	if (likely(from == MLX5E_RL_HEAD_FREE))
+		level = (phead->count[from] < phead->level);
+	else
+		level = (phead->count[to] >= phead->level);
+	MLX5E_RL_HEAD_UNLOCK(phead);
 
-	/* set new rate, if SQ is running */
-	sq = channel->sq;
-	if (sq != NULL && READ_ONCE(sq->running) != 0) {
-		if (!use_sq_remap || mlx5e_rl_remap_sq(sq, index, channel)) {
-			while (atomic_load_int(&channel->refcount) != 0 &&
-			    rlw->priv->mdev->state != MLX5_DEVICE_STATE_INTERNAL_ERROR &&
-			    pci_channel_offline(rlw->priv->mdev->pdev) == 0)
-				pause("W", 1);
-			error = mlx5e_rl_modify_sq(sq, index);
-			if (error != 0)
-				atomic_add_64(&rlw->priv->rl.stats.tx_modify_rate_failure, 1ULL);
-		}
-	} else
-		error = 0;
+	if (level)
+		mlx5e_rl_worker_wakeup(rl);
+	return (sq);
+}
 
-	MLX5E_RL_WORKER_LOCK(rlw);
+static void
+mlx5e_rl_worker_wait(struct mlx5e_rl_priv_data *rl)
+{
+	mtx_lock(&rl->rl_mtx);
+	while (rl->worker_pending == 0 &&
+	    rl->worker_done == 0)
+		cv_wait(&rl->rl_cv, &rl->rl_mtx);
+	rl->worker_pending = 0;
+	mtx_unlock(&rl->rl_mtx);
+}
 
-	return (-error);
+static void
+mlx5e_rl_worker_wakeup(struct mlx5e_rl_priv_data *rl)
+{
+	mtx_lock(&rl->rl_mtx);
+	if (rl->worker_pending == 0) {
+		rl->worker_pending = 1;
+		cv_signal(&rl->rl_cv);
+	}
+	mtx_unlock(&rl->rl_mtx);
 }
 
 static void
 mlx5e_rl_worker(void *arg)
 {
-	struct thread *td;
-	struct mlx5e_rl_worker *rlw = arg;
-	struct mlx5e_rl_channel *channel;
+	struct mlx5e_rl_priv_data *rl;
+	struct mlx5e_rl_sq *sq;
+	struct mlx5e_rl_head *phead;
 	struct mlx5e_priv *priv;
-	unsigned ix;
-	uint64_t x;
+	struct thread *td;
+	uint32_t cq_modify_supported_mask;
+	uint32_t total_used;
+	uint32_t min_free;
+	uint32_t channels;
+	uint32_t max;
+	uint32_t i;
+	uint32_t j;
 	int error;
 
 	/* set thread priority */
@@ -609,135 +592,104 @@
 	sched_prio(td, PI_SWI(SWI_NET));
 	thread_unlock(td);
 
-	priv = rlw->priv;
+	rl = arg;
+	priv = container_of(rl, struct mlx5e_priv, rl);
 
-	/* compute completion vector */
-	ix = (rlw - priv->rl.workers) %
-	    priv->mdev->priv.eq_table.num_comp_vectors;
+	cq_modify_supported_mask = MLX5_CQ_MODIFY_PERIOD | MLX5_CQ_MODIFY_COUNT;
 
-	/* TODO bind to CPU */
+	if (MLX5_CAP_GEN(priv->mdev, cq_period_mode_modify))
+		cq_modify_supported_mask |= MLX5_CQ_MODIFY_PERIOD_MODE;
 
-	/* open all the SQs */
-	MLX5E_RL_WORKER_LOCK(rlw);
-	for (x = 0; x < priv->rl.param.tx_channels_per_worker_def; x++) {
-		struct mlx5e_rl_channel *channel = rlw->channels + x;
+	if (MLX5_CAP_GEN(priv->mdev, cq_eq_remap))
+		cq_modify_supported_mask |= MLX5_CQ_MODIFY_EQN;
 
-#if !defined(HAVE_RL_PRE_ALLOCATE_CHANNELS)
-		if (channel->state == MLX5E_RL_ST_FREE)
-			continue;
-#endif
-		MLX5E_RL_WORKER_UNLOCK(rlw);
-
-		MLX5E_RL_RLOCK(&priv->rl);
-		error = mlx5e_rl_open_channel(rlw, ix,
-		    &priv->rl.chan_param, &channel->sq);
-		MLX5E_RL_RUNLOCK(&priv->rl);
-
-		MLX5E_RL_WORKER_LOCK(rlw);
-		if (error != 0) {
-			mlx5_en_err(priv->ifp,
-			    "mlx5e_rl_open_channel failed: %d\n", error);
-			break;
-		}
-		mlx5e_rlw_channel_set_rate_locked(rlw, channel, channel->init_rate);
-	}
-	while (1) {
-		if (STAILQ_FIRST(&rlw->process_head) == NULL) {
-			/* check if we are tearing down */
-			if (rlw->worker_done != 0)
-				break;
-			cv_wait(&rlw->cv, &rlw->mtx);
-		}
-		/* check if we are tearing down */
-		if (rlw->worker_done != 0)
-			break;
-		channel = STAILQ_FIRST(&rlw->process_head);
-		if (channel != NULL) {
-			STAILQ_REMOVE_HEAD(&rlw->process_head, entry);
-
-			switch (channel->state) {
-			case MLX5E_RL_ST_MODIFY:
-				channel->state = MLX5E_RL_ST_USED;
-				MLX5E_RL_WORKER_UNLOCK(rlw);
-
-				/* create channel by demand */
-				if (channel->sq == NULL) {
-					MLX5E_RL_RLOCK(&priv->rl);
-					error = mlx5e_rl_open_channel(rlw, ix,
-					    &priv->rl.chan_param, &channel->sq);
-					MLX5E_RL_RUNLOCK(&priv->rl);
-
-					if (error != 0) {
-						mlx5_en_err(priv->ifp,
-						    "mlx5e_rl_open_channel failed: %d\n", error);
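+	/*
+	 * Main loop: drain the WORK lists and refill the FREE lists
+	 * until teardown. SQs with pending CQ changes are modified in
+	 * place when the firmware supports it, else freed and later
+	 * reallocated with the new parameters.
+	 */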
+	while (rl->worker_done == 0) {
+
+		MLX5E_RL_PRIV_LOCK(rl);
+
+		channels = priv->params.num_channels;
+
+		if (unlikely(channels > MLX5E_RL_IRQ_INDEX_MAX))
+			channels = MLX5E_RL_IRQ_INDEX_MAX;
+
+		max = (rl->use_multi_sq ? MLX5E_RL_MAX_RATES(rl) : 1);
+
+		for (i = 0; i != max; i++) {
+			phead = rl->sq_rate_head[i];
+
+			total_used = 1;
+			for (j = 0; j != MLX5E_RL_IRQ_INDEX_MAX; j++)
+				total_used += phead[j].count[MLX5E_RL_HEAD_USED];
+
+			min_free = howmany(total_used, channels);
+			min_free = howmany(min_free, 16);
+
+			for (j = 0; j != channels; j++) {
+				int eqn_not_used;
+				int irqn;
+
+				error = mlx5_vector2eqn(priv->mdev, j, &eqn_not_used, &irqn);
+				if (error != 0)
+					irqn = 0;
+
+				while ((sq = mlx5e_rl_dequeue_sq(phead + j, MLX5E_RL_HEAD_WORK)) != NULL) {
+					if (sq->cq_modify_flags == 0) {
+						mlx5e_rl_reset_sq_stats(sq);
+						mlx5e_rl_enqueue_sq(phead + j, MLX5E_RL_HEAD_FREE, sq);
+					} else if (sq->cq_modify_flags != MLX5_CQ_MODIFY_DESTROY &&
+					    (cq_modify_supported_mask & sq->cq_modify_flags) == sq->cq_modify_flags) {
+						error = mlx5_core_modify_cq_by_mask(priv->mdev,
+						    &sq->sq.cq.mcq, sq->cq_modify_flags,
+						    rl->param.tx_coalesce_usecs,
+						    rl->param.tx_coalesce_pkts,
+						    rl->param.tx_coalesce_mode,
+						    irqn);
+
+						if (error != 0) {
+							mlx5e_rl_free_sq(sq);
+						} else {
+							/* CQ modified, clear flags */
+							sq->cq_modify_flags = 0;
+
+							mlx5e_rl_reset_sq_stats(sq);
+							mlx5e_rl_enqueue_sq(phead + j, MLX5E_RL_HEAD_FREE, sq);
+						}
					} else {
-						atomic_add_64(&rlw->priv->rl.stats.tx_open_queues, 1ULL);
+						mlx5e_rl_free_sq(sq);
					}
-				} else {
-					mlx5e_resume_sq(channel->sq);
-				}
-
-				MLX5E_RL_WORKER_LOCK(rlw);
-				/* convert from bytes/s to bits/s and set new rate */
-				error = mlx5e_rlw_channel_set_rate_locked(rlw, channel,
-				    channel->new_rate * 8ULL);
-				if (error != 0) {
-					mlx5_en_err(priv->ifp,
-					    "mlx5e_rlw_channel_set_rate_locked failed: %d\n",
-					    error);
 				}
-				break;
-
-			case MLX5E_RL_ST_DESTROY:
-				error = mlx5e_rlw_channel_set_rate_locked(rlw, channel, 0);
-				if (error != 0) {
-					mlx5_en_err(priv->ifp,
-					    "mlx5e_rlw_channel_set_rate_locked failed: %d\n",
-					    error);
-				}
-				if (channel->sq != NULL) {
-					/*
-					 * Make sure all packets are
-					 * transmitted before SQ is
-					 * returned to free list:
-					 */
-					MLX5E_RL_WORKER_UNLOCK(rlw);
-					mlx5e_drain_sq(channel->sq);
-					MLX5E_RL_WORKER_LOCK(rlw);
-				}
-				/* put the channel back into the free list */
-				STAILQ_INSERT_HEAD(&rlw->index_list_head, channel, entry);
-				channel->state = MLX5E_RL_ST_FREE;
-				atomic_add_64(&priv->rl.stats.tx_active_connections, -1ULL);
-				break;
-			default:
-				/* NOP */
-				break;
 			}
-		}
-	}
 
-	/* close all the SQs */
-	for (x = 0; x < priv->rl.param.tx_channels_per_worker_def; x++) {
-		struct mlx5e_rl_channel *channel = rlw->channels + x;
+			for (; j != MLX5E_RL_IRQ_INDEX_MAX; j++) {
+				while ((sq = mlx5e_rl_dequeue_sq(phead + j, MLX5E_RL_HEAD_WORK)) != NULL)
+					mlx5e_rl_free_sq(sq);
+			}
 
-		/* update the initial rate */
-		channel->init_rate = channel->last_rate;
+			for (j = 0; j != channels; j++) {
+				phead[j].level = min_free;
 
-		/* make sure we free up the rate resource */
-		mlx5e_rlw_channel_set_rate_locked(rlw, channel, 0);
+				while (phead[j].count[MLX5E_RL_HEAD_FREE] < min_free) {
+					error = mlx5e_rl_alloc_sq(priv, j, i + 1, &rl->chan_param, &sq);
+					if (error != 0)
+						break;
+					mlx5e_rl_enqueue_sq(phead + j, MLX5E_RL_HEAD_FREE, sq);
+				}
+			}
 
-		if (channel->sq != NULL) {
-			MLX5E_RL_WORKER_UNLOCK(rlw);
-			mlx5e_rl_close_channel(&channel->sq);
-			atomic_add_64(&rlw->priv->rl.stats.tx_open_queues, -1ULL);
-			MLX5E_RL_WORKER_LOCK(rlw);
+			for (; j != MLX5E_RL_IRQ_INDEX_MAX; j++) {
+				while ((sq = mlx5e_rl_dequeue_sq(phead + j, MLX5E_RL_HEAD_FREE)) != NULL)
+					mlx5e_rl_free_sq(sq);
+			}
 		}
+		MLX5E_RL_PRIV_UNLOCK(rl);
+
+		mlx5e_rl_worker_wait(rl);
	}
 
-	rlw->worker_done = 0;
-	cv_broadcast(&rlw->cv);
-	MLX5E_RL_WORKER_UNLOCK(rlw);
+	mtx_lock(&rl->rl_mtx);
+	rl->worker_done = 0;
+	cv_signal(&rl->rl_cv);
+	mtx_unlock(&rl->rl_mtx);
 
 	kthread_exit();
 }
@@ -764,69 +716,8 @@
 }
 
 static void
-mlx5e_rl_set_default_params(struct mlx5e_rl_params *param,
-    struct mlx5_core_dev *mdev)
+mlx5e_rl_set_default_params(struct mlx5e_rl_params *param)
 {
-	/* ratelimit workers */
-	param->tx_worker_threads_def = mdev->priv.eq_table.num_comp_vectors;
-	param->tx_worker_threads_max = MLX5E_RL_MAX_WORKERS;
-
-	/* range check */
-	if (param->tx_worker_threads_def == 0 ||
-	    param->tx_worker_threads_def > param->tx_worker_threads_max)
-		param->tx_worker_threads_def = param->tx_worker_threads_max;
-
-	/* ratelimit channels */
-	param->tx_channels_per_worker_def = MLX5E_RL_MAX_SQS /
-	    param->tx_worker_threads_def;
-	param->tx_channels_per_worker_max = MLX5E_RL_MAX_SQS;
-
-	/* range check */
-	if (param->tx_channels_per_worker_def > MLX5E_RL_DEF_SQ_PER_WORKER)
-		param->tx_channels_per_worker_def = MLX5E_RL_DEF_SQ_PER_WORKER;
-
-	/* set default burst size */
-	param->tx_burst_size = 4;	/* MTUs */
-
-	/*
-	 * Set maximum burst size
-	 *
-	 * The burst size is multiplied by the MTU and clamped to the
-	 * range 0 ... 65535 bytes inclusivly before fed into the
-	 * firmware.
-	 *
-	 * NOTE: If the burst size or MTU is changed only ratelimit
-	 * connections made after the change will use the new burst
-	 * size.
-	 */
-	param->tx_burst_size_max = 255;
-
-	/* get firmware rate limits in 1000bit/s and convert them to bit/s */
-	param->tx_limit_min = mdev->priv.rl_table.min_rate * 1000ULL;
-	param->tx_limit_max = mdev->priv.rl_table.max_rate * 1000ULL;
-
-	/* ratelimit table size */
-	param->tx_rates_max = mdev->priv.rl_table.max_size;
-
-	/* range check */
-	if (param->tx_rates_max > MLX5E_RL_MAX_TX_RATES)
-		param->tx_rates_max = MLX5E_RL_MAX_TX_RATES;
-
-	/* set default number of rates */
-	param->tx_rates_def = param->tx_rates_max;
-
-	/* set maximum allowed rate deviation */
-	if (param->tx_limit_max != 0) {
-		/*
-		 * Make sure the deviation multiplication doesn't
-		 * overflow unsigned 64-bit:
-		 */
-		param->tx_allowed_deviation_max = -1ULL /
-		    param->tx_limit_max;
-	}
-	/* set default rate deviation */
-	param->tx_allowed_deviation = 50;	/* 5.0% */
-
 	/* channel parameters */
 	param->tx_queue_size = (1 << MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE);
 	param->tx_coalesce_usecs = MLX5E_RL_TX_COAL_USEC_DEFAULT;
@@ -839,10 +730,6 @@
 	MLX5E_RL_PARAMS(MLX5E_STATS_DESC)
 };
 
-static const char *mlx5e_rl_table_params_desc[] = {
-	MLX5E_RL_TABLE_PARAMS(MLX5E_STATS_DESC)
-};
-
 static const char *mlx5e_rl_stats_desc[] = {
 	MLX5E_RL_STATS(MLX5E_STATS_DESC)
 };
@@ -853,20 +740,51 @@
 	struct mlx5e_rl_priv_data *rl = &priv->rl;
 	struct sysctl_oid *node;
 	struct sysctl_oid *stats;
-	char buf[64];
-	uint64_t i;
-	uint64_t j;
+	uint32_t max;
+	uint32_t i;
+	uint32_t j;
 	int error;
 
 	/* check if there is support for packet pacing */
-	if (!MLX5_CAP_GEN(priv->mdev, qos) || !MLX5_CAP_QOS(priv->mdev, packet_pacing))
+	if (MLX5E_RL_MAX_RATES(rl) == 0)
 		return (0);
 
-	rl->priv = priv;
-	sysctl_ctx_init(&rl->ctx);
-	sx_init(&rl->rl_sxlock, "ratelimit-sxlock");
+	sx_init(&rl->rl_sxlock, "mlx5e-rl-global-sxlock");
+	mtx_init(&rl->rl_mtx, "mlx5e-rl-global-mtx", NULL, MTX_DEF);
+	cv_init(&rl->rl_cv, "mlx5e-rl-global-cv");
+
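+	/*
+	 * When the firmware cannot remap a paced SQ to another schedule
+	 * queue, fall back to using one SQ per rate, switching SQs on
+	 * rate changes. Else a single SQ is remapped in place.
+	 */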
+	rl->use_multi_sq =
+	    !MLX5_CAP_QOS(priv->mdev, qos_remap_pp) ||
+	    !MLX5_CAP_ETH(priv->mdev, reg_umr_sq) ||
+	    mlx5_use_multi_sq;
+
+	rl->rate_limit_table = malloc(
+	    sizeof(rl->rate_limit_table[0]) * MLX5E_RL_MAX_RATES(rl),
+	    M_MLX5EN, M_WAITOK);
+
+	for (i = 0; i != MLX5E_RL_MAX_RATES(rl); i++)
+		rl->rate_limit_table[i] = priv->mdev->priv.rl_table.rl_entry[i].rate;
+
+	max = (rl->use_multi_sq ? MLX5E_RL_MAX_RATES(rl) : 1);
+
+	rl->sq_rate_head = malloc(
+	    sizeof(rl->sq_rate_head[0]) * max,
+	    M_MLX5EN, M_WAITOK | M_ZERO);
+
+	rl->sq_raw_head = malloc(
+	    sizeof(rl->sq_raw_head[0]) * max * MLX5E_RL_IRQ_INDEX_MAX,
+	    M_MLX5EN, M_WAITOK | M_ZERO);
+
+	for (i = 0; i != max; i++)
+		rl->sq_rate_head[i] = rl->sq_raw_head + (i * MLX5E_RL_IRQ_INDEX_MAX);
+
+	for (i = 0; i != (max * MLX5E_RL_IRQ_INDEX_MAX); i++) {
+		mtx_init(&rl->sq_raw_head[i].mtx, "mlx5e-rl-head", NULL, MTX_DEF);
+		for (j = 0; j != MLX5E_RL_HEAD_MAX; j++)
+			TAILQ_INIT(&rl->sq_raw_head[i].head[j]);
+	}
 
 	/* open own TIS domain for ratelimit SQs */
 	error = mlx5e_rl_open_tis(priv);
@@ -874,7 +792,7 @@
 		goto done;
 
 	/* setup default value for parameters */
-	mlx5e_rl_set_default_params(&rl->param, priv->mdev);
+	mlx5e_rl_set_default_params(&rl->param);
 
 	/* update the completion factor */
 	mlx5e_rl_sync_tx_completion_fact(rl);
@@ -906,299 +824,278 @@
 		}
 	}
 
-	/* allocate workers array */
-	rl->workers = malloc(sizeof(rl->workers[0]) *
-	    rl->param.tx_worker_threads_def, M_MLX5EN, M_WAITOK | M_ZERO);
-
-	/* allocate rate limit array */
-	rl->rate_limit_table = malloc(sizeof(rl->rate_limit_table[0]) *
-	    rl->param.tx_rates_def, M_MLX5EN, M_WAITOK | M_ZERO);
-
-	if (node != NULL) {
-		/* create more SYSCTls */
-		SYSCTL_ADD_PROC(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO,
-		    "tx_rate_show", CTLTYPE_STRING | CTLFLAG_RD |
-		    CTLFLAG_MPSAFE, rl, 0, &mlx5e_rl_sysctl_show_rate_table,
-		    "A", "Show table of all configured TX rates");
-
-		/* try to fetch rate table from kernel environment */
-		for (i = 0; i != rl->param.tx_rates_def; i++) {
-			/* compute path for tunable */
-			snprintf(buf, sizeof(buf), "dev.mce.%d.rate_limit.tx_rate_add_%d",
-			    device_get_unit(priv->mdev->pdev->dev.bsddev), (int)i);
-			if (TUNABLE_QUAD_FETCH(buf, &j))
-				mlx5e_rl_tx_limit_add(rl, j);
-		}
-
-		/* setup rate table sysctls */
-		for (i = 0; i != MLX5E_RL_TABLE_PARAMS_NUM; i++) {
-			mlx5e_rl_sysctl_add_u64_oid(rl,
-			    MLX5E_RL_PARAMS_INDEX(table_arg[i]),
-			    node, mlx5e_rl_table_params_desc[2 * i],
-			    mlx5e_rl_table_params_desc[2 * i + 1]);
-		}
-	}
-
-	for (j = 0; j < rl->param.tx_worker_threads_def; j++) {
-		struct mlx5e_rl_worker *rlw = rl->workers + j;
-
-		rlw->priv = priv;
-
-		cv_init(&rlw->cv, "mlx5-worker-cv");
-		mtx_init(&rlw->mtx, "mlx5-worker-mtx", NULL, MTX_DEF);
-		STAILQ_INIT(&rlw->index_list_head);
-		STAILQ_INIT(&rlw->process_head);
-
-		rlw->channels = malloc(sizeof(rlw->channels[0]) *
-		    rl->param.tx_channels_per_worker_def, M_MLX5EN, M_WAITOK | M_ZERO);
-
-		MLX5E_RL_WORKER_LOCK(rlw);
-		for (i = 0; i < rl->param.tx_channels_per_worker_def; i++) {
-			struct mlx5e_rl_channel *channel = rlw->channels + i;
-			channel->worker = rlw;
-			STAILQ_INSERT_TAIL(&rlw->index_list_head, channel, entry);
-		}
-		MLX5E_RL_WORKER_UNLOCK(rlw);
-	}
-
 	PRIV_LOCK(priv);
-	error = mlx5e_rl_open_workers(priv);
+	mlx5e_rl_open_worker(priv);
 	PRIV_UNLOCK(priv);
-	if (error != 0) {
-		mlx5_en_err(priv->ifp,
-		    "mlx5e_rl_open_workers failed: %d\n", error);
-	}
 
 	return (0);
 
done:
+	for (i = 0; i != (max * MLX5E_RL_IRQ_INDEX_MAX); i++)
+		mtx_destroy(&rl->sq_raw_head[i].mtx);
+
 	sysctl_ctx_free(&rl->ctx);
+
+	free(rl->rate_limit_table, M_MLX5EN);
+	free(rl->sq_raw_head, M_MLX5EN);
+	free(rl->sq_rate_head, M_MLX5EN);
+
+	cv_destroy(&rl->rl_cv);
+	mtx_destroy(&rl->rl_mtx);
 	sx_destroy(&rl->rl_sxlock);
+
 	return (error);
 }
 
-static int
-mlx5e_rl_open_workers(struct mlx5e_priv *priv)
+static void
+mlx5e_rl_open_worker(struct mlx5e_priv *priv)
 {
 	struct mlx5e_rl_priv_data *rl = &priv->rl;
 	struct thread *rl_thread = NULL;
 	struct proc *rl_proc = NULL;
-	uint64_t j;
 	int error;
 
-	if (priv->gone || rl->opened)
-		return (-EINVAL);
-
-	MLX5E_RL_WLOCK(rl);
+	MLX5E_RL_PRIV_LOCK(rl);
 	/* compute channel parameters once */
-	mlx5e_rl_build_channel_param(rl, &rl->chan_param);
-	MLX5E_RL_WUNLOCK(rl);
-
-	for (j = 0; j < rl->param.tx_worker_threads_def; j++) {
-		struct mlx5e_rl_worker *rlw = rl->workers + j;
-
-		/* start worker thread */
-		error = kproc_kthread_add(mlx5e_rl_worker, rlw, &rl_proc, &rl_thread,
-		    RFHIGHPID, 0, "mlx5-ratelimit", "mlx5-rl-worker-thread-%d", (int)j);
-		if (error != 0) {
-			mlx5_en_err(rl->priv->ifp,
-			    "kproc_kthread_add failed: %d\n", error);
-			rlw->worker_done = 1;
-		}
-	}
+	mlx5e_rl_build_channel_param(priv, &rl->chan_param);
+	MLX5E_RL_PRIV_UNLOCK(rl);
 
-	rl->opened = 1;
-
-	return (0);
+	/* start worker thread */
+	error = kproc_kthread_add(mlx5e_rl_worker, rl, &rl_proc, &rl_thread,
+	    RFHIGHPID, 0, "mlx5e-ratelimit", "mlx5e-rl-worker-thread");
+	if (error != 0) {
+		mlx5_en_err(priv->ifp,
+		    "kproc_kthread_add failed: %d\n", error);
+		rl->worker_done = 1;
+	}
+	rl->worker_opened = 1;
 }
 
 static void
-mlx5e_rl_close_workers(struct mlx5e_priv *priv)
+mlx5e_rl_close_worker(struct mlx5e_priv *priv)
 {
 	struct mlx5e_rl_priv_data *rl = &priv->rl;
-	uint64_t y;
 
-	if (rl->opened == 0)
+	if (rl->worker_opened == 0)
 		return;
 
-	/* tear down worker threads simultaneously */
-	for (y = 0; y < rl->param.tx_worker_threads_def; y++) {
-		struct mlx5e_rl_worker *rlw = rl->workers + y;
-
-		/* tear down worker before freeing SQs */
-		MLX5E_RL_WORKER_LOCK(rlw);
-		if (rlw->worker_done == 0) {
-			rlw->worker_done = 1;
-			cv_broadcast(&rlw->cv);
-		} else {
-			/* XXX thread not started */
-			rlw->worker_done = 0;
-		}
-		MLX5E_RL_WORKER_UNLOCK(rlw);
-	}
-
-	/* wait for worker threads to exit */
-	for (y = 0; y < rl->param.tx_worker_threads_def; y++) {
-		struct mlx5e_rl_worker *rlw = rl->workers + y;
-
-		/* tear down worker before freeing SQs */
-		MLX5E_RL_WORKER_LOCK(rlw);
-		while (rlw->worker_done != 0)
-			cv_wait(&rlw->cv, &rlw->mtx);
-		MLX5E_RL_WORKER_UNLOCK(rlw);
+	/* tear down worker before freeing SQs */
+	mtx_lock(&rl->rl_mtx);
+	if (rl->worker_done == 0) {
+		rl->worker_done = 1;
+		cv_signal(&rl->rl_cv);
+	} else {
+		/* XXX thread not started */
+		rl->worker_done = 0;
 	}
+	while (rl->worker_done != 0)
+		cv_wait(&rl->rl_cv, &rl->rl_mtx);
+	mtx_unlock(&rl->rl_mtx);
 
-	rl->opened = 0;
-}
-
-static void
-mlx5e_rl_reset_rates(struct mlx5e_rl_priv_data *rl)
-{
-	unsigned x;
-
-	MLX5E_RL_WLOCK(rl);
-	for (x = 0; x != rl->param.tx_rates_def; x++)
-		rl->rate_limit_table[x] = 0;
-	MLX5E_RL_WUNLOCK(rl);
+	rl->worker_opened = 0;
 }
 
 void
 mlx5e_rl_cleanup(struct mlx5e_priv *priv)
 {
 	struct mlx5e_rl_priv_data *rl = &priv->rl;
-	uint64_t y;
+	struct mlx5e_rl_head *phead;
+	struct mlx5e_rl_sq *sq;
+	uint32_t max;
+	uint32_t i;
+	uint32_t j;
 
 	/* check if there is support for packet pacing */
-	if (!MLX5_CAP_GEN(priv->mdev, qos) || !MLX5_CAP_QOS(priv->mdev, packet_pacing))
+	if (MLX5E_RL_MAX_RATES(rl) == 0)
 		return;
 
-	/* TODO check if there is support for packet pacing */
-
 	sysctl_ctx_free(&rl->ctx);
 
 	PRIV_LOCK(priv);
-	mlx5e_rl_close_workers(priv);
+	mlx5e_rl_close_worker(priv);
 	PRIV_UNLOCK(priv);
-	mlx5e_rl_reset_rates(rl);
+	max = (rl->use_multi_sq ? MLX5E_RL_MAX_RATES(rl) : 1);
+
+	/* release all SQ's */
+	for (i = 0; i != (max * MLX5E_RL_IRQ_INDEX_MAX); i++) {
+		phead = rl->sq_raw_head + i;
+
+		for (j = 0; j != MLX5E_RL_HEAD_MAX; j++) {
+			while ((sq = mlx5e_rl_dequeue_sq(phead, j)) != NULL)
+				mlx5e_rl_free_sq(sq);
+		}
+		mtx_destroy(&phead->mtx);
+	}
 
 	/* close TIS domain */
 	mlx5e_rl_close_tis(priv);
 
-	for (y = 0; y < rl->param.tx_worker_threads_def; y++) {
-		struct mlx5e_rl_worker *rlw = rl->workers + y;
-
-		cv_destroy(&rlw->cv);
-		mtx_destroy(&rlw->mtx);
-		free(rlw->channels, M_MLX5EN);
-	}
 	free(rl->rate_limit_table, M_MLX5EN);
-	free(rl->workers, M_MLX5EN);
+	free(rl->sq_raw_head, M_MLX5EN);
+	free(rl->sq_rate_head, M_MLX5EN);
+
+	cv_destroy(&rl->rl_cv);
+	mtx_destroy(&rl->rl_mtx);
 	sx_destroy(&rl->rl_sxlock);
 }
 
-static void
-mlx5e_rlw_queue_channel_locked(struct mlx5e_rl_worker *rlw,
-    struct mlx5e_rl_channel *channel)
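+/*
+ * Return the SQ to transmit on for the channel's current rate. On a
+ * rate change a new SQ is taken from the FREE list of the new rate
+ * and queued as "next" with doorbells inhibited, until the current
+ * SQ has drained and can be handed back to the worker thread.
+ */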
+static struct mlx5e_sq *
+mlx5e_rl_get_current_sq_multi_locked(struct mlx5e_rl_channel *channel)
 {
-	STAILQ_INSERT_TAIL(&rlw->process_head, channel, entry);
-	cv_broadcast(&rlw->cv);
-}
+	struct mlx5e_rl_sq *sq;
+	uint16_t index;
 
-static void
-mlx5e_rl_free(struct mlx5e_rl_worker *rlw, struct mlx5e_rl_channel *channel)
-{
-	if (channel == NULL)
-		return;
+	index = channel->last_rate_index;
 
-	MLX5E_RL_WORKER_LOCK(rlw);
-	switch (channel->state) {
-	case MLX5E_RL_ST_MODIFY:
-		channel->state = MLX5E_RL_ST_DESTROY;
-		break;
-	case MLX5E_RL_ST_USED:
-		channel->state = MLX5E_RL_ST_DESTROY;
-		mlx5e_rlw_queue_channel_locked(rlw, channel);
-		break;
-	default:
-		break;
+	MPASS(index != 0);
+top:
+	sq = channel->next_sq;
+
+	if (unlikely(sq != NULL)) {
+
+		MPASS(channel->curr_sq != NULL);
+
+		/* check if current SQ is empty */
+		mtx_lock(&channel->curr_sq->sq.comp_lock);
+
+		if (mlx5e_sq_is_empty(&channel->curr_sq->sq)) {
+			channel->curr_sq->sq_next = NULL;
+			mtx_unlock(&channel->curr_sq->sq.comp_lock);
+
+			/* get the current SQ on the work list */
+			mlx5e_rl_requeue_sq(
+			    MLX5E_RL_CHAN_TO_SQ_HEAD(channel, channel->curr_rate_index),
+			    MLX5E_RL_HEAD_USED,
+			    MLX5E_RL_HEAD_WORK,
+			    &channel->priv->rl,
+			    channel->curr_sq);
+
+			/* advance SQ pointer */
+			channel->curr_sq = sq;
+			channel->curr_rate_index = channel->next_rate_index;
+
+			/* ready to go */
+			sq->sq.db_inhibit = 0;
+
+			/* clear next */
+			channel->next_sq = NULL;
+			channel->next_rate_index = 0;
+
+			goto top;
+		} else {
+			channel->curr_sq->sq_next = &sq->sq;
+			mtx_unlock(&channel->curr_sq->sq.comp_lock);
+
+			/* check if rate changed once again */
+			if (unlikely(index != channel->next_rate_index))
+				atomic_add_64(&channel->priv->rl.stats.tx_rate_changed_too_quickly, 1ULL);
+		}
+	} else if (unlikely((sq = channel->curr_sq) == NULL)) {
+		/* no SQ allocated */
+		sq = mlx5e_rl_requeue_sq(
+		    MLX5E_RL_CHAN_TO_SQ_HEAD(channel, index),
+		    MLX5E_RL_HEAD_FREE,
+		    MLX5E_RL_HEAD_USED,
+		    &channel->priv->rl,
+		    NULL);
+
+		if (likely(sq != NULL)) {
+			/* ready to go */
+			sq->sq.db_inhibit = 0;
+
+			channel->curr_sq = sq;
+			channel->curr_rate_index = index;
+		} else {
+			atomic_add_64(&channel->priv->rl.stats.tx_available_resource_failure, 1ULL);
+			return (NULL);
+		}
+	} else if (unlikely(index != channel->curr_rate_index)) {
+		/* rate change */
+		sq = mlx5e_rl_requeue_sq(
+		    MLX5E_RL_CHAN_TO_SQ_HEAD(channel, index),
+		    MLX5E_RL_HEAD_FREE,
+		    MLX5E_RL_HEAD_USED,
+		    &channel->priv->rl,
+		    NULL);
+
+		if (likely(sq != NULL)) {
+			/* don't send anything yet */
+			sq->sq.db_inhibit = 1;
+
+			channel->next_sq = sq;
+			channel->next_rate_index = index;
+			goto top;
+		} else {
+			atomic_add_64(&channel->priv->rl.stats.tx_available_resource_failure, 1ULL);
+			return (NULL);
+		}
 	}
-	MLX5E_RL_WORKER_UNLOCK(rlw);
+	return (&sq->sq);
 }
 
-static int
-mlx5e_rl_modify(struct mlx5e_rl_worker *rlw, struct mlx5e_rl_channel *channel, uint64_t rate)
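+/*
+ * Single SQ variant: the SQ is allocated once and, on rate change,
+ * remapped to the new schedule queue by posting a QOS_REMAP work
+ * request through mlx5e_rl_remap_sq().
+ */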
+static struct mlx5e_sq *
+mlx5e_rl_get_current_sq_non_multi_locked(struct mlx5e_rl_channel *channel)
 {
+	struct mlx5e_rl_sq *sq;
+	uint16_t index;
+	int error;
 
-	MLX5E_RL_WORKER_LOCK(rlw);
-	channel->new_rate = rate;
-	switch (channel->state) {
-	case MLX5E_RL_ST_USED:
-		channel->state = MLX5E_RL_ST_MODIFY;
-		mlx5e_rlw_queue_channel_locked(rlw, channel);
-		break;
-	default:
-		break;
+	index = channel->last_rate_index;
+
+	MPASS(index != 0);
+
+	sq = channel->curr_sq;
+
+	/* check for no SQ */
+	if (unlikely(sq == NULL)) {
+		sq = mlx5e_rl_requeue_sq(
+		    MLX5E_RL_CHAN_TO_SQ_HEAD(channel, 1),
+		    MLX5E_RL_HEAD_FREE,
+		    MLX5E_RL_HEAD_USED,
+		    &channel->priv->rl,
+		    NULL);
+
+		if (likely(sq != NULL)) {
+			/* ready to go */
+			sq->sq.db_inhibit = 0;
+
+			channel->curr_sq = sq;
+			channel->curr_rate_index = 1;
+		} else {
+			atomic_add_64(&channel->priv->rl.stats.tx_available_resource_failure, 1ULL);
+			return (NULL);
+		}
 	}
-	MLX5E_RL_WORKER_UNLOCK(rlw);
 
-	return (0);
+	/* check for rate change in any case */
+	if (unlikely(index != channel->next_rate_index)) {
+		error = mlx5e_rl_remap_sq(sq, channel, index);
+		if (error != 0)
+			return (NULL);
+		channel->next_rate_index = index;
+	}
+	return (&sq->sq);
 }
 
-static int
-mlx5e_rl_query(struct mlx5e_rl_worker *rlw, struct mlx5e_rl_channel *channel,
-    union if_snd_tag_query_params *params)
+static struct mlx5e_sq *
+mlx5e_rl_get_current_sq_locked(struct mlx5e_rl_channel *channel)
 {
-	int retval;
-
-	MLX5E_RL_WORKER_LOCK(rlw);
-	switch (channel->state) {
-	case MLX5E_RL_ST_USED:
-		params->rate_limit.max_rate = channel->last_rate;
-		params->rate_limit.queue_level = mlx5e_sq_queue_level(channel->sq);
-		retval = 0;
-		break;
-	case MLX5E_RL_ST_MODIFY:
-		params->rate_limit.max_rate = channel->last_rate;
-		params->rate_limit.queue_level = mlx5e_sq_queue_level(channel->sq);
-		retval = EBUSY;
-		break;
-	default:
-		retval = EINVAL;
-		break;
-	}
-	MLX5E_RL_WORKER_UNLOCK(rlw);
-
-	return (retval);
+	if (channel->priv->rl.use_multi_sq != 0)
+		return (mlx5e_rl_get_current_sq_multi_locked(channel));
+	else
+		return (mlx5e_rl_get_current_sq_non_multi_locked(channel));
 }
 
-static int
-mlx5e_find_available_tx_ring_index(struct mlx5e_rl_worker *rlw,
-    struct mlx5e_rl_channel **pchannel)
+struct mlx5e_sq *
+mlx5e_rl_get_current_sq(struct mlx5e_rl_channel *channel)
 {
-	struct mlx5e_rl_channel *channel;
-	int retval = ENOMEM;
-
-	MLX5E_RL_WORKER_LOCK(rlw);
-	/* Check for available channel in free list */
-	if ((channel = STAILQ_FIRST(&rlw->index_list_head)) != NULL) {
-		retval = 0;
-		/* Remove head index from available list */
-		STAILQ_REMOVE_HEAD(&rlw->index_list_head, entry);
-		channel->state = MLX5E_RL_ST_USED;
-		atomic_add_64(&rlw->priv->rl.stats.tx_active_connections, 1ULL);
-	} else {
-		atomic_add_64(&rlw->priv->rl.stats.tx_available_resource_failure, 1ULL);
-	}
-	MLX5E_RL_WORKER_UNLOCK(rlw);
+	struct mlx5e_sq *sq;
 
-	*pchannel = channel;
-#ifdef RATELIMIT_DEBUG
-	mlx5_en_info(rlw->priv->ifp,
-	    "Channel pointer for rate limit connection is %p\n", channel);
-#endif
-	return (retval);
+	MLX5E_RL_CHANNEL_LOCK(channel);
+	sq = mlx5e_rl_get_current_sq_locked(channel);
+	MLX5E_RL_CHANNEL_UNLOCK(channel);
+	return (sq);
 }
 
 int
@@ -1207,40 +1104,40 @@
     struct m_snd_tag **ppmt)
 {
 	struct mlx5e_rl_channel *channel;
-	struct mlx5e_rl_worker *rlw;
 	struct mlx5e_priv *priv;
-	int error;
 
 	priv = if_getsoftc(ifp);
 
 	/* check if there is support for packet pacing or if device is going away */
-	if (!MLX5_CAP_GEN(priv->mdev, qos) ||
-	    !MLX5_CAP_QOS(priv->mdev, packet_pacing) || priv->gone ||
+	if (MLX5E_RL_MAX_RATES(&priv->rl) == 0 || priv->gone ||
 	    params->rate_limit.hdr.type != IF_SND_TAG_TYPE_RATE_LIMIT)
 		return (EOPNOTSUPP);
 
-	/* compute worker thread this TCP connection belongs to */
-	rlw = priv->rl.workers + ((params->rate_limit.hdr.flowid % 128) %
-	    priv->rl.param.tx_worker_threads_def);
+	channel = malloc(sizeof(*channel), M_MLX5EN, M_NOWAIT | M_ZERO);
+	if (unlikely(channel == NULL))
+		return (ENOMEM);
 
-	error = mlx5e_find_available_tx_ring_index(rlw, &channel);
-	if (error != 0)
-		goto done;
+	mtx_init(&channel->mtx, "mlx5e-rl-channel", NULL, MTX_DEF);
 
-	error = mlx5e_rl_modify(rlw, channel, params->rate_limit.max_rate);
-	if (error != 0) {
-		mlx5e_rl_free(rlw, channel);
-		goto done;
-	}
+	channel->priv = priv;
+
+	mlx5_rl_find_rate_index(channel->priv->mdev,
+	    params->rate_limit.max_rate, &channel->last_rate_index);
+
+	MPASS(channel->last_rate_index != 0);
+
+	/* get the IRQ index */
+	channel->irq_index = (params->rate_limit.hdr.flowid %
+	    MLX5E_RL_IRQ_INDEX_MAX) % priv->params.num_channels;
 
 	/* store pointer to mbuf tag */
-	MPASS(channel->tag.refcount == 0);
 	m_snd_tag_init(&channel->tag, ifp, &mlx5e_rl_snd_tag_sw);
 	*ppmt = &channel->tag;
-done:
-	return (error);
-}
+
+	atomic_add_64(&priv->rl.stats.tx_active_connections, 1ULL);
+
+	return (0);
+}
 
 static int
 mlx5e_rl_snd_tag_modify(struct m_snd_tag *pmt, union if_snd_tag_modify_params *params)
@@ -1248,7 +1145,13 @@
 	struct mlx5e_rl_channel *channel =
 	    container_of(pmt, struct mlx5e_rl_channel, tag);
 
-	return (mlx5e_rl_modify(channel->worker, channel, params->rate_limit.max_rate));
+	MLX5E_RL_CHANNEL_LOCK(channel);
+	mlx5_rl_find_rate_index(channel->priv->mdev,
+	    params->rate_limit.max_rate, &channel->last_rate_index);
+	MPASS(channel->last_rate_index != 0);
+	MLX5E_RL_CHANNEL_UNLOCK(channel);
+
+	return (0);
 }
 
 static int
@@ -1256,205 +1159,170 @@
 {
 	struct mlx5e_rl_channel *channel =
 	    container_of(pmt, struct mlx5e_rl_channel, tag);
+	struct mlx5e_rl_sq *rl_sq;
+	struct mlx5e_sq *sq;
+	uint16_t index;
 
-	return (mlx5e_rl_query(channel->worker, channel, params));
-}
+	MLX5E_RL_CHANNEL_LOCK(channel);
 
-static void
-mlx5e_rl_snd_tag_free(struct m_snd_tag *pmt)
-{
-	struct mlx5e_rl_channel *channel =
-	    container_of(pmt, struct mlx5e_rl_channel, tag);
+	/*
+	 * Check for an existing SQ, before allocating a new one. This
+	 * avoids races if send tag methods are not serialized.
+	 */
+	rl_sq = channel->next_sq;
+	if (likely(rl_sq == NULL))
+		rl_sq = channel->curr_sq;
 
-	mlx5e_rl_free(channel->worker, channel);
-}
+	if (likely(rl_sq != NULL))
+		sq = &rl_sq->sq;
+	else
+		sq = mlx5e_rl_get_current_sq_locked(channel);
 
-static int
-mlx5e_rl_sysctl_show_rate_table(SYSCTL_HANDLER_ARGS)
-{
-	struct mlx5e_rl_priv_data *rl = arg1;
-	struct mlx5e_priv *priv = rl->priv;
-	struct sbuf sbuf;
-	unsigned x;
-	int error;
+	/* if there is no queue yet, consider it full */
+	if (unlikely(sq == NULL))
+		params->rate_limit.queue_level = IF_SND_QUEUE_LEVEL_MAX;
+	else
+		params->rate_limit.queue_level = mlx5e_sq_queue_level(sq);
 
-	error = sysctl_wire_old_buffer(req, 0);
-	if (error != 0)
-		return (error);
+	index = channel->last_rate_index;
+	MPASS(index != 0);
 
-	PRIV_LOCK(priv);
+	params->rate_limit.max_rate = channel->priv->rl.rate_limit_table[index - 1];
+	MLX5E_RL_CHANNEL_UNLOCK(channel);
 
-	sbuf_new_for_sysctl(&sbuf, NULL, 128 * rl->param.tx_rates_def, req);
+	return (0);
+}
 
-	sbuf_printf(&sbuf,
-	    "\n\n" "\t" "ENTRY" "\t" "BURST" "\t" "RATE [bit/s]\n"
-	    "\t" "--------------------------------------------\n");
+static void
+mlx5e_rl_snd_tag_free(struct m_snd_tag *pmt)
+{
+	struct mlx5e_rl_channel *channel =
+	    container_of(pmt, struct mlx5e_rl_channel, tag);
 
-	MLX5E_RL_RLOCK(rl);
-	for (x = 0; x != rl->param.tx_rates_def; x++) {
-		if (rl->rate_limit_table[x] == 0)
-			continue;
+	if (channel->curr_sq != NULL) {
+		mlx5e_rl_requeue_sq(
+		    MLX5E_RL_CHAN_TO_SQ_HEAD(channel, channel->curr_rate_index),
+		    MLX5E_RL_HEAD_USED,
+		    MLX5E_RL_HEAD_WORK,
+		    &channel->priv->rl,
+		    channel->curr_sq);
+	}
 
-		sbuf_printf(&sbuf, "\t" "%3u" "\t" "%3u" "\t" "%lld\n",
-		    x, (unsigned)rl->param.tx_burst_size,
-		    (long long)rl->rate_limit_table[x]);
+	if (channel->next_sq != NULL) {
+		mlx5e_rl_requeue_sq(
+		    MLX5E_RL_CHAN_TO_SQ_HEAD(channel, channel->next_rate_index),
+		    MLX5E_RL_HEAD_USED,
+		    MLX5E_RL_HEAD_WORK,
+		    &channel->priv->rl,
+		    channel->next_sq);
 	}
-	MLX5E_RL_RUNLOCK(rl);
 
-	error = sbuf_finish(&sbuf);
-	sbuf_delete(&sbuf);
+	mtx_destroy(&channel->mtx);
 
-	PRIV_UNLOCK(priv);
+	atomic_add_64(&channel->priv->rl.stats.tx_active_connections, -1ULL);
 
-	return (error);
+	free(channel, M_MLX5EN);
 }
 
-static int
-mlx5e_rl_refresh_channel_params(struct mlx5e_rl_priv_data *rl)
+void
+mlx5e_rl_query(if_t ifp, struct if_ratelimit_query_results *q)
 {
-	uint64_t x;
-	uint64_t y;
+	struct mlx5e_priv *priv = if_getsoftc(ifp);
 
-	MLX5E_RL_WLOCK(rl);
-	/* compute channel parameters once */
-	mlx5e_rl_build_channel_param(rl, &rl->chan_param);
-	MLX5E_RL_WUNLOCK(rl);
-
-	for (y = 0; y != rl->param.tx_worker_threads_def; y++) {
-		struct mlx5e_rl_worker *rlw = rl->workers + y;
-
-		for (x = 0; x != rl->param.tx_channels_per_worker_def; x++) {
-			struct mlx5e_rl_channel *channel;
-			struct mlx5e_sq *sq;
-
-			channel = rlw->channels + x;
-			sq = channel->sq;
-
-			if (sq == NULL)
-				continue;
-
-			if (MLX5_CAP_GEN(rl->priv->mdev, cq_period_mode_modify)) {
-				mlx5_core_modify_cq_moderation_mode(rl->priv->mdev, &sq->cq.mcq,
-				    rl->param.tx_coalesce_usecs,
-				    rl->param.tx_coalesce_pkts,
-				    rl->param.tx_coalesce_mode);
-			} else {
-				mlx5_core_modify_cq_moderation(rl->priv->mdev, &sq->cq.mcq,
-				    rl->param.tx_coalesce_usecs,
-				    rl->param.tx_coalesce_pkts);
-			}
-		}
-	}
-	return (0);
+	q->rate_table = priv->rl.rate_limit_table;
+	q->flags = RT_IS_FIXED_TABLE;
+	q->max_flows = 0;	/* no limit */
+	q->number_of_rates = MLX5E_RL_MAX_RATES(&priv->rl);
+	q->min_segment_burst = 1;
 }
 
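+/*
+ * Mark all rate limit SQs for the given CQ parameter change. The
+ * actual CQ modify, or destroy and reallocate, is deferred to the
+ * worker thread.
+ */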
 void
-mlx5e_rl_refresh_sq_inline(struct mlx5e_rl_priv_data *rl)
+mlx5e_rl_refresh_channel_params(struct mlx5e_priv *priv, uint32_t cq_modify_flags)
 {
-	uint64_t x;
-	uint64_t y;
+	struct mlx5e_rl_head *phead;
+	struct mlx5e_rl_sq *sq;
+	uint32_t max;
+	uint32_t i;
+	uint32_t j;
+	uint32_t k;
+
+	if (MLX5E_RL_MAX_RATES(&priv->rl) == 0)
+		return;
+
+	MLX5E_RL_PRIV_LOCK(&priv->rl);
 
-	for (y = 0; y != rl->param.tx_worker_threads_def; y++) {
-		struct mlx5e_rl_worker *rlw = rl->workers + y;
+	/* compute channel parameters once */
+	mlx5e_rl_build_channel_param(priv, &priv->rl.chan_param);
 
-		for (x = 0; x != rl->param.tx_channels_per_worker_def; x++) {
-			struct mlx5e_rl_channel *channel;
-			struct mlx5e_sq *sq;
+	/* verify TX completion factor */
+	mlx5e_rl_sync_tx_completion_fact(&priv->rl);
 
-			channel = rlw->channels + x;
-			sq = channel->sq;
+	max = (priv->rl.use_multi_sq ? MLX5E_RL_MAX_RATES(&priv->rl) : 1);
 
-			if (sq == NULL)
-				continue;
+	for (i = 0; i != max; i++) {
+		for (j = 0; j != MLX5E_RL_IRQ_INDEX_MAX; j++) {
+			phead = priv->rl.sq_rate_head[i] + j;
 
-			mtx_lock(&sq->lock);
-			mlx5e_update_sq_inline(sq);
-			mtx_unlock(&sq->lock);
+			MLX5E_RL_HEAD_LOCK(phead);
+			for (k = 0; k != MLX5E_RL_HEAD_MAX; k++) {
+				TAILQ_FOREACH(sq, &phead->head[k], entry)
+					sq->cq_modify_flags |= cq_modify_flags;
+			}
+			MLX5E_RL_HEAD_UNLOCK(phead);
 		}
 	}
+	MLX5E_RL_PRIV_UNLOCK(&priv->rl);
 }
 
-static int
-mlx5e_rl_tx_limit_add(struct mlx5e_rl_priv_data *rl, uint64_t value)
+void
+mlx5e_rl_refresh_sq_inline(struct mlx5e_priv *priv)
 {
-	unsigned x;
-	int error;
+	struct mlx5e_rl_head *phead;
+	struct mlx5e_rl_sq *sq;
+	uint32_t max;
+	uint32_t i;
+	uint32_t j;
+	uint32_t k;
+
+	if (MLX5E_RL_MAX_RATES(&priv->rl) == 0)
+		return;
 
-	if (value < 1000 ||
-	    mlx5_rl_is_in_range(rl->priv->mdev, howmany(value, 1000), 0) == 0)
-		return (EINVAL);
+	MLX5E_RL_PRIV_LOCK(&priv->rl);
 
-	MLX5E_RL_WLOCK(rl);
-	error = ENOMEM;
+	max = (priv->rl.use_multi_sq ? MLX5E_RL_MAX_RATES(&priv->rl) : 1);
 
-	/* check if rate already exists */
-	for (x = 0; x != rl->param.tx_rates_def; x++) {
-		if (rl->rate_limit_table[x] != value)
-			continue;
-		error = EEXIST;
-		break;
-	}
+	for (i = 0; i != max; i++) {
+		for (j = 0; j != MLX5E_RL_IRQ_INDEX_MAX; j++) {
+			phead = priv->rl.sq_rate_head[i] + j;
 
-	/* check if there is a free rate entry */
-	if (x == rl->param.tx_rates_def) {
-		for (x = 0; x != rl->param.tx_rates_def; x++) {
-			if (rl->rate_limit_table[x] != 0)
-				continue;
-			rl->rate_limit_table[x] = value;
-			error = 0;
-			break;
+			MLX5E_RL_HEAD_LOCK(phead);
+			for (k = 0; k != MLX5E_RL_HEAD_MAX; k++) {
+				TAILQ_FOREACH(sq, &phead->head[k], entry) {
+					mtx_lock(&sq->sq.lock);
+					mlx5e_update_sq_inline(&sq->sq);
+					mtx_unlock(&sq->sq.lock);
+				}
+			}
+			MLX5E_RL_HEAD_UNLOCK(phead);
 		}
 	}
-	MLX5E_RL_WUNLOCK(rl);
-
-	return (error);
-}
-
-static int
-mlx5e_rl_tx_limit_clr(struct mlx5e_rl_priv_data *rl, uint64_t value)
-{
-	unsigned x;
-	int error;
-
-	if (value == 0)
-		return (EINVAL);
-
-	MLX5E_RL_WLOCK(rl);
-
-	/* check if rate already exists */
-	for (x = 0; x != rl->param.tx_rates_def; x++) {
-		if (rl->rate_limit_table[x] != value)
-			continue;
-		/* free up rate */
-		rl->rate_limit_table[x] = 0;
-		break;
-	}
-
-	/* check if there is a free rate entry */
-	if (x == rl->param.tx_rates_def)
-		error = ENOENT;
-	else
-		error = 0;
-	MLX5E_RL_WUNLOCK(rl);
-
-	return (error);
+	MLX5E_RL_PRIV_UNLOCK(&priv->rl);
 }
 
 static int
 mlx5e_rl_sysctl_handler(SYSCTL_HANDLER_ARGS)
 {
-	struct mlx5e_rl_priv_data *rl = arg1;
-	struct mlx5e_priv *priv = rl->priv;
-	unsigned mode_modify;
-	unsigned was_opened;
+	struct mlx5e_rl_priv_data *rl;
+	struct mlx5e_priv *priv;
 	uint64_t value;
 	int error;
 
+	rl = arg1;
+	priv = container_of(rl, struct mlx5e_priv, rl);
+
 	PRIV_LOCK(priv);
-	MLX5E_RL_RLOCK(rl);
 	value = rl->param.arg[arg2];
-	MLX5E_RL_RUNLOCK(rl);
 
 	if (req != NULL) {
 		error = sysctl_handle_64(oidp, &value, 0, req);
@@ -1470,40 +1338,8 @@
 		error = ENXIO;
 		goto done;
 	}
-	was_opened = rl->opened;
-	mode_modify = MLX5_CAP_GEN(priv->mdev, cq_period_mode_modify);
 
 	switch (MLX5E_RL_PARAMS_INDEX(arg[arg2])) {
-	case MLX5E_RL_PARAMS_INDEX(tx_worker_threads_def):
-		if (value > rl->param.tx_worker_threads_max)
-			value = rl->param.tx_worker_threads_max;
-		else if (value < 1)
-			value = 1;
-
-		/* store new value */
-		rl->param.arg[arg2] = value;
-		break;
-
-	case MLX5E_RL_PARAMS_INDEX(tx_channels_per_worker_def):
-		if (value > rl->param.tx_channels_per_worker_max)
-			value = rl->param.tx_channels_per_worker_max;
-		else if (value < 1)
-			value = 1;
-
-		/* store new value */
-		rl->param.arg[arg2] = value;
-		break;
-
-	case MLX5E_RL_PARAMS_INDEX(tx_rates_def):
-		if (value > rl->param.tx_rates_max)
-			value = rl->param.tx_rates_max;
-		else if (value < 1)
-			value = 1;
-
-		/* store new value */
-		rl->param.arg[arg2] = value;
-		break;
-
 	case MLX5E_RL_PARAMS_INDEX(tx_coalesce_usecs):
 		/* range check */
 		if (value < 1)
@@ -1513,10 +1349,7 @@
 
 		/* store new value */
 		rl->param.arg[arg2] = value;
-
-		/* check to avoid down and up the network interface */
-		if (was_opened)
-			error = mlx5e_rl_refresh_channel_params(rl);
+		mlx5e_rl_refresh_channel_params(priv, MLX5_CQ_MODIFY_PERIOD);
 		break;
 
 	case MLX5E_RL_PARAMS_INDEX(tx_coalesce_pkts):
@@ -1528,38 +1361,20 @@
 
 		/* store new value */
 		rl->param.arg[arg2] = value;
-
-		/* check to avoid down and up the network interface */
-		if (was_opened)
-			error = mlx5e_rl_refresh_channel_params(rl);
+		mlx5e_rl_refresh_channel_params(priv, MLX5_CQ_MODIFY_COUNT);
 		break;
 
	case
 MLX5E_RL_PARAMS_INDEX(tx_coalesce_mode):
-		/* network interface must be down */
-		if (was_opened != 0 && mode_modify == 0)
-			mlx5e_rl_close_workers(priv);
-
 		/* import TX coalesce mode */
 		if (value != 0)
 			value = 1;
 
 		/* store new value */
 		rl->param.arg[arg2] = value;
-
-		/* restart network interface, if any */
-		if (was_opened != 0) {
-			if (mode_modify == 0)
-				mlx5e_rl_open_workers(priv);
-			else
-				error = mlx5e_rl_refresh_channel_params(rl);
-		}
+		mlx5e_rl_refresh_channel_params(priv, MLX5_CQ_MODIFY_PERIOD_MODE);
 		break;
 
 	case MLX5E_RL_PARAMS_INDEX(tx_queue_size):
-		/* network interface must be down */
-		if (was_opened)
-			mlx5e_rl_close_workers(priv);
-
 		/* import TX queue size */
 		if (value < (1 << MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE))
 			value = (1 << MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE);
@@ -1571,61 +1386,13 @@
 
 		/* store new value */
 		rl->param.arg[arg2] = value;
-
-		/* verify TX completion factor */
-		mlx5e_rl_sync_tx_completion_fact(rl);
-
-		/* restart network interface, if any */
-		if (was_opened)
-			mlx5e_rl_open_workers(priv);
+		mlx5e_rl_refresh_channel_params(priv, MLX5_CQ_MODIFY_DESTROY);
 		break;
 
 	case MLX5E_RL_PARAMS_INDEX(tx_completion_fact):
-		/* network interface must be down */
-		if (was_opened)
-			mlx5e_rl_close_workers(priv);
-
 		/* store new value */
 		rl->param.arg[arg2] = value;
-
-		/* verify parameter */
-		mlx5e_rl_sync_tx_completion_fact(rl);
-
-		/* restart network interface, if any */
-		if (was_opened)
-			mlx5e_rl_open_workers(priv);
-		break;
-
-	case MLX5E_RL_PARAMS_INDEX(tx_limit_add):
-		error = mlx5e_rl_tx_limit_add(rl, value);
-		break;
-
-	case MLX5E_RL_PARAMS_INDEX(tx_limit_clr):
-		error = mlx5e_rl_tx_limit_clr(rl, value);
-		break;
-
-	case MLX5E_RL_PARAMS_INDEX(tx_allowed_deviation):
-		/* range check */
-		if (value > rl->param.tx_allowed_deviation_max)
-			value = rl->param.tx_allowed_deviation_max;
-		else if (value < rl->param.tx_allowed_deviation_min)
-			value = rl->param.tx_allowed_deviation_min;
-
-		MLX5E_RL_WLOCK(rl);
-		rl->param.arg[arg2] = value;
-		MLX5E_RL_WUNLOCK(rl);
-		break;
-
-	case MLX5E_RL_PARAMS_INDEX(tx_burst_size):
-		/* range check */
-		if (value > rl->param.tx_burst_size_max)
-			value = rl->param.tx_burst_size_max;
-		else if (value < rl->param.tx_burst_size_min)
-			value = rl->param.tx_burst_size_min;
-
-		MLX5E_RL_WLOCK(rl);
-		rl->param.arg[arg2] = value;
-		MLX5E_RL_WUNLOCK(rl);
+		mlx5e_rl_refresh_channel_params(priv, MLX5_CQ_MODIFY_DESTROY);
 		break;
 
 	default:
@@ -1641,9 +1408,8 @@
     struct sysctl_oid *node, const char *name, const char *desc)
 {
 	/*
-	 * NOTE: In FreeBSD-11 and newer the CTLFLAG_RWTUN flag will
-	 * take care of loading default sysctl value from the kernel
-	 * environment, if any:
+	 * NOTE: The CTLFLAG_RWTUN/CTLFLAG_RDTUN flags will take care of
+	 * loading the default sysctl value from the kernel environment,
+	 * if any:
 	 */
 	if (strstr(name, "_max") != 0 || strstr(name, "_min") != 0) {
 		/* read-only SYSCTLs */
@@ -1676,6 +1442,43 @@
 	    CTLFLAG_RD, &rl->stats.arg[x], 0, desc);
 }
 
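+/*
+ * Accumulate the statistics of all rate limit SQs, except the idle
+ * ones on the FREE lists, into the given counters.
+ */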
+void
+mlx5e_rl_accumulate_sq_stats(struct mlx5e_rl_priv_data *rl,
+    struct mlx5e_sq_stats *sq_stats)
+{
+	struct mlx5e_rl_head *phead;
+	struct mlx5e_rl_sq *sq;
+	uint32_t max;
+	uint32_t i;
+	uint32_t j;
+	uint32_t k;
+
+	if (MLX5E_RL_MAX_RATES(rl) == 0)
+		return;
+
+	MLX5E_RL_PRIV_LOCK(rl);
+
+	mlx5e_accumulate_sq_stats(&rl->sq_stats, sq_stats);
+
+	max = (rl->use_multi_sq ? MLX5E_RL_MAX_RATES(rl) : 1);
+
+	for (i = 0; i != max; i++) {
+		for (j = 0; j != MLX5E_RL_IRQ_INDEX_MAX; j++) {
+			phead = rl->sq_rate_head[i] + j;
+
+			MLX5E_RL_HEAD_LOCK(phead);
+			for (k = 0; k != MLX5E_RL_HEAD_MAX; k++) {
+				if (k == MLX5E_RL_HEAD_FREE)
+					continue;
+				TAILQ_FOREACH(sq, &phead->head[k], entry)
+					mlx5e_accumulate_sq_stats(&sq->sq.stats, sq_stats);
+			}
+			MLX5E_RL_HEAD_UNLOCK(phead);
		}
	}
+	MLX5E_RL_PRIV_UNLOCK(rl);
+}
+
 #else
 
 int
diff --git a/sys/dev/mlx5/mlx5_en/mlx5_en_tx.c b/sys/dev/mlx5/mlx5_en/mlx5_en_tx.c
--- a/sys/dev/mlx5/mlx5_en/mlx5_en_tx.c
+++ b/sys/dev/mlx5/mlx5_en/mlx5_en_tx.c
@@ -102,8 +102,8 @@
 	switch (mb_tag->sw->type) {
 #ifdef RATELIMIT
 	case IF_SND_TAG_TYPE_RATE_LIMIT:
-		sq = container_of(mb_tag,
-		    struct mlx5e_rl_channel, tag)->sq;
+		sq = mlx5e_rl_get_current_sq(
+		    container_of(mb_tag, struct mlx5e_rl_channel, tag));
 		break;
 #ifdef KERN_TLS
 	case IF_SND_TAG_TYPE_TLS_RATE_LIMIT:
@@ -1146,10 +1146,13 @@
 		MPASS(mb->m_pkthdr.snd_tag->ifp == ifp);
 		sq = mlx5e_select_queue_by_send_tag(ifp, mb);
 		if (unlikely(sq == NULL)) {
-			goto select_queue;
+			/* Free mbuf */
+			m_freem(mb);
+
+			/* Need to wait for a send queue to be allocated. */
+			return (ENOBUFS);
 		}
 	} else {
-select_queue:
 		sq = mlx5e_select_queue(ifp, mb);
 		if (unlikely(sq == NULL)) {
 			/* Free mbuf */
diff --git a/sys/dev/mlx5/mlx5_ib/mlx5_ib_ah.c b/sys/dev/mlx5/mlx5_ib/mlx5_ib_ah.c
--- a/sys/dev/mlx5/mlx5_ib/mlx5_ib_ah.c
+++ b/sys/dev/mlx5/mlx5_ib/mlx5_ib_ah.c
@@ -26,7 +26,6 @@
  */
 
 #include "opt_rss.h"
-#include "opt_ratelimit.h"
 
 #include
diff --git a/sys/dev/mlx5/mlx5_ib/mlx5_ib_cong.c b/sys/dev/mlx5/mlx5_ib/mlx5_ib_cong.c
--- a/sys/dev/mlx5/mlx5_ib/mlx5_ib_cong.c
+++ b/sys/dev/mlx5/mlx5_ib/mlx5_ib_cong.c
@@ -26,7 +26,6 @@
  */
 
 #include "opt_rss.h"
-#include "opt_ratelimit.h"
 
 #include
 #include
diff --git a/sys/dev/mlx5/mlx5_ib/mlx5_ib_cq.c b/sys/dev/mlx5/mlx5_ib/mlx5_ib_cq.c
--- a/sys/dev/mlx5/mlx5_ib/mlx5_ib_cq.c
+++ b/sys/dev/mlx5/mlx5_ib/mlx5_ib_cq.c
@@ -26,7 +26,6 @@
  */
 
 #include "opt_rss.h"
-#include "opt_ratelimit.h"
 
 #include
 #include
diff --git a/sys/dev/mlx5/mlx5_ib/mlx5_ib_devx.c b/sys/dev/mlx5/mlx5_ib/mlx5_ib_devx.c
--- a/sys/dev/mlx5/mlx5_ib/mlx5_ib_devx.c
+++ b/sys/dev/mlx5/mlx5_ib/mlx5_ib_devx.c
@@ -26,7 +26,6 @@
  */
 
 #include "opt_rss.h"
-#include "opt_ratelimit.h"
 
 #include
 #include
diff --git a/sys/dev/mlx5/mlx5_ib/mlx5_ib_doorbell.c b/sys/dev/mlx5/mlx5_ib/mlx5_ib_doorbell.c
--- a/sys/dev/mlx5/mlx5_ib/mlx5_ib_doorbell.c
+++ b/sys/dev/mlx5/mlx5_ib/mlx5_ib_doorbell.c
@@ -26,7 +26,6 @@
  */
 
 #include "opt_rss.h"
-#include "opt_ratelimit.h"
 
 #include
 #include
diff --git a/sys/dev/mlx5/mlx5_ib/mlx5_ib_gsi.c b/sys/dev/mlx5/mlx5_ib/mlx5_ib_gsi.c
--- a/sys/dev/mlx5/mlx5_ib/mlx5_ib_gsi.c
+++ b/sys/dev/mlx5/mlx5_ib/mlx5_ib_gsi.c
@@ -26,7 +26,6 @@
  */
 
 #include "opt_rss.h"
-#include "opt_ratelimit.h"
 
 #include
diff --git a/sys/dev/mlx5/mlx5_ib/mlx5_ib_mad.c b/sys/dev/mlx5/mlx5_ib/mlx5_ib_mad.c
--- a/sys/dev/mlx5/mlx5_ib/mlx5_ib_mad.c
+++ b/sys/dev/mlx5/mlx5_ib/mlx5_ib_mad.c
@@ -26,7 +26,6 @@
  */
 
 #include "opt_rss.h"
-#include "opt_ratelimit.h"
 
 #include
 #include
diff --git a/sys/dev/mlx5/mlx5_ib/mlx5_ib_main.c b/sys/dev/mlx5/mlx5_ib/mlx5_ib_main.c
--- a/sys/dev/mlx5/mlx5_ib/mlx5_ib_main.c
+++ b/sys/dev/mlx5/mlx5_ib/mlx5_ib_main.c
@@ -26,7 +26,6 @@
  */
 
 #include "opt_rss.h"
-#include "opt_ratelimit.h"
 
 #include
 #include
diff --git a/sys/dev/mlx5/mlx5_ib/mlx5_ib_mem.c b/sys/dev/mlx5/mlx5_ib/mlx5_ib_mem.c
--- a/sys/dev/mlx5/mlx5_ib/mlx5_ib_mem.c
+++ b/sys/dev/mlx5/mlx5_ib/mlx5_ib_mem.c
@@ -26,7 +26,6 @@
  */
 
 #include "opt_rss.h"
-#include "opt_ratelimit.h" #include #include diff --git a/sys/dev/mlx5/mlx5_ib/mlx5_ib_mr.c b/sys/dev/mlx5/mlx5_ib/mlx5_ib_mr.c --- a/sys/dev/mlx5/mlx5_ib/mlx5_ib_mr.c +++ b/sys/dev/mlx5/mlx5_ib/mlx5_ib_mr.c @@ -26,7 +26,6 @@ */ #include "opt_rss.h" -#include "opt_ratelimit.h" #include #include diff --git a/sys/dev/mlx5/mlx5_ib/mlx5_ib_qp.c b/sys/dev/mlx5/mlx5_ib/mlx5_ib_qp.c --- a/sys/dev/mlx5/mlx5_ib/mlx5_ib_qp.c +++ b/sys/dev/mlx5/mlx5_ib/mlx5_ib_qp.c @@ -26,7 +26,6 @@ */ #include "opt_rss.h" -#include "opt_ratelimit.h" #include #include diff --git a/sys/dev/mlx5/mlx5_ib/mlx5_ib_srq.c b/sys/dev/mlx5/mlx5_ib/mlx5_ib_srq.c --- a/sys/dev/mlx5/mlx5_ib/mlx5_ib_srq.c +++ b/sys/dev/mlx5/mlx5_ib/mlx5_ib_srq.c @@ -26,7 +26,6 @@ */ #include "opt_rss.h" -#include "opt_ratelimit.h" #include #include diff --git a/sys/dev/mlx5/mlx5_ib/mlx5_ib_virt.c b/sys/dev/mlx5/mlx5_ib/mlx5_ib_virt.c --- a/sys/dev/mlx5/mlx5_ib/mlx5_ib_virt.c +++ b/sys/dev/mlx5/mlx5_ib/mlx5_ib_virt.c @@ -26,7 +26,6 @@ */ #include "opt_rss.h" -#include "opt_ratelimit.h" #include #include