diff --git a/sys/dev/mlx5/driver.h b/sys/dev/mlx5/driver.h
--- a/sys/dev/mlx5/driver.h
+++ b/sys/dev/mlx5/driver.h
@@ -29,8 +29,6 @@
 #ifndef MLX5_DRIVER_H
 #define MLX5_DRIVER_H
 
-#include "opt_ratelimit.h"
-
 #include
 #include
 #include
@@ -532,23 +530,19 @@
 	struct radix_tree_root tree;
 };
 
-#ifdef RATELIMIT
 struct mlx5_rl_entry {
-	u32 rate;
+	u64 rate;	/* in bytes/s */
+	u32 qos_handle;	/* schedule queue handle */
 	u16 burst;
 	u16 index;
-	u32 qos_handle; /* schedule queue handle */
-	u32 refcount;
 };
 
 struct mlx5_rl_table {
-	struct mutex rl_lock;
 	u16 max_size;
-	u32 max_rate;
-	u32 min_rate;
+	u64 max_rate;	/* in bytes/s */
+	u64 min_rate;	/* in bytes/s */
 	struct mlx5_rl_entry *rl_entry;
 };
-#endif
 
 struct mlx5_pme_stats {
 	u64 status_counters[MLX5_MODULE_STATUS_NUM];
@@ -600,9 +594,9 @@
 	struct list_head ctx_list;
 	spinlock_t ctx_lock;
 	unsigned long pci_dev_data;
-#ifdef RATELIMIT
+
 	struct mlx5_rl_table rl_table;
-#endif
+
 	struct mlx5_pme_stats pme_stats;
 
 	struct mlx5_eswitch *eswitch;
@@ -1187,13 +1181,13 @@
 {
 	return !(dev->priv.pci_dev_data & MLX5_PCI_DEV_IS_VF);
 }
-#ifdef RATELIMIT
+
+extern bool mlx5_use_multi_sq;
 int mlx5_init_rl_table(struct mlx5_core_dev *dev);
+int mlx5_load_rl_table(struct mlx5_core_dev *dev);
 void mlx5_cleanup_rl_table(struct mlx5_core_dev *dev);
-int mlx5_rl_add_rate(struct mlx5_core_dev *dev, u32 rate, u32 burst, u16 *index);
-void mlx5_rl_remove_rate(struct mlx5_core_dev *dev, u32 rate, u32 burst);
-bool mlx5_rl_is_in_range(const struct mlx5_core_dev *dev, u32 rate, u32 burst);
-int mlx5e_query_rate_limit_cmd(struct mlx5_core_dev *dev, u16 index, u32 *scq_handle);
+void mlx5_rl_find_rate_index(struct mlx5_core_dev *dev, u64 rate, u16 *index);
+bool mlx5_rl_is_in_range(const struct mlx5_core_dev *dev, u64 rate, u32 burst);
 static inline u32
 mlx5_rl_get_scq_handle(struct mlx5_core_dev *dev, uint16_t index)
 {
@@ -1207,7 +1201,6 @@
 {
 	return !!(dev->priv.rl_table.max_size);
 }
-#endif
 
 void mlx5_disable_interrupts(struct mlx5_core_dev *);
 void mlx5_poll_interrupts(struct mlx5_core_dev *);
diff --git a/sys/dev/mlx5/mlx5_core/mlx5_alloc.c b/sys/dev/mlx5/mlx5_core/mlx5_alloc.c
--- a/sys/dev/mlx5/mlx5_core/mlx5_alloc.c
+++ b/sys/dev/mlx5/mlx5_core/mlx5_alloc.c
@@ -26,7 +26,6 @@
  */
 
 #include "opt_rss.h"
-#include "opt_ratelimit.h"
 
 #include
 #include
diff --git a/sys/dev/mlx5/mlx5_core/mlx5_cmd.c b/sys/dev/mlx5/mlx5_core/mlx5_cmd.c
--- a/sys/dev/mlx5/mlx5_core/mlx5_cmd.c
+++ b/sys/dev/mlx5/mlx5_core/mlx5_cmd.c
@@ -26,7 +26,6 @@
  */
 
 #include "opt_rss.h"
-#include "opt_ratelimit.h"
 
 #include
 #include
diff --git a/sys/dev/mlx5/mlx5_core/mlx5_cq.c b/sys/dev/mlx5/mlx5_core/mlx5_cq.c
--- a/sys/dev/mlx5/mlx5_core/mlx5_cq.c
+++ b/sys/dev/mlx5/mlx5_core/mlx5_cq.c
@@ -26,7 +26,6 @@
  */
 
 #include "opt_rss.h"
-#include "opt_ratelimit.h"
 
 #include
 #include
diff --git a/sys/dev/mlx5/mlx5_core/mlx5_diagnostics.c b/sys/dev/mlx5/mlx5_core/mlx5_diagnostics.c
--- a/sys/dev/mlx5/mlx5_core/mlx5_diagnostics.c
+++ b/sys/dev/mlx5/mlx5_core/mlx5_diagnostics.c
@@ -26,7 +26,6 @@
  */
 
 #include "opt_rss.h"
-#include "opt_ratelimit.h"
 
 #include
 #include
diff --git a/sys/dev/mlx5/mlx5_core/mlx5_eq.c b/sys/dev/mlx5/mlx5_core/mlx5_eq.c
--- a/sys/dev/mlx5/mlx5_core/mlx5_eq.c
+++ b/sys/dev/mlx5/mlx5_core/mlx5_eq.c
@@ -26,7 +26,6 @@
  */
 
 #include "opt_rss.h"
-#include "opt_ratelimit.h"
 
 #include
 #include
diff --git a/sys/dev/mlx5/mlx5_core/mlx5_eswitch.c b/sys/dev/mlx5/mlx5_core/mlx5_eswitch.c
--- a/sys/dev/mlx5/mlx5_core/mlx5_eswitch.c
+++ b/sys/dev/mlx5/mlx5_core/mlx5_eswitch.c
@@ -26,7 +26,6 @@
  */
 
 #include "opt_rss.h"
-#include "opt_ratelimit.h" #include #include diff --git a/sys/dev/mlx5/mlx5_core/mlx5_fs_cmd.c b/sys/dev/mlx5/mlx5_core/mlx5_fs_cmd.c --- a/sys/dev/mlx5/mlx5_core/mlx5_fs_cmd.c +++ b/sys/dev/mlx5/mlx5_core/mlx5_fs_cmd.c @@ -26,7 +26,6 @@ */ #include "opt_rss.h" -#include "opt_ratelimit.h" #include #include diff --git a/sys/dev/mlx5/mlx5_core/mlx5_fs_tree.c b/sys/dev/mlx5/mlx5_core/mlx5_fs_tree.c --- a/sys/dev/mlx5/mlx5_core/mlx5_fs_tree.c +++ b/sys/dev/mlx5/mlx5_core/mlx5_fs_tree.c @@ -26,7 +26,6 @@ */ #include "opt_rss.h" -#include "opt_ratelimit.h" #include #include diff --git a/sys/dev/mlx5/mlx5_core/mlx5_fw.c b/sys/dev/mlx5/mlx5_core/mlx5_fw.c --- a/sys/dev/mlx5/mlx5_core/mlx5_fw.c +++ b/sys/dev/mlx5/mlx5_core/mlx5_fw.c @@ -26,7 +26,6 @@ */ #include "opt_rss.h" -#include "opt_ratelimit.h" #include #include diff --git a/sys/dev/mlx5/mlx5_core/mlx5_fwdump.c b/sys/dev/mlx5/mlx5_core/mlx5_fwdump.c --- a/sys/dev/mlx5/mlx5_core/mlx5_fwdump.c +++ b/sys/dev/mlx5/mlx5_core/mlx5_fwdump.c @@ -24,7 +24,6 @@ */ #include "opt_rss.h" -#include "opt_ratelimit.h" #include __FBSDID("$FreeBSD$"); diff --git a/sys/dev/mlx5/mlx5_core/mlx5_health.c b/sys/dev/mlx5/mlx5_core/mlx5_health.c --- a/sys/dev/mlx5/mlx5_core/mlx5_health.c +++ b/sys/dev/mlx5/mlx5_core/mlx5_health.c @@ -26,7 +26,6 @@ */ #include "opt_rss.h" -#include "opt_ratelimit.h" #include #include diff --git a/sys/dev/mlx5/mlx5_core/mlx5_mad.c b/sys/dev/mlx5/mlx5_core/mlx5_mad.c --- a/sys/dev/mlx5/mlx5_core/mlx5_mad.c +++ b/sys/dev/mlx5/mlx5_core/mlx5_mad.c @@ -26,7 +26,6 @@ */ #include "opt_rss.h" -#include "opt_ratelimit.h" #include #include diff --git a/sys/dev/mlx5/mlx5_core/mlx5_main.c b/sys/dev/mlx5/mlx5_core/mlx5_main.c --- a/sys/dev/mlx5/mlx5_core/mlx5_main.c +++ b/sys/dev/mlx5/mlx5_core/mlx5_main.c @@ -1217,6 +1217,12 @@ goto err_fpga; } + err = mlx5_load_rl_table(dev); + if (err) { + mlx5_core_err(dev, "Failed to load rate limiting table\n"); + goto err_diag_cnt; + } + err = mlx5_register_device(dev); if (err) { mlx5_core_err(dev, "mlx5_register_device failed %d\n", err); diff --git a/sys/dev/mlx5/mlx5_core/mlx5_mcg.c b/sys/dev/mlx5/mlx5_core/mlx5_mcg.c --- a/sys/dev/mlx5/mlx5_core/mlx5_mcg.c +++ b/sys/dev/mlx5/mlx5_core/mlx5_mcg.c @@ -26,7 +26,6 @@ */ #include "opt_rss.h" -#include "opt_ratelimit.h" #include #include diff --git a/sys/dev/mlx5/mlx5_core/mlx5_mpfs.c b/sys/dev/mlx5/mlx5_core/mlx5_mpfs.c --- a/sys/dev/mlx5/mlx5_core/mlx5_mpfs.c +++ b/sys/dev/mlx5/mlx5_core/mlx5_mpfs.c @@ -26,7 +26,6 @@ */ #include "opt_rss.h" -#include "opt_ratelimit.h" #include #include diff --git a/sys/dev/mlx5/mlx5_core/mlx5_mr.c b/sys/dev/mlx5/mlx5_core/mlx5_mr.c --- a/sys/dev/mlx5/mlx5_core/mlx5_mr.c +++ b/sys/dev/mlx5/mlx5_core/mlx5_mr.c @@ -26,7 +26,6 @@ */ #include "opt_rss.h" -#include "opt_ratelimit.h" #include #include diff --git a/sys/dev/mlx5/mlx5_core/mlx5_pagealloc.c b/sys/dev/mlx5/mlx5_core/mlx5_pagealloc.c --- a/sys/dev/mlx5/mlx5_core/mlx5_pagealloc.c +++ b/sys/dev/mlx5/mlx5_core/mlx5_pagealloc.c @@ -26,7 +26,6 @@ */ #include "opt_rss.h" -#include "opt_ratelimit.h" #include #include diff --git a/sys/dev/mlx5/mlx5_core/mlx5_pd.c b/sys/dev/mlx5/mlx5_core/mlx5_pd.c --- a/sys/dev/mlx5/mlx5_core/mlx5_pd.c +++ b/sys/dev/mlx5/mlx5_core/mlx5_pd.c @@ -26,7 +26,6 @@ */ #include "opt_rss.h" -#include "opt_ratelimit.h" #include #include diff --git a/sys/dev/mlx5/mlx5_core/mlx5_port.c b/sys/dev/mlx5/mlx5_core/mlx5_port.c --- a/sys/dev/mlx5/mlx5_core/mlx5_port.c +++ b/sys/dev/mlx5/mlx5_core/mlx5_port.c @@ -26,7 +26,6 @@ */ #include "opt_rss.h" -#include 
"opt_ratelimit.h" #include #include diff --git a/sys/dev/mlx5/mlx5_core/mlx5_qp.c b/sys/dev/mlx5/mlx5_core/mlx5_qp.c --- a/sys/dev/mlx5/mlx5_core/mlx5_qp.c +++ b/sys/dev/mlx5/mlx5_core/mlx5_qp.c @@ -26,7 +26,6 @@ */ #include "opt_rss.h" -#include "opt_ratelimit.h" #include #include diff --git a/sys/dev/mlx5/mlx5_core/mlx5_rl.c b/sys/dev/mlx5/mlx5_core/mlx5_rl.c --- a/sys/dev/mlx5/mlx5_core/mlx5_rl.c +++ b/sys/dev/mlx5/mlx5_core/mlx5_rl.c @@ -1,5 +1,6 @@ /*- - * Copyright (c) 2013-2017, Mellanox Technologies, Ltd. All rights reserved. + * Copyright (c) 2013-2017, Mellanox Technologies, Ltd. + * Copyright (c) 2022 NVIDIA corporation & affiliates. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -21,216 +22,331 @@ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. - * - * $FreeBSD$ */ #include "opt_rss.h" -#include "opt_ratelimit.h" #include #include + #include #include -#ifdef RATELIMIT - -/* Finds an entry where we can register the given rate - * If the rate already exists, return the entry where it is registered, - * otherwise return the first available entry. - * If the table is full, return NULL - */ -static struct mlx5_rl_entry *find_rl_entry(struct mlx5_rl_table *table, - u32 rate, u16 burst) -{ - struct mlx5_rl_entry *ret_entry = NULL; - struct mlx5_rl_entry *entry; - u16 i; - - for (i = 0; i < table->max_size; i++) { - entry = table->rl_entry + i; - if (entry->rate == rate && entry->burst == burst) - return entry; - if (ret_entry == NULL && entry->rate == 0) - ret_entry = entry; - } - - return ret_entry; -} - -static int mlx5_set_rate_limit_cmd(struct mlx5_core_dev *dev, - u32 rate, u32 burst, u16 index) +static int +mlx5_set_rate_limit_cmd(struct mlx5_core_dev *dev, u32 rate, u32 burst, u16 index) { u32 in[MLX5_ST_SZ_DW(set_rate_limit_in)] = {}; u32 out[MLX5_ST_SZ_DW(set_rate_limit_out)] = {}; MLX5_SET(set_rate_limit_in, in, opcode, MLX5_CMD_OP_SET_RATE_LIMIT); MLX5_SET(set_rate_limit_in, in, rate_limit_index, index); - MLX5_SET(set_rate_limit_in, in, rate_limit, rate); + MLX5_SET(set_rate_limit_in, in, rate_limit, rate); /* Kbit/s */ MLX5_SET(set_rate_limit_in, in, burst_upper_bound, burst); MLX5_SET(set_rate_limit_in, in, typical_packet_size, 0 /* use MTU */); - return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + return (mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out))); } -int mlx5e_query_rate_limit_cmd(struct mlx5_core_dev *dev, - u16 index, u32 *scq_handle) +static int +mlx5_query_rate_limit_cmd(struct mlx5_core_dev *dev, u16 index, u32 *scq_handle) { - int err; u32 in[MLX5_ST_SZ_DW(query_pp_rate_limit_in)] = {}; u32 out[MLX5_ST_SZ_DW(query_pp_rate_limit_out)] = {}; + int err; MLX5_SET(query_pp_rate_limit_in, in, opcode, MLX5_CMD_OP_QUERY_RATE_LIMIT); MLX5_SET(query_pp_rate_limit_in, in, rate_limit_index, index); err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); if (err) - return err; + return (err); *scq_handle = MLX5_GET(query_pp_rate_limit_out, out, pp_context.qos_handle); - return 0; + return (0); } -bool mlx5_rl_is_in_range(const struct mlx5_core_dev *dev, u32 rate, u32 burst) +bool +mlx5_rl_is_in_range(const struct mlx5_core_dev *dev, u64 rate, u32 burst) { const struct mlx5_rl_table *table = &dev->priv.rl_table; return (rate <= table->max_rate && rate >= table->min_rate && burst <= 65535); } -EXPORT_SYMBOL(mlx5_rl_is_in_range); -int 
+void
+mlx5_rl_find_rate_index(struct mlx5_core_dev *dev, u64 rate, u16 *index)
 {
 	struct mlx5_rl_table *table = &dev->priv.rl_table;
-	struct mlx5_rl_entry *entry;
-	int err = 0;
-
-	mutex_lock(&table->rl_lock);
+	u16 i;
 
-	if (!rate || !mlx5_rl_is_in_range(dev, rate, burst)) {
-		mlx5_core_err(dev, "Invalid rate: %u, should be %u to %u\n",
-		    rate, table->min_rate, table->max_rate);
-		err = -ERANGE;
-		goto out;
+	if (unlikely(table->max_size == 0)) {
+		*index = 0;
+		return;
 	}
 
-	entry = find_rl_entry(table, rate, burst);
-	if (!entry) {
-		mlx5_core_err(dev, "Max number of %u rates reached\n",
-		    table->max_size);
-		err = -ENOSPC;
-		goto out;
-	}
-	if (entry->refcount == 0xFFFFFFFFU) {
-		/* out of refcounts */
-		err = -ENOMEM;
-		goto out;
-	} else if (entry->refcount != 0) {
-		/* rate already configured */
-		entry->refcount++;
-	} else {
-		/* new rate limit */
-		err = mlx5_set_rate_limit_cmd(dev, rate, burst, entry->index);
-		if (err) {
-			mlx5_core_err(dev, "Failed configuring rate: %u (%d)\n",
-			    rate, err);
-			goto out;
+	for (i = 0;; i++) {
+		if (i == table->max_size) {
+			*index = table->rl_entry[--i].index;
+			break;
+		} else if (table->rl_entry[i].rate >= rate) {
+			*index = table->rl_entry[i].index;
+			break;
 		}
-		entry->rate = rate;
-		entry->burst = burst;
-		entry->refcount = 1;
+	}
+}
 
-		if (MLX5_CAP_QOS(dev, qos_remap_pp)) {
-			err = mlx5e_query_rate_limit_cmd(dev, entry->index, &entry->qos_handle);
-			if (err) {
-				mlx5_core_err(dev, "Failed retrieving schedule queue handle for"
-				    "SQ remap: rate: %u error:(%d)\n", rate, err);
-				entry->qos_handle = MLX5_INVALID_QUEUE_HANDLE;
+static void
+mlx5_rl_rates_sort(uint32_t *ptr, size_t num)
+{
+	size_t x;
+	size_t y;
+
+	for (x = 0; x != num; x += 2) {
+		for (y = x + 2; y != num; y += 2) {
+			if (ptr[x] > ptr[y]) {
+				uint32_t temp[2] = { ptr[y], ptr[y + 1] };
+				ptr[y] = ptr[x];
+				ptr[y + 1] = ptr[x + 1];
+				ptr[x] = temp[0];
+				ptr[x + 1] = temp[1];
 			}
-		} else
-			entry->qos_handle = MLX5_INVALID_QUEUE_HANDLE;
+		}
 	}
-	*index = entry->index;
-
-out:
-	mutex_unlock(&table->rl_lock);
-	return err;
 }
-EXPORT_SYMBOL(mlx5_rl_add_rate);
 
-void mlx5_rl_remove_rate(struct mlx5_core_dev *dev, u32 rate, u32 burst)
+#define	MLX5_RL_BURST_DEF (4 * 1600)	/* bytes */
+#define	MLX5_RL_BURST_MAX 65535		/* bytes */
+
+static SYSCTL_NODE(_hw_mlx5, OID_AUTO, rates, CTLFLAG_RW | CTLFLAG_MPSAFE,
+    0, "Rate and burst tables");
+
+bool mlx5_use_multi_sq;
+
+SYSCTL_BOOL(_hw_mlx5_rates, OID_AUTO, use_multi_sq, CTLFLAG_RDTUN | CTLFLAG_MPSAFE,
+    &mlx5_use_multi_sq, 0, "Set to force use of one SQ per available rate");
+
+/* CX-4 cards, rate and burst, logarithmic table */
+static uint32_t mlx5_adapter_byte_rates_cx4_mlx[] = {
+	135375, MLX5_RL_BURST_DEF,	/* 1,083,000 */
+	180500, MLX5_RL_BURST_DEF,	/* 1,444,000 */
+	270750, MLX5_RL_BURST_DEF,	/* 2,166,000 */
+	361000, MLX5_RL_BURST_DEF,	/* 2,888,000 */
+	541500, MLX5_RL_BURST_DEF,	/* 4,332,000 */
+	721875, MLX5_RL_BURST_DEF,	/* 5,775,000 */
+	1082875, MLX5_RL_BURST_DEF,	/* 8,663,000 */
+	1443875, MLX5_RL_BURST_DEF,	/* 11,551,000 */
+	2165750, MLX5_RL_BURST_DEF,	/* 17,326,000 */
+	2887750, MLX5_RL_BURST_DEF,	/* 23,102,000 */
+	4331625, MLX5_RL_BURST_DEF,	/* 34,653,000 */
+	5775500, MLX5_RL_BURST_DEF,	/* 46,204,000 */
+	8663125, MLX5_RL_BURST_DEF,	/* 69,305,000 */
+	0, 0				/* END */
+};
+
+static int
+mlx5_sysctl_rates_cx4(SYSCTL_HANDLER_ARGS)
 {
-	struct mlx5_rl_table *table = &dev->priv.rl_table;
-	struct mlx5e_rl_entry *entry = NULL;
+	const size_t size = sizeof(mlx5_adapter_byte_rates_cx4_mlx);
+	size_t i;
+	size_t j;
+	int err;
 
-	/* 0 is a reserved value for unlimited rate */
-	if (rate == 0)
-		return;
+	err = SYSCTL_OUT(req, mlx5_adapter_byte_rates_cx4_mlx, size);
+	if (err || !req->newptr)
+		goto done;
+
+	err = SYSCTL_IN(req, mlx5_adapter_byte_rates_cx4_mlx, size);
 
-	mutex_lock(&table->rl_lock);
-	entry = find_rl_entry(table, rate, burst);
-	if (!entry || !entry->refcount) {
-		mlx5_core_warn(dev, "Rate %u is not configured\n", rate);
-		goto out;
+	/* always zero pad two last entries */
+	mlx5_adapter_byte_rates_cx4_mlx[ARRAY_SIZE(mlx5_adapter_byte_rates_cx4_mlx) - 2] = 0;
+	mlx5_adapter_byte_rates_cx4_mlx[ARRAY_SIZE(mlx5_adapter_byte_rates_cx4_mlx) - 1] = 0;
+
+	/* find end of rates */
+	for (i = 0; i != ARRAY_SIZE(mlx5_adapter_byte_rates_cx4_mlx); i += 2) {
+		if (mlx5_adapter_byte_rates_cx4_mlx[i] == 0) {
+			/* zero rest of array, if any */
+			for (j = i + 2; j != ARRAY_SIZE(mlx5_adapter_byte_rates_cx4_mlx); j += 2) {
+				mlx5_adapter_byte_rates_cx4_mlx[j] = 0;
+				mlx5_adapter_byte_rates_cx4_mlx[j + 1] = 0;
+			}
+			break;
+		}
 	}
 
-	entry->refcount--;
-	if (!entry->refcount) {
-		/* need to remove rate */
-		mlx5_set_rate_limit_cmd(dev, 0, 0, entry->index);
-		entry->rate = 0;
-		entry->burst = 0;
+	/* sort rates */
+	mlx5_rl_rates_sort(mlx5_adapter_byte_rates_cx4_mlx, i);
+done:
+	return (err);
+}
+SYSCTL_PROC(_hw_mlx5_rates, OID_AUTO, cx4, CTLTYPE_U32 | CTLFLAG_RDTUN | CTLFLAG_MPSAFE,
+    NULL, 0, mlx5_sysctl_rates_cx4, "IU",
+    "Rate and burst table in bytes[/s] for CX-4, zero terminated");
+
+/* CX-5 cards and newer, rate and burst, logarithmic table */
+static uint32_t mlx5_adapter_byte_rates_cx5_mlx[] = {
+	135375, MLX5_RL_BURST_DEF,	/* 1,083,000 bits/s */
+	191447, MLX5_RL_BURST_DEF,	/* 1,531,576 bits/s */
+	270745, MLX5_RL_BURST_DEF,	/* 2,165,960 bits/s */
+	382888, MLX5_RL_BURST_DEF,	/* 3,063,104 bits/s */
+	541481, MLX5_RL_BURST_DEF,	/* 4,331,848 bits/s */
+	765764, MLX5_RL_BURST_DEF,	/* 6,126,112 bits/s */
+	1082945, MLX5_RL_BURST_DEF,	/* 8,663,560 bits/s */
+	1531503, MLX5_RL_BURST_DEF,	/* 12,252,024 bits/s */
+	2165854, MLX5_RL_BURST_DEF,	/* 17,326,832 bits/s */
+	3062954, MLX5_RL_BURST_DEF,	/* 24,503,632 bits/s */
+	4331635, MLX5_RL_BURST_DEF,	/* 34,653,080 bits/s */
+	6125805, MLX5_RL_BURST_DEF,	/* 49,006,440 bits/s */
+	8663125, MLX5_RL_BURST_DEF,	/* 69,305,000 bits/s */
+	12251405, MLX5_RL_BURST_DEF,	/* 98,011,240 bits/s */
+	17325958, MLX5_RL_BURST_DEF,	/* 138,607,664 bits/s */
+	24502399, MLX5_RL_BURST_DEF,	/* 196,019,192 bits/s */
+	34651333, MLX5_RL_BURST_DEF,	/* 277,210,664 bits/s */
+	49003973, MLX5_RL_BURST_DEF,	/* 392,031,784 bits/s */
+	69301500, MLX5_RL_BURST_DEF,	/* 554,412,000 bits/s */
+	98006296, MLX5_RL_BURST_DEF,	/* 784,050,368 bits/s */
+	138600667, MLX5_RL_BURST_DEF,	/* 1,108,805,336 bits/s */
+	196009293, MLX5_RL_BURST_DEF,	/* 1,568,074,344 bits/s */
+	277196668, MLX5_RL_BURST_DEF,	/* 2,217,573,344 bits/s */
+	392011989, MLX5_RL_BURST_DEF,	/* 3,136,095,912 bits/s */
+	554384005, MLX5_RL_BURST_DEF,	/* 4,435,072,040 bits/s */
+	784010780, MLX5_RL_BURST_DEF,	/* 6,272,086,240 bits/s */
+	1108749347, MLX5_RL_BURST_DEF,	/* 8,869,994,776 bits/s */
+	1567995167, MLX5_RL_BURST_DEF,	/* 12,543,961,336 bits/s */
+	2217461368, MLX5_RL_BURST_DEF,	/* 17,739,690,944 bits/s */
+	3135937547, MLX5_RL_BURST_DEF,	/* 25,087,500,376 bits/s */
+	0, 0				/* END */
+};
+
+static int
+mlx5_sysctl_rates_cx5(SYSCTL_HANDLER_ARGS)
+{
+	const size_t size = sizeof(mlx5_adapter_byte_rates_cx5_mlx);
+	size_t i;
+	size_t j;
+	int err;
+
+	err = SYSCTL_OUT(req, mlx5_adapter_byte_rates_cx5_mlx, size);
+	if (err || !req->newptr)
+		goto done;
+
+	err = SYSCTL_IN(req, mlx5_adapter_byte_rates_cx5_mlx, size);
+
+	/* always zero pad two last entries */
+	mlx5_adapter_byte_rates_cx5_mlx[ARRAY_SIZE(mlx5_adapter_byte_rates_cx5_mlx) - 2] = 0;
+	mlx5_adapter_byte_rates_cx5_mlx[ARRAY_SIZE(mlx5_adapter_byte_rates_cx5_mlx) - 1] = 0;
+
+	/* find end of rates */
+	for (i = 0; i != ARRAY_SIZE(mlx5_adapter_byte_rates_cx5_mlx); i += 2) {
+		if (mlx5_adapter_byte_rates_cx5_mlx[i] == 0) {
+			/* zero rest of array, if any */
+			for (j = i + 2; j != ARRAY_SIZE(mlx5_adapter_byte_rates_cx5_mlx); j += 2) {
+				mlx5_adapter_byte_rates_cx5_mlx[j] = 0;
+				mlx5_adapter_byte_rates_cx5_mlx[j + 1] = 0;
+			}
+			break;
+		}
 	}
 
-out:
-	mutex_unlock(&table->rl_lock);
+	/* sort rates */
+	mlx5_rl_rates_sort(mlx5_adapter_byte_rates_cx5_mlx, i);
+done:
+	return (err);
 }
-EXPORT_SYMBOL(mlx5_rl_remove_rate);
+SYSCTL_PROC(_hw_mlx5_rates, OID_AUTO, cx5, CTLTYPE_U32 | CTLFLAG_RDTUN | CTLFLAG_MPSAFE,
+    NULL, 0, mlx5_sysctl_rates_cx5, "IU",
+    "Rate and burst table in bytes[/s] for CX-5 and newer, zero terminated");
 
-int mlx5_init_rl_table(struct mlx5_core_dev *dev)
+int
+mlx5_init_rl_table(struct mlx5_core_dev *dev)
 {
 	struct mlx5_rl_table *table = &dev->priv.rl_table;
-	int i;
+	uint32_t *prate;
+	u16 num;
+	u16 i;
 
-	mutex_init(&table->rl_lock);
-	if (!MLX5_CAP_GEN(dev, qos) || !MLX5_CAP_QOS(dev, packet_pacing)) {
-		table->max_size = 0;
-		return 0;
-	}
+	if (!MLX5_CAP_GEN(dev, qos) || !MLX5_CAP_QOS(dev, packet_pacing) ||
+	    MLX5_CAP_QOS(dev, packet_pacing_rate_table_size) < 14)
+		return (0);
+
+	/* one entry is reserved for unlimited traffic */
+	num = MLX5_CAP_QOS(dev, packet_pacing_rate_table_size) - 1;
+	if (num == 13)
+		prate = mlx5_adapter_byte_rates_cx4_mlx;
+	else
+		prate = mlx5_adapter_byte_rates_cx5_mlx;
 
-	/* First entry is reserved for unlimited rate */
-	table->max_size = MLX5_CAP_QOS(dev, packet_pacing_rate_table_size) - 1;
-	table->max_rate = MLX5_CAP_QOS(dev, packet_pacing_max_rate);
-	table->min_rate = MLX5_CAP_QOS(dev, packet_pacing_min_rate);
+	/* get maximum and minimum rates in bytes/s */
+	table->max_rate = MLX5_CAP_QOS(dev, packet_pacing_max_rate) * (1000ULL / 8ULL);
+	table->min_rate = MLX5_CAP_QOS(dev, packet_pacing_min_rate) * (1000ULL / 8ULL);
 
-	table->rl_entry = kcalloc(table->max_size, sizeof(struct mlx5_rl_entry),
-	    GFP_KERNEL);
-	if (!table->rl_entry)
-		return -ENOMEM;
+	/* allocate full sized table */
+	table->rl_entry = kcalloc(num, sizeof(struct mlx5_rl_entry), GFP_KERNEL);
+	if (table->rl_entry == NULL)
+		return (-ENOMEM);
 
-	/* The index represents the index in HW rate limit table
-	 * Index 0 is reserved for unlimited rate
-	 */
-	for (i = 0; i < table->max_size; i++)
+	/* skip too small rates, if any */
+	while (*prate != 0 && *prate < table->min_rate)
+		prate += 2;
+
+	/* fill and configure entries until end is reached */
+	for (i = 0; *prate != 0 && *prate <= table->max_rate && i != num; i++, prate += 2) {
 		table->rl_entry[i].index = i + 1;
+		table->rl_entry[i].rate = prate[0];
+		if (prate[1] > MLX5_RL_BURST_MAX)
+			table->rl_entry[i].burst = MLX5_RL_BURST_MAX;
+		else
+			table->rl_entry[i].burst = prate[1];
+	}
 
-	return 0;
+	table->max_size = i;
+	return (0);
 }
 
-void mlx5_cleanup_rl_table(struct mlx5_core_dev *dev)
+int
+mlx5_load_rl_table(struct mlx5_core_dev *dev)
 {
 	struct mlx5_rl_table *table = &dev->priv.rl_table;
-	int i;
+	int err;
+	u16 i;
 
-	/* Clear all configured rates */
-	for (i = 0; i < table->max_size; i++)
-		if (table->rl_entry[i].rate)
-			mlx5_set_rate_limit_cmd(dev, 0, 0,
-			    table->rl_entry[i].index);
+	for (i = 0; i != table->max_size; i++) {
+		err = mlx5_set_rate_limit_cmd(dev,
+		    table->rl_entry[i].rate / (1000ULL / 8ULL),
+		    table->rl_entry[i].burst, i + 1);
+		if (err)
+			goto error;
 
-	kfree(dev->priv.rl_table.rl_entry);
+		if (MLX5_CAP_QOS(dev, qos_remap_pp)) {
+			err = mlx5_query_rate_limit_cmd(dev, i + 1,
+			    &table->rl_entry[i].qos_handle);
+			if (err)
+				goto error;
+		} else {
+			table->rl_entry[i].qos_handle =
+			    MLX5_INVALID_QUEUE_HANDLE;
+		}
+	}
+	return (0);
+
+error:
+	return (err);
 }
-#endif
+
+void
+mlx5_cleanup_rl_table(struct mlx5_core_dev *dev)
+{
+	struct mlx5_rl_table *table = &dev->priv.rl_table;
+	u16 i;
+
+	/* Clear all configured rates, if any. */
+	for (i = 0; i != table->max_size; i++) {
+		if (table->rl_entry[i].rate == 0)
+			continue;
+		mlx5_set_rate_limit_cmd(dev, 0, 0,
+		    table->rl_entry[i].index);
+	}
+
+	kfree(dev->priv.rl_table.rl_entry);
+}
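The two SYSCTL_PROC handlers above publish each rate table as an array of 32-bit words holding zero-terminated {rate in bytes/s, burst in bytes} pairs, re-sorted on every update. Below is a minimal userspace sketch (not part of the patch) for dumping the active CX-5 table; it assumes only that the OID name hw.mlx5.rates.cx5 follows from the SYSCTL_PROC declaration above. Since the OID is created with CTLFLAG_RDTUN, runtime writes are refused and changes must instead be staged as a loader tunable, for example in /boot/loader.conf, before the driver attaches.

/* rates_dump.c: print the mlx5 CX-5 rate/burst table (illustrative sketch) */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <err.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
	uint32_t *tbl;
	size_t len = 0;
	size_t i;

	/* first query returns the table size in bytes */
	if (sysctlbyname("hw.mlx5.rates.cx5", NULL, &len, NULL, 0) != 0)
		err(1, "hw.mlx5.rates.cx5");
	if ((tbl = malloc(len)) == NULL)
		err(1, "malloc");
	if (sysctlbyname("hw.mlx5.rates.cx5", tbl, &len, NULL, 0) != 0)
		err(1, "hw.mlx5.rates.cx5");

	/* entries are {bytes/s, burst} pairs; a zero rate terminates the list */
	for (i = 0; i + 1 < len / sizeof(uint32_t) && tbl[i] != 0; i += 2)
		printf("%zu: %u bytes/s, burst %u bytes\n", i / 2, tbl[i], tbl[i + 1]);
	free(tbl);
	return (0);
}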
diff --git a/sys/dev/mlx5/mlx5_core/mlx5_srq.c b/sys/dev/mlx5/mlx5_core/mlx5_srq.c
--- a/sys/dev/mlx5/mlx5_core/mlx5_srq.c
+++ b/sys/dev/mlx5/mlx5_core/mlx5_srq.c
@@ -26,7 +26,6 @@
  */
 
 #include "opt_rss.h"
-#include "opt_ratelimit.h"
 
 #include
 #include
diff --git a/sys/dev/mlx5/mlx5_core/mlx5_tls.c b/sys/dev/mlx5/mlx5_core/mlx5_tls.c
--- a/sys/dev/mlx5/mlx5_core/mlx5_tls.c
+++ b/sys/dev/mlx5/mlx5_core/mlx5_tls.c
@@ -26,7 +26,6 @@
  */
 
 #include "opt_rss.h"
-#include "opt_ratelimit.h"
 
 #include
 #include
diff --git a/sys/dev/mlx5/mlx5_core/mlx5_transobj.c b/sys/dev/mlx5/mlx5_core/mlx5_transobj.c
--- a/sys/dev/mlx5/mlx5_core/mlx5_transobj.c
+++ b/sys/dev/mlx5/mlx5_core/mlx5_transobj.c
@@ -26,7 +26,6 @@
  */
 
 #include "opt_rss.h"
-#include "opt_ratelimit.h"
 
 #include
 #include
diff --git a/sys/dev/mlx5/mlx5_core/mlx5_uar.c b/sys/dev/mlx5/mlx5_core/mlx5_uar.c
--- a/sys/dev/mlx5/mlx5_core/mlx5_uar.c
+++ b/sys/dev/mlx5/mlx5_core/mlx5_uar.c
@@ -26,7 +26,6 @@
  */
 
 #include "opt_rss.h"
-#include "opt_ratelimit.h"
 
 #include
 #include
diff --git a/sys/dev/mlx5/mlx5_core/mlx5_vport.c b/sys/dev/mlx5/mlx5_core/mlx5_vport.c
--- a/sys/dev/mlx5/mlx5_core/mlx5_vport.c
+++ b/sys/dev/mlx5/mlx5_core/mlx5_vport.c
@@ -26,7 +26,6 @@
  */
 
 #include "opt_rss.h"
-#include "opt_ratelimit.h"
 
 #include
 #include
diff --git a/sys/dev/mlx5/mlx5_core/mlx5_vsc.c b/sys/dev/mlx5/mlx5_core/mlx5_vsc.c
--- a/sys/dev/mlx5/mlx5_core/mlx5_vsc.c
+++ b/sys/dev/mlx5/mlx5_core/mlx5_vsc.c
@@ -26,7 +26,6 @@
  */
 
 #include "opt_rss.h"
-#include "opt_ratelimit.h"
 
 #include
 #include
diff --git a/sys/dev/mlx5/mlx5_core/mlx5_wq.c b/sys/dev/mlx5/mlx5_core/mlx5_wq.c
--- a/sys/dev/mlx5/mlx5_core/mlx5_wq.c
+++ b/sys/dev/mlx5/mlx5_core/mlx5_wq.c
@@ -26,7 +26,6 @@
  */
 
 #include "opt_rss.h"
-#include "opt_ratelimit.h"
 
 #include
 #include
diff --git a/sys/dev/mlx5/mlx5_en/en.h b/sys/dev/mlx5/mlx5_en/en.h
--- a/sys/dev/mlx5/mlx5_en/en.h
+++ b/sys/dev/mlx5/mlx5_en/en.h
@@ -885,7 +885,6 @@
 	u16 max_inline;
 	u8 min_inline_mode;
 	u8 min_insert_caps;
-	u32 queue_handle; /* SQ remap support */
 #define	MLX5E_INSERT_VLAN 1
 #define	MLX5E_INSERT_NON_VLAN 2
 
@@ -896,7 +895,7 @@
 } __aligned(MLX5E_CACHELINE_SIZE);
 
 static inline bool
-mlx5e_sq_has_room_for(struct mlx5e_sq *sq, u16 n)
+mlx5e_sq_has_room_for(const struct mlx5e_sq *sq, u16 n)
 {
 	u16 cc = sq->cc;
 	u16 pc = sq->pc;
@@ -904,8 +903,14 @@
 	return ((sq->wq.sz_m1 & (cc - pc)) >= n || cc == pc);
 }
 
+static inline bool
+mlx5e_sq_is_empty(const struct mlx5e_sq *sq)
+{
+	return (sq->cc == sq->pc);
+}
+
 static inline u32
-mlx5e_sq_queue_level(struct mlx5e_sq *sq)
+mlx5e_sq_queue_level(const struct mlx5e_sq *sq)
 {
 	u16 cc;
 	u16 pc;
@@ -1264,6 +1269,17 @@
 	mlx5_cq_arm(mcq, MLX5_CQ_DB_REQ_NOT, mcq->uar->map, dblock, cq->wq.cc);
 }
 
+static inline void
+mlx5e_accumulate_sq_stats(const struct mlx5e_sq_stats *src, struct mlx5e_sq_stats *dst)
+{
+	dst->tso_packets += src->tso_packets;
+	dst->tso_bytes += src->tso_bytes;
+	dst->dropped += src->dropped;
+	dst->enobuf += src->enobuf;
+	dst->defragged += src->defragged;
+	dst->csum_offload_none += src->csum_offload_none;
+}
+
 #define	mlx5e_dbg(_IGN, _priv, ...) mlx5_core_dbg((_priv)->mdev, __VA_ARGS__)
 
 extern const struct ethtool_ops mlx5e_ethtool_ops;
diff --git a/sys/dev/mlx5/mlx5_en/en_rl.h b/sys/dev/mlx5/mlx5_en/en_rl.h
--- a/sys/dev/mlx5/mlx5_en/en_rl.h
+++ b/sys/dev/mlx5/mlx5_en/en_rl.h
@@ -1,5 +1,6 @@
 /*-
- * Copyright (c) 2016 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2016 Mellanox Technologies.
+ * Copyright (c) 2022 NVIDIA corporation & affiliates.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -39,24 +40,22 @@
 
 #include
 
-#define	MLX5E_RL_MAX_WORKERS 128	/* limited by Toeplitz hash */
-#define	MLX5E_RL_MAX_TX_RATES (64 * 1024)	/* software limit */
-#define	MLX5E_RL_DEF_SQ_PER_WORKER (12 * 1024)	/* software limit */
-#define	MLX5E_RL_MAX_SQS (120 * 1024)	/* software limit */
-
 #define	MLX5E_RL_TX_COAL_USEC_DEFAULT	32
 #define	MLX5E_RL_TX_COAL_PKTS_DEFAULT	4
 #define	MLX5E_RL_TX_COAL_MODE_DEFAULT	0
 #define	MLX5E_RL_TX_COMP_FACT_DEFAULT	1
 
-#define	MLX5E_RL_WORKER_LOCK(rlw)	mtx_lock(&(rlw)->mtx)
-#define	MLX5E_RL_WORKER_UNLOCK(rlw)	mtx_unlock(&(rlw)->mtx)
+#define	MLX5E_RL_MAX_RATES(_rl) \
+	container_of(_rl, struct mlx5e_priv, rl)->mdev->priv.rl_table.max_size
+
+#define	MLX5E_RL_HEAD_LOCK(head)	mtx_lock(&(head)->mtx)
+#define	MLX5E_RL_HEAD_UNLOCK(head)	mtx_unlock(&(head)->mtx)
 
-#define	MLX5E_RL_RLOCK(rl) sx_slock(&(rl)->rl_sxlock)
-#define	MLX5E_RL_RUNLOCK(rl) sx_sunlock(&(rl)->rl_sxlock)
+#define	MLX5E_RL_CHANNEL_LOCK(chan)	mtx_lock(&(chan)->mtx)
+#define	MLX5E_RL_CHANNEL_UNLOCK(chan)	mtx_unlock(&(chan)->mtx)
 
-#define	MLX5E_RL_WLOCK(rl) sx_xlock(&(rl)->rl_sxlock)
-#define	MLX5E_RL_WUNLOCK(rl) sx_xunlock(&(rl)->rl_sxlock)
+#define	MLX5E_RL_PRIV_LOCK(rl)		sx_xlock(&(rl)->rl_sxlock)
+#define	MLX5E_RL_PRIV_UNLOCK(rl)	sx_xunlock(&(rl)->rl_sxlock)
 
 #define	MLX5E_RL_PARAMS(m) \
   m(+1, u64, tx_queue_size, "tx_queue_size", "Default send queue size") \
@@ -64,52 +63,25 @@
   m(+1, u64, tx_coalesce_pkts, "tx_coalesce_pkts", "Maximum number of TX packets to join") \
   m(+1, u64, tx_coalesce_mode, "tx_coalesce_mode", "0: EQE mode 1: CQE mode") \
   m(+1, u64, tx_completion_fact, "tx_completion_fact", "1..MAX: Completion event ratio") \
-  m(+1, u64, tx_completion_fact_max, "tx_completion_fact_max", "Maximum completion event ratio") \
-  m(+1, u64, tx_worker_threads_max, "tx_worker_threads_max", "Max number of TX worker threads") \
-  m(+1, u64, tx_worker_threads_def, "tx_worker_threads_def", "Default number of TX worker threads") \
-  m(+1, u64, tx_channels_per_worker_max, "tx_channels_per_worker_max", "Max number of TX channels per worker") \
-  m(+1, u64, tx_channels_per_worker_def, "tx_channels_per_worker_def", "Default number of TX channels per worker") \
-  m(+1, u64, tx_rates_max, "tx_rates_max", "Max number of TX rates") \
-  m(+1, u64, tx_rates_def, "tx_rates_def", "Default number of TX rates") \
-  m(+1, u64, tx_limit_min, "tx_limit_min", "Minimum TX rate in bits/s") \
-  m(+1, u64, tx_limit_max, "tx_limit_max", "Maximum TX rate in bits/s") \
-  m(+1, u64, tx_burst_size, "tx_burst_size", "Current burst size in number of packets. A value of zero means use firmware default.") \
-  m(+1, u64, tx_burst_size_max, "tx_burst_size_max", "Maximum burst size in number of packets") \
-  m(+1, u64, tx_burst_size_min, "tx_burst_size_min", "Minimum burst size in number of packets")
+  m(+1, u64, tx_completion_fact_max, "tx_completion_fact_max", "Maximum completion event ratio")
 
 #define	MLX5E_RL_PARAMS_NUM (0 MLX5E_RL_PARAMS(MLX5E_STATS_COUNT))
 
 #define	MLX5E_RL_STATS(m) \
   m(+1, u64, tx_allocate_resource_failure, "tx_allocate_resource_failure", "Number of times firmware resource allocation failed") \
-  m(+1, u64, tx_add_new_rate_failure, "tx_add_new_rate_failure", "Number of times adding a new firmware rate failed") \
   m(+1, u64, tx_modify_rate_failure, "tx_modify_rate_failure", "Number of times modifying a firmware rate failed") \
   m(+1, u64, tx_active_connections, "tx_active_connections", "Number of active connections") \
   m(+1, u64, tx_open_queues, "tx_open_queues", "Number of open TX queues") \
-  m(+1, u64, tx_available_resource_failure, "tx_available_resource_failure", "Number of times TX resources were not available")
+  m(+1, u64, tx_available_resource_failure, "tx_available_resource_failure", "Number of times TX resources were not available") \
+  m(+1, u64, tx_rate_changed_too_quickly, "tx_rate_changed_too_quickly", "Number of times TX rate was changed too quickly")
 
 #define	MLX5E_RL_STATS_NUM (0 MLX5E_RL_STATS(MLX5E_STATS_COUNT))
 
-#define	MLX5E_RL_TABLE_PARAMS(m) \
-  m(+1, u64, tx_limit_add, "tx_limit_add", "Add TX rate limit in bits/s to empty slot") \
-  m(+1, u64, tx_limit_clr, "tx_limit_clr", "Clear all TX rates in table") \
-  m(+1, u64, tx_allowed_deviation, "tx_allowed_deviation", "Relative rate deviation allowed in 1/1000") \
-  m(+1, u64, tx_allowed_deviation_min, "tx_allowed_deviation_min", "Minimum allowed rate deviation in 1/1000") \
-  m(+1, u64, tx_allowed_deviation_max, "tx_allowed_deviation_max", "Maximum allowed rate deviation in 1/1000")
-
-#define	MLX5E_RL_TABLE_PARAMS_NUM (0 MLX5E_RL_TABLE_PARAMS(MLX5E_STATS_COUNT))
-
 #define	MLX5E_RL_PARAMS_INDEX(n) \
     (__offsetof(struct mlx5e_rl_params, n) / sizeof(uint64_t))
 
 struct mlx5e_priv;
-
-/* Indicates channel's state */
-enum {
-	MLX5E_RL_ST_FREE,
-	MLX5E_RL_ST_USED,
-	MLX5E_RL_ST_MODIFY,
-	MLX5E_RL_ST_DESTROY,
-};
+struct mlx5e_rl_rate;
 
 struct mlx5e_rl_stats {
 	u64 arg [0];
@@ -119,8 +91,6 @@
 struct mlx5e_rl_params {
 	u64 arg [0];
 	MLX5E_RL_PARAMS(MLX5E_STATS_VAR)
-	u64 table_arg [0];
-	MLX5E_RL_TABLE_PARAMS(MLX5E_STATS_VAR)
 };
 
 struct mlx5e_rl_channel_param {
@@ -128,45 +98,69 @@
 	struct mlx5e_cq_param cq;
 };
 
-struct mlx5e_rl_channel {
-	struct m_snd_tag tag;
-	STAILQ_ENTRY(mlx5e_rl_channel) entry;
-	struct mlx5e_sq * volatile sq;
-	struct mlx5e_rl_worker *worker;
-	uint64_t new_rate;
-	uint64_t init_rate;
-	uint64_t last_rate;
-	uint32_t refcount;
-	uint16_t last_burst;
-	uint16_t state;
+struct mlx5e_rl_sq {
+	struct mlx5e_sq sq;
+	TAILQ_ENTRY(mlx5e_rl_sq) entry;
+	uint32_t cq_modify_flags;
+#define	MLX5_CQ_MODIFY_DESTROY -1U
+	uint32_t queue_handle;	/* SQ remap support */
+	struct mlx5e_sq *sq_next;
 };
 
-struct mlx5e_rl_worker {
+struct mlx5e_rl_head {
 	struct mtx mtx;
-	struct cv cv;
-	STAILQ_HEAD(, mlx5e_rl_channel) index_list_head;
-	STAILQ_HEAD(, mlx5e_rl_channel) process_head;
+#define	MLX5E_RL_HEAD_USED 0
+#define	MLX5E_RL_HEAD_WORK 1
+#define	MLX5E_RL_HEAD_FREE 2
+#define	MLX5E_RL_HEAD_MAX 3
+	TAILQ_HEAD(, mlx5e_rl_sq) head[MLX5E_RL_HEAD_MAX];
+	uint32_t count[MLX5E_RL_HEAD_MAX];
+	uint32_t level;
+};
+
+struct mlx5e_rl_channel {
+	struct m_snd_tag tag;
+	struct mtx mtx;
+	struct mlx5e_rl_sq *curr_sq;
+	struct mlx5e_rl_sq *next_sq;
 	struct mlx5e_priv *priv;
-	struct mlx5e_rl_channel *channels;
-	unsigned worker_done;
+	uint16_t curr_rate_index;
+	uint16_t next_rate_index;
+	uint16_t last_rate_index;
+	uint8_t irq_index;
+#define	MLX5E_RL_IRQ_INDEX_MAX 128	/* limit is exclusive, 0..127 */
 };
 
+#define	MLX5E_RL_CHAN_TO_SQ_HEAD(channel, rate_index) \
+	((channel)->priv->rl.sq_rate_head[(rate_index) - 1] + \
+	 (channel)->irq_index)
+
 struct mlx5e_rl_priv_data {
 	struct sx rl_sxlock;
+	struct mtx rl_mtx;
+	struct cv rl_cv;
 	struct sysctl_ctx_list ctx;
 	struct mlx5e_rl_channel_param chan_param;
 	struct mlx5e_rl_params param;
 	struct mlx5e_rl_stats stats;
-	struct mlx5e_rl_worker *workers;
-	struct mlx5e_priv *priv;
+	struct mlx5e_sq_stats sq_stats;
+	struct mlx5e_rl_head *sq_raw_head;
+	struct mlx5e_rl_head **sq_rate_head;
 	uint64_t *rate_limit_table;
-	unsigned opened;
+	unsigned worker_opened;
+	unsigned worker_done;
+	unsigned worker_pending;
 	uint32_t tisn;
+	uint32_t use_multi_sq;
 };
 
-int mlx5e_rl_init(struct mlx5e_priv *priv);
-void mlx5e_rl_cleanup(struct mlx5e_priv *priv);
-void mlx5e_rl_refresh_sq_inline(struct mlx5e_rl_priv_data *rl);
+int mlx5e_rl_init(struct mlx5e_priv *);
+void mlx5e_rl_cleanup(struct mlx5e_priv *);
+void mlx5e_rl_refresh_sq_inline(struct mlx5e_priv *);
+void mlx5e_rl_refresh_channel_params(struct mlx5e_priv *, uint32_t cq_modify_flags);
+void mlx5e_rl_query(struct ifnet *, struct if_ratelimit_query_results *);
+struct mlx5e_sq *mlx5e_rl_get_current_sq(struct mlx5e_rl_channel *);
+void mlx5e_rl_accumulate_sq_stats(struct mlx5e_rl_priv_data *, struct mlx5e_sq_stats *);
 if_snd_tag_alloc_t mlx5e_rl_snd_tag_alloc;
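For orientation, the sketch below (illustrative only, not part of the patch) glues together the pieces declared above: a requested rate in bytes/s is mapped onto the firmware rate table by mlx5_rl_find_rate_index() from driver.h, which selects the first table entry at or above the request, or the highest entry when the request exceeds all of them, and the resulting index plus the channel's IRQ vector then picks one of the per-rate SQ list heads via MLX5E_RL_CHAN_TO_SQ_HEAD(). Locking and error handling are omitted.

/*
 * Illustrative helper (not part of the patch): resolve a rate in
 * bytes/s to the SQ list head serving that rate on this channel.
 */
static struct mlx5e_rl_head *
example_rate_to_sq_head(struct mlx5e_rl_channel *ch, uint64_t rate_bytes_s)
{
	uint16_t index;

	/* first table rate >= request, else the highest configured rate */
	mlx5_rl_find_rate_index(ch->priv->mdev, rate_bytes_s, &index);

	/* index 0 is reserved for unlimited traffic (no pacing table) */
	if (index == 0)
		return (NULL);

	/* one list head per (rate index, IRQ vector) combination */
	return (MLX5E_RL_CHAN_TO_SQ_HEAD(ch, index));
}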
diff --git a/sys/dev/mlx5/mlx5_en/mlx5_en_ethtool.c b/sys/dev/mlx5/mlx5_en/mlx5_en_ethtool.c
--- a/sys/dev/mlx5/mlx5_en/mlx5_en_ethtool.c
+++ b/sys/dev/mlx5/mlx5_en/mlx5_en_ethtool.c
@@ -685,8 +685,10 @@
 
 	/* update inline mode */
 	mlx5e_refresh_sq_inline(priv);
+
 #ifdef RATELIMIT
-	mlx5e_rl_refresh_sq_inline(&priv->rl);
+	/* update inline mode */
+	mlx5e_rl_refresh_sq_inline(priv);
 #endif
 done:
 	PRIV_UNLOCK(priv);
@@ -1076,6 +1078,10 @@
 		/* restart network interface, if any */
 		if (was_opened)
 			mlx5e_open_locked(priv->ifp);
+#ifdef RATELIMIT
+		/* need to re-create all SQ's */
+		mlx5e_rl_refresh_channel_params(priv, MLX5_CQ_MODIFY_DESTROY);
+#endif
 		break;
 
 	case MLX5_PARAM_OFFSET(rx_coalesce_mode):
diff --git a/sys/dev/mlx5/mlx5_en/mlx5_en_iq.c b/sys/dev/mlx5/mlx5_en/mlx5_en_iq.c
--- a/sys/dev/mlx5/mlx5_en/mlx5_en_iq.c
+++ b/sys/dev/mlx5/mlx5_en/mlx5_en_iq.c
@@ -269,11 +269,12 @@
 	/* SQ remap support requires reg_umr privileges level */
 	if (MLX5_CAP_QOS(iq->priv->mdev, qos_remap_pp)) {
 		MLX5_SET(sqc, sqc, qos_remap_en, 1);
-		if (MLX5_CAP_ETH(iq->priv->mdev, reg_umr_sq))
+		if (MLX5_CAP_ETH(iq->priv->mdev, reg_umr_sq)) {
 			MLX5_SET(sqc, sqc, reg_umr, 1);
-		else
-			mlx5_en_err(iq->priv->ifp,
-			    "No reg umr SQ capability, SQ remap disabled\n");
+		} else {
+			err = -EOPNOTSUPP;
+			goto done;
+		}
 	}
 
 	MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_CYCLIC);
@@ -286,7 +287,7 @@
 	    (__be64 *) MLX5_ADDR_OF(wq, wq, pas));
 
 	err = mlx5_core_create_sq(iq->priv->mdev, in, inlen, &iq->sqn);
-
+done:
 	kvfree(in);
 
 	return (err);
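Nothing in this patch changes how applications request pacing in the first place; the kernel still allocates a rate-limit send tag behind the usual socket option. As a reminder of the entry point, a hedged sketch follows: on FreeBSD SO_MAX_PACING_RATE takes the rate in bytes per second, and the driver then maps it onto its hardware rate table as shown above.

/* request hardware pacing on a connected TCP socket (illustrative) */
#include <sys/types.h>
#include <sys/socket.h>
#include <err.h>

static void
request_pacing(int fd)
{
	/* roughly 12.5 Mbytes/s; rounded onto the hardware rate table */
	unsigned int rate = 12500000;

	if (setsockopt(fd, SOL_SOCKET, SO_MAX_PACING_RATE,
	    &rate, sizeof(rate)) != 0)
		warn("SO_MAX_PACING_RATE");
}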
diff --git a/sys/dev/mlx5/mlx5_en/mlx5_en_main.c b/sys/dev/mlx5/mlx5_en/mlx5_en_main.c
--- a/sys/dev/mlx5/mlx5_en/mlx5_en_main.c
+++ b/sys/dev/mlx5/mlx5_en/mlx5_en_main.c
@@ -1,5 +1,5 @@
 /*-
- * Copyright (c) 2015-2021 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2015-2021 Mellanox Technologies.
  * Copyright (c) 2022 NVIDIA corporation & affiliates.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -811,15 +811,10 @@
 {
 	struct mlx5_core_dev *mdev = priv->mdev;
 	struct mlx5e_vport_stats *s = &priv->stats.vport;
-	struct mlx5e_sq_stats *sq_stats;
+	struct mlx5e_sq_stats sq_stats = {};
 	u32 in[MLX5_ST_SZ_DW(query_vport_counter_in)];
 	u32 *out;
 	int outlen = MLX5_ST_SZ_BYTES(query_vport_counter_out);
-	u64 tso_packets = 0;
-	u64 tso_bytes = 0;
-	u64 tx_queue_dropped = 0;
-	u64 tx_defragged = 0;
-	u64 tx_offload_none = 0;
 	u64 lro_packets = 0;
 	u64 lro_bytes = 0;
 	u64 sw_lro_queued = 0;
@@ -859,47 +854,19 @@
 		rx_decrypted_error += rq_stats->decrypted_error_packets;
 		rx_decrypted_ok += rq_stats->decrypted_ok_packets;
 
-		for (j = 0; j < priv->num_tc; j++) {
-			sq_stats = &pch->sq[j].stats;
-
-			tso_packets += sq_stats->tso_packets;
-			tso_bytes += sq_stats->tso_bytes;
-			tx_queue_dropped += sq_stats->dropped;
-			tx_queue_dropped += sq_stats->enobuf;
-			tx_defragged += sq_stats->defragged;
-			tx_offload_none += sq_stats->csum_offload_none;
-		}
+		for (j = 0; j < priv->num_tc; j++)
+			mlx5e_accumulate_sq_stats(&pch->sq[j].stats, &sq_stats);
 	}
 #ifdef RATELIMIT
-	/* Collect statistics from all rate-limit queues */
-	for (j = 0; j < priv->rl.param.tx_worker_threads_def; j++) {
-		struct mlx5e_rl_worker *rlw = priv->rl.workers + j;
-
-		for (i = 0; i < priv->rl.param.tx_channels_per_worker_def; i++) {
-			struct mlx5e_rl_channel *channel = rlw->channels + i;
-			struct mlx5e_sq *sq = channel->sq;
-
-			if (sq == NULL)
-				continue;
-
-			sq_stats = &sq->stats;
-
-			tso_packets += sq_stats->tso_packets;
-			tso_bytes += sq_stats->tso_bytes;
-			tx_queue_dropped += sq_stats->dropped;
-			tx_queue_dropped += sq_stats->enobuf;
-			tx_defragged += sq_stats->defragged;
-			tx_offload_none += sq_stats->csum_offload_none;
-		}
-	}
+	mlx5e_rl_accumulate_sq_stats(&priv->rl, &sq_stats);
 #endif
 
 	/* update counters */
-	s->tso_packets = tso_packets;
-	s->tso_bytes = tso_bytes;
-	s->tx_queue_dropped = tx_queue_dropped;
-	s->tx_defragged = tx_defragged;
+	s->tso_packets = sq_stats.tso_packets;
+	s->tso_bytes = sq_stats.tso_bytes;
+	s->tx_queue_dropped = sq_stats.dropped + sq_stats.enobuf;
+	s->tx_defragged = sq_stats.defragged;
 	s->lro_packets = lro_packets;
 	s->lro_bytes = lro_bytes;
 	s->sw_lro_queued = sw_lro_queued;
@@ -977,7 +944,7 @@
 	    s->tx_broadcast_bytes;
 
 	/* Update calculated offload counters */
-	s->tx_csum_offload = s->tx_packets - tx_offload_none;
+	s->tx_csum_offload = s->tx_packets - sq_stats.csum_offload_none;
 	s->rx_csum_good = s->rx_packets - s->rx_csum_none;
 }
 
@@ -4396,50 +4363,6 @@
 	}
 }
 
-#ifdef RATELIMIT
-#define	NUM_HDWR_RATES_MLX 13
-static const uint64_t adapter_rates_mlx[NUM_HDWR_RATES_MLX] = {
-	135375,		/* 1,083,000 */
-	180500,		/* 1,444,000 */
-	270750,		/* 2,166,000 */
-	361000,		/* 2,888,000 */
-	541500,		/* 4,332,000 */
-	721875,		/* 5,775,000 */
-	1082875,	/* 8,663,000 */
-	1443875,	/* 11,551,000 */
-	2165750,	/* 17,326,000 */
-	2887750,	/* 23,102,000 */
-	4331625,	/* 34,653,000 */
-	5775500,	/* 46,204,000 */
-	8663125		/* 69,305,000 */
-};
-
-static void
-mlx5e_ratelimit_query(if_t ifp __unused, struct if_ratelimit_query_results *q)
-{
-	/*
-	 * This function needs updating by the driver maintainer!
-	 * For the MLX card there are currently (ConectX-4?) 13
-	 * pre-set rates and others i.e. ConnectX-5, 6, 7??
-	 *
-	 * This will change based on later adapters
-	 * and this code should be updated to look at ifp
-	 * and figure out the specific adapter type
-	 * settings i.e. how many rates as well
-	 * as if they are fixed (as is shown here) or
-	 * if they are dynamic (example chelsio t4). Also if there
-	 * is a maximum number of flows that the adapter
-	 * can handle that too needs to be updated in
-	 * the max_flows field.
-	 */
-	q->rate_table = adapter_rates_mlx;
-	q->flags = RT_IS_FIXED_TABLE;
-	q->max_flows = 0;	/* mlx has no limit */
-	q->number_of_rates = NUM_HDWR_RATES_MLX;
-	q->min_segment_burst = 1;
-}
-#endif
-
 static void
 mlx5e_ifm_add(struct mlx5e_priv *priv, int type)
 {
@@ -4528,7 +4451,7 @@
 	    IFCAP2_BIT(IFCAP2_RXTLS6), 0);
 	if_setsndtagallocfn(ifp, mlx5e_snd_tag_alloc);
 #ifdef RATELIMIT
-	if_setratelimitqueryfn(ifp, mlx5e_ratelimit_query);
+	if_setratelimitqueryfn(ifp, mlx5e_rl_query);
 #endif
 	/* set TSO limits so that we don't have to drop TX packets */
 	if_sethwtsomax(ifp, MLX5E_MAX_TX_PAYLOAD_SIZE - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN));
diff --git a/sys/dev/mlx5/mlx5_en/mlx5_en_rl.c b/sys/dev/mlx5/mlx5_en/mlx5_en_rl.c
--- a/sys/dev/mlx5/mlx5_en/mlx5_en_rl.c
+++ b/sys/dev/mlx5/mlx5_en/mlx5_en_rl.c
@@ -1,5 +1,6 @@
 /*-
- * Copyright (c) 2016-2020 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2016-2020 Mellanox Technologies.
+ * Copyright (c) 2022 NVIDIA corporation & affiliates.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -32,15 +33,14 @@
 
 #ifdef RATELIMIT
 
-static int mlx5e_rl_open_workers(struct mlx5e_priv *);
-static void mlx5e_rl_close_workers(struct mlx5e_priv *);
-static int mlx5e_rl_sysctl_show_rate_table(SYSCTL_HANDLER_ARGS);
+static void mlx5e_rl_open_worker(struct mlx5e_priv *);
+static void mlx5e_rl_close_worker(struct mlx5e_priv *);
 static void mlx5e_rl_sysctl_add_u64_oid(struct mlx5e_rl_priv_data *, unsigned x,
     struct sysctl_oid *, const char *name, const char *desc);
 static void mlx5e_rl_sysctl_add_stats_u64_oid(struct mlx5e_rl_priv_data *rl, unsigned x,
     struct sysctl_oid *node, const char *name, const char *desc);
-static int mlx5e_rl_tx_limit_add(struct mlx5e_rl_priv_data *, uint64_t value);
-static int mlx5e_rl_tx_limit_clr(struct mlx5e_rl_priv_data *, uint64_t value);
+static void mlx5e_rl_worker_wakeup(struct mlx5e_rl_priv_data *);
+static int mlx5e_rl_modify_sq(struct mlx5e_sq *, uint16_t rl_index);
 static if_snd_tag_modify_t mlx5e_rl_snd_tag_modify;
 static if_snd_tag_query_t mlx5e_rl_snd_tag_query;
 static if_snd_tag_free_t mlx5e_rl_snd_tag_free;
@@ -53,38 +53,38 @@
 };
 
 static void
-mlx5e_rl_build_sq_param(struct mlx5e_rl_priv_data *rl,
+mlx5e_rl_build_sq_param(struct mlx5e_priv *priv,
     struct mlx5e_sq_param *param)
 {
 	void *sqc = param->sqc;
 	void *wq = MLX5_ADDR_OF(sqc, sqc, wq);
-	uint8_t log_sq_size = order_base_2(rl->param.tx_queue_size);
+	uint8_t log_sq_size = order_base_2(priv->rl.param.tx_queue_size);
 
 	MLX5_SET(wq, wq, log_wq_sz, log_sq_size);
 	MLX5_SET(wq, wq, log_wq_stride, ilog2(MLX5_SEND_WQE_BB));
-	MLX5_SET(wq, wq, pd, rl->priv->pdn);
+	MLX5_SET(wq, wq, pd, priv->pdn);
 
 	param->wq.linear = 1;
 }
 
 static void
-mlx5e_rl_build_cq_param(struct mlx5e_rl_priv_data *rl,
+mlx5e_rl_build_cq_param(struct mlx5e_priv *priv,
     struct mlx5e_cq_param *param)
 {
 	void *cqc = param->cqc;
-	uint8_t log_sq_size = order_base_2(rl->param.tx_queue_size);
+	uint8_t log_sq_size = order_base_2(priv->rl.param.tx_queue_size);
 
 	MLX5_SET(cqc, cqc, log_cq_size, log_sq_size);
-	MLX5_SET(cqc, cqc, cq_period, rl->param.tx_coalesce_usecs);
-	MLX5_SET(cqc, cqc, cq_max_count, rl->param.tx_coalesce_pkts);
-	MLX5_SET(cqc, cqc, uar_page, rl->priv->mdev->priv.uar->index);
+	MLX5_SET(cqc, cqc, cq_period, priv->rl.param.tx_coalesce_usecs);
+	MLX5_SET(cqc, cqc, cq_max_count, priv->rl.param.tx_coalesce_pkts);
+	MLX5_SET(cqc, cqc, uar_page, priv->mdev->priv.uar->index);
 
-	switch (rl->param.tx_coalesce_mode) {
+	switch (priv->rl.param.tx_coalesce_mode) {
 	case 0:
 		MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE);
 		break;
 	default:
-		if (MLX5_CAP_GEN(rl->priv->mdev, cq_period_start_from_cqe))
+		if (MLX5_CAP_GEN(priv->mdev, cq_period_start_from_cqe))
 			MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_CQE);
 		else
 			MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE);
@@ -93,13 +93,13 @@
 }
 
 static void
-mlx5e_rl_build_channel_param(struct mlx5e_rl_priv_data *rl,
+mlx5e_rl_build_channel_param(struct mlx5e_priv *priv,
     struct mlx5e_rl_channel_param *cparam)
 {
 	memset(cparam, 0, sizeof(*cparam));
 
-	mlx5e_rl_build_sq_param(rl, &cparam->sq);
-	mlx5e_rl_build_cq_param(rl, &cparam->cq);
+	mlx5e_rl_build_sq_param(priv, &cparam->sq);
+	mlx5e_rl_build_cq_param(priv, &cparam->cq);
 }
 
 static int
@@ -157,111 +157,132 @@
 static void
 mlx5e_rl_destroy_sq(struct mlx5e_sq *sq)
 {
-	mlx5e_free_sq_db(sq);
 	mlx5_wq_destroy(&sq->wq_ctrl);
 	bus_dma_tag_destroy(sq->dma_tag);
 }
 
 static int
-mlx5e_rl_query_sq(struct mlx5e_sq *sq)
+mlx5e_rl_query_sq(struct mlx5e_rl_sq *sq)
 {
 	void *out;
-	int inlen;
-	int err;
-
-	inlen = MLX5_ST_SZ_BYTES(query_sq_out);
-	out = mlx5_vzalloc(inlen);
-	if (!out)
-		return -ENOMEM;
+	int iolen;
+	int err;
 
-	err = mlx5_core_query_sq(sq->priv->mdev, sq->sqn, out);
-	if (err)
-		goto out;
+	iolen = MLX5_ST_SZ_BYTES(query_sq_out);
+	out = mlx5_vzalloc(iolen);
+	if (!out)
+		return (-ENOMEM);
 
-	sq->queue_handle = MLX5_GET(query_sq_out, out, sq_context.queue_handle);
+	err = mlx5_core_query_sq(sq->sq.priv->mdev, sq->sq.sqn, out);
+	if (err)
+		goto out;
 
+	sq->queue_handle = MLX5_GET(query_sq_out, out, sq_context.queue_handle);
 out:
-	kvfree(out);
-	return err;
+	kvfree(out);
+	return (err);
 }
 
 static int
-mlx5e_rl_open_sq(struct mlx5e_priv *priv, struct mlx5e_sq *sq,
-    struct mlx5e_sq_param *param, int ix)
+mlx5e_rl_open_sq(struct mlx5e_priv *priv, struct mlx5e_rl_sq *sq,
+    struct mlx5e_sq_param *param, int ix, int rl_index)
 {
 	int err;
 
-	err = mlx5e_rl_create_sq(priv, sq, param, ix);
+	err = mlx5e_rl_create_sq(priv, &sq->sq, param, ix);
 	if (err)
 		return (err);
 
-	err = mlx5e_enable_sq(sq, param, &priv->channel[ix].bfreg, priv->rl.tisn);
+	err = mlx5e_enable_sq(&sq->sq, param, &priv->channel[ix].bfreg, priv->rl.tisn);
 	if (err)
 		goto err_destroy_sq;
 
-	err = mlx5e_modify_sq(sq, MLX5_SQC_STATE_RST, MLX5_SQC_STATE_RDY);
+	err = mlx5e_modify_sq(&sq->sq, MLX5_SQC_STATE_RST, MLX5_SQC_STATE_RDY);
 	if (err)
 		goto err_disable_sq;
 
-	if (MLX5_CAP_QOS(priv->mdev, qos_remap_pp)) {
+	if (priv->rl.use_multi_sq == 0) {
 		err = mlx5e_rl_query_sq(sq);
-		if (err) {
-			mlx5_en_err(priv->ifp, "Failed retrieving send queue handle for"
-			    "SQ remap - sqn=%u, err=(%d)\n", sq->sqn, err);
-			sq->queue_handle = MLX5_INVALID_QUEUE_HANDLE;
-		}
-	} else
+		if (err)
+			goto err_query_sq;
+	} else {
 		sq->queue_handle = MLX5_INVALID_QUEUE_HANDLE;
+	}
+
+	err = mlx5e_rl_modify_sq(&sq->sq, rl_index);
+	if (err)
+		goto err_query_sq;
 
-	WRITE_ONCE(sq->running, 1);
+	WRITE_ONCE(sq->sq.running, 1);
 
 	return (0);
 
+err_query_sq:
+	mlx5e_modify_sq(&sq->sq, MLX5_SQC_STATE_RDY, MLX5_SQC_STATE_ERR);
 err_disable_sq:
-	mlx5e_disable_sq(sq);
+	mlx5e_disable_sq(&sq->sq);
 err_destroy_sq:
-	mlx5e_rl_destroy_sq(sq);
+	mlx5e_rl_destroy_sq(&sq->sq);
 
 	return (err);
 }
 
 static void
-mlx5e_rl_chan_mtx_init(struct mlx5e_priv *priv, struct mlx5e_sq *sq)
+mlx5e_rl_tx_cq_multi_comp(struct mlx5_core_cq *mcq, struct mlx5_eqe *eqe __unused)
 {
-	mtx_init(&sq->lock, "mlx5tx-rl", NULL, MTX_DEF);
-	mtx_init(&sq->comp_lock, "mlx5comp-rl", NULL, MTX_DEF);
+	struct mlx5e_rl_sq *rl_sq;
+	struct mlx5e_sq *sq_next;
 
-	callout_init_mtx(&sq->cev_callout, &sq->lock, 0);
+	mlx5e_tx_cq_comp(mcq, eqe);
 
-	sq->cev_factor = priv->rl.param.tx_completion_fact;
+	rl_sq = container_of(mcq, struct mlx5e_rl_sq, sq.cq.mcq);
 
-	/* ensure the TX completion event factor is not zero */
-	if (sq->cev_factor == 0)
-		sq->cev_factor = 1;
+	mtx_lock(&rl_sq->sq.comp_lock);
+	if (mlx5e_sq_is_empty(&rl_sq->sq)) {
+		sq_next = rl_sq->sq_next;
+		rl_sq->sq_next = NULL;
+	} else {
+		sq_next = NULL;
+	}
+	mtx_unlock(&rl_sq->sq.comp_lock);
+
+	if (unlikely(sq_next != NULL)) {
+		mtx_lock(&sq_next->lock);
+		mlx5e_tx_notify_hw(sq_next, true);
+		mtx_unlock(&sq_next->lock);
+	}
 }
 
 static int
-mlx5e_rl_open_channel(struct mlx5e_rl_worker *rlw, int eq_ix,
+mlx5e_rl_alloc_sq(struct mlx5e_priv *priv, int eq_ix, int rl_index,
     struct mlx5e_rl_channel_param *cparam,
-    struct mlx5e_sq *volatile *ppsq)
+    struct mlx5e_rl_sq * volatile *ppsq)
 {
-	struct mlx5e_priv *priv = rlw->priv;
-	struct mlx5e_sq *sq;
+	struct mlx5e_rl_sq *sq;
 	int err;
 
 	sq = malloc(sizeof(*sq), M_MLX5EN, M_WAITOK | M_ZERO);
 
 	/* init mutexes */
-	mlx5e_rl_chan_mtx_init(priv, sq);
+	mtx_init(&sq->sq.lock, "mlx5e-rl-tx", NULL, MTX_DEF);
+	mtx_init(&sq->sq.comp_lock, "mlx5e-rl-comp", NULL, MTX_DEF);
+
+	callout_init_mtx(&sq->sq.cev_callout, &sq->sq.lock, 0);
+
+	sq->sq.cev_factor = priv->rl.param.tx_completion_fact;
+
+	/* ensure the TX completion event factor is not zero */
+	if (sq->sq.cev_factor == 0)
+		sq->sq.cev_factor = 1;
 
 	/* open TX completion queue */
-	err = mlx5e_open_cq(priv, &cparam->cq, &sq->cq,
-	    &mlx5e_tx_cq_comp, eq_ix);
+	err = mlx5e_open_cq(priv, &cparam->cq, &sq->sq.cq,
+	    priv->rl.use_multi_sq ? &mlx5e_rl_tx_cq_multi_comp : &mlx5e_tx_cq_comp, eq_ix);
 	if (err)
 		goto err_free;
 
-	err = mlx5e_rl_open_sq(priv, sq, &cparam->sq, eq_ix);
+	err = mlx5e_rl_open_sq(priv, sq, &cparam->sq, eq_ix, rl_index);
 	if (err)
 		goto err_close_tx_cq;
 
@@ -269,44 +290,51 @@
 	*ppsq = sq;
 
 	/* poll TX queue initially */
-	sq->cq.mcq.comp(&sq->cq.mcq, NULL);
+	sq->sq.cq.mcq.comp(&sq->sq.cq.mcq, NULL);
+
+	atomic_add_64(&priv->rl.stats.tx_open_queues, 1ULL);
 
 	return (0);
 
 err_close_tx_cq:
-	mlx5e_close_cq(&sq->cq);
+	mlx5e_close_cq(&sq->sq.cq);
 
 err_free:
 	/* destroy mutexes */
-	mtx_destroy(&sq->lock);
-	mtx_destroy(&sq->comp_lock);
+	mtx_destroy(&sq->sq.lock);
+	mtx_destroy(&sq->sq.comp_lock);
 	free(sq, M_MLX5EN);
 	atomic_add_64(&priv->rl.stats.tx_allocate_resource_failure, 1ULL);
 	return (err);
 }
 
 static void
-mlx5e_rl_close_channel(struct mlx5e_sq *volatile *ppsq)
+mlx5e_rl_reset_sq_stats(struct mlx5e_rl_sq *sq)
 {
-	struct mlx5e_sq *sq = *ppsq;
+	/* store statistics for future */
+	mlx5e_accumulate_sq_stats(&sq->sq.stats, &sq->sq.priv->rl.sq_stats);
 
-	/* check if channel is already closed */
-	if (sq == NULL)
-		return;
-	/* ensure channel pointer is no longer used */
-	*ppsq = NULL;
+	/* clear statistics */
+	memset(&sq->sq.stats, 0, sizeof(sq->sq.stats));
+}
 
+static void
+mlx5e_rl_free_sq(struct mlx5e_rl_sq *sq)
+{
 	/* teardown and destroy SQ */
-	mlx5e_drain_sq(sq);
-	mlx5e_disable_sq(sq);
-	mlx5e_rl_destroy_sq(sq);
+	mlx5e_drain_sq(&sq->sq);
+	mlx5e_disable_sq(&sq->sq);
+	mlx5e_rl_destroy_sq(&sq->sq);
 
 	/* close CQ */
-	mlx5e_close_cq(&sq->cq);
+	mlx5e_close_cq(&sq->sq.cq);
 
 	/* destroy mutexes */
-	mtx_destroy(&sq->lock);
-	mtx_destroy(&sq->comp_lock);
+	mtx_destroy(&sq->sq.lock);
+	mtx_destroy(&sq->sq.comp_lock);
+
+	atomic_add_64(&sq->sq.priv->rl.stats.tx_open_queues, -1ULL);
+
+	mlx5e_rl_reset_sq_stats(sq);
 
 	free(sq, M_MLX5EN);
 }
@@ -378,57 +406,23 @@
 	return (err);
 }
 
-/*
- * This function will search the configured rate limit table for the
- * best match to avoid that a single socket based application can
- * allocate all the available hardware rates. If the user selected
- * rate deviates too much from the closes rate available in the rate
- * limit table, unlimited rate will be selected.
- */ -static uint64_t -mlx5e_rl_find_best_rate_locked(struct mlx5e_rl_priv_data *rl, uint64_t user_rate) -{ - uint64_t distance = -1ULL; - uint64_t diff; - uint64_t retval = 0; /* unlimited */ - uint64_t x; - - /* search for closest rate */ - for (x = 0; x != rl->param.tx_rates_def; x++) { - uint64_t rate = rl->rate_limit_table[x]; - if (rate == 0) - continue; - - if (rate > user_rate) - diff = rate - user_rate; - else - diff = user_rate - rate; - - /* check if distance is smaller than previous rate */ - if (diff < distance) { - distance = diff; - retval = rate; - } - } - - /* range check for multiplication below */ - if (user_rate > rl->param.tx_limit_max) - user_rate = rl->param.tx_limit_max; +#define MLX5E_RL_POST_SQ_REMAP_DS_CNT \ + DIV_ROUND_UP(sizeof(struct mlx5e_tx_qos_remap_wqe), MLX5_SEND_WQE_DS) +#define MLX5E_RL_POST_SQ_REMAP_WQEBB_CNT \ + DIV_ROUND_UP(MLX5E_RL_POST_SQ_REMAP_DS_CNT, MLX5_SEND_WQEBB_NUM_DS) - /* fallback to unlimited, if rate deviates too much */ - if (distance > howmany(user_rate * - rl->param.tx_allowed_deviation, 1000ULL)) - retval = 0; +CTASSERT(MLX5E_RL_POST_SQ_REMAP_WQEBB_CNT == 1); - return (retval); +static void +mlx5e_rl_post_sq_remap_wqe_callback(void *arg) +{ + m_snd_tag_rele(arg); } static int -mlx5e_rl_post_sq_remap_wqe(struct mlx5e_iq *iq, u32 scq_handle, u32 sq_handle, - struct mlx5e_rl_channel *sq_channel) +mlx5e_rl_post_sq_remap_wqe(struct mlx5e_iq *iq, struct mlx5e_rl_channel *channel, + u32 scq_handle, u32 sq_handle) { - const u32 ds_cnt = DIV_ROUND_UP(sizeof(struct mlx5e_tx_qos_remap_wqe), - MLX5_SEND_WQE_DS); struct mlx5e_tx_qos_remap_wqe *wqe; int pi; @@ -447,16 +441,19 @@ wqe->ctrl.opmod_idx_opcode = cpu_to_be32((iq->pc << 8) | MLX5_OPCODE_QOS_REMAP); - wqe->ctrl.qpn_ds = cpu_to_be32((iq->sqn << 8) | ds_cnt); + wqe->ctrl.qpn_ds = cpu_to_be32((iq->sqn << 8) | + MLX5E_RL_POST_SQ_REMAP_DS_CNT); wqe->ctrl.imm = cpu_to_be32(iq->priv->tisn[0] << 8); wqe->ctrl.fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE | MLX5_FENCE_MODE_INITIATOR_SMALL; /* copy data for doorbell */ memcpy(iq->doorbell.d32, &wqe->ctrl, sizeof(iq->doorbell.d32)); - iq->data[pi].num_wqebbs = DIV_ROUND_UP(ds_cnt, MLX5_SEND_WQEBB_NUM_DS); - iq->data[pi].p_refcount = &sq_channel->refcount; - atomic_add_int(iq->data[pi].p_refcount, 1); + iq->data[pi].num_wqebbs = MLX5E_RL_POST_SQ_REMAP_WQEBB_CNT; + iq->data[pi].callback = &mlx5e_rl_post_sq_remap_wqe_callback; + iq->data[pi].arg = m_snd_tag_ref(&channel->tag); + iq->data[pi].p_refcount = NULL; + iq->pc += iq->data[pi].num_wqebbs; mlx5e_iq_notify_hw(iq); @@ -467,139 +464,125 @@ } static int -mlx5e_rl_remap_sq(struct mlx5e_sq *sq, uint16_t index, - struct mlx5e_rl_channel *sq_channel) +mlx5e_rl_remap_sq(struct mlx5e_rl_sq *sq, struct mlx5e_rl_channel *channel, uint16_t index) { struct mlx5e_channel *iq_channel; + struct mlx5e_priv *priv; u32 scq_handle; u32 sq_handle; int error; + priv = sq->sq.priv; + /* Specific SQ remap operations should be handled by same IQ */ - iq_channel = &sq->priv->channel[sq->sqn % sq->priv->params.num_channels]; + iq_channel = &priv->channel[channel->irq_index % priv->params.num_channels]; sq_handle = sq->queue_handle; - scq_handle = mlx5_rl_get_scq_handle(sq->priv->mdev, index); + scq_handle = mlx5_rl_get_scq_handle(priv->mdev, index); - if (sq_handle == MLX5_INVALID_QUEUE_HANDLE || - scq_handle == MLX5_INVALID_QUEUE_HANDLE) + if (unlikely(sq_handle == MLX5_INVALID_QUEUE_HANDLE || + scq_handle == MLX5_INVALID_QUEUE_HANDLE)) { error = -1; - else - error = mlx5e_rl_post_sq_remap_wqe(&iq_channel->iq, scq_handle, - 
sq_handle, sq_channel); - + } else { + error = mlx5e_rl_post_sq_remap_wqe(&iq_channel->iq, + channel, scq_handle, sq_handle); + } return (error); } -/* - * This function sets the requested rate for a rate limit channel, in - * bits per second. The requested rate will be filtered through the - * find best rate function above. - */ -static int -mlx5e_rlw_channel_set_rate_locked(struct mlx5e_rl_worker *rlw, - struct mlx5e_rl_channel *channel, uint64_t rate) +static struct mlx5e_rl_sq * +mlx5e_rl_dequeue_sq(struct mlx5e_rl_head *phead, unsigned which) { - struct mlx5e_rl_priv_data *rl = &rlw->priv->rl; - struct mlx5e_sq *sq; - uint64_t temp; - uint16_t index; - uint16_t burst; - int error; - bool use_sq_remap; - - if (rate != 0) { - MLX5E_RL_WORKER_UNLOCK(rlw); - - MLX5E_RL_RLOCK(rl); - - /* get current burst size in bytes */ - temp = rl->param.tx_burst_size * - MLX5E_SW2HW_MTU(if_getmtu(rlw->priv->ifp)); - - /* limit burst size to 64K currently */ - if (temp > 65535) - temp = 65535; - burst = temp; - - /* find best rate */ - rate = mlx5e_rl_find_best_rate_locked(rl, rate); + struct mlx5e_rl_sq *sq; + + MLX5E_RL_HEAD_LOCK(phead); + sq = TAILQ_FIRST(&phead->head[which]); + if (likely(sq != NULL)) { + TAILQ_REMOVE(&phead->head[which], sq, entry); + MPASS(phead->count[which] != 0); + phead->count[which]--; + } + MLX5E_RL_HEAD_UNLOCK(phead); + return (sq); +} - MLX5E_RL_RUNLOCK(rl); +static void +mlx5e_rl_enqueue_sq(struct mlx5e_rl_head *phead, unsigned which, struct mlx5e_rl_sq *sq) +{ + MLX5E_RL_HEAD_LOCK(phead); + TAILQ_INSERT_TAIL(&phead->head[which], sq, entry); + phead->count[which]++; + MPASS(phead->count[which] != 0); + MLX5E_RL_HEAD_UNLOCK(phead); +} - if (rate == 0) { - /* rate doesn't exist, fallback to unlimited */ - index = 0; - rate = 0; - atomic_add_64(&rlw->priv->rl.stats.tx_modify_rate_failure, 1ULL); - } else { - /* get a reference on the new rate */ - error = -mlx5_rl_add_rate(rlw->priv->mdev, - howmany(rate, 1000), burst, &index); - - if (error != 0) { - /* adding rate failed, fallback to unlimited */ - index = 0; - rate = 0; - atomic_add_64(&rlw->priv->rl.stats.tx_add_new_rate_failure, 1ULL); - } - } - MLX5E_RL_WORKER_LOCK(rlw); - } else { - index = 0; - burst = 0; /* default */ - } +static struct mlx5e_rl_sq * +mlx5e_rl_requeue_sq(struct mlx5e_rl_head *phead, unsigned from, unsigned to, + struct mlx5e_rl_priv_data *rl, struct mlx5e_rl_sq *sq) +{ + bool level; - /* paced <--> non-paced transitions must go via FW */ - use_sq_remap = MLX5_CAP_QOS(rlw->priv->mdev, qos_remap_pp) && - channel->last_rate != 0 && rate != 0; - - /* atomically swap rates */ - temp = channel->last_rate; - channel->last_rate = rate; - rate = temp; - - /* atomically swap burst size */ - temp = channel->last_burst; - channel->last_burst = burst; - burst = temp; - - MLX5E_RL_WORKER_UNLOCK(rlw); - /* put reference on the old rate, if any */ - if (rate != 0) { - mlx5_rl_remove_rate(rlw->priv->mdev, - howmany(rate, 1000), burst); + MLX5E_RL_HEAD_LOCK(phead); + if (likely(sq == NULL)) { + sq = TAILQ_FIRST(&phead->head[from]); + if (unlikely(sq == NULL)) + goto done; } + TAILQ_REMOVE(&phead->head[from], sq, entry); + MPASS(phead->count[from] != 0); + phead->count[from]--; + TAILQ_INSERT_TAIL(&phead->head[to], sq, entry); + phead->count[to]++; + MPASS(phead->count[to] != 0); +done: + if (likely(from == MLX5E_RL_HEAD_FREE)) + level = (phead->count[from] < phead->level); + else + level = (phead->count[to] >= phead->level); + MLX5E_RL_HEAD_UNLOCK(phead); - /* set new rate, if SQ is running */ - sq = channel->sq; - 
+static struct mlx5e_rl_sq *
+mlx5e_rl_requeue_sq(struct mlx5e_rl_head *phead, unsigned from, unsigned to,
+    struct mlx5e_rl_priv_data *rl, struct mlx5e_rl_sq *sq)
+{
+	bool level;
 
-	/* paced <--> non-paced transitions must go via FW */
-	use_sq_remap = MLX5_CAP_QOS(rlw->priv->mdev, qos_remap_pp) &&
-	    channel->last_rate != 0 && rate != 0;
-
-	/* atomically swap rates */
-	temp = channel->last_rate;
-	channel->last_rate = rate;
-	rate = temp;
-
-	/* atomically swap burst size */
-	temp = channel->last_burst;
-	channel->last_burst = burst;
-	burst = temp;
-
-	MLX5E_RL_WORKER_UNLOCK(rlw);
-	/* put reference on the old rate, if any */
-	if (rate != 0) {
-		mlx5_rl_remove_rate(rlw->priv->mdev,
-		    howmany(rate, 1000), burst);
+	MLX5E_RL_HEAD_LOCK(phead);
+	if (likely(sq == NULL)) {
+		sq = TAILQ_FIRST(&phead->head[from]);
+		if (unlikely(sq == NULL))
+			goto done;
 	}
+	TAILQ_REMOVE(&phead->head[from], sq, entry);
+	MPASS(phead->count[from] != 0);
+	phead->count[from]--;
+	TAILQ_INSERT_TAIL(&phead->head[to], sq, entry);
+	phead->count[to]++;
+	MPASS(phead->count[to] != 0);
+done:
+	if (likely(from == MLX5E_RL_HEAD_FREE))
+		level = (phead->count[from] < phead->level);
+	else
+		level = (phead->count[to] >= phead->level);
+	MLX5E_RL_HEAD_UNLOCK(phead);
 
-	/* set new rate, if SQ is running */
-	sq = channel->sq;
-	if (sq != NULL && READ_ONCE(sq->running) != 0) {
-		if (!use_sq_remap || mlx5e_rl_remap_sq(sq, index, channel)) {
-			while (atomic_load_int(&channel->refcount) != 0 &&
-			    rlw->priv->mdev->state != MLX5_DEVICE_STATE_INTERNAL_ERROR &&
-			    pci_channel_offline(rlw->priv->mdev->pdev) == 0)
-				pause("W", 1);
-			error = mlx5e_rl_modify_sq(sq, index);
-			if (error != 0)
-				atomic_add_64(&rlw->priv->rl.stats.tx_modify_rate_failure, 1ULL);
-		}
-	} else
-		error = 0;
+	if (level)
+		mlx5e_rl_worker_wakeup(rl);
+	return (sq);
+}
 
-	MLX5E_RL_WORKER_LOCK(rlw);
+static void
+mlx5e_rl_worker_wait(struct mlx5e_rl_priv_data *rl)
+{
+	mtx_lock(&rl->rl_mtx);
+	while (rl->worker_pending == 0 &&
+	    rl->worker_done == 0)
+		cv_wait(&rl->rl_cv, &rl->rl_mtx);
+	rl->worker_pending = 0;
+	mtx_unlock(&rl->rl_mtx);
+}
 
-	return (-error);
+static void
+mlx5e_rl_worker_wakeup(struct mlx5e_rl_priv_data *rl)
+{
+	mtx_lock(&rl->rl_mtx);
+	if (rl->worker_pending == 0) {
+		rl->worker_pending = 1;
+		cv_signal(&rl->rl_cv);
+	}
+	mtx_unlock(&rl->rl_mtx);
 }
 
 static void
 mlx5e_rl_worker(void *arg)
 {
-	struct thread *td;
-	struct mlx5e_rl_worker *rlw = arg;
-	struct mlx5e_rl_channel *channel;
+	struct mlx5e_rl_priv_data *rl;
+	struct mlx5e_rl_sq *sq;
+	struct mlx5e_rl_head *phead;
 	struct mlx5e_priv *priv;
-	unsigned ix;
-	uint64_t x;
+	struct thread *td;
+	uint32_t cq_modify_supported_mask;
+	uint32_t total_used;
+	uint32_t min_free;
+	uint32_t channels;
+	uint32_t max;
+	uint32_t i;
+	uint32_t j;
 	int error;
 
 	/* set thread priority */
@@ -609,135 +592,104 @@
 	sched_prio(td, PI_SWI(SWI_NET));
 	thread_unlock(td);
 
-	priv = rlw->priv;
+	rl = arg;
+	priv = container_of(rl, struct mlx5e_priv, rl);
 
-	/* compute completion vector */
-	ix = (rlw - priv->rl.workers) %
-	    priv->mdev->priv.eq_table.num_comp_vectors;
+	cq_modify_supported_mask = MLX5_CQ_MODIFY_PERIOD | MLX5_CQ_MODIFY_COUNT;
 
-	/* TODO bind to CPU */
+	if (MLX5_CAP_GEN(priv->mdev, cq_period_mode_modify))
+		cq_modify_supported_mask |= MLX5_CQ_MODIFY_PERIOD_MODE;
 
-	/* open all the SQs */
-	MLX5E_RL_WORKER_LOCK(rlw);
-	for (x = 0; x < priv->rl.param.tx_channels_per_worker_def; x++) {
-		struct mlx5e_rl_channel *channel = rlw->channels + x;
+	if (MLX5_CAP_GEN(priv->mdev, cq_eq_remap))
+		cq_modify_supported_mask |= MLX5_CQ_MODIFY_EQN;
 
-#if !defined(HAVE_RL_PRE_ALLOCATE_CHANNELS)
-		if (channel->state == MLX5E_RL_ST_FREE)
-			continue;
-#endif
-		MLX5E_RL_WORKER_UNLOCK(rlw);
-
-		MLX5E_RL_RLOCK(&priv->rl);
-		error = mlx5e_rl_open_channel(rlw, ix,
-		    &priv->rl.chan_param, &channel->sq);
-		MLX5E_RL_RUNLOCK(&priv->rl);
-
-		MLX5E_RL_WORKER_LOCK(rlw);
-		if (error != 0) {
-			mlx5_en_err(priv->ifp,
-			    "mlx5e_rl_open_channel failed: %d\n", error);
-			break;
-		}
-		mlx5e_rlw_channel_set_rate_locked(rlw, channel, channel->init_rate);
-	}
-	while (1) {
-		if (STAILQ_FIRST(&rlw->process_head) == NULL) {
-			/* check if we are tearing down */
-			if (rlw->worker_done != 0)
-				break;
-			cv_wait(&rlw->cv, &rlw->mtx);
-		}
-		/* check if we are tearing down */
-		if (rlw->worker_done != 0)
-			break;
-		channel = STAILQ_FIRST(&rlw->process_head);
-		if (channel != NULL) {
-			STAILQ_REMOVE_HEAD(&rlw->process_head, entry);
-
-			switch (channel->state) {
-			case MLX5E_RL_ST_MODIFY:
-				channel->state = MLX5E_RL_ST_USED;
-				MLX5E_RL_WORKER_UNLOCK(rlw);
-
-				/* create channel by demand */
-				if (channel->sq == NULL) {
-					MLX5E_RL_RLOCK(&priv->rl);
-					error = mlx5e_rl_open_channel(rlw, ix,
-					    &priv->rl.chan_param, &channel->sq);
-					MLX5E_RL_RUNLOCK(&priv->rl);
-
-					if (error != 0) {
-						mlx5_en_err(priv->ifp,
-						    "mlx5e_rl_open_channel failed: %d\n", error);
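+	/*
+	 * Main loop: drain the WORK lists and refill the FREE lists
+	 * until teardown. SQs with pending CQ changes are modified in
+	 * place when the firmware supports it, else freed and later
+	 * reallocated with the new parameters.
+	 */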
+	while (rl->worker_done == 0) {
+
+		MLX5E_RL_PRIV_LOCK(rl);
+
+		channels = priv->params.num_channels;
+
+		if (unlikely(channels > MLX5E_RL_IRQ_INDEX_MAX))
+			channels = MLX5E_RL_IRQ_INDEX_MAX;
+
+		max = (rl->use_multi_sq ? MLX5E_RL_MAX_RATES(rl) : 1);
+
+		for (i = 0; i != max; i++) {
+			phead = rl->sq_rate_head[i];
+
+			total_used = 1;
+			for (j = 0; j != MLX5E_RL_IRQ_INDEX_MAX; j++)
+				total_used += phead[j].count[MLX5E_RL_HEAD_USED];
+
+			min_free = howmany(total_used, channels);
+			min_free = howmany(min_free, 16);
+
+			for (j = 0; j != channels; j++) {
+				int eqn_not_used;
+				int irqn;
+
+				error = mlx5_vector2eqn(priv->mdev, j, &eqn_not_used, &irqn);
+				if (error != 0)
+					irqn = 0;
+
+				while ((sq = mlx5e_rl_dequeue_sq(phead + j, MLX5E_RL_HEAD_WORK)) != NULL) {
+					if (sq->cq_modify_flags == 0) {
+						mlx5e_rl_reset_sq_stats(sq);
+						mlx5e_rl_enqueue_sq(phead + j, MLX5E_RL_HEAD_FREE, sq);
+					} else if (sq->cq_modify_flags != MLX5_CQ_MODIFY_DESTROY &&
+					    (cq_modify_supported_mask & sq->cq_modify_flags) == sq->cq_modify_flags) {
+						error = mlx5_core_modify_cq_by_mask(priv->mdev,
+						    &sq->sq.cq.mcq, sq->cq_modify_flags,
+						    rl->param.tx_coalesce_usecs,
+						    rl->param.tx_coalesce_pkts,
+						    rl->param.tx_coalesce_mode,
+						    irqn);
+
+						if (error != 0) {
+							mlx5e_rl_free_sq(sq);
+						} else {
+							/* CQ modified, clear flags */
+							sq->cq_modify_flags = 0;
+
+							mlx5e_rl_reset_sq_stats(sq);
+							mlx5e_rl_enqueue_sq(phead + j, MLX5E_RL_HEAD_FREE, sq);
+						}
					} else {
-						atomic_add_64(&rlw->priv->rl.stats.tx_open_queues, 1ULL);
+						mlx5e_rl_free_sq(sq);
					}
-				} else {
-					mlx5e_resume_sq(channel->sq);
-				}
-
-				MLX5E_RL_WORKER_LOCK(rlw);
-				/* convert from bytes/s to bits/s and set new rate */
-				error = mlx5e_rlw_channel_set_rate_locked(rlw, channel,
-				    channel->new_rate * 8ULL);
-				if (error != 0) {
-					mlx5_en_err(priv->ifp,
-					    "mlx5e_rlw_channel_set_rate_locked failed: %d\n",
-					    error);
 				}
-				break;
-
-			case MLX5E_RL_ST_DESTROY:
-				error = mlx5e_rlw_channel_set_rate_locked(rlw, channel, 0);
-				if (error != 0) {
-					mlx5_en_err(priv->ifp,
-					    "mlx5e_rlw_channel_set_rate_locked failed: %d\n",
-					    error);
-				}
-				if (channel->sq != NULL) {
-					/*
-					 * Make sure all packets are
-					 * transmitted before SQ is
-					 * returned to free list:
-					 */
-					MLX5E_RL_WORKER_UNLOCK(rlw);
-					mlx5e_drain_sq(channel->sq);
-					MLX5E_RL_WORKER_LOCK(rlw);
-				}
-				/* put the channel back into the free list */
-				STAILQ_INSERT_HEAD(&rlw->index_list_head, channel, entry);
-				channel->state = MLX5E_RL_ST_FREE;
-				atomic_add_64(&priv->rl.stats.tx_active_connections, -1ULL);
-				break;
-			default:
-				/* NOP */
-				break;
 			}
-		}
-	}
 
-	/* close all the SQs */
-	for (x = 0; x < priv->rl.param.tx_channels_per_worker_def; x++) {
-		struct mlx5e_rl_channel *channel = rlw->channels + x;
+			for (; j != MLX5E_RL_IRQ_INDEX_MAX; j++) {
+				while ((sq = mlx5e_rl_dequeue_sq(phead + j, MLX5E_RL_HEAD_WORK)) != NULL)
+					mlx5e_rl_free_sq(sq);
+			}
 
-		/* update the initial rate */
-		channel->init_rate = channel->last_rate;
+			for (j = 0; j != channels; j++) {
+				phead[j].level = min_free;
 
-		/* make sure we free up the rate resource */
-		mlx5e_rlw_channel_set_rate_locked(rlw, channel, 0);
+				while (phead[j].count[MLX5E_RL_HEAD_FREE] < min_free) {
+					error = mlx5e_rl_alloc_sq(priv, j, i + 1, &rl->chan_param, &sq);
+					if (error != 0)
+						break;
+					mlx5e_rl_enqueue_sq(phead + j, MLX5E_RL_HEAD_FREE, sq);
+				}
+			}
 
-		if (channel->sq != NULL) {
-			MLX5E_RL_WORKER_UNLOCK(rlw);
-			mlx5e_rl_close_channel(&channel->sq);
-			atomic_add_64(&rlw->priv->rl.stats.tx_open_queues, -1ULL);
-			MLX5E_RL_WORKER_LOCK(rlw);
+			for (; j != MLX5E_RL_IRQ_INDEX_MAX; j++) {
+				while ((sq = mlx5e_rl_dequeue_sq(phead + j, MLX5E_RL_HEAD_FREE)) != NULL)
+					mlx5e_rl_free_sq(sq);
+			}
 		}
+		MLX5E_RL_PRIV_UNLOCK(rl);
+
+		mlx5e_rl_worker_wait(rl);
	}
 
-	rlw->worker_done = 0;
-	cv_broadcast(&rlw->cv);
-	MLX5E_RL_WORKER_UNLOCK(rlw);
+	mtx_lock(&rl->rl_mtx);
+	rl->worker_done = 0;
+	cv_signal(&rl->rl_cv);
+	mtx_unlock(&rl->rl_mtx);
 
 	kthread_exit();
 }
@@ -764,69 +716,8 @@
 }
 
 static void
-mlx5e_rl_set_default_params(struct mlx5e_rl_params *param,
-    struct mlx5_core_dev *mdev)
+mlx5e_rl_set_default_params(struct mlx5e_rl_params *param)
 {
-	/* ratelimit workers */
-	param->tx_worker_threads_def = mdev->priv.eq_table.num_comp_vectors;
-	param->tx_worker_threads_max = MLX5E_RL_MAX_WORKERS;
-
-	/* range check */
-	if (param->tx_worker_threads_def == 0 ||
-	    param->tx_worker_threads_def > param->tx_worker_threads_max)
-		param->tx_worker_threads_def = param->tx_worker_threads_max;
-
-	/* ratelimit channels */
-	param->tx_channels_per_worker_def = MLX5E_RL_MAX_SQS /
-	    param->tx_worker_threads_def;
-	param->tx_channels_per_worker_max = MLX5E_RL_MAX_SQS;
-
-	/* range check */
-	if (param->tx_channels_per_worker_def > MLX5E_RL_DEF_SQ_PER_WORKER)
-		param->tx_channels_per_worker_def = MLX5E_RL_DEF_SQ_PER_WORKER;
-
-	/* set default burst size */
-	param->tx_burst_size = 4;	/* MTUs */
-
-	/*
-	 * Set maximum burst size
-	 *
-	 * The burst size is multiplied by the MTU and clamped to the
-	 * range 0 ... 65535 bytes inclusivly before fed into the
-	 * firmware.
-	 *
-	 * NOTE: If the burst size or MTU is changed only ratelimit
-	 * connections made after the change will use the new burst
-	 * size.
-	 */
-	param->tx_burst_size_max = 255;
-
-	/* get firmware rate limits in 1000bit/s and convert them to bit/s */
-	param->tx_limit_min = mdev->priv.rl_table.min_rate * 1000ULL;
-	param->tx_limit_max = mdev->priv.rl_table.max_rate * 1000ULL;
-
-	/* ratelimit table size */
-	param->tx_rates_max = mdev->priv.rl_table.max_size;
-
-	/* range check */
-	if (param->tx_rates_max > MLX5E_RL_MAX_TX_RATES)
-		param->tx_rates_max = MLX5E_RL_MAX_TX_RATES;
-
-	/* set default number of rates */
-	param->tx_rates_def = param->tx_rates_max;
-
-	/* set maximum allowed rate deviation */
-	if (param->tx_limit_max != 0) {
-		/*
-		 * Make sure the deviation multiplication doesn't
-		 * overflow unsigned 64-bit:
-		 */
-		param->tx_allowed_deviation_max = -1ULL /
-		    param->tx_limit_max;
-	}
-	/* set default rate deviation */
-	param->tx_allowed_deviation = 50;	/* 5.0% */
-
 	/* channel parameters */
 	param->tx_queue_size = (1 << MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE);
 	param->tx_coalesce_usecs = MLX5E_RL_TX_COAL_USEC_DEFAULT;
@@ -839,10 +730,6 @@
 	MLX5E_RL_PARAMS(MLX5E_STATS_DESC)
 };
 
-static const char *mlx5e_rl_table_params_desc[] = {
-	MLX5E_RL_TABLE_PARAMS(MLX5E_STATS_DESC)
-};
-
 static const char *mlx5e_rl_stats_desc[] = {
 	MLX5E_RL_STATS(MLX5E_STATS_DESC)
 };
@@ -853,20 +740,51 @@
 	struct mlx5e_rl_priv_data *rl = &priv->rl;
 	struct sysctl_oid *node;
 	struct sysctl_oid *stats;
-	char buf[64];
-	uint64_t i;
-	uint64_t j;
+	uint32_t max;
+	uint32_t i;
+	uint32_t j;
 	int error;
 
 	/* check if there is support for packet pacing */
-	if (!MLX5_CAP_GEN(priv->mdev, qos) || !MLX5_CAP_QOS(priv->mdev, packet_pacing))
+	if (MLX5E_RL_MAX_RATES(rl) == 0)
 		return (0);
 
-	rl->priv = priv;
-	sysctl_ctx_init(&rl->ctx);
-	sx_init(&rl->rl_sxlock, "ratelimit-sxlock");
+	sx_init(&rl->rl_sxlock, "mlx5e-rl-global-sxlock");
+	mtx_init(&rl->rl_mtx, "mlx5e-rl-global-mtx", NULL, MTX_DEF);
+	cv_init(&rl->rl_cv, "mlx5e-rl-global-cv");
+
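+	/*
+	 * When the firmware cannot remap a paced SQ to another schedule
+	 * queue, fall back to using one SQ per rate, switching SQs on
+	 * rate changes. Else a single SQ is remapped in place.
+	 */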
+	rl->use_multi_sq =
+	    !MLX5_CAP_QOS(priv->mdev, qos_remap_pp) ||
+	    !MLX5_CAP_ETH(priv->mdev, reg_umr_sq) ||
+	    mlx5_use_multi_sq;
+
+	rl->rate_limit_table = malloc(
+	    sizeof(rl->rate_limit_table[0]) * MLX5E_RL_MAX_RATES(rl),
+	    M_MLX5EN, M_WAITOK);
+
+	for (i = 0; i != MLX5E_RL_MAX_RATES(rl); i++)
+		rl->rate_limit_table[i] = priv->mdev->priv.rl_table.rl_entry[i].rate;
+
+	max = (rl->use_multi_sq ? MLX5E_RL_MAX_RATES(rl) : 1);
+
+	rl->sq_rate_head = malloc(
+	    sizeof(rl->sq_rate_head[0]) * max,
+	    M_MLX5EN, M_WAITOK | M_ZERO);
+
+	rl->sq_raw_head = malloc(
+	    sizeof(rl->sq_raw_head[0]) * max * MLX5E_RL_IRQ_INDEX_MAX,
+	    M_MLX5EN, M_WAITOK | M_ZERO);
+
+	for (i = 0; i != max; i++)
+		rl->sq_rate_head[i] = rl->sq_raw_head + (i * MLX5E_RL_IRQ_INDEX_MAX);
+
+	for (i = 0; i != (max * MLX5E_RL_IRQ_INDEX_MAX); i++) {
+		mtx_init(&rl->sq_raw_head[i].mtx, "mlx5e-rl-head", NULL, MTX_DEF);
+		for (j = 0; j != MLX5E_RL_HEAD_MAX; j++)
+			TAILQ_INIT(&rl->sq_raw_head[i].head[j]);
+	}
 
 	/* open own TIS domain for ratelimit SQs */
 	error = mlx5e_rl_open_tis(priv);
@@ -874,7 +792,7 @@
 		goto done;
 
 	/* setup default value for parameters */
-	mlx5e_rl_set_default_params(&rl->param, priv->mdev);
+	mlx5e_rl_set_default_params(&rl->param);
 
 	/* update the completion factor */
 	mlx5e_rl_sync_tx_completion_fact(rl);
@@ -906,299 +824,278 @@
 		}
 	}
 
-	/* allocate workers array */
-	rl->workers = malloc(sizeof(rl->workers[0]) *
-	    rl->param.tx_worker_threads_def, M_MLX5EN, M_WAITOK | M_ZERO);
-
-	/* allocate rate limit array */
-	rl->rate_limit_table = malloc(sizeof(rl->rate_limit_table[0]) *
-	    rl->param.tx_rates_def, M_MLX5EN, M_WAITOK | M_ZERO);
-
-	if (node != NULL) {
-		/* create more SYSCTls */
-		SYSCTL_ADD_PROC(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO,
-		    "tx_rate_show", CTLTYPE_STRING | CTLFLAG_RD |
-		    CTLFLAG_MPSAFE, rl, 0, &mlx5e_rl_sysctl_show_rate_table,
-		    "A", "Show table of all configured TX rates");
-
-		/* try to fetch rate table from kernel environment */
-		for (i = 0; i != rl->param.tx_rates_def; i++) {
-			/* compute path for tunable */
-			snprintf(buf, sizeof(buf), "dev.mce.%d.rate_limit.tx_rate_add_%d",
-			    device_get_unit(priv->mdev->pdev->dev.bsddev), (int)i);
-			if (TUNABLE_QUAD_FETCH(buf, &j))
-				mlx5e_rl_tx_limit_add(rl, j);
-		}
-
-		/* setup rate table sysctls */
-		for (i = 0; i != MLX5E_RL_TABLE_PARAMS_NUM; i++) {
-			mlx5e_rl_sysctl_add_u64_oid(rl,
-			    MLX5E_RL_PARAMS_INDEX(table_arg[i]),
-			    node, mlx5e_rl_table_params_desc[2 * i],
-			    mlx5e_rl_table_params_desc[2 * i + 1]);
-		}
-	}
-
-	for (j = 0; j < rl->param.tx_worker_threads_def; j++) {
-		struct mlx5e_rl_worker *rlw = rl->workers + j;
-
-		rlw->priv = priv;
-
-		cv_init(&rlw->cv, "mlx5-worker-cv");
-		mtx_init(&rlw->mtx, "mlx5-worker-mtx", NULL, MTX_DEF);
-		STAILQ_INIT(&rlw->index_list_head);
-		STAILQ_INIT(&rlw->process_head);
-
-		rlw->channels = malloc(sizeof(rlw->channels[0]) *
-		    rl->param.tx_channels_per_worker_def, M_MLX5EN, M_WAITOK | M_ZERO);
-
-		MLX5E_RL_WORKER_LOCK(rlw);
-		for (i = 0; i < rl->param.tx_channels_per_worker_def; i++) {
-			struct mlx5e_rl_channel *channel = rlw->channels + i;
-			channel->worker = rlw;
-			STAILQ_INSERT_TAIL(&rlw->index_list_head, channel, entry);
-		}
-		MLX5E_RL_WORKER_UNLOCK(rlw);
-	}
-
 	PRIV_LOCK(priv);
-	error = mlx5e_rl_open_workers(priv);
+	mlx5e_rl_open_worker(priv);
 	PRIV_UNLOCK(priv);
-	if (error != 0) {
-		mlx5_en_err(priv->ifp,
-		    "mlx5e_rl_open_workers failed: %d\n", error);
-	}
 
 	return (0);
 
done:
+	for (i = 0; i != (max * MLX5E_RL_IRQ_INDEX_MAX); i++)
+		mtx_destroy(&rl->sq_raw_head[i].mtx);
+
 	sysctl_ctx_free(&rl->ctx);
+
+	free(rl->rate_limit_table, M_MLX5EN);
+	free(rl->sq_raw_head, M_MLX5EN);
+	free(rl->sq_rate_head, M_MLX5EN);
+
+	cv_destroy(&rl->rl_cv);
+	mtx_destroy(&rl->rl_mtx);
 	sx_destroy(&rl->rl_sxlock);
+
 	return (error);
 }
 
-static int
-mlx5e_rl_open_workers(struct mlx5e_priv *priv)
+static void
+mlx5e_rl_open_worker(struct mlx5e_priv *priv)
 {
 	struct mlx5e_rl_priv_data *rl = &priv->rl;
 	struct thread *rl_thread = NULL;
 	struct proc *rl_proc = NULL;
-	uint64_t j;
 	int error;
 
-	if (priv->gone || rl->opened)
-		return (-EINVAL);
-
-	MLX5E_RL_WLOCK(rl);
+	MLX5E_RL_PRIV_LOCK(rl);
 	/* compute channel parameters once */
-	mlx5e_rl_build_channel_param(rl, &rl->chan_param);
-	MLX5E_RL_WUNLOCK(rl);
-
-	for (j = 0; j < rl->param.tx_worker_threads_def; j++) {
-		struct mlx5e_rl_worker *rlw = rl->workers + j;
-
-		/* start worker thread */
-		error = kproc_kthread_add(mlx5e_rl_worker, rlw, &rl_proc, &rl_thread,
-		    RFHIGHPID, 0, "mlx5-ratelimit", "mlx5-rl-worker-thread-%d", (int)j);
-		if (error != 0) {
-			mlx5_en_err(rl->priv->ifp,
-			    "kproc_kthread_add failed: %d\n", error);
-			rlw->worker_done = 1;
-		}
-	}
+	mlx5e_rl_build_channel_param(priv, &rl->chan_param);
+	MLX5E_RL_PRIV_UNLOCK(rl);
 
-	rl->opened = 1;
-
-	return (0);
+	/* start worker thread */
+	error = kproc_kthread_add(mlx5e_rl_worker, rl, &rl_proc, &rl_thread,
+	    RFHIGHPID, 0, "mlx5e-ratelimit", "mlx5e-rl-worker-thread");
+	if (error != 0) {
+		mlx5_en_err(priv->ifp,
+		    "kproc_kthread_add failed: %d\n", error);
+		rl->worker_done = 1;
+	}
+	rl->worker_opened = 1;
 }
 
 static void
-mlx5e_rl_close_workers(struct mlx5e_priv *priv)
+mlx5e_rl_close_worker(struct mlx5e_priv *priv)
 {
 	struct mlx5e_rl_priv_data *rl = &priv->rl;
-	uint64_t y;
 
-	if (rl->opened == 0)
+	if (rl->worker_opened == 0)
 		return;
 
-	/* tear down worker threads simultaneously */
-	for (y = 0; y < rl->param.tx_worker_threads_def; y++) {
-		struct mlx5e_rl_worker *rlw = rl->workers + y;
-
-		/* tear down worker before freeing SQs */
-		MLX5E_RL_WORKER_LOCK(rlw);
-		if (rlw->worker_done == 0) {
-			rlw->worker_done = 1;
-			cv_broadcast(&rlw->cv);
-		} else {
-			/* XXX thread not started */
-			rlw->worker_done = 0;
-		}
-		MLX5E_RL_WORKER_UNLOCK(rlw);
-	}
-
-	/* wait for worker threads to exit */
-	for (y = 0; y < rl->param.tx_worker_threads_def; y++) {
-		struct mlx5e_rl_worker *rlw = rl->workers + y;
-
-		/* tear down worker before freeing SQs */
-		MLX5E_RL_WORKER_LOCK(rlw);
-		while (rlw->worker_done != 0)
-			cv_wait(&rlw->cv, &rlw->mtx);
-		MLX5E_RL_WORKER_UNLOCK(rlw);
+	/* tear down worker before freeing SQs */
+	mtx_lock(&rl->rl_mtx);
+	if (rl->worker_done == 0) {
+		rl->worker_done = 1;
+		cv_signal(&rl->rl_cv);
+	} else {
+		/* XXX thread not started */
+		rl->worker_done = 0;
 	}
+	while (rl->worker_done != 0)
+		cv_wait(&rl->rl_cv, &rl->rl_mtx);
+	mtx_unlock(&rl->rl_mtx);
 
-	rl->opened = 0;
-}
-
-static void
-mlx5e_rl_reset_rates(struct mlx5e_rl_priv_data *rl)
-{
-	unsigned x;
-
-	MLX5E_RL_WLOCK(rl);
-	for (x = 0; x != rl->param.tx_rates_def; x++)
-		rl->rate_limit_table[x] = 0;
-	MLX5E_RL_WUNLOCK(rl);
+	rl->worker_opened = 0;
 }
 
 void
 mlx5e_rl_cleanup(struct mlx5e_priv *priv)
 {
 	struct mlx5e_rl_priv_data *rl = &priv->rl;
-	uint64_t y;
+	struct mlx5e_rl_head *phead;
+	struct mlx5e_rl_sq *sq;
+	uint32_t max;
+	uint32_t i;
+	uint32_t j;
 
 	/* check if there is support for packet pacing */
-	if (!MLX5_CAP_GEN(priv->mdev, qos) || !MLX5_CAP_QOS(priv->mdev, packet_pacing))
+	if (MLX5E_RL_MAX_RATES(rl) == 0)
 		return;
 
-	/* TODO check if there is support for packet pacing */
-
 	sysctl_ctx_free(&rl->ctx);
 
 	PRIV_LOCK(priv);
-	mlx5e_rl_close_workers(priv);
+	mlx5e_rl_close_worker(priv);
 	PRIV_UNLOCK(priv);
-	mlx5e_rl_reset_rates(rl);
+	max = (rl->use_multi_sq ? MLX5E_RL_MAX_RATES(rl) : 1);
+
+	/* release all SQ's */
+	for (i = 0; i != (max * MLX5E_RL_IRQ_INDEX_MAX); i++) {
+		phead = rl->sq_raw_head + i;
+
+		for (j = 0; j != MLX5E_RL_HEAD_MAX; j++) {
+			while ((sq = mlx5e_rl_dequeue_sq(phead, j)) != NULL)
+				mlx5e_rl_free_sq(sq);
+		}
+		mtx_destroy(&phead->mtx);
+	}
 
 	/* close TIS domain */
 	mlx5e_rl_close_tis(priv);
 
-	for (y = 0; y < rl->param.tx_worker_threads_def; y++) {
-		struct mlx5e_rl_worker *rlw = rl->workers + y;
-
-		cv_destroy(&rlw->cv);
-		mtx_destroy(&rlw->mtx);
-		free(rlw->channels, M_MLX5EN);
-	}
 	free(rl->rate_limit_table, M_MLX5EN);
-	free(rl->workers, M_MLX5EN);
+	free(rl->sq_raw_head, M_MLX5EN);
+	free(rl->sq_rate_head, M_MLX5EN);
+
+	cv_destroy(&rl->rl_cv);
+	mtx_destroy(&rl->rl_mtx);
 	sx_destroy(&rl->rl_sxlock);
 }
 
-static void
-mlx5e_rlw_queue_channel_locked(struct mlx5e_rl_worker *rlw,
-    struct mlx5e_rl_channel *channel)
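+/*
+ * Return the SQ to transmit on for the channel's current rate. On a
+ * rate change a new SQ is taken from the FREE list of the new rate
+ * and queued as "next" with doorbells inhibited, until the current
+ * SQ has drained and can be handed back to the worker thread.
+ */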
+static struct mlx5e_sq *
+mlx5e_rl_get_current_sq_multi_locked(struct mlx5e_rl_channel *channel)
 {
-	STAILQ_INSERT_TAIL(&rlw->process_head, channel, entry);
-	cv_broadcast(&rlw->cv);
-}
+	struct mlx5e_rl_sq *sq;
+	uint16_t index;
 
-static void
-mlx5e_rl_free(struct mlx5e_rl_worker *rlw, struct mlx5e_rl_channel *channel)
-{
-	if (channel == NULL)
-		return;
+	index = channel->last_rate_index;
 
-	MLX5E_RL_WORKER_LOCK(rlw);
-	switch (channel->state) {
-	case MLX5E_RL_ST_MODIFY:
-		channel->state = MLX5E_RL_ST_DESTROY;
-		break;
-	case MLX5E_RL_ST_USED:
-		channel->state = MLX5E_RL_ST_DESTROY;
-		mlx5e_rlw_queue_channel_locked(rlw, channel);
-		break;
-	default:
-		break;
+	MPASS(index != 0);
+top:
+	sq = channel->next_sq;
+
+	if (unlikely(sq != NULL)) {
+
+		MPASS(channel->curr_sq != NULL);
+
+		/* check if current SQ is empty */
+		mtx_lock(&channel->curr_sq->sq.comp_lock);
+
+		if (mlx5e_sq_is_empty(&channel->curr_sq->sq)) {
+			channel->curr_sq->sq_next = NULL;
+			mtx_unlock(&channel->curr_sq->sq.comp_lock);
+
+			/* get the current SQ on the work list */
+			mlx5e_rl_requeue_sq(
+			    MLX5E_RL_CHAN_TO_SQ_HEAD(channel, channel->curr_rate_index),
+			    MLX5E_RL_HEAD_USED,
+			    MLX5E_RL_HEAD_WORK,
+			    &channel->priv->rl,
+			    channel->curr_sq);
+
+			/* advance SQ pointer */
+			channel->curr_sq = sq;
+			channel->curr_rate_index = channel->next_rate_index;
+
+			/* ready to go */
+			sq->sq.db_inhibit = 0;
+
+			/* clear next */
+			channel->next_sq = NULL;
+			channel->next_rate_index = 0;
+
+			goto top;
+		} else {
+			channel->curr_sq->sq_next = &sq->sq;
+			mtx_unlock(&channel->curr_sq->sq.comp_lock);
+
+			/* check if rate changed once again */
+			if (unlikely(index != channel->next_rate_index))
+				atomic_add_64(&channel->priv->rl.stats.tx_rate_changed_too_quickly, 1ULL);
+		}
+	} else if (unlikely((sq = channel->curr_sq) == NULL)) {
+		/* no SQ allocated */
+		sq = mlx5e_rl_requeue_sq(
+		    MLX5E_RL_CHAN_TO_SQ_HEAD(channel, index),
+		    MLX5E_RL_HEAD_FREE,
+		    MLX5E_RL_HEAD_USED,
+		    &channel->priv->rl,
+		    NULL);
+
+		if (likely(sq != NULL)) {
+			/* ready to go */
+			sq->sq.db_inhibit = 0;
+
+			channel->curr_sq = sq;
+			channel->curr_rate_index = index;
+		} else {
+			atomic_add_64(&channel->priv->rl.stats.tx_available_resource_failure, 1ULL);
+			return (NULL);
+		}
+	} else if (unlikely(index != channel->curr_rate_index)) {
+		/* rate change */
+		sq = mlx5e_rl_requeue_sq(
+		    MLX5E_RL_CHAN_TO_SQ_HEAD(channel, index),
+		    MLX5E_RL_HEAD_FREE,
+		    MLX5E_RL_HEAD_USED,
+		    &channel->priv->rl,
+		    NULL);
+
+		if (likely(sq != NULL)) {
+			/* don't send anything yet */
+			sq->sq.db_inhibit = 1;
+
+			channel->next_sq = sq;
+			channel->next_rate_index = index;
+			goto top;
+		} else {
+			atomic_add_64(&channel->priv->rl.stats.tx_available_resource_failure, 1ULL);
+			return (NULL);
+		}
 	}
-	MLX5E_RL_WORKER_UNLOCK(rlw);
+	return (&sq->sq);
 }
 
-static int
-mlx5e_rl_modify(struct mlx5e_rl_worker *rlw, struct mlx5e_rl_channel *channel, uint64_t rate)
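+/*
+ * Single SQ variant: the SQ is allocated once and, on rate change,
+ * remapped to the new schedule queue by posting a QOS_REMAP work
+ * request through mlx5e_rl_remap_sq().
+ */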
+static struct mlx5e_sq *
+mlx5e_rl_get_current_sq_non_multi_locked(struct mlx5e_rl_channel *channel)
 {
+	struct mlx5e_rl_sq *sq;
+	uint16_t index;
+	int error;
 
-	MLX5E_RL_WORKER_LOCK(rlw);
-	channel->new_rate = rate;
-	switch (channel->state) {
-	case MLX5E_RL_ST_USED:
-		channel->state = MLX5E_RL_ST_MODIFY;
-		mlx5e_rlw_queue_channel_locked(rlw, channel);
-		break;
-	default:
-		break;
+	index = channel->last_rate_index;
+
+	MPASS(index != 0);
+
+	sq = channel->curr_sq;
+
+	/* check for no SQ */
+	if (unlikely(sq == NULL)) {
+		sq = mlx5e_rl_requeue_sq(
+		    MLX5E_RL_CHAN_TO_SQ_HEAD(channel, 1),
+		    MLX5E_RL_HEAD_FREE,
+		    MLX5E_RL_HEAD_USED,
+		    &channel->priv->rl,
+		    NULL);
+
+		if (likely(sq != NULL)) {
+			/* ready to go */
+			sq->sq.db_inhibit = 0;
+
+			channel->curr_sq = sq;
+			channel->curr_rate_index = 1;
+		} else {
+			atomic_add_64(&channel->priv->rl.stats.tx_available_resource_failure, 1ULL);
+			return (NULL);
+		}
 	}
-	MLX5E_RL_WORKER_UNLOCK(rlw);
 
-	return (0);
+	/* check for rate change in any case */
+	if (unlikely(index != channel->next_rate_index)) {
+		error = mlx5e_rl_remap_sq(sq, channel, index);
+		if (error != 0)
+			return (NULL);
+		channel->next_rate_index = index;
+	}
+	return (&sq->sq);
 }
 
-static int
-mlx5e_rl_query(struct mlx5e_rl_worker *rlw, struct mlx5e_rl_channel *channel,
-    union if_snd_tag_query_params *params)
+static struct mlx5e_sq *
+mlx5e_rl_get_current_sq_locked(struct mlx5e_rl_channel *channel)
 {
-	int retval;
-
-	MLX5E_RL_WORKER_LOCK(rlw);
-	switch (channel->state) {
-	case MLX5E_RL_ST_USED:
-		params->rate_limit.max_rate = channel->last_rate;
-		params->rate_limit.queue_level = mlx5e_sq_queue_level(channel->sq);
-		retval = 0;
-		break;
-	case MLX5E_RL_ST_MODIFY:
-		params->rate_limit.max_rate = channel->last_rate;
-		params->rate_limit.queue_level = mlx5e_sq_queue_level(channel->sq);
-		retval = EBUSY;
-		break;
-	default:
-		retval = EINVAL;
-		break;
-	}
-	MLX5E_RL_WORKER_UNLOCK(rlw);
-
-	return (retval);
+	if (channel->priv->rl.use_multi_sq != 0)
+		return (mlx5e_rl_get_current_sq_multi_locked(channel));
+	else
+		return (mlx5e_rl_get_current_sq_non_multi_locked(channel));
 }
 
-static int
-mlx5e_find_available_tx_ring_index(struct mlx5e_rl_worker *rlw,
-    struct mlx5e_rl_channel **pchannel)
+struct mlx5e_sq *
+mlx5e_rl_get_current_sq(struct mlx5e_rl_channel *channel)
 {
-	struct mlx5e_rl_channel *channel;
-	int retval = ENOMEM;
-
-	MLX5E_RL_WORKER_LOCK(rlw);
-	/* Check for available channel in free list */
-	if ((channel = STAILQ_FIRST(&rlw->index_list_head)) != NULL) {
-		retval = 0;
-		/* Remove head index from available list */
-		STAILQ_REMOVE_HEAD(&rlw->index_list_head, entry);
-		channel->state = MLX5E_RL_ST_USED;
-		atomic_add_64(&rlw->priv->rl.stats.tx_active_connections, 1ULL);
-	} else {
-		atomic_add_64(&rlw->priv->rl.stats.tx_available_resource_failure, 1ULL);
-	}
-	MLX5E_RL_WORKER_UNLOCK(rlw);
+	struct mlx5e_sq *sq;
 
-	*pchannel = channel;
-#ifdef RATELIMIT_DEBUG
-	mlx5_en_info(rlw->priv->ifp,
-	    "Channel pointer for rate limit connection is %p\n", channel);
-#endif
-	return (retval);
+	MLX5E_RL_CHANNEL_LOCK(channel);
+	sq = mlx5e_rl_get_current_sq_locked(channel);
+	MLX5E_RL_CHANNEL_UNLOCK(channel);
+	return (sq);
 }
 
 int
@@ -1207,40 +1104,40 @@
     struct m_snd_tag **ppmt)
 {
 	struct mlx5e_rl_channel *channel;
-	struct mlx5e_rl_worker *rlw;
 	struct mlx5e_priv *priv;
-	int error;
 
 	priv = if_getsoftc(ifp);
 
 	/* check if there is support for packet pacing or if device is going away */
-	if (!MLX5_CAP_GEN(priv->mdev, qos) ||
-	    !MLX5_CAP_QOS(priv->mdev, packet_pacing) || priv->gone ||
+	if (MLX5E_RL_MAX_RATES(&priv->rl) == 0 || priv->gone ||
 	    params->rate_limit.hdr.type != IF_SND_TAG_TYPE_RATE_LIMIT)
 		return (EOPNOTSUPP);
 
-	/* compute worker thread this TCP connection belongs to */
-	rlw = priv->rl.workers + ((params->rate_limit.hdr.flowid % 128) %
-	    priv->rl.param.tx_worker_threads_def);
+	channel = malloc(sizeof(*channel), M_MLX5EN, M_NOWAIT | M_ZERO);
+	if (unlikely(channel == NULL))
+		return (ENOMEM);
 
-	error = mlx5e_find_available_tx_ring_index(rlw, &channel);
-	if (error != 0)
-		goto done;
+	mtx_init(&channel->mtx, "mlx5e-rl-channel", NULL, MTX_DEF);
 
-	error = mlx5e_rl_modify(rlw, channel, params->rate_limit.max_rate);
-	if (error != 0) {
-		mlx5e_rl_free(rlw, channel);
-		goto done;
-	}
+	channel->priv = priv;
+
+	mlx5_rl_find_rate_index(channel->priv->mdev,
+	    params->rate_limit.max_rate, &channel->last_rate_index);
+
+	MPASS(channel->last_rate_index != 0);
+
+	/* get the IRQ index */
+	channel->irq_index = (params->rate_limit.hdr.flowid %
+	    MLX5E_RL_IRQ_INDEX_MAX) % priv->params.num_channels;
 
 	/* store pointer to mbuf tag */
-	MPASS(channel->tag.refcount == 0);
 	m_snd_tag_init(&channel->tag, ifp, &mlx5e_rl_snd_tag_sw);
 	*ppmt = &channel->tag;
-done:
-	return (error);
-}
+
+	atomic_add_64(&priv->rl.stats.tx_active_connections, 1ULL);
+
+	return (0);
+}
 
 static int
 mlx5e_rl_snd_tag_modify(struct m_snd_tag *pmt, union if_snd_tag_modify_params *params)
@@ -1248,7 +1145,13 @@
 	struct mlx5e_rl_channel *channel =
 	    container_of(pmt, struct mlx5e_rl_channel, tag);
 
-	return (mlx5e_rl_modify(channel->worker, channel, params->rate_limit.max_rate));
+	MLX5E_RL_CHANNEL_LOCK(channel);
+	mlx5_rl_find_rate_index(channel->priv->mdev,
+	    params->rate_limit.max_rate, &channel->last_rate_index);
+	MPASS(channel->last_rate_index != 0);
+	MLX5E_RL_CHANNEL_UNLOCK(channel);
+
+	return (0);
 }
 
 static int
@@ -1256,205 +1159,170 @@
 {
 	struct mlx5e_rl_channel *channel =
 	    container_of(pmt, struct mlx5e_rl_channel, tag);
+	struct mlx5e_rl_sq *rl_sq;
+	struct mlx5e_sq *sq;
+	uint16_t index;
 
-	return (mlx5e_rl_query(channel->worker, channel, params));
-}
+	MLX5E_RL_CHANNEL_LOCK(channel);
 
-static void
-mlx5e_rl_snd_tag_free(struct m_snd_tag *pmt)
-{
-	struct mlx5e_rl_channel *channel =
-	    container_of(pmt, struct mlx5e_rl_channel, tag);
+	/*
+	 * Check for an existing SQ, before allocating a new one. This
+	 * avoids races if send tag methods are not serialized.
+	 */
+	rl_sq = channel->next_sq;
+	if (likely(rl_sq == NULL))
+		rl_sq = channel->curr_sq;
 
-	mlx5e_rl_free(channel->worker, channel);
-}
+	if (likely(rl_sq != NULL))
+		sq = &rl_sq->sq;
+	else
+		sq = mlx5e_rl_get_current_sq_locked(channel);
 
-static int
-mlx5e_rl_sysctl_show_rate_table(SYSCTL_HANDLER_ARGS)
-{
-	struct mlx5e_rl_priv_data *rl = arg1;
-	struct mlx5e_priv *priv = rl->priv;
-	struct sbuf sbuf;
-	unsigned x;
-	int error;
+	/* if there is no queue yet, consider it full */
+	if (unlikely(sq == NULL))
+		params->rate_limit.queue_level = IF_SND_QUEUE_LEVEL_MAX;
+	else
+		params->rate_limit.queue_level = mlx5e_sq_queue_level(sq);
 
-	error = sysctl_wire_old_buffer(req, 0);
-	if (error != 0)
-		return (error);
+	index = channel->last_rate_index;
+	MPASS(index != 0);
 
-	PRIV_LOCK(priv);
+	params->rate_limit.max_rate = channel->priv->rl.rate_limit_table[index - 1];
+	MLX5E_RL_CHANNEL_UNLOCK(channel);
 
-	sbuf_new_for_sysctl(&sbuf, NULL, 128 * rl->param.tx_rates_def, req);
+	return (0);
+}
 
-	sbuf_printf(&sbuf,
-	    "\n\n" "\t" "ENTRY" "\t" "BURST" "\t" "RATE [bit/s]\n"
-	    "\t" "--------------------------------------------\n");
+static void
+mlx5e_rl_snd_tag_free(struct m_snd_tag *pmt)
+{
+	struct mlx5e_rl_channel *channel =
+	    container_of(pmt, struct mlx5e_rl_channel, tag);
 
-	MLX5E_RL_RLOCK(rl);
-	for (x = 0; x != rl->param.tx_rates_def; x++) {
-		if (rl->rate_limit_table[x] == 0)
-			continue;
+	if (channel->curr_sq != NULL) {
+		mlx5e_rl_requeue_sq(
+		    MLX5E_RL_CHAN_TO_SQ_HEAD(channel, channel->curr_rate_index),
+		    MLX5E_RL_HEAD_USED,
+		    MLX5E_RL_HEAD_WORK,
+		    &channel->priv->rl,
+		    channel->curr_sq);
+	}
 
-		sbuf_printf(&sbuf, "\t" "%3u" "\t" "%3u" "\t" "%lld\n",
-		    x, (unsigned)rl->param.tx_burst_size,
-		    (long long)rl->rate_limit_table[x]);
+	if (channel->next_sq != NULL) {
+		mlx5e_rl_requeue_sq(
+		    MLX5E_RL_CHAN_TO_SQ_HEAD(channel, channel->next_rate_index),
+		    MLX5E_RL_HEAD_USED,
+		    MLX5E_RL_HEAD_WORK,
+		    &channel->priv->rl,
+		    channel->next_sq);
 	}
-	MLX5E_RL_RUNLOCK(rl);
 
-	error = sbuf_finish(&sbuf);
-	sbuf_delete(&sbuf);
+	mtx_destroy(&channel->mtx);
 
-	PRIV_UNLOCK(priv);
+	atomic_add_64(&channel->priv->rl.stats.tx_active_connections, -1ULL);
 
-	return (error);
+	free(channel, M_MLX5EN);
 }
 
-static int
-mlx5e_rl_refresh_channel_params(struct mlx5e_rl_priv_data *rl)
+void
+mlx5e_rl_query(if_t ifp, struct if_ratelimit_query_results *q)
 {
-	uint64_t x;
-	uint64_t y;
+	struct mlx5e_priv *priv = if_getsoftc(ifp);
 
-	MLX5E_RL_WLOCK(rl);
-	/* compute channel parameters once */
-	mlx5e_rl_build_channel_param(rl, &rl->chan_param);
-	MLX5E_RL_WUNLOCK(rl);
-
-	for (y = 0; y != rl->param.tx_worker_threads_def; y++) {
-		struct mlx5e_rl_worker *rlw = rl->workers + y;
-
-		for (x = 0; x != rl->param.tx_channels_per_worker_def; x++) {
-			struct mlx5e_rl_channel *channel;
-			struct mlx5e_sq *sq;
-
-			channel = rlw->channels + x;
-			sq = channel->sq;
-
-			if (sq == NULL)
-				continue;
-
-			if (MLX5_CAP_GEN(rl->priv->mdev, cq_period_mode_modify)) {
-				mlx5_core_modify_cq_moderation_mode(rl->priv->mdev, &sq->cq.mcq,
-				    rl->param.tx_coalesce_usecs,
-				    rl->param.tx_coalesce_pkts,
-				    rl->param.tx_coalesce_mode);
-			} else {
-				mlx5_core_modify_cq_moderation(rl->priv->mdev, &sq->cq.mcq,
-				    rl->param.tx_coalesce_usecs,
-				    rl->param.tx_coalesce_pkts);
-			}
-		}
-	}
-	return (0);
+	q->rate_table = priv->rl.rate_limit_table;
+	q->flags = RT_IS_FIXED_TABLE;
+	q->max_flows = 0;	/* no limit */
+	q->number_of_rates = MLX5E_RL_MAX_RATES(&priv->rl);
+	q->min_segment_burst = 1;
 }
 
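+/*
+ * Mark all rate limit SQs for the given CQ parameter change. The
+ * actual CQ modify, or destroy and reallocate, is deferred to the
+ * worker thread.
+ */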
 void
-mlx5e_rl_refresh_sq_inline(struct mlx5e_rl_priv_data *rl)
+mlx5e_rl_refresh_channel_params(struct mlx5e_priv *priv, uint32_t cq_modify_flags)
 {
-	uint64_t x;
-	uint64_t y;
+	struct mlx5e_rl_head *phead;
+	struct mlx5e_rl_sq *sq;
+	uint32_t max;
+	uint32_t i;
+	uint32_t j;
+	uint32_t k;
+
+	if (MLX5E_RL_MAX_RATES(&priv->rl) == 0)
+		return;
+
+	MLX5E_RL_PRIV_LOCK(&priv->rl);
 
-	for (y = 0; y != rl->param.tx_worker_threads_def; y++) {
-		struct mlx5e_rl_worker *rlw = rl->workers + y;
+	/* compute channel parameters once */
+	mlx5e_rl_build_channel_param(priv, &priv->rl.chan_param);
 
-		for (x = 0; x != rl->param.tx_channels_per_worker_def; x++) {
-			struct mlx5e_rl_channel *channel;
-			struct mlx5e_sq *sq;
+	/* verify TX completion factor */
+	mlx5e_rl_sync_tx_completion_fact(&priv->rl);
 
-			channel = rlw->channels + x;
-			sq = channel->sq;
+	max = (priv->rl.use_multi_sq ? MLX5E_RL_MAX_RATES(&priv->rl) : 1);
 
-			if (sq == NULL)
-				continue;
+	for (i = 0; i != max; i++) {
+		for (j = 0; j != MLX5E_RL_IRQ_INDEX_MAX; j++) {
+			phead = priv->rl.sq_rate_head[i] + j;
 
-			mtx_lock(&sq->lock);
-			mlx5e_update_sq_inline(sq);
-			mtx_unlock(&sq->lock);
+			MLX5E_RL_HEAD_LOCK(phead);
+			for (k = 0; k != MLX5E_RL_HEAD_MAX; k++) {
+				TAILQ_FOREACH(sq, &phead->head[k], entry)
+					sq->cq_modify_flags |= cq_modify_flags;
+			}
+			MLX5E_RL_HEAD_UNLOCK(phead);
 		}
 	}
+	MLX5E_RL_PRIV_UNLOCK(&priv->rl);
 }
 
-static int
-mlx5e_rl_tx_limit_add(struct mlx5e_rl_priv_data *rl, uint64_t value)
+void
+mlx5e_rl_refresh_sq_inline(struct mlx5e_priv *priv)
 {
-	unsigned x;
-	int error;
+	struct mlx5e_rl_head *phead;
+	struct mlx5e_rl_sq *sq;
+	uint32_t max;
+	uint32_t i;
+	uint32_t j;
+	uint32_t k;
+
+	if (MLX5E_RL_MAX_RATES(&priv->rl) == 0)
+		return;
 
-	if (value < 1000 ||
-	    mlx5_rl_is_in_range(rl->priv->mdev, howmany(value, 1000), 0) == 0)
-		return (EINVAL);
+	MLX5E_RL_PRIV_LOCK(&priv->rl);
 
-	MLX5E_RL_WLOCK(rl);
-	error = ENOMEM;
+	max = (priv->rl.use_multi_sq ? MLX5E_RL_MAX_RATES(&priv->rl) : 1);
 
-	/* check if rate already exists */
-	for (x = 0; x != rl->param.tx_rates_def; x++) {
-		if (rl->rate_limit_table[x] != value)
-			continue;
-		error = EEXIST;
-		break;
-	}
+	for (i = 0; i != max; i++) {
+		for (j = 0; j != MLX5E_RL_IRQ_INDEX_MAX; j++) {
+			phead = priv->rl.sq_rate_head[i] + j;
 
-	/* check if there is a free rate entry */
-	if (x == rl->param.tx_rates_def) {
-		for (x = 0; x != rl->param.tx_rates_def; x++) {
-			if (rl->rate_limit_table[x] != 0)
-				continue;
-			rl->rate_limit_table[x] = value;
-			error = 0;
-			break;
+			MLX5E_RL_HEAD_LOCK(phead);
+			for (k = 0; k != MLX5E_RL_HEAD_MAX; k++) {
+				TAILQ_FOREACH(sq, &phead->head[k], entry) {
+					mtx_lock(&sq->sq.lock);
+					mlx5e_update_sq_inline(&sq->sq);
+					mtx_unlock(&sq->sq.lock);
+				}
+			}
+			MLX5E_RL_HEAD_UNLOCK(phead);
 		}
 	}
-	MLX5E_RL_WUNLOCK(rl);
-
-	return (error);
-}
-
-static int
-mlx5e_rl_tx_limit_clr(struct mlx5e_rl_priv_data *rl, uint64_t value)
-{
-	unsigned x;
-	int error;
-
-	if (value == 0)
-		return (EINVAL);
-
-	MLX5E_RL_WLOCK(rl);
-
-	/* check if rate already exists */
-	for (x = 0; x != rl->param.tx_rates_def; x++) {
-		if (rl->rate_limit_table[x] != value)
-			continue;
-		/* free up rate */
-		rl->rate_limit_table[x] = 0;
-		break;
-	}
-
-	/* check if there is a free rate entry */
-	if (x == rl->param.tx_rates_def)
-		error = ENOENT;
-	else
-		error = 0;
-	MLX5E_RL_WUNLOCK(rl);
-
-	return (error);
+	MLX5E_RL_PRIV_UNLOCK(&priv->rl);
 }
 
 static int
 mlx5e_rl_sysctl_handler(SYSCTL_HANDLER_ARGS)
 {
-	struct mlx5e_rl_priv_data *rl = arg1;
-	struct mlx5e_priv *priv = rl->priv;
-	unsigned mode_modify;
-	unsigned was_opened;
+	struct mlx5e_rl_priv_data *rl;
+	struct mlx5e_priv *priv;
 	uint64_t value;
 	int error;
 
+	rl = arg1;
+	priv = container_of(rl, struct mlx5e_priv, rl);
+
 	PRIV_LOCK(priv);
-	MLX5E_RL_RLOCK(rl);
 	value = rl->param.arg[arg2];
-	MLX5E_RL_RUNLOCK(rl);
 
 	if (req != NULL) {
 		error = sysctl_handle_64(oidp, &value, 0, req);
@@ -1470,40 +1338,8 @@
 		error = ENXIO;
 		goto done;
 	}
-	was_opened = rl->opened;
-	mode_modify = MLX5_CAP_GEN(priv->mdev, cq_period_mode_modify);
 
 	switch (MLX5E_RL_PARAMS_INDEX(arg[arg2])) {
-	case MLX5E_RL_PARAMS_INDEX(tx_worker_threads_def):
-		if (value > rl->param.tx_worker_threads_max)
-			value = rl->param.tx_worker_threads_max;
-		else if (value < 1)
-			value = 1;
-
-		/* store new value */
-		rl->param.arg[arg2] = value;
-		break;
-
-	case MLX5E_RL_PARAMS_INDEX(tx_channels_per_worker_def):
-		if (value > rl->param.tx_channels_per_worker_max)
-			value = rl->param.tx_channels_per_worker_max;
-		else if (value < 1)
-			value = 1;
-
-		/* store new value */
-		rl->param.arg[arg2] = value;
-		break;
-
-	case MLX5E_RL_PARAMS_INDEX(tx_rates_def):
-		if (value > rl->param.tx_rates_max)
-			value = rl->param.tx_rates_max;
-		else if (value < 1)
-			value = 1;
-
-		/* store new value */
-		rl->param.arg[arg2] = value;
-		break;
-
 	case MLX5E_RL_PARAMS_INDEX(tx_coalesce_usecs):
 		/* range check */
 		if (value < 1)
@@ -1513,10 +1349,7 @@
 
 		/* store new value */
 		rl->param.arg[arg2] = value;
-
-		/* check to avoid down and up the network interface */
-		if (was_opened)
-			error = mlx5e_rl_refresh_channel_params(rl);
+		mlx5e_rl_refresh_channel_params(priv, MLX5_CQ_MODIFY_PERIOD);
 		break;
 
 	case MLX5E_RL_PARAMS_INDEX(tx_coalesce_pkts):
@@ -1528,38 +1361,20 @@
 
 		/* store new value */
 		rl->param.arg[arg2] = value;
-
-		/* check to avoid down and up the network interface */
-		if (was_opened)
-			error = mlx5e_rl_refresh_channel_params(rl);
+		mlx5e_rl_refresh_channel_params(priv, MLX5_CQ_MODIFY_COUNT);
 		break;
 
	case
 MLX5E_RL_PARAMS_INDEX(tx_coalesce_mode):
-		/* network interface must be down */
-		if (was_opened != 0 && mode_modify == 0)
-			mlx5e_rl_close_workers(priv);
-
 		/* import TX coalesce mode */
 		if (value != 0)
 			value = 1;
 
 		/* store new value */
 		rl->param.arg[arg2] = value;
-
-		/* restart network interface, if any */
-		if (was_opened != 0) {
-			if (mode_modify == 0)
-				mlx5e_rl_open_workers(priv);
-			else
-				error = mlx5e_rl_refresh_channel_params(rl);
-		}
+		mlx5e_rl_refresh_channel_params(priv, MLX5_CQ_MODIFY_PERIOD_MODE);
 		break;
 
 	case MLX5E_RL_PARAMS_INDEX(tx_queue_size):
-		/* network interface must be down */
-		if (was_opened)
-			mlx5e_rl_close_workers(priv);
-
 		/* import TX queue size */
 		if (value < (1 << MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE))
 			value = (1 << MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE);
@@ -1571,61 +1386,13 @@
 
 		/* store new value */
 		rl->param.arg[arg2] = value;
-
-		/* verify TX completion factor */
-		mlx5e_rl_sync_tx_completion_fact(rl);
-
-		/* restart network interface, if any */
-		if (was_opened)
-			mlx5e_rl_open_workers(priv);
+		mlx5e_rl_refresh_channel_params(priv, MLX5_CQ_MODIFY_DESTROY);
 		break;
 
 	case MLX5E_RL_PARAMS_INDEX(tx_completion_fact):
-		/* network interface must be down */
-		if (was_opened)
-			mlx5e_rl_close_workers(priv);
-
 		/* store new value */
 		rl->param.arg[arg2] = value;
-
-		/* verify parameter */
-		mlx5e_rl_sync_tx_completion_fact(rl);
-
-		/* restart network interface, if any */
-		if (was_opened)
-			mlx5e_rl_open_workers(priv);
-		break;
-
-	case MLX5E_RL_PARAMS_INDEX(tx_limit_add):
-		error = mlx5e_rl_tx_limit_add(rl, value);
-		break;
-
-	case MLX5E_RL_PARAMS_INDEX(tx_limit_clr):
-		error = mlx5e_rl_tx_limit_clr(rl, value);
-		break;
-
-	case MLX5E_RL_PARAMS_INDEX(tx_allowed_deviation):
-		/* range check */
-		if (value > rl->param.tx_allowed_deviation_max)
-			value = rl->param.tx_allowed_deviation_max;
-		else if (value < rl->param.tx_allowed_deviation_min)
-			value = rl->param.tx_allowed_deviation_min;
-
-		MLX5E_RL_WLOCK(rl);
-		rl->param.arg[arg2] = value;
-		MLX5E_RL_WUNLOCK(rl);
-		break;
-
-	case MLX5E_RL_PARAMS_INDEX(tx_burst_size):
-		/* range check */
-		if (value > rl->param.tx_burst_size_max)
-			value = rl->param.tx_burst_size_max;
-		else if (value < rl->param.tx_burst_size_min)
-			value = rl->param.tx_burst_size_min;
-
-		MLX5E_RL_WLOCK(rl);
-		rl->param.arg[arg2] = value;
-		MLX5E_RL_WUNLOCK(rl);
+		mlx5e_rl_refresh_channel_params(priv, MLX5_CQ_MODIFY_DESTROY);
 		break;
 
 	default:
@@ -1641,9 +1408,8 @@
     struct sysctl_oid *node, const char *name, const char *desc)
 {
 	/*
-	 * NOTE: In FreeBSD-11 and newer the CTLFLAG_RWTUN flag will
-	 * take care of loading default sysctl value from the kernel
-	 * environment, if any:
+	 * NOTE: The CTLFLAG_RWTUN/CTLFLAG_RDTUN flags will take care of
+	 * loading the default sysctl value from the kernel environment,
+	 * if any:
 	 */
 	if (strstr(name, "_max") != 0 || strstr(name, "_min") != 0) {
 		/* read-only SYSCTLs */
@@ -1676,6 +1442,43 @@
 	    CTLFLAG_RD, &rl->stats.arg[x], 0, desc);
 }
 
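+/*
+ * Accumulate the statistics of all rate limit SQs, except the idle
+ * ones on the FREE lists, into the given counters.
+ */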
+void
+mlx5e_rl_accumulate_sq_stats(struct mlx5e_rl_priv_data *rl,
+    struct mlx5e_sq_stats *sq_stats)
+{
+	struct mlx5e_rl_head *phead;
+	struct mlx5e_rl_sq *sq;
+	uint32_t max;
+	uint32_t i;
+	uint32_t j;
+	uint32_t k;
+
+	if (MLX5E_RL_MAX_RATES(rl) == 0)
+		return;
+
+	MLX5E_RL_PRIV_LOCK(rl);
+
+	mlx5e_accumulate_sq_stats(&rl->sq_stats, sq_stats);
+
+	max = (rl->use_multi_sq ? MLX5E_RL_MAX_RATES(rl) : 1);
+
+	for (i = 0; i != max; i++) {
+		for (j = 0; j != MLX5E_RL_IRQ_INDEX_MAX; j++) {
+			phead = rl->sq_rate_head[i] + j;
+
+			MLX5E_RL_HEAD_LOCK(phead);
+			for (k = 0; k != MLX5E_RL_HEAD_MAX; k++) {
+				if (k == MLX5E_RL_HEAD_FREE)
+					continue;
+				TAILQ_FOREACH(sq, &phead->head[k], entry)
+					mlx5e_accumulate_sq_stats(&sq->sq.stats, sq_stats);
+			}
+			MLX5E_RL_HEAD_UNLOCK(phead);
		}
	}
+	MLX5E_RL_PRIV_UNLOCK(rl);
+}
+
 #else
 
 int
diff --git a/sys/dev/mlx5/mlx5_en/mlx5_en_tx.c b/sys/dev/mlx5/mlx5_en/mlx5_en_tx.c
--- a/sys/dev/mlx5/mlx5_en/mlx5_en_tx.c
+++ b/sys/dev/mlx5/mlx5_en/mlx5_en_tx.c
@@ -102,8 +102,8 @@
 	switch (mb_tag->sw->type) {
 #ifdef RATELIMIT
 	case IF_SND_TAG_TYPE_RATE_LIMIT:
-		sq = container_of(mb_tag,
-		    struct mlx5e_rl_channel, tag)->sq;
+		sq = mlx5e_rl_get_current_sq(
+		    container_of(mb_tag, struct mlx5e_rl_channel, tag));
 		break;
 #ifdef KERN_TLS
 	case IF_SND_TAG_TYPE_TLS_RATE_LIMIT:
@@ -1146,10 +1146,13 @@
 		MPASS(mb->m_pkthdr.snd_tag->ifp == ifp);
 		sq = mlx5e_select_queue_by_send_tag(ifp, mb);
 		if (unlikely(sq == NULL)) {
-			goto select_queue;
+			/* Free mbuf */
+			m_freem(mb);
+
+			/* Need to wait for a send queue to be allocated. */
+			return (ENOBUFS);
 		}
 	} else {
-select_queue:
 		sq = mlx5e_select_queue(ifp, mb);
 		if (unlikely(sq == NULL)) {
 			/* Free mbuf */
diff --git a/sys/dev/mlx5/mlx5_ib/mlx5_ib_ah.c b/sys/dev/mlx5/mlx5_ib/mlx5_ib_ah.c
--- a/sys/dev/mlx5/mlx5_ib/mlx5_ib_ah.c
+++ b/sys/dev/mlx5/mlx5_ib/mlx5_ib_ah.c
@@ -26,7 +26,6 @@
  */
 
 #include "opt_rss.h"
-#include "opt_ratelimit.h"
 
 #include
diff --git a/sys/dev/mlx5/mlx5_ib/mlx5_ib_cong.c b/sys/dev/mlx5/mlx5_ib/mlx5_ib_cong.c
--- a/sys/dev/mlx5/mlx5_ib/mlx5_ib_cong.c
+++ b/sys/dev/mlx5/mlx5_ib/mlx5_ib_cong.c
@@ -26,7 +26,6 @@
  */
 
 #include "opt_rss.h"
-#include "opt_ratelimit.h"
 
 #include
 #include
diff --git a/sys/dev/mlx5/mlx5_ib/mlx5_ib_cq.c b/sys/dev/mlx5/mlx5_ib/mlx5_ib_cq.c
--- a/sys/dev/mlx5/mlx5_ib/mlx5_ib_cq.c
+++ b/sys/dev/mlx5/mlx5_ib/mlx5_ib_cq.c
@@ -26,7 +26,6 @@
  */
 
 #include "opt_rss.h"
-#include "opt_ratelimit.h"
 
 #include
 #include
diff --git a/sys/dev/mlx5/mlx5_ib/mlx5_ib_devx.c b/sys/dev/mlx5/mlx5_ib/mlx5_ib_devx.c
--- a/sys/dev/mlx5/mlx5_ib/mlx5_ib_devx.c
+++ b/sys/dev/mlx5/mlx5_ib/mlx5_ib_devx.c
@@ -26,7 +26,6 @@
  */
 
 #include "opt_rss.h"
-#include "opt_ratelimit.h"
 
 #include
 #include
diff --git a/sys/dev/mlx5/mlx5_ib/mlx5_ib_doorbell.c b/sys/dev/mlx5/mlx5_ib/mlx5_ib_doorbell.c
--- a/sys/dev/mlx5/mlx5_ib/mlx5_ib_doorbell.c
+++ b/sys/dev/mlx5/mlx5_ib/mlx5_ib_doorbell.c
@@ -26,7 +26,6 @@
  */
 
 #include "opt_rss.h"
-#include "opt_ratelimit.h"
 
 #include
 #include
diff --git a/sys/dev/mlx5/mlx5_ib/mlx5_ib_gsi.c b/sys/dev/mlx5/mlx5_ib/mlx5_ib_gsi.c
--- a/sys/dev/mlx5/mlx5_ib/mlx5_ib_gsi.c
+++ b/sys/dev/mlx5/mlx5_ib/mlx5_ib_gsi.c
@@ -26,7 +26,6 @@
  */
 
 #include "opt_rss.h"
-#include "opt_ratelimit.h"
 
 #include
diff --git a/sys/dev/mlx5/mlx5_ib/mlx5_ib_mad.c b/sys/dev/mlx5/mlx5_ib/mlx5_ib_mad.c
--- a/sys/dev/mlx5/mlx5_ib/mlx5_ib_mad.c
+++ b/sys/dev/mlx5/mlx5_ib/mlx5_ib_mad.c
@@ -26,7 +26,6 @@
  */
 
 #include "opt_rss.h"
-#include "opt_ratelimit.h"
 
 #include
 #include
diff --git a/sys/dev/mlx5/mlx5_ib/mlx5_ib_main.c b/sys/dev/mlx5/mlx5_ib/mlx5_ib_main.c
--- a/sys/dev/mlx5/mlx5_ib/mlx5_ib_main.c
+++ b/sys/dev/mlx5/mlx5_ib/mlx5_ib_main.c
@@ -26,7 +26,6 @@
  */
 
 #include "opt_rss.h"
-#include "opt_ratelimit.h"
 
 #include
 #include
diff --git a/sys/dev/mlx5/mlx5_ib/mlx5_ib_mem.c b/sys/dev/mlx5/mlx5_ib/mlx5_ib_mem.c
--- a/sys/dev/mlx5/mlx5_ib/mlx5_ib_mem.c
+++ b/sys/dev/mlx5/mlx5_ib/mlx5_ib_mem.c
@@ -26,7 +26,6 @@
  */
 
 #include "opt_rss.h"
-#include "opt_ratelimit.h" #include #include diff --git a/sys/dev/mlx5/mlx5_ib/mlx5_ib_mr.c b/sys/dev/mlx5/mlx5_ib/mlx5_ib_mr.c --- a/sys/dev/mlx5/mlx5_ib/mlx5_ib_mr.c +++ b/sys/dev/mlx5/mlx5_ib/mlx5_ib_mr.c @@ -26,7 +26,6 @@ */ #include "opt_rss.h" -#include "opt_ratelimit.h" #include #include diff --git a/sys/dev/mlx5/mlx5_ib/mlx5_ib_qp.c b/sys/dev/mlx5/mlx5_ib/mlx5_ib_qp.c --- a/sys/dev/mlx5/mlx5_ib/mlx5_ib_qp.c +++ b/sys/dev/mlx5/mlx5_ib/mlx5_ib_qp.c @@ -26,7 +26,6 @@ */ #include "opt_rss.h" -#include "opt_ratelimit.h" #include #include diff --git a/sys/dev/mlx5/mlx5_ib/mlx5_ib_srq.c b/sys/dev/mlx5/mlx5_ib/mlx5_ib_srq.c --- a/sys/dev/mlx5/mlx5_ib/mlx5_ib_srq.c +++ b/sys/dev/mlx5/mlx5_ib/mlx5_ib_srq.c @@ -26,7 +26,6 @@ */ #include "opt_rss.h" -#include "opt_ratelimit.h" #include #include diff --git a/sys/dev/mlx5/mlx5_ib/mlx5_ib_virt.c b/sys/dev/mlx5/mlx5_ib/mlx5_ib_virt.c --- a/sys/dev/mlx5/mlx5_ib/mlx5_ib_virt.c +++ b/sys/dev/mlx5/mlx5_ib/mlx5_ib_virt.c @@ -26,7 +26,6 @@ */ #include "opt_rss.h" -#include "opt_ratelimit.h" #include #include