diff --git a/sys/dev/nvmf/host/nvmf_qpair.c b/sys/dev/nvmf/host/nvmf_qpair.c index 1aeb0535eacf..b03ecfa081d3 100644 --- a/sys/dev/nvmf/host/nvmf_qpair.c +++ b/sys/dev/nvmf/host/nvmf_qpair.c @@ -1,426 +1,449 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2023-2024 Chelsio Communications, Inc. * Written by: John Baldwin */ #include #include #include #include #include #include #include #include #include #include struct nvmf_host_command { struct nvmf_request *req; TAILQ_ENTRY(nvmf_host_command) link; uint16_t cid; }; struct nvmf_host_qpair { struct nvmf_softc *sc; struct nvmf_qpair *qp; bool sq_flow_control; bool shutting_down; u_int allocating; u_int num_commands; uint16_t sqhd; uint16_t sqtail; uint64_t submitted; struct mtx lock; TAILQ_HEAD(, nvmf_host_command) free_commands; STAILQ_HEAD(, nvmf_request) pending_requests; /* Indexed by cid. */ struct nvmf_host_command **active_commands; char name[16]; struct sysctl_ctx_list sysctl_ctx; }; struct nvmf_request * nvmf_allocate_request(struct nvmf_host_qpair *qp, void *sqe, nvmf_request_complete_t *cb, void *cb_arg, int how) { struct nvmf_request *req; struct nvmf_qpair *nq; KASSERT(how == M_WAITOK || how == M_NOWAIT, ("%s: invalid how", __func__)); req = malloc(sizeof(*req), M_NVMF, how | M_ZERO); if (req == NULL) return (NULL); mtx_lock(&qp->lock); nq = qp->qp; if (nq == NULL) { mtx_unlock(&qp->lock); free(req, M_NVMF); return (NULL); } qp->allocating++; MPASS(qp->allocating != 0); mtx_unlock(&qp->lock); req->qp = qp; req->cb = cb; req->cb_arg = cb_arg; req->nc = nvmf_allocate_command(nq, sqe, how); if (req->nc == NULL) { free(req, M_NVMF); req = NULL; } mtx_lock(&qp->lock); qp->allocating--; if (qp->allocating == 0 && qp->shutting_down) wakeup(qp); mtx_unlock(&qp->lock); return (req); } static void nvmf_abort_request(struct nvmf_request *req, uint16_t cid) { struct nvme_completion cqe; memset(&cqe, 0, sizeof(cqe)); cqe.cid = cid; cqe.status = htole16(NVMEF(NVME_STATUS_SCT, NVME_SCT_PATH_RELATED) | NVMEF(NVME_STATUS_SC, NVME_SC_COMMAND_ABORTED_BY_HOST)); req->cb(req->cb_arg, &cqe); } void nvmf_free_request(struct nvmf_request *req) { if (req->nc != NULL) nvmf_free_capsule(req->nc); free(req, M_NVMF); } static void nvmf_dispatch_command(struct nvmf_host_qpair *qp, struct nvmf_host_command *cmd) { struct nvmf_softc *sc = qp->sc; struct nvme_command *sqe; struct nvmf_capsule *nc; + uint16_t new_sqtail; int error; + mtx_assert(&qp->lock, MA_OWNED); + + qp->submitted++; + + /* + * Update flow control tracking. This is just a sanity check. + * Since num_commands == qsize - 1, there can never be too + * many commands in flight. + */ + new_sqtail = (qp->sqtail + 1) % (qp->num_commands + 1); + KASSERT(new_sqtail != qp->sqhd, ("%s: qp %p is full", __func__, qp)); + qp->sqtail = new_sqtail; + mtx_unlock(&qp->lock); + nc = cmd->req->nc; sqe = nvmf_capsule_sqe(nc); /* * NB: Don't bother byte-swapping the cid so that receive * doesn't have to swap. */ sqe->cid = cmd->cid; error = nvmf_transmit_capsule(nc); if (error != 0) { device_printf(sc->dev, "failed to transmit capsule: %d, disconnecting\n", error); nvmf_disconnect(sc); return; } if (sc->ka_traffic) atomic_store_int(&sc->ka_active_tx_traffic, 1); } static void nvmf_qp_error(void *arg, int error) { struct nvmf_host_qpair *qp = arg; struct nvmf_softc *sc = qp->sc; /* Ignore simple close of queue pairs during shutdown. 
*/ if (!(sc->detaching && error == 0)) device_printf(sc->dev, "error %d on %s, disconnecting\n", error, qp->name); nvmf_disconnect(sc); } static void nvmf_receive_capsule(void *arg, struct nvmf_capsule *nc) { struct nvmf_host_qpair *qp = arg; struct nvmf_softc *sc = qp->sc; struct nvmf_host_command *cmd; struct nvmf_request *req; const struct nvme_completion *cqe; uint16_t cid; cqe = nvmf_capsule_cqe(nc); if (sc->ka_traffic) atomic_store_int(&sc->ka_active_rx_traffic, 1); /* * NB: Don't bother byte-swapping the cid as transmit doesn't * swap either. */ cid = cqe->cid; if (cid > qp->num_commands) { device_printf(sc->dev, "received invalid CID %u, disconnecting\n", cid); nvmf_disconnect(sc); nvmf_free_capsule(nc); return; } + /* Update flow control tracking. */ + mtx_lock(&qp->lock); + if (qp->sq_flow_control) { + if (nvmf_sqhd_valid(nc)) + qp->sqhd = le16toh(cqe->sqhd); + } else { + /* + * If SQ FC is disabled, just advance the head for + * each response capsule received. + */ + qp->sqhd = (qp->sqhd + 1) % (qp->num_commands + 1); + } + /* * If the queue has been shutdown due to an error, silently * drop the response. */ - mtx_lock(&qp->lock); if (qp->qp == NULL) { device_printf(sc->dev, "received completion for CID %u on shutdown %s\n", cid, qp->name); mtx_unlock(&qp->lock); nvmf_free_capsule(nc); return; } cmd = qp->active_commands[cid]; if (cmd == NULL) { mtx_unlock(&qp->lock); device_printf(sc->dev, "received completion for inactive CID %u, disconnecting\n", cid); nvmf_disconnect(sc); nvmf_free_capsule(nc); return; } KASSERT(cmd->cid == cid, ("%s: CID mismatch", __func__)); req = cmd->req; cmd->req = NULL; if (STAILQ_EMPTY(&qp->pending_requests)) { qp->active_commands[cid] = NULL; TAILQ_INSERT_TAIL(&qp->free_commands, cmd, link); mtx_unlock(&qp->lock); } else { cmd->req = STAILQ_FIRST(&qp->pending_requests); STAILQ_REMOVE_HEAD(&qp->pending_requests, link); - qp->submitted++; - mtx_unlock(&qp->lock); nvmf_dispatch_command(qp, cmd); } req->cb(req->cb_arg, cqe); nvmf_free_capsule(nc); nvmf_free_request(req); } static void nvmf_sysctls_qp(struct nvmf_softc *sc, struct nvmf_host_qpair *qp, bool admin, u_int qid) { struct sysctl_ctx_list *ctx = &qp->sysctl_ctx; struct sysctl_oid *oid; struct sysctl_oid_list *list; char name[8]; if (admin) { oid = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev)), OID_AUTO, "adminq", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Admin Queue"); } else { snprintf(name, sizeof(name), "%u", qid); oid = SYSCTL_ADD_NODE(ctx, sc->ioq_oid_list, OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "I/O Queue"); } list = SYSCTL_CHILDREN(oid); SYSCTL_ADD_UINT(ctx, list, OID_AUTO, "num_entries", CTLFLAG_RD, NULL, qp->num_commands + 1, "Number of entries in queue"); SYSCTL_ADD_U16(ctx, list, OID_AUTO, "sq_head", CTLFLAG_RD, &qp->sqhd, 0, "Current head of submission queue (as observed by driver)"); SYSCTL_ADD_U16(ctx, list, OID_AUTO, "sq_tail", CTLFLAG_RD, &qp->sqtail, 0, "Current tail of submission queue (as observed by driver)"); SYSCTL_ADD_U64(ctx, list, OID_AUTO, "num_cmds", CTLFLAG_RD, &qp->submitted, 0, "Number of commands submitted"); } struct nvmf_host_qpair * nvmf_init_qp(struct nvmf_softc *sc, enum nvmf_trtype trtype, struct nvmf_handoff_qpair_params *handoff, const char *name, u_int qid) { struct nvmf_host_command *cmd, *ncmd; struct nvmf_host_qpair *qp; u_int i; qp = malloc(sizeof(*qp), M_NVMF, M_WAITOK | M_ZERO); qp->sc = sc; qp->sq_flow_control = handoff->sq_flow_control; qp->sqhd = handoff->sqhd; qp->sqtail = handoff->sqtail; strlcpy(qp->name, name, 
sizeof(qp->name)); mtx_init(&qp->lock, "nvmf qp", NULL, MTX_DEF); (void)sysctl_ctx_init(&qp->sysctl_ctx); /* * Allocate a spare command slot for each pending AER command * on the admin queue. */ qp->num_commands = handoff->qsize - 1; if (handoff->admin) qp->num_commands += sc->num_aer; qp->active_commands = malloc(sizeof(*qp->active_commands) * qp->num_commands, M_NVMF, M_WAITOK | M_ZERO); TAILQ_INIT(&qp->free_commands); for (i = 0; i < qp->num_commands; i++) { cmd = malloc(sizeof(*cmd), M_NVMF, M_WAITOK | M_ZERO); cmd->cid = i; TAILQ_INSERT_TAIL(&qp->free_commands, cmd, link); } STAILQ_INIT(&qp->pending_requests); qp->qp = nvmf_allocate_qpair(trtype, false, handoff, nvmf_qp_error, qp, nvmf_receive_capsule, qp); if (qp->qp == NULL) { (void)sysctl_ctx_free(&qp->sysctl_ctx); TAILQ_FOREACH_SAFE(cmd, &qp->free_commands, link, ncmd) { TAILQ_REMOVE(&qp->free_commands, cmd, link); free(cmd, M_NVMF); } free(qp->active_commands, M_NVMF); mtx_destroy(&qp->lock); free(qp, M_NVMF); return (NULL); } nvmf_sysctls_qp(sc, qp, handoff->admin, qid); return (qp); } void nvmf_shutdown_qp(struct nvmf_host_qpair *qp) { struct nvmf_host_command *cmd; struct nvmf_request *req; struct nvmf_qpair *nq; mtx_lock(&qp->lock); nq = qp->qp; qp->qp = NULL; if (nq == NULL) { while (qp->shutting_down) mtx_sleep(qp, &qp->lock, 0, "nvmfqpsh", 0); mtx_unlock(&qp->lock); return; } qp->shutting_down = true; while (qp->allocating != 0) mtx_sleep(qp, &qp->lock, 0, "nvmfqpqu", 0); mtx_unlock(&qp->lock); nvmf_free_qpair(nq); /* * Abort outstanding requests. Active requests will have * their I/O completions invoked and associated capsules freed * by the transport layer via nvmf_free_qpair. Pending * requests must have their I/O completion invoked via * nvmf_abort_capsule_data. */ for (u_int i = 0; i < qp->num_commands; i++) { cmd = qp->active_commands[i]; if (cmd != NULL) { if (!cmd->req->aer) printf("%s: aborted active command %p (CID %u)\n", __func__, cmd->req, cmd->cid); /* This was freed by nvmf_free_qpair. */ cmd->req->nc = NULL; nvmf_abort_request(cmd->req, cmd->cid); nvmf_free_request(cmd->req); free(cmd, M_NVMF); } } while (!STAILQ_EMPTY(&qp->pending_requests)) { req = STAILQ_FIRST(&qp->pending_requests); STAILQ_REMOVE_HEAD(&qp->pending_requests, link); if (!req->aer) printf("%s: aborted pending command %p\n", __func__, req); nvmf_abort_capsule_data(req->nc, ECONNABORTED); nvmf_abort_request(req, 0); nvmf_free_request(req); } mtx_lock(&qp->lock); qp->shutting_down = false; mtx_unlock(&qp->lock); wakeup(qp); } void nvmf_destroy_qp(struct nvmf_host_qpair *qp) { struct nvmf_host_command *cmd, *ncmd; nvmf_shutdown_qp(qp); (void)sysctl_ctx_free(&qp->sysctl_ctx); TAILQ_FOREACH_SAFE(cmd, &qp->free_commands, link, ncmd) { TAILQ_REMOVE(&qp->free_commands, cmd, link); free(cmd, M_NVMF); } free(qp->active_commands, M_NVMF); mtx_destroy(&qp->lock); free(qp, M_NVMF); } void nvmf_submit_request(struct nvmf_request *req) { struct nvmf_host_qpair *qp; struct nvmf_host_command *cmd; qp = req->qp; mtx_lock(&qp->lock); if (qp->qp == NULL) { mtx_unlock(&qp->lock); printf("%s: aborted pending command %p\n", __func__, req); nvmf_abort_capsule_data(req->nc, ECONNABORTED); nvmf_abort_request(req, 0); nvmf_free_request(req); return; } cmd = TAILQ_FIRST(&qp->free_commands); if (cmd == NULL) { /* * Queue this request. Will be sent after enough * in-flight requests have completed. 
*/ STAILQ_INSERT_TAIL(&qp->pending_requests, req, link); mtx_unlock(&qp->lock); return; } TAILQ_REMOVE(&qp->free_commands, cmd, link); KASSERT(qp->active_commands[cmd->cid] == NULL, ("%s: CID already busy", __func__)); qp->active_commands[cmd->cid] = cmd; cmd->req = req; - qp->submitted++; - mtx_unlock(&qp->lock); nvmf_dispatch_command(qp, cmd); } diff --git a/sys/dev/nvmf/nvmf_transport.c b/sys/dev/nvmf/nvmf_transport.c index ea4aee8cc7ae..316d1571e61d 100644 --- a/sys/dev/nvmf/nvmf_transport.c +++ b/sys/dev/nvmf/nvmf_transport.c @@ -1,342 +1,350 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2022-2024 Chelsio Communications, Inc. * Written by: John Baldwin */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* Transport-independent support for fabrics queue pairs and commands. */ struct nvmf_transport { struct nvmf_transport_ops *nt_ops; volatile u_int nt_active_qpairs; SLIST_ENTRY(nvmf_transport) nt_link; }; /* nvmf_transports[nvmf_trtype] is sorted by priority */ static SLIST_HEAD(, nvmf_transport) nvmf_transports[NVMF_TRTYPE_TCP + 1]; static struct sx nvmf_transports_lock; static MALLOC_DEFINE(M_NVMF_TRANSPORT, "nvmf_xport", "NVMe over Fabrics transport"); SYSCTL_NODE(_kern, OID_AUTO, nvmf, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "NVMe over Fabrics"); static bool nvmf_supported_trtype(enum nvmf_trtype trtype) { return (trtype < nitems(nvmf_transports)); } struct nvmf_qpair * nvmf_allocate_qpair(enum nvmf_trtype trtype, bool controller, const struct nvmf_handoff_qpair_params *params, nvmf_qpair_error_t *error_cb, void *error_cb_arg, nvmf_capsule_receive_t *receive_cb, void *receive_cb_arg) { struct nvmf_transport *nt; struct nvmf_qpair *qp; if (!nvmf_supported_trtype(trtype)) return (NULL); sx_slock(&nvmf_transports_lock); SLIST_FOREACH(nt, &nvmf_transports[trtype], nt_link) { qp = nt->nt_ops->allocate_qpair(controller, params); if (qp != NULL) { refcount_acquire(&nt->nt_active_qpairs); break; } } sx_sunlock(&nvmf_transports_lock); if (qp == NULL) return (NULL); qp->nq_transport = nt; qp->nq_ops = nt->nt_ops; qp->nq_controller = controller; qp->nq_error = error_cb; qp->nq_error_arg = error_cb_arg; qp->nq_receive = receive_cb; qp->nq_receive_arg = receive_cb_arg; qp->nq_admin = params->admin; return (qp); } void nvmf_free_qpair(struct nvmf_qpair *qp) { struct nvmf_transport *nt; nt = qp->nq_transport; qp->nq_ops->free_qpair(qp); if (refcount_release(&nt->nt_active_qpairs)) wakeup(nt); } struct nvmf_capsule * nvmf_allocate_command(struct nvmf_qpair *qp, const void *sqe, int how) { struct nvmf_capsule *nc; KASSERT(how == M_WAITOK || how == M_NOWAIT, ("%s: invalid how", __func__)); nc = qp->nq_ops->allocate_capsule(qp, how); if (nc == NULL) return (NULL); nc->nc_qpair = qp; nc->nc_qe_len = sizeof(struct nvme_command); memcpy(&nc->nc_sqe, sqe, nc->nc_qe_len); /* 4.2 of NVMe base spec: Fabrics always uses SGL. 
*/ nc->nc_sqe.fuse &= ~NVMEM(NVME_CMD_PSDT); nc->nc_sqe.fuse |= NVMEF(NVME_CMD_PSDT, NVME_PSDT_SGL); return (nc); } struct nvmf_capsule * nvmf_allocate_response(struct nvmf_qpair *qp, const void *cqe, int how) { struct nvmf_capsule *nc; KASSERT(how == M_WAITOK || how == M_NOWAIT, ("%s: invalid how", __func__)); nc = qp->nq_ops->allocate_capsule(qp, how); if (nc == NULL) return (NULL); nc->nc_qpair = qp; nc->nc_qe_len = sizeof(struct nvme_completion); memcpy(&nc->nc_cqe, cqe, nc->nc_qe_len); return (nc); } int nvmf_capsule_append_data(struct nvmf_capsule *nc, struct memdesc *mem, size_t len, bool send, nvmf_io_complete_t *complete_cb, void *cb_arg) { if (nc->nc_data.io_len != 0) return (EBUSY); nc->nc_send_data = send; nc->nc_data.io_mem = *mem; nc->nc_data.io_len = len; nc->nc_data.io_complete = complete_cb; nc->nc_data.io_complete_arg = cb_arg; return (0); } void nvmf_free_capsule(struct nvmf_capsule *nc) { nc->nc_qpair->nq_ops->free_capsule(nc); } int nvmf_transmit_capsule(struct nvmf_capsule *nc) { return (nc->nc_qpair->nq_ops->transmit_capsule(nc)); } void nvmf_abort_capsule_data(struct nvmf_capsule *nc, int error) { if (nc->nc_data.io_len != 0) nvmf_complete_io_request(&nc->nc_data, 0, error); } void * nvmf_capsule_sqe(struct nvmf_capsule *nc) { KASSERT(nc->nc_qe_len == sizeof(struct nvme_command), ("%s: capsule %p is not a command capsule", __func__, nc)); return (&nc->nc_sqe); } void * nvmf_capsule_cqe(struct nvmf_capsule *nc) { KASSERT(nc->nc_qe_len == sizeof(struct nvme_completion), ("%s: capsule %p is not a response capsule", __func__, nc)); return (&nc->nc_cqe); } +bool +nvmf_sqhd_valid(struct nvmf_capsule *nc) +{ + KASSERT(nc->nc_qe_len == sizeof(struct nvme_completion), + ("%s: capsule %p is not a response capsule", __func__, nc)); + return (nc->nc_sqhd_valid); +} + uint8_t nvmf_validate_command_capsule(struct nvmf_capsule *nc) { KASSERT(nc->nc_qe_len == sizeof(struct nvme_command), ("%s: capsule %p is not a command capsule", __func__, nc)); if (NVMEV(NVME_CMD_PSDT, nc->nc_sqe.fuse) != NVME_PSDT_SGL) return (NVME_SC_INVALID_FIELD); return (nc->nc_qpair->nq_ops->validate_command_capsule(nc)); } size_t nvmf_capsule_data_len(const struct nvmf_capsule *nc) { return (nc->nc_qpair->nq_ops->capsule_data_len(nc)); } int nvmf_receive_controller_data(struct nvmf_capsule *nc, uint32_t data_offset, struct memdesc *mem, size_t len, nvmf_io_complete_t *complete_cb, void *cb_arg) { struct nvmf_io_request io; io.io_mem = *mem; io.io_len = len; io.io_complete = complete_cb; io.io_complete_arg = cb_arg; return (nc->nc_qpair->nq_ops->receive_controller_data(nc, data_offset, &io)); } u_int nvmf_send_controller_data(struct nvmf_capsule *nc, uint32_t data_offset, struct mbuf *m, size_t len) { MPASS(m_length(m, NULL) == len); return (nc->nc_qpair->nq_ops->send_controller_data(nc, data_offset, m, len)); } int nvmf_transport_module_handler(struct module *mod, int what, void *arg) { struct nvmf_transport_ops *ops = arg; struct nvmf_transport *nt, *nt2, *prev; int error; switch (what) { case MOD_LOAD: if (!nvmf_supported_trtype(ops->trtype)) { printf("NVMF: Unsupported transport %u", ops->trtype); return (EINVAL); } nt = malloc(sizeof(*nt), M_NVMF_TRANSPORT, M_WAITOK | M_ZERO); nt->nt_ops = arg; sx_xlock(&nvmf_transports_lock); if (SLIST_EMPTY(&nvmf_transports[ops->trtype])) { SLIST_INSERT_HEAD(&nvmf_transports[ops->trtype], nt, nt_link); } else { prev = NULL; SLIST_FOREACH(nt2, &nvmf_transports[ops->trtype], nt_link) { if (ops->priority > nt2->nt_ops->priority) break; prev = nt2; } if (prev == NULL) 
SLIST_INSERT_HEAD(&nvmf_transports[ops->trtype], nt, nt_link); else SLIST_INSERT_AFTER(prev, nt, nt_link); } sx_xunlock(&nvmf_transports_lock); return (0); case MOD_QUIESCE: if (!nvmf_supported_trtype(ops->trtype)) return (0); sx_slock(&nvmf_transports_lock); SLIST_FOREACH(nt, &nvmf_transports[ops->trtype], nt_link) { if (nt->nt_ops == ops) break; } if (nt == NULL) { sx_sunlock(&nvmf_transports_lock); return (0); } if (nt->nt_active_qpairs != 0) { sx_sunlock(&nvmf_transports_lock); return (EBUSY); } sx_sunlock(&nvmf_transports_lock); return (0); case MOD_UNLOAD: if (!nvmf_supported_trtype(ops->trtype)) return (0); sx_xlock(&nvmf_transports_lock); prev = NULL; SLIST_FOREACH(nt, &nvmf_transports[ops->trtype], nt_link) { if (nt->nt_ops == ops) break; prev = nt; } if (nt == NULL) { sx_xunlock(&nvmf_transports_lock); return (0); } if (prev == NULL) SLIST_REMOVE_HEAD(&nvmf_transports[ops->trtype], nt_link); else SLIST_REMOVE_AFTER(prev, nt_link); error = 0; while (nt->nt_active_qpairs != 0 && error == 0) error = sx_sleep(nt, &nvmf_transports_lock, PCATCH, "nftunld", 0); sx_xunlock(&nvmf_transports_lock); if (error != 0) return (error); free(nt, M_NVMF_TRANSPORT); return (0); default: return (EOPNOTSUPP); } } static int nvmf_transport_modevent(module_t mod __unused, int what, void *arg __unused) { switch (what) { case MOD_LOAD: for (u_int i = 0; i < nitems(nvmf_transports); i++) SLIST_INIT(&nvmf_transports[i]); sx_init(&nvmf_transports_lock, "nvmf transports"); return (0); default: return (EOPNOTSUPP); } } static moduledata_t nvmf_transport_mod = { "nvmf_transport", nvmf_transport_modevent, 0 }; DECLARE_MODULE(nvmf_transport, nvmf_transport_mod, SI_SUB_DRIVERS, SI_ORDER_FIRST); MODULE_VERSION(nvmf_transport, 1); diff --git a/sys/dev/nvmf/nvmf_transport.h b/sys/dev/nvmf/nvmf_transport.h index 549170b25940..bbd830eba576 100644 --- a/sys/dev/nvmf/nvmf_transport.h +++ b/sys/dev/nvmf/nvmf_transport.h @@ -1,140 +1,141 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2022-2024 Chelsio Communications, Inc. * Written by: John Baldwin */ #ifndef __NVMF_TRANSPORT_H__ #define __NVMF_TRANSPORT_H__ /* * Interface used by the Fabrics host (initiator) and controller * (target) to send and receive capsules and associated data. */ #include #include struct mbuf; struct memdesc; struct nvmf_capsule; struct nvmf_connection; struct nvmf_qpair; struct nvmf_handoff_qpair_params; SYSCTL_DECL(_kern_nvmf); /* * Callback to invoke when an error occurs on a qpair. The last * parameter is an error value. If the error value is zero, the qpair * has been closed at the transport level rather than a transport * error occuring. */ typedef void nvmf_qpair_error_t(void *, int); /* Callback to invoke when a capsule is received. */ typedef void nvmf_capsule_receive_t(void *, struct nvmf_capsule *); /* * Callback to invoke when an I/O request has completed. The second * parameter is the amount of data transferred. The last parameter is * an error value which is non-zero if the request did not complete * successfully. A request with an error may complete partially. */ typedef void nvmf_io_complete_t(void *, size_t, int); /* * A queue pair represents either an Admin or I/O * submission/completion queue pair. The params contains negotiated * values passed in from userland. * * Unlike libnvmf in userland, the kernel transport interface does not * have any notion of an association. Instead, qpairs are * independent. 
*/ struct nvmf_qpair *nvmf_allocate_qpair(enum nvmf_trtype trtype, bool controller, const struct nvmf_handoff_qpair_params *params, nvmf_qpair_error_t *error_cb, void *error_cb_arg, nvmf_capsule_receive_t *receive_cb, void *receive_cb_arg); void nvmf_free_qpair(struct nvmf_qpair *qp); /* * Capsules are either commands (host -> controller) or responses * (controller -> host). A data buffer may be associated with a * command capsule. Transmitted data is not copied by this API but * instead must be preserved until the completion callback is invoked * to indicate capsule transmission has completed. */ struct nvmf_capsule *nvmf_allocate_command(struct nvmf_qpair *qp, const void *sqe, int how); struct nvmf_capsule *nvmf_allocate_response(struct nvmf_qpair *qp, const void *cqe, int how); void nvmf_free_capsule(struct nvmf_capsule *nc); int nvmf_capsule_append_data(struct nvmf_capsule *nc, struct memdesc *mem, size_t len, bool send, nvmf_io_complete_t *complete_cb, void *cb_arg); int nvmf_transmit_capsule(struct nvmf_capsule *nc); void nvmf_abort_capsule_data(struct nvmf_capsule *nc, int error); void *nvmf_capsule_sqe(struct nvmf_capsule *nc); void *nvmf_capsule_cqe(struct nvmf_capsule *nc); +bool nvmf_sqhd_valid(struct nvmf_capsule *nc); /* Controller-specific APIs. */ /* * A controller calls this function to check for any * transport-specific errors (invalid fields) in a received command * capsule. The callback returns a generic command status value: * NVME_SC_SUCCESS if no error is found. */ uint8_t nvmf_validate_command_capsule(struct nvmf_capsule *nc); /* * A controller calls this function to query the amount of data * associated with a command capsule. */ size_t nvmf_capsule_data_len(const struct nvmf_capsule *cc); /* * A controller calls this function to receive data associated with a * command capsule (e.g. the data for a WRITE command). This can * either return in-capsule data or fetch data from the host * (e.g. using a R2T PDU over TCP). The received command capsule * should be passed in 'nc'. The received data is stored in 'mem'. * If this function returns success, then the callback will be invoked * once the operation has completed. Note that the callback might be * invoked before this function returns. */ int nvmf_receive_controller_data(struct nvmf_capsule *nc, uint32_t data_offset, struct memdesc *mem, size_t len, nvmf_io_complete_t *complete_cb, void *cb_arg); /* * A controller calls this function to send data in response to a * command prior to sending a response capsule. If an error occurs, * the function returns a generic status completion code to be sent in * the following CQE. Note that the transfer might send a subset of * the data requested by nc. If the transfer succeeds, this function * can return one of the following values: * * - NVME_SC_SUCCESS: The transfer has completed successfully and the * caller should send a success CQE in a response capsule. * * - NVMF_SUCCESS_SENT: The transfer has completed successfully and * the transport layer has sent an implicit success CQE to the * remote host (e.g. the SUCCESS flag for TCP). The caller should * not send a response capsule. * * - NVMF_MORE: The transfer has completed successfully, but the * transfer did not complete the data buffer. * * The mbuf chain in 'm' is consumed by this function even if an error * is returned. */ u_int nvmf_send_controller_data(struct nvmf_capsule *nc, uint32_t data_offset, struct mbuf *m, size_t len); #define NVMF_SUCCESS_SENT 0x100 #define NVMF_MORE 0x101 #endif /* !__NVMF_TRANSPORT_H__ */
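A note for readers following the flow-control bookkeeping this patch adds to nvmf_dispatch_command() and nvmf_receive_capsule(): the head/tail arithmetic is ordinary circular-queue accounting over qsize == num_commands + 1 entries. The sketch below is a stand-alone illustrative model of that accounting, not driver code; the names sq_tracker, sq_submit, and sq_complete are hypothetical and exist only for this example.

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

/*
 * Model of the SQ head/tail tracking added by this patch.  At most
 * qsize - 1 commands may be outstanding (the driver caps in-flight
 * commands at num_commands == qsize - 1), which is why the assertion
 * in sq_submit() cannot fire.
 */
struct sq_tracker {
	uint16_t sqhd;		/* last head value credited back */
	uint16_t sqtail;	/* next free submission queue slot */
	uint16_t qsize;		/* number of SQ entries */
	bool	 sq_flow_control;
};

/* Advance the tail when a command is handed to the transport. */
static void
sq_submit(struct sq_tracker *t)
{
	uint16_t new_tail;

	new_tail = (t->sqtail + 1) % t->qsize;
	assert(new_tail != t->sqhd);	/* queue must never become full */
	t->sqtail = new_tail;
}

/* Advance the head when a response capsule arrives. */
static void
sq_complete(struct sq_tracker *t, bool sqhd_valid, uint16_t sqhd)
{
	if (t->sq_flow_control) {
		/* Only trust a head value the transport marked valid. */
		if (sqhd_valid)
			t->sqhd = sqhd;
	} else {
		/* No SQ flow control: each completion frees one slot. */
		t->sqhd = (t->sqhd + 1) % t->qsize;
	}
}

In the driver the same arithmetic runs under qp->lock, and the controller-reported head arrives little-endian in the CQE and is converted with le16toh() before being stored, exactly as in the nvmf_receive_capsule() hunk above.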
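The sq_head, sq_tail, and num_cmds sysctls added by nvmf_sysctls_qp() expose these counters read-only. If an operator wants to derive the host's view of SQ occupancy from the sysctl values, the usual ring arithmetic applies; the helper below is hypothetical and not part of the patch.

/*
 * Hypothetical helper: commands the host believes are in flight on a
 * queue pair, computed from the sq_head/sq_tail values exported via
 * sysctl.  qsize corresponds to the num_entries sysctl
 * (num_commands + 1).
 */
static inline uint16_t
sq_inflight(uint16_t sqhd, uint16_t sqtail, uint16_t qsize)
{
	return ((sqtail + qsize - sqhd) % qsize);
}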