diff --git a/share/man/man4/Makefile b/share/man/man4/Makefile --- a/share/man/man4/Makefile +++ b/share/man/man4/Makefile @@ -408,6 +408,7 @@ nvd.4 \ ${_nvdimm.4} \ nvme.4 \ + nvmf.4 \ nvmf_tcp.4 \ ${_nvram.4} \ oce.4 \ diff --git a/share/man/man4/nvmf.4 b/share/man/man4/nvmf.4 new file mode 100644 --- /dev/null +++ b/share/man/man4/nvmf.4 @@ -0,0 +1,87 @@ +.\" +.\" SPDX-License-Identifier: BSD-2-Clause +.\" +.\" Copyright (c) 2024 Chelsio Communications, Inc. +.\" +.Dd May 2, 2024 +.Dt NVMF 4 +.Os +.Sh NAME +.Nm nvmf +.Nd "NVM Express over Fabrics host driver" +.Sh SYNOPSIS +To compile the driver into the kernel, +place the following line in the +kernel configuration file: +.Bd -ragged -offset indent +.Cd "device nvmf" +.Ed +.Pp +Alternatively, to load the driver as a +module at boot time, place the following line in +.Xr loader.conf 5 : +.Bd -literal -offset indent +nvmf_load="YES" +.Ed +.Sh DESCRIPTION +The +.Nm +driver provides the kernel component of an NVM Express over Fabrics +host. +The NVMeoF host is the client which provides local access to +namespaces exported by a remote controller. +.Pp +Associations between the local host and remote controllers are managed +using +.Xr nvmecontrol 8 . +New associations are created via the +.Cm connect +command and destroyed via the +.Cm disconnect +command. +If an association's connection is interrupted, +the +.Cm reconnect +command creates a new association to replace the interrupted association. +.Pp +Similar to +.Xr nvme 4 , +.Nm +creates controller device nodes using the format +.Pa /dev/nvmeX +and namespace device nodes using the format +.Pa /dev/nvmeXnsY . +.Nm +also exports remote namespaces via the CAM +.Xr nda 4 +peripheral driver. +Unlike +.Xr nvme 4 , +.Nm +does not support the +.Xr nvd 4 +disk driver. +.Pp +Associations require a supported transport such as +.Xr nvmf_tcp 4 +for associations using TCP/IP. +.Sh SEE ALSO +.Xr nda 4 , +.Xr nvme 4 , +.Xr nvmf_tcp 4 , +.Xr nvmft 4 , +.Xr nvmecontrol 8 +.Sh HISTORY +The +.Nm +module first appeared in +.Fx 15.0 . +.Sh AUTHORS +The +.Nm +driver was developed by +.An John Baldwin Aq Mt jhb@FreeBSD.org +under sponsorship from Chelsio Communications, Inc. +.Sh BUGS +.Nm +only supports a single I/O queue pair per association. 
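The namespace device nodes described in the manual page above answer the same ioctls as their nvme(4) counterparts, so a node such as /dev/nvme0ns1 can be resolved back to its controller and namespace ID. A minimal userland sketch of that lookup follows; it is illustrative only (not part of this patch) and assumes the stock nvme(4) ioctl interface from <dev/nvme/nvme.h> and an example node name.

/*
 * Illustrative sketch: resolve a Fabrics namespace device node to its
 * controller name and namespace ID via NVME_GET_NSID, the same ioctl
 * nvmf_ns_ioctl() answers later in this patch.  The node name
 * /dev/nvme0ns1 is an example.
 */
#include <sys/param.h>
#include <sys/ioctl.h>

#include <dev/nvme/nvme.h>

#include <err.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	struct nvme_get_nsid gnsid;
	int fd;

	fd = open("/dev/nvme0ns1", O_RDONLY);
	if (fd < 0)
		err(1, "open(/dev/nvme0ns1)");
	memset(&gnsid, 0, sizeof(gnsid));
	if (ioctl(fd, NVME_GET_NSID, &gnsid) < 0)
		err(1, "NVME_GET_NSID");
	printf("controller %s, namespace %u\n", gnsid.cdev, gnsid.nsid);
	close(fd);
	return (0);
}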
diff --git a/sys/conf/NOTES b/sys/conf/NOTES --- a/sys/conf/NOTES +++ b/sys/conf/NOTES @@ -1676,12 +1676,14 @@ # NVM Express # # nvme: PCI-express NVM Express host controllers +# nvmf: NVM Express over Fabrics host # nvmf_tcp: TCP transport for NVM Express over Fabrics # nda: CAM NVMe disk driver # nvd: non-CAM NVMe disk driver -device nvme # base NVMe driver +device nvme # PCI-express NVMe host driver options NVME_USE_NVD=1 # Use nvd(4) instead of the CAM nda(4) driver +device nvmf # NVMeoF host driver device nvmf_tcp # NVMeoF TCP transport device nda # NVMe direct access devices (aka disks) device nvd # expose NVMe namespaces as disks, depends on nvme diff --git a/sys/conf/files b/sys/conf/files --- a/sys/conf/files +++ b/sys/conf/files @@ -2533,7 +2533,15 @@ dev/nvme/nvme_util.c optional nvme dev/nvmem/nvmem.c optional nvmem fdt dev/nvmem/nvmem_if.m optional nvmem +dev/nvmf/host/nvmf.c optional nvmf +dev/nvmf/host/nvmf_aer.c optional nvmf +dev/nvmf/host/nvmf_cmd.c optional nvmf +dev/nvmf/host/nvmf_ctldev.c optional nvmf +dev/nvmf/host/nvmf_ns.c optional nvmf +dev/nvmf/host/nvmf_qpair.c optional nvmf +dev/nvmf/host/nvmf_sim.c optional nvmf dev/nvmf/nvmf_tcp.c optional nvmf_tcp +dev/nvmf/nvmf_transport.c optional nvmf dev/oce/oce_hw.c optional oce pci dev/oce/oce_if.c optional oce pci dev/oce/oce_mbox.c optional oce pci diff --git a/sys/dev/nvmf/host/nvmf.c b/sys/dev/nvmf/host/nvmf.c new file mode 100644 --- /dev/null +++ b/sys/dev/nvmf/host/nvmf.c @@ -0,0 +1,939 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2023-2024 Chelsio Communications, Inc. + * Written by: John Baldwin + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static struct cdevsw nvmf_cdevsw; + +MALLOC_DEFINE(M_NVMF, "nvmf", "NVMe over Fabrics host"); + +static void nvmf_disconnect_task(void *arg, int pending); + +void +nvmf_complete(void *arg, const struct nvme_completion *cqe) +{ + struct nvmf_completion_status *status = arg; + struct mtx *mtx; + + status->cqe = *cqe; + mtx = mtx_pool_find(mtxpool_sleep, status); + mtx_lock(mtx); + status->done = true; + mtx_unlock(mtx); + wakeup(status); +} + +void +nvmf_io_complete(void *arg, size_t xfered, int error) +{ + struct nvmf_completion_status *status = arg; + struct mtx *mtx; + + status->io_error = error; + mtx = mtx_pool_find(mtxpool_sleep, status); + mtx_lock(mtx); + status->io_done = true; + mtx_unlock(mtx); + wakeup(status); +} + +void +nvmf_wait_for_reply(struct nvmf_completion_status *status) +{ + struct mtx *mtx; + + mtx = mtx_pool_find(mtxpool_sleep, status); + mtx_lock(mtx); + while (!status->done || !status->io_done) + mtx_sleep(status, mtx, 0, "nvmfcmd", 0); + mtx_unlock(mtx); +} + +static int +nvmf_read_property(struct nvmf_softc *sc, uint32_t offset, uint8_t size, + uint64_t *value) +{ + const struct nvmf_fabric_prop_get_rsp *rsp; + struct nvmf_completion_status status; + + nvmf_status_init(&status); + if (!nvmf_cmd_get_property(sc, offset, size, nvmf_complete, &status, + M_WAITOK)) + return (ECONNABORTED); + nvmf_wait_for_reply(&status); + + if (status.cqe.status != 0) { + device_printf(sc->dev, "PROPERTY_GET failed, status %#x\n", + le16toh(status.cqe.status)); + return (EIO); + } + + rsp = (const struct nvmf_fabric_prop_get_rsp *)&status.cqe; + if (size == 8) + *value = le64toh(rsp->value.u64); + else + *value = le32toh(rsp->value.u32.low); + return (0); +} + +static int +nvmf_write_property(struct nvmf_softc *sc, uint32_t 
offset, uint8_t size, + uint64_t value) +{ + struct nvmf_completion_status status; + + nvmf_status_init(&status); + if (!nvmf_cmd_set_property(sc, offset, size, value, nvmf_complete, &status, + M_WAITOK)) + return (ECONNABORTED); + nvmf_wait_for_reply(&status); + + if (status.cqe.status != 0) { + device_printf(sc->dev, "PROPERTY_SET failed, status %#x\n", + le16toh(status.cqe.status)); + return (EIO); + } + return (0); +} + +static void +nvmf_shutdown_controller(struct nvmf_softc *sc) +{ + uint64_t cc; + int error; + + error = nvmf_read_property(sc, NVMF_PROP_CC, 4, &cc); + if (error != 0) { + device_printf(sc->dev, "Failed to fetch CC for shutdown\n"); + return; + } + + cc |= NVMEF(NVME_CC_REG_SHN, NVME_SHN_NORMAL); + + error = nvmf_write_property(sc, NVMF_PROP_CC, 4, cc); + if (error != 0) + device_printf(sc->dev, + "Failed to set CC to trigger shutdown\n"); +} + +static void +nvmf_check_keep_alive(void *arg) +{ + struct nvmf_softc *sc = arg; + int traffic; + + traffic = atomic_readandclear_int(&sc->ka_active_rx_traffic); + if (traffic == 0) { + device_printf(sc->dev, + "disconnecting due to KeepAlive timeout\n"); + nvmf_disconnect(sc); + return; + } + + callout_schedule_sbt(&sc->ka_rx_timer, sc->ka_rx_sbt, 0, C_HARDCLOCK); +} + +static void +nvmf_keep_alive_complete(void *arg, const struct nvme_completion *cqe) +{ + struct nvmf_softc *sc = arg; + + atomic_store_int(&sc->ka_active_rx_traffic, 1); + if (cqe->status != 0) { + device_printf(sc->dev, + "KeepAlive response reported status %#x\n", + le16toh(cqe->status)); + } +} + +static void +nvmf_send_keep_alive(void *arg) +{ + struct nvmf_softc *sc = arg; + int traffic; + + /* + * Don't bother sending a KeepAlive command if TKAS is active + * and another command has been sent during the interval. + */ + traffic = atomic_load_int(&sc->ka_active_tx_traffic); + if (traffic == 0 && !nvmf_cmd_keep_alive(sc, nvmf_keep_alive_complete, + sc, M_NOWAIT)) + device_printf(sc->dev, + "Failed to allocate KeepAlive command\n"); + + /* Clear ka_active_tx_traffic after sending the keep alive command. */ + atomic_store_int(&sc->ka_active_tx_traffic, 0); + + callout_schedule_sbt(&sc->ka_tx_timer, sc->ka_tx_sbt, 0, C_HARDCLOCK); +} + +int +nvmf_init_ivars(struct nvmf_ivars *ivars, struct nvmf_handoff_host *hh) +{ + size_t len; + u_int i; + int error; + + memset(ivars, 0, sizeof(*ivars)); + + if (!hh->admin.admin || hh->num_io_queues < 1) + return (EINVAL); + + ivars->cdata = malloc(sizeof(*ivars->cdata), M_NVMF, M_WAITOK); + error = copyin(hh->cdata, ivars->cdata, sizeof(*ivars->cdata)); + if (error != 0) + goto out; + nvme_controller_data_swapbytes(ivars->cdata); + + len = hh->num_io_queues * sizeof(*ivars->io_params); + ivars->io_params = malloc(len, M_NVMF, M_WAITOK); + error = copyin(hh->io, ivars->io_params, len); + if (error != 0) + goto out; + for (i = 0; i < hh->num_io_queues; i++) { + if (ivars->io_params[i].admin) { + error = EINVAL; + goto out; + } + + /* Require all I/O queues to be the same size. 
*/ + if (ivars->io_params[i].qsize != ivars->io_params[0].qsize) { + error = EINVAL; + goto out; + } + } + + ivars->hh = hh; + return (0); + +out: + free(ivars->io_params, M_NVMF); + free(ivars->cdata, M_NVMF); + return (error); +} + +void +nvmf_free_ivars(struct nvmf_ivars *ivars) +{ + free(ivars->io_params, M_NVMF); + free(ivars->cdata, M_NVMF); +} + +static int +nvmf_probe(device_t dev) +{ + struct nvmf_ivars *ivars = device_get_ivars(dev); + char desc[260]; + + if (ivars == NULL) + return (ENXIO); + + snprintf(desc, sizeof(desc), "Fabrics: %.256s", ivars->cdata->subnqn); + device_set_desc_copy(dev, desc); + return (BUS_PROBE_DEFAULT); +} + +static int +nvmf_establish_connection(struct nvmf_softc *sc, struct nvmf_ivars *ivars) +{ + char name[16]; + + /* Setup the admin queue. */ + sc->admin = nvmf_init_qp(sc, ivars->hh->trtype, &ivars->hh->admin, + "admin queue"); + if (sc->admin == NULL) { + device_printf(sc->dev, "Failed to setup admin queue\n"); + return (ENXIO); + } + + /* Setup I/O queues. */ + sc->io = malloc(ivars->hh->num_io_queues * sizeof(*sc->io), M_NVMF, + M_WAITOK | M_ZERO); + sc->num_io_queues = ivars->hh->num_io_queues; + for (u_int i = 0; i < sc->num_io_queues; i++) { + snprintf(name, sizeof(name), "I/O queue %u", i); + sc->io[i] = nvmf_init_qp(sc, ivars->hh->trtype, + &ivars->io_params[i], name); + if (sc->io[i] == NULL) { + device_printf(sc->dev, "Failed to setup I/O queue %u\n", + i + 1); + return (ENXIO); + } + } + + /* Start KeepAlive timers. */ + if (ivars->hh->kato != 0) { + sc->ka_traffic = NVMEV(NVME_CTRLR_DATA_CTRATT_TBKAS, + sc->cdata->ctratt) != 0; + sc->ka_rx_sbt = mstosbt(ivars->hh->kato); + sc->ka_tx_sbt = sc->ka_rx_sbt / 2; + callout_reset_sbt(&sc->ka_rx_timer, sc->ka_rx_sbt, 0, + nvmf_check_keep_alive, sc, C_HARDCLOCK); + callout_reset_sbt(&sc->ka_tx_timer, sc->ka_tx_sbt, 0, + nvmf_send_keep_alive, sc, C_HARDCLOCK); + } + + return (0); +} + +static bool +nvmf_scan_nslist(struct nvmf_softc *sc, struct nvme_ns_list *nslist, + struct nvme_namespace_data *data, uint32_t *nsidp) +{ + struct nvmf_completion_status status; + uint32_t nsid; + + nvmf_status_init(&status); + nvmf_status_wait_io(&status); + if (!nvmf_cmd_identify_active_namespaces(sc, *nsidp, nslist, + nvmf_complete, &status, nvmf_io_complete, &status, M_WAITOK)) { + device_printf(sc->dev, + "failed to send IDENTIFY active namespaces command\n"); + return (false); + } + nvmf_wait_for_reply(&status); + + if (status.cqe.status != 0) { + device_printf(sc->dev, + "IDENTIFY active namespaces failed, status %#x\n", + le16toh(status.cqe.status)); + return (false); + } + + if (status.io_error != 0) { + device_printf(sc->dev, + "IDENTIFY active namespaces failed with I/O error %d\n", + status.io_error); + return (false); + } + + for (u_int i = 0; i < nitems(nslist->ns); i++) { + nsid = nslist->ns[i]; + if (nsid == 0) { + *nsidp = 0; + return (true); + } + + if (sc->ns[nsid - 1] != NULL) { + device_printf(sc->dev, + "duplicate namespace %u in active namespace list\n", + nsid); + return (false); + } + + nvmf_status_init(&status); + nvmf_status_wait_io(&status); + if (!nvmf_cmd_identify_namespace(sc, nsid, data, nvmf_complete, + &status, nvmf_io_complete, &status, M_WAITOK)) { + device_printf(sc->dev, + "failed to send IDENTIFY namespace %u command\n", + nsid); + return (false); + } + nvmf_wait_for_reply(&status); + + if (status.cqe.status != 0) { + device_printf(sc->dev, + "IDENTIFY namespace %u failed, status %#x\n", nsid, + le16toh(status.cqe.status)); + return (false); + } + + if (status.io_error != 0) { + 
device_printf(sc->dev, + "IDENTIFY namespace %u failed with I/O error %d\n", + nsid, status.io_error); + return (false); + } + + /* + * As in nvme_ns_construct, a size of zero indicates an + * invalid namespace. + */ + nvme_namespace_data_swapbytes(data); + if (data->nsze == 0) { + device_printf(sc->dev, + "ignoring active namespace %u with zero size\n", + nsid); + continue; + } + + sc->ns[nsid - 1] = nvmf_init_ns(sc, nsid, data); + + nvmf_sim_rescan_ns(sc, nsid); + } + + MPASS(nsid == nslist->ns[nitems(nslist->ns) - 1] && nsid != 0); + + if (nsid >= 0xfffffffd) + *nsidp = 0; + else + *nsidp = nsid + 1; + return (true); +} + +static bool +nvmf_add_namespaces(struct nvmf_softc *sc) +{ + struct nvme_namespace_data *data; + struct nvme_ns_list *nslist; + uint32_t nsid; + bool retval; + + sc->ns = mallocarray(sc->cdata->nn, sizeof(*sc->ns), M_NVMF, + M_WAITOK | M_ZERO); + nslist = malloc(sizeof(*nslist), M_NVMF, M_WAITOK); + data = malloc(sizeof(*data), M_NVMF, M_WAITOK); + + nsid = 0; + retval = true; + for (;;) { + if (!nvmf_scan_nslist(sc, nslist, data, &nsid)) { + retval = false; + break; + } + if (nsid == 0) + break; + } + + free(data, M_NVMF); + free(nslist, M_NVMF); + return (retval); +} + +static int +nvmf_attach(device_t dev) +{ + struct make_dev_args mda; + struct nvmf_softc *sc = device_get_softc(dev); + struct nvmf_ivars *ivars = device_get_ivars(dev); + uint64_t val; + u_int i; + int error; + + if (ivars == NULL) + return (ENXIO); + + sc->dev = dev; + sc->trtype = ivars->hh->trtype; + callout_init(&sc->ka_rx_timer, 1); + callout_init(&sc->ka_tx_timer, 1); + sx_init(&sc->connection_lock, "nvmf connection"); + TASK_INIT(&sc->disconnect_task, 0, nvmf_disconnect_task, sc); + + /* Claim the cdata pointer from ivars. */ + sc->cdata = ivars->cdata; + ivars->cdata = NULL; + + nvmf_init_aer(sc); + + /* TODO: Multiqueue support. */ + sc->max_pending_io = ivars->io_params[0].qsize /* * sc->num_io_queues */; + + error = nvmf_establish_connection(sc, ivars); + if (error != 0) + goto out; + + error = nvmf_read_property(sc, NVMF_PROP_CAP, 8, &sc->cap); + if (error != 0) { + device_printf(sc->dev, "Failed to fetch CAP\n"); + error = ENXIO; + goto out; + } + + error = nvmf_read_property(sc, NVMF_PROP_VS, 4, &val); + if (error != 0) { + device_printf(sc->dev, "Failed to fetch VS\n"); + error = ENXIO; + goto out; + } + sc->vs = val; + + /* Honor MDTS if it is set. 
*/ + sc->max_xfer_size = maxphys; + if (sc->cdata->mdts != 0) { + sc->max_xfer_size = ulmin(sc->max_xfer_size, + 1 << (sc->cdata->mdts + NVME_MPS_SHIFT + + NVME_CAP_HI_MPSMIN(sc->cap >> 32))); + } + + error = nvmf_init_sim(sc); + if (error != 0) + goto out; + + error = nvmf_start_aer(sc); + if (error != 0) { + nvmf_destroy_sim(sc); + goto out; + } + + if (!nvmf_add_namespaces(sc)) { + nvmf_destroy_sim(sc); + goto out; + } + + make_dev_args_init(&mda); + mda.mda_devsw = &nvmf_cdevsw; + mda.mda_uid = UID_ROOT; + mda.mda_gid = GID_WHEEL; + mda.mda_mode = 0600; + mda.mda_si_drv1 = sc; + error = make_dev_s(&mda, &sc->cdev, "%s", device_get_nameunit(dev)); + if (error != 0) { + nvmf_destroy_sim(sc); + goto out; + } + + return (0); +out: + if (sc->ns != NULL) { + for (i = 0; i < sc->cdata->nn; i++) { + if (sc->ns[i] != NULL) + nvmf_destroy_ns(sc->ns[i]); + } + free(sc->ns, M_NVMF); + } + + callout_drain(&sc->ka_tx_timer); + callout_drain(&sc->ka_rx_timer); + + if (sc->admin != NULL) + nvmf_shutdown_controller(sc); + + for (i = 0; i < sc->num_io_queues; i++) { + if (sc->io[i] != NULL) + nvmf_destroy_qp(sc->io[i]); + } + free(sc->io, M_NVMF); + if (sc->admin != NULL) + nvmf_destroy_qp(sc->admin); + + nvmf_destroy_aer(sc); + + taskqueue_drain(taskqueue_thread, &sc->disconnect_task); + sx_destroy(&sc->connection_lock); + free(sc->cdata, M_NVMF); + return (error); +} + +void +nvmf_disconnect(struct nvmf_softc *sc) +{ + taskqueue_enqueue(taskqueue_thread, &sc->disconnect_task); +} + +static void +nvmf_disconnect_task(void *arg, int pending __unused) +{ + struct nvmf_softc *sc = arg; + u_int i; + + sx_xlock(&sc->connection_lock); + if (sc->admin == NULL) { + /* + * Ignore transport errors if there is no active + * association. + */ + sx_xunlock(&sc->connection_lock); + return; + } + + if (sc->detaching) { + if (sc->admin != NULL) { + /* + * This unsticks the detach process if a + * transport error occurs during detach. + */ + nvmf_shutdown_qp(sc->admin); + } + sx_xunlock(&sc->connection_lock); + return; + } + + if (sc->cdev == NULL) { + /* + * Transport error occurred during attach (nvmf_add_namespaces). + * Shutdown the admin queue. + */ + nvmf_shutdown_qp(sc->admin); + sx_xunlock(&sc->connection_lock); + return; + } + + callout_drain(&sc->ka_tx_timer); + callout_drain(&sc->ka_rx_timer); + sc->ka_traffic = false; + + /* Quiesce namespace consumers. */ + nvmf_disconnect_sim(sc); + for (i = 0; i < sc->cdata->nn; i++) { + if (sc->ns[i] != NULL) + nvmf_disconnect_ns(sc->ns[i]); + } + + /* Shutdown the existing qpairs. */ + for (i = 0; i < sc->num_io_queues; i++) { + nvmf_destroy_qp(sc->io[i]); + } + free(sc->io, M_NVMF); + sc->io = NULL; + sc->num_io_queues = 0; + nvmf_destroy_qp(sc->admin); + sc->admin = NULL; + + sx_xunlock(&sc->connection_lock); +} + +static int +nvmf_reconnect_host(struct nvmf_softc *sc, struct nvmf_handoff_host *hh) +{ + struct nvmf_ivars ivars; + u_int i; + int error; + + /* XXX: Should we permit changing the transport type? */ + if (sc->trtype != hh->trtype) { + device_printf(sc->dev, + "transport type mismatch on reconnect\n"); + return (EINVAL); + } + + error = nvmf_init_ivars(&ivars, hh); + if (error != 0) + return (error); + + sx_xlock(&sc->connection_lock); + if (sc->admin != NULL || sc->detaching) { + error = EBUSY; + goto out; + } + + /* + * Ensure this is for the same controller. Note that the + * controller ID can vary across associations if the remote + * system is using the dynamic controller model. 
This merely + * ensures the new association is connected to the same NVMe + * subsystem. + */ + if (memcmp(sc->cdata->subnqn, ivars.cdata->subnqn, + sizeof(ivars.cdata->subnqn)) != 0) { + device_printf(sc->dev, + "controller subsystem NQN mismatch on reconnect\n"); + error = EINVAL; + goto out; + } + + /* + * XXX: Require same number and size of I/O queues so that + * max_pending_io is still correct? + */ + + error = nvmf_establish_connection(sc, &ivars); + if (error != 0) + goto out; + + error = nvmf_start_aer(sc); + if (error != 0) + goto out; + + device_printf(sc->dev, + "established new association with %u I/O queues\n", + sc->num_io_queues); + + /* Restart namespace consumers. */ + for (i = 0; i < sc->cdata->nn; i++) { + if (sc->ns[i] != NULL) + nvmf_reconnect_ns(sc->ns[i]); + } + nvmf_reconnect_sim(sc); +out: + sx_xunlock(&sc->connection_lock); + nvmf_free_ivars(&ivars); + return (error); +} + +static int +nvmf_detach(device_t dev) +{ + struct nvmf_softc *sc = device_get_softc(dev); + u_int i; + + destroy_dev(sc->cdev); + + sx_xlock(&sc->connection_lock); + sc->detaching = true; + sx_xunlock(&sc->connection_lock); + + nvmf_destroy_sim(sc); + for (i = 0; i < sc->cdata->nn; i++) { + if (sc->ns[i] != NULL) + nvmf_destroy_ns(sc->ns[i]); + } + free(sc->ns, M_NVMF); + + callout_drain(&sc->ka_tx_timer); + callout_drain(&sc->ka_rx_timer); + + if (sc->admin != NULL) + nvmf_shutdown_controller(sc); + + for (i = 0; i < sc->num_io_queues; i++) { + nvmf_destroy_qp(sc->io[i]); + } + free(sc->io, M_NVMF); + + taskqueue_drain(taskqueue_thread, &sc->disconnect_task); + + if (sc->admin != NULL) + nvmf_destroy_qp(sc->admin); + + nvmf_destroy_aer(sc); + + sx_destroy(&sc->connection_lock); + free(sc->cdata, M_NVMF); + return (0); +} + +void +nvmf_rescan_ns(struct nvmf_softc *sc, uint32_t nsid) +{ + struct nvmf_completion_status status; + struct nvme_namespace_data *data; + struct nvmf_namespace *ns; + + data = malloc(sizeof(*data), M_NVMF, M_WAITOK); + + nvmf_status_init(&status); + nvmf_status_wait_io(&status); + if (!nvmf_cmd_identify_namespace(sc, nsid, data, nvmf_complete, + &status, nvmf_io_complete, &status, M_WAITOK)) { + device_printf(sc->dev, + "failed to send IDENTIFY namespace %u command\n", nsid); + free(data, M_NVMF); + return; + } + nvmf_wait_for_reply(&status); + + if (status.cqe.status != 0) { + device_printf(sc->dev, + "IDENTIFY namespace %u failed, status %#x\n", nsid, + le16toh(status.cqe.status)); + free(data, M_NVMF); + return; + } + + if (status.io_error != 0) { + device_printf(sc->dev, + "IDENTIFY namespace %u failed with I/O error %d\n", + nsid, status.io_error); + free(data, M_NVMF); + return; + } + + nvme_namespace_data_swapbytes(data); + + /* XXX: Needs locking around sc->ns[]. 
*/ + ns = sc->ns[nsid - 1]; + if (data->nsze == 0) { + /* XXX: Needs locking */ + if (ns != NULL) { + nvmf_destroy_ns(ns); + sc->ns[nsid - 1] = NULL; + } + } else { + /* XXX: Needs locking */ + if (ns == NULL) { + sc->ns[nsid - 1] = nvmf_init_ns(sc, nsid, data); + } else { + if (!nvmf_update_ns(ns, data)) { + nvmf_destroy_ns(ns); + sc->ns[nsid - 1] = NULL; + } + } + } + + free(data, M_NVMF); + + nvmf_sim_rescan_ns(sc, nsid); +} + +int +nvmf_passthrough_cmd(struct nvmf_softc *sc, struct nvme_pt_command *pt, + bool admin) +{ + struct nvmf_completion_status status; + struct nvme_command cmd; + struct memdesc mem; + struct nvmf_host_qpair *qp; + struct nvmf_request *req; + void *buf; + int error; + + if (pt->len > sc->max_xfer_size) + return (EINVAL); + + buf = NULL; + if (pt->len != 0) { + /* + * XXX: Depending on the size we may want to pin the + * user pages and use a memdesc with vm_page_t's + * instead. + */ + buf = malloc(pt->len, M_NVMF, M_WAITOK); + if (pt->is_read == 0) { + error = copyin(pt->buf, buf, pt->len); + if (error != 0) { + free(buf, M_NVMF); + return (error); + } + } else { + /* Ensure no kernel data is leaked to userland. */ + memset(buf, 0, pt->len); + } + } + + memset(&cmd, 0, sizeof(cmd)); + cmd.opc = pt->cmd.opc; + cmd.fuse = pt->cmd.fuse; + cmd.nsid = pt->cmd.nsid; + cmd.cdw10 = pt->cmd.cdw10; + cmd.cdw11 = pt->cmd.cdw11; + cmd.cdw12 = pt->cmd.cdw12; + cmd.cdw13 = pt->cmd.cdw13; + cmd.cdw14 = pt->cmd.cdw14; + cmd.cdw15 = pt->cmd.cdw15; + + if (admin) + qp = sc->admin; + else + qp = nvmf_select_io_queue(sc); + nvmf_status_init(&status); + req = nvmf_allocate_request(qp, &cmd, nvmf_complete, &status, M_WAITOK); + if (req == NULL) { + device_printf(sc->dev, "failed to send passthrough command\n"); + error = ECONNABORTED; + goto error; + } + + if (pt->len != 0) { + mem = memdesc_vaddr(buf, pt->len); + nvmf_capsule_append_data(req->nc, &mem, pt->len, + pt->is_read == 0, nvmf_io_complete, &status); + nvmf_status_wait_io(&status); + } + + nvmf_submit_request(req); + nvmf_wait_for_reply(&status); + + memset(&pt->cpl, 0, sizeof(pt->cpl)); + pt->cpl.cdw0 = status.cqe.cdw0; + pt->cpl.status = status.cqe.status; + + error = status.io_error; + if (error == 0 && pt->len != 0 && pt->is_read != 0) + error = copyout(buf, pt->buf, pt->len); +error: + free(buf, M_NVMF); + return (error); +} + +static int +nvmf_ioctl(struct cdev *cdev, u_long cmd, caddr_t arg, int flag, + struct thread *td) +{ + struct nvmf_softc *sc = cdev->si_drv1; + struct nvme_get_nsid *gnsid; + struct nvme_pt_command *pt; + struct nvmf_reconnect_params *rp; + struct nvmf_handoff_host *hh; + + switch (cmd) { + case NVME_PASSTHROUGH_CMD: + pt = (struct nvme_pt_command *)arg; + return (nvmf_passthrough_cmd(sc, pt, true)); + case NVME_GET_NSID: + gnsid = (struct nvme_get_nsid *)arg; + strncpy(gnsid->cdev, device_get_nameunit(sc->dev), + sizeof(gnsid->cdev)); + gnsid->cdev[sizeof(gnsid->cdev) - 1] = '\0'; + gnsid->nsid = 0; + return (0); + case NVME_GET_MAX_XFER_SIZE: + *(uint64_t *)arg = sc->max_xfer_size; + return (0); + case NVMF_RECONNECT_PARAMS: + rp = (struct nvmf_reconnect_params *)arg; + if ((sc->cdata->fcatt & 1) == 0) + rp->cntlid = NVMF_CNTLID_DYNAMIC; + else + rp->cntlid = sc->cdata->ctrlr_id; + memcpy(rp->subnqn, sc->cdata->subnqn, sizeof(rp->subnqn)); + return (0); + case NVMF_RECONNECT_HOST: + hh = (struct nvmf_handoff_host *)arg; + return (nvmf_reconnect_host(sc, hh)); + default: + return (ENOTTY); + } +} + +static struct cdevsw nvmf_cdevsw = { + .d_version = D_VERSION, + .d_ioctl = nvmf_ioctl +}; + 
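nvmf_ioctl() above keeps the per-controller character device compatible with nvme(4): NVME_PASSTHROUGH_CMD requests are funneled through nvmf_passthrough_cmd() onto the admin queue. The sketch below shows how userland might drive that path with an Identify Controller command; it is illustrative only (not part of this patch), assumes struct nvme_pt_command and its helpers from <dev/nvme/nvme.h>, and uses /dev/nvme0 as an example node.

/*
 * Illustrative sketch: issue an admin Identify Controller (CNS 0x01)
 * command through the passthrough ioctl handled by nvmf_ioctl() and
 * nvmf_passthrough_cmd() above.  The byte swap mirrors what
 * nvmecontrol(8)-style consumers are expected to do with raw
 * passthrough data.
 */
#include <sys/param.h>
#include <sys/endian.h>
#include <sys/ioctl.h>

#include <dev/nvme/nvme.h>

#include <err.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	struct nvme_controller_data cdata;
	struct nvme_pt_command pt;
	int fd;

	fd = open("/dev/nvme0", O_RDWR);
	if (fd < 0)
		err(1, "open(/dev/nvme0)");

	memset(&pt, 0, sizeof(pt));
	pt.cmd.opc = NVME_OPC_IDENTIFY;
	pt.cmd.cdw10 = htole32(1);	/* CNS 0x01: controller data */
	pt.buf = &cdata;
	pt.len = sizeof(cdata);
	pt.is_read = 1;			/* controller-to-host transfer */

	if (ioctl(fd, NVME_PASSTHROUGH_CMD, &pt) < 0)
		err(1, "NVME_PASSTHROUGH_CMD");
	if (pt.cpl.status != 0)
		errx(1, "Identify failed, status %#x", le16toh(pt.cpl.status));

	nvme_controller_data_swapbytes(&cdata);
	printf("subnqn: %.256s\n", (const char *)cdata.subnqn);
	close(fd);
	return (0);
}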
+static int +nvmf_modevent(module_t mod, int what, void *arg) +{ + switch (what) { + case MOD_LOAD: + return (nvmf_ctl_load()); + case MOD_QUIESCE: + return (0); + case MOD_UNLOAD: + nvmf_ctl_unload(); + destroy_dev_drain(&nvmf_cdevsw); + return (0); + default: + return (EOPNOTSUPP); + } +} + +static device_method_t nvmf_methods[] = { + /* Device interface */ + DEVMETHOD(device_probe, nvmf_probe), + DEVMETHOD(device_attach, nvmf_attach), + DEVMETHOD(device_detach, nvmf_detach), +#if 0 + DEVMETHOD(device_shutdown, nvmf_shutdown), +#endif + DEVMETHOD_END +}; + +driver_t nvme_nvmf_driver = { + "nvme", + nvmf_methods, + sizeof(struct nvmf_softc), +}; + +DRIVER_MODULE(nvme, root, nvme_nvmf_driver, nvmf_modevent, NULL); +MODULE_DEPEND(nvmf, nvmf_transport, 1, 1, 1); diff --git a/sys/dev/nvmf/host/nvmf_aer.c b/sys/dev/nvmf/host/nvmf_aer.c new file mode 100644 --- /dev/null +++ b/sys/dev/nvmf/host/nvmf_aer.c @@ -0,0 +1,290 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2024 Chelsio Communications, Inc. + * Written by: John Baldwin + */ + +#include +#include +#include +#include +#include +#include +#include + +struct nvmf_aer { + struct nvmf_softc *sc; + uint8_t log_page_id; + uint8_t info; + uint8_t type; + + u_int page_len; + void *page; + + int error; + uint16_t status; + int pending; + struct mtx *lock; + struct task complete_task; + struct task finish_page_task; +}; + +#define MAX_LOG_PAGE_SIZE 4096 + +static void nvmf_complete_aer(void *arg, const struct nvme_completion *cqe); + +static void +nvmf_submit_aer(struct nvmf_softc *sc, struct nvmf_aer *aer) +{ + struct nvmf_request *req; + struct nvme_command cmd; + + memset(&cmd, 0, sizeof(cmd)); + cmd.opc = NVME_OPC_ASYNC_EVENT_REQUEST; + + req = nvmf_allocate_request(sc->admin, &cmd, nvmf_complete_aer, aer, + M_WAITOK); + if (req == NULL) + return; + req->aer = true; + nvmf_submit_request(req); +} + +static void +nvmf_handle_changed_namespaces(struct nvmf_softc *sc, + struct nvme_ns_list *ns_list) +{ + uint32_t nsid; + + /* + * If more than 1024 namespaces have changed, we should + * probably just rescan the entire set of namespaces. + */ + if (ns_list->ns[0] == 0xffffffff) { + device_printf(sc->dev, "too many changed namespaces\n"); + return; + } + + for (u_int i = 0; i < nitems(ns_list->ns); i++) { + if (ns_list->ns[i] == 0) + break; + + nsid = le32toh(ns_list->ns[i]); + nvmf_rescan_ns(sc, nsid); + } +} + +static void +nvmf_finish_aer_page(struct nvmf_softc *sc, struct nvmf_aer *aer) +{ + /* If an error occurred fetching the page, just bail. */ + if (aer->error != 0 || aer->status != 0) + return; + + taskqueue_enqueue(taskqueue_thread, &aer->finish_page_task); +} + +static void +nvmf_finish_aer_page_task(void *arg, int pending) +{ + struct nvmf_aer *aer = arg; + struct nvmf_softc *sc = aer->sc; + + switch (aer->log_page_id) { + case NVME_LOG_ERROR: + /* TODO: Should we log these? */ + break; + case NVME_LOG_CHANGED_NAMESPACE: + nvmf_handle_changed_namespaces(sc, aer->page); + break; + } + + /* Resubmit this AER command. 
*/ + nvmf_submit_aer(sc, aer); +} + +static void +nvmf_io_complete_aer_page(void *arg, size_t xfered, int error) +{ + struct nvmf_aer *aer = arg; + struct nvmf_softc *sc = aer->sc; + + mtx_lock(aer->lock); + aer->error = error; + aer->pending--; + if (aer->pending == 0) { + mtx_unlock(aer->lock); + nvmf_finish_aer_page(sc, aer); + } else + mtx_unlock(aer->lock); +} + +static void +nvmf_complete_aer_page(void *arg, const struct nvme_completion *cqe) +{ + struct nvmf_aer *aer = arg; + struct nvmf_softc *sc = aer->sc; + + mtx_lock(aer->lock); + aer->status = cqe->status; + aer->pending--; + if (aer->pending == 0) { + mtx_unlock(aer->lock); + nvmf_finish_aer_page(sc, aer); + } else + mtx_unlock(aer->lock); +} + +static u_int +nvmf_log_page_size(struct nvmf_softc *sc, uint8_t log_page_id) +{ + switch (log_page_id) { + case NVME_LOG_ERROR: + return ((sc->cdata->elpe + 1) * + sizeof(struct nvme_error_information_entry)); + case NVME_LOG_CHANGED_NAMESPACE: + return (sizeof(struct nvme_ns_list)); + default: + return (0); + } +} + +static void +nvmf_complete_aer(void *arg, const struct nvme_completion *cqe) +{ + struct nvmf_aer *aer = arg; + struct nvmf_softc *sc = aer->sc; + uint32_t cdw0; + + /* + * The only error defined for AER is an abort due to + * submitting too many AER commands. Just discard this AER + * without resubmitting if we get an error. + * + * NB: Pending AER commands are aborted during controller + * shutdown, so discard aborted commands silently. + */ + if (cqe->status != 0) { + if (!nvmf_cqe_aborted(cqe)) + device_printf(sc->dev, "Ignoring error %#x for AER\n", + le16toh(cqe->status)); + return; + } + + cdw0 = le32toh(cqe->cdw0); + aer->log_page_id = NVMEV(NVME_ASYNC_EVENT_LOG_PAGE_ID, cdw0); + aer->info = NVMEV(NVME_ASYNC_EVENT_INFO, cdw0); + aer->type = NVMEV(NVME_ASYNC_EVENT_TYPE, cdw0); + + device_printf(sc->dev, "AER type %u, info %#x, page %#x\n", + aer->type, aer->info, aer->log_page_id); + + aer->page_len = nvmf_log_page_size(sc, aer->log_page_id); + taskqueue_enqueue(taskqueue_thread, &aer->complete_task); +} + +static void +nvmf_complete_aer_task(void *arg, int pending) +{ + struct nvmf_aer *aer = arg; + struct nvmf_softc *sc = aer->sc; + + if (aer->page_len != 0) { + /* Read the associated log page. */ + aer->page_len = MIN(aer->page_len, MAX_LOG_PAGE_SIZE); + aer->pending = 2; + (void) nvmf_cmd_get_log_page(sc, NVME_GLOBAL_NAMESPACE_TAG, + aer->log_page_id, 0, aer->page, aer->page_len, + nvmf_complete_aer_page, aer, nvmf_io_complete_aer_page, + aer, M_WAITOK); + } else { + /* Resubmit this AER command. 
*/ + nvmf_submit_aer(sc, aer); + } +} + +static int +nvmf_set_async_event_config(struct nvmf_softc *sc, uint32_t config) +{ + struct nvme_command cmd; + struct nvmf_completion_status status; + struct nvmf_request *req; + + memset(&cmd, 0, sizeof(cmd)); + cmd.opc = NVME_OPC_SET_FEATURES; + cmd.cdw10 = htole32(NVME_FEAT_ASYNC_EVENT_CONFIGURATION); + cmd.cdw11 = htole32(config); + + nvmf_status_init(&status); + req = nvmf_allocate_request(sc->admin, &cmd, nvmf_complete, &status, + M_WAITOK); + if (req == NULL) { + device_printf(sc->dev, + "failed to allocate SET_FEATURES (ASYNC_EVENT_CONFIGURATION) command\n"); + return (ECONNABORTED); + } + nvmf_submit_request(req); + nvmf_wait_for_reply(&status); + + if (status.cqe.status != 0) { + device_printf(sc->dev, + "SET_FEATURES (ASYNC_EVENT_CONFIGURATION) failed, status %#x\n", + le16toh(status.cqe.status)); + return (EIO); + } + + return (0); +} + +void +nvmf_init_aer(struct nvmf_softc *sc) +{ + /* 8 matches NVME_MAX_ASYNC_EVENTS */ + sc->num_aer = min(8, sc->cdata->aerl + 1); + sc->aer = mallocarray(sc->num_aer, sizeof(*sc->aer), M_NVMF, + M_WAITOK | M_ZERO); + for (u_int i = 0; i < sc->num_aer; i++) { + sc->aer[i].sc = sc; + sc->aer[i].page = malloc(MAX_LOG_PAGE_SIZE, M_NVMF, M_WAITOK); + sc->aer[i].lock = mtx_pool_find(mtxpool_sleep, &sc->aer[i]); + TASK_INIT(&sc->aer[i].complete_task, 0, nvmf_complete_aer_task, + &sc->aer[i]); + TASK_INIT(&sc->aer[i].finish_page_task, 0, + nvmf_finish_aer_page_task, &sc->aer[i]); + } +} + +int +nvmf_start_aer(struct nvmf_softc *sc) +{ + uint32_t async_event_config; + int error; + + async_event_config = NVME_CRIT_WARN_ST_AVAILABLE_SPARE | + NVME_CRIT_WARN_ST_DEVICE_RELIABILITY | + NVME_CRIT_WARN_ST_READ_ONLY | + NVME_CRIT_WARN_ST_VOLATILE_MEMORY_BACKUP; + if (sc->cdata->ver >= NVME_REV(1, 2)) + async_event_config |= + sc->cdata->oaes & NVME_ASYNC_EVENT_NS_ATTRIBUTE; + error = nvmf_set_async_event_config(sc, async_event_config); + if (error != 0) + return (error); + + for (u_int i = 0; i < sc->num_aer; i++) + nvmf_submit_aer(sc, &sc->aer[i]); + + return (0); +} + +void +nvmf_destroy_aer(struct nvmf_softc *sc) +{ + for (u_int i = 0; i < sc->num_aer; i++) { + taskqueue_drain(taskqueue_thread, &sc->aer[i].complete_task); + taskqueue_drain(taskqueue_thread, &sc->aer[i].finish_page_task); + free(sc->aer[i].page, M_NVMF); + } + free(sc->aer, M_NVMF); +} diff --git a/sys/dev/nvmf/host/nvmf_cmd.c b/sys/dev/nvmf/host/nvmf_cmd.c new file mode 100644 --- /dev/null +++ b/sys/dev/nvmf/host/nvmf_cmd.c @@ -0,0 +1,171 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2023-2024 Chelsio Communications, Inc. 
+ * Written by: John Baldwin + */ + +#include +#include +#include +#include +#include +#include +#include + +bool +nvmf_cmd_get_property(struct nvmf_softc *sc, uint32_t offset, uint8_t size, + nvmf_request_complete_t *cb, void *cb_arg, int how) +{ + struct nvmf_fabric_prop_get_cmd cmd; + struct nvmf_request *req; + + memset(&cmd, 0, sizeof(cmd)); + cmd.opcode = NVME_OPC_FABRICS_COMMANDS; + cmd.fctype = NVMF_FABRIC_COMMAND_PROPERTY_GET; + switch (size) { + case 4: + cmd.attrib.size = NVMF_PROP_SIZE_4; + break; + case 8: + cmd.attrib.size = NVMF_PROP_SIZE_8; + break; + default: + panic("Invalid property size"); + } + cmd.ofst = htole32(offset); + + req = nvmf_allocate_request(sc->admin, &cmd, cb, cb_arg, how); + if (req != NULL) + nvmf_submit_request(req); + return (req != NULL); +} + +bool +nvmf_cmd_set_property(struct nvmf_softc *sc, uint32_t offset, uint8_t size, + uint64_t value, nvmf_request_complete_t *cb, void *cb_arg, int how) +{ + struct nvmf_fabric_prop_set_cmd cmd; + struct nvmf_request *req; + + memset(&cmd, 0, sizeof(cmd)); + cmd.opcode = NVME_OPC_FABRICS_COMMANDS; + cmd.fctype = NVMF_FABRIC_COMMAND_PROPERTY_SET; + switch (size) { + case 4: + cmd.attrib.size = NVMF_PROP_SIZE_4; + cmd.value.u32.low = htole32(value); + break; + case 8: + cmd.attrib.size = NVMF_PROP_SIZE_8; + cmd.value.u64 = htole64(value); + break; + default: + panic("Invalid property size"); + } + cmd.ofst = htole32(offset); + + req = nvmf_allocate_request(sc->admin, &cmd, cb, cb_arg, how); + if (req != NULL) + nvmf_submit_request(req); + return (req != NULL); +} + +bool +nvmf_cmd_keep_alive(struct nvmf_softc *sc, nvmf_request_complete_t *cb, + void *cb_arg, int how) +{ + struct nvme_command cmd; + struct nvmf_request *req; + + memset(&cmd, 0, sizeof(cmd)); + cmd.opc = NVME_OPC_KEEP_ALIVE; + + req = nvmf_allocate_request(sc->admin, &cmd, cb, cb_arg, how); + if (req != NULL) + nvmf_submit_request(req); + return (req != NULL); +} + +bool +nvmf_cmd_identify_active_namespaces(struct nvmf_softc *sc, uint32_t id, + struct nvme_ns_list *nslist, nvmf_request_complete_t *req_cb, + void *req_cb_arg, nvmf_io_complete_t *io_cb, void *io_cb_arg, int how) +{ + struct nvme_command cmd; + struct memdesc mem; + struct nvmf_request *req; + + memset(&cmd, 0, sizeof(cmd)); + cmd.opc = NVME_OPC_IDENTIFY; + + /* 5.15.1 Use CNS of 0x02 for namespace data. */ + cmd.cdw10 = htole32(2); + cmd.nsid = htole32(id); + + req = nvmf_allocate_request(sc->admin, &cmd, req_cb, req_cb_arg, how); + if (req == NULL) + return (false); + mem = memdesc_vaddr(nslist, sizeof(*nslist)); + nvmf_capsule_append_data(req->nc, &mem, sizeof(*nslist), false, + io_cb, io_cb_arg); + nvmf_submit_request(req); + return (true); +} + +bool +nvmf_cmd_identify_namespace(struct nvmf_softc *sc, uint32_t id, + struct nvme_namespace_data *nsdata, nvmf_request_complete_t *req_cb, + void *req_cb_arg, nvmf_io_complete_t *io_cb, void *io_cb_arg, int how) +{ + struct nvme_command cmd; + struct memdesc mem; + struct nvmf_request *req; + + memset(&cmd, 0, sizeof(cmd)); + cmd.opc = NVME_OPC_IDENTIFY; + + /* 5.15.1 Use CNS of 0x00 for namespace data. 
*/ + cmd.cdw10 = htole32(0); + cmd.nsid = htole32(id); + + req = nvmf_allocate_request(sc->admin, &cmd, req_cb, req_cb_arg, how); + if (req == NULL) + return (false); + mem = memdesc_vaddr(nsdata, sizeof(*nsdata)); + nvmf_capsule_append_data(req->nc, &mem, sizeof(*nsdata), false, + io_cb, io_cb_arg); + nvmf_submit_request(req); + return (true); +} + +bool +nvmf_cmd_get_log_page(struct nvmf_softc *sc, uint32_t nsid, uint8_t lid, + uint64_t offset, void *buf, size_t len, nvmf_request_complete_t *req_cb, + void *req_cb_arg, nvmf_io_complete_t *io_cb, void *io_cb_arg, int how) +{ + struct nvme_command cmd; + struct memdesc mem; + struct nvmf_request *req; + size_t numd; + + MPASS(len != 0 && len % 4 == 0); + MPASS(offset % 4 == 0); + + numd = (len / 4) - 1; + memset(&cmd, 0, sizeof(cmd)); + cmd.opc = NVME_OPC_GET_LOG_PAGE; + cmd.nsid = htole32(nsid); + cmd.cdw10 = htole32(numd << 16 | lid); + cmd.cdw11 = htole32(numd >> 16); + cmd.cdw12 = htole32(offset); + cmd.cdw13 = htole32(offset >> 32); + + req = nvmf_allocate_request(sc->admin, &cmd, req_cb, req_cb_arg, how); + if (req == NULL) + return (false); + mem = memdesc_vaddr(buf, len); + nvmf_capsule_append_data(req->nc, &mem, len, false, io_cb, io_cb_arg); + nvmf_submit_request(req); + return (true); +} diff --git a/sys/dev/nvmf/host/nvmf_ctldev.c b/sys/dev/nvmf/host/nvmf_ctldev.c new file mode 100644 --- /dev/null +++ b/sys/dev/nvmf/host/nvmf_ctldev.c @@ -0,0 +1,159 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2023 Chelsio Communications, Inc. + * Written by: John Baldwin + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +static struct cdev *nvmf_cdev; + +static int +nvmf_handoff_host(struct nvmf_handoff_host *hh) +{ + struct nvmf_ivars ivars; + device_t dev; + int error; + + error = nvmf_init_ivars(&ivars, hh); + if (error != 0) + return (error); + + bus_topo_lock(); + dev = device_add_child(root_bus, "nvme", -1); + if (dev == NULL) { + bus_topo_unlock(); + error = ENXIO; + goto out; + } + + device_set_ivars(dev, &ivars); + error = device_probe_and_attach(dev); + device_set_ivars(dev, NULL); + if (error != 0) + device_delete_child(root_bus, dev); + bus_topo_unlock(); + +out: + nvmf_free_ivars(&ivars); + return (error); +} + +static bool +nvmf_matches(device_t dev, char *name) +{ + struct nvmf_softc *sc = device_get_softc(dev); + + if (strcmp(device_get_nameunit(dev), name) == 0) + return (true); + if (strcmp(sc->cdata->subnqn, name) == 0) + return (true); + return (false); +} + +static int +nvmf_disconnect_by_name(char *name) +{ + devclass_t dc; + device_t dev; + int error, unit; + bool found; + + found = false; + error = 0; + bus_topo_lock(); + dc = devclass_find("nvme"); + if (dc == NULL) + goto out; + + for (unit = 0; unit < devclass_get_maxunit(dc); unit++) { + dev = devclass_get_device(dc, unit); + if (dev == NULL) + continue; + if (device_get_driver(dev) != &nvme_nvmf_driver) + continue; + if (device_get_parent(dev) != root_bus) + continue; + if (name != NULL && !nvmf_matches(dev, name)) + continue; + + error = device_delete_child(root_bus, dev); + if (error != 0) + break; + found = true; + } +out: + bus_topo_unlock(); + if (error == 0 && !found) + error = ENOENT; + return (error); +} + +static int +nvmf_disconnect_host(const char **namep) +{ + char *name; + int error; + + name = malloc(PATH_MAX, M_NVMF, M_WAITOK); + error = copyinstr(*namep, name, PATH_MAX, NULL); + if (error == 0) + error = nvmf_disconnect_by_name(name); + free(name, M_NVMF); + return (error); +} + 
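nvmf_disconnect_by_name() above matches an association either by device nameunit (for example nvme0) or by subsystem NQN, and nvmf_ctl_ioctl() below exposes it through the /dev/nvmf control node via NVMF_DISCONNECT_HOST and NVMF_DISCONNECT_ALL, much as an nvmecontrol(8) disconnect would use it. A hedged userland sketch follows; it is illustrative only (not part of this patch) and assumes the ioctl definitions live in <dev/nvmf/nvmf.h>, a header added elsewhere in the patch and not shown in this hunk.

/*
 * Illustrative sketch: ask the nvmf(4) control device to tear down the
 * association matching a name (device nameunit or subsystem NQN).
 * NVMF_DISCONNECT_HOST is assumed to be declared in <dev/nvmf/nvmf.h>;
 * its payload is a pointer to a NUL-terminated string, matching the
 * copyinstr() in nvmf_disconnect_host() above.
 */
#include <sys/ioctl.h>

#include <dev/nvmf/nvmf.h>	/* assumed location of the ioctl definitions */

#include <err.h>
#include <fcntl.h>
#include <unistd.h>

int
main(int argc, char **argv)
{
	const char *name;
	int fd;

	if (argc != 2)
		errx(1, "usage: disconnect <nvmeX | subnqn>");
	name = argv[1];

	fd = open("/dev/nvmf", O_RDWR);
	if (fd < 0)
		err(1, "open(/dev/nvmf)");
	if (ioctl(fd, NVMF_DISCONNECT_HOST, &name) < 0)
		err(1, "NVMF_DISCONNECT_HOST");
	close(fd);
	return (0);
}

Using NVMF_DISCONNECT_ALL instead takes no name at all; nvmf_ctl_ioctl() below maps it to nvmf_disconnect_by_name(NULL), which detaches every Fabrics controller.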
+static int +nvmf_ctl_ioctl(struct cdev *dev, u_long cmd, caddr_t arg, int flag, + struct thread *td) +{ + switch (cmd) { + case NVMF_HANDOFF_HOST: + return (nvmf_handoff_host((struct nvmf_handoff_host *)arg)); + case NVMF_DISCONNECT_HOST: + return (nvmf_disconnect_host((const char **)arg)); + case NVMF_DISCONNECT_ALL: + return (nvmf_disconnect_by_name(NULL)); + default: + return (ENOTTY); + } +} + +static struct cdevsw nvmf_ctl_cdevsw = { + .d_version = D_VERSION, + .d_ioctl = nvmf_ctl_ioctl +}; + +int +nvmf_ctl_load(void) +{ + struct make_dev_args mda; + int error; + + make_dev_args_init(&mda); + mda.mda_devsw = &nvmf_ctl_cdevsw; + mda.mda_uid = UID_ROOT; + mda.mda_gid = GID_WHEEL; + mda.mda_mode = 0600; + error = make_dev_s(&mda, &nvmf_cdev, "nvmf"); + if (error != 0) + nvmf_cdev = NULL; + return (error); +} + +void +nvmf_ctl_unload(void) +{ + if (nvmf_cdev != NULL) { + destroy_dev(nvmf_cdev); + nvmf_cdev = NULL; + } +} diff --git a/sys/dev/nvmf/host/nvmf_ns.c b/sys/dev/nvmf/host/nvmf_ns.c new file mode 100644 --- /dev/null +++ b/sys/dev/nvmf/host/nvmf_ns.c @@ -0,0 +1,483 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2023-2024 Chelsio Communications, Inc. + * Written by: John Baldwin + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct nvmf_namespace { + struct nvmf_softc *sc; + uint64_t size; + uint32_t id; + u_int flags; + uint32_t lba_size; + bool disconnected; + + TAILQ_HEAD(, bio) pending_bios; + struct mtx lock; + volatile u_int active_bios; + + struct cdev *cdev; +}; + +static void nvmf_ns_strategy(struct bio *bio); + +static void +ns_printf(struct nvmf_namespace *ns, const char *fmt, ...) +{ + char buf[128]; + struct sbuf sb; + va_list ap; + + sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN); + sbuf_set_drain(&sb, sbuf_printf_drain, NULL); + + sbuf_printf(&sb, "%sns%u: ", device_get_nameunit(ns->sc->dev), + ns->id); + + va_start(ap, fmt); + sbuf_vprintf(&sb, fmt, ap); + va_end(ap); + + sbuf_finish(&sb); + sbuf_delete(&sb); +} + +/* + * The I/O completion may trigger after the received CQE if the I/O + * used a zero-copy mbuf that isn't harvested until after the NIC + * driver processes TX completions. Abuse bio_driver1 as a refcount. + * Store I/O errors in bio_driver2. + */ +static __inline u_int * +bio_refs(struct bio *bio) +{ + return ((u_int *)&bio->bio_driver1); +} + +static void +nvmf_ns_biodone(struct bio *bio) +{ + struct nvmf_namespace *ns; + int error; + + if (!refcount_release(bio_refs(bio))) + return; + + ns = bio->bio_dev->si_drv1; + + /* If a request is aborted, resubmit or queue it for resubmission. */ + if (bio->bio_error == ECONNABORTED) { + bio->bio_error = 0; + bio->bio_driver2 = 0; + mtx_lock(&ns->lock); + if (ns->disconnected) { + TAILQ_INSERT_TAIL(&ns->pending_bios, bio, bio_queue); + mtx_unlock(&ns->lock); + } else { + mtx_unlock(&ns->lock); + nvmf_ns_strategy(bio); + } + } else { + /* + * I/O errors take precedence over generic EIO from + * CQE errors. 
+ */ + error = (intptr_t)bio->bio_driver2; + if (error != 0) + bio->bio_error = error; + if (bio->bio_error != 0) + bio->bio_flags |= BIO_ERROR; + biodone(bio); + } + + if (refcount_release(&ns->active_bios)) + wakeup(ns); +} + +static void +nvmf_ns_io_complete(void *arg, size_t xfered, int error) +{ + struct bio *bio = arg; + + KASSERT(xfered <= bio->bio_bcount, + ("%s: xfered > bio_bcount", __func__)); + + bio->bio_driver2 = (void *)(intptr_t)error; + bio->bio_resid = bio->bio_bcount - xfered; + + nvmf_ns_biodone(bio); +} + +static void +nvmf_ns_delete_complete(void *arg, size_t xfered, int error) +{ + struct bio *bio = arg; + + if (error != 0) + bio->bio_resid = bio->bio_bcount; + else + bio->bio_resid = 0; + + free(bio->bio_driver2, M_NVMF); + bio->bio_driver2 = (void *)(intptr_t)error; + + nvmf_ns_biodone(bio); +} + +static void +nvmf_ns_bio_complete(void *arg, const struct nvme_completion *cqe) +{ + struct bio *bio = arg; + + if (nvmf_cqe_aborted(cqe)) + bio->bio_error = ECONNABORTED; + else if (cqe->status != 0) + bio->bio_error = EIO; + + nvmf_ns_biodone(bio); +} + +static int +nvmf_ns_submit_bio(struct nvmf_namespace *ns, struct bio *bio) +{ + struct nvme_command cmd; + struct nvmf_request *req; + struct nvme_dsm_range *dsm_range; + struct memdesc mem; + uint64_t lba, lba_count; + + dsm_range = NULL; + memset(&cmd, 0, sizeof(cmd)); + switch (bio->bio_cmd) { + case BIO_READ: + lba = bio->bio_offset / ns->lba_size; + lba_count = bio->bio_bcount / ns->lba_size; + nvme_ns_read_cmd(&cmd, ns->id, lba, lba_count); + break; + case BIO_WRITE: + lba = bio->bio_offset / ns->lba_size; + lba_count = bio->bio_bcount / ns->lba_size; + nvme_ns_write_cmd(&cmd, ns->id, lba, lba_count); + break; + case BIO_FLUSH: + nvme_ns_flush_cmd(&cmd, ns->id); + break; + case BIO_DELETE: + dsm_range = malloc(sizeof(*dsm_range), M_NVMF, M_NOWAIT | + M_ZERO); + if (dsm_range == NULL) + return (ENOMEM); + lba = bio->bio_offset / ns->lba_size; + lba_count = bio->bio_bcount / ns->lba_size; + dsm_range->starting_lba = htole64(lba); + dsm_range->length = htole32(lba_count); + + cmd.opc = NVME_OPC_DATASET_MANAGEMENT; + cmd.nsid = htole32(ns->id); + cmd.cdw10 = htole32(0); /* 1 range */ + cmd.cdw11 = htole32(NVME_DSM_ATTR_DEALLOCATE); + break; + default: + return (EOPNOTSUPP); + } + + mtx_lock(&ns->lock); + if (ns->disconnected) { + TAILQ_INSERT_TAIL(&ns->pending_bios, bio, bio_queue); + mtx_unlock(&ns->lock); + free(dsm_range, M_NVMF); + return (0); + } + + req = nvmf_allocate_request(nvmf_select_io_queue(ns->sc), &cmd, + nvmf_ns_bio_complete, bio, M_NOWAIT); + if (req == NULL) { + mtx_unlock(&ns->lock); + free(dsm_range, M_NVMF); + return (ENOMEM); + } + + switch (bio->bio_cmd) { + case BIO_READ: + case BIO_WRITE: + refcount_init(bio_refs(bio), 2); + mem = memdesc_bio(bio); + nvmf_capsule_append_data(req->nc, &mem, bio->bio_bcount, + bio->bio_cmd == BIO_WRITE, nvmf_ns_io_complete, bio); + break; + case BIO_DELETE: + refcount_init(bio_refs(bio), 2); + mem = memdesc_vaddr(dsm_range, sizeof(*dsm_range)); + nvmf_capsule_append_data(req->nc, &mem, sizeof(*dsm_range), + true, nvmf_ns_delete_complete, bio); + bio->bio_driver2 = dsm_range; + break; + default: + refcount_init(bio_refs(bio), 1); + KASSERT(bio->bio_resid == 0, + ("%s: input bio_resid != 0", __func__)); + break; + } + + refcount_acquire(&ns->active_bios); + nvmf_submit_request(req); + mtx_unlock(&ns->lock); + return (0); +} + +static int +nvmf_ns_ioctl(struct cdev *dev, u_long cmd, caddr_t arg, int flag, + struct thread *td) +{ + struct nvmf_namespace *ns = 
dev->si_drv1; + struct nvme_get_nsid *gnsid; + struct nvme_pt_command *pt; + + switch (cmd) { + case NVME_PASSTHROUGH_CMD: + pt = (struct nvme_pt_command *)arg; + pt->cmd.nsid = htole32(ns->id); + return (nvmf_passthrough_cmd(ns->sc, pt, false)); + case NVME_GET_NSID: + gnsid = (struct nvme_get_nsid *)arg; + strncpy(gnsid->cdev, device_get_nameunit(ns->sc->dev), + sizeof(gnsid->cdev)); + gnsid->cdev[sizeof(gnsid->cdev) - 1] = '\0'; + gnsid->nsid = ns->id; + return (0); + case DIOCGMEDIASIZE: + *(off_t *)arg = ns->size; + return (0); + case DIOCGSECTORSIZE: + *(u_int *)arg = ns->lba_size; + return (0); + default: + return (ENOTTY); + } +} + +static int +nvmf_ns_open(struct cdev *dev, int oflags, int devtype, struct thread *td) +{ + int error; + + error = 0; + if ((oflags & FWRITE) != 0) + error = securelevel_gt(td->td_ucred, 0); + return (error); +} + +void +nvmf_ns_strategy(struct bio *bio) +{ + struct nvmf_namespace *ns; + int error; + + ns = bio->bio_dev->si_drv1; + + error = nvmf_ns_submit_bio(ns, bio); + if (error != 0) { + bio->bio_error = error; + bio->bio_flags |= BIO_ERROR; + bio->bio_resid = bio->bio_bcount; + biodone(bio); + } +} + +static struct cdevsw nvmf_ns_cdevsw = { + .d_version = D_VERSION, + .d_flags = D_DISK, + .d_open = nvmf_ns_open, + .d_read = physread, + .d_write = physwrite, + .d_strategy = nvmf_ns_strategy, + .d_ioctl = nvmf_ns_ioctl +}; + +struct nvmf_namespace * +nvmf_init_ns(struct nvmf_softc *sc, uint32_t id, + struct nvme_namespace_data *data) +{ + struct make_dev_args mda; + struct nvmf_namespace *ns; + int error; + uint8_t lbads, lbaf; + + ns = malloc(sizeof(*ns), M_NVMF, M_WAITOK | M_ZERO); + ns->sc = sc; + ns->id = id; + TAILQ_INIT(&ns->pending_bios); + mtx_init(&ns->lock, "nvmf ns", NULL, MTX_DEF); + + /* One dummy bio avoids dropping to 0 until destroy. */ + refcount_init(&ns->active_bios, 1); + + if (NVMEV(NVME_NS_DATA_DPS_PIT, data->dps) != 0) { + ns_printf(ns, "End-to-end data protection not supported\n"); + goto fail; + } + + lbaf = NVMEV(NVME_NS_DATA_FLBAS_FORMAT, data->flbas); + if (lbaf > data->nlbaf) { + ns_printf(ns, "Invalid LBA format index\n"); + goto fail; + } + + if (NVMEV(NVME_NS_DATA_LBAF_MS, data->lbaf[lbaf]) != 0) { + ns_printf(ns, "Namespaces with metadata are not supported\n"); + goto fail; + } + + lbads = NVMEV(NVME_NS_DATA_LBAF_LBADS, data->lbaf[lbaf]); + if (lbads == 0) { + ns_printf(ns, "Invalid LBA format index\n"); + goto fail; + } + + ns->lba_size = 1 << lbads; + ns->size = data->nsze * ns->lba_size; + + if (nvme_ctrlr_has_dataset_mgmt(sc->cdata)) + ns->flags |= NVME_NS_DEALLOCATE_SUPPORTED; + + if (NVMEV(NVME_CTRLR_DATA_VWC_PRESENT, sc->cdata->vwc) != 0) + ns->flags |= NVME_NS_FLUSH_SUPPORTED; + + /* + * XXX: Does any of the boundary splitting for NOIOB make any + * sense for Fabrics? 
+ */ + + make_dev_args_init(&mda); + mda.mda_devsw = &nvmf_ns_cdevsw; + mda.mda_uid = UID_ROOT; + mda.mda_gid = GID_WHEEL; + mda.mda_mode = 0600; + mda.mda_si_drv1 = ns; + error = make_dev_s(&mda, &ns->cdev, "%sns%u", + device_get_nameunit(sc->dev), id); + if (error != 0) + goto fail; + + ns->cdev->si_flags |= SI_UNMAPPED; + + return (ns); +fail: + mtx_destroy(&ns->lock); + free(ns, M_NVMF); + return (NULL); +} + +void +nvmf_disconnect_ns(struct nvmf_namespace *ns) +{ + mtx_lock(&ns->lock); + ns->disconnected = true; + mtx_unlock(&ns->lock); +} + +void +nvmf_reconnect_ns(struct nvmf_namespace *ns) +{ + TAILQ_HEAD(, bio) bios; + struct bio *bio; + + mtx_lock(&ns->lock); + ns->disconnected = false; + TAILQ_INIT(&bios); + TAILQ_CONCAT(&bios, &ns->pending_bios, bio_queue); + mtx_unlock(&ns->lock); + + while (!TAILQ_EMPTY(&bios)) { + bio = TAILQ_FIRST(&bios); + TAILQ_REMOVE(&bios, bio, bio_queue); + nvmf_ns_strategy(bio); + } +} + +void +nvmf_destroy_ns(struct nvmf_namespace *ns) +{ + TAILQ_HEAD(, bio) bios; + struct bio *bio; + + destroy_dev(ns->cdev); + + /* + * Wait for active I/O requests to drain. The release drops + * the reference on the "dummy bio" when the namespace is + * created. + */ + mtx_lock(&ns->lock); + if (!refcount_release(&ns->active_bios)) { + while (ns->active_bios != 0) + mtx_sleep(ns, &ns->lock, 0, "nvmfrmns", 0); + } + + /* Abort any pending I/O requests. */ + TAILQ_INIT(&bios); + TAILQ_CONCAT(&bios, &ns->pending_bios, bio_queue); + mtx_unlock(&ns->lock); + + while (!TAILQ_EMPTY(&bios)) { + bio = TAILQ_FIRST(&bios); + TAILQ_REMOVE(&bios, bio, bio_queue); + bio->bio_error = ECONNABORTED; + bio->bio_flags |= BIO_ERROR; + bio->bio_resid = bio->bio_bcount; + biodone(bio); + } + + mtx_destroy(&ns->lock); + free(ns, M_NVMF); +} + +bool +nvmf_update_ns(struct nvmf_namespace *ns, struct nvme_namespace_data *data) +{ + uint8_t lbads, lbaf; + + if (NVMEV(NVME_NS_DATA_DPS_PIT, data->dps) != 0) { + ns_printf(ns, "End-to-end data protection not supported\n"); + return (false); + } + + lbaf = NVMEV(NVME_NS_DATA_FLBAS_FORMAT, data->flbas); + if (lbaf > data->nlbaf) { + ns_printf(ns, "Invalid LBA format index\n"); + return (false); + } + + if (NVMEV(NVME_NS_DATA_LBAF_MS, data->lbaf[lbaf]) != 0) { + ns_printf(ns, "Namespaces with metadata are not supported\n"); + return (false); + } + + lbads = NVMEV(NVME_NS_DATA_LBAF_LBADS, data->lbaf[lbaf]); + if (lbads == 0) { + ns_printf(ns, "Invalid LBA format index\n"); + return (false); + } + + ns->lba_size = 1 << lbads; + ns->size = data->nsze * ns->lba_size; + return (true); +} diff --git a/sys/dev/nvmf/host/nvmf_qpair.c b/sys/dev/nvmf/host/nvmf_qpair.c new file mode 100644 --- /dev/null +++ b/sys/dev/nvmf/host/nvmf_qpair.c @@ -0,0 +1,386 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2023-2024 Chelsio Communications, Inc. + * Written by: John Baldwin + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct nvmf_host_command { + struct nvmf_request *req; + TAILQ_ENTRY(nvmf_host_command) link; + uint16_t cid; +}; + +struct nvmf_host_qpair { + struct nvmf_softc *sc; + struct nvmf_qpair *qp; + + bool sq_flow_control; + bool shutting_down; + u_int allocating; + u_int num_commands; + uint16_t sqhd; + uint16_t sqtail; + + struct mtx lock; + + TAILQ_HEAD(, nvmf_host_command) free_commands; + STAILQ_HEAD(, nvmf_request) pending_requests; + + /* Indexed by cid. 
*/ + struct nvmf_host_command **active_commands; + + char name[16]; +}; + +struct nvmf_request * +nvmf_allocate_request(struct nvmf_host_qpair *qp, void *sqe, + nvmf_request_complete_t *cb, void *cb_arg, int how) +{ + struct nvmf_request *req; + struct nvmf_qpair *nq; + + KASSERT(how == M_WAITOK || how == M_NOWAIT, + ("%s: invalid how", __func__)); + + req = malloc(sizeof(*req), M_NVMF, how | M_ZERO); + if (req == NULL) + return (NULL); + + mtx_lock(&qp->lock); + nq = qp->qp; + if (nq == NULL) { + mtx_unlock(&qp->lock); + free(req, M_NVMF); + return (NULL); + } + qp->allocating++; + MPASS(qp->allocating != 0); + mtx_unlock(&qp->lock); + + req->qp = qp; + req->cb = cb; + req->cb_arg = cb_arg; + req->nc = nvmf_allocate_command(nq, sqe, how); + if (req->nc == NULL) { + free(req, M_NVMF); + req = NULL; + } + + mtx_lock(&qp->lock); + qp->allocating--; + if (qp->allocating == 0 && qp->shutting_down) + wakeup(qp); + mtx_unlock(&qp->lock); + + return (req); +} + +static void +nvmf_abort_request(struct nvmf_request *req, uint16_t cid) +{ + struct nvme_completion cqe; + + memset(&cqe, 0, sizeof(cqe)); + cqe.cid = cid; + cqe.status = htole16(NVMEF(NVME_STATUS_SCT, NVME_SCT_PATH_RELATED) | + NVMEF(NVME_STATUS_SC, NVME_SC_COMMAND_ABORTED_BY_HOST)); + req->cb(req->cb_arg, &cqe); +} + +void +nvmf_free_request(struct nvmf_request *req) +{ + if (req->nc != NULL) + nvmf_free_capsule(req->nc); + free(req, M_NVMF); +} + +static void +nvmf_dispatch_command(struct nvmf_host_qpair *qp, struct nvmf_host_command *cmd) +{ + struct nvmf_softc *sc = qp->sc; + struct nvme_command *sqe; + struct nvmf_capsule *nc; + int error; + + nc = cmd->req->nc; + sqe = nvmf_capsule_sqe(nc); + + /* + * NB: Don't bother byte-swapping the cid so that receive + * doesn't have to swap. + */ + sqe->cid = cmd->cid; + + error = nvmf_transmit_capsule(nc); + if (error != 0) { + device_printf(sc->dev, + "failed to transmit capsule: %d, disconnecting\n", error); + nvmf_disconnect(sc); + return; + } + + if (sc->ka_traffic) + atomic_store_int(&sc->ka_active_tx_traffic, 1); +} + +static void +nvmf_qp_error(void *arg, int error) +{ + struct nvmf_host_qpair *qp = arg; + struct nvmf_softc *sc = qp->sc; + + /* Ignore simple close of queue pairs during shutdown. */ + if (!(sc->detaching && error == 0)) + device_printf(sc->dev, "error %d on %s, disconnecting\n", error, + qp->name); + nvmf_disconnect(sc); +} + +static void +nvmf_receive_capsule(void *arg, struct nvmf_capsule *nc) +{ + struct nvmf_host_qpair *qp = arg; + struct nvmf_softc *sc = qp->sc; + struct nvmf_host_command *cmd; + struct nvmf_request *req; + const struct nvme_completion *cqe; + uint16_t cid; + + cqe = nvmf_capsule_cqe(nc); + + if (sc->ka_traffic) + atomic_store_int(&sc->ka_active_rx_traffic, 1); + + /* + * NB: Don't bother byte-swapping the cid as transmit doesn't + * swap either. + */ + cid = cqe->cid; + + if (cid > qp->num_commands) { + device_printf(sc->dev, + "received invalid CID %u, disconnecting\n", cid); + nvmf_disconnect(sc); + nvmf_free_capsule(nc); + return; + } + + /* + * If the queue has been shutdown due to an error, silently + * drop the response. 
+ */ + mtx_lock(&qp->lock); + if (qp->qp == NULL) { + device_printf(sc->dev, + "received completion for CID %u on shutdown %s\n", cid, + qp->name); + mtx_unlock(&qp->lock); + nvmf_free_capsule(nc); + return; + } + + cmd = qp->active_commands[cid]; + if (cmd == NULL) { + mtx_unlock(&qp->lock); + device_printf(sc->dev, + "received completion for inactive CID %u, disconnecting\n", + cid); + nvmf_disconnect(sc); + nvmf_free_capsule(nc); + return; + } + + KASSERT(cmd->cid == cid, ("%s: CID mismatch", __func__)); + req = cmd->req; + cmd->req = NULL; + if (STAILQ_EMPTY(&qp->pending_requests)) { + qp->active_commands[cid] = NULL; + TAILQ_INSERT_TAIL(&qp->free_commands, cmd, link); + mtx_unlock(&qp->lock); + } else { + cmd->req = STAILQ_FIRST(&qp->pending_requests); + STAILQ_REMOVE_HEAD(&qp->pending_requests, link); + mtx_unlock(&qp->lock); + nvmf_dispatch_command(qp, cmd); + } + + req->cb(req->cb_arg, cqe); + nvmf_free_capsule(nc); + nvmf_free_request(req); +} + +struct nvmf_host_qpair * +nvmf_init_qp(struct nvmf_softc *sc, enum nvmf_trtype trtype, + struct nvmf_handoff_qpair_params *handoff, const char *name) +{ + struct nvmf_host_command *cmd, *ncmd; + struct nvmf_host_qpair *qp; + u_int i; + + qp = malloc(sizeof(*qp), M_NVMF, M_WAITOK | M_ZERO); + qp->sc = sc; + qp->sq_flow_control = handoff->sq_flow_control; + qp->sqhd = handoff->sqhd; + qp->sqtail = handoff->sqtail; + strlcpy(qp->name, name, sizeof(qp->name)); + mtx_init(&qp->lock, "nvmf qp", NULL, MTX_DEF); + + /* + * Allocate a spare command slot for each pending AER command + * on the admin queue. + */ + qp->num_commands = handoff->qsize - 1; + if (handoff->admin) + qp->num_commands += sc->num_aer; + + qp->active_commands = malloc(sizeof(*qp->active_commands) * + qp->num_commands, M_NVMF, M_WAITOK | M_ZERO); + TAILQ_INIT(&qp->free_commands); + for (i = 0; i < qp->num_commands; i++) { + cmd = malloc(sizeof(*cmd), M_NVMF, M_WAITOK | M_ZERO); + cmd->cid = i; + TAILQ_INSERT_TAIL(&qp->free_commands, cmd, link); + } + STAILQ_INIT(&qp->pending_requests); + + qp->qp = nvmf_allocate_qpair(trtype, false, handoff, nvmf_qp_error, + qp, nvmf_receive_capsule, qp); + if (qp->qp == NULL) { + TAILQ_FOREACH_SAFE(cmd, &qp->free_commands, link, ncmd) { + TAILQ_REMOVE(&qp->free_commands, cmd, link); + free(cmd, M_NVMF); + } + free(qp->active_commands, M_NVMF); + mtx_destroy(&qp->lock); + free(qp, M_NVMF); + return (NULL); + } + + return (qp); +} + +void +nvmf_shutdown_qp(struct nvmf_host_qpair *qp) +{ + struct nvmf_host_command *cmd; + struct nvmf_request *req; + struct nvmf_qpair *nq; + + mtx_lock(&qp->lock); + nq = qp->qp; + qp->qp = NULL; + + if (nq == NULL) { + while (qp->shutting_down) + mtx_sleep(qp, &qp->lock, 0, "nvmfqpsh", 0); + mtx_unlock(&qp->lock); + return; + } + qp->shutting_down = true; + while (qp->allocating != 0) + mtx_sleep(qp, &qp->lock, 0, "nvmfqpqu", 0); + mtx_unlock(&qp->lock); + + nvmf_free_qpair(nq); + + /* + * Abort outstanding requests. Active requests will have + * their I/O completions invoked and associated capsules freed + * by the transport layer via nvmf_free_qpair. Pending + * requests must have their I/O completion invoked via + * nvmf_abort_capsule_data. + */ + for (u_int i = 0; i < qp->num_commands; i++) { + cmd = qp->active_commands[i]; + if (cmd != NULL) { + if (!cmd->req->aer) + printf("%s: aborted active command %p (CID %u)\n", + __func__, cmd->req, cmd->cid); + + /* This was freed by nvmf_free_qpair. 
*/ + cmd->req->nc = NULL; + nvmf_abort_request(cmd->req, cmd->cid); + nvmf_free_request(cmd->req); + free(cmd, M_NVMF); + } + } + while (!STAILQ_EMPTY(&qp->pending_requests)) { + req = STAILQ_FIRST(&qp->pending_requests); + STAILQ_REMOVE_HEAD(&qp->pending_requests, link); + if (!req->aer) + printf("%s: aborted pending command %p\n", __func__, + req); + nvmf_abort_capsule_data(req->nc, ECONNABORTED); + nvmf_abort_request(req, 0); + nvmf_free_request(req); + } + + mtx_lock(&qp->lock); + qp->shutting_down = false; + mtx_unlock(&qp->lock); + wakeup(qp); +} + +void +nvmf_destroy_qp(struct nvmf_host_qpair *qp) +{ + struct nvmf_host_command *cmd, *ncmd; + + nvmf_shutdown_qp(qp); + + TAILQ_FOREACH_SAFE(cmd, &qp->free_commands, link, ncmd) { + TAILQ_REMOVE(&qp->free_commands, cmd, link); + free(cmd, M_NVMF); + } + free(qp->active_commands, M_NVMF); + mtx_destroy(&qp->lock); + free(qp, M_NVMF); +} + +void +nvmf_submit_request(struct nvmf_request *req) +{ + struct nvmf_host_qpair *qp; + struct nvmf_host_command *cmd; + + qp = req->qp; + mtx_lock(&qp->lock); + if (qp->qp == NULL) { + mtx_unlock(&qp->lock); + printf("%s: aborted pending command %p\n", __func__, req); + nvmf_abort_capsule_data(req->nc, ECONNABORTED); + nvmf_abort_request(req, 0); + nvmf_free_request(req); + return; + } + cmd = TAILQ_FIRST(&qp->free_commands); + if (cmd == NULL) { + /* + * Queue this request. Will be sent after enough + * in-flight requests have completed. + */ + STAILQ_INSERT_TAIL(&qp->pending_requests, req, link); + mtx_unlock(&qp->lock); + return; + } + + TAILQ_REMOVE(&qp->free_commands, cmd, link); + KASSERT(qp->active_commands[cmd->cid] == NULL, + ("%s: CID already busy", __func__)); + qp->active_commands[cmd->cid] = cmd; + cmd->req = req; + mtx_unlock(&qp->lock); + nvmf_dispatch_command(qp, cmd); +} diff --git a/sys/dev/nvmf/host/nvmf_sim.c b/sys/dev/nvmf/host/nvmf_sim.c new file mode 100644 --- /dev/null +++ b/sys/dev/nvmf/host/nvmf_sim.c @@ -0,0 +1,332 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2023-2024 Chelsio Communications, Inc. + * Written by: John Baldwin + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include + +/* + * The I/O completion may trigger after the received CQE if the I/O + * used a zero-copy mbuf that isn't harvested until after the NIC + * driver processes TX completions. Use spriv_field0 to as a refcount. + * + * Store any I/O error returned in spriv_field1. 
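+ * The CCB is completed only after both the command completion and,
+ * when data was transferred, the I/O completion have released their
+ * reference.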
+ */ +static __inline u_int * +ccb_refs(union ccb *ccb) +{ + return ((u_int *)&ccb->ccb_h.spriv_field0); +} + +#define spriv_ioerror spriv_field1 + +static void +nvmf_ccb_done(union ccb *ccb) +{ + if (!refcount_release(ccb_refs(ccb))) + return; + + if (nvmf_cqe_aborted(&ccb->nvmeio.cpl)) { + ccb->ccb_h.status = CAM_REQUEUE_REQ; + xpt_done(ccb); + } else if (ccb->nvmeio.cpl.status != 0) { + ccb->ccb_h.status = CAM_NVME_STATUS_ERROR; + xpt_done(ccb); + } else if (ccb->ccb_h.spriv_ioerror != 0) { + KASSERT(ccb->ccb_h.spriv_ioerror != EJUSTRETURN, + ("%s: zero sized transfer without CQE error", __func__)); + ccb->ccb_h.status = CAM_REQ_CMP_ERR; + xpt_done(ccb); + } else { + ccb->ccb_h.status = CAM_REQ_CMP; + xpt_done_direct(ccb); + } +} + +static void +nvmf_ccb_io_complete(void *arg, size_t xfered, int error) +{ + union ccb *ccb = arg; + + /* + * TODO: Reporting partial completions requires extending + * nvmeio to support resid and updating nda to handle partial + * reads, either by returning partial success (or an error) to + * the caller, or retrying all or part of the request. + */ + ccb->ccb_h.spriv_ioerror = error; + if (error == 0) { + if (xfered == 0) { +#ifdef INVARIANTS + /* + * If the request fails with an error in the CQE + * there will be no data transferred but also no + * I/O error. + */ + ccb->ccb_h.spriv_ioerror = EJUSTRETURN; +#endif + } else + KASSERT(xfered == ccb->nvmeio.dxfer_len, + ("%s: partial CCB completion", __func__)); + } + + nvmf_ccb_done(ccb); +} + +static void +nvmf_ccb_complete(void *arg, const struct nvme_completion *cqe) +{ + union ccb *ccb = arg; + + ccb->nvmeio.cpl = *cqe; + nvmf_ccb_done(ccb); +} + +static void +nvmf_sim_io(struct nvmf_softc *sc, union ccb *ccb) +{ + struct ccb_nvmeio *nvmeio = &ccb->nvmeio; + struct memdesc mem; + struct nvmf_request *req; + struct nvmf_host_qpair *qp; + + mtx_lock(&sc->sim_mtx); + if (sc->sim_disconnected) { + mtx_unlock(&sc->sim_mtx); + nvmeio->ccb_h.status = CAM_REQUEUE_REQ; + xpt_done(ccb); + return; + } + if (nvmeio->ccb_h.func_code == XPT_NVME_IO) + qp = nvmf_select_io_queue(sc); + else + qp = sc->admin; + req = nvmf_allocate_request(qp, &nvmeio->cmd, nvmf_ccb_complete, + ccb, M_NOWAIT); + if (req == NULL) { + mtx_unlock(&sc->sim_mtx); + nvmeio->ccb_h.status = CAM_RESRC_UNAVAIL; + xpt_done(ccb); + return; + } + + if (nvmeio->dxfer_len != 0) { + refcount_init(ccb_refs(ccb), 2); + mem = memdesc_ccb(ccb); + nvmf_capsule_append_data(req->nc, &mem, nvmeio->dxfer_len, + (ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_OUT, + nvmf_ccb_io_complete, ccb); + } else + refcount_init(ccb_refs(ccb), 1); + + /* + * Clear spriv_ioerror as it can hold an earlier error if this + * CCB was aborted and has been retried. 
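+	 * Otherwise nvmf_ccb_done() could mistake the stale value for an
+	 * error from this attempt.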
+ */ + ccb->ccb_h.spriv_ioerror = 0; + KASSERT(ccb->ccb_h.status == CAM_REQ_INPROG, + ("%s: incoming CCB is not in-progress", __func__)); + ccb->ccb_h.status |= CAM_SIM_QUEUED; + nvmf_submit_request(req); + mtx_unlock(&sc->sim_mtx); +} + +static void +nvmf_sim_action(struct cam_sim *sim, union ccb *ccb) +{ + struct nvmf_softc *sc = cam_sim_softc(sim); + + CAM_DEBUG(ccb->ccb_h.path, CAM_DEBUG_TRACE, + ("nvmf_sim_action: func= %#x\n", + ccb->ccb_h.func_code)); + + switch (ccb->ccb_h.func_code) { + case XPT_PATH_INQ: /* Path routing inquiry */ + { + struct ccb_pathinq *cpi = &ccb->cpi; + + cpi->version_num = 1; + cpi->hba_inquiry = 0; + cpi->target_sprt = 0; + cpi->hba_misc = PIM_UNMAPPED | PIM_NOSCAN; + cpi->hba_eng_cnt = 0; + cpi->max_target = 0; + cpi->max_lun = sc->cdata->nn; + cpi->async_flags = 0; + cpi->hpath_id = 0; + cpi->initiator_id = 0; + strlcpy(cpi->sim_vid, "FreeBSD", SIM_IDLEN); + strlcpy(cpi->hba_vid, "NVMeoF", HBA_IDLEN); + strlcpy(cpi->dev_name, cam_sim_name(sim), DEV_IDLEN); + cpi->unit_number = cam_sim_unit(sim); + cpi->bus_id = 0; + + /* XXX: Same as iSCSI. */ + cpi->base_transfer_speed = 150000; + cpi->protocol = PROTO_NVME; + cpi->protocol_version = sc->vs; + cpi->transport = XPORT_NVMF; + cpi->transport_version = sc->vs; + cpi->xport_specific.nvmf.nsid = + xpt_path_lun_id(ccb->ccb_h.path); + cpi->xport_specific.nvmf.trtype = sc->trtype; + strncpy(cpi->xport_specific.nvmf.dev_name, + device_get_nameunit(sc->dev), + sizeof(cpi->xport_specific.nvmf.dev_name)); + cpi->maxio = sc->max_xfer_size; + cpi->hba_vendor = 0; + cpi->hba_device = 0; + cpi->hba_subvendor = 0; + cpi->hba_subdevice = 0; + cpi->ccb_h.status = CAM_REQ_CMP; + break; + } + case XPT_GET_TRAN_SETTINGS: /* Get transport settings */ + { + struct ccb_trans_settings *cts = &ccb->cts; + struct ccb_trans_settings_nvme *nvme; + struct ccb_trans_settings_nvmf *nvmf; + + cts->protocol = PROTO_NVME; + cts->protocol_version = sc->vs; + cts->transport = XPORT_NVMF; + cts->transport_version = sc->vs; + + nvme = &cts->proto_specific.nvme; + nvme->valid = CTS_NVME_VALID_SPEC; + nvme->spec = sc->vs; + + nvmf = &cts->xport_specific.nvmf; + nvmf->valid = CTS_NVMF_VALID_TRTYPE; + nvmf->trtype = sc->trtype; + cts->ccb_h.status = CAM_REQ_CMP; + break; + } + case XPT_SET_TRAN_SETTINGS: /* Set transport settings */ + /* + * No transfer settings can be set, but nvme_xpt sends + * this anyway. 
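+		 * Completing the CCB successfully keeps it a harmless no-op.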
+ */ + ccb->ccb_h.status = CAM_REQ_CMP; + break; + case XPT_NVME_IO: /* Execute the requested I/O */ + case XPT_NVME_ADMIN: /* or Admin operation */ + nvmf_sim_io(sc, ccb); + return; + default: + /* XXX */ + device_printf(sc->dev, "unhandled sim function %#x\n", + ccb->ccb_h.func_code); + ccb->ccb_h.status = CAM_REQ_INVALID; + break; + } + xpt_done(ccb); +} + +int +nvmf_init_sim(struct nvmf_softc *sc) +{ + struct cam_devq *devq; + int max_trans; + + max_trans = sc->max_pending_io * 3 / 4; + devq = cam_simq_alloc(max_trans); + if (devq == NULL) { + device_printf(sc->dev, "Failed to allocate CAM simq\n"); + return (ENOMEM); + } + + mtx_init(&sc->sim_mtx, "nvmf sim", NULL, MTX_DEF); + sc->sim = cam_sim_alloc(nvmf_sim_action, NULL, "nvme", sc, + device_get_unit(sc->dev), NULL, max_trans, max_trans, devq); + if (sc->sim == NULL) { + device_printf(sc->dev, "Failed to allocate CAM sim\n"); + cam_simq_free(devq); + mtx_destroy(&sc->sim_mtx); + return (ENXIO); + } + if (xpt_bus_register(sc->sim, sc->dev, 0) != CAM_SUCCESS) { + device_printf(sc->dev, "Failed to create CAM bus\n"); + cam_sim_free(sc->sim, TRUE); + mtx_destroy(&sc->sim_mtx); + return (ENXIO); + } + if (xpt_create_path(&sc->path, NULL, cam_sim_path(sc->sim), + CAM_TARGET_WILDCARD, CAM_LUN_WILDCARD) != CAM_REQ_CMP) { + device_printf(sc->dev, "Failed to create CAM path\n"); + xpt_bus_deregister(cam_sim_path(sc->sim)); + cam_sim_free(sc->sim, TRUE); + mtx_destroy(&sc->sim_mtx); + return (ENXIO); + } + return (0); +} + +void +nvmf_sim_rescan_ns(struct nvmf_softc *sc, uint32_t id) +{ + union ccb *ccb; + + ccb = xpt_alloc_ccb_nowait(); + if (ccb == NULL) { + device_printf(sc->dev, + "unable to alloc CCB for rescan of namespace %u\n", id); + return; + } + + /* + * As with nvme_sim, map NVMe namespace IDs onto CAM unit + * LUNs. + */ + if (xpt_create_path(&ccb->ccb_h.path, NULL, cam_sim_path(sc->sim), 0, + id) != CAM_REQ_CMP) { + device_printf(sc->dev, + "Unable to create path for rescan of namespace %u\n", id); + xpt_free_ccb(ccb); + return; + } + xpt_rescan(ccb); +} + +void +nvmf_disconnect_sim(struct nvmf_softc *sc) +{ + mtx_lock(&sc->sim_mtx); + sc->sim_disconnected = true; + xpt_freeze_simq(sc->sim, 1); + mtx_unlock(&sc->sim_mtx); +} + +void +nvmf_reconnect_sim(struct nvmf_softc *sc) +{ + mtx_lock(&sc->sim_mtx); + sc->sim_disconnected = false; + mtx_unlock(&sc->sim_mtx); + xpt_release_simq(sc->sim, 1); +} + +void +nvmf_destroy_sim(struct nvmf_softc *sc) +{ + xpt_async(AC_LOST_DEVICE, sc->path, NULL); + if (sc->sim_disconnected) + xpt_release_simq(sc->sim, 1); + xpt_free_path(sc->path); + xpt_bus_deregister(cam_sim_path(sc->sim)); + cam_sim_free(sc->sim, TRUE); + mtx_destroy(&sc->sim_mtx); +} diff --git a/sys/dev/nvmf/host/nvmf_var.h b/sys/dev/nvmf/host/nvmf_var.h new file mode 100644 --- /dev/null +++ b/sys/dev/nvmf/host/nvmf_var.h @@ -0,0 +1,208 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2023-2024 Chelsio Communications, Inc. 
+ * Written by: John Baldwin + */ + +#ifndef __NVMF_VAR_H__ +#define __NVMF_VAR_H__ + +#include +#include +#include +#include +#include +#include +#include +#include + +struct nvmf_aer; +struct nvmf_capsule; +struct nvmf_host_qpair; +struct nvmf_namespace; + +typedef void nvmf_request_complete_t(void *, const struct nvme_completion *); + +struct nvmf_ivars { + struct nvmf_handoff_host *hh; + struct nvmf_handoff_qpair_params *io_params; + struct nvme_controller_data *cdata; +}; + +struct nvmf_softc { + device_t dev; + + struct nvmf_host_qpair *admin; + struct nvmf_host_qpair **io; + u_int num_io_queues; + enum nvmf_trtype trtype; + + struct cam_sim *sim; + struct cam_path *path; + struct mtx sim_mtx; + bool sim_disconnected; + + struct nvmf_namespace **ns; + + struct nvme_controller_data *cdata; + uint64_t cap; + uint32_t vs; + u_int max_pending_io; + u_long max_xfer_size; + + struct cdev *cdev; + + /* + * Keep Alive support depends on two timers. The 'tx' timer + * is responsible for sending KeepAlive commands and runs at + * half the timeout interval. The 'rx' timer is responsible + * for detecting an actual timeout. + * + * For efficient support of TKAS, the host does not reschedule + * these timers every time new commands are scheduled. + * Instead, the host sets the *_traffic flags when commands + * are sent and received. The timeout handlers check and + * clear these flags. This does mean it can take up to twice + * the timeout time to detect an AWOL controller. + */ + bool ka_traffic; /* Using TKAS? */ + + volatile int ka_active_tx_traffic; + struct callout ka_tx_timer; + sbintime_t ka_tx_sbt; + + volatile int ka_active_rx_traffic; + struct callout ka_rx_timer; + sbintime_t ka_rx_sbt; + + struct sx connection_lock; + struct task disconnect_task; + bool detaching; + + u_int num_aer; + struct nvmf_aer *aer; +}; + +struct nvmf_request { + struct nvmf_host_qpair *qp; + struct nvmf_capsule *nc; + nvmf_request_complete_t *cb; + void *cb_arg; + bool aer; + + STAILQ_ENTRY(nvmf_request) link; +}; + +struct nvmf_completion_status { + struct nvme_completion cqe; + bool done; + bool io_done; + int io_error; +}; + +static __inline struct nvmf_host_qpair * +nvmf_select_io_queue(struct nvmf_softc *sc) +{ + /* TODO: Support multiple queues? 
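+	 * All I/O is currently dispatched to the first I/O queue pair.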
*/ + return (sc->io[0]); +} + +static __inline bool +nvmf_cqe_aborted(const struct nvme_completion *cqe) +{ + uint16_t status; + + status = le16toh(cqe->status); + return (NVME_STATUS_GET_SCT(status) == NVME_SCT_PATH_RELATED && + NVME_STATUS_GET_SC(status) == NVME_SC_COMMAND_ABORTED_BY_HOST); +} + +static __inline void +nvmf_status_init(struct nvmf_completion_status *status) +{ + status->done = false; + status->io_done = true; + status->io_error = 0; +} + +static __inline void +nvmf_status_wait_io(struct nvmf_completion_status *status) +{ + status->io_done = false; +} + +#ifdef DRIVER_MODULE +extern driver_t nvme_nvmf_driver; +#endif + +#ifdef MALLOC_DECLARE +MALLOC_DECLARE(M_NVMF); +#endif + +/* nvmf.c */ +void nvmf_complete(void *arg, const struct nvme_completion *cqe); +void nvmf_io_complete(void *arg, size_t xfered, int error); +void nvmf_wait_for_reply(struct nvmf_completion_status *status); +int nvmf_init_ivars(struct nvmf_ivars *ivars, struct nvmf_handoff_host *hh); +void nvmf_free_ivars(struct nvmf_ivars *ivars); +void nvmf_disconnect(struct nvmf_softc *sc); +void nvmf_rescan_ns(struct nvmf_softc *sc, uint32_t nsid); +int nvmf_passthrough_cmd(struct nvmf_softc *sc, struct nvme_pt_command *pt, + bool admin); + +/* nvmf_aer.c */ +void nvmf_init_aer(struct nvmf_softc *sc); +int nvmf_start_aer(struct nvmf_softc *sc); +void nvmf_destroy_aer(struct nvmf_softc *sc); + +/* nvmf_cmd.c */ +bool nvmf_cmd_get_property(struct nvmf_softc *sc, uint32_t offset, + uint8_t size, nvmf_request_complete_t *cb, void *cb_arg, int how); +bool nvmf_cmd_set_property(struct nvmf_softc *sc, uint32_t offset, + uint8_t size, uint64_t value, nvmf_request_complete_t *cb, void *cb_arg, + int how); +bool nvmf_cmd_keep_alive(struct nvmf_softc *sc, nvmf_request_complete_t *cb, + void *cb_arg, int how); +bool nvmf_cmd_identify_active_namespaces(struct nvmf_softc *sc, uint32_t id, + struct nvme_ns_list *nslist, nvmf_request_complete_t *req_cb, + void *req_cb_arg, nvmf_io_complete_t *io_cb, void *io_cb_arg, int how); +bool nvmf_cmd_identify_namespace(struct nvmf_softc *sc, uint32_t id, + struct nvme_namespace_data *nsdata, nvmf_request_complete_t *req_cb, + void *req_cb_arg, nvmf_io_complete_t *io_cb, void *io_cb_arg, int how); +bool nvmf_cmd_get_log_page(struct nvmf_softc *sc, uint32_t nsid, uint8_t lid, + uint64_t offset, void *buf, size_t len, nvmf_request_complete_t *req_cb, + void *req_cb_arg, nvmf_io_complete_t *io_cb, void *io_cb_arg, int how); + +/* nvmf_ctldev.c */ +int nvmf_ctl_load(void); +void nvmf_ctl_unload(void); + +/* nvmf_ns.c */ +struct nvmf_namespace *nvmf_init_ns(struct nvmf_softc *sc, uint32_t id, + struct nvme_namespace_data *data); +void nvmf_disconnect_ns(struct nvmf_namespace *ns); +void nvmf_reconnect_ns(struct nvmf_namespace *ns); +void nvmf_destroy_ns(struct nvmf_namespace *ns); +bool nvmf_update_ns(struct nvmf_namespace *ns, + struct nvme_namespace_data *data); + +/* nvmf_qpair.c */ +struct nvmf_host_qpair *nvmf_init_qp(struct nvmf_softc *sc, + enum nvmf_trtype trtype, struct nvmf_handoff_qpair_params *handoff, + const char *name); +void nvmf_shutdown_qp(struct nvmf_host_qpair *qp); +void nvmf_destroy_qp(struct nvmf_host_qpair *qp); +struct nvmf_request *nvmf_allocate_request(struct nvmf_host_qpair *qp, + void *sqe, nvmf_request_complete_t *cb, void *cb_arg, int how); +void nvmf_submit_request(struct nvmf_request *req); +void nvmf_free_request(struct nvmf_request *req); + +/* nvmf_sim.c */ +int nvmf_init_sim(struct nvmf_softc *sc); +void nvmf_disconnect_sim(struct nvmf_softc *sc); +void 
nvmf_reconnect_sim(struct nvmf_softc *sc); +void nvmf_destroy_sim(struct nvmf_softc *sc); +void nvmf_sim_rescan_ns(struct nvmf_softc *sc, uint32_t id); + +#endif /* !__NVMF_VAR_H__ */
diff --git a/sys/modules/nvmf/Makefile b/sys/modules/nvmf/Makefile --- a/sys/modules/nvmf/Makefile +++ b/sys/modules/nvmf/Makefile @@ -1,4 +1,5 @@ -SUBDIR= nvmf_tcp \ +SUBDIR= nvmf \ + nvmf_tcp \ + nvmf_transport .include <bsd.subdir.mk>
diff --git a/sys/modules/nvmf/nvmf/Makefile b/sys/modules/nvmf/nvmf/Makefile new file mode 100644 --- /dev/null +++ b/sys/modules/nvmf/nvmf/Makefile @@ -0,0 +1,13 @@ +.PATH: ${SRCTOP}/sys/dev/nvmf/host + +KMOD= nvmf + +SRCS= nvmf.c \ + nvmf_aer.c \ + nvmf_cmd.c \ + nvmf_ctldev.c \ + nvmf_ns.c \ + nvmf_qpair.c \ + nvmf_sim.c + +.include <bsd.kmod.mk>
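To illustrate how the request and completion primitives declared in nvmf_var.h fit together, the following is a minimal, hypothetical sketch of a synchronous admin command submission. It is not part of the patch: the helper name is invented, the caller is assumed to have built the SQE, and the usual kernel headers plus nvmf_var.h are assumed to be included.

/*
 * Hypothetical example only (not part of this patch): issue a
 * caller-built admin SQE on sc->admin and sleep until the reply
 * capsule arrives.  Uses only the nvmf_var.h declarations above.
 */
static int
nvmf_example_admin_sync(struct nvmf_softc *sc, struct nvme_command *sqe)
{
	struct nvmf_completion_status status;
	struct nvmf_request *req;

	/* done = false, io_done = true: no data transfer is expected. */
	nvmf_status_init(&status);

	/* nvmf_complete() will record the CQE and wake up the sleeper. */
	req = nvmf_allocate_request(sc->admin, sqe, nvmf_complete, &status,
	    M_WAITOK);
	if (req == NULL)
		return (ECONNABORTED);	/* qpair torn down or allocation failed */

	nvmf_submit_request(req);
	nvmf_wait_for_reply(&status);

	/*
	 * The request itself is freed by the queue pair code once the
	 * completion callback has run; only the status needs checking.
	 */
	if (status.cqe.status != 0)
		return (EIO);
	return (0);
}

If the command carried a data buffer, the caller would additionally call nvmf_status_wait_io() before submitting and pass nvmf_io_complete as the I/O completion callback, so that nvmf_wait_for_reply() also waits for the transfer to finish.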