D44714.diff
diff --git a/share/man/man4/Makefile b/share/man/man4/Makefile
--- a/share/man/man4/Makefile
+++ b/share/man/man4/Makefile
@@ -408,6 +408,7 @@
nvd.4 \
${_nvdimm.4} \
nvme.4 \
+ nvmf.4 \
nvmf_tcp.4 \
${_nvram.4} \
oce.4 \
diff --git a/share/man/man4/nvmf.4 b/share/man/man4/nvmf.4
new file mode 100644
--- /dev/null
+++ b/share/man/man4/nvmf.4
@@ -0,0 +1,87 @@
+.\"
+.\" SPDX-License-Identifier: BSD-2-Clause
+.\"
+.\" Copyright (c) 2024 Chelsio Communications, Inc.
+.\"
+.Dd May 2, 2024
+.Dt NVMF 4
+.Os
+.Sh NAME
+.Nm nvmf
+.Nd "NVM Express over Fabrics host driver"
+.Sh SYNOPSIS
+To compile the driver into the kernel,
+place the following line in the
+kernel configuration file:
+.Bd -ragged -offset indent
+.Cd "device nvmf"
+.Ed
+.Pp
+Alternatively, to load the driver as a
+module at boot time, place the following line in
+.Xr loader.conf 5 :
+.Bd -literal -offset indent
+nvmf_load="YES"
+.Ed
+.Sh DESCRIPTION
+The
+.Nm
+driver provides the kernel component of an NVM Express over Fabrics
+host.
+The NVMeoF host is the client which provides local access to
+namespaces exported by a remote controller.
+.Pp
+Associations between the local host and remote controllers are managed
+using
+.Xr nvmecontrol 8 .
+New associations are created via the
+.Cm connect
+command and destroyed via the
+.Cm disconnect
+command.
+If an association's connection is interrupted,
+the
+.Cm reconnect
+command creates a new association to replace the interrupted one.
+.Pp
+Similar to
+.Xr nvme 4 ,
+.Nm
+creates controller device nodes using the format
+.Pa /dev/nvmeX
+and namespace device nodes using the format
+.Pa /dev/nvmeXnsY .
+.Nm
+also exports remote namespaces via the CAM
+.Xr nda 4
+peripheral driver.
+Unlike
+.Xr nvme 4 ,
+.Nm
+does not support the
+.Xr nvd 4
+disk driver.
+.Pp
+Each association requires a supported transport, such as
+.Xr nvmf_tcp 4
+for TCP/IP.
+.Sh SEE ALSO
+.Xr nda 4 ,
+.Xr nvme 4 ,
+.Xr nvmf_tcp 4 ,
+.Xr nvmft 4 ,
+.Xr nvmecontrol 8
+.Sh HISTORY
+The
+.Nm
+module first appeared in
+.Fx 15.0 .
+.Sh AUTHORS
+The
+.Nm
+driver was developed by
+.An John Baldwin Aq Mt jhb@FreeBSD.org
+under sponsorship from Chelsio Communications, Inc.
+.Sh BUGS
+.Nm
+only supports a single I/O queue pair per association.
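
For context (not part of this change): the association lifecycle described in the
DESCRIPTION above maps onto nvmecontrol(8) roughly as follows; the address, port,
and subsystem NQN are illustrative placeholders only.

    nvmecontrol connect 192.0.2.1:4420 nqn.2014-08.org.example:ctl
    nvmecontrol reconnect nvme0 192.0.2.1:4420
    nvmecontrol disconnect nvme0
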
diff --git a/sys/conf/NOTES b/sys/conf/NOTES
--- a/sys/conf/NOTES
+++ b/sys/conf/NOTES
@@ -1676,12 +1676,14 @@
# NVM Express
#
# nvme: PCI-express NVM Express host controllers
+# nvmf: NVM Express over Fabrics host
# nvmf_tcp: TCP transport for NVM Express over Fabrics
# nda: CAM NVMe disk driver
# nvd: non-CAM NVMe disk driver
-device nvme # base NVMe driver
+device nvme # PCI-express NVMe host driver
options NVME_USE_NVD=1 # Use nvd(4) instead of the CAM nda(4) driver
+device nvmf # NVMeoF host driver
device nvmf_tcp # NVMeoF TCP transport
device nda # NVMe direct access devices (aka disks)
device nvd # expose NVMe namespaces as disks, depends on nvme
diff --git a/sys/conf/files b/sys/conf/files
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -2533,7 +2533,15 @@
dev/nvme/nvme_util.c optional nvme
dev/nvmem/nvmem.c optional nvmem fdt
dev/nvmem/nvmem_if.m optional nvmem
+dev/nvmf/host/nvmf.c optional nvmf
+dev/nvmf/host/nvmf_aer.c optional nvmf
+dev/nvmf/host/nvmf_cmd.c optional nvmf
+dev/nvmf/host/nvmf_ctldev.c optional nvmf
+dev/nvmf/host/nvmf_ns.c optional nvmf
+dev/nvmf/host/nvmf_qpair.c optional nvmf
+dev/nvmf/host/nvmf_sim.c optional nvmf
dev/nvmf/nvmf_tcp.c optional nvmf_tcp
+dev/nvmf/nvmf_transport.c optional nvmf
dev/oce/oce_hw.c optional oce pci
dev/oce/oce_if.c optional oce pci
dev/oce/oce_mbox.c optional oce pci
diff --git a/sys/dev/nvmf/host/nvmf.c b/sys/dev/nvmf/host/nvmf.c
new file mode 100644
--- /dev/null
+++ b/sys/dev/nvmf/host/nvmf.c
@@ -0,0 +1,939 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2023-2024 Chelsio Communications, Inc.
+ * Written by: John Baldwin <jhb@FreeBSD.org>
+ */
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/conf.h>
+#include <sys/lock.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/memdesc.h>
+#include <sys/module.h>
+#include <sys/mutex.h>
+#include <sys/sx.h>
+#include <sys/taskqueue.h>
+#include <dev/nvme/nvme.h>
+#include <dev/nvmf/nvmf.h>
+#include <dev/nvmf/nvmf_transport.h>
+#include <dev/nvmf/host/nvmf_var.h>
+
+static struct cdevsw nvmf_cdevsw;
+
+MALLOC_DEFINE(M_NVMF, "nvmf", "NVMe over Fabrics host");
+
+static void nvmf_disconnect_task(void *arg, int pending);
+
+void
+nvmf_complete(void *arg, const struct nvme_completion *cqe)
+{
+ struct nvmf_completion_status *status = arg;
+ struct mtx *mtx;
+
+ status->cqe = *cqe;
+ mtx = mtx_pool_find(mtxpool_sleep, status);
+ mtx_lock(mtx);
+ status->done = true;
+ mtx_unlock(mtx);
+ wakeup(status);
+}
+
+void
+nvmf_io_complete(void *arg, size_t xfered, int error)
+{
+ struct nvmf_completion_status *status = arg;
+ struct mtx *mtx;
+
+ status->io_error = error;
+ mtx = mtx_pool_find(mtxpool_sleep, status);
+ mtx_lock(mtx);
+ status->io_done = true;
+ mtx_unlock(mtx);
+ wakeup(status);
+}
+
+void
+nvmf_wait_for_reply(struct nvmf_completion_status *status)
+{
+ struct mtx *mtx;
+
+ mtx = mtx_pool_find(mtxpool_sleep, status);
+ mtx_lock(mtx);
+ while (!status->done || !status->io_done)
+ mtx_sleep(status, mtx, 0, "nvmfcmd", 0);
+ mtx_unlock(mtx);
+}
+
+static int
+nvmf_read_property(struct nvmf_softc *sc, uint32_t offset, uint8_t size,
+ uint64_t *value)
+{
+ const struct nvmf_fabric_prop_get_rsp *rsp;
+ struct nvmf_completion_status status;
+
+ nvmf_status_init(&status);
+ if (!nvmf_cmd_get_property(sc, offset, size, nvmf_complete, &status,
+ M_WAITOK))
+ return (ECONNABORTED);
+ nvmf_wait_for_reply(&status);
+
+ if (status.cqe.status != 0) {
+ device_printf(sc->dev, "PROPERTY_GET failed, status %#x\n",
+ le16toh(status.cqe.status));
+ return (EIO);
+ }
+
+ rsp = (const struct nvmf_fabric_prop_get_rsp *)&status.cqe;
+ if (size == 8)
+ *value = le64toh(rsp->value.u64);
+ else
+ *value = le32toh(rsp->value.u32.low);
+ return (0);
+}
+
+static int
+nvmf_write_property(struct nvmf_softc *sc, uint32_t offset, uint8_t size,
+ uint64_t value)
+{
+ struct nvmf_completion_status status;
+
+ nvmf_status_init(&status);
+ if (!nvmf_cmd_set_property(sc, offset, size, value, nvmf_complete, &status,
+ M_WAITOK))
+ return (ECONNABORTED);
+ nvmf_wait_for_reply(&status);
+
+ if (status.cqe.status != 0) {
+ device_printf(sc->dev, "PROPERTY_SET failed, status %#x\n",
+ le16toh(status.cqe.status));
+ return (EIO);
+ }
+ return (0);
+}
+
+static void
+nvmf_shutdown_controller(struct nvmf_softc *sc)
+{
+ uint64_t cc;
+ int error;
+
+ error = nvmf_read_property(sc, NVMF_PROP_CC, 4, &cc);
+ if (error != 0) {
+ device_printf(sc->dev, "Failed to fetch CC for shutdown\n");
+ return;
+ }
+
+ cc |= NVMEF(NVME_CC_REG_SHN, NVME_SHN_NORMAL);
+
+ error = nvmf_write_property(sc, NVMF_PROP_CC, 4, cc);
+ if (error != 0)
+ device_printf(sc->dev,
+ "Failed to set CC to trigger shutdown\n");
+}
+
+static void
+nvmf_check_keep_alive(void *arg)
+{
+ struct nvmf_softc *sc = arg;
+ int traffic;
+
+ traffic = atomic_readandclear_int(&sc->ka_active_rx_traffic);
+ if (traffic == 0) {
+ device_printf(sc->dev,
+ "disconnecting due to KeepAlive timeout\n");
+ nvmf_disconnect(sc);
+ return;
+ }
+
+ callout_schedule_sbt(&sc->ka_rx_timer, sc->ka_rx_sbt, 0, C_HARDCLOCK);
+}
+
+static void
+nvmf_keep_alive_complete(void *arg, const struct nvme_completion *cqe)
+{
+ struct nvmf_softc *sc = arg;
+
+ atomic_store_int(&sc->ka_active_rx_traffic, 1);
+ if (cqe->status != 0) {
+ device_printf(sc->dev,
+ "KeepAlive response reported status %#x\n",
+ le16toh(cqe->status));
+ }
+}
+
+static void
+nvmf_send_keep_alive(void *arg)
+{
+ struct nvmf_softc *sc = arg;
+ int traffic;
+
+ /*
+ * Don't bother sending a KeepAlive command if TKAS is active
+ * and another command has been sent during the interval.
+ */
+ traffic = atomic_load_int(&sc->ka_active_tx_traffic);
+ if (traffic == 0 && !nvmf_cmd_keep_alive(sc, nvmf_keep_alive_complete,
+ sc, M_NOWAIT))
+ device_printf(sc->dev,
+ "Failed to allocate KeepAlive command\n");
+
+ /* Clear ka_active_tx_traffic after sending the keep alive command. */
+ atomic_store_int(&sc->ka_active_tx_traffic, 0);
+
+ callout_schedule_sbt(&sc->ka_tx_timer, sc->ka_tx_sbt, 0, C_HARDCLOCK);
+}
+
+int
+nvmf_init_ivars(struct nvmf_ivars *ivars, struct nvmf_handoff_host *hh)
+{
+ size_t len;
+ u_int i;
+ int error;
+
+ memset(ivars, 0, sizeof(*ivars));
+
+ if (!hh->admin.admin || hh->num_io_queues < 1)
+ return (EINVAL);
+
+ ivars->cdata = malloc(sizeof(*ivars->cdata), M_NVMF, M_WAITOK);
+ error = copyin(hh->cdata, ivars->cdata, sizeof(*ivars->cdata));
+ if (error != 0)
+ goto out;
+ nvme_controller_data_swapbytes(ivars->cdata);
+
+ len = hh->num_io_queues * sizeof(*ivars->io_params);
+ ivars->io_params = malloc(len, M_NVMF, M_WAITOK);
+ error = copyin(hh->io, ivars->io_params, len);
+ if (error != 0)
+ goto out;
+ for (i = 0; i < hh->num_io_queues; i++) {
+ if (ivars->io_params[i].admin) {
+ error = EINVAL;
+ goto out;
+ }
+
+ /* Require all I/O queues to be the same size. */
+ if (ivars->io_params[i].qsize != ivars->io_params[0].qsize) {
+ error = EINVAL;
+ goto out;
+ }
+ }
+
+ ivars->hh = hh;
+ return (0);
+
+out:
+ free(ivars->io_params, M_NVMF);
+ free(ivars->cdata, M_NVMF);
+ return (error);
+}
+
+void
+nvmf_free_ivars(struct nvmf_ivars *ivars)
+{
+ free(ivars->io_params, M_NVMF);
+ free(ivars->cdata, M_NVMF);
+}
+
+static int
+nvmf_probe(device_t dev)
+{
+ struct nvmf_ivars *ivars = device_get_ivars(dev);
+ char desc[260];
+
+ if (ivars == NULL)
+ return (ENXIO);
+
+ snprintf(desc, sizeof(desc), "Fabrics: %.256s", ivars->cdata->subnqn);
+ device_set_desc_copy(dev, desc);
+ return (BUS_PROBE_DEFAULT);
+}
+
+static int
+nvmf_establish_connection(struct nvmf_softc *sc, struct nvmf_ivars *ivars)
+{
+ char name[16];
+
+ /* Setup the admin queue. */
+ sc->admin = nvmf_init_qp(sc, ivars->hh->trtype, &ivars->hh->admin,
+ "admin queue");
+ if (sc->admin == NULL) {
+ device_printf(sc->dev, "Failed to setup admin queue\n");
+ return (ENXIO);
+ }
+
+ /* Setup I/O queues. */
+ sc->io = malloc(ivars->hh->num_io_queues * sizeof(*sc->io), M_NVMF,
+ M_WAITOK | M_ZERO);
+ sc->num_io_queues = ivars->hh->num_io_queues;
+ for (u_int i = 0; i < sc->num_io_queues; i++) {
+ snprintf(name, sizeof(name), "I/O queue %u", i);
+ sc->io[i] = nvmf_init_qp(sc, ivars->hh->trtype,
+ &ivars->io_params[i], name);
+ if (sc->io[i] == NULL) {
+ device_printf(sc->dev, "Failed to setup I/O queue %u\n",
+ i + 1);
+ return (ENXIO);
+ }
+ }
+
+ /* Start KeepAlive timers. */
+ if (ivars->hh->kato != 0) {
+ sc->ka_traffic = NVMEV(NVME_CTRLR_DATA_CTRATT_TBKAS,
+ sc->cdata->ctratt) != 0;
+ sc->ka_rx_sbt = mstosbt(ivars->hh->kato);
+ sc->ka_tx_sbt = sc->ka_rx_sbt / 2;
+ callout_reset_sbt(&sc->ka_rx_timer, sc->ka_rx_sbt, 0,
+ nvmf_check_keep_alive, sc, C_HARDCLOCK);
+ callout_reset_sbt(&sc->ka_tx_timer, sc->ka_tx_sbt, 0,
+ nvmf_send_keep_alive, sc, C_HARDCLOCK);
+ }
+
+ return (0);
+}
+
+static bool
+nvmf_scan_nslist(struct nvmf_softc *sc, struct nvme_ns_list *nslist,
+ struct nvme_namespace_data *data, uint32_t *nsidp)
+{
+ struct nvmf_completion_status status;
+ uint32_t nsid;
+
+ nvmf_status_init(&status);
+ nvmf_status_wait_io(&status);
+ if (!nvmf_cmd_identify_active_namespaces(sc, *nsidp, nslist,
+ nvmf_complete, &status, nvmf_io_complete, &status, M_WAITOK)) {
+ device_printf(sc->dev,
+ "failed to send IDENTIFY active namespaces command\n");
+ return (false);
+ }
+ nvmf_wait_for_reply(&status);
+
+ if (status.cqe.status != 0) {
+ device_printf(sc->dev,
+ "IDENTIFY active namespaces failed, status %#x\n",
+ le16toh(status.cqe.status));
+ return (false);
+ }
+
+ if (status.io_error != 0) {
+ device_printf(sc->dev,
+ "IDENTIFY active namespaces failed with I/O error %d\n",
+ status.io_error);
+ return (false);
+ }
+
+ for (u_int i = 0; i < nitems(nslist->ns); i++) {
+ nsid = nslist->ns[i];
+ if (nsid == 0) {
+ *nsidp = 0;
+ return (true);
+ }
+
+ if (sc->ns[nsid - 1] != NULL) {
+ device_printf(sc->dev,
+ "duplicate namespace %u in active namespace list\n",
+ nsid);
+ return (false);
+ }
+
+ nvmf_status_init(&status);
+ nvmf_status_wait_io(&status);
+ if (!nvmf_cmd_identify_namespace(sc, nsid, data, nvmf_complete,
+ &status, nvmf_io_complete, &status, M_WAITOK)) {
+ device_printf(sc->dev,
+ "failed to send IDENTIFY namespace %u command\n",
+ nsid);
+ return (false);
+ }
+ nvmf_wait_for_reply(&status);
+
+ if (status.cqe.status != 0) {
+ device_printf(sc->dev,
+ "IDENTIFY namespace %u failed, status %#x\n", nsid,
+ le16toh(status.cqe.status));
+ return (false);
+ }
+
+ if (status.io_error != 0) {
+ device_printf(sc->dev,
+ "IDENTIFY namespace %u failed with I/O error %d\n",
+ nsid, status.io_error);
+ return (false);
+ }
+
+ /*
+ * As in nvme_ns_construct, a size of zero indicates an
+ * invalid namespace.
+ */
+ nvme_namespace_data_swapbytes(data);
+ if (data->nsze == 0) {
+ device_printf(sc->dev,
+ "ignoring active namespace %u with zero size\n",
+ nsid);
+ continue;
+ }
+
+ sc->ns[nsid - 1] = nvmf_init_ns(sc, nsid, data);
+
+ nvmf_sim_rescan_ns(sc, nsid);
+ }
+
+ MPASS(nsid == nslist->ns[nitems(nslist->ns) - 1] && nsid != 0);
+
+ if (nsid >= 0xfffffffd)
+ *nsidp = 0;
+ else
+ *nsidp = nsid + 1;
+ return (true);
+}
+
+static bool
+nvmf_add_namespaces(struct nvmf_softc *sc)
+{
+ struct nvme_namespace_data *data;
+ struct nvme_ns_list *nslist;
+ uint32_t nsid;
+ bool retval;
+
+ sc->ns = mallocarray(sc->cdata->nn, sizeof(*sc->ns), M_NVMF,
+ M_WAITOK | M_ZERO);
+ nslist = malloc(sizeof(*nslist), M_NVMF, M_WAITOK);
+ data = malloc(sizeof(*data), M_NVMF, M_WAITOK);
+
+ nsid = 0;
+ retval = true;
+ for (;;) {
+ if (!nvmf_scan_nslist(sc, nslist, data, &nsid)) {
+ retval = false;
+ break;
+ }
+ if (nsid == 0)
+ break;
+ }
+
+ free(data, M_NVMF);
+ free(nslist, M_NVMF);
+ return (retval);
+}
+
+static int
+nvmf_attach(device_t dev)
+{
+ struct make_dev_args mda;
+ struct nvmf_softc *sc = device_get_softc(dev);
+ struct nvmf_ivars *ivars = device_get_ivars(dev);
+ uint64_t val;
+ u_int i;
+ int error;
+
+ if (ivars == NULL)
+ return (ENXIO);
+
+ sc->dev = dev;
+ sc->trtype = ivars->hh->trtype;
+ callout_init(&sc->ka_rx_timer, 1);
+ callout_init(&sc->ka_tx_timer, 1);
+ sx_init(&sc->connection_lock, "nvmf connection");
+ TASK_INIT(&sc->disconnect_task, 0, nvmf_disconnect_task, sc);
+
+ /* Claim the cdata pointer from ivars. */
+ sc->cdata = ivars->cdata;
+ ivars->cdata = NULL;
+
+ nvmf_init_aer(sc);
+
+ /* TODO: Multiqueue support. */
+ sc->max_pending_io = ivars->io_params[0].qsize /* * sc->num_io_queues */;
+
+ error = nvmf_establish_connection(sc, ivars);
+ if (error != 0)
+ goto out;
+
+ error = nvmf_read_property(sc, NVMF_PROP_CAP, 8, &sc->cap);
+ if (error != 0) {
+ device_printf(sc->dev, "Failed to fetch CAP\n");
+ error = ENXIO;
+ goto out;
+ }
+
+ error = nvmf_read_property(sc, NVMF_PROP_VS, 4, &val);
+ if (error != 0) {
+ device_printf(sc->dev, "Failed to fetch VS\n");
+ error = ENXIO;
+ goto out;
+ }
+ sc->vs = val;
+
+ /* Honor MDTS if it is set. */
+ sc->max_xfer_size = maxphys;
+ if (sc->cdata->mdts != 0) {
+ sc->max_xfer_size = ulmin(sc->max_xfer_size,
+ 1 << (sc->cdata->mdts + NVME_MPS_SHIFT +
+ NVME_CAP_HI_MPSMIN(sc->cap >> 32)));
+ }
+
+ error = nvmf_init_sim(sc);
+ if (error != 0)
+ goto out;
+
+ error = nvmf_start_aer(sc);
+ if (error != 0) {
+ nvmf_destroy_sim(sc);
+ goto out;
+ }
+
+ if (!nvmf_add_namespaces(sc)) {
+ nvmf_destroy_sim(sc);
+ goto out;
+ }
+
+ make_dev_args_init(&mda);
+ mda.mda_devsw = &nvmf_cdevsw;
+ mda.mda_uid = UID_ROOT;
+ mda.mda_gid = GID_WHEEL;
+ mda.mda_mode = 0600;
+ mda.mda_si_drv1 = sc;
+ error = make_dev_s(&mda, &sc->cdev, "%s", device_get_nameunit(dev));
+ if (error != 0) {
+ nvmf_destroy_sim(sc);
+ goto out;
+ }
+
+ return (0);
+out:
+ if (sc->ns != NULL) {
+ for (i = 0; i < sc->cdata->nn; i++) {
+ if (sc->ns[i] != NULL)
+ nvmf_destroy_ns(sc->ns[i]);
+ }
+ free(sc->ns, M_NVMF);
+ }
+
+ callout_drain(&sc->ka_tx_timer);
+ callout_drain(&sc->ka_rx_timer);
+
+ if (sc->admin != NULL)
+ nvmf_shutdown_controller(sc);
+
+ for (i = 0; i < sc->num_io_queues; i++) {
+ if (sc->io[i] != NULL)
+ nvmf_destroy_qp(sc->io[i]);
+ }
+ free(sc->io, M_NVMF);
+ if (sc->admin != NULL)
+ nvmf_destroy_qp(sc->admin);
+
+ nvmf_destroy_aer(sc);
+
+ taskqueue_drain(taskqueue_thread, &sc->disconnect_task);
+ sx_destroy(&sc->connection_lock);
+ free(sc->cdata, M_NVMF);
+ return (error);
+}
+
+void
+nvmf_disconnect(struct nvmf_softc *sc)
+{
+ taskqueue_enqueue(taskqueue_thread, &sc->disconnect_task);
+}
+
+static void
+nvmf_disconnect_task(void *arg, int pending __unused)
+{
+ struct nvmf_softc *sc = arg;
+ u_int i;
+
+ sx_xlock(&sc->connection_lock);
+ if (sc->admin == NULL) {
+ /*
+ * Ignore transport errors if there is no active
+ * association.
+ */
+ sx_xunlock(&sc->connection_lock);
+ return;
+ }
+
+ if (sc->detaching) {
+ if (sc->admin != NULL) {
+ /*
+ * This unsticks the detach process if a
+ * transport error occurs during detach.
+ */
+ nvmf_shutdown_qp(sc->admin);
+ }
+ sx_xunlock(&sc->connection_lock);
+ return;
+ }
+
+ if (sc->cdev == NULL) {
+ /*
+ * Transport error occurred during attach (nvmf_add_namespaces).
+ * Shutdown the admin queue.
+ */
+ nvmf_shutdown_qp(sc->admin);
+ sx_xunlock(&sc->connection_lock);
+ return;
+ }
+
+ callout_drain(&sc->ka_tx_timer);
+ callout_drain(&sc->ka_rx_timer);
+ sc->ka_traffic = false;
+
+ /* Quiesce namespace consumers. */
+ nvmf_disconnect_sim(sc);
+ for (i = 0; i < sc->cdata->nn; i++) {
+ if (sc->ns[i] != NULL)
+ nvmf_disconnect_ns(sc->ns[i]);
+ }
+
+ /* Shutdown the existing qpairs. */
+ for (i = 0; i < sc->num_io_queues; i++) {
+ nvmf_destroy_qp(sc->io[i]);
+ }
+ free(sc->io, M_NVMF);
+ sc->io = NULL;
+ sc->num_io_queues = 0;
+ nvmf_destroy_qp(sc->admin);
+ sc->admin = NULL;
+
+ sx_xunlock(&sc->connection_lock);
+}
+
+static int
+nvmf_reconnect_host(struct nvmf_softc *sc, struct nvmf_handoff_host *hh)
+{
+ struct nvmf_ivars ivars;
+ u_int i;
+ int error;
+
+ /* XXX: Should we permit changing the transport type? */
+ if (sc->trtype != hh->trtype) {
+ device_printf(sc->dev,
+ "transport type mismatch on reconnect\n");
+ return (EINVAL);
+ }
+
+ error = nvmf_init_ivars(&ivars, hh);
+ if (error != 0)
+ return (error);
+
+ sx_xlock(&sc->connection_lock);
+ if (sc->admin != NULL || sc->detaching) {
+ error = EBUSY;
+ goto out;
+ }
+
+ /*
+ * Ensure this is for the same controller. Note that the
+ * controller ID can vary across associations if the remote
+ * system is using the dynamic controller model. This merely
+ * ensures the new association is connected to the same NVMe
+ * subsystem.
+ */
+ if (memcmp(sc->cdata->subnqn, ivars.cdata->subnqn,
+ sizeof(ivars.cdata->subnqn)) != 0) {
+ device_printf(sc->dev,
+ "controller subsystem NQN mismatch on reconnect\n");
+ error = EINVAL;
+ goto out;
+ }
+
+ /*
+ * XXX: Require same number and size of I/O queues so that
+ * max_pending_io is still correct?
+ */
+
+ error = nvmf_establish_connection(sc, &ivars);
+ if (error != 0)
+ goto out;
+
+ error = nvmf_start_aer(sc);
+ if (error != 0)
+ goto out;
+
+ device_printf(sc->dev,
+ "established new association with %u I/O queues\n",
+ sc->num_io_queues);
+
+ /* Restart namespace consumers. */
+ for (i = 0; i < sc->cdata->nn; i++) {
+ if (sc->ns[i] != NULL)
+ nvmf_reconnect_ns(sc->ns[i]);
+ }
+ nvmf_reconnect_sim(sc);
+out:
+ sx_xunlock(&sc->connection_lock);
+ nvmf_free_ivars(&ivars);
+ return (error);
+}
+
+static int
+nvmf_detach(device_t dev)
+{
+ struct nvmf_softc *sc = device_get_softc(dev);
+ u_int i;
+
+ destroy_dev(sc->cdev);
+
+ sx_xlock(&sc->connection_lock);
+ sc->detaching = true;
+ sx_xunlock(&sc->connection_lock);
+
+ nvmf_destroy_sim(sc);
+ for (i = 0; i < sc->cdata->nn; i++) {
+ if (sc->ns[i] != NULL)
+ nvmf_destroy_ns(sc->ns[i]);
+ }
+ free(sc->ns, M_NVMF);
+
+ callout_drain(&sc->ka_tx_timer);
+ callout_drain(&sc->ka_rx_timer);
+
+ if (sc->admin != NULL)
+ nvmf_shutdown_controller(sc);
+
+ for (i = 0; i < sc->num_io_queues; i++) {
+ nvmf_destroy_qp(sc->io[i]);
+ }
+ free(sc->io, M_NVMF);
+
+ taskqueue_drain(taskqueue_thread, &sc->disconnect_task);
+
+ if (sc->admin != NULL)
+ nvmf_destroy_qp(sc->admin);
+
+ nvmf_destroy_aer(sc);
+
+ sx_destroy(&sc->connection_lock);
+ free(sc->cdata, M_NVMF);
+ return (0);
+}
+
+void
+nvmf_rescan_ns(struct nvmf_softc *sc, uint32_t nsid)
+{
+ struct nvmf_completion_status status;
+ struct nvme_namespace_data *data;
+ struct nvmf_namespace *ns;
+
+ data = malloc(sizeof(*data), M_NVMF, M_WAITOK);
+
+ nvmf_status_init(&status);
+ nvmf_status_wait_io(&status);
+ if (!nvmf_cmd_identify_namespace(sc, nsid, data, nvmf_complete,
+ &status, nvmf_io_complete, &status, M_WAITOK)) {
+ device_printf(sc->dev,
+ "failed to send IDENTIFY namespace %u command\n", nsid);
+ free(data, M_NVMF);
+ return;
+ }
+ nvmf_wait_for_reply(&status);
+
+ if (status.cqe.status != 0) {
+ device_printf(sc->dev,
+ "IDENTIFY namespace %u failed, status %#x\n", nsid,
+ le16toh(status.cqe.status));
+ free(data, M_NVMF);
+ return;
+ }
+
+ if (status.io_error != 0) {
+ device_printf(sc->dev,
+ "IDENTIFY namespace %u failed with I/O error %d\n",
+ nsid, status.io_error);
+ free(data, M_NVMF);
+ return;
+ }
+
+ nvme_namespace_data_swapbytes(data);
+
+ /* XXX: Needs locking around sc->ns[]. */
+ ns = sc->ns[nsid - 1];
+ if (data->nsze == 0) {
+ /* XXX: Needs locking */
+ if (ns != NULL) {
+ nvmf_destroy_ns(ns);
+ sc->ns[nsid - 1] = NULL;
+ }
+ } else {
+ /* XXX: Needs locking */
+ if (ns == NULL) {
+ sc->ns[nsid - 1] = nvmf_init_ns(sc, nsid, data);
+ } else {
+ if (!nvmf_update_ns(ns, data)) {
+ nvmf_destroy_ns(ns);
+ sc->ns[nsid - 1] = NULL;
+ }
+ }
+ }
+
+ free(data, M_NVMF);
+
+ nvmf_sim_rescan_ns(sc, nsid);
+}
+
+int
+nvmf_passthrough_cmd(struct nvmf_softc *sc, struct nvme_pt_command *pt,
+ bool admin)
+{
+ struct nvmf_completion_status status;
+ struct nvme_command cmd;
+ struct memdesc mem;
+ struct nvmf_host_qpair *qp;
+ struct nvmf_request *req;
+ void *buf;
+ int error;
+
+ if (pt->len > sc->max_xfer_size)
+ return (EINVAL);
+
+ buf = NULL;
+ if (pt->len != 0) {
+ /*
+ * XXX: Depending on the size we may want to pin the
+ * user pages and use a memdesc with vm_page_t's
+ * instead.
+ */
+ buf = malloc(pt->len, M_NVMF, M_WAITOK);
+ if (pt->is_read == 0) {
+ error = copyin(pt->buf, buf, pt->len);
+ if (error != 0) {
+ free(buf, M_NVMF);
+ return (error);
+ }
+ } else {
+ /* Ensure no kernel data is leaked to userland. */
+ memset(buf, 0, pt->len);
+ }
+ }
+
+ memset(&cmd, 0, sizeof(cmd));
+ cmd.opc = pt->cmd.opc;
+ cmd.fuse = pt->cmd.fuse;
+ cmd.nsid = pt->cmd.nsid;
+ cmd.cdw10 = pt->cmd.cdw10;
+ cmd.cdw11 = pt->cmd.cdw11;
+ cmd.cdw12 = pt->cmd.cdw12;
+ cmd.cdw13 = pt->cmd.cdw13;
+ cmd.cdw14 = pt->cmd.cdw14;
+ cmd.cdw15 = pt->cmd.cdw15;
+
+ if (admin)
+ qp = sc->admin;
+ else
+ qp = nvmf_select_io_queue(sc);
+ nvmf_status_init(&status);
+ req = nvmf_allocate_request(qp, &cmd, nvmf_complete, &status, M_WAITOK);
+ if (req == NULL) {
+ device_printf(sc->dev, "failed to send passthrough command\n");
+ error = ECONNABORTED;
+ goto error;
+ }
+
+ if (pt->len != 0) {
+ mem = memdesc_vaddr(buf, pt->len);
+ nvmf_capsule_append_data(req->nc, &mem, pt->len,
+ pt->is_read == 0, nvmf_io_complete, &status);
+ nvmf_status_wait_io(&status);
+ }
+
+ nvmf_submit_request(req);
+ nvmf_wait_for_reply(&status);
+
+ memset(&pt->cpl, 0, sizeof(pt->cpl));
+ pt->cpl.cdw0 = status.cqe.cdw0;
+ pt->cpl.status = status.cqe.status;
+
+ error = status.io_error;
+ if (error == 0 && pt->len != 0 && pt->is_read != 0)
+ error = copyout(buf, pt->buf, pt->len);
+error:
+ free(buf, M_NVMF);
+ return (error);
+}
+
+static int
+nvmf_ioctl(struct cdev *cdev, u_long cmd, caddr_t arg, int flag,
+ struct thread *td)
+{
+ struct nvmf_softc *sc = cdev->si_drv1;
+ struct nvme_get_nsid *gnsid;
+ struct nvme_pt_command *pt;
+ struct nvmf_reconnect_params *rp;
+ struct nvmf_handoff_host *hh;
+
+ switch (cmd) {
+ case NVME_PASSTHROUGH_CMD:
+ pt = (struct nvme_pt_command *)arg;
+ return (nvmf_passthrough_cmd(sc, pt, true));
+ case NVME_GET_NSID:
+ gnsid = (struct nvme_get_nsid *)arg;
+ strncpy(gnsid->cdev, device_get_nameunit(sc->dev),
+ sizeof(gnsid->cdev));
+ gnsid->cdev[sizeof(gnsid->cdev) - 1] = '\0';
+ gnsid->nsid = 0;
+ return (0);
+ case NVME_GET_MAX_XFER_SIZE:
+ *(uint64_t *)arg = sc->max_xfer_size;
+ return (0);
+ case NVMF_RECONNECT_PARAMS:
+ rp = (struct nvmf_reconnect_params *)arg;
+ if ((sc->cdata->fcatt & 1) == 0)
+ rp->cntlid = NVMF_CNTLID_DYNAMIC;
+ else
+ rp->cntlid = sc->cdata->ctrlr_id;
+ memcpy(rp->subnqn, sc->cdata->subnqn, sizeof(rp->subnqn));
+ return (0);
+ case NVMF_RECONNECT_HOST:
+ hh = (struct nvmf_handoff_host *)arg;
+ return (nvmf_reconnect_host(sc, hh));
+ default:
+ return (ENOTTY);
+ }
+}
+
+static struct cdevsw nvmf_cdevsw = {
+ .d_version = D_VERSION,
+ .d_ioctl = nvmf_ioctl
+};
+
+static int
+nvmf_modevent(module_t mod, int what, void *arg)
+{
+ switch (what) {
+ case MOD_LOAD:
+ return (nvmf_ctl_load());
+ case MOD_QUIESCE:
+ return (0);
+ case MOD_UNLOAD:
+ nvmf_ctl_unload();
+ destroy_dev_drain(&nvmf_cdevsw);
+ return (0);
+ default:
+ return (EOPNOTSUPP);
+ }
+}
+
+static device_method_t nvmf_methods[] = {
+ /* Device interface */
+ DEVMETHOD(device_probe, nvmf_probe),
+ DEVMETHOD(device_attach, nvmf_attach),
+ DEVMETHOD(device_detach, nvmf_detach),
+#if 0
+ DEVMETHOD(device_shutdown, nvmf_shutdown),
+#endif
+ DEVMETHOD_END
+};
+
+driver_t nvme_nvmf_driver = {
+ "nvme",
+ nvmf_methods,
+ sizeof(struct nvmf_softc),
+};
+
+DRIVER_MODULE(nvme, root, nvme_nvmf_driver, nvmf_modevent, NULL);
+MODULE_DEPEND(nvmf, nvmf_transport, 1, 1, 1);
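
For context (not part of this change): the NVME_PASSTHROUGH_CMD case in
nvmf_ioctl() above makes the admin queue reachable from userland through the
controller node.  A minimal sketch, assuming a hypothetical nvmf-backed
/dev/nvme0 and using the standard Identify Controller admin command; error
handling is trimmed.

#include <sys/ioctl.h>
#include <sys/endian.h>
#include <dev/nvme/nvme.h>
#include <err.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	struct nvme_pt_command pt;
	void *buf;
	int fd;

	fd = open("/dev/nvme0", O_RDWR);	/* nvmf-backed controller node */
	if (fd == -1)
		err(1, "/dev/nvme0");

	buf = calloc(1, sizeof(struct nvme_controller_data));
	memset(&pt, 0, sizeof(pt));
	pt.cmd.opc = NVME_OPC_IDENTIFY;
	pt.cmd.cdw10 = htole32(1);		/* CNS 0x01: controller data structure */
	pt.buf = buf;
	pt.len = sizeof(struct nvme_controller_data);
	pt.is_read = 1;				/* controller-to-host transfer */

	/* Dispatched as nvmf_passthrough_cmd(sc, pt, true) in the driver. */
	if (ioctl(fd, NVME_PASSTHROUGH_CMD, &pt) == -1)
		err(1, "NVME_PASSTHROUGH_CMD");
	printf("completion status %#x\n", pt.cpl.status);

	free(buf);
	close(fd);
	return (0);
}
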
diff --git a/sys/dev/nvmf/host/nvmf_aer.c b/sys/dev/nvmf/host/nvmf_aer.c
new file mode 100644
--- /dev/null
+++ b/sys/dev/nvmf/host/nvmf_aer.c
@@ -0,0 +1,290 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2024 Chelsio Communications, Inc.
+ * Written by: John Baldwin <jhb@FreeBSD.org>
+ */
+
+#include <sys/types.h>
+#include <sys/bus.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/taskqueue.h>
+#include <dev/nvmf/host/nvmf_var.h>
+
+struct nvmf_aer {
+ struct nvmf_softc *sc;
+ uint8_t log_page_id;
+ uint8_t info;
+ uint8_t type;
+
+ u_int page_len;
+ void *page;
+
+ int error;
+ uint16_t status;
+ int pending;
+ struct mtx *lock;
+ struct task complete_task;
+ struct task finish_page_task;
+};
+
+#define MAX_LOG_PAGE_SIZE 4096
+
+static void nvmf_complete_aer(void *arg, const struct nvme_completion *cqe);
+
+static void
+nvmf_submit_aer(struct nvmf_softc *sc, struct nvmf_aer *aer)
+{
+ struct nvmf_request *req;
+ struct nvme_command cmd;
+
+ memset(&cmd, 0, sizeof(cmd));
+ cmd.opc = NVME_OPC_ASYNC_EVENT_REQUEST;
+
+ req = nvmf_allocate_request(sc->admin, &cmd, nvmf_complete_aer, aer,
+ M_WAITOK);
+ if (req == NULL)
+ return;
+ req->aer = true;
+ nvmf_submit_request(req);
+}
+
+static void
+nvmf_handle_changed_namespaces(struct nvmf_softc *sc,
+ struct nvme_ns_list *ns_list)
+{
+ uint32_t nsid;
+
+ /*
+ * If more than 1024 namespaces have changed, we should
+ * probably just rescan the entire set of namespaces.
+ */
+ if (ns_list->ns[0] == 0xffffffff) {
+ device_printf(sc->dev, "too many changed namespaces\n");
+ return;
+ }
+
+ for (u_int i = 0; i < nitems(ns_list->ns); i++) {
+ if (ns_list->ns[i] == 0)
+ break;
+
+ nsid = le32toh(ns_list->ns[i]);
+ nvmf_rescan_ns(sc, nsid);
+ }
+}
+
+static void
+nvmf_finish_aer_page(struct nvmf_softc *sc, struct nvmf_aer *aer)
+{
+ /* If an error occurred fetching the page, just bail. */
+ if (aer->error != 0 || aer->status != 0)
+ return;
+
+ taskqueue_enqueue(taskqueue_thread, &aer->finish_page_task);
+}
+
+static void
+nvmf_finish_aer_page_task(void *arg, int pending)
+{
+ struct nvmf_aer *aer = arg;
+ struct nvmf_softc *sc = aer->sc;
+
+ switch (aer->log_page_id) {
+ case NVME_LOG_ERROR:
+ /* TODO: Should we log these? */
+ break;
+ case NVME_LOG_CHANGED_NAMESPACE:
+ nvmf_handle_changed_namespaces(sc, aer->page);
+ break;
+ }
+
+ /* Resubmit this AER command. */
+ nvmf_submit_aer(sc, aer);
+}
+
+static void
+nvmf_io_complete_aer_page(void *arg, size_t xfered, int error)
+{
+ struct nvmf_aer *aer = arg;
+ struct nvmf_softc *sc = aer->sc;
+
+ mtx_lock(aer->lock);
+ aer->error = error;
+ aer->pending--;
+ if (aer->pending == 0) {
+ mtx_unlock(aer->lock);
+ nvmf_finish_aer_page(sc, aer);
+ } else
+ mtx_unlock(aer->lock);
+}
+
+static void
+nvmf_complete_aer_page(void *arg, const struct nvme_completion *cqe)
+{
+ struct nvmf_aer *aer = arg;
+ struct nvmf_softc *sc = aer->sc;
+
+ mtx_lock(aer->lock);
+ aer->status = cqe->status;
+ aer->pending--;
+ if (aer->pending == 0) {
+ mtx_unlock(aer->lock);
+ nvmf_finish_aer_page(sc, aer);
+ } else
+ mtx_unlock(aer->lock);
+}
+
+static u_int
+nvmf_log_page_size(struct nvmf_softc *sc, uint8_t log_page_id)
+{
+ switch (log_page_id) {
+ case NVME_LOG_ERROR:
+ return ((sc->cdata->elpe + 1) *
+ sizeof(struct nvme_error_information_entry));
+ case NVME_LOG_CHANGED_NAMESPACE:
+ return (sizeof(struct nvme_ns_list));
+ default:
+ return (0);
+ }
+}
+
+static void
+nvmf_complete_aer(void *arg, const struct nvme_completion *cqe)
+{
+ struct nvmf_aer *aer = arg;
+ struct nvmf_softc *sc = aer->sc;
+ uint32_t cdw0;
+
+ /*
+ * The only error defined for AER is an abort due to
+ * submitting too many AER commands. Just discard this AER
+ * without resubmitting if we get an error.
+ *
+ * NB: Pending AER commands are aborted during controller
+ * shutdown, so discard aborted commands silently.
+ */
+ if (cqe->status != 0) {
+ if (!nvmf_cqe_aborted(cqe))
+ device_printf(sc->dev, "Ignoring error %#x for AER\n",
+ le16toh(cqe->status));
+ return;
+ }
+
+ cdw0 = le32toh(cqe->cdw0);
+ aer->log_page_id = NVMEV(NVME_ASYNC_EVENT_LOG_PAGE_ID, cdw0);
+ aer->info = NVMEV(NVME_ASYNC_EVENT_INFO, cdw0);
+ aer->type = NVMEV(NVME_ASYNC_EVENT_TYPE, cdw0);
+
+ device_printf(sc->dev, "AER type %u, info %#x, page %#x\n",
+ aer->type, aer->info, aer->log_page_id);
+
+ aer->page_len = nvmf_log_page_size(sc, aer->log_page_id);
+ taskqueue_enqueue(taskqueue_thread, &aer->complete_task);
+}
+
+static void
+nvmf_complete_aer_task(void *arg, int pending)
+{
+ struct nvmf_aer *aer = arg;
+ struct nvmf_softc *sc = aer->sc;
+
+ if (aer->page_len != 0) {
+ /* Read the associated log page. */
+ aer->page_len = MIN(aer->page_len, MAX_LOG_PAGE_SIZE);
+ aer->pending = 2;
+ (void) nvmf_cmd_get_log_page(sc, NVME_GLOBAL_NAMESPACE_TAG,
+ aer->log_page_id, 0, aer->page, aer->page_len,
+ nvmf_complete_aer_page, aer, nvmf_io_complete_aer_page,
+ aer, M_WAITOK);
+ } else {
+ /* Resubmit this AER command. */
+ nvmf_submit_aer(sc, aer);
+ }
+}
+
+static int
+nvmf_set_async_event_config(struct nvmf_softc *sc, uint32_t config)
+{
+ struct nvme_command cmd;
+ struct nvmf_completion_status status;
+ struct nvmf_request *req;
+
+ memset(&cmd, 0, sizeof(cmd));
+ cmd.opc = NVME_OPC_SET_FEATURES;
+ cmd.cdw10 = htole32(NVME_FEAT_ASYNC_EVENT_CONFIGURATION);
+ cmd.cdw11 = htole32(config);
+
+ nvmf_status_init(&status);
+ req = nvmf_allocate_request(sc->admin, &cmd, nvmf_complete, &status,
+ M_WAITOK);
+ if (req == NULL) {
+ device_printf(sc->dev,
+ "failed to allocate SET_FEATURES (ASYNC_EVENT_CONFIGURATION) command\n");
+ return (ECONNABORTED);
+ }
+ nvmf_submit_request(req);
+ nvmf_wait_for_reply(&status);
+
+ if (status.cqe.status != 0) {
+ device_printf(sc->dev,
+ "SET_FEATURES (ASYNC_EVENT_CONFIGURATION) failed, status %#x\n",
+ le16toh(status.cqe.status));
+ return (EIO);
+ }
+
+ return (0);
+}
+
+void
+nvmf_init_aer(struct nvmf_softc *sc)
+{
+ /* 8 matches NVME_MAX_ASYNC_EVENTS */
+ sc->num_aer = min(8, sc->cdata->aerl + 1);
+ sc->aer = mallocarray(sc->num_aer, sizeof(*sc->aer), M_NVMF,
+ M_WAITOK | M_ZERO);
+ for (u_int i = 0; i < sc->num_aer; i++) {
+ sc->aer[i].sc = sc;
+ sc->aer[i].page = malloc(MAX_LOG_PAGE_SIZE, M_NVMF, M_WAITOK);
+ sc->aer[i].lock = mtx_pool_find(mtxpool_sleep, &sc->aer[i]);
+ TASK_INIT(&sc->aer[i].complete_task, 0, nvmf_complete_aer_task,
+ &sc->aer[i]);
+ TASK_INIT(&sc->aer[i].finish_page_task, 0,
+ nvmf_finish_aer_page_task, &sc->aer[i]);
+ }
+}
+
+int
+nvmf_start_aer(struct nvmf_softc *sc)
+{
+ uint32_t async_event_config;
+ int error;
+
+ async_event_config = NVME_CRIT_WARN_ST_AVAILABLE_SPARE |
+ NVME_CRIT_WARN_ST_DEVICE_RELIABILITY |
+ NVME_CRIT_WARN_ST_READ_ONLY |
+ NVME_CRIT_WARN_ST_VOLATILE_MEMORY_BACKUP;
+ if (sc->cdata->ver >= NVME_REV(1, 2))
+ async_event_config |=
+ sc->cdata->oaes & NVME_ASYNC_EVENT_NS_ATTRIBUTE;
+ error = nvmf_set_async_event_config(sc, async_event_config);
+ if (error != 0)
+ return (error);
+
+ for (u_int i = 0; i < sc->num_aer; i++)
+ nvmf_submit_aer(sc, &sc->aer[i]);
+
+ return (0);
+}
+
+void
+nvmf_destroy_aer(struct nvmf_softc *sc)
+{
+ for (u_int i = 0; i < sc->num_aer; i++) {
+ taskqueue_drain(taskqueue_thread, &sc->aer[i].complete_task);
+ taskqueue_drain(taskqueue_thread, &sc->aer[i].finish_page_task);
+ free(sc->aer[i].page, M_NVMF);
+ }
+ free(sc->aer, M_NVMF);
+}
diff --git a/sys/dev/nvmf/host/nvmf_cmd.c b/sys/dev/nvmf/host/nvmf_cmd.c
new file mode 100644
--- /dev/null
+++ b/sys/dev/nvmf/host/nvmf_cmd.c
@@ -0,0 +1,171 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2023-2024 Chelsio Communications, Inc.
+ * Written by: John Baldwin <jhb@FreeBSD.org>
+ */
+
+#include <sys/types.h>
+#include <sys/memdesc.h>
+#include <sys/systm.h>
+#include <dev/nvme/nvme.h>
+#include <dev/nvmf/nvmf.h>
+#include <dev/nvmf/nvmf_proto.h>
+#include <dev/nvmf/host/nvmf_var.h>
+
+bool
+nvmf_cmd_get_property(struct nvmf_softc *sc, uint32_t offset, uint8_t size,
+ nvmf_request_complete_t *cb, void *cb_arg, int how)
+{
+ struct nvmf_fabric_prop_get_cmd cmd;
+ struct nvmf_request *req;
+
+ memset(&cmd, 0, sizeof(cmd));
+ cmd.opcode = NVME_OPC_FABRICS_COMMANDS;
+ cmd.fctype = NVMF_FABRIC_COMMAND_PROPERTY_GET;
+ switch (size) {
+ case 4:
+ cmd.attrib.size = NVMF_PROP_SIZE_4;
+ break;
+ case 8:
+ cmd.attrib.size = NVMF_PROP_SIZE_8;
+ break;
+ default:
+ panic("Invalid property size");
+ }
+ cmd.ofst = htole32(offset);
+
+ req = nvmf_allocate_request(sc->admin, &cmd, cb, cb_arg, how);
+ if (req != NULL)
+ nvmf_submit_request(req);
+ return (req != NULL);
+}
+
+bool
+nvmf_cmd_set_property(struct nvmf_softc *sc, uint32_t offset, uint8_t size,
+ uint64_t value, nvmf_request_complete_t *cb, void *cb_arg, int how)
+{
+ struct nvmf_fabric_prop_set_cmd cmd;
+ struct nvmf_request *req;
+
+ memset(&cmd, 0, sizeof(cmd));
+ cmd.opcode = NVME_OPC_FABRICS_COMMANDS;
+ cmd.fctype = NVMF_FABRIC_COMMAND_PROPERTY_SET;
+ switch (size) {
+ case 4:
+ cmd.attrib.size = NVMF_PROP_SIZE_4;
+ cmd.value.u32.low = htole32(value);
+ break;
+ case 8:
+ cmd.attrib.size = NVMF_PROP_SIZE_8;
+ cmd.value.u64 = htole64(value);
+ break;
+ default:
+ panic("Invalid property size");
+ }
+ cmd.ofst = htole32(offset);
+
+ req = nvmf_allocate_request(sc->admin, &cmd, cb, cb_arg, how);
+ if (req != NULL)
+ nvmf_submit_request(req);
+ return (req != NULL);
+}
+
+bool
+nvmf_cmd_keep_alive(struct nvmf_softc *sc, nvmf_request_complete_t *cb,
+ void *cb_arg, int how)
+{
+ struct nvme_command cmd;
+ struct nvmf_request *req;
+
+ memset(&cmd, 0, sizeof(cmd));
+ cmd.opc = NVME_OPC_KEEP_ALIVE;
+
+ req = nvmf_allocate_request(sc->admin, &cmd, cb, cb_arg, how);
+ if (req != NULL)
+ nvmf_submit_request(req);
+ return (req != NULL);
+}
+
+bool
+nvmf_cmd_identify_active_namespaces(struct nvmf_softc *sc, uint32_t id,
+ struct nvme_ns_list *nslist, nvmf_request_complete_t *req_cb,
+ void *req_cb_arg, nvmf_io_complete_t *io_cb, void *io_cb_arg, int how)
+{
+ struct nvme_command cmd;
+ struct memdesc mem;
+ struct nvmf_request *req;
+
+ memset(&cmd, 0, sizeof(cmd));
+ cmd.opc = NVME_OPC_IDENTIFY;
+
+	/* 5.15.1: Use CNS of 0x02 for the active namespace ID list. */
+ cmd.cdw10 = htole32(2);
+ cmd.nsid = htole32(id);
+
+ req = nvmf_allocate_request(sc->admin, &cmd, req_cb, req_cb_arg, how);
+ if (req == NULL)
+ return (false);
+ mem = memdesc_vaddr(nslist, sizeof(*nslist));
+ nvmf_capsule_append_data(req->nc, &mem, sizeof(*nslist), false,
+ io_cb, io_cb_arg);
+ nvmf_submit_request(req);
+ return (true);
+}
+
+bool
+nvmf_cmd_identify_namespace(struct nvmf_softc *sc, uint32_t id,
+ struct nvme_namespace_data *nsdata, nvmf_request_complete_t *req_cb,
+ void *req_cb_arg, nvmf_io_complete_t *io_cb, void *io_cb_arg, int how)
+{
+ struct nvme_command cmd;
+ struct memdesc mem;
+ struct nvmf_request *req;
+
+ memset(&cmd, 0, sizeof(cmd));
+ cmd.opc = NVME_OPC_IDENTIFY;
+
+ /* 5.15.1 Use CNS of 0x00 for namespace data. */
+ cmd.cdw10 = htole32(0);
+ cmd.nsid = htole32(id);
+
+ req = nvmf_allocate_request(sc->admin, &cmd, req_cb, req_cb_arg, how);
+ if (req == NULL)
+ return (false);
+ mem = memdesc_vaddr(nsdata, sizeof(*nsdata));
+ nvmf_capsule_append_data(req->nc, &mem, sizeof(*nsdata), false,
+ io_cb, io_cb_arg);
+ nvmf_submit_request(req);
+ return (true);
+}
+
+bool
+nvmf_cmd_get_log_page(struct nvmf_softc *sc, uint32_t nsid, uint8_t lid,
+ uint64_t offset, void *buf, size_t len, nvmf_request_complete_t *req_cb,
+ void *req_cb_arg, nvmf_io_complete_t *io_cb, void *io_cb_arg, int how)
+{
+ struct nvme_command cmd;
+ struct memdesc mem;
+ struct nvmf_request *req;
+ size_t numd;
+
+ MPASS(len != 0 && len % 4 == 0);
+ MPASS(offset % 4 == 0);
+
+ numd = (len / 4) - 1;
+ memset(&cmd, 0, sizeof(cmd));
+ cmd.opc = NVME_OPC_GET_LOG_PAGE;
+ cmd.nsid = htole32(nsid);
+ cmd.cdw10 = htole32(numd << 16 | lid);
+ cmd.cdw11 = htole32(numd >> 16);
+ cmd.cdw12 = htole32(offset);
+ cmd.cdw13 = htole32(offset >> 32);
+
+ req = nvmf_allocate_request(sc->admin, &cmd, req_cb, req_cb_arg, how);
+ if (req == NULL)
+ return (false);
+ mem = memdesc_vaddr(buf, len);
+ nvmf_capsule_append_data(req->nc, &mem, len, false, io_cb, io_cb_arg);
+ nvmf_submit_request(req);
+ return (true);
+}
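
For context (not part of this change): the dword packing in
nvmf_cmd_get_log_page() follows the standard GET_LOG_PAGE layout.  Worked
through for a 4096-byte read it looks like this (field names per the NVMe
specification):

/* Illustrative only: GET_LOG_PAGE dword packing for a 4096-byte read. */
static void
example_get_log_page_cdws(struct nvme_command *cmd)
{
	size_t len = 4096;
	size_t numd = (len / 4) - 1;	/* zero-based dword count: 1023 (0x3ff) */
	uint8_t lid = NVME_LOG_ERROR;
	uint64_t offset = 0;

	cmd->opc = NVME_OPC_GET_LOG_PAGE;
	cmd->cdw10 = htole32(numd << 16 | lid);	/* NUMDL in bits 31:16, LID in bits 7:0 */
	cmd->cdw11 = htole32(numd >> 16);	/* NUMDU: upper 16 bits of NUMD (0 here) */
	cmd->cdw12 = htole32(offset);		/* LPOL: offset, low 32 bits */
	cmd->cdw13 = htole32(offset >> 32);	/* LPOU: offset, high 32 bits */
}
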
diff --git a/sys/dev/nvmf/host/nvmf_ctldev.c b/sys/dev/nvmf/host/nvmf_ctldev.c
new file mode 100644
--- /dev/null
+++ b/sys/dev/nvmf/host/nvmf_ctldev.c
@@ -0,0 +1,159 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2023 Chelsio Communications, Inc.
+ * Written by: John Baldwin <jhb@FreeBSD.org>
+ */
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/conf.h>
+#include <sys/malloc.h>
+#include <dev/nvme/nvme.h>
+#include <dev/nvmf/nvmf.h>
+#include <dev/nvmf/nvmf_transport.h>
+#include <dev/nvmf/host/nvmf_var.h>
+
+static struct cdev *nvmf_cdev;
+
+static int
+nvmf_handoff_host(struct nvmf_handoff_host *hh)
+{
+ struct nvmf_ivars ivars;
+ device_t dev;
+ int error;
+
+ error = nvmf_init_ivars(&ivars, hh);
+ if (error != 0)
+ return (error);
+
+ bus_topo_lock();
+ dev = device_add_child(root_bus, "nvme", -1);
+ if (dev == NULL) {
+ bus_topo_unlock();
+ error = ENXIO;
+ goto out;
+ }
+
+ device_set_ivars(dev, &ivars);
+ error = device_probe_and_attach(dev);
+ device_set_ivars(dev, NULL);
+ if (error != 0)
+ device_delete_child(root_bus, dev);
+ bus_topo_unlock();
+
+out:
+ nvmf_free_ivars(&ivars);
+ return (error);
+}
+
+static bool
+nvmf_matches(device_t dev, char *name)
+{
+ struct nvmf_softc *sc = device_get_softc(dev);
+
+ if (strcmp(device_get_nameunit(dev), name) == 0)
+ return (true);
+ if (strcmp(sc->cdata->subnqn, name) == 0)
+ return (true);
+ return (false);
+}
+
+static int
+nvmf_disconnect_by_name(char *name)
+{
+ devclass_t dc;
+ device_t dev;
+ int error, unit;
+ bool found;
+
+ found = false;
+ error = 0;
+ bus_topo_lock();
+ dc = devclass_find("nvme");
+ if (dc == NULL)
+ goto out;
+
+ for (unit = 0; unit < devclass_get_maxunit(dc); unit++) {
+ dev = devclass_get_device(dc, unit);
+ if (dev == NULL)
+ continue;
+ if (device_get_driver(dev) != &nvme_nvmf_driver)
+ continue;
+ if (device_get_parent(dev) != root_bus)
+ continue;
+ if (name != NULL && !nvmf_matches(dev, name))
+ continue;
+
+ error = device_delete_child(root_bus, dev);
+ if (error != 0)
+ break;
+ found = true;
+ }
+out:
+ bus_topo_unlock();
+ if (error == 0 && !found)
+ error = ENOENT;
+ return (error);
+}
+
+static int
+nvmf_disconnect_host(const char **namep)
+{
+ char *name;
+ int error;
+
+ name = malloc(PATH_MAX, M_NVMF, M_WAITOK);
+ error = copyinstr(*namep, name, PATH_MAX, NULL);
+ if (error == 0)
+ error = nvmf_disconnect_by_name(name);
+ free(name, M_NVMF);
+ return (error);
+}
+
+static int
+nvmf_ctl_ioctl(struct cdev *dev, u_long cmd, caddr_t arg, int flag,
+ struct thread *td)
+{
+ switch (cmd) {
+ case NVMF_HANDOFF_HOST:
+ return (nvmf_handoff_host((struct nvmf_handoff_host *)arg));
+ case NVMF_DISCONNECT_HOST:
+ return (nvmf_disconnect_host((const char **)arg));
+ case NVMF_DISCONNECT_ALL:
+ return (nvmf_disconnect_by_name(NULL));
+ default:
+ return (ENOTTY);
+ }
+}
+
+static struct cdevsw nvmf_ctl_cdevsw = {
+ .d_version = D_VERSION,
+ .d_ioctl = nvmf_ctl_ioctl
+};
+
+int
+nvmf_ctl_load(void)
+{
+ struct make_dev_args mda;
+ int error;
+
+ make_dev_args_init(&mda);
+ mda.mda_devsw = &nvmf_ctl_cdevsw;
+ mda.mda_uid = UID_ROOT;
+ mda.mda_gid = GID_WHEEL;
+ mda.mda_mode = 0600;
+ error = make_dev_s(&mda, &nvmf_cdev, "nvmf");
+ if (error != 0)
+ nvmf_cdev = NULL;
+ return (error);
+}
+
+void
+nvmf_ctl_unload(void)
+{
+ if (nvmf_cdev != NULL) {
+ destroy_dev(nvmf_cdev);
+ nvmf_cdev = NULL;
+ }
+}
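
For context (not part of this change): /dev/nvmf created above is the control
node that nvmecontrol(8) talks to.  A minimal, hypothetical sketch of the
disconnect path; note that NVMF_DISCONNECT_HOST passes a pointer to the name
pointer, which nvmf_disconnect_host() dereferences before copyinstr().

#include <sys/ioctl.h>
#include <dev/nvmf/nvmf.h>	/* ioctl definitions; header path as in this patch */
#include <err.h>
#include <fcntl.h>
#include <unistd.h>

int
main(void)
{
	const char *name = "nvme0";	/* device name or subsystem NQN */
	int fd;

	fd = open("/dev/nvmf", O_RDWR);
	if (fd == -1)
		err(1, "/dev/nvmf");

	/* Matched against device_get_nameunit() or cdata->subnqn in nvmf_matches(). */
	if (ioctl(fd, NVMF_DISCONNECT_HOST, &name) == -1)
		err(1, "NVMF_DISCONNECT_HOST");
	close(fd);
	return (0);
}
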
diff --git a/sys/dev/nvmf/host/nvmf_ns.c b/sys/dev/nvmf/host/nvmf_ns.c
new file mode 100644
--- /dev/null
+++ b/sys/dev/nvmf/host/nvmf_ns.c
@@ -0,0 +1,483 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2023-2024 Chelsio Communications, Inc.
+ * Written by: John Baldwin <jhb@FreeBSD.org>
+ */
+
+#include <sys/param.h>
+#include <sys/bio.h>
+#include <sys/bus.h>
+#include <sys/conf.h>
+#include <sys/disk.h>
+#include <sys/fcntl.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/memdesc.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/refcount.h>
+#include <sys/sbuf.h>
+#include <machine/stdarg.h>
+#include <dev/nvme/nvme.h>
+#include <dev/nvmf/host/nvmf_var.h>
+
+struct nvmf_namespace {
+ struct nvmf_softc *sc;
+ uint64_t size;
+ uint32_t id;
+ u_int flags;
+ uint32_t lba_size;
+ bool disconnected;
+
+ TAILQ_HEAD(, bio) pending_bios;
+ struct mtx lock;
+ volatile u_int active_bios;
+
+ struct cdev *cdev;
+};
+
+static void nvmf_ns_strategy(struct bio *bio);
+
+static void
+ns_printf(struct nvmf_namespace *ns, const char *fmt, ...)
+{
+ char buf[128];
+ struct sbuf sb;
+ va_list ap;
+
+ sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN);
+ sbuf_set_drain(&sb, sbuf_printf_drain, NULL);
+
+ sbuf_printf(&sb, "%sns%u: ", device_get_nameunit(ns->sc->dev),
+ ns->id);
+
+ va_start(ap, fmt);
+ sbuf_vprintf(&sb, fmt, ap);
+ va_end(ap);
+
+ sbuf_finish(&sb);
+ sbuf_delete(&sb);
+}
+
+/*
+ * The I/O completion may trigger after the received CQE if the I/O
+ * used a zero-copy mbuf that isn't harvested until after the NIC
+ * driver processes TX completions. Abuse bio_driver1 as a refcount.
+ * Store I/O errors in bio_driver2.
+ */
+static __inline u_int *
+bio_refs(struct bio *bio)
+{
+ return ((u_int *)&bio->bio_driver1);
+}
+
+static void
+nvmf_ns_biodone(struct bio *bio)
+{
+ struct nvmf_namespace *ns;
+ int error;
+
+ if (!refcount_release(bio_refs(bio)))
+ return;
+
+ ns = bio->bio_dev->si_drv1;
+
+ /* If a request is aborted, resubmit or queue it for resubmission. */
+ if (bio->bio_error == ECONNABORTED) {
+ bio->bio_error = 0;
+ bio->bio_driver2 = 0;
+ mtx_lock(&ns->lock);
+ if (ns->disconnected) {
+ TAILQ_INSERT_TAIL(&ns->pending_bios, bio, bio_queue);
+ mtx_unlock(&ns->lock);
+ } else {
+ mtx_unlock(&ns->lock);
+ nvmf_ns_strategy(bio);
+ }
+ } else {
+ /*
+ * I/O errors take precedence over generic EIO from
+ * CQE errors.
+ */
+ error = (intptr_t)bio->bio_driver2;
+ if (error != 0)
+ bio->bio_error = error;
+ if (bio->bio_error != 0)
+ bio->bio_flags |= BIO_ERROR;
+ biodone(bio);
+ }
+
+ if (refcount_release(&ns->active_bios))
+ wakeup(ns);
+}
+
+static void
+nvmf_ns_io_complete(void *arg, size_t xfered, int error)
+{
+ struct bio *bio = arg;
+
+ KASSERT(xfered <= bio->bio_bcount,
+ ("%s: xfered > bio_bcount", __func__));
+
+ bio->bio_driver2 = (void *)(intptr_t)error;
+ bio->bio_resid = bio->bio_bcount - xfered;
+
+ nvmf_ns_biodone(bio);
+}
+
+static void
+nvmf_ns_delete_complete(void *arg, size_t xfered, int error)
+{
+ struct bio *bio = arg;
+
+ if (error != 0)
+ bio->bio_resid = bio->bio_bcount;
+ else
+ bio->bio_resid = 0;
+
+ free(bio->bio_driver2, M_NVMF);
+ bio->bio_driver2 = (void *)(intptr_t)error;
+
+ nvmf_ns_biodone(bio);
+}
+
+static void
+nvmf_ns_bio_complete(void *arg, const struct nvme_completion *cqe)
+{
+ struct bio *bio = arg;
+
+ if (nvmf_cqe_aborted(cqe))
+ bio->bio_error = ECONNABORTED;
+ else if (cqe->status != 0)
+ bio->bio_error = EIO;
+
+ nvmf_ns_biodone(bio);
+}
+
+static int
+nvmf_ns_submit_bio(struct nvmf_namespace *ns, struct bio *bio)
+{
+ struct nvme_command cmd;
+ struct nvmf_request *req;
+ struct nvme_dsm_range *dsm_range;
+ struct memdesc mem;
+ uint64_t lba, lba_count;
+
+ dsm_range = NULL;
+ memset(&cmd, 0, sizeof(cmd));
+ switch (bio->bio_cmd) {
+ case BIO_READ:
+ lba = bio->bio_offset / ns->lba_size;
+ lba_count = bio->bio_bcount / ns->lba_size;
+ nvme_ns_read_cmd(&cmd, ns->id, lba, lba_count);
+ break;
+ case BIO_WRITE:
+ lba = bio->bio_offset / ns->lba_size;
+ lba_count = bio->bio_bcount / ns->lba_size;
+ nvme_ns_write_cmd(&cmd, ns->id, lba, lba_count);
+ break;
+ case BIO_FLUSH:
+ nvme_ns_flush_cmd(&cmd, ns->id);
+ break;
+ case BIO_DELETE:
+ dsm_range = malloc(sizeof(*dsm_range), M_NVMF, M_NOWAIT |
+ M_ZERO);
+ if (dsm_range == NULL)
+ return (ENOMEM);
+ lba = bio->bio_offset / ns->lba_size;
+ lba_count = bio->bio_bcount / ns->lba_size;
+ dsm_range->starting_lba = htole64(lba);
+ dsm_range->length = htole32(lba_count);
+
+ cmd.opc = NVME_OPC_DATASET_MANAGEMENT;
+ cmd.nsid = htole32(ns->id);
+ cmd.cdw10 = htole32(0); /* 1 range */
+ cmd.cdw11 = htole32(NVME_DSM_ATTR_DEALLOCATE);
+ break;
+ default:
+ return (EOPNOTSUPP);
+ }
+
+ mtx_lock(&ns->lock);
+ if (ns->disconnected) {
+ TAILQ_INSERT_TAIL(&ns->pending_bios, bio, bio_queue);
+ mtx_unlock(&ns->lock);
+ free(dsm_range, M_NVMF);
+ return (0);
+ }
+
+ req = nvmf_allocate_request(nvmf_select_io_queue(ns->sc), &cmd,
+ nvmf_ns_bio_complete, bio, M_NOWAIT);
+ if (req == NULL) {
+ mtx_unlock(&ns->lock);
+ free(dsm_range, M_NVMF);
+ return (ENOMEM);
+ }
+
+ switch (bio->bio_cmd) {
+ case BIO_READ:
+ case BIO_WRITE:
+ refcount_init(bio_refs(bio), 2);
+ mem = memdesc_bio(bio);
+ nvmf_capsule_append_data(req->nc, &mem, bio->bio_bcount,
+ bio->bio_cmd == BIO_WRITE, nvmf_ns_io_complete, bio);
+ break;
+ case BIO_DELETE:
+ refcount_init(bio_refs(bio), 2);
+ mem = memdesc_vaddr(dsm_range, sizeof(*dsm_range));
+ nvmf_capsule_append_data(req->nc, &mem, sizeof(*dsm_range),
+ true, nvmf_ns_delete_complete, bio);
+ bio->bio_driver2 = dsm_range;
+ break;
+ default:
+ refcount_init(bio_refs(bio), 1);
+ KASSERT(bio->bio_resid == 0,
+ ("%s: input bio_resid != 0", __func__));
+ break;
+ }
+
+ refcount_acquire(&ns->active_bios);
+ nvmf_submit_request(req);
+ mtx_unlock(&ns->lock);
+ return (0);
+}
+
+static int
+nvmf_ns_ioctl(struct cdev *dev, u_long cmd, caddr_t arg, int flag,
+ struct thread *td)
+{
+ struct nvmf_namespace *ns = dev->si_drv1;
+ struct nvme_get_nsid *gnsid;
+ struct nvme_pt_command *pt;
+
+ switch (cmd) {
+ case NVME_PASSTHROUGH_CMD:
+ pt = (struct nvme_pt_command *)arg;
+ pt->cmd.nsid = htole32(ns->id);
+ return (nvmf_passthrough_cmd(ns->sc, pt, false));
+ case NVME_GET_NSID:
+ gnsid = (struct nvme_get_nsid *)arg;
+ strncpy(gnsid->cdev, device_get_nameunit(ns->sc->dev),
+ sizeof(gnsid->cdev));
+ gnsid->cdev[sizeof(gnsid->cdev) - 1] = '\0';
+ gnsid->nsid = ns->id;
+ return (0);
+ case DIOCGMEDIASIZE:
+ *(off_t *)arg = ns->size;
+ return (0);
+ case DIOCGSECTORSIZE:
+ *(u_int *)arg = ns->lba_size;
+ return (0);
+ default:
+ return (ENOTTY);
+ }
+}
+
+static int
+nvmf_ns_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
+{
+ int error;
+
+ error = 0;
+ if ((oflags & FWRITE) != 0)
+ error = securelevel_gt(td->td_ucred, 0);
+ return (error);
+}
+
+static void
+nvmf_ns_strategy(struct bio *bio)
+{
+ struct nvmf_namespace *ns;
+ int error;
+
+ ns = bio->bio_dev->si_drv1;
+
+ error = nvmf_ns_submit_bio(ns, bio);
+ if (error != 0) {
+ bio->bio_error = error;
+ bio->bio_flags |= BIO_ERROR;
+ bio->bio_resid = bio->bio_bcount;
+ biodone(bio);
+ }
+}
+
+static struct cdevsw nvmf_ns_cdevsw = {
+ .d_version = D_VERSION,
+ .d_flags = D_DISK,
+ .d_open = nvmf_ns_open,
+ .d_read = physread,
+ .d_write = physwrite,
+ .d_strategy = nvmf_ns_strategy,
+ .d_ioctl = nvmf_ns_ioctl
+};
+
+struct nvmf_namespace *
+nvmf_init_ns(struct nvmf_softc *sc, uint32_t id,
+ struct nvme_namespace_data *data)
+{
+ struct make_dev_args mda;
+ struct nvmf_namespace *ns;
+ int error;
+ uint8_t lbads, lbaf;
+
+ ns = malloc(sizeof(*ns), M_NVMF, M_WAITOK | M_ZERO);
+ ns->sc = sc;
+ ns->id = id;
+ TAILQ_INIT(&ns->pending_bios);
+ mtx_init(&ns->lock, "nvmf ns", NULL, MTX_DEF);
+
+ /* One dummy bio avoids dropping to 0 until destroy. */
+ refcount_init(&ns->active_bios, 1);
+
+ if (NVMEV(NVME_NS_DATA_DPS_PIT, data->dps) != 0) {
+ ns_printf(ns, "End-to-end data protection not supported\n");
+ goto fail;
+ }
+
+ lbaf = NVMEV(NVME_NS_DATA_FLBAS_FORMAT, data->flbas);
+ if (lbaf > data->nlbaf) {
+ ns_printf(ns, "Invalid LBA format index\n");
+ goto fail;
+ }
+
+ if (NVMEV(NVME_NS_DATA_LBAF_MS, data->lbaf[lbaf]) != 0) {
+ ns_printf(ns, "Namespaces with metadata are not supported\n");
+ goto fail;
+ }
+
+ lbads = NVMEV(NVME_NS_DATA_LBAF_LBADS, data->lbaf[lbaf]);
+ if (lbads == 0) {
+ ns_printf(ns, "Invalid LBA format index\n");
+ goto fail;
+ }
+
+ ns->lba_size = 1 << lbads;
+ ns->size = data->nsze * ns->lba_size;
+
+ if (nvme_ctrlr_has_dataset_mgmt(sc->cdata))
+ ns->flags |= NVME_NS_DEALLOCATE_SUPPORTED;
+
+ if (NVMEV(NVME_CTRLR_DATA_VWC_PRESENT, sc->cdata->vwc) != 0)
+ ns->flags |= NVME_NS_FLUSH_SUPPORTED;
+
+ /*
+ * XXX: Does any of the boundary splitting for NOIOB make any
+ * sense for Fabrics?
+ */
+
+ make_dev_args_init(&mda);
+ mda.mda_devsw = &nvmf_ns_cdevsw;
+ mda.mda_uid = UID_ROOT;
+ mda.mda_gid = GID_WHEEL;
+ mda.mda_mode = 0600;
+ mda.mda_si_drv1 = ns;
+ error = make_dev_s(&mda, &ns->cdev, "%sns%u",
+ device_get_nameunit(sc->dev), id);
+ if (error != 0)
+ goto fail;
+
+ ns->cdev->si_flags |= SI_UNMAPPED;
+
+ return (ns);
+fail:
+ mtx_destroy(&ns->lock);
+ free(ns, M_NVMF);
+ return (NULL);
+}
+
+void
+nvmf_disconnect_ns(struct nvmf_namespace *ns)
+{
+ mtx_lock(&ns->lock);
+ ns->disconnected = true;
+ mtx_unlock(&ns->lock);
+}
+
+void
+nvmf_reconnect_ns(struct nvmf_namespace *ns)
+{
+ TAILQ_HEAD(, bio) bios;
+ struct bio *bio;
+
+ mtx_lock(&ns->lock);
+ ns->disconnected = false;
+ TAILQ_INIT(&bios);
+ TAILQ_CONCAT(&bios, &ns->pending_bios, bio_queue);
+ mtx_unlock(&ns->lock);
+
+ while (!TAILQ_EMPTY(&bios)) {
+ bio = TAILQ_FIRST(&bios);
+ TAILQ_REMOVE(&bios, bio, bio_queue);
+ nvmf_ns_strategy(bio);
+ }
+}
+
+void
+nvmf_destroy_ns(struct nvmf_namespace *ns)
+{
+ TAILQ_HEAD(, bio) bios;
+ struct bio *bio;
+
+ destroy_dev(ns->cdev);
+
+ /*
+	 * Wait for active I/O requests to drain.  The final release
+	 * drops the reference taken for the "dummy bio" when the
+	 * namespace was created.
+ */
+ mtx_lock(&ns->lock);
+ if (!refcount_release(&ns->active_bios)) {
+ while (ns->active_bios != 0)
+ mtx_sleep(ns, &ns->lock, 0, "nvmfrmns", 0);
+ }
+
+ /* Abort any pending I/O requests. */
+ TAILQ_INIT(&bios);
+ TAILQ_CONCAT(&bios, &ns->pending_bios, bio_queue);
+ mtx_unlock(&ns->lock);
+
+ while (!TAILQ_EMPTY(&bios)) {
+ bio = TAILQ_FIRST(&bios);
+ TAILQ_REMOVE(&bios, bio, bio_queue);
+ bio->bio_error = ECONNABORTED;
+ bio->bio_flags |= BIO_ERROR;
+ bio->bio_resid = bio->bio_bcount;
+ biodone(bio);
+ }
+
+ mtx_destroy(&ns->lock);
+ free(ns, M_NVMF);
+}
+
+bool
+nvmf_update_ns(struct nvmf_namespace *ns, struct nvme_namespace_data *data)
+{
+ uint8_t lbads, lbaf;
+
+ if (NVMEV(NVME_NS_DATA_DPS_PIT, data->dps) != 0) {
+ ns_printf(ns, "End-to-end data protection not supported\n");
+ return (false);
+ }
+
+ lbaf = NVMEV(NVME_NS_DATA_FLBAS_FORMAT, data->flbas);
+ if (lbaf > data->nlbaf) {
+ ns_printf(ns, "Invalid LBA format index\n");
+ return (false);
+ }
+
+ if (NVMEV(NVME_NS_DATA_LBAF_MS, data->lbaf[lbaf]) != 0) {
+ ns_printf(ns, "Namespaces with metadata are not supported\n");
+ return (false);
+ }
+
+ lbads = NVMEV(NVME_NS_DATA_LBAF_LBADS, data->lbaf[lbaf]);
+ if (lbads == 0) {
+ ns_printf(ns, "Invalid LBA format index\n");
+ return (false);
+ }
+
+ ns->lba_size = 1 << lbads;
+ ns->size = data->nsze * ns->lba_size;
+ return (true);
+}
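
For context (not part of this change): the bio_driver1/bio_driver2 scheme
described at the top of this file works out to the following lifecycle for a
read or write request.

/*
 * Illustrative lifecycle of a BIO_READ/BIO_WRITE in this driver:
 *
 *   nvmf_ns_submit_bio():   refcount_init(bio_refs(bio), 2)
 *   nvmf_ns_io_complete():  stores the data-transfer error in bio_driver2,
 *                           drops reference #1
 *   nvmf_ns_bio_complete(): maps the CQE status to bio_error (ECONNABORTED
 *                           for aborted capsules), drops reference #2
 *   nvmf_ns_biodone():      acts only on the final release, so a late
 *                           zero-copy TX completion cannot finish the bio
 *                           before the CQE arrives (or vice versa); aborted
 *                           bios are requeued or resubmitted instead of
 *                           being completed with an error.
 */
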
diff --git a/sys/dev/nvmf/host/nvmf_qpair.c b/sys/dev/nvmf/host/nvmf_qpair.c
new file mode 100644
--- /dev/null
+++ b/sys/dev/nvmf/host/nvmf_qpair.c
@@ -0,0 +1,386 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2023-2024 Chelsio Communications, Inc.
+ * Written by: John Baldwin <jhb@FreeBSD.org>
+ */
+
+#include <sys/types.h>
+#include <sys/bus.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <dev/nvme/nvme.h>
+#include <dev/nvmf/nvmf.h>
+#include <dev/nvmf/nvmf_transport.h>
+#include <dev/nvmf/host/nvmf_var.h>
+
+struct nvmf_host_command {
+ struct nvmf_request *req;
+ TAILQ_ENTRY(nvmf_host_command) link;
+ uint16_t cid;
+};
+
+struct nvmf_host_qpair {
+ struct nvmf_softc *sc;
+ struct nvmf_qpair *qp;
+
+ bool sq_flow_control;
+ bool shutting_down;
+ u_int allocating;
+ u_int num_commands;
+ uint16_t sqhd;
+ uint16_t sqtail;
+
+ struct mtx lock;
+
+ TAILQ_HEAD(, nvmf_host_command) free_commands;
+ STAILQ_HEAD(, nvmf_request) pending_requests;
+
+ /* Indexed by cid. */
+ struct nvmf_host_command **active_commands;
+
+ char name[16];
+};
+
+struct nvmf_request *
+nvmf_allocate_request(struct nvmf_host_qpair *qp, void *sqe,
+ nvmf_request_complete_t *cb, void *cb_arg, int how)
+{
+ struct nvmf_request *req;
+ struct nvmf_qpair *nq;
+
+ KASSERT(how == M_WAITOK || how == M_NOWAIT,
+ ("%s: invalid how", __func__));
+
+ req = malloc(sizeof(*req), M_NVMF, how | M_ZERO);
+ if (req == NULL)
+ return (NULL);
+
+ mtx_lock(&qp->lock);
+ nq = qp->qp;
+ if (nq == NULL) {
+ mtx_unlock(&qp->lock);
+ free(req, M_NVMF);
+ return (NULL);
+ }
+ qp->allocating++;
+ MPASS(qp->allocating != 0);
+ mtx_unlock(&qp->lock);
+
+ req->qp = qp;
+ req->cb = cb;
+ req->cb_arg = cb_arg;
+ req->nc = nvmf_allocate_command(nq, sqe, how);
+ if (req->nc == NULL) {
+ free(req, M_NVMF);
+ req = NULL;
+ }
+
+ mtx_lock(&qp->lock);
+ qp->allocating--;
+ if (qp->allocating == 0 && qp->shutting_down)
+ wakeup(qp);
+ mtx_unlock(&qp->lock);
+
+ return (req);
+}
+
+static void
+nvmf_abort_request(struct nvmf_request *req, uint16_t cid)
+{
+ struct nvme_completion cqe;
+
+ memset(&cqe, 0, sizeof(cqe));
+ cqe.cid = cid;
+ cqe.status = htole16(NVMEF(NVME_STATUS_SCT, NVME_SCT_PATH_RELATED) |
+ NVMEF(NVME_STATUS_SC, NVME_SC_COMMAND_ABORTED_BY_HOST));
+ req->cb(req->cb_arg, &cqe);
+}
+
+void
+nvmf_free_request(struct nvmf_request *req)
+{
+ if (req->nc != NULL)
+ nvmf_free_capsule(req->nc);
+ free(req, M_NVMF);
+}
+
+static void
+nvmf_dispatch_command(struct nvmf_host_qpair *qp, struct nvmf_host_command *cmd)
+{
+ struct nvmf_softc *sc = qp->sc;
+ struct nvme_command *sqe;
+ struct nvmf_capsule *nc;
+ int error;
+
+ nc = cmd->req->nc;
+ sqe = nvmf_capsule_sqe(nc);
+
+ /*
+ * NB: Don't bother byte-swapping the cid so that receive
+ * doesn't have to swap.
+ */
+ sqe->cid = cmd->cid;
+
+ error = nvmf_transmit_capsule(nc);
+ if (error != 0) {
+ device_printf(sc->dev,
+ "failed to transmit capsule: %d, disconnecting\n", error);
+ nvmf_disconnect(sc);
+ return;
+ }
+
+ if (sc->ka_traffic)
+ atomic_store_int(&sc->ka_active_tx_traffic, 1);
+}
+
+static void
+nvmf_qp_error(void *arg, int error)
+{
+ struct nvmf_host_qpair *qp = arg;
+ struct nvmf_softc *sc = qp->sc;
+
+ /* Ignore simple close of queue pairs during shutdown. */
+ if (!(sc->detaching && error == 0))
+ device_printf(sc->dev, "error %d on %s, disconnecting\n", error,
+ qp->name);
+ nvmf_disconnect(sc);
+}
+
+static void
+nvmf_receive_capsule(void *arg, struct nvmf_capsule *nc)
+{
+ struct nvmf_host_qpair *qp = arg;
+ struct nvmf_softc *sc = qp->sc;
+ struct nvmf_host_command *cmd;
+ struct nvmf_request *req;
+ const struct nvme_completion *cqe;
+ uint16_t cid;
+
+ cqe = nvmf_capsule_cqe(nc);
+
+ if (sc->ka_traffic)
+ atomic_store_int(&sc->ka_active_rx_traffic, 1);
+
+ /*
+ * NB: Don't bother byte-swapping the cid as transmit doesn't
+ * swap either.
+ */
+ cid = cqe->cid;
+
+ if (cid > qp->num_commands) {
+ device_printf(sc->dev,
+ "received invalid CID %u, disconnecting\n", cid);
+ nvmf_disconnect(sc);
+ nvmf_free_capsule(nc);
+ return;
+ }
+
+ /*
+ * If the queue has been shutdown due to an error, silently
+ * drop the response.
+ */
+ mtx_lock(&qp->lock);
+ if (qp->qp == NULL) {
+ device_printf(sc->dev,
+ "received completion for CID %u on shutdown %s\n", cid,
+ qp->name);
+ mtx_unlock(&qp->lock);
+ nvmf_free_capsule(nc);
+ return;
+ }
+
+ cmd = qp->active_commands[cid];
+ if (cmd == NULL) {
+ mtx_unlock(&qp->lock);
+ device_printf(sc->dev,
+ "received completion for inactive CID %u, disconnecting\n",
+ cid);
+ nvmf_disconnect(sc);
+ nvmf_free_capsule(nc);
+ return;
+ }
+
+ KASSERT(cmd->cid == cid, ("%s: CID mismatch", __func__));
+ req = cmd->req;
+ cmd->req = NULL;
+ if (STAILQ_EMPTY(&qp->pending_requests)) {
+ qp->active_commands[cid] = NULL;
+ TAILQ_INSERT_TAIL(&qp->free_commands, cmd, link);
+ mtx_unlock(&qp->lock);
+ } else {
+ cmd->req = STAILQ_FIRST(&qp->pending_requests);
+ STAILQ_REMOVE_HEAD(&qp->pending_requests, link);
+ mtx_unlock(&qp->lock);
+ nvmf_dispatch_command(qp, cmd);
+ }
+
+ req->cb(req->cb_arg, cqe);
+ nvmf_free_capsule(nc);
+ nvmf_free_request(req);
+}
+
+struct nvmf_host_qpair *
+nvmf_init_qp(struct nvmf_softc *sc, enum nvmf_trtype trtype,
+ struct nvmf_handoff_qpair_params *handoff, const char *name)
+{
+ struct nvmf_host_command *cmd, *ncmd;
+ struct nvmf_host_qpair *qp;
+ u_int i;
+
+ qp = malloc(sizeof(*qp), M_NVMF, M_WAITOK | M_ZERO);
+ qp->sc = sc;
+ qp->sq_flow_control = handoff->sq_flow_control;
+ qp->sqhd = handoff->sqhd;
+ qp->sqtail = handoff->sqtail;
+ strlcpy(qp->name, name, sizeof(qp->name));
+ mtx_init(&qp->lock, "nvmf qp", NULL, MTX_DEF);
+
+ /*
+ * Allocate a spare command slot for each pending AER command
+ * on the admin queue.
+ */
+ qp->num_commands = handoff->qsize - 1;
+ if (handoff->admin)
+ qp->num_commands += sc->num_aer;
+
+ qp->active_commands = malloc(sizeof(*qp->active_commands) *
+ qp->num_commands, M_NVMF, M_WAITOK | M_ZERO);
+ TAILQ_INIT(&qp->free_commands);
+ for (i = 0; i < qp->num_commands; i++) {
+ cmd = malloc(sizeof(*cmd), M_NVMF, M_WAITOK | M_ZERO);
+ cmd->cid = i;
+ TAILQ_INSERT_TAIL(&qp->free_commands, cmd, link);
+ }
+ STAILQ_INIT(&qp->pending_requests);
+
+ qp->qp = nvmf_allocate_qpair(trtype, false, handoff, nvmf_qp_error,
+ qp, nvmf_receive_capsule, qp);
+ if (qp->qp == NULL) {
+ TAILQ_FOREACH_SAFE(cmd, &qp->free_commands, link, ncmd) {
+ TAILQ_REMOVE(&qp->free_commands, cmd, link);
+ free(cmd, M_NVMF);
+ }
+ free(qp->active_commands, M_NVMF);
+ mtx_destroy(&qp->lock);
+ free(qp, M_NVMF);
+ return (NULL);
+ }
+
+ return (qp);
+}
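/*
 * Worked example for the slot sizing above (hypothetical numbers, not
 * taken from this patch): an admin queue handed off with qsize = 32
 * and sc->num_aer = 4 ends up with
 *
 *	num_commands = (32 - 1) + 4 = 35
 *
 * command slots (CIDs 0 through 34), so the long-lived outstanding AER
 * commands never starve regular admin commands of a free slot.
 */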
+
+void
+nvmf_shutdown_qp(struct nvmf_host_qpair *qp)
+{
+ struct nvmf_host_command *cmd;
+ struct nvmf_request *req;
+ struct nvmf_qpair *nq;
+
+ mtx_lock(&qp->lock);
+ nq = qp->qp;
+ qp->qp = NULL;
+
+ if (nq == NULL) {
+ while (qp->shutting_down)
+ mtx_sleep(qp, &qp->lock, 0, "nvmfqpsh", 0);
+ mtx_unlock(&qp->lock);
+ return;
+ }
+ qp->shutting_down = true;
+ while (qp->allocating != 0)
+ mtx_sleep(qp, &qp->lock, 0, "nvmfqpqu", 0);
+ mtx_unlock(&qp->lock);
+
+ nvmf_free_qpair(nq);
+
+ /*
+ * Abort outstanding requests. Active requests will have
+ * their I/O completions invoked and associated capsules freed
+ * by the transport layer via nvmf_free_qpair. Pending
+ * requests must have their I/O completion invoked via
+ * nvmf_abort_capsule_data.
+ */
+ for (u_int i = 0; i < qp->num_commands; i++) {
+ cmd = qp->active_commands[i];
+ if (cmd != NULL) {
+ if (!cmd->req->aer)
+ printf("%s: aborted active command %p (CID %u)\n",
+ __func__, cmd->req, cmd->cid);
+
+ /* This was freed by nvmf_free_qpair. */
+ cmd->req->nc = NULL;
+ nvmf_abort_request(cmd->req, cmd->cid);
+ nvmf_free_request(cmd->req);
+ free(cmd, M_NVMF);
+ }
+ }
+ while (!STAILQ_EMPTY(&qp->pending_requests)) {
+ req = STAILQ_FIRST(&qp->pending_requests);
+ STAILQ_REMOVE_HEAD(&qp->pending_requests, link);
+ if (!req->aer)
+ printf("%s: aborted pending command %p\n", __func__,
+ req);
+ nvmf_abort_capsule_data(req->nc, ECONNABORTED);
+ nvmf_abort_request(req, 0);
+ nvmf_free_request(req);
+ }
+
+ mtx_lock(&qp->lock);
+ qp->shutting_down = false;
+ mtx_unlock(&qp->lock);
+ wakeup(qp);
+}
+
+void
+nvmf_destroy_qp(struct nvmf_host_qpair *qp)
+{
+ struct nvmf_host_command *cmd, *ncmd;
+
+ nvmf_shutdown_qp(qp);
+
+ TAILQ_FOREACH_SAFE(cmd, &qp->free_commands, link, ncmd) {
+ TAILQ_REMOVE(&qp->free_commands, cmd, link);
+ free(cmd, M_NVMF);
+ }
+ free(qp->active_commands, M_NVMF);
+ mtx_destroy(&qp->lock);
+ free(qp, M_NVMF);
+}
+
+void
+nvmf_submit_request(struct nvmf_request *req)
+{
+ struct nvmf_host_qpair *qp;
+ struct nvmf_host_command *cmd;
+
+ qp = req->qp;
+ mtx_lock(&qp->lock);
+ if (qp->qp == NULL) {
+ mtx_unlock(&qp->lock);
+ printf("%s: aborted pending command %p\n", __func__, req);
+ nvmf_abort_capsule_data(req->nc, ECONNABORTED);
+ nvmf_abort_request(req, 0);
+ nvmf_free_request(req);
+ return;
+ }
+ cmd = TAILQ_FIRST(&qp->free_commands);
+ if (cmd == NULL) {
+ /*
+ * Queue this request. Will be sent after enough
+ * in-flight requests have completed.
+ */
+ STAILQ_INSERT_TAIL(&qp->pending_requests, req, link);
+ mtx_unlock(&qp->lock);
+ return;
+ }
+
+ TAILQ_REMOVE(&qp->free_commands, cmd, link);
+ KASSERT(qp->active_commands[cmd->cid] == NULL,
+ ("%s: CID already busy", __func__));
+ qp->active_commands[cmd->cid] = cmd;
+ cmd->req = req;
+ mtx_unlock(&qp->lock);
+ nvmf_dispatch_command(qp, cmd);
+}
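For orientation, a minimal caller-side sketch of the request lifecycle this file implements: allocate a request against a queue pair, submit it, and consume the CQE in the completion callback. The example_* names and the wakeup-based wait are illustrative assumptions (the sketch also assumes the declarations from nvmf_var.h below); in the driver itself the nvmf_complete()/nvmf_wait_for_reply() helpers declared there play this role.

struct example_result {
	struct nvme_completion cqe;	/* filled in by the callback */
	bool done;
};

static void
example_done(void *arg, const struct nvme_completion *cqe)
{
	struct example_result *res = arg;

	/* Invoked from nvmf_receive_capsule() when the CQE arrives. */
	res->cqe = *cqe;
	res->done = true;
	wakeup(res);
}

static int
example_submit_sqe(struct nvmf_host_qpair *qp, struct nvme_command *sqe,
    struct example_result *res)
{
	struct nvmf_request *req;

	res->done = false;
	req = nvmf_allocate_request(qp, sqe, example_done, res, M_WAITOK);
	if (req == NULL)
		return (ECONNABORTED);	/* queue pair is being shut down */

	/*
	 * Ownership of req passes to the queue pair here; the driver
	 * frees it after the completion (or abort) callback runs.
	 */
	nvmf_submit_request(req);
	return (0);
}

The caller would then msleep() on res until done is set; if the association is torn down first, the callback still fires with the synthesized "command aborted by host" CQE built by nvmf_abort_request() above.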
diff --git a/sys/dev/nvmf/host/nvmf_sim.c b/sys/dev/nvmf/host/nvmf_sim.c
new file mode 100644
--- /dev/null
+++ b/sys/dev/nvmf/host/nvmf_sim.c
@@ -0,0 +1,332 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2023-2024 Chelsio Communications, Inc.
+ * Written by: John Baldwin <jhb@FreeBSD.org>
+ */
+
+#include <sys/types.h>
+#include <sys/malloc.h>
+#include <sys/memdesc.h>
+#include <sys/refcount.h>
+
+#include <cam/cam.h>
+#include <cam/cam_ccb.h>
+#include <cam/cam_sim.h>
+#include <cam/cam_xpt_sim.h>
+#include <cam/cam_debug.h>
+
+#include <dev/nvmf/host/nvmf_var.h>
+
+/*
+ * The I/O completion may trigger after the received CQE if the I/O
+ * used a zero-copy mbuf that isn't harvested until after the NIC
+ * driver processes TX completions. Use spriv_field0 as a refcount.
+ *
+ * Store any I/O error returned in spriv_field1.
+ */
+static __inline u_int *
+ccb_refs(union ccb *ccb)
+{
+ return ((u_int *)&ccb->ccb_h.spriv_field0);
+}
+
+#define spriv_ioerror spriv_field1
+
+static void
+nvmf_ccb_done(union ccb *ccb)
+{
+ if (!refcount_release(ccb_refs(ccb)))
+ return;
+
+ if (nvmf_cqe_aborted(&ccb->nvmeio.cpl)) {
+ ccb->ccb_h.status = CAM_REQUEUE_REQ;
+ xpt_done(ccb);
+ } else if (ccb->nvmeio.cpl.status != 0) {
+ ccb->ccb_h.status = CAM_NVME_STATUS_ERROR;
+ xpt_done(ccb);
+ } else if (ccb->ccb_h.spriv_ioerror != 0) {
+ KASSERT(ccb->ccb_h.spriv_ioerror != EJUSTRETURN,
+ ("%s: zero sized transfer without CQE error", __func__));
+ ccb->ccb_h.status = CAM_REQ_CMP_ERR;
+ xpt_done(ccb);
+ } else {
+ ccb->ccb_h.status = CAM_REQ_CMP;
+ xpt_done_direct(ccb);
+ }
+}
+
+static void
+nvmf_ccb_io_complete(void *arg, size_t xfered, int error)
+{
+ union ccb *ccb = arg;
+
+ /*
+ * TODO: Reporting partial completions requires extending
+ * nvmeio to support resid and updating nda to handle partial
+ * reads, either by returning partial success (or an error) to
+ * the caller, or retrying all or part of the request.
+ */
+ ccb->ccb_h.spriv_ioerror = error;
+ if (error == 0) {
+ if (xfered == 0) {
+#ifdef INVARIANTS
+ /*
+ * If the request fails with an error in the CQE
+ * there will be no data transferred but also no
+ * I/O error.
+ */
+ ccb->ccb_h.spriv_ioerror = EJUSTRETURN;
+#endif
+ } else
+ KASSERT(xfered == ccb->nvmeio.dxfer_len,
+ ("%s: partial CCB completion", __func__));
+ }
+
+ nvmf_ccb_done(ccb);
+}
+
+static void
+nvmf_ccb_complete(void *arg, const struct nvme_completion *cqe)
+{
+ union ccb *ccb = arg;
+
+ ccb->nvmeio.cpl = *cqe;
+ nvmf_ccb_done(ccb);
+}
+
+static void
+nvmf_sim_io(struct nvmf_softc *sc, union ccb *ccb)
+{
+ struct ccb_nvmeio *nvmeio = &ccb->nvmeio;
+ struct memdesc mem;
+ struct nvmf_request *req;
+ struct nvmf_host_qpair *qp;
+
+ mtx_lock(&sc->sim_mtx);
+ if (sc->sim_disconnected) {
+ mtx_unlock(&sc->sim_mtx);
+ nvmeio->ccb_h.status = CAM_REQUEUE_REQ;
+ xpt_done(ccb);
+ return;
+ }
+ if (nvmeio->ccb_h.func_code == XPT_NVME_IO)
+ qp = nvmf_select_io_queue(sc);
+ else
+ qp = sc->admin;
+ req = nvmf_allocate_request(qp, &nvmeio->cmd, nvmf_ccb_complete,
+ ccb, M_NOWAIT);
+ if (req == NULL) {
+ mtx_unlock(&sc->sim_mtx);
+ nvmeio->ccb_h.status = CAM_RESRC_UNAVAIL;
+ xpt_done(ccb);
+ return;
+ }
+
+ if (nvmeio->dxfer_len != 0) {
+ refcount_init(ccb_refs(ccb), 2);
+ mem = memdesc_ccb(ccb);
+ nvmf_capsule_append_data(req->nc, &mem, nvmeio->dxfer_len,
+ (ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_OUT,
+ nvmf_ccb_io_complete, ccb);
+ } else
+ refcount_init(ccb_refs(ccb), 1);
+
+ /*
+ * Clear spriv_ioerror as it can hold an earlier error if this
+ * CCB was aborted and has been retried.
+ */
+ ccb->ccb_h.spriv_ioerror = 0;
+ KASSERT(ccb->ccb_h.status == CAM_REQ_INPROG,
+ ("%s: incoming CCB is not in-progress", __func__));
+ ccb->ccb_h.status |= CAM_SIM_QUEUED;
+ nvmf_submit_request(req);
+ mtx_unlock(&sc->sim_mtx);
+}
+
+static void
+nvmf_sim_action(struct cam_sim *sim, union ccb *ccb)
+{
+ struct nvmf_softc *sc = cam_sim_softc(sim);
+
+ CAM_DEBUG(ccb->ccb_h.path, CAM_DEBUG_TRACE,
+ ("nvmf_sim_action: func= %#x\n",
+ ccb->ccb_h.func_code));
+
+ switch (ccb->ccb_h.func_code) {
+ case XPT_PATH_INQ: /* Path routing inquiry */
+ {
+ struct ccb_pathinq *cpi = &ccb->cpi;
+
+ cpi->version_num = 1;
+ cpi->hba_inquiry = 0;
+ cpi->target_sprt = 0;
+ cpi->hba_misc = PIM_UNMAPPED | PIM_NOSCAN;
+ cpi->hba_eng_cnt = 0;
+ cpi->max_target = 0;
+ cpi->max_lun = sc->cdata->nn;
+ cpi->async_flags = 0;
+ cpi->hpath_id = 0;
+ cpi->initiator_id = 0;
+ strlcpy(cpi->sim_vid, "FreeBSD", SIM_IDLEN);
+ strlcpy(cpi->hba_vid, "NVMeoF", HBA_IDLEN);
+ strlcpy(cpi->dev_name, cam_sim_name(sim), DEV_IDLEN);
+ cpi->unit_number = cam_sim_unit(sim);
+ cpi->bus_id = 0;
+
+ /* XXX: Same as iSCSI. */
+ cpi->base_transfer_speed = 150000;
+ cpi->protocol = PROTO_NVME;
+ cpi->protocol_version = sc->vs;
+ cpi->transport = XPORT_NVMF;
+ cpi->transport_version = sc->vs;
+ cpi->xport_specific.nvmf.nsid =
+ xpt_path_lun_id(ccb->ccb_h.path);
+ cpi->xport_specific.nvmf.trtype = sc->trtype;
+ strncpy(cpi->xport_specific.nvmf.dev_name,
+ device_get_nameunit(sc->dev),
+ sizeof(cpi->xport_specific.nvmf.dev_name));
+ cpi->maxio = sc->max_xfer_size;
+ cpi->hba_vendor = 0;
+ cpi->hba_device = 0;
+ cpi->hba_subvendor = 0;
+ cpi->hba_subdevice = 0;
+ cpi->ccb_h.status = CAM_REQ_CMP;
+ break;
+ }
+ case XPT_GET_TRAN_SETTINGS: /* Get transport settings */
+ {
+ struct ccb_trans_settings *cts = &ccb->cts;
+ struct ccb_trans_settings_nvme *nvme;
+ struct ccb_trans_settings_nvmf *nvmf;
+
+ cts->protocol = PROTO_NVME;
+ cts->protocol_version = sc->vs;
+ cts->transport = XPORT_NVMF;
+ cts->transport_version = sc->vs;
+
+ nvme = &cts->proto_specific.nvme;
+ nvme->valid = CTS_NVME_VALID_SPEC;
+ nvme->spec = sc->vs;
+
+ nvmf = &cts->xport_specific.nvmf;
+ nvmf->valid = CTS_NVMF_VALID_TRTYPE;
+ nvmf->trtype = sc->trtype;
+ cts->ccb_h.status = CAM_REQ_CMP;
+ break;
+ }
+ case XPT_SET_TRAN_SETTINGS: /* Set transport settings */
+ /*
+ * No transfer settings can be set, but nvme_xpt sends
+ * this anyway.
+ */
+ ccb->ccb_h.status = CAM_REQ_CMP;
+ break;
+ case XPT_NVME_IO: /* Execute the requested I/O */
+ case XPT_NVME_ADMIN: /* or Admin operation */
+ nvmf_sim_io(sc, ccb);
+ return;
+ default:
+ /* XXX */
+ device_printf(sc->dev, "unhandled sim function %#x\n",
+ ccb->ccb_h.func_code);
+ ccb->ccb_h.status = CAM_REQ_INVALID;
+ break;
+ }
+ xpt_done(ccb);
+}
+
+int
+nvmf_init_sim(struct nvmf_softc *sc)
+{
+ struct cam_devq *devq;
+ int max_trans;
+
+ max_trans = sc->max_pending_io * 3 / 4;
+ devq = cam_simq_alloc(max_trans);
+ if (devq == NULL) {
+ device_printf(sc->dev, "Failed to allocate CAM simq\n");
+ return (ENOMEM);
+ }
+
+ mtx_init(&sc->sim_mtx, "nvmf sim", NULL, MTX_DEF);
+ sc->sim = cam_sim_alloc(nvmf_sim_action, NULL, "nvme", sc,
+ device_get_unit(sc->dev), NULL, max_trans, max_trans, devq);
+ if (sc->sim == NULL) {
+ device_printf(sc->dev, "Failed to allocate CAM sim\n");
+ cam_simq_free(devq);
+ mtx_destroy(&sc->sim_mtx);
+ return (ENXIO);
+ }
+ if (xpt_bus_register(sc->sim, sc->dev, 0) != CAM_SUCCESS) {
+ device_printf(sc->dev, "Failed to create CAM bus\n");
+ cam_sim_free(sc->sim, TRUE);
+ mtx_destroy(&sc->sim_mtx);
+ return (ENXIO);
+ }
+ if (xpt_create_path(&sc->path, NULL, cam_sim_path(sc->sim),
+ CAM_TARGET_WILDCARD, CAM_LUN_WILDCARD) != CAM_REQ_CMP) {
+ device_printf(sc->dev, "Failed to create CAM path\n");
+ xpt_bus_deregister(cam_sim_path(sc->sim));
+ cam_sim_free(sc->sim, TRUE);
+ mtx_destroy(&sc->sim_mtx);
+ return (ENXIO);
+ }
+ return (0);
+}
+
+void
+nvmf_sim_rescan_ns(struct nvmf_softc *sc, uint32_t id)
+{
+ union ccb *ccb;
+
+ ccb = xpt_alloc_ccb_nowait();
+ if (ccb == NULL) {
+ device_printf(sc->dev,
+ "unable to alloc CCB for rescan of namespace %u\n", id);
+ return;
+ }
+
+ /*
+ * As with nvme_sim, map NVMe namespace IDs onto CAM unit
+ * LUNs.
+ */
+ if (xpt_create_path(&ccb->ccb_h.path, NULL, cam_sim_path(sc->sim), 0,
+ id) != CAM_REQ_CMP) {
+ device_printf(sc->dev,
+ "Unable to create path for rescan of namespace %u\n", id);
+ xpt_free_ccb(ccb);
+ return;
+ }
+ xpt_rescan(ccb);
+}
+
+void
+nvmf_disconnect_sim(struct nvmf_softc *sc)
+{
+ mtx_lock(&sc->sim_mtx);
+ sc->sim_disconnected = true;
+ xpt_freeze_simq(sc->sim, 1);
+ mtx_unlock(&sc->sim_mtx);
+}
+
+void
+nvmf_reconnect_sim(struct nvmf_softc *sc)
+{
+ mtx_lock(&sc->sim_mtx);
+ sc->sim_disconnected = false;
+ mtx_unlock(&sc->sim_mtx);
+ xpt_release_simq(sc->sim, 1);
+}
+
+void
+nvmf_destroy_sim(struct nvmf_softc *sc)
+{
+ xpt_async(AC_LOST_DEVICE, sc->path, NULL);
+ if (sc->sim_disconnected)
+ xpt_release_simq(sc->sim, 1);
+ xpt_free_path(sc->path);
+ xpt_bus_deregister(cam_sim_path(sc->sim));
+ cam_sim_free(sc->sim, TRUE);
+ mtx_destroy(&sc->sim_mtx);
+}
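The CCB reference counting in this file admits either completion order; a summary timeline for a data-bearing CCB (this restates the code above rather than adding behavior):

/*
 *   nvmf_sim_io():           refcount_init(ccb_refs(ccb), 2)
 *   data transfer finishes:  nvmf_ccb_io_complete() -> nvmf_ccb_done()
 *                            releases 2 -> 1 and returns early
 *   CQE arrives:             nvmf_ccb_complete() -> nvmf_ccb_done()
 *                            releases 1 -> 0, inspects cpl and
 *                            spriv_ioerror, then calls xpt_done()
 *
 * The two events may occur in either order (e.g. a zero-copy transmit
 * harvested after the CQE); only the final release completes the CCB.
 * CCBs without a data buffer start at a count of 1 and complete on the
 * CQE alone.
 */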
diff --git a/sys/dev/nvmf/host/nvmf_var.h b/sys/dev/nvmf/host/nvmf_var.h
new file mode 100644
--- /dev/null
+++ b/sys/dev/nvmf/host/nvmf_var.h
@@ -0,0 +1,208 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2023-2024 Chelsio Communications, Inc.
+ * Written by: John Baldwin <jhb@FreeBSD.org>
+ */
+
+#ifndef __NVMF_VAR_H__
+#define __NVMF_VAR_H__
+
+#include <sys/_callout.h>
+#include <sys/_lock.h>
+#include <sys/_mutex.h>
+#include <sys/_sx.h>
+#include <sys/_task.h>
+#include <sys/queue.h>
+#include <dev/nvme/nvme.h>
+#include <dev/nvmf/nvmf_transport.h>
+
+struct nvmf_aer;
+struct nvmf_capsule;
+struct nvmf_host_qpair;
+struct nvmf_namespace;
+
+typedef void nvmf_request_complete_t(void *, const struct nvme_completion *);
+
+struct nvmf_ivars {
+ struct nvmf_handoff_host *hh;
+ struct nvmf_handoff_qpair_params *io_params;
+ struct nvme_controller_data *cdata;
+};
+
+struct nvmf_softc {
+ device_t dev;
+
+ struct nvmf_host_qpair *admin;
+ struct nvmf_host_qpair **io;
+ u_int num_io_queues;
+ enum nvmf_trtype trtype;
+
+ struct cam_sim *sim;
+ struct cam_path *path;
+ struct mtx sim_mtx;
+ bool sim_disconnected;
+
+ struct nvmf_namespace **ns;
+
+ struct nvme_controller_data *cdata;
+ uint64_t cap;
+ uint32_t vs;
+ u_int max_pending_io;
+ u_long max_xfer_size;
+
+ struct cdev *cdev;
+
+ /*
+ * Keep Alive support depends on two timers. The 'tx' timer
+ * is responsible for sending KeepAlive commands and runs at
+ * half the timeout interval. The 'rx' timer is responsible
+ * for detecting an actual timeout.
+ *
+ * For efficient support of TKAS, the host does not reschedule
+ * these timers every time new commands are scheduled.
+ * Instead, the host sets the *_traffic flags when commands
+ * are sent and received. The timeout handlers check and
+ * clear these flags. This does mean it can take up to twice
+ * the timeout time to detect an AWOL controller.
+ */
+ bool ka_traffic; /* Using TKAS? */
+
+ volatile int ka_active_tx_traffic;
+ struct callout ka_tx_timer;
+ sbintime_t ka_tx_sbt;
+
+ volatile int ka_active_rx_traffic;
+ struct callout ka_rx_timer;
+ sbintime_t ka_rx_sbt;
+
+ struct sx connection_lock;
+ struct task disconnect_task;
+ bool detaching;
+
+ u_int num_aer;
+ struct nvmf_aer *aer;
+};
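/*
 * Sketch of the two keep-alive handlers the comment above describes.
 * The real handlers live in nvmf.c (outside this hunk); the function
 * names, the discard callback, and the callout rescheduling below are
 * illustrative assumptions, not the driver's actual implementation.
 */
static void
example_ka_complete(void *arg, const struct nvme_completion *cqe)
{
	/* The KeepAlive completion itself needs no processing. */
}

static void
example_ka_tx_timeout(void *arg)
{
	struct nvmf_softc *sc = arg;

	/* Only send an explicit KeepAlive if nothing else went out. */
	if (atomic_load_int(&sc->ka_active_tx_traffic) == 0)
		(void)nvmf_cmd_keep_alive(sc, example_ka_complete, NULL,
		    M_NOWAIT);
	else
		atomic_store_int(&sc->ka_active_tx_traffic, 0);
	callout_reset_sbt(&sc->ka_tx_timer, sc->ka_tx_sbt, 0,
	    example_ka_tx_timeout, sc, C_HARDCLOCK);
}

static void
example_ka_rx_timeout(void *arg)
{
	struct nvmf_softc *sc = arg;

	if (atomic_load_int(&sc->ka_active_rx_traffic) == 0) {
		/* Nothing received for a full interval: controller AWOL. */
		device_printf(sc->dev, "KeepAlive timeout, disconnecting\n");
		nvmf_disconnect(sc);
		return;
	}
	atomic_store_int(&sc->ka_active_rx_traffic, 0);
	callout_reset_sbt(&sc->ka_rx_timer, sc->ka_rx_sbt, 0,
	    example_ka_rx_timeout, sc, C_HARDCLOCK);
}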
+
+struct nvmf_request {
+ struct nvmf_host_qpair *qp;
+ struct nvmf_capsule *nc;
+ nvmf_request_complete_t *cb;
+ void *cb_arg;
+ bool aer;
+
+ STAILQ_ENTRY(nvmf_request) link;
+};
+
+struct nvmf_completion_status {
+ struct nvme_completion cqe;
+ bool done;
+ bool io_done;
+ int io_error;
+};
+
+static __inline struct nvmf_host_qpair *
+nvmf_select_io_queue(struct nvmf_softc *sc)
+{
+ /* TODO: Support multiple queues? */
+ return (sc->io[0]);
+}
+
+static __inline bool
+nvmf_cqe_aborted(const struct nvme_completion *cqe)
+{
+ uint16_t status;
+
+ status = le16toh(cqe->status);
+ return (NVME_STATUS_GET_SCT(status) == NVME_SCT_PATH_RELATED &&
+ NVME_STATUS_GET_SC(status) == NVME_SC_COMMAND_ABORTED_BY_HOST);
+}
+
+static __inline void
+nvmf_status_init(struct nvmf_completion_status *status)
+{
+ status->done = false;
+ status->io_done = true;
+ status->io_error = 0;
+}
+
+static __inline void
+nvmf_status_wait_io(struct nvmf_completion_status *status)
+{
+ status->io_done = false;
+}
+
+#ifdef DRIVER_MODULE
+extern driver_t nvme_nvmf_driver;
+#endif
+
+#ifdef MALLOC_DECLARE
+MALLOC_DECLARE(M_NVMF);
+#endif
+
+/* nvmf.c */
+void nvmf_complete(void *arg, const struct nvme_completion *cqe);
+void nvmf_io_complete(void *arg, size_t xfered, int error);
+void nvmf_wait_for_reply(struct nvmf_completion_status *status);
+int nvmf_init_ivars(struct nvmf_ivars *ivars, struct nvmf_handoff_host *hh);
+void nvmf_free_ivars(struct nvmf_ivars *ivars);
+void nvmf_disconnect(struct nvmf_softc *sc);
+void nvmf_rescan_ns(struct nvmf_softc *sc, uint32_t nsid);
+int nvmf_passthrough_cmd(struct nvmf_softc *sc, struct nvme_pt_command *pt,
+ bool admin);
+
+/* nvmf_aer.c */
+void nvmf_init_aer(struct nvmf_softc *sc);
+int nvmf_start_aer(struct nvmf_softc *sc);
+void nvmf_destroy_aer(struct nvmf_softc *sc);
+
+/* nvmf_cmd.c */
+bool nvmf_cmd_get_property(struct nvmf_softc *sc, uint32_t offset,
+ uint8_t size, nvmf_request_complete_t *cb, void *cb_arg, int how);
+bool nvmf_cmd_set_property(struct nvmf_softc *sc, uint32_t offset,
+ uint8_t size, uint64_t value, nvmf_request_complete_t *cb, void *cb_arg,
+ int how);
+bool nvmf_cmd_keep_alive(struct nvmf_softc *sc, nvmf_request_complete_t *cb,
+ void *cb_arg, int how);
+bool nvmf_cmd_identify_active_namespaces(struct nvmf_softc *sc, uint32_t id,
+ struct nvme_ns_list *nslist, nvmf_request_complete_t *req_cb,
+ void *req_cb_arg, nvmf_io_complete_t *io_cb, void *io_cb_arg, int how);
+bool nvmf_cmd_identify_namespace(struct nvmf_softc *sc, uint32_t id,
+ struct nvme_namespace_data *nsdata, nvmf_request_complete_t *req_cb,
+ void *req_cb_arg, nvmf_io_complete_t *io_cb, void *io_cb_arg, int how);
+bool nvmf_cmd_get_log_page(struct nvmf_softc *sc, uint32_t nsid, uint8_t lid,
+ uint64_t offset, void *buf, size_t len, nvmf_request_complete_t *req_cb,
+ void *req_cb_arg, nvmf_io_complete_t *io_cb, void *io_cb_arg, int how);
+
+/* nvmf_ctldev.c */
+int nvmf_ctl_load(void);
+void nvmf_ctl_unload(void);
+
+/* nvmf_ns.c */
+struct nvmf_namespace *nvmf_init_ns(struct nvmf_softc *sc, uint32_t id,
+ struct nvme_namespace_data *data);
+void nvmf_disconnect_ns(struct nvmf_namespace *ns);
+void nvmf_reconnect_ns(struct nvmf_namespace *ns);
+void nvmf_destroy_ns(struct nvmf_namespace *ns);
+bool nvmf_update_ns(struct nvmf_namespace *ns,
+ struct nvme_namespace_data *data);
+
+/* nvmf_qpair.c */
+struct nvmf_host_qpair *nvmf_init_qp(struct nvmf_softc *sc,
+ enum nvmf_trtype trtype, struct nvmf_handoff_qpair_params *handoff,
+ const char *name);
+void nvmf_shutdown_qp(struct nvmf_host_qpair *qp);
+void nvmf_destroy_qp(struct nvmf_host_qpair *qp);
+struct nvmf_request *nvmf_allocate_request(struct nvmf_host_qpair *qp,
+ void *sqe, nvmf_request_complete_t *cb, void *cb_arg, int how);
+void nvmf_submit_request(struct nvmf_request *req);
+void nvmf_free_request(struct nvmf_request *req);
+
+/* nvmf_sim.c */
+int nvmf_init_sim(struct nvmf_softc *sc);
+void nvmf_disconnect_sim(struct nvmf_softc *sc);
+void nvmf_reconnect_sim(struct nvmf_softc *sc);
+void nvmf_destroy_sim(struct nvmf_softc *sc);
+void nvmf_sim_rescan_ns(struct nvmf_softc *sc, uint32_t id);
+
+#endif /* !__NVMF_VAR_H__ */
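The completion-status helpers declared above suggest a synchronous calling pattern for commands that carry a data transfer. A sketch, assuming nvmf_complete() and nvmf_io_complete() record into the struct nvmf_completion_status passed as their argument and nvmf_wait_for_reply() sleeps until both the CQE and the data transfer have been reported (their implementations are in nvmf.c, outside this hunk):

static int
example_identify_ns(struct nvmf_softc *sc, uint32_t nsid,
    struct nvme_namespace_data *data)
{
	struct nvmf_completion_status status;

	nvmf_status_init(&status);
	nvmf_status_wait_io(&status);	/* also wait for the data transfer */
	if (!nvmf_cmd_identify_namespace(sc, nsid, data, nvmf_complete,
	    &status, nvmf_io_complete, &status, M_WAITOK))
		return (ECONNABORTED);	/* association is being torn down */
	nvmf_wait_for_reply(&status);

	if (status.cqe.status != 0)
		return (EIO);		/* controller returned an error CQE */
	return (status.io_error);	/* 0 if the reply was fully received */
}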
diff --git a/sys/modules/nvmf/Makefile b/sys/modules/nvmf/Makefile
--- a/sys/modules/nvmf/Makefile
+++ b/sys/modules/nvmf/Makefile
@@ -1,4 +1,5 @@
-SUBDIR= nvmf_tcp \
+SUBDIR= nvmf \
+ nvmf_tcp \
nvmf_transport
.include <bsd.subdir.mk>
diff --git a/sys/modules/nvmf/nvmf/Makefile b/sys/modules/nvmf/nvmf/Makefile
new file mode 100644
--- /dev/null
+++ b/sys/modules/nvmf/nvmf/Makefile
@@ -0,0 +1,13 @@
+.PATH: ${SRCTOP}/sys/dev/nvmf/host
+
+KMOD= nvmf
+
+SRCS= nvmf.c \
+ nvmf_aer.c \
+ nvmf_cmd.c \
+ nvmf_ctldev.c \
+ nvmf_ns.c \
+ nvmf_qpair.c \
+ nvmf_sim.c
+
+.include <bsd.kmod.mk>