diff --git a/share/man/man4/Makefile b/share/man/man4/Makefile --- a/share/man/man4/Makefile +++ b/share/man/man4/Makefile @@ -408,6 +408,7 @@ nvd.4 \ ${_nvdimm.4} \ nvme.4 \ + nvmf.4 \ nvmf_tcp.4 \ ${_nvram.4} \ oce.4 \ diff --git a/share/man/man4/nvmf.4 b/share/man/man4/nvmf.4 new file mode 100644 --- /dev/null +++ b/share/man/man4/nvmf.4 @@ -0,0 +1,87 @@ +.\" +.\" SPDX-License-Identifier: BSD-2-Clause +.\" +.\" Copyright (c) 2024 Chelsio Communications, Inc. +.\" +.Dd May 2, 2024 +.Dt NVMF 4 +.Os +.Sh NAME +.Nm nvmf +.Nd "NVM Express over Fabrics host driver" +.Sh SYNOPSIS +To compile the driver into the kernel, +place the following line in the +kernel configuration file: +.Bd -ragged -offset indent +.Cd "device nvmf" +.Ed +.Pp +Alternatively, to load the driver as a +module at boot time, place the following line in +.Xr loader.conf 5 : +.Bd -literal -offset indent +nvmf_load="YES" +.Ed +.Sh DESCRIPTION +The +.Nm +driver provides the kernel component of an NVM Express over Fabrics +host. +The NVMeoF host is the client which provides local access to +namespaces exported by a remote controller. +.Pp +Associations between the local host and remote controllers are managed +using +.Xr nvmecontrol 8 . +New associations are created via the +.Cm connect +command and destroyed via the +.Cm disconnect +command. +If an association's connection is interrupted, +the +.Cm reconnect +command creates a new association to replace the interrupted association. +.Pp +Similar to +.Xr nvme 4 , +.Nm +creates controller device nodes using the format +.Pa /dev/nvmeX +and namespace device nodes using the format +.Pa /dev/nvmeXnsY . +.Nm +also exports remote namespaces via the CAM +.Xr nda 4 +peripheral driver. +Unlike +.Xr nvme 4 , +.Nm +does not support the +.Xr nvd 4 +disk driver. +.Pp +Associations require a supported transport such as +.Xr nvmf_tcp 4 +for associations using TCP/IP. +.Sh SEE ALSO +.Xr nda 4 , +.Xr nvme 4 , +.Xr nvmf_tcp 4 , +.Xr nvmft 4 , +.Xr nvmecontrol 8 +.Sh HISTORY +The +.Nm +module first appeared in +.Fx 15.0 . +.Sh AUTHORS +The +.Nm +driver was developed by +.An John Baldwin Aq Mt jhb@FreeBSD.org +under sponsorship from Chelsio Communications, Inc. +.Sh BUGS +.Nm +only supports a single I/O queue pair per association. 
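The namespace device nodes described in the manual page above answer the same ioctls as their nvme(4) counterparts, so a node such as /dev/nvme0ns1 can be resolved back to its controller and namespace ID. A minimal userland sketch of that lookup follows; it is illustrative only (not part of this patch) and assumes the stock nvme(4) ioctl interface from <dev/nvme/nvme.h> and an example node name.

/*
 * Illustrative sketch: resolve a Fabrics namespace device node to its
 * controller name and namespace ID via NVME_GET_NSID, the same ioctl
 * nvmf_ns_ioctl() answers later in this patch.  The node name
 * /dev/nvme0ns1 is an example.
 */
#include <sys/param.h>
#include <sys/ioctl.h>

#include <dev/nvme/nvme.h>

#include <err.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	struct nvme_get_nsid gnsid;
	int fd;

	fd = open("/dev/nvme0ns1", O_RDONLY);
	if (fd < 0)
		err(1, "open(/dev/nvme0ns1)");
	memset(&gnsid, 0, sizeof(gnsid));
	if (ioctl(fd, NVME_GET_NSID, &gnsid) < 0)
		err(1, "NVME_GET_NSID");
	printf("controller %s, namespace %u\n", gnsid.cdev, gnsid.nsid);
	close(fd);
	return (0);
}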
diff --git a/sys/conf/NOTES b/sys/conf/NOTES --- a/sys/conf/NOTES +++ b/sys/conf/NOTES @@ -1676,12 +1676,14 @@ # NVM Express # # nvme: PCI-express NVM Express host controllers +# nvmf: NVM Express over Fabrics host # nvmf_tcp: TCP transport for NVM Express over Fabrics # nda: CAM NVMe disk driver # nvd: non-CAM NVMe disk driver -device nvme # base NVMe driver +device nvme # PCI-express NVMe host driver options NVME_USE_NVD=1 # Use nvd(4) instead of the CAM nda(4) driver +device nvmf # NVMeoF host driver device nvmf_tcp # NVMeoF TCP transport device nda # NVMe direct access devices (aka disks) device nvd # expose NVMe namespaces as disks, depends on nvme diff --git a/sys/conf/files b/sys/conf/files --- a/sys/conf/files +++ b/sys/conf/files @@ -2533,7 +2533,15 @@ dev/nvme/nvme_util.c optional nvme dev/nvmem/nvmem.c optional nvmem fdt dev/nvmem/nvmem_if.m optional nvmem +dev/nvmf/host/nvmf.c optional nvmf +dev/nvmf/host/nvmf_aer.c optional nvmf +dev/nvmf/host/nvmf_cmd.c optional nvmf +dev/nvmf/host/nvmf_ctldev.c optional nvmf +dev/nvmf/host/nvmf_ns.c optional nvmf +dev/nvmf/host/nvmf_qpair.c optional nvmf +dev/nvmf/host/nvmf_sim.c optional nvmf dev/nvmf/nvmf_tcp.c optional nvmf_tcp +dev/nvmf/nvmf_transport.c optional nvmf dev/oce/oce_hw.c optional oce pci dev/oce/oce_if.c optional oce pci dev/oce/oce_mbox.c optional oce pci diff --git a/sys/dev/nvmf/host/nvmf.c b/sys/dev/nvmf/host/nvmf.c new file mode 100644 --- /dev/null +++ b/sys/dev/nvmf/host/nvmf.c @@ -0,0 +1,939 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2023-2024 Chelsio Communications, Inc. + * Written by: John Baldwin + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static struct cdevsw nvmf_cdevsw; + +MALLOC_DEFINE(M_NVMF, "nvmf", "NVMe over Fabrics host"); + +static void nvmf_disconnect_task(void *arg, int pending); + +void +nvmf_complete(void *arg, const struct nvme_completion *cqe) +{ + struct nvmf_completion_status *status = arg; + struct mtx *mtx; + + status->cqe = *cqe; + mtx = mtx_pool_find(mtxpool_sleep, status); + mtx_lock(mtx); + status->done = true; + mtx_unlock(mtx); + wakeup(status); +} + +void +nvmf_io_complete(void *arg, size_t xfered, int error) +{ + struct nvmf_completion_status *status = arg; + struct mtx *mtx; + + status->io_error = error; + mtx = mtx_pool_find(mtxpool_sleep, status); + mtx_lock(mtx); + status->io_done = true; + mtx_unlock(mtx); + wakeup(status); +} + +void +nvmf_wait_for_reply(struct nvmf_completion_status *status) +{ + struct mtx *mtx; + + mtx = mtx_pool_find(mtxpool_sleep, status); + mtx_lock(mtx); + while (!status->done || !status->io_done) + mtx_sleep(status, mtx, 0, "nvmfcmd", 0); + mtx_unlock(mtx); +} + +static int +nvmf_read_property(struct nvmf_softc *sc, uint32_t offset, uint8_t size, + uint64_t *value) +{ + const struct nvmf_fabric_prop_get_rsp *rsp; + struct nvmf_completion_status status; + + nvmf_status_init(&status); + if (!nvmf_cmd_get_property(sc, offset, size, nvmf_complete, &status, + M_WAITOK)) + return (ECONNABORTED); + nvmf_wait_for_reply(&status); + + if (status.cqe.status != 0) { + device_printf(sc->dev, "PROPERTY_GET failed, status %#x\n", + le16toh(status.cqe.status)); + return (EIO); + } + + rsp = (const struct nvmf_fabric_prop_get_rsp *)&status.cqe; + if (size == 8) + *value = le64toh(rsp->value.u64); + else + *value = le32toh(rsp->value.u32.low); + return (0); +} + +static int +nvmf_write_property(struct nvmf_softc *sc, uint32_t 
offset, uint8_t size, + uint64_t value) +{ + struct nvmf_completion_status status; + + nvmf_status_init(&status); + if (!nvmf_cmd_set_property(sc, offset, size, value, nvmf_complete, &status, + M_WAITOK)) + return (ECONNABORTED); + nvmf_wait_for_reply(&status); + + if (status.cqe.status != 0) { + device_printf(sc->dev, "PROPERTY_SET failed, status %#x\n", + le16toh(status.cqe.status)); + return (EIO); + } + return (0); +} + +static void +nvmf_shutdown_controller(struct nvmf_softc *sc) +{ + uint64_t cc; + int error; + + error = nvmf_read_property(sc, NVMF_PROP_CC, 4, &cc); + if (error != 0) { + device_printf(sc->dev, "Failed to fetch CC for shutdown\n"); + return; + } + + cc |= NVMEF(NVME_CC_REG_SHN, NVME_SHN_NORMAL); + + error = nvmf_write_property(sc, NVMF_PROP_CC, 4, cc); + if (error != 0) + device_printf(sc->dev, + "Failed to set CC to trigger shutdown\n"); +} + +static void +nvmf_check_keep_alive(void *arg) +{ + struct nvmf_softc *sc = arg; + int traffic; + + traffic = atomic_readandclear_int(&sc->ka_active_rx_traffic); + if (traffic == 0) { + device_printf(sc->dev, + "disconnecting due to KeepAlive timeout\n"); + nvmf_disconnect(sc); + return; + } + + callout_schedule_sbt(&sc->ka_rx_timer, sc->ka_rx_sbt, 0, C_HARDCLOCK); +} + +static void +nvmf_keep_alive_complete(void *arg, const struct nvme_completion *cqe) +{ + struct nvmf_softc *sc = arg; + + atomic_store_int(&sc->ka_active_rx_traffic, 1); + if (cqe->status != 0) { + device_printf(sc->dev, + "KeepAlive response reported status %#x\n", + le16toh(cqe->status)); + } +} + +static void +nvmf_send_keep_alive(void *arg) +{ + struct nvmf_softc *sc = arg; + int traffic; + + /* + * Don't bother sending a KeepAlive command if TKAS is active + * and another command has been sent during the interval. + */ + traffic = atomic_load_int(&sc->ka_active_tx_traffic); + if (traffic == 0 && !nvmf_cmd_keep_alive(sc, nvmf_keep_alive_complete, + sc, M_NOWAIT)) + device_printf(sc->dev, + "Failed to allocate KeepAlive command\n"); + + /* Clear ka_active_tx_traffic after sending the keep alive command. */ + atomic_store_int(&sc->ka_active_tx_traffic, 0); + + callout_schedule_sbt(&sc->ka_tx_timer, sc->ka_tx_sbt, 0, C_HARDCLOCK); +} + +int +nvmf_init_ivars(struct nvmf_ivars *ivars, struct nvmf_handoff_host *hh) +{ + size_t len; + u_int i; + int error; + + memset(ivars, 0, sizeof(*ivars)); + + if (!hh->admin.admin || hh->num_io_queues < 1) + return (EINVAL); + + ivars->cdata = malloc(sizeof(*ivars->cdata), M_NVMF, M_WAITOK); + error = copyin(hh->cdata, ivars->cdata, sizeof(*ivars->cdata)); + if (error != 0) + goto out; + nvme_controller_data_swapbytes(ivars->cdata); + + len = hh->num_io_queues * sizeof(*ivars->io_params); + ivars->io_params = malloc(len, M_NVMF, M_WAITOK); + error = copyin(hh->io, ivars->io_params, len); + if (error != 0) + goto out; + for (i = 0; i < hh->num_io_queues; i++) { + if (ivars->io_params[i].admin) { + error = EINVAL; + goto out; + } + + /* Require all I/O queues to be the same size. 
*/ + if (ivars->io_params[i].qsize != ivars->io_params[0].qsize) { + error = EINVAL; + goto out; + } + } + + ivars->hh = hh; + return (0); + +out: + free(ivars->io_params, M_NVMF); + free(ivars->cdata, M_NVMF); + return (error); +} + +void +nvmf_free_ivars(struct nvmf_ivars *ivars) +{ + free(ivars->io_params, M_NVMF); + free(ivars->cdata, M_NVMF); +} + +static int +nvmf_probe(device_t dev) +{ + struct nvmf_ivars *ivars = device_get_ivars(dev); + char desc[260]; + + if (ivars == NULL) + return (ENXIO); + + snprintf(desc, sizeof(desc), "Fabrics: %.256s", ivars->cdata->subnqn); + device_set_desc_copy(dev, desc); + return (BUS_PROBE_DEFAULT); +} + +static int +nvmf_establish_connection(struct nvmf_softc *sc, struct nvmf_ivars *ivars) +{ + char name[16]; + + /* Setup the admin queue. */ + sc->admin = nvmf_init_qp(sc, ivars->hh->trtype, &ivars->hh->admin, + "admin queue"); + if (sc->admin == NULL) { + device_printf(sc->dev, "Failed to setup admin queue\n"); + return (ENXIO); + } + + /* Setup I/O queues. */ + sc->io = malloc(ivars->hh->num_io_queues * sizeof(*sc->io), M_NVMF, + M_WAITOK | M_ZERO); + sc->num_io_queues = ivars->hh->num_io_queues; + for (u_int i = 0; i < sc->num_io_queues; i++) { + snprintf(name, sizeof(name), "I/O queue %u", i); + sc->io[i] = nvmf_init_qp(sc, ivars->hh->trtype, + &ivars->io_params[i], name); + if (sc->io[i] == NULL) { + device_printf(sc->dev, "Failed to setup I/O queue %u\n", + i + 1); + return (ENXIO); + } + } + + /* Start KeepAlive timers. */ + if (ivars->hh->kato != 0) { + sc->ka_traffic = NVMEV(NVME_CTRLR_DATA_CTRATT_TBKAS, + sc->cdata->ctratt) != 0; + sc->ka_rx_sbt = mstosbt(ivars->hh->kato); + sc->ka_tx_sbt = sc->ka_rx_sbt / 2; + callout_reset_sbt(&sc->ka_rx_timer, sc->ka_rx_sbt, 0, + nvmf_check_keep_alive, sc, C_HARDCLOCK); + callout_reset_sbt(&sc->ka_tx_timer, sc->ka_tx_sbt, 0, + nvmf_send_keep_alive, sc, C_HARDCLOCK); + } + + return (0); +} + +static bool +nvmf_scan_nslist(struct nvmf_softc *sc, struct nvme_ns_list *nslist, + struct nvme_namespace_data *data, uint32_t *nsidp) +{ + struct nvmf_completion_status status; + uint32_t nsid; + + nvmf_status_init(&status); + nvmf_status_wait_io(&status); + if (!nvmf_cmd_identify_active_namespaces(sc, *nsidp, nslist, + nvmf_complete, &status, nvmf_io_complete, &status, M_WAITOK)) { + device_printf(sc->dev, + "failed to send IDENTIFY active namespaces command\n"); + return (false); + } + nvmf_wait_for_reply(&status); + + if (status.cqe.status != 0) { + device_printf(sc->dev, + "IDENTIFY active namespaces failed, status %#x\n", + le16toh(status.cqe.status)); + return (false); + } + + if (status.io_error != 0) { + device_printf(sc->dev, + "IDENTIFY active namespaces failed with I/O error %d\n", + status.io_error); + return (false); + } + + for (u_int i = 0; i < nitems(nslist->ns); i++) { + nsid = nslist->ns[i]; + if (nsid == 0) { + *nsidp = 0; + return (true); + } + + if (sc->ns[nsid - 1] != NULL) { + device_printf(sc->dev, + "duplicate namespace %u in active namespace list\n", + nsid); + return (false); + } + + nvmf_status_init(&status); + nvmf_status_wait_io(&status); + if (!nvmf_cmd_identify_namespace(sc, nsid, data, nvmf_complete, + &status, nvmf_io_complete, &status, M_WAITOK)) { + device_printf(sc->dev, + "failed to send IDENTIFY namespace %u command\n", + nsid); + return (false); + } + nvmf_wait_for_reply(&status); + + if (status.cqe.status != 0) { + device_printf(sc->dev, + "IDENTIFY namespace %u failed, status %#x\n", nsid, + le16toh(status.cqe.status)); + return (false); + } + + if (status.io_error != 0) { + 
device_printf(sc->dev, + "IDENTIFY namespace %u failed with I/O error %d\n", + nsid, status.io_error); + return (false); + } + + /* + * As in nvme_ns_construct, a size of zero indicates an + * invalid namespace. + */ + nvme_namespace_data_swapbytes(data); + if (data->nsze == 0) { + device_printf(sc->dev, + "ignoring active namespace %u with zero size\n", + nsid); + continue; + } + + sc->ns[nsid - 1] = nvmf_init_ns(sc, nsid, data); + + nvmf_sim_rescan_ns(sc, nsid); + } + + MPASS(nsid == nslist->ns[nitems(nslist->ns) - 1] && nsid != 0); + + if (nsid >= 0xfffffffd) + *nsidp = 0; + else + *nsidp = nsid + 1; + return (true); +} + +static bool +nvmf_add_namespaces(struct nvmf_softc *sc) +{ + struct nvme_namespace_data *data; + struct nvme_ns_list *nslist; + uint32_t nsid; + bool retval; + + sc->ns = mallocarray(sc->cdata->nn, sizeof(*sc->ns), M_NVMF, + M_WAITOK | M_ZERO); + nslist = malloc(sizeof(*nslist), M_NVMF, M_WAITOK); + data = malloc(sizeof(*data), M_NVMF, M_WAITOK); + + nsid = 0; + retval = true; + for (;;) { + if (!nvmf_scan_nslist(sc, nslist, data, &nsid)) { + retval = false; + break; + } + if (nsid == 0) + break; + } + + free(data, M_NVMF); + free(nslist, M_NVMF); + return (retval); +} + +static int +nvmf_attach(device_t dev) +{ + struct make_dev_args mda; + struct nvmf_softc *sc = device_get_softc(dev); + struct nvmf_ivars *ivars = device_get_ivars(dev); + uint64_t val; + u_int i; + int error; + + if (ivars == NULL) + return (ENXIO); + + sc->dev = dev; + sc->trtype = ivars->hh->trtype; + callout_init(&sc->ka_rx_timer, 1); + callout_init(&sc->ka_tx_timer, 1); + sx_init(&sc->connection_lock, "nvmf connection"); + TASK_INIT(&sc->disconnect_task, 0, nvmf_disconnect_task, sc); + + /* Claim the cdata pointer from ivars. */ + sc->cdata = ivars->cdata; + ivars->cdata = NULL; + + nvmf_init_aer(sc); + + /* TODO: Multiqueue support. */ + sc->max_pending_io = ivars->io_params[0].qsize /* * sc->num_io_queues */; + + error = nvmf_establish_connection(sc, ivars); + if (error != 0) + goto out; + + error = nvmf_read_property(sc, NVMF_PROP_CAP, 8, &sc->cap); + if (error != 0) { + device_printf(sc->dev, "Failed to fetch CAP\n"); + error = ENXIO; + goto out; + } + + error = nvmf_read_property(sc, NVMF_PROP_VS, 4, &val); + if (error != 0) { + device_printf(sc->dev, "Failed to fetch VS\n"); + error = ENXIO; + goto out; + } + sc->vs = val; + + /* Honor MDTS if it is set. 
*/ + sc->max_xfer_size = maxphys; + if (sc->cdata->mdts != 0) { + sc->max_xfer_size = ulmin(sc->max_xfer_size, + 1 << (sc->cdata->mdts + NVME_MPS_SHIFT + + NVME_CAP_HI_MPSMIN(sc->cap >> 32))); + } + + error = nvmf_init_sim(sc); + if (error != 0) + goto out; + + error = nvmf_start_aer(sc); + if (error != 0) { + nvmf_destroy_sim(sc); + goto out; + } + + if (!nvmf_add_namespaces(sc)) { + nvmf_destroy_sim(sc); + goto out; + } + + make_dev_args_init(&mda); + mda.mda_devsw = &nvmf_cdevsw; + mda.mda_uid = UID_ROOT; + mda.mda_gid = GID_WHEEL; + mda.mda_mode = 0600; + mda.mda_si_drv1 = sc; + error = make_dev_s(&mda, &sc->cdev, "%s", device_get_nameunit(dev)); + if (error != 0) { + nvmf_destroy_sim(sc); + goto out; + } + + return (0); +out: + if (sc->ns != NULL) { + for (i = 0; i < sc->cdata->nn; i++) { + if (sc->ns[i] != NULL) + nvmf_destroy_ns(sc->ns[i]); + } + free(sc->ns, M_NVMF); + } + + callout_drain(&sc->ka_tx_timer); + callout_drain(&sc->ka_rx_timer); + + if (sc->admin != NULL) + nvmf_shutdown_controller(sc); + + for (i = 0; i < sc->num_io_queues; i++) { + if (sc->io[i] != NULL) + nvmf_destroy_qp(sc->io[i]); + } + free(sc->io, M_NVMF); + if (sc->admin != NULL) + nvmf_destroy_qp(sc->admin); + + nvmf_destroy_aer(sc); + + taskqueue_drain(taskqueue_thread, &sc->disconnect_task); + sx_destroy(&sc->connection_lock); + free(sc->cdata, M_NVMF); + return (error); +} + +void +nvmf_disconnect(struct nvmf_softc *sc) +{ + taskqueue_enqueue(taskqueue_thread, &sc->disconnect_task); +} + +static void +nvmf_disconnect_task(void *arg, int pending __unused) +{ + struct nvmf_softc *sc = arg; + u_int i; + + sx_xlock(&sc->connection_lock); + if (sc->admin == NULL) { + /* + * Ignore transport errors if there is no active + * association. + */ + sx_xunlock(&sc->connection_lock); + return; + } + + if (sc->detaching) { + if (sc->admin != NULL) { + /* + * This unsticks the detach process if a + * transport error occurs during detach. + */ + nvmf_shutdown_qp(sc->admin); + } + sx_xunlock(&sc->connection_lock); + return; + } + + if (sc->cdev == NULL) { + /* + * Transport error occurred during attach (nvmf_add_namespaces). + * Shutdown the admin queue. + */ + nvmf_shutdown_qp(sc->admin); + sx_xunlock(&sc->connection_lock); + return; + } + + callout_drain(&sc->ka_tx_timer); + callout_drain(&sc->ka_rx_timer); + sc->ka_traffic = false; + + /* Quiesce namespace consumers. */ + nvmf_disconnect_sim(sc); + for (i = 0; i < sc->cdata->nn; i++) { + if (sc->ns[i] != NULL) + nvmf_disconnect_ns(sc->ns[i]); + } + + /* Shutdown the existing qpairs. */ + for (i = 0; i < sc->num_io_queues; i++) { + nvmf_destroy_qp(sc->io[i]); + } + free(sc->io, M_NVMF); + sc->io = NULL; + sc->num_io_queues = 0; + nvmf_destroy_qp(sc->admin); + sc->admin = NULL; + + sx_xunlock(&sc->connection_lock); +} + +static int +nvmf_reconnect_host(struct nvmf_softc *sc, struct nvmf_handoff_host *hh) +{ + struct nvmf_ivars ivars; + u_int i; + int error; + + /* XXX: Should we permit changing the transport type? */ + if (sc->trtype != hh->trtype) { + device_printf(sc->dev, + "transport type mismatch on reconnect\n"); + return (EINVAL); + } + + error = nvmf_init_ivars(&ivars, hh); + if (error != 0) + return (error); + + sx_xlock(&sc->connection_lock); + if (sc->admin != NULL || sc->detaching) { + error = EBUSY; + goto out; + } + + /* + * Ensure this is for the same controller. Note that the + * controller ID can vary across associations if the remote + * system is using the dynamic controller model. 
This merely + * ensures the new association is connected to the same NVMe + * subsystem. + */ + if (memcmp(sc->cdata->subnqn, ivars.cdata->subnqn, + sizeof(ivars.cdata->subnqn)) != 0) { + device_printf(sc->dev, + "controller subsystem NQN mismatch on reconnect\n"); + error = EINVAL; + goto out; + } + + /* + * XXX: Require same number and size of I/O queues so that + * max_pending_io is still correct? + */ + + error = nvmf_establish_connection(sc, &ivars); + if (error != 0) + goto out; + + error = nvmf_start_aer(sc); + if (error != 0) + goto out; + + device_printf(sc->dev, + "established new association with %u I/O queues\n", + sc->num_io_queues); + + /* Restart namespace consumers. */ + for (i = 0; i < sc->cdata->nn; i++) { + if (sc->ns[i] != NULL) + nvmf_reconnect_ns(sc->ns[i]); + } + nvmf_reconnect_sim(sc); +out: + sx_xunlock(&sc->connection_lock); + nvmf_free_ivars(&ivars); + return (error); +} + +static int +nvmf_detach(device_t dev) +{ + struct nvmf_softc *sc = device_get_softc(dev); + u_int i; + + destroy_dev(sc->cdev); + + sx_xlock(&sc->connection_lock); + sc->detaching = true; + sx_xunlock(&sc->connection_lock); + + nvmf_destroy_sim(sc); + for (i = 0; i < sc->cdata->nn; i++) { + if (sc->ns[i] != NULL) + nvmf_destroy_ns(sc->ns[i]); + } + free(sc->ns, M_NVMF); + + callout_drain(&sc->ka_tx_timer); + callout_drain(&sc->ka_rx_timer); + + if (sc->admin != NULL) + nvmf_shutdown_controller(sc); + + for (i = 0; i < sc->num_io_queues; i++) { + nvmf_destroy_qp(sc->io[i]); + } + free(sc->io, M_NVMF); + + taskqueue_drain(taskqueue_thread, &sc->disconnect_task); + + if (sc->admin != NULL) + nvmf_destroy_qp(sc->admin); + + nvmf_destroy_aer(sc); + + sx_destroy(&sc->connection_lock); + free(sc->cdata, M_NVMF); + return (0); +} + +void +nvmf_rescan_ns(struct nvmf_softc *sc, uint32_t nsid) +{ + struct nvmf_completion_status status; + struct nvme_namespace_data *data; + struct nvmf_namespace *ns; + + data = malloc(sizeof(*data), M_NVMF, M_WAITOK); + + nvmf_status_init(&status); + nvmf_status_wait_io(&status); + if (!nvmf_cmd_identify_namespace(sc, nsid, data, nvmf_complete, + &status, nvmf_io_complete, &status, M_WAITOK)) { + device_printf(sc->dev, + "failed to send IDENTIFY namespace %u command\n", nsid); + free(data, M_NVMF); + return; + } + nvmf_wait_for_reply(&status); + + if (status.cqe.status != 0) { + device_printf(sc->dev, + "IDENTIFY namespace %u failed, status %#x\n", nsid, + le16toh(status.cqe.status)); + free(data, M_NVMF); + return; + } + + if (status.io_error != 0) { + device_printf(sc->dev, + "IDENTIFY namespace %u failed with I/O error %d\n", + nsid, status.io_error); + free(data, M_NVMF); + return; + } + + nvme_namespace_data_swapbytes(data); + + /* XXX: Needs locking around sc->ns[]. 
*/ + ns = sc->ns[nsid - 1]; + if (data->nsze == 0) { + /* XXX: Needs locking */ + if (ns != NULL) { + nvmf_destroy_ns(ns); + sc->ns[nsid - 1] = NULL; + } + } else { + /* XXX: Needs locking */ + if (ns == NULL) { + sc->ns[nsid - 1] = nvmf_init_ns(sc, nsid, data); + } else { + if (!nvmf_update_ns(ns, data)) { + nvmf_destroy_ns(ns); + sc->ns[nsid - 1] = NULL; + } + } + } + + free(data, M_NVMF); + + nvmf_sim_rescan_ns(sc, nsid); +} + +int +nvmf_passthrough_cmd(struct nvmf_softc *sc, struct nvme_pt_command *pt, + bool admin) +{ + struct nvmf_completion_status status; + struct nvme_command cmd; + struct memdesc mem; + struct nvmf_host_qpair *qp; + struct nvmf_request *req; + void *buf; + int error; + + if (pt->len > sc->max_xfer_size) + return (EINVAL); + + buf = NULL; + if (pt->len != 0) { + /* + * XXX: Depending on the size we may want to pin the + * user pages and use a memdesc with vm_page_t's + * instead. + */ + buf = malloc(pt->len, M_NVMF, M_WAITOK); + if (pt->is_read == 0) { + error = copyin(pt->buf, buf, pt->len); + if (error != 0) { + free(buf, M_NVMF); + return (error); + } + } else { + /* Ensure no kernel data is leaked to userland. */ + memset(buf, 0, pt->len); + } + } + + memset(&cmd, 0, sizeof(cmd)); + cmd.opc = pt->cmd.opc; + cmd.fuse = pt->cmd.fuse; + cmd.nsid = pt->cmd.nsid; + cmd.cdw10 = pt->cmd.cdw10; + cmd.cdw11 = pt->cmd.cdw11; + cmd.cdw12 = pt->cmd.cdw12; + cmd.cdw13 = pt->cmd.cdw13; + cmd.cdw14 = pt->cmd.cdw14; + cmd.cdw15 = pt->cmd.cdw15; + + if (admin) + qp = sc->admin; + else + qp = nvmf_select_io_queue(sc); + nvmf_status_init(&status); + req = nvmf_allocate_request(qp, &cmd, nvmf_complete, &status, M_WAITOK); + if (req == NULL) { + device_printf(sc->dev, "failed to send passthrough command\n"); + error = ECONNABORTED; + goto error; + } + + if (pt->len != 0) { + mem = memdesc_vaddr(buf, pt->len); + nvmf_capsule_append_data(req->nc, &mem, pt->len, + pt->is_read == 0, nvmf_io_complete, &status); + nvmf_status_wait_io(&status); + } + + nvmf_submit_request(req); + nvmf_wait_for_reply(&status); + + memset(&pt->cpl, 0, sizeof(pt->cpl)); + pt->cpl.cdw0 = status.cqe.cdw0; + pt->cpl.status = status.cqe.status; + + error = status.io_error; + if (error == 0 && pt->len != 0 && pt->is_read != 0) + error = copyout(buf, pt->buf, pt->len); +error: + free(buf, M_NVMF); + return (error); +} + +static int +nvmf_ioctl(struct cdev *cdev, u_long cmd, caddr_t arg, int flag, + struct thread *td) +{ + struct nvmf_softc *sc = cdev->si_drv1; + struct nvme_get_nsid *gnsid; + struct nvme_pt_command *pt; + struct nvmf_reconnect_params *rp; + struct nvmf_handoff_host *hh; + + switch (cmd) { + case NVME_PASSTHROUGH_CMD: + pt = (struct nvme_pt_command *)arg; + return (nvmf_passthrough_cmd(sc, pt, true)); + case NVME_GET_NSID: + gnsid = (struct nvme_get_nsid *)arg; + strncpy(gnsid->cdev, device_get_nameunit(sc->dev), + sizeof(gnsid->cdev)); + gnsid->cdev[sizeof(gnsid->cdev) - 1] = '\0'; + gnsid->nsid = 0; + return (0); + case NVME_GET_MAX_XFER_SIZE: + *(uint64_t *)arg = sc->max_xfer_size; + return (0); + case NVMF_RECONNECT_PARAMS: + rp = (struct nvmf_reconnect_params *)arg; + if ((sc->cdata->fcatt & 1) == 0) + rp->cntlid = NVMF_CNTLID_DYNAMIC; + else + rp->cntlid = sc->cdata->ctrlr_id; + memcpy(rp->subnqn, sc->cdata->subnqn, sizeof(rp->subnqn)); + return (0); + case NVMF_RECONNECT_HOST: + hh = (struct nvmf_handoff_host *)arg; + return (nvmf_reconnect_host(sc, hh)); + default: + return (ENOTTY); + } +} + +static struct cdevsw nvmf_cdevsw = { + .d_version = D_VERSION, + .d_ioctl = nvmf_ioctl +}; + 
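nvmf_ioctl() above keeps the per-controller character device compatible with nvme(4): NVME_PASSTHROUGH_CMD requests are funneled through nvmf_passthrough_cmd() onto the admin queue. The sketch below shows how userland might drive that path with an Identify Controller command; it is illustrative only (not part of this patch), assumes struct nvme_pt_command and its helpers from <dev/nvme/nvme.h>, and uses /dev/nvme0 as an example node.

/*
 * Illustrative sketch: issue an admin Identify Controller (CNS 0x01)
 * command through the passthrough ioctl handled by nvmf_ioctl() and
 * nvmf_passthrough_cmd() above.  The byte swap mirrors what
 * nvmecontrol(8)-style consumers are expected to do with raw
 * passthrough data.
 */
#include <sys/param.h>
#include <sys/endian.h>
#include <sys/ioctl.h>

#include <dev/nvme/nvme.h>

#include <err.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	struct nvme_controller_data cdata;
	struct nvme_pt_command pt;
	int fd;

	fd = open("/dev/nvme0", O_RDWR);
	if (fd < 0)
		err(1, "open(/dev/nvme0)");

	memset(&pt, 0, sizeof(pt));
	pt.cmd.opc = NVME_OPC_IDENTIFY;
	pt.cmd.cdw10 = htole32(1);	/* CNS 0x01: controller data */
	pt.buf = &cdata;
	pt.len = sizeof(cdata);
	pt.is_read = 1;			/* controller-to-host transfer */

	if (ioctl(fd, NVME_PASSTHROUGH_CMD, &pt) < 0)
		err(1, "NVME_PASSTHROUGH_CMD");
	if (pt.cpl.status != 0)
		errx(1, "Identify failed, status %#x", le16toh(pt.cpl.status));

	nvme_controller_data_swapbytes(&cdata);
	printf("subnqn: %.256s\n", (const char *)cdata.subnqn);
	close(fd);
	return (0);
}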
+static int +nvmf_modevent(module_t mod, int what, void *arg) +{ + switch (what) { + case MOD_LOAD: + return (nvmf_ctl_load()); + case MOD_QUIESCE: + return (0); + case MOD_UNLOAD: + nvmf_ctl_unload(); + destroy_dev_drain(&nvmf_cdevsw); + return (0); + default: + return (EOPNOTSUPP); + } +} + +static device_method_t nvmf_methods[] = { + /* Device interface */ + DEVMETHOD(device_probe, nvmf_probe), + DEVMETHOD(device_attach, nvmf_attach), + DEVMETHOD(device_detach, nvmf_detach), +#if 0 + DEVMETHOD(device_shutdown, nvmf_shutdown), +#endif + DEVMETHOD_END +}; + +driver_t nvme_nvmf_driver = { + "nvme", + nvmf_methods, + sizeof(struct nvmf_softc), +}; + +DRIVER_MODULE(nvme, root, nvme_nvmf_driver, nvmf_modevent, NULL); +MODULE_DEPEND(nvmf, nvmf_transport, 1, 1, 1); diff --git a/sys/dev/nvmf/host/nvmf_aer.c b/sys/dev/nvmf/host/nvmf_aer.c new file mode 100644 --- /dev/null +++ b/sys/dev/nvmf/host/nvmf_aer.c @@ -0,0 +1,290 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2024 Chelsio Communications, Inc. + * Written by: John Baldwin + */ + +#include +#include +#include +#include +#include +#include +#include + +struct nvmf_aer { + struct nvmf_softc *sc; + uint8_t log_page_id; + uint8_t info; + uint8_t type; + + u_int page_len; + void *page; + + int error; + uint16_t status; + int pending; + struct mtx *lock; + struct task complete_task; + struct task finish_page_task; +}; + +#define MAX_LOG_PAGE_SIZE 4096 + +static void nvmf_complete_aer(void *arg, const struct nvme_completion *cqe); + +static void +nvmf_submit_aer(struct nvmf_softc *sc, struct nvmf_aer *aer) +{ + struct nvmf_request *req; + struct nvme_command cmd; + + memset(&cmd, 0, sizeof(cmd)); + cmd.opc = NVME_OPC_ASYNC_EVENT_REQUEST; + + req = nvmf_allocate_request(sc->admin, &cmd, nvmf_complete_aer, aer, + M_WAITOK); + if (req == NULL) + return; + req->aer = true; + nvmf_submit_request(req); +} + +static void +nvmf_handle_changed_namespaces(struct nvmf_softc *sc, + struct nvme_ns_list *ns_list) +{ + uint32_t nsid; + + /* + * If more than 1024 namespaces have changed, we should + * probably just rescan the entire set of namespaces. + */ + if (ns_list->ns[0] == 0xffffffff) { + device_printf(sc->dev, "too many changed namespaces\n"); + return; + } + + for (u_int i = 0; i < nitems(ns_list->ns); i++) { + if (ns_list->ns[i] == 0) + break; + + nsid = le32toh(ns_list->ns[i]); + nvmf_rescan_ns(sc, nsid); + } +} + +static void +nvmf_finish_aer_page(struct nvmf_softc *sc, struct nvmf_aer *aer) +{ + /* If an error occurred fetching the page, just bail. */ + if (aer->error != 0 || aer->status != 0) + return; + + taskqueue_enqueue(taskqueue_thread, &aer->finish_page_task); +} + +static void +nvmf_finish_aer_page_task(void *arg, int pending) +{ + struct nvmf_aer *aer = arg; + struct nvmf_softc *sc = aer->sc; + + switch (aer->log_page_id) { + case NVME_LOG_ERROR: + /* TODO: Should we log these? */ + break; + case NVME_LOG_CHANGED_NAMESPACE: + nvmf_handle_changed_namespaces(sc, aer->page); + break; + } + + /* Resubmit this AER command. 
*/ + nvmf_submit_aer(sc, aer); +} + +static void +nvmf_io_complete_aer_page(void *arg, size_t xfered, int error) +{ + struct nvmf_aer *aer = arg; + struct nvmf_softc *sc = aer->sc; + + mtx_lock(aer->lock); + aer->error = error; + aer->pending--; + if (aer->pending == 0) { + mtx_unlock(aer->lock); + nvmf_finish_aer_page(sc, aer); + } else + mtx_unlock(aer->lock); +} + +static void +nvmf_complete_aer_page(void *arg, const struct nvme_completion *cqe) +{ + struct nvmf_aer *aer = arg; + struct nvmf_softc *sc = aer->sc; + + mtx_lock(aer->lock); + aer->status = cqe->status; + aer->pending--; + if (aer->pending == 0) { + mtx_unlock(aer->lock); + nvmf_finish_aer_page(sc, aer); + } else + mtx_unlock(aer->lock); +} + +static u_int +nvmf_log_page_size(struct nvmf_softc *sc, uint8_t log_page_id) +{ + switch (log_page_id) { + case NVME_LOG_ERROR: + return ((sc->cdata->elpe + 1) * + sizeof(struct nvme_error_information_entry)); + case NVME_LOG_CHANGED_NAMESPACE: + return (sizeof(struct nvme_ns_list)); + default: + return (0); + } +} + +static void +nvmf_complete_aer(void *arg, const struct nvme_completion *cqe) +{ + struct nvmf_aer *aer = arg; + struct nvmf_softc *sc = aer->sc; + uint32_t cdw0; + + /* + * The only error defined for AER is an abort due to + * submitting too many AER commands. Just discard this AER + * without resubmitting if we get an error. + * + * NB: Pending AER commands are aborted during controller + * shutdown, so discard aborted commands silently. + */ + if (cqe->status != 0) { + if (!nvmf_cqe_aborted(cqe)) + device_printf(sc->dev, "Ignoring error %#x for AER\n", + le16toh(cqe->status)); + return; + } + + cdw0 = le32toh(cqe->cdw0); + aer->log_page_id = NVMEV(NVME_ASYNC_EVENT_LOG_PAGE_ID, cdw0); + aer->info = NVMEV(NVME_ASYNC_EVENT_INFO, cdw0); + aer->type = NVMEV(NVME_ASYNC_EVENT_TYPE, cdw0); + + device_printf(sc->dev, "AER type %u, info %#x, page %#x\n", + aer->type, aer->info, aer->log_page_id); + + aer->page_len = nvmf_log_page_size(sc, aer->log_page_id); + taskqueue_enqueue(taskqueue_thread, &aer->complete_task); +} + +static void +nvmf_complete_aer_task(void *arg, int pending) +{ + struct nvmf_aer *aer = arg; + struct nvmf_softc *sc = aer->sc; + + if (aer->page_len != 0) { + /* Read the associated log page. */ + aer->page_len = MIN(aer->page_len, MAX_LOG_PAGE_SIZE); + aer->pending = 2; + (void) nvmf_cmd_get_log_page(sc, NVME_GLOBAL_NAMESPACE_TAG, + aer->log_page_id, 0, aer->page, aer->page_len, + nvmf_complete_aer_page, aer, nvmf_io_complete_aer_page, + aer, M_WAITOK); + } else { + /* Resubmit this AER command. 
*/ + nvmf_submit_aer(sc, aer); + } +} + +static int +nvmf_set_async_event_config(struct nvmf_softc *sc, uint32_t config) +{ + struct nvme_command cmd; + struct nvmf_completion_status status; + struct nvmf_request *req; + + memset(&cmd, 0, sizeof(cmd)); + cmd.opc = NVME_OPC_SET_FEATURES; + cmd.cdw10 = htole32(NVME_FEAT_ASYNC_EVENT_CONFIGURATION); + cmd.cdw11 = htole32(config); + + nvmf_status_init(&status); + req = nvmf_allocate_request(sc->admin, &cmd, nvmf_complete, &status, + M_WAITOK); + if (req == NULL) { + device_printf(sc->dev, + "failed to allocate SET_FEATURES (ASYNC_EVENT_CONFIGURATION) command\n"); + return (ECONNABORTED); + } + nvmf_submit_request(req); + nvmf_wait_for_reply(&status); + + if (status.cqe.status != 0) { + device_printf(sc->dev, + "SET_FEATURES (ASYNC_EVENT_CONFIGURATION) failed, status %#x\n", + le16toh(status.cqe.status)); + return (EIO); + } + + return (0); +} + +void +nvmf_init_aer(struct nvmf_softc *sc) +{ + /* 8 matches NVME_MAX_ASYNC_EVENTS */ + sc->num_aer = min(8, sc->cdata->aerl + 1); + sc->aer = mallocarray(sc->num_aer, sizeof(*sc->aer), M_NVMF, + M_WAITOK | M_ZERO); + for (u_int i = 0; i < sc->num_aer; i++) { + sc->aer[i].sc = sc; + sc->aer[i].page = malloc(MAX_LOG_PAGE_SIZE, M_NVMF, M_WAITOK); + sc->aer[i].lock = mtx_pool_find(mtxpool_sleep, &sc->aer[i]); + TASK_INIT(&sc->aer[i].complete_task, 0, nvmf_complete_aer_task, + &sc->aer[i]); + TASK_INIT(&sc->aer[i].finish_page_task, 0, + nvmf_finish_aer_page_task, &sc->aer[i]); + } +} + +int +nvmf_start_aer(struct nvmf_softc *sc) +{ + uint32_t async_event_config; + int error; + + async_event_config = NVME_CRIT_WARN_ST_AVAILABLE_SPARE | + NVME_CRIT_WARN_ST_DEVICE_RELIABILITY | + NVME_CRIT_WARN_ST_READ_ONLY | + NVME_CRIT_WARN_ST_VOLATILE_MEMORY_BACKUP; + if (sc->cdata->ver >= NVME_REV(1, 2)) + async_event_config |= + sc->cdata->oaes & NVME_ASYNC_EVENT_NS_ATTRIBUTE; + error = nvmf_set_async_event_config(sc, async_event_config); + if (error != 0) + return (error); + + for (u_int i = 0; i < sc->num_aer; i++) + nvmf_submit_aer(sc, &sc->aer[i]); + + return (0); +} + +void +nvmf_destroy_aer(struct nvmf_softc *sc) +{ + for (u_int i = 0; i < sc->num_aer; i++) { + taskqueue_drain(taskqueue_thread, &sc->aer[i].complete_task); + taskqueue_drain(taskqueue_thread, &sc->aer[i].finish_page_task); + free(sc->aer[i].page, M_NVMF); + } + free(sc->aer, M_NVMF); +} diff --git a/sys/dev/nvmf/host/nvmf_cmd.c b/sys/dev/nvmf/host/nvmf_cmd.c new file mode 100644 --- /dev/null +++ b/sys/dev/nvmf/host/nvmf_cmd.c @@ -0,0 +1,171 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2023-2024 Chelsio Communications, Inc. 
+ * Written by: John Baldwin + */ + +#include +#include +#include +#include +#include +#include +#include + +bool +nvmf_cmd_get_property(struct nvmf_softc *sc, uint32_t offset, uint8_t size, + nvmf_request_complete_t *cb, void *cb_arg, int how) +{ + struct nvmf_fabric_prop_get_cmd cmd; + struct nvmf_request *req; + + memset(&cmd, 0, sizeof(cmd)); + cmd.opcode = NVME_OPC_FABRICS_COMMANDS; + cmd.fctype = NVMF_FABRIC_COMMAND_PROPERTY_GET; + switch (size) { + case 4: + cmd.attrib.size = NVMF_PROP_SIZE_4; + break; + case 8: + cmd.attrib.size = NVMF_PROP_SIZE_8; + break; + default: + panic("Invalid property size"); + } + cmd.ofst = htole32(offset); + + req = nvmf_allocate_request(sc->admin, &cmd, cb, cb_arg, how); + if (req != NULL) + nvmf_submit_request(req); + return (req != NULL); +} + +bool +nvmf_cmd_set_property(struct nvmf_softc *sc, uint32_t offset, uint8_t size, + uint64_t value, nvmf_request_complete_t *cb, void *cb_arg, int how) +{ + struct nvmf_fabric_prop_set_cmd cmd; + struct nvmf_request *req; + + memset(&cmd, 0, sizeof(cmd)); + cmd.opcode = NVME_OPC_FABRICS_COMMANDS; + cmd.fctype = NVMF_FABRIC_COMMAND_PROPERTY_SET; + switch (size) { + case 4: + cmd.attrib.size = NVMF_PROP_SIZE_4; + cmd.value.u32.low = htole32(value); + break; + case 8: + cmd.attrib.size = NVMF_PROP_SIZE_8; + cmd.value.u64 = htole64(value); + break; + default: + panic("Invalid property size"); + } + cmd.ofst = htole32(offset); + + req = nvmf_allocate_request(sc->admin, &cmd, cb, cb_arg, how); + if (req != NULL) + nvmf_submit_request(req); + return (req != NULL); +} + +bool +nvmf_cmd_keep_alive(struct nvmf_softc *sc, nvmf_request_complete_t *cb, + void *cb_arg, int how) +{ + struct nvme_command cmd; + struct nvmf_request *req; + + memset(&cmd, 0, sizeof(cmd)); + cmd.opc = NVME_OPC_KEEP_ALIVE; + + req = nvmf_allocate_request(sc->admin, &cmd, cb, cb_arg, how); + if (req != NULL) + nvmf_submit_request(req); + return (req != NULL); +} + +bool +nvmf_cmd_identify_active_namespaces(struct nvmf_softc *sc, uint32_t id, + struct nvme_ns_list *nslist, nvmf_request_complete_t *req_cb, + void *req_cb_arg, nvmf_io_complete_t *io_cb, void *io_cb_arg, int how) +{ + struct nvme_command cmd; + struct memdesc mem; + struct nvmf_request *req; + + memset(&cmd, 0, sizeof(cmd)); + cmd.opc = NVME_OPC_IDENTIFY; + + /* 5.15.1 Use CNS of 0x02 for namespace data. */ + cmd.cdw10 = htole32(2); + cmd.nsid = htole32(id); + + req = nvmf_allocate_request(sc->admin, &cmd, req_cb, req_cb_arg, how); + if (req == NULL) + return (false); + mem = memdesc_vaddr(nslist, sizeof(*nslist)); + nvmf_capsule_append_data(req->nc, &mem, sizeof(*nslist), false, + io_cb, io_cb_arg); + nvmf_submit_request(req); + return (true); +} + +bool +nvmf_cmd_identify_namespace(struct nvmf_softc *sc, uint32_t id, + struct nvme_namespace_data *nsdata, nvmf_request_complete_t *req_cb, + void *req_cb_arg, nvmf_io_complete_t *io_cb, void *io_cb_arg, int how) +{ + struct nvme_command cmd; + struct memdesc mem; + struct nvmf_request *req; + + memset(&cmd, 0, sizeof(cmd)); + cmd.opc = NVME_OPC_IDENTIFY; + + /* 5.15.1 Use CNS of 0x00 for namespace data. 
*/ + cmd.cdw10 = htole32(0); + cmd.nsid = htole32(id); + + req = nvmf_allocate_request(sc->admin, &cmd, req_cb, req_cb_arg, how); + if (req == NULL) + return (false); + mem = memdesc_vaddr(nsdata, sizeof(*nsdata)); + nvmf_capsule_append_data(req->nc, &mem, sizeof(*nsdata), false, + io_cb, io_cb_arg); + nvmf_submit_request(req); + return (true); +} + +bool +nvmf_cmd_get_log_page(struct nvmf_softc *sc, uint32_t nsid, uint8_t lid, + uint64_t offset, void *buf, size_t len, nvmf_request_complete_t *req_cb, + void *req_cb_arg, nvmf_io_complete_t *io_cb, void *io_cb_arg, int how) +{ + struct nvme_command cmd; + struct memdesc mem; + struct nvmf_request *req; + size_t numd; + + MPASS(len != 0 && len % 4 == 0); + MPASS(offset % 4 == 0); + + numd = (len / 4) - 1; + memset(&cmd, 0, sizeof(cmd)); + cmd.opc = NVME_OPC_GET_LOG_PAGE; + cmd.nsid = htole32(nsid); + cmd.cdw10 = htole32(numd << 16 | lid); + cmd.cdw11 = htole32(numd >> 16); + cmd.cdw12 = htole32(offset); + cmd.cdw13 = htole32(offset >> 32); + + req = nvmf_allocate_request(sc->admin, &cmd, req_cb, req_cb_arg, how); + if (req == NULL) + return (false); + mem = memdesc_vaddr(buf, len); + nvmf_capsule_append_data(req->nc, &mem, len, false, io_cb, io_cb_arg); + nvmf_submit_request(req); + return (true); +} diff --git a/sys/dev/nvmf/host/nvmf_ctldev.c b/sys/dev/nvmf/host/nvmf_ctldev.c new file mode 100644 --- /dev/null +++ b/sys/dev/nvmf/host/nvmf_ctldev.c @@ -0,0 +1,159 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2023 Chelsio Communications, Inc. + * Written by: John Baldwin + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +static struct cdev *nvmf_cdev; + +static int +nvmf_handoff_host(struct nvmf_handoff_host *hh) +{ + struct nvmf_ivars ivars; + device_t dev; + int error; + + error = nvmf_init_ivars(&ivars, hh); + if (error != 0) + return (error); + + bus_topo_lock(); + dev = device_add_child(root_bus, "nvme", -1); + if (dev == NULL) { + bus_topo_unlock(); + error = ENXIO; + goto out; + } + + device_set_ivars(dev, &ivars); + error = device_probe_and_attach(dev); + device_set_ivars(dev, NULL); + if (error != 0) + device_delete_child(root_bus, dev); + bus_topo_unlock(); + +out: + nvmf_free_ivars(&ivars); + return (error); +} + +static bool +nvmf_matches(device_t dev, char *name) +{ + struct nvmf_softc *sc = device_get_softc(dev); + + if (strcmp(device_get_nameunit(dev), name) == 0) + return (true); + if (strcmp(sc->cdata->subnqn, name) == 0) + return (true); + return (false); +} + +static int +nvmf_disconnect_by_name(char *name) +{ + devclass_t dc; + device_t dev; + int error, unit; + bool found; + + found = false; + error = 0; + bus_topo_lock(); + dc = devclass_find("nvme"); + if (dc == NULL) + goto out; + + for (unit = 0; unit < devclass_get_maxunit(dc); unit++) { + dev = devclass_get_device(dc, unit); + if (dev == NULL) + continue; + if (device_get_driver(dev) != &nvme_nvmf_driver) + continue; + if (device_get_parent(dev) != root_bus) + continue; + if (name != NULL && !nvmf_matches(dev, name)) + continue; + + error = device_delete_child(root_bus, dev); + if (error != 0) + break; + found = true; + } +out: + bus_topo_unlock(); + if (error == 0 && !found) + error = ENOENT; + return (error); +} + +static int +nvmf_disconnect_host(const char **namep) +{ + char *name; + int error; + + name = malloc(PATH_MAX, M_NVMF, M_WAITOK); + error = copyinstr(*namep, name, PATH_MAX, NULL); + if (error == 0) + error = nvmf_disconnect_by_name(name); + free(name, M_NVMF); + return (error); +} + 
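nvmf_disconnect_by_name() above matches an association either by device nameunit (for example nvme0) or by subsystem NQN, and nvmf_ctl_ioctl() below exposes it through the /dev/nvmf control node via NVMF_DISCONNECT_HOST and NVMF_DISCONNECT_ALL, much as an nvmecontrol(8) disconnect would use it. A hedged userland sketch follows; it is illustrative only (not part of this patch) and assumes the ioctl definitions live in <dev/nvmf/nvmf.h>, a header added elsewhere in the patch and not shown in this hunk.

/*
 * Illustrative sketch: ask the nvmf(4) control device to tear down the
 * association matching a name (device nameunit or subsystem NQN).
 * NVMF_DISCONNECT_HOST is assumed to be declared in <dev/nvmf/nvmf.h>;
 * its payload is a pointer to a NUL-terminated string, matching the
 * copyinstr() in nvmf_disconnect_host() above.
 */
#include <sys/ioctl.h>

#include <dev/nvmf/nvmf.h>	/* assumed location of the ioctl definitions */

#include <err.h>
#include <fcntl.h>
#include <unistd.h>

int
main(int argc, char **argv)
{
	const char *name;
	int fd;

	if (argc != 2)
		errx(1, "usage: disconnect <nvmeX | subnqn>");
	name = argv[1];

	fd = open("/dev/nvmf", O_RDWR);
	if (fd < 0)
		err(1, "open(/dev/nvmf)");
	if (ioctl(fd, NVMF_DISCONNECT_HOST, &name) < 0)
		err(1, "NVMF_DISCONNECT_HOST");
	close(fd);
	return (0);
}

Using NVMF_DISCONNECT_ALL instead takes no name at all; nvmf_ctl_ioctl() below maps it to nvmf_disconnect_by_name(NULL), which detaches every Fabrics controller.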
+static int +nvmf_ctl_ioctl(struct cdev *dev, u_long cmd, caddr_t arg, int flag, + struct thread *td) +{ + switch (cmd) { + case NVMF_HANDOFF_HOST: + return (nvmf_handoff_host((struct nvmf_handoff_host *)arg)); + case NVMF_DISCONNECT_HOST: + return (nvmf_disconnect_host((const char **)arg)); + case NVMF_DISCONNECT_ALL: + return (nvmf_disconnect_by_name(NULL)); + default: + return (ENOTTY); + } +} + +static struct cdevsw nvmf_ctl_cdevsw = { + .d_version = D_VERSION, + .d_ioctl = nvmf_ctl_ioctl +}; + +int +nvmf_ctl_load(void) +{ + struct make_dev_args mda; + int error; + + make_dev_args_init(&mda); + mda.mda_devsw = &nvmf_ctl_cdevsw; + mda.mda_uid = UID_ROOT; + mda.mda_gid = GID_WHEEL; + mda.mda_mode = 0600; + error = make_dev_s(&mda, &nvmf_cdev, "nvmf"); + if (error != 0) + nvmf_cdev = NULL; + return (error); +} + +void +nvmf_ctl_unload(void) +{ + if (nvmf_cdev != NULL) { + destroy_dev(nvmf_cdev); + nvmf_cdev = NULL; + } +} diff --git a/sys/dev/nvmf/host/nvmf_ns.c b/sys/dev/nvmf/host/nvmf_ns.c new file mode 100644 --- /dev/null +++ b/sys/dev/nvmf/host/nvmf_ns.c @@ -0,0 +1,483 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2023-2024 Chelsio Communications, Inc. + * Written by: John Baldwin + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct nvmf_namespace { + struct nvmf_softc *sc; + uint64_t size; + uint32_t id; + u_int flags; + uint32_t lba_size; + bool disconnected; + + TAILQ_HEAD(, bio) pending_bios; + struct mtx lock; + volatile u_int active_bios; + + struct cdev *cdev; +}; + +static void nvmf_ns_strategy(struct bio *bio); + +static void +ns_printf(struct nvmf_namespace *ns, const char *fmt, ...) +{ + char buf[128]; + struct sbuf sb; + va_list ap; + + sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN); + sbuf_set_drain(&sb, sbuf_printf_drain, NULL); + + sbuf_printf(&sb, "%sns%u: ", device_get_nameunit(ns->sc->dev), + ns->id); + + va_start(ap, fmt); + sbuf_vprintf(&sb, fmt, ap); + va_end(ap); + + sbuf_finish(&sb); + sbuf_delete(&sb); +} + +/* + * The I/O completion may trigger after the received CQE if the I/O + * used a zero-copy mbuf that isn't harvested until after the NIC + * driver processes TX completions. Abuse bio_driver1 as a refcount. + * Store I/O errors in bio_driver2. + */ +static __inline u_int * +bio_refs(struct bio *bio) +{ + return ((u_int *)&bio->bio_driver1); +} + +static void +nvmf_ns_biodone(struct bio *bio) +{ + struct nvmf_namespace *ns; + int error; + + if (!refcount_release(bio_refs(bio))) + return; + + ns = bio->bio_dev->si_drv1; + + /* If a request is aborted, resubmit or queue it for resubmission. */ + if (bio->bio_error == ECONNABORTED) { + bio->bio_error = 0; + bio->bio_driver2 = 0; + mtx_lock(&ns->lock); + if (ns->disconnected) { + TAILQ_INSERT_TAIL(&ns->pending_bios, bio, bio_queue); + mtx_unlock(&ns->lock); + } else { + mtx_unlock(&ns->lock); + nvmf_ns_strategy(bio); + } + } else { + /* + * I/O errors take precedence over generic EIO from + * CQE errors. 
+ */ + error = (intptr_t)bio->bio_driver2; + if (error != 0) + bio->bio_error = error; + if (bio->bio_error != 0) + bio->bio_flags |= BIO_ERROR; + biodone(bio); + } + + if (refcount_release(&ns->active_bios)) + wakeup(ns); +} + +static void +nvmf_ns_io_complete(void *arg, size_t xfered, int error) +{ + struct bio *bio = arg; + + KASSERT(xfered <= bio->bio_bcount, + ("%s: xfered > bio_bcount", __func__)); + + bio->bio_driver2 = (void *)(intptr_t)error; + bio->bio_resid = bio->bio_bcount - xfered; + + nvmf_ns_biodone(bio); +} + +static void +nvmf_ns_delete_complete(void *arg, size_t xfered, int error) +{ + struct bio *bio = arg; + + if (error != 0) + bio->bio_resid = bio->bio_bcount; + else + bio->bio_resid = 0; + + free(bio->bio_driver2, M_NVMF); + bio->bio_driver2 = (void *)(intptr_t)error; + + nvmf_ns_biodone(bio); +} + +static void +nvmf_ns_bio_complete(void *arg, const struct nvme_completion *cqe) +{ + struct bio *bio = arg; + + if (nvmf_cqe_aborted(cqe)) + bio->bio_error = ECONNABORTED; + else if (cqe->status != 0) + bio->bio_error = EIO; + + nvmf_ns_biodone(bio); +} + +static int +nvmf_ns_submit_bio(struct nvmf_namespace *ns, struct bio *bio) +{ + struct nvme_command cmd; + struct nvmf_request *req; + struct nvme_dsm_range *dsm_range; + struct memdesc mem; + uint64_t lba, lba_count; + + dsm_range = NULL; + memset(&cmd, 0, sizeof(cmd)); + switch (bio->bio_cmd) { + case BIO_READ: + lba = bio->bio_offset / ns->lba_size; + lba_count = bio->bio_bcount / ns->lba_size; + nvme_ns_read_cmd(&cmd, ns->id, lba, lba_count); + break; + case BIO_WRITE: + lba = bio->bio_offset / ns->lba_size; + lba_count = bio->bio_bcount / ns->lba_size; + nvme_ns_write_cmd(&cmd, ns->id, lba, lba_count); + break; + case BIO_FLUSH: + nvme_ns_flush_cmd(&cmd, ns->id); + break; + case BIO_DELETE: + dsm_range = malloc(sizeof(*dsm_range), M_NVMF, M_NOWAIT | + M_ZERO); + if (dsm_range == NULL) + return (ENOMEM); + lba = bio->bio_offset / ns->lba_size; + lba_count = bio->bio_bcount / ns->lba_size; + dsm_range->starting_lba = htole64(lba); + dsm_range->length = htole32(lba_count); + + cmd.opc = NVME_OPC_DATASET_MANAGEMENT; + cmd.nsid = htole32(ns->id); + cmd.cdw10 = htole32(0); /* 1 range */ + cmd.cdw11 = htole32(NVME_DSM_ATTR_DEALLOCATE); + break; + default: + return (EOPNOTSUPP); + } + + mtx_lock(&ns->lock); + if (ns->disconnected) { + TAILQ_INSERT_TAIL(&ns->pending_bios, bio, bio_queue); + mtx_unlock(&ns->lock); + free(dsm_range, M_NVMF); + return (0); + } + + req = nvmf_allocate_request(nvmf_select_io_queue(ns->sc), &cmd, + nvmf_ns_bio_complete, bio, M_NOWAIT); + if (req == NULL) { + mtx_unlock(&ns->lock); + free(dsm_range, M_NVMF); + return (ENOMEM); + } + + switch (bio->bio_cmd) { + case BIO_READ: + case BIO_WRITE: + refcount_init(bio_refs(bio), 2); + mem = memdesc_bio(bio); + nvmf_capsule_append_data(req->nc, &mem, bio->bio_bcount, + bio->bio_cmd == BIO_WRITE, nvmf_ns_io_complete, bio); + break; + case BIO_DELETE: + refcount_init(bio_refs(bio), 2); + mem = memdesc_vaddr(dsm_range, sizeof(*dsm_range)); + nvmf_capsule_append_data(req->nc, &mem, sizeof(*dsm_range), + true, nvmf_ns_delete_complete, bio); + bio->bio_driver2 = dsm_range; + break; + default: + refcount_init(bio_refs(bio), 1); + KASSERT(bio->bio_resid == 0, + ("%s: input bio_resid != 0", __func__)); + break; + } + + refcount_acquire(&ns->active_bios); + nvmf_submit_request(req); + mtx_unlock(&ns->lock); + return (0); +} + +static int +nvmf_ns_ioctl(struct cdev *dev, u_long cmd, caddr_t arg, int flag, + struct thread *td) +{ + struct nvmf_namespace *ns = 
dev->si_drv1; + struct nvme_get_nsid *gnsid; + struct nvme_pt_command *pt; + + switch (cmd) { + case NVME_PASSTHROUGH_CMD: + pt = (struct nvme_pt_command *)arg; + pt->cmd.nsid = htole32(ns->id); + return (nvmf_passthrough_cmd(ns->sc, pt, false)); + case NVME_GET_NSID: + gnsid = (struct nvme_get_nsid *)arg; + strncpy(gnsid->cdev, device_get_nameunit(ns->sc->dev), + sizeof(gnsid->cdev)); + gnsid->cdev[sizeof(gnsid->cdev) - 1] = '\0'; + gnsid->nsid = ns->id; + return (0); + case DIOCGMEDIASIZE: + *(off_t *)arg = ns->size; + return (0); + case DIOCGSECTORSIZE: + *(u_int *)arg = ns->lba_size; + return (0); + default: + return (ENOTTY); + } +} + +static int +nvmf_ns_open(struct cdev *dev, int oflags, int devtype, struct thread *td) +{ + int error; + + error = 0; + if ((oflags & FWRITE) != 0) + error = securelevel_gt(td->td_ucred, 0); + return (error); +} + +void +nvmf_ns_strategy(struct bio *bio) +{ + struct nvmf_namespace *ns; + int error; + + ns = bio->bio_dev->si_drv1; + + error = nvmf_ns_submit_bio(ns, bio); + if (error != 0) { + bio->bio_error = error; + bio->bio_flags |= BIO_ERROR; + bio->bio_resid = bio->bio_bcount; + biodone(bio); + } +} + +static struct cdevsw nvmf_ns_cdevsw = { + .d_version = D_VERSION, + .d_flags = D_DISK, + .d_open = nvmf_ns_open, + .d_read = physread, + .d_write = physwrite, + .d_strategy = nvmf_ns_strategy, + .d_ioctl = nvmf_ns_ioctl +}; + +struct nvmf_namespace * +nvmf_init_ns(struct nvmf_softc *sc, uint32_t id, + struct nvme_namespace_data *data) +{ + struct make_dev_args mda; + struct nvmf_namespace *ns; + int error; + uint8_t lbads, lbaf; + + ns = malloc(sizeof(*ns), M_NVMF, M_WAITOK | M_ZERO); + ns->sc = sc; + ns->id = id; + TAILQ_INIT(&ns->pending_bios); + mtx_init(&ns->lock, "nvmf ns", NULL, MTX_DEF); + + /* One dummy bio avoids dropping to 0 until destroy. */ + refcount_init(&ns->active_bios, 1); + + if (NVMEV(NVME_NS_DATA_DPS_PIT, data->dps) != 0) { + ns_printf(ns, "End-to-end data protection not supported\n"); + goto fail; + } + + lbaf = NVMEV(NVME_NS_DATA_FLBAS_FORMAT, data->flbas); + if (lbaf > data->nlbaf) { + ns_printf(ns, "Invalid LBA format index\n"); + goto fail; + } + + if (NVMEV(NVME_NS_DATA_LBAF_MS, data->lbaf[lbaf]) != 0) { + ns_printf(ns, "Namespaces with metadata are not supported\n"); + goto fail; + } + + lbads = NVMEV(NVME_NS_DATA_LBAF_LBADS, data->lbaf[lbaf]); + if (lbads == 0) { + ns_printf(ns, "Invalid LBA format index\n"); + goto fail; + } + + ns->lba_size = 1 << lbads; + ns->size = data->nsze * ns->lba_size; + + if (nvme_ctrlr_has_dataset_mgmt(sc->cdata)) + ns->flags |= NVME_NS_DEALLOCATE_SUPPORTED; + + if (NVMEV(NVME_CTRLR_DATA_VWC_PRESENT, sc->cdata->vwc) != 0) + ns->flags |= NVME_NS_FLUSH_SUPPORTED; + + /* + * XXX: Does any of the boundary splitting for NOIOB make any + * sense for Fabrics? 
+ */ + + make_dev_args_init(&mda); + mda.mda_devsw = &nvmf_ns_cdevsw; + mda.mda_uid = UID_ROOT; + mda.mda_gid = GID_WHEEL; + mda.mda_mode = 0600; + mda.mda_si_drv1 = ns; + error = make_dev_s(&mda, &ns->cdev, "%sns%u", + device_get_nameunit(sc->dev), id); + if (error != 0) + goto fail; + + ns->cdev->si_flags |= SI_UNMAPPED; + + return (ns); +fail: + mtx_destroy(&ns->lock); + free(ns, M_NVMF); + return (NULL); +} + +void +nvmf_disconnect_ns(struct nvmf_namespace *ns) +{ + mtx_lock(&ns->lock); + ns->disconnected = true; + mtx_unlock(&ns->lock); +} + +void +nvmf_reconnect_ns(struct nvmf_namespace *ns) +{ + TAILQ_HEAD(, bio) bios; + struct bio *bio; + + mtx_lock(&ns->lock); + ns->disconnected = false; + TAILQ_INIT(&bios); + TAILQ_CONCAT(&bios, &ns->pending_bios, bio_queue); + mtx_unlock(&ns->lock); + + while (!TAILQ_EMPTY(&bios)) { + bio = TAILQ_FIRST(&bios); + TAILQ_REMOVE(&bios, bio, bio_queue); + nvmf_ns_strategy(bio); + } +} + +void +nvmf_destroy_ns(struct nvmf_namespace *ns) +{ + TAILQ_HEAD(, bio) bios; + struct bio *bio; + + destroy_dev(ns->cdev); + + /* + * Wait for active I/O requests to drain. The release drops + * the reference on the "dummy bio" when the namespace is + * created. + */ + mtx_lock(&ns->lock); + if (!refcount_release(&ns->active_bios)) { + while (ns->active_bios != 0) + mtx_sleep(ns, &ns->lock, 0, "nvmfrmns", 0); + } + + /* Abort any pending I/O requests. */ + TAILQ_INIT(&bios); + TAILQ_CONCAT(&bios, &ns->pending_bios, bio_queue); + mtx_unlock(&ns->lock); + + while (!TAILQ_EMPTY(&bios)) { + bio = TAILQ_FIRST(&bios); + TAILQ_REMOVE(&bios, bio, bio_queue); + bio->bio_error = ECONNABORTED; + bio->bio_flags |= BIO_ERROR; + bio->bio_resid = bio->bio_bcount; + biodone(bio); + } + + mtx_destroy(&ns->lock); + free(ns, M_NVMF); +} + +bool +nvmf_update_ns(struct nvmf_namespace *ns, struct nvme_namespace_data *data) +{ + uint8_t lbads, lbaf; + + if (NVMEV(NVME_NS_DATA_DPS_PIT, data->dps) != 0) { + ns_printf(ns, "End-to-end data protection not supported\n"); + return (false); + } + + lbaf = NVMEV(NVME_NS_DATA_FLBAS_FORMAT, data->flbas); + if (lbaf > data->nlbaf) { + ns_printf(ns, "Invalid LBA format index\n"); + return (false); + } + + if (NVMEV(NVME_NS_DATA_LBAF_MS, data->lbaf[lbaf]) != 0) { + ns_printf(ns, "Namespaces with metadata are not supported\n"); + return (false); + } + + lbads = NVMEV(NVME_NS_DATA_LBAF_LBADS, data->lbaf[lbaf]); + if (lbads == 0) { + ns_printf(ns, "Invalid LBA format index\n"); + return (false); + } + + ns->lba_size = 1 << lbads; + ns->size = data->nsze * ns->lba_size; + return (true); +} diff --git a/sys/dev/nvmf/host/nvmf_qpair.c b/sys/dev/nvmf/host/nvmf_qpair.c new file mode 100644 --- /dev/null +++ b/sys/dev/nvmf/host/nvmf_qpair.c @@ -0,0 +1,386 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2023-2024 Chelsio Communications, Inc. + * Written by: John Baldwin + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct nvmf_host_command { + struct nvmf_request *req; + TAILQ_ENTRY(nvmf_host_command) link; + uint16_t cid; +}; + +struct nvmf_host_qpair { + struct nvmf_softc *sc; + struct nvmf_qpair *qp; + + bool sq_flow_control; + bool shutting_down; + u_int allocating; + u_int num_commands; + uint16_t sqhd; + uint16_t sqtail; + + struct mtx lock; + + TAILQ_HEAD(, nvmf_host_command) free_commands; + STAILQ_HEAD(, nvmf_request) pending_requests; + + /* Indexed by cid. 
*/ + struct nvmf_host_command **active_commands; + + char name[16]; +}; + +struct nvmf_request * +nvmf_allocate_request(struct nvmf_host_qpair *qp, void *sqe, + nvmf_request_complete_t *cb, void *cb_arg, int how) +{ + struct nvmf_request *req; + struct nvmf_qpair *nq; + + KASSERT(how == M_WAITOK || how == M_NOWAIT, + ("%s: invalid how", __func__)); + + req = malloc(sizeof(*req), M_NVMF, how | M_ZERO); + if (req == NULL) + return (NULL); + + mtx_lock(&qp->lock); + nq = qp->qp; + if (nq == NULL) { + mtx_unlock(&qp->lock); + free(req, M_NVMF); + return (NULL); + } + qp->allocating++; + MPASS(qp->allocating != 0); + mtx_unlock(&qp->lock); + + req->qp = qp; + req->cb = cb; + req->cb_arg = cb_arg; + req->nc = nvmf_allocate_command(nq, sqe, how); + if (req->nc == NULL) { + free(req, M_NVMF); + req = NULL; + } + + mtx_lock(&qp->lock); + qp->allocating--; + if (qp->allocating == 0 && qp->shutting_down) + wakeup(qp); + mtx_unlock(&qp->lock); + + return (req); +} + +static void +nvmf_abort_request(struct nvmf_request *req, uint16_t cid) +{ + struct nvme_completion cqe; + + memset(&cqe, 0, sizeof(cqe)); + cqe.cid = cid; + cqe.status = htole16(NVMEF(NVME_STATUS_SCT, NVME_SCT_PATH_RELATED) | + NVMEF(NVME_STATUS_SC, NVME_SC_COMMAND_ABORTED_BY_HOST)); + req->cb(req->cb_arg, &cqe); +} + +void +nvmf_free_request(struct nvmf_request *req) +{ + if (req->nc != NULL) + nvmf_free_capsule(req->nc); + free(req, M_NVMF); +} + +static void +nvmf_dispatch_command(struct nvmf_host_qpair *qp, struct nvmf_host_command *cmd) +{ + struct nvmf_softc *sc = qp->sc; + struct nvme_command *sqe; + struct nvmf_capsule *nc; + int error; + + nc = cmd->req->nc; + sqe = nvmf_capsule_sqe(nc); + + /* + * NB: Don't bother byte-swapping the cid so that receive + * doesn't have to swap. + */ + sqe->cid = cmd->cid; + + error = nvmf_transmit_capsule(nc); + if (error != 0) { + device_printf(sc->dev, + "failed to transmit capsule: %d, disconnecting\n", error); + nvmf_disconnect(sc); + return; + } + + if (sc->ka_traffic) + atomic_store_int(&sc->ka_active_tx_traffic, 1); +} + +static void +nvmf_qp_error(void *arg, int error) +{ + struct nvmf_host_qpair *qp = arg; + struct nvmf_softc *sc = qp->sc; + + /* Ignore simple close of queue pairs during shutdown. */ + if (!(sc->detaching && error == 0)) + device_printf(sc->dev, "error %d on %s, disconnecting\n", error, + qp->name); + nvmf_disconnect(sc); +} + +static void +nvmf_receive_capsule(void *arg, struct nvmf_capsule *nc) +{ + struct nvmf_host_qpair *qp = arg; + struct nvmf_softc *sc = qp->sc; + struct nvmf_host_command *cmd; + struct nvmf_request *req; + const struct nvme_completion *cqe; + uint16_t cid; + + cqe = nvmf_capsule_cqe(nc); + + if (sc->ka_traffic) + atomic_store_int(&sc->ka_active_rx_traffic, 1); + + /* + * NB: Don't bother byte-swapping the cid as transmit doesn't + * swap either. + */ + cid = cqe->cid; + + if (cid > qp->num_commands) { + device_printf(sc->dev, + "received invalid CID %u, disconnecting\n", cid); + nvmf_disconnect(sc); + nvmf_free_capsule(nc); + return; + } + + /* + * If the queue has been shutdown due to an error, silently + * drop the response. 
+ */ + mtx_lock(&qp->lock); + if (qp->qp == NULL) { + device_printf(sc->dev, + "received completion for CID %u on shutdown %s\n", cid, + qp->name); + mtx_unlock(&qp->lock); + nvmf_free_capsule(nc); + return; + } + + cmd = qp->active_commands[cid]; + if (cmd == NULL) { + mtx_unlock(&qp->lock); + device_printf(sc->dev, + "received completion for inactive CID %u, disconnecting\n", + cid); + nvmf_disconnect(sc); + nvmf_free_capsule(nc); + return; + } + + KASSERT(cmd->cid == cid, ("%s: CID mismatch", __func__)); + req = cmd->req; + cmd->req = NULL; + if (STAILQ_EMPTY(&qp->pending_requests)) { + qp->active_commands[cid] = NULL; + TAILQ_INSERT_TAIL(&qp->free_commands, cmd, link); + mtx_unlock(&qp->lock); + } else { + cmd->req = STAILQ_FIRST(&qp->pending_requests); + STAILQ_REMOVE_HEAD(&qp->pending_requests, link); + mtx_unlock(&qp->lock); + nvmf_dispatch_command(qp, cmd); + } + + req->cb(req->cb_arg, cqe); + nvmf_free_capsule(nc); + nvmf_free_request(req); +} + +struct nvmf_host_qpair * +nvmf_init_qp(struct nvmf_softc *sc, enum nvmf_trtype trtype, + struct nvmf_handoff_qpair_params *handoff, const char *name) +{ + struct nvmf_host_command *cmd, *ncmd; + struct nvmf_host_qpair *qp; + u_int i; + + qp = malloc(sizeof(*qp), M_NVMF, M_WAITOK | M_ZERO); + qp->sc = sc; + qp->sq_flow_control = handoff->sq_flow_control; + qp->sqhd = handoff->sqhd; + qp->sqtail = handoff->sqtail; + strlcpy(qp->name, name, sizeof(qp->name)); + mtx_init(&qp->lock, "nvmf qp", NULL, MTX_DEF); + + /* + * Allocate a spare command slot for each pending AER command + * on the admin queue. + */ + qp->num_commands = handoff->qsize - 1; + if (handoff->admin) + qp->num_commands += sc->num_aer; + + qp->active_commands = malloc(sizeof(*qp->active_commands) * + qp->num_commands, M_NVMF, M_WAITOK | M_ZERO); + TAILQ_INIT(&qp->free_commands); + for (i = 0; i < qp->num_commands; i++) { + cmd = malloc(sizeof(*cmd), M_NVMF, M_WAITOK | M_ZERO); + cmd->cid = i; + TAILQ_INSERT_TAIL(&qp->free_commands, cmd, link); + } + STAILQ_INIT(&qp->pending_requests); + + qp->qp = nvmf_allocate_qpair(trtype, false, handoff, nvmf_qp_error, + qp, nvmf_receive_capsule, qp); + if (qp->qp == NULL) { + TAILQ_FOREACH_SAFE(cmd, &qp->free_commands, link, ncmd) { + TAILQ_REMOVE(&qp->free_commands, cmd, link); + free(cmd, M_NVMF); + } + free(qp->active_commands, M_NVMF); + mtx_destroy(&qp->lock); + free(qp, M_NVMF); + return (NULL); + } + + return (qp); +} + +void +nvmf_shutdown_qp(struct nvmf_host_qpair *qp) +{ + struct nvmf_host_command *cmd; + struct nvmf_request *req; + struct nvmf_qpair *nq; + + mtx_lock(&qp->lock); + nq = qp->qp; + qp->qp = NULL; + + if (nq == NULL) { + while (qp->shutting_down) + mtx_sleep(qp, &qp->lock, 0, "nvmfqpsh", 0); + mtx_unlock(&qp->lock); + return; + } + qp->shutting_down = true; + while (qp->allocating != 0) + mtx_sleep(qp, &qp->lock, 0, "nvmfqpqu", 0); + mtx_unlock(&qp->lock); + + nvmf_free_qpair(nq); + + /* + * Abort outstanding requests. Active requests will have + * their I/O completions invoked and associated capsules freed + * by the transport layer via nvmf_free_qpair. Pending + * requests must have their I/O completion invoked via + * nvmf_abort_capsule_data. + */ + for (u_int i = 0; i < qp->num_commands; i++) { + cmd = qp->active_commands[i]; + if (cmd != NULL) { + if (!cmd->req->aer) + printf("%s: aborted active command %p (CID %u)\n", + __func__, cmd->req, cmd->cid); + + /* This was freed by nvmf_free_qpair. 
*/ + cmd->req->nc = NULL; + nvmf_abort_request(cmd->req, cmd->cid); + nvmf_free_request(cmd->req); + free(cmd, M_NVMF); + } + } + while (!STAILQ_EMPTY(&qp->pending_requests)) { + req = STAILQ_FIRST(&qp->pending_requests); + STAILQ_REMOVE_HEAD(&qp->pending_requests, link); + if (!req->aer) + printf("%s: aborted pending command %p\n", __func__, + req); + nvmf_abort_capsule_data(req->nc, ECONNABORTED); + nvmf_abort_request(req, 0); + nvmf_free_request(req); + } + + mtx_lock(&qp->lock); + qp->shutting_down = false; + mtx_unlock(&qp->lock); + wakeup(qp); +} + +void +nvmf_destroy_qp(struct nvmf_host_qpair *qp) +{ + struct nvmf_host_command *cmd, *ncmd; + + nvmf_shutdown_qp(qp); + + TAILQ_FOREACH_SAFE(cmd, &qp->free_commands, link, ncmd) { + TAILQ_REMOVE(&qp->free_commands, cmd, link); + free(cmd, M_NVMF); + } + free(qp->active_commands, M_NVMF); + mtx_destroy(&qp->lock); + free(qp, M_NVMF); +} + +void +nvmf_submit_request(struct nvmf_request *req) +{ + struct nvmf_host_qpair *qp; + struct nvmf_host_command *cmd; + + qp = req->qp; + mtx_lock(&qp->lock); + if (qp->qp == NULL) { + mtx_unlock(&qp->lock); + printf("%s: aborted pending command %p\n", __func__, req); + nvmf_abort_capsule_data(req->nc, ECONNABORTED); + nvmf_abort_request(req, 0); + nvmf_free_request(req); + return; + } + cmd = TAILQ_FIRST(&qp->free_commands); + if (cmd == NULL) { + /* + * Queue this request. Will be sent after enough + * in-flight requests have completed. + */ + STAILQ_INSERT_TAIL(&qp->pending_requests, req, link); + mtx_unlock(&qp->lock); + return; + } + + TAILQ_REMOVE(&qp->free_commands, cmd, link); + KASSERT(qp->active_commands[cmd->cid] == NULL, + ("%s: CID already busy", __func__)); + qp->active_commands[cmd->cid] = cmd; + cmd->req = req; + mtx_unlock(&qp->lock); + nvmf_dispatch_command(qp, cmd); +} diff --git a/sys/dev/nvmf/host/nvmf_sim.c b/sys/dev/nvmf/host/nvmf_sim.c new file mode 100644 --- /dev/null +++ b/sys/dev/nvmf/host/nvmf_sim.c @@ -0,0 +1,332 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2023-2024 Chelsio Communications, Inc. + * Written by: John Baldwin + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include + +/* + * The I/O completion may trigger after the received CQE if the I/O + * used a zero-copy mbuf that isn't harvested until after the NIC + * driver processes TX completions. Use spriv_field0 to as a refcount. + * + * Store any I/O error returned in spriv_field1. 
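+ * The CCB is completed only after both the command completion and,
+ * when data was transferred, the I/O completion have released their
+ * reference.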
+ */ +static __inline u_int * +ccb_refs(union ccb *ccb) +{ + return ((u_int *)&ccb->ccb_h.spriv_field0); +} + +#define spriv_ioerror spriv_field1 + +static void +nvmf_ccb_done(union ccb *ccb) +{ + if (!refcount_release(ccb_refs(ccb))) + return; + + if (nvmf_cqe_aborted(&ccb->nvmeio.cpl)) { + ccb->ccb_h.status = CAM_REQUEUE_REQ; + xpt_done(ccb); + } else if (ccb->nvmeio.cpl.status != 0) { + ccb->ccb_h.status = CAM_NVME_STATUS_ERROR; + xpt_done(ccb); + } else if (ccb->ccb_h.spriv_ioerror != 0) { + KASSERT(ccb->ccb_h.spriv_ioerror != EJUSTRETURN, + ("%s: zero sized transfer without CQE error", __func__)); + ccb->ccb_h.status = CAM_REQ_CMP_ERR; + xpt_done(ccb); + } else { + ccb->ccb_h.status = CAM_REQ_CMP; + xpt_done_direct(ccb); + } +} + +static void +nvmf_ccb_io_complete(void *arg, size_t xfered, int error) +{ + union ccb *ccb = arg; + + /* + * TODO: Reporting partial completions requires extending + * nvmeio to support resid and updating nda to handle partial + * reads, either by returning partial success (or an error) to + * the caller, or retrying all or part of the request. + */ + ccb->ccb_h.spriv_ioerror = error; + if (error == 0) { + if (xfered == 0) { +#ifdef INVARIANTS + /* + * If the request fails with an error in the CQE + * there will be no data transferred but also no + * I/O error. + */ + ccb->ccb_h.spriv_ioerror = EJUSTRETURN; +#endif + } else + KASSERT(xfered == ccb->nvmeio.dxfer_len, + ("%s: partial CCB completion", __func__)); + } + + nvmf_ccb_done(ccb); +} + +static void +nvmf_ccb_complete(void *arg, const struct nvme_completion *cqe) +{ + union ccb *ccb = arg; + + ccb->nvmeio.cpl = *cqe; + nvmf_ccb_done(ccb); +} + +static void +nvmf_sim_io(struct nvmf_softc *sc, union ccb *ccb) +{ + struct ccb_nvmeio *nvmeio = &ccb->nvmeio; + struct memdesc mem; + struct nvmf_request *req; + struct nvmf_host_qpair *qp; + + mtx_lock(&sc->sim_mtx); + if (sc->sim_disconnected) { + mtx_unlock(&sc->sim_mtx); + nvmeio->ccb_h.status = CAM_REQUEUE_REQ; + xpt_done(ccb); + return; + } + if (nvmeio->ccb_h.func_code == XPT_NVME_IO) + qp = nvmf_select_io_queue(sc); + else + qp = sc->admin; + req = nvmf_allocate_request(qp, &nvmeio->cmd, nvmf_ccb_complete, + ccb, M_NOWAIT); + if (req == NULL) { + mtx_unlock(&sc->sim_mtx); + nvmeio->ccb_h.status = CAM_RESRC_UNAVAIL; + xpt_done(ccb); + return; + } + + if (nvmeio->dxfer_len != 0) { + refcount_init(ccb_refs(ccb), 2); + mem = memdesc_ccb(ccb); + nvmf_capsule_append_data(req->nc, &mem, nvmeio->dxfer_len, + (ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_OUT, + nvmf_ccb_io_complete, ccb); + } else + refcount_init(ccb_refs(ccb), 1); + + /* + * Clear spriv_ioerror as it can hold an earlier error if this + * CCB was aborted and has been retried. 
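+	 * Otherwise nvmf_ccb_done() could mistake the stale value for an
+	 * error from this attempt.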
+ */ + ccb->ccb_h.spriv_ioerror = 0; + KASSERT(ccb->ccb_h.status == CAM_REQ_INPROG, + ("%s: incoming CCB is not in-progress", __func__)); + ccb->ccb_h.status |= CAM_SIM_QUEUED; + nvmf_submit_request(req); + mtx_unlock(&sc->sim_mtx); +} + +static void +nvmf_sim_action(struct cam_sim *sim, union ccb *ccb) +{ + struct nvmf_softc *sc = cam_sim_softc(sim); + + CAM_DEBUG(ccb->ccb_h.path, CAM_DEBUG_TRACE, + ("nvmf_sim_action: func= %#x\n", + ccb->ccb_h.func_code)); + + switch (ccb->ccb_h.func_code) { + case XPT_PATH_INQ: /* Path routing inquiry */ + { + struct ccb_pathinq *cpi = &ccb->cpi; + + cpi->version_num = 1; + cpi->hba_inquiry = 0; + cpi->target_sprt = 0; + cpi->hba_misc = PIM_UNMAPPED | PIM_NOSCAN; + cpi->hba_eng_cnt = 0; + cpi->max_target = 0; + cpi->max_lun = sc->cdata->nn; + cpi->async_flags = 0; + cpi->hpath_id = 0; + cpi->initiator_id = 0; + strlcpy(cpi->sim_vid, "FreeBSD", SIM_IDLEN); + strlcpy(cpi->hba_vid, "NVMeoF", HBA_IDLEN); + strlcpy(cpi->dev_name, cam_sim_name(sim), DEV_IDLEN); + cpi->unit_number = cam_sim_unit(sim); + cpi->bus_id = 0; + + /* XXX: Same as iSCSI. */ + cpi->base_transfer_speed = 150000; + cpi->protocol = PROTO_NVME; + cpi->protocol_version = sc->vs; + cpi->transport = XPORT_NVMF; + cpi->transport_version = sc->vs; + cpi->xport_specific.nvmf.nsid = + xpt_path_lun_id(ccb->ccb_h.path); + cpi->xport_specific.nvmf.trtype = sc->trtype; + strncpy(cpi->xport_specific.nvmf.dev_name, + device_get_nameunit(sc->dev), + sizeof(cpi->xport_specific.nvmf.dev_name)); + cpi->maxio = sc->max_xfer_size; + cpi->hba_vendor = 0; + cpi->hba_device = 0; + cpi->hba_subvendor = 0; + cpi->hba_subdevice = 0; + cpi->ccb_h.status = CAM_REQ_CMP; + break; + } + case XPT_GET_TRAN_SETTINGS: /* Get transport settings */ + { + struct ccb_trans_settings *cts = &ccb->cts; + struct ccb_trans_settings_nvme *nvme; + struct ccb_trans_settings_nvmf *nvmf; + + cts->protocol = PROTO_NVME; + cts->protocol_version = sc->vs; + cts->transport = XPORT_NVMF; + cts->transport_version = sc->vs; + + nvme = &cts->proto_specific.nvme; + nvme->valid = CTS_NVME_VALID_SPEC; + nvme->spec = sc->vs; + + nvmf = &cts->xport_specific.nvmf; + nvmf->valid = CTS_NVMF_VALID_TRTYPE; + nvmf->trtype = sc->trtype; + cts->ccb_h.status = CAM_REQ_CMP; + break; + } + case XPT_SET_TRAN_SETTINGS: /* Set transport settings */ + /* + * No transfer settings can be set, but nvme_xpt sends + * this anyway. 
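+		 * Completing the CCB successfully keeps it a harmless no-op.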
+ */ + ccb->ccb_h.status = CAM_REQ_CMP; + break; + case XPT_NVME_IO: /* Execute the requested I/O */ + case XPT_NVME_ADMIN: /* or Admin operation */ + nvmf_sim_io(sc, ccb); + return; + default: + /* XXX */ + device_printf(sc->dev, "unhandled sim function %#x\n", + ccb->ccb_h.func_code); + ccb->ccb_h.status = CAM_REQ_INVALID; + break; + } + xpt_done(ccb); +} + +int +nvmf_init_sim(struct nvmf_softc *sc) +{ + struct cam_devq *devq; + int max_trans; + + max_trans = sc->max_pending_io * 3 / 4; + devq = cam_simq_alloc(max_trans); + if (devq == NULL) { + device_printf(sc->dev, "Failed to allocate CAM simq\n"); + return (ENOMEM); + } + + mtx_init(&sc->sim_mtx, "nvmf sim", NULL, MTX_DEF); + sc->sim = cam_sim_alloc(nvmf_sim_action, NULL, "nvme", sc, + device_get_unit(sc->dev), NULL, max_trans, max_trans, devq); + if (sc->sim == NULL) { + device_printf(sc->dev, "Failed to allocate CAM sim\n"); + cam_simq_free(devq); + mtx_destroy(&sc->sim_mtx); + return (ENXIO); + } + if (xpt_bus_register(sc->sim, sc->dev, 0) != CAM_SUCCESS) { + device_printf(sc->dev, "Failed to create CAM bus\n"); + cam_sim_free(sc->sim, TRUE); + mtx_destroy(&sc->sim_mtx); + return (ENXIO); + } + if (xpt_create_path(&sc->path, NULL, cam_sim_path(sc->sim), + CAM_TARGET_WILDCARD, CAM_LUN_WILDCARD) != CAM_REQ_CMP) { + device_printf(sc->dev, "Failed to create CAM path\n"); + xpt_bus_deregister(cam_sim_path(sc->sim)); + cam_sim_free(sc->sim, TRUE); + mtx_destroy(&sc->sim_mtx); + return (ENXIO); + } + return (0); +} + +void +nvmf_sim_rescan_ns(struct nvmf_softc *sc, uint32_t id) +{ + union ccb *ccb; + + ccb = xpt_alloc_ccb_nowait(); + if (ccb == NULL) { + device_printf(sc->dev, + "unable to alloc CCB for rescan of namespace %u\n", id); + return; + } + + /* + * As with nvme_sim, map NVMe namespace IDs onto CAM unit + * LUNs. + */ + if (xpt_create_path(&ccb->ccb_h.path, NULL, cam_sim_path(sc->sim), 0, + id) != CAM_REQ_CMP) { + device_printf(sc->dev, + "Unable to create path for rescan of namespace %u\n", id); + xpt_free_ccb(ccb); + return; + } + xpt_rescan(ccb); +} + +void +nvmf_disconnect_sim(struct nvmf_softc *sc) +{ + mtx_lock(&sc->sim_mtx); + sc->sim_disconnected = true; + xpt_freeze_simq(sc->sim, 1); + mtx_unlock(&sc->sim_mtx); +} + +void +nvmf_reconnect_sim(struct nvmf_softc *sc) +{ + mtx_lock(&sc->sim_mtx); + sc->sim_disconnected = false; + mtx_unlock(&sc->sim_mtx); + xpt_release_simq(sc->sim, 1); +} + +void +nvmf_destroy_sim(struct nvmf_softc *sc) +{ + xpt_async(AC_LOST_DEVICE, sc->path, NULL); + if (sc->sim_disconnected) + xpt_release_simq(sc->sim, 1); + xpt_free_path(sc->path); + xpt_bus_deregister(cam_sim_path(sc->sim)); + cam_sim_free(sc->sim, TRUE); + mtx_destroy(&sc->sim_mtx); +} diff --git a/sys/dev/nvmf/host/nvmf_var.h b/sys/dev/nvmf/host/nvmf_var.h new file mode 100644 --- /dev/null +++ b/sys/dev/nvmf/host/nvmf_var.h @@ -0,0 +1,208 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2023-2024 Chelsio Communications, Inc. 
+ * Written by: John Baldwin + */ + +#ifndef __NVMF_VAR_H__ +#define __NVMF_VAR_H__ + +#include +#include +#include +#include +#include +#include +#include +#include + +struct nvmf_aer; +struct nvmf_capsule; +struct nvmf_host_qpair; +struct nvmf_namespace; + +typedef void nvmf_request_complete_t(void *, const struct nvme_completion *); + +struct nvmf_ivars { + struct nvmf_handoff_host *hh; + struct nvmf_handoff_qpair_params *io_params; + struct nvme_controller_data *cdata; +}; + +struct nvmf_softc { + device_t dev; + + struct nvmf_host_qpair *admin; + struct nvmf_host_qpair **io; + u_int num_io_queues; + enum nvmf_trtype trtype; + + struct cam_sim *sim; + struct cam_path *path; + struct mtx sim_mtx; + bool sim_disconnected; + + struct nvmf_namespace **ns; + + struct nvme_controller_data *cdata; + uint64_t cap; + uint32_t vs; + u_int max_pending_io; + u_long max_xfer_size; + + struct cdev *cdev; + + /* + * Keep Alive support depends on two timers. The 'tx' timer + * is responsible for sending KeepAlive commands and runs at + * half the timeout interval. The 'rx' timer is responsible + * for detecting an actual timeout. + * + * For efficient support of TKAS, the host does not reschedule + * these timers every time new commands are scheduled. + * Instead, the host sets the *_traffic flags when commands + * are sent and received. The timeout handlers check and + * clear these flags. This does mean it can take up to twice + * the timeout time to detect an AWOL controller. + */ + bool ka_traffic; /* Using TKAS? */ + + volatile int ka_active_tx_traffic; + struct callout ka_tx_timer; + sbintime_t ka_tx_sbt; + + volatile int ka_active_rx_traffic; + struct callout ka_rx_timer; + sbintime_t ka_rx_sbt; + + struct sx connection_lock; + struct task disconnect_task; + bool detaching; + + u_int num_aer; + struct nvmf_aer *aer; +}; + +struct nvmf_request { + struct nvmf_host_qpair *qp; + struct nvmf_capsule *nc; + nvmf_request_complete_t *cb; + void *cb_arg; + bool aer; + + STAILQ_ENTRY(nvmf_request) link; +}; + +struct nvmf_completion_status { + struct nvme_completion cqe; + bool done; + bool io_done; + int io_error; +}; + +static __inline struct nvmf_host_qpair * +nvmf_select_io_queue(struct nvmf_softc *sc) +{ + /* TODO: Support multiple queues? 
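+	 * All I/O is currently dispatched to the first I/O queue pair.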
*/ + return (sc->io[0]); +} + +static __inline bool +nvmf_cqe_aborted(const struct nvme_completion *cqe) +{ + uint16_t status; + + status = le16toh(cqe->status); + return (NVME_STATUS_GET_SCT(status) == NVME_SCT_PATH_RELATED && + NVME_STATUS_GET_SC(status) == NVME_SC_COMMAND_ABORTED_BY_HOST); +} + +static __inline void +nvmf_status_init(struct nvmf_completion_status *status) +{ + status->done = false; + status->io_done = true; + status->io_error = 0; +} + +static __inline void +nvmf_status_wait_io(struct nvmf_completion_status *status) +{ + status->io_done = false; +} + +#ifdef DRIVER_MODULE +extern driver_t nvme_nvmf_driver; +#endif + +#ifdef MALLOC_DECLARE +MALLOC_DECLARE(M_NVMF); +#endif + +/* nvmf.c */ +void nvmf_complete(void *arg, const struct nvme_completion *cqe); +void nvmf_io_complete(void *arg, size_t xfered, int error); +void nvmf_wait_for_reply(struct nvmf_completion_status *status); +int nvmf_init_ivars(struct nvmf_ivars *ivars, struct nvmf_handoff_host *hh); +void nvmf_free_ivars(struct nvmf_ivars *ivars); +void nvmf_disconnect(struct nvmf_softc *sc); +void nvmf_rescan_ns(struct nvmf_softc *sc, uint32_t nsid); +int nvmf_passthrough_cmd(struct nvmf_softc *sc, struct nvme_pt_command *pt, + bool admin); + +/* nvmf_aer.c */ +void nvmf_init_aer(struct nvmf_softc *sc); +int nvmf_start_aer(struct nvmf_softc *sc); +void nvmf_destroy_aer(struct nvmf_softc *sc); + +/* nvmf_cmd.c */ +bool nvmf_cmd_get_property(struct nvmf_softc *sc, uint32_t offset, + uint8_t size, nvmf_request_complete_t *cb, void *cb_arg, int how); +bool nvmf_cmd_set_property(struct nvmf_softc *sc, uint32_t offset, + uint8_t size, uint64_t value, nvmf_request_complete_t *cb, void *cb_arg, + int how); +bool nvmf_cmd_keep_alive(struct nvmf_softc *sc, nvmf_request_complete_t *cb, + void *cb_arg, int how); +bool nvmf_cmd_identify_active_namespaces(struct nvmf_softc *sc, uint32_t id, + struct nvme_ns_list *nslist, nvmf_request_complete_t *req_cb, + void *req_cb_arg, nvmf_io_complete_t *io_cb, void *io_cb_arg, int how); +bool nvmf_cmd_identify_namespace(struct nvmf_softc *sc, uint32_t id, + struct nvme_namespace_data *nsdata, nvmf_request_complete_t *req_cb, + void *req_cb_arg, nvmf_io_complete_t *io_cb, void *io_cb_arg, int how); +bool nvmf_cmd_get_log_page(struct nvmf_softc *sc, uint32_t nsid, uint8_t lid, + uint64_t offset, void *buf, size_t len, nvmf_request_complete_t *req_cb, + void *req_cb_arg, nvmf_io_complete_t *io_cb, void *io_cb_arg, int how); + +/* nvmf_ctldev.c */ +int nvmf_ctl_load(void); +void nvmf_ctl_unload(void); + +/* nvmf_ns.c */ +struct nvmf_namespace *nvmf_init_ns(struct nvmf_softc *sc, uint32_t id, + struct nvme_namespace_data *data); +void nvmf_disconnect_ns(struct nvmf_namespace *ns); +void nvmf_reconnect_ns(struct nvmf_namespace *ns); +void nvmf_destroy_ns(struct nvmf_namespace *ns); +bool nvmf_update_ns(struct nvmf_namespace *ns, + struct nvme_namespace_data *data); + +/* nvmf_qpair.c */ +struct nvmf_host_qpair *nvmf_init_qp(struct nvmf_softc *sc, + enum nvmf_trtype trtype, struct nvmf_handoff_qpair_params *handoff, + const char *name); +void nvmf_shutdown_qp(struct nvmf_host_qpair *qp); +void nvmf_destroy_qp(struct nvmf_host_qpair *qp); +struct nvmf_request *nvmf_allocate_request(struct nvmf_host_qpair *qp, + void *sqe, nvmf_request_complete_t *cb, void *cb_arg, int how); +void nvmf_submit_request(struct nvmf_request *req); +void nvmf_free_request(struct nvmf_request *req); + +/* nvmf_sim.c */ +int nvmf_init_sim(struct nvmf_softc *sc); +void nvmf_disconnect_sim(struct nvmf_softc *sc); +void 
nvmf_reconnect_sim(struct nvmf_softc *sc); +void nvmf_destroy_sim(struct nvmf_softc *sc); +void nvmf_sim_rescan_ns(struct nvmf_softc *sc, uint32_t id); + +#endif /* !__NVMF_VAR_H__ */
diff --git a/sys/modules/nvmf/Makefile b/sys/modules/nvmf/Makefile --- a/sys/modules/nvmf/Makefile +++ b/sys/modules/nvmf/Makefile @@ -1,4 +1,5 @@ -SUBDIR= nvmf_tcp \ +SUBDIR= nvmf \ + nvmf_tcp \ + nvmf_transport .include <bsd.subdir.mk>
diff --git a/sys/modules/nvmf/nvmf/Makefile b/sys/modules/nvmf/nvmf/Makefile new file mode 100644 --- /dev/null +++ b/sys/modules/nvmf/nvmf/Makefile @@ -0,0 +1,13 @@ +.PATH: ${SRCTOP}/sys/dev/nvmf/host + +KMOD= nvmf + +SRCS= nvmf.c \ + nvmf_aer.c \ + nvmf_cmd.c \ + nvmf_ctldev.c \ + nvmf_ns.c \ + nvmf_qpair.c \ + nvmf_sim.c + +.include <bsd.kmod.mk>
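To illustrate how the request and completion primitives declared in nvmf_var.h fit together, the following is a minimal, hypothetical sketch of a synchronous admin command submission. It is not part of the patch: the helper name is invented, the caller is assumed to have built the SQE, and the usual kernel headers plus nvmf_var.h are assumed to be included.

/*
 * Hypothetical example only (not part of this patch): issue a
 * caller-built admin SQE on sc->admin and sleep until the reply
 * capsule arrives.  Uses only the nvmf_var.h declarations above.
 */
static int
nvmf_example_admin_sync(struct nvmf_softc *sc, struct nvme_command *sqe)
{
	struct nvmf_completion_status status;
	struct nvmf_request *req;

	/* done = false, io_done = true: no data transfer is expected. */
	nvmf_status_init(&status);

	/* nvmf_complete() will record the CQE and wake up the sleeper. */
	req = nvmf_allocate_request(sc->admin, sqe, nvmf_complete, &status,
	    M_WAITOK);
	if (req == NULL)
		return (ECONNABORTED);	/* qpair torn down or allocation failed */

	nvmf_submit_request(req);
	nvmf_wait_for_reply(&status);

	/*
	 * The request itself is freed by the queue pair code once the
	 * completion callback has run; only the status needs checking.
	 */
	if (status.cqe.status != 0)
		return (EIO);
	return (0);
}

If the command carried a data buffer, the caller would additionally call nvmf_status_wait_io() before submitting and pass nvmf_io_complete as the I/O completion callback, so that nvmf_wait_for_reply() also waits for the transfer to finish.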