diff --git a/share/man/man4/nvmf.4 b/share/man/man4/nvmf.4
index 8afbb4d9daaf..298365acefa9 100644
--- a/share/man/man4/nvmf.4
+++ b/share/man/man4/nvmf.4
@@ -1,87 +1,109 @@
 .\"
 .\" SPDX-License-Identifier: BSD-2-Clause
 .\"
 .\" Copyright (c) 2024 Chelsio Communications, Inc.
 .\"
-.Dd May 2, 2024
+.Dd June 5, 2024
 .Dt NVMF 4
 .Os
 .Sh NAME
 .Nm nvmf
 .Nd "NVM Express over Fabrics host driver"
 .Sh SYNOPSIS
 To compile the driver into the kernel,
 place the following line in the kernel configuration file:
 .Bd -ragged -offset indent
 .Cd "device nvmf"
 .Ed
 .Pp
 Alternatively, to load the driver as a module at boot time,
 place the following line in
 .Xr loader.conf 5 :
 .Bd -literal -offset indent
 nvmf_load="YES"
 .Ed
 .Sh DESCRIPTION
 The
 .Nm
 driver provides the kernel component of an NVM Express over Fabrics
 host.
 The NVMeoF host is the client which provides local access to
 namespaces exported by a remote controller.
 .Pp
 Associations between the local host and remote controllers are managed
 using
 .Xr nvmecontrol 8 .
 New associations are created via the
 .Cm connect
 command and destroyed via the
 .Cm disconnect
 command.
 If an association's connection is interrupted,
 the
 .Cm reconnect
 command creates a new association to replace the interrupted
 association.
 .Pp
 Similar to
 .Xr nvme 4 ,
 .Nm
 creates controller device nodes using the format
 .Pa /dev/nvmeX
 and namespace device nodes using the format
 .Pa /dev/nvmeXnsY .
 .Nm
 also exports remote namespaces via the CAM
 .Xr nda 4
 peripheral driver.
 Unlike
 .Xr nvme 4 ,
 .Nm
 does not support the
 .Xr nvd 4
 disk driver.
 .Pp
 Associations require a supported transport such as
 .Xr nvmf_tcp 4
 for associations using TCP/IP.
+.Sh SYSCTL VARIABLES
+The following variables are available as both
+.Xr sysctl 8
+variables and
+.Xr loader 8
+tunables:
+.Bl -tag -width indent
+.It Va kern.nvmf.fail_on_disconnection
+Determines the behavior when an association's connection is interrupted.
+By default, input/output operations are suspended while a host is disconnected.
+This includes operations pending at the time the association's connection was
+interrupted as well as new requests submitted while the host is disconnected.
+Once a new association is established, suspended I/O requests are retried.
+When set to 1, input/output operations fail with
+.Er EIO
+while a host is disconnected and
+.Xr nda 4
+peripherals are destroyed after the first failed I/O request.
+Note that any destroyed
+.Xr nda 4
+peripherals will be recreated after a new association is established.
+.El
 .Sh SEE ALSO
 .Xr nda 4 ,
 .Xr nvme 4 ,
 .Xr nvmf_tcp 4 ,
 .Xr nvmft 4 ,
 .Xr nvmecontrol 8
 .Sh HISTORY
 The
 .Nm
 module first appeared in
 .Fx 15.0 .
 .Sh AUTHORS
 The
 .Nm
 driver was developed by
 .An John Baldwin Aq Mt jhb@FreeBSD.org
 under sponsorship from Chelsio Communications, Inc.
 .Sh BUGS
 .Nm
 only supports a single I/O queue pair per association.
diff --git a/sys/dev/nvmf/host/nvmf.c b/sys/dev/nvmf/host/nvmf.c
index 9684170c1de9..c309836ed8a8 100644
--- a/sys/dev/nvmf/host/nvmf.c
+++ b/sys/dev/nvmf/host/nvmf.c
@@ -1,1017 +1,1022 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause
  *
  * Copyright (c) 2023-2024 Chelsio Communications, Inc.
* Written by: John Baldwin */ #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include static struct cdevsw nvmf_cdevsw; +bool nvmf_fail_disconnect = false; +SYSCTL_BOOL(_kern_nvmf, OID_AUTO, fail_on_disconnection, CTLFLAG_RWTUN, + &nvmf_fail_disconnect, 0, "Fail I/O requests on connection failure"); + MALLOC_DEFINE(M_NVMF, "nvmf", "NVMe over Fabrics host"); static void nvmf_disconnect_task(void *arg, int pending); void nvmf_complete(void *arg, const struct nvme_completion *cqe) { struct nvmf_completion_status *status = arg; struct mtx *mtx; status->cqe = *cqe; mtx = mtx_pool_find(mtxpool_sleep, status); mtx_lock(mtx); status->done = true; mtx_unlock(mtx); wakeup(status); } void nvmf_io_complete(void *arg, size_t xfered, int error) { struct nvmf_completion_status *status = arg; struct mtx *mtx; status->io_error = error; mtx = mtx_pool_find(mtxpool_sleep, status); mtx_lock(mtx); status->io_done = true; mtx_unlock(mtx); wakeup(status); } void nvmf_wait_for_reply(struct nvmf_completion_status *status) { struct mtx *mtx; mtx = mtx_pool_find(mtxpool_sleep, status); mtx_lock(mtx); while (!status->done || !status->io_done) mtx_sleep(status, mtx, 0, "nvmfcmd", 0); mtx_unlock(mtx); } static int nvmf_read_property(struct nvmf_softc *sc, uint32_t offset, uint8_t size, uint64_t *value) { const struct nvmf_fabric_prop_get_rsp *rsp; struct nvmf_completion_status status; nvmf_status_init(&status); if (!nvmf_cmd_get_property(sc, offset, size, nvmf_complete, &status, M_WAITOK)) return (ECONNABORTED); nvmf_wait_for_reply(&status); if (status.cqe.status != 0) { device_printf(sc->dev, "PROPERTY_GET failed, status %#x\n", le16toh(status.cqe.status)); return (EIO); } rsp = (const struct nvmf_fabric_prop_get_rsp *)&status.cqe; if (size == 8) *value = le64toh(rsp->value.u64); else *value = le32toh(rsp->value.u32.low); return (0); } static int nvmf_write_property(struct nvmf_softc *sc, uint32_t offset, uint8_t size, uint64_t value) { struct nvmf_completion_status status; nvmf_status_init(&status); if (!nvmf_cmd_set_property(sc, offset, size, value, nvmf_complete, &status, M_WAITOK)) return (ECONNABORTED); nvmf_wait_for_reply(&status); if (status.cqe.status != 0) { device_printf(sc->dev, "PROPERTY_SET failed, status %#x\n", le16toh(status.cqe.status)); return (EIO); } return (0); } static void nvmf_shutdown_controller(struct nvmf_softc *sc) { uint64_t cc; int error; error = nvmf_read_property(sc, NVMF_PROP_CC, 4, &cc); if (error != 0) { device_printf(sc->dev, "Failed to fetch CC for shutdown\n"); return; } cc |= NVMEF(NVME_CC_REG_SHN, NVME_SHN_NORMAL); error = nvmf_write_property(sc, NVMF_PROP_CC, 4, cc); if (error != 0) device_printf(sc->dev, "Failed to set CC to trigger shutdown\n"); } static void nvmf_check_keep_alive(void *arg) { struct nvmf_softc *sc = arg; int traffic; traffic = atomic_readandclear_int(&sc->ka_active_rx_traffic); if (traffic == 0) { device_printf(sc->dev, "disconnecting due to KeepAlive timeout\n"); nvmf_disconnect(sc); return; } callout_schedule_sbt(&sc->ka_rx_timer, sc->ka_rx_sbt, 0, C_HARDCLOCK); } static void nvmf_keep_alive_complete(void *arg, const struct nvme_completion *cqe) { struct nvmf_softc *sc = arg; atomic_store_int(&sc->ka_active_rx_traffic, 1); if (cqe->status != 0) { device_printf(sc->dev, "KeepAlive response reported status %#x\n", le16toh(cqe->status)); } } static void nvmf_send_keep_alive(void *arg) { struct nvmf_softc *sc = arg; int traffic; /* * Don't bother sending a KeepAlive 
command if TKAS is active * and another command has been sent during the interval. */ traffic = atomic_load_int(&sc->ka_active_tx_traffic); if (traffic == 0 && !nvmf_cmd_keep_alive(sc, nvmf_keep_alive_complete, sc, M_NOWAIT)) device_printf(sc->dev, "Failed to allocate KeepAlive command\n"); /* Clear ka_active_tx_traffic after sending the keep alive command. */ atomic_store_int(&sc->ka_active_tx_traffic, 0); callout_schedule_sbt(&sc->ka_tx_timer, sc->ka_tx_sbt, 0, C_HARDCLOCK); } int nvmf_init_ivars(struct nvmf_ivars *ivars, struct nvmf_handoff_host *hh) { size_t len; u_int i; int error; memset(ivars, 0, sizeof(*ivars)); if (!hh->admin.admin || hh->num_io_queues < 1) return (EINVAL); ivars->cdata = malloc(sizeof(*ivars->cdata), M_NVMF, M_WAITOK); error = copyin(hh->cdata, ivars->cdata, sizeof(*ivars->cdata)); if (error != 0) goto out; nvme_controller_data_swapbytes(ivars->cdata); len = hh->num_io_queues * sizeof(*ivars->io_params); ivars->io_params = malloc(len, M_NVMF, M_WAITOK); error = copyin(hh->io, ivars->io_params, len); if (error != 0) goto out; for (i = 0; i < hh->num_io_queues; i++) { if (ivars->io_params[i].admin) { error = EINVAL; goto out; } /* Require all I/O queues to be the same size. */ if (ivars->io_params[i].qsize != ivars->io_params[0].qsize) { error = EINVAL; goto out; } } ivars->hh = hh; return (0); out: free(ivars->io_params, M_NVMF); free(ivars->cdata, M_NVMF); return (error); } void nvmf_free_ivars(struct nvmf_ivars *ivars) { free(ivars->io_params, M_NVMF); free(ivars->cdata, M_NVMF); } static int nvmf_probe(device_t dev) { struct nvmf_ivars *ivars = device_get_ivars(dev); char desc[260]; if (ivars == NULL) return (ENXIO); snprintf(desc, sizeof(desc), "Fabrics: %.256s", ivars->cdata->subnqn); device_set_desc_copy(dev, desc); return (BUS_PROBE_DEFAULT); } static int nvmf_establish_connection(struct nvmf_softc *sc, struct nvmf_ivars *ivars) { char name[16]; /* Setup the admin queue. */ sc->admin = nvmf_init_qp(sc, ivars->hh->trtype, &ivars->hh->admin, "admin queue"); if (sc->admin == NULL) { device_printf(sc->dev, "Failed to setup admin queue\n"); return (ENXIO); } /* Setup I/O queues. */ sc->io = malloc(ivars->hh->num_io_queues * sizeof(*sc->io), M_NVMF, M_WAITOK | M_ZERO); sc->num_io_queues = ivars->hh->num_io_queues; for (u_int i = 0; i < sc->num_io_queues; i++) { snprintf(name, sizeof(name), "I/O queue %u", i); sc->io[i] = nvmf_init_qp(sc, ivars->hh->trtype, &ivars->io_params[i], name); if (sc->io[i] == NULL) { device_printf(sc->dev, "Failed to setup I/O queue %u\n", i + 1); return (ENXIO); } } /* Start KeepAlive timers. 
*/ if (ivars->hh->kato != 0) { sc->ka_traffic = NVMEV(NVME_CTRLR_DATA_CTRATT_TBKAS, sc->cdata->ctratt) != 0; sc->ka_rx_sbt = mstosbt(ivars->hh->kato); sc->ka_tx_sbt = sc->ka_rx_sbt / 2; callout_reset_sbt(&sc->ka_rx_timer, sc->ka_rx_sbt, 0, nvmf_check_keep_alive, sc, C_HARDCLOCK); callout_reset_sbt(&sc->ka_tx_timer, sc->ka_tx_sbt, 0, nvmf_send_keep_alive, sc, C_HARDCLOCK); } return (0); } typedef bool nvmf_scan_active_ns_cb(struct nvmf_softc *, uint32_t, const struct nvme_namespace_data *, void *); static bool nvmf_scan_active_nslist(struct nvmf_softc *sc, struct nvme_ns_list *nslist, struct nvme_namespace_data *data, uint32_t *nsidp, nvmf_scan_active_ns_cb *cb, void *cb_arg) { struct nvmf_completion_status status; uint32_t nsid; nvmf_status_init(&status); nvmf_status_wait_io(&status); if (!nvmf_cmd_identify_active_namespaces(sc, *nsidp, nslist, nvmf_complete, &status, nvmf_io_complete, &status, M_WAITOK)) { device_printf(sc->dev, "failed to send IDENTIFY active namespaces command\n"); return (false); } nvmf_wait_for_reply(&status); if (status.cqe.status != 0) { device_printf(sc->dev, "IDENTIFY active namespaces failed, status %#x\n", le16toh(status.cqe.status)); return (false); } if (status.io_error != 0) { device_printf(sc->dev, "IDENTIFY active namespaces failed with I/O error %d\n", status.io_error); return (false); } for (u_int i = 0; i < nitems(nslist->ns); i++) { nsid = nslist->ns[i]; if (nsid == 0) { *nsidp = 0; return (true); } nvmf_status_init(&status); nvmf_status_wait_io(&status); if (!nvmf_cmd_identify_namespace(sc, nsid, data, nvmf_complete, &status, nvmf_io_complete, &status, M_WAITOK)) { device_printf(sc->dev, "failed to send IDENTIFY namespace %u command\n", nsid); return (false); } nvmf_wait_for_reply(&status); if (status.cqe.status != 0) { device_printf(sc->dev, "IDENTIFY namespace %u failed, status %#x\n", nsid, le16toh(status.cqe.status)); return (false); } if (status.io_error != 0) { device_printf(sc->dev, "IDENTIFY namespace %u failed with I/O error %d\n", nsid, status.io_error); return (false); } nvme_namespace_data_swapbytes(data); if (!cb(sc, nsid, data, cb_arg)) return (false); } MPASS(nsid == nslist->ns[nitems(nslist->ns) - 1] && nsid != 0); if (nsid >= 0xfffffffd) *nsidp = 0; else *nsidp = nsid + 1; return (true); } static bool nvmf_scan_active_namespaces(struct nvmf_softc *sc, nvmf_scan_active_ns_cb *cb, void *cb_arg) { struct nvme_namespace_data *data; struct nvme_ns_list *nslist; uint32_t nsid; bool retval; nslist = malloc(sizeof(*nslist), M_NVMF, M_WAITOK); data = malloc(sizeof(*data), M_NVMF, M_WAITOK); nsid = 0; retval = true; for (;;) { if (!nvmf_scan_active_nslist(sc, nslist, data, &nsid, cb, cb_arg)) { retval = false; break; } if (nsid == 0) break; } free(data, M_NVMF); free(nslist, M_NVMF); return (retval); } static bool nvmf_add_ns(struct nvmf_softc *sc, uint32_t nsid, const struct nvme_namespace_data *data, void *arg __unused) { if (sc->ns[nsid - 1] != NULL) { device_printf(sc->dev, "duplicate namespace %u in active namespace list\n", nsid); return (false); } /* * As in nvme_ns_construct, a size of zero indicates an * invalid namespace. 
*/ if (data->nsze == 0) { device_printf(sc->dev, "ignoring active namespace %u with zero size\n", nsid); return (true); } sc->ns[nsid - 1] = nvmf_init_ns(sc, nsid, data); nvmf_sim_rescan_ns(sc, nsid); return (true); } static bool nvmf_add_namespaces(struct nvmf_softc *sc) { sc->ns = mallocarray(sc->cdata->nn, sizeof(*sc->ns), M_NVMF, M_WAITOK | M_ZERO); return (nvmf_scan_active_namespaces(sc, nvmf_add_ns, NULL)); } static int nvmf_attach(device_t dev) { struct make_dev_args mda; struct nvmf_softc *sc = device_get_softc(dev); struct nvmf_ivars *ivars = device_get_ivars(dev); uint64_t val; u_int i; int error; if (ivars == NULL) return (ENXIO); sc->dev = dev; sc->trtype = ivars->hh->trtype; callout_init(&sc->ka_rx_timer, 1); callout_init(&sc->ka_tx_timer, 1); sx_init(&sc->connection_lock, "nvmf connection"); TASK_INIT(&sc->disconnect_task, 0, nvmf_disconnect_task, sc); /* Claim the cdata pointer from ivars. */ sc->cdata = ivars->cdata; ivars->cdata = NULL; nvmf_init_aer(sc); /* TODO: Multiqueue support. */ sc->max_pending_io = ivars->io_params[0].qsize /* * sc->num_io_queues */; error = nvmf_establish_connection(sc, ivars); if (error != 0) goto out; error = nvmf_read_property(sc, NVMF_PROP_CAP, 8, &sc->cap); if (error != 0) { device_printf(sc->dev, "Failed to fetch CAP\n"); error = ENXIO; goto out; } error = nvmf_read_property(sc, NVMF_PROP_VS, 4, &val); if (error != 0) { device_printf(sc->dev, "Failed to fetch VS\n"); error = ENXIO; goto out; } sc->vs = val; /* Honor MDTS if it is set. */ sc->max_xfer_size = maxphys; if (sc->cdata->mdts != 0) { sc->max_xfer_size = ulmin(sc->max_xfer_size, 1 << (sc->cdata->mdts + NVME_MPS_SHIFT + NVME_CAP_HI_MPSMIN(sc->cap >> 32))); } error = nvmf_init_sim(sc); if (error != 0) goto out; error = nvmf_start_aer(sc); if (error != 0) { nvmf_destroy_sim(sc); goto out; } if (!nvmf_add_namespaces(sc)) { nvmf_destroy_sim(sc); goto out; } make_dev_args_init(&mda); mda.mda_devsw = &nvmf_cdevsw; mda.mda_uid = UID_ROOT; mda.mda_gid = GID_WHEEL; mda.mda_mode = 0600; mda.mda_si_drv1 = sc; error = make_dev_s(&mda, &sc->cdev, "%s", device_get_nameunit(dev)); if (error != 0) { nvmf_destroy_sim(sc); goto out; } return (0); out: if (sc->ns != NULL) { for (i = 0; i < sc->cdata->nn; i++) { if (sc->ns[i] != NULL) nvmf_destroy_ns(sc->ns[i]); } free(sc->ns, M_NVMF); } callout_drain(&sc->ka_tx_timer); callout_drain(&sc->ka_rx_timer); if (sc->admin != NULL) nvmf_shutdown_controller(sc); for (i = 0; i < sc->num_io_queues; i++) { if (sc->io[i] != NULL) nvmf_destroy_qp(sc->io[i]); } free(sc->io, M_NVMF); if (sc->admin != NULL) nvmf_destroy_qp(sc->admin); nvmf_destroy_aer(sc); taskqueue_drain(taskqueue_thread, &sc->disconnect_task); sx_destroy(&sc->connection_lock); free(sc->cdata, M_NVMF); return (error); } void nvmf_disconnect(struct nvmf_softc *sc) { taskqueue_enqueue(taskqueue_thread, &sc->disconnect_task); } static void nvmf_disconnect_task(void *arg, int pending __unused) { struct nvmf_softc *sc = arg; u_int i; sx_xlock(&sc->connection_lock); if (sc->admin == NULL) { /* * Ignore transport errors if there is no active * association. */ sx_xunlock(&sc->connection_lock); return; } if (sc->detaching) { if (sc->admin != NULL) { /* * This unsticks the detach process if a * transport error occurs during detach. */ nvmf_shutdown_qp(sc->admin); } sx_xunlock(&sc->connection_lock); return; } if (sc->cdev == NULL) { /* * Transport error occurred during attach (nvmf_add_namespaces). * Shutdown the admin queue. 
*/ nvmf_shutdown_qp(sc->admin); sx_xunlock(&sc->connection_lock); return; } callout_drain(&sc->ka_tx_timer); callout_drain(&sc->ka_rx_timer); sc->ka_traffic = false; /* Quiesce namespace consumers. */ nvmf_disconnect_sim(sc); for (i = 0; i < sc->cdata->nn; i++) { if (sc->ns[i] != NULL) nvmf_disconnect_ns(sc->ns[i]); } /* Shutdown the existing qpairs. */ for (i = 0; i < sc->num_io_queues; i++) { nvmf_destroy_qp(sc->io[i]); } free(sc->io, M_NVMF); sc->io = NULL; sc->num_io_queues = 0; nvmf_destroy_qp(sc->admin); sc->admin = NULL; sx_xunlock(&sc->connection_lock); } static int nvmf_reconnect_host(struct nvmf_softc *sc, struct nvmf_handoff_host *hh) { struct nvmf_ivars ivars; u_int i; int error; /* XXX: Should we permit changing the transport type? */ if (sc->trtype != hh->trtype) { device_printf(sc->dev, "transport type mismatch on reconnect\n"); return (EINVAL); } error = nvmf_init_ivars(&ivars, hh); if (error != 0) return (error); sx_xlock(&sc->connection_lock); if (sc->admin != NULL || sc->detaching) { error = EBUSY; goto out; } /* * Ensure this is for the same controller. Note that the * controller ID can vary across associations if the remote * system is using the dynamic controller model. This merely * ensures the new association is connected to the same NVMe * subsystem. */ if (memcmp(sc->cdata->subnqn, ivars.cdata->subnqn, sizeof(ivars.cdata->subnqn)) != 0) { device_printf(sc->dev, "controller subsystem NQN mismatch on reconnect\n"); error = EINVAL; goto out; } /* * XXX: Require same number and size of I/O queues so that * max_pending_io is still correct? */ error = nvmf_establish_connection(sc, &ivars); if (error != 0) goto out; error = nvmf_start_aer(sc); if (error != 0) goto out; device_printf(sc->dev, "established new association with %u I/O queues\n", sc->num_io_queues); /* Restart namespace consumers. */ for (i = 0; i < sc->cdata->nn; i++) { if (sc->ns[i] != NULL) nvmf_reconnect_ns(sc->ns[i]); } nvmf_reconnect_sim(sc); nvmf_rescan_all_ns(sc); out: sx_xunlock(&sc->connection_lock); nvmf_free_ivars(&ivars); return (error); } static int nvmf_detach(device_t dev) { struct nvmf_softc *sc = device_get_softc(dev); u_int i; destroy_dev(sc->cdev); sx_xlock(&sc->connection_lock); sc->detaching = true; sx_xunlock(&sc->connection_lock); nvmf_destroy_sim(sc); for (i = 0; i < sc->cdata->nn; i++) { if (sc->ns[i] != NULL) nvmf_destroy_ns(sc->ns[i]); } free(sc->ns, M_NVMF); callout_drain(&sc->ka_tx_timer); callout_drain(&sc->ka_rx_timer); if (sc->admin != NULL) nvmf_shutdown_controller(sc); for (i = 0; i < sc->num_io_queues; i++) { nvmf_destroy_qp(sc->io[i]); } free(sc->io, M_NVMF); taskqueue_drain(taskqueue_thread, &sc->disconnect_task); if (sc->admin != NULL) nvmf_destroy_qp(sc->admin); nvmf_destroy_aer(sc); sx_destroy(&sc->connection_lock); free(sc->cdata, M_NVMF); return (0); } static void nvmf_rescan_ns_1(struct nvmf_softc *sc, uint32_t nsid, const struct nvme_namespace_data *data) { struct nvmf_namespace *ns; /* XXX: Needs locking around sc->ns[]. 
*/ ns = sc->ns[nsid - 1]; if (data->nsze == 0) { /* XXX: Needs locking */ if (ns != NULL) { nvmf_destroy_ns(ns); sc->ns[nsid - 1] = NULL; } } else { /* XXX: Needs locking */ if (ns == NULL) { sc->ns[nsid - 1] = nvmf_init_ns(sc, nsid, data); } else { if (!nvmf_update_ns(ns, data)) { nvmf_destroy_ns(ns); sc->ns[nsid - 1] = NULL; } } } nvmf_sim_rescan_ns(sc, nsid); } void nvmf_rescan_ns(struct nvmf_softc *sc, uint32_t nsid) { struct nvmf_completion_status status; struct nvme_namespace_data *data; data = malloc(sizeof(*data), M_NVMF, M_WAITOK); nvmf_status_init(&status); nvmf_status_wait_io(&status); if (!nvmf_cmd_identify_namespace(sc, nsid, data, nvmf_complete, &status, nvmf_io_complete, &status, M_WAITOK)) { device_printf(sc->dev, "failed to send IDENTIFY namespace %u command\n", nsid); free(data, M_NVMF); return; } nvmf_wait_for_reply(&status); if (status.cqe.status != 0) { device_printf(sc->dev, "IDENTIFY namespace %u failed, status %#x\n", nsid, le16toh(status.cqe.status)); free(data, M_NVMF); return; } if (status.io_error != 0) { device_printf(sc->dev, "IDENTIFY namespace %u failed with I/O error %d\n", nsid, status.io_error); free(data, M_NVMF); return; } nvme_namespace_data_swapbytes(data); nvmf_rescan_ns_1(sc, nsid, data); free(data, M_NVMF); } static void nvmf_purge_namespaces(struct nvmf_softc *sc, uint32_t first_nsid, uint32_t next_valid_nsid) { struct nvmf_namespace *ns; for (uint32_t nsid = first_nsid; nsid < next_valid_nsid; nsid++) { /* XXX: Needs locking around sc->ns[]. */ ns = sc->ns[nsid - 1]; if (ns != NULL) { nvmf_destroy_ns(ns); sc->ns[nsid - 1] = NULL; nvmf_sim_rescan_ns(sc, nsid); } } } static bool nvmf_rescan_ns_cb(struct nvmf_softc *sc, uint32_t nsid, const struct nvme_namespace_data *data, void *arg) { uint32_t *last_nsid = arg; /* Check for any gaps prior to this namespace. */ nvmf_purge_namespaces(sc, *last_nsid + 1, nsid); *last_nsid = nsid; nvmf_rescan_ns_1(sc, nsid, data); return (true); } void nvmf_rescan_all_ns(struct nvmf_softc *sc) { uint32_t last_nsid; last_nsid = 0; if (!nvmf_scan_active_namespaces(sc, nvmf_rescan_ns_cb, &last_nsid)) return; /* * Check for any namespace devices after the last active * namespace. */ nvmf_purge_namespaces(sc, last_nsid + 1, sc->cdata->nn + 1); } int nvmf_passthrough_cmd(struct nvmf_softc *sc, struct nvme_pt_command *pt, bool admin) { struct nvmf_completion_status status; struct nvme_command cmd; struct memdesc mem; struct nvmf_host_qpair *qp; struct nvmf_request *req; void *buf; int error; if (pt->len > sc->max_xfer_size) return (EINVAL); buf = NULL; if (pt->len != 0) { /* * XXX: Depending on the size we may want to pin the * user pages and use a memdesc with vm_page_t's * instead. */ buf = malloc(pt->len, M_NVMF, M_WAITOK); if (pt->is_read == 0) { error = copyin(pt->buf, buf, pt->len); if (error != 0) { free(buf, M_NVMF); return (error); } } else { /* Ensure no kernel data is leaked to userland. 
*/ memset(buf, 0, pt->len); } } memset(&cmd, 0, sizeof(cmd)); cmd.opc = pt->cmd.opc; cmd.fuse = pt->cmd.fuse; cmd.nsid = pt->cmd.nsid; cmd.cdw10 = pt->cmd.cdw10; cmd.cdw11 = pt->cmd.cdw11; cmd.cdw12 = pt->cmd.cdw12; cmd.cdw13 = pt->cmd.cdw13; cmd.cdw14 = pt->cmd.cdw14; cmd.cdw15 = pt->cmd.cdw15; if (admin) qp = sc->admin; else qp = nvmf_select_io_queue(sc); nvmf_status_init(&status); req = nvmf_allocate_request(qp, &cmd, nvmf_complete, &status, M_WAITOK); if (req == NULL) { device_printf(sc->dev, "failed to send passthrough command\n"); error = ECONNABORTED; goto error; } if (pt->len != 0) { mem = memdesc_vaddr(buf, pt->len); nvmf_capsule_append_data(req->nc, &mem, pt->len, pt->is_read == 0, nvmf_io_complete, &status); nvmf_status_wait_io(&status); } nvmf_submit_request(req); nvmf_wait_for_reply(&status); memset(&pt->cpl, 0, sizeof(pt->cpl)); pt->cpl.cdw0 = status.cqe.cdw0; pt->cpl.status = status.cqe.status; error = status.io_error; if (error == 0 && pt->len != 0 && pt->is_read != 0) error = copyout(buf, pt->buf, pt->len); error: free(buf, M_NVMF); return (error); } static int nvmf_ioctl(struct cdev *cdev, u_long cmd, caddr_t arg, int flag, struct thread *td) { struct nvmf_softc *sc = cdev->si_drv1; struct nvme_get_nsid *gnsid; struct nvme_pt_command *pt; struct nvmf_reconnect_params *rp; struct nvmf_handoff_host *hh; switch (cmd) { case NVME_PASSTHROUGH_CMD: pt = (struct nvme_pt_command *)arg; return (nvmf_passthrough_cmd(sc, pt, true)); case NVME_GET_NSID: gnsid = (struct nvme_get_nsid *)arg; strlcpy(gnsid->cdev, device_get_nameunit(sc->dev), sizeof(gnsid->cdev)); gnsid->nsid = 0; return (0); case NVME_GET_MAX_XFER_SIZE: *(uint64_t *)arg = sc->max_xfer_size; return (0); case NVMF_RECONNECT_PARAMS: rp = (struct nvmf_reconnect_params *)arg; if ((sc->cdata->fcatt & 1) == 0) rp->cntlid = NVMF_CNTLID_DYNAMIC; else rp->cntlid = sc->cdata->ctrlr_id; memcpy(rp->subnqn, sc->cdata->subnqn, sizeof(rp->subnqn)); return (0); case NVMF_RECONNECT_HOST: hh = (struct nvmf_handoff_host *)arg; return (nvmf_reconnect_host(sc, hh)); default: return (ENOTTY); } } static struct cdevsw nvmf_cdevsw = { .d_version = D_VERSION, .d_ioctl = nvmf_ioctl }; static int nvmf_modevent(module_t mod, int what, void *arg) { switch (what) { case MOD_LOAD: return (nvmf_ctl_load()); case MOD_QUIESCE: return (0); case MOD_UNLOAD: nvmf_ctl_unload(); destroy_dev_drain(&nvmf_cdevsw); return (0); default: return (EOPNOTSUPP); } } static device_method_t nvmf_methods[] = { /* Device interface */ DEVMETHOD(device_probe, nvmf_probe), DEVMETHOD(device_attach, nvmf_attach), DEVMETHOD(device_detach, nvmf_detach), #if 0 DEVMETHOD(device_shutdown, nvmf_shutdown), #endif DEVMETHOD_END }; driver_t nvme_nvmf_driver = { "nvme", nvmf_methods, sizeof(struct nvmf_softc), }; DRIVER_MODULE(nvme, root, nvme_nvmf_driver, nvmf_modevent, NULL); MODULE_DEPEND(nvmf, nvmf_transport, 1, 1, 1); diff --git a/sys/dev/nvmf/host/nvmf_ns.c b/sys/dev/nvmf/host/nvmf_ns.c index 0727ca960a57..8381cc4aec54 100644 --- a/sys/dev/nvmf/host/nvmf_ns.c +++ b/sys/dev/nvmf/host/nvmf_ns.c @@ -1,487 +1,502 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2023-2024 Chelsio Communications, Inc. 
* Written by: John Baldwin */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include struct nvmf_namespace { struct nvmf_softc *sc; uint64_t size; uint32_t id; u_int flags; uint32_t lba_size; bool disconnected; TAILQ_HEAD(, bio) pending_bios; struct mtx lock; volatile u_int active_bios; struct cdev *cdev; }; static void nvmf_ns_strategy(struct bio *bio); static void ns_printf(struct nvmf_namespace *ns, const char *fmt, ...) { char buf[128]; struct sbuf sb; va_list ap; sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN); sbuf_set_drain(&sb, sbuf_printf_drain, NULL); sbuf_printf(&sb, "%sn%u: ", device_get_nameunit(ns->sc->dev), ns->id); va_start(ap, fmt); sbuf_vprintf(&sb, fmt, ap); va_end(ap); sbuf_finish(&sb); sbuf_delete(&sb); } /* * The I/O completion may trigger after the received CQE if the I/O * used a zero-copy mbuf that isn't harvested until after the NIC * driver processes TX completions. Abuse bio_driver1 as a refcount. * Store I/O errors in bio_driver2. */ static __inline u_int * bio_refs(struct bio *bio) { return ((u_int *)&bio->bio_driver1); } static void nvmf_ns_biodone(struct bio *bio) { struct nvmf_namespace *ns; int error; if (!refcount_release(bio_refs(bio))) return; ns = bio->bio_dev->si_drv1; /* If a request is aborted, resubmit or queue it for resubmission. */ - if (bio->bio_error == ECONNABORTED) { + if (bio->bio_error == ECONNABORTED && !nvmf_fail_disconnect) { bio->bio_error = 0; bio->bio_driver2 = 0; mtx_lock(&ns->lock); if (ns->disconnected) { - TAILQ_INSERT_TAIL(&ns->pending_bios, bio, bio_queue); - mtx_unlock(&ns->lock); + if (nvmf_fail_disconnect) { + mtx_unlock(&ns->lock); + bio->bio_error = ECONNABORTED; + bio->bio_flags |= BIO_ERROR; + bio->bio_resid = bio->bio_bcount; + biodone(bio); + } else { + TAILQ_INSERT_TAIL(&ns->pending_bios, bio, + bio_queue); + mtx_unlock(&ns->lock); + } } else { mtx_unlock(&ns->lock); nvmf_ns_strategy(bio); } } else { /* * I/O errors take precedence over generic EIO from * CQE errors. 
*/ error = (intptr_t)bio->bio_driver2; if (error != 0) bio->bio_error = error; if (bio->bio_error != 0) bio->bio_flags |= BIO_ERROR; biodone(bio); } if (refcount_release(&ns->active_bios)) wakeup(ns); } static void nvmf_ns_io_complete(void *arg, size_t xfered, int error) { struct bio *bio = arg; KASSERT(xfered <= bio->bio_bcount, ("%s: xfered > bio_bcount", __func__)); bio->bio_driver2 = (void *)(intptr_t)error; bio->bio_resid = bio->bio_bcount - xfered; nvmf_ns_biodone(bio); } static void nvmf_ns_delete_complete(void *arg, size_t xfered, int error) { struct bio *bio = arg; if (error != 0) bio->bio_resid = bio->bio_bcount; else bio->bio_resid = 0; free(bio->bio_driver2, M_NVMF); bio->bio_driver2 = (void *)(intptr_t)error; nvmf_ns_biodone(bio); } static void nvmf_ns_bio_complete(void *arg, const struct nvme_completion *cqe) { struct bio *bio = arg; if (nvmf_cqe_aborted(cqe)) bio->bio_error = ECONNABORTED; else if (cqe->status != 0) bio->bio_error = EIO; nvmf_ns_biodone(bio); } static int nvmf_ns_submit_bio(struct nvmf_namespace *ns, struct bio *bio) { struct nvme_command cmd; struct nvmf_request *req; struct nvme_dsm_range *dsm_range; struct memdesc mem; uint64_t lba, lba_count; + int error; dsm_range = NULL; memset(&cmd, 0, sizeof(cmd)); switch (bio->bio_cmd) { case BIO_READ: lba = bio->bio_offset / ns->lba_size; lba_count = bio->bio_bcount / ns->lba_size; nvme_ns_read_cmd(&cmd, ns->id, lba, lba_count); break; case BIO_WRITE: lba = bio->bio_offset / ns->lba_size; lba_count = bio->bio_bcount / ns->lba_size; nvme_ns_write_cmd(&cmd, ns->id, lba, lba_count); break; case BIO_FLUSH: nvme_ns_flush_cmd(&cmd, ns->id); break; case BIO_DELETE: dsm_range = malloc(sizeof(*dsm_range), M_NVMF, M_NOWAIT | M_ZERO); if (dsm_range == NULL) return (ENOMEM); lba = bio->bio_offset / ns->lba_size; lba_count = bio->bio_bcount / ns->lba_size; dsm_range->starting_lba = htole64(lba); dsm_range->length = htole32(lba_count); cmd.opc = NVME_OPC_DATASET_MANAGEMENT; cmd.nsid = htole32(ns->id); cmd.cdw10 = htole32(0); /* 1 range */ cmd.cdw11 = htole32(NVME_DSM_ATTR_DEALLOCATE); break; default: return (EOPNOTSUPP); } mtx_lock(&ns->lock); if (ns->disconnected) { - TAILQ_INSERT_TAIL(&ns->pending_bios, bio, bio_queue); + if (nvmf_fail_disconnect) { + error = ECONNABORTED; + } else { + TAILQ_INSERT_TAIL(&ns->pending_bios, bio, bio_queue); + error = 0; + } mtx_unlock(&ns->lock); free(dsm_range, M_NVMF); - return (0); + return (error); } req = nvmf_allocate_request(nvmf_select_io_queue(ns->sc), &cmd, nvmf_ns_bio_complete, bio, M_NOWAIT); if (req == NULL) { mtx_unlock(&ns->lock); free(dsm_range, M_NVMF); return (ENOMEM); } switch (bio->bio_cmd) { case BIO_READ: case BIO_WRITE: refcount_init(bio_refs(bio), 2); mem = memdesc_bio(bio); nvmf_capsule_append_data(req->nc, &mem, bio->bio_bcount, bio->bio_cmd == BIO_WRITE, nvmf_ns_io_complete, bio); break; case BIO_DELETE: refcount_init(bio_refs(bio), 2); mem = memdesc_vaddr(dsm_range, sizeof(*dsm_range)); nvmf_capsule_append_data(req->nc, &mem, sizeof(*dsm_range), true, nvmf_ns_delete_complete, bio); bio->bio_driver2 = dsm_range; break; default: refcount_init(bio_refs(bio), 1); KASSERT(bio->bio_resid == 0, ("%s: input bio_resid != 0", __func__)); break; } refcount_acquire(&ns->active_bios); nvmf_submit_request(req); mtx_unlock(&ns->lock); return (0); } static int nvmf_ns_ioctl(struct cdev *dev, u_long cmd, caddr_t arg, int flag, struct thread *td) { struct nvmf_namespace *ns = dev->si_drv1; struct nvme_get_nsid *gnsid; struct nvme_pt_command *pt; switch (cmd) { case 
NVME_PASSTHROUGH_CMD: pt = (struct nvme_pt_command *)arg; pt->cmd.nsid = htole32(ns->id); return (nvmf_passthrough_cmd(ns->sc, pt, false)); case NVME_GET_NSID: gnsid = (struct nvme_get_nsid *)arg; strlcpy(gnsid->cdev, device_get_nameunit(ns->sc->dev), sizeof(gnsid->cdev)); gnsid->nsid = ns->id; return (0); case DIOCGMEDIASIZE: *(off_t *)arg = ns->size; return (0); case DIOCGSECTORSIZE: *(u_int *)arg = ns->lba_size; return (0); default: return (ENOTTY); } } static int nvmf_ns_open(struct cdev *dev, int oflags, int devtype, struct thread *td) { int error; error = 0; if ((oflags & FWRITE) != 0) error = securelevel_gt(td->td_ucred, 0); return (error); } void nvmf_ns_strategy(struct bio *bio) { struct nvmf_namespace *ns; int error; ns = bio->bio_dev->si_drv1; error = nvmf_ns_submit_bio(ns, bio); if (error != 0) { bio->bio_error = error; bio->bio_flags |= BIO_ERROR; bio->bio_resid = bio->bio_bcount; biodone(bio); } } static struct cdevsw nvmf_ns_cdevsw = { .d_version = D_VERSION, .d_flags = D_DISK, .d_open = nvmf_ns_open, .d_read = physread, .d_write = physwrite, .d_strategy = nvmf_ns_strategy, .d_ioctl = nvmf_ns_ioctl }; struct nvmf_namespace * nvmf_init_ns(struct nvmf_softc *sc, uint32_t id, const struct nvme_namespace_data *data) { struct make_dev_args mda; struct nvmf_namespace *ns; int error; uint8_t lbads, lbaf; ns = malloc(sizeof(*ns), M_NVMF, M_WAITOK | M_ZERO); ns->sc = sc; ns->id = id; TAILQ_INIT(&ns->pending_bios); mtx_init(&ns->lock, "nvmf ns", NULL, MTX_DEF); /* One dummy bio avoids dropping to 0 until destroy. */ refcount_init(&ns->active_bios, 1); if (NVMEV(NVME_NS_DATA_DPS_PIT, data->dps) != 0) { ns_printf(ns, "End-to-end data protection not supported\n"); goto fail; } lbaf = NVMEV(NVME_NS_DATA_FLBAS_FORMAT, data->flbas); if (lbaf > data->nlbaf) { ns_printf(ns, "Invalid LBA format index\n"); goto fail; } if (NVMEV(NVME_NS_DATA_LBAF_MS, data->lbaf[lbaf]) != 0) { ns_printf(ns, "Namespaces with metadata are not supported\n"); goto fail; } lbads = NVMEV(NVME_NS_DATA_LBAF_LBADS, data->lbaf[lbaf]); if (lbads == 0) { ns_printf(ns, "Invalid LBA format index\n"); goto fail; } ns->lba_size = 1 << lbads; ns->size = data->nsze * ns->lba_size; if (nvme_ctrlr_has_dataset_mgmt(sc->cdata)) ns->flags |= NVME_NS_DEALLOCATE_SUPPORTED; if (NVMEV(NVME_CTRLR_DATA_VWC_PRESENT, sc->cdata->vwc) != 0) ns->flags |= NVME_NS_FLUSH_SUPPORTED; /* * XXX: Does any of the boundary splitting for NOIOB make any * sense for Fabrics? 
*/ make_dev_args_init(&mda); mda.mda_devsw = &nvmf_ns_cdevsw; mda.mda_uid = UID_ROOT; mda.mda_gid = GID_WHEEL; mda.mda_mode = 0600; mda.mda_si_drv1 = ns; error = make_dev_s(&mda, &ns->cdev, "%sn%u", device_get_nameunit(sc->dev), id); if (error != 0) goto fail; ns->cdev->si_drv2 = make_dev_alias(ns->cdev, "%sns%u", device_get_nameunit(sc->dev), id); ns->cdev->si_flags |= SI_UNMAPPED; return (ns); fail: mtx_destroy(&ns->lock); free(ns, M_NVMF); return (NULL); } void nvmf_disconnect_ns(struct nvmf_namespace *ns) { mtx_lock(&ns->lock); ns->disconnected = true; mtx_unlock(&ns->lock); } void nvmf_reconnect_ns(struct nvmf_namespace *ns) { TAILQ_HEAD(, bio) bios; struct bio *bio; mtx_lock(&ns->lock); ns->disconnected = false; TAILQ_INIT(&bios); TAILQ_CONCAT(&bios, &ns->pending_bios, bio_queue); mtx_unlock(&ns->lock); while (!TAILQ_EMPTY(&bios)) { bio = TAILQ_FIRST(&bios); TAILQ_REMOVE(&bios, bio, bio_queue); nvmf_ns_strategy(bio); } } void nvmf_destroy_ns(struct nvmf_namespace *ns) { TAILQ_HEAD(, bio) bios; struct bio *bio; if (ns->cdev->si_drv2 != NULL) destroy_dev(ns->cdev->si_drv2); destroy_dev(ns->cdev); /* * Wait for active I/O requests to drain. The release drops * the reference on the "dummy bio" when the namespace is * created. */ mtx_lock(&ns->lock); if (!refcount_release(&ns->active_bios)) { while (ns->active_bios != 0) mtx_sleep(ns, &ns->lock, 0, "nvmfrmns", 0); } /* Abort any pending I/O requests. */ TAILQ_INIT(&bios); TAILQ_CONCAT(&bios, &ns->pending_bios, bio_queue); mtx_unlock(&ns->lock); while (!TAILQ_EMPTY(&bios)) { bio = TAILQ_FIRST(&bios); TAILQ_REMOVE(&bios, bio, bio_queue); bio->bio_error = ECONNABORTED; bio->bio_flags |= BIO_ERROR; bio->bio_resid = bio->bio_bcount; biodone(bio); } mtx_destroy(&ns->lock); free(ns, M_NVMF); } bool nvmf_update_ns(struct nvmf_namespace *ns, const struct nvme_namespace_data *data) { uint8_t lbads, lbaf; if (NVMEV(NVME_NS_DATA_DPS_PIT, data->dps) != 0) { ns_printf(ns, "End-to-end data protection not supported\n"); return (false); } lbaf = NVMEV(NVME_NS_DATA_FLBAS_FORMAT, data->flbas); if (lbaf > data->nlbaf) { ns_printf(ns, "Invalid LBA format index\n"); return (false); } if (NVMEV(NVME_NS_DATA_LBAF_MS, data->lbaf[lbaf]) != 0) { ns_printf(ns, "Namespaces with metadata are not supported\n"); return (false); } lbads = NVMEV(NVME_NS_DATA_LBAF_LBADS, data->lbaf[lbaf]); if (lbads == 0) { ns_printf(ns, "Invalid LBA format index\n"); return (false); } ns->lba_size = 1 << lbads; ns->size = data->nsze * ns->lba_size; return (true); } diff --git a/sys/dev/nvmf/host/nvmf_sim.c b/sys/dev/nvmf/host/nvmf_sim.c index 00dad07889d1..71bb71dd4063 100644 --- a/sys/dev/nvmf/host/nvmf_sim.c +++ b/sys/dev/nvmf/host/nvmf_sim.c @@ -1,332 +1,338 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2023-2024 Chelsio Communications, Inc. * Written by: John Baldwin */ #include #include #include #include #include #include #include #include #include #include /* * The I/O completion may trigger after the received CQE if the I/O * used a zero-copy mbuf that isn't harvested until after the NIC * driver processes TX completions. Use spriv_field0 to as a refcount. * * Store any I/O error returned in spriv_field1. 
*/ static __inline u_int * ccb_refs(union ccb *ccb) { return ((u_int *)&ccb->ccb_h.spriv_field0); } #define spriv_ioerror spriv_field1 static void nvmf_ccb_done(union ccb *ccb) { if (!refcount_release(ccb_refs(ccb))) return; if (nvmf_cqe_aborted(&ccb->nvmeio.cpl)) { - ccb->ccb_h.status = CAM_REQUEUE_REQ; + if (nvmf_fail_disconnect) + ccb->ccb_h.status = CAM_DEV_NOT_THERE; + else + ccb->ccb_h.status = CAM_REQUEUE_REQ; xpt_done(ccb); } else if (ccb->nvmeio.cpl.status != 0) { ccb->ccb_h.status = CAM_NVME_STATUS_ERROR; xpt_done(ccb); } else if (ccb->ccb_h.spriv_ioerror != 0) { KASSERT(ccb->ccb_h.spriv_ioerror != EJUSTRETURN, ("%s: zero sized transfer without CQE error", __func__)); ccb->ccb_h.status = CAM_REQ_CMP_ERR; xpt_done(ccb); } else { ccb->ccb_h.status = CAM_REQ_CMP; xpt_done_direct(ccb); } } static void nvmf_ccb_io_complete(void *arg, size_t xfered, int error) { union ccb *ccb = arg; /* * TODO: Reporting partial completions requires extending * nvmeio to support resid and updating nda to handle partial * reads, either by returning partial success (or an error) to * the caller, or retrying all or part of the request. */ ccb->ccb_h.spriv_ioerror = error; if (error == 0) { if (xfered == 0) { #ifdef INVARIANTS /* * If the request fails with an error in the CQE * there will be no data transferred but also no * I/O error. */ ccb->ccb_h.spriv_ioerror = EJUSTRETURN; #endif } else KASSERT(xfered == ccb->nvmeio.dxfer_len, ("%s: partial CCB completion", __func__)); } nvmf_ccb_done(ccb); } static void nvmf_ccb_complete(void *arg, const struct nvme_completion *cqe) { union ccb *ccb = arg; ccb->nvmeio.cpl = *cqe; nvmf_ccb_done(ccb); } static void nvmf_sim_io(struct nvmf_softc *sc, union ccb *ccb) { struct ccb_nvmeio *nvmeio = &ccb->nvmeio; struct memdesc mem; struct nvmf_request *req; struct nvmf_host_qpair *qp; mtx_lock(&sc->sim_mtx); if (sc->sim_disconnected) { mtx_unlock(&sc->sim_mtx); - nvmeio->ccb_h.status = CAM_REQUEUE_REQ; + if (nvmf_fail_disconnect) + nvmeio->ccb_h.status = CAM_DEV_NOT_THERE; + else + nvmeio->ccb_h.status = CAM_REQUEUE_REQ; xpt_done(ccb); return; } if (nvmeio->ccb_h.func_code == XPT_NVME_IO) qp = nvmf_select_io_queue(sc); else qp = sc->admin; req = nvmf_allocate_request(qp, &nvmeio->cmd, nvmf_ccb_complete, ccb, M_NOWAIT); if (req == NULL) { mtx_unlock(&sc->sim_mtx); nvmeio->ccb_h.status = CAM_RESRC_UNAVAIL; xpt_done(ccb); return; } if (nvmeio->dxfer_len != 0) { refcount_init(ccb_refs(ccb), 2); mem = memdesc_ccb(ccb); nvmf_capsule_append_data(req->nc, &mem, nvmeio->dxfer_len, (ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_OUT, nvmf_ccb_io_complete, ccb); } else refcount_init(ccb_refs(ccb), 1); /* * Clear spriv_ioerror as it can hold an earlier error if this * CCB was aborted and has been retried. 
*/ ccb->ccb_h.spriv_ioerror = 0; KASSERT(ccb->ccb_h.status == CAM_REQ_INPROG, ("%s: incoming CCB is not in-progress", __func__)); ccb->ccb_h.status |= CAM_SIM_QUEUED; nvmf_submit_request(req); mtx_unlock(&sc->sim_mtx); } static void nvmf_sim_action(struct cam_sim *sim, union ccb *ccb) { struct nvmf_softc *sc = cam_sim_softc(sim); CAM_DEBUG(ccb->ccb_h.path, CAM_DEBUG_TRACE, ("nvmf_sim_action: func= %#x\n", ccb->ccb_h.func_code)); switch (ccb->ccb_h.func_code) { case XPT_PATH_INQ: /* Path routing inquiry */ { struct ccb_pathinq *cpi = &ccb->cpi; cpi->version_num = 1; cpi->hba_inquiry = 0; cpi->target_sprt = 0; cpi->hba_misc = PIM_UNMAPPED | PIM_NOSCAN; cpi->hba_eng_cnt = 0; cpi->max_target = 0; cpi->max_lun = sc->cdata->nn; cpi->async_flags = 0; cpi->hpath_id = 0; cpi->initiator_id = 0; strlcpy(cpi->sim_vid, "FreeBSD", SIM_IDLEN); strlcpy(cpi->hba_vid, "NVMeoF", HBA_IDLEN); strlcpy(cpi->dev_name, cam_sim_name(sim), DEV_IDLEN); cpi->unit_number = cam_sim_unit(sim); cpi->bus_id = 0; /* XXX: Same as iSCSI. */ cpi->base_transfer_speed = 150000; cpi->protocol = PROTO_NVME; cpi->protocol_version = sc->vs; cpi->transport = XPORT_NVMF; cpi->transport_version = sc->vs; cpi->xport_specific.nvmf.nsid = xpt_path_lun_id(ccb->ccb_h.path); cpi->xport_specific.nvmf.trtype = sc->trtype; strlcpy(cpi->xport_specific.nvmf.dev_name, device_get_nameunit(sc->dev), sizeof(cpi->xport_specific.nvmf.dev_name)); cpi->maxio = sc->max_xfer_size; cpi->hba_vendor = 0; cpi->hba_device = 0; cpi->hba_subvendor = 0; cpi->hba_subdevice = 0; cpi->ccb_h.status = CAM_REQ_CMP; break; } case XPT_GET_TRAN_SETTINGS: /* Get transport settings */ { struct ccb_trans_settings *cts = &ccb->cts; struct ccb_trans_settings_nvme *nvme; struct ccb_trans_settings_nvmf *nvmf; cts->protocol = PROTO_NVME; cts->protocol_version = sc->vs; cts->transport = XPORT_NVMF; cts->transport_version = sc->vs; nvme = &cts->proto_specific.nvme; nvme->valid = CTS_NVME_VALID_SPEC; nvme->spec = sc->vs; nvmf = &cts->xport_specific.nvmf; nvmf->valid = CTS_NVMF_VALID_TRTYPE; nvmf->trtype = sc->trtype; cts->ccb_h.status = CAM_REQ_CMP; break; } case XPT_SET_TRAN_SETTINGS: /* Set transport settings */ /* * No transfer settings can be set, but nvme_xpt sends * this anyway. 
*/ ccb->ccb_h.status = CAM_REQ_CMP; break; case XPT_NVME_IO: /* Execute the requested I/O */ case XPT_NVME_ADMIN: /* or Admin operation */ nvmf_sim_io(sc, ccb); return; default: /* XXX */ device_printf(sc->dev, "unhandled sim function %#x\n", ccb->ccb_h.func_code); ccb->ccb_h.status = CAM_REQ_INVALID; break; } xpt_done(ccb); } int nvmf_init_sim(struct nvmf_softc *sc) { struct cam_devq *devq; int max_trans; max_trans = sc->max_pending_io * 3 / 4; devq = cam_simq_alloc(max_trans); if (devq == NULL) { device_printf(sc->dev, "Failed to allocate CAM simq\n"); return (ENOMEM); } mtx_init(&sc->sim_mtx, "nvmf sim", NULL, MTX_DEF); sc->sim = cam_sim_alloc(nvmf_sim_action, NULL, "nvme", sc, device_get_unit(sc->dev), NULL, max_trans, max_trans, devq); if (sc->sim == NULL) { device_printf(sc->dev, "Failed to allocate CAM sim\n"); cam_simq_free(devq); mtx_destroy(&sc->sim_mtx); return (ENXIO); } if (xpt_bus_register(sc->sim, sc->dev, 0) != CAM_SUCCESS) { device_printf(sc->dev, "Failed to create CAM bus\n"); cam_sim_free(sc->sim, TRUE); mtx_destroy(&sc->sim_mtx); return (ENXIO); } if (xpt_create_path(&sc->path, NULL, cam_sim_path(sc->sim), CAM_TARGET_WILDCARD, CAM_LUN_WILDCARD) != CAM_REQ_CMP) { device_printf(sc->dev, "Failed to create CAM path\n"); xpt_bus_deregister(cam_sim_path(sc->sim)); cam_sim_free(sc->sim, TRUE); mtx_destroy(&sc->sim_mtx); return (ENXIO); } return (0); } void nvmf_sim_rescan_ns(struct nvmf_softc *sc, uint32_t id) { union ccb *ccb; ccb = xpt_alloc_ccb_nowait(); if (ccb == NULL) { device_printf(sc->dev, "unable to alloc CCB for rescan of namespace %u\n", id); return; } /* * As with nvme_sim, map NVMe namespace IDs onto CAM unit * LUNs. */ if (xpt_create_path(&ccb->ccb_h.path, NULL, cam_sim_path(sc->sim), 0, id) != CAM_REQ_CMP) { device_printf(sc->dev, "Unable to create path for rescan of namespace %u\n", id); xpt_free_ccb(ccb); return; } xpt_rescan(ccb); } void nvmf_disconnect_sim(struct nvmf_softc *sc) { mtx_lock(&sc->sim_mtx); sc->sim_disconnected = true; xpt_freeze_simq(sc->sim, 1); mtx_unlock(&sc->sim_mtx); } void nvmf_reconnect_sim(struct nvmf_softc *sc) { mtx_lock(&sc->sim_mtx); sc->sim_disconnected = false; mtx_unlock(&sc->sim_mtx); xpt_release_simq(sc->sim, 1); } void nvmf_destroy_sim(struct nvmf_softc *sc) { xpt_async(AC_LOST_DEVICE, sc->path, NULL); if (sc->sim_disconnected) xpt_release_simq(sc->sim, 1); xpt_free_path(sc->path); xpt_bus_deregister(cam_sim_path(sc->sim)); cam_sim_free(sc->sim, TRUE); mtx_destroy(&sc->sim_mtx); } diff --git a/sys/dev/nvmf/host/nvmf_var.h b/sys/dev/nvmf/host/nvmf_var.h index 2fa0216baab8..cf88d2f7b01e 100644 --- a/sys/dev/nvmf/host/nvmf_var.h +++ b/sys/dev/nvmf/host/nvmf_var.h @@ -1,209 +1,212 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2023-2024 Chelsio Communications, Inc. 
* Written by: John Baldwin */ #ifndef __NVMF_VAR_H__ #define __NVMF_VAR_H__ #include #include #include #include #include #include #include #include struct nvmf_aer; struct nvmf_capsule; struct nvmf_host_qpair; struct nvmf_namespace; typedef void nvmf_request_complete_t(void *, const struct nvme_completion *); struct nvmf_ivars { struct nvmf_handoff_host *hh; struct nvmf_handoff_qpair_params *io_params; struct nvme_controller_data *cdata; }; struct nvmf_softc { device_t dev; struct nvmf_host_qpair *admin; struct nvmf_host_qpair **io; u_int num_io_queues; enum nvmf_trtype trtype; struct cam_sim *sim; struct cam_path *path; struct mtx sim_mtx; bool sim_disconnected; struct nvmf_namespace **ns; struct nvme_controller_data *cdata; uint64_t cap; uint32_t vs; u_int max_pending_io; u_long max_xfer_size; struct cdev *cdev; /* * Keep Alive support depends on two timers. The 'tx' timer * is responsible for sending KeepAlive commands and runs at * half the timeout interval. The 'rx' timer is responsible * for detecting an actual timeout. * * For efficient support of TKAS, the host does not reschedule * these timers every time new commands are scheduled. * Instead, the host sets the *_traffic flags when commands * are sent and received. The timeout handlers check and * clear these flags. This does mean it can take up to twice * the timeout time to detect an AWOL controller. */ bool ka_traffic; /* Using TKAS? */ volatile int ka_active_tx_traffic; struct callout ka_tx_timer; sbintime_t ka_tx_sbt; volatile int ka_active_rx_traffic; struct callout ka_rx_timer; sbintime_t ka_rx_sbt; struct sx connection_lock; struct task disconnect_task; bool detaching; u_int num_aer; struct nvmf_aer *aer; }; struct nvmf_request { struct nvmf_host_qpair *qp; struct nvmf_capsule *nc; nvmf_request_complete_t *cb; void *cb_arg; bool aer; STAILQ_ENTRY(nvmf_request) link; }; struct nvmf_completion_status { struct nvme_completion cqe; bool done; bool io_done; int io_error; }; static __inline struct nvmf_host_qpair * nvmf_select_io_queue(struct nvmf_softc *sc) { /* TODO: Support multiple queues? */ return (sc->io[0]); } static __inline bool nvmf_cqe_aborted(const struct nvme_completion *cqe) { uint16_t status; status = le16toh(cqe->status); return (NVME_STATUS_GET_SCT(status) == NVME_SCT_PATH_RELATED && NVME_STATUS_GET_SC(status) == NVME_SC_COMMAND_ABORTED_BY_HOST); } static __inline void nvmf_status_init(struct nvmf_completion_status *status) { status->done = false; status->io_done = true; status->io_error = 0; } static __inline void nvmf_status_wait_io(struct nvmf_completion_status *status) { status->io_done = false; } #ifdef DRIVER_MODULE extern driver_t nvme_nvmf_driver; #endif #ifdef MALLOC_DECLARE MALLOC_DECLARE(M_NVMF); #endif +/* If true, I/O requests will fail while the host is disconnected. 
*/ +extern bool nvmf_fail_disconnect; + /* nvmf.c */ void nvmf_complete(void *arg, const struct nvme_completion *cqe); void nvmf_io_complete(void *arg, size_t xfered, int error); void nvmf_wait_for_reply(struct nvmf_completion_status *status); int nvmf_init_ivars(struct nvmf_ivars *ivars, struct nvmf_handoff_host *hh); void nvmf_free_ivars(struct nvmf_ivars *ivars); void nvmf_disconnect(struct nvmf_softc *sc); void nvmf_rescan_ns(struct nvmf_softc *sc, uint32_t nsid); void nvmf_rescan_all_ns(struct nvmf_softc *sc); int nvmf_passthrough_cmd(struct nvmf_softc *sc, struct nvme_pt_command *pt, bool admin); /* nvmf_aer.c */ void nvmf_init_aer(struct nvmf_softc *sc); int nvmf_start_aer(struct nvmf_softc *sc); void nvmf_destroy_aer(struct nvmf_softc *sc); /* nvmf_cmd.c */ bool nvmf_cmd_get_property(struct nvmf_softc *sc, uint32_t offset, uint8_t size, nvmf_request_complete_t *cb, void *cb_arg, int how); bool nvmf_cmd_set_property(struct nvmf_softc *sc, uint32_t offset, uint8_t size, uint64_t value, nvmf_request_complete_t *cb, void *cb_arg, int how); bool nvmf_cmd_keep_alive(struct nvmf_softc *sc, nvmf_request_complete_t *cb, void *cb_arg, int how); bool nvmf_cmd_identify_active_namespaces(struct nvmf_softc *sc, uint32_t id, struct nvme_ns_list *nslist, nvmf_request_complete_t *req_cb, void *req_cb_arg, nvmf_io_complete_t *io_cb, void *io_cb_arg, int how); bool nvmf_cmd_identify_namespace(struct nvmf_softc *sc, uint32_t id, struct nvme_namespace_data *nsdata, nvmf_request_complete_t *req_cb, void *req_cb_arg, nvmf_io_complete_t *io_cb, void *io_cb_arg, int how); bool nvmf_cmd_get_log_page(struct nvmf_softc *sc, uint32_t nsid, uint8_t lid, uint64_t offset, void *buf, size_t len, nvmf_request_complete_t *req_cb, void *req_cb_arg, nvmf_io_complete_t *io_cb, void *io_cb_arg, int how); /* nvmf_ctldev.c */ int nvmf_ctl_load(void); void nvmf_ctl_unload(void); /* nvmf_ns.c */ struct nvmf_namespace *nvmf_init_ns(struct nvmf_softc *sc, uint32_t id, const struct nvme_namespace_data *data); void nvmf_disconnect_ns(struct nvmf_namespace *ns); void nvmf_reconnect_ns(struct nvmf_namespace *ns); void nvmf_destroy_ns(struct nvmf_namespace *ns); bool nvmf_update_ns(struct nvmf_namespace *ns, const struct nvme_namespace_data *data); /* nvmf_qpair.c */ struct nvmf_host_qpair *nvmf_init_qp(struct nvmf_softc *sc, enum nvmf_trtype trtype, struct nvmf_handoff_qpair_params *handoff, const char *name); void nvmf_shutdown_qp(struct nvmf_host_qpair *qp); void nvmf_destroy_qp(struct nvmf_host_qpair *qp); struct nvmf_request *nvmf_allocate_request(struct nvmf_host_qpair *qp, void *sqe, nvmf_request_complete_t *cb, void *cb_arg, int how); void nvmf_submit_request(struct nvmf_request *req); void nvmf_free_request(struct nvmf_request *req); /* nvmf_sim.c */ int nvmf_init_sim(struct nvmf_softc *sc); void nvmf_disconnect_sim(struct nvmf_softc *sc); void nvmf_reconnect_sim(struct nvmf_softc *sc); void nvmf_destroy_sim(struct nvmf_softc *sc); void nvmf_sim_rescan_ns(struct nvmf_softc *sc, uint32_t id); #endif /* !__NVMF_VAR_H__ */
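
The new kern.nvmf.fail_on_disconnection knob introduced above is declared with CTLFLAG_RWTUN, so it can be changed at runtime as well as set from the loader. The following is a minimal, hypothetical userland sketch (not part of this commit) showing how the knob could be read and toggled programmatically via sysctlbyname(3); only the sysctl name comes from the patch, while the program structure and argument handling are illustrative assumptions.

/*
 * Illustrative helper: read and optionally set
 * kern.nvmf.fail_on_disconnection.  Not part of this change.
 */
#include <sys/types.h>
#include <sys/sysctl.h>

#include <err.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

int
main(int argc, char **argv)
{
	bool val;
	size_t len = sizeof(val);

	/* Read the current setting (SYSCTL_BOOL exports a 1-byte value). */
	if (sysctlbyname("kern.nvmf.fail_on_disconnection", &val, &len,
	    NULL, 0) == -1)
		err(1, "sysctlbyname(read)");
	printf("fail_on_disconnection is currently %d\n", val);

	/* Optionally write a new value passed as argv[1] (0 or 1). */
	if (argc > 1) {
		val = (atoi(argv[1]) != 0);
		if (sysctlbyname("kern.nvmf.fail_on_disconnection", NULL,
		    NULL, &val, sizeof(val)) == -1)
			err(1, "sysctlbyname(write)");
	}
	return (0);
}

In practice the same toggle is normally done with sysctl(8), e.g. "sysctl kern.nvmf.fail_on_disconnection=1", or set at boot as a loader(8) tunable, as documented in the manual page hunk above.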