diff --git a/share/man/man4/Makefile b/share/man/man4/Makefile --- a/share/man/man4/Makefile +++ b/share/man/man4/Makefile @@ -410,6 +410,7 @@ nvme.4 \ nvmf.4 \ nvmf_tcp.4 \ + nvmft.4 \ ${_nvram.4} \ oce.4 \ ocs_fc.4\ diff --git a/share/man/man4/nvmft.4 b/share/man/man4/nvmft.4 new file mode 100644 --- /dev/null +++ b/share/man/man4/nvmft.4 @@ -0,0 +1,85 @@ +.\" +.\" SPDX-License-Identifier: BSD-2-Clause +.\" +.\" Copyright (c) 2024 Chelsio Communications, Inc. +.\" +.Dd May 2, 2024 +.Dt NVMFT 4 +.Os +.Sh NAME +.Nm nvmft +.Nd "NVM Express over Fabrics CAM Target Layer frontend" +.Sh SYNOPSIS +To compile the subsystem into the kernel, +place the following lines in the +kernel configuration file: +.Bd -ragged -offset indent +.Cd "device nvmft" +.Cd "device ctl" +.Ed +.Pp +Alternatively, to load the subsystem as a +module at boot time, place the following line in +.Xr loader.conf 5 : +.Bd -literal -offset indent +nvmft_load="YES" +.Ed +.Sh DESCRIPTION +The +.Nm +driver provides the kernel component of an NVM Express over Fabrics +controller. +The NVMeoF controller is the server exporting namespaces backed by +local files and volumes to remote hosts. +.Nm +follows the dynamic controller model and creates a new dynamic controller +for each association. +.Pp +.Nm +is implemented as a +.Xr ctl 4 +frontend and exports CAM Target Layer LUNs as namespaces to remote hosts. +LUNs can be configured via +.Xr ctladm 8 . +.Pp +Associations between the local controller and remote hosts are managed +using both the +.Xr nvmfd 8 +daemon and the +.Xr ctladm 8 +utility. +The +.Xr nvmfd 8 +daemon listens for new associations and handles transport-specific +negotiation before handing off connected queue pairs to +.Nm +which associates queue pairs with a suitable controller instance. +The +.Cm nvlist +.Xr ctladm 8 +command lists active controllers. +The +.Cm nvterminate +command terminates one or more associations between a local controller +and a remote host. +.Pp +Associations require a supported transport such as +.Xr nvmf_tcp 4 +for associations using TCP/IP. +.Sh SEE ALSO +.Xr ctl 4 , +.Xr nvmf 4 , +.Xr nvmf_tcp 4 , +.Xr ctladm 8 , +.Xr nvmfd 8 +.Sh HISTORY +The +.Nm +module first appeared in +.Fx 15.0 . +.Sh AUTHORS +The +.Nm +subsystem was developed by +.An John Baldwin Aq Mt jhb@FreeBSD.org +under sponsorship from Chelsio Communications, Inc. 
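To make the workflow described in nvmft.4 above concrete, a possible sequence on the target host might look like the following. The subnqn and portid option names come from the port parameters accepted by nvmft_port_create() later in this change; the backing zvol path, the example NQN, and the exact ctladm(8)/nvmfd(8) flag spellings are illustrative assumptions and should be checked against those manual pages rather than taken as verified syntax:

    kldload nvmft                  (or build "device nvmft" and "device ctl" into the kernel)
    ctladm create -b block -o file=/dev/zvol/tank/ns0
    ctladm port -c -d nvmf -O subnqn=nqn.2001-03.com.example:target0 -O portid=1
    nvmfd ...                      (accept new associations and hand the queue pairs off)
    ctladm nvlist                  (list the resulting controllers)
    ctladm nvterminate ...         (drop one or more associations)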
diff --git a/sys/conf/NOTES b/sys/conf/NOTES --- a/sys/conf/NOTES +++ b/sys/conf/NOTES @@ -1677,6 +1677,7 @@ # # nvme: PCI-express NVM Express host controllers # nvmf: NVM Express over Fabrics host +# nvmft: NVM Express over Fabrics CAM Target Layer frontend # nvmf_tcp: TCP transport for NVM Express over Fabrics # nda: CAM NVMe disk driver # nvd: non-CAM NVMe disk driver @@ -1684,6 +1685,7 @@ device nvme # PCI-express NVMe host driver options NVME_USE_NVD=1 # Use nvd(4) instead of the CAM nda(4) driver device nvmf # NVMeoF host driver +device nvmft # NVMeoF ctl(4) frontend device nvmf_tcp # NVMeoF TCP transport device nda # NVMe direct access devices (aka disks) device nvd # expose NVMe namespaces as disks, depends on nvme diff --git a/sys/conf/files b/sys/conf/files --- a/sys/conf/files +++ b/sys/conf/files @@ -2535,6 +2535,10 @@ dev/nvme/nvme_util.c optional nvme dev/nvmem/nvmem.c optional nvmem fdt dev/nvmem/nvmem_if.m optional nvmem +dev/nvmf/controller/ctl_frontend_nvmf.c optional nvmft +dev/nvmf/controller/nvmft_controller.c optional nvmft +dev/nvmf/controller/nvmft_subr.c optional nvmft +dev/nvmf/controller/nvmft_qpair.c optional nvmft dev/nvmf/host/nvmf.c optional nvmf dev/nvmf/host/nvmf_aer.c optional nvmf dev/nvmf/host/nvmf_cmd.c optional nvmf @@ -2543,7 +2547,7 @@ dev/nvmf/host/nvmf_qpair.c optional nvmf dev/nvmf/host/nvmf_sim.c optional nvmf dev/nvmf/nvmf_tcp.c optional nvmf_tcp -dev/nvmf/nvmf_transport.c optional nvmf +dev/nvmf/nvmf_transport.c optional nvmf | optional nvmft dev/oce/oce_hw.c optional oce pci dev/oce/oce_if.c optional oce pci dev/oce/oce_mbox.c optional oce pci diff --git a/sys/dev/nvmf/controller/ctl_frontend_nvmf.c b/sys/dev/nvmf/controller/ctl_frontend_nvmf.c new file mode 100644 --- /dev/null +++ b/sys/dev/nvmf/controller/ctl_frontend_nvmf.c @@ -0,0 +1,1123 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2023-2024 Chelsio Communications, Inc. + * Written by: John Baldwin + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include + +/* + * Store pointers to the capsule and qpair in the two pointer members + * of CTL_PRIV_FRONTEND. 
+ */ +#define NVMFT_NC(io) ((io)->io_hdr.ctl_private[CTL_PRIV_FRONTEND].ptrs[0]) +#define NVMFT_QP(io) ((io)->io_hdr.ctl_private[CTL_PRIV_FRONTEND].ptrs[1]) + +static void nvmft_done(union ctl_io *io); +static int nvmft_init(void); +static int nvmft_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, + int flag, struct thread *td); +static int nvmft_shutdown(void); + +static TAILQ_HEAD(, nvmft_port) nvmft_ports; +static struct sx nvmft_ports_lock; + +MALLOC_DEFINE(M_NVMFT, "nvmft", "NVMe over Fabrics controller"); + +static struct ctl_frontend nvmft_frontend = { + .name = "nvmf", + .init = nvmft_init, + .ioctl = nvmft_ioctl, + .fe_dump = NULL, + .shutdown = nvmft_shutdown, +}; + +static void +nvmft_online(void *arg) +{ + struct nvmft_port *np = arg; + + sx_xlock(&np->lock); + np->online = true; + sx_xunlock(&np->lock); +} + +static void +nvmft_offline(void *arg) +{ + struct nvmft_port *np = arg; + struct nvmft_controller *ctrlr; + + sx_xlock(&np->lock); + np->online = false; + + TAILQ_FOREACH(ctrlr, &np->controllers, link) { + nvmft_printf(ctrlr, + "shutting down due to port going offline\n"); + nvmft_controller_error(ctrlr, NULL, ENODEV); + } + + while (!TAILQ_EMPTY(&np->controllers)) + sx_sleep(np, &np->lock, 0, "nvmfoff", 0); + sx_xunlock(&np->lock); +} + +static int +nvmft_lun_enable(void *arg, int lun_id) +{ + struct nvmft_port *np = arg; + struct nvmft_controller *ctrlr; + uint32_t *old_ns, *new_ns; + uint32_t nsid; + u_int i; + + if (lun_id >= le32toh(np->cdata.nn)) { + printf("NVMFT: %s lun %d larger than maximum nsid %u\n", + np->cdata.subnqn, lun_id, le32toh(np->cdata.nn)); + return (EOPNOTSUPP); + } + nsid = lun_id + 1; + + sx_xlock(&np->lock); + new_ns = mallocarray(np->num_ns + 1, sizeof(*new_ns), M_NVMFT, + M_WAITOK); + for (i = 0; i < np->num_ns; i++) { + if (np->active_ns[i] < nsid) + continue; + if (np->active_ns[i] == nsid) { + sx_xunlock(&np->lock); + free(new_ns, M_NVMFT); + printf("NVMFT: %s duplicate lun %d\n", + np->cdata.subnqn, lun_id); + return (EINVAL); + } + break; + } + + /* Copy over IDs smaller than nsid. */ + memcpy(new_ns, np->active_ns, i * sizeof(*np->active_ns)); + + /* Insert nsid. */ + new_ns[i] = nsid; + + /* Copy over IDs greater than nsid. */ + memcpy(new_ns + i + 1, np->active_ns + i, (np->num_ns - i) * + sizeof(*np->active_ns)); + + np->num_ns++; + old_ns = np->active_ns; + np->active_ns = new_ns; + + TAILQ_FOREACH(ctrlr, &np->controllers, link) { + nvmft_controller_lun_changed(ctrlr, lun_id); + } + + sx_xunlock(&np->lock); + free(old_ns, M_NVMFT); + + return (0); +} + +static int +nvmft_lun_disable(void *arg, int lun_id) +{ + struct nvmft_port *np = arg; + struct nvmft_controller *ctrlr; + uint32_t nsid; + u_int i; + + if (lun_id >= le32toh(np->cdata.nn)) + return (0); + nsid = lun_id + 1; + + sx_xlock(&np->lock); + for (i = 0; i < np->num_ns; i++) { + if (np->active_ns[i] == nsid) + goto found; + } + sx_xunlock(&np->lock); + printf("NVMFT: %s request to disable nonexistent lun %d\n", + np->cdata.subnqn, lun_id); + return (EINVAL); + +found: + /* Move down IDs greater than nsid. */ + memmove(np->active_ns + i, np->active_ns + i + 1, + (np->num_ns - (i + 1)) * sizeof(*np->active_ns)); + np->num_ns--; + + /* NB: Don't bother freeing the old active_ns array. 
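+	 * It is merely left oversized: nvmft_lun_enable() replaces it outright
+	 * with a fresh allocation, and nvmft_port_free() releases it when the
+	 * port goes away.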
*/ + + TAILQ_FOREACH(ctrlr, &np->controllers, link) { + nvmft_controller_lun_changed(ctrlr, lun_id); + } + + sx_xunlock(&np->lock); + + return (0); +} + +void +nvmft_populate_active_nslist(struct nvmft_port *np, uint32_t nsid, + struct nvme_ns_list *nslist) +{ + u_int i, count; + + sx_slock(&np->lock); + count = 0; + for (i = 0; i < np->num_ns; i++) { + if (np->active_ns[i] <= nsid) + continue; + nslist->ns[count] = htole32(np->active_ns[i]); + count++; + if (count == nitems(nslist->ns)) + break; + } + sx_sunlock(&np->lock); +} + +void +nvmft_dispatch_command(struct nvmft_qpair *qp, struct nvmf_capsule *nc, + bool admin) +{ + struct nvmft_controller *ctrlr = nvmft_qpair_ctrlr(qp); + const struct nvme_command *cmd = nvmf_capsule_sqe(nc); + struct nvmft_port *np = ctrlr->np; + union ctl_io *io; + int error; + + if (cmd->nsid == htole32(0)) { + nvmft_send_generic_error(qp, nc, + NVME_SC_INVALID_NAMESPACE_OR_FORMAT); + nvmf_free_capsule(nc); + return; + } + + mtx_lock(&ctrlr->lock); + if (ctrlr->pending_commands == 0) + ctrlr->start_busy = sbinuptime(); + ctrlr->pending_commands++; + mtx_unlock(&ctrlr->lock); + io = ctl_alloc_io(np->port.ctl_pool_ref); + ctl_zero_io(io); + NVMFT_NC(io) = nc; + NVMFT_QP(io) = qp; + io->io_hdr.io_type = admin ? CTL_IO_NVME_ADMIN : CTL_IO_NVME; + io->io_hdr.nexus.initid = ctrlr->cntlid; + io->io_hdr.nexus.targ_port = np->port.targ_port; + io->io_hdr.nexus.targ_lun = le32toh(cmd->nsid) - 1; + io->nvmeio.cmd = *cmd; + error = ctl_run(io); + if (error != 0) { + nvmft_printf(ctrlr, "ctl_run failed for command on %s: %d\n", + nvmft_qpair_name(qp), error); + ctl_nvme_set_generic_error(&io->nvmeio, + NVME_SC_INTERNAL_DEVICE_ERROR); + nvmft_done(io); + + nvmft_controller_error(ctrlr, qp, ENXIO); + } +} + +void +nvmft_terminate_commands(struct nvmft_controller *ctrlr) +{ + struct nvmft_port *np = ctrlr->np; + union ctl_io *io; + int error; + + mtx_lock(&ctrlr->lock); + if (ctrlr->pending_commands == 0) + ctrlr->start_busy = sbinuptime(); + ctrlr->pending_commands++; + mtx_unlock(&ctrlr->lock); + io = ctl_alloc_io(np->port.ctl_pool_ref); + ctl_zero_io(io); + NVMFT_QP(io) = ctrlr->admin; + io->io_hdr.io_type = CTL_IO_TASK; + io->io_hdr.nexus.initid = ctrlr->cntlid; + io->io_hdr.nexus.targ_port = np->port.targ_port; + io->io_hdr.nexus.targ_lun = 0; + io->taskio.tag_type = CTL_TAG_SIMPLE; /* XXX: unused? 
*/ + io->taskio.task_action = CTL_TASK_I_T_NEXUS_RESET; + error = ctl_run(io); + if (error != CTL_RETVAL_COMPLETE) { + nvmft_printf(ctrlr, "failed to terminate tasks: %d\n", error); +#ifdef INVARIANTS + io->io_hdr.status = CTL_SUCCESS; +#endif + nvmft_done(io); + } +} + +static void +nvmft_datamove_out_cb(void *arg, size_t xfered, int error) +{ + struct ctl_nvmeio *ctnio = arg; + + if (error != 0) { + ctl_nvme_set_data_transfer_error(ctnio); + } else { + MPASS(xfered == ctnio->kern_data_len); + ctnio->kern_data_resid -= xfered; + } + + if (ctnio->kern_sg_entries) { + free(ctnio->ext_data_ptr, M_NVMFT); + ctnio->ext_data_ptr = NULL; + } else + MPASS(ctnio->ext_data_ptr == NULL); + ctl_datamove_done((union ctl_io *)ctnio, false); +} + +static void +nvmft_datamove_out(struct ctl_nvmeio *ctnio, struct nvmft_qpair *qp, + struct nvmf_capsule *nc) +{ + struct memdesc mem; + int error; + + MPASS(ctnio->ext_data_ptr == NULL); + if (ctnio->kern_sg_entries > 0) { + struct ctl_sg_entry *sgl; + struct bus_dma_segment *vlist; + + vlist = mallocarray(ctnio->kern_sg_entries, sizeof(*vlist), + M_NVMFT, M_WAITOK); + ctnio->ext_data_ptr = (void *)vlist; + sgl = (struct ctl_sg_entry *)ctnio->kern_data_ptr; + for (u_int i = 0; i < ctnio->kern_sg_entries; i++) { + vlist[i].ds_addr = (uintptr_t)sgl[i].addr; + vlist[i].ds_len = sgl[i].len; + } + mem = memdesc_vlist(vlist, ctnio->kern_sg_entries); + } else + mem = memdesc_vaddr(ctnio->kern_data_ptr, ctnio->kern_data_len); + + error = nvmf_receive_controller_data(nc, ctnio->kern_rel_offset, &mem, + ctnio->kern_data_len, nvmft_datamove_out_cb, ctnio); + if (error == 0) + return; + + nvmft_printf(nvmft_qpair_ctrlr(qp), + "Failed to request capsule data: %d\n", error); + ctl_nvme_set_data_transfer_error(ctnio); + + if (ctnio->kern_sg_entries) { + free(ctnio->ext_data_ptr, M_NVMFT); + ctnio->ext_data_ptr = NULL; + } else + MPASS(ctnio->ext_data_ptr == NULL); + ctl_datamove_done((union ctl_io *)ctnio, true); +} + +static struct mbuf * +nvmft_copy_data(struct ctl_nvmeio *ctnio) +{ + struct ctl_sg_entry *sgl; + struct mbuf *m0, *m; + uint32_t resid, off, todo; + int mlen; + + MPASS(ctnio->kern_data_len != 0); + + m0 = m_getm2(NULL, ctnio->kern_data_len, M_WAITOK, MT_DATA, 0); + + if (ctnio->kern_sg_entries == 0) { + m_copyback(m0, 0, ctnio->kern_data_len, ctnio->kern_data_ptr); + return (m0); + } + + resid = ctnio->kern_data_len; + sgl = (struct ctl_sg_entry *)ctnio->kern_data_ptr; + off = 0; + m = m0; + mlen = M_TRAILINGSPACE(m); + for (;;) { + todo = MIN(mlen, sgl->len - off); + memcpy(mtod(m, char *) + m->m_len, (char *)sgl->addr + off, + todo); + m->m_len += todo; + resid -= todo; + if (resid == 0) { + MPASS(m->m_next == NULL); + break; + } + + off += todo; + if (off == sgl->len) { + sgl++; + off = 0; + } + mlen -= todo; + if (mlen == 0) { + m = m->m_next; + mlen = M_TRAILINGSPACE(m); + } + } + + return (m0); +} + +static void +m_free_ref_data(struct mbuf *m) +{ + ctl_ref kern_data_ref = m->m_ext.ext_arg1; + + kern_data_ref(m->m_ext.ext_arg2, -1); +} + +static struct mbuf * +m_get_ref_data(struct ctl_nvmeio *ctnio, void *buf, u_int size) +{ + struct mbuf *m; + + m = m_get(M_WAITOK, MT_DATA); + m_extadd(m, buf, size, m_free_ref_data, ctnio->kern_data_ref, + ctnio->kern_data_arg, M_RDONLY, EXT_CTL); + m->m_len = size; + ctnio->kern_data_ref(ctnio->kern_data_arg, 1); + return (m); +} + +static struct mbuf * +nvmft_ref_data(struct ctl_nvmeio *ctnio) +{ + struct ctl_sg_entry *sgl; + struct mbuf *m0, *m; + + MPASS(ctnio->kern_data_len != 0); + + if (ctnio->kern_sg_entries 
== 0) + return (m_get_ref_data(ctnio, ctnio->kern_data_ptr, + ctnio->kern_data_len)); + + sgl = (struct ctl_sg_entry *)ctnio->kern_data_ptr; + m0 = m_get_ref_data(ctnio, sgl[0].addr, sgl[0].len); + m = m0; + for (u_int i = 1; i < ctnio->kern_sg_entries; i++) { + m->m_next = m_get_ref_data(ctnio, sgl[i].addr, sgl[i].len); + m = m->m_next; + } + return (m0); +} + +static void +nvmft_datamove_in(struct ctl_nvmeio *ctnio, struct nvmft_qpair *qp, + struct nvmf_capsule *nc) +{ + struct mbuf *m; + u_int status; + + if (ctnio->kern_data_ref != NULL) + m = nvmft_ref_data(ctnio); + else + m = nvmft_copy_data(ctnio); + status = nvmf_send_controller_data(nc, ctnio->kern_rel_offset, m, + ctnio->kern_data_len); + switch (status) { + case NVMF_SUCCESS_SENT: + ctnio->success_sent = true; + nvmft_command_completed(qp, nc); + /* FALLTHROUGH */ + case NVMF_MORE: + case NVME_SC_SUCCESS: + break; + default: + ctl_nvme_set_generic_error(ctnio, status); + break; + } + ctl_datamove_done((union ctl_io *)ctnio, true); +} + +static void +nvmft_datamove(union ctl_io *io) +{ + struct nvmf_capsule *nc; + struct nvmft_qpair *qp; + + /* Some CTL commands preemptively set a success status. */ + MPASS(io->io_hdr.status == CTL_STATUS_NONE || + io->io_hdr.status == CTL_SUCCESS); + MPASS(!io->nvmeio.success_sent); + + nc = NVMFT_NC(io); + qp = NVMFT_QP(io); + + if ((io->io_hdr.flags & CTL_FLAG_DATA_MASK) == CTL_FLAG_DATA_IN) + nvmft_datamove_in(&io->nvmeio, qp, nc); + else + nvmft_datamove_out(&io->nvmeio, qp, nc); +} + +static void +hip_add(uint64_t pair[2], uint64_t addend) +{ + uint64_t old, new; + + old = le64toh(pair[0]); + new = old + addend; + pair[0] = htole64(new); + if (new < old) + pair[1] += htole64(1); +} + +static void +nvmft_done(union ctl_io *io) +{ + struct nvmft_controller *ctrlr; + const struct nvme_command *cmd; + struct nvmft_qpair *qp; + struct nvmf_capsule *nc; + size_t len; + + KASSERT(io->io_hdr.status == CTL_SUCCESS || + io->io_hdr.status == CTL_NVME_ERROR, + ("%s: bad status %u", __func__, io->io_hdr.status)); + + nc = NVMFT_NC(io); + qp = NVMFT_QP(io); + ctrlr = nvmft_qpair_ctrlr(qp); + + if (nc == NULL) { + /* Completion of nvmft_terminate_commands. 
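+		 * That I/O was allocated without an attached capsule
+		 * (NVMFT_NC(io) is left NULL by ctl_zero_io()), so there is
+		 * no completion to send back here.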
*/ + goto end; + } + + cmd = nvmf_capsule_sqe(nc); + + if (io->io_hdr.status == CTL_SUCCESS) + len = nvmf_capsule_data_len(nc) / 512; + else + len = 0; + switch (cmd->opc) { + case NVME_OPC_WRITE: + mtx_lock(&ctrlr->lock); + hip_add(ctrlr->hip.host_write_commands, 1); + len += ctrlr->partial_duw; + if (len > 1000) + hip_add(ctrlr->hip.data_units_written, len / 1000); + ctrlr->partial_duw = len % 1000; + mtx_unlock(&ctrlr->lock); + break; + case NVME_OPC_READ: + case NVME_OPC_COMPARE: + case NVME_OPC_VERIFY: + mtx_lock(&ctrlr->lock); + if (cmd->opc != NVME_OPC_VERIFY) + hip_add(ctrlr->hip.host_read_commands, 1); + len += ctrlr->partial_dur; + if (len > 1000) + hip_add(ctrlr->hip.data_units_read, len / 1000); + ctrlr->partial_dur = len % 1000; + mtx_unlock(&ctrlr->lock); + break; + } + + if (io->nvmeio.success_sent) { + MPASS(io->io_hdr.status == CTL_SUCCESS); + } else { + io->nvmeio.cpl.cid = cmd->cid; + nvmft_send_response(qp, &io->nvmeio.cpl); + } + nvmf_free_capsule(nc); +end: + ctl_free_io(io); + mtx_lock(&ctrlr->lock); + ctrlr->pending_commands--; + if (ctrlr->pending_commands == 0) + ctrlr->busy_total += sbinuptime() - ctrlr->start_busy; + mtx_unlock(&ctrlr->lock); +} + +static int +nvmft_init(void) +{ + TAILQ_INIT(&nvmft_ports); + sx_init(&nvmft_ports_lock, "nvmft ports"); + return (0); +} + +void +nvmft_port_free(struct nvmft_port *np) +{ + KASSERT(TAILQ_EMPTY(&np->controllers), + ("%s(%p): active controllers", __func__, np)); + + if (np->port.targ_port != -1) { + if (ctl_port_deregister(&np->port) != 0) + printf("%s: ctl_port_deregister() failed\n", __func__); + } + + free(np->active_ns, M_NVMFT); + clean_unrhdr(np->ids); + delete_unrhdr(np->ids); + sx_destroy(&np->lock); + free(np, M_NVMFT); +} + +static struct nvmft_port * +nvmft_port_find(const char *subnqn) +{ + struct nvmft_port *np; + + KASSERT(nvmf_nqn_valid(subnqn), ("%s: invalid nqn", __func__)); + + sx_assert(&nvmft_ports_lock, SA_LOCKED); + TAILQ_FOREACH(np, &nvmft_ports, link) { + if (strcmp(np->cdata.subnqn, subnqn) == 0) + break; + } + return (np); +} + +static struct nvmft_port * +nvmft_port_find_by_id(int port_id) +{ + struct nvmft_port *np; + + sx_assert(&nvmft_ports_lock, SA_LOCKED); + TAILQ_FOREACH(np, &nvmft_ports, link) { + if (np->port.targ_port == port_id) + break; + } + return (np); +} + +/* + * Helper function to fetch a number stored as a string in an nv_list. + * Returns false if the string was not a valid number. 
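+ *
+ * Illustrative example (hypothetical nvlist contents): if the list holds
+ * "portid" = "257", then
+ *
+ *	u_long portid;
+ *	dnvlist_get_strnum(nvl, "portid", UINT16_MAX, &portid);
+ *
+ * returns true and sets portid to 257; a missing "portid" yields the
+ * default value, an empty or trailing-garbage string fails, and since
+ * strtoul() is called with base 0, "0x101" also parses as 257.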
+ */ +static bool +dnvlist_get_strnum(nvlist_t *nvl, const char *name, u_long default_value, + u_long *value) +{ + const char *str; + char *cp; + + str = dnvlist_get_string(nvl, name, NULL); + if (str == NULL) { + *value = default_value; + return (true); + } + if (*str == '\0') + return (false); + *value = strtoul(str, &cp, 0); + if (*cp != '\0') + return (false); + return (true); +} + +/* + * NVMeoF ports support the following parameters: + * + * Mandatory: + * + * subnqn: subsystem NVMe Qualified Name + * portid: integer port ID from Discovery Log Page entry + * + * Optional: + * serial: Serial Number string + * max_io_qsize: Maximum number of I/O queue entries + * enable_timeout: Timeout for controller enable in milliseconds + * ioccsz: Maximum command capsule size + * iorcsz: Maximum response capsule size + * nn: Number of namespaces + */ +static void +nvmft_port_create(struct ctl_req *req) +{ + struct nvmft_port *np; + struct ctl_port *port; + const char *serial, *subnqn; + char serial_buf[NVME_SERIAL_NUMBER_LENGTH]; + u_long enable_timeout, hostid, ioccsz, iorcsz, max_io_qsize, nn, portid; + int error; + + /* Required parameters. */ + subnqn = dnvlist_get_string(req->args_nvl, "subnqn", NULL); + if (subnqn == NULL || !nvlist_exists_string(req->args_nvl, "portid")) { + req->status = CTL_LUN_ERROR; + snprintf(req->error_str, sizeof(req->error_str), + "Missing required argument"); + return; + } + if (!nvmf_nqn_valid(subnqn)) { + req->status = CTL_LUN_ERROR; + snprintf(req->error_str, sizeof(req->error_str), + "Invalid SubNQN"); + return; + } + if (!dnvlist_get_strnum(req->args_nvl, "portid", UINT16_MAX, &portid) || + portid > UINT16_MAX) { + req->status = CTL_LUN_ERROR; + snprintf(req->error_str, sizeof(req->error_str), + "Invalid port ID"); + return; + } + + /* Optional parameters. 
*/ + if (!dnvlist_get_strnum(req->args_nvl, "max_io_qsize", + NVMF_MAX_IO_ENTRIES, &max_io_qsize) || + max_io_qsize < NVME_MIN_IO_ENTRIES || + max_io_qsize > NVME_MAX_IO_ENTRIES) { + req->status = CTL_LUN_ERROR; + snprintf(req->error_str, sizeof(req->error_str), + "Invalid maximum I/O queue size"); + return; + } + + if (!dnvlist_get_strnum(req->args_nvl, "enable_timeout", + NVMF_CC_EN_TIMEOUT * 500, &enable_timeout) || + (enable_timeout % 500) != 0 || (enable_timeout / 500) > 255) { + req->status = CTL_LUN_ERROR; + snprintf(req->error_str, sizeof(req->error_str), + "Invalid enable timeout"); + return; + } + + if (!dnvlist_get_strnum(req->args_nvl, "ioccsz", NVMF_IOCCSZ, + &ioccsz) || ioccsz < sizeof(struct nvme_command) || + (ioccsz % 16) != 0) { + req->status = CTL_LUN_ERROR; + snprintf(req->error_str, sizeof(req->error_str), + "Invalid Command Capsule size"); + return; + } + + if (!dnvlist_get_strnum(req->args_nvl, "iorcsz", NVMF_IORCSZ, + &iorcsz) || iorcsz < sizeof(struct nvme_completion) || + (iorcsz % 16) != 0) { + req->status = CTL_LUN_ERROR; + snprintf(req->error_str, sizeof(req->error_str), + "Invalid Response Capsule size"); + return; + } + + if (!dnvlist_get_strnum(req->args_nvl, "nn", NVMF_NN, &nn) || + nn < 1 || nn > UINT32_MAX) { + req->status = CTL_LUN_ERROR; + snprintf(req->error_str, sizeof(req->error_str), + "Invalid number of namespaces"); + return; + } + + serial = dnvlist_get_string(req->args_nvl, "serial", NULL); + if (serial == NULL) { + getcredhostid(curthread->td_ucred, &hostid); + nvmf_controller_serial(serial_buf, sizeof(serial_buf), hostid); + serial = serial_buf; + } + + sx_xlock(&nvmft_ports_lock); + + np = nvmft_port_find(subnqn); + if (np != NULL) { + req->status = CTL_LUN_ERROR; + snprintf(req->error_str, sizeof(req->error_str), + "SubNQN \"%s\" already exists", subnqn); + sx_xunlock(&nvmft_ports_lock); + return; + } + + np = malloc(sizeof(*np), M_NVMFT, M_WAITOK | M_ZERO); + refcount_init(&np->refs, 1); + np->max_io_qsize = max_io_qsize; + np->cap = _nvmf_controller_cap(max_io_qsize, enable_timeout / 500); + sx_init(&np->lock, "nvmft port"); + np->ids = new_unrhdr(0, MIN(CTL_MAX_INIT_PER_PORT - 1, + NVMF_CNTLID_STATIC_MAX), UNR_NO_MTX); + TAILQ_INIT(&np->controllers); + + /* The controller ID is set later for individual controllers. 
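+	 * (hence the 0 passed as the first argument below); nvmft_controller_alloc()
+	 * later overwrites cdata.ctrlr_id with each controller's allocated CNTLID.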
*/ + _nvmf_init_io_controller_data(0, max_io_qsize, serial, ostype, + osrelease, subnqn, nn, ioccsz, iorcsz, &np->cdata); + np->cdata.aerl = NVMFT_NUM_AER - 1; + np->cdata.oaes = htole32(NVME_ASYNC_EVENT_NS_ATTRIBUTE); + np->cdata.oncs = htole16(NVMEF(NVME_CTRLR_DATA_ONCS_VERIFY, 1) | + NVMEF(NVME_CTRLR_DATA_ONCS_WRZERO, 1) | + NVMEF(NVME_CTRLR_DATA_ONCS_DSM, 1) | + NVMEF(NVME_CTRLR_DATA_ONCS_COMPARE, 1)); + np->cdata.fuses = NVMEF(NVME_CTRLR_DATA_FUSES_CNW, 1); + + np->fp.afi = NVMEF(NVME_FIRMWARE_PAGE_AFI_SLOT, 1); + memcpy(np->fp.revision[0], np->cdata.fr, sizeof(np->cdata.fr)); + + port = &np->port; + + port->frontend = &nvmft_frontend; + port->port_type = CTL_PORT_NVMF; + port->num_requested_ctl_io = max_io_qsize; + port->port_name = "nvmf"; + port->physical_port = portid; + port->virtual_port = 0; + port->port_online = nvmft_online; + port->port_offline = nvmft_offline; + port->onoff_arg = np; + port->lun_enable = nvmft_lun_enable; + port->lun_disable = nvmft_lun_disable; + port->targ_lun_arg = np; + port->fe_datamove = nvmft_datamove; + port->fe_done = nvmft_done; + port->targ_port = -1; + port->options = nvlist_clone(req->args_nvl); + + error = ctl_port_register(port); + if (error != 0) { + sx_xunlock(&nvmft_ports_lock); + nvlist_destroy(port->options); + nvmft_port_rele(np); + req->status = CTL_LUN_ERROR; + snprintf(req->error_str, sizeof(req->error_str), + "Failed to register CTL port with error %d", error); + return; + } + + TAILQ_INSERT_TAIL(&nvmft_ports, np, link); + sx_xunlock(&nvmft_ports_lock); + + req->status = CTL_LUN_OK; + req->result_nvl = nvlist_create(0); + nvlist_add_number(req->result_nvl, "port_id", port->targ_port); +} + +static void +nvmft_port_remove(struct ctl_req *req) +{ + struct nvmft_port *np; + const char *subnqn; + u_long port_id; + + /* + * ctladm port -r just provides the port_id, so permit looking + * up a port either by "subnqn" or "port_id". 
+ */ + port_id = ULONG_MAX; + subnqn = dnvlist_get_string(req->args_nvl, "subnqn", NULL); + if (subnqn == NULL) { + if (!nvlist_exists_string(req->args_nvl, "port_id")) { + req->status = CTL_LUN_ERROR; + snprintf(req->error_str, sizeof(req->error_str), + "Missing required argument"); + return; + } + if (!dnvlist_get_strnum(req->args_nvl, "port_id", ULONG_MAX, + &port_id)) { + req->status = CTL_LUN_ERROR; + snprintf(req->error_str, sizeof(req->error_str), + "Invalid CTL port ID"); + return; + } + } else { + if (nvlist_exists_string(req->args_nvl, "port_id")) { + req->status = CTL_LUN_ERROR; + snprintf(req->error_str, sizeof(req->error_str), + "Ambiguous port removal request"); + return; + } + } + + sx_xlock(&nvmft_ports_lock); + + if (subnqn != NULL) { + np = nvmft_port_find(subnqn); + if (np == NULL) { + req->status = CTL_LUN_ERROR; + snprintf(req->error_str, sizeof(req->error_str), + "SubNQN \"%s\" does not exist", subnqn); + sx_xunlock(&nvmft_ports_lock); + return; + } + } else { + np = nvmft_port_find_by_id(port_id); + if (np == NULL) { + req->status = CTL_LUN_ERROR; + snprintf(req->error_str, sizeof(req->error_str), + "CTL port %lu is not a NVMF port", port_id); + sx_xunlock(&nvmft_ports_lock); + return; + } + } + + TAILQ_REMOVE(&nvmft_ports, np, link); + sx_xunlock(&nvmft_ports_lock); + + ctl_port_offline(&np->port); + nvmft_port_rele(np); + req->status = CTL_LUN_OK; +} + +static void +nvmft_handoff(struct ctl_nvmf *cn) +{ + struct nvmf_fabric_connect_cmd cmd; + struct nvmf_handoff_controller_qpair *handoff; + struct nvmf_fabric_connect_data *data; + struct nvmft_port *np; + int error; + + np = NULL; + data = NULL; + handoff = &cn->data.handoff; + error = copyin(handoff->cmd, &cmd, sizeof(cmd)); + if (error != 0) { + cn->status = CTL_NVMF_ERROR; + snprintf(cn->error_str, sizeof(cn->error_str), + "Failed to copyin CONNECT SQE"); + return; + } + + data = malloc(sizeof(*data), M_NVMFT, M_WAITOK); + error = copyin(handoff->data, data, sizeof(*data)); + if (error != 0) { + cn->status = CTL_NVMF_ERROR; + snprintf(cn->error_str, sizeof(cn->error_str), + "Failed to copyin CONNECT data"); + goto out; + } + + if (!nvmf_nqn_valid(data->subnqn)) { + cn->status = CTL_NVMF_ERROR; + snprintf(cn->error_str, sizeof(cn->error_str), + "Invalid SubNQN"); + goto out; + } + + sx_slock(&nvmft_ports_lock); + np = nvmft_port_find(data->subnqn); + if (np == NULL) { + sx_sunlock(&nvmft_ports_lock); + cn->status = CTL_NVMF_ERROR; + snprintf(cn->error_str, sizeof(cn->error_str), + "Unknown SubNQN"); + goto out; + } + if (!np->online) { + sx_sunlock(&nvmft_ports_lock); + cn->status = CTL_NVMF_ERROR; + snprintf(cn->error_str, sizeof(cn->error_str), + "CTL port offline"); + np = NULL; + goto out; + } + nvmft_port_ref(np); + sx_sunlock(&nvmft_ports_lock); + + if (handoff->params.admin) { + error = nvmft_handoff_admin_queue(np, handoff, &cmd, data); + if (error != 0) { + cn->status = CTL_NVMF_ERROR; + snprintf(cn->error_str, sizeof(cn->error_str), + "Failed to handoff admin queue: %d", error); + goto out; + } + } else { + error = nvmft_handoff_io_queue(np, handoff, &cmd, data); + if (error != 0) { + cn->status = CTL_NVMF_ERROR; + snprintf(cn->error_str, sizeof(cn->error_str), + "Failed to handoff admin queue: %d", error); + goto out; + } + } + + cn->status = CTL_NVMF_OK; +out: + if (np != NULL) + nvmft_port_rele(np); + free(data, M_NVMFT); +} + +static void +nvmft_list(struct ctl_nvmf *cn) +{ + struct ctl_nvmf_list_params *lp; + struct nvmft_controller *ctrlr; + struct nvmft_port *np; + struct sbuf *sb; + int error; + 
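+	/*
+	 * Build an XML-formatted summary of each active controller (cntlid,
+	 * hostnqn, subnqn, trtype); this is the listing reported by the
+	 * nvlist command of ctladm(8).
+	 */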
+ lp = &cn->data.list; + + sb = sbuf_new(NULL, NULL, lp->alloc_len, SBUF_FIXEDLEN | + SBUF_INCLUDENUL); + if (sb == NULL) { + cn->status = CTL_NVMF_ERROR; + snprintf(cn->error_str, sizeof(cn->error_str), + "Failed to allocate NVMeoF session list"); + return; + } + + sbuf_printf(sb, "\n"); + sx_slock(&nvmft_ports_lock); + TAILQ_FOREACH(np, &nvmft_ports, link) { + sx_slock(&np->lock); + TAILQ_FOREACH(ctrlr, &np->controllers, link) { + sbuf_printf(sb, "" + "%s" + "%s" + "%u" + "\n", + ctrlr->cntlid, + ctrlr->hostnqn, + np->cdata.subnqn, + ctrlr->trtype); + } + sx_sunlock(&np->lock); + } + sx_sunlock(&nvmft_ports_lock); + sbuf_printf(sb, "\n"); + if (sbuf_finish(sb) != 0) { + sbuf_delete(sb); + cn->status = CTL_NVMF_LIST_NEED_MORE_SPACE; + snprintf(cn->error_str, sizeof(cn->error_str), + "Out of space, %d bytes is too small", lp->alloc_len); + return; + } + + error = copyout(sbuf_data(sb), lp->conn_xml, sbuf_len(sb)); + if (error != 0) { + sbuf_delete(sb); + cn->status = CTL_NVMF_ERROR; + snprintf(cn->error_str, sizeof(cn->error_str), + "Failed to copyout session list: %d", error); + return; + } + lp->fill_len = sbuf_len(sb); + cn->status = CTL_NVMF_OK; + sbuf_delete(sb); +} + +static void +nvmft_terminate(struct ctl_nvmf *cn) +{ + struct ctl_nvmf_terminate_params *tp; + struct nvmft_controller *ctrlr; + struct nvmft_port *np; + bool found, match; + + tp = &cn->data.terminate; + + found = false; + sx_slock(&nvmft_ports_lock); + TAILQ_FOREACH(np, &nvmft_ports, link) { + sx_slock(&np->lock); + TAILQ_FOREACH(ctrlr, &np->controllers, link) { + if (tp->all != 0) + match = true; + else if (tp->cntlid != -1) + match = tp->cntlid == ctrlr->cntlid; + else if (tp->hostnqn[0] != '\0') + match = strncmp(tp->hostnqn, ctrlr->hostnqn, + sizeof(tp->hostnqn)) == 0; + else + match = false; + if (!match) + continue; + nvmft_printf(ctrlr, + "disconnecting due to administrative request\n"); + nvmft_controller_error(ctrlr, NULL, ECONNABORTED); + found = true; + } + sx_sunlock(&np->lock); + } + sx_sunlock(&nvmft_ports_lock); + + if (!found) { + cn->status = CTL_NVMF_ASSOCIATION_NOT_FOUND; + snprintf(cn->error_str, sizeof(cn->error_str), + "No matching associations found"); + return; + } + cn->status = CTL_NVMF_OK; +} + +static int +nvmft_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int flag, + struct thread *td) +{ + struct ctl_nvmf *cn; + struct ctl_req *req; + + switch (cmd) { + case CTL_PORT_REQ: + req = (struct ctl_req *)data; + switch (req->reqtype) { + case CTL_REQ_CREATE: + nvmft_port_create(req); + break; + case CTL_REQ_REMOVE: + nvmft_port_remove(req); + break; + default: + req->status = CTL_LUN_ERROR; + snprintf(req->error_str, sizeof(req->error_str), + "Unsupported request type %d", req->reqtype); + break; + } + return (0); + case CTL_NVMF: + cn = (struct ctl_nvmf *)data; + switch (cn->type) { + case CTL_NVMF_HANDOFF: + nvmft_handoff(cn); + break; + case CTL_NVMF_LIST: + nvmft_list(cn); + break; + case CTL_NVMF_TERMINATE: + nvmft_terminate(cn); + break; + default: + cn->status = CTL_NVMF_ERROR; + snprintf(cn->error_str, sizeof(cn->error_str), + "Invalid NVMeoF request type %d", cn->type); + break; + } + return (0); + default: + return (ENOTTY); + } +} + +static int +nvmft_shutdown(void) +{ + /* TODO: Need to check for active controllers. 
*/ + if (!TAILQ_EMPTY(&nvmft_ports)) + return (EBUSY); + + sx_destroy(&nvmft_ports_lock); + return (0); +} + +CTL_FRONTEND_DECLARE(nvmft, nvmft_frontend); +MODULE_DEPEND(nvmft, nvmf_transport, 1, 1, 1); diff --git a/sys/dev/nvmf/controller/nvmft_controller.c b/sys/dev/nvmf/controller/nvmft_controller.c new file mode 100644 --- /dev/null +++ b/sys/dev/nvmf/controller/nvmft_controller.c @@ -0,0 +1,1130 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2023-2024 Chelsio Communications, Inc. + * Written by: John Baldwin + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +static void nvmft_controller_shutdown(void *arg, int pending); +static void nvmft_controller_terminate(void *arg, int pending); + +int +nvmft_printf(struct nvmft_controller *ctrlr, const char *fmt, ...) +{ + char buf[128]; + struct sbuf sb; + va_list ap; + size_t retval; + + sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN); + sbuf_set_drain(&sb, sbuf_printf_drain, &retval); + + sbuf_printf(&sb, "nvmft%u: ", ctrlr->cntlid); + + va_start(ap, fmt); + sbuf_vprintf(&sb, fmt, ap); + va_end(ap); + + sbuf_finish(&sb); + sbuf_delete(&sb); + + return (retval); +} + +static struct nvmft_controller * +nvmft_controller_alloc(struct nvmft_port *np, uint16_t cntlid, + const struct nvmf_fabric_connect_data *data) +{ + struct nvmft_controller *ctrlr; + + ctrlr = malloc(sizeof(*ctrlr), M_NVMFT, M_WAITOK | M_ZERO); + ctrlr->cntlid = cntlid; + nvmft_port_ref(np); + TAILQ_INSERT_TAIL(&np->controllers, ctrlr, link); + ctrlr->np = np; + mtx_init(&ctrlr->lock, "nvmft controller", NULL, MTX_DEF); + callout_init(&ctrlr->ka_timer, 1); + TASK_INIT(&ctrlr->shutdown_task, 0, nvmft_controller_shutdown, ctrlr); + TIMEOUT_TASK_INIT(taskqueue_thread, &ctrlr->terminate_task, 0, + nvmft_controller_terminate, ctrlr); + + ctrlr->cdata = np->cdata; + ctrlr->cdata.ctrlr_id = htole16(cntlid); + memcpy(ctrlr->hostid, data->hostid, sizeof(ctrlr->hostid)); + memcpy(ctrlr->hostnqn, data->hostnqn, sizeof(ctrlr->hostnqn)); + ctrlr->hip.power_cycles[0] = 1; + ctrlr->create_time = sbinuptime(); + + ctrlr->changed_ns = malloc(sizeof(*ctrlr->changed_ns), M_NVMFT, + M_WAITOK | M_ZERO); + + return (ctrlr); +} + +static void +nvmft_controller_free(struct nvmft_controller *ctrlr) +{ + mtx_destroy(&ctrlr->lock); + MPASS(ctrlr->io_qpairs == NULL); + free(ctrlr->changed_ns, M_NVMFT); + free(ctrlr, M_NVMFT); +} + +static void +nvmft_keep_alive_timer(void *arg) +{ + struct nvmft_controller *ctrlr = arg; + int traffic; + + if (ctrlr->shutdown) + return; + + traffic = atomic_readandclear_int(&ctrlr->ka_active_traffic); + if (traffic == 0) { + nvmft_printf(ctrlr, + "disconnecting due to KeepAlive timeout\n"); + nvmft_controller_error(ctrlr, NULL, ETIMEDOUT); + return; + } + + callout_schedule_sbt(&ctrlr->ka_timer, ctrlr->ka_sbt, 0, C_HARDCLOCK); +} + +int +nvmft_handoff_admin_queue(struct nvmft_port *np, + const struct nvmf_handoff_controller_qpair *handoff, + const struct nvmf_fabric_connect_cmd *cmd, + const struct nvmf_fabric_connect_data *data) +{ + struct nvmft_controller *ctrlr; + struct nvmft_qpair *qp; + uint32_t kato; + int cntlid; + + if (cmd->qid != htole16(0)) + return (EINVAL); + + qp = nvmft_qpair_init(handoff->trtype, &handoff->params, 0, + "admin queue"); + + sx_xlock(&np->lock); + cntlid = alloc_unr(np->ids); + if (cntlid == -1) { + sx_xunlock(&np->lock); + printf("NVMFT: Unable to allocate controller for %.*s\n", + (int)sizeof(data->hostnqn), 
data->hostnqn); + nvmft_connect_error(qp, cmd, NVME_SCT_COMMAND_SPECIFIC, + NVMF_FABRIC_SC_INVALID_HOST); + nvmft_qpair_destroy(qp); + return (ENOMEM); + } + +#ifdef INVARIANTS + TAILQ_FOREACH(ctrlr, &np->controllers, link) { + KASSERT(ctrlr->cntlid != cntlid, + ("%s: duplicate controllers with id %d", __func__, cntlid)); + } +#endif + + ctrlr = nvmft_controller_alloc(np, cntlid, data); + nvmft_printf(ctrlr, "associated with %.*s\n", + (int)sizeof(data->hostnqn), data->hostnqn); + ctrlr->admin = qp; + ctrlr->trtype = handoff->trtype; + + /* + * The spec requires a non-zero KeepAlive timer, but allow a + * zero KATO value to match Linux. + */ + kato = le32toh(cmd->kato); + if (kato != 0) { + /* + * Round up to 1 second matching granularity + * advertised in cdata. + */ + ctrlr->ka_sbt = mstosbt(roundup(kato, 1000)); + callout_reset_sbt(&ctrlr->ka_timer, ctrlr->ka_sbt, 0, + nvmft_keep_alive_timer, ctrlr, C_HARDCLOCK); + } + + nvmft_finish_accept(qp, cmd, ctrlr); + sx_xunlock(&np->lock); + + return (0); +} + +int +nvmft_handoff_io_queue(struct nvmft_port *np, + const struct nvmf_handoff_controller_qpair *handoff, + const struct nvmf_fabric_connect_cmd *cmd, + const struct nvmf_fabric_connect_data *data) +{ + struct nvmft_controller *ctrlr; + struct nvmft_qpair *qp; + char name[16]; + uint16_t cntlid, qid; + + qid = le16toh(cmd->qid); + if (qid == 0) + return (EINVAL); + cntlid = le16toh(data->cntlid); + + snprintf(name, sizeof(name), "I/O queue %u", qid); + qp = nvmft_qpair_init(handoff->trtype, &handoff->params, qid, name); + + sx_slock(&np->lock); + TAILQ_FOREACH(ctrlr, &np->controllers, link) { + if (ctrlr->cntlid == cntlid) + break; + } + if (ctrlr == NULL) { + sx_sunlock(&np->lock); + printf("NVMFT: Nonexistent controller %u for I/O queue %u from %.*s\n", + ctrlr->cntlid, qid, (int)sizeof(data->hostnqn), + data->hostnqn); + nvmft_connect_invalid_parameters(qp, cmd, true, + offsetof(struct nvmf_fabric_connect_data, cntlid)); + nvmft_qpair_destroy(qp); + return (ENOENT); + } + + if (memcmp(ctrlr->hostid, data->hostid, sizeof(ctrlr->hostid)) != 0) { + sx_sunlock(&np->lock); + nvmft_printf(ctrlr, + "hostid mismatch for I/O queue %u from %.*s\n", qid, + (int)sizeof(data->hostnqn), data->hostnqn); + nvmft_connect_invalid_parameters(qp, cmd, true, + offsetof(struct nvmf_fabric_connect_data, hostid)); + nvmft_qpair_destroy(qp); + return (EINVAL); + } + if (memcmp(ctrlr->hostnqn, data->hostnqn, sizeof(ctrlr->hostnqn)) != 0) { + sx_sunlock(&np->lock); + nvmft_printf(ctrlr, + "hostnqn mismatch for I/O queue %u from %.*s\n", qid, + (int)sizeof(data->hostnqn), data->hostnqn); + nvmft_connect_invalid_parameters(qp, cmd, true, + offsetof(struct nvmf_fabric_connect_data, hostnqn)); + nvmft_qpair_destroy(qp); + return (EINVAL); + } + + /* XXX: Require handoff->trtype == ctrlr->trtype? 
*/ + + mtx_lock(&ctrlr->lock); + if (ctrlr->shutdown) { + mtx_unlock(&ctrlr->lock); + sx_sunlock(&np->lock); + nvmft_printf(ctrlr, + "attempt to create I/O queue %u on disabled controller from %.*s\n", + qid, (int)sizeof(data->hostnqn), data->hostnqn); + nvmft_connect_invalid_parameters(qp, cmd, true, + offsetof(struct nvmf_fabric_connect_data, cntlid)); + nvmft_qpair_destroy(qp); + return (EINVAL); + } + if (ctrlr->num_io_queues == 0) { + mtx_unlock(&ctrlr->lock); + sx_sunlock(&np->lock); + nvmft_printf(ctrlr, + "attempt to create I/O queue %u without enabled queues from %.*s\n", + qid, (int)sizeof(data->hostnqn), data->hostnqn); + nvmft_connect_error(qp, cmd, NVME_SCT_GENERIC, + NVME_SC_COMMAND_SEQUENCE_ERROR); + nvmft_qpair_destroy(qp); + return (EINVAL); + } + if (cmd->qid > ctrlr->num_io_queues) { + mtx_unlock(&ctrlr->lock); + sx_sunlock(&np->lock); + nvmft_printf(ctrlr, + "attempt to create invalid I/O queue %u from %.*s\n", qid, + (int)sizeof(data->hostnqn), data->hostnqn); + nvmft_connect_invalid_parameters(qp, cmd, false, + offsetof(struct nvmf_fabric_connect_cmd, qid)); + nvmft_qpair_destroy(qp); + return (EINVAL); + } + if (ctrlr->io_qpairs[qid - 1].qp != NULL) { + mtx_unlock(&ctrlr->lock); + sx_sunlock(&np->lock); + nvmft_printf(ctrlr, + "attempt to re-create I/O queue %u from %.*s\n", qid, + (int)sizeof(data->hostnqn), data->hostnqn); + nvmft_connect_error(qp, cmd, NVME_SCT_GENERIC, + NVME_SC_COMMAND_SEQUENCE_ERROR); + nvmft_qpair_destroy(qp); + return (EINVAL); + } + + ctrlr->io_qpairs[qid - 1].qp = qp; + mtx_unlock(&ctrlr->lock); + nvmft_finish_accept(qp, cmd, ctrlr); + sx_sunlock(&np->lock); + + return (0); +} + +static void +nvmft_controller_shutdown(void *arg, int pending) +{ + struct nvmft_controller *ctrlr = arg; + + MPASS(pending == 1); + + /* + * Shutdown all I/O queues to terminate pending datamoves and + * stop receiving new commands. + */ + mtx_lock(&ctrlr->lock); + for (u_int i = 0; i < ctrlr->num_io_queues; i++) { + if (ctrlr->io_qpairs[i].qp != NULL) { + ctrlr->io_qpairs[i].shutdown = true; + mtx_unlock(&ctrlr->lock); + nvmft_qpair_shutdown(ctrlr->io_qpairs[i].qp); + mtx_lock(&ctrlr->lock); + } + } + mtx_unlock(&ctrlr->lock); + + /* Terminate active CTL commands. */ + nvmft_terminate_commands(ctrlr); + + /* Wait for all pending CTL commands to complete. */ + mtx_lock(&ctrlr->lock); + while (ctrlr->pending_commands != 0) + mtx_sleep(&ctrlr->pending_commands, &ctrlr->lock, 0, "nvmftsh", + hz / 100); + mtx_unlock(&ctrlr->lock); + + /* Delete all of the I/O queues. */ + for (u_int i = 0; i < ctrlr->num_io_queues; i++) { + if (ctrlr->io_qpairs[i].qp != NULL) + nvmft_qpair_destroy(ctrlr->io_qpairs[i].qp); + } + free(ctrlr->io_qpairs, M_NVMFT); + ctrlr->io_qpairs = NULL; + + mtx_lock(&ctrlr->lock); + ctrlr->num_io_queues = 0; + + /* Mark shutdown complete. */ + if (NVMEV(NVME_CSTS_REG_SHST, ctrlr->csts) == NVME_SHST_OCCURRING) { + ctrlr->csts &= ~NVMEM(NVME_CSTS_REG_SHST); + ctrlr->csts |= NVMEF(NVME_CSTS_REG_SHST, NVME_SHST_COMPLETE); + } + + if (NVMEV(NVME_CSTS_REG_CFS, ctrlr->csts) == 0) { + ctrlr->csts &= ~NVMEM(NVME_CSTS_REG_RDY); + ctrlr->shutdown = false; + } + mtx_unlock(&ctrlr->lock); + + /* + * If the admin queue was closed while shutting down or a + * fatal controller error has occurred, terminate the + * association immediately, otherwise wait up to 2 minutes + * (NVMe-over-Fabrics 1.1 4.6). 
+ */ + if (ctrlr->admin_closed || NVMEV(NVME_CSTS_REG_CFS, ctrlr->csts) != 0) + nvmft_controller_terminate(ctrlr, 0); + else + taskqueue_enqueue_timeout(taskqueue_thread, + &ctrlr->terminate_task, hz * 60 * 2); +} + +static void +nvmft_controller_terminate(void *arg, int pending) +{ + struct nvmft_controller *ctrlr = arg; + struct nvmft_port *np; + bool wakeup_np; + + /* If the controller has been re-enabled, nothing to do. */ + mtx_lock(&ctrlr->lock); + if (NVMEV(NVME_CC_REG_EN, ctrlr->cc) != 0) { + mtx_unlock(&ctrlr->lock); + + if (ctrlr->ka_sbt != 0) + callout_schedule_sbt(&ctrlr->ka_timer, ctrlr->ka_sbt, 0, + C_HARDCLOCK); + return; + } + + /* Disable updates to CC while destroying admin qpair. */ + ctrlr->shutdown = true; + mtx_unlock(&ctrlr->lock); + + nvmft_qpair_destroy(ctrlr->admin); + + /* Remove association (CNTLID). */ + np = ctrlr->np; + sx_xlock(&np->lock); + TAILQ_REMOVE(&np->controllers, ctrlr, link); + free_unr(np->ids, ctrlr->cntlid); + wakeup_np = (!np->online && TAILQ_EMPTY(&np->controllers)); + sx_xunlock(&np->lock); + if (wakeup_np) + wakeup(np); + + callout_drain(&ctrlr->ka_timer); + + nvmft_printf(ctrlr, "association terminated\n"); + nvmft_controller_free(ctrlr); + nvmft_port_rele(np); +} + +void +nvmft_controller_error(struct nvmft_controller *ctrlr, struct nvmft_qpair *qp, + int error) +{ + /* + * If a queue pair is closed, that isn't an error per se. + * That just means additional commands cannot be received on + * that queue pair. + * + * If the admin queue pair is closed while idle or while + * shutting down, terminate the association immediately. + * + * If an I/O queue pair is closed, just ignore it. + */ + if (error == 0) { + if (qp != ctrlr->admin) + return; + + mtx_lock(&ctrlr->lock); + if (ctrlr->shutdown) { + ctrlr->admin_closed = true; + mtx_unlock(&ctrlr->lock); + return; + } + + if (NVMEV(NVME_CC_REG_EN, ctrlr->cc) == 0) { + MPASS(ctrlr->num_io_queues == 0); + mtx_unlock(&ctrlr->lock); + + /* + * Ok to drop lock here since ctrlr->cc can't + * change if the admin queue pair has closed. + * This also means no new queues can be handed + * off, etc. Note that since there are no I/O + * queues, only the admin queue needs to be + * destroyed, so it is safe to skip + * nvmft_controller_shutdown and just schedule + * nvmft_controller_terminate. Note that we + * cannot call nvmft_controller_terminate from + * here directly as this is called from the + * transport layer and freeing the admin qpair + * might deadlock waiting for the current + * thread to exit. + */ + if (taskqueue_cancel_timeout(taskqueue_thread, + &ctrlr->terminate_task, NULL) == 0) + taskqueue_enqueue_timeout(taskqueue_thread, + &ctrlr->terminate_task, 0); + return; + } + + /* + * Treat closing of the admin queue pair while enabled + * as a transport error. Note that the admin queue + * pair has been closed. + */ + ctrlr->admin_closed = true; + } else + mtx_lock(&ctrlr->lock); + + /* Ignore transport errors if we are already shutting down. */ + if (ctrlr->shutdown) { + mtx_unlock(&ctrlr->lock); + return; + } + + ctrlr->csts |= NVMEF(NVME_CSTS_REG_CFS, 1); + ctrlr->cc &= ~NVMEM(NVME_CC_REG_EN); + ctrlr->shutdown = true; + mtx_unlock(&ctrlr->lock); + + callout_stop(&ctrlr->ka_timer); + taskqueue_enqueue(taskqueue_thread, &ctrlr->shutdown_task); +} + +/* Wrapper around m_getm2 that also sets m_len in the mbufs in the chain. 
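+ * Each mbuf is filled to its full capacity (M_SIZE()) except the last,
+ * whose m_len is trimmed so that the chain totals exactly 'len' bytes.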
*/ +static struct mbuf * +m_getml(size_t len, int how) +{ + struct mbuf *m, *n; + + m = m_getm2(NULL, len, how, MT_DATA, 0); + if (m == NULL) + return (NULL); + for (n = m; len > 0; n = n->m_next) { + n->m_len = M_SIZE(n); + if (n->m_len >= len) { + n->m_len = len; + MPASS(n->m_next == NULL); + } + len -= n->m_len; + } + return (m); +} + +static void +m_zero(struct mbuf *m, u_int offset, u_int len) +{ + u_int todo; + + if (len == 0) + return; + + while (m->m_len <= offset) { + offset -= m->m_len; + m = m->m_next; + } + + todo = m->m_len - offset; + if (todo > len) + todo = len; + memset(mtodo(m, offset), 0, todo); + m = m->m_next; + len -= todo; + + while (len > 0) { + todo = m->m_len; + if (todo > len) + todo = len; + memset(mtod(m, void *), 0, todo); + m = m->m_next; + len -= todo; + } +} + +static void +handle_get_log_page(struct nvmft_controller *ctrlr, + struct nvmf_capsule *nc, const struct nvme_command *cmd) +{ + struct mbuf *m; + uint64_t offset; + uint32_t numd; + size_t len, todo; + u_int status; + uint8_t lid; + bool rae; + + lid = le32toh(cmd->cdw10) & 0xff; + rae = (le32toh(cmd->cdw10) & (1U << 15)) != 0; + numd = le32toh(cmd->cdw10) >> 16 | le32toh(cmd->cdw11) << 16; + offset = le32toh(cmd->cdw12) | (uint64_t)le32toh(cmd->cdw13) << 32; + + if (offset % 3 != 0) { + status = NVME_SC_INVALID_FIELD; + goto done; + } + + len = (numd + 1) * 4; + + switch (lid) { + case NVME_LOG_ERROR: + todo = 0; + + m = m_getml(len, M_WAITOK); + if (todo != len) + m_zero(m, todo, len - todo); + status = nvmf_send_controller_data(nc, 0, m, len); + MPASS(status != NVMF_MORE); + break; + case NVME_LOG_HEALTH_INFORMATION: + { + struct nvme_health_information_page hip; + + if (offset >= sizeof(hip)) { + status = NVME_SC_INVALID_FIELD; + goto done; + } + todo = sizeof(hip) - offset; + if (todo > len) + todo = len; + + mtx_lock(&ctrlr->lock); + hip = ctrlr->hip; + hip.controller_busy_time[0] = + sbintime_getsec(ctrlr->busy_total) / 60; + hip.power_on_hours[0] = + sbintime_getsec(sbinuptime() - ctrlr->create_time) / 3600; + mtx_unlock(&ctrlr->lock); + + m = m_getml(len, M_WAITOK); + m_copyback(m, 0, todo, (char *)&hip + offset); + if (todo != len) + m_zero(m, todo, len - todo); + status = nvmf_send_controller_data(nc, 0, m, len); + MPASS(status != NVMF_MORE); + break; + } + case NVME_LOG_FIRMWARE_SLOT: + if (offset >= sizeof(ctrlr->np->fp)) { + status = NVME_SC_INVALID_FIELD; + goto done; + } + todo = sizeof(ctrlr->np->fp) - offset; + if (todo > len) + todo = len; + + m = m_getml(len, M_WAITOK); + m_copyback(m, 0, todo, (char *)&ctrlr->np->fp + offset); + if (todo != len) + m_zero(m, todo, len - todo); + status = nvmf_send_controller_data(nc, 0, m, len); + MPASS(status != NVMF_MORE); + break; + case NVME_LOG_CHANGED_NAMESPACE: + if (offset >= sizeof(*ctrlr->changed_ns)) { + status = NVME_SC_INVALID_FIELD; + goto done; + } + todo = sizeof(*ctrlr->changed_ns) - offset; + if (todo > len) + todo = len; + + m = m_getml(len, M_WAITOK); + mtx_lock(&ctrlr->lock); + m_copyback(m, 0, todo, (char *)ctrlr->changed_ns + offset); + if (offset == 0 && len == sizeof(*ctrlr->changed_ns)) + memset(ctrlr->changed_ns, 0, + sizeof(*ctrlr->changed_ns)); + if (!rae) + ctrlr->changed_ns_reported = false; + mtx_unlock(&ctrlr->lock); + if (todo != len) + m_zero(m, todo, len - todo); + status = nvmf_send_controller_data(nc, 0, m, len); + MPASS(status != NVMF_MORE); + break; + default: + nvmft_printf(ctrlr, "Unsupported page %#x for GET_LOG_PAGE\n", + lid); + status = NVME_SC_INVALID_FIELD; + break; + } + +done: + if (status == 
NVMF_SUCCESS_SENT) + nvmft_command_completed(ctrlr->admin, nc); + else + nvmft_send_generic_error(ctrlr->admin, nc, status); + nvmf_free_capsule(nc); +} + +static void +m_free_nslist(struct mbuf *m) +{ + free(m->m_ext.ext_arg1, M_NVMFT); +} + +static void +handle_identify_command(struct nvmft_controller *ctrlr, + struct nvmf_capsule *nc, const struct nvme_command *cmd) +{ + struct mbuf *m; + size_t data_len; + u_int status; + uint8_t cns; + + cns = le32toh(cmd->cdw10) & 0xFF; + data_len = nvmf_capsule_data_len(nc); + if (data_len != sizeof(ctrlr->cdata)) { + nvmft_printf(ctrlr, + "Invalid length %zu for IDENTIFY with CNS %#x\n", data_len, + cns); + nvmft_send_generic_error(ctrlr->admin, nc, + NVME_SC_INVALID_OPCODE); + nvmf_free_capsule(nc); + return; + } + + switch (cns) { + case 0: /* Namespace data. */ + case 3: /* Namespace Identification Descriptor list. */ + nvmft_dispatch_command(ctrlr->admin, nc, true); + return; + case 1: + /* Controller data. */ + m = m_getml(sizeof(ctrlr->cdata), M_WAITOK); + m_copyback(m, 0, sizeof(ctrlr->cdata), (void *)&ctrlr->cdata); + status = nvmf_send_controller_data(nc, 0, m, + sizeof(ctrlr->cdata)); + MPASS(status != NVMF_MORE); + break; + case 2: + { + /* Active namespace list. */ + struct nvme_ns_list *nslist; + uint32_t nsid; + + nsid = le32toh(cmd->nsid); + if (nsid >= 0xfffffffe) { + status = NVME_SC_INVALID_FIELD; + break; + } + + nslist = malloc(sizeof(*nslist), M_NVMFT, M_WAITOK | M_ZERO); + nvmft_populate_active_nslist(ctrlr->np, nsid, nslist); + m = m_get(M_WAITOK, MT_DATA); + m_extadd(m, (void *)nslist, sizeof(*nslist), m_free_nslist, + nslist, NULL, 0, EXT_CTL); + m->m_len = sizeof(*nslist); + status = nvmf_send_controller_data(nc, 0, m, m->m_len); + MPASS(status != NVMF_MORE); + break; + } + default: + nvmft_printf(ctrlr, "Unsupported CNS %#x for IDENTIFY\n", cns); + status = NVME_SC_INVALID_FIELD; + break; + } + + if (status == NVMF_SUCCESS_SENT) + nvmft_command_completed(ctrlr->admin, nc); + else + nvmft_send_generic_error(ctrlr->admin, nc, status); + nvmf_free_capsule(nc); +} + +static void +handle_set_features(struct nvmft_controller *ctrlr, + struct nvmf_capsule *nc, const struct nvme_command *cmd) +{ + struct nvme_completion cqe; + uint8_t fid; + + fid = NVMEV(NVME_FEAT_SET_FID, le32toh(cmd->cdw10)); + switch (fid) { + case NVME_FEAT_NUMBER_OF_QUEUES: + { + uint32_t num_queues; + struct nvmft_io_qpair *io_qpairs; + + num_queues = le32toh(cmd->cdw11) & 0xffff; + + /* 5.12.1.7: 65535 is invalid. */ + if (num_queues == 65535) + goto error; + + /* Fabrics requires the same number of SQs and CQs. */ + if (le32toh(cmd->cdw11) >> 16 != num_queues) + goto error; + + /* Convert to 1's based */ + num_queues++; + + io_qpairs = mallocarray(num_queues, sizeof(*io_qpairs), + M_NVMFT, M_WAITOK | M_ZERO); + + mtx_lock(&ctrlr->lock); + if (ctrlr->num_io_queues != 0) { + mtx_unlock(&ctrlr->lock); + free(io_qpairs, M_NVMFT); + nvmft_send_generic_error(ctrlr->admin, nc, + NVME_SC_COMMAND_SEQUENCE_ERROR); + nvmf_free_capsule(nc); + return; + } + + ctrlr->num_io_queues = num_queues; + ctrlr->io_qpairs = io_qpairs; + mtx_unlock(&ctrlr->lock); + + nvmft_init_cqe(&cqe, nc, 0); + cqe.cdw0 = cmd->cdw11; + nvmft_send_response(ctrlr->admin, &cqe); + nvmf_free_capsule(nc); + return; + } + case NVME_FEAT_ASYNC_EVENT_CONFIGURATION: + { + uint32_t aer_mask; + + aer_mask = le32toh(cmd->cdw11); + + /* Check for any reserved or unimplemented feature bits. 
*/ + if ((aer_mask & 0xffffc000) != 0) + goto error; + + mtx_lock(&ctrlr->lock); + ctrlr->aer_mask = aer_mask; + mtx_unlock(&ctrlr->lock); + nvmft_send_success(ctrlr->admin, nc); + return; + } + default: + nvmft_printf(ctrlr, + "Unsupported feature ID %u for SET_FEATURES\n", fid); + goto error; + } + +error: + nvmft_send_generic_error(ctrlr->admin, nc, NVME_SC_INVALID_FIELD); + nvmf_free_capsule(nc); +} + +static bool +update_cc(struct nvmft_controller *ctrlr, uint32_t new_cc, bool *need_shutdown) +{ + struct nvmft_port *np = ctrlr->np; + uint32_t changes; + + *need_shutdown = false; + + mtx_lock(&ctrlr->lock); + + /* Don't allow any changes while shutting down. */ + if (ctrlr->shutdown) { + mtx_unlock(&ctrlr->lock); + return (false); + } + + if (!_nvmf_validate_cc(np->max_io_qsize, np->cap, ctrlr->cc, new_cc)) { + mtx_unlock(&ctrlr->lock); + return (false); + } + + changes = ctrlr->cc ^ new_cc; + ctrlr->cc = new_cc; + + /* Handle shutdown requests. */ + if (NVMEV(NVME_CC_REG_SHN, changes) != 0 && + NVMEV(NVME_CC_REG_SHN, new_cc) != 0) { + ctrlr->csts &= ~NVMEM(NVME_CSTS_REG_SHST); + ctrlr->csts |= NVMEF(NVME_CSTS_REG_SHST, NVME_SHST_OCCURRING); + ctrlr->cc &= ~NVMEM(NVME_CC_REG_EN); + ctrlr->shutdown = true; + *need_shutdown = true; + nvmft_printf(ctrlr, "shutdown requested\n"); + } + + if (NVMEV(NVME_CC_REG_EN, changes) != 0) { + if (NVMEV(NVME_CC_REG_EN, new_cc) == 0) { + /* Controller reset. */ + nvmft_printf(ctrlr, "reset requested\n"); + ctrlr->shutdown = true; + *need_shutdown = true; + } else + ctrlr->csts |= NVMEF(NVME_CSTS_REG_RDY, 1); + } + mtx_unlock(&ctrlr->lock); + + return (true); +} + +static void +handle_property_get(struct nvmft_controller *ctrlr, struct nvmf_capsule *nc, + const struct nvmf_fabric_prop_get_cmd *pget) +{ + struct nvmf_fabric_prop_get_rsp rsp; + + nvmft_init_cqe(&rsp, nc, 0); + + switch (le32toh(pget->ofst)) { + case NVMF_PROP_CAP: + if (pget->attrib.size != NVMF_PROP_SIZE_8) + goto error; + rsp.value.u64 = htole64(ctrlr->np->cap); + break; + case NVMF_PROP_VS: + if (pget->attrib.size != NVMF_PROP_SIZE_4) + goto error; + rsp.value.u32.low = ctrlr->cdata.ver; + break; + case NVMF_PROP_CC: + if (pget->attrib.size != NVMF_PROP_SIZE_4) + goto error; + rsp.value.u32.low = htole32(ctrlr->cc); + break; + case NVMF_PROP_CSTS: + if (pget->attrib.size != NVMF_PROP_SIZE_4) + goto error; + rsp.value.u32.low = htole32(ctrlr->csts); + break; + default: + goto error; + } + + nvmft_send_response(ctrlr->admin, &rsp); + return; +error: + nvmft_send_generic_error(ctrlr->admin, nc, NVME_SC_INVALID_FIELD); +} + +static void +handle_property_set(struct nvmft_controller *ctrlr, struct nvmf_capsule *nc, + const struct nvmf_fabric_prop_set_cmd *pset) +{ + bool need_shutdown; + + need_shutdown = false; + switch (le32toh(pset->ofst)) { + case NVMF_PROP_CC: + if (pset->attrib.size != NVMF_PROP_SIZE_4) + goto error; + if (!update_cc(ctrlr, le32toh(pset->value.u32.low), + &need_shutdown)) + goto error; + break; + default: + goto error; + } + + nvmft_send_success(ctrlr->admin, nc); + if (need_shutdown) { + callout_stop(&ctrlr->ka_timer); + taskqueue_enqueue(taskqueue_thread, &ctrlr->shutdown_task); + } + return; +error: + nvmft_send_generic_error(ctrlr->admin, nc, NVME_SC_INVALID_FIELD); +} + +static void +handle_admin_fabrics_command(struct nvmft_controller *ctrlr, + struct nvmf_capsule *nc, const struct nvmf_fabric_cmd *fc) +{ + switch (fc->fctype) { + case NVMF_FABRIC_COMMAND_PROPERTY_GET: + handle_property_get(ctrlr, nc, + (const struct nvmf_fabric_prop_get_cmd *)fc); + break; + 
case NVMF_FABRIC_COMMAND_PROPERTY_SET: + handle_property_set(ctrlr, nc, + (const struct nvmf_fabric_prop_set_cmd *)fc); + break; + case NVMF_FABRIC_COMMAND_CONNECT: + nvmft_printf(ctrlr, + "CONNECT command on connected admin queue\n"); + nvmft_send_generic_error(ctrlr->admin, nc, + NVME_SC_COMMAND_SEQUENCE_ERROR); + break; + case NVMF_FABRIC_COMMAND_DISCONNECT: + nvmft_printf(ctrlr, "DISCONNECT command on admin queue\n"); + nvmft_send_error(ctrlr->admin, nc, NVME_SCT_COMMAND_SPECIFIC, + NVMF_FABRIC_SC_INVALID_QUEUE_TYPE); + break; + default: + nvmft_printf(ctrlr, "Unsupported fabrics command %#x\n", + fc->fctype); + nvmft_send_generic_error(ctrlr->admin, nc, + NVME_SC_INVALID_OPCODE); + break; + } + nvmf_free_capsule(nc); +} + +void +nvmft_handle_admin_command(struct nvmft_controller *ctrlr, + struct nvmf_capsule *nc) +{ + const struct nvme_command *cmd = nvmf_capsule_sqe(nc); + + /* Only permit Fabrics commands while a controller is disabled. */ + if (NVMEV(NVME_CC_REG_EN, ctrlr->cc) == 0 && + cmd->opc != NVME_OPC_FABRICS_COMMANDS) { + nvmft_printf(ctrlr, + "Unsupported admin opcode %#x whiled disabled\n", cmd->opc); + nvmft_send_generic_error(ctrlr->admin, nc, + NVME_SC_COMMAND_SEQUENCE_ERROR); + nvmf_free_capsule(nc); + return; + } + + atomic_store_int(&ctrlr->ka_active_traffic, 1); + + switch (cmd->opc) { + case NVME_OPC_GET_LOG_PAGE: + handle_get_log_page(ctrlr, nc, cmd); + break; + case NVME_OPC_IDENTIFY: + handle_identify_command(ctrlr, nc, cmd); + break; + case NVME_OPC_SET_FEATURES: + handle_set_features(ctrlr, nc, cmd); + break; + case NVME_OPC_ASYNC_EVENT_REQUEST: + mtx_lock(&ctrlr->lock); + if (ctrlr->aer_pending == NVMFT_NUM_AER) { + mtx_unlock(&ctrlr->lock); + nvmft_send_error(ctrlr->admin, nc, + NVME_SCT_COMMAND_SPECIFIC, + NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED); + } else { + /* NB: Store the CID without byte-swapping. 
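+			 * nvmft_report_aer() later copies it back into
+			 * cpl.cid verbatim, so the wire byte order round-trips
+			 * without a swap in either direction.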
*/ + ctrlr->aer_cids[ctrlr->aer_pidx] = cmd->cid; + ctrlr->aer_pending++; + ctrlr->aer_pidx = (ctrlr->aer_pidx + 1) % NVMFT_NUM_AER; + mtx_unlock(&ctrlr->lock); + } + nvmf_free_capsule(nc); + break; + case NVME_OPC_KEEP_ALIVE: + nvmft_send_success(ctrlr->admin, nc); + nvmf_free_capsule(nc); + break; + case NVME_OPC_FABRICS_COMMANDS: + handle_admin_fabrics_command(ctrlr, nc, + (const struct nvmf_fabric_cmd *)cmd); + break; + default: + nvmft_printf(ctrlr, "Unsupported admin opcode %#x\n", cmd->opc); + nvmft_send_generic_error(ctrlr->admin, nc, + NVME_SC_INVALID_OPCODE); + nvmf_free_capsule(nc); + break; + } +} + +void +nvmft_handle_io_command(struct nvmft_qpair *qp, uint16_t qid, + struct nvmf_capsule *nc) +{ + struct nvmft_controller *ctrlr = nvmft_qpair_ctrlr(qp); + const struct nvme_command *cmd = nvmf_capsule_sqe(nc); + + atomic_store_int(&ctrlr->ka_active_traffic, 1); + + switch (cmd->opc) { + case NVME_OPC_FLUSH: + if (cmd->nsid == htole32(0xffffffff)) { + nvmft_send_generic_error(qp, nc, + NVME_SC_INVALID_NAMESPACE_OR_FORMAT); + nvmf_free_capsule(nc); + break; + } + /* FALLTHROUGH */ + case NVME_OPC_WRITE: + case NVME_OPC_READ: + case NVME_OPC_WRITE_UNCORRECTABLE: + case NVME_OPC_COMPARE: + case NVME_OPC_WRITE_ZEROES: + case NVME_OPC_DATASET_MANAGEMENT: + case NVME_OPC_VERIFY: + nvmft_dispatch_command(qp, nc, false); + break; + default: + nvmft_printf(ctrlr, "Unsupported I/O opcode %#x\n", cmd->opc); + nvmft_send_generic_error(qp, nc, + NVME_SC_INVALID_OPCODE); + nvmf_free_capsule(nc); + break; + } +} + +static void +nvmft_report_aer(struct nvmft_controller *ctrlr, uint32_t aer_mask, + u_int type, uint8_t info, uint8_t log_page_id) +{ + struct nvme_completion cpl; + + MPASS(type <= 7); + + /* Drop events that are not enabled. */ + mtx_lock(&ctrlr->lock); + if ((ctrlr->aer_mask & aer_mask) == 0) { + mtx_unlock(&ctrlr->lock); + return; + } + + /* + * If there is no pending AER command, drop it. + * XXX: Should we queue these? + */ + if (ctrlr->aer_pending == 0) { + mtx_unlock(&ctrlr->lock); + nvmft_printf(ctrlr, + "dropping AER type %u, info %#x, page %#x\n", + type, info, log_page_id); + return; + } + + memset(&cpl, 0, sizeof(cpl)); + cpl.cid = ctrlr->aer_cids[ctrlr->aer_cidx]; + ctrlr->aer_pending--; + ctrlr->aer_cidx = (ctrlr->aer_cidx + 1) % NVMFT_NUM_AER; + mtx_unlock(&ctrlr->lock); + + cpl.cdw0 = htole32(NVMEF(NVME_ASYNC_EVENT_TYPE, type) | + NVMEF(NVME_ASYNC_EVENT_INFO, info) | + NVMEF(NVME_ASYNC_EVENT_LOG_PAGE_ID, log_page_id)); + + nvmft_send_response(ctrlr->admin, &cpl); +} + +void +nvmft_controller_lun_changed(struct nvmft_controller *ctrlr, int lun_id) +{ + struct nvme_ns_list *nslist; + uint32_t new_nsid, nsid; + u_int i; + + new_nsid = lun_id + 1; + + mtx_lock(&ctrlr->lock); + nslist = ctrlr->changed_ns; + + /* If the first entry is 0xffffffff, the list is already full. */ + if (nslist->ns[0] != 0xffffffff) { + /* Find the insertion point for this namespace ID. */ + for (i = 0; i < nitems(nslist->ns); i++) { + nsid = le32toh(nslist->ns[i]); + if (nsid == new_nsid) { + /* Already reported, nothing to do. */ + mtx_unlock(&ctrlr->lock); + return; + } + + if (nsid == 0 || nsid > new_nsid) + break; + } + + if (nslist->ns[nitems(nslist->ns) - 1] != htole32(0)) { + /* List is full. */ + memset(ctrlr->changed_ns, 0, + sizeof(*ctrlr->changed_ns)); + ctrlr->changed_ns->ns[0] = 0xffffffff; + } else if (nslist->ns[i] == htole32(0)) { + /* + * Optimize case where this ID is appended to + * the end. 
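+ * This avoids the memmove insertion used for IDs that land in the middle of the list.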
+ */ + nslist->ns[i] = htole32(new_nsid); + } else { + memmove(&nslist->ns[i + 1], &nslist->ns[i], + (nitems(nslist->ns) - i - 1) * + sizeof(nslist->ns[0])); + nslist->ns[i] = htole32(new_nsid); + } + } + + if (ctrlr->changed_ns_reported) { + mtx_unlock(&ctrlr->lock); + return; + } + ctrlr->changed_ns_reported = true; + mtx_unlock(&ctrlr->lock); + + nvmft_report_aer(ctrlr, NVME_ASYNC_EVENT_NS_ATTRIBUTE, 0x2, 0x0, + NVME_LOG_CHANGED_NAMESPACE); +} diff --git a/sys/dev/nvmf/controller/nvmft_qpair.c b/sys/dev/nvmf/controller/nvmft_qpair.c new file mode 100644 --- /dev/null +++ b/sys/dev/nvmf/controller/nvmft_qpair.c @@ -0,0 +1,361 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2023-2024 Chelsio Communications, Inc. + * Written by: John Baldwin + */ + +#include +#include +#include +#include +#include + +#include +#include + +/* + * A bitmask of command ID values. This is used to detect duplicate + * commands with the same ID. + */ +#define NUM_CIDS (UINT16_MAX + 1) +BITSET_DEFINE(cidset, NUM_CIDS); + +struct nvmft_qpair { + struct nvmft_controller *ctrlr; + struct nvmf_qpair *qp; + struct cidset *cids; + + bool admin; + bool sq_flow_control; + uint16_t qid; + u_int qsize; + uint16_t sqhd; + uint16_t sqtail; + volatile u_int qp_refs; /* Internal references on 'qp'. */ + + struct mtx lock; + + char name[16]; +}; + +static int _nvmft_send_generic_error(struct nvmft_qpair *qp, + struct nvmf_capsule *nc, uint8_t sc_status); + +static void +nvmft_qpair_error(void *arg, int error) +{ + struct nvmft_qpair *qp = arg; + struct nvmft_controller *ctrlr = qp->ctrlr; + + /* + * XXX: The Linux TCP initiator sends a RST immediately after + * the FIN, so treat ECONNRESET as plain EOF to avoid spurious + * errors on shutdown. + */ + if (error == ECONNRESET) + error = 0; + + if (error != 0) + nvmft_printf(ctrlr, "error %d on %s\n", error, qp->name); + nvmft_controller_error(ctrlr, qp, error); +} + +static void +nvmft_receive_capsule(void *arg, struct nvmf_capsule *nc) +{ + struct nvmft_qpair *qp = arg; + struct nvmft_controller *ctrlr = qp->ctrlr; + const struct nvme_command *cmd; + uint8_t sc_status; + + cmd = nvmf_capsule_sqe(nc); + if (ctrlr == NULL) { + printf("NVMFT: %s received CID %u opcode %u on newborn queue\n", + qp->name, le16toh(cmd->cid), cmd->opc); + nvmf_free_capsule(nc); + return; + } + + sc_status = nvmf_validate_command_capsule(nc); + if (sc_status != NVME_SC_SUCCESS) { + _nvmft_send_generic_error(qp, nc, sc_status); + nvmf_free_capsule(nc); + return; + } + + /* Don't bother byte-swapping CID. 
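+ * The bitset only needs a value that is unique per outstanding command, and the raw wire encoding is just as unique as the swapped one.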
*/ + if (BIT_TEST_SET_ATOMIC(NUM_CIDS, cmd->cid, qp->cids)) { + _nvmft_send_generic_error(qp, nc, NVME_SC_COMMAND_ID_CONFLICT); + nvmf_free_capsule(nc); + return; + } + + if (qp->admin) + nvmft_handle_admin_command(ctrlr, nc); + else + nvmft_handle_io_command(qp, qp->qid, nc); +} + +struct nvmft_qpair * +nvmft_qpair_init(enum nvmf_trtype trtype, + const struct nvmf_handoff_qpair_params *handoff, uint16_t qid, + const char *name) +{ + struct nvmft_qpair *qp; + + qp = malloc(sizeof(*qp), M_NVMFT, M_WAITOK | M_ZERO); + qp->admin = handoff->admin; + qp->sq_flow_control = handoff->sq_flow_control; + qp->qsize = handoff->qsize; + qp->qid = qid; + qp->sqhd = handoff->sqhd; + qp->sqtail = handoff->sqtail; + strlcpy(qp->name, name, sizeof(qp->name)); + mtx_init(&qp->lock, "nvmft qp", NULL, MTX_DEF); + qp->cids = BITSET_ALLOC(NUM_CIDS, M_NVMFT, M_WAITOK | M_ZERO); + + qp->qp = nvmf_allocate_qpair(trtype, true, handoff, nvmft_qpair_error, + qp, nvmft_receive_capsule, qp); + if (qp->qp == NULL) { + mtx_destroy(&qp->lock); + free(qp->cids, M_NVMFT); + free(qp, M_NVMFT); + return (NULL); + } + + refcount_init(&qp->qp_refs, 1); + return (qp); +} + +void +nvmft_qpair_shutdown(struct nvmft_qpair *qp) +{ + struct nvmf_qpair *nq; + + mtx_lock(&qp->lock); + nq = qp->qp; + qp->qp = NULL; + mtx_unlock(&qp->lock); + if (nq != NULL && refcount_release(&qp->qp_refs)) + nvmf_free_qpair(nq); +} + +void +nvmft_qpair_destroy(struct nvmft_qpair *qp) +{ + nvmft_qpair_shutdown(qp); + mtx_destroy(&qp->lock); + free(qp->cids, M_NVMFT); + free(qp, M_NVMFT); +} + +struct nvmft_controller * +nvmft_qpair_ctrlr(struct nvmft_qpair *qp) +{ + return (qp->ctrlr); +} + +uint16_t +nvmft_qpair_id(struct nvmft_qpair *qp) +{ + return (qp->qid); +} + +const char * +nvmft_qpair_name(struct nvmft_qpair *qp) +{ + return (qp->name); +} + +static int +_nvmft_send_response(struct nvmft_qpair *qp, const void *cqe) +{ + struct nvme_completion cpl; + struct nvmf_qpair *nq; + struct nvmf_capsule *rc; + int error; + + memcpy(&cpl, cqe, sizeof(cpl)); + mtx_lock(&qp->lock); + nq = qp->qp; + if (nq == NULL) { + mtx_unlock(&qp->lock); + return (ENOTCONN); + } + refcount_acquire(&qp->qp_refs); + + /* Set SQHD. */ + if (qp->sq_flow_control) { + qp->sqhd = (qp->sqhd + 1) % qp->qsize; + cpl.sqhd = htole16(qp->sqhd); + } else + cpl.sqhd = 0; + mtx_unlock(&qp->lock); + + rc = nvmf_allocate_response(nq, &cpl, M_WAITOK); + error = nvmf_transmit_capsule(rc); + nvmf_free_capsule(rc); + + if (refcount_release(&qp->qp_refs)) + nvmf_free_qpair(nq); + return (error); +} + +void +nvmft_command_completed(struct nvmft_qpair *qp, struct nvmf_capsule *nc) +{ + const struct nvme_command *cmd = nvmf_capsule_sqe(nc); + + /* Don't bother byte-swapping CID. */ + KASSERT(BIT_ISSET(NUM_CIDS, cmd->cid, qp->cids), + ("%s: CID %u not busy", __func__, cmd->cid)); + + BIT_CLR_ATOMIC(NUM_CIDS, cmd->cid, qp->cids); +} + +int +nvmft_send_response(struct nvmft_qpair *qp, const void *cqe) +{ + const struct nvme_completion *cpl = cqe; + + /* Don't bother byte-swapping CID. 
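+ * The bit set in nvmft_receive_capsule() is cleared here before the completion is transmitted.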
*/ + KASSERT(BIT_ISSET(NUM_CIDS, cpl->cid, qp->cids), + ("%s: CID %u not busy", __func__, cpl->cid)); + + BIT_CLR_ATOMIC(NUM_CIDS, cpl->cid, qp->cids); + return (_nvmft_send_response(qp, cqe)); +} + +void +nvmft_init_cqe(void *cqe, struct nvmf_capsule *nc, uint16_t status) +{ + struct nvme_completion *cpl = cqe; + const struct nvme_command *cmd = nvmf_capsule_sqe(nc); + + memset(cpl, 0, sizeof(*cpl)); + cpl->cid = cmd->cid; + cpl->status = htole16(status); +} + +int +nvmft_send_error(struct nvmft_qpair *qp, struct nvmf_capsule *nc, + uint8_t sc_type, uint8_t sc_status) +{ + struct nvme_completion cpl; + uint16_t status; + + status = NVMEF(NVME_STATUS_SCT, sc_type) | + NVMEF(NVME_STATUS_SC, sc_status); + nvmft_init_cqe(&cpl, nc, status); + return (nvmft_send_response(qp, &cpl)); +} + +int +nvmft_send_generic_error(struct nvmft_qpair *qp, struct nvmf_capsule *nc, + uint8_t sc_status) +{ + return (nvmft_send_error(qp, nc, NVME_SCT_GENERIC, sc_status)); +} + +/* + * This version doesn't clear CID in qp->cids and is used for errors + * before the CID is validated. + */ +static int +_nvmft_send_generic_error(struct nvmft_qpair *qp, struct nvmf_capsule *nc, + uint8_t sc_status) +{ + struct nvme_completion cpl; + uint16_t status; + + status = NVMEF(NVME_STATUS_SCT, NVME_SCT_GENERIC) | + NVMEF(NVME_STATUS_SC, sc_status); + nvmft_init_cqe(&cpl, nc, status); + return (_nvmft_send_response(qp, &cpl)); +} + +int +nvmft_send_success(struct nvmft_qpair *qp, struct nvmf_capsule *nc) +{ + return (nvmft_send_generic_error(qp, nc, NVME_SC_SUCCESS)); +} + +static void +nvmft_init_connect_rsp(struct nvmf_fabric_connect_rsp *rsp, + const struct nvmf_fabric_connect_cmd *cmd, uint16_t status) +{ + memset(rsp, 0, sizeof(*rsp)); + rsp->cid = cmd->cid; + rsp->status = htole16(status); +} + +static int +nvmft_send_connect_response(struct nvmft_qpair *qp, + const struct nvmf_fabric_connect_rsp *rsp) +{ + struct nvmf_capsule *rc; + struct nvmf_qpair *nq; + int error; + + mtx_lock(&qp->lock); + nq = qp->qp; + if (nq == NULL) { + mtx_unlock(&qp->lock); + return (ENOTCONN); + } + refcount_acquire(&qp->qp_refs); + mtx_unlock(&qp->lock); + + rc = nvmf_allocate_response(qp->qp, rsp, M_WAITOK); + error = nvmf_transmit_capsule(rc); + nvmf_free_capsule(rc); + + if (refcount_release(&qp->qp_refs)) + nvmf_free_qpair(nq); + return (error); +} + +void +nvmft_connect_error(struct nvmft_qpair *qp, + const struct nvmf_fabric_connect_cmd *cmd, uint8_t sc_type, + uint8_t sc_status) +{ + struct nvmf_fabric_connect_rsp rsp; + uint16_t status; + + status = NVMEF(NVME_STATUS_SCT, sc_type) | + NVMEF(NVME_STATUS_SC, sc_status); + nvmft_init_connect_rsp(&rsp, cmd, status); + nvmft_send_connect_response(qp, &rsp); +} + +void +nvmft_connect_invalid_parameters(struct nvmft_qpair *qp, + const struct nvmf_fabric_connect_cmd *cmd, bool data, uint16_t offset) +{ + struct nvmf_fabric_connect_rsp rsp; + + nvmft_init_connect_rsp(&rsp, cmd, + NVMEF(NVME_STATUS_SCT, NVME_SCT_COMMAND_SPECIFIC) | + NVMEF(NVME_STATUS_SC, NVMF_FABRIC_SC_INVALID_PARAM)); + rsp.status_code_specific.invalid.ipo = htole16(offset); + rsp.status_code_specific.invalid.iattr = data ? 
1 : 0; + nvmft_send_connect_response(qp, &rsp); +} + +int +nvmft_finish_accept(struct nvmft_qpair *qp, + const struct nvmf_fabric_connect_cmd *cmd, struct nvmft_controller *ctrlr) +{ + struct nvmf_fabric_connect_rsp rsp; + + qp->ctrlr = ctrlr; + nvmft_init_connect_rsp(&rsp, cmd, 0); + if (qp->sq_flow_control) + rsp.sqhd = htole16(qp->sqhd); + else + rsp.sqhd = htole16(0xffff); + rsp.status_code_specific.success.cntlid = htole16(ctrlr->cntlid); + return (nvmft_send_connect_response(qp, &rsp)); +} diff --git a/sys/dev/nvmf/controller/nvmft_var.h b/sys/dev/nvmf/controller/nvmft_var.h new file mode 100644 --- /dev/null +++ b/sys/dev/nvmf/controller/nvmft_var.h @@ -0,0 +1,174 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2023-2024 Chelsio Communications, Inc. + * Written by: John Baldwin + */ + +#ifndef __NVMFT_VAR_H__ +#define __NVMFT_VAR_H__ + +#include +#include +#include + +#include + +#include +#include +#include + +struct nvmf_capsule; +struct nvmft_controller; +struct nvmft_qpair; + +#define NVMFT_NUM_AER 16 + +struct nvmft_port { + TAILQ_ENTRY(nvmft_port) link; + u_int refs; + struct ctl_port port; + struct nvme_controller_data cdata; + struct nvme_firmware_page fp; + uint64_t cap; + uint32_t max_io_qsize; + bool online; + + struct sx lock; + + struct unrhdr *ids; + TAILQ_HEAD(, nvmft_controller) controllers; + + uint32_t *active_ns; + u_int num_ns; +}; + +struct nvmft_io_qpair { + struct nvmft_qpair *qp; + + bool shutdown; +}; + +struct nvmft_controller { + struct nvmft_qpair *admin; + struct nvmft_io_qpair *io_qpairs; + u_int num_io_queues; + bool shutdown; + bool admin_closed; + uint16_t cntlid; + uint32_t cc; + uint32_t csts; + + struct nvmft_port *np; + struct mtx lock; + + struct nvme_controller_data cdata; + struct nvme_health_information_page hip; + sbintime_t create_time; + sbintime_t start_busy; + sbintime_t busy_total; + uint16_t partial_dur; + uint16_t partial_duw; + + uint8_t hostid[16]; + uint8_t hostnqn[NVME_NQN_FIELD_SIZE]; + u_int trtype; + + TAILQ_ENTRY(nvmft_controller) link; + + /* + * Each queue can have at most UINT16_MAX commands, so the total + * across all queues will fit in a uint32_t. + */ + uint32_t pending_commands; + + volatile int ka_active_traffic; + struct callout ka_timer; + sbintime_t ka_sbt; + + /* AER fields. */ + uint32_t aer_mask; + uint16_t aer_cids[NVMFT_NUM_AER]; + uint8_t aer_pending; + uint8_t aer_cidx; + uint8_t aer_pidx; + + /* Changed namespace IDs. 
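+ * changed_ns accumulates namespace IDs that have changed; changed_ns_reported tracks whether an AER has been posted for the current list.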
*/ + struct nvme_ns_list *changed_ns; + bool changed_ns_reported; + + struct task shutdown_task; + struct timeout_task terminate_task; +}; + +MALLOC_DECLARE(M_NVMFT); + +/* ctl_frontend_nvmf.c */ +void nvmft_port_free(struct nvmft_port *np); +void nvmft_populate_active_nslist(struct nvmft_port *np, uint32_t nsid, + struct nvme_ns_list *nslist); +void nvmft_dispatch_command(struct nvmft_qpair *qp, + struct nvmf_capsule *nc, bool admin); +void nvmft_terminate_commands(struct nvmft_controller *ctrlr); + +/* nvmft_controller.c */ +void nvmft_controller_error(struct nvmft_controller *ctrlr, + struct nvmft_qpair *qp, int error); +void nvmft_controller_lun_changed(struct nvmft_controller *ctrlr, + int lun_id); +void nvmft_handle_admin_command(struct nvmft_controller *ctrlr, + struct nvmf_capsule *nc); +void nvmft_handle_io_command(struct nvmft_qpair *qp, uint16_t qid, + struct nvmf_capsule *nc); +int nvmft_handoff_admin_queue(struct nvmft_port *np, + const struct nvmf_handoff_controller_qpair *handoff, + const struct nvmf_fabric_connect_cmd *cmd, + const struct nvmf_fabric_connect_data *data); +int nvmft_handoff_io_queue(struct nvmft_port *np, + const struct nvmf_handoff_controller_qpair *handoff, + const struct nvmf_fabric_connect_cmd *cmd, + const struct nvmf_fabric_connect_data *data); +int nvmft_printf(struct nvmft_controller *ctrlr, const char *fmt, ...) + __printflike(2, 3); + +/* nvmft_qpair.c */ +struct nvmft_qpair *nvmft_qpair_init(enum nvmf_trtype trtype, + const struct nvmf_handoff_qpair_params *handoff, uint16_t qid, + const char *name); +void nvmft_qpair_shutdown(struct nvmft_qpair *qp); +void nvmft_qpair_destroy(struct nvmft_qpair *qp); +struct nvmft_controller *nvmft_qpair_ctrlr(struct nvmft_qpair *qp); +uint16_t nvmft_qpair_id(struct nvmft_qpair *qp); +const char *nvmft_qpair_name(struct nvmft_qpair *qp); +void nvmft_command_completed(struct nvmft_qpair *qp, + struct nvmf_capsule *nc); +int nvmft_send_response(struct nvmft_qpair *qp, const void *cqe); +void nvmft_init_cqe(void *cqe, struct nvmf_capsule *nc, uint16_t status); +int nvmft_send_error(struct nvmft_qpair *qp, struct nvmf_capsule *nc, + uint8_t sc_type, uint8_t sc_status); +int nvmft_send_generic_error(struct nvmft_qpair *qp, + struct nvmf_capsule *nc, uint8_t sc_status); +int nvmft_send_success(struct nvmft_qpair *qp, + struct nvmf_capsule *nc); +void nvmft_connect_error(struct nvmft_qpair *qp, + const struct nvmf_fabric_connect_cmd *cmd, uint8_t sc_type, + uint8_t sc_status); +void nvmft_connect_invalid_parameters(struct nvmft_qpair *qp, + const struct nvmf_fabric_connect_cmd *cmd, bool data, uint16_t offset); +int nvmft_finish_accept(struct nvmft_qpair *qp, + const struct nvmf_fabric_connect_cmd *cmd, struct nvmft_controller *ctrlr); + +static __inline void +nvmft_port_ref(struct nvmft_port *np) +{ + refcount_acquire(&np->refs); +} + +static __inline void +nvmft_port_rele(struct nvmft_port *np) +{ + if (refcount_release(&np->refs)) + nvmft_port_free(np); +} + +#endif /* !__NVMFT_VAR_H__ */ diff --git a/sys/modules/nvmf/Makefile b/sys/modules/nvmf/Makefile --- a/sys/modules/nvmf/Makefile +++ b/sys/modules/nvmf/Makefile @@ -1,5 +1,6 @@ SUBDIR= nvmf \ nvmf_tcp \ - nvmf_transport + nvmf_transport \ + nvmft .include diff --git a/sys/modules/nvmf/nvmft/Makefile b/sys/modules/nvmf/nvmft/Makefile new file mode 100644 --- /dev/null +++ b/sys/modules/nvmf/nvmft/Makefile @@ -0,0 +1,10 @@ +.PATH: ${SRCTOP}/sys/dev/nvmf/controller + +KMOD= nvmft + +SRCS= ctl_frontend_nvmf.c \ + nvmft_controller.c \ + nvmft_subr.c \ + nvmft_qpair.c 
+ +.include <bsd.kmod.mk>
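
Editor's note, not part of the patch: the sketch below is a userland model of the duplicate command ID detection performed in nvmft_receive_capsule(), using a plain bit array in place of the kernel bitset(9) macros; the names cid_test_set and cid_clear are illustrative only.

/*
 * Illustrative sketch only (assumes nothing beyond standard C): mimic the
 * per-queue CID busy tracking used by nvmft_qpair.c.
 */
#include <limits.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define NUM_CIDS	(UINT16_MAX + 1)
#define BITS_PER_WORD	(sizeof(unsigned long) * CHAR_BIT)

static unsigned long cids[NUM_CIDS / BITS_PER_WORD];

/* Returns true if the CID was already busy, mirroring BIT_TEST_SET_ATOMIC(). */
static bool
cid_test_set(uint16_t cid)
{
	unsigned long mask = 1UL << (cid % BITS_PER_WORD);
	unsigned long *word = &cids[cid / BITS_PER_WORD];
	bool busy = (*word & mask) != 0;

	*word |= mask;
	return (busy);
}

/* Mirrors BIT_CLR_ATOMIC() once the completion for the CID has been sent. */
static void
cid_clear(uint16_t cid)
{
	cids[cid / BITS_PER_WORD] &= ~(1UL << (cid % BITS_PER_WORD));
}

int
main(void)
{
	printf("first use of CID 7 busy? %d\n", cid_test_set(7));	/* 0 */
	printf("reuse of CID 7 busy? %d\n", cid_test_set(7));		/* 1 */
	cid_clear(7);
	printf("after completion, CID 7 busy? %d\n", cid_test_set(7));	/* 0 */
	return (0);
}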