diff --git a/usr.sbin/nvmfd/discovery.c b/usr.sbin/nvmfd/discovery.c index 1cee8755c65c..2cfe56731d7c 100644 --- a/usr.sbin/nvmfd/discovery.c +++ b/usr.sbin/nvmfd/discovery.c @@ -1,342 +1,342 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2023-2024 Chelsio Communications, Inc. * Written by: John Baldwin */ #include #include #include #include #include #include #include #include #include #include #include #include "internal.h" struct io_controller_data { struct nvme_discovery_log_entry entry; bool wildcard; }; struct discovery_controller { struct nvme_discovery_log *discovery_log; size_t discovery_log_len; int s; }; struct discovery_thread_arg { struct controller *c; struct nvmf_qpair *qp; int s; }; static struct io_controller_data *io_controllers; static struct nvmf_association *discovery_na; static u_int num_io_controllers; static bool init_discovery_log_entry(struct nvme_discovery_log_entry *entry, int s, const char *subnqn) { struct sockaddr_storage ss; socklen_t len; bool wildcard; len = sizeof(ss); if (getsockname(s, (struct sockaddr *)&ss, &len) == -1) err(1, "getsockname"); memset(entry, 0, sizeof(*entry)); entry->trtype = NVMF_TRTYPE_TCP; switch (ss.ss_family) { case AF_INET: { struct sockaddr_in *sin; sin = (struct sockaddr_in *)&ss; entry->adrfam = NVMF_ADRFAM_IPV4; snprintf(entry->trsvcid, sizeof(entry->trsvcid), "%u", htons(sin->sin_port)); if (inet_ntop(AF_INET, &sin->sin_addr, entry->traddr, sizeof(entry->traddr)) == NULL) err(1, "inet_ntop"); wildcard = (sin->sin_addr.s_addr == htonl(INADDR_ANY)); break; } case AF_INET6: { struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)&ss; entry->adrfam = NVMF_ADRFAM_IPV6; snprintf(entry->trsvcid, sizeof(entry->trsvcid), "%u", htons(sin6->sin6_port)); if (inet_ntop(AF_INET6, &sin6->sin6_addr, entry->traddr, sizeof(entry->traddr)) == NULL) err(1, "inet_ntop"); wildcard = (memcmp(&sin6->sin6_addr, &in6addr_any, sizeof(in6addr_any)) == 0); break; } default: errx(1, "Unsupported address family %u", ss.ss_family); } entry->subtype = NVMF_SUBTYPE_NVME; if (flow_control_disable) entry->treq |= (1 << 2); entry->portid = htole16(1); entry->cntlid = htole16(NVMF_CNTLID_DYNAMIC); entry->aqsz = NVME_MAX_ADMIN_ENTRIES; strlcpy(entry->subnqn, subnqn, sizeof(entry->subnqn)); return (wildcard); } void init_discovery(void) { struct nvmf_association_params aparams; memset(&aparams, 0, sizeof(aparams)); aparams.sq_flow_control = false; aparams.dynamic_controller_model = true; aparams.max_admin_qsize = NVME_MAX_ADMIN_ENTRIES; aparams.tcp.pda = 0; aparams.tcp.header_digests = header_digests; aparams.tcp.data_digests = data_digests; - aparams.tcp.maxh2cdata = 256 * 1024; + aparams.tcp.maxh2cdata = maxh2cdata; discovery_na = nvmf_allocate_association(NVMF_TRTYPE_TCP, true, &aparams); if (discovery_na == NULL) err(1, "Failed to create discovery association"); } void discovery_add_io_controller(int s, const char *subnqn) { struct io_controller_data *icd; io_controllers = reallocf(io_controllers, (num_io_controllers + 1) * sizeof(*io_controllers)); icd = &io_controllers[num_io_controllers]; num_io_controllers++; icd->wildcard = init_discovery_log_entry(&icd->entry, s, subnqn); } static void build_discovery_log_page(struct discovery_controller *dc) { struct sockaddr_storage ss; socklen_t len; char traddr[256]; u_int i, nentries; uint8_t adrfam; if (dc->discovery_log != NULL) return; len = sizeof(ss); if (getsockname(dc->s, (struct sockaddr *)&ss, &len) == -1) { warn("build_discovery_log_page: getsockname"); return; } memset(traddr, 0, 
sizeof(traddr)); switch (ss.ss_family) { case AF_INET: { struct sockaddr_in *sin; sin = (struct sockaddr_in *)&ss; adrfam = NVMF_ADRFAM_IPV4; if (inet_ntop(AF_INET, &sin->sin_addr, traddr, sizeof(traddr)) == NULL) { warn("build_discovery_log_page: inet_ntop"); return; } break; } case AF_INET6: { struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)&ss; adrfam = NVMF_ADRFAM_IPV6; if (inet_ntop(AF_INET6, &sin6->sin6_addr, traddr, sizeof(traddr)) == NULL) { warn("build_discovery_log_page: inet_ntop"); return; } break; } default: assert(false); } nentries = 0; for (i = 0; i < num_io_controllers; i++) { if (io_controllers[i].wildcard && io_controllers[i].entry.adrfam != adrfam) continue; nentries++; } dc->discovery_log_len = sizeof(*dc->discovery_log) + nentries * sizeof(struct nvme_discovery_log_entry); dc->discovery_log = calloc(dc->discovery_log_len, 1); dc->discovery_log->numrec = htole64(nentries); dc->discovery_log->recfmt = 0; nentries = 0; for (i = 0; i < num_io_controllers; i++) { if (io_controllers[i].wildcard && io_controllers[i].entry.adrfam != adrfam) continue; dc->discovery_log->entries[nentries] = io_controllers[i].entry; if (io_controllers[i].wildcard) memcpy(dc->discovery_log->entries[nentries].traddr, traddr, sizeof(traddr)); nentries++; } } static void handle_get_log_page_command(const struct nvmf_capsule *nc, const struct nvme_command *cmd, struct discovery_controller *dc) { uint64_t offset; uint32_t length; switch (nvmf_get_log_page_id(cmd)) { case NVME_LOG_DISCOVERY: break; default: warnx("Unsupported log page %u for discovery controller", nvmf_get_log_page_id(cmd)); goto error; } build_discovery_log_page(dc); offset = nvmf_get_log_page_offset(cmd); if (offset >= dc->discovery_log_len) goto error; length = nvmf_get_log_page_length(cmd); if (length > dc->discovery_log_len - offset) length = dc->discovery_log_len - offset; nvmf_send_controller_data(nc, (char *)dc->discovery_log + offset, length); return; error: nvmf_send_generic_error(nc, NVME_SC_INVALID_FIELD); } static bool discovery_command(const struct nvmf_capsule *nc, const struct nvme_command *cmd, void *arg) { struct discovery_controller *dc = arg; switch (cmd->opc) { case NVME_OPC_GET_LOG_PAGE: handle_get_log_page_command(nc, cmd, dc); return (true); default: return (false); } } static void * discovery_thread(void *arg) { struct discovery_thread_arg *dta = arg; struct discovery_controller dc; pthread_detach(pthread_self()); memset(&dc, 0, sizeof(dc)); dc.s = dta->s; controller_handle_admin_commands(dta->c, discovery_command, &dc); free(dc.discovery_log); free_controller(dta->c); nvmf_free_qpair(dta->qp); close(dta->s); free(dta); return (NULL); } void handle_discovery_socket(int s) { struct nvmf_fabric_connect_data data; struct nvme_controller_data cdata; struct nvmf_qpair_params qparams; struct discovery_thread_arg *dta; struct nvmf_capsule *nc; struct nvmf_qpair *qp; pthread_t thr; int error; memset(&qparams, 0, sizeof(qparams)); qparams.tcp.fd = s; nc = NULL; qp = nvmf_accept(discovery_na, &qparams, &nc, &data); if (qp == NULL) { warnx("Failed to create discovery qpair: %s", nvmf_association_error(discovery_na)); goto error; } if (strcmp(data.subnqn, NVMF_DISCOVERY_NQN) != 0) { warn("Discovery qpair with invalid SubNQN: %.*s", (int)sizeof(data.subnqn), data.subnqn); nvmf_connect_invalid_parameters(nc, true, offsetof(struct nvmf_fabric_connect_data, subnqn)); goto error; } /* Just use a controller ID of 1 for all discovery controllers. 
*/ error = nvmf_finish_accept(nc, 1); if (error != 0) { warnc(error, "Failed to send CONNECT response"); goto error; } nvmf_init_discovery_controller_data(qp, &cdata); dta = malloc(sizeof(*dta)); dta->qp = qp; dta->s = s; dta->c = init_controller(qp, &cdata); error = pthread_create(&thr, NULL, discovery_thread, dta); if (error != 0) { warnc(error, "Failed to create discovery thread"); free_controller(dta->c); free(dta); goto error; } nvmf_free_capsule(nc); return; error: if (nc != NULL) nvmf_free_capsule(nc); if (qp != NULL) nvmf_free_qpair(qp); close(s); } diff --git a/usr.sbin/nvmfd/internal.h b/usr.sbin/nvmfd/internal.h index 5ddbc1cf89f0..f70dc78881c6 100644 --- a/usr.sbin/nvmfd/internal.h +++ b/usr.sbin/nvmfd/internal.h @@ -1,65 +1,66 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2023-2024 Chelsio Communications, Inc. * Written by: John Baldwin */ #ifndef __INTERNAL_H__ #define __INTERNAL_H__ #include struct controller; struct nvme_command; struct nvme_controller_data; struct nvme_ns_list; struct nvmf_capsule; struct nvmf_qpair; typedef bool handle_command(const struct nvmf_capsule *, const struct nvme_command *, void *); extern bool data_digests; extern bool header_digests; extern bool flow_control_disable; extern bool kernel_io; +extern uint32_t maxh2cdata; /* controller.c */ void controller_handle_admin_commands(struct controller *c, handle_command *cb, void *cb_arg); struct controller *init_controller(struct nvmf_qpair *qp, const struct nvme_controller_data *cdata); void free_controller(struct controller *c); /* discovery.c */ void init_discovery(void); void handle_discovery_socket(int s); void discovery_add_io_controller(int s, const char *subnqn); /* io.c */ void init_io(const char *subnqn); void handle_io_socket(int s); void shutdown_io(void); /* devices.c */ void register_devices(int ac, char **av); u_int device_count(void); void device_active_nslist(uint32_t nsid, struct nvme_ns_list *nslist); bool device_identification_descriptor(uint32_t nsid, void *buf); bool device_namespace_data(uint32_t nsid, struct nvme_namespace_data *nsdata); void device_read(uint32_t nsid, uint64_t lba, u_int nlb, const struct nvmf_capsule *nc); void device_write(uint32_t nsid, uint64_t lba, u_int nlb, const struct nvmf_capsule *nc); void device_flush(uint32_t nsid, const struct nvmf_capsule *nc); /* ctl.c */ void init_ctl_port(const char *subnqn, const struct nvmf_association_params *params); void ctl_handoff_qpair(struct nvmf_qpair *qp, const struct nvmf_fabric_connect_cmd *cmd, const struct nvmf_fabric_connect_data *data); void shutdown_ctl_port(const char *subnqn); #endif /* !__INTERNAL_H__ */ diff --git a/usr.sbin/nvmfd/io.c b/usr.sbin/nvmfd/io.c index 3c25d1944eb8..4407360257a2 100644 --- a/usr.sbin/nvmfd/io.c +++ b/usr.sbin/nvmfd/io.c @@ -1,676 +1,676 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2023-2024 Chelsio Communications, Inc. 
* Written by: John Baldwin */ #include #include #include #include #include #include #include #include #include #include "internal.h" struct io_controller { struct controller *c; u_int num_io_queues; u_int active_io_queues; struct nvmf_qpair **io_qpairs; int *io_sockets; struct nvme_firmware_page fp; struct nvme_health_information_page hip; uint16_t partial_dur; uint16_t partial_duw; uint16_t cntlid; char hostid[16]; char hostnqn[NVME_NQN_FIELD_SIZE]; }; static struct nvmf_association *io_na; static pthread_cond_t io_cond; static pthread_mutex_t io_na_mutex; static struct io_controller *io_controller; static const char *nqn; static char serial[NVME_SERIAL_NUMBER_LENGTH]; void init_io(const char *subnqn) { struct nvmf_association_params aparams; u_long hostid; size_t len; memset(&aparams, 0, sizeof(aparams)); aparams.sq_flow_control = !flow_control_disable; aparams.dynamic_controller_model = true; aparams.max_admin_qsize = NVME_MAX_ADMIN_ENTRIES; aparams.max_io_qsize = NVMF_MAX_IO_ENTRIES; aparams.tcp.pda = 0; aparams.tcp.header_digests = header_digests; aparams.tcp.data_digests = data_digests; - aparams.tcp.maxh2cdata = 256 * 1024; + aparams.tcp.maxh2cdata = maxh2cdata; io_na = nvmf_allocate_association(NVMF_TRTYPE_TCP, true, &aparams); if (io_na == NULL) err(1, "Failed to create I/O controller association"); nqn = subnqn; /* Generate a serial number from the kern.hostid node. */ len = sizeof(hostid); if (sysctlbyname("kern.hostid", &hostid, &len, NULL, 0) == -1) err(1, "sysctl: kern.hostid"); nvmf_controller_serial(serial, sizeof(serial), hostid); pthread_cond_init(&io_cond, NULL); pthread_mutex_init(&io_na_mutex, NULL); if (kernel_io) init_ctl_port(subnqn, &aparams); } void shutdown_io(void) { if (kernel_io) shutdown_ctl_port(nqn); } static void handle_get_log_page(struct io_controller *ioc, const struct nvmf_capsule *nc, const struct nvme_command *cmd) { uint64_t offset; uint32_t numd; size_t len; uint8_t lid; lid = le32toh(cmd->cdw10) & 0xff; numd = le32toh(cmd->cdw10) >> 16 | le32toh(cmd->cdw11) << 16; offset = le32toh(cmd->cdw12) | (uint64_t)le32toh(cmd->cdw13) << 32; if (offset % 4 != 0) goto error; len = (numd + 1) * 4; switch (lid) { case NVME_LOG_ERROR: { void *buf; if (len % sizeof(struct nvme_error_information_entry) != 0) goto error; buf = calloc(1, len); nvmf_send_controller_data(nc, buf, len); free(buf); return; } case NVME_LOG_HEALTH_INFORMATION: if (len != sizeof(ioc->hip)) goto error; nvmf_send_controller_data(nc, &ioc->hip, sizeof(ioc->hip)); return; case NVME_LOG_FIRMWARE_SLOT: if (len != sizeof(ioc->fp)) goto error; nvmf_send_controller_data(nc, &ioc->fp, sizeof(ioc->fp)); return; default: warnx("Unsupported page %#x for GET_LOG_PAGE", lid); goto error; } error: nvmf_send_generic_error(nc, NVME_SC_INVALID_FIELD); } static bool handle_io_identify_command(const struct nvmf_capsule *nc, const struct nvme_command *cmd) { struct nvme_namespace_data nsdata; struct nvme_ns_list nslist; uint32_t nsid; uint8_t cns; cns = le32toh(cmd->cdw10) & 0xFF; switch (cns) { case 0: /* Namespace data. */ if (!device_namespace_data(le32toh(cmd->nsid), &nsdata)) { nvmf_send_generic_error(nc, NVME_SC_INVALID_NAMESPACE_OR_FORMAT); return (true); } nvmf_send_controller_data(nc, &nsdata, sizeof(nsdata)); return (true); case 2: /* Active namespace list. 
*/ nsid = le32toh(cmd->nsid); if (nsid >= 0xfffffffe) { nvmf_send_generic_error(nc, NVME_SC_INVALID_FIELD); return (true); } device_active_nslist(nsid, &nslist); nvmf_send_controller_data(nc, &nslist, sizeof(nslist)); return (true); case 3: /* Namespace Identification Descriptor list. */ if (!device_identification_descriptor(le32toh(cmd->nsid), &nsdata)) { nvmf_send_generic_error(nc, NVME_SC_INVALID_NAMESPACE_OR_FORMAT); return (true); } nvmf_send_controller_data(nc, &nsdata, sizeof(nsdata)); return (true); default: return (false); } } static void handle_set_features(struct io_controller *ioc, const struct nvmf_capsule *nc, const struct nvme_command *cmd) { struct nvme_completion cqe; uint8_t fid; fid = NVMEV(NVME_FEAT_SET_FID, le32toh(cmd->cdw10)); switch (fid) { case NVME_FEAT_NUMBER_OF_QUEUES: { uint32_t num_queues; if (ioc->num_io_queues != 0) { nvmf_send_generic_error(nc, NVME_SC_COMMAND_SEQUENCE_ERROR); return; } num_queues = le32toh(cmd->cdw11) & 0xffff; /* 5.12.1.7: 65535 is invalid. */ if (num_queues == 65535) goto error; /* Fabrics requires the same number of SQs and CQs. */ if (le32toh(cmd->cdw11) >> 16 != num_queues) goto error; /* Convert to 1's based */ num_queues++; /* Lock to synchronize with handle_io_qpair. */ pthread_mutex_lock(&io_na_mutex); ioc->num_io_queues = num_queues; ioc->io_qpairs = calloc(num_queues, sizeof(*ioc->io_qpairs)); ioc->io_sockets = calloc(num_queues, sizeof(*ioc->io_sockets)); pthread_mutex_unlock(&io_na_mutex); nvmf_init_cqe(&cqe, nc, 0); cqe.cdw0 = cmd->cdw11; nvmf_send_response(nc, &cqe); return; } case NVME_FEAT_ASYNC_EVENT_CONFIGURATION: { uint32_t aer_mask; aer_mask = le32toh(cmd->cdw11); /* Check for any reserved or unimplemented feature bits. */ if ((aer_mask & 0xffffc000) != 0) goto error; /* No AERs are generated by this daemon. */ nvmf_send_success(nc); return; } default: warnx("Unsupported feature ID %u for SET_FEATURES", fid); goto error; } error: nvmf_send_generic_error(nc, NVME_SC_INVALID_FIELD); } static bool admin_command(const struct nvmf_capsule *nc, const struct nvme_command *cmd, void *arg) { struct io_controller *ioc = arg; switch (cmd->opc) { case NVME_OPC_GET_LOG_PAGE: handle_get_log_page(ioc, nc, cmd); return (true); case NVME_OPC_IDENTIFY: return (handle_io_identify_command(nc, cmd)); case NVME_OPC_SET_FEATURES: handle_set_features(ioc, nc, cmd); return (true); case NVME_OPC_ASYNC_EVENT_REQUEST: /* Ignore and never complete. */ return (true); case NVME_OPC_KEEP_ALIVE: nvmf_send_success(nc); return (true); default: return (false); } } static void handle_admin_qpair(struct io_controller *ioc) { pthread_setname_np(pthread_self(), "admin queue"); controller_handle_admin_commands(ioc->c, admin_command, ioc); pthread_mutex_lock(&io_na_mutex); for (u_int i = 0; i < ioc->num_io_queues; i++) { if (ioc->io_qpairs[i] == NULL || ioc->io_sockets[i] == -1) continue; close(ioc->io_sockets[i]); ioc->io_sockets[i] = -1; } /* Wait for I/O threads to notice. 
*/ while (ioc->active_io_queues > 0) pthread_cond_wait(&io_cond, &io_na_mutex); io_controller = NULL; pthread_mutex_unlock(&io_na_mutex); free_controller(ioc->c); free(ioc); } static bool handle_io_fabrics_command(const struct nvmf_capsule *nc, const struct nvmf_fabric_cmd *fc) { switch (fc->fctype) { case NVMF_FABRIC_COMMAND_CONNECT: warnx("CONNECT command on connected queue"); nvmf_send_generic_error(nc, NVME_SC_COMMAND_SEQUENCE_ERROR); break; case NVMF_FABRIC_COMMAND_DISCONNECT: { const struct nvmf_fabric_disconnect_cmd *dis = (const struct nvmf_fabric_disconnect_cmd *)fc; if (dis->recfmt != htole16(0)) { nvmf_send_error(nc, NVME_SCT_COMMAND_SPECIFIC, NVMF_FABRIC_SC_INCOMPATIBLE_FORMAT); break; } nvmf_send_success(nc); return (true); } default: warnx("Unsupported fabrics command %#x", fc->fctype); nvmf_send_generic_error(nc, NVME_SC_INVALID_OPCODE); break; } return (false); } static void hip_add(uint64_t pair[2], uint64_t addend) { uint64_t old, new; old = le64toh(pair[0]); new = old + addend; pair[0] = htole64(new); if (new < old) pair[1] += htole64(1); } static uint64_t cmd_lba(const struct nvme_command *cmd) { return ((uint64_t)le32toh(cmd->cdw11) << 32 | le32toh(cmd->cdw10)); } static u_int cmd_nlb(const struct nvme_command *cmd) { return ((le32toh(cmd->cdw12) & 0xffff) + 1); } static void handle_read(struct io_controller *ioc, const struct nvmf_capsule *nc, const struct nvme_command *cmd) { size_t len; len = nvmf_capsule_data_len(nc); device_read(le32toh(cmd->nsid), cmd_lba(cmd), cmd_nlb(cmd), nc); hip_add(ioc->hip.host_read_commands, 1); len /= 512; len += ioc->partial_dur; if (len >= 1000) hip_add(ioc->hip.data_units_read, len / 1000); ioc->partial_dur = len % 1000; } static void handle_write(struct io_controller *ioc, const struct nvmf_capsule *nc, const struct nvme_command *cmd) { size_t len; len = nvmf_capsule_data_len(nc); device_write(le32toh(cmd->nsid), cmd_lba(cmd), cmd_nlb(cmd), nc); hip_add(ioc->hip.host_write_commands, 1); len /= 512; len += ioc->partial_duw; if (len >= 1000) hip_add(ioc->hip.data_units_written, len / 1000); ioc->partial_duw = len % 1000; } static void handle_flush(const struct nvmf_capsule *nc, const struct nvme_command *cmd) { device_flush(le32toh(cmd->nsid), nc); } static bool handle_io_commands(struct io_controller *ioc, struct nvmf_qpair *qp) { const struct nvme_command *cmd; struct nvmf_capsule *nc; int error; bool disconnect; disconnect = false; while (!disconnect) { error = nvmf_controller_receive_capsule(qp, &nc); if (error != 0) { if (error != ECONNRESET) warnc(error, "Failed to read command capsule"); break; } cmd = nvmf_capsule_sqe(nc); switch (cmd->opc) { case NVME_OPC_FLUSH: if (cmd->nsid == htole32(0xffffffff)) { nvmf_send_generic_error(nc, NVME_SC_INVALID_NAMESPACE_OR_FORMAT); break; } handle_flush(nc, cmd); break; case NVME_OPC_WRITE: handle_write(ioc, nc, cmd); break; case NVME_OPC_READ: handle_read(ioc, nc, cmd); break; case NVME_OPC_FABRICS_COMMANDS: disconnect = handle_io_fabrics_command(nc, (const struct nvmf_fabric_cmd *)cmd); break; default: warnx("Unsupported NVM opcode %#x", cmd->opc); nvmf_send_generic_error(nc, NVME_SC_INVALID_OPCODE); break; } nvmf_free_capsule(nc); } return (disconnect); } static void handle_io_qpair(struct io_controller *ioc, struct nvmf_qpair *qp, int qid) { char name[64]; bool disconnect; snprintf(name, sizeof(name), "I/O queue %d", qid); pthread_setname_np(pthread_self(), name); disconnect = handle_io_commands(ioc, qp); pthread_mutex_lock(&io_na_mutex); if (disconnect) ioc->io_qpairs[qid - 1] = NULL; if 
(ioc->io_sockets[qid - 1] != -1) { close(ioc->io_sockets[qid - 1]); ioc->io_sockets[qid - 1] = -1; } ioc->active_io_queues--; if (ioc->active_io_queues == 0) pthread_cond_broadcast(&io_cond); pthread_mutex_unlock(&io_na_mutex); } static void connect_admin_qpair(int s, struct nvmf_qpair *qp, struct nvmf_capsule *nc, const struct nvmf_fabric_connect_data *data) { struct nvme_controller_data cdata; struct io_controller *ioc; int error; /* Can only have one active I/O controller at a time. */ pthread_mutex_lock(&io_na_mutex); if (io_controller != NULL) { pthread_mutex_unlock(&io_na_mutex); nvmf_send_error(nc, NVME_SCT_COMMAND_SPECIFIC, NVMF_FABRIC_SC_CONTROLLER_BUSY); goto error; } error = nvmf_finish_accept(nc, 2); if (error != 0) { pthread_mutex_unlock(&io_na_mutex); warnc(error, "Failed to send CONNECT response"); goto error; } ioc = calloc(1, sizeof(*ioc)); ioc->cntlid = 2; memcpy(ioc->hostid, data->hostid, sizeof(ioc->hostid)); memcpy(ioc->hostnqn, data->hostnqn, sizeof(ioc->hostnqn)); nvmf_init_io_controller_data(qp, serial, nqn, device_count(), NVMF_IOCCSZ, &cdata); ioc->fp.afi = NVMEF(NVME_FIRMWARE_PAGE_AFI_SLOT, 1); memcpy(ioc->fp.revision[0], cdata.fr, sizeof(cdata.fr)); ioc->hip.power_cycles[0] = 1; ioc->c = init_controller(qp, &cdata); io_controller = ioc; pthread_mutex_unlock(&io_na_mutex); nvmf_free_capsule(nc); handle_admin_qpair(ioc); close(s); return; error: nvmf_free_capsule(nc); close(s); } static void connect_io_qpair(int s, struct nvmf_qpair *qp, struct nvmf_capsule *nc, const struct nvmf_fabric_connect_data *data, uint16_t qid) { struct io_controller *ioc; int error; pthread_mutex_lock(&io_na_mutex); if (io_controller == NULL) { pthread_mutex_unlock(&io_na_mutex); warnx("Attempt to create I/O qpair without admin qpair"); nvmf_send_generic_error(nc, NVME_SC_COMMAND_SEQUENCE_ERROR); goto error; } if (memcmp(io_controller->hostid, data->hostid, sizeof(data->hostid)) != 0) { pthread_mutex_unlock(&io_na_mutex); warnx("hostid mismatch for I/O qpair CONNECT"); nvmf_connect_invalid_parameters(nc, true, offsetof(struct nvmf_fabric_connect_data, hostid)); goto error; } if (le16toh(data->cntlid) != io_controller->cntlid) { pthread_mutex_unlock(&io_na_mutex); warnx("cntlid mismatch for I/O qpair CONNECT"); nvmf_connect_invalid_parameters(nc, true, offsetof(struct nvmf_fabric_connect_data, cntlid)); goto error; } if (memcmp(io_controller->hostnqn, data->hostnqn, sizeof(data->hostnqn)) != 0) { pthread_mutex_unlock(&io_na_mutex); warnx("host NQN mismatch for I/O qpair CONNECT"); nvmf_connect_invalid_parameters(nc, true, offsetof(struct nvmf_fabric_connect_data, hostnqn)); goto error; } if (io_controller->num_io_queues == 0) { pthread_mutex_unlock(&io_na_mutex); warnx("Attempt to create I/O qpair without enabled queues"); nvmf_send_generic_error(nc, NVME_SC_COMMAND_SEQUENCE_ERROR); goto error; } if (qid > io_controller->num_io_queues) { pthread_mutex_unlock(&io_na_mutex); warnx("Attempt to create invalid I/O qpair %u", qid); nvmf_connect_invalid_parameters(nc, false, offsetof(struct nvmf_fabric_connect_cmd, qid)); goto error; } if (io_controller->io_qpairs[qid - 1] != NULL) { pthread_mutex_unlock(&io_na_mutex); warnx("Attempt to re-create I/O qpair %u", qid); nvmf_send_generic_error(nc, NVME_SC_COMMAND_SEQUENCE_ERROR); goto error; } error = nvmf_finish_accept(nc, io_controller->cntlid); if (error != 0) { pthread_mutex_unlock(&io_na_mutex); warnc(error, "Failed to send CONNECT response"); goto error; } ioc = io_controller; ioc->active_io_queues++; ioc->io_qpairs[qid - 1] = qp; 
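/* Publish the socket while still holding io_na_mutex; handle_admin_qpair() closes io_sockets[] entries during controller shutdown to force this queue's thread out of its receive loop. */ 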
ioc->io_sockets[qid - 1] = s; pthread_mutex_unlock(&io_na_mutex); nvmf_free_capsule(nc); handle_io_qpair(ioc, qp, qid); return; error: nvmf_free_capsule(nc); close(s); } static void * io_socket_thread(void *arg) { struct nvmf_fabric_connect_data data; struct nvmf_qpair_params qparams; const struct nvmf_fabric_connect_cmd *cmd; struct nvmf_capsule *nc; struct nvmf_qpair *qp; int s; pthread_detach(pthread_self()); s = (intptr_t)arg; memset(&qparams, 0, sizeof(qparams)); qparams.tcp.fd = s; nc = NULL; qp = nvmf_accept(io_na, &qparams, &nc, &data); if (qp == NULL) { warnx("Failed to create I/O qpair: %s", nvmf_association_error(io_na)); goto error; } if (kernel_io) { ctl_handoff_qpair(qp, nvmf_capsule_sqe(nc), &data); goto error; } if (strcmp(data.subnqn, nqn) != 0) { warn("I/O qpair with invalid SubNQN: %.*s", (int)sizeof(data.subnqn), data.subnqn); nvmf_connect_invalid_parameters(nc, true, offsetof(struct nvmf_fabric_connect_data, subnqn)); goto error; } /* Is this an admin or I/O queue pair? */ cmd = nvmf_capsule_sqe(nc); if (cmd->qid == 0) connect_admin_qpair(s, qp, nc, &data); else connect_io_qpair(s, qp, nc, &data, le16toh(cmd->qid)); nvmf_free_qpair(qp); return (NULL); error: if (nc != NULL) nvmf_free_capsule(nc); if (qp != NULL) nvmf_free_qpair(qp); close(s); return (NULL); } void handle_io_socket(int s) { pthread_t thr; int error; error = pthread_create(&thr, NULL, io_socket_thread, (void *)(uintptr_t)s); if (error != 0) { warnc(error, "Failed to create I/O qpair thread"); close(s); } } diff --git a/usr.sbin/nvmfd/nvmfd.8 b/usr.sbin/nvmfd/nvmfd.8 index 40b1c0e2ebe0..1076583c417c 100644 --- a/usr.sbin/nvmfd/nvmfd.8 +++ b/usr.sbin/nvmfd/nvmfd.8 @@ -1,125 +1,131 @@ .\" .\" SPDX-License-Identifier: BSD-2-Clause .\" .\" Copyright (c) 2024 Chelsio Communications, Inc. .\" .Dd July 25, 2024 .Dt NVMFD 8 .Os .Sh NAME .Nm nvmfd .Nd "NVMeoF controller daemon" .Sh SYNOPSIS .Nm .Fl K .Op Fl dFGg +.Op Fl H Ar MAXH2CDATA .Op Fl P Ar port .Op Fl p Ar port .Op Fl t Ar transport .Op Fl n Ar subnqn .Nm .Op Fl dFGg +.Op Fl H Ar MAXH2CDATA .Op Fl P Ar port .Op Fl p Ar port .Op Fl t Ar transport .Op Fl n Ar subnqn .Ar device .Op Ar device ... .Sh DESCRIPTION .Nm accepts incoming NVMeoF connections for both I/O and discovery controllers. .Nm can either implement a single dynamic I/O controller in user mode or hand off incoming I/O controller connections to .Xr nvmft 4 . A dynamic discovery controller service is always provided in user mode. .Pp The following options are available: .Bl -tag -width "-t transport" .It Fl F Permit remote hosts to disable SQ flow control. .It Fl G Permit remote hosts to enable PDU data digests for the TCP transport. .It Fl g Permit remote hosts to enable PDU header digests for the TCP transport. +.It Fl H Ar MAXH2CDATA +Set the MAXH2CDATA value advertised to the remote host for the TCP transport. +This value is in bytes and determines the maximum data payload size for +data PDUs sent by the remote host. +The value must be at least 4096, a multiple of 4, and defaults to 256KiB. .It Fl K Enable kernel mode which hands off incoming I/O controller connections to .Xr nvmft 4 . .It Fl P Ar port Use .Ar port as the listen TCP port for the discovery controller service. The default value is 8009. .It Fl d Enable debug mode. The daemon sends any errors to standard output and does not place itself in the background. .It Fl p Ar port Use .Ar port as the listen TCP port for the I/O controller service. By default an unused ephemeral port will be chosen. 
.It Fl n Ar subnqn The Subsystem NVMe Qualified Name for the I/O controller. If an explicit NQN is not given, a default value is generated from the current host's UUID obtained from the .Vt kern.hostuuid sysctl. .It Fl t Ar transport The transport type to use. The default transport is .Dq tcp . .It Ar device When implementing a user mode I/O controller, one or more .Ar device arguments must be specified. Each .Ar device describes the backing store for a namespace exported to remote hosts. Devices can be specified using one of the following syntaxes: .Bl -tag -width "ramdisk:size" .It Pa pathname File or disk device .It ramdisk : Ns Ar size Allocate a memory disk with the given .Ar size . .Ar size may use any of the suffixes supported by .Xr expand_number 3 . .El .El .Sh FILES .Bl -tag -width "/var/run/nvmfd.pid" -compact .It Pa /var/run/nvmfd.pid The default location of the .Nm PID file. .El .Sh EXIT STATUS .Ex -std .Sh SEE ALSO .Xr ctl 4 , .Xr nvmft 4 , .Xr ctladm 8 , .Xr ctld 8 .Sh HISTORY The .Nm daemon first appeared in .Fx 15.0 . .Sh AUTHORS The .Nm daemon was developed by .An John Baldwin Aq Mt jhb@FreeBSD.org under sponsorship from Chelsio Communications, Inc. .Sh BUGS The discovery controller and kernel mode functionality of .Nm should be merged into .Xr ctld 8 . .Pp Additional parameters such as -.Va MAXH2CDATA -and queue sizes should be configurable. +queue sizes should be configurable. diff --git a/usr.sbin/nvmfd/nvmfd.c b/usr.sbin/nvmfd/nvmfd.c index cce7a88706d2..df6f400b40e5 100644 --- a/usr.sbin/nvmfd/nvmfd.c +++ b/usr.sbin/nvmfd/nvmfd.c @@ -1,260 +1,271 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2023-2024 Chelsio Communications, Inc. * Written by: John Baldwin */ #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include "internal.h" bool data_digests = false; bool header_digests = false; bool flow_control_disable = false; bool kernel_io = false; +uint32_t maxh2cdata = 256 * 1024; static const char *subnqn; static volatile bool quit = false; static void usage(void) { - fprintf(stderr, "nvmfd -K [-dFGg] [-P port] [-p port] [-t transport] [-n subnqn]\n" - "nvmfd [-dFGg] [-P port] [-p port] [-t transport] [-n subnqn]\n" + fprintf(stderr, "nvmfd -K [-dFGg] [-H MAXH2CDATA] [-P port] [-p port] [-t transport] [-n subnqn]\n" + "nvmfd [-dFGg] [-H MAXH2CDATA] [-P port] [-p port] [-t transport] [-n subnqn]\n" "\tdevice [device [...]]\n" "\n" "Devices use one of the following syntaxes:\n" "\tpathname - file or disk device\n" "\tramdisk:size - memory disk of given size\n"); exit(1); } static void handle_sig(int sig __unused) { quit = true; } static void register_listen_socket(int kqfd, int s, void *udata) { struct kevent kev; if (listen(s, -1) != 0) err(1, "listen"); EV_SET(&kev, s, EVFILT_READ, EV_ADD, 0, 0, udata); if (kevent(kqfd, &kev, 1, NULL, 0, NULL) == -1) err(1, "kevent: failed to add listen socket"); } static void create_passive_sockets(int kqfd, const char *port, bool discovery) { struct addrinfo hints, *ai, *list; bool created; int error, s; memset(&hints, 0, sizeof(hints)); hints.ai_flags = AI_PASSIVE; hints.ai_family = AF_UNSPEC; hints.ai_protocol = IPPROTO_TCP; error = getaddrinfo(NULL, port, &hints, &list); if (error != 0) errx(1, "%s", gai_strerror(error)); created = false; for (ai = list; ai != NULL; ai = ai->ai_next) { s = socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol); if (s == -1) continue; if (bind(s, ai->ai_addr, 
ai->ai_addrlen) != 0) { close(s); continue; } if (discovery) { register_listen_socket(kqfd, s, (void *)1); } else { register_listen_socket(kqfd, s, (void *)2); discovery_add_io_controller(s, subnqn); } created = true; } freeaddrinfo(list); if (!created) err(1, "Failed to create any listen sockets"); } static void handle_connections(int kqfd) { struct kevent ev; int s; signal(SIGHUP, handle_sig); signal(SIGINT, handle_sig); signal(SIGQUIT, handle_sig); signal(SIGTERM, handle_sig); while (!quit) { if (kevent(kqfd, NULL, 0, &ev, 1, NULL) == -1) { if (errno == EINTR) continue; err(1, "kevent"); } assert(ev.filter == EVFILT_READ); s = accept(ev.ident, NULL, NULL); if (s == -1) { warn("accept"); continue; } switch ((uintptr_t)ev.udata) { case 1: handle_discovery_socket(s); break; case 2: handle_io_socket(s); break; default: __builtin_unreachable(); } } } int main(int ac, char **av) { struct pidfh *pfh; const char *dport, *ioport, *transport; pid_t pid; + uint64_t value; int ch, error, kqfd; bool daemonize; static char nqn[NVMF_NQN_MAX_LEN]; /* 7.4.9.3 Default port for discovery */ dport = "8009"; pfh = NULL; daemonize = true; ioport = "0"; subnqn = NULL; transport = "tcp"; - while ((ch = getopt(ac, av, "dFgGKn:P:p:t:")) != -1) { + while ((ch = getopt(ac, av, "dFgGH:Kn:P:p:t:")) != -1) { switch (ch) { case 'd': daemonize = false; break; case 'F': flow_control_disable = true; break; case 'G': data_digests = true; break; case 'g': header_digests = true; break; + case 'H': + if (expand_number(optarg, &value) != 0) + errx(1, "Invalid MAXH2CDATA value %s", optarg); + if (value < 4096 || value > UINT32_MAX || + value % 4 != 0) + errx(1, "Invalid MAXH2CDATA value %s", optarg); + maxh2cdata = value; + break; case 'K': kernel_io = true; break; case 'n': subnqn = optarg; break; case 'P': dport = optarg; break; case 'p': ioport = optarg; break; case 't': transport = optarg; break; default: usage(); } } av += optind; ac -= optind; if (kernel_io) { if (ac > 0) usage(); if (modfind("nvmft") == -1 && kldload("nvmft") == -1) warn("couldn't load nvmft"); } else { if (ac < 1) usage(); } if (strcasecmp(transport, "tcp") == 0) { } else errx(1, "Invalid transport %s", transport); if (subnqn == NULL) { error = nvmf_nqn_from_hostuuid(nqn); if (error != 0) errc(1, error, "Failed to generate NQN"); subnqn = nqn; } if (!kernel_io) register_devices(ac, av); init_discovery(); init_io(subnqn); if (daemonize) { pfh = pidfile_open(NULL, 0600, &pid); if (pfh == NULL) { if (errno == EEXIST) errx(1, "Daemon already running, pid: %jd", (intmax_t)pid); warn("Cannot open or create pidfile"); } if (daemon(0, 0) != 0) { pidfile_remove(pfh); err(1, "Failed to fork into the background"); } pidfile_write(pfh); } kqfd = kqueue(); if (kqfd == -1) { pidfile_remove(pfh); err(1, "kqueue"); } create_passive_sockets(kqfd, dport, true); create_passive_sockets(kqfd, ioport, false); handle_connections(kqfd); shutdown_io(); if (pfh != NULL) pidfile_remove(pfh); return (0); }
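
As a usage sketch (values chosen for illustration), the new option can be exercised by running the daemon in the foreground with a memory-backed namespace:

    nvmfd -d -H 128K ramdisk:1G

The -H argument is parsed with expand_number(3), so "128K" yields 131072, which satisfies the minimum-4096 and multiple-of-4 checks in main(); omitting -H keeps the previous behavior, since maxh2cdata defaults to 256 * 1024.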