diff --git a/usr.sbin/Makefile b/usr.sbin/Makefile --- a/usr.sbin/Makefile +++ b/usr.sbin/Makefile @@ -56,6 +56,7 @@ nfsuserd \ nmtree \ nologin \ + nvmfd \ pciconf \ periodic \ pnfsdscopymr \ diff --git a/usr.sbin/nvmfd/Makefile b/usr.sbin/nvmfd/Makefile new file mode 100644 --- /dev/null +++ b/usr.sbin/nvmfd/Makefile @@ -0,0 +1,14 @@ +.include +.PATH: ${SRCTOP}/sys/libkern + +PACKAGE=nvme-tools +PROG= nvmfd +SRCS= nvmfd.c controller.c ctl.c devices.c discovery.c gsb_crc32.c io.c +CFLAGS+= -I${SRCTOP}/lib/libnvmf +MAN= nvmfd.8 +LIBADD+= nvmf pthread util nv + +.include + +CFLAGS.ctl.c= -I${SRCTOP}/sys +CWARNFLAGS.gsb_crc32.c= -Wno-cast-align diff --git a/usr.sbin/nvmfd/controller.c b/usr.sbin/nvmfd/controller.c new file mode 100644 --- /dev/null +++ b/usr.sbin/nvmfd/controller.c @@ -0,0 +1,244 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2023-2024 Chelsio Communications, Inc. + * Written by: John Baldwin + */ + +#include +#include +#include +#include + +#include "internal.h" + +struct controller { + struct nvmf_qpair *qp; + + uint64_t cap; + uint32_t vs; + uint32_t cc; + uint32_t csts; + + bool shutdown; + + struct nvme_controller_data cdata; +}; + +static bool +update_cc(struct controller *c, uint32_t new_cc) +{ + uint32_t changes; + + if (c->shutdown) + return (false); + if (!nvmf_validate_cc(c->qp, c->cap, c->cc, new_cc)) + return (false); + + changes = c->cc ^ new_cc; + c->cc = new_cc; + + /* Handle shutdown requests. */ + if (NVMEV(NVME_CC_REG_SHN, changes) != 0 && + NVMEV(NVME_CC_REG_SHN, new_cc) != 0) { + c->csts &= ~NVMEM(NVME_CSTS_REG_SHST); + c->csts |= NVMEF(NVME_CSTS_REG_SHST, NVME_SHST_COMPLETE); + c->shutdown = true; + } + + if (NVMEV(NVME_CC_REG_EN, changes) != 0) { + if (NVMEV(NVME_CC_REG_EN, new_cc) == 0) { + /* Controller reset. 
*/ + c->csts = 0; + c->shutdown = true; + } else + c->csts |= NVMEF(NVME_CSTS_REG_RDY, 1); + } + return (true); +} + +static void +handle_property_get(const struct controller *c, const struct nvmf_capsule *nc, + const struct nvmf_fabric_prop_get_cmd *pget) +{ + struct nvmf_fabric_prop_get_rsp rsp; + + nvmf_init_cqe(&rsp, nc, 0); + + switch (le32toh(pget->ofst)) { + case NVMF_PROP_CAP: + if (pget->attrib.size != NVMF_PROP_SIZE_8) + goto error; + rsp.value.u64 = htole64(c->cap); + break; + case NVMF_PROP_VS: + if (pget->attrib.size != NVMF_PROP_SIZE_4) + goto error; + rsp.value.u32.low = htole32(c->vs); + break; + case NVMF_PROP_CC: + if (pget->attrib.size != NVMF_PROP_SIZE_4) + goto error; + rsp.value.u32.low = htole32(c->cc); + break; + case NVMF_PROP_CSTS: + if (pget->attrib.size != NVMF_PROP_SIZE_4) + goto error; + rsp.value.u32.low = htole32(c->csts); + break; + default: + goto error; + } + + nvmf_send_response(nc, &rsp); + return; +error: + nvmf_send_generic_error(nc, NVME_SC_INVALID_FIELD); +} + +static void +handle_property_set(struct controller *c, const struct nvmf_capsule *nc, + const struct nvmf_fabric_prop_set_cmd *pset) +{ + switch (le32toh(pset->ofst)) { + case NVMF_PROP_CC: + if (pset->attrib.size != NVMF_PROP_SIZE_4) + goto error; + if (!update_cc(c, le32toh(pset->value.u32.low))) + goto error; + break; + default: + goto error; + } + + nvmf_send_success(nc); + return; +error: + nvmf_send_generic_error(nc, NVME_SC_INVALID_FIELD); +} + +static void +handle_fabrics_command(struct controller *c, + const struct nvmf_capsule *nc, const struct nvmf_fabric_cmd *fc) +{ + switch (fc->fctype) { + case NVMF_FABRIC_COMMAND_PROPERTY_GET: + handle_property_get(c, nc, + (const struct nvmf_fabric_prop_get_cmd *)fc); + break; + case NVMF_FABRIC_COMMAND_PROPERTY_SET: + handle_property_set(c, nc, + (const struct nvmf_fabric_prop_set_cmd *)fc); + break; + case NVMF_FABRIC_COMMAND_CONNECT: + warnx("CONNECT command on connected queue"); + nvmf_send_generic_error(nc, 
NVME_SC_COMMAND_SEQUENCE_ERROR); + break; + case NVMF_FABRIC_COMMAND_DISCONNECT: + warnx("DISCONNECT command on admin queue"); + nvmf_send_error(nc, NVME_SCT_COMMAND_SPECIFIC, + NVMF_FABRIC_SC_INVALID_QUEUE_TYPE); + break; + default: + warnx("Unsupported fabrics command %#x", fc->fctype); + nvmf_send_generic_error(nc, NVME_SC_INVALID_OPCODE); + break; + } +} + +static void +handle_identify_command(const struct controller *c, + const struct nvmf_capsule *nc, const struct nvme_command *cmd) +{ + uint8_t cns; + + cns = le32toh(cmd->cdw10) & 0xFF; + switch (cns) { + case 1: + break; + default: + warnx("Unsupported CNS %#x for IDENTIFY", cns); + goto error; + } + + nvmf_send_controller_data(nc, &c->cdata, sizeof(c->cdata)); + return; +error: + nvmf_send_generic_error(nc, NVME_SC_INVALID_FIELD); +} + +void +controller_handle_admin_commands(struct controller *c, handle_command *cb, + void *cb_arg) +{ + struct nvmf_qpair *qp = c->qp; + const struct nvme_command *cmd; + struct nvmf_capsule *nc; + int error; + + for (;;) { + error = nvmf_controller_receive_capsule(qp, &nc); + if (error != 0) { + if (error != ECONNRESET) + warnc(error, "Failed to read command capsule"); + break; + } + + cmd = nvmf_capsule_sqe(nc); + + /* + * Only permit Fabrics commands while a controller is + * disabled. 
+ */ + if (NVMEV(NVME_CC_REG_EN, c->cc) == 0 && + cmd->opc != NVME_OPC_FABRICS_COMMANDS) { + warnx("Unsupported admin opcode %#x whiled disabled\n", + cmd->opc); + nvmf_send_generic_error(nc, + NVME_SC_COMMAND_SEQUENCE_ERROR); + nvmf_free_capsule(nc); + continue; + } + + if (cb(nc, cmd, cb_arg)) { + nvmf_free_capsule(nc); + continue; + } + + switch (cmd->opc) { + case NVME_OPC_FABRICS_COMMANDS: + handle_fabrics_command(c, nc, + (const struct nvmf_fabric_cmd *)cmd); + break; + case NVME_OPC_IDENTIFY: + handle_identify_command(c, nc, cmd); + break; + default: + warnx("Unsupported admin opcode %#x", cmd->opc); + nvmf_send_generic_error(nc, NVME_SC_INVALID_OPCODE); + break; + } + nvmf_free_capsule(nc); + } +} + +struct controller * +init_controller(struct nvmf_qpair *qp, + const struct nvme_controller_data *cdata) +{ + struct controller *c; + + c = calloc(1, sizeof(*c)); + c->qp = qp; + c->cap = nvmf_controller_cap(c->qp); + c->vs = cdata->ver; + c->cdata = *cdata; + + return (c); +} + +void +free_controller(struct controller *c) +{ + free(c); +} diff --git a/usr.sbin/nvmfd/ctl.c b/usr.sbin/nvmfd/ctl.c new file mode 100644 --- /dev/null +++ b/usr.sbin/nvmfd/ctl.c @@ -0,0 +1,139 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2023 Chelsio Communications, Inc. 
+ * Written by: John Baldwin + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "internal.h" + +static int ctl_fd = -1; +static int ctl_port; + +static void +open_ctl(void) +{ + if (ctl_fd > 0) + return; + + ctl_fd = open(CTL_DEFAULT_DEV, O_RDWR); + if (ctl_fd == -1 && errno == ENOENT) { + if (kldload("ctl") == -1) + err(1, "Failed to load ctl.ko"); + ctl_fd = open(CTL_DEFAULT_DEV, O_RDWR); + } + if (ctl_fd == -1) + err(1, "Failed to open %s", CTL_DEFAULT_DEV); +} + +void +init_ctl_port(const char *subnqn, const struct nvmf_association_params *params) +{ + char result_buf[256]; + struct ctl_port_entry entry; + struct ctl_req req; + nvlist_t *nvl; + + open_ctl(); + + nvl = nvlist_create(0); + + nvlist_add_string(nvl, "subnqn", subnqn); + + /* XXX: Hardcoded in discovery.c */ + nvlist_add_stringf(nvl, "portid", "%u", 1); + + nvlist_add_stringf(nvl, "max_io_qsize", "%u", params->max_io_qsize); + + memset(&req, 0, sizeof(req)); + strlcpy(req.driver, "nvmf", sizeof(req.driver)); + req.reqtype = CTL_REQ_CREATE; + req.args = nvlist_pack(nvl, &req.args_len); + if (req.args == NULL) + errx(1, "Failed to pack nvlist for CTL_PORT/CTL_REQ_CREATE"); + req.result = result_buf; + req.result_len = sizeof(result_buf); + if (ioctl(ctl_fd, CTL_PORT_REQ, &req) != 0) + err(1, "ioctl(CTL_PORT/CTL_REQ_CREATE)"); + if (req.status == CTL_LUN_ERROR) + errx(1, "Failed to create CTL port: %s", req.error_str); + if (req.status != CTL_LUN_OK) + errx(1, "Failed to create CTL port: %d", req.status); + + nvlist_destroy(nvl); + nvl = nvlist_unpack(result_buf, req.result_len, 0); + if (nvl == NULL) + errx(1, "Failed to unpack nvlist from CTL_PORT/CTL_REQ_CREATE"); + + ctl_port = nvlist_get_number(nvl, "port_id"); + nvlist_destroy(nvl); + + memset(&entry, 0, sizeof(entry)); + entry.targ_port = ctl_port; + if (ioctl(ctl_fd, CTL_ENABLE_PORT, &entry) != 0) + errx(1, "ioctl(CTL_ENABLE_PORT)"); +} + +void 
+shutdown_ctl_port(const char *subnqn) +{ + struct ctl_req req; + nvlist_t *nvl; + + open_ctl(); + + nvl = nvlist_create(0); + + nvlist_add_string(nvl, "subnqn", subnqn); + + memset(&req, 0, sizeof(req)); + strlcpy(req.driver, "nvmf", sizeof(req.driver)); + req.reqtype = CTL_REQ_REMOVE; + req.args = nvlist_pack(nvl, &req.args_len); + if (req.args == NULL) + errx(1, "Failed to pack nvlist for CTL_PORT/CTL_REQ_REMOVE"); + if (ioctl(ctl_fd, CTL_PORT_REQ, &req) != 0) + err(1, "ioctl(CTL_PORT/CTL_REQ_REMOVE)"); + if (req.status == CTL_LUN_ERROR) + errx(1, "Failed to remove CTL port: %s", req.error_str); + if (req.status != CTL_LUN_OK) + errx(1, "Failed to remove CTL port: %d", req.status); + + nvlist_destroy(nvl); +} + +void +ctl_handoff_qpair(struct nvmf_qpair *qp, + const struct nvmf_fabric_connect_cmd *cmd, + const struct nvmf_fabric_connect_data *data) +{ + struct ctl_nvmf req; + int error; + + memset(&req, 0, sizeof(req)); + req.type = CTL_NVMF_HANDOFF; + error = nvmf_handoff_controller_qpair(qp, &req.data.handoff); + if (error != 0) { + warnc(error, "Failed to prepare qpair for handoff"); + return; + } + + req.data.handoff.cmd = cmd; + req.data.handoff.data = data; + if (ioctl(ctl_fd, CTL_NVMF, &req) != 0) + warn("ioctl(CTL_NVMF/CTL_NVMF_HANDOFF)"); +} diff --git a/usr.sbin/nvmfd/devices.c b/usr.sbin/nvmfd/devices.c new file mode 100644 --- /dev/null +++ b/usr.sbin/nvmfd/devices.c @@ -0,0 +1,386 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2023-2024 Chelsio Communications, Inc. 
+ * Written by: John Baldwin + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" + +#define RAMDISK_PREFIX "ramdisk:" + +struct backing_device { + enum { RAMDISK, FILE, CDEV } type; + union { + int fd; /* FILE, CDEV */ + void *mem; /* RAMDISK */ + }; + u_int sector_size; + uint64_t nlbas; + uint64_t eui64; +}; + +static struct backing_device *devices; +static u_int ndevices; + +static uint64_t +generate_eui64(uint32_t low) +{ + return (OUI_FREEBSD_NVME_LOW << 16 | low); +} + +static uint32_t +crc32(const void *buf, size_t len) +{ + return (calculate_crc32c(0xffffffff, buf, len) ^ 0xffffffff); +} + +static void +init_ramdisk(const char *config, struct backing_device *dev) +{ + static uint32_t ramdisk_idx = 1; + uint64_t num; + + dev->type = RAMDISK; + dev->sector_size = 512; + if (expand_number(config, &num)) + errx(1, "Invalid ramdisk specification: %s", config); + if ((num % dev->sector_size) != 0) + errx(1, "Invalid ramdisk size %ju", (uintmax_t)num); + dev->mem = calloc(num, 1); + dev->nlbas = num / dev->sector_size; + dev->eui64 = generate_eui64('M' << 24 | ramdisk_idx++); +} + +static void +init_filedevice(const char *config, int fd, struct stat *sb, + struct backing_device *dev) +{ + dev->type = FILE; + dev->fd = fd; + dev->sector_size = 512; + if ((sb->st_size % dev->sector_size) != 0) + errx(1, "File size is not a multiple of 512: %s", config); + dev->nlbas = sb->st_size / dev->sector_size; + dev->eui64 = generate_eui64('F' << 24 | + (crc32(config, strlen(config)) & 0xffffff)); +} + +static void +init_chardevice(const char *config, int fd, struct backing_device *dev) +{ + off_t len; + + dev->type = CDEV; + dev->fd = fd; + if (ioctl(fd, DIOCGSECTORSIZE, &dev->sector_size) != 0) + err(1, "Failed to fetch sector size for %s", config); + if (ioctl(fd, DIOCGMEDIASIZE, &len) != 0) + err(1, "Failed to fetch sector size for %s", config); + dev->nlbas = len 
/ dev->sector_size; + dev->eui64 = generate_eui64('C' << 24 | + (crc32(config, strlen(config)) & 0xffffff)); +} + +static void +init_device(const char *config, struct backing_device *dev) +{ + struct stat sb; + int fd; + + /* Check for a RAM disk. */ + if (strncmp(RAMDISK_PREFIX, config, strlen(RAMDISK_PREFIX)) == 0) { + init_ramdisk(config + strlen(RAMDISK_PREFIX), dev); + return; + } + + fd = open(config, O_RDWR); + if (fd == -1) + err(1, "Failed to open %s", config); + if (fstat(fd, &sb) == -1) + err(1, "fstat"); + switch (sb.st_mode & S_IFMT) { + case S_IFCHR: + init_chardevice(config, fd, dev); + break; + case S_IFREG: + init_filedevice(config, fd, &sb, dev); + break; + default: + errx(1, "Invalid file type for %s", config); + } +} + +void +register_devices(int ac, char **av) +{ + ndevices = ac; + devices = calloc(ndevices, sizeof(*devices)); + + for (int i = 0; i < ac; i++) + init_device(av[i], &devices[i]); +} + +u_int +device_count(void) +{ + return (ndevices); +} + +static struct backing_device * +lookup_device(uint32_t nsid) +{ + if (nsid == 0 || nsid > ndevices) + return (NULL); + return (&devices[nsid - 1]); +} + +void +device_active_nslist(uint32_t nsid, struct nvme_ns_list *nslist) +{ + u_int count; + + memset(nslist, 0, sizeof(*nslist)); + count = 0; + nsid++; + while (nsid <= ndevices) { + nslist->ns[count] = htole32(nsid); + count++; + if (count == nitems(nslist->ns)) + break; + nsid++; + } +} + +bool +device_identification_descriptor(uint32_t nsid, void *buf) +{ + struct backing_device *dev; + char *p; + + dev = lookup_device(nsid); + if (dev == NULL) + return (false); + + memset(buf, 0, 4096); + + p = buf; + + /* EUI64 */ + *p++ = 1; + *p++ = 8; + p += 2; + be64enc(p, dev->eui64); + return (true); +} + +bool +device_namespace_data(uint32_t nsid, struct nvme_namespace_data *nsdata) +{ + struct backing_device *dev; + + dev = lookup_device(nsid); + if (dev == NULL) + return (false); + + memset(nsdata, 0, sizeof(*nsdata)); + nsdata->nsze = 
htole64(dev->nlbas); + nsdata->ncap = nsdata->nsze; + nsdata->nuse = nsdata->ncap; + nsdata->nlbaf = 1 - 1; + nsdata->flbas = NVMEF(NVME_NS_DATA_FLBAS_FORMAT, 0); + nsdata->lbaf[0] = NVMEF(NVME_NS_DATA_LBAF_LBADS, + ffs(dev->sector_size) - 1); + + be64enc(nsdata->eui64, dev->eui64); + return (true); +} + +static bool +read_buffer(int fd, void *buf, size_t len, off_t offset) +{ + ssize_t nread; + char *dst; + + dst = buf; + while (len > 0) { + nread = pread(fd, dst, len, offset); + if (nread == -1 && errno == EINTR) + continue; + if (nread <= 0) + return (false); + dst += nread; + len -= nread; + offset += nread; + } + return (true); +} + +void +device_read(uint32_t nsid, uint64_t lba, u_int nlb, + const struct nvmf_capsule *nc) +{ + struct backing_device *dev; + char *p, *src; + off_t off; + size_t len; + + dev = lookup_device(nsid); + if (dev == NULL) { + nvmf_send_generic_error(nc, + NVME_SC_INVALID_NAMESPACE_OR_FORMAT); + return; + } + + if (lba + nlb < lba || lba + nlb > dev->nlbas) { + nvmf_send_generic_error(nc, NVME_SC_LBA_OUT_OF_RANGE); + return; + } + + off = lba * dev->sector_size; + len = nlb * dev->sector_size; + if (nvmf_capsule_data_len(nc) != len) { + nvmf_send_generic_error(nc, NVME_SC_INVALID_FIELD); + return; + } + + if (dev->type == RAMDISK) { + p = NULL; + src = (char *)dev->mem + off; + } else { + p = malloc(len); + if (!read_buffer(dev->fd, p, len, off)) { + free(p); + nvmf_send_generic_error(nc, + NVME_SC_INTERNAL_DEVICE_ERROR); + return; + } + src = p; + } + + nvmf_send_controller_data(nc, src, len); + free(p); +} + +static bool +write_buffer(int fd, const void *buf, size_t len, off_t offset) +{ + ssize_t nwritten; + const char *src; + + src = buf; + while (len > 0) { + nwritten = pwrite(fd, src, len, offset); + if (nwritten == -1 && errno == EINTR) + continue; + if (nwritten <= 0) + return (false); + src += nwritten; + len -= nwritten; + offset += nwritten; + } + return (true); +} + +void +device_write(uint32_t nsid, uint64_t lba, u_int 
nlb, + const struct nvmf_capsule *nc) +{ + struct backing_device *dev; + char *p, *dst; + off_t off; + size_t len; + int error; + + dev = lookup_device(nsid); + if (dev == NULL) { + nvmf_send_generic_error(nc, + NVME_SC_INVALID_NAMESPACE_OR_FORMAT); + return; + } + + if (lba + nlb < lba || lba + nlb > dev->nlbas) { + nvmf_send_generic_error(nc, NVME_SC_LBA_OUT_OF_RANGE); + return; + } + + off = lba * dev->sector_size; + len = nlb * dev->sector_size; + if (nvmf_capsule_data_len(nc) != len) { + nvmf_send_generic_error(nc, NVME_SC_INVALID_FIELD); + return; + } + + if (dev->type == RAMDISK) { + p = NULL; + dst = (char *)dev->mem + off; + } else { + p = malloc(len); + dst = p; + } + + error = nvmf_receive_controller_data(nc, 0, dst, len); + if (error != 0) { + nvmf_send_generic_error(nc, NVME_SC_TRANSIENT_TRANSPORT_ERROR); + free(p); + return; + } + + if (dev->type != RAMDISK) { + if (!write_buffer(dev->fd, p, len, off)) { + free(p); + nvmf_send_generic_error(nc, + NVME_SC_INTERNAL_DEVICE_ERROR); + return; + } + } + free(p); + nvmf_send_success(nc); +} + +void +device_flush(uint32_t nsid, const struct nvmf_capsule *nc) +{ + struct backing_device *dev; + + dev = lookup_device(nsid); + if (dev == NULL) { + nvmf_send_generic_error(nc, + NVME_SC_INVALID_NAMESPACE_OR_FORMAT); + return; + } + + switch (dev->type) { + case RAMDISK: + break; + case FILE: + if (fdatasync(dev->fd) == -1) { + nvmf_send_error(nc, NVME_SCT_MEDIA_ERROR, + NVME_SC_WRITE_FAULTS); + return; + } + break; + case CDEV: + if (ioctl(dev->fd, DIOCGFLUSH) == -1) { + nvmf_send_error(nc, NVME_SCT_MEDIA_ERROR, + NVME_SC_WRITE_FAULTS); + return; + } + } + + nvmf_send_success(nc); +} diff --git a/usr.sbin/nvmfd/discovery.c b/usr.sbin/nvmfd/discovery.c new file mode 100644 --- /dev/null +++ b/usr.sbin/nvmfd/discovery.c @@ -0,0 +1,343 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2023-2024 Chelsio Communications, Inc. 
+ * Written by: John Baldwin + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" + +struct io_controller_data { + struct nvme_discovery_log_entry entry; + bool wildcard; +}; + +struct discovery_controller { + struct nvme_discovery_log *discovery_log; + size_t discovery_log_len; + int s; +}; + +struct discovery_thread_arg { + struct controller *c; + struct nvmf_qpair *qp; + int s; +}; + +static struct io_controller_data *io_controllers; +static struct nvmf_association *discovery_na; +static u_int num_io_controllers; + +static bool +init_discovery_log_entry(struct nvme_discovery_log_entry *entry, int s, + const char *subnqn) +{ + struct sockaddr_storage ss; + socklen_t len; + bool wildcard; + + len = sizeof(ss); + if (getsockname(s, (struct sockaddr *)&ss, &len) == -1) + err(1, "getsockname"); + + memset(entry, 0, sizeof(*entry)); + entry->trtype = NVMF_TRTYPE_TCP; + switch (ss.ss_family) { + case AF_INET: + { + struct sockaddr_in *sin; + + sin = (struct sockaddr_in *)&ss; + entry->adrfam = NVMF_ADRFAM_IPV4; + snprintf(entry->trsvcid, sizeof(entry->trsvcid), "%u", + htons(sin->sin_port)); + if (inet_ntop(AF_INET, &sin->sin_addr, entry->traddr, + sizeof(entry->traddr)) == NULL) + err(1, "inet_ntop"); + wildcard = (sin->sin_addr.s_addr == htonl(INADDR_ANY)); + break; + } + case AF_INET6: + { + struct sockaddr_in6 *sin6; + + sin6 = (struct sockaddr_in6 *)&ss; + entry->adrfam = NVMF_ADRFAM_IPV6; + snprintf(entry->trsvcid, sizeof(entry->trsvcid), "%u", + htons(sin6->sin6_port)); + if (inet_ntop(AF_INET6, &sin6->sin6_addr, entry->traddr, + sizeof(entry->traddr)) == NULL) + err(1, "inet_ntop"); + wildcard = (memcmp(&sin6->sin6_addr, &in6addr_any, + sizeof(in6addr_any)) == 0); + break; + } + default: + errx(1, "Unsupported address family %u", ss.ss_family); + } + entry->subtype = NVMF_SUBTYPE_NVME; + if (flow_control_disable) + entry->treq |= (1 << 2); + entry->portid = htole16(1); + 
entry->cntlid = htole16(NVMF_CNTLID_DYNAMIC); + entry->aqsz = NVME_MAX_ADMIN_ENTRIES; + strlcpy(entry->subnqn, subnqn, sizeof(entry->subnqn)); + return (wildcard); +} + +void +init_discovery(void) +{ + struct nvmf_association_params aparams; + + memset(&aparams, 0, sizeof(aparams)); + aparams.sq_flow_control = false; + aparams.dynamic_controller_model = true; + aparams.max_admin_qsize = NVME_MAX_ADMIN_ENTRIES; + aparams.tcp.pda = 0; + aparams.tcp.header_digests = header_digests; + aparams.tcp.data_digests = data_digests; + aparams.tcp.maxr2t = 1; + aparams.tcp.maxh2cdata = 256 * 1024; + discovery_na = nvmf_allocate_association(NVMF_TRTYPE_TCP, true, + &aparams); + if (discovery_na == NULL) + err(1, "Failed to create discovery association"); +} + +void +discovery_add_io_controller(int s, const char *subnqn) +{ + struct io_controller_data *icd; + + io_controllers = reallocf(io_controllers, (num_io_controllers + 1) * + sizeof(*io_controllers)); + + icd = &io_controllers[num_io_controllers]; + num_io_controllers++; + + icd->wildcard = init_discovery_log_entry(&icd->entry, s, subnqn); +} + +static void +build_discovery_log_page(struct discovery_controller *dc) +{ + struct sockaddr_storage ss; + socklen_t len; + char traddr[256]; + u_int i, nentries; + uint8_t adrfam; + + if (dc->discovery_log != NULL) + return; + + len = sizeof(ss); + if (getsockname(dc->s, (struct sockaddr *)&ss, &len) == -1) { + warn("build_discovery_log_page: getsockname"); + return; + } + + memset(traddr, 0, sizeof(traddr)); + switch (ss.ss_family) { + case AF_INET: + { + struct sockaddr_in *sin; + + sin = (struct sockaddr_in *)&ss; + adrfam = NVMF_ADRFAM_IPV4; + if (inet_ntop(AF_INET, &sin->sin_addr, traddr, + sizeof(traddr)) == NULL) { + warn("build_discovery_log_page: inet_ntop"); + return; + } + break; + } + case AF_INET6: + { + struct sockaddr_in6 *sin6; + + sin6 = (struct sockaddr_in6 *)&ss; + adrfam = NVMF_ADRFAM_IPV6; + if (inet_ntop(AF_INET6, &sin6->sin6_addr, traddr, + sizeof(traddr)) == 
NULL) {
+			warn("build_discovery_log_page: inet_ntop");
+			return;
+		}
+		break;
+	}
+	default:
+		assert(false);
+	}
+
+	/* First pass: count entries that match this listener's address family. */
+	nentries = 0;
+	for (i = 0; i < num_io_controllers; i++) {
+		if (io_controllers[i].wildcard &&
+		    io_controllers[i].entry.adrfam != adrfam)
+			continue;
+		nentries++;
+	}
+
+	dc->discovery_log_len = sizeof(*dc->discovery_log) +
+	    nentries * sizeof(struct nvme_discovery_log_entry);
+	dc->discovery_log = calloc(dc->discovery_log_len, 1);
+
+	/* numrec is a little-endian 64-bit field in the log page. */
+	dc->discovery_log->numrec = htole64(nentries);
+	dc->discovery_log->recfmt = 0;
+
+	/*
+	 * Second pass: copy matching entries, rewriting wildcard
+	 * addresses to the address the client actually connected to.
+	 * nentries must advance per entry or they all land in slot 0.
+	 */
+	nentries = 0;
+	for (i = 0; i < num_io_controllers; i++) {
+		if (io_controllers[i].wildcard &&
+		    io_controllers[i].entry.adrfam != adrfam)
+			continue;
+
+		dc->discovery_log->entries[nentries] = io_controllers[i].entry;
+		if (io_controllers[i].wildcard)
+			memcpy(dc->discovery_log->entries[nentries].traddr,
+			    traddr, sizeof(traddr));
+		nentries++;
+	}
+}
+
+static void
+handle_get_log_page_command(const struct nvmf_capsule *nc,
+    const struct nvme_command *cmd, struct discovery_controller *dc)
+{
+	uint64_t offset;
+	uint32_t length;
+
+	switch (nvmf_get_log_page_id(cmd)) {
+	case NVME_LOG_DISCOVERY:
+		break;
+	default:
+		warnx("Unsupported log page %u for discovery controller",
+		    nvmf_get_log_page_id(cmd));
+		goto error;
+	}
+
+	build_discovery_log_page(dc);
+
+	offset = nvmf_get_log_page_offset(cmd);
+	if (offset >= dc->discovery_log_len)
+		goto error;
+
+	length = nvmf_get_log_page_length(cmd);
+	if (length > dc->discovery_log_len - offset)
+		length = dc->discovery_log_len - offset;
+
+	nvmf_send_controller_data(nc, (char *)dc->discovery_log + offset,
+	    length);
+	return;
+error:
+	nvmf_send_generic_error(nc, NVME_SC_INVALID_FIELD);
+}
+
+static bool
+discovery_command(const struct nvmf_capsule *nc, const struct nvme_command *cmd,
+    void *arg)
+{
+	struct discovery_controller *dc = arg;
+
+	switch (cmd->opc) {
+	case NVME_OPC_GET_LOG_PAGE:
+		handle_get_log_page_command(nc, cmd, dc);
+		return (true);
+	default:
+		return (false);
+	}
+}
+
+static void *
+discovery_thread(void *arg)
+{
+	struct discovery_thread_arg *dta = arg;
+	struct discovery_controller dc;
+
+	pthread_detach(pthread_self());
+
+	memset(&dc, 0, sizeof(dc));
+	dc.s = dta->s;
+
+	controller_handle_admin_commands(dta->c, discovery_command, &dc);
+
+	free(dc.discovery_log);
+	free_controller(dta->c);
+
+	nvmf_free_qpair(dta->qp);
+
+	close(dta->s);
+	free(dta);
+	return (NULL);
+}
+
+void
+handle_discovery_socket(int s)
+{
+	struct nvmf_fabric_connect_data data;
+	struct nvme_controller_data cdata;
+	struct nvmf_qpair_params qparams;
+	struct discovery_thread_arg *dta;
+	struct nvmf_capsule *nc;
+	struct nvmf_qpair *qp;
+	pthread_t thr;
+	int error;
+
+	memset(&qparams, 0, sizeof(qparams));
+	qparams.tcp.fd = s;
+
+	nc = NULL;
+	qp = nvmf_accept(discovery_na, &qparams, &nc, &data);
+	if (qp == NULL) {
+		warnx("Failed to create discovery qpair: %s",
+		    nvmf_association_error(discovery_na));
+		goto error;
+	}
+
+	if (strcmp(data.subnqn, NVMF_DISCOVERY_NQN) != 0) {
+		/* warnx, not warn: strcmp does not set errno. */
+		warnx("Discovery qpair with invalid SubNQN: %.*s",
+		    (int)sizeof(data.subnqn), data.subnqn);
+		nvmf_connect_invalid_parameters(nc, true,
+		    offsetof(struct nvmf_fabric_connect_data, subnqn));
+		goto error;
+	}
+
+	/* Just use a controller ID of 1 for all discovery controllers. 
*/ + error = nvmf_finish_accept(nc, 1); + if (error != 0) { + warnc(error, "Failed to send CONNECT reponse"); + goto error; + } + + nvmf_init_discovery_controller_data(qp, &cdata); + + dta = malloc(sizeof(*dta)); + dta->qp = qp; + dta->s = s; + dta->c = init_controller(qp, &cdata); + + error = pthread_create(&thr, NULL, discovery_thread, dta); + if (error != 0) { + warnc(error, "Failed to create discovery thread"); + free_controller(dta->c); + free(dta); + goto error; + } + + nvmf_free_capsule(nc); + return; + +error: + if (nc != NULL) + nvmf_free_capsule(nc); + if (qp != NULL) + nvmf_free_qpair(qp); + close(s); +} diff --git a/usr.sbin/nvmfd/internal.h b/usr.sbin/nvmfd/internal.h new file mode 100644 --- /dev/null +++ b/usr.sbin/nvmfd/internal.h @@ -0,0 +1,65 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2023-2024 Chelsio Communications, Inc. + * Written by: John Baldwin + */ + +#ifndef __INTERNAL_H__ +#define __INTERNAL_H__ + +#include + +struct controller; +struct nvme_command; +struct nvme_controller_data; +struct nvme_ns_list; +struct nvmf_capsule; +struct nvmf_qpair; + +typedef bool handle_command(const struct nvmf_capsule *, + const struct nvme_command *, void *); + +extern bool data_digests; +extern bool header_digests; +extern bool flow_control_disable; +extern bool kernel_io; + +/* controller.c */ +void controller_handle_admin_commands(struct controller *c, + handle_command *cb, void *cb_arg); +struct controller *init_controller(struct nvmf_qpair *qp, + const struct nvme_controller_data *cdata); +void free_controller(struct controller *c); + +/* discovery.c */ +void init_discovery(void); +void handle_discovery_socket(int s); +void discovery_add_io_controller(int s, const char *subnqn); + +/* io.c */ +void init_io(const char *subnqn); +void handle_io_socket(int s); +void shutdown_io(void); + +/* devices.c */ +void register_devices(int ac, char **av); +u_int device_count(void); +void device_active_nslist(uint32_t nsid, struct 
nvme_ns_list *nslist); +bool device_identification_descriptor(uint32_t nsid, void *buf); +bool device_namespace_data(uint32_t nsid, struct nvme_namespace_data *nsdata); +void device_read(uint32_t nsid, uint64_t lba, u_int nlb, + const struct nvmf_capsule *nc); +void device_write(uint32_t nsid, uint64_t lba, u_int nlb, + const struct nvmf_capsule *nc); +void device_flush(uint32_t nsid, const struct nvmf_capsule *nc); + +/* ctl.c */ +void init_ctl_port(const char *subnqn, + const struct nvmf_association_params *params); +void ctl_handoff_qpair(struct nvmf_qpair *qp, + const struct nvmf_fabric_connect_cmd *cmd, + const struct nvmf_fabric_connect_data *data); +void shutdown_ctl_port(const char *subnqn); + +#endif /* !__INTERNAL_H__ */ diff --git a/usr.sbin/nvmfd/io.c b/usr.sbin/nvmfd/io.c new file mode 100644 --- /dev/null +++ b/usr.sbin/nvmfd/io.c @@ -0,0 +1,677 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2023-2024 Chelsio Communications, Inc. + * Written by: John Baldwin + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" + +struct io_controller { + struct controller *c; + + u_int num_io_queues; + u_int active_io_queues; + struct nvmf_qpair **io_qpairs; + int *io_sockets; + + struct nvme_firmware_page fp; + struct nvme_health_information_page hip; + uint16_t partial_dur; + uint16_t partial_duw; + + uint16_t cntlid; + char hostid[16]; + char hostnqn[NVME_NQN_FIELD_SIZE]; +}; + +static struct nvmf_association *io_na; +static pthread_cond_t io_cond; +static pthread_mutex_t io_na_mutex; +static struct io_controller *io_controller; +static const char *nqn; +static char serial[NVME_SERIAL_NUMBER_LENGTH]; + +void +init_io(const char *subnqn) +{ + struct nvmf_association_params aparams; + u_long hostid; + size_t len; + + memset(&aparams, 0, sizeof(aparams)); + aparams.sq_flow_control = !flow_control_disable; + aparams.dynamic_controller_model = true; + aparams.max_admin_qsize = 
NVME_MAX_ADMIN_ENTRIES;
+	aparams.max_io_qsize = NVMF_MAX_IO_ENTRIES;
+	aparams.tcp.pda = 0;
+	aparams.tcp.header_digests = header_digests;
+	aparams.tcp.data_digests = data_digests;
+	aparams.tcp.maxr2t = 1;
+	aparams.tcp.maxh2cdata = 256 * 1024;
+	io_na = nvmf_allocate_association(NVMF_TRTYPE_TCP, true,
+	    &aparams);
+	if (io_na == NULL)
+		err(1, "Failed to create I/O controller association");
+
+	nqn = subnqn;
+
+	/* Generate a serial number from the kern.hostid node. */
+	len = sizeof(hostid);
+	if (sysctlbyname("kern.hostid", &hostid, &len, NULL, 0) == -1)
+		err(1, "sysctl: kern.hostid");
+
+	nvmf_controller_serial(serial, sizeof(serial), hostid);
+
+	pthread_cond_init(&io_cond, NULL);
+	pthread_mutex_init(&io_na_mutex, NULL);
+
+	if (kernel_io)
+		init_ctl_port(subnqn, &aparams);
+}
+
+void
+shutdown_io(void)
+{
+	if (kernel_io)
+		shutdown_ctl_port(nqn);
+}
+
+static void
+handle_get_log_page(struct io_controller *ioc, const struct nvmf_capsule *nc,
+    const struct nvme_command *cmd)
+{
+	uint64_t offset;
+	uint32_t numd;
+	size_t len;
+	uint8_t lid;
+
+	lid = le32toh(cmd->cdw10) & 0xff;
+	numd = le32toh(cmd->cdw10) >> 16 | le32toh(cmd->cdw11) << 16;
+	offset = le32toh(cmd->cdw12) | (uint64_t)le32toh(cmd->cdw13) << 32;
+
+	/* Log page offsets must be dword (4 byte) aligned. */
+	if (offset % 4 != 0)
+		goto error;
+
+	len = (numd + 1) * 4;
+
+	switch (lid) {
+	case NVME_LOG_ERROR:
+	{
+		void *buf;
+
+		if (len % sizeof(struct nvme_error_information_entry) != 0)
+			goto error;
+
+		buf = calloc(1, len);
+		nvmf_send_controller_data(nc, buf, len);
+		free(buf);
+		return;
+	}
+	case NVME_LOG_HEALTH_INFORMATION:
+		if (len != sizeof(ioc->hip))
+			goto error;
+
+		nvmf_send_controller_data(nc, &ioc->hip, sizeof(ioc->hip));
+		return;
+	case NVME_LOG_FIRMWARE_SLOT:
+		if (len != sizeof(ioc->fp))
+			goto error;
+
+		nvmf_send_controller_data(nc, &ioc->fp, sizeof(ioc->fp));
+		return;
+	default:
+		warnx("Unsupported page %#x for GET_LOG_PAGE", lid);
+		goto error;
+	}
+
+error:
+	nvmf_send_generic_error(nc, NVME_SC_INVALID_FIELD);
+}
+
+static bool +handle_io_identify_command(const struct nvmf_capsule *nc, + const struct nvme_command *cmd) +{ + struct nvme_namespace_data nsdata; + struct nvme_ns_list nslist; + uint32_t nsid; + uint8_t cns; + + cns = le32toh(cmd->cdw10) & 0xFF; + switch (cns) { + case 0: /* Namespace data. */ + if (!device_namespace_data(le32toh(cmd->nsid), &nsdata)) { + nvmf_send_generic_error(nc, + NVME_SC_INVALID_NAMESPACE_OR_FORMAT); + return (true); + } + + nvmf_send_controller_data(nc, &nsdata, sizeof(nsdata)); + return (true); + case 2: /* Active namespace list. */ + nsid = le32toh(cmd->nsid); + if (nsid >= 0xfffffffe) { + nvmf_send_generic_error(nc, NVME_SC_INVALID_FIELD); + return (true); + } + + device_active_nslist(nsid, &nslist); + nvmf_send_controller_data(nc, &nslist, sizeof(nslist)); + return (true); + case 3: /* Namespace Identification Descriptor list. */ + if (!device_identification_descriptor(le32toh(cmd->nsid), + &nsdata)) { + nvmf_send_generic_error(nc, + NVME_SC_INVALID_NAMESPACE_OR_FORMAT); + return (true); + } + + nvmf_send_controller_data(nc, &nsdata, sizeof(nsdata)); + return (true); + default: + return (false); + } +} + +static void +handle_set_features(struct io_controller *ioc, const struct nvmf_capsule *nc, + const struct nvme_command *cmd) +{ + struct nvme_completion cqe; + uint8_t fid; + + fid = NVMEV(NVME_FEAT_SET_FID, le32toh(cmd->cdw10)); + switch (fid) { + case NVME_FEAT_NUMBER_OF_QUEUES: + { + uint32_t num_queues; + + if (ioc->num_io_queues != 0) { + nvmf_send_generic_error(nc, + NVME_SC_COMMAND_SEQUENCE_ERROR); + return; + } + + num_queues = le32toh(cmd->cdw11) & 0xffff; + + /* 5.12.1.7: 65535 is invalid. */ + if (num_queues == 65535) + goto error; + + /* Fabrics requires the same number of SQs and CQs. */ + if (le32toh(cmd->cdw11) >> 16 != num_queues) + goto error; + + /* Convert to 1's based */ + num_queues++; + + /* Lock to synchronize with handle_io_qpair. 
*/ + pthread_mutex_lock(&io_na_mutex); + ioc->num_io_queues = num_queues; + ioc->io_qpairs = calloc(num_queues, sizeof(*ioc->io_qpairs)); + ioc->io_sockets = calloc(num_queues, sizeof(*ioc->io_sockets)); + pthread_mutex_unlock(&io_na_mutex); + + nvmf_init_cqe(&cqe, nc, 0); + cqe.cdw0 = cmd->cdw11; + nvmf_send_response(nc, &cqe); + return; + } + case NVME_FEAT_ASYNC_EVENT_CONFIGURATION: + { + uint32_t aer_mask; + + aer_mask = le32toh(cmd->cdw11); + + /* Check for any reserved or unimplemented feature bits. */ + if ((aer_mask & 0xffffc000) != 0) + goto error; + + /* No AERs are generated by this daemon. */ + nvmf_send_success(nc); + return; + } + default: + warnx("Unsupported feature ID %u for SET_FEATURES", fid); + goto error; + } + +error: + nvmf_send_generic_error(nc, NVME_SC_INVALID_FIELD); +} + +static bool +admin_command(const struct nvmf_capsule *nc, const struct nvme_command *cmd, + void *arg) +{ + struct io_controller *ioc = arg; + + switch (cmd->opc) { + case NVME_OPC_GET_LOG_PAGE: + handle_get_log_page(ioc, nc, cmd); + return (true); + case NVME_OPC_IDENTIFY: + return (handle_io_identify_command(nc, cmd)); + case NVME_OPC_SET_FEATURES: + handle_set_features(ioc, nc, cmd); + return (true); + case NVME_OPC_ASYNC_EVENT_REQUEST: + /* Ignore and never complete. */ + return (true); + case NVME_OPC_KEEP_ALIVE: + nvmf_send_success(nc); + return (true); + default: + return (false); + } +} + +static void +handle_admin_qpair(struct io_controller *ioc) +{ + pthread_setname_np(pthread_self(), "admin queue"); + + controller_handle_admin_commands(ioc->c, admin_command, ioc); + + pthread_mutex_lock(&io_na_mutex); + for (u_int i = 0; i < ioc->num_io_queues; i++) { + if (ioc->io_qpairs[i] == NULL || ioc->io_sockets[i] == -1) + continue; + close(ioc->io_sockets[i]); + ioc->io_sockets[i] = -1; + } + + /* Wait for I/O threads to notice. 
*/ + while (ioc->active_io_queues > 0) + pthread_cond_wait(&io_cond, &io_na_mutex); + + io_controller = NULL; + pthread_mutex_unlock(&io_na_mutex); + + free_controller(ioc->c); + + free(ioc); +} + +static bool +handle_io_fabrics_command(const struct nvmf_capsule *nc, + const struct nvmf_fabric_cmd *fc) +{ + switch (fc->fctype) { + case NVMF_FABRIC_COMMAND_CONNECT: + warnx("CONNECT command on connected queue"); + nvmf_send_generic_error(nc, NVME_SC_COMMAND_SEQUENCE_ERROR); + break; + case NVMF_FABRIC_COMMAND_DISCONNECT: + { + const struct nvmf_fabric_disconnect_cmd *dis = + (const struct nvmf_fabric_disconnect_cmd *)fc; + if (dis->recfmt != htole16(0)) { + nvmf_send_error(nc, NVME_SCT_COMMAND_SPECIFIC, + NVMF_FABRIC_SC_INCOMPATIBLE_FORMAT); + break; + } + nvmf_send_success(nc); + return (true); + } + default: + warnx("Unsupported fabrics command %#x", fc->fctype); + nvmf_send_generic_error(nc, NVME_SC_INVALID_OPCODE); + break; + } + + return (false); +} + +static void +hip_add(uint64_t pair[2], uint64_t addend) +{ + uint64_t old, new; + + old = le64toh(pair[0]); + new = old + addend; + pair[0] = htole64(new); + if (new < old) + pair[1] += htole64(1); +} + +static uint64_t +cmd_lba(const struct nvme_command *cmd) +{ + return ((uint64_t)le32toh(cmd->cdw11) << 32 | le32toh(cmd->cdw10)); +} + +static u_int +cmd_nlb(const struct nvme_command *cmd) +{ + return ((le32toh(cmd->cdw12) & 0xffff) + 1); +} + +static void +handle_read(struct io_controller *ioc, const struct nvmf_capsule *nc, + const struct nvme_command *cmd) +{ + size_t len; + + len = nvmf_capsule_data_len(nc); + device_read(le32toh(cmd->nsid), cmd_lba(cmd), cmd_nlb(cmd), nc); + hip_add(ioc->hip.host_read_commands, 1); + + len /= 512; + len += ioc->partial_dur; + if (len > 1000) + hip_add(ioc->hip.data_units_read, len / 1000); + ioc->partial_dur = len % 1000; +} + +static void +handle_write(struct io_controller *ioc, const struct nvmf_capsule *nc, + const struct nvme_command *cmd) +{ + size_t len; + + len = 
nvmf_capsule_data_len(nc); + device_write(le32toh(cmd->nsid), cmd_lba(cmd), cmd_nlb(cmd), nc); + hip_add(ioc->hip.host_write_commands, 1); + + len /= 512; + len += ioc->partial_duw; + if (len > 1000) + hip_add(ioc->hip.data_units_written, len / 1000); + ioc->partial_duw = len % 1000; +} + +static void +handle_flush(const struct nvmf_capsule *nc, const struct nvme_command *cmd) +{ + device_flush(le32toh(cmd->nsid), nc); +} + +static bool +handle_io_commands(struct io_controller *ioc, struct nvmf_qpair *qp) +{ + const struct nvme_command *cmd; + struct nvmf_capsule *nc; + int error; + bool disconnect; + + disconnect = false; + + while (!disconnect) { + error = nvmf_controller_receive_capsule(qp, &nc); + if (error != 0) { + if (error != ECONNRESET) + warnc(error, "Failed to read command capsule"); + break; + } + + cmd = nvmf_capsule_sqe(nc); + + switch (cmd->opc) { + case NVME_OPC_FLUSH: + if (cmd->nsid == htole32(0xffffffff)) { + nvmf_send_generic_error(nc, + NVME_SC_INVALID_NAMESPACE_OR_FORMAT); + break; + } + handle_flush(nc, cmd); + break; + case NVME_OPC_WRITE: + handle_write(ioc, nc, cmd); + break; + case NVME_OPC_READ: + handle_read(ioc, nc, cmd); + break; + case NVME_OPC_FABRICS_COMMANDS: + disconnect = handle_io_fabrics_command(nc, + (const struct nvmf_fabric_cmd *)cmd); + break; + default: + warnx("Unsupported NVM opcode %#x", cmd->opc); + nvmf_send_generic_error(nc, NVME_SC_INVALID_OPCODE); + break; + } + nvmf_free_capsule(nc); + } + + return (disconnect); +} + +static void +handle_io_qpair(struct io_controller *ioc, struct nvmf_qpair *qp, int qid) +{ + char name[64]; + bool disconnect; + + snprintf(name, sizeof(name), "I/O queue %d", qid); + pthread_setname_np(pthread_self(), name); + + disconnect = handle_io_commands(ioc, qp); + + pthread_mutex_lock(&io_na_mutex); + if (disconnect) + ioc->io_qpairs[qid - 1] = NULL; + if (ioc->io_sockets[qid - 1] != -1) { + close(ioc->io_sockets[qid - 1]); + ioc->io_sockets[qid - 1] = -1; + } + ioc->active_io_queues--; + 
if (ioc->active_io_queues == 0) + pthread_cond_broadcast(&io_cond); + pthread_mutex_unlock(&io_na_mutex); +} + +static void +connect_admin_qpair(int s, struct nvmf_qpair *qp, struct nvmf_capsule *nc, + const struct nvmf_fabric_connect_data *data) +{ + struct nvme_controller_data cdata; + struct io_controller *ioc; + int error; + + /* Can only have one active I/O controller at a time. */ + pthread_mutex_lock(&io_na_mutex); + if (io_controller != NULL) { + pthread_mutex_unlock(&io_na_mutex); + nvmf_send_error(nc, NVME_SCT_COMMAND_SPECIFIC, + NVMF_FABRIC_SC_CONTROLLER_BUSY); + goto error; + } + + error = nvmf_finish_accept(nc, 2); + if (error != 0) { + pthread_mutex_unlock(&io_na_mutex); + warnc(error, "Failed to send CONNECT response"); + goto error; + } + + ioc = calloc(1, sizeof(*ioc)); + ioc->cntlid = 2; + memcpy(ioc->hostid, data->hostid, sizeof(ioc->hostid)); + memcpy(ioc->hostnqn, data->hostnqn, sizeof(ioc->hostnqn)); + + nvmf_init_io_controller_data(qp, serial, nqn, device_count(), + NVMF_IOCCSZ, &cdata); + + ioc->fp.afi = NVMEF(NVME_FIRMWARE_PAGE_AFI_SLOT, 1); + memcpy(ioc->fp.revision[0], cdata.fr, sizeof(cdata.fr)); + + ioc->hip.power_cycles[0] = 1; + + ioc->c = init_controller(qp, &cdata); + + io_controller = ioc; + pthread_mutex_unlock(&io_na_mutex); + + nvmf_free_capsule(nc); + + handle_admin_qpair(ioc); + close(s); + return; + +error: + nvmf_free_capsule(nc); + close(s); +} + +static void +connect_io_qpair(int s, struct nvmf_qpair *qp, struct nvmf_capsule *nc, + const struct nvmf_fabric_connect_data *data, uint16_t qid) +{ + struct io_controller *ioc; + int error; + + pthread_mutex_lock(&io_na_mutex); + if (io_controller == NULL) { + pthread_mutex_unlock(&io_na_mutex); + warnx("Attempt to create I/O qpair without admin qpair"); + nvmf_send_generic_error(nc, NVME_SC_COMMAND_SEQUENCE_ERROR); + goto error; + } + + if (memcmp(io_controller->hostid, data->hostid, + sizeof(data->hostid)) != 0) { + pthread_mutex_unlock(&io_na_mutex); + warnx("hostid mismatch 
for I/O qpair CONNECT"); + nvmf_connect_invalid_parameters(nc, true, + offsetof(struct nvmf_fabric_connect_data, hostid)); + goto error; + } + if (le16toh(data->cntlid) != io_controller->cntlid) { + pthread_mutex_unlock(&io_na_mutex); + warnx("cntlid mismatch for I/O qpair CONNECT"); + nvmf_connect_invalid_parameters(nc, true, + offsetof(struct nvmf_fabric_connect_data, cntlid)); + goto error; + } + if (memcmp(io_controller->hostnqn, data->hostnqn, + sizeof(data->hostid)) != 0) { + pthread_mutex_unlock(&io_na_mutex); + warnx("host NQN mismatch for I/O qpair CONNECT"); + nvmf_connect_invalid_parameters(nc, true, + offsetof(struct nvmf_fabric_connect_data, hostnqn)); + goto error; + } + + if (io_controller->num_io_queues == 0) { + pthread_mutex_unlock(&io_na_mutex); + warnx("Attempt to create I/O qpair without enabled queues"); + nvmf_send_generic_error(nc, NVME_SC_COMMAND_SEQUENCE_ERROR); + goto error; + } + if (qid > io_controller->num_io_queues) { + pthread_mutex_unlock(&io_na_mutex); + warnx("Attempt to create invalid I/O qpair %u", qid); + nvmf_connect_invalid_parameters(nc, false, + offsetof(struct nvmf_fabric_connect_cmd, qid)); + goto error; + } + if (io_controller->io_qpairs[qid - 1] != NULL) { + pthread_mutex_unlock(&io_na_mutex); + warnx("Attempt to re-create I/O qpair %u", qid); + nvmf_send_generic_error(nc, NVME_SC_COMMAND_SEQUENCE_ERROR); + goto error; + } + + error = nvmf_finish_accept(nc, io_controller->cntlid); + if (error != 0) { + pthread_mutex_unlock(&io_na_mutex); + warnc(error, "Failed to send CONNECT response"); + goto error; + } + + ioc = io_controller; + ioc->active_io_queues++; + ioc->io_qpairs[qid - 1] = qp; + ioc->io_sockets[qid - 1] = s; + pthread_mutex_unlock(&io_na_mutex); + + nvmf_free_capsule(nc); + + handle_io_qpair(ioc, qp, qid); + return; + +error: + nvmf_free_capsule(nc); + close(s); +} + +static void * +io_socket_thread(void *arg) +{ + struct nvmf_fabric_connect_data data; + struct nvmf_qpair_params qparams; + const struct 
nvmf_fabric_connect_cmd *cmd; + struct nvmf_capsule *nc; + struct nvmf_qpair *qp; + int s; + + pthread_detach(pthread_self()); + + s = (intptr_t)arg; + memset(&qparams, 0, sizeof(qparams)); + qparams.tcp.fd = s; + + nc = NULL; + qp = nvmf_accept(io_na, &qparams, &nc, &data); + if (qp == NULL) { + warnx("Failed to create I/O qpair: %s", + nvmf_association_error(io_na)); + goto error; + } + + if (kernel_io) { + ctl_handoff_qpair(qp, nvmf_capsule_sqe(nc), &data); + goto error; + } + + if (strcmp(data.subnqn, nqn) != 0) { + warn("I/O qpair with invalid SubNQN: %.*s", + (int)sizeof(data.subnqn), data.subnqn); + nvmf_connect_invalid_parameters(nc, true, + offsetof(struct nvmf_fabric_connect_data, subnqn)); + goto error; + } + + /* Is this an admin or I/O queue pair? */ + cmd = nvmf_capsule_sqe(nc); + if (cmd->qid == 0) + connect_admin_qpair(s, qp, nc, &data); + else + connect_io_qpair(s, qp, nc, &data, le16toh(cmd->qid)); + nvmf_free_qpair(qp); + return (NULL); + +error: + if (nc != NULL) + nvmf_free_capsule(nc); + if (qp != NULL) + nvmf_free_qpair(qp); + close(s); + return (NULL); +} + +void +handle_io_socket(int s) +{ + pthread_t thr; + int error; + + error = pthread_create(&thr, NULL, io_socket_thread, + (void *)(uintptr_t)s); + if (error != 0) { + warnc(error, "Failed to create I/O qpair thread"); + close(s); + } +} diff --git a/usr.sbin/nvmfd/nvmfd.8 b/usr.sbin/nvmfd/nvmfd.8 new file mode 100644 --- /dev/null +++ b/usr.sbin/nvmfd/nvmfd.8 @@ -0,0 +1,126 @@ +.\" +.\" SPDX-License-Identifier: BSD-2-Clause +.\" +.\" Copyright (c) 2024 Chelsio Communications, Inc. +.\" +.Dd May 2, 2024 +.Dt NVMFD 8 +.Os +.Sh NAME +.Nm nvmfd +.Nd "NVMeoF controller daemon" +.Sh SYNOPSIS +.Nm +.Fl K +.Op Fl dFGg +.Op Fl P Ar port +.Op Fl p Ar port +.Op Fl t Ar transport +.Op Fl n Ar subnqn +.Nm +.Op Fl dFGg +.Op Fl P Ar port +.Op Fl p Ar port +.Op Fl t Ar transport +.Op Fl n Ar subnqn +.Ar device +.Op Ar device ... 
+.Sh DESCRIPTION +.Nm +accepts incoming NVMeoF connections for both I/O and discovery controllers. +.Nm +can either implement a single dynamic I/O controller in user mode or hand +off incoming I/O controller connections to +.Xr nvmft 4 . +A dynamic discovery controller service is always provided in user mode. +.Pp +The following options are available: +.Bl -tag -width "-t transport" +.It Fl F +Permit remote hosts to disable SQ flow control. +.It Fl G +Permit remote hosts to enable PDU data digests for the TCP transport. +.It Fl g +Permit remote hosts to enable PDU header digests for the TCP transport. +.It Fl K +Enable kernel mode which hands off incoming I/O controller connections to +.Xr nvmft 4 . +.It Fl P Ar port +Use +.Ar port +as the listen TCP port for the discovery controller service. +The default value is 8009. +.It Fl d +Enable debug mode. +The daemon sends any errors to standard output and does not place +itself in the background. +.It Fl p Ar port +Use +.Ar port +as the listen TCP port for the I/O controller service. +By default an unused ephemeral port will be chosen. +.It Fl n Ar subnqn +The Subsystem NVMe Qualified Name for the I/O controller. +If an explicit NQN is not given, a default value is generated from the +current host's UUID obtained from the +.Vt kern.hostuuid +sysctl. +.It Fl t Ar transport +The transport type to use. +The default transport is +.Dq tcp . +.It Ar device +When implementing a user mode I/O controller, +one or more +.Ar device +arguments must be specified. +Each +.Ar device +describes the backing store for a namespace exported to remote hosts. +Devices can be specified using one of the following syntaxes: +.Bl -tag -width "ramdisk:size" +.It Pa pathname +File or disk device +.It ramdisk : Ns Ar size +Allocate a memory disk with the given +.Ar size . +.Ar size +may use any of the suffixes supported by +.Xr expand_number 3 . 
+.El +.El +.Sh FILES +.Bl -tag -width "/var/run/nvmfd.pid" -compact +.It Pa /var/run/nvmfd.pid +The default location of the +.Nm +PID file. +.El +.Sh EXIT STATUS +.Ex -std +.Sh SEE ALSO +.Xr ctl 4 , +.Xr nvmft 4 , +.Xr ctladm 8 , +.Xr ctld 8 +.Sh HISTORY +The +.Nm +module first appeared in +.Fx 15.0 . +.Sh AUTHORS +The +.Nm +subsystem was developed by +.An John Baldwin Aq Mt jhb@FreeBSD.org +under sponsorship from Chelsio Communications, Inc. +.Sh BUGS +The discovery controller and kernel mode functionality of +.Nm +should be merged into +.Xr ctld 8 . +.Pp +Additional paramters such as +.Va MAXR2T , +.Va MAXH2CDATA , +and queue sizes should be configurable. diff --git a/usr.sbin/nvmfd/nvmfd.c b/usr.sbin/nvmfd/nvmfd.c new file mode 100644 --- /dev/null +++ b/usr.sbin/nvmfd/nvmfd.c @@ -0,0 +1,260 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2023-2024 Chelsio Communications, Inc. + * Written by: John Baldwin + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" + +bool data_digests = false; +bool header_digests = false; +bool flow_control_disable = false; +bool kernel_io = false; + +static const char *subnqn; +static volatile bool quit = false; + +static void +usage(void) +{ + fprintf(stderr, "nvmfd -K [-FGg] [-P port] [-p port] [-t transport] [-n subnqn]\n" + "nvmfd [-dDFH] [-P port] [-p port] [-t transport] [-n subnqn]\n" + "\tdevice [device [...]]\n" + "\n" + "Devices use one of the following syntaxes:\n" + "\tpathame - file or disk device\n" + "\tramdisk:size - memory disk of given size\n"); + exit(1); +} + +static void +handle_sig(int sig __unused) +{ + quit = true; +} + +static void +register_listen_socket(int kqfd, int s, void *udata) +{ + struct kevent kev; + + if (listen(s, -1) != 0) + err(1, "listen"); + + EV_SET(&kev, s, EVFILT_READ, EV_ADD, 0, 0, udata); + if (kevent(kqfd, &kev, 1, 
NULL, 0, NULL) == -1) + err(1, "kevent: failed to add listen socket"); +} + +static void +create_passive_sockets(int kqfd, const char *port, bool discovery) +{ + struct addrinfo hints, *ai, *list; + bool created; + int error, s; + + memset(&hints, 0, sizeof(hints)); + hints.ai_flags = AI_PASSIVE; + hints.ai_family = AF_UNSPEC; + hints.ai_protocol = IPPROTO_TCP; + error = getaddrinfo(NULL, port, &hints, &list); + if (error != 0) + errx(1, "%s", gai_strerror(error)); + created = false; + + for (ai = list; ai != NULL; ai = ai->ai_next) { + s = socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol); + if (s == -1) + continue; + + if (bind(s, ai->ai_addr, ai->ai_addrlen) != 0) { + close(s); + continue; + } + + if (discovery) { + register_listen_socket(kqfd, s, (void *)1); + } else { + register_listen_socket(kqfd, s, (void *)2); + discovery_add_io_controller(s, subnqn); + } + created = true; + } + + freeaddrinfo(list); + if (!created) + err(1, "Failed to create any listen sockets"); +} + +static void +handle_connections(int kqfd) +{ + struct kevent ev; + int s; + + signal(SIGHUP, handle_sig); + signal(SIGINT, handle_sig); + signal(SIGQUIT, handle_sig); + signal(SIGTERM, handle_sig); + + while (!quit) { + if (kevent(kqfd, NULL, 0, &ev, 1, NULL) == -1) { + if (errno == EINTR) + continue; + err(1, "kevent"); + } + + assert(ev.filter == EVFILT_READ); + + s = accept(ev.ident, NULL, NULL); + if (s == -1) { + warn("accept"); + continue; + } + + switch ((uintptr_t)ev.udata) { + case 1: + handle_discovery_socket(s); + break; + case 2: + handle_io_socket(s); + break; + default: + __builtin_unreachable(); + } + } +} + +int +main(int ac, char **av) +{ + struct pidfh *pfh; + const char *dport, *ioport, *transport; + pid_t pid; + int ch, error, kqfd; + bool daemonize; + static char nqn[NVMF_NQN_MAX_LEN]; + + /* 7.4.9.3 Default port for discovery */ + dport = "8009"; + + pfh = NULL; + daemonize = true; + ioport = "0"; + subnqn = NULL; + transport = "tcp"; + while ((ch = getopt(ac, av, 
"dFgGKn:P:p:t:")) != -1) { + switch (ch) { + case 'd': + daemonize = false; + break; + case 'F': + flow_control_disable = true; + break; + case 'G': + data_digests = true; + break; + case 'g': + header_digests = true; + break; + case 'K': + kernel_io = true; + break; + case 'n': + subnqn = optarg; + break; + case 'P': + dport = optarg; + break; + case 'p': + ioport = optarg; + break; + case 't': + transport = optarg; + break; + default: + usage(); + } + } + + av += optind; + ac -= optind; + + if (kernel_io) { + if (ac > 0) + usage(); + if (modfind("nvmft") == -1 && kldload("nvmft") == -1) + warn("couldn't load nvmft"); + } else { + if (ac < 1) + usage(); + } + + if (strcasecmp(transport, "tcp") == 0) { + } else + errx(1, "Invalid transport %s", transport); + + if (subnqn == NULL) { + error = nvmf_nqn_from_hostuuid(nqn); + if (error != 0) + errc(1, error, "Failed to generate NQN"); + subnqn = nqn; + } + + if (!kernel_io) + register_devices(ac, av); + + init_discovery(); + init_io(subnqn); + + if (daemonize) { + pfh = pidfile_open(NULL, 0600, &pid); + if (pfh == NULL) { + if (errno == EEXIST) + errx(1, "Daemon already running, pid: %jd", + (intmax_t)pid); + warn("Cannot open or create pidfile"); + } + + if (daemon(0, 0) != 0) { + pidfile_remove(pfh); + err(1, "Failed to fork into the background"); + } + + pidfile_write(pfh); + } + + kqfd = kqueue(); + if (kqfd == -1) { + pidfile_remove(pfh); + err(1, "kqueue"); + } + + create_passive_sockets(kqfd, dport, true); + create_passive_sockets(kqfd, ioport, false); + + handle_connections(kqfd); + shutdown_io(); + if (pfh != NULL) + pidfile_remove(pfh); + return (0); +}