Page MenuHomeFreeBSD

D44720.id137971.diff
No OneTemporary

D44720.id137971.diff

diff --git a/sys/cam/ctl/ctl.c b/sys/cam/ctl/ctl.c
--- a/sys/cam/ctl/ctl.c
+++ b/sys/cam/ctl/ctl.c
@@ -83,6 +83,7 @@
#include <cam/ctl/ctl_ha.h>
#include <cam/ctl/ctl_private.h>
#include <cam/ctl/ctl_debug.h>
+#include <cam/ctl/ctl_nvme_all.h>
#include <cam/ctl/ctl_scsi_all.h>
#include <cam/ctl/ctl_error.h>
@@ -447,6 +448,8 @@
static void ctl_failover_lun(union ctl_io *io);
static void ctl_scsiio_precheck(struct ctl_scsiio *ctsio);
static int ctl_scsiio(struct ctl_scsiio *ctsio);
+static void ctl_nvmeio_precheck(struct ctl_nvmeio *ctnio);
+static int ctl_nvmeio(struct ctl_nvmeio *ctnio);
static int ctl_target_reset(union ctl_io *io);
static void ctl_do_lun_reset(struct ctl_lun *lun, uint32_t initidx,
@@ -4963,7 +4966,6 @@
int retval;
CTL_DEBUG_PRINT(("ctl_config_move_done\n"));
- CTL_IO_ASSERT(io, SCSI);
if (ctl_debug & CTL_DEBUG_CDB_DATA)
ctl_data_print(io);
@@ -4998,7 +5000,17 @@
* XXX KDM call ctl_scsiio() again for now, and check flag
* bits to see whether we're allocated or not.
*/
- retval = ctl_scsiio(&io->scsiio);
+ switch (io->io_hdr.io_type) {
+ case CTL_IO_SCSI:
+ retval = ctl_scsiio(&io->scsiio);
+ break;
+ case CTL_IO_NVME:
+ case CTL_IO_NVME_ADMIN:
+ retval = ctl_nvmeio(&io->nvmeio);
+ break;
+ default:
+ __assert_unreachable();
+ }
}
return (retval);
}
@@ -10598,6 +10610,725 @@
return (CTL_RETVAL_COMPLETE);
}
+/*
+ * For NVMe commands, parse the LBA and length.
+ */
+static bool
+ctl_nvme_get_lba_len(struct ctl_nvmeio *ctnio, uint64_t *lba, uint32_t *len)
+{
+ CTL_IO_ASSERT(ctnio, NVME);
+
+ switch (ctnio->cmd.opc) {
+ case NVME_OPC_WRITE:
+ case NVME_OPC_READ:
+ case NVME_OPC_WRITE_UNCORRECTABLE:
+ case NVME_OPC_COMPARE:
+ case NVME_OPC_WRITE_ZEROES:
+ case NVME_OPC_VERIFY:
+ *lba = (uint64_t)le32toh(ctnio->cmd.cdw11) << 32 |
+ le32toh(ctnio->cmd.cdw10);
+ *len = (le32toh(ctnio->cmd.cdw12) & 0xffff) + 1;
+ return (true);
+ default:
+ *lba = 0;
+ *len = 0;
+ return (false);
+ }
+}
+
+static bool
+ctl_nvme_fua(struct ctl_nvmeio *ctnio)
+{
+ return ((le32toh(ctnio->cmd.cdw12) & (1U << 30)) != 0);
+}
+
+int
+ctl_nvme_identify(struct ctl_nvmeio *ctnio)
+{
+ struct ctl_lun *lun = CTL_LUN(ctnio);
+ size_t len;
+ int retval;
+ uint8_t cns;
+
+ CTL_DEBUG_PRINT(("ctl_nvme_identify\n"));
+
+ CTL_IO_ASSERT(ctnio, NVME_ADMIN);
+ MPASS(ctnio->cmd.opc == NVME_OPC_IDENTIFY);
+
+ /*
+ * The data buffer for Identify is always 4096 bytes, see
+ * 5.51.1 in NVMe base specification 1.4.
+ */
+ len = 4096;
+
+ ctnio->kern_data_ptr = malloc(len, M_CTL, M_WAITOK);
+ ctnio->kern_data_len = len;
+ ctnio->kern_total_len = len;
+ ctnio->kern_rel_offset = 0;
+ ctnio->kern_sg_entries = 0;
+
+ ctl_nvme_set_success(ctnio);
+ ctnio->io_hdr.flags |= CTL_FLAG_ALLOCATED;
+ ctnio->be_move_done = ctl_config_move_done;
+
+ /*
+ * If we don't have a LUN, return an empty result for CNS == 0.
+ */
+ if (lun == NULL) {
+ cns = le32toh(ctnio->cmd.cdw10) & 0xff;
+ switch (cns) {
+ case 0:
+ memset(ctnio->kern_data_ptr, 0, len);
+ ctl_datamove((union ctl_io *)ctnio);
+ break;
+ default:
+ ctl_nvme_set_invalid_field(ctnio);
+ break;
+ }
+ return (CTL_RETVAL_COMPLETE);
+ }
+
+ retval = lun->backend->config_read((union ctl_io *)ctnio);
+ return (retval);
+}
+
+int
+ctl_nvme_flush(struct ctl_nvmeio *ctnio)
+{
+ struct ctl_lun *lun = CTL_LUN(ctnio);
+ int retval;
+
+ CTL_DEBUG_PRINT(("ctl_nvme_flush\n"));
+
+ CTL_IO_ASSERT(ctnio, NVME);
+ MPASS(ctnio->cmd.opc == NVME_OPC_FLUSH);
+
+ /*
+ * NVMe flushes always flush the entire namespace, not an LBA
+ * range.
+ */
+ retval = lun->backend->config_write((union ctl_io *)ctnio);
+
+ return (retval);
+}
+
+int
+ctl_nvme_read_write(struct ctl_nvmeio *ctnio)
+{
+ struct ctl_lun *lun = CTL_LUN(ctnio);
+ struct ctl_lba_len_flags *lbalen;
+ uint64_t lba;
+ uint32_t num_blocks;
+ int flags, retval;
+ bool isread;
+
+ CTL_DEBUG_PRINT(("ctl_nvme_read_write: command: %#x\n",
+ ctnio->cmd.opc));
+
+ CTL_IO_ASSERT(ctnio, NVME);
+ MPASS(ctnio->cmd.opc == NVME_OPC_WRITE ||
+ ctnio->cmd.opc == NVME_OPC_READ);
+
+ flags = 0;
+ isread = ctnio->cmd.opc == NVME_OPC_READ;
+ ctl_nvme_get_lba_len(ctnio, &lba, &num_blocks);
+
+ /*
+ * The first check is to make sure we're in bounds, the second
+ * check is to catch wrap-around problems. If the lba + num blocks
+ * is less than the lba, then we've wrapped around and the block
+ * range is invalid anyway.
+ */
+ if (((lba + num_blocks) > (lun->be_lun->maxlba + 1))
+ || ((lba + num_blocks) < lba)) {
+ ctl_nvme_set_lba_out_of_range(ctnio);
+ ctl_done((union ctl_io *)ctnio);
+ return (CTL_RETVAL_COMPLETE);
+ }
+
+ /*
+ * Set FUA and/or DPO if caches are disabled.
+ *
+ * For a read this may not be quite correct for the block
+ * backend as any earlier writes to the LBA range should be
+ * flushed to backing store as part of the read.
+ */
+ if (ctl_nvme_fua(ctnio)) {
+ flags |= CTL_LLF_FUA;
+ if (isread)
+ flags |= CTL_LLF_DPO;
+ }
+
+ lbalen = (struct ctl_lba_len_flags *)
+ &ctnio->io_hdr.ctl_private[CTL_PRIV_LBA_LEN];
+ lbalen->lba = lba;
+ lbalen->len = num_blocks;
+ lbalen->flags = (isread ? CTL_LLF_READ : CTL_LLF_WRITE) | flags;
+
+ ctnio->kern_total_len = num_blocks * lun->be_lun->blocksize;
+ ctnio->kern_rel_offset = 0;
+
+ CTL_DEBUG_PRINT(("ctl_nvme_read_write: calling data_submit()\n"));
+
+ retval = lun->backend->data_submit((union ctl_io *)ctnio);
+ return (retval);
+}
+
+int
+ctl_nvme_write_uncorrectable(struct ctl_nvmeio *ctnio)
+{
+ struct ctl_lun *lun = CTL_LUN(ctnio);
+ struct ctl_lba_len_flags *lbalen;
+ uint64_t lba;
+ uint32_t num_blocks;
+ int retval;
+
+ CTL_DEBUG_PRINT(("ctl_nvme_write_uncorrectable\n"));
+
+ CTL_IO_ASSERT(ctnio, NVME);
+ MPASS(ctnio->cmd.opc == NVME_OPC_WRITE_UNCORRECTABLE);
+
+ ctl_nvme_get_lba_len(ctnio, &lba, &num_blocks);
+
+ /*
+ * The first check is to make sure we're in bounds, the second
+ * check is to catch wrap-around problems. If the lba + num blocks
+ * is less than the lba, then we've wrapped around and the block
+ * range is invalid anyway.
+ */
+ if (((lba + num_blocks) > (lun->be_lun->maxlba + 1))
+ || ((lba + num_blocks) < lba)) {
+ ctl_nvme_set_lba_out_of_range(ctnio);
+ ctl_done((union ctl_io *)ctnio);
+ return (CTL_RETVAL_COMPLETE);
+ }
+
+ lbalen = (struct ctl_lba_len_flags *)
+ &ctnio->io_hdr.ctl_private[CTL_PRIV_LBA_LEN];
+ lbalen->lba = lba;
+ lbalen->len = num_blocks;
+ lbalen->flags = 0;
+ retval = lun->backend->config_write((union ctl_io *)ctnio);
+
+ return (retval);
+}
+
+int
+ctl_nvme_compare(struct ctl_nvmeio *ctnio)
+{
+ struct ctl_lun *lun = CTL_LUN(ctnio);
+ struct ctl_lba_len_flags *lbalen;
+ uint64_t lba;
+ uint32_t num_blocks;
+ int flags;
+ int retval;
+
+ CTL_DEBUG_PRINT(("ctl_nvme_compare\n"));
+
+ CTL_IO_ASSERT(ctnio, NVME);
+ MPASS(ctnio->cmd.opc == NVME_OPC_COMPARE);
+
+ flags = 0;
+ ctl_nvme_get_lba_len(ctnio, &lba, &num_blocks);
+ if (ctl_nvme_fua(ctnio))
+ flags |= CTL_LLF_FUA;
+
+ /*
+ * The first check is to make sure we're in bounds, the second
+ * check is to catch wrap-around problems. If the lba + num blocks
+ * is less than the lba, then we've wrapped around and the block
+ * range is invalid anyway.
+ */
+ if (((lba + num_blocks) > (lun->be_lun->maxlba + 1))
+ || ((lba + num_blocks) < lba)) {
+ ctl_nvme_set_lba_out_of_range(ctnio);
+ ctl_done((union ctl_io *)ctnio);
+ return (CTL_RETVAL_COMPLETE);
+ }
+
+ lbalen = (struct ctl_lba_len_flags *)
+ &ctnio->io_hdr.ctl_private[CTL_PRIV_LBA_LEN];
+ lbalen->lba = lba;
+ lbalen->len = num_blocks;
+ lbalen->flags = CTL_LLF_COMPARE | flags;
+ ctnio->kern_total_len = num_blocks * lun->be_lun->blocksize;
+ ctnio->kern_rel_offset = 0;
+
+ CTL_DEBUG_PRINT(("ctl_nvme_compare: calling data_submit()\n"));
+ retval = lun->backend->data_submit((union ctl_io *)ctnio);
+ return (retval);
+}
+
+int
+ctl_nvme_write_zeroes(struct ctl_nvmeio *ctnio)
+{
+ struct ctl_lun *lun = CTL_LUN(ctnio);
+ struct ctl_lba_len_flags *lbalen;
+ uint64_t lba;
+ uint32_t num_blocks;
+ int retval;
+
+ CTL_DEBUG_PRINT(("ctl_nvme_write_zeroes\n"));
+
+ CTL_IO_ASSERT(ctnio, NVME);
+ MPASS(ctnio->cmd.opc == NVME_OPC_WRITE_ZEROES);
+
+ ctl_nvme_get_lba_len(ctnio, &lba, &num_blocks);
+
+ /*
+ * The first check is to make sure we're in bounds, the second
+ * check is to catch wrap-around problems. If the lba + num blocks
+ * is less than the lba, then we've wrapped around and the block
+ * range is invalid anyway.
+ */
+ if (((lba + num_blocks) > (lun->be_lun->maxlba + 1))
+ || ((lba + num_blocks) < lba)) {
+ ctl_nvme_set_lba_out_of_range(ctnio);
+ ctl_done((union ctl_io *)ctnio);
+ return (CTL_RETVAL_COMPLETE);
+ }
+
+ lbalen = (struct ctl_lba_len_flags *)
+ &ctnio->io_hdr.ctl_private[CTL_PRIV_LBA_LEN];
+ lbalen->lba = lba;
+ lbalen->len = num_blocks;
+ lbalen->flags = 0;
+ retval = lun->backend->config_write((union ctl_io *)ctnio);
+
+ return (retval);
+}
+
+int
+ctl_nvme_dataset_management(struct ctl_nvmeio *ctnio)
+{
+ struct ctl_lun *lun = CTL_LUN(ctnio);
+ struct nvme_dsm_range *r;
+ uint64_t lba;
+ uint32_t len, num_blocks;
+ u_int i, ranges;
+ int retval;
+
+ CTL_DEBUG_PRINT(("ctl_nvme_dataset_management\n"));
+
+ CTL_IO_ASSERT(ctnio, NVME);
+ MPASS(ctnio->cmd.opc == NVME_OPC_DATASET_MANAGEMENT);
+
+ ranges = le32toh(ctnio->cmd.cdw10) & 0xff;
+ len = ranges * sizeof(struct nvme_dsm_range);
+
+ /*
+ * If we've got a kernel request that hasn't been malloced yet,
+ * malloc it and tell the caller the data buffer is here.
+ */
+ if ((ctnio->io_hdr.flags & CTL_FLAG_ALLOCATED) == 0) {
+ ctnio->kern_data_ptr = malloc(len, M_CTL, M_WAITOK);
+ ctnio->kern_data_len = len;
+ ctnio->kern_total_len = len;
+ ctnio->kern_rel_offset = 0;
+ ctnio->kern_sg_entries = 0;
+ ctnio->io_hdr.flags |= CTL_FLAG_ALLOCATED;
+ ctnio->be_move_done = ctl_config_move_done;
+ ctl_datamove((union ctl_io *)ctnio);
+
+ return (CTL_RETVAL_COMPLETE);
+ }
+
+ /*
+ * Require a flat buffer of the correct size.
+ */
+ if (ctnio->kern_sg_entries > 0 ||
+ ctnio->kern_total_len - ctnio->kern_data_resid != len)
+ return (CTL_RETVAL_ERROR);
+
+ /*
+ * Verify that none of the ranges are out of bounds.
+ */
+ r = (struct nvme_dsm_range *)ctnio->kern_data_ptr;
+ for (i = 0; i < ranges; i++) {
+ lba = le64toh(r[i].starting_lba);
+ num_blocks = le32toh(r[i].length);
+ if (((lba + num_blocks) > (lun->be_lun->maxlba + 1))
+ || ((lba + num_blocks) < lba)) {
+ ctl_nvme_set_lba_out_of_range(ctnio);
+ ctl_done((union ctl_io *)ctnio);
+ return (CTL_RETVAL_COMPLETE);
+ }
+ }
+
+ CTL_DEBUG_PRINT(("ctl_nvme_dataset_management: calling config_write()\n"));
+ retval = lun->backend->config_write((union ctl_io *)ctnio);
+ return (retval);
+}
+
+int
+ctl_nvme_verify(struct ctl_nvmeio *ctnio)
+{
+ struct ctl_lun *lun = CTL_LUN(ctnio);
+ struct ctl_lba_len_flags *lbalen;
+ uint64_t lba;
+ uint32_t num_blocks;
+ int flags;
+ int retval;
+
+ CTL_DEBUG_PRINT(("ctl_nvme_verify\n"));
+
+ CTL_IO_ASSERT(ctnio, NVME);
+ MPASS(ctnio->cmd.opc == NVME_OPC_VERIFY);
+
+ flags = 0;
+ ctl_nvme_get_lba_len(ctnio, &lba, &num_blocks);
+ if (ctl_nvme_fua(ctnio))
+ flags |= CTL_LLF_FUA;
+
+ /*
+ * The first check is to make sure we're in bounds, the second
+ * check is to catch wrap-around problems. If the lba + num blocks
+ * is less than the lba, then we've wrapped around and the block
+ * range is invalid anyway.
+ */
+ if (((lba + num_blocks) > (lun->be_lun->maxlba + 1))
+ || ((lba + num_blocks) < lba)) {
+ ctl_nvme_set_lba_out_of_range(ctnio);
+ ctl_done((union ctl_io *)ctnio);
+ return (CTL_RETVAL_COMPLETE);
+ }
+
+ lbalen = (struct ctl_lba_len_flags *)
+ &ctnio->io_hdr.ctl_private[CTL_PRIV_LBA_LEN];
+ lbalen->lba = lba;
+ lbalen->len = num_blocks;
+ lbalen->flags = CTL_LLF_VERIFY | flags;
+ ctnio->kern_total_len = 0;
+ ctnio->kern_rel_offset = 0;
+
+ CTL_DEBUG_PRINT(("ctl_nvme_verify: calling data_submit()\n"));
+ retval = lun->backend->data_submit((union ctl_io *)ctnio);
+ return (retval);
+}
+
+static const struct ctl_nvme_cmd_entry *
+ctl_nvme_get_cmd_entry(struct ctl_nvmeio *ctnio)
+{
+ const struct ctl_nvme_cmd_entry *entry;
+
+ switch (ctnio->io_hdr.io_type) {
+ case CTL_IO_NVME:
+ entry = &nvme_nvm_cmd_table[ctnio->cmd.opc];
+ break;
+ case CTL_IO_NVME_ADMIN:
+ entry = &nvme_admin_cmd_table[ctnio->cmd.opc];
+ break;
+ default:
+ __assert_unreachable();
+ }
+ return (entry);
+}
+
+static const struct ctl_nvme_cmd_entry *
+ctl_nvme_validate_command(struct ctl_nvmeio *ctnio)
+{
+ const struct ctl_nvme_cmd_entry *entry;
+
+ entry = ctl_nvme_get_cmd_entry(ctnio);
+ if (entry->execute == NULL) {
+ ctl_nvme_set_invalid_opcode(ctnio);
+ ctl_done((union ctl_io *)ctnio);
+ return (NULL);
+ }
+
+ /* Validate fused commands. */
+ switch (NVMEV(NVME_CMD_FUSE, ctnio->cmd.fuse)) {
+ case NVME_FUSE_NORMAL:
+ break;
+ case NVME_FUSE_FIRST:
+ if (ctnio->io_hdr.io_type != CTL_IO_NVME ||
+ ctnio->cmd.opc != NVME_OPC_COMPARE) {
+ ctl_nvme_set_invalid_field(ctnio);
+ ctl_done((union ctl_io *)ctnio);
+ return (NULL);
+ }
+ break;
+ case NVME_FUSE_SECOND:
+ if (ctnio->io_hdr.io_type != CTL_IO_NVME ||
+ ctnio->cmd.opc != NVME_OPC_COMPARE) {
+ ctl_nvme_set_invalid_field(ctnio);
+ ctl_done((union ctl_io *)ctnio);
+ return (NULL);
+ }
+ break;
+ default:
+ ctl_nvme_set_invalid_field(ctnio);
+ ctl_done((union ctl_io *)ctnio);
+ return (NULL);
+ }
+
+ return (entry);
+}
+
+/*
+ * This is a simpler version of ctl_scsiio_lun_check that fails
+ * requests on a LUN without active media.
+ *
+ * Returns true if the command has been completed with an error.
+ */
+static bool
+ctl_nvmeio_lun_check(struct ctl_lun *lun,
+ const struct ctl_nvme_cmd_entry *entry, struct ctl_nvmeio *ctnio)
+{
+ mtx_assert(&lun->lun_lock, MA_OWNED);
+
+ if ((entry->flags & CTL_CMD_FLAG_OK_ON_NO_MEDIA) == 0) {
+ if ((lun->flags & (CTL_LUN_EJECTED | CTL_LUN_NO_MEDIA |
+ CTL_LUN_STOPPED)) != 0) {
+ ctl_nvme_set_namespace_not_ready(ctnio);
+ return (true);
+ }
+ }
+
+ return (false);
+}
+
+/*
+ * Check for blockage against the OOA (Order Of Arrival) queue.
+ * Assumptions:
+ * - pending_io is generally either incoming, or on the blocked queue
+ * - starting I/O is the I/O we want to start the check with.
+ */
+static ctl_action
+ctl_nvme_check_ooa(struct ctl_lun *lun, union ctl_io *pending_io,
+ union ctl_io **starting_io, union ctl_io **aborted_io)
+{
+ union ctl_io *ooa_io = *starting_io;
+
+ CTL_IO_ASSERT(pending_io, NVME, NVME_ADMIN);
+
+ mtx_assert(&lun->lun_lock, MA_OWNED);
+
+ *aborted_io = NULL;
+
+ /*
+ * Aborted commands are not going to be executed and may even
+ * not report completion, so we don't care about their order.
+ * Let them complete ASAP to clean the OOA queue.
+ */
+ if (__predict_false(pending_io->io_hdr.flags & CTL_FLAG_ABORT))
+ return (CTL_ACTION_PASS);
+
+ /*
+ * NVMe has rather simple command ordering requirements. In
+ * particular, there is no requirement on the controller to
+ * enforce a specific order for overlapping LBAs. The only
+ * constraint is that fused operations (Compare and Write),
+ * must be completed as a unit.
+ *
+ * To support fused operations, the following strategy is used:
+ * - the first half of a fused command is not enqueued to rtr
+ * until the second half is enqueued
+ * - the second half of a fused command blocks on the first
+ * half of a fuse command
+ * - subsequent commands block on the second half of the
+ * fused command
+ */
+
+ /*
+ * Is the previously submitted command the first half of a
+ * fused operation?
+ */
+ if (ooa_io != NULL &&
+ NVMEV(NVME_CMD_FUSE, ooa_io->nvmeio.cmd.fuse) == NVME_FUSE_FIRST) {
+ /*
+ * If this is the second half, enqueue the first half
+ * and block the second half on the first half.
+ */
+ if (NVMEV(NVME_CMD_FUSE, pending_io->nvmeio.cmd.fuse) ==
+ NVME_FUSE_SECOND) {
+ /*
+ * XXX: Do we need to wait for other rtr requests
+ * to drain so this is truly atomic?
+ */
+ return (CTL_ACTION_FUSED);
+ }
+
+ /* Abort the first half. */
+ ctl_nvme_set_missing_fused_command(&ooa_io->nvmeio);
+ *aborted_io = ooa_io;
+ } else {
+ switch (NVMEV(NVME_CMD_FUSE, pending_io->nvmeio.cmd.fuse)) {
+ case NVME_FUSE_FIRST:
+ /* First half, wait for the second half. */
+ return (CTL_ACTION_SKIP);
+ case NVME_FUSE_SECOND:
+ /* Second half without a matching first half, abort. */
+ ctl_nvme_set_missing_fused_command(&pending_io->nvmeio);
+ *aborted_io = pending_io;
+ return (CTL_ACTION_SKIP);
+ }
+ }
+
+ /*
+ * Scan the OOA queue looking for the most recent second half
+ * of a fused op.
+ */
+ for (; ooa_io != NULL;
+ ooa_io = (union ctl_io *)LIST_NEXT(&ooa_io->io_hdr, ooa_links)) {
+ if (NVMEV(NVME_CMD_FUSE, ooa_io->nvmeio.cmd.fuse) ==
+ NVME_FUSE_SECOND) {
+ *starting_io = ooa_io;
+ return (CTL_ACTION_BLOCK);
+ }
+ }
+
+ *starting_io = NULL;
+ return (CTL_ACTION_PASS);
+}
+
+static void
+ctl_nvmeio_precheck(struct ctl_nvmeio *ctnio)
+{
+ struct ctl_softc *softc = CTL_SOFTC(ctnio);
+ struct ctl_lun *lun;
+ const struct ctl_nvme_cmd_entry *entry;
+ union ctl_io *bio, *aborted_io;
+ uint32_t targ_lun;
+
+ lun = NULL;
+ targ_lun = ctnio->io_hdr.nexus.targ_mapped_lun;
+ if (targ_lun < ctl_max_luns)
+ lun = softc->ctl_luns[targ_lun];
+ if (lun != NULL) {
+ /*
+ * If the LUN is invalid, pretend that it doesn't exist.
+ * It will go away as soon as all pending I/O has been
+ * completed.
+ */
+ mtx_lock(&lun->lun_lock);
+ if (lun->flags & CTL_LUN_DISABLED) {
+ mtx_unlock(&lun->lun_lock);
+ lun = NULL;
+ }
+ }
+ CTL_LUN(ctnio) = lun;
+ if (lun != NULL) {
+ CTL_BACKEND_LUN(ctnio) = lun->be_lun;
+
+ /*
+ * Every I/O goes into the OOA queue for a particular LUN,
+ * and stays there until completion.
+ */
+#ifdef CTL_TIME_IO
+ if (LIST_EMPTY(&lun->ooa_queue))
+ lun->idle_time += getsbinuptime() - lun->last_busy;
+#endif
+ LIST_INSERT_HEAD(&lun->ooa_queue, &ctnio->io_hdr, ooa_links);
+ }
+
+ /* Get command entry and return error if it is unsupported. */
+ entry = ctl_nvme_validate_command(ctnio);
+ if (entry == NULL) {
+ if (lun)
+ mtx_unlock(&lun->lun_lock);
+ return;
+ }
+
+ ctnio->io_hdr.flags &= ~CTL_FLAG_DATA_MASK;
+ ctnio->io_hdr.flags |= entry->flags & CTL_FLAG_DATA_MASK;
+
+ /* All NVMe commands other than IDENTIFY require a LUN. */
+ if (lun == NULL) {
+ if (entry->flags & CTL_CMD_FLAG_OK_ON_NO_LUN) {
+ ctnio->io_hdr.flags |= CTL_FLAG_IS_WAS_ON_RTR;
+ ctl_enqueue_rtr((union ctl_io *)ctnio);
+ return;
+ }
+
+ ctl_nvme_set_invalid_namespace(ctnio);
+ ctl_done((union ctl_io *)ctnio);
+ CTL_DEBUG_PRINT(("ctl_nvmeio_precheck: bailing out due to invalid LUN\n"));
+ return;
+ } else {
+ /*
+ * NVMe namespaces can only be backed by T_DIRECT LUNs.
+ */
+ if (lun->be_lun->lun_type != T_DIRECT) {
+ mtx_unlock(&lun->lun_lock);
+ ctl_nvme_set_invalid_namespace(ctnio);
+ ctl_done((union ctl_io *)ctnio);
+ return;
+ }
+ }
+
+ if (ctl_nvmeio_lun_check(lun, entry, ctnio) != 0) {
+ mtx_unlock(&lun->lun_lock);
+ ctl_done((union ctl_io *)ctnio);
+ return;
+ }
+
+ bio = (union ctl_io *)LIST_NEXT(&ctnio->io_hdr, ooa_links);
+ switch (ctl_nvme_check_ooa(lun, (union ctl_io *)ctnio, &bio,
+ &aborted_io)) {
+ case CTL_ACTION_PASS:
+ ctnio->io_hdr.flags |= CTL_FLAG_IS_WAS_ON_RTR;
+ mtx_unlock(&lun->lun_lock);
+ ctl_enqueue_rtr((union ctl_io *)ctnio);
+ break;
+ case CTL_ACTION_FUSED:
+ /* Block the second half on the first half. */
+ ctnio->io_hdr.blocker = bio;
+ TAILQ_INSERT_TAIL(&bio->io_hdr.blocked_queue, &ctnio->io_hdr,
+ blocked_links);
+
+ /* Pass the first half. */
+ bio->io_hdr.flags |= CTL_FLAG_IS_WAS_ON_RTR;
+ mtx_unlock(&lun->lun_lock);
+ ctl_enqueue_rtr(bio);
+ break;
+ case CTL_ACTION_SKIP:
+ mtx_unlock(&lun->lun_lock);
+ break;
+ case CTL_ACTION_BLOCK:
+ ctnio->io_hdr.blocker = bio;
+ TAILQ_INSERT_TAIL(&bio->io_hdr.blocked_queue, &ctnio->io_hdr,
+ blocked_links);
+ mtx_unlock(&lun->lun_lock);
+ break;
+ default:
+ __assert_unreachable();
+ }
+ if (aborted_io != NULL)
+ ctl_done(aborted_io);
+}
+
+static int
+ctl_nvmeio(struct ctl_nvmeio *ctnio)
+{
+ const struct ctl_nvme_cmd_entry *entry;
+ int retval;
+
+ CTL_DEBUG_PRINT(("ctl_nvmeio %s opc=%02X\n",
+ ctnio->io_hdr.io_type == CTL_IO_NVME ? "nvm" : "admin",
+ ctnio->cmd.opc));
+
+ entry = ctl_nvme_get_cmd_entry(ctnio);
+ MPASS(entry != NULL);
+
+ /*
+ * If this I/O has been aborted, just send it straight to
+ * ctl_done() without executing it.
+ */
+ if (ctnio->io_hdr.flags & CTL_FLAG_ABORT) {
+ ctl_done((union ctl_io *)ctnio);
+ return (CTL_RETVAL_COMPLETE);
+ }
+
+ /*
+ * All the checks should have been handled by ctl_nvmeio_precheck().
+ * We should be clear now to just execute the I/O.
+ */
+ retval = entry->execute(ctnio);
+
+ return (retval);
+}
+
/*
* For known CDB types, parse the LBA and length.
*/
@@ -11016,7 +11747,7 @@
* we know for sure that the blocker I/O does no longer count.
*/
static void
-ctl_try_unblock_io(struct ctl_lun *lun, union ctl_io *io, bool skip)
+ctl_scsi_try_unblock_io(struct ctl_lun *lun, union ctl_io *io, bool skip)
{
struct ctl_softc *softc = lun->ctl_softc;
union ctl_io *bio, *obio;
@@ -11111,6 +11842,72 @@
}
}
+static void
+ctl_nvme_try_unblock_io(struct ctl_lun *lun, union ctl_io *io, bool skip)
+{
+ union ctl_io *bio;
+ const struct ctl_nvme_cmd_entry *entry;
+
+ CTL_IO_ASSERT(io, NVME, NVME_ADMIN);
+
+ mtx_assert(&lun->lun_lock, MA_OWNED);
+
+ if (io->io_hdr.blocker == NULL)
+ return;
+
+ /*
+ * If this is the second half of a fused operation, it should
+ * be the only io on the blocked list. If the first half
+ * failed, complete the second half with an appropriate error.
+ */
+ bio = io->io_hdr.blocker;
+ if (NVMEV(NVME_CMD_FUSE, io->nvmeio.cmd.fuse) == NVME_FUSE_SECOND) {
+ MPASS(io ==
+ (union ctl_io *)TAILQ_FIRST(&bio->io_hdr.blocked_queue));
+ MPASS(TAILQ_NEXT(&io->io_hdr, blocked_links) == NULL);
+
+ TAILQ_REMOVE(&bio->io_hdr.blocked_queue, &io->io_hdr,
+ blocked_links);
+ if (bio->io_hdr.status != CTL_SUCCESS) {
+ ctl_nvme_set_failed_fused_command(&io->nvmeio);
+ ctl_done(io);
+ return;
+ }
+ } else {
+ /*
+ * This must be a command that was blocked on the
+ * second half of a fused operation.
+ */
+ MPASS(NVMEV(NVME_CMD_FUSE, bio->nvmeio.cmd.fuse) ==
+ NVME_FUSE_SECOND);
+ TAILQ_REMOVE(&bio->io_hdr.blocked_queue, &io->io_hdr,
+ blocked_links);
+ }
+
+ entry = ctl_nvme_get_cmd_entry(&io->nvmeio);
+ if (ctl_nvmeio_lun_check(lun, entry, &io->nvmeio) != 0) {
+ ctl_done(io);
+ return;
+ }
+
+ io->io_hdr.flags |= CTL_FLAG_IS_WAS_ON_RTR;
+ ctl_enqueue_rtr(io);
+}
+
+static void
+ctl_try_unblock_io(struct ctl_lun *lun, union ctl_io *io, bool skip)
+{
+ switch (io->io_hdr.io_type) {
+ case CTL_IO_SCSI:
+ return (ctl_scsi_try_unblock_io(lun, io, skip));
+ case CTL_IO_NVME:
+ case CTL_IO_NVME_ADMIN:
+ return (ctl_nvme_try_unblock_io(lun, io, skip));
+ default:
+ __assert_unreachable();
+ }
+}
+
/*
* Try to unblock I/Os blocked by the specified I/O.
*
@@ -11824,6 +12621,7 @@
*/
LIST_FOREACH(xioh, &lun->ooa_queue, ooa_links) {
union ctl_io *xio = (union ctl_io *)xioh;
+
if ((targ_port == UINT32_MAX ||
targ_port == xioh->nexus.targ_port) &&
(init_id == UINT32_MAX ||
@@ -13196,7 +13994,22 @@
{
struct ctl_port *port = CTL_PORT(io);
- CTL_DEBUG_PRINT(("ctl_queue cdb[0]=%02X\n", io->scsiio.cdb[0]));
+ switch (io->io_hdr.io_type) {
+ case CTL_IO_SCSI:
+ case CTL_IO_TASK:
+ CTL_DEBUG_PRINT(("ctl_queue cdb[0]=%02X\n", io->scsiio.cdb[0]));
+ break;
+ case CTL_IO_NVME:
+ CTL_DEBUG_PRINT(("ctl_queue nvme nvm cmd=%02X\n",
+ io->nvmeio.cmd.opc));
+ break;
+ case CTL_IO_NVME_ADMIN:
+ CTL_DEBUG_PRINT(("ctl_queue nvme admin cmd=%02X\n",
+ io->nvmeio.cmd.opc));
+ break;
+ default:
+ break;
+ }
#ifdef CTL_TIME_IO
io->io_hdr.start_time = time_uptime;
@@ -13210,6 +14023,8 @@
switch (io->io_hdr.io_type) {
case CTL_IO_SCSI:
case CTL_IO_TASK:
+ case CTL_IO_NVME:
+ case CTL_IO_NVME_ADMIN:
if (ctl_debug & CTL_DEBUG_CDB)
ctl_io_print(io);
ctl_enqueue_incoming(io);
@@ -13249,6 +14064,12 @@
ctl_io_print(io);
ctl_run_task(io);
break;
+ case CTL_IO_NVME:
+ case CTL_IO_NVME_ADMIN:
+ if (ctl_debug & CTL_DEBUG_CDB)
+ ctl_io_print(io);
+ ctl_nvmeio_precheck(&io->nvmeio);
+ break;
default:
printf("ctl_run: unknown I/O type %d\n", io->io_hdr.io_type);
return (EINVAL);
@@ -13418,19 +14239,41 @@
if (io != NULL) {
STAILQ_REMOVE_HEAD(&thr->incoming_queue, links);
mtx_unlock(&thr->queue_lock);
- if (io->io_hdr.io_type == CTL_IO_TASK)
+ switch (io->io_hdr.io_type) {
+ case CTL_IO_TASK:
ctl_run_task(io);
- else
+ break;
+ case CTL_IO_SCSI:
ctl_scsiio_precheck(&io->scsiio);
+ break;
+ case CTL_IO_NVME:
+ case CTL_IO_NVME_ADMIN:
+ ctl_nvmeio_precheck(&io->nvmeio);
+ break;
+ default:
+ __assert_unreachable();
+ }
continue;
}
io = (union ctl_io *)STAILQ_FIRST(&thr->rtr_queue);
if (io != NULL) {
STAILQ_REMOVE_HEAD(&thr->rtr_queue, links);
mtx_unlock(&thr->queue_lock);
- retval = ctl_scsiio(&io->scsiio);
- if (retval != CTL_RETVAL_COMPLETE)
- CTL_DEBUG_PRINT(("ctl_scsiio failed\n"));
+ switch (io->io_hdr.io_type) {
+ case CTL_IO_SCSI:
+ retval = ctl_scsiio(&io->scsiio);
+ if (retval != CTL_RETVAL_COMPLETE)
+ CTL_DEBUG_PRINT(("ctl_scsiio failed\n"));
+ break;
+ case CTL_IO_NVME:
+ case CTL_IO_NVME_ADMIN:
+ retval = ctl_nvmeio(&io->nvmeio);
+ if (retval != CTL_RETVAL_COMPLETE)
+ CTL_DEBUG_PRINT(("ctl_nvmeio failed\n"));
+ break;
+ default:
+ __assert_unreachable();
+ }
continue;
}
diff --git a/sys/cam/ctl/ctl_nvme_cmd_table.c b/sys/cam/ctl/ctl_nvme_cmd_table.c
new file mode 100644
--- /dev/null
+++ b/sys/cam/ctl/ctl_nvme_cmd_table.c
@@ -0,0 +1,35 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2023 Chelsio Communications, Inc.
+ */
+
+#include <dev/nvme/nvme.h>
+
+#include <cam/ctl/ctl.h>
+#include <cam/ctl/ctl_ha.h>
+#include <cam/ctl/ctl_io.h>
+#include <cam/ctl/ctl_ioctl.h>
+#include <cam/ctl/ctl_private.h>
+
+/* Administrative Command Set (CTL_IO_NVME_ADMIN). */
+const struct ctl_nvme_cmd_entry nvme_admin_cmd_table[256] =
+{
+ [NVME_OPC_IDENTIFY] = { ctl_nvme_identify, CTL_FLAG_DATA_IN |
+ CTL_CMD_FLAG_OK_ON_NO_LUN },
+};
+
+/* NVM Command Set (CTL_IO_NVME). */
+const struct ctl_nvme_cmd_entry nvme_nvm_cmd_table[256] =
+{
+ [NVME_OPC_FLUSH] = { ctl_nvme_flush, CTL_FLAG_DATA_NONE },
+ [NVME_OPC_WRITE] = { ctl_nvme_read_write, CTL_FLAG_DATA_OUT },
+ [NVME_OPC_READ] = { ctl_nvme_read_write, CTL_FLAG_DATA_IN },
+ [NVME_OPC_WRITE_UNCORRECTABLE] = { ctl_nvme_write_uncorrectable,
+ CTL_FLAG_DATA_NONE },
+ [NVME_OPC_COMPARE] = { ctl_nvme_compare, CTL_FLAG_DATA_OUT },
+ [NVME_OPC_WRITE_ZEROES] = { ctl_nvme_write_zeroes, CTL_FLAG_DATA_NONE },
+ [NVME_OPC_DATASET_MANAGEMENT] = { ctl_nvme_dataset_management,
+ CTL_FLAG_DATA_OUT },
+ [NVME_OPC_VERIFY] = { ctl_nvme_verify, CTL_FLAG_DATA_NONE },
+};
diff --git a/sys/cam/ctl/ctl_private.h b/sys/cam/ctl/ctl_private.h
--- a/sys/cam/ctl/ctl_private.h
+++ b/sys/cam/ctl/ctl_private.h
@@ -78,7 +78,8 @@
CTL_ACTION_SKIP,
CTL_ACTION_BLOCK,
CTL_ACTION_OVERLAP,
- CTL_ACTION_OVERLAP_TAG
+ CTL_ACTION_OVERLAP_TAG,
+ CTL_ACTION_FUSED,
} ctl_action;
/*
@@ -139,6 +140,12 @@
* after the opcode byte. */
};
+/* Only data flags are currently used for NVMe commands. */
+struct ctl_nvme_cmd_entry {
+ int (*execute)(struct ctl_nvmeio *);
+ ctl_io_flags flags;
+};
+
typedef enum {
CTL_LUN_NONE = 0x000,
CTL_LUN_CONTROL = 0x001,
@@ -412,6 +419,8 @@
#ifdef _KERNEL
extern const struct ctl_cmd_entry ctl_cmd_table[256];
+extern const struct ctl_nvme_cmd_entry nvme_admin_cmd_table[256];
+extern const struct ctl_nvme_cmd_entry nvme_nvm_cmd_table[256];
uint32_t ctl_get_initindex(struct ctl_nexus *nexus);
int ctl_lun_map_init(struct ctl_port *port);
@@ -459,6 +468,15 @@
int ctl_report_timestamp(struct ctl_scsiio *ctsio);
int ctl_get_lba_status(struct ctl_scsiio *ctsio);
+int ctl_nvme_identify(struct ctl_nvmeio *ctnio);
+int ctl_nvme_flush(struct ctl_nvmeio *ctnio);
+int ctl_nvme_read_write(struct ctl_nvmeio *ctnio);
+int ctl_nvme_write_uncorrectable(struct ctl_nvmeio *ctnio);
+int ctl_nvme_compare(struct ctl_nvmeio *ctnio);
+int ctl_nvme_write_zeroes(struct ctl_nvmeio *ctnio);
+int ctl_nvme_dataset_management(struct ctl_nvmeio *ctnio);
+int ctl_nvme_verify(struct ctl_nvmeio *ctnio);
+
void ctl_tpc_init(struct ctl_softc *softc);
void ctl_tpc_shutdown(struct ctl_softc *softc);
void ctl_tpc_lun_init(struct ctl_lun *lun);
diff --git a/sys/conf/files b/sys/conf/files
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -109,6 +109,7 @@
cam/ctl/ctl_frontend_iscsi.c optional ctl cfiscsi
cam/ctl/ctl_ha.c optional ctl
cam/ctl/ctl_nvme_all.c optional ctl
+cam/ctl/ctl_nvme_cmd_table.c optional ctl
cam/ctl/ctl_scsi_all.c optional ctl
cam/ctl/ctl_tpc.c optional ctl
cam/ctl/ctl_tpc_local.c optional ctl
diff --git a/sys/modules/ctl/Makefile b/sys/modules/ctl/Makefile
--- a/sys/modules/ctl/Makefile
+++ b/sys/modules/ctl/Makefile
@@ -13,6 +13,7 @@
SRCS+= ctl_frontend_ioctl.c
SRCS+= ctl_ha.c
SRCS+= ctl_nvme_all.c
+SRCS+= ctl_nvme_cmd_table.c
SRCS+= ctl_scsi_all.c
SRCS+= ctl_tpc.c
SRCS+= ctl_tpc_local.c

File Metadata

Mime Type
text/plain
Expires
Wed, Jan 15, 4:46 PM (14 h, 18 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
15805391
Default Alt Text
D44720.id137971.diff (28 KB)

Event Timeline