diff --git a/sys/cam/ctl/ctl.c b/sys/cam/ctl/ctl.c
--- a/sys/cam/ctl/ctl.c
+++ b/sys/cam/ctl/ctl.c
@@ -83,6 +83,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
@@ -447,6 +448,8 @@
 static void ctl_failover_lun(union ctl_io *io);
 static void ctl_scsiio_precheck(struct ctl_scsiio *ctsio);
 static int ctl_scsiio(struct ctl_scsiio *ctsio);
+static void ctl_nvmeio_precheck(struct ctl_nvmeio *ctnio);
+static int ctl_nvmeio(struct ctl_nvmeio *ctnio);
 static int ctl_target_reset(union ctl_io *io);
 static void ctl_do_lun_reset(struct ctl_lun *lun, uint32_t initidx,
@@ -528,6 +531,38 @@
 DECLARE_MODULE(ctl, ctl_moduledata, SI_SUB_CONFIGURE, SI_ORDER_THIRD);
 MODULE_VERSION(ctl, 1);
 
+static void
+ctl_be_move_done(union ctl_io *io, bool samethr)
+{
+	switch (io->io_hdr.io_type) {
+	case CTL_IO_SCSI:
+		io->scsiio.be_move_done(io, samethr);
+		break;
+	case CTL_IO_NVME:
+	case CTL_IO_NVME_ADMIN:
+		io->nvmeio.be_move_done(io, samethr);
+		break;
+	default:
+		__assert_unreachable();
+	}
+}
+
+static void
+ctl_continue_io(union ctl_io *io)
+{
+	switch (io->io_hdr.io_type) {
+	case CTL_IO_SCSI:
+		io->scsiio.io_cont(io);
+		break;
+	case CTL_IO_NVME:
+	case CTL_IO_NVME_ADMIN:
+		io->nvmeio.io_cont(io);
+		break;
+	default:
+		__assert_unreachable();
+	}
+}
+
 static struct ctl_frontend ha_frontend =
 {
 	.name = "ha",
@@ -585,6 +620,9 @@
 	uint32_t sg_entries_sent;
 	int do_sg_copy, i, j;
 
+	KASSERT(io->io_hdr.io_type == CTL_IO_SCSI,
+	    ("%s: unexpected I/O type %x", __func__, io->io_hdr.io_type));
+
 	memset(&msg.dt, 0, sizeof(msg.dt));
 	msg.hdr.msg_type = CTL_MSG_DATAMOVE;
 	msg.hdr.original_sc = io->io_hdr.remote_io;
@@ -601,32 +639,32 @@
 	 * us to get more than CTL_HA_MAX_SG_ENTRIES S/G entries,
 	 * then we need to break this up into multiple transfers.
 	 */
-	if (io->scsiio.kern_sg_entries == 0) {
+	if (ctl_kern_sg_entries(io) == 0) {
 		msg.dt.kern_sg_entries = 1;
 #if 0
 		if (io->io_hdr.flags & CTL_FLAG_BUS_ADDR) {
-			msg.dt.sg_list[0].addr = io->scsiio.kern_data_ptr;
+			msg.dt.sg_list[0].addr = ctl_kern_data_ptr(io);
 		} else {
 			/* XXX KDM use busdma here!
*/ msg.dt.sg_list[0].addr = - (void *)vtophys(io->scsiio.kern_data_ptr); + (void *)vtophys(ctl_kern_data_ptr(io)); } #else KASSERT((io->io_hdr.flags & CTL_FLAG_BUS_ADDR) == 0, ("HA does not support BUS_ADDR")); - msg.dt.sg_list[0].addr = io->scsiio.kern_data_ptr; + msg.dt.sg_list[0].addr = ctl_kern_data_ptr(io); #endif - msg.dt.sg_list[0].len = io->scsiio.kern_data_len; + msg.dt.sg_list[0].len = ctl_kern_data_len(io); do_sg_copy = 0; } else { - msg.dt.kern_sg_entries = io->scsiio.kern_sg_entries; + msg.dt.kern_sg_entries = ctl_kern_sg_entries(io); do_sg_copy = 1; } - msg.dt.kern_data_len = io->scsiio.kern_data_len; - msg.dt.kern_total_len = io->scsiio.kern_total_len; - msg.dt.kern_data_resid = io->scsiio.kern_data_resid; - msg.dt.kern_rel_offset = io->scsiio.kern_rel_offset; + msg.dt.kern_data_len = ctl_kern_data_len(io); + msg.dt.kern_total_len = ctl_kern_total_len(io); + msg.dt.kern_data_resid = ctl_kern_data_resid(io); + msg.dt.kern_rel_offset = ctl_kern_rel_offset(io); msg.dt.sg_sequence = 0; /* @@ -640,7 +678,7 @@ sizeof(msg.dt.sg_list[0])), msg.dt.kern_sg_entries - sg_entries_sent); if (do_sg_copy != 0) { - sgl = (struct ctl_sg_entry *)io->scsiio.kern_data_ptr; + sgl = (struct ctl_sg_entry *)ctl_kern_data_ptr(io); for (i = sg_entries_sent, j = 0; i < msg.dt.cur_sg_entries; i++, j++) { #if 0 @@ -1496,6 +1534,10 @@ /* XXX KDM do something here */ break; } + KASSERT(io->io_hdr.io_type == CTL_IO_SCSI, + ("%s: unexpected I/O type %x", __func__, + io->io_hdr.io_type)); + io->io_hdr.msg_type = CTL_MSG_DATAMOVE; io->io_hdr.flags |= CTL_FLAG_IO_ACTIVE; /* @@ -1569,6 +1611,10 @@ * back to the initiator. */ io = msg->hdr.serializing_sc; + KASSERT(io->io_hdr.io_type == CTL_IO_SCSI, + ("%s: unexpected I/O type %x", __func__, + io->io_hdr.io_type)); + io->io_hdr.msg_type = CTL_MSG_DATAMOVE_DONE; io->io_hdr.flags &= ~CTL_FLAG_DMA_INPROG; io->io_hdr.flags |= CTL_FLAG_IO_ACTIVE; @@ -2410,6 +2456,10 @@ union ctl_io *io = (union ctl_io *)ioh; struct ctl_ooa_entry *entry; + KASSERT(io->io_hdr.io_type == CTL_IO_SCSI, + ("%s: unexpected I/O type %x", __func__, + io->io_hdr.io_type)); + /* * If we've got more than we can fit, just count the * remaining entries. @@ -4923,8 +4973,6 @@ int retval; CTL_DEBUG_PRINT(("ctl_config_move_done\n")); - KASSERT(io->io_hdr.io_type == CTL_IO_SCSI, - ("%s: unexpected I/O type %x", __func__, io->io_hdr.io_type)); if (ctl_debug & CTL_DEBUG_CDB_DATA) ctl_data_print(io); @@ -4938,7 +4986,7 @@ * we'll need to know how to clean them up here as well. */ if (io->io_hdr.flags & CTL_FLAG_ALLOCATED) - free(io->scsiio.kern_data_ptr, M_CTL); + free(ctl_kern_data_ptr(io), M_CTL); ctl_done(io); retval = CTL_RETVAL_COMPLETE; } else { @@ -4959,7 +5007,17 @@ * XXX KDM call ctl_scsiio() again for now, and check flag * bits to see whether we're allocated or not. 
*/ - retval = ctl_scsiio(&io->scsiio); + switch (io->io_hdr.io_type) { + case CTL_IO_SCSI: + retval = ctl_scsiio(&io->scsiio); + break; + case CTL_IO_NVME: + case CTL_IO_NVME_ADMIN: + retval = ctl_nvmeio(&io->nvmeio); + break; + default: + __assert_unreachable(); + } } return (retval); } @@ -4983,7 +5041,7 @@ (io->io_hdr.flags & CTL_FLAG_ABORT) == 0 && ((io->io_hdr.status & CTL_STATUS_MASK) == CTL_STATUS_NONE || (io->io_hdr.status & CTL_STATUS_MASK) == CTL_SUCCESS)) { - io->scsiio.io_cont(io); + ctl_continue_io(io); return; } ctl_done(io); @@ -5010,7 +5068,7 @@ (io->io_hdr.flags & CTL_FLAG_ABORT) == 0 && ((io->io_hdr.status & CTL_STATUS_MASK) == CTL_STATUS_NONE || (io->io_hdr.status & CTL_STATUS_MASK) == CTL_SUCCESS)) { - io->scsiio.io_cont(io); + ctl_continue_io(io); return; } /* @@ -5019,7 +5077,7 @@ * no data, like start/stop unit, we need to check here. */ if (io->io_hdr.flags & CTL_FLAG_ALLOCATED) - buf = io->scsiio.kern_data_ptr; + buf = ctl_kern_data_ptr(io); else buf = NULL; ctl_done(io); @@ -5039,7 +5097,7 @@ ((io->io_hdr.status & CTL_STATUS_MASK) != CTL_STATUS_NONE && (io->io_hdr.status & CTL_STATUS_MASK) != CTL_SUCCESS)) { if (io->io_hdr.flags & CTL_FLAG_ALLOCATED) - buf = io->scsiio.kern_data_ptr; + buf = ctl_kern_data_ptr(io); else buf = NULL; ctl_done(io); @@ -5054,7 +5112,7 @@ * the I/O just yet. */ if (io->io_hdr.flags & CTL_FLAG_IO_CONT) { - io->scsiio.io_cont(io); + ctl_continue_io(io); return; } @@ -5512,6 +5570,9 @@ struct ctl_lba_len_flags *lbalen; int retval; + KASSERT(io->io_hdr.io_type == CTL_IO_SCSI, + ("%s: unexpected I/O type %x", __func__, io->io_hdr.io_type)); + ctsio = &io->scsiio; ctsio->io_hdr.status = CTL_STATUS_NONE; lbalen = (struct ctl_lba_len_flags *) @@ -5857,6 +5918,9 @@ uint16_t *len_left, *len_used; int retval, i; + KASSERT(io->io_hdr.io_type == CTL_IO_SCSI, + ("%s: unexpected I/O type %x", __func__, io->io_hdr.io_type)); + ctsio = &io->scsiio; page_index = NULL; page_len = 0; @@ -8770,6 +8834,9 @@ struct ctl_lba_len_flags *lbalen; int retval; + KASSERT(io->io_hdr.io_type == CTL_IO_SCSI, + ("%s: unexpected I/O type %x", __func__, io->io_hdr.io_type)); + ctsio = &io->scsiio; ctsio->io_hdr.status = CTL_STATUS_NONE; ctsio->io_hdr.flags &= ~CTL_FLAG_IO_CONT; @@ -10553,6 +10620,739 @@ return (CTL_RETVAL_COMPLETE); } +/* + * For NVMe commands, parse the LBA and length. + */ +static bool +ctl_nvme_get_lba_len(struct ctl_nvmeio *ctnio, uint64_t *lba, uint32_t *len) +{ + KASSERT(ctnio->io_hdr.io_type == CTL_IO_NVME, + ("%s: unexpected I/O type %x", __func__, ctnio->io_hdr.io_type)); + + switch (ctnio->cmd.opc) { + case NVME_OPC_WRITE: + case NVME_OPC_READ: + case NVME_OPC_WRITE_UNCORRECTABLE: + case NVME_OPC_COMPARE: + case NVME_OPC_WRITE_ZEROES: + case NVME_OPC_VERIFY: + *lba = (uint64_t)le32toh(ctnio->cmd.cdw11) << 32 | + le32toh(ctnio->cmd.cdw10); + *len = (le32toh(ctnio->cmd.cdw12) & 0xffff) + 1; + return (true); + default: + *lba = 0; + *len = 0; + return (false); + } +} + +static bool +ctl_nvme_fua(struct ctl_nvmeio *ctnio) +{ + return ((le32toh(ctnio->cmd.cdw12) & (1U << 30)) != 0); +} + +int +ctl_nvme_identify(struct ctl_nvmeio *ctnio) +{ + struct ctl_lun *lun = CTL_LUN(ctnio); + size_t len; + int retval; + uint8_t cns; + + CTL_DEBUG_PRINT(("ctl_nvme_identify\n")); + + KASSERT(ctnio->io_hdr.io_type == CTL_IO_NVME_ADMIN, + ("%s: unexpected I/O type %x", __func__, ctnio->io_hdr.io_type)); + MPASS(ctnio->cmd.opc == NVME_OPC_IDENTIFY); + + /* + * The data buffer for Identify is always 4096 bytes, see + * 5.51.1 in NVMe base specification 1.4. 
+ */ + len = 4096; + + ctnio->kern_data_ptr = malloc(len, M_CTL, M_WAITOK); + ctnio->kern_data_len = len; + ctnio->kern_total_len = len; + ctnio->kern_rel_offset = 0; + ctnio->kern_sg_entries = 0; + + ctl_nvme_set_success(ctnio); + ctnio->io_hdr.flags |= CTL_FLAG_ALLOCATED; + ctnio->be_move_done = ctl_config_move_done; + + /* + * If we don't have a LUN, return an empty result for CNS == 0. + */ + if (lun == NULL) { + cns = le32toh(ctnio->cmd.cdw10) & 0xff; + switch (cns) { + case 0: + memset(ctnio->kern_data_ptr, 0, len); + ctl_datamove((union ctl_io *)ctnio); + break; + default: + ctl_nvme_set_invalid_field(ctnio); + break; + } + return (CTL_RETVAL_COMPLETE); + } + + retval = lun->backend->config_read((union ctl_io *)ctnio); + return (retval); +} + +int +ctl_nvme_flush(struct ctl_nvmeio *ctnio) +{ + struct ctl_lun *lun = CTL_LUN(ctnio); + int retval; + + CTL_DEBUG_PRINT(("ctl_nvme_flush\n")); + + KASSERT(ctnio->io_hdr.io_type == CTL_IO_NVME, + ("%s: unexpected I/O type %x", __func__, ctnio->io_hdr.io_type)); + MPASS(ctnio->cmd.opc == NVME_OPC_FLUSH); + + /* + * NVMe flushes always flush the entire namespace, not an LBA + * range. + */ + retval = lun->backend->config_write((union ctl_io *)ctnio); + + return (retval); +} + +int +ctl_nvme_read_write(struct ctl_nvmeio *ctnio) +{ + struct ctl_lun *lun = CTL_LUN(ctnio); + struct ctl_lba_len_flags *lbalen; + uint64_t lba; + uint32_t num_blocks; + int flags, retval; + bool isread; + + CTL_DEBUG_PRINT(("ctl_nvme_read_write: command: %#x\n", + ctnio->cmd.opc)); + + KASSERT(ctnio->io_hdr.io_type == CTL_IO_NVME, + ("%s: unexpected I/O type %x", __func__, ctnio->io_hdr.io_type)); + MPASS(ctnio->cmd.opc == NVME_OPC_WRITE || + ctnio->cmd.opc == NVME_OPC_READ); + + flags = 0; + isread = ctnio->cmd.opc == NVME_OPC_READ; + ctl_nvme_get_lba_len(ctnio, &lba, &num_blocks); + + /* + * The first check is to make sure we're in bounds, the second + * check is to catch wrap-around problems. If the lba + num blocks + * is less than the lba, then we've wrapped around and the block + * range is invalid anyway. + */ + if (((lba + num_blocks) > (lun->be_lun->maxlba + 1)) + || ((lba + num_blocks) < lba)) { + ctl_nvme_set_lba_out_of_range(ctnio); + ctl_done((union ctl_io *)ctnio); + return (CTL_RETVAL_COMPLETE); + } + + /* + * Set FUA and/or DPO if caches are disabled. + * + * For a read this may not be quite correct for the block + * backend as any earlier writes to the LBA range should be + * flushed to backing store as part of the read. + */ + if (ctl_nvme_fua(ctnio)) { + flags |= CTL_LLF_FUA; + if (isread) + flags |= CTL_LLF_DPO; + } + + lbalen = (struct ctl_lba_len_flags *) + &ctnio->io_hdr.ctl_private[CTL_PRIV_LBA_LEN]; + lbalen->lba = lba; + lbalen->len = num_blocks; + lbalen->flags = (isread ? 
CTL_LLF_READ : CTL_LLF_WRITE) | flags; + + ctnio->kern_total_len = num_blocks * lun->be_lun->blocksize; + ctnio->kern_rel_offset = 0; + + CTL_DEBUG_PRINT(("ctl_nvme_read_write: calling data_submit()\n")); + + retval = lun->backend->data_submit((union ctl_io *)ctnio); + return (retval); +} + +int +ctl_nvme_write_uncorrectable(struct ctl_nvmeio *ctnio) +{ + struct ctl_lun *lun = CTL_LUN(ctnio); + struct ctl_lba_len_flags *lbalen; + uint64_t lba; + uint32_t num_blocks; + int retval; + + CTL_DEBUG_PRINT(("ctl_nvme_write_uncorrectable\n")); + + KASSERT(ctnio->io_hdr.io_type == CTL_IO_NVME, + ("%s: unexpected I/O type %x", __func__, ctnio->io_hdr.io_type)); + MPASS(ctnio->cmd.opc == NVME_OPC_WRITE_UNCORRECTABLE); + + ctl_nvme_get_lba_len(ctnio, &lba, &num_blocks); + + /* + * The first check is to make sure we're in bounds, the second + * check is to catch wrap-around problems. If the lba + num blocks + * is less than the lba, then we've wrapped around and the block + * range is invalid anyway. + */ + if (((lba + num_blocks) > (lun->be_lun->maxlba + 1)) + || ((lba + num_blocks) < lba)) { + ctl_nvme_set_lba_out_of_range(ctnio); + ctl_done((union ctl_io *)ctnio); + return (CTL_RETVAL_COMPLETE); + } + + lbalen = (struct ctl_lba_len_flags *) + &ctnio->io_hdr.ctl_private[CTL_PRIV_LBA_LEN]; + lbalen->lba = lba; + lbalen->len = num_blocks; + lbalen->flags = 0; + retval = lun->backend->config_write((union ctl_io *)ctnio); + + return (retval); +} + +int +ctl_nvme_compare(struct ctl_nvmeio *ctnio) +{ + struct ctl_lun *lun = CTL_LUN(ctnio); + struct ctl_lba_len_flags *lbalen; + uint64_t lba; + uint32_t num_blocks; + int flags; + int retval; + + CTL_DEBUG_PRINT(("ctl_nvme_compare\n")); + + KASSERT(ctnio->io_hdr.io_type == CTL_IO_NVME, + ("%s: unexpected I/O type %x", __func__, ctnio->io_hdr.io_type)); + MPASS(ctnio->cmd.opc == NVME_OPC_COMPARE); + + flags = 0; + ctl_nvme_get_lba_len(ctnio, &lba, &num_blocks); + if (ctl_nvme_fua(ctnio)) + flags |= CTL_LLF_FUA; + + /* + * The first check is to make sure we're in bounds, the second + * check is to catch wrap-around problems. If the lba + num blocks + * is less than the lba, then we've wrapped around and the block + * range is invalid anyway. + */ + if (((lba + num_blocks) > (lun->be_lun->maxlba + 1)) + || ((lba + num_blocks) < lba)) { + ctl_nvme_set_lba_out_of_range(ctnio); + ctl_done((union ctl_io *)ctnio); + return (CTL_RETVAL_COMPLETE); + } + + lbalen = (struct ctl_lba_len_flags *) + &ctnio->io_hdr.ctl_private[CTL_PRIV_LBA_LEN]; + lbalen->lba = lba; + lbalen->len = num_blocks; + lbalen->flags = CTL_LLF_COMPARE | flags; + ctnio->kern_total_len = num_blocks * lun->be_lun->blocksize; + ctnio->kern_rel_offset = 0; + + CTL_DEBUG_PRINT(("ctl_nvme_compare: calling data_submit()\n")); + retval = lun->backend->data_submit((union ctl_io *)ctnio); + return (retval); +} + +int +ctl_nvme_write_zeroes(struct ctl_nvmeio *ctnio) +{ + struct ctl_lun *lun = CTL_LUN(ctnio); + struct ctl_lba_len_flags *lbalen; + uint64_t lba; + uint32_t num_blocks; + int retval; + + CTL_DEBUG_PRINT(("ctl_nvme_write_zeroes\n")); + + KASSERT(ctnio->io_hdr.io_type == CTL_IO_NVME, + ("%s: unexpected I/O type %x", __func__, ctnio->io_hdr.io_type)); + MPASS(ctnio->cmd.opc == NVME_OPC_WRITE_ZEROES); + + ctl_nvme_get_lba_len(ctnio, &lba, &num_blocks); + + /* + * The first check is to make sure we're in bounds, the second + * check is to catch wrap-around problems. If the lba + num blocks + * is less than the lba, then we've wrapped around and the block + * range is invalid anyway. 
+ */ + if (((lba + num_blocks) > (lun->be_lun->maxlba + 1)) + || ((lba + num_blocks) < lba)) { + ctl_nvme_set_lba_out_of_range(ctnio); + ctl_done((union ctl_io *)ctnio); + return (CTL_RETVAL_COMPLETE); + } + + lbalen = (struct ctl_lba_len_flags *) + &ctnio->io_hdr.ctl_private[CTL_PRIV_LBA_LEN]; + lbalen->lba = lba; + lbalen->len = num_blocks; + lbalen->flags = 0; + retval = lun->backend->config_write((union ctl_io *)ctnio); + + return (retval); +} + +int +ctl_nvme_dataset_management(struct ctl_nvmeio *ctnio) +{ + struct ctl_lun *lun = CTL_LUN(ctnio); + struct nvme_dsm_range *r; + uint64_t lba; + uint32_t len, num_blocks; + u_int i, ranges; + int retval; + + CTL_DEBUG_PRINT(("ctl_nvme_dataset_management\n")); + + KASSERT(ctnio->io_hdr.io_type == CTL_IO_NVME, + ("%s: unexpected I/O type %x", __func__, ctnio->io_hdr.io_type)); + MPASS(ctnio->cmd.opc == NVME_OPC_DATASET_MANAGEMENT); + + ranges = le32toh(ctnio->cmd.cdw10) & 0xff; + len = ranges * sizeof(struct nvme_dsm_range); + + /* + * If we've got a kernel request that hasn't been malloced yet, + * malloc it and tell the caller the data buffer is here. + */ + if ((ctnio->io_hdr.flags & CTL_FLAG_ALLOCATED) == 0) { + ctnio->kern_data_ptr = malloc(len, M_CTL, M_WAITOK); + ctnio->kern_data_len = len; + ctnio->kern_total_len = len; + ctnio->kern_rel_offset = 0; + ctnio->kern_sg_entries = 0; + ctnio->io_hdr.flags |= CTL_FLAG_ALLOCATED; + ctnio->be_move_done = ctl_config_move_done; + ctl_datamove((union ctl_io *)ctnio); + + return (CTL_RETVAL_COMPLETE); + } + + /* + * Require a flat buffer of the correct size. + */ + if (ctnio->kern_sg_entries > 0 || + ctnio->kern_total_len - ctnio->kern_data_resid != len) + return (CTL_RETVAL_ERROR); + + /* + * Verify that none of the ranges are out of bounds. + */ + r = (struct nvme_dsm_range *)ctnio->kern_data_ptr; + for (i = 0; i < ranges; i++) { + lba = le64toh(r[i].starting_lba); + num_blocks = le32toh(r[i].length); + if (((lba + num_blocks) > (lun->be_lun->maxlba + 1)) + || ((lba + num_blocks) < lba)) { + ctl_nvme_set_lba_out_of_range(ctnio); + ctl_done((union ctl_io *)ctnio); + return (CTL_RETVAL_COMPLETE); + } + } + + CTL_DEBUG_PRINT(("ctl_nvme_dataset_management: calling config_write()\n")); + retval = lun->backend->config_write((union ctl_io *)ctnio); + return (retval); +} + +int +ctl_nvme_verify(struct ctl_nvmeio *ctnio) +{ + struct ctl_lun *lun = CTL_LUN(ctnio); + struct ctl_lba_len_flags *lbalen; + uint64_t lba; + uint32_t num_blocks; + int flags; + int retval; + + CTL_DEBUG_PRINT(("ctl_nvme_verify\n")); + + KASSERT(ctnio->io_hdr.io_type == CTL_IO_NVME, + ("%s: unexpected I/O type %x", __func__, ctnio->io_hdr.io_type)); + MPASS(ctnio->cmd.opc == NVME_OPC_VERIFY); + + flags = 0; + ctl_nvme_get_lba_len(ctnio, &lba, &num_blocks); + if (ctl_nvme_fua(ctnio)) + flags |= CTL_LLF_FUA; + + /* + * The first check is to make sure we're in bounds, the second + * check is to catch wrap-around problems. If the lba + num blocks + * is less than the lba, then we've wrapped around and the block + * range is invalid anyway. 
+ */ + if (((lba + num_blocks) > (lun->be_lun->maxlba + 1)) + || ((lba + num_blocks) < lba)) { + ctl_nvme_set_lba_out_of_range(ctnio); + ctl_done((union ctl_io *)ctnio); + return (CTL_RETVAL_COMPLETE); + } + + lbalen = (struct ctl_lba_len_flags *) + &ctnio->io_hdr.ctl_private[CTL_PRIV_LBA_LEN]; + lbalen->lba = lba; + lbalen->len = num_blocks; + lbalen->flags = CTL_LLF_VERIFY | flags; + ctnio->kern_total_len = 0; + ctnio->kern_rel_offset = 0; + + CTL_DEBUG_PRINT(("ctl_nvme_verify: calling data_submit()\n")); + retval = lun->backend->data_submit((union ctl_io *)ctnio); + return (retval); +} + +static const struct ctl_nvme_cmd_entry * +ctl_nvme_get_cmd_entry(struct ctl_nvmeio *ctnio) +{ + const struct ctl_nvme_cmd_entry *entry; + + switch (ctnio->io_hdr.io_type) { + case CTL_IO_NVME: + entry = &nvme_nvm_cmd_table[ctnio->cmd.opc]; + break; + case CTL_IO_NVME_ADMIN: + entry = &nvme_admin_cmd_table[ctnio->cmd.opc]; + break; + default: + __assert_unreachable(); + } + return (entry); +} + +static const struct ctl_nvme_cmd_entry * +ctl_nvme_validate_command(struct ctl_nvmeio *ctnio) +{ + const struct ctl_nvme_cmd_entry *entry; + + entry = ctl_nvme_get_cmd_entry(ctnio); + if (entry->execute == NULL) { + ctl_nvme_set_invalid_opcode(ctnio); + ctl_done((union ctl_io *)ctnio); + return (NULL); + } + + /* Validate fused commands. */ + switch (NVMEV(NVME_CMD_FUSE, ctnio->cmd.fuse)) { + case 0x00: + /* Non-fused. */ + break; + case 0x01: + /* First half. */ + if (ctnio->io_hdr.io_type != CTL_IO_NVME || + ctnio->cmd.opc != NVME_OPC_COMPARE) { + ctl_nvme_set_invalid_field(ctnio); + ctl_done((union ctl_io *)ctnio); + return (NULL); + } + break; + case 0x02: + /* Second half. */ + if (ctnio->io_hdr.io_type != CTL_IO_NVME || + ctnio->cmd.opc != NVME_OPC_COMPARE) { + ctl_nvme_set_invalid_field(ctnio); + ctl_done((union ctl_io *)ctnio); + return (NULL); + } + break; + case 0x03: + ctl_nvme_set_invalid_field(ctnio); + ctl_done((union ctl_io *)ctnio); + return (NULL); + } + + return (entry); +} + +/* + * This is a simpler version of ctl_scsiio_lun_check that fails + * requests on a LUN without active media. + * + * Returns true if the command has been completed with an error. + */ +static bool +ctl_nvmeio_lun_check(struct ctl_lun *lun, + const struct ctl_nvme_cmd_entry *entry, struct ctl_nvmeio *ctnio) +{ + mtx_assert(&lun->lun_lock, MA_OWNED); + + if ((entry->flags & CTL_CMD_FLAG_OK_ON_NO_MEDIA) == 0) { + if ((lun->flags & (CTL_LUN_EJECTED | CTL_LUN_NO_MEDIA | + CTL_LUN_STOPPED)) != 0) { + ctl_nvme_set_namespace_not_ready(ctnio); + return (true); + } + } + + return (false); +} + +/* + * Check for blockage against the OOA (Order Of Arrival) queue. + * Assumptions: + * - pending_io is generally either incoming, or on the blocked queue + * - starting I/O is the I/O we want to start the check with. + */ +static ctl_action +ctl_nvme_check_ooa(struct ctl_lun *lun, union ctl_io *pending_io, + union ctl_io **starting_io, union ctl_io **aborted_io) +{ + union ctl_io *ooa_io = *starting_io; + + KASSERT(pending_io->io_hdr.io_type == CTL_IO_NVME || + pending_io->io_hdr.io_type == CTL_IO_NVME_ADMIN, + ("%s: unexpected I/O type %x", __func__, + pending_io->io_hdr.io_type)); + + mtx_assert(&lun->lun_lock, MA_OWNED); + + *aborted_io = NULL; + + /* + * Aborted commands are not going to be executed and may even + * not report completion, so we don't care about their order. + * Let them complete ASAP to clean the OOA queue. 
+ */ + if (__predict_false(pending_io->io_hdr.flags & CTL_FLAG_ABORT)) + return (CTL_ACTION_PASS); + + /* + * NVMe has rather simple command ordering requirements. In + * particular, there is no requirement on the controller to + * enforce a specific order for overlapping LBAs. The only + * constraint is that fused operations (Compare and Write), + * must be completed as a unit. + * + * To support fused operations, the following strategy is used: + * - the first half of a fused command is not enqueued to rtr + * until the second half is enqueued + * - the second half of a fused command blocks on the first + * half of a fuse command + * - subsequent commands block on the second half of the + * fused command + */ + + /* + * Is the previously submitted command the first half of a + * fused operation? + */ + if (ooa_io != NULL && + NVMEV(NVME_CMD_FUSE, ooa_io->nvmeio.cmd.fuse) == 0x01) { + /* + * If this is the second half, enqueue the first half + * and block the second half on the first half. + */ + if (NVMEV(NVME_CMD_FUSE, pending_io->nvmeio.cmd.fuse) == + 0x02) { + /* + * XXX: Do we need to wait for other rtr requests + * to drain so this is truly atomic? + */ + return (CTL_ACTION_FUSED); + } + + /* Abort the first half. */ + ctl_nvme_set_missing_fused_command(&ooa_io->nvmeio); + *aborted_io = ooa_io; + } else { + switch (NVMEV(NVME_CMD_FUSE, pending_io->nvmeio.cmd.fuse)) { + case 0x01: + /* First half, wait for the second half. */ + return (CTL_ACTION_SKIP); + case 0x02: + /* Second half without a matching first half, abort. */ + ctl_nvme_set_missing_fused_command(&pending_io->nvmeio); + *aborted_io = pending_io; + return (CTL_ACTION_SKIP); + } + } + + /* + * Scan the OOA queue looking for the most recent second half + * of a fused op. + */ + for (; ooa_io != NULL; + ooa_io = (union ctl_io *)LIST_NEXT(&ooa_io->io_hdr, ooa_links)) { + if (NVMEV(NVME_CMD_FUSE, ooa_io->nvmeio.cmd.fuse) == 0x02) { + *starting_io = ooa_io; + return (CTL_ACTION_BLOCK); + } + } + + *starting_io = NULL; + return (CTL_ACTION_PASS); +} + +static void +ctl_nvmeio_precheck(struct ctl_nvmeio *ctnio) +{ + struct ctl_softc *softc = CTL_SOFTC(ctnio); + struct ctl_lun *lun; + const struct ctl_nvme_cmd_entry *entry; + union ctl_io *bio, *aborted_io; + uint32_t targ_lun; + + lun = NULL; + targ_lun = ctnio->io_hdr.nexus.targ_mapped_lun; + if (targ_lun < ctl_max_luns) + lun = softc->ctl_luns[targ_lun]; + if (lun != NULL) { + /* + * If the LUN is invalid, pretend that it doesn't exist. + * It will go away as soon as all pending I/O has been + * completed. + */ + mtx_lock(&lun->lun_lock); + if (lun->flags & CTL_LUN_DISABLED) { + mtx_unlock(&lun->lun_lock); + lun = NULL; + } + } + CTL_LUN(ctnio) = lun; + if (lun != NULL) { + CTL_BACKEND_LUN(ctnio) = lun->be_lun; + + /* + * Every I/O goes into the OOA queue for a particular LUN, + * and stays there until completion. + */ +#ifdef CTL_TIME_IO + if (LIST_EMPTY(&lun->ooa_queue)) + lun->idle_time += getsbinuptime() - lun->last_busy; +#endif + LIST_INSERT_HEAD(&lun->ooa_queue, &ctnio->io_hdr, ooa_links); + } + + /* Get command entry and return error if it is unsupported. */ + entry = ctl_nvme_validate_command(ctnio); + if (entry == NULL) { + if (lun) + mtx_unlock(&lun->lun_lock); + return; + } + + ctnio->io_hdr.flags &= ~CTL_FLAG_DATA_MASK; + ctnio->io_hdr.flags |= entry->flags & CTL_FLAG_DATA_MASK; + + /* All NVMe commands other than IDENTIFY require a LUN. 
*/ + if (lun == NULL) { + if (entry->flags & CTL_CMD_FLAG_OK_ON_NO_LUN) { + ctnio->io_hdr.flags |= CTL_FLAG_IS_WAS_ON_RTR; + ctl_enqueue_rtr((union ctl_io *)ctnio); + return; + } + + ctl_nvme_set_invalid_namespace(ctnio); + ctl_done((union ctl_io *)ctnio); + CTL_DEBUG_PRINT(("ctl_nvmeio_precheck: bailing out due to invalid LUN\n")); + return; + } else { + /* + * NVMe namespaces can only be backed by T_DIRECT LUNs. + */ + if (lun->be_lun->lun_type != T_DIRECT) { + mtx_unlock(&lun->lun_lock); + ctl_nvme_set_invalid_namespace(ctnio); + ctl_done((union ctl_io *)ctnio); + return; + } + } + + if (ctl_nvmeio_lun_check(lun, entry, ctnio) != 0) { + mtx_unlock(&lun->lun_lock); + ctl_done((union ctl_io *)ctnio); + return; + } + + bio = (union ctl_io *)LIST_NEXT(&ctnio->io_hdr, ooa_links); + switch (ctl_nvme_check_ooa(lun, (union ctl_io *)ctnio, &bio, + &aborted_io)) { + case CTL_ACTION_PASS: + ctnio->io_hdr.flags |= CTL_FLAG_IS_WAS_ON_RTR; + mtx_unlock(&lun->lun_lock); + ctl_enqueue_rtr((union ctl_io *)ctnio); + break; + case CTL_ACTION_FUSED: + /* Block the second half on the first half. */ + ctnio->io_hdr.blocker = bio; + TAILQ_INSERT_TAIL(&bio->io_hdr.blocked_queue, &ctnio->io_hdr, + blocked_links); + + /* Pass the first half. */ + bio->io_hdr.flags |= CTL_FLAG_IS_WAS_ON_RTR; + mtx_unlock(&lun->lun_lock); + ctl_enqueue_rtr(bio); + break; + case CTL_ACTION_SKIP: + mtx_unlock(&lun->lun_lock); + break; + case CTL_ACTION_BLOCK: + ctnio->io_hdr.blocker = bio; + TAILQ_INSERT_TAIL(&bio->io_hdr.blocked_queue, &ctnio->io_hdr, + blocked_links); + mtx_unlock(&lun->lun_lock); + break; + default: + __assert_unreachable(); + } + if (aborted_io != NULL) + ctl_done(aborted_io); +} + +static int +ctl_nvmeio(struct ctl_nvmeio *ctnio) +{ + const struct ctl_nvme_cmd_entry *entry; + int retval; + + CTL_DEBUG_PRINT(("ctl_nvmeio %s opc=%02X\n", + ctnio->io_hdr.io_type == CTL_IO_NVME ? "nvm" : "admin", + ctnio->cmd.opc)); + + entry = ctl_nvme_get_cmd_entry(ctnio); + MPASS(entry != NULL); + + /* + * If this I/O has been aborted, just send it straight to + * ctl_done() without executing it. + */ + if (ctnio->io_hdr.flags & CTL_FLAG_ABORT) { + ctl_done((union ctl_io *)ctnio); + return (CTL_RETVAL_COMPLETE); + } + + /* + * All the checks should have been handled by ctl_nvmeio_precheck(). + * We should be clear now to just execute the I/O. + */ + retval = entry->execute(ctnio); + + return (retval); +} + /* * For known CDB types, parse the LBA and length. */ @@ -10812,6 +11612,11 @@ ctl_check_for_blockage(struct ctl_lun *lun, union ctl_io *pending_io, const uint8_t *serialize_row, union ctl_io *ooa_io) { + KASSERT(pending_io->io_hdr.io_type == CTL_IO_SCSI, + ("%s: unexpected I/O type %x", __func__, + pending_io->io_hdr.io_type)); + KASSERT(ooa_io->io_hdr.io_type == CTL_IO_SCSI, + ("%s: unexpected I/O type %x", __func__, ooa_io->io_hdr.io_type)); /* * The initiator attempted multiple untagged commands at the same @@ -10920,6 +11725,10 @@ const uint8_t *serialize_row; ctl_action action; + KASSERT(pending_io->io_hdr.io_type == CTL_IO_SCSI, + ("%s: unexpected I/O type %x", __func__, + pending_io->io_hdr.io_type)); + mtx_assert(&lun->lun_lock, MA_OWNED); /* @@ -10969,7 +11778,7 @@ * we know for sure that the blocker I/O does no longer count. 
*/ static void -ctl_try_unblock_io(struct ctl_lun *lun, union ctl_io *io, bool skip) +ctl_scsi_try_unblock_io(struct ctl_lun *lun, union ctl_io *io, bool skip) { struct ctl_softc *softc = lun->ctl_softc; union ctl_io *bio, *obio; @@ -10977,6 +11786,9 @@ union ctl_ha_msg msg_info; ctl_action action; + KASSERT(io->io_hdr.io_type == CTL_IO_SCSI, + ("%s: unexpected I/O type %x", __func__, io->io_hdr.io_type)); + mtx_assert(&lun->lun_lock, MA_OWNED); if (io->io_hdr.blocker == NULL) @@ -11062,6 +11874,73 @@ } } +static void +ctl_nvme_try_unblock_io(struct ctl_lun *lun, union ctl_io *io, bool skip) +{ + union ctl_io *bio; + const struct ctl_nvme_cmd_entry *entry; + + KASSERT(io->io_hdr.io_type == CTL_IO_NVME || + io->io_hdr.io_type == CTL_IO_NVME_ADMIN, + ("%s: unexpected I/O type %x", __func__, io->io_hdr.io_type)); + + mtx_assert(&lun->lun_lock, MA_OWNED); + + if (io->io_hdr.blocker == NULL) + return; + + /* + * If this is the second half of a fused operation, it should + * be the only io on the blocked list. If the first half + * failed, complete the second half with an appropriate error. + */ + bio = io->io_hdr.blocker; + if (NVMEV(NVME_CMD_FUSE, io->nvmeio.cmd.fuse) == 0x02) { + MPASS(io == + (union ctl_io *)TAILQ_FIRST(&bio->io_hdr.blocked_queue)); + MPASS(TAILQ_NEXT(&io->io_hdr, blocked_links) == NULL); + + TAILQ_REMOVE(&bio->io_hdr.blocked_queue, &io->io_hdr, + blocked_links); + if (bio->io_hdr.status != CTL_SUCCESS) { + ctl_nvme_set_failed_fused_command(&io->nvmeio); + ctl_done(io); + return; + } + } else { + /* + * This must be a command that was blocked on the + * second half of a fused operation. + */ + MPASS(NVMEV(NVME_CMD_FUSE, bio->nvmeio.cmd.fuse) == 0x02); + TAILQ_REMOVE(&bio->io_hdr.blocked_queue, &io->io_hdr, + blocked_links); + } + + entry = ctl_nvme_get_cmd_entry(&io->nvmeio); + if (ctl_nvmeio_lun_check(lun, entry, &io->nvmeio) != 0) { + ctl_done(io); + return; + } + + io->io_hdr.flags |= CTL_FLAG_IS_WAS_ON_RTR; + ctl_enqueue_rtr(io); +} + +static void +ctl_try_unblock_io(struct ctl_lun *lun, union ctl_io *io, bool skip) +{ + switch (io->io_hdr.io_type) { + case CTL_IO_SCSI: + return (ctl_scsi_try_unblock_io(lun, io, skip)); + case CTL_IO_NVME: + case CTL_IO_NVME_ADMIN: + return (ctl_nvme_try_unblock_io(lun, io, skip)); + default: + __assert_unreachable(); + } +} + /* * Try to unblock I/Os blocked by the specified I/O. 
* @@ -11217,6 +12096,9 @@ static void ctl_failover_io(union ctl_io *io, int have_lock) { + KASSERT(io->io_hdr.io_type == CTL_IO_SCSI, + ("%s: unexpected I/O type %x", __func__, io->io_hdr.io_type)); + ctl_set_busy(&io->scsiio); ctl_done(io); } @@ -11773,6 +12655,7 @@ */ LIST_FOREACH(xioh, &lun->ooa_queue, ooa_links) { union ctl_io *xio = (union ctl_io *)xioh; + if ((targ_port == UINT32_MAX || targ_port == xioh->nexus.targ_port) && (init_id == UINT32_MAX || @@ -11784,6 +12667,9 @@ if (!other_sc && !(lun->flags & CTL_LUN_PRIMARY_SC)) { union ctl_ha_msg msg_info; + KASSERT(xio->io_hdr.io_type == CTL_IO_SCSI, + ("%s: unexpected I/O type %x", __func__, + xio->io_hdr.io_type)); msg_info.hdr.nexus = xioh->nexus; msg_info.task.task_action = CTL_TASK_ABORT_TASK; msg_info.task.tag_num = xio->scsiio.tag_num; @@ -11924,6 +12810,10 @@ */ LIST_FOREACH(xioh, &lun->ooa_queue, ooa_links) { union ctl_io *xio = (union ctl_io *)xioh; + KASSERT(xio->io_hdr.io_type == CTL_IO_SCSI, + ("%s: unexpected I/O type %x", __func__, + xio->io_hdr.io_type)); + if ((xioh->nexus.targ_port != io->io_hdr.nexus.targ_port) || (xioh->nexus.initid != io->io_hdr.nexus.initid) || (xioh->flags & CTL_FLAG_ABORT)) @@ -11995,6 +12885,10 @@ mtx_unlock(&softc->ctl_lock); LIST_FOREACH(xioh, &lun->ooa_queue, ooa_links) { union ctl_io *xio = (union ctl_io *)xioh; + KASSERT(xio->io_hdr.io_type == CTL_IO_SCSI, + ("%s: unexpected I/O type %x", __func__, + xio->io_hdr.io_type)); + if ((xioh->nexus.targ_port != io->io_hdr.nexus.targ_port) || (xioh->nexus.initid != io->io_hdr.nexus.initid) || (xioh->flags & CTL_FLAG_ABORT)) @@ -12108,6 +13002,9 @@ const struct ctl_cmd_entry *entry; uint32_t targ_lun; + KASSERT(io->io_hdr.io_type == CTL_IO_SCSI, + ("%s: unexpected I/O type %x", __func__, io->io_hdr.io_type)); + targ_lun = io->io_hdr.nexus.targ_mapped_lun; switch (io->io_hdr.msg_type) { case CTL_MSG_SERIALIZE: @@ -12243,6 +13140,9 @@ { struct ctl_error_desc *desc, *desc2; + KASSERT(io->io_hdr.io_type == CTL_IO_SCSI, + ("%s: unexpected I/O type %x", __func__, io->io_hdr.io_type)); + mtx_assert(&lun->lun_lock, MA_OWNED); STAILQ_FOREACH_SAFE(desc, &lun->error_list, links, desc2) { @@ -12323,9 +13223,6 @@ struct bintime cur_bt; #endif - KASSERT(io->io_hdr.io_type == CTL_IO_SCSI, - ("%s: unexpected I/O type %x", __func__, io->io_hdr.io_type)); - #ifdef CTL_TIME_IO getbinuptime(&cur_bt); bintime_sub(&cur_bt, &io->io_hdr.dma_start_bt); @@ -12336,13 +13233,36 @@ if ((io->io_hdr.port_status != 0) && ((io->io_hdr.status & CTL_STATUS_MASK) == CTL_STATUS_NONE || (io->io_hdr.status & CTL_STATUS_MASK) == CTL_SUCCESS)) { - ctl_set_internal_failure(&io->scsiio, /*sks_valid*/ 1, - /*retry_count*/ io->io_hdr.port_status); - } else if (io->scsiio.kern_data_resid != 0 && + switch (io->io_hdr.io_type) { + case CTL_IO_SCSI: + ctl_set_internal_failure(&io->scsiio, /*sks_valid*/ 1, + /*retry_count*/ io->io_hdr.port_status); + break; + case CTL_IO_NVME: + case CTL_IO_NVME_ADMIN: + if (io->io_hdr.flags & CTL_FLAG_ABORT) + ctl_nvme_set_command_aborted(&io->nvmeio); + else + ctl_nvme_set_data_transfer_error(&io->nvmeio); + break; + default: + __assert_unreachable(); + } + } else if (ctl_kern_data_resid(io) != 0 && (io->io_hdr.flags & CTL_FLAG_DATA_MASK) == CTL_FLAG_DATA_OUT && ((io->io_hdr.status & CTL_STATUS_MASK) == CTL_STATUS_NONE || (io->io_hdr.status & CTL_STATUS_MASK) == CTL_SUCCESS)) { - ctl_set_invalid_field_ciu(&io->scsiio); + switch (io->io_hdr.io_type) { + case CTL_IO_SCSI: + ctl_set_invalid_field_ciu(&io->scsiio); + break; + case CTL_IO_NVME: + case 
CTL_IO_NVME_ADMIN: + ctl_nvme_set_data_transfer_error(&io->nvmeio); + break; + default: + __assert_unreachable(); + } } else if (ctl_debug & CTL_DEBUG_CDB_DATA) ctl_data_print(io); } @@ -12352,7 +13272,7 @@ { ctl_datamove_done_process(io); - io->scsiio.be_move_done(io, samethr); + ctl_be_move_done(io, samethr); } void @@ -12365,7 +13285,7 @@ CTL_DEBUG_PRINT(("ctl_datamove\n")); /* No data transferred yet. Frontend must update this when done. */ - io->scsiio.kern_data_resid = io->scsiio.kern_data_len; + ctl_set_kern_data_resid(io, ctl_kern_data_len(io)); #ifdef CTL_TIME_IO getbinuptime(&io->io_hdr.dma_start_bt); @@ -12398,20 +13318,33 @@ * the data move. */ if (io->io_hdr.flags & CTL_FLAG_ABORT) { - printf("ctl_datamove: tag 0x%jx on (%u:%u:%u) aborted\n", - io->scsiio.tag_num, io->io_hdr.nexus.initid, - io->io_hdr.nexus.targ_port, - io->io_hdr.nexus.targ_lun); + switch (io->io_hdr.io_type) { + case CTL_IO_SCSI: + printf("ctl_datamove: tag 0x%jx on (%u:%u:%u) aborted\n", + io->scsiio.tag_num, io->io_hdr.nexus.initid, + io->io_hdr.nexus.targ_port, + io->io_hdr.nexus.targ_lun); + break; + case CTL_IO_NVME: + case CTL_IO_NVME_ADMIN: + printf("ctl_datamove: CID 0x%x on (%u:%u:%u) aborted\n", + le16toh(io->nvmeio.cmd.cid), + io->io_hdr.nexus.initid, io->io_hdr.nexus.targ_port, + io->io_hdr.nexus.targ_lun); + break; + default: + __assert_unreachable(); + } io->io_hdr.port_status = 31337; ctl_datamove_done_process(io); - io->scsiio.be_move_done(io, true); + ctl_be_move_done(io, true); return; } /* Don't confuse frontend with zero length data move. */ - if (io->scsiio.kern_data_len == 0) { + if (ctl_kern_data_len(io) == 0) { ctl_datamove_done_process(io); - io->scsiio.be_move_done(io, true); + ctl_be_move_done(io, true); return; } @@ -12427,6 +13360,9 @@ struct bintime cur_bt; #endif + KASSERT(io->io_hdr.io_type == CTL_IO_SCSI, + ("%s: unexpected I/O type %x", __func__, io->io_hdr.io_type)); + memset(&msg, 0, sizeof(msg)); msg.hdr.msg_type = CTL_MSG_DATAMOVE_DONE; msg.hdr.original_sc = io; @@ -12469,6 +13405,8 @@ uint32_t i; io = rq->context; + KASSERT(io->io_hdr.io_type == CTL_IO_SCSI, + ("%s: unexpected I/O type %x", __func__, io->io_hdr.io_type)); if (rq->ret != CTL_HA_STATUS_SUCCESS) { printf("%s: ISC DMA write failed with error %d", __func__, @@ -12513,6 +13451,9 @@ int retval; void (*fe_datamove)(union ctl_io *io); + KASSERT(io->io_hdr.io_type == CTL_IO_SCSI, + ("%s: unexpected I/O type %x", __func__, io->io_hdr.io_type)); + /* * - Get the data from the host/HBA into local memory. * - DMA memory from the local controller to the remote controller. 
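/*
 * Note on the ctl_kern_*() accessors used throughout this change
 * (ctl_kern_data_ptr(), ctl_kern_data_len(), ctl_kern_total_len(),
 * ctl_kern_data_resid(), ctl_kern_rel_offset(), ctl_kern_sg_entries(),
 * ctl_set_kern_data_resid()): they are not defined in this patch and are
 * assumed to be provided by ctl_io.h.  A minimal sketch of the shape such
 * an accessor could take, modeled on ctl_be_move_done() above (this is an
 * assumption for illustration, not the committed implementation):
 */
static inline uint32_t
ctl_kern_data_len(union ctl_io *io)
{
	switch (io->io_hdr.io_type) {
	case CTL_IO_SCSI:
		return (io->scsiio.kern_data_len);
	case CTL_IO_NVME:
	case CTL_IO_NVME_ADMIN:
		return (io->nvmeio.kern_data_len);
	default:
		__assert_unreachable();
	}
}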
@@ -12541,6 +13482,9 @@ { uint32_t i; + KASSERT(io->io_hdr.io_type == CTL_IO_SCSI, + ("%s: unexpected I/O type %x", __func__, io->io_hdr.io_type)); + for (i = 0; i < io->scsiio.kern_sg_entries; i++) free(CTL_LSGLT(io)[i].addr, M_CTL); free(CTL_RSGL(io), M_CTL); @@ -12563,6 +13507,8 @@ void (*fe_datamove)(union ctl_io *io); io = rq->context; + KASSERT(io->io_hdr.io_type == CTL_IO_SCSI, + ("%s: unexpected I/O type %x", __func__, io->io_hdr.io_type)); if (rq->ret != CTL_HA_STATUS_SUCCESS) { printf("%s: ISC DMA read failed with error %d\n", __func__, @@ -12597,6 +13543,9 @@ int retval; int i; + KASSERT(io->io_hdr.io_type == CTL_IO_SCSI, + ("%s: unexpected I/O type %x", __func__, io->io_hdr.io_type)); + retval = 0; local_sglist = CTL_LSGL(io); len_to_go = io->scsiio.kern_data_len; @@ -12634,6 +13583,9 @@ rq = ctl_dt_req_alloc(); + KASSERT(io->io_hdr.io_type == CTL_IO_SCSI, + ("%s: unexpected I/O type %x", __func__, io->io_hdr.io_type)); + /* * If we failed to allocate the request, and if the DMA didn't fail * anyway, set busy status. This is just a resource allocation @@ -12767,7 +13719,7 @@ * datamove done message, or call the callback with an * error if there is a problem. */ - for (i = 0; i < io->scsiio.kern_sg_entries; i++) + for (i = 0; i < ctl_kern_sg_entries(io); i++) free(CTL_LSGLT(io)[i].addr, M_CTL); free(CTL_RSGL(io), M_CTL); CTL_RSGL(io) = NULL; @@ -12785,6 +13737,8 @@ static void ctl_datamove_remote(union ctl_io *io) { + KASSERT(io->io_hdr.io_type == CTL_IO_SCSI, + ("%s: unexpected I/O type %x", __func__, io->io_hdr.io_type)); mtx_assert(&((struct ctl_softc *)CTL_SOFTC(io))->ctl_lock, MA_NOTOWNED); @@ -12851,6 +13805,8 @@ switch (io->io_hdr.io_type) { case CTL_IO_SCSI: + case CTL_IO_NVME: + case CTL_IO_NVME_ADMIN: break; case CTL_IO_TASK: if (ctl_debug & CTL_DEBUG_INFO) @@ -12881,6 +13837,9 @@ uint8_t mrie = lun->MODE_IE.mrie; uint8_t per = ((lun->MODE_RWER.byte3 & SMS_RWER_PER) || (lun->MODE_VER.byte3 & SMS_VER_PER)); + KASSERT(io->io_hdr.io_type == CTL_IO_SCSI, + ("%s: unexpected I/O type %x", __func__, + io->io_hdr.io_type)); if (((mrie == SIEP_MRIE_REC_COND && per) || mrie == SIEP_MRIE_REC_UNCOND || mrie == SIEP_MRIE_NO_SENSE) && @@ -12914,7 +13873,9 @@ * XXX KDM should we also track I/O latency? */ if ((io->io_hdr.status & CTL_STATUS_MASK) == CTL_SUCCESS && - io->io_hdr.io_type == CTL_IO_SCSI) { + (io->io_hdr.io_type == CTL_IO_SCSI || + io->io_hdr.io_type == CTL_IO_NVME || + io->io_hdr.io_type == CTL_IO_NVME_ADMIN)) { int type; #ifdef CTL_TIME_IO struct bintime bt; @@ -12931,7 +13892,7 @@ else type = CTL_STATS_NO_IO; - lun->stats.bytes[type] += io->scsiio.kern_total_len; + lun->stats.bytes[type] += ctl_kern_total_len(io); lun->stats.operations[type] ++; lun->stats.dmas[type] += io->io_hdr.num_dmas; #ifdef CTL_TIME_IO @@ -12940,7 +13901,7 @@ #endif mtx_lock(&port->port_lock); - port->stats.bytes[type] += io->scsiio.kern_total_len; + port->stats.bytes[type] += ctl_kern_total_len(io); port->stats.operations[type] ++; port->stats.dmas[type] += io->io_hdr.num_dmas; #ifdef CTL_TIME_IO @@ -12984,8 +13945,19 @@ * properly. The FETD is responsible for freeing the I/O and doing * whatever it needs to do to clean up its state. 
*/ - if (io->io_hdr.flags & CTL_FLAG_ABORT) - ctl_set_task_aborted(&io->scsiio); + if (io->io_hdr.flags & CTL_FLAG_ABORT) { + switch (io->io_hdr.io_type) { + case CTL_IO_SCSI: + ctl_set_task_aborted(&io->scsiio); + break; + case CTL_IO_NVME: + case CTL_IO_NVME_ADMIN: + ctl_nvme_set_command_aborted(&io->nvmeio); + break; + default: + __assert_unreachable(); + } + } /* * If enabled, print command error status. @@ -13028,6 +14000,9 @@ CTL_DEBUG_PRINT(("ctl_queue_sense\n")); + KASSERT(io->io_hdr.io_type == CTL_IO_SCSI, + ("%s: unexpected I/O type %x", __func__, io->io_hdr.io_type)); + targ_lun = ctl_lun_map_from_port(port, io->io_hdr.nexus.targ_lun); /* @@ -13074,7 +14049,22 @@ { struct ctl_port *port = CTL_PORT(io); - CTL_DEBUG_PRINT(("ctl_queue cdb[0]=%02X\n", io->scsiio.cdb[0])); + switch (io->io_hdr.io_type) { + case CTL_IO_SCSI: + case CTL_IO_TASK: + CTL_DEBUG_PRINT(("ctl_queue cdb[0]=%02X\n", io->scsiio.cdb[0])); + break; + case CTL_IO_NVME: + CTL_DEBUG_PRINT(("ctl_queue nvme nvm cmd=%02X\n", + io->nvmeio.cmd.opc)); + break; + case CTL_IO_NVME_ADMIN: + CTL_DEBUG_PRINT(("ctl_queue nvme admin cmd=%02X\n", + io->nvmeio.cmd.opc)); + break; + default: + break; + } #ifdef CTL_TIME_IO io->io_hdr.start_time = time_uptime; @@ -13088,6 +14078,8 @@ switch (io->io_hdr.io_type) { case CTL_IO_SCSI: case CTL_IO_TASK: + case CTL_IO_NVME: + case CTL_IO_NVME_ADMIN: if (ctl_debug & CTL_DEBUG_CDB) ctl_io_print(io); ctl_enqueue_incoming(io); @@ -13127,6 +14119,12 @@ ctl_io_print(io); ctl_run_task(io); break; + case CTL_IO_NVME: + case CTL_IO_NVME_ADMIN: + if (ctl_debug & CTL_DEBUG_CDB) + ctl_io_print(io); + ctl_nvmeio_precheck(&io->nvmeio); + break; default: printf("ctl_run: unknown I/O type %d\n", io->io_hdr.io_type); return (EINVAL); @@ -13170,22 +14168,54 @@ */ #if 0 if (io->io_hdr.flags & CTL_FLAG_ALREADY_DONE) { - printf("%s: type %d msg %d cdb %x iptl: " - "%u:%u:%u tag 0x%04x " - "flag %#x status %x\n", - __func__, - io->io_hdr.io_type, - io->io_hdr.msg_type, - io->scsiio.cdb[0], - io->io_hdr.nexus.initid, - io->io_hdr.nexus.targ_port, - io->io_hdr.nexus.targ_lun, - (io->io_hdr.io_type == - CTL_IO_TASK) ? - io->taskio.tag_num : - io->scsiio.tag_num, - io->io_hdr.flags, - io->io_hdr.status); + switch (io->io_hdr.io_type) { + case CTL_IO_SCSI: + case CTL_IO_TASK: + printf("%s: type %d msg %d cdb %x iptl: " + "%u:%u:%u tag 0x%04lx " + "flag %#x status %x\n", + __func__, + io->io_hdr.io_type, + io->io_hdr.msg_type, + io->scsiio.cdb[0], + io->io_hdr.nexus.initid, + io->io_hdr.nexus.targ_port, + io->io_hdr.nexus.targ_lun, + (io->io_hdr.io_type == CTL_IO_TASK) ? 
+			    io->taskio.tag_num :
+			    io->scsiio.tag_num,
+			    io->io_hdr.flags,
+			    io->io_hdr.status);
+			break;
+		case CTL_IO_NVME:
+		case CTL_IO_NVME_ADMIN:
+			printf("%s: type %d msg %d opc %x iptl: "
+			    "%u:%u:%u cid 0x%04x "
+			    "flag %#x status %x\n",
+			    __func__,
+			    io->io_hdr.io_type,
+			    io->io_hdr.msg_type,
+			    io->nvmeio.cmd.opc,
+			    io->io_hdr.nexus.initid,
+			    io->io_hdr.nexus.targ_port,
+			    io->io_hdr.nexus.targ_lun,
+			    io->nvmeio.cmd.cid,
+			    io->io_hdr.flags,
+			    io->io_hdr.status);
+			break;
+		default:
+			printf("%s: type %d msg %d iptl: "
+			    "%u:%u:%u flag %#x status %x\n",
+			    __func__,
+			    io->io_hdr.io_type,
+			    io->io_hdr.msg_type,
+			    io->io_hdr.nexus.initid,
+			    io->io_hdr.nexus.targ_port,
+			    io->io_hdr.nexus.targ_lun,
+			    io->io_hdr.flags,
+			    io->io_hdr.status);
+			break;
+		}
 	} else
 		io->io_hdr.flags |= CTL_FLAG_ALREADY_DONE;
 #endif
@@ -13264,19 +14294,41 @@
 		if (io != NULL) {
 			STAILQ_REMOVE_HEAD(&thr->incoming_queue, links);
 			mtx_unlock(&thr->queue_lock);
-			if (io->io_hdr.io_type == CTL_IO_TASK)
+			switch (io->io_hdr.io_type) {
+			case CTL_IO_TASK:
 				ctl_run_task(io);
-			else
+				break;
+			case CTL_IO_SCSI:
 				ctl_scsiio_precheck(&io->scsiio);
+				break;
+			case CTL_IO_NVME:
+			case CTL_IO_NVME_ADMIN:
+				ctl_nvmeio_precheck(&io->nvmeio);
+				break;
+			default:
+				__assert_unreachable();
+			}
 			continue;
 		}
 		io = (union ctl_io *)STAILQ_FIRST(&thr->rtr_queue);
 		if (io != NULL) {
 			STAILQ_REMOVE_HEAD(&thr->rtr_queue, links);
 			mtx_unlock(&thr->queue_lock);
-			retval = ctl_scsiio(&io->scsiio);
-			if (retval != CTL_RETVAL_COMPLETE)
-				CTL_DEBUG_PRINT(("ctl_scsiio failed\n"));
+			switch (io->io_hdr.io_type) {
+			case CTL_IO_SCSI:
+				retval = ctl_scsiio(&io->scsiio);
+				if (retval != CTL_RETVAL_COMPLETE)
+					CTL_DEBUG_PRINT(("ctl_scsiio failed\n"));
+				break;
+			case CTL_IO_NVME:
+			case CTL_IO_NVME_ADMIN:
+				retval = ctl_nvmeio(&io->nvmeio);
+				if (retval != CTL_RETVAL_COMPLETE)
+					CTL_DEBUG_PRINT(("ctl_nvmeio failed\n"));
+				break;
+			default:
+				__assert_unreachable();
+			}
 			continue;
 		}
diff --git a/sys/cam/ctl/ctl_nvme_cmd_table.c b/sys/cam/ctl/ctl_nvme_cmd_table.c
new file mode 100644
--- /dev/null
+++ b/sys/cam/ctl/ctl_nvme_cmd_table.c
@@ -0,0 +1,35 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2023 Chelsio Communications, Inc.
+ */
+
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+/* Administrative Command Set (CTL_IO_NVME_ADMIN). */
+const struct ctl_nvme_cmd_entry nvme_admin_cmd_table[256] =
+{
+	[NVME_OPC_IDENTIFY] = { ctl_nvme_identify, CTL_FLAG_DATA_IN |
+	    CTL_CMD_FLAG_OK_ON_NO_LUN },
+};
+
+/* NVM Command Set (CTL_IO_NVME). */
+const struct ctl_nvme_cmd_entry nvme_nvm_cmd_table[256] =
+{
+	[NVME_OPC_FLUSH] = { ctl_nvme_flush, CTL_FLAG_DATA_NONE },
+	[NVME_OPC_WRITE] = { ctl_nvme_read_write, CTL_FLAG_DATA_OUT },
+	[NVME_OPC_READ] = { ctl_nvme_read_write, CTL_FLAG_DATA_IN },
+	[NVME_OPC_WRITE_UNCORRECTABLE] = { ctl_nvme_write_uncorrectable,
+	    CTL_FLAG_DATA_NONE },
+	[NVME_OPC_COMPARE] = { ctl_nvme_compare, CTL_FLAG_DATA_OUT },
+	[NVME_OPC_WRITE_ZEROES] = { ctl_nvme_write_zeroes, CTL_FLAG_DATA_NONE },
+	[NVME_OPC_DATASET_MANAGEMENT] = { ctl_nvme_dataset_management,
+	    CTL_FLAG_DATA_OUT },
+	[NVME_OPC_VERIFY] = { ctl_nvme_verify, CTL_FLAG_DATA_NONE },
+};
diff --git a/sys/cam/ctl/ctl_private.h b/sys/cam/ctl/ctl_private.h
--- a/sys/cam/ctl/ctl_private.h
+++ b/sys/cam/ctl/ctl_private.h
@@ -78,7 +78,8 @@
 	CTL_ACTION_SKIP,
 	CTL_ACTION_BLOCK,
 	CTL_ACTION_OVERLAP,
-	CTL_ACTION_OVERLAP_TAG
+	CTL_ACTION_OVERLAP_TAG,
+	CTL_ACTION_FUSED,
 } ctl_action;
 
 /*
@@ -139,6 +140,12 @@
 	 * after the opcode byte.
	 */
 };
 
+/* Only data flags are currently used for NVMe commands. */
+struct ctl_nvme_cmd_entry {
+	int	(*execute)(struct ctl_nvmeio *);
+	ctl_io_flags	flags;
+};
+
 typedef enum {
 	CTL_LUN_NONE		= 0x000,
 	CTL_LUN_CONTROL		= 0x001,
@@ -412,6 +419,8 @@
 #ifdef _KERNEL
 
 extern const struct ctl_cmd_entry ctl_cmd_table[256];
+extern const struct ctl_nvme_cmd_entry nvme_admin_cmd_table[256];
+extern const struct ctl_nvme_cmd_entry nvme_nvm_cmd_table[256];
 
 uint32_t ctl_get_initindex(struct ctl_nexus *nexus);
 int ctl_lun_map_init(struct ctl_port *port);
@@ -459,6 +468,15 @@
 int ctl_report_timestamp(struct ctl_scsiio *ctsio);
 int ctl_get_lba_status(struct ctl_scsiio *ctsio);
+int ctl_nvme_identify(struct ctl_nvmeio *ctnio);
+int ctl_nvme_flush(struct ctl_nvmeio *ctnio);
+int ctl_nvme_read_write(struct ctl_nvmeio *ctnio);
+int ctl_nvme_write_uncorrectable(struct ctl_nvmeio *ctnio);
+int ctl_nvme_compare(struct ctl_nvmeio *ctnio);
+int ctl_nvme_write_zeroes(struct ctl_nvmeio *ctnio);
+int ctl_nvme_dataset_management(struct ctl_nvmeio *ctnio);
+int ctl_nvme_verify(struct ctl_nvmeio *ctnio);
+
 void ctl_tpc_init(struct ctl_softc *softc);
 void ctl_tpc_shutdown(struct ctl_softc *softc);
 void ctl_tpc_lun_init(struct ctl_lun *lun);
diff --git a/sys/conf/files b/sys/conf/files
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -109,6 +109,7 @@
 cam/ctl/ctl_frontend_iscsi.c	optional ctl cfiscsi
 cam/ctl/ctl_ha.c		optional ctl
 cam/ctl/ctl_nvme_all.c		optional ctl
+cam/ctl/ctl_nvme_cmd_table.c	optional ctl
 cam/ctl/ctl_scsi_all.c		optional ctl
 cam/ctl/ctl_tpc.c		optional ctl
 cam/ctl/ctl_tpc_local.c		optional ctl
diff --git a/sys/modules/ctl/Makefile b/sys/modules/ctl/Makefile
--- a/sys/modules/ctl/Makefile
+++ b/sys/modules/ctl/Makefile
@@ -13,6 +13,7 @@
 SRCS+=	ctl_frontend_ioctl.c
 SRCS+=	ctl_ha.c
 SRCS+=	ctl_nvme_all.c
+SRCS+=	ctl_nvme_cmd_table.c
 SRCS+=	ctl_scsi_all.c
 SRCS+=	ctl_tpc.c
 SRCS+=	ctl_tpc_local.c
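For reference, ctl_nvme_get_lba_len() above relies on the NVM command set layout for Read/Write-style commands: CDW10/CDW11 carry the 64-bit starting LBA, the low 16 bits of CDW12 carry a 0's-based block count, and bit 30 of CDW12 is the FUA flag tested by ctl_nvme_fua(). The standalone sketch below round-trips that encoding in host byte order (the kernel code additionally applies le32toh() because submission-queue fields are little-endian); the helper names are illustrative only, not CTL or nvme(4) APIs.

#include <assert.h>
#include <stdint.h>

struct nvm_rw_cdws {
	uint32_t cdw10;	/* SLBA[31:0] */
	uint32_t cdw11;	/* SLBA[63:32] */
	uint32_t cdw12;	/* NLB[15:0] (0's based), FUA in bit 30 */
};

/* Hypothetical helper: pack SLBA/NLB/FUA the way ctl_nvme_get_lba_len()
 * expects to find them. */
static struct nvm_rw_cdws
build_rw_cdws(uint64_t lba, uint32_t nblocks, int fua)
{
	struct nvm_rw_cdws c;

	c.cdw10 = (uint32_t)lba;
	c.cdw11 = (uint32_t)(lba >> 32);
	c.cdw12 = (nblocks - 1) & 0xffff;	/* 0's based block count */
	if (fua)
		c.cdw12 |= 1U << 30;
	return (c);
}

int
main(void)
{
	struct nvm_rw_cdws c = build_rw_cdws(0x123456789aULL, 8, 1);

	/* Mirrors the decode in ctl_nvme_get_lba_len() and ctl_nvme_fua(). */
	uint64_t lba = (uint64_t)c.cdw11 << 32 | c.cdw10;
	uint32_t len = (c.cdw12 & 0xffff) + 1;

	assert(lba == 0x123456789aULL);
	assert(len == 8);
	assert((c.cdw12 & (1U << 30)) != 0);	/* FUA set */
	return (0);
}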
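The two tables in the new ctl_nvme_cmd_table.c are indexed directly by opcode; any opcode whose entry has no execute handler is rejected by ctl_nvme_validate_command() with an invalid-opcode status. The small userspace model below illustrates that sparse designated-initializer dispatch pattern; none of these identifiers are CTL APIs, they are stand-ins for illustration only.

#include <stdio.h>

struct cmd_entry {
	int	(*execute)(void);
	int	flags;
};

static int model_flush(void) { printf("flush handled\n"); return (0); }
static int model_read(void) { printf("read handled\n"); return (0); }

/* Sparse table: unlisted opcodes default to { NULL, 0 }. */
static const struct cmd_entry cmd_table[256] = {
	[0x00] = { model_flush, 0 },	/* NVME_OPC_FLUSH */
	[0x02] = { model_read, 0 },	/* NVME_OPC_READ */
};

static int
dispatch(unsigned char opc)
{
	/* A NULL execute pointer is how unsupported opcodes are detected. */
	if (cmd_table[opc].execute == NULL) {
		printf("opcode %#x: invalid opcode\n", opc);
		return (-1);
	}
	return (cmd_table[opc].execute());
}

int
main(void)
{
	dispatch(0x00);		/* supported: Flush */
	dispatch(0x7f);		/* unsupported: rejected */
	return (0);
}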