Index: head/sbin/nvmecontrol/firmware.c =================================================================== --- head/sbin/nvmecontrol/firmware.c (revision 338181) +++ head/sbin/nvmecontrol/firmware.c (revision 338182) @@ -1,340 +1,340 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2013 EMC Corp. * All rights reserved. * * Copyright (C) 2012-2013 Intel Corporation * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "nvmecontrol.h" static int slot_has_valid_firmware(int fd, int slot) { struct nvme_firmware_page fw; int has_fw = false; read_logpage(fd, NVME_LOG_FIRMWARE_SLOT, NVME_GLOBAL_NAMESPACE_TAG, &fw, sizeof(fw)); if (fw.revision[slot-1] != 0LLU) has_fw = true; return (has_fw); } static void read_image_file(char *path, void **buf, int32_t *size) { struct stat sb; int32_t filesize; int fd; *size = 0; *buf = NULL; if ((fd = open(path, O_RDONLY)) < 0) err(1, "unable to open '%s'", path); if (fstat(fd, &sb) < 0) err(1, "unable to stat '%s'", path); /* * The NVMe spec does not explicitly state a maximum firmware image * size, although one can be inferred from the dword size limitation * for the size and offset fields in the Firmware Image Download * command. * * Technically, the max is UINT32_MAX * sizeof(uint32_t), since the * size and offsets are specified in terms of dwords (not bytes), but * realistically INT32_MAX is sufficient here and simplifies matters * a bit. 
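* For scale, an illustrative calculation (an editor's addition, not part of the original comment): cdw10 of the Firmware Image Download command carries the transfer length as a zero-based dword count, so the command encoding itself tops out near UINT32_MAX * sizeof(uint32_t) bytes, roughly 16 GiB. Capping the file at INT32_MAX (about 2 GiB) stays far below that limit while keeping the size in a plain int32_t.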
*/ if (sb.st_size > INT32_MAX) errx(1, "size of file '%s' is too large (%jd bytes)", path, (intmax_t)sb.st_size); filesize = (int32_t)sb.st_size; if ((*buf = malloc(filesize)) == NULL) errx(1, "unable to malloc %d bytes", filesize); if ((*size = read(fd, *buf, filesize)) < 0) err(1, "error reading '%s'", path); /* XXX assuming no short reads */ if (*size != filesize) errx(1, "error reading '%s' (read %d bytes, requested %d bytes)", path, *size, filesize); } static void update_firmware(int fd, uint8_t *payload, int32_t payload_size) { struct nvme_pt_command pt; int32_t off, resid, size; void *chunk; off = 0; resid = payload_size; if ((chunk = aligned_alloc(PAGE_SIZE, NVME_MAX_XFER_SIZE)) == NULL) errx(1, "unable to malloc %d bytes", NVME_MAX_XFER_SIZE); while (resid > 0) { size = (resid >= NVME_MAX_XFER_SIZE) ? NVME_MAX_XFER_SIZE : resid; memcpy(chunk, payload + off, size); memset(&pt, 0, sizeof(pt)); - pt.cmd.opc_fuse = NVME_CMD_SET_OPC(NVME_OPC_FIRMWARE_IMAGE_DOWNLOAD); + pt.cmd.opc = NVME_OPC_FIRMWARE_IMAGE_DOWNLOAD; pt.cmd.cdw10 = htole32((size / sizeof(uint32_t)) - 1); pt.cmd.cdw11 = htole32(off / sizeof(uint32_t)); pt.buf = chunk; pt.len = size; pt.is_read = 0; if (ioctl(fd, NVME_PASSTHROUGH_CMD, &pt) < 0) err(1, "firmware download request failed"); if (nvme_completion_is_error(&pt.cpl)) errx(1, "firmware download request returned error"); resid -= size; off += size; } } static int activate_firmware(int fd, int slot, int activate_action) { struct nvme_pt_command pt; uint16_t sct, sc; memset(&pt, 0, sizeof(pt)); - pt.cmd.opc_fuse = NVME_CMD_SET_OPC(NVME_OPC_FIRMWARE_ACTIVATE); + pt.cmd.opc = NVME_OPC_FIRMWARE_ACTIVATE; pt.cmd.cdw10 = htole32((activate_action << 3) | slot); pt.is_read = 0; if (ioctl(fd, NVME_PASSTHROUGH_CMD, &pt) < 0) err(1, "firmware activate request failed"); sct = NVME_STATUS_GET_SCT(pt.cpl.status); sc = NVME_STATUS_GET_SC(pt.cpl.status); if (sct == NVME_SCT_COMMAND_SPECIFIC && sc == NVME_SC_FIRMWARE_REQUIRES_RESET) return 1; if (nvme_completion_is_error(&pt.cpl)) errx(1, "firmware activate request returned error"); return 0; } static void firmware_usage(void) { fprintf(stderr, "usage:\n"); fprintf(stderr, FIRMWARE_USAGE); exit(1); } void firmware(int argc, char *argv[]) { int fd = -1, slot = 0; int a_flag, s_flag, f_flag; int activate_action, reboot_required; int opt; char *p, *image = NULL; char *controller = NULL, prompt[64]; void *buf = NULL; int32_t size = 0; uint16_t oacs_fw; uint8_t fw_slot1_ro, fw_num_slots; struct nvme_controller_data cdata; a_flag = s_flag = f_flag = false; while ((opt = getopt(argc, argv, "af:s:")) != -1) { switch (opt) { case 'a': a_flag = true; break; case 's': slot = strtol(optarg, &p, 0); if (p != NULL && *p != '\0') { fprintf(stderr, "\"%s\" not valid slot.\n", optarg); firmware_usage(); } else if (slot == 0) { fprintf(stderr, "0 is not a valid slot number. " "Slot numbers start at 1.\n"); firmware_usage(); } else if (slot > 7) { fprintf(stderr, "Slot number %s specified which is " "greater than max allowed slot number of " "7.\n", optarg); firmware_usage(); } s_flag = true; break; case 'f': image = optarg; f_flag = true; break; } } /* Check that a controller (and not a namespace) was specified. 
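* (Controller nodes are named nvmeX and namespace nodes nvmeXnsY, so the strstr() check against NVME_NS_PREFIX below is what tells the two apart and rejects a namespace argument.)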
*/ if (optind >= argc || strstr(argv[optind], NVME_NS_PREFIX) != NULL) firmware_usage(); if (!f_flag && !a_flag) { fprintf(stderr, "Neither a replace ([-f path_to_firmware]) nor " "activate ([-a]) firmware image action\n" "was specified.\n"); firmware_usage(); } if (!f_flag && a_flag && slot == 0) { fprintf(stderr, "Slot number to activate not specified.\n"); firmware_usage(); } controller = argv[optind]; open_dev(controller, &fd, 1, 1); read_controller_data(fd, &cdata); oacs_fw = (cdata.oacs >> NVME_CTRLR_DATA_OACS_FIRMWARE_SHIFT) & NVME_CTRLR_DATA_OACS_FIRMWARE_MASK; if (oacs_fw == 0) errx(1, "controller does not support firmware activate/download"); fw_slot1_ro = (cdata.frmw >> NVME_CTRLR_DATA_FRMW_SLOT1_RO_SHIFT) & NVME_CTRLR_DATA_FRMW_SLOT1_RO_MASK; if (f_flag && slot == 1 && fw_slot1_ro) errx(1, "slot %d is marked as read only", slot); fw_num_slots = (cdata.frmw >> NVME_CTRLR_DATA_FRMW_NUM_SLOTS_SHIFT) & NVME_CTRLR_DATA_FRMW_NUM_SLOTS_MASK; if (slot > fw_num_slots) errx(1, "slot %d specified but controller only supports %d slots", slot, fw_num_slots); if (a_flag && !f_flag && !slot_has_valid_firmware(fd, slot)) errx(1, "slot %d does not contain valid firmware,\n" "try 'nvmecontrol logpage -p 3 %s' to get a list " "of available images\n", slot, controller); if (f_flag) read_image_file(image, &buf, &size); if (f_flag && a_flag) printf("You are about to download and activate " "firmware image (%s) to controller %s.\n" "This may damage your controller and/or " "overwrite an existing firmware image.\n", image, controller); else if (a_flag) printf("You are about to activate a new firmware " "image on controller %s.\n" "This may damage your controller.\n", controller); else if (f_flag) printf("You are about to download firmware image " "(%s) to controller %s.\n" "This may damage your controller and/or " "overwrite an existing firmware image.\n", image, controller); printf("Are you sure you want to continue? (yes/no) "); while (1) { fgets(prompt, sizeof(prompt), stdin); if (strncasecmp(prompt, "yes", 3) == 0) break; if (strncasecmp(prompt, "no", 2) == 0) exit(1); printf("Please answer \"yes\" or \"no\". "); } if (f_flag) { update_firmware(fd, buf, size); if (a_flag) activate_action = NVME_AA_REPLACE_ACTIVATE; else activate_action = NVME_AA_REPLACE_NO_ACTIVATE; } else { activate_action = NVME_AA_ACTIVATE; } reboot_required = activate_firmware(fd, slot, activate_action); if (a_flag) { if (reboot_required) { printf("New firmware image activated but requires " "conventional reset (i.e. reboot) to " "complete activation.\n"); } else { printf("New firmware image activated and will take " "effect after next controller reset.\n" "Controller reset can be initiated via " "'nvmecontrol reset %s'\n", controller); } } close(fd); exit(0); } Index: head/sbin/nvmecontrol/format.c =================================================================== --- head/sbin/nvmecontrol/format.c (revision 338181) +++ head/sbin/nvmecontrol/format.c (revision 338182) @@ -1,187 +1,187 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (C) 2018 Alexander Motin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include "nvmecontrol.h" static void format_usage(void) { fprintf(stderr, "usage:\n"); fprintf(stderr, FORMAT_USAGE); exit(1); } void format(int argc, char *argv[]) { struct nvme_controller_data cd; struct nvme_namespace_data nsd; struct nvme_pt_command pt; char path[64]; char *target; uint32_t nsid; int ch, fd; int lbaf = -1, mset = -1, pi = -1, pil = -1, ses = 0; if (argc < 2) format_usage(); while ((ch = getopt(argc, argv, "f:m:p:l:EC")) != -1) { switch ((char)ch) { case 'f': lbaf = strtol(optarg, NULL, 0); break; case 'm': mset = strtol(optarg, NULL, 0); break; case 'p': pi = strtol(optarg, NULL, 0); break; case 'l': pil = strtol(optarg, NULL, 0); break; case 'E': if (ses == 2) errx(1, "-E and -C are mutually exclusive"); ses = 1; break; case 'C': if (ses == 1) errx(1, "-E and -C are mutually exclusive"); ses = 2; break; default: format_usage(); } } /* Check that a controller or namespace was specified. */ if (optind >= argc) format_usage(); target = argv[optind]; /* * Check if the specified device node exists before continuing. * This is a cleaner check for cases where the correct controller * is specified, but an invalid namespace on that controller. */ open_dev(target, &fd, 1, 1); /* * If device node contains "ns", we consider it a namespace, * otherwise, consider it a controller. */ if (strstr(target, NVME_NS_PREFIX) == NULL) { nsid = NVME_GLOBAL_NAMESPACE_TAG; } else { /* * We send FORMAT commands to the controller, not the namespace, * since it is an admin cmd. The namespace ID will be specified * in the command itself. So parse the namespace's device node * string to get the controller substring and namespace ID. */ close(fd); parse_ns_str(target, path, &nsid); open_dev(path, &fd, 1, 1); } /* Check that controller can execute this command. 
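* ("This command" is Format NVM: per the NVMe spec, support for it is advertised by a bit in the OACS field of the identify-controller data, which is what the NVME_CTRLR_DATA_OACS_FORMAT_SHIFT/MASK extraction below tests.)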
*/ read_controller_data(fd, &cd); if (((cd.oacs >> NVME_CTRLR_DATA_OACS_FORMAT_SHIFT) & NVME_CTRLR_DATA_OACS_FORMAT_MASK) == 0) errx(1, "controller does not support format"); if (((cd.fna >> NVME_CTRLR_DATA_FNA_CRYPTO_ERASE_SHIFT) & NVME_CTRLR_DATA_FNA_CRYPTO_ERASE_MASK) == 0 && ses == 2) errx(1, "controller does not support cryptographic erase"); if (nsid != NVME_GLOBAL_NAMESPACE_TAG) { if (((cd.fna >> NVME_CTRLR_DATA_FNA_FORMAT_ALL_SHIFT) & NVME_CTRLR_DATA_FNA_FORMAT_ALL_MASK) && ses == 0) errx(1, "controller does not support per-NS format"); if (((cd.fna >> NVME_CTRLR_DATA_FNA_ERASE_ALL_SHIFT) & NVME_CTRLR_DATA_FNA_ERASE_ALL_MASK) && ses != 0) errx(1, "controller does not support per-NS erase"); /* Try to keep previous namespace parameters. */ read_namespace_data(fd, nsid, &nsd); if (lbaf < 0) lbaf = (nsd.flbas >> NVME_NS_DATA_FLBAS_FORMAT_SHIFT) & NVME_NS_DATA_FLBAS_FORMAT_MASK; if (lbaf > nsd.nlbaf) errx(1, "LBA format is out of range"); if (mset < 0) mset = (nsd.flbas >> NVME_NS_DATA_FLBAS_EXTENDED_SHIFT) & NVME_NS_DATA_FLBAS_EXTENDED_MASK; if (pi < 0) pi = (nsd.dps >> NVME_NS_DATA_DPS_MD_START_SHIFT) & NVME_NS_DATA_DPS_MD_START_MASK; if (pil < 0) pil = (nsd.dps >> NVME_NS_DATA_DPS_PIT_SHIFT) & NVME_NS_DATA_DPS_PIT_MASK; } else { /* We have no previous parameters, so default to zeroes. */ if (lbaf < 0) lbaf = 0; if (mset < 0) mset = 0; if (pi < 0) pi = 0; if (pil < 0) pil = 0; } memset(&pt, 0, sizeof(pt)); - pt.cmd.opc_fuse = NVME_CMD_SET_OPC(NVME_OPC_FORMAT_NVM); + pt.cmd.opc = NVME_OPC_FORMAT_NVM; pt.cmd.nsid = htole32(nsid); pt.cmd.cdw10 = htole32((ses << 9) + (pil << 8) + (pi << 5) + (mset << 4) + lbaf); if (ioctl(fd, NVME_PASSTHROUGH_CMD, &pt) < 0) err(1, "format request failed"); if (nvme_completion_is_error(&pt.cpl)) errx(1, "format request returned error"); close(fd); exit(0); } Index: head/sbin/nvmecontrol/logpage.c =================================================================== --- head/sbin/nvmecontrol/logpage.c (revision 338181) +++ head/sbin/nvmecontrol/logpage.c (revision 338182) @@ -1,1031 +1,1031 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2013 EMC Corp. * All rights reserved. * * Copyright (C) 2012-2013 Intel Corporation * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include "nvmecontrol.h" #define DEFAULT_SIZE (4096) #define MAX_FW_SLOTS (7) typedef void (*print_fn_t)(const struct nvme_controller_data *cdata, void *buf, uint32_t size); struct kv_name { uint32_t key; const char *name; }; static const char * kv_lookup(const struct kv_name *kv, size_t kv_count, uint32_t key) { static char bad[32]; size_t i; for (i = 0; i < kv_count; i++, kv++) if (kv->key == key) return kv->name; snprintf(bad, sizeof(bad), "Attribute %#x", key); return bad; } static void print_log_hex(const struct nvme_controller_data *cdata __unused, void *data, uint32_t length) { print_hex(data, length); } static void print_bin(const struct nvme_controller_data *cdata __unused, void *data, uint32_t length) { write(STDOUT_FILENO, data, length); } static void * get_log_buffer(uint32_t size) { void *buf; if ((buf = malloc(size)) == NULL) errx(1, "unable to malloc %u bytes", size); memset(buf, 0, size); return (buf); } void read_logpage(int fd, uint8_t log_page, uint32_t nsid, void *payload, uint32_t payload_size) { struct nvme_pt_command pt; struct nvme_error_information_entry *err_entry; int i, err_pages; memset(&pt, 0, sizeof(pt)); - pt.cmd.opc_fuse = NVME_CMD_SET_OPC(NVME_OPC_GET_LOG_PAGE); + pt.cmd.opc = NVME_OPC_GET_LOG_PAGE; pt.cmd.nsid = htole32(nsid); pt.cmd.cdw10 = ((payload_size/sizeof(uint32_t)) - 1) << 16; pt.cmd.cdw10 |= log_page; pt.cmd.cdw10 = htole32(pt.cmd.cdw10); pt.buf = payload; pt.len = payload_size; pt.is_read = 1; if (ioctl(fd, NVME_PASSTHROUGH_CMD, &pt) < 0) err(1, "get log page request failed"); /* Convert data to host endian */ switch (log_page) { case NVME_LOG_ERROR: err_entry = (struct nvme_error_information_entry *)payload; err_pages = payload_size / sizeof(struct nvme_error_information_entry); for (i = 0; i < err_pages; i++) nvme_error_information_entry_swapbytes(err_entry++); break; case NVME_LOG_HEALTH_INFORMATION: nvme_health_information_page_swapbytes( (struct nvme_health_information_page *)payload); break; case NVME_LOG_FIRMWARE_SLOT: nvme_firmware_page_swapbytes( (struct nvme_firmware_page *)payload); break; case INTEL_LOG_TEMP_STATS: intel_log_temp_stats_swapbytes( (struct intel_log_temp_stats *)payload); break; default: break; } if (nvme_completion_is_error(&pt.cpl)) errx(1, "get log page request returned error"); } static void print_log_error(const struct nvme_controller_data *cdata __unused, void *buf, uint32_t size) { int i, nentries; uint16_t status; uint8_t p, sc, sct, m, dnr; struct nvme_error_information_entry *entry = buf; printf("Error Information Log\n"); printf("=====================\n"); if (entry->error_count == 0) { printf("No error entries found\n"); return; } nentries = size/sizeof(struct nvme_error_information_entry); for (i = 0; i < nentries; i++, entry++) { if (entry->error_count == 0) break; status = entry->status; p = NVME_STATUS_GET_P(status); sc = NVME_STATUS_GET_SC(status); sct = NVME_STATUS_GET_SCT(status); m = NVME_STATUS_GET_M(status); dnr = NVME_STATUS_GET_DNR(status); printf("Entry %02d\n", i + 1); printf("=========\n"); printf(" Error count: %ju\n", entry->error_count); printf(" Submission queue ID: %u\n", entry->sqid); printf(" Command ID: %u\n", entry->cid); /* TODO: Export nvme_status_string structures from kernel? 
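* (Until then, the fields below are printed as raw numbers. As one example from the NVMe spec, a status code type of 0 with a status code of 0x2 is the generic "Invalid Field in Command" error.)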
*/ printf(" Status:\n"); printf(" Phase tag: %d\n", p); printf(" Status code: %d\n", sc); printf(" Status code type: %d\n", sct); printf(" More: %d\n", m); printf(" DNR: %d\n", dnr); printf(" Error location: %u\n", entry->error_location); printf(" LBA: %ju\n", entry->lba); printf(" Namespace ID: %u\n", entry->nsid); printf(" Vendor specific info: %u\n", entry->vendor_specific); } } static void print_temp(uint16_t t) { printf("%u K, %2.2f C, %3.2f F\n", t, (float)t - 273.15, (float)t * 9 / 5 - 459.67); } static void print_log_health(const struct nvme_controller_data *cdata __unused, void *buf, uint32_t size __unused) { struct nvme_health_information_page *health = buf; char cbuf[UINT128_DIG + 1]; uint8_t warning; int i; warning = health->critical_warning; printf("SMART/Health Information Log\n"); printf("============================\n"); printf("Critical Warning State: 0x%02x\n", warning); printf(" Available spare: %d\n", !!(warning & NVME_CRIT_WARN_ST_AVAILABLE_SPARE)); printf(" Temperature: %d\n", !!(warning & NVME_CRIT_WARN_ST_TEMPERATURE)); printf(" Device reliability: %d\n", !!(warning & NVME_CRIT_WARN_ST_DEVICE_RELIABILITY)); printf(" Read only: %d\n", !!(warning & NVME_CRIT_WARN_ST_READ_ONLY)); printf(" Volatile memory backup: %d\n", !!(warning & NVME_CRIT_WARN_ST_VOLATILE_MEMORY_BACKUP)); printf("Temperature: "); print_temp(health->temperature); printf("Available spare: %u\n", health->available_spare); printf("Available spare threshold: %u\n", health->available_spare_threshold); printf("Percentage used: %u\n", health->percentage_used); printf("Data units (512,000 byte) read: %s\n", uint128_to_str(to128(health->data_units_read), cbuf, sizeof(cbuf))); printf("Data units written: %s\n", uint128_to_str(to128(health->data_units_written), cbuf, sizeof(cbuf))); printf("Host read commands: %s\n", uint128_to_str(to128(health->host_read_commands), cbuf, sizeof(cbuf))); printf("Host write commands: %s\n", uint128_to_str(to128(health->host_write_commands), cbuf, sizeof(cbuf))); printf("Controller busy time (minutes): %s\n", uint128_to_str(to128(health->controller_busy_time), cbuf, sizeof(cbuf))); printf("Power cycles: %s\n", uint128_to_str(to128(health->power_cycles), cbuf, sizeof(cbuf))); printf("Power on hours: %s\n", uint128_to_str(to128(health->power_on_hours), cbuf, sizeof(cbuf))); printf("Unsafe shutdowns: %s\n", uint128_to_str(to128(health->unsafe_shutdowns), cbuf, sizeof(cbuf))); printf("Media errors: %s\n", uint128_to_str(to128(health->media_errors), cbuf, sizeof(cbuf))); printf("No. 
error info log entries: %s\n", uint128_to_str(to128(health->num_error_info_log_entries), cbuf, sizeof(cbuf))); printf("Warning Temp Composite Time: %d\n", health->warning_temp_time); printf("Error Temp Composite Time: %d\n", health->error_temp_time); for (i = 0; i < 8; i++) { if (health->temp_sensor[i] == 0) continue; printf("Temperature Sensor %d: ", i + 1); print_temp(health->temp_sensor[i]); } } static void print_log_firmware(const struct nvme_controller_data *cdata, void *buf, uint32_t size __unused) { int i, slots; const char *status; struct nvme_firmware_page *fw = buf; uint8_t afi_slot; uint16_t oacs_fw; uint8_t fw_num_slots; afi_slot = fw->afi >> NVME_FIRMWARE_PAGE_AFI_SLOT_SHIFT; afi_slot &= NVME_FIRMWARE_PAGE_AFI_SLOT_MASK; oacs_fw = (cdata->oacs >> NVME_CTRLR_DATA_OACS_FIRMWARE_SHIFT) & NVME_CTRLR_DATA_OACS_FIRMWARE_MASK; fw_num_slots = (cdata->frmw >> NVME_CTRLR_DATA_FRMW_NUM_SLOTS_SHIFT) & NVME_CTRLR_DATA_FRMW_NUM_SLOTS_MASK; printf("Firmware Slot Log\n"); printf("=================\n"); if (oacs_fw == 0) slots = 1; else slots = MIN(fw_num_slots, MAX_FW_SLOTS); for (i = 0; i < slots; i++) { printf("Slot %d: ", i + 1); if (afi_slot == i + 1) status = " Active"; else status = "Inactive"; if (fw->revision[i] == 0LLU) printf("Empty\n"); else if (isprint(*(char *)&fw->revision[i])) printf("[%s] %.8s\n", status, (char *)&fw->revision[i]); else printf("[%s] %016jx\n", status, fw->revision[i]); } } /* * Intel specific log pages from * http://www.intel.com/content/dam/www/public/us/en/documents/product-specifications/ssd-dc-p3700-spec.pdf * * Though the version as of this date has a typo for the size of log page 0xca, * offset 147: it is only 1 byte, not 6. */ static void print_intel_temp_stats(const struct nvme_controller_data *cdata __unused, void *buf, uint32_t size __unused) { struct intel_log_temp_stats *temp = buf; printf("Intel Temperature Log\n"); printf("=====================\n"); printf("Current: "); print_temp(temp->current); printf("Overtemp Last Flags %#jx\n", (uintmax_t)temp->overtemp_flag_last); printf("Overtemp Lifetime Flags %#jx\n", (uintmax_t)temp->overtemp_flag_life); printf("Max Temperature "); print_temp(temp->max_temp); printf("Min Temperature "); print_temp(temp->min_temp); printf("Max Operating Temperature "); print_temp(temp->max_oper_temp); printf("Min Operating Temperature "); print_temp(temp->min_oper_temp); printf("Estimated Temperature Offset: %ju C/K\n", (uintmax_t)temp->est_offset); } /* * Format from Table 22, section 5.7 IO Command Latency Statistics. * Read and write stats pages have identical encoding. 
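* (Restating the layout the decoder below assumes: 16-bit major and minor version fields at byte offsets 0 and 2, followed by arrays of little-endian 32-bit counters covering 32us-wide, 1ms-wide, and 32ms-wide latency buckets.) */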
static void print_intel_read_write_lat_log(const struct nvme_controller_data *cdata __unused, void *buf, uint32_t size __unused) { const char *walker = buf; int i; printf("Major: %d\n", le16dec(walker + 0)); printf("Minor: %d\n", le16dec(walker + 2)); for (i = 0; i < 32; i++) printf("%4dus-%4dus: %ju\n", i * 32, (i + 1) * 32, (uintmax_t)le32dec(walker + 4 + i * 4)); for (i = 1; i < 32; i++) printf("%4dms-%4dms: %ju\n", i, i + 1, (uintmax_t)le32dec(walker + 132 + i * 4)); for (i = 1; i < 32; i++) printf("%4dms-%4dms: %ju\n", i * 32, (i + 1) * 32, (uintmax_t)le32dec(walker + 256 + i * 4)); } static void print_intel_read_lat_log(const struct nvme_controller_data *cdata __unused, void *buf, uint32_t size) { printf("Intel Read Latency Log\n"); printf("======================\n"); print_intel_read_write_lat_log(cdata, buf, size); } static void print_intel_write_lat_log(const struct nvme_controller_data *cdata __unused, void *buf, uint32_t size) { printf("Intel Write Latency Log\n"); printf("=======================\n"); print_intel_read_write_lat_log(cdata, buf, size); } /* * Table 19, section 5.4, SMART Attributes. Samsung also implements this, plus some extra data that is not documented. */ static void print_intel_add_smart(const struct nvme_controller_data *cdata __unused, void *buf, uint32_t size __unused) { uint8_t *walker = buf; uint8_t *end = walker + 150; const char *name; uint64_t raw; uint8_t normalized; static struct kv_name kv[] = { { 0xab, "Program Fail Count" }, { 0xac, "Erase Fail Count" }, { 0xad, "Wear Leveling Count" }, { 0xb8, "End to End Error Count" }, { 0xc7, "CRC Error Count" }, { 0xe2, "Timed: Media Wear" }, { 0xe3, "Timed: Host Read %" }, { 0xe4, "Timed: Elapsed Time" }, { 0xea, "Thermal Throttle Status" }, { 0xf0, "Retry Buffer Overflows" }, { 0xf3, "PLL Lock Loss Count" }, { 0xf4, "NAND Bytes Written" }, { 0xf5, "Host Bytes Written" }, }; printf("Additional SMART Data Log\n"); printf("=========================\n"); /* * walker[0] = Key * walker[1,2] = reserved * walker[3] = Normalized Value * walker[4] = reserved * walker[5..10] = Little Endian Raw value * (or other representations) * walker[11] = reserved */ while (walker < end) { name = kv_lookup(kv, nitems(kv), *walker); normalized = walker[3]; raw = le48dec(walker + 5); switch (*walker) { case 0: break; case 0xad: printf("%-32s: %3d min: %u max: %u ave: %u\n", name, normalized, le16dec(walker + 5), le16dec(walker + 7), le16dec(walker + 9)); break; case 0xe2: printf("%-32s: %3d %.3f%%\n", name, normalized, raw / 1024.0); break; case 0xea: printf("%-32s: %3d %d%% %d times\n", name, normalized, walker[5], le32dec(walker+6)); break; default: printf("%-32s: %3d %ju\n", name, normalized, (uintmax_t)raw); break; } walker += 12; } } /* * HGST's 0xc1 page. This is a grab bag of additional data. 
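* (Each subpage decoded below is a list of records shaped as a 16-bit parameter code, a flags byte, a length byte, and then a little-endian value of that length; print_hgst_info_subpage_gen() walks exactly this layout.)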
Please see * https://www.hgst.com/sites/default/files/resources/US_SN150_ProdManual.pdf * https://www.hgst.com/sites/default/files/resources/US_SN100_ProdManual.pdf * Appendix A for details */ typedef void (*subprint_fn_t)(void *buf, uint16_t subtype, uint8_t res, uint32_t size); struct subpage_print { uint16_t key; subprint_fn_t fn; }; static void print_hgst_info_write_errors(void *buf, uint16_t subtype, uint8_t res, uint32_t size); static void print_hgst_info_read_errors(void *buf, uint16_t subtype, uint8_t res, uint32_t size); static void print_hgst_info_verify_errors(void *buf, uint16_t subtype, uint8_t res, uint32_t size); static void print_hgst_info_self_test(void *buf, uint16_t subtype, uint8_t res, uint32_t size); static void print_hgst_info_background_scan(void *buf, uint16_t subtype, uint8_t res, uint32_t size); static void print_hgst_info_erase_errors(void *buf, uint16_t subtype, uint8_t res, uint32_t size); static void print_hgst_info_erase_counts(void *buf, uint16_t subtype, uint8_t res, uint32_t size); static void print_hgst_info_temp_history(void *buf, uint16_t subtype, uint8_t res, uint32_t size); static void print_hgst_info_ssd_perf(void *buf, uint16_t subtype, uint8_t res, uint32_t size); static void print_hgst_info_firmware_load(void *buf, uint16_t subtype, uint8_t res, uint32_t size); static struct subpage_print hgst_subpage[] = { { 0x02, print_hgst_info_write_errors }, { 0x03, print_hgst_info_read_errors }, { 0x05, print_hgst_info_verify_errors }, { 0x10, print_hgst_info_self_test }, { 0x15, print_hgst_info_background_scan }, { 0x30, print_hgst_info_erase_errors }, { 0x31, print_hgst_info_erase_counts }, { 0x32, print_hgst_info_temp_history }, { 0x37, print_hgst_info_ssd_perf }, { 0x38, print_hgst_info_firmware_load }, }; /* Print a subpage that is basically just key value pairs */ static void print_hgst_info_subpage_gen(void *buf, uint16_t subtype __unused, uint32_t size, const struct kv_name *kv, size_t kv_count) { uint8_t *wsp, *esp; uint16_t ptype; uint8_t plen; uint64_t param; int i; wsp = buf; esp = wsp + size; while (wsp < esp) { ptype = le16dec(wsp); wsp += 2; wsp++; /* Flags, just ignore */ plen = *wsp++; param = 0; for (i = 0; i < plen; i++) param |= (uint64_t)*wsp++ << (i * 8); printf(" %-30s: %jd\n", kv_lookup(kv, kv_count, ptype), (uintmax_t)param); } } static void print_hgst_info_write_errors(void *buf, uint16_t subtype, uint8_t res __unused, uint32_t size) { static struct kv_name kv[] = { { 0x0000, "Corrected Without Delay" }, { 0x0001, "Corrected Maybe Delayed" }, { 0x0002, "Re-Writes" }, { 0x0003, "Errors Corrected" }, { 0x0004, "Correct Algorithm Used" }, { 0x0005, "Bytes Processed" }, { 0x0006, "Uncorrected Errors" }, { 0x8000, "Flash Write Commands" }, { 0x8001, "HGST Special" }, }; printf("Write Errors Subpage:\n"); print_hgst_info_subpage_gen(buf, subtype, size, kv, nitems(kv)); } static void print_hgst_info_read_errors(void *buf, uint16_t subtype, uint8_t res __unused, uint32_t size) { static struct kv_name kv[] = { { 0x0000, "Corrected Without Delay" }, { 0x0001, "Corrected Maybe Delayed" }, { 0x0002, "Re-Reads" }, { 0x0003, "Errors Corrected" }, { 0x0004, "Correct Algorithm Used" }, { 0x0005, "Bytes Processed" }, { 0x0006, "Uncorrected Errors" }, { 0x8000, "Flash Read Commands" }, { 0x8001, "XOR Recovered" }, { 0x8002, "Total Corrected Bits" }, }; printf("Read Errors Subpage:\n"); print_hgst_info_subpage_gen(buf, subtype, size, kv, nitems(kv)); } static void print_hgst_info_verify_errors(void *buf, uint16_t subtype, uint8_t res __unused, 
uint32_t size) { static struct kv_name kv[] = { { 0x0000, "Corrected Without Delay" }, { 0x0001, "Corrected Maybe Delayed" }, { 0x0002, "Re-Reads" }, { 0x0003, "Errors Corrected" }, { 0x0004, "Correct Algorithm Used" }, { 0x0005, "Bytes Processed" }, { 0x0006, "Uncorrected Errors" }, { 0x8000, "Commands Processed" }, }; printf("Verify Errors Subpage:\n"); print_hgst_info_subpage_gen(buf, subtype, size, kv, nitems(kv)); } static void print_hgst_info_self_test(void *buf, uint16_t subtype __unused, uint8_t res __unused, uint32_t size) { size_t i; uint8_t *walker = buf; uint16_t code, hrs; uint32_t lba; printf("Self Test Subpage:\n"); for (i = 0; i < size / 20; i++) { /* Each entry is 20 bytes */ code = le16dec(walker); walker += 2; walker++; /* Ignore fixed flags */ if (*walker == 0) /* Last entry is zero length */ break; if (*walker++ != 0x10) { printf("Bad length for self test report\n"); return; } printf(" %-30s: %d\n", "Recent Test", code); printf(" %-28s: %#x\n", "Self-Test Results", *walker & 0xf); printf(" %-28s: %#x\n", "Self-Test Code", (*walker >> 5) & 0x7); walker++; printf(" %-28s: %#x\n", "Self-Test Number", *walker++); hrs = le16dec(walker); walker += 2; lba = le32dec(walker); walker += 4; printf(" %-28s: %u\n", "Total Power On Hrs", hrs); printf(" %-28s: %#jx (%jd)\n", "LBA", (uintmax_t)lba, (uintmax_t)lba); printf(" %-28s: %#x\n", "Sense Key", *walker++ & 0xf); printf(" %-28s: %#x\n", "Additional Sense Code", *walker++); printf(" %-28s: %#x\n", "Additional Sense Qualifier", *walker++); printf(" %-28s: %#x\n", "Vendor Specific Detail", *walker++); } } static void print_hgst_info_background_scan(void *buf, uint16_t subtype __unused, uint8_t res __unused, uint32_t size) { uint8_t *walker = buf; uint8_t status; uint16_t code, nscan, progress; uint32_t pom, nand; printf("Background Media Scan Subpage:\n"); /* Decode the header */ code = le16dec(walker); walker += 2; walker++; /* Ignore fixed flags */ if (*walker++ != 0x10) { printf("Bad length for background scan header\n"); return; } if (code != 0) { printf("Expected code 0, found code %#x\n", code); return; } pom = le32dec(walker); walker += 4; walker++; /* Reserved */ status = *walker++; nscan = le16dec(walker); walker += 2; progress = le16dec(walker); walker += 2; walker += 6; /* Reserved */ printf(" %-30s: %d\n", "Power On Minutes", pom); printf(" %-30s: %x (%s)\n", "BMS Status", status, status == 0 ? "idle" : (status == 1 ? "active" : (status == 8 ? "suspended" : "unknown"))); printf(" %-30s: %d\n", "Number of BMS", nscan); printf(" %-30s: %d\n", "Progress Current BMS", progress); /* Report retirements */ if (walker - (uint8_t *)buf != 20) { printf("Coding error, offset not 20\n"); return; } size -= 20; printf(" %-30s: %d\n", "BMS retirements", size / 0x18); while (size > 0) { code = le16dec(walker); walker += 2; walker++; if (*walker++ != 0x14) { printf("Bad length parameter\n"); return; } pom = le32dec(walker); walker += 4; /* * Spec sheet says the following are hard coded; if true, just * print the NAND retirement. 
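* ("The following" refers to the 8-byte constant 41 0b 01 00 00 00 00 00 tested immediately below; entries that do not match it are reported as corrupt.)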
*/ if (walker[0] == 0x41 && walker[1] == 0x0b && walker[2] == 0x01 && walker[3] == 0x00 && walker[4] == 0x00 && walker[5] == 0x00 && walker[6] == 0x00 && walker[7] == 0x00) { walker += 8; walker += 4; /* Skip reserved */ nand = le32dec(walker); walker += 4; printf(" %-30s: %d\n", "Retirement number", code); printf(" %-28s: %#x\n", "NAND (C/T)BBBPPP", nand); } else { printf("Parameter %#x entry corrupt\n", code); walker += 16; } } } static void print_hgst_info_erase_errors(void *buf, uint16_t subtype __unused, uint8_t res __unused, uint32_t size) { static struct kv_name kv[] = { { 0x0000, "Corrected Without Delay" }, { 0x0001, "Corrected Maybe Delayed" }, { 0x0002, "Re-Erase" }, { 0x0003, "Errors Corrected" }, { 0x0004, "Correct Algorithm Used" }, { 0x0005, "Bytes Processed" }, { 0x0006, "Uncorrected Errors" }, { 0x8000, "Flash Erase Commands" }, { 0x8001, "Mfg Defect Count" }, { 0x8002, "Grown Defect Count" }, { 0x8003, "Erase Count -- User" }, { 0x8004, "Erase Count -- System" }, }; printf("Erase Errors Subpage:\n"); print_hgst_info_subpage_gen(buf, subtype, size, kv, nitems(kv)); } static void print_hgst_info_erase_counts(void *buf, uint16_t subtype, uint8_t res __unused, uint32_t size) { /* My drive doesn't export this -- so not coding up */ printf("XXX: Erase counts subpage: %p, %#x %d\n", buf, subtype, size); } static void print_hgst_info_temp_history(void *buf, uint16_t subtype __unused, uint8_t res __unused, uint32_t size __unused) { uint8_t *walker = buf; uint32_t min; printf("Temperature History:\n"); printf(" %-30s: %d C\n", "Current Temperature", *walker++); printf(" %-30s: %d C\n", "Reference Temperature", *walker++); printf(" %-30s: %d C\n", "Maximum Temperature", *walker++); printf(" %-30s: %d C\n", "Minimum Temperature", *walker++); min = le32dec(walker); walker += 4; printf(" %-30s: %d:%02d:00\n", "Max Temperature Time", min / 60, min % 60); min = le32dec(walker); walker += 4; printf(" %-30s: %d:%02d:00\n", "Over Temperature Duration", min / 60, min % 60); min = le32dec(walker); walker += 4; printf(" %-30s: %d:%02d:00\n", "Min Temperature Time", min / 60, min % 60); } static void print_hgst_info_ssd_perf(void *buf, uint16_t subtype __unused, uint8_t res, uint32_t size __unused) { uint8_t *walker = buf; uint64_t val; printf("SSD Performance Subpage Type %d:\n", res); val = le64dec(walker); walker += 8; printf(" %-30s: %ju\n", "Host Read Commands", val); val = le64dec(walker); walker += 8; printf(" %-30s: %ju\n", "Host Read Blocks", val); val = le64dec(walker); walker += 8; printf(" %-30s: %ju\n", "Host Cache Read Hits Commands", val); val = le64dec(walker); walker += 8; printf(" %-30s: %ju\n", "Host Cache Read Hits Blocks", val); val = le64dec(walker); walker += 8; printf(" %-30s: %ju\n", "Host Read Commands Stalled", val); val = le64dec(walker); walker += 8; printf(" %-30s: %ju\n", "Host Write Commands", val); val = le64dec(walker); walker += 8; printf(" %-30s: %ju\n", "Host Write Blocks", val); val = le64dec(walker); walker += 8; printf(" %-30s: %ju\n", "Host Write Odd Start Commands", val); val = le64dec(walker); walker += 8; printf(" %-30s: %ju\n", "Host Write Odd End Commands", val); val = le64dec(walker); walker += 8; printf(" %-30s: %ju\n", "Host Write Commands Stalled", val); val = le64dec(walker); walker += 8; printf(" %-30s: %ju\n", "NAND Read Commands", val); val = le64dec(walker); walker += 8; printf(" %-30s: %ju\n", "NAND Read Blocks", val); val = le64dec(walker); walker += 8; printf(" %-30s: %ju\n", "NAND Write Commands", val); val = le64dec(walker); walker += 
8; printf(" %-30s: %ju\n", "NAND Write Blocks", val); val = le64dec(walker); walker += 8; printf(" %-30s: %ju\n", "NAND Read Before Writes", val); } static void print_hgst_info_firmware_load(void *buf, uint16_t subtype __unused, uint8_t res __unused, uint32_t size __unused) { uint8_t *walker = buf; printf("Firmware Load Subpage:\n"); printf(" %-30s: %d\n", "Firmware Downloads", le32dec(walker)); } static void kv_indirect(void *buf, uint32_t subtype, uint8_t res, uint32_t size, struct subpage_print *sp, size_t nsp) { size_t i; for (i = 0; i < nsp; i++, sp++) { if (sp->key == subtype) { sp->fn(buf, subtype, res, size); return; } } printf("No handler for page type %x\n", subtype); } static void print_hgst_info_log(const struct nvme_controller_data *cdata __unused, void *buf, uint32_t size __unused) { uint8_t *walker, *end, *subpage; int pages; uint16_t len; uint8_t subtype, res; printf("HGST Extra Info Log\n"); printf("===================\n"); walker = buf; pages = *walker++; walker++; len = le16dec(walker); walker += 2; end = walker + len; /* Length is exclusive of this header */ while (walker < end) { subpage = walker + 4; subtype = *walker++ & 0x3f; /* subtype */ res = *walker++; /* Reserved */ len = le16dec(walker); walker += len + 2; /* Length, not incl header */ if (walker > end) { printf("Ooops! Off the end of the list\n"); break; } kv_indirect(subpage, subtype, res, len, hgst_subpage, nitems(hgst_subpage)); } } /* * Table of log page printer / sizing. * * This includes Intel specific pages that are widely implemented. * Make sure you keep all the pages of one vendor together so -v help * lists all the vendors pages. */ static struct logpage_function { uint8_t log_page; const char *vendor; const char *name; print_fn_t print_fn; size_t size; } logfuncs[] = { {NVME_LOG_ERROR, NULL, "Drive Error Log", print_log_error, 0}, {NVME_LOG_HEALTH_INFORMATION, NULL, "Health/SMART Data", print_log_health, sizeof(struct nvme_health_information_page)}, {NVME_LOG_FIRMWARE_SLOT, NULL, "Firmware Information", print_log_firmware, sizeof(struct nvme_firmware_page)}, {HGST_INFO_LOG, "hgst", "Detailed Health/SMART", print_hgst_info_log, DEFAULT_SIZE}, {HGST_INFO_LOG, "wds", "Detailed Health/SMART", print_hgst_info_log, DEFAULT_SIZE}, {INTEL_LOG_TEMP_STATS, "intel", "Temperature Stats", print_intel_temp_stats, sizeof(struct intel_log_temp_stats)}, {INTEL_LOG_READ_LAT_LOG, "intel", "Read Latencies", print_intel_read_lat_log, DEFAULT_SIZE}, {INTEL_LOG_WRITE_LAT_LOG, "intel", "Write Latencies", print_intel_write_lat_log, DEFAULT_SIZE}, {INTEL_LOG_ADD_SMART, "intel", "Extra Health/SMART Data", print_intel_add_smart, DEFAULT_SIZE}, {INTEL_LOG_ADD_SMART, "samsung", "Extra Health/SMART Data", print_intel_add_smart, DEFAULT_SIZE}, {0, NULL, NULL, NULL, 0}, }; static void logpage_usage(void) { fprintf(stderr, "usage:\n"); fprintf(stderr, LOGPAGE_USAGE); exit(1); } static void logpage_help(void) { struct logpage_function *f; const char *v; fprintf(stderr, "\n"); fprintf(stderr, "%-8s %-10s %s\n", "Page", "Vendor","Page Name"); fprintf(stderr, "-------- ---------- ----------\n"); for (f = logfuncs; f->log_page > 0; f++) { v = f->vendor == NULL ? 
"-" : f->vendor; fprintf(stderr, "0x%02x %-10s %s\n", f->log_page, v, f->name); } exit(1); } void logpage(int argc, char *argv[]) { int fd; int log_page = 0, pageflag = false; int binflag = false, hexflag = false, ns_specified; int opt; char *p; char cname[64]; uint32_t nsid, size; void *buf; const char *vendor = NULL; struct logpage_function *f; struct nvme_controller_data cdata; print_fn_t print_fn; uint8_t ns_smart; while ((opt = getopt(argc, argv, "bp:xv:")) != -1) { switch (opt) { case 'b': binflag = true; break; case 'p': if (strcmp(optarg, "help") == 0) logpage_help(); /* TODO: Add human-readable ASCII page IDs */ log_page = strtol(optarg, &p, 0); if (p != NULL && *p != '\0') { fprintf(stderr, "\"%s\" not valid log page id.\n", optarg); logpage_usage(); } pageflag = true; break; case 'x': hexflag = true; break; case 'v': if (strcmp(optarg, "help") == 0) logpage_help(); vendor = optarg; break; } } if (!pageflag) { printf("Missing page_id (-p).\n"); logpage_usage(); } /* Check that a controller and/or namespace was specified. */ if (optind >= argc) logpage_usage(); if (strstr(argv[optind], NVME_NS_PREFIX) != NULL) { ns_specified = true; parse_ns_str(argv[optind], cname, &nsid); open_dev(cname, &fd, 1, 1); } else { ns_specified = false; nsid = NVME_GLOBAL_NAMESPACE_TAG; open_dev(argv[optind], &fd, 1, 1); } read_controller_data(fd, &cdata); ns_smart = (cdata.lpa >> NVME_CTRLR_DATA_LPA_NS_SMART_SHIFT) & NVME_CTRLR_DATA_LPA_NS_SMART_MASK; /* * The log page attribtues indicate whether or not the controller * supports the SMART/Health information log page on a per * namespace basis. */ if (ns_specified) { if (log_page != NVME_LOG_HEALTH_INFORMATION) errx(1, "log page %d valid only at controller level", log_page); if (ns_smart == 0) errx(1, "controller does not support per namespace " "smart/health information"); } print_fn = print_log_hex; size = DEFAULT_SIZE; if (binflag) print_fn = print_bin; if (!binflag && !hexflag) { /* * See if there is a pretty print function for the specified log * page. If one isn't found, we just revert to the default * (print_hex). If there was a vendor specified bt the user, and * the page is vendor specific, don't match the print function * unless the vendors match. */ for (f = logfuncs; f->log_page > 0; f++) { if (f->vendor != NULL && vendor != NULL && strcmp(f->vendor, vendor) != 0) continue; if (log_page != f->log_page) continue; print_fn = f->print_fn; size = f->size; break; } } if (log_page == NVME_LOG_ERROR) { size = sizeof(struct nvme_error_information_entry); size *= (cdata.elpe + 1); } /* Read the log page */ buf = get_log_buffer(size); read_logpage(fd, log_page, nsid, buf, size); print_fn(&cdata, buf, size); close(fd); exit(0); } Index: head/sbin/nvmecontrol/ns.c =================================================================== --- head/sbin/nvmecontrol/ns.c (revision 338181) +++ head/sbin/nvmecontrol/ns.c (revision 338182) @@ -1,474 +1,474 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2017 Netflix, Inc * Copyright (C) 2018 Alexander Motin * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer, * without modification, immediately at the beginning of the file. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include "nvmecontrol.h" /* handles NVME_OPC_NAMESPACE_MANAGEMENT and ATTACHMENT admin cmds */ #define NSCREATE_USAGE \ " nvmecontrol ns create -s size [-c cap] [-f fmt] [-m mset] [-n nmic] [-p pi] [-l pil] nvmeN\n" #define NSDELETE_USAGE \ " nvmecontrol ns delete -n nsid nvmeN\n" #define NSATTACH_USAGE \ " nvmecontrol ns attach -n nsid [-c ctrlrid] nvmeN \n" #define NSDETACH_USAGE \ " nvmecontrol ns detach -n nsid [-c ctrlrid] nvmeN\n" void nscreate(int argc, char *argv[]); void nsdelete(int argc, char *argv[]); void nsattach(int argc, char *argv[]); void nsdetach(int argc, char *argv[]); static struct nvme_function ns_funcs[] = { {"create", nscreate, NSCREATE_USAGE}, {"delete", nsdelete, NSDELETE_USAGE}, {"attach", nsattach, NSATTACH_USAGE}, {"detach", nsdetach, NSDETACH_USAGE}, {NULL, NULL, NULL}, }; static void nscreate_usage(void) { fprintf(stderr, "usage:\n"); fprintf(stderr, NSCREATE_USAGE); exit(1); } static void nsdelete_usage(void) { fprintf(stderr, "usage:\n"); fprintf(stderr, NSDELETE_USAGE); exit(1); } static void nsattach_usage(void) { fprintf(stderr, "usage:\n"); fprintf(stderr, NSATTACH_USAGE); exit(1); } static void nsdetach_usage(void) { fprintf(stderr, "usage:\n"); fprintf(stderr, NSDETACH_USAGE); exit(1); } struct ns_result_str { uint16_t res; const char * str; }; static struct ns_result_str ns_result[] = { { 0x2, "Invalid Field"}, { 0xa, "Invalid Format"}, { 0xb, "Invalid Namespace or format"}, { 0x15, "Namespace insufficient capacity"}, { 0x16, "Namespace ID unavailable"}, { 0x18, "Namespace already attached"}, { 0x19, "Namespace is private"}, { 0x1a, "Namespace is not attached"}, { 0x1b, "Thin provisioning not supported"}, { 0x1c, "Controller list invalid"}, { 0xFFFF, "Unknown"} }; static const char * get_res_str(uint16_t res) { struct ns_result_str *t = ns_result; while (t->res != 0xFFFF) { if (t->res == res) return (t->str); t++; } return t->str; } /* * NS MGMT Command specific status values: * 0xa = Invalid Format * 0x15 = Namespace Insufficient capacity * 0x16 = Namespace ID unavailable (number of namespaces exceeded) * 0x1b = Thin Provisioning Not supported */ void nscreate(int argc, char *argv[]) { struct nvme_pt_command pt; struct nvme_controller_data cd; struct nvme_namespace_data nsdata; int64_t nsze = -1, cap = -1; int ch, fd, result, lbaf = 0, mset = 0, nmic = -1, pi = 0, pil = 0; if (optind >= argc) nscreate_usage(); while ((ch = getopt(argc, argv, "s:c:f:m:n:p:l:")) != -1) { switch (ch) { case 's': nsze = strtol(optarg, (char **)NULL, 0); 
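/* -s supplies the namespace size (NSZE) and -c the capacity (NCAP), both counted in logical blocks; when -c is omitted, the code below sets cap = nsze. */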
break; case 'c': cap = strtol(optarg, (char **)NULL, 0); break; case 'f': lbaf = strtol(optarg, (char **)NULL, 0); break; case 'm': mset = strtol(optarg, NULL, 0); break; case 'n': nmic = strtol(optarg, NULL, 0); break; case 'p': pi = strtol(optarg, NULL, 0); break; case 'l': pil = strtol(optarg, NULL, 0); break; default: nscreate_usage(); } } if (optind >= argc) nscreate_usage(); if (cap == -1) cap = nsze; if (nsze == -1 || cap == -1) nscreate_usage(); open_dev(argv[optind], &fd, 1, 1); read_controller_data(fd, &cd); /* Check that controller can execute this command. */ if (((cd.oacs >> NVME_CTRLR_DATA_OACS_NSMGMT_SHIFT) & NVME_CTRLR_DATA_OACS_NSMGMT_MASK) == 0) errx(1, "controller does not support namespace management"); /* Allow namespaces sharing if Multi-Path I/O is supported. */ if (nmic == -1) { nmic = cd.mic ? (NVME_NS_DATA_NMIC_MAY_BE_SHARED_MASK << NVME_NS_DATA_NMIC_MAY_BE_SHARED_SHIFT) : 0; } memset(&nsdata, 0, sizeof(nsdata)); nsdata.nsze = (uint64_t)nsze; nsdata.ncap = (uint64_t)cap; nsdata.flbas = ((lbaf & NVME_NS_DATA_FLBAS_FORMAT_MASK) << NVME_NS_DATA_FLBAS_FORMAT_SHIFT) | ((mset & NVME_NS_DATA_FLBAS_EXTENDED_MASK) << NVME_NS_DATA_FLBAS_EXTENDED_SHIFT); nsdata.dps = ((pi & NVME_NS_DATA_DPS_MD_START_MASK) << NVME_NS_DATA_DPS_MD_START_SHIFT) | ((pil & NVME_NS_DATA_DPS_PIT_MASK) << NVME_NS_DATA_DPS_PIT_SHIFT); nsdata.nmic = nmic; nvme_namespace_data_swapbytes(&nsdata); memset(&pt, 0, sizeof(pt)); - pt.cmd.opc_fuse = NVME_CMD_SET_OPC(NVME_OPC_NAMESPACE_MANAGEMENT); + pt.cmd.opc = NVME_OPC_NAMESPACE_MANAGEMENT; pt.cmd.cdw10 = 0; /* create */ pt.buf = &nsdata; pt.len = sizeof(struct nvme_namespace_data); pt.is_read = 0; /* passthrough writes data to ctrlr */ if ((result = ioctl(fd, NVME_PASSTHROUGH_CMD, &pt)) < 0) errx(1, "ioctl request to %s failed: %d", argv[optind], result); if (nvme_completion_is_error(&pt.cpl)) { errx(1, "namespace creation failed: %s", get_res_str((pt.cpl.status >> NVME_STATUS_SC_SHIFT) & NVME_STATUS_SC_MASK)); } printf("namespace %d created\n", pt.cpl.cdw0); exit(0); } void nsdelete(int argc, char *argv[]) { struct nvme_pt_command pt; struct nvme_controller_data cd; int ch, fd, result, nsid = -2; char buf[2]; if (optind >= argc) nsdelete_usage(); while ((ch = getopt(argc, argv, "n:")) != -1) { switch ((char)ch) { case 'n': nsid = strtol(optarg, (char **)NULL, 0); break; default: nsdelete_usage(); } } if (optind >= argc || nsid == -2) nsdelete_usage(); open_dev(argv[optind], &fd, 1, 1); read_controller_data(fd, &cd); /* Check that controller can execute this command. */ if (((cd.oacs >> NVME_CTRLR_DATA_OACS_NSMGMT_SHIFT) & NVME_CTRLR_DATA_OACS_NSMGMT_MASK) == 0) errx(1, "controller does not support namespace management"); memset(&pt, 0, sizeof(pt)); - pt.cmd.opc_fuse = NVME_CMD_SET_OPC(NVME_OPC_NAMESPACE_MANAGEMENT); + pt.cmd.opc = NVME_OPC_NAMESPACE_MANAGEMENT; pt.cmd.cdw10 = 1; /* delete */ pt.buf = buf; pt.len = sizeof(buf); pt.is_read = 1; pt.cmd.nsid = (uint32_t)nsid; if ((result = ioctl(fd, NVME_PASSTHROUGH_CMD, &pt)) < 0) errx(1, "ioctl request to %s failed: %d", argv[optind], result); if (nvme_completion_is_error(&pt.cpl)) { errx(1, "namespace deletion failed: %s", get_res_str((pt.cpl.status >> NVME_STATUS_SC_SHIFT) & NVME_STATUS_SC_MASK)); } printf("namespace %d deleted\n", nsid); exit(0); } /* * Attach and Detach use Dword 10, and a controller list (section 4.9) * This struct is 4096 bytes in size. 
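* (The list is an array of 2048 little-endian 16-bit entries, 4096 bytes total, with entry 0 holding the number of controller IDs that follow; that is why clist in the code below is declared uint16_t[2048].)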
* 0h = attach * 1h = detach * * Result values for both attach/detach: * * Completion 18h = Already attached * 19h = NS is private and already attached to a controller * 1Ah = Not attached, request could not be completed * 1Ch = Controller list invalid. * * 0x2 Invalid Field can occur if the ctrlrid does not exist in the system. */ void nsattach(int argc, char *argv[]) { struct nvme_pt_command pt; struct nvme_controller_data cd; int ctrlrid = -2; int fd, ch, result, nsid = -1; uint16_t clist[2048]; if (optind >= argc) nsattach_usage(); while ((ch = getopt(argc, argv, "n:c:")) != -1) { switch (ch) { case 'n': nsid = strtol(optarg, (char **)NULL, 0); break; case 'c': ctrlrid = strtol(optarg, (char **)NULL, 0); break; default: nsattach_usage(); } } if (optind >= argc) nsattach_usage(); if (nsid == -1) nsattach_usage(); open_dev(argv[optind], &fd, 1, 1); read_controller_data(fd, &cd); /* Check that controller can execute this command. */ if (((cd.oacs >> NVME_CTRLR_DATA_OACS_NSMGMT_SHIFT) & NVME_CTRLR_DATA_OACS_NSMGMT_MASK) == 0) errx(1, "controller does not support namespace management"); if (ctrlrid == -1) { /* Get full list of controllers to attach to. */ memset(&pt, 0, sizeof(pt)); - pt.cmd.opc_fuse = NVME_CMD_SET_OPC(NVME_OPC_IDENTIFY); + pt.cmd.opc = NVME_OPC_IDENTIFY; pt.cmd.cdw10 = htole32(0x13); pt.buf = clist; pt.len = sizeof(clist); pt.is_read = 1; if (ioctl(fd, NVME_PASSTHROUGH_CMD, &pt) < 0) err(1, "identify request failed"); if (nvme_completion_is_error(&pt.cpl)) errx(1, "identify request returned error"); } else { /* By default attach to this controller. */ if (ctrlrid == -2) ctrlrid = cd.ctrlr_id; memset(&clist, 0, sizeof(clist)); clist[0] = htole16(1); clist[1] = htole16(ctrlrid); } memset(&pt, 0, sizeof(pt)); - pt.cmd.opc_fuse = NVME_CMD_SET_OPC(NVME_OPC_NAMESPACE_ATTACHMENT); + pt.cmd.opc = NVME_OPC_NAMESPACE_ATTACHMENT; pt.cmd.cdw10 = 0; /* attach */ pt.cmd.nsid = (uint32_t)nsid; pt.buf = &clist; pt.len = sizeof(clist); if ((result = ioctl(fd, NVME_PASSTHROUGH_CMD, &pt)) < 0) errx(1, "ioctl request to %s failed: %d", argv[optind], result); if (nvme_completion_is_error(&pt.cpl)) { errx(1, "namespace attach failed: %s", get_res_str((pt.cpl.status >> NVME_STATUS_SC_SHIFT) & NVME_STATUS_SC_MASK)); } printf("namespace %d attached\n", nsid); exit(0); } void nsdetach(int argc, char *argv[]) { struct nvme_pt_command pt; struct nvme_controller_data cd; int ctrlrid = -2; int fd, ch, result, nsid = -1; uint16_t clist[2048]; if (optind >= argc) nsdetach_usage(); while ((ch = getopt(argc, argv, "n:c:")) != -1) { switch (ch) { case 'n': nsid = strtol(optarg, (char **)NULL, 0); break; case 'c': ctrlrid = strtol(optarg, (char **)NULL, 0); break; default: nsdetach_usage(); } } if (optind >= argc) nsdetach_usage(); if (nsid == -1) nsdetach_usage(); open_dev(argv[optind], &fd, 1, 1); read_controller_data(fd, &cd); /* Check that controller can execute this command. */ if (((cd.oacs >> NVME_CTRLR_DATA_OACS_NSMGMT_SHIFT) & NVME_CTRLR_DATA_OACS_NSMGMT_MASK) == 0) errx(1, "controller does not support namespace management"); if (ctrlrid == -1) { /* Get list of controllers this namespace is attached to. 
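* (This is the Identify command with CNS 0x12 in cdw10, which per the NVMe spec returns the controllers attached to the given nsid; nsattach() above used CNS 0x13 to get the list of all controllers instead.)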
*/ memset(&pt, 0, sizeof(pt)); - pt.cmd.opc_fuse = NVME_CMD_SET_OPC(NVME_OPC_IDENTIFY); + pt.cmd.opc = NVME_OPC_IDENTIFY; pt.cmd.nsid = htole32(nsid); pt.cmd.cdw10 = htole32(0x12); pt.buf = clist; pt.len = sizeof(clist); pt.is_read = 1; if (ioctl(fd, NVME_PASSTHROUGH_CMD, &pt) < 0) err(1, "identify request failed"); if (nvme_completion_is_error(&pt.cpl)) errx(1, "identify request returned error"); if (clist[0] == 0) { ctrlrid = cd.ctrlr_id; memset(&clist, 0, sizeof(clist)); clist[0] = htole16(1); clist[1] = htole16(ctrlrid); } } else { /* By default detach from this controller. */ if (ctrlrid == -2) ctrlrid = cd.ctrlr_id; memset(&clist, 0, sizeof(clist)); clist[0] = htole16(1); clist[1] = htole16(ctrlrid); } memset(&pt, 0, sizeof(pt)); - pt.cmd.opc_fuse = NVME_CMD_SET_OPC(NVME_OPC_NAMESPACE_ATTACHMENT); + pt.cmd.opc = NVME_OPC_NAMESPACE_ATTACHMENT; pt.cmd.cdw10 = 1; /* detach */ pt.cmd.nsid = (uint32_t)nsid; pt.buf = &clist; pt.len = sizeof(clist); if ((result = ioctl(fd, NVME_PASSTHROUGH_CMD, &pt)) < 0) errx(1, "ioctl request to %s failed: %d", argv[optind], result); if (nvme_completion_is_error(&pt.cpl)) { errx(1, "namespace detach failed: %s", get_res_str((pt.cpl.status >> NVME_STATUS_SC_SHIFT) & NVME_STATUS_SC_MASK)); } printf("namespace %d detached\n", nsid); exit(0); } void ns(int argc, char *argv[]) { dispatch(argc, argv, ns_funcs); } Index: head/sbin/nvmecontrol/nvmecontrol.c =================================================================== --- head/sbin/nvmecontrol/nvmecontrol.c (revision 338181) +++ head/sbin/nvmecontrol/nvmecontrol.c (revision 338182) @@ -1,251 +1,251 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (C) 2012-2013 Intel Corporation * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "nvmecontrol.h" static struct nvme_function funcs[] = { {"devlist", devlist, DEVLIST_USAGE}, {"identify", identify, IDENTIFY_USAGE}, {"perftest", perftest, PERFTEST_USAGE}, {"reset", reset, RESET_USAGE}, {"logpage", logpage, LOGPAGE_USAGE}, {"firmware", firmware, FIRMWARE_USAGE}, {"format", format, FORMAT_USAGE}, {"power", power, POWER_USAGE}, {"wdc", wdc, WDC_USAGE}, {"ns", ns, NS_USAGE}, {NULL, NULL, NULL}, }; void gen_usage(struct nvme_function *f) { fprintf(stderr, "usage:\n"); while (f->name != NULL) { fprintf(stderr, "%s", f->usage); f++; } exit(1); } void dispatch(int argc, char *argv[], struct nvme_function *tbl) { struct nvme_function *f = tbl; if (argv[1] == NULL) { gen_usage(tbl); return; } while (f->name != NULL) { if (strcmp(argv[1], f->name) == 0) f->fn(argc-1, &argv[1]); f++; } fprintf(stderr, "Unknown command: %s\n", argv[1]); gen_usage(tbl); } static void print_bytes(void *data, uint32_t length) { uint32_t i, j; uint8_t *p, *end; end = (uint8_t *)data + length; for (i = 0; i < length; i++) { p = (uint8_t *)data + (i*16); printf("%03x: ", i*16); for (j = 0; j < 16 && p < end; j++) printf("%02x ", *p++); if (p >= end) break; printf("\n"); } printf("\n"); } static void print_dwords(void *data, uint32_t length) { uint32_t *p; uint32_t i, j; p = (uint32_t *)data; length /= sizeof(uint32_t); for (i = 0; i < length; i+=8) { printf("%03x: ", i*4); for (j = 0; j < 8; j++) printf("%08x ", p[i+j]); printf("\n"); } printf("\n"); } void print_hex(void *data, uint32_t length) { if (length >= sizeof(uint32_t) || length % sizeof(uint32_t) == 0) print_dwords(data, length); else print_bytes(data, length); } void read_controller_data(int fd, struct nvme_controller_data *cdata) { struct nvme_pt_command pt; memset(&pt, 0, sizeof(pt)); - pt.cmd.opc_fuse = NVME_CMD_SET_OPC(NVME_OPC_IDENTIFY); + pt.cmd.opc = NVME_OPC_IDENTIFY; pt.cmd.cdw10 = htole32(1); pt.buf = cdata; pt.len = sizeof(*cdata); pt.is_read = 1; if (ioctl(fd, NVME_PASSTHROUGH_CMD, &pt) < 0) err(1, "identify request failed"); /* Convert data to host endian */ nvme_controller_data_swapbytes(cdata); if (nvme_completion_is_error(&pt.cpl)) errx(1, "identify request returned error"); } void read_namespace_data(int fd, uint32_t nsid, struct nvme_namespace_data *nsdata) { struct nvme_pt_command pt; memset(&pt, 0, sizeof(pt)); - pt.cmd.opc_fuse = NVME_CMD_SET_OPC(NVME_OPC_IDENTIFY); + pt.cmd.opc = NVME_OPC_IDENTIFY; pt.cmd.nsid = htole32(nsid); pt.buf = nsdata; pt.len = sizeof(*nsdata); pt.is_read = 1; if (ioctl(fd, NVME_PASSTHROUGH_CMD, &pt) < 0) err(1, "identify request failed"); /* Convert data to host endian */ nvme_namespace_data_swapbytes(nsdata); if (nvme_completion_is_error(&pt.cpl)) errx(1, "identify request returned error"); } int open_dev(const char *str, int *fd, int show_error, int exit_on_error) { char full_path[64]; if (!strnstr(str, NVME_CTRLR_PREFIX, strlen(NVME_CTRLR_PREFIX))) { if (show_error) warnx("controller/namespace ids must begin with '%s'", NVME_CTRLR_PREFIX); if (exit_on_error) exit(1); else return (EINVAL); } snprintf(full_path, sizeof(full_path), _PATH_DEV"%s", str); *fd = open(full_path, O_RDWR); if (*fd < 0) { if (show_error) warn("could not open %s", full_path); if (exit_on_error) exit(1); else return (errno); } return (0); } void parse_ns_str(const char *ns_str, char *ctrlr_str, uint32_t *nsid) { char *nsloc; /* * Pull the namespace id from 
the string. +2 skips past the "ns" part * of the string. Don't search past 10 characters into the string, * otherwise we know it is malformed. */ nsloc = strnstr(ns_str, NVME_NS_PREFIX, 10); if (nsloc != NULL) *nsid = strtol(nsloc + 2, NULL, 10); if (nsloc == NULL || (*nsid == 0 && errno != 0)) errx(1, "invalid namespace ID '%s'", ns_str); /* * The controller string will include only the nvmX part of the * nvmeXnsY string. */ snprintf(ctrlr_str, nsloc - ns_str + 1, "%s", ns_str); } int main(int argc, char *argv[]) { if (argc < 2) gen_usage(funcs); dispatch(argc, argv, funcs); return (0); } Index: head/sbin/nvmecontrol/power.c =================================================================== --- head/sbin/nvmecontrol/power.c (revision 338181) +++ head/sbin/nvmecontrol/power.c (revision 338182) @@ -1,195 +1,195 @@ /*- * Copyright (c) 2016 Netflix, Inc * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include "nvmecontrol.h" _Static_assert(sizeof(struct nvme_power_state) == 256 / NBBY, "nvme_power_state size wrong"); static void power_usage(void) { fprintf(stderr, "usage:\n"); fprintf(stderr, POWER_USAGE); exit(1); } static void power_list_one(int i, struct nvme_power_state *nps) { int mpower, apower, ipower; uint8_t mps, nops, aps, apw; mps = (nps->mps_nops >> NVME_PWR_ST_MPS_SHIFT) & NVME_PWR_ST_MPS_MASK; nops = (nps->mps_nops >> NVME_PWR_ST_NOPS_SHIFT) & NVME_PWR_ST_NOPS_MASK; apw = (nps->apw_aps >> NVME_PWR_ST_APW_SHIFT) & NVME_PWR_ST_APW_MASK; aps = (nps->apw_aps >> NVME_PWR_ST_APS_SHIFT) & NVME_PWR_ST_APS_MASK; mpower = nps->mp; if (mps == 0) mpower *= 100; ipower = nps->idlp; if (nps->ips == 1) ipower *= 100; apower = nps->actp; if (aps == 1) apower *= 100; printf("%2d: %2d.%04dW%c %3d.%03dms %3d.%03dms %2d %2d %2d %2d %2d.%04dW %2d.%04dW %d\n", i, mpower / 10000, mpower % 10000, nops ? 
'*' : ' ', nps->enlat / 1000, nps->enlat % 1000, nps->exlat / 1000, nps->exlat % 1000, nps->rrt, nps->rrl, nps->rwt, nps->rwl, ipower / 10000, ipower % 10000, apower / 10000, apower % 10000, apw); } static void power_list(struct nvme_controller_data *cdata) { int i; printf("\nPower States Supported: %d\n\n", cdata->npss + 1); printf(" # Max pwr Enter Lat Exit Lat RT RL WT WL Idle Pwr Act Pwr Workload\n"); printf("-- -------- --------- --------- -- -- -- -- -------- -------- --\n"); for (i = 0; i <= cdata->npss; i++) power_list_one(i, &cdata->power_state[i]); } static void power_set(int fd, int power_val, int workload, int perm) { struct nvme_pt_command pt; uint32_t p; p = perm ? (1u << 31) : 0; memset(&pt, 0, sizeof(pt)); - pt.cmd.opc_fuse = NVME_CMD_SET_OPC(NVME_OPC_SET_FEATURES); + pt.cmd.opc = NVME_OPC_SET_FEATURES; pt.cmd.cdw10 = htole32(NVME_FEAT_POWER_MANAGEMENT | p); /* bit 31 (SV): save across resets */ pt.cmd.cdw11 = htole32(power_val | (workload << 5)); /* PS in bits 4:0, workload hint in bits 7:5 */ if (ioctl(fd, NVME_PASSTHROUGH_CMD, &pt) < 0) err(1, "set feature power mgmt request failed"); if (nvme_completion_is_error(&pt.cpl)) errx(1, "set feature power mgmt request returned error"); } static void power_show(int fd) { struct nvme_pt_command pt; memset(&pt, 0, sizeof(pt)); - pt.cmd.opc_fuse = NVME_CMD_SET_OPC(NVME_OPC_GET_FEATURES); + pt.cmd.opc = NVME_OPC_GET_FEATURES; pt.cmd.cdw10 = htole32(NVME_FEAT_POWER_MANAGEMENT); if (ioctl(fd, NVME_PASSTHROUGH_CMD, &pt) < 0) err(1, "get feature power mgmt request failed"); if (nvme_completion_is_error(&pt.cpl)) errx(1, "get feature power mgmt request returned error"); printf("Current Power Mode is %d\n", pt.cpl.cdw0); } void power(int argc, char *argv[]) { struct nvme_controller_data cdata; int ch, listflag = 0, powerflag = 0, power_val = 0, fd; int workload = 0; char *end; while ((ch = getopt(argc, argv, "lp:w:")) != -1) { switch ((char)ch) { case 'l': listflag = 1; break; case 'p': powerflag = 1; power_val = strtol(optarg, &end, 0); if (*end != '\0') { fprintf(stderr, "Invalid power state number: %s\n", optarg); power_usage(); } break; case 'w': workload = strtol(optarg, &end, 0); if (*end != '\0') { fprintf(stderr, "Invalid workload hint: %s\n", optarg); power_usage(); } break; default: power_usage(); } } /* Check that a controller was specified. */ if (optind >= argc) power_usage(); if (listflag && powerflag) { fprintf(stderr, "Can't set power and list power states\n"); power_usage(); } open_dev(argv[optind], &fd, 1, 1); read_controller_data(fd, &cdata); if (listflag) { power_list(&cdata); goto out; } if (powerflag) { power_set(fd, power_val, workload, 0); goto out; } power_show(fd); out: close(fd); exit(0); } Index: head/sbin/nvmecontrol/wdc.c =================================================================== --- head/sbin/nvmecontrol/wdc.c (revision 338181) +++ head/sbin/nvmecontrol/wdc.c (revision 338182) @@ -1,196 +1,196 @@ /*- * Copyright (c) 2017 Netflix, Inc * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution.
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include "nvmecontrol.h" #define WDC_NVME_TOC_SIZE 8 #define WDC_NVME_CAP_DIAG_OPCODE 0xe6 #define WDC_NVME_CAP_DIAG_CMD 0x0000 static void wdc_cap_diag(int argc, char *argv[]); #define WDC_CAP_DIAG_USAGE "\tnvmecontrol wdc cap-diag [-o path-template]\n" static struct nvme_function wdc_funcs[] = { {"cap-diag", wdc_cap_diag, WDC_CAP_DIAG_USAGE}, {NULL, NULL, NULL}, }; static void wdc_append_serial_name(int fd, char *buf, size_t len, const char *suffix) { struct nvme_controller_data cdata; char sn[NVME_SERIAL_NUMBER_LENGTH + 1]; char *walker; len -= strlen(buf); buf += strlen(buf); read_controller_data(fd, &cdata); memcpy(sn, cdata.sn, NVME_SERIAL_NUMBER_LENGTH); walker = sn + NVME_SERIAL_NUMBER_LENGTH - 1; while (walker > sn && *walker == ' ') walker--; *++walker = '\0'; snprintf(buf, len, "%s%s.bin", sn, suffix); } static void wdc_get_data(int fd, uint32_t opcode, uint32_t len, uint32_t off, uint32_t cmd, uint8_t *buffer, size_t buflen) { struct nvme_pt_command pt; memset(&pt, 0, sizeof(pt)); - pt.cmd.opc_fuse = NVME_CMD_SET_OPC(opcode); + pt.cmd.opc = opcode; pt.cmd.cdw10 = htole32(len / sizeof(uint32_t)); /* - 1 like all the others ??? */ pt.cmd.cdw11 = htole32(off / sizeof(uint32_t)); pt.cmd.cdw12 = htole32(cmd); pt.buf = buffer; pt.len = buflen; pt.is_read = 1; // printf("opcode %#x cdw10(len) %#x cdw11(offset?) %#x cdw12(cmd/sub) %#x buflen %zd\n", // (int)opcode, (int)cdw10, (int)cdw11, (int)cdw12, buflen); if (ioctl(fd, NVME_PASSTHROUGH_CMD, &pt) < 0) err(1, "wdc_get_data request failed"); if (nvme_completion_is_error(&pt.cpl)) errx(1, "wdc_get_data request returned error"); } static void wdc_do_dump(int fd, char *tmpl, const char *suffix, uint32_t opcode, uint32_t cmd, int len_off) { int first; int fd2; uint8_t *buf; uint32_t len, offset; size_t resid; wdc_append_serial_name(fd, tmpl, MAXPATHLEN, suffix); /* XXX overwrite protection? */ fd2 = open(tmpl, O_WRONLY | O_CREAT | O_TRUNC, 0644); if (fd2 < 0) err(1, "open %s", tmpl); buf = aligned_alloc(PAGE_SIZE, NVME_MAX_XFER_SIZE); if (buf == NULL) errx(1, "Can't get buffer to read dump"); offset = 0; len = NVME_MAX_XFER_SIZE; first = 1; do { resid = len > NVME_MAX_XFER_SIZE ? NVME_MAX_XFER_SIZE : len; wdc_get_data(fd, opcode, resid, offset, cmd, buf, resid); if (first) { len = be32dec(buf + len_off); if (len == 0) errx(1, "No data for %s", suffix); if (memcmp("E6LG", buf, 4) != 0) printf("Expected header of E6LG, found '%4.4s' instead\n", buf); printf("Dumping %d bytes of version %d.%d log to %s\n", len, buf[8], buf[9], tmpl); /* * Adjust amount to dump if total dump < 1MB, * though it likely doesn't matter to the WDC * analysis tools. 
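 *
 * Header layout the first-pass branch above decodes (offsets inferred
 * from this code, not from a published WDC specification):
 *
 *	bytes 0..3: "E6LG" magic
 *	bytes 4..7: total log length, big-endian (len_off is 4 for cap-diag)
 *	bytes 8..9: log version, printed as major.minor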
*/ if (resid > len) resid = len; first = 0; } if (write(fd2, buf, resid) != (ssize_t)resid) err(1, "write"); offset += resid; len -= resid; } while (len > 0); free(buf); close(fd2); } static void wdc_cap_diag_usage(void) { fprintf(stderr, "usage:\n"); fprintf(stderr, WDC_CAP_DIAG_USAGE); exit(1); } static void wdc_cap_diag(int argc, char *argv[]) { char path_tmpl[MAXPATHLEN]; int ch, fd; path_tmpl[0] = '\0'; while ((ch = getopt(argc, argv, "o:")) != -1) { switch ((char)ch) { case 'o': strlcpy(path_tmpl, optarg, MAXPATHLEN); break; default: wdc_cap_diag_usage(); } } /* Check that a controller was specified. */ if (optind >= argc) wdc_cap_diag_usage(); open_dev(argv[optind], &fd, 1, 1); wdc_do_dump(fd, path_tmpl, "cap_diag", WDC_NVME_CAP_DIAG_OPCODE, WDC_NVME_CAP_DIAG_CMD, 4); close(fd); exit(1); } void wdc(int argc, char *argv[]) { dispatch(argc, argv, wdc_funcs); } Index: head/sys/cam/nvme/nvme_all.c =================================================================== --- head/sys/cam/nvme/nvme_all.c (revision 338181) +++ head/sys/cam/nvme/nvme_all.c (revision 338182) @@ -1,168 +1,163 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2015 Netflix, Inc * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer, * without modification, immediately at the beginning of the file. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #ifdef _KERNEL #include "opt_scsi.h" #include #include #include #include #include #else #include #include #include #include #ifndef min #define min(a,b) (((a)<(b))?(a):(b)) #endif #endif #include #include #include #include #include #include #include #ifdef _KERNEL #include #include #include #include #endif void nvme_ns_cmd(struct ccb_nvmeio *nvmeio, uint8_t cmd, uint32_t nsid, uint32_t cdw10, uint32_t cdw11, uint32_t cdw12, uint32_t cdw13, uint32_t cdw14, uint32_t cdw15) { bzero(&nvmeio->cmd, sizeof(struct nvme_command)); - nvmeio->cmd.opc_fuse = NVME_CMD_SET_OPC(cmd); + nvmeio->cmd.opc = cmd; nvmeio->cmd.nsid = htole32(nsid); nvmeio->cmd.cdw10 = htole32(cdw10); nvmeio->cmd.cdw11 = htole32(cdw11); nvmeio->cmd.cdw12 = htole32(cdw12); nvmeio->cmd.cdw13 = htole32(cdw13); nvmeio->cmd.cdw14 = htole32(cdw14); nvmeio->cmd.cdw15 = htole32(cdw15); } int nvme_identify_match(caddr_t identbuffer, caddr_t table_entry) { return 0; } void nvme_print_ident(const struct nvme_controller_data *cdata, const struct nvme_namespace_data *data, struct sbuf *sb) { sbuf_printf(sb, "<"); cam_strvis_sbuf(sb, cdata->mn, sizeof(cdata->mn), 0); sbuf_printf(sb, " "); cam_strvis_sbuf(sb, cdata->fr, sizeof(cdata->fr), 0); sbuf_printf(sb, " "); cam_strvis_sbuf(sb, cdata->sn, sizeof(cdata->sn), 0); sbuf_printf(sb, ">\n"); } /* XXX need to do nvme admin opcodes too, but those aren't used yet by nda */ static const char * nvme_opc2str[] = { "FLUSH", "WRITE", "READ", "RSVD-3", "WRITE_UNCORRECTABLE", "COMPARE", "RSVD-6", "RSVD-7", "DATASET_MANAGEMENT" }; const char * nvme_op_string(const struct nvme_command *cmd) { - uint8_t opc; - opc = (cmd->opc_fuse >> NVME_CMD_OPC_SHIFT) & NVME_CMD_OPC_MASK; - if (opc >= nitems(nvme_opc2str)) + if (cmd->opc >= nitems(nvme_opc2str)) return "UNKNOWN"; - return nvme_opc2str[opc]; + return nvme_opc2str[cmd->opc]; } const char * nvme_cmd_string(const struct nvme_command *cmd, char *cmd_string, size_t len) { - uint8_t opc, fuse; - opc = (cmd->opc_fuse >> NVME_CMD_OPC_SHIFT) & NVME_CMD_OPC_MASK; - fuse = (cmd->opc_fuse >> NVME_CMD_FUSE_SHIFT) & NVME_CMD_FUSE_MASK; /* * cid, rsvd areas and mptr not printed, since they are used * only internally by the SIM. */ snprintf(cmd_string, len, "opc=%x fuse=%x nsid=%x prp1=%llx prp2=%llx cdw=%x %x %x %x %x %x", - opc, fuse, cmd->nsid, + cmd->opc, cmd->fuse, cmd->nsid, (unsigned long long)cmd->prp1, (unsigned long long)cmd->prp2, cmd->cdw10, cmd->cdw11, cmd->cdw12, cmd->cdw13, cmd->cdw14, cmd->cdw15); return cmd_string; } const void * nvme_get_identify_cntrl(struct cam_periph *periph) { struct cam_ed *device; device = periph->path->device; return device->nvme_cdata; } const void * nvme_get_identify_ns(struct cam_periph *periph) { struct cam_ed *device; device = periph->path->device; return device->nvme_data; } Index: head/sys/dev/mpr/mpr_sas.c =================================================================== --- head/sys/dev/mpr/mpr_sas.c (revision 338181) +++ head/sys/dev/mpr/mpr_sas.c (revision 338182) @@ -1,3898 +1,3898 @@ /*- * Copyright (c) 2009 Yahoo! Inc. * Copyright (c) 2011-2015 LSI Corp. * Copyright (c) 2013-2016 Avago Technologies * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * Avago Technologies (LSI) MPT-Fusion Host Adapter FreeBSD * */ #include __FBSDID("$FreeBSD$"); /* Communications core for Avago Technologies (LSI) MPT3 */ /* TODO Move headers to mprvar */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if __FreeBSD_version >= 900026 #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #define MPRSAS_DISCOVERY_TIMEOUT 20 #define MPRSAS_MAX_DISCOVERY_TIMEOUTS 10 /* 200 seconds */ /* * static array to check SCSI OpCode for EEDP protection bits */ #define PRO_R MPI2_SCSIIO_EEDPFLAGS_CHECK_REMOVE_OP #define PRO_W MPI2_SCSIIO_EEDPFLAGS_INSERT_OP #define PRO_V MPI2_SCSIIO_EEDPFLAGS_INSERT_OP static uint8_t op_code_prot[256] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, PRO_R, 0, PRO_W, 0, 0, 0, PRO_W, PRO_V, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, PRO_W, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, PRO_R, 0, PRO_W, 0, 0, 0, PRO_W, PRO_V, 0, 0, 0, PRO_W, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, PRO_R, 0, PRO_W, 0, 0, 0, PRO_W, PRO_V, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; MALLOC_DEFINE(M_MPRSAS, "MPRSAS", "MPR SAS memory"); static void mprsas_remove_device(struct mpr_softc *, struct mpr_command *); static void mprsas_remove_complete(struct mpr_softc *, struct mpr_command *); static void mprsas_action(struct cam_sim *sim, union ccb *ccb); static void mprsas_poll(struct cam_sim *sim); static void mprsas_scsiio_timeout(void *data); static void mprsas_abort_complete(struct mpr_softc *sc, struct mpr_command *cm); static void mprsas_action_scsiio(struct mprsas_softc *, union ccb *); static void mprsas_scsiio_complete(struct mpr_softc *, struct mpr_command *); static void mprsas_action_resetdev(struct mprsas_softc *, union ccb *); static void mprsas_resetdev_complete(struct mpr_softc *, struct mpr_command *); static int mprsas_send_abort(struct mpr_softc 
*sc, struct mpr_command *tm, struct mpr_command *cm); static void mprsas_async(void *callback_arg, uint32_t code, struct cam_path *path, void *arg); #if (__FreeBSD_version < 901503) || \ ((__FreeBSD_version >= 1000000) && (__FreeBSD_version < 1000006)) static void mprsas_check_eedp(struct mpr_softc *sc, struct cam_path *path, struct ccb_getdev *cgd); static void mprsas_read_cap_done(struct cam_periph *periph, union ccb *done_ccb); #endif static int mprsas_send_portenable(struct mpr_softc *sc); static void mprsas_portenable_complete(struct mpr_softc *sc, struct mpr_command *cm); #if __FreeBSD_version >= 900026 static void mprsas_smpio_complete(struct mpr_softc *sc, struct mpr_command *cm); static void mprsas_send_smpcmd(struct mprsas_softc *sassc, union ccb *ccb, uint64_t sasaddr); static void mprsas_action_smpio(struct mprsas_softc *sassc, union ccb *ccb); #endif //FreeBSD_version >= 900026 struct mprsas_target * mprsas_find_target_by_handle(struct mprsas_softc *sassc, int start, uint16_t handle) { struct mprsas_target *target; int i; for (i = start; i < sassc->maxtargets; i++) { target = &sassc->targets[i]; if (target->handle == handle) return (target); } return (NULL); } /* we need to freeze the simq during attach and diag reset, to avoid failing * commands before device handles have been found by discovery. Since * discovery involves reading config pages and possibly sending commands, * discovery actions may continue even after we receive the end of discovery * event, so refcount discovery actions instead of assuming we can unfreeze * the simq when we get the event. */ void mprsas_startup_increment(struct mprsas_softc *sassc) { MPR_FUNCTRACE(sassc->sc); if ((sassc->flags & MPRSAS_IN_STARTUP) != 0) { if (sassc->startup_refcount++ == 0) { /* just starting, freeze the simq */ mpr_dprint(sassc->sc, MPR_INIT, "%s freezing simq\n", __func__); #if (__FreeBSD_version >= 1000039) || \ ((__FreeBSD_version < 1000000) && (__FreeBSD_version >= 902502)) xpt_hold_boot(); #endif xpt_freeze_simq(sassc->sim, 1); } mpr_dprint(sassc->sc, MPR_INIT, "%s refcount %u\n", __func__, sassc->startup_refcount); } } void mprsas_release_simq_reinit(struct mprsas_softc *sassc) { if (sassc->flags & MPRSAS_QUEUE_FROZEN) { sassc->flags &= ~MPRSAS_QUEUE_FROZEN; xpt_release_simq(sassc->sim, 1); mpr_dprint(sassc->sc, MPR_INFO, "Unfreezing SIM queue\n"); } } void mprsas_startup_decrement(struct mprsas_softc *sassc) { MPR_FUNCTRACE(sassc->sc); if ((sassc->flags & MPRSAS_IN_STARTUP) != 0) { if (--sassc->startup_refcount == 0) { /* finished all discovery-related actions, release * the simq and rescan for the latest topology. */ mpr_dprint(sassc->sc, MPR_INIT, "%s releasing simq\n", __func__); sassc->flags &= ~MPRSAS_IN_STARTUP; xpt_release_simq(sassc->sim, 1); #if (__FreeBSD_version >= 1000039) || \ ((__FreeBSD_version < 1000000) && (__FreeBSD_version >= 902502)) xpt_release_boot(); #else mprsas_rescan_target(sassc->sc, NULL); #endif } mpr_dprint(sassc->sc, MPR_INIT, "%s refcount %u\n", __func__, sassc->startup_refcount); } } /* The firmware requires us to stop sending commands when we're doing task * management, so refcount the TMs and keep the simq frozen when any are in * use. */ struct mpr_command * mprsas_alloc_tm(struct mpr_softc *sc) { struct mpr_command *tm; MPR_FUNCTRACE(sc); tm = mpr_alloc_high_priority_command(sc); return tm; } void mprsas_free_tm(struct mpr_softc *sc, struct mpr_command *tm) { int target_id = 0xFFFFFFFF; MPR_FUNCTRACE(sc); if (tm == NULL) return; /* * For TM's the devq is frozen for the device. 
Unfreeze it here and * free the resources used for freezing the devq. Must clear the * INRESET flag as well or scsi I/O will not work. */ if (tm->cm_targ != NULL) { tm->cm_targ->flags &= ~MPRSAS_TARGET_INRESET; target_id = tm->cm_targ->tid; } if (tm->cm_ccb) { mpr_dprint(sc, MPR_INFO, "Unfreezing devq for target ID %d\n", target_id); xpt_release_devq(tm->cm_ccb->ccb_h.path, 1, TRUE); xpt_free_path(tm->cm_ccb->ccb_h.path); xpt_free_ccb(tm->cm_ccb); } mpr_free_high_priority_command(sc, tm); } void mprsas_rescan_target(struct mpr_softc *sc, struct mprsas_target *targ) { struct mprsas_softc *sassc = sc->sassc; path_id_t pathid; target_id_t targetid; union ccb *ccb; MPR_FUNCTRACE(sc); pathid = cam_sim_path(sassc->sim); if (targ == NULL) targetid = CAM_TARGET_WILDCARD; else targetid = targ - sassc->targets; /* * Allocate a CCB and schedule a rescan. */ ccb = xpt_alloc_ccb_nowait(); if (ccb == NULL) { mpr_dprint(sc, MPR_ERROR, "unable to alloc CCB for rescan\n"); return; } if (xpt_create_path(&ccb->ccb_h.path, NULL, pathid, targetid, CAM_LUN_WILDCARD) != CAM_REQ_CMP) { mpr_dprint(sc, MPR_ERROR, "unable to create path for rescan\n"); xpt_free_ccb(ccb); return; } if (targetid == CAM_TARGET_WILDCARD) ccb->ccb_h.func_code = XPT_SCAN_BUS; else ccb->ccb_h.func_code = XPT_SCAN_TGT; mpr_dprint(sc, MPR_TRACE, "%s targetid %u\n", __func__, targetid); xpt_rescan(ccb); } static void mprsas_log_command(struct mpr_command *cm, u_int level, const char *fmt, ...) { struct sbuf sb; va_list ap; char str[192]; char path_str[64]; if (cm == NULL) return; /* No need to be in here if debugging isn't enabled */ if ((cm->cm_sc->mpr_debug & level) == 0) return; sbuf_new(&sb, str, sizeof(str), 0); va_start(ap, fmt); if (cm->cm_ccb != NULL) { xpt_path_string(cm->cm_ccb->csio.ccb_h.path, path_str, sizeof(path_str)); sbuf_cat(&sb, path_str); if (cm->cm_ccb->ccb_h.func_code == XPT_SCSI_IO) { scsi_command_string(&cm->cm_ccb->csio, &sb); sbuf_printf(&sb, "length %d ", cm->cm_ccb->csio.dxfer_len); } } else { sbuf_printf(&sb, "(noperiph:%s%d:%u:%u:%u): ", cam_sim_name(cm->cm_sc->sassc->sim), cam_sim_unit(cm->cm_sc->sassc->sim), cam_sim_bus(cm->cm_sc->sassc->sim), cm->cm_targ ? cm->cm_targ->tid : 0xFFFFFFFF, cm->cm_lun); } sbuf_printf(&sb, "SMID %u ", cm->cm_desc.Default.SMID); sbuf_vprintf(&sb, fmt, ap); sbuf_finish(&sb); mpr_print_field(cm->cm_sc, "%s", sbuf_data(&sb)); va_end(ap); } static void mprsas_remove_volume(struct mpr_softc *sc, struct mpr_command *tm) { MPI2_SCSI_TASK_MANAGE_REPLY *reply; struct mprsas_target *targ; uint16_t handle; MPR_FUNCTRACE(sc); reply = (MPI2_SCSI_TASK_MANAGE_REPLY *)tm->cm_reply; handle = (uint16_t)(uintptr_t)tm->cm_complete_data; targ = tm->cm_targ; if (reply == NULL) { /* XXX retry the remove after the diag reset completes? */ mpr_dprint(sc, MPR_FAULT, "%s NULL reply resetting device " "0x%04x\n", __func__, handle); mprsas_free_tm(sc, tm); return; } if ((le16toh(reply->IOCStatus) & MPI2_IOCSTATUS_MASK) != MPI2_IOCSTATUS_SUCCESS) { mpr_dprint(sc, MPR_ERROR, "IOCStatus = 0x%x while resetting " "device 0x%x\n", le16toh(reply->IOCStatus), handle); } mpr_dprint(sc, MPR_XINFO, "Reset aborted %u commands\n", le32toh(reply->TerminationCount)); mpr_free_reply(sc, tm->cm_reply_data); tm->cm_reply = NULL; /* Ensures the reply won't get re-freed */ mpr_dprint(sc, MPR_XINFO, "clearing target %u handle 0x%04x\n", targ->tid, handle); /* * Don't clear target if remove fails because things will get confusing. 
* Leave the devname and sasaddr intact so that we know to avoid reusing * this target id if possible, and so we can assign the same target id * to this device if it comes back in the future. */ if ((le16toh(reply->IOCStatus) & MPI2_IOCSTATUS_MASK) == MPI2_IOCSTATUS_SUCCESS) { targ = tm->cm_targ; targ->handle = 0x0; targ->encl_handle = 0x0; targ->encl_level_valid = 0x0; targ->encl_level = 0x0; targ->connector_name[0] = ' '; targ->connector_name[1] = ' '; targ->connector_name[2] = ' '; targ->connector_name[3] = ' '; targ->encl_slot = 0x0; targ->exp_dev_handle = 0x0; targ->phy_num = 0x0; targ->linkrate = 0x0; targ->devinfo = 0x0; targ->flags = 0x0; targ->scsi_req_desc_type = 0; } mprsas_free_tm(sc, tm); } /* * No Need to call "MPI2_SAS_OP_REMOVE_DEVICE" For Volume removal. * Otherwise Volume Delete is same as Bare Drive Removal. */ void mprsas_prepare_volume_remove(struct mprsas_softc *sassc, uint16_t handle) { MPI2_SCSI_TASK_MANAGE_REQUEST *req; struct mpr_softc *sc; struct mpr_command *cm; struct mprsas_target *targ = NULL; MPR_FUNCTRACE(sassc->sc); sc = sassc->sc; targ = mprsas_find_target_by_handle(sassc, 0, handle); if (targ == NULL) { /* FIXME: what is the action? */ /* We don't know about this device? */ mpr_dprint(sc, MPR_ERROR, "%s %d : invalid handle 0x%x \n", __func__,__LINE__, handle); return; } targ->flags |= MPRSAS_TARGET_INREMOVAL; cm = mprsas_alloc_tm(sc); if (cm == NULL) { mpr_dprint(sc, MPR_ERROR, "%s: command alloc failure\n", __func__); return; } mprsas_rescan_target(sc, targ); req = (MPI2_SCSI_TASK_MANAGE_REQUEST *)cm->cm_req; req->DevHandle = targ->handle; req->Function = MPI2_FUNCTION_SCSI_TASK_MGMT; req->TaskType = MPI2_SCSITASKMGMT_TASKTYPE_TARGET_RESET; /* SAS Hard Link Reset / SATA Link Reset */ req->MsgFlags = MPI2_SCSITASKMGMT_MSGFLAGS_LINK_RESET; cm->cm_targ = targ; cm->cm_data = NULL; cm->cm_desc.HighPriority.RequestFlags = MPI2_REQ_DESCRIPT_FLAGS_HIGH_PRIORITY; cm->cm_complete = mprsas_remove_volume; cm->cm_complete_data = (void *)(uintptr_t)handle; mpr_dprint(sc, MPR_INFO, "%s: Sending reset for target ID %d\n", __func__, targ->tid); mprsas_prepare_for_tm(sc, cm, targ, CAM_LUN_WILDCARD); mpr_map_command(sc, cm); } /* * The firmware performs debounce on the link to avoid transient link errors * and false removals. When it does decide that link has been lost and a * device needs to go away, it expects that the host will perform a target reset * and then an op remove. The reset has the side-effect of aborting any * outstanding requests for the device, which is required for the op-remove to * succeed. It's not clear if the host should check for the device coming back * alive after the reset. */ void mprsas_prepare_remove(struct mprsas_softc *sassc, uint16_t handle) { MPI2_SCSI_TASK_MANAGE_REQUEST *req; struct mpr_softc *sc; struct mpr_command *cm; struct mprsas_target *targ = NULL; MPR_FUNCTRACE(sassc->sc); sc = sassc->sc; targ = mprsas_find_target_by_handle(sassc, 0, handle); if (targ == NULL) { /* FIXME: what is the action? */ /* We don't know about this device? 
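 *
 * (Removal handshake reminder: a bare drive gets the full two-step
 * sequence set up here, TARGET_RESET followed by
 * SAS_IO_UNIT_CONTROL/REMOVE_DEVICE, while a volume stops after the
 * reset; see mprsas_prepare_volume_remove() above.)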
*/ mpr_dprint(sc, MPR_ERROR, "%s : invalid handle 0x%x \n", __func__, handle); return; } targ->flags |= MPRSAS_TARGET_INREMOVAL; cm = mprsas_alloc_tm(sc); if (cm == NULL) { mpr_dprint(sc, MPR_ERROR, "%s: command alloc failure\n", __func__); return; } mprsas_rescan_target(sc, targ); req = (MPI2_SCSI_TASK_MANAGE_REQUEST *)cm->cm_req; memset(req, 0, sizeof(*req)); req->DevHandle = htole16(targ->handle); req->Function = MPI2_FUNCTION_SCSI_TASK_MGMT; req->TaskType = MPI2_SCSITASKMGMT_TASKTYPE_TARGET_RESET; /* SAS Hard Link Reset / SATA Link Reset */ req->MsgFlags = MPI2_SCSITASKMGMT_MSGFLAGS_LINK_RESET; cm->cm_targ = targ; cm->cm_data = NULL; cm->cm_desc.HighPriority.RequestFlags = MPI2_REQ_DESCRIPT_FLAGS_HIGH_PRIORITY; cm->cm_complete = mprsas_remove_device; cm->cm_complete_data = (void *)(uintptr_t)handle; mpr_dprint(sc, MPR_INFO, "%s: Sending reset for target ID %d\n", __func__, targ->tid); mprsas_prepare_for_tm(sc, cm, targ, CAM_LUN_WILDCARD); mpr_map_command(sc, cm); } static void mprsas_remove_device(struct mpr_softc *sc, struct mpr_command *tm) { MPI2_SCSI_TASK_MANAGE_REPLY *reply; MPI2_SAS_IOUNIT_CONTROL_REQUEST *req; struct mprsas_target *targ; struct mpr_command *next_cm; uint16_t handle; MPR_FUNCTRACE(sc); reply = (MPI2_SCSI_TASK_MANAGE_REPLY *)tm->cm_reply; handle = (uint16_t)(uintptr_t)tm->cm_complete_data; targ = tm->cm_targ; /* * Currently there should be no way we can hit this case. It only * happens when we have a failure to allocate chain frames, and * task management commands don't have S/G lists. */ if ((tm->cm_flags & MPR_CM_FLAGS_ERROR_MASK) != 0) { mpr_dprint(sc, MPR_ERROR, "%s: cm_flags = %#x for remove of " "handle %#04x! This should not happen!\n", __func__, tm->cm_flags, handle); } if (reply == NULL) { /* XXX retry the remove after the diag reset completes? 
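 *
 * For context, the success path below reuses this same mpr_command to
 * queue the second half of the removal handshake:
 *
 *	req = (MPI2_SAS_IOUNIT_CONTROL_REQUEST *)tm->cm_req;
 *	req->Function  = MPI2_FUNCTION_SAS_IO_UNIT_CONTROL;
 *	req->Operation = MPI2_SAS_OP_REMOVE_DEVICE;
 *	tm->cm_complete = mprsas_remove_complete;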
*/ mpr_dprint(sc, MPR_FAULT, "%s NULL reply resetting device " "0x%04x\n", __func__, handle); mprsas_free_tm(sc, tm); return; } if ((le16toh(reply->IOCStatus) & MPI2_IOCSTATUS_MASK) != MPI2_IOCSTATUS_SUCCESS) { mpr_dprint(sc, MPR_ERROR, "IOCStatus = 0x%x while resetting " "device 0x%x\n", le16toh(reply->IOCStatus), handle); } mpr_dprint(sc, MPR_XINFO, "Reset aborted %u commands\n", le32toh(reply->TerminationCount)); mpr_free_reply(sc, tm->cm_reply_data); tm->cm_reply = NULL; /* Ensures the reply won't get re-freed */ /* Reuse the existing command */ req = (MPI2_SAS_IOUNIT_CONTROL_REQUEST *)tm->cm_req; memset(req, 0, sizeof(*req)); req->Function = MPI2_FUNCTION_SAS_IO_UNIT_CONTROL; req->Operation = MPI2_SAS_OP_REMOVE_DEVICE; req->DevHandle = htole16(handle); tm->cm_data = NULL; tm->cm_desc.Default.RequestFlags = MPI2_REQ_DESCRIPT_FLAGS_DEFAULT_TYPE; tm->cm_complete = mprsas_remove_complete; tm->cm_complete_data = (void *)(uintptr_t)handle; mpr_map_command(sc, tm); mpr_dprint(sc, MPR_INFO, "clearing target %u handle 0x%04x\n", targ->tid, handle); if (targ->encl_level_valid) { mpr_dprint(sc, MPR_INFO, "At enclosure level %d, slot %d, " "connector name (%4s)\n", targ->encl_level, targ->encl_slot, targ->connector_name); } TAILQ_FOREACH_SAFE(tm, &targ->commands, cm_link, next_cm) { union ccb *ccb; mpr_dprint(sc, MPR_XINFO, "Completing missed command %p\n", tm); ccb = tm->cm_complete_data; mprsas_set_ccbstatus(ccb, CAM_DEV_NOT_THERE); mprsas_scsiio_complete(sc, tm); } } static void mprsas_remove_complete(struct mpr_softc *sc, struct mpr_command *tm) { MPI2_SAS_IOUNIT_CONTROL_REPLY *reply; uint16_t handle; struct mprsas_target *targ; struct mprsas_lun *lun; MPR_FUNCTRACE(sc); reply = (MPI2_SAS_IOUNIT_CONTROL_REPLY *)tm->cm_reply; handle = (uint16_t)(uintptr_t)tm->cm_complete_data; /* * Currently there should be no way we can hit this case. It only * happens when we have a failure to allocate chain frames, and * task management commands don't have S/G lists. */ if ((tm->cm_flags & MPR_CM_FLAGS_ERROR_MASK) != 0) { mpr_dprint(sc, MPR_XINFO, "%s: cm_flags = %#x for remove of " "handle %#04x! This should not happen!\n", __func__, tm->cm_flags, handle); mprsas_free_tm(sc, tm); return; } if (reply == NULL) { /* most likely a chip reset */ mpr_dprint(sc, MPR_FAULT, "%s NULL reply removing device " "0x%04x\n", __func__, handle); mprsas_free_tm(sc, tm); return; } mpr_dprint(sc, MPR_XINFO, "%s on handle 0x%04x, IOCStatus= 0x%x\n", __func__, handle, le16toh(reply->IOCStatus)); /* * Don't clear target if remove fails because things will get confusing. * Leave the devname and sasaddr intact so that we know to avoid reusing * this target id if possible, and so we can assign the same target id * to this device if it comes back in the future. 
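 *
 * ("Clearing" below wipes the handle, enclosure, phy/link and flag
 * state and frees any cached LUN records; devname and sasaddr
 * deliberately survive so the same target id can be matched again.)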
*/ if ((le16toh(reply->IOCStatus) & MPI2_IOCSTATUS_MASK) == MPI2_IOCSTATUS_SUCCESS) { targ = tm->cm_targ; targ->handle = 0x0; targ->encl_handle = 0x0; targ->encl_level_valid = 0x0; targ->encl_level = 0x0; targ->connector_name[0] = ' '; targ->connector_name[1] = ' '; targ->connector_name[2] = ' '; targ->connector_name[3] = ' '; targ->encl_slot = 0x0; targ->exp_dev_handle = 0x0; targ->phy_num = 0x0; targ->linkrate = 0x0; targ->devinfo = 0x0; targ->flags = 0x0; targ->scsi_req_desc_type = 0; while (!SLIST_EMPTY(&targ->luns)) { lun = SLIST_FIRST(&targ->luns); SLIST_REMOVE_HEAD(&targ->luns, lun_link); free(lun, M_MPR); } } mprsas_free_tm(sc, tm); } static int mprsas_register_events(struct mpr_softc *sc) { uint8_t events[16]; bzero(events, 16); setbit(events, MPI2_EVENT_SAS_DEVICE_STATUS_CHANGE); setbit(events, MPI2_EVENT_SAS_DISCOVERY); setbit(events, MPI2_EVENT_SAS_BROADCAST_PRIMITIVE); setbit(events, MPI2_EVENT_SAS_INIT_DEVICE_STATUS_CHANGE); setbit(events, MPI2_EVENT_SAS_INIT_TABLE_OVERFLOW); setbit(events, MPI2_EVENT_SAS_TOPOLOGY_CHANGE_LIST); setbit(events, MPI2_EVENT_SAS_ENCL_DEVICE_STATUS_CHANGE); setbit(events, MPI2_EVENT_IR_CONFIGURATION_CHANGE_LIST); setbit(events, MPI2_EVENT_IR_VOLUME); setbit(events, MPI2_EVENT_IR_PHYSICAL_DISK); setbit(events, MPI2_EVENT_IR_OPERATION_STATUS); setbit(events, MPI2_EVENT_TEMP_THRESHOLD); setbit(events, MPI2_EVENT_SAS_DEVICE_DISCOVERY_ERROR); if (sc->facts->MsgVersion >= MPI2_VERSION_02_06) { setbit(events, MPI2_EVENT_ACTIVE_CABLE_EXCEPTION); if (sc->mpr_flags & MPR_FLAGS_GEN35_IOC) { setbit(events, MPI2_EVENT_PCIE_DEVICE_STATUS_CHANGE); setbit(events, MPI2_EVENT_PCIE_ENUMERATION); setbit(events, MPI2_EVENT_PCIE_TOPOLOGY_CHANGE_LIST); } } mpr_register_events(sc, events, mprsas_evt_handler, NULL, &sc->sassc->mprsas_eh); return (0); } int mpr_attach_sas(struct mpr_softc *sc) { struct mprsas_softc *sassc; cam_status status; int unit, error = 0, reqs; MPR_FUNCTRACE(sc); mpr_dprint(sc, MPR_INIT, "%s entered\n", __func__); sassc = malloc(sizeof(struct mprsas_softc), M_MPR, M_WAITOK|M_ZERO); if (!sassc) { mpr_dprint(sc, MPR_INIT|MPR_ERROR, "Cannot allocate SAS subsystem memory\n"); return (ENOMEM); } /* * XXX MaxTargets could change during a reinit. Since we don't * resize the targets[] array during such an event, cache the value * of MaxTargets here so that we don't get into trouble later. This * should move into the reinit logic. 
*/ sassc->maxtargets = sc->facts->MaxTargets + sc->facts->MaxVolumes; sassc->targets = malloc(sizeof(struct mprsas_target) * sassc->maxtargets, M_MPR, M_WAITOK|M_ZERO); if (!sassc->targets) { mpr_dprint(sc, MPR_INIT|MPR_ERROR, "Cannot allocate SAS target memory\n"); free(sassc, M_MPR); return (ENOMEM); } sc->sassc = sassc; sassc->sc = sc; reqs = sc->num_reqs - sc->num_prireqs - 1; if ((sassc->devq = cam_simq_alloc(reqs)) == NULL) { mpr_dprint(sc, MPR_INIT|MPR_ERROR, "Cannot allocate SIMQ\n"); error = ENOMEM; goto out; } unit = device_get_unit(sc->mpr_dev); sassc->sim = cam_sim_alloc(mprsas_action, mprsas_poll, "mpr", sassc, unit, &sc->mpr_mtx, reqs, reqs, sassc->devq); if (sassc->sim == NULL) { mpr_dprint(sc, MPR_INIT|MPR_ERROR, "Cannot allocate SIM\n"); error = EINVAL; goto out; } TAILQ_INIT(&sassc->ev_queue); /* Initialize taskqueue for Event Handling */ TASK_INIT(&sassc->ev_task, 0, mprsas_firmware_event_work, sc); sassc->ev_tq = taskqueue_create("mpr_taskq", M_NOWAIT | M_ZERO, taskqueue_thread_enqueue, &sassc->ev_tq); taskqueue_start_threads(&sassc->ev_tq, 1, PRIBIO, "%s taskq", device_get_nameunit(sc->mpr_dev)); mpr_lock(sc); /* * XXX There should be a bus for every port on the adapter, but since * we're just going to fake the topology for now, we'll pretend that * everything is just a target on a single bus. */ if ((error = xpt_bus_register(sassc->sim, sc->mpr_dev, 0)) != 0) { mpr_dprint(sc, MPR_INIT|MPR_ERROR, "Error %d registering SCSI bus\n", error); mpr_unlock(sc); goto out; } /* * Assume that discovery events will start right away. * * Hold off boot until discovery is complete. */ sassc->flags |= MPRSAS_IN_STARTUP | MPRSAS_IN_DISCOVERY; sc->sassc->startup_refcount = 0; mprsas_startup_increment(sassc); callout_init(&sassc->discovery_callout, 1 /*mpsafe*/); /* * Register for async events so we can determine the EEDP * capabilities of devices. */ status = xpt_create_path(&sassc->path, /*periph*/NULL, cam_sim_path(sc->sassc->sim), CAM_TARGET_WILDCARD, CAM_LUN_WILDCARD); if (status != CAM_REQ_CMP) { mpr_dprint(sc, MPR_INIT|MPR_ERROR, "Error %#x creating sim path\n", status); sassc->path = NULL; } else { int event; #if (__FreeBSD_version >= 1000006) || \ ((__FreeBSD_version >= 901503) && (__FreeBSD_version < 1000000)) event = AC_ADVINFO_CHANGED | AC_FOUND_DEVICE; #else event = AC_FOUND_DEVICE; #endif /* * Prior to the CAM locking improvements, we can't call * xpt_register_async() with a particular path specified. * * If a path isn't specified, xpt_register_async() will * generate a wildcard path and acquire the XPT lock while * it calls xpt_action() to execute the XPT_SASYNC_CB CCB. * It will then drop the XPT lock once that is done. * * If a path is specified for xpt_register_async(), it will * not acquire and drop the XPT lock around the call to * xpt_action(). xpt_action() asserts that the caller * holds the SIM lock, so the SIM lock has to be held when * calling xpt_register_async() when the path is specified. * * But xpt_register_async calls xpt_for_all_devices(), * which calls xptbustraverse(), which will acquire each * SIM lock. When it traverses our particular bus, it will * necessarily acquire the SIM lock, which will lead to a * recursive lock acquisition. * * The CAM locking changes fix this problem by acquiring * the XPT topology lock around bus traversal in * xptbustraverse(), so the caller can hold the SIM lock * and it does not cause a recursive lock acquisition. 
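 *
 * In short, the two arms compiled below look like this (a sketch of
 * the code that follows):
 *
 *	// pre-rework XPT: no path argument, drop the SIM lock around it
 *	mpr_unlock(sc);
 *	status = xpt_register_async(event, mprsas_async, sc, NULL);
 *	mpr_lock(sc);
 *
 *	// reworked XPT: register against sassc->path, SIM lock held
 *	status = xpt_register_async(event, mprsas_async, sc, sassc->path);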
* * These __FreeBSD_version values are approximate, especially * for stable/10, which is two months later than the actual * change. */ #if (__FreeBSD_version < 1000703) || \ ((__FreeBSD_version >= 1100000) && (__FreeBSD_version < 1100002)) mpr_unlock(sc); status = xpt_register_async(event, mprsas_async, sc, NULL); mpr_lock(sc); #else status = xpt_register_async(event, mprsas_async, sc, sassc->path); #endif if (status != CAM_REQ_CMP) { mpr_dprint(sc, MPR_ERROR, "Error %#x registering async handler for " "AC_ADVINFO_CHANGED events\n", status); xpt_free_path(sassc->path); sassc->path = NULL; } } if (status != CAM_REQ_CMP) { /* * EEDP use is the exception, not the rule. * Warn the user, but do not fail to attach. */ mpr_printf(sc, "EEDP capabilities disabled.\n"); } mpr_unlock(sc); mprsas_register_events(sc); out: if (error) mpr_detach_sas(sc); mpr_dprint(sc, MPR_INIT, "%s exit, error= %d\n", __func__, error); return (error); } int mpr_detach_sas(struct mpr_softc *sc) { struct mprsas_softc *sassc; struct mprsas_lun *lun, *lun_tmp; struct mprsas_target *targ; int i; MPR_FUNCTRACE(sc); if (sc->sassc == NULL) return (0); sassc = sc->sassc; mpr_deregister_events(sc, sassc->mprsas_eh); /* * Drain and free the event handling taskqueue with the lock * unheld so that any parallel processing tasks drain properly * without deadlocking. */ if (sassc->ev_tq != NULL) taskqueue_free(sassc->ev_tq); /* Make sure CAM doesn't wedge if we had to bail out early. */ mpr_lock(sc); while (sassc->startup_refcount != 0) mprsas_startup_decrement(sassc); /* Deregister our async handler */ if (sassc->path != NULL) { xpt_register_async(0, mprsas_async, sc, sassc->path); xpt_free_path(sassc->path); sassc->path = NULL; } if (sassc->flags & MPRSAS_IN_STARTUP) xpt_release_simq(sassc->sim, 1); if (sassc->sim != NULL) { xpt_bus_deregister(cam_sim_path(sassc->sim)); cam_sim_free(sassc->sim, FALSE); } mpr_unlock(sc); if (sassc->devq != NULL) cam_simq_free(sassc->devq); for (i = 0; i < sassc->maxtargets; i++) { targ = &sassc->targets[i]; SLIST_FOREACH_SAFE(lun, &targ->luns, lun_link, lun_tmp) { free(lun, M_MPR); } } free(sassc->targets, M_MPR); free(sassc, M_MPR); sc->sassc = NULL; return (0); } void mprsas_discovery_end(struct mprsas_softc *sassc) { struct mpr_softc *sc = sassc->sc; MPR_FUNCTRACE(sc); if (sassc->flags & MPRSAS_DISCOVERY_TIMEOUT_PENDING) callout_stop(&sassc->discovery_callout); /* * After discovery has completed, check the mapping table for any * missing devices and update their missing counts. Only do this once * whenever the driver is initialized so that missing counts aren't * updated unnecessarily. Note that just because discovery has * completed doesn't mean that events have been processed yet. The * check_devices function is a callout timer that checks if ALL devices * are missing. If so, it will wait a little longer for events to * complete and keep resetting itself until some device in the mapping * table is not missing, meaning that event processing has started. */ if (sc->track_mapping_events) { mpr_dprint(sc, MPR_XINFO | MPR_MAPPING, "Discovery has " "completed. 
Check for missing devices in the mapping " "table.\n"); callout_reset(&sc->device_check_callout, MPR_MISSING_CHECK_DELAY * hz, mpr_mapping_check_devices, sc); } } static void mprsas_action(struct cam_sim *sim, union ccb *ccb) { struct mprsas_softc *sassc; sassc = cam_sim_softc(sim); MPR_FUNCTRACE(sassc->sc); mpr_dprint(sassc->sc, MPR_TRACE, "ccb func_code 0x%x\n", ccb->ccb_h.func_code); mtx_assert(&sassc->sc->mpr_mtx, MA_OWNED); switch (ccb->ccb_h.func_code) { case XPT_PATH_INQ: { struct ccb_pathinq *cpi = &ccb->cpi; struct mpr_softc *sc = sassc->sc; cpi->version_num = 1; cpi->hba_inquiry = PI_SDTR_ABLE|PI_TAG_ABLE|PI_WIDE_16; cpi->target_sprt = 0; #if (__FreeBSD_version >= 1000039) || \ ((__FreeBSD_version < 1000000) && (__FreeBSD_version >= 902502)) cpi->hba_misc = PIM_NOBUSRESET | PIM_UNMAPPED | PIM_NOSCAN; #else cpi->hba_misc = PIM_NOBUSRESET | PIM_UNMAPPED; #endif cpi->hba_eng_cnt = 0; cpi->max_target = sassc->maxtargets - 1; cpi->max_lun = 255; /* * initiator_id is set here to an ID outside the set of valid * target IDs (including volumes). */ cpi->initiator_id = sassc->maxtargets; strlcpy(cpi->sim_vid, "FreeBSD", SIM_IDLEN); strlcpy(cpi->hba_vid, "Avago Tech", HBA_IDLEN); strlcpy(cpi->dev_name, cam_sim_name(sim), DEV_IDLEN); cpi->unit_number = cam_sim_unit(sim); cpi->bus_id = cam_sim_bus(sim); /* * XXXSLM-I think this needs to change based on config page or * something instead of hardcoded to 150000. */ cpi->base_transfer_speed = 150000; cpi->transport = XPORT_SAS; cpi->transport_version = 0; cpi->protocol = PROTO_SCSI; cpi->protocol_version = SCSI_REV_SPC; cpi->maxio = sc->maxio; mprsas_set_ccbstatus(ccb, CAM_REQ_CMP); break; } case XPT_GET_TRAN_SETTINGS: { struct ccb_trans_settings *cts; struct ccb_trans_settings_sas *sas; struct ccb_trans_settings_scsi *scsi; struct mprsas_target *targ; cts = &ccb->cts; sas = &cts->xport_specific.sas; scsi = &cts->proto_specific.scsi; KASSERT(cts->ccb_h.target_id < sassc->maxtargets, ("Target %d out of bounds in XPT_GET_TRAN_SETTINGS\n", cts->ccb_h.target_id)); targ = &sassc->targets[cts->ccb_h.target_id]; if (targ->handle == 0x0) { mprsas_set_ccbstatus(ccb, CAM_DEV_NOT_THERE); break; } cts->protocol_version = SCSI_REV_SPC2; cts->transport = XPORT_SAS; cts->transport_version = 0; sas->valid = CTS_SAS_VALID_SPEED; switch (targ->linkrate) { case 0x08: sas->bitrate = 150000; break; case 0x09: sas->bitrate = 300000; break; case 0x0a: sas->bitrate = 600000; break; case 0x0b: sas->bitrate = 1200000; break; default: sas->valid = 0; } cts->protocol = PROTO_SCSI; scsi->valid = CTS_SCSI_VALID_TQ; scsi->flags = CTS_SCSI_FLAGS_TAG_ENB; mprsas_set_ccbstatus(ccb, CAM_REQ_CMP); break; } case XPT_CALC_GEOMETRY: cam_calc_geometry(&ccb->ccg, /*extended*/1); mprsas_set_ccbstatus(ccb, CAM_REQ_CMP); break; case XPT_RESET_DEV: mpr_dprint(sassc->sc, MPR_XINFO, "mprsas_action " "XPT_RESET_DEV\n"); mprsas_action_resetdev(sassc, ccb); return; case XPT_RESET_BUS: case XPT_ABORT: case XPT_TERM_IO: mpr_dprint(sassc->sc, MPR_XINFO, "mprsas_action faking success " "for abort or reset\n"); mprsas_set_ccbstatus(ccb, CAM_REQ_CMP); break; case XPT_SCSI_IO: mprsas_action_scsiio(sassc, ccb); return; #if __FreeBSD_version >= 900026 case XPT_SMP_IO: mprsas_action_smpio(sassc, ccb); return; #endif default: mprsas_set_ccbstatus(ccb, CAM_FUNC_NOTAVAIL); break; } xpt_done(ccb); } static void mprsas_announce_reset(struct mpr_softc *sc, uint32_t ac_code, target_id_t target_id, lun_id_t lun_id) { path_id_t path_id = cam_sim_path(sc->sassc->sim); struct cam_path *path; mpr_dprint(sc, 
MPR_XINFO, "%s code %x target %d lun %jx\n", __func__, ac_code, target_id, (uintmax_t)lun_id); if (xpt_create_path(&path, NULL, path_id, target_id, lun_id) != CAM_REQ_CMP) { mpr_dprint(sc, MPR_ERROR, "unable to create path for reset " "notification\n"); return; } xpt_async(ac_code, path, NULL); xpt_free_path(path); } static void mprsas_complete_all_commands(struct mpr_softc *sc) { struct mpr_command *cm; int i; int completed; MPR_FUNCTRACE(sc); mtx_assert(&sc->mpr_mtx, MA_OWNED); /* complete all commands with a NULL reply */ for (i = 1; i < sc->num_reqs; i++) { cm = &sc->commands[i]; if (cm->cm_state == MPR_CM_STATE_FREE) continue; cm->cm_state = MPR_CM_STATE_BUSY; cm->cm_reply = NULL; completed = 0; if (cm->cm_flags & MPR_CM_FLAGS_POLLED) cm->cm_flags |= MPR_CM_FLAGS_COMPLETE; if (cm->cm_complete != NULL) { mprsas_log_command(cm, MPR_RECOVERY, "completing cm %p state %x ccb %p for diag reset\n", cm, cm->cm_state, cm->cm_ccb); cm->cm_complete(sc, cm); completed = 1; } else if (cm->cm_flags & MPR_CM_FLAGS_WAKEUP) { mprsas_log_command(cm, MPR_RECOVERY, "waking up cm %p state %x ccb %p for diag reset\n", cm, cm->cm_state, cm->cm_ccb); wakeup(cm); completed = 1; } if ((completed == 0) && (cm->cm_state != MPR_CM_STATE_FREE)) { /* this should never happen, but if it does, log */ mprsas_log_command(cm, MPR_RECOVERY, "cm %p state %x flags 0x%x ccb %p during diag " "reset\n", cm, cm->cm_state, cm->cm_flags, cm->cm_ccb); } } sc->io_cmds_active = 0; } void mprsas_handle_reinit(struct mpr_softc *sc) { int i; /* Go back into startup mode and freeze the simq, so that CAM * doesn't send any commands until after we've rediscovered all * targets and found the proper device handles for them. * * After the reset, portenable will trigger discovery, and after all * discovery-related activities have finished, the simq will be * released. */ mpr_dprint(sc, MPR_INIT, "%s startup\n", __func__); sc->sassc->flags |= MPRSAS_IN_STARTUP; sc->sassc->flags |= MPRSAS_IN_DISCOVERY; mprsas_startup_increment(sc->sassc); /* notify CAM of a bus reset */ mprsas_announce_reset(sc, AC_BUS_RESET, CAM_TARGET_WILDCARD, CAM_LUN_WILDCARD); /* complete and cleanup after all outstanding commands */ mprsas_complete_all_commands(sc); mpr_dprint(sc, MPR_INIT, "%s startup %u after command completion\n", __func__, sc->sassc->startup_refcount); /* zero all the target handles, since they may change after the * reset, and we have to rediscover all the targets and use the new * handles. 
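 *
 * Reinit sequence so far, for reference: re-enter startup mode and
 * freeze the simq -> announce AC_BUS_RESET to CAM -> complete every
 * in-flight command with a NULL reply -> invalidate the handles in the
 * loop below (which also flags each target MPRSAS_TARGET_INDIAGRESET);
 * portenable then redrives discovery and finally releases the simq.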
*/ for (i = 0; i < sc->sassc->maxtargets; i++) { if (sc->sassc->targets[i].outstanding != 0) mpr_dprint(sc, MPR_INIT, "target %u outstanding %u\n", i, sc->sassc->targets[i].outstanding); sc->sassc->targets[i].handle = 0x0; sc->sassc->targets[i].exp_dev_handle = 0x0; sc->sassc->targets[i].outstanding = 0; sc->sassc->targets[i].flags = MPRSAS_TARGET_INDIAGRESET; } } static void mprsas_tm_timeout(void *data) { struct mpr_command *tm = data; struct mpr_softc *sc = tm->cm_sc; mtx_assert(&sc->mpr_mtx, MA_OWNED); mprsas_log_command(tm, MPR_INFO|MPR_RECOVERY, "task mgmt %p timed " "out\n", tm); KASSERT(tm->cm_state == MPR_CM_STATE_INQUEUE, ("command not inqueue\n")); tm->cm_state = MPR_CM_STATE_BUSY; mpr_reinit(sc); } static void mprsas_logical_unit_reset_complete(struct mpr_softc *sc, struct mpr_command *tm) { MPI2_SCSI_TASK_MANAGE_REPLY *reply; MPI2_SCSI_TASK_MANAGE_REQUEST *req; unsigned int cm_count = 0; struct mpr_command *cm; struct mprsas_target *targ; callout_stop(&tm->cm_callout); req = (MPI2_SCSI_TASK_MANAGE_REQUEST *)tm->cm_req; reply = (MPI2_SCSI_TASK_MANAGE_REPLY *)tm->cm_reply; targ = tm->cm_targ; /* * Currently there should be no way we can hit this case. It only * happens when we have a failure to allocate chain frames, and * task management commands don't have S/G lists. */ if ((tm->cm_flags & MPR_CM_FLAGS_ERROR_MASK) != 0) { mpr_dprint(sc, MPR_RECOVERY|MPR_ERROR, "%s: cm_flags = %#x for LUN reset! " "This should not happen!\n", __func__, tm->cm_flags); mprsas_free_tm(sc, tm); return; } if (reply == NULL) { mpr_dprint(sc, MPR_RECOVERY, "NULL reset reply for tm %p\n", tm); if ((sc->mpr_flags & MPR_FLAGS_DIAGRESET) != 0) { /* this completion was due to a reset, just cleanup */ mpr_dprint(sc, MPR_RECOVERY, "Hardware undergoing " "reset, ignoring NULL LUN reset reply\n"); targ->tm = NULL; mprsas_free_tm(sc, tm); } else { /* we should have gotten a reply. */ mpr_dprint(sc, MPR_INFO|MPR_RECOVERY, "NULL reply on " "LUN reset attempt, resetting controller\n"); mpr_reinit(sc); } return; } mpr_dprint(sc, MPR_RECOVERY, "logical unit reset status 0x%x code 0x%x count %u\n", le16toh(reply->IOCStatus), le32toh(reply->ResponseCode), le32toh(reply->TerminationCount)); /* * See if there are any outstanding commands for this LUN. * This could be made more efficient by using a per-LU data * structure of some sort. */ TAILQ_FOREACH(cm, &targ->commands, cm_link) { if (cm->cm_lun == tm->cm_lun) cm_count++; } if (cm_count == 0) { mpr_dprint(sc, MPR_RECOVERY|MPR_INFO, "Finished recovery after LUN reset for target %u\n", targ->tid); mprsas_announce_reset(sc, AC_SENT_BDR, targ->tid, tm->cm_lun); /* * We've finished recovery for this logical unit. check and * see if some other logical unit has a timedout command * that needs to be processed. */ cm = TAILQ_FIRST(&targ->timedout_commands); if (cm) { mpr_dprint(sc, MPR_INFO|MPR_RECOVERY, "More commands to abort for target %u\n", targ->tid); mprsas_send_abort(sc, tm, cm); } else { targ->tm = NULL; mprsas_free_tm(sc, tm); } } else { /* if we still have commands for this LUN, the reset * effectively failed, regardless of the status reported. * Escalate to a target reset. 
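 *
 * Recovery ladder used by these completion handlers, for reference;
 * each rung is taken only when the previous one leaves timed-out or
 * outstanding commands:
 *
 *	abort task -> logical unit reset -> target reset -> mpr_reinit()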
*/ mpr_dprint(sc, MPR_INFO|MPR_RECOVERY, "logical unit reset complete for target %u, but still " "have %u command(s), sending target reset\n", targ->tid, cm_count); mprsas_send_reset(sc, tm, MPI2_SCSITASKMGMT_TASKTYPE_TARGET_RESET); } } static void mprsas_target_reset_complete(struct mpr_softc *sc, struct mpr_command *tm) { MPI2_SCSI_TASK_MANAGE_REPLY *reply; MPI2_SCSI_TASK_MANAGE_REQUEST *req; struct mprsas_target *targ; callout_stop(&tm->cm_callout); req = (MPI2_SCSI_TASK_MANAGE_REQUEST *)tm->cm_req; reply = (MPI2_SCSI_TASK_MANAGE_REPLY *)tm->cm_reply; targ = tm->cm_targ; /* * Currently there should be no way we can hit this case. It only * happens when we have a failure to allocate chain frames, and * task management commands don't have S/G lists. */ if ((tm->cm_flags & MPR_CM_FLAGS_ERROR_MASK) != 0) { mpr_dprint(sc, MPR_ERROR, "%s: cm_flags = %#x for target " "reset! This should not happen!\n", __func__, tm->cm_flags); mprsas_free_tm(sc, tm); return; } if (reply == NULL) { mpr_dprint(sc, MPR_RECOVERY, "NULL target reset reply for tm %p TaskMID %u\n", tm, le16toh(req->TaskMID)); if ((sc->mpr_flags & MPR_FLAGS_DIAGRESET) != 0) { /* this completion was due to a reset, just cleanup */ mpr_dprint(sc, MPR_RECOVERY, "Hardware undergoing " "reset, ignoring NULL target reset reply\n"); targ->tm = NULL; mprsas_free_tm(sc, tm); } else { /* we should have gotten a reply. */ mpr_dprint(sc, MPR_INFO|MPR_RECOVERY, "NULL reply on " "target reset attempt, resetting controller\n"); mpr_reinit(sc); } return; } mpr_dprint(sc, MPR_RECOVERY, "target reset status 0x%x code 0x%x count %u\n", le16toh(reply->IOCStatus), le32toh(reply->ResponseCode), le32toh(reply->TerminationCount)); if (targ->outstanding == 0) { /* * We've finished recovery for this target and all * of its logical units. */ mpr_dprint(sc, MPR_RECOVERY|MPR_INFO, "Finished reset recovery for target %u\n", targ->tid); mprsas_announce_reset(sc, AC_SENT_BDR, tm->cm_targ->tid, CAM_LUN_WILDCARD); targ->tm = NULL; mprsas_free_tm(sc, tm); } else { /* * After a target reset, if this target still has * outstanding commands, the reset effectively failed, * regardless of the status reported. escalate. 
*/ mpr_dprint(sc, MPR_INFO|MPR_RECOVERY, "Target reset complete for target %u, but still have %u " "command(s), resetting controller\n", targ->tid, targ->outstanding); mpr_reinit(sc); } } #define MPR_RESET_TIMEOUT 30 int mprsas_send_reset(struct mpr_softc *sc, struct mpr_command *tm, uint8_t type) { MPI2_SCSI_TASK_MANAGE_REQUEST *req; struct mprsas_target *target; int err; target = tm->cm_targ; if (target->handle == 0) { mpr_dprint(sc, MPR_ERROR, "%s null devhandle for target_id " "%d\n", __func__, target->tid); return -1; } req = (MPI2_SCSI_TASK_MANAGE_REQUEST *)tm->cm_req; req->DevHandle = htole16(target->handle); req->Function = MPI2_FUNCTION_SCSI_TASK_MGMT; req->TaskType = type; if (type == MPI2_SCSITASKMGMT_TASKTYPE_LOGICAL_UNIT_RESET) { /* XXX Need to handle invalid LUNs */ MPR_SET_LUN(req->LUN, tm->cm_lun); tm->cm_targ->logical_unit_resets++; mpr_dprint(sc, MPR_RECOVERY|MPR_INFO, "Sending logical unit reset to target %u lun %d\n", target->tid, tm->cm_lun); tm->cm_complete = mprsas_logical_unit_reset_complete; mprsas_prepare_for_tm(sc, tm, target, tm->cm_lun); } else if (type == MPI2_SCSITASKMGMT_TASKTYPE_TARGET_RESET) { /* * Target reset method = * SAS Hard Link Reset / SATA Link Reset */ req->MsgFlags = MPI2_SCSITASKMGMT_MSGFLAGS_LINK_RESET; tm->cm_targ->target_resets++; mpr_dprint(sc, MPR_RECOVERY|MPR_INFO, "Sending target reset to target %u\n", target->tid); tm->cm_complete = mprsas_target_reset_complete; mprsas_prepare_for_tm(sc, tm, target, CAM_LUN_WILDCARD); } else { mpr_dprint(sc, MPR_ERROR, "unexpected reset type 0x%x\n", type); return -1; } if (target->encl_level_valid) { mpr_dprint(sc, MPR_RECOVERY|MPR_INFO, "At enclosure level %d, slot %d, connector name (%4s)\n", target->encl_level, target->encl_slot, target->connector_name); } tm->cm_data = NULL; tm->cm_desc.HighPriority.RequestFlags = MPI2_REQ_DESCRIPT_FLAGS_HIGH_PRIORITY; tm->cm_complete_data = (void *)tm; callout_reset(&tm->cm_callout, MPR_RESET_TIMEOUT * hz, mprsas_tm_timeout, tm); err = mpr_map_command(sc, tm); if (err) mpr_dprint(sc, MPR_ERROR|MPR_RECOVERY, "error %d sending reset type %u\n", err, type); return err; } static void mprsas_abort_complete(struct mpr_softc *sc, struct mpr_command *tm) { struct mpr_command *cm; MPI2_SCSI_TASK_MANAGE_REPLY *reply; MPI2_SCSI_TASK_MANAGE_REQUEST *req; struct mprsas_target *targ; callout_stop(&tm->cm_callout); req = (MPI2_SCSI_TASK_MANAGE_REQUEST *)tm->cm_req; reply = (MPI2_SCSI_TASK_MANAGE_REPLY *)tm->cm_reply; targ = tm->cm_targ; /* * Currently there should be no way we can hit this case. It only * happens when we have a failure to allocate chain frames, and * task management commands don't have S/G lists. */ if ((tm->cm_flags & MPR_CM_FLAGS_ERROR_MASK) != 0) { mpr_dprint(sc, MPR_RECOVERY|MPR_ERROR, "cm_flags = %#x for abort %p TaskMID %u!\n", tm->cm_flags, tm, le16toh(req->TaskMID)); mprsas_free_tm(sc, tm); return; } if (reply == NULL) { mpr_dprint(sc, MPR_RECOVERY, "NULL abort reply for tm %p TaskMID %u\n", tm, le16toh(req->TaskMID)); if ((sc->mpr_flags & MPR_FLAGS_DIAGRESET) != 0) { /* this completion was due to a reset, just cleanup */ mpr_dprint(sc, MPR_RECOVERY, "Hardware undergoing " "reset, ignoring NULL abort reply\n"); targ->tm = NULL; mprsas_free_tm(sc, tm); } else { /* we should have gotten a reply. 
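 */

/*
 * The task management request above addresses its LUN through an
 * 8-byte SAM LUN field (filled in by MPR_SET_LUN) rather than a plain
 * integer.  A small sketch of the two common SAM-5 encodings, written
 * as an assumption about the usual cases: peripheral device
 * addressing for LUNs below 256 and flat space addressing up to
 * 16383.  The helper below is hypothetical, not the driver's macro.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static int
set_sam_lun(uint8_t lun8[8], uint32_t lun)
{
	memset(lun8, 0, 8);
	if (lun < 256) {
		lun8[1] = (uint8_t)lun;			/* peripheral */
	} else if (lun < 16384) {
		lun8[0] = 0x40 | (uint8_t)(lun >> 8);	/* flat space */
		lun8[1] = (uint8_t)lun;
	} else {
		return (-1);	/* larger LUNs need other methods */
	}
	return (0);
}

int
main(void)
{
	uint8_t buf[8];

	set_sam_lun(buf, 5);
	printf("LUN 5   -> %02x %02x\n", buf[0], buf[1]);
	set_sam_lun(buf, 300);
	printf("LUN 300 -> %02x %02x\n", buf[0], buf[1]);
	return (0);
}

/*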
*/ mpr_dprint(sc, MPR_INFO|MPR_RECOVERY, "NULL reply on " "abort attempt, resetting controller\n"); mpr_reinit(sc); } return; } mpr_dprint(sc, MPR_RECOVERY, "abort TaskMID %u status 0x%x code 0x%x count %u\n", le16toh(req->TaskMID), le16toh(reply->IOCStatus), le32toh(reply->ResponseCode), le32toh(reply->TerminationCount)); cm = TAILQ_FIRST(&tm->cm_targ->timedout_commands); if (cm == NULL) { /* * if there are no more timedout commands, we're done with * error recovery for this target. */ mpr_dprint(sc, MPR_INFO|MPR_RECOVERY, "Finished abort recovery for target %u\n", targ->tid); targ->tm = NULL; mprsas_free_tm(sc, tm); } else if (le16toh(req->TaskMID) != cm->cm_desc.Default.SMID) { /* abort success, but we have more timedout commands to abort */ mpr_dprint(sc, MPR_INFO|MPR_RECOVERY, "Continuing abort recovery for target %u\n", targ->tid); mprsas_send_abort(sc, tm, cm); } else { /* * we didn't get a command completion, so the abort * failed as far as we're concerned. escalate. */ mpr_dprint(sc, MPR_INFO|MPR_RECOVERY, "Abort failed for target %u, sending logical unit reset\n", targ->tid); mprsas_send_reset(sc, tm, MPI2_SCSITASKMGMT_TASKTYPE_LOGICAL_UNIT_RESET); } } #define MPR_ABORT_TIMEOUT 5 static int mprsas_send_abort(struct mpr_softc *sc, struct mpr_command *tm, struct mpr_command *cm) { MPI2_SCSI_TASK_MANAGE_REQUEST *req; struct mprsas_target *targ; int err; targ = cm->cm_targ; if (targ->handle == 0) { mpr_dprint(sc, MPR_ERROR|MPR_RECOVERY, "%s null devhandle for target_id %d\n", __func__, cm->cm_ccb->ccb_h.target_id); return -1; } mprsas_log_command(cm, MPR_RECOVERY|MPR_INFO, "Aborting command %p\n", cm); req = (MPI2_SCSI_TASK_MANAGE_REQUEST *)tm->cm_req; req->DevHandle = htole16(targ->handle); req->Function = MPI2_FUNCTION_SCSI_TASK_MGMT; req->TaskType = MPI2_SCSITASKMGMT_TASKTYPE_ABORT_TASK; /* XXX Need to handle invalid LUNs */ MPR_SET_LUN(req->LUN, cm->cm_ccb->ccb_h.target_lun); req->TaskMID = htole16(cm->cm_desc.Default.SMID); tm->cm_data = NULL; tm->cm_desc.HighPriority.RequestFlags = MPI2_REQ_DESCRIPT_FLAGS_HIGH_PRIORITY; tm->cm_complete = mprsas_abort_complete; tm->cm_complete_data = (void *)tm; tm->cm_targ = cm->cm_targ; tm->cm_lun = cm->cm_lun; callout_reset(&tm->cm_callout, MPR_ABORT_TIMEOUT * hz, mprsas_tm_timeout, tm); targ->aborts++; mprsas_prepare_for_tm(sc, tm, targ, tm->cm_lun); err = mpr_map_command(sc, tm); if (err) mpr_dprint(sc, MPR_ERROR|MPR_RECOVERY, "error %d sending abort for cm %p SMID %u\n", err, cm, req->TaskMID); return err; } static void mprsas_scsiio_timeout(void *data) { sbintime_t elapsed, now; union ccb *ccb; struct mpr_softc *sc; struct mpr_command *cm; struct mprsas_target *targ; cm = (struct mpr_command *)data; sc = cm->cm_sc; ccb = cm->cm_ccb; now = sbinuptime(); MPR_FUNCTRACE(sc); mtx_assert(&sc->mpr_mtx, MA_OWNED); mpr_dprint(sc, MPR_XINFO|MPR_RECOVERY, "Timeout checking cm %p\n", cm); /* * Run the interrupt handler to make sure it's not pending. This * isn't perfect because the command could have already completed * and been re-used, though this is unlikely. 
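 */

/*
 * A self-contained sketch of the pattern used just below: before
 * treating a command as timed out, poll the completion path once and
 * only start recovery if the command is still in the queue.  The
 * types and names here are stand-ins, not driver API.
 */
#include <stdbool.h>
#include <stdio.h>

enum cmd_state { CMD_INQUEUE, CMD_COMPLETED, CMD_TIMEDOUT };

struct fake_cmd {
	enum cmd_state state;
};

/* Stand-in for mpr_intr_locked(): drain any pending completions. */
static void
poll_completions(struct fake_cmd *cm, bool hw_already_done)
{
	if (hw_already_done)
		cm->state = CMD_COMPLETED;
}

static void
timeout_handler(struct fake_cmd *cm, bool hw_already_done)
{
	poll_completions(cm, hw_already_done);
	if (cm->state != CMD_INQUEUE) {
		printf("command almost timed out, nothing to do\n");
		return;
	}
	cm->state = CMD_TIMEDOUT;
	printf("command timed out, starting recovery\n");
}

int
main(void)
{
	struct fake_cmd a = { CMD_INQUEUE }, b = { CMD_INQUEUE };

	timeout_handler(&a, true);	/* raced with a completion */
	timeout_handler(&b, false);	/* a genuine timeout */
	return (0);
}

/*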
*/ mpr_intr_locked(sc); if (cm->cm_state != MPR_CM_STATE_INQUEUE) { mprsas_log_command(cm, MPR_XINFO, "SCSI command %p almost timed out\n", cm); return; } if (cm->cm_ccb == NULL) { mpr_dprint(sc, MPR_ERROR, "command timeout with NULL ccb\n"); return; } targ = cm->cm_targ; targ->timeouts++; elapsed = now - ccb->ccb_h.qos.sim_data; mprsas_log_command(cm, MPR_INFO|MPR_RECOVERY, "Command timeout on target %u(0x%04x), %d set, %d.%d elapsed\n", targ->tid, targ->handle, ccb->ccb_h.timeout, sbintime_getsec(elapsed), elapsed & 0xffffffff); if (targ->encl_level_valid) { mpr_dprint(sc, MPR_INFO|MPR_RECOVERY, "At enclosure level %d, slot %d, connector name (%4s)\n", targ->encl_level, targ->encl_slot, targ->connector_name); } /* XXX first, check the firmware state, to see if it's still * operational. if not, do a diag reset. */ mprsas_set_ccbstatus(cm->cm_ccb, CAM_CMD_TIMEOUT); cm->cm_state = MPR_CM_STATE_TIMEDOUT; TAILQ_INSERT_TAIL(&targ->timedout_commands, cm, cm_recovery); if (targ->tm != NULL) { /* target already in recovery, just queue up another * timedout command to be processed later. */ mpr_dprint(sc, MPR_RECOVERY, "queued timedout cm %p for " "processing by tm %p\n", cm, targ->tm); } else if ((targ->tm = mprsas_alloc_tm(sc)) != NULL) { /* start recovery by aborting the first timedout command */ mpr_dprint(sc, MPR_RECOVERY|MPR_INFO, "Sending abort to target %u for SMID %d\n", targ->tid, cm->cm_desc.Default.SMID); mpr_dprint(sc, MPR_RECOVERY, "timedout cm %p allocated tm %p\n", cm, targ->tm); mprsas_send_abort(sc, targ->tm, cm); } else { /* XXX queue this target up for recovery once a TM becomes * available. The firmware only has a limited number of * HighPriority credits for the high priority requests used * for task management, and we ran out. * * Isilon: don't worry about this for now, since we have * more credits than disks in an enclosure, and limit * ourselves to one TM per target for recovery. */ mpr_dprint(sc, MPR_ERROR|MPR_RECOVERY, "timedout cm %p failed to allocate a tm\n", cm); } } /** * mprsas_build_nvme_unmap - Build Native NVMe DSM command equivalent * to SCSI Unmap. * Return 0 - for success, * 1 - to immediately return back the command with success status to CAM * negative value - to fallback to firmware path i.e. issue scsi unmap * to FW without any translation. */ static int mprsas_build_nvme_unmap(struct mpr_softc *sc, struct mpr_command *cm, union ccb *ccb, struct mprsas_target *targ) { Mpi26NVMeEncapsulatedRequest_t *req = NULL; struct ccb_scsiio *csio; struct unmap_parm_list *plist; struct nvme_dsm_range *nvme_dsm_ranges = NULL; struct nvme_command *c; int i, res; uint16_t ndesc, list_len, data_length; struct mpr_prp_page *prp_page_info; uint64_t nvme_dsm_ranges_dma_handle; csio = &ccb->csio; #if __FreeBSD_version >= 1100103 list_len = (scsiio_cdb_ptr(csio)[7] << 8 | scsiio_cdb_ptr(csio)[8]); #else if (csio->ccb_h.flags & CAM_CDB_POINTER) { list_len = (ccb->csio.cdb_io.cdb_ptr[7] << 8 | ccb->csio.cdb_io.cdb_ptr[8]); } else { list_len = (ccb->csio.cdb_io.cdb_bytes[7] << 8 | ccb->csio.cdb_io.cdb_bytes[8]); } #endif if (!list_len) { mpr_dprint(sc, MPR_ERROR, "Parameter list length is Zero\n"); return -EINVAL; } plist = malloc(csio->dxfer_len, M_MPR, M_ZERO|M_NOWAIT); if (!plist) { mpr_dprint(sc, MPR_ERROR, "Unable to allocate memory to " "save UNMAP data\n"); return -ENOMEM; } /* Copy SCSI unmap data to a local buffer */ bcopy(csio->data_ptr, plist, csio->dxfer_len); /* return back the unmap command to CAM with success status, * if number of descripts is zero. 
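 */

/*
 * A user-space sketch of the UNMAP parsing done in this function,
 * assuming the SBC-3 layout: CDB bytes 7..8 carry the big-endian
 * parameter list length, and each block descriptor in the payload is
 * 16 bytes, so the descriptor count is the block descriptor data
 * length divided by 16 (the ">> 4" just below).  The buffers here
 * are hypothetical example data.
 */
#include <stdint.h>
#include <stdio.h>

static uint16_t
get_be16(const uint8_t *p)
{
	return ((uint16_t)(p[0] << 8 | p[1]));
}

int
main(void)
{
	/* UNMAP CDB (opcode 0x42) with a 24-byte parameter list. */
	uint8_t cdb[10] = { 0x42, 0, 0, 0, 0, 0, 0, 0x00, 0x18, 0 };
	/* Parameter list header: block descriptor data length = 16. */
	uint8_t hdr[8] = { 0x00, 0x16, 0x00, 0x10, 0, 0, 0, 0 };
	unsigned list_len, ndesc;

	list_len = get_be16(&cdb[7]);
	ndesc = get_be16(&hdr[2]) >> 4;	/* 16 bytes per descriptor */
	printf("list_len %u, %u descriptor(s)\n", list_len, ndesc);
	return (0);
}

/*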
*/ ndesc = be16toh(plist->unmap_blk_desc_data_len) >> 4; if (!ndesc) { mpr_dprint(sc, MPR_XINFO, "Number of descriptors in " "UNMAP cmd is Zero\n"); res = 1; goto out; } data_length = ndesc * sizeof(struct nvme_dsm_range); if (data_length > targ->MDTS) { mpr_dprint(sc, MPR_ERROR, "data length: %d is greater than " "Device's MDTS: %d\n", data_length, targ->MDTS); res = -EINVAL; goto out; } prp_page_info = mpr_alloc_prp_page(sc); KASSERT(prp_page_info != NULL, ("%s: There is no PRP Page for " "UNMAP command.\n", __func__)); /* * Insert the allocated PRP page into the command's PRP page list. This * will be freed when the command is freed. */ TAILQ_INSERT_TAIL(&cm->cm_prp_page_list, prp_page_info, prp_page_link); nvme_dsm_ranges = (struct nvme_dsm_range *)prp_page_info->prp_page; nvme_dsm_ranges_dma_handle = prp_page_info->prp_page_busaddr; bzero(nvme_dsm_ranges, data_length); /* Convert SCSI unmap's descriptor data to NVMe DSM specific Range data * for each descriptors contained in SCSI UNMAP data. */ for (i = 0; i < ndesc; i++) { nvme_dsm_ranges[i].length = htole32(be32toh(plist->desc[i].nlb)); nvme_dsm_ranges[i].starting_lba = htole64(be64toh(plist->desc[i].slba)); nvme_dsm_ranges[i].attributes = 0; } /* Build MPI2.6's NVMe Encapsulated Request Message */ req = (Mpi26NVMeEncapsulatedRequest_t *)cm->cm_req; bzero(req, sizeof(*req)); req->DevHandle = htole16(targ->handle); req->Function = MPI2_FUNCTION_NVME_ENCAPSULATED; req->Flags = MPI26_NVME_FLAGS_WRITE; req->ErrorResponseBaseAddress.High = htole32((uint32_t)((uint64_t)cm->cm_sense_busaddr >> 32)); req->ErrorResponseBaseAddress.Low = htole32(cm->cm_sense_busaddr); req->ErrorResponseAllocationLength = htole16(sizeof(struct nvme_completion)); req->EncapsulatedCommandLength = htole16(sizeof(struct nvme_command)); req->DataLength = htole32(data_length); /* Build NVMe DSM command */ c = (struct nvme_command *) req->NVMe_Command; - c->opc_fuse = NVME_CMD_SET_OPC(NVME_OPC_DATASET_MANAGEMENT); + c->opc = NVME_OPC_DATASET_MANAGEMENT; c->nsid = htole32(csio->ccb_h.target_lun + 1); c->cdw10 = htole32(ndesc - 1); c->cdw11 = htole32(NVME_DSM_ATTR_DEALLOCATE); cm->cm_length = data_length; cm->cm_data = NULL; cm->cm_complete = mprsas_scsiio_complete; cm->cm_complete_data = ccb; cm->cm_targ = targ; cm->cm_lun = csio->ccb_h.target_lun; cm->cm_ccb = ccb; cm->cm_desc.Default.RequestFlags = MPI26_REQ_DESCRIPT_FLAGS_PCIE_ENCAPSULATED; csio->ccb_h.qos.sim_data = sbinuptime(); #if __FreeBSD_version >= 1000029 callout_reset_sbt(&cm->cm_callout, SBT_1MS * ccb->ccb_h.timeout, 0, mprsas_scsiio_timeout, cm, 0); #else //__FreeBSD_version < 1000029 callout_reset(&cm->cm_callout, (ccb->ccb_h.timeout * hz) / 1000, mprsas_scsiio_timeout, cm); #endif //__FreeBSD_version >= 1000029 targ->issued++; targ->outstanding++; TAILQ_INSERT_TAIL(&targ->commands, cm, cm_link); ccb->ccb_h.status |= CAM_SIM_QUEUED; mprsas_log_command(cm, MPR_XINFO, "%s cm %p ccb %p outstanding %u\n", __func__, cm, ccb, targ->outstanding); mpr_build_nvme_prp(sc, cm, req, (void *)(uintptr_t)nvme_dsm_ranges_dma_handle, 0, data_length); mpr_map_command(sc, cm); out: free(plist, M_MPR); return 0; } static void mprsas_action_scsiio(struct mprsas_softc *sassc, union ccb *ccb) { MPI2_SCSI_IO_REQUEST *req; struct ccb_scsiio *csio; struct mpr_softc *sc; struct mprsas_target *targ; struct mprsas_lun *lun; struct mpr_command *cm; uint8_t i, lba_byte, *ref_tag_addr, scsi_opcode; uint16_t eedp_flags; uint32_t mpi_control; int rc; sc = sassc->sc; MPR_FUNCTRACE(sc); mtx_assert(&sc->mpr_mtx, MA_OWNED); csio = 
&ccb->csio; KASSERT(csio->ccb_h.target_id < sassc->maxtargets, ("Target %d out of bounds in XPT_SCSI_IO\n", csio->ccb_h.target_id)); targ = &sassc->targets[csio->ccb_h.target_id]; mpr_dprint(sc, MPR_TRACE, "ccb %p target flag %x\n", ccb, targ->flags); if (targ->handle == 0x0) { mpr_dprint(sc, MPR_ERROR, "%s NULL handle for target %u\n", __func__, csio->ccb_h.target_id); mprsas_set_ccbstatus(ccb, CAM_DEV_NOT_THERE); xpt_done(ccb); return; } if (targ->flags & MPR_TARGET_FLAGS_RAID_COMPONENT) { mpr_dprint(sc, MPR_ERROR, "%s Raid component no SCSI IO " "supported %u\n", __func__, csio->ccb_h.target_id); mprsas_set_ccbstatus(ccb, CAM_DEV_NOT_THERE); xpt_done(ccb); return; } /* * Sometimes, it is possible to get a command that is not "In * Progress" and was actually aborted by the upper layer. Check for * this here and complete the command without error. */ if (mprsas_get_ccbstatus(ccb) != CAM_REQ_INPROG) { mpr_dprint(sc, MPR_TRACE, "%s Command is not in progress for " "target %u\n", __func__, csio->ccb_h.target_id); xpt_done(ccb); return; } /* * If devinfo is 0 this will be a volume. In that case don't tell CAM * that the volume has timed out. We want volumes to be enumerated * until they are deleted/removed, not just failed. */ if (targ->flags & MPRSAS_TARGET_INREMOVAL) { if (targ->devinfo == 0) mprsas_set_ccbstatus(ccb, CAM_REQ_CMP); else mprsas_set_ccbstatus(ccb, CAM_SEL_TIMEOUT); xpt_done(ccb); return; } if ((sc->mpr_flags & MPR_FLAGS_SHUTDOWN) != 0) { mpr_dprint(sc, MPR_INFO, "%s shutting down\n", __func__); mprsas_set_ccbstatus(ccb, CAM_DEV_NOT_THERE); xpt_done(ccb); return; } /* * If target has a reset in progress, freeze the devq and return. The * devq will be released when the TM reset is finished. */ if (targ->flags & MPRSAS_TARGET_INRESET) { ccb->ccb_h.status = CAM_BUSY | CAM_DEV_QFRZN; mpr_dprint(sc, MPR_INFO, "%s: Freezing devq for target ID %d\n", __func__, targ->tid); xpt_freeze_devq(ccb->ccb_h.path, 1); xpt_done(ccb); return; } cm = mpr_alloc_command(sc); if (cm == NULL || (sc->mpr_flags & MPR_FLAGS_DIAGRESET)) { if (cm != NULL) { mpr_free_command(sc, cm); } if ((sassc->flags & MPRSAS_QUEUE_FROZEN) == 0) { xpt_freeze_simq(sassc->sim, 1); sassc->flags |= MPRSAS_QUEUE_FROZEN; } ccb->ccb_h.status &= ~CAM_SIM_QUEUED; ccb->ccb_h.status |= CAM_REQUEUE_REQ; xpt_done(ccb); return; } /* For NVME device's issue UNMAP command directly to NVME drives by * constructing equivalent native NVMe DataSetManagement command. 
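 */

/*
 * A portable user-space sketch of the per-descriptor conversion done
 * in mprsas_build_nvme_unmap() above: each big-endian SCSI UNMAP
 * block descriptor (8-byte starting LBA, 4-byte block count) becomes
 * one NVMe DSM range, which the device expects in little-endian.
 * The structure layout and sample data are illustrative assumptions,
 * not the driver's definitions.
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t
get_be64(const uint8_t *p)
{
	uint64_t v = 0;
	int i;

	for (i = 0; i < 8; i++)
		v = v << 8 | p[i];
	return (v);
}

static uint32_t
get_be32(const uint8_t *p)
{
	return ((uint32_t)p[0] << 24 | (uint32_t)p[1] << 16 |
	    (uint32_t)p[2] << 8 | p[3]);
}

struct dsm_range {		/* kept host-endian for this demo */
	uint32_t attributes;
	uint32_t length;
	uint64_t starting_lba;
};

int
main(void)
{
	/* One SCSI block descriptor: LBA 0x1000, 256 blocks. */
	uint8_t desc[16] = { 0, 0, 0, 0, 0, 0, 0x10, 0x00,
	    0, 0, 0x01, 0x00, 0, 0, 0, 0 };
	struct dsm_range r;

	r.starting_lba = get_be64(&desc[0]);
	r.length = get_be32(&desc[8]);
	r.attributes = 0;	/* deallocate is requested via cdw11 */
	printf("lba %#jx, %u blocks\n", (uintmax_t)r.starting_lba,
	    r.length);
	return (0);
}

/*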
*/ #if __FreeBSD_version >= 1100103 scsi_opcode = scsiio_cdb_ptr(csio)[0]; #else if (csio->ccb_h.flags & CAM_CDB_POINTER) scsi_opcode = csio->cdb_io.cdb_ptr[0]; else scsi_opcode = csio->cdb_io.cdb_bytes[0]; #endif if (scsi_opcode == UNMAP && targ->is_nvme && (csio->ccb_h.flags & CAM_DATA_MASK) == CAM_DATA_VADDR) { rc = mprsas_build_nvme_unmap(sc, cm, ccb, targ); if (rc == 1) { /* return command to CAM with success status */ mpr_free_command(sc, cm); mprsas_set_ccbstatus(ccb, CAM_REQ_CMP); xpt_done(ccb); return; } else if (!rc) /* Issued NVMe Encapsulated Request Message */ return; } req = (MPI2_SCSI_IO_REQUEST *)cm->cm_req; bzero(req, sizeof(*req)); req->DevHandle = htole16(targ->handle); req->Function = MPI2_FUNCTION_SCSI_IO_REQUEST; req->MsgFlags = 0; req->SenseBufferLowAddress = htole32(cm->cm_sense_busaddr); req->SenseBufferLength = MPR_SENSE_LEN; req->SGLFlags = 0; req->ChainOffset = 0; req->SGLOffset0 = 24; /* 32bit word offset to the SGL */ req->SGLOffset1= 0; req->SGLOffset2= 0; req->SGLOffset3= 0; req->SkipCount = 0; req->DataLength = htole32(csio->dxfer_len); req->BidirectionalDataLength = 0; req->IoFlags = htole16(csio->cdb_len); req->EEDPFlags = 0; /* Note: BiDirectional transfers are not supported */ switch (csio->ccb_h.flags & CAM_DIR_MASK) { case CAM_DIR_IN: mpi_control = MPI2_SCSIIO_CONTROL_READ; cm->cm_flags |= MPR_CM_FLAGS_DATAIN; break; case CAM_DIR_OUT: mpi_control = MPI2_SCSIIO_CONTROL_WRITE; cm->cm_flags |= MPR_CM_FLAGS_DATAOUT; break; case CAM_DIR_NONE: default: mpi_control = MPI2_SCSIIO_CONTROL_NODATATRANSFER; break; } if (csio->cdb_len == 32) mpi_control |= 4 << MPI2_SCSIIO_CONTROL_ADDCDBLEN_SHIFT; /* * It looks like the hardware doesn't require an explicit tag * number for each transaction. SAM Task Management not supported * at the moment. */ switch (csio->tag_action) { case MSG_HEAD_OF_Q_TAG: mpi_control |= MPI2_SCSIIO_CONTROL_HEADOFQ; break; case MSG_ORDERED_Q_TAG: mpi_control |= MPI2_SCSIIO_CONTROL_ORDEREDQ; break; case MSG_ACA_TASK: mpi_control |= MPI2_SCSIIO_CONTROL_ACAQ; break; case CAM_TAG_ACTION_NONE: case MSG_SIMPLE_Q_TAG: default: mpi_control |= MPI2_SCSIIO_CONTROL_SIMPLEQ; break; } mpi_control |= sc->mapping_table[csio->ccb_h.target_id].TLR_bits; req->Control = htole32(mpi_control); if (MPR_SET_LUN(req->LUN, csio->ccb_h.target_lun) != 0) { mpr_free_command(sc, cm); mprsas_set_ccbstatus(ccb, CAM_LUN_INVALID); xpt_done(ccb); return; } if (csio->ccb_h.flags & CAM_CDB_POINTER) bcopy(csio->cdb_io.cdb_ptr, &req->CDB.CDB32[0], csio->cdb_len); else { KASSERT(csio->cdb_len <= IOCDBLEN, ("cdb_len %d is greater than IOCDBLEN but CAM_CDB_POINTER " "is not set", csio->cdb_len)); bcopy(csio->cdb_io.cdb_bytes, &req->CDB.CDB32[0],csio->cdb_len); } req->IoFlags = htole16(csio->cdb_len); /* * Check if EEDP is supported and enabled. If it is then check if the * SCSI opcode could be using EEDP. If so, make sure the LUN exists and * is formatted for EEDP support. If all of this is true, set CDB up * for EEDP transfer. 
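 */

/*
 * A small sketch of the reference tag seeding in the EEDP code just
 * below: for CDBs shorter than 32 bytes, the primary reference tag
 * comes from the low four LBA bytes of the CDB, which start at offset
 * 6 in a 16-byte CDB and offset 2 otherwise.  Hypothetical user-space
 * demo, not driver code.
 */
#include <stdint.h>
#include <stdio.h>

static uint32_t
eedp_ref_tag(const uint8_t *cdb, int cdb_len)
{
	int lba_byte = (cdb_len == 16) ? 6 : 2;

	return ((uint32_t)cdb[lba_byte] << 24 |
	    (uint32_t)cdb[lba_byte + 1] << 16 |
	    (uint32_t)cdb[lba_byte + 2] << 8 |
	    cdb[lba_byte + 3]);
}

int
main(void)
{
	/* A READ(10) CDB addressing LBA 0x00012345. */
	uint8_t cdb[10] = { 0x28, 0, 0x00, 0x01, 0x23, 0x45, 0, 0, 1, 0 };

	printf("ref tag %#x\n", eedp_ref_tag(cdb, 10));
	return (0);
}

/*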
*/ eedp_flags = op_code_prot[req->CDB.CDB32[0]]; if (sc->eedp_enabled && eedp_flags) { SLIST_FOREACH(lun, &targ->luns, lun_link) { if (lun->lun_id == csio->ccb_h.target_lun) { break; } } if ((lun != NULL) && (lun->eedp_formatted)) { req->EEDPBlockSize = htole16(lun->eedp_block_size); eedp_flags |= (MPI2_SCSIIO_EEDPFLAGS_INC_PRI_REFTAG | MPI2_SCSIIO_EEDPFLAGS_CHECK_REFTAG | MPI2_SCSIIO_EEDPFLAGS_CHECK_GUARD); if (sc->mpr_flags & MPR_FLAGS_GEN35_IOC) { eedp_flags |= MPI25_SCSIIO_EEDPFLAGS_APPTAG_DISABLE_MODE; } req->EEDPFlags = htole16(eedp_flags); /* * If CDB less than 32, fill in Primary Ref Tag with * low 4 bytes of LBA. If CDB is 32, tag stuff is * already there. Also, set protection bit. FreeBSD * currently does not support CDBs bigger than 16, but * the code doesn't hurt, and will be here for the * future. */ if (csio->cdb_len != 32) { lba_byte = (csio->cdb_len == 16) ? 6 : 2; ref_tag_addr = (uint8_t *)&req->CDB.EEDP32. PrimaryReferenceTag; for (i = 0; i < 4; i++) { *ref_tag_addr = req->CDB.CDB32[lba_byte + i]; ref_tag_addr++; } req->CDB.EEDP32.PrimaryReferenceTag = htole32(req-> CDB.EEDP32.PrimaryReferenceTag); req->CDB.EEDP32.PrimaryApplicationTagMask = 0xFFFF; req->CDB.CDB32[1] = (req->CDB.CDB32[1] & 0x1F) | 0x20; } else { eedp_flags |= MPI2_SCSIIO_EEDPFLAGS_INC_PRI_APPTAG; req->EEDPFlags = htole16(eedp_flags); req->CDB.CDB32[10] = (req->CDB.CDB32[10] & 0x1F) | 0x20; } } } cm->cm_length = csio->dxfer_len; if (cm->cm_length != 0) { cm->cm_data = ccb; cm->cm_flags |= MPR_CM_FLAGS_USE_CCB; } else { cm->cm_data = NULL; } cm->cm_sge = &req->SGL; cm->cm_sglsize = (32 - 24) * 4; cm->cm_complete = mprsas_scsiio_complete; cm->cm_complete_data = ccb; cm->cm_targ = targ; cm->cm_lun = csio->ccb_h.target_lun; cm->cm_ccb = ccb; /* * If using FP desc type, need to set a bit in IoFlags (SCSI IO is 0) * and set descriptor type. 
 */
	if (targ->scsi_req_desc_type ==
	    MPI25_REQ_DESCRIPT_FLAGS_FAST_PATH_SCSI_IO) {
		req->IoFlags |= MPI25_SCSIIO_IOFLAGS_FAST_PATH;
		cm->cm_desc.FastPathSCSIIO.RequestFlags =
		    MPI25_REQ_DESCRIPT_FLAGS_FAST_PATH_SCSI_IO;
		if (!sc->atomic_desc_capable) {
			cm->cm_desc.FastPathSCSIIO.DevHandle =
			    htole16(targ->handle);
		}
	} else {
		cm->cm_desc.SCSIIO.RequestFlags =
		    MPI2_REQ_DESCRIPT_FLAGS_SCSI_IO;
		if (!sc->atomic_desc_capable)
			cm->cm_desc.SCSIIO.DevHandle = htole16(targ->handle);
	}

	csio->ccb_h.qos.sim_data = sbinuptime();
#if __FreeBSD_version >= 1000029
	callout_reset_sbt(&cm->cm_callout, SBT_1MS * ccb->ccb_h.timeout, 0,
	    mprsas_scsiio_timeout, cm, 0);
#else //__FreeBSD_version < 1000029
	callout_reset(&cm->cm_callout, (ccb->ccb_h.timeout * hz) / 1000,
	    mprsas_scsiio_timeout, cm);
#endif //__FreeBSD_version >= 1000029

	targ->issued++;
	targ->outstanding++;
	TAILQ_INSERT_TAIL(&targ->commands, cm, cm_link);
	ccb->ccb_h.status |= CAM_SIM_QUEUED;

	mprsas_log_command(cm, MPR_XINFO, "%s cm %p ccb %p outstanding %u\n",
	    __func__, cm, ccb, targ->outstanding);

	mpr_map_command(sc, cm);
	return;
}

/**
 * mpr_sc_failed_io_info - translate and log a non-successful SCSI_IO request
 */
static void
mpr_sc_failed_io_info(struct mpr_softc *sc, struct ccb_scsiio *csio,
    Mpi2SCSIIOReply_t *mpi_reply, struct mprsas_target *targ)
{
	u32 response_info;
	u8 *response_bytes;
	u16 ioc_status = le16toh(mpi_reply->IOCStatus) &
	    MPI2_IOCSTATUS_MASK;
	u8 scsi_state = mpi_reply->SCSIState;
	u8 scsi_status = mpi_reply->SCSIStatus;
	char *desc_ioc_state = NULL;
	char *desc_scsi_status = NULL;
	u32 log_info = le32toh(mpi_reply->IOCLogInfo);

	if (log_info == 0x31170000)
		return;

	desc_ioc_state = mpr_describe_table(mpr_iocstatus_string, ioc_status);
	desc_scsi_status = mpr_describe_table(mpr_scsi_status_string,
	    scsi_status);

	mpr_dprint(sc, MPR_XINFO, "\thandle(0x%04x), ioc_status(%s)(0x%04x)\n",
	    le16toh(mpi_reply->DevHandle), desc_ioc_state, ioc_status);
	if (targ->encl_level_valid) {
		mpr_dprint(sc, MPR_XINFO, "At enclosure level %d, slot %d, "
		    "connector name (%4s)\n", targ->encl_level,
		    targ->encl_slot, targ->connector_name);
	}

	/*
	 * TODO: add more detail about the underflow data here.
	 */
	mpr_dprint(sc, MPR_XINFO, "\tscsi_status(%s)(0x%02x), "
	    "scsi_state %b\n", desc_scsi_status, scsi_status, scsi_state,
	    "\20" "\1AutosenseValid" "\2AutosenseFailed" "\3NoScsiStatus"
	    "\4Terminated" "\5Response InfoValid");

	if (sc->mpr_debug & MPR_XINFO &&
	    scsi_state & MPI2_SCSI_STATE_AUTOSENSE_VALID) {
		mpr_dprint(sc, MPR_XINFO, "-> Sense Buffer Data : Start :\n");
		scsi_sense_print(csio);
		mpr_dprint(sc, MPR_XINFO, "-> Sense Buffer Data : End :\n");
	}

	if (scsi_state & MPI2_SCSI_STATE_RESPONSE_INFO_VALID) {
		response_info = le32toh(mpi_reply->ResponseInfo);
		response_bytes = (u8 *)&response_info;
		mpr_dprint(sc, MPR_XINFO, "response code(0x%01x): %s\n",
		    response_bytes[0],
		    mpr_describe_table(mpr_scsi_taskmgmt_string,
		    response_bytes[0]));
	}
}

/** mprsas_nvme_trans_status_code
 *
 * Convert native NVMe command error status to the
 * equivalent SCSI error status.
* * Returns appropriate scsi_status */ static u8 mprsas_nvme_trans_status_code(uint16_t nvme_status, struct mpr_command *cm) { u8 status = MPI2_SCSI_STATUS_GOOD; int skey, asc, ascq; union ccb *ccb = cm->cm_complete_data; int returned_sense_len; uint8_t sct, sc; sct = NVME_STATUS_GET_SCT(nvme_status); sc = NVME_STATUS_GET_SC(nvme_status); status = MPI2_SCSI_STATUS_CHECK_CONDITION; skey = SSD_KEY_ILLEGAL_REQUEST; asc = SCSI_ASC_NO_SENSE; ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; switch (sct) { case NVME_SCT_GENERIC: switch (sc) { case NVME_SC_SUCCESS: status = MPI2_SCSI_STATUS_GOOD; skey = SSD_KEY_NO_SENSE; asc = SCSI_ASC_NO_SENSE; ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; break; case NVME_SC_INVALID_OPCODE: status = MPI2_SCSI_STATUS_CHECK_CONDITION; skey = SSD_KEY_ILLEGAL_REQUEST; asc = SCSI_ASC_ILLEGAL_COMMAND; ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; break; case NVME_SC_INVALID_FIELD: status = MPI2_SCSI_STATUS_CHECK_CONDITION; skey = SSD_KEY_ILLEGAL_REQUEST; asc = SCSI_ASC_INVALID_CDB; ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; break; case NVME_SC_DATA_TRANSFER_ERROR: status = MPI2_SCSI_STATUS_CHECK_CONDITION; skey = SSD_KEY_MEDIUM_ERROR; asc = SCSI_ASC_NO_SENSE; ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; break; case NVME_SC_ABORTED_POWER_LOSS: status = MPI2_SCSI_STATUS_TASK_ABORTED; skey = SSD_KEY_ABORTED_COMMAND; asc = SCSI_ASC_WARNING; ascq = SCSI_ASCQ_POWER_LOSS_EXPECTED; break; case NVME_SC_INTERNAL_DEVICE_ERROR: status = MPI2_SCSI_STATUS_CHECK_CONDITION; skey = SSD_KEY_HARDWARE_ERROR; asc = SCSI_ASC_INTERNAL_TARGET_FAILURE; ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; break; case NVME_SC_ABORTED_BY_REQUEST: case NVME_SC_ABORTED_SQ_DELETION: case NVME_SC_ABORTED_FAILED_FUSED: case NVME_SC_ABORTED_MISSING_FUSED: status = MPI2_SCSI_STATUS_TASK_ABORTED; skey = SSD_KEY_ABORTED_COMMAND; asc = SCSI_ASC_NO_SENSE; ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; break; case NVME_SC_INVALID_NAMESPACE_OR_FORMAT: status = MPI2_SCSI_STATUS_CHECK_CONDITION; skey = SSD_KEY_ILLEGAL_REQUEST; asc = SCSI_ASC_ACCESS_DENIED_INVALID_LUN_ID; ascq = SCSI_ASCQ_INVALID_LUN_ID; break; case NVME_SC_LBA_OUT_OF_RANGE: status = MPI2_SCSI_STATUS_CHECK_CONDITION; skey = SSD_KEY_ILLEGAL_REQUEST; asc = SCSI_ASC_ILLEGAL_BLOCK; ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; break; case NVME_SC_CAPACITY_EXCEEDED: status = MPI2_SCSI_STATUS_CHECK_CONDITION; skey = SSD_KEY_MEDIUM_ERROR; asc = SCSI_ASC_NO_SENSE; ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; break; case NVME_SC_NAMESPACE_NOT_READY: status = MPI2_SCSI_STATUS_CHECK_CONDITION; skey = SSD_KEY_NOT_READY; asc = SCSI_ASC_LUN_NOT_READY; ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; break; } break; case NVME_SCT_COMMAND_SPECIFIC: switch (sc) { case NVME_SC_INVALID_FORMAT: status = MPI2_SCSI_STATUS_CHECK_CONDITION; skey = SSD_KEY_ILLEGAL_REQUEST; asc = SCSI_ASC_FORMAT_COMMAND_FAILED; ascq = SCSI_ASCQ_FORMAT_COMMAND_FAILED; break; case NVME_SC_CONFLICTING_ATTRIBUTES: status = MPI2_SCSI_STATUS_CHECK_CONDITION; skey = SSD_KEY_ILLEGAL_REQUEST; asc = SCSI_ASC_INVALID_CDB; ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; break; } break; case NVME_SCT_MEDIA_ERROR: switch (sc) { case NVME_SC_WRITE_FAULTS: status = MPI2_SCSI_STATUS_CHECK_CONDITION; skey = SSD_KEY_MEDIUM_ERROR; asc = SCSI_ASC_PERIPHERAL_DEV_WRITE_FAULT; ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; break; case NVME_SC_UNRECOVERED_READ_ERROR: status = MPI2_SCSI_STATUS_CHECK_CONDITION; skey = SSD_KEY_MEDIUM_ERROR; asc = SCSI_ASC_UNRECOVERED_READ_ERROR; ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; break; case NVME_SC_GUARD_CHECK_ERROR: status = MPI2_SCSI_STATUS_CHECK_CONDITION; skey = 
SSD_KEY_MEDIUM_ERROR; asc = SCSI_ASC_LOG_BLOCK_GUARD_CHECK_FAILED; ascq = SCSI_ASCQ_LOG_BLOCK_GUARD_CHECK_FAILED; break; case NVME_SC_APPLICATION_TAG_CHECK_ERROR: status = MPI2_SCSI_STATUS_CHECK_CONDITION; skey = SSD_KEY_MEDIUM_ERROR; asc = SCSI_ASC_LOG_BLOCK_APPTAG_CHECK_FAILED; ascq = SCSI_ASCQ_LOG_BLOCK_APPTAG_CHECK_FAILED; break; case NVME_SC_REFERENCE_TAG_CHECK_ERROR: status = MPI2_SCSI_STATUS_CHECK_CONDITION; skey = SSD_KEY_MEDIUM_ERROR; asc = SCSI_ASC_LOG_BLOCK_REFTAG_CHECK_FAILED; ascq = SCSI_ASCQ_LOG_BLOCK_REFTAG_CHECK_FAILED; break; case NVME_SC_COMPARE_FAILURE: status = MPI2_SCSI_STATUS_CHECK_CONDITION; skey = SSD_KEY_MISCOMPARE; asc = SCSI_ASC_MISCOMPARE_DURING_VERIFY; ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; break; case NVME_SC_ACCESS_DENIED: status = MPI2_SCSI_STATUS_CHECK_CONDITION; skey = SSD_KEY_ILLEGAL_REQUEST; asc = SCSI_ASC_ACCESS_DENIED_INVALID_LUN_ID; ascq = SCSI_ASCQ_INVALID_LUN_ID; break; } break; } returned_sense_len = sizeof(struct scsi_sense_data); if (returned_sense_len < ccb->csio.sense_len) ccb->csio.sense_resid = ccb->csio.sense_len - returned_sense_len; else ccb->csio.sense_resid = 0; scsi_set_sense_data(&ccb->csio.sense_data, SSD_TYPE_FIXED, 1, skey, asc, ascq, SSD_ELEM_NONE); ccb->ccb_h.status |= CAM_AUTOSNS_VALID; return status; } /** mprsas_complete_nvme_unmap * * Complete native NVMe command issued using NVMe Encapsulated * Request Message. */ static u8 mprsas_complete_nvme_unmap(struct mpr_softc *sc, struct mpr_command *cm) { Mpi26NVMeEncapsulatedErrorReply_t *mpi_reply; struct nvme_completion *nvme_completion = NULL; u8 scsi_status = MPI2_SCSI_STATUS_GOOD; mpi_reply =(Mpi26NVMeEncapsulatedErrorReply_t *)cm->cm_reply; if (le16toh(mpi_reply->ErrorResponseCount)){ nvme_completion = (struct nvme_completion *)cm->cm_sense; scsi_status = mprsas_nvme_trans_status_code( nvme_completion->status, cm); } return scsi_status; } static void mprsas_scsiio_complete(struct mpr_softc *sc, struct mpr_command *cm) { MPI2_SCSI_IO_REPLY *rep; union ccb *ccb; struct ccb_scsiio *csio; struct mprsas_softc *sassc; struct scsi_vpd_supported_page_list *vpd_list = NULL; u8 *TLR_bits, TLR_on, *scsi_cdb; int dir = 0, i; u16 alloc_len; struct mprsas_target *target; target_id_t target_id; MPR_FUNCTRACE(sc); mpr_dprint(sc, MPR_TRACE, "cm %p SMID %u ccb %p reply %p outstanding %u\n", cm, cm->cm_desc.Default.SMID, cm->cm_ccb, cm->cm_reply, cm->cm_targ->outstanding); callout_stop(&cm->cm_callout); mtx_assert(&sc->mpr_mtx, MA_OWNED); sassc = sc->sassc; ccb = cm->cm_complete_data; csio = &ccb->csio; target_id = csio->ccb_h.target_id; rep = (MPI2_SCSI_IO_REPLY *)cm->cm_reply; /* * XXX KDM if the chain allocation fails, does it matter if we do * the sync and unload here? It is simpler to do it in every case, * assuming it doesn't cause problems. 
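 */

/*
 * The switch statements in mprsas_nvme_trans_status_code() above map
 * NVMe (SCT, SC) status pairs onto a SCSI sense key/ASC/ASCQ triple.
 * A minimal sketch of the same idea as a lookup table; the two rows
 * and the constant values are illustrative examples, not the driver's
 * full mapping.
 */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct nvme_to_scsi {
	uint8_t sct;		/* NVMe status code type */
	uint8_t sc;		/* NVMe status code */
	int skey, asc, ascq;	/* SCSI sense triple */
};

static const struct nvme_to_scsi xlat[] = {
	/* generic/invalid field -> ILLEGAL REQUEST, invalid CDB */
	{ 0x0, 0x02, 0x05, 0x24, 0x00 },
	/* media/unrecovered read -> MEDIUM ERROR, unrecovered read */
	{ 0x2, 0x81, 0x03, 0x11, 0x00 },
};

static const struct nvme_to_scsi *
nvme_to_scsi_lookup(uint8_t sct, uint8_t sc)
{
	size_t i;

	for (i = 0; i < sizeof(xlat) / sizeof(xlat[0]); i++)
		if (xlat[i].sct == sct && xlat[i].sc == sc)
			return (&xlat[i]);
	return (NULL);		/* caller falls back to a default */
}

int
main(void)
{
	const struct nvme_to_scsi *e = nvme_to_scsi_lookup(0x2, 0x81);

	if (e != NULL)
		printf("skey %#x asc %#x ascq %#x\n", e->skey, e->asc,
		    e->ascq);
	return (0);
}

/*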
*/ if (cm->cm_data != NULL) { if (cm->cm_flags & MPR_CM_FLAGS_DATAIN) dir = BUS_DMASYNC_POSTREAD; else if (cm->cm_flags & MPR_CM_FLAGS_DATAOUT) dir = BUS_DMASYNC_POSTWRITE; bus_dmamap_sync(sc->buffer_dmat, cm->cm_dmamap, dir); bus_dmamap_unload(sc->buffer_dmat, cm->cm_dmamap); } cm->cm_targ->completed++; cm->cm_targ->outstanding--; TAILQ_REMOVE(&cm->cm_targ->commands, cm, cm_link); ccb->ccb_h.status &= ~(CAM_STATUS_MASK | CAM_SIM_QUEUED); if (cm->cm_state == MPR_CM_STATE_TIMEDOUT) { TAILQ_REMOVE(&cm->cm_targ->timedout_commands, cm, cm_recovery); cm->cm_state = MPR_CM_STATE_BUSY; if (cm->cm_reply != NULL) mprsas_log_command(cm, MPR_RECOVERY, "completed timedout cm %p ccb %p during recovery " "ioc %x scsi %x state %x xfer %u\n", cm, cm->cm_ccb, le16toh(rep->IOCStatus), rep->SCSIStatus, rep->SCSIState, le32toh(rep->TransferCount)); else mprsas_log_command(cm, MPR_RECOVERY, "completed timedout cm %p ccb %p during recovery\n", cm, cm->cm_ccb); } else if (cm->cm_targ->tm != NULL) { if (cm->cm_reply != NULL) mprsas_log_command(cm, MPR_RECOVERY, "completed cm %p ccb %p during recovery " "ioc %x scsi %x state %x xfer %u\n", cm, cm->cm_ccb, le16toh(rep->IOCStatus), rep->SCSIStatus, rep->SCSIState, le32toh(rep->TransferCount)); else mprsas_log_command(cm, MPR_RECOVERY, "completed cm %p ccb %p during recovery\n", cm, cm->cm_ccb); } else if ((sc->mpr_flags & MPR_FLAGS_DIAGRESET) != 0) { mprsas_log_command(cm, MPR_RECOVERY, "reset completed cm %p ccb %p\n", cm, cm->cm_ccb); } if ((cm->cm_flags & MPR_CM_FLAGS_ERROR_MASK) != 0) { /* * We ran into an error after we tried to map the command, * so we're getting a callback without queueing the command * to the hardware. So we set the status here, and it will * be retained below. We'll go through the "fast path", * because there can be no reply when we haven't actually * gone out to the hardware. */ mprsas_set_ccbstatus(ccb, CAM_REQUEUE_REQ); /* * Currently the only error included in the mask is * MPR_CM_FLAGS_CHAIN_FAILED, which means we're out of * chain frames. We need to freeze the queue until we get * a command that completed without this error, which will * hopefully have some chain frames attached that we can * use. If we wanted to get smarter about it, we would * only unfreeze the queue in this condition when we're * sure that we're getting some chain frames back. That's * probably unnecessary. */ if ((sassc->flags & MPRSAS_QUEUE_FROZEN) == 0) { xpt_freeze_simq(sassc->sim, 1); sassc->flags |= MPRSAS_QUEUE_FROZEN; mpr_dprint(sc, MPR_XINFO, "Error sending command, " "freezing SIM queue\n"); } } /* * Point to the SCSI CDB, which is dependent on the CAM_CDB_POINTER * flag, and use it in a few places in the rest of this function for * convenience. Use the macro if available. */ #if __FreeBSD_version >= 1100103 scsi_cdb = scsiio_cdb_ptr(csio); #else if (csio->ccb_h.flags & CAM_CDB_POINTER) scsi_cdb = csio->cdb_io.cdb_ptr; else scsi_cdb = csio->cdb_io.cdb_bytes; #endif /* * If this is a Start Stop Unit command and it was issued by the driver * during shutdown, decrement the refcount to account for all of the * commands that were sent. All SSU commands should be completed before * shutdown completes, meaning SSU_refcount will be 0 after SSU_started * is TRUE. 
*/ if (sc->SSU_started && (scsi_cdb[0] == START_STOP_UNIT)) { mpr_dprint(sc, MPR_INFO, "Decrementing SSU count.\n"); sc->SSU_refcount--; } /* Take the fast path to completion */ if (cm->cm_reply == NULL) { if (mprsas_get_ccbstatus(ccb) == CAM_REQ_INPROG) { if ((sc->mpr_flags & MPR_FLAGS_DIAGRESET) != 0) mprsas_set_ccbstatus(ccb, CAM_SCSI_BUS_RESET); else { mprsas_set_ccbstatus(ccb, CAM_REQ_CMP); csio->scsi_status = SCSI_STATUS_OK; } if (sassc->flags & MPRSAS_QUEUE_FROZEN) { ccb->ccb_h.status |= CAM_RELEASE_SIMQ; sassc->flags &= ~MPRSAS_QUEUE_FROZEN; mpr_dprint(sc, MPR_XINFO, "Unfreezing SIM queue\n"); } } /* * There are two scenarios where the status won't be * CAM_REQ_CMP. The first is if MPR_CM_FLAGS_ERROR_MASK is * set, the second is in the MPR_FLAGS_DIAGRESET above. */ if (mprsas_get_ccbstatus(ccb) != CAM_REQ_CMP) { /* * Freeze the dev queue so that commands are * executed in the correct order after error * recovery. */ ccb->ccb_h.status |= CAM_DEV_QFRZN; xpt_freeze_devq(ccb->ccb_h.path, /*count*/ 1); } mpr_free_command(sc, cm); xpt_done(ccb); return; } target = &sassc->targets[target_id]; if (scsi_cdb[0] == UNMAP && target->is_nvme && (csio->ccb_h.flags & CAM_DATA_MASK) == CAM_DATA_VADDR) { rep->SCSIStatus = mprsas_complete_nvme_unmap(sc, cm); csio->scsi_status = rep->SCSIStatus; } mprsas_log_command(cm, MPR_XINFO, "ioc %x scsi %x state %x xfer %u\n", le16toh(rep->IOCStatus), rep->SCSIStatus, rep->SCSIState, le32toh(rep->TransferCount)); switch (le16toh(rep->IOCStatus) & MPI2_IOCSTATUS_MASK) { case MPI2_IOCSTATUS_SCSI_DATA_UNDERRUN: csio->resid = cm->cm_length - le32toh(rep->TransferCount); /* FALLTHROUGH */ case MPI2_IOCSTATUS_SUCCESS: case MPI2_IOCSTATUS_SCSI_RECOVERED_ERROR: if ((le16toh(rep->IOCStatus) & MPI2_IOCSTATUS_MASK) == MPI2_IOCSTATUS_SCSI_RECOVERED_ERROR) mprsas_log_command(cm, MPR_XINFO, "recovered error\n"); /* Completion failed at the transport level. */ if (rep->SCSIState & (MPI2_SCSI_STATE_NO_SCSI_STATUS | MPI2_SCSI_STATE_TERMINATED)) { mprsas_set_ccbstatus(ccb, CAM_REQ_CMP_ERR); break; } /* In a modern packetized environment, an autosense failure * implies that there's not much else that can be done to * recover the command. */ if (rep->SCSIState & MPI2_SCSI_STATE_AUTOSENSE_FAILED) { mprsas_set_ccbstatus(ccb, CAM_AUTOSENSE_FAIL); break; } /* * CAM doesn't care about SAS Response Info data, but if this is * the state check if TLR should be done. If not, clear the * TLR_bits for the target. */ if ((rep->SCSIState & MPI2_SCSI_STATE_RESPONSE_INFO_VALID) && ((le32toh(rep->ResponseInfo) & MPI2_SCSI_RI_MASK_REASONCODE) == MPR_SCSI_RI_INVALID_FRAME)) { sc->mapping_table[target_id].TLR_bits = (u8)MPI2_SCSIIO_CONTROL_NO_TLR; } /* * Intentionally override the normal SCSI status reporting * for these two cases. These are likely to happen in a * multi-initiator environment, and we want to make sure that * CAM retries these commands rather than fail them. 
*/ if ((rep->SCSIStatus == MPI2_SCSI_STATUS_COMMAND_TERMINATED) || (rep->SCSIStatus == MPI2_SCSI_STATUS_TASK_ABORTED)) { mprsas_set_ccbstatus(ccb, CAM_REQ_ABORTED); break; } /* Handle normal status and sense */ csio->scsi_status = rep->SCSIStatus; if (rep->SCSIStatus == MPI2_SCSI_STATUS_GOOD) mprsas_set_ccbstatus(ccb, CAM_REQ_CMP); else mprsas_set_ccbstatus(ccb, CAM_SCSI_STATUS_ERROR); if (rep->SCSIState & MPI2_SCSI_STATE_AUTOSENSE_VALID) { int sense_len, returned_sense_len; returned_sense_len = min(le32toh(rep->SenseCount), sizeof(struct scsi_sense_data)); if (returned_sense_len < csio->sense_len) csio->sense_resid = csio->sense_len - returned_sense_len; else csio->sense_resid = 0; sense_len = min(returned_sense_len, csio->sense_len - csio->sense_resid); bzero(&csio->sense_data, sizeof(csio->sense_data)); bcopy(cm->cm_sense, &csio->sense_data, sense_len); ccb->ccb_h.status |= CAM_AUTOSNS_VALID; } /* * Check if this is an INQUIRY command. If it's a VPD inquiry, * and it's page code 0 (Supported Page List), and there is * inquiry data, and this is for a sequential access device, and * the device is an SSP target, and TLR is supported by the * controller, turn the TLR_bits value ON if page 0x90 is * supported. */ if ((scsi_cdb[0] == INQUIRY) && (scsi_cdb[1] & SI_EVPD) && (scsi_cdb[2] == SVPD_SUPPORTED_PAGE_LIST) && ((csio->ccb_h.flags & CAM_DATA_MASK) == CAM_DATA_VADDR) && (csio->data_ptr != NULL) && ((csio->data_ptr[0] & 0x1f) == T_SEQUENTIAL) && (sc->control_TLR) && (sc->mapping_table[target_id].device_info & MPI2_SAS_DEVICE_INFO_SSP_TARGET)) { vpd_list = (struct scsi_vpd_supported_page_list *) csio->data_ptr; TLR_bits = &sc->mapping_table[target_id].TLR_bits; *TLR_bits = (u8)MPI2_SCSIIO_CONTROL_NO_TLR; TLR_on = (u8)MPI2_SCSIIO_CONTROL_TLR_ON; alloc_len = ((u16)scsi_cdb[3] << 8) + scsi_cdb[4]; alloc_len -= csio->resid; for (i = 0; i < MIN(vpd_list->length, alloc_len); i++) { if (vpd_list->list[i] == 0x90) { *TLR_bits = TLR_on; break; } } } /* * If this is a SATA direct-access end device, mark it so that * a SCSI StartStopUnit command will be sent to it when the * driver is being shutdown. */ if ((scsi_cdb[0] == INQUIRY) && (csio->data_ptr != NULL) && ((csio->data_ptr[0] & 0x1f) == T_DIRECT) && (sc->mapping_table[target_id].device_info & MPI2_SAS_DEVICE_INFO_SATA_DEVICE) && ((sc->mapping_table[target_id].device_info & MPI2_SAS_DEVICE_INFO_MASK_DEVICE_TYPE) == MPI2_SAS_DEVICE_INFO_END_DEVICE)) { target = &sassc->targets[target_id]; target->supports_SSU = TRUE; mpr_dprint(sc, MPR_XINFO, "Target %d supports SSU\n", target_id); } break; case MPI2_IOCSTATUS_SCSI_INVALID_DEVHANDLE: case MPI2_IOCSTATUS_SCSI_DEVICE_NOT_THERE: /* * If devinfo is 0 this will be a volume. In that case don't * tell CAM that the volume is not there. We want volumes to * be enumerated until they are deleted/removed, not just * failed. */ if (cm->cm_targ->devinfo == 0) mprsas_set_ccbstatus(ccb, CAM_REQ_CMP); else mprsas_set_ccbstatus(ccb, CAM_DEV_NOT_THERE); break; case MPI2_IOCSTATUS_INVALID_SGL: mpr_print_scsiio_cmd(sc, cm); mprsas_set_ccbstatus(ccb, CAM_UNREC_HBA_ERROR); break; case MPI2_IOCSTATUS_SCSI_TASK_TERMINATED: /* * This is one of the responses that comes back when an I/O * has been aborted. If it is because of a timeout that we * initiated, just set the status to CAM_CMD_TIMEOUT. * Otherwise set it to CAM_REQ_ABORTED. The effect on the * command is the same (it gets retried, subject to the * retry counter), the only difference is what gets printed * on the console. 
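 */

/*
 * A self-contained sketch of the VPD "Supported Pages" walk used in
 * the INQUIRY handling above to decide whether TLR can be enabled:
 * scan the page list returned by INQUIRY EVPD page 0x00 for page 0x90
 * (Protocol Specific Logical Unit).  The struct here is a simplified
 * stand-in, not the CAM definition.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct vpd_page_list {
	uint8_t device;
	uint8_t page_code;	/* 0x00 */
	uint8_t reserved;
	uint8_t length;		/* number of entries in list[] */
	uint8_t list[8];
};

static bool
supports_page(const struct vpd_page_list *pl, uint8_t page)
{
	int i;

	for (i = 0; i < pl->length; i++)
		if (pl->list[i] == page)
			return (true);
	return (false);
}

int
main(void)
{
	struct vpd_page_list pl = {
		.page_code = 0x00, .length = 3,
		.list = { 0x00, 0x80, 0x90 },
	};

	printf("TLR page 0x90 %s\n",
	    supports_page(&pl, 0x90) ? "supported" : "not supported");
	return (0);
}

/*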
*/ if (cm->cm_state == MPR_CM_STATE_TIMEDOUT) mprsas_set_ccbstatus(ccb, CAM_CMD_TIMEOUT); else mprsas_set_ccbstatus(ccb, CAM_REQ_ABORTED); break; case MPI2_IOCSTATUS_SCSI_DATA_OVERRUN: /* resid is ignored for this condition */ csio->resid = 0; mprsas_set_ccbstatus(ccb, CAM_DATA_RUN_ERR); break; case MPI2_IOCSTATUS_SCSI_IOC_TERMINATED: case MPI2_IOCSTATUS_SCSI_EXT_TERMINATED: /* * These can sometimes be transient transport-related * errors, and sometimes persistent drive-related errors. * We used to retry these without decrementing the retry * count by returning CAM_REQUEUE_REQ. Unfortunately, if * we hit a persistent drive problem that returns one of * these error codes, we would retry indefinitely. So, * return CAM_REQ_CMP_ERROR so that we decrement the retry * count and avoid infinite retries. We're taking the * potential risk of flagging false failures in the event * of a topology-related error (e.g. a SAS expander problem * causes a command addressed to a drive to fail), but * avoiding getting into an infinite retry loop. */ mprsas_set_ccbstatus(ccb, CAM_REQ_CMP_ERR); mpr_dprint(sc, MPR_INFO, "Controller reported %s tgt %u SMID %u loginfo %x\n", mpr_describe_table(mpr_iocstatus_string, le16toh(rep->IOCStatus) & MPI2_IOCSTATUS_MASK), target_id, cm->cm_desc.Default.SMID, le32toh(rep->IOCLogInfo)); mpr_dprint(sc, MPR_XINFO, "SCSIStatus %x SCSIState %x xfercount %u\n", rep->SCSIStatus, rep->SCSIState, le32toh(rep->TransferCount)); break; case MPI2_IOCSTATUS_INVALID_FUNCTION: case MPI2_IOCSTATUS_INTERNAL_ERROR: case MPI2_IOCSTATUS_INVALID_VPID: case MPI2_IOCSTATUS_INVALID_FIELD: case MPI2_IOCSTATUS_INVALID_STATE: case MPI2_IOCSTATUS_OP_STATE_NOT_SUPPORTED: case MPI2_IOCSTATUS_SCSI_IO_DATA_ERROR: case MPI2_IOCSTATUS_SCSI_PROTOCOL_ERROR: case MPI2_IOCSTATUS_SCSI_RESIDUAL_MISMATCH: case MPI2_IOCSTATUS_SCSI_TASK_MGMT_FAILED: default: mprsas_log_command(cm, MPR_XINFO, "completed ioc %x loginfo %x scsi %x state %x xfer %u\n", le16toh(rep->IOCStatus), le32toh(rep->IOCLogInfo), rep->SCSIStatus, rep->SCSIState, le32toh(rep->TransferCount)); csio->resid = cm->cm_length; if (scsi_cdb[0] == UNMAP && target->is_nvme && (csio->ccb_h.flags & CAM_DATA_MASK) == CAM_DATA_VADDR) mprsas_set_ccbstatus(ccb, CAM_REQ_CMP); else mprsas_set_ccbstatus(ccb, CAM_REQ_CMP_ERR); break; } mpr_sc_failed_io_info(sc, csio, rep, cm->cm_targ); if (sassc->flags & MPRSAS_QUEUE_FROZEN) { ccb->ccb_h.status |= CAM_RELEASE_SIMQ; sassc->flags &= ~MPRSAS_QUEUE_FROZEN; mpr_dprint(sc, MPR_XINFO, "Command completed, unfreezing SIM " "queue\n"); } if (mprsas_get_ccbstatus(ccb) != CAM_REQ_CMP) { ccb->ccb_h.status |= CAM_DEV_QFRZN; xpt_freeze_devq(ccb->ccb_h.path, /*count*/ 1); } mpr_free_command(sc, cm); xpt_done(ccb); } #if __FreeBSD_version >= 900026 static void mprsas_smpio_complete(struct mpr_softc *sc, struct mpr_command *cm) { MPI2_SMP_PASSTHROUGH_REPLY *rpl; MPI2_SMP_PASSTHROUGH_REQUEST *req; uint64_t sasaddr; union ccb *ccb; ccb = cm->cm_complete_data; /* * Currently there should be no way we can hit this case. It only * happens when we have a failure to allocate chain frames, and SMP * commands require two S/G elements only. That should be handled * in the standard request size. 
*/ if ((cm->cm_flags & MPR_CM_FLAGS_ERROR_MASK) != 0) { mpr_dprint(sc, MPR_ERROR, "%s: cm_flags = %#x on SMP " "request!\n", __func__, cm->cm_flags); mprsas_set_ccbstatus(ccb, CAM_REQ_CMP_ERR); goto bailout; } rpl = (MPI2_SMP_PASSTHROUGH_REPLY *)cm->cm_reply; if (rpl == NULL) { mpr_dprint(sc, MPR_ERROR, "%s: NULL cm_reply!\n", __func__); mprsas_set_ccbstatus(ccb, CAM_REQ_CMP_ERR); goto bailout; } req = (MPI2_SMP_PASSTHROUGH_REQUEST *)cm->cm_req; sasaddr = le32toh(req->SASAddress.Low); sasaddr |= ((uint64_t)(le32toh(req->SASAddress.High))) << 32; if ((le16toh(rpl->IOCStatus) & MPI2_IOCSTATUS_MASK) != MPI2_IOCSTATUS_SUCCESS || rpl->SASStatus != MPI2_SASSTATUS_SUCCESS) { mpr_dprint(sc, MPR_XINFO, "%s: IOCStatus %04x SASStatus %02x\n", __func__, le16toh(rpl->IOCStatus), rpl->SASStatus); mprsas_set_ccbstatus(ccb, CAM_REQ_CMP_ERR); goto bailout; } mpr_dprint(sc, MPR_XINFO, "%s: SMP request to SAS address %#jx " "completed successfully\n", __func__, (uintmax_t)sasaddr); if (ccb->smpio.smp_response[2] == SMP_FR_ACCEPTED) mprsas_set_ccbstatus(ccb, CAM_REQ_CMP); else mprsas_set_ccbstatus(ccb, CAM_SMP_STATUS_ERROR); bailout: /* * We sync in both directions because we had DMAs in the S/G list * in both directions. */ bus_dmamap_sync(sc->buffer_dmat, cm->cm_dmamap, BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(sc->buffer_dmat, cm->cm_dmamap); mpr_free_command(sc, cm); xpt_done(ccb); } static void mprsas_send_smpcmd(struct mprsas_softc *sassc, union ccb *ccb, uint64_t sasaddr) { struct mpr_command *cm; uint8_t *request, *response; MPI2_SMP_PASSTHROUGH_REQUEST *req; struct mpr_softc *sc; struct sglist *sg; int error; sc = sassc->sc; sg = NULL; error = 0; #if (__FreeBSD_version >= 1000028) || \ ((__FreeBSD_version >= 902001) && (__FreeBSD_version < 1000000)) switch (ccb->ccb_h.flags & CAM_DATA_MASK) { case CAM_DATA_PADDR: case CAM_DATA_SG_PADDR: /* * XXX We don't yet support physical addresses here. */ mpr_dprint(sc, MPR_ERROR, "%s: physical addresses not " "supported\n", __func__); mprsas_set_ccbstatus(ccb, CAM_REQ_INVALID); xpt_done(ccb); return; case CAM_DATA_SG: /* * The chip does not support more than one buffer for the * request or response. */ if ((ccb->smpio.smp_request_sglist_cnt > 1) || (ccb->smpio.smp_response_sglist_cnt > 1)) { mpr_dprint(sc, MPR_ERROR, "%s: multiple request or " "response buffer segments not supported for SMP\n", __func__); mprsas_set_ccbstatus(ccb, CAM_REQ_INVALID); xpt_done(ccb); return; } /* * The CAM_SCATTER_VALID flag was originally implemented * for the XPT_SCSI_IO CCB, which only has one data pointer. * We have two. So, just take that flag to mean that we * might have S/G lists, and look at the S/G segment count * to figure out whether that is the case for each individual * buffer. */ if (ccb->smpio.smp_request_sglist_cnt != 0) { bus_dma_segment_t *req_sg; req_sg = (bus_dma_segment_t *)ccb->smpio.smp_request; request = (uint8_t *)(uintptr_t)req_sg[0].ds_addr; } else request = ccb->smpio.smp_request; if (ccb->smpio.smp_response_sglist_cnt != 0) { bus_dma_segment_t *rsp_sg; rsp_sg = (bus_dma_segment_t *)ccb->smpio.smp_response; response = (uint8_t *)(uintptr_t)rsp_sg[0].ds_addr; } else response = ccb->smpio.smp_response; break; case CAM_DATA_VADDR: request = ccb->smpio.smp_request; response = ccb->smpio.smp_response; break; default: mprsas_set_ccbstatus(ccb, CAM_REQ_INVALID); xpt_done(ccb); return; } #else /* __FreeBSD_version < 1000028 */ /* * XXX We don't yet support physical addresses here. 
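 */

/*
 * A tiny sketch of the SAS-address handling in the SMP path: the MPI
 * request carries the 64-bit address as two little-endian 32-bit
 * words, so the driver splits it on send and rejoins it on
 * completion, as seen above.  Self-contained demo with hypothetical
 * names; the htole32()/le32toh() steps are omitted since the demo
 * stays host-endian.
 */
#include <stdint.h>
#include <stdio.h>

struct mpi_sas_address {	/* two 32-bit words, as in MPI */
	uint32_t Low;
	uint32_t High;
};

int
main(void)
{
	uint64_t sasaddr = 0x500605b0000272bdULL;
	struct mpi_sas_address a;
	uint64_t rejoined;

	/* Split on send. */
	a.High = (uint32_t)(sasaddr >> 32);
	a.Low = (uint32_t)sasaddr;

	/* Rejoin on completion. */
	rejoined = (uint64_t)a.High << 32 | a.Low;
	printf("match: %d\n", rejoined == sasaddr);
	return (0);
}

/*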
*/ if (ccb->ccb_h.flags & (CAM_DATA_PHYS|CAM_SG_LIST_PHYS)) { mpr_dprint(sc, MPR_ERROR, "%s: physical addresses not " "supported\n", __func__); mprsas_set_ccbstatus(ccb, CAM_REQ_INVALID); xpt_done(ccb); return; } /* * If the user wants to send an S/G list, check to make sure they * have single buffers. */ if (ccb->ccb_h.flags & CAM_SCATTER_VALID) { /* * The chip does not support more than one buffer for the * request or response. */ if ((ccb->smpio.smp_request_sglist_cnt > 1) || (ccb->smpio.smp_response_sglist_cnt > 1)) { mpr_dprint(sc, MPR_ERROR, "%s: multiple request or " "response buffer segments not supported for SMP\n", __func__); mprsas_set_ccbstatus(ccb, CAM_REQ_INVALID); xpt_done(ccb); return; } /* * The CAM_SCATTER_VALID flag was originally implemented * for the XPT_SCSI_IO CCB, which only has one data pointer. * We have two. So, just take that flag to mean that we * might have S/G lists, and look at the S/G segment count * to figure out whether that is the case for each individual * buffer. */ if (ccb->smpio.smp_request_sglist_cnt != 0) { bus_dma_segment_t *req_sg; req_sg = (bus_dma_segment_t *)ccb->smpio.smp_request; request = (uint8_t *)(uintptr_t)req_sg[0].ds_addr; } else request = ccb->smpio.smp_request; if (ccb->smpio.smp_response_sglist_cnt != 0) { bus_dma_segment_t *rsp_sg; rsp_sg = (bus_dma_segment_t *)ccb->smpio.smp_response; response = (uint8_t *)(uintptr_t)rsp_sg[0].ds_addr; } else response = ccb->smpio.smp_response; } else { request = ccb->smpio.smp_request; response = ccb->smpio.smp_response; } #endif /* __FreeBSD_version < 1000028 */ cm = mpr_alloc_command(sc); if (cm == NULL) { mpr_dprint(sc, MPR_ERROR, "%s: cannot allocate command\n", __func__); mprsas_set_ccbstatus(ccb, CAM_RESRC_UNAVAIL); xpt_done(ccb); return; } req = (MPI2_SMP_PASSTHROUGH_REQUEST *)cm->cm_req; bzero(req, sizeof(*req)); req->Function = MPI2_FUNCTION_SMP_PASSTHROUGH; /* Allow the chip to use any route to this SAS address. */ req->PhysicalPort = 0xff; req->RequestDataLength = htole16(ccb->smpio.smp_request_len); req->SGLFlags = MPI2_SGLFLAGS_SYSTEM_ADDRESS_SPACE | MPI2_SGLFLAGS_SGL_TYPE_MPI; mpr_dprint(sc, MPR_XINFO, "%s: sending SMP request to SAS address " "%#jx\n", __func__, (uintmax_t)sasaddr); mpr_init_sge(cm, req, &req->SGL); /* * Set up a uio to pass into mpr_map_command(). This allows us to * do one map command, and one busdma call in there. */ cm->cm_uio.uio_iov = cm->cm_iovec; cm->cm_uio.uio_iovcnt = 2; cm->cm_uio.uio_segflg = UIO_SYSSPACE; /* * The read/write flag isn't used by busdma, but set it just in * case. This isn't exactly accurate, either, since we're going in * both directions. */ cm->cm_uio.uio_rw = UIO_WRITE; cm->cm_iovec[0].iov_base = request; cm->cm_iovec[0].iov_len = le16toh(req->RequestDataLength); cm->cm_iovec[1].iov_base = response; cm->cm_iovec[1].iov_len = ccb->smpio.smp_response_len; cm->cm_uio.uio_resid = cm->cm_iovec[0].iov_len + cm->cm_iovec[1].iov_len; /* * Trigger a warning message in mpr_data_cb() for the user if we * wind up exceeding two S/G segments. The chip expects one * segment for the request and another for the response. */ cm->cm_max_segs = 2; cm->cm_desc.Default.RequestFlags = MPI2_REQ_DESCRIPT_FLAGS_DEFAULT_TYPE; cm->cm_complete = mprsas_smpio_complete; cm->cm_complete_data = ccb; /* * Tell the mapping code that we're using a uio, and that this is * an SMP passthrough request. There is a little special-case * logic there (in mpr_data_cb()) to handle the bidirectional * transfer. 
*/ cm->cm_flags |= MPR_CM_FLAGS_USE_UIO | MPR_CM_FLAGS_SMP_PASS | MPR_CM_FLAGS_DATAIN | MPR_CM_FLAGS_DATAOUT; /* The chip data format is little endian. */ req->SASAddress.High = htole32(sasaddr >> 32); req->SASAddress.Low = htole32(sasaddr); /* * XXX Note that we don't have a timeout/abort mechanism here. * From the manual, it looks like task management requests only * work for SCSI IO and SATA passthrough requests. We may need to * have a mechanism to retry requests in the event of a chip reset * at least. Hopefully the chip will insure that any errors short * of that are relayed back to the driver. */ error = mpr_map_command(sc, cm); if ((error != 0) && (error != EINPROGRESS)) { mpr_dprint(sc, MPR_ERROR, "%s: error %d returned from " "mpr_map_command()\n", __func__, error); goto bailout_error; } return; bailout_error: mpr_free_command(sc, cm); mprsas_set_ccbstatus(ccb, CAM_RESRC_UNAVAIL); xpt_done(ccb); return; } static void mprsas_action_smpio(struct mprsas_softc *sassc, union ccb *ccb) { struct mpr_softc *sc; struct mprsas_target *targ; uint64_t sasaddr = 0; sc = sassc->sc; /* * Make sure the target exists. */ KASSERT(ccb->ccb_h.target_id < sassc->maxtargets, ("Target %d out of bounds in XPT_SMP_IO\n", ccb->ccb_h.target_id)); targ = &sassc->targets[ccb->ccb_h.target_id]; if (targ->handle == 0x0) { mpr_dprint(sc, MPR_ERROR, "%s: target %d does not exist!\n", __func__, ccb->ccb_h.target_id); mprsas_set_ccbstatus(ccb, CAM_SEL_TIMEOUT); xpt_done(ccb); return; } /* * If this device has an embedded SMP target, we'll talk to it * directly. * figure out what the expander's address is. */ if ((targ->devinfo & MPI2_SAS_DEVICE_INFO_SMP_TARGET) != 0) sasaddr = targ->sasaddr; /* * If we don't have a SAS address for the expander yet, try * grabbing it from the page 0x83 information cached in the * transport layer for this target. LSI expanders report the * expander SAS address as the port-associated SAS address in * Inquiry VPD page 0x83. Maxim expanders don't report it in page * 0x83. * * XXX KDM disable this for now, but leave it commented out so that * it is obvious that this is another possible way to get the SAS * address. * * The parent handle method below is a little more reliable, and * the other benefit is that it works for devices other than SES * devices. So you can send a SMP request to a da(4) device and it * will get routed to the expander that device is attached to. * (Assuming the da(4) device doesn't contain an SMP target...) */ #if 0 if (sasaddr == 0) sasaddr = xpt_path_sas_addr(ccb->ccb_h.path); #endif /* * If we still don't have a SAS address for the expander, look for * the parent device of this device, which is probably the expander. 
*/ if (sasaddr == 0) { #ifdef OLD_MPR_PROBE struct mprsas_target *parent_target; #endif if (targ->parent_handle == 0x0) { mpr_dprint(sc, MPR_ERROR, "%s: handle %d does not have " "a valid parent handle!\n", __func__, targ->handle); mprsas_set_ccbstatus(ccb, CAM_DEV_NOT_THERE); goto bailout; } #ifdef OLD_MPR_PROBE parent_target = mprsas_find_target_by_handle(sassc, 0, targ->parent_handle); if (parent_target == NULL) { mpr_dprint(sc, MPR_ERROR, "%s: handle %d does not have " "a valid parent target!\n", __func__, targ->handle); mprsas_set_ccbstatus(ccb, CAM_DEV_NOT_THERE); goto bailout; } if ((parent_target->devinfo & MPI2_SAS_DEVICE_INFO_SMP_TARGET) == 0) { mpr_dprint(sc, MPR_ERROR, "%s: handle %d parent %d " "does not have an SMP target!\n", __func__, targ->handle, parent_target->handle); mprsas_set_ccbstatus(ccb, CAM_DEV_NOT_THERE); goto bailout; } sasaddr = parent_target->sasaddr; #else /* OLD_MPR_PROBE */ if ((targ->parent_devinfo & MPI2_SAS_DEVICE_INFO_SMP_TARGET) == 0) { mpr_dprint(sc, MPR_ERROR, "%s: handle %d parent %d " "does not have an SMP target!\n", __func__, targ->handle, targ->parent_handle); mprsas_set_ccbstatus(ccb, CAM_DEV_NOT_THERE); goto bailout; } if (targ->parent_sasaddr == 0x0) { mpr_dprint(sc, MPR_ERROR, "%s: handle %d parent handle " "%d does not have a valid SAS address!\n", __func__, targ->handle, targ->parent_handle); mprsas_set_ccbstatus(ccb, CAM_DEV_NOT_THERE); goto bailout; } sasaddr = targ->parent_sasaddr; #endif /* OLD_MPR_PROBE */ } if (sasaddr == 0) { mpr_dprint(sc, MPR_INFO, "%s: unable to find SAS address for " "handle %d\n", __func__, targ->handle); mprsas_set_ccbstatus(ccb, CAM_DEV_NOT_THERE); goto bailout; } mprsas_send_smpcmd(sassc, ccb, sasaddr); return; bailout: xpt_done(ccb); } #endif //__FreeBSD_version >= 900026 static void mprsas_action_resetdev(struct mprsas_softc *sassc, union ccb *ccb) { MPI2_SCSI_TASK_MANAGE_REQUEST *req; struct mpr_softc *sc; struct mpr_command *tm; struct mprsas_target *targ; MPR_FUNCTRACE(sassc->sc); mtx_assert(&sassc->sc->mpr_mtx, MA_OWNED); KASSERT(ccb->ccb_h.target_id < sassc->maxtargets, ("Target %d out of " "bounds in XPT_RESET_DEV\n", ccb->ccb_h.target_id)); sc = sassc->sc; tm = mpr_alloc_command(sc); if (tm == NULL) { mpr_dprint(sc, MPR_ERROR, "command alloc failure in " "mprsas_action_resetdev\n"); mprsas_set_ccbstatus(ccb, CAM_RESRC_UNAVAIL); xpt_done(ccb); return; } targ = &sassc->targets[ccb->ccb_h.target_id]; req = (MPI2_SCSI_TASK_MANAGE_REQUEST *)tm->cm_req; req->DevHandle = htole16(targ->handle); req->Function = MPI2_FUNCTION_SCSI_TASK_MGMT; req->TaskType = MPI2_SCSITASKMGMT_TASKTYPE_TARGET_RESET; /* SAS Hard Link Reset / SATA Link Reset */ req->MsgFlags = MPI2_SCSITASKMGMT_MSGFLAGS_LINK_RESET; tm->cm_data = NULL; tm->cm_desc.HighPriority.RequestFlags = MPI2_REQ_DESCRIPT_FLAGS_HIGH_PRIORITY; tm->cm_complete = mprsas_resetdev_complete; tm->cm_complete_data = ccb; mpr_dprint(sc, MPR_INFO, "%s: Sending reset for target ID %d\n", __func__, targ->tid); tm->cm_targ = targ; targ->flags |= MPRSAS_TARGET_INRESET; mpr_map_command(sc, tm); } static void mprsas_resetdev_complete(struct mpr_softc *sc, struct mpr_command *tm) { MPI2_SCSI_TASK_MANAGE_REPLY *resp; union ccb *ccb; MPR_FUNCTRACE(sc); mtx_assert(&sc->mpr_mtx, MA_OWNED); resp = (MPI2_SCSI_TASK_MANAGE_REPLY *)tm->cm_reply; ccb = tm->cm_complete_data; /* * Currently there should be no way we can hit this case. It only * happens when we have a failure to allocate chain frames, and * task management commands don't have S/G lists. 
*/ if ((tm->cm_flags & MPR_CM_FLAGS_ERROR_MASK) != 0) { MPI2_SCSI_TASK_MANAGE_REQUEST *req; req = (MPI2_SCSI_TASK_MANAGE_REQUEST *)tm->cm_req; mpr_dprint(sc, MPR_ERROR, "%s: cm_flags = %#x for reset of " "handle %#04x! This should not happen!\n", __func__, tm->cm_flags, req->DevHandle); mprsas_set_ccbstatus(ccb, CAM_REQ_CMP_ERR); goto bailout; } mpr_dprint(sc, MPR_XINFO, "%s: IOCStatus = 0x%x ResponseCode = 0x%x\n", __func__, le16toh(resp->IOCStatus), le32toh(resp->ResponseCode)); if (le32toh(resp->ResponseCode) == MPI2_SCSITASKMGMT_RSP_TM_COMPLETE) { mprsas_set_ccbstatus(ccb, CAM_REQ_CMP); mprsas_announce_reset(sc, AC_SENT_BDR, tm->cm_targ->tid, CAM_LUN_WILDCARD); } else mprsas_set_ccbstatus(ccb, CAM_REQ_CMP_ERR); bailout: mprsas_free_tm(sc, tm); xpt_done(ccb); } static void mprsas_poll(struct cam_sim *sim) { struct mprsas_softc *sassc; sassc = cam_sim_softc(sim); if (sassc->sc->mpr_debug & MPR_TRACE) { /* frequent debug messages during a panic just slow * everything down too much. */ mpr_dprint(sassc->sc, MPR_XINFO, "%s clearing MPR_TRACE\n", __func__); sassc->sc->mpr_debug &= ~MPR_TRACE; } mpr_intr_locked(sassc->sc); } static void mprsas_async(void *callback_arg, uint32_t code, struct cam_path *path, void *arg) { struct mpr_softc *sc; sc = (struct mpr_softc *)callback_arg; switch (code) { #if (__FreeBSD_version >= 1000006) || \ ((__FreeBSD_version >= 901503) && (__FreeBSD_version < 1000000)) case AC_ADVINFO_CHANGED: { struct mprsas_target *target; struct mprsas_softc *sassc; struct scsi_read_capacity_data_long rcap_buf; struct ccb_dev_advinfo cdai; struct mprsas_lun *lun; lun_id_t lunid; int found_lun; uintptr_t buftype; buftype = (uintptr_t)arg; found_lun = 0; sassc = sc->sassc; /* * We're only interested in read capacity data changes. */ if (buftype != CDAI_TYPE_RCAPLONG) break; /* * See the comment in mpr_attach_sas() for a detailed * explanation. In these versions of FreeBSD we register * for all events and filter out the events that don't * apply to us. */ #if (__FreeBSD_version < 1000703) || \ ((__FreeBSD_version >= 1100000) && (__FreeBSD_version < 1100002)) if (xpt_path_path_id(path) != sassc->sim->path_id) break; #endif /* * We should have a handle for this, but check to make sure. 
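 */

/*
 * A sketch of the EEDP decision mirrored by the protection-type
 * switch below: a LUN counts as EEDP-formatted when READ CAPACITY(16)
 * reports PROT_EN set and protection type 1 or 3 (type 2 uses 32-byte
 * CDBs and is treated as not formatted here).  The constants are
 * illustrative stand-ins for the SRC16_* definitions; SPC encodes
 * type N in the P_TYPE field as N - 1.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define	DEMO_PROT_EN	0x01	/* byte 12, bit 0 of the RC16 data */
#define	DEMO_P_TYPE	0x0e	/* byte 12, bits 3:1 */

static bool
eedp_formatted(uint8_t prot_byte)
{
	int p_type;

	if ((prot_byte & DEMO_PROT_EN) == 0)
		return (false);
	p_type = (prot_byte & DEMO_P_TYPE) >> 1;
	return (p_type == 0 || p_type == 2);	/* types 1 and 3 */
}

int
main(void)
{
	printf("%d %d %d\n",
	    eedp_formatted(0x01),	/* type 1, protection on */
	    eedp_formatted(0x03),	/* type 2, protection on */
	    eedp_formatted(0x00));	/* protection off */
	return (0);
}

/*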
*/ KASSERT(xpt_path_target_id(path) < sassc->maxtargets, ("Target %d out of bounds in mprsas_async\n", xpt_path_target_id(path))); target = &sassc->targets[xpt_path_target_id(path)]; if (target->handle == 0) break; lunid = xpt_path_lun_id(path); SLIST_FOREACH(lun, &target->luns, lun_link) { if (lun->lun_id == lunid) { found_lun = 1; break; } } if (found_lun == 0) { lun = malloc(sizeof(struct mprsas_lun), M_MPR, M_NOWAIT | M_ZERO); if (lun == NULL) { mpr_dprint(sc, MPR_ERROR, "Unable to alloc " "LUN for EEDP support.\n"); break; } lun->lun_id = lunid; SLIST_INSERT_HEAD(&target->luns, lun, lun_link); } bzero(&rcap_buf, sizeof(rcap_buf)); xpt_setup_ccb(&cdai.ccb_h, path, CAM_PRIORITY_NORMAL); cdai.ccb_h.func_code = XPT_DEV_ADVINFO; cdai.ccb_h.flags = CAM_DIR_IN; cdai.buftype = CDAI_TYPE_RCAPLONG; #if (__FreeBSD_version >= 1100061) || \ ((__FreeBSD_version >= 1001510) && (__FreeBSD_version < 1100000)) cdai.flags = CDAI_FLAG_NONE; #else cdai.flags = 0; #endif cdai.bufsiz = sizeof(rcap_buf); cdai.buf = (uint8_t *)&rcap_buf; xpt_action((union ccb *)&cdai); if ((cdai.ccb_h.status & CAM_DEV_QFRZN) != 0) cam_release_devq(cdai.ccb_h.path, 0, 0, 0, FALSE); if ((mprsas_get_ccbstatus((union ccb *)&cdai) == CAM_REQ_CMP) && (rcap_buf.prot & SRC16_PROT_EN)) { switch (rcap_buf.prot & SRC16_P_TYPE) { case SRC16_PTYPE_1: case SRC16_PTYPE_3: lun->eedp_formatted = TRUE; lun->eedp_block_size = scsi_4btoul(rcap_buf.length); break; case SRC16_PTYPE_2: default: lun->eedp_formatted = FALSE; lun->eedp_block_size = 0; break; } } else { lun->eedp_formatted = FALSE; lun->eedp_block_size = 0; } break; } #endif case AC_FOUND_DEVICE: { struct ccb_getdev *cgd; /* * See the comment in mpr_attach_sas() for a detailed * explanation. In these versions of FreeBSD we register * for all events and filter out the events that don't * apply to us. */ #if (__FreeBSD_version < 1000703) || \ ((__FreeBSD_version >= 1100000) && (__FreeBSD_version < 1100002)) if (xpt_path_path_id(path) != sc->sassc->sim->path_id) break; #endif cgd = arg; #if (__FreeBSD_version < 901503) || \ ((__FreeBSD_version >= 1000000) && (__FreeBSD_version < 1000006)) mprsas_check_eedp(sc, path, cgd); #endif break; } default: break; } } #if (__FreeBSD_version < 901503) || \ ((__FreeBSD_version >= 1000000) && (__FreeBSD_version < 1000006)) static void mprsas_check_eedp(struct mpr_softc *sc, struct cam_path *path, struct ccb_getdev *cgd) { struct mprsas_softc *sassc = sc->sassc; struct ccb_scsiio *csio; struct scsi_read_capacity_16 *scsi_cmd; struct scsi_read_capacity_eedp *rcap_buf; path_id_t pathid; target_id_t targetid; lun_id_t lunid; union ccb *ccb; struct cam_path *local_path; struct mprsas_target *target; struct mprsas_lun *lun; uint8_t found_lun; char path_str[64]; pathid = cam_sim_path(sassc->sim); targetid = xpt_path_target_id(path); lunid = xpt_path_lun_id(path); KASSERT(targetid < sassc->maxtargets, ("Target %d out of bounds in " "mprsas_check_eedp\n", targetid)); target = &sassc->targets[targetid]; if (target->handle == 0x0) return; /* * Determine if the device is EEDP capable. * * If this flag is set in the inquiry data, the device supports * protection information, and must support the 16 byte read capacity * command, otherwise continue without sending read cap 16. */ if ((cgd->inq_data.spc3_flags & SPC3_SID_PROTECT) == 0) return; /* * Issue a READ CAPACITY 16 command. This info is used to determine if * the LUN is formatted for EEDP support. 
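 *
 * "Formatted for EEDP" is then decided from the returned protection
 * byte: PROT_EN must be set and the protection type must be 1 or 3;
 * type 2 (and anything reserved) is treated as unformatted. In
 * sketch form, mirroring the AC_ADVINFO_CHANGED path above:
 *
 *	if ((rcap.prot & SRC16_PROT_EN) == 0)
 *		eedp_formatted = FALSE;
 *	else switch (rcap.prot & SRC16_P_TYPE) {
 *	case SRC16_PTYPE_1:
 *	case SRC16_PTYPE_3:
 *		eedp_formatted = TRUE;
 *		break;
 *	default:			-- SRC16_PTYPE_2, reserved values
 *		eedp_formatted = FALSE;
 *	}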
*/ ccb = xpt_alloc_ccb_nowait(); if (ccb == NULL) { mpr_dprint(sc, MPR_ERROR, "Unable to alloc CCB for EEDP " "support.\n"); return; } if (xpt_create_path(&local_path, xpt_periph, pathid, targetid, lunid) != CAM_REQ_CMP) { mpr_dprint(sc, MPR_ERROR, "Unable to create path for EEDP " "support.\n"); xpt_free_ccb(ccb); return; } /* * If LUN is already in list, don't create a new one. */ found_lun = FALSE; SLIST_FOREACH(lun, &target->luns, lun_link) { if (lun->lun_id == lunid) { found_lun = TRUE; break; } } if (!found_lun) { lun = malloc(sizeof(struct mprsas_lun), M_MPR, M_NOWAIT | M_ZERO); if (lun == NULL) { mpr_dprint(sc, MPR_ERROR, "Unable to alloc LUN for " "EEDP support.\n"); xpt_free_path(local_path); xpt_free_ccb(ccb); return; } lun->lun_id = lunid; SLIST_INSERT_HEAD(&target->luns, lun, lun_link); } xpt_path_string(local_path, path_str, sizeof(path_str)); mpr_dprint(sc, MPR_INFO, "Sending read cap: path %s handle %d\n", path_str, target->handle); /* * Issue a READ CAPACITY 16 command for the LUN. The * mprsas_read_cap_done function will load the read cap info into the * LUN struct. */ rcap_buf = malloc(sizeof(struct scsi_read_capacity_eedp), M_MPR, M_NOWAIT | M_ZERO); if (rcap_buf == NULL) { mpr_dprint(sc, MPR_ERROR, "Unable to alloc read capacity " "buffer for EEDP support.\n"); xpt_free_path(ccb->ccb_h.path); xpt_free_ccb(ccb); return; } xpt_setup_ccb(&ccb->ccb_h, local_path, CAM_PRIORITY_XPT); csio = &ccb->csio; csio->ccb_h.func_code = XPT_SCSI_IO; csio->ccb_h.flags = CAM_DIR_IN; csio->ccb_h.retry_count = 4; csio->ccb_h.cbfcnp = mprsas_read_cap_done; csio->ccb_h.timeout = 60000; csio->data_ptr = (uint8_t *)rcap_buf; csio->dxfer_len = sizeof(struct scsi_read_capacity_eedp); csio->sense_len = MPR_SENSE_LEN; csio->cdb_len = sizeof(*scsi_cmd); csio->tag_action = MSG_SIMPLE_Q_TAG; scsi_cmd = (struct scsi_read_capacity_16 *)&csio->cdb_io.cdb_bytes; bzero(scsi_cmd, sizeof(*scsi_cmd)); scsi_cmd->opcode = 0x9E; scsi_cmd->service_action = SRC16_SERVICE_ACTION; ((uint8_t *)scsi_cmd)[13] = sizeof(struct scsi_read_capacity_eedp); ccb->ccb_h.ppriv_ptr1 = sassc; xpt_action(ccb); } static void mprsas_read_cap_done(struct cam_periph *periph, union ccb *done_ccb) { struct mprsas_softc *sassc; struct mprsas_target *target; struct mprsas_lun *lun; struct scsi_read_capacity_eedp *rcap_buf; if (done_ccb == NULL) return; /* * The driver needs to release the devq if the SCSI command was * generated internally by the driver. Currently there is a single * place where the driver issues a SCSI command internally. If the * driver issues more internal SCSI commands in the future, it must * release the devq itself, since those commands will not go back * through cam_periph. */ if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN)) { done_ccb->ccb_h.status &= ~CAM_DEV_QFRZN; xpt_release_devq(done_ccb->ccb_h.path, /*count*/ 1, /*run_queue*/TRUE); } rcap_buf = (struct scsi_read_capacity_eedp *)done_ccb->csio.data_ptr; /* * Get the LUN ID for the path and look it up in the LUN list for the * target. */ sassc = (struct mprsas_softc *)done_ccb->ccb_h.ppriv_ptr1; KASSERT(done_ccb->ccb_h.target_id < sassc->maxtargets, ("Target %d out " "of bounds in mprsas_read_cap_done\n", done_ccb->ccb_h.target_id)); target = &sassc->targets[done_ccb->ccb_h.target_id]; SLIST_FOREACH(lun, &target->luns, lun_link) { if (lun->lun_id != done_ccb->ccb_h.target_lun) continue; /* * Got the LUN in the target's LUN list. Fill it in with EEDP * info.
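 *
 * (For reference, the READ CAPACITY (16) CDB assembled in
 * mprsas_check_eedp() above reduces to these byte assignments;
 * 0x9E is SERVICE ACTION IN(16), 0x10 is the READ CAPACITY(16)
 * service action, and the allocation length lives in bytes 10-13,
 * big-endian -- the driver only needs byte 13 for so small a
 * buffer:
 *
 *	cdb[0]  = 0x9e;		-- SERVICE ACTION IN(16)
 *	cdb[1]  = 0x10;		-- READ CAPACITY(16)
 *	cdb[13] = sizeof(struct scsi_read_capacity_eedp);
 *
 * with bytes 2-12 left zero for a capacity query.)
 *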
If the READ CAP 16 command had some SCSI error (common * if command is not supported), mark the lun as not supporting * EEDP and set the block size to 0. */ if ((mprsas_get_ccbstatus(done_ccb) != CAM_REQ_CMP) || (done_ccb->csio.scsi_status != SCSI_STATUS_OK)) { lun->eedp_formatted = FALSE; lun->eedp_block_size = 0; break; } if (rcap_buf->protect & 0x01) { mpr_dprint(sassc->sc, MPR_INFO, "LUN %d for target ID " "%d is formatted for EEDP support.\n", done_ccb->ccb_h.target_lun, done_ccb->ccb_h.target_id); lun->eedp_formatted = TRUE; lun->eedp_block_size = scsi_4btoul(rcap_buf->length); } break; } // Finished with this CCB and path. free(rcap_buf, M_MPR); xpt_free_path(done_ccb->ccb_h.path); xpt_free_ccb(done_ccb); } #endif /* (__FreeBSD_version < 901503) || \ ((__FreeBSD_version >= 1000000) && (__FreeBSD_version < 1000006)) */ void mprsas_prepare_for_tm(struct mpr_softc *sc, struct mpr_command *tm, struct mprsas_target *target, lun_id_t lun_id) { union ccb *ccb; path_id_t path_id; /* * Set the INRESET flag for this target so that no I/O will be sent to * the target until the reset has completed. If an I/O request does * happen, the devq will be frozen. The CCB holds the path which is * used to release the devq. The devq is released and the CCB is freed * when the TM completes. */ ccb = xpt_alloc_ccb_nowait(); if (ccb) { path_id = cam_sim_path(sc->sassc->sim); if (xpt_create_path(&ccb->ccb_h.path, xpt_periph, path_id, target->tid, lun_id) != CAM_REQ_CMP) { xpt_free_ccb(ccb); } else { tm->cm_ccb = ccb; tm->cm_targ = target; target->flags |= MPRSAS_TARGET_INRESET; } } } int mprsas_startup(struct mpr_softc *sc) { /* * Send the port enable message and set the wait_for_port_enable flag. * This flag helps to keep the simq frozen until all discovery events * are processed. */ sc->wait_for_port_enable = 1; mprsas_send_portenable(sc); return (0); } static int mprsas_send_portenable(struct mpr_softc *sc) { MPI2_PORT_ENABLE_REQUEST *request; struct mpr_command *cm; MPR_FUNCTRACE(sc); if ((cm = mpr_alloc_command(sc)) == NULL) return (EBUSY); request = (MPI2_PORT_ENABLE_REQUEST *)cm->cm_req; request->Function = MPI2_FUNCTION_PORT_ENABLE; request->MsgFlags = 0; request->VP_ID = 0; cm->cm_desc.Default.RequestFlags = MPI2_REQ_DESCRIPT_FLAGS_DEFAULT_TYPE; cm->cm_complete = mprsas_portenable_complete; cm->cm_data = NULL; cm->cm_sge = NULL; mpr_map_command(sc, cm); mpr_dprint(sc, MPR_XINFO, "mpr_send_portenable finished cm %p req %p complete %p\n", cm, cm->cm_req, cm->cm_complete); return (0); } static void mprsas_portenable_complete(struct mpr_softc *sc, struct mpr_command *cm) { MPI2_PORT_ENABLE_REPLY *reply; struct mprsas_softc *sassc; MPR_FUNCTRACE(sc); sassc = sc->sassc; /* * Currently there should be no way we can hit this case. It only * happens when we have a failure to allocate chain frames, and * port enable commands don't have S/G lists. */ if ((cm->cm_flags & MPR_CM_FLAGS_ERROR_MASK) != 0) { mpr_dprint(sc, MPR_ERROR, "%s: cm_flags = %#x for port enable! " "This should not happen!\n", __func__, cm->cm_flags); } reply = (MPI2_PORT_ENABLE_REPLY *)cm->cm_reply; if (reply == NULL) mpr_dprint(sc, MPR_FAULT, "Portenable NULL reply\n"); else if (le16toh(reply->IOCStatus & MPI2_IOCSTATUS_MASK) != MPI2_IOCSTATUS_SUCCESS) mpr_dprint(sc, MPR_FAULT, "Portenable failed\n"); mpr_free_command(sc, cm); /* * Done waiting for port enable to complete. Decrement the refcount. * If refcount is 0, discovery is complete and a rescan of the bus can * take place. 
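 *
 * The handshake pattern, in sketch form: the completion side (below)
 * publishes the flag and wakes sleepers, while a waiter elsewhere
 * would block along these lines (msleep(9) shown illustratively with
 * the driver's own lock):
 *
 *	-- completion side:
 *	sc->port_enable_complete = 1;
 *	wakeup(&sc->port_enable_complete);
 *
 *	-- waiting side:
 *	while (sc->port_enable_complete == 0)
 *		msleep(&sc->port_enable_complete, &sc->mpr_mtx, 0,
 *		    "mprpe", hz);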
*/ sc->wait_for_port_enable = 0; sc->port_enable_complete = 1; wakeup(&sc->port_enable_complete); mprsas_startup_decrement(sassc); } int mprsas_check_id(struct mprsas_softc *sassc, int id) { struct mpr_softc *sc = sassc->sc; char *ids; char *name; ids = &sc->exclude_ids[0]; while((name = strsep(&ids, ",")) != NULL) { if (name[0] == '\0') continue; if (strtol(name, NULL, 0) == (long)id) return (1); } return (0); } void mprsas_realloc_targets(struct mpr_softc *sc, int maxtargets) { struct mprsas_softc *sassc; struct mprsas_lun *lun, *lun_tmp; struct mprsas_target *targ; int i; sassc = sc->sassc; /* * The number of targets is based on IOC Facts, so free all of * the allocated LUNs for each target and then the target buffer * itself. */ for (i=0; i< maxtargets; i++) { targ = &sassc->targets[i]; SLIST_FOREACH_SAFE(lun, &targ->luns, lun_link, lun_tmp) { free(lun, M_MPR); } } free(sassc->targets, M_MPR); sassc->targets = malloc(sizeof(struct mprsas_target) * maxtargets, M_MPR, M_WAITOK|M_ZERO); if (!sassc->targets) { panic("%s failed to alloc targets with error %d\n", __func__, ENOMEM); } } Index: head/sys/dev/nvme/nvme.c =================================================================== --- head/sys/dev/nvme/nvme.c (revision 338181) +++ head/sys/dev/nvme/nvme.c (revision 338182) @@ -1,506 +1,502 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (C) 2012-2014 Intel Corporation * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
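*/

/*
 * A standalone sketch of the exclude-list test implemented by
 * mprsas_check_id() above. strsep(3) is destructive -- it replaces
 * each separator with a NUL in the buffer it walks -- so this
 * version, unlike the driver's, parses a private copy. The list
 * format is comma-separated numeric target IDs, e.g. "2,7,13".
 */
#include <stdlib.h>
#include <string.h>

static int
id_is_excluded(const char *exclude_ids, int id)
{
	char *copy, *ids, *name;
	int rv = 0;

	if (exclude_ids == NULL || (copy = strdup(exclude_ids)) == NULL)
		return (0);
	ids = copy;
	while ((name = strsep(&ids, ",")) != NULL) {
		if (name[0] == '\0')
			continue;	/* tolerate empty fields */
		if (strtol(name, NULL, 0) == (long)id) {
			rv = 1;
			break;
		}
	}
	free(copy);
	return (rv);
}

/*
 * Note that the in-kernel version scans sc->exclude_ids in place, so
 * the NULs written by strsep(3) persist across calls.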
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include "nvme_private.h" struct nvme_consumer { uint32_t id; nvme_cons_ns_fn_t ns_fn; nvme_cons_ctrlr_fn_t ctrlr_fn; nvme_cons_async_fn_t async_fn; nvme_cons_fail_fn_t fail_fn; }; struct nvme_consumer nvme_consumer[NVME_MAX_CONSUMERS]; #define INVALID_CONSUMER_ID 0xFFFF uma_zone_t nvme_request_zone; int32_t nvme_retry_count; MALLOC_DEFINE(M_NVME, "nvme", "nvme(4) memory allocations"); static int nvme_probe(device_t); static int nvme_attach(device_t); static int nvme_detach(device_t); static int nvme_shutdown(device_t); static int nvme_modevent(module_t mod, int type, void *arg); static devclass_t nvme_devclass; static device_method_t nvme_pci_methods[] = { /* Device interface */ DEVMETHOD(device_probe, nvme_probe), DEVMETHOD(device_attach, nvme_attach), DEVMETHOD(device_detach, nvme_detach), DEVMETHOD(device_shutdown, nvme_shutdown), { 0, 0 } }; static driver_t nvme_pci_driver = { "nvme", nvme_pci_methods, sizeof(struct nvme_controller), }; DRIVER_MODULE(nvme, pci, nvme_pci_driver, nvme_devclass, nvme_modevent, 0); MODULE_VERSION(nvme, 1); MODULE_DEPEND(nvme, cam, 1, 1, 1); static struct _pcsid { uint32_t devid; int match_subdevice; uint16_t subdevice; const char *desc; uint32_t quirks; } pci_ids[] = { { 0x01118086, 0, 0, "NVMe Controller" }, { IDT32_PCI_ID, 0, 0, "IDT NVMe Controller (32 channel)" }, { IDT8_PCI_ID, 0, 0, "IDT NVMe Controller (8 channel)" }, { 0x09538086, 1, 0x3702, "DC P3700 SSD" }, { 0x09538086, 1, 0x3703, "DC P3700 SSD [2.5\" SFF]" }, { 0x09538086, 1, 0x3704, "DC P3500 SSD [Add-in Card]" }, { 0x09538086, 1, 0x3705, "DC P3500 SSD [2.5\" SFF]" }, { 0x09538086, 1, 0x3709, "DC P3600 SSD [Add-in Card]" }, { 0x09538086, 1, 0x370a, "DC P3600 SSD [2.5\" SFF]" }, { 0x00031c58, 0, 0, "HGST SN100", QUIRK_DELAY_B4_CHK_RDY }, { 0x00231c58, 0, 0, "WDC SN200", QUIRK_DELAY_B4_CHK_RDY }, { 0x05401c5f, 0, 0, "Memblaze Pblaze4", QUIRK_DELAY_B4_CHK_RDY }, { 0xa821144d, 0, 0, "Samsung PM1725", QUIRK_DELAY_B4_CHK_RDY }, { 0xa822144d, 0, 0, "Samsung PM1725a", QUIRK_DELAY_B4_CHK_RDY }, { 0x00000000, 0, 0, NULL } }; static int nvme_match(uint32_t devid, uint16_t subdevice, struct _pcsid *ep) { if (devid != ep->devid) return 0; if (!ep->match_subdevice) return 1; if (subdevice == ep->subdevice) return 1; else return 0; } static int nvme_probe (device_t device) { struct _pcsid *ep; uint32_t devid; uint16_t subdevice; devid = pci_get_devid(device); subdevice = pci_get_subdevice(device); ep = pci_ids; while (ep->devid) { if (nvme_match(devid, subdevice, ep)) break; ++ep; } if (ep->desc) { device_set_desc(device, ep->desc); return (BUS_PROBE_DEFAULT); } #if defined(PCIS_STORAGE_NVM) if (pci_get_class(device) == PCIC_STORAGE && pci_get_subclass(device) == PCIS_STORAGE_NVM && pci_get_progif(device) == PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0) { device_set_desc(device, "Generic NVMe Device"); return (BUS_PROBE_GENERIC); } #endif return (ENXIO); } static void nvme_init(void) { uint32_t i; nvme_request_zone = uma_zcreate("nvme_request", sizeof(struct nvme_request), NULL, NULL, NULL, NULL, 0, 0); for (i = 0; i < NVME_MAX_CONSUMERS; i++) nvme_consumer[i].id = INVALID_CONSUMER_ID; } SYSINIT(nvme_register, SI_SUB_DRIVERS, SI_ORDER_SECOND, nvme_init, NULL); static void nvme_uninit(void) { uma_zdestroy(nvme_request_zone); } SYSUNINIT(nvme_unregister, SI_SUB_DRIVERS, SI_ORDER_SECOND, nvme_uninit, NULL); static void nvme_load(void) { } static void nvme_unload(void) { } static int nvme_shutdown(device_t dev) { struct 
nvme_controller *ctrlr; ctrlr = DEVICE2SOFTC(dev); nvme_ctrlr_shutdown(ctrlr); return (0); } static int nvme_modevent(module_t mod, int type, void *arg) { switch (type) { case MOD_LOAD: nvme_load(); break; case MOD_UNLOAD: nvme_unload(); break; default: break; } return (0); } void nvme_dump_command(struct nvme_command *cmd) { - uint8_t opc, fuse; - opc = (cmd->opc_fuse >> NVME_CMD_OPC_SHIFT) & NVME_CMD_OPC_MASK; - fuse = (cmd->opc_fuse >> NVME_CMD_FUSE_SHIFT) & NVME_CMD_FUSE_MASK; - printf( "opc:%x f:%x cid:%x nsid:%x r2:%x r3:%x mptr:%jx prp1:%jx prp2:%jx cdw:%x %x %x %x %x %x\n", - opc, fuse, cmd->cid, le32toh(cmd->nsid), + cmd->opc, cmd->fuse, cmd->cid, le32toh(cmd->nsid), cmd->rsvd2, cmd->rsvd3, (uintmax_t)le64toh(cmd->mptr), (uintmax_t)le64toh(cmd->prp1), (uintmax_t)le64toh(cmd->prp2), le32toh(cmd->cdw10), le32toh(cmd->cdw11), le32toh(cmd->cdw12), le32toh(cmd->cdw13), le32toh(cmd->cdw14), le32toh(cmd->cdw15)); } void nvme_dump_completion(struct nvme_completion *cpl) { uint8_t p, sc, sct, m, dnr; uint16_t status; status = le16toh(cpl->status); p = NVME_STATUS_GET_P(status); sc = NVME_STATUS_GET_SC(status); sct = NVME_STATUS_GET_SCT(status); m = NVME_STATUS_GET_M(status); dnr = NVME_STATUS_GET_DNR(status); printf("cdw0:%08x sqhd:%04x sqid:%04x " "cid:%04x p:%x sc:%02x sct:%x m:%x dnr:%x\n", le32toh(cpl->cdw0), le16toh(cpl->sqhd), le16toh(cpl->sqid), cpl->cid, p, sc, sct, m, dnr); } static int nvme_attach(device_t dev) { struct nvme_controller *ctrlr = DEVICE2SOFTC(dev); int status; struct _pcsid *ep; uint32_t devid; uint16_t subdevice; devid = pci_get_devid(dev); subdevice = pci_get_subdevice(dev); ep = pci_ids; while (ep->devid) { if (nvme_match(devid, subdevice, ep)) break; ++ep; } ctrlr->quirks = ep->quirks; status = nvme_ctrlr_construct(ctrlr, dev); if (status != 0) { nvme_ctrlr_destruct(ctrlr, dev); return (status); } /* * Enable busmastering so the completion status messages can * be busmastered back to the host. */ pci_enable_busmaster(dev); /* * Reset controller twice to ensure we do a transition from cc.en==1 * to cc.en==0. This is because we don't really know what status * the controller was left in when boot handed off to OS. */ status = nvme_ctrlr_hw_reset(ctrlr); if (status != 0) { nvme_ctrlr_destruct(ctrlr, dev); return (status); } status = nvme_ctrlr_hw_reset(ctrlr); if (status != 0) { nvme_ctrlr_destruct(ctrlr, dev); return (status); } ctrlr->config_hook.ich_func = nvme_ctrlr_start_config_hook; ctrlr->config_hook.ich_arg = ctrlr; config_intrhook_establish(&ctrlr->config_hook); return (0); } static int nvme_detach (device_t dev) { struct nvme_controller *ctrlr = DEVICE2SOFTC(dev); nvme_ctrlr_destruct(ctrlr, dev); pci_disable_busmaster(dev); return (0); } static void nvme_notify(struct nvme_consumer *cons, struct nvme_controller *ctrlr) { struct nvme_namespace *ns; void *ctrlr_cookie; int cmpset, ns_idx; /* * The consumer may register itself after the nvme devices * have registered with the kernel, but before the * driver has completed initialization. In that case, * return here, and when initialization completes, the * controller will make sure the consumer gets notified. 
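 *
 * The "exactly once" behavior below rests on an atomic
 * compare-and-set rather than a lock: whichever path flips
 * notification_sent from 0 to 1 -- a late-registering consumer or
 * the end of controller initialization -- performs the callbacks,
 * and the loser simply returns. Condensed:
 *
 *	if (!ctrlr->is_initialized)
 *		return;		-- too early; retried at init completion
 *	if (atomic_cmpset_32(&ctrlr->notification_sent, 0, 1) == 0)
 *		return;		-- lost the race: already notified
 *	... invoke ctrlr_fn, then fail_fn or the per-namespace ns_fn ...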
*/ if (!ctrlr->is_initialized) return; cmpset = atomic_cmpset_32(&ctrlr->notification_sent, 0, 1); if (cmpset == 0) return; if (cons->ctrlr_fn != NULL) ctrlr_cookie = (*cons->ctrlr_fn)(ctrlr); else ctrlr_cookie = NULL; ctrlr->cons_cookie[cons->id] = ctrlr_cookie; if (ctrlr->is_failed) { if (cons->fail_fn != NULL) (*cons->fail_fn)(ctrlr_cookie); /* * Do not notify consumers about the namespaces of a * failed controller. */ return; } for (ns_idx = 0; ns_idx < min(ctrlr->cdata.nn, NVME_MAX_NAMESPACES); ns_idx++) { ns = &ctrlr->ns[ns_idx]; if (ns->data.nsze == 0) continue; if (cons->ns_fn != NULL) ns->cons_cookie[cons->id] = (*cons->ns_fn)(ns, ctrlr_cookie); } } void nvme_notify_new_controller(struct nvme_controller *ctrlr) { int i; for (i = 0; i < NVME_MAX_CONSUMERS; i++) { if (nvme_consumer[i].id != INVALID_CONSUMER_ID) { nvme_notify(&nvme_consumer[i], ctrlr); } } } static void nvme_notify_new_consumer(struct nvme_consumer *cons) { device_t *devlist; struct nvme_controller *ctrlr; int dev_idx, devcount; if (devclass_get_devices(nvme_devclass, &devlist, &devcount)) return; for (dev_idx = 0; dev_idx < devcount; dev_idx++) { ctrlr = DEVICE2SOFTC(devlist[dev_idx]); nvme_notify(cons, ctrlr); } free(devlist, M_TEMP); } void nvme_notify_async_consumers(struct nvme_controller *ctrlr, const struct nvme_completion *async_cpl, uint32_t log_page_id, void *log_page_buffer, uint32_t log_page_size) { struct nvme_consumer *cons; uint32_t i; for (i = 0; i < NVME_MAX_CONSUMERS; i++) { cons = &nvme_consumer[i]; if (cons->id != INVALID_CONSUMER_ID && cons->async_fn != NULL) (*cons->async_fn)(ctrlr->cons_cookie[i], async_cpl, log_page_id, log_page_buffer, log_page_size); } } void nvme_notify_fail_consumers(struct nvme_controller *ctrlr) { struct nvme_consumer *cons; uint32_t i; /* * This controller failed during initialization (i.e. IDENTIFY * command failed or timed out). Do not notify any nvme * consumers of the failure here, since the consumer does not * even know about the controller yet. */ if (!ctrlr->is_initialized) return; for (i = 0; i < NVME_MAX_CONSUMERS; i++) { cons = &nvme_consumer[i]; if (cons->id != INVALID_CONSUMER_ID && cons->fail_fn != NULL) cons->fail_fn(ctrlr->cons_cookie[i]); } } void nvme_notify_ns(struct nvme_controller *ctrlr, int nsid) { struct nvme_consumer *cons; struct nvme_namespace *ns = &ctrlr->ns[nsid - 1]; uint32_t i; if (!ctrlr->is_initialized) return; for (i = 0; i < NVME_MAX_CONSUMERS; i++) { cons = &nvme_consumer[i]; if (cons->id != INVALID_CONSUMER_ID && cons->ns_fn != NULL) ns->cons_cookie[cons->id] = (*cons->ns_fn)(ns, ctrlr->cons_cookie[cons->id]); } } struct nvme_consumer * nvme_register_consumer(nvme_cons_ns_fn_t ns_fn, nvme_cons_ctrlr_fn_t ctrlr_fn, nvme_cons_async_fn_t async_fn, nvme_cons_fail_fn_t fail_fn) { int i; /* * TODO: add locking around consumer registration. Not an issue * right now since we only have one nvme consumer - nvd(4). 
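 *
 * The race this TODO describes: two concurrent registrations can
 * both observe slot i still set to INVALID_CONSUMER_ID and both
 * claim it, silently losing one consumer. A minimal fix would
 * serialize the scan with a mutex (hypothetical lock name):
 *
 *	mtx_lock(&nvme_consumer_lock);
 *	... find a free slot, fill it in ...
 *	mtx_unlock(&nvme_consumer_lock);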
*/ for (i = 0; i < NVME_MAX_CONSUMERS; i++) if (nvme_consumer[i].id == INVALID_CONSUMER_ID) { nvme_consumer[i].id = i; nvme_consumer[i].ns_fn = ns_fn; nvme_consumer[i].ctrlr_fn = ctrlr_fn; nvme_consumer[i].async_fn = async_fn; nvme_consumer[i].fail_fn = fail_fn; nvme_notify_new_consumer(&nvme_consumer[i]); return (&nvme_consumer[i]); } printf("nvme(4): consumer not registered - no slots available\n"); return (NULL); } void nvme_unregister_consumer(struct nvme_consumer *consumer) { consumer->id = INVALID_CONSUMER_ID; } void nvme_completion_poll_cb(void *arg, const struct nvme_completion *cpl) { struct nvme_completion_poll_status *status = arg; /* * Copy status into the argument passed by the caller, so that * the caller can check the status to determine if the * the request passed or failed. */ memcpy(&status->cpl, cpl, sizeof(*cpl)); atomic_store_rel_int(&status->done, 1); } Index: head/sys/dev/nvme/nvme.h =================================================================== --- head/sys/dev/nvme/nvme.h (revision 338181) +++ head/sys/dev/nvme/nvme.h (revision 338182) @@ -1,1502 +1,1499 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (C) 2012-2013 Intel Corporation * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef __NVME_H__ #define __NVME_H__ #ifdef _KERNEL #include #endif #include #include #define NVME_PASSTHROUGH_CMD _IOWR('n', 0, struct nvme_pt_command) #define NVME_RESET_CONTROLLER _IO('n', 1) #define NVME_IO_TEST _IOWR('n', 100, struct nvme_io_test) #define NVME_BIO_TEST _IOWR('n', 101, struct nvme_io_test) /* * Macros to deal with NVME revisions, as defined VS register */ #define NVME_REV(x, y) (((x) << 16) | ((y) << 8)) #define NVME_MAJOR(r) (((r) >> 16) & 0xffff) #define NVME_MINOR(r) (((r) >> 8) & 0xff) /* * Use to mark a command to apply to all namespaces, or to retrieve global * log pages. */ #define NVME_GLOBAL_NAMESPACE_TAG ((uint32_t)0xFFFFFFFF) /* Cap nvme to 1MB transfers driver explodes with larger sizes */ #define NVME_MAX_XFER_SIZE (MAXPHYS < (1<<20) ? 
MAXPHYS : (1<<20)) /* Register field definitions */ #define NVME_CAP_LO_REG_MQES_SHIFT (0) #define NVME_CAP_LO_REG_MQES_MASK (0xFFFF) #define NVME_CAP_LO_REG_CQR_SHIFT (16) #define NVME_CAP_LO_REG_CQR_MASK (0x1) #define NVME_CAP_LO_REG_AMS_SHIFT (17) #define NVME_CAP_LO_REG_AMS_MASK (0x3) #define NVME_CAP_LO_REG_TO_SHIFT (24) #define NVME_CAP_LO_REG_TO_MASK (0xFF) #define NVME_CAP_HI_REG_DSTRD_SHIFT (0) #define NVME_CAP_HI_REG_DSTRD_MASK (0xF) #define NVME_CAP_HI_REG_CSS_NVM_SHIFT (5) #define NVME_CAP_HI_REG_CSS_NVM_MASK (0x1) #define NVME_CAP_HI_REG_MPSMIN_SHIFT (16) #define NVME_CAP_HI_REG_MPSMIN_MASK (0xF) #define NVME_CAP_HI_REG_MPSMAX_SHIFT (20) #define NVME_CAP_HI_REG_MPSMAX_MASK (0xF) #define NVME_CC_REG_EN_SHIFT (0) #define NVME_CC_REG_EN_MASK (0x1) #define NVME_CC_REG_CSS_SHIFT (4) #define NVME_CC_REG_CSS_MASK (0x7) #define NVME_CC_REG_MPS_SHIFT (7) #define NVME_CC_REG_MPS_MASK (0xF) #define NVME_CC_REG_AMS_SHIFT (11) #define NVME_CC_REG_AMS_MASK (0x7) #define NVME_CC_REG_SHN_SHIFT (14) #define NVME_CC_REG_SHN_MASK (0x3) #define NVME_CC_REG_IOSQES_SHIFT (16) #define NVME_CC_REG_IOSQES_MASK (0xF) #define NVME_CC_REG_IOCQES_SHIFT (20) #define NVME_CC_REG_IOCQES_MASK (0xF) #define NVME_CSTS_REG_RDY_SHIFT (0) #define NVME_CSTS_REG_RDY_MASK (0x1) #define NVME_CSTS_REG_CFS_SHIFT (1) #define NVME_CSTS_REG_CFS_MASK (0x1) #define NVME_CSTS_REG_SHST_SHIFT (2) #define NVME_CSTS_REG_SHST_MASK (0x3) #define NVME_CSTS_GET_SHST(csts) (((csts) >> NVME_CSTS_REG_SHST_SHIFT) & NVME_CSTS_REG_SHST_MASK) #define NVME_AQA_REG_ASQS_SHIFT (0) #define NVME_AQA_REG_ASQS_MASK (0xFFF) #define NVME_AQA_REG_ACQS_SHIFT (16) #define NVME_AQA_REG_ACQS_MASK (0xFFF) /* Command field definitions */ -#define NVME_CMD_OPC_SHIFT (0) -#define NVME_CMD_OPC_MASK (0xFF) #define NVME_CMD_FUSE_SHIFT (8) #define NVME_CMD_FUSE_MASK (0x3) -#define NVME_CMD_SET_OPC(opc) (htole16(((uint16_t)(opc) & NVME_CMD_OPC_MASK) << NVME_CMD_OPC_SHIFT)) - #define NVME_STATUS_P_SHIFT (0) #define NVME_STATUS_P_MASK (0x1) #define NVME_STATUS_SC_SHIFT (1) #define NVME_STATUS_SC_MASK (0xFF) #define NVME_STATUS_SCT_SHIFT (9) #define NVME_STATUS_SCT_MASK (0x7) #define NVME_STATUS_M_SHIFT (14) #define NVME_STATUS_M_MASK (0x1) #define NVME_STATUS_DNR_SHIFT (15) #define NVME_STATUS_DNR_MASK (0x1) #define NVME_STATUS_GET_P(st) (((st) >> NVME_STATUS_P_SHIFT) & NVME_STATUS_P_MASK) #define NVME_STATUS_GET_SC(st) (((st) >> NVME_STATUS_SC_SHIFT) & NVME_STATUS_SC_MASK) #define NVME_STATUS_GET_SCT(st) (((st) >> NVME_STATUS_SCT_SHIFT) & NVME_STATUS_SCT_MASK) #define NVME_STATUS_GET_M(st) (((st) >> NVME_STATUS_M_SHIFT) & NVME_STATUS_M_MASK) #define NVME_STATUS_GET_DNR(st) (((st) >> NVME_STATUS_DNR_SHIFT) & NVME_STATUS_DNR_MASK) #define NVME_PWR_ST_MPS_SHIFT (0) #define NVME_PWR_ST_MPS_MASK (0x1) #define NVME_PWR_ST_NOPS_SHIFT (1) #define NVME_PWR_ST_NOPS_MASK (0x1) #define NVME_PWR_ST_RRT_SHIFT (0) #define NVME_PWR_ST_RRT_MASK (0x1F) #define NVME_PWR_ST_RRL_SHIFT (0) #define NVME_PWR_ST_RRL_MASK (0x1F) #define NVME_PWR_ST_RWT_SHIFT (0) #define NVME_PWR_ST_RWT_MASK (0x1F) #define NVME_PWR_ST_RWL_SHIFT (0) #define NVME_PWR_ST_RWL_MASK (0x1F) #define NVME_PWR_ST_IPS_SHIFT (6) #define NVME_PWR_ST_IPS_MASK (0x3) #define NVME_PWR_ST_APW_SHIFT (0) #define NVME_PWR_ST_APW_MASK (0x7) #define NVME_PWR_ST_APS_SHIFT (6) #define NVME_PWR_ST_APS_MASK (0x3) /** Controller Multi-path I/O and Namespace Sharing Capabilities */ /* More then one port */ #define NVME_CTRLR_DATA_MIC_MPORTS_SHIFT (0) #define NVME_CTRLR_DATA_MIC_MPORTS_MASK (0x1) /* More then one controller */ #define 
NVME_CTRLR_DATA_MIC_MCTRLRS_SHIFT (1) #define NVME_CTRLR_DATA_MIC_MCTRLRS_MASK (0x1) /* SR-IOV Virtual Function */ #define NVME_CTRLR_DATA_MIC_SRIOVVF_SHIFT (2) #define NVME_CTRLR_DATA_MIC_SRIOVVF_MASK (0x1) /** OACS - optional admin command support */ /* supports security send/receive commands */ #define NVME_CTRLR_DATA_OACS_SECURITY_SHIFT (0) #define NVME_CTRLR_DATA_OACS_SECURITY_MASK (0x1) /* supports format nvm command */ #define NVME_CTRLR_DATA_OACS_FORMAT_SHIFT (1) #define NVME_CTRLR_DATA_OACS_FORMAT_MASK (0x1) /* supports firmware activate/download commands */ #define NVME_CTRLR_DATA_OACS_FIRMWARE_SHIFT (2) #define NVME_CTRLR_DATA_OACS_FIRMWARE_MASK (0x1) /* supports namespace management commands */ #define NVME_CTRLR_DATA_OACS_NSMGMT_SHIFT (3) #define NVME_CTRLR_DATA_OACS_NSMGMT_MASK (0x1) /* supports Device Self-test command */ #define NVME_CTRLR_DATA_OACS_SELFTEST_SHIFT (4) #define NVME_CTRLR_DATA_OACS_SELFTEST_MASK (0x1) /* supports Directives */ #define NVME_CTRLR_DATA_OACS_DIRECTIVES_SHIFT (5) #define NVME_CTRLR_DATA_OACS_DIRECTIVES_MASK (0x1) /* supports NVMe-MI Send/Receive */ #define NVME_CTRLR_DATA_OACS_NVMEMI_SHIFT (6) #define NVME_CTRLR_DATA_OACS_NVMEMI_MASK (0x1) /* supports Virtualization Management */ #define NVME_CTRLR_DATA_OACS_VM_SHIFT (7) #define NVME_CTRLR_DATA_OACS_VM_MASK (0x1) /* supports Doorbell Buffer Config */ #define NVME_CTRLR_DATA_OACS_DBBUFFER_SHIFT (8) #define NVME_CTRLR_DATA_OACS_DBBUFFER_MASK (0x1) /** firmware updates */ /* first slot is read-only */ #define NVME_CTRLR_DATA_FRMW_SLOT1_RO_SHIFT (0) #define NVME_CTRLR_DATA_FRMW_SLOT1_RO_MASK (0x1) /* number of firmware slots */ #define NVME_CTRLR_DATA_FRMW_NUM_SLOTS_SHIFT (1) #define NVME_CTRLR_DATA_FRMW_NUM_SLOTS_MASK (0x7) /** log page attributes */ /* per namespace smart/health log page */ #define NVME_CTRLR_DATA_LPA_NS_SMART_SHIFT (0) #define NVME_CTRLR_DATA_LPA_NS_SMART_MASK (0x1) /** AVSCC - admin vendor specific command configuration */ /* admin vendor specific commands use spec format */ #define NVME_CTRLR_DATA_AVSCC_SPEC_FORMAT_SHIFT (0) #define NVME_CTRLR_DATA_AVSCC_SPEC_FORMAT_MASK (0x1) /** Autonomous Power State Transition Attributes */ /* Autonomous Power State Transitions supported */ #define NVME_CTRLR_DATA_APSTA_APST_SUPP_SHIFT (0) #define NVME_CTRLR_DATA_APSTA_APST_SUPP_MASK (0x1) /** submission queue entry size */ #define NVME_CTRLR_DATA_SQES_MIN_SHIFT (0) #define NVME_CTRLR_DATA_SQES_MIN_MASK (0xF) #define NVME_CTRLR_DATA_SQES_MAX_SHIFT (4) #define NVME_CTRLR_DATA_SQES_MAX_MASK (0xF) /** completion queue entry size */ #define NVME_CTRLR_DATA_CQES_MIN_SHIFT (0) #define NVME_CTRLR_DATA_CQES_MIN_MASK (0xF) #define NVME_CTRLR_DATA_CQES_MAX_SHIFT (4) #define NVME_CTRLR_DATA_CQES_MAX_MASK (0xF) /** optional nvm command support */ #define NVME_CTRLR_DATA_ONCS_COMPARE_SHIFT (0) #define NVME_CTRLR_DATA_ONCS_COMPARE_MASK (0x1) #define NVME_CTRLR_DATA_ONCS_WRITE_UNC_SHIFT (1) #define NVME_CTRLR_DATA_ONCS_WRITE_UNC_MASK (0x1) #define NVME_CTRLR_DATA_ONCS_DSM_SHIFT (2) #define NVME_CTRLR_DATA_ONCS_DSM_MASK (0x1) #define NVME_CTRLR_DATA_ONCS_WRZERO_SHIFT (3) #define NVME_CTRLR_DATA_ONCS_WRZERO_MASK (0x1) #define NVME_CTRLR_DATA_ONCS_SAVEFEAT_SHIFT (4) #define NVME_CTRLR_DATA_ONCS_SAVEFEAT_MASK (0x1) #define NVME_CTRLR_DATA_ONCS_RESERV_SHIFT (5) #define NVME_CTRLR_DATA_ONCS_RESERV_MASK (0x1) #define NVME_CTRLR_DATA_ONCS_TIMESTAMP_SHIFT (6) #define NVME_CTRLR_DATA_ONCS_TIMESTAMP_MASK (0x1) /** Fused Operation Support */ #define NVME_CTRLR_DATA_FUSES_CNW_SHIFT (0) #define 
NVME_CTRLR_DATA_FUSES_CNW_MASK (0x1) /** Format NVM Attributes */ #define NVME_CTRLR_DATA_FNA_FORMAT_ALL_SHIFT (0) #define NVME_CTRLR_DATA_FNA_FORMAT_ALL_MASK (0x1) #define NVME_CTRLR_DATA_FNA_ERASE_ALL_SHIFT (1) #define NVME_CTRLR_DATA_FNA_ERASE_ALL_MASK (0x1) #define NVME_CTRLR_DATA_FNA_CRYPTO_ERASE_SHIFT (2) #define NVME_CTRLR_DATA_FNA_CRYPTO_ERASE_MASK (0x1) /** volatile write cache */ #define NVME_CTRLR_DATA_VWC_PRESENT_SHIFT (0) #define NVME_CTRLR_DATA_VWC_PRESENT_MASK (0x1) /** namespace features */ /* thin provisioning */ #define NVME_NS_DATA_NSFEAT_THIN_PROV_SHIFT (0) #define NVME_NS_DATA_NSFEAT_THIN_PROV_MASK (0x1) /* NAWUN, NAWUPF, and NACWU fields are valid */ #define NVME_NS_DATA_NSFEAT_NA_FIELDS_SHIFT (1) #define NVME_NS_DATA_NSFEAT_NA_FIELDS_MASK (0x1) /* Deallocated or Unwritten Logical Block errors supported */ #define NVME_NS_DATA_NSFEAT_DEALLOC_SHIFT (2) #define NVME_NS_DATA_NSFEAT_DEALLOC_MASK (0x1) /* NGUID and EUI64 fields are not reusable */ #define NVME_NS_DATA_NSFEAT_NO_ID_REUSE_SHIFT (3) #define NVME_NS_DATA_NSFEAT_NO_ID_REUSE_MASK (0x1) /** formatted lba size */ #define NVME_NS_DATA_FLBAS_FORMAT_SHIFT (0) #define NVME_NS_DATA_FLBAS_FORMAT_MASK (0xF) #define NVME_NS_DATA_FLBAS_EXTENDED_SHIFT (4) #define NVME_NS_DATA_FLBAS_EXTENDED_MASK (0x1) /** metadata capabilities */ /* metadata can be transferred as part of data prp list */ #define NVME_NS_DATA_MC_EXTENDED_SHIFT (0) #define NVME_NS_DATA_MC_EXTENDED_MASK (0x1) /* metadata can be transferred with separate metadata pointer */ #define NVME_NS_DATA_MC_POINTER_SHIFT (1) #define NVME_NS_DATA_MC_POINTER_MASK (0x1) /** end-to-end data protection capabilities */ /* protection information type 1 */ #define NVME_NS_DATA_DPC_PIT1_SHIFT (0) #define NVME_NS_DATA_DPC_PIT1_MASK (0x1) /* protection information type 2 */ #define NVME_NS_DATA_DPC_PIT2_SHIFT (1) #define NVME_NS_DATA_DPC_PIT2_MASK (0x1) /* protection information type 3 */ #define NVME_NS_DATA_DPC_PIT3_SHIFT (2) #define NVME_NS_DATA_DPC_PIT3_MASK (0x1) /* first eight bytes of metadata */ #define NVME_NS_DATA_DPC_MD_START_SHIFT (3) #define NVME_NS_DATA_DPC_MD_START_MASK (0x1) /* last eight bytes of metadata */ #define NVME_NS_DATA_DPC_MD_END_SHIFT (4) #define NVME_NS_DATA_DPC_MD_END_MASK (0x1) /** end-to-end data protection type settings */ /* protection information type */ #define NVME_NS_DATA_DPS_PIT_SHIFT (0) #define NVME_NS_DATA_DPS_PIT_MASK (0x7) /* 1 == protection info transferred at start of metadata */ /* 0 == protection info transferred at end of metadata */ #define NVME_NS_DATA_DPS_MD_START_SHIFT (3) #define NVME_NS_DATA_DPS_MD_START_MASK (0x1) /** Namespace Multi-path I/O and Namespace Sharing Capabilities */ /* the namespace may be attached to two or more controllers */ #define NVME_NS_DATA_NMIC_MAY_BE_SHARED_SHIFT (0) #define NVME_NS_DATA_NMIC_MAY_BE_SHARED_MASK (0x1) /** Reservation Capabilities */ /* Persist Through Power Loss */ #define NVME_NS_DATA_RESCAP_PTPL_SHIFT (0) #define NVME_NS_DATA_RESCAP_PTPL_MASK (0x1) /* supports the Write Exclusive */ #define NVME_NS_DATA_RESCAP_WR_EX_SHIFT (1) #define NVME_NS_DATA_RESCAP_WR_EX_MASK (0x1) /* supports the Exclusive Access */ #define NVME_NS_DATA_RESCAP_EX_AC_SHIFT (2) #define NVME_NS_DATA_RESCAP_EX_AC_MASK (0x1) /* supports the Write Exclusive – Registrants Only */ #define NVME_NS_DATA_RESCAP_WR_EX_RO_SHIFT (3) #define NVME_NS_DATA_RESCAP_WR_EX_RO_MASK (0x1) /* supports the Exclusive Access - Registrants Only */ #define NVME_NS_DATA_RESCAP_EX_AC_RO_SHIFT (4) #define NVME_NS_DATA_RESCAP_EX_AC_RO_MASK (0x1) 
/* supports the Write Exclusive – All Registrants */ #define NVME_NS_DATA_RESCAP_WR_EX_AR_SHIFT (5) #define NVME_NS_DATA_RESCAP_WR_EX_AR_MASK (0x1) /* supports the Exclusive Access - All Registrants */ #define NVME_NS_DATA_RESCAP_EX_AC_AR_SHIFT (6) #define NVME_NS_DATA_RESCAP_EX_AC_AR_MASK (0x1) /* Ignore Existing Key is used as defined in revision 1.3 or later */ #define NVME_NS_DATA_RESCAP_IEKEY13_SHIFT (7) #define NVME_NS_DATA_RESCAP_IEKEY13_MASK (0x1) /** Format Progress Indicator */ /* percentage of the Format NVM command that remains to be completed */ #define NVME_NS_DATA_FPI_PERC_SHIFT (0) #define NVME_NS_DATA_FPI_PERC_MASK (0x7f) /* namespace supports the Format Progress Indicator */ #define NVME_NS_DATA_FPI_SUPP_SHIFT (7) #define NVME_NS_DATA_FPI_SUPP_MASK (0x1) /** lba format support */ /* metadata size */ #define NVME_NS_DATA_LBAF_MS_SHIFT (0) #define NVME_NS_DATA_LBAF_MS_MASK (0xFFFF) /* lba data size */ #define NVME_NS_DATA_LBAF_LBADS_SHIFT (16) #define NVME_NS_DATA_LBAF_LBADS_MASK (0xFF) /* relative performance */ #define NVME_NS_DATA_LBAF_RP_SHIFT (24) #define NVME_NS_DATA_LBAF_RP_MASK (0x3) enum nvme_critical_warning_state { NVME_CRIT_WARN_ST_AVAILABLE_SPARE = 0x1, NVME_CRIT_WARN_ST_TEMPERATURE = 0x2, NVME_CRIT_WARN_ST_DEVICE_RELIABILITY = 0x4, NVME_CRIT_WARN_ST_READ_ONLY = 0x8, NVME_CRIT_WARN_ST_VOLATILE_MEMORY_BACKUP = 0x10, }; #define NVME_CRIT_WARN_ST_RESERVED_MASK (0xE0) /* slot for current FW */ #define NVME_FIRMWARE_PAGE_AFI_SLOT_SHIFT (0) #define NVME_FIRMWARE_PAGE_AFI_SLOT_MASK (0x7) /* CC register SHN field values */ enum shn_value { NVME_SHN_NORMAL = 0x1, NVME_SHN_ABRUPT = 0x2, }; /* CSTS register SHST field values */ enum shst_value { NVME_SHST_NORMAL = 0x0, NVME_SHST_OCCURRING = 0x1, NVME_SHST_COMPLETE = 0x2, }; struct nvme_registers { /** controller capabilities */ uint32_t cap_lo; uint32_t cap_hi; uint32_t vs; /* version */ uint32_t intms; /* interrupt mask set */ uint32_t intmc; /* interrupt mask clear */ /** controller configuration */ uint32_t cc; uint32_t reserved1; /** controller status */ uint32_t csts; uint32_t reserved2; /** admin queue attributes */ uint32_t aqa; uint64_t asq; /* admin submission queue base addr */ uint64_t acq; /* admin completion queue base addr */ uint32_t reserved3[0x3f2]; struct { uint32_t sq_tdbl; /* submission queue tail doorbell */ uint32_t cq_hdbl; /* completion queue head doorbell */ } doorbell[1] __packed; } __packed; _Static_assert(sizeof(struct nvme_registers) == 0x1008, "bad size for nvme_registers"); struct nvme_command { /* dword 0 */ - uint16_t opc_fuse; /* opcode, fused operation */ + uint8_t opc; /* opcode */ + uint8_t fuse; /* fused operation */ uint16_t cid; /* command identifier */ /* dword 1 */ uint32_t nsid; /* namespace identifier */ /* dword 2-3 */ uint32_t rsvd2; uint32_t rsvd3; /* dword 4-5 */ uint64_t mptr; /* metadata pointer */ /* dword 6-7 */ uint64_t prp1; /* prp entry 1 */ /* dword 8-9 */ uint64_t prp2; /* prp entry 2 */ /* dword 10-15 */ uint32_t cdw10; /* command-specific */ uint32_t cdw11; /* command-specific */ uint32_t cdw12; /* command-specific */ uint32_t cdw13; /* command-specific */ uint32_t cdw14; /* command-specific */ uint32_t cdw15; /* command-specific */ } __packed; _Static_assert(sizeof(struct nvme_command) == 16 * 4, "bad size for nvme_command"); struct nvme_completion { /* dword 0 */ uint32_t cdw0; /* command-specific */ /* dword 1 */ uint32_t rsvd1; /* dword 2 */ uint16_t sqhd; /* submission queue head pointer */ uint16_t sqid; /* submission queue identifier */ /* dword 3 */ 
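/*
 * dword 3 carries the command identifier and the status word. Unlike
 * the submission side above -- where this revision splits the former
 * 16-bit opc_fuse field into single-byte opc and fuse members so no
 * byte swapping is needed for them -- status spans two bytes and is
 * little-endian on the wire, so consumers decode it with le16toh()
 * plus the NVME_STATUS_GET_* accessors, e.g.:
 *
 *	uint16_t st = le16toh(cpl->status);
 *	if (NVME_STATUS_GET_SCT(st) != 0 || NVME_STATUS_GET_SC(st) != 0)
 *		-- command failed; SCT selects the status-code table
 */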
uint16_t cid; /* command identifier */ uint16_t status; } __packed; _Static_assert(sizeof(struct nvme_completion) == 4 * 4, "bad size for nvme_completion"); struct nvme_dsm_range { uint32_t attributes; uint32_t length; uint64_t starting_lba; } __packed; /* Largest DSM Trim that can be done */ #define NVME_MAX_DSM_TRIM 4096 _Static_assert(sizeof(struct nvme_dsm_range) == 16, "bad size for nvme_dsm_ranage"); /* status code types */ enum nvme_status_code_type { NVME_SCT_GENERIC = 0x0, NVME_SCT_COMMAND_SPECIFIC = 0x1, NVME_SCT_MEDIA_ERROR = 0x2, /* 0x3-0x6 - reserved */ NVME_SCT_VENDOR_SPECIFIC = 0x7, }; /* generic command status codes */ enum nvme_generic_command_status_code { NVME_SC_SUCCESS = 0x00, NVME_SC_INVALID_OPCODE = 0x01, NVME_SC_INVALID_FIELD = 0x02, NVME_SC_COMMAND_ID_CONFLICT = 0x03, NVME_SC_DATA_TRANSFER_ERROR = 0x04, NVME_SC_ABORTED_POWER_LOSS = 0x05, NVME_SC_INTERNAL_DEVICE_ERROR = 0x06, NVME_SC_ABORTED_BY_REQUEST = 0x07, NVME_SC_ABORTED_SQ_DELETION = 0x08, NVME_SC_ABORTED_FAILED_FUSED = 0x09, NVME_SC_ABORTED_MISSING_FUSED = 0x0a, NVME_SC_INVALID_NAMESPACE_OR_FORMAT = 0x0b, NVME_SC_COMMAND_SEQUENCE_ERROR = 0x0c, NVME_SC_INVALID_SGL_SEGMENT_DESCR = 0x0d, NVME_SC_INVALID_NUMBER_OF_SGL_DESCR = 0x0e, NVME_SC_DATA_SGL_LENGTH_INVALID = 0x0f, NVME_SC_METADATA_SGL_LENGTH_INVALID = 0x10, NVME_SC_SGL_DESCRIPTOR_TYPE_INVALID = 0x11, NVME_SC_INVALID_USE_OF_CMB = 0x12, NVME_SC_PRP_OFFET_INVALID = 0x13, NVME_SC_ATOMIC_WRITE_UNIT_EXCEEDED = 0x14, NVME_SC_OPERATION_DENIED = 0x15, NVME_SC_SGL_OFFSET_INVALID = 0x16, /* 0x17 - reserved */ NVME_SC_HOST_ID_INCONSISTENT_FORMAT = 0x18, NVME_SC_KEEP_ALIVE_TIMEOUT_EXPIRED = 0x19, NVME_SC_KEEP_ALIVE_TIMEOUT_INVALID = 0x1a, NVME_SC_ABORTED_DUE_TO_PREEMPT = 0x1b, NVME_SC_SANITIZE_FAILED = 0x1c, NVME_SC_SANITIZE_IN_PROGRESS = 0x1d, NVME_SC_SGL_DATA_BLOCK_GRAN_INVALID = 0x1e, NVME_SC_NOT_SUPPORTED_IN_CMB = 0x1f, NVME_SC_LBA_OUT_OF_RANGE = 0x80, NVME_SC_CAPACITY_EXCEEDED = 0x81, NVME_SC_NAMESPACE_NOT_READY = 0x82, NVME_SC_RESERVATION_CONFLICT = 0x83, NVME_SC_FORMAT_IN_PROGRESS = 0x84, }; /* command specific status codes */ enum nvme_command_specific_status_code { NVME_SC_COMPLETION_QUEUE_INVALID = 0x00, NVME_SC_INVALID_QUEUE_IDENTIFIER = 0x01, NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED = 0x02, NVME_SC_ABORT_COMMAND_LIMIT_EXCEEDED = 0x03, /* 0x04 - reserved */ NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED = 0x05, NVME_SC_INVALID_FIRMWARE_SLOT = 0x06, NVME_SC_INVALID_FIRMWARE_IMAGE = 0x07, NVME_SC_INVALID_INTERRUPT_VECTOR = 0x08, NVME_SC_INVALID_LOG_PAGE = 0x09, NVME_SC_INVALID_FORMAT = 0x0a, NVME_SC_FIRMWARE_REQUIRES_RESET = 0x0b, NVME_SC_INVALID_QUEUE_DELETION = 0x0c, NVME_SC_FEATURE_NOT_SAVEABLE = 0x0d, NVME_SC_FEATURE_NOT_CHANGEABLE = 0x0e, NVME_SC_FEATURE_NOT_NS_SPECIFIC = 0x0f, NVME_SC_FW_ACT_REQUIRES_NVMS_RESET = 0x10, NVME_SC_FW_ACT_REQUIRES_RESET = 0x11, NVME_SC_FW_ACT_REQUIRES_TIME = 0x12, NVME_SC_FW_ACT_PROHIBITED = 0x13, NVME_SC_OVERLAPPING_RANGE = 0x14, NVME_SC_NS_INSUFFICIENT_CAPACITY = 0x15, NVME_SC_NS_ID_UNAVAILABLE = 0x16, /* 0x17 - reserved */ NVME_SC_NS_ALREADY_ATTACHED = 0x18, NVME_SC_NS_IS_PRIVATE = 0x19, NVME_SC_NS_NOT_ATTACHED = 0x1a, NVME_SC_THIN_PROV_NOT_SUPPORTED = 0x1b, NVME_SC_CTRLR_LIST_INVALID = 0x1c, NVME_SC_SELT_TEST_IN_PROGRESS = 0x1d, NVME_SC_BOOT_PART_WRITE_PROHIB = 0x1e, NVME_SC_INVALID_CTRLR_ID = 0x1f, NVME_SC_INVALID_SEC_CTRLR_STATE = 0x20, NVME_SC_INVALID_NUM_OF_CTRLR_RESRC = 0x21, NVME_SC_INVALID_RESOURCE_ID = 0x22, NVME_SC_CONFLICTING_ATTRIBUTES = 0x80, NVME_SC_INVALID_PROTECTION_INFO = 0x81, NVME_SC_ATTEMPTED_WRITE_TO_RO_PAGE = 
0x82, }; /* media error status codes */ enum nvme_media_error_status_code { NVME_SC_WRITE_FAULTS = 0x80, NVME_SC_UNRECOVERED_READ_ERROR = 0x81, NVME_SC_GUARD_CHECK_ERROR = 0x82, NVME_SC_APPLICATION_TAG_CHECK_ERROR = 0x83, NVME_SC_REFERENCE_TAG_CHECK_ERROR = 0x84, NVME_SC_COMPARE_FAILURE = 0x85, NVME_SC_ACCESS_DENIED = 0x86, NVME_SC_DEALLOCATED_OR_UNWRITTEN = 0x87, }; /* admin opcodes */ enum nvme_admin_opcode { NVME_OPC_DELETE_IO_SQ = 0x00, NVME_OPC_CREATE_IO_SQ = 0x01, NVME_OPC_GET_LOG_PAGE = 0x02, /* 0x03 - reserved */ NVME_OPC_DELETE_IO_CQ = 0x04, NVME_OPC_CREATE_IO_CQ = 0x05, NVME_OPC_IDENTIFY = 0x06, /* 0x07 - reserved */ NVME_OPC_ABORT = 0x08, NVME_OPC_SET_FEATURES = 0x09, NVME_OPC_GET_FEATURES = 0x0a, /* 0x0b - reserved */ NVME_OPC_ASYNC_EVENT_REQUEST = 0x0c, NVME_OPC_NAMESPACE_MANAGEMENT = 0x0d, /* 0x0e-0x0f - reserved */ NVME_OPC_FIRMWARE_ACTIVATE = 0x10, NVME_OPC_FIRMWARE_IMAGE_DOWNLOAD = 0x11, NVME_OPC_DEVICE_SELF_TEST = 0x14, NVME_OPC_NAMESPACE_ATTACHMENT = 0x15, NVME_OPC_KEEP_ALIVE = 0x18, NVME_OPC_DIRECTIVE_SEND = 0x19, NVME_OPC_DIRECTIVE_RECEIVE = 0x1a, NVME_OPC_VIRTUALIZATION_MANAGEMENT = 0x1c, NVME_OPC_NVME_MI_SEND = 0x1d, NVME_OPC_NVME_MI_RECEIVE = 0x1e, NVME_OPC_DOORBELL_BUFFER_CONFIG = 0x7c, NVME_OPC_FORMAT_NVM = 0x80, NVME_OPC_SECURITY_SEND = 0x81, NVME_OPC_SECURITY_RECEIVE = 0x82, NVME_OPC_SANITIZE = 0x84, }; /* nvme nvm opcodes */ enum nvme_nvm_opcode { NVME_OPC_FLUSH = 0x00, NVME_OPC_WRITE = 0x01, NVME_OPC_READ = 0x02, /* 0x03 - reserved */ NVME_OPC_WRITE_UNCORRECTABLE = 0x04, NVME_OPC_COMPARE = 0x05, /* 0x06 - reserved */ NVME_OPC_WRITE_ZEROES = 0x08, /* 0x07 - reserved */ NVME_OPC_DATASET_MANAGEMENT = 0x09, /* 0x0a-0x0c - reserved */ NVME_OPC_RESERVATION_REGISTER = 0x0d, NVME_OPC_RESERVATION_REPORT = 0x0e, /* 0x0f-0x10 - reserved */ NVME_OPC_RESERVATION_ACQUIRE = 0x11, /* 0x12-0x14 - reserved */ NVME_OPC_RESERVATION_RELEASE = 0x15, }; enum nvme_feature { /* 0x00 - reserved */ NVME_FEAT_ARBITRATION = 0x01, NVME_FEAT_POWER_MANAGEMENT = 0x02, NVME_FEAT_LBA_RANGE_TYPE = 0x03, NVME_FEAT_TEMPERATURE_THRESHOLD = 0x04, NVME_FEAT_ERROR_RECOVERY = 0x05, NVME_FEAT_VOLATILE_WRITE_CACHE = 0x06, NVME_FEAT_NUMBER_OF_QUEUES = 0x07, NVME_FEAT_INTERRUPT_COALESCING = 0x08, NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION = 0x09, NVME_FEAT_WRITE_ATOMICITY = 0x0A, NVME_FEAT_ASYNC_EVENT_CONFIGURATION = 0x0B, NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION = 0x0C, NVME_FEAT_HOST_MEMORY_BUFFER = 0x0D, NVME_FEAT_TIMESTAMP = 0x0E, NVME_FEAT_KEEP_ALIVE_TIMER = 0x0F, NVME_FEAT_HOST_CONTROLLED_THERMAL_MGMT = 0x10, NVME_FEAT_NON_OP_POWER_STATE_CONFIG = 0x11, /* 0x12-0x77 - reserved */ /* 0x78-0x7f - NVMe Management Interface */ NVME_FEAT_SOFTWARE_PROGRESS_MARKER = 0x80, /* 0x81-0xBF - command set specific (reserved) */ /* 0xC0-0xFF - vendor specific */ }; enum nvme_dsm_attribute { NVME_DSM_ATTR_INTEGRAL_READ = 0x1, NVME_DSM_ATTR_INTEGRAL_WRITE = 0x2, NVME_DSM_ATTR_DEALLOCATE = 0x4, }; enum nvme_activate_action { NVME_AA_REPLACE_NO_ACTIVATE = 0x0, NVME_AA_REPLACE_ACTIVATE = 0x1, NVME_AA_ACTIVATE = 0x2, }; struct nvme_power_state { /** Maximum Power */ uint16_t mp; /* Maximum Power */ uint8_t ps_rsvd1; uint8_t mps_nops; /* Max Power Scale, Non-Operational State */ uint32_t enlat; /* Entry Latency */ uint32_t exlat; /* Exit Latency */ uint8_t rrt; /* Relative Read Throughput */ uint8_t rrl; /* Relative Read Latency */ uint8_t rwt; /* Relative Write Throughput */ uint8_t rwl; /* Relative Write Latency */ uint16_t idlp; /* Idle Power */ uint8_t ips; /* Idle Power Scale */ uint8_t ps_rsvd8; uint16_t actp; /* 
Active Power */ uint8_t apw_aps; /* Active Power Workload, Active Power Scale */ uint8_t ps_rsvd10[9]; } __packed; _Static_assert(sizeof(struct nvme_power_state) == 32, "bad size for nvme_power_state"); #define NVME_SERIAL_NUMBER_LENGTH 20 #define NVME_MODEL_NUMBER_LENGTH 40 #define NVME_FIRMWARE_REVISION_LENGTH 8 struct nvme_controller_data { /* bytes 0-255: controller capabilities and features */ /** pci vendor id */ uint16_t vid; /** pci subsystem vendor id */ uint16_t ssvid; /** serial number */ uint8_t sn[NVME_SERIAL_NUMBER_LENGTH]; /** model number */ uint8_t mn[NVME_MODEL_NUMBER_LENGTH]; /** firmware revision */ uint8_t fr[NVME_FIRMWARE_REVISION_LENGTH]; /** recommended arbitration burst */ uint8_t rab; /** ieee oui identifier */ uint8_t ieee[3]; /** multi-interface capabilities */ uint8_t mic; /** maximum data transfer size */ uint8_t mdts; /** Controller ID */ uint16_t ctrlr_id; /** Version */ uint32_t ver; /** RTD3 Resume Latency */ uint32_t rtd3r; /** RTD3 Enter Latency */ uint32_t rtd3e; /** Optional Asynchronous Events Supported */ uint32_t oaes; /* bitfield really */ /** Controller Attributes */ uint32_t ctratt; /* bitfield really */ uint8_t reserved1[12]; /** FRU Globally Unique Identifier */ uint8_t fguid[16]; uint8_t reserved2[128]; /* bytes 256-511: admin command set attributes */ /** optional admin command support */ uint16_t oacs; /** abort command limit */ uint8_t acl; /** asynchronous event request limit */ uint8_t aerl; /** firmware updates */ uint8_t frmw; /** log page attributes */ uint8_t lpa; /** error log page entries */ uint8_t elpe; /** number of power states supported */ uint8_t npss; /** admin vendor specific command configuration */ uint8_t avscc; /** Autonomous Power State Transition Attributes */ uint8_t apsta; /** Warning Composite Temperature Threshold */ uint16_t wctemp; /** Critical Composite Temperature Threshold */ uint16_t cctemp; /** Maximum Time for Firmware Activation */ uint16_t mtfa; /** Host Memory Buffer Preferred Size */ uint32_t hmpre; /** Host Memory Buffer Minimum Size */ uint32_t hmmin; /** Name space capabilities */ struct { /* if nsmgmt, report tnvmcap and unvmcap */ uint8_t tnvmcap[16]; uint8_t unvmcap[16]; } __packed untncap; /** Replay Protected Memory Block Support */ uint32_t rpmbs; /* Really a bitfield */ /** Extended Device Self-test Time */ uint16_t edstt; /** Device Self-test Options */ uint8_t dsto; /* Really a bitfield */ /** Firmware Update Granularity */ uint8_t fwug; /** Keep Alive Support */ uint16_t kas; /** Host Controlled Thermal Management Attributes */ uint16_t hctma; /* Really a bitfield */ /** Minimum Thermal Management Temperature */ uint16_t mntmt; /** Maximum Thermal Management Temperature */ uint16_t mxtmt; /** Sanitize Capabilities */ uint32_t sanicap; /* Really a bitfield */ uint8_t reserved3[180]; /* bytes 512-703: nvm command set attributes */ /** submission queue entry size */ uint8_t sqes; /** completion queue entry size */ uint8_t cqes; /** Maximum Outstanding Commands */ uint16_t maxcmd; /** number of namespaces */ uint32_t nn; /** optional nvm command support */ uint16_t oncs; /** fused operation support */ uint16_t fuses; /** format nvm attributes */ uint8_t fna; /** volatile write cache */ uint8_t vwc; /** Atomic Write Unit Normal */ uint16_t awun; /** Atomic Write Unit Power Fail */ uint16_t awupf; /** NVM Vendor Specific Command Configuration */ uint8_t nvscc; uint8_t reserved5; /** Atomic Compare & Write Unit */ uint16_t acwu; uint16_t reserved6; /** SGL Support */ uint32_t sgls; /* bytes 
540-767: Reserved */ uint8_t reserved7[228]; /** NVM Subsystem NVMe Qualified Name */ uint8_t subnqn[256]; /* bytes 1024-1791: Reserved */ uint8_t reserved8[768]; /* bytes 1792-2047: NVMe over Fabrics specification */ uint8_t reserved9[256]; /* bytes 2048-3071: power state descriptors */ struct nvme_power_state power_state[32]; /* bytes 3072-4095: vendor specific */ uint8_t vs[1024]; } __packed __aligned(4); _Static_assert(sizeof(struct nvme_controller_data) == 4096, "bad size for nvme_controller_data"); struct nvme_namespace_data { /** namespace size */ uint64_t nsze; /** namespace capacity */ uint64_t ncap; /** namespace utilization */ uint64_t nuse; /** namespace features */ uint8_t nsfeat; /** number of lba formats */ uint8_t nlbaf; /** formatted lba size */ uint8_t flbas; /** metadata capabilities */ uint8_t mc; /** end-to-end data protection capabilities */ uint8_t dpc; /** end-to-end data protection type settings */ uint8_t dps; /** Namespace Multi-path I/O and Namespace Sharing Capabilities */ uint8_t nmic; /** Reservation Capabilities */ uint8_t rescap; /** Format Progress Indicator */ uint8_t fpi; /** Deallocate Logical Block Features */ uint8_t dlfeat; /** Namespace Atomic Write Unit Normal */ uint16_t nawun; /** Namespace Atomic Write Unit Power Fail */ uint16_t nawupf; /** Namespace Atomic Compare & Write Unit */ uint16_t nacwu; /** Namespace Atomic Boundary Size Normal */ uint16_t nabsn; /** Namespace Atomic Boundary Offset */ uint16_t nabo; /** Namespace Atomic Boundary Size Power Fail */ uint16_t nabspf; /** Namespace Optimal IO Boundary */ uint16_t noiob; /** NVM Capacity */ uint8_t nvmcap[16]; /* bytes 64-103: Reserved */ uint8_t reserved5[40]; /** Namespace Globally Unique Identifier */ uint8_t nguid[16]; /** IEEE Extended Unique Identifier */ uint8_t eui64[8]; /** lba format support */ uint32_t lbaf[16]; uint8_t reserved6[192]; uint8_t vendor_specific[3712]; } __packed __aligned(4); _Static_assert(sizeof(struct nvme_namespace_data) == 4096, "bad size for nvme_namepsace_data"); enum nvme_log_page { /* 0x00 - reserved */ NVME_LOG_ERROR = 0x01, NVME_LOG_HEALTH_INFORMATION = 0x02, NVME_LOG_FIRMWARE_SLOT = 0x03, NVME_LOG_CHANGED_NAMESPACE = 0x04, NVME_LOG_COMMAND_EFFECT = 0x05, /* 0x06-0x7F - reserved */ /* 0x80-0xBF - I/O command set specific */ NVME_LOG_RES_NOTIFICATION = 0x80, /* 0xC0-0xFF - vendor specific */ /* * The following are Intel Specific log pages, but they seem * to be widely implemented. */ INTEL_LOG_READ_LAT_LOG = 0xc1, INTEL_LOG_WRITE_LAT_LOG = 0xc2, INTEL_LOG_TEMP_STATS = 0xc5, INTEL_LOG_ADD_SMART = 0xca, INTEL_LOG_DRIVE_MKT_NAME = 0xdd, /* * HGST log page, with lots ofs sub pages. */ HGST_INFO_LOG = 0xc1, }; struct nvme_error_information_entry { uint64_t error_count; uint16_t sqid; uint16_t cid; uint16_t status; uint16_t error_location; uint64_t lba; uint32_t nsid; uint8_t vendor_specific; uint8_t reserved[35]; } __packed __aligned(4); _Static_assert(sizeof(struct nvme_error_information_entry) == 64, "bad size for nvme_error_information_entry"); struct nvme_health_information_page { uint8_t critical_warning; uint16_t temperature; uint8_t available_spare; uint8_t available_spare_threshold; uint8_t percentage_used; uint8_t reserved[26]; /* * Note that the following are 128-bit values, but are * defined as an array of 2 64-bit values. */ /* Data Units Read is always in 512-byte units. */ uint64_t data_units_read[2]; /* Data Units Written is always in 512-byte units. */ uint64_t data_units_written[2]; /* For NVM command set, this includes Compare commands. 
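 *
 * Like the other counters in this page, these are 128-bit
 * little-endian quantities; on a little-endian host element [0]
 * holds the low 64 bits. A consumer can render one along these
 * lines (the high word stays zero for any realistic drive
 * lifetime):
 *
 *	if (v[1] != 0)
 *		printf("host reads: >= 2^64\n");
 *	else
 *		printf("host reads: %ju\n", (uintmax_t)v[0]);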
*/ uint64_t host_read_commands[2]; uint64_t host_write_commands[2]; /* Controller Busy Time is reported in minutes. */ uint64_t controller_busy_time[2]; uint64_t power_cycles[2]; uint64_t power_on_hours[2]; uint64_t unsafe_shutdowns[2]; uint64_t media_errors[2]; uint64_t num_error_info_log_entries[2]; uint32_t warning_temp_time; uint32_t error_temp_time; uint16_t temp_sensor[8]; uint8_t reserved2[296]; } __packed __aligned(4); _Static_assert(sizeof(struct nvme_health_information_page) == 512, "bad size for nvme_health_information_page"); struct nvme_firmware_page { uint8_t afi; uint8_t reserved[7]; uint64_t revision[7]; /* revisions for 7 slots */ uint8_t reserved2[448]; } __packed __aligned(4); _Static_assert(sizeof(struct nvme_firmware_page) == 512, "bad size for nvme_firmware_page"); struct nvme_ns_list { uint32_t ns[1024]; } __packed __aligned(4); _Static_assert(sizeof(struct nvme_ns_list) == 4096, "bad size for nvme_ns_list"); struct intel_log_temp_stats { uint64_t current; uint64_t overtemp_flag_last; uint64_t overtemp_flag_life; uint64_t max_temp; uint64_t min_temp; uint64_t _rsvd[5]; uint64_t max_oper_temp; uint64_t min_oper_temp; uint64_t est_offset; } __packed __aligned(4); _Static_assert(sizeof(struct intel_log_temp_stats) == 13 * 8, "bad size for intel_log_temp_stats"); #define NVME_TEST_MAX_THREADS 128 struct nvme_io_test { enum nvme_nvm_opcode opc; uint32_t size; uint32_t time; /* in seconds */ uint32_t num_threads; uint32_t flags; uint64_t io_completed[NVME_TEST_MAX_THREADS]; }; enum nvme_io_test_flags { /* * Specifies whether dev_refthread/dev_relthread should be * called during NVME_BIO_TEST. Ignored for other test * types. */ NVME_TEST_FLAG_REFTHREAD = 0x1, }; struct nvme_pt_command { /* * cmd is used to specify a passthrough command to a controller or * namespace. * * The following fields from cmd may be specified by the caller: * * opc (opcode) * * nsid (namespace id) - for admin commands only * * cdw10-cdw15 * * Remaining fields must be set to 0 by the caller. */ struct nvme_command cmd; /* * cpl returns completion status for the passthrough command * specified by cmd. * * The following fields will be filled out by the driver, for * consumption by the caller: * * cdw0 * * status (except for phase) * * Remaining fields will be set to 0 by the driver. */ struct nvme_completion cpl; /* buf is the data buffer associated with this passthrough command. */ void * buf; /* * len is the length of the data buffer associated with this * passthrough command. */ uint32_t len; /* * is_read = 1 if the passthrough command will read data into the * supplied buffer from the controller. * * is_read = 0 if the passthrough command will write data from the * supplied buffer to the controller. */ uint32_t is_read; /* * driver_lock is used by the driver only. It must be set to 0 * by the caller. 
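 *
 * Putting the contract together, a userland caller issues a command
 * along these lines (a sketch: GET LOG PAGE for the health log, with
 * fd, buf and len supplied by the caller; cdw10 packs the zero-based
 * dword count in its upper half and the log page ID in its lower):
 *
 *	struct nvme_pt_command pt;
 *
 *	memset(&pt, 0, sizeof(pt));	-- zeroes driver_lock too
 *	pt.cmd.opc = NVME_OPC_GET_LOG_PAGE;
 *	pt.cmd.nsid = htole32(NVME_GLOBAL_NAMESPACE_TAG);
 *	pt.cmd.cdw10 = htole32(((len / 4 - 1) << 16) |
 *	    NVME_LOG_HEALTH_INFORMATION);
 *	pt.buf = buf;
 *	pt.len = len;
 *	pt.is_read = 1;			-- data flows host-ward
 *	if (ioctl(fd, NVME_PASSTHROUGH_CMD, &pt) < 0)
 *		err(1, "NVME_PASSTHROUGH_CMD");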
*/ struct mtx * driver_lock; }; #define nvme_completion_is_error(cpl) \ (NVME_STATUS_GET_SC((cpl)->status) != 0 || NVME_STATUS_GET_SCT((cpl)->status) != 0) void nvme_strvis(uint8_t *dst, const uint8_t *src, int dstlen, int srclen); #ifdef _KERNEL struct bio; struct nvme_namespace; struct nvme_controller; struct nvme_consumer; typedef void (*nvme_cb_fn_t)(void *, const struct nvme_completion *); typedef void *(*nvme_cons_ns_fn_t)(struct nvme_namespace *, void *); typedef void *(*nvme_cons_ctrlr_fn_t)(struct nvme_controller *); typedef void (*nvme_cons_async_fn_t)(void *, const struct nvme_completion *, uint32_t, void *, uint32_t); typedef void (*nvme_cons_fail_fn_t)(void *); enum nvme_namespace_flags { NVME_NS_DEALLOCATE_SUPPORTED = 0x1, NVME_NS_FLUSH_SUPPORTED = 0x2, }; int nvme_ctrlr_passthrough_cmd(struct nvme_controller *ctrlr, struct nvme_pt_command *pt, uint32_t nsid, int is_user_buffer, int is_admin_cmd); /* Admin functions */ void nvme_ctrlr_cmd_set_feature(struct nvme_controller *ctrlr, uint8_t feature, uint32_t cdw11, void *payload, uint32_t payload_size, nvme_cb_fn_t cb_fn, void *cb_arg); void nvme_ctrlr_cmd_get_feature(struct nvme_controller *ctrlr, uint8_t feature, uint32_t cdw11, void *payload, uint32_t payload_size, nvme_cb_fn_t cb_fn, void *cb_arg); void nvme_ctrlr_cmd_get_log_page(struct nvme_controller *ctrlr, uint8_t log_page, uint32_t nsid, void *payload, uint32_t payload_size, nvme_cb_fn_t cb_fn, void *cb_arg); /* NVM I/O functions */ int nvme_ns_cmd_write(struct nvme_namespace *ns, void *payload, uint64_t lba, uint32_t lba_count, nvme_cb_fn_t cb_fn, void *cb_arg); int nvme_ns_cmd_write_bio(struct nvme_namespace *ns, struct bio *bp, nvme_cb_fn_t cb_fn, void *cb_arg); int nvme_ns_cmd_read(struct nvme_namespace *ns, void *payload, uint64_t lba, uint32_t lba_count, nvme_cb_fn_t cb_fn, void *cb_arg); int nvme_ns_cmd_read_bio(struct nvme_namespace *ns, struct bio *bp, nvme_cb_fn_t cb_fn, void *cb_arg); int nvme_ns_cmd_deallocate(struct nvme_namespace *ns, void *payload, uint8_t num_ranges, nvme_cb_fn_t cb_fn, void *cb_arg); int nvme_ns_cmd_flush(struct nvme_namespace *ns, nvme_cb_fn_t cb_fn, void *cb_arg); int nvme_ns_dump(struct nvme_namespace *ns, void *virt, off_t offset, size_t len); /* Registration functions */ struct nvme_consumer * nvme_register_consumer(nvme_cons_ns_fn_t ns_fn, nvme_cons_ctrlr_fn_t ctrlr_fn, nvme_cons_async_fn_t async_fn, nvme_cons_fail_fn_t fail_fn); void nvme_unregister_consumer(struct nvme_consumer *consumer); /* Controller helper functions */ device_t nvme_ctrlr_get_device(struct nvme_controller *ctrlr); const struct nvme_controller_data * nvme_ctrlr_get_data(struct nvme_controller *ctrlr); /* Namespace helper functions */ uint32_t nvme_ns_get_max_io_xfer_size(struct nvme_namespace *ns); uint32_t nvme_ns_get_sector_size(struct nvme_namespace *ns); uint64_t nvme_ns_get_num_sectors(struct nvme_namespace *ns); uint64_t nvme_ns_get_size(struct nvme_namespace *ns); uint32_t nvme_ns_get_flags(struct nvme_namespace *ns); const char * nvme_ns_get_serial_number(struct nvme_namespace *ns); const char * nvme_ns_get_model_number(struct nvme_namespace *ns); const struct nvme_namespace_data * nvme_ns_get_data(struct nvme_namespace *ns); uint32_t nvme_ns_get_stripesize(struct nvme_namespace *ns); int nvme_ns_bio_process(struct nvme_namespace *ns, struct bio *bp, nvme_cb_fn_t cb_fn); /* * Command building helper functions -- shared with CAM * These functions assume allocator zeros out cmd structure * CAM's xpt_get_ccb and the request allocator for nvme both * do 
zeroed allocations. */ static inline void nvme_ns_flush_cmd(struct nvme_command *cmd, uint32_t nsid) { - cmd->opc_fuse = NVME_CMD_SET_OPC(NVME_OPC_FLUSH); + cmd->opc = NVME_OPC_FLUSH; cmd->nsid = htole32(nsid); } static inline void nvme_ns_rw_cmd(struct nvme_command *cmd, uint32_t rwcmd, uint32_t nsid, uint64_t lba, uint32_t count) { - cmd->opc_fuse = NVME_CMD_SET_OPC(rwcmd); + cmd->opc = rwcmd; cmd->nsid = htole32(nsid); cmd->cdw10 = htole32(lba & 0xffffffffu); cmd->cdw11 = htole32(lba >> 32); cmd->cdw12 = htole32(count-1); } static inline void nvme_ns_write_cmd(struct nvme_command *cmd, uint32_t nsid, uint64_t lba, uint32_t count) { nvme_ns_rw_cmd(cmd, NVME_OPC_WRITE, nsid, lba, count); } static inline void nvme_ns_read_cmd(struct nvme_command *cmd, uint32_t nsid, uint64_t lba, uint32_t count) { nvme_ns_rw_cmd(cmd, NVME_OPC_READ, nsid, lba, count); } static inline void nvme_ns_trim_cmd(struct nvme_command *cmd, uint32_t nsid, uint32_t num_ranges) { - cmd->opc_fuse = NVME_CMD_SET_OPC(NVME_OPC_DATASET_MANAGEMENT); + cmd->opc = NVME_OPC_DATASET_MANAGEMENT; cmd->nsid = htole32(nsid); cmd->cdw10 = htole32(num_ranges - 1); cmd->cdw11 = htole32(NVME_DSM_ATTR_DEALLOCATE); } extern int nvme_use_nvd; #endif /* _KERNEL */ /* Endianness conversion functions for NVMe structs */ static inline void nvme_completion_swapbytes(struct nvme_completion *s) { s->cdw0 = le32toh(s->cdw0); /* omit rsvd1 */ s->sqhd = le16toh(s->sqhd); s->sqid = le16toh(s->sqid); /* omit cid */ s->status = le16toh(s->status); } static inline void nvme_power_state_swapbytes(struct nvme_power_state *s) { s->mp = le16toh(s->mp); s->enlat = le32toh(s->enlat); s->exlat = le32toh(s->exlat); s->idlp = le16toh(s->idlp); s->actp = le16toh(s->actp); } static inline void nvme_controller_data_swapbytes(struct nvme_controller_data *s) { int i; s->vid = le16toh(s->vid); s->ssvid = le16toh(s->ssvid); s->ctrlr_id = le16toh(s->ctrlr_id); s->ver = le32toh(s->ver); s->rtd3r = le32toh(s->rtd3r); s->rtd3e = le32toh(s->rtd3e); s->oaes = le32toh(s->oaes); s->ctratt = le32toh(s->ctratt); s->oacs = le16toh(s->oacs); s->wctemp = le16toh(s->wctemp); s->cctemp = le16toh(s->cctemp); s->mtfa = le16toh(s->mtfa); s->hmpre = le32toh(s->hmpre); s->hmmin = le32toh(s->hmmin); s->rpmbs = le32toh(s->rpmbs); s->edstt = le16toh(s->edstt); s->kas = le16toh(s->kas); s->hctma = le16toh(s->hctma); s->mntmt = le16toh(s->mntmt); s->mxtmt = le16toh(s->mxtmt); s->sanicap = le32toh(s->sanicap); s->maxcmd = le16toh(s->maxcmd); s->nn = le32toh(s->nn); s->oncs = le16toh(s->oncs); s->fuses = le16toh(s->fuses); s->awun = le16toh(s->awun); s->awupf = le16toh(s->awupf); s->acwu = le16toh(s->acwu); s->sgls = le32toh(s->sgls); for (i = 0; i < 32; i++) nvme_power_state_swapbytes(&s->power_state[i]); } static inline void nvme_namespace_data_swapbytes(struct nvme_namespace_data *s) { int i; s->nsze = le64toh(s->nsze); s->ncap = le64toh(s->ncap); s->nuse = le64toh(s->nuse); s->nawun = le16toh(s->nawun); s->nawupf = le16toh(s->nawupf); s->nacwu = le16toh(s->nacwu); s->nabsn = le16toh(s->nabsn); s->nabo = le16toh(s->nabo); s->nabspf = le16toh(s->nabspf); s->noiob = le16toh(s->noiob); for (i = 0; i < 16; i++) s->lbaf[i] = le32toh(s->lbaf[i]); } static inline void nvme_error_information_entry_swapbytes(struct nvme_error_information_entry *s) { s->error_count = le64toh(s->error_count); s->sqid = le16toh(s->sqid); s->cid = le16toh(s->cid); s->status = le16toh(s->status); s->error_location = le16toh(s->error_location); s->lba = le64toh(s->lba); s->nsid = le32toh(s->nsid); } static inline void
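/*
 * Editor's aside on the command builders above: counts on the wire are
 * 0-based, so an 8-sector read of nsid 1 at LBA 0x100000000 built via
 * nvme_ns_read_cmd(&cmd, 1, 0x100000000ULL, 8) produces cmd.cdw10 == 0,
 * cmd.cdw11 == htole32(1) and cmd.cdw12 == htole32(7).
 */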
nvme_le128toh(void *p) { #if _BYTE_ORDER != _LITTLE_ENDIAN /* Swap 16 bytes in place */ char *tmp = (char*)p; char b; int i; for (i = 0; i < 8; i++) { b = tmp[i]; tmp[i] = tmp[15-i]; tmp[15-i] = b; } #else (void)p; #endif } static inline void nvme_health_information_page_swapbytes(struct nvme_health_information_page *s) { int i; s->temperature = le16toh(s->temperature); nvme_le128toh((void *)s->data_units_read); nvme_le128toh((void *)s->data_units_written); nvme_le128toh((void *)s->host_read_commands); nvme_le128toh((void *)s->host_write_commands); nvme_le128toh((void *)s->controller_busy_time); nvme_le128toh((void *)s->power_cycles); nvme_le128toh((void *)s->power_on_hours); nvme_le128toh((void *)s->unsafe_shutdowns); nvme_le128toh((void *)s->media_errors); nvme_le128toh((void *)s->num_error_info_log_entries); s->warning_temp_time = le32toh(s->warning_temp_time); s->error_temp_time = le32toh(s->error_temp_time); for (i = 0; i < 8; i++) s->temp_sensor[i] = le16toh(s->temp_sensor[i]); } static inline void nvme_firmware_page_swapbytes(struct nvme_firmware_page *s) { int i; for (i = 0; i < 7; i++) s->revision[i] = le64toh(s->revision[i]); } static inline void nvme_ns_list_swapbytes(struct nvme_ns_list *s) { int i; for (i = 0; i < 1024; i++) s->ns[i] = le32toh(s->ns[i]); } static inline void intel_log_temp_stats_swapbytes(struct intel_log_temp_stats *s) { s->current = le64toh(s->current); s->overtemp_flag_last = le64toh(s->overtemp_flag_last); s->overtemp_flag_life = le64toh(s->overtemp_flag_life); s->max_temp = le64toh(s->max_temp); s->min_temp = le64toh(s->min_temp); /* omit _rsvd[] */ s->max_oper_temp = le64toh(s->max_oper_temp); s->min_oper_temp = le64toh(s->min_oper_temp); s->est_offset = le64toh(s->est_offset); } #endif /* __NVME_H__ */ Index: head/sys/dev/nvme/nvme_ctrlr.c =================================================================== --- head/sys/dev/nvme/nvme_ctrlr.c (revision 338181) +++ head/sys/dev/nvme/nvme_ctrlr.c (revision 338182) @@ -1,1414 +1,1415 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (C) 2012-2016 Intel Corporation * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include "opt_cam.h" #include #include #include #include #include #include #include #include #include #include #include #include #include "nvme_private.h" #define B4_CHK_RDY_DELAY_MS 2300 /* work around controller bug */ static void nvme_ctrlr_construct_and_submit_aer(struct nvme_controller *ctrlr, struct nvme_async_event_request *aer); static void nvme_ctrlr_setup_interrupts(struct nvme_controller *ctrlr); static int nvme_ctrlr_allocate_bar(struct nvme_controller *ctrlr) { ctrlr->resource_id = PCIR_BAR(0); ctrlr->resource = bus_alloc_resource_any(ctrlr->dev, SYS_RES_MEMORY, &ctrlr->resource_id, RF_ACTIVE); if (ctrlr->resource == NULL) { nvme_printf(ctrlr, "unable to allocate pci resource\n"); return (ENOMEM); } ctrlr->bus_tag = rman_get_bustag(ctrlr->resource); ctrlr->bus_handle = rman_get_bushandle(ctrlr->resource); ctrlr->regs = (struct nvme_registers *)ctrlr->bus_handle; /* * The NVMe spec allows for the MSI-X table to be placed behind * BAR 4/5, separate from the control/doorbell registers. Always * try to map this bar, because it must be mapped prior to calling * pci_alloc_msix(). If the table isn't behind BAR 4/5, * bus_alloc_resource() will just return NULL which is OK. */ ctrlr->bar4_resource_id = PCIR_BAR(4); ctrlr->bar4_resource = bus_alloc_resource_any(ctrlr->dev, SYS_RES_MEMORY, &ctrlr->bar4_resource_id, RF_ACTIVE); return (0); } static int nvme_ctrlr_construct_admin_qpair(struct nvme_controller *ctrlr) { struct nvme_qpair *qpair; uint32_t num_entries; int error; qpair = &ctrlr->adminq; num_entries = NVME_ADMIN_ENTRIES; TUNABLE_INT_FETCH("hw.nvme.admin_entries", &num_entries); /* * If admin_entries was overridden to an invalid value, revert it * back to our default value. */ if (num_entries < NVME_MIN_ADMIN_ENTRIES || num_entries > NVME_MAX_ADMIN_ENTRIES) { nvme_printf(ctrlr, "invalid hw.nvme.admin_entries=%d " "specified\n", num_entries); num_entries = NVME_ADMIN_ENTRIES; } /* * The admin queue's max xfer size is treated differently than the * max I/O xfer size. 16KB is sufficient here - maybe even less? */ error = nvme_qpair_construct(qpair, 0, /* qpair ID */ 0, /* vector */ num_entries, NVME_ADMIN_TRACKERS, ctrlr); return (error); } static int nvme_ctrlr_construct_io_qpairs(struct nvme_controller *ctrlr) { struct nvme_qpair *qpair; uint32_t cap_lo; uint16_t mqes; int i, error, num_entries, num_trackers; num_entries = NVME_IO_ENTRIES; TUNABLE_INT_FETCH("hw.nvme.io_entries", &num_entries); /* * NVMe spec sets a hard limit of 64K max entries, but * devices may specify a smaller limit, so we need to check * the MQES field in the capabilities register. */ cap_lo = nvme_mmio_read_4(ctrlr, cap_lo); mqes = (cap_lo >> NVME_CAP_LO_REG_MQES_SHIFT) & NVME_CAP_LO_REG_MQES_MASK; num_entries = min(num_entries, mqes + 1); num_trackers = NVME_IO_TRACKERS; TUNABLE_INT_FETCH("hw.nvme.io_trackers", &num_trackers); num_trackers = max(num_trackers, NVME_MIN_IO_TRACKERS); num_trackers = min(num_trackers, NVME_MAX_IO_TRACKERS); /* * No need to have more trackers than entries in the submit queue. * Note also that for a queue size of N, we can only have (N-1) * commands outstanding, hence the "-1" here. */ num_trackers = min(num_trackers, (num_entries-1)); /* * Our best estimate for the maximum number of I/Os that we should * normally have in flight at one time. This should be viewed as a hint, * not a hard limit and will need to be revisited when the upper layers * of the storage system grow multi-queue support.
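 * For example (editor's illustration, numbers hypothetical): eight
 * I/O queues with 128 trackers each would yield max_hw_pend_io =
 * 8 * 128 * 3 / 4 = 768.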
*/ ctrlr->max_hw_pend_io = num_trackers * ctrlr->num_io_queues * 3 / 4; /* * This was calculated previously when setting up interrupts, but * a controller could theoretically support fewer I/O queues than * MSI-X vectors. So calculate again here just to be safe. */ ctrlr->num_cpus_per_ioq = howmany(mp_ncpus, ctrlr->num_io_queues); ctrlr->ioq = malloc(ctrlr->num_io_queues * sizeof(struct nvme_qpair), M_NVME, M_ZERO | M_WAITOK); for (i = 0; i < ctrlr->num_io_queues; i++) { qpair = &ctrlr->ioq[i]; /* * Admin queue has ID=0. IO queues start at ID=1 - * hence the 'i+1' here. * * For I/O queues, use the controller-wide max_xfer_size * calculated in nvme_attach(). */ error = nvme_qpair_construct(qpair, i+1, /* qpair ID */ ctrlr->msix_enabled ? i+1 : 0, /* vector */ num_entries, num_trackers, ctrlr); if (error) return (error); /* * Do not bother binding interrupts if we only have one I/O * interrupt thread for this controller. */ if (ctrlr->num_io_queues > 1) bus_bind_intr(ctrlr->dev, qpair->res, i * ctrlr->num_cpus_per_ioq); } return (0); } static void nvme_ctrlr_fail(struct nvme_controller *ctrlr) { int i; ctrlr->is_failed = TRUE; nvme_qpair_fail(&ctrlr->adminq); if (ctrlr->ioq != NULL) { for (i = 0; i < ctrlr->num_io_queues; i++) nvme_qpair_fail(&ctrlr->ioq[i]); } nvme_notify_fail_consumers(ctrlr); } void nvme_ctrlr_post_failed_request(struct nvme_controller *ctrlr, struct nvme_request *req) { mtx_lock(&ctrlr->lock); STAILQ_INSERT_TAIL(&ctrlr->fail_req, req, stailq); mtx_unlock(&ctrlr->lock); taskqueue_enqueue(ctrlr->taskqueue, &ctrlr->fail_req_task); } static void nvme_ctrlr_fail_req_task(void *arg, int pending) { struct nvme_controller *ctrlr = arg; struct nvme_request *req; mtx_lock(&ctrlr->lock); while ((req = STAILQ_FIRST(&ctrlr->fail_req)) != NULL) { STAILQ_REMOVE_HEAD(&ctrlr->fail_req, stailq); mtx_unlock(&ctrlr->lock); nvme_qpair_manual_complete_request(req->qpair, req, NVME_SCT_GENERIC, NVME_SC_ABORTED_BY_REQUEST, TRUE); mtx_lock(&ctrlr->lock); } mtx_unlock(&ctrlr->lock); } static int nvme_ctrlr_wait_for_ready(struct nvme_controller *ctrlr, int desired_val) { int ms_waited; uint32_t csts; csts = nvme_mmio_read_4(ctrlr, csts); ms_waited = 0; while (((csts >> NVME_CSTS_REG_RDY_SHIFT) & NVME_CSTS_REG_RDY_MASK) != desired_val) { if (ms_waited++ > ctrlr->ready_timeout_in_ms) { nvme_printf(ctrlr, "controller ready did not become %d " "within %d ms\n", desired_val, ctrlr->ready_timeout_in_ms); return (ENXIO); } DELAY(1000); csts = nvme_mmio_read_4(ctrlr, csts); } return (0); } static int nvme_ctrlr_disable(struct nvme_controller *ctrlr) { uint32_t cc; uint32_t csts; uint8_t en, rdy; int err; cc = nvme_mmio_read_4(ctrlr, cc); csts = nvme_mmio_read_4(ctrlr, csts); en = (cc >> NVME_CC_REG_EN_SHIFT) & NVME_CC_REG_EN_MASK; rdy = (csts >> NVME_CSTS_REG_RDY_SHIFT) & NVME_CSTS_REG_RDY_MASK; /* * Per 3.1.5 in NVME 1.3 spec, transitioning CC.EN from 0 to 1 * when CSTS.RDY is 1 or transitioning CC.EN from 1 to 0 when * CSTS.RDY is 0 "has undefined results" So make sure that CSTS.RDY * isn't the desired value. Short circuit if we're already disabled. 
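 *
 * As a table (editor's summary of the code that follows):
 *
 *	EN  RDY	action
 *	 1   0	wait for RDY==1, then clear EN and wait for RDY==0
 *	 1   1	clear EN and wait for RDY==0
 *	 0   1	wait for RDY==0
 *	 0   0	nothing to do; already disabled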
*/ if (en == 1) { if (rdy == 0) { /* EN == 1, wait for RDY == 1 or fail */ err = nvme_ctrlr_wait_for_ready(ctrlr, 1); if (err != 0) return (err); } } else { /* EN == 0 already wait for RDY == 0 */ if (rdy == 0) return (0); else return (nvme_ctrlr_wait_for_ready(ctrlr, 0)); } cc &= ~NVME_CC_REG_EN_MASK; nvme_mmio_write_4(ctrlr, cc, cc); /* * Some drives have issues with accessing the mmio after we * disable, so delay for a bit after we write the bit to * cope with these issues. */ if (ctrlr->quirks & QUIRK_DELAY_B4_CHK_RDY) pause("nvmeR", B4_CHK_RDY_DELAY_MS * hz / 1000); return (nvme_ctrlr_wait_for_ready(ctrlr, 0)); } static int nvme_ctrlr_enable(struct nvme_controller *ctrlr) { uint32_t cc; uint32_t csts; uint32_t aqa; uint32_t qsize; uint8_t en, rdy; int err; cc = nvme_mmio_read_4(ctrlr, cc); csts = nvme_mmio_read_4(ctrlr, csts); en = (cc >> NVME_CC_REG_EN_SHIFT) & NVME_CC_REG_EN_MASK; rdy = (csts >> NVME_CSTS_REG_RDY_SHIFT) & NVME_CSTS_REG_RDY_MASK; /* * See note in nvme_ctrlr_disable. Short circuit if we're already enabled. */ if (en == 1) { if (rdy == 1) return (0); else return (nvme_ctrlr_wait_for_ready(ctrlr, 1)); } else { /* EN == 0 already wait for RDY == 0 or fail */ err = nvme_ctrlr_wait_for_ready(ctrlr, 0); if (err != 0) return (err); } nvme_mmio_write_8(ctrlr, asq, ctrlr->adminq.cmd_bus_addr); DELAY(5000); nvme_mmio_write_8(ctrlr, acq, ctrlr->adminq.cpl_bus_addr); DELAY(5000); /* acqs and asqs are 0-based. */ qsize = ctrlr->adminq.num_entries - 1; aqa = 0; aqa = (qsize & NVME_AQA_REG_ACQS_MASK) << NVME_AQA_REG_ACQS_SHIFT; aqa |= (qsize & NVME_AQA_REG_ASQS_MASK) << NVME_AQA_REG_ASQS_SHIFT; nvme_mmio_write_4(ctrlr, aqa, aqa); DELAY(5000); /* Initialization values for CC */ cc = 0; cc |= 1 << NVME_CC_REG_EN_SHIFT; cc |= 0 << NVME_CC_REG_CSS_SHIFT; cc |= 0 << NVME_CC_REG_AMS_SHIFT; cc |= 0 << NVME_CC_REG_SHN_SHIFT; cc |= 6 << NVME_CC_REG_IOSQES_SHIFT; /* SQ entry size == 64 == 2^6 */ cc |= 4 << NVME_CC_REG_IOCQES_SHIFT; /* CQ entry size == 16 == 2^4 */ /* This evaluates to 0, which is according to spec. */ cc |= (PAGE_SIZE >> 13) << NVME_CC_REG_MPS_SHIFT; nvme_mmio_write_4(ctrlr, cc, cc); return (nvme_ctrlr_wait_for_ready(ctrlr, 1)); } int nvme_ctrlr_hw_reset(struct nvme_controller *ctrlr) { int i, err; nvme_admin_qpair_disable(&ctrlr->adminq); /* * I/O queues are not allocated before the initial HW * reset, so do not try to disable them. Use is_initialized * to determine if this is the initial HW reset. */ if (ctrlr->is_initialized) { for (i = 0; i < ctrlr->num_io_queues; i++) nvme_io_qpair_disable(&ctrlr->ioq[i]); } DELAY(100*1000); err = nvme_ctrlr_disable(ctrlr); if (err != 0) return err; return (nvme_ctrlr_enable(ctrlr)); } void nvme_ctrlr_reset(struct nvme_controller *ctrlr) { int cmpset; cmpset = atomic_cmpset_32(&ctrlr->is_resetting, 0, 1); if (cmpset == 0 || ctrlr->is_failed) /* * Controller is already resetting or has failed. Return * immediately since there is no need to kick off another * reset in these cases. 
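 * The cmpset also serializes concurrent callers: only the thread that
 * flips is_resetting from 0 to 1 enqueues the reset task, and
 * nvme_ctrlr_reset_task() clears the flag again once it finishes.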
*/ return; taskqueue_enqueue(ctrlr->taskqueue, &ctrlr->reset_task); } static int nvme_ctrlr_identify(struct nvme_controller *ctrlr) { struct nvme_completion_poll_status status; status.done = 0; nvme_ctrlr_cmd_identify_controller(ctrlr, &ctrlr->cdata, nvme_completion_poll_cb, &status); while (!atomic_load_acq_int(&status.done)) pause("nvme", 1); if (nvme_completion_is_error(&status.cpl)) { nvme_printf(ctrlr, "nvme_identify_controller failed!\n"); return (ENXIO); } /* Convert data to host endian */ nvme_controller_data_swapbytes(&ctrlr->cdata); /* * Use MDTS to ensure our default max_xfer_size doesn't exceed what the * controller supports. */ if (ctrlr->cdata.mdts > 0) ctrlr->max_xfer_size = min(ctrlr->max_xfer_size, ctrlr->min_page_size * (1 << (ctrlr->cdata.mdts))); return (0); } static int nvme_ctrlr_set_num_qpairs(struct nvme_controller *ctrlr) { struct nvme_completion_poll_status status; int cq_allocated, sq_allocated; status.done = 0; nvme_ctrlr_cmd_set_num_queues(ctrlr, ctrlr->num_io_queues, nvme_completion_poll_cb, &status); while (!atomic_load_acq_int(&status.done)) pause("nvme", 1); if (nvme_completion_is_error(&status.cpl)) { nvme_printf(ctrlr, "nvme_ctrlr_set_num_qpairs failed!\n"); return (ENXIO); } /* * Data in cdw0 is 0-based. * Lower 16-bits indicate number of submission queues allocated. * Upper 16-bits indicate number of completion queues allocated. */ sq_allocated = (status.cpl.cdw0 & 0xFFFF) + 1; cq_allocated = (status.cpl.cdw0 >> 16) + 1; /* * Controller may allocate more queues than we requested, * so use the minimum of the number requested and what was * actually allocated. */ ctrlr->num_io_queues = min(ctrlr->num_io_queues, sq_allocated); ctrlr->num_io_queues = min(ctrlr->num_io_queues, cq_allocated); return (0); } static int nvme_ctrlr_create_qpairs(struct nvme_controller *ctrlr) { struct nvme_completion_poll_status status; struct nvme_qpair *qpair; int i; for (i = 0; i < ctrlr->num_io_queues; i++) { qpair = &ctrlr->ioq[i]; status.done = 0; nvme_ctrlr_cmd_create_io_cq(ctrlr, qpair, qpair->vector, nvme_completion_poll_cb, &status); while (!atomic_load_acq_int(&status.done)) pause("nvme", 1); if (nvme_completion_is_error(&status.cpl)) { nvme_printf(ctrlr, "nvme_create_io_cq failed!\n"); return (ENXIO); } status.done = 0; nvme_ctrlr_cmd_create_io_sq(qpair->ctrlr, qpair, nvme_completion_poll_cb, &status); while (!atomic_load_acq_int(&status.done)) pause("nvme", 1); if (nvme_completion_is_error(&status.cpl)) { nvme_printf(ctrlr, "nvme_create_io_sq failed!\n"); return (ENXIO); } } return (0); } static int nvme_ctrlr_destroy_qpair(struct nvme_controller *ctrlr, struct nvme_qpair *qpair) { struct nvme_completion_poll_status status; status.done = 0; nvme_ctrlr_cmd_delete_io_sq(ctrlr, qpair, nvme_completion_poll_cb, &status); while (!atomic_load_acq_int(&status.done)) pause("nvme", 1); if (nvme_completion_is_error(&status.cpl)) { nvme_printf(ctrlr, "nvme_destroy_io_sq failed!\n"); return (ENXIO); } status.done = 0; nvme_ctrlr_cmd_delete_io_cq(ctrlr, qpair, nvme_completion_poll_cb, &status); while (!atomic_load_acq_int(&status.done)) pause("nvme", 1); if (nvme_completion_is_error(&status.cpl)) { nvme_printf(ctrlr, "nvme_destroy_io_cq failed!\n"); return (ENXIO); } return (0); } static int nvme_ctrlr_construct_namespaces(struct nvme_controller *ctrlr) { struct nvme_namespace *ns; uint32_t i; for (i = 0; i < min(ctrlr->cdata.nn, NVME_MAX_NAMESPACES); i++) { ns = &ctrlr->ns[i]; nvme_ns_construct(ns, i+1, ctrlr); } return (0); } static boolean_t is_log_page_id_valid(uint8_t 
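/*
 * Editor's worked example of the cdw0 decoding above (values
 * hypothetical): a SET FEATURES / NUMBER OF QUEUES completion with
 * cdw0 == 0x00030007 reports (0x0007 + 1) = 8 submission queues and
 * (0x0003 + 1) = 4 completion queues allocated, so the driver runs
 * with min(requested, 8, 4) I/O queues.
 */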
page_id) { switch (page_id) { case NVME_LOG_ERROR: case NVME_LOG_HEALTH_INFORMATION: case NVME_LOG_FIRMWARE_SLOT: case NVME_LOG_CHANGED_NAMESPACE: return (TRUE); } return (FALSE); } static uint32_t nvme_ctrlr_get_log_page_size(struct nvme_controller *ctrlr, uint8_t page_id) { uint32_t log_page_size; switch (page_id) { case NVME_LOG_ERROR: log_page_size = min( sizeof(struct nvme_error_information_entry) * (ctrlr->cdata.elpe + 1), NVME_MAX_AER_LOG_SIZE); break; case NVME_LOG_HEALTH_INFORMATION: log_page_size = sizeof(struct nvme_health_information_page); break; case NVME_LOG_FIRMWARE_SLOT: log_page_size = sizeof(struct nvme_firmware_page); break; case NVME_LOG_CHANGED_NAMESPACE: log_page_size = sizeof(struct nvme_ns_list); break; default: log_page_size = 0; break; } return (log_page_size); } static void nvme_ctrlr_log_critical_warnings(struct nvme_controller *ctrlr, uint8_t state) { if (state & NVME_CRIT_WARN_ST_AVAILABLE_SPARE) nvme_printf(ctrlr, "available spare space below threshold\n"); if (state & NVME_CRIT_WARN_ST_TEMPERATURE) nvme_printf(ctrlr, "temperature above threshold\n"); if (state & NVME_CRIT_WARN_ST_DEVICE_RELIABILITY) nvme_printf(ctrlr, "device reliability degraded\n"); if (state & NVME_CRIT_WARN_ST_READ_ONLY) nvme_printf(ctrlr, "media placed in read only mode\n"); if (state & NVME_CRIT_WARN_ST_VOLATILE_MEMORY_BACKUP) nvme_printf(ctrlr, "volatile memory backup device failed\n"); if (state & NVME_CRIT_WARN_ST_RESERVED_MASK) nvme_printf(ctrlr, "unknown critical warning(s): state = 0x%02x\n", state); } static void nvme_ctrlr_async_event_log_page_cb(void *arg, const struct nvme_completion *cpl) { struct nvme_async_event_request *aer = arg; struct nvme_health_information_page *health_info; struct nvme_ns_list *nsl; struct nvme_error_information_entry *err; int i; /* * If the log page fetch for some reason completed with an error, * don't pass log page data to the consumers. In practice, this case * should never happen. */ if (nvme_completion_is_error(cpl)) nvme_notify_async_consumers(aer->ctrlr, &aer->cpl, aer->log_page_id, NULL, 0); else { /* Convert data to host endian */ switch (aer->log_page_id) { case NVME_LOG_ERROR: err = (struct nvme_error_information_entry *)aer->log_page_buffer; for (i = 0; i < (aer->ctrlr->cdata.elpe + 1); i++) nvme_error_information_entry_swapbytes(err++); break; case NVME_LOG_HEALTH_INFORMATION: nvme_health_information_page_swapbytes( (struct nvme_health_information_page *)aer->log_page_buffer); break; case NVME_LOG_FIRMWARE_SLOT: nvme_firmware_page_swapbytes( (struct nvme_firmware_page *)aer->log_page_buffer); break; case NVME_LOG_CHANGED_NAMESPACE: nvme_ns_list_swapbytes( (struct nvme_ns_list *)aer->log_page_buffer); break; case INTEL_LOG_TEMP_STATS: intel_log_temp_stats_swapbytes( (struct intel_log_temp_stats *)aer->log_page_buffer); break; default: break; } if (aer->log_page_id == NVME_LOG_HEALTH_INFORMATION) { health_info = (struct nvme_health_information_page *) aer->log_page_buffer; nvme_ctrlr_log_critical_warnings(aer->ctrlr, health_info->critical_warning); /* * Critical warnings reported through the * SMART/health log page are persistent, so * clear the associated bits in the async event * config so that we do not receive repeated * notifications for the same event. 
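 * For example, once a temperature warning has been delivered,
 * NVME_CRIT_WARN_ST_TEMPERATURE is stripped from async_event_config
 * and the trimmed mask is pushed back to the controller by the
 * SET FEATURES call below.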
*/ aer->ctrlr->async_event_config &= ~health_info->critical_warning; nvme_ctrlr_cmd_set_async_event_config(aer->ctrlr, aer->ctrlr->async_event_config, NULL, NULL); } else if (aer->log_page_id == NVME_LOG_CHANGED_NAMESPACE && !nvme_use_nvd) { nsl = (struct nvme_ns_list *)aer->log_page_buffer; for (i = 0; i < nitems(nsl->ns) && nsl->ns[i] != 0; i++) { if (nsl->ns[i] > NVME_MAX_NAMESPACES) break; nvme_notify_ns(aer->ctrlr, nsl->ns[i]); } } /* * Pass the cpl data from the original async event completion, * not the log page fetch. */ nvme_notify_async_consumers(aer->ctrlr, &aer->cpl, aer->log_page_id, aer->log_page_buffer, aer->log_page_size); } /* * Repost another asynchronous event request to replace the one * that just completed. */ nvme_ctrlr_construct_and_submit_aer(aer->ctrlr, aer); } static void nvme_ctrlr_async_event_cb(void *arg, const struct nvme_completion *cpl) { struct nvme_async_event_request *aer = arg; if (nvme_completion_is_error(cpl)) { /* * Do not retry failed async event requests. This avoids * infinite loops where a new async event request is submitted * to replace the one just failed, only to fail again and * perpetuate the loop. */ return; } /* Associated log page is in bits 23:16 of completion entry dw0. */ aer->log_page_id = (cpl->cdw0 & 0xFF0000) >> 16; nvme_printf(aer->ctrlr, "async event occurred (type 0x%x, info 0x%02x," " page 0x%02x)\n", (cpl->cdw0 & 0x03), (cpl->cdw0 & 0xFF00) >> 8, aer->log_page_id); if (is_log_page_id_valid(aer->log_page_id)) { aer->log_page_size = nvme_ctrlr_get_log_page_size(aer->ctrlr, aer->log_page_id); memcpy(&aer->cpl, cpl, sizeof(*cpl)); nvme_ctrlr_cmd_get_log_page(aer->ctrlr, aer->log_page_id, NVME_GLOBAL_NAMESPACE_TAG, aer->log_page_buffer, aer->log_page_size, nvme_ctrlr_async_event_log_page_cb, aer); /* Wait to notify consumers until after log page is fetched. */ } else { nvme_notify_async_consumers(aer->ctrlr, cpl, aer->log_page_id, NULL, 0); /* * Repost another asynchronous event request to replace the one * that just completed. */ nvme_ctrlr_construct_and_submit_aer(aer->ctrlr, aer); } } static void nvme_ctrlr_construct_and_submit_aer(struct nvme_controller *ctrlr, struct nvme_async_event_request *aer) { struct nvme_request *req; aer->ctrlr = ctrlr; req = nvme_allocate_request_null(nvme_ctrlr_async_event_cb, aer); aer->req = req; /* * Disable timeout here, since asynchronous event requests should by * nature never be timed out. 
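 * An AER may legitimately sit in the controller for hours before an
 * event fires, so a watchdog here would only generate spurious aborts.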
*/ req->timeout = FALSE; - req->cmd.opc_fuse = NVME_CMD_SET_OPC(NVME_OPC_ASYNC_EVENT_REQUEST); + req->cmd.opc = NVME_OPC_ASYNC_EVENT_REQUEST; nvme_ctrlr_submit_admin_request(ctrlr, req); } static void nvme_ctrlr_configure_aer(struct nvme_controller *ctrlr) { struct nvme_completion_poll_status status; struct nvme_async_event_request *aer; uint32_t i; ctrlr->async_event_config = NVME_CRIT_WARN_ST_AVAILABLE_SPARE | NVME_CRIT_WARN_ST_DEVICE_RELIABILITY | NVME_CRIT_WARN_ST_READ_ONLY | NVME_CRIT_WARN_ST_VOLATILE_MEMORY_BACKUP; if (ctrlr->cdata.ver >= NVME_REV(1, 2)) ctrlr->async_event_config |= 0x300; status.done = 0; nvme_ctrlr_cmd_get_feature(ctrlr, NVME_FEAT_TEMPERATURE_THRESHOLD, 0, NULL, 0, nvme_completion_poll_cb, &status); while (!atomic_load_acq_int(&status.done)) pause("nvme", 1); if (nvme_completion_is_error(&status.cpl) || (status.cpl.cdw0 & 0xFFFF) == 0xFFFF || (status.cpl.cdw0 & 0xFFFF) == 0x0000) { nvme_printf(ctrlr, "temperature threshold not supported\n"); } else ctrlr->async_event_config |= NVME_CRIT_WARN_ST_TEMPERATURE; nvme_ctrlr_cmd_set_async_event_config(ctrlr, ctrlr->async_event_config, NULL, NULL); /* aerl is a zero-based value, so we need to add 1 here. */ ctrlr->num_aers = min(NVME_MAX_ASYNC_EVENTS, (ctrlr->cdata.aerl+1)); for (i = 0; i < ctrlr->num_aers; i++) { aer = &ctrlr->aer[i]; nvme_ctrlr_construct_and_submit_aer(ctrlr, aer); } } static void nvme_ctrlr_configure_int_coalescing(struct nvme_controller *ctrlr) { ctrlr->int_coal_time = 0; TUNABLE_INT_FETCH("hw.nvme.int_coal_time", &ctrlr->int_coal_time); ctrlr->int_coal_threshold = 0; TUNABLE_INT_FETCH("hw.nvme.int_coal_threshold", &ctrlr->int_coal_threshold); nvme_ctrlr_cmd_set_interrupt_coalescing(ctrlr, ctrlr->int_coal_time, ctrlr->int_coal_threshold, NULL, NULL); } static void nvme_ctrlr_start(void *ctrlr_arg) { struct nvme_controller *ctrlr = ctrlr_arg; uint32_t old_num_io_queues; int i; /* * Only reset adminq here when we are restarting the * controller after a reset. During initialization, * we have already submitted admin commands to get * the number of I/O queues supported, so cannot reset * the adminq again here. */ if (ctrlr->is_resetting) { nvme_qpair_reset(&ctrlr->adminq); } for (i = 0; i < ctrlr->num_io_queues; i++) nvme_qpair_reset(&ctrlr->ioq[i]); nvme_admin_qpair_enable(&ctrlr->adminq); if (nvme_ctrlr_identify(ctrlr) != 0) { nvme_ctrlr_fail(ctrlr); return; } /* * The number of qpairs is determined during controller initialization, * including using NVMe SET_FEATURES/NUMBER_OF_QUEUES to determine the * HW limit. We call SET_FEATURES again here so that it gets called * after any reset for controllers that depend on the driver to * explicitly specify how many queues it will use. This value should * never change between resets, so panic if somehow that does happen.
*/ if (ctrlr->is_resetting) { old_num_io_queues = ctrlr->num_io_queues; if (nvme_ctrlr_set_num_qpairs(ctrlr) != 0) { nvme_ctrlr_fail(ctrlr); return; } if (old_num_io_queues != ctrlr->num_io_queues) { panic("num_io_queues changed from %u to %u", old_num_io_queues, ctrlr->num_io_queues); } } if (nvme_ctrlr_create_qpairs(ctrlr) != 0) { nvme_ctrlr_fail(ctrlr); return; } if (nvme_ctrlr_construct_namespaces(ctrlr) != 0) { nvme_ctrlr_fail(ctrlr); return; } nvme_ctrlr_configure_aer(ctrlr); nvme_ctrlr_configure_int_coalescing(ctrlr); for (i = 0; i < ctrlr->num_io_queues; i++) nvme_io_qpair_enable(&ctrlr->ioq[i]); } void nvme_ctrlr_start_config_hook(void *arg) { struct nvme_controller *ctrlr = arg; nvme_qpair_reset(&ctrlr->adminq); nvme_admin_qpair_enable(&ctrlr->adminq); if (nvme_ctrlr_set_num_qpairs(ctrlr) == 0 && nvme_ctrlr_construct_io_qpairs(ctrlr) == 0) nvme_ctrlr_start(ctrlr); else nvme_ctrlr_fail(ctrlr); nvme_sysctl_initialize_ctrlr(ctrlr); config_intrhook_disestablish(&ctrlr->config_hook); ctrlr->is_initialized = 1; nvme_notify_new_controller(ctrlr); } static void nvme_ctrlr_reset_task(void *arg, int pending) { struct nvme_controller *ctrlr = arg; int status; nvme_printf(ctrlr, "resetting controller\n"); status = nvme_ctrlr_hw_reset(ctrlr); /* * Use pause instead of DELAY, so that we yield to any nvme interrupt * handlers on this CPU that were blocked on a qpair lock. We want * all nvme interrupts completed before proceeding with restarting the * controller. * * XXX - any way to guarantee the interrupt handlers have quiesced? */ pause("nvmereset", hz / 10); if (status == 0) nvme_ctrlr_start(ctrlr); else nvme_ctrlr_fail(ctrlr); atomic_cmpset_32(&ctrlr->is_resetting, 1, 0); } /* * Poll all the queues enabled on the device for completion. */ void nvme_ctrlr_poll(struct nvme_controller *ctrlr) { int i; nvme_qpair_process_completions(&ctrlr->adminq); for (i = 0; i < ctrlr->num_io_queues; i++) if (ctrlr->ioq && ctrlr->ioq[i].cpl) nvme_qpair_process_completions(&ctrlr->ioq[i]); } /* * Poll the single-vector interrupt case: num_io_queues will be 1 and * there's only a single vector. While we're polling, we mask further * interrupts in the controller.
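 * Masking via INTMS before the poll and unmasking via INTMC afterwards
 * keeps the (level-triggered) INTx line from re-asserting while
 * completions are being drained.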
*/ void nvme_ctrlr_intx_handler(void *arg) { struct nvme_controller *ctrlr = arg; nvme_mmio_write_4(ctrlr, intms, 1); nvme_ctrlr_poll(ctrlr); nvme_mmio_write_4(ctrlr, intmc, 1); } static int nvme_ctrlr_configure_intx(struct nvme_controller *ctrlr) { ctrlr->msix_enabled = 0; ctrlr->num_io_queues = 1; ctrlr->num_cpus_per_ioq = mp_ncpus; ctrlr->rid = 0; ctrlr->res = bus_alloc_resource_any(ctrlr->dev, SYS_RES_IRQ, &ctrlr->rid, RF_SHAREABLE | RF_ACTIVE); if (ctrlr->res == NULL) { nvme_printf(ctrlr, "unable to allocate shared IRQ\n"); return (ENOMEM); } bus_setup_intr(ctrlr->dev, ctrlr->res, INTR_TYPE_MISC | INTR_MPSAFE, NULL, nvme_ctrlr_intx_handler, ctrlr, &ctrlr->tag); if (ctrlr->tag == NULL) { nvme_printf(ctrlr, "unable to setup intx handler\n"); return (ENOMEM); } return (0); } static void nvme_pt_done(void *arg, const struct nvme_completion *cpl) { struct nvme_pt_command *pt = arg; struct mtx *mtx = pt->driver_lock; uint16_t status; bzero(&pt->cpl, sizeof(pt->cpl)); pt->cpl.cdw0 = cpl->cdw0; status = cpl->status; status &= ~NVME_STATUS_P_MASK; pt->cpl.status = status; mtx_lock(mtx); pt->driver_lock = NULL; wakeup(pt); mtx_unlock(mtx); } int nvme_ctrlr_passthrough_cmd(struct nvme_controller *ctrlr, struct nvme_pt_command *pt, uint32_t nsid, int is_user_buffer, int is_admin_cmd) { struct nvme_request *req; struct mtx *mtx; struct buf *buf = NULL; int ret = 0; vm_offset_t addr, end; if (pt->len > 0) { /* * vmapbuf calls vm_fault_quick_hold_pages which only maps full * pages. Ensure this request has fewer than MAXPHYS bytes when * extended to full pages. */ addr = (vm_offset_t)pt->buf; end = round_page(addr + pt->len); addr = trunc_page(addr); if (end - addr > MAXPHYS) return EIO; if (pt->len > ctrlr->max_xfer_size) { nvme_printf(ctrlr, "pt->len (%d) " "exceeds max_xfer_size (%d)\n", pt->len, ctrlr->max_xfer_size); return EIO; } if (is_user_buffer) { /* * Ensure the user buffer is wired for the duration of * this passthrough command. */ PHOLD(curproc); buf = getpbuf(NULL); buf->b_data = pt->buf; buf->b_bufsize = pt->len; buf->b_iocmd = pt->is_read ? 
BIO_READ : BIO_WRITE; #ifdef NVME_UNMAPPED_BIO_SUPPORT if (vmapbuf(buf, 1) < 0) { #else if (vmapbuf(buf) < 0) { #endif ret = EFAULT; goto err; } req = nvme_allocate_request_vaddr(buf->b_data, pt->len, nvme_pt_done, pt); } else req = nvme_allocate_request_vaddr(pt->buf, pt->len, nvme_pt_done, pt); } else req = nvme_allocate_request_null(nvme_pt_done, pt); /* Assume userspace already converted to little-endian */ - req->cmd.opc_fuse = pt->cmd.opc_fuse; + req->cmd.opc = pt->cmd.opc; + req->cmd.fuse = pt->cmd.fuse; req->cmd.cdw10 = pt->cmd.cdw10; req->cmd.cdw11 = pt->cmd.cdw11; req->cmd.cdw12 = pt->cmd.cdw12; req->cmd.cdw13 = pt->cmd.cdw13; req->cmd.cdw14 = pt->cmd.cdw14; req->cmd.cdw15 = pt->cmd.cdw15; req->cmd.nsid = htole32(nsid); mtx = mtx_pool_find(mtxpool_sleep, pt); pt->driver_lock = mtx; if (is_admin_cmd) nvme_ctrlr_submit_admin_request(ctrlr, req); else nvme_ctrlr_submit_io_request(ctrlr, req); mtx_lock(mtx); while (pt->driver_lock != NULL) mtx_sleep(pt, mtx, PRIBIO, "nvme_pt", 0); mtx_unlock(mtx); err: if (buf != NULL) { relpbuf(buf, NULL); PRELE(curproc); } return (ret); } static int nvme_ctrlr_ioctl(struct cdev *cdev, u_long cmd, caddr_t arg, int flag, struct thread *td) { struct nvme_controller *ctrlr; struct nvme_pt_command *pt; ctrlr = cdev->si_drv1; switch (cmd) { case NVME_RESET_CONTROLLER: nvme_ctrlr_reset(ctrlr); break; case NVME_PASSTHROUGH_CMD: pt = (struct nvme_pt_command *)arg; return (nvme_ctrlr_passthrough_cmd(ctrlr, pt, le32toh(pt->cmd.nsid), 1 /* is_user_buffer */, 1 /* is_admin_cmd */)); default: return (ENOTTY); } return (0); } static struct cdevsw nvme_ctrlr_cdevsw = { .d_version = D_VERSION, .d_flags = 0, .d_ioctl = nvme_ctrlr_ioctl }; static void nvme_ctrlr_setup_interrupts(struct nvme_controller *ctrlr) { device_t dev; int per_cpu_io_queues; int min_cpus_per_ioq; int num_vectors_requested, num_vectors_allocated; int num_vectors_available; dev = ctrlr->dev; min_cpus_per_ioq = 1; TUNABLE_INT_FETCH("hw.nvme.min_cpus_per_ioq", &min_cpus_per_ioq); if (min_cpus_per_ioq < 1) { min_cpus_per_ioq = 1; } else if (min_cpus_per_ioq > mp_ncpus) { min_cpus_per_ioq = mp_ncpus; } per_cpu_io_queues = 1; TUNABLE_INT_FETCH("hw.nvme.per_cpu_io_queues", &per_cpu_io_queues); if (per_cpu_io_queues == 0) { min_cpus_per_ioq = mp_ncpus; } ctrlr->force_intx = 0; TUNABLE_INT_FETCH("hw.nvme.force_intx", &ctrlr->force_intx); /* * FreeBSD currently cannot allocate more than about 190 vectors at * boot, meaning that systems with high core count and many devices * requesting per-CPU interrupt vectors will not get their full * allotment. So first, try to allocate as many as we may need to * understand what is available, then immediately release them. * Then figure out how many of those we will actually use, based on * assigning an equal number of cores to each I/O queue. */ /* One vector for per core I/O queue, plus one vector for admin queue. */ num_vectors_available = min(pci_msix_count(dev), mp_ncpus + 1); if (pci_alloc_msix(dev, &num_vectors_available) != 0) { num_vectors_available = 0; } pci_release_msi(dev); if (ctrlr->force_intx || num_vectors_available < 2) { nvme_ctrlr_configure_intx(ctrlr); return; } /* * Do not use all vectors for I/O queues - one must be saved for the * admin queue. 
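 * For example (editor's illustration): with 32 CPUs, 17 available
 * vectors and the default min_cpus_per_ioq of 1, num_cpus_per_ioq =
 * max(1, howmany(32, 16)) = 2, so num_io_queues = howmany(32, 2) = 16
 * and 16 + 1 = 17 vectors are requested.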
*/ ctrlr->num_cpus_per_ioq = max(min_cpus_per_ioq, howmany(mp_ncpus, num_vectors_available - 1)); ctrlr->num_io_queues = howmany(mp_ncpus, ctrlr->num_cpus_per_ioq); num_vectors_requested = ctrlr->num_io_queues + 1; num_vectors_allocated = num_vectors_requested; /* * Now just allocate the number of vectors we need. This should * succeed, since we previously called pci_alloc_msix() * successfully returning at least this many vectors, but just to * be safe, if something goes wrong just revert to INTx. */ if (pci_alloc_msix(dev, &num_vectors_allocated) != 0) { nvme_ctrlr_configure_intx(ctrlr); return; } if (num_vectors_allocated < num_vectors_requested) { pci_release_msi(dev); nvme_ctrlr_configure_intx(ctrlr); return; } ctrlr->msix_enabled = 1; } int nvme_ctrlr_construct(struct nvme_controller *ctrlr, device_t dev) { struct make_dev_args md_args; uint32_t cap_lo; uint32_t cap_hi; uint8_t to; uint8_t dstrd; uint8_t mpsmin; int status, timeout_period; ctrlr->dev = dev; mtx_init(&ctrlr->lock, "nvme ctrlr lock", NULL, MTX_DEF); status = nvme_ctrlr_allocate_bar(ctrlr); if (status != 0) return (status); /* * Software emulators may set the doorbell stride to something * other than zero, but this driver is not set up to handle that. */ cap_hi = nvme_mmio_read_4(ctrlr, cap_hi); dstrd = (cap_hi >> NVME_CAP_HI_REG_DSTRD_SHIFT) & NVME_CAP_HI_REG_DSTRD_MASK; if (dstrd != 0) return (ENXIO); mpsmin = (cap_hi >> NVME_CAP_HI_REG_MPSMIN_SHIFT) & NVME_CAP_HI_REG_MPSMIN_MASK; ctrlr->min_page_size = 1 << (12 + mpsmin); /* Get ready timeout value from controller, in units of 500ms. */ cap_lo = nvme_mmio_read_4(ctrlr, cap_lo); to = (cap_lo >> NVME_CAP_LO_REG_TO_SHIFT) & NVME_CAP_LO_REG_TO_MASK; ctrlr->ready_timeout_in_ms = to * 500; timeout_period = NVME_DEFAULT_TIMEOUT_PERIOD; TUNABLE_INT_FETCH("hw.nvme.timeout_period", &timeout_period); timeout_period = min(timeout_period, NVME_MAX_TIMEOUT_PERIOD); timeout_period = max(timeout_period, NVME_MIN_TIMEOUT_PERIOD); ctrlr->timeout_period = timeout_period; nvme_retry_count = NVME_DEFAULT_RETRY_COUNT; TUNABLE_INT_FETCH("hw.nvme.retry_count", &nvme_retry_count); ctrlr->enable_aborts = 0; TUNABLE_INT_FETCH("hw.nvme.enable_aborts", &ctrlr->enable_aborts); nvme_ctrlr_setup_interrupts(ctrlr); ctrlr->max_xfer_size = NVME_MAX_XFER_SIZE; if (nvme_ctrlr_construct_admin_qpair(ctrlr) != 0) return (ENXIO); ctrlr->taskqueue = taskqueue_create("nvme_taskq", M_WAITOK, taskqueue_thread_enqueue, &ctrlr->taskqueue); taskqueue_start_threads(&ctrlr->taskqueue, 1, PI_DISK, "nvme taskq"); ctrlr->is_resetting = 0; ctrlr->is_initialized = 0; ctrlr->notification_sent = 0; TASK_INIT(&ctrlr->reset_task, 0, nvme_ctrlr_reset_task, ctrlr); TASK_INIT(&ctrlr->fail_req_task, 0, nvme_ctrlr_fail_req_task, ctrlr); STAILQ_INIT(&ctrlr->fail_req); ctrlr->is_failed = FALSE; make_dev_args_init(&md_args); md_args.mda_devsw = &nvme_ctrlr_cdevsw; md_args.mda_uid = UID_ROOT; md_args.mda_gid = GID_WHEEL; md_args.mda_mode = 0600; md_args.mda_unit = device_get_unit(dev); md_args.mda_si_drv1 = (void *)ctrlr; status = make_dev_s(&md_args, &ctrlr->cdev, "nvme%d", device_get_unit(dev)); if (status != 0) return (ENXIO); return (0); } void nvme_ctrlr_destruct(struct nvme_controller *ctrlr, device_t dev) { int i; if (ctrlr->resource == NULL) goto nores; nvme_notify_fail_consumers(ctrlr); for (i = 0; i < NVME_MAX_NAMESPACES; i++) nvme_ns_destruct(&ctrlr->ns[i]); if (ctrlr->cdev) destroy_dev(ctrlr->cdev); for (i = 0; i < ctrlr->num_io_queues; i++) { nvme_ctrlr_destroy_qpair(ctrlr, &ctrlr->ioq[i]); 
nvme_io_qpair_destroy(&ctrlr->ioq[i]); } free(ctrlr->ioq, M_NVME); nvme_admin_qpair_destroy(&ctrlr->adminq); /* * Notify the controller of a shutdown, even though this is due to * a driver unload, not a system shutdown (this path is not invoked * during shutdown). This ensures the controller receives a * shutdown notification in case the system is shutdown before * reloading the driver. */ nvme_ctrlr_shutdown(ctrlr); nvme_ctrlr_disable(ctrlr); if (ctrlr->taskqueue) taskqueue_free(ctrlr->taskqueue); if (ctrlr->tag) bus_teardown_intr(ctrlr->dev, ctrlr->res, ctrlr->tag); if (ctrlr->res) bus_release_resource(ctrlr->dev, SYS_RES_IRQ, rman_get_rid(ctrlr->res), ctrlr->res); if (ctrlr->msix_enabled) pci_release_msi(dev); if (ctrlr->bar4_resource != NULL) { bus_release_resource(dev, SYS_RES_MEMORY, ctrlr->bar4_resource_id, ctrlr->bar4_resource); } bus_release_resource(dev, SYS_RES_MEMORY, ctrlr->resource_id, ctrlr->resource); nores: mtx_destroy(&ctrlr->lock); } void nvme_ctrlr_shutdown(struct nvme_controller *ctrlr) { uint32_t cc; uint32_t csts; int ticks = 0; cc = nvme_mmio_read_4(ctrlr, cc); cc &= ~(NVME_CC_REG_SHN_MASK << NVME_CC_REG_SHN_SHIFT); cc |= NVME_SHN_NORMAL << NVME_CC_REG_SHN_SHIFT; nvme_mmio_write_4(ctrlr, cc, cc); csts = nvme_mmio_read_4(ctrlr, csts); while ((NVME_CSTS_GET_SHST(csts) != NVME_SHST_COMPLETE) && (ticks++ < 5*hz)) { pause("nvme shn", 1); csts = nvme_mmio_read_4(ctrlr, csts); } if (NVME_CSTS_GET_SHST(csts) != NVME_SHST_COMPLETE) nvme_printf(ctrlr, "did not complete shutdown within 5 seconds " "of notification\n"); } void nvme_ctrlr_submit_admin_request(struct nvme_controller *ctrlr, struct nvme_request *req) { nvme_qpair_submit_request(&ctrlr->adminq, req); } void nvme_ctrlr_submit_io_request(struct nvme_controller *ctrlr, struct nvme_request *req) { struct nvme_qpair *qpair; qpair = &ctrlr->ioq[curcpu / ctrlr->num_cpus_per_ioq]; nvme_qpair_submit_request(qpair, req); } device_t nvme_ctrlr_get_device(struct nvme_controller *ctrlr) { return (ctrlr->dev); } const struct nvme_controller_data * nvme_ctrlr_get_data(struct nvme_controller *ctrlr) { return (&ctrlr->cdata); } Index: head/sys/dev/nvme/nvme_ctrlr_cmd.c =================================================================== --- head/sys/dev/nvme/nvme_ctrlr_cmd.c (revision 338181) +++ head/sys/dev/nvme/nvme_ctrlr_cmd.c (revision 338182) @@ -1,327 +1,327 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (C) 2012-2013 Intel Corporation * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "nvme_private.h" void nvme_ctrlr_cmd_identify_controller(struct nvme_controller *ctrlr, void *payload, nvme_cb_fn_t cb_fn, void *cb_arg) { struct nvme_request *req; struct nvme_command *cmd; req = nvme_allocate_request_vaddr(payload, sizeof(struct nvme_controller_data), cb_fn, cb_arg); cmd = &req->cmd; - cmd->opc_fuse = NVME_CMD_SET_OPC(NVME_OPC_IDENTIFY); + cmd->opc = NVME_OPC_IDENTIFY; /* * TODO: create an identify command data structure, which * includes this CNS bit in cdw10. */ cmd->cdw10 = htole32(1); nvme_ctrlr_submit_admin_request(ctrlr, req); } void nvme_ctrlr_cmd_identify_namespace(struct nvme_controller *ctrlr, uint32_t nsid, void *payload, nvme_cb_fn_t cb_fn, void *cb_arg) { struct nvme_request *req; struct nvme_command *cmd; req = nvme_allocate_request_vaddr(payload, sizeof(struct nvme_namespace_data), cb_fn, cb_arg); cmd = &req->cmd; - cmd->opc_fuse = NVME_CMD_SET_OPC(NVME_OPC_IDENTIFY); + cmd->opc = NVME_OPC_IDENTIFY; /* * TODO: create an identify command data structure */ cmd->nsid = htole32(nsid); nvme_ctrlr_submit_admin_request(ctrlr, req); } void nvme_ctrlr_cmd_create_io_cq(struct nvme_controller *ctrlr, struct nvme_qpair *io_que, uint16_t vector, nvme_cb_fn_t cb_fn, void *cb_arg) { struct nvme_request *req; struct nvme_command *cmd; req = nvme_allocate_request_null(cb_fn, cb_arg); cmd = &req->cmd; - cmd->opc_fuse = NVME_CMD_SET_OPC(NVME_OPC_CREATE_IO_CQ); + cmd->opc = NVME_OPC_CREATE_IO_CQ; /* * TODO: create a create io completion queue command data * structure. */ cmd->cdw10 = htole32(((io_que->num_entries-1) << 16) | io_que->id); /* 0x3 = interrupts enabled | physically contiguous */ cmd->cdw11 = htole32((vector << 16) | 0x3); cmd->prp1 = htole64(io_que->cpl_bus_addr); nvme_ctrlr_submit_admin_request(ctrlr, req); } void nvme_ctrlr_cmd_create_io_sq(struct nvme_controller *ctrlr, struct nvme_qpair *io_que, nvme_cb_fn_t cb_fn, void *cb_arg) { struct nvme_request *req; struct nvme_command *cmd; req = nvme_allocate_request_null(cb_fn, cb_arg); cmd = &req->cmd; - cmd->opc_fuse = NVME_CMD_SET_OPC(NVME_OPC_CREATE_IO_SQ); + cmd->opc = NVME_OPC_CREATE_IO_SQ; /* * TODO: create a create io submission queue command data * structure. */ cmd->cdw10 = htole32(((io_que->num_entries-1) << 16) | io_que->id); /* 0x1 = physically contiguous */ cmd->cdw11 = htole32((io_que->id << 16) | 0x1); cmd->prp1 = htole64(io_que->cmd_bus_addr); nvme_ctrlr_submit_admin_request(ctrlr, req); } void nvme_ctrlr_cmd_delete_io_cq(struct nvme_controller *ctrlr, struct nvme_qpair *io_que, nvme_cb_fn_t cb_fn, void *cb_arg) { struct nvme_request *req; struct nvme_command *cmd; req = nvme_allocate_request_null(cb_fn, cb_arg); cmd = &req->cmd; - cmd->opc_fuse = NVME_CMD_SET_OPC(NVME_OPC_DELETE_IO_CQ); + cmd->opc = NVME_OPC_DELETE_IO_CQ; /* * TODO: create a delete io completion queue command data * structure. 
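 * For reference (editor's note), deletion only carries the target
 * queue id in cdw10, while the creation commands above also pack
 * (num_entries - 1) into the high word: e.g. a 256-entry CQ with id 3
 * on vector 3 is created with cdw10 == 0x00ff0003 and
 * cdw11 == 0x00030003.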
*/ cmd->cdw10 = htole32(io_que->id); nvme_ctrlr_submit_admin_request(ctrlr, req); } void nvme_ctrlr_cmd_delete_io_sq(struct nvme_controller *ctrlr, struct nvme_qpair *io_que, nvme_cb_fn_t cb_fn, void *cb_arg) { struct nvme_request *req; struct nvme_command *cmd; req = nvme_allocate_request_null(cb_fn, cb_arg); cmd = &req->cmd; - cmd->opc_fuse = NVME_CMD_SET_OPC(NVME_OPC_DELETE_IO_SQ); + cmd->opc = NVME_OPC_DELETE_IO_SQ; /* * TODO: create a delete io submission queue command data * structure. */ cmd->cdw10 = htole32(io_que->id); nvme_ctrlr_submit_admin_request(ctrlr, req); } void nvme_ctrlr_cmd_set_feature(struct nvme_controller *ctrlr, uint8_t feature, uint32_t cdw11, void *payload, uint32_t payload_size, nvme_cb_fn_t cb_fn, void *cb_arg) { struct nvme_request *req; struct nvme_command *cmd; req = nvme_allocate_request_null(cb_fn, cb_arg); cmd = &req->cmd; - cmd->opc_fuse = NVME_CMD_SET_OPC(NVME_OPC_SET_FEATURES); + cmd->opc = NVME_OPC_SET_FEATURES; cmd->cdw10 = htole32(feature); cmd->cdw11 = htole32(cdw11); nvme_ctrlr_submit_admin_request(ctrlr, req); } void nvme_ctrlr_cmd_get_feature(struct nvme_controller *ctrlr, uint8_t feature, uint32_t cdw11, void *payload, uint32_t payload_size, nvme_cb_fn_t cb_fn, void *cb_arg) { struct nvme_request *req; struct nvme_command *cmd; req = nvme_allocate_request_null(cb_fn, cb_arg); cmd = &req->cmd; - cmd->opc_fuse = NVME_CMD_SET_OPC(NVME_OPC_GET_FEATURES); + cmd->opc = NVME_OPC_GET_FEATURES; cmd->cdw10 = htole32(feature); cmd->cdw11 = htole32(cdw11); nvme_ctrlr_submit_admin_request(ctrlr, req); } void nvme_ctrlr_cmd_set_num_queues(struct nvme_controller *ctrlr, uint32_t num_queues, nvme_cb_fn_t cb_fn, void *cb_arg) { uint32_t cdw11; cdw11 = ((num_queues - 1) << 16) | (num_queues - 1); nvme_ctrlr_cmd_set_feature(ctrlr, NVME_FEAT_NUMBER_OF_QUEUES, cdw11, NULL, 0, cb_fn, cb_arg); } void nvme_ctrlr_cmd_set_async_event_config(struct nvme_controller *ctrlr, uint32_t state, nvme_cb_fn_t cb_fn, void *cb_arg) { uint32_t cdw11; cdw11 = state; nvme_ctrlr_cmd_set_feature(ctrlr, NVME_FEAT_ASYNC_EVENT_CONFIGURATION, cdw11, NULL, 0, cb_fn, cb_arg); } void nvme_ctrlr_cmd_set_interrupt_coalescing(struct nvme_controller *ctrlr, uint32_t microseconds, uint32_t threshold, nvme_cb_fn_t cb_fn, void *cb_arg) { uint32_t cdw11; if ((microseconds/100) >= 0x100) { nvme_printf(ctrlr, "invalid coal time %d, disabling\n", microseconds); microseconds = 0; threshold = 0; } if (threshold >= 0x100) { nvme_printf(ctrlr, "invalid threshold %d, disabling\n", threshold); threshold = 0; microseconds = 0; } cdw11 = ((microseconds/100) << 8) | threshold; nvme_ctrlr_cmd_set_feature(ctrlr, NVME_FEAT_INTERRUPT_COALESCING, cdw11, NULL, 0, cb_fn, cb_arg); } void nvme_ctrlr_cmd_get_log_page(struct nvme_controller *ctrlr, uint8_t log_page, uint32_t nsid, void *payload, uint32_t payload_size, nvme_cb_fn_t cb_fn, void *cb_arg) { struct nvme_request *req; struct nvme_command *cmd; req = nvme_allocate_request_vaddr(payload, payload_size, cb_fn, cb_arg); cmd = &req->cmd; - cmd->opc_fuse = NVME_CMD_SET_OPC(NVME_OPC_GET_LOG_PAGE); + cmd->opc = NVME_OPC_GET_LOG_PAGE; cmd->nsid = htole32(nsid); cmd->cdw10 = ((payload_size/sizeof(uint32_t)) - 1) << 16; cmd->cdw10 |= log_page; cmd->cdw10 = htole32(cmd->cdw10); nvme_ctrlr_submit_admin_request(ctrlr, req); } void nvme_ctrlr_cmd_get_error_page(struct nvme_controller *ctrlr, struct nvme_error_information_entry *payload, uint32_t num_entries, nvme_cb_fn_t cb_fn, void *cb_arg) { KASSERT(num_entries > 0, ("%s called with num_entries==0\n", __func__)); /* 
Controller's error log page entries is 0-based. */ KASSERT(num_entries <= (ctrlr->cdata.elpe + 1), ("%s called with num_entries=%d but (elpe+1)=%d\n", __func__, num_entries, ctrlr->cdata.elpe + 1)); if (num_entries > (ctrlr->cdata.elpe + 1)) num_entries = ctrlr->cdata.elpe + 1; nvme_ctrlr_cmd_get_log_page(ctrlr, NVME_LOG_ERROR, NVME_GLOBAL_NAMESPACE_TAG, payload, sizeof(*payload) * num_entries, cb_fn, cb_arg); } void nvme_ctrlr_cmd_get_health_information_page(struct nvme_controller *ctrlr, uint32_t nsid, struct nvme_health_information_page *payload, nvme_cb_fn_t cb_fn, void *cb_arg) { nvme_ctrlr_cmd_get_log_page(ctrlr, NVME_LOG_HEALTH_INFORMATION, nsid, payload, sizeof(*payload), cb_fn, cb_arg); } void nvme_ctrlr_cmd_get_firmware_page(struct nvme_controller *ctrlr, struct nvme_firmware_page *payload, nvme_cb_fn_t cb_fn, void *cb_arg) { nvme_ctrlr_cmd_get_log_page(ctrlr, NVME_LOG_FIRMWARE_SLOT, NVME_GLOBAL_NAMESPACE_TAG, payload, sizeof(*payload), cb_fn, cb_arg); } void nvme_ctrlr_cmd_abort(struct nvme_controller *ctrlr, uint16_t cid, uint16_t sqid, nvme_cb_fn_t cb_fn, void *cb_arg) { struct nvme_request *req; struct nvme_command *cmd; req = nvme_allocate_request_null(cb_fn, cb_arg); cmd = &req->cmd; - cmd->opc_fuse = NVME_CMD_SET_OPC(NVME_OPC_ABORT); + cmd->opc = NVME_OPC_ABORT; cmd->cdw10 = htole32((cid << 16) | sqid); nvme_ctrlr_submit_admin_request(ctrlr, req); } Index: head/sys/dev/nvme/nvme_ns_cmd.c =================================================================== --- head/sys/dev/nvme/nvme_ns_cmd.c (revision 338181) +++ head/sys/dev/nvme/nvme_ns_cmd.c (revision 338182) @@ -1,198 +1,198 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (C) 2012 Intel Corporation * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include "nvme_private.h" int nvme_ns_cmd_read(struct nvme_namespace *ns, void *payload, uint64_t lba, uint32_t lba_count, nvme_cb_fn_t cb_fn, void *cb_arg) { struct nvme_request *req; req = nvme_allocate_request_vaddr(payload, lba_count*nvme_ns_get_sector_size(ns), cb_fn, cb_arg); if (req == NULL) return (ENOMEM); nvme_ns_read_cmd(&req->cmd, ns->id, lba, lba_count); nvme_ctrlr_submit_io_request(ns->ctrlr, req); return (0); } int nvme_ns_cmd_read_bio(struct nvme_namespace *ns, struct bio *bp, nvme_cb_fn_t cb_fn, void *cb_arg) { struct nvme_request *req; uint64_t lba; uint64_t lba_count; req = nvme_allocate_request_bio(bp, cb_fn, cb_arg); if (req == NULL) return (ENOMEM); lba = bp->bio_offset / nvme_ns_get_sector_size(ns); lba_count = bp->bio_bcount / nvme_ns_get_sector_size(ns); nvme_ns_read_cmd(&req->cmd, ns->id, lba, lba_count); nvme_ctrlr_submit_io_request(ns->ctrlr, req); return (0); } int nvme_ns_cmd_write(struct nvme_namespace *ns, void *payload, uint64_t lba, uint32_t lba_count, nvme_cb_fn_t cb_fn, void *cb_arg) { struct nvme_request *req; req = nvme_allocate_request_vaddr(payload, lba_count*nvme_ns_get_sector_size(ns), cb_fn, cb_arg); if (req == NULL) return (ENOMEM); nvme_ns_write_cmd(&req->cmd, ns->id, lba, lba_count); nvme_ctrlr_submit_io_request(ns->ctrlr, req); return (0); } int nvme_ns_cmd_write_bio(struct nvme_namespace *ns, struct bio *bp, nvme_cb_fn_t cb_fn, void *cb_arg) { struct nvme_request *req; uint64_t lba; uint64_t lba_count; req = nvme_allocate_request_bio(bp, cb_fn, cb_arg); if (req == NULL) return (ENOMEM); lba = bp->bio_offset / nvme_ns_get_sector_size(ns); lba_count = bp->bio_bcount / nvme_ns_get_sector_size(ns); nvme_ns_write_cmd(&req->cmd, ns->id, lba, lba_count); nvme_ctrlr_submit_io_request(ns->ctrlr, req); return (0); } int nvme_ns_cmd_deallocate(struct nvme_namespace *ns, void *payload, uint8_t num_ranges, nvme_cb_fn_t cb_fn, void *cb_arg) { struct nvme_request *req; struct nvme_command *cmd; req = nvme_allocate_request_vaddr(payload, num_ranges * sizeof(struct nvme_dsm_range), cb_fn, cb_arg); if (req == NULL) return (ENOMEM); cmd = &req->cmd; - cmd->opc_fuse = NVME_CMD_SET_OPC(NVME_OPC_DATASET_MANAGEMENT); + cmd->opc = NVME_OPC_DATASET_MANAGEMENT; cmd->nsid = htole32(ns->id); /* TODO: create a delete command data structure */ cmd->cdw10 = htole32(num_ranges - 1); cmd->cdw11 = htole32(NVME_DSM_ATTR_DEALLOCATE); nvme_ctrlr_submit_io_request(ns->ctrlr, req); return (0); } int nvme_ns_cmd_flush(struct nvme_namespace *ns, nvme_cb_fn_t cb_fn, void *cb_arg) { struct nvme_request *req; req = nvme_allocate_request_null(cb_fn, cb_arg); if (req == NULL) return (ENOMEM); nvme_ns_flush_cmd(&req->cmd, ns->id); nvme_ctrlr_submit_io_request(ns->ctrlr, req); return (0); } /* Timeout = 1 sec */ #define NVD_DUMP_TIMEOUT 200000 int nvme_ns_dump(struct nvme_namespace *ns, void *virt, off_t offset, size_t len) { struct nvme_completion_poll_status status; struct nvme_request *req; struct nvme_command *cmd; uint64_t lba, lba_count; int i; status.done = FALSE; req = nvme_allocate_request_vaddr(virt, len, nvme_completion_poll_cb, &status); if (req == NULL) return (ENOMEM); cmd = &req->cmd; if (len > 0) { lba = offset / nvme_ns_get_sector_size(ns); lba_count = len / nvme_ns_get_sector_size(ns); nvme_ns_write_cmd(cmd, ns->id, lba, lba_count); } else nvme_ns_flush_cmd(cmd, ns->id); nvme_ctrlr_submit_io_request(ns->ctrlr, req); if (req->qpair == NULL) return (ENXIO); i = 0; while ((i++ < NVD_DUMP_TIMEOUT) && (status.done == FALSE)) { DELAY(5); 
nvme_qpair_process_completions(req->qpair); } if (status.done == FALSE) return (ETIMEDOUT); return (0); } Index: head/sys/dev/nvme/nvme_qpair.c =================================================================== --- head/sys/dev/nvme/nvme_qpair.c (revision 338181) +++ head/sys/dev/nvme/nvme_qpair.c (revision 338182) @@ -1,1160 +1,1156 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (C) 2012-2014 Intel Corporation * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include "nvme_private.h" static void _nvme_qpair_submit_request(struct nvme_qpair *qpair, struct nvme_request *req); static void nvme_qpair_destroy(struct nvme_qpair *qpair); struct nvme_opcode_string { uint16_t opc; const char * str; }; static struct nvme_opcode_string admin_opcode[] = { { NVME_OPC_DELETE_IO_SQ, "DELETE IO SQ" }, { NVME_OPC_CREATE_IO_SQ, "CREATE IO SQ" }, { NVME_OPC_GET_LOG_PAGE, "GET LOG PAGE" }, { NVME_OPC_DELETE_IO_CQ, "DELETE IO CQ" }, { NVME_OPC_CREATE_IO_CQ, "CREATE IO CQ" }, { NVME_OPC_IDENTIFY, "IDENTIFY" }, { NVME_OPC_ABORT, "ABORT" }, { NVME_OPC_SET_FEATURES, "SET FEATURES" }, { NVME_OPC_GET_FEATURES, "GET FEATURES" }, { NVME_OPC_ASYNC_EVENT_REQUEST, "ASYNC EVENT REQUEST" }, { NVME_OPC_FIRMWARE_ACTIVATE, "FIRMWARE ACTIVATE" }, { NVME_OPC_FIRMWARE_IMAGE_DOWNLOAD, "FIRMWARE IMAGE DOWNLOAD" }, { NVME_OPC_DEVICE_SELF_TEST, "DEVICE SELF-TEST" }, { NVME_OPC_NAMESPACE_ATTACHMENT, "NAMESPACE ATTACHMENT" }, { NVME_OPC_KEEP_ALIVE, "KEEP ALIVE" }, { NVME_OPC_DIRECTIVE_SEND, "DIRECTIVE SEND" }, { NVME_OPC_DIRECTIVE_RECEIVE, "DIRECTIVE RECEIVE" }, { NVME_OPC_VIRTUALIZATION_MANAGEMENT, "VIRTUALIZATION MANAGEMENT" }, { NVME_OPC_NVME_MI_SEND, "NVME-MI SEND" }, { NVME_OPC_NVME_MI_RECEIVE, "NVME-MI RECEIVE" }, { NVME_OPC_DOORBELL_BUFFER_CONFIG, "DOORBELL BUFFER CONFIG" }, { NVME_OPC_FORMAT_NVM, "FORMAT NVM" }, { NVME_OPC_SECURITY_SEND, "SECURITY SEND" }, { NVME_OPC_SECURITY_RECEIVE, "SECURITY RECEIVE" }, { NVME_OPC_SANITIZE, "SANITIZE" }, { 0xFFFF, "ADMIN COMMAND" } }; static struct nvme_opcode_string io_opcode[] = { { NVME_OPC_FLUSH, "FLUSH" }, { NVME_OPC_WRITE, "WRITE" }, { NVME_OPC_READ, "READ" }, { NVME_OPC_WRITE_UNCORRECTABLE, "WRITE UNCORRECTABLE" }, { NVME_OPC_COMPARE, "COMPARE" }, { NVME_OPC_WRITE_ZEROES, "WRITE ZEROES" }, { 
NVME_OPC_DATASET_MANAGEMENT, "DATASET MANAGEMENT" }, { NVME_OPC_RESERVATION_REGISTER, "RESERVATION REGISTER" }, { NVME_OPC_RESERVATION_REPORT, "RESERVATION REPORT" }, { NVME_OPC_RESERVATION_ACQUIRE, "RESERVATION ACQUIRE" }, { NVME_OPC_RESERVATION_RELEASE, "RESERVATION RELEASE" }, { 0xFFFF, "IO COMMAND" } }; static const char * get_admin_opcode_string(uint16_t opc) { struct nvme_opcode_string *entry; entry = admin_opcode; while (entry->opc != 0xFFFF) { if (entry->opc == opc) return (entry->str); entry++; } return (entry->str); } static const char * get_io_opcode_string(uint16_t opc) { struct nvme_opcode_string *entry; entry = io_opcode; while (entry->opc != 0xFFFF) { if (entry->opc == opc) return (entry->str); entry++; } return (entry->str); } static void nvme_admin_qpair_print_command(struct nvme_qpair *qpair, struct nvme_command *cmd) { - uint16_t opc; - opc = le16toh(cmd->opc_fuse) & NVME_CMD_OPC_MASK; nvme_printf(qpair->ctrlr, "%s (%02x) sqid:%d cid:%d nsid:%x " "cdw10:%08x cdw11:%08x\n", - get_admin_opcode_string(opc), opc, qpair->id, cmd->cid, + get_admin_opcode_string(cmd->opc), cmd->opc, qpair->id, cmd->cid, le32toh(cmd->nsid), le32toh(cmd->cdw10), le32toh(cmd->cdw11)); } static void nvme_io_qpair_print_command(struct nvme_qpair *qpair, struct nvme_command *cmd) { - uint16_t opc; - opc = le16toh(cmd->opc_fuse) & NVME_CMD_OPC_MASK; - switch (opc) { + switch (cmd->opc) { case NVME_OPC_WRITE: case NVME_OPC_READ: case NVME_OPC_WRITE_UNCORRECTABLE: case NVME_OPC_COMPARE: case NVME_OPC_WRITE_ZEROES: nvme_printf(qpair->ctrlr, "%s sqid:%d cid:%d nsid:%d " "lba:%llu len:%d\n", - get_io_opcode_string(opc), qpair->id, cmd->cid, le32toh(cmd->nsid), + get_io_opcode_string(cmd->opc), qpair->id, cmd->cid, le32toh(cmd->nsid), ((unsigned long long)le32toh(cmd->cdw11) << 32) + le32toh(cmd->cdw10), (le32toh(cmd->cdw12) & 0xFFFF) + 1); break; case NVME_OPC_FLUSH: case NVME_OPC_DATASET_MANAGEMENT: case NVME_OPC_RESERVATION_REGISTER: case NVME_OPC_RESERVATION_REPORT: case NVME_OPC_RESERVATION_ACQUIRE: case NVME_OPC_RESERVATION_RELEASE: nvme_printf(qpair->ctrlr, "%s sqid:%d cid:%d nsid:%d\n", - get_io_opcode_string(opc), qpair->id, cmd->cid, le32toh(cmd->nsid)); + get_io_opcode_string(cmd->opc), qpair->id, cmd->cid, le32toh(cmd->nsid)); break; default: nvme_printf(qpair->ctrlr, "%s (%02x) sqid:%d cid:%d nsid:%d\n", - get_io_opcode_string(opc), opc, qpair->id, + get_io_opcode_string(cmd->opc), cmd->opc, qpair->id, cmd->cid, le32toh(cmd->nsid)); break; } } static void nvme_qpair_print_command(struct nvme_qpair *qpair, struct nvme_command *cmd) { if (qpair->id == 0) nvme_admin_qpair_print_command(qpair, cmd); else nvme_io_qpair_print_command(qpair, cmd); } struct nvme_status_string { uint16_t sc; const char * str; }; static struct nvme_status_string generic_status[] = { { NVME_SC_SUCCESS, "SUCCESS" }, { NVME_SC_INVALID_OPCODE, "INVALID OPCODE" }, { NVME_SC_INVALID_FIELD, "INVALID_FIELD" }, { NVME_SC_COMMAND_ID_CONFLICT, "COMMAND ID CONFLICT" }, { NVME_SC_DATA_TRANSFER_ERROR, "DATA TRANSFER ERROR" }, { NVME_SC_ABORTED_POWER_LOSS, "ABORTED - POWER LOSS" }, { NVME_SC_INTERNAL_DEVICE_ERROR, "INTERNAL DEVICE ERROR" }, { NVME_SC_ABORTED_BY_REQUEST, "ABORTED - BY REQUEST" }, { NVME_SC_ABORTED_SQ_DELETION, "ABORTED - SQ DELETION" }, { NVME_SC_ABORTED_FAILED_FUSED, "ABORTED - FAILED FUSED" }, { NVME_SC_ABORTED_MISSING_FUSED, "ABORTED - MISSING FUSED" }, { NVME_SC_INVALID_NAMESPACE_OR_FORMAT, "INVALID NAMESPACE OR FORMAT" }, { NVME_SC_COMMAND_SEQUENCE_ERROR, "COMMAND SEQUENCE ERROR" }, { 
NVME_SC_INVALID_SGL_SEGMENT_DESCR, "INVALID SGL SEGMENT DESCRIPTOR" }, { NVME_SC_INVALID_NUMBER_OF_SGL_DESCR, "INVALID NUMBER OF SGL DESCRIPTORS" }, { NVME_SC_DATA_SGL_LENGTH_INVALID, "DATA SGL LENGTH INVALID" }, { NVME_SC_METADATA_SGL_LENGTH_INVALID, "METADATA SGL LENGTH INVALID" }, { NVME_SC_SGL_DESCRIPTOR_TYPE_INVALID, "SGL DESCRIPTOR TYPE INVALID" }, { NVME_SC_INVALID_USE_OF_CMB, "INVALID USE OF CONTROLLER MEMORY BUFFER" }, { NVME_SC_PRP_OFFET_INVALID, "PRP OFFET INVALID" }, { NVME_SC_ATOMIC_WRITE_UNIT_EXCEEDED, "ATOMIC WRITE UNIT EXCEEDED" }, { NVME_SC_OPERATION_DENIED, "OPERATION DENIED" }, { NVME_SC_SGL_OFFSET_INVALID, "SGL OFFSET INVALID" }, { NVME_SC_HOST_ID_INCONSISTENT_FORMAT, "HOST IDENTIFIER INCONSISTENT FORMAT" }, { NVME_SC_KEEP_ALIVE_TIMEOUT_EXPIRED, "KEEP ALIVE TIMEOUT EXPIRED" }, { NVME_SC_KEEP_ALIVE_TIMEOUT_INVALID, "KEEP ALIVE TIMEOUT INVALID" }, { NVME_SC_ABORTED_DUE_TO_PREEMPT, "COMMAND ABORTED DUE TO PREEMPT AND ABORT" }, { NVME_SC_SANITIZE_FAILED, "SANITIZE FAILED" }, { NVME_SC_SANITIZE_IN_PROGRESS, "SANITIZE IN PROGRESS" }, { NVME_SC_SGL_DATA_BLOCK_GRAN_INVALID, "SGL_DATA_BLOCK_GRANULARITY_INVALID" }, { NVME_SC_NOT_SUPPORTED_IN_CMB, "COMMAND NOT SUPPORTED FOR QUEUE IN CMB" }, { NVME_SC_LBA_OUT_OF_RANGE, "LBA OUT OF RANGE" }, { NVME_SC_CAPACITY_EXCEEDED, "CAPACITY EXCEEDED" }, { NVME_SC_NAMESPACE_NOT_READY, "NAMESPACE NOT READY" }, { NVME_SC_RESERVATION_CONFLICT, "RESERVATION CONFLICT" }, { NVME_SC_FORMAT_IN_PROGRESS, "FORMAT IN PROGRESS" }, { 0xFFFF, "GENERIC" } }; static struct nvme_status_string command_specific_status[] = { { NVME_SC_COMPLETION_QUEUE_INVALID, "INVALID COMPLETION QUEUE" }, { NVME_SC_INVALID_QUEUE_IDENTIFIER, "INVALID QUEUE IDENTIFIER" }, { NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED, "MAX QUEUE SIZE EXCEEDED" }, { NVME_SC_ABORT_COMMAND_LIMIT_EXCEEDED, "ABORT CMD LIMIT EXCEEDED" }, { NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED, "ASYNC LIMIT EXCEEDED" }, { NVME_SC_INVALID_FIRMWARE_SLOT, "INVALID FIRMWARE SLOT" }, { NVME_SC_INVALID_FIRMWARE_IMAGE, "INVALID FIRMWARE IMAGE" }, { NVME_SC_INVALID_INTERRUPT_VECTOR, "INVALID INTERRUPT VECTOR" }, { NVME_SC_INVALID_LOG_PAGE, "INVALID LOG PAGE" }, { NVME_SC_INVALID_FORMAT, "INVALID FORMAT" }, { NVME_SC_FIRMWARE_REQUIRES_RESET, "FIRMWARE REQUIRES RESET" }, { NVME_SC_INVALID_QUEUE_DELETION, "INVALID QUEUE DELETION" }, { NVME_SC_FEATURE_NOT_SAVEABLE, "FEATURE IDENTIFIER NOT SAVEABLE" }, { NVME_SC_FEATURE_NOT_CHANGEABLE, "FEATURE NOT CHANGEABLE" }, { NVME_SC_FEATURE_NOT_NS_SPECIFIC, "FEATURE NOT NAMESPACE SPECIFIC" }, { NVME_SC_FW_ACT_REQUIRES_NVMS_RESET, "FIRMWARE ACTIVATION REQUIRES NVM SUBSYSTEM RESET" }, { NVME_SC_FW_ACT_REQUIRES_RESET, "FIRMWARE ACTIVATION REQUIRES RESET" }, { NVME_SC_FW_ACT_REQUIRES_TIME, "FIRMWARE ACTIVATION REQUIRES MAXIMUM TIME VIOLATION" }, { NVME_SC_FW_ACT_PROHIBITED, "FIRMWARE ACTIVATION PROHIBITED" }, { NVME_SC_OVERLAPPING_RANGE, "OVERLAPPING RANGE" }, { NVME_SC_NS_INSUFFICIENT_CAPACITY, "NAMESPACE INSUFFICIENT CAPACITY" }, { NVME_SC_NS_ID_UNAVAILABLE, "NAMESPACE IDENTIFIER UNAVAILABLE" }, { NVME_SC_NS_ALREADY_ATTACHED, "NAMESPACE ALREADY ATTACHED" }, { NVME_SC_NS_IS_PRIVATE, "NAMESPACE IS PRIVATE" }, { NVME_SC_NS_NOT_ATTACHED, "NS NOT ATTACHED" }, { NVME_SC_THIN_PROV_NOT_SUPPORTED, "THIN PROVISIONING NOT SUPPORTED" }, { NVME_SC_CTRLR_LIST_INVALID, "CONTROLLER LIST INVALID" }, { NVME_SC_SELT_TEST_IN_PROGRESS, "DEVICE SELT-TEST IN PROGRESS" }, { NVME_SC_BOOT_PART_WRITE_PROHIB, "BOOT PARTITION WRITE PROHIBITED" }, { NVME_SC_INVALID_CTRLR_ID, "INVALID CONTROLLER IDENTIFIER" }, { 
NVME_SC_INVALID_SEC_CTRLR_STATE, "INVALID SECONDARY CONTROLLER STATE" }, { NVME_SC_INVALID_NUM_OF_CTRLR_RESRC, "INVALID NUMBER OF CONTROLLER RESOURCES" }, { NVME_SC_INVALID_RESOURCE_ID, "INVALID RESOURCE IDENTIFIER" }, { NVME_SC_CONFLICTING_ATTRIBUTES, "CONFLICTING ATTRIBUTES" }, { NVME_SC_INVALID_PROTECTION_INFO, "INVALID PROTECTION INFO" }, { NVME_SC_ATTEMPTED_WRITE_TO_RO_PAGE, "WRITE TO RO PAGE" }, { 0xFFFF, "COMMAND SPECIFIC" } }; static struct nvme_status_string media_error_status[] = { { NVME_SC_WRITE_FAULTS, "WRITE FAULTS" }, { NVME_SC_UNRECOVERED_READ_ERROR, "UNRECOVERED READ ERROR" }, { NVME_SC_GUARD_CHECK_ERROR, "GUARD CHECK ERROR" }, { NVME_SC_APPLICATION_TAG_CHECK_ERROR, "APPLICATION TAG CHECK ERROR" }, { NVME_SC_REFERENCE_TAG_CHECK_ERROR, "REFERENCE TAG CHECK ERROR" }, { NVME_SC_COMPARE_FAILURE, "COMPARE FAILURE" }, { NVME_SC_ACCESS_DENIED, "ACCESS DENIED" }, { NVME_SC_DEALLOCATED_OR_UNWRITTEN, "DEALLOCATED OR UNWRITTEN LOGICAL BLOCK" }, { 0xFFFF, "MEDIA ERROR" } }; static const char * get_status_string(uint16_t sct, uint16_t sc) { struct nvme_status_string *entry; switch (sct) { case NVME_SCT_GENERIC: entry = generic_status; break; case NVME_SCT_COMMAND_SPECIFIC: entry = command_specific_status; break; case NVME_SCT_MEDIA_ERROR: entry = media_error_status; break; case NVME_SCT_VENDOR_SPECIFIC: return ("VENDOR SPECIFIC"); default: return ("RESERVED"); } while (entry->sc != 0xFFFF) { if (entry->sc == sc) return (entry->str); entry++; } return (entry->str); } static void nvme_qpair_print_completion(struct nvme_qpair *qpair, struct nvme_completion *cpl) { uint16_t sct, sc; sct = NVME_STATUS_GET_SCT(cpl->status); sc = NVME_STATUS_GET_SC(cpl->status); nvme_printf(qpair->ctrlr, "%s (%02x/%02x) sqid:%d cid:%d cdw0:%x\n", get_status_string(sct, sc), sct, sc, cpl->sqid, cpl->cid, cpl->cdw0); } static boolean_t nvme_completion_is_retry(const struct nvme_completion *cpl) { uint8_t sct, sc, dnr; sct = NVME_STATUS_GET_SCT(cpl->status); sc = NVME_STATUS_GET_SC(cpl->status); dnr = NVME_STATUS_GET_DNR(cpl->status); /* * TODO: spec is not clear how commands that are aborted due * to TLER will be marked. So for now, it seems * NAMESPACE_NOT_READY is the only case where we should * look at the DNR bit. 
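*
* As a worked example of the decision below, consider a completion
* carrying SCT=GENERIC, SC=NAMESPACE_NOT_READY with DNR set (a sketch
* built from the same masks and shifts that
* nvme_qpair_manual_complete_tracker() uses later in this file):
*
*	uint16_t status = 0;
*	status |= (NVME_SCT_GENERIC & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT;
*	status |= (NVME_SC_NAMESPACE_NOT_READY & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT;
*	status |= (1 & NVME_STATUS_DNR_MASK) << NVME_STATUS_DNR_SHIFT;
*
* NVME_STATUS_GET_DNR(status) is now 1, so the switch below returns 0
* (do not retry); with DNR clear, the same status would return 1 and
* the command would be resubmitted.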
*/ switch (sct) { case NVME_SCT_GENERIC: switch (sc) { case NVME_SC_ABORTED_BY_REQUEST: case NVME_SC_NAMESPACE_NOT_READY: if (dnr) return (0); else return (1); case NVME_SC_INVALID_OPCODE: case NVME_SC_INVALID_FIELD: case NVME_SC_COMMAND_ID_CONFLICT: case NVME_SC_DATA_TRANSFER_ERROR: case NVME_SC_ABORTED_POWER_LOSS: case NVME_SC_INTERNAL_DEVICE_ERROR: case NVME_SC_ABORTED_SQ_DELETION: case NVME_SC_ABORTED_FAILED_FUSED: case NVME_SC_ABORTED_MISSING_FUSED: case NVME_SC_INVALID_NAMESPACE_OR_FORMAT: case NVME_SC_COMMAND_SEQUENCE_ERROR: case NVME_SC_LBA_OUT_OF_RANGE: case NVME_SC_CAPACITY_EXCEEDED: default: return (0); } case NVME_SCT_COMMAND_SPECIFIC: case NVME_SCT_MEDIA_ERROR: case NVME_SCT_VENDOR_SPECIFIC: default: return (0); } } static void nvme_qpair_complete_tracker(struct nvme_qpair *qpair, struct nvme_tracker *tr, struct nvme_completion *cpl, boolean_t print_on_error) { struct nvme_request *req; boolean_t retry, error; req = tr->req; error = nvme_completion_is_error(cpl); retry = error && nvme_completion_is_retry(cpl) && req->retries < nvme_retry_count; if (error && print_on_error) { nvme_qpair_print_command(qpair, &req->cmd); nvme_qpair_print_completion(qpair, cpl); } qpair->act_tr[cpl->cid] = NULL; KASSERT(cpl->cid == req->cmd.cid, ("cpl cid does not match cmd cid\n")); if (req->cb_fn && !retry) req->cb_fn(req->cb_arg, cpl); mtx_lock(&qpair->lock); callout_stop(&tr->timer); if (retry) { req->retries++; nvme_qpair_submit_tracker(qpair, tr); } else { if (req->type != NVME_REQUEST_NULL) { bus_dmamap_sync(qpair->dma_tag_payload, tr->payload_dma_map, BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(qpair->dma_tag_payload, tr->payload_dma_map); } nvme_free_request(req); tr->req = NULL; TAILQ_REMOVE(&qpair->outstanding_tr, tr, tailq); TAILQ_INSERT_HEAD(&qpair->free_tr, tr, tailq); /* * If the controller is in the middle of resetting, don't * try to submit queued requests here - let the reset logic * handle that instead. 
*/ if (!STAILQ_EMPTY(&qpair->queued_req) && !qpair->ctrlr->is_resetting) { req = STAILQ_FIRST(&qpair->queued_req); STAILQ_REMOVE_HEAD(&qpair->queued_req, stailq); _nvme_qpair_submit_request(qpair, req); } } mtx_unlock(&qpair->lock); } static void nvme_qpair_manual_complete_tracker(struct nvme_qpair *qpair, struct nvme_tracker *tr, uint32_t sct, uint32_t sc, uint32_t dnr, boolean_t print_on_error) { struct nvme_completion cpl; memset(&cpl, 0, sizeof(cpl)); cpl.sqid = qpair->id; cpl.cid = tr->cid; cpl.status |= (sct & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT; cpl.status |= (sc & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT; cpl.status |= (dnr & NVME_STATUS_DNR_MASK) << NVME_STATUS_DNR_SHIFT; nvme_qpair_complete_tracker(qpair, tr, &cpl, print_on_error); } void nvme_qpair_manual_complete_request(struct nvme_qpair *qpair, struct nvme_request *req, uint32_t sct, uint32_t sc, boolean_t print_on_error) { struct nvme_completion cpl; boolean_t error; memset(&cpl, 0, sizeof(cpl)); cpl.sqid = qpair->id; cpl.status |= (sct & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT; cpl.status |= (sc & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT; error = nvme_completion_is_error(&cpl); if (error && print_on_error) { nvme_qpair_print_command(qpair, &req->cmd); nvme_qpair_print_completion(qpair, &cpl); } if (req->cb_fn) req->cb_fn(req->cb_arg, &cpl); nvme_free_request(req); } bool nvme_qpair_process_completions(struct nvme_qpair *qpair) { struct nvme_tracker *tr; struct nvme_completion cpl; int done = 0; qpair->num_intr_handler_calls++; if (!qpair->is_enabled) /* * qpair is not enabled, likely because a controller reset * is in progress. Ignore the interrupt - any I/O that was * associated with this interrupt will get retried when the * reset is complete. */ return (false); bus_dmamap_sync(qpair->dma_tag, qpair->queuemem_map, BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); while (1) { cpl = qpair->cpl[qpair->cq_head]; /* Convert to host endian */ nvme_completion_swapbytes(&cpl); if (NVME_STATUS_GET_P(cpl.status) != qpair->phase) break; tr = qpair->act_tr[cpl.cid]; if (tr != NULL) { nvme_qpair_complete_tracker(qpair, tr, &cpl, TRUE); qpair->sq_head = cpl.sqhd; done++; } else { nvme_printf(qpair->ctrlr, "cpl does not map to outstanding cmd\n"); /* nvme_dump_completion expects device endianness */ nvme_dump_completion(&qpair->cpl[qpair->cq_head]); KASSERT(0, ("received completion for unknown cmd\n")); } if (++qpair->cq_head == qpair->num_entries) { qpair->cq_head = 0; qpair->phase = !qpair->phase; } nvme_mmio_write_4(qpair->ctrlr, doorbell[qpair->id].cq_hdbl, qpair->cq_head); } return (done != 0); } static void nvme_qpair_msix_handler(void *arg) { struct nvme_qpair *qpair = arg; nvme_qpair_process_completions(qpair); } int nvme_qpair_construct(struct nvme_qpair *qpair, uint32_t id, uint16_t vector, uint32_t num_entries, uint32_t num_trackers, struct nvme_controller *ctrlr) { struct nvme_tracker *tr; size_t cmdsz, cplsz, prpsz, allocsz, prpmemsz; uint64_t queuemem_phys, prpmem_phys, list_phys; uint8_t *queuemem, *prpmem, *prp_list; int i, err; qpair->id = id; qpair->vector = vector; qpair->num_entries = num_entries; qpair->num_trackers = num_trackers; qpair->ctrlr = ctrlr; if (ctrlr->msix_enabled) { /* * MSI-X vector resource IDs start at 1, so we add one to * the queue's vector to get the corresponding rid to use.
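*
* Concretely (a sketch; under FreeBSD's PCI interrupt conventions,
* rid 0 names the legacy INTx resource, so MSI/MSI-X vectors map to
* rids starting at 1):
*
*	vector 0 (admin qpair)  -> rid 1
*	vector 1 (I/O qpair 1)  -> rid 2
*	vector n                -> rid n + 1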
*/ qpair->rid = vector + 1; qpair->res = bus_alloc_resource_any(ctrlr->dev, SYS_RES_IRQ, &qpair->rid, RF_ACTIVE); bus_setup_intr(ctrlr->dev, qpair->res, INTR_TYPE_MISC | INTR_MPSAFE, NULL, nvme_qpair_msix_handler, qpair, &qpair->tag); } mtx_init(&qpair->lock, "nvme qpair lock", NULL, MTX_DEF); /* Note: NVMe PRP format is restricted to 4-byte alignment. */ err = bus_dma_tag_create(bus_get_dma_tag(ctrlr->dev), 4, PAGE_SIZE, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL, NVME_MAX_XFER_SIZE, (NVME_MAX_XFER_SIZE/PAGE_SIZE)+1, PAGE_SIZE, 0, NULL, NULL, &qpair->dma_tag_payload); if (err != 0) { nvme_printf(ctrlr, "payload tag create failed %d\n", err); goto out; } /* * Each component must be page aligned, and individual PRP lists * cannot cross a page boundary. */ cmdsz = qpair->num_entries * sizeof(struct nvme_command); cmdsz = roundup2(cmdsz, PAGE_SIZE); cplsz = qpair->num_entries * sizeof(struct nvme_completion); cplsz = roundup2(cplsz, PAGE_SIZE); prpsz = sizeof(uint64_t) * NVME_MAX_PRP_LIST_ENTRIES; prpmemsz = qpair->num_trackers * prpsz; allocsz = cmdsz + cplsz + prpmemsz; err = bus_dma_tag_create(bus_get_dma_tag(ctrlr->dev), PAGE_SIZE, 0, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL, allocsz, 1, allocsz, 0, NULL, NULL, &qpair->dma_tag); if (err != 0) { nvme_printf(ctrlr, "tag create failed %d\n", err); goto out; } if (bus_dmamem_alloc(qpair->dma_tag, (void **)&queuemem, BUS_DMA_NOWAIT, &qpair->queuemem_map)) { nvme_printf(ctrlr, "failed to alloc qpair memory\n"); goto out; } if (bus_dmamap_load(qpair->dma_tag, qpair->queuemem_map, queuemem, allocsz, nvme_single_map, &queuemem_phys, 0) != 0) { nvme_printf(ctrlr, "failed to load qpair memory\n"); goto out; } qpair->num_cmds = 0; qpair->num_intr_handler_calls = 0; qpair->cmd = (struct nvme_command *)queuemem; qpair->cpl = (struct nvme_completion *)(queuemem + cmdsz); prpmem = (uint8_t *)(queuemem + cmdsz + cplsz); qpair->cmd_bus_addr = queuemem_phys; qpair->cpl_bus_addr = queuemem_phys + cmdsz; prpmem_phys = queuemem_phys + cmdsz + cplsz; qpair->sq_tdbl_off = nvme_mmio_offsetof(doorbell[id].sq_tdbl); qpair->cq_hdbl_off = nvme_mmio_offsetof(doorbell[id].cq_hdbl); TAILQ_INIT(&qpair->free_tr); TAILQ_INIT(&qpair->outstanding_tr); STAILQ_INIT(&qpair->queued_req); list_phys = prpmem_phys; prp_list = prpmem; for (i = 0; i < qpair->num_trackers; i++) { if (list_phys + prpsz > prpmem_phys + prpmemsz) { qpair->num_trackers = i; break; } /* * Make sure that the PRP list for this tracker doesn't * overflow to another page.
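*
* A quick numeric sketch of the check below, assuming 4 KB pages and,
* purely for illustration, a 1 KB PRP list (prpsz == 1024): if
* list_phys == base + 3584, then list_phys + prpsz - 1 == base + 4607,
* which trunc_page() places on the next page while the start is still
* on the first one, so the list would straddle a page boundary, which
* the PRP format (see the comment above) forbids:
*
*	trunc_page(base + 3584) != trunc_page(base + 4607)
*	list_phys = roundup2(base + 3584, PAGE_SIZE);	/* == base + 4096 */
*
* Rounding list_phys (and prp_list) up to the next page boundary
* restores the invariant at the cost of a little wasted PRP memory.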
*/ if (trunc_page(list_phys) != trunc_page(list_phys + prpsz - 1)) { list_phys = roundup2(list_phys, PAGE_SIZE); prp_list = (uint8_t *)roundup2((uintptr_t)prp_list, PAGE_SIZE); } tr = malloc(sizeof(*tr), M_NVME, M_ZERO | M_WAITOK); bus_dmamap_create(qpair->dma_tag_payload, 0, &tr->payload_dma_map); callout_init(&tr->timer, 1); tr->cid = i; tr->qpair = qpair; tr->prp = (uint64_t *)prp_list; tr->prp_bus_addr = list_phys; TAILQ_INSERT_HEAD(&qpair->free_tr, tr, tailq); list_phys += prpsz; prp_list += prpsz; } if (qpair->num_trackers == 0) { nvme_printf(ctrlr, "failed to allocate enough trackers\n"); goto out; } qpair->act_tr = malloc(sizeof(struct nvme_tracker *) * qpair->num_entries, M_NVME, M_ZERO | M_WAITOK); return (0); out: nvme_qpair_destroy(qpair); return (ENOMEM); } static void nvme_qpair_destroy(struct nvme_qpair *qpair) { struct nvme_tracker *tr; if (qpair->tag) bus_teardown_intr(qpair->ctrlr->dev, qpair->res, qpair->tag); if (mtx_initialized(&qpair->lock)) mtx_destroy(&qpair->lock); if (qpair->res) bus_release_resource(qpair->ctrlr->dev, SYS_RES_IRQ, rman_get_rid(qpair->res), qpair->res); if (qpair->cmd != NULL) { bus_dmamap_unload(qpair->dma_tag, qpair->queuemem_map); bus_dmamem_free(qpair->dma_tag, qpair->cmd, qpair->queuemem_map); } if (qpair->act_tr) free(qpair->act_tr, M_NVME); while (!TAILQ_EMPTY(&qpair->free_tr)) { tr = TAILQ_FIRST(&qpair->free_tr); TAILQ_REMOVE(&qpair->free_tr, tr, tailq); bus_dmamap_destroy(qpair->dma_tag_payload, tr->payload_dma_map); free(tr, M_NVME); } if (qpair->dma_tag) bus_dma_tag_destroy(qpair->dma_tag); if (qpair->dma_tag_payload) bus_dma_tag_destroy(qpair->dma_tag_payload); } static void nvme_admin_qpair_abort_aers(struct nvme_qpair *qpair) { struct nvme_tracker *tr; tr = TAILQ_FIRST(&qpair->outstanding_tr); while (tr != NULL) { - if ((le16toh(tr->req->cmd.opc_fuse) & NVME_CMD_OPC_MASK) == NVME_OPC_ASYNC_EVENT_REQUEST) { + if (tr->req->cmd.opc == NVME_OPC_ASYNC_EVENT_REQUEST) { nvme_qpair_manual_complete_tracker(qpair, tr, NVME_SCT_GENERIC, NVME_SC_ABORTED_SQ_DELETION, 0, FALSE); tr = TAILQ_FIRST(&qpair->outstanding_tr); } else { tr = TAILQ_NEXT(tr, tailq); } } } void nvme_admin_qpair_destroy(struct nvme_qpair *qpair) { nvme_admin_qpair_abort_aers(qpair); nvme_qpair_destroy(qpair); } void nvme_io_qpair_destroy(struct nvme_qpair *qpair) { nvme_qpair_destroy(qpair); } static void nvme_abort_complete(void *arg, const struct nvme_completion *status) { struct nvme_tracker *tr = arg; /* * If cdw0 == 1, the controller was not able to abort the command * we requested. We still need to check the active tracker array, * to cover race where I/O timed out at same time controller was * completing the I/O. */ if (status->cdw0 == 1 && tr->qpair->act_tr[tr->cid] != NULL) { /* * An I/O has timed out, and the controller was unable to * abort it for some reason. Construct a fake completion * status, and then complete the I/O's tracker manually. */ nvme_printf(tr->qpair->ctrlr, "abort command failed, aborting command manually\n"); nvme_qpair_manual_complete_tracker(tr->qpair, tr, NVME_SCT_GENERIC, NVME_SC_ABORTED_BY_REQUEST, 0, TRUE); } } static void nvme_timeout(void *arg) { struct nvme_tracker *tr = arg; struct nvme_qpair *qpair = tr->qpair; struct nvme_controller *ctrlr = qpair->ctrlr; uint32_t csts; uint8_t cfs; /* * Read csts to get value of cfs - controller fatal status. * If no fatal status, try to call the completion routine, and * if that completes any transactions, report a missed interrupt and * return (this may need to be rate limited).
Otherwise, if * aborts are enabled and the controller is not reporting * fatal status, abort the command. Otherwise, just reset the * controller and hope for the best. */ csts = nvme_mmio_read_4(ctrlr, csts); cfs = (csts >> NVME_CSTS_REG_CFS_SHIFT) & NVME_CSTS_REG_CFS_MASK; if (cfs == 0 && nvme_qpair_process_completions(qpair)) { nvme_printf(ctrlr, "Missing interrupt\n"); return; } if (ctrlr->enable_aborts && cfs == 0) { nvme_printf(ctrlr, "Aborting command due to a timeout.\n"); nvme_ctrlr_cmd_abort(ctrlr, tr->cid, qpair->id, nvme_abort_complete, tr); } else { nvme_printf(ctrlr, "Resetting controller due to a timeout%s.\n", cfs ? " and fatal error status" : ""); nvme_ctrlr_reset(ctrlr); } } void nvme_qpair_submit_tracker(struct nvme_qpair *qpair, struct nvme_tracker *tr) { struct nvme_request *req; struct nvme_controller *ctrlr; mtx_assert(&qpair->lock, MA_OWNED); req = tr->req; req->cmd.cid = tr->cid; qpair->act_tr[tr->cid] = tr; ctrlr = qpair->ctrlr; if (req->timeout) #if __FreeBSD_version >= 800030 callout_reset_curcpu(&tr->timer, ctrlr->timeout_period * hz, nvme_timeout, tr); #else callout_reset(&tr->timer, ctrlr->timeout_period * hz, nvme_timeout, tr); #endif /* Copy the command from the tracker to the submission queue. */ memcpy(&qpair->cmd[qpair->sq_tail], &req->cmd, sizeof(req->cmd)); if (++qpair->sq_tail == qpair->num_entries) qpair->sq_tail = 0; bus_dmamap_sync(qpair->dma_tag, qpair->queuemem_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); #ifndef __powerpc__ /* * powerpc's bus_dmamap_sync() already includes a heavyweight sync, but * no other archs do. */ wmb(); #endif nvme_mmio_write_4(qpair->ctrlr, doorbell[qpair->id].sq_tdbl, qpair->sq_tail); qpair->num_cmds++; } static void nvme_payload_map(void *arg, bus_dma_segment_t *seg, int nseg, int error) { struct nvme_tracker *tr = arg; uint32_t cur_nseg; /* * If the mapping operation failed, return immediately. The caller * is responsible for detecting the error status and failing the * tracker manually. */ if (error != 0) { nvme_printf(tr->qpair->ctrlr, "nvme_payload_map err %d\n", error); return; } /* * Note that we specified PAGE_SIZE for alignment and max * segment size when creating the bus dma tags. So here * we can safely just transfer each segment to its * associated PRP entry. */ tr->req->cmd.prp1 = htole64(seg[0].ds_addr); if (nseg == 2) { tr->req->cmd.prp2 = htole64(seg[1].ds_addr); } else if (nseg > 2) { cur_nseg = 1; tr->req->cmd.prp2 = htole64((uint64_t)tr->prp_bus_addr); while (cur_nseg < nseg) { tr->prp[cur_nseg-1] = htole64((uint64_t)seg[cur_nseg].ds_addr); cur_nseg++; } } else { /* * prp2 should not be used by the controller * since there is only one segment, but set * to 0 just to be safe. */ tr->req->cmd.prp2 = 0; } bus_dmamap_sync(tr->qpair->dma_tag_payload, tr->payload_dma_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); nvme_qpair_submit_tracker(tr->qpair, tr); } static void _nvme_qpair_submit_request(struct nvme_qpair *qpair, struct nvme_request *req) { struct nvme_tracker *tr; int err = 0; mtx_assert(&qpair->lock, MA_OWNED); tr = TAILQ_FIRST(&qpair->free_tr); req->qpair = qpair; if (tr == NULL || !qpair->is_enabled) { /* * No tracker is available, or the qpair is disabled due to * an in-progress controller-level reset or controller * failure. */ if (qpair->ctrlr->is_failed) { /* * The controller has failed. Post the request to a * task where it will be aborted, so that we do not * invoke the request's callback in the context * of the submission. 
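*
* A sketch of one hazard being avoided (the consumer callback here is
* hypothetical, not driver code): if the callback ran synchronously,
* it could submit a new request and re-enter this path while
* qpair->lock is still held by this thread:
*
*	static void
*	my_io_done(void *arg, const struct nvme_completion *cpl)
*	{
*		static char buf[512];
*
*		/* Resubmitting from the callback would reach        */
*		/* nvme_qpair_submit_request() -> mtx_lock() on the  */
*		/* very lock this thread already owns.               */
*		nvme_ns_cmd_write((struct nvme_namespace *)arg, buf,
*		    0, 1, my_io_done, arg);
*	}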
*/ nvme_ctrlr_post_failed_request(qpair->ctrlr, req); } else { /* * Put the request on the qpair's request queue to be * processed when a tracker frees up via a command * completion or when the controller reset is * completed. */ STAILQ_INSERT_TAIL(&qpair->queued_req, req, stailq); } return; } TAILQ_REMOVE(&qpair->free_tr, tr, tailq); TAILQ_INSERT_TAIL(&qpair->outstanding_tr, tr, tailq); tr->req = req; switch (req->type) { case NVME_REQUEST_VADDR: KASSERT(req->payload_size <= qpair->ctrlr->max_xfer_size, ("payload_size (%d) exceeds max_xfer_size (%d)\n", req->payload_size, qpair->ctrlr->max_xfer_size)); err = bus_dmamap_load(tr->qpair->dma_tag_payload, tr->payload_dma_map, req->u.payload, req->payload_size, nvme_payload_map, tr, 0); if (err != 0) nvme_printf(qpair->ctrlr, "bus_dmamap_load returned 0x%x!\n", err); break; case NVME_REQUEST_NULL: nvme_qpair_submit_tracker(tr->qpair, tr); break; #ifdef NVME_UNMAPPED_BIO_SUPPORT case NVME_REQUEST_BIO: KASSERT(req->u.bio->bio_bcount <= qpair->ctrlr->max_xfer_size, ("bio->bio_bcount (%jd) exceeds max_xfer_size (%d)\n", (intmax_t)req->u.bio->bio_bcount, qpair->ctrlr->max_xfer_size)); err = bus_dmamap_load_bio(tr->qpair->dma_tag_payload, tr->payload_dma_map, req->u.bio, nvme_payload_map, tr, 0); if (err != 0) nvme_printf(qpair->ctrlr, "bus_dmamap_load_bio returned 0x%x!\n", err); break; #endif case NVME_REQUEST_CCB: err = bus_dmamap_load_ccb(tr->qpair->dma_tag_payload, tr->payload_dma_map, req->u.payload, nvme_payload_map, tr, 0); if (err != 0) nvme_printf(qpair->ctrlr, "bus_dmamap_load_ccb returned 0x%x!\n", err); break; default: panic("unknown nvme request type 0x%x\n", req->type); break; } if (err != 0) { /* * The dmamap operation failed, so we manually fail the * tracker here with DATA_TRANSFER_ERROR status. * * nvme_qpair_manual_complete_tracker must not be called * with the qpair lock held. */ mtx_unlock(&qpair->lock); nvme_qpair_manual_complete_tracker(qpair, tr, NVME_SCT_GENERIC, NVME_SC_DATA_TRANSFER_ERROR, 1 /* do not retry */, TRUE); mtx_lock(&qpair->lock); } } void nvme_qpair_submit_request(struct nvme_qpair *qpair, struct nvme_request *req) { mtx_lock(&qpair->lock); _nvme_qpair_submit_request(qpair, req); mtx_unlock(&qpair->lock); } static void nvme_qpair_enable(struct nvme_qpair *qpair) { qpair->is_enabled = TRUE; } void nvme_qpair_reset(struct nvme_qpair *qpair) { qpair->sq_head = qpair->sq_tail = qpair->cq_head = 0; /* * First time through the completion queue, HW will set phase * bit on completions to 1. So set this to 1 here, indicating * we're looking for a 1 to know which entries have completed. * We'll toggle the bit each time the completion queue * rolls over. */ qpair->phase = 1; memset(qpair->cmd, 0, qpair->num_entries * sizeof(struct nvme_command)); memset(qpair->cpl, 0, qpair->num_entries * sizeof(struct nvme_completion)); } void nvme_admin_qpair_enable(struct nvme_qpair *qpair) { struct nvme_tracker *tr; struct nvme_tracker *tr_temp; /* * Manually abort each outstanding admin command. Do not retry * admin commands found here, since they will be left over from * a controller reset and it's likely the context in which the * command was issued no longer applies.
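*
* A related long-lived admin command is the async event request,
* which is torn down separately in nvme_admin_qpair_abort_aers()
* above; the driver arms one roughly like this (a paraphrase of
* nvme_ctrlr_cmd_async_event_request() using the new 'opc' field,
* not a verbatim quote of that function):
*
*	req = nvme_allocate_request_null(cb_fn, cb_arg);
*	req->timeout = FALSE;	/* may stay outstanding indefinitely */
*	req->cmd.opc = NVME_OPC_ASYNC_EVENT_REQUEST;
*	nvme_ctrlr_submit_admin_request(ctrlr, req);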
*/ TAILQ_FOREACH_SAFE(tr, &qpair->outstanding_tr, tailq, tr_temp) { nvme_printf(qpair->ctrlr, "aborting outstanding admin command\n"); nvme_qpair_manual_complete_tracker(qpair, tr, NVME_SCT_GENERIC, NVME_SC_ABORTED_BY_REQUEST, 1 /* do not retry */, TRUE); } nvme_qpair_enable(qpair); } void nvme_io_qpair_enable(struct nvme_qpair *qpair) { STAILQ_HEAD(, nvme_request) temp; struct nvme_tracker *tr; struct nvme_tracker *tr_temp; struct nvme_request *req; /* * Manually abort each outstanding I/O. This normally results in a * retry, unless the retry count on the associated request has * reached its limit. */ TAILQ_FOREACH_SAFE(tr, &qpair->outstanding_tr, tailq, tr_temp) { nvme_printf(qpair->ctrlr, "aborting outstanding i/o\n"); nvme_qpair_manual_complete_tracker(qpair, tr, NVME_SCT_GENERIC, NVME_SC_ABORTED_BY_REQUEST, 0, TRUE); } mtx_lock(&qpair->lock); nvme_qpair_enable(qpair); STAILQ_INIT(&temp); STAILQ_SWAP(&qpair->queued_req, &temp, nvme_request); while (!STAILQ_EMPTY(&temp)) { req = STAILQ_FIRST(&temp); STAILQ_REMOVE_HEAD(&temp, stailq); nvme_printf(qpair->ctrlr, "resubmitting queued i/o\n"); nvme_qpair_print_command(qpair, &req->cmd); _nvme_qpair_submit_request(qpair, req); } mtx_unlock(&qpair->lock); } static void nvme_qpair_disable(struct nvme_qpair *qpair) { struct nvme_tracker *tr; qpair->is_enabled = FALSE; mtx_lock(&qpair->lock); TAILQ_FOREACH(tr, &qpair->outstanding_tr, tailq) callout_stop(&tr->timer); mtx_unlock(&qpair->lock); } void nvme_admin_qpair_disable(struct nvme_qpair *qpair) { nvme_qpair_disable(qpair); nvme_admin_qpair_abort_aers(qpair); } void nvme_io_qpair_disable(struct nvme_qpair *qpair) { nvme_qpair_disable(qpair); } void nvme_qpair_fail(struct nvme_qpair *qpair) { struct nvme_tracker *tr; struct nvme_request *req; if (!mtx_initialized(&qpair->lock)) return; mtx_lock(&qpair->lock); while (!STAILQ_EMPTY(&qpair->queued_req)) { req = STAILQ_FIRST(&qpair->queued_req); STAILQ_REMOVE_HEAD(&qpair->queued_req, stailq); nvme_printf(qpair->ctrlr, "failing queued i/o\n"); mtx_unlock(&qpair->lock); nvme_qpair_manual_complete_request(qpair, req, NVME_SCT_GENERIC, NVME_SC_ABORTED_BY_REQUEST, TRUE); mtx_lock(&qpair->lock); } /* Manually abort each outstanding I/O. */ while (!TAILQ_EMPTY(&qpair->outstanding_tr)) { tr = TAILQ_FIRST(&qpair->outstanding_tr); /* * Do not remove the tracker. The abort_tracker path will * do that for us. */ nvme_printf(qpair->ctrlr, "failing outstanding i/o\n"); mtx_unlock(&qpair->lock); nvme_qpair_manual_complete_tracker(qpair, tr, NVME_SCT_GENERIC, NVME_SC_ABORTED_BY_REQUEST, 1 /* do not retry */, TRUE); mtx_lock(&qpair->lock); } mtx_unlock(&qpair->lock); } Index: head/sys/sys/param.h =================================================================== --- head/sys/sys/param.h (revision 338181) +++ head/sys/sys/param.h (revision 338182) @@ -1,365 +1,365 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)param.h 8.3 (Berkeley) 4/4/95 * $FreeBSD$ */ #ifndef _SYS_PARAM_H_ #define _SYS_PARAM_H_ #include #define BSD 199506 /* System version (year & month). */ #define BSD4_3 1 #define BSD4_4 1 /* * __FreeBSD_version numbers are documented in the Porter's Handbook. * If you bump the version for any reason, you should update the documentation * there. * Currently this lives here in the doc/ repository: * * head/en_US.ISO8859-1/books/porters-handbook/versions/chapter.xml * * scheme is: <major><two digit minor>Rxx * 'R' is in the range 0 to 4 if this is a release branch or * X.0-CURRENT before releng/X.0 is created, otherwise 'R' is * in the range 5 to 9. */ #undef __FreeBSD_version -#define __FreeBSD_version 1200080 /* Master, propagated to newvers */ +#define __FreeBSD_version 1200081 /* Master, propagated to newvers */ /* * __FreeBSD_kernel__ indicates that this system uses the kernel of FreeBSD, * which by definition is always true on FreeBSD. This macro is also defined * on other systems that use the kernel of FreeBSD, such as GNU/kFreeBSD. * * It is tempting to use this macro in userland code when we want to enable * kernel-specific routines, and in fact it's fine to do this in code that * is part of FreeBSD itself. However, be aware that as presence of this * macro is still not widespread (e.g. older FreeBSD versions, 3rd party * compilers, etc), it is STRONGLY DISCOURAGED to check for this macro in * external applications without also checking for __FreeBSD__ as an * alternative. */ #undef __FreeBSD_kernel__ #define __FreeBSD_kernel__ #if defined(_KERNEL) || defined(IN_RTLD) #define P_OSREL_SIGWAIT 700000 #define P_OSREL_SIGSEGV 700004 #define P_OSREL_MAP_ANON 800104 #define P_OSREL_MAP_FSTRICT 1100036 #define P_OSREL_SHUTDOWN_ENOTCONN 1100077 #define P_OSREL_MAP_GUARD 1200035 #define P_OSREL_WRFSBASE 1200041 #define P_OSREL_CK_CYLGRP 1200046 #define P_OSREL_VMTOTAL64 1200054 #define P_OSREL_MAJOR(x) ((x) / 100000) #endif #ifndef LOCORE #include #endif /* * Machine-independent constants (some used in following include files). * Redefined constants are from POSIX 1003.1 limits file.
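*
* An aside on the __FreeBSD_version bump above: under the
* <major><two digit minor>Rxx scheme, 1200081 parses as major 12,
* minor 00, and R=0 with xx=81 (R of 0 is consistent with head being
* 12.0-CURRENT before releng/12.0 exists).  The P_OSREL_MAJOR()
* helper defined above recovers the major number; a sketch check one
* could compile where that macro is visible:
*
*	_Static_assert(P_OSREL_MAJOR(1200081) == 12, "12.x");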
* * MAXCOMLEN should be >= sizeof(ac_comm) (see <acct.h>) */ #include #define MAXCOMLEN 19 /* max command name remembered */ #define MAXINTERP PATH_MAX /* max interpreter file name length */ #define MAXLOGNAME 33 /* max login name length (incl. NUL) */ #define MAXUPRC CHILD_MAX /* max simultaneous processes */ #define NCARGS ARG_MAX /* max bytes for an exec function */ #define NGROUPS (NGROUPS_MAX+1) /* max number groups */ #define NOFILE OPEN_MAX /* max open files per process */ #define NOGROUP 65535 /* marker for empty group set member */ #define MAXHOSTNAMELEN 256 /* max hostname size */ #define SPECNAMELEN 63 /* max length of devicename */ /* More types and definitions used throughout the kernel. */ #ifdef _KERNEL #include #include #ifndef LOCORE #include #include #endif #ifndef FALSE #define FALSE 0 #endif #ifndef TRUE #define TRUE 1 #endif #endif #ifndef _KERNEL /* Signals. */ #include #endif /* Machine type dependent parameters. */ #include #ifndef _KERNEL #include #endif #ifndef DEV_BSHIFT #define DEV_BSHIFT 9 /* log2(DEV_BSIZE) */ #endif #define DEV_BSIZE (1<<DEV_BSHIFT) #ifndef BLKDEV_IOSIZE #define BLKDEV_IOSIZE PAGE_SIZE /* default block device I/O size */ #endif #ifndef DFLTPHYS #define DFLTPHYS (64 * 1024) /* default max raw I/O transfer size */ #endif #ifndef MAXPHYS #define MAXPHYS (128 * 1024) /* max raw I/O transfer size */ #endif #ifndef MAXDUMPPGS #define MAXDUMPPGS (DFLTPHYS/PAGE_SIZE) #endif /* * Constants related to network buffer management. * MCLBYTES must be no larger than PAGE_SIZE. */ #ifndef MSIZE #define MSIZE 256 /* size of an mbuf */ #endif #ifndef MCLSHIFT #define MCLSHIFT 11 /* convert bytes to mbuf clusters */ #endif /* MCLSHIFT */ #define MCLBYTES (1 << MCLSHIFT) /* size of an mbuf cluster */ #if PAGE_SIZE < 2048 #define MJUMPAGESIZE MCLBYTES #elif PAGE_SIZE <= 8192 #define MJUMPAGESIZE PAGE_SIZE #else #define MJUMPAGESIZE (8 * 1024) #endif #define MJUM9BYTES (9 * 1024) /* jumbo cluster 9k */ #define MJUM16BYTES (16 * 1024) /* jumbo cluster 16k */ /* * Some macros for units conversion */ /* clicks to bytes */ #ifndef ctob #define ctob(x) ((x)<<PAGE_SHIFT) #endif /* bytes to clicks */ #ifndef btoc #define btoc(x) (((vm_offset_t)(x)+PAGE_MASK)>>PAGE_SHIFT) #endif /* * btodb() is messy and perhaps slow because `bytes' may be an off_t. We * want to shift an unsigned type to avoid sign extension and we don't * want to widen `bytes' unnecessarily. Assume that the result fits in * a daddr_t. */ #ifndef btodb #define btodb(bytes) /* calculates (bytes / DEV_BSIZE) */ \ (sizeof (bytes) > sizeof(long) \ ? (daddr_t)((unsigned long long)(bytes) >> DEV_BSHIFT) \ : (daddr_t)((unsigned long)(bytes) >> DEV_BSHIFT)) #endif #ifndef dbtob #define dbtob(db) /* calculates (db * DEV_BSIZE) */ \ ((off_t)(db) << DEV_BSHIFT) #endif #define PRIMASK 0x0ff #define PCATCH 0x100 /* OR'd with pri for tsleep to check signals */ #define PDROP 0x200 /* OR'd with pri to stop re-entry of interlock mutex */ #define NZERO 0 /* default "nice" */ #define NBBY 8 /* number of bits in a byte */ #define NBPW sizeof(int) /* number of bytes per word (integer) */ #define CMASK 022 /* default file mask: S_IWGRP|S_IWOTH */ #define NODEV (dev_t)(-1) /* non-existent device */ /* * File system parameters and macros. * * MAXBSIZE - Filesystems are made out of blocks of at most MAXBSIZE bytes * per block. MAXBSIZE may be made larger without affecting * any existing filesystems as long as it does not exceed MAXPHYS, * and may be made smaller at the risk of not being able to use * filesystems which require a block size exceeding MAXBSIZE. * * MAXBCACHEBUF - Maximum size of a buffer in the buffer cache. This must * be >= MAXBSIZE and can be set differently for different * architectures by defining it in <machine/param.h>. * Making this larger allows NFS to do larger reads/writes. * * BKVASIZE - Nominal buffer space per buffer, in bytes. BKVASIZE is the * minimum KVM memory reservation the kernel is willing to make. * Filesystems can of course request smaller chunks. Actual * backing memory uses a chunk size of a page (PAGE_SIZE). * The default value here can be overridden on a per-architecture * basis by defining it in <machine/param.h>. * * If you make BKVASIZE too small you risk seriously fragmenting * the buffer KVM map which may slow things down a bit. If you * make it too big the kernel will not be able to optimally use * the KVM memory reserved for the buffer cache and will wind * up with too-few buffers. * * The default is 16384, roughly 2x the block size used by a * normal UFS filesystem.
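*
* An aside on the btodb()/dbtob() pair defined earlier: with the
* default DEV_BSHIFT of 9 (512-byte disk blocks),
*
*	btodb(4096) == 8	(4 KB spans eight 512-byte blocks)
*	dbtob(8)    == 4096
*
* and the pair can be sanity-checked at compile time with
*
*	_Static_assert(dbtob(btodb(4096)) == 4096, "round trip");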
*/ #define MAXBSIZE 65536 /* must be power of 2 */ #ifndef MAXBCACHEBUF #define MAXBCACHEBUF MAXBSIZE /* must be a power of 2 >= MAXBSIZE */ #endif #ifndef BKVASIZE #define BKVASIZE 16384 /* must be power of 2 */ #endif #define BKVAMASK (BKVASIZE-1) /* * MAXPATHLEN defines the longest permissible path length after expanding * symbolic links. It is used to allocate a temporary buffer from the buffer * pool in which to do the name expansion, hence should be a power of two, * and must be less than or equal to MAXBSIZE. MAXSYMLINKS defines the * maximum number of symbolic links that may be expanded in a path name. * It should be set high enough to allow all legitimate uses, but halt * infinite loops reasonably quickly. */ #define MAXPATHLEN PATH_MAX #define MAXSYMLINKS 32 /* Bit map related macros. */ #define setbit(a,i) (((unsigned char *)(a))[(i)/NBBY] |= 1<<((i)%NBBY)) #define clrbit(a,i) (((unsigned char *)(a))[(i)/NBBY] &= ~(1<<((i)%NBBY))) #define isset(a,i) \ (((const unsigned char *)(a))[(i)/NBBY] & (1<<((i)%NBBY))) #define isclr(a,i) \ ((((const unsigned char *)(a))[(i)/NBBY] & (1<<((i)%NBBY))) == 0) /* Macros for counting and rounding. */ #ifndef howmany #define howmany(x, y) (((x)+((y)-1))/(y)) #endif #define nitems(x) (sizeof((x)) / sizeof((x)[0])) #define rounddown(x, y) (((x)/(y))*(y)) #define rounddown2(x, y) ((x)&(~((y)-1))) /* if y is power of two */ #define roundup(x, y) ((((x)+((y)-1))/(y))*(y)) /* to any y */ #define roundup2(x, y) (((x)+((y)-1))&(~((y)-1))) /* if y is power of two */ #define powerof2(x) ((((x)-1)&(x))==0) /* Macros for min/max. */ #define MIN(a,b) (((a)<(b))?(a):(b)) #define MAX(a,b) (((a)>(b))?(a):(b)) #ifdef _KERNEL /* * Basic byte order function prototypes for non-inline functions. */ #ifndef LOCORE #ifndef _BYTEORDER_PROTOTYPED #define _BYTEORDER_PROTOTYPED __BEGIN_DECLS __uint32_t htonl(__uint32_t); __uint16_t htons(__uint16_t); __uint32_t ntohl(__uint32_t); __uint16_t ntohs(__uint16_t); __END_DECLS #endif #endif #ifndef _BYTEORDER_FUNC_DEFINED #define _BYTEORDER_FUNC_DEFINED #define htonl(x) __htonl(x) #define htons(x) __htons(x) #define ntohl(x) __ntohl(x) #define ntohs(x) __ntohs(x) #endif /* !_BYTEORDER_FUNC_DEFINED */ #endif /* _KERNEL */ /* * Scale factor for scaled integers used to count %cpu time and load avgs. * * The number of CPU `tick's that map to a unique `%age' can be expressed * by the formula (1 / (2 ^ (FSHIFT - 11))). The maximum load average that * can be calculated (assuming 32 bits) can be closely approximated using * the formula (2 ^ (2 * (16 - FSHIFT))) for (FSHIFT < 15). * * For the scheduler to maintain a 1:1 mapping of CPU `tick' to `%age', * FSHIFT must be at least 11; this gives us a maximum load avg of ~1024. */ #define FSHIFT 11 /* bits to right of fixed binary point */ #define FSCALE (1<<FSHIFT) #define dbtoc(db) /* calculates devblks to pages */ ((db + (ctodb(1) - 1)) >> (PAGE_SHIFT - DEV_BSHIFT)) #define ctodb(db) /* calculates pages to devblks */ ((db) << (PAGE_SHIFT - DEV_BSHIFT)) /* * Old spelling of __containerof(). */ #define member2struct(s, m, x) \ ((struct s *)(void *)((char *)(x) - offsetof(struct s, m))) /* * Access a variable length array that has been declared as a fixed * length array.
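*
* For instance (the record type here is hypothetical, not from this
* header): given
*
*	struct rec { uint16_t len; char name[1]; };
*	struct rec *r;
*
* where a record read off disk really carries r->len name bytes,
*
*	char c = __PAST_END(r->name, 4);
*
* fetches name[4] through a plain pointer to the element type, so the
* compiler's array-bounds diagnostics for the declared char[1] are
* not triggered.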
*/ #define __PAST_END(array, offset) (((__typeof__(*(array)) *)(array))[offset]) #endif /* _SYS_PARAM_H_ */ Index: head/usr.sbin/bhyve/pci_nvme.c =================================================================== --- head/usr.sbin/bhyve/pci_nvme.c (revision 338181) +++ head/usr.sbin/bhyve/pci_nvme.c (revision 338182) @@ -1,1854 +1,1849 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2017 Shunsuke Mie * Copyright (c) 2018 Leon Dang * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * bhyve PCIe-NVMe device emulation. * * options: * -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z * * accepted devpath: * /dev/blockdev * /path/to/image * ram=size_in_MiB * * maxq = max number of queues * qsz = max elements in each queue * ioslots = max number of concurrent io requests * sectsz = sector size (defaults to blockif sector size) * ser = serial number (20-chars max) * */ /* TODO: - create async event for smart and log - intr coalesce */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "bhyverun.h" #include "block_if.h" #include "pci_emul.h" static int nvme_debug = 0; #define DPRINTF(params) if (nvme_debug) printf params #define WPRINTF(params) printf params /* defaults; can be overridden */ #define NVME_MSIX_BAR 4 #define NVME_IOSLOTS 8 #define NVME_QUEUES 16 #define NVME_MAX_QENTRIES 2048 #define NVME_PRP2_ITEMS (PAGE_SIZE/sizeof(uint64_t)) #define NVME_MAX_BLOCKIOVS 512 /* helpers */ #define NVME_DOORBELL_OFFSET offsetof(struct nvme_registers, doorbell) enum nvme_controller_register_offsets { NVME_CR_CAP_LOW = 0x00, NVME_CR_CAP_HI = 0x04, NVME_CR_VS = 0x08, NVME_CR_INTMS = 0x0c, NVME_CR_INTMC = 0x10, NVME_CR_CC = 0x14, NVME_CR_CSTS = 0x1c, NVME_CR_NSSR = 0x20, NVME_CR_AQA = 0x24, NVME_CR_ASQ_LOW = 0x28, NVME_CR_ASQ_HI = 0x2c, NVME_CR_ACQ_LOW = 0x30, NVME_CR_ACQ_HI = 0x34, }; enum nvme_cmd_cdw11 { NVME_CMD_CDW11_PC = 0x0001, NVME_CMD_CDW11_IEN = 0x0002, NVME_CMD_CDW11_IV = 0xFFFF0000, }; -#define NVME_CMD_GET_OPC(opc) \ - ((opc) >> NVME_CMD_OPC_SHIFT & NVME_CMD_OPC_MASK) - #define NVME_CQ_INTEN 0x01 #define NVME_CQ_INTCOAL 0x02 struct nvme_completion_queue { struct nvme_completion *qbase; uint32_t size; uint16_t tail; /* nvme progress */ uint16_t head; /* guest
progress */ uint16_t intr_vec; uint32_t intr_en; pthread_mutex_t mtx; }; struct nvme_submission_queue { struct nvme_command *qbase; uint32_t size; uint16_t head; /* nvme progress */ uint16_t tail; /* guest progress */ uint16_t cqid; /* completion queue id */ int busy; /* queue is being processed */ int qpriority; }; enum nvme_storage_type { NVME_STOR_BLOCKIF = 0, NVME_STOR_RAM = 1, }; struct pci_nvme_blockstore { enum nvme_storage_type type; void *ctx; uint64_t size; uint32_t sectsz; uint32_t sectsz_bits; }; struct pci_nvme_ioreq { struct pci_nvme_softc *sc; struct pci_nvme_ioreq *next; struct nvme_submission_queue *nvme_sq; uint16_t sqid; /* command information */ uint16_t opc; uint16_t cid; uint32_t nsid; uint64_t prev_gpaddr; size_t prev_size; /* * lock if all iovs consumed (big IO); * complete transaction before continuing */ pthread_mutex_t mtx; pthread_cond_t cv; struct blockif_req io_req; /* pad to fit up to 512 page descriptors from guest IO request */ struct iovec iovpadding[NVME_MAX_BLOCKIOVS-BLOCKIF_IOV_MAX]; }; struct pci_nvme_softc { struct pci_devinst *nsc_pi; pthread_mutex_t mtx; struct nvme_registers regs; struct nvme_namespace_data nsdata; struct nvme_controller_data ctrldata; struct pci_nvme_blockstore nvstore; uint16_t max_qentries; /* max entries per queue */ uint32_t max_queues; uint32_t num_cqueues; uint32_t num_squeues; struct pci_nvme_ioreq *ioreqs; struct pci_nvme_ioreq *ioreqs_free; /* free list of ioreqs */ uint32_t pending_ios; uint32_t ioslots; sem_t iosemlock; /* status and guest memory mapped queues */ struct nvme_completion_queue *compl_queues; struct nvme_submission_queue *submit_queues; /* controller features */ uint32_t intr_coales_aggr_time; /* 0x08: uS to delay intr */ uint32_t intr_coales_aggr_thresh; /* 0x08: compl-Q entries */ uint32_t async_ev_config; /* 0x0B: async event config */ }; static void pci_nvme_io_partial(struct blockif_req *br, int err); /* Controller Configuration utils */ #define NVME_CC_GET_EN(cc) \ ((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK) #define NVME_CC_GET_CSS(cc) \ ((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK) #define NVME_CC_GET_SHN(cc) \ ((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK) #define NVME_CC_GET_IOSQES(cc) \ ((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK) #define NVME_CC_GET_IOCQES(cc) \ ((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK) #define NVME_CC_WRITE_MASK \ ((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \ (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \ (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT)) #define NVME_CC_NEN_WRITE_MASK \ ((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \ (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \ (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT)) /* Controller Status utils */ #define NVME_CSTS_GET_RDY(sts) \ ((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK) #define NVME_CSTS_RDY (1 << NVME_CSTS_REG_RDY_SHIFT) /* Completion Queue status word utils */ #define NVME_STATUS_P (1 << NVME_STATUS_P_SHIFT) #define NVME_STATUS_MASK \ ((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\ (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT)) static __inline void cpywithpad(char *dst, int dst_size, const char *src, char pad) { int len = strnlen(src, dst_size); memcpy(dst, src, len); memset(dst + len, pad, dst_size - len); } static __inline void pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code) { *status &= ~NVME_STATUS_MASK; *status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT | (code & 
NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT; } static __inline void pci_nvme_status_genc(uint16_t *status, uint16_t code) { pci_nvme_status_tc(status, NVME_SCT_GENERIC, code); } static __inline void pci_nvme_toggle_phase(uint16_t *status, int prev) { if (prev) *status &= ~NVME_STATUS_P; else *status |= NVME_STATUS_P; } static void pci_nvme_init_ctrldata(struct pci_nvme_softc *sc) { struct nvme_controller_data *cd = &sc->ctrldata; cd->vid = 0xFB5D; cd->ssvid = 0x0000; cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' '); cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' '); /* Num of submission commands that we can handle at a time (2^rab) */ cd->rab = 4; /* FreeBSD OUI */ cd->ieee[0] = 0x58; cd->ieee[1] = 0x9c; cd->ieee[2] = 0xfc; cd->mic = 0; cd->mdts = 9; /* max data transfer size (2^mdts * CAP.MPSMIN) */ cd->ver = 0x00010300; cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT; cd->acl = 2; cd->aerl = 4; cd->lpa = 0; /* TODO: support some simple things like SMART */ cd->elpe = 0; /* max error log page entries */ cd->npss = 1; /* number of power states supported */ /* Warning Composite Temperature Threshold */ cd->wctemp = 0x0157; cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) | (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT); cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) | (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT); cd->nn = 1; /* number of namespaces */ cd->fna = 0x03; cd->power_state[0].mp = 10; } static void pci_nvme_init_nsdata(struct pci_nvme_softc *sc) { struct nvme_namespace_data *nd; nd = &sc->nsdata; nd->nsze = sc->nvstore.size / sc->nvstore.sectsz; nd->ncap = nd->nsze; nd->nuse = nd->nsze; /* Get LBA and backstore information from backing store */ nd->nlbaf = 1; /* LBA data-sz = 2^lbads */ nd->lbaf[0] = sc->nvstore.sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT; nd->flbas = 0; } static void pci_nvme_reset_locked(struct pci_nvme_softc *sc) { DPRINTF(("%s\r\n", __func__)); sc->regs.cap_lo = (sc->max_qentries & NVME_CAP_LO_REG_MQES_MASK) | (1 << NVME_CAP_LO_REG_CQR_SHIFT) | (60 << NVME_CAP_LO_REG_TO_SHIFT); sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT; sc->regs.vs = 0x00010300; /* NVMe v1.3 */ sc->regs.cc = 0; sc->regs.csts = 0; sc->num_cqueues = sc->num_squeues = sc->max_queues; if (sc->submit_queues != NULL) { for (int i = 0; i <= sc->max_queues; i++) { /* * The Admin Submission Queue is at index 0. * It must not be changed at reset, otherwise the * emulation will be out of sync with the guest.
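*
* The admin queue geometry itself comes from the guest-programmed
* AQA register, decoded in pci_nvme_init_controller() below; both
* fields are 0-based.  For example, with AQA = 0x001f001f (a sketch
* value):
*
*	asqs = (0x001f001f & NVME_AQA_REG_ASQS_MASK) + 1;	/* 32-entry admin SQ */
*	acqs = ((0x001f001f >> NVME_AQA_REG_ACQS_SHIFT) &
*	    NVME_AQA_REG_ACQS_MASK) + 1;			/* 32-entry admin CQ */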
*/ if (i != 0) { sc->submit_queues[i].qbase = NULL; sc->submit_queues[i].size = 0; sc->submit_queues[i].cqid = 0; sc->compl_queues[i].qbase = NULL; sc->compl_queues[i].size = 0; } sc->submit_queues[i].tail = 0; sc->submit_queues[i].head = 0; sc->submit_queues[i].busy = 0; sc->compl_queues[i].tail = 0; sc->compl_queues[i].head = 0; } } else sc->submit_queues = calloc(sc->max_queues + 1, sizeof(struct nvme_submission_queue)); if (sc->compl_queues == NULL) { sc->compl_queues = calloc(sc->max_queues + 1, sizeof(struct nvme_completion_queue)); for (int i = 0; i <= sc->num_cqueues; i++) pthread_mutex_init(&sc->compl_queues[i].mtx, NULL); } } static void pci_nvme_reset(struct pci_nvme_softc *sc) { pthread_mutex_lock(&sc->mtx); pci_nvme_reset_locked(sc); pthread_mutex_unlock(&sc->mtx); } static void pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc) { uint16_t acqs, asqs; DPRINTF(("%s\r\n", __func__)); asqs = (sc->regs.aqa & NVME_AQA_REG_ASQS_MASK) + 1; sc->submit_queues[0].size = asqs; sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq, sizeof(struct nvme_command) * asqs); DPRINTF(("%s mapping Admin-SQ guest 0x%lx, host: %p\r\n", __func__, sc->regs.asq, sc->submit_queues[0].qbase)); acqs = ((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) & NVME_AQA_REG_ACQS_MASK) + 1; sc->compl_queues[0].size = acqs; sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq, sizeof(struct nvme_completion) * acqs); DPRINTF(("%s mapping Admin-CQ guest 0x%lx, host: %p\r\n", __func__, sc->regs.acq, sc->compl_queues[0].qbase)); } static int nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command, struct nvme_completion* compl) { uint16_t qid = command->cdw10 & 0xffff; DPRINTF(("%s DELETE_IO_SQ %u\r\n", __func__, qid)); if (qid == 0 || qid > sc->num_cqueues) { WPRINTF(("%s NOT PERMITTED queue id %u / num_squeues %u\r\n", __func__, qid, sc->num_squeues)); pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, NVME_SC_INVALID_QUEUE_IDENTIFIER); return (1); } sc->submit_queues[qid].qbase = NULL; pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); return (1); } static int nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command, struct nvme_completion* compl) { if (command->cdw11 & NVME_CMD_CDW11_PC) { uint16_t qid = command->cdw10 & 0xffff; struct nvme_submission_queue *nsq; if (qid > sc->num_squeues) { WPRINTF(("%s queue index %u > num_squeues %u\r\n", __func__, qid, sc->num_squeues)); pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, NVME_SC_INVALID_QUEUE_IDENTIFIER); return (1); } nsq = &sc->submit_queues[qid]; nsq->size = ((command->cdw10 >> 16) & 0xffff) + 1; nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, sizeof(struct nvme_command) * (size_t)nsq->size); nsq->cqid = (command->cdw11 >> 16) & 0xffff; nsq->qpriority = (command->cdw11 >> 1) & 0x03; DPRINTF(("%s sq %u size %u gaddr %p cqid %u\r\n", __func__, qid, nsq->size, nsq->qbase, nsq->cqid)); pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); DPRINTF(("%s completed creating IOSQ qid %u\r\n", __func__, qid)); } else { /* * Guest sent non-cont submission queue request. * This setting is unsupported by this emulation. 
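*
* For contrast, a create request this code does accept, decoded with
* the same field extraction used above (sketch values):
*
*	cdw10 = 0x00ff0001;	/* bits 31:16 QSIZE-1 = 0xff -> 256 entries, bits 15:0 QID = 1 */
*	cdw11 = 0x00010001;	/* bits 31:16 CQID = 1, bit 0 PC = 1 (physically contiguous) */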
static int
nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	uint16_t qid = command->cdw10 & 0xffff;

	DPRINTF(("%s DELETE_IO_CQ %u\r\n", __func__, qid));
	if (qid == 0 || qid > sc->num_cqueues) {
		WPRINTF(("%s queue index %u / num_cqueues %u\r\n",
		    __func__, qid, sc->num_cqueues));
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
		return (1);
	}

	sc->compl_queues[qid].qbase = NULL;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
	return (1);
}

static int
nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	if (command->cdw11 & NVME_CMD_CDW11_PC) {
		uint16_t qid = command->cdw10 & 0xffff;
		struct nvme_completion_queue *ncq;

		if (qid > sc->num_cqueues) {
			WPRINTF(("%s queue index %u > num_cqueues %u\r\n",
			    __func__, qid, sc->num_cqueues));
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
			return (1);
		}

		ncq = &sc->compl_queues[qid];
		ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
		ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
		ncq->size = ((command->cdw10 >> 16) & 0xffff) + 1;

		/* Completion queue entries are nvme_completion sized */
		ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
		    command->prp1,
		    sizeof(struct nvme_completion) * (size_t)ncq->size);

		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
	} else {
		/*
		 * Non-contig completion queue unsupported.
		 */
		WPRINTF(("%s unsupported non-contig (list-based) "
		    "create i/o completion queue\r\n", __func__));
		/* 0x12 = Invalid Use of Controller Memory Buffer */
		pci_nvme_status_genc(&compl->status, 0x12);
	}

	return (1);
}

static int
nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	/* NUMD (cdw10 bits 27:16) is a 0-based count of dwords */
	uint32_t logsize = (1 + ((command->cdw10 >> 16) & 0xFFF)) *
	    sizeof(uint32_t);
	uint8_t logpage = command->cdw10 & 0xFF;
	void *data;

	DPRINTF(("%s log page %u len %u\r\n", __func__, logpage, logsize));

	if (logpage >= 1 && logpage <= 3)
		data = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
		    PAGE_SIZE);

	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

	switch (logpage) {
	case 0x01: /* Error information */
		memset(data, 0, logsize > PAGE_SIZE ? PAGE_SIZE : logsize);
		break;
	case 0x02: /* SMART/Health information */
		/* TODO: present some smart info */
		memset(data, 0, logsize > PAGE_SIZE ? PAGE_SIZE : logsize);
		break;
	case 0x03: /* Firmware slot information */
		memset(data, 0, logsize > PAGE_SIZE ?
PAGE_SIZE : logsize); break; default: WPRINTF(("%s get log page %x command not supported\r\n", __func__, logpage)); pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, NVME_SC_INVALID_LOG_PAGE); } return (1); } static int nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command, struct nvme_completion* compl) { void *dest; DPRINTF(("%s identify 0x%x nsid 0x%x\r\n", __func__, command->cdw10 & 0xFF, command->nsid)); switch (command->cdw10 & 0xFF) { case 0x00: /* return Identify Namespace data structure */ dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, sizeof(sc->nsdata)); memcpy(dest, &sc->nsdata, sizeof(sc->nsdata)); break; case 0x01: /* return Identify Controller data structure */ dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, sizeof(sc->ctrldata)); memcpy(dest, &sc->ctrldata, sizeof(sc->ctrldata)); break; case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */ dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, sizeof(uint32_t) * 1024); ((uint32_t *)dest)[0] = 1; ((uint32_t *)dest)[1] = 0; break; case 0x11: pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_NAMESPACE_OR_FORMAT); return (1); case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */ case 0x10: case 0x12: case 0x13: case 0x14: case 0x15: default: DPRINTF(("%s unsupported identify command requested 0x%x\r\n", __func__, command->cdw10 & 0xFF)); pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); return (1); } pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); return (1); } static int nvme_opc_set_features(struct pci_nvme_softc* sc, struct nvme_command* command, struct nvme_completion* compl) { int feature = command->cdw10 & 0xFF; uint32_t iv; DPRINTF(("%s feature 0x%x\r\n", __func__, feature)); compl->cdw0 = 0; switch (feature) { case NVME_FEAT_ARBITRATION: DPRINTF((" arbitration 0x%x\r\n", command->cdw11)); break; case NVME_FEAT_POWER_MANAGEMENT: DPRINTF((" power management 0x%x\r\n", command->cdw11)); break; case NVME_FEAT_LBA_RANGE_TYPE: DPRINTF((" lba range 0x%x\r\n", command->cdw11)); break; case NVME_FEAT_TEMPERATURE_THRESHOLD: DPRINTF((" temperature threshold 0x%x\r\n", command->cdw11)); break; case NVME_FEAT_ERROR_RECOVERY: DPRINTF((" error recovery 0x%x\r\n", command->cdw11)); break; case NVME_FEAT_VOLATILE_WRITE_CACHE: DPRINTF((" volatile write cache 0x%x\r\n", command->cdw11)); break; case NVME_FEAT_NUMBER_OF_QUEUES: sc->num_squeues = command->cdw11 & 0xFFFF; sc->num_cqueues = (command->cdw11 >> 16) & 0xFFFF; DPRINTF((" number of queues (submit %u, completion %u)\r\n", sc->num_squeues, sc->num_cqueues)); if (sc->num_squeues == 0 || sc->num_squeues > sc->max_queues) sc->num_squeues = sc->max_queues; if (sc->num_cqueues == 0 || sc->num_cqueues > sc->max_queues) sc->num_cqueues = sc->max_queues; compl->cdw0 = (sc->num_squeues & 0xFFFF) | ((sc->num_cqueues & 0xFFFF) << 16); break; case NVME_FEAT_INTERRUPT_COALESCING: DPRINTF((" interrupt coalescing 0x%x\r\n", command->cdw11)); /* in uS */ sc->intr_coales_aggr_time = ((command->cdw11 >> 8) & 0xFF)*100; sc->intr_coales_aggr_thresh = command->cdw11 & 0xFF; break; case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION: iv = command->cdw11 & 0xFFFF; DPRINTF((" interrupt vector configuration 0x%x\r\n", command->cdw11)); for (uint32_t i = 0; i <= sc->num_cqueues; i++) { if (sc->compl_queues[i].intr_vec == iv) { if (command->cdw11 & (1 << 16)) sc->compl_queues[i].intr_en |= NVME_CQ_INTCOAL; else sc->compl_queues[i].intr_en &= ~NVME_CQ_INTCOAL; } } break; case NVME_FEAT_WRITE_ATOMICITY: DPRINTF((" write atomicity 
0x%x\r\n", command->cdw11)); break; case NVME_FEAT_ASYNC_EVENT_CONFIGURATION: DPRINTF((" async event configuration 0x%x\r\n", command->cdw11)); sc->async_ev_config = command->cdw11; break; case NVME_FEAT_SOFTWARE_PROGRESS_MARKER: DPRINTF((" software progress marker 0x%x\r\n", command->cdw11)); break; case 0x0C: DPRINTF((" autonomous power state transition 0x%x\r\n", command->cdw11)); break; default: WPRINTF(("%s invalid feature\r\n", __func__)); pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); return (1); } pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); return (1); } static int nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command, struct nvme_completion* compl) { int feature = command->cdw10 & 0xFF; DPRINTF(("%s feature 0x%x\r\n", __func__, feature)); compl->cdw0 = 0; switch (feature) { case NVME_FEAT_ARBITRATION: DPRINTF((" arbitration\r\n")); break; case NVME_FEAT_POWER_MANAGEMENT: DPRINTF((" power management\r\n")); break; case NVME_FEAT_LBA_RANGE_TYPE: DPRINTF((" lba range\r\n")); break; case NVME_FEAT_TEMPERATURE_THRESHOLD: DPRINTF((" temperature threshold\r\n")); switch ((command->cdw11 >> 20) & 0x3) { case 0: /* Over temp threshold */ compl->cdw0 = 0xFFFF; break; case 1: /* Under temp threshold */ compl->cdw0 = 0; break; default: WPRINTF((" invalid threshold type select\r\n")); pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); return (1); } break; case NVME_FEAT_ERROR_RECOVERY: DPRINTF((" error recovery\r\n")); break; case NVME_FEAT_VOLATILE_WRITE_CACHE: DPRINTF((" volatile write cache\r\n")); break; case NVME_FEAT_NUMBER_OF_QUEUES: compl->cdw0 = 0; if (sc->num_squeues == 0) compl->cdw0 |= sc->max_queues & 0xFFFF; else compl->cdw0 |= sc->num_squeues & 0xFFFF; if (sc->num_cqueues == 0) compl->cdw0 |= (sc->max_queues & 0xFFFF) << 16; else compl->cdw0 |= (sc->num_cqueues & 0xFFFF) << 16; DPRINTF((" number of queues (submit %u, completion %u)\r\n", compl->cdw0 & 0xFFFF, (compl->cdw0 >> 16) & 0xFFFF)); break; case NVME_FEAT_INTERRUPT_COALESCING: DPRINTF((" interrupt coalescing\r\n")); break; case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION: DPRINTF((" interrupt vector configuration\r\n")); break; case NVME_FEAT_WRITE_ATOMICITY: DPRINTF((" write atomicity\r\n")); break; case NVME_FEAT_ASYNC_EVENT_CONFIGURATION: DPRINTF((" async event configuration\r\n")); sc->async_ev_config = command->cdw11; break; case NVME_FEAT_SOFTWARE_PROGRESS_MARKER: DPRINTF((" software progress marker\r\n")); break; case 0x0C: DPRINTF((" autonomous power state transition\r\n")); break; default: WPRINTF(("%s invalid feature 0x%x\r\n", __func__, feature)); pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); return (1); } pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); return (1); } static int nvme_opc_abort(struct pci_nvme_softc* sc, struct nvme_command* command, struct nvme_completion* compl) { DPRINTF(("%s submission queue %u, command ID 0x%x\r\n", __func__, command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF)); /* TODO: search for the command ID and abort it */ compl->cdw0 = 1; pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); return (1); } static int nvme_opc_async_event_req(struct pci_nvme_softc* sc, struct nvme_command* command, struct nvme_completion* compl) { DPRINTF(("%s async event request 0x%x\r\n", __func__, command->cdw11)); /* * TODO: raise events when they happen based on the Set Features cmd. * These events happen async, so only set completion successful if * there is an event reflective of the request to get event. 
*/ pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED); return (0); } static void pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value) { struct nvme_completion compl; struct nvme_command *cmd; struct nvme_submission_queue *sq; struct nvme_completion_queue *cq; int do_intr = 0; uint16_t sqhead; DPRINTF(("%s index %u\r\n", __func__, (uint32_t)value)); sq = &sc->submit_queues[0]; sqhead = atomic_load_acq_short(&sq->head); if (atomic_testandset_int(&sq->busy, 1)) { DPRINTF(("%s SQ busy, head %u, tail %u\r\n", __func__, sqhead, sq->tail)); return; } DPRINTF(("sqhead %u, tail %u\r\n", sqhead, sq->tail)); while (sqhead != atomic_load_acq_short(&sq->tail)) { cmd = &(sq->qbase)[sqhead]; compl.status = 0; - switch (NVME_CMD_GET_OPC(cmd->opc_fuse)) { + switch (cmd->opc) { case NVME_OPC_DELETE_IO_SQ: DPRINTF(("%s command DELETE_IO_SQ\r\n", __func__)); do_intr |= nvme_opc_delete_io_sq(sc, cmd, &compl); break; case NVME_OPC_CREATE_IO_SQ: DPRINTF(("%s command CREATE_IO_SQ\r\n", __func__)); do_intr |= nvme_opc_create_io_sq(sc, cmd, &compl); break; case NVME_OPC_DELETE_IO_CQ: DPRINTF(("%s command DELETE_IO_CQ\r\n", __func__)); do_intr |= nvme_opc_delete_io_cq(sc, cmd, &compl); break; case NVME_OPC_CREATE_IO_CQ: DPRINTF(("%s command CREATE_IO_CQ\r\n", __func__)); do_intr |= nvme_opc_create_io_cq(sc, cmd, &compl); break; case NVME_OPC_GET_LOG_PAGE: DPRINTF(("%s command GET_LOG_PAGE\r\n", __func__)); do_intr |= nvme_opc_get_log_page(sc, cmd, &compl); break; case NVME_OPC_IDENTIFY: DPRINTF(("%s command IDENTIFY\r\n", __func__)); do_intr |= nvme_opc_identify(sc, cmd, &compl); break; case NVME_OPC_ABORT: DPRINTF(("%s command ABORT\r\n", __func__)); do_intr |= nvme_opc_abort(sc, cmd, &compl); break; case NVME_OPC_SET_FEATURES: DPRINTF(("%s command SET_FEATURES\r\n", __func__)); do_intr |= nvme_opc_set_features(sc, cmd, &compl); break; case NVME_OPC_GET_FEATURES: DPRINTF(("%s command GET_FEATURES\r\n", __func__)); do_intr |= nvme_opc_get_features(sc, cmd, &compl); break; case NVME_OPC_ASYNC_EVENT_REQUEST: DPRINTF(("%s command ASYNC_EVENT_REQ\r\n", __func__)); /* XXX dont care, unhandled for now do_intr |= nvme_opc_async_event_req(sc, cmd, &compl); */ break; default: WPRINTF(("0x%x command is not implemented\r\n", - NVME_CMD_GET_OPC(cmd->opc_fuse))); + cmd->opc)); } /* for now skip async event generation */ - if (NVME_CMD_GET_OPC(cmd->opc_fuse) != - NVME_OPC_ASYNC_EVENT_REQUEST) { + if (cmd->opc != NVME_OPC_ASYNC_EVENT_REQUEST) { struct nvme_completion *cp; int phase; cq = &sc->compl_queues[0]; cp = &(cq->qbase)[cq->tail]; cp->sqid = 0; cp->sqhd = sqhead; cp->cid = cmd->cid; phase = NVME_STATUS_GET_P(cp->status); cp->status = compl.status; pci_nvme_toggle_phase(&cp->status, phase); cq->tail = (cq->tail + 1) % cq->size; } sqhead = (sqhead + 1) % sq->size; } DPRINTF(("setting sqhead %u\r\n", sqhead)); atomic_store_short(&sq->head, sqhead); atomic_store_int(&sq->busy, 0); if (do_intr) pci_generate_msix(sc->nsc_pi, 0); } static int pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req, uint64_t gpaddr, size_t size, int do_write, uint64_t lba) { int iovidx; if (req != NULL) { /* concatenate contig block-iovs to minimize number of iovs */ if ((req->prev_gpaddr + req->prev_size) == gpaddr) { iovidx = req->io_req.br_iovcnt - 1; req->io_req.br_iov[iovidx].iov_base = paddr_guest2host(req->sc->nsc_pi->pi_vmctx, req->prev_gpaddr, size); req->prev_size += size; req->io_req.br_resid += size; req->io_req.br_iov[iovidx].iov_len = 
			    req->prev_size;
		} else {
			pthread_mutex_lock(&req->mtx);

			iovidx = req->io_req.br_iovcnt;
			if (iovidx == NVME_MAX_BLOCKIOVS) {
				int err = 0;

				DPRINTF(("large I/O, doing partial req\r\n"));
				iovidx = 0;
				req->io_req.br_iovcnt = 0;

				req->io_req.br_callback = pci_nvme_io_partial;

				if (!do_write)
					err = blockif_read(sc->nvstore.ctx,
					    &req->io_req);
				else
					err = blockif_write(sc->nvstore.ctx,
					    &req->io_req);

				/* wait until req completes before cont */
				if (err == 0)
					pthread_cond_wait(&req->cv, &req->mtx);
			}
			if (iovidx == 0) {
				req->io_req.br_offset = lba;
				req->io_req.br_resid = 0;
				req->io_req.br_param = req;
			}

			req->io_req.br_iov[iovidx].iov_base =
			    paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
			    gpaddr, size);
			req->io_req.br_iov[iovidx].iov_len = size;

			req->prev_gpaddr = gpaddr;
			req->prev_size = size;
			req->io_req.br_resid += size;

			req->io_req.br_iovcnt++;

			pthread_mutex_unlock(&req->mtx);
		}
	} else {
		/* RAM buffer: read/write directly */
		void *p = sc->nvstore.ctx;
		void *gptr;

		if ((lba + size) > sc->nvstore.size) {
			WPRINTF(("%s write would overflow RAM\r\n",
			    __func__));
			return (-1);
		}

		p = (void *)((uintptr_t)p + (uintptr_t)lba);
		gptr = paddr_guest2host(sc->nsc_pi->pi_vmctx, gpaddr, size);
		if (do_write)
			memcpy(p, gptr, size);
		else
			memcpy(gptr, p, size);
	}
	return (0);
}

static void
pci_nvme_set_completion(struct pci_nvme_softc *sc,
	struct nvme_submission_queue *sq, int sqid, uint16_t cid,
	uint32_t cdw0, uint16_t status, int ignore_busy)
{
	struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid];
	struct nvme_completion *compl;
	int do_intr = 0;
	int phase;

	DPRINTF(("%s sqid %d cqid %u cid %u status: 0x%x 0x%x\r\n",
	    __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status),
	    NVME_STATUS_GET_SC(status)));

	pthread_mutex_lock(&cq->mtx);

	assert(cq->qbase != NULL);

	compl = &cq->qbase[cq->tail];

	compl->sqhd = atomic_load_acq_short(&sq->head);
	compl->sqid = sqid;
	compl->cid = cid;

	/*
	 * Toggle the phase tag: the entry is written with the inverse of
	 * the phase previously stored in this slot, which is how the guest
	 * detects new completions.
	 */
	phase = NVME_STATUS_GET_P(compl->status);
	compl->status = status;
	pci_nvme_toggle_phase(&compl->status, phase);

	cq->tail = (cq->tail + 1) % cq->size;

	if (cq->intr_en & NVME_CQ_INTEN)
		do_intr = 1;

	pthread_mutex_unlock(&cq->mtx);

	if (ignore_busy || !atomic_load_acq_int(&sq->busy))
		if (do_intr)
			pci_generate_msix(sc->nsc_pi, cq->intr_vec);
}

static void
pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req)
{
	req->sc = NULL;
	req->nvme_sq = NULL;
	req->sqid = 0;

	pthread_mutex_lock(&sc->mtx);

	req->next = sc->ioreqs_free;
	sc->ioreqs_free = req;
	sc->pending_ios--;

	/* when no more IO pending, can set to ready if device reset/enabled */
	if (sc->pending_ios == 0 &&
	    NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts)))
		sc->regs.csts |= NVME_CSTS_RDY;

	pthread_mutex_unlock(&sc->mtx);

	sem_post(&sc->iosemlock);
}

static struct pci_nvme_ioreq *
pci_nvme_get_ioreq(struct pci_nvme_softc *sc)
{
	struct pci_nvme_ioreq *req = NULL;

	sem_wait(&sc->iosemlock);
	pthread_mutex_lock(&sc->mtx);

	req = sc->ioreqs_free;
	assert(req != NULL);

	sc->ioreqs_free = req->next;
	req->next = NULL;
	req->sc = sc;

	sc->pending_ios++;

	pthread_mutex_unlock(&sc->mtx);

	req->io_req.br_iovcnt = 0;
	req->io_req.br_offset = 0;
	req->io_req.br_resid = 0;
	req->io_req.br_param = req;
	req->prev_gpaddr = 0;
	req->prev_size = 0;

	return (req);
}

static void
pci_nvme_io_done(struct blockif_req *br, int err)
{
	struct pci_nvme_ioreq *req = br->br_param;
	struct nvme_submission_queue *sq = req->nvme_sq;
	uint16_t code, status;

	DPRINTF(("%s error %d %s\r\n", __func__, err, strerror(err)));

	/* TODO return correct error */
	code = err ?
NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS; pci_nvme_status_genc(&status, code); pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status, 0); pci_nvme_release_ioreq(req->sc, req); } static void pci_nvme_io_partial(struct blockif_req *br, int err) { struct pci_nvme_ioreq *req = br->br_param; DPRINTF(("%s error %d %s\r\n", __func__, err, strerror(err))); pthread_cond_signal(&req->cv); } static void pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx) { struct nvme_submission_queue *sq; uint16_t status; uint16_t sqhead; int err; /* handle all submissions up to sq->tail index */ sq = &sc->submit_queues[idx]; if (atomic_testandset_int(&sq->busy, 1)) { DPRINTF(("%s sqid %u busy\r\n", __func__, idx)); return; } sqhead = atomic_load_acq_short(&sq->head); DPRINTF(("nvme_handle_io qid %u head %u tail %u cmdlist %p\r\n", idx, sqhead, sq->tail, sq->qbase)); while (sqhead != atomic_load_acq_short(&sq->tail)) { struct nvme_command *cmd; struct pci_nvme_ioreq *req = NULL; uint64_t lba; uint64_t nblocks, bytes, size, cpsz; /* TODO: support scatter gather list handling */ cmd = &sq->qbase[sqhead]; sqhead = (sqhead + 1) % sq->size; lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10; - if (NVME_CMD_GET_OPC(cmd->opc_fuse) == NVME_OPC_FLUSH) { + if (cmd->opc == NVME_OPC_FLUSH) { pci_nvme_status_genc(&status, NVME_SC_SUCCESS); pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0, status, 1); continue; - } else if (NVME_CMD_GET_OPC(cmd->opc_fuse) == 0x08) { + } else if (cmd->opc == 0x08) { /* TODO: write zeroes */ WPRINTF(("%s write zeroes lba 0x%lx blocks %u\r\n", __func__, lba, cmd->cdw12 & 0xFFFF)); pci_nvme_status_genc(&status, NVME_SC_SUCCESS); pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0, status, 1); continue; } nblocks = (cmd->cdw12 & 0xFFFF) + 1; bytes = nblocks * sc->nvstore.sectsz; if (sc->nvstore.type == NVME_STOR_BLOCKIF) { req = pci_nvme_get_ioreq(sc); req->nvme_sq = sq; req->sqid = idx; } /* * If data starts mid-page and flows into the next page, then * increase page count */ DPRINTF(("[h%u:t%u:n%u] %s starting LBA 0x%lx blocks %lu " "(%lu-bytes)\r\n", sqhead==0 ? sq->size-1 : sqhead-1, sq->tail, sq->size, - NVME_CMD_GET_OPC(cmd->opc_fuse) == NVME_OPC_WRITE ? + cmd->opc == NVME_OPC_WRITE ? "WRITE" : "READ", lba, nblocks, bytes)); cmd->prp1 &= ~(0x03UL); cmd->prp2 &= ~(0x03UL); DPRINTF((" prp1 0x%lx prp2 0x%lx\r\n", cmd->prp1, cmd->prp2)); size = bytes; lba *= sc->nvstore.sectsz; cpsz = PAGE_SIZE - (cmd->prp1 % PAGE_SIZE); if (cpsz > bytes) cpsz = bytes; if (req != NULL) { req->io_req.br_offset = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10; - req->opc = NVME_CMD_GET_OPC(cmd->opc_fuse); + req->opc = cmd->opc; req->cid = cmd->cid; req->nsid = cmd->nsid; } err = pci_nvme_append_iov_req(sc, req, cmd->prp1, cpsz, - NVME_CMD_GET_OPC(cmd->opc_fuse) == NVME_OPC_WRITE, lba); + cmd->opc == NVME_OPC_WRITE, lba); lba += cpsz; size -= cpsz; if (size == 0) goto iodone; if (size <= PAGE_SIZE) { /* prp2 is second (and final) page in transfer */ err = pci_nvme_append_iov_req(sc, req, cmd->prp2, size, - NVME_CMD_GET_OPC(cmd->opc_fuse) == NVME_OPC_WRITE, + cmd->opc == NVME_OPC_WRITE, lba); } else { uint64_t *prp_list; int i; /* prp2 is pointer to a physical region page list */ prp_list = paddr_guest2host(sc->nsc_pi->pi_vmctx, cmd->prp2, PAGE_SIZE); i = 0; while (size != 0) { cpsz = MIN(size, PAGE_SIZE); /* * Move to linked physical region page list * in last item. 
			 */
				if (i == (NVME_PRP2_ITEMS-1) &&
				    size > PAGE_SIZE) {
					assert((prp_list[i] &
					    (PAGE_SIZE-1)) == 0);
					prp_list = paddr_guest2host(
					    sc->nsc_pi->pi_vmctx,
					    prp_list[i], PAGE_SIZE);
					i = 0;
				}
				if (prp_list[i] == 0) {
					WPRINTF(("PRP2[%d] = 0 !!!\r\n", i));
					err = 1;
					break;
				}

				err = pci_nvme_append_iov_req(sc, req,
				    prp_list[i], cpsz,
-				    NVME_CMD_GET_OPC(cmd->opc_fuse) ==
-				    NVME_OPC_WRITE, lba);
+				    cmd->opc == NVME_OPC_WRITE, lba);
				if (err)
					break;

				lba += cpsz;
				size -= cpsz;
				i++;
			}
		}

iodone:
		if (sc->nvstore.type == NVME_STOR_RAM) {
			uint16_t code, status;

			code = err ? NVME_SC_LBA_OUT_OF_RANGE :
			    NVME_SC_SUCCESS;
			pci_nvme_status_genc(&status, code);
			pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
			    status, 1);

			continue;
		}

		if (err)
			goto do_error;

		req->io_req.br_callback = pci_nvme_io_done;

		err = 0;
-		switch (NVME_CMD_GET_OPC(cmd->opc_fuse)) {
+		switch (cmd->opc) {
		case NVME_OPC_READ:
			err = blockif_read(sc->nvstore.ctx, &req->io_req);
			break;
		case NVME_OPC_WRITE:
			err = blockif_write(sc->nvstore.ctx, &req->io_req);
			break;
		default:
			WPRINTF(("%s unhandled io command 0x%x\r\n",
-			    __func__, NVME_CMD_GET_OPC(cmd->opc_fuse)));
+			    __func__, cmd->opc));
			err = 1;
		}

do_error:
		if (err) {
			uint16_t status;

			pci_nvme_status_genc(&status,
			    NVME_SC_DATA_TRANSFER_ERROR);

			pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
			    status, 1);
			pci_nvme_release_ioreq(sc, req);
		}
	}

	atomic_store_short(&sq->head, sqhead);
	atomic_store_int(&sq->busy, 0);
}

static void
pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci_nvme_softc* sc,
	uint64_t idx, int is_sq, uint64_t value)
{
	DPRINTF(("nvme doorbell %lu, %s, val 0x%lx\r\n",
	    idx, is_sq ? "SQ" : "CQ", value & 0xFFFF));

	if (is_sq) {
		atomic_store_short(&sc->submit_queues[idx].tail,
		    (uint16_t)value);

		if (idx == 0) {
			pci_nvme_handle_admin_cmd(sc, value);
		} else {
			/* submission queue; handle new entries in SQ */
			if (idx > sc->num_squeues) {
				WPRINTF(("%s SQ index %lu overflow from "
				    "guest (max %u)\r\n",
				    __func__, idx, sc->num_squeues));
				return;
			}
			pci_nvme_handle_io_cmd(sc, (uint16_t)idx);
		}
	} else {
		if (idx > sc->num_cqueues) {
			WPRINTF(("%s queue index %lu overflow from "
			    "guest (max %u)\r\n",
			    __func__, idx, sc->num_cqueues));
			return;
		}

		sc->compl_queues[idx].head = (uint16_t)value;
	}
}
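/*
 * Annotation (illustrative, not from the original source): doorbell
 * registers begin at NVME_DOORBELL_OFFSET (0x1000) and come in pairs of
 * 32-bit SQ-tail/CQ-head registers, which is what the "/ 8" and "% 8 < 4"
 * decoding in pci_nvme_write_bar_0() below relies on (this emulation
 * advertises CAP.DSTRD = 0, i.e. no extra doorbell stride):
 *
 *   BAR0 offset 0x1000 -> belloffset 0x0 -> idx 0, SQ   admin SQ tail
 *   BAR0 offset 0x1004 -> belloffset 0x4 -> idx 0, CQ   admin CQ head
 *   BAR0 offset 0x1008 -> belloffset 0x8 -> idx 1, SQ   I/O SQ 1 tail
 *   BAR0 offset 0x100c -> belloffset 0xc -> idx 1, CQ   I/O CQ 1 head
 */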
"WRITE" : "READ"; switch (offset) { case NVME_CR_CAP_LOW: DPRINTF(("%s %s NVME_CR_CAP_LOW\r\n", func, s)); break; case NVME_CR_CAP_HI: DPRINTF(("%s %s NVME_CR_CAP_HI\r\n", func, s)); break; case NVME_CR_VS: DPRINTF(("%s %s NVME_CR_VS\r\n", func, s)); break; case NVME_CR_INTMS: DPRINTF(("%s %s NVME_CR_INTMS\r\n", func, s)); break; case NVME_CR_INTMC: DPRINTF(("%s %s NVME_CR_INTMC\r\n", func, s)); break; case NVME_CR_CC: DPRINTF(("%s %s NVME_CR_CC\r\n", func, s)); break; case NVME_CR_CSTS: DPRINTF(("%s %s NVME_CR_CSTS\r\n", func, s)); break; case NVME_CR_NSSR: DPRINTF(("%s %s NVME_CR_NSSR\r\n", func, s)); break; case NVME_CR_AQA: DPRINTF(("%s %s NVME_CR_AQA\r\n", func, s)); break; case NVME_CR_ASQ_LOW: DPRINTF(("%s %s NVME_CR_ASQ_LOW\r\n", func, s)); break; case NVME_CR_ASQ_HI: DPRINTF(("%s %s NVME_CR_ASQ_HI\r\n", func, s)); break; case NVME_CR_ACQ_LOW: DPRINTF(("%s %s NVME_CR_ACQ_LOW\r\n", func, s)); break; case NVME_CR_ACQ_HI: DPRINTF(("%s %s NVME_CR_ACQ_HI\r\n", func, s)); break; default: DPRINTF(("unknown nvme bar-0 offset 0x%lx\r\n", offset)); } } static void pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc, uint64_t offset, int size, uint64_t value) { uint32_t ccreg; if (offset >= NVME_DOORBELL_OFFSET) { uint64_t belloffset = offset - NVME_DOORBELL_OFFSET; uint64_t idx = belloffset / 8; /* door bell size = 2*int */ int is_sq = (belloffset % 8) < 4; if (belloffset > ((sc->max_queues+1) * 8 - 4)) { WPRINTF(("guest attempted an overflow write offset " "0x%lx, val 0x%lx in %s", offset, value, __func__)); return; } pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value); return; } DPRINTF(("nvme-write offset 0x%lx, size %d, value 0x%lx\r\n", offset, size, value)); if (size != 4) { WPRINTF(("guest wrote invalid size %d (offset 0x%lx, " "val 0x%lx) to bar0 in %s", size, offset, value, __func__)); /* TODO: shutdown device */ return; } pci_nvme_bar0_reg_dumps(__func__, offset, 1); pthread_mutex_lock(&sc->mtx); switch (offset) { case NVME_CR_CAP_LOW: case NVME_CR_CAP_HI: /* readonly */ break; case NVME_CR_VS: /* readonly */ break; case NVME_CR_INTMS: /* MSI-X, so ignore */ break; case NVME_CR_INTMC: /* MSI-X, so ignore */ break; case NVME_CR_CC: ccreg = (uint32_t)value; DPRINTF(("%s NVME_CR_CC en %x css %x shn %x iosqes %u " "iocqes %u\r\n", __func__, NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg), NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg), NVME_CC_GET_IOCQES(ccreg))); if (NVME_CC_GET_SHN(ccreg)) { /* perform shutdown - flush out data to backend */ sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK << NVME_CSTS_REG_SHST_SHIFT); sc->regs.csts |= NVME_SHST_COMPLETE << NVME_CSTS_REG_SHST_SHIFT; } if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) { if (NVME_CC_GET_EN(ccreg) == 0) /* transition 1-> causes controller reset */ pci_nvme_reset_locked(sc); else pci_nvme_init_controller(ctx, sc); } /* Insert the iocqes, iosqes and en bits from the write */ sc->regs.cc &= ~NVME_CC_WRITE_MASK; sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK; if (NVME_CC_GET_EN(ccreg) == 0) { /* Insert the ams, mps and css bit fields */ sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK; sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK; sc->regs.csts &= ~NVME_CSTS_RDY; } else if (sc->pending_ios == 0) { sc->regs.csts |= NVME_CSTS_RDY; } break; case NVME_CR_CSTS: break; case NVME_CR_NSSR: /* ignore writes; don't support subsystem reset */ break; case NVME_CR_AQA: sc->regs.aqa = (uint32_t)value; break; case NVME_CR_ASQ_LOW: sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) | (0xFFFFF000 & value); break; case NVME_CR_ASQ_HI: 
sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) | (value << 32); break; case NVME_CR_ACQ_LOW: sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) | (0xFFFFF000 & value); break; case NVME_CR_ACQ_HI: sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) | (value << 32); break; default: DPRINTF(("%s unknown offset 0x%lx, value 0x%lx size %d\r\n", __func__, offset, value, size)); } pthread_mutex_unlock(&sc->mtx); } static void pci_nvme_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size, uint64_t value) { struct pci_nvme_softc* sc = pi->pi_arg; if (baridx == pci_msix_table_bar(pi) || baridx == pci_msix_pba_bar(pi)) { DPRINTF(("nvme-write baridx %d, msix: off 0x%lx, size %d, " " value 0x%lx\r\n", baridx, offset, size, value)); pci_emul_msix_twrite(pi, offset, size, value); return; } switch (baridx) { case 0: pci_nvme_write_bar_0(ctx, sc, offset, size, value); break; default: DPRINTF(("%s unknown baridx %d, val 0x%lx\r\n", __func__, baridx, value)); } } static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc, uint64_t offset, int size) { uint64_t value; pci_nvme_bar0_reg_dumps(__func__, offset, 0); if (offset < NVME_DOORBELL_OFFSET) { void *p = &(sc->regs); pthread_mutex_lock(&sc->mtx); memcpy(&value, (void *)((uintptr_t)p + offset), size); pthread_mutex_unlock(&sc->mtx); } else { value = 0; WPRINTF(("pci_nvme: read invalid offset %ld\r\n", offset)); } switch (size) { case 1: value &= 0xFF; break; case 2: value &= 0xFFFF; break; case 4: value &= 0xFFFFFFFF; break; } DPRINTF((" nvme-read offset 0x%lx, size %d -> value 0x%x\r\n", offset, size, (uint32_t)value)); return (value); } static uint64_t pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size) { struct pci_nvme_softc* sc = pi->pi_arg; if (baridx == pci_msix_table_bar(pi) || baridx == pci_msix_pba_bar(pi)) { DPRINTF(("nvme-read bar: %d, msix: regoff 0x%lx, size %d\r\n", baridx, offset, size)); return pci_emul_msix_tread(pi, offset, size); } switch (baridx) { case 0: return pci_nvme_read_bar_0(sc, offset, size); default: DPRINTF(("unknown bar %d, 0x%lx\r\n", baridx, offset)); } return (0); } static int pci_nvme_parse_opts(struct pci_nvme_softc *sc, char *opts) { char bident[sizeof("XX:X:X")]; char *uopt, *xopts, *config; uint32_t sectsz; int optidx; sc->max_queues = NVME_QUEUES; sc->max_qentries = NVME_MAX_QENTRIES; sc->ioslots = NVME_IOSLOTS; sc->num_squeues = sc->max_queues; sc->num_cqueues = sc->max_queues; sectsz = 0; uopt = strdup(opts); optidx = 0; snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn), "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); for (xopts = strtok(uopt, ","); xopts != NULL; xopts = strtok(NULL, ",")) { if ((config = strchr(xopts, '=')) != NULL) *config++ = '\0'; if (!strcmp("maxq", xopts)) { sc->max_queues = atoi(config); } else if (!strcmp("qsz", xopts)) { sc->max_qentries = atoi(config); } else if (!strcmp("ioslots", xopts)) { sc->ioslots = atoi(config); } else if (!strcmp("sectsz", xopts)) { sectsz = atoi(config); } else if (!strcmp("ser", xopts)) { /* * This field indicates the Product Serial Number in * 7-bit ASCII, unused bytes should be space characters. * Ref: NVMe v1.3c. 
*/ cpywithpad((char *)sc->ctrldata.sn, sizeof(sc->ctrldata.sn), config, ' '); } else if (!strcmp("ram", xopts)) { uint64_t sz = strtoull(&xopts[4], NULL, 10); sc->nvstore.type = NVME_STOR_RAM; sc->nvstore.size = sz * 1024 * 1024; sc->nvstore.ctx = calloc(1, sc->nvstore.size); sc->nvstore.sectsz = 4096; sc->nvstore.sectsz_bits = 12; if (sc->nvstore.ctx == NULL) { perror("Unable to allocate RAM"); free(uopt); return (-1); } } else if (optidx == 0) { snprintf(bident, sizeof(bident), "%d:%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); sc->nvstore.ctx = blockif_open(xopts, bident); if (sc->nvstore.ctx == NULL) { perror("Could not open backing file"); free(uopt); return (-1); } sc->nvstore.type = NVME_STOR_BLOCKIF; sc->nvstore.size = blockif_size(sc->nvstore.ctx); } else { fprintf(stderr, "Invalid option %s\n", xopts); free(uopt); return (-1); } optidx++; } free(uopt); if (sc->nvstore.ctx == NULL || sc->nvstore.size == 0) { fprintf(stderr, "backing store not specified\n"); return (-1); } if (sectsz == 512 || sectsz == 4096 || sectsz == 8192) sc->nvstore.sectsz = sectsz; else if (sc->nvstore.type != NVME_STOR_RAM) sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx); for (sc->nvstore.sectsz_bits = 9; (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz; sc->nvstore.sectsz_bits++); if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES) sc->max_queues = NVME_QUEUES; if (sc->max_qentries <= 0) { fprintf(stderr, "Invalid qsz option\n"); return (-1); } if (sc->ioslots <= 0) { fprintf(stderr, "Invalid ioslots option\n"); return (-1); } return (0); } static int pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) { struct pci_nvme_softc *sc; uint32_t pci_membar_sz; int error; error = 0; sc = calloc(1, sizeof(struct pci_nvme_softc)); pi->pi_arg = sc; sc->nsc_pi = pi; error = pci_nvme_parse_opts(sc, opts); if (error < 0) goto done; else error = 0; sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq)); for (int i = 0; i < sc->ioslots; i++) { if (i < (sc->ioslots-1)) sc->ioreqs[i].next = &sc->ioreqs[i+1]; pthread_mutex_init(&sc->ioreqs[i].mtx, NULL); pthread_cond_init(&sc->ioreqs[i].cv, NULL); } sc->ioreqs_free = sc->ioreqs; sc->intr_coales_aggr_thresh = 1; pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A); pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D); pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE); pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM); pci_set_cfgdata8(pi, PCIR_PROGIF, PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0); /* allocate size of nvme registers + doorbell space for all queues */ pci_membar_sz = sizeof(struct nvme_registers) + 2*sizeof(uint32_t)*(sc->max_queues); DPRINTF(("nvme membar size: %u\r\n", pci_membar_sz)); error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz); if (error) { WPRINTF(("%s pci alloc mem bar failed\r\n", __func__)); goto done; } error = pci_emul_add_msixcap(pi, sc->max_queues, NVME_MSIX_BAR); if (error) { WPRINTF(("%s pci add msixcap failed\r\n", __func__)); goto done; } pthread_mutex_init(&sc->mtx, NULL); sem_init(&sc->iosemlock, 0, sc->ioslots); pci_nvme_reset(sc); pci_nvme_init_ctrldata(sc); pci_nvme_init_nsdata(sc); pci_lintr_request(pi); done: return (error); } struct pci_devemu pci_de_nvme = { .pe_emu = "nvme", .pe_init = pci_nvme_init, .pe_barwrite = pci_nvme_write, .pe_barread = pci_nvme_read }; PCI_EMUL_SET(pci_de_nvme);
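/*
 * Usage sketch (illustrative, not from the original source; the device
 * paths and sizes below are made up): pci_nvme_parse_opts() above consumes
 * the bhyve "-s slot,nvme,opts" configuration string, where the first
 * token that is not a key=value pair is taken as the blockif backing
 * store path, e.g.:
 *
 *   bhyve ... -s 4,nvme,/dev/zvol/tank/nvmedisk,maxq=4,qsz=1024 ...
 *   bhyve ... -s 4,nvme,ram=64,ser=NVME001 ...   (64 MiB RAM-backed disk)
 *
 * Recognized keys are maxq, qsz, ioslots, sectsz, ser and ram.
 */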