diff --git a/sys/dev/nvme/nvme.h b/sys/dev/nvme/nvme.h index 94b20069c12e..ccc980a7c6ec 100644 --- a/sys/dev/nvme/nvme.h +++ b/sys/dev/nvme/nvme.h @@ -1,2091 +1,2107 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (C) 2012-2013 Intel Corporation * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef __NVME_H__ #define __NVME_H__ #ifdef _KERNEL #include #endif #include #include #define NVME_PASSTHROUGH_CMD _IOWR('n', 0, struct nvme_pt_command) #define NVME_RESET_CONTROLLER _IO('n', 1) #define NVME_GET_NSID _IOR('n', 2, struct nvme_get_nsid) #define NVME_GET_MAX_XFER_SIZE _IOR('n', 3, uint64_t) #define NVME_IO_TEST _IOWR('n', 100, struct nvme_io_test) #define NVME_BIO_TEST _IOWR('n', 101, struct nvme_io_test) /* * Macros to deal with NVME revisions, as defined VS register */ #define NVME_REV(x, y) (((x) << 16) | ((y) << 8)) #define NVME_MAJOR(r) (((r) >> 16) & 0xffff) #define NVME_MINOR(r) (((r) >> 8) & 0xff) /* * Use to mark a command to apply to all namespaces, or to retrieve global * log pages. */ #define NVME_GLOBAL_NAMESPACE_TAG ((uint32_t)0xFFFFFFFF) /* Host memory buffer sizes are always in 4096 byte chunks */ #define NVME_HMB_UNITS 4096 /* Many items are expressed in terms of power of two times MPS */ #define NVME_MPS_SHIFT 12 /* Register field definitions */ #define NVME_CAP_LO_REG_MQES_SHIFT (0) #define NVME_CAP_LO_REG_MQES_MASK (0xFFFF) #define NVME_CAP_LO_REG_CQR_SHIFT (16) #define NVME_CAP_LO_REG_CQR_MASK (0x1) #define NVME_CAP_LO_REG_AMS_SHIFT (17) #define NVME_CAP_LO_REG_AMS_MASK (0x3) #define NVME_CAP_LO_REG_TO_SHIFT (24) #define NVME_CAP_LO_REG_TO_MASK (0xFF) #define NVME_CAP_LO_MQES(x) \ (((x) >> NVME_CAP_LO_REG_MQES_SHIFT) & NVME_CAP_LO_REG_MQES_MASK) #define NVME_CAP_LO_CQR(x) \ (((x) >> NVME_CAP_LO_REG_CQR_SHIFT) & NVME_CAP_LO_REG_CQR_MASK) #define NVME_CAP_LO_AMS(x) \ (((x) >> NVME_CAP_LO_REG_AMS_SHIFT) & NVME_CAP_LO_REG_AMS_MASK) #define NVME_CAP_LO_TO(x) \ (((x) >> NVME_CAP_LO_REG_TO_SHIFT) & NVME_CAP_LO_REG_TO_MASK) #define NVME_CAP_HI_REG_DSTRD_SHIFT (0) #define NVME_CAP_HI_REG_DSTRD_MASK (0xF) #define NVME_CAP_HI_REG_NSSRS_SHIFT (4) #define NVME_CAP_HI_REG_NSSRS_MASK (0x1) #define NVME_CAP_HI_REG_CSS_SHIFT (5) #define NVME_CAP_HI_REG_CSS_MASK (0xff) #define NVME_CAP_HI_REG_CSS_NVM_SHIFT (5) #define NVME_CAP_HI_REG_CSS_NVM_MASK (0x1) #define NVME_CAP_HI_REG_BPS_SHIFT (13) #define NVME_CAP_HI_REG_BPS_MASK (0x1) #define NVME_CAP_HI_REG_MPSMIN_SHIFT (16) #define NVME_CAP_HI_REG_MPSMIN_MASK (0xF) #define NVME_CAP_HI_REG_MPSMAX_SHIFT (20) #define NVME_CAP_HI_REG_MPSMAX_MASK (0xF) #define NVME_CAP_HI_REG_PMRS_SHIFT (24) #define NVME_CAP_HI_REG_PMRS_MASK (0x1) #define NVME_CAP_HI_REG_CMBS_SHIFT (25) #define NVME_CAP_HI_REG_CMBS_MASK (0x1) #define NVME_CAP_HI_DSTRD(x) \ (((x) >> NVME_CAP_HI_REG_DSTRD_SHIFT) & NVME_CAP_HI_REG_DSTRD_MASK) #define NVME_CAP_HI_NSSRS(x) \ (((x) >> NVME_CAP_HI_REG_NSSRS_SHIFT) & NVME_CAP_HI_REG_NSSRS_MASK) #define NVME_CAP_HI_CSS(x) \ (((x) >> NVME_CAP_HI_REG_CSS_SHIFT) & NVME_CAP_HI_REG_CSS_MASK) #define NVME_CAP_HI_CSS_NVM(x) \ (((x) >> NVME_CAP_HI_REG_CSS_NVM_SHIFT) & NVME_CAP_HI_REG_CSS_NVM_MASK) #define NVME_CAP_HI_BPS(x) \ (((x) >> NVME_CAP_HI_REG_BPS_SHIFT) & NVME_CAP_HI_REG_BPS_MASK) #define NVME_CAP_HI_MPSMIN(x) \ (((x) >> NVME_CAP_HI_REG_MPSMIN_SHIFT) & NVME_CAP_HI_REG_MPSMIN_MASK) #define NVME_CAP_HI_MPSMAX(x) \ (((x) >> NVME_CAP_HI_REG_MPSMAX_SHIFT) & NVME_CAP_HI_REG_MPSMAX_MASK) #define NVME_CAP_HI_PMRS(x) \ (((x) >> NVME_CAP_HI_REG_PMRS_SHIFT) & NVME_CAP_HI_REG_PMRS_MASK) #define NVME_CAP_HI_CMBS(x) \ (((x) >> NVME_CAP_HI_REG_CMBS_SHIFT) & NVME_CAP_HI_REG_CMBS_MASK) #define NVME_CC_REG_EN_SHIFT (0) #define NVME_CC_REG_EN_MASK (0x1) #define NVME_CC_REG_CSS_SHIFT (4) #define NVME_CC_REG_CSS_MASK (0x7) #define NVME_CC_REG_MPS_SHIFT (7) #define NVME_CC_REG_MPS_MASK (0xF) #define NVME_CC_REG_AMS_SHIFT (11) #define NVME_CC_REG_AMS_MASK (0x7) #define NVME_CC_REG_SHN_SHIFT (14) #define NVME_CC_REG_SHN_MASK (0x3) #define NVME_CC_REG_IOSQES_SHIFT (16) #define NVME_CC_REG_IOSQES_MASK (0xF) #define NVME_CC_REG_IOCQES_SHIFT (20) #define NVME_CC_REG_IOCQES_MASK (0xF) #define NVME_CSTS_REG_RDY_SHIFT (0) #define NVME_CSTS_REG_RDY_MASK (0x1) #define NVME_CSTS_REG_CFS_SHIFT (1) #define NVME_CSTS_REG_CFS_MASK (0x1) #define NVME_CSTS_REG_SHST_SHIFT (2) #define NVME_CSTS_REG_SHST_MASK (0x3) #define NVME_CSTS_REG_NVSRO_SHIFT (4) #define NVME_CSTS_REG_NVSRO_MASK (0x1) #define NVME_CSTS_REG_PP_SHIFT (5) #define NVME_CSTS_REG_PP_MASK (0x1) #define NVME_CSTS_GET_SHST(csts) (((csts) >> NVME_CSTS_REG_SHST_SHIFT) & NVME_CSTS_REG_SHST_MASK) #define NVME_AQA_REG_ASQS_SHIFT (0) #define NVME_AQA_REG_ASQS_MASK (0xFFF) #define NVME_AQA_REG_ACQS_SHIFT (16) #define NVME_AQA_REG_ACQS_MASK (0xFFF) #define NVME_PMRCAP_REG_RDS_SHIFT (3) #define NVME_PMRCAP_REG_RDS_MASK (0x1) #define NVME_PMRCAP_REG_WDS_SHIFT (4) #define NVME_PMRCAP_REG_WDS_MASK (0x1) #define NVME_PMRCAP_REG_BIR_SHIFT (5) #define NVME_PMRCAP_REG_BIR_MASK (0x7) #define NVME_PMRCAP_REG_PMRTU_SHIFT (8) #define NVME_PMRCAP_REG_PMRTU_MASK (0x3) #define NVME_PMRCAP_REG_PMRWBM_SHIFT (10) #define NVME_PMRCAP_REG_PMRWBM_MASK (0xf) #define NVME_PMRCAP_REG_PMRTO_SHIFT (16) #define NVME_PMRCAP_REG_PMRTO_MASK (0xff) #define NVME_PMRCAP_REG_CMSS_SHIFT (24) #define NVME_PMRCAP_REG_CMSS_MASK (0x1) #define NVME_PMRCAP_RDS(x) \ (((x) >> NVME_PMRCAP_REG_RDS_SHIFT) & NVME_PMRCAP_REG_RDS_MASK) #define NVME_PMRCAP_WDS(x) \ (((x) >> NVME_PMRCAP_REG_WDS_SHIFT) & NVME_PMRCAP_REG_WDS_MASK) #define NVME_PMRCAP_BIR(x) \ (((x) >> NVME_PMRCAP_REG_BIR_SHIFT) & NVME_PMRCAP_REG_BIR_MASK) #define NVME_PMRCAP_PMRTU(x) \ (((x) >> NVME_PMRCAP_REG_PMRTU_SHIFT) & NVME_PMRCAP_REG_PMRTU_MASK) #define NVME_PMRCAP_PMRWBM(x) \ (((x) >> NVME_PMRCAP_REG_PMRWBM_SHIFT) & NVME_PMRCAP_REG_PMRWBM_MASK) #define NVME_PMRCAP_PMRTO(x) \ (((x) >> NVME_PMRCAP_REG_PMRTO_SHIFT) & NVME_PMRCAP_REG_PMRTO_MASK) #define NVME_PMRCAP_CMSS(x) \ (((x) >> NVME_PMRCAP_REG_CMSS_SHIFT) & NVME_PMRCAP_REG_CMSS_MASK) /* Command field definitions */ #define NVME_CMD_FUSE_SHIFT (8) #define NVME_CMD_FUSE_MASK (0x3) #define NVME_STATUS_P_SHIFT (0) #define NVME_STATUS_P_MASK (0x1) #define NVME_STATUS_SC_SHIFT (1) #define NVME_STATUS_SC_MASK (0xFF) #define NVME_STATUS_SCT_SHIFT (9) #define NVME_STATUS_SCT_MASK (0x7) #define NVME_STATUS_CRD_SHIFT (12) #define NVME_STATUS_CRD_MASK (0x3) #define NVME_STATUS_M_SHIFT (14) #define NVME_STATUS_M_MASK (0x1) #define NVME_STATUS_DNR_SHIFT (15) #define NVME_STATUS_DNR_MASK (0x1) #define NVME_STATUS_GET_P(st) (((st) >> NVME_STATUS_P_SHIFT) & NVME_STATUS_P_MASK) #define NVME_STATUS_GET_SC(st) (((st) >> NVME_STATUS_SC_SHIFT) & NVME_STATUS_SC_MASK) #define NVME_STATUS_GET_SCT(st) (((st) >> NVME_STATUS_SCT_SHIFT) & NVME_STATUS_SCT_MASK) #define NVME_STATUS_GET_CRD(st) (((st) >> NVME_STATUS_CRD_SHIFT) & NVME_STATUS_CRD_MASK) #define NVME_STATUS_GET_M(st) (((st) >> NVME_STATUS_M_SHIFT) & NVME_STATUS_M_MASK) #define NVME_STATUS_GET_DNR(st) (((st) >> NVME_STATUS_DNR_SHIFT) & NVME_STATUS_DNR_MASK) #define NVME_PWR_ST_MPS_SHIFT (0) #define NVME_PWR_ST_MPS_MASK (0x1) #define NVME_PWR_ST_NOPS_SHIFT (1) #define NVME_PWR_ST_NOPS_MASK (0x1) #define NVME_PWR_ST_RRT_SHIFT (0) #define NVME_PWR_ST_RRT_MASK (0x1F) #define NVME_PWR_ST_RRL_SHIFT (0) #define NVME_PWR_ST_RRL_MASK (0x1F) #define NVME_PWR_ST_RWT_SHIFT (0) #define NVME_PWR_ST_RWT_MASK (0x1F) #define NVME_PWR_ST_RWL_SHIFT (0) #define NVME_PWR_ST_RWL_MASK (0x1F) #define NVME_PWR_ST_IPS_SHIFT (6) #define NVME_PWR_ST_IPS_MASK (0x3) #define NVME_PWR_ST_APW_SHIFT (0) #define NVME_PWR_ST_APW_MASK (0x7) #define NVME_PWR_ST_APS_SHIFT (6) #define NVME_PWR_ST_APS_MASK (0x3) /** Controller Multi-path I/O and Namespace Sharing Capabilities */ /* More then one port */ #define NVME_CTRLR_DATA_MIC_MPORTS_SHIFT (0) #define NVME_CTRLR_DATA_MIC_MPORTS_MASK (0x1) /* More then one controller */ #define NVME_CTRLR_DATA_MIC_MCTRLRS_SHIFT (1) #define NVME_CTRLR_DATA_MIC_MCTRLRS_MASK (0x1) /* SR-IOV Virtual Function */ #define NVME_CTRLR_DATA_MIC_SRIOVVF_SHIFT (2) #define NVME_CTRLR_DATA_MIC_SRIOVVF_MASK (0x1) /* Asymmetric Namespace Access Reporting */ #define NVME_CTRLR_DATA_MIC_ANAR_SHIFT (3) #define NVME_CTRLR_DATA_MIC_ANAR_MASK (0x1) /** OAES - Optional Asynchronous Events Supported */ /* supports Namespace Attribute Notices event */ #define NVME_CTRLR_DATA_OAES_NS_ATTR_SHIFT (8) #define NVME_CTRLR_DATA_OAES_NS_ATTR_MASK (0x1) /* supports Firmware Activation Notices event */ #define NVME_CTRLR_DATA_OAES_FW_ACTIVATE_SHIFT (9) #define NVME_CTRLR_DATA_OAES_FW_ACTIVATE_MASK (0x1) /* supports Asymmetric Namespace Access Change Notices event */ #define NVME_CTRLR_DATA_OAES_ASYM_NS_CHANGE_SHIFT (11) #define NVME_CTRLR_DATA_OAES_ASYM_NS_CHANGE_MASK (0x1) /* supports Predictable Latency Event Aggregate Log Change Notices event */ #define NVME_CTRLR_DATA_OAES_PREDICT_LATENCY_SHIFT (12) #define NVME_CTRLR_DATA_OAES_PREDICT_LATENCY_MASK (0x1) /* supports LBA Status Information Notices event */ #define NVME_CTRLR_DATA_OAES_LBA_STATUS_SHIFT (13) #define NVME_CTRLR_DATA_OAES_LBA_STATUS_MASK (0x1) /* supports Endurance Group Event Aggregate Log Page Changes Notices event */ #define NVME_CTRLR_DATA_OAES_ENDURANCE_GROUP_SHIFT (14) #define NVME_CTRLR_DATA_OAES_ENDURANCE_GROUP_MASK (0x1) /* supports Normal NVM Subsystem Shutdown event */ #define NVME_CTRLR_DATA_OAES_NORMAL_SHUTDOWN_SHIFT (15) #define NVME_CTRLR_DATA_OAES_NORMAL_SHUTDOWN_MASK (0x1) /* supports Zone Descriptor Changed Notices event */ #define NVME_CTRLR_DATA_OAES_ZONE_DESC_CHANGE_SHIFT (27) #define NVME_CTRLR_DATA_OAES_ZONE_DESC_CHANGE_MASK (0x1) /* supports Discovery Log Page Change Notification event */ #define NVME_CTRLR_DATA_OAES_LOG_PAGE_CHANGE_SHIFT (31) #define NVME_CTRLR_DATA_OAES_LOG_PAGE_CHANGE_MASK (0x1) /** OACS - optional admin command support */ /* supports security send/receive commands */ #define NVME_CTRLR_DATA_OACS_SECURITY_SHIFT (0) #define NVME_CTRLR_DATA_OACS_SECURITY_MASK (0x1) /* supports format nvm command */ #define NVME_CTRLR_DATA_OACS_FORMAT_SHIFT (1) #define NVME_CTRLR_DATA_OACS_FORMAT_MASK (0x1) /* supports firmware activate/download commands */ #define NVME_CTRLR_DATA_OACS_FIRMWARE_SHIFT (2) #define NVME_CTRLR_DATA_OACS_FIRMWARE_MASK (0x1) /* supports namespace management commands */ #define NVME_CTRLR_DATA_OACS_NSMGMT_SHIFT (3) #define NVME_CTRLR_DATA_OACS_NSMGMT_MASK (0x1) /* supports Device Self-test command */ #define NVME_CTRLR_DATA_OACS_SELFTEST_SHIFT (4) #define NVME_CTRLR_DATA_OACS_SELFTEST_MASK (0x1) /* supports Directives */ #define NVME_CTRLR_DATA_OACS_DIRECTIVES_SHIFT (5) #define NVME_CTRLR_DATA_OACS_DIRECTIVES_MASK (0x1) /* supports NVMe-MI Send/Receive */ #define NVME_CTRLR_DATA_OACS_NVMEMI_SHIFT (6) #define NVME_CTRLR_DATA_OACS_NVMEMI_MASK (0x1) /* supports Virtualization Management */ #define NVME_CTRLR_DATA_OACS_VM_SHIFT (7) #define NVME_CTRLR_DATA_OACS_VM_MASK (0x1) /* supports Doorbell Buffer Config */ #define NVME_CTRLR_DATA_OACS_DBBUFFER_SHIFT (8) #define NVME_CTRLR_DATA_OACS_DBBUFFER_MASK (0x1) /* supports Get LBA Status */ #define NVME_CTRLR_DATA_OACS_GETLBA_SHIFT (9) #define NVME_CTRLR_DATA_OACS_GETLBA_MASK (0x1) /** firmware updates */ /* first slot is read-only */ #define NVME_CTRLR_DATA_FRMW_SLOT1_RO_SHIFT (0) #define NVME_CTRLR_DATA_FRMW_SLOT1_RO_MASK (0x1) /* number of firmware slots */ #define NVME_CTRLR_DATA_FRMW_NUM_SLOTS_SHIFT (1) #define NVME_CTRLR_DATA_FRMW_NUM_SLOTS_MASK (0x7) /* firmware activation without reset */ #define NVME_CTRLR_DATA_FRMW_ACT_WO_RESET_SHIFT (4) #define NVME_CTRLR_DATA_FRMW_ACT_WO_RESET_MASK (0x1) /** log page attributes */ /* per namespace smart/health log page */ #define NVME_CTRLR_DATA_LPA_NS_SMART_SHIFT (0) #define NVME_CTRLR_DATA_LPA_NS_SMART_MASK (0x1) /** AVSCC - admin vendor specific command configuration */ /* admin vendor specific commands use spec format */ #define NVME_CTRLR_DATA_AVSCC_SPEC_FORMAT_SHIFT (0) #define NVME_CTRLR_DATA_AVSCC_SPEC_FORMAT_MASK (0x1) /** Autonomous Power State Transition Attributes */ /* Autonomous Power State Transitions supported */ #define NVME_CTRLR_DATA_APSTA_APST_SUPP_SHIFT (0) #define NVME_CTRLR_DATA_APSTA_APST_SUPP_MASK (0x1) /** Sanitize Capabilities */ /* Crypto Erase Support */ #define NVME_CTRLR_DATA_SANICAP_CES_SHIFT (0) #define NVME_CTRLR_DATA_SANICAP_CES_MASK (0x1) /* Block Erase Support */ #define NVME_CTRLR_DATA_SANICAP_BES_SHIFT (1) #define NVME_CTRLR_DATA_SANICAP_BES_MASK (0x1) /* Overwrite Support */ #define NVME_CTRLR_DATA_SANICAP_OWS_SHIFT (2) #define NVME_CTRLR_DATA_SANICAP_OWS_MASK (0x1) /* No-Deallocate Inhibited */ #define NVME_CTRLR_DATA_SANICAP_NDI_SHIFT (29) #define NVME_CTRLR_DATA_SANICAP_NDI_MASK (0x1) /* No-Deallocate Modifies Media After Sanitize */ #define NVME_CTRLR_DATA_SANICAP_NODMMAS_SHIFT (30) #define NVME_CTRLR_DATA_SANICAP_NODMMAS_MASK (0x3) #define NVME_CTRLR_DATA_SANICAP_NODMMAS_UNDEF (0) #define NVME_CTRLR_DATA_SANICAP_NODMMAS_NO (1) #define NVME_CTRLR_DATA_SANICAP_NODMMAS_YES (2) /** submission queue entry size */ #define NVME_CTRLR_DATA_SQES_MIN_SHIFT (0) #define NVME_CTRLR_DATA_SQES_MIN_MASK (0xF) #define NVME_CTRLR_DATA_SQES_MAX_SHIFT (4) #define NVME_CTRLR_DATA_SQES_MAX_MASK (0xF) /** completion queue entry size */ #define NVME_CTRLR_DATA_CQES_MIN_SHIFT (0) #define NVME_CTRLR_DATA_CQES_MIN_MASK (0xF) #define NVME_CTRLR_DATA_CQES_MAX_SHIFT (4) #define NVME_CTRLR_DATA_CQES_MAX_MASK (0xF) /** optional nvm command support */ #define NVME_CTRLR_DATA_ONCS_COMPARE_SHIFT (0) #define NVME_CTRLR_DATA_ONCS_COMPARE_MASK (0x1) #define NVME_CTRLR_DATA_ONCS_WRITE_UNC_SHIFT (1) #define NVME_CTRLR_DATA_ONCS_WRITE_UNC_MASK (0x1) #define NVME_CTRLR_DATA_ONCS_DSM_SHIFT (2) #define NVME_CTRLR_DATA_ONCS_DSM_MASK (0x1) #define NVME_CTRLR_DATA_ONCS_WRZERO_SHIFT (3) #define NVME_CTRLR_DATA_ONCS_WRZERO_MASK (0x1) #define NVME_CTRLR_DATA_ONCS_SAVEFEAT_SHIFT (4) #define NVME_CTRLR_DATA_ONCS_SAVEFEAT_MASK (0x1) #define NVME_CTRLR_DATA_ONCS_RESERV_SHIFT (5) #define NVME_CTRLR_DATA_ONCS_RESERV_MASK (0x1) #define NVME_CTRLR_DATA_ONCS_TIMESTAMP_SHIFT (6) #define NVME_CTRLR_DATA_ONCS_TIMESTAMP_MASK (0x1) #define NVME_CTRLR_DATA_ONCS_VERIFY_SHIFT (7) #define NVME_CTRLR_DATA_ONCS_VERIFY_MASK (0x1) /** Fused Operation Support */ #define NVME_CTRLR_DATA_FUSES_CNW_SHIFT (0) #define NVME_CTRLR_DATA_FUSES_CNW_MASK (0x1) /** Format NVM Attributes */ #define NVME_CTRLR_DATA_FNA_FORMAT_ALL_SHIFT (0) #define NVME_CTRLR_DATA_FNA_FORMAT_ALL_MASK (0x1) #define NVME_CTRLR_DATA_FNA_ERASE_ALL_SHIFT (1) #define NVME_CTRLR_DATA_FNA_ERASE_ALL_MASK (0x1) #define NVME_CTRLR_DATA_FNA_CRYPTO_ERASE_SHIFT (2) #define NVME_CTRLR_DATA_FNA_CRYPTO_ERASE_MASK (0x1) /** volatile write cache */ /* volatile write cache present */ #define NVME_CTRLR_DATA_VWC_PRESENT_SHIFT (0) #define NVME_CTRLR_DATA_VWC_PRESENT_MASK (0x1) /* flush all namespaces supported */ #define NVME_CTRLR_DATA_VWC_ALL_SHIFT (1) #define NVME_CTRLR_DATA_VWC_ALL_MASK (0x3) #define NVME_CTRLR_DATA_VWC_ALL_UNKNOWN (0) #define NVME_CTRLR_DATA_VWC_ALL_NO (2) #define NVME_CTRLR_DATA_VWC_ALL_YES (3) /** namespace features */ /* thin provisioning */ #define NVME_NS_DATA_NSFEAT_THIN_PROV_SHIFT (0) #define NVME_NS_DATA_NSFEAT_THIN_PROV_MASK (0x1) /* NAWUN, NAWUPF, and NACWU fields are valid */ #define NVME_NS_DATA_NSFEAT_NA_FIELDS_SHIFT (1) #define NVME_NS_DATA_NSFEAT_NA_FIELDS_MASK (0x1) /* Deallocated or Unwritten Logical Block errors supported */ #define NVME_NS_DATA_NSFEAT_DEALLOC_SHIFT (2) #define NVME_NS_DATA_NSFEAT_DEALLOC_MASK (0x1) /* NGUID and EUI64 fields are not reusable */ #define NVME_NS_DATA_NSFEAT_NO_ID_REUSE_SHIFT (3) #define NVME_NS_DATA_NSFEAT_NO_ID_REUSE_MASK (0x1) /* NPWG, NPWA, NPDG, NPDA, and NOWS are valid */ #define NVME_NS_DATA_NSFEAT_NPVALID_SHIFT (4) #define NVME_NS_DATA_NSFEAT_NPVALID_MASK (0x1) /** formatted lba size */ #define NVME_NS_DATA_FLBAS_FORMAT_SHIFT (0) #define NVME_NS_DATA_FLBAS_FORMAT_MASK (0xF) #define NVME_NS_DATA_FLBAS_EXTENDED_SHIFT (4) #define NVME_NS_DATA_FLBAS_EXTENDED_MASK (0x1) /** metadata capabilities */ /* metadata can be transferred as part of data prp list */ #define NVME_NS_DATA_MC_EXTENDED_SHIFT (0) #define NVME_NS_DATA_MC_EXTENDED_MASK (0x1) /* metadata can be transferred with separate metadata pointer */ #define NVME_NS_DATA_MC_POINTER_SHIFT (1) #define NVME_NS_DATA_MC_POINTER_MASK (0x1) /** end-to-end data protection capabilities */ /* protection information type 1 */ #define NVME_NS_DATA_DPC_PIT1_SHIFT (0) #define NVME_NS_DATA_DPC_PIT1_MASK (0x1) /* protection information type 2 */ #define NVME_NS_DATA_DPC_PIT2_SHIFT (1) #define NVME_NS_DATA_DPC_PIT2_MASK (0x1) /* protection information type 3 */ #define NVME_NS_DATA_DPC_PIT3_SHIFT (2) #define NVME_NS_DATA_DPC_PIT3_MASK (0x1) /* first eight bytes of metadata */ #define NVME_NS_DATA_DPC_MD_START_SHIFT (3) #define NVME_NS_DATA_DPC_MD_START_MASK (0x1) /* last eight bytes of metadata */ #define NVME_NS_DATA_DPC_MD_END_SHIFT (4) #define NVME_NS_DATA_DPC_MD_END_MASK (0x1) /** end-to-end data protection type settings */ /* protection information type */ #define NVME_NS_DATA_DPS_PIT_SHIFT (0) #define NVME_NS_DATA_DPS_PIT_MASK (0x7) /* 1 == protection info transferred at start of metadata */ /* 0 == protection info transferred at end of metadata */ #define NVME_NS_DATA_DPS_MD_START_SHIFT (3) #define NVME_NS_DATA_DPS_MD_START_MASK (0x1) /** Namespace Multi-path I/O and Namespace Sharing Capabilities */ /* the namespace may be attached to two or more controllers */ #define NVME_NS_DATA_NMIC_MAY_BE_SHARED_SHIFT (0) #define NVME_NS_DATA_NMIC_MAY_BE_SHARED_MASK (0x1) /** Reservation Capabilities */ /* Persist Through Power Loss */ #define NVME_NS_DATA_RESCAP_PTPL_SHIFT (0) #define NVME_NS_DATA_RESCAP_PTPL_MASK (0x1) /* supports the Write Exclusive */ #define NVME_NS_DATA_RESCAP_WR_EX_SHIFT (1) #define NVME_NS_DATA_RESCAP_WR_EX_MASK (0x1) /* supports the Exclusive Access */ #define NVME_NS_DATA_RESCAP_EX_AC_SHIFT (2) #define NVME_NS_DATA_RESCAP_EX_AC_MASK (0x1) /* supports the Write Exclusive – Registrants Only */ #define NVME_NS_DATA_RESCAP_WR_EX_RO_SHIFT (3) #define NVME_NS_DATA_RESCAP_WR_EX_RO_MASK (0x1) /* supports the Exclusive Access - Registrants Only */ #define NVME_NS_DATA_RESCAP_EX_AC_RO_SHIFT (4) #define NVME_NS_DATA_RESCAP_EX_AC_RO_MASK (0x1) /* supports the Write Exclusive – All Registrants */ #define NVME_NS_DATA_RESCAP_WR_EX_AR_SHIFT (5) #define NVME_NS_DATA_RESCAP_WR_EX_AR_MASK (0x1) /* supports the Exclusive Access - All Registrants */ #define NVME_NS_DATA_RESCAP_EX_AC_AR_SHIFT (6) #define NVME_NS_DATA_RESCAP_EX_AC_AR_MASK (0x1) /* Ignore Existing Key is used as defined in revision 1.3 or later */ #define NVME_NS_DATA_RESCAP_IEKEY13_SHIFT (7) #define NVME_NS_DATA_RESCAP_IEKEY13_MASK (0x1) /** Format Progress Indicator */ /* percentage of the Format NVM command that remains to be completed */ #define NVME_NS_DATA_FPI_PERC_SHIFT (0) #define NVME_NS_DATA_FPI_PERC_MASK (0x7f) /* namespace supports the Format Progress Indicator */ #define NVME_NS_DATA_FPI_SUPP_SHIFT (7) #define NVME_NS_DATA_FPI_SUPP_MASK (0x1) /** Deallocate Logical Block Features */ /* deallocated logical block read behavior */ #define NVME_NS_DATA_DLFEAT_READ_SHIFT (0) #define NVME_NS_DATA_DLFEAT_READ_MASK (0x07) #define NVME_NS_DATA_DLFEAT_READ_NR (0x00) #define NVME_NS_DATA_DLFEAT_READ_00 (0x01) #define NVME_NS_DATA_DLFEAT_READ_FF (0x02) /* supports the Deallocate bit in the Write Zeroes */ #define NVME_NS_DATA_DLFEAT_DWZ_SHIFT (3) #define NVME_NS_DATA_DLFEAT_DWZ_MASK (0x01) /* Guard field for deallocated logical blocks is set to the CRC */ #define NVME_NS_DATA_DLFEAT_GCRC_SHIFT (4) #define NVME_NS_DATA_DLFEAT_GCRC_MASK (0x01) /** lba format support */ /* metadata size */ #define NVME_NS_DATA_LBAF_MS_SHIFT (0) #define NVME_NS_DATA_LBAF_MS_MASK (0xFFFF) /* lba data size */ #define NVME_NS_DATA_LBAF_LBADS_SHIFT (16) #define NVME_NS_DATA_LBAF_LBADS_MASK (0xFF) /* relative performance */ #define NVME_NS_DATA_LBAF_RP_SHIFT (24) #define NVME_NS_DATA_LBAF_RP_MASK (0x3) enum nvme_critical_warning_state { NVME_CRIT_WARN_ST_AVAILABLE_SPARE = 0x1, NVME_CRIT_WARN_ST_TEMPERATURE = 0x2, NVME_CRIT_WARN_ST_DEVICE_RELIABILITY = 0x4, NVME_CRIT_WARN_ST_READ_ONLY = 0x8, NVME_CRIT_WARN_ST_VOLATILE_MEMORY_BACKUP = 0x10, }; #define NVME_CRIT_WARN_ST_RESERVED_MASK (0xE0) #define NVME_ASYNC_EVENT_NS_ATTRIBUTE (0x100) #define NVME_ASYNC_EVENT_FW_ACTIVATE (0x200) /* slot for current FW */ #define NVME_FIRMWARE_PAGE_AFI_SLOT_SHIFT (0) #define NVME_FIRMWARE_PAGE_AFI_SLOT_MASK (0x7) /* Commands Supported and Effects */ #define NVME_CE_PAGE_CSUP_SHIFT (0) #define NVME_CE_PAGE_CSUP_MASK (0x1) #define NVME_CE_PAGE_LBCC_SHIFT (1) #define NVME_CE_PAGE_LBCC_MASK (0x1) #define NVME_CE_PAGE_NCC_SHIFT (2) #define NVME_CE_PAGE_NCC_MASK (0x1) #define NVME_CE_PAGE_NIC_SHIFT (3) #define NVME_CE_PAGE_NIC_MASK (0x1) #define NVME_CE_PAGE_CCC_SHIFT (4) #define NVME_CE_PAGE_CCC_MASK (0x1) #define NVME_CE_PAGE_CSE_SHIFT (16) #define NVME_CE_PAGE_CSE_MASK (0x7) #define NVME_CE_PAGE_UUID_SHIFT (19) #define NVME_CE_PAGE_UUID_MASK (0x1) /* Sanitize Status */ #define NVME_SS_PAGE_SSTAT_STATUS_SHIFT (0) #define NVME_SS_PAGE_SSTAT_STATUS_MASK (0x7) #define NVME_SS_PAGE_SSTAT_STATUS_NEVER (0) #define NVME_SS_PAGE_SSTAT_STATUS_COMPLETED (1) #define NVME_SS_PAGE_SSTAT_STATUS_INPROG (2) #define NVME_SS_PAGE_SSTAT_STATUS_FAILED (3) #define NVME_SS_PAGE_SSTAT_STATUS_COMPLETEDWD (4) #define NVME_SS_PAGE_SSTAT_PASSES_SHIFT (3) #define NVME_SS_PAGE_SSTAT_PASSES_MASK (0x1f) #define NVME_SS_PAGE_SSTAT_GDE_SHIFT (8) #define NVME_SS_PAGE_SSTAT_GDE_MASK (0x1) +/* Features */ +/* Get Features */ +#define NVME_FEAT_GET_SEL_SHIFT (8) +#define NVME_FEAT_GET_SEL_MASK (0x7) +#define NVME_FEAT_GET_FID_SHIFT (0) +#define NVME_FEAT_GET_FID_MASK (0xff) + +/* Set Features */ +#define NVME_FEAT_SET_SV_SHIFT (31) +#define NVME_FEAT_SET_SV_MASK (0x1) +#define NVME_FEAT_SET_FID_SHIFT (0) +#define NVME_FEAT_SET_FID_MASK (0xff) + /* Helper macro to combine *_MASK and *_SHIFT defines */ #define NVMEB(name) (name##_MASK << name##_SHIFT) +/* Helper macro to extract value from x */ +#define NVMEV(name, x) (((x) >> name##_SHIFT) & name##_MASK) + /* CC register SHN field values */ enum shn_value { NVME_SHN_NORMAL = 0x1, NVME_SHN_ABRUPT = 0x2, }; /* CSTS register SHST field values */ enum shst_value { NVME_SHST_NORMAL = 0x0, NVME_SHST_OCCURRING = 0x1, NVME_SHST_COMPLETE = 0x2, }; struct nvme_registers { uint32_t cap_lo; /* controller capabilities */ uint32_t cap_hi; uint32_t vs; /* version */ uint32_t intms; /* interrupt mask set */ uint32_t intmc; /* interrupt mask clear */ uint32_t cc; /* controller configuration */ uint32_t reserved1; uint32_t csts; /* controller status */ uint32_t nssr; /* NVM Subsystem Reset */ uint32_t aqa; /* admin queue attributes */ uint64_t asq; /* admin submission queue base addr */ uint64_t acq; /* admin completion queue base addr */ uint32_t cmbloc; /* Controller Memory Buffer Location */ uint32_t cmbsz; /* Controller Memory Buffer Size */ uint32_t bpinfo; /* Boot Partition Information */ uint32_t bprsel; /* Boot Partition Read Select */ uint64_t bpmbl; /* Boot Partition Memory Buffer Location */ uint64_t cmbmsc; /* Controller Memory Buffer Memory Space Control */ uint32_t cmbsts; /* Controller Memory Buffer Status */ uint8_t reserved3[3492]; /* 5Ch - DFFh */ uint32_t pmrcap; /* Persistent Memory Capabilities */ uint32_t pmrctl; /* Persistent Memory Region Control */ uint32_t pmrsts; /* Persistent Memory Region Status */ uint32_t pmrebs; /* Persistent Memory Region Elasticity Buffer Size */ uint32_t pmrswtp; /* Persistent Memory Region Sustained Write Throughput */ uint32_t pmrmsc_lo; /* Persistent Memory Region Controller Memory Space Control */ uint32_t pmrmsc_hi; uint8_t reserved4[484]; /* E1Ch - FFFh */ struct { uint32_t sq_tdbl; /* submission queue tail doorbell */ uint32_t cq_hdbl; /* completion queue head doorbell */ } doorbell[1]; }; _Static_assert(sizeof(struct nvme_registers) == 0x1008, "bad size for nvme_registers"); struct nvme_command { /* dword 0 */ uint8_t opc; /* opcode */ uint8_t fuse; /* fused operation */ uint16_t cid; /* command identifier */ /* dword 1 */ uint32_t nsid; /* namespace identifier */ /* dword 2-3 */ uint32_t rsvd2; uint32_t rsvd3; /* dword 4-5 */ uint64_t mptr; /* metadata pointer */ /* dword 6-7 */ uint64_t prp1; /* prp entry 1 */ /* dword 8-9 */ uint64_t prp2; /* prp entry 2 */ /* dword 10-15 */ uint32_t cdw10; /* command-specific */ uint32_t cdw11; /* command-specific */ uint32_t cdw12; /* command-specific */ uint32_t cdw13; /* command-specific */ uint32_t cdw14; /* command-specific */ uint32_t cdw15; /* command-specific */ }; _Static_assert(sizeof(struct nvme_command) == 16 * 4, "bad size for nvme_command"); struct nvme_completion { /* dword 0 */ uint32_t cdw0; /* command-specific */ /* dword 1 */ uint32_t rsvd1; /* dword 2 */ uint16_t sqhd; /* submission queue head pointer */ uint16_t sqid; /* submission queue identifier */ /* dword 3 */ uint16_t cid; /* command identifier */ uint16_t status; } __aligned(8); /* riscv: nvme_qpair_process_completions has better code gen */ _Static_assert(sizeof(struct nvme_completion) == 4 * 4, "bad size for nvme_completion"); struct nvme_dsm_range { uint32_t attributes; uint32_t length; uint64_t starting_lba; }; /* Largest DSM Trim that can be done */ #define NVME_MAX_DSM_TRIM 4096 _Static_assert(sizeof(struct nvme_dsm_range) == 16, "bad size for nvme_dsm_ranage"); /* status code types */ enum nvme_status_code_type { NVME_SCT_GENERIC = 0x0, NVME_SCT_COMMAND_SPECIFIC = 0x1, NVME_SCT_MEDIA_ERROR = 0x2, NVME_SCT_PATH_RELATED = 0x3, /* 0x3-0x6 - reserved */ NVME_SCT_VENDOR_SPECIFIC = 0x7, }; /* generic command status codes */ enum nvme_generic_command_status_code { NVME_SC_SUCCESS = 0x00, NVME_SC_INVALID_OPCODE = 0x01, NVME_SC_INVALID_FIELD = 0x02, NVME_SC_COMMAND_ID_CONFLICT = 0x03, NVME_SC_DATA_TRANSFER_ERROR = 0x04, NVME_SC_ABORTED_POWER_LOSS = 0x05, NVME_SC_INTERNAL_DEVICE_ERROR = 0x06, NVME_SC_ABORTED_BY_REQUEST = 0x07, NVME_SC_ABORTED_SQ_DELETION = 0x08, NVME_SC_ABORTED_FAILED_FUSED = 0x09, NVME_SC_ABORTED_MISSING_FUSED = 0x0a, NVME_SC_INVALID_NAMESPACE_OR_FORMAT = 0x0b, NVME_SC_COMMAND_SEQUENCE_ERROR = 0x0c, NVME_SC_INVALID_SGL_SEGMENT_DESCR = 0x0d, NVME_SC_INVALID_NUMBER_OF_SGL_DESCR = 0x0e, NVME_SC_DATA_SGL_LENGTH_INVALID = 0x0f, NVME_SC_METADATA_SGL_LENGTH_INVALID = 0x10, NVME_SC_SGL_DESCRIPTOR_TYPE_INVALID = 0x11, NVME_SC_INVALID_USE_OF_CMB = 0x12, NVME_SC_PRP_OFFET_INVALID = 0x13, NVME_SC_ATOMIC_WRITE_UNIT_EXCEEDED = 0x14, NVME_SC_OPERATION_DENIED = 0x15, NVME_SC_SGL_OFFSET_INVALID = 0x16, /* 0x17 - reserved */ NVME_SC_HOST_ID_INCONSISTENT_FORMAT = 0x18, NVME_SC_KEEP_ALIVE_TIMEOUT_EXPIRED = 0x19, NVME_SC_KEEP_ALIVE_TIMEOUT_INVALID = 0x1a, NVME_SC_ABORTED_DUE_TO_PREEMPT = 0x1b, NVME_SC_SANITIZE_FAILED = 0x1c, NVME_SC_SANITIZE_IN_PROGRESS = 0x1d, NVME_SC_SGL_DATA_BLOCK_GRAN_INVALID = 0x1e, NVME_SC_NOT_SUPPORTED_IN_CMB = 0x1f, NVME_SC_NAMESPACE_IS_WRITE_PROTECTED = 0x20, NVME_SC_COMMAND_INTERRUPTED = 0x21, NVME_SC_TRANSIENT_TRANSPORT_ERROR = 0x22, NVME_SC_LBA_OUT_OF_RANGE = 0x80, NVME_SC_CAPACITY_EXCEEDED = 0x81, NVME_SC_NAMESPACE_NOT_READY = 0x82, NVME_SC_RESERVATION_CONFLICT = 0x83, NVME_SC_FORMAT_IN_PROGRESS = 0x84, }; /* command specific status codes */ enum nvme_command_specific_status_code { NVME_SC_COMPLETION_QUEUE_INVALID = 0x00, NVME_SC_INVALID_QUEUE_IDENTIFIER = 0x01, NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED = 0x02, NVME_SC_ABORT_COMMAND_LIMIT_EXCEEDED = 0x03, /* 0x04 - reserved */ NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED = 0x05, NVME_SC_INVALID_FIRMWARE_SLOT = 0x06, NVME_SC_INVALID_FIRMWARE_IMAGE = 0x07, NVME_SC_INVALID_INTERRUPT_VECTOR = 0x08, NVME_SC_INVALID_LOG_PAGE = 0x09, NVME_SC_INVALID_FORMAT = 0x0a, NVME_SC_FIRMWARE_REQUIRES_RESET = 0x0b, NVME_SC_INVALID_QUEUE_DELETION = 0x0c, NVME_SC_FEATURE_NOT_SAVEABLE = 0x0d, NVME_SC_FEATURE_NOT_CHANGEABLE = 0x0e, NVME_SC_FEATURE_NOT_NS_SPECIFIC = 0x0f, NVME_SC_FW_ACT_REQUIRES_NVMS_RESET = 0x10, NVME_SC_FW_ACT_REQUIRES_RESET = 0x11, NVME_SC_FW_ACT_REQUIRES_TIME = 0x12, NVME_SC_FW_ACT_PROHIBITED = 0x13, NVME_SC_OVERLAPPING_RANGE = 0x14, NVME_SC_NS_INSUFFICIENT_CAPACITY = 0x15, NVME_SC_NS_ID_UNAVAILABLE = 0x16, /* 0x17 - reserved */ NVME_SC_NS_ALREADY_ATTACHED = 0x18, NVME_SC_NS_IS_PRIVATE = 0x19, NVME_SC_NS_NOT_ATTACHED = 0x1a, NVME_SC_THIN_PROV_NOT_SUPPORTED = 0x1b, NVME_SC_CTRLR_LIST_INVALID = 0x1c, NVME_SC_SELF_TEST_IN_PROGRESS = 0x1d, NVME_SC_BOOT_PART_WRITE_PROHIB = 0x1e, NVME_SC_INVALID_CTRLR_ID = 0x1f, NVME_SC_INVALID_SEC_CTRLR_STATE = 0x20, NVME_SC_INVALID_NUM_OF_CTRLR_RESRC = 0x21, NVME_SC_INVALID_RESOURCE_ID = 0x22, NVME_SC_SANITIZE_PROHIBITED_WPMRE = 0x23, NVME_SC_ANA_GROUP_ID_INVALID = 0x24, NVME_SC_ANA_ATTACH_FAILED = 0x25, NVME_SC_CONFLICTING_ATTRIBUTES = 0x80, NVME_SC_INVALID_PROTECTION_INFO = 0x81, NVME_SC_ATTEMPTED_WRITE_TO_RO_PAGE = 0x82, }; /* media error status codes */ enum nvme_media_error_status_code { NVME_SC_WRITE_FAULTS = 0x80, NVME_SC_UNRECOVERED_READ_ERROR = 0x81, NVME_SC_GUARD_CHECK_ERROR = 0x82, NVME_SC_APPLICATION_TAG_CHECK_ERROR = 0x83, NVME_SC_REFERENCE_TAG_CHECK_ERROR = 0x84, NVME_SC_COMPARE_FAILURE = 0x85, NVME_SC_ACCESS_DENIED = 0x86, NVME_SC_DEALLOCATED_OR_UNWRITTEN = 0x87, }; /* path related status codes */ enum nvme_path_related_status_code { NVME_SC_INTERNAL_PATH_ERROR = 0x00, NVME_SC_ASYMMETRIC_ACCESS_PERSISTENT_LOSS = 0x01, NVME_SC_ASYMMETRIC_ACCESS_INACCESSIBLE = 0x02, NVME_SC_ASYMMETRIC_ACCESS_TRANSITION = 0x03, NVME_SC_CONTROLLER_PATHING_ERROR = 0x60, NVME_SC_HOST_PATHING_ERROR = 0x70, NVME_SC_COMMAND_ABOTHED_BY_HOST = 0x71, }; /* admin opcodes */ enum nvme_admin_opcode { NVME_OPC_DELETE_IO_SQ = 0x00, NVME_OPC_CREATE_IO_SQ = 0x01, NVME_OPC_GET_LOG_PAGE = 0x02, /* 0x03 - reserved */ NVME_OPC_DELETE_IO_CQ = 0x04, NVME_OPC_CREATE_IO_CQ = 0x05, NVME_OPC_IDENTIFY = 0x06, /* 0x07 - reserved */ NVME_OPC_ABORT = 0x08, NVME_OPC_SET_FEATURES = 0x09, NVME_OPC_GET_FEATURES = 0x0a, /* 0x0b - reserved */ NVME_OPC_ASYNC_EVENT_REQUEST = 0x0c, NVME_OPC_NAMESPACE_MANAGEMENT = 0x0d, /* 0x0e-0x0f - reserved */ NVME_OPC_FIRMWARE_ACTIVATE = 0x10, NVME_OPC_FIRMWARE_IMAGE_DOWNLOAD = 0x11, /* 0x12-0x13 - reserved */ NVME_OPC_DEVICE_SELF_TEST = 0x14, NVME_OPC_NAMESPACE_ATTACHMENT = 0x15, /* 0x16-0x17 - reserved */ NVME_OPC_KEEP_ALIVE = 0x18, NVME_OPC_DIRECTIVE_SEND = 0x19, NVME_OPC_DIRECTIVE_RECEIVE = 0x1a, /* 0x1b - reserved */ NVME_OPC_VIRTUALIZATION_MANAGEMENT = 0x1c, NVME_OPC_NVME_MI_SEND = 0x1d, NVME_OPC_NVME_MI_RECEIVE = 0x1e, /* 0x1f-0x7b - reserved */ NVME_OPC_DOORBELL_BUFFER_CONFIG = 0x7c, NVME_OPC_FORMAT_NVM = 0x80, NVME_OPC_SECURITY_SEND = 0x81, NVME_OPC_SECURITY_RECEIVE = 0x82, /* 0x83 - reserved */ NVME_OPC_SANITIZE = 0x84, /* 0x85 - reserved */ NVME_OPC_GET_LBA_STATUS = 0x86, }; /* nvme nvm opcodes */ enum nvme_nvm_opcode { NVME_OPC_FLUSH = 0x00, NVME_OPC_WRITE = 0x01, NVME_OPC_READ = 0x02, /* 0x03 - reserved */ NVME_OPC_WRITE_UNCORRECTABLE = 0x04, NVME_OPC_COMPARE = 0x05, /* 0x06-0x07 - reserved */ NVME_OPC_WRITE_ZEROES = 0x08, NVME_OPC_DATASET_MANAGEMENT = 0x09, /* 0x0a-0x0b - reserved */ NVME_OPC_VERIFY = 0x0c, NVME_OPC_RESERVATION_REGISTER = 0x0d, NVME_OPC_RESERVATION_REPORT = 0x0e, /* 0x0f-0x10 - reserved */ NVME_OPC_RESERVATION_ACQUIRE = 0x11, /* 0x12-0x14 - reserved */ NVME_OPC_RESERVATION_RELEASE = 0x15, }; enum nvme_feature { /* 0x00 - reserved */ NVME_FEAT_ARBITRATION = 0x01, NVME_FEAT_POWER_MANAGEMENT = 0x02, NVME_FEAT_LBA_RANGE_TYPE = 0x03, NVME_FEAT_TEMPERATURE_THRESHOLD = 0x04, NVME_FEAT_ERROR_RECOVERY = 0x05, NVME_FEAT_VOLATILE_WRITE_CACHE = 0x06, NVME_FEAT_NUMBER_OF_QUEUES = 0x07, NVME_FEAT_INTERRUPT_COALESCING = 0x08, NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION = 0x09, NVME_FEAT_WRITE_ATOMICITY = 0x0A, NVME_FEAT_ASYNC_EVENT_CONFIGURATION = 0x0B, NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION = 0x0C, NVME_FEAT_HOST_MEMORY_BUFFER = 0x0D, NVME_FEAT_TIMESTAMP = 0x0E, NVME_FEAT_KEEP_ALIVE_TIMER = 0x0F, NVME_FEAT_HOST_CONTROLLED_THERMAL_MGMT = 0x10, NVME_FEAT_NON_OP_POWER_STATE_CONFIG = 0x11, NVME_FEAT_READ_RECOVERY_LEVEL_CONFIG = 0x12, NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG = 0x13, NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW = 0x14, NVME_FEAT_LBA_STATUS_INFORMATION_ATTRIBUTES = 0x15, NVME_FEAT_HOST_BEHAVIOR_SUPPORT = 0x16, NVME_FEAT_SANITIZE_CONFIG = 0x17, NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION = 0x18, /* 0x19-0x77 - reserved */ /* 0x78-0x7f - NVMe Management Interface */ NVME_FEAT_SOFTWARE_PROGRESS_MARKER = 0x80, NVME_FEAT_HOST_IDENTIFIER = 0x81, NVME_FEAT_RESERVATION_NOTIFICATION_MASK = 0x82, NVME_FEAT_RESERVATION_PERSISTENCE = 0x83, NVME_FEAT_NAMESPACE_WRITE_PROTECTION_CONFIG = 0x84, /* 0x85-0xBF - command set specific (reserved) */ /* 0xC0-0xFF - vendor specific */ }; enum nvme_dsm_attribute { NVME_DSM_ATTR_INTEGRAL_READ = 0x1, NVME_DSM_ATTR_INTEGRAL_WRITE = 0x2, NVME_DSM_ATTR_DEALLOCATE = 0x4, }; enum nvme_activate_action { NVME_AA_REPLACE_NO_ACTIVATE = 0x0, NVME_AA_REPLACE_ACTIVATE = 0x1, NVME_AA_ACTIVATE = 0x2, }; struct nvme_power_state { /** Maximum Power */ uint16_t mp; /* Maximum Power */ uint8_t ps_rsvd1; uint8_t mps_nops; /* Max Power Scale, Non-Operational State */ uint32_t enlat; /* Entry Latency */ uint32_t exlat; /* Exit Latency */ uint8_t rrt; /* Relative Read Throughput */ uint8_t rrl; /* Relative Read Latency */ uint8_t rwt; /* Relative Write Throughput */ uint8_t rwl; /* Relative Write Latency */ uint16_t idlp; /* Idle Power */ uint8_t ips; /* Idle Power Scale */ uint8_t ps_rsvd8; uint16_t actp; /* Active Power */ uint8_t apw_aps; /* Active Power Workload, Active Power Scale */ uint8_t ps_rsvd10[9]; } __packed; _Static_assert(sizeof(struct nvme_power_state) == 32, "bad size for nvme_power_state"); #define NVME_SERIAL_NUMBER_LENGTH 20 #define NVME_MODEL_NUMBER_LENGTH 40 #define NVME_FIRMWARE_REVISION_LENGTH 8 struct nvme_controller_data { /* bytes 0-255: controller capabilities and features */ /** pci vendor id */ uint16_t vid; /** pci subsystem vendor id */ uint16_t ssvid; /** serial number */ uint8_t sn[NVME_SERIAL_NUMBER_LENGTH]; /** model number */ uint8_t mn[NVME_MODEL_NUMBER_LENGTH]; /** firmware revision */ uint8_t fr[NVME_FIRMWARE_REVISION_LENGTH]; /** recommended arbitration burst */ uint8_t rab; /** ieee oui identifier */ uint8_t ieee[3]; /** multi-interface capabilities */ uint8_t mic; /** maximum data transfer size */ uint8_t mdts; /** Controller ID */ uint16_t ctrlr_id; /** Version */ uint32_t ver; /** RTD3 Resume Latency */ uint32_t rtd3r; /** RTD3 Enter Latency */ uint32_t rtd3e; /** Optional Asynchronous Events Supported */ uint32_t oaes; /* bitfield really */ /** Controller Attributes */ uint32_t ctratt; /* bitfield really */ /** Read Recovery Levels Supported */ uint16_t rrls; uint8_t reserved1[9]; /** Controller Type */ uint8_t cntrltype; /** FRU Globally Unique Identifier */ uint8_t fguid[16]; /** Command Retry Delay Time 1 */ uint16_t crdt1; /** Command Retry Delay Time 2 */ uint16_t crdt2; /** Command Retry Delay Time 3 */ uint16_t crdt3; uint8_t reserved2[122]; /* bytes 256-511: admin command set attributes */ /** optional admin command support */ uint16_t oacs; /** abort command limit */ uint8_t acl; /** asynchronous event request limit */ uint8_t aerl; /** firmware updates */ uint8_t frmw; /** log page attributes */ uint8_t lpa; /** error log page entries */ uint8_t elpe; /** number of power states supported */ uint8_t npss; /** admin vendor specific command configuration */ uint8_t avscc; /** Autonomous Power State Transition Attributes */ uint8_t apsta; /** Warning Composite Temperature Threshold */ uint16_t wctemp; /** Critical Composite Temperature Threshold */ uint16_t cctemp; /** Maximum Time for Firmware Activation */ uint16_t mtfa; /** Host Memory Buffer Preferred Size */ uint32_t hmpre; /** Host Memory Buffer Minimum Size */ uint32_t hmmin; /** Name space capabilities */ struct { /* if nsmgmt, report tnvmcap and unvmcap */ uint8_t tnvmcap[16]; uint8_t unvmcap[16]; } __packed untncap; /** Replay Protected Memory Block Support */ uint32_t rpmbs; /* Really a bitfield */ /** Extended Device Self-test Time */ uint16_t edstt; /** Device Self-test Options */ uint8_t dsto; /* Really a bitfield */ /** Firmware Update Granularity */ uint8_t fwug; /** Keep Alive Support */ uint16_t kas; /** Host Controlled Thermal Management Attributes */ uint16_t hctma; /* Really a bitfield */ /** Minimum Thermal Management Temperature */ uint16_t mntmt; /** Maximum Thermal Management Temperature */ uint16_t mxtmt; /** Sanitize Capabilities */ uint32_t sanicap; /* Really a bitfield */ /** Host Memory Buffer Minimum Descriptor Entry Size */ uint32_t hmminds; /** Host Memory Maximum Descriptors Entries */ uint16_t hmmaxd; /** NVM Set Identifier Maximum */ uint16_t nsetidmax; /** Endurance Group Identifier Maximum */ uint16_t endgidmax; /** ANA Transition Time */ uint8_t anatt; /** Asymmetric Namespace Access Capabilities */ uint8_t anacap; /** ANA Group Identifier Maximum */ uint32_t anagrpmax; /** Number of ANA Group Identifiers */ uint32_t nanagrpid; /** Persistent Event Log Size */ uint32_t pels; uint8_t reserved3[156]; /* bytes 512-703: nvm command set attributes */ /** submission queue entry size */ uint8_t sqes; /** completion queue entry size */ uint8_t cqes; /** Maximum Outstanding Commands */ uint16_t maxcmd; /** number of namespaces */ uint32_t nn; /** optional nvm command support */ uint16_t oncs; /** fused operation support */ uint16_t fuses; /** format nvm attributes */ uint8_t fna; /** volatile write cache */ uint8_t vwc; /** Atomic Write Unit Normal */ uint16_t awun; /** Atomic Write Unit Power Fail */ uint16_t awupf; /** NVM Vendor Specific Command Configuration */ uint8_t nvscc; /** Namespace Write Protection Capabilities */ uint8_t nwpc; /** Atomic Compare & Write Unit */ uint16_t acwu; uint16_t reserved6; /** SGL Support */ uint32_t sgls; /** Maximum Number of Allowed Namespaces */ uint32_t mnan; /* bytes 540-767: Reserved */ uint8_t reserved7[224]; /** NVM Subsystem NVMe Qualified Name */ uint8_t subnqn[256]; /* bytes 1024-1791: Reserved */ uint8_t reserved8[768]; /* bytes 1792-2047: NVMe over Fabrics specification */ uint8_t reserved9[256]; /* bytes 2048-3071: power state descriptors */ struct nvme_power_state power_state[32]; /* bytes 3072-4095: vendor specific */ uint8_t vs[1024]; } __packed __aligned(4); _Static_assert(sizeof(struct nvme_controller_data) == 4096, "bad size for nvme_controller_data"); struct nvme_namespace_data { /** namespace size */ uint64_t nsze; /** namespace capacity */ uint64_t ncap; /** namespace utilization */ uint64_t nuse; /** namespace features */ uint8_t nsfeat; /** number of lba formats */ uint8_t nlbaf; /** formatted lba size */ uint8_t flbas; /** metadata capabilities */ uint8_t mc; /** end-to-end data protection capabilities */ uint8_t dpc; /** end-to-end data protection type settings */ uint8_t dps; /** Namespace Multi-path I/O and Namespace Sharing Capabilities */ uint8_t nmic; /** Reservation Capabilities */ uint8_t rescap; /** Format Progress Indicator */ uint8_t fpi; /** Deallocate Logical Block Features */ uint8_t dlfeat; /** Namespace Atomic Write Unit Normal */ uint16_t nawun; /** Namespace Atomic Write Unit Power Fail */ uint16_t nawupf; /** Namespace Atomic Compare & Write Unit */ uint16_t nacwu; /** Namespace Atomic Boundary Size Normal */ uint16_t nabsn; /** Namespace Atomic Boundary Offset */ uint16_t nabo; /** Namespace Atomic Boundary Size Power Fail */ uint16_t nabspf; /** Namespace Optimal IO Boundary */ uint16_t noiob; /** NVM Capacity */ uint8_t nvmcap[16]; /** Namespace Preferred Write Granularity */ uint16_t npwg; /** Namespace Preferred Write Alignment */ uint16_t npwa; /** Namespace Preferred Deallocate Granularity */ uint16_t npdg; /** Namespace Preferred Deallocate Alignment */ uint16_t npda; /** Namespace Optimal Write Size */ uint16_t nows; /* bytes 74-91: Reserved */ uint8_t reserved5[18]; /** ANA Group Identifier */ uint32_t anagrpid; /* bytes 96-98: Reserved */ uint8_t reserved6[3]; /** Namespace Attributes */ uint8_t nsattr; /** NVM Set Identifier */ uint16_t nvmsetid; /** Endurance Group Identifier */ uint16_t endgid; /** Namespace Globally Unique Identifier */ uint8_t nguid[16]; /** IEEE Extended Unique Identifier */ uint8_t eui64[8]; /** lba format support */ uint32_t lbaf[16]; uint8_t reserved7[192]; uint8_t vendor_specific[3712]; } __packed __aligned(4); _Static_assert(sizeof(struct nvme_namespace_data) == 4096, "bad size for nvme_namepsace_data"); enum nvme_log_page { /* 0x00 - reserved */ NVME_LOG_ERROR = 0x01, NVME_LOG_HEALTH_INFORMATION = 0x02, NVME_LOG_FIRMWARE_SLOT = 0x03, NVME_LOG_CHANGED_NAMESPACE = 0x04, NVME_LOG_COMMAND_EFFECT = 0x05, NVME_LOG_DEVICE_SELF_TEST = 0x06, NVME_LOG_TELEMETRY_HOST_INITIATED = 0x07, NVME_LOG_TELEMETRY_CONTROLLER_INITIATED = 0x08, NVME_LOG_ENDURANCE_GROUP_INFORMATION = 0x09, NVME_LOG_PREDICTABLE_LATENCY_PER_NVM_SET = 0x0a, NVME_LOG_PREDICTABLE_LATENCY_EVENT_AGGREGATE = 0x0b, NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS = 0x0c, NVME_LOG_PERSISTENT_EVENT_LOG = 0x0d, NVME_LOG_LBA_STATUS_INFORMATION = 0x0e, NVME_LOG_ENDURANCE_GROUP_EVENT_AGGREGATE = 0x0f, /* 0x06-0x7F - reserved */ /* 0x80-0xBF - I/O command set specific */ NVME_LOG_RES_NOTIFICATION = 0x80, NVME_LOG_SANITIZE_STATUS = 0x81, /* 0x82-0xBF - reserved */ /* 0xC0-0xFF - vendor specific */ /* * The following are Intel Specific log pages, but they seem * to be widely implemented. */ INTEL_LOG_READ_LAT_LOG = 0xc1, INTEL_LOG_WRITE_LAT_LOG = 0xc2, INTEL_LOG_TEMP_STATS = 0xc5, INTEL_LOG_ADD_SMART = 0xca, INTEL_LOG_DRIVE_MKT_NAME = 0xdd, /* * HGST log page, with lots ofs sub pages. */ HGST_INFO_LOG = 0xc1, }; struct nvme_error_information_entry { uint64_t error_count; uint16_t sqid; uint16_t cid; uint16_t status; uint16_t error_location; uint64_t lba; uint32_t nsid; uint8_t vendor_specific; uint8_t trtype; uint16_t reserved30; uint64_t csi; uint16_t ttsi; uint8_t reserved[22]; } __packed __aligned(4); _Static_assert(sizeof(struct nvme_error_information_entry) == 64, "bad size for nvme_error_information_entry"); struct nvme_health_information_page { uint8_t critical_warning; uint16_t temperature; uint8_t available_spare; uint8_t available_spare_threshold; uint8_t percentage_used; uint8_t reserved[26]; /* * Note that the following are 128-bit values, but are * defined as an array of 2 64-bit values. */ /* Data Units Read is always in 512-byte units. */ uint64_t data_units_read[2]; /* Data Units Written is always in 512-byte units. */ uint64_t data_units_written[2]; /* For NVM command set, this includes Compare commands. */ uint64_t host_read_commands[2]; uint64_t host_write_commands[2]; /* Controller Busy Time is reported in minutes. */ uint64_t controller_busy_time[2]; uint64_t power_cycles[2]; uint64_t power_on_hours[2]; uint64_t unsafe_shutdowns[2]; uint64_t media_errors[2]; uint64_t num_error_info_log_entries[2]; uint32_t warning_temp_time; uint32_t error_temp_time; uint16_t temp_sensor[8]; /* Thermal Management Temperature 1 Transition Count */ uint32_t tmt1tc; /* Thermal Management Temperature 2 Transition Count */ uint32_t tmt2tc; /* Total Time For Thermal Management Temperature 1 */ uint32_t ttftmt1; /* Total Time For Thermal Management Temperature 2 */ uint32_t ttftmt2; uint8_t reserved2[280]; } __packed __aligned(4); _Static_assert(sizeof(struct nvme_health_information_page) == 512, "bad size for nvme_health_information_page"); struct nvme_firmware_page { uint8_t afi; uint8_t reserved[7]; uint64_t revision[7]; /* revisions for 7 slots */ uint8_t reserved2[448]; } __packed __aligned(4); _Static_assert(sizeof(struct nvme_firmware_page) == 512, "bad size for nvme_firmware_page"); struct nvme_ns_list { uint32_t ns[1024]; } __packed __aligned(4); _Static_assert(sizeof(struct nvme_ns_list) == 4096, "bad size for nvme_ns_list"); struct nvme_command_effects_page { uint32_t acs[256]; uint32_t iocs[256]; uint8_t reserved[2048]; } __packed __aligned(4); _Static_assert(sizeof(struct nvme_command_effects_page) == 4096, "bad size for nvme_command_effects_page"); struct nvme_device_self_test_page { uint8_t curr_operation; uint8_t curr_compl; uint8_t rsvd2[2]; struct { uint8_t status; uint8_t segment_num; uint8_t valid_diag_info; uint8_t rsvd3; uint64_t poh; uint32_t nsid; /* Define as an array to simplify alignment issues */ uint8_t failing_lba[8]; uint8_t status_code_type; uint8_t status_code; uint8_t vendor_specific[2]; } __packed result[20]; } __packed __aligned(4); _Static_assert(sizeof(struct nvme_device_self_test_page) == 564, "bad size for nvme_device_self_test_page"); struct nvme_res_notification_page { uint64_t log_page_count; uint8_t log_page_type; uint8_t available_log_pages; uint8_t reserved2; uint32_t nsid; uint8_t reserved[48]; } __packed __aligned(4); _Static_assert(sizeof(struct nvme_res_notification_page) == 64, "bad size for nvme_res_notification_page"); struct nvme_sanitize_status_page { uint16_t sprog; uint16_t sstat; uint32_t scdw10; uint32_t etfo; uint32_t etfbe; uint32_t etfce; uint32_t etfownd; uint32_t etfbewnd; uint32_t etfcewnd; uint8_t reserved[480]; } __packed __aligned(4); _Static_assert(sizeof(struct nvme_sanitize_status_page) == 512, "bad size for nvme_sanitize_status_page"); struct intel_log_temp_stats { uint64_t current; uint64_t overtemp_flag_last; uint64_t overtemp_flag_life; uint64_t max_temp; uint64_t min_temp; uint64_t _rsvd[5]; uint64_t max_oper_temp; uint64_t min_oper_temp; uint64_t est_offset; } __packed __aligned(4); _Static_assert(sizeof(struct intel_log_temp_stats) == 13 * 8, "bad size for intel_log_temp_stats"); struct nvme_resv_reg_ctrlr { uint16_t ctrlr_id; /* Controller ID */ uint8_t rcsts; /* Reservation Status */ uint8_t reserved3[5]; uint64_t hostid; /* Host Identifier */ uint64_t rkey; /* Reservation Key */ } __packed __aligned(4); _Static_assert(sizeof(struct nvme_resv_reg_ctrlr) == 24, "bad size for nvme_resv_reg_ctrlr"); struct nvme_resv_reg_ctrlr_ext { uint16_t ctrlr_id; /* Controller ID */ uint8_t rcsts; /* Reservation Status */ uint8_t reserved3[5]; uint64_t rkey; /* Reservation Key */ uint64_t hostid[2]; /* Host Identifier */ uint8_t reserved32[32]; } __packed __aligned(4); _Static_assert(sizeof(struct nvme_resv_reg_ctrlr_ext) == 64, "bad size for nvme_resv_reg_ctrlr_ext"); struct nvme_resv_status { uint32_t gen; /* Generation */ uint8_t rtype; /* Reservation Type */ uint8_t regctl[2]; /* Number of Registered Controllers */ uint8_t reserved7[2]; uint8_t ptpls; /* Persist Through Power Loss State */ uint8_t reserved10[14]; struct nvme_resv_reg_ctrlr ctrlr[0]; } __packed __aligned(4); _Static_assert(sizeof(struct nvme_resv_status) == 24, "bad size for nvme_resv_status"); struct nvme_resv_status_ext { uint32_t gen; /* Generation */ uint8_t rtype; /* Reservation Type */ uint8_t regctl[2]; /* Number of Registered Controllers */ uint8_t reserved7[2]; uint8_t ptpls; /* Persist Through Power Loss State */ uint8_t reserved10[14]; uint8_t reserved24[40]; struct nvme_resv_reg_ctrlr_ext ctrlr[0]; } __packed __aligned(4); _Static_assert(sizeof(struct nvme_resv_status_ext) == 64, "bad size for nvme_resv_status_ext"); #define NVME_TEST_MAX_THREADS 128 struct nvme_io_test { enum nvme_nvm_opcode opc; uint32_t size; uint32_t time; /* in seconds */ uint32_t num_threads; uint32_t flags; uint64_t io_completed[NVME_TEST_MAX_THREADS]; }; enum nvme_io_test_flags { /* * Specifies whether dev_refthread/dev_relthread should be * called during NVME_BIO_TEST. Ignored for other test * types. */ NVME_TEST_FLAG_REFTHREAD = 0x1, }; struct nvme_pt_command { /* * cmd is used to specify a passthrough command to a controller or * namespace. * * The following fields from cmd may be specified by the caller: * * opc (opcode) * * nsid (namespace id) - for admin commands only * * cdw10-cdw15 * * Remaining fields must be set to 0 by the caller. */ struct nvme_command cmd; /* * cpl returns completion status for the passthrough command * specified by cmd. * * The following fields will be filled out by the driver, for * consumption by the caller: * * cdw0 * * status (except for phase) * * Remaining fields will be set to 0 by the driver. */ struct nvme_completion cpl; /* buf is the data buffer associated with this passthrough command. */ void * buf; /* * len is the length of the data buffer associated with this * passthrough command. */ uint32_t len; /* * is_read = 1 if the passthrough command will read data into the * supplied buffer from the controller. * * is_read = 0 if the passthrough command will write data from the * supplied buffer to the controller. */ uint32_t is_read; /* * driver_lock is used by the driver only. It must be set to 0 * by the caller. */ struct mtx * driver_lock; }; struct nvme_get_nsid { char cdev[SPECNAMELEN + 1]; uint32_t nsid; }; struct nvme_hmb_desc { uint64_t addr; uint32_t size; uint32_t reserved; }; #define nvme_completion_is_error(cpl) \ (NVME_STATUS_GET_SC((cpl)->status) != 0 || NVME_STATUS_GET_SCT((cpl)->status) != 0) void nvme_strvis(uint8_t *dst, const uint8_t *src, int dstlen, int srclen); #ifdef _KERNEL struct bio; struct thread; struct nvme_namespace; struct nvme_controller; struct nvme_consumer; typedef void (*nvme_cb_fn_t)(void *, const struct nvme_completion *); typedef void *(*nvme_cons_ns_fn_t)(struct nvme_namespace *, void *); typedef void *(*nvme_cons_ctrlr_fn_t)(struct nvme_controller *); typedef void (*nvme_cons_async_fn_t)(void *, const struct nvme_completion *, uint32_t, void *, uint32_t); typedef void (*nvme_cons_fail_fn_t)(void *); enum nvme_namespace_flags { NVME_NS_DEALLOCATE_SUPPORTED = 0x1, NVME_NS_FLUSH_SUPPORTED = 0x2, }; int nvme_ctrlr_passthrough_cmd(struct nvme_controller *ctrlr, struct nvme_pt_command *pt, uint32_t nsid, int is_user_buffer, int is_admin_cmd); /* Admin functions */ void nvme_ctrlr_cmd_set_feature(struct nvme_controller *ctrlr, uint8_t feature, uint32_t cdw11, uint32_t cdw12, uint32_t cdw13, uint32_t cdw14, uint32_t cdw15, void *payload, uint32_t payload_size, nvme_cb_fn_t cb_fn, void *cb_arg); void nvme_ctrlr_cmd_get_feature(struct nvme_controller *ctrlr, uint8_t feature, uint32_t cdw11, void *payload, uint32_t payload_size, nvme_cb_fn_t cb_fn, void *cb_arg); void nvme_ctrlr_cmd_get_log_page(struct nvme_controller *ctrlr, uint8_t log_page, uint32_t nsid, void *payload, uint32_t payload_size, nvme_cb_fn_t cb_fn, void *cb_arg); /* NVM I/O functions */ int nvme_ns_cmd_write(struct nvme_namespace *ns, void *payload, uint64_t lba, uint32_t lba_count, nvme_cb_fn_t cb_fn, void *cb_arg); int nvme_ns_cmd_write_bio(struct nvme_namespace *ns, struct bio *bp, nvme_cb_fn_t cb_fn, void *cb_arg); int nvme_ns_cmd_read(struct nvme_namespace *ns, void *payload, uint64_t lba, uint32_t lba_count, nvme_cb_fn_t cb_fn, void *cb_arg); int nvme_ns_cmd_read_bio(struct nvme_namespace *ns, struct bio *bp, nvme_cb_fn_t cb_fn, void *cb_arg); int nvme_ns_cmd_deallocate(struct nvme_namespace *ns, void *payload, uint8_t num_ranges, nvme_cb_fn_t cb_fn, void *cb_arg); int nvme_ns_cmd_flush(struct nvme_namespace *ns, nvme_cb_fn_t cb_fn, void *cb_arg); int nvme_ns_dump(struct nvme_namespace *ns, void *virt, off_t offset, size_t len); /* Registration functions */ struct nvme_consumer * nvme_register_consumer(nvme_cons_ns_fn_t ns_fn, nvme_cons_ctrlr_fn_t ctrlr_fn, nvme_cons_async_fn_t async_fn, nvme_cons_fail_fn_t fail_fn); void nvme_unregister_consumer(struct nvme_consumer *consumer); /* Controller helper functions */ device_t nvme_ctrlr_get_device(struct nvme_controller *ctrlr); const struct nvme_controller_data * nvme_ctrlr_get_data(struct nvme_controller *ctrlr); static inline bool nvme_ctrlr_has_dataset_mgmt(const struct nvme_controller_data *cd) { /* Assumes cd was byte swapped by nvme_controller_data_swapbytes() */ return ((cd->oncs >> NVME_CTRLR_DATA_ONCS_DSM_SHIFT) & NVME_CTRLR_DATA_ONCS_DSM_MASK); } /* Namespace helper functions */ uint32_t nvme_ns_get_max_io_xfer_size(struct nvme_namespace *ns); uint32_t nvme_ns_get_sector_size(struct nvme_namespace *ns); uint64_t nvme_ns_get_num_sectors(struct nvme_namespace *ns); uint64_t nvme_ns_get_size(struct nvme_namespace *ns); uint32_t nvme_ns_get_flags(struct nvme_namespace *ns); const char * nvme_ns_get_serial_number(struct nvme_namespace *ns); const char * nvme_ns_get_model_number(struct nvme_namespace *ns); const struct nvme_namespace_data * nvme_ns_get_data(struct nvme_namespace *ns); uint32_t nvme_ns_get_stripesize(struct nvme_namespace *ns); int nvme_ns_bio_process(struct nvme_namespace *ns, struct bio *bp, nvme_cb_fn_t cb_fn); int nvme_ns_ioctl_process(struct nvme_namespace *ns, u_long cmd, caddr_t arg, int flag, struct thread *td); /* * Command building helper functions -- shared with CAM * These functions assume allocator zeros out cmd structure * CAM's xpt_get_ccb and the request allocator for nvme both * do zero'd allocations. */ static inline void nvme_ns_flush_cmd(struct nvme_command *cmd, uint32_t nsid) { cmd->opc = NVME_OPC_FLUSH; cmd->nsid = htole32(nsid); } static inline void nvme_ns_rw_cmd(struct nvme_command *cmd, uint32_t rwcmd, uint32_t nsid, uint64_t lba, uint32_t count) { cmd->opc = rwcmd; cmd->nsid = htole32(nsid); cmd->cdw10 = htole32(lba & 0xffffffffu); cmd->cdw11 = htole32(lba >> 32); cmd->cdw12 = htole32(count-1); } static inline void nvme_ns_write_cmd(struct nvme_command *cmd, uint32_t nsid, uint64_t lba, uint32_t count) { nvme_ns_rw_cmd(cmd, NVME_OPC_WRITE, nsid, lba, count); } static inline void nvme_ns_read_cmd(struct nvme_command *cmd, uint32_t nsid, uint64_t lba, uint32_t count) { nvme_ns_rw_cmd(cmd, NVME_OPC_READ, nsid, lba, count); } static inline void nvme_ns_trim_cmd(struct nvme_command *cmd, uint32_t nsid, uint32_t num_ranges) { cmd->opc = NVME_OPC_DATASET_MANAGEMENT; cmd->nsid = htole32(nsid); cmd->cdw10 = htole32(num_ranges - 1); cmd->cdw11 = htole32(NVME_DSM_ATTR_DEALLOCATE); } extern int nvme_use_nvd; #endif /* _KERNEL */ /* Endianess conversion functions for NVMe structs */ static inline void nvme_completion_swapbytes(struct nvme_completion *s __unused) { #if _BYTE_ORDER != _LITTLE_ENDIAN s->cdw0 = le32toh(s->cdw0); /* omit rsvd1 */ s->sqhd = le16toh(s->sqhd); s->sqid = le16toh(s->sqid); /* omit cid */ s->status = le16toh(s->status); #endif } static inline void nvme_power_state_swapbytes(struct nvme_power_state *s __unused) { #if _BYTE_ORDER != _LITTLE_ENDIAN s->mp = le16toh(s->mp); s->enlat = le32toh(s->enlat); s->exlat = le32toh(s->exlat); s->idlp = le16toh(s->idlp); s->actp = le16toh(s->actp); #endif } static inline void nvme_controller_data_swapbytes(struct nvme_controller_data *s __unused) { #if _BYTE_ORDER != _LITTLE_ENDIAN int i; s->vid = le16toh(s->vid); s->ssvid = le16toh(s->ssvid); s->ctrlr_id = le16toh(s->ctrlr_id); s->ver = le32toh(s->ver); s->rtd3r = le32toh(s->rtd3r); s->rtd3e = le32toh(s->rtd3e); s->oaes = le32toh(s->oaes); s->ctratt = le32toh(s->ctratt); s->rrls = le16toh(s->rrls); s->crdt1 = le16toh(s->crdt1); s->crdt2 = le16toh(s->crdt2); s->crdt3 = le16toh(s->crdt3); s->oacs = le16toh(s->oacs); s->wctemp = le16toh(s->wctemp); s->cctemp = le16toh(s->cctemp); s->mtfa = le16toh(s->mtfa); s->hmpre = le32toh(s->hmpre); s->hmmin = le32toh(s->hmmin); s->rpmbs = le32toh(s->rpmbs); s->edstt = le16toh(s->edstt); s->kas = le16toh(s->kas); s->hctma = le16toh(s->hctma); s->mntmt = le16toh(s->mntmt); s->mxtmt = le16toh(s->mxtmt); s->sanicap = le32toh(s->sanicap); s->hmminds = le32toh(s->hmminds); s->hmmaxd = le16toh(s->hmmaxd); s->nsetidmax = le16toh(s->nsetidmax); s->endgidmax = le16toh(s->endgidmax); s->anagrpmax = le32toh(s->anagrpmax); s->nanagrpid = le32toh(s->nanagrpid); s->pels = le32toh(s->pels); s->maxcmd = le16toh(s->maxcmd); s->nn = le32toh(s->nn); s->oncs = le16toh(s->oncs); s->fuses = le16toh(s->fuses); s->awun = le16toh(s->awun); s->awupf = le16toh(s->awupf); s->acwu = le16toh(s->acwu); s->sgls = le32toh(s->sgls); s->mnan = le32toh(s->mnan); for (i = 0; i < 32; i++) nvme_power_state_swapbytes(&s->power_state[i]); #endif } static inline void nvme_namespace_data_swapbytes(struct nvme_namespace_data *s __unused) { #if _BYTE_ORDER != _LITTLE_ENDIAN int i; s->nsze = le64toh(s->nsze); s->ncap = le64toh(s->ncap); s->nuse = le64toh(s->nuse); s->nawun = le16toh(s->nawun); s->nawupf = le16toh(s->nawupf); s->nacwu = le16toh(s->nacwu); s->nabsn = le16toh(s->nabsn); s->nabo = le16toh(s->nabo); s->nabspf = le16toh(s->nabspf); s->noiob = le16toh(s->noiob); s->npwg = le16toh(s->npwg); s->npwa = le16toh(s->npwa); s->npdg = le16toh(s->npdg); s->npda = le16toh(s->npda); s->nows = le16toh(s->nows); s->anagrpid = le32toh(s->anagrpid); s->nvmsetid = le16toh(s->nvmsetid); s->endgid = le16toh(s->endgid); for (i = 0; i < 16; i++) s->lbaf[i] = le32toh(s->lbaf[i]); #endif } static inline void nvme_error_information_entry_swapbytes( struct nvme_error_information_entry *s __unused) { #if _BYTE_ORDER != _LITTLE_ENDIAN s->error_count = le64toh(s->error_count); s->sqid = le16toh(s->sqid); s->cid = le16toh(s->cid); s->status = le16toh(s->status); s->error_location = le16toh(s->error_location); s->lba = le64toh(s->lba); s->nsid = le32toh(s->nsid); s->csi = le64toh(s->csi); s->ttsi = le16toh(s->ttsi); #endif } static inline void nvme_le128toh(void *p __unused) { #if _BYTE_ORDER != _LITTLE_ENDIAN /* Swap 16 bytes in place */ char *tmp = (char*)p; char b; int i; for (i = 0; i < 8; i++) { b = tmp[i]; tmp[i] = tmp[15-i]; tmp[15-i] = b; } #endif } static inline void nvme_health_information_page_swapbytes( struct nvme_health_information_page *s __unused) { #if _BYTE_ORDER != _LITTLE_ENDIAN int i; s->temperature = le16toh(s->temperature); nvme_le128toh((void *)s->data_units_read); nvme_le128toh((void *)s->data_units_written); nvme_le128toh((void *)s->host_read_commands); nvme_le128toh((void *)s->host_write_commands); nvme_le128toh((void *)s->controller_busy_time); nvme_le128toh((void *)s->power_cycles); nvme_le128toh((void *)s->power_on_hours); nvme_le128toh((void *)s->unsafe_shutdowns); nvme_le128toh((void *)s->media_errors); nvme_le128toh((void *)s->num_error_info_log_entries); s->warning_temp_time = le32toh(s->warning_temp_time); s->error_temp_time = le32toh(s->error_temp_time); for (i = 0; i < 8; i++) s->temp_sensor[i] = le16toh(s->temp_sensor[i]); s->tmt1tc = le32toh(s->tmt1tc); s->tmt2tc = le32toh(s->tmt2tc); s->ttftmt1 = le32toh(s->ttftmt1); s->ttftmt2 = le32toh(s->ttftmt2); #endif } static inline void nvme_firmware_page_swapbytes(struct nvme_firmware_page *s __unused) { #if _BYTE_ORDER != _LITTLE_ENDIAN int i; for (i = 0; i < 7; i++) s->revision[i] = le64toh(s->revision[i]); #endif } static inline void nvme_ns_list_swapbytes(struct nvme_ns_list *s __unused) { #if _BYTE_ORDER != _LITTLE_ENDIAN int i; for (i = 0; i < 1024; i++) s->ns[i] = le32toh(s->ns[i]); #endif } static inline void nvme_command_effects_page_swapbytes( struct nvme_command_effects_page *s __unused) { #if _BYTE_ORDER != _LITTLE_ENDIAN int i; for (i = 0; i < 256; i++) s->acs[i] = le32toh(s->acs[i]); for (i = 0; i < 256; i++) s->iocs[i] = le32toh(s->iocs[i]); #endif } static inline void nvme_res_notification_page_swapbytes( struct nvme_res_notification_page *s __unused) { #if _BYTE_ORDER != _LITTLE_ENDIAN s->log_page_count = le64toh(s->log_page_count); s->nsid = le32toh(s->nsid); #endif } static inline void nvme_sanitize_status_page_swapbytes( struct nvme_sanitize_status_page *s __unused) { #if _BYTE_ORDER != _LITTLE_ENDIAN s->sprog = le16toh(s->sprog); s->sstat = le16toh(s->sstat); s->scdw10 = le32toh(s->scdw10); s->etfo = le32toh(s->etfo); s->etfbe = le32toh(s->etfbe); s->etfce = le32toh(s->etfce); s->etfownd = le32toh(s->etfownd); s->etfbewnd = le32toh(s->etfbewnd); s->etfcewnd = le32toh(s->etfcewnd); #endif } static inline void intel_log_temp_stats_swapbytes(struct intel_log_temp_stats *s __unused) { #if _BYTE_ORDER != _LITTLE_ENDIAN s->current = le64toh(s->current); s->overtemp_flag_last = le64toh(s->overtemp_flag_last); s->overtemp_flag_life = le64toh(s->overtemp_flag_life); s->max_temp = le64toh(s->max_temp); s->min_temp = le64toh(s->min_temp); /* omit _rsvd[] */ s->max_oper_temp = le64toh(s->max_oper_temp); s->min_oper_temp = le64toh(s->min_oper_temp); s->est_offset = le64toh(s->est_offset); #endif } static inline void nvme_resv_status_swapbytes(struct nvme_resv_status *s __unused, size_t size __unused) { #if _BYTE_ORDER != _LITTLE_ENDIAN u_int i, n; s->gen = le32toh(s->gen); n = (s->regctl[1] << 8) | s->regctl[0]; n = MIN(n, (size - sizeof(s)) / sizeof(s->ctrlr[0])); for (i = 0; i < n; i++) { s->ctrlr[i].ctrlr_id = le16toh(s->ctrlr[i].ctrlr_id); s->ctrlr[i].hostid = le64toh(s->ctrlr[i].hostid); s->ctrlr[i].rkey = le64toh(s->ctrlr[i].rkey); } #endif } static inline void nvme_resv_status_ext_swapbytes(struct nvme_resv_status_ext *s __unused, size_t size __unused) { #if _BYTE_ORDER != _LITTLE_ENDIAN u_int i, n; s->gen = le32toh(s->gen); n = (s->regctl[1] << 8) | s->regctl[0]; n = MIN(n, (size - sizeof(s)) / sizeof(s->ctrlr[0])); for (i = 0; i < n; i++) { s->ctrlr[i].ctrlr_id = le16toh(s->ctrlr[i].ctrlr_id); s->ctrlr[i].rkey = le64toh(s->ctrlr[i].rkey); nvme_le128toh((void *)s->ctrlr[i].hostid); } #endif } static inline void nvme_device_self_test_swapbytes(struct nvme_device_self_test_page *s __unused) { #if _BYTE_ORDER != _LITTLE_ENDIAN uint8_t *tmp; uint32_t r, i; uint8_t b; for (r = 0; r < 20; r++) { s->result[r].poh = le64toh(s->result[r].poh); s->result[r].nsid = le32toh(s->result[r].nsid); /* Unaligned 64-bit loads fail on some architectures */ tmp = s->result[r].failing_lba; for (i = 0; i < 4; i++) { b = tmp[i]; tmp[i] = tmp[7-i]; tmp[7-i] = b; } } #endif } #endif /* __NVME_H__ */ diff --git a/usr.sbin/bhyve/pci_nvme.c b/usr.sbin/bhyve/pci_nvme.c index b341b70f4e48..1cc9838c112f 100644 --- a/usr.sbin/bhyve/pci_nvme.c +++ b/usr.sbin/bhyve/pci_nvme.c @@ -1,3373 +1,3381 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2017 Shunsuke Mie * Copyright (c) 2018 Leon Dang * Copyright (c) 2020 Chuck Tuffli * * Function crc16 Copyright (c) 2017, Fedor Uporov * Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * bhyve PCIe-NVMe device emulation. * * options: * -s ,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#,dsm= * * accepted devpath: * /dev/blockdev * /path/to/image * ram=size_in_MiB * * maxq = max number of queues * qsz = max elements in each queue * ioslots = max number of concurrent io requests * sectsz = sector size (defaults to blockif sector size) * ser = serial number (20-chars max) * eui64 = IEEE Extended Unique Identifier (8 byte value) * dsm = DataSet Management support. Option is one of auto, enable,disable * */ /* TODO: - create async event for smart and log - intr coalesce */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "bhyverun.h" #include "block_if.h" #include "config.h" #include "debug.h" #include "pci_emul.h" static int nvme_debug = 0; #define DPRINTF(fmt, args...) if (nvme_debug) PRINTLN(fmt, ##args) #define WPRINTF(fmt, args...) PRINTLN(fmt, ##args) /* defaults; can be overridden */ #define NVME_MSIX_BAR 4 #define NVME_IOSLOTS 8 /* The NVMe spec defines bits 13:4 in BAR0 as reserved */ #define NVME_MMIO_SPACE_MIN (1 << 14) #define NVME_QUEUES 16 #define NVME_MAX_QENTRIES 2048 /* Memory Page size Minimum reported in CAP register */ #define NVME_MPSMIN 0 /* MPSMIN converted to bytes */ #define NVME_MPSMIN_BYTES (1 << (12 + NVME_MPSMIN)) #define NVME_PRP2_ITEMS (PAGE_SIZE/sizeof(uint64_t)) #define NVME_MDTS 9 /* Note the + 1 allows for the initial descriptor to not be page aligned */ #define NVME_MAX_IOVEC ((1 << NVME_MDTS) + 1) #define NVME_MAX_DATA_SIZE ((1 << NVME_MDTS) * NVME_MPSMIN_BYTES) /* This is a synthetic status code to indicate there is no status */ #define NVME_NO_STATUS 0xffff #define NVME_COMPLETION_VALID(c) ((c).status != NVME_NO_STATUS) /* Reported temperature in Kelvin (i.e. room temperature) */ #define NVME_TEMPERATURE 296 /* helpers */ /* Convert a zero-based value into a one-based value */ #define ONE_BASED(zero) ((zero) + 1) /* Convert a one-based value into a zero-based value */ #define ZERO_BASED(one) ((one) - 1) /* Encode number of SQ's and CQ's for Set/Get Features */ #define NVME_FEATURE_NUM_QUEUES(sc) \ (ZERO_BASED((sc)->num_squeues) & 0xffff) | \ (ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16; #define NVME_DOORBELL_OFFSET offsetof(struct nvme_registers, doorbell) enum nvme_controller_register_offsets { NVME_CR_CAP_LOW = 0x00, NVME_CR_CAP_HI = 0x04, NVME_CR_VS = 0x08, NVME_CR_INTMS = 0x0c, NVME_CR_INTMC = 0x10, NVME_CR_CC = 0x14, NVME_CR_CSTS = 0x1c, NVME_CR_NSSR = 0x20, NVME_CR_AQA = 0x24, NVME_CR_ASQ_LOW = 0x28, NVME_CR_ASQ_HI = 0x2c, NVME_CR_ACQ_LOW = 0x30, NVME_CR_ACQ_HI = 0x34, }; enum nvme_cmd_cdw11 { NVME_CMD_CDW11_PC = 0x0001, NVME_CMD_CDW11_IEN = 0x0002, NVME_CMD_CDW11_IV = 0xFFFF0000, }; enum nvme_copy_dir { NVME_COPY_TO_PRP, NVME_COPY_FROM_PRP, }; #define NVME_CQ_INTEN 0x01 #define NVME_CQ_INTCOAL 0x02 struct nvme_completion_queue { struct nvme_completion *qbase; pthread_mutex_t mtx; uint32_t size; uint16_t tail; /* nvme progress */ uint16_t head; /* guest progress */ uint16_t intr_vec; uint32_t intr_en; }; struct nvme_submission_queue { struct nvme_command *qbase; pthread_mutex_t mtx; uint32_t size; uint16_t head; /* nvme progress */ uint16_t tail; /* guest progress */ uint16_t cqid; /* completion queue id */ int qpriority; }; enum nvme_storage_type { NVME_STOR_BLOCKIF = 0, NVME_STOR_RAM = 1, }; struct pci_nvme_blockstore { enum nvme_storage_type type; void *ctx; uint64_t size; uint32_t sectsz; uint32_t sectsz_bits; uint64_t eui64; uint32_t deallocate:1; }; /* * Calculate the number of additional page descriptors for guest IO requests * based on the advertised Max Data Transfer (MDTS) and given the number of * default iovec's in a struct blockif_req. */ #define MDTS_PAD_SIZE \ ( NVME_MAX_IOVEC > BLOCKIF_IOV_MAX ? \ NVME_MAX_IOVEC - BLOCKIF_IOV_MAX : \ 0 ) struct pci_nvme_ioreq { struct pci_nvme_softc *sc; STAILQ_ENTRY(pci_nvme_ioreq) link; struct nvme_submission_queue *nvme_sq; uint16_t sqid; /* command information */ uint16_t opc; uint16_t cid; uint32_t nsid; uint64_t prev_gpaddr; size_t prev_size; size_t bytes; struct blockif_req io_req; struct iovec iovpadding[MDTS_PAD_SIZE]; }; enum nvme_dsm_type { /* Dataset Management bit in ONCS reflects backing storage capability */ NVME_DATASET_MANAGEMENT_AUTO, /* Unconditionally set Dataset Management bit in ONCS */ NVME_DATASET_MANAGEMENT_ENABLE, /* Unconditionally clear Dataset Management bit in ONCS */ NVME_DATASET_MANAGEMENT_DISABLE, }; struct pci_nvme_softc; struct nvme_feature_obj; typedef void (*nvme_feature_cb)(struct pci_nvme_softc *, struct nvme_feature_obj *, struct nvme_command *, struct nvme_completion *); struct nvme_feature_obj { uint32_t cdw11; nvme_feature_cb set; nvme_feature_cb get; bool namespace_specific; }; #define NVME_FID_MAX (NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION + 1) typedef enum { PCI_NVME_AE_TYPE_ERROR = 0, PCI_NVME_AE_TYPE_SMART, PCI_NVME_AE_TYPE_NOTICE, PCI_NVME_AE_TYPE_IO_CMD = 6, PCI_NVME_AE_TYPE_VENDOR = 7, PCI_NVME_AE_TYPE_MAX /* Must be last */ } pci_nvme_async_type; /* Asynchronous Event Requests */ struct pci_nvme_aer { STAILQ_ENTRY(pci_nvme_aer) link; uint16_t cid; /* Command ID of the submitted AER */ }; /** Asynchronous Event Information - Notice */ typedef enum { PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED = 0, PCI_NVME_AEI_NOTICE_FW_ACTIVATION, PCI_NVME_AEI_NOTICE_TELEMETRY_CHANGE, PCI_NVME_AEI_NOTICE_ANA_CHANGE, PCI_NVME_AEI_NOTICE_PREDICT_LATENCY_CHANGE, PCI_NVME_AEI_NOTICE_LBA_STATUS_ALERT, PCI_NVME_AEI_NOTICE_ENDURANCE_GROUP_CHANGE, PCI_NVME_AEI_NOTICE_MAX, } pci_nvme_async_event_info_notice; #define PCI_NVME_AEI_NOTICE_SHIFT 8 #define PCI_NVME_AEI_NOTICE_MASK(event) (1 << (event + PCI_NVME_AEI_NOTICE_SHIFT)) /* Asynchronous Event Notifications */ struct pci_nvme_aen { pci_nvme_async_type atype; uint32_t event_data; bool posted; }; /* * By default, enable all Asynchrnous Event Notifications: * SMART / Health Critical Warnings * Namespace Attribute Notices */ #define PCI_NVME_AEN_DEFAULT_MASK 0x11f typedef enum { NVME_CNTRLTYPE_IO = 1, NVME_CNTRLTYPE_DISCOVERY = 2, NVME_CNTRLTYPE_ADMIN = 3, } pci_nvme_cntrl_type; struct pci_nvme_softc { struct pci_devinst *nsc_pi; pthread_mutex_t mtx; struct nvme_registers regs; struct nvme_namespace_data nsdata; struct nvme_controller_data ctrldata; struct nvme_error_information_entry err_log; struct nvme_health_information_page health_log; struct nvme_firmware_page fw_log; struct nvme_ns_list ns_log; struct pci_nvme_blockstore nvstore; uint16_t max_qentries; /* max entries per queue */ uint32_t max_queues; /* max number of IO SQ's or CQ's */ uint32_t num_cqueues; uint32_t num_squeues; bool num_q_is_set; /* Has host set Number of Queues */ struct pci_nvme_ioreq *ioreqs; STAILQ_HEAD(, pci_nvme_ioreq) ioreqs_free; /* free list of ioreqs */ uint32_t pending_ios; uint32_t ioslots; sem_t iosemlock; /* * Memory mapped Submission and Completion queues * Each array includes both Admin and IO queues */ struct nvme_completion_queue *compl_queues; struct nvme_submission_queue *submit_queues; struct nvme_feature_obj feat[NVME_FID_MAX]; enum nvme_dsm_type dataset_management; /* Accounting for SMART data */ __uint128_t read_data_units; __uint128_t write_data_units; __uint128_t read_commands; __uint128_t write_commands; uint32_t read_dunits_remainder; uint32_t write_dunits_remainder; STAILQ_HEAD(, pci_nvme_aer) aer_list; pthread_mutex_t aer_mtx; uint32_t aer_count; struct pci_nvme_aen aen[PCI_NVME_AE_TYPE_MAX]; pthread_t aen_tid; pthread_mutex_t aen_mtx; pthread_cond_t aen_cond; }; static void pci_nvme_cq_update(struct pci_nvme_softc *sc, struct nvme_completion_queue *cq, uint32_t cdw0, uint16_t cid, uint16_t sqid, uint16_t status); static struct pci_nvme_ioreq *pci_nvme_get_ioreq(struct pci_nvme_softc *); static void pci_nvme_release_ioreq(struct pci_nvme_softc *, struct pci_nvme_ioreq *); static void pci_nvme_io_done(struct blockif_req *, int); /* Controller Configuration utils */ #define NVME_CC_GET_EN(cc) \ ((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK) #define NVME_CC_GET_CSS(cc) \ ((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK) #define NVME_CC_GET_SHN(cc) \ ((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK) #define NVME_CC_GET_IOSQES(cc) \ ((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK) #define NVME_CC_GET_IOCQES(cc) \ ((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK) #define NVME_CC_WRITE_MASK \ ((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \ (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \ (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT)) #define NVME_CC_NEN_WRITE_MASK \ ((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \ (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \ (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT)) /* Controller Status utils */ #define NVME_CSTS_GET_RDY(sts) \ ((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK) #define NVME_CSTS_RDY (1 << NVME_CSTS_REG_RDY_SHIFT) #define NVME_CSTS_CFS (1 << NVME_CSTS_REG_CFS_SHIFT) /* Completion Queue status word utils */ #define NVME_STATUS_P (1 << NVME_STATUS_P_SHIFT) #define NVME_STATUS_MASK \ ((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\ (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT)) #define NVME_ONCS_DSM (NVME_CTRLR_DATA_ONCS_DSM_MASK << \ NVME_CTRLR_DATA_ONCS_DSM_SHIFT) static void nvme_feature_invalid_cb(struct pci_nvme_softc *, struct nvme_feature_obj *, struct nvme_command *, struct nvme_completion *); static void nvme_feature_temperature(struct pci_nvme_softc *, struct nvme_feature_obj *, struct nvme_command *, struct nvme_completion *); static void nvme_feature_num_queues(struct pci_nvme_softc *, struct nvme_feature_obj *, struct nvme_command *, struct nvme_completion *); static void nvme_feature_iv_config(struct pci_nvme_softc *, struct nvme_feature_obj *, struct nvme_command *, struct nvme_completion *); static void nvme_feature_async_event(struct pci_nvme_softc *, struct nvme_feature_obj *, struct nvme_command *, struct nvme_completion *); static void *aen_thr(void *arg); static __inline void cpywithpad(char *dst, size_t dst_size, const char *src, char pad) { size_t len; len = strnlen(src, dst_size); memset(dst, pad, dst_size); memcpy(dst, src, len); } static __inline void pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code) { *status &= ~NVME_STATUS_MASK; *status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT | (code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT; } static __inline void pci_nvme_status_genc(uint16_t *status, uint16_t code) { pci_nvme_status_tc(status, NVME_SCT_GENERIC, code); } /* * Initialize the requested number or IO Submission and Completion Queues. * Admin queues are allocated implicitly. */ static void pci_nvme_init_queues(struct pci_nvme_softc *sc, uint32_t nsq, uint32_t ncq) { uint32_t i; /* * Allocate and initialize the Submission Queues */ if (nsq > NVME_QUEUES) { WPRINTF("%s: clamping number of SQ from %u to %u", __func__, nsq, NVME_QUEUES); nsq = NVME_QUEUES; } sc->num_squeues = nsq; sc->submit_queues = calloc(sc->num_squeues + 1, sizeof(struct nvme_submission_queue)); if (sc->submit_queues == NULL) { WPRINTF("%s: SQ allocation failed", __func__); sc->num_squeues = 0; } else { struct nvme_submission_queue *sq = sc->submit_queues; for (i = 0; i < sc->num_squeues + 1; i++) pthread_mutex_init(&sq[i].mtx, NULL); } /* * Allocate and initialize the Completion Queues */ if (ncq > NVME_QUEUES) { WPRINTF("%s: clamping number of CQ from %u to %u", __func__, ncq, NVME_QUEUES); ncq = NVME_QUEUES; } sc->num_cqueues = ncq; sc->compl_queues = calloc(sc->num_cqueues + 1, sizeof(struct nvme_completion_queue)); if (sc->compl_queues == NULL) { WPRINTF("%s: CQ allocation failed", __func__); sc->num_cqueues = 0; } else { struct nvme_completion_queue *cq = sc->compl_queues; for (i = 0; i < sc->num_cqueues + 1; i++) pthread_mutex_init(&cq[i].mtx, NULL); } } static void pci_nvme_init_ctrldata(struct pci_nvme_softc *sc) { struct nvme_controller_data *cd = &sc->ctrldata; cd->vid = 0xFB5D; cd->ssvid = 0x0000; cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' '); cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' '); /* Num of submission commands that we can handle at a time (2^rab) */ cd->rab = 4; /* FreeBSD OUI */ cd->ieee[0] = 0x58; cd->ieee[1] = 0x9c; cd->ieee[2] = 0xfc; cd->mic = 0; cd->mdts = NVME_MDTS; /* max data transfer size (2^mdts * CAP.MPSMIN) */ cd->ver = NVME_REV(1,4); cd->cntrltype = NVME_CNTRLTYPE_IO; cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT; cd->oaes = NVMEB(NVME_CTRLR_DATA_OAES_NS_ATTR); cd->acl = 2; cd->aerl = 4; /* Advertise 1, Read-only firmware slot */ cd->frmw = NVMEB(NVME_CTRLR_DATA_FRMW_SLOT1_RO) | (1 << NVME_CTRLR_DATA_FRMW_NUM_SLOTS_SHIFT); cd->lpa = 0; /* TODO: support some simple things like SMART */ cd->elpe = 0; /* max error log page entries */ /* * Report a single power state (zero-based value) * power_state[] values are left as zero to indicate "Not reported" */ cd->npss = 0; /* Warning Composite Temperature Threshold */ cd->wctemp = 0x0157; cd->cctemp = 0x0157; /* SANICAP must not be 0 for Revision 1.4 and later NVMe Controllers */ cd->sanicap = (NVME_CTRLR_DATA_SANICAP_NODMMAS_NO << NVME_CTRLR_DATA_SANICAP_NODMMAS_SHIFT); cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) | (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT); cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) | (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT); cd->nn = 1; /* number of namespaces */ cd->oncs = 0; switch (sc->dataset_management) { case NVME_DATASET_MANAGEMENT_AUTO: if (sc->nvstore.deallocate) cd->oncs |= NVME_ONCS_DSM; break; case NVME_DATASET_MANAGEMENT_ENABLE: cd->oncs |= NVME_ONCS_DSM; break; default: break; } cd->fna = NVME_CTRLR_DATA_FNA_FORMAT_ALL_MASK << NVME_CTRLR_DATA_FNA_FORMAT_ALL_SHIFT; cd->vwc = NVME_CTRLR_DATA_VWC_ALL_NO << NVME_CTRLR_DATA_VWC_ALL_SHIFT; } /* * Calculate the CRC-16 of the given buffer * See copyright attribution at top of file */ static uint16_t crc16(uint16_t crc, const void *buffer, unsigned int len) { const unsigned char *cp = buffer; /* CRC table for the CRC-16. The poly is 0x8005 (x16 + x15 + x2 + 1). */ static uint16_t const crc16_table[256] = { 0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241, 0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440, 0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40, 0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841, 0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40, 0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41, 0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641, 0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040, 0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240, 0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441, 0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41, 0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840, 0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41, 0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40, 0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640, 0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041, 0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240, 0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441, 0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41, 0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840, 0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41, 0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40, 0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640, 0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041, 0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241, 0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440, 0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40, 0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841, 0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40, 0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41, 0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641, 0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040 }; while (len--) crc = (((crc >> 8) & 0xffU) ^ crc16_table[(crc ^ *cp++) & 0xffU]) & 0x0000ffffU; return crc; } static void pci_nvme_init_nsdata_size(struct pci_nvme_blockstore *nvstore, struct nvme_namespace_data *nd) { /* Get capacity and block size information from backing store */ nd->nsze = nvstore->size / nvstore->sectsz; nd->ncap = nd->nsze; nd->nuse = nd->nsze; } static void pci_nvme_init_nsdata(struct pci_nvme_softc *sc, struct nvme_namespace_data *nd, uint32_t nsid, struct pci_nvme_blockstore *nvstore) { pci_nvme_init_nsdata_size(nvstore, nd); if (nvstore->type == NVME_STOR_BLOCKIF) nvstore->deallocate = blockif_candelete(nvstore->ctx); nd->nlbaf = 0; /* NLBAF is a 0's based value (i.e. 1 LBA Format) */ nd->flbas = 0; /* Create an EUI-64 if user did not provide one */ if (nvstore->eui64 == 0) { char *data = NULL; uint64_t eui64 = nvstore->eui64; asprintf(&data, "%s%u%u%u", get_config_value("name"), sc->nsc_pi->pi_bus, sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); if (data != NULL) { eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data)); free(data); } nvstore->eui64 = (eui64 << 16) | (nsid & 0xffff); } be64enc(nd->eui64, nvstore->eui64); /* LBA data-sz = 2^lbads */ nd->lbaf[0] = nvstore->sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT; } static void pci_nvme_init_logpages(struct pci_nvme_softc *sc) { __uint128_t power_cycles = 1; memset(&sc->err_log, 0, sizeof(sc->err_log)); memset(&sc->health_log, 0, sizeof(sc->health_log)); memset(&sc->fw_log, 0, sizeof(sc->fw_log)); memset(&sc->ns_log, 0, sizeof(sc->ns_log)); /* Set read/write remainder to round up according to spec */ sc->read_dunits_remainder = 999; sc->write_dunits_remainder = 999; /* Set nominal Health values checked by implementations */ sc->health_log.temperature = NVME_TEMPERATURE; sc->health_log.available_spare = 100; sc->health_log.available_spare_threshold = 10; /* Set Active Firmware Info to slot 1 */ sc->fw_log.afi = (1 << NVME_FIRMWARE_PAGE_AFI_SLOT_SHIFT); memcpy(&sc->fw_log.revision[0], sc->ctrldata.fr, sizeof(sc->fw_log.revision[0])); memcpy(&sc->health_log.power_cycles, &power_cycles, sizeof(sc->health_log.power_cycles)); } static void pci_nvme_init_features(struct pci_nvme_softc *sc) { enum nvme_feature fid; for (fid = 0; fid < NVME_FID_MAX; fid++) { switch (fid) { case NVME_FEAT_ARBITRATION: case NVME_FEAT_POWER_MANAGEMENT: case NVME_FEAT_INTERRUPT_COALESCING: //XXX case NVME_FEAT_WRITE_ATOMICITY: /* Mandatory but no special handling required */ //XXX hang - case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG: //XXX hang - case NVME_FEAT_HOST_BEHAVIOR_SUPPORT: // this returns a data buffer break; case NVME_FEAT_TEMPERATURE_THRESHOLD: sc->feat[fid].set = nvme_feature_temperature; break; case NVME_FEAT_ERROR_RECOVERY: sc->feat[fid].namespace_specific = true; break; case NVME_FEAT_NUMBER_OF_QUEUES: sc->feat[fid].set = nvme_feature_num_queues; break; case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION: sc->feat[fid].set = nvme_feature_iv_config; break; case NVME_FEAT_ASYNC_EVENT_CONFIGURATION: sc->feat[fid].set = nvme_feature_async_event; /* Enable all AENs by default */ sc->feat[fid].cdw11 = PCI_NVME_AEN_DEFAULT_MASK; break; default: sc->feat[fid].set = nvme_feature_invalid_cb; sc->feat[fid].get = nvme_feature_invalid_cb; } } } static void pci_nvme_aer_reset(struct pci_nvme_softc *sc) { STAILQ_INIT(&sc->aer_list); sc->aer_count = 0; } static void pci_nvme_aer_init(struct pci_nvme_softc *sc) { pthread_mutex_init(&sc->aer_mtx, NULL); pci_nvme_aer_reset(sc); } static void pci_nvme_aer_destroy(struct pci_nvme_softc *sc) { struct pci_nvme_aer *aer = NULL; pthread_mutex_lock(&sc->aer_mtx); while (!STAILQ_EMPTY(&sc->aer_list)) { aer = STAILQ_FIRST(&sc->aer_list); STAILQ_REMOVE_HEAD(&sc->aer_list, link); free(aer); } pthread_mutex_unlock(&sc->aer_mtx); pci_nvme_aer_reset(sc); } static bool pci_nvme_aer_available(struct pci_nvme_softc *sc) { return (sc->aer_count != 0); } static bool pci_nvme_aer_limit_reached(struct pci_nvme_softc *sc) { struct nvme_controller_data *cd = &sc->ctrldata; /* AERL is a zero based value while aer_count is one's based */ return (sc->aer_count == (cd->aerl + 1)); } /* * Add an Async Event Request * * Stores an AER to be returned later if the Controller needs to notify the * host of an event. * Note that while the NVMe spec doesn't require Controllers to return AER's * in order, this implementation does preserve the order. */ static int pci_nvme_aer_add(struct pci_nvme_softc *sc, uint16_t cid) { struct pci_nvme_aer *aer = NULL; aer = calloc(1, sizeof(struct pci_nvme_aer)); if (aer == NULL) return (-1); /* Save the Command ID for use in the completion message */ aer->cid = cid; pthread_mutex_lock(&sc->aer_mtx); sc->aer_count++; STAILQ_INSERT_TAIL(&sc->aer_list, aer, link); pthread_mutex_unlock(&sc->aer_mtx); return (0); } /* * Get an Async Event Request structure * * Returns a pointer to an AER previously submitted by the host or NULL if * no AER's exist. Caller is responsible for freeing the returned struct. */ static struct pci_nvme_aer * pci_nvme_aer_get(struct pci_nvme_softc *sc) { struct pci_nvme_aer *aer = NULL; pthread_mutex_lock(&sc->aer_mtx); aer = STAILQ_FIRST(&sc->aer_list); if (aer != NULL) { STAILQ_REMOVE_HEAD(&sc->aer_list, link); sc->aer_count--; } pthread_mutex_unlock(&sc->aer_mtx); return (aer); } static void pci_nvme_aen_reset(struct pci_nvme_softc *sc) { uint32_t atype; memset(sc->aen, 0, PCI_NVME_AE_TYPE_MAX * sizeof(struct pci_nvme_aen)); for (atype = 0; atype < PCI_NVME_AE_TYPE_MAX; atype++) { sc->aen[atype].atype = atype; } } static void pci_nvme_aen_init(struct pci_nvme_softc *sc) { char nstr[80]; pci_nvme_aen_reset(sc); pthread_mutex_init(&sc->aen_mtx, NULL); pthread_create(&sc->aen_tid, NULL, aen_thr, sc); snprintf(nstr, sizeof(nstr), "nvme-aen-%d:%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); pthread_set_name_np(sc->aen_tid, nstr); } static void pci_nvme_aen_destroy(struct pci_nvme_softc *sc) { pci_nvme_aen_reset(sc); } /* Notify the AEN thread of pending work */ static void pci_nvme_aen_notify(struct pci_nvme_softc *sc) { pthread_cond_signal(&sc->aen_cond); } /* * Post an Asynchronous Event Notification */ static int32_t pci_nvme_aen_post(struct pci_nvme_softc *sc, pci_nvme_async_type atype, uint32_t event_data) { struct pci_nvme_aen *aen; if (atype >= PCI_NVME_AE_TYPE_MAX) { return(EINVAL); } pthread_mutex_lock(&sc->aen_mtx); aen = &sc->aen[atype]; /* Has the controller already posted an event of this type? */ if (aen->posted) { pthread_mutex_unlock(&sc->aen_mtx); return(EALREADY); } aen->event_data = event_data; aen->posted = true; pthread_mutex_unlock(&sc->aen_mtx); pci_nvme_aen_notify(sc); return(0); } static void pci_nvme_aen_process(struct pci_nvme_softc *sc) { struct pci_nvme_aer *aer; struct pci_nvme_aen *aen; pci_nvme_async_type atype; uint32_t mask; uint16_t status; uint8_t lid; assert(pthread_mutex_isowned_np(&sc->aen_mtx)); for (atype = 0; atype < PCI_NVME_AE_TYPE_MAX; atype++) { aen = &sc->aen[atype]; /* Previous iterations may have depleted the available AER's */ if (!pci_nvme_aer_available(sc)) { DPRINTF("%s: no AER", __func__); break; } if (!aen->posted) { DPRINTF("%s: no AEN posted for atype=%#x", __func__, atype); continue; } status = NVME_SC_SUCCESS; /* Is the event masked? */ mask = sc->feat[NVME_FEAT_ASYNC_EVENT_CONFIGURATION].cdw11; DPRINTF("%s: atype=%#x mask=%#x event_data=%#x", __func__, atype, mask, aen->event_data); switch (atype) { case PCI_NVME_AE_TYPE_ERROR: lid = NVME_LOG_ERROR; break; case PCI_NVME_AE_TYPE_SMART: mask &= 0xff; if ((mask & aen->event_data) == 0) continue; lid = NVME_LOG_HEALTH_INFORMATION; break; case PCI_NVME_AE_TYPE_NOTICE: if (aen->event_data >= PCI_NVME_AEI_NOTICE_MAX) { EPRINTLN("%s unknown AEN notice type %u", __func__, aen->event_data); status = NVME_SC_INTERNAL_DEVICE_ERROR; break; } if ((PCI_NVME_AEI_NOTICE_MASK(aen->event_data) & mask) == 0) continue; switch (aen->event_data) { case PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED: lid = NVME_LOG_CHANGED_NAMESPACE; break; case PCI_NVME_AEI_NOTICE_FW_ACTIVATION: lid = NVME_LOG_FIRMWARE_SLOT; break; case PCI_NVME_AEI_NOTICE_TELEMETRY_CHANGE: lid = NVME_LOG_TELEMETRY_CONTROLLER_INITIATED; break; case PCI_NVME_AEI_NOTICE_ANA_CHANGE: lid = NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS; break; case PCI_NVME_AEI_NOTICE_PREDICT_LATENCY_CHANGE: lid = NVME_LOG_PREDICTABLE_LATENCY_EVENT_AGGREGATE; break; case PCI_NVME_AEI_NOTICE_LBA_STATUS_ALERT: lid = NVME_LOG_LBA_STATUS_INFORMATION; break; case PCI_NVME_AEI_NOTICE_ENDURANCE_GROUP_CHANGE: lid = NVME_LOG_ENDURANCE_GROUP_EVENT_AGGREGATE; break; default: lid = 0; } break; default: /* bad type?!? */ EPRINTLN("%s unknown AEN type %u", __func__, atype); status = NVME_SC_INTERNAL_DEVICE_ERROR; break; } aer = pci_nvme_aer_get(sc); assert(aer != NULL); DPRINTF("%s: CID=%#x CDW0=%#x", __func__, aer->cid, (lid << 16) | (aen->event_data << 8) | atype); pci_nvme_cq_update(sc, &sc->compl_queues[0], (lid << 16) | (aen->event_data << 8) | atype, /* cdw0 */ aer->cid, 0, /* SQID */ status); aen->event_data = 0; aen->posted = false; pci_generate_msix(sc->nsc_pi, 0); } } static void * aen_thr(void *arg) { struct pci_nvme_softc *sc; sc = arg; pthread_mutex_lock(&sc->aen_mtx); for (;;) { pci_nvme_aen_process(sc); pthread_cond_wait(&sc->aen_cond, &sc->aen_mtx); } pthread_mutex_unlock(&sc->aen_mtx); pthread_exit(NULL); return (NULL); } static void pci_nvme_reset_locked(struct pci_nvme_softc *sc) { uint32_t i; DPRINTF("%s", __func__); sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) | (1 << NVME_CAP_LO_REG_CQR_SHIFT) | (60 << NVME_CAP_LO_REG_TO_SHIFT); sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT; sc->regs.vs = NVME_REV(1,4); /* NVMe v1.4 */ sc->regs.cc = 0; assert(sc->submit_queues != NULL); for (i = 0; i < sc->num_squeues + 1; i++) { sc->submit_queues[i].qbase = NULL; sc->submit_queues[i].size = 0; sc->submit_queues[i].cqid = 0; sc->submit_queues[i].tail = 0; sc->submit_queues[i].head = 0; } assert(sc->compl_queues != NULL); for (i = 0; i < sc->num_cqueues + 1; i++) { sc->compl_queues[i].qbase = NULL; sc->compl_queues[i].size = 0; sc->compl_queues[i].tail = 0; sc->compl_queues[i].head = 0; } sc->num_q_is_set = false; pci_nvme_aer_destroy(sc); pci_nvme_aen_destroy(sc); /* * Clear CSTS.RDY last to prevent the host from enabling Controller * before cleanup completes */ sc->regs.csts = 0; } static void pci_nvme_reset(struct pci_nvme_softc *sc) { pthread_mutex_lock(&sc->mtx); pci_nvme_reset_locked(sc); pthread_mutex_unlock(&sc->mtx); } static int pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc) { uint16_t acqs, asqs; DPRINTF("%s", __func__); /* * NVMe 2.0 states that "enabling a controller while this field is * cleared to 0h produces undefined results" for both ACQS and * ASQS. If zero, set CFS and do not become ready. */ asqs = ONE_BASED(sc->regs.aqa & NVME_AQA_REG_ASQS_MASK); if (asqs < 2) { EPRINTLN("%s: illegal ASQS value %#x (aqa=%#x)", __func__, asqs - 1, sc->regs.aqa); sc->regs.csts |= NVME_CSTS_CFS; return (-1); } sc->submit_queues[0].size = asqs; sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq, sizeof(struct nvme_command) * asqs); if (sc->submit_queues[0].qbase == NULL) { EPRINTLN("%s: ASQ vm_map_gpa(%lx) failed", __func__, sc->regs.asq); sc->regs.csts |= NVME_CSTS_CFS; return (-1); } DPRINTF("%s mapping Admin-SQ guest 0x%lx, host: %p", __func__, sc->regs.asq, sc->submit_queues[0].qbase); acqs = ONE_BASED((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) & NVME_AQA_REG_ACQS_MASK); if (acqs < 2) { EPRINTLN("%s: illegal ACQS value %#x (aqa=%#x)", __func__, acqs - 1, sc->regs.aqa); sc->regs.csts |= NVME_CSTS_CFS; return (-1); } sc->compl_queues[0].size = acqs; sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq, sizeof(struct nvme_completion) * acqs); if (sc->compl_queues[0].qbase == NULL) { EPRINTLN("%s: ACQ vm_map_gpa(%lx) failed", __func__, sc->regs.acq); sc->regs.csts |= NVME_CSTS_CFS; return (-1); } sc->compl_queues[0].intr_en = NVME_CQ_INTEN; DPRINTF("%s mapping Admin-CQ guest 0x%lx, host: %p", __func__, sc->regs.acq, sc->compl_queues[0].qbase); return (0); } static int nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *b, size_t len, enum nvme_copy_dir dir) { uint8_t *p; size_t bytes; if (len > (8 * 1024)) { return (-1); } /* Copy from the start of prp1 to the end of the physical page */ bytes = PAGE_SIZE - (prp1 & PAGE_MASK); bytes = MIN(bytes, len); p = vm_map_gpa(ctx, prp1, bytes); if (p == NULL) { return (-1); } if (dir == NVME_COPY_TO_PRP) memcpy(p, b, bytes); else memcpy(b, p, bytes); b += bytes; len -= bytes; if (len == 0) { return (0); } len = MIN(len, PAGE_SIZE); p = vm_map_gpa(ctx, prp2, len); if (p == NULL) { return (-1); } if (dir == NVME_COPY_TO_PRP) memcpy(p, b, len); else memcpy(b, p, len); return (0); } /* * Write a Completion Queue Entry update * * Write the completion and update the doorbell value */ static void pci_nvme_cq_update(struct pci_nvme_softc *sc, struct nvme_completion_queue *cq, uint32_t cdw0, uint16_t cid, uint16_t sqid, uint16_t status) { struct nvme_submission_queue *sq = &sc->submit_queues[sqid]; struct nvme_completion *cqe; assert(cq->qbase != NULL); pthread_mutex_lock(&cq->mtx); cqe = &cq->qbase[cq->tail]; /* Flip the phase bit */ status |= (cqe->status ^ NVME_STATUS_P) & NVME_STATUS_P_MASK; cqe->cdw0 = cdw0; cqe->sqhd = sq->head; cqe->sqid = sqid; cqe->cid = cid; cqe->status = status; cq->tail++; if (cq->tail >= cq->size) { cq->tail = 0; } pthread_mutex_unlock(&cq->mtx); } static int nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command, struct nvme_completion* compl) { uint16_t qid = command->cdw10 & 0xffff; DPRINTF("%s DELETE_IO_SQ %u", __func__, qid); if (qid == 0 || qid > sc->num_squeues || (sc->submit_queues[qid].qbase == NULL)) { WPRINTF("%s NOT PERMITTED queue id %u / num_squeues %u", __func__, qid, sc->num_squeues); pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, NVME_SC_INVALID_QUEUE_IDENTIFIER); return (1); } sc->submit_queues[qid].qbase = NULL; sc->submit_queues[qid].cqid = 0; pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); return (1); } static int nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command, struct nvme_completion* compl) { if (command->cdw11 & NVME_CMD_CDW11_PC) { uint16_t qid = command->cdw10 & 0xffff; struct nvme_submission_queue *nsq; if ((qid == 0) || (qid > sc->num_squeues) || (sc->submit_queues[qid].qbase != NULL)) { WPRINTF("%s queue index %u > num_squeues %u", __func__, qid, sc->num_squeues); pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, NVME_SC_INVALID_QUEUE_IDENTIFIER); return (1); } nsq = &sc->submit_queues[qid]; nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff); DPRINTF("%s size=%u (max=%u)", __func__, nsq->size, sc->max_qentries); if ((nsq->size < 2) || (nsq->size > sc->max_qentries)) { /* * Queues must specify at least two entries * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec */ pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED); return (1); } nsq->head = nsq->tail = 0; nsq->cqid = (command->cdw11 >> 16) & 0xffff; if ((nsq->cqid == 0) || (nsq->cqid > sc->num_cqueues)) { pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, NVME_SC_INVALID_QUEUE_IDENTIFIER); return (1); } if (sc->compl_queues[nsq->cqid].qbase == NULL) { pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, NVME_SC_COMPLETION_QUEUE_INVALID); return (1); } nsq->qpriority = (command->cdw11 >> 1) & 0x03; nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, sizeof(struct nvme_command) * (size_t)nsq->size); DPRINTF("%s sq %u size %u gaddr %p cqid %u", __func__, qid, nsq->size, nsq->qbase, nsq->cqid); pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); DPRINTF("%s completed creating IOSQ qid %u", __func__, qid); } else { /* * Guest sent non-cont submission queue request. * This setting is unsupported by this emulation. */ WPRINTF("%s unsupported non-contig (list-based) " "create i/o submission queue", __func__); pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); } return (1); } static int nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command, struct nvme_completion* compl) { uint16_t qid = command->cdw10 & 0xffff; uint16_t sqid; DPRINTF("%s DELETE_IO_CQ %u", __func__, qid); if (qid == 0 || qid > sc->num_cqueues || (sc->compl_queues[qid].qbase == NULL)) { WPRINTF("%s queue index %u / num_cqueues %u", __func__, qid, sc->num_cqueues); pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, NVME_SC_INVALID_QUEUE_IDENTIFIER); return (1); } /* Deleting an Active CQ is an error */ for (sqid = 1; sqid < sc->num_squeues + 1; sqid++) if (sc->submit_queues[sqid].cqid == qid) { pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, NVME_SC_INVALID_QUEUE_DELETION); return (1); } sc->compl_queues[qid].qbase = NULL; pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); return (1); } static int nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command, struct nvme_completion* compl) { struct nvme_completion_queue *ncq; uint16_t qid = command->cdw10 & 0xffff; /* Only support Physically Contiguous queues */ if ((command->cdw11 & NVME_CMD_CDW11_PC) == 0) { WPRINTF("%s unsupported non-contig (list-based) " "create i/o completion queue", __func__); pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); return (1); } if ((qid == 0) || (qid > sc->num_cqueues) || (sc->compl_queues[qid].qbase != NULL)) { WPRINTF("%s queue index %u > num_cqueues %u", __func__, qid, sc->num_cqueues); pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, NVME_SC_INVALID_QUEUE_IDENTIFIER); return (1); } ncq = &sc->compl_queues[qid]; ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1; ncq->intr_vec = (command->cdw11 >> 16) & 0xffff; if (ncq->intr_vec > (sc->max_queues + 1)) { pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, NVME_SC_INVALID_INTERRUPT_VECTOR); return (1); } ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff); if ((ncq->size < 2) || (ncq->size > sc->max_qentries)) { /* * Queues must specify at least two entries * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec */ pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED); return (1); } ncq->head = ncq->tail = 0; ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, sizeof(struct nvme_command) * (size_t)ncq->size); pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); return (1); } static int nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command, struct nvme_completion* compl) { uint64_t logoff; uint32_t logsize; uint8_t logpage; pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); /* * Command specifies the number of dwords to return in fields NUMDU * and NUMDL. This is a zero-based value. */ logpage = command->cdw10 & 0xFF; logsize = ((command->cdw11 << 16) | (command->cdw10 >> 16)) + 1; logsize *= sizeof(uint32_t); logoff = ((uint64_t)(command->cdw13) << 32) | command->cdw12; DPRINTF("%s log page %u len %u", __func__, logpage, logsize); switch (logpage) { case NVME_LOG_ERROR: if (logoff >= sizeof(sc->err_log)) { pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); break; } nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, command->prp2, (uint8_t *)&sc->err_log + logoff, MIN(logsize - logoff, sizeof(sc->err_log)), NVME_COPY_TO_PRP); break; case NVME_LOG_HEALTH_INFORMATION: if (logoff >= sizeof(sc->health_log)) { pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); break; } pthread_mutex_lock(&sc->mtx); memcpy(&sc->health_log.data_units_read, &sc->read_data_units, sizeof(sc->health_log.data_units_read)); memcpy(&sc->health_log.data_units_written, &sc->write_data_units, sizeof(sc->health_log.data_units_written)); memcpy(&sc->health_log.host_read_commands, &sc->read_commands, sizeof(sc->health_log.host_read_commands)); memcpy(&sc->health_log.host_write_commands, &sc->write_commands, sizeof(sc->health_log.host_write_commands)); pthread_mutex_unlock(&sc->mtx); nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, command->prp2, (uint8_t *)&sc->health_log + logoff, MIN(logsize - logoff, sizeof(sc->health_log)), NVME_COPY_TO_PRP); break; case NVME_LOG_FIRMWARE_SLOT: if (logoff >= sizeof(sc->fw_log)) { pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); break; } nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, command->prp2, (uint8_t *)&sc->fw_log + logoff, MIN(logsize - logoff, sizeof(sc->fw_log)), NVME_COPY_TO_PRP); break; case NVME_LOG_CHANGED_NAMESPACE: if (logoff >= sizeof(sc->ns_log)) { pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); break; } nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, command->prp2, (uint8_t *)&sc->ns_log + logoff, MIN(logsize - logoff, sizeof(sc->ns_log)), NVME_COPY_TO_PRP); memset(&sc->ns_log, 0, sizeof(sc->ns_log)); break; default: DPRINTF("%s get log page %x command not supported", __func__, logpage); pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, NVME_SC_INVALID_LOG_PAGE); } return (1); } static int nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command, struct nvme_completion* compl) { void *dest; uint16_t status; DPRINTF("%s identify 0x%x nsid 0x%x", __func__, command->cdw10 & 0xFF, command->nsid); pci_nvme_status_genc(&status, NVME_SC_SUCCESS); switch (command->cdw10 & 0xFF) { case 0x00: /* return Identify Namespace data structure */ /* Global NS only valid with NS Management */ if (command->nsid == NVME_GLOBAL_NAMESPACE_TAG) { pci_nvme_status_genc(&status, NVME_SC_INVALID_NAMESPACE_OR_FORMAT); break; } nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata), NVME_COPY_TO_PRP); break; case 0x01: /* return Identify Controller data structure */ nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, command->prp2, (uint8_t *)&sc->ctrldata, sizeof(sc->ctrldata), NVME_COPY_TO_PRP); break; case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */ dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, sizeof(uint32_t) * 1024); /* All unused entries shall be zero */ memset(dest, 0, sizeof(uint32_t) * 1024); ((uint32_t *)dest)[0] = 1; break; case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */ if (command->nsid != 1) { pci_nvme_status_genc(&status, NVME_SC_INVALID_NAMESPACE_OR_FORMAT); break; } dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, sizeof(uint32_t) * 1024); /* All bytes after the descriptor shall be zero */ memset(dest, 0, sizeof(uint32_t) * 1024); /* Return NIDT=1 (i.e. EUI64) descriptor */ ((uint8_t *)dest)[0] = 1; ((uint8_t *)dest)[1] = sizeof(uint64_t); memcpy(((uint8_t *)dest) + 4, sc->nsdata.eui64, sizeof(uint64_t)); break; case 0x13: /* * Controller list is optional but used by UNH tests. Return * a valid but empty list. */ dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, sizeof(uint16_t) * 2048); memset(dest, 0, sizeof(uint16_t) * 2048); break; default: DPRINTF("%s unsupported identify command requested 0x%x", __func__, command->cdw10 & 0xFF); pci_nvme_status_genc(&status, NVME_SC_INVALID_FIELD); break; } compl->status = status; return (1); } static const char * nvme_fid_to_name(uint8_t fid) { const char *name; switch (fid) { case NVME_FEAT_ARBITRATION: name = "Arbitration"; break; case NVME_FEAT_POWER_MANAGEMENT: name = "Power Management"; break; case NVME_FEAT_LBA_RANGE_TYPE: name = "LBA Range Type"; break; case NVME_FEAT_TEMPERATURE_THRESHOLD: name = "Temperature Threshold"; break; case NVME_FEAT_ERROR_RECOVERY: name = "Error Recovery"; break; case NVME_FEAT_VOLATILE_WRITE_CACHE: name = "Volatile Write Cache"; break; case NVME_FEAT_NUMBER_OF_QUEUES: name = "Number of Queues"; break; case NVME_FEAT_INTERRUPT_COALESCING: name = "Interrupt Coalescing"; break; case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION: name = "Interrupt Vector Configuration"; break; case NVME_FEAT_WRITE_ATOMICITY: name = "Write Atomicity Normal"; break; case NVME_FEAT_ASYNC_EVENT_CONFIGURATION: name = "Asynchronous Event Configuration"; break; case NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION: name = "Autonomous Power State Transition"; break; case NVME_FEAT_HOST_MEMORY_BUFFER: name = "Host Memory Buffer"; break; case NVME_FEAT_TIMESTAMP: name = "Timestamp"; break; case NVME_FEAT_KEEP_ALIVE_TIMER: name = "Keep Alive Timer"; break; case NVME_FEAT_HOST_CONTROLLED_THERMAL_MGMT: name = "Host Controlled Thermal Management"; break; case NVME_FEAT_NON_OP_POWER_STATE_CONFIG: name = "Non-Operation Power State Config"; break; case NVME_FEAT_READ_RECOVERY_LEVEL_CONFIG: name = "Read Recovery Level Config"; break; case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG: name = "Predictable Latency Mode Config"; break; case NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW: name = "Predictable Latency Mode Window"; break; case NVME_FEAT_LBA_STATUS_INFORMATION_ATTRIBUTES: name = "LBA Status Information Report Interval"; break; case NVME_FEAT_HOST_BEHAVIOR_SUPPORT: name = "Host Behavior Support"; break; case NVME_FEAT_SANITIZE_CONFIG: name = "Sanitize Config"; break; case NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION: name = "Endurance Group Event Configuration"; break; case NVME_FEAT_SOFTWARE_PROGRESS_MARKER: name = "Software Progress Marker"; break; case NVME_FEAT_HOST_IDENTIFIER: name = "Host Identifier"; break; case NVME_FEAT_RESERVATION_NOTIFICATION_MASK: name = "Reservation Notification Mask"; break; case NVME_FEAT_RESERVATION_PERSISTENCE: name = "Reservation Persistence"; break; case NVME_FEAT_NAMESPACE_WRITE_PROTECTION_CONFIG: name = "Namespace Write Protection Config"; break; default: name = "Unknown"; break; } return (name); } static void nvme_feature_invalid_cb(struct pci_nvme_softc *sc __unused, struct nvme_feature_obj *feat __unused, struct nvme_command *command __unused, struct nvme_completion *compl) { pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); } static void nvme_feature_iv_config(struct pci_nvme_softc *sc, struct nvme_feature_obj *feat __unused, struct nvme_command *command, struct nvme_completion *compl) { uint32_t i; uint32_t cdw11 = command->cdw11; uint16_t iv; bool cd; pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); iv = cdw11 & 0xffff; cd = cdw11 & (1 << 16); if (iv > (sc->max_queues + 1)) { return; } /* No Interrupt Coalescing (i.e. not Coalescing Disable) for Admin Q */ if ((iv == 0) && !cd) return; /* Requested Interrupt Vector must be used by a CQ */ for (i = 0; i < sc->num_cqueues + 1; i++) { if (sc->compl_queues[i].intr_vec == iv) { pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); } } } #define NVME_ASYNC_EVENT_ENDURANCE_GROUP (0x4000) static void nvme_feature_async_event(struct pci_nvme_softc *sc __unused, struct nvme_feature_obj *feat __unused, struct nvme_command *command, struct nvme_completion *compl) { if (command->cdw11 & NVME_ASYNC_EVENT_ENDURANCE_GROUP) pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); } #define NVME_TEMP_THRESH_OVER 0 #define NVME_TEMP_THRESH_UNDER 1 static void nvme_feature_temperature(struct pci_nvme_softc *sc, struct nvme_feature_obj *feat __unused, struct nvme_command *command, struct nvme_completion *compl) { uint16_t tmpth; /* Temperature Threshold */ uint8_t tmpsel; /* Threshold Temperature Select */ uint8_t thsel; /* Threshold Type Select */ bool set_crit = false; tmpth = command->cdw11 & 0xffff; tmpsel = (command->cdw11 >> 16) & 0xf; thsel = (command->cdw11 >> 20) & 0x3; DPRINTF("%s: tmpth=%#x tmpsel=%#x thsel=%#x", __func__, tmpth, tmpsel, thsel); /* Check for unsupported values */ if (((tmpsel != 0) && (tmpsel != 0xf)) || (thsel > NVME_TEMP_THRESH_UNDER)) { pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); return; } if (((thsel == NVME_TEMP_THRESH_OVER) && (NVME_TEMPERATURE >= tmpth)) || ((thsel == NVME_TEMP_THRESH_UNDER) && (NVME_TEMPERATURE <= tmpth))) set_crit = true; pthread_mutex_lock(&sc->mtx); if (set_crit) sc->health_log.critical_warning |= NVME_CRIT_WARN_ST_TEMPERATURE; else sc->health_log.critical_warning &= ~NVME_CRIT_WARN_ST_TEMPERATURE; pthread_mutex_unlock(&sc->mtx); if (set_crit) pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_SMART, sc->health_log.critical_warning); DPRINTF("%s: set_crit=%c critical_warning=%#x status=%#x", __func__, set_crit ? 'T':'F', sc->health_log.critical_warning, compl->status); } static void nvme_feature_num_queues(struct pci_nvme_softc *sc, struct nvme_feature_obj *feat __unused, struct nvme_command *command, struct nvme_completion *compl) { uint16_t nqr; /* Number of Queues Requested */ if (sc->num_q_is_set) { WPRINTF("%s: Number of Queues already set", __func__); pci_nvme_status_genc(&compl->status, NVME_SC_COMMAND_SEQUENCE_ERROR); return; } nqr = command->cdw11 & 0xFFFF; if (nqr == 0xffff) { WPRINTF("%s: Illegal NSQR value %#x", __func__, nqr); pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); return; } sc->num_squeues = ONE_BASED(nqr); if (sc->num_squeues > sc->max_queues) { DPRINTF("NSQR=%u is greater than max %u", sc->num_squeues, sc->max_queues); sc->num_squeues = sc->max_queues; } nqr = (command->cdw11 >> 16) & 0xFFFF; if (nqr == 0xffff) { WPRINTF("%s: Illegal NCQR value %#x", __func__, nqr); pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); return; } sc->num_cqueues = ONE_BASED(nqr); if (sc->num_cqueues > sc->max_queues) { DPRINTF("NCQR=%u is greater than max %u", sc->num_cqueues, sc->max_queues); sc->num_cqueues = sc->max_queues; } /* Patch the command value which will be saved on callback's return */ command->cdw11 = NVME_FEATURE_NUM_QUEUES(sc); compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc); sc->num_q_is_set = true; } static int nvme_opc_set_features(struct pci_nvme_softc *sc, struct nvme_command *command, struct nvme_completion *compl) { struct nvme_feature_obj *feat; uint32_t nsid = command->nsid; - uint8_t fid = command->cdw10 & 0xFF; + uint8_t fid = NVMEV(NVME_FEAT_SET_FID, command->cdw10); + bool sv = NVMEV(NVME_FEAT_SET_SV, command->cdw10); DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid)); if (fid >= NVME_FID_MAX) { DPRINTF("%s invalid feature 0x%x", __func__, fid); pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); return (1); } + + if (sv) { + pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, + NVME_SC_FEATURE_NOT_SAVEABLE); + return (1); + } + feat = &sc->feat[fid]; if (feat->namespace_specific && (nsid == NVME_GLOBAL_NAMESPACE_TAG)) { pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); return (1); } if (!feat->namespace_specific && !((nsid == 0) || (nsid == NVME_GLOBAL_NAMESPACE_TAG))) { pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, NVME_SC_FEATURE_NOT_NS_SPECIFIC); return (1); } compl->cdw0 = 0; pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); if (feat->set) feat->set(sc, feat, command, compl); DPRINTF("%s: status=%#x cdw11=%#x", __func__, compl->status, command->cdw11); if (compl->status == NVME_SC_SUCCESS) { feat->cdw11 = command->cdw11; if ((fid == NVME_FEAT_ASYNC_EVENT_CONFIGURATION) && (command->cdw11 != 0)) pci_nvme_aen_notify(sc); } return (0); } #define NVME_FEATURES_SEL_SUPPORTED 0x3 #define NVME_FEATURES_NS_SPECIFIC (1 << 1) static int nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command, struct nvme_completion* compl) { struct nvme_feature_obj *feat; uint8_t fid = command->cdw10 & 0xFF; uint8_t sel = (command->cdw10 >> 8) & 0x7; DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid)); if (fid >= NVME_FID_MAX) { DPRINTF("%s invalid feature 0x%x", __func__, fid); pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); return (1); } compl->cdw0 = 0; pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); feat = &sc->feat[fid]; if (feat->get) { feat->get(sc, feat, command, compl); } if (compl->status == NVME_SC_SUCCESS) { if ((sel == NVME_FEATURES_SEL_SUPPORTED) && feat->namespace_specific) compl->cdw0 = NVME_FEATURES_NS_SPECIFIC; else compl->cdw0 = feat->cdw11; } return (0); } static int nvme_opc_format_nvm(struct pci_nvme_softc* sc, struct nvme_command* command, struct nvme_completion* compl) { uint8_t ses, lbaf, pi; /* Only supports Secure Erase Setting - User Data Erase */ ses = (command->cdw10 >> 9) & 0x7; if (ses > 0x1) { pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); return (1); } /* Only supports a single LBA Format */ lbaf = command->cdw10 & 0xf; if (lbaf != 0) { pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, NVME_SC_INVALID_FORMAT); return (1); } /* Doesn't support Protection Infomation */ pi = (command->cdw10 >> 5) & 0x7; if (pi != 0) { pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); return (1); } if (sc->nvstore.type == NVME_STOR_RAM) { if (sc->nvstore.ctx) free(sc->nvstore.ctx); sc->nvstore.ctx = calloc(1, sc->nvstore.size); pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); } else { struct pci_nvme_ioreq *req; int err; req = pci_nvme_get_ioreq(sc); if (req == NULL) { pci_nvme_status_genc(&compl->status, NVME_SC_INTERNAL_DEVICE_ERROR); WPRINTF("%s: unable to allocate IO req", __func__); return (1); } req->nvme_sq = &sc->submit_queues[0]; req->sqid = 0; req->opc = command->opc; req->cid = command->cid; req->nsid = command->nsid; req->io_req.br_offset = 0; req->io_req.br_resid = sc->nvstore.size; req->io_req.br_callback = pci_nvme_io_done; err = blockif_delete(sc->nvstore.ctx, &req->io_req); if (err) { pci_nvme_status_genc(&compl->status, NVME_SC_INTERNAL_DEVICE_ERROR); pci_nvme_release_ioreq(sc, req); } else compl->status = NVME_NO_STATUS; } return (1); } static int nvme_opc_abort(struct pci_nvme_softc *sc __unused, struct nvme_command *command, struct nvme_completion *compl) { DPRINTF("%s submission queue %u, command ID 0x%x", __func__, command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF); /* TODO: search for the command ID and abort it */ compl->cdw0 = 1; pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); return (1); } static int nvme_opc_async_event_req(struct pci_nvme_softc* sc, struct nvme_command* command, struct nvme_completion* compl) { DPRINTF("%s async event request count=%u aerl=%u cid=%#x", __func__, sc->aer_count, sc->ctrldata.aerl, command->cid); /* Don't exceed the Async Event Request Limit (AERL). */ if (pci_nvme_aer_limit_reached(sc)) { pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED); return (1); } if (pci_nvme_aer_add(sc, command->cid)) { pci_nvme_status_tc(&compl->status, NVME_SCT_GENERIC, NVME_SC_INTERNAL_DEVICE_ERROR); return (1); } /* * Raise events when they happen based on the Set Features cmd. * These events happen async, so only set completion successful if * there is an event reflective of the request to get event. */ compl->status = NVME_NO_STATUS; pci_nvme_aen_notify(sc); return (0); } static void pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value) { struct nvme_completion compl; struct nvme_command *cmd; struct nvme_submission_queue *sq; struct nvme_completion_queue *cq; uint16_t sqhead; DPRINTF("%s index %u", __func__, (uint32_t)value); sq = &sc->submit_queues[0]; cq = &sc->compl_queues[0]; pthread_mutex_lock(&sq->mtx); sqhead = sq->head; DPRINTF("sqhead %u, tail %u", sqhead, sq->tail); while (sqhead != atomic_load_acq_short(&sq->tail)) { cmd = &(sq->qbase)[sqhead]; compl.cdw0 = 0; compl.status = 0; switch (cmd->opc) { case NVME_OPC_DELETE_IO_SQ: DPRINTF("%s command DELETE_IO_SQ", __func__); nvme_opc_delete_io_sq(sc, cmd, &compl); break; case NVME_OPC_CREATE_IO_SQ: DPRINTF("%s command CREATE_IO_SQ", __func__); nvme_opc_create_io_sq(sc, cmd, &compl); break; case NVME_OPC_DELETE_IO_CQ: DPRINTF("%s command DELETE_IO_CQ", __func__); nvme_opc_delete_io_cq(sc, cmd, &compl); break; case NVME_OPC_CREATE_IO_CQ: DPRINTF("%s command CREATE_IO_CQ", __func__); nvme_opc_create_io_cq(sc, cmd, &compl); break; case NVME_OPC_GET_LOG_PAGE: DPRINTF("%s command GET_LOG_PAGE", __func__); nvme_opc_get_log_page(sc, cmd, &compl); break; case NVME_OPC_IDENTIFY: DPRINTF("%s command IDENTIFY", __func__); nvme_opc_identify(sc, cmd, &compl); break; case NVME_OPC_ABORT: DPRINTF("%s command ABORT", __func__); nvme_opc_abort(sc, cmd, &compl); break; case NVME_OPC_SET_FEATURES: DPRINTF("%s command SET_FEATURES", __func__); nvme_opc_set_features(sc, cmd, &compl); break; case NVME_OPC_GET_FEATURES: DPRINTF("%s command GET_FEATURES", __func__); nvme_opc_get_features(sc, cmd, &compl); break; case NVME_OPC_FIRMWARE_ACTIVATE: DPRINTF("%s command FIRMWARE_ACTIVATE", __func__); pci_nvme_status_tc(&compl.status, NVME_SCT_COMMAND_SPECIFIC, NVME_SC_INVALID_FIRMWARE_SLOT); break; case NVME_OPC_ASYNC_EVENT_REQUEST: DPRINTF("%s command ASYNC_EVENT_REQ", __func__); nvme_opc_async_event_req(sc, cmd, &compl); break; case NVME_OPC_FORMAT_NVM: DPRINTF("%s command FORMAT_NVM", __func__); if ((sc->ctrldata.oacs & (1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT)) == 0) { pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE); break; } nvme_opc_format_nvm(sc, cmd, &compl); break; case NVME_OPC_SECURITY_SEND: case NVME_OPC_SECURITY_RECEIVE: case NVME_OPC_SANITIZE: case NVME_OPC_GET_LBA_STATUS: DPRINTF("%s command OPC=%#x (unsupported)", __func__, cmd->opc); /* Valid but unsupported opcodes */ pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_FIELD); break; default: DPRINTF("%s command OPC=%#X (not implemented)", __func__, cmd->opc); pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE); } sqhead = (sqhead + 1) % sq->size; if (NVME_COMPLETION_VALID(compl)) { pci_nvme_cq_update(sc, &sc->compl_queues[0], compl.cdw0, cmd->cid, 0, /* SQID */ compl.status); } } DPRINTF("setting sqhead %u", sqhead); sq->head = sqhead; if (cq->head != cq->tail) pci_generate_msix(sc->nsc_pi, 0); pthread_mutex_unlock(&sq->mtx); } /* * Update the Write and Read statistics reported in SMART data * * NVMe defines "data unit" as thousand's of 512 byte blocks and is rounded up. * E.g. 1 data unit is 1 - 1,000 512 byte blocks. 3 data units are 2,001 - 3,000 * 512 byte blocks. Rounding up is acheived by initializing the remainder to 999. */ static void pci_nvme_stats_write_read_update(struct pci_nvme_softc *sc, uint8_t opc, size_t bytes, uint16_t status) { pthread_mutex_lock(&sc->mtx); switch (opc) { case NVME_OPC_WRITE: sc->write_commands++; if (status != NVME_SC_SUCCESS) break; sc->write_dunits_remainder += (bytes / 512); while (sc->write_dunits_remainder >= 1000) { sc->write_data_units++; sc->write_dunits_remainder -= 1000; } break; case NVME_OPC_READ: sc->read_commands++; if (status != NVME_SC_SUCCESS) break; sc->read_dunits_remainder += (bytes / 512); while (sc->read_dunits_remainder >= 1000) { sc->read_data_units++; sc->read_dunits_remainder -= 1000; } break; default: DPRINTF("%s: Invalid OPC 0x%02x for stats", __func__, opc); break; } pthread_mutex_unlock(&sc->mtx); } /* * Check if the combination of Starting LBA (slba) and number of blocks * exceeds the range of the underlying storage. * * Because NVMe specifies the SLBA in blocks as a uint64_t and blockif stores * the capacity in bytes as a uint64_t, care must be taken to avoid integer * overflow. */ static bool pci_nvme_out_of_range(struct pci_nvme_blockstore *nvstore, uint64_t slba, uint32_t nblocks) { size_t offset, bytes; /* Overflow check of multiplying Starting LBA by the sector size */ if (slba >> (64 - nvstore->sectsz_bits)) return (true); offset = slba << nvstore->sectsz_bits; bytes = nblocks << nvstore->sectsz_bits; /* Overflow check of Number of Logical Blocks */ if ((nvstore->size <= offset) || ((nvstore->size - offset) < bytes)) return (true); return (false); } static int pci_nvme_append_iov_req(struct pci_nvme_softc *sc __unused, struct pci_nvme_ioreq *req, uint64_t gpaddr, size_t size, uint64_t offset) { int iovidx; bool range_is_contiguous; if (req == NULL) return (-1); if (req->io_req.br_iovcnt == NVME_MAX_IOVEC) { return (-1); } /* * Minimize the number of IOVs by concatenating contiguous address * ranges. If the IOV count is zero, there is no previous range to * concatenate. */ if (req->io_req.br_iovcnt == 0) range_is_contiguous = false; else range_is_contiguous = (req->prev_gpaddr + req->prev_size) == gpaddr; if (range_is_contiguous) { iovidx = req->io_req.br_iovcnt - 1; req->io_req.br_iov[iovidx].iov_base = paddr_guest2host(req->sc->nsc_pi->pi_vmctx, req->prev_gpaddr, size); if (req->io_req.br_iov[iovidx].iov_base == NULL) return (-1); req->prev_size += size; req->io_req.br_resid += size; req->io_req.br_iov[iovidx].iov_len = req->prev_size; } else { iovidx = req->io_req.br_iovcnt; if (iovidx == 0) { req->io_req.br_offset = offset; req->io_req.br_resid = 0; req->io_req.br_param = req; } req->io_req.br_iov[iovidx].iov_base = paddr_guest2host(req->sc->nsc_pi->pi_vmctx, gpaddr, size); if (req->io_req.br_iov[iovidx].iov_base == NULL) return (-1); req->io_req.br_iov[iovidx].iov_len = size; req->prev_gpaddr = gpaddr; req->prev_size = size; req->io_req.br_resid += size; req->io_req.br_iovcnt++; } return (0); } static void pci_nvme_set_completion(struct pci_nvme_softc *sc, struct nvme_submission_queue *sq, int sqid, uint16_t cid, uint16_t status) { struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid]; DPRINTF("%s sqid %d cqid %u cid %u status: 0x%x 0x%x", __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status), NVME_STATUS_GET_SC(status)); pci_nvme_cq_update(sc, cq, 0, cid, sqid, status); if (cq->head != cq->tail) { if (cq->intr_en & NVME_CQ_INTEN) { pci_generate_msix(sc->nsc_pi, cq->intr_vec); } else { DPRINTF("%s: CQ%u interrupt disabled", __func__, sq->cqid); } } } static void pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req) { req->sc = NULL; req->nvme_sq = NULL; req->sqid = 0; pthread_mutex_lock(&sc->mtx); STAILQ_INSERT_TAIL(&sc->ioreqs_free, req, link); sc->pending_ios--; /* when no more IO pending, can set to ready if device reset/enabled */ if (sc->pending_ios == 0 && NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts))) sc->regs.csts |= NVME_CSTS_RDY; pthread_mutex_unlock(&sc->mtx); sem_post(&sc->iosemlock); } static struct pci_nvme_ioreq * pci_nvme_get_ioreq(struct pci_nvme_softc *sc) { struct pci_nvme_ioreq *req = NULL; sem_wait(&sc->iosemlock); pthread_mutex_lock(&sc->mtx); req = STAILQ_FIRST(&sc->ioreqs_free); assert(req != NULL); STAILQ_REMOVE_HEAD(&sc->ioreqs_free, link); req->sc = sc; sc->pending_ios++; pthread_mutex_unlock(&sc->mtx); req->io_req.br_iovcnt = 0; req->io_req.br_offset = 0; req->io_req.br_resid = 0; req->io_req.br_param = req; req->prev_gpaddr = 0; req->prev_size = 0; return req; } static void pci_nvme_io_done(struct blockif_req *br, int err) { struct pci_nvme_ioreq *req = br->br_param; struct nvme_submission_queue *sq = req->nvme_sq; uint16_t code, status; DPRINTF("%s error %d %s", __func__, err, strerror(err)); /* TODO return correct error */ code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS; pci_nvme_status_genc(&status, code); pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, status); pci_nvme_stats_write_read_update(req->sc, req->opc, req->bytes, status); pci_nvme_release_ioreq(req->sc, req); } /* * Implements the Flush command. The specification states: * If a volatile write cache is not present, Flush commands complete * successfully and have no effect * in the description of the Volatile Write Cache (VWC) field of the Identify * Controller data. Therefore, set status to Success if the command is * not supported (i.e. RAM or as indicated by the blockif). */ static bool nvme_opc_flush(struct pci_nvme_softc *sc __unused, struct nvme_command *cmd __unused, struct pci_nvme_blockstore *nvstore, struct pci_nvme_ioreq *req, uint16_t *status) { bool pending = false; if (nvstore->type == NVME_STOR_RAM) { pci_nvme_status_genc(status, NVME_SC_SUCCESS); } else { int err; req->io_req.br_callback = pci_nvme_io_done; err = blockif_flush(nvstore->ctx, &req->io_req); switch (err) { case 0: pending = true; break; case EOPNOTSUPP: pci_nvme_status_genc(status, NVME_SC_SUCCESS); break; default: pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); } } return (pending); } static uint16_t nvme_write_read_ram(struct pci_nvme_softc *sc, struct pci_nvme_blockstore *nvstore, uint64_t prp1, uint64_t prp2, size_t offset, uint64_t bytes, bool is_write) { uint8_t *buf = nvstore->ctx; enum nvme_copy_dir dir; uint16_t status; if (is_write) dir = NVME_COPY_TO_PRP; else dir = NVME_COPY_FROM_PRP; if (nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, prp1, prp2, buf + offset, bytes, dir)) pci_nvme_status_genc(&status, NVME_SC_DATA_TRANSFER_ERROR); else pci_nvme_status_genc(&status, NVME_SC_SUCCESS); return (status); } static uint16_t nvme_write_read_blockif(struct pci_nvme_softc *sc, struct pci_nvme_blockstore *nvstore, struct pci_nvme_ioreq *req, uint64_t prp1, uint64_t prp2, size_t offset, uint64_t bytes, bool is_write) { uint64_t size; int err; uint16_t status = NVME_NO_STATUS; size = MIN(PAGE_SIZE - (prp1 % PAGE_SIZE), bytes); if (pci_nvme_append_iov_req(sc, req, prp1, size, offset)) { err = -1; goto out; } offset += size; bytes -= size; if (bytes == 0) { ; } else if (bytes <= PAGE_SIZE) { size = bytes; if (pci_nvme_append_iov_req(sc, req, prp2, size, offset)) { err = -1; goto out; } } else { void *vmctx = sc->nsc_pi->pi_vmctx; uint64_t *prp_list = &prp2; uint64_t *last = prp_list; /* PRP2 is pointer to a physical region page list */ while (bytes) { /* Last entry in list points to the next list */ if ((prp_list == last) && (bytes > PAGE_SIZE)) { uint64_t prp = *prp_list; prp_list = paddr_guest2host(vmctx, prp, PAGE_SIZE - (prp % PAGE_SIZE)); if (prp_list == NULL) { err = -1; goto out; } last = prp_list + (NVME_PRP2_ITEMS - 1); } size = MIN(bytes, PAGE_SIZE); if (pci_nvme_append_iov_req(sc, req, *prp_list, size, offset)) { err = -1; goto out; } offset += size; bytes -= size; prp_list++; } } req->io_req.br_callback = pci_nvme_io_done; if (is_write) err = blockif_write(nvstore->ctx, &req->io_req); else err = blockif_read(nvstore->ctx, &req->io_req); out: if (err) pci_nvme_status_genc(&status, NVME_SC_DATA_TRANSFER_ERROR); return (status); } static bool nvme_opc_write_read(struct pci_nvme_softc *sc, struct nvme_command *cmd, struct pci_nvme_blockstore *nvstore, struct pci_nvme_ioreq *req, uint16_t *status) { uint64_t lba, nblocks, bytes; size_t offset; bool is_write = cmd->opc == NVME_OPC_WRITE; bool pending = false; lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10; nblocks = (cmd->cdw12 & 0xFFFF) + 1; bytes = nblocks << nvstore->sectsz_bits; if (bytes > NVME_MAX_DATA_SIZE) { WPRINTF("%s command would exceed MDTS", __func__); pci_nvme_status_genc(status, NVME_SC_INVALID_FIELD); goto out; } if (pci_nvme_out_of_range(nvstore, lba, nblocks)) { WPRINTF("%s command would exceed LBA range(slba=%#lx nblocks=%#lx)", __func__, lba, nblocks); pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE); goto out; } offset = lba << nvstore->sectsz_bits; req->bytes = bytes; req->io_req.br_offset = lba; /* PRP bits 1:0 must be zero */ cmd->prp1 &= ~0x3UL; cmd->prp2 &= ~0x3UL; if (nvstore->type == NVME_STOR_RAM) { *status = nvme_write_read_ram(sc, nvstore, cmd->prp1, cmd->prp2, offset, bytes, is_write); } else { *status = nvme_write_read_blockif(sc, nvstore, req, cmd->prp1, cmd->prp2, offset, bytes, is_write); if (*status == NVME_NO_STATUS) pending = true; } out: if (!pending) pci_nvme_stats_write_read_update(sc, cmd->opc, bytes, *status); return (pending); } static void pci_nvme_dealloc_sm(struct blockif_req *br, int err) { struct pci_nvme_ioreq *req = br->br_param; struct pci_nvme_softc *sc = req->sc; bool done = true; uint16_t status; if (err) { pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR); } else if ((req->prev_gpaddr + 1) == (req->prev_size)) { pci_nvme_status_genc(&status, NVME_SC_SUCCESS); } else { struct iovec *iov = req->io_req.br_iov; req->prev_gpaddr++; iov += req->prev_gpaddr; /* The iov_* values already include the sector size */ req->io_req.br_offset = (off_t)iov->iov_base; req->io_req.br_resid = iov->iov_len; if (blockif_delete(sc->nvstore.ctx, &req->io_req)) { pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR); } else done = false; } if (done) { pci_nvme_set_completion(sc, req->nvme_sq, req->sqid, req->cid, status); pci_nvme_release_ioreq(sc, req); } } static bool nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc, struct nvme_command *cmd, struct pci_nvme_blockstore *nvstore, struct pci_nvme_ioreq *req, uint16_t *status) { struct nvme_dsm_range *range = NULL; uint32_t nr, r, non_zero, dr; int err; bool pending = false; if ((sc->ctrldata.oncs & NVME_ONCS_DSM) == 0) { pci_nvme_status_genc(status, NVME_SC_INVALID_OPCODE); goto out; } nr = cmd->cdw10 & 0xff; /* copy locally because a range entry could straddle PRPs */ range = calloc(1, NVME_MAX_DSM_TRIM); if (range == NULL) { pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); goto out; } nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2, (uint8_t *)range, NVME_MAX_DSM_TRIM, NVME_COPY_FROM_PRP); /* Check for invalid ranges and the number of non-zero lengths */ non_zero = 0; for (r = 0; r <= nr; r++) { if (pci_nvme_out_of_range(nvstore, range[r].starting_lba, range[r].length)) { pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE); goto out; } if (range[r].length != 0) non_zero++; } if (cmd->cdw11 & NVME_DSM_ATTR_DEALLOCATE) { size_t offset, bytes; int sectsz_bits = sc->nvstore.sectsz_bits; /* * DSM calls are advisory only, and compliant controllers * may choose to take no actions (i.e. return Success). */ if (!nvstore->deallocate) { pci_nvme_status_genc(status, NVME_SC_SUCCESS); goto out; } /* If all ranges have a zero length, return Success */ if (non_zero == 0) { pci_nvme_status_genc(status, NVME_SC_SUCCESS); goto out; } if (req == NULL) { pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); goto out; } offset = range[0].starting_lba << sectsz_bits; bytes = range[0].length << sectsz_bits; /* * If the request is for more than a single range, store * the ranges in the br_iov. Optimize for the common case * of a single range. * * Note that NVMe Number of Ranges is a zero based value */ req->io_req.br_iovcnt = 0; req->io_req.br_offset = offset; req->io_req.br_resid = bytes; if (nr == 0) { req->io_req.br_callback = pci_nvme_io_done; } else { struct iovec *iov = req->io_req.br_iov; for (r = 0, dr = 0; r <= nr; r++) { offset = range[r].starting_lba << sectsz_bits; bytes = range[r].length << sectsz_bits; if (bytes == 0) continue; if ((nvstore->size - offset) < bytes) { pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE); goto out; } iov[dr].iov_base = (void *)offset; iov[dr].iov_len = bytes; dr++; } req->io_req.br_callback = pci_nvme_dealloc_sm; /* * Use prev_gpaddr to track the current entry and * prev_size to track the number of entries */ req->prev_gpaddr = 0; req->prev_size = dr; } err = blockif_delete(nvstore->ctx, &req->io_req); if (err) pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); else pending = true; } out: free(range); return (pending); } static void pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx) { struct nvme_submission_queue *sq; uint16_t status; uint16_t sqhead; /* handle all submissions up to sq->tail index */ sq = &sc->submit_queues[idx]; pthread_mutex_lock(&sq->mtx); sqhead = sq->head; DPRINTF("nvme_handle_io qid %u head %u tail %u cmdlist %p", idx, sqhead, sq->tail, sq->qbase); while (sqhead != atomic_load_acq_short(&sq->tail)) { struct nvme_command *cmd; struct pci_nvme_ioreq *req; uint32_t nsid; bool pending; pending = false; req = NULL; status = 0; cmd = &sq->qbase[sqhead]; sqhead = (sqhead + 1) % sq->size; nsid = le32toh(cmd->nsid); if ((nsid == 0) || (nsid > sc->ctrldata.nn)) { pci_nvme_status_genc(&status, NVME_SC_INVALID_NAMESPACE_OR_FORMAT); status |= NVME_STATUS_DNR_MASK << NVME_STATUS_DNR_SHIFT; goto complete; } req = pci_nvme_get_ioreq(sc); if (req == NULL) { pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR); WPRINTF("%s: unable to allocate IO req", __func__); goto complete; } req->nvme_sq = sq; req->sqid = idx; req->opc = cmd->opc; req->cid = cmd->cid; req->nsid = cmd->nsid; switch (cmd->opc) { case NVME_OPC_FLUSH: pending = nvme_opc_flush(sc, cmd, &sc->nvstore, req, &status); break; case NVME_OPC_WRITE: case NVME_OPC_READ: pending = nvme_opc_write_read(sc, cmd, &sc->nvstore, req, &status); break; case NVME_OPC_WRITE_ZEROES: /* TODO: write zeroes WPRINTF("%s write zeroes lba 0x%lx blocks %u", __func__, lba, cmd->cdw12 & 0xFFFF); */ pci_nvme_status_genc(&status, NVME_SC_SUCCESS); break; case NVME_OPC_DATASET_MANAGEMENT: pending = nvme_opc_dataset_mgmt(sc, cmd, &sc->nvstore, req, &status); break; default: WPRINTF("%s unhandled io command 0x%x", __func__, cmd->opc); pci_nvme_status_genc(&status, NVME_SC_INVALID_OPCODE); } complete: if (!pending) { pci_nvme_set_completion(sc, sq, idx, cmd->cid, status); if (req != NULL) pci_nvme_release_ioreq(sc, req); } } sq->head = sqhead; pthread_mutex_unlock(&sq->mtx); } static void pci_nvme_handle_doorbell(struct vmctx *ctx __unused, struct pci_nvme_softc* sc, uint64_t idx, int is_sq, uint64_t value) { DPRINTF("nvme doorbell %lu, %s, val 0x%lx", idx, is_sq ? "SQ" : "CQ", value & 0xFFFF); if (is_sq) { if (idx > sc->num_squeues) { WPRINTF("%s queue index %lu overflow from " "guest (max %u)", __func__, idx, sc->num_squeues); return; } atomic_store_short(&sc->submit_queues[idx].tail, (uint16_t)value); if (idx == 0) { pci_nvme_handle_admin_cmd(sc, value); } else { /* submission queue; handle new entries in SQ */ if (idx > sc->num_squeues) { WPRINTF("%s SQ index %lu overflow from " "guest (max %u)", __func__, idx, sc->num_squeues); return; } pci_nvme_handle_io_cmd(sc, (uint16_t)idx); } } else { if (idx > sc->num_cqueues) { WPRINTF("%s queue index %lu overflow from " "guest (max %u)", __func__, idx, sc->num_cqueues); return; } atomic_store_short(&sc->compl_queues[idx].head, (uint16_t)value); } } static void pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite) { const char *s = iswrite ? "WRITE" : "READ"; switch (offset) { case NVME_CR_CAP_LOW: DPRINTF("%s %s NVME_CR_CAP_LOW", func, s); break; case NVME_CR_CAP_HI: DPRINTF("%s %s NVME_CR_CAP_HI", func, s); break; case NVME_CR_VS: DPRINTF("%s %s NVME_CR_VS", func, s); break; case NVME_CR_INTMS: DPRINTF("%s %s NVME_CR_INTMS", func, s); break; case NVME_CR_INTMC: DPRINTF("%s %s NVME_CR_INTMC", func, s); break; case NVME_CR_CC: DPRINTF("%s %s NVME_CR_CC", func, s); break; case NVME_CR_CSTS: DPRINTF("%s %s NVME_CR_CSTS", func, s); break; case NVME_CR_NSSR: DPRINTF("%s %s NVME_CR_NSSR", func, s); break; case NVME_CR_AQA: DPRINTF("%s %s NVME_CR_AQA", func, s); break; case NVME_CR_ASQ_LOW: DPRINTF("%s %s NVME_CR_ASQ_LOW", func, s); break; case NVME_CR_ASQ_HI: DPRINTF("%s %s NVME_CR_ASQ_HI", func, s); break; case NVME_CR_ACQ_LOW: DPRINTF("%s %s NVME_CR_ACQ_LOW", func, s); break; case NVME_CR_ACQ_HI: DPRINTF("%s %s NVME_CR_ACQ_HI", func, s); break; default: DPRINTF("unknown nvme bar-0 offset 0x%lx", offset); } } static void pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc, uint64_t offset, int size, uint64_t value) { uint32_t ccreg; if (offset >= NVME_DOORBELL_OFFSET) { uint64_t belloffset = offset - NVME_DOORBELL_OFFSET; uint64_t idx = belloffset / 8; /* door bell size = 2*int */ int is_sq = (belloffset % 8) < 4; if ((sc->regs.csts & NVME_CSTS_RDY) == 0) { WPRINTF("doorbell write prior to RDY (offset=%#lx)\n", offset); return; } if (belloffset > ((sc->max_queues+1) * 8 - 4)) { WPRINTF("guest attempted an overflow write offset " "0x%lx, val 0x%lx in %s", offset, value, __func__); return; } if (is_sq) { if (sc->submit_queues[idx].qbase == NULL) return; } else if (sc->compl_queues[idx].qbase == NULL) return; pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value); return; } DPRINTF("nvme-write offset 0x%lx, size %d, value 0x%lx", offset, size, value); if (size != 4) { WPRINTF("guest wrote invalid size %d (offset 0x%lx, " "val 0x%lx) to bar0 in %s", size, offset, value, __func__); /* TODO: shutdown device */ return; } pci_nvme_bar0_reg_dumps(__func__, offset, 1); pthread_mutex_lock(&sc->mtx); switch (offset) { case NVME_CR_CAP_LOW: case NVME_CR_CAP_HI: /* readonly */ break; case NVME_CR_VS: /* readonly */ break; case NVME_CR_INTMS: /* MSI-X, so ignore */ break; case NVME_CR_INTMC: /* MSI-X, so ignore */ break; case NVME_CR_CC: ccreg = (uint32_t)value; DPRINTF("%s NVME_CR_CC en %x css %x shn %x iosqes %u " "iocqes %u", __func__, NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg), NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg), NVME_CC_GET_IOCQES(ccreg)); if (NVME_CC_GET_SHN(ccreg)) { /* perform shutdown - flush out data to backend */ sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK << NVME_CSTS_REG_SHST_SHIFT); sc->regs.csts |= NVME_SHST_COMPLETE << NVME_CSTS_REG_SHST_SHIFT; } if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) { if (NVME_CC_GET_EN(ccreg) == 0) /* transition 1-> causes controller reset */ pci_nvme_reset_locked(sc); else pci_nvme_init_controller(ctx, sc); } /* Insert the iocqes, iosqes and en bits from the write */ sc->regs.cc &= ~NVME_CC_WRITE_MASK; sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK; if (NVME_CC_GET_EN(ccreg) == 0) { /* Insert the ams, mps and css bit fields */ sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK; sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK; sc->regs.csts &= ~NVME_CSTS_RDY; } else if ((sc->pending_ios == 0) && !(sc->regs.csts & NVME_CSTS_CFS)) { sc->regs.csts |= NVME_CSTS_RDY; } break; case NVME_CR_CSTS: break; case NVME_CR_NSSR: /* ignore writes; don't support subsystem reset */ break; case NVME_CR_AQA: sc->regs.aqa = (uint32_t)value; break; case NVME_CR_ASQ_LOW: sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) | (0xFFFFF000 & value); break; case NVME_CR_ASQ_HI: sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) | (value << 32); break; case NVME_CR_ACQ_LOW: sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) | (0xFFFFF000 & value); break; case NVME_CR_ACQ_HI: sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) | (value << 32); break; default: DPRINTF("%s unknown offset 0x%lx, value 0x%lx size %d", __func__, offset, value, size); } pthread_mutex_unlock(&sc->mtx); } static void pci_nvme_write(struct vmctx *ctx, int vcpu __unused, struct pci_devinst *pi, int baridx, uint64_t offset, int size, uint64_t value) { struct pci_nvme_softc* sc = pi->pi_arg; if (baridx == pci_msix_table_bar(pi) || baridx == pci_msix_pba_bar(pi)) { DPRINTF("nvme-write baridx %d, msix: off 0x%lx, size %d, " " value 0x%lx", baridx, offset, size, value); pci_emul_msix_twrite(pi, offset, size, value); return; } switch (baridx) { case 0: pci_nvme_write_bar_0(ctx, sc, offset, size, value); break; default: DPRINTF("%s unknown baridx %d, val 0x%lx", __func__, baridx, value); } } static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc, uint64_t offset, int size) { uint64_t value; pci_nvme_bar0_reg_dumps(__func__, offset, 0); if (offset < NVME_DOORBELL_OFFSET) { void *p = &(sc->regs); pthread_mutex_lock(&sc->mtx); memcpy(&value, (void *)((uintptr_t)p + offset), size); pthread_mutex_unlock(&sc->mtx); } else { value = 0; WPRINTF("pci_nvme: read invalid offset %ld", offset); } switch (size) { case 1: value &= 0xFF; break; case 2: value &= 0xFFFF; break; case 4: value &= 0xFFFFFFFF; break; } DPRINTF(" nvme-read offset 0x%lx, size %d -> value 0x%x", offset, size, (uint32_t)value); return (value); } static uint64_t pci_nvme_read(struct vmctx *ctx __unused, int vcpu __unused, struct pci_devinst *pi, int baridx, uint64_t offset, int size) { struct pci_nvme_softc* sc = pi->pi_arg; if (baridx == pci_msix_table_bar(pi) || baridx == pci_msix_pba_bar(pi)) { DPRINTF("nvme-read bar: %d, msix: regoff 0x%lx, size %d", baridx, offset, size); return pci_emul_msix_tread(pi, offset, size); } switch (baridx) { case 0: return pci_nvme_read_bar_0(sc, offset, size); default: DPRINTF("unknown bar %d, 0x%lx", baridx, offset); } return (0); } static int pci_nvme_parse_config(struct pci_nvme_softc *sc, nvlist_t *nvl) { char bident[sizeof("XX:X:X")]; const char *value; uint32_t sectsz; sc->max_queues = NVME_QUEUES; sc->max_qentries = NVME_MAX_QENTRIES; sc->ioslots = NVME_IOSLOTS; sc->num_squeues = sc->max_queues; sc->num_cqueues = sc->max_queues; sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO; sectsz = 0; snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn), "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); value = get_config_value_node(nvl, "maxq"); if (value != NULL) sc->max_queues = atoi(value); value = get_config_value_node(nvl, "qsz"); if (value != NULL) { sc->max_qentries = atoi(value); if (sc->max_qentries <= 0) { EPRINTLN("nvme: Invalid qsz option %d", sc->max_qentries); return (-1); } } value = get_config_value_node(nvl, "ioslots"); if (value != NULL) { sc->ioslots = atoi(value); if (sc->ioslots <= 0) { EPRINTLN("Invalid ioslots option %d", sc->ioslots); return (-1); } } value = get_config_value_node(nvl, "sectsz"); if (value != NULL) sectsz = atoi(value); value = get_config_value_node(nvl, "ser"); if (value != NULL) { /* * This field indicates the Product Serial Number in * 7-bit ASCII, unused bytes should be space characters. * Ref: NVMe v1.3c. */ cpywithpad((char *)sc->ctrldata.sn, sizeof(sc->ctrldata.sn), value, ' '); } value = get_config_value_node(nvl, "eui64"); if (value != NULL) sc->nvstore.eui64 = htobe64(strtoull(value, NULL, 0)); value = get_config_value_node(nvl, "dsm"); if (value != NULL) { if (strcmp(value, "auto") == 0) sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO; else if (strcmp(value, "enable") == 0) sc->dataset_management = NVME_DATASET_MANAGEMENT_ENABLE; else if (strcmp(value, "disable") == 0) sc->dataset_management = NVME_DATASET_MANAGEMENT_DISABLE; } value = get_config_value_node(nvl, "ram"); if (value != NULL) { uint64_t sz = strtoull(value, NULL, 10); sc->nvstore.type = NVME_STOR_RAM; sc->nvstore.size = sz * 1024 * 1024; sc->nvstore.ctx = calloc(1, sc->nvstore.size); sc->nvstore.sectsz = 4096; sc->nvstore.sectsz_bits = 12; if (sc->nvstore.ctx == NULL) { EPRINTLN("nvme: Unable to allocate RAM"); return (-1); } } else { snprintf(bident, sizeof(bident), "%d:%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); sc->nvstore.ctx = blockif_open(nvl, bident); if (sc->nvstore.ctx == NULL) { EPRINTLN("nvme: Could not open backing file: %s", strerror(errno)); return (-1); } sc->nvstore.type = NVME_STOR_BLOCKIF; sc->nvstore.size = blockif_size(sc->nvstore.ctx); } if (sectsz == 512 || sectsz == 4096 || sectsz == 8192) sc->nvstore.sectsz = sectsz; else if (sc->nvstore.type != NVME_STOR_RAM) sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx); for (sc->nvstore.sectsz_bits = 9; (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz; sc->nvstore.sectsz_bits++); if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES) sc->max_queues = NVME_QUEUES; return (0); } static void pci_nvme_resized(struct blockif_ctxt *bctxt __unused, void *arg, size_t new_size) { struct pci_nvme_softc *sc; struct pci_nvme_blockstore *nvstore; struct nvme_namespace_data *nd; sc = arg; nvstore = &sc->nvstore; nd = &sc->nsdata; nvstore->size = new_size; pci_nvme_init_nsdata_size(nvstore, nd); /* Add changed NSID to list */ sc->ns_log.ns[0] = 1; sc->ns_log.ns[1] = 0; pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_NOTICE, PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED); } static int pci_nvme_init(struct vmctx *ctx __unused, struct pci_devinst *pi, nvlist_t *nvl) { struct pci_nvme_softc *sc; uint32_t pci_membar_sz; int error; error = 0; sc = calloc(1, sizeof(struct pci_nvme_softc)); pi->pi_arg = sc; sc->nsc_pi = pi; error = pci_nvme_parse_config(sc, nvl); if (error < 0) goto done; else error = 0; STAILQ_INIT(&sc->ioreqs_free); sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq)); for (uint32_t i = 0; i < sc->ioslots; i++) { STAILQ_INSERT_TAIL(&sc->ioreqs_free, &sc->ioreqs[i], link); } pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A); pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D); pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE); pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM); pci_set_cfgdata8(pi, PCIR_PROGIF, PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0); /* * Allocate size of NVMe registers + doorbell space for all queues. * * The specification requires a minimum memory I/O window size of 16K. * The Windows driver will refuse to start a device with a smaller * window. */ pci_membar_sz = sizeof(struct nvme_registers) + 2 * sizeof(uint32_t) * (sc->max_queues + 1); pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN); DPRINTF("nvme membar size: %u", pci_membar_sz); error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz); if (error) { WPRINTF("%s pci alloc mem bar failed", __func__); goto done; } error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR); if (error) { WPRINTF("%s pci add msixcap failed", __func__); goto done; } error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP); if (error) { WPRINTF("%s pci add Express capability failed", __func__); goto done; } pthread_mutex_init(&sc->mtx, NULL); sem_init(&sc->iosemlock, 0, sc->ioslots); blockif_register_resize_callback(sc->nvstore.ctx, pci_nvme_resized, sc); pci_nvme_init_queues(sc, sc->max_queues, sc->max_queues); /* * Controller data depends on Namespace data so initialize Namespace * data first. */ pci_nvme_init_nsdata(sc, &sc->nsdata, 1, &sc->nvstore); pci_nvme_init_ctrldata(sc); pci_nvme_init_logpages(sc); pci_nvme_init_features(sc); pci_nvme_aer_init(sc); pci_nvme_aen_init(sc); pci_nvme_reset(sc); pci_lintr_request(pi); done: return (error); } static int pci_nvme_legacy_config(nvlist_t *nvl, const char *opts) { char *cp, *ram; if (opts == NULL) return (0); if (strncmp(opts, "ram=", 4) == 0) { cp = strchr(opts, ','); if (cp == NULL) { set_config_value_node(nvl, "ram", opts + 4); return (0); } ram = strndup(opts + 4, cp - opts - 4); set_config_value_node(nvl, "ram", ram); free(ram); return (pci_parse_legacy_config(nvl, cp + 1)); } else return (blockif_legacy_config(nvl, opts)); } static const struct pci_devemu pci_de_nvme = { .pe_emu = "nvme", .pe_init = pci_nvme_init, .pe_legacy_config = pci_nvme_legacy_config, .pe_barwrite = pci_nvme_write, .pe_barread = pci_nvme_read }; PCI_EMUL_SET(pci_de_nvme);