Index: sys/dev/nvme/nvme.h
===================================================================
--- sys/dev/nvme/nvme.h
+++ sys/dev/nvme/nvme.h
@@ -124,6 +124,7 @@
 } __packed;

 enum shn_value {
+    NVME_SHN_NOEFCT = 0x0,
     NVME_SHN_NORMAL = 0x1,
     NVME_SHN_ABRUPT = 0x2,
 };
Index: usr.sbin/bhyve/Makefile
===================================================================
--- usr.sbin/bhyve/Makefile
+++ usr.sbin/bhyve/Makefile
@@ -39,6 +39,7 @@
     pci_virtio_rnd.c \
     pci_uart.c \
     pci_xhci.c \
+    pci_nvme.c \
     pm.c \
     post.c \
     ps2kbd.c \
@@ -61,7 +62,7 @@

 LIBADD= vmmapi md pthread z

-CFLAGS+= -I${BHYVE_SYSDIR}/sys/dev/usb/controller
+CFLAGS+= -I${BHYVE_SYSDIR}/sys/dev/usb/controller -I${BHYVE_SYSDIR}/sys/dev/nvme/

 WARNS?= 2
Index: usr.sbin/bhyve/bhyve.8
===================================================================
--- usr.sbin/bhyve/bhyve.8
+++ usr.sbin/bhyve/bhyve.8
@@ -180,6 +180,8 @@
 .It Li lpc
 LPC PCI-ISA bridge with COM1 and COM2 16550 serial ports and a boot ROM.
 The LPC bridge emulation can only be configured on bus 0.
+.It Li nvme
+NVMe controller.
 .El
 .It Op Ar conf
 This optional parameter describes the backend for device emulations.
Index: usr.sbin/bhyve/pci_nvme.c
===================================================================
--- /dev/null
+++ usr.sbin/bhyve/pci_nvme.c
@@ -0,0 +1,1257 @@
+#include <sys/types.h>
+#include <sys/queue.h>
+#include <assert.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <machine/vmm.h>
+#include <vmmapi.h>
+
+#include <nvme.h>
+
+#include "pci_emul.h"
+#include "block_if.h"
+#include "bhyverun.h"
+
+#ifdef NVME_DEBUG
+static FILE* dbg;
+#define DPRINTF(format, arg...)              \
+    do {                                     \
+        fprintf(dbg, format, ##arg);         \
+        fflush(dbg);                         \
+    } while (0)
+#else
+#define DPRINTF(format, arg...)
+#endif
+
+enum nvme_controller_register_offsets {
+    NVME_CR_CAP_LOW = 0x00,
+    NVME_CR_CAP_HI = 0x04,
+    NVME_CR_VS = 0x08,
+    NVME_CR_INTMS = 0x0c,
+    NVME_CR_INTMC = 0x10,
+    NVME_CR_CC = 0x14,
+    NVME_CR_CSTS = 0x1c,
+    NVME_CR_NSSR = 0x20,
+    NVME_CR_AQA = 0x24,
+    NVME_CR_ASQ_LOW = 0x28,
+    NVME_CR_ASQ_HI = 0x2c,
+    NVME_CR_ACQ_LOW = 0x30,
+    NVME_CR_ACQ_HI = 0x34,
+    // doorbell registers start at NVME_CR_IO_QUEUE_BASE
+    NVME_CR_IO_QUEUE_BASE = 0x1000,
+    /*
+     * 0x1000 ~ 0x1003 : submission queue 0 tail doorbell (admin)
+     * 0x1004 ~ 0x1007 : completion queue 0 head doorbell (admin)
+     * 0x1008 ~ 0x100b : submission queue 1 tail doorbell
+     * 0x100c ~ 0x100f : completion queue 1 head doorbell
+     */
+    NVME_CR_ADMIN_SQ_TAIL = 0x1000,
+    NVME_CR_ADMIN_CQ_HEAD = 0x1004,
+    NVME_CR_SQ_1_TAIL = 0x1008,
+    NVME_CR_CQ_1_HEAD = 0x100c,
+    NVME_CR_SIZE = 0x1010,
+};
+
+#define NVME_IO_SQ_NUM 1
+#define NVME_IO_CQ_NUM 1
+
+#define NVME_IO_SQS_SIZE (NVME_IO_SQ_NUM + 1)
+#define NVME_IO_CQS_SIZE (NVME_IO_CQ_NUM + 1)
+
+enum nvme_pci_bar {
+    NVME_BAR_CR = 0,   // 0 and 1
+    NVME_BAR_RSV = 3,  // reserved
+    NVME_BAR_MSIX = 4, // 4 and 5
+};
+
+/*
+ * One MSI-X vector for the admin completion queue plus one for each
+ * NVMe I/O completion queue.
+ */
+#define NVME_COMPLETION_QUEUE_NUM (NVME_IO_CQ_NUM + 1)
+
+enum nvme_cmd_identify_cdw10 {
+    NVME_CMD_IDENTIFY_CDW10_CNTID = 0xffff0000,
+    NVME_CMD_IDENTIFY_CDW10_RSV = 0x0000ff00,
+    NVME_CMD_IDENTIFY_CDW10_CNS = 0x000000ff,
+};
+
+enum nvme_cmd_identify_data {
+    NVME_CMD_IDENTIFY_CNS_NAMESPACE = 0x0,
+    NVME_CMD_IDENTIFY_CNS_CONTROLLER = 0x1,
+};
+
+enum nvme_cc_bits {
+    NVME_CC_EN = 0x00000001,
+    NVME_CC_RSV0 = 0x0000000e,
+    NVME_CC_CCS = 0x00000070,
+    NVME_CC_MPS = 0x00000780,
+    NVME_CC_AMS = 0x00003800,
+    NVME_CC_SHN = 0x0000c000,
+    NVME_CC_IOSQES = 0x000f0000,
+    NVME_CC_IOCQES = 0x00f00000,
+    NVME_CC_RSV1 = 0xff000000,
+};
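+
+/*
+ * Illustrative decode of the CC masks above, as used in
+ * pci_nvme_write_bar_0() below (a sketch, not spec text):
+ *
+ *     int en  = value & NVME_CC_EN;
+ *     int shn = (value & NVME_CC_SHN) >> 14;  // NVME_SHN_NORMAL etc.
+ *
+ * A normal shutdown request (CC.SHN = NVME_SHN_NORMAL) is acknowledged
+ * by setting CSTS.SHST to NVME_SHST_COMPLETE.
+ */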
+
+struct nvme_features {
+    union {
+        uint32_t raw;
+        struct {
+            uint8_t ab : 3;  // Arbitration Burst is bits 2:0
+            uint8_t reserved : 5;
+            uint8_t lpw : 8;
+            uint8_t mpw : 8;
+            uint8_t hpw : 8;
+        } __packed bits;
+    } __packed arbitration;
+
+    union {
+        uint32_t raw;
+        struct {
+            uint8_t ps : 5;
+            uint32_t reserved : 27;
+        } __packed bits;
+    } __packed power_management;
+
+    union {
+        uint32_t raw;
+        struct {
+            uint8_t num : 6;
+            uint32_t reserved : 26;
+        } __packed bits;
+    } __packed lba_range_type;
+
+    union {
+        uint32_t raw;
+        struct {
+            uint16_t over;
+            uint16_t under;
+        } __packed bits;
+    } __packed temperature_threshold;
+
+    union {
+        uint32_t raw;
+        struct {
+            uint16_t tler;
+            uint16_t reserved;
+        } __packed bits;
+    } __packed error_recovery;
+
+    union {
+        uint32_t raw;
+        struct {
+            uint8_t wce : 1;
+            uint32_t reserved : 31;
+        } __packed bits;
+    } __packed volatile_write_cache;
+
+    union {
+        uint32_t raw;
+        struct {
+            // CDW11: NSQR is bits 15:0, NCQR is bits 31:16 (0's based)
+            uint16_t nsqr : 16;
+            uint16_t ncqr : 16;
+        } __packed bits;
+    } __packed num_of_queues;
+
+    union {
+        uint32_t raw;
+        struct {
+            uint8_t thr : 8;
+            uint8_t time : 8;
+            uint16_t reserved : 16;
+        } __packed bits;
+    } __packed interrupt_coalescing;
+
+    union {
+        uint32_t raw;
+        struct {
+            uint16_t iv;
+            uint16_t cd : 1;
+            uint16_t reserved : 15;
+        } __packed bits;
+    } __packed interrupt_vector_config;
+
+    union {
+        uint32_t raw;
+        struct {
+            uint8_t dn : 1;
+            uint32_t reserved : 31;
+        } __packed bits;
+    } __packed write_atomicity;
+
+    union {
+        uint32_t raw;
+        struct {
+            uint16_t smart : 8;
+            uint16_t ns_attr_noti : 1;
+            uint16_t fw_act_noti : 1;
+            uint16_t tele_log_noti : 1;
+            uint32_t reserved : 21;
+        } __packed bits;
+    } __packed async_event_config;
+
+    union {
+        uint32_t raw;
+        struct {
+            uint8_t pbslc;
+            uint32_t reserved : 24;
+        } __packed bits;
+    } __packed software_progress_marker;
+};
+
+struct nvme_completion_queue_info {
+    uintptr_t base_addr;
+    uint16_t size;  // 0's based (QSIZE from Create I/O CQ)
+    uint16_t tail;  // controller-side producer index
+    uint16_t qid;
+    uint16_t interrupt_vector;
+    pthread_mutex_t mtx;
+};
+
+struct pci_nvme_softc;
+
+// I/O request handed to block_if.
+struct nvme_ioreq {
+    struct blockif_req io_req;
+    struct nvme_completion completion_entry;
+    struct nvme_completion_queue_info* cq_info;
+    struct nvme_submission_queue_info* sq_info;
+    struct pci_nvme_softc* sc;
+    STAILQ_ENTRY(nvme_ioreq) io_flist;
+    TAILQ_ENTRY(nvme_ioreq) io_blist;
+};
+
+struct nvme_submission_queue_info {
+    uintptr_t base_addr;
+    uint16_t size;  // 0's based (QSIZE from Create I/O SQ)
+    uint16_t qid;
+    uint16_t completion_qid;
+    struct nvme_ioreq* ioreq;
+    STAILQ_HEAD(nvme_fhead, nvme_ioreq) iofhd;  // free requests
+    TAILQ_HEAD(nvme_bhead, nvme_ioreq) iobhd;   // busy (in-flight) requests
+};
+
+struct pci_nvme_softc {
+    struct nvme_registers regs;
+    struct nvme_features features;
+    struct pci_devinst* pi;
+    uint16_t completion_queue_head;
+    uint16_t submission_queue_tail;
+    uintptr_t asq_base;
+    uintptr_t acq_base;
+    struct nvme_controller_data controller_data;
+    struct nvme_namespace_data namespace_data;
+    struct nvme_completion_queue_info* cqs_info;
+    struct nvme_submission_queue_info* sqs_info;
+    struct blockif_ctxt* bctx;
+};
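+
+/*
+ * Note on queue indices (illustrative): the guest writes new SQ tail
+ * values to the submission doorbells and new CQ head values to the
+ * completion doorbells.  This emulation keeps its own producer index for
+ * each completion queue (nvme_completion_queue_info.tail) and relies on
+ * the phase (P) bit, inverted on every pass through the queue, to let the
+ * guest spot newly posted entries.
+ */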
+
+static void pci_nvme_reset(struct pci_nvme_softc* sc)
+{
+    /*
+     * Controller register values follow NVMe specification 1.0e.
+     */
+
+    /*
+     * Controller Capabilities
+     */
+    // Maximum Queue Entries Supported: 0's based maximum size of an I/O
+    // submission or completion queue
+    sc->regs.cap_lo.bits.mqes = 0x10;
+
+    // Contiguous Queues Required
+    sc->regs.cap_lo.bits.cqr = 1;
+    sc->regs.cap_lo.bits.ams = 0;
+    sc->regs.cap_lo.bits.reserved1 = 0;
+    sc->regs.cap_lo.bits.to = 10;
+
+    sc->regs.cap_hi.bits.dstrd = 0;
+    sc->regs.cap_hi.bits.reserved3 = 0;
+    sc->regs.cap_hi.bits.css_nvm = 0;
+    sc->regs.cap_hi.bits.css_reserved = 0;
+    sc->regs.cap_hi.bits.reserved2 = 0;
+    /*
+     * MPSMIN and MPSMAX
+     * indicate the minimum and maximum host memory page size
+     * (2 ^ (12 + MPSMAX or MPSMIN))
+     */
+    sc->regs.cap_hi.bits.mpsmin = 0;
+    sc->regs.cap_hi.bits.mpsmax = 0;
+    sc->regs.cap_hi.bits.reserved1 = 0;
+
+    /*
+     * Version of the NVM Express specification, 1.0 in this case.
+     */
+    uint32_t version = (0x0001 << 16) | 0x0000;
+    sc->regs.vs = version;
+
+    // Interrupt Mask Set
+    sc->regs.intms = 0;
+
+    // Interrupt Mask Clear
+    sc->regs.intmc = 0;
+
+    sc->regs.cc.raw = 0;
+
+    sc->regs.csts.raw = 0;
+
+    sc->completion_queue_head = 0;
+    sc->asq_base = 0;
+    sc->acq_base = 0;
+}
+
+static void nvme_initialize_feature(struct pci_nvme_softc* sc)
+{
+    sc->features.arbitration.raw = 0;
+
+    sc->features.temperature_threshold.bits.over = 0xffff;
+    sc->features.temperature_threshold.bits.under = 0x0000;
+
+    sc->features.power_management.raw = 0;
+
+    sc->features.error_recovery.raw = 0;
+
+    sc->features.num_of_queues.raw = 0;
+
+    sc->features.interrupt_coalescing.raw = 0;
+
+    sc->features.interrupt_vector_config.raw = 0;
+    sc->features.interrupt_vector_config.bits.cd = 1;
+
+    sc->features.async_event_config.raw = 0;
+}
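+
+/*
+ * Illustrative sizing math for the identify data set up below: with
+ * LBADS = 9 the logical block size is 1 << 9 = 512 bytes, so a 1 GiB
+ * backing image yields NSZE = NCAP = (1 << 30) / 512 = 2097152 blocks.
+ */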
+
+static void nvme_initialize_identify(struct pci_nvme_softc* sc)
+{
+    // only one namespace
+    sc->controller_data.nn = 0x1;
+
+    // LBA format
+    sc->namespace_data.lbaf[0].ms = 0x00;
+
+    /*
+     * The LBA data size is 1 << LBADS bytes; LBADS must be at least 9
+     * (i.e. 512 bytes).
+     */
+    uint64_t lba_data_size = 9;
+    sc->namespace_data.lbaf[0].lbads = lba_data_size;
+    sc->namespace_data.lbaf[0].rp = 0x0;
+
+    // NLBAF is a 0's based value; only lbaf[0] is defined
+    sc->namespace_data.nlbaf = 0x00;
+    sc->namespace_data.flbas.format = 0x00;
+
+    uint64_t block_size = blockif_size(sc->bctx);
+    sc->namespace_data.nsze = block_size / (1ULL << lba_data_size);
+    sc->namespace_data.ncap = block_size / (1ULL << lba_data_size);
+}
+
+static int pci_nvme_submission_queue_init(
+    struct nvme_submission_queue_info* qinfo,
+    struct blockif_ctxt* ctxt)
+{
+    struct nvme_ioreq* req;
+    int ioq_size = blockif_queuesz(ctxt);
+
+    qinfo->ioreq = calloc(ioq_size, sizeof(struct nvme_ioreq));
+    STAILQ_INIT(&qinfo->iofhd);
+
+    // set up the requests and insert them into the free list; the blockif
+    // callback is filled in when a request is submitted
+    for (int i = 0; i < ioq_size; ++i) {
+        req = &qinfo->ioreq[i];
+        req->sq_info = qinfo;
+        STAILQ_INSERT_TAIL(&qinfo->iofhd, req, io_flist);
+    }
+
+    TAILQ_INIT(&qinfo->iobhd);
+    return 0;
+}
+
+static void
+pci_nvme_cq_init(struct nvme_completion_queue_info* cqinfo)
+{
+    pthread_mutex_init(&cqinfo->mtx, NULL);
+    cqinfo->base_addr = (uintptr_t)NULL;
+    cqinfo->size = -1;
+    cqinfo->tail = 0;
+    cqinfo->qid = -1;
+    cqinfo->interrupt_vector = -1;
+}
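+
+/*
+ * Example invocation (illustrative): attach the emulation to a PCI slot
+ * with a backing image, e.g.
+ *
+ *     bhyve ... -s 4,nvme,/path/to/disk.img vmname
+ *
+ * The backend string is passed to blockif_open() unchanged, so anything
+ * the block backend accepts (a plain file or a character device) should
+ * work here.
+ */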
+
+static int pci_nvme_init(struct vmctx* ctx, struct pci_devinst* pi, char* opts)
+{
+    struct pci_nvme_softc* sc = NULL;
+    struct blockif_ctxt* bctxt = NULL;
+    char bident[sizeof("XX:X:X")];
+
+#ifdef NVME_DEBUG
+    dbg = fopen("/tmp/nvme_emu_log", "w+");
+#endif
+    DPRINTF("--- start nvme controller ---\n");
+
+    if (opts == NULL) {
+        fprintf(stderr, "pci_nvme: backing device required\n");
+        return 1;
+    }
+    DPRINTF("%s\n", opts);
+
+    /*
+     * Attempt to open the backing image. Use the PCI
+     * slot/func for the identifier string.
+     */
+    snprintf(bident, sizeof(bident), "%d:%d", pi->pi_slot, pi->pi_func);
+    bctxt = blockif_open(opts, bident);
+    if (bctxt == NULL) {
+        goto fail;
+    }
+
+/*  pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0953); */
+    pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0111);
+    pci_set_cfgdata16(pi, PCIR_VENDOR, 0x8086);
+    pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
+
+    // for the NVMe controller registers
+    if (pci_emul_alloc_bar(pi, NVME_BAR_CR, PCIBAR_MEM64, NVME_CR_SIZE)) {
+        DPRINTF("error occurred in pci_emul_alloc_bar\n");
+        goto fail;
+    }
+
+    // It is recommended that the controller allocate a unique MSI-X
+    // vector for each completion queue.
+    if (pci_emul_add_msixcap(pi, NVME_COMPLETION_QUEUE_NUM, NVME_BAR_MSIX)) {
+        DPRINTF("error occurred in pci_emul_add_msixcap\n");
+        goto fail;
+    }
+    DPRINTF("table %d, pba %d\n", pci_msix_table_bar(pi), pci_msix_pba_bar(pi));
+
+    sc = calloc(1, sizeof(struct pci_nvme_softc));
+    pi->pi_arg = sc;
+    pi->pi_vmctx = ctx;
+    sc->pi = pi;
+
+    sc->bctx = bctxt;
+
+    pci_nvme_reset(sc);
+
+    sc->cqs_info = calloc(NVME_IO_CQS_SIZE,
+                          sizeof(struct nvme_completion_queue_info));
+    for (int i = 0; i < NVME_IO_CQS_SIZE; ++i) {
+        pci_nvme_cq_init(&sc->cqs_info[i]);
+    }
+
+    sc->sqs_info = calloc(NVME_IO_SQS_SIZE,
+                          sizeof(struct nvme_submission_queue_info));
+    for (int i = 0; i < NVME_IO_SQS_SIZE; ++i) {
+        pci_nvme_submission_queue_init(&sc->sqs_info[i], sc->bctx);
+    }
+
+    nvme_initialize_identify(sc);
+    nvme_initialize_feature(sc);
+
+    return 0;
+
+fail:
+    if (bctxt != NULL) {
+        blockif_close(bctxt);
+    }
+    if (sc != NULL) {
+        free(sc->cqs_info);
+        free(sc->sqs_info);
+        free(sc);
+    }
+    return 1;
+}
+
+static void pci_nvme_setup_controller(struct vmctx* ctx,
+                                      struct pci_nvme_softc* sc)
+{
+    DPRINTF("asqs 0x%x, acqs 0x%x\n", sc->regs.aqa.bits.asqs,
+            sc->regs.aqa.bits.acqs);
+    // AQA.ASQS and AQA.ACQS are 0's based queue sizes
+    sc->asq_base = (uintptr_t)vm_map_gpa(
+        ctx, sc->regs.asq,
+        sizeof(struct nvme_command) * (sc->regs.aqa.bits.asqs + 1));
+    sc->acq_base = (uintptr_t)vm_map_gpa(
+        ctx, sc->regs.acq,
+        sizeof(struct nvme_completion) * (sc->regs.aqa.bits.acqs + 1));
+
+    sc->regs.csts.bits.rdy = 1;
+}
+
+static void execute_set_feature_command(struct pci_nvme_softc* sc,
+                                        struct nvme_command* command,
+                                        struct nvme_completion* cmp_entry)
+{
+    DPRINTF("0x%x\n", command->cdw11);
+    DPRINTF("0x%x\n", command->cdw10);
+    cmp_entry->cdw0 = 0x00000000;
+    // the Feature Identifier is CDW10 bits 7:0
+    enum nvme_feature feature = command->cdw10 & 0xff;
+    switch (feature) {
+    case NVME_FEAT_NUMBER_OF_QUEUES:
+        sc->features.num_of_queues.raw = command->cdw11;
+        DPRINTF("SET_FEATURE cmd: ncqr 0x%x, nsqr 0x%x\n",
+                (command->cdw11 >> 16), (command->cdw11 & 0xffff));
+        // cdw0 == 0 reports one I/O SQ and one I/O CQ allocated (0's based)
+        cmp_entry->status.sc = 0x00;
+        cmp_entry->status.sct = 0x0;
+        if (pci_msix_enabled(sc->pi)) {
+            DPRINTF("generate msix, table_count %d, \n",
+                    sc->pi->pi_msix.table_count);
+            pci_generate_msix(sc->pi, 0);
+        }
+        else {
+            assert(0 && "pci_msix is disabled?");
+        }
+        break;
+
+    case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
+        // TODO
+        sc->features.async_event_config.raw = command->cdw11;
+        cmp_entry->status.sc = 0x00;
+        cmp_entry->status.sct = 0x0;
+        pci_generate_msix(sc->pi, 0);
+        break;
+
+    case NVME_FEAT_INTERRUPT_COALESCING:
+        DPRINTF("interrupt coalescing cdw11 0x%x\n", command->cdw11);
+        cmp_entry->status.sc = 0x00;
+        cmp_entry->status.sct = 0x0;
+        sc->features.interrupt_coalescing.bits.thr = command->cdw11 & 0xff;
+        sc->features.interrupt_coalescing.bits.time =
+            (command->cdw11 >> 8) & 0xff;
+        pci_generate_msix(sc->pi, 0);
+        break;
+
+    default:
+        assert(0 && "this feature is not implemented");
+    }
+}
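+
+/*
+ * Sketch of the Number of Queues feature encoding handled above (values
+ * are 0's based per the spec):
+ *
+ *     CDW11: NSQR = bits 15:0  (I/O SQs requested)
+ *            NCQR = bits 31:16 (I/O CQs requested)
+ *     CDW0 (reply): NSQA = bits 15:0, NCQA = bits 31:16 (allocated)
+ *
+ * Leaving cdw0 at 0 therefore always grants one I/O SQ and one I/O CQ,
+ * which matches NVME_IO_SQ_NUM and NVME_IO_CQ_NUM.
+ */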
+
+enum temp_threshold_cdw11 {
+    NVME_TEMP_THRESHOLD_TMPTH = 0x0000ffff,
+    NVME_TEMP_THRESHOLD_TMPSEL = 0x000f0000,
+    NVME_TEMP_THRESHOLD_THSEL = 0x00300000,
+    NVME_TEMP_THRESHOLD_RESERVED = 0xffc00000,
+};
+
+static void execute_get_feature_command(struct pci_nvme_softc* sc,
+                                        struct nvme_command* command,
+                                        struct nvme_completion* cmp_entry)
+{
+    // the Feature Identifier is CDW10 bits 7:0
+    enum nvme_feature feature = command->cdw10 & 0xff;
+    switch (feature) {
+    case NVME_FEAT_TEMPERATURE_THRESHOLD: {
+        uint8_t thsel = (command->cdw11 & NVME_TEMP_THRESHOLD_THSEL) >> 20;
+        // over temperature threshold
+        if (thsel == 0x00) {
+            cmp_entry->cdw0 = sc->features.temperature_threshold.bits.over;
+        }
+        // under temperature threshold
+        else if (thsel == 0x1) {
+            cmp_entry->cdw0 = sc->features.temperature_threshold.bits.under;
+        }
+        else {
+            assert(0 && "invalid THSEL value");
+        }
+        cmp_entry->status.sc = 0x00;
+        cmp_entry->status.sct = 0x0;
+        pci_generate_msix(sc->pi, 0);
+        break;
+    }
+
+    default:
+        DPRINTF("feature number: 0x%x\n", feature);
+        assert(0 && "not implemented");
+    }
+
+    return;
+}
+
+static void nvme_execute_identify_command(struct pci_nvme_softc* sc,
+                                          struct nvme_command* command,
+                                          struct nvme_completion* cmp_entry)
+{
+    DPRINTF("Identify command (0x%x)\n",
+            command->cdw10 & NVME_CMD_IDENTIFY_CDW10_CNS);
+    DPRINTF("cdw10 0x%x, dptr 0x%lx, 0x%lx", command->cdw10, command->prp1,
+            command->prp2);
+    uintptr_t dest_addr = (uintptr_t)vm_map_gpa(
+        sc->pi->pi_vmctx, command->prp1, sizeof(struct nvme_controller_data));
+
+    // TODO have to consider the completion queue entry content
+    switch (command->cdw10 & NVME_CMD_IDENTIFY_CDW10_CNS) {
+    case NVME_CMD_IDENTIFY_CNS_NAMESPACE:
+        memcpy((struct nvme_namespace_data*)dest_addr, &sc->namespace_data,
+               sizeof(struct nvme_namespace_data));
+        cmp_entry->status.sc = 0x00;
+        cmp_entry->status.sct = 0x0;
+        pci_generate_msix(sc->pi, 0);
+        return;
+    case NVME_CMD_IDENTIFY_CNS_CONTROLLER:
+        memcpy((struct nvme_controller_data*)dest_addr,
+               &sc->controller_data, sizeof(struct nvme_controller_data));
+        cmp_entry->status.sc = 0x00;
+        cmp_entry->status.sct = 0x0;
+        pci_generate_msix(sc->pi, 0);
+        return;
+    default:
+        assert(0 && "[CNS] not implemented");
+    }
+
+    assert(0);
+}
+
+enum create_io_cq_cdw11 {
+    NVME_CREATE_IO_CQ_CDW11_PC = 0x00000001,
+    NVME_CREATE_IO_CQ_CDW11_IEN = 0x00000002,
+    NVME_CREATE_IO_CQ_CDW11_RSV = 0x0000fffc,
+    NVME_CREATE_IO_CQ_CDW11_IV = 0xffff0000,
+};
+
+static void
+nvme_execute_create_io_cq_command(struct pci_nvme_softc* sc,
+                                  struct nvme_command* command,
+                                  struct nvme_completion* cmp_entry)
+{
+    // TODO
+    // IEN
+    // IV
+    DPRINTF("interrupt vector 0x%x\n", command->cdw11 >> 16);
+    if (command->cdw11 & NVME_CREATE_IO_CQ_CDW11_PC) {
+        uint16_t qid = command->cdw10 & 0xffff;
+        assert(qid < NVME_IO_CQS_SIZE &&
+               "increase the number of completion queues supported by this emulation");
+
+        if (sc->cqs_info[qid].base_addr != (uintptr_t)NULL) {
+            assert(0 && "the completion queue is already in use");
+        }
+
+        uint16_t interrupt_vector = command->cdw11 >> 16;
+        // QSIZE is a 0's based value
+        uint16_t queue_size = command->cdw10 >> 16;
+        sc->cqs_info[qid].base_addr =
+            (uintptr_t)vm_map_gpa(sc->pi->pi_vmctx, command->prp1,
+                                  sizeof(struct nvme_completion) *
+                                  (queue_size + 1));
+        sc->cqs_info[qid].size = queue_size;
+        sc->cqs_info[qid].qid = qid;
+        sc->cqs_info[qid].interrupt_vector = interrupt_vector;
+
+        cmp_entry->status.sc = 0x00;
+        cmp_entry->status.sct = 0x0;
+        pci_generate_msix(sc->pi, 0);
+        DPRINTF("qid %d, qsize 0x%x, addr 0x%lx, IV %d\n",
+                qid, queue_size, command->prp1, interrupt_vector);
+    }
+    else {
+        assert(0 && "not implemented");
+    }
+
+    return;
+}
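+
+/*
+ * Note (illustrative): the queue sizes in the create-queue commands
+ * (CDW10 bits 31:16) are 0's based, so e.g. QSIZE = 0xff denotes 256
+ * entries.  The wrap checks against ->size elsewhere in this file
+ * (index > size => wrap to 0) rely on that convention.
+ */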
+
+enum create_io_sq_cdw11 {
+    NVME_CREATE_IO_SQ_CDW11_PC = 0x00000001,
+    NVME_CREATE_IO_SQ_CDW11_QPRIO = 0x00000060,
+    NVME_CREATE_IO_SQ_CDW11_RSV = 0x0000ff80,
+    NVME_CREATE_IO_SQ_CDW11_CQID = 0xffff0000,
+};
+
+static void nvme_execute_create_io_sq_command(struct pci_nvme_softc* sc,
+                                              struct nvme_command* command,
+                                              struct nvme_completion* cmp_entry)
+{
+    if (command->cdw11 & NVME_CREATE_IO_SQ_CDW11_PC) {
+        uint16_t qid = command->cdw10 & 0xffff;
+        assert(qid < NVME_IO_SQS_SIZE &&
+               "increase the number of submission queues supported by this emulation");
+        // TODO
+        /* uint8_t queue_priority =
+         *     (command->cdw11 & NVME_CREATE_IO_SQ_CDW11_QPRIO) >> 5; */
+        struct nvme_submission_queue_info* sq_info = &sc->sqs_info[qid];
+        if (sq_info->base_addr != (uintptr_t)NULL) {
+            assert(0 && "the submission queue is already in use");
+        }
+        uint16_t cqid = command->cdw11 >> 16;
+        // QSIZE is a 0's based value
+        uint16_t queue_size = command->cdw10 >> 16;
+        sq_info->base_addr =
+            (uintptr_t)vm_map_gpa(sc->pi->pi_vmctx, command->prp1,
+                                  sizeof(struct nvme_command) *
+                                  (queue_size + 1));
+        sq_info->size = queue_size;
+        sq_info->completion_qid = cqid;
+        sq_info->qid = qid;
+
+        cmp_entry->status.sc = 0x00;
+        cmp_entry->status.sct = 0x0;
+        pci_generate_msix(sc->pi, 0);
+        DPRINTF("qid %d, qsize 0x%x, addr 0x%lx, cqid %d\n",
+                qid, queue_size, command->prp1, cqid);
+    }
+    else {
+        assert(0 && "not implemented");
+    }
+}
+
+static void nvme_execute_delete_io_sq_command(
+    struct pci_nvme_softc* sc,
+    struct nvme_command* command,
+    struct nvme_completion* cmp_entry)
+{
+    uint16_t qid = command->cdw10 & 0xffff;
+    sc->sqs_info[qid].base_addr = (uintptr_t)NULL;
+    cmp_entry->status.sc = 0x00;
+    cmp_entry->status.sct = 0x0;
+    pci_generate_msix(sc->pi, 0);
+}
+
+static void nvme_execute_delete_io_cq_command(
+    struct pci_nvme_softc* sc,
+    struct nvme_command* command,
+    struct nvme_completion* cmp_entry)
+{
+    uint16_t qid = command->cdw10 & 0xffff;
+    sc->cqs_info[qid].base_addr = (uintptr_t)NULL;
+    cmp_entry->status.sc = 0x00;
+    cmp_entry->status.sct = 0x0;
+    pci_generate_msix(sc->pi, 0);
+}
+
+static void execute_async_event_request_command(
+    struct pci_nvme_softc* sc,
+    struct nvme_command* command,
+    struct nvme_completion* cmp_entry)
+{
+    // TODO
+    // when an event occurs, the controller posts a completion to notify
+    // the host
+}
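+
+/*
+ * Sketch of the admin doorbell handling below (this emulation assumes the
+ * guest submits one command per doorbell write): the value written is the
+ * new SQ tail, so the command just submitted sits at index (value - 1).
+ * The phase (P) bit of each completion entry is inverted relative to its
+ * previous value so the guest can tell new entries from stale ones.
+ */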
+
+static void pci_nvme_execute_admin_command(struct pci_nvme_softc* sc,
+                                           uint64_t value)
+{
+    // TODO: handle tail wrap (value == 0) as in pci_nvme_execute_nvme_command()
+    struct nvme_command* command =
+        (struct nvme_command*)(sc->asq_base +
+                               sizeof(struct nvme_command) * (value - 1));
+
+    struct nvme_completion* cmp_entry =
+        (struct nvme_completion*)(sc->acq_base +
+                                  sizeof(struct nvme_completion) *
+                                  sc->completion_queue_head);
+
+    cmp_entry->sqid = 0;
+    cmp_entry->sqhd = value - 1;
+    cmp_entry->cid = command->cid;
+    cmp_entry->status.p = !cmp_entry->status.p;
+
+    DPRINTF("[admin command] 0x%x\n", command->opc);
+    switch (command->opc) {
+    case NVME_OPC_DELETE_IO_SQ:
+        nvme_execute_delete_io_sq_command(sc, command, cmp_entry);
+        break;
+    case NVME_OPC_CREATE_IO_SQ:
+        nvme_execute_create_io_sq_command(sc, command, cmp_entry);
+        break;
+    case NVME_OPC_DELETE_IO_CQ:
+        nvme_execute_delete_io_cq_command(sc, command, cmp_entry);
+        break;
+    case NVME_OPC_CREATE_IO_CQ:
+        nvme_execute_create_io_cq_command(sc, command, cmp_entry);
+        break;
+    case NVME_OPC_IDENTIFY:
+        nvme_execute_identify_command(sc, command, cmp_entry);
+        break;
+    case NVME_OPC_SET_FEATURES:
+        execute_set_feature_command(sc, command, cmp_entry);
+        break;
+    case NVME_OPC_GET_FEATURES:
+        execute_get_feature_command(sc, command, cmp_entry);
+        break;
+    case NVME_OPC_ASYNC_EVENT_REQUEST:
+        // XXX dirty hack: an Async Event Request does not complete
+        // immediately, so undo the phase flip and the head advance below
+        // so that no completion is posted yet.  Should be reworked.
+        cmp_entry->status.p = !cmp_entry->status.p;
+        execute_async_event_request_command(sc, command, cmp_entry);
+        sc->completion_queue_head--;
+        break;
+    default:
+        assert(0 && "the admin command is not implemented");
+    }
+
+    sc->completion_queue_head++;
+    // AQA.ACQS is a 0's based value
+    if (sc->completion_queue_head > sc->regs.aqa.bits.acqs) {
+        sc->completion_queue_head = 0;
+    }
+}
+
+static void pci_nvme_blockif_ioreq_cb(struct blockif_req* br, int err)
+{
+    DPRINTF("%s %d\n", __func__, err);
+
+    struct nvme_ioreq* nreq = br->br_param;
+    struct nvme_completion_queue_info* cq_info = nreq->cq_info;
+    struct nvme_submission_queue_info* sq_info = nreq->sq_info;
+
+    pthread_mutex_lock(&cq_info->mtx);
+    struct nvme_completion* completion_entry =
+        (struct nvme_completion*)(cq_info->base_addr +
+                                  sizeof(struct nvme_completion) *
+                                  cq_info->tail);
+
+    nreq->completion_entry.status.sct = 0x0;
+    nreq->completion_entry.status.sc = 0x00;
+
+    // save the phase bit before overwriting the completion queue entry
+    uint8_t status_phase = completion_entry->status.p;
+
+    memcpy(completion_entry, &nreq->completion_entry,
+           sizeof(struct nvme_completion));
+    completion_entry->status.p = !status_phase;
+
+    DPRINTF("br_resid 0x%lx\n", br->br_resid);
+    DPRINTF("cq_head 0x%x(max 0x%x), qid %d, status phase %x IV %d\n",
+            cq_info->tail, cq_info->size, cq_info->qid,
+            !status_phase, cq_info->interrupt_vector);
+
+    cq_info->tail++;
+    // cq_info->size is a 0's based value
+    if (cq_info->tail > cq_info->size) {
+        cq_info->tail = 0;
+    }
+
+    TAILQ_REMOVE(&sq_info->iobhd, nreq, io_blist);
+
+    STAILQ_INSERT_TAIL(&sq_info->iofhd, nreq, io_flist);
+
+    pci_generate_msix(nreq->sc->pi, cq_info->interrupt_vector);
+    pthread_mutex_unlock(&cq_info->mtx);
+}
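+
+/*
+ * PRP handling sketch for the read/write path below (assumes a 4 KiB page
+ * size, i.e. CC.MPS = 0):
+ *
+ *   - transfer fits in one page:        PRP1 = data page, PRP2 = 0
+ *   - transfer spans exactly two pages: PRP1 = first page, PRP2 = second
+ *   - transfer spans more pages:        PRP1 = first page, PRP2 points to
+ *     a PRP list holding the remaining page addresses
+ */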
+
+static void nvme_nvm_command_read_write(
+    struct pci_nvme_softc* sc,
+    struct nvme_command* command,
+    struct nvme_submission_queue_info* sq_info,
+    uint16_t sqhd)
+{
+    int err = 0;
+    uintptr_t starting_lba = ((uint64_t)command->cdw11 << 32) | command->cdw10;
+    // NLB (number of logical blocks) is a 0's based value
+    uint32_t number_of_lb = (command->cdw12 & 0xffff) + 1;
+    int logic_block_size = 1 << sc->namespace_data.lbaf[0].lbads;
+
+    DPRINTF("slba 0x%lx, nlba 0x%x, lb size 0x%x\n",
+            starting_lba, number_of_lb, logic_block_size);
+    DPRINTF("sqhd: 0x%x, destination addr : 0x%08lx\n", sqhd, command->prp1);
+
+    struct nvme_ioreq* nreq = STAILQ_FIRST(&sq_info->iofhd);
+    assert(nreq != NULL);
+    STAILQ_REMOVE_HEAD(&sq_info->iofhd, io_flist);
+
+    nreq->completion_entry.sqhd = sqhd;
+    nreq->completion_entry.sqid = sq_info->qid;
+    nreq->completion_entry.cid = command->cid;
+    nreq->cq_info = &sc->cqs_info[sq_info->completion_qid];
+    nreq->sc = sc;
+    nreq->sq_info = sq_info;
+    DPRINTF("sqid %d, cqid %d\n", sq_info->qid, sq_info->completion_qid);
+
+    int data_size = number_of_lb * logic_block_size;
+    // TODO the page size depends on CC.MPS; 4 KiB is assumed here
+    int page_num = data_size / (1 << 12);
+    if (data_size % (1 << 12)) {
+        page_num++;
+    }
+    DPRINTF("all size 0x%x, page num 0x%x\n", data_size, page_num);
+    DPRINTF("mod 0x%x\n", data_size % (1 << 12));
+    DPRINTF("PRP 0x%08lx 0x%08lx\n", command->prp1, command->prp2);
+    DPRINTF("CDW0 0x%x\n", command->rsvd1);
+
+    struct blockif_req* breq = &nreq->io_req;
+
+    if (command->prp2 == 0) {
+        breq->br_iov[0].iov_base =
+            paddr_guest2host(sc->pi->pi_vmctx, command->prp1, data_size);
+        breq->br_iov[0].iov_len = data_size;
+    }
+    else {
+        if (page_num == 2) {
+            // page size (4k)
+            int size = 1 << 12;
+            breq->br_iov[0].iov_base =
+                paddr_guest2host(sc->pi->pi_vmctx, command->prp1, size);
+            breq->br_iov[0].iov_len = size;
+
+            breq->br_iov[1].iov_base =
+                paddr_guest2host(sc->pi->pi_vmctx, command->prp2, size);
+            breq->br_iov[1].iov_len = size;
+        }
+        else if (page_num > 2) {
+            int prp1_offset = (command->prp1 & 0xfff);
+            int prp1_size = (1 << 12) - prp1_offset;
+
+            breq->br_iov[0].iov_base =
+                paddr_guest2host(sc->pi->pi_vmctx, command->prp1, prp1_size);
+            breq->br_iov[0].iov_len = prp1_size;
+
+            data_size -= 1 << 12;
+
+            uint64_t* prp_list = paddr_guest2host(
+                sc->pi->pi_vmctx,
+                command->prp2,
+                sizeof(uint64_t) * (page_num - 1));
+
+            DPRINTF("prp1 : 0x%lx, 0x%x\n", command->prp1, prp1_size);
+
+            // TODO be careful of the iovec bound: br_iov[BLOCKIF_IOV_MAX]
+            for (int i = 1; i < page_num; ++i) {
+                int size = (1 << 12);
+                if (data_size < (1 << 12)) {
+                    assert(i == (page_num - 1));
+                    size = data_size;
+                }
+                breq->br_iov[i].iov_base =
+                    paddr_guest2host(sc->pi->pi_vmctx, prp_list[i - 1], size);
+                breq->br_iov[i].iov_len = size;
+
+                data_size -= size;
+                DPRINTF("prp2[%d] : 0x%lx, 0x%x\n", i, prp_list[i - 1], size);
+            }
+            DPRINTF("last data size: 0x%x\n", data_size);
+            DPRINTF("page num 0x%x", page_num);
+        }
+        else {
+            breq->br_iov[0].iov_base =
+                paddr_guest2host(sc->pi->pi_vmctx, command->prp1,
+                                 number_of_lb * logic_block_size);
+            breq->br_iov[0].iov_len = number_of_lb * logic_block_size;
+        }
+    }
+    breq->br_iovcnt = page_num;
+    breq->br_offset = starting_lba * logic_block_size;
+    breq->br_resid = number_of_lb * logic_block_size;
+    breq->br_callback = pci_nvme_blockif_ioreq_cb;
+    breq->br_param = nreq;
+
+    TAILQ_INSERT_HEAD(&sq_info->iobhd, nreq, io_blist);
+
+    switch (command->opc) {
+    case NVME_OPC_READ:
+        err = blockif_read(sc->bctx, breq);
+        break;
+    case NVME_OPC_WRITE:
+        err = blockif_write(sc->bctx, breq);
+        break;
+    default:
+        // TODO
+        assert(0 && "??");
+    }
+
+    assert(err == 0 && "blockif_read or blockif_write failed");
+}
+
+/* static void nvme_nvm_command_flush(struct pci_nvme_softc* sc, */
+/*     struct nvme_submission_queue_info* sq_info, */
+/*     uint16_t cid, */
+/*     uint16_t sqhd) */
+/* { */
+/*     assert("not yet implemented"); */
+/*     struct nvme_completion_queue_info* cq_info = */
+/*         &sc->cqs_info[sq_info->completion_qid]; */
+
+/*     pthread_mutex_lock(&cq_info->mtx); */
+/*     struct nvme_completion* completion_entry = */
+/*         (struct nvme_completion*)(cq_info->base_addr + */
+/*                                   sizeof(struct nvme_completion) * */
+/*                                   cq_info->head); */
+
+/*     completion_entry->sqhd = sqhd; */
+/*     completion_entry->sqid = sq_info->qid; */
+/*     completion_entry->cid = cid; */
+/*     completion_entry->status.sct = 0x0; */
+/*     completion_entry->status.sc = 0x00; */
+/*     completion_entry->status.p = !completion_entry->status.p; */
+
+/*     DPRINTF("cid: 0x%x, cqid: 0x%x, cq_info->head 0x%x, cq_info->size 0x%x\n", */
+/*             cid, sq_info->completion_qid, cq_info->head, cq_info->size); */
+/*     cq_info->head++; */
+/*     if (cq_info->head == cq_info->size) { */
+/*         cq_info->head = 0; */
+/*     } */
+/*     pthread_mutex_unlock(&cq_info->mtx); */
+
+/*     pci_generate_msix(sc->pi, sq_info->completion_qid); */
+/* } */
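+
+/*
+ * Sketch of the I/O doorbell decode used below: the guest writes the new
+ * SQ tail, and with one command submitted per doorbell write the entry to
+ * execute is at (sq_tail - 1), wrapping to the 0's based queue size when
+ * the tail wraps to 0.
+ */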
+
+static void pci_nvme_execute_nvme_command(struct pci_nvme_softc* sc,
+                                          uint16_t qid,
+                                          uint64_t sq_tail)
+{
+    struct nvme_submission_queue_info* sq_info = &sc->sqs_info[qid];
+
+    uint16_t command_index = sq_tail - 1;
+    uint16_t sqhd = sq_tail - 1;
+    if (sq_tail == 0x0) {
+        // sq_info->size is 0's based, so the last entry is at ->size
+        command_index = sq_info->size;
+        sqhd = sq_info->size;
+    }
+
+    struct nvme_command* command =
+        (struct nvme_command*)(sq_info->base_addr +
+                               sizeof(struct nvme_command) * command_index);
+
+    /* uint16_t completion_qid = sq_info->completion_qid; */
+    /* struct nvme_completion_queue_info *cq_info =
+     *     &sc->cqs_info[completion_qid]; */
+
+    DPRINTF("***** nvm command 0x%x *****\n", command->opc);
+    DPRINTF("opc: 0x%x, cid: 0x%x, nsid: 0x%x, qid: 0x%x, sq_tail 0x%lx\n",
+            command->opc, command->cid, command->nsid, qid, sq_tail);
+
+    switch (command->opc) {
+    case NVME_OPC_READ:
+    case NVME_OPC_WRITE:
+        nvme_nvm_command_read_write(sc, command, sq_info, sqhd);
+        return;
+    /* case NVME_OPC_FLUSH: */
+    /*     nvme_nvm_command_flush(sc, sq_info, command->cid, sqhd); */
+    /*     return; */
+    /* case NVME_OPC_DATASET_MANAGEMENT: */
+    /* { */
+    /*     pthread_mutex_lock(&cq_info->mtx); */
+    /*     struct nvme_completion* completion_entry = */
+    /*         pci_nvme_acquire_completion_entry(sc, cq_info); */
+    /*     completion_entry->status.sc = 0x00; */
+    /*     completion_entry->status.sct = 0x0; */
+    /*     pthread_mutex_unlock(&cq_info->mtx); */
+
+    /*     pci_generate_msix(sc->pi, sq_info->completion_qid); */
+    /*     return; */
+    /* } */
+
+    default:
+        assert(0 && "the nvme command is not implemented yet");
+    }
+}
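+
+/*
+ * BAR0 write dispatch (below): offsets under NVME_CR_IO_QUEUE_BASE are the
+ * controller registers proper; from 0x1000 up are the doorbells, laid out
+ * as 8 bytes per queue pair (SQ tail doorbell, then CQ head doorbell), so
+ * qid = offset / 8 and the odd 4-byte slot of each pair is the completion
+ * doorbell.
+ */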
+ */ + if ((sc->regs.cc.bits.en == 1) && !(value & NVME_CC_EN)) { + DPRINTF("nvme down\n"); + sc->regs.csts.bits.rdy = 0; + // TODO + /* assert(0); */ + } + + switch((value & NVME_CC_SHN) >> 14) + { + case NVME_SHN_NOEFCT: + break; + case NVME_SHN_NORMAL: + sc->regs.csts.bits.shst = NVME_SHST_COMPLETE; + break; + case NVME_SHN_ABRUPT: + default: + assert(0 && "not yet implemented"); + } + + sc->regs.cc.raw = (uint32_t)value; + return; + + case NVME_CR_AQA: + sc->regs.aqa.raw = (uint32_t)value; + return; + + case NVME_CR_ASQ_LOW: + sc->regs.asq = + (sc->regs.asq & 0xffffffff00000000) | (0xfffff000 & value); + return; + + case NVME_CR_ASQ_HI: + sc->regs.asq = (sc->regs.asq & 0x00000000ffffffff) | (value << 32); + return; + + case NVME_CR_ACQ_LOW: + sc->regs.acq = + (sc->regs.acq & 0xffffffff00000000) | (0xfffff000 & value); + return; + + case NVME_CR_ACQ_HI: + sc->regs.acq = (sc->regs.acq & 0x00000000ffffffff) | (value << 32); + return; + + default: + DPRINTF("unknown regoff 0x%lx with value 0x%lx, size %d in %s\n", + regoff, value, size, __func__); + assert(0); + } +} + +static void pci_nvme_write(struct vmctx* ctx, + int vcpu, + struct pci_devinst* pi, + int baridx, + uint64_t regoff, + int size, + uint64_t value) +{ + struct pci_nvme_softc* sc = pi->pi_arg; + + if (baridx == pci_msix_table_bar(pi) || baridx == pci_msix_pba_bar(pi)) { + DPRINTF("baridx, %d, msix: regoff 0x%lx, size %d, value %lx\n", baridx, + regoff, size, value); + pci_emul_msix_twrite(pi, regoff, size, value); + return; + } + + switch (baridx) { + case 0: + pci_nvme_write_bar_0(ctx, sc, regoff, value, size); + break; + + default: + DPRINTF("unknown baridx %d with 0x%lx in %s\n", baridx, value, + __func__); + assert(0); + } +} + +static uint64_t pci_nvme_read_bar_0( + struct pci_nvme_softc* sc, + enum nvme_controller_register_offsets offset, + int size) +{ + DPRINTF("read 0x%x\n", offset); + assert(size == 4 && "word size should be 4."); + switch (offset) { + case NVME_CR_CAP_LOW: + return (uint64_t)sc->regs.cap_lo.raw; + + case NVME_CR_CAP_HI: + return (uint64_t)sc->regs.cap_hi.raw; + + case NVME_CR_VS: + return (uint64_t)sc->regs.vs; + + case NVME_CR_CC: + return (uint64_t)sc->regs.cc.raw; + + case NVME_CR_CSTS: + DPRINTF("CSTS raw 0x%x\n", sc->regs.csts.raw); + return (uint64_t)sc->regs.csts.raw; + + default: + DPRINTF("unknown regoff value: 0x%x, size %d in %s\n", offset, size, + __func__); + assert(0); + } + + return 0; +} + +static uint64_t pci_nvme_read(struct vmctx* ctx, + int vcpu, + struct pci_devinst* pi, + int baridx, + uint64_t regoff, + int size) +{ + struct pci_nvme_softc* sc = pi->pi_arg; + + if (baridx == pci_msix_table_bar(pi) || baridx == pci_msix_pba_bar(pi)) { + DPRINTF("baridx: %d, msix: regoff 0x%lx, size %d\n", baridx, regoff, + size); + return pci_emul_msix_tread(pi, regoff, size); + } + + switch (baridx) { + case 0: + return pci_nvme_read_bar_0(sc, regoff, size); + + default: + DPRINTF("unknown bar %d, 0x%lx\n", baridx, regoff); + assert(0); + } + + return 0; +} + +struct pci_devemu pci_de_nvme = { + .pe_emu = "nvme", + .pe_init = pci_nvme_init, + .pe_barwrite = pci_nvme_write, + .pe_barread = pci_nvme_read +}; +PCI_EMUL_SET(pci_de_nvme);