Changeset View
Changeset View
Standalone View
Standalone View
head/usr.sbin/bhyve/pci_nvme.c
Show First 20 Lines • Show All 93 Lines • ▼ Show 20 Lines | |||||
#define	NVME_IOSLOTS		8

/* The NVMe spec defines bits 13:4 in BAR0 as reserved */
#define	NVME_MMIO_SPACE_MIN	(1 << 14)

#define	NVME_QUEUES		16
#define	NVME_MAX_QENTRIES	2048

/* Memory Page size Minimum reported in CAP register */
#define	NVME_MPSMIN		0
/* MPSMIN converted to bytes */
#define	NVME_MPSMIN_BYTES	(1 << (12 + NVME_MPSMIN))

#define	NVME_PRP2_ITEMS		(PAGE_SIZE/sizeof(uint64_t))

/*
 * Max Data Transfer Size exponent reported in the controller data: the
 * largest transfer is 2^NVME_MDTS units of the CAP.MPSMIN page size.
 */
#define	NVME_MDTS		9
/* Note the + 1 allows for the initial descriptor to not be page aligned */
#define	NVME_MAX_IOVEC		((1 << NVME_MDTS) + 1)
/* Largest transfer, in bytes, that a single command may request */
#define	NVME_MAX_DATA_SIZE	((1 << NVME_MDTS) * NVME_MPSMIN_BYTES)

/* This is a synthetic status code to indicate there is no status */
#define	NVME_NO_STATUS		0xffff
#define	NVME_COMPLETION_VALID(c)	((c).status != NVME_NO_STATUS)
/* helpers */ | /* helpers */ | ||||
/* Convert a zero-based value into a one-based value */ | /* Convert a zero-based value into a one-based value */ | ||||
▲ Show 20 Lines • Show All 68 Lines • ▼ Show 20 Lines | struct pci_nvme_blockstore { | ||||
void *ctx; | void *ctx; | ||||
uint64_t size; | uint64_t size; | ||||
uint32_t sectsz; | uint32_t sectsz; | ||||
uint32_t sectsz_bits; | uint32_t sectsz_bits; | ||||
uint64_t eui64; | uint64_t eui64; | ||||
uint32_t deallocate:1; | uint32_t deallocate:1; | ||||
}; | }; | ||||
/*
 * Calculate the number of additional page descriptors for guest IO requests
 * based on the advertised Max Data Transfer (MDTS) and given the number of
 * default iovec's in a struct blockif_req.
 *
 * NVME_MAX_IOVEC already carries the + 1 that allows the initial descriptor
 * to not be page aligned.
 *
 * The whole expansion is parenthesized so that operators surrounding the
 * macro at its point of use cannot re-associate with the conditional
 * expression.
 */
#define	MDTS_PAD_SIZE \
	((NVME_MAX_IOVEC) > (BLOCKIF_IOV_MAX) ? \
	 (NVME_MAX_IOVEC) - (BLOCKIF_IOV_MAX) : \
	 0)
struct pci_nvme_ioreq { | struct pci_nvme_ioreq { | ||||
struct pci_nvme_softc *sc; | struct pci_nvme_softc *sc; | ||||
STAILQ_ENTRY(pci_nvme_ioreq) link; | STAILQ_ENTRY(pci_nvme_ioreq) link; | ||||
struct nvme_submission_queue *nvme_sq; | struct nvme_submission_queue *nvme_sq; | ||||
uint16_t sqid; | uint16_t sqid; | ||||
/* command information */ | /* command information */ | ||||
uint16_t opc; | uint16_t opc; | ||||
uint16_t cid; | uint16_t cid; | ||||
uint32_t nsid; | uint32_t nsid; | ||||
uint64_t prev_gpaddr; | uint64_t prev_gpaddr; | ||||
size_t prev_size; | size_t prev_size; | ||||
/* | |||||
* lock if all iovs consumed (big IO); | |||||
* complete transaction before continuing | |||||
*/ | |||||
pthread_mutex_t mtx; | |||||
pthread_cond_t cv; | |||||
struct blockif_req io_req; | struct blockif_req io_req; | ||||
/* pad to fit up to 512 page descriptors from guest IO request */ | struct iovec iovpadding[MDTS_PAD_SIZE]; | ||||
struct iovec iovpadding[NVME_MAX_BLOCKIOVS-BLOCKIF_IOV_MAX]; | |||||
}; | }; | ||||
enum nvme_dsm_type { | enum nvme_dsm_type { | ||||
/* Dataset Management bit in ONCS reflects backing storage capability */ | /* Dataset Management bit in ONCS reflects backing storage capability */ | ||||
NVME_DATASET_MANAGEMENT_AUTO, | NVME_DATASET_MANAGEMENT_AUTO, | ||||
/* Unconditionally set Dataset Management bit in ONCS */ | /* Unconditionally set Dataset Management bit in ONCS */ | ||||
NVME_DATASET_MANAGEMENT_ENABLE, | NVME_DATASET_MANAGEMENT_ENABLE, | ||||
/* Unconditionally clear Dataset Management bit in ONCS */ | /* Unconditionally clear Dataset Management bit in ONCS */ | ||||
▲ Show 20 Lines • Show All 52 Lines • ▼ Show 20 Lines | struct pci_nvme_softc { | ||||
struct nvme_submission_queue *submit_queues; | struct nvme_submission_queue *submit_queues; | ||||
struct nvme_feature_obj feat[NVME_FID_MAX]; | struct nvme_feature_obj feat[NVME_FID_MAX]; | ||||
enum nvme_dsm_type dataset_management; | enum nvme_dsm_type dataset_management; | ||||
}; | }; | ||||
static void pci_nvme_io_partial(struct blockif_req *br, int err); | |||||
static struct pci_nvme_ioreq *pci_nvme_get_ioreq(struct pci_nvme_softc *); | static struct pci_nvme_ioreq *pci_nvme_get_ioreq(struct pci_nvme_softc *); | ||||
static void pci_nvme_release_ioreq(struct pci_nvme_softc *, struct pci_nvme_ioreq *); | static void pci_nvme_release_ioreq(struct pci_nvme_softc *, struct pci_nvme_ioreq *); | ||||
static void pci_nvme_io_done(struct blockif_req *, int); | static void pci_nvme_io_done(struct blockif_req *, int); | ||||
/* Controller Configuration utils */ | /* Controller Configuration utils */ | ||||
#define NVME_CC_GET_EN(cc) \ | #define NVME_CC_GET_EN(cc) \ | ||||
((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK) | ((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK) | ||||
#define NVME_CC_GET_CSS(cc) \ | #define NVME_CC_GET_CSS(cc) \ | ||||
▲ Show 20 Lines • Show All 137 Lines • ▼ Show 20 Lines | pci_nvme_init_ctrldata(struct pci_nvme_softc *sc) | ||||
/* FreeBSD OUI */ | /* FreeBSD OUI */ | ||||
cd->ieee[0] = 0x58; | cd->ieee[0] = 0x58; | ||||
cd->ieee[1] = 0x9c; | cd->ieee[1] = 0x9c; | ||||
cd->ieee[2] = 0xfc; | cd->ieee[2] = 0xfc; | ||||
cd->mic = 0; | cd->mic = 0; | ||||
cd->mdts = 9; /* max data transfer size (2^mdts * CAP.MPSMIN) */ | cd->mdts = NVME_MDTS; /* max data transfer size (2^mdts * CAP.MPSMIN) */ | ||||
cd->ver = 0x00010300; | cd->ver = 0x00010300; | ||||
cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT; | cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT; | ||||
cd->acl = 2; | cd->acl = 2; | ||||
cd->aerl = 4; | cd->aerl = 4; | ||||
cd->lpa = 0; /* TODO: support some simple things like SMART */ | cd->lpa = 0; /* TODO: support some simple things like SMART */ | ||||
▲ Show 20 Lines • Show All 1,011 Lines • ▼ Show 20 Lines | |||||
} | } | ||||
static int | static int | ||||
pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req, | pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req, | ||||
uint64_t gpaddr, size_t size, int do_write, uint64_t lba) | uint64_t gpaddr, size_t size, int do_write, uint64_t lba) | ||||
{ | { | ||||
int iovidx; | int iovidx; | ||||
if (req != NULL) { | if (req == NULL) | ||||
return (-1); | |||||
if (req->io_req.br_iovcnt == NVME_MAX_IOVEC) { | |||||
return (-1); | |||||
} | |||||
/* concatenate contig block-iovs to minimize number of iovs */ | /* concatenate contig block-iovs to minimize number of iovs */ | ||||
if ((req->prev_gpaddr + req->prev_size) == gpaddr) { | if ((req->prev_gpaddr + req->prev_size) == gpaddr) { | ||||
iovidx = req->io_req.br_iovcnt - 1; | iovidx = req->io_req.br_iovcnt - 1; | ||||
req->io_req.br_iov[iovidx].iov_base = | req->io_req.br_iov[iovidx].iov_base = | ||||
paddr_guest2host(req->sc->nsc_pi->pi_vmctx, | paddr_guest2host(req->sc->nsc_pi->pi_vmctx, | ||||
req->prev_gpaddr, size); | req->prev_gpaddr, size); | ||||
req->prev_size += size; | req->prev_size += size; | ||||
req->io_req.br_resid += size; | req->io_req.br_resid += size; | ||||
req->io_req.br_iov[iovidx].iov_len = req->prev_size; | req->io_req.br_iov[iovidx].iov_len = req->prev_size; | ||||
} else { | } else { | ||||
pthread_mutex_lock(&req->mtx); | |||||
iovidx = req->io_req.br_iovcnt; | iovidx = req->io_req.br_iovcnt; | ||||
if (iovidx == NVME_MAX_BLOCKIOVS) { | |||||
int err = 0; | |||||
DPRINTF("large I/O, doing partial req"); | |||||
iovidx = 0; | |||||
req->io_req.br_iovcnt = 0; | |||||
req->io_req.br_callback = pci_nvme_io_partial; | |||||
if (!do_write) | |||||
err = blockif_read(sc->nvstore.ctx, | |||||
&req->io_req); | |||||
else | |||||
err = blockif_write(sc->nvstore.ctx, | |||||
&req->io_req); | |||||
/* wait until req completes before cont */ | |||||
if (err == 0) | |||||
pthread_cond_wait(&req->cv, &req->mtx); | |||||
} | |||||
if (iovidx == 0) { | if (iovidx == 0) { | ||||
req->io_req.br_offset = lba; | req->io_req.br_offset = lba; | ||||
req->io_req.br_resid = 0; | req->io_req.br_resid = 0; | ||||
req->io_req.br_param = req; | req->io_req.br_param = req; | ||||
} | } | ||||
req->io_req.br_iov[iovidx].iov_base = | req->io_req.br_iov[iovidx].iov_base = | ||||
paddr_guest2host(req->sc->nsc_pi->pi_vmctx, | paddr_guest2host(req->sc->nsc_pi->pi_vmctx, | ||||
gpaddr, size); | gpaddr, size); | ||||
req->io_req.br_iov[iovidx].iov_len = size; | req->io_req.br_iov[iovidx].iov_len = size; | ||||
req->prev_gpaddr = gpaddr; | req->prev_gpaddr = gpaddr; | ||||
req->prev_size = size; | req->prev_size = size; | ||||
req->io_req.br_resid += size; | req->io_req.br_resid += size; | ||||
req->io_req.br_iovcnt++; | req->io_req.br_iovcnt++; | ||||
pthread_mutex_unlock(&req->mtx); | |||||
} | } | ||||
} else { | |||||
/* RAM buffer: read/write directly */ | |||||
void *p = sc->nvstore.ctx; | |||||
void *gptr; | |||||
if ((lba + size) > sc->nvstore.size) { | |||||
WPRINTF("%s write would overflow RAM", __func__); | |||||
return (-1); | |||||
} | |||||
p = (void *)((uintptr_t)p + (uintptr_t)lba); | |||||
gptr = paddr_guest2host(sc->nsc_pi->pi_vmctx, gpaddr, size); | |||||
if (do_write) | |||||
memcpy(p, gptr, size); | |||||
else | |||||
memcpy(gptr, p, size); | |||||
} | |||||
return (0); | return (0); | ||||
} | } | ||||
static void | static void | ||||
pci_nvme_set_completion(struct pci_nvme_softc *sc, | pci_nvme_set_completion(struct pci_nvme_softc *sc, | ||||
struct nvme_submission_queue *sq, int sqid, uint16_t cid, | struct nvme_submission_queue *sq, int sqid, uint16_t cid, | ||||
uint32_t cdw0, uint16_t status) | uint32_t cdw0, uint16_t status) | ||||
{ | { | ||||
▲ Show 20 Lines • Show All 81 Lines • ▼ Show 20 Lines | pci_nvme_io_done(struct blockif_req *br, int err) | ||||
/* TODO return correct error */ | /* TODO return correct error */ | ||||
code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS; | code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS; | ||||
pci_nvme_status_genc(&status, code); | pci_nvme_status_genc(&status, code); | ||||
pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status); | pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status); | ||||
pci_nvme_release_ioreq(req->sc, req); | pci_nvme_release_ioreq(req->sc, req); | ||||
} | } | ||||
static void | |||||
pci_nvme_io_partial(struct blockif_req *br, int err) | |||||
{ | |||||
struct pci_nvme_ioreq *req = br->br_param; | |||||
DPRINTF("%s error %d %s", __func__, err, strerror(err)); | |||||
pthread_cond_signal(&req->cv); | |||||
} | |||||
/* | /* | ||||
* Implements the Flush command. The specification states: | * Implements the Flush command. The specification states: | ||||
* If a volatile write cache is not present, Flush commands complete | * If a volatile write cache is not present, Flush commands complete | ||||
* successfully and have no effect | * successfully and have no effect | ||||
* in the description of the Volatile Write Cache (VWC) field of the Identify | * in the description of the Volatile Write Cache (VWC) field of the Identify | ||||
* Controller data. Therefore, set status to Success if the command is | * Controller data. Therefore, set status to Success if the command is | ||||
* not supported (i.e. RAM or as indicated by the blockif). | * not supported (i.e. RAM or as indicated by the blockif). | ||||
*/ | */ | ||||
▲ Show 20 Lines • Show All 141 Lines • ▼ Show 20 Lines | nvme_opc_write_read(struct pci_nvme_softc *sc, | ||||
uint64_t lba, nblocks, bytes; | uint64_t lba, nblocks, bytes; | ||||
size_t offset; | size_t offset; | ||||
bool is_write = cmd->opc == NVME_OPC_WRITE; | bool is_write = cmd->opc == NVME_OPC_WRITE; | ||||
bool pending = false; | bool pending = false; | ||||
lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10; | lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10; | ||||
nblocks = (cmd->cdw12 & 0xFFFF) + 1; | nblocks = (cmd->cdw12 & 0xFFFF) + 1; | ||||
offset = lba * nvstore->sectsz; | |||||
bytes = nblocks * nvstore->sectsz; | bytes = nblocks * nvstore->sectsz; | ||||
if (bytes > NVME_MAX_DATA_SIZE) { | |||||
WPRINTF("%s command would exceed MDTS", __func__); | |||||
pci_nvme_status_genc(status, NVME_SC_INVALID_FIELD); | |||||
goto out; | |||||
} | |||||
offset = lba * nvstore->sectsz; | |||||
if ((offset + bytes) > nvstore->size) { | if ((offset + bytes) > nvstore->size) { | ||||
WPRINTF("%s command would exceed LBA range", __func__); | WPRINTF("%s command would exceed LBA range", __func__); | ||||
pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE); | pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE); | ||||
goto out; | goto out; | ||||
} | } | ||||
req->io_req.br_offset = lba; | req->io_req.br_offset = lba; | ||||
▲ Show 20 Lines • Show All 660 Lines • ▼ Show 20 Lines | if (error < 0) | ||||
goto done; | goto done; | ||||
else | else | ||||
error = 0; | error = 0; | ||||
STAILQ_INIT(&sc->ioreqs_free); | STAILQ_INIT(&sc->ioreqs_free); | ||||
sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq)); | sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq)); | ||||
for (int i = 0; i < sc->ioslots; i++) { | for (int i = 0; i < sc->ioslots; i++) { | ||||
STAILQ_INSERT_TAIL(&sc->ioreqs_free, &sc->ioreqs[i], link); | STAILQ_INSERT_TAIL(&sc->ioreqs_free, &sc->ioreqs[i], link); | ||||
pthread_mutex_init(&sc->ioreqs[i].mtx, NULL); | |||||
pthread_cond_init(&sc->ioreqs[i].cv, NULL); | |||||
} | } | ||||
pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A); | pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A); | ||||
pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D); | pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D); | ||||
pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE); | pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE); | ||||
pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM); | pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM); | ||||
pci_set_cfgdata8(pi, PCIR_PROGIF, | pci_set_cfgdata8(pi, PCIR_PROGIF, | ||||
PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0); | PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0); | ||||
▲ Show 20 Lines • Show All 61 Lines • Show Last 20 Lines |