diff --git a/usr.sbin/bhyve/pci_ahci.c b/usr.sbin/bhyve/pci_ahci.c index 4571f4caeffb..09b996476774 100644 --- a/usr.sbin/bhyve/pci_ahci.c +++ b/usr.sbin/bhyve/pci_ahci.c @@ -1,2740 +1,2740 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2013 Zhixiang Yu * Copyright (c) 2015-2016 Alexander Motin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include <sys/cdefs.h> __FBSDID("$FreeBSD$"); #include <sys/param.h> #include <sys/linker_set.h> #include <sys/stat.h> #include <sys/uio.h> #include <sys/ioctl.h> #include <sys/disk.h> #include <sys/ata.h> #include <sys/endian.h> #include <machine/vmm.h> #include <errno.h> #include <fcntl.h> #include <stdio.h> #include <stdlib.h> #include <stdint.h> #include <string.h> #include <strings.h> #include <unistd.h> #include <assert.h> #include <pthread.h> #include <pthread_np.h> #include <inttypes.h> #include <md5.h> #include "bhyverun.h" #include "config.h" #include "debug.h" #include "pci_emul.h" #include "ahci.h" #include "block_if.h" #define DEF_PORTS 6 /* Intel ICH8 AHCI supports 6 ports */ #define MAX_PORTS 32 /* AHCI supports 32 ports */ #define PxSIG_ATA 0x00000101 /* ATA drive */ #define PxSIG_ATAPI 0xeb140101 /* ATAPI drive */ enum sata_fis_type { FIS_TYPE_REGH2D = 0x27, /* Register FIS - host to device */ FIS_TYPE_REGD2H = 0x34, /* Register FIS - device to host */ FIS_TYPE_DMAACT = 0x39, /* DMA activate FIS - device to host */ FIS_TYPE_DMASETUP = 0x41, /* DMA setup FIS - bidirectional */ FIS_TYPE_DATA = 0x46, /* Data FIS - bidirectional */ FIS_TYPE_BIST = 0x58, /* BIST activate FIS - bidirectional */ FIS_TYPE_PIOSETUP = 0x5F, /* PIO setup FIS - device to host */ FIS_TYPE_SETDEVBITS = 0xA1, /* Set dev bits FIS - device to host */ }; /* * SCSI opcodes */ #define TEST_UNIT_READY 0x00 #define REQUEST_SENSE 0x03 #define INQUIRY 0x12 #define START_STOP_UNIT 0x1B #define PREVENT_ALLOW 0x1E #define READ_CAPACITY 0x25 #define READ_10 0x28 #define POSITION_TO_ELEMENT 0x2B #define READ_TOC 0x43 #define GET_EVENT_STATUS_NOTIFICATION 0x4A #define MODE_SENSE_10 0x5A #define REPORT_LUNS 0xA0 #define READ_12 0xA8 #define READ_CD 0xBE /* * SCSI mode page codes */ #define MODEPAGE_RW_ERROR_RECOVERY 0x01 #define MODEPAGE_CD_CAPABILITIES 0x2A /* * ATA commands */ #define ATA_SF_ENAB_SATA_SF 0x10 #define ATA_SATA_SF_AN 0x05 #define ATA_SF_DIS_SATA_SF 0x90 /* * Debug printf */ #ifdef AHCI_DEBUG static FILE *dbg; #define DPRINTF(format, arg...) do{fprintf(dbg, format, ##arg);fflush(dbg);}while(0) #else #define DPRINTF(format, arg...) #endif #define WPRINTF(format, arg...)
printf(format, ##arg) #define AHCI_PORT_IDENT 20 + 1 struct ahci_ioreq { struct blockif_req io_req; struct ahci_port *io_pr; STAILQ_ENTRY(ahci_ioreq) io_flist; TAILQ_ENTRY(ahci_ioreq) io_blist; uint8_t *cfis; uint32_t len; uint32_t done; int slot; int more; int readop; }; struct ahci_port { struct blockif_ctxt *bctx; struct pci_ahci_softc *pr_sc; struct ata_params ata_ident; uint8_t *cmd_lst; uint8_t *rfis; int port; int atapi; int reset; int waitforclear; int mult_sectors; uint8_t xfermode; uint8_t err_cfis[20]; uint8_t sense_key; uint8_t asc; u_int ccs; uint32_t pending; uint32_t clb; uint32_t clbu; uint32_t fb; uint32_t fbu; uint32_t is; uint32_t ie; uint32_t cmd; uint32_t unused0; uint32_t tfd; uint32_t sig; uint32_t ssts; uint32_t sctl; uint32_t serr; uint32_t sact; uint32_t ci; uint32_t sntf; uint32_t fbs; /* * i/o request info */ struct ahci_ioreq *ioreq; int ioqsz; STAILQ_HEAD(ahci_fhead, ahci_ioreq) iofhd; TAILQ_HEAD(ahci_bhead, ahci_ioreq) iobhd; }; struct ahci_cmd_hdr { uint16_t flags; uint16_t prdtl; uint32_t prdbc; uint64_t ctba; uint32_t reserved[4]; }; struct ahci_prdt_entry { uint64_t dba; uint32_t reserved; #define DBCMASK 0x3fffff uint32_t dbc; }; struct pci_ahci_softc { struct pci_devinst *asc_pi; pthread_mutex_t mtx; int ports; uint32_t cap; uint32_t ghc; uint32_t is; uint32_t pi; uint32_t vs; uint32_t ccc_ctl; uint32_t ccc_pts; uint32_t em_loc; uint32_t em_ctl; uint32_t cap2; uint32_t bohc; uint32_t lintr; struct ahci_port port[MAX_PORTS]; }; #define ahci_ctx(sc) ((sc)->asc_pi->pi_vmctx) static void ahci_handle_port(struct ahci_port *p); static inline void lba_to_msf(uint8_t *buf, int lba) { lba += 150; buf[0] = (lba / 75) / 60; buf[1] = (lba / 75) % 60; buf[2] = lba % 75; } /* * Generate HBA interrupts on global IS register write. */ static void ahci_generate_intr(struct pci_ahci_softc *sc, uint32_t mask) { struct pci_devinst *pi = sc->asc_pi; struct ahci_port *p; int i, nmsg; uint32_t mmask; /* Update global IS from PxIS/PxIE. */ for (i = 0; i < sc->ports; i++) { p = &sc->port[i]; if (p->is & p->ie) sc->is |= (1 << i); } DPRINTF("%s(%08x) %08x", __func__, mask, sc->is); /* If there is nothing enabled -- clear legacy interrupt and exit. */ if (sc->is == 0 || (sc->ghc & AHCI_GHC_IE) == 0) { if (sc->lintr) { pci_lintr_deassert(pi); sc->lintr = 0; } return; } /* If there is anything and no MSI -- assert legacy interrupt. */ nmsg = pci_msi_maxmsgnum(pi); if (nmsg == 0) { if (!sc->lintr) { sc->lintr = 1; pci_lintr_assert(pi); } return; } /* Assert respective MSIs for ports that were touched. */ for (i = 0; i < nmsg; i++) { if (sc->ports <= nmsg || i < nmsg - 1) mmask = 1 << i; else mmask = 0xffffffff << i; if (sc->is & mask && mmask & mask) pci_generate_msi(pi, i); } } /* * Generate HBA interrupt on specific port event. */ static void ahci_port_intr(struct ahci_port *p) { struct pci_ahci_softc *sc = p->pr_sc; struct pci_devinst *pi = sc->asc_pi; int nmsg; DPRINTF("%s(%d) %08x/%08x %08x", __func__, p->port, p->is, p->ie, sc->is); /* If there is nothing enabled -- we are done. */ if ((p->is & p->ie) == 0) return; /* In case of non-shared MSI always generate interrupt. */ nmsg = pci_msi_maxmsgnum(pi); if (sc->ports <= nmsg || p->port < nmsg - 1) { sc->is |= (1 << p->port); if ((sc->ghc & AHCI_GHC_IE) == 0) return; pci_generate_msi(pi, p->port); return; } /* If IS for this port is already set -- do nothing. */ if (sc->is & (1 << p->port)) return; sc->is |= (1 << p->port); /* If interrupts are enabled -- generate one. 
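* Vector mapping note (mirrors the checks above): when the device has at
* least as many MSI messages as ports, each port owns vector 'port'.
* Otherwise ports 0 .. nmsg-2 keep private vectors and every remaining
* port shares the last vector, nmsg-1 (or the legacy INTx when MSI is
* off), so the guest must read IS to tell the sharers apart.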
*/ if ((sc->ghc & AHCI_GHC_IE) == 0) return; if (nmsg > 0) { pci_generate_msi(pi, nmsg - 1); } else if (!sc->lintr) { sc->lintr = 1; pci_lintr_assert(pi); } } static void ahci_write_fis(struct ahci_port *p, enum sata_fis_type ft, uint8_t *fis) { int offset, len, irq; if (p->rfis == NULL || !(p->cmd & AHCI_P_CMD_FRE)) return; switch (ft) { case FIS_TYPE_REGD2H: offset = 0x40; len = 20; irq = (fis[1] & (1 << 6)) ? AHCI_P_IX_DHR : 0; break; case FIS_TYPE_SETDEVBITS: offset = 0x58; len = 8; irq = (fis[1] & (1 << 6)) ? AHCI_P_IX_SDB : 0; break; case FIS_TYPE_PIOSETUP: offset = 0x20; len = 20; irq = (fis[1] & (1 << 6)) ? AHCI_P_IX_PS : 0; break; default: WPRINTF("unsupported fis type %d", ft); return; } if (fis[2] & ATA_S_ERROR) { p->waitforclear = 1; irq |= AHCI_P_IX_TFE; } memcpy(p->rfis + offset, fis, len); if (irq) { if (~p->is & irq) { p->is |= irq; ahci_port_intr(p); } } } static void ahci_write_fis_piosetup(struct ahci_port *p) { uint8_t fis[20]; memset(fis, 0, sizeof(fis)); fis[0] = FIS_TYPE_PIOSETUP; ahci_write_fis(p, FIS_TYPE_PIOSETUP, fis); } static void ahci_write_fis_sdb(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t tfd) { uint8_t fis[8]; uint8_t error; error = (tfd >> 8) & 0xff; tfd &= 0x77; memset(fis, 0, sizeof(fis)); fis[0] = FIS_TYPE_SETDEVBITS; fis[1] = (1 << 6); fis[2] = tfd; fis[3] = error; if (fis[2] & ATA_S_ERROR) { p->err_cfis[0] = slot; p->err_cfis[2] = tfd; p->err_cfis[3] = error; memcpy(&p->err_cfis[4], cfis + 4, 16); } else { *(uint32_t *)(fis + 4) = (1 << slot); p->sact &= ~(1 << slot); } p->tfd &= ~0x77; p->tfd |= tfd; ahci_write_fis(p, FIS_TYPE_SETDEVBITS, fis); } static void ahci_write_fis_d2h(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t tfd) { uint8_t fis[20]; uint8_t error; error = (tfd >> 8) & 0xff; memset(fis, 0, sizeof(fis)); fis[0] = FIS_TYPE_REGD2H; fis[1] = (1 << 6); fis[2] = tfd & 0xff; fis[3] = error; fis[4] = cfis[4]; fis[5] = cfis[5]; fis[6] = cfis[6]; fis[7] = cfis[7]; fis[8] = cfis[8]; fis[9] = cfis[9]; fis[10] = cfis[10]; fis[11] = cfis[11]; fis[12] = cfis[12]; fis[13] = cfis[13]; if (fis[2] & ATA_S_ERROR) { p->err_cfis[0] = 0x80; p->err_cfis[2] = tfd & 0xff; p->err_cfis[3] = error; memcpy(&p->err_cfis[4], cfis + 4, 16); } else p->ci &= ~(1 << slot); p->tfd = tfd; ahci_write_fis(p, FIS_TYPE_REGD2H, fis); } static void ahci_write_fis_d2h_ncq(struct ahci_port *p, int slot) { uint8_t fis[20]; p->tfd = ATA_S_READY | ATA_S_DSC; memset(fis, 0, sizeof(fis)); fis[0] = FIS_TYPE_REGD2H; fis[1] = 0; /* No interrupt */ fis[2] = p->tfd; /* Status */ fis[3] = 0; /* No error */ p->ci &= ~(1 << slot); ahci_write_fis(p, FIS_TYPE_REGD2H, fis); } static void ahci_write_reset_fis_d2h(struct ahci_port *p) { uint8_t fis[20]; memset(fis, 0, sizeof(fis)); fis[0] = FIS_TYPE_REGD2H; fis[3] = 1; fis[4] = 1; if (p->atapi) { fis[5] = 0x14; fis[6] = 0xeb; } fis[12] = 1; ahci_write_fis(p, FIS_TYPE_REGD2H, fis); } static void ahci_check_stopped(struct ahci_port *p) { /* * If we are no longer processing the command list and nothing * is in-flight, clear the running bit, the current command * slot, the command issue and active bits. */ if (!(p->cmd & AHCI_P_CMD_ST)) { if (p->pending == 0) { p->ccs = 0; p->cmd &= ~(AHCI_P_CMD_CR | AHCI_P_CMD_CCS_MASK); p->ci = 0; p->sact = 0; p->waitforclear = 0; } } } static void ahci_port_stop(struct ahci_port *p) { struct ahci_ioreq *aior; uint8_t *cfis; int slot; int error; assert(pthread_mutex_isowned_np(&p->pr_sc->mtx)); TAILQ_FOREACH(aior, &p->iobhd, io_blist) { /* * Try to cancel the outstanding blockif request. 
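* If the cancel fails, the request is assumed to be executing in a
* blockif thread already; it is left on the busy list and the normal
* completion callback performs this same cleanup later.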
*/ error = blockif_cancel(p->bctx, &aior->io_req); if (error != 0) continue; slot = aior->slot; cfis = aior->cfis; if (cfis[2] == ATA_WRITE_FPDMA_QUEUED || cfis[2] == ATA_READ_FPDMA_QUEUED || cfis[2] == ATA_SEND_FPDMA_QUEUED) p->sact &= ~(1 << slot); /* NCQ */ else p->ci &= ~(1 << slot); /* * This command is now done. */ p->pending &= ~(1 << slot); /* * Delete the blockif request from the busy list */ TAILQ_REMOVE(&p->iobhd, aior, io_blist); /* * Move the blockif request back to the free list */ STAILQ_INSERT_TAIL(&p->iofhd, aior, io_flist); } ahci_check_stopped(p); } static void ahci_port_reset(struct ahci_port *pr) { pr->serr = 0; pr->sact = 0; pr->xfermode = ATA_UDMA6; pr->mult_sectors = 128; if (!pr->bctx) { pr->ssts = ATA_SS_DET_NO_DEVICE; pr->sig = 0xFFFFFFFF; pr->tfd = 0x7F; return; } pr->ssts = ATA_SS_DET_PHY_ONLINE | ATA_SS_IPM_ACTIVE; if (pr->sctl & ATA_SC_SPD_MASK) pr->ssts |= (pr->sctl & ATA_SC_SPD_MASK); else pr->ssts |= ATA_SS_SPD_GEN3; pr->tfd = (1 << 8) | ATA_S_DSC | ATA_S_DMA; if (!pr->atapi) { pr->sig = PxSIG_ATA; pr->tfd |= ATA_S_READY; } else pr->sig = PxSIG_ATAPI; ahci_write_reset_fis_d2h(pr); } static void ahci_reset(struct pci_ahci_softc *sc) { int i; sc->ghc = AHCI_GHC_AE; sc->is = 0; if (sc->lintr) { pci_lintr_deassert(sc->asc_pi); sc->lintr = 0; } for (i = 0; i < sc->ports; i++) { sc->port[i].ie = 0; sc->port[i].is = 0; sc->port[i].cmd = (AHCI_P_CMD_SUD | AHCI_P_CMD_POD); if (sc->port[i].bctx) sc->port[i].cmd |= AHCI_P_CMD_CPS; sc->port[i].sctl = 0; ahci_port_reset(&sc->port[i]); } } static void ata_string(uint8_t *dest, const char *src, int len) { int i; for (i = 0; i < len; i++) { if (*src) dest[i ^ 1] = *src++; else dest[i ^ 1] = ' '; } } static void atapi_string(uint8_t *dest, const char *src, int len) { int i; for (i = 0; i < len; i++) { if (*src) dest[i] = *src++; else dest[i] = ' '; } } /* * Build up the iovec based on the PRDT, 'done' and 'len'. */ static void ahci_build_iov(struct ahci_port *p, struct ahci_ioreq *aior, struct ahci_prdt_entry *prdt, uint16_t prdtl) { struct blockif_req *breq = &aior->io_req; uint32_t dbcsz, extra, left, skip, todo; int i, j; assert(aior->len >= aior->done); /* Copy part of PRDT between 'done' and 'len' bytes into the iov. */ skip = aior->done; left = aior->len - aior->done; todo = 0; for (i = 0, j = 0; i < prdtl && j < BLOCKIF_IOV_MAX && left > 0; i++, prdt++) { dbcsz = (prdt->dbc & DBCMASK) + 1; /* Skip already done part of the PRDT */ if (dbcsz <= skip) { skip -= dbcsz; continue; } dbcsz -= skip; if (dbcsz > left) dbcsz = left; breq->br_iov[j].iov_base = paddr_guest2host(ahci_ctx(p->pr_sc), prdt->dba + skip, dbcsz); breq->br_iov[j].iov_len = dbcsz; todo += dbcsz; left -= dbcsz; skip = 0; j++; } /* If we got limited by IOV length, round I/O down to sector size. 
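* Worked example (illustrative numbers only): with 512-byte sectors and
* todo = 4708 once all BLOCKIF_IOV_MAX entries are in use, extra becomes
* 4708 % 512 = 100; the loop below trims 100 bytes in total off the tail
* iov entries, and 'more' stays set so the remainder is issued from the
* completion callback.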
*/ if (j == BLOCKIF_IOV_MAX) { extra = todo % blockif_sectsz(p->bctx); todo -= extra; assert(todo > 0); while (extra > 0) { if (breq->br_iov[j - 1].iov_len > extra) { breq->br_iov[j - 1].iov_len -= extra; break; } extra -= breq->br_iov[j - 1].iov_len; j--; } } breq->br_iovcnt = j; breq->br_resid = todo; aior->done += todo; aior->more = (aior->done < aior->len && i < prdtl); } static void ahci_handle_rw(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t done) { struct ahci_ioreq *aior; struct blockif_req *breq; struct ahci_prdt_entry *prdt; struct ahci_cmd_hdr *hdr; uint64_t lba; uint32_t len; int err, first, ncq, readop; prdt = (struct ahci_prdt_entry *)(cfis + 0x80); hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); ncq = 0; readop = 1; first = (done == 0); if (cfis[2] == ATA_WRITE || cfis[2] == ATA_WRITE48 || cfis[2] == ATA_WRITE_MUL || cfis[2] == ATA_WRITE_MUL48 || cfis[2] == ATA_WRITE_DMA || cfis[2] == ATA_WRITE_DMA48 || cfis[2] == ATA_WRITE_FPDMA_QUEUED) readop = 0; if (cfis[2] == ATA_WRITE_FPDMA_QUEUED || cfis[2] == ATA_READ_FPDMA_QUEUED) { lba = ((uint64_t)cfis[10] << 40) | ((uint64_t)cfis[9] << 32) | ((uint64_t)cfis[8] << 24) | ((uint64_t)cfis[6] << 16) | ((uint64_t)cfis[5] << 8) | cfis[4]; len = cfis[11] << 8 | cfis[3]; if (!len) len = 65536; ncq = 1; } else if (cfis[2] == ATA_READ48 || cfis[2] == ATA_WRITE48 || cfis[2] == ATA_READ_MUL48 || cfis[2] == ATA_WRITE_MUL48 || cfis[2] == ATA_READ_DMA48 || cfis[2] == ATA_WRITE_DMA48) { lba = ((uint64_t)cfis[10] << 40) | ((uint64_t)cfis[9] << 32) | ((uint64_t)cfis[8] << 24) | ((uint64_t)cfis[6] << 16) | ((uint64_t)cfis[5] << 8) | cfis[4]; len = cfis[13] << 8 | cfis[12]; if (!len) len = 65536; } else { lba = ((cfis[7] & 0xf) << 24) | (cfis[6] << 16) | (cfis[5] << 8) | cfis[4]; len = cfis[12]; if (!len) len = 256; } lba *= blockif_sectsz(p->bctx); len *= blockif_sectsz(p->bctx); /* Pull request off free list */ aior = STAILQ_FIRST(&p->iofhd); assert(aior != NULL); STAILQ_REMOVE_HEAD(&p->iofhd, io_flist); aior->cfis = cfis; aior->slot = slot; aior->len = len; aior->done = done; aior->readop = readop; breq = &aior->io_req; breq->br_offset = lba + done; ahci_build_iov(p, aior, prdt, hdr->prdtl); /* Mark this command in-flight. */ p->pending |= 1 << slot; /* Stuff request onto busy list. */ TAILQ_INSERT_HEAD(&p->iobhd, aior, io_blist); if (ncq && first) ahci_write_fis_d2h_ncq(p, slot); if (readop) err = blockif_read(p->bctx, breq); else err = blockif_write(p->bctx, breq); assert(err == 0); } static void ahci_handle_flush(struct ahci_port *p, int slot, uint8_t *cfis) { struct ahci_ioreq *aior; struct blockif_req *breq; int err; /* * Pull request off free list */ aior = STAILQ_FIRST(&p->iofhd); assert(aior != NULL); STAILQ_REMOVE_HEAD(&p->iofhd, io_flist); aior->cfis = cfis; aior->slot = slot; aior->len = 0; aior->done = 0; aior->more = 0; breq = &aior->io_req; /* * Mark this command in-flight. 
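* The 'pending' bitmap is what keeps ahci_handle_port() from re-issuing
* this slot while the flush is outstanding, and ahci_check_stopped()
* waits for it to drain to zero before clearing AHCI_P_CMD_CR.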
*/ p->pending |= 1 << slot; /* * Stuff request onto busy list */ TAILQ_INSERT_HEAD(&p->iobhd, aior, io_blist); err = blockif_flush(p->bctx, breq); assert(err == 0); } static inline void read_prdt(struct ahci_port *p, int slot, uint8_t *cfis, void *buf, unsigned int size) { struct ahci_cmd_hdr *hdr; struct ahci_prdt_entry *prdt; uint8_t *to; unsigned int len; int i; hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); len = size; to = buf; prdt = (struct ahci_prdt_entry *)(cfis + 0x80); for (i = 0; i < hdr->prdtl && len; i++) { uint8_t *ptr; uint32_t dbcsz; unsigned int sublen; dbcsz = (prdt->dbc & DBCMASK) + 1; ptr = paddr_guest2host(ahci_ctx(p->pr_sc), prdt->dba, dbcsz); sublen = MIN(len, dbcsz); memcpy(to, ptr, sublen); len -= sublen; to += sublen; prdt++; } } static void ahci_handle_dsm_trim(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t done) { struct ahci_ioreq *aior; struct blockif_req *breq; uint8_t *entry; uint64_t elba; uint32_t len, elen; int err, first, ncq; uint8_t buf[512]; first = (done == 0); if (cfis[2] == ATA_DATA_SET_MANAGEMENT) { len = (uint16_t)cfis[13] << 8 | cfis[12]; len *= 512; ncq = 0; } else { /* ATA_SEND_FPDMA_QUEUED */ len = (uint16_t)cfis[11] << 8 | cfis[3]; len *= 512; ncq = 1; } read_prdt(p, slot, cfis, buf, sizeof(buf)); next: entry = &buf[done]; elba = ((uint64_t)entry[5] << 40) | ((uint64_t)entry[4] << 32) | ((uint64_t)entry[3] << 24) | ((uint64_t)entry[2] << 16) | ((uint64_t)entry[1] << 8) | entry[0]; elen = (uint16_t)entry[7] << 8 | entry[6]; done += 8; if (elen == 0) { if (done >= len) { if (ncq) { if (first) ahci_write_fis_d2h_ncq(p, slot); ahci_write_fis_sdb(p, slot, cfis, ATA_S_READY | ATA_S_DSC); } else { ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); } p->pending &= ~(1 << slot); ahci_check_stopped(p); if (!first) ahci_handle_port(p); return; } goto next; } /* * Pull request off free list */ aior = STAILQ_FIRST(&p->iofhd); assert(aior != NULL); STAILQ_REMOVE_HEAD(&p->iofhd, io_flist); aior->cfis = cfis; aior->slot = slot; aior->len = len; aior->done = done; aior->more = (len != done); breq = &aior->io_req; breq->br_offset = elba * blockif_sectsz(p->bctx); breq->br_resid = elen * blockif_sectsz(p->bctx); /* * Mark this command in-flight. 
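* Each TRIM range is issued as its own blockif_delete(); on completion
* ata_ioreq_cb() re-enters ahci_handle_dsm_trim() with the updated
* 'done' offset until every 8-byte LBA/length entry has been consumed.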
*/ p->pending |= 1 << slot; /* * Stuff request onto busy list */ TAILQ_INSERT_HEAD(&p->iobhd, aior, io_blist); if (ncq && first) ahci_write_fis_d2h_ncq(p, slot); err = blockif_delete(p->bctx, breq); assert(err == 0); } static inline void write_prdt(struct ahci_port *p, int slot, uint8_t *cfis, void *buf, unsigned int size) { struct ahci_cmd_hdr *hdr; struct ahci_prdt_entry *prdt; uint8_t *from; unsigned int len; int i; hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); len = size; from = buf; prdt = (struct ahci_prdt_entry *)(cfis + 0x80); for (i = 0; i < hdr->prdtl && len; i++) { uint8_t *ptr; uint32_t dbcsz; int sublen; dbcsz = (prdt->dbc & DBCMASK) + 1; ptr = paddr_guest2host(ahci_ctx(p->pr_sc), prdt->dba, dbcsz); sublen = MIN(len, dbcsz); memcpy(ptr, from, sublen); len -= sublen; from += sublen; prdt++; } hdr->prdbc = size - len; } static void ahci_checksum(uint8_t *buf, int size) { int i; uint8_t sum = 0; for (i = 0; i < size - 1; i++) sum += buf[i]; buf[size - 1] = 0x100 - sum; } static void ahci_handle_read_log(struct ahci_port *p, int slot, uint8_t *cfis) { struct ahci_cmd_hdr *hdr; uint32_t buf[128]; uint8_t *buf8 = (uint8_t *)buf; uint16_t *buf16 = (uint16_t *)buf; hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); if (p->atapi || hdr->prdtl == 0 || cfis[5] != 0 || cfis[9] != 0 || cfis[12] != 1 || cfis[13] != 0) { ahci_write_fis_d2h(p, slot, cfis, (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); return; } memset(buf, 0, sizeof(buf)); if (cfis[4] == 0x00) { /* Log directory */ buf16[0x00] = 1; /* Version -- 1 */ buf16[0x10] = 1; /* NCQ Command Error Log -- 1 page */ buf16[0x13] = 1; /* SATA NCQ Send and Receive Log -- 1 page */ } else if (cfis[4] == 0x10) { /* NCQ Command Error Log */ memcpy(buf8, p->err_cfis, sizeof(p->err_cfis)); ahci_checksum(buf8, sizeof(buf)); } else if (cfis[4] == 0x13) { /* SATA NCQ Send and Receive Log */ if (blockif_candelete(p->bctx) && !blockif_is_ro(p->bctx)) { buf[0x00] = 1; /* SFQ DSM supported */ buf[0x01] = 1; /* SFQ DSM TRIM supported */ } } else { ahci_write_fis_d2h(p, slot, cfis, (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); return; } if (cfis[2] == ATA_READ_LOG_EXT) ahci_write_fis_piosetup(p); write_prdt(p, slot, cfis, (void *)buf, sizeof(buf)); ahci_write_fis_d2h(p, slot, cfis, ATA_S_DSC | ATA_S_READY); } static void handle_identify(struct ahci_port *p, int slot, uint8_t *cfis) { struct ahci_cmd_hdr *hdr; hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); if (p->atapi || hdr->prdtl == 0) { ahci_write_fis_d2h(p, slot, cfis, (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); } else { ahci_write_fis_piosetup(p); write_prdt(p, slot, cfis, (void*)&p->ata_ident, sizeof(struct ata_params)); ahci_write_fis_d2h(p, slot, cfis, ATA_S_DSC | ATA_S_READY); } } static void ata_identify_init(struct ahci_port* p, int atapi) { struct ata_params* ata_ident = &p->ata_ident; if (atapi) { ata_ident->config = ATA_PROTO_ATAPI | ATA_ATAPI_TYPE_CDROM | ATA_ATAPI_REMOVABLE | ATA_DRQ_FAST; ata_ident->capabilities1 = ATA_SUPPORT_LBA | ATA_SUPPORT_DMA; ata_ident->capabilities2 = (1 << 14 | 1); ata_ident->atavalid = ATA_FLAG_64_70 | ATA_FLAG_88; ata_ident->obsolete62 = 0x3f; ata_ident->mwdmamodes = 7; if (p->xfermode & ATA_WDMA0) ata_ident->mwdmamodes |= (1 << ((p->xfermode & 7) + 8)); ata_ident->apiomodes = 3; ata_ident->mwdmamin = 0x0078; ata_ident->mwdmarec = 0x0078; ata_ident->pioblind = 0x0078; ata_ident->pioiordy = 0x0078; ata_ident->satacapabilities = (ATA_SATA_GEN1 | ATA_SATA_GEN2 | ATA_SATA_GEN3); 
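/*
 * Current-speed note: SStatus keeps the negotiated speed in bits 7:4,
 * while the IDENTIFY current-speed field expects it one bit position
 * lower, hence the shift by 3 below (e.g. Gen3: 0x30 >> 3 == 3 << 1).
 */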
ata_ident->satacapabilities2 = ((p->ssts & ATA_SS_SPD_MASK) >> 3); ata_ident->satasupport = ATA_SUPPORT_NCQ_STREAM; ata_ident->version_major = 0x3f0; ata_ident->support.command1 = (ATA_SUPPORT_POWERMGT | ATA_SUPPORT_PACKET | ATA_SUPPORT_RESET | ATA_SUPPORT_NOP); ata_ident->support.command2 = (1 << 14); ata_ident->support.extension = (1 << 14); ata_ident->enabled.command1 = (ATA_SUPPORT_POWERMGT | ATA_SUPPORT_PACKET | ATA_SUPPORT_RESET | ATA_SUPPORT_NOP); ata_ident->enabled.extension = (1 << 14); ata_ident->udmamodes = 0x7f; if (p->xfermode & ATA_UDMA0) ata_ident->udmamodes |= (1 << ((p->xfermode & 7) + 8)); ata_ident->transport_major = 0x1020; ata_ident->integrity = 0x00a5; } else { uint64_t sectors; int sectsz, psectsz, psectoff, candelete, ro; uint16_t cyl; uint8_t sech, heads; ro = blockif_is_ro(p->bctx); candelete = blockif_candelete(p->bctx); sectsz = blockif_sectsz(p->bctx); sectors = blockif_size(p->bctx) / sectsz; blockif_chs(p->bctx, &cyl, &heads, &sech); blockif_psectsz(p->bctx, &psectsz, &psectoff); ata_ident->config = ATA_DRQ_FAST; ata_ident->cylinders = cyl; ata_ident->heads = heads; ata_ident->sectors = sech; ata_ident->sectors_intr = (0x8000 | 128); ata_ident->tcg = 0; ata_ident->capabilities1 = ATA_SUPPORT_DMA | ATA_SUPPORT_LBA | ATA_SUPPORT_IORDY; ata_ident->capabilities2 = (1 << 14); ata_ident->atavalid = ATA_FLAG_64_70 | ATA_FLAG_88; if (p->mult_sectors) ata_ident->multi = (ATA_MULTI_VALID | p->mult_sectors); if (sectors <= 0x0fffffff) { ata_ident->lba_size_1 = sectors; ata_ident->lba_size_2 = (sectors >> 16); } else { ata_ident->lba_size_1 = 0xffff; ata_ident->lba_size_2 = 0x0fff; } ata_ident->mwdmamodes = 0x7; if (p->xfermode & ATA_WDMA0) ata_ident->mwdmamodes |= (1 << ((p->xfermode & 7) + 8)); ata_ident->apiomodes = 0x3; ata_ident->mwdmamin = 0x0078; ata_ident->mwdmarec = 0x0078; ata_ident->pioblind = 0x0078; ata_ident->pioiordy = 0x0078; ata_ident->support3 = 0; ata_ident->queue = 31; ata_ident->satacapabilities = (ATA_SATA_GEN1 | ATA_SATA_GEN2 | ATA_SATA_GEN3 | ATA_SUPPORT_NCQ); ata_ident->satacapabilities2 = (ATA_SUPPORT_RCVSND_FPDMA_QUEUED | (p->ssts & ATA_SS_SPD_MASK) >> 3); ata_ident->version_major = 0x3f0; ata_ident->version_minor = 0x28; ata_ident->support.command1 = (ATA_SUPPORT_POWERMGT | ATA_SUPPORT_WRITECACHE | ATA_SUPPORT_LOOKAHEAD | ATA_SUPPORT_NOP); ata_ident->support.command2 = (ATA_SUPPORT_ADDRESS48 | ATA_SUPPORT_FLUSHCACHE | ATA_SUPPORT_FLUSHCACHE48 | 1 << 14); ata_ident->support.extension = (1 << 14); ata_ident->enabled.command1 = (ATA_SUPPORT_POWERMGT | ATA_SUPPORT_WRITECACHE | ATA_SUPPORT_LOOKAHEAD | ATA_SUPPORT_NOP); ata_ident->enabled.command2 = (ATA_SUPPORT_ADDRESS48 | ATA_SUPPORT_FLUSHCACHE | ATA_SUPPORT_FLUSHCACHE48 | 1 << 15); ata_ident->enabled.extension = (1 << 14); ata_ident->udmamodes = 0x7f; if (p->xfermode & ATA_UDMA0) ata_ident->udmamodes |= (1 << ((p->xfermode & 7) + 8)); ata_ident->lba_size48_1 = sectors; ata_ident->lba_size48_2 = (sectors >> 16); ata_ident->lba_size48_3 = (sectors >> 32); ata_ident->lba_size48_4 = (sectors >> 48); if (candelete && !ro) { ata_ident->support3 |= ATA_SUPPORT_RZAT | ATA_SUPPORT_DRAT; ata_ident->max_dsm_blocks = 1; ata_ident->support_dsm = ATA_SUPPORT_DSM_TRIM; } ata_ident->pss = ATA_PSS_VALID_VALUE; ata_ident->lsalign = 0x4000; if (psectsz > sectsz) { ata_ident->pss |= ATA_PSS_MULTLS; ata_ident->pss |= ffsl(psectsz / sectsz) - 1; ata_ident->lsalign |= (psectoff / sectsz); } if (sectsz > 512) { ata_ident->pss |= ATA_PSS_LSSABOVE512; ata_ident->lss_1 = sectsz / 2; ata_ident->lss_2 = ((sectsz / 2) >> 16); 
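/*
 * Example with illustrative numbers: a 4096-byte logical sector yields
 * lss_1 = 2048, i.e. the logical sector size expressed in 16-bit words,
 * split across the two IDENTIFY words above.
 */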
} ata_ident->support2 = (ATA_SUPPORT_RWLOGDMAEXT | 1 << 14); ata_ident->enabled2 = (ATA_SUPPORT_RWLOGDMAEXT | 1 << 14); ata_ident->transport_major = 0x1020; ata_ident->integrity = 0x00a5; } ahci_checksum((uint8_t*)ata_ident, sizeof(struct ata_params)); } static void handle_atapi_identify(struct ahci_port *p, int slot, uint8_t *cfis) { if (!p->atapi) { ahci_write_fis_d2h(p, slot, cfis, (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); } else { ahci_write_fis_piosetup(p); write_prdt(p, slot, cfis, (void *)&p->ata_ident, sizeof(struct ata_params)); ahci_write_fis_d2h(p, slot, cfis, ATA_S_DSC | ATA_S_READY); } } static void atapi_inquiry(struct ahci_port *p, int slot, uint8_t *cfis) { uint8_t buf[36]; uint8_t *acmd; unsigned int len; uint32_t tfd; acmd = cfis + 0x40; if (acmd[1] & 1) { /* VPD */ if (acmd[2] == 0) { /* Supported VPD pages */ buf[0] = 0x05; buf[1] = 0; buf[2] = 0; buf[3] = 1; buf[4] = 0; len = 4 + buf[3]; } else { p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; p->asc = 0x24; tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, tfd); return; } } else { buf[0] = 0x05; buf[1] = 0x80; buf[2] = 0x00; buf[3] = 0x21; buf[4] = 31; buf[5] = 0; buf[6] = 0; buf[7] = 0; atapi_string(buf + 8, "BHYVE", 8); atapi_string(buf + 16, "BHYVE DVD-ROM", 16); atapi_string(buf + 32, "001", 4); len = sizeof(buf); } if (len > acmd[4]) len = acmd[4]; cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; write_prdt(p, slot, cfis, buf, len); ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); } static void atapi_read_capacity(struct ahci_port *p, int slot, uint8_t *cfis) { uint8_t buf[8]; uint64_t sectors; sectors = blockif_size(p->bctx) / 2048; be32enc(buf, sectors - 1); be32enc(buf + 4, 2048); cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; write_prdt(p, slot, cfis, buf, sizeof(buf)); ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); } static void atapi_read_toc(struct ahci_port *p, int slot, uint8_t *cfis) { uint8_t *acmd; uint8_t format; unsigned int len; acmd = cfis + 0x40; len = be16dec(acmd + 7); format = acmd[9] >> 6; switch (format) { case 0: { size_t size; int msf; uint64_t sectors; uint8_t start_track, buf[20], *bp; msf = (acmd[1] >> 1) & 1; start_track = acmd[6]; if (start_track > 1 && start_track != 0xaa) { uint32_t tfd; p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; p->asc = 0x24; tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, tfd); return; } bp = buf + 2; *bp++ = 1; *bp++ = 1; if (start_track <= 1) { *bp++ = 0; *bp++ = 0x14; *bp++ = 1; *bp++ = 0; if (msf) { *bp++ = 0; lba_to_msf(bp, 0); bp += 3; } else { *bp++ = 0; *bp++ = 0; *bp++ = 0; *bp++ = 0; } } *bp++ = 0; *bp++ = 0x14; *bp++ = 0xaa; *bp++ = 0; sectors = blockif_size(p->bctx) / blockif_sectsz(p->bctx); sectors >>= 2; if (msf) { *bp++ = 0; lba_to_msf(bp, sectors); bp += 3; } else { be32enc(bp, sectors); bp += 4; } size = bp - buf; be16enc(buf, size - 2); if (len > size) len = size; write_prdt(p, slot, cfis, buf, len); cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); break; } case 1: { uint8_t buf[12]; memset(buf, 0, sizeof(buf)); buf[1] = 0xa; buf[2] = 0x1; buf[3] = 0x1; if (len > sizeof(buf)) len = sizeof(buf); write_prdt(p, slot, cfis, buf, len); cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); break; } case 2: { size_t size; int msf; 
uint64_t sectors; uint8_t *bp, buf[50]; msf = (acmd[1] >> 1) & 1; bp = buf + 2; *bp++ = 1; *bp++ = 1; *bp++ = 1; *bp++ = 0x14; *bp++ = 0; *bp++ = 0xa0; *bp++ = 0; *bp++ = 0; *bp++ = 0; *bp++ = 0; *bp++ = 1; *bp++ = 0; *bp++ = 0; *bp++ = 1; *bp++ = 0x14; *bp++ = 0; *bp++ = 0xa1; *bp++ = 0; *bp++ = 0; *bp++ = 0; *bp++ = 0; *bp++ = 1; *bp++ = 0; *bp++ = 0; *bp++ = 1; *bp++ = 0x14; *bp++ = 0; *bp++ = 0xa2; *bp++ = 0; *bp++ = 0; *bp++ = 0; sectors = blockif_size(p->bctx) / blockif_sectsz(p->bctx); sectors >>= 2; if (msf) { *bp++ = 0; lba_to_msf(bp, sectors); bp += 3; } else { be32enc(bp, sectors); bp += 4; } *bp++ = 1; *bp++ = 0x14; *bp++ = 0; *bp++ = 1; *bp++ = 0; *bp++ = 0; *bp++ = 0; if (msf) { *bp++ = 0; lba_to_msf(bp, 0); bp += 3; } else { *bp++ = 0; *bp++ = 0; *bp++ = 0; *bp++ = 0; } size = bp - buf; be16enc(buf, size - 2); if (len > size) len = size; write_prdt(p, slot, cfis, buf, len); cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); break; } default: { uint32_t tfd; p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; p->asc = 0x24; tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, tfd); break; } } } static void atapi_report_luns(struct ahci_port *p, int slot, uint8_t *cfis) { uint8_t buf[16]; memset(buf, 0, sizeof(buf)); buf[3] = 8; cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; write_prdt(p, slot, cfis, buf, sizeof(buf)); ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); } static void atapi_read(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t done) { struct ahci_ioreq *aior; struct ahci_cmd_hdr *hdr; struct ahci_prdt_entry *prdt; struct blockif_req *breq; uint8_t *acmd; uint64_t lba; uint32_t len; int err; acmd = cfis + 0x40; hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); prdt = (struct ahci_prdt_entry *)(cfis + 0x80); lba = be32dec(acmd + 2); if (acmd[0] == READ_10) len = be16dec(acmd + 7); else len = be32dec(acmd + 6); if (len == 0) { cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); } lba *= 2048; len *= 2048; /* * Pull request off free list */ aior = STAILQ_FIRST(&p->iofhd); assert(aior != NULL); STAILQ_REMOVE_HEAD(&p->iofhd, io_flist); aior->cfis = cfis; aior->slot = slot; aior->len = len; aior->done = done; aior->readop = 1; breq = &aior->io_req; breq->br_offset = lba + done; ahci_build_iov(p, aior, prdt, hdr->prdtl); /* Mark this command in-flight. */ p->pending |= 1 << slot; /* Stuff request onto busy list. 
*/ TAILQ_INSERT_HEAD(&p->iobhd, aior, io_blist); err = blockif_read(p->bctx, breq); assert(err == 0); } static void atapi_request_sense(struct ahci_port *p, int slot, uint8_t *cfis) { uint8_t buf[64]; uint8_t *acmd; unsigned int len; acmd = cfis + 0x40; len = acmd[4]; if (len > sizeof(buf)) len = sizeof(buf); memset(buf, 0, len); buf[0] = 0x70 | (1 << 7); buf[2] = p->sense_key; buf[7] = 10; buf[12] = p->asc; write_prdt(p, slot, cfis, buf, len); cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); } static void atapi_start_stop_unit(struct ahci_port *p, int slot, uint8_t *cfis) { uint8_t *acmd = cfis + 0x40; uint32_t tfd; switch (acmd[4] & 3) { case 0: case 1: case 3: cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; tfd = ATA_S_READY | ATA_S_DSC; break; case 2: /* TODO eject media */ cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; p->asc = 0x53; tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; break; } ahci_write_fis_d2h(p, slot, cfis, tfd); } static void atapi_mode_sense(struct ahci_port *p, int slot, uint8_t *cfis) { uint8_t *acmd; uint32_t tfd; uint8_t pc, code; unsigned int len; acmd = cfis + 0x40; len = be16dec(acmd + 7); pc = acmd[2] >> 6; code = acmd[2] & 0x3f; switch (pc) { case 0: switch (code) { case MODEPAGE_RW_ERROR_RECOVERY: { uint8_t buf[16]; if (len > sizeof(buf)) len = sizeof(buf); memset(buf, 0, sizeof(buf)); be16enc(buf, 16 - 2); buf[2] = 0x70; buf[8] = 0x01; buf[9] = 16 - 10; buf[11] = 0x05; write_prdt(p, slot, cfis, buf, len); tfd = ATA_S_READY | ATA_S_DSC; break; } case MODEPAGE_CD_CAPABILITIES: { uint8_t buf[30]; if (len > sizeof(buf)) len = sizeof(buf); memset(buf, 0, sizeof(buf)); be16enc(buf, 30 - 2); buf[2] = 0x70; buf[8] = 0x2A; buf[9] = 30 - 10; buf[10] = 0x08; buf[12] = 0x71; be16enc(&buf[18], 2); be16enc(&buf[20], 512); write_prdt(p, slot, cfis, buf, len); tfd = ATA_S_READY | ATA_S_DSC; break; } default: goto error; break; } break; case 3: p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; p->asc = 0x39; tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; break; error: case 1: case 2: p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; p->asc = 0x24; tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; break; } cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, tfd); } static void atapi_get_event_status_notification(struct ahci_port *p, int slot, uint8_t *cfis) { uint8_t *acmd; uint32_t tfd; acmd = cfis + 0x40; /* we don't support asynchronous operation */ if (!(acmd[1] & 1)) { p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; p->asc = 0x24; tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; } else { uint8_t buf[8]; unsigned int len; len = be16dec(acmd + 7); if (len > sizeof(buf)) len = sizeof(buf); memset(buf, 0, sizeof(buf)); be16enc(buf, 8 - 2); buf[2] = 0x04; buf[3] = 0x10; buf[5] = 0x02; write_prdt(p, slot, cfis, buf, len); tfd = ATA_S_READY | ATA_S_DSC; } cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, tfd); } static void handle_packet_cmd(struct ahci_port *p, int slot, uint8_t *cfis) { uint8_t *acmd; acmd = cfis + 0x40; #ifdef AHCI_DEBUG { int i; DPRINTF("ACMD:"); for (i = 0; i < 16; i++) DPRINTF("%02x ", acmd[i]); DPRINTF(""); } #endif switch (acmd[0]) { case TEST_UNIT_READY: cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); break; case INQUIRY: atapi_inquiry(p, slot, cfis); break; case READ_CAPACITY: atapi_read_capacity(p, slot, cfis); 
break; case PREVENT_ALLOW: /* TODO */ cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); break; case READ_TOC: atapi_read_toc(p, slot, cfis); break; case REPORT_LUNS: atapi_report_luns(p, slot, cfis); break; case READ_10: case READ_12: atapi_read(p, slot, cfis, 0); break; case REQUEST_SENSE: atapi_request_sense(p, slot, cfis); break; case START_STOP_UNIT: atapi_start_stop_unit(p, slot, cfis); break; case MODE_SENSE_10: atapi_mode_sense(p, slot, cfis); break; case GET_EVENT_STATUS_NOTIFICATION: atapi_get_event_status_notification(p, slot, cfis); break; default: cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; p->asc = 0x20; ahci_write_fis_d2h(p, slot, cfis, (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR); break; } } static void ahci_handle_cmd(struct ahci_port *p, int slot, uint8_t *cfis) { p->tfd |= ATA_S_BUSY; switch (cfis[2]) { case ATA_ATA_IDENTIFY: handle_identify(p, slot, cfis); break; case ATA_SETFEATURES: { switch (cfis[3]) { case ATA_SF_ENAB_SATA_SF: switch (cfis[12]) { case ATA_SATA_SF_AN: p->tfd = ATA_S_DSC | ATA_S_READY; break; default: p->tfd = ATA_S_ERROR | ATA_S_READY; p->tfd |= (ATA_ERROR_ABORT << 8); break; } break; case ATA_SF_ENAB_WCACHE: case ATA_SF_DIS_WCACHE: case ATA_SF_ENAB_RCACHE: case ATA_SF_DIS_RCACHE: p->tfd = ATA_S_DSC | ATA_S_READY; break; case ATA_SF_SETXFER: { switch (cfis[12] & 0xf8) { case ATA_PIO: case ATA_PIO0: break; case ATA_WDMA0: case ATA_UDMA0: p->xfermode = (cfis[12] & 0x7); break; } p->tfd = ATA_S_DSC | ATA_S_READY; break; } default: p->tfd = ATA_S_ERROR | ATA_S_READY; p->tfd |= (ATA_ERROR_ABORT << 8); break; } ahci_write_fis_d2h(p, slot, cfis, p->tfd); break; } case ATA_SET_MULTI: if (cfis[12] != 0 && (cfis[12] > 128 || (cfis[12] & (cfis[12] - 1)))) { p->tfd = ATA_S_ERROR | ATA_S_READY; p->tfd |= (ATA_ERROR_ABORT << 8); } else { p->mult_sectors = cfis[12]; p->tfd = ATA_S_DSC | ATA_S_READY; } ahci_write_fis_d2h(p, slot, cfis, p->tfd); break; case ATA_READ: case ATA_WRITE: case ATA_READ48: case ATA_WRITE48: case ATA_READ_MUL: case ATA_WRITE_MUL: case ATA_READ_MUL48: case ATA_WRITE_MUL48: case ATA_READ_DMA: case ATA_WRITE_DMA: case ATA_READ_DMA48: case ATA_WRITE_DMA48: case ATA_READ_FPDMA_QUEUED: case ATA_WRITE_FPDMA_QUEUED: ahci_handle_rw(p, slot, cfis, 0); break; case ATA_FLUSHCACHE: case ATA_FLUSHCACHE48: ahci_handle_flush(p, slot, cfis); break; case ATA_DATA_SET_MANAGEMENT: if (cfis[11] == 0 && cfis[3] == ATA_DSM_TRIM && cfis[13] == 0 && cfis[12] == 1) { ahci_handle_dsm_trim(p, slot, cfis, 0); break; } ahci_write_fis_d2h(p, slot, cfis, (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); break; case ATA_SEND_FPDMA_QUEUED: if ((cfis[13] & 0x1f) == ATA_SFPDMA_DSM && cfis[17] == 0 && cfis[16] == ATA_DSM_TRIM && cfis[11] == 0 && cfis[3] == 1) { ahci_handle_dsm_trim(p, slot, cfis, 0); break; } ahci_write_fis_d2h(p, slot, cfis, (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); break; case ATA_READ_LOG_EXT: case ATA_READ_LOG_DMA_EXT: ahci_handle_read_log(p, slot, cfis); break; case ATA_SECURITY_FREEZE_LOCK: case ATA_SMART_CMD: case ATA_NOP: ahci_write_fis_d2h(p, slot, cfis, (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); break; case ATA_CHECK_POWER_MODE: cfis[12] = 0xff; /* always on */ ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); break; case ATA_STANDBY_CMD: case ATA_STANDBY_IMMEDIATE: case ATA_IDLE_CMD: case ATA_IDLE_IMMEDIATE: case ATA_SLEEP: case ATA_READ_VERIFY: case ATA_READ_VERIFY48: ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | 
ATA_S_DSC); break; case ATA_ATAPI_IDENTIFY: handle_atapi_identify(p, slot, cfis); break; case ATA_PACKET_CMD: if (!p->atapi) { ahci_write_fis_d2h(p, slot, cfis, (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); } else handle_packet_cmd(p, slot, cfis); break; default: WPRINTF("Unsupported cmd:%02x", cfis[2]); ahci_write_fis_d2h(p, slot, cfis, (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); break; } } static void ahci_handle_slot(struct ahci_port *p, int slot) { struct ahci_cmd_hdr *hdr; #ifdef AHCI_DEBUG struct ahci_prdt_entry *prdt; #endif struct pci_ahci_softc *sc; uint8_t *cfis; #ifdef AHCI_DEBUG int cfl, i; #endif sc = p->pr_sc; hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); #ifdef AHCI_DEBUG cfl = (hdr->flags & 0x1f) * 4; #endif cfis = paddr_guest2host(ahci_ctx(sc), hdr->ctba, 0x80 + hdr->prdtl * sizeof(struct ahci_prdt_entry)); #ifdef AHCI_DEBUG prdt = (struct ahci_prdt_entry *)(cfis + 0x80); DPRINTF("cfis:"); for (i = 0; i < cfl; i++) { if (i % 10 == 0) DPRINTF(""); DPRINTF("%02x ", cfis[i]); } DPRINTF(""); for (i = 0; i < hdr->prdtl; i++) { DPRINTF("%d@%08"PRIx64"", prdt->dbc & 0x3fffff, prdt->dba); prdt++; } #endif if (cfis[0] != FIS_TYPE_REGH2D) { WPRINTF("Not a H2D FIS:%02x", cfis[0]); return; } if (cfis[1] & 0x80) { ahci_handle_cmd(p, slot, cfis); } else { if (cfis[15] & (1 << 2)) p->reset = 1; else if (p->reset) { p->reset = 0; ahci_port_reset(p); } p->ci &= ~(1 << slot); } } static void ahci_handle_port(struct ahci_port *p) { if (!(p->cmd & AHCI_P_CMD_ST)) return; /* * Search for any new commands to issue ignoring those that * are already in-flight. Stop if device is busy or in error. */ for (; (p->ci & ~p->pending) != 0; p->ccs = ((p->ccs + 1) & 31)) { if ((p->tfd & (ATA_S_BUSY | ATA_S_DRQ)) != 0) break; if (p->waitforclear) break; if ((p->ci & ~p->pending & (1 << p->ccs)) != 0) { p->cmd &= ~AHCI_P_CMD_CCS_MASK; p->cmd |= p->ccs << AHCI_P_CMD_CCS_SHIFT; ahci_handle_slot(p, p->ccs); } } } /* * blockif callback routine - this runs in the context of the blockif * i/o thread, so the mutex needs to be acquired. */ static void ata_ioreq_cb(struct blockif_req *br, int err) { struct ahci_cmd_hdr *hdr; struct ahci_ioreq *aior; struct ahci_port *p; struct pci_ahci_softc *sc; uint32_t tfd; uint8_t *cfis; int slot, ncq, dsm; DPRINTF("%s %d", __func__, err); ncq = dsm = 0; aior = br->br_param; p = aior->io_pr; cfis = aior->cfis; slot = aior->slot; sc = p->pr_sc; hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); if (cfis[2] == ATA_WRITE_FPDMA_QUEUED || cfis[2] == ATA_READ_FPDMA_QUEUED || cfis[2] == ATA_SEND_FPDMA_QUEUED) ncq = 1; if (cfis[2] == ATA_DATA_SET_MANAGEMENT || (cfis[2] == ATA_SEND_FPDMA_QUEUED && (cfis[13] & 0x1f) == ATA_SFPDMA_DSM)) dsm = 1; pthread_mutex_lock(&sc->mtx); /* * Delete the blockif request from the busy list */ TAILQ_REMOVE(&p->iobhd, aior, io_blist); /* * Move the blockif request back to the free list */ STAILQ_INSERT_TAIL(&p->iofhd, aior, io_flist); if (!err) hdr->prdbc = aior->done; if (!err && aior->more) { if (dsm) ahci_handle_dsm_trim(p, slot, cfis, aior->done); else ahci_handle_rw(p, slot, cfis, aior->done); goto out; } if (!err) tfd = ATA_S_READY | ATA_S_DSC; else tfd = (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR; if (ncq) ahci_write_fis_sdb(p, slot, cfis, tfd); else ahci_write_fis_d2h(p, slot, cfis, tfd); /* * This command is now complete. 
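* Completion also re-runs ahci_handle_port() below, so commands that
* were queued behind this slot (or stalled on a busy TFD) get issued
* now.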
*/ p->pending &= ~(1 << slot); ahci_check_stopped(p); ahci_handle_port(p); out: pthread_mutex_unlock(&sc->mtx); DPRINTF("%s exit", __func__); } static void atapi_ioreq_cb(struct blockif_req *br, int err) { struct ahci_cmd_hdr *hdr; struct ahci_ioreq *aior; struct ahci_port *p; struct pci_ahci_softc *sc; uint8_t *cfis; uint32_t tfd; int slot; DPRINTF("%s %d", __func__, err); aior = br->br_param; p = aior->io_pr; cfis = aior->cfis; slot = aior->slot; sc = p->pr_sc; hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + aior->slot * AHCI_CL_SIZE); pthread_mutex_lock(&sc->mtx); /* * Delete the blockif request from the busy list */ TAILQ_REMOVE(&p->iobhd, aior, io_blist); /* * Move the blockif request back to the free list */ STAILQ_INSERT_TAIL(&p->iofhd, aior, io_flist); if (!err) hdr->prdbc = aior->done; if (!err && aior->more) { atapi_read(p, slot, cfis, aior->done); goto out; } if (!err) { tfd = ATA_S_READY | ATA_S_DSC; } else { p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; p->asc = 0x21; tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; } cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, tfd); /* * This command is now complete. */ p->pending &= ~(1 << slot); ahci_check_stopped(p); ahci_handle_port(p); out: pthread_mutex_unlock(&sc->mtx); DPRINTF("%s exit", __func__); } static void pci_ahci_ioreq_init(struct ahci_port *pr) { struct ahci_ioreq *vr; int i; pr->ioqsz = blockif_queuesz(pr->bctx); pr->ioreq = calloc(pr->ioqsz, sizeof(struct ahci_ioreq)); STAILQ_INIT(&pr->iofhd); /* * Add all i/o request entries to the free queue */ for (i = 0; i < pr->ioqsz; i++) { vr = &pr->ioreq[i]; vr->io_pr = pr; if (!pr->atapi) vr->io_req.br_callback = ata_ioreq_cb; else vr->io_req.br_callback = atapi_ioreq_cb; vr->io_req.br_param = vr; STAILQ_INSERT_TAIL(&pr->iofhd, vr, io_flist); } TAILQ_INIT(&pr->iobhd); } static void pci_ahci_port_write(struct pci_ahci_softc *sc, uint64_t offset, uint64_t value) { int port = (offset - AHCI_OFFSET) / AHCI_STEP; offset = (offset - AHCI_OFFSET) % AHCI_STEP; struct ahci_port *p = &sc->port[port]; DPRINTF("pci_ahci_port %d: write offset 0x%"PRIx64" value 0x%"PRIx64"", port, offset, value); switch (offset) { case AHCI_P_CLB: p->clb = value; break; case AHCI_P_CLBU: p->clbu = value; break; case AHCI_P_FB: p->fb = value; break; case AHCI_P_FBU: p->fbu = value; break; case AHCI_P_IS: p->is &= ~value; ahci_port_intr(p); break; case AHCI_P_IE: p->ie = value & 0xFDC000FF; ahci_port_intr(p); break; case AHCI_P_CMD: { p->cmd &= ~(AHCI_P_CMD_ST | AHCI_P_CMD_SUD | AHCI_P_CMD_POD | AHCI_P_CMD_CLO | AHCI_P_CMD_FRE | AHCI_P_CMD_APSTE | AHCI_P_CMD_ATAPI | AHCI_P_CMD_DLAE | AHCI_P_CMD_ALPE | AHCI_P_CMD_ASP | AHCI_P_CMD_ICC_MASK); p->cmd |= (AHCI_P_CMD_ST | AHCI_P_CMD_SUD | AHCI_P_CMD_POD | AHCI_P_CMD_CLO | AHCI_P_CMD_FRE | AHCI_P_CMD_APSTE | AHCI_P_CMD_ATAPI | AHCI_P_CMD_DLAE | AHCI_P_CMD_ALPE | AHCI_P_CMD_ASP | AHCI_P_CMD_ICC_MASK) & value; if (!(value & AHCI_P_CMD_ST)) { ahci_port_stop(p); } else { uint64_t clb; p->cmd |= AHCI_P_CMD_CR; clb = (uint64_t)p->clbu << 32 | p->clb; p->cmd_lst = paddr_guest2host(ahci_ctx(sc), clb, AHCI_CL_SIZE * AHCI_MAX_SLOTS); } if (value & AHCI_P_CMD_FRE) { uint64_t fb; p->cmd |= AHCI_P_CMD_FR; fb = (uint64_t)p->fbu << 32 | p->fb; /* we don't support FBSCP, so rfis size is 256Bytes */ p->rfis = paddr_guest2host(ahci_ctx(sc), fb, 256); } else { p->cmd &= ~AHCI_P_CMD_FR; } if (value & AHCI_P_CMD_CLO) { p->tfd &= ~(ATA_S_BUSY | ATA_S_DRQ); p->cmd &= ~AHCI_P_CMD_CLO; } if (value & AHCI_P_CMD_ICC_MASK) { p->cmd &= ~AHCI_P_CMD_ICC_MASK; } 
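/*
 * Interface power-state transitions are treated as completing
 * instantly here, so the ICC field always reads back as zero (idle).
 */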
ahci_handle_port(p); break; } case AHCI_P_TFD: case AHCI_P_SIG: case AHCI_P_SSTS: WPRINTF("pci_ahci_port: read only registers 0x%"PRIx64"", offset); break; case AHCI_P_SCTL: p->sctl = value; if (!(p->cmd & AHCI_P_CMD_ST)) { if (value & ATA_SC_DET_RESET) ahci_port_reset(p); } break; case AHCI_P_SERR: p->serr &= ~value; break; case AHCI_P_SACT: p->sact |= value; break; case AHCI_P_CI: p->ci |= value; ahci_handle_port(p); break; case AHCI_P_SNTF: case AHCI_P_FBS: default: break; } } static void pci_ahci_host_write(struct pci_ahci_softc *sc, uint64_t offset, uint64_t value) { DPRINTF("pci_ahci_host: write offset 0x%"PRIx64" value 0x%"PRIx64"", offset, value); switch (offset) { case AHCI_CAP: case AHCI_PI: case AHCI_VS: case AHCI_CAP2: DPRINTF("pci_ahci_host: read only registers 0x%"PRIx64"", offset); break; case AHCI_GHC: if (value & AHCI_GHC_HR) { ahci_reset(sc); break; } if (value & AHCI_GHC_IE) sc->ghc |= AHCI_GHC_IE; else sc->ghc &= ~AHCI_GHC_IE; ahci_generate_intr(sc, 0xffffffff); break; case AHCI_IS: sc->is &= ~value; ahci_generate_intr(sc, value); break; default: break; } } static void pci_ahci_write(struct vmctx *ctx __unused, int vcpu __unused, struct pci_devinst *pi, int baridx, uint64_t offset, int size, uint64_t value) { struct pci_ahci_softc *sc = pi->pi_arg; assert(baridx == 5); assert((offset % 4) == 0 && size == 4); pthread_mutex_lock(&sc->mtx); if (offset < AHCI_OFFSET) pci_ahci_host_write(sc, offset, value); else if (offset < (uint64_t)AHCI_OFFSET + sc->ports * AHCI_STEP) pci_ahci_port_write(sc, offset, value); else WPRINTF("pci_ahci: unknown i/o write offset 0x%"PRIx64"", offset); pthread_mutex_unlock(&sc->mtx); } static uint64_t pci_ahci_host_read(struct pci_ahci_softc *sc, uint64_t offset) { uint32_t value; switch (offset) { case AHCI_CAP: case AHCI_GHC: case AHCI_IS: case AHCI_PI: case AHCI_VS: case AHCI_CCCC: case AHCI_CCCP: case AHCI_EM_LOC: case AHCI_EM_CTL: case AHCI_CAP2: { uint32_t *p = &sc->cap; p += (offset - AHCI_CAP) / sizeof(uint32_t); value = *p; break; } default: value = 0; break; } DPRINTF("pci_ahci_host: read offset 0x%"PRIx64" value 0x%x", offset, value); return (value); } static uint64_t pci_ahci_port_read(struct pci_ahci_softc *sc, uint64_t offset) { uint32_t value; int port = (offset - AHCI_OFFSET) / AHCI_STEP; offset = (offset - AHCI_OFFSET) % AHCI_STEP; switch (offset) { case AHCI_P_CLB: case AHCI_P_CLBU: case AHCI_P_FB: case AHCI_P_FBU: case AHCI_P_IS: case AHCI_P_IE: case AHCI_P_CMD: case AHCI_P_TFD: case AHCI_P_SIG: case AHCI_P_SSTS: case AHCI_P_SCTL: case AHCI_P_SERR: case AHCI_P_SACT: case AHCI_P_CI: case AHCI_P_SNTF: case AHCI_P_FBS: { uint32_t *p= &sc->port[port].clb; p += (offset - AHCI_P_CLB) / sizeof(uint32_t); value = *p; break; } default: value = 0; break; } DPRINTF("pci_ahci_port %d: read offset 0x%"PRIx64" value 0x%x", port, offset, value); return value; } static uint64_t pci_ahci_read(struct vmctx *ctx __unused, int vcpu __unused, struct pci_devinst *pi, int baridx, uint64_t regoff, int size) { struct pci_ahci_softc *sc = pi->pi_arg; uint64_t offset; uint32_t value; assert(baridx == 5); assert(size == 1 || size == 2 || size == 4); assert((regoff & (size - 1)) == 0); pthread_mutex_lock(&sc->mtx); offset = regoff & ~0x3; /* round down to a multiple of 4 bytes */ if (offset < AHCI_OFFSET) value = pci_ahci_host_read(sc, offset); else if (offset < (uint64_t)AHCI_OFFSET + sc->ports * AHCI_STEP) value = pci_ahci_port_read(sc, offset); else { value = 0; WPRINTF("pci_ahci: unknown i/o read offset 0x%"PRIx64"", regoff); } value >>= 8 * (regoff & 
0x3); pthread_mutex_unlock(&sc->mtx); return (value); } /* * Each AHCI controller has a "port" node which contains nodes for * each port named after the decimal number of the port (no leading * zeroes). Port nodes contain a "type" ("hd" or "cd"), as well as * options for blockif. For example: * * pci.0.1.0 * .device="ahci" * .port * .0 * .type="hd" * .path="/path/to/image" */ static int pci_ahci_legacy_config_port(nvlist_t *nvl, int port, const char *type, const char *opts) { char node_name[sizeof("XX")]; nvlist_t *port_nvl; snprintf(node_name, sizeof(node_name), "%d", port); port_nvl = create_relative_config_node(nvl, node_name); set_config_value_node(port_nvl, "type", type); return (blockif_legacy_config(port_nvl, opts)); } static int pci_ahci_legacy_config(nvlist_t *nvl, const char *opts) { nvlist_t *ports_nvl; const char *type; char *next, *next2, *str, *tofree; int p, ret; if (opts == NULL) return (0); ports_nvl = create_relative_config_node(nvl, "port"); ret = 1; tofree = str = strdup(opts); for (p = 0; p < MAX_PORTS && str != NULL; p++, str = next) { /* Identify and cut off type of present port. */ if (strncmp(str, "hd:", 3) == 0) { type = "hd"; str += 3; } else if (strncmp(str, "cd:", 3) == 0) { type = "cd"; str += 3; } else type = NULL; /* Find and cut off the next port options. */ next = strstr(str, ",hd:"); next2 = strstr(str, ",cd:"); if (next == NULL || (next2 != NULL && next2 < next)) next = next2; if (next != NULL) { next[0] = 0; next++; } if (str[0] == 0) continue; if (type == NULL) { EPRINTLN("Missing or invalid type for port %d: \"%s\"", p, str); goto out; } if (pci_ahci_legacy_config_port(ports_nvl, p, type, str) != 0) goto out; } ret = 0; out: free(tofree); return (ret); } static int pci_ahci_cd_legacy_config(nvlist_t *nvl, const char *opts) { nvlist_t *ports_nvl; ports_nvl = create_relative_config_node(nvl, "port"); return (pci_ahci_legacy_config_port(ports_nvl, 0, "cd", opts)); } static int pci_ahci_hd_legacy_config(nvlist_t *nvl, const char *opts) { nvlist_t *ports_nvl; ports_nvl = create_relative_config_node(nvl, "port"); return (pci_ahci_legacy_config_port(ports_nvl, 0, "hd", opts)); } static int pci_ahci_init(struct vmctx *ctx __unused, struct pci_devinst *pi, nvlist_t *nvl) { - char bident[sizeof("XX:XX:XX")]; + char bident[sizeof("XXX:XXX:XXX")]; char node_name[sizeof("XX")]; struct blockif_ctxt *bctxt; struct pci_ahci_softc *sc; int atapi, ret, slots, p; MD5_CTX mdctx; u_char digest[16]; const char *path, *type, *value; nvlist_t *ports_nvl, *port_nvl; ret = 0; #ifdef AHCI_DEBUG dbg = fopen("/tmp/log", "w+"); #endif sc = calloc(1, sizeof(struct pci_ahci_softc)); pi->pi_arg = sc; sc->asc_pi = pi; pthread_mutex_init(&sc->mtx, NULL); sc->ports = 0; sc->pi = 0; slots = 32; ports_nvl = find_relative_config_node(nvl, "port"); for (p = 0; ports_nvl != NULL && p < MAX_PORTS; p++) { struct ata_params *ata_ident = &sc->port[p].ata_ident; char ident[AHCI_PORT_IDENT]; snprintf(node_name, sizeof(node_name), "%d", p); port_nvl = find_relative_config_node(ports_nvl, node_name); if (port_nvl == NULL) continue; type = get_config_value_node(port_nvl, "type"); if (type == NULL) continue; if (strcmp(type, "hd") == 0) atapi = 0; else atapi = 1; /* * Attempt to open the backing image. Use the PCI slot/func * and the port number for the identifier string. 
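* For example, a disk on PCI slot 4, function 0, AHCI port 2 is opened
* with the identifier "4:0:2".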
*/ - snprintf(bident, sizeof(bident), "%d:%d:%d", pi->pi_slot, + snprintf(bident, sizeof(bident), "%u:%u:%u", pi->pi_slot, pi->pi_func, p); bctxt = blockif_open(port_nvl, bident); if (bctxt == NULL) { sc->ports = p; ret = 1; goto open_fail; } sc->port[p].bctx = bctxt; sc->port[p].pr_sc = sc; sc->port[p].port = p; sc->port[p].atapi = atapi; /* * Create an identifier for the backing file. * Use parts of the md5 sum of the filename */ path = get_config_value_node(port_nvl, "path"); MD5Init(&mdctx); MD5Update(&mdctx, path, strlen(path)); MD5Final(digest, &mdctx); snprintf(ident, AHCI_PORT_IDENT, "BHYVE-%02X%02X-%02X%02X-%02X%02X", digest[0], digest[1], digest[2], digest[3], digest[4], digest[5]); memset(ata_ident, 0, sizeof(struct ata_params)); ata_string((uint8_t*)&ata_ident->serial, ident, 20); ata_string((uint8_t*)&ata_ident->revision, "001", 8); if (atapi) ata_string((uint8_t*)&ata_ident->model, "BHYVE SATA DVD ROM", 40); else ata_string((uint8_t*)&ata_ident->model, "BHYVE SATA DISK", 40); value = get_config_value_node(port_nvl, "nmrr"); if (value != NULL) ata_ident->media_rotation_rate = atoi(value); value = get_config_value_node(port_nvl, "ser"); if (value != NULL) ata_string((uint8_t*)(&ata_ident->serial), value, 20); value = get_config_value_node(port_nvl, "rev"); if (value != NULL) ata_string((uint8_t*)(&ata_ident->revision), value, 8); value = get_config_value_node(port_nvl, "model"); if (value != NULL) ata_string((uint8_t*)(&ata_ident->model), value, 40); ata_identify_init(&sc->port[p], atapi); /* * Allocate blockif request structures and add them * to the free list */ pci_ahci_ioreq_init(&sc->port[p]); sc->pi |= (1 << p); if (sc->port[p].ioqsz < slots) slots = sc->port[p].ioqsz; } sc->ports = p; /* Intel ICH8 AHCI */ --slots; if (sc->ports < DEF_PORTS) sc->ports = DEF_PORTS; sc->cap = AHCI_CAP_64BIT | AHCI_CAP_SNCQ | AHCI_CAP_SSNTF | AHCI_CAP_SMPS | AHCI_CAP_SSS | AHCI_CAP_SALP | AHCI_CAP_SAL | AHCI_CAP_SCLO | (0x3 << AHCI_CAP_ISS_SHIFT)| AHCI_CAP_PMD | AHCI_CAP_SSC | AHCI_CAP_PSC | (slots << AHCI_CAP_NCS_SHIFT) | AHCI_CAP_SXS | (sc->ports - 1); sc->vs = 0x10300; sc->cap2 = AHCI_CAP2_APST; ahci_reset(sc); pci_set_cfgdata16(pi, PCIR_DEVICE, 0x2821); pci_set_cfgdata16(pi, PCIR_VENDOR, 0x8086); pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE); pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_SATA); pci_set_cfgdata8(pi, PCIR_PROGIF, PCIP_STORAGE_SATA_AHCI_1_0); p = MIN(sc->ports, 16); p = flsl(p) - ((p & (p - 1)) ? 
0 : 1); pci_emul_add_msicap(pi, 1 << p); pci_emul_alloc_bar(pi, 5, PCIBAR_MEM32, AHCI_OFFSET + sc->ports * AHCI_STEP); pci_lintr_request(pi); open_fail: if (ret) { for (p = 0; p < sc->ports; p++) { if (sc->port[p].bctx != NULL) blockif_close(sc->port[p].bctx); } free(sc); } return (ret); } #ifdef BHYVE_SNAPSHOT static int pci_ahci_snapshot(struct vm_snapshot_meta *meta) { int i, ret; void *bctx; struct pci_devinst *pi; struct pci_ahci_softc *sc; struct ahci_port *port; pi = meta->dev_data; sc = pi->pi_arg; /* TODO: add mtx lock/unlock */ SNAPSHOT_VAR_OR_LEAVE(sc->ports, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->cap, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->ghc, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->is, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->pi, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->vs, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->ccc_ctl, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->ccc_pts, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->em_loc, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->em_ctl, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->cap2, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->bohc, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->lintr, meta, ret, done); for (i = 0; i < MAX_PORTS; i++) { port = &sc->port[i]; if (meta->op == VM_SNAPSHOT_SAVE) bctx = port->bctx; SNAPSHOT_VAR_OR_LEAVE(bctx, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->port, meta, ret, done); /* Mostly for restore; save is ensured by the lines above. */ if (((bctx == NULL) && (port->bctx != NULL)) || ((bctx != NULL) && (port->bctx == NULL))) { fprintf(stderr, "%s: ports not matching\r\n", __func__); ret = EINVAL; goto done; } if (port->bctx == NULL) continue; if (port->port != i) { fprintf(stderr, "%s: ports not matching: " "actual: %d expected: %d\r\n", __func__, port->port, i); ret = EINVAL; goto done; } SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(port->cmd_lst, AHCI_CL_SIZE * AHCI_MAX_SLOTS, false, meta, ret, done); SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(port->rfis, 256, false, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->ata_ident, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->atapi, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->reset, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->waitforclear, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->mult_sectors, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->xfermode, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->err_cfis, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->sense_key, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->asc, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->ccs, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->pending, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->clb, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->clbu, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->fb, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->fbu, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->ie, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->cmd, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->unused0, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->tfd, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->sig, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->ssts, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->sctl, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->serr, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->sact, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->ci, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->sntf, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->fbs, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->ioqsz, meta, ret, done); assert(TAILQ_EMPTY(&port->iobhd)); } done: return (ret); } 
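/*
 * Illustrative ordering only (hypothetical caller, not part of this
 * file): the snapshot code assumes I/O is quiesced around the register
 * save/restore, roughly:
 *
 *	pci_ahci_pause(ctx, pi);     -- drain blockif, busy lists go empty
 *	pci_ahci_snapshot(meta);     -- save/restore HBA and port state
 *	pci_ahci_resume(ctx, pi);    -- restart the block backends
 *
 * pci_ahci_snapshot() asserts that each port's busy list is empty, so
 * the pause step must have run first.
 */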
static int pci_ahci_pause(struct vmctx *ctx __unused, struct pci_devinst *pi) { struct pci_ahci_softc *sc; struct blockif_ctxt *bctxt; int i; sc = pi->pi_arg; for (i = 0; i < MAX_PORTS; i++) { bctxt = sc->port[i].bctx; if (bctxt == NULL) continue; blockif_pause(bctxt); } return (0); } static int pci_ahci_resume(struct vmctx *ctx __unused, struct pci_devinst *pi) { struct pci_ahci_softc *sc; struct blockif_ctxt *bctxt; int i; sc = pi->pi_arg; for (i = 0; i < MAX_PORTS; i++) { bctxt = sc->port[i].bctx; if (bctxt == NULL) continue; blockif_resume(bctxt); } return (0); } #endif /* BHYVE_SNAPSHOT */ /* * Use separate emulation names to distinguish drive and atapi devices */ static const struct pci_devemu pci_de_ahci = { .pe_emu = "ahci", .pe_init = pci_ahci_init, .pe_legacy_config = pci_ahci_legacy_config, .pe_barwrite = pci_ahci_write, .pe_barread = pci_ahci_read, #ifdef BHYVE_SNAPSHOT .pe_snapshot = pci_ahci_snapshot, .pe_pause = pci_ahci_pause, .pe_resume = pci_ahci_resume, #endif }; PCI_EMUL_SET(pci_de_ahci); static const struct pci_devemu pci_de_ahci_hd = { .pe_emu = "ahci-hd", .pe_legacy_config = pci_ahci_hd_legacy_config, .pe_alias = "ahci", }; PCI_EMUL_SET(pci_de_ahci_hd); static const struct pci_devemu pci_de_ahci_cd = { .pe_emu = "ahci-cd", .pe_legacy_config = pci_ahci_cd_legacy_config, .pe_alias = "ahci", }; PCI_EMUL_SET(pci_de_ahci_cd); diff --git a/usr.sbin/bhyve/pci_nvme.c b/usr.sbin/bhyve/pci_nvme.c index 571a6a9fda62..edbac73c7ed5 100644 --- a/usr.sbin/bhyve/pci_nvme.c +++ b/usr.sbin/bhyve/pci_nvme.c @@ -1,3395 +1,3395 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2017 Shunsuke Mie * Copyright (c) 2018 Leon Dang * Copyright (c) 2020 Chuck Tuffli * * Function crc16 Copyright (c) 2017, Fedor Uporov * Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * bhyve PCIe-NVMe device emulation. 
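 *
 * Example invocation (slot number and backing path are hypothetical):
 *	-s 4,nvme,/var/tmp/nvme.img,maxq=4,qsz=512,ioslots=16,ser=NVME0001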
* * options: * -s ,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#,dsm= * * accepted devpath: * /dev/blockdev * /path/to/image * ram=size_in_MiB * * maxq = max number of queues * qsz = max elements in each queue * ioslots = max number of concurrent io requests * sectsz = sector size (defaults to blockif sector size) * ser = serial number (20-chars max) * eui64 = IEEE Extended Unique Identifier (8 byte value) * dsm = DataSet Management support. Option is one of auto, enable,disable * */ /* TODO: - create async event for smart and log - intr coalesce */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "bhyverun.h" #include "block_if.h" #include "config.h" #include "debug.h" #include "pci_emul.h" static int nvme_debug = 0; #define DPRINTF(fmt, args...) if (nvme_debug) PRINTLN(fmt, ##args) #define WPRINTF(fmt, args...) PRINTLN(fmt, ##args) /* defaults; can be overridden */ #define NVME_MSIX_BAR 4 #define NVME_IOSLOTS 8 /* The NVMe spec defines bits 13:4 in BAR0 as reserved */ #define NVME_MMIO_SPACE_MIN (1 << 14) #define NVME_QUEUES 16 #define NVME_MAX_QENTRIES 2048 /* Memory Page size Minimum reported in CAP register */ #define NVME_MPSMIN 0 /* MPSMIN converted to bytes */ #define NVME_MPSMIN_BYTES (1 << (12 + NVME_MPSMIN)) #define NVME_PRP2_ITEMS (PAGE_SIZE/sizeof(uint64_t)) #define NVME_MDTS 9 /* Note the + 1 allows for the initial descriptor to not be page aligned */ #define NVME_MAX_IOVEC ((1 << NVME_MDTS) + 1) #define NVME_MAX_DATA_SIZE ((1 << NVME_MDTS) * NVME_MPSMIN_BYTES) /* This is a synthetic status code to indicate there is no status */ #define NVME_NO_STATUS 0xffff #define NVME_COMPLETION_VALID(c) ((c).status != NVME_NO_STATUS) /* Reported temperature in Kelvin (i.e. 
room temperature) */ #define NVME_TEMPERATURE 296 /* helpers */ /* Convert a zero-based value into a one-based value */ #define ONE_BASED(zero) ((zero) + 1) /* Convert a one-based value into a zero-based value */ #define ZERO_BASED(one) ((one) - 1) /* Encode number of SQ's and CQ's for Set/Get Features */ #define NVME_FEATURE_NUM_QUEUES(sc) \ (ZERO_BASED((sc)->num_squeues) & 0xffff) | \ (ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16; #define NVME_DOORBELL_OFFSET offsetof(struct nvme_registers, doorbell) enum nvme_controller_register_offsets { NVME_CR_CAP_LOW = 0x00, NVME_CR_CAP_HI = 0x04, NVME_CR_VS = 0x08, NVME_CR_INTMS = 0x0c, NVME_CR_INTMC = 0x10, NVME_CR_CC = 0x14, NVME_CR_CSTS = 0x1c, NVME_CR_NSSR = 0x20, NVME_CR_AQA = 0x24, NVME_CR_ASQ_LOW = 0x28, NVME_CR_ASQ_HI = 0x2c, NVME_CR_ACQ_LOW = 0x30, NVME_CR_ACQ_HI = 0x34, }; enum nvme_cmd_cdw11 { NVME_CMD_CDW11_PC = 0x0001, NVME_CMD_CDW11_IEN = 0x0002, NVME_CMD_CDW11_IV = 0xFFFF0000, }; enum nvme_copy_dir { NVME_COPY_TO_PRP, NVME_COPY_FROM_PRP, }; #define NVME_CQ_INTEN 0x01 #define NVME_CQ_INTCOAL 0x02 struct nvme_completion_queue { struct nvme_completion *qbase; pthread_mutex_t mtx; uint32_t size; uint16_t tail; /* nvme progress */ uint16_t head; /* guest progress */ uint16_t intr_vec; uint32_t intr_en; }; struct nvme_submission_queue { struct nvme_command *qbase; pthread_mutex_t mtx; uint32_t size; uint16_t head; /* nvme progress */ uint16_t tail; /* guest progress */ uint16_t cqid; /* completion queue id */ int qpriority; }; enum nvme_storage_type { NVME_STOR_BLOCKIF = 0, NVME_STOR_RAM = 1, }; struct pci_nvme_blockstore { enum nvme_storage_type type; void *ctx; uint64_t size; uint32_t sectsz; uint32_t sectsz_bits; uint64_t eui64; uint32_t deallocate:1; }; /* * Calculate the number of additional page descriptors for guest IO requests * based on the advertised Max Data Transfer (MDTS) and given the number of * default iovec's in a struct blockif_req. */ #define MDTS_PAD_SIZE \ ( NVME_MAX_IOVEC > BLOCKIF_IOV_MAX ? 
\ NVME_MAX_IOVEC - BLOCKIF_IOV_MAX : \ 0 ) struct pci_nvme_ioreq { struct pci_nvme_softc *sc; STAILQ_ENTRY(pci_nvme_ioreq) link; struct nvme_submission_queue *nvme_sq; uint16_t sqid; /* command information */ uint16_t opc; uint16_t cid; uint32_t nsid; uint64_t prev_gpaddr; size_t prev_size; size_t bytes; struct blockif_req io_req; struct iovec iovpadding[MDTS_PAD_SIZE]; }; enum nvme_dsm_type { /* Dataset Management bit in ONCS reflects backing storage capability */ NVME_DATASET_MANAGEMENT_AUTO, /* Unconditionally set Dataset Management bit in ONCS */ NVME_DATASET_MANAGEMENT_ENABLE, /* Unconditionally clear Dataset Management bit in ONCS */ NVME_DATASET_MANAGEMENT_DISABLE, }; struct pci_nvme_softc; struct nvme_feature_obj; typedef void (*nvme_feature_cb)(struct pci_nvme_softc *, struct nvme_feature_obj *, struct nvme_command *, struct nvme_completion *); struct nvme_feature_obj { uint32_t cdw11; nvme_feature_cb set; nvme_feature_cb get; bool namespace_specific; }; #define NVME_FID_MAX (NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION + 1) typedef enum { PCI_NVME_AE_TYPE_ERROR = 0, PCI_NVME_AE_TYPE_SMART, PCI_NVME_AE_TYPE_NOTICE, PCI_NVME_AE_TYPE_IO_CMD = 6, PCI_NVME_AE_TYPE_VENDOR = 7, PCI_NVME_AE_TYPE_MAX /* Must be last */ } pci_nvme_async_type; /* Asynchronous Event Requests */ struct pci_nvme_aer { STAILQ_ENTRY(pci_nvme_aer) link; uint16_t cid; /* Command ID of the submitted AER */ }; /** Asynchronous Event Information - Notice */ typedef enum { PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED = 0, PCI_NVME_AEI_NOTICE_FW_ACTIVATION, PCI_NVME_AEI_NOTICE_TELEMETRY_CHANGE, PCI_NVME_AEI_NOTICE_ANA_CHANGE, PCI_NVME_AEI_NOTICE_PREDICT_LATENCY_CHANGE, PCI_NVME_AEI_NOTICE_LBA_STATUS_ALERT, PCI_NVME_AEI_NOTICE_ENDURANCE_GROUP_CHANGE, PCI_NVME_AEI_NOTICE_MAX, } pci_nvme_async_event_info_notice; #define PCI_NVME_AEI_NOTICE_SHIFT 8 #define PCI_NVME_AEI_NOTICE_MASK(event) (1 << (event + PCI_NVME_AEI_NOTICE_SHIFT)) /* Asynchronous Event Notifications */ struct pci_nvme_aen { pci_nvme_async_type atype; uint32_t event_data; bool posted; }; /* * By default, enable all Asynchronous Event Notifications: * SMART / Health Critical Warnings * Namespace Attribute Notices */ #define PCI_NVME_AEN_DEFAULT_MASK 0x11f typedef enum { NVME_CNTRLTYPE_IO = 1, NVME_CNTRLTYPE_DISCOVERY = 2, NVME_CNTRLTYPE_ADMIN = 3, } pci_nvme_cntrl_type; struct pci_nvme_softc { struct pci_devinst *nsc_pi; pthread_mutex_t mtx; struct nvme_registers regs; struct nvme_namespace_data nsdata; struct nvme_controller_data ctrldata; struct nvme_error_information_entry err_log; struct nvme_health_information_page health_log; struct nvme_firmware_page fw_log; struct nvme_ns_list ns_log; struct pci_nvme_blockstore nvstore; uint16_t max_qentries; /* max entries per queue */ uint32_t max_queues; /* max number of IO SQ's or CQ's */ uint32_t num_cqueues; uint32_t num_squeues; bool num_q_is_set; /* Has host set Number of Queues */ struct pci_nvme_ioreq *ioreqs; STAILQ_HEAD(, pci_nvme_ioreq) ioreqs_free; /* free list of ioreqs */ uint32_t pending_ios; uint32_t ioslots; sem_t iosemlock; /* * Memory mapped Submission and Completion queues * Each array includes both Admin and IO queues */ struct nvme_completion_queue *compl_queues; struct nvme_submission_queue *submit_queues; struct nvme_feature_obj feat[NVME_FID_MAX]; enum nvme_dsm_type dataset_management; /* Accounting for SMART data */ __uint128_t read_data_units; __uint128_t write_data_units; __uint128_t read_commands; __uint128_t write_commands; uint32_t read_dunits_remainder; uint32_t write_dunits_remainder;
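	/*
	 * The counters above track SMART "data units" of 1,000 512-byte
	 * blocks; the *_dunits_remainder fields hold the partial unit
	 * carried between calls to pci_nvme_stats_write_read_update().
	 */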
STAILQ_HEAD(, pci_nvme_aer) aer_list; pthread_mutex_t aer_mtx; uint32_t aer_count; struct pci_nvme_aen aen[PCI_NVME_AE_TYPE_MAX]; pthread_t aen_tid; pthread_mutex_t aen_mtx; pthread_cond_t aen_cond; }; static void pci_nvme_cq_update(struct pci_nvme_softc *sc, struct nvme_completion_queue *cq, uint32_t cdw0, uint16_t cid, uint16_t sqid, uint16_t status); static struct pci_nvme_ioreq *pci_nvme_get_ioreq(struct pci_nvme_softc *); static void pci_nvme_release_ioreq(struct pci_nvme_softc *, struct pci_nvme_ioreq *); static void pci_nvme_io_done(struct blockif_req *, int); /* Controller Configuration utils */ #define NVME_CC_GET_EN(cc) \ ((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK) #define NVME_CC_GET_CSS(cc) \ ((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK) #define NVME_CC_GET_SHN(cc) \ ((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK) #define NVME_CC_GET_IOSQES(cc) \ ((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK) #define NVME_CC_GET_IOCQES(cc) \ ((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK) #define NVME_CC_WRITE_MASK \ ((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \ (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \ (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT)) #define NVME_CC_NEN_WRITE_MASK \ ((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \ (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \ (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT)) /* Controller Status utils */ #define NVME_CSTS_GET_RDY(sts) \ ((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK) #define NVME_CSTS_RDY (1 << NVME_CSTS_REG_RDY_SHIFT) #define NVME_CSTS_CFS (1 << NVME_CSTS_REG_CFS_SHIFT) /* Completion Queue status word utils */ #define NVME_STATUS_P (1 << NVME_STATUS_P_SHIFT) #define NVME_STATUS_MASK \ ((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\ (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT)) #define NVME_ONCS_DSM (NVME_CTRLR_DATA_ONCS_DSM_MASK << \ NVME_CTRLR_DATA_ONCS_DSM_SHIFT) static void nvme_feature_invalid_cb(struct pci_nvme_softc *, struct nvme_feature_obj *, struct nvme_command *, struct nvme_completion *); static void nvme_feature_temperature(struct pci_nvme_softc *, struct nvme_feature_obj *, struct nvme_command *, struct nvme_completion *); static void nvme_feature_num_queues(struct pci_nvme_softc *, struct nvme_feature_obj *, struct nvme_command *, struct nvme_completion *); static void nvme_feature_iv_config(struct pci_nvme_softc *, struct nvme_feature_obj *, struct nvme_command *, struct nvme_completion *); static void nvme_feature_async_event(struct pci_nvme_softc *, struct nvme_feature_obj *, struct nvme_command *, struct nvme_completion *); static void *aen_thr(void *arg); static __inline void cpywithpad(char *dst, size_t dst_size, const char *src, char pad) { size_t len; len = strnlen(src, dst_size); memset(dst, pad, dst_size); memcpy(dst, src, len); } static __inline void pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code) { *status &= ~NVME_STATUS_MASK; *status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT | (code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT; } static __inline void pci_nvme_status_genc(uint16_t *status, uint16_t code) { pci_nvme_status_tc(status, NVME_SCT_GENERIC, code); } /* * Initialize the requested number of IO Submission and Completion Queues. * Admin queues are allocated implicitly.
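 * For example, a hypothetical call pci_nvme_init_queues(sc, 16, 16)
 * yields 16 IO queues of each kind plus the Admin pair, which is why the
 * calloc() calls below allocate num_squeues + 1 and num_cqueues + 1
 * elements.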
*/ static void pci_nvme_init_queues(struct pci_nvme_softc *sc, uint32_t nsq, uint32_t ncq) { uint32_t i; /* * Allocate and initialize the Submission Queues */ if (nsq > NVME_QUEUES) { WPRINTF("%s: clamping number of SQ from %u to %u", __func__, nsq, NVME_QUEUES); nsq = NVME_QUEUES; } sc->num_squeues = nsq; sc->submit_queues = calloc(sc->num_squeues + 1, sizeof(struct nvme_submission_queue)); if (sc->submit_queues == NULL) { WPRINTF("%s: SQ allocation failed", __func__); sc->num_squeues = 0; } else { struct nvme_submission_queue *sq = sc->submit_queues; for (i = 0; i < sc->num_squeues + 1; i++) pthread_mutex_init(&sq[i].mtx, NULL); } /* * Allocate and initialize the Completion Queues */ if (ncq > NVME_QUEUES) { WPRINTF("%s: clamping number of CQ from %u to %u", __func__, ncq, NVME_QUEUES); ncq = NVME_QUEUES; } sc->num_cqueues = ncq; sc->compl_queues = calloc(sc->num_cqueues + 1, sizeof(struct nvme_completion_queue)); if (sc->compl_queues == NULL) { WPRINTF("%s: CQ allocation failed", __func__); sc->num_cqueues = 0; } else { struct nvme_completion_queue *cq = sc->compl_queues; for (i = 0; i < sc->num_cqueues + 1; i++) pthread_mutex_init(&cq[i].mtx, NULL); } } static void pci_nvme_init_ctrldata(struct pci_nvme_softc *sc) { struct nvme_controller_data *cd = &sc->ctrldata; cd->vid = 0xFB5D; cd->ssvid = 0x0000; cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' '); cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' '); /* Num of submission commands that we can handle at a time (2^rab) */ cd->rab = 4; /* FreeBSD OUI */ cd->ieee[0] = 0x58; cd->ieee[1] = 0x9c; cd->ieee[2] = 0xfc; cd->mic = 0; cd->mdts = NVME_MDTS; /* max data transfer size (2^mdts * CAP.MPSMIN) */ cd->ver = NVME_REV(1,4); cd->cntrltype = NVME_CNTRLTYPE_IO; cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT; cd->oaes = NVMEB(NVME_CTRLR_DATA_OAES_NS_ATTR); cd->acl = 2; cd->aerl = 4; /* Advertise 1, Read-only firmware slot */ cd->frmw = NVMEB(NVME_CTRLR_DATA_FRMW_SLOT1_RO) | (1 << NVME_CTRLR_DATA_FRMW_NUM_SLOTS_SHIFT); cd->lpa = 0; /* TODO: support some simple things like SMART */ cd->elpe = 0; /* max error log page entries */ /* * Report a single power state (zero-based value) * power_state[] values are left as zero to indicate "Not reported" */ cd->npss = 0; /* Warning Composite Temperature Threshold */ cd->wctemp = 0x0157; cd->cctemp = 0x0157; /* SANICAP must not be 0 for Revision 1.4 and later NVMe Controllers */ cd->sanicap = (NVME_CTRLR_DATA_SANICAP_NODMMAS_NO << NVME_CTRLR_DATA_SANICAP_NODMMAS_SHIFT); cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) | (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT); cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) | (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT); cd->nn = 1; /* number of namespaces */ cd->oncs = 0; switch (sc->dataset_management) { case NVME_DATASET_MANAGEMENT_AUTO: if (sc->nvstore.deallocate) cd->oncs |= NVME_ONCS_DSM; break; case NVME_DATASET_MANAGEMENT_ENABLE: cd->oncs |= NVME_ONCS_DSM; break; default: break; } cd->fna = NVME_CTRLR_DATA_FNA_FORMAT_ALL_MASK << NVME_CTRLR_DATA_FNA_FORMAT_ALL_SHIFT; cd->vwc = NVME_CTRLR_DATA_VWC_ALL_NO << NVME_CTRLR_DATA_VWC_ALL_SHIFT; } /* * Calculate the CRC-16 of the given buffer * See copyright attribution at top of file */ static uint16_t crc16(uint16_t crc, const void *buffer, unsigned int len) { const unsigned char *cp = buffer; /* CRC table for the CRC-16. The poly is 0x8005 (x16 + x15 + x2 + 1). 
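 * This is the standard reflected table for that polynomial (CRC-16/ARC);
 * as a sanity check, crc16(0, "123456789", 9) should evaluate to 0xBB3D.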
*/ static uint16_t const crc16_table[256] = { 0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241, 0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440, 0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40, 0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841, 0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40, 0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41, 0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641, 0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040, 0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240, 0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441, 0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41, 0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840, 0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41, 0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40, 0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640, 0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041, 0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240, 0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441, 0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41, 0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840, 0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41, 0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40, 0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640, 0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041, 0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241, 0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440, 0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40, 0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841, 0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40, 0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41, 0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641, 0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040 }; while (len--) crc = (((crc >> 8) & 0xffU) ^ crc16_table[(crc ^ *cp++) & 0xffU]) & 0x0000ffffU; return crc; } static void pci_nvme_init_nsdata_size(struct pci_nvme_blockstore *nvstore, struct nvme_namespace_data *nd) { /* Get capacity and block size information from backing store */ nd->nsze = nvstore->size / nvstore->sectsz; nd->ncap = nd->nsze; nd->nuse = nd->nsze; } static void pci_nvme_init_nsdata(struct pci_nvme_softc *sc, struct nvme_namespace_data *nd, uint32_t nsid, struct pci_nvme_blockstore *nvstore) { pci_nvme_init_nsdata_size(nvstore, nd); if (nvstore->type == NVME_STOR_BLOCKIF) nvstore->deallocate = blockif_candelete(nvstore->ctx); nd->nlbaf = 0; /* NLBAF is a 0's based value (i.e. 
1 LBA Format) */ nd->flbas = 0; /* Create an EUI-64 if user did not provide one */ if (nvstore->eui64 == 0) { char *data = NULL; uint64_t eui64 = nvstore->eui64; asprintf(&data, "%s%u%u%u", get_config_value("name"), sc->nsc_pi->pi_bus, sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); if (data != NULL) { eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data)); free(data); } nvstore->eui64 = (eui64 << 16) | (nsid & 0xffff); } be64enc(nd->eui64, nvstore->eui64); /* LBA data-sz = 2^lbads */ nd->lbaf[0] = nvstore->sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT; } static void pci_nvme_init_logpages(struct pci_nvme_softc *sc) { __uint128_t power_cycles = 1; memset(&sc->err_log, 0, sizeof(sc->err_log)); memset(&sc->health_log, 0, sizeof(sc->health_log)); memset(&sc->fw_log, 0, sizeof(sc->fw_log)); memset(&sc->ns_log, 0, sizeof(sc->ns_log)); /* Set read/write remainder to round up according to spec */ sc->read_dunits_remainder = 999; sc->write_dunits_remainder = 999; /* Set nominal Health values checked by implementations */ sc->health_log.temperature = NVME_TEMPERATURE; sc->health_log.available_spare = 100; sc->health_log.available_spare_threshold = 10; /* Set Active Firmware Info to slot 1 */ sc->fw_log.afi = (1 << NVME_FIRMWARE_PAGE_AFI_SLOT_SHIFT); memcpy(&sc->fw_log.revision[0], sc->ctrldata.fr, sizeof(sc->fw_log.revision[0])); memcpy(&sc->health_log.power_cycles, &power_cycles, sizeof(sc->health_log.power_cycles)); } static void pci_nvme_init_features(struct pci_nvme_softc *sc) { enum nvme_feature fid; for (fid = 0; fid < NVME_FID_MAX; fid++) { switch (fid) { case NVME_FEAT_ARBITRATION: case NVME_FEAT_POWER_MANAGEMENT: case NVME_FEAT_INTERRUPT_COALESCING: //XXX case NVME_FEAT_WRITE_ATOMICITY: /* Mandatory but no special handling required */ //XXX hang - case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG: //XXX hang - case NVME_FEAT_HOST_BEHAVIOR_SUPPORT: // this returns a data buffer break; case NVME_FEAT_TEMPERATURE_THRESHOLD: sc->feat[fid].set = nvme_feature_temperature; break; case NVME_FEAT_ERROR_RECOVERY: sc->feat[fid].namespace_specific = true; break; case NVME_FEAT_NUMBER_OF_QUEUES: sc->feat[fid].set = nvme_feature_num_queues; break; case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION: sc->feat[fid].set = nvme_feature_iv_config; break; case NVME_FEAT_ASYNC_EVENT_CONFIGURATION: sc->feat[fid].set = nvme_feature_async_event; /* Enable all AENs by default */ sc->feat[fid].cdw11 = PCI_NVME_AEN_DEFAULT_MASK; break; default: sc->feat[fid].set = nvme_feature_invalid_cb; sc->feat[fid].get = nvme_feature_invalid_cb; } } } static void pci_nvme_aer_reset(struct pci_nvme_softc *sc) { STAILQ_INIT(&sc->aer_list); sc->aer_count = 0; } static void pci_nvme_aer_init(struct pci_nvme_softc *sc) { pthread_mutex_init(&sc->aer_mtx, NULL); pci_nvme_aer_reset(sc); } static void pci_nvme_aer_destroy(struct pci_nvme_softc *sc) { struct pci_nvme_aer *aer = NULL; pthread_mutex_lock(&sc->aer_mtx); while (!STAILQ_EMPTY(&sc->aer_list)) { aer = STAILQ_FIRST(&sc->aer_list); STAILQ_REMOVE_HEAD(&sc->aer_list, link); free(aer); } pthread_mutex_unlock(&sc->aer_mtx); pci_nvme_aer_reset(sc); } static bool pci_nvme_aer_available(struct pci_nvme_softc *sc) { return (sc->aer_count != 0); } static bool pci_nvme_aer_limit_reached(struct pci_nvme_softc *sc) { struct nvme_controller_data *cd = &sc->ctrldata; /* AERL is a zero based value while aer_count is one's based */ return (sc->aer_count == (cd->aerl + 1)); } /* * Add an Async Event Request * * Stores an AER to be returned later if the Controller needs to notify the * host of an event. 
* Note that while the NVMe spec doesn't require Controllers to return AER's * in order, this implementation does preserve the order. */ static int pci_nvme_aer_add(struct pci_nvme_softc *sc, uint16_t cid) { struct pci_nvme_aer *aer = NULL; aer = calloc(1, sizeof(struct pci_nvme_aer)); if (aer == NULL) return (-1); /* Save the Command ID for use in the completion message */ aer->cid = cid; pthread_mutex_lock(&sc->aer_mtx); sc->aer_count++; STAILQ_INSERT_TAIL(&sc->aer_list, aer, link); pthread_mutex_unlock(&sc->aer_mtx); return (0); } /* * Get an Async Event Request structure * * Returns a pointer to an AER previously submitted by the host or NULL if * no AER's exist. Caller is responsible for freeing the returned struct. */ static struct pci_nvme_aer * pci_nvme_aer_get(struct pci_nvme_softc *sc) { struct pci_nvme_aer *aer = NULL; pthread_mutex_lock(&sc->aer_mtx); aer = STAILQ_FIRST(&sc->aer_list); if (aer != NULL) { STAILQ_REMOVE_HEAD(&sc->aer_list, link); sc->aer_count--; } pthread_mutex_unlock(&sc->aer_mtx); return (aer); } static void pci_nvme_aen_reset(struct pci_nvme_softc *sc) { uint32_t atype; memset(sc->aen, 0, PCI_NVME_AE_TYPE_MAX * sizeof(struct pci_nvme_aen)); for (atype = 0; atype < PCI_NVME_AE_TYPE_MAX; atype++) { sc->aen[atype].atype = atype; } } static void pci_nvme_aen_init(struct pci_nvme_softc *sc) { char nstr[80]; pci_nvme_aen_reset(sc); pthread_mutex_init(&sc->aen_mtx, NULL); pthread_create(&sc->aen_tid, NULL, aen_thr, sc); snprintf(nstr, sizeof(nstr), "nvme-aen-%d:%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); pthread_set_name_np(sc->aen_tid, nstr); } static void pci_nvme_aen_destroy(struct pci_nvme_softc *sc) { pci_nvme_aen_reset(sc); } /* Notify the AEN thread of pending work */ static void pci_nvme_aen_notify(struct pci_nvme_softc *sc) { pthread_cond_signal(&sc->aen_cond); } /* * Post an Asynchronous Event Notification */ static int32_t pci_nvme_aen_post(struct pci_nvme_softc *sc, pci_nvme_async_type atype, uint32_t event_data) { struct pci_nvme_aen *aen; if (atype >= PCI_NVME_AE_TYPE_MAX) { return(EINVAL); } pthread_mutex_lock(&sc->aen_mtx); aen = &sc->aen[atype]; /* Has the controller already posted an event of this type? */ if (aen->posted) { pthread_mutex_unlock(&sc->aen_mtx); return(EALREADY); } aen->event_data = event_data; aen->posted = true; pthread_mutex_unlock(&sc->aen_mtx); pci_nvme_aen_notify(sc); return(0); } static void pci_nvme_aen_process(struct pci_nvme_softc *sc) { struct pci_nvme_aer *aer; struct pci_nvme_aen *aen; pci_nvme_async_type atype; uint32_t mask; uint16_t status; uint8_t lid; assert(pthread_mutex_isowned_np(&sc->aen_mtx)); for (atype = 0; atype < PCI_NVME_AE_TYPE_MAX; atype++) { aen = &sc->aen[atype]; /* Previous iterations may have depleted the available AER's */ if (!pci_nvme_aer_available(sc)) { DPRINTF("%s: no AER", __func__); break; } if (!aen->posted) { DPRINTF("%s: no AEN posted for atype=%#x", __func__, atype); continue; } status = NVME_SC_SUCCESS; /* Is the event masked? 
*/ mask = sc->feat[NVME_FEAT_ASYNC_EVENT_CONFIGURATION].cdw11; DPRINTF("%s: atype=%#x mask=%#x event_data=%#x", __func__, atype, mask, aen->event_data); switch (atype) { case PCI_NVME_AE_TYPE_ERROR: lid = NVME_LOG_ERROR; break; case PCI_NVME_AE_TYPE_SMART: mask &= 0xff; if ((mask & aen->event_data) == 0) continue; lid = NVME_LOG_HEALTH_INFORMATION; break; case PCI_NVME_AE_TYPE_NOTICE: if (aen->event_data >= PCI_NVME_AEI_NOTICE_MAX) { EPRINTLN("%s unknown AEN notice type %u", __func__, aen->event_data); status = NVME_SC_INTERNAL_DEVICE_ERROR; lid = 0; break; } if ((PCI_NVME_AEI_NOTICE_MASK(aen->event_data) & mask) == 0) continue; switch (aen->event_data) { case PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED: lid = NVME_LOG_CHANGED_NAMESPACE; break; case PCI_NVME_AEI_NOTICE_FW_ACTIVATION: lid = NVME_LOG_FIRMWARE_SLOT; break; case PCI_NVME_AEI_NOTICE_TELEMETRY_CHANGE: lid = NVME_LOG_TELEMETRY_CONTROLLER_INITIATED; break; case PCI_NVME_AEI_NOTICE_ANA_CHANGE: lid = NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS; break; case PCI_NVME_AEI_NOTICE_PREDICT_LATENCY_CHANGE: lid = NVME_LOG_PREDICTABLE_LATENCY_EVENT_AGGREGATE; break; case PCI_NVME_AEI_NOTICE_LBA_STATUS_ALERT: lid = NVME_LOG_LBA_STATUS_INFORMATION; break; case PCI_NVME_AEI_NOTICE_ENDURANCE_GROUP_CHANGE: lid = NVME_LOG_ENDURANCE_GROUP_EVENT_AGGREGATE; break; default: lid = 0; } break; default: /* bad type?!? */ EPRINTLN("%s unknown AEN type %u", __func__, atype); status = NVME_SC_INTERNAL_DEVICE_ERROR; lid = 0; break; } aer = pci_nvme_aer_get(sc); assert(aer != NULL); DPRINTF("%s: CID=%#x CDW0=%#x", __func__, aer->cid, (lid << 16) | (aen->event_data << 8) | atype); pci_nvme_cq_update(sc, &sc->compl_queues[0], (lid << 16) | (aen->event_data << 8) | atype, /* cdw0 */ aer->cid, 0, /* SQID */ status); aen->event_data = 0; aen->posted = false; pci_generate_msix(sc->nsc_pi, 0); } } static void * aen_thr(void *arg) { struct pci_nvme_softc *sc; sc = arg; pthread_mutex_lock(&sc->aen_mtx); for (;;) { pci_nvme_aen_process(sc); pthread_cond_wait(&sc->aen_cond, &sc->aen_mtx); } pthread_mutex_unlock(&sc->aen_mtx); pthread_exit(NULL); return (NULL); } static void pci_nvme_reset_locked(struct pci_nvme_softc *sc) { uint32_t i; DPRINTF("%s", __func__); sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) | (1 << NVME_CAP_LO_REG_CQR_SHIFT) | (60 << NVME_CAP_LO_REG_TO_SHIFT); sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT; sc->regs.vs = NVME_REV(1,4); /* NVMe v1.4 */ sc->regs.cc = 0; assert(sc->submit_queues != NULL); for (i = 0; i < sc->num_squeues + 1; i++) { sc->submit_queues[i].qbase = NULL; sc->submit_queues[i].size = 0; sc->submit_queues[i].cqid = 0; sc->submit_queues[i].tail = 0; sc->submit_queues[i].head = 0; } assert(sc->compl_queues != NULL); for (i = 0; i < sc->num_cqueues + 1; i++) { sc->compl_queues[i].qbase = NULL; sc->compl_queues[i].size = 0; sc->compl_queues[i].tail = 0; sc->compl_queues[i].head = 0; } sc->num_q_is_set = false; pci_nvme_aer_destroy(sc); pci_nvme_aen_destroy(sc); /* * Clear CSTS.RDY last to prevent the host from enabling Controller * before cleanup completes */ sc->regs.csts = 0; } static void pci_nvme_reset(struct pci_nvme_softc *sc) { pthread_mutex_lock(&sc->mtx); pci_nvme_reset_locked(sc); pthread_mutex_unlock(&sc->mtx); } static int pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc) { uint16_t acqs, asqs; DPRINTF("%s", __func__); /* * NVMe 2.0 states that "enabling a controller while this field is * cleared to 0h produces undefined results" for both ACQS and * ASQS. 
If zero, set CFS and do not become ready. */ asqs = ONE_BASED(sc->regs.aqa & NVME_AQA_REG_ASQS_MASK); if (asqs < 2) { EPRINTLN("%s: illegal ASQS value %#x (aqa=%#x)", __func__, asqs - 1, sc->regs.aqa); sc->regs.csts |= NVME_CSTS_CFS; return (-1); } sc->submit_queues[0].size = asqs; sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq, sizeof(struct nvme_command) * asqs); if (sc->submit_queues[0].qbase == NULL) { EPRINTLN("%s: ASQ vm_map_gpa(%lx) failed", __func__, sc->regs.asq); sc->regs.csts |= NVME_CSTS_CFS; return (-1); } DPRINTF("%s mapping Admin-SQ guest 0x%lx, host: %p", __func__, sc->regs.asq, sc->submit_queues[0].qbase); acqs = ONE_BASED((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) & NVME_AQA_REG_ACQS_MASK); if (acqs < 2) { EPRINTLN("%s: illegal ACQS value %#x (aqa=%#x)", __func__, acqs - 1, sc->regs.aqa); sc->regs.csts |= NVME_CSTS_CFS; return (-1); } sc->compl_queues[0].size = acqs; sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq, sizeof(struct nvme_completion) * acqs); if (sc->compl_queues[0].qbase == NULL) { EPRINTLN("%s: ACQ vm_map_gpa(%lx) failed", __func__, sc->regs.acq); sc->regs.csts |= NVME_CSTS_CFS; return (-1); } sc->compl_queues[0].intr_en = NVME_CQ_INTEN; DPRINTF("%s mapping Admin-CQ guest 0x%lx, host: %p", __func__, sc->regs.acq, sc->compl_queues[0].qbase); return (0); } static int nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *b, size_t len, enum nvme_copy_dir dir) { uint8_t *p; size_t bytes; if (len > (8 * 1024)) { return (-1); } /* Copy from the start of prp1 to the end of the physical page */ bytes = PAGE_SIZE - (prp1 & PAGE_MASK); bytes = MIN(bytes, len); p = vm_map_gpa(ctx, prp1, bytes); if (p == NULL) { return (-1); } if (dir == NVME_COPY_TO_PRP) memcpy(p, b, bytes); else memcpy(b, p, bytes); b += bytes; len -= bytes; if (len == 0) { return (0); } len = MIN(len, PAGE_SIZE); p = vm_map_gpa(ctx, prp2, len); if (p == NULL) { return (-1); } if (dir == NVME_COPY_TO_PRP) memcpy(p, b, len); else memcpy(b, p, len); return (0); } /* * Write a Completion Queue Entry update * * Write the completion and update the doorbell value */ static void pci_nvme_cq_update(struct pci_nvme_softc *sc, struct nvme_completion_queue *cq, uint32_t cdw0, uint16_t cid, uint16_t sqid, uint16_t status) { struct nvme_submission_queue *sq = &sc->submit_queues[sqid]; struct nvme_completion *cqe; assert(cq->qbase != NULL); pthread_mutex_lock(&cq->mtx); cqe = &cq->qbase[cq->tail]; /* Flip the phase bit */ status |= (cqe->status ^ NVME_STATUS_P) & NVME_STATUS_P_MASK; cqe->cdw0 = cdw0; cqe->sqhd = sq->head; cqe->sqid = sqid; cqe->cid = cid; cqe->status = status; cq->tail++; if (cq->tail >= cq->size) { cq->tail = 0; } pthread_mutex_unlock(&cq->mtx); } static int nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command, struct nvme_completion* compl) { uint16_t qid = command->cdw10 & 0xffff; DPRINTF("%s DELETE_IO_SQ %u", __func__, qid); if (qid == 0 || qid > sc->num_squeues || (sc->submit_queues[qid].qbase == NULL)) { WPRINTF("%s NOT PERMITTED queue id %u / num_squeues %u", __func__, qid, sc->num_squeues); pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, NVME_SC_INVALID_QUEUE_IDENTIFIER); return (1); } sc->submit_queues[qid].qbase = NULL; sc->submit_queues[qid].cqid = 0; pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); return (1); } static int nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command, struct nvme_completion* compl) { if (command->cdw11 & NVME_CMD_CDW11_PC) { uint16_t qid = command->cdw10 
& 0xffff; struct nvme_submission_queue *nsq; if ((qid == 0) || (qid > sc->num_squeues) || (sc->submit_queues[qid].qbase != NULL)) { WPRINTF("%s queue index %u > num_squeues %u", __func__, qid, sc->num_squeues); pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, NVME_SC_INVALID_QUEUE_IDENTIFIER); return (1); } nsq = &sc->submit_queues[qid]; nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff); DPRINTF("%s size=%u (max=%u)", __func__, nsq->size, sc->max_qentries); if ((nsq->size < 2) || (nsq->size > sc->max_qentries)) { /* * Queues must specify at least two entries * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec */ pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED); return (1); } nsq->head = nsq->tail = 0; nsq->cqid = (command->cdw11 >> 16) & 0xffff; if ((nsq->cqid == 0) || (nsq->cqid > sc->num_cqueues)) { pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, NVME_SC_INVALID_QUEUE_IDENTIFIER); return (1); } if (sc->compl_queues[nsq->cqid].qbase == NULL) { pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, NVME_SC_COMPLETION_QUEUE_INVALID); return (1); } nsq->qpriority = (command->cdw11 >> 1) & 0x03; nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, sizeof(struct nvme_command) * (size_t)nsq->size); DPRINTF("%s sq %u size %u gaddr %p cqid %u", __func__, qid, nsq->size, nsq->qbase, nsq->cqid); pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); DPRINTF("%s completed creating IOSQ qid %u", __func__, qid); } else { /* * Guest sent non-cont submission queue request. * This setting is unsupported by this emulation. */ WPRINTF("%s unsupported non-contig (list-based) " "create i/o submission queue", __func__); pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); } return (1); } static int nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command, struct nvme_completion* compl) { uint16_t qid = command->cdw10 & 0xffff; uint16_t sqid; DPRINTF("%s DELETE_IO_CQ %u", __func__, qid); if (qid == 0 || qid > sc->num_cqueues || (sc->compl_queues[qid].qbase == NULL)) { WPRINTF("%s queue index %u / num_cqueues %u", __func__, qid, sc->num_cqueues); pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, NVME_SC_INVALID_QUEUE_IDENTIFIER); return (1); } /* Deleting an Active CQ is an error */ for (sqid = 1; sqid < sc->num_squeues + 1; sqid++) if (sc->submit_queues[sqid].cqid == qid) { pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, NVME_SC_INVALID_QUEUE_DELETION); return (1); } sc->compl_queues[qid].qbase = NULL; pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); return (1); } static int nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command, struct nvme_completion* compl) { struct nvme_completion_queue *ncq; uint16_t qid = command->cdw10 & 0xffff; /* Only support Physically Contiguous queues */ if ((command->cdw11 & NVME_CMD_CDW11_PC) == 0) { WPRINTF("%s unsupported non-contig (list-based) " "create i/o completion queue", __func__); pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); return (1); } if ((qid == 0) || (qid > sc->num_cqueues) || (sc->compl_queues[qid].qbase != NULL)) { WPRINTF("%s queue index %u > num_cqueues %u", __func__, qid, sc->num_cqueues); pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, NVME_SC_INVALID_QUEUE_IDENTIFIER); return (1); } ncq = &sc->compl_queues[qid]; ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1; ncq->intr_vec = 
(command->cdw11 >> 16) & 0xffff; if (ncq->intr_vec > (sc->max_queues + 1)) { pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, NVME_SC_INVALID_INTERRUPT_VECTOR); return (1); } ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff); if ((ncq->size < 2) || (ncq->size > sc->max_qentries)) { /* * Queues must specify at least two entries * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec */ pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED); return (1); } ncq->head = ncq->tail = 0; ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, sizeof(struct nvme_command) * (size_t)ncq->size); pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); return (1); } static int nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command, struct nvme_completion* compl) { uint64_t logoff; uint32_t logsize; uint8_t logpage; pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); /* * Command specifies the number of dwords to return in fields NUMDU * and NUMDL. This is a zero-based value. */ logpage = command->cdw10 & 0xFF; logsize = ((command->cdw11 << 16) | (command->cdw10 >> 16)) + 1; logsize *= sizeof(uint32_t); logoff = ((uint64_t)(command->cdw13) << 32) | command->cdw12; DPRINTF("%s log page %u len %u", __func__, logpage, logsize); switch (logpage) { case NVME_LOG_ERROR: if (logoff >= sizeof(sc->err_log)) { pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); break; } nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, command->prp2, (uint8_t *)&sc->err_log + logoff, MIN(logsize - logoff, sizeof(sc->err_log)), NVME_COPY_TO_PRP); break; case NVME_LOG_HEALTH_INFORMATION: if (logoff >= sizeof(sc->health_log)) { pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); break; } pthread_mutex_lock(&sc->mtx); memcpy(&sc->health_log.data_units_read, &sc->read_data_units, sizeof(sc->health_log.data_units_read)); memcpy(&sc->health_log.data_units_written, &sc->write_data_units, sizeof(sc->health_log.data_units_written)); memcpy(&sc->health_log.host_read_commands, &sc->read_commands, sizeof(sc->health_log.host_read_commands)); memcpy(&sc->health_log.host_write_commands, &sc->write_commands, sizeof(sc->health_log.host_write_commands)); pthread_mutex_unlock(&sc->mtx); nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, command->prp2, (uint8_t *)&sc->health_log + logoff, MIN(logsize - logoff, sizeof(sc->health_log)), NVME_COPY_TO_PRP); break; case NVME_LOG_FIRMWARE_SLOT: if (logoff >= sizeof(sc->fw_log)) { pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); break; } nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, command->prp2, (uint8_t *)&sc->fw_log + logoff, MIN(logsize - logoff, sizeof(sc->fw_log)), NVME_COPY_TO_PRP); break; case NVME_LOG_CHANGED_NAMESPACE: if (logoff >= sizeof(sc->ns_log)) { pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); break; } nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, command->prp2, (uint8_t *)&sc->ns_log + logoff, MIN(logsize - logoff, sizeof(sc->ns_log)), NVME_COPY_TO_PRP); memset(&sc->ns_log, 0, sizeof(sc->ns_log)); break; default: DPRINTF("%s get log page %x command not supported", __func__, logpage); pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, NVME_SC_INVALID_LOG_PAGE); } return (1); } static int nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command, struct nvme_completion* compl) { void *dest; uint16_t status; DPRINTF("%s identify 0x%x nsid 0x%x", __func__, 
command->cdw10 & 0xFF, command->nsid); status = 0; pci_nvme_status_genc(&status, NVME_SC_SUCCESS); switch (command->cdw10 & 0xFF) { case 0x00: /* return Identify Namespace data structure */ /* Global NS only valid with NS Management */ if (command->nsid == NVME_GLOBAL_NAMESPACE_TAG) { pci_nvme_status_genc(&status, NVME_SC_INVALID_NAMESPACE_OR_FORMAT); break; } nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata), NVME_COPY_TO_PRP); break; case 0x01: /* return Identify Controller data structure */ nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, command->prp2, (uint8_t *)&sc->ctrldata, sizeof(sc->ctrldata), NVME_COPY_TO_PRP); break; case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */ dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, sizeof(uint32_t) * 1024); /* All unused entries shall be zero */ memset(dest, 0, sizeof(uint32_t) * 1024); ((uint32_t *)dest)[0] = 1; break; case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */ if (command->nsid != 1) { pci_nvme_status_genc(&status, NVME_SC_INVALID_NAMESPACE_OR_FORMAT); break; } dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, sizeof(uint32_t) * 1024); /* All bytes after the descriptor shall be zero */ memset(dest, 0, sizeof(uint32_t) * 1024); /* Return NIDT=1 (i.e. EUI64) descriptor */ ((uint8_t *)dest)[0] = 1; ((uint8_t *)dest)[1] = sizeof(uint64_t); memcpy(((uint8_t *)dest) + 4, sc->nsdata.eui64, sizeof(uint64_t)); break; case 0x13: /* * Controller list is optional but used by UNH tests. Return * a valid but empty list. */ dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, sizeof(uint16_t) * 2048); memset(dest, 0, sizeof(uint16_t) * 2048); break; default: DPRINTF("%s unsupported identify command requested 0x%x", __func__, command->cdw10 & 0xFF); pci_nvme_status_genc(&status, NVME_SC_INVALID_FIELD); break; } compl->status = status; return (1); } static const char * nvme_fid_to_name(uint8_t fid) { const char *name; switch (fid) { case NVME_FEAT_ARBITRATION: name = "Arbitration"; break; case NVME_FEAT_POWER_MANAGEMENT: name = "Power Management"; break; case NVME_FEAT_LBA_RANGE_TYPE: name = "LBA Range Type"; break; case NVME_FEAT_TEMPERATURE_THRESHOLD: name = "Temperature Threshold"; break; case NVME_FEAT_ERROR_RECOVERY: name = "Error Recovery"; break; case NVME_FEAT_VOLATILE_WRITE_CACHE: name = "Volatile Write Cache"; break; case NVME_FEAT_NUMBER_OF_QUEUES: name = "Number of Queues"; break; case NVME_FEAT_INTERRUPT_COALESCING: name = "Interrupt Coalescing"; break; case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION: name = "Interrupt Vector Configuration"; break; case NVME_FEAT_WRITE_ATOMICITY: name = "Write Atomicity Normal"; break; case NVME_FEAT_ASYNC_EVENT_CONFIGURATION: name = "Asynchronous Event Configuration"; break; case NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION: name = "Autonomous Power State Transition"; break; case NVME_FEAT_HOST_MEMORY_BUFFER: name = "Host Memory Buffer"; break; case NVME_FEAT_TIMESTAMP: name = "Timestamp"; break; case NVME_FEAT_KEEP_ALIVE_TIMER: name = "Keep Alive Timer"; break; case NVME_FEAT_HOST_CONTROLLED_THERMAL_MGMT: name = "Host Controlled Thermal Management"; break; case NVME_FEAT_NON_OP_POWER_STATE_CONFIG: name = "Non-Operation Power State Config"; break; case NVME_FEAT_READ_RECOVERY_LEVEL_CONFIG: name = "Read Recovery Level Config"; break; case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG: name = "Predictable Latency Mode Config"; break; case NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW: name = "Predictable Latency Mode 
Window"; break; case NVME_FEAT_LBA_STATUS_INFORMATION_ATTRIBUTES: name = "LBA Status Information Report Interval"; break; case NVME_FEAT_HOST_BEHAVIOR_SUPPORT: name = "Host Behavior Support"; break; case NVME_FEAT_SANITIZE_CONFIG: name = "Sanitize Config"; break; case NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION: name = "Endurance Group Event Configuration"; break; case NVME_FEAT_SOFTWARE_PROGRESS_MARKER: name = "Software Progress Marker"; break; case NVME_FEAT_HOST_IDENTIFIER: name = "Host Identifier"; break; case NVME_FEAT_RESERVATION_NOTIFICATION_MASK: name = "Reservation Notification Mask"; break; case NVME_FEAT_RESERVATION_PERSISTENCE: name = "Reservation Persistence"; break; case NVME_FEAT_NAMESPACE_WRITE_PROTECTION_CONFIG: name = "Namespace Write Protection Config"; break; default: name = "Unknown"; break; } return (name); } static void nvme_feature_invalid_cb(struct pci_nvme_softc *sc __unused, struct nvme_feature_obj *feat __unused, struct nvme_command *command __unused, struct nvme_completion *compl) { pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); } static void nvme_feature_iv_config(struct pci_nvme_softc *sc, struct nvme_feature_obj *feat __unused, struct nvme_command *command, struct nvme_completion *compl) { uint32_t i; uint32_t cdw11 = command->cdw11; uint16_t iv; bool cd; pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); iv = cdw11 & 0xffff; cd = cdw11 & (1 << 16); if (iv > (sc->max_queues + 1)) { return; } /* No Interrupt Coalescing (i.e. not Coalescing Disable) for Admin Q */ if ((iv == 0) && !cd) return; /* Requested Interrupt Vector must be used by a CQ */ for (i = 0; i < sc->num_cqueues + 1; i++) { if (sc->compl_queues[i].intr_vec == iv) { pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); } } } #define NVME_ASYNC_EVENT_ENDURANCE_GROUP (0x4000) static void nvme_feature_async_event(struct pci_nvme_softc *sc __unused, struct nvme_feature_obj *feat __unused, struct nvme_command *command, struct nvme_completion *compl) { if (command->cdw11 & NVME_ASYNC_EVENT_ENDURANCE_GROUP) pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); } #define NVME_TEMP_THRESH_OVER 0 #define NVME_TEMP_THRESH_UNDER 1 static void nvme_feature_temperature(struct pci_nvme_softc *sc, struct nvme_feature_obj *feat __unused, struct nvme_command *command, struct nvme_completion *compl) { uint16_t tmpth; /* Temperature Threshold */ uint8_t tmpsel; /* Threshold Temperature Select */ uint8_t thsel; /* Threshold Type Select */ bool set_crit = false; bool report_crit; tmpth = command->cdw11 & 0xffff; tmpsel = (command->cdw11 >> 16) & 0xf; thsel = (command->cdw11 >> 20) & 0x3; DPRINTF("%s: tmpth=%#x tmpsel=%#x thsel=%#x", __func__, tmpth, tmpsel, thsel); /* Check for unsupported values */ if (((tmpsel != 0) && (tmpsel != 0xf)) || (thsel > NVME_TEMP_THRESH_UNDER)) { pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); return; } if (((thsel == NVME_TEMP_THRESH_OVER) && (NVME_TEMPERATURE >= tmpth)) || ((thsel == NVME_TEMP_THRESH_UNDER) && (NVME_TEMPERATURE <= tmpth))) set_crit = true; pthread_mutex_lock(&sc->mtx); if (set_crit) sc->health_log.critical_warning |= NVME_CRIT_WARN_ST_TEMPERATURE; else sc->health_log.critical_warning &= ~NVME_CRIT_WARN_ST_TEMPERATURE; pthread_mutex_unlock(&sc->mtx); report_crit = sc->feat[NVME_FEAT_ASYNC_EVENT_CONFIGURATION].cdw11 & NVME_CRIT_WARN_ST_TEMPERATURE; if (set_crit && report_crit) pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_SMART, sc->health_log.critical_warning); DPRINTF("%s: set_crit=%c critical_warning=%#x status=%#x", __func__, 
set_crit ? 'T':'F', sc->health_log.critical_warning, compl->status); } static void nvme_feature_num_queues(struct pci_nvme_softc *sc, struct nvme_feature_obj *feat __unused, struct nvme_command *command, struct nvme_completion *compl) { uint16_t nqr; /* Number of Queues Requested */ if (sc->num_q_is_set) { WPRINTF("%s: Number of Queues already set", __func__); pci_nvme_status_genc(&compl->status, NVME_SC_COMMAND_SEQUENCE_ERROR); return; } nqr = command->cdw11 & 0xFFFF; if (nqr == 0xffff) { WPRINTF("%s: Illegal NSQR value %#x", __func__, nqr); pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); return; } sc->num_squeues = ONE_BASED(nqr); if (sc->num_squeues > sc->max_queues) { DPRINTF("NSQR=%u is greater than max %u", sc->num_squeues, sc->max_queues); sc->num_squeues = sc->max_queues; } nqr = (command->cdw11 >> 16) & 0xFFFF; if (nqr == 0xffff) { WPRINTF("%s: Illegal NCQR value %#x", __func__, nqr); pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); return; } sc->num_cqueues = ONE_BASED(nqr); if (sc->num_cqueues > sc->max_queues) { DPRINTF("NCQR=%u is greater than max %u", sc->num_cqueues, sc->max_queues); sc->num_cqueues = sc->max_queues; } /* Patch the command value which will be saved on callback's return */ command->cdw11 = NVME_FEATURE_NUM_QUEUES(sc); compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc); sc->num_q_is_set = true; } static int nvme_opc_set_features(struct pci_nvme_softc *sc, struct nvme_command *command, struct nvme_completion *compl) { struct nvme_feature_obj *feat; uint32_t nsid = command->nsid; uint8_t fid = NVMEV(NVME_FEAT_SET_FID, command->cdw10); bool sv = NVMEV(NVME_FEAT_SET_SV, command->cdw10); DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid)); if (fid >= NVME_FID_MAX) { DPRINTF("%s invalid feature 0x%x", __func__, fid); pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); return (1); } if (sv) { pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, NVME_SC_FEATURE_NOT_SAVEABLE); return (1); } feat = &sc->feat[fid]; if (feat->namespace_specific && (nsid == NVME_GLOBAL_NAMESPACE_TAG)) { pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); return (1); } if (!feat->namespace_specific && !((nsid == 0) || (nsid == NVME_GLOBAL_NAMESPACE_TAG))) { pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, NVME_SC_FEATURE_NOT_NS_SPECIFIC); return (1); } compl->cdw0 = 0; pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); if (feat->set) feat->set(sc, feat, command, compl); else { pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, NVME_SC_FEATURE_NOT_CHANGEABLE); return (1); } DPRINTF("%s: status=%#x cdw11=%#x", __func__, compl->status, command->cdw11); if (compl->status == NVME_SC_SUCCESS) { feat->cdw11 = command->cdw11; if ((fid == NVME_FEAT_ASYNC_EVENT_CONFIGURATION) && (command->cdw11 != 0)) pci_nvme_aen_notify(sc); } return (0); } #define NVME_FEATURES_SEL_SUPPORTED 0x3 #define NVME_FEATURES_NS_SPECIFIC (1 << 1) static int nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command, struct nvme_completion* compl) { struct nvme_feature_obj *feat; uint8_t fid = command->cdw10 & 0xFF; uint8_t sel = (command->cdw10 >> 8) & 0x7; DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid)); if (fid >= NVME_FID_MAX) { DPRINTF("%s invalid feature 0x%x", __func__, fid); pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); return (1); } compl->cdw0 = 0; pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); feat = &sc->feat[fid]; if (feat->get) { feat->get(sc, feat, 
command, compl); } if (compl->status == NVME_SC_SUCCESS) { if ((sel == NVME_FEATURES_SEL_SUPPORTED) && feat->namespace_specific) compl->cdw0 = NVME_FEATURES_NS_SPECIFIC; else compl->cdw0 = feat->cdw11; } return (0); } static int nvme_opc_format_nvm(struct pci_nvme_softc* sc, struct nvme_command* command, struct nvme_completion* compl) { uint8_t ses, lbaf, pi; /* Only supports Secure Erase Setting - User Data Erase */ ses = (command->cdw10 >> 9) & 0x7; if (ses > 0x1) { pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); return (1); } /* Only supports a single LBA Format */ lbaf = command->cdw10 & 0xf; if (lbaf != 0) { pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, NVME_SC_INVALID_FORMAT); return (1); } /* Doesn't support Protection Information */ pi = (command->cdw10 >> 5) & 0x7; if (pi != 0) { pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); return (1); } if (sc->nvstore.type == NVME_STOR_RAM) { if (sc->nvstore.ctx) free(sc->nvstore.ctx); sc->nvstore.ctx = calloc(1, sc->nvstore.size); pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); } else { struct pci_nvme_ioreq *req; int err; req = pci_nvme_get_ioreq(sc); if (req == NULL) { pci_nvme_status_genc(&compl->status, NVME_SC_INTERNAL_DEVICE_ERROR); WPRINTF("%s: unable to allocate IO req", __func__); return (1); } req->nvme_sq = &sc->submit_queues[0]; req->sqid = 0; req->opc = command->opc; req->cid = command->cid; req->nsid = command->nsid; req->io_req.br_offset = 0; req->io_req.br_resid = sc->nvstore.size; req->io_req.br_callback = pci_nvme_io_done; err = blockif_delete(sc->nvstore.ctx, &req->io_req); if (err) { pci_nvme_status_genc(&compl->status, NVME_SC_INTERNAL_DEVICE_ERROR); pci_nvme_release_ioreq(sc, req); } else compl->status = NVME_NO_STATUS; } return (1); } static int nvme_opc_abort(struct pci_nvme_softc *sc __unused, struct nvme_command *command, struct nvme_completion *compl) { DPRINTF("%s submission queue %u, command ID 0x%x", __func__, command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF); /* TODO: search for the command ID and abort it */ compl->cdw0 = 1; pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); return (1); } static int nvme_opc_async_event_req(struct pci_nvme_softc* sc, struct nvme_command* command, struct nvme_completion* compl) { DPRINTF("%s async event request count=%u aerl=%u cid=%#x", __func__, sc->aer_count, sc->ctrldata.aerl, command->cid); /* Don't exceed the Async Event Request Limit (AERL). */ if (pci_nvme_aer_limit_reached(sc)) { pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED); return (1); } if (pci_nvme_aer_add(sc, command->cid)) { pci_nvme_status_tc(&compl->status, NVME_SCT_GENERIC, NVME_SC_INTERNAL_DEVICE_ERROR); return (1); } /* * Raise events when they happen based on the Set Features cmd. * These events happen asynchronously, so only set the completion as * successful if there is an event reflecting the request.
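 * Returning NVME_NO_STATUS makes NVME_COMPLETION_VALID() false, so
 * pci_nvme_handle_admin_cmd() posts no immediate completion; the AER is
 * instead completed from pci_nvme_aen_process() when an event fires.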
*/ compl->status = NVME_NO_STATUS; pci_nvme_aen_notify(sc); return (0); } static void pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value) { struct nvme_completion compl; struct nvme_command *cmd; struct nvme_submission_queue *sq; struct nvme_completion_queue *cq; uint16_t sqhead; DPRINTF("%s index %u", __func__, (uint32_t)value); sq = &sc->submit_queues[0]; cq = &sc->compl_queues[0]; pthread_mutex_lock(&sq->mtx); sqhead = sq->head; DPRINTF("sqhead %u, tail %u", sqhead, sq->tail); while (sqhead != atomic_load_acq_short(&sq->tail)) { cmd = &(sq->qbase)[sqhead]; compl.cdw0 = 0; compl.status = 0; switch (cmd->opc) { case NVME_OPC_DELETE_IO_SQ: DPRINTF("%s command DELETE_IO_SQ", __func__); nvme_opc_delete_io_sq(sc, cmd, &compl); break; case NVME_OPC_CREATE_IO_SQ: DPRINTF("%s command CREATE_IO_SQ", __func__); nvme_opc_create_io_sq(sc, cmd, &compl); break; case NVME_OPC_DELETE_IO_CQ: DPRINTF("%s command DELETE_IO_CQ", __func__); nvme_opc_delete_io_cq(sc, cmd, &compl); break; case NVME_OPC_CREATE_IO_CQ: DPRINTF("%s command CREATE_IO_CQ", __func__); nvme_opc_create_io_cq(sc, cmd, &compl); break; case NVME_OPC_GET_LOG_PAGE: DPRINTF("%s command GET_LOG_PAGE", __func__); nvme_opc_get_log_page(sc, cmd, &compl); break; case NVME_OPC_IDENTIFY: DPRINTF("%s command IDENTIFY", __func__); nvme_opc_identify(sc, cmd, &compl); break; case NVME_OPC_ABORT: DPRINTF("%s command ABORT", __func__); nvme_opc_abort(sc, cmd, &compl); break; case NVME_OPC_SET_FEATURES: DPRINTF("%s command SET_FEATURES", __func__); nvme_opc_set_features(sc, cmd, &compl); break; case NVME_OPC_GET_FEATURES: DPRINTF("%s command GET_FEATURES", __func__); nvme_opc_get_features(sc, cmd, &compl); break; case NVME_OPC_FIRMWARE_ACTIVATE: DPRINTF("%s command FIRMWARE_ACTIVATE", __func__); pci_nvme_status_tc(&compl.status, NVME_SCT_COMMAND_SPECIFIC, NVME_SC_INVALID_FIRMWARE_SLOT); break; case NVME_OPC_ASYNC_EVENT_REQUEST: DPRINTF("%s command ASYNC_EVENT_REQ", __func__); nvme_opc_async_event_req(sc, cmd, &compl); break; case NVME_OPC_FORMAT_NVM: DPRINTF("%s command FORMAT_NVM", __func__); if ((sc->ctrldata.oacs & (1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT)) == 0) { pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE); break; } nvme_opc_format_nvm(sc, cmd, &compl); break; case NVME_OPC_SECURITY_SEND: case NVME_OPC_SECURITY_RECEIVE: case NVME_OPC_SANITIZE: case NVME_OPC_GET_LBA_STATUS: DPRINTF("%s command OPC=%#x (unsupported)", __func__, cmd->opc); /* Valid but unsupported opcodes */ pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_FIELD); break; default: DPRINTF("%s command OPC=%#X (not implemented)", __func__, cmd->opc); pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE); } sqhead = (sqhead + 1) % sq->size; if (NVME_COMPLETION_VALID(compl)) { pci_nvme_cq_update(sc, &sc->compl_queues[0], compl.cdw0, cmd->cid, 0, /* SQID */ compl.status); } } DPRINTF("setting sqhead %u", sqhead); sq->head = sqhead; if (cq->head != cq->tail) pci_generate_msix(sc->nsc_pi, 0); pthread_mutex_unlock(&sq->mtx); } /* * Update the Write and Read statistics reported in SMART data * * NVMe counts "data units" in thousands of 512 byte blocks, rounded up. * E.g. 1 data unit is 1 - 1,000 512 byte blocks. 3 data units are 2,001 - 3,000 * 512 byte blocks. Rounding up is achieved by initializing the remainder to 999.
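 * For example, a hypothetical 4 KiB write adds 8 blocks to the write
 * remainder; once the remainder reaches 1,000 blocks, write_data_units
 * is credited with one unit and the remainder is reduced by 1,000.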
*/ static void pci_nvme_stats_write_read_update(struct pci_nvme_softc *sc, uint8_t opc, size_t bytes, uint16_t status) { pthread_mutex_lock(&sc->mtx); switch (opc) { case NVME_OPC_WRITE: sc->write_commands++; if (status != NVME_SC_SUCCESS) break; sc->write_dunits_remainder += (bytes / 512); while (sc->write_dunits_remainder >= 1000) { sc->write_data_units++; sc->write_dunits_remainder -= 1000; } break; case NVME_OPC_READ: sc->read_commands++; if (status != NVME_SC_SUCCESS) break; sc->read_dunits_remainder += (bytes / 512); while (sc->read_dunits_remainder >= 1000) { sc->read_data_units++; sc->read_dunits_remainder -= 1000; } break; default: DPRINTF("%s: Invalid OPC 0x%02x for stats", __func__, opc); break; } pthread_mutex_unlock(&sc->mtx); } /* * Check if the combination of Starting LBA (slba) and number of blocks * exceeds the range of the underlying storage. * * Because NVMe specifies the SLBA in blocks as a uint64_t and blockif stores * the capacity in bytes as a uint64_t, care must be taken to avoid integer * overflow. */ static bool pci_nvme_out_of_range(struct pci_nvme_blockstore *nvstore, uint64_t slba, uint32_t nblocks) { size_t offset, bytes; /* Overflow check of multiplying Starting LBA by the sector size */ if (slba >> (64 - nvstore->sectsz_bits)) return (true); offset = slba << nvstore->sectsz_bits; bytes = nblocks << nvstore->sectsz_bits; /* Overflow check of Number of Logical Blocks */ if ((nvstore->size <= offset) || ((nvstore->size - offset) < bytes)) return (true); return (false); } static int pci_nvme_append_iov_req(struct pci_nvme_softc *sc __unused, struct pci_nvme_ioreq *req, uint64_t gpaddr, size_t size, uint64_t offset) { int iovidx; bool range_is_contiguous; if (req == NULL) return (-1); if (req->io_req.br_iovcnt == NVME_MAX_IOVEC) { return (-1); } /* * Minimize the number of IOVs by concatenating contiguous address * ranges. If the IOV count is zero, there is no previous range to * concatenate. 
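 *
 * For illustration (editor's sketch): guest pages at 0x10000 and 0x11000
 * arriving as separate 4 KiB PRP entries satisfy
 * (prev_gpaddr + prev_size) == gpaddr, so the second page extends the
 * previous iovec's iov_len instead of consuming another of the
 * NVME_MAX_IOVEC slots.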
*/ if (req->io_req.br_iovcnt == 0) range_is_contiguous = false; else range_is_contiguous = (req->prev_gpaddr + req->prev_size) == gpaddr; if (range_is_contiguous) { iovidx = req->io_req.br_iovcnt - 1; req->io_req.br_iov[iovidx].iov_base = paddr_guest2host(req->sc->nsc_pi->pi_vmctx, req->prev_gpaddr, size); if (req->io_req.br_iov[iovidx].iov_base == NULL) return (-1); req->prev_size += size; req->io_req.br_resid += size; req->io_req.br_iov[iovidx].iov_len = req->prev_size; } else { iovidx = req->io_req.br_iovcnt; if (iovidx == 0) { req->io_req.br_offset = offset; req->io_req.br_resid = 0; req->io_req.br_param = req; } req->io_req.br_iov[iovidx].iov_base = paddr_guest2host(req->sc->nsc_pi->pi_vmctx, gpaddr, size); if (req->io_req.br_iov[iovidx].iov_base == NULL) return (-1); req->io_req.br_iov[iovidx].iov_len = size; req->prev_gpaddr = gpaddr; req->prev_size = size; req->io_req.br_resid += size; req->io_req.br_iovcnt++; } return (0); } static void pci_nvme_set_completion(struct pci_nvme_softc *sc, struct nvme_submission_queue *sq, int sqid, uint16_t cid, uint16_t status) { struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid]; DPRINTF("%s sqid %d cqid %u cid %u status: 0x%x 0x%x", __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status), NVME_STATUS_GET_SC(status)); pci_nvme_cq_update(sc, cq, 0, cid, sqid, status); if (cq->head != cq->tail) { if (cq->intr_en & NVME_CQ_INTEN) { pci_generate_msix(sc->nsc_pi, cq->intr_vec); } else { DPRINTF("%s: CQ%u interrupt disabled", __func__, sq->cqid); } } } static void pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req) { req->sc = NULL; req->nvme_sq = NULL; req->sqid = 0; pthread_mutex_lock(&sc->mtx); STAILQ_INSERT_TAIL(&sc->ioreqs_free, req, link); sc->pending_ios--; /* when no more IO pending, can set to ready if device reset/enabled */ if (sc->pending_ios == 0 && NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts))) sc->regs.csts |= NVME_CSTS_RDY; pthread_mutex_unlock(&sc->mtx); sem_post(&sc->iosemlock); } static struct pci_nvme_ioreq * pci_nvme_get_ioreq(struct pci_nvme_softc *sc) { struct pci_nvme_ioreq *req = NULL; sem_wait(&sc->iosemlock); pthread_mutex_lock(&sc->mtx); req = STAILQ_FIRST(&sc->ioreqs_free); assert(req != NULL); STAILQ_REMOVE_HEAD(&sc->ioreqs_free, link); req->sc = sc; sc->pending_ios++; pthread_mutex_unlock(&sc->mtx); req->io_req.br_iovcnt = 0; req->io_req.br_offset = 0; req->io_req.br_resid = 0; req->io_req.br_param = req; req->prev_gpaddr = 0; req->prev_size = 0; return req; } static void pci_nvme_io_done(struct blockif_req *br, int err) { struct pci_nvme_ioreq *req = br->br_param; struct nvme_submission_queue *sq = req->nvme_sq; uint16_t code, status; DPRINTF("%s error %d %s", __func__, err, strerror(err)); /* TODO return correct error */ code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS; status = 0; pci_nvme_status_genc(&status, code); pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, status); pci_nvme_stats_write_read_update(req->sc, req->opc, req->bytes, status); pci_nvme_release_ioreq(req->sc, req); } /* * Implements the Flush command. The specification states: * If a volatile write cache is not present, Flush commands complete * successfully and have no effect * in the description of the Volatile Write Cache (VWC) field of the Identify * Controller data. Therefore, set status to Success if the command is * not supported (i.e. RAM or as indicated by the blockif). 
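 *
 * In summary (editor's sketch of the cases handled below): a RAM-backed
 * namespace completes with SUCCESS immediately; a successfully queued
 * blockif_flush() defers completion to pci_nvme_io_done; EOPNOTSUPP maps
 * to SUCCESS, there being no volatile write cache to flush; any other
 * error maps to INTERNAL_DEVICE_ERROR.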
*/ static bool nvme_opc_flush(struct pci_nvme_softc *sc __unused, struct nvme_command *cmd __unused, struct pci_nvme_blockstore *nvstore, struct pci_nvme_ioreq *req, uint16_t *status) { bool pending = false; if (nvstore->type == NVME_STOR_RAM) { pci_nvme_status_genc(status, NVME_SC_SUCCESS); } else { int err; req->io_req.br_callback = pci_nvme_io_done; err = blockif_flush(nvstore->ctx, &req->io_req); switch (err) { case 0: pending = true; break; case EOPNOTSUPP: pci_nvme_status_genc(status, NVME_SC_SUCCESS); break; default: pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); } } return (pending); } static uint16_t nvme_write_read_ram(struct pci_nvme_softc *sc, struct pci_nvme_blockstore *nvstore, uint64_t prp1, uint64_t prp2, size_t offset, uint64_t bytes, bool is_write) { uint8_t *buf = nvstore->ctx; enum nvme_copy_dir dir; uint16_t status; if (is_write) dir = NVME_COPY_TO_PRP; else dir = NVME_COPY_FROM_PRP; status = 0; if (nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, prp1, prp2, buf + offset, bytes, dir)) pci_nvme_status_genc(&status, NVME_SC_DATA_TRANSFER_ERROR); else pci_nvme_status_genc(&status, NVME_SC_SUCCESS); return (status); } static uint16_t nvme_write_read_blockif(struct pci_nvme_softc *sc, struct pci_nvme_blockstore *nvstore, struct pci_nvme_ioreq *req, uint64_t prp1, uint64_t prp2, size_t offset, uint64_t bytes, bool is_write) { uint64_t size; int err; uint16_t status = NVME_NO_STATUS; size = MIN(PAGE_SIZE - (prp1 % PAGE_SIZE), bytes); if (pci_nvme_append_iov_req(sc, req, prp1, size, offset)) { err = -1; goto out; } offset += size; bytes -= size; if (bytes == 0) { ; } else if (bytes <= PAGE_SIZE) { size = bytes; if (pci_nvme_append_iov_req(sc, req, prp2, size, offset)) { err = -1; goto out; } } else { void *vmctx = sc->nsc_pi->pi_vmctx; uint64_t *prp_list = &prp2; uint64_t *last = prp_list; /* PRP2 is pointer to a physical region page list */ while (bytes) { /* Last entry in list points to the next list */ if ((prp_list == last) && (bytes > PAGE_SIZE)) { uint64_t prp = *prp_list; prp_list = paddr_guest2host(vmctx, prp, PAGE_SIZE - (prp % PAGE_SIZE)); if (prp_list == NULL) { err = -1; goto out; } last = prp_list + (NVME_PRP2_ITEMS - 1); } size = MIN(bytes, PAGE_SIZE); if (pci_nvme_append_iov_req(sc, req, *prp_list, size, offset)) { err = -1; goto out; } offset += size; bytes -= size; prp_list++; } } req->io_req.br_callback = pci_nvme_io_done; if (is_write) err = blockif_write(nvstore->ctx, &req->io_req); else err = blockif_read(nvstore->ctx, &req->io_req); out: if (err) pci_nvme_status_genc(&status, NVME_SC_DATA_TRANSFER_ERROR); return (status); } static bool nvme_opc_write_read(struct pci_nvme_softc *sc, struct nvme_command *cmd, struct pci_nvme_blockstore *nvstore, struct pci_nvme_ioreq *req, uint16_t *status) { uint64_t lba, nblocks, bytes; size_t offset; bool is_write = cmd->opc == NVME_OPC_WRITE; bool pending = false; lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10; nblocks = (cmd->cdw12 & 0xFFFF) + 1; bytes = nblocks << nvstore->sectsz_bits; if (bytes > NVME_MAX_DATA_SIZE) { WPRINTF("%s command would exceed MDTS", __func__); pci_nvme_status_genc(status, NVME_SC_INVALID_FIELD); goto out; } if (pci_nvme_out_of_range(nvstore, lba, nblocks)) { WPRINTF("%s command would exceed LBA range(slba=%#lx nblocks=%#lx)", __func__, lba, nblocks); pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE); goto out; } offset = lba << nvstore->sectsz_bits; req->bytes = bytes; req->io_req.br_offset = lba; /* PRP bits 1:0 must be zero */ cmd->prp1 &= ~0x3UL; cmd->prp2 &= ~0x3UL; if 
(nvstore->type == NVME_STOR_RAM) { *status = nvme_write_read_ram(sc, nvstore, cmd->prp1, cmd->prp2, offset, bytes, is_write); } else { *status = nvme_write_read_blockif(sc, nvstore, req, cmd->prp1, cmd->prp2, offset, bytes, is_write); if (*status == NVME_NO_STATUS) pending = true; } out: if (!pending) pci_nvme_stats_write_read_update(sc, cmd->opc, bytes, *status); return (pending); } static void pci_nvme_dealloc_sm(struct blockif_req *br, int err) { struct pci_nvme_ioreq *req = br->br_param; struct pci_nvme_softc *sc = req->sc; bool done = true; uint16_t status; status = 0; if (err) { pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR); } else if ((req->prev_gpaddr + 1) == (req->prev_size)) { pci_nvme_status_genc(&status, NVME_SC_SUCCESS); } else { struct iovec *iov = req->io_req.br_iov; req->prev_gpaddr++; iov += req->prev_gpaddr; /* The iov_* values already include the sector size */ req->io_req.br_offset = (off_t)iov->iov_base; req->io_req.br_resid = iov->iov_len; if (blockif_delete(sc->nvstore.ctx, &req->io_req)) { pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR); } else done = false; } if (done) { pci_nvme_set_completion(sc, req->nvme_sq, req->sqid, req->cid, status); pci_nvme_release_ioreq(sc, req); } } static bool nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc, struct nvme_command *cmd, struct pci_nvme_blockstore *nvstore, struct pci_nvme_ioreq *req, uint16_t *status) { struct nvme_dsm_range *range = NULL; uint32_t nr, r, non_zero, dr; int err; bool pending = false; if ((sc->ctrldata.oncs & NVME_ONCS_DSM) == 0) { pci_nvme_status_genc(status, NVME_SC_INVALID_OPCODE); goto out; } nr = cmd->cdw10 & 0xff; /* copy locally because a range entry could straddle PRPs */ range = calloc(1, NVME_MAX_DSM_TRIM); if (range == NULL) { pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); goto out; } nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2, (uint8_t *)range, NVME_MAX_DSM_TRIM, NVME_COPY_FROM_PRP); /* Check for invalid ranges and the number of non-zero lengths */ non_zero = 0; for (r = 0; r <= nr; r++) { if (pci_nvme_out_of_range(nvstore, range[r].starting_lba, range[r].length)) { pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE); goto out; } if (range[r].length != 0) non_zero++; } if (cmd->cdw11 & NVME_DSM_ATTR_DEALLOCATE) { size_t offset, bytes; int sectsz_bits = sc->nvstore.sectsz_bits; /* * DSM calls are advisory only, and compliant controllers * may choose to take no actions (i.e. return Success). */ if (!nvstore->deallocate) { pci_nvme_status_genc(status, NVME_SC_SUCCESS); goto out; } /* If all ranges have a zero length, return Success */ if (non_zero == 0) { pci_nvme_status_genc(status, NVME_SC_SUCCESS); goto out; } if (req == NULL) { pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); goto out; } offset = range[0].starting_lba << sectsz_bits; bytes = range[0].length << sectsz_bits; /* * If the request is for more than a single range, store * the ranges in the br_iov. Optimize for the common case * of a single range. 
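 *
 * As an editor's sketch of the two paths: with a single range the byte
 * offset and length go straight into br_offset/br_resid and the request
 * completes via pci_nvme_io_done; with multiple ranges each non-empty
 * range is stashed as a pseudo-iovec (iov_base holding the byte offset,
 * iov_len the byte count) and pci_nvme_dealloc_sm replays them one
 * blockif_delete() at a time.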
* * Note that NVMe Number of Ranges is a zero based value */ req->io_req.br_iovcnt = 0; req->io_req.br_offset = offset; req->io_req.br_resid = bytes; if (nr == 0) { req->io_req.br_callback = pci_nvme_io_done; } else { struct iovec *iov = req->io_req.br_iov; for (r = 0, dr = 0; r <= nr; r++) { offset = range[r].starting_lba << sectsz_bits; bytes = range[r].length << sectsz_bits; if (bytes == 0) continue; if ((nvstore->size - offset) < bytes) { pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE); goto out; } iov[dr].iov_base = (void *)offset; iov[dr].iov_len = bytes; dr++; } req->io_req.br_callback = pci_nvme_dealloc_sm; /* * Use prev_gpaddr to track the current entry and * prev_size to track the number of entries */ req->prev_gpaddr = 0; req->prev_size = dr; } err = blockif_delete(nvstore->ctx, &req->io_req); if (err) pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); else pending = true; } out: free(range); return (pending); } static void pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx) { struct nvme_submission_queue *sq; uint16_t status; uint16_t sqhead; /* handle all submissions up to sq->tail index */ sq = &sc->submit_queues[idx]; pthread_mutex_lock(&sq->mtx); sqhead = sq->head; DPRINTF("nvme_handle_io qid %u head %u tail %u cmdlist %p", idx, sqhead, sq->tail, sq->qbase); while (sqhead != atomic_load_acq_short(&sq->tail)) { struct nvme_command *cmd; struct pci_nvme_ioreq *req; uint32_t nsid; bool pending; pending = false; req = NULL; status = 0; cmd = &sq->qbase[sqhead]; sqhead = (sqhead + 1) % sq->size; nsid = le32toh(cmd->nsid); if ((nsid == 0) || (nsid > sc->ctrldata.nn)) { pci_nvme_status_genc(&status, NVME_SC_INVALID_NAMESPACE_OR_FORMAT); status |= NVME_STATUS_DNR_MASK << NVME_STATUS_DNR_SHIFT; goto complete; } req = pci_nvme_get_ioreq(sc); if (req == NULL) { pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR); WPRINTF("%s: unable to allocate IO req", __func__); goto complete; } req->nvme_sq = sq; req->sqid = idx; req->opc = cmd->opc; req->cid = cmd->cid; req->nsid = cmd->nsid; switch (cmd->opc) { case NVME_OPC_FLUSH: pending = nvme_opc_flush(sc, cmd, &sc->nvstore, req, &status); break; case NVME_OPC_WRITE: case NVME_OPC_READ: pending = nvme_opc_write_read(sc, cmd, &sc->nvstore, req, &status); break; case NVME_OPC_WRITE_ZEROES: /* TODO: write zeroes WPRINTF("%s write zeroes lba 0x%lx blocks %u", __func__, lba, cmd->cdw12 & 0xFFFF); */ pci_nvme_status_genc(&status, NVME_SC_SUCCESS); break; case NVME_OPC_DATASET_MANAGEMENT: pending = nvme_opc_dataset_mgmt(sc, cmd, &sc->nvstore, req, &status); break; default: WPRINTF("%s unhandled io command 0x%x", __func__, cmd->opc); pci_nvme_status_genc(&status, NVME_SC_INVALID_OPCODE); } complete: if (!pending) { pci_nvme_set_completion(sc, sq, idx, cmd->cid, status); if (req != NULL) pci_nvme_release_ioreq(sc, req); } } sq->head = sqhead; pthread_mutex_unlock(&sq->mtx); } static void pci_nvme_handle_doorbell(struct vmctx *ctx __unused, struct pci_nvme_softc* sc, uint64_t idx, int is_sq, uint64_t value) { DPRINTF("nvme doorbell %lu, %s, val 0x%lx", idx, is_sq ? 
"SQ" : "CQ", value & 0xFFFF); if (is_sq) { if (idx > sc->num_squeues) { WPRINTF("%s queue index %lu overflow from " "guest (max %u)", __func__, idx, sc->num_squeues); return; } atomic_store_short(&sc->submit_queues[idx].tail, (uint16_t)value); if (idx == 0) { pci_nvme_handle_admin_cmd(sc, value); } else { /* submission queue; handle new entries in SQ */ if (idx > sc->num_squeues) { WPRINTF("%s SQ index %lu overflow from " "guest (max %u)", __func__, idx, sc->num_squeues); return; } pci_nvme_handle_io_cmd(sc, (uint16_t)idx); } } else { if (idx > sc->num_cqueues) { WPRINTF("%s queue index %lu overflow from " "guest (max %u)", __func__, idx, sc->num_cqueues); return; } atomic_store_short(&sc->compl_queues[idx].head, (uint16_t)value); } } static void pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite) { const char *s = iswrite ? "WRITE" : "READ"; switch (offset) { case NVME_CR_CAP_LOW: DPRINTF("%s %s NVME_CR_CAP_LOW", func, s); break; case NVME_CR_CAP_HI: DPRINTF("%s %s NVME_CR_CAP_HI", func, s); break; case NVME_CR_VS: DPRINTF("%s %s NVME_CR_VS", func, s); break; case NVME_CR_INTMS: DPRINTF("%s %s NVME_CR_INTMS", func, s); break; case NVME_CR_INTMC: DPRINTF("%s %s NVME_CR_INTMC", func, s); break; case NVME_CR_CC: DPRINTF("%s %s NVME_CR_CC", func, s); break; case NVME_CR_CSTS: DPRINTF("%s %s NVME_CR_CSTS", func, s); break; case NVME_CR_NSSR: DPRINTF("%s %s NVME_CR_NSSR", func, s); break; case NVME_CR_AQA: DPRINTF("%s %s NVME_CR_AQA", func, s); break; case NVME_CR_ASQ_LOW: DPRINTF("%s %s NVME_CR_ASQ_LOW", func, s); break; case NVME_CR_ASQ_HI: DPRINTF("%s %s NVME_CR_ASQ_HI", func, s); break; case NVME_CR_ACQ_LOW: DPRINTF("%s %s NVME_CR_ACQ_LOW", func, s); break; case NVME_CR_ACQ_HI: DPRINTF("%s %s NVME_CR_ACQ_HI", func, s); break; default: DPRINTF("unknown nvme bar-0 offset 0x%lx", offset); } } static void pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc, uint64_t offset, int size, uint64_t value) { uint32_t ccreg; if (offset >= NVME_DOORBELL_OFFSET) { uint64_t belloffset = offset - NVME_DOORBELL_OFFSET; uint64_t idx = belloffset / 8; /* door bell size = 2*int */ int is_sq = (belloffset % 8) < 4; if ((sc->regs.csts & NVME_CSTS_RDY) == 0) { WPRINTF("doorbell write prior to RDY (offset=%#lx)\n", offset); return; } if (belloffset > ((sc->max_queues+1) * 8 - 4)) { WPRINTF("guest attempted an overflow write offset " "0x%lx, val 0x%lx in %s", offset, value, __func__); return; } if (is_sq) { if (sc->submit_queues[idx].qbase == NULL) return; } else if (sc->compl_queues[idx].qbase == NULL) return; pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value); return; } DPRINTF("nvme-write offset 0x%lx, size %d, value 0x%lx", offset, size, value); if (size != 4) { WPRINTF("guest wrote invalid size %d (offset 0x%lx, " "val 0x%lx) to bar0 in %s", size, offset, value, __func__); /* TODO: shutdown device */ return; } pci_nvme_bar0_reg_dumps(__func__, offset, 1); pthread_mutex_lock(&sc->mtx); switch (offset) { case NVME_CR_CAP_LOW: case NVME_CR_CAP_HI: /* readonly */ break; case NVME_CR_VS: /* readonly */ break; case NVME_CR_INTMS: /* MSI-X, so ignore */ break; case NVME_CR_INTMC: /* MSI-X, so ignore */ break; case NVME_CR_CC: ccreg = (uint32_t)value; DPRINTF("%s NVME_CR_CC en %x css %x shn %x iosqes %u " "iocqes %u", __func__, NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg), NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg), NVME_CC_GET_IOCQES(ccreg)); if (NVME_CC_GET_SHN(ccreg)) { /* perform shutdown - flush out data to backend */ sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK << 
NVME_CSTS_REG_SHST_SHIFT); sc->regs.csts |= NVME_SHST_COMPLETE << NVME_CSTS_REG_SHST_SHIFT; } if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) { if (NVME_CC_GET_EN(ccreg) == 0) /* transition 1->0 causes controller reset */ pci_nvme_reset_locked(sc); else pci_nvme_init_controller(ctx, sc); } /* Insert the iocqes, iosqes and en bits from the write */ sc->regs.cc &= ~NVME_CC_WRITE_MASK; sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK; if (NVME_CC_GET_EN(ccreg) == 0) { /* Insert the ams, mps and css bit fields */ sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK; sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK; sc->regs.csts &= ~NVME_CSTS_RDY; } else if ((sc->pending_ios == 0) && !(sc->regs.csts & NVME_CSTS_CFS)) { sc->regs.csts |= NVME_CSTS_RDY; } break; case NVME_CR_CSTS: break; case NVME_CR_NSSR: /* ignore writes; don't support subsystem reset */ break; case NVME_CR_AQA: sc->regs.aqa = (uint32_t)value; break; case NVME_CR_ASQ_LOW: sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) | (0xFFFFF000 & value); break; case NVME_CR_ASQ_HI: sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) | (value << 32); break; case NVME_CR_ACQ_LOW: sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) | (0xFFFFF000 & value); break; case NVME_CR_ACQ_HI: sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) | (value << 32); break; default: DPRINTF("%s unknown offset 0x%lx, value 0x%lx size %d", __func__, offset, value, size); } pthread_mutex_unlock(&sc->mtx); } static void pci_nvme_write(struct vmctx *ctx, int vcpu __unused, struct pci_devinst *pi, int baridx, uint64_t offset, int size, uint64_t value) { struct pci_nvme_softc* sc = pi->pi_arg; if (baridx == pci_msix_table_bar(pi) || baridx == pci_msix_pba_bar(pi)) { DPRINTF("nvme-write baridx %d, msix: off 0x%lx, size %d, " " value 0x%lx", baridx, offset, size, value); pci_emul_msix_twrite(pi, offset, size, value); return; } switch (baridx) { case 0: pci_nvme_write_bar_0(ctx, sc, offset, size, value); break; default: DPRINTF("%s unknown baridx %d, val 0x%lx", __func__, baridx, value); } } static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc, uint64_t offset, int size) { uint64_t value; pci_nvme_bar0_reg_dumps(__func__, offset, 0); if (offset < NVME_DOORBELL_OFFSET) { void *p = &(sc->regs); pthread_mutex_lock(&sc->mtx); memcpy(&value, (void *)((uintptr_t)p + offset), size); pthread_mutex_unlock(&sc->mtx); } else { value = 0; WPRINTF("pci_nvme: read invalid offset %ld", offset); } switch (size) { case 1: value &= 0xFF; break; case 2: value &= 0xFFFF; break; case 4: value &= 0xFFFFFFFF; break; } DPRINTF(" nvme-read offset 0x%lx, size %d -> value 0x%x", offset, size, (uint32_t)value); return (value); } static uint64_t pci_nvme_read(struct vmctx *ctx __unused, int vcpu __unused, struct pci_devinst *pi, int baridx, uint64_t offset, int size) { struct pci_nvme_softc* sc = pi->pi_arg; if (baridx == pci_msix_table_bar(pi) || baridx == pci_msix_pba_bar(pi)) { DPRINTF("nvme-read bar: %d, msix: regoff 0x%lx, size %d", baridx, offset, size); return pci_emul_msix_tread(pi, offset, size); } switch (baridx) { case 0: return pci_nvme_read_bar_0(sc, offset, size); default: DPRINTF("unknown bar %d, 0x%lx", baridx, offset); } return (0); } static int pci_nvme_parse_config(struct pci_nvme_softc *sc, nvlist_t *nvl) { - char bident[sizeof("XX:X:X")]; + char bident[sizeof("XXX:XXX")]; const char *value; uint32_t sectsz; sc->max_queues = NVME_QUEUES; sc->max_qentries = NVME_MAX_QENTRIES; sc->ioslots = NVME_IOSLOTS; sc->num_squeues = sc->max_queues; sc->num_cqueues = sc->max_queues;
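	/*
	 * Editor's illustration (hypothetical command lines; the option
	 * names match the get_config_value_node() keys parsed below):
	 *
	 *   -s 4,nvme,/path/to/disk.img,maxq=4,qsz=1024,ioslots=16,\
	 *       sectsz=4096,ser=ABC123,dsm=enable
	 *   -s 4,nvme,ram=512		(512 MiB RAM-backed namespace)
	 */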
sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO; sectsz = 0; snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn), "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); value = get_config_value_node(nvl, "maxq"); if (value != NULL) sc->max_queues = atoi(value); value = get_config_value_node(nvl, "qsz"); if (value != NULL) { sc->max_qentries = atoi(value); if (sc->max_qentries <= 0) { EPRINTLN("nvme: Invalid qsz option %d", sc->max_qentries); return (-1); } } value = get_config_value_node(nvl, "ioslots"); if (value != NULL) { sc->ioslots = atoi(value); if (sc->ioslots <= 0) { EPRINTLN("Invalid ioslots option %d", sc->ioslots); return (-1); } } value = get_config_value_node(nvl, "sectsz"); if (value != NULL) sectsz = atoi(value); value = get_config_value_node(nvl, "ser"); if (value != NULL) { /* * This field indicates the Product Serial Number in * 7-bit ASCII, unused bytes should be space characters. * Ref: NVMe v1.3c. */ cpywithpad((char *)sc->ctrldata.sn, sizeof(sc->ctrldata.sn), value, ' '); } value = get_config_value_node(nvl, "eui64"); if (value != NULL) sc->nvstore.eui64 = htobe64(strtoull(value, NULL, 0)); value = get_config_value_node(nvl, "dsm"); if (value != NULL) { if (strcmp(value, "auto") == 0) sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO; else if (strcmp(value, "enable") == 0) sc->dataset_management = NVME_DATASET_MANAGEMENT_ENABLE; else if (strcmp(value, "disable") == 0) sc->dataset_management = NVME_DATASET_MANAGEMENT_DISABLE; } value = get_config_value_node(nvl, "ram"); if (value != NULL) { uint64_t sz = strtoull(value, NULL, 10); sc->nvstore.type = NVME_STOR_RAM; sc->nvstore.size = sz * 1024 * 1024; sc->nvstore.ctx = calloc(1, sc->nvstore.size); sc->nvstore.sectsz = 4096; sc->nvstore.sectsz_bits = 12; if (sc->nvstore.ctx == NULL) { EPRINTLN("nvme: Unable to allocate RAM"); return (-1); } } else { - snprintf(bident, sizeof(bident), "%d:%d", + snprintf(bident, sizeof(bident), "%u:%u", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); sc->nvstore.ctx = blockif_open(nvl, bident); if (sc->nvstore.ctx == NULL) { EPRINTLN("nvme: Could not open backing file: %s", strerror(errno)); return (-1); } sc->nvstore.type = NVME_STOR_BLOCKIF; sc->nvstore.size = blockif_size(sc->nvstore.ctx); } if (sectsz == 512 || sectsz == 4096 || sectsz == 8192) sc->nvstore.sectsz = sectsz; else if (sc->nvstore.type != NVME_STOR_RAM) sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx); for (sc->nvstore.sectsz_bits = 9; (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz; sc->nvstore.sectsz_bits++); if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES) sc->max_queues = NVME_QUEUES; return (0); } static void pci_nvme_resized(struct blockif_ctxt *bctxt __unused, void *arg, size_t new_size) { struct pci_nvme_softc *sc; struct pci_nvme_blockstore *nvstore; struct nvme_namespace_data *nd; sc = arg; nvstore = &sc->nvstore; nd = &sc->nsdata; nvstore->size = new_size; pci_nvme_init_nsdata_size(nvstore, nd); /* Add changed NSID to list */ sc->ns_log.ns[0] = 1; sc->ns_log.ns[1] = 0; pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_NOTICE, PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED); } static int pci_nvme_init(struct vmctx *ctx __unused, struct pci_devinst *pi, nvlist_t *nvl) { struct pci_nvme_softc *sc; uint32_t pci_membar_sz; int error; error = 0; sc = calloc(1, sizeof(struct pci_nvme_softc)); pi->pi_arg = sc; sc->nsc_pi = pi; error = pci_nvme_parse_config(sc, nvl); if (error < 0) goto done; else error = 0; STAILQ_INIT(&sc->ioreqs_free); sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq)); for (uint32_t i = 
0; i < sc->ioslots; i++) { STAILQ_INSERT_TAIL(&sc->ioreqs_free, &sc->ioreqs[i], link); } pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A); pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D); pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE); pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM); pci_set_cfgdata8(pi, PCIR_PROGIF, PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0); /* * Allocate size of NVMe registers + doorbell space for all queues. * * The specification requires a minimum memory I/O window size of 16K. * The Windows driver will refuse to start a device with a smaller * window. */ pci_membar_sz = sizeof(struct nvme_registers) + 2 * sizeof(uint32_t) * (sc->max_queues + 1); pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN); DPRINTF("nvme membar size: %u", pci_membar_sz); error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz); if (error) { WPRINTF("%s pci alloc mem bar failed", __func__); goto done; } error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR); if (error) { WPRINTF("%s pci add msixcap failed", __func__); goto done; } error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP); if (error) { WPRINTF("%s pci add Express capability failed", __func__); goto done; } pthread_mutex_init(&sc->mtx, NULL); sem_init(&sc->iosemlock, 0, sc->ioslots); blockif_register_resize_callback(sc->nvstore.ctx, pci_nvme_resized, sc); pci_nvme_init_queues(sc, sc->max_queues, sc->max_queues); /* * Controller data depends on Namespace data so initialize Namespace * data first. */ pci_nvme_init_nsdata(sc, &sc->nsdata, 1, &sc->nvstore); pci_nvme_init_ctrldata(sc); pci_nvme_init_logpages(sc); pci_nvme_init_features(sc); pci_nvme_aer_init(sc); pci_nvme_aen_init(sc); pci_nvme_reset(sc); pci_lintr_request(pi); done: return (error); } static int pci_nvme_legacy_config(nvlist_t *nvl, const char *opts) { char *cp, *ram; if (opts == NULL) return (0); if (strncmp(opts, "ram=", 4) == 0) { cp = strchr(opts, ','); if (cp == NULL) { set_config_value_node(nvl, "ram", opts + 4); return (0); } ram = strndup(opts + 4, cp - opts - 4); set_config_value_node(nvl, "ram", ram); free(ram); return (pci_parse_legacy_config(nvl, cp + 1)); } else return (blockif_legacy_config(nvl, opts)); } static const struct pci_devemu pci_de_nvme = { .pe_emu = "nvme", .pe_init = pci_nvme_init, .pe_legacy_config = pci_nvme_legacy_config, .pe_barwrite = pci_nvme_write, .pe_barread = pci_nvme_read }; PCI_EMUL_SET(pci_de_nvme); diff --git a/usr.sbin/bhyve/pci_virtio_block.c b/usr.sbin/bhyve/pci_virtio_block.c index 2b2e6a412a5a..06d2a9671c0b 100644 --- a/usr.sbin/bhyve/pci_virtio_block.c +++ b/usr.sbin/bhyve/pci_virtio_block.c @@ -1,602 +1,602 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * Copyright 2020-2021 Joyent, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "bhyverun.h" #include "config.h" #include "debug.h" #include "pci_emul.h" #include "virtio.h" #include "block_if.h" #define VTBLK_BSIZE 512 #define VTBLK_RINGSZ 128 _Static_assert(VTBLK_RINGSZ <= BLOCKIF_RING_MAX, "Each ring entry must be able to queue a request"); #define VTBLK_S_OK 0 #define VTBLK_S_IOERR 1 #define VTBLK_S_UNSUPP 2 #define VTBLK_BLK_ID_BYTES 20 + 1 /* Capability bits */ #define VTBLK_F_BARRIER (1 << 0) /* Does host support barriers? */ #define VTBLK_F_SIZE_MAX (1 << 1) /* Indicates maximum segment size */ #define VTBLK_F_SEG_MAX (1 << 2) /* Indicates maximum # of segments */ #define VTBLK_F_GEOMETRY (1 << 4) /* Legacy geometry available */ #define VTBLK_F_RO (1 << 5) /* Disk is read-only */ #define VTBLK_F_BLK_SIZE (1 << 6) /* Block size of disk is available*/ #define VTBLK_F_SCSI (1 << 7) /* Supports scsi command passthru */ #define VTBLK_F_FLUSH (1 << 9) /* Writeback mode enabled after reset */ #define VTBLK_F_WCE (1 << 9) /* Legacy alias for FLUSH */ #define VTBLK_F_TOPOLOGY (1 << 10) /* Topology information is available */ #define VTBLK_F_CONFIG_WCE (1 << 11) /* Writeback mode available in config */ #define VTBLK_F_MQ (1 << 12) /* Multi-Queue */ #define VTBLK_F_DISCARD (1 << 13) /* Trim blocks */ #define VTBLK_F_WRITE_ZEROES (1 << 14) /* Write zeros */ /* * Host capabilities */ #define VTBLK_S_HOSTCAPS \ ( VTBLK_F_SEG_MAX | \ VTBLK_F_BLK_SIZE | \ VTBLK_F_FLUSH | \ VTBLK_F_TOPOLOGY | \ VIRTIO_RING_F_INDIRECT_DESC ) /* indirect descriptors */ /* * The current blockif_delete() interface only allows a single delete * request at a time. */ #define VTBLK_MAX_DISCARD_SEG 1 /* * An arbitrary limit to prevent excessive latency due to large * delete requests. 
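 *
 * (Editor's note on the arithmetic below: 16 << 20 is 16 MiB expressed in
 * bytes, and dividing by the 512-byte VTBLK_BSIZE caps a single discard
 * request at 32768 sectors.)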
*/ #define VTBLK_MAX_DISCARD_SECT ((16 << 20) / VTBLK_BSIZE) /* 16 MiB */ /* * Config space "registers" */ struct vtblk_config { uint64_t vbc_capacity; uint32_t vbc_size_max; uint32_t vbc_seg_max; struct { uint16_t cylinders; uint8_t heads; uint8_t sectors; } vbc_geometry; uint32_t vbc_blk_size; struct { uint8_t physical_block_exp; uint8_t alignment_offset; uint16_t min_io_size; uint32_t opt_io_size; } vbc_topology; uint8_t vbc_writeback; uint8_t unused0[1]; uint16_t num_queues; uint32_t max_discard_sectors; uint32_t max_discard_seg; uint32_t discard_sector_alignment; uint32_t max_write_zeroes_sectors; uint32_t max_write_zeroes_seg; uint8_t write_zeroes_may_unmap; uint8_t unused1[3]; } __packed; /* * Fixed-size block header */ struct virtio_blk_hdr { #define VBH_OP_READ 0 #define VBH_OP_WRITE 1 #define VBH_OP_SCSI_CMD 2 #define VBH_OP_SCSI_CMD_OUT 3 #define VBH_OP_FLUSH 4 #define VBH_OP_FLUSH_OUT 5 #define VBH_OP_IDENT 8 #define VBH_OP_DISCARD 11 #define VBH_OP_WRITE_ZEROES 13 #define VBH_FLAG_BARRIER 0x80000000 /* OR'ed into vbh_type */ uint32_t vbh_type; uint32_t vbh_ioprio; uint64_t vbh_sector; } __packed; /* * Debug printf */ static int pci_vtblk_debug; #define DPRINTF(params) if (pci_vtblk_debug) PRINTLN params #define WPRINTF(params) PRINTLN params struct pci_vtblk_ioreq { struct blockif_req io_req; struct pci_vtblk_softc *io_sc; uint8_t *io_status; uint16_t io_idx; }; struct virtio_blk_discard_write_zeroes { uint64_t sector; uint32_t num_sectors; struct { uint32_t unmap:1; uint32_t reserved:31; } flags; }; /* * Per-device softc */ struct pci_vtblk_softc { struct virtio_softc vbsc_vs; pthread_mutex_t vsc_mtx; struct vqueue_info vbsc_vq; struct vtblk_config vbsc_cfg; struct virtio_consts vbsc_consts; struct blockif_ctxt *bc; char vbsc_ident[VTBLK_BLK_ID_BYTES]; struct pci_vtblk_ioreq vbsc_ios[VTBLK_RINGSZ]; }; static void pci_vtblk_reset(void *); static void pci_vtblk_notify(void *, struct vqueue_info *); static int pci_vtblk_cfgread(void *, int, int, uint32_t *); static int pci_vtblk_cfgwrite(void *, int, int, uint32_t); #ifdef BHYVE_SNAPSHOT static void pci_vtblk_pause(void *); static void pci_vtblk_resume(void *); static int pci_vtblk_snapshot(void *, struct vm_snapshot_meta *); #endif static struct virtio_consts vtblk_vi_consts = { .vc_name = "vtblk", .vc_nvq = 1, .vc_cfgsize = sizeof(struct vtblk_config), .vc_reset = pci_vtblk_reset, .vc_qnotify = pci_vtblk_notify, .vc_cfgread = pci_vtblk_cfgread, .vc_cfgwrite = pci_vtblk_cfgwrite, .vc_apply_features = NULL, .vc_hv_caps = VTBLK_S_HOSTCAPS, #ifdef BHYVE_SNAPSHOT .vc_pause = pci_vtblk_pause, .vc_resume = pci_vtblk_resume, .vc_snapshot = pci_vtblk_snapshot, #endif }; static void pci_vtblk_reset(void *vsc) { struct pci_vtblk_softc *sc = vsc; DPRINTF(("vtblk: device reset requested !")); vi_reset_dev(&sc->vbsc_vs); } static void pci_vtblk_done_locked(struct pci_vtblk_ioreq *io, int err) { struct pci_vtblk_softc *sc = io->io_sc; /* convert errno into a virtio block error return */ if (err == EOPNOTSUPP || err == ENOSYS) *io->io_status = VTBLK_S_UNSUPP; else if (err != 0) *io->io_status = VTBLK_S_IOERR; else *io->io_status = VTBLK_S_OK; /* * Return the descriptor back to the host. * We wrote 1 byte (our status) to host. 
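 *
 * (Editor's sketch: the 1 passed to vq_relchain() below is that single
 * status byte, and vq_endchains() then applies the usual virtio
 * interrupt-suppression rules before notifying the guest.)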
*/ vq_relchain(&sc->vbsc_vq, io->io_idx, 1); vq_endchains(&sc->vbsc_vq, 0); } #ifdef BHYVE_SNAPSHOT static void pci_vtblk_pause(void *vsc) { struct pci_vtblk_softc *sc = vsc; DPRINTF(("vtblk: device pause requested !\n")); blockif_pause(sc->bc); } static void pci_vtblk_resume(void *vsc) { struct pci_vtblk_softc *sc = vsc; DPRINTF(("vtblk: device resume requested !\n")); blockif_resume(sc->bc); } static int pci_vtblk_snapshot(void *vsc, struct vm_snapshot_meta *meta) { int ret; struct pci_vtblk_softc *sc = vsc; SNAPSHOT_VAR_OR_LEAVE(sc->vbsc_cfg, meta, ret, done); SNAPSHOT_BUF_OR_LEAVE(sc->vbsc_ident, sizeof(sc->vbsc_ident), meta, ret, done); done: return (ret); } #endif static void pci_vtblk_done(struct blockif_req *br, int err) { struct pci_vtblk_ioreq *io = br->br_param; struct pci_vtblk_softc *sc = io->io_sc; pthread_mutex_lock(&sc->vsc_mtx); pci_vtblk_done_locked(io, err); pthread_mutex_unlock(&sc->vsc_mtx); } static void pci_vtblk_proc(struct pci_vtblk_softc *sc, struct vqueue_info *vq) { struct virtio_blk_hdr *vbh; struct pci_vtblk_ioreq *io; int i, n; int err; ssize_t iolen; int writeop, type; struct vi_req req; struct iovec iov[BLOCKIF_IOV_MAX + 2]; struct virtio_blk_discard_write_zeroes *discard; n = vq_getchain(vq, iov, BLOCKIF_IOV_MAX + 2, &req); /* * The first descriptor will be the read-only fixed header, * and the last is for status (hence +2 above and below). * The remaining iov's are the actual data I/O vectors. * * XXX - note - this fails on crash dump, which does a * VIRTIO_BLK_T_FLUSH with a zero transfer length */ assert(n >= 2 && n <= BLOCKIF_IOV_MAX + 2); io = &sc->vbsc_ios[req.idx]; assert(req.readable != 0); assert(iov[0].iov_len == sizeof(struct virtio_blk_hdr)); vbh = (struct virtio_blk_hdr *)iov[0].iov_base; memcpy(&io->io_req.br_iov, &iov[1], sizeof(struct iovec) * (n - 2)); io->io_req.br_iovcnt = n - 2; io->io_req.br_offset = vbh->vbh_sector * VTBLK_BSIZE; io->io_status = (uint8_t *)iov[--n].iov_base; assert(req.writable != 0); assert(iov[n].iov_len == 1); /* * XXX * The guest should not be setting the BARRIER flag because * we don't advertise the capability. */ type = vbh->vbh_type & ~VBH_FLAG_BARRIER; writeop = (type == VBH_OP_WRITE || type == VBH_OP_DISCARD); /* * - Write op implies read-only descriptor * - Read/ident op implies write-only descriptor * * By taking away either the read-only fixed header or the write-only * status iovec, the following condition should hold true. */ assert(n == (writeop ? req.readable : req.writable)); iolen = 0; for (i = 1; i < n; i++) { iolen += iov[i].iov_len; } io->io_req.br_resid = iolen; DPRINTF(("virtio-block: %s op, %zd bytes, %d segs, offset %ld", writeop ? "write/discard" : "read/ident", iolen, i - 1, io->io_req.br_offset)); switch (type) { case VBH_OP_READ: err = blockif_read(sc->bc, &io->io_req); break; case VBH_OP_WRITE: err = blockif_write(sc->bc, &io->io_req); break; case VBH_OP_DISCARD: /* * We currently only support a single request; if the guest * has submitted a request that doesn't conform to the * requirements, we return an error. */ if (iov[1].iov_len != sizeof (*discard)) { pci_vtblk_done_locked(io, EINVAL); return; } /* The segments to discard are provided rather than data */ discard = (struct virtio_blk_discard_write_zeroes *) iov[1].iov_base; /* * virtio v1.1 5.2.6.2: * The device MUST set the status byte to VIRTIO_BLK_S_UNSUPP * for discard and write zeroes commands if any unknown flag is * set.
Furthermore, the device MUST set the status byte to * VIRTIO_BLK_S_UNSUPP for discard commands if the unmap flag * is set. * * Currently there are no known flags for a DISCARD request. */ if (discard->flags.unmap != 0 || discard->flags.reserved != 0) { pci_vtblk_done_locked(io, ENOTSUP); return; } /* Make sure the request doesn't exceed our size limit */ if (discard->num_sectors > VTBLK_MAX_DISCARD_SECT) { pci_vtblk_done_locked(io, EINVAL); return; } io->io_req.br_offset = discard->sector * VTBLK_BSIZE; io->io_req.br_resid = discard->num_sectors * VTBLK_BSIZE; err = blockif_delete(sc->bc, &io->io_req); break; case VBH_OP_FLUSH: case VBH_OP_FLUSH_OUT: err = blockif_flush(sc->bc, &io->io_req); break; case VBH_OP_IDENT: /* Assume a single buffer */ /* S/n equal to buffer is not zero-terminated. */ memset(iov[1].iov_base, 0, iov[1].iov_len); strncpy(iov[1].iov_base, sc->vbsc_ident, MIN(iov[1].iov_len, sizeof(sc->vbsc_ident))); pci_vtblk_done_locked(io, 0); return; default: pci_vtblk_done_locked(io, EOPNOTSUPP); return; } assert(err == 0); } static void pci_vtblk_notify(void *vsc, struct vqueue_info *vq) { struct pci_vtblk_softc *sc = vsc; while (vq_has_descs(vq)) pci_vtblk_proc(sc, vq); } static void pci_vtblk_resized(struct blockif_ctxt *bctxt __unused, void *arg, size_t new_size) { struct pci_vtblk_softc *sc; sc = arg; sc->vbsc_cfg.vbc_capacity = new_size / VTBLK_BSIZE; /* 512-byte units */ vi_interrupt(&sc->vbsc_vs, VIRTIO_PCI_ISR_CONFIG, sc->vbsc_vs.vs_msix_cfg_idx); } static int pci_vtblk_init(struct vmctx *ctx __unused, struct pci_devinst *pi, nvlist_t *nvl) { - char bident[sizeof("XX:X:X")]; + char bident[sizeof("XXX:XXX")]; struct blockif_ctxt *bctxt; const char *path, *serial; MD5_CTX mdctx; u_char digest[16]; struct pci_vtblk_softc *sc; off_t size; int i, sectsz, sts, sto; /* * The supplied backing file has to exist */ - snprintf(bident, sizeof(bident), "%d:%d", pi->pi_slot, pi->pi_func); + snprintf(bident, sizeof(bident), "%u:%u", pi->pi_slot, pi->pi_func); bctxt = blockif_open(nvl, bident); if (bctxt == NULL) { perror("Could not open backing file"); return (1); } size = blockif_size(bctxt); sectsz = blockif_sectsz(bctxt); blockif_psectsz(bctxt, &sts, &sto); sc = calloc(1, sizeof(struct pci_vtblk_softc)); sc->bc = bctxt; for (i = 0; i < VTBLK_RINGSZ; i++) { struct pci_vtblk_ioreq *io = &sc->vbsc_ios[i]; io->io_req.br_callback = pci_vtblk_done; io->io_req.br_param = io; io->io_sc = sc; io->io_idx = i; } bcopy(&vtblk_vi_consts, &sc->vbsc_consts, sizeof (vtblk_vi_consts)); if (blockif_candelete(sc->bc)) sc->vbsc_consts.vc_hv_caps |= VTBLK_F_DISCARD; pthread_mutex_init(&sc->vsc_mtx, NULL); /* init virtio softc and virtqueues */ vi_softc_linkup(&sc->vbsc_vs, &sc->vbsc_consts, sc, pi, &sc->vbsc_vq); sc->vbsc_vs.vs_mtx = &sc->vsc_mtx; sc->vbsc_vq.vq_qsize = VTBLK_RINGSZ; /* sc->vbsc_vq.vq_notify = we have no per-queue notify */ /* * If an explicit identifier is not given, create an * identifier using parts of the md5 sum of the filename. 
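 *
 * For example (editor's sketch): with no "serial"/"ser" option set, a
 * backing path whose MD5 digest begins aa bb cc dd ee ff yields the
 * identifier "BHYVE-AABB-CCDD-EEFF", per the snprintf() format below.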
*/ bzero(sc->vbsc_ident, VTBLK_BLK_ID_BYTES); if ((serial = get_config_value_node(nvl, "serial")) != NULL || (serial = get_config_value_node(nvl, "ser")) != NULL) { strlcpy(sc->vbsc_ident, serial, VTBLK_BLK_ID_BYTES); } else { path = get_config_value_node(nvl, "path"); MD5Init(&mdctx); MD5Update(&mdctx, path, strlen(path)); MD5Final(digest, &mdctx); snprintf(sc->vbsc_ident, VTBLK_BLK_ID_BYTES, "BHYVE-%02X%02X-%02X%02X-%02X%02X", digest[0], digest[1], digest[2], digest[3], digest[4], digest[5]); } /* setup virtio block config space */ sc->vbsc_cfg.vbc_capacity = size / VTBLK_BSIZE; /* 512-byte units */ sc->vbsc_cfg.vbc_size_max = 0; /* not negotiated */ /* * If Linux is presented with a seg_max greater than the virtio queue * size, it can stumble into situations where it violates its own * invariants and panics. For safety, we keep seg_max clamped, paying * heed to the two extra descriptors needed for the header and status * of a request. */ sc->vbsc_cfg.vbc_seg_max = MIN(VTBLK_RINGSZ - 2, BLOCKIF_IOV_MAX); sc->vbsc_cfg.vbc_geometry.cylinders = 0; /* no geometry */ sc->vbsc_cfg.vbc_geometry.heads = 0; sc->vbsc_cfg.vbc_geometry.sectors = 0; sc->vbsc_cfg.vbc_blk_size = sectsz; sc->vbsc_cfg.vbc_topology.physical_block_exp = (sts > sectsz) ? (ffsll(sts / sectsz) - 1) : 0; sc->vbsc_cfg.vbc_topology.alignment_offset = (sto != 0) ? ((sts - sto) / sectsz) : 0; sc->vbsc_cfg.vbc_topology.min_io_size = 0; sc->vbsc_cfg.vbc_topology.opt_io_size = 0; sc->vbsc_cfg.vbc_writeback = 0; sc->vbsc_cfg.max_discard_sectors = VTBLK_MAX_DISCARD_SECT; sc->vbsc_cfg.max_discard_seg = VTBLK_MAX_DISCARD_SEG; sc->vbsc_cfg.discard_sector_alignment = MAX(sectsz, sts) / VTBLK_BSIZE; /* * Should we move some of this into virtio.c? Could * have the device, class, and subdev_0 as fields in * the virtio constants structure. */ pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_BLOCK); pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR); pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE); pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_ID_BLOCK); pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR); if (vi_intr_init(&sc->vbsc_vs, 1, fbsdrun_virtio_msix())) { blockif_close(sc->bc); free(sc); return (1); } vi_set_io_bar(&sc->vbsc_vs, 0); blockif_register_resize_callback(sc->bc, pci_vtblk_resized, sc); return (0); } static int pci_vtblk_cfgwrite(void *vsc __unused, int offset, int size __unused, uint32_t value __unused) { DPRINTF(("vtblk: write to readonly reg %d", offset)); return (1); } static int pci_vtblk_cfgread(void *vsc, int offset, int size, uint32_t *retval) { struct pci_vtblk_softc *sc = vsc; void *ptr; /* our caller has already verified offset and size */ ptr = (uint8_t *)&sc->vbsc_cfg + offset; memcpy(retval, ptr, size); return (0); } static const struct pci_devemu pci_de_vblk = { .pe_emu = "virtio-blk", .pe_init = pci_vtblk_init, .pe_legacy_config = blockif_legacy_config, .pe_barwrite = vi_pci_write, .pe_barread = vi_pci_read, #ifdef BHYVE_SNAPSHOT .pe_snapshot = vi_pci_snapshot, .pe_pause = vi_pci_pause, .pe_resume = vi_pci_resume, #endif }; PCI_EMUL_SET(pci_de_vblk);