Index: usr.sbin/bhyve/block_if.c =================================================================== --- usr.sbin/bhyve/block_if.c +++ usr.sbin/bhyve/block_if.c @@ -28,6 +28,10 @@ * $FreeBSD$ */ +/* + * Copyright 2020 Joyent, Inc. + */ + #include __FBSDID("$FreeBSD$"); @@ -410,6 +414,8 @@ off_t size, psectsz, psectoff; int extra, fd, i, sectsz; int nocache, sync, ro, candelete, geom, ssopt, pssopt; + int nodelete; + #ifndef WITHOUT_CAPSICUM cap_rights_t rights; cap_ioctl_t cmds[] = { DIOCGFLUSH, DIOCGDELETE }; @@ -422,6 +428,7 @@ nocache = 0; sync = 0; ro = 0; + nodelete = 0; /* * The first element in the optstring is always a pathname. @@ -434,6 +441,8 @@ continue; else if (!strcmp(cp, "nocache")) nocache = 1; + else if (!strcmp(cp, "nodelete")) + nodelete = 1; else if (!strcmp(cp, "sync") || !strcmp(cp, "direct")) sync = 1; else if (!strcmp(cp, "ro")) @@ -500,7 +509,7 @@ ioctl(fd, DIOCGSTRIPEOFFSET, &psectoff); strlcpy(arg.name, "GEOM::candelete", sizeof(arg.name)); arg.len = sizeof(arg.value.i); - if (ioctl(fd, DIOCGATTR, &arg) == 0) + if (nodelete == 0 && ioctl(fd, DIOCGATTR, &arg) == 0) candelete = arg.value.i; if (ioctl(fd, DIOCGPROVIDERNAME, name) == 0) geom = 1; Index: usr.sbin/bhyve/pci_virtio_block.c =================================================================== --- usr.sbin/bhyve/pci_virtio_block.c +++ usr.sbin/bhyve/pci_virtio_block.c @@ -3,7 +3,7 @@ * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. - * Copyright (c) 2019 Joyent, Inc. + * Copyright 2020 Joyent, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -57,32 +57,55 @@ #include "virtio.h" #include "block_if.h" -#define VTBLK_RINGSZ 128 +#define VTBLK_BSIZE 512 +#define VTBLK_RINGSZ 128 _Static_assert(VTBLK_RINGSZ <= BLOCKIF_RING_MAX, "Each ring entry must be able to queue a request"); -#define VTBLK_S_OK 0 -#define VTBLK_S_IOERR 1 +#define VTBLK_S_OK 0 +#define VTBLK_S_IOERR 1 #define VTBLK_S_UNSUPP 2 #define VTBLK_BLK_ID_BYTES 20 + 1 /* Capability bits */ -#define VTBLK_F_SEG_MAX (1 << 2) /* Maximum request segments */ -#define VTBLK_F_BLK_SIZE (1 << 6) /* cfg block size valid */ -#define VTBLK_F_FLUSH (1 << 9) /* Cache flush support */ -#define VTBLK_F_TOPOLOGY (1 << 10) /* Optimal I/O alignment */ +#define VTBLK_F_BARRIER (1 << 0) /* Does host support barriers? */ +#define VTBLK_F_SIZE_MAX (1 << 1) /* Indicates maximum segment size */ +#define VTBLK_F_SEG_MAX (1 << 2) /* Indicates maximum # of segments */ +#define VTBLK_F_GEOMETRY (1 << 4) /* Legacy geometry available */ +#define VTBLK_F_RO (1 << 5) /* Disk is read-only */ +#define VTBLK_F_BLK_SIZE (1 << 6) /* Block size of disk is available*/ +#define VTBLK_F_SCSI (1 << 7) /* Supports scsi command passthru */ +#define VTBLK_F_FLUSH (1 << 9) /* Writeback mode enabled after reset */ +#define VTBLK_F_WCE (1 << 9) /* Legacy alias for FLUSH */ +#define VTBLK_F_TOPOLOGY (1 << 10) /* Topology information is available */ +#define VTBLK_F_CONFIG_WCE (1 << 11) /* Writeback mode available in config */ +#define VTBLK_F_MQ (1 << 12) /* Multi-Queue */ +#define VTBLK_F_DISCARD (1 << 13) /* Trim blocks */ +#define VTBLK_F_WRITE_ZEROES (1 << 14) /* Write zeros */ /* * Host capabilities */ -#define VTBLK_S_HOSTCAPS \ +#define VTBLK_S_HOSTCAPS \ ( VTBLK_F_SEG_MAX | \ VTBLK_F_BLK_SIZE | \ VTBLK_F_FLUSH | \ VTBLK_F_TOPOLOGY | \ VIRTIO_RING_F_INDIRECT_DESC ) /* indirect descriptors */ +/* + * The current blockif_delete() interface only allows a single delete + * request at a time. + */ +#define VTBLK_MAX_DISCARD_SEG 1 + +/* + * An arbitrary limit to prevent excessive latency due to large + * delete requests. + */ +#define VTBLK_MAX_DISCARD_SECT ((16 << 20) / VTBLK_BSIZE) /* 16 MiB */ + /* * Config space "registers" */ @@ -103,6 +126,15 @@ uint32_t opt_io_size; } vbc_topology; uint8_t vbc_writeback; + uint8_t unused0[1]; + uint16_t num_queues; + uint32_t max_discard_sectors; + uint32_t max_discard_seg; + uint32_t discard_sector_alignment; + uint32_t max_write_zeroes_sectors; + uint32_t max_write_zeroes_seg; + uint8_t write_zeroes_may_unmap; + uint8_t unused1[3]; } __packed; /* @@ -111,9 +143,14 @@ struct virtio_blk_hdr { #define VBH_OP_READ 0 #define VBH_OP_WRITE 1 +#define VBH_OP_SCSI_CMD 2 +#define VBH_OP_SCSI_CMD_OUT 3 #define VBH_OP_FLUSH 4 #define VBH_OP_FLUSH_OUT 5 #define VBH_OP_IDENT 8 +#define VBH_OP_DISCARD 11 +#define VBH_OP_WRITE_ZEROES 13 + #define VBH_FLAG_BARRIER 0x80000000 /* OR'ed into vbh_type */ uint32_t vbh_type; uint32_t vbh_ioprio; @@ -124,8 +161,8 @@ * Debug printf */ static int pci_vtblk_debug; -#define DPRINTF(params) if (pci_vtblk_debug) PRINTLN params -#define WPRINTF(params) PRINTLN params +#define DPRINTF(params) if (pci_vtblk_debug) PRINTLN params +#define WPRINTF(params) PRINTLN params struct pci_vtblk_ioreq { struct blockif_req io_req; @@ -134,6 +171,15 @@ uint16_t io_idx; }; +struct virtio_blk_discard_write_zeroes { + uint64_t sector; + uint32_t num_sectors; + struct { + uint32_t unmap:1; + uint32_t reserved:31; + } flags; +}; + /* * Per-device softc */ @@ -142,6 +188,7 @@ pthread_mutex_t vsc_mtx; struct vqueue_info vbsc_vq; struct vtblk_config vbsc_cfg; + struct virtio_consts vbsc_consts; struct blockif_ctxt *bc; char vbsc_ident[VTBLK_BLK_ID_BYTES]; struct pci_vtblk_ioreq vbsc_ios[VTBLK_RINGSZ]; @@ -174,9 +221,8 @@ } static void -pci_vtblk_done(struct blockif_req *br, int err) +pci_vtblk_done_locked(struct pci_vtblk_ioreq *io, int err) { - struct pci_vtblk_ioreq *io = br->br_param; struct pci_vtblk_softc *sc = io->io_sc; /* convert errno into a virtio block error return */ @@ -191,9 +237,18 @@ * Return the descriptor back to the host. * We wrote 1 byte (our status) to host. */ - pthread_mutex_lock(&sc->vsc_mtx); vq_relchain(&sc->vbsc_vq, io->io_idx, 1); vq_endchains(&sc->vbsc_vq, 0); +} + +static void +pci_vtblk_done(struct blockif_req *br, int err) +{ + struct pci_vtblk_ioreq *io = br->br_param; + struct pci_vtblk_softc *sc = io->io_sc; + + pthread_mutex_lock(&sc->vsc_mtx); + pci_vtblk_done_locked(io, err); pthread_mutex_unlock(&sc->vsc_mtx); } @@ -208,6 +263,7 @@ int writeop, type; struct iovec iov[BLOCKIF_IOV_MAX + 2]; uint16_t idx, flags[BLOCKIF_IOV_MAX + 2]; + struct virtio_blk_discard_write_zeroes *discard; n = vq_getchain(vq, &idx, iov, BLOCKIF_IOV_MAX + 2, flags); @@ -224,11 +280,11 @@ io = &sc->vbsc_ios[idx]; assert((flags[0] & VRING_DESC_F_WRITE) == 0); assert(iov[0].iov_len == sizeof(struct virtio_blk_hdr)); - vbh = iov[0].iov_base; + vbh = (struct virtio_blk_hdr *)iov[0].iov_base; memcpy(&io->io_req.br_iov, &iov[1], sizeof(struct iovec) * (n - 2)); io->io_req.br_iovcnt = n - 2; - io->io_req.br_offset = vbh->vbh_sector * DEV_BSIZE; - io->io_status = iov[--n].iov_base; + io->io_req.br_offset = vbh->vbh_sector * VTBLK_BSIZE; + io->io_status = (uint8_t *)iov[--n].iov_base; assert(iov[n].iov_len == 1); assert(flags[n] & VRING_DESC_F_WRITE); @@ -238,7 +294,7 @@ * we don't advertise the capability. */ type = vbh->vbh_type & ~VBH_FLAG_BARRIER; - writeop = (type == VBH_OP_WRITE); + writeop = (type == VBH_OP_WRITE || type == VBH_OP_DISCARD); iolen = 0; for (i = 1; i < n; i++) { @@ -254,7 +310,7 @@ io->io_req.br_resid = iolen; DPRINTF(("virtio-block: %s op, %zd bytes, %d segs, offset %ld", - writeop ? "write" : "read/ident", iolen, i - 1, + writeop ? "write/discard" : "read/ident", iolen, i - 1, io->io_req.br_offset)); switch (type) { @@ -264,6 +320,46 @@ case VBH_OP_WRITE: err = blockif_write(sc->bc, &io->io_req); break; + case VBH_OP_DISCARD: + /* + * We currently only support a single request, if the guest + * has submitted a request that doesn't conform to the + * requirements, we return a error. + */ + if (iov[1].iov_len != sizeof (*discard)) { + pci_vtblk_done_locked(io, EINVAL); + return; + } + + /* The segments to discard are provided rather than data */ + discard = (struct virtio_blk_discard_write_zeroes *) + iov[1].iov_base; + + /* + * virtio v1.1 5.2.6.2: + * The device MUST set the status byte to VIRTIO_BLK_S_UNSUPP + * for discard and write zeroes commands if any unknown flag is + * set. Furthermore, the device MUST set the status byte to + * VIRTIO_BLK_S_UNSUPP for discard commands if the unmap flag + * is set. + * + * Currently there are no known flags for a DISCARD request. + */ + if (discard->flags.unmap != 0 || discard->flags.reserved != 0) { + pci_vtblk_done_locked(io, ENOTSUP); + return; + } + + /* Make sure the request doesn't exceed our size limit */ + if (discard->num_sectors > VTBLK_MAX_DISCARD_SECT) { + pci_vtblk_done_locked(io, EINVAL); + return; + } + + io->io_req.br_offset = discard->sector * VTBLK_BSIZE; + io->io_req.br_resid = discard->num_sectors * VTBLK_BSIZE; + err = blockif_delete(sc->bc, &io->io_req); + break; case VBH_OP_FLUSH: case VBH_OP_FLUSH_OUT: err = blockif_flush(sc->bc, &io->io_req); @@ -274,10 +370,10 @@ memset(iov[1].iov_base, 0, iov[1].iov_len); strncpy(iov[1].iov_base, sc->vbsc_ident, MIN(iov[1].iov_len, sizeof(sc->vbsc_ident))); - pci_vtblk_done(&io->io_req, 0); + pci_vtblk_done_locked(io, 0); return; default: - pci_vtblk_done(&io->io_req, EOPNOTSUPP); + pci_vtblk_done_locked(io, EOPNOTSUPP); return; } assert(err == 0); @@ -332,10 +428,14 @@ io->io_idx = i; } + bcopy(&vtblk_vi_consts, &sc->vbsc_consts, sizeof (vtblk_vi_consts)); + if (blockif_candelete(sc->bc)) + sc->vbsc_consts.vc_hv_caps |= VTBLK_F_DISCARD; + pthread_mutex_init(&sc->vsc_mtx, NULL); /* init virtio softc and virtqueues */ - vi_softc_linkup(&sc->vbsc_vs, &vtblk_vi_consts, sc, pi, &sc->vbsc_vq); + vi_softc_linkup(&sc->vbsc_vs, &sc->vbsc_consts, sc, pi, &sc->vbsc_vq); sc->vbsc_vs.vs_mtx = &sc->vsc_mtx; sc->vbsc_vq.vq_qsize = VTBLK_RINGSZ; @@ -353,7 +453,7 @@ digest[0], digest[1], digest[2], digest[3], digest[4], digest[5]); /* setup virtio block config space */ - sc->vbsc_cfg.vbc_capacity = size / DEV_BSIZE; /* 512-byte units */ + sc->vbsc_cfg.vbc_capacity = size / VTBLK_BSIZE; /* 512-byte units */ sc->vbsc_cfg.vbc_size_max = 0; /* not negotiated */ /* @@ -375,6 +475,9 @@ sc->vbsc_cfg.vbc_topology.min_io_size = 0; sc->vbsc_cfg.vbc_topology.opt_io_size = 0; sc->vbsc_cfg.vbc_writeback = 0; + sc->vbsc_cfg.max_discard_sectors = VTBLK_MAX_DISCARD_SECT; + sc->vbsc_cfg.max_discard_seg = VTBLK_MAX_DISCARD_SEG; + sc->vbsc_cfg.discard_sector_alignment = sectsz / VTBLK_BSIZE; /* * Should we move some of this into virtio.c? Could