Index: releng/12.2/sys/dev/virtio/block/virtio_blk.c =================================================================== --- releng/12.2/sys/dev/virtio/block/virtio_blk.c (revision 365701) +++ releng/12.2/sys/dev/virtio/block/virtio_blk.c (revision 365702) @@ -1,1405 +1,1457 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011, Bryan Venteicher * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Driver for VirtIO block devices. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "virtio_if.h" struct vtblk_request { struct virtio_blk_outhdr vbr_hdr; struct bio *vbr_bp; uint8_t vbr_ack; TAILQ_ENTRY(vtblk_request) vbr_link; }; enum vtblk_cache_mode { VTBLK_CACHE_WRITETHROUGH, VTBLK_CACHE_WRITEBACK, VTBLK_CACHE_MAX }; struct vtblk_softc { device_t vtblk_dev; struct mtx vtblk_mtx; uint64_t vtblk_features; uint32_t vtblk_flags; #define VTBLK_FLAG_INDIRECT 0x0001 #define VTBLK_FLAG_READONLY 0x0002 #define VTBLK_FLAG_DETACH 0x0004 #define VTBLK_FLAG_SUSPEND 0x0008 #define VTBLK_FLAG_BARRIER 0x0010 #define VTBLK_FLAG_WC_CONFIG 0x0020 +#define VTBLK_FLAG_DISCARD 0x0040 struct virtqueue *vtblk_vq; struct sglist *vtblk_sglist; struct disk *vtblk_disk; struct bio_queue_head vtblk_bioq; TAILQ_HEAD(, vtblk_request) vtblk_req_free; TAILQ_HEAD(, vtblk_request) vtblk_req_ready; struct vtblk_request *vtblk_req_ordered; int vtblk_max_nsegs; int vtblk_request_count; enum vtblk_cache_mode vtblk_write_cache; struct bio_queue vtblk_dump_queue; struct vtblk_request vtblk_dump_request; }; static struct virtio_feature_desc vtblk_feature_desc[] = { { VIRTIO_BLK_F_BARRIER, "HostBarrier" }, { VIRTIO_BLK_F_SIZE_MAX, "MaxSegSize" }, { VIRTIO_BLK_F_SEG_MAX, "MaxNumSegs" }, { VIRTIO_BLK_F_GEOMETRY, "DiskGeometry" }, { VIRTIO_BLK_F_RO, "ReadOnly" }, { VIRTIO_BLK_F_BLK_SIZE, "BlockSize" }, { VIRTIO_BLK_F_SCSI, "SCSICmds" }, { VIRTIO_BLK_F_WCE, "WriteCache" }, { VIRTIO_BLK_F_TOPOLOGY, "Topology" }, { VIRTIO_BLK_F_CONFIG_WCE, "ConfigWCE" }, + { VIRTIO_BLK_F_DISCARD, "Discard" }, { 0, NULL } }; static int vtblk_modevent(module_t, int, void *); static int vtblk_probe(device_t); static int vtblk_attach(device_t); static int vtblk_detach(device_t); static int vtblk_suspend(device_t); static int vtblk_resume(device_t); static int vtblk_shutdown(device_t); static int vtblk_config_change(device_t); static int vtblk_open(struct disk *); static int vtblk_close(struct disk *); static int vtblk_ioctl(struct disk *, u_long, void *, int, struct thread *); static int vtblk_dump(void *, void *, vm_offset_t, off_t, size_t); static void vtblk_strategy(struct bio *); static void vtblk_negotiate_features(struct vtblk_softc *); static void vtblk_setup_features(struct vtblk_softc *); static int vtblk_maximum_segments(struct vtblk_softc *, struct virtio_blk_config *); static int vtblk_alloc_virtqueue(struct vtblk_softc *); static void vtblk_resize_disk(struct vtblk_softc *, uint64_t); static void vtblk_alloc_disk(struct vtblk_softc *, struct virtio_blk_config *); static void vtblk_create_disk(struct vtblk_softc *); static int vtblk_request_prealloc(struct vtblk_softc *); static void vtblk_request_free(struct vtblk_softc *); static struct vtblk_request * vtblk_request_dequeue(struct vtblk_softc *); static void vtblk_request_enqueue(struct vtblk_softc *, struct vtblk_request *); static struct vtblk_request * vtblk_request_next_ready(struct vtblk_softc *); static void vtblk_request_requeue_ready(struct vtblk_softc *, struct vtblk_request *); static struct vtblk_request * vtblk_request_next(struct vtblk_softc *); static struct vtblk_request * vtblk_request_bio(struct vtblk_softc *); static int vtblk_request_execute(struct vtblk_softc *, struct vtblk_request *); static int vtblk_request_error(struct vtblk_request *); static void vtblk_queue_completed(struct vtblk_softc *, struct bio_queue *); static void vtblk_done_completed(struct vtblk_softc *, struct bio_queue *); static void vtblk_drain_vq(struct vtblk_softc *); static void vtblk_drain(struct vtblk_softc *); static void vtblk_startio(struct vtblk_softc *); static void vtblk_bio_done(struct vtblk_softc *, struct bio *, int); static void vtblk_read_config(struct vtblk_softc *, struct virtio_blk_config *); static void vtblk_ident(struct vtblk_softc *); static int vtblk_poll_request(struct vtblk_softc *, struct vtblk_request *); static int vtblk_quiesce(struct vtblk_softc *); static void vtblk_vq_intr(void *); static void vtblk_stop(struct vtblk_softc *); static void vtblk_dump_quiesce(struct vtblk_softc *); static int vtblk_dump_write(struct vtblk_softc *, void *, off_t, size_t); static int vtblk_dump_flush(struct vtblk_softc *); static void vtblk_dump_complete(struct vtblk_softc *); static void vtblk_set_write_cache(struct vtblk_softc *, int); static int vtblk_write_cache_enabled(struct vtblk_softc *sc, struct virtio_blk_config *); static int vtblk_write_cache_sysctl(SYSCTL_HANDLER_ARGS); static void vtblk_setup_sysctl(struct vtblk_softc *); static int vtblk_tunable_int(struct vtblk_softc *, const char *, int); /* Tunables. */ static int vtblk_no_ident = 0; TUNABLE_INT("hw.vtblk.no_ident", &vtblk_no_ident); static int vtblk_writecache_mode = -1; TUNABLE_INT("hw.vtblk.writecache_mode", &vtblk_writecache_mode); /* Features desired/implemented by this driver. */ #define VTBLK_FEATURES \ (VIRTIO_BLK_F_BARRIER | \ VIRTIO_BLK_F_SIZE_MAX | \ VIRTIO_BLK_F_SEG_MAX | \ VIRTIO_BLK_F_GEOMETRY | \ VIRTIO_BLK_F_RO | \ VIRTIO_BLK_F_BLK_SIZE | \ VIRTIO_BLK_F_WCE | \ VIRTIO_BLK_F_TOPOLOGY | \ VIRTIO_BLK_F_CONFIG_WCE | \ + VIRTIO_BLK_F_DISCARD | \ VIRTIO_RING_F_INDIRECT_DESC) #define VTBLK_MTX(_sc) &(_sc)->vtblk_mtx #define VTBLK_LOCK_INIT(_sc, _name) \ mtx_init(VTBLK_MTX((_sc)), (_name), \ "VirtIO Block Lock", MTX_DEF) #define VTBLK_LOCK(_sc) mtx_lock(VTBLK_MTX((_sc))) #define VTBLK_UNLOCK(_sc) mtx_unlock(VTBLK_MTX((_sc))) #define VTBLK_LOCK_DESTROY(_sc) mtx_destroy(VTBLK_MTX((_sc))) #define VTBLK_LOCK_ASSERT(_sc) mtx_assert(VTBLK_MTX((_sc)), MA_OWNED) #define VTBLK_LOCK_ASSERT_NOTOWNED(_sc) \ mtx_assert(VTBLK_MTX((_sc)), MA_NOTOWNED) #define VTBLK_DISK_NAME "vtbd" #define VTBLK_QUIESCE_TIMEOUT (30 * hz) /* * Each block request uses at least two segments - one for the header * and one for the status. */ #define VTBLK_MIN_SEGMENTS 2 static device_method_t vtblk_methods[] = { /* Device methods. */ DEVMETHOD(device_probe, vtblk_probe), DEVMETHOD(device_attach, vtblk_attach), DEVMETHOD(device_detach, vtblk_detach), DEVMETHOD(device_suspend, vtblk_suspend), DEVMETHOD(device_resume, vtblk_resume), DEVMETHOD(device_shutdown, vtblk_shutdown), /* VirtIO methods. */ DEVMETHOD(virtio_config_change, vtblk_config_change), DEVMETHOD_END }; static driver_t vtblk_driver = { "vtblk", vtblk_methods, sizeof(struct vtblk_softc) }; static devclass_t vtblk_devclass; DRIVER_MODULE(virtio_blk, virtio_mmio, vtblk_driver, vtblk_devclass, vtblk_modevent, 0); DRIVER_MODULE(virtio_blk, virtio_pci, vtblk_driver, vtblk_devclass, vtblk_modevent, 0); MODULE_VERSION(virtio_blk, 1); MODULE_DEPEND(virtio_blk, virtio, 1, 1, 1); static int vtblk_modevent(module_t mod, int type, void *unused) { int error; error = 0; switch (type) { case MOD_LOAD: case MOD_QUIESCE: case MOD_UNLOAD: case MOD_SHUTDOWN: break; default: error = EOPNOTSUPP; break; } return (error); } static int vtblk_probe(device_t dev) { if (virtio_get_device_type(dev) != VIRTIO_ID_BLOCK) return (ENXIO); device_set_desc(dev, "VirtIO Block Adapter"); return (BUS_PROBE_DEFAULT); } static int vtblk_attach(device_t dev) { struct vtblk_softc *sc; struct virtio_blk_config blkcfg; int error; virtio_set_feature_desc(dev, vtblk_feature_desc); sc = device_get_softc(dev); sc->vtblk_dev = dev; VTBLK_LOCK_INIT(sc, device_get_nameunit(dev)); bioq_init(&sc->vtblk_bioq); TAILQ_INIT(&sc->vtblk_dump_queue); TAILQ_INIT(&sc->vtblk_req_free); TAILQ_INIT(&sc->vtblk_req_ready); vtblk_setup_sysctl(sc); vtblk_setup_features(sc); vtblk_read_config(sc, &blkcfg); /* * With the current sglist(9) implementation, it is not easy * for us to support a maximum segment size as adjacent * segments are coalesced. For now, just make sure it's larger * than the maximum supported transfer size. */ if (virtio_with_feature(dev, VIRTIO_BLK_F_SIZE_MAX)) { if (blkcfg.size_max < MAXPHYS) { error = ENOTSUP; device_printf(dev, "host requires unsupported " "maximum segment size feature\n"); goto fail; } } sc->vtblk_max_nsegs = vtblk_maximum_segments(sc, &blkcfg); if (sc->vtblk_max_nsegs <= VTBLK_MIN_SEGMENTS) { error = EINVAL; device_printf(dev, "fewer than minimum number of segments " "allowed: %d\n", sc->vtblk_max_nsegs); goto fail; } sc->vtblk_sglist = sglist_alloc(sc->vtblk_max_nsegs, M_NOWAIT); if (sc->vtblk_sglist == NULL) { error = ENOMEM; device_printf(dev, "cannot allocate sglist\n"); goto fail; } error = vtblk_alloc_virtqueue(sc); if (error) { device_printf(dev, "cannot allocate virtqueue\n"); goto fail; } error = vtblk_request_prealloc(sc); if (error) { device_printf(dev, "cannot preallocate requests\n"); goto fail; } vtblk_alloc_disk(sc, &blkcfg); error = virtio_setup_intr(dev, INTR_TYPE_BIO | INTR_ENTROPY); if (error) { device_printf(dev, "cannot setup virtqueue interrupt\n"); goto fail; } vtblk_create_disk(sc); virtqueue_enable_intr(sc->vtblk_vq); fail: if (error) vtblk_detach(dev); return (error); } static int vtblk_detach(device_t dev) { struct vtblk_softc *sc; sc = device_get_softc(dev); VTBLK_LOCK(sc); sc->vtblk_flags |= VTBLK_FLAG_DETACH; if (device_is_attached(dev)) vtblk_stop(sc); VTBLK_UNLOCK(sc); vtblk_drain(sc); if (sc->vtblk_disk != NULL) { disk_destroy(sc->vtblk_disk); sc->vtblk_disk = NULL; } if (sc->vtblk_sglist != NULL) { sglist_free(sc->vtblk_sglist); sc->vtblk_sglist = NULL; } VTBLK_LOCK_DESTROY(sc); return (0); } static int vtblk_suspend(device_t dev) { struct vtblk_softc *sc; int error; sc = device_get_softc(dev); VTBLK_LOCK(sc); sc->vtblk_flags |= VTBLK_FLAG_SUSPEND; /* XXX BMV: virtio_stop(), etc needed here? */ error = vtblk_quiesce(sc); if (error) sc->vtblk_flags &= ~VTBLK_FLAG_SUSPEND; VTBLK_UNLOCK(sc); return (error); } static int vtblk_resume(device_t dev) { struct vtblk_softc *sc; sc = device_get_softc(dev); VTBLK_LOCK(sc); /* XXX BMV: virtio_reinit(), etc needed here? */ sc->vtblk_flags &= ~VTBLK_FLAG_SUSPEND; vtblk_startio(sc); VTBLK_UNLOCK(sc); return (0); } static int vtblk_shutdown(device_t dev) { return (0); } static int vtblk_config_change(device_t dev) { struct vtblk_softc *sc; struct virtio_blk_config blkcfg; uint64_t capacity; sc = device_get_softc(dev); vtblk_read_config(sc, &blkcfg); /* Capacity is always in 512-byte units. */ - capacity = blkcfg.capacity * 512; + capacity = blkcfg.capacity * VTBLK_BSIZE; if (sc->vtblk_disk->d_mediasize != capacity) vtblk_resize_disk(sc, capacity); return (0); } static int vtblk_open(struct disk *dp) { struct vtblk_softc *sc; if ((sc = dp->d_drv1) == NULL) return (ENXIO); return (sc->vtblk_flags & VTBLK_FLAG_DETACH ? ENXIO : 0); } static int vtblk_close(struct disk *dp) { struct vtblk_softc *sc; if ((sc = dp->d_drv1) == NULL) return (ENXIO); return (0); } static int vtblk_ioctl(struct disk *dp, u_long cmd, void *addr, int flag, struct thread *td) { struct vtblk_softc *sc; if ((sc = dp->d_drv1) == NULL) return (ENXIO); return (ENOTTY); } static int vtblk_dump(void *arg, void *virtual, vm_offset_t physical, off_t offset, size_t length) { struct disk *dp; struct vtblk_softc *sc; int error; dp = arg; error = 0; if ((sc = dp->d_drv1) == NULL) return (ENXIO); VTBLK_LOCK(sc); vtblk_dump_quiesce(sc); if (length > 0) error = vtblk_dump_write(sc, virtual, offset, length); if (error || (virtual == NULL && offset == 0)) vtblk_dump_complete(sc); VTBLK_UNLOCK(sc); return (error); } static void vtblk_strategy(struct bio *bp) { struct vtblk_softc *sc; if ((sc = bp->bio_disk->d_drv1) == NULL) { vtblk_bio_done(NULL, bp, EINVAL); return; } /* * Fail any write if RO. Unfortunately, there does not seem to * be a better way to report our readonly'ness to GEOM above. */ if (sc->vtblk_flags & VTBLK_FLAG_READONLY && - (bp->bio_cmd == BIO_WRITE || bp->bio_cmd == BIO_FLUSH)) { + (bp->bio_cmd == BIO_WRITE || bp->bio_cmd == BIO_FLUSH || + bp->bio_cmd == BIO_DELETE)) { vtblk_bio_done(sc, bp, EROFS); return; } + if ((bp->bio_cmd != BIO_READ) && (bp->bio_cmd != BIO_WRITE) && + (bp->bio_cmd != BIO_FLUSH) && (bp->bio_cmd != BIO_DELETE)) { + vtblk_bio_done(sc, bp, EOPNOTSUPP); + return; + } + VTBLK_LOCK(sc); if (sc->vtblk_flags & VTBLK_FLAG_DETACH) { VTBLK_UNLOCK(sc); vtblk_bio_done(sc, bp, ENXIO); return; } + if ((bp->bio_cmd == BIO_DELETE) && + !(sc->vtblk_flags & VTBLK_FLAG_DISCARD)) { + VTBLK_UNLOCK(sc); + vtblk_bio_done(sc, bp, EOPNOTSUPP); + return; + } + bioq_insert_tail(&sc->vtblk_bioq, bp); vtblk_startio(sc); VTBLK_UNLOCK(sc); } static void vtblk_negotiate_features(struct vtblk_softc *sc) { device_t dev; uint64_t features; dev = sc->vtblk_dev; features = VTBLK_FEATURES; sc->vtblk_features = virtio_negotiate_features(dev, features); } static void vtblk_setup_features(struct vtblk_softc *sc) { device_t dev; dev = sc->vtblk_dev; vtblk_negotiate_features(sc); if (virtio_with_feature(dev, VIRTIO_RING_F_INDIRECT_DESC)) sc->vtblk_flags |= VTBLK_FLAG_INDIRECT; if (virtio_with_feature(dev, VIRTIO_BLK_F_RO)) sc->vtblk_flags |= VTBLK_FLAG_READONLY; if (virtio_with_feature(dev, VIRTIO_BLK_F_BARRIER)) sc->vtblk_flags |= VTBLK_FLAG_BARRIER; if (virtio_with_feature(dev, VIRTIO_BLK_F_CONFIG_WCE)) sc->vtblk_flags |= VTBLK_FLAG_WC_CONFIG; + if (virtio_with_feature(dev, VIRTIO_BLK_F_DISCARD)) + sc->vtblk_flags |= VTBLK_FLAG_DISCARD; } static int vtblk_maximum_segments(struct vtblk_softc *sc, struct virtio_blk_config *blkcfg) { device_t dev; int nsegs; dev = sc->vtblk_dev; nsegs = VTBLK_MIN_SEGMENTS; if (virtio_with_feature(dev, VIRTIO_BLK_F_SEG_MAX)) { nsegs += MIN(blkcfg->seg_max, MAXPHYS / PAGE_SIZE + 1); if (sc->vtblk_flags & VTBLK_FLAG_INDIRECT) nsegs = MIN(nsegs, VIRTIO_MAX_INDIRECT); } else nsegs += 1; return (nsegs); } static int vtblk_alloc_virtqueue(struct vtblk_softc *sc) { device_t dev; struct vq_alloc_info vq_info; dev = sc->vtblk_dev; VQ_ALLOC_INFO_INIT(&vq_info, sc->vtblk_max_nsegs, vtblk_vq_intr, sc, &sc->vtblk_vq, "%s request", device_get_nameunit(dev)); return (virtio_alloc_virtqueues(dev, 0, 1, &vq_info)); } static void vtblk_resize_disk(struct vtblk_softc *sc, uint64_t new_capacity) { device_t dev; struct disk *dp; int error; dev = sc->vtblk_dev; dp = sc->vtblk_disk; dp->d_mediasize = new_capacity; if (bootverbose) { device_printf(dev, "resized to %juMB (%ju %u byte sectors)\n", (uintmax_t) dp->d_mediasize >> 20, (uintmax_t) dp->d_mediasize / dp->d_sectorsize, dp->d_sectorsize); } error = disk_resize(dp, M_NOWAIT); if (error) { device_printf(dev, "disk_resize(9) failed, error: %d\n", error); } } static void vtblk_alloc_disk(struct vtblk_softc *sc, struct virtio_blk_config *blkcfg) { device_t dev; struct disk *dp; dev = sc->vtblk_dev; sc->vtblk_disk = dp = disk_alloc(); dp->d_open = vtblk_open; dp->d_close = vtblk_close; dp->d_ioctl = vtblk_ioctl; dp->d_strategy = vtblk_strategy; dp->d_name = VTBLK_DISK_NAME; dp->d_unit = device_get_unit(dev); dp->d_drv1 = sc; dp->d_flags = DISKFLAG_CANFLUSHCACHE | DISKFLAG_UNMAPPED_BIO | DISKFLAG_DIRECT_COMPLETION; dp->d_hba_vendor = virtio_get_vendor(dev); dp->d_hba_device = virtio_get_device(dev); dp->d_hba_subvendor = virtio_get_subvendor(dev); dp->d_hba_subdevice = virtio_get_subdevice(dev); if ((sc->vtblk_flags & VTBLK_FLAG_READONLY) == 0) dp->d_dump = vtblk_dump; /* Capacity is always in 512-byte units. */ - dp->d_mediasize = blkcfg->capacity * 512; + dp->d_mediasize = blkcfg->capacity * VTBLK_BSIZE; if (virtio_with_feature(dev, VIRTIO_BLK_F_BLK_SIZE)) dp->d_sectorsize = blkcfg->blk_size; else - dp->d_sectorsize = 512; + dp->d_sectorsize = VTBLK_BSIZE; /* * The VirtIO maximum I/O size is given in terms of segments. * However, FreeBSD limits I/O size by logical buffer size, not * by physically contiguous pages. Therefore, we have to assume * no pages are contiguous. This may impose an artificially low * maximum I/O size. But in practice, since QEMU advertises 128 * segments, this gives us a maximum IO size of 125 * PAGE_SIZE, * which is typically greater than MAXPHYS. Eventually we should * just advertise MAXPHYS and split buffers that are too big. * * Note we must subtract one additional segment in case of non * page aligned buffers. */ dp->d_maxsize = (sc->vtblk_max_nsegs - VTBLK_MIN_SEGMENTS - 1) * PAGE_SIZE; if (dp->d_maxsize < PAGE_SIZE) dp->d_maxsize = PAGE_SIZE; /* XXX */ if (virtio_with_feature(dev, VIRTIO_BLK_F_GEOMETRY)) { dp->d_fwsectors = blkcfg->geometry.sectors; dp->d_fwheads = blkcfg->geometry.heads; } if (virtio_with_feature(dev, VIRTIO_BLK_F_TOPOLOGY) && blkcfg->topology.physical_block_exp > 0) { dp->d_stripesize = dp->d_sectorsize * (1 << blkcfg->topology.physical_block_exp); dp->d_stripeoffset = (dp->d_stripesize - blkcfg->topology.alignment_offset * dp->d_sectorsize) % dp->d_stripesize; } + if (virtio_with_feature(dev, VIRTIO_BLK_F_DISCARD)) { + dp->d_flags |= DISKFLAG_CANDELETE; + dp->d_delmaxsize = blkcfg->max_discard_sectors * VTBLK_BSIZE; + } + if (vtblk_write_cache_enabled(sc, blkcfg) != 0) sc->vtblk_write_cache = VTBLK_CACHE_WRITEBACK; else sc->vtblk_write_cache = VTBLK_CACHE_WRITETHROUGH; } static void vtblk_create_disk(struct vtblk_softc *sc) { struct disk *dp; dp = sc->vtblk_disk; vtblk_ident(sc); device_printf(sc->vtblk_dev, "%juMB (%ju %u byte sectors)\n", (uintmax_t) dp->d_mediasize >> 20, (uintmax_t) dp->d_mediasize / dp->d_sectorsize, dp->d_sectorsize); disk_create(dp, DISK_VERSION); } static int vtblk_request_prealloc(struct vtblk_softc *sc) { struct vtblk_request *req; int i, nreqs; nreqs = virtqueue_size(sc->vtblk_vq); /* * Preallocate sufficient requests to keep the virtqueue full. Each * request consumes VTBLK_MIN_SEGMENTS or more descriptors so reduce * the number allocated when indirect descriptors are not available. */ if ((sc->vtblk_flags & VTBLK_FLAG_INDIRECT) == 0) nreqs /= VTBLK_MIN_SEGMENTS; for (i = 0; i < nreqs; i++) { req = malloc(sizeof(struct vtblk_request), M_DEVBUF, M_NOWAIT); if (req == NULL) return (ENOMEM); MPASS(sglist_count(&req->vbr_hdr, sizeof(req->vbr_hdr)) == 1); MPASS(sglist_count(&req->vbr_ack, sizeof(req->vbr_ack)) == 1); sc->vtblk_request_count++; vtblk_request_enqueue(sc, req); } return (0); } static void vtblk_request_free(struct vtblk_softc *sc) { struct vtblk_request *req; MPASS(TAILQ_EMPTY(&sc->vtblk_req_ready)); while ((req = vtblk_request_dequeue(sc)) != NULL) { sc->vtblk_request_count--; free(req, M_DEVBUF); } KASSERT(sc->vtblk_request_count == 0, ("%s: leaked %d requests", __func__, sc->vtblk_request_count)); } static struct vtblk_request * vtblk_request_dequeue(struct vtblk_softc *sc) { struct vtblk_request *req; req = TAILQ_FIRST(&sc->vtblk_req_free); if (req != NULL) { TAILQ_REMOVE(&sc->vtblk_req_free, req, vbr_link); bzero(req, sizeof(struct vtblk_request)); } return (req); } static void vtblk_request_enqueue(struct vtblk_softc *sc, struct vtblk_request *req) { TAILQ_INSERT_HEAD(&sc->vtblk_req_free, req, vbr_link); } static struct vtblk_request * vtblk_request_next_ready(struct vtblk_softc *sc) { struct vtblk_request *req; req = TAILQ_FIRST(&sc->vtblk_req_ready); if (req != NULL) TAILQ_REMOVE(&sc->vtblk_req_ready, req, vbr_link); return (req); } static void vtblk_request_requeue_ready(struct vtblk_softc *sc, struct vtblk_request *req) { /* NOTE: Currently, there will be at most one request in the queue. */ TAILQ_INSERT_HEAD(&sc->vtblk_req_ready, req, vbr_link); } static struct vtblk_request * vtblk_request_next(struct vtblk_softc *sc) { struct vtblk_request *req; req = vtblk_request_next_ready(sc); if (req != NULL) return (req); return (vtblk_request_bio(sc)); } static struct vtblk_request * vtblk_request_bio(struct vtblk_softc *sc) { struct bio_queue_head *bioq; struct vtblk_request *req; struct bio *bp; bioq = &sc->vtblk_bioq; if (bioq_first(bioq) == NULL) return (NULL); req = vtblk_request_dequeue(sc); if (req == NULL) return (NULL); bp = bioq_takefirst(bioq); req->vbr_bp = bp; req->vbr_ack = -1; req->vbr_hdr.ioprio = 1; switch (bp->bio_cmd) { case BIO_FLUSH: req->vbr_hdr.type = VIRTIO_BLK_T_FLUSH; break; case BIO_READ: req->vbr_hdr.type = VIRTIO_BLK_T_IN; - req->vbr_hdr.sector = bp->bio_offset / 512; + req->vbr_hdr.sector = bp->bio_offset / VTBLK_BSIZE; break; case BIO_WRITE: req->vbr_hdr.type = VIRTIO_BLK_T_OUT; - req->vbr_hdr.sector = bp->bio_offset / 512; + req->vbr_hdr.sector = bp->bio_offset / VTBLK_BSIZE; break; + case BIO_DELETE: + req->vbr_hdr.type = VIRTIO_BLK_T_DISCARD; + req->vbr_hdr.sector = bp->bio_offset / VTBLK_BSIZE; + break; default: panic("%s: bio with unhandled cmd: %d", __func__, bp->bio_cmd); } if (bp->bio_flags & BIO_ORDERED) req->vbr_hdr.type |= VIRTIO_BLK_T_BARRIER; return (req); } static int vtblk_request_execute(struct vtblk_softc *sc, struct vtblk_request *req) { struct virtqueue *vq; struct sglist *sg; struct bio *bp; int ordered, readable, writable, error; vq = sc->vtblk_vq; sg = sc->vtblk_sglist; bp = req->vbr_bp; ordered = 0; writable = 0; /* * Some hosts (such as bhyve) do not implement the barrier feature, * so we emulate it in the driver by allowing the barrier request * to be the only one in flight. */ if ((sc->vtblk_flags & VTBLK_FLAG_BARRIER) == 0) { if (sc->vtblk_req_ordered != NULL) return (EBUSY); if (bp->bio_flags & BIO_ORDERED) { if (!virtqueue_empty(vq)) return (EBUSY); ordered = 1; req->vbr_hdr.type &= ~VIRTIO_BLK_T_BARRIER; } } sglist_reset(sg); sglist_append(sg, &req->vbr_hdr, sizeof(struct virtio_blk_outhdr)); if (bp->bio_cmd == BIO_READ || bp->bio_cmd == BIO_WRITE) { error = sglist_append_bio(sg, bp); if (error || sg->sg_nseg == sg->sg_maxseg) { panic("%s: bio %p data buffer too big %d", __func__, bp, error); } /* BIO_READ means the host writes into our buffer. */ if (bp->bio_cmd == BIO_READ) writable = sg->sg_nseg - 1; + } else if (bp->bio_cmd == BIO_DELETE) { + struct virtio_blk_discard_write_zeroes *discard; + + discard = malloc(sizeof(*discard), M_DEVBUF, M_NOWAIT | M_ZERO); + if (discard == NULL) + return (ENOMEM); + discard->sector = bp->bio_offset / VTBLK_BSIZE; + discard->num_sectors = bp->bio_bcount / VTBLK_BSIZE; + bp->bio_driver1 = discard; + error = sglist_append(sg, discard, sizeof(*discard)); + if (error || sg->sg_nseg == sg->sg_maxseg) { + panic("%s: bio %p data buffer too big %d", + __func__, bp, error); + } } writable++; sglist_append(sg, &req->vbr_ack, sizeof(uint8_t)); readable = sg->sg_nseg - writable; error = virtqueue_enqueue(vq, req, sg, readable, writable); if (error == 0 && ordered) sc->vtblk_req_ordered = req; return (error); } static int vtblk_request_error(struct vtblk_request *req) { int error; switch (req->vbr_ack) { case VIRTIO_BLK_S_OK: error = 0; break; case VIRTIO_BLK_S_UNSUPP: error = ENOTSUP; break; default: error = EIO; break; } return (error); } static void vtblk_queue_completed(struct vtblk_softc *sc, struct bio_queue *queue) { struct vtblk_request *req; struct bio *bp; while ((req = virtqueue_dequeue(sc->vtblk_vq, NULL)) != NULL) { if (sc->vtblk_req_ordered != NULL) { MPASS(sc->vtblk_req_ordered == req); sc->vtblk_req_ordered = NULL; } bp = req->vbr_bp; bp->bio_error = vtblk_request_error(req); TAILQ_INSERT_TAIL(queue, bp, bio_queue); vtblk_request_enqueue(sc, req); } } static void vtblk_done_completed(struct vtblk_softc *sc, struct bio_queue *queue) { struct bio *bp, *tmp; TAILQ_FOREACH_SAFE(bp, queue, bio_queue, tmp) { if (bp->bio_error != 0) disk_err(bp, "hard error", -1, 1); vtblk_bio_done(sc, bp, bp->bio_error); } } static void vtblk_drain_vq(struct vtblk_softc *sc) { struct virtqueue *vq; struct vtblk_request *req; int last; vq = sc->vtblk_vq; last = 0; while ((req = virtqueue_drain(vq, &last)) != NULL) { vtblk_bio_done(sc, req->vbr_bp, ENXIO); vtblk_request_enqueue(sc, req); } sc->vtblk_req_ordered = NULL; KASSERT(virtqueue_empty(vq), ("virtqueue not empty")); } static void vtblk_drain(struct vtblk_softc *sc) { struct bio_queue queue; struct bio_queue_head *bioq; struct vtblk_request *req; struct bio *bp; bioq = &sc->vtblk_bioq; TAILQ_INIT(&queue); if (sc->vtblk_vq != NULL) { vtblk_queue_completed(sc, &queue); vtblk_done_completed(sc, &queue); vtblk_drain_vq(sc); } while ((req = vtblk_request_next_ready(sc)) != NULL) { vtblk_bio_done(sc, req->vbr_bp, ENXIO); vtblk_request_enqueue(sc, req); } while (bioq_first(bioq) != NULL) { bp = bioq_takefirst(bioq); vtblk_bio_done(sc, bp, ENXIO); } vtblk_request_free(sc); } static void vtblk_startio(struct vtblk_softc *sc) { struct virtqueue *vq; struct vtblk_request *req; int enq; VTBLK_LOCK_ASSERT(sc); vq = sc->vtblk_vq; enq = 0; if (sc->vtblk_flags & VTBLK_FLAG_SUSPEND) return; while (!virtqueue_full(vq)) { req = vtblk_request_next(sc); if (req == NULL) break; if (vtblk_request_execute(sc, req) != 0) { vtblk_request_requeue_ready(sc, req); break; } enq++; } if (enq > 0) virtqueue_notify(vq); } static void vtblk_bio_done(struct vtblk_softc *sc, struct bio *bp, int error) { /* Because of GEOM direct dispatch, we cannot hold any locks. */ if (sc != NULL) VTBLK_LOCK_ASSERT_NOTOWNED(sc); if (error) { bp->bio_resid = bp->bio_bcount; bp->bio_error = error; bp->bio_flags |= BIO_ERROR; } + if (bp->bio_driver1 != NULL) { + free(bp->bio_driver1, M_DEVBUF); + bp->bio_driver1 = NULL; + } + biodone(bp); } #define VTBLK_GET_CONFIG(_dev, _feature, _field, _cfg) \ if (virtio_with_feature(_dev, _feature)) { \ virtio_read_device_config(_dev, \ offsetof(struct virtio_blk_config, _field), \ &(_cfg)->_field, sizeof((_cfg)->_field)); \ } static void vtblk_read_config(struct vtblk_softc *sc, struct virtio_blk_config *blkcfg) { device_t dev; dev = sc->vtblk_dev; bzero(blkcfg, sizeof(struct virtio_blk_config)); /* The capacity is always available. */ virtio_read_device_config(dev, offsetof(struct virtio_blk_config, capacity), &blkcfg->capacity, sizeof(blkcfg->capacity)); /* Read the configuration if the feature was negotiated. */ VTBLK_GET_CONFIG(dev, VIRTIO_BLK_F_SIZE_MAX, size_max, blkcfg); VTBLK_GET_CONFIG(dev, VIRTIO_BLK_F_SEG_MAX, seg_max, blkcfg); VTBLK_GET_CONFIG(dev, VIRTIO_BLK_F_GEOMETRY, geometry, blkcfg); VTBLK_GET_CONFIG(dev, VIRTIO_BLK_F_BLK_SIZE, blk_size, blkcfg); VTBLK_GET_CONFIG(dev, VIRTIO_BLK_F_TOPOLOGY, topology, blkcfg); - VTBLK_GET_CONFIG(dev, VIRTIO_BLK_F_CONFIG_WCE, writeback, blkcfg); + VTBLK_GET_CONFIG(dev, VIRTIO_BLK_F_CONFIG_WCE, wce, blkcfg); + VTBLK_GET_CONFIG(dev, VIRTIO_BLK_F_DISCARD, max_discard_sectors, + blkcfg); + VTBLK_GET_CONFIG(dev, VIRTIO_BLK_F_DISCARD, max_discard_seg, blkcfg); + VTBLK_GET_CONFIG(dev, VIRTIO_BLK_F_DISCARD, discard_sector_alignment, + blkcfg); } #undef VTBLK_GET_CONFIG static void vtblk_ident(struct vtblk_softc *sc) { struct bio buf; struct disk *dp; struct vtblk_request *req; int len, error; dp = sc->vtblk_disk; len = MIN(VIRTIO_BLK_ID_BYTES, DISK_IDENT_SIZE); if (vtblk_tunable_int(sc, "no_ident", vtblk_no_ident) != 0) return; req = vtblk_request_dequeue(sc); if (req == NULL) return; req->vbr_ack = -1; req->vbr_hdr.type = VIRTIO_BLK_T_GET_ID; req->vbr_hdr.ioprio = 1; req->vbr_hdr.sector = 0; req->vbr_bp = &buf; g_reset_bio(&buf); buf.bio_cmd = BIO_READ; buf.bio_data = dp->d_ident; buf.bio_bcount = len; VTBLK_LOCK(sc); error = vtblk_poll_request(sc, req); VTBLK_UNLOCK(sc); vtblk_request_enqueue(sc, req); if (error) { device_printf(sc->vtblk_dev, "error getting device identifier: %d\n", error); } } static int vtblk_poll_request(struct vtblk_softc *sc, struct vtblk_request *req) { struct virtqueue *vq; int error; vq = sc->vtblk_vq; if (!virtqueue_empty(vq)) return (EBUSY); error = vtblk_request_execute(sc, req); if (error) return (error); virtqueue_notify(vq); virtqueue_poll(vq, NULL); error = vtblk_request_error(req); if (error && bootverbose) { device_printf(sc->vtblk_dev, "%s: IO error: %d\n", __func__, error); } return (error); } static int vtblk_quiesce(struct vtblk_softc *sc) { int error; VTBLK_LOCK_ASSERT(sc); error = 0; while (!virtqueue_empty(sc->vtblk_vq)) { if (mtx_sleep(&sc->vtblk_vq, VTBLK_MTX(sc), PRIBIO, "vtblkq", VTBLK_QUIESCE_TIMEOUT) == EWOULDBLOCK) { error = EBUSY; break; } } return (error); } static void vtblk_vq_intr(void *xsc) { struct vtblk_softc *sc; struct virtqueue *vq; struct bio_queue queue; sc = xsc; vq = sc->vtblk_vq; TAILQ_INIT(&queue); VTBLK_LOCK(sc); again: if (sc->vtblk_flags & VTBLK_FLAG_DETACH) goto out; vtblk_queue_completed(sc, &queue); vtblk_startio(sc); if (virtqueue_enable_intr(vq) != 0) { virtqueue_disable_intr(vq); goto again; } if (sc->vtblk_flags & VTBLK_FLAG_SUSPEND) wakeup(&sc->vtblk_vq); out: VTBLK_UNLOCK(sc); vtblk_done_completed(sc, &queue); } static void vtblk_stop(struct vtblk_softc *sc) { virtqueue_disable_intr(sc->vtblk_vq); virtio_stop(sc->vtblk_dev); } static void vtblk_dump_quiesce(struct vtblk_softc *sc) { /* * Spin here until all the requests in-flight at the time of the * dump are completed and queued. The queued requests will be * biodone'd once the dump is finished. */ while (!virtqueue_empty(sc->vtblk_vq)) vtblk_queue_completed(sc, &sc->vtblk_dump_queue); } static int vtblk_dump_write(struct vtblk_softc *sc, void *virtual, off_t offset, size_t length) { struct bio buf; struct vtblk_request *req; req = &sc->vtblk_dump_request; req->vbr_ack = -1; req->vbr_hdr.type = VIRTIO_BLK_T_OUT; req->vbr_hdr.ioprio = 1; - req->vbr_hdr.sector = offset / 512; + req->vbr_hdr.sector = offset / VTBLK_BSIZE; req->vbr_bp = &buf; g_reset_bio(&buf); buf.bio_cmd = BIO_WRITE; buf.bio_data = virtual; buf.bio_bcount = length; return (vtblk_poll_request(sc, req)); } static int vtblk_dump_flush(struct vtblk_softc *sc) { struct bio buf; struct vtblk_request *req; req = &sc->vtblk_dump_request; req->vbr_ack = -1; req->vbr_hdr.type = VIRTIO_BLK_T_FLUSH; req->vbr_hdr.ioprio = 1; req->vbr_hdr.sector = 0; req->vbr_bp = &buf; g_reset_bio(&buf); buf.bio_cmd = BIO_FLUSH; return (vtblk_poll_request(sc, req)); } static void vtblk_dump_complete(struct vtblk_softc *sc) { vtblk_dump_flush(sc); VTBLK_UNLOCK(sc); vtblk_done_completed(sc, &sc->vtblk_dump_queue); VTBLK_LOCK(sc); } static void vtblk_set_write_cache(struct vtblk_softc *sc, int wc) { /* Set either writeback (1) or writethrough (0) mode. */ virtio_write_dev_config_1(sc->vtblk_dev, - offsetof(struct virtio_blk_config, writeback), wc); + offsetof(struct virtio_blk_config, wce), wc); } static int vtblk_write_cache_enabled(struct vtblk_softc *sc, struct virtio_blk_config *blkcfg) { int wc; if (sc->vtblk_flags & VTBLK_FLAG_WC_CONFIG) { wc = vtblk_tunable_int(sc, "writecache_mode", vtblk_writecache_mode); if (wc >= 0 && wc < VTBLK_CACHE_MAX) vtblk_set_write_cache(sc, wc); else - wc = blkcfg->writeback; + wc = blkcfg->wce; } else wc = virtio_with_feature(sc->vtblk_dev, VIRTIO_BLK_F_WCE); return (wc); } static int vtblk_write_cache_sysctl(SYSCTL_HANDLER_ARGS) { struct vtblk_softc *sc; int wc, error; sc = oidp->oid_arg1; wc = sc->vtblk_write_cache; error = sysctl_handle_int(oidp, &wc, 0, req); if (error || req->newptr == NULL) return (error); if ((sc->vtblk_flags & VTBLK_FLAG_WC_CONFIG) == 0) return (EPERM); if (wc < 0 || wc >= VTBLK_CACHE_MAX) return (EINVAL); VTBLK_LOCK(sc); sc->vtblk_write_cache = wc; vtblk_set_write_cache(sc, sc->vtblk_write_cache); VTBLK_UNLOCK(sc); return (0); } static void vtblk_setup_sysctl(struct vtblk_softc *sc) { device_t dev; struct sysctl_ctx_list *ctx; struct sysctl_oid *tree; struct sysctl_oid_list *child; dev = sc->vtblk_dev; ctx = device_get_sysctl_ctx(dev); tree = device_get_sysctl_tree(dev); child = SYSCTL_CHILDREN(tree); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "writecache_mode", CTLTYPE_INT | CTLFLAG_RW, sc, 0, vtblk_write_cache_sysctl, "I", "Write cache mode (writethrough (0) or writeback (1))"); } static int vtblk_tunable_int(struct vtblk_softc *sc, const char *knob, int def) { char path[64]; snprintf(path, sizeof(path), "hw.vtblk.%d.%s", device_get_unit(sc->vtblk_dev), knob); TUNABLE_INT_FETCH(path, &def); return (def); } Index: releng/12.2/sys/dev/virtio/block/virtio_blk.h =================================================================== --- releng/12.2/sys/dev/virtio/block/virtio_blk.h (revision 365701) +++ releng/12.2/sys/dev/virtio/block/virtio_blk.h (revision 365702) @@ -1,132 +1,173 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * This header is BSD licensed so anyone can use the definitions to implement * compatible drivers/servers. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of IBM nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL IBM OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _VIRTIO_BLK_H #define _VIRTIO_BLK_H +#define VTBLK_BSIZE 512 + /* Feature bits */ -#define VIRTIO_BLK_F_BARRIER 0x0001 /* Does host support barriers? */ -#define VIRTIO_BLK_F_SIZE_MAX 0x0002 /* Indicates maximum segment size */ -#define VIRTIO_BLK_F_SEG_MAX 0x0004 /* Indicates maximum # of segments */ -#define VIRTIO_BLK_F_GEOMETRY 0x0010 /* Legacy geometry available */ -#define VIRTIO_BLK_F_RO 0x0020 /* Disk is read-only */ -#define VIRTIO_BLK_F_BLK_SIZE 0x0040 /* Block size of disk is available*/ -#define VIRTIO_BLK_F_SCSI 0x0080 /* Supports scsi command passthru */ -#define VIRTIO_BLK_F_WCE 0x0200 /* Writeback mode enabled after reset */ -#define VIRTIO_BLK_F_TOPOLOGY 0x0400 /* Topology information is available */ -#define VIRTIO_BLK_F_CONFIG_WCE 0x0800 /* Writeback mode available in config */ -#define VIRTIO_BLK_ID_BYTES 20 /* ID string length */ +#define VIRTIO_BLK_F_BARRIER 0x0001 /* Does host support barriers? */ +#define VIRTIO_BLK_F_SIZE_MAX 0x0002 /* Indicates maximum segment size */ +#define VIRTIO_BLK_F_SEG_MAX 0x0004 /* Indicates maximum # of segments */ +#define VIRTIO_BLK_F_GEOMETRY 0x0010 /* Legacy geometry available */ +#define VIRTIO_BLK_F_RO 0x0020 /* Disk is read-only */ +#define VIRTIO_BLK_F_BLK_SIZE 0x0040 /* Block size of disk is available*/ +#define VIRTIO_BLK_F_SCSI 0x0080 /* Supports scsi command passthru */ +#define VIRTIO_BLK_F_FLUSH 0x0200 /* Flush command supported */ +#define VIRTIO_BLK_F_WCE 0x0200 /* Legacy alias for FLUSH */ +#define VIRTIO_BLK_F_TOPOLOGY 0x0400 /* Topology information is available */ +#define VIRTIO_BLK_F_CONFIG_WCE 0x0800 /* Writeback mode available in config */ +#define VIRTIO_BLK_F_MQ 0x1000 /* Support more than one vq */ +#define VIRTIO_BLK_F_DISCARD 0x2000 /* Trim blocks */ +#define VIRTIO_BLK_F_WRITE_ZEROES 0x4000 /* Write zeros */ +#define VIRTIO_BLK_ID_BYTES 20 /* ID string length */ + struct virtio_blk_config { /* The capacity (in 512-byte sectors). */ uint64_t capacity; /* The maximum segment size (if VIRTIO_BLK_F_SIZE_MAX) */ uint32_t size_max; /* The maximum number of segments (if VIRTIO_BLK_F_SEG_MAX) */ uint32_t seg_max; /* Geometry of the device (if VIRTIO_BLK_F_GEOMETRY) */ struct virtio_blk_geometry { uint16_t cylinders; uint8_t heads; uint8_t sectors; } geometry; /* Block size of device (if VIRTIO_BLK_F_BLK_SIZE) */ uint32_t blk_size; /* Topology of the device (if VIRTIO_BLK_F_TOPOLOGY) */ struct virtio_blk_topology { + /* Exponent for physical block per logical block. */ uint8_t physical_block_exp; + /* Alignment offset in logical blocks. */ uint8_t alignment_offset; + /* Minimum I/O size without performance penalty in logical + * blocks. */ uint16_t min_io_size; + /* Optimal sustained I/O size in logical blocks. */ uint32_t opt_io_size; } topology; /* Writeback mode (if VIRTIO_BLK_F_CONFIG_WCE) */ - uint8_t writeback; - + uint8_t wce; + uint8_t unused; + /* Number of vqs, only available when VIRTIO_BLK_F_MQ is set */ + uint16_t num_queues; + uint32_t max_discard_sectors; + uint32_t max_discard_seg; + uint32_t discard_sector_alignment; + uint32_t max_write_zeroes_sectors; + uint32_t max_write_zeroes_seg; + uint8_t write_zeroes_may_unmap; + uint8_t unused1[3]; } __packed; /* * Command types * * Usage is a bit tricky as some bits are used as flags and some are not. * * Rules: * VIRTIO_BLK_T_OUT may be combined with VIRTIO_BLK_T_SCSI_CMD or * VIRTIO_BLK_T_BARRIER. VIRTIO_BLK_T_FLUSH is a command of its own * and may not be combined with any of the other flags. */ /* These two define direction. */ -#define VIRTIO_BLK_T_IN 0 -#define VIRTIO_BLK_T_OUT 1 +#define VIRTIO_BLK_T_IN 0 +#define VIRTIO_BLK_T_OUT 1 /* This bit says it's a scsi command, not an actual read or write. */ -#define VIRTIO_BLK_T_SCSI_CMD 2 +#define VIRTIO_BLK_T_SCSI_CMD 2 +#define VIRTIO_BLK_T_SCSI_CMD_OUT 3 /* Cache flush command */ -#define VIRTIO_BLK_T_FLUSH 4 +#define VIRTIO_BLK_T_FLUSH 4 +#define VIRTIO_BLK_T_FLUSH_OUT 5 /* Get device ID command */ -#define VIRTIO_BLK_T_GET_ID 8 +#define VIRTIO_BLK_T_GET_ID 8 +/* Discard command */ +#define VIRTIO_BLK_T_DISCARD 11 + +/* Write zeros command */ +#define VIRTIO_BLK_T_WRITE_ZEROES 13 + /* Barrier before this op. */ -#define VIRTIO_BLK_T_BARRIER 0x80000000 +#define VIRTIO_BLK_T_BARRIER 0x80000000 /* ID string length */ -#define VIRTIO_BLK_ID_BYTES 20 +#define VIRTIO_BLK_ID_BYTES 20 +/* Unmap this range (only valid for write zeroes command) */ +#define VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP 0x00000001 + /* This is the first element of the read scatter-gather list. */ struct virtio_blk_outhdr { /* VIRTIO_BLK_T* */ uint32_t type; /* io priority. */ uint32_t ioprio; /* Sector (ie. 512 byte offset) */ uint64_t sector; +}; + +struct virtio_blk_discard_write_zeroes { + uint64_t sector; + uint32_t num_sectors; + struct { + uint32_t unmap:1; + uint32_t reserved:31; + } flags; }; struct virtio_scsi_inhdr { uint32_t errors; uint32_t data_len; uint32_t sense_len; uint32_t residual; }; /* And this is the final byte of the write scatter-gather list. */ #define VIRTIO_BLK_S_OK 0 #define VIRTIO_BLK_S_IOERR 1 #define VIRTIO_BLK_S_UNSUPP 2 #endif /* _VIRTIO_BLK_H */ Index: releng/12.2/usr.sbin/bhyve/block_if.c =================================================================== --- releng/12.2/usr.sbin/bhyve/block_if.c (revision 365701) +++ releng/12.2/usr.sbin/bhyve/block_if.c (revision 365702) @@ -1,850 +1,856 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2013 Peter Grehan * All rights reserved. + * Copyright 2020 Joyent, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include #include #include #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include #include #include #include #include #include #include #include #include #include "bhyverun.h" #include "debug.h" #include "mevent.h" #include "block_if.h" #define BLOCKIF_SIG 0xb109b109 #define BLOCKIF_NUMTHR 8 #define BLOCKIF_MAXREQ (BLOCKIF_RING_MAX + BLOCKIF_NUMTHR) enum blockop { BOP_READ, BOP_WRITE, BOP_FLUSH, BOP_DELETE }; enum blockstat { BST_FREE, BST_BLOCK, BST_PEND, BST_BUSY, BST_DONE }; struct blockif_elem { TAILQ_ENTRY(blockif_elem) be_link; struct blockif_req *be_req; enum blockop be_op; enum blockstat be_status; pthread_t be_tid; off_t be_block; }; struct blockif_ctxt { int bc_magic; int bc_fd; int bc_ischr; int bc_isgeom; int bc_candelete; int bc_rdonly; off_t bc_size; int bc_sectsz; int bc_psectsz; int bc_psectoff; int bc_closing; pthread_t bc_btid[BLOCKIF_NUMTHR]; pthread_mutex_t bc_mtx; pthread_cond_t bc_cond; /* Request elements and free/pending/busy queues */ TAILQ_HEAD(, blockif_elem) bc_freeq; TAILQ_HEAD(, blockif_elem) bc_pendq; TAILQ_HEAD(, blockif_elem) bc_busyq; struct blockif_elem bc_reqs[BLOCKIF_MAXREQ]; }; static pthread_once_t blockif_once = PTHREAD_ONCE_INIT; struct blockif_sig_elem { pthread_mutex_t bse_mtx; pthread_cond_t bse_cond; int bse_pending; struct blockif_sig_elem *bse_next; }; static struct blockif_sig_elem *blockif_bse_head; static int blockif_enqueue(struct blockif_ctxt *bc, struct blockif_req *breq, enum blockop op) { struct blockif_elem *be, *tbe; off_t off; int i; be = TAILQ_FIRST(&bc->bc_freeq); assert(be != NULL); assert(be->be_status == BST_FREE); TAILQ_REMOVE(&bc->bc_freeq, be, be_link); be->be_req = breq; be->be_op = op; switch (op) { case BOP_READ: case BOP_WRITE: case BOP_DELETE: off = breq->br_offset; for (i = 0; i < breq->br_iovcnt; i++) off += breq->br_iov[i].iov_len; break; default: off = OFF_MAX; } be->be_block = off; TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) { if (tbe->be_block == breq->br_offset) break; } if (tbe == NULL) { TAILQ_FOREACH(tbe, &bc->bc_busyq, be_link) { if (tbe->be_block == breq->br_offset) break; } } if (tbe == NULL) be->be_status = BST_PEND; else be->be_status = BST_BLOCK; TAILQ_INSERT_TAIL(&bc->bc_pendq, be, be_link); return (be->be_status == BST_PEND); } static int blockif_dequeue(struct blockif_ctxt *bc, pthread_t t, struct blockif_elem **bep) { struct blockif_elem *be; TAILQ_FOREACH(be, &bc->bc_pendq, be_link) { if (be->be_status == BST_PEND) break; assert(be->be_status == BST_BLOCK); } if (be == NULL) return (0); TAILQ_REMOVE(&bc->bc_pendq, be, be_link); be->be_status = BST_BUSY; be->be_tid = t; TAILQ_INSERT_TAIL(&bc->bc_busyq, be, be_link); *bep = be; return (1); } static void blockif_complete(struct blockif_ctxt *bc, struct blockif_elem *be) { struct blockif_elem *tbe; if (be->be_status == BST_DONE || be->be_status == BST_BUSY) TAILQ_REMOVE(&bc->bc_busyq, be, be_link); else TAILQ_REMOVE(&bc->bc_pendq, be, be_link); TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) { if (tbe->be_req->br_offset == be->be_block) tbe->be_status = BST_PEND; } be->be_tid = 0; be->be_status = BST_FREE; be->be_req = NULL; TAILQ_INSERT_TAIL(&bc->bc_freeq, be, be_link); } static void blockif_proc(struct blockif_ctxt *bc, struct blockif_elem *be, uint8_t *buf) { struct blockif_req *br; off_t arg[2]; ssize_t clen, len, off, boff, voff; int i, err; br = be->be_req; if (br->br_iovcnt <= 1) buf = NULL; err = 0; switch (be->be_op) { case BOP_READ: if (buf == NULL) { if ((len = preadv(bc->bc_fd, br->br_iov, br->br_iovcnt, br->br_offset)) < 0) err = errno; else br->br_resid -= len; break; } i = 0; off = voff = 0; while (br->br_resid > 0) { len = MIN(br->br_resid, MAXPHYS); if (pread(bc->bc_fd, buf, len, br->br_offset + off) < 0) { err = errno; break; } boff = 0; do { clen = MIN(len - boff, br->br_iov[i].iov_len - voff); memcpy(br->br_iov[i].iov_base + voff, buf + boff, clen); if (clen < br->br_iov[i].iov_len - voff) voff += clen; else { i++; voff = 0; } boff += clen; } while (boff < len); off += len; br->br_resid -= len; } break; case BOP_WRITE: if (bc->bc_rdonly) { err = EROFS; break; } if (buf == NULL) { if ((len = pwritev(bc->bc_fd, br->br_iov, br->br_iovcnt, br->br_offset)) < 0) err = errno; else br->br_resid -= len; break; } i = 0; off = voff = 0; while (br->br_resid > 0) { len = MIN(br->br_resid, MAXPHYS); boff = 0; do { clen = MIN(len - boff, br->br_iov[i].iov_len - voff); memcpy(buf + boff, br->br_iov[i].iov_base + voff, clen); if (clen < br->br_iov[i].iov_len - voff) voff += clen; else { i++; voff = 0; } boff += clen; } while (boff < len); if (pwrite(bc->bc_fd, buf, len, br->br_offset + off) < 0) { err = errno; break; } off += len; br->br_resid -= len; } break; case BOP_FLUSH: if (bc->bc_ischr) { if (ioctl(bc->bc_fd, DIOCGFLUSH)) err = errno; } else if (fsync(bc->bc_fd)) err = errno; break; case BOP_DELETE: if (!bc->bc_candelete) err = EOPNOTSUPP; else if (bc->bc_rdonly) err = EROFS; else if (bc->bc_ischr) { arg[0] = br->br_offset; arg[1] = br->br_resid; if (ioctl(bc->bc_fd, DIOCGDELETE, arg)) err = errno; else br->br_resid = 0; } else err = EOPNOTSUPP; break; default: err = EINVAL; break; } be->be_status = BST_DONE; (*br->br_callback)(br, err); } static void * blockif_thr(void *arg) { struct blockif_ctxt *bc; struct blockif_elem *be; pthread_t t; uint8_t *buf; bc = arg; if (bc->bc_isgeom) buf = malloc(MAXPHYS); else buf = NULL; t = pthread_self(); pthread_mutex_lock(&bc->bc_mtx); for (;;) { while (blockif_dequeue(bc, t, &be)) { pthread_mutex_unlock(&bc->bc_mtx); blockif_proc(bc, be, buf); pthread_mutex_lock(&bc->bc_mtx); blockif_complete(bc, be); } /* Check ctxt status here to see if exit requested */ if (bc->bc_closing) break; pthread_cond_wait(&bc->bc_cond, &bc->bc_mtx); } pthread_mutex_unlock(&bc->bc_mtx); if (buf) free(buf); pthread_exit(NULL); return (NULL); } static void blockif_sigcont_handler(int signal, enum ev_type type, void *arg) { struct blockif_sig_elem *bse; for (;;) { /* * Process the entire list even if not intended for * this thread. */ do { bse = blockif_bse_head; if (bse == NULL) return; } while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head, (uintptr_t)bse, (uintptr_t)bse->bse_next)); pthread_mutex_lock(&bse->bse_mtx); bse->bse_pending = 0; pthread_cond_signal(&bse->bse_cond); pthread_mutex_unlock(&bse->bse_mtx); } } static void blockif_init(void) { mevent_add(SIGCONT, EVF_SIGNAL, blockif_sigcont_handler, NULL); (void) signal(SIGCONT, SIG_IGN); } struct blockif_ctxt * blockif_open(const char *optstr, const char *ident) { char tname[MAXCOMLEN + 1]; char name[MAXPATHLEN]; char *nopt, *xopts, *cp; struct blockif_ctxt *bc; struct stat sbuf; struct diocgattr_arg arg; off_t size, psectsz, psectoff; int extra, fd, i, sectsz; int nocache, sync, ro, candelete, geom, ssopt, pssopt; + int nodelete; + #ifndef WITHOUT_CAPSICUM cap_rights_t rights; cap_ioctl_t cmds[] = { DIOCGFLUSH, DIOCGDELETE }; #endif pthread_once(&blockif_once, blockif_init); fd = -1; ssopt = 0; nocache = 0; sync = 0; ro = 0; + nodelete = 0; /* * The first element in the optstring is always a pathname. * Optional elements follow */ nopt = xopts = strdup(optstr); while (xopts != NULL) { cp = strsep(&xopts, ","); if (cp == nopt) /* file or device pathname */ continue; else if (!strcmp(cp, "nocache")) nocache = 1; + else if (!strcmp(cp, "nodelete")) + nodelete = 1; else if (!strcmp(cp, "sync") || !strcmp(cp, "direct")) sync = 1; else if (!strcmp(cp, "ro")) ro = 1; else if (sscanf(cp, "sectorsize=%d/%d", &ssopt, &pssopt) == 2) ; else if (sscanf(cp, "sectorsize=%d", &ssopt) == 1) pssopt = ssopt; else { EPRINTLN("Invalid device option \"%s\"", cp); goto err; } } extra = 0; if (nocache) extra |= O_DIRECT; if (sync) extra |= O_SYNC; fd = open(nopt, (ro ? O_RDONLY : O_RDWR) | extra); if (fd < 0 && !ro) { /* Attempt a r/w fail with a r/o open */ fd = open(nopt, O_RDONLY | extra); ro = 1; } if (fd < 0) { warn("Could not open backing file: %s", nopt); goto err; } if (fstat(fd, &sbuf) < 0) { warn("Could not stat backing file %s", nopt); goto err; } #ifndef WITHOUT_CAPSICUM cap_rights_init(&rights, CAP_FSYNC, CAP_IOCTL, CAP_READ, CAP_SEEK, CAP_WRITE); if (ro) cap_rights_clear(&rights, CAP_FSYNC, CAP_WRITE); if (caph_rights_limit(fd, &rights) == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); #endif /* * Deal with raw devices */ size = sbuf.st_size; sectsz = DEV_BSIZE; psectsz = psectoff = 0; candelete = geom = 0; if (S_ISCHR(sbuf.st_mode)) { if (ioctl(fd, DIOCGMEDIASIZE, &size) < 0 || ioctl(fd, DIOCGSECTORSIZE, §sz)) { perror("Could not fetch dev blk/sector size"); goto err; } assert(size != 0); assert(sectsz != 0); if (ioctl(fd, DIOCGSTRIPESIZE, &psectsz) == 0 && psectsz > 0) ioctl(fd, DIOCGSTRIPEOFFSET, &psectoff); strlcpy(arg.name, "GEOM::candelete", sizeof(arg.name)); arg.len = sizeof(arg.value.i); - if (ioctl(fd, DIOCGATTR, &arg) == 0) + if (nodelete == 0 && ioctl(fd, DIOCGATTR, &arg) == 0) candelete = arg.value.i; if (ioctl(fd, DIOCGPROVIDERNAME, name) == 0) geom = 1; } else psectsz = sbuf.st_blksize; #ifndef WITHOUT_CAPSICUM if (caph_ioctls_limit(fd, cmds, nitems(cmds)) == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); #endif if (ssopt != 0) { if (!powerof2(ssopt) || !powerof2(pssopt) || ssopt < 512 || ssopt > pssopt) { EPRINTLN("Invalid sector size %d/%d", ssopt, pssopt); goto err; } /* * Some backend drivers (e.g. cd0, ada0) require that the I/O * size be a multiple of the device's sector size. * * Validate that the emulated sector size complies with this * requirement. */ if (S_ISCHR(sbuf.st_mode)) { if (ssopt < sectsz || (ssopt % sectsz) != 0) { EPRINTLN("Sector size %d incompatible " "with underlying device sector size %d", ssopt, sectsz); goto err; } } sectsz = ssopt; psectsz = pssopt; psectoff = 0; } bc = calloc(1, sizeof(struct blockif_ctxt)); if (bc == NULL) { perror("calloc"); goto err; } bc->bc_magic = BLOCKIF_SIG; bc->bc_fd = fd; bc->bc_ischr = S_ISCHR(sbuf.st_mode); bc->bc_isgeom = geom; bc->bc_candelete = candelete; bc->bc_rdonly = ro; bc->bc_size = size; bc->bc_sectsz = sectsz; bc->bc_psectsz = psectsz; bc->bc_psectoff = psectoff; pthread_mutex_init(&bc->bc_mtx, NULL); pthread_cond_init(&bc->bc_cond, NULL); TAILQ_INIT(&bc->bc_freeq); TAILQ_INIT(&bc->bc_pendq); TAILQ_INIT(&bc->bc_busyq); for (i = 0; i < BLOCKIF_MAXREQ; i++) { bc->bc_reqs[i].be_status = BST_FREE; TAILQ_INSERT_HEAD(&bc->bc_freeq, &bc->bc_reqs[i], be_link); } for (i = 0; i < BLOCKIF_NUMTHR; i++) { pthread_create(&bc->bc_btid[i], NULL, blockif_thr, bc); snprintf(tname, sizeof(tname), "blk-%s-%d", ident, i); pthread_set_name_np(bc->bc_btid[i], tname); } return (bc); err: if (fd >= 0) close(fd); return (NULL); } static int blockif_request(struct blockif_ctxt *bc, struct blockif_req *breq, enum blockop op) { int err; err = 0; pthread_mutex_lock(&bc->bc_mtx); if (!TAILQ_EMPTY(&bc->bc_freeq)) { /* * Enqueue and inform the block i/o thread * that there is work available */ if (blockif_enqueue(bc, breq, op)) pthread_cond_signal(&bc->bc_cond); } else { /* * Callers are not allowed to enqueue more than * the specified blockif queue limit. Return an * error to indicate that the queue length has been * exceeded. */ err = E2BIG; } pthread_mutex_unlock(&bc->bc_mtx); return (err); } int blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq) { assert(bc->bc_magic == BLOCKIF_SIG); return (blockif_request(bc, breq, BOP_READ)); } int blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq) { assert(bc->bc_magic == BLOCKIF_SIG); return (blockif_request(bc, breq, BOP_WRITE)); } int blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq) { assert(bc->bc_magic == BLOCKIF_SIG); return (blockif_request(bc, breq, BOP_FLUSH)); } int blockif_delete(struct blockif_ctxt *bc, struct blockif_req *breq) { assert(bc->bc_magic == BLOCKIF_SIG); return (blockif_request(bc, breq, BOP_DELETE)); } int blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq) { struct blockif_elem *be; assert(bc->bc_magic == BLOCKIF_SIG); pthread_mutex_lock(&bc->bc_mtx); /* * Check pending requests. */ TAILQ_FOREACH(be, &bc->bc_pendq, be_link) { if (be->be_req == breq) break; } if (be != NULL) { /* * Found it. */ blockif_complete(bc, be); pthread_mutex_unlock(&bc->bc_mtx); return (0); } /* * Check in-flight requests. */ TAILQ_FOREACH(be, &bc->bc_busyq, be_link) { if (be->be_req == breq) break; } if (be == NULL) { /* * Didn't find it. */ pthread_mutex_unlock(&bc->bc_mtx); return (EINVAL); } /* * Interrupt the processing thread to force it return * prematurely via it's normal callback path. */ while (be->be_status == BST_BUSY) { struct blockif_sig_elem bse, *old_head; pthread_mutex_init(&bse.bse_mtx, NULL); pthread_cond_init(&bse.bse_cond, NULL); bse.bse_pending = 1; do { old_head = blockif_bse_head; bse.bse_next = old_head; } while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head, (uintptr_t)old_head, (uintptr_t)&bse)); pthread_kill(be->be_tid, SIGCONT); pthread_mutex_lock(&bse.bse_mtx); while (bse.bse_pending) pthread_cond_wait(&bse.bse_cond, &bse.bse_mtx); pthread_mutex_unlock(&bse.bse_mtx); } pthread_mutex_unlock(&bc->bc_mtx); /* * The processing thread has been interrupted. Since it's not * clear if the callback has been invoked yet, return EBUSY. */ return (EBUSY); } int blockif_close(struct blockif_ctxt *bc) { void *jval; int i; assert(bc->bc_magic == BLOCKIF_SIG); /* * Stop the block i/o thread */ pthread_mutex_lock(&bc->bc_mtx); bc->bc_closing = 1; pthread_mutex_unlock(&bc->bc_mtx); pthread_cond_broadcast(&bc->bc_cond); for (i = 0; i < BLOCKIF_NUMTHR; i++) pthread_join(bc->bc_btid[i], &jval); /* XXX Cancel queued i/o's ??? */ /* * Release resources */ bc->bc_magic = 0; close(bc->bc_fd); free(bc); return (0); } /* * Return virtual C/H/S values for a given block. Use the algorithm * outlined in the VHD specification to calculate values. */ void blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h, uint8_t *s) { off_t sectors; /* total sectors of the block dev */ off_t hcyl; /* cylinders times heads */ uint16_t secpt; /* sectors per track */ uint8_t heads; assert(bc->bc_magic == BLOCKIF_SIG); sectors = bc->bc_size / bc->bc_sectsz; /* Clamp the size to the largest possible with CHS */ if (sectors > 65535UL*16*255) sectors = 65535UL*16*255; if (sectors >= 65536UL*16*63) { secpt = 255; heads = 16; hcyl = sectors / secpt; } else { secpt = 17; hcyl = sectors / secpt; heads = (hcyl + 1023) / 1024; if (heads < 4) heads = 4; if (hcyl >= (heads * 1024) || heads > 16) { secpt = 31; heads = 16; hcyl = sectors / secpt; } if (hcyl >= (heads * 1024)) { secpt = 63; heads = 16; hcyl = sectors / secpt; } } *c = hcyl / heads; *h = heads; *s = secpt; } /* * Accessors */ off_t blockif_size(struct blockif_ctxt *bc) { assert(bc->bc_magic == BLOCKIF_SIG); return (bc->bc_size); } int blockif_sectsz(struct blockif_ctxt *bc) { assert(bc->bc_magic == BLOCKIF_SIG); return (bc->bc_sectsz); } void blockif_psectsz(struct blockif_ctxt *bc, int *size, int *off) { assert(bc->bc_magic == BLOCKIF_SIG); *size = bc->bc_psectsz; *off = bc->bc_psectoff; } int blockif_queuesz(struct blockif_ctxt *bc) { assert(bc->bc_magic == BLOCKIF_SIG); return (BLOCKIF_MAXREQ - 1); } int blockif_is_ro(struct blockif_ctxt *bc) { assert(bc->bc_magic == BLOCKIF_SIG); return (bc->bc_rdonly); } int blockif_candelete(struct blockif_ctxt *bc) { assert(bc->bc_magic == BLOCKIF_SIG); return (bc->bc_candelete); } Index: releng/12.2/usr.sbin/bhyve/pci_virtio_block.c =================================================================== --- releng/12.2/usr.sbin/bhyve/pci_virtio_block.c (revision 365701) +++ releng/12.2/usr.sbin/bhyve/pci_virtio_block.c (revision 365702) @@ -1,425 +1,528 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. - * Copyright (c) 2019 Joyent, Inc. + * Copyright 2020 Joyent, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "bhyverun.h" #include "debug.h" #include "pci_emul.h" #include "virtio.h" #include "block_if.h" -#define VTBLK_RINGSZ 128 +#define VTBLK_BSIZE 512 +#define VTBLK_RINGSZ 128 _Static_assert(VTBLK_RINGSZ <= BLOCKIF_RING_MAX, "Each ring entry must be able to queue a request"); -#define VTBLK_S_OK 0 -#define VTBLK_S_IOERR 1 +#define VTBLK_S_OK 0 +#define VTBLK_S_IOERR 1 #define VTBLK_S_UNSUPP 2 #define VTBLK_BLK_ID_BYTES 20 + 1 /* Capability bits */ -#define VTBLK_F_SEG_MAX (1 << 2) /* Maximum request segments */ -#define VTBLK_F_BLK_SIZE (1 << 6) /* cfg block size valid */ -#define VTBLK_F_FLUSH (1 << 9) /* Cache flush support */ -#define VTBLK_F_TOPOLOGY (1 << 10) /* Optimal I/O alignment */ +#define VTBLK_F_BARRIER (1 << 0) /* Does host support barriers? */ +#define VTBLK_F_SIZE_MAX (1 << 1) /* Indicates maximum segment size */ +#define VTBLK_F_SEG_MAX (1 << 2) /* Indicates maximum # of segments */ +#define VTBLK_F_GEOMETRY (1 << 4) /* Legacy geometry available */ +#define VTBLK_F_RO (1 << 5) /* Disk is read-only */ +#define VTBLK_F_BLK_SIZE (1 << 6) /* Block size of disk is available*/ +#define VTBLK_F_SCSI (1 << 7) /* Supports scsi command passthru */ +#define VTBLK_F_FLUSH (1 << 9) /* Writeback mode enabled after reset */ +#define VTBLK_F_WCE (1 << 9) /* Legacy alias for FLUSH */ +#define VTBLK_F_TOPOLOGY (1 << 10) /* Topology information is available */ +#define VTBLK_F_CONFIG_WCE (1 << 11) /* Writeback mode available in config */ +#define VTBLK_F_MQ (1 << 12) /* Multi-Queue */ +#define VTBLK_F_DISCARD (1 << 13) /* Trim blocks */ +#define VTBLK_F_WRITE_ZEROES (1 << 14) /* Write zeros */ /* * Host capabilities */ -#define VTBLK_S_HOSTCAPS \ +#define VTBLK_S_HOSTCAPS \ ( VTBLK_F_SEG_MAX | \ VTBLK_F_BLK_SIZE | \ VTBLK_F_FLUSH | \ VTBLK_F_TOPOLOGY | \ VIRTIO_RING_F_INDIRECT_DESC ) /* indirect descriptors */ /* + * The current blockif_delete() interface only allows a single delete + * request at a time. + */ +#define VTBLK_MAX_DISCARD_SEG 1 + +/* + * An arbitrary limit to prevent excessive latency due to large + * delete requests. + */ +#define VTBLK_MAX_DISCARD_SECT ((16 << 20) / VTBLK_BSIZE) /* 16 MiB */ + +/* * Config space "registers" */ struct vtblk_config { uint64_t vbc_capacity; uint32_t vbc_size_max; uint32_t vbc_seg_max; struct { uint16_t cylinders; uint8_t heads; uint8_t sectors; } vbc_geometry; uint32_t vbc_blk_size; struct { uint8_t physical_block_exp; uint8_t alignment_offset; uint16_t min_io_size; uint32_t opt_io_size; } vbc_topology; uint8_t vbc_writeback; + uint8_t unused0[1]; + uint16_t num_queues; + uint32_t max_discard_sectors; + uint32_t max_discard_seg; + uint32_t discard_sector_alignment; + uint32_t max_write_zeroes_sectors; + uint32_t max_write_zeroes_seg; + uint8_t write_zeroes_may_unmap; + uint8_t unused1[3]; } __packed; /* * Fixed-size block header */ struct virtio_blk_hdr { #define VBH_OP_READ 0 #define VBH_OP_WRITE 1 +#define VBH_OP_SCSI_CMD 2 +#define VBH_OP_SCSI_CMD_OUT 3 #define VBH_OP_FLUSH 4 #define VBH_OP_FLUSH_OUT 5 #define VBH_OP_IDENT 8 +#define VBH_OP_DISCARD 11 +#define VBH_OP_WRITE_ZEROES 13 + #define VBH_FLAG_BARRIER 0x80000000 /* OR'ed into vbh_type */ uint32_t vbh_type; uint32_t vbh_ioprio; uint64_t vbh_sector; } __packed; /* * Debug printf */ static int pci_vtblk_debug; -#define DPRINTF(params) if (pci_vtblk_debug) PRINTLN params -#define WPRINTF(params) PRINTLN params +#define DPRINTF(params) if (pci_vtblk_debug) PRINTLN params +#define WPRINTF(params) PRINTLN params struct pci_vtblk_ioreq { struct blockif_req io_req; struct pci_vtblk_softc *io_sc; uint8_t *io_status; uint16_t io_idx; }; +struct virtio_blk_discard_write_zeroes { + uint64_t sector; + uint32_t num_sectors; + struct { + uint32_t unmap:1; + uint32_t reserved:31; + } flags; +}; + /* * Per-device softc */ struct pci_vtblk_softc { struct virtio_softc vbsc_vs; pthread_mutex_t vsc_mtx; struct vqueue_info vbsc_vq; struct vtblk_config vbsc_cfg; + struct virtio_consts vbsc_consts; struct blockif_ctxt *bc; char vbsc_ident[VTBLK_BLK_ID_BYTES]; struct pci_vtblk_ioreq vbsc_ios[VTBLK_RINGSZ]; }; static void pci_vtblk_reset(void *); static void pci_vtblk_notify(void *, struct vqueue_info *); static int pci_vtblk_cfgread(void *, int, int, uint32_t *); static int pci_vtblk_cfgwrite(void *, int, int, uint32_t); static struct virtio_consts vtblk_vi_consts = { "vtblk", /* our name */ 1, /* we support 1 virtqueue */ sizeof(struct vtblk_config), /* config reg size */ pci_vtblk_reset, /* reset */ pci_vtblk_notify, /* device-wide qnotify */ pci_vtblk_cfgread, /* read PCI config */ pci_vtblk_cfgwrite, /* write PCI config */ NULL, /* apply negotiated features */ VTBLK_S_HOSTCAPS, /* our capabilities */ }; static void pci_vtblk_reset(void *vsc) { struct pci_vtblk_softc *sc = vsc; DPRINTF(("vtblk: device reset requested !")); vi_reset_dev(&sc->vbsc_vs); } static void -pci_vtblk_done(struct blockif_req *br, int err) +pci_vtblk_done_locked(struct pci_vtblk_ioreq *io, int err) { - struct pci_vtblk_ioreq *io = br->br_param; struct pci_vtblk_softc *sc = io->io_sc; /* convert errno into a virtio block error return */ if (err == EOPNOTSUPP || err == ENOSYS) *io->io_status = VTBLK_S_UNSUPP; else if (err != 0) *io->io_status = VTBLK_S_IOERR; else *io->io_status = VTBLK_S_OK; /* * Return the descriptor back to the host. * We wrote 1 byte (our status) to host. */ - pthread_mutex_lock(&sc->vsc_mtx); vq_relchain(&sc->vbsc_vq, io->io_idx, 1); vq_endchains(&sc->vbsc_vq, 0); +} + +static void +pci_vtblk_done(struct blockif_req *br, int err) +{ + struct pci_vtblk_ioreq *io = br->br_param; + struct pci_vtblk_softc *sc = io->io_sc; + + pthread_mutex_lock(&sc->vsc_mtx); + pci_vtblk_done_locked(io, err); pthread_mutex_unlock(&sc->vsc_mtx); } static void pci_vtblk_proc(struct pci_vtblk_softc *sc, struct vqueue_info *vq) { struct virtio_blk_hdr *vbh; struct pci_vtblk_ioreq *io; int i, n; int err; ssize_t iolen; int writeop, type; struct iovec iov[BLOCKIF_IOV_MAX + 2]; uint16_t idx, flags[BLOCKIF_IOV_MAX + 2]; + struct virtio_blk_discard_write_zeroes *discard; n = vq_getchain(vq, &idx, iov, BLOCKIF_IOV_MAX + 2, flags); /* * The first descriptor will be the read-only fixed header, * and the last is for status (hence +2 above and below). * The remaining iov's are the actual data I/O vectors. * * XXX - note - this fails on crash dump, which does a * VIRTIO_BLK_T_FLUSH with a zero transfer length */ assert(n >= 2 && n <= BLOCKIF_IOV_MAX + 2); io = &sc->vbsc_ios[idx]; assert((flags[0] & VRING_DESC_F_WRITE) == 0); assert(iov[0].iov_len == sizeof(struct virtio_blk_hdr)); - vbh = iov[0].iov_base; + vbh = (struct virtio_blk_hdr *)iov[0].iov_base; memcpy(&io->io_req.br_iov, &iov[1], sizeof(struct iovec) * (n - 2)); io->io_req.br_iovcnt = n - 2; - io->io_req.br_offset = vbh->vbh_sector * DEV_BSIZE; - io->io_status = iov[--n].iov_base; + io->io_req.br_offset = vbh->vbh_sector * VTBLK_BSIZE; + io->io_status = (uint8_t *)iov[--n].iov_base; assert(iov[n].iov_len == 1); assert(flags[n] & VRING_DESC_F_WRITE); /* * XXX * The guest should not be setting the BARRIER flag because * we don't advertise the capability. */ type = vbh->vbh_type & ~VBH_FLAG_BARRIER; - writeop = (type == VBH_OP_WRITE); + writeop = (type == VBH_OP_WRITE || type == VBH_OP_DISCARD); iolen = 0; for (i = 1; i < n; i++) { /* * - write op implies read-only descriptor, * - read/ident op implies write-only descriptor, * therefore test the inverse of the descriptor bit * to the op. */ assert(((flags[i] & VRING_DESC_F_WRITE) == 0) == writeop); iolen += iov[i].iov_len; } io->io_req.br_resid = iolen; DPRINTF(("virtio-block: %s op, %zd bytes, %d segs, offset %ld", - writeop ? "write" : "read/ident", iolen, i - 1, + writeop ? "write/discard" : "read/ident", iolen, i - 1, io->io_req.br_offset)); switch (type) { case VBH_OP_READ: err = blockif_read(sc->bc, &io->io_req); break; case VBH_OP_WRITE: err = blockif_write(sc->bc, &io->io_req); break; + case VBH_OP_DISCARD: + /* + * We currently only support a single request, if the guest + * has submitted a request that doesn't conform to the + * requirements, we return a error. + */ + if (iov[1].iov_len != sizeof (*discard)) { + pci_vtblk_done_locked(io, EINVAL); + return; + } + + /* The segments to discard are provided rather than data */ + discard = (struct virtio_blk_discard_write_zeroes *) + iov[1].iov_base; + + /* + * virtio v1.1 5.2.6.2: + * The device MUST set the status byte to VIRTIO_BLK_S_UNSUPP + * for discard and write zeroes commands if any unknown flag is + * set. Furthermore, the device MUST set the status byte to + * VIRTIO_BLK_S_UNSUPP for discard commands if the unmap flag + * is set. + * + * Currently there are no known flags for a DISCARD request. + */ + if (discard->flags.unmap != 0 || discard->flags.reserved != 0) { + pci_vtblk_done_locked(io, ENOTSUP); + return; + } + + /* Make sure the request doesn't exceed our size limit */ + if (discard->num_sectors > VTBLK_MAX_DISCARD_SECT) { + pci_vtblk_done_locked(io, EINVAL); + return; + } + + io->io_req.br_offset = discard->sector * VTBLK_BSIZE; + io->io_req.br_resid = discard->num_sectors * VTBLK_BSIZE; + err = blockif_delete(sc->bc, &io->io_req); + break; case VBH_OP_FLUSH: case VBH_OP_FLUSH_OUT: err = blockif_flush(sc->bc, &io->io_req); break; case VBH_OP_IDENT: /* Assume a single buffer */ /* S/n equal to buffer is not zero-terminated. */ memset(iov[1].iov_base, 0, iov[1].iov_len); strncpy(iov[1].iov_base, sc->vbsc_ident, MIN(iov[1].iov_len, sizeof(sc->vbsc_ident))); - pci_vtblk_done(&io->io_req, 0); + pci_vtblk_done_locked(io, 0); return; default: - pci_vtblk_done(&io->io_req, EOPNOTSUPP); + pci_vtblk_done_locked(io, EOPNOTSUPP); return; } assert(err == 0); } static void pci_vtblk_notify(void *vsc, struct vqueue_info *vq) { struct pci_vtblk_softc *sc = vsc; while (vq_has_descs(vq)) pci_vtblk_proc(sc, vq); } static int pci_vtblk_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) { char bident[sizeof("XX:X:X")]; struct blockif_ctxt *bctxt; MD5_CTX mdctx; u_char digest[16]; struct pci_vtblk_softc *sc; off_t size; int i, sectsz, sts, sto; if (opts == NULL) { WPRINTF(("virtio-block: backing device required")); return (1); } /* * The supplied backing file has to exist */ snprintf(bident, sizeof(bident), "%d:%d", pi->pi_slot, pi->pi_func); bctxt = blockif_open(opts, bident); if (bctxt == NULL) { perror("Could not open backing file"); return (1); } size = blockif_size(bctxt); sectsz = blockif_sectsz(bctxt); blockif_psectsz(bctxt, &sts, &sto); sc = calloc(1, sizeof(struct pci_vtblk_softc)); sc->bc = bctxt; for (i = 0; i < VTBLK_RINGSZ; i++) { struct pci_vtblk_ioreq *io = &sc->vbsc_ios[i]; io->io_req.br_callback = pci_vtblk_done; io->io_req.br_param = io; io->io_sc = sc; io->io_idx = i; } + bcopy(&vtblk_vi_consts, &sc->vbsc_consts, sizeof (vtblk_vi_consts)); + if (blockif_candelete(sc->bc)) + sc->vbsc_consts.vc_hv_caps |= VTBLK_F_DISCARD; + pthread_mutex_init(&sc->vsc_mtx, NULL); /* init virtio softc and virtqueues */ - vi_softc_linkup(&sc->vbsc_vs, &vtblk_vi_consts, sc, pi, &sc->vbsc_vq); + vi_softc_linkup(&sc->vbsc_vs, &sc->vbsc_consts, sc, pi, &sc->vbsc_vq); sc->vbsc_vs.vs_mtx = &sc->vsc_mtx; sc->vbsc_vq.vq_qsize = VTBLK_RINGSZ; /* sc->vbsc_vq.vq_notify = we have no per-queue notify */ /* * Create an identifier for the backing file. Use parts of the * md5 sum of the filename */ MD5Init(&mdctx); MD5Update(&mdctx, opts, strlen(opts)); MD5Final(digest, &mdctx); snprintf(sc->vbsc_ident, VTBLK_BLK_ID_BYTES, "BHYVE-%02X%02X-%02X%02X-%02X%02X", digest[0], digest[1], digest[2], digest[3], digest[4], digest[5]); /* setup virtio block config space */ - sc->vbsc_cfg.vbc_capacity = size / DEV_BSIZE; /* 512-byte units */ + sc->vbsc_cfg.vbc_capacity = size / VTBLK_BSIZE; /* 512-byte units */ sc->vbsc_cfg.vbc_size_max = 0; /* not negotiated */ /* * If Linux is presented with a seg_max greater than the virtio queue * size, it can stumble into situations where it violates its own * invariants and panics. For safety, we keep seg_max clamped, paying * heed to the two extra descriptors needed for the header and status * of a request. */ sc->vbsc_cfg.vbc_seg_max = MIN(VTBLK_RINGSZ - 2, BLOCKIF_IOV_MAX); sc->vbsc_cfg.vbc_geometry.cylinders = 0; /* no geometry */ sc->vbsc_cfg.vbc_geometry.heads = 0; sc->vbsc_cfg.vbc_geometry.sectors = 0; sc->vbsc_cfg.vbc_blk_size = sectsz; sc->vbsc_cfg.vbc_topology.physical_block_exp = (sts > sectsz) ? (ffsll(sts / sectsz) - 1) : 0; sc->vbsc_cfg.vbc_topology.alignment_offset = (sto != 0) ? ((sts - sto) / sectsz) : 0; sc->vbsc_cfg.vbc_topology.min_io_size = 0; sc->vbsc_cfg.vbc_topology.opt_io_size = 0; sc->vbsc_cfg.vbc_writeback = 0; + sc->vbsc_cfg.max_discard_sectors = VTBLK_MAX_DISCARD_SECT; + sc->vbsc_cfg.max_discard_seg = VTBLK_MAX_DISCARD_SEG; + sc->vbsc_cfg.discard_sector_alignment = sectsz / VTBLK_BSIZE; /* * Should we move some of this into virtio.c? Could * have the device, class, and subdev_0 as fields in * the virtio constants structure. */ pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_BLOCK); pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR); pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE); pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_BLOCK); pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR); if (vi_intr_init(&sc->vbsc_vs, 1, fbsdrun_virtio_msix())) { blockif_close(sc->bc); free(sc); return (1); } vi_set_io_bar(&sc->vbsc_vs, 0); return (0); } static int pci_vtblk_cfgwrite(void *vsc, int offset, int size, uint32_t value) { DPRINTF(("vtblk: write to readonly reg %d", offset)); return (1); } static int pci_vtblk_cfgread(void *vsc, int offset, int size, uint32_t *retval) { struct pci_vtblk_softc *sc = vsc; void *ptr; /* our caller has already verified offset and size */ ptr = (uint8_t *)&sc->vbsc_cfg + offset; memcpy(retval, ptr, size); return (0); } struct pci_devemu pci_de_vblk = { .pe_emu = "virtio-blk", .pe_init = pci_vtblk_init, .pe_barwrite = vi_pci_write, .pe_barread = vi_pci_read }; PCI_EMUL_SET(pci_de_vblk); Index: releng/12.2 =================================================================== --- releng/12.2 (revision 365701) +++ releng/12.2 (revision 365702) Property changes on: releng/12.2 ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,2 ## Merged /stable/12:r365614 Merged /head:r360229,363255