Index: stable/10/release/tools/ec2.conf =================================================================== --- stable/10/release/tools/ec2.conf (revision 287801) +++ stable/10/release/tools/ec2.conf (revision 287802) @@ -1,83 +1,88 @@ #!/bin/sh # # $FreeBSD$ # # Packages to install into the image we're creating. This is a deliberately # minimalist set, providing only the packages necessary to bootstrap further # package installation as specified via EC2 user-data. export VM_EXTRA_PACKAGES="ec2-scripts firstboot-freebsd-update firstboot-pkgs" # Set to a list of third-party software to enable in rc.conf(5). export VM_RC_LIST="ec2_configinit ec2_fetchkey ec2_ephemeralswap ec2_loghostkey firstboot_freebsd_update firstboot_pkgs" # Build with a 1.5 GB UFS partition; the growfs rc.d script will expand # the partition to fill the root disk after the EC2 instance is launched. # Note that if this is set to G, we will end up with an GB disk # image since VMSIZE is the size of the UFS partition, not the disk which # it resides within. export VMSIZE=1536M # No swap space; the ec2_ephemeralswap rc.d script will allocate swap # space on EC2 ephemeral disks. (If they exist -- the T2 low-cost instances # and the C4 compute-optimized instances don't have ephemeral disks. But # it would be silly to bloat the image and increase costs for every instance # just for those two families, especially since instances ranging in size # from 1 GB of RAM to 60 GB of RAM would need different sizes of swap space # anyway.) export NOSWAP=YES vm_extra_pre_umount() { # The firstboot_pkgs rc.d script will download the repository # catalogue and install or update pkg when the instance first # launches, so these files would just be replaced anyway; removing # them from the image allows it to boot faster. env ASSUME_ALWAYS_YES=yes pkg -c ${DESTDIR} delete -f -y pkg rm ${DESTDIR}/var/db/pkg/repo-*.sqlite # The size of the EC2 root disk can be configured at instance launch # time; expand our filesystem to fill the disk. echo 'growfs_enable="YES"' >> ${DESTDIR}/etc/rc.conf # EC2 instances use DHCP to get their network configuration. echo 'ifconfig_DEFAULT="SYNCDHCP"' >> ${DESTDIR}/etc/rc.conf # Unless the system has been configured via EC2 user-data, the user # will need to SSH in to do anything. echo 'sshd_enable="YES"' >> ${DESTDIR}/etc/rc.conf # The AWS CLI tools are generally useful, and small enough that they # will download quickly; but users will often override this setting # via EC2 user-data. echo 'firstboot_pkgs_list="awscli"' >> ${DESTDIR}/etc/rc.conf # The EC2 console is output-only, so while printing a backtrace can # be useful, there's no point dropping into a debugger or waiting # for a keypress. echo 'debug.trace_on_panic=1' >> ${DESTDIR}/etc/sysctl.conf echo 'debug.debugger_on_panic=0' >> ${DESTDIR}/etc/sysctl.conf echo 'kern.panic_reboot_wait_time=0' >> ${DESTDIR}/etc/sysctl.conf # The console is not interactive, so we might as well boot quickly. echo 'autoboot_delay="-1"' >> ${DESTDIR}/boot/loader.conf echo 'beastie_disable="YES"' >> ${DESTDIR}/boot/loader.conf # The EC2 console is an emulated serial port. echo 'console="comconsole"' >> ${DESTDIR}/boot/loader.conf # Some older EC2 hardware used a version of Xen with a bug in its # emulated serial port. It is not clear if EC2 still has any such # nodes, but apply the workaround just in case. echo 'hw.broken_txfifo="1"' >> ${DESTDIR}/boot/loader.conf + # Some EC2 instances suffer a significant (~40%) reduction in + # throughput when using blkif indirect segment I/Os. Disable this + # by default for now. + echo 'hw.xbd.xbd_enable_indirect="0"' >> ${DESTDIR}/boot/loader.conf + # The first time the AMI boots, the installed "first boot" scripts # should be allowed to run: # * ec2_configinit (download and process EC2 user-data) # * ec2_fetchkey (arrange for SSH using the EC2-provided public key) # * growfs (expand the filesystem to fill the provided disk) # * firstboot_freebsd_update (install critical updates) # * firstboot_pkgs (install packages) touch ${DESTDIR}/firstboot return 0 } Index: stable/10/sys/dev/xen/blkfront/blkfront.c =================================================================== --- stable/10/sys/dev/xen/blkfront/blkfront.c (revision 287801) +++ stable/10/sys/dev/xen/blkfront/blkfront.c (revision 287802) @@ -1,1498 +1,1569 @@ /* * XenBSD block device driver * * Copyright (c) 2010-2013 Spectra Logic Corporation * Copyright (c) 2009 Scott Long, Yahoo! * Copyright (c) 2009 Frank Suchomel, Citrix * Copyright (c) 2009 Doug F. Rabson, Citrix * Copyright (c) 2005 Kip Macy * Copyright (c) 2003-2004, Keir Fraser & Steve Hand * Modifications by Mark A. Williamson are (c) Intel Research Cambridge * * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to * deal in the Software without restriction, including without limitation the * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "xenbus_if.h" /*--------------------------- Forward Declarations ---------------------------*/ static void xbd_closing(device_t); static void xbd_startio(struct xbd_softc *sc); /*---------------------------------- Macros ----------------------------------*/ #if 0 #define DPRINTK(fmt, args...) printf("[XEN] %s:%d: " fmt ".\n", __func__, __LINE__, ##args) #else #define DPRINTK(fmt, args...) #endif #define XBD_SECTOR_SHFT 9 /*---------------------------- Global Static Data ----------------------------*/ static MALLOC_DEFINE(M_XENBLOCKFRONT, "xbd", "Xen Block Front driver data"); +static int xbd_enable_indirect = 1; +SYSCTL_NODE(_hw, OID_AUTO, xbd, CTLFLAG_RD, 0, "xbd driver parameters"); +SYSCTL_INT(_hw_xbd, OID_AUTO, xbd_enable_indirect, CTLFLAG_RDTUN, + &xbd_enable_indirect, 0, "Enable xbd indirect segments"); + /*---------------------------- Command Processing ----------------------------*/ static void xbd_freeze(struct xbd_softc *sc, xbd_flag_t xbd_flag) { if (xbd_flag != XBDF_NONE && (sc->xbd_flags & xbd_flag) != 0) return; sc->xbd_flags |= xbd_flag; sc->xbd_qfrozen_cnt++; } static void xbd_thaw(struct xbd_softc *sc, xbd_flag_t xbd_flag) { if (xbd_flag != XBDF_NONE && (sc->xbd_flags & xbd_flag) == 0) return; if (sc->xbd_qfrozen_cnt == 0) panic("%s: Thaw with flag 0x%x while not frozen.", __func__, xbd_flag); sc->xbd_flags &= ~xbd_flag; sc->xbd_qfrozen_cnt--; } static void xbd_cm_freeze(struct xbd_softc *sc, struct xbd_command *cm, xbdc_flag_t cm_flag) { if ((cm->cm_flags & XBDCF_FROZEN) != 0) return; cm->cm_flags |= XBDCF_FROZEN|cm_flag; xbd_freeze(sc, XBDF_NONE); } static void xbd_cm_thaw(struct xbd_softc *sc, struct xbd_command *cm) { if ((cm->cm_flags & XBDCF_FROZEN) == 0) return; cm->cm_flags &= ~XBDCF_FROZEN; xbd_thaw(sc, XBDF_NONE); } static inline void xbd_flush_requests(struct xbd_softc *sc) { int notify; RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&sc->xbd_ring, notify); if (notify) xen_intr_signal(sc->xen_intr_handle); } static void xbd_free_command(struct xbd_command *cm) { KASSERT((cm->cm_flags & XBDCF_Q_MASK) == XBD_Q_NONE, ("Freeing command that is still on queue %d.", cm->cm_flags & XBDCF_Q_MASK)); cm->cm_flags = XBDCF_INITIALIZER; cm->cm_bp = NULL; cm->cm_complete = NULL; xbd_enqueue_cm(cm, XBD_Q_FREE); xbd_thaw(cm->cm_sc, XBDF_CM_SHORTAGE); } static void xbd_mksegarray(bus_dma_segment_t *segs, int nsegs, grant_ref_t * gref_head, int otherend_id, int readonly, grant_ref_t * sg_ref, blkif_request_segment_t * sg) { struct blkif_request_segment *last_block_sg = sg + nsegs; vm_paddr_t buffer_ma; uint64_t fsect, lsect; int ref; while (sg < last_block_sg) { buffer_ma = segs->ds_addr; fsect = (buffer_ma & PAGE_MASK) >> XBD_SECTOR_SHFT; lsect = fsect + (segs->ds_len >> XBD_SECTOR_SHFT) - 1; KASSERT(lsect <= 7, ("XEN disk driver data cannot " "cross a page boundary")); /* install a grant reference. */ ref = gnttab_claim_grant_reference(gref_head); /* * GNTTAB_LIST_END == 0xffffffff, but it is private * to gnttab.c. */ KASSERT(ref != ~0, ("grant_reference failed")); gnttab_grant_foreign_access_ref( ref, otherend_id, buffer_ma >> PAGE_SHIFT, readonly); *sg_ref = ref; *sg = (struct blkif_request_segment) { .gref = ref, .first_sect = fsect, .last_sect = lsect }; sg++; sg_ref++; segs++; } } static void xbd_queue_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error) { struct xbd_softc *sc; struct xbd_command *cm; - blkif_request_t *ring_req; int op; cm = arg; sc = cm->cm_sc; if (error) { cm->cm_bp->bio_error = EIO; biodone(cm->cm_bp); xbd_free_command(cm); return; } - KASSERT(nsegs <= BLKIF_MAX_SEGMENTS_PER_REQUEST, + KASSERT(nsegs <= sc->xbd_max_request_segments, ("Too many segments in a blkfront I/O")); - /* Fill out a communications ring structure. */ - ring_req = RING_GET_REQUEST(&sc->xbd_ring, sc->xbd_ring.req_prod_pvt); - sc->xbd_ring.req_prod_pvt++; - ring_req->id = cm->cm_id; - ring_req->operation = cm->cm_operation; - ring_req->sector_number = cm->cm_sector_number; - ring_req->handle = (blkif_vdev_t)(uintptr_t)sc->xbd_disk; - ring_req->nr_segments = nsegs; - cm->cm_nseg = nsegs; - xbd_mksegarray(segs, nsegs, &cm->cm_gref_head, - xenbus_get_otherend_id(sc->xbd_dev), - cm->cm_operation == BLKIF_OP_WRITE, - cm->cm_sg_refs, ring_req->seg); + if (nsegs <= BLKIF_MAX_SEGMENTS_PER_REQUEST) { + blkif_request_t *ring_req; + /* Fill out a blkif_request_t structure. */ + ring_req = (blkif_request_t *) + RING_GET_REQUEST(&sc->xbd_ring, sc->xbd_ring.req_prod_pvt); + sc->xbd_ring.req_prod_pvt++; + ring_req->id = cm->cm_id; + ring_req->operation = cm->cm_operation; + ring_req->sector_number = cm->cm_sector_number; + ring_req->handle = (blkif_vdev_t)(uintptr_t)sc->xbd_disk; + ring_req->nr_segments = nsegs; + cm->cm_nseg = nsegs; + xbd_mksegarray(segs, nsegs, &cm->cm_gref_head, + xenbus_get_otherend_id(sc->xbd_dev), + cm->cm_operation == BLKIF_OP_WRITE, + cm->cm_sg_refs, ring_req->seg); + } else { + blkif_request_indirect_t *ring_req; + + /* Fill out a blkif_request_indirect_t structure. */ + ring_req = (blkif_request_indirect_t *) + RING_GET_REQUEST(&sc->xbd_ring, sc->xbd_ring.req_prod_pvt); + sc->xbd_ring.req_prod_pvt++; + ring_req->id = cm->cm_id; + ring_req->operation = BLKIF_OP_INDIRECT; + ring_req->indirect_op = cm->cm_operation; + ring_req->sector_number = cm->cm_sector_number; + ring_req->handle = (blkif_vdev_t)(uintptr_t)sc->xbd_disk; + ring_req->nr_segments = nsegs; + cm->cm_nseg = nsegs; + xbd_mksegarray(segs, nsegs, &cm->cm_gref_head, + xenbus_get_otherend_id(sc->xbd_dev), + cm->cm_operation == BLKIF_OP_WRITE, + cm->cm_sg_refs, cm->cm_indirectionpages); + memcpy(ring_req->indirect_grefs, &cm->cm_indirectionrefs, + sizeof(grant_ref_t) * sc->xbd_max_request_indirectpages); + } + if (cm->cm_operation == BLKIF_OP_READ) op = BUS_DMASYNC_PREREAD; else if (cm->cm_operation == BLKIF_OP_WRITE) op = BUS_DMASYNC_PREWRITE; else op = 0; bus_dmamap_sync(sc->xbd_io_dmat, cm->cm_map, op); gnttab_free_grant_references(cm->cm_gref_head); xbd_enqueue_cm(cm, XBD_Q_BUSY); /* * If bus dma had to asynchronously call us back to dispatch * this command, we are no longer executing in the context of * xbd_startio(). Thus we cannot rely on xbd_startio()'s call to * xbd_flush_requests() to publish this command to the backend * along with any other commands that it could batch. */ if ((cm->cm_flags & XBDCF_ASYNC_MAPPING) != 0) xbd_flush_requests(sc); return; } static int xbd_queue_request(struct xbd_softc *sc, struct xbd_command *cm) { int error; error = bus_dmamap_load(sc->xbd_io_dmat, cm->cm_map, cm->cm_data, cm->cm_datalen, xbd_queue_cb, cm, 0); if (error == EINPROGRESS) { /* * Maintain queuing order by freezing the queue. The next * command may not require as many resources as the command * we just attempted to map, so we can't rely on bus dma * blocking for it too. */ xbd_cm_freeze(sc, cm, XBDCF_ASYNC_MAPPING); return (0); } return (error); } static void xbd_restart_queue_callback(void *arg) { struct xbd_softc *sc = arg; mtx_lock(&sc->xbd_io_lock); xbd_thaw(sc, XBDF_GNT_SHORTAGE); xbd_startio(sc); mtx_unlock(&sc->xbd_io_lock); } static struct xbd_command * xbd_bio_command(struct xbd_softc *sc) { struct xbd_command *cm; struct bio *bp; if (__predict_false(sc->xbd_state != XBD_STATE_CONNECTED)) return (NULL); bp = xbd_dequeue_bio(sc); if (bp == NULL) return (NULL); if ((cm = xbd_dequeue_cm(sc, XBD_Q_FREE)) == NULL) { xbd_freeze(sc, XBDF_CM_SHORTAGE); xbd_requeue_bio(sc, bp); return (NULL); } if (gnttab_alloc_grant_references(sc->xbd_max_request_segments, &cm->cm_gref_head) != 0) { gnttab_request_free_callback(&sc->xbd_callback, xbd_restart_queue_callback, sc, sc->xbd_max_request_segments); xbd_freeze(sc, XBDF_GNT_SHORTAGE); xbd_requeue_bio(sc, bp); xbd_enqueue_cm(cm, XBD_Q_FREE); return (NULL); } cm->cm_bp = bp; cm->cm_data = bp->bio_data; cm->cm_datalen = bp->bio_bcount; cm->cm_sector_number = (blkif_sector_t)bp->bio_pblkno; switch (bp->bio_cmd) { case BIO_READ: cm->cm_operation = BLKIF_OP_READ; break; case BIO_WRITE: cm->cm_operation = BLKIF_OP_WRITE; if ((bp->bio_flags & BIO_ORDERED) != 0) { if ((sc->xbd_flags & XBDF_BARRIER) != 0) { cm->cm_operation = BLKIF_OP_WRITE_BARRIER; } else { /* * Single step this command. */ cm->cm_flags |= XBDCF_Q_FREEZE; if (xbd_queue_length(sc, XBD_Q_BUSY) != 0) { /* * Wait for in-flight requests to * finish. */ xbd_freeze(sc, XBDF_WAIT_IDLE); xbd_requeue_cm(cm, XBD_Q_READY); return (NULL); } } } break; case BIO_FLUSH: if ((sc->xbd_flags & XBDF_FLUSH) != 0) cm->cm_operation = BLKIF_OP_FLUSH_DISKCACHE; else if ((sc->xbd_flags & XBDF_BARRIER) != 0) cm->cm_operation = BLKIF_OP_WRITE_BARRIER; else panic("flush request, but no flush support available"); break; default: panic("unknown bio command %d", bp->bio_cmd); } return (cm); } /* * Dequeue buffers and place them in the shared communication ring. * Return when no more requests can be accepted or all buffers have * been queued. * * Signal XEN once the ring has been filled out. */ static void xbd_startio(struct xbd_softc *sc) { struct xbd_command *cm; int error, queued = 0; mtx_assert(&sc->xbd_io_lock, MA_OWNED); if (sc->xbd_state != XBD_STATE_CONNECTED) return; while (!RING_FULL(&sc->xbd_ring)) { if (sc->xbd_qfrozen_cnt != 0) break; cm = xbd_dequeue_cm(sc, XBD_Q_READY); if (cm == NULL) cm = xbd_bio_command(sc); if (cm == NULL) break; if ((cm->cm_flags & XBDCF_Q_FREEZE) != 0) { /* * Single step command. Future work is * held off until this command completes. */ xbd_cm_freeze(sc, cm, XBDCF_Q_FREEZE); } if ((error = xbd_queue_request(sc, cm)) != 0) { printf("xbd_queue_request returned %d\n", error); break; } queued++; } if (queued != 0) xbd_flush_requests(sc); } static void xbd_bio_complete(struct xbd_softc *sc, struct xbd_command *cm) { struct bio *bp; bp = cm->cm_bp; if (__predict_false(cm->cm_status != BLKIF_RSP_OKAY)) { disk_err(bp, "disk error" , -1, 0); printf(" status: %x\n", cm->cm_status); bp->bio_flags |= BIO_ERROR; } if (bp->bio_flags & BIO_ERROR) bp->bio_error = EIO; else bp->bio_resid = 0; xbd_free_command(cm); biodone(bp); } static void xbd_int(void *xsc) { struct xbd_softc *sc = xsc; struct xbd_command *cm; blkif_response_t *bret; RING_IDX i, rp; int op; mtx_lock(&sc->xbd_io_lock); if (__predict_false(sc->xbd_state == XBD_STATE_DISCONNECTED)) { mtx_unlock(&sc->xbd_io_lock); return; } again: rp = sc->xbd_ring.sring->rsp_prod; rmb(); /* Ensure we see queued responses up to 'rp'. */ for (i = sc->xbd_ring.rsp_cons; i != rp;) { bret = RING_GET_RESPONSE(&sc->xbd_ring, i); cm = &sc->xbd_shadow[bret->id]; xbd_remove_cm(cm, XBD_Q_BUSY); gnttab_end_foreign_access_references(cm->cm_nseg, cm->cm_sg_refs); i++; if (cm->cm_operation == BLKIF_OP_READ) op = BUS_DMASYNC_POSTREAD; else if (cm->cm_operation == BLKIF_OP_WRITE || cm->cm_operation == BLKIF_OP_WRITE_BARRIER) op = BUS_DMASYNC_POSTWRITE; else op = 0; bus_dmamap_sync(sc->xbd_io_dmat, cm->cm_map, op); bus_dmamap_unload(sc->xbd_io_dmat, cm->cm_map); /* * Release any hold this command has on future command * dispatch. */ xbd_cm_thaw(sc, cm); /* * Directly call the i/o complete routine to save an * an indirection in the common case. */ cm->cm_status = bret->status; if (cm->cm_bp) xbd_bio_complete(sc, cm); else if (cm->cm_complete != NULL) cm->cm_complete(cm); else xbd_free_command(cm); } sc->xbd_ring.rsp_cons = i; if (i != sc->xbd_ring.req_prod_pvt) { int more_to_do; RING_FINAL_CHECK_FOR_RESPONSES(&sc->xbd_ring, more_to_do); if (more_to_do) goto again; } else { sc->xbd_ring.sring->rsp_event = i + 1; } if (xbd_queue_length(sc, XBD_Q_BUSY) == 0) xbd_thaw(sc, XBDF_WAIT_IDLE); xbd_startio(sc); if (__predict_false(sc->xbd_state == XBD_STATE_SUSPENDED)) wakeup(&sc->xbd_cm_q[XBD_Q_BUSY]); mtx_unlock(&sc->xbd_io_lock); } /*------------------------------- Dump Support -------------------------------*/ /** * Quiesce the disk writes for a dump file before allowing the next buffer. */ static void xbd_quiesce(struct xbd_softc *sc) { int mtd; // While there are outstanding requests while (xbd_queue_length(sc, XBD_Q_BUSY) != 0) { RING_FINAL_CHECK_FOR_RESPONSES(&sc->xbd_ring, mtd); if (mtd) { /* Recieved request completions, update queue. */ xbd_int(sc); } if (xbd_queue_length(sc, XBD_Q_BUSY) != 0) { /* * Still pending requests, wait for the disk i/o * to complete. */ HYPERVISOR_yield(); } } } /* Kernel dump function for a paravirtualized disk device */ static void xbd_dump_complete(struct xbd_command *cm) { xbd_enqueue_cm(cm, XBD_Q_COMPLETE); } static int xbd_dump(void *arg, void *virtual, vm_offset_t physical, off_t offset, size_t length) { struct disk *dp = arg; struct xbd_softc *sc = dp->d_drv1; struct xbd_command *cm; size_t chunk; int sbp; int rc = 0; if (length <= 0) return (rc); xbd_quiesce(sc); /* All quiet on the western front. */ /* * If this lock is held, then this module is failing, and a * successful kernel dump is highly unlikely anyway. */ mtx_lock(&sc->xbd_io_lock); /* Split the 64KB block as needed */ for (sbp=0; length > 0; sbp++) { cm = xbd_dequeue_cm(sc, XBD_Q_FREE); if (cm == NULL) { mtx_unlock(&sc->xbd_io_lock); device_printf(sc->xbd_dev, "dump: no more commands?\n"); return (EBUSY); } if (gnttab_alloc_grant_references(sc->xbd_max_request_segments, &cm->cm_gref_head) != 0) { xbd_free_command(cm); mtx_unlock(&sc->xbd_io_lock); device_printf(sc->xbd_dev, "no more grant allocs?\n"); return (EBUSY); } chunk = length > sc->xbd_max_request_size ? sc->xbd_max_request_size : length; cm->cm_data = virtual; cm->cm_datalen = chunk; cm->cm_operation = BLKIF_OP_WRITE; cm->cm_sector_number = offset / dp->d_sectorsize; cm->cm_complete = xbd_dump_complete; xbd_enqueue_cm(cm, XBD_Q_READY); length -= chunk; offset += chunk; virtual = (char *) virtual + chunk; } /* Tell DOM0 to do the I/O */ xbd_startio(sc); mtx_unlock(&sc->xbd_io_lock); /* Poll for the completion. */ xbd_quiesce(sc); /* All quite on the eastern front */ /* If there were any errors, bail out... */ while ((cm = xbd_dequeue_cm(sc, XBD_Q_COMPLETE)) != NULL) { if (cm->cm_status != BLKIF_RSP_OKAY) { device_printf(sc->xbd_dev, "Dump I/O failed at sector %jd\n", cm->cm_sector_number); rc = EIO; } xbd_free_command(cm); } return (rc); } /*----------------------------- Disk Entrypoints -----------------------------*/ static int xbd_open(struct disk *dp) { struct xbd_softc *sc = dp->d_drv1; if (sc == NULL) { printf("xb%d: not found", sc->xbd_unit); return (ENXIO); } sc->xbd_flags |= XBDF_OPEN; sc->xbd_users++; return (0); } static int xbd_close(struct disk *dp) { struct xbd_softc *sc = dp->d_drv1; if (sc == NULL) return (ENXIO); sc->xbd_flags &= ~XBDF_OPEN; if (--(sc->xbd_users) == 0) { /* * Check whether we have been instructed to close. We will * have ignored this request initially, as the device was * still mounted. */ if (xenbus_get_otherend_state(sc->xbd_dev) == XenbusStateClosing) xbd_closing(sc->xbd_dev); } return (0); } static int xbd_ioctl(struct disk *dp, u_long cmd, void *addr, int flag, struct thread *td) { struct xbd_softc *sc = dp->d_drv1; if (sc == NULL) return (ENXIO); return (ENOTTY); } /* * Read/write routine for a buffer. Finds the proper unit, place it on * the sortq and kick the controller. */ static void xbd_strategy(struct bio *bp) { struct xbd_softc *sc = bp->bio_disk->d_drv1; /* bogus disk? */ if (sc == NULL) { bp->bio_error = EINVAL; bp->bio_flags |= BIO_ERROR; bp->bio_resid = bp->bio_bcount; biodone(bp); return; } /* * Place it in the queue of disk activities for this disk */ mtx_lock(&sc->xbd_io_lock); xbd_enqueue_bio(sc, bp); xbd_startio(sc); mtx_unlock(&sc->xbd_io_lock); return; } /*------------------------------ Ring Management -----------------------------*/ static int xbd_alloc_ring(struct xbd_softc *sc) { blkif_sring_t *sring; uintptr_t sring_page_addr; int error; int i; sring = malloc(sc->xbd_ring_pages * PAGE_SIZE, M_XENBLOCKFRONT, M_NOWAIT|M_ZERO); if (sring == NULL) { xenbus_dev_fatal(sc->xbd_dev, ENOMEM, "allocating shared ring"); return (ENOMEM); } SHARED_RING_INIT(sring); FRONT_RING_INIT(&sc->xbd_ring, sring, sc->xbd_ring_pages * PAGE_SIZE); for (i = 0, sring_page_addr = (uintptr_t)sring; i < sc->xbd_ring_pages; i++, sring_page_addr += PAGE_SIZE) { error = xenbus_grant_ring(sc->xbd_dev, (vtomach(sring_page_addr) >> PAGE_SHIFT), &sc->xbd_ring_ref[i]); if (error) { xenbus_dev_fatal(sc->xbd_dev, error, "granting ring_ref(%d)", i); return (error); } } if (sc->xbd_ring_pages == 1) { error = xs_printf(XST_NIL, xenbus_get_node(sc->xbd_dev), "ring-ref", "%u", sc->xbd_ring_ref[0]); if (error) { xenbus_dev_fatal(sc->xbd_dev, error, "writing %s/ring-ref", xenbus_get_node(sc->xbd_dev)); return (error); } } else { for (i = 0; i < sc->xbd_ring_pages; i++) { char ring_ref_name[]= "ring_refXX"; snprintf(ring_ref_name, sizeof(ring_ref_name), "ring-ref%u", i); error = xs_printf(XST_NIL, xenbus_get_node(sc->xbd_dev), ring_ref_name, "%u", sc->xbd_ring_ref[i]); if (error) { xenbus_dev_fatal(sc->xbd_dev, error, "writing %s/%s", xenbus_get_node(sc->xbd_dev), ring_ref_name); return (error); } } } error = xen_intr_alloc_and_bind_local_port(sc->xbd_dev, xenbus_get_otherend_id(sc->xbd_dev), NULL, xbd_int, sc, INTR_TYPE_BIO | INTR_MPSAFE, &sc->xen_intr_handle); if (error) { xenbus_dev_fatal(sc->xbd_dev, error, "xen_intr_alloc_and_bind_local_port failed"); return (error); } return (0); } static void xbd_free_ring(struct xbd_softc *sc) { int i; if (sc->xbd_ring.sring == NULL) return; for (i = 0; i < sc->xbd_ring_pages; i++) { if (sc->xbd_ring_ref[i] != GRANT_REF_INVALID) { gnttab_end_foreign_access_ref(sc->xbd_ring_ref[i]); sc->xbd_ring_ref[i] = GRANT_REF_INVALID; } } free(sc->xbd_ring.sring, M_XENBLOCKFRONT); sc->xbd_ring.sring = NULL; } /*-------------------------- Initialization/Teardown -------------------------*/ static int xbd_feature_string(struct xbd_softc *sc, char *features, size_t len) { struct sbuf sb; int feature_cnt; sbuf_new(&sb, features, len, SBUF_FIXEDLEN); feature_cnt = 0; if ((sc->xbd_flags & XBDF_FLUSH) != 0) { sbuf_printf(&sb, "flush"); feature_cnt++; } if ((sc->xbd_flags & XBDF_BARRIER) != 0) { if (feature_cnt != 0) sbuf_printf(&sb, ", "); sbuf_printf(&sb, "write_barrier"); feature_cnt++; } (void) sbuf_finish(&sb); return (sbuf_len(&sb)); } static int xbd_sysctl_features(SYSCTL_HANDLER_ARGS) { char features[80]; struct xbd_softc *sc = arg1; int error; int len; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); len = xbd_feature_string(sc, features, sizeof(features)); /* len is -1 on error, which will make the SYSCTL_OUT a no-op. */ return (SYSCTL_OUT(req, features, len + 1/*NUL*/)); } static void xbd_setup_sysctl(struct xbd_softc *xbd) { struct sysctl_ctx_list *sysctl_ctx = NULL; struct sysctl_oid *sysctl_tree = NULL; struct sysctl_oid_list *children; sysctl_ctx = device_get_sysctl_ctx(xbd->xbd_dev); if (sysctl_ctx == NULL) return; sysctl_tree = device_get_sysctl_tree(xbd->xbd_dev); if (sysctl_tree == NULL) return; children = SYSCTL_CHILDREN(sysctl_tree); SYSCTL_ADD_UINT(sysctl_ctx, children, OID_AUTO, "max_requests", CTLFLAG_RD, &xbd->xbd_max_requests, -1, "maximum outstanding requests (negotiated)"); SYSCTL_ADD_UINT(sysctl_ctx, children, OID_AUTO, "max_request_segments", CTLFLAG_RD, &xbd->xbd_max_request_segments, 0, "maximum number of pages per requests (negotiated)"); SYSCTL_ADD_UINT(sysctl_ctx, children, OID_AUTO, "max_request_size", CTLFLAG_RD, &xbd->xbd_max_request_size, 0, "maximum size in bytes of a request (negotiated)"); SYSCTL_ADD_UINT(sysctl_ctx, children, OID_AUTO, "ring_pages", CTLFLAG_RD, &xbd->xbd_ring_pages, 0, "communication channel pages (negotiated)"); SYSCTL_ADD_PROC(sysctl_ctx, children, OID_AUTO, "features", CTLTYPE_STRING|CTLFLAG_RD, xbd, 0, xbd_sysctl_features, "A", "protocol features (negotiated)"); } /* * Translate Linux major/minor to an appropriate name and unit * number. For HVM guests, this allows us to use the same drive names * with blkfront as the emulated drives, easing transition slightly. */ static void xbd_vdevice_to_unit(uint32_t vdevice, int *unit, const char **name) { static struct vdev_info { int major; int shift; int base; const char *name; } info[] = { {3, 6, 0, "ada"}, /* ide0 */ {22, 6, 2, "ada"}, /* ide1 */ {33, 6, 4, "ada"}, /* ide2 */ {34, 6, 6, "ada"}, /* ide3 */ {56, 6, 8, "ada"}, /* ide4 */ {57, 6, 10, "ada"}, /* ide5 */ {88, 6, 12, "ada"}, /* ide6 */ {89, 6, 14, "ada"}, /* ide7 */ {90, 6, 16, "ada"}, /* ide8 */ {91, 6, 18, "ada"}, /* ide9 */ {8, 4, 0, "da"}, /* scsi disk0 */ {65, 4, 16, "da"}, /* scsi disk1 */ {66, 4, 32, "da"}, /* scsi disk2 */ {67, 4, 48, "da"}, /* scsi disk3 */ {68, 4, 64, "da"}, /* scsi disk4 */ {69, 4, 80, "da"}, /* scsi disk5 */ {70, 4, 96, "da"}, /* scsi disk6 */ {71, 4, 112, "da"}, /* scsi disk7 */ {128, 4, 128, "da"}, /* scsi disk8 */ {129, 4, 144, "da"}, /* scsi disk9 */ {130, 4, 160, "da"}, /* scsi disk10 */ {131, 4, 176, "da"}, /* scsi disk11 */ {132, 4, 192, "da"}, /* scsi disk12 */ {133, 4, 208, "da"}, /* scsi disk13 */ {134, 4, 224, "da"}, /* scsi disk14 */ {135, 4, 240, "da"}, /* scsi disk15 */ {202, 4, 0, "xbd"}, /* xbd */ {0, 0, 0, NULL}, }; int major = vdevice >> 8; int minor = vdevice & 0xff; int i; if (vdevice & (1 << 28)) { *unit = (vdevice & ((1 << 28) - 1)) >> 8; *name = "xbd"; return; } for (i = 0; info[i].major; i++) { if (info[i].major == major) { *unit = info[i].base + (minor >> info[i].shift); *name = info[i].name; return; } } *unit = minor >> 4; *name = "xbd"; } int xbd_instance_create(struct xbd_softc *sc, blkif_sector_t sectors, int vdevice, uint16_t vdisk_info, unsigned long sector_size) { char features[80]; int unit, error = 0; const char *name; xbd_vdevice_to_unit(vdevice, &unit, &name); sc->xbd_unit = unit; if (strcmp(name, "xbd") != 0) device_printf(sc->xbd_dev, "attaching as %s%d\n", name, unit); if (xbd_feature_string(sc, features, sizeof(features)) > 0) { device_printf(sc->xbd_dev, "features: %s\n", features); } sc->xbd_disk = disk_alloc(); sc->xbd_disk->d_unit = sc->xbd_unit; sc->xbd_disk->d_open = xbd_open; sc->xbd_disk->d_close = xbd_close; sc->xbd_disk->d_ioctl = xbd_ioctl; sc->xbd_disk->d_strategy = xbd_strategy; sc->xbd_disk->d_dump = xbd_dump; sc->xbd_disk->d_name = name; sc->xbd_disk->d_drv1 = sc; sc->xbd_disk->d_sectorsize = sector_size; sc->xbd_disk->d_mediasize = sectors * sector_size; sc->xbd_disk->d_maxsize = sc->xbd_max_request_size; sc->xbd_disk->d_flags = 0; if ((sc->xbd_flags & (XBDF_FLUSH|XBDF_BARRIER)) != 0) { sc->xbd_disk->d_flags |= DISKFLAG_CANFLUSHCACHE; device_printf(sc->xbd_dev, "synchronize cache commands enabled.\n"); } disk_create(sc->xbd_disk, DISK_VERSION); return error; } static void xbd_free(struct xbd_softc *sc) { int i; /* Prevent new requests being issued until we fix things up. */ mtx_lock(&sc->xbd_io_lock); sc->xbd_state = XBD_STATE_DISCONNECTED; mtx_unlock(&sc->xbd_io_lock); /* Free resources associated with old device channel. */ xbd_free_ring(sc); if (sc->xbd_shadow) { for (i = 0; i < sc->xbd_max_requests; i++) { struct xbd_command *cm; cm = &sc->xbd_shadow[i]; if (cm->cm_sg_refs != NULL) { free(cm->cm_sg_refs, M_XENBLOCKFRONT); cm->cm_sg_refs = NULL; } + if (cm->cm_indirectionpages != NULL) { + gnttab_end_foreign_access_references( + sc->xbd_max_request_indirectpages, + &cm->cm_indirectionrefs[0]); + contigfree(cm->cm_indirectionpages, PAGE_SIZE * + sc->xbd_max_request_indirectpages, + M_XENBLOCKFRONT); + cm->cm_indirectionpages = NULL; + } + bus_dmamap_destroy(sc->xbd_io_dmat, cm->cm_map); } free(sc->xbd_shadow, M_XENBLOCKFRONT); sc->xbd_shadow = NULL; bus_dma_tag_destroy(sc->xbd_io_dmat); xbd_initq_cm(sc, XBD_Q_FREE); xbd_initq_cm(sc, XBD_Q_READY); xbd_initq_cm(sc, XBD_Q_COMPLETE); } xen_intr_unbind(&sc->xen_intr_handle); } /*--------------------------- State Change Handlers --------------------------*/ static void xbd_initialize(struct xbd_softc *sc) { const char *otherend_path; const char *node_path; uint32_t max_ring_page_order; int error; if (xenbus_get_state(sc->xbd_dev) != XenbusStateInitialising) { /* Initialization has already been performed. */ return; } /* * Protocol defaults valid even if negotiation for a * setting fails. */ max_ring_page_order = 0; sc->xbd_ring_pages = 1; - sc->xbd_max_request_segments = BLKIF_MAX_SEGMENTS_PER_REQUEST; - sc->xbd_max_request_size = - XBD_SEGS_TO_SIZE(sc->xbd_max_request_segments); /* * Protocol negotiation. * * \note xs_gather() returns on the first encountered error, so * we must use independant calls in order to guarantee * we don't miss information in a sparsly populated back-end * tree. * * \note xs_scanf() does not update variables for unmatched * fields. */ otherend_path = xenbus_get_otherend_path(sc->xbd_dev); node_path = xenbus_get_node(sc->xbd_dev); /* Support both backend schemes for relaying ring page limits. */ (void)xs_scanf(XST_NIL, otherend_path, "max-ring-page-order", NULL, "%" PRIu32, &max_ring_page_order); sc->xbd_ring_pages = 1 << max_ring_page_order; (void)xs_scanf(XST_NIL, otherend_path, "max-ring-pages", NULL, "%" PRIu32, &sc->xbd_ring_pages); if (sc->xbd_ring_pages < 1) sc->xbd_ring_pages = 1; if (sc->xbd_ring_pages > XBD_MAX_RING_PAGES) { device_printf(sc->xbd_dev, "Back-end specified ring-pages of %u " "limited to front-end limit of %u.\n", sc->xbd_ring_pages, XBD_MAX_RING_PAGES); sc->xbd_ring_pages = XBD_MAX_RING_PAGES; } if (powerof2(sc->xbd_ring_pages) == 0) { uint32_t new_page_limit; new_page_limit = 0x01 << (fls(sc->xbd_ring_pages) - 1); device_printf(sc->xbd_dev, "Back-end specified ring-pages of %u " "is not a power of 2. Limited to %u.\n", sc->xbd_ring_pages, new_page_limit); sc->xbd_ring_pages = new_page_limit; } sc->xbd_max_requests = BLKIF_MAX_RING_REQUESTS(sc->xbd_ring_pages * PAGE_SIZE); if (sc->xbd_max_requests > XBD_MAX_REQUESTS) { device_printf(sc->xbd_dev, "Back-end specified max_requests of %u " "limited to front-end limit of %zu.\n", sc->xbd_max_requests, XBD_MAX_REQUESTS); sc->xbd_max_requests = XBD_MAX_REQUESTS; } if (xbd_alloc_ring(sc) != 0) return; /* Support both backend schemes for relaying ring page limits. */ if (sc->xbd_ring_pages > 1) { error = xs_printf(XST_NIL, node_path, "num-ring-pages","%u", sc->xbd_ring_pages); if (error) { xenbus_dev_fatal(sc->xbd_dev, error, "writing %s/num-ring-pages", node_path); return; } error = xs_printf(XST_NIL, node_path, "ring-page-order", "%u", fls(sc->xbd_ring_pages) - 1); if (error) { xenbus_dev_fatal(sc->xbd_dev, error, "writing %s/ring-page-order", node_path); return; } } error = xs_printf(XST_NIL, node_path, "event-channel", "%u", xen_intr_port(sc->xen_intr_handle)); if (error) { xenbus_dev_fatal(sc->xbd_dev, error, "writing %s/event-channel", node_path); return; } error = xs_printf(XST_NIL, node_path, "protocol", "%s", XEN_IO_PROTO_ABI_NATIVE); if (error) { xenbus_dev_fatal(sc->xbd_dev, error, "writing %s/protocol", node_path); return; } xenbus_set_state(sc->xbd_dev, XenbusStateInitialised); } /* * Invoked when the backend is finally 'ready' (and has published * the details about the physical device - #sectors, size, etc). */ static void xbd_connect(struct xbd_softc *sc) { device_t dev = sc->xbd_dev; unsigned long sectors, sector_size; unsigned int binfo; int err, feature_barrier, feature_flush; - int i; + int i, j; if (sc->xbd_state == XBD_STATE_CONNECTED || sc->xbd_state == XBD_STATE_SUSPENDED) return; DPRINTK("blkfront.c:connect:%s.\n", xenbus_get_otherend_path(dev)); err = xs_gather(XST_NIL, xenbus_get_otherend_path(dev), "sectors", "%lu", §ors, "info", "%u", &binfo, "sector-size", "%lu", §or_size, NULL); if (err) { xenbus_dev_fatal(dev, err, "reading backend fields at %s", xenbus_get_otherend_path(dev)); return; } err = xs_gather(XST_NIL, xenbus_get_otherend_path(dev), "feature-barrier", "%lu", &feature_barrier, NULL); if (err == 0 && feature_barrier != 0) sc->xbd_flags |= XBDF_BARRIER; err = xs_gather(XST_NIL, xenbus_get_otherend_path(dev), "feature-flush-cache", "%lu", &feature_flush, NULL); if (err == 0 && feature_flush != 0) sc->xbd_flags |= XBDF_FLUSH; + err = xs_gather(XST_NIL, xenbus_get_otherend_path(dev), + "feature-max-indirect-segments", "%" PRIu32, + &sc->xbd_max_request_segments, NULL); + if ((err != 0) || (xbd_enable_indirect == 0)) + sc->xbd_max_request_segments = 0; + if (sc->xbd_max_request_segments > XBD_MAX_INDIRECT_SEGMENTS) + sc->xbd_max_request_segments = XBD_MAX_INDIRECT_SEGMENTS; + if (sc->xbd_max_request_segments > XBD_SIZE_TO_SEGS(MAXPHYS)) + sc->xbd_max_request_segments = XBD_SIZE_TO_SEGS(MAXPHYS); + sc->xbd_max_request_indirectpages = + XBD_INDIRECT_SEGS_TO_PAGES(sc->xbd_max_request_segments); + if (sc->xbd_max_request_segments < BLKIF_MAX_SEGMENTS_PER_REQUEST) + sc->xbd_max_request_segments = BLKIF_MAX_SEGMENTS_PER_REQUEST; + sc->xbd_max_request_size = + XBD_SEGS_TO_SIZE(sc->xbd_max_request_segments); + /* Allocate datastructures based on negotiated values. */ err = bus_dma_tag_create( bus_get_dma_tag(sc->xbd_dev), /* parent */ 512, PAGE_SIZE, /* algnmnt, boundary */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ sc->xbd_max_request_size, sc->xbd_max_request_segments, PAGE_SIZE, /* maxsegsize */ BUS_DMA_ALLOCNOW, /* flags */ busdma_lock_mutex, /* lockfunc */ &sc->xbd_io_lock, /* lockarg */ &sc->xbd_io_dmat); if (err != 0) { xenbus_dev_fatal(sc->xbd_dev, err, "Cannot allocate parent DMA tag\n"); return; } /* Per-transaction data allocation. */ sc->xbd_shadow = malloc(sizeof(*sc->xbd_shadow) * sc->xbd_max_requests, M_XENBLOCKFRONT, M_NOWAIT|M_ZERO); if (sc->xbd_shadow == NULL) { bus_dma_tag_destroy(sc->xbd_io_dmat); xenbus_dev_fatal(sc->xbd_dev, ENOMEM, "Cannot allocate request structures\n"); return; } for (i = 0; i < sc->xbd_max_requests; i++) { struct xbd_command *cm; + void * indirectpages; cm = &sc->xbd_shadow[i]; cm->cm_sg_refs = malloc( sizeof(grant_ref_t) * sc->xbd_max_request_segments, M_XENBLOCKFRONT, M_NOWAIT); if (cm->cm_sg_refs == NULL) break; cm->cm_id = i; cm->cm_flags = XBDCF_INITIALIZER; cm->cm_sc = sc; if (bus_dmamap_create(sc->xbd_io_dmat, 0, &cm->cm_map) != 0) break; + if (sc->xbd_max_request_indirectpages > 0) { + indirectpages = contigmalloc( + PAGE_SIZE * sc->xbd_max_request_indirectpages, + M_XENBLOCKFRONT, M_ZERO, 0, ~0, PAGE_SIZE, 0); + } else { + indirectpages = NULL; + } + for (j = 0; j < sc->xbd_max_request_indirectpages; j++) { + if (gnttab_grant_foreign_access( + xenbus_get_otherend_id(sc->xbd_dev), + (vtomach(indirectpages) >> PAGE_SHIFT) + j, + 1 /* grant read-only access */, + &cm->cm_indirectionrefs[j])) + break; + } + if (j < sc->xbd_max_request_indirectpages) + break; + cm->cm_indirectionpages = indirectpages; xbd_free_command(cm); } if (sc->xbd_disk == NULL) { device_printf(dev, "%juMB <%s> at %s", (uintmax_t) sectors / (1048576 / sector_size), device_get_desc(dev), xenbus_get_node(dev)); bus_print_child_footer(device_get_parent(dev), dev); xbd_instance_create(sc, sectors, sc->xbd_vdevice, binfo, sector_size); } (void)xenbus_set_state(dev, XenbusStateConnected); /* Kick pending requests. */ mtx_lock(&sc->xbd_io_lock); sc->xbd_state = XBD_STATE_CONNECTED; xbd_startio(sc); sc->xbd_flags |= XBDF_READY; mtx_unlock(&sc->xbd_io_lock); } /** * Handle the change of state of the backend to Closing. We must delete our * device-layer structures now, to ensure that writes are flushed through to * the backend. Once this is done, we can switch to Closed in * acknowledgement. */ static void xbd_closing(device_t dev) { struct xbd_softc *sc = device_get_softc(dev); xenbus_set_state(dev, XenbusStateClosing); DPRINTK("xbd_closing: %s removed\n", xenbus_get_node(dev)); if (sc->xbd_disk != NULL) { disk_destroy(sc->xbd_disk); sc->xbd_disk = NULL; } xenbus_set_state(dev, XenbusStateClosed); } /*---------------------------- NewBus Entrypoints ----------------------------*/ static int xbd_probe(device_t dev) { if (strcmp(xenbus_get_type(dev), "vbd") != 0) return (ENXIO); if (xen_hvm_domain()) { int error; char *type; /* * When running in an HVM domain, IDE disk emulation is * disabled early in boot so that native drivers will * not see emulated hardware. However, CDROM device * emulation cannot be disabled. * * Through use of FreeBSD's vm_guest and xen_hvm_domain() * APIs, we could modify the native CDROM driver to fail its * probe when running under Xen. Unfortunatlely, the PV * CDROM support in XenServer (up through at least version * 6.2) isn't functional, so we instead rely on the emulated * CDROM instance, and fail to attach the PV one here in * the blkfront driver. */ error = xs_read(XST_NIL, xenbus_get_node(dev), "device-type", NULL, (void **) &type); if (error) return (ENXIO); if (strncmp(type, "cdrom", 5) == 0) { free(type, M_XENSTORE); return (ENXIO); } free(type, M_XENSTORE); } device_set_desc(dev, "Virtual Block Device"); device_quiet(dev); return (0); } /* * Setup supplies the backend dir, virtual device. We place an event * channel and shared frame entries. We watch backend to wait if it's * ok. */ static int xbd_attach(device_t dev) { struct xbd_softc *sc; const char *name; uint32_t vdevice; int error; int i; int unit; /* FIXME: Use dynamic device id if this is not set. */ error = xs_scanf(XST_NIL, xenbus_get_node(dev), "virtual-device", NULL, "%" PRIu32, &vdevice); if (error) error = xs_scanf(XST_NIL, xenbus_get_node(dev), "virtual-device-ext", NULL, "%" PRIu32, &vdevice); if (error) { xenbus_dev_fatal(dev, error, "reading virtual-device"); device_printf(dev, "Couldn't determine virtual device.\n"); return (error); } xbd_vdevice_to_unit(vdevice, &unit, &name); if (!strcmp(name, "xbd")) device_set_unit(dev, unit); sc = device_get_softc(dev); mtx_init(&sc->xbd_io_lock, "blkfront i/o lock", NULL, MTX_DEF); xbd_initqs(sc); for (i = 0; i < XBD_MAX_RING_PAGES; i++) sc->xbd_ring_ref[i] = GRANT_REF_INVALID; sc->xbd_dev = dev; sc->xbd_vdevice = vdevice; sc->xbd_state = XBD_STATE_DISCONNECTED; xbd_setup_sysctl(sc); /* Wait for backend device to publish its protocol capabilities. */ xenbus_set_state(dev, XenbusStateInitialising); return (0); } static int xbd_detach(device_t dev) { struct xbd_softc *sc = device_get_softc(dev); DPRINTK("%s: %s removed\n", __func__, xenbus_get_node(dev)); xbd_free(sc); mtx_destroy(&sc->xbd_io_lock); return 0; } static int xbd_suspend(device_t dev) { struct xbd_softc *sc = device_get_softc(dev); int retval; int saved_state; /* Prevent new requests being issued until we fix things up. */ mtx_lock(&sc->xbd_io_lock); saved_state = sc->xbd_state; sc->xbd_state = XBD_STATE_SUSPENDED; /* Wait for outstanding I/O to drain. */ retval = 0; while (xbd_queue_length(sc, XBD_Q_BUSY) != 0) { if (msleep(&sc->xbd_cm_q[XBD_Q_BUSY], &sc->xbd_io_lock, PRIBIO, "blkf_susp", 30 * hz) == EWOULDBLOCK) { retval = EBUSY; break; } } mtx_unlock(&sc->xbd_io_lock); if (retval != 0) sc->xbd_state = saved_state; return (retval); } static int xbd_resume(device_t dev) { struct xbd_softc *sc = device_get_softc(dev); DPRINTK("xbd_resume: %s\n", xenbus_get_node(dev)); xbd_free(sc); xbd_initialize(sc); return (0); } /** * Callback received when the backend's state changes. */ static void xbd_backend_changed(device_t dev, XenbusState backend_state) { struct xbd_softc *sc = device_get_softc(dev); DPRINTK("backend_state=%d\n", backend_state); switch (backend_state) { case XenbusStateUnknown: case XenbusStateInitialising: case XenbusStateReconfigured: case XenbusStateReconfiguring: case XenbusStateClosed: break; case XenbusStateInitWait: case XenbusStateInitialised: xbd_initialize(sc); break; case XenbusStateConnected: xbd_initialize(sc); xbd_connect(sc); break; case XenbusStateClosing: if (sc->xbd_users > 0) xenbus_dev_error(dev, -EBUSY, "Device in use; refusing to close"); else xbd_closing(dev); break; } } /*---------------------------- NewBus Registration ---------------------------*/ static device_method_t xbd_methods[] = { /* Device interface */ DEVMETHOD(device_probe, xbd_probe), DEVMETHOD(device_attach, xbd_attach), DEVMETHOD(device_detach, xbd_detach), DEVMETHOD(device_shutdown, bus_generic_shutdown), DEVMETHOD(device_suspend, xbd_suspend), DEVMETHOD(device_resume, xbd_resume), /* Xenbus interface */ DEVMETHOD(xenbus_otherend_changed, xbd_backend_changed), { 0, 0 } }; static driver_t xbd_driver = { "xbd", xbd_methods, sizeof(struct xbd_softc), }; devclass_t xbd_devclass; DRIVER_MODULE(xbd, xenbusb_front, xbd_driver, xbd_devclass, 0, 0); Index: stable/10/sys/dev/xen/blkfront/block.h =================================================================== --- stable/10/sys/dev/xen/blkfront/block.h (revision 287801) +++ stable/10/sys/dev/xen/blkfront/block.h (revision 287802) @@ -1,337 +1,354 @@ /* * XenBSD block device driver * * Copyright (c) 2010-2013 Spectra Logic Corporation * Copyright (c) 2009 Scott Long, Yahoo! * Copyright (c) 2009 Frank Suchomel, Citrix * Copyright (c) 2009 Doug F. Rabson, Citrix * Copyright (c) 2005 Kip Macy * Copyright (c) 2003-2004, Keir Fraser & Steve Hand * Modifications by Mark A. Williamson are (c) Intel Research Cambridge * * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to * deal in the Software without restriction, including without limitation the * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. * * $FreeBSD$ */ #ifndef __XEN_BLKFRONT_BLOCK_H__ #define __XEN_BLKFRONT_BLOCK_H__ #include /** * Given a number of blkif segments, compute the maximum I/O size supported. * * \note This calculation assumes that all but the first and last segments * of the I/O are fully utilized. * * \note We reserve a segement from the maximum supported by the transport to * guarantee we can handle an unaligned transfer without the need to * use a bounce buffer. */ #define XBD_SEGS_TO_SIZE(segs) \ (((segs) - 1) * PAGE_SIZE) /** * Compute the maximum number of blkif segments requried to represent * an I/O of the given size. * * \note This calculation assumes that all but the first and last segments * of the I/O are fully utilized. * * \note We reserve a segement to guarantee we can handle an unaligned * transfer without the need to use a bounce buffer. */ #define XBD_SIZE_TO_SEGS(size) \ ((size / PAGE_SIZE) + 1) /** * The maximum number of shared memory ring pages we will allow in a * negotiated block-front/back communication channel. Allow enough * ring space for all requests to be XBD_MAX_REQUEST_SIZE'd. */ #define XBD_MAX_RING_PAGES 32 /** * The maximum number of outstanding requests we will allow in a negotiated * block-front/back communication channel. */ #define XBD_MAX_REQUESTS \ __CONST_RING_SIZE(blkif, PAGE_SIZE * XBD_MAX_RING_PAGES) /** - * The maximum mapped region size per request we will allow in a negotiated - * block-front/back communication channel. + * The maximum number of blkif segments which can be provided per indirect + * page in an indirect request. */ -#define XBD_MAX_REQUEST_SIZE \ - MIN(MAXPHYS, XBD_SEGS_TO_SIZE(BLKIF_MAX_SEGMENTS_PER_REQUEST)) +#define XBD_MAX_SEGMENTS_PER_PAGE \ + (PAGE_SIZE / sizeof(struct blkif_request_segment)) +/** + * The maximum number of blkif segments which can be provided in an indirect + * request. + */ +#define XBD_MAX_INDIRECT_SEGMENTS \ + (BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST * XBD_MAX_SEGMENTS_PER_PAGE) + +/** + * Compute the number of indirect segment pages required for an I/O with the + * specified number of indirect segments. + */ +#define XBD_INDIRECT_SEGS_TO_PAGES(segs) \ + ((segs + XBD_MAX_SEGMENTS_PER_PAGE - 1) / XBD_MAX_SEGMENTS_PER_PAGE) + typedef enum { XBDCF_Q_MASK = 0xFF, /* This command has contributed to xbd_qfrozen_cnt. */ XBDCF_FROZEN = 1<<8, /* Freeze the command queue on dispatch (i.e. single step command). */ XBDCF_Q_FREEZE = 1<<9, /* Bus DMA returned EINPROGRESS for this command. */ XBDCF_ASYNC_MAPPING = 1<<10, XBDCF_INITIALIZER = XBDCF_Q_MASK } xbdc_flag_t; struct xbd_command; typedef void xbd_cbcf_t(struct xbd_command *); struct xbd_command { TAILQ_ENTRY(xbd_command) cm_link; struct xbd_softc *cm_sc; xbdc_flag_t cm_flags; bus_dmamap_t cm_map; uint64_t cm_id; grant_ref_t *cm_sg_refs; struct bio *cm_bp; grant_ref_t cm_gref_head; void *cm_data; size_t cm_datalen; u_int cm_nseg; int cm_operation; blkif_sector_t cm_sector_number; int cm_status; xbd_cbcf_t *cm_complete; + void *cm_indirectionpages; + grant_ref_t cm_indirectionrefs[BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST]; }; typedef enum { XBD_Q_FREE, XBD_Q_READY, XBD_Q_BUSY, XBD_Q_COMPLETE, XBD_Q_BIO, XBD_Q_COUNT, XBD_Q_NONE = XBDCF_Q_MASK } xbd_q_index_t; typedef struct xbd_cm_q { TAILQ_HEAD(, xbd_command) q_tailq; uint32_t q_length; uint32_t q_max; } xbd_cm_q_t; typedef enum { XBD_STATE_DISCONNECTED, XBD_STATE_CONNECTED, XBD_STATE_SUSPENDED } xbd_state_t; typedef enum { XBDF_NONE = 0, XBDF_OPEN = 1 << 0, /* drive is open (can't shut down) */ XBDF_BARRIER = 1 << 1, /* backend supports barriers */ XBDF_FLUSH = 1 << 2, /* backend supports flush */ XBDF_READY = 1 << 3, /* Is ready */ XBDF_CM_SHORTAGE = 1 << 4, /* Free cm resource shortage active. */ XBDF_GNT_SHORTAGE = 1 << 5, /* Grant ref resource shortage active */ XBDF_WAIT_IDLE = 1 << 6 /* * No new work until oustanding work * completes. */ } xbd_flag_t; /* * We have one of these per vbd, whether ide, scsi or 'other'. */ struct xbd_softc { device_t xbd_dev; struct disk *xbd_disk; /* disk params */ struct bio_queue_head xbd_bioq; /* sort queue */ int xbd_unit; xbd_flag_t xbd_flags; int xbd_qfrozen_cnt; int xbd_vdevice; xbd_state_t xbd_state; u_int xbd_ring_pages; uint32_t xbd_max_requests; uint32_t xbd_max_request_segments; uint32_t xbd_max_request_size; + uint32_t xbd_max_request_indirectpages; grant_ref_t xbd_ring_ref[XBD_MAX_RING_PAGES]; blkif_front_ring_t xbd_ring; xen_intr_handle_t xen_intr_handle; struct gnttab_free_callback xbd_callback; xbd_cm_q_t xbd_cm_q[XBD_Q_COUNT]; bus_dma_tag_t xbd_io_dmat; /** * The number of people holding this device open. We won't allow a * hot-unplug unless this is 0. */ int xbd_users; struct mtx xbd_io_lock; struct xbd_command *xbd_shadow; }; int xbd_instance_create(struct xbd_softc *, blkif_sector_t sectors, int device, uint16_t vdisk_info, unsigned long sector_size); static inline void xbd_added_qentry(struct xbd_softc *sc, xbd_q_index_t index) { struct xbd_cm_q *cmq; cmq = &sc->xbd_cm_q[index]; cmq->q_length++; if (cmq->q_length > cmq->q_max) cmq->q_max = cmq->q_length; } static inline void xbd_removed_qentry(struct xbd_softc *sc, xbd_q_index_t index) { sc->xbd_cm_q[index].q_length--; } static inline uint32_t xbd_queue_length(struct xbd_softc *sc, xbd_q_index_t index) { return (sc->xbd_cm_q[index].q_length); } static inline void xbd_initq_cm(struct xbd_softc *sc, xbd_q_index_t index) { struct xbd_cm_q *cmq; cmq = &sc->xbd_cm_q[index]; TAILQ_INIT(&cmq->q_tailq); cmq->q_length = 0; cmq->q_max = 0; } static inline void xbd_enqueue_cm(struct xbd_command *cm, xbd_q_index_t index) { KASSERT(index != XBD_Q_BIO, ("%s: Commands cannot access the bio queue.", __func__)); if ((cm->cm_flags & XBDCF_Q_MASK) != XBD_Q_NONE) panic("%s: command %p is already on queue %d.", __func__, cm, cm->cm_flags & XBDCF_Q_MASK); TAILQ_INSERT_TAIL(&cm->cm_sc->xbd_cm_q[index].q_tailq, cm, cm_link); cm->cm_flags &= ~XBDCF_Q_MASK; cm->cm_flags |= index; xbd_added_qentry(cm->cm_sc, index); } static inline void xbd_requeue_cm(struct xbd_command *cm, xbd_q_index_t index) { KASSERT(index != XBD_Q_BIO, ("%s: Commands cannot access the bio queue.", __func__)); if ((cm->cm_flags & XBDCF_Q_MASK) != XBD_Q_NONE) panic("%s: command %p is already on queue %d.", __func__, cm, cm->cm_flags & XBDCF_Q_MASK); TAILQ_INSERT_HEAD(&cm->cm_sc->xbd_cm_q[index].q_tailq, cm, cm_link); cm->cm_flags &= ~XBDCF_Q_MASK; cm->cm_flags |= index; xbd_added_qentry(cm->cm_sc, index); } static inline struct xbd_command * xbd_dequeue_cm(struct xbd_softc *sc, xbd_q_index_t index) { struct xbd_command *cm; KASSERT(index != XBD_Q_BIO, ("%s: Commands cannot access the bio queue.", __func__)); if ((cm = TAILQ_FIRST(&sc->xbd_cm_q[index].q_tailq)) != NULL) { if ((cm->cm_flags & XBDCF_Q_MASK) != index) { panic("%s: command %p is on queue %d, " "not specified queue %d", __func__, cm, cm->cm_flags & XBDCF_Q_MASK, index); } TAILQ_REMOVE(&sc->xbd_cm_q[index].q_tailq, cm, cm_link); cm->cm_flags &= ~XBDCF_Q_MASK; cm->cm_flags |= XBD_Q_NONE; xbd_removed_qentry(cm->cm_sc, index); } return (cm); } static inline void xbd_remove_cm(struct xbd_command *cm, xbd_q_index_t expected_index) { xbd_q_index_t index; index = cm->cm_flags & XBDCF_Q_MASK; KASSERT(index != XBD_Q_BIO, ("%s: Commands cannot access the bio queue.", __func__)); if (index != expected_index) { panic("%s: command %p is on queue %d, not specified queue %d", __func__, cm, index, expected_index); } TAILQ_REMOVE(&cm->cm_sc->xbd_cm_q[index].q_tailq, cm, cm_link); cm->cm_flags &= ~XBDCF_Q_MASK; cm->cm_flags |= XBD_Q_NONE; xbd_removed_qentry(cm->cm_sc, index); } static inline void xbd_initq_bio(struct xbd_softc *sc) { bioq_init(&sc->xbd_bioq); } static inline void xbd_enqueue_bio(struct xbd_softc *sc, struct bio *bp) { bioq_insert_tail(&sc->xbd_bioq, bp); xbd_added_qentry(sc, XBD_Q_BIO); } static inline void xbd_requeue_bio(struct xbd_softc *sc, struct bio *bp) { bioq_insert_head(&sc->xbd_bioq, bp); xbd_added_qentry(sc, XBD_Q_BIO); } static inline struct bio * xbd_dequeue_bio(struct xbd_softc *sc) { struct bio *bp; if ((bp = bioq_first(&sc->xbd_bioq)) != NULL) { bioq_remove(&sc->xbd_bioq, bp); xbd_removed_qentry(sc, XBD_Q_BIO); } return (bp); } static inline void xbd_initqs(struct xbd_softc *sc) { u_int index; for (index = 0; index < XBD_Q_COUNT; index++) xbd_initq_cm(sc, index); xbd_initq_bio(sc); } #endif /* __XEN_BLKFRONT_BLOCK_H__ */ Index: stable/10/sys/xen/interface/io/blkif.h =================================================================== --- stable/10/sys/xen/interface/io/blkif.h (revision 287801) +++ stable/10/sys/xen/interface/io/blkif.h (revision 287802) @@ -1,492 +1,641 @@ /****************************************************************************** * blkif.h * * Unified block-device I/O interface for Xen guest OSes. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to * deal in the Software without restriction, including without limitation the * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. * * Copyright (c) 2003-2004, Keir Fraser * Copyright (c) 2012, Spectra Logic Corporation */ #ifndef __XEN_PUBLIC_IO_BLKIF_H__ #define __XEN_PUBLIC_IO_BLKIF_H__ #include "ring.h" #include "../grant_table.h" /* * Front->back notifications: When enqueuing a new request, sending a * notification can be made conditional on req_event (i.e., the generic * hold-off mechanism provided by the ring macros). Backends must set * req_event appropriately (e.g., using RING_FINAL_CHECK_FOR_REQUESTS()). * * Back->front notifications: When enqueuing a new response, sending a * notification can be made conditional on rsp_event (i.e., the generic * hold-off mechanism provided by the ring macros). Frontends must set * rsp_event appropriately (e.g., using RING_FINAL_CHECK_FOR_RESPONSES()). */ #ifndef blkif_vdev_t #define blkif_vdev_t uint16_t #endif #define blkif_sector_t uint64_t /* * Feature and Parameter Negotiation * ================================= * The two halves of a Xen block driver utilize nodes within the XenStore to * communicate capabilities and to negotiate operating parameters. This * section enumerates these nodes which reside in the respective front and * backend portions of the XenStore, following the XenBus convention. * * All data in the XenStore is stored as strings. Nodes specifying numeric * values are encoded in decimal. Integer value ranges listed below are * expressed as fixed sized integer types capable of storing the conversion * of a properly formatted node string, without loss of information. * * Any specified default value is in effect if the corresponding XenBus node * is not present in the XenStore. * * XenStore nodes in sections marked "PRIVATE" are solely for use by the * driver side whose XenBus tree contains them. * * XenStore nodes marked "DEPRECATED" in their notes section should only be * used to provide interoperability with legacy implementations. * * See the XenBus state transition diagram below for details on when XenBus * nodes must be published and when they can be queried. * ***************************************************************************** * Backend XenBus Nodes ***************************************************************************** * *------------------ Backend Device Identification (PRIVATE) ------------------ * * mode * Values: "r" (read only), "w" (writable) * * The read or write access permissions to the backing store to be * granted to the frontend. * * params * Values: string * * Data used by the backend driver to locate and configure the backing * device. The format and semantics of this data vary according to the * backing device in use and are outside the scope of this specification. * * type * Values: "file", "phy", "tap" * * The type of the backing device/object. * + * + * direct-io-safe + * Values: 0/1 (boolean) + * Default Value: 0 + * + * The underlying storage is not affected by the direct IO memory + * lifetime bug. See: + * http://lists.xen.org/archives/html/xen-devel/2012-12/msg01154.html + * + * Therefore this option gives the backend permission to use + * O_DIRECT, notwithstanding that bug. + * + * That is, if this option is enabled, use of O_DIRECT is safe, + * in circumstances where we would normally have avoided it as a + * workaround for that bug. This option is not relevant for all + * backends, and even not necessarily supported for those for + * which it is relevant. A backend which knows that it is not + * affected by the bug can ignore this option. + * + * This option doesn't require a backend to use O_DIRECT, so it + * should not be used to try to control the caching behaviour. + * *--------------------------------- Features --------------------------------- * * feature-barrier * Values: 0/1 (boolean) * Default Value: 0 * * A value of "1" indicates that the backend can process requests * containing the BLKIF_OP_WRITE_BARRIER request opcode. Requests * of this type may still be returned at any time with the * BLKIF_RSP_EOPNOTSUPP result code. * * feature-flush-cache * Values: 0/1 (boolean) * Default Value: 0 * * A value of "1" indicates that the backend can process requests * containing the BLKIF_OP_FLUSH_DISKCACHE request opcode. Requests * of this type may still be returned at any time with the * BLKIF_RSP_EOPNOTSUPP result code. * * feature-discard * Values: 0/1 (boolean) * Default Value: 0 * * A value of "1" indicates that the backend can process requests * containing the BLKIF_OP_DISCARD request opcode. Requests * of this type may still be returned at any time with the * BLKIF_RSP_EOPNOTSUPP result code. * + * feature-persistent + * Values: 0/1 (boolean) + * Default Value: 0 + * Notes: 7 + * + * A value of "1" indicates that the backend can keep the grants used + * by the frontend driver mapped, so the same set of grants should be + * used in all transactions. The maximum number of grants the backend + * can map persistently depends on the implementation, but ideally it + * should be RING_SIZE * BLKIF_MAX_SEGMENTS_PER_REQUEST. Using this + * feature the backend doesn't need to unmap each grant, preventing + * costly TLB flushes. The backend driver should only map grants + * persistently if the frontend supports it. If a backend driver chooses + * to use the persistent protocol when the frontend doesn't support it, + * it will probably hit the maximum number of persistently mapped grants + * (due to the fact that the frontend won't be reusing the same grants), + * and fall back to non-persistent mode. Backend implementations may + * shrink or expand the number of persistently mapped grants without + * notifying the frontend depending on memory constraints (this might + * cause a performance degradation). + * + * If a backend driver wants to limit the maximum number of persistently + * mapped grants to a value less than RING_SIZE * + * BLKIF_MAX_SEGMENTS_PER_REQUEST a LRU strategy should be used to + * discard the grants that are less commonly used. Using a LRU in the + * backend driver paired with a LIFO queue in the frontend will + * allow us to have better performance in this scenario. + * *----------------------- Request Transport Parameters ------------------------ * * max-ring-page-order * Values: * Default Value: 0 * Notes: 1, 3 * * The maximum supported size of the request ring buffer in units of * lb(machine pages). (e.g. 0 == 1 page, 1 = 2 pages, 2 == 4 pages, * etc.). * * max-ring-pages * Values: * Default Value: 1 * Notes: DEPRECATED, 2, 3 * * The maximum supported size of the request ring buffer in units of * machine pages. The value must be a power of 2. * *------------------------- Backend Device Properties ------------------------- * + * discard-enable + * Values: 0/1 (boolean) + * Default Value: 1 + * + * This optional property, set by the toolstack, instructs the backend + * to offer discard to the frontend. If the property is missing the + * backend should offer discard if the backing storage actually supports + * it. This optional property, set by the toolstack, requests that the + * backend offer, or not offer, discard to the frontend. + * * discard-alignment * Values: * Default Value: 0 * Notes: 4, 5 * * The offset, in bytes from the beginning of the virtual block device, * to the first, addressable, discard extent on the underlying device. * * discard-granularity * Values: * Default Value: <"sector-size"> * Notes: 4 * * The size, in bytes, of the individually addressable discard extents * of the underlying device. * * discard-secure * Values: 0/1 (boolean) * Default Value: 0 + * Notes: 10 * * A value of "1" indicates that the backend can process BLKIF_OP_DISCARD * requests with the BLKIF_DISCARD_SECURE flag set. * * info * Values: (bitmap) * * A collection of bit flags describing attributes of the backing * device. The VDISK_* macros define the meaning of each bit * location. * * sector-size * Values: * - * The size, in bytes, of the individually addressible data blocks - * on the backend device. + * The logical sector size, in bytes, of the backend device. * + * physical-sector-size + * Values: + * + * The physical sector size, in bytes, of the backend device. + * * sectors * Values: * - * The size of the backend device, expressed in units of its native + * The size of the backend device, expressed in units of its logical * sector size ("sector-size"). * ***************************************************************************** * Frontend XenBus Nodes ***************************************************************************** * *----------------------- Request Transport Parameters ----------------------- * * event-channel * Values: * * The identifier of the Xen event channel used to signal activity * in the ring buffer. * * ring-ref * Values: * Notes: 6 * * The Xen grant reference granting permission for the backend to map * the sole page in a single page sized ring buffer. * * ring-ref%u * Values: * Notes: 6 * * For a frontend providing a multi-page ring, a "number of ring pages" * sized list of nodes, each containing a Xen grant reference granting * permission for the backend to map the page of the ring located * at page index "%u". Page indexes are zero based. * * protocol * Values: string (XEN_IO_PROTO_ABI_*) * Default Value: XEN_IO_PROTO_ABI_NATIVE * * The machine ABI rules governing the format of all ring request and * response structures. * * ring-page-order * Values: * Default Value: 0 * Maximum Value: MAX(ffs(max-ring-pages) - 1, max-ring-page-order) * Notes: 1, 3 * * The size of the frontend allocated request ring buffer in units * of lb(machine pages). (e.g. 0 == 1 page, 1 = 2 pages, 2 == 4 pages, * etc.). * * num-ring-pages * Values: * Default Value: 1 * Maximum Value: MAX(max-ring-pages,(0x1 << max-ring-page-order)) * Notes: DEPRECATED, 2, 3 * * The size of the frontend allocated request ring buffer in units of * machine pages. The value must be a power of 2. * + * feature-persistent + * Values: 0/1 (boolean) + * Default Value: 0 + * Notes: 7, 8, 9 + * + * A value of "1" indicates that the frontend will reuse the same grants + * for all transactions, allowing the backend to map them with write + * access (even when it should be read-only). If the frontend hits the + * maximum number of allowed persistently mapped grants, it can fallback + * to non persistent mode. This will cause a performance degradation, + * since the the backend driver will still try to map those grants + * persistently. Since the persistent grants protocol is compatible with + * the previous protocol, a frontend driver can choose to work in + * persistent mode even when the backend doesn't support it. + * + * It is recommended that the frontend driver stores the persistently + * mapped grants in a LIFO queue, so a subset of all persistently mapped + * grants gets used commonly. This is done in case the backend driver + * decides to limit the maximum number of persistently mapped grants + * to a value less than RING_SIZE * BLKIF_MAX_SEGMENTS_PER_REQUEST. + * *------------------------- Virtual Device Properties ------------------------- * * device-type * Values: "disk", "cdrom", "floppy", etc. * * virtual-device * Values: * * A value indicating the physical device to virtualize within the * frontend's domain. (e.g. "The first ATA disk", "The third SCSI * disk", etc.) * * See docs/misc/vbd-interface.txt for details on the format of this * value. * * Notes * ----- * (1) Multi-page ring buffer scheme first developed in the Citrix XenServer * PV drivers. - * (2) Multi-page ring buffer scheme first used in some Red Hat distributions + * (2) Multi-page ring buffer scheme first used in some RedHat distributions * including a distribution deployed on certain nodes of the Amazon * EC2 cluster. * (3) Support for multi-page ring buffers was implemented independently, - * in slightly different forms, by both Citrix and Red Hat/Amazon. + * in slightly different forms, by both Citrix and RedHat/Amazon. * For full interoperability, block front and backends should publish * identical ring parameters, adjusted for unit differences, to the * XenStore nodes used in both schemes. - * (4) Devices that support discard functionality may internally allocate - * space (discardable extents) in units that are larger than the - * exported logical block size. + * (4) Devices that support discard functionality may internally allocate space + * (discardable extents) in units that are larger than the exported logical + * block size. If the backing device has such discardable extents the + * backend should provide both discard-granularity and discard-alignment. + * Providing just one of the two may be considered an error by the frontend. + * Backends supporting discard should include discard-granularity and + * discard-alignment even if it supports discarding individual sectors. + * Frontends should assume discard-alignment == 0 and discard-granularity + * == sector size if these keys are missing. * (5) The discard-alignment parameter allows a physical device to be * partitioned into virtual devices that do not necessarily begin or * end on a discardable extent boundary. * (6) When there is only a single page allocated to the request ring, * 'ring-ref' is used to communicate the grant reference for this * page to the backend. When using a multi-page ring, the 'ring-ref' * node is not created. Instead 'ring-ref0' - 'ring-refN' are used. + * (7) When using persistent grants data has to be copied from/to the page + * where the grant is currently mapped. The overhead of doing this copy + * however doesn't suppress the speed improvement of not having to unmap + * the grants. + * (8) The frontend driver has to allow the backend driver to map all grants + * with write access, even when they should be mapped read-only, since + * further requests may reuse these grants and require write permissions. + * (9) Linux implementation doesn't have a limit on the maximum number of + * grants that can be persistently mapped in the frontend driver, but + * due to the frontent driver implementation it should never be bigger + * than RING_SIZE * BLKIF_MAX_SEGMENTS_PER_REQUEST. + *(10) The discard-secure property may be present and will be set to 1 if the + * backing device supports secure discard. */ /* * STATE DIAGRAMS * ***************************************************************************** * Startup * ***************************************************************************** * * Tool stack creates front and back nodes with state XenbusStateInitialising. * * Front Back * ================================= ===================================== * XenbusStateInitialising XenbusStateInitialising * o Query virtual device o Query backend device identification * properties. data. * o Setup OS device instance. o Open and validate backend device. * o Publish backend features and * transport parameters. * | * | * V * XenbusStateInitWait * * o Query backend features and * transport parameters. * o Allocate and initialize the * request ring. * o Publish transport parameters * that will be in effect during * this connection. * | * | * V * XenbusStateInitialised * * o Query frontend transport parameters. * o Connect to the request ring and * event channel. * o Publish backend device properties. * | * | * V * XenbusStateConnected * * o Query backend device properties. * o Finalize OS virtual device * instance. * | * | * V * XenbusStateConnected * * Note: Drivers that do not support any optional features, or the negotiation * of transport parameters, can skip certain states in the state machine: * * o A frontend may transition to XenbusStateInitialised without * waiting for the backend to enter XenbusStateInitWait. In this * case, default transport parameters are in effect and any * transport parameters published by the frontend must contain * their default values. * * o A backend may transition to XenbusStateInitialised, bypassing * XenbusStateInitWait, without waiting for the frontend to first * enter the XenbusStateInitialised state. In this case, default * transport parameters are in effect and any transport parameters * published by the backend must contain their default values. * * Drivers that support optional features and/or transport parameter * negotiation must tolerate these additional state transition paths. * In general this means performing the work of any skipped state * transition, if it has not already been performed, in addition to the * work associated with entry into the current state. */ /* * REQUEST CODES. */ #define BLKIF_OP_READ 0 #define BLKIF_OP_WRITE 1 /* * All writes issued prior to a request with the BLKIF_OP_WRITE_BARRIER * operation code ("barrier request") must be completed prior to the * execution of the barrier request. All writes issued after the barrier * request must not execute until after the completion of the barrier request. * * Optional. See "feature-barrier" XenBus node documentation above. */ #define BLKIF_OP_WRITE_BARRIER 2 /* * Commit any uncommitted contents of the backing device's volatile cache * to stable storage. * * Optional. See "feature-flush-cache" XenBus node documentation above. */ #define BLKIF_OP_FLUSH_DISKCACHE 3 /* * Used in SLES sources for device specific command packet * contained within the request. Reserved for that purpose. */ #define BLKIF_OP_RESERVED_1 4 /* * Indicate to the backend device that a region of storage is no longer in * use, and may be discarded at any time without impact to the client. If * the BLKIF_DISCARD_SECURE flag is set on the request, all copies of the * discarded region on the device must be rendered unrecoverable before the * command returns. * * This operation is analogous to performing a trim (ATA) or unmap (SCSI), * command on a native device. * * More information about trim/unmap operations can be found at: * http://t13.org/Documents/UploadedDocuments/docs2008/ * e07154r6-Data_Set_Management_Proposal_for_ATA-ACS2.doc * http://www.seagate.com/staticfiles/support/disc/manuals/ * Interface%20manuals/100293068c.pdf * * Optional. See "feature-discard", "discard-alignment", * "discard-granularity", and "discard-secure" in the XenBus node * documentation above. */ #define BLKIF_OP_DISCARD 5 /* + * Recognized if "feature-max-indirect-segments" in present in the backend + * xenbus info. The "feature-max-indirect-segments" node contains the maximum + * number of segments allowed by the backend per request. If the node is + * present, the frontend might use blkif_request_indirect structs in order to + * issue requests with more than BLKIF_MAX_SEGMENTS_PER_REQUEST (11). The + * maximum number of indirect segments is fixed by the backend, but the + * frontend can issue requests with any number of indirect segments as long as + * it's less than the number provided by the backend. The indirect_grefs field + * in blkif_request_indirect should be filled by the frontend with the + * grant references of the pages that are holding the indirect segments. + * These pages are filled with an array of blkif_request_segment that hold the + * information about the segments. The number of indirect pages to use is + * determined by the number of segments an indirect request contains. Every + * indirect page can contain a maximum of + * (PAGE_SIZE / sizeof(struct blkif_request_segment)) segments, so to + * calculate the number of indirect pages to use we have to do + * ceil(indirect_segments / (PAGE_SIZE / sizeof(struct blkif_request_segment))). + * + * If a backend does not recognize BLKIF_OP_INDIRECT, it should *not* + * create the "feature-max-indirect-segments" node! + */ +#define BLKIF_OP_INDIRECT 6 + +/* * Maximum scatter/gather segments per request. * This is carefully chosen so that sizeof(blkif_ring_t) <= PAGE_SIZE. * NB. This could be 12 if the ring indexes weren't stored in the same page. */ #define BLKIF_MAX_SEGMENTS_PER_REQUEST 11 /* + * Maximum number of indirect pages to use per request. + */ +#define BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST 8 + +/* * NB. first_sect and last_sect in blkif_request_segment, as well as * sector_number in blkif_request, are always expressed in 512-byte units. * However they must be properly aligned to the real sector size of the - * physical disk, which is reported in the "sector-size" node in the backend - * xenbus info. Also the xenbus "sectors" node is expressed in 512-byte units. + * physical disk, which is reported in the "physical-sector-size" node in + * the backend xenbus info. Also the xenbus "sectors" node is expressed in + * 512-byte units. */ struct blkif_request_segment { grant_ref_t gref; /* reference to I/O buffer frame */ /* @first_sect: first sector in frame to transfer (inclusive). */ /* @last_sect: last sector in frame to transfer (inclusive). */ uint8_t first_sect, last_sect; }; typedef struct blkif_request_segment blkif_request_segment_t; /* * Starting ring element for any I/O request. */ struct blkif_request { uint8_t operation; /* BLKIF_OP_??? */ uint8_t nr_segments; /* number of segments */ blkif_vdev_t handle; /* only for read/write requests */ uint64_t id; /* private guest value, echoed in resp */ blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */ blkif_request_segment_t seg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; }; typedef struct blkif_request blkif_request_t; /* * Cast to this structure when blkif_request.operation == BLKIF_OP_DISCARD * sizeof(struct blkif_request_discard) <= sizeof(struct blkif_request) */ struct blkif_request_discard { uint8_t operation; /* BLKIF_OP_DISCARD */ uint8_t flag; /* BLKIF_DISCARD_SECURE or zero */ #define BLKIF_DISCARD_SECURE (1<<0) /* ignored if discard-secure=0 */ blkif_vdev_t handle; /* same as for read/write requests */ uint64_t id; /* private guest value, echoed in resp */ blkif_sector_t sector_number;/* start sector idx on disk */ uint64_t nr_sectors; /* number of contiguous sectors to discard*/ }; typedef struct blkif_request_discard blkif_request_discard_t; +struct blkif_request_indirect { + uint8_t operation; /* BLKIF_OP_INDIRECT */ + uint8_t indirect_op; /* BLKIF_OP_{READ/WRITE} */ + uint16_t nr_segments; /* number of segments */ + uint64_t id; /* private guest value, echoed in resp */ + blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */ + blkif_vdev_t handle; /* same as for read/write requests */ + grant_ref_t indirect_grefs[BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST]; +#ifdef __i386__ + uint64_t pad; /* Make it 64 byte aligned on i386 */ +#endif +}; +typedef struct blkif_request_indirect blkif_request_indirect_t; + struct blkif_response { uint64_t id; /* copied from request */ uint8_t operation; /* copied from request */ int16_t status; /* BLKIF_RSP_??? */ }; typedef struct blkif_response blkif_response_t; /* * STATUS RETURN CODES. */ /* Operation not supported (only happens on barrier writes). */ #define BLKIF_RSP_EOPNOTSUPP -2 /* Operation failed for some unspecified reason (-EIO). */ #define BLKIF_RSP_ERROR -1 /* Operation completed successfully. */ #define BLKIF_RSP_OKAY 0 /* * Generate blkif ring structures and types. */ DEFINE_RING_TYPES(blkif, struct blkif_request, struct blkif_response); #define VDISK_CDROM 0x1 #define VDISK_REMOVABLE 0x2 #define VDISK_READONLY 0x4 #endif /* __XEN_PUBLIC_IO_BLKIF_H__ */ /* * Local variables: * mode: C - * c-set-style: "BSD" + * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */