Index: head/sys/cam/ctl/ctl_backend_block.c
===================================================================
--- head/sys/cam/ctl/ctl_backend_block.c	(revision 356199)
+++ head/sys/cam/ctl/ctl_backend_block.c	(revision 356200)
@@ -1,2901 +1,2889 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2003 Silicon Graphics International Corp.
  * Copyright (c) 2009-2011 Spectra Logic Corporation
  * Copyright (c) 2012 The FreeBSD Foundation
  * Copyright (c) 2014-2015 Alexander Motin <mav@FreeBSD.org>
  * All rights reserved.
  *
  * Portions of this software were developed by Edward Tomasz Napierala
  * under sponsorship from the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions, and the following disclaimer,
  *    without modification.
  * 2. Redistributions in binary form must reproduce at minimum a disclaimer
  *    substantially similar to the "NO WARRANTY" disclaimer below
  *    ("Disclaimer") and any redistribution must be conditioned upon
  *    including a substantially similar Disclaimer requirement for further
  *    binary redistribution.
  *
  * NO WARRANTY
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR
  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
  * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGES.
  *
  * $Id: //depot/users/kenm/FreeBSD-test2/sys/cam/ctl/ctl_backend_block.c#5 $
  */
 /*
  * CAM Target Layer driver backend for block devices.
  *
  * Author: Ken Merry <ken@FreeBSD.org>
  */
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/types.h>
 #include <sys/kthread.h>
 #include <sys/bio.h>
 #include <sys/fcntl.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/condvar.h>
 #include <sys/malloc.h>
 #include <sys/conf.h>
 #include <sys/ioccom.h>
 #include <sys/queue.h>
 #include <sys/sbuf.h>
 #include <sys/endian.h>
 #include <sys/uio.h>
 #include <sys/buf.h>
 #include <sys/taskqueue.h>
 #include <sys/vnode.h>
 #include <sys/namei.h>
 #include <sys/mount.h>
 #include <sys/disk.h>
 #include <sys/fcntl.h>
 #include <sys/filedesc.h>
 #include <sys/filio.h>
 #include <sys/proc.h>
 #include <sys/pcpu.h>
 #include <sys/module.h>
 #include <sys/sdt.h>
 #include <sys/devicestat.h>
 #include <sys/sysctl.h>
 #include <sys/nv.h>
 #include <sys/dnv.h>
 
 #include <geom/geom.h>
 
 #include <cam/cam.h>
 #include <cam/scsi/scsi_all.h>
 #include <cam/scsi/scsi_da.h>
 #include <cam/ctl/ctl_io.h>
 #include <cam/ctl/ctl.h>
 #include <cam/ctl/ctl_backend.h>
 #include <cam/ctl/ctl_ioctl.h>
 #include <cam/ctl/ctl_ha.h>
 #include <cam/ctl/ctl_scsi_all.h>
 #include <cam/ctl/ctl_private.h>
 #include <cam/ctl/ctl_error.h>
 
 /*
  * Sizing of the per-I/O scatter/gather (S/G) list.
  *
  * The idea here is that we'll allocate enough S/G space to hold a 1MB
  * I/O.  If we get an I/O larger than that, we'll split it.
  */
 /* Half of the maximum I/O size, in bytes. */
 #define	CTLBLK_HALF_IO_SIZE	(512 * 1024)
 /* Maximum I/O size, in bytes: two halves. */
 #define	CTLBLK_MAX_IO_SIZE	(CTLBLK_HALF_IO_SIZE * 2)
 /* Upper bound used for a single S/G segment when sizing the list. */
 #define	CTLBLK_MAX_SEG		MAXPHYS
 /* Segments needed for half an I/O; never less than one. */
 #define	CTLBLK_HALF_SEGS	MAX(CTLBLK_HALF_IO_SIZE / CTLBLK_MAX_SEG, 1)
 /* Total S/G entries per I/O (size of the sg_segs[]/xiovecs[] arrays). */
 #define	CTLBLK_MAX_SEGS		(CTLBLK_HALF_SEGS * 2)
 
 /*
  * Debug tracing.  With CTLBLK_DEBUG defined, DPRINTF() prints the message
  * prefixed with "cbb(<function>:<line>): "; otherwise it expands to an
  * empty statement.
  */
 #ifdef CTLBLK_DEBUG
 #define DPRINTF(fmt, args...) \
     printf("cbb(%s:%d): " fmt, __FUNCTION__, __LINE__, ##args)
 #else
 #define DPRINTF(fmt, args...) do {} while(0)
 #endif
 
 #define PRIV(io)	\
     ((struct ctl_ptr_len_flags *)&(io)->io_hdr.ctl_private[CTL_PRIV_BACKEND])
 #define ARGS(io)	\
     ((struct ctl_lba_len_flags *)&(io)->io_hdr.ctl_private[CTL_PRIV_LBA_LEN])
 
 SDT_PROVIDER_DEFINE(cbb);
 
 /*
  * Per-LUN state flags.  The values are distinct bits, so they can be
  * combined in ctl_be_block_lun.flags.
  * NOTE(review): the exact meaning of each flag is established by code
  * outside this excerpt; the names suggest configuration progress states.
  */
 typedef enum {
 	CTL_BE_BLOCK_LUN_UNCONFIGURED	= 0x01,
 	CTL_BE_BLOCK_LUN_CONFIG_ERR	= 0x02,
 	CTL_BE_BLOCK_LUN_WAITING	= 0x04,
 } ctl_be_block_lun_flags;
 
 /*
  * Kind of backing store behind a LUN (stored in ctl_be_block_lun.dev_type).
  * NOTE(review): presumably NONE = not opened, DEV = device node,
  * FILE = regular file; confirm against the open routines.
  */
 typedef enum {
 	CTL_BE_BLOCK_NONE,
 	CTL_BE_BLOCK_DEV,
 	CTL_BE_BLOCK_FILE
 } ctl_be_block_type;
 
 /* Backing-store data specific to file-backed LUNs. */
 struct ctl_be_block_filedata {
 	/* Credentials passed to VOP_READ()/VOP_WRITE() on the backing vnode. */
 	struct ucred *cred;
 };
 
 /* Backing-type specific data; currently only the file variant exists. */
 union ctl_be_block_bedata {
 	struct ctl_be_block_filedata file;
 };
 
 /* Forward declarations needed by the handler typedefs below. */
 struct ctl_be_block_io;
 struct ctl_be_block_lun;
 
 /*
  * Backing-store operation handler: performs the operation described by
  * beio against be_lun.  Used for the dispatch, lun_flush, unmap and
  * get_lba_status members of struct ctl_be_block_lun.
  */
 typedef void (*cbb_dispatch_t)(struct ctl_be_block_lun *be_lun,
 			       struct ctl_be_block_io *beio);
 /* Attribute query handler: returns the value of the attribute attrname. */
 typedef uint64_t (*cbb_getattr_t)(struct ctl_be_block_lun *be_lun,
 				  const char *attrname);
 
 /*
  * Backend LUN structure.  There is a 1:1 mapping between a block device
  * and a backend block LUN, and between a backend block LUN and a CTL LUN.
  */
 struct ctl_be_block_lun {
 	struct ctl_lun_create_params params;
 	char lunname[32];
 	char *dev_path;
 	ctl_be_block_type dev_type;
 	/* Backing vnode; all file and device I/O in this file goes through it. */
 	struct vnode *vn;
 	union ctl_be_block_bedata backend;
 	/* Backing-store specific operation handlers. */
 	cbb_dispatch_t dispatch;
 	cbb_dispatch_t lun_flush;
 	cbb_dispatch_t unmap;
 	cbb_dispatch_t get_lba_status;
 	cbb_getattr_t getattr;
 	/* UMA zone the S/G segment buffers are returned to in ctl_free_beio(). */
 	uma_zone_t lun_zone;
 	uint64_t size_blocks;
 	uint64_t size_bytes;
 	struct ctl_be_block_softc *softc;
 	/* devstat(9) handle used to account each backend transaction. */
 	struct devstat *disk_stats;
 	ctl_be_block_lun_flags flags;
 	STAILQ_ENTRY(ctl_be_block_lun) links;
 	struct ctl_be_lun cbe_lun;
 	/* Task queue and task that run queued I/O in a context that may block. */
 	struct taskqueue *io_taskqueue;
 	struct task io_task;
 	int num_threads;
 	/* I/O queues; datamove_queue is filled by ctl_be_block_move_done(). */
 	STAILQ_HEAD(, ctl_io_hdr) input_queue;
 	STAILQ_HEAD(, ctl_io_hdr) config_read_queue;
 	STAILQ_HEAD(, ctl_io_hdr) config_write_queue;
 	STAILQ_HEAD(, ctl_io_hdr) datamove_queue;
 	/* Held around per-beio bio accounting and devstat_end_transaction(). */
 	struct mtx_padalign io_lock;
 	/* Held while inserting into datamove_queue. */
 	struct mtx_padalign queue_lock;
 };
 
 /*
  * Overall softc structure for the block backend module.
  */
 struct ctl_be_block_softc {
 	struct mtx			 lock;
 	/* UMA zone from which struct ctl_be_block_io is allocated. */
 	uma_zone_t			 beio_zone;
 	int				 num_luns;
 	STAILQ_HEAD(, ctl_be_block_lun)	 lun_list;
 };
 
 /* The single module-wide softc instance. */
 static struct ctl_be_block_softc backend_block_softc;
 
 /*
  * Per-I/O information.
  */
 struct ctl_be_block_io {
 	union ctl_io			*io;
 	/*
 	 * S/G list.  For COMPARE the second list lives at index
 	 * CTLBLK_HALF_SEGS and up (see ctl_free_beio/ctl_be_block_compare).
 	 */
 	struct ctl_sg_entry		sg_segs[CTLBLK_MAX_SEGS];
 	/* iovecs filled from sg_segs[] for uio-based (file/zvol) I/O. */
 	struct iovec			xiovecs[CTLBLK_MAX_SEGS];
 	int				bio_cmd;
 	int				num_segs;
 	/*
 	 * bio bookkeeping: the I/O completes once send_complete is set and
 	 * num_bios_done has caught up with num_bios_sent.
 	 */
 	int				num_bios_sent;
 	int				num_bios_done;
 	int				send_complete;
 	/* Error of the failed bio with the lowest offset, and that offset. */
 	int				first_error;
 	uint64_t			first_error_offset;
 	/* devstat start time and transaction classification. */
 	struct bintime			ds_t0;
 	devstat_tag_type		ds_tag_type;
 	devstat_trans_flags		ds_trans_type;
 	uint64_t			io_len;
 	uint64_t			io_offset;
 	/* Operation argument; flush_file uses non-zero to select MNT_NOWAIT. */
 	int				io_arg;
 	struct ctl_be_block_softc	*softc;
 	struct ctl_be_block_lun		*lun;
 	void (*beio_cont)(struct ctl_be_block_io *beio); /* to continue processing */
 };
 
 /* CTL's global softc, defined outside this file. */
 extern struct ctl_softc *control_softc;
 
 /*
  * Worker thread count, default 14, exported as the
  * kern.cam.ctl.block.num_threads sysctl.
  */
 static int cbb_num_threads = 14;
 SYSCTL_NODE(_kern_cam_ctl, OID_AUTO, block, CTLFLAG_RD, 0,
 	    "CAM Target Layer Block Backend");
 SYSCTL_INT(_kern_cam_ctl_block, OID_AUTO, num_threads, CTLFLAG_RWTUN,
            &cbb_num_threads, 0, "Number of threads per backing file");
 
 /*
  * Forward declarations of the file-local functions: beio allocation and
  * completion, file- and device-backed operation handlers, queue dispatch,
  * LUN management, and the backend driver entry points.
  */
 static struct ctl_be_block_io *ctl_alloc_beio(struct ctl_be_block_softc *softc);
 static void ctl_free_beio(struct ctl_be_block_io *beio);
 static void ctl_complete_beio(struct ctl_be_block_io *beio);
 static int ctl_be_block_move_done(union ctl_io *io);
 static void ctl_be_block_biodone(struct bio *bio);
 static void ctl_be_block_flush_file(struct ctl_be_block_lun *be_lun,
 				    struct ctl_be_block_io *beio);
 static void ctl_be_block_dispatch_file(struct ctl_be_block_lun *be_lun,
 				       struct ctl_be_block_io *beio);
 static void ctl_be_block_gls_file(struct ctl_be_block_lun *be_lun,
 				  struct ctl_be_block_io *beio);
 static uint64_t ctl_be_block_getattr_file(struct ctl_be_block_lun *be_lun,
 					 const char *attrname);
 static void ctl_be_block_flush_dev(struct ctl_be_block_lun *be_lun,
 				   struct ctl_be_block_io *beio);
 static void ctl_be_block_unmap_dev(struct ctl_be_block_lun *be_lun,
 				   struct ctl_be_block_io *beio);
 static void ctl_be_block_dispatch_dev(struct ctl_be_block_lun *be_lun,
 				      struct ctl_be_block_io *beio);
 static uint64_t ctl_be_block_getattr_dev(struct ctl_be_block_lun *be_lun,
 					 const char *attrname);
 static void ctl_be_block_cr_dispatch(struct ctl_be_block_lun *be_lun,
 				    union ctl_io *io);
 static void ctl_be_block_cw_dispatch(struct ctl_be_block_lun *be_lun,
 				    union ctl_io *io);
 static void ctl_be_block_dispatch(struct ctl_be_block_lun *be_lun,
 				  union ctl_io *io);
 static void ctl_be_block_worker(void *context, int pending);
 static int ctl_be_block_submit(union ctl_io *io);
 static int ctl_be_block_ioctl(struct cdev *dev, u_long cmd, caddr_t addr,
 				   int flag, struct thread *td);
 static int ctl_be_block_open_file(struct ctl_be_block_lun *be_lun,
 				  struct ctl_lun_req *req);
 static int ctl_be_block_open_dev(struct ctl_be_block_lun *be_lun,
 				 struct ctl_lun_req *req);
 static int ctl_be_block_close(struct ctl_be_block_lun *be_lun);
 static int ctl_be_block_open(struct ctl_be_block_lun *be_lun,
 			     struct ctl_lun_req *req);
 static int ctl_be_block_create(struct ctl_be_block_softc *softc,
 			       struct ctl_lun_req *req);
 static int ctl_be_block_rm(struct ctl_be_block_softc *softc,
 			   struct ctl_lun_req *req);
 static int ctl_be_block_modify(struct ctl_be_block_softc *softc,
 			   struct ctl_lun_req *req);
 static void ctl_be_block_lun_shutdown(void *be_lun);
 static void ctl_be_block_lun_config_status(void *be_lun,
 					   ctl_lun_config_status status);
 static int ctl_be_block_config_write(union ctl_io *io);
 static int ctl_be_block_config_read(union ctl_io *io);
 static int ctl_be_block_lun_info(void *be_lun, struct sbuf *sb);
 static uint64_t ctl_be_block_lun_attr(void *be_lun, const char *attrname);
 static int ctl_be_block_init(void);
 static int ctl_be_block_shutdown(void);
 
 /*
  * Backend driver descriptor for the "block" backend: wires this file's
  * entry points into the ctl_backend_driver callback table.
  */
 static struct ctl_backend_driver ctl_be_block_driver = 
 {
 	.name = "block",
 	.flags = CTL_BE_FLAG_HAS_CONFIG,
 	.init = ctl_be_block_init,
 	.shutdown = ctl_be_block_shutdown,
 	.data_submit = ctl_be_block_submit,
 	.data_move_done = ctl_be_block_move_done,
 	.config_read = ctl_be_block_config_read,
 	.config_write = ctl_be_block_config_write,
 	.ioctl = ctl_be_block_ioctl,
 	.lun_info = ctl_be_block_lun_info,
 	.lun_attr = ctl_be_block_lun_attr
 };
 
 /* malloc(9) type for this backend, and declaration of the driver as "cbb". */
 MALLOC_DEFINE(M_CTLBLK, "ctlblk", "Memory used for CTL block backend");
 CTL_BACKEND_DECLARE(cbb, ctl_be_block_driver);
 
 /*
  * Allocate a zero-filled per-I/O context from the softc's beio zone and
  * record the owning softc in it.  The allocation is made with M_WAITOK.
  */
 static struct ctl_be_block_io *
 ctl_alloc_beio(struct ctl_be_block_softc *softc)
 {
 	struct ctl_be_block_io *new_beio =
 	    uma_zalloc(softc->beio_zone, M_WAITOK | M_ZERO);
 
 	new_beio->softc = softc;
 	return (new_beio);
 }
 
 static void
 ctl_free_beio(struct ctl_be_block_io *beio)
 {
 	int duplicate_free;
 	int i;
 
 	duplicate_free = 0;
 
 	for (i = 0; i < beio->num_segs; i++) {
 		if (beio->sg_segs[i].addr == NULL)
 			duplicate_free++;
 
 		uma_zfree(beio->lun->lun_zone, beio->sg_segs[i].addr);
 		beio->sg_segs[i].addr = NULL;
 
 		/* For compare we had two equal S/G lists. */
 		if (ARGS(beio->io)->flags & CTL_LLF_COMPARE) {
 			uma_zfree(beio->lun->lun_zone,
 			    beio->sg_segs[i + CTLBLK_HALF_SEGS].addr);
 			beio->sg_segs[i + CTLBLK_HALF_SEGS].addr = NULL;
 		}
 	}
 
 	if (duplicate_free > 0) {
 		printf("%s: %d duplicate frees out of %d segments\n", __func__,
 		       duplicate_free, beio->num_segs);
 	}
 
 	uma_zfree(beio->softc->beio_zone, beio);
 }
 
 /*
  * Finish backend processing of a beio.  If a continuation is registered,
  * hand the beio to it; otherwise free the beio and report the I/O as done
  * via ctl_data_submit_done().
  */
 static void
 ctl_complete_beio(struct ctl_be_block_io *beio)
 {
 	union ctl_io *io;
 
 	if (beio->beio_cont != NULL) {
 		beio->beio_cont(beio);
 		return;
 	}
 
 	/* Fetch the io pointer before the beio that holds it is freed. */
 	io = beio->io;
 	ctl_free_beio(beio);
 	ctl_data_submit_done(io);
 }
 
 /*
  * Return the length of the common prefix of buffers a and b, looking at
  * no more than size bytes.  A result equal to size means the buffers are
  * identical over that range.
  */
 static size_t
 cmp(uint8_t *a, uint8_t *b, size_t size)
 {
 	size_t matched = 0;
 
 	while (matched < size && a[matched] == b[matched])
 		matched++;
 	return (matched);
 }
 
 static void
 ctl_be_block_compare(union ctl_io *io)
 {
 	struct ctl_be_block_io *beio;
 	uint64_t off, res;
 	int i;
 	uint8_t info[8];
 
 	beio = (struct ctl_be_block_io *)PRIV(io)->ptr;
 	off = 0;
 	for (i = 0; i < beio->num_segs; i++) {
 		res = cmp(beio->sg_segs[i].addr,
 		    beio->sg_segs[i + CTLBLK_HALF_SEGS].addr,
 		    beio->sg_segs[i].len);
 		off += res;
 		if (res < beio->sg_segs[i].len)
 			break;
 	}
 	if (i < beio->num_segs) {
 		scsi_u64to8b(off, info);
 		ctl_set_sense(&io->scsiio, /*current_error*/ 1,
 		    /*sense_key*/ SSD_KEY_MISCOMPARE,
 		    /*asc*/ 0x1D, /*ascq*/ 0x00,
 		    /*type*/ SSD_ELEM_INFO,
 		    /*size*/ sizeof(info), /*data*/ &info,
 		    /*type*/ SSD_ELEM_NONE);
 	} else
 		ctl_set_success(&io->scsiio);
 }
 
 /*
  * Data-move completion callback (the driver's data_move_done entry).
  *
  * Updates the DMA accounting of the I/O, sets the SCSI status where it
  * can already be decided, and then either completes the beio (reads,
  * aborted I/Os, anything with a status set) or queues the I/O on the
  * LUN's datamove_queue so the task queue can perform the backend write.
  *
  * Always returns 0.
  */
 static int
 ctl_be_block_move_done(union ctl_io *io)
 {
 	struct ctl_be_block_io *beio;
 	struct ctl_be_block_lun *be_lun;
 	struct ctl_lba_len_flags *lbalen;
 #ifdef CTL_TIME_IO
 	struct bintime cur_bt;
 #endif
 
 	beio = (struct ctl_be_block_io *)PRIV(io)->ptr;
 	be_lun = beio->lun;
 
 	DPRINTF("entered\n");
 
 #ifdef CTL_TIME_IO
 	/* Add the time elapsed since dma_start_bt to the I/O's DMA time. */
 	getbinuptime(&cur_bt);
 	bintime_sub(&cur_bt, &io->io_hdr.dma_start_bt);
 	bintime_add(&io->io_hdr.dma_bt, &cur_bt);
 #endif
 	io->io_hdr.num_dmas++;
 	io->scsiio.kern_rel_offset += io->scsiio.kern_data_len;
 
 	/*
 	 * We set status at this point for read commands, and write
 	 * commands with errors.
 	 */
 	if (io->io_hdr.flags & CTL_FLAG_ABORT) {
 		/* Aborted: leave the status untouched. */
 		;
 	} else if ((io->io_hdr.port_status != 0) &&
 	    ((io->io_hdr.status & CTL_STATUS_MASK) == CTL_STATUS_NONE ||
 	     (io->io_hdr.status & CTL_STATUS_MASK) == CTL_SUCCESS)) {
 		/* The port reported a transfer error. */
 		ctl_set_internal_failure(&io->scsiio, /*sks_valid*/ 1,
 		    /*retry_count*/ io->io_hdr.port_status);
 	} else if (io->scsiio.kern_data_resid != 0 &&
 	    (io->io_hdr.flags & CTL_FLAG_DATA_MASK) == CTL_FLAG_DATA_OUT &&
 	    ((io->io_hdr.status & CTL_STATUS_MASK) == CTL_STATUS_NONE ||
 	     (io->io_hdr.status & CTL_STATUS_MASK) == CTL_SUCCESS)) {
 		/* Data-out transfer left a residual. */
 		ctl_set_invalid_field_ciu(&io->scsiio);
 	} else if ((io->io_hdr.port_status == 0) &&
 	    ((io->io_hdr.status & CTL_STATUS_MASK) == CTL_STATUS_NONE)) {
 		lbalen = ARGS(beio->io);
 		if (lbalen->flags & CTL_LLF_READ) {
 			ctl_set_success(&io->scsiio);
 		} else if (lbalen->flags & CTL_LLF_COMPARE) {
 			/* We have two data blocks ready for comparison. */
 			ctl_be_block_compare(io);
 		}
 	}
 
 	/*
 	 * If this is a read, or a write with errors, it is done.
 	 */
 	if ((beio->bio_cmd == BIO_READ)
 	 || ((io->io_hdr.flags & CTL_FLAG_ABORT) != 0)
 	 || ((io->io_hdr.status & CTL_STATUS_MASK) != CTL_STATUS_NONE)) {
 		ctl_complete_beio(beio);
 		return (0);
 	}
 
 	/*
 	 * At this point, we have a write and the DMA completed
 	 * successfully.  We now have to queue it to the task queue to
 	 * execute the backend I/O.  That is because we do blocking
 	 * memory allocations, and in the file backing case, blocking I/O.
 	 * This move done routine is generally called in the SIM's
 	 * interrupt context, and therefore we cannot block.
 	 */
 	mtx_lock(&be_lun->queue_lock);
 	STAILQ_INSERT_TAIL(&be_lun->datamove_queue, &io->io_hdr, links);
 	mtx_unlock(&be_lun->queue_lock);
 	taskqueue_enqueue(be_lun->io_taskqueue, &be_lun->io_task);
 
 	return (0);
 }
 
 /*
  * bio completion handler for bios issued on behalf of a beio.
  *
  * The beio is taken from bio_caller1.  Under the LUN's io_lock this
  * records the error of the failed bio with the lowest offset, counts the
  * completion and destroys the bio.  Once all bios have been sent and have
  * completed, it ends the devstat transaction and either sets sense data
  * for the recorded error, completes the beio (write, flush, delete,
  * verify), or starts the data move to the initiator (read).
  */
 static void
 ctl_be_block_biodone(struct bio *bio)
 {
 	struct ctl_be_block_io *beio;
 	struct ctl_be_block_lun *be_lun;
 	union ctl_io *io;
 	int error;
 
 	beio = bio->bio_caller1;
 	be_lun = beio->lun;
 	io = beio->io;
 
 	DPRINTF("entered\n");
 
 	error = bio->bio_error;
 	mtx_lock(&be_lun->io_lock);
 	/* Keep the error belonging to the lowest failing offset. */
 	if (error != 0 &&
 	    (beio->first_error == 0 ||
 	     bio->bio_offset < beio->first_error_offset)) {
 		beio->first_error = error;
 		beio->first_error_offset = bio->bio_offset;
 	}
 
 	beio->num_bios_done++;
 
 	/*
 	 * XXX KDM will this cause WITNESS to complain?  Holding a lock
 	 * during the free might cause it to complain.
 	 */
 	g_destroy_bio(bio);
 
 	/*
 	 * If the send complete bit isn't set, or we aren't the last I/O to
 	 * complete, then we're done.
 	 */
 	if ((beio->send_complete == 0)
 	 || (beio->num_bios_done < beio->num_bios_sent)) {
 		mtx_unlock(&be_lun->io_lock);
 		return;
 	}
 
 	/*
 	 * At this point, we've verified that we are the last I/O to
 	 * complete, so it's safe to drop the lock.
 	 */
 	devstat_end_transaction(beio->lun->disk_stats, beio->io_len,
 	    beio->ds_tag_type, beio->ds_trans_type,
 	    /*now*/ NULL, /*then*/&beio->ds_t0);
 	mtx_unlock(&be_lun->io_lock);
 
 	/*
 	 * If there are any errors from the backing device, we fail the
 	 * entire I/O, mapping the errno to the closest SCSI sense data
 	 * (medium error when nothing more specific applies).
 	 */
 	error = beio->first_error;
 	if (error != 0) {
 		if (error == EOPNOTSUPP) {
 			ctl_set_invalid_opcode(&io->scsiio);
 		} else if (error == ENOSPC || error == EDQUOT) {
 			ctl_set_space_alloc_fail(&io->scsiio);
 		} else if (error == EROFS || error == EACCES) {
 			ctl_set_hw_write_protected(&io->scsiio);
 		} else if (beio->bio_cmd == BIO_FLUSH) {
 			/* XXX KDM is there a better error here? */
 			ctl_set_internal_failure(&io->scsiio,
 						 /*sks_valid*/ 1,
 						 /*retry_count*/ 0xbad2);
 		} else {
 			ctl_set_medium_error(&io->scsiio,
 			    beio->bio_cmd == BIO_READ);
 		}
 		ctl_complete_beio(beio);
 		return;
 	}
 
 	/*
 	 * If this is a write, a flush, a delete or verify, we're all done.
 	 * If this is a read, we can now send the data to the user.
 	 */
 	if ((beio->bio_cmd == BIO_WRITE)
 	 || (beio->bio_cmd == BIO_FLUSH)
 	 || (beio->bio_cmd == BIO_DELETE)
 	 || (ARGS(io)->flags & CTL_LLF_VERIFY)) {
 		ctl_set_success(&io->scsiio);
 		ctl_complete_beio(beio);
 	} else {
 		/* Plain read with no continuation: status is known already. */
 		if ((ARGS(io)->flags & CTL_LLF_READ) &&
 		    beio->beio_cont == NULL) {
 			ctl_set_success(&io->scsiio);
 			ctl_serseq_done(io);
 		}
 #ifdef CTL_TIME_IO
 		getbinuptime(&io->io_hdr.dma_start_bt);
 #endif
 		ctl_datamove(io);
 	}
 }
 
 /*
  * Flush a file-backed LUN by calling VOP_FSYNC() on the backing vnode.
  *
  * beio->io_arg selects the sync mode: non-zero requests MNT_NOWAIT,
  * zero requests MNT_WAIT.  The operation is accounted through devstat,
  * the SCSI status is set from the VOP_FSYNC() result, and the beio is
  * completed before returning.
  */
 static void
 ctl_be_block_flush_file(struct ctl_be_block_lun *be_lun,
 			struct ctl_be_block_io *beio)
 {
 	union ctl_io *io = beio->io;
 	struct mount *mountpoint;
 	int error, lock_flags;
 
 	DPRINTF("entered\n");
 
 	binuptime(&beio->ds_t0);
-	mtx_lock(&be_lun->io_lock);
 	devstat_start_transaction(beio->lun->disk_stats, &beio->ds_t0);
-	mtx_unlock(&be_lun->io_lock);
 
 	(void) vn_start_write(be_lun->vn, &mountpoint, V_WAIT);
 
 	/* A shared vnode lock suffices when the mount allows shared writes. */
 	if (MNT_SHARED_WRITES(mountpoint) ||
 	    ((mountpoint == NULL) && MNT_SHARED_WRITES(be_lun->vn->v_mount)))
 		lock_flags = LK_SHARED;
 	else
 		lock_flags = LK_EXCLUSIVE;
 	vn_lock(be_lun->vn, lock_flags | LK_RETRY);
 	error = VOP_FSYNC(be_lun->vn, beio->io_arg ? MNT_NOWAIT : MNT_WAIT,
 	    curthread);
 	VOP_UNLOCK(be_lun->vn, 0);
 
 	vn_finished_write(mountpoint);
 
 	mtx_lock(&be_lun->io_lock);
 	devstat_end_transaction(beio->lun->disk_stats, beio->io_len,
 	    beio->ds_tag_type, beio->ds_trans_type,
 	    /*now*/ NULL, /*then*/&beio->ds_t0);
 	mtx_unlock(&be_lun->io_lock);
 
 	if (error == 0)
 		ctl_set_success(&io->scsiio);
 	else {
 		/* XXX KDM is there a better error here? */
 		ctl_set_internal_failure(&io->scsiio,
 					 /*sks_valid*/ 1,
 					 /*retry_count*/ 0xbad1);
 	}
 
 	ctl_complete_beio(beio);
 }
 
 /*
  * SDT probes of the "cbb" provider, fired at the start and end of the
  * uio-based read/write paths (file and zvol dispatch).
  */
 SDT_PROBE_DEFINE1(cbb, , read, file_start, "uint64_t");
 SDT_PROBE_DEFINE1(cbb, , write, file_start, "uint64_t");
 SDT_PROBE_DEFINE1(cbb, , read, file_done,"uint64_t");
 SDT_PROBE_DEFINE1(cbb, , write, file_done, "uint64_t");
 
 /*
  * Perform a read or write against a file-backed LUN.
  *
  * Builds a kernel-space uio over the beio's S/G segments and issues
  * VOP_READ() or VOP_WRITE() on the backing vnode with the LUN's file
  * credentials.  CTL_LLF_DPO maps to IO_DIRECT and, for writes,
  * CTL_LLF_FUA maps to IO_SYNC.  A short read is padded with zeroes.
  * On error the errno is translated to sense data and the beio is
  * completed; on success writes and verifies are completed, while reads
  * proceed to ctl_datamove().
  */
 static void
 ctl_be_block_dispatch_file(struct ctl_be_block_lun *be_lun,
 			   struct ctl_be_block_io *beio)
 {
 	struct ctl_be_block_filedata *file_data;
 	union ctl_io *io;
 	struct uio xuio;
 	struct iovec *xiovec;
 	size_t s;
 	int error, flags, i;
 
 	DPRINTF("entered\n");
 
 	file_data = &be_lun->backend.file;
 	io = beio->io;
 	flags = 0;
 	if (ARGS(io)->flags & CTL_LLF_DPO)
 		flags |= IO_DIRECT;
 	if (beio->bio_cmd == BIO_WRITE && ARGS(io)->flags & CTL_LLF_FUA)
 		flags |= IO_SYNC;
 
 	bzero(&xuio, sizeof(xuio));
 	if (beio->bio_cmd == BIO_READ) {
 		SDT_PROBE0(cbb, , read, file_start);
 		xuio.uio_rw = UIO_READ;
 	} else {
 		SDT_PROBE0(cbb, , write, file_start);
 		xuio.uio_rw = UIO_WRITE;
 	}
 	xuio.uio_offset = beio->io_offset;
 	xuio.uio_resid = beio->io_len;
 	xuio.uio_segflg = UIO_SYSSPACE;
 	xuio.uio_iov = beio->xiovecs;
 	xuio.uio_iovcnt = beio->num_segs;
 	xuio.uio_td = curthread;
 
 	/* Mirror the S/G list into the iovec array used by the uio. */
 	for (i = 0, xiovec = xuio.uio_iov; i < xuio.uio_iovcnt; i++, xiovec++) {
 		xiovec->iov_base = beio->sg_segs[i].addr;
 		xiovec->iov_len = beio->sg_segs[i].len;
 	}
 
 	binuptime(&beio->ds_t0);
-	mtx_lock(&be_lun->io_lock);
 	devstat_start_transaction(beio->lun->disk_stats, &beio->ds_t0);
-	mtx_unlock(&be_lun->io_lock);
 
 	if (beio->bio_cmd == BIO_READ) {
 		vn_lock(be_lun->vn, LK_SHARED | LK_RETRY);
 
 		/*
 		 * UFS pays attention to IO_DIRECT for reads.  If the
 		 * DIRECTIO option is configured into the kernel, it calls
 		 * ffs_rawread().  But that only works for single-segment
 		 * uios with user space addresses.  In our case, with a
 		 * kernel uio, it still reads into the buffer cache, but it
 		 * will just try to release the buffer from the cache later
 		 * on in ffs_read().
 		 *
 		 * ZFS does not pay attention to IO_DIRECT for reads.
 		 *
 		 * UFS does not pay attention to IO_SYNC for reads.
 		 *
 		 * ZFS pays attention to IO_SYNC (which translates into the
 		 * Solaris define FRSYNC for zfs_read()) for reads.  It
 		 * attempts to sync the file before reading.
 		 */
 		error = VOP_READ(be_lun->vn, &xuio, flags, file_data->cred);
 
 		VOP_UNLOCK(be_lun->vn, 0);
 		SDT_PROBE0(cbb, , read, file_done);
 		if (error == 0 && xuio.uio_resid > 0) {
 			/*
 			 * If we read less than requested (EOF), then
 			 * we should zero the rest of the buffer.
 			 */
 			s = beio->io_len - xuio.uio_resid;
 			for (i = 0; i < beio->num_segs; i++) {
 				/* Skip segments that were filled completely. */
 				if (s >= beio->sg_segs[i].len) {
 					s -= beio->sg_segs[i].len;
 					continue;
 				}
 				bzero((uint8_t *)beio->sg_segs[i].addr + s,
 				    beio->sg_segs[i].len - s);
 				s = 0;
 			}
 		}
 	} else {
 		struct mount *mountpoint;
 		int lock_flags;
 
 		(void)vn_start_write(be_lun->vn, &mountpoint, V_WAIT);
 
 		if (MNT_SHARED_WRITES(mountpoint) || ((mountpoint == NULL)
 		  && MNT_SHARED_WRITES(be_lun->vn->v_mount)))
 			lock_flags = LK_SHARED;
 		else
 			lock_flags = LK_EXCLUSIVE;
 		vn_lock(be_lun->vn, lock_flags | LK_RETRY);
 
 		/*
 		 * UFS pays attention to IO_DIRECT for writes.  The write
 		 * is done asynchronously.  (Normally the write would just
 		 * get put into cache.)
 		 *
 		 * UFS pays attention to IO_SYNC for writes.  It will
 		 * attempt to write the buffer out synchronously if that
 		 * flag is set.
 		 *
 		 * ZFS does not pay attention to IO_DIRECT for writes.
 		 *
 		 * ZFS pays attention to IO_SYNC (a.k.a. FSYNC or FRSYNC)
 		 * for writes.  It will flush the transaction from the
 		 * cache before returning.
 		 */
 		error = VOP_WRITE(be_lun->vn, &xuio, flags, file_data->cred);
 		VOP_UNLOCK(be_lun->vn, 0);
 
 		vn_finished_write(mountpoint);
 		SDT_PROBE0(cbb, , write, file_done);
         }
 
 	mtx_lock(&be_lun->io_lock);
 	devstat_end_transaction(beio->lun->disk_stats, beio->io_len,
 	    beio->ds_tag_type, beio->ds_trans_type,
 	    /*now*/ NULL, /*then*/&beio->ds_t0);
 	mtx_unlock(&be_lun->io_lock);
 
 	/*
 	 * If we got an error, translate it to sense data (space allocation
 	 * failure, write protected, or otherwise "MEDIUM ERROR") and
 	 * return the I/O to the user.
 	 */
 	if (error != 0) {
 		if (error == ENOSPC || error == EDQUOT) {
 			ctl_set_space_alloc_fail(&io->scsiio);
 		} else if (error == EROFS || error == EACCES) {
 			ctl_set_hw_write_protected(&io->scsiio);
 		} else {
 			ctl_set_medium_error(&io->scsiio,
 			    beio->bio_cmd == BIO_READ);
 		}
 		ctl_complete_beio(beio);
 		return;
 	}
 
 	/*
 	 * If this is a write or a verify, we're all done.
 	 * If this is a read, we can now send the data to the user.
 	 */
 	if ((beio->bio_cmd == BIO_WRITE) ||
 	    (ARGS(io)->flags & CTL_LLF_VERIFY)) {
 		ctl_set_success(&io->scsiio);
 		ctl_complete_beio(beio);
 	} else {
 		if ((ARGS(io)->flags & CTL_LLF_READ) &&
 		    beio->beio_cont == NULL) {
 			ctl_set_success(&io->scsiio);
 			ctl_serseq_done(io);
 		}
 #ifdef CTL_TIME_IO
 		getbinuptime(&io->io_hdr.dma_start_bt);
 #endif
 		ctl_datamove(io);
 	}
 }
 
 static void
 ctl_be_block_gls_file(struct ctl_be_block_lun *be_lun,
 			struct ctl_be_block_io *beio)
 {
 	union ctl_io *io = beio->io;
 	struct ctl_lba_len_flags *lbalen = ARGS(io);
 	struct scsi_get_lba_status_data *data;
 	off_t roff, off;
 	int error, status;
 
 	DPRINTF("entered\n");
 
 	off = roff = ((off_t)lbalen->lba) * be_lun->cbe_lun.blocksize;
 	vn_lock(be_lun->vn, LK_SHARED | LK_RETRY);
 	error = VOP_IOCTL(be_lun->vn, FIOSEEKHOLE, &off,
 	    0, curthread->td_ucred, curthread);
 	if (error == 0 && off > roff)
 		status = 0;	/* mapped up to off */
 	else {
 		error = VOP_IOCTL(be_lun->vn, FIOSEEKDATA, &off,
 		    0, curthread->td_ucred, curthread);
 		if (error == 0 && off > roff)
 			status = 1;	/* deallocated up to off */
 		else {
 			status = 0;	/* unknown up to the end */
 			off = be_lun->size_bytes;
 		}
 	}
 	VOP_UNLOCK(be_lun->vn, 0);
 
 	data = (struct scsi_get_lba_status_data *)io->scsiio.kern_data_ptr;
 	scsi_u64to8b(lbalen->lba, data->descr[0].addr);
 	scsi_ulto4b(MIN(UINT32_MAX, off / be_lun->cbe_lun.blocksize -
 	    lbalen->lba), data->descr[0].length);
 	data->descr[0].status = status;
 
 	ctl_complete_beio(beio);
 }
 
 static uint64_t
 ctl_be_block_getattr_file(struct ctl_be_block_lun *be_lun, const char *attrname)
 {
 	struct vattr		vattr;
 	struct statfs		statfs;
 	uint64_t		val;
 	int			error;
 
 	val = UINT64_MAX;
 	if (be_lun->vn == NULL)
 		return (val);
 	vn_lock(be_lun->vn, LK_SHARED | LK_RETRY);
 	if (strcmp(attrname, "blocksused") == 0) {
 		error = VOP_GETATTR(be_lun->vn, &vattr, curthread->td_ucred);
 		if (error == 0)
 			val = vattr.va_bytes / be_lun->cbe_lun.blocksize;
 	}
 	if (strcmp(attrname, "blocksavail") == 0 &&
 	    !VN_IS_DOOMED(be_lun->vn)) {
 		error = VFS_STATFS(be_lun->vn->v_mount, &statfs);
 		if (error == 0)
 			val = statfs.f_bavail * statfs.f_bsize /
 			    be_lun->cbe_lun.blocksize;
 	}
 	VOP_UNLOCK(be_lun->vn, 0);
 	return (val);
 }
 
 /*
  * Perform a read or write through the backing device's d_read/d_write
  * methods.
  * NOTE(review): the name indicates this path is used for ZFS zvols;
  * the selection logic is outside this excerpt.
  *
  * Builds a kernel-space uio over the beio's S/G segments, takes a thread
  * reference on the device with devvn_refthread(), and calls d_read() or
  * d_write().  If no reference can be obtained the I/O fails with ENXIO.
  * Error translation and completion match ctl_be_block_dispatch_file().
  */
 static void
 ctl_be_block_dispatch_zvol(struct ctl_be_block_lun *be_lun,
 			   struct ctl_be_block_io *beio)
 {
 	union ctl_io *io;
 	struct cdevsw *csw;
 	struct cdev *dev;
 	struct uio xuio;
 	struct iovec *xiovec;
 	int error, flags, i, ref;
 
 	DPRINTF("entered\n");
 
 	io = beio->io;
 	flags = 0;
 	if (ARGS(io)->flags & CTL_LLF_DPO)
 		flags |= IO_DIRECT;
 	if (beio->bio_cmd == BIO_WRITE && ARGS(io)->flags & CTL_LLF_FUA)
 		flags |= IO_SYNC;
 
 	bzero(&xuio, sizeof(xuio));
 	if (beio->bio_cmd == BIO_READ) {
 		SDT_PROBE0(cbb, , read, file_start);
 		xuio.uio_rw = UIO_READ;
 	} else {
 		SDT_PROBE0(cbb, , write, file_start);
 		xuio.uio_rw = UIO_WRITE;
 	}
 	xuio.uio_offset = beio->io_offset;
 	xuio.uio_resid = beio->io_len;
 	xuio.uio_segflg = UIO_SYSSPACE;
 	xuio.uio_iov = beio->xiovecs;
 	xuio.uio_iovcnt = beio->num_segs;
 	xuio.uio_td = curthread;
 
 	/* Mirror the S/G list into the iovec array used by the uio. */
 	for (i = 0, xiovec = xuio.uio_iov; i < xuio.uio_iovcnt; i++, xiovec++) {
 		xiovec->iov_base = beio->sg_segs[i].addr;
 		xiovec->iov_len = beio->sg_segs[i].len;
 	}
 
 	binuptime(&beio->ds_t0);
-	mtx_lock(&be_lun->io_lock);
 	devstat_start_transaction(beio->lun->disk_stats, &beio->ds_t0);
-	mtx_unlock(&be_lun->io_lock);
 
 	/* Hold a thread reference on the device for the duration of the call. */
 	csw = devvn_refthread(be_lun->vn, &dev, &ref);
 	if (csw) {
 		if (beio->bio_cmd == BIO_READ)
 			error = csw->d_read(dev, &xuio, flags);
 		else
 			error = csw->d_write(dev, &xuio, flags);
 		dev_relthread(dev, ref);
 	} else
 		error = ENXIO;
 
 	if (beio->bio_cmd == BIO_READ)
 		SDT_PROBE0(cbb, , read, file_done);
 	else
 		SDT_PROBE0(cbb, , write, file_done);
 
 	mtx_lock(&be_lun->io_lock);
 	devstat_end_transaction(beio->lun->disk_stats, beio->io_len,
 	    beio->ds_tag_type, beio->ds_trans_type,
 	    /*now*/ NULL, /*then*/&beio->ds_t0);
 	mtx_unlock(&be_lun->io_lock);
 
 	/*
 	 * If we got an error, translate it to sense data (space allocation
 	 * failure, write protected, or otherwise "MEDIUM ERROR") and
 	 * return the I/O to the user.
 	 */
 	if (error != 0) {
 		if (error == ENOSPC || error == EDQUOT) {
 			ctl_set_space_alloc_fail(&io->scsiio);
 		} else if (error == EROFS || error == EACCES) {
 			ctl_set_hw_write_protected(&io->scsiio);
 		} else {
 			ctl_set_medium_error(&io->scsiio,
 			    beio->bio_cmd == BIO_READ);
 		}
 		ctl_complete_beio(beio);
 		return;
 	}
 
 	/*
 	 * If this is a write or a verify, we're all done.
 	 * If this is a read, we can now send the data to the user.
 	 */
 	if ((beio->bio_cmd == BIO_WRITE) ||
 	    (ARGS(io)->flags & CTL_LLF_VERIFY)) {
 		ctl_set_success(&io->scsiio);
 		ctl_complete_beio(beio);
 	} else {
 		if ((ARGS(io)->flags & CTL_LLF_READ) &&
 		    beio->beio_cont == NULL) {
 			ctl_set_success(&io->scsiio);
 			ctl_serseq_done(io);
 		}
 #ifdef CTL_TIME_IO
 		getbinuptime(&io->io_hdr.dma_start_bt);
 #endif
 		ctl_datamove(io);
 	}
 }
 
 static void
 ctl_be_block_gls_zvol(struct ctl_be_block_lun *be_lun,
 			struct ctl_be_block_io *beio)
 {
 	union ctl_io *io = beio->io;
 	struct cdevsw *csw;
 	struct cdev *dev;
 	struct ctl_lba_len_flags *lbalen = ARGS(io);
 	struct scsi_get_lba_status_data *data;
 	off_t roff, off;
 	int error, ref, status;
 
 	DPRINTF("entered\n");
 
 	csw = devvn_refthread(be_lun->vn, &dev, &ref);
 	if (csw == NULL) {
 		status = 0;	/* unknown up to the end */
 		off = be_lun->size_bytes;
 		goto done;
 	}
 	off = roff = ((off_t)lbalen->lba) * be_lun->cbe_lun.blocksize;
 	error = csw->d_ioctl(dev, FIOSEEKHOLE, (caddr_t)&off, FREAD,
 	    curthread);
 	if (error == 0 && off > roff)
 		status = 0;	/* mapped up to off */
 	else {
 		error = csw->d_ioctl(dev, FIOSEEKDATA, (caddr_t)&off, FREAD,
 		    curthread);
 		if (error == 0 && off > roff)
 			status = 1;	/* deallocated up to off */
 		else {
 			status = 0;	/* unknown up to the end */
 			off = be_lun->size_bytes;
 		}
 	}
 	dev_relthread(dev, ref);
 
 done:
 	data = (struct scsi_get_lba_status_data *)io->scsiio.kern_data_ptr;
 	scsi_u64to8b(lbalen->lba, data->descr[0].addr);
 	scsi_ulto4b(MIN(UINT32_MAX, off / be_lun->cbe_lun.blocksize -
 	    lbalen->lba), data->descr[0].length);
 	data->descr[0].status = status;
 
 	ctl_complete_beio(beio);
 }
 
 /*
  * Flush a device-backed LUN by sending a single BIO_FLUSH bio to the
  * device's d_strategy method.  Completion is handled by
  * ctl_be_block_biodone(); if no device reference can be obtained, the bio
  * is completed immediately with ENXIO.
  */
 static void
 ctl_be_block_flush_dev(struct ctl_be_block_lun *be_lun,
 		       struct ctl_be_block_io *beio)
 {
 	struct bio *bio;
 	struct cdevsw *csw;
 	struct cdev *dev;
 	int ref;
 
 	DPRINTF("entered\n");
 
 	/* This can't fail, it's a blocking allocation. */
 	bio = g_alloc_bio();
 
 	bio->bio_cmd	    = BIO_FLUSH;
 	bio->bio_offset	    = 0;
 	bio->bio_data	    = 0;
 	bio->bio_done	    = ctl_be_block_biodone;
 	bio->bio_caller1    = beio;
 	bio->bio_pblkno	    = 0;
 
 	/*
 	 * We don't need to acquire the LUN lock here, because we are only
 	 * sending one bio, and so there is no other context to synchronize
 	 * with.
 	 */
 	beio->num_bios_sent = 1;
 	beio->send_complete = 1;
 
 	binuptime(&beio->ds_t0);
-	mtx_lock(&be_lun->io_lock);
 	devstat_start_transaction(be_lun->disk_stats, &beio->ds_t0);
-	mtx_unlock(&be_lun->io_lock);
 
 	csw = devvn_refthread(be_lun->vn, &dev, &ref);
 	if (csw) {
 		bio->bio_dev = dev;
 		csw->d_strategy(bio);
 		dev_relthread(dev, ref);
 	} else {
 		/* Device is gone: fail the bio through the normal done path. */
 		bio->bio_error = ENXIO;
 		ctl_be_block_biodone(bio);
 	}
 }
 
 /*
  * Issue BIO_DELETE requests covering the byte range [off, off + len) of a
  * device-backed LUN.
  *
  * The range is split into bios of at most maxlen bytes, where maxlen is
  * LONG_MAX rounded down to a multiple of the LUN block size.  Each bio is
  * counted in beio->num_bios_sent under io_lock; when 'last' is non-zero,
  * send_complete is set together with the final bio so that
  * ctl_be_block_biodone() can finish the I/O.  If no thread reference on
  * the device can be obtained, every bio is completed with ENXIO.
  *
  * A zero 'len' sends nothing and therefore never sets send_complete.
  */
 static void
 ctl_be_block_unmap_dev_range(struct ctl_be_block_lun *be_lun,
 		       struct ctl_be_block_io *beio,
 		       uint64_t off, uint64_t len, int last)
 {
 	struct bio *bio;
 	uint64_t maxlen;
 	struct cdevsw *csw;
 	struct cdev *dev;
 	int ref;
 
 	csw = devvn_refthread(be_lun->vn, &dev, &ref);
 	maxlen = LONG_MAX - (LONG_MAX % be_lun->cbe_lun.blocksize);
 	while (len > 0) {
 		bio = g_alloc_bio();
 		bio->bio_cmd	    = BIO_DELETE;
 		bio->bio_offset	    = off;
 		bio->bio_length	    = MIN(len, maxlen);
 		bio->bio_data	    = 0;
 		bio->bio_done	    = ctl_be_block_biodone;
 		bio->bio_caller1    = beio;
 		bio->bio_pblkno     = off / be_lun->cbe_lun.blocksize;
 
 		off += bio->bio_length;
 		len -= bio->bio_length;
 
 		mtx_lock(&be_lun->io_lock);
 		beio->num_bios_sent++;
 		if (last && len == 0)
 			beio->send_complete = 1;
 		mtx_unlock(&be_lun->io_lock);
 
 		if (csw) {
 			/*
 			 * Only use 'dev' when the reference was obtained.
 			 * On the failure path the bio goes straight to
 			 * ctl_be_block_biodone(), which does not use
 			 * bio_dev.
 			 */
 			bio->bio_dev = dev;
 			csw->d_strategy(bio);
 		} else {
 			bio->bio_error = ENXIO;
 			ctl_be_block_biodone(bio);
 		}
 	}
 	if (csw)
 		dev_relthread(dev, ref);
 }
 
 /*
  * Unmap (BIO_DELETE) handler for device-backed LUNs.
  *
  * Two request shapes are handled:
  *  - beio->io_offset == -1: the I/O carries a list of scsi_unmap_desc
  *    entries (pointer and byte length taken from the CTL_PRIV_LBA_LEN
  *    private area); each descriptor is turned into its own delete range and
  *    beio->io_len accumulates the total number of bytes.
  *  - otherwise: a single range described by beio->io_offset/io_len.
  */
 static void
 ctl_be_block_unmap_dev(struct ctl_be_block_lun *be_lun,
 		       struct ctl_be_block_io *beio)
 {
 	union ctl_io *io;
 	struct ctl_ptr_len_flags *ptrlen;
 	struct scsi_unmap_desc *buf, *end;
 	uint64_t len;
 
 	io = beio->io;
 
 	DPRINTF("entered\n");
 
 	binuptime(&beio->ds_t0);
-	mtx_lock(&be_lun->io_lock);
 	devstat_start_transaction(be_lun->disk_stats, &beio->ds_t0);
-	mtx_unlock(&be_lun->io_lock);
 
 	if (beio->io_offset == -1) {
 		beio->io_len = 0;
 		ptrlen = (struct ctl_ptr_len_flags *)&io->io_hdr.ctl_private[CTL_PRIV_LBA_LEN];
 		buf = (struct scsi_unmap_desc *)ptrlen->ptr;
 		end = buf + ptrlen->len / sizeof(*buf);
 		for (; buf < end; buf++) {
 			/* Descriptor length is in blocks; convert to bytes. */
 			len = (uint64_t)scsi_4btoul(buf->length) *
 			    be_lun->cbe_lun.blocksize;
 			beio->io_len += len;
 			/* "last" is TRUE only for the final descriptor. */
 			ctl_be_block_unmap_dev_range(be_lun, beio,
 			    scsi_8btou64(buf->lba) * be_lun->cbe_lun.blocksize,
 			    len, (end - buf < 2) ? TRUE : FALSE);
 		}
 	} else
 		ctl_be_block_unmap_dev_range(be_lun, beio,
 		    beio->io_offset, beio->io_len, TRUE);
 }
 
 /*
  * Read/write dispatch for device-backed LUNs.
  *
  * Every S/G segment of the beio is split into bios no larger than the
  * device's maximum I/O size.  All bios are first collected on a local queue
  * and counted in beio->num_bios_sent, and only then handed to the driver's
  * d_strategy routine.  If no cdevsw reference can be obtained, each bio is
  * completed with ENXIO through ctl_be_block_biodone() instead.
  */
 static void
 ctl_be_block_dispatch_dev(struct ctl_be_block_lun *be_lun,
 			  struct ctl_be_block_io *beio)
 {
 	TAILQ_HEAD(, bio) queue = TAILQ_HEAD_INITIALIZER(queue);
 	struct bio *bio;
 	struct cdevsw *csw;
 	struct cdev *dev;
 	off_t cur_offset;
 	int i, max_iosize, ref;
 
 	DPRINTF("entered\n");
 	/*
 	 * NOTE(review): dev is stored into bio_dev below even when csw is
 	 * NULL; confirm devvn_refthread() leaves it defined on failure.
 	 */
 	csw = devvn_refthread(be_lun->vn, &dev, &ref);
 
 	/*
 	 * We have to limit our I/O size to the maximum supported by the
 	 * backend device.  Hopefully it is MAXPHYS.  If the driver doesn't
 	 * set it properly, use DFLTPHYS.
 	 */
 	if (csw) {
 		max_iosize = dev->si_iosize_max;
 		if (max_iosize < PAGE_SIZE)
 			max_iosize = DFLTPHYS;
 	} else
 		max_iosize = DFLTPHYS;
 
 	cur_offset = beio->io_offset;
 	for (i = 0; i < beio->num_segs; i++) {
 		size_t cur_size;
 		uint8_t *cur_ptr;
 
 		cur_size = beio->sg_segs[i].len;
 		cur_ptr = beio->sg_segs[i].addr;
 
 		/* Carve this segment into bios of at most max_iosize bytes. */
 		while (cur_size > 0) {
 			/* This can't fail, it's a blocking allocation. */
 			bio = g_alloc_bio();
 
 			KASSERT(bio != NULL, ("g_alloc_bio() failed!\n"));
 
 			bio->bio_cmd = beio->bio_cmd;
 			bio->bio_dev = dev;
 			bio->bio_caller1 = beio;
 			bio->bio_length = min(cur_size, max_iosize);
 			bio->bio_offset = cur_offset;
 			bio->bio_data = cur_ptr;
 			bio->bio_done = ctl_be_block_biodone;
 			bio->bio_pblkno = cur_offset / be_lun->cbe_lun.blocksize;
 
 			cur_offset += bio->bio_length;
 			cur_ptr += bio->bio_length;
 			cur_size -= bio->bio_length;
 
 			TAILQ_INSERT_TAIL(&queue, bio, bio_queue);
 			beio->num_bios_sent++;
 		}
 	}
+	beio->send_complete = 1;
 	binuptime(&beio->ds_t0);
-	mtx_lock(&be_lun->io_lock);
 	devstat_start_transaction(be_lun->disk_stats, &beio->ds_t0);
-	beio->send_complete = 1;
-	mtx_unlock(&be_lun->io_lock);
 
 	/*
 	 * Fire off all allocated requests!
 	 */
 	while ((bio = TAILQ_FIRST(&queue)) != NULL) {
 		TAILQ_REMOVE(&queue, bio, bio_queue);
 		if (csw)
 			csw->d_strategy(bio);
 		else {
 			bio->bio_error = ENXIO;
 			ctl_be_block_biodone(bio);
 		}
 	}
 	if (csw)
 		dev_relthread(dev, ref);
 }
 
 static uint64_t
 ctl_be_block_getattr_dev(struct ctl_be_block_lun *be_lun, const char *attrname)
 {
 	struct diocgattr_arg	arg;
 	struct cdevsw *csw;
 	struct cdev *dev;
 	int error, ref;
 
 	csw = devvn_refthread(be_lun->vn, &dev, &ref);
 	if (csw == NULL)
 		return (UINT64_MAX);
 	strlcpy(arg.name, attrname, sizeof(arg.name));
 	arg.len = sizeof(arg.value.off);
 	if (csw->d_ioctl) {
 		error = csw->d_ioctl(dev, DIOCGATTR, (caddr_t)&arg, FREAD,
 		    curthread);
 	} else
 		error = ENODEV;
 	dev_relthread(dev, ref);
 	if (error != 0)
 		return (UINT64_MAX);
 	return (arg.value.off);
 }
 
 static void
 ctl_be_block_cw_dispatch_sync(struct ctl_be_block_lun *be_lun,
 			    union ctl_io *io)
 {
 	struct ctl_be_lun *cbe_lun = &be_lun->cbe_lun;
 	struct ctl_be_block_io *beio;
 	struct ctl_lba_len_flags *lbalen;
 
 	DPRINTF("entered\n");
 	beio = (struct ctl_be_block_io *)PRIV(io)->ptr;
 	lbalen = (struct ctl_lba_len_flags *)&io->io_hdr.ctl_private[CTL_PRIV_LBA_LEN];
 
 	beio->io_len = lbalen->len * cbe_lun->blocksize;
 	beio->io_offset = lbalen->lba * cbe_lun->blocksize;
 	beio->io_arg = (lbalen->flags & SSC_IMMED) != 0;
 	beio->bio_cmd = BIO_FLUSH;
 	beio->ds_trans_type = DEVSTAT_NO_DATA;
 	DPRINTF("SYNC\n");
 	be_lun->lun_flush(be_lun, beio);
 }
 
 /*
  * Continuation for a WRITE SAME that was split into several passes.
  *
  * The finished beio is released; unless the I/O was aborted or already
  * carries a non-success status, it is resubmitted through
  * ctl_be_block_config_write().  Otherwise it is completed.
  */
 static void
 ctl_be_block_cw_done_ws(struct ctl_be_block_io *beio)
 {
 	union ctl_io *io = beio->io;
 
 	/* io was read above, before beio is handed to ctl_free_beio(). */
 	ctl_free_beio(beio);
 
 	if ((io->io_hdr.flags & CTL_FLAG_ABORT) == 0 &&
 	    ((io->io_hdr.status & CTL_STATUS_MASK) == CTL_STATUS_NONE ||
 	     (io->io_hdr.status & CTL_STATUS_MASK) == CTL_SUCCESS)) {
 		ctl_be_block_config_write(io);
 	} else {
 		ctl_config_write_done(io);
 	}
 }
 
 /*
  * Handle WRITE SAME.
  *
  * Requests with unsupported flags, or asking for UNMAP/ANCHOR on a LUN
  * without an unmap method, are rejected with an invalid-field error.
  * UNMAP/ANCHOR requests are forwarded to be_lun->unmap as a BIO_DELETE.
  * Otherwise up to CTLBLK_MAX_SEGS segments are filled with copies of the
  * pattern block (zeroes for NDOB, LBA-stamped for LBDATA) and written out.
  * When the request does not fit in one pass, lbalen is advanced and
  * ctl_be_block_cw_done_ws() is installed as continuation to schedule the
  * remainder.
  */
 static void
 ctl_be_block_cw_dispatch_ws(struct ctl_be_block_lun *be_lun,
 			    union ctl_io *io)
 {
 	struct ctl_be_lun *cbe_lun = &be_lun->cbe_lun;
 	struct ctl_be_block_io *beio;
 	struct ctl_lba_len_flags *lbalen;
 	uint64_t len_left, lba;
 	uint32_t pb, pbo, adj;
 	int i, seglen;
 	uint8_t *buf, *end;
 
 	DPRINTF("entered\n");
 
 	beio = (struct ctl_be_block_io *)PRIV(io)->ptr;
 	lbalen = ARGS(beio->io);
 
 	if (lbalen->flags & ~(SWS_LBDATA | SWS_UNMAP | SWS_ANCHOR | SWS_NDOB) ||
 	    (lbalen->flags & (SWS_UNMAP | SWS_ANCHOR) && be_lun->unmap == NULL)) {
 		ctl_free_beio(beio);
 		ctl_set_invalid_field(&io->scsiio,
 				      /*sks_valid*/ 1,
 				      /*command*/ 1,
 				      /*field*/ 1,
 				      /*bit_valid*/ 0,
 				      /*bit*/ 0);
 		ctl_config_write_done(io);
 		return;
 	}
 
 	/* Unmap-style WRITE SAME: no data to write, just delete the range. */
 	if (lbalen->flags & (SWS_UNMAP | SWS_ANCHOR)) {
 		beio->io_offset = lbalen->lba * cbe_lun->blocksize;
 		beio->io_len = (uint64_t)lbalen->len * cbe_lun->blocksize;
 		beio->bio_cmd = BIO_DELETE;
 		beio->ds_trans_type = DEVSTAT_FREE;
 
 		be_lun->unmap(be_lun, beio);
 		return;
 	}
 
 	beio->bio_cmd = BIO_WRITE;
 	beio->ds_trans_type = DEVSTAT_WRITE;
 
 	DPRINTF("WRITE SAME at LBA %jx len %u\n",
 	       (uintmax_t)lbalen->lba, lbalen->len);
 
 	/* pb: physical block size in bytes; pbo: its byte offset, if any. */
 	pb = cbe_lun->blocksize << be_lun->cbe_lun.pblockexp;
 	if (be_lun->cbe_lun.pblockoff > 0)
 		pbo = pb - cbe_lun->blocksize * be_lun->cbe_lun.pblockoff;
 	else
 		pbo = 0;
 	len_left = (uint64_t)lbalen->len * cbe_lun->blocksize;
 	/* lba counts the blocks prepared so far in this pass. */
 	for (i = 0, lba = 0; i < CTLBLK_MAX_SEGS && len_left > 0; i++) {
 
 		/*
 		 * Setup the S/G entry for this chunk.
 		 */
 		seglen = MIN(CTLBLK_MAX_SEG, len_left);
 		if (pb > cbe_lun->blocksize) {
 			/*
 			 * adj is how far the segment's end lies past a
 			 * physical block boundary; trim it off when possible
 			 * so the segment ends on that boundary.
 			 */
 			adj = ((lbalen->lba + lba) * cbe_lun->blocksize +
 			    seglen - pbo) % pb;
 			if (seglen > adj)
 				seglen -= adj;
 			else
 				seglen -= seglen % cbe_lun->blocksize;
 		} else
 			seglen -= seglen % cbe_lun->blocksize;
 		beio->sg_segs[i].len = seglen;
 		beio->sg_segs[i].addr = uma_zalloc(be_lun->lun_zone, M_WAITOK);
 
 		DPRINTF("segment %d addr %p len %zd\n", i,
 			beio->sg_segs[i].addr, beio->sg_segs[i].len);
 
 		beio->num_segs++;
 		len_left -= seglen;
 
 		/* Replicate the pattern block across the whole segment. */
 		buf = beio->sg_segs[i].addr;
 		end = buf + seglen;
 		for (; buf < end; buf += cbe_lun->blocksize) {
 			if (lbalen->flags & SWS_NDOB) {
 				memset(buf, 0, cbe_lun->blocksize);
 			} else {
 				memcpy(buf, io->scsiio.kern_data_ptr,
 				    cbe_lun->blocksize);
 			}
 			/* LBDATA: overwrite the first 4 bytes with the LBA. */
 			if (lbalen->flags & SWS_LBDATA)
 				scsi_ulto4b(lbalen->lba + lba, buf);
 			lba++;
 		}
 	}
 
 	beio->io_offset = lbalen->lba * cbe_lun->blocksize;
 	beio->io_len = lba * cbe_lun->blocksize;
 
 	/* We can not do all in one run. Correct and schedule rerun. */
 	if (len_left > 0) {
 		lbalen->lba += lba;
 		lbalen->len -= lba;
 		beio->beio_cont = ctl_be_block_cw_done_ws;
 	}
 
 	be_lun->dispatch(be_lun, beio);
 }
 
 /*
  * Handle the UNMAP command.
  *
  * The request is accepted only when the LUN has an unmap method and no flag
  * other than SU_ANCHOR is set; in that case the beio is marked with
  * io_offset == -1 and passed to be_lun->unmap.  Anything else is answered
  * with an invalid-field error.
  */
 static void
 ctl_be_block_cw_dispatch_unmap(struct ctl_be_block_lun *be_lun,
 			    union ctl_io *io)
 {
 	struct ctl_be_block_io *beio;
 	struct ctl_ptr_len_flags *args;
 
 	DPRINTF("entered\n");
 
 	beio = (struct ctl_be_block_io *)PRIV(io)->ptr;
 	args = (struct ctl_ptr_len_flags *)&io->io_hdr.ctl_private[CTL_PRIV_LBA_LEN];
 
 	if (be_lun->unmap != NULL && (args->flags & ~SU_ANCHOR) == 0) {
 		beio->bio_cmd = BIO_DELETE;
 		beio->ds_trans_type = DEVSTAT_FREE;
 		beio->io_offset = -1;
 		beio->io_len = 0;
 		DPRINTF("UNMAP\n");
 		be_lun->unmap(be_lun, beio);
 		return;
 	}
 
 	/* Unsupported flags or no unmap support: fail the command. */
 	ctl_free_beio(beio);
 	ctl_set_invalid_field(&io->scsiio,
 			      /*sks_valid*/ 0,
 			      /*command*/ 1,
 			      /*field*/ 0,
 			      /*bit_valid*/ 0,
 			      /*bit*/ 0);
 	ctl_config_write_done(io);
 }
 
 /*
  * Completion continuation for config-read requests: release the beio and
  * report the I/O as done.
  */
 static void
 ctl_be_block_cr_done(struct ctl_be_block_io *beio)
 {
 	union ctl_io *owner = beio->io;
 
 	/* owner was read above, before beio is handed to ctl_free_beio(). */
 	ctl_free_beio(beio);
 	ctl_config_read_done(owner);
 }
 
 static void
 ctl_be_block_cr_dispatch(struct ctl_be_block_lun *be_lun,
 			 union ctl_io *io)
 {
 	struct ctl_be_block_io *beio;
 	struct ctl_be_block_softc *softc;
 
 	DPRINTF("entered\n");
 
 	softc = be_lun->softc;
 	beio = ctl_alloc_beio(softc);
 	beio->io = io;
 	beio->lun = be_lun;
 	beio->beio_cont = ctl_be_block_cr_done;
 	PRIV(io)->ptr = (void *)beio;
 
 	switch (io->scsiio.cdb[0]) {
 	case SERVICE_ACTION_IN:		/* GET LBA STATUS */
 		beio->bio_cmd = -1;
 		beio->ds_trans_type = DEVSTAT_NO_DATA;
 		beio->ds_tag_type = DEVSTAT_TAG_ORDERED;
 		beio->io_len = 0;
 		if (be_lun->get_lba_status)
 			be_lun->get_lba_status(be_lun, beio);
 		else
 			ctl_be_block_cr_done(beio);
 		break;
 	default:
 		panic("Unhandled CDB type %#x", io->scsiio.cdb[0]);
 		break;
 	}
 }
 
 /*
  * Completion continuation for config-write requests: release the beio and
  * report the I/O as done.
  */
 static void
 ctl_be_block_cw_done(struct ctl_be_block_io *beio)
 {
 	union ctl_io *owner = beio->io;
 
 	/* owner was read above, before beio is handed to ctl_free_beio(). */
 	ctl_free_beio(beio);
 	ctl_config_write_done(owner);
 }
 
 /*
  * Dispatch a config-write request.
  *
  * Allocates a beio, attaches it to the I/O, maps the SCSI tag type to the
  * matching devstat tag type and hands the request to the handler for its
  * opcode (SYNCHRONIZE CACHE, WRITE SAME or UNMAP).  Any other opcode
  * panics.
  */
 static void
 ctl_be_block_cw_dispatch(struct ctl_be_block_lun *be_lun,
 			 union ctl_io *io)
 {
 	struct ctl_be_block_io *beio;
 
 	DPRINTF("entered\n");
 
 	beio = ctl_alloc_beio(be_lun->softc);
 	beio->lun = be_lun;
 	beio->io = io;
 	beio->beio_cont = ctl_be_block_cw_done;
 
 	/* Everything that is not ordered or head-of-queue counts as simple. */
 	if (io->scsiio.tag_type == CTL_TAG_ORDERED)
 		beio->ds_tag_type = DEVSTAT_TAG_ORDERED;
 	else if (io->scsiio.tag_type == CTL_TAG_HEAD_OF_QUEUE)
 		beio->ds_tag_type = DEVSTAT_TAG_HEAD;
 	else
 		beio->ds_tag_type = DEVSTAT_TAG_SIMPLE;
 
 	PRIV(io)->ptr = (void *)beio;
 
 	switch (io->scsiio.cdb[0]) {
 	case UNMAP:
 		ctl_be_block_cw_dispatch_unmap(be_lun, io);
 		return;
 	case WRITE_SAME_10:
 	case WRITE_SAME_16:
 		ctl_be_block_cw_dispatch_ws(be_lun, io);
 		return;
 	case SYNCHRONIZE_CACHE:
 	case SYNCHRONIZE_CACHE_16:
 		ctl_be_block_cw_dispatch_sync(be_lun, io);
 		return;
 	default:
 		panic("Unhandled CDB type %#x", io->scsiio.cdb[0]);
 	}
 }
 
 /*
  * DTrace SDT probes for the data path.  ctl_be_block_dispatch() fires the
  * "start" probe on entry and the "alloc_done" probe once its buffers are
  * set up, using the read or write variant as appropriate.
  * NOTE(review): these are defined with one uint64_t argument but fired via
  * SDT_PROBE0 -- confirm which is intended.
  */
 SDT_PROBE_DEFINE1(cbb, , read, start, "uint64_t");
 SDT_PROBE_DEFINE1(cbb, , write, start, "uint64_t");
 SDT_PROBE_DEFINE1(cbb, , read, alloc_done, "uint64_t");
 SDT_PROBE_DEFINE1(cbb, , write, alloc_done, "uint64_t");
 
 /*
  * Continuation for a read/write that is processed in several chunks.
  *
  * Releases the finished beio.  If the I/O was aborted or already carries a
  * non-success status it is completed; otherwise its status is reset to
  * CTL_STATUS_NONE and it is put back on the LUN's input queue so the worker
  * picks up the next chunk.
  */
 static void
 ctl_be_block_next(struct ctl_be_block_io *beio)
 {
 	struct ctl_be_block_lun *lun = beio->lun;
 	union ctl_io *cur = beio->io;
 
 	/* Both fields were read above, before beio is released. */
 	ctl_free_beio(beio);
 
 	if ((cur->io_hdr.flags & CTL_FLAG_ABORT) == 0 &&
 	    ((cur->io_hdr.status & CTL_STATUS_MASK) == CTL_STATUS_NONE ||
 	     (cur->io_hdr.status & CTL_STATUS_MASK) == CTL_SUCCESS)) {
 		cur->io_hdr.status &= ~CTL_STATUS_MASK;
 		cur->io_hdr.status |= CTL_STATUS_NONE;
 
 		mtx_lock(&lun->queue_lock);
 		STAILQ_INSERT_TAIL(&lun->input_queue, &cur->io_hdr, links);
 		mtx_unlock(&lun->queue_lock);
 		taskqueue_enqueue(lun->io_taskqueue, &lun->io_task);
 	} else {
 		ctl_data_submit_done(cur);
 	}
 }
 
 /*
  * Start (or continue) a read/write/compare request taken from the input
  * queue.
  *
  * PRIV(io)->len counts the blocks of this I/O handled so far; each call
  * processes the next chunk of at most CTLBLK_MAX_IO_SIZE bytes
  * (CTLBLK_HALF_IO_SIZE for compare), allocates the S/G buffers for it and
  * either dispatches the read to the backend or starts the data move from
  * the initiator for a write.  If more blocks remain afterwards,
  * ctl_be_block_next() is installed as continuation.
  */
 static void
 ctl_be_block_dispatch(struct ctl_be_block_lun *be_lun,
 			   union ctl_io *io)
 {
 	struct ctl_be_lun *cbe_lun = &be_lun->cbe_lun;
 	struct ctl_be_block_io *beio;
 	struct ctl_be_block_softc *softc;
 	struct ctl_lba_len_flags *lbalen;
 	struct ctl_ptr_len_flags *bptrlen;
 	uint64_t len_left, lbas;
 	int i;
 
 	softc = be_lun->softc;
 
 	DPRINTF("entered\n");
 
 	lbalen = ARGS(io);
 	if (lbalen->flags & CTL_LLF_WRITE) {
 		SDT_PROBE0(cbb, , write, start);
 	} else {
 		SDT_PROBE0(cbb, , read, start);
 	}
 
 	beio = ctl_alloc_beio(softc);
 	beio->io = io;
 	beio->lun = be_lun;
 	/* The private area holds the beio pointer and the progress count. */
 	bptrlen = PRIV(io);
 	bptrlen->ptr = (void *)beio;
 
 	/* Map the SCSI tag type to the devstat tag type. */
 	switch (io->scsiio.tag_type) {
 	case CTL_TAG_ORDERED:
 		beio->ds_tag_type = DEVSTAT_TAG_ORDERED;
 		break;
 	case CTL_TAG_HEAD_OF_QUEUE:
 		beio->ds_tag_type = DEVSTAT_TAG_HEAD;
 		break;
 	case CTL_TAG_UNTAGGED:
 	case CTL_TAG_SIMPLE:
 	case CTL_TAG_ACA:
 	default:
 		beio->ds_tag_type = DEVSTAT_TAG_SIMPLE;
 		break;
 	}
 
 	if (lbalen->flags & CTL_LLF_WRITE) {
 		beio->bio_cmd = BIO_WRITE;
 		beio->ds_trans_type = DEVSTAT_WRITE;
 	} else {
 		beio->bio_cmd = BIO_READ;
 		beio->ds_trans_type = DEVSTAT_READ;
 	}
 
 	DPRINTF("%s at LBA %jx len %u @%ju\n",
 	       (beio->bio_cmd == BIO_READ) ? "READ" : "WRITE",
 	       (uintmax_t)lbalen->lba, lbalen->len, bptrlen->len);
 	/* Compare uses half of the segments for each of its two buffers. */
 	if (lbalen->flags & CTL_LLF_COMPARE)
 		lbas = CTLBLK_HALF_IO_SIZE;
 	else
 		lbas = CTLBLK_MAX_IO_SIZE;
 	/* Blocks in this chunk: what is left, capped by the size limit. */
 	lbas = MIN(lbalen->len - bptrlen->len, lbas / cbe_lun->blocksize);
 	beio->io_offset = (lbalen->lba + bptrlen->len) * cbe_lun->blocksize;
 	beio->io_len = lbas * cbe_lun->blocksize;
 	bptrlen->len += lbas;
 
 	for (i = 0, len_left = beio->io_len; len_left > 0; i++) {
 		KASSERT(i < CTLBLK_MAX_SEGS, ("Too many segs (%d >= %d)",
 		    i, CTLBLK_MAX_SEGS));
 
 		/*
 		 * Setup the S/G entry for this chunk.
 		 */
 		beio->sg_segs[i].len = min(CTLBLK_MAX_SEG, len_left);
 		beio->sg_segs[i].addr = uma_zalloc(be_lun->lun_zone, M_WAITOK);
 
 		DPRINTF("segment %d addr %p len %zd\n", i,
 			beio->sg_segs[i].addr, beio->sg_segs[i].len);
 
 		/* Set up second segment for compare operation. */
 		if (lbalen->flags & CTL_LLF_COMPARE) {
 			beio->sg_segs[i + CTLBLK_HALF_SEGS].len =
 			    beio->sg_segs[i].len;
 			beio->sg_segs[i + CTLBLK_HALF_SEGS].addr =
 			    uma_zalloc(be_lun->lun_zone, M_WAITOK);
 		}
 
 		beio->num_segs++;
 		len_left -= beio->sg_segs[i].len;
 	}
 	/* More blocks remain: come back through ctl_be_block_next(). */
 	if (bptrlen->len < lbalen->len)
 		beio->beio_cont = ctl_be_block_next;
 	io->scsiio.be_move_done = ctl_be_block_move_done;
 	/* For compare we have separate S/G lists for read and datamove. */
 	if (lbalen->flags & CTL_LLF_COMPARE)
 		io->scsiio.kern_data_ptr = (uint8_t *)&beio->sg_segs[CTLBLK_HALF_SEGS];
 	else
 		io->scsiio.kern_data_ptr = (uint8_t *)beio->sg_segs;
 	io->scsiio.kern_data_len = beio->io_len;
 	io->scsiio.kern_sg_entries = beio->num_segs;
 	io->io_hdr.flags |= CTL_FLAG_ALLOCATED;
 
 	/*
 	 * For the read case, we need to read the data into our buffers and
 	 * then we can send it back to the user.  For the write case, we
 	 * need to get the data from the user first.
 	 */
 	if (beio->bio_cmd == BIO_READ) {
 		SDT_PROBE0(cbb, , read, alloc_done);
 		be_lun->dispatch(be_lun, beio);
 	} else {
 		SDT_PROBE0(cbb, , write, alloc_done);
 #ifdef CTL_TIME_IO
 		getbinuptime(&io->io_hdr.dma_start_bt);
 #endif
 		ctl_datamove(io);
 	}
 }
 
 /*
  * Task queue handler for a LUN: drain its work queues.
  *
  * On every iteration the queues are checked in fixed order -- datamove,
  * config write, config read, input -- and the first I/O found is removed
  * under queue_lock and processed with the lock dropped.  The function
  * returns once all four queues are empty.
  *
  * context is the struct ctl_be_block_lun the task was set up for; pending
  * is not used.
  */
 static void
 ctl_be_block_worker(void *context, int pending)
 {
 	struct ctl_be_block_lun *be_lun = (struct ctl_be_block_lun *)context;
 	struct ctl_be_lun *cbe_lun = &be_lun->cbe_lun;
 	union ctl_io *io;
 	struct ctl_be_block_io *beio;
 
 	DPRINTF("entered\n");
 	/*
 	 * Fetch and process I/Os from all queues.  If we detect LUN
 	 * CTL_LUN_FLAG_NO_MEDIA status here -- it is result of a race,
 	 * so make response maximally opaque to not confuse initiator.
 	 *
 	 * After failing an I/O this way we must keep looping: several
 	 * enqueues of the task can be merged into a single invocation, so
 	 * returning early would leave the remaining queued I/Os waiting for
 	 * an unrelated later enqueue.
 	 */
 	for (;;) {
 		mtx_lock(&be_lun->queue_lock);
 		io = (union ctl_io *)STAILQ_FIRST(&be_lun->datamove_queue);
 		if (io != NULL) {
 			DPRINTF("datamove queue\n");
 			STAILQ_REMOVE_HEAD(&be_lun->datamove_queue, links);
 			mtx_unlock(&be_lun->queue_lock);
 			beio = (struct ctl_be_block_io *)PRIV(io)->ptr;
 			if (cbe_lun->flags & CTL_LUN_FLAG_NO_MEDIA) {
 				ctl_set_busy(&io->scsiio);
 				ctl_complete_beio(beio);
 				continue;
 			}
 			be_lun->dispatch(be_lun, beio);
 			continue;
 		}
 		io = (union ctl_io *)STAILQ_FIRST(&be_lun->config_write_queue);
 		if (io != NULL) {
 			DPRINTF("config write queue\n");
 			STAILQ_REMOVE_HEAD(&be_lun->config_write_queue, links);
 			mtx_unlock(&be_lun->queue_lock);
 			if (cbe_lun->flags & CTL_LUN_FLAG_NO_MEDIA) {
 				ctl_set_busy(&io->scsiio);
 				ctl_config_write_done(io);
 				continue;
 			}
 			ctl_be_block_cw_dispatch(be_lun, io);
 			continue;
 		}
 		io = (union ctl_io *)STAILQ_FIRST(&be_lun->config_read_queue);
 		if (io != NULL) {
 			DPRINTF("config read queue\n");
 			STAILQ_REMOVE_HEAD(&be_lun->config_read_queue, links);
 			mtx_unlock(&be_lun->queue_lock);
 			if (cbe_lun->flags & CTL_LUN_FLAG_NO_MEDIA) {
 				ctl_set_busy(&io->scsiio);
 				ctl_config_read_done(io);
 				continue;
 			}
 			ctl_be_block_cr_dispatch(be_lun, io);
 			continue;
 		}
 		io = (union ctl_io *)STAILQ_FIRST(&be_lun->input_queue);
 		if (io != NULL) {
 			DPRINTF("input queue\n");
 			STAILQ_REMOVE_HEAD(&be_lun->input_queue, links);
 			mtx_unlock(&be_lun->queue_lock);
 			if (cbe_lun->flags & CTL_LUN_FLAG_NO_MEDIA) {
 				ctl_set_busy(&io->scsiio);
 				ctl_data_submit_done(io);
 				continue;
 			}
 			ctl_be_block_dispatch(be_lun, io);
 			continue;
 		}
 
 		/*
 		 * If we get here, there is no work left in the queues, so
 		 * just break out and let the task queue go to sleep.
 		 */
 		mtx_unlock(&be_lun->queue_lock);
 		break;
 	}
 }
 
 /*
  * Data I/O entry point called by CTL.  Nothing is processed in the caller's
  * context: the request is appended to the LUN's input queue and the worker
  * task is scheduled to handle it.  Always returns CTL_RETVAL_COMPLETE.
  */
 static int
 ctl_be_block_submit(union ctl_io *io)
 {
 	struct ctl_be_block_lun *lun;
 
 	DPRINTF("entered\n");
 
 	/*
 	 * Make sure we only get SCSI I/O.
 	 */
 	KASSERT(io->io_hdr.io_type == CTL_IO_SCSI, ("Non-SCSI I/O (type "
 		"%#x) encountered", io->io_hdr.io_type));
 
 	lun = (struct ctl_be_block_lun *)CTL_BACKEND_LUN(io)->be_lun;
 
 	/* No blocks of this I/O have been processed yet. */
 	PRIV(io)->len = 0;
 
 	mtx_lock(&lun->queue_lock);
 	STAILQ_INSERT_TAIL(&lun->input_queue, &io->io_hdr, links);
 	mtx_unlock(&lun->queue_lock);
 	taskqueue_enqueue(lun->io_taskqueue, &lun->io_task);
 
 	return (CTL_RETVAL_COMPLETE);
 }
 
 /*
  * Backend ioctl handler.
  *
  * Only CTL_LUN_REQ is understood; it is routed to the create, remove or
  * modify handler according to the request type.  An unknown request type is
  * reported through the request's status and error string (the ioctl itself
  * still returns 0).  Any other command yields ENOTTY.
  */
 static int
 ctl_be_block_ioctl(struct cdev *dev, u_long cmd, caddr_t addr,
 			int flag, struct thread *td)
 {
 	struct ctl_be_block_softc *sc = &backend_block_softc;
 	struct ctl_lun_req *lun_req;
 
 	if (cmd != CTL_LUN_REQ)
 		return (ENOTTY);
 
 	lun_req = (struct ctl_lun_req *)addr;
 	switch (lun_req->reqtype) {
 	case CTL_LUNREQ_CREATE:
 		return (ctl_be_block_create(sc, lun_req));
 	case CTL_LUNREQ_RM:
 		return (ctl_be_block_rm(sc, lun_req));
 	case CTL_LUNREQ_MODIFY:
 		return (ctl_be_block_modify(sc, lun_req));
 	default:
 		break;
 	}
 
 	/* Unknown request type: flag it in the request, not in errno. */
 	lun_req->status = CTL_LUN_ERROR;
 	snprintf(lun_req->error_str, sizeof(lun_req->error_str),
 		 "invalid LUN request type %d",
 		 lun_req->reqtype);
 	return (0);
 }
 
 /*
  * Configure a LUN backed by a regular file (be_lun->vn).
  *
  * Installs the file-specific method pointers (no unmap support), takes a
  * reference on the calling thread's credentials, and derives LUN size,
  * logical block size and the physical/UNMAP block geometry from the create
  * parameters, the LUN options and the file's attributes.
  *
  * Returns 0 on success, the VOP_GETATTR() error, or EINVAL when the media
  * is smaller than one block; on failure req->error_str describes the
  * problem.
  */
 static int
 ctl_be_block_open_file(struct ctl_be_block_lun *be_lun, struct ctl_lun_req *req)
 {
 	struct ctl_be_lun *cbe_lun;
 	struct ctl_be_block_filedata *file_data;
 	struct ctl_lun_create_params *params;
 	const char		     *value;
 	struct vattr		      vattr;
 	off_t			      ps, pss, po, pos, us, uss, uo, uos;
 	int			      error;
 
 	cbe_lun = &be_lun->cbe_lun;
 	file_data = &be_lun->backend.file;
 	params = &be_lun->params;
 
 	be_lun->dev_type = CTL_BE_BLOCK_FILE;
 	be_lun->dispatch = ctl_be_block_dispatch_file;
 	be_lun->lun_flush = ctl_be_block_flush_file;
 	be_lun->get_lba_status = ctl_be_block_gls_file;
 	be_lun->getattr = ctl_be_block_getattr_file;
 	be_lun->unmap = NULL;
 	cbe_lun->flags &= ~CTL_LUN_FLAG_UNMAP;
 
 	error = VOP_GETATTR(be_lun->vn, &vattr, curthread->td_ucred);
 	if (error != 0) {
 		snprintf(req->error_str, sizeof(req->error_str),
 			 "error calling VOP_GETATTR() for file %s",
 			 be_lun->dev_path);
 		return (error);
 	}
 
 	file_data->cred = crhold(curthread->td_ucred);
 	/* An explicitly requested LUN size overrides the file size. */
 	if (params->lun_size_bytes != 0)
 		be_lun->size_bytes = params->lun_size_bytes;
 	else
 		be_lun->size_bytes = vattr.va_size;
 
 	/*
 	 * For files we can use any logical block size.  Prefer 512 bytes
 	 * for compatibility reasons.  If file's vattr.va_blocksize
 	 * (preferred I/O block size) is bigger and multiple to chosen
 	 * logical block size -- report it as physical block size.
 	 */
 	if (params->blocksize_bytes != 0)
 		cbe_lun->blocksize = params->blocksize_bytes;
 	else if (cbe_lun->lun_type == T_CDROM)
 		cbe_lun->blocksize = 2048;
 	else
 		cbe_lun->blocksize = 512;
 	be_lun->size_blocks = be_lun->size_bytes / cbe_lun->blocksize;
 	cbe_lun->maxlba = (be_lun->size_blocks == 0) ?
 	    0 : (be_lun->size_blocks - 1);
 
 	/* Physical (p*) and UNMAP (u*) block size/offset defaults. */
 	us = ps = vattr.va_blocksize;
 	uo = po = 0;
 
 	value = dnvlist_get_string(cbe_lun->options, "pblocksize", NULL);
 	if (value != NULL)
 		ctl_expand_number(value, &ps);
 	value = dnvlist_get_string(cbe_lun->options, "pblockoffset", NULL);
 	if (value != NULL)
 		ctl_expand_number(value, &po);
 	pss = ps / cbe_lun->blocksize;
 	pos = po / cbe_lun->blocksize;
 	/*
 	 * Only accept a size that is a power-of-two multiple of the logical
 	 * block size, with an offset that is a whole number of logical
 	 * blocks not exceeding it.
 	 */
 	if ((pss > 0) && (pss * cbe_lun->blocksize == ps) && (pss >= pos) &&
 	    ((pss & (pss - 1)) == 0) && (pos * cbe_lun->blocksize == po)) {
 		cbe_lun->pblockexp = fls(pss) - 1;
 		cbe_lun->pblockoff = (pss - pos) % pss;
 	}
 
 	value = dnvlist_get_string(cbe_lun->options, "ublocksize", NULL);
 	if (value != NULL)
 		ctl_expand_number(value, &us);
 	value = dnvlist_get_string(cbe_lun->options, "ublockoffset", NULL);
 	if (value != NULL)
 		ctl_expand_number(value, &uo);
 	uss = us / cbe_lun->blocksize;
 	uos = uo / cbe_lun->blocksize;
 	/* Same validity rules as for the physical block geometry above. */
 	if ((uss > 0) && (uss * cbe_lun->blocksize == us) && (uss >= uos) &&
 	    ((uss & (uss - 1)) == 0) && (uos * cbe_lun->blocksize == uo)) {
 		cbe_lun->ublockexp = fls(uss) - 1;
 		cbe_lun->ublockoff = (uss - uos) % uss;
 	}
 
 	/*
 	 * Sanity check.  The media size has to be at least one
 	 * sector long.
 	 */
 	if (be_lun->size_bytes < cbe_lun->blocksize) {
 		error = EINVAL;
 		snprintf(req->error_str, sizeof(req->error_str),
 			 "file %s size %ju < block size %u", be_lun->dev_path,
 			 (uintmax_t)be_lun->size_bytes, cbe_lun->blocksize);
 	}
 
 	cbe_lun->opttxferlen = CTLBLK_MAX_IO_SIZE / cbe_lun->blocksize;
 	return (error);
 }
 
 /*
  * Configure a LUN backed by a disk device (be_lun->vn).
  *
  * Selects the dispatch and get_lba_status methods depending on whether the
  * driver is named "zvol", then queries the device for sector size, media
  * size and stripe geometry and combines that with the create parameters and
  * LUN options to fill in block size, LUN size, physical/UNMAP block
  * geometry, atomic and optimal transfer lengths and the UNMAP flag.
  *
  * A thread reference on the device is held for the duration of the call and
  * dropped on every exit path.  Returns 0 on success or an errno value; most
  * failures also set req->error_str.
  */
 static int
 ctl_be_block_open_dev(struct ctl_be_block_lun *be_lun, struct ctl_lun_req *req)
 {
 	struct ctl_be_lun *cbe_lun = &be_lun->cbe_lun;
 	struct ctl_lun_create_params *params;
 	struct cdevsw		     *csw;
 	struct cdev		     *dev;
 	const char		     *value;
 	int			      error, atomic, maxio, ref, unmap, tmp;
 	off_t			      ps, pss, po, pos, us, uss, uo, uos, otmp;
 
 	params = &be_lun->params;
 
 	be_lun->dev_type = CTL_BE_BLOCK_DEV;
 	csw = devvn_refthread(be_lun->vn, &dev, &ref);
 	if (csw == NULL)
 		return (ENXIO);
 	if (strcmp(csw->d_name, "zvol") == 0) {
 		be_lun->dispatch = ctl_be_block_dispatch_zvol;
 		be_lun->get_lba_status = ctl_be_block_gls_zvol;
 		atomic = maxio = CTLBLK_MAX_IO_SIZE;
 	} else {
 		be_lun->dispatch = ctl_be_block_dispatch_dev;
 		be_lun->get_lba_status = NULL;
 		atomic = 0;
 		/* Clamp the device's limit to (0, CTLBLK_MAX_IO_SIZE]. */
 		maxio = dev->si_iosize_max;
 		if (maxio <= 0)
 			maxio = DFLTPHYS;
 		if (maxio > CTLBLK_MAX_IO_SIZE)
 			maxio = CTLBLK_MAX_IO_SIZE;
 	}
 	be_lun->lun_flush = ctl_be_block_flush_dev;
 	be_lun->getattr = ctl_be_block_getattr_dev;
 	be_lun->unmap = ctl_be_block_unmap_dev;
 
 	if (!csw->d_ioctl) {
 		dev_relthread(dev, ref);
 		snprintf(req->error_str, sizeof(req->error_str),
 			 "no d_ioctl for device %s!", be_lun->dev_path);
 		return (ENODEV);
 	}
 
 	/* tmp receives the device's sector size. */
 	error = csw->d_ioctl(dev, DIOCGSECTORSIZE, (caddr_t)&tmp, FREAD,
 			       curthread);
 	if (error) {
 		dev_relthread(dev, ref);
 		snprintf(req->error_str, sizeof(req->error_str),
 			 "error %d returned for DIOCGSECTORSIZE ioctl "
 			 "on %s!", error, be_lun->dev_path);
 		return (error);
 	}
 
 	/*
 	 * If the user has asked for a blocksize that is greater than the
 	 * backing device's blocksize, we can do it only if the blocksize
 	 * the user is asking for is an even multiple of the underlying
 	 * device's blocksize.
 	 */
 	if ((params->blocksize_bytes != 0) &&
 	    (params->blocksize_bytes >= tmp)) {
 		if (params->blocksize_bytes % tmp == 0) {
 			cbe_lun->blocksize = params->blocksize_bytes;
 		} else {
 			dev_relthread(dev, ref);
 			snprintf(req->error_str, sizeof(req->error_str),
 				 "requested blocksize %u is not an even "
 				 "multiple of backing device blocksize %u",
 				 params->blocksize_bytes, tmp);
 			return (EINVAL);
 		}
 	} else if (params->blocksize_bytes != 0) {
 		dev_relthread(dev, ref);
 		snprintf(req->error_str, sizeof(req->error_str),
 			 "requested blocksize %u < backing device "
 			 "blocksize %u", params->blocksize_bytes, tmp);
 		return (EINVAL);
 	} else if (cbe_lun->lun_type == T_CDROM)
 		cbe_lun->blocksize = MAX(tmp, 2048);
 	else
 		cbe_lun->blocksize = tmp;
 
 	/* otmp receives the device's media size in bytes. */
 	error = csw->d_ioctl(dev, DIOCGMEDIASIZE, (caddr_t)&otmp, FREAD,
 			     curthread);
 	if (error) {
 		dev_relthread(dev, ref);
 		snprintf(req->error_str, sizeof(req->error_str),
 			 "error %d returned for DIOCGMEDIASIZE "
 			 " ioctl on %s!", error,
 			 be_lun->dev_path);
 		return (error);
 	}
 
 	/* A requested LUN size may shrink the device but never grow it. */
 	if (params->lun_size_bytes != 0) {
 		if (params->lun_size_bytes > otmp) {
 			dev_relthread(dev, ref);
 			snprintf(req->error_str, sizeof(req->error_str),
 				 "requested LUN size %ju > backing device "
 				 "size %ju",
 				 (uintmax_t)params->lun_size_bytes,
 				 (uintmax_t)otmp);
 			return (EINVAL);
 		}
 
 		be_lun->size_bytes = params->lun_size_bytes;
 	} else
 		be_lun->size_bytes = otmp;
 	be_lun->size_blocks = be_lun->size_bytes / cbe_lun->blocksize;
 	cbe_lun->maxlba = (be_lun->size_blocks == 0) ?
 	    0 : (be_lun->size_blocks - 1);
 
 	/*
 	 * Stripe size/offset serve as defaults for both the physical (p*)
 	 * and the UNMAP (u*) block geometry; failures just mean "unknown".
 	 */
 	error = csw->d_ioctl(dev, DIOCGSTRIPESIZE, (caddr_t)&ps, FREAD,
 	    curthread);
 	if (error)
 		ps = po = 0;
 	else {
 		error = csw->d_ioctl(dev, DIOCGSTRIPEOFFSET, (caddr_t)&po,
 		    FREAD, curthread);
 		if (error)
 			po = 0;
 	}
 	us = ps;
 	uo = po;
 
 	value = dnvlist_get_string(cbe_lun->options, "pblocksize", NULL);
 	if (value != NULL)
 		ctl_expand_number(value, &ps);
 	value = dnvlist_get_string(cbe_lun->options, "pblockoffset", NULL);
 	if (value != NULL)
 		ctl_expand_number(value, &po);
 	pss = ps / cbe_lun->blocksize;
 	pos = po / cbe_lun->blocksize;
 	/*
 	 * Only accept a size that is a power-of-two multiple of the logical
 	 * block size, with an offset that is a whole number of logical
 	 * blocks not exceeding it.
 	 */
 	if ((pss > 0) && (pss * cbe_lun->blocksize == ps) && (pss >= pos) &&
 	    ((pss & (pss - 1)) == 0) && (pos * cbe_lun->blocksize == po)) {
 		cbe_lun->pblockexp = fls(pss) - 1;
 		cbe_lun->pblockoff = (pss - pos) % pss;
 	}
 
 	value = dnvlist_get_string(cbe_lun->options, "ublocksize", NULL);
 	if (value != NULL)
 		ctl_expand_number(value, &us);
 	value = dnvlist_get_string(cbe_lun->options, "ublockoffset", NULL);
 	if (value != NULL)
 		ctl_expand_number(value, &uo);
 	uss = us / cbe_lun->blocksize;
 	uos = uo / cbe_lun->blocksize;
 	/* Same validity rules as for the physical block geometry above. */
 	if ((uss > 0) && (uss * cbe_lun->blocksize == us) && (uss >= uos) &&
 	    ((uss & (uss - 1)) == 0) && (uos * cbe_lun->blocksize == uo)) {
 		cbe_lun->ublockexp = fls(uss) - 1;
 		cbe_lun->ublockoff = (uss - uos) % uss;
 	}
 
 	cbe_lun->atomicblock = atomic / cbe_lun->blocksize;
 	cbe_lun->opttxferlen = maxio / cbe_lun->blocksize;
 
 	/*
 	 * UNMAP support: taken for granted on zvols, otherwise asked from
 	 * the device via GEOM::candelete.  The "unmap" option overrides
 	 * either result.
 	 */
 	if (be_lun->dispatch == ctl_be_block_dispatch_zvol) {
 		unmap = 1;
 	} else {
 		struct diocgattr_arg	arg;
 
 		strlcpy(arg.name, "GEOM::candelete", sizeof(arg.name));
 		arg.len = sizeof(arg.value.i);
 		error = csw->d_ioctl(dev, DIOCGATTR, (caddr_t)&arg, FREAD,
 		    curthread);
 		unmap = (error == 0) ? arg.value.i : 0;
 	}
 	value = dnvlist_get_string(cbe_lun->options, "unmap", NULL);
 	if (value != NULL)
 		unmap = (strcmp(value, "on") == 0);
 	if (unmap)
 		cbe_lun->flags |= CTL_LUN_FLAG_UNMAP;
 	else
 		cbe_lun->flags &= ~CTL_LUN_FLAG_UNMAP;
 
 	dev_relthread(dev, ref);
 	return (0);
 }
 
 /*
  * Close the LUN's backing vnode, if one is open.
  *
  * The vnode is closed with the same access mode the LUN's READONLY flag
  * implies, file-backed LUNs additionally drop their credential reference,
  * and the backend type is reset to CTL_BE_BLOCK_NONE.  Always returns 0.
  */
 static int
 ctl_be_block_close(struct ctl_be_block_lun *be_lun)
 {
 	struct ctl_be_lun *cbe_lun = &be_lun->cbe_lun;
 	int mode;
 
 	/* Nothing open, nothing to do. */
 	if (be_lun->vn == NULL)
 		return (0);
 
 	mode = (cbe_lun->flags & CTL_LUN_FLAG_READONLY) != 0 ?
 	    FREAD : (FREAD | FWRITE);
 	(void)vn_close(be_lun->vn, mode, NOCRED, curthread);
 	be_lun->vn = NULL;
 
 	if (be_lun->dev_type == CTL_BE_BLOCK_FILE) {
 		if (be_lun->backend.file.cred != NULL) {
 			crfree(be_lun->backend.file.cred);
 			be_lun->backend.file.cred = NULL;
 		}
 	} else if (be_lun->dev_type != CTL_BE_BLOCK_DEV &&
 	    be_lun->dev_type != CTL_BE_BLOCK_NONE) {
 		panic("Unexpected backend type %d", be_lun->dev_type);
 	}
 	be_lun->dev_type = CTL_BE_BLOCK_NONE;
 
 	return (0);
 }
 
 /*
  * Open the backing store named by the LUN's "file" option and configure
  * the LUN for it.
  *
  * The path is opened read/write unless the "readonly" option is "on" (when
  * the option is absent, only T_DIRECT LUNs get write access).  If a writable
  * open fails with EROFS or EACCES it is retried read-only, and a path that
  * does not start with '/' is retried once with "/dev/" prepended.  Disks
  * are handed to ctl_be_block_open_dev(), regular files to
  * ctl_be_block_open_file(); anything else is rejected.
  *
  * Returns 1 when the root filesystem is not mounted or no "file" option is
  * given, or the vn_open() error.  Once the vnode has been opened the
  * function returns 0 even if the type-specific setup failed; in that case
  * the LUN has been closed again and req->error_str may describe the
  * failure.
  */
 static int
 ctl_be_block_open(struct ctl_be_block_lun *be_lun, struct ctl_lun_req *req)
 {
 	struct ctl_be_lun *cbe_lun = &be_lun->cbe_lun;
 	struct nameidata nd;
 	const char	*value;
 	int		 error, flags;
 
 	error = 0;
 	if (rootvnode == NULL) {
 		snprintf(req->error_str, sizeof(req->error_str),
 			 "Root filesystem is not mounted");
 		return (1);
 	}
 	pwd_ensure_dirs();
 
 	value = dnvlist_get_string(cbe_lun->options, "file", NULL);
 	if (value == NULL) {
 		snprintf(req->error_str, sizeof(req->error_str),
 			 "no file argument specified");
 		return (1);
 	}
 	/* Replace any previously stored path with the new one. */
 	free(be_lun->dev_path, M_CTLBLK);
 	be_lun->dev_path = strdup(value, M_CTLBLK);
 
 	flags = FREAD;
 	value = dnvlist_get_string(cbe_lun->options, "readonly", NULL);
 	if (value != NULL) {
 		if (strcmp(value, "on") != 0)
 			flags |= FWRITE;
 	} else if (cbe_lun->lun_type == T_DIRECT)
 		flags |= FWRITE;
 
 again:
 	NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, be_lun->dev_path, curthread);
 	error = vn_open(&nd, &flags, 0, NULL);
 	/* Fall back to a read-only open if writing is not permitted. */
 	if ((error == EROFS || error == EACCES) && (flags & FWRITE)) {
 		flags &= ~FWRITE;
 		goto again;
 	}
 	if (error) {
 		/*
 		 * This is the only reasonable guess we can make as far as
 		 * path if the user doesn't give us a fully qualified path.
 		 * If they want to specify a file, they need to specify the
 		 * full path.
 		 */
 		if (be_lun->dev_path[0] != '/') {
 			char *dev_name;
 
 			asprintf(&dev_name, M_CTLBLK, "/dev/%s",
 				be_lun->dev_path);
 			free(be_lun->dev_path, M_CTLBLK);
 			be_lun->dev_path = dev_name;
 			goto again;
 		}
 		snprintf(req->error_str, sizeof(req->error_str),
 		    "error opening %s: %d", be_lun->dev_path, error);
 		return (error);
 	}
 	/* Record whether we actually ended up with write access. */
 	if (flags & FWRITE)
 		cbe_lun->flags &= ~CTL_LUN_FLAG_READONLY;
 	else
 		cbe_lun->flags |= CTL_LUN_FLAG_READONLY;
 
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	be_lun->vn = nd.ni_vp;
 
 	/* We only support disks and files. */
 	if (vn_isdisk(be_lun->vn, &error)) {
 		error = ctl_be_block_open_dev(be_lun, req);
 	} else if (be_lun->vn->v_type == VREG) {
 		error = ctl_be_block_open_file(be_lun, req);
 	} else {
 		error = EINVAL;
 		snprintf(req->error_str, sizeof(req->error_str),
 			 "%s is not a disk or plain file", be_lun->dev_path);
 	}
 	VOP_UNLOCK(be_lun->vn, 0);
 
 	if (error != 0)
 		ctl_be_block_close(be_lun);
 	/*
 	 * Default serialization mode: off for the plain device dispatch,
 	 * "read" for every other dispatch; the "serseq" option overrides it.
 	 */
 	cbe_lun->serseq = CTL_LUN_SERSEQ_OFF;
 	if (be_lun->dispatch != ctl_be_block_dispatch_dev)
 		cbe_lun->serseq = CTL_LUN_SERSEQ_READ;
 	value = dnvlist_get_string(cbe_lun->options, "serseq", NULL);
 	if (value != NULL && strcmp(value, "on") == 0)
 		cbe_lun->serseq = CTL_LUN_SERSEQ_ON;
 	else if (value != NULL && strcmp(value, "read") == 0)
 		cbe_lun->serseq = CTL_LUN_SERSEQ_READ;
 	else if (value != NULL && strcmp(value, "off") == 0)
 		cbe_lun->serseq = CTL_LUN_SERSEQ_OFF;
 	return (0);
 }
 
 /*
  * Create a new block-backend LUN from the request in req->reqdata.create.
  *
  * Allocates and initializes the per-LUN state, opens the backing store for
  * direct-access and CD-ROM LUNs when this node is primary (or HA mode is
  * CTL_HA_MODE_SER_ONLY), starts the worker taskqueue, registers the LUN via
  * ctl_add_lun() and then sleeps until the CTL_BE_BLOCK_LUN_UNCONFIGURED flag
  * is cleared (see ctl_be_block_lun_config_status()).
  *
  * The outcome is reported through req->status and req->error_str.  On the
  * bailout path every resource acquired here is released and req->status is
  * set to CTL_LUN_ERROR.  The chosen size, blocksize, serial number, device
  * ID and LUN id are written back into the create parameters.
  */
 static int
 ctl_be_block_create(struct ctl_be_block_softc *softc, struct ctl_lun_req *req)
 {
 	struct ctl_be_lun *cbe_lun;
 	struct ctl_be_block_lun *be_lun;
 	struct ctl_lun_create_params *params;
 	char tmpstr[32];
 	const char *value;
 	int retval, num_threads;
 	int tmp_num_threads;
 
 	params = &req->reqdata.create;
 	retval = 0;
 	req->status = CTL_LUN_OK;
 
 	be_lun = malloc(sizeof(*be_lun), M_CTLBLK, M_ZERO | M_WAITOK);
 	cbe_lun = &be_lun->cbe_lun;
 	cbe_lun->be_lun = be_lun;
 	be_lun->params = req->reqdata.create;
 	be_lun->softc = softc;
 	STAILQ_INIT(&be_lun->input_queue);
 	STAILQ_INIT(&be_lun->config_read_queue);
 	STAILQ_INIT(&be_lun->config_write_queue);
 	STAILQ_INIT(&be_lun->datamove_queue);
 	sprintf(be_lun->lunname, "cblk%d", softc->num_luns);
 	mtx_init(&be_lun->io_lock, "cblk io lock", NULL, MTX_DEF);
 	mtx_init(&be_lun->queue_lock, "cblk queue lock", NULL, MTX_DEF);
 	cbe_lun->options = nvlist_clone(req->args_nvl);
 	be_lun->lun_zone = uma_zcreate(be_lun->lunname, CTLBLK_MAX_SEG,
 	    NULL, NULL, NULL, NULL, /*align*/ 0, /*flags*/0);
 	if (be_lun->lun_zone == NULL) {
 		snprintf(req->error_str, sizeof(req->error_str),
 			 "error allocating UMA zone");
 		goto bailout_error;
 	}
 
 	if (params->flags & CTL_LUN_FLAG_DEV_TYPE)
 		cbe_lun->lun_type = params->device_type;
 	else
 		cbe_lun->lun_type = T_DIRECT;
 	be_lun->flags = CTL_BE_BLOCK_LUN_UNCONFIGURED;
 	cbe_lun->flags = 0;
 	/* An explicit "ha_role" option overrides the shelf-wide default. */
 	value = dnvlist_get_string(cbe_lun->options, "ha_role", NULL);
 	if (value != NULL) {
 		if (strcmp(value, "primary") == 0)
 			cbe_lun->flags |= CTL_LUN_FLAG_PRIMARY;
 	} else if (control_softc->flags & CTL_FLAG_ACTIVE_SHELF)
 		cbe_lun->flags |= CTL_LUN_FLAG_PRIMARY;
 
 	if (cbe_lun->lun_type == T_DIRECT ||
 	    cbe_lun->lun_type == T_CDROM) {
 		be_lun->size_bytes = params->lun_size_bytes;
 		if (params->blocksize_bytes != 0)
 			cbe_lun->blocksize = params->blocksize_bytes;
 		else if (cbe_lun->lun_type == T_CDROM)
 			cbe_lun->blocksize = 2048;
 		else
 			cbe_lun->blocksize = 512;
 		be_lun->size_blocks = be_lun->size_bytes / cbe_lun->blocksize;
 		cbe_lun->maxlba = (be_lun->size_blocks == 0) ?
 		    0 : (be_lun->size_blocks - 1);
 
 		if ((cbe_lun->flags & CTL_LUN_FLAG_PRIMARY) ||
 		    control_softc->ha_mode == CTL_HA_MODE_SER_ONLY) {
 			/*
 			 * A failed open is not fatal: the LUN is still
 			 * created and the request is flagged as a warning.
 			 */
 			retval = ctl_be_block_open(be_lun, req);
 			if (retval != 0) {
 				retval = 0;
 				req->status = CTL_LUN_WARNING;
 			}
 		}
 		num_threads = cbb_num_threads;
 	} else {
 		num_threads = 1;
 	}
 
 	value = dnvlist_get_string(cbe_lun->options, "num_threads", NULL);
 	if (value != NULL) {
 		tmp_num_threads = strtol(value, NULL, 0);
 
 		/*
 		 * We don't let the user specify less than one
 		 * thread, but hope he's clueful enough not to
 		 * specify 1000 threads.
 		 */
 		if (tmp_num_threads < 1) {
 			/*
 			 * Report the option string the user supplied.  It
 			 * is still valid here because the options nvlist
 			 * is only destroyed on the bailout path below.
 			 */
 			snprintf(req->error_str, sizeof(req->error_str),
 				 "invalid number of threads %s",
 				 value);
 			goto bailout_error;
 		}
 		num_threads = tmp_num_threads;
 	}
 
 	if (be_lun->vn == NULL)
 		cbe_lun->flags |= CTL_LUN_FLAG_NO_MEDIA;
 	/* Tell the user the blocksize we ended up using */
 	params->lun_size_bytes = be_lun->size_bytes;
 	params->blocksize_bytes = cbe_lun->blocksize;
 	if (params->flags & CTL_LUN_FLAG_ID_REQ) {
 		cbe_lun->req_lun_id = params->req_lun_id;
 		cbe_lun->flags |= CTL_LUN_FLAG_ID_REQ;
 	} else
 		cbe_lun->req_lun_id = 0;
 
 	cbe_lun->lun_shutdown = ctl_be_block_lun_shutdown;
 	cbe_lun->lun_config_status = ctl_be_block_lun_config_status;
 	cbe_lun->be = &ctl_be_block_driver;
 
 	if ((params->flags & CTL_LUN_FLAG_SERIAL_NUM) == 0) {
 		snprintf(tmpstr, sizeof(tmpstr), "MYSERIAL%04d",
 			 softc->num_luns);
 		strncpy((char *)cbe_lun->serial_num, tmpstr,
 			MIN(sizeof(cbe_lun->serial_num), sizeof(tmpstr)));
 
 		/* Tell the user what we used for a serial number */
 		strncpy((char *)params->serial_num, tmpstr,
 			MIN(sizeof(params->serial_num), sizeof(tmpstr)));
 	} else {
 		strncpy((char *)cbe_lun->serial_num, params->serial_num,
 			MIN(sizeof(cbe_lun->serial_num),
 			sizeof(params->serial_num)));
 	}
 	if ((params->flags & CTL_LUN_FLAG_DEVID) == 0) {
 		snprintf(tmpstr, sizeof(tmpstr), "MYDEVID%04d", softc->num_luns);
 		strncpy((char *)cbe_lun->device_id, tmpstr,
 			MIN(sizeof(cbe_lun->device_id), sizeof(tmpstr)));
 
 		/* Tell the user what we used for a device ID */
 		strncpy((char *)params->device_id, tmpstr,
 			MIN(sizeof(params->device_id), sizeof(tmpstr)));
 	} else {
 		strncpy((char *)cbe_lun->device_id, params->device_id,
 			MIN(sizeof(cbe_lun->device_id),
 			    sizeof(params->device_id)));
 	}
 
 	TASK_INIT(&be_lun->io_task, /*priority*/0, ctl_be_block_worker, be_lun);
 
 	be_lun->io_taskqueue = taskqueue_create(be_lun->lunname, M_WAITOK,
 	    taskqueue_thread_enqueue, /*context*/&be_lun->io_taskqueue);
 
 	if (be_lun->io_taskqueue == NULL) {
 		snprintf(req->error_str, sizeof(req->error_str),
 			 "unable to create taskqueue");
 		goto bailout_error;
 	}
 
 	/*
 	 * Note that we start the same number of threads by default for
 	 * both the file case and the block device case.  For the file
 	 * case, we need multiple threads to allow concurrency, because the
 	 * vnode interface is designed to be a blocking interface.  For the
 	 * block device case, ZFS zvols at least will block the caller's
 	 * context in many instances, and so we need multiple threads to
 	 * overcome that problem.  Other block devices don't need as many
 	 * threads, but they shouldn't cause too many problems.
 	 *
 	 * If the user wants to just have a single thread for a block
 	 * device, he can specify that when the LUN is created, or change
 	 * the tunable/sysctl to alter the default number of threads.
 	 */
 	retval = taskqueue_start_threads(&be_lun->io_taskqueue,
 					 /*num threads*/num_threads,
 					 /*priority*/PUSER,
 					 /*thread name*/
 					 "%s taskq", be_lun->lunname);
 
 	if (retval != 0)
 		goto bailout_error;
 
 	be_lun->num_threads = num_threads;
 
 	mtx_lock(&softc->lock);
 	softc->num_luns++;
 	STAILQ_INSERT_TAIL(&softc->lun_list, be_lun, links);
 
 	mtx_unlock(&softc->lock);
 
 	retval = ctl_add_lun(&be_lun->cbe_lun);
 	if (retval != 0) {
 		/* Undo the list insertion done just above. */
 		mtx_lock(&softc->lock);
 		STAILQ_REMOVE(&softc->lun_list, be_lun, ctl_be_block_lun,
 			      links);
 		softc->num_luns--;
 		mtx_unlock(&softc->lock);
 		snprintf(req->error_str, sizeof(req->error_str),
 			 "ctl_add_lun() returned error %d, see dmesg for "
 			 "details", retval);
 		retval = 0;
 		goto bailout_error;
 	}
 
 	mtx_lock(&softc->lock);
 
 	/*
 	 * Tell the config_status routine that we're waiting so it won't
 	 * clean up the LUN in the event of an error.
 	 */
 	be_lun->flags |= CTL_BE_BLOCK_LUN_WAITING;
 
 	while (be_lun->flags & CTL_BE_BLOCK_LUN_UNCONFIGURED) {
 		retval = msleep(be_lun, &softc->lock, PCATCH, "ctlblk", 0);
 		if (retval == EINTR)
 			break;
 	}
 	be_lun->flags &= ~CTL_BE_BLOCK_LUN_WAITING;
 
 	if (be_lun->flags & CTL_BE_BLOCK_LUN_CONFIG_ERR) {
 		snprintf(req->error_str, sizeof(req->error_str),
 			 "LUN configuration error, see dmesg for details");
 		STAILQ_REMOVE(&softc->lun_list, be_lun, ctl_be_block_lun,
 			      links);
 		softc->num_luns--;
 		mtx_unlock(&softc->lock);
 		goto bailout_error;
 	} else {
 		params->req_lun_id = cbe_lun->lun_id;
 	}
 
 	mtx_unlock(&softc->lock);
 
 	be_lun->disk_stats = devstat_new_entry("cbb", params->req_lun_id,
 					       cbe_lun->blocksize,
 					       DEVSTAT_ALL_SUPPORTED,
 					       cbe_lun->lun_type
 					       | DEVSTAT_TYPE_IF_OTHER,
 					       DEVSTAT_PRIORITY_OTHER);
 
 	return (retval);
 
 bailout_error:
 	/* Release everything acquired above, in reverse order of setup. */
 	req->status = CTL_LUN_ERROR;
 
 	if (be_lun->io_taskqueue != NULL)
 		taskqueue_free(be_lun->io_taskqueue);
 	ctl_be_block_close(be_lun);
 	if (be_lun->dev_path != NULL)
 		free(be_lun->dev_path, M_CTLBLK);
 	if (be_lun->lun_zone != NULL)
 		uma_zdestroy(be_lun->lun_zone);
 	nvlist_destroy(cbe_lun->options);
 	mtx_destroy(&be_lun->queue_lock);
 	mtx_destroy(&be_lun->io_lock);
 	free(be_lun, M_CTLBLK);
 
 	return (retval);
 }
 
 /*
  * Remove the block-backend LUN whose id is req->reqdata.rm.lun_id.
  *
  * The LUN is disabled, its backing store is closed if one is open, and it
  * is invalidated.  The function then sleeps until the
  * CTL_BE_BLOCK_LUN_UNCONFIGURED flag is set (ctl_be_block_lun_shutdown()
  * sets it) before unlinking the LUN and freeing its resources.
  *
  * Always returns 0; success or failure is reported through req->status
  * (CTL_LUN_OK / CTL_LUN_ERROR) and req->error_str.
  */
 static int
 ctl_be_block_rm(struct ctl_be_block_softc *softc, struct ctl_lun_req *req)
 {
 	struct ctl_lun_rm_params *params;
 	struct ctl_be_block_lun *be_lun;
 	struct ctl_be_lun *cbe_lun;
 	int retval;
 
 	params = &req->reqdata.rm;
 
 	/* Find the LUN by id; be_lun is NULL if the loop runs to the end. */
 	mtx_lock(&softc->lock);
 	STAILQ_FOREACH(be_lun, &softc->lun_list, links) {
 		if (be_lun->cbe_lun.lun_id == params->lun_id)
 			break;
 	}
 	mtx_unlock(&softc->lock);
 	if (be_lun == NULL) {
 		snprintf(req->error_str, sizeof(req->error_str),
 			 "LUN %u is not managed by the block backend",
 			 params->lun_id);
 		goto bailout_error;
 	}
 	cbe_lun = &be_lun->cbe_lun;
 
 	retval = ctl_disable_lun(cbe_lun);
 	if (retval != 0) {
 		snprintf(req->error_str, sizeof(req->error_str),
 			 "error %d returned from ctl_disable_lun() for "
 			 "LUN %d", retval, params->lun_id);
 		goto bailout_error;
 	}
 
 	/*
 	 * If a backing vnode is open: report no media, drain queued work,
 	 * then close it.
 	 */
 	if (be_lun->vn != NULL) {
 		cbe_lun->flags |= CTL_LUN_FLAG_NO_MEDIA;
 		ctl_lun_no_media(cbe_lun);
 		taskqueue_drain_all(be_lun->io_taskqueue);
 		ctl_be_block_close(be_lun);
 	}
 
 	retval = ctl_invalidate_lun(cbe_lun);
 	if (retval != 0) {
 		snprintf(req->error_str, sizeof(req->error_str),
 			 "error %d returned from ctl_invalidate_lun() for "
 			 "LUN %d", retval, params->lun_id);
 		goto bailout_error;
 	}
 
 	/* Wait (interruptibly) for the UNCONFIGURED flag to be set. */
 	mtx_lock(&softc->lock);
 	be_lun->flags |= CTL_BE_BLOCK_LUN_WAITING;
 	while ((be_lun->flags & CTL_BE_BLOCK_LUN_UNCONFIGURED) == 0) {
                 retval = msleep(be_lun, &softc->lock, PCATCH, "ctlblk", 0);
                 if (retval == EINTR)
                         break;
         }
 	be_lun->flags &= ~CTL_BE_BLOCK_LUN_WAITING;
 
 	/* Still configured means the sleep was interrupted; keep the LUN. */
 	if ((be_lun->flags & CTL_BE_BLOCK_LUN_UNCONFIGURED) == 0) {
 		snprintf(req->error_str, sizeof(req->error_str),
 			 "interrupted waiting for LUN to be freed");
 		mtx_unlock(&softc->lock);
 		goto bailout_error;
 	}
 
 	STAILQ_REMOVE(&softc->lun_list, be_lun, ctl_be_block_lun, links);
 
 	softc->num_luns--;
 	mtx_unlock(&softc->lock);
 
 	/* The LUN is unlinked; tear down its remaining resources. */
 	taskqueue_drain_all(be_lun->io_taskqueue);
 	taskqueue_free(be_lun->io_taskqueue);
 
 	if (be_lun->disk_stats != NULL)
 		devstat_remove_entry(be_lun->disk_stats);
 
 	uma_zdestroy(be_lun->lun_zone);
 
 	nvlist_destroy(cbe_lun->options);
 	free(be_lun->dev_path, M_CTLBLK);
 	mtx_destroy(&be_lun->queue_lock);
 	mtx_destroy(&be_lun->io_lock);
 	free(be_lun, M_CTLBLK);
 
 	req->status = CTL_LUN_OK;
 	return (0);
 
 bailout_error:
 	req->status = CTL_LUN_ERROR;
 	return (0);
 }
 
 /*
  * Apply a modify request to the LUN whose id is req->reqdata.modify.lun_id.
  *
  * Replaces the LUN's option nvlist with a clone of req->args_nvl,
  * re-evaluates the primary/secondary HA role, and then either (re)opens
  * the backing store or closes it depending on that role and the HA mode.
  * CTL is notified of role, media and capacity changes.
  *
  * Always returns 0.  req->status is CTL_LUN_ERROR if the LUN is not found,
  * CTL_LUN_WARNING if the open/close step returned an error, and CTL_LUN_OK
  * otherwise.  The resulting size is written back to params->lun_size_bytes.
  */
 static int
 ctl_be_block_modify(struct ctl_be_block_softc *softc, struct ctl_lun_req *req)
 {
 	struct ctl_lun_modify_params *params;
 	struct ctl_be_block_lun *be_lun;
 	struct ctl_be_lun *cbe_lun;
 	const char *value;
 	uint64_t oldsize;
 	int error, wasprim;
 
 	params = &req->reqdata.modify;
 
 	/* Find the LUN by id; be_lun is NULL if the loop runs to the end. */
 	mtx_lock(&softc->lock);
 	STAILQ_FOREACH(be_lun, &softc->lun_list, links) {
 		if (be_lun->cbe_lun.lun_id == params->lun_id)
 			break;
 	}
 	mtx_unlock(&softc->lock);
 	if (be_lun == NULL) {
 		snprintf(req->error_str, sizeof(req->error_str),
 			 "LUN %u is not managed by the block backend",
 			 params->lun_id);
 		goto bailout_error;
 	}
 	cbe_lun = &be_lun->cbe_lun;
 
 	/* A size of zero in the request leaves the stored size unchanged. */
 	if (params->lun_size_bytes != 0)
 		be_lun->params.lun_size_bytes = params->lun_size_bytes;
 
 	nvlist_destroy(cbe_lun->options);
 	cbe_lun->options = nvlist_clone(req->args_nvl);
 
 	/*
 	 * Recompute the primary flag from "ha_role" (or the shelf default)
 	 * and notify CTL only if it actually changed.
 	 */
 	wasprim = (cbe_lun->flags & CTL_LUN_FLAG_PRIMARY);
 	value = dnvlist_get_string(cbe_lun->options, "ha_role", NULL);
 	if (value != NULL) {
 		if (strcmp(value, "primary") == 0)
 			cbe_lun->flags |= CTL_LUN_FLAG_PRIMARY;
 		else
 			cbe_lun->flags &= ~CTL_LUN_FLAG_PRIMARY;
 	} else if (control_softc->flags & CTL_FLAG_ACTIVE_SHELF)
 		cbe_lun->flags |= CTL_LUN_FLAG_PRIMARY;
 	else
 		cbe_lun->flags &= ~CTL_LUN_FLAG_PRIMARY;
 	if (wasprim != (cbe_lun->flags & CTL_LUN_FLAG_PRIMARY)) {
 		if (cbe_lun->flags & CTL_LUN_FLAG_PRIMARY)
 			ctl_lun_primary(cbe_lun);
 		else
 			ctl_lun_secondary(cbe_lun);
 	}
 
 	oldsize = be_lun->size_blocks;
 	if ((cbe_lun->flags & CTL_LUN_FLAG_PRIMARY) ||
 	    control_softc->ha_mode == CTL_HA_MODE_SER_ONLY) {
 		/*
 		 * Open the backing store if it is not open yet; otherwise
 		 * rerun the type-specific open routine on the existing
 		 * vnode.
 		 */
 		if (be_lun->vn == NULL)
 			error = ctl_be_block_open(be_lun, req);
 		else if (vn_isdisk(be_lun->vn, &error))
 			error = ctl_be_block_open_dev(be_lun, req);
 		else if (be_lun->vn->v_type == VREG) {
 			vn_lock(be_lun->vn, LK_SHARED | LK_RETRY);
 			error = ctl_be_block_open_file(be_lun, req);
 			VOP_UNLOCK(be_lun->vn, 0);
 		} else
 			error = EINVAL;
 		/* Bring the NO_MEDIA flag in line with whether a vnode exists. */
 		if ((cbe_lun->flags & CTL_LUN_FLAG_NO_MEDIA) &&
 		    be_lun->vn != NULL) {
 			cbe_lun->flags &= ~CTL_LUN_FLAG_NO_MEDIA;
 			ctl_lun_has_media(cbe_lun);
 		} else if ((cbe_lun->flags & CTL_LUN_FLAG_NO_MEDIA) == 0 &&
 		    be_lun->vn == NULL) {
 			cbe_lun->flags |= CTL_LUN_FLAG_NO_MEDIA;
 			ctl_lun_no_media(cbe_lun);
 		}
 		cbe_lun->flags &= ~CTL_LUN_FLAG_EJECTED;
 	} else {
 		/* Not primary: report no media, drain work, and close. */
 		if (be_lun->vn != NULL) {
 			cbe_lun->flags |= CTL_LUN_FLAG_NO_MEDIA;
 			ctl_lun_no_media(cbe_lun);
 			taskqueue_drain_all(be_lun->io_taskqueue);
 			error = ctl_be_block_close(be_lun);
 		} else
 			error = 0;
 	}
 	if (be_lun->size_blocks != oldsize)
 		ctl_lun_capacity_changed(cbe_lun);
 
 	/* Tell the user the exact size we ended up using */
 	params->lun_size_bytes = be_lun->size_bytes;
 
 	req->status = error ? CTL_LUN_WARNING : CTL_LUN_OK;
 	return (0);
 
 bailout_error:
 	req->status = CTL_LUN_ERROR;
 	return (0);
 }
 
 /*
  * LUN shutdown callback (installed as cbe_lun->lun_shutdown).  Marks the
  * LUN as unconfigured and, if a thread has flagged itself as waiting,
  * wakes it up.  All flag updates happen under the softc lock.
  */
 static void
 ctl_be_block_lun_shutdown(void *be_lun)
 {
 	struct ctl_be_block_lun *blk_lun;
 	struct ctl_be_block_softc *sc;
 
 	blk_lun = (struct ctl_be_block_lun *)be_lun;
 	sc = blk_lun->softc;
 
 	mtx_lock(&sc->lock);
 	blk_lun->flags |= CTL_BE_BLOCK_LUN_UNCONFIGURED;
 	if ((blk_lun->flags & CTL_BE_BLOCK_LUN_WAITING) != 0)
 		wakeup(blk_lun);
 	mtx_unlock(&sc->lock);
 }
 
 /*
  * LUN configuration callback (installed as cbe_lun->lun_config_status).
  *
  * On any status other than CTL_LUN_CONFIG_OK the LUN is flagged with
  * CTL_BE_BLOCK_LUN_CONFIG_ERR and sleepers are woken unconditionally.
  * On success the UNCONFIGURED flag is cleared, a flagged waiter is woken,
  * and the LUN is enabled; if enabling fails the LUN is invalidated.
  */
 static void
 ctl_be_block_lun_config_status(void *be_lun, ctl_lun_config_status status)
 {
 	struct ctl_be_block_lun *blk_lun = be_lun;
 	struct ctl_be_block_softc *sc = blk_lun->softc;
 
 	if (status != CTL_LUN_CONFIG_OK) {
 		/* Configuration failed: record the error and wake sleepers. */
 		mtx_lock(&sc->lock);
 		blk_lun->flags &= ~CTL_BE_BLOCK_LUN_UNCONFIGURED;
 		blk_lun->flags |= CTL_BE_BLOCK_LUN_CONFIG_ERR;
 		wakeup(blk_lun);
 		mtx_unlock(&sc->lock);
 		return;
 	}
 
 	mtx_lock(&sc->lock);
 	blk_lun->flags &= ~CTL_BE_BLOCK_LUN_UNCONFIGURED;
 	if ((blk_lun->flags & CTL_BE_BLOCK_LUN_WAITING) != 0)
 		wakeup(blk_lun);
 	mtx_unlock(&sc->lock);
 
 	/* The LUN was added successfully, so try to enable it. */
 	if (ctl_enable_lun(&blk_lun->cbe_lun) == 0)
 		return;
 
 	printf("%s: ctl_enable_lun() failed!\n", __func__);
 	if (ctl_invalidate_lun(&blk_lun->cbe_lun) != 0)
 		printf("%s: ctl_invalidate_lun() failed!\n", __func__);
 }
 
 
 /*
  * Handle a configuration-write style SCSI command for a block-backend LUN,
  * dispatching on the opcode in io->scsiio.cdb[0].
  *
  * Cache sync, WRITE SAME and UNMAP are queued on config_write_queue for
  * the worker taskqueue.  START STOP UNIT and PREVENT ALLOW are completed
  * inline.  Unknown opcodes are completed with an invalid-opcode status.
  *
  * Returns retval: 0 unless ctl_be_block_open() returned an error on the
  * START+LOEJ path, or CTL_RETVAL_COMPLETE for an unknown opcode.
  */
 static int
 ctl_be_block_config_write(union ctl_io *io)
 {
 	struct ctl_be_block_lun *be_lun;
 	struct ctl_be_lun *cbe_lun;
 	int retval;
 
 	DPRINTF("entered\n");
 
 	cbe_lun = CTL_BACKEND_LUN(io);
 	be_lun = (struct ctl_be_block_lun *)cbe_lun->be_lun;
 
 	retval = 0;
 	switch (io->scsiio.cdb[0]) {
 	case SYNCHRONIZE_CACHE:
 	case SYNCHRONIZE_CACHE_16:
 	case WRITE_SAME_10:
 	case WRITE_SAME_16:
 	case UNMAP:
 		/*
 		 * The upper level CTL code will filter out any CDBs with
 		 * the immediate bit set and return the proper error.
 		 *
 		 * We don't really need to worry about what LBA range the
 		 * user asked to be synced out.  When they issue a sync
 		 * cache command, we'll sync out the whole thing.
 		 */
 		mtx_lock(&be_lun->queue_lock);
 		STAILQ_INSERT_TAIL(&be_lun->config_write_queue, &io->io_hdr,
 				   links);
 		mtx_unlock(&be_lun->queue_lock);
 		taskqueue_enqueue(be_lun->io_taskqueue, &be_lun->io_task);
 		break;
 	case START_STOP_UNIT: {
 		struct scsi_start_stop_unit *cdb;
 		/* Scratch request handed to ctl_be_block_open() below. */
 		struct ctl_lun_req req;
 
 		cdb = (struct scsi_start_stop_unit *)io->scsiio.cdb;
 		/* Any power-condition bits set: complete successfully, do nothing. */
 		if ((cdb->how & SSS_PC_MASK) != 0) {
 			ctl_set_success(&io->scsiio);
 			ctl_config_write_done(io);
 			break;
 		}
 		if (cdb->how & SSS_START) {
 			/* Start with LOEJ and no vnode open: try to open the backing store. */
 			if ((cdb->how & SSS_LOEJ) && be_lun->vn == NULL) {
 				retval = ctl_be_block_open(be_lun, &req);
 				cbe_lun->flags &= ~CTL_LUN_FLAG_EJECTED;
 				if (retval == 0) {
 					cbe_lun->flags &= ~CTL_LUN_FLAG_NO_MEDIA;
 					ctl_lun_has_media(cbe_lun);
 				} else {
 					cbe_lun->flags |= CTL_LUN_FLAG_NO_MEDIA;
 					ctl_lun_no_media(cbe_lun);
 				}
 			}
 			ctl_start_lun(cbe_lun);
 		} else {
 			ctl_stop_lun(cbe_lun);
 			/* Stop with LOEJ: mark ejected and close any open backing store. */
 			if (cdb->how & SSS_LOEJ) {
 				cbe_lun->flags |= CTL_LUN_FLAG_NO_MEDIA;
 				cbe_lun->flags |= CTL_LUN_FLAG_EJECTED;
 				ctl_lun_ejected(cbe_lun);
 				if (be_lun->vn != NULL)
 					ctl_be_block_close(be_lun);
 			}
 		}
 
 		ctl_set_success(&io->scsiio);
 		ctl_config_write_done(io);
 		break;
 	}
 	case PREVENT_ALLOW:
 		/* Accepted and completed successfully without further action. */
 		ctl_set_success(&io->scsiio);
 		ctl_config_write_done(io);
 		break;
 	default:
 		ctl_set_invalid_opcode(&io->scsiio);
 		ctl_config_write_done(io);
 		retval = CTL_RETVAL_COMPLETE;
 		break;
 	}
 
 	return (retval);
 }
 
 /*
  * Handle a configuration-read style SCSI command for a block-backend LUN.
  *
  * Only SERVICE ACTION IN with the SGLS service action is supported; it is
  * queued on config_read_queue for the worker taskqueue and
  * CTL_RETVAL_QUEUED is returned.  Any other service action is completed
  * with an invalid-field status, and any other opcode with an
  * invalid-opcode status; both return CTL_RETVAL_COMPLETE.
  */
 static int
 ctl_be_block_config_read(union ctl_io *io)
 {
 	struct ctl_be_lun *cbe_lun;
 	struct ctl_be_block_lun *be_lun;
 
 	DPRINTF("entered\n");
 
 	cbe_lun = CTL_BACKEND_LUN(io);
 	be_lun = (struct ctl_be_block_lun *)cbe_lun->be_lun;
 
 	/* Reject every opcode other than SERVICE ACTION IN. */
 	if (io->scsiio.cdb[0] != SERVICE_ACTION_IN) {
 		ctl_set_invalid_opcode(&io->scsiio);
 		ctl_config_read_done(io);
 		return (CTL_RETVAL_COMPLETE);
 	}
 
 	/* Reject every service action other than SGLS. */
 	if (io->scsiio.cdb[1] != SGLS_SERVICE_ACTION) {
 		ctl_set_invalid_field(&io->scsiio,
 				      /*sks_valid*/ 1,
 				      /*command*/ 1,
 				      /*field*/ 1,
 				      /*bit_valid*/ 1,
 				      /*bit*/ 4);
 		ctl_config_read_done(io);
 		return (CTL_RETVAL_COMPLETE);
 	}
 
 	/* Queue the request for the worker task. */
 	mtx_lock(&be_lun->queue_lock);
 	STAILQ_INSERT_TAIL(&be_lun->config_read_queue, &io->io_hdr, links);
 	mtx_unlock(&be_lun->queue_lock);
 	taskqueue_enqueue(be_lun->io_taskqueue, &be_lun->io_task);
 	return (CTL_RETVAL_QUEUED);
 }
 
 static int
 ctl_be_block_lun_info(void *be_lun, struct sbuf *sb)
 {
 	struct ctl_be_block_lun *lun;
 	int retval;
 
 	lun = (struct ctl_be_block_lun *)be_lun;
 
 	retval = sbuf_printf(sb, "\t<num_threads>");
 	if (retval != 0)
 		goto bailout;
 	retval = sbuf_printf(sb, "%d", lun->num_threads);
 	if (retval != 0)
 		goto bailout;
 	retval = sbuf_printf(sb, "</num_threads>\n");
 
 bailout:
 	return (retval);
 }
 
 /*
  * Look up the attribute named attrname through the LUN's getattr hook.
  * Returns UINT64_MAX when the LUN has no getattr hook installed.
  */
 static uint64_t
 ctl_be_block_lun_attr(void *be_lun, const char *attrname)
 {
 	struct ctl_be_block_lun *blk_lun;
 
 	blk_lun = (struct ctl_be_block_lun *)be_lun;
 	if (blk_lun->getattr != NULL)
 		return (blk_lun->getattr(blk_lun, attrname));
 	return (UINT64_MAX);
 }
 
 /*
  * Initialize the block backend's global state: the (empty) LUN list, the
  * softc mutex and the UMA zone used for struct ctl_be_block_io.
  * Always returns 0.
  */
 static int
 ctl_be_block_init(void)
 {
 	struct ctl_be_block_softc *sc;
 
 	sc = &backend_block_softc;
 
 	STAILQ_INIT(&sc->lun_list);
 	mtx_init(&sc->lock, "ctlblock", NULL, MTX_DEF);
 	sc->beio_zone = uma_zcreate("beio", sizeof(struct ctl_be_block_io),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 
 	return (0);
 }
 
 
 static int
 ctl_be_block_shutdown(void)
 {
 	struct ctl_be_block_softc *softc = &backend_block_softc;
 	struct ctl_be_block_lun *lun, *next_lun;
 
 	mtx_lock(&softc->lock);
 	STAILQ_FOREACH_SAFE(lun, &softc->lun_list, links, next_lun) {
 		/*
 		 * Drop our lock here.  Since ctl_invalidate_lun() can call
 		 * back into us, this could potentially lead to a recursive
 		 * lock of the same mutex, which would cause a hang.
 		 */
 		mtx_unlock(&softc->lock);
 		ctl_disable_lun(&lun->cbe_lun);
 		ctl_invalidate_lun(&lun->cbe_lun);
 		mtx_lock(&softc->lock);
 	}
 	mtx_unlock(&softc->lock);
 
 	uma_zdestroy(softc->beio_zone);
 	mtx_destroy(&softc->lock);
 	return (0);
 }
Index: head/sys/dev/md/md.c
===================================================================
--- head/sys/dev/md/md.c	(revision 356199)
+++ head/sys/dev/md/md.c	(revision 356200)
@@ -1,2163 +1,2157 @@
 /*-
  * SPDX-License-Identifier: (Beerware AND BSD-3-Clause)
  *
  * ----------------------------------------------------------------------------
  * "THE BEER-WARE LICENSE" (Revision 42):
  * <phk@FreeBSD.ORG> wrote this file.  As long as you retain this notice you
  * can do whatever you want with this stuff. If we meet some day, and you think
  * this stuff is worth it, you can buy me a beer in return.   Poul-Henning Kamp
  * ----------------------------------------------------------------------------
  *
  * $FreeBSD$
  *
  */
 
 /*-
  * The following functions are based on the vn(4) driver: mdstart_swap(),
  * mdstart_vnode(), mdcreate_swap(), mdcreate_vnode() and mddestroy(),
  * and as such under the following copyright:
  *
  * Copyright (c) 1988 University of Utah.
  * Copyright (c) 1990, 1993
  *	The Regents of the University of California.  All rights reserved.
  * Copyright (c) 2013 The FreeBSD Foundation
  * All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * the Systems Programming Group of the University of Utah Computer
  * Science Department.
  *
  * Portions of this software were developed by Konstantin Belousov
  * under sponsorship from the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * from: Utah Hdr: vn.c 1.13 94/04/02
  *
  *	from: @(#)vn.c	8.6 (Berkeley) 4/1/94
  * From: src/sys/dev/vn/vn.c,v 1.122 2000/12/16 16:06:03
  */
 
 #include "opt_rootdevname.h"
 #include "opt_geom.h"
 #include "opt_md.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/conf.h>
 #include <sys/devicestat.h>
 #include <sys/fcntl.h>
 #include <sys/kernel.h>
 #include <sys/kthread.h>
 #include <sys/limits.h>
 #include <sys/linker.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mdioctl.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/sx.h>
 #include <sys/namei.h>
 #include <sys/proc.h>
 #include <sys/queue.h>
 #include <sys/rwlock.h>
 #include <sys/sbuf.h>
 #include <sys/sched.h>
 #include <sys/sf_buf.h>
 #include <sys/sysctl.h>
 #include <sys/uio.h>
 #include <sys/vnode.h>
 #include <sys/disk.h>
 
 #include <geom/geom.h>
 #include <geom/geom_int.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pager.h>
 #include <vm/swap_pager.h>
 #include <vm/uma.h>
 
 #include <machine/bus.h>
 
 #define MD_MODVER 1
 
 /*
  * Driver-private state bits.
  * NOTE(review): presumably stored in md_s.flags alongside the option bits
  * such as MD_READONLY -- confirm against the users of these macros.
  */
 #define MD_SHUTDOWN	0x10000		/* Tell worker thread to terminate. */
 #define	MD_EXITING	0x20000		/* Worker thread is exiting. */
 #define MD_PROVIDERGONE	0x40000		/* Safe to free the softc */
 
 /* Fallback value for MD_NSECT when it has not been defined already. */
 #ifndef MD_NSECT
 #define MD_NSECT (10000 * 2)
 #endif
 
 /*
  * Description of one md request.  md_file_seg records which address space
  * md_file points into; per the field comments, md_label is a userspace
  * pointer and md_units a kernel pointer.
  */
 struct md_req {
 	unsigned	md_unit;	/* unit number */
 	enum md_types	md_type;	/* type of disk */
 	off_t		md_mediasize;	/* size of disk in bytes */
 	unsigned	md_sectorsize;	/* sectorsize */
 	unsigned	md_options;	/* options */
 	int		md_fwheads;	/* firmware heads */
 	int		md_fwsectors;	/* firmware sectors */
 	char		*md_file;	/* pathname of file to mount */
 	enum uio_seg	md_file_seg;	/* location of md_file */
 	char		*md_label;	/* label of the device (userspace) */
 	int		*md_units;	/* pointer to units array (kernel) */
 	size_t		md_units_nitems; /* items in md_units array */
 };
 
 #ifdef COMPAT_FREEBSD32
 /*
  * Packed ioctl argument layout in which md_file and md_label are 32-bit
  * values rather than pointers.  The CTASSERT below pins the structure size
  * to 436 bytes.
  * NOTE(review): presumably mirrors struct md_ioctl as seen by 32-bit
  * userland -- confirm against sys/mdioctl.h.
  */
 struct md_ioctl32 {
 	unsigned	md_version;
 	unsigned	md_unit;
 	enum md_types	md_type;
 	uint32_t	md_file;
 	off_t		md_mediasize;
 	unsigned	md_sectorsize;
 	unsigned	md_options;
 	uint64_t	md_base;
 	int		md_fwheads;
 	int		md_fwsectors;
 	uint32_t	md_label;
 	int		md_pad[MDNPAD];
 } __attribute__((__packed__));
 CTASSERT((sizeof(struct md_ioctl32)) == 436);
 
 /* The regular ioctl commands re-typed for struct md_ioctl32. */
 #define	MDIOCATTACH_32	_IOC_NEWTYPE(MDIOCATTACH, struct md_ioctl32)
 #define	MDIOCDETACH_32	_IOC_NEWTYPE(MDIOCDETACH, struct md_ioctl32)
 #define	MDIOCQUERY_32	_IOC_NEWTYPE(MDIOCQUERY, struct md_ioctl32)
 #define	MDIOCRESIZE_32	_IOC_NEWTYPE(MDIOCRESIZE, struct md_ioctl32)
 #endif /* COMPAT_FREEBSD32 */
 
 /* malloc(9) types: M_MD for bookkeeping structures, M_MDSECT for indir arrays. */
 static MALLOC_DEFINE(M_MD, "md_disk", "Memory Disk");
 static MALLOC_DEFINE(M_MDSECT, "md_sectors", "Memory Disk Sectors");
 
 /* debug.mddebug: values above 1 make s_read()/s_write() print each access. */
 static int md_debug;
 SYSCTL_INT(_debug, OID_AUTO, mddebug, CTLFLAG_RW, &md_debug, 0,
     "Enable md(4) debug messages");
 /* vm.md_malloc_wait: non-zero selects M_WAITOK instead of M_NOWAIT in new_indir(). */
 static int md_malloc_wait;
 SYSCTL_INT(_vm, OID_AUTO, md_malloc_wait, CTLFLAG_RW, &md_malloc_wait, 0,
     "Allow malloc to wait for memory allocations");
 
 /* Default root filesystem type when MD_ROOT is set without MD_ROOT_FSTYPE. */
 #if defined(MD_ROOT) && !defined(MD_ROOT_FSTYPE)
 #define	MD_ROOT_FSTYPE	"ufs"
 #endif
 
 #if defined(MD_ROOT)
 /*
  * Preloaded image gets put here.
  */
 #if defined(MD_ROOT_SIZE)
 /*
  * We put the mfs_root symbol into the oldmfs section of the kernel object file.
  * Applications that patch the object with the image can determine
  * the size looking at the oldmfs section size within the kernel.
  */
 u_char mfs_root[MD_ROOT_SIZE*1024] __attribute__ ((section ("oldmfs")));
 const int mfs_root_size = sizeof(mfs_root);
 #elif defined(MD_ROOT_MEM)
 /* MD region already mapped in the memory */
 u_char *mfs_root;
 int mfs_root_size;
 #else
 /*
  * Neither MD_ROOT_SIZE nor MD_ROOT_MEM: the image is delimited by the weak
  * symbols mfs_root and mfs_root_end, and its size is the distance between
  * their addresses.
  */
 extern volatile u_char __weak_symbol mfs_root;
 extern volatile u_char __weak_symbol mfs_root_end;
 __GLOBL(mfs_root);
 __GLOBL(mfs_root_end);
 #define mfs_root_size ((uintptr_t)(&mfs_root_end - &mfs_root))
 #endif
 #endif
 
 /* Forward declarations of the GEOM class methods wired into g_md_class. */
 static g_init_t g_md_init;
 static g_fini_t g_md_fini;
 static g_start_t g_md_start;
 static g_access_t g_md_access;
 static void g_md_dumpconf(struct sbuf *sb, const char *indent,
     struct g_geom *gp, struct g_consumer *cp __unused, struct g_provider *pp);
 static g_provgone_t g_md_providergone;
 
 /* Driver-wide state. */
 static struct cdev *status_dev = NULL;
 static struct sx md_sx;
 static struct unrhdr *md_uh;
 
 /* ioctl handler wired into mdctl_cdevsw. */
 static d_ioctl_t mdctlioctl;
 
 /* Character device switch named MD_NAME; the only entry point set is ioctl. */
 static struct cdevsw mdctl_cdevsw = {
 	.d_version =	D_VERSION,
 	.d_ioctl =	mdctlioctl,
 	.d_name =	MD_NAME,
 };
 
 /* GEOM class "MD": binds the g_md_* methods declared in this file. */
 struct g_class g_md_class = {
 	.name = "MD",
 	.version = G_VERSION,
 	.init = g_md_init,
 	.fini = g_md_fini,
 	.start = g_md_start,
 	.access = g_md_access,
 	.dumpconf = g_md_dumpconf,
 	.providergone = g_md_providergone,
 };
 
 DECLARE_GEOM_CLASS(g_md_class, g_md);
 
 
 /* List of md_s softcs (linked through md_s.list). */
 static LIST_HEAD(, md_s) md_softc_list = LIST_HEAD_INITIALIZER(md_softc_list);
 
 /* Number of uintptr_t slots that fit in one page, and the matching index mask. */
 #define NINDIR	(PAGE_SIZE / sizeof(uintptr_t))
 #define NMASK	(NINDIR-1)
 /*
  * Shift step between adjacent tree levels: dimension() multiplies the layer
  * count by it, new_indir() callers subtract it per level.
  */
 static int nshift;
 
 static uma_zone_t md_pbuf_zone;
 
 /*
  * One node of the sector lookup tree used by s_read()/s_write().
  * A node with a non-zero shift holds pointers to child nodes; a node with
  * shift 0 is a leaf whose slots hold the per-sector values.
  */
 struct indir {
 	uintptr_t	*array;		/* NINDIR slots */
 	u_int		total;		/* set to NINDIR on allocation */
 	u_int		used;		/* number of non-zero slots */
 	u_int		shift;		/* offset bits consumed below this level */
 };
 
 /*
  * Per-device softc for one md unit.  Generic fields come first, followed
  * by groups that are only meaningful for a particular md type, as the
  * section comments below indicate.
  */
 struct md_s {
 	int unit;
 	LIST_ENTRY(md_s) list;
 	/* Pending bios, filled by g_md_start() under queue_mtx. */
 	struct bio_queue_head bio_queue;
 	struct mtx queue_mtx;
-	struct mtx stat_mtx;
 	struct cdev *dev;
 	enum md_types type;
 	off_t mediasize;
 	unsigned sectorsize;
 	/* Set to 1 or 0 by g_md_access() as access counts leave or reach zero. */
 	unsigned opencount;
 	unsigned fwheads;
 	unsigned fwsectors;
 	char ident[32];
 	unsigned flags;
 	char name[20];
 	struct proc *procp;
 	struct g_geom *gp;
 	struct g_provider *pp;
 	int (*start)(struct md_s *sc, struct bio *bp);
 	struct devstat *devstat;
 
 	/* MD_MALLOC related fields */
 	struct indir *indir;
 	uma_zone_t uma;
 
 	/* MD_PRELOAD related fields */
 	u_char *pl_ptr;
 	size_t pl_len;
 
 	/* MD_VNODE related fields */
 	struct vnode *vnode;
 	char file[PATH_MAX];
 	char label[PATH_MAX];
 	struct ucred *cred;
 
 	/* MD_SWAP related fields */
 	vm_object_t object;
 };
 
 /*
  * Allocate a zeroed tree node with NINDIR slots for the given shift.
  * Allocations may sleep only when md_malloc_wait is set; otherwise NULL is
  * returned if either allocation fails.
  */
 static struct indir *
 new_indir(u_int shift)
 {
 	struct indir *node;
 
 	node = malloc(sizeof(*node), M_MD,
 	    (md_malloc_wait ? M_WAITOK : M_NOWAIT) | M_ZERO);
 	if (node != NULL) {
 		node->array = malloc(sizeof(uintptr_t) * NINDIR, M_MDSECT,
 		    (md_malloc_wait ? M_WAITOK : M_NOWAIT) | M_ZERO);
 		if (node->array != NULL) {
 			node->total = NINDIR;
 			node->shift = shift;
 		} else {
 			/* No slot array: release the node and report failure. */
 			free(node, M_MD);
 			node = NULL;
 		}
 	}
 	return (node);
 }
 
 static void
 del_indir(struct indir *ip)
 {
 
 	free(ip->array, M_MDSECT);
 	free(ip, M_MD);
 }
 
 static void
 destroy_indir(struct md_s *sc, struct indir *ip)
 {
 	int i;
 
 	for (i = 0; i < NINDIR; i++) {
 		if (!ip->array[i])
 			continue;
 		if (ip->shift)
 			destroy_indir(sc, (struct indir*)(ip->array[i]));
 		else if (ip->array[i] > 255)
 			uma_zfree(sc->uma, (void *)(ip->array[i]));
 	}
 	del_indir(ip);
 }
 
 /*
  * This function does the math and allocates the top level "indir" structure
  * for a device of "size" sectors.
  */
 
 static struct indir *
 dimension(off_t size)
 {
 	struct indir *top;
 	off_t remaining;
 	int levels;
 
 	/* Count how many times size can be divided by NINDIR. */
 	levels = 0;
 	for (remaining = size; remaining > NINDIR; remaining /= NINDIR)
 		levels++;
 
 	/*
 	 * XXX: the top layer is probably not fully populated, so the
 	 * array allocated here is larger than strictly needed.
 	 */
 	top = malloc(sizeof(*top), M_MD, M_WAITOK | M_ZERO);
 	top->array = malloc(sizeof(uintptr_t) * NINDIR,
 	    M_MDSECT, M_WAITOK | M_ZERO);
 	top->shift = levels * nshift;
 	top->total = NINDIR;
 	return (top);
 }
 
 /*
  * Read a given sector
  */
 
 static uintptr_t
 s_read(struct indir *ip, off_t offset)
 {
 	struct indir *node;
 
 	if (md_debug > 1)
 		printf("s_read(%jd)\n", (intmax_t)offset);
 
 	/* Descend through the interior nodes until a leaf or a hole is hit. */
 	node = ip;
 	while (node != NULL && node->shift != 0)
 		node = (struct indir *)
 		    node->array[(offset >> node->shift) & NMASK];
 
 	/* A missing branch reads back as 0. */
 	if (node == NULL)
 		return (0);
 	return (node->array[offset & NMASK]);
 }
 
 /*
  * Write a given sector, prune the tree if the value is 0
  */
 
 /*
  * Store ptr in the leaf slot for sector "offset" of the tree rooted at ip,
  * allocating missing interior nodes on the way down.  If the leaf becomes
  * empty, empty nodes are freed back up the path (the root is never freed).
  * Returns 0 on success or ENOSPC if a node could not be allocated.
  */
 static int
 s_write(struct indir *ip, off_t offset, uintptr_t ptr)
 {
 	/*
 	 * lip[] records the path from the root down to the leaf.
 	 * NOTE(review): assumes the tree never exceeds 10 levels; the depth
 	 * is not checked here.
 	 */
 	struct indir *cip, *lip[10];
 	int idx, li;
 	uintptr_t up;
 
 	if (md_debug > 1)
 		printf("s_write(%jd, %p)\n", (intmax_t)offset, (void *)ptr);
 	up = 0;
 	li = 0;
 	cip = ip;
 	for (;;) {
 		lip[li++] = cip;
 		if (cip->shift) {
 			/* Interior node: follow, or create, the child slot. */
 			idx = (offset >> cip->shift) & NMASK;
 			up = cip->array[idx];
 			if (up != 0) {
 				cip = (struct indir *)up;
 				continue;
 			}
 			/* Allocate branch */
 			cip->array[idx] =
 			    (uintptr_t)new_indir(cip->shift - nshift);
 			if (cip->array[idx] == 0)
 				return (ENOSPC);
 			cip->used++;
 			up = cip->array[idx];
 			cip = (struct indir *)up;
 			continue;
 		}
 		/* leafnode */
 		/* Replace the slot and keep the used count in step. */
 		idx = offset & NMASK;
 		up = cip->array[idx];
 		if (up != 0)
 			cip->used--;
 		cip->array[idx] = ptr;
 		if (ptr != 0)
 			cip->used++;
 		break;
 	}
 	/* Nothing to prune if the leaf still has entries or is the root. */
 	if (cip->used != 0 || li == 1)
 		return (0);
 	li--;
 	/* Walk back up the recorded path, freeing nodes that became empty. */
 	while (cip->used == 0 && cip != ip) {
 		li--;
 		idx = (offset >> lip[li]->shift) & NMASK;
 		up = lip[li]->array[idx];
 		KASSERT(up == (uintptr_t)cip, ("md screwed up"));
 		del_indir(cip);
 		lip[li]->array[idx] = 0;
 		lip[li]->used--;
 		cip = lip[li];
 	}
 	return (0);
 }
 
 
 /*
  * GEOM access method.  r, w and e are deltas to the provider's read, write
  * and exclusive counts.
  *
  * Without a softc only non-positive deltas are accepted (ENXIO otherwise).
  * A resulting write count above zero on a read-only device yields EROFS.
  * sc->opencount is set to 1 when the counts go from zero to positive and
  * to 0 when they go from positive to zero.
  */
 static int
 g_md_access(struct g_provider *pp, int r, int w, int e)
 {
 	struct md_s *sc = pp->geom->softc;
 
 	if (sc == NULL)
 		return ((r <= 0 && w <= 0 && e <= 0) ? 0 : ENXIO);
 
 	/* Turn the deltas into the resulting totals. */
 	r += pp->acr;
 	w += pp->acw;
 	e += pp->ace;
 
 	if (w > 0 && (sc->flags & MD_READONLY) != 0)
 		return (EROFS);
 
 	if ((pp->acr + pp->acw + pp->ace) == 0) {
 		/* Currently closed: first open if any count ends up positive. */
 		if ((r + w + e) > 0)
 			sc->opencount = 1;
 	} else if ((pp->acr + pp->acw + pp->ace) > 0) {
 		/* Currently open: last close if all counts drop to zero. */
 		if ((r + w + e) == 0)
 			sc->opencount = 0;
 	}
 	return (0);
 }
 
 /*
  * GEOM start method: begin a devstat transaction for READ/WRITE bios,
  * then put the bio on sc->bio_queue and wake whoever sleeps on 'sc'
  * (md_kthread() sleeps on that channel).
  * The lines marked '-' are the stat_mtx locking dropped by this diff.
  */
 static void
 g_md_start(struct bio *bp)
 {
 	struct md_s *sc;
 
 	sc = bp->bio_to->geom->softc;
 	if ((bp->bio_cmd == BIO_READ) || (bp->bio_cmd == BIO_WRITE)) {
-		mtx_lock(&sc->stat_mtx);
 		devstat_start_transaction_bio(sc->devstat, bp);
-		mtx_unlock(&sc->stat_mtx);
 	}
 	/* The queue and the wakeup are both protected by queue_mtx. */
 	mtx_lock(&sc->queue_mtx);
 	bioq_disksort(&sc->bio_queue, bp);
 	wakeup(sc);
 	mtx_unlock(&sc->queue_mtx);
 }
 
 /*
  * Operation codes for md_malloc_move_ma() and md_malloc_move_vlist():
  *   ZERO  - clear the bio buffer
  *   FILL  - set the bio buffer to a single byte value
  *   READ  - copy from 'ptr' into the bio buffer
  *   WRITE - copy from the bio buffer into 'ptr'
  *   CMP   - check whether the bio buffer holds one repeated byte
  */
 #define	MD_MALLOC_MOVE_ZERO	1
 #define	MD_MALLOC_MOVE_FILL	2
 #define	MD_MALLOC_MOVE_READ	3
 #define	MD_MALLOC_MOVE_WRITE	4
 #define	MD_MALLOC_MOVE_CMP	5
 
 /*
  * Apply operation 'op' to one sector's worth ('sectorsize' bytes) of an
  * unmapped bio described by the page array *mp and the byte offset
  * *ma_offs into its current page.  Pages are temporarily mapped with
  * sf_buf_alloc().  On return *mp / *ma_offs have been advanced past the
  * bytes processed, except for a failed MD_MALLOC_MOVE_CMP, where they are
  * restored to their values on entry.
  *
  * For MD_MALLOC_MOVE_CMP 'ptr' points to an off_t that receives the first
  * byte; EDOOFUS is returned if any later byte differs.  ENOMEM is
  * returned when sf_buf_alloc() fails.
  */
 static int
 md_malloc_move_ma(vm_page_t **mp, int *ma_offs, unsigned sectorsize,
     void *ptr, u_char fill, int op)
 {
 	struct sf_buf *sf;
 	vm_page_t m, *mp1;
 	char *p, first;
 	off_t *uc;
 	unsigned n;
 	int error, i, ma_offs1, sz, first_read;
 
 	m = NULL;
 	error = 0;
 	sf = NULL;
 	/* if (op == MD_MALLOC_MOVE_CMP) { gcc */
 		first = 0;
 		first_read = 0;
 		uc = ptr;
 		mp1 = *mp;
 		ma_offs1 = *ma_offs;
 	/* } */
 	sched_pin();
 	for (n = sectorsize; n != 0; n -= sz) {
 		/* Never cross a page boundary within one step. */
 		sz = imin(PAGE_SIZE - *ma_offs, n);
 		if (m != **mp) {
 			/* Moved on to a new page: replace the mapping. */
 			if (sf != NULL)
 				sf_buf_free(sf);
 			m = **mp;
 			sf = sf_buf_alloc(m, SFB_CPUPRIVATE |
 			    (md_malloc_wait ? 0 : SFB_NOWAIT));
 			if (sf == NULL) {
 				error = ENOMEM;
 				break;
 			}
 		}
 		p = (char *)sf_buf_kva(sf) + *ma_offs;
 		switch (op) {
 		case MD_MALLOC_MOVE_ZERO:
 			bzero(p, sz);
 			break;
 		case MD_MALLOC_MOVE_FILL:
 			memset(p, fill, sz);
 			break;
 		case MD_MALLOC_MOVE_READ:
 			bcopy(ptr, p, sz);
 			cpu_flush_dcache(p, sz);
 			break;
 		case MD_MALLOC_MOVE_WRITE:
 			bcopy(p, ptr, sz);
 			break;
 		case MD_MALLOC_MOVE_CMP:
 			/* 'first' is latched once and kept across pages. */
 			for (i = 0; i < sz; i++, p++) {
 				if (!first_read) {
 					*uc = (u_char)*p;
 					first = *p;
 					first_read = 1;
 				} else if (*p != first) {
 					error = EDOOFUS;
 					break;
 				}
 			}
 			break;
 		default:
 			KASSERT(0, ("md_malloc_move_ma unknown op %d\n", op));
 			break;
 		}
 		if (error != 0)
 			break;
 		*ma_offs += sz;
 		*ma_offs %= PAGE_SIZE;
 		if (*ma_offs == 0)
 			(*mp)++;
 		ptr = (char *)ptr + sz;
 	}
 
 	if (sf != NULL)
 		sf_buf_free(sf);
 	sched_unpin();
 	/* A failed compare must leave the cursor where it started. */
 	if (op == MD_MALLOC_MOVE_CMP && error != 0) {
 		*mp = mp1;
 		*ma_offs = ma_offs1;
 	}
 	return (error);
 }
 
 /*
  * Apply operation 'op' to 'len' bytes of a bio whose data is described
  * by the segment list *pvlist, starting *pma_offs bytes into the current
  * segment.  On success *pvlist / *pma_offs are advanced past the bytes
  * processed and 0 is returned.
  *
  * For MD_MALLOC_MOVE_CMP 'ptr' points to an off_t that receives the first
  * byte of the range; EDOOFUS is returned (and the cursor is left
  * untouched) if any byte of the range differs from it.  The first byte is
  * latched once for the whole range, so a range that spans several
  * segments is compared against a single value, matching
  * md_malloc_move_ma().
  */
 static int
 md_malloc_move_vlist(bus_dma_segment_t **pvlist, int *pma_offs,
     unsigned len, void *ptr, u_char fill, int op)
 {
 	bus_dma_segment_t *vlist;
 	uint8_t *p, *end, first;
 	off_t *uc;
 	int ma_offs, seg_len, first_read;
 
 	vlist = *pvlist;
 	ma_offs = *pma_offs;
 	uc = ptr;
 	first = 0;
 	first_read = 0;
 
 	for (; len != 0; len -= seg_len) {
 		/* Never run past the end of the current segment. */
 		seg_len = imin(vlist->ds_len - ma_offs, len);
 		p = (uint8_t *)(uintptr_t)vlist->ds_addr + ma_offs;
 		switch (op) {
 		case MD_MALLOC_MOVE_ZERO:
 			bzero(p, seg_len);
 			break;
 		case MD_MALLOC_MOVE_FILL:
 			memset(p, fill, seg_len);
 			break;
 		case MD_MALLOC_MOVE_READ:
 			bcopy(ptr, p, seg_len);
 			cpu_flush_dcache(p, seg_len);
 			break;
 		case MD_MALLOC_MOVE_WRITE:
 			bcopy(p, ptr, seg_len);
 			break;
 		case MD_MALLOC_MOVE_CMP:
 			end = p + seg_len;
 			if (!first_read) {
 				/* Latch the reference byte only once. */
 				first = *uc = *p;
 				first_read = 1;
 			}
 			/* Confirm every byte matches the reference byte */
 			for (; p < end; p++) {
 				if (*p != first)
 					return (EDOOFUS);
 			}
 			break;
 		default:
 			KASSERT(0, ("md_malloc_move_vlist unknown op %d\n", op));
 			break;
 		}
 
 		ma_offs += seg_len;
 		if (ma_offs == vlist->ds_len) {
 			ma_offs = 0;
 			vlist++;
 		}
 		ptr = (uint8_t *)ptr + seg_len;
 	}
 	*pvlist = vlist;
 	*pma_offs = ma_offs;
 
 	return (0);
 }
 
 /*
  * Handle a READ, WRITE or DELETE bio for a malloc-backed device, one
  * sector at a time.  Each sector's entry in the indirection tree is
  * either 0 (reads as zeros), a value <= 255 (the whole sector is filled
  * with that byte), or a pointer to a buffer from sc->uma.
  *
  * The bio data may be an ordinary mapped buffer, an unmapped page array
  * (BIO_UNMAPPED) or a segment list (BIO_VLIST).
  *
  * Returns 0 or an errno; other commands get EOPNOTSUPP.  bio_resid is
  * always set to 0.
  */
 static int
 mdstart_malloc(struct md_s *sc, struct bio *bp)
 {
 	u_char *dst;
 	vm_page_t *m;
 	bus_dma_segment_t *vlist;
 	int i, error, error1, ma_offs, notmapped;
 	off_t secno, nsec, uc;
 	uintptr_t sp, osp;
 
 	switch (bp->bio_cmd) {
 	case BIO_READ:
 	case BIO_WRITE:
 	case BIO_DELETE:
 		break;
 	default:
 		return (EOPNOTSUPP);
 	}
 
 	notmapped = (bp->bio_flags & BIO_UNMAPPED) != 0;
 	vlist = (bp->bio_flags & BIO_VLIST) != 0 ?
 	    (bus_dma_segment_t *)bp->bio_data : NULL;
 	if (notmapped) {
 		m = bp->bio_ma;
 		ma_offs = bp->bio_ma_offset;
 		dst = NULL;
 		KASSERT(vlist == NULL, ("vlists cannot be unmapped"));
 	} else if (vlist != NULL) {
 		ma_offs = bp->bio_ma_offset;
 		dst = NULL;
 	} else {
 		dst = bp->bio_data;
 	}
 
 	nsec = bp->bio_length / sc->sectorsize;
 	secno = bp->bio_offset / sc->sectorsize;
 	error = 0;
 	while (nsec--) {
 		/* osp is the sector's current entry; freed below if > 255. */
 		osp = s_read(sc->indir, secno);
 		if (bp->bio_cmd == BIO_DELETE) {
 			if (osp != 0)
 				error = s_write(sc->indir, secno, 0);
 		} else if (bp->bio_cmd == BIO_READ) {
 			if (osp == 0) {
 				if (notmapped) {
 					error = md_malloc_move_ma(&m, &ma_offs,
 					    sc->sectorsize, NULL, 0,
 					    MD_MALLOC_MOVE_ZERO);
 				} else if (vlist != NULL) {
 					error = md_malloc_move_vlist(&vlist,
 					    &ma_offs, sc->sectorsize, NULL, 0,
 					    MD_MALLOC_MOVE_ZERO);
 				} else
 					bzero(dst, sc->sectorsize);
 			} else if (osp <= 255) {
 				if (notmapped) {
 					error = md_malloc_move_ma(&m, &ma_offs,
 					    sc->sectorsize, NULL, osp,
 					    MD_MALLOC_MOVE_FILL);
 				} else if (vlist != NULL) {
 					error = md_malloc_move_vlist(&vlist,
 					    &ma_offs, sc->sectorsize, NULL, osp,
 					    MD_MALLOC_MOVE_FILL);
 				} else
 					memset(dst, osp, sc->sectorsize);
 			} else {
 				if (notmapped) {
 					error = md_malloc_move_ma(&m, &ma_offs,
 					    sc->sectorsize, (void *)osp, 0,
 					    MD_MALLOC_MOVE_READ);
 				} else if (vlist != NULL) {
 					error = md_malloc_move_vlist(&vlist,
 					    &ma_offs, sc->sectorsize,
 					    (void *)osp, 0,
 					    MD_MALLOC_MOVE_READ);
 				} else {
 					bcopy((void *)osp, dst, sc->sectorsize);
 					cpu_flush_dcache(dst, sc->sectorsize);
 				}
 			}
 			/* Reads keep the stored buffer. */
 			osp = 0;
 		} else if (bp->bio_cmd == BIO_WRITE) {
 			/*
 			 * With MD_COMPRESS, i == sectorsize afterwards means
 			 * the sector consists of the single byte value uc.
 			 */
 			if (sc->flags & MD_COMPRESS) {
 				if (notmapped) {
 					error1 = md_malloc_move_ma(&m, &ma_offs,
 					    sc->sectorsize, &uc, 0,
 					    MD_MALLOC_MOVE_CMP);
 					i = error1 == 0 ? sc->sectorsize : 0;
 				} else if (vlist != NULL) {
 					error1 = md_malloc_move_vlist(&vlist,
 					    &ma_offs, sc->sectorsize, &uc, 0,
 					    MD_MALLOC_MOVE_CMP);
 					i = error1 == 0 ? sc->sectorsize : 0;
 				} else {
 					uc = dst[0];
 					for (i = 1; i < sc->sectorsize; i++) {
 						if (dst[i] != uc)
 							break;
 					}
 				}
 			} else {
 				i = 0;
 				uc = 0;
 			}
 			if (i == sc->sectorsize) {
 				if (osp != uc)
 					error = s_write(sc->indir, secno, uc);
 			} else {
 				if (osp <= 255) {
 					/* No buffer yet: allocate and fill one. */
 					sp = (uintptr_t)uma_zalloc(sc->uma,
 					    md_malloc_wait ? M_WAITOK :
 					    M_NOWAIT);
 					if (sp == 0) {
 						error = ENOSPC;
 						break;
 					}
 					if (notmapped) {
 						error = md_malloc_move_ma(&m,
 						    &ma_offs, sc->sectorsize,
 						    (void *)sp, 0,
 						    MD_MALLOC_MOVE_WRITE);
 					} else if (vlist != NULL) {
 						error = md_malloc_move_vlist(
 						    &vlist, &ma_offs,
 						    sc->sectorsize, (void *)sp,
 						    0, MD_MALLOC_MOVE_WRITE);
 					} else {
 						bcopy(dst, (void *)sp,
 						    sc->sectorsize);
 					}
 					/*
 					 * Do not store a buffer whose copy-in
 					 * failed, and do not let s_write()
 					 * hide that failure.
 					 */
 					if (error != 0) {
 						uma_zfree(sc->uma, (void *)sp);
 						break;
 					}
 					error = s_write(sc->indir, secno, sp);
 					/*
 					 * s_write() fails before touching the
 					 * leaf, so sp is still ours to free.
 					 */
 					if (error != 0)
 						uma_zfree(sc->uma, (void *)sp);
 				} else {
 					/* Overwrite the existing buffer in place. */
 					if (notmapped) {
 						error = md_malloc_move_ma(&m,
 						    &ma_offs, sc->sectorsize,
 						    (void *)osp, 0,
 						    MD_MALLOC_MOVE_WRITE);
 					} else if (vlist != NULL) {
 						error = md_malloc_move_vlist(
 						    &vlist, &ma_offs,
 						    sc->sectorsize, (void *)osp,
 						    0, MD_MALLOC_MOVE_WRITE);
 					} else {
 						bcopy(dst, (void *)osp,
 						    sc->sectorsize);
 					}
 					osp = 0;
 				}
 			}
 		} else {
 			error = EOPNOTSUPP;
 		}
 		/* Release a buffer that the tree no longer references. */
 		if (osp > 255)
 			uma_zfree(sc->uma, (void*)osp);
 		if (error != 0)
 			break;
 		secno++;
 		if (!notmapped && vlist == NULL)
 			dst += sc->sectorsize;
 	}
 	bp->bio_resid = 0;
 	return (error);
 }
 
 /*
  * Copy 'len' bytes from the flat buffer 'src' into the segment list
  * 'vlist', starting 'offset' bytes into the list.
  */
 static void
 mdcopyto_vlist(void *src, bus_dma_segment_t *vlist, off_t offset, off_t len)
 {
 	uint8_t *from;
 	off_t chunk;
 
 	/* Skip whole segments that lie before the starting offset. */
 	for (; offset >= vlist->ds_len; vlist++)
 		offset -= vlist->ds_len;
 
 	/* Only the first segment is entered at a non-zero offset. */
 	for (from = src; len != 0; vlist++) {
 		chunk = omin(len, vlist->ds_len - offset);
 		bcopy(from, (void *)(uintptr_t)(vlist->ds_addr + offset),
 		    chunk);
 		from += chunk;
 		len -= chunk;
 		offset = 0;
 	}
 }
 
 /*
  * Copy 'len' bytes out of the segment list 'vlist', starting 'offset'
  * bytes into the list, to the flat buffer 'dst'.
  */
 static void
 mdcopyfrom_vlist(bus_dma_segment_t *vlist, off_t offset, void *dst, off_t len)
 {
 	uint8_t *to;
 	off_t chunk;
 
 	/* Skip whole segments that lie before the starting offset. */
 	for (; offset >= vlist->ds_len; vlist++)
 		offset -= vlist->ds_len;
 
 	/* Only the first segment is entered at a non-zero offset. */
 	for (to = dst; len != 0; vlist++) {
 		chunk = omin(len, vlist->ds_len - offset);
 		bcopy((void *)(uintptr_t)(vlist->ds_addr + offset), to,
 		    chunk);
 		to += chunk;
 		len -= chunk;
 		offset = 0;
 	}
 }
 
 /*
  * Serve a bio from the preloaded image at sc->pl_ptr.  READ copies from
  * the image into the bio, WRITE copies from the bio into the image; any
  * other command is left untouched.  Always sets bio_resid to 0 and
  * returns 0.
  */
 static int
 mdstart_preload(struct md_s *sc, struct bio *bp)
 {
 	uint8_t *image;
 	int use_vlist;
 
 	image = sc->pl_ptr + bp->bio_offset;
 	use_vlist = (bp->bio_flags & BIO_VLIST) != 0;
 
 	if (bp->bio_cmd == BIO_READ) {
 		if (use_vlist)
 			mdcopyto_vlist(image,
 			    (bus_dma_segment_t *)bp->bio_data,
 			    bp->bio_ma_offset, bp->bio_length);
 		else
 			bcopy(image, bp->bio_data, bp->bio_length);
 		cpu_flush_dcache(bp->bio_data, bp->bio_length);
 	} else if (bp->bio_cmd == BIO_WRITE) {
 		if (use_vlist)
 			mdcopyfrom_vlist((bus_dma_segment_t *)bp->bio_data,
 			    bp->bio_ma_offset, image, bp->bio_length);
 		else
 			bcopy(bp->bio_data, image, bp->bio_length);
 	}
 
 	bp->bio_resid = 0;
 	return (0);
 }
 
 /*
  * Handle a bio for a vnode-backed device by issuing VOP_READ/VOP_WRITE
  * (or VOP_FSYNC for BIO_FLUSH) on sc->vnode.
  *
  * The iovec for the transfer is built in one of four ways: a list of
  * zero_region chunks for BIO_DELETE, one entry per segment for
  * BIO_VLIST, a pbuf mapping that is refilled in steps for BIO_UNMAPPED,
  * or a single entry covering bio_data otherwise.
  *
  * Returns 0 or an errno (EOPNOTSUPP for other commands) and sets
  * bio_resid to the amount not transferred.
  */
 static int
 mdstart_vnode(struct md_s *sc, struct bio *bp)
 {
 	int error;
 	struct uio auio;
 	struct iovec aiov;
 	struct iovec *piov;
 	struct mount *mp;
 	struct vnode *vp;
 	struct buf *pb;
 	bus_dma_segment_t *vlist;
 	struct thread *td;
 	off_t iolen, iostart, len, zerosize;
 	int ma_offs, npages;
 
 	switch (bp->bio_cmd) {
 	case BIO_READ:
 		auio.uio_rw = UIO_READ;
 		break;
 	case BIO_WRITE:
 	case BIO_DELETE:
 		auio.uio_rw = UIO_WRITE;
 		break;
 	case BIO_FLUSH:
 		break;
 	default:
 		return (EOPNOTSUPP);
 	}
 
 	td = curthread;
 	vp = sc->vnode;
 	pb = NULL;
 	piov = NULL;
 	ma_offs = bp->bio_ma_offset;
 	len = bp->bio_length;
 
 	/*
 	 * VNODE I/O
 	 *
 	 * If an error occurs, we set BIO_ERROR but we do not set
 	 * B_INVAL because (for a write anyway), the buffer is
 	 * still valid.
 	 */
 
 	if (bp->bio_cmd == BIO_FLUSH) {
 		(void) vn_start_write(vp, &mp, V_WAIT);
 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 		error = VOP_FSYNC(vp, MNT_WAIT, td);
 		VOP_UNLOCK(vp, 0);
 		vn_finished_write(mp);
 		return (error);
 	}
 
 	auio.uio_offset = (vm_ooffset_t)bp->bio_offset;
 	auio.uio_resid = bp->bio_length;
 	auio.uio_segflg = UIO_SYSSPACE;
 	auio.uio_td = td;
 
 	if (bp->bio_cmd == BIO_DELETE) {
 		/*
 		 * Emulate BIO_DELETE by writing zeros.
 		 */
 		/* Largest multiple of the sector size within zero_region. */
 		zerosize = ZERO_REGION_SIZE -
 		    (ZERO_REGION_SIZE % sc->sectorsize);
 		auio.uio_iovcnt = howmany(bp->bio_length, zerosize);
 		piov = malloc(sizeof(*piov) * auio.uio_iovcnt, M_MD, M_WAITOK);
 		auio.uio_iov = piov;
 		while (len > 0) {
 			piov->iov_base = __DECONST(void *, zero_region);
 			piov->iov_len = len;
 			if (len > zerosize)
 				piov->iov_len = zerosize;
 			len -= piov->iov_len;
 			piov++;
 		}
 		/* Rewind so the free() below gets the allocation start. */
 		piov = auio.uio_iov;
 	} else if ((bp->bio_flags & BIO_VLIST) != 0) {
 		piov = malloc(sizeof(*piov) * bp->bio_ma_n, M_MD, M_WAITOK);
 		auio.uio_iov = piov;
 		vlist = (bus_dma_segment_t *)bp->bio_data;
 		while (len > 0) {
 			piov->iov_base = (void *)(uintptr_t)(vlist->ds_addr +
 			    ma_offs);
 			piov->iov_len = vlist->ds_len - ma_offs;
 			if (piov->iov_len > len)
 				piov->iov_len = len;
 			len -= piov->iov_len;
 			/* Only the first segment starts at an offset. */
 			ma_offs = 0;
 			vlist++;
 			piov++;
 		}
 		auio.uio_iovcnt = piov - auio.uio_iov;
 		piov = auio.uio_iov;
 	} else if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
 		pb = uma_zalloc(md_pbuf_zone, M_WAITOK);
 		bp->bio_resid = len;
 unmapped_step:
 		/* Map at most MAXPHYS worth of pages for this step. */
 		npages = atop(min(MAXPHYS, round_page(len + (ma_offs &
 		    PAGE_MASK))));
 		iolen = min(ptoa(npages) - (ma_offs & PAGE_MASK), len);
 		KASSERT(iolen > 0, ("zero iolen"));
 		pmap_qenter((vm_offset_t)pb->b_data,
 		    &bp->bio_ma[atop(ma_offs)], npages);
 		aiov.iov_base = (void *)((vm_offset_t)pb->b_data +
 		    (ma_offs & PAGE_MASK));
 		aiov.iov_len = iolen;
 		auio.uio_iov = &aiov;
 		auio.uio_iovcnt = 1;
 		auio.uio_resid = iolen;
 	} else {
 		aiov.iov_base = bp->bio_data;
 		aiov.iov_len = bp->bio_length;
 		auio.uio_iov = &aiov;
 		auio.uio_iovcnt = 1;
 	}
 	/* Remember where this transfer started for VOP_ADVISE below. */
 	iostart = auio.uio_offset;
 	if (auio.uio_rw == UIO_READ) {
 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 		error = VOP_READ(vp, &auio, 0, sc->cred);
 		VOP_UNLOCK(vp, 0);
 	} else {
 		(void) vn_start_write(vp, &mp, V_WAIT);
 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 		error = VOP_WRITE(vp, &auio, sc->flags & MD_ASYNC ? 0 : IO_SYNC,
 		    sc->cred);
 		VOP_UNLOCK(vp, 0);
 		vn_finished_write(mp);
 		/* A successful write clears the MD_VERIFY flag. */
 		if (error == 0)
 			sc->flags &= ~MD_VERIFY;
 	}
 
 	/* When MD_CACHE is set, try to avoid double-caching the data. */
 	if (error == 0 && (sc->flags & MD_CACHE) == 0)
 		VOP_ADVISE(vp, iostart, auio.uio_offset - 1,
 		    POSIX_FADV_DONTNEED);
 
 	if (pb != NULL) {
 		/* Unmapped bio: unmap this step and loop if data remains. */
 		pmap_qremove((vm_offset_t)pb->b_data, npages);
 		if (error == 0) {
 			len -= iolen;
 			bp->bio_resid -= iolen;
 			ma_offs += iolen;
 			if (len > 0)
 				goto unmapped_step;
 		}
 		uma_zfree(md_pbuf_zone, pb);
 	}
 
 	free(piov, M_MD);
 	/* The unmapped path maintained bio_resid itself. */
 	if (pb == NULL)
 		bp->bio_resid = auio.uio_resid;
 	return (error);
 }
 
 /*
  * Handle a READ, WRITE or DELETE bio for a swap-backed device by
  * operating page by page on the VM object sc->object.  The bio data may
  * be mapped, unmapped (BIO_UNMAPPED) or a segment list (BIO_VLIST).
  *
  * Returns 0, ENOSPC if the pager reported VM_PAGER_ERROR, or EOPNOTSUPP
  * for other commands.
  */
 static int
 mdstart_swap(struct md_s *sc, struct bio *bp)
 {
 	vm_page_t m;
 	u_char *p;
 	vm_pindex_t i, lastp;
 	bus_dma_segment_t *vlist;
 	int rv, ma_offs, offs, len, lastend;
 
 	switch (bp->bio_cmd) {
 	case BIO_READ:
 	case BIO_WRITE:
 	case BIO_DELETE:
 		break;
 	default:
 		return (EOPNOTSUPP);
 	}
 
 	p = bp->bio_data;
 	ma_offs = (bp->bio_flags & (BIO_UNMAPPED|BIO_VLIST)) != 0 ?
 	    bp->bio_ma_offset : 0;
 	vlist = (bp->bio_flags & BIO_VLIST) != 0 ?
 	    (bus_dma_segment_t *)bp->bio_data : NULL;
 
 	/*
 	 * offs is the offset at which to start operating on the
 	 * next (ie, first) page.  lastp is the last page on
 	 * which we're going to operate.  lastend is the ending
 	 * position within that last page (ie, PAGE_SIZE if
 	 * we're operating on complete aligned pages).
 	 */
 	offs = bp->bio_offset % PAGE_SIZE;
 	lastp = (bp->bio_offset + bp->bio_length - 1) / PAGE_SIZE;
 	lastend = (bp->bio_offset + bp->bio_length - 1) % PAGE_SIZE + 1;
 
 	rv = VM_PAGER_OK;
 	VM_OBJECT_WLOCK(sc->object);
 	vm_object_pip_add(sc->object, 1);
 	for (i = bp->bio_offset / PAGE_SIZE; i <= lastp; i++) {
 		/* Bytes of this page covered by the request. */
 		len = ((i == lastp) ? lastend : PAGE_SIZE) - offs;
 		m = vm_page_grab(sc->object, i, VM_ALLOC_SYSTEM);
 		if (bp->bio_cmd == BIO_READ) {
 			if (vm_page_all_valid(m))
 				rv = VM_PAGER_OK;
 			else
 				rv = vm_pager_get_pages(sc->object, &m, 1,
 				    NULL, NULL);
 			if (rv == VM_PAGER_ERROR) {
 				vm_page_free(m);
 				break;
 			} else if (rv == VM_PAGER_FAIL) {
 				/*
 				 * Pager does not have the page.  Zero
 				 * the allocated page, and mark it as
 				 * valid. Do not set dirty, the page
 				 * can be recreated if thrown out.
 				 */
 				pmap_zero_page(m);
 				vm_page_valid(m);
 			}
 			if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
 				pmap_copy_pages(&m, offs, bp->bio_ma,
 				    ma_offs, len);
 			} else if ((bp->bio_flags & BIO_VLIST) != 0) {
 				physcopyout_vlist(VM_PAGE_TO_PHYS(m) + offs,
 				    vlist, ma_offs, len);
 				cpu_flush_dcache(p, len);
 			} else {
 				physcopyout(VM_PAGE_TO_PHYS(m) + offs, p, len);
 				cpu_flush_dcache(p, len);
 			}
 		} else if (bp->bio_cmd == BIO_WRITE) {
 			/* A full-page write needs no existing contents. */
 			if (len == PAGE_SIZE || vm_page_all_valid(m))
 				rv = VM_PAGER_OK;
 			else
 				rv = vm_pager_get_pages(sc->object, &m, 1,
 				    NULL, NULL);
 			if (rv == VM_PAGER_ERROR) {
 				vm_page_free(m);
 				break;
 			} else if (rv == VM_PAGER_FAIL)
 				pmap_zero_page(m);
 
 			if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
 				pmap_copy_pages(bp->bio_ma, ma_offs, &m,
 				    offs, len);
 			} else if ((bp->bio_flags & BIO_VLIST) != 0) {
 				physcopyin_vlist(vlist, ma_offs,
 				    VM_PAGE_TO_PHYS(m) + offs, len);
 			} else {
 				physcopyin(p, VM_PAGE_TO_PHYS(m) + offs, len);
 			}
 
 			vm_page_valid(m);
 			vm_page_set_dirty(m);
 		} else if (bp->bio_cmd == BIO_DELETE) {
 			if (len == PAGE_SIZE || vm_page_all_valid(m))
 				rv = VM_PAGER_OK;
 			else
 				rv = vm_pager_get_pages(sc->object, &m, 1,
 				    NULL, NULL);
 			if (rv == VM_PAGER_ERROR) {
 				vm_page_free(m);
 				break;
 			} else if (rv == VM_PAGER_FAIL) {
 				vm_page_free(m);
 				m = NULL;
 			} else {
 				/* Page is valid. */
 				if (len != PAGE_SIZE) {
 					/* Partial page: zero just the range. */
 					pmap_zero_page_area(m, offs, len);
 					vm_page_set_dirty(m);
 				} else {
 					/* Whole page: drop it entirely. */
 					vm_pager_page_unswapped(m);
 					vm_page_free(m);
 					m = NULL;
 				}
 			}
 		}
 		/* m is NULL here when the page was freed above. */
 		if (m != NULL) {
 			vm_page_xunbusy(m);
 			vm_page_reference(m);
 		}
 
 		/* Actions on further pages start at offset 0 */
 		p += PAGE_SIZE - offs;
 		offs = 0;
 		ma_offs += len;
 	}
 	vm_object_pip_wakeup(sc->object);
 	VM_OBJECT_WUNLOCK(sc->object);
 	return (rv != VM_PAGER_ERROR ? 0 : ENOSPC);
 }
 
 /*
  * Null backend: reads return zeros, writes are accepted without storing
  * anything.  Always sets bio_resid to 0 and returns 0.
  */
 static int
 mdstart_null(struct md_s *sc, struct bio *bp)
 {
 
 	if (bp->bio_cmd == BIO_READ) {
 		bzero(bp->bio_data, bp->bio_length);
 		cpu_flush_dcache(bp->bio_data, bp->bio_length);
 	}
 	/* BIO_WRITE and every other command need no work here. */
 	bp->bio_resid = 0;
 	return (0);
 }
 
 /*
  * Per-device worker thread.  Takes bios off sc->bio_queue, answers
  * BIO_GETATTR requests inline and hands everything else to sc->start().
  * Exits once MD_SHUTDOWN is set, after setting MD_EXITING.
  */
 static void
 md_kthread(void *arg)
 {
 	struct md_s *sc;
 	struct bio *bp;
 	int error;
 
 	sc = arg;
 	thread_lock(curthread);
 	sched_prio(curthread, PRIBIO);
 	thread_unlock(curthread);
 	if (sc->type == MD_VNODE)
 		curthread->td_pflags |= TDP_NORUNNINGBUF;
 
 	for (;;) {
 		mtx_lock(&sc->queue_mtx);
 		if (sc->flags & MD_SHUTDOWN) {
 			sc->flags |= MD_EXITING;
 			mtx_unlock(&sc->queue_mtx);
 			kproc_exit(0);
 		}
 		bp = bioq_takefirst(&sc->bio_queue);
 		if (!bp) {
 			/* PDROP: queue_mtx is not held after the sleep. */
 			msleep(sc, &sc->queue_mtx, PRIBIO | PDROP, "mdwait", 0);
 			continue;
 		}
 		mtx_unlock(&sc->queue_mtx);
 		if (bp->bio_cmd == BIO_GETATTR) {
 			int isv = ((sc->flags & MD_VERIFY) != 0);
 
 			/*
 			 * error == -1 marks a bio that this loop must not
 			 * deliver itself (presumably g_handleattr_*() has
 			 * already completed it -- confirm against GEOM).
 			 */
 			if ((sc->fwsectors && sc->fwheads &&
 			    (g_handleattr_int(bp, "GEOM::fwsectors",
 			    sc->fwsectors) ||
 			    g_handleattr_int(bp, "GEOM::fwheads",
 			    sc->fwheads))) ||
 			    g_handleattr_int(bp, "GEOM::candelete", 1))
 				error = -1;
 			else if (sc->ident[0] != '\0' &&
 			    g_handleattr_str(bp, "GEOM::ident", sc->ident))
 				error = -1;
 			else if (g_handleattr_int(bp, "MNT::verified", isv))
 				error = -1;
 			else
 				error = EOPNOTSUPP;
 		} else {
 			error = sc->start(sc, bp);
 		}
 
 		if (bp->bio_cmd == BIO_READ || bp->bio_cmd == BIO_WRITE) {
 			/*
 			 * Devstat uses (bio_bcount, bio_resid) for
 			 * determining the length of the completed part of
 			 * the i/o.  g_io_deliver() will translate from
 			 * bio_completed to that, but it also destroys the
 			 * bio so we must do our own translation.
 			 */
 			bp->bio_bcount = bp->bio_length;
 			bp->bio_resid = (error == -1 ? bp->bio_bcount : 0);
 			devstat_end_transaction_bio(sc->devstat, bp);
 		}
 		if (error != -1) {
 			bp->bio_completed = bp->bio_length;
 			g_io_deliver(bp, error);
 		}
 	}
 }
 
 /*
  * Return the softc on md_softc_list whose unit number is 'unit', or NULL
  * if there is none.
  */
 static struct md_s *
 mdfind(int unit)
 {
 	struct md_s *cur;
 
 	LIST_FOREACH(cur, &md_softc_list, list) {
 		if (cur->unit == unit)
 			return (cur);
 	}
 	return (NULL);
 }
 
 /*
  * Allocate a softc for a new md device of the given type, reserve its
  * unit number (any free one when unit == -1), put it on md_softc_list
  * and start its worker thread.
  *
  * Returns the softc with *errp == 0, or NULL with *errp set to EBUSY
  * (unit not available) or to the kproc_create() error.
  * The lines marked '-' are the stat_mtx setup/teardown dropped by this
  * diff.
  */
 static struct md_s *
 mdnew(int unit, int *errp, enum md_types type)
 {
 	struct md_s *sc;
 	int error;
 
 	*errp = 0;
 	if (unit == -1)
 		unit = alloc_unr(md_uh);
 	else
 		unit = alloc_unr_specific(md_uh, unit);
 
 	if (unit == -1) {
 		*errp = EBUSY;
 		return (NULL);
 	}
 
 	sc = (struct md_s *)malloc(sizeof *sc, M_MD, M_WAITOK | M_ZERO);
 	sc->type = type;
 	bioq_init(&sc->bio_queue);
 	mtx_init(&sc->queue_mtx, "md bio queue", NULL, MTX_DEF);
-	mtx_init(&sc->stat_mtx, "md stat", NULL, MTX_DEF);
 	sc->unit = unit;
 	sprintf(sc->name, "md%d", unit);
 	LIST_INSERT_HEAD(&md_softc_list, sc, list);
 	error = kproc_create(md_kthread, sc, &sc->procp, 0, 0,"%s", sc->name);
 	if (error == 0)
 		return (sc);
 	/* Thread creation failed: undo everything done above. */
 	LIST_REMOVE(sc, list);
-	mtx_destroy(&sc->stat_mtx);
 	mtx_destroy(&sc->queue_mtx);
 	free_unr(md_uh, sc->unit);
 	free(sc, M_MD);
 	*errp = error;
 	return (NULL);
 }
 
 /*
  * Publish a configured softc to GEOM: create the geom and its provider
  * under the topology lock, copy over media and sector size, then create
  * the devstat entry.
  */
 static void
 mdinit(struct md_s *sc)
 {
 	struct g_provider *prov;
 	struct g_geom *geom;
 
 	g_topology_lock();
 	geom = g_new_geomf(&g_md_class, "md%d", sc->unit);
 	geom->softc = sc;
 	prov = g_new_providerf(geom, "md%d", sc->unit);
 	prov->flags |= G_PF_DIRECT_SEND | G_PF_DIRECT_RECEIVE;
 	prov->mediasize = sc->mediasize;
 	prov->sectorsize = sc->sectorsize;
 	/* Only these backends take unmapped bios; preload and null do not. */
 	if (sc->type == MD_MALLOC || sc->type == MD_VNODE ||
 	    sc->type == MD_SWAP)
 		prov->flags |= G_PF_ACCEPT_UNMAPPED;
 	sc->gp = geom;
 	sc->pp = prov;
 	g_error_provider(prov, 0);
 	g_topology_unlock();
 
 	sc->devstat = devstat_new_entry("md", sc->unit, sc->sectorsize,
 	    DEVSTAT_ALL_SUPPORTED, DEVSTAT_TYPE_DIRECT, DEVSTAT_PRIORITY_MAX);
 }
 
 static int
 mdcreate_malloc(struct md_s *sc, struct md_req *mdr)
 {
 	uintptr_t sp;
 	int error;
 	off_t u;
 
 	error = 0;
 	if (mdr->md_options & ~(MD_AUTOUNIT | MD_COMPRESS | MD_RESERVE))
 		return (EINVAL);
 	if (mdr->md_sectorsize != 0 && !powerof2(mdr->md_sectorsize))
 		return (EINVAL);
 	/* Compression doesn't make sense if we have reserved space */
 	if (mdr->md_options & MD_RESERVE)
 		mdr->md_options &= ~MD_COMPRESS;
 	if (mdr->md_fwsectors != 0)
 		sc->fwsectors = mdr->md_fwsectors;
 	if (mdr->md_fwheads != 0)
 		sc->fwheads = mdr->md_fwheads;
 	sc->flags = mdr->md_options & (MD_COMPRESS | MD_FORCE);
 	sc->indir = dimension(sc->mediasize / sc->sectorsize);
 	sc->uma = uma_zcreate(sc->name, sc->sectorsize, NULL, NULL, NULL, NULL,
 	    0x1ff, 0);
 	if (mdr->md_options & MD_RESERVE) {
 		off_t nsectors;
 
 		nsectors = sc->mediasize / sc->sectorsize;
 		for (u = 0; u < nsectors; u++) {
 			sp = (uintptr_t)uma_zalloc(sc->uma, (md_malloc_wait ?
 			    M_WAITOK : M_NOWAIT) | M_ZERO);
 			if (sp != 0)
 				error = s_write(sc->indir, u, sp);
 			else
 				error = ENOMEM;
 			if (error != 0)
 				break;
 		}
 	}
 	return (error);
 }
 
 
 /*
  * Replace the credentials held by 'sc' with a new reference to 'cred'.
  * When the device has a backing vnode, one sector is read from offset 0
  * with the new credentials and the result of that read is returned.
  */
 static int
 mdsetcred(struct md_s *sc, struct ucred *cred)
 {
 	struct uio auio;
 	struct iovec aiov;
 	char *tmpbuf;
 	int error;
 
 	/* Drop the previous reference, if any, and take a new one. */
 	if (sc->cred)
 		crfree(sc->cred);
 	sc->cred = crhold(cred);
 
 	if (!sc->vnode)
 		return (0);
 
 	/*
 	 * Horrible kludge to establish credentials for NFS  XXX.
 	 */
 	tmpbuf = malloc(sc->sectorsize, M_TEMP, M_WAITOK);
 	aiov.iov_base = tmpbuf;
 	aiov.iov_len = sc->sectorsize;
 
 	bzero(&auio, sizeof(auio));
 	auio.uio_iov = &aiov;
 	auio.uio_iovcnt = 1;
 	auio.uio_offset = 0;
 	auio.uio_rw = UIO_READ;
 	auio.uio_segflg = UIO_SYSSPACE;
 	auio.uio_resid = aiov.iov_len;
 
 	vn_lock(sc->vnode, LK_EXCLUSIVE | LK_RETRY);
 	error = VOP_READ(sc->vnode, &auio, 0, sc->cred);
 	VOP_UNLOCK(sc->vnode, 0);
 
 	free(tmpbuf, M_TEMP);
 	return (error);
 }
 
 /*
  * Set up the vnode backend for 'sc': copy the backing file name from
  * the request, open the file, require it to be a regular file, mark the
  * vnode VV_MD and install the caller's credentials.
  *
  * Returns 0 on success.  On failure after the open, the vnode is
  * unlocked and closed again at 'bad'.
  */
 static int
 mdcreate_vnode(struct md_s *sc, struct md_req *mdr, struct thread *td)
 {
 	struct vattr vattr;
 	struct nameidata nd;
 	char *fname;
 	int error, flags;
 
 	fname = mdr->md_file;
 	/* md_file_seg says whether the name is a user or kernel pointer. */
 	if (mdr->md_file_seg == UIO_USERSPACE) {
 		error = copyinstr(fname, sc->file, sizeof(sc->file), NULL);
 		if (error != 0)
 			return (error);
 	} else if (mdr->md_file_seg == UIO_SYSSPACE)
 		strlcpy(sc->file, fname, sizeof(sc->file));
 	else
 		return (EDOOFUS);
 
 	/*
 	 * If the user specified that this is a read only device, don't
 	 * set the FWRITE mask before trying to open the backing store.
 	 */
 	flags = FREAD | ((mdr->md_options & MD_READONLY) ? 0 : FWRITE) \
 	    | ((mdr->md_options & MD_VERIFY) ? O_VERIFY : 0);
 	NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, sc->file, td);
 	error = vn_open(&nd, &flags, 0, NULL);
 	if (error != 0)
 		return (error);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	if (nd.ni_vp->v_type != VREG) {
 		error = EINVAL;
 		goto bad;
 	}
 	error = VOP_GETATTR(nd.ni_vp, &vattr, td->td_ucred);
 	if (error != 0)
 		goto bad;
 	/* The v_vflag update below is done with the exclusive lock held. */
 	if (VOP_ISLOCKED(nd.ni_vp) != LK_EXCLUSIVE) {
 		vn_lock(nd.ni_vp, LK_UPGRADE | LK_RETRY);
 		if (VN_IS_DOOMED(nd.ni_vp)) {
 			/* Forced unmount. */
 			error = EBADF;
 			goto bad;
 		}
 	}
 	nd.ni_vp->v_vflag |= VV_MD;
 	VOP_UNLOCK(nd.ni_vp, 0);
 
 	if (mdr->md_fwsectors != 0)
 		sc->fwsectors = mdr->md_fwsectors;
 	if (mdr->md_fwheads != 0)
 		sc->fwheads = mdr->md_fwheads;
 	snprintf(sc->ident, sizeof(sc->ident), "MD-DEV%ju-INO%ju",
 	    (uintmax_t)vattr.va_fsid, (uintmax_t)vattr.va_fileid);
 	sc->flags = mdr->md_options & (MD_ASYNC | MD_CACHE | MD_FORCE |
 	    MD_VERIFY);
 	/* vn_open() was passed &flags; go by what it left there. */
 	if (!(flags & FWRITE))
 		sc->flags |= MD_READONLY;
 	sc->vnode = nd.ni_vp;
 
 	error = mdsetcred(sc, td->td_ucred);
 	if (error != 0) {
 		/* Re-lock so that 'bad' can unlock and close as usual. */
 		sc->vnode = NULL;
 		vn_lock(nd.ni_vp, LK_EXCLUSIVE | LK_RETRY);
 		nd.ni_vp->v_vflag &= ~VV_MD;
 		goto bad;
 	}
 	return (0);
 bad:
 	VOP_UNLOCK(nd.ni_vp, 0);
 	(void)vn_close(nd.ni_vp, flags, td->td_ucred, td);
 	return (error);
 }
 
 /*
  * GEOM providergone hook: flag the softc with MD_PROVIDERGONE and wake
  * up anyone sleeping on &sc->flags (mddestroy() waits there).
  */
 static void
 g_md_providergone(struct g_provider *pp)
 {
 	struct md_s *sc;
 
 	sc = pp->geom->softc;
 
 	mtx_lock(&sc->queue_mtx);
 	sc->flags |= MD_PROVIDERGONE;
 	wakeup(&sc->flags);
 	mtx_unlock(&sc->queue_mtx);
 }
 
 /*
  * Tear down an md device: wither its geom and wait for the provider to
  * go away, remove the devstat entry, stop the worker thread, then
  * release the backing resources (vnode, credentials, VM object,
  * indirection tree, UMA zone) and finally the softc itself.
  * Always returns 0.
  * The line marked '-' is the stat_mtx teardown dropped by this diff.
  */
 static int
 mddestroy(struct md_s *sc, struct thread *td)
 {
 
 	if (sc->gp) {
 		g_topology_lock();
 		g_wither_geom(sc->gp, ENXIO);
 		g_topology_unlock();
 
 		/* g_md_providergone() sets the flag and wakes us. */
 		mtx_lock(&sc->queue_mtx);
 		while (!(sc->flags & MD_PROVIDERGONE))
 			msleep(&sc->flags, &sc->queue_mtx, PRIBIO, "mddestroy", 0);
 		mtx_unlock(&sc->queue_mtx);
 	}
 	if (sc->devstat) {
 		devstat_remove_entry(sc->devstat);
 		sc->devstat = NULL;
 	}
 	/* Ask md_kthread() to exit and poll until it has set MD_EXITING. */
 	mtx_lock(&sc->queue_mtx);
 	sc->flags |= MD_SHUTDOWN;
 	wakeup(sc);
 	while (!(sc->flags & MD_EXITING))
 		msleep(sc->procp, &sc->queue_mtx, PRIBIO, "mddestroy", hz / 10);
 	mtx_unlock(&sc->queue_mtx);
-	mtx_destroy(&sc->stat_mtx);
 	mtx_destroy(&sc->queue_mtx);
 	if (sc->vnode != NULL) {
 		vn_lock(sc->vnode, LK_EXCLUSIVE | LK_RETRY);
 		sc->vnode->v_vflag &= ~VV_MD;
 		VOP_UNLOCK(sc->vnode, 0);
 		(void)vn_close(sc->vnode, sc->flags & MD_READONLY ?
 		    FREAD : (FREAD|FWRITE), sc->cred, td);
 	}
 	if (sc->cred != NULL)
 		crfree(sc->cred);
 	if (sc->object != NULL)
 		vm_object_deallocate(sc->object);
 	if (sc->indir)
 		destroy_indir(sc, sc->indir);
 	if (sc->uma)
 		uma_zdestroy(sc->uma);
 
 	LIST_REMOVE(sc, list);
 	free_unr(md_uh, sc->unit);
 	free(sc, M_MD);
 	return (0);
 }
 
 /*
  * Change the media size of an existing device to mdr->md_mediasize.
  * Vnode and null devices only get the new size recorded; swap devices
  * also have their VM object and swap reservation shrunk or grown.
  * Other types return EOPNOTSUPP.
  *
  * Returns 0, EDOM for a swap size that is not a positive multiple of
  * PAGE_SIZE or when swap_pager_reserve() fails, or ENOMEM when the swap
  * reservation cannot be obtained.
  */
 static int
 mdresize(struct md_s *sc, struct md_req *mdr)
 {
 	int error, res;
 	vm_pindex_t oldpages, newpages;
 
 	switch (sc->type) {
 	case MD_VNODE:
 	case MD_NULL:
 		break;
 	case MD_SWAP:
 		if (mdr->md_mediasize <= 0 ||
 		    (mdr->md_mediasize % PAGE_SIZE) != 0)
 			return (EDOM);
 		oldpages = OFF_TO_IDX(round_page(sc->mediasize));
 		newpages = OFF_TO_IDX(round_page(mdr->md_mediasize));
 		if (newpages < oldpages) {
 			/* Shrinking: drop pages, swap space and reservation. */
 			VM_OBJECT_WLOCK(sc->object);
 			vm_object_page_remove(sc->object, newpages, 0, 0);
 			swap_pager_freespace(sc->object, newpages,
 			    oldpages - newpages);
 			swap_release_by_cred(IDX_TO_OFF(oldpages -
 			    newpages), sc->cred);
 			sc->object->charge = IDX_TO_OFF(newpages);
 			sc->object->size = newpages;
 			VM_OBJECT_WUNLOCK(sc->object);
 		} else if (newpages > oldpages) {
 			/* Growing: reserve the extra swap before committing. */
 			res = swap_reserve_by_cred(IDX_TO_OFF(newpages -
 			    oldpages), sc->cred);
 			if (!res)
 				return (ENOMEM);
 			if ((mdr->md_options & MD_RESERVE) ||
 			    (sc->flags & MD_RESERVE)) {
 				error = swap_pager_reserve(sc->object,
 				    oldpages, newpages - oldpages);
 				if (error < 0) {
 					/* Give back the reservation made above. */
 					swap_release_by_cred(
 					    IDX_TO_OFF(newpages - oldpages),
 					    sc->cred);
 					return (EDOM);
 				}
 			}
 			VM_OBJECT_WLOCK(sc->object);
 			sc->object->charge = IDX_TO_OFF(newpages);
 			sc->object->size = newpages;
 			VM_OBJECT_WUNLOCK(sc->object);
 		}
 		break;
 	default:
 		return (EOPNOTSUPP);
 	}
 
 	sc->mediasize = mdr->md_mediasize;
 	g_topology_lock();
 	g_resize_provider(sc->pp, sc->mediasize);
 	g_topology_unlock();
 	return (0);
 }
 
 /*
  * Set up the swap backend for 'sc': allocate an OBJT_SWAP VM object
  * sized from the request, optionally reserve swap for all of it, and
  * install the caller's credentials.  The object is released again if
  * any step after its allocation fails.
  */
 static int
 mdcreate_swap(struct md_s *sc, struct md_req *mdr, struct thread *td)
 {
 	vm_ooffset_t npage;
 	int error;
 
 	/* The media size must be a positive multiple of the page size. */
 	if (sc->mediasize <= 0 || (sc->mediasize % PAGE_SIZE) != 0)
 		return (EDOM);
 	if ((mdr->md_options & MD_VERIFY) != 0)
 		return (EINVAL);
 
 	if (mdr->md_fwsectors != 0)
 		sc->fwsectors = mdr->md_fwsectors;
 	if (mdr->md_fwheads != 0)
 		sc->fwheads = mdr->md_fwheads;
 
 	/* The division truncates to a whole number of pages. */
 	npage = mdr->md_mediasize / PAGE_SIZE;
 	sc->object = vm_pager_allocate(OBJT_SWAP, NULL, PAGE_SIZE * npage,
 	    VM_PROT_DEFAULT, 0, td->td_ucred);
 	if (sc->object == NULL)
 		return (ENOMEM);
 	sc->flags = mdr->md_options & (MD_FORCE | MD_RESERVE);
 
 	if ((mdr->md_options & MD_RESERVE) != 0 &&
 	    swap_pager_reserve(sc->object, 0, npage) < 0)
 		error = EDOM;
 	else
 		error = mdsetcred(sc, td->td_ucred);
 
 	if (error != 0) {
 		vm_object_deallocate(sc->object);
 		sc->object = NULL;
 	}
 	return (error);
 }
 
 /*
  * Set up the null backend.  Nothing is allocated; the only requirement
  * is a media size that is a positive multiple of the page size.
  */
 static int
 mdcreate_null(struct md_s *sc, struct md_req *mdr, struct thread *td)
 {
 
 	if (sc->mediasize > 0 && (sc->mediasize % PAGE_SIZE) == 0)
 		return (0);
 	return (EDOM);
 }
 
 /*
  * Create and attach a new md device described by 'mdr'.  Must be called
  * with md_sx exclusively locked.
  *
  * Validates type and sizes, allocates the softc with mdnew(), runs the
  * type-specific mdcreate_*() routine and, on success, publishes the
  * device with mdinit().  With MD_AUTOUNIT the chosen unit number is
  * written back to mdr->md_unit.  On failure after mdnew() the softc is
  * torn down with mddestroy().
  */
 static int
 kern_mdattach_locked(struct thread *td, struct md_req *mdr)
 {
 	struct md_s *sc;
 	unsigned sectsize;
 	int error, i;
 
 	sx_assert(&md_sx, SA_XLOCKED);
 
 	switch (mdr->md_type) {
 	case MD_MALLOC:
 	case MD_PRELOAD:
 	case MD_VNODE:
 	case MD_SWAP:
 	case MD_NULL:
 		break;
 	default:
 		return (EINVAL);
 	}
 	/* A sector size of 0 in the request selects DEV_BSIZE. */
 	if (mdr->md_sectorsize == 0)
 		sectsize = DEV_BSIZE;
 	else
 		sectsize = mdr->md_sectorsize;
 	if (sectsize > MAXPHYS || mdr->md_mediasize < sectsize)
 		return (EINVAL);
 	if (mdr->md_options & MD_AUTOUNIT)
 		sc = mdnew(-1, &error, mdr->md_type);
 	else {
 		if (mdr->md_unit > INT_MAX)
 			return (EINVAL);
 		sc = mdnew(mdr->md_unit, &error, mdr->md_type);
 	}
 	if (sc == NULL)
 		return (error);
 	/* mdnew() left error at 0; it changes only if the copy fails. */
 	if (mdr->md_label != NULL)
 		error = copyinstr(mdr->md_label, sc->label,
 		    sizeof(sc->label), NULL);
 	if (error != 0)
 		goto err_after_new;
 	if (mdr->md_options & MD_AUTOUNIT)
 		mdr->md_unit = sc->unit;
 	sc->mediasize = mdr->md_mediasize;
 	sc->sectorsize = sectsize;
 	error = EDOOFUS;
 	switch (sc->type) {
 	case MD_MALLOC:
 		sc->start = mdstart_malloc;
 		error = mdcreate_malloc(sc, mdr);
 		break;
 	case MD_PRELOAD:
 		/*
 		 * We disallow attaching preloaded memory disks via
 		 * ioctl. Preloaded memory disks are automatically
 		 * attached in g_md_init().
 		 */
 		error = EOPNOTSUPP;
 		break;
 	case MD_VNODE:
 		sc->start = mdstart_vnode;
 		error = mdcreate_vnode(sc, mdr, td);
 		break;
 	case MD_SWAP:
 		sc->start = mdstart_swap;
 		error = mdcreate_swap(sc, mdr, td);
 		break;
 	case MD_NULL:
 		sc->start = mdstart_null;
 		error = mdcreate_null(sc, mdr, td);
 		break;
 	}
 err_after_new:
 	if (error != 0) {
 		mddestroy(sc, td);
 		return (error);
 	}
 
 	/* Prune off any residual fractional sector */
 	i = sc->mediasize % sc->sectorsize;
 	sc->mediasize -= i;
 
 	mdinit(sc);
 	return (0);
 }
 
 /*
  * Attach a new md device; runs kern_mdattach_locked() with md_sx held
  * exclusively.
  */
 static int
 kern_mdattach(struct thread *td, struct md_req *mdr)
 {
 	int rv;
 
 	sx_xlock(&md_sx);
 	rv = kern_mdattach_locked(td, mdr);
 	sx_xunlock(&md_sx);
 
 	return (rv);
 }
 
 /*
  * Detach and destroy the md device named by mdr->md_unit.  Must be
  * called with md_sx exclusively locked.
  *
  * Returns EINVAL for a non-zero media size or options other than
  * MD_FORCE, ENOENT for an unknown unit, and EBUSY when the device is
  * open and neither the device nor the request carries MD_FORCE.
  */
 static int
 kern_mddetach_locked(struct thread *td, struct md_req *mdr)
 {
 	struct md_s *sc;
 
 	sx_assert(&md_sx, SA_XLOCKED);
 
 	if (mdr->md_mediasize != 0)
 		return (EINVAL);
 	if ((mdr->md_options & ~MD_FORCE) != 0)
 		return (EINVAL);
 
 	sc = mdfind(mdr->md_unit);
 	if (sc == NULL)
 		return (ENOENT);
 
 	if (sc->opencount != 0) {
 		/* An open device is only destroyed when forced. */
 		if ((sc->flags & MD_FORCE) == 0 &&
 		    (mdr->md_options & MD_FORCE) == 0)
 			return (EBUSY);
 	}
 	return (mddestroy(sc, td));
 }
 
 /*
  * Unlocked wrapper for kern_mddetach_locked(): holds md_sx exclusively for
  * the duration of the call and passes the error code through.
  */
 static int
 kern_mddetach(struct thread *td, struct md_req *mdr)
 {
 	int error;
 
 	sx_xlock(&md_sx);
 	error = kern_mddetach_locked(td, mdr);
 	sx_xunlock(&md_sx);
 	return (error);
 }
 
 /*
  * Resize the memory disk mdr->md_unit to mdr->md_mediasize bytes.
  * The caller must hold md_sx exclusively.
  *
  * Returns EINVAL for options other than MD_FORCE/MD_RESERVE or for a new
  * size smaller than one sector, ENOENT for an unknown unit, EBUSY when the
  * request would shrink the disk and MD_FORCE is set on neither the disk nor
  * the request, and otherwise the result of mdresize().
  */
 static int
 kern_mdresize_locked(struct md_req *mdr)
 {
 	struct md_s *sc;
 
 	sx_assert(&md_sx, SA_XLOCKED);
 
 	if ((mdr->md_options & ~(MD_FORCE | MD_RESERVE)) != 0)
 		return (EINVAL);
 
 	sc = mdfind(mdr->md_unit);
 	if (sc == NULL)
 		return (ENOENT);
 	if (mdr->md_mediasize < sc->sectorsize)
 		return (EINVAL);
 	/* Shrinking is refused unless explicitly forced. */
 	if (mdr->md_mediasize < sc->mediasize &&
 	    !(sc->flags & MD_FORCE) &&
 	    !(mdr->md_options & MD_FORCE))
 		return (EBUSY);
 	return (mdresize(sc, mdr));
 }
 
 /*
  * Unlocked wrapper for kern_mdresize_locked(): holds md_sx exclusively for
  * the duration of the call and passes the error code through.
  */
 static int
 kern_mdresize(struct md_req *mdr)
 {
 	int error;
 
 	sx_xlock(&md_sx);
 	error = kern_mdresize_locked(mdr);
 	sx_xunlock(&md_sx);
 	return (error);
 }
 
 /*
  * Report the configuration of memory disk mdr->md_unit.
  * The caller must hold md_sx exclusively.
  *
  * On success the type, flags, media size and sector size are stored in mdr.
  * The NUL-terminated label is copied out to mdr->md_label when that pointer
  * is non-NULL, and the backing file name is copied out to mdr->md_file for
  * vnode-backed disks and for preloaded disks with a non-NULL md_file.
  *
  * Returns ENOENT for an unknown unit, or the error from copyout().
  */
 static int
 kern_mdquery_locked(struct md_req *mdr)
 {
 	struct md_s *sc;
 	int error;
 
 	sx_assert(&md_sx, SA_XLOCKED);
 
 	sc = mdfind(mdr->md_unit);
 	if (sc == NULL)
 		return (ENOENT);
 	mdr->md_type = sc->type;
 	mdr->md_options = sc->flags;
 	mdr->md_mediasize = sc->mediasize;
 	mdr->md_sectorsize = sc->sectorsize;
 	error = 0;
 	if (mdr->md_label != NULL) {
 		error = copyout(sc->label, mdr->md_label,
 		    strlen(sc->label) + 1);
 		if (error != 0)
 			return (error);
 	}
 	/*
 	 * NOTE(review): md_file is only NULL-checked on the MD_PRELOAD
 	 * path; for MD_VNODE the copyout is attempted unconditionally, and
 	 * md_file_seg is not consulted here -- confirm this is intended.
 	 */
 	if (sc->type == MD_VNODE ||
 	    (sc->type == MD_PRELOAD && mdr->md_file != NULL))
 		error = copyout(sc->file, mdr->md_file,
 		    strlen(sc->file) + 1);
 	return (error);
 }
 
 /*
  * Unlocked wrapper for kern_mdquery_locked(): holds md_sx exclusively for
  * the duration of the call and passes the error code through.
  */
 static int
 kern_mdquery(struct md_req *mdr)
 {
 	int error;
 
 	sx_xlock(&md_sx);
 	error = kern_mdquery_locked(mdr);
 	sx_xunlock(&md_sx);
 	return (error);
 }
 
 /*
  * Copy members that are not userspace pointers from an ioctl structure
  * (mdio) into a kernel request (mdr).  md_units is pointed at the ioctl
  * structure's md_pad array and md_units_nitems records its element count.
  * Both macro arguments are evaluated more than once.
  */
 #define	MD_IOCTL2REQ(mdio, mdr) do {					\
 	(mdr)->md_unit = (mdio)->md_unit;				\
 	(mdr)->md_type = (mdio)->md_type;				\
 	(mdr)->md_mediasize = (mdio)->md_mediasize;			\
 	(mdr)->md_sectorsize = (mdio)->md_sectorsize;			\
 	(mdr)->md_options = (mdio)->md_options;				\
 	(mdr)->md_fwheads = (mdio)->md_fwheads;				\
 	(mdr)->md_fwsectors = (mdio)->md_fwsectors;			\
 	(mdr)->md_units = &(mdio)->md_pad[0];				\
 	(mdr)->md_units_nitems = nitems((mdio)->md_pad);		\
 } while(0)
 
 /*
  * Copy members that might have been updated by a handler from the kernel
  * request (mdr) back into the ioctl structure (mdio).  Only the scalar
  * fields are written back; pointer members and the md_units fields are not.
  * Both macro arguments are evaluated more than once.
  */
 #define MD_REQ2IOCTL(mdr, mdio) do {					\
 	(mdio)->md_unit = (mdr)->md_unit;				\
 	(mdio)->md_type = (mdr)->md_type;				\
 	(mdio)->md_mediasize = (mdr)->md_mediasize;			\
 	(mdio)->md_sectorsize = (mdr)->md_sectorsize;			\
 	(mdio)->md_options = (mdr)->md_options;				\
 	(mdio)->md_fwheads = (mdr)->md_fwheads;				\
 	(mdio)->md_fwsectors = (mdr)->md_fwsectors;			\
 } while(0)
 
 /*
  * ioctl handler for the md control device.
  *
  * The work is done in three passes over cmd:
  *   1. translate the native or 32-bit compat ioctl structure at addr into a
  *      struct md_req (rejecting a mismatching md_version with EINVAL);
  *   2. dispatch to the attach/detach/resize/query handler;
  *   3. for attach and query, copy the possibly updated scalar fields back
  *      into the caller's ioctl structure.
  *
  * Returns the handler's error code, or ENOIOCTL for an unrecognized command.
  */
 static int
 mdctlioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags,
     struct thread *td)
 {
 	struct md_req mdr;
 	int error;
 
 	if (md_debug)
 		printf("mdctlioctl(%s %lx %p %x %p)\n",
 			devtoname(dev), cmd, addr, flags, td);
 
 	/* Pass 1: build the kernel request from the ioctl argument. */
 	bzero(&mdr, sizeof(mdr));
 	switch (cmd) {
 	case MDIOCATTACH:
 	case MDIOCDETACH:
 	case MDIOCRESIZE:
 	case MDIOCQUERY: {
 		struct md_ioctl *mdio = (struct md_ioctl *)addr;
 		if (mdio->md_version != MDIOVERSION)
 			return (EINVAL);
 		MD_IOCTL2REQ(mdio, &mdr);
 		mdr.md_file = mdio->md_file;
 		mdr.md_file_seg = UIO_USERSPACE;
 		/* If the file is adjacent to the md_ioctl it's in kernel. */
 		if ((void *)mdio->md_file == (void *)(mdio + 1))
 			mdr.md_file_seg = UIO_SYSSPACE;
 		mdr.md_label = mdio->md_label;
 		break;
 	}
 #ifdef COMPAT_FREEBSD32
 	case MDIOCATTACH_32:
 	case MDIOCDETACH_32:
 	case MDIOCRESIZE_32:
 	case MDIOCQUERY_32: {
 		struct md_ioctl32 *mdio = (struct md_ioctl32 *)addr;
 		if (mdio->md_version != MDIOVERSION)
 			return (EINVAL);
 		MD_IOCTL2REQ(mdio, &mdr);
 		/* 32-bit pointers are widened through uintptr_t. */
 		mdr.md_file = (void *)(uintptr_t)mdio->md_file;
 		mdr.md_file_seg = UIO_USERSPACE;
 		mdr.md_label = (void *)(uintptr_t)mdio->md_label;
 		break;
 	}
 #endif
 	default:
 		/* Fall through to handler switch. */
 		break;
 	}
 
 	/* Pass 2: run the handler; native and compat commands share one. */
 	error = 0;
 	switch (cmd) {
 	case MDIOCATTACH:
 #ifdef COMPAT_FREEBSD32
 	case MDIOCATTACH_32:
 #endif
 		error = kern_mdattach(td, &mdr);
 		break;
 	case MDIOCDETACH:
 #ifdef COMPAT_FREEBSD32
 	case MDIOCDETACH_32:
 #endif
 		error = kern_mddetach(td, &mdr);
 		break;
 	case MDIOCRESIZE:
 #ifdef COMPAT_FREEBSD32
 	case MDIOCRESIZE_32:
 #endif
 		error = kern_mdresize(&mdr);
 		break;
 	case MDIOCQUERY:
 #ifdef COMPAT_FREEBSD32
 	case MDIOCQUERY_32:
 #endif
 		error = kern_mdquery(&mdr);
 		break;
 	default:
 		error = ENOIOCTL;
 	}
 
 	/*
 	 * Pass 3: write results back.  This is done regardless of the
 	 * handler's error code.
 	 */
 	switch (cmd) {
 	case MDIOCATTACH:
 	case MDIOCQUERY: {
 		struct md_ioctl *mdio = (struct md_ioctl *)addr;
 		MD_REQ2IOCTL(&mdr, mdio);
 		break;
 	}
 #ifdef COMPAT_FREEBSD32
 	case MDIOCATTACH_32:
 	case MDIOCQUERY_32: {
 		struct md_ioctl32 *mdio = (struct md_ioctl32 *)addr;
 		MD_REQ2IOCTL(&mdr, mdio);
 		break;
 	}
 #endif
 	default:
 		/* Other commands do not alter mdr. */
 		break;
 	}
 
 	return (error);
 }
 
 /*
  * Create a memory disk backed by an image that is already in memory.
  *
  * image/length describe the in-memory data; name, when non-NULL, is recorded
  * as the disk's file name and used in the console message.  The unit number
  * is chosen automatically (mdnew() is called with -1).  If the disk cannot
  * be created the function returns silently.  Both callers in g_md_init()
  * hold md_sx exclusively around this call.
  */
 static void
 md_preloaded(u_char *image, size_t length, const char *name)
 {
 	struct md_s *sc;
 	int error;
 
 	sc = mdnew(-1, &error, MD_PRELOAD);
 	if (sc == NULL)
 		return;
 	sc->mediasize = length;
 	sc->sectorsize = DEV_BSIZE;
 	sc->pl_ptr = image;
 	sc->pl_len = length;
 	sc->start = mdstart_preload;
 	if (name != NULL)
 		strlcpy(sc->file, name, sizeof(sc->file));
 #ifdef MD_ROOT
 	/* Unit 0 is the root-filesystem candidate when MD_ROOT is set. */
 	if (sc->unit == 0) {
 #ifndef ROOTDEVNAME
 		rootdevnames[0] = MD_ROOT_FSTYPE ":/dev/md0";
 #endif
 #ifdef MD_ROOT_READONLY
 		sc->flags |= MD_READONLY;
 #endif
 	}
 #endif
 	mdinit(sc);
 	if (name != NULL) {
 		printf("%s%d: Preloaded image <%s> %zd bytes at %p\n",
 		    MD_NAME, sc->unit, name, length, image);
 	} else {
 		printf("%s%d: Embedded image %zd bytes at %p\n",
 		    MD_NAME, sc->unit, length, image);
 	}
 }
 
 /*
  * GEOM class init routine for md.
  *
  * Computes nshift, initializes the md_sx configuration lock and the unit
  * number allocator, attaches the embedded root image (MD_ROOT) and every
  * preloaded module of type "md_image" or "mfs_root", creates the pbuf zone
  * and finally the control device.  The GEOM topology lock is dropped for
  * the duration of this work and re-acquired before returning.
  */
 static void
 g_md_init(struct g_class *mp __unused)
 {
 	caddr_t mod;
 	u_char *ptr, *name, *type;
 	unsigned len;
 	int i;
 
 	/* figure out log2(NINDIR) */
 	for (i = NINDIR, nshift = -1; i; nshift++)
 		i >>= 1;
 
 	mod = NULL;
 	sx_init(&md_sx, "MD config lock");
 	g_topology_unlock();
 	md_uh = new_unrhdr(0, INT_MAX, NULL);
 #ifdef MD_ROOT
 	if (mfs_root_size != 0) {
 		sx_xlock(&md_sx);
 #ifdef MD_ROOT_MEM
 		md_preloaded(mfs_root, mfs_root_size, NULL);
 #else
 		md_preloaded(__DEVOLATILE(u_char *, &mfs_root), mfs_root_size,
 		    NULL);
 #endif
 		sx_xunlock(&md_sx);
 	}
 #endif
 	/* XXX: are preload_* static or do they need Giant ? */
 	while ((mod = preload_search_next_name(mod)) != NULL) {
 		name = (char *)preload_search_info(mod, MODINFO_NAME);
 		if (name == NULL)
 			continue;
 		type = (char *)preload_search_info(mod, MODINFO_TYPE);
 		if (type == NULL)
 			continue;
 		/* Only md images and mfs_root modules become disks. */
 		if (strcmp(type, "md_image") && strcmp(type, "mfs_root"))
 			continue;
 		ptr = preload_fetch_addr(mod);
 		/*
 		 * NOTE(review): len is `unsigned' while md_preloaded() takes
 		 * a size_t; confirm preload_fetch_size() cannot exceed
 		 * UINT_MAX for large images.
 		 */
 		len = preload_fetch_size(mod);
 		if (ptr != NULL && len != 0) {
 			sx_xlock(&md_sx);
 			md_preloaded(ptr, len, name);
 			sx_xunlock(&md_sx);
 		}
 	}
 	md_pbuf_zone = pbuf_zsecond_create("mdpbuf", nswbuf / 10);
 	status_dev = make_dev(&mdctl_cdevsw, INT_MAX, UID_ROOT, GID_WHEEL,
 	    0600, MDCTL_NAME);
 	g_topology_lock();
 }
 
 /*
  * GEOM dumpconf callback: describe a memory disk in sb.
  *
  * Nothing is emitted when the geom has no softc or when pp is NULL.  With
  * indent == NULL a compact single-line "key value" form is produced;
  * otherwise an XML fragment is written with each element prefixed by
  * indent.  The file name is reported only for vnode disks that have a vnode
  * and for preloaded disks with a non-empty file name; user-supplied strings
  * in the XML form go through g_conf_printf_escaped().
  */
 static void
 g_md_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
     struct g_consumer *cp __unused, struct g_provider *pp)
 {
 	struct md_s *mp;
 	char *type;
 
 	mp = gp->softc;
 	if (mp == NULL)
 		return;
 
 	/* Map the numeric disk type to its textual name. */
 	switch (mp->type) {
 	case MD_MALLOC:
 		type = "malloc";
 		break;
 	case MD_PRELOAD:
 		type = "preload";
 		break;
 	case MD_VNODE:
 		type = "vnode";
 		break;
 	case MD_SWAP:
 		type = "swap";
 		break;
 	case MD_NULL:
 		type = "null";
 		break;
 	default:
 		type = "unknown";
 		break;
 	}
 
 	if (pp != NULL) {
 		if (indent == NULL) {
 			sbuf_printf(sb, " u %d", mp->unit);
 			sbuf_printf(sb, " s %ju", (uintmax_t) mp->sectorsize);
 			sbuf_printf(sb, " f %ju", (uintmax_t) mp->fwheads);
 			sbuf_printf(sb, " fs %ju", (uintmax_t) mp->fwsectors);
 			sbuf_printf(sb, " l %ju", (uintmax_t) mp->mediasize);
 			sbuf_printf(sb, " t %s", type);
 			if ((mp->type == MD_VNODE && mp->vnode != NULL) ||
 			    (mp->type == MD_PRELOAD && mp->file[0] != '\0'))
 				sbuf_printf(sb, " file %s", mp->file);
 			sbuf_printf(sb, " label %s", mp->label);
 		} else {
 			sbuf_printf(sb, "%s<unit>%d</unit>\n", indent,
 			    mp->unit);
 			sbuf_printf(sb, "%s<sectorsize>%ju</sectorsize>\n",
 			    indent, (uintmax_t) mp->sectorsize);
 			sbuf_printf(sb, "%s<fwheads>%ju</fwheads>\n",
 			    indent, (uintmax_t) mp->fwheads);
 			sbuf_printf(sb, "%s<fwsectors>%ju</fwsectors>\n",
 			    indent, (uintmax_t) mp->fwsectors);
 			/* The ident element is omitted when empty. */
 			if (mp->ident[0] != '\0') {
 				sbuf_printf(sb, "%s<ident>", indent);
 				g_conf_printf_escaped(sb, "%s", mp->ident);
 				sbuf_printf(sb, "</ident>\n");
 			}
 			sbuf_printf(sb, "%s<length>%ju</length>\n",
 			    indent, (uintmax_t) mp->mediasize);
 			sbuf_printf(sb, "%s<compression>%s</compression>\n", indent,
 			    (mp->flags & MD_COMPRESS) == 0 ? "off": "on");
 			sbuf_printf(sb, "%s<access>%s</access>\n", indent,
 			    (mp->flags & MD_READONLY) == 0 ? "read-write":
 			    "read-only");
 			sbuf_printf(sb, "%s<type>%s</type>\n", indent,
 			    type);
 			if ((mp->type == MD_VNODE && mp->vnode != NULL) ||
 			    (mp->type == MD_PRELOAD && mp->file[0] != '\0')) {
 				sbuf_printf(sb, "%s<file>", indent);
 				g_conf_printf_escaped(sb, "%s", mp->file);
 				sbuf_printf(sb, "</file>\n");
 			}
 			/* The cache setting is reported for vnode disks only. */
 			if (mp->type == MD_VNODE)
 				sbuf_printf(sb, "%s<cache>%s</cache>\n", indent,
 				    (mp->flags & MD_CACHE) == 0 ? "off": "on");
 			sbuf_printf(sb, "%s<label>", indent);
 			g_conf_printf_escaped(sb, "%s", mp->label);
 			sbuf_printf(sb, "</label>\n");
 		}
 	}
 }
 
 /*
  * GEOM class fini routine for md: releases what g_md_init() set up, namely
  * the configuration lock, the control device (if it was created), the pbuf
  * zone and the unit number allocator.
  *
  * NOTE(review): md_sx is destroyed before the control device is removed;
  * confirm no ioctl can still be in flight at this point.
  */
 static void
 g_md_fini(struct g_class *mp __unused)
 {
 
 	sx_destroy(&md_sx);
 	if (status_dev != NULL)
 		destroy_dev(status_dev);
 	uma_zdestroy(md_pbuf_zone);
 	delete_unrhdr(md_uh);
 }
Index: head/sys/dev/nvdimm/nvdimm_spa.c
===================================================================
--- head/sys/dev/nvdimm/nvdimm_spa.c	(revision 356199)
+++ head/sys/dev/nvdimm/nvdimm_spa.c	(revision 356200)
@@ -1,627 +1,622 @@
 /*-
  * Copyright (c) 2017, 2018 The FreeBSD Foundation
  * All rights reserved.
  * Copyright (c) 2018, 2019 Intel Corporation
  *
  * This software was developed by Konstantin Belousov <kib@FreeBSD.org>
  * under sponsorship from the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_acpi.h"
 #include "opt_ddb.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bio.h>
 #include <sys/bus.h>
 #include <sys/conf.h>
 #include <sys/devicestat.h>
 #include <sys/disk.h>
 #include <sys/efi.h>
 #include <sys/kernel.h>
 #include <sys/kthread.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/module.h>
 #include <sys/rwlock.h>
 #include <sys/sglist.h>
 #include <sys/uio.h>
 #include <sys/uuid.h>
 #include <geom/geom.h>
 #include <geom/geom_int.h>
 #include <machine/vmparam.h>
 #include <vm/vm.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pager.h>
 #include <contrib/dev/acpica/include/acpi.h>
 #include <contrib/dev/acpica/include/accommon.h>
 #include <contrib/dev/acpica/include/acuuid.h>
 #include <dev/acpica/acpivar.h>
 #include <dev/nvdimm/nvdimm_var.h>
 
 /*
  * Brace initializers for struct uuid, one per SPA range type.  They are
  * consumed by the .u_id members of nvdimm_SPA_uuid_list[] below.
  */
 #define UUID_INITIALIZER_VOLATILE_MEMORY \
     {0x7305944f,0xfdda,0x44e3,0xb1,0x6c,{0x3f,0x22,0xd2,0x52,0xe5,0xd0}}
 #define UUID_INITIALIZER_PERSISTENT_MEMORY \
     {0x66f0d379,0xb4f3,0x4074,0xac,0x43,{0x0d,0x33,0x18,0xb7,0x8c,0xdb}}
 #define UUID_INITIALIZER_CONTROL_REGION \
     {0x92f701f6,0x13b4,0x405d,0x91,0x0b,{0x29,0x93,0x67,0xe8,0x23,0x4c}}
 #define UUID_INITIALIZER_DATA_REGION \
     {0x91af0530,0x5d86,0x470e,0xa6,0xb0,{0x0a,0x2d,0xb9,0x40,0x82,0x49}}
 #define UUID_INITIALIZER_VOLATILE_VIRTUAL_DISK \
     {0x77ab535a,0x45fc,0x624b,0x55,0x60,{0xf7,0xb2,0x81,0xd1,0xf9,0x6e}}
 #define UUID_INITIALIZER_VOLATILE_VIRTUAL_CD \
     {0x3d5abd30,0x4175,0x87ce,0x6d,0x64,{0xd2,0xad,0xe5,0x23,0xc4,0xbb}}
 #define UUID_INITIALIZER_PERSISTENT_VIRTUAL_DISK \
     {0x5cea02c9,0x4d07,0x69d3,0x26,0x9f,{0x44,0x96,0xfb,0xe0,0x96,0xf9}}
 #define UUID_INITIALIZER_PERSISTENT_VIRTUAL_CD \
     {0x08018188,0x42cd,0xbb48,0x10,0x0f,{0x53,0x87,0xd5,0x3d,0xed,0x3d}}
 
 /*
  * Table of known SPA range types, indexed by enum SPA_mapping_type.
  *
  *   u_name    - short name; matched exactly (including any trailing
  *               spaces) by nvdimm_spa_type_from_name() and printed in
  *               the boot-verbose message of nvdimm_spa_init()
  *   u_id      - UUID matched by nvdimm_spa_type_from_uuid()
  *   u_usr_acc - when false, nvdimm_spa_init() does not create any
  *               device for a range of this type
  */
 static struct nvdimm_SPA_uuid_list_elm {
 	const char		*u_name;
 	struct uuid		u_id;
 	const bool		u_usr_acc;
 } nvdimm_SPA_uuid_list[] = {
 	[SPA_TYPE_VOLATILE_MEMORY] = {
 		.u_name =	"VOLA MEM ",
 		.u_id =		UUID_INITIALIZER_VOLATILE_MEMORY,
 		.u_usr_acc =	true,
 	},
 	[SPA_TYPE_PERSISTENT_MEMORY] = {
 		.u_name =	"PERS MEM",
 		.u_id =		UUID_INITIALIZER_PERSISTENT_MEMORY,
 		.u_usr_acc =	true,
 	},
 	[SPA_TYPE_CONTROL_REGION] = {
 		.u_name =	"CTRL RG ",
 		.u_id =		UUID_INITIALIZER_CONTROL_REGION,
 		.u_usr_acc =	false,
 	},
 	[SPA_TYPE_DATA_REGION] = {
 		.u_name =	"DATA RG ",
 		.u_id =		UUID_INITIALIZER_DATA_REGION,
 		.u_usr_acc =	true,
 	},
 	[SPA_TYPE_VOLATILE_VIRTUAL_DISK] = {
 		.u_name =	"VIRT DSK",
 		.u_id =		UUID_INITIALIZER_VOLATILE_VIRTUAL_DISK,
 		.u_usr_acc =	true,
 	},
 	[SPA_TYPE_VOLATILE_VIRTUAL_CD] = {
 		.u_name =	"VIRT CD ",
 		.u_id =		UUID_INITIALIZER_VOLATILE_VIRTUAL_CD,
 		.u_usr_acc =	true,
 	},
 	[SPA_TYPE_PERSISTENT_VIRTUAL_DISK] = {
 		.u_name =	"PV DSK  ",
 		.u_id =		UUID_INITIALIZER_PERSISTENT_VIRTUAL_DISK,
 		.u_usr_acc =	true,
 	},
 	[SPA_TYPE_PERSISTENT_VIRTUAL_CD] = {
 		.u_name =	"PV CD   ",
 		.u_id =		UUID_INITIALIZER_PERSISTENT_VIRTUAL_CD,
 		.u_usr_acc =	true,
 	},
 };
 
 /*
  * Look up an SPA range type by its short name.  The comparison is an exact
  * strcmp() against the u_name entries of nvdimm_SPA_uuid_list[].  Returns
  * the matching table index, or SPA_TYPE_UNKNOWN when no entry matches.
  */
 enum SPA_mapping_type
 nvdimm_spa_type_from_name(const char *name)
 {
 	int j;
 
 	for (j = 0; j < nitems(nvdimm_SPA_uuid_list); j++) {
 		if (strcmp(name, nvdimm_SPA_uuid_list[j].u_name) != 0)
 			continue;
 		return (j);
 	}
 	return (SPA_TYPE_UNKNOWN);
 }
 
 /*
  * Look up an SPA range type by UUID.  Returns the index of the first
  * nvdimm_SPA_uuid_list[] entry for which uuidcmp() reports equality, or
  * SPA_TYPE_UNKNOWN when no entry matches.
  */
 enum SPA_mapping_type
 nvdimm_spa_type_from_uuid(struct uuid *uuid)
 {
 	int j;
 
 	for (j = 0; j < nitems(nvdimm_SPA_uuid_list); j++) {
 		if (uuidcmp(uuid, &nvdimm_SPA_uuid_list[j].u_id) != 0)
 			continue;
 		return (j);
 	}
 	return (SPA_TYPE_UNKNOWN);
 }
 
 /*
  * Report whether ranges of the given SPA type are flagged as user
  * accessible in nvdimm_SPA_uuid_list[].  Values outside the table
  * (negative, or SPA_TYPE_UNKNOWN and above) yield false.
  */
 bool
 nvdimm_spa_type_user_accessible(enum SPA_mapping_type spa_type)
 {
 
 	if ((int)spa_type < 0 || spa_type >= nitems(nvdimm_SPA_uuid_list))
 		return (false);
 	return (nvdimm_SPA_uuid_list[spa_type].u_usr_acc);
 }
 
 /*
  * Translate EFI memory attribute flags into a VM memory attribute.
  *
  * The flags are tested in the fixed order WB, WT, WC, WP, UC and the first
  * one present wins.  If none of them is set, the mapping falls back to
  * uncacheable and, when booting verbosely, a message is printed.
  */
 static vm_memattr_t
 nvdimm_spa_memattr(uint64_t efi_mem_flags)
 {
 	vm_memattr_t mode;
 
 	if ((efi_mem_flags & EFI_MD_ATTR_WB) != 0)
 		mode = VM_MEMATTR_WRITE_BACK;
 	else if ((efi_mem_flags & EFI_MD_ATTR_WT) != 0)
 		mode = VM_MEMATTR_WRITE_THROUGH;
 	else if ((efi_mem_flags & EFI_MD_ATTR_WC) != 0)
 		mode = VM_MEMATTR_WRITE_COMBINING;
 	else if ((efi_mem_flags & EFI_MD_ATTR_WP) != 0)
 		mode = VM_MEMATTR_WRITE_PROTECTED;
 	else if ((efi_mem_flags & EFI_MD_ATTR_UC) != 0)
 		mode = VM_MEMATTR_UNCACHEABLE;
 	else {
 		if (bootverbose)
 			printf("SPA mapping attr %#lx unsupported\n",
 			    efi_mem_flags);
 		mode = VM_MEMATTR_UNCACHEABLE;
 	}
 	return (mode);
 }
 
 /*
  * Move data between a uio and the SPA range of dev, starting at
  * uio->uio_offset within the range.  The direction is taken from the uio.
  *
  * When the range is mapped into KVA (spa_kva != NULL) the data is moved
  * with uiomove() directly.  Otherwise a single fake vm_page is retargeted
  * at each physical page in turn and the data is moved with
  * uiomove_fromphys(), at most one page per iteration.
  *
  * In both paths the transfer stops at the end of the range (spa_len); a
  * request that starts at or beyond the end transfers nothing.  Returns 0
  * or the error reported by uiomove()/uiomove_fromphys(); on return
  * uio_resid reflects what was not transferred.
  */
 static int
 nvdimm_spa_uio(struct nvdimm_spa_dev *dev, struct uio *uio)
 {
 	struct vm_page m, *ma;
 	off_t off;
 	vm_memattr_t mattr;
 	int error, n;
 
 	error = 0;
 	if (dev->spa_kva == NULL) {
 		mattr = dev->spa_memattr;
 		bzero(&m, sizeof(m));
 		vm_page_initfake(&m, 0, mattr);
 		ma = &m;
 		while (uio->uio_resid > 0) {
 			if (uio->uio_offset >= dev->spa_len)
 				break;
 			off = dev->spa_phys_base + uio->uio_offset;
 			vm_page_updatefake(&m, trunc_page(off), mattr);
 			/*
 			 * Only one fake page is available, so never let the
 			 * chunk run past the end of the current page:
 			 * uiomove_fromphys() would otherwise index ma[1].
 			 */
 			n = PAGE_SIZE - (off & PAGE_MASK);
 			if (n > uio->uio_resid)
 				n = uio->uio_resid;
 			/* Do not touch memory beyond the end of the range. */
 			if (uio->uio_offset + n > dev->spa_len)
 				n = dev->spa_len - uio->uio_offset;
 			error = uiomove_fromphys(&ma, off & PAGE_MASK, n, uio);
 			if (error != 0)
 				break;
 		}
 	} else {
 		while (uio->uio_resid > 0) {
 			if (uio->uio_offset >= dev->spa_len)
 				break;
 			/* uiomove() takes an int count; cap each chunk. */
 			n = INT_MAX;
 			if (n > uio->uio_resid)
 				n = uio->uio_resid;
 			if (uio->uio_offset + n > dev->spa_len)
 				n = dev->spa_len - uio->uio_offset;
 			error = uiomove((char *)dev->spa_kva + uio->uio_offset,
 			    n, uio);
 			if (error != 0)
 				break;
 		}
 	}
 	return (error);
 }
 
 /*
  * Character-device read and write entry point (spa_cdevsw uses it for
  * both).  Forwards the uio to nvdimm_spa_uio() for the device stored in
  * si_drv1; ioflag is ignored.
  */
 static int
 nvdimm_spa_rw(struct cdev *dev, struct uio *uio, int ioflag)
 {
 
 	return (nvdimm_spa_uio(dev->si_drv1, uio));
 }
 
 /*
  * Character-device ioctl entry point.  Supports DIOCGSECTORSIZE (always
  * DEV_BSIZE) and DIOCGMEDIASIZE (the length of the SPA range); every other
  * command returns ENOTTY.
  */
 static int
 nvdimm_spa_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
     struct thread *td)
 {
 	struct nvdimm_spa_dev *dev;
 	int error;
 
 	dev = cdev->si_drv1;
 	error = 0;
 	switch (cmd) {
 	case DIOCGSECTORSIZE:
 		*(u_int *)data = DEV_BSIZE;
 		break;
 	case DIOCGMEDIASIZE:
 		*(off_t *)data = dev->spa_len;
 		break;
 	default:
 		error = ENOTTY;
 		break;
 	}
 	return (error);
 }
 
 /*
  * Character-device mmap_single entry point.
  *
  * Returns ENXIO when the device has no backing VM object, EINVAL when the
  * requested [*offset, *offset + size) window starts beyond the range,
  * wraps around, or extends past spa_len.  On success a new reference on
  * the device's VM object is handed back through objp; *offset is left
  * unchanged and nprot is not examined.
  */
 static int
 nvdimm_spa_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t size,
     vm_object_t *objp, int nprot)
 {
 	struct nvdimm_spa_dev *dev;
 
 	dev = cdev->si_drv1;
 	if (dev->spa_obj == NULL)
 		return (ENXIO);
 	if (*offset >= dev->spa_len || *offset + size < *offset ||
 	    *offset + size > dev->spa_len)
 		return (EINVAL);
 	vm_object_reference(dev->spa_obj);
 	*objp = dev->spa_obj;
 	return (0);
 }
 
 /*
  * Character device switch for the per-range device nodes created in
  * nvdimm_spa_dev_init().  Reads and writes share one handler.
  */
 static struct cdevsw spa_cdevsw = {
 	.d_version =	D_VERSION,
 	.d_flags =	D_DISK,
 	.d_name =	"nvdimm_spa",
 	.d_read =	nvdimm_spa_rw,
 	.d_write =	nvdimm_spa_rw,
 	.d_ioctl =	nvdimm_spa_ioctl,
 	.d_mmap_single = nvdimm_spa_mmap_single,
 };
 
 /*
  * Service an unmapped bio against a range that has no KVA mapping.
  *
  * A fake vm_page is set up for each of bp->bio_ma_n consecutive physical
  * pages, starting at the page that contains bio_offset within the range,
  * and the data is copied page-to-page with pmap_copy_pages().  For
  * BIO_READ the range is the source and the bio's pages the destination;
  * for any other rw value the direction is reversed.
  *
  * NOTE(review): both arrays are variable-length and live on the kernel
  * stack, sized by bio_ma_n; confirm the upper bound on bio_ma_n.
  */
 static void
 nvdimm_spa_g_all_unmapped(struct nvdimm_spa_dev *dev, struct bio *bp, int rw)
 {
 	struct vm_page maa[bp->bio_ma_n];
 	vm_page_t ma[bp->bio_ma_n];
 	vm_memattr_t mattr;
 	int i;
 
 	mattr = dev->spa_memattr;
 	for (i = 0; i < nitems(ma); i++) {
 		bzero(&maa[i], sizeof(maa[i]));
 		vm_page_initfake(&maa[i], dev->spa_phys_base +
 		    trunc_page(bp->bio_offset) + PAGE_SIZE * i, mattr);
 		ma[i] = &maa[i];
 	}
 	if (rw == BIO_READ)
 		pmap_copy_pages(ma, bp->bio_offset & PAGE_MASK, bp->bio_ma,
 		    bp->bio_ma_offset, bp->bio_length);
 	else
 		pmap_copy_pages(bp->bio_ma, bp->bio_ma_offset, ma,
 		    bp->bio_offset & PAGE_MASK, bp->bio_length);
 }
 
 /*
  * Worker kernel process for one SPA geom (arg is its struct g_spa).
  *
  * The loop dequeues bios from spa_g_queue, sleeping on the queue while it
  * is empty, and services them:
  *   - commands other than BIO_READ/BIO_WRITE/BIO_FLUSH fail with
  *     EOPNOTSUPP;
  *   - BIO_FLUSH writes back the caches for the whole range;
  *   - unmapped reads/writes are copied either through the KVA mapping
  *     with uiomove_fromphys() or via nvdimm_spa_g_all_unmapped();
  *   - mapped reads/writes go through nvdimm_spa_uio().
  * Every bio is completed with g_io_deliver().  Device statistics are
  * closed only for reads and writes, matching nvdimm_spa_g_start().
  *
  * When woken with spa_g_proc_run cleared, the process sets
  * spa_g_proc_exiting, wakes the waiter and exits.
  */
 static void
 nvdimm_spa_g_thread(void *arg)
 {
 	struct g_spa *sc;
 	struct bio *bp;
 	struct uio auio;
 	struct iovec aiovec;
 	int error;
 
 	sc = arg;
 	for (;;) {
 		mtx_lock(&sc->spa_g_mtx);
 		for (;;) {
 			bp = bioq_takefirst(&sc->spa_g_queue);
 			if (bp != NULL)
 				break;
 			msleep(&sc->spa_g_queue, &sc->spa_g_mtx, PRIBIO,
 			    "spa_g", 0);
 			/* The stop request is only examined after a wakeup. */
 			if (!sc->spa_g_proc_run) {
 				sc->spa_g_proc_exiting = true;
 				wakeup(&sc->spa_g_queue);
 				mtx_unlock(&sc->spa_g_mtx);
 				kproc_exit(0);
 			}
 			continue;
 		}
 		mtx_unlock(&sc->spa_g_mtx);
 		if (bp->bio_cmd != BIO_READ && bp->bio_cmd != BIO_WRITE &&
 		    bp->bio_cmd != BIO_FLUSH) {
 			error = EOPNOTSUPP;
 			goto completed;
 		}
 
 		error = 0;
 		if (bp->bio_cmd == BIO_FLUSH) {
 			if (sc->dev->spa_kva != NULL) {
 				pmap_large_map_wb(sc->dev->spa_kva,
 				    sc->dev->spa_len);
 			} else {
 				pmap_flush_cache_phys_range(
 				    (vm_paddr_t)sc->dev->spa_phys_base,
 				    (vm_paddr_t)sc->dev->spa_phys_base +
 				    sc->dev->spa_len, sc->dev->spa_memattr);
 			}
 			/*
 			 * XXX flush IMC
 			 */
 			goto completed;
 		}
 		
 		if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
 			if (sc->dev->spa_kva != NULL) {
 				/*
 				 * The uio describes the SPA side (in KVA) and
 				 * the bio's pages are the "physical" side, so
 				 * the uio direction is the inverse of the bio
 				 * command: a BIO_READ writes from the uio
 				 * buffer into the bio's pages.
 				 */
 				aiovec.iov_base = (char *)sc->dev->spa_kva +
 				    bp->bio_offset;
 				aiovec.iov_len = bp->bio_length;
 				auio.uio_iov = &aiovec;
 				auio.uio_iovcnt = 1;
 				auio.uio_resid = bp->bio_length;
 				auio.uio_offset = bp->bio_offset;
 				auio.uio_segflg = UIO_SYSSPACE;
 				auio.uio_rw = bp->bio_cmd == BIO_READ ?
 				    UIO_WRITE : UIO_READ;
 				auio.uio_td = curthread;
 				error = uiomove_fromphys(bp->bio_ma,
 				    bp->bio_ma_offset, bp->bio_length, &auio);
 				bp->bio_resid = auio.uio_resid;
 			} else {
 				nvdimm_spa_g_all_unmapped(sc->dev, bp,
 				    bp->bio_cmd);
 				/*
 				 * NOTE(review): bio_resid is set to the full
 				 * length even though the copy was performed;
 				 * confirm whether 0 was intended.
 				 */
 				bp->bio_resid = bp->bio_length;
 				error = 0;
 			}
 		} else {
 			aiovec.iov_base = bp->bio_data;
 			aiovec.iov_len = bp->bio_length;
 			auio.uio_iov = &aiovec;
 			auio.uio_iovcnt = 1;
 			auio.uio_resid = bp->bio_length;
 			auio.uio_offset = bp->bio_offset;
 			auio.uio_segflg = UIO_SYSSPACE;
 			auio.uio_rw = bp->bio_cmd == BIO_READ ? UIO_READ :
 			    UIO_WRITE;
 			auio.uio_td = curthread;
 			error = nvdimm_spa_uio(sc->dev, &auio);
 			bp->bio_resid = auio.uio_resid;
 		}
 		bp->bio_bcount = bp->bio_length;
 		devstat_end_transaction_bio(sc->spa_g_devstat, bp);
 completed:
 		bp->bio_completed = bp->bio_length;
 		g_io_deliver(bp, error);
 	}
 }
 
 /*
  * GEOM start routine for the SPA class.  Opens a devstat transaction for
  * reads and writes, then queues the bio (disksorted) for the worker
  * process and wakes it.  All actual I/O happens in nvdimm_spa_g_thread().
  */
 static void
 nvdimm_spa_g_start(struct bio *bp)
 {
 	struct g_spa *sc;
 
 	sc = bp->bio_to->geom->softc;
 	if (bp->bio_cmd == BIO_READ || bp->bio_cmd == BIO_WRITE) {
-		mtx_lock(&sc->spa_g_stat_mtx);
 		devstat_start_transaction_bio(sc->spa_g_devstat, bp);
-		mtx_unlock(&sc->spa_g_stat_mtx);
 	}
 	mtx_lock(&sc->spa_g_mtx);
 	bioq_disksort(&sc->spa_g_queue, bp);
 	wakeup(&sc->spa_g_queue);
 	mtx_unlock(&sc->spa_g_mtx);
 }
 
 /*
  * GEOM access routine for the SPA class.  Every access change is accepted:
  * the requested r/w/e deltas are ignored and 0 is returned.
  */
 static int
 nvdimm_spa_g_access(struct g_provider *pp, int r, int w, int e)
 {
 
 	return (0);
 }
 
 /* Forward declarations for routines defined later in this file. */
 static struct g_geom * nvdimm_spa_g_create(struct nvdimm_spa_dev *dev,
     const char *name);
 static g_ctl_destroy_geom_t nvdimm_spa_g_destroy_geom;
 
 /* GEOM class "SPA": one geom/provider is created per SPA device. */
 struct g_class nvdimm_spa_g_class = {
 	.name =		"SPA",
 	.version =	G_VERSION,
 	.start =	nvdimm_spa_g_start,
 	.access =	nvdimm_spa_g_access,
 	.destroy_geom =	nvdimm_spa_g_destroy_geom,
 };
 DECLARE_GEOM_CLASS(nvdimm_spa_g_class, g_spa);
 
 /*
  * Initialize an SPA mapping from its NFIT system address entry.
  *
  * Records the range index, base, length and EFI memory flags, takes the
  * proximity domain from the entry when its valid flag is set (-1
  * otherwise) and derives the VM memory attribute.  For types that are not
  * user accessible nothing else is done and 0 is returned; otherwise the
  * devices for the range are created under the name "spa<index>" and the
  * result of nvdimm_spa_dev_init() is returned.
  *
  * NOTE(review): spa_type indexes nvdimm_SPA_uuid_list[] without a bounds
  * check, so callers must not pass SPA_TYPE_UNKNOWN -- confirm at the call
  * site.
  */
 int
 nvdimm_spa_init(struct SPA_mapping *spa, ACPI_NFIT_SYSTEM_ADDRESS *nfitaddr,
     enum SPA_mapping_type spa_type)
 {
 	char *name;
 	int error;
 
 	spa->spa_type = spa_type;
 	spa->spa_nfit_idx = nfitaddr->RangeIndex;
 	spa->dev.spa_domain =
 	    ((nfitaddr->Flags & ACPI_NFIT_PROXIMITY_VALID) != 0) ?
 	    nfitaddr->ProximityDomain : -1;
 	spa->dev.spa_phys_base = nfitaddr->Address;
 	spa->dev.spa_len = nfitaddr->Length;
 	spa->dev.spa_efi_mem_flags = nfitaddr->MemoryMapping;
 	if (bootverbose) {
 		printf("NVDIMM SPA%d base %#016jx len %#016jx %s fl %#jx\n",
 		    spa->spa_nfit_idx,
 		    (uintmax_t)spa->dev.spa_phys_base,
 		    (uintmax_t)spa->dev.spa_len,
 		    nvdimm_SPA_uuid_list[spa_type].u_name,
 		    spa->dev.spa_efi_mem_flags);
 	}
 	spa->dev.spa_memattr = nvdimm_spa_memattr(nfitaddr->MemoryMapping);
 	if (!nvdimm_SPA_uuid_list[spa_type].u_usr_acc)
 		return (0);
 
 	asprintf(&name, M_NVDIMM, "spa%d", spa->spa_nfit_idx);
 	error = nvdimm_spa_dev_init(&spa->dev, name, spa->spa_nfit_idx);
 	free(name, M_NVDIMM);
 	return (error);
 }
 
 int
 nvdimm_spa_dev_init(struct nvdimm_spa_dev *dev, const char *name, int unit)
 {
 	struct make_dev_args mda;
 	struct sglist *spa_sg;
 	char *devname;
 	int error, error1;
 
 	error1 = pmap_large_map(dev->spa_phys_base, dev->spa_len,
 	    &dev->spa_kva, dev->spa_memattr);
 	if (error1 != 0) {
 		printf("NVDIMM %s cannot map into KVA, error %d\n", name,
 		    error1);
 		dev->spa_kva = NULL;
 	}
 
 	spa_sg = sglist_alloc(1, M_WAITOK);
 	error = sglist_append_phys(spa_sg, dev->spa_phys_base,
 	    dev->spa_len);
 	if (error == 0) {
 		dev->spa_obj = vm_pager_allocate(OBJT_SG, spa_sg, dev->spa_len,
 		    VM_PROT_ALL, 0, NULL);
 		if (dev->spa_obj == NULL) {
 			printf("NVDIMM %s failed to alloc vm object", name);
 			sglist_free(spa_sg);
 		}
 	} else {
 		printf("NVDIMM %s failed to init sglist, error %d", name,
 		    error);
 		sglist_free(spa_sg);
 	}
 
 	make_dev_args_init(&mda);
 	mda.mda_flags = MAKEDEV_WAITOK | MAKEDEV_CHECKNAME;
 	mda.mda_devsw = &spa_cdevsw;
 	mda.mda_cr = NULL;
 	mda.mda_uid = UID_ROOT;
 	mda.mda_gid = GID_OPERATOR;
 	mda.mda_mode = 0660;
 	mda.mda_si_drv1 = dev;
 	mda.mda_unit = unit;
 	asprintf(&devname, M_NVDIMM, "nvdimm_%s", name);
 	error = make_dev_s(&mda, &dev->spa_dev, "%s", devname);
 	free(devname, M_NVDIMM);
 	if (error != 0) {
 		printf("NVDIMM %s cannot create devfs node, error %d\n", name,
 		    error);
 		if (error1 == 0)
 			error1 = error;
 	}
 	dev->spa_g = nvdimm_spa_g_create(dev, name);
 	if (dev->spa_g == NULL && error1 == 0)
 		error1 = ENXIO;
 	return (error1);
 }
 
 /*
  * Create the GEOM side of an SPA device: the softc, its bio queue and
  * mutex, the worker process, and then (under the topology lock) the geom,
  * its provider and a devstat entry.  The provider is sized to the range
  * with DEV_BSIZE sectors and accepts unmapped bios.
  *
  * Returns the new geom, or NULL if the worker process could not be
  * created (in which case the softc is released and a message printed).
  */
 static struct g_geom *
 nvdimm_spa_g_create(struct nvdimm_spa_dev *dev, const char *name)
 {
 	struct g_geom *gp;
 	struct g_spa *sc;
 	int error;
 
 	gp = NULL;
 	sc = malloc(sizeof(struct g_spa), M_NVDIMM, M_WAITOK | M_ZERO);
 	sc->dev = dev;
 	bioq_init(&sc->spa_g_queue);
 	mtx_init(&sc->spa_g_mtx, "spag", NULL, MTX_DEF);
-	mtx_init(&sc->spa_g_stat_mtx, "spagst", NULL, MTX_DEF);
 	sc->spa_g_proc_run = true;
 	sc->spa_g_proc_exiting = false;
 	error = kproc_create(nvdimm_spa_g_thread, sc, &sc->spa_g_proc, 0, 0,
 	    "g_spa");
 	if (error != 0) {
 		mtx_destroy(&sc->spa_g_mtx);
-		mtx_destroy(&sc->spa_g_stat_mtx);
 		free(sc, M_NVDIMM);
 		printf("NVDIMM %s cannot create geom worker, error %d\n", name,
 		    error);
 	} else {
 		g_topology_lock();
 		gp = g_new_geomf(&nvdimm_spa_g_class, "%s", name);
 		gp->softc = sc;
 		sc->spa_p = g_new_providerf(gp, "%s", name);
 		sc->spa_p->mediasize = dev->spa_len;
 		sc->spa_p->sectorsize = DEV_BSIZE;
 		sc->spa_p->flags |= G_PF_DIRECT_SEND | G_PF_DIRECT_RECEIVE |
 		    G_PF_ACCEPT_UNMAPPED;
 		g_error_provider(sc->spa_p, 0);
 		sc->spa_g_devstat = devstat_new_entry("spa", -1, DEV_BSIZE,
 		    DEVSTAT_ALL_SUPPORTED, DEVSTAT_TYPE_DIRECT,
 		    DEVSTAT_PRIORITY_MAX);
 		g_topology_unlock();
 	}
 	return (gp);
 }
 
 /*
  * Tear down the devices of an SPA mapping; simply forwards to
  * nvdimm_spa_dev_fini() for the mapping's embedded device.
  */
 void
 nvdimm_spa_fini(struct SPA_mapping *spa)
 {
 
 	nvdimm_spa_dev_fini(&spa->dev);
 }
 
 /*
  * Undo nvdimm_spa_dev_init(): destroy the geom (under the topology lock),
  * the character device, drop the reference on the VM object and remove the
  * KVA mapping.  Steps whose object was never created are skipped, except
  * that vm_object_deallocate() is called unconditionally.
  *
  * NOTE(review): spa_dev and spa_kva are reset to NULL afterwards but spa_g
  * and spa_obj are not; confirm this function is never called twice for
  * the same device.
  */
 void
 nvdimm_spa_dev_fini(struct nvdimm_spa_dev *dev)
 {
 
 	if (dev->spa_g != NULL) {
 		g_topology_lock();
 		nvdimm_spa_g_destroy_geom(NULL, dev->spa_g->class, dev->spa_g);
 		g_topology_unlock();
 	}
 	if (dev->spa_dev != NULL) {
 		destroy_dev(dev->spa_dev);
 		dev->spa_dev = NULL;
 	}
 	vm_object_deallocate(dev->spa_obj);
 	if (dev->spa_kva != NULL) {
 		pmap_large_unmap(dev->spa_kva, dev->spa_len);
 		dev->spa_kva = NULL;
 	}
 }
 
 /*
  * Destroy an SPA geom.  The topology lock must be held (asserted below).
  *
  * The worker process is asked to stop and this function sleeps until the
  * worker has flagged that it is exiting.  The geom is then withered with
  * ENXIO, the devstat entry removed, and the softc's mutex and memory
  * released.  req and cp are unused.  Always returns 0.
  */
 static int
 nvdimm_spa_g_destroy_geom(struct gctl_req *req, struct g_class *cp,
     struct g_geom *gp)
 {
 	struct g_spa *sc;
 
 	sc = gp->softc;
 	/* Handshake with nvdimm_spa_g_thread() over spa_g_queue. */
 	mtx_lock(&sc->spa_g_mtx);
 	sc->spa_g_proc_run = false;
 	wakeup(&sc->spa_g_queue);
 	while (!sc->spa_g_proc_exiting)
 		msleep(&sc->spa_g_queue, &sc->spa_g_mtx, PRIBIO, "spa_e", 0);
 	mtx_unlock(&sc->spa_g_mtx);
 	g_topology_assert();
 	g_wither_geom(gp, ENXIO);
 	sc->spa_p = NULL;
 	if (sc->spa_g_devstat != NULL) {
 		devstat_remove_entry(sc->spa_g_devstat);
 		sc->spa_g_devstat = NULL;
 	}
 	mtx_destroy(&sc->spa_g_mtx);
-	mtx_destroy(&sc->spa_g_stat_mtx);
 	free(sc, M_NVDIMM);
 	return (0);
 }
Index: head/sys/dev/nvdimm/nvdimm_var.h
===================================================================
--- head/sys/dev/nvdimm/nvdimm_var.h	(revision 356199)
+++ head/sys/dev/nvdimm/nvdimm_var.h	(revision 356200)
@@ -1,179 +1,178 @@
 /*-
  * Copyright (c) 2017 The FreeBSD Foundation
  * All rights reserved.
  * Copyright (c) 2018, 2019 Intel Corporation
  *
  * This software was developed by Konstantin Belousov <kib@FreeBSD.org>
  * under sponsorship from the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #ifndef __DEV_NVDIMM_VAR_H__
 #define	__DEV_NVDIMM_VAR_H__
 
 /*
  * Signature string for a label index block; with its terminating NUL it is
  * 16 bytes, the size of nvdimm_label_index.signature.
  */
 #define NVDIMM_INDEX_BLOCK_SIGNATURE "NAMESPACE_INDEX"
 
 /*
  * Header of a label index block.  The fixed part is 72 bytes (enforced by
  * a _Static_assert in this header) and is followed by the variable-length
  * free[] array.
  * NOTE(review): the field layout presumably mirrors the on-media namespace
  * index format -- confirm against the NVDIMM label specification.
  */
 struct nvdimm_label_index {
 	char		signature[16];
 	uint8_t		flags[3];
 	uint8_t		label_size;
 	uint32_t	seq;
 	uint64_t	this_offset;
 	uint64_t	this_size;
 	uint64_t	other_offset;
 	uint64_t	label_offset;
 	uint32_t	slot_cnt;
 	uint16_t	rev_major;
 	uint16_t	rev_minor;
 	uint64_t	checksum;
 	uint8_t		free[0];
 };
 
 /*
  * A single namespace label.  The structure is exactly 256 bytes (enforced
  * by a _Static_assert in this header).
  * NOTE(review): the field layout presumably mirrors the on-media label
  * format -- confirm against the NVDIMM label specification.
  */
 struct nvdimm_label {
 	struct uuid	uuid;
 	char		name[64];
 	uint32_t	flags;
 	uint16_t	nlabel;
 	uint16_t	position;
 	uint64_t	set_cookie;
 	uint64_t	lba_size;
 	uint64_t	dimm_phys_addr;
 	uint64_t	raw_size;
 	uint32_t	slot;
 	uint8_t		alignment;
 	uint8_t		reserved[3];
 	struct uuid	type_guid;
 	struct uuid	address_abstraction_guid;
 	uint8_t		reserved1[88];
 	uint64_t	checksum;
 };
 
 /* Singly-linked list node holding one label by value. */
 struct nvdimm_label_entry {
 	SLIST_ENTRY(nvdimm_label_entry) link;
 	struct nvdimm_label	label;
 };
 
 /* Compile-time checks that the label structures keep their exact sizes. */
 _Static_assert(sizeof(struct nvdimm_label_index) == 72, "Incorrect layout");
 _Static_assert(sizeof(struct nvdimm_label) == 256, "Incorrect layout");
 
 /* 32-bit handle identifying an NVDIMM device (see nvdimm_find_by_handle). */
 typedef uint32_t nfit_handle_t;
 
 /*
  * Instance variable indices for the nvdimm_root bus, followed by the
  * accessor functions generated for them by __BUS_ACCESSOR: the ACPI handle
  * (ACPI_HANDLE) and the device handle (nfit_handle_t).
  */
 enum nvdimm_acpi_ivar {
 	NVDIMM_ROOT_IVAR_ACPI_HANDLE,
 	NVDIMM_ROOT_IVAR_DEVICE_HANDLE,
 	NVDIMM_ROOT_IVAR_MAX,
 };
 __BUS_ACCESSOR(nvdimm_root, acpi_handle, NVDIMM_ROOT, ACPI_HANDLE, ACPI_HANDLE)
 __BUS_ACCESSOR(nvdimm_root, device_handle, NVDIMM_ROOT, DEVICE_HANDLE,
     nfit_handle_t)
 
 /*
  * Per-NVDIMM state: the device and its NFIT handle, an array of
  * nv_flush_addr_cnt flush addresses, label area parameters, the label
  * index block and the list of labels.
  * NOTE(review): field meanings are inferred from their names; the code
  * that fills this structure is outside this chunk.
  */
 struct nvdimm_dev {
 	device_t	nv_dev;
 	nfit_handle_t	nv_handle;
 	uint64_t	**nv_flush_addr;
 	int		nv_flush_addr_cnt;
 	uint32_t	label_area_size;
 	uint32_t	max_label_xfer;
 	struct nvdimm_label_index *label_index;
 	SLIST_HEAD(, nvdimm_label_entry) labels;
 };
 
 /*
  * SPA range types.  The values 0..7 double as indices into
  * nvdimm_SPA_uuid_list[] in nvdimm_spa.c; SPA_TYPE_UNKNOWN lies outside
  * that table and is what the lookup functions return on no match.
  */
 enum SPA_mapping_type {
 	SPA_TYPE_VOLATILE_MEMORY	= 0,
 	SPA_TYPE_PERSISTENT_MEMORY	= 1,
 	SPA_TYPE_CONTROL_REGION		= 2,
 	SPA_TYPE_DATA_REGION		= 3,
 	SPA_TYPE_VOLATILE_VIRTUAL_DISK	= 4,
 	SPA_TYPE_VOLATILE_VIRTUAL_CD	= 5,
 	SPA_TYPE_PERSISTENT_VIRTUAL_DISK= 6,
 	SPA_TYPE_PERSISTENT_VIRTUAL_CD	= 7,
 	SPA_TYPE_UNKNOWN		= 127,
 };
 
 /*
  * One exported physical address range and the kernel objects built on it.
  *
  *   spa_domain        - proximity domain, or -1 when not known
  *   spa_memattr       - VM memory attribute derived from the EFI flags
  *   spa_phys_base     - physical base address of the range
  *   spa_len           - length of the range in bytes
  *   spa_efi_mem_flags - raw EFI memory attribute flags
  *   spa_kva           - KVA of the large mapping, NULL if not mapped
  *   spa_obj           - VM object handed out by mmap, NULL if unavailable
  *   spa_dev           - character device node
  *   spa_g             - geom exporting the range as a provider
  */
 struct nvdimm_spa_dev {
 	int			spa_domain;
 	vm_memattr_t		spa_memattr;
 	uint64_t		spa_phys_base;
 	uint64_t		spa_len;
 	uint64_t		spa_efi_mem_flags;
 	void			*spa_kva;
 	struct vm_object	*spa_obj;
 	struct cdev		*spa_dev;
 	struct g_geom		*spa_g;
 };
 
 /*
  * Softc of an SPA geom.  spa_g_mtx protects the bio queue and the two
  * worker-control flags: spa_g_proc_run is cleared to ask the worker
  * process to stop, and spa_g_proc_exiting is set by the worker just before
  * it exits.
  */
 struct g_spa {
 	struct nvdimm_spa_dev	*dev;
 	struct g_provider	*spa_p;
 	struct bio_queue_head	spa_g_queue;
 	struct mtx		spa_g_mtx;
-	struct mtx		spa_g_stat_mtx;
 	struct devstat		*spa_g_devstat;
 	struct proc		*spa_g_proc;
 	bool			spa_g_proc_run;
 	bool			spa_g_proc_exiting;
 };
 
 /*
  * A namespace within an SPA mapping: list linkage, a pointer to the
  * mapping it refers to, and its own exported device state.
  */
 struct nvdimm_namespace {
 	SLIST_ENTRY(nvdimm_namespace) link;
 	struct SPA_mapping	*spa;
 	struct nvdimm_spa_dev	dev;
 };
 
 /*
  * One SPA range from the NFIT: its type, NFIT range index, the exported
  * device state for the whole range and the list of namespaces attached to
  * it.
  */
 struct SPA_mapping {
 	SLIST_ENTRY(SPA_mapping) link;
 	enum SPA_mapping_type	spa_type;
 	int			spa_nfit_idx;
 	struct nvdimm_spa_dev	dev;
 	SLIST_HEAD(, nvdimm_namespace) namespaces;
 };
 
 /* Malloc type used for the nvdimm driver's allocations. */
 MALLOC_DECLARE(M_NVDIMM);
 
 /* NFIT table accessors (implemented outside nvdimm_spa.c). */
 void acpi_nfit_get_dimm_ids(ACPI_TABLE_NFIT *nfitbl, nfit_handle_t **listp,
     int *countp);
 void acpi_nfit_get_spa_range(ACPI_TABLE_NFIT *nfitbl, uint16_t range_index,
     ACPI_NFIT_SYSTEM_ADDRESS **spa);
 void acpi_nfit_get_spa_ranges(ACPI_TABLE_NFIT *nfitbl,
     ACPI_NFIT_SYSTEM_ADDRESS ***listp, int *countp);
 void acpi_nfit_get_region_mappings_by_spa_range(ACPI_TABLE_NFIT *nfitbl,
     uint16_t spa_range_index, ACPI_NFIT_MEMORY_MAP ***listp, int *countp);
 void acpi_nfit_get_control_region(ACPI_TABLE_NFIT *nfitbl,
     uint16_t control_region_index, ACPI_NFIT_CONTROL_REGION **out);
 void acpi_nfit_get_flush_addrs(ACPI_TABLE_NFIT *nfitbl, nfit_handle_t dimm,
     uint64_t ***listp, int *countp);
 /* SPA type lookups and SPA device lifecycle (nvdimm_spa.c). */
 enum SPA_mapping_type nvdimm_spa_type_from_name(const char *);
 enum SPA_mapping_type nvdimm_spa_type_from_uuid(struct uuid *);
 bool nvdimm_spa_type_user_accessible(enum SPA_mapping_type);
 struct nvdimm_dev *nvdimm_find_by_handle(nfit_handle_t nv_handle);
 int nvdimm_spa_init(struct SPA_mapping *spa, ACPI_NFIT_SYSTEM_ADDRESS *nfitaddr,
     enum SPA_mapping_type spa_type);
 void nvdimm_spa_fini(struct SPA_mapping *spa);
 int nvdimm_spa_dev_init(struct nvdimm_spa_dev *dev, const char *name, int unit);
 void nvdimm_spa_dev_fini(struct nvdimm_spa_dev *dev);
 int nvdimm_create_namespaces(struct SPA_mapping *spa, ACPI_TABLE_NFIT *nfitbl);
 void nvdimm_destroy_namespaces(struct SPA_mapping *spa);
 
 #endif		/* __DEV_NVDIMM_VAR_H__ */
Index: head/sys/geom/geom_disk.c
===================================================================
--- head/sys/geom/geom_disk.c	(revision 356199)
+++ head/sys/geom/geom_disk.c	(revision 356200)
@@ -1,1086 +1,1079 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 2002 Poul-Henning Kamp
  * Copyright (c) 2002 Networks Associates Technology, Inc.
  * All rights reserved.
  *
  * This software was developed for the FreeBSD Project by Poul-Henning Kamp
  * and NAI Labs, the Security Research Division of Network Associates, Inc.
  * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the
  * DARPA CHATS research program.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. The names of the authors may not be used to endorse or promote
  *    products derived from this software without specific prior written
  *    permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_geom.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/sysctl.h>
 #include <sys/bio.h>
 #include <sys/bus.h>
 #include <sys/ctype.h>
 #include <sys/fcntl.h>
 #include <sys/malloc.h>
 #include <sys/sbuf.h>
 #include <sys/devicestat.h>
 #include <machine/md_var.h>
 
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <geom/geom.h>
 #include <geom/geom_disk.h>
 #include <geom/geom_int.h>
 
 #include <dev/led/led.h>
 
 #include <machine/bus.h>
 
 struct g_disk_softc {
-	struct mtx		 done_mtx;
 	struct disk		*dp;
 	struct sysctl_ctx_list	sysctl_ctx;
 	struct sysctl_oid	*sysctl_tree;
 	char			led[64];
 	uint32_t		state;
-	struct mtx		 start_mtx;
+	struct mtx		 done_mtx;
 };
 
 static g_access_t g_disk_access;
 static g_start_t g_disk_start;
 static g_ioctl_t g_disk_ioctl;
 static g_dumpconf_t g_disk_dumpconf;
 static g_provgone_t g_disk_providergone;
 
 static int g_disk_sysctl_flags(SYSCTL_HANDLER_ARGS);
 
 static struct g_class g_disk_class = {
 	.name = G_DISK_CLASS_NAME,
 	.version = G_VERSION,
 	.start = g_disk_start,
 	.access = g_disk_access,
 	.ioctl = g_disk_ioctl,
 	.providergone = g_disk_providergone,
 	.dumpconf = g_disk_dumpconf,
 };
 
 SYSCTL_DECL(_kern_geom);
 static SYSCTL_NODE(_kern_geom, OID_AUTO, disk, CTLFLAG_RW, 0,
     "GEOM_DISK stuff");
 
 DECLARE_GEOM_CLASS(g_disk_class, g_disk);
 
 static int
 g_disk_access(struct g_provider *pp, int r, int w, int e)
 {
 	struct disk *dp;
 	struct g_disk_softc *sc;
 	int error;
 
 	g_trace(G_T_ACCESS, "g_disk_access(%s, %d, %d, %d)",
 	    pp->name, r, w, e);
 	g_topology_assert();
 	sc = pp->private;
 	if ((dp = sc->dp) == NULL || dp->d_destroyed) {
 		/*
 		 * Allow decreasing access count even if disk is not
 		 * available anymore.
 		 */
 		if (r <= 0 && w <= 0 && e <= 0)
 			return (0);
 		return (ENXIO);
 	}
 	r += pp->acr;
 	w += pp->acw;
 	e += pp->ace;
 	error = 0;
 	if ((pp->acr + pp->acw + pp->ace) == 0 && (r + w + e) > 0) {
 		/*
 		 * It would be better to defer this decision to d_open if
 		 * it was able to take flags.
 		 */
 		if (w > 0 && (dp->d_flags & DISKFLAG_WRITE_PROTECT) != 0)
 			error = EROFS;
 		if (error == 0 && dp->d_open != NULL)
 			error = dp->d_open(dp);
 		if (bootverbose && error != 0)
 			printf("Opened disk %s -> %d\n", pp->name, error);
 		if (error != 0)
 			return (error);
 		pp->sectorsize = dp->d_sectorsize;
 		if (dp->d_maxsize == 0) {
 			printf("WARNING: Disk drive %s%d has no d_maxsize\n",
 			    dp->d_name, dp->d_unit);
 			dp->d_maxsize = DFLTPHYS;
 		}
 		if (dp->d_delmaxsize == 0) {
 			if (bootverbose && dp->d_flags & DISKFLAG_CANDELETE) {
 				printf("WARNING: Disk drive %s%d has no "
 				    "d_delmaxsize\n", dp->d_name, dp->d_unit);
 			}
 			dp->d_delmaxsize = dp->d_maxsize;
 		}
 		pp->stripeoffset = dp->d_stripeoffset;
 		pp->stripesize = dp->d_stripesize;
 		dp->d_flags |= DISKFLAG_OPEN;
 		/*
 		 * Do not invoke resize event when initial size was zero.
 		 * Some disks report its size only after first opening.
 		 */
 		if (pp->mediasize == 0)
 			pp->mediasize = dp->d_mediasize;
 		else
 			g_resize_provider(pp, dp->d_mediasize);
 	} else if ((pp->acr + pp->acw + pp->ace) > 0 && (r + w + e) == 0) {
 		if (dp->d_close != NULL) {
 			error = dp->d_close(dp);
 			if (error != 0)
 				printf("Closed disk %s -> %d\n",
 				    pp->name, error);
 		}
 		sc->state = G_STATE_ACTIVE;
 		if (sc->led[0] != 0)
 			led_set(sc->led, "0");
 		dp->d_flags &= ~DISKFLAG_OPEN;
 	}
 	return (error);
 }
 
 static void
 g_disk_kerneldump(struct bio *bp, struct disk *dp)
 {
 	struct g_kerneldump *gkd;
 	struct g_geom *gp;
 
 	gkd = (struct g_kerneldump*)bp->bio_data;
 	gp = bp->bio_to->geom;
 	g_trace(G_T_TOPOLOGY, "g_disk_kerneldump(%s, %jd, %jd)",
 		gp->name, (intmax_t)gkd->offset, (intmax_t)gkd->length);
 	if (dp->d_dump == NULL) {
 		g_io_deliver(bp, ENODEV);
 		return;
 	}
 	gkd->di.dumper = dp->d_dump;
 	gkd->di.priv = dp;
 	gkd->di.blocksize = dp->d_sectorsize;
 	gkd->di.maxiosize = dp->d_maxsize;
 	gkd->di.mediaoffset = gkd->offset;
 	if ((gkd->offset + gkd->length) > dp->d_mediasize)
 		gkd->length = dp->d_mediasize - gkd->offset;
 	gkd->di.mediasize = gkd->length;
 	g_io_deliver(bp, 0);
 }
 
 static void
 g_disk_setstate(struct bio *bp, struct g_disk_softc *sc)
 {
 	const char *cmd;
 
 	memcpy(&sc->state, bp->bio_data, sizeof(sc->state));
 	if (sc->led[0] != 0) {
 		switch (sc->state) {
 		case G_STATE_FAILED:
 			cmd = "1";
 			break;
 		case G_STATE_REBUILD:
 			cmd = "f5";
 			break;
 		case G_STATE_RESYNC:
 			cmd = "f1";
 			break;
 		default:
 			cmd = "0";
 			break;
 		}
 		led_set(sc->led, cmd);
 	}
 	g_io_deliver(bp, 0);
 }
 
 static void
 g_disk_done(struct bio *bp)
 {
 	struct bintime now;
 	struct bio *bp2;
 	struct g_disk_softc *sc;
 
 	/* See "notes" for why we need a mutex here */
 	/* XXX: will witness accept a mix of Giant/unGiant drivers here ? */
 	bp2 = bp->bio_parent;
 	sc = bp2->bio_to->private;
 	bp->bio_completed = bp->bio_length - bp->bio_resid;
 	binuptime(&now);
 	mtx_lock(&sc->done_mtx);
 	if (bp2->bio_error == 0)
 		bp2->bio_error = bp->bio_error;
 	bp2->bio_completed += bp->bio_completed;
 
 	switch (bp->bio_cmd) {
 	case BIO_ZONE:
 		bcopy(&bp->bio_zone, &bp2->bio_zone, sizeof(bp->bio_zone));
 		/*FALLTHROUGH*/
 	case BIO_READ:
 	case BIO_WRITE:
 	case BIO_DELETE:
 	case BIO_FLUSH:
 		devstat_end_transaction_bio_bt(sc->dp->d_devstat, bp, &now);
 		break;
 	default:
 		break;
 	}
 	bp2->bio_inbed++;
 	if (bp2->bio_children == bp2->bio_inbed) {
 		mtx_unlock(&sc->done_mtx);
 		bp2->bio_resid = bp2->bio_bcount - bp2->bio_completed;
 		g_io_deliver(bp2, bp2->bio_error);
 	} else
 		mtx_unlock(&sc->done_mtx);
 	g_destroy_bio(bp);
 }
 
 static int
 g_disk_ioctl(struct g_provider *pp, u_long cmd, void * data, int fflag, struct thread *td)
 {
 	struct disk *dp;
 	struct g_disk_softc *sc;
 	int error;
 
 	sc = pp->private;
 	dp = sc->dp;
 	KASSERT(dp != NULL && !dp->d_destroyed,
 	    ("g_disk_ioctl(%lx) on destroyed disk %s", cmd, pp->name));
 
 	if (dp->d_ioctl == NULL)
 		return (ENOIOCTL);
 	error = dp->d_ioctl(dp, cmd, data, fflag, td);
 	return (error);
 }
 
 static off_t
 g_disk_maxsize(struct disk *dp, struct bio *bp)
 {
 	if (bp->bio_cmd == BIO_DELETE)
 		return (dp->d_delmaxsize);
 	return (dp->d_maxsize);
 }
 
 static int
 g_disk_maxsegs(struct disk *dp, struct bio *bp)
 {
 	return ((g_disk_maxsize(dp, bp) / PAGE_SIZE) + 1);
 }
 
 static void
 g_disk_advance(struct disk *dp, struct bio *bp, off_t off)
 {
 
 	bp->bio_offset += off;
 	bp->bio_length -= off;
 
 	if ((bp->bio_flags & BIO_VLIST) != 0) {
 		bus_dma_segment_t *seg, *end;
 
 		seg = (bus_dma_segment_t *)bp->bio_data;
 		end = (bus_dma_segment_t *)bp->bio_data + bp->bio_ma_n;
 		off += bp->bio_ma_offset;
 		while (off >= seg->ds_len) {
 			KASSERT((seg != end),
 			    ("vlist request runs off the end"));
 			off -= seg->ds_len;
 			seg++;
 		}
 		bp->bio_ma_offset = off;
 		bp->bio_ma_n = end - seg;
 		bp->bio_data = (void *)seg;
 	} else if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
 		bp->bio_ma += off / PAGE_SIZE;
 		bp->bio_ma_offset += off;
 		bp->bio_ma_offset %= PAGE_SIZE;
 		bp->bio_ma_n -= off / PAGE_SIZE;
 	} else {
 		bp->bio_data += off;
 	}
 }
 
 static void
 g_disk_seg_limit(bus_dma_segment_t *seg, off_t *poffset,
     off_t *plength, int *ppages)
 {
 	uintptr_t seg_page_base;
 	uintptr_t seg_page_end;
 	off_t offset;
 	off_t length;
 	int seg_pages;
 
 	offset = *poffset;
 	length = *plength;
 
 	if (length > seg->ds_len - offset)
 		length = seg->ds_len - offset;
 
 	seg_page_base = trunc_page(seg->ds_addr + offset);
 	seg_page_end  = round_page(seg->ds_addr + offset + length);
 	seg_pages = (seg_page_end - seg_page_base) >> PAGE_SHIFT;
 
 	if (seg_pages > *ppages) {
 		seg_pages = *ppages;
 		length = (seg_page_base + (seg_pages << PAGE_SHIFT)) -
 		    (seg->ds_addr + offset);
 	}
 
 	*poffset = 0;
 	*plength -= length;
 	*ppages -= seg_pages;
 }
 
 static off_t
 g_disk_vlist_limit(struct disk *dp, struct bio *bp, bus_dma_segment_t **pendseg)
 {
 	bus_dma_segment_t *seg, *end;
 	off_t residual;
 	off_t offset;
 	int pages;
 
 	seg = (bus_dma_segment_t *)bp->bio_data;
 	end = (bus_dma_segment_t *)bp->bio_data + bp->bio_ma_n;
 	residual = bp->bio_length;
 	offset = bp->bio_ma_offset;
 	pages = g_disk_maxsegs(dp, bp);
 	while (residual != 0 && pages != 0) {
 		KASSERT((seg != end),
 		    ("vlist limit runs off the end"));
 		g_disk_seg_limit(seg, &offset, &residual, &pages);
 		seg++;
 	}
 	if (pendseg != NULL)
 		*pendseg = seg;
 	return (residual);
 }
 
 static bool
 g_disk_limit(struct disk *dp, struct bio *bp)
 {
 	bool limited = false;
 	off_t maxsz;
 
 	maxsz = g_disk_maxsize(dp, bp);
 
 	/*
 	 * XXX: If we have a stripesize we should really use it here.
 	 *      Care should be taken in the delete case if this is done
 	 *      as deletes can be very sensitive to size given how they
 	 *      are processed.
 	 */
 	if (bp->bio_length > maxsz) {
 		bp->bio_length = maxsz;
 		limited = true;
 	}
 
 	if ((bp->bio_flags & BIO_VLIST) != 0) {
 		bus_dma_segment_t *firstseg, *endseg;
 		off_t residual;
 
 		firstseg = (bus_dma_segment_t*)bp->bio_data;
 		residual = g_disk_vlist_limit(dp, bp, &endseg);
 		if (residual != 0) {
 			bp->bio_ma_n = endseg - firstseg;
 			bp->bio_length -= residual;
 			limited = true;
 		}
 	} else if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
 		bp->bio_ma_n =
 		    howmany(bp->bio_ma_offset + bp->bio_length, PAGE_SIZE);
 	}
 
 	return (limited);
 }
 
 static void
 g_disk_start(struct bio *bp)
 {
 	struct bio *bp2, *bp3;
 	struct disk *dp;
 	struct g_disk_softc *sc;
 	int error;
 	off_t off;
 
 	biotrack(bp, __func__);
 
 	sc = bp->bio_to->private;
 	dp = sc->dp;
 	KASSERT(dp != NULL && !dp->d_destroyed,
 	    ("g_disk_start(%p) on destroyed disk %s", bp, bp->bio_to->name));
 	error = EJUSTRETURN;
 	switch(bp->bio_cmd) {
 	case BIO_DELETE:
 		if (!(dp->d_flags & DISKFLAG_CANDELETE)) {
 			error = EOPNOTSUPP;
 			break;
 		}
 		/* fall-through */
 	case BIO_READ:
 	case BIO_WRITE:
 		KASSERT((dp->d_flags & DISKFLAG_UNMAPPED_BIO) != 0 ||
 		    (bp->bio_flags & BIO_UNMAPPED) == 0,
 		    ("unmapped bio not supported by disk %s", dp->d_name));
 		off = 0;
 		bp3 = NULL;
 		bp2 = g_clone_bio(bp);
 		if (bp2 == NULL) {
 			error = ENOMEM;
 			break;
 		}
 		for (;;) {
 			if (g_disk_limit(dp, bp2)) {
 				off += bp2->bio_length;
 
 				/*
 				 * To avoid a race, we need to grab the next bio
 				 * before we schedule this one.  See "notes".
 				 */
 				bp3 = g_clone_bio(bp);
 				if (bp3 == NULL)
 					bp->bio_error = ENOMEM;
 			}
 			bp2->bio_done = g_disk_done;
 			bp2->bio_pblkno = bp2->bio_offset / dp->d_sectorsize;
 			bp2->bio_bcount = bp2->bio_length;
 			bp2->bio_disk = dp;
-			mtx_lock(&sc->start_mtx); 
 			devstat_start_transaction_bio(dp->d_devstat, bp2);
-			mtx_unlock(&sc->start_mtx); 
 			dp->d_strategy(bp2);
 
 			if (bp3 == NULL)
 				break;
 
 			bp2 = bp3;
 			bp3 = NULL;
 			g_disk_advance(dp, bp2, off);
 		}
 		break;
 	case BIO_GETATTR:
 		/* Give the driver a chance to override */
 		if (dp->d_getattr != NULL) {
 			if (bp->bio_disk == NULL)
 				bp->bio_disk = dp;
 			error = dp->d_getattr(bp);
 			if (error != -1)
 				break;
 			error = EJUSTRETURN;
 		}
 		if (g_handleattr_int(bp, "GEOM::candelete",
 		    (dp->d_flags & DISKFLAG_CANDELETE) != 0))
 			break;
 		else if (g_handleattr_int(bp, "GEOM::fwsectors",
 		    dp->d_fwsectors))
 			break;
 		else if (g_handleattr_int(bp, "GEOM::fwheads", dp->d_fwheads))
 			break;
 		else if (g_handleattr_off_t(bp, "GEOM::frontstuff", 0))
 			break;
 		else if (g_handleattr_str(bp, "GEOM::ident", dp->d_ident))
 			break;
 		else if (g_handleattr_str(bp, "GEOM::descr", dp->d_descr))
 			break;
 		else if (g_handleattr_uint16_t(bp, "GEOM::hba_vendor",
 		    dp->d_hba_vendor))
 			break;
 		else if (g_handleattr_uint16_t(bp, "GEOM::hba_device",
 		    dp->d_hba_device))
 			break;
 		else if (g_handleattr_uint16_t(bp, "GEOM::hba_subvendor",
 		    dp->d_hba_subvendor))
 			break;
 		else if (g_handleattr_uint16_t(bp, "GEOM::hba_subdevice",
 		    dp->d_hba_subdevice))
 			break;
 		else if (!strcmp(bp->bio_attribute, "GEOM::kerneldump"))
 			g_disk_kerneldump(bp, dp);
 		else if (!strcmp(bp->bio_attribute, "GEOM::setstate"))
 			g_disk_setstate(bp, sc);
 		else if (g_handleattr_uint16_t(bp, "GEOM::rotation_rate",
 		    dp->d_rotation_rate))
 			break;
 		else if (g_handleattr_str(bp, "GEOM::attachment",
 		    dp->d_attachment))
 			break;
 		else
 			error = ENOIOCTL;
 		break;
 	case BIO_FLUSH:
 		g_trace(G_T_BIO, "g_disk_flushcache(%s)",
 		    bp->bio_to->name);
 		if (!(dp->d_flags & DISKFLAG_CANFLUSHCACHE)) {
 			error = EOPNOTSUPP;
 			break;
 		}
 		/*FALLTHROUGH*/
 	case BIO_ZONE:
 		if (bp->bio_cmd == BIO_ZONE) {
 			if (!(dp->d_flags & DISKFLAG_CANZONE)) {
 				error = EOPNOTSUPP;
 				break;
 			}
 			g_trace(G_T_BIO, "g_disk_zone(%s)",
 			    bp->bio_to->name);
 		}
 		bp2 = g_clone_bio(bp);
 		if (bp2 == NULL) {
 			g_io_deliver(bp, ENOMEM);
 			return;
 		}
 		bp2->bio_done = g_disk_done;
 		bp2->bio_disk = dp;
-		mtx_lock(&sc->start_mtx);
 		devstat_start_transaction_bio(dp->d_devstat, bp2);
-		mtx_unlock(&sc->start_mtx);
 		dp->d_strategy(bp2);
 		break;
 	default:
 		error = EOPNOTSUPP;
 		break;
 	}
 	if (error != EJUSTRETURN)
 		g_io_deliver(bp, error);
 	return;
 }
 
 static void
 g_disk_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp)
 {
 	struct bio *bp;
 	struct disk *dp;
 	struct g_disk_softc *sc;
 	char *buf;
 	int res = 0;
 
 	sc = gp->softc;
 	if (sc == NULL || (dp = sc->dp) == NULL)
 		return;
 	if (indent == NULL) {
 		sbuf_printf(sb, " hd %u", dp->d_fwheads);
 		sbuf_printf(sb, " sc %u", dp->d_fwsectors);
 		return;
 	}
 	if (pp != NULL) {
 		sbuf_printf(sb, "%s<fwheads>%u</fwheads>\n",
 		    indent, dp->d_fwheads);
 		sbuf_printf(sb, "%s<fwsectors>%u</fwsectors>\n",
 		    indent, dp->d_fwsectors);
 
 		/*
 		 * "rotationrate" is a little complicated, because the value
 		 * returned by the drive might not be the RPM; 0 and 1 are
 		 * special cases, and there's also a valid range.
 		 */
 		sbuf_printf(sb, "%s<rotationrate>", indent);
 		if (dp->d_rotation_rate == DISK_RR_UNKNOWN) /* Old drives */
 			sbuf_cat(sb, "unknown");	/* don't report RPM. */
 		else if (dp->d_rotation_rate == DISK_RR_NON_ROTATING)
 			sbuf_cat(sb, "0");
 		else if ((dp->d_rotation_rate >= DISK_RR_MIN) &&
 		    (dp->d_rotation_rate <= DISK_RR_MAX))
 			sbuf_printf(sb, "%u", dp->d_rotation_rate);
 		else
 			sbuf_cat(sb, "invalid");
 		sbuf_cat(sb, "</rotationrate>\n");
 		if (dp->d_getattr != NULL) {
 			buf = g_malloc(DISK_IDENT_SIZE, M_WAITOK);
 			bp = g_alloc_bio();
 			bp->bio_disk = dp;
 			bp->bio_attribute = "GEOM::ident";
 			bp->bio_length = DISK_IDENT_SIZE;
 			bp->bio_data = buf;
 			res = dp->d_getattr(bp);
 			sbuf_printf(sb, "%s<ident>", indent);
 			g_conf_cat_escaped(sb, res == 0 ? buf : dp->d_ident);
 			sbuf_cat(sb, "</ident>\n");
 			bp->bio_attribute = "GEOM::lunid";
 			bp->bio_length = DISK_IDENT_SIZE;
 			bp->bio_data = buf;
 			if (dp->d_getattr(bp) == 0) {
 				sbuf_printf(sb, "%s<lunid>", indent);
 				g_conf_cat_escaped(sb, buf);
 				sbuf_cat(sb, "</lunid>\n");
 			}
 			bp->bio_attribute = "GEOM::lunname";
 			bp->bio_length = DISK_IDENT_SIZE;
 			bp->bio_data = buf;
 			if (dp->d_getattr(bp) == 0) {
 				sbuf_printf(sb, "%s<lunname>", indent);
 				g_conf_cat_escaped(sb, buf);
 				sbuf_cat(sb, "</lunname>\n");
 			}
 			g_destroy_bio(bp);
 			g_free(buf);
 		} else {
 			sbuf_printf(sb, "%s<ident>", indent);
 			g_conf_cat_escaped(sb, dp->d_ident);
 			sbuf_cat(sb, "</ident>\n");
 		}
 		sbuf_printf(sb, "%s<descr>", indent);
 		g_conf_cat_escaped(sb, dp->d_descr);
 		sbuf_cat(sb, "</descr>\n");
 	}
 }
 
 static void
 g_disk_resize(void *ptr, int flag)
 {
 	struct disk *dp;
 	struct g_geom *gp;
 	struct g_provider *pp;
 
 	if (flag == EV_CANCEL)
 		return;
 	g_topology_assert();
 
 	dp = ptr;
 	gp = dp->d_geom;
 
 	if (dp->d_destroyed || gp == NULL)
 		return;
 
 	LIST_FOREACH(pp, &gp->provider, provider) {
 		if (pp->sectorsize != 0 &&
 		    pp->sectorsize != dp->d_sectorsize)
 			g_wither_provider(pp, ENXIO);
 		else
 			g_resize_provider(pp, dp->d_mediasize);
 	}
 }
 
 static void
 g_disk_create(void *arg, int flag)
 {
 	struct g_geom *gp;
 	struct g_provider *pp;
 	struct disk *dp;
 	struct g_disk_softc *sc;
 	struct disk_alias *dap;
 	char tmpstr[80];
 
 	if (flag == EV_CANCEL)
 		return;
 	g_topology_assert();
 	dp = arg;
 
 	mtx_pool_lock(mtxpool_sleep, dp);
 	dp->d_init_level = DISK_INIT_START;
 
 	/*
 	 * If the disk has already gone away, we can just stop here and
 	 * call the user's callback to tell him we've cleaned things up.
 	 */
 	if (dp->d_goneflag != 0) {
 		mtx_pool_unlock(mtxpool_sleep, dp);
 		if (dp->d_gone != NULL)
 			dp->d_gone(dp);
 		return;
 	}
 	mtx_pool_unlock(mtxpool_sleep, dp);
 
 	sc = g_malloc(sizeof(*sc), M_WAITOK | M_ZERO);
-	mtx_init(&sc->start_mtx, "g_disk_start", NULL, MTX_DEF);
 	mtx_init(&sc->done_mtx, "g_disk_done", NULL, MTX_DEF);
 	sc->dp = dp;
 	gp = g_new_geomf(&g_disk_class, "%s%d", dp->d_name, dp->d_unit);
 	gp->softc = sc;
 	LIST_FOREACH(dap, &dp->d_aliases, da_next) {
 		snprintf(tmpstr, sizeof(tmpstr), "%s%d", dap->da_alias, dp->d_unit);
 		g_geom_add_alias(gp, tmpstr);
 	}
 	pp = g_new_providerf(gp, "%s", gp->name);
 	devstat_remove_entry(pp->stat);
 	pp->stat = NULL;
 	dp->d_devstat->id = pp;
 	pp->mediasize = dp->d_mediasize;
 	pp->sectorsize = dp->d_sectorsize;
 	pp->stripeoffset = dp->d_stripeoffset;
 	pp->stripesize = dp->d_stripesize;
 	if ((dp->d_flags & DISKFLAG_UNMAPPED_BIO) != 0)
 		pp->flags |= G_PF_ACCEPT_UNMAPPED;
 	if ((dp->d_flags & DISKFLAG_DIRECT_COMPLETION) != 0)
 		pp->flags |= G_PF_DIRECT_SEND;
 	pp->flags |= G_PF_DIRECT_RECEIVE;
 	if (bootverbose)
 		printf("GEOM: new disk %s\n", gp->name);
 	sysctl_ctx_init(&sc->sysctl_ctx);
 	snprintf(tmpstr, sizeof(tmpstr), "GEOM disk %s", gp->name);
 	sc->sysctl_tree = SYSCTL_ADD_NODE(&sc->sysctl_ctx,
 		SYSCTL_STATIC_CHILDREN(_kern_geom_disk), OID_AUTO, gp->name,
 		CTLFLAG_RD, 0, tmpstr);
 	if (sc->sysctl_tree != NULL) {
 		SYSCTL_ADD_STRING(&sc->sysctl_ctx,
 		    SYSCTL_CHILDREN(sc->sysctl_tree), OID_AUTO, "led",
 		    CTLFLAG_RWTUN, sc->led, sizeof(sc->led),
 		    "LED name");
 		SYSCTL_ADD_PROC(&sc->sysctl_ctx,
 		    SYSCTL_CHILDREN(sc->sysctl_tree), OID_AUTO, "flags",
 		    CTLTYPE_STRING | CTLFLAG_RD, dp, 0, g_disk_sysctl_flags,
 		    "A", "Report disk flags");
 	}
 	pp->private = sc;
 	dp->d_geom = gp;
 	g_error_provider(pp, 0);
 
 	mtx_pool_lock(mtxpool_sleep, dp);
 	dp->d_init_level = DISK_INIT_DONE;
 
 	/*
 	 * If the disk has gone away at this stage, start the withering
 	 * process for it.
 	 */
 	if (dp->d_goneflag != 0) {
 		mtx_pool_unlock(mtxpool_sleep, dp);
 		g_wither_provider(pp, ENXIO);
 		return;
 	}
 	mtx_pool_unlock(mtxpool_sleep, dp);
 
 }
 
 /*
  * We get this callback after all of the consumers have gone away, and just
  * before the provider is freed.  If the disk driver provided a d_gone
  * callback, let them know that it is okay to free resources -- they won't
  * be getting any more accesses from GEOM.
  */
 static void
 g_disk_providergone(struct g_provider *pp)
 {
 	struct disk *dp;
 	struct g_disk_softc *sc;
 
 	sc = (struct g_disk_softc *)pp->private;
 	dp = sc->dp;
 	if (dp != NULL && dp->d_gone != NULL)
 		dp->d_gone(dp);
 	if (sc->sysctl_tree != NULL) {
 		sysctl_ctx_free(&sc->sysctl_ctx);
 		sc->sysctl_tree = NULL;
 	}
 	if (sc->led[0] != 0) {
 		led_set(sc->led, "0");
 		sc->led[0] = 0;
 	}
 	pp->private = NULL;
 	pp->geom->softc = NULL;
 	mtx_destroy(&sc->done_mtx);
-	mtx_destroy(&sc->start_mtx);
 	g_free(sc);
 }
 
 static void
 g_disk_destroy(void *ptr, int flag)
 {
 	struct disk *dp;
 	struct g_geom *gp;
 	struct g_disk_softc *sc;
 	struct disk_alias *dap, *daptmp;
 
 	g_topology_assert();
 	dp = ptr;
 	gp = dp->d_geom;
 	if (gp != NULL) {
 		sc = gp->softc;
 		if (sc != NULL)
 			sc->dp = NULL;
 		dp->d_geom = NULL;
 		g_wither_geom(gp, ENXIO);
 	}
 	LIST_FOREACH_SAFE(dap, &dp->d_aliases, da_next, daptmp)
 		g_free(dap);
 
 	g_free(dp);
 }
 
 /*
  * We only allow printable characters in disk ident,
  * the rest is converted to 'x<HH>'.
  */
 static void
 g_disk_ident_adjust(char *ident, size_t size)
 {
 	char *p, tmp[4], newid[DISK_IDENT_SIZE];
 
 	newid[0] = '\0';
 	for (p = ident; *p != '\0'; p++) {
 		if (isprint(*p)) {
 			tmp[0] = *p;
 			tmp[1] = '\0';
 		} else {
 			snprintf(tmp, sizeof(tmp), "x%02hhx",
 			    *(unsigned char *)p);
 		}
 		if (strlcat(newid, tmp, sizeof(newid)) >= sizeof(newid))
 			break;
 	}
 	bzero(ident, size);
 	strlcpy(ident, newid, size);
 }
 
 struct disk *
 disk_alloc(void)
 {
 	struct disk *dp;
 
 	dp = g_malloc(sizeof(struct disk), M_WAITOK | M_ZERO);
 	LIST_INIT(&dp->d_aliases);
 	return (dp);
 }
 
 void
 disk_create(struct disk *dp, int version)
 {
 
 	if (version != DISK_VERSION) {
 		printf("WARNING: Attempt to add disk %s%d %s",
 		    dp->d_name, dp->d_unit,
 		    " using incompatible ABI version of disk(9)\n");
 		printf("WARNING: Ignoring disk %s%d\n",
 		    dp->d_name, dp->d_unit);
 		return;
 	}
 	if (dp->d_flags & DISKFLAG_RESERVED) {
 		printf("WARNING: Attempt to add non-MPSAFE disk %s%d\n",
 		    dp->d_name, dp->d_unit);
 		printf("WARNING: Ignoring disk %s%d\n",
 		    dp->d_name, dp->d_unit);
 		return;
 	}
 	KASSERT(dp->d_strategy != NULL, ("disk_create need d_strategy"));
 	KASSERT(dp->d_name != NULL, ("disk_create need d_name"));
 	KASSERT(*dp->d_name != 0, ("disk_create need d_name"));
 	KASSERT(strlen(dp->d_name) < SPECNAMELEN - 4, ("disk name too long"));
 	if (dp->d_devstat == NULL)
 		dp->d_devstat = devstat_new_entry(dp->d_name, dp->d_unit,
 		    dp->d_sectorsize, DEVSTAT_ALL_SUPPORTED,
 		    DEVSTAT_TYPE_DIRECT, DEVSTAT_PRIORITY_MAX);
 	dp->d_geom = NULL;
 
 	dp->d_init_level = DISK_INIT_NONE;
 
 	g_disk_ident_adjust(dp->d_ident, sizeof(dp->d_ident));
 	g_post_event(g_disk_create, dp, M_WAITOK, dp, NULL);
 }
 
 void
 disk_destroy(struct disk *dp)
 {
 
 	disk_gone(dp);
 	dp->d_destroyed = 1;
 	g_cancel_event(dp);
 	if (dp->d_devstat != NULL)
 		devstat_remove_entry(dp->d_devstat);
 	g_post_event(g_disk_destroy, dp, M_WAITOK, NULL);
 }
 
 void
 disk_add_alias(struct disk *dp, const char *name)
 {
 	struct disk_alias *dap;
 
 	dap = (struct disk_alias *)g_malloc(
 		sizeof(struct disk_alias) + strlen(name) + 1, M_WAITOK);
 	strcpy((char *)(dap + 1), name);
 	dap->da_alias = (const char *)(dap + 1);
 	LIST_INSERT_HEAD(&dp->d_aliases, dap, da_next);
 }
 
 void
 disk_gone(struct disk *dp)
 {
 	struct g_geom *gp;
 	struct g_provider *pp;
 
 	mtx_pool_lock(mtxpool_sleep, dp);
 
 	/*
 	 * Second wither call makes no sense, plus we can not access the list
 	 * of providers without topology lock after calling wither once.
 	 */
 	if (dp->d_goneflag != 0) {
 		mtx_pool_unlock(mtxpool_sleep, dp);
 		return;
 	}
 
 	dp->d_goneflag = 1;
 
 	/*
 	 * If we're still in the process of creating this disk (the
 	 * g_disk_create() function is still queued, or is in
 	 * progress), the init level will not yet be DISK_INIT_DONE.
 	 *
 	 * If that is the case, g_disk_create() will see d_goneflag
 	 * and take care of cleaning things up.
 	 *
 	 * If the disk has already been created, we default to
 	 * withering the provider as usual below.
 	 *
 	 * If the caller has not set a d_gone() callback, he will
 	 * not be any worse off by returning here, because the geom
 	 * has not been fully setup in any case.
 	 */
 	if (dp->d_init_level < DISK_INIT_DONE) {
 		mtx_pool_unlock(mtxpool_sleep, dp);
 		return;
 	}
 	mtx_pool_unlock(mtxpool_sleep, dp);
 
 	gp = dp->d_geom;
 	pp = LIST_FIRST(&gp->provider);
 	if (pp != NULL) {
 		KASSERT(LIST_NEXT(pp, provider) == NULL,
 		    ("geom %p has more than one provider", gp));
 		g_wither_provider(pp, ENXIO);
 	}
 }
 
 void
 disk_attr_changed(struct disk *dp, const char *attr, int flag)
 {
 	struct g_geom *gp;
 	struct g_provider *pp;
 	char devnamebuf[128];
 
 	gp = dp->d_geom;
 	if (gp != NULL)
 		LIST_FOREACH(pp, &gp->provider, provider)
 			(void)g_attr_changed(pp, attr, flag);
 	snprintf(devnamebuf, sizeof(devnamebuf), "devname=%s%d", dp->d_name,
 	    dp->d_unit);
 	devctl_notify("GEOM", "disk", attr, devnamebuf);
 }
 
 void
 disk_media_changed(struct disk *dp, int flag)
 {
 	struct g_geom *gp;
 	struct g_provider *pp;
 
 	gp = dp->d_geom;
 	if (gp != NULL) {
 		pp = LIST_FIRST(&gp->provider);
 		if (pp != NULL) {
 			KASSERT(LIST_NEXT(pp, provider) == NULL,
 			    ("geom %p has more than one provider", gp));
 			g_media_changed(pp, flag);
 		}
 	}
 }
 
 void
 disk_media_gone(struct disk *dp, int flag)
 {
 	struct g_geom *gp;
 	struct g_provider *pp;
 
 	gp = dp->d_geom;
 	if (gp != NULL) {
 		pp = LIST_FIRST(&gp->provider);
 		if (pp != NULL) {
 			KASSERT(LIST_NEXT(pp, provider) == NULL,
 			    ("geom %p has more than one provider", gp));
 			g_media_gone(pp, flag);
 		}
 	}
 }
 
 int
 disk_resize(struct disk *dp, int flag)
 {
 
 	if (dp->d_destroyed || dp->d_geom == NULL)
 		return (0);
 
 	return (g_post_event(g_disk_resize, dp, flag, NULL));
 }
 
 static void
 g_kern_disks(void *p, int flag __unused)
 {
 	struct sbuf *sb;
 	struct g_geom *gp;
 	char *sp;
 
 	sb = p;
 	sp = "";
 	g_topology_assert();
 	LIST_FOREACH(gp, &g_disk_class.geom, geom) {
 		sbuf_printf(sb, "%s%s", sp, gp->name);
 		sp = " ";
 	}
 	sbuf_finish(sb);
 }
 
 static int
 g_disk_sysctl_flags(SYSCTL_HANDLER_ARGS)
 {
 	struct disk *dp;
 	struct sbuf *sb;
 	int error;
 
 	sb = sbuf_new_auto();
 	dp = (struct disk *)arg1;
 	sbuf_printf(sb, "%b", dp->d_flags,
 		"\20"
 		"\2OPEN"
 		"\3CANDELETE"
 		"\4CANFLUSHCACHE"
 		"\5UNMAPPEDBIO"
 		"\6DIRECTCOMPLETION"
 		"\10CANZONE"
 		"\11WRITEPROTECT");
 
 	sbuf_finish(sb);
 	error = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1);
 	sbuf_delete(sb);
 	return (error);
 }
 
 static int
 sysctl_disks(SYSCTL_HANDLER_ARGS)
 {
 	int error;
 	struct sbuf *sb;
 
 	sb = sbuf_new_auto();
 	g_waitfor_event(g_kern_disks, sb, M_WAITOK, NULL);
 	error = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1);
 	sbuf_delete(sb);
 	return error;
 }
  
 SYSCTL_PROC(_kern, OID_AUTO, disks,
     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
     sysctl_disks, "A", "names of available disks");
Index: head/sys/geom/geom_io.c
===================================================================
--- head/sys/geom/geom_io.c	(revision 356199)
+++ head/sys/geom/geom_io.c	(revision 356200)
@@ -1,1088 +1,1079 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 2002 Poul-Henning Kamp
  * Copyright (c) 2002 Networks Associates Technology, Inc.
  * Copyright (c) 2013 The FreeBSD Foundation
  * All rights reserved.
  *
  * This software was developed for the FreeBSD Project by Poul-Henning Kamp
  * and NAI Labs, the Security Research Division of Network Associates, Inc.
  * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the
  * DARPA CHATS research program.
  *
  * Portions of this software were developed by Konstantin Belousov
  * under sponsorship from the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. The names of the authors may not be used to endorse or promote
  *    products derived from this software without specific prior written
  *    permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/bio.h>
 #include <sys/ktr.h>
 #include <sys/proc.h>
 #include <sys/sbuf.h>
 #include <sys/stack.h>
 #include <sys/sysctl.h>
 #include <sys/vmem.h>
 #include <machine/stdarg.h>
 
 #include <sys/errno.h>
 #include <geom/geom.h>
 #include <geom/geom_int.h>
 #include <sys/devicestat.h>
 
 #include <vm/uma.h>
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_page.h>
 #include <vm/vm_object.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_map.h>
 
 /* Forward declaration; defined later in this file, used by g_io_check(). */
 static int	g_io_transient_map_bio(struct bio *bp);
 
 /*
  * Global bio queues.  g_io_request() appends to g_bio_run_down, which
  * g_io_schedule_down() drains; g_io_deliver() appends to g_bio_run_up,
  * which g_io_schedule_up() drains.
  */
 static struct g_bioq g_bio_run_down;
 static struct g_bioq g_bio_run_up;
 
 /*
  * Pace is a hint that we've had some trouble recently allocating
  * bios, so we should back off trying to send I/O down the stack
  * a bit to let the problem resolve. When pacing, we also turn
  * off direct dispatch to reduce memory pressure from I/Os
  * there, at the expense of some added latency while the memory
  * pressure exists. See g_io_schedule_down() for more details
  * and limitations.
  */
 /* Set to 1 by g_io_deliver() on ENOMEM; cleared by g_io_schedule_down(). */
 static volatile u_int __read_mostly pace;
 
 /* UMA zone that all struct bio allocations in this file come from. */
 static uma_zone_t __read_mostly biozone;
 
 #include <machine/atomic.h>
 
 /* Acquire the mutex that protects the bio queue bq. */
 static void
 g_bioq_lock(struct g_bioq *bq)
 {
 
 	mtx_lock(&bq->bio_queue_lock);
 }
 
 /* Release the mutex that protects the bio queue bq. */
 static void
 g_bioq_unlock(struct g_bioq *bq)
 {
 
 	mtx_unlock(&bq->bio_queue_lock);
 }
 
 #if 0
 /* Compiled out: would destroy the mutex of the bio queue bq. */
 static void
 g_bioq_destroy(struct g_bioq *bq)
 {
 
 	mtx_destroy(&bq->bio_queue_lock);
 }
 #endif
 
 /* Initialize an empty bio queue and its MTX_DEF mutex named "bio queue". */
 static void
 g_bioq_init(struct g_bioq *bq)
 {
 
 	TAILQ_INIT(&bq->bio_queue);
 	mtx_init(&bq->bio_queue_lock, "bio queue", NULL, MTX_DEF);
 }
 
 /*
  * Remove and return the bio at the head of bq, or return NULL when the
  * queue is empty.  The dequeued bio has BIO_ONQUEUE cleared and the queue
  * length is decremented.  Callers in this file hold the queue lock.
  */
 static struct bio *
 g_bioq_first(struct g_bioq *bq)
 {
 	struct bio *head;
 
 	head = TAILQ_FIRST(&bq->bio_queue);
 	if (head == NULL)
 		return (NULL);
 
 	KASSERT((head->bio_flags & BIO_ONQUEUE),
 	    ("Bio not on queue bp=%p target %p", head, bq));
 	TAILQ_REMOVE(&bq->bio_queue, head, bio_queue);
 	head->bio_flags &= ~BIO_ONQUEUE;
 	bq->bio_queue_length--;
 	return (head);
 }
 
 /*
  * Allocate a zeroed bio from biozone with M_NOWAIT.  The result may be
  * NULL (g_io_speedup() checks for that).  With KTR_GEOM tracing enabled
  * the allocation and the caller's stack are logged.
  */
 struct bio *
 g_new_bio(void)
 {
 	struct bio *bp;
 
 	bp = uma_zalloc(biozone, M_NOWAIT | M_ZERO);
 #ifdef KTR
 	if ((KTR_COMPILE & KTR_GEOM) && (ktr_mask & KTR_GEOM)) {
 		struct stack st;
 
 		CTR1(KTR_GEOM, "g_new_bio(): %p", bp);
 		stack_save(&st);
 		CTRSTACK(KTR_GEOM, &st, 3);
 	}
 #endif
 	return (bp);
 }
 
 /*
  * Allocate a zeroed bio from biozone with M_WAITOK.  Callers in this file
  * use the result without a NULL check.  With KTR_GEOM tracing enabled the
  * allocation and the caller's stack are logged.
  */
 struct bio *
 g_alloc_bio(void)
 {
 	struct bio *bp;
 
 	bp = uma_zalloc(biozone, M_WAITOK | M_ZERO);
 #ifdef KTR
 	if ((KTR_COMPILE & KTR_GEOM) && (ktr_mask & KTR_GEOM)) {
 		struct stack st;
 
 		CTR1(KTR_GEOM, "g_alloc_bio(): %p", bp);
 		stack_save(&st);
 		CTRSTACK(KTR_GEOM, &st, 3);
 	}
 #endif
 	return (bp);
 }
 
 /*
  * Return a bio to biozone.  With KTR_GEOM tracing enabled the release and
  * the caller's stack are logged first.
  */
 void
 g_destroy_bio(struct bio *bp)
 {
 #ifdef KTR
 	if ((KTR_COMPILE & KTR_GEOM) && (ktr_mask & KTR_GEOM)) {
 		struct stack st;
 
 		CTR1(KTR_GEOM, "g_destroy_bio(): %p", bp);
 		stack_save(&st);
 		CTRSTACK(KTR_GEOM, &st, 3);
 	}
 #endif
 	uma_zfree(biozone, bp);
 }
 
 /*
  * Create a child bio of bp without sleeping (M_NOWAIT).
  *
  * On success the clone points at bp as its parent, copies the command,
  * selected flags, offset, length, data/page-array description and
  * attribute, and bp->bio_children is incremented.  Returns NULL when the
  * allocation fails; bp is left untouched in that case.
  */
 struct bio *
 g_clone_bio(struct bio *bp)
 {
 	struct bio *bp2;
 
 	bp2 = uma_zalloc(biozone, M_NOWAIT | M_ZERO);
 	if (bp2 != NULL) {
 		bp2->bio_parent = bp;
 		bp2->bio_cmd = bp->bio_cmd;
 		/*
 		 *  BIO_ORDERED flag may be used by disk drivers to enforce
 		 *  ordering restrictions, so this flag needs to be cloned.
 		 *  BIO_UNMAPPED and BIO_VLIST should be inherited, to properly
 		 *  indicate which way the buffer is passed.
 		 *  Other bio flags are not suitable for cloning.
 		 */
 		bp2->bio_flags = bp->bio_flags &
 		    (BIO_ORDERED | BIO_UNMAPPED | BIO_VLIST);
 		bp2->bio_length = bp->bio_length;
 		bp2->bio_offset = bp->bio_offset;
 		bp2->bio_data = bp->bio_data;
 		bp2->bio_ma = bp->bio_ma;
 		bp2->bio_ma_n = bp->bio_ma_n;
 		bp2->bio_ma_offset = bp->bio_ma_offset;
 		bp2->bio_attribute = bp->bio_attribute;
 		/* Zone arguments are only copied for BIO_ZONE requests. */
 		if (bp->bio_cmd == BIO_ZONE)
 			bcopy(&bp->bio_zone, &bp2->bio_zone,
 			    sizeof(bp->bio_zone));
 #if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING)
 		bp2->bio_track_bp = bp->bio_track_bp;
 #endif
 		bp->bio_children++;
 	}
 #ifdef KTR
 	/* Traced on both the success and the failure (bp2 == NULL) path. */
 	if ((KTR_COMPILE & KTR_GEOM) && (ktr_mask & KTR_GEOM)) {
 		struct stack st;
 
 		CTR2(KTR_GEOM, "g_clone_bio(%p): %p", bp, bp2);
 		stack_save(&st);
 		CTRSTACK(KTR_GEOM, &st, 3);
 	}
 #endif
 	return(bp2);
 }
 
 /*
  * Create a child bio of bp using a sleeping (M_WAITOK) allocation.
  *
  * The duplicate inherits the command, offset, length, data/page-array
  * description and attribute of bp, and only the BIO_UNMAPPED and
  * BIO_VLIST flags.  bp->bio_children is incremented.
  */
 struct bio *
 g_duplicate_bio(struct bio *bp)
 {
 	struct bio *dup;
 
 	dup = uma_zalloc(biozone, M_WAITOK | M_ZERO);
 
 	/* Link to the parent and describe the same operation. */
 	dup->bio_parent = bp;
 	dup->bio_cmd = bp->bio_cmd;
 	dup->bio_attribute = bp->bio_attribute;
 	dup->bio_offset = bp->bio_offset;
 	dup->bio_length = bp->bio_length;
 
 	/* Share the parent's buffer, whichever way it is passed. */
 	dup->bio_flags = bp->bio_flags & (BIO_UNMAPPED | BIO_VLIST);
 	dup->bio_data = bp->bio_data;
 	dup->bio_ma = bp->bio_ma;
 	dup->bio_ma_n = bp->bio_ma_n;
 	dup->bio_ma_offset = bp->bio_ma_offset;
 
 	bp->bio_children++;
 #ifdef KTR
 	if ((KTR_COMPILE & KTR_GEOM) && (ktr_mask & KTR_GEOM)) {
 		struct stack st;
 
 		CTR2(KTR_GEOM, "g_duplicate_bio(%p): %p", bp, dup);
 		stack_save(&st);
 		CTRSTACK(KTR_GEOM, &st, 3);
 	}
 #endif
 	return (dup);
 }
 
 /* Zero every field of the bio so it can be set up again from scratch. */
 void
 g_reset_bio(struct bio *bp)
 {
 
 	bzero(bp, sizeof(*bp));
 }
 
 /*
  * Set up the I/O layer state defined in this file: the down and up bio
  * queues and the UMA zone ("g_bio") that struct bio allocations use.
  */
 void
 g_io_init(void)
 {
 
 	g_bioq_init(&g_bio_run_down);
 	g_bioq_init(&g_bio_run_up);
 	biozone = uma_zcreate("g_bio", sizeof (struct bio),
 	    NULL, NULL,
 	    NULL, NULL,
 	    0, 0);
 }
 
 /*
  * Issue a synchronous BIO_GETATTR for attribute attr through consumer cp.
  *
  * On entry *len is the size of the buffer at ptr; on return *len is set
  * to the request's bio_completed value.  Returns the biowait() result.
  */
 int
 g_io_getattr(const char *attr, struct g_consumer *cp, int *len, void *ptr)
 {
 	struct bio *req;
 	int rc;
 
 	g_trace(G_T_BIO, "bio_getattr(%s)", attr);
 
 	req = g_alloc_bio();
 	req->bio_cmd = BIO_GETATTR;
 	req->bio_attribute = attr;
 	req->bio_data = ptr;
 	req->bio_length = *len;
 	req->bio_done = NULL;
 
 	g_io_request(req, cp);
 	rc = biowait(req, "ggetattr");
 	*len = req->bio_completed;
 	g_destroy_bio(req);
 	return (rc);
 }
 
 /*
  * Issue a synchronous BIO_ZONE request described by zone_args through
  * consumer cp.  The zone arguments are copied into the bio before the
  * request and copied back into zone_args afterwards.  Returns the
  * biowait() result.
  */
 int
 g_io_zonecmd(struct disk_zone_args *zone_args, struct g_consumer *cp)
 {
 	struct bio *req;
 	int rc;
 
 	g_trace(G_T_BIO, "bio_zone(%d)", zone_args->zone_cmd);
 
 	req = g_alloc_bio();
 	req->bio_cmd = BIO_ZONE;
 	req->bio_done = NULL;
 	/*
 	 * XXX KDM need to handle report zone data.
 	 */
 	bcopy(zone_args, &req->bio_zone, sizeof(*zone_args));
 
 	/* Only REPORT ZONES carries a length: one entry per allocated slot. */
 	req->bio_length =
 	    (zone_args->zone_cmd == DISK_ZONE_REPORT_ZONES) ?
 	    zone_args->zone_params.report.entries_allocated *
 	    sizeof(struct disk_zone_rep_entry) : 0;
 
 	g_io_request(req, cp);
 	rc = biowait(req, "gzone");
 	bcopy(&req->bio_zone, zone_args, sizeof(*zone_args));
 	g_destroy_bio(req);
 	return (rc);
 }
 
 /*
  * Send a BIO_SPEEDUP down the stack. This is used to tell the lower layers that
  * the upper layers have detected a resource shortage. The lower layers are
  * advised to stop delaying I/O that they might be holding for performance
  * reasons and to schedule it (non-trims) or complete it successfully (trims) as
  * quickly as it can. bio_length is the amount of the shortage.  This call
  * should be non-blocking. bio_resid is used to communicate back if the lower
  * layers couldn't find bio_length worth of I/O to schedule or discard. A length
  * of 0 means to do as much as you can (schedule the h/w queues full, discard
  * all trims). flags are a hint from the upper layers to the lower layers what
  * operation should be done.
  *
  * Returns ENOMEM when a bio cannot be allocated without sleeping; otherwise
  * returns the biowait() result and stores bio_resid in *resid.
  *
  * NOTE(review): the function waits in biowait() for the request to complete
  * and reuses the "gflush" wait message of g_io_flush(); confirm that both
  * are intended given the "non-blocking" remark above.
  */
 int
 g_io_speedup(size_t shortage, u_int flags, size_t *resid, struct g_consumer *cp)
 {
 	struct bio *bp;
 	int error;
 
 	KASSERT((flags & (BIO_SPEEDUP_TRIM | BIO_SPEEDUP_WRITE)) != 0,
 	    ("Invalid flags passed to g_io_speedup: %#x", flags));
 	g_trace(G_T_BIO, "bio_speedup(%s, %zu, %#x)", cp->provider->name,
 	    shortage, flags);
 	/* g_new_bio() allocates with M_NOWAIT, so it can fail. */
 	bp = g_new_bio();
 	if (bp == NULL)
 		return (ENOMEM);
 	bp->bio_cmd = BIO_SPEEDUP;
 	bp->bio_length = shortage;
 	bp->bio_done = NULL;
 	bp->bio_flags |= flags;
 	g_io_request(bp, cp);
 	error = biowait(bp, "gflush");
 	*resid = bp->bio_resid;
 	g_destroy_bio(bp);
 	return (error);
 }
 
 /*
  * Issue a synchronous, ordered BIO_FLUSH through consumer cp and return
  * the biowait() result.
  */
 int
 g_io_flush(struct g_consumer *cp)
 {
 	struct bio *req;
 	int rc;
 
 	g_trace(G_T_BIO, "bio_flush(%s)", cp->provider->name);
 
 	req = g_alloc_bio();
 	req->bio_cmd = BIO_FLUSH;
 	req->bio_flags |= BIO_ORDERED;
 	/* No payload: zero length, positioned at the provider's media size. */
 	req->bio_data = NULL;
 	req->bio_length = 0;
 	req->bio_offset = cp->provider->mediasize;
 	req->bio_attribute = NULL;
 	req->bio_done = NULL;
 
 	g_io_request(req, cp);
 	rc = biowait(req, "gflush");
 	g_destroy_bio(req);
 	return (rc);
 }
 
 /*
  * Validate a bio before it is handed to the provider's start routine.
  *
  * Returns EJUSTRETURN when the request should go on to the provider.  Any
  * value >= 0 (0 or an errno) means the request ends here; the callers in
  * this file pass such a value straight to g_io_deliver().  READ, WRITE and
  * DELETE requests that reach past the end of the media are truncated in
  * place, and unmapped READ/WRITE requests are given a transient mapping
  * when the provider does not accept unmapped bios.
  */
 static int
 g_io_check(struct bio *bp)
 {
 	struct g_consumer *cp;
 	struct g_provider *pp;
 	off_t excess;
 	int error;
 
 	biotrack(bp, __func__);
 
 	cp = bp->bio_from;
 	pp = bp->bio_to;
 
 	/* Fail if access counters don't allow the operation */
 	switch(bp->bio_cmd) {
 	case BIO_READ:
 	case BIO_GETATTR:
 		if (cp->acr == 0)
 			return (EPERM);
 		break;
 	case BIO_WRITE:
 	case BIO_DELETE:
 	case BIO_FLUSH:
 		if (cp->acw == 0)
 			return (EPERM);
 		break;
 	case BIO_ZONE:
 		/* Reporting/query zone commands need read access, others write. */
 		if ((bp->bio_zone.zone_cmd == DISK_ZONE_REPORT_ZONES) ||
 		    (bp->bio_zone.zone_cmd == DISK_ZONE_GET_PARAMS)) {
 			if (cp->acr == 0)
 				return (EPERM);
 		} else if (cp->acw == 0)
 			return (EPERM);
 		break;
 	default:
 		/* Any command not listed above is rejected here. */
 		return (EPERM);
 	}
 	/* if provider is marked for error, don't disturb. */
 	if (pp->error)
 		return (pp->error);
 	if (cp->flags & G_CF_ORPHAN)
 		return (ENXIO);
 
 	switch(bp->bio_cmd) {
 	case BIO_READ:
 	case BIO_WRITE:
 	case BIO_DELETE:
 		/* Zero sectorsize or mediasize is probably a lack of media. */
 		if (pp->sectorsize == 0 || pp->mediasize == 0)
 			return (ENXIO);
 		/* Reject I/O not on sector boundary */
 		if (bp->bio_offset % pp->sectorsize)
 			return (EINVAL);
 		/* Reject I/O not integral sector long */
 		if (bp->bio_length % pp->sectorsize)
 			return (EINVAL);
 		/* Reject requests before or past the end of media. */
 		if (bp->bio_offset < 0)
 			return (EIO);
 		if (bp->bio_offset > pp->mediasize)
 			return (EIO);
 
 		/* Truncate requests to the end of providers media. */
 		excess = bp->bio_offset + bp->bio_length;
 		if (excess > bp->bio_to->mediasize) {
 			KASSERT((bp->bio_flags & BIO_UNMAPPED) == 0 ||
 			    round_page(bp->bio_ma_offset +
 			    bp->bio_length) / PAGE_SIZE == bp->bio_ma_n,
 			    ("excess bio %p too short", bp));
 			excess -= bp->bio_to->mediasize;
 			bp->bio_length -= excess;
 			/* Keep the page count in step with the shortened length. */
 			if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
 				bp->bio_ma_n = round_page(bp->bio_ma_offset +
 				    bp->bio_length) / PAGE_SIZE;
 			}
 			if (excess > 0)
 				CTR3(KTR_GEOM, "g_down truncated bio "
 				    "%p provider %s by %d", bp,
 				    bp->bio_to->name, excess);
 		}
 
 		/* Deliver zero length transfers right here. */
 		if (bp->bio_length == 0) {
 			CTR2(KTR_GEOM, "g_down terminated 0-length "
 			    "bp %p provider %s", bp, bp->bio_to->name);
 			return (0);
 		}
 
 		/*
 		 * Unmapped READ/WRITE to a provider that does not accept
 		 * unmapped bios: try to map it.  A result >= 0 is an error to
 		 * deliver; EJUSTRETURN falls through to the return below.
 		 */
 		if ((bp->bio_flags & BIO_UNMAPPED) != 0 &&
 		    (bp->bio_to->flags & G_PF_ACCEPT_UNMAPPED) == 0 &&
 		    (bp->bio_cmd == BIO_READ || bp->bio_cmd == BIO_WRITE)) {
 			if ((error = g_io_transient_map_bio(bp)) >= 0)
 				return (error);
 		}
 		break;
 	default:
 		break;
 	}
 	return (EJUSTRETURN);
 }
 
 /*
  * Submit bio bp from consumer cp to the provider cp is attached to.
  *
  * The bio's routing and completion fields are initialized, its start time
  * is recorded and devstat transactions are started according to
  * g_collectstats.  The request is then either dispatched directly (after
  * g_io_check()) or appended to g_bio_run_down for g_io_schedule_down().
  */
 void
 g_io_request(struct bio *bp, struct g_consumer *cp)
 {
 	struct g_provider *pp;
-	struct mtx *mtxp;
 	int direct, error, first;
 	uint8_t cmd;
 
 	biotrack(bp, __func__);
 
 	KASSERT(cp != NULL, ("NULL cp in g_io_request"));
 	KASSERT(bp != NULL, ("NULL bp in g_io_request"));
 	pp = cp->provider;
 	KASSERT(pp != NULL, ("consumer not attached in g_io_request"));
 #ifdef DIAGNOSTIC
 	KASSERT(bp->bio_driver1 == NULL,
 	    ("bio_driver1 used by the consumer (geom %s)", cp->geom->name));
 	KASSERT(bp->bio_driver2 == NULL,
 	    ("bio_driver2 used by the consumer (geom %s)", cp->geom->name));
 	KASSERT(bp->bio_pflags == 0,
 	    ("bio_pflags used by the consumer (geom %s)", cp->geom->name));
 	/*
 	 * Remember consumer's private fields, so we can detect if they were
 	 * modified by the provider.
 	 */
 	bp->_bio_caller1 = bp->bio_caller1;
 	bp->_bio_caller2 = bp->bio_caller2;
 	bp->_bio_cflags = bp->bio_cflags;
 #endif
 
 	/* Sanity-check the data pointer and alignment for the command. */
 	cmd = bp->bio_cmd;
 	if (cmd == BIO_READ || cmd == BIO_WRITE || cmd == BIO_GETATTR) {
 		KASSERT(bp->bio_data != NULL,
 		    ("NULL bp->data in g_io_request(cmd=%hu)", bp->bio_cmd));
 	}
 	if (cmd == BIO_DELETE || cmd == BIO_FLUSH) {
 		KASSERT(bp->bio_data == NULL,
 		    ("non-NULL bp->data in g_io_request(cmd=%hu)",
 		    bp->bio_cmd));
 	}
 	if (cmd == BIO_READ || cmd == BIO_WRITE || cmd == BIO_DELETE) {
 		KASSERT(bp->bio_offset % cp->provider->sectorsize == 0,
 		    ("wrong offset %jd for sectorsize %u",
 		    bp->bio_offset, cp->provider->sectorsize));
 		KASSERT(bp->bio_length % cp->provider->sectorsize == 0,
 		    ("wrong length %jd for sectorsize %u",
 		    bp->bio_length, cp->provider->sectorsize));
 	}
 
 	g_trace(G_T_BIO, "bio_request(%p) from %p(%s) to %p(%s) cmd %d",
 	    bp, cp, cp->geom->name, pp, pp->name, bp->bio_cmd);
 
 	bp->bio_from = cp;
 	bp->bio_to = pp;
 	bp->bio_error = 0;
 	bp->bio_completed = 0;
 
 	KASSERT(!(bp->bio_flags & BIO_ONQUEUE),
 	    ("Bio already on queue bp=%p", bp));
+
 	/* Use binuptime() when stats are collected, getbinuptime() otherwise. */
 	if ((g_collectstats & G_STATS_CONSUMERS) != 0 ||
 	    ((g_collectstats & G_STATS_PROVIDERS) != 0 && pp->stat != NULL))
 		binuptime(&bp->bio_t0);
 	else
 		getbinuptime(&bp->bio_t0);
+	if (g_collectstats & G_STATS_CONSUMERS)
+		devstat_start_transaction(cp->stat, &bp->bio_t0);
+	if (g_collectstats & G_STATS_PROVIDERS)
+		devstat_start_transaction(pp->stat, &bp->bio_t0);
+#ifdef INVARIANTS
+	atomic_add_int(&cp->nstart, 1);
+#endif
 
 #ifdef GET_STACK_USAGE
 	/*
 	 * Direct dispatch requires both ends to allow it, a non-GEOM thread,
 	 * no need to sleep for a transient mapping, and no pacing in effect.
 	 */
 	direct = (cp->flags & G_CF_DIRECT_SEND) != 0 &&
 	    (pp->flags & G_PF_DIRECT_RECEIVE) != 0 &&
 	    !g_is_geom_thread(curthread) &&
 	    ((pp->flags & G_PF_ACCEPT_UNMAPPED) != 0 ||
 	    (bp->bio_flags & BIO_UNMAPPED) == 0 || THREAD_CAN_SLEEP()) &&
 	    pace == 0;
 	if (direct) {
 		/* Block direct execution if less than half of stack left. */
 		size_t	st, su;
 		GET_STACK_USAGE(st, su);
 		if (su * 2 > st)
 			direct = 0;
 	}
 #else
 	direct = 0;
 #endif
-
-	/*
-	 * The statistics collection is lockless, as such, but we
-	 * can not update one instance of the statistics from more
-	 * than one thread at a time, so grab the lock first.
-	 */
-	mtxp = mtx_pool_find(mtxpool_sleep, pp);
-	mtx_lock(mtxp);
-	if (g_collectstats & G_STATS_PROVIDERS)
-		devstat_start_transaction(pp->stat, &bp->bio_t0);
-	if (g_collectstats & G_STATS_CONSUMERS)
-		devstat_start_transaction(cp->stat, &bp->bio_t0);
-#ifdef INVARIANTS
-	cp->nstart++;
-#endif
-	mtx_unlock(mtxp);
 
 	if (direct) {
 		/* A result >= 0 from g_io_check() completes the bio right here. */
 		error = g_io_check(bp);
 		if (error >= 0) {
 			CTR3(KTR_GEOM, "g_io_request g_io_check on bp %p "
 			    "provider %s returned %d", bp, bp->bio_to->name,
 			    error);
 			g_io_deliver(bp, error);
 			return;
 		}
 		bp->bio_to->geom->start(bp);
 	} else {
 		g_bioq_lock(&g_bio_run_down);
 		first = TAILQ_EMPTY(&g_bio_run_down.bio_queue);
 		TAILQ_INSERT_TAIL(&g_bio_run_down.bio_queue, bp, bio_queue);
 		bp->bio_flags |= BIO_ONQUEUE;
 		g_bio_run_down.bio_queue_length++;
 		g_bioq_unlock(&g_bio_run_down);
 		/* Pass it on down. */
 		if (first)
 			wakeup(&g_wait_down);
 	}
 }
 
 /*
  * Complete bio bp with the given error.
  *
  * A bio with no originating consumer has bio_error set and its bio_done
  * callback invoked immediately.  Otherwise the devstat transactions are
  * ended and, unless error is ENOMEM, the bio is finished either directly
  * through biodone() or by queueing it on g_bio_run_up for
  * g_io_schedule_up().  On ENOMEM the bio's child and driver state is reset,
  * the request is re-issued through g_io_request(), and pace is set.
  */
 void
 g_io_deliver(struct bio *bp, int error)
 {
 	struct bintime now;
 	struct g_consumer *cp;
 	struct g_provider *pp;
 	struct mtx *mtxp;
 	int direct, first;
 
 	biotrack(bp, __func__);
 
 	KASSERT(bp != NULL, ("NULL bp in g_io_deliver"));
 	pp = bp->bio_to;
 	KASSERT(pp != NULL, ("NULL bio_to in g_io_deliver"));
 	cp = bp->bio_from;
 	if (cp == NULL) {
 		bp->bio_error = error;
 		bp->bio_done(bp);
 		return;
 	}
 	KASSERT(cp != NULL, ("NULL bio_from in g_io_deliver"));
 	KASSERT(cp->geom != NULL, ("NULL bio_from->geom in g_io_deliver"));
 #ifdef DIAGNOSTIC
 	/*
 	 * Some classes - GJournal in particular - can modify bio's
 	 * private fields while the bio is in transit; G_GEOM_VOLATILE_BIO
 	 * flag means it's an expected behaviour for that particular geom.
 	 */
 	if ((cp->geom->flags & G_GEOM_VOLATILE_BIO) == 0) {
 		KASSERT(bp->bio_caller1 == bp->_bio_caller1,
 		    ("bio_caller1 used by the provider %s", pp->name));
 		KASSERT(bp->bio_caller2 == bp->_bio_caller2,
 		    ("bio_caller2 used by the provider %s", pp->name));
 		KASSERT(bp->bio_cflags == bp->_bio_cflags,
 		    ("bio_cflags used by the provider %s", pp->name));
 	}
 #endif
 	KASSERT(bp->bio_completed >= 0, ("bio_completed can't be less than 0"));
 	KASSERT(bp->bio_completed <= bp->bio_length,
 	    ("bio_completed can't be greater than bio_length"));
 
 	g_trace(G_T_BIO,
 "g_io_deliver(%p) from %p(%s) to %p(%s) cmd %d error %d off %jd len %jd",
 	    bp, cp, cp->geom->name, pp, pp->name, bp->bio_cmd, error,
 	    (intmax_t)bp->bio_offset, (intmax_t)bp->bio_length);
 
 	KASSERT(!(bp->bio_flags & BIO_ONQUEUE),
 	    ("Bio already on queue bp=%p", bp));
 
 	/*
 	 * XXX: the next two don't belong here
 	 */
 	bp->bio_bcount = bp->bio_length;
 	bp->bio_resid = bp->bio_bcount - bp->bio_completed;
 
 #ifdef GET_STACK_USAGE
 	direct = (pp->flags & G_PF_DIRECT_SEND) &&
 		 (cp->flags & G_CF_DIRECT_RECEIVE) &&
 		 !g_is_geom_thread(curthread);
 	if (direct) {
 		/* Block direct execution if less than half of stack left. */
 		size_t	st, su;
 		GET_STACK_USAGE(st, su);
 		if (su * 2 > st)
 			direct = 0;
 	}
 #else
 	direct = 0;
 #endif
 
 	/*
 	 * The statistics collection is lockless, as such, but we
 	 * can not update one instance of the statistics from more
 	 * than one thread at a time, so grab the lock first.
 	 */
 	/* "now" is only sampled when some statistics will consume it. */
 	if ((g_collectstats & G_STATS_CONSUMERS) != 0 ||
 	    ((g_collectstats & G_STATS_PROVIDERS) != 0 && pp->stat != NULL))
 		binuptime(&now);
 	mtxp = mtx_pool_find(mtxpool_sleep, cp);
 	mtx_lock(mtxp);
 	if (g_collectstats & G_STATS_PROVIDERS)
 		devstat_end_transaction_bio_bt(pp->stat, bp, &now);
 	if (g_collectstats & G_STATS_CONSUMERS)
 		devstat_end_transaction_bio_bt(cp->stat, bp, &now);
 #ifdef INVARIANTS
 	cp->nend++;
 #endif
 	mtx_unlock(mtxp);
 
 	/* Normal completion: everything except ENOMEM finishes the bio. */
 	if (error != ENOMEM) {
 		bp->bio_error = error;
 		if (direct) {
 			biodone(bp);
 		} else {
 			g_bioq_lock(&g_bio_run_up);
 			first = TAILQ_EMPTY(&g_bio_run_up.bio_queue);
 			TAILQ_INSERT_TAIL(&g_bio_run_up.bio_queue, bp, bio_queue);
 			bp->bio_flags |= BIO_ONQUEUE;
 			g_bio_run_up.bio_queue_length++;
 			g_bioq_unlock(&g_bio_run_up);
 			if (first)
 				wakeup(&g_wait_up);
 		}
 		return;
 	}
 
 	/* ENOMEM: reset provider-side state, retry the request and pace. */
 	if (bootverbose)
 		printf("ENOMEM %p on %p(%s)\n", bp, pp, pp->name);
 	bp->bio_children = 0;
 	bp->bio_inbed = 0;
 	bp->bio_driver1 = NULL;
 	bp->bio_driver2 = NULL;
 	bp->bio_pflags = 0;
 	g_io_request(bp, cp);
 	pace = 1;
 	return;
 }
 
 SYSCTL_DECL(_kern_geom);
 
 /*
  * Counters and the retry tunable for transient mappings, exported under
  * kern.geom.  They are updated and read by g_io_transient_map_bio().
  */
 static long transient_maps;
 SYSCTL_LONG(_kern_geom, OID_AUTO, transient_maps, CTLFLAG_RD,
     &transient_maps, 0,
     "Total count of the transient mapping requests");
 u_int transient_map_retries = 10;
 SYSCTL_UINT(_kern_geom, OID_AUTO, transient_map_retries, CTLFLAG_RW,
     &transient_map_retries, 0,
     "Max count of retries used before giving up on creating transient map");
 int transient_map_hard_failures;
 SYSCTL_INT(_kern_geom, OID_AUTO, transient_map_hard_failures, CTLFLAG_RD,
     &transient_map_hard_failures, 0,
     "Failures to establish the transient mapping due to retry attempts "
     "exhausted");
 int transient_map_soft_failures;
 SYSCTL_INT(_kern_geom, OID_AUTO, transient_map_soft_failures, CTLFLAG_RD,
     &transient_map_soft_failures, 0,
     "Count of retried failures to establish the transient mapping");
 int inflight_transient_maps;
 SYSCTL_INT(_kern_geom, OID_AUTO, inflight_transient_maps, CTLFLAG_RD,
     &inflight_transient_maps, 0,
     "Current count of the active transient maps");
 
 /*
  * Give an unmapped bio a temporary kernel mapping.
  *
  * Address space is taken from transient_arena without sleeping in the
  * allocator; on failure the function pauses for hz / 10 ticks and retries.
  * When transient_map_retries is non-zero and that many retries have been
  * made, EDEADLK is returned.  On success the bio's pages are entered at
  * the new address, bio_data is pointed at it, BIO_TRANSIENT_MAPPING is set
  * and BIO_UNMAPPED is cleared, and EJUSTRETURN is returned.
  */
 static int
 g_io_transient_map_bio(struct bio *bp)
 {
 	vm_offset_t addr;
 	long size;
 	u_int retried;
 
 	KASSERT(unmapped_buf_allowed, ("unmapped disabled"));
 
 	size = round_page(bp->bio_ma_offset + bp->bio_length);
 	KASSERT(size / PAGE_SIZE == bp->bio_ma_n, ("Bio too short %p", bp));
 	addr = 0;
 	retried = 0;
 	atomic_add_long(&transient_maps, 1);
 retry:
 	if (vmem_alloc(transient_arena, size, M_BESTFIT | M_NOWAIT, &addr)) {
 		/* A retry limit of 0 means keep retrying without bound. */
 		if (transient_map_retries != 0 &&
 		    retried >= transient_map_retries) {
 			CTR2(KTR_GEOM, "g_down cannot map bp %p provider %s",
 			    bp, bp->bio_to->name);
 			atomic_add_int(&transient_map_hard_failures, 1);
 			return (EDEADLK/* XXXKIB */);
 		} else {
 			/*
 			 * Naive attempt to quiesce the I/O to get more
 			 * in-flight requests completed and defragment
 			 * the transient_arena.
 			 */
 			CTR3(KTR_GEOM, "g_down retrymap bp %p provider %s r %d",
 			    bp, bp->bio_to->name, retried);
 			pause("g_d_tra", hz / 10);
 			retried++;
 			atomic_add_int(&transient_map_soft_failures, 1);
 			goto retry;
 		}
 	}
 	atomic_add_int(&inflight_transient_maps, 1);
 	pmap_qenter((vm_offset_t)addr, bp->bio_ma, OFF_TO_IDX(size));
 	bp->bio_data = (caddr_t)addr + bp->bio_ma_offset;
 	bp->bio_flags |= BIO_TRANSIENT_MAPPING;
 	bp->bio_flags &= ~BIO_UNMAPPED;
 	return (EJUSTRETURN);
 }
 
 /*
  * Service loop for the down queue; it never returns.
  *
  * Each iteration takes one bio off g_bio_run_down, sleeping on
  * g_wait_down while the queue is empty.  If pace is set the loop pauses
  * once and clears it.  The bio is then validated with g_io_check(): a
  * result >= 0 is delivered immediately, otherwise the provider geom's
  * start routine is called with sleeping disallowed.
  */
 void
 g_io_schedule_down(struct thread *tp __unused)
 {
 	struct bio *bp;
 	int error;
 
 	for(;;) {
 		g_bioq_lock(&g_bio_run_down);
 		bp = g_bioq_first(&g_bio_run_down);
 		if (bp == NULL) {
 			CTR0(KTR_GEOM, "g_down going to sleep");
 			/* PDROP: the queue lock is not held when msleep returns. */
 			msleep(&g_wait_down, &g_bio_run_down.bio_queue_lock,
 			    PRIBIO | PDROP, "-", 0);
 			continue;
 		}
 		CTR0(KTR_GEOM, "g_down has work to do");
 		g_bioq_unlock(&g_bio_run_down);
 		biotrack(bp, __func__);
 		if (pace != 0) {
 			/*
 			 * There has been at least one memory allocation
 			 * failure since the last I/O completed. Pause 1ms to
 			 * give the system a chance to free up memory. We only
 			 * do this once because a large number of allocations
 			 * can fail in the direct dispatch case and there's no
 			 * relationship between the number of these failures and
 			 * the length of the outage. If there's still an outage,
 			 * we'll pause again and again until it's
 			 * resolved. Older versions paused longer and once per
 			 * allocation failure. This was OK for a single threaded
 			 * g_down, but with direct dispatch would lead to max of
 			 * 10 IOPs for minutes at a time when transient memory
 			 * issues prevented allocation for a batch of requests
 			 * from the upper layers.
 			 *
 			 * XXX This pacing is really lame. It needs to be solved
 			 * by other methods. This is OK only because the worst
 			 * case scenario is so rare. In the worst case scenario
 			 * all memory is tied up waiting for I/O to complete
 			 * which can never happen since we can't allocate bios
 			 * for that I/O.
 			 */
 			CTR0(KTR_GEOM, "g_down pacing self");
 			/*
 			 * hz / 1000 ticks is about 1ms; max() keeps the
 			 * timeout at one tick or more when hz < 1000.
 			 */
 			pause("g_down", max(hz / 1000, 1));
 			pace = 0;
 		}
 		CTR2(KTR_GEOM, "g_down processing bp %p provider %s", bp,
 		    bp->bio_to->name);
 		error = g_io_check(bp);
 		if (error >= 0) {
 			CTR3(KTR_GEOM, "g_down g_io_check on bp %p provider "
 			    "%s returned %d", bp, bp->bio_to->name, error);
 			g_io_deliver(bp, error);
 			continue;
 		}
 		/* The provider's start routine runs with sleeping disallowed. */
 		THREAD_NO_SLEEPING();
 		CTR4(KTR_GEOM, "g_down starting bp %p provider %s off %ld "
 		    "len %ld", bp, bp->bio_to->name, bp->bio_offset,
 		    bp->bio_length);
 		bp->bio_to->geom->start(bp);
 		THREAD_SLEEPING_OK();
 	}
 }
 
 /*
  * Service loop for the up queue; it never returns.
  *
  * Each iteration takes one bio off g_bio_run_up, sleeping on g_wait_up
  * while the queue is empty, and completes it with biodone() while
  * sleeping is disallowed.
  */
 void
 g_io_schedule_up(struct thread *tp __unused)
 {
 	struct bio *bp;
 
 	for(;;) {
 		g_bioq_lock(&g_bio_run_up);
 		bp = g_bioq_first(&g_bio_run_up);
 		if (bp == NULL) {
 			CTR0(KTR_GEOM, "g_up going to sleep");
 			/* PDROP: the queue lock is not held when msleep returns. */
 			msleep(&g_wait_up, &g_bio_run_up.bio_queue_lock,
 			    PRIBIO | PDROP, "-", 0);
 			continue;
 		}
 		g_bioq_unlock(&g_bio_run_up);
 		THREAD_NO_SLEEPING();
 		CTR4(KTR_GEOM, "g_up biodone bp %p provider %s off "
 		    "%jd len %ld", bp, bp->bio_to->name,
 		    bp->bio_offset, bp->bio_length);
 		biodone(bp);
 		THREAD_SLEEPING_OK();
 	}
 }
 
 /*
  * Synchronously read length bytes at offset through consumer cp into a
  * buffer obtained from g_malloc().
  *
  * Returns the buffer on success; the caller then owns it.  On failure the
  * buffer is freed and NULL is returned.  When error is not NULL the
  * biowait() result is stored in *error in both cases.
  */
 void *
 g_read_data(struct g_consumer *cp, off_t offset, off_t length, int *error)
 {
 	struct bio *req;
 	void *buf;
 	int rc;
 
 	KASSERT(length > 0 && length >= cp->provider->sectorsize &&
 	    length <= MAXPHYS, ("g_read_data(): invalid length %jd",
 	    (intmax_t)length));
 
 	req = g_alloc_bio();
 	req->bio_cmd = BIO_READ;
 	req->bio_done = NULL;
 	req->bio_offset = offset;
 	req->bio_length = length;
 	buf = g_malloc(length, M_WAITOK);
 	req->bio_data = buf;
 
 	g_io_request(req, cp);
 	rc = biowait(req, "gread");
 	if (error != NULL)
 		*error = rc;
 	g_destroy_bio(req);
 
 	if (rc == 0)
 		return (buf);
 	g_free(buf);
 	return (NULL);
 }
 
 /*
  * Read callback for ffs_sbget when it is used from GEOM-layer code.
  * devfd is the g_consumer to read from.  On success *bufp receives the
  * buffer returned by g_read_data() and 0 is returned; ENOENT is returned
  * for an unaligned location or a failed read.
  */
 int
 g_use_g_read_data(void *devfd, off_t loc, void **bufp, int size)
 {
 	struct g_consumer *cons;
 
 	KASSERT(*bufp == NULL,
 	    ("g_use_g_read_data: non-NULL *bufp %p\n", *bufp));
 
 	cons = (struct g_consumer *)devfd;
 	/*
 	 * Never issue an invalid I/O request: a superblock candidate whose
 	 * offset is not a multiple of the provider's sector size cannot
 	 * belong to an FFS on this provider anyway.
 	 */
 	if (loc % cons->provider->sectorsize != 0)
 		return (ENOENT);
 
 	*bufp = g_read_data(cons, loc, size, NULL);
 	return (*bufp != NULL ? 0 : ENOENT);
 }
 
 /*
  * Synchronously write length bytes from ptr at offset through consumer
  * cp.  Returns the biowait() result.
  */
 int
 g_write_data(struct g_consumer *cp, off_t offset, void *ptr, off_t length)
 {
 	struct bio *req;
 	int rc;
 
 	KASSERT(length > 0 && length >= cp->provider->sectorsize &&
 	    length <= MAXPHYS, ("g_write_data(): invalid length %jd",
 	    (intmax_t)length));
 
 	req = g_alloc_bio();
 	req->bio_cmd = BIO_WRITE;
 	req->bio_data = ptr;
 	req->bio_offset = offset;
 	req->bio_length = length;
 	req->bio_done = NULL;
 
 	g_io_request(req, cp);
 	rc = biowait(req, "gwrite");
 	g_destroy_bio(req);
 	return (rc);
 }
 
 /*
  * A write function for use by ffs_sbput when used by GEOM-layer routines.
  * devfd is treated as the g_consumer to write through; the arguments and
  * the return value are those of g_write_data().
  */
 int
 g_use_g_write_data(void *devfd, off_t loc, void *buf, int size)
 {
 
 	return (g_write_data((struct g_consumer *)devfd, loc, buf, size));
 }
 
 /*
  * Synchronously issue a BIO_DELETE for length bytes at offset through
  * consumer cp.  Returns the biowait() result.
  */
 int
 g_delete_data(struct g_consumer *cp, off_t offset, off_t length)
 {
 	struct bio *req;
 	int rc;
 
 	KASSERT(length > 0 && length >= cp->provider->sectorsize,
 	    ("g_delete_data(): invalid length %jd", (intmax_t)length));
 
 	req = g_alloc_bio();
 	req->bio_cmd = BIO_DELETE;
 	/* A delete carries no data buffer. */
 	req->bio_data = NULL;
 	req->bio_offset = offset;
 	req->bio_length = length;
 	req->bio_done = NULL;
 
 	g_io_request(req, cp);
 	rc = biowait(req, "gdelete");
 	g_destroy_bio(req);
 	return (rc);
 }
 
 /*
  * Print a one-line description of a bio.
  *
  * The output is prefix, followed by the text produced by g_format_bio(),
  * followed by the printf-style fmtsuffix and its arguments.  It is built
  * in a fixed-length on-stack sbuf whose drain is sbuf_printf_drain.
  */
 void
 g_print_bio(const char *prefix, const struct bio *bp, const char *fmtsuffix,
     ...)
 {
 	/* Fallback buffer size when the build does not define one. */
 #ifndef PRINTF_BUFR_SIZE
 #define PRINTF_BUFR_SIZE 64
 #endif
 	char bufr[PRINTF_BUFR_SIZE];
 	struct sbuf sb, *sbp __unused;
 	va_list ap;
 
 	sbp = sbuf_new(&sb, bufr, sizeof(bufr), SBUF_FIXEDLEN);
 	KASSERT(sbp != NULL, ("sbuf_new misused?"));
 
 	sbuf_set_drain(&sb, sbuf_printf_drain, NULL);
 
 	sbuf_cat(&sb, prefix);
 	g_format_bio(&sb, bp);
 
 	va_start(ap, fmtsuffix);
 	sbuf_vprintf(&sb, fmtsuffix, ap);
 	va_end(ap);
 
 	sbuf_nl_terminate(&sb);
 
 	sbuf_finish(&sb);
 	sbuf_delete(&sb);
 }
 
 /*
  * Append a textual description of bp to sb in the form
  * "provider[COMMAND...]".  READ, WRITE and DELETE include offset and
  * length, GETATTR includes the attribute name, ZONE includes the zone
  * sub-command, and any other command is printed as UNKNOWN.  The provider
  * name is "[unknown]" when the bio has no destination provider.
  */
 void
 g_format_bio(struct sbuf *sb, const struct bio *bp)
 {
 	const char *provname, *op, *zoneop;
 
 	provname = (bp->bio_to != NULL) ? bp->bio_to->name : "[unknown]";
 
 	/* Commands without an offset/length are printed and returned here. */
 	switch (bp->bio_cmd) {
 	case BIO_READ:
 		op = "READ";
 		break;
 	case BIO_WRITE:
 		op = "WRITE";
 		break;
 	case BIO_DELETE:
 		op = "DELETE";
 		break;
 	case BIO_GETATTR:
 		sbuf_printf(sb, "%s[%s(attr=%s)]", provname, "GETATTR",
 		    bp->bio_attribute);
 		return;
 	case BIO_FLUSH:
 		sbuf_printf(sb, "%s[%s]", provname, "FLUSH");
 		return;
 	case BIO_ZONE:
 		switch (bp->bio_zone.zone_cmd) {
 		case DISK_ZONE_OPEN:
 			zoneop = "OPEN";
 			break;
 		case DISK_ZONE_CLOSE:
 			zoneop = "CLOSE";
 			break;
 		case DISK_ZONE_FINISH:
 			zoneop = "FINISH";
 			break;
 		case DISK_ZONE_RWP:
 			zoneop = "RWP";
 			break;
 		case DISK_ZONE_REPORT_ZONES:
 			zoneop = "REPORT ZONES";
 			break;
 		case DISK_ZONE_GET_PARAMS:
 			zoneop = "GET PARAMS";
 			break;
 		default:
 			zoneop = "UNKNOWN";
 			break;
 		}
 		sbuf_printf(sb, "%s[%s,%s]", provname, "ZONE", zoneop);
 		return;
 	default:
 		sbuf_printf(sb, "%s[%s()]", provname, "UNKNOWN");
 		return;
 	}
 
 	/* READ, WRITE and DELETE share the offset/length form. */
 	sbuf_printf(sb, "%s[%s(offset=%jd, length=%jd)]", provname, op,
 	    (intmax_t)bp->bio_offset, (intmax_t)bp->bio_length);
 }
Index: head/sys/kern/subr_devstat.c
===================================================================
--- head/sys/kern/subr_devstat.c	(revision 356199)
+++ head/sys/kern/subr_devstat.c	(revision 356200)
@@ -1,587 +1,582 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1997, 1998, 1999 Kenneth D. Merry.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. The name of the author may not be used to endorse or promote products
  *    derived from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/systm.h>
 #include <sys/bio.h>
 #include <sys/devicestat.h>
 #include <sys/sdt.h>
 #include <sys/sysctl.h>
 #include <sys/malloc.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/conf.h>
 #include <vm/vm.h>
 #include <vm/pmap.h>
 
 #include <machine/atomic.h>
 
 /*
  * Statically defined tracing probes of the "io" provider.  Each probe
  * takes a struct bio * and a struct devstat * argument.
  */
 SDT_PROVIDER_DEFINE(io);
 
 SDT_PROBE_DEFINE2(io, , , start, "struct bio *", "struct devstat *");
 SDT_PROBE_DEFINE2(io, , , done, "struct bio *", "struct devstat *");
 SDT_PROBE_DEFINE2(io, , , wait__start, "struct bio *",
     "struct devstat *");
 SDT_PROBE_DEFINE2(io, , , wait__done, "struct bio *",
     "struct devstat *");
 
 /*
  * Convenience wrappers that fire the probes above.  They take no macro
  * arguments: they expand to references to a variable named "ds" (and, for
  * the _BIO_ variants, one named "bp"), so they can only be used where
  * those names are in scope.  The non-_BIO_ variants pass NULL as the bio.
  */
 #define	DTRACE_DEVSTAT_START()		SDT_PROBE2(io, , , start, NULL, ds)
 #define	DTRACE_DEVSTAT_BIO_START()	SDT_PROBE2(io, , , start, bp, ds)
 #define	DTRACE_DEVSTAT_DONE()		SDT_PROBE2(io, , , done, NULL, ds)
 #define	DTRACE_DEVSTAT_BIO_DONE()	SDT_PROBE2(io, , , done, bp, ds)
 #define	DTRACE_DEVSTAT_WAIT_START()	SDT_PROBE2(io, , , wait__start, NULL, ds)
 #define	DTRACE_DEVSTAT_WAIT_DONE()	SDT_PROBE2(io, , , wait__done, NULL, ds)
 
 /* Number of entries currently linked on device_statq. */
 static int devstat_num_devs;
 /* Bumped every time an entry is created or removed. */
 static long devstat_generation = 1;
 /* Exported read-only through the kern.devstat.version sysctl. */
 static int devstat_version = DEVSTAT_VERSION;
 /* Next value handed out as a device_number by devstat_add_entry(). */
 static int devstat_current_devnumber;
 /*
  * Held while the device list, the statspage list and the counters above
  * are modified.  The per-transaction paths do not take it.
  */
 static struct mtx devstat_mutex;
 MTX_SYSINIT(devstat_mutex, &devstat_mutex, "devstat", MTX_DEF);
 
 /* List of registered devstat entries, kept sorted by priority. */
 static struct devstatlist device_statq = STAILQ_HEAD_INITIALIZER(device_statq);
 static struct devstat *devstat_alloc(void);
 static void devstat_free(struct devstat *);
 static void devstat_add_entry(struct devstat *ds, const void *dev_name, 
 		       int unit_number, uint32_t block_size,
 		       devstat_support_flags flags,
 		       devstat_type_flags device_type,
 		       devstat_priority priority);
 
 /*
  * Allocate a devstat structure and initialize it.
  *
  * A unit_number of -1 produces an entry that is not linked onto the
  * device list: only its unit number, id and creation time are set.  Any
  * other unit number hands the entry to devstat_add_entry(), which fills
  * it in and inserts it into the list.  Either way the generation count
  * is bumped while devstat_mutex is held.
  *
  * Must be called without devstat_mutex held.
  */
 struct devstat *
 devstat_new_entry(const void *dev_name, int unit_number, uint32_t block_size,
     devstat_support_flags flags, devstat_type_flags device_type,
     devstat_priority priority)
 {
 	struct devstat *entry;
 
 	mtx_assert(&devstat_mutex, MA_NOTOWNED);
 
 	entry = devstat_alloc();
 
 	mtx_lock(&devstat_mutex);
 	if (unit_number != -1) {
 		devstat_add_entry(entry, dev_name, unit_number, block_size,
 		    flags, device_type, priority);
 	} else {
 		entry->unit_number = unit_number;
 		entry->id = dev_name;
 		binuptime(&entry->creation_time);
 		devstat_generation++;
 	}
 	mtx_unlock(&devstat_mutex);
 
 	return (entry);
 }
 
 /*
  * Fill in an already allocated, zeroed devstat structure and link it onto
  * the list of devices.  The caller must hold devstat_mutex.
  *
  * The list is kept sorted by descending priority; among entries of equal
  * priority a new entry goes after the existing ones, i.e. in probe order.
  */
 static void
 devstat_add_entry(struct devstat *ds, const void *dev_name, 
 		  int unit_number, uint32_t block_size,
 		  devstat_support_flags flags,
 		  devstat_type_flags device_type,
 		  devstat_priority priority)
 {
 	struct devstat *cur, *nxt;
 
 	mtx_assert(&devstat_mutex, MA_OWNED);
 	devstat_num_devs++;
 
 	if (devstat_num_devs == 1) {
 		/* The very first device: there is nothing to sort against. */
 		STAILQ_INSERT_TAIL(&device_statq, ds, dev_links);
 	} else {
 		for (cur = STAILQ_FIRST(&device_statq); cur != NULL;
 		    cur = STAILQ_NEXT(cur, dev_links)) {
 			if (priority > cur->priority) {
 				/*
 				 * The new entry outranks the current one.
 				 * That is only expected for the head of the
 				 * list; anywhere else the list is not
 				 * sorted the way we assume, so complain and
 				 * append.
 				 */
 				if (cur == STAILQ_FIRST(&device_statq)) {
 					STAILQ_INSERT_HEAD(&device_statq, ds,
 					    dev_links);
 				} else {
 					STAILQ_INSERT_TAIL(&device_statq, ds,
 					    dev_links);
 					printf("devstat_add_entry: HELP! "
 					       "sorting problem detected "
 					       "for name %p unit %d\n",
 					       dev_name, unit_number);
 				}
 				break;
 			}
 
 			/*
 			 * Here priority <= cur->priority.  Insert after cur
 			 * if cur is the last entry or if the following entry
 			 * has a strictly lower priority than the new one.
 			 */
 			nxt = STAILQ_NEXT(cur, dev_links);
 			if (nxt == NULL || priority > nxt->priority) {
 				STAILQ_INSERT_AFTER(&device_statq, cur, ds,
 				    dev_links);
 				break;
 			}
 		}
 	}
 
 	/* Fill in the entry only once it has been linked. */
 	ds->device_number = devstat_current_devnumber++;
 	ds->unit_number = unit_number;
 	strlcpy(ds->device_name, dev_name, DEVSTAT_NAME_LEN);
 	ds->block_size = block_size;
 	ds->flags = flags;
 	ds->device_type = device_type;
 	ds->priority = priority;
 	binuptime(&ds->creation_time);
 	devstat_generation++;
 }
 
 /*
  * Unregister a devstat structure and give it back to the allocator.
  *
  * A NULL ds is ignored.  Entries created with a unit number of -1 were
  * never linked onto the device list, so they are only freed.  Must be
  * called without devstat_mutex held.
  */
 void
 devstat_remove_entry(struct devstat *ds)
 {
 
 	mtx_assert(&devstat_mutex, MA_NOTOWNED);
 	if (ds == NULL)
 		return;
 
 	mtx_lock(&devstat_mutex);
 
 	/* Bump sequence1 before the structure is unlinked and wiped. */
 	atomic_add_acq_int(&ds->sequence1, 1);
 
 	/* Only entries with a real unit number are on the queue. */
 	if (ds->unit_number != -1) {
 		devstat_num_devs--;
 		STAILQ_REMOVE(&device_statq, ds, devstat, dev_links);
 	}
 
 	devstat_free(ds);
 	devstat_generation++;
 
 	mtx_unlock(&devstat_mutex);
 }
 
 /*
  * Record a transaction start.
  *
  * ds:  the devstat to update; a NULL ds is silently ignored.
  * now: timestamp to store in busy_from when the device goes from idle to
  *      busy; when NULL, binuptime() is called to obtain one.
  *
  * The counter updates are bracketed by increments of sequence1 and
  * sequence0.  See comments for devstat_end_transaction().  Ordering is
  * very important here.
  */
 void
 devstat_start_transaction(struct devstat *ds, const struct bintime *now)
 {
 
-	mtx_assert(&devstat_mutex, MA_NOTOWNED);
-
 	/* sanity check */
 	if (ds == NULL)
 		return;
 
 	atomic_add_acq_int(&ds->sequence1, 1);
 	/*
 	 * We only want to set the start time when we are going from idle
 	 * to busy.  The start time is really the start of the latest busy
 	 * period.
 	 */
-	if (ds->start_count == ds->end_count) {
+	if (atomic_fetchadd_int(&ds->start_count, 1) == ds->end_count) {
 		if (now != NULL)
 			ds->busy_from = *now;
 		else
 			binuptime(&ds->busy_from);
 	}
-	ds->start_count++;
 	atomic_add_rel_int(&ds->sequence0, 1);
 	DTRACE_DEVSTAT_START();
 }
 
 /*
  * Record a transaction start for a bio.
  *
  * Stamps bp->bio_t0 with binuptime() and passes that timestamp on to
  * devstat_start_transaction(), then fires the io start probe with bp.
  * A NULL ds is ignored, in which case bp is left untouched.
  */
 void
 devstat_start_transaction_bio(struct devstat *ds, struct bio *bp)
 {
-
-	mtx_assert(&devstat_mutex, MA_NOTOWNED);
 
 	/* sanity check */
 	if (ds == NULL)
 		return;
 
 	binuptime(&bp->bio_t0);
 	devstat_start_transaction(ds, &bp->bio_t0);
 	DTRACE_DEVSTAT_BIO_START();
 }
 
 /*
  * Record the ending of a transaction, and increment the various counters.
  *
  * ds:       the devstat to update; a NULL ds is silently ignored.
  * bytes:    added to ds->bytes[flags].
  * tag_type: counted in ds->tag_types[] unless it is DEVSTAT_TAG_NONE or
  *           the device has DEVSTAT_NO_ORDERED_TAGS set.
  * flags:    the transaction kind; used as the index into the bytes,
  *           operations and duration arrays.
  * now:      completion time; when NULL, binuptime() is used.
  * then:     start time of the transaction; when NULL, the per-kind
  *           duration is not updated.
  *
  * Ordering in this function, and in devstat_start_transaction() is VERY
  * important.  The idea here is to run without locks, so we are very
  * careful to only modify some fields on the way "down" (i.e. at
  * transaction start) and some fields on the way "up" (i.e. at transaction
  * completion).  One exception is busy_from, which we only modify in
  * devstat_start_transaction() when there are no outstanding transactions,
  * and thus it can't be modified in devstat_end_transaction()
  * simultaneously.
  *
  * The sequence0 and sequence1 fields are provided to enable an application
  * spying on the structures with mmap(2) to tell when a structure is in a
  * consistent state or not.
  *
  * For this to work 100% reliably, it is important that the two fields
  * are at opposite ends of the structure and that they are incremented
  * in the opposite order of how a memcpy(3) in userland would copy them.
  * We assume that the copying happens front to back, but there is actually
  * no way short of writing your own memcpy(3) replacement to guarantee
  * this will be the case.
  *
  * In addition to this, being a kind of locks, they must be updated with
  * atomic instructions using appropriate memory barriers.
  */
 void
 devstat_end_transaction(struct devstat *ds, uint32_t bytes, 
 			devstat_tag_type tag_type, devstat_trans_flags flags,
 			const struct bintime *now, const struct bintime *then)
 {
 	struct bintime dt, lnow;
 
 	/* sanity check */
 	if (ds == NULL)
 		return;
 
 	/* Take the timestamp before entering the sequence-bracketed region. */
 	if (now == NULL) {
 		binuptime(&lnow);
 		now = &lnow;
 	}
 
 	atomic_add_acq_int(&ds->sequence1, 1);
 	/* Update byte and operations counts */
 	ds->bytes[flags] += bytes;
 	ds->operations[flags]++;
 
 	/*
 	 * Keep a count of the various tag types sent.
 	 */
 	if ((ds->flags & DEVSTAT_NO_ORDERED_TAGS) == 0 &&
 	    tag_type != DEVSTAT_TAG_NONE)
 		ds->tag_types[tag_type]++;
 
 	if (then != NULL) {
 		/* Update duration of operations */
 		dt = *now;
 		bintime_sub(&dt, then);
 		bintime_add(&ds->duration[flags], &dt);
 	}
 
 	/*
 	 * Accumulate busy time: add the interval since busy_from, then
 	 * restart the interval at "now".
 	 */
 	dt = *now;
 	bintime_sub(&dt, &ds->busy_from);
 	bintime_add(&ds->busy_time, &dt);
 	ds->busy_from = *now;
 
 	ds->end_count++;
 	atomic_add_rel_int(&ds->sequence0, 1);
 	DTRACE_DEVSTAT_DONE();
 }
 
 /*
  * Record the end of a bio's transaction.  Equivalent to
  * devstat_end_transaction_bio_bt() with a NULL completion time.
  */
 void
 devstat_end_transaction_bio(struct devstat *ds, const struct bio *bp)
 {
 
 	devstat_end_transaction_bio_bt(ds, bp, NULL);
 }
 
 /*
  * Record the end of a bio's transaction, using "now" as the completion
  * time (passed through unchanged, so it may be NULL).
  *
  * The tag type and transaction kind handed to devstat_end_transaction()
  * are derived from the bio: BIO_ORDERED selects the ordered tag, and the
  * command selects free/read/write/no-data accounting.  The byte count is
  * bio_bcount minus bio_resid and the start time is bio_t0.  A NULL ds is
  * ignored.
  */
 void
 devstat_end_transaction_bio_bt(struct devstat *ds, const struct bio *bp,
     const struct bintime *now)
 {
 	devstat_trans_flags kind;
 	devstat_tag_type tag;
 
 	/* sanity check */
 	if (ds == NULL)
 		return;
 
 	tag = (bp->bio_flags & BIO_ORDERED) ? DEVSTAT_TAG_ORDERED :
 	    DEVSTAT_TAG_SIMPLE;
 
 	switch (bp->bio_cmd) {
 	case BIO_DELETE:
 		kind = DEVSTAT_FREE;
 		break;
 	case BIO_READ:
 		kind = DEVSTAT_READ;
 		break;
 	case BIO_ZONE:
 		/* Of the zone commands only REPORT ZONES counts as a read. */
 		if (bp->bio_zone.zone_cmd == DISK_ZONE_REPORT_ZONES)
 			kind = DEVSTAT_READ;
 		else
 			kind = DEVSTAT_NO_DATA;
 		break;
 	case BIO_WRITE:
 		kind = DEVSTAT_WRITE;
 		break;
 	default:
 		kind = DEVSTAT_NO_DATA;
 		break;
 	}
 
 	devstat_end_transaction(ds, bp->bio_bcount - bp->bio_resid,
 	    tag, kind, now, &bp->bio_t0);
 	DTRACE_DEVSTAT_BIO_DONE();
 }
 
 /*
  * This is the sysctl handler for the devstat package.  The data pushed out
  * on the kern.devstat.all sysctl variable consists of the current devstat
  * generation number, and then an array of devstat structures, one for each
  * device in the system.
  *
  * This is more cryptic that obvious, but basically we neither can nor
  * want to hold the devstat_mutex for any amount of time, so we grab it
  * only when we need to and keep an eye on devstat_generation all the time.
  */
 static int
 sysctl_devstat(SYSCTL_HANDLER_ARGS)
 {
 	int error;
 	long mygen;
 	struct devstat *nds;
 
 	mtx_assert(&devstat_mutex, MA_NOTOWNED);
 
 	/*
 	 * XXX devstat_generation should really be "volatile" but that
 	 * XXX freaks out the sysctl macro below.  The places where we
 	 * XXX change it and inspect it are bracketed in the mutex which
 	 * XXX guarantees us proper write barriers.  I don't believe the
 	 * XXX compiler is allowed to optimize mygen away across calls
 	 * XXX to other functions, so the following is belived to be safe.
 	 */
 	mygen = devstat_generation;
 
 	error = SYSCTL_OUT(req, &mygen, sizeof(mygen));
 
 	if (devstat_num_devs == 0)
 		return(0);
 
 	if (error != 0)
 		return (error);
 
 	mtx_lock(&devstat_mutex);
 	nds = STAILQ_FIRST(&device_statq); 
 	if (mygen != devstat_generation)
 		error = EBUSY;
 	mtx_unlock(&devstat_mutex);
 
 	if (error != 0)
 		return (error);
 
 	for (;nds != NULL;) {
 		error = SYSCTL_OUT(req, nds, sizeof(struct devstat));
 		if (error != 0)
 			return (error);
 		mtx_lock(&devstat_mutex);
 		if (mygen != devstat_generation)
 			error = EBUSY;
 		else
 			nds = STAILQ_NEXT(nds, dev_links);
 		mtx_unlock(&devstat_mutex);
 		if (error != 0)
 			return (error);
 	}
 	return(error);
 }
 
 /*
  * Sysctl entries for devstat.  The first one is the kern.devstat node
  * that all the rest hang off of.
  */
 static SYSCTL_NODE(_kern, OID_AUTO, devstat, CTLFLAG_RD, NULL,
     "Device Statistics");
 
 /* kern.devstat.all: opaque, read-only dump produced by sysctl_devstat(). */
 SYSCTL_PROC(_kern_devstat, OID_AUTO, all, CTLFLAG_RD|CTLTYPE_OPAQUE,
     NULL, 0, sysctl_devstat, "S,devstat", "All devices in the devstat list");
 /*
  * Export the number of devices in the system so that userland utilities
  * can determine how much memory to allocate to hold all the devices.
  */
 SYSCTL_INT(_kern_devstat, OID_AUTO, numdevs, CTLFLAG_RD, 
     &devstat_num_devs, 0, "Number of devices in the devstat list");
 /* Read-only view of the generation counter bumped on every add/remove. */
 SYSCTL_LONG(_kern_devstat, OID_AUTO, generation, CTLFLAG_RD,
     &devstat_generation, 0, "Devstat list generation");
 /* Read-only view of devstat_version (initialized to DEVSTAT_VERSION). */
 SYSCTL_INT(_kern_devstat, OID_AUTO, version, CTLFLAG_RD, 
     &devstat_version, 0, "Devstat list version number");
 
 /*
  * Allocator for struct devstat structures.  We sub-allocate these from pages
  * which we get from malloc.  These pages are exported for mmap(2)'ing through
  * a miniature device driver.
  */
 
 /* Number of struct devstat slots that fit in one page. */
 #define statsperpage (PAGE_SIZE / sizeof(struct devstat))
 
 static d_mmap_t devstat_mmap;
 
 /* The device node only implements mmap. */
 static struct cdevsw devstat_cdevsw = {
 	.d_version =	D_VERSION,
 	.d_mmap =	devstat_mmap,
 	.d_name =	"devstat",
 };
 
 /* Bookkeeping for one page worth of devstat slots. */
 struct statspage {
 	TAILQ_ENTRY(statspage)	list;	/* linkage on pagelist */
 	struct devstat		*stat;	/* the PAGE_SIZE buffer of slots */
 	u_int			nfree;	/* slots in stat[] not handed out */
 };
 
 /*
  * All pages allocated so far.  The position of a page on this list is
  * what devstat_mmap() uses to translate a mapping offset into a page.
  */
 static TAILQ_HEAD(, statspage)	pagelist = TAILQ_HEAD_INITIALIZER(pagelist);
 static MALLOC_DEFINE(M_DEVSTAT, "devstat", "Device statistics");
 
 /*
  * mmap handler of the devstat device.  Only read-only mappings are
  * accepted.  The offset selects a statistics page by its position on
  * pagelist: offset 0 is the first page, PAGE_SIZE the second, and so on.
  * On success the physical address of the page is stored in *paddr and 0
  * is returned; otherwise -1 is returned.
  */
 static int
 devstat_mmap(struct cdev *dev, vm_ooffset_t offset, vm_paddr_t *paddr,
     int nprot, vm_memattr_t *memattr)
 {
 	struct statspage *page;
 	int rv;
 
 	if (nprot != VM_PROT_READ)
 		return (-1);
 
 	rv = -1;
 	mtx_lock(&devstat_mutex);
 	TAILQ_FOREACH(page, &pagelist, list) {
 		if (offset == 0) {
 			*paddr = vtophys(page->stat);
 			rv = 0;
 			break;
 		}
 		/* Not this page; step the offset down and try the next. */
 		offset -= PAGE_SIZE;
 	}
 	mtx_unlock(&devstat_mutex);
 
 	return (rv);
 }
 
 /*
  * Hand out one unused struct devstat slot, marked as allocated.
  *
  * Slots are carved out of the pages on pagelist; a new page is allocated
  * and appended when no existing page has a free slot.  The first call
  * also creates the device node through which the pages can be mapped.
  * Must be called without devstat_mutex held.
  */
 static struct devstat *
 devstat_alloc(void)
 {
 	struct devstat *dsp;
 	struct statspage *spp, *spp2;
 	u_int u;
 	static int once;
 
 	mtx_assert(&devstat_mutex, MA_NOTOWNED);
 	/* NOTE(review): "once" is tested and set without any locking. */
 	if (!once) {
 		make_dev_credf(MAKEDEV_ETERNAL | MAKEDEV_CHECKNAME,
 		    &devstat_cdevsw, 0, NULL, UID_ROOT, GID_WHEEL, 0444,
 		    DEVSTAT_DEVICE_NAME);
 		once = 1;
 	}
 	spp2 = NULL;
 	mtx_lock(&devstat_mutex);
 	for (;;) {
 		/* Look for an existing page that still has a free slot. */
 		TAILQ_FOREACH(spp, &pagelist, list) {
 			if (spp->nfree > 0)
 				break;
 		}
 		if (spp != NULL)
 			break;
 		/*
 		 * None found.  Drop the mutex around the M_WAITOK
 		 * allocations (presumably because they may sleep) and
 		 * prepare a fresh, zeroed page.
 		 */
 		mtx_unlock(&devstat_mutex);
 		spp2 = malloc(sizeof *spp, M_DEVSTAT, M_ZERO | M_WAITOK);
 		spp2->stat = malloc(PAGE_SIZE, M_DEVSTAT, M_ZERO | M_WAITOK);
 		spp2->nfree = statsperpage;
 
 		/*
 		 * If free statspages were added while the lock was released
 		 * just reuse them.
 		 */
 		mtx_lock(&devstat_mutex);
 		TAILQ_FOREACH(spp, &pagelist, list)
 			if (spp->nfree > 0)
 				break;
 		if (spp == NULL) {
 			spp = spp2;
 
 			/*
 			 * It would make more sense to add the new page at the
 			 * head but the order on the list determine the
 			 * sequence of the mapping so we can't do that.
 			 */
 			TAILQ_INSERT_TAIL(&pagelist, spp, list);
 			/*
 			 * No break here: the loop runs once more and the
 			 * search at its top picks a page with free slots.
 			 */
 		} else
 			break;
 	}
 	/* Find the first slot of the chosen page that is not in use. */
 	dsp = spp->stat;
 	for (u = 0; u < statsperpage; u++) {
 		if (dsp->allocated == 0)
 			break;
 		dsp++;
 	}
 	spp->nfree--;
 	dsp->allocated = 1;
 	mtx_unlock(&devstat_mutex);
 	/* A page that was prepared but turned out not to be needed is freed. */
 	if (spp2 != NULL && spp2 != spp) {
 		free(spp2->stat, M_DEVSTAT);
 		free(spp2, M_DEVSTAT);
 	}
 	return (dsp);
 }
 
 /*
  * Return a devstat slot to the allocator.  The slot is zeroed, which also
  * clears its "allocated" mark, and the free count of the page containing
  * it is incremented.  The caller must hold devstat_mutex.
  */
 static void
 devstat_free(struct devstat *dsp)
 {
 	struct statspage *page;
 
 	mtx_assert(&devstat_mutex, MA_OWNED);
 
 	bzero(dsp, sizeof *dsp);
 
 	TAILQ_FOREACH(page, &pagelist, list) {
 		/* Skip pages whose slot array does not contain dsp. */
 		if (dsp < page->stat || dsp >= (page->stat + statsperpage))
 			continue;
 		page->nfree++;
 		break;
 	}
 }
 
 /* Export sizeof(struct devstat) as a read-only sysctl under debug.sizeof. */
 SYSCTL_INT(_debug_sizeof, OID_AUTO, devstat, CTLFLAG_RD,
     SYSCTL_NULL_INT_PTR, sizeof(struct devstat), "sizeof(struct devstat)");