Index: head/sys/cam/cam_iosched.c
===================================================================
--- head/sys/cam/cam_iosched.c	(revision 333433)
+++ head/sys/cam/cam_iosched.c	(revision 333434)
@@ -1,1762 +1,1761 @@
 /*-
  * CAM IO Scheduler Interface
  *
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2015 Netflix, Inc.
- * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions, and the following disclaimer,
  *    without modification, immediately at the beginning of the file.
  * 2. The name of the author may not be used to endorse or promote products
  *    derived from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #include "opt_cam.h"
 #include "opt_ddb.h"
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/bio.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/sbuf.h>
 #include <sys/sysctl.h>
 
 #include <cam/cam.h>
 #include <cam/cam_ccb.h>
 #include <cam/cam_periph.h>
 #include <cam/cam_xpt_periph.h>
 #include <cam/cam_xpt_internal.h>
 #include <cam/cam_iosched.h>
 
 #include <ddb/ddb.h>
 
 static MALLOC_DEFINE(M_CAMSCHED, "CAM I/O Scheduler",
     "CAM I/O Scheduler buffers");
 
 /*
  * Default I/O scheduler for FreeBSD. This implementation is just a thin veneer
  * over the bioq_* interface, with notions of separate calls for normal I/O and
  * for trims.
  *
  * When CAM_IOSCHED_DYNAMIC is defined, the scheduler is enhanced to dynamically
  * steer the rate of one type of traffic to help other types of traffic (eg
  * limit writes when read latency deteriorates on SSDs).
  */
 
 #ifdef CAM_IOSCHED_DYNAMIC
 
 /*
  * Master enable for the dynamic scheduler code paths in this file.  It can
  * be set through the kern.cam.do_dynamic_iosched tunable; the matching
  * sysctl is read-only (CTLFLAG_RD).
  */
 static int do_dynamic_iosched = 1;
 TUNABLE_INT("kern.cam.do_dynamic_iosched", &do_dynamic_iosched);
 SYSCTL_INT(_kern_cam, OID_AUTO, do_dynamic_iosched, CTLFLAG_RD,
     &do_dynamic_iosched, 1,
     "Enable Dynamic I/O scheduler optimizations.");
 
 /*
  * For an EMA, with an alpha of alpha, we know
  * 	alpha = 2 / (N + 1)
  * or
  * 	N = (2 / alpha) - 1
  * where N is the number of samples that 86% of the current
  * EMA is derived from.
  *
  * So we invent[*] alpha_bits:
  *	alpha_bits = -log_2(alpha)
  *	alpha = 2^-alpha_bits
  * So
  *	N = 2^(alpha_bits + 1) - 1
  *
  * The default 9 gives a 1023 lookback for 86% of the data.
  * For a brief intro: https://en.wikipedia.org/wiki/Moving_average
  *
  * [*] Steal from the load average code and many other places.
  * Note: See computation of EMA and EMVAR for acceptable ranges of alpha.
  */
 /*
  * Number of bits in the EMA's alpha (alpha = 2^-alpha_bits).  Available both
  * as the kern.cam.iosched_alpha_bits tunable and as a read-write sysctl.
  */
 static int alpha_bits = 9;
 TUNABLE_INT("kern.cam.iosched_alpha_bits", &alpha_bits);
 SYSCTL_INT(_kern_cam, OID_AUTO, iosched_alpha_bits, CTLFLAG_RW,
     &alpha_bits, 1,
     "Bits in EMA's alpha.");
 
 /* Forward declarations; both structures are defined further down. */
 struct iop_stats;
 struct cam_iosched_softc;
 
 /*
  * Debug verbosity.  The diagnostic printfs in this file are gated on this
  * variable: some fire for any non-zero value, and the per-call tracing in
  * cam_iosched_has_io() and cam_iosched_has_work() fires for values above 2.
  */
 int iosched_debug = 0;
 
 typedef enum {
 	none = 0,				/* No limits */
 	queue_depth,			/* Limit how many ops we queue to SIM */
 	iops,				/* Limit # of IOPS to the drive */
 	bandwidth,			/* Limit bandwidth to the drive */
 	limiter_max
 } io_limiter;
 
 static const char *cam_iosched_limiter_names[] =
     { "none", "queue_depth", "iops", "bandwidth" };
 
 /*
  * Called to initialize the bits of the iop_stats structure relevant to the
  * limiter. Called just after the limiter is set. Returns 0 on success; the
  * limiter sysctl handler treats any other value as an error and restores the
  * previous limiter.
  */
 typedef int l_init_t(struct iop_stats *);
 
 /*
  * Called every tick, i.e. from cam_iosched_ticker() each time the ticker
  * callout fires. The ticker ignores the return value.
  */
 typedef int l_tick_t(struct iop_stats *);
 
 /*
  * Called to see if the limiter thinks this IOP can be allowed to
  * proceed. If so, the limiter assumes that the IOP proceeded
  * and makes any accounting of it that's needed. The implementations in this
  * file return 0 to allow the IOP and EAGAIN to hold it back. The same
  * signature is used for the l_caniop hook, which answers the same question
  * without doing the accounting.
  */
 typedef int l_iop_t(struct iop_stats *, struct bio *);
 
 /*
  * Called when an I/O completes so the limiter can update its
  * accounting. Pending I/Os may complete in any order (even when
  * sent to the hardware at the same time), so the limiter may not
  * make any assumptions other than this I/O has completed. If it
  * returns 1, then xpt_schedule() needs to be called again.
  */
 typedef int l_iodone_t(struct iop_stats *, struct bio *);
 
 /*
  * Forward declarations of the limiter callbacks, written with the function
  * typedefs so limsw[] can reference them before they are defined.
  */
 
 /* queue_depth limiter */
 static l_iop_t cam_iosched_qd_iop;
 static l_iop_t cam_iosched_qd_caniop;
 static l_iodone_t cam_iosched_qd_iodone;
 
 /* iops limiter */
 static l_init_t cam_iosched_iops_init;
 static l_tick_t cam_iosched_iops_tick;
 static l_iop_t cam_iosched_iops_caniop;
 static l_iop_t cam_iosched_iops_iop;
 
 /* bandwidth limiter */
 static l_init_t cam_iosched_bw_init;
 static l_tick_t cam_iosched_bw_tick;
 static l_iop_t cam_iosched_bw_caniop;
 static l_iop_t cam_iosched_bw_iop;
 
 /*
  * Per-limiter callback table, indexed by io_limiter.  A NULL slot means the
  * limiter has nothing to do for that event; the cam_iosched_limiter_*()
  * wrappers check for NULL before calling through.  Initializers are listed in
  * member order and every slot is spelled out.
  */
 struct limswitch {
 	l_init_t	*l_init;
 	l_tick_t	*l_tick;
 	l_iop_t		*l_iop;
 	l_iop_t		*l_caniop;
 	l_iodone_t	*l_iodone;
 } limsw[] =
 {
 	{	/* none */
 		.l_init = NULL,
 		.l_tick = NULL,
 		.l_iop = NULL,
 		.l_caniop = NULL,
 		.l_iodone = NULL,
 	},
 	{	/* queue_depth */
 		.l_init = NULL,
 		.l_tick = NULL,
 		.l_iop = cam_iosched_qd_iop,
 		.l_caniop = cam_iosched_qd_caniop,
 		.l_iodone = cam_iosched_qd_iodone,
 	},
 	{	/* iops */
 		.l_init = cam_iosched_iops_init,
 		.l_tick = cam_iosched_iops_tick,
 		.l_iop = cam_iosched_iops_iop,
 		.l_caniop = cam_iosched_iops_caniop,
 		.l_iodone = NULL,
 	},
 	{	/* bandwidth */
 		.l_init = cam_iosched_bw_init,
 		.l_tick = cam_iosched_bw_tick,
 		.l_iop = cam_iosched_bw_iop,
 		.l_caniop = cam_iosched_bw_caniop,
 		.l_iodone = NULL,
 	},
 };
 
 /*
  * Per I/O class (read, write, trim) limiter state and statistics.  One
  * instance of each lives in struct cam_iosched_softc.
  */
 struct iop_stats {
 	/*
 	 * sysctl state for this subnode.
 	 */
 	struct sysctl_ctx_list	sysctl_ctx;
 	struct sysctl_oid	*sysctl_tree;
 
 	/*
 	 * Information about the current rate limiters, if any.  The unit of
 	 * min/max/current depends on the limiter: a queue depth for
 	 * queue_depth, I/Os per second for iops, kB/s for bandwidth.
 	 */
 	io_limiter	limiter;	/* How are I/Os being limited */
 	int		min;		/* Low range of limit */
 	int		max;		/* High range of limit */
 	int		current;	/* Current limit value; <= 0 means unlimited */
 	int		l_value1;	/* per-limiter scratch: iops = I/Os left, bandwidth = bytes left */
 	int		l_value2;	/* per-limiter scratch: iops = count of rounded-up 1-I/O grants */
 
 	/*
 	 * Debug information about counts of I/Os that have gone through the
 	 * scheduler.
 	 */
 	int		pending;	/* I/Os pending in the hardware */
 	int		queued;		/* number currently in the queue */
 	int		total;		/* Total for all time -- wraps */
 	int		in;		/* number queued all time -- wraps */
 	int		out;		/* number completed all time -- wraps */
 	int		errs;		/* Number of I/Os completed with error --  wraps */
 
 	/*
 	 * Statistics on different bits of the process.
 	 */
 		/* Exp Moving Average, see alpha_bits for more details */
 	sbintime_t      ema;
 	sbintime_t      emvar;		/* Exp Moving Variance */
 	sbintime_t      sd;		/* Last computed sd */
 
 	uint32_t	state_flags;
 #define IOP_RATE_LIMITED		1u	/* tested by the ticker when computing load */
 
 #define LAT_BUCKETS 15			/* < 1ms < 2ms ... < 2^(n-1)ms >= 2^(n-1)ms*/
 	uint64_t	latencies[LAT_BUCKETS];	/* latency histogram, exported by sysctl */
 
 	struct cam_iosched_softc *softc;	/* back pointer to the owning scheduler */
 };
 
 
 typedef enum {
 	set_max = 0,			/* current = max */
 	read_latency,			/* Steer read latency by throttling writes */
 	cl_max				/* Keep last */
 } control_type;
 
 static const char *cam_iosched_control_type_names[] =
     { "set_max", "read_latency" };
 
 /*
  * State of the control loop that periodically adjusts the limiter values
  * (see cam_iosched_cl_maybe_steer()).
  */
 struct control_loop {
 	/*
 	 * sysctl state for this subnode.
 	 */
 	struct sysctl_ctx_list	sysctl_ctx;
 	struct sysctl_oid	*sysctl_tree;
 
 	sbintime_t	next_steer;		/* Time of next steer */
 	sbintime_t	steer_interval;		/* How often do we steer? */
 	sbintime_t	lolat;			/* Read latency EMA below this: raise the write limit */
 	sbintime_t	hilat;			/* Read latency EMA above this: lower the write limit */
 	int		alpha;			/* Gain in percent (20 == .2) */
 	control_type	type;			/* What type of control? */
 	int		last_count;		/* read_stats.total at the previous steer */
 
 	struct cam_iosched_softc *softc;	/* back pointer to the owning scheduler */
 };
 
 #endif
 
 /*
  * Per-device scheduler instance.  Allocated by cam_iosched_init() and
  * released by cam_iosched_fini().
  */
 struct cam_iosched_softc {
 	struct bio_queue_head bio_queue;
 	struct bio_queue_head trim_queue;
 				/* scheduler flags < 16, user flags >= 16 */
 	uint32_t	flags;
 	int		sort_io_queue;		/* < 0: use the global cam_sort_io_queues */
 #ifdef CAM_IOSCHED_DYNAMIC
 	int		read_bias;		/* Read bias setting */
 	int		current_read_bias;	/* Current read bias state */
 	int		total_ticks;		/* Number of ticker callouts run so far */
 	int		load;			/* EMA of disk 'load average', scaled by 2^16 */
 
 	struct bio_queue_head write_queue;
 	struct iop_stats read_stats, write_stats, trim_stats;
 	struct sysctl_ctx_list	sysctl_ctx;
 	struct sysctl_oid	*sysctl_tree;
 
 	int		quanta;			/* Number of quanta per second */
 	struct callout	ticker;			/* Callout for our quota system */
 	struct cam_periph *periph;		/* cam periph associated with this device */
 	uint32_t	this_frac;		/* Fraction of a second (65536ths) for this tick */
 	sbintime_t	last_time;		/* Last time we ticked */
 	struct control_loop cl;
 #endif
 };
 
 #ifdef CAM_IOSCHED_DYNAMIC
 /*
  * helper functions to call the limsw functions.
  */
 static int
 cam_iosched_limiter_init(struct iop_stats *ios)
 {
 	int lim = ios->limiter;
 
 	/* maybe this should be a kassert */
 	if (lim < none || lim >= limiter_max)
 		return EINVAL;
 
 	if (limsw[lim].l_init)
 		return limsw[lim].l_init(ios);
 
 	return 0;
 }
 
 static int
 cam_iosched_limiter_tick(struct iop_stats *ios)
 {
 	int lim = ios->limiter;
 
 	/* maybe this should be a kassert */
 	if (lim < none || lim >= limiter_max)
 		return EINVAL;
 
 	if (limsw[lim].l_tick)
 		return limsw[lim].l_tick(ios);
 
 	return 0;
 }
 
 static int
 cam_iosched_limiter_iop(struct iop_stats *ios, struct bio *bp)
 {
 	int lim = ios->limiter;
 
 	/* maybe this should be a kassert */
 	if (lim < none || lim >= limiter_max)
 		return EINVAL;
 
 	if (limsw[lim].l_iop)
 		return limsw[lim].l_iop(ios, bp);
 
 	return 0;
 }
 
 static int
 cam_iosched_limiter_caniop(struct iop_stats *ios, struct bio *bp)
 {
 	int lim = ios->limiter;
 
 	/* maybe this should be a kassert */
 	if (lim < none || lim >= limiter_max)
 		return EINVAL;
 
 	if (limsw[lim].l_caniop)
 		return limsw[lim].l_caniop(ios, bp);
 
 	return 0;
 }
 
 static int
 cam_iosched_limiter_iodone(struct iop_stats *ios, struct bio *bp)
 {
 	int lim = ios->limiter;
 
 	/* maybe this should be a kassert */
 	if (lim < none || lim >= limiter_max)
 		return 0;
 
 	if (limsw[lim].l_iodone)
 		return limsw[lim].l_iodone(ios, bp);
 
 	return 0;
 }
 
 /*
  * Functions to implement the different kinds of limiters
  */
 
 /*
  * queue_depth limiter, l_iop hook: hold the I/O back with EAGAIN once the
  * number of pending I/Os has reached a positive limit.  A limit <= 0 never
  * holds anything back.  bp is unused.
  */
 static int
 cam_iosched_qd_iop(struct iop_stats *ios, struct bio *bp)
 {
 
 	if (ios->current > 0 && ios->pending >= ios->current)
 		return EAGAIN;
 
 	return 0;
 }
 
 /*
  * queue_depth limiter, l_caniop hook: report EAGAIN once the number of
  * pending I/Os has reached a positive limit, 0 otherwise.  A limit <= 0
  * never reports EAGAIN.  bp is unused.
  */
 static int
 cam_iosched_qd_caniop(struct iop_stats *ios, struct bio *bp)
 {
 
 	if (ios->current > 0 && ios->pending >= ios->current)
 		return EAGAIN;
 
 	return 0;
 }
 
 /*
  * queue_depth limiter, l_iodone hook: return 1 only when a positive limit is
  * set and the pending count equals it exactly, 0 in every other case.  bp is
  * unused.
  */
 static int
 cam_iosched_qd_iodone(struct iop_stats *ios, struct bio *bp)
 {
 
 	if (ios->current > 0 && ios->pending == ios->current)
 		return 1;
 
 	return 0;
 }
 
 /*
  * iops limiter, l_init hook: seed the I/O budget (l_value1) with one
  * quantum's share of the per-second limit, but never less than one I/O, and
  * clear l_value2.  Always returns 0.
  */
 static int
 cam_iosched_iops_init(struct iop_stats *ios)
 {
 	int per_quantum;
 
 	per_quantum = ios->current / ios->softc->quanta;
 	ios->l_value1 = (per_quantum > 0) ? per_quantum : 1;
 	ios->l_value2 = 0;
 
 	return 0;
 }
 
 static int
 cam_iosched_iops_tick(struct iop_stats *ios)
 {
 	int new_ios;
 
 	/*
 	 * Allow at least one IO per tick until all
 	 * the IOs for this interval have been spent.
 	 */
 	new_ios = (int)((ios->current * (uint64_t)ios->softc->this_frac) >> 16);
 	if (new_ios < 1 && ios->l_value2 < ios->current) {
 		new_ios = 1;
 		ios->l_value2++;
 	}
 
 	/*
 	 * If this a new accounting interval, discard any "unspent" ios
 	 * granted in the previous interval.  Otherwise add the new ios to
 	 * the previously granted ones that haven't been spent yet.
 	 */
 	if ((ios->softc->total_ticks % ios->softc->quanta) == 0) {
 		ios->l_value1 = new_ios;
 		ios->l_value2 = 1;
 	} else {
 		ios->l_value1 += new_ios;
 	}
 
 
 	return 0;
 }
 
 /*
  * iops limiter, l_caniop hook: EAGAIN when a positive limit is set and the
  * I/O budget (l_value1) is used up, 0 otherwise.  A limit <= 0 is treated as
  * unlimited as a failsafe.  bp is unused.
  */
 static int
 cam_iosched_iops_caniop(struct iop_stats *ios, struct bio *bp)
 {
 
 	return (ios->current > 0 && ios->l_value1 <= 0) ? EAGAIN : 0;
 }
 
 static int
 cam_iosched_iops_iop(struct iop_stats *ios, struct bio *bp)
 {
 	int rv;
 
 	rv = cam_iosched_limiter_caniop(ios, bp);
 	if (rv == 0)
 		ios->l_value1--;
 
 	return rv;
 }
 
 static int
 cam_iosched_bw_init(struct iop_stats *ios)
 {
 
 	/* ios->current is in kB/s, so scale to bytes */
 	ios->l_value1 = ios->current * 1000 / ios->softc->quanta;
 
 	return 0;
 }
 
 /*
  * bandwidth limiter, l_tick hook: top up the byte budget (l_value1) for the
  * time that passed since the previous tick.  Always returns 0.
  */
 static int
 cam_iosched_bw_tick(struct iop_stats *ios)
 {
 	int credit;
 
 	/*
 	 * ios->current is in kB/s, so scale it to bytes and then by this
 	 * tick's fraction of a second to get the credit for this tick.
 	 *
 	 * The credit is only added while the budget is below four ticks'
 	 * worth, which lets a little credit build up to absorb bursts
 	 * without letting it grow without bound. 4 is extremely arbitrary.
 	 * A budget that went negative is paid back the same way.
 	 */
 	credit = (int)((ios->current * 1000ull * (uint64_t)ios->softc->this_frac) >> 16);
 	if (ios->l_value1 >= credit * 4)
 		return 0;
 	ios->l_value1 += credit;
 
 	return 0;
 }
 
 /*
  * bandwidth limiter, l_caniop hook: EAGAIN when a positive limit is set and
  * the byte budget (l_value1) is used up, 0 otherwise.  bp is unused.
  */
 static int
 cam_iosched_bw_caniop(struct iop_stats *ios, struct bio *bp)
 {
 
 	/*
 	 * Any remaining budget at all lets the request through, whatever its
 	 * size, so the budget may go negative and the next quantum simply
 	 * starts out with less.  That keeps requests in order (a short
 	 * request cannot slip past a long one that is waiting for budget)
 	 * and keeps large requests from starving.
 	 *
 	 * A limit <= 0 is treated as unlimited as a failsafe.
 	 */
 	return (ios->current > 0 && ios->l_value1 <= 0) ? EAGAIN : 0;
 }
 
 static int
 cam_iosched_bw_iop(struct iop_stats *ios, struct bio *bp)
 {
 	int rv;
 
 	rv = cam_iosched_limiter_caniop(ios, bp);
 	if (rv == 0)
 		ios->l_value1 -= bp->bio_length;
 
 	return rv;
 }
 
 /* Defined below; declared here because cam_iosched_ticker() calls it. */
 static void cam_iosched_cl_maybe_steer(struct control_loop *clp);
 
 /*
  * Periodic callout.  Re-arms itself, records how much time passed since the
  * previous tick, gives the control loop a chance to steer, ticks the three
  * limiters, asks for the periph to be scheduled, and updates the load EMA.
  * arg is the struct cam_iosched_softc.
  */
 static void
 cam_iosched_ticker(void *arg)
 {
 	struct cam_iosched_softc *isc = arg;
 	sbintime_t now, delta;
 	int pending;
 
 	/* Re-arm first so the next tick is due one quantum from now. */
 	callout_reset(&isc->ticker, hz / isc->quanta, cam_iosched_ticker, isc);
 
 	/*
 	 * this_frac is the elapsed time as a 16-bit binary fraction of a
 	 * second: the cast keeps only the low 32 bits of delta and the shift
 	 * drops 16 more.  Note: this discards whole seconds, which should be
 	 * 0 and harmless if not.
 	 */
 	now = sbinuptime();
 	delta = now - isc->last_time;
 	isc->this_frac = (uint32_t)delta >> 16;
 	isc->last_time = now;
 
 	cam_iosched_cl_maybe_steer(&isc->cl);
 
 	cam_iosched_limiter_tick(&isc->read_stats);
 	cam_iosched_limiter_tick(&isc->write_stats);
 	cam_iosched_limiter_tick(&isc->trim_stats);
 
 	cam_iosched_schedule(isc, isc->periph);
 
 	/*
 	 * isc->load is an EMA of the pending I/Os at each tick. The number of
 	 * pending I/Os is the sum of the I/Os queued to the hardware, and those
 	 * in the software queue that could be queued to the hardware if there
 	 * were slots.
 	 *
 	 * ios_stats.pending is a count of requests in the SIM right now for
 	 * each of these types of I/O. So the total pending count is the sum of
 	 * these I/Os and the sum of the queued I/Os still in the software queue
 	 * for those operations that aren't being rate limited at the moment.
 	 *
 	 * The reason for the rate limiting bit is because those I/Os
 	 * aren't part of the software queued load (since we could
 	 * give them to hardware, but choose not to).
 	 *
 	 * Note: due to a bug in counting pending TRIM in the device, we
 	 * don't include them in this count. We count each BIO_DELETE in
 	 * the pending count, but the periph drivers collapse them down
 	 * into one TRIM command. That one trim command gets the completion
 	 * so the counts get off.
 	 */
 	pending = isc->read_stats.pending + isc->write_stats.pending /* + isc->trim_stats.pending */;
 	pending += !!(isc->read_stats.state_flags & IOP_RATE_LIMITED) * isc->read_stats.queued +
 	    !!(isc->write_stats.state_flags & IOP_RATE_LIMITED) * isc->write_stats.queued /* +
 	    !!(isc->trim_stats.state_flags & IOP_RATE_LIMITED) * isc->trim_stats.queued */ ;
 	/* Scale by 2^16, then express as a fraction of the device's openings. */
 	pending <<= 16;
 	pending /= isc->periph->path->device->ccbq.total_openings;
 
 	/*
 	 * EMA with alpha = 2^-13.  Such an EMA draws about 86% of its value
 	 * from the last ~2^14 (16383) samples; at 200 ticks per second that
 	 * is ~82s, i.e. roughly a one minute load average.
 	 */
 	isc->load = (pending + (isc->load << 13) - isc->load) >> 13;
 
 	isc->total_ticks++;
 }
 
 
 /*
  * Set the control loop to its defaults: plain set_max control, a 5 second
  * steering interval with the first steer due immediately, a 5ms-15ms read
  * latency band, and a gain of 20 (.2).
  */
 static void
 cam_iosched_cl_init(struct control_loop *clp, struct cam_iosched_softc *isc)
 {
 
 	clp->softc = isc;
 	clp->type = set_max;
 	clp->alpha = 20;			/* Alpha == gain. 20 = .2 */
 	clp->lolat = 5 * SBT_1MS;
 	clp->hilat = 15 * SBT_1MS;
 	clp->steer_interval = SBT_1S * 5;	/* Let's start out steering every 5s */
 	clp->next_steer = sbinuptime();
 }
 
 /*
  * Run one step of the control loop if its steering interval has elapsed.
  * Called from the ticker; uses the time the ticker recorded in
  * isc->last_time rather than reading the clock again.
  *
  * set_max:      force the current limit of every I/O class to its max.
  * read_latency: raise or lower the write limit depending on where the read
  *               latency EMA sits relative to [lolat, hilat], then clamp it to
  *               [min, max].
  */
 static void
 cam_iosched_cl_maybe_steer(struct control_loop *clp)
 {
 	struct cam_iosched_softc *isc;
 	sbintime_t now, lat;
 	int old;
 
 	isc = clp->softc;
 	now = isc->last_time;
 	if (now < clp->next_steer)
 		return;
 
 	clp->next_steer = now + clp->steer_interval;
 	switch (clp->type) {
 	case set_max:
 		if (isc->write_stats.current != isc->write_stats.max)
 			printf("Steering write from %d kBps to %d kBps\n",
 			    isc->write_stats.current, isc->write_stats.max);
 		isc->read_stats.current = isc->read_stats.max;
 		isc->write_stats.current = isc->write_stats.max;
 		isc->trim_stats.current = isc->trim_stats.max;
 		break;
 	case read_latency:
 		old = isc->write_stats.current;
 		lat = isc->read_stats.ema;
 		/*
 		 * Simple PLL-like engine. Since we're steering to a range for
 		 * the SP (set point) that makes things a little more
 		 * complicated. In addition, we're not directly controlling our
 		 * PV (process variable), the read latency, but instead are
 		 * manipulating the write bandwidth limit for our MV
 		 * (manipulation variable), so analysis of this code gets a bit
 		 * messy. Also, the MV is a very noisy control surface for read
 		 * latency since it is affected by many hidden processes inside
 		 * the device which change how responsive read latency will be
 		 * in reaction to changes in write bandwidth. Unlike the classic
 		 * boiler control PLL, this may result in over-steering while
 		 * the SSD takes its time to react to the new, lower load. This
 		 * is why we use a relatively low alpha of between .1 and .25 to
 		 * compensate for this effect. At .1, it takes ~22 steering
 		 * intervals to back off by a factor of 10. At .2 it only takes
 		 * ~10. At .25 it only takes ~8. However some preliminary data
 		 * from the SSD drives suggests a response time in 10's of
 		 * seconds before latency drops regardless of the new write
 		 * rate. Careful observation will be required to tune this
 		 * effectively.
 		 *
 		 * Also, when there's no read traffic, we jack up the write
 		 * limit too regardless of the last read latency.  10 is
 		 * somewhat arbitrary.
 		 *
 		 * NOTE(review): the scaling is integer arithmetic, so for a
 		 * small enough current limit the scaled-up value truncates
 		 * back to the same number.
 		 */
 		if (lat < clp->lolat || isc->read_stats.total - clp->last_count < 10)
 			isc->write_stats.current = isc->write_stats.current *
 			    (100 + clp->alpha) / 100;	/* Scale up */
 		else if (lat > clp->hilat)
 			isc->write_stats.current = isc->write_stats.current *
 			    (100 - clp->alpha) / 100;	/* Scale down */
 		clp->last_count = isc->read_stats.total;
 
 		/*
 		 * Even if we don't steer, per se, enforce the min/max limits as
 		 * those may have changed.
 		 */
 		if (isc->write_stats.current < isc->write_stats.min)
 			isc->write_stats.current = isc->write_stats.min;
 		if (isc->write_stats.current > isc->write_stats.max)
 			isc->write_stats.current = isc->write_stats.max;
 		/*
 		 * The latency is converted to microseconds from the low 32
 		 * bits of the EMA only.  The argument is a uintmax_t, so it is
 		 * printed with %ju.
 		 */
 		if (old != isc->write_stats.current && 	iosched_debug)
 			printf("Steering write from %d kBps to %d kBps due to latency of %juus\n",
 			    old, isc->write_stats.current,
 			    (uintmax_t)((uint64_t)1000000 * (uint32_t)lat) >> 32);
 		break;
 	case cl_max:
 		break;
 	}
 }
 #endif
 
 /*
  * Bits in cam_iosched_softc.flags.
  *
  * Trim or similar currently pending completion. Should only be set for
  * those drivers wishing only one Trim active at a time.
  */
 #define CAM_IOSCHED_FLAG_TRIM_ACTIVE	(1ul << 0)
 			/* Callout active, and needs to be torn down */
 #define CAM_IOSCHED_FLAG_CALLOUT_ACTIVE (1ul << 1)
 
 			/* Periph drivers set these flags to indicate work */
 			/* (mask covering the upper 16 bits of flags) */
 #define CAM_IOSCHED_FLAG_WORK_FLAGS	((0xffffu) << 16)
 
 #ifdef CAM_IOSCHED_DYNAMIC
 /* Forward declaration; the definition is not in this part of the file. */
 static void
 cam_iosched_io_metric_update(struct cam_iosched_softc *isc,
     sbintime_t sim_latency, int cmd, size_t size);
 #endif
 
 /* True when any of the periph-owned work flag bits is set in isc->flags. */
 static inline bool
 cam_iosched_has_flagged_work(struct cam_iosched_softc *isc)
 {
 	return (isc->flags & CAM_IOSCHED_FLAG_WORK_FLAGS) != 0;
 }
 
 /*
  * True when there is regular (non-trim) I/O that could be issued now.
  *
  * With the dynamic scheduler enabled that means the head of the read queue
  * (bio_queue) or of the write queue exists and its limiter's caniop check
  * lets it through.  Otherwise it simply means bio_queue is not empty.
  */
 static inline bool
 cam_iosched_has_io(struct cam_iosched_softc *isc)
 {
 #ifdef CAM_IOSCHED_DYNAMIC
 	if (do_dynamic_iosched) {
 		struct bio *next_read, *next_write;
 		bool can_read, can_write;
 
 		next_read = bioq_first(&isc->bio_queue);
 		next_write = bioq_first(&isc->write_queue);
 
 		can_write = false;
 		if (next_write != NULL)
 			can_write = cam_iosched_limiter_caniop(
 			    &isc->write_stats, next_write) == 0;
 		can_read = false;
 		if (next_read != NULL)
 			can_read = cam_iosched_limiter_caniop(
 			    &isc->read_stats, next_read) == 0;
 
 		if (iosched_debug > 2) {
 			printf("can write %d: pending_writes %d max_writes %d\n", can_write, isc->write_stats.pending, isc->write_stats.max);
 			printf("can read %d: read_stats.pending %d max_reads %d\n", can_read, isc->read_stats.pending, isc->read_stats.max);
 			printf("Queued reads %d writes %d\n", isc->read_stats.queued, isc->write_stats.queued);
 		}
 		return can_read || can_write;
 	}
 #endif
 	return bioq_first(&isc->bio_queue) != NULL;
 }
 
 static inline bool
 cam_iosched_has_more_trim(struct cam_iosched_softc *isc)
 {
 	return !(isc->flags & CAM_IOSCHED_FLAG_TRIM_ACTIVE) &&
 	    bioq_first(&isc->trim_queue);
 }
 
 /*
  * Effective sort setting for this scheduler: the per-device sort_io_queue
  * when it is >= 0, otherwise the global cam_sort_io_queues.  The argument is
  * evaluated twice.
  */
 #define cam_iosched_sort_queue(isc)	((isc)->sort_io_queue >= 0 ?	\
     (isc)->sort_io_queue : cam_sort_io_queues)
 
 
 /*
  * True when the scheduler has anything to do: issuable I/O, a trim that can
  * be started, or work flagged by the periph driver.  The checks are made in
  * that order and stop at the first one that succeeds.
  */
 static inline bool
 cam_iosched_has_work(struct cam_iosched_softc *isc)
 {
 #ifdef CAM_IOSCHED_DYNAMIC
 	if (iosched_debug > 2)
 		printf("has work: %d %d %d\n", cam_iosched_has_io(isc),
 		    cam_iosched_has_more_trim(isc),
 		    cam_iosched_has_flagged_work(isc));
 #endif
 
 	if (cam_iosched_has_io(isc))
 		return true;
 	if (cam_iosched_has_more_trim(isc))
 		return true;
 	return cam_iosched_has_flagged_work(isc);
 }
 
 #ifdef CAM_IOSCHED_DYNAMIC
 /*
  * Reset one iop_stats to its defaults: no limiter, limits of min 1 and
  * max/current 300000, all counters and the EMA/EMVAR zeroed, and the back
  * pointer set to isc.  Finishes by running the limiter's init hook.
  */
 static void
 cam_iosched_iop_stats_init(struct cam_iosched_softc *isc, struct iop_stats *ios)
 {
 
 	ios->softc = isc;
 
 	/* Limiter configuration. */
 	ios->limiter = none;
 	ios->min = 1;
 	ios->current = 300000;
 	ios->max = ios->current;
 
 	/* Counters. */
 	ios->pending = 0;
 	ios->queued = 0;
 	ios->total = 0;
 	ios->in = 0;
 	ios->out = 0;
 	ios->errs = 0;
 
 	/* Latency statistics. */
 	ios->ema = 0;
 	ios->emvar = 0;
 
 	cam_iosched_limiter_init(ios);
 }
 
 static int
 cam_iosched_limiter_sysctl(SYSCTL_HANDLER_ARGS)
 {
 	char buf[16];
 	struct iop_stats *ios;
 	struct cam_iosched_softc *isc;
 	int value, i, error;
 	const char *p;
 
 	ios = arg1;
 	isc = ios->softc;
 	value = ios->limiter;
 	if (value < none || value >= limiter_max)
 		p = "UNKNOWN";
 	else
 		p = cam_iosched_limiter_names[value];
 
 	strlcpy(buf, p, sizeof(buf));
 	error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
 	if (error != 0 || req->newptr == NULL)
 		return error;
 
 	cam_periph_lock(isc->periph);
 
 	for (i = none; i < limiter_max; i++) {
 		if (strcmp(buf, cam_iosched_limiter_names[i]) != 0)
 			continue;
 		ios->limiter = i;
 		error = cam_iosched_limiter_init(ios);
 		if (error != 0) {
 			ios->limiter = value;
 			cam_periph_unlock(isc->periph);
 			return error;
 		}
 		/* Note: disk load averate requires ticker to be always running */
 		callout_reset(&isc->ticker, hz / isc->quanta, cam_iosched_ticker, isc);
 		isc->flags |= CAM_IOSCHED_FLAG_CALLOUT_ACTIVE;
 
 		cam_periph_unlock(isc->periph);
 		return 0;
 	}
 
 	cam_periph_unlock(isc->periph);
 	return EINVAL;
 }
 
 /*
  * sysctl handler for the control loop "type" node (arg1 is the
  * struct control_loop).
  *
  * Reads report the name of the current control type, or "UNKNOWN" if the
  * stored value is out of range.  Writes select the control type with the
  * given name, updating clp->type under the periph lock; an unrecognised name
  * yields EINVAL.
  */
 static int
 cam_iosched_control_type_sysctl(SYSCTL_HANDLER_ARGS)
 {
 	char buf[16];
 	struct control_loop *clp;
 	struct cam_iosched_softc *isc;
 	int value, i, error;
 	const char *p;
 
 	clp = arg1;
 	isc = clp->softc;
 	value = clp->type;
 	/* Range-check against control_type's own bounds, not io_limiter's. */
 	if (value < set_max || value >= cl_max)
 		p = "UNKNOWN";
 	else
 		p = cam_iosched_control_type_names[value];
 
 	strlcpy(buf, p, sizeof(buf));
 	error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
 	if (error != 0 || req->newptr == NULL)
 		return error;
 
 	for (i = set_max; i < cl_max; i++) {
 		if (strcmp(buf, cam_iosched_control_type_names[i]) != 0)
 			continue;
 		cam_periph_lock(isc->periph);
 		clp->type = i;
 		cam_periph_unlock(isc->periph);
 		return 0;
 	}
 
 	return EINVAL;
 }
 
 static int
 cam_iosched_sbintime_sysctl(SYSCTL_HANDLER_ARGS)
 {
 	char buf[16];
 	sbintime_t value;
 	int error;
 	uint64_t us;
 
 	value = *(sbintime_t *)arg1;
 	us = (uint64_t)value / SBT_1US;
 	snprintf(buf, sizeof(buf), "%ju", (intmax_t)us);
 	error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
 	if (error != 0 || req->newptr == NULL)
 		return error;
 	us = strtoul(buf, NULL, 10);
 	if (us == 0)
 		return EINVAL;
 	*(sbintime_t *)arg1 = us * SBT_1US;
 	return 0;
 }
 
 static int
 cam_iosched_sysctl_latencies(SYSCTL_HANDLER_ARGS)
 {
 	int i, error;
 	struct sbuf sb;
 	uint64_t *latencies;
 
 	latencies = arg1;
 	sbuf_new_for_sysctl(&sb, NULL, LAT_BUCKETS * 16, req);
 
 	for (i = 0; i < LAT_BUCKETS - 1; i++)
 		sbuf_printf(&sb, "%jd,", (intmax_t)latencies[i]);
 	sbuf_printf(&sb, "%jd", (intmax_t)latencies[LAT_BUCKETS - 1]);
 	error = sbuf_finish(&sb);
 	sbuf_delete(&sb);
 
 	return (error);
 }
 
 static int
 cam_iosched_quanta_sysctl(SYSCTL_HANDLER_ARGS)
 {
 	int *quanta;
 	int error, value;
 
 	quanta = (unsigned *)arg1;
 	value = *quanta;
 
 	error = sysctl_handle_int(oidp, (int *)&value, 0, req);
 	if ((error != 0) || (req->newptr == NULL))
 		return (error);
 
 	if (value < 1 || value > hz)
 		return (EINVAL);
 
 	*quanta = value;
 
 	return (0);
 }
 
 /*
  * Create the sysctl subtree for one iop_stats.
  *
  * A node called `name` is added under the scheduler's own sysctl tree, and
  * the statistics, limiter settings and latency histogram of `ios` are added
  * beneath it.  The node itself is registered in isc->sysctl_ctx, while the
  * leaves are registered in ios->sysctl_ctx, which
  * cam_iosched_iop_stats_fini() later frees.
  *
  * NOTE(review): no sysctl_ctx_init() call for ios->sysctl_ctx is visible in
  * this part of the file; presumably it relies on the softc having been
  * allocated zeroed -- confirm.
  */
 static void
 cam_iosched_iop_stats_sysctl_init(struct cam_iosched_softc *isc, struct iop_stats *ios, char *name)
 {
 	struct sysctl_oid_list *n;
 	struct sysctl_ctx_list *ctx;
 
 	ios->sysctl_tree = SYSCTL_ADD_NODE(&isc->sysctl_ctx,
 	    SYSCTL_CHILDREN(isc->sysctl_tree), OID_AUTO, name,
 	    CTLFLAG_RD, 0, name);
 	n = SYSCTL_CHILDREN(ios->sysctl_tree);
 	ctx = &ios->sysctl_ctx;
 
 	/* Latency statistics (read-only). */
 	SYSCTL_ADD_UQUAD(ctx, n,
 	    OID_AUTO, "ema", CTLFLAG_RD,
 	    &ios->ema,
 	    "Fast Exponentially Weighted Moving Average");
 	SYSCTL_ADD_UQUAD(ctx, n,
 	    OID_AUTO, "emvar", CTLFLAG_RD,
 	    &ios->emvar,
 	    "Fast Exponentially Weighted Moving Variance");
 
 	/* Counters (read-only). */
 	SYSCTL_ADD_INT(ctx, n,
 	    OID_AUTO, "pending", CTLFLAG_RD,
 	    &ios->pending, 0,
 	    "Instantaneous # of pending transactions");
 	SYSCTL_ADD_INT(ctx, n,
 	    OID_AUTO, "count", CTLFLAG_RD,
 	    &ios->total, 0,
 	    "# of transactions submitted to hardware");
 	SYSCTL_ADD_INT(ctx, n,
 	    OID_AUTO, "queued", CTLFLAG_RD,
 	    &ios->queued, 0,
 	    "# of transactions in the queue");
 	SYSCTL_ADD_INT(ctx, n,
 	    OID_AUTO, "in", CTLFLAG_RD,
 	    &ios->in, 0,
 	    "# of transactions queued to driver");
 	SYSCTL_ADD_INT(ctx, n,
 	    OID_AUTO, "out", CTLFLAG_RD,
 	    &ios->out, 0,
 	    "# of transactions completed (including with error)");
 	SYSCTL_ADD_INT(ctx, n,
 	    OID_AUTO, "errs", CTLFLAG_RD,
 	    &ios->errs, 0,
 	    "# of transactions completed with an error");
 
 	/* Limiter selection and its limits (read-write). */
 	SYSCTL_ADD_PROC(ctx, n,
 	    OID_AUTO, "limiter", CTLTYPE_STRING | CTLFLAG_RW,
 	    ios, 0, cam_iosched_limiter_sysctl, "A",
 	    "Current limiting type.");
 	SYSCTL_ADD_INT(ctx, n,
 	    OID_AUTO, "min", CTLFLAG_RW,
 	    &ios->min, 0,
 	    "min resource");
 	SYSCTL_ADD_INT(ctx, n,
 	    OID_AUTO, "max", CTLFLAG_RW,
 	    &ios->max, 0,
 	    "max resource");
 	SYSCTL_ADD_INT(ctx, n,
 	    OID_AUTO, "current", CTLFLAG_RW,
 	    &ios->current, 0,
 	    "current resource");
 
 	/* Latency histogram, formatted by cam_iosched_sysctl_latencies(). */
 	SYSCTL_ADD_PROC(ctx, n,
 	    OID_AUTO, "latencies", CTLTYPE_STRING | CTLFLAG_RD,
 	    &ios->latencies, 0,
 	    cam_iosched_sysctl_latencies, "A",
 	    "Array of power of 2 latency from 1ms to 1.024s");
 }
 
 static void
 cam_iosched_iop_stats_fini(struct iop_stats *ios)
 {
 	if (ios->sysctl_tree)
 		if (sysctl_ctx_free(&ios->sysctl_ctx) != 0)
 			printf("can't remove iosched sysctl stats context\n");
 }
 
 /*
  * Create the "control" sysctl subtree for the scheduler's control loop.
  *
  * The node is added under the scheduler's own sysctl tree and registered in
  * isc->sysctl_ctx; the leaves beneath it are registered in the control
  * loop's own sysctl_ctx, which cam_iosched_cl_sysctl_fini() later frees.
  * The three time values are exposed in microseconds through
  * cam_iosched_sbintime_sysctl().
  */
 static void
 cam_iosched_cl_sysctl_init(struct cam_iosched_softc *isc)
 {
 	struct sysctl_oid_list *n;
 	struct sysctl_ctx_list *ctx;
 	struct control_loop *clp;
 
 	clp = &isc->cl;
 	clp->sysctl_tree = SYSCTL_ADD_NODE(&isc->sysctl_ctx,
 	    SYSCTL_CHILDREN(isc->sysctl_tree), OID_AUTO, "control",
 	    CTLFLAG_RD, 0, "Control loop info");
 	n = SYSCTL_CHILDREN(clp->sysctl_tree);
 	ctx = &clp->sysctl_ctx;
 
 	SYSCTL_ADD_PROC(ctx, n,
 	    OID_AUTO, "type", CTLTYPE_STRING | CTLFLAG_RW,
 	    clp, 0, cam_iosched_control_type_sysctl, "A",
 	    "Control loop algorithm");
 	SYSCTL_ADD_PROC(ctx, n,
 	    OID_AUTO, "steer_interval", CTLTYPE_STRING | CTLFLAG_RW,
 	    &clp->steer_interval, 0, cam_iosched_sbintime_sysctl, "A",
 	    "How often to steer (in us)");
 	SYSCTL_ADD_PROC(ctx, n,
 	    OID_AUTO, "lolat", CTLTYPE_STRING | CTLFLAG_RW,
 	    &clp->lolat, 0, cam_iosched_sbintime_sysctl, "A",
 	    "Low water mark for Latency (in us)");
 	SYSCTL_ADD_PROC(ctx, n,
 	    OID_AUTO, "hilat", CTLTYPE_STRING | CTLFLAG_RW,
 	    &clp->hilat, 0, cam_iosched_sbintime_sysctl, "A",
 	    "Hi water mark for Latency (in us)");
 	SYSCTL_ADD_INT(ctx, n,
 	    OID_AUTO, "alpha", CTLFLAG_RW,
 	    &clp->alpha, 0,
 	    "Alpha for PLL (x100) aka gain");
 }
 
 static void
 cam_iosched_cl_sysctl_fini(struct control_loop *clp)
 {
 	if (clp->sysctl_tree)
 		if (sysctl_ctx_free(&clp->sysctl_ctx) != 0)
 			printf("can't remove iosched sysctl control loop context\n");
 }
 #endif
 
 /*
  * Allocate the iosched structure. This also insulates callers from knowing
  * sizeof struct cam_iosched_softc.
  */
 /*
  * Allocate and initialise a scheduler instance for periph and hand it back
  * through *iscp.  Returns 0 on success, or ENOMEM (with *iscp set to NULL)
  * when the allocation fails.  With the dynamic scheduler enabled this also
  * sets up the per-class statistics, the control loop and the ticker callout,
  * and starts the ticker.
  */
 int
 cam_iosched_init(struct cam_iosched_softc **iscp, struct cam_periph *periph)
 {
 	struct cam_iosched_softc *isc;
 
 	isc = malloc(sizeof(*isc), M_CAMSCHED, M_NOWAIT | M_ZERO);
 	*iscp = isc;
 	if (isc == NULL)
 		return ENOMEM;
 #ifdef CAM_IOSCHED_DYNAMIC
 	if (iosched_debug)
 		printf("CAM IOSCHEDULER Allocating entry at %p\n", isc);
 #endif
 	/* -1 makes cam_iosched_sort_queue() fall back to the global setting. */
 	isc->sort_io_queue = -1;
 	bioq_init(&isc->bio_queue);
 	bioq_init(&isc->trim_queue);
 #ifdef CAM_IOSCHED_DYNAMIC
 	if (do_dynamic_iosched) {
 		bioq_init(&isc->write_queue);
 		isc->read_bias = 100;
 		isc->current_read_bias = 100;
 		isc->quanta = min(hz, 200);
 		cam_iosched_iop_stats_init(isc, &isc->read_stats);
 		cam_iosched_iop_stats_init(isc, &isc->write_stats);
 		cam_iosched_iop_stats_init(isc, &isc->trim_stats);
 		isc->trim_stats.max = 1;	/* Trims are special: one at a time for now */
 		isc->last_time = sbinuptime();
 		callout_init_mtx(&isc->ticker, cam_periph_mtx(periph), 0);
 		isc->periph = periph;
 		cam_iosched_cl_init(&isc->cl, isc);
 		callout_reset(&isc->ticker, hz / isc->quanta, cam_iosched_ticker, isc);
 		isc->flags |= CAM_IOSCHED_FLAG_CALLOUT_ACTIVE;
 	}
 #endif
 
 	return 0;
 }
 
 /*
  * Reclaim all used resources. This assumes that other folks have
  * drained the requests in the hardware. Maybe an unwise assumption.
  */
 /*
  * Tear down a scheduler instance: flush whatever is still queued with ENXIO,
  * release the sysctl contexts, drain the ticker callout if it was started,
  * and free the structure.  A NULL isc is a no-op.
  */
 void
 cam_iosched_fini(struct cam_iosched_softc *isc)
 {
 	if (isc == NULL)
 		return;
 
 	cam_iosched_flush(isc, NULL, ENXIO);
 #ifdef CAM_IOSCHED_DYNAMIC
 	cam_iosched_iop_stats_fini(&isc->read_stats);
 	cam_iosched_iop_stats_fini(&isc->write_stats);
 	cam_iosched_iop_stats_fini(&isc->trim_stats);
 	cam_iosched_cl_sysctl_fini(&isc->cl);
 	if (isc->sysctl_tree != NULL &&
 	    sysctl_ctx_free(&isc->sysctl_ctx) != 0)
 		printf("can't remove iosched sysctl stats context\n");
 	if (isc->flags & CAM_IOSCHED_FLAG_CALLOUT_ACTIVE) {
 		callout_drain(&isc->ticker);
 		isc->flags &= ~ CAM_IOSCHED_FLAG_CALLOUT_ACTIVE;
 	}
 #endif
 	free(isc, M_CAMSCHED);
 }
 
 /*
  * Hook up the sysctls the scheduler honors, underneath node. Intended to
  * be called once we are sure the device is attaching.
  *
  * "sort_io_queue" is always registered in the caller's context. When the
  * dynamic scheduler is compiled in and enabled, an "iosched" subtree is
  * created in the scheduler's own context and populated with the per-type
  * statistics, the control loop knobs and a few scheduler-wide values.
  */
 void
 cam_iosched_sysctl_init(struct cam_iosched_softc *isc,
     struct sysctl_ctx_list *ctx, struct sysctl_oid *node)
 {
 #ifdef CAM_IOSCHED_DYNAMIC
 	struct sysctl_oid_list *children;
 #endif
 
 	SYSCTL_ADD_INT(ctx, SYSCTL_CHILDREN(node),
 	    OID_AUTO, "sort_io_queue", CTLFLAG_RW | CTLFLAG_MPSAFE,
 	    &isc->sort_io_queue, 0,
 	    "Sort IO queue to try and optimise disk access patterns");
 
 #ifdef CAM_IOSCHED_DYNAMIC
 	if (do_dynamic_iosched) {
 		/* From here on everything lives in our private context. */
 		ctx = &isc->sysctl_ctx;
 		isc->sysctl_tree = SYSCTL_ADD_NODE(ctx,
 		    SYSCTL_CHILDREN(node), OID_AUTO, "iosched",
 		    CTLFLAG_RD, 0, "I/O scheduler statistics");
 		children = SYSCTL_CHILDREN(isc->sysctl_tree);
 
 		cam_iosched_iop_stats_sysctl_init(isc, &isc->read_stats,
 		    "read");
 		cam_iosched_iop_stats_sysctl_init(isc, &isc->write_stats,
 		    "write");
 		cam_iosched_iop_stats_sysctl_init(isc, &isc->trim_stats,
 		    "trim");
 		cam_iosched_cl_sysctl_init(isc);
 
 		SYSCTL_ADD_INT(ctx, children,
 		    OID_AUTO, "read_bias", CTLFLAG_RW,
 		    &isc->read_bias, 100,
 		    "How biased towards read should we be independent of limits");
 
 		SYSCTL_ADD_PROC(ctx, children,
 		    OID_AUTO, "quanta", CTLTYPE_UINT | CTLFLAG_RW,
 		    &isc->quanta, 0, cam_iosched_quanta_sysctl, "I",
 		    "How many quanta per second do we slice the I/O up into");
 
 		SYSCTL_ADD_INT(ctx, children,
 		    OID_AUTO, "total_ticks", CTLFLAG_RD,
 		    &isc->total_ticks, 0,
 		    "Total number of ticks we've done");
 
 		SYSCTL_ADD_INT(ctx, children,
 		    OID_AUTO, "load", CTLFLAG_RD,
 		    &isc->load, 0,
 		    "scaled load average / 100");
 	}
 #endif
 }
 
 /*
  * Flush every queue the scheduler maintains in a single call, passing stp
  * and err through to bioq_flush(). Consumers do not know how many queues
  * exist internally, so this is the one place that enumerates them.
  */
 void
 cam_iosched_flush(struct cam_iosched_softc *isc, struct devstat *stp, int err)
 {
 	bioq_flush(&isc->bio_queue, stp, err);
 	bioq_flush(&isc->trim_queue, stp, err);
 #ifdef CAM_IOSCHED_DYNAMIC
 	/* The separate write queue is only used by the dynamic scheduler. */
 	if (!do_dynamic_iosched)
 		return;
 	bioq_flush(&isc->write_queue, stp, err);
 #endif
 }
 
 #ifdef CAM_IOSCHED_DYNAMIC
 /*
  * Take the next request off the write queue if one may be issued now.
  * Returns NULL when the write queue is empty, when a queued read is being
  * preferred because of the read bias, or when the write limiter refuses.
  */
 static struct bio *
 cam_iosched_get_write(struct cam_iosched_softc *isc)
 {
 	struct bio *wbp;
 
 	/*
 	 * The write rate is controlled by bounding how many requests are
 	 * outstanding at the drive at any one time. Fewer requests limits
 	 * the effects of both starvation, when requests take a while, and
 	 * write amplification, when each request causes more than one write
 	 * to the NAND media. Limiting the queue depth this way also limits
 	 * write throughput and lets competing reads win out over writes.
 	 */
 	wbp = bioq_first(&isc->write_queue);
 	if (wbp == NULL) {
 		if (iosched_debug > 3)
 			printf("No writes present in write_queue\n");
 		return (NULL);
 	}
 
 	/*
 	 * A read is pending and there is read bias left to spend: let the
 	 * read go first. Writes are not being limited here, merely deferred.
 	 */
 	if (bioq_first(&isc->bio_queue) != NULL &&
 	    isc->current_read_bias != 0) {
 		if (iosched_debug)
 			printf(
 			    "Reads present and current_read_bias is %d queued "
 			    "writes %d queued reads %d\n",
 			    isc->current_read_bias, isc->write_stats.queued,
 			    isc->read_stats.queued);
 		isc->current_read_bias--;
 		return (NULL);
 	}
 
 	/* Ask the active limiter whether this I/O may proceed. */
 	if (cam_iosched_limiter_iop(&isc->write_stats, wbp) != 0) {
 		if (iosched_debug)
 			printf("Can't write because limiter says no.\n");
 		isc->write_stats.state_flags |= IOP_RATE_LIMITED;
 		return (NULL);
 	}
 
 	/*
 	 * All gates passed: the request is going to the SIM. Reset the read
 	 * bias, dequeue the request and account for it.
 	 */
 	isc->write_stats.state_flags &= ~IOP_RATE_LIMITED;
 	isc->current_read_bias = isc->read_bias;
 	bioq_remove(&isc->write_queue, wbp);
 	if (wbp->bio_cmd == BIO_WRITE) {
 		isc->write_stats.queued--;
 		isc->write_stats.total++;
 		isc->write_stats.pending++;
 	}
 	if (iosched_debug > 9)
 		printf("HWQ : %p %#x\n", wbp, wbp->bio_cmd);
 	return (wbp);
 }
 #endif
 
 /*
  * Return a trim that could not be scheduled after all to the front of
  * the trim queue, and undo the accounting done when it was taken off.
  */
 void
 cam_iosched_put_back_trim(struct cam_iosched_softc *isc, struct bio *bp)
 {
 #ifdef CAM_IOSCHED_DYNAMIC
 	struct iop_stats *ts = &isc->trim_stats;
 #endif
 
 	bioq_insert_head(&isc->trim_queue, bp);
 #ifdef CAM_IOSCHED_DYNAMIC
 	ts->queued++;
 	/* It was counted when dequeued; don't double count it next time. */
 	ts->total--;
 	ts->pending--;
 #endif
 }
 
 /*
  * Dequeue and return the trim at the head of the trim queue, or NULL if
  * the queue is empty. The trim is no longer on the queue afterwards; the
  * device must explicitly reinsert it should the need arise.
  *
  * Assumes we're called with the periph lock held.
  */
 struct bio *
 cam_iosched_next_trim(struct cam_iosched_softc *isc)
 {
 	struct bio *trim;
 
 	trim = bioq_first(&isc->trim_queue);
 	if (trim != NULL) {
 		bioq_remove(&isc->trim_queue, trim);
 #ifdef CAM_IOSCHED_DYNAMIC
 		isc->trim_stats.queued--;
 		isc->trim_stats.total++;
 		isc->trim_stats.pending++;
 #endif
 	}
 	return (trim);
 }
 
 /*
  * Return the next trim only if cam_iosched_has_more_trim() says one may
  * be issued; otherwise return NULL. A returned trim has been removed from
  * the queue and the device must explicitly reinsert it should the need
  * arise.
  *
  * Assumes we're called with the periph lock held.
  */
 struct bio *
 cam_iosched_get_trim(struct cam_iosched_softc *isc)
 {
 
 	return (cam_iosched_has_more_trim(isc) ?
 	    cam_iosched_next_trim(isc) : NULL);
 }
 
 /*
  * Choose the next piece of work for the periph. In order of preference:
  * a trim that may be issued now, then (dynamic scheduler only) a write
  * that passes its gates, then the head of the normal bio queue. Returns
  * NULL if nothing can be issued at the moment.
  */
 struct bio *
 cam_iosched_next_bio(struct cam_iosched_softc *isc)
 {
 	struct bio *next;
 
 	/*
 	 * Trims first. Only one is sent down at a time, which
 	 * cam_iosched_get_trim() takes into account.
 	 *
 	 * XXX newer TRIM commands are queueable. Revisit this when we
 	 * implement them.
 	 */
 	next = cam_iosched_get_trim(isc);
 	if (next != NULL)
 		return (next);
 
 #ifdef CAM_IOSCHED_DYNAMIC
 	/* Then any pending write the dynamic scheduler is willing to issue. */
 	if (do_dynamic_iosched) {
 		next = cam_iosched_get_write(isc);
 		if (next != NULL)
 			return (next);
 	}
 #endif
 
 	/* Finally, ordinary I/O waiting on the bio queue. */
 	next = bioq_first(&isc->bio_queue);
 	if (next == NULL)
 		return (NULL);
 
 #ifdef CAM_IOSCHED_DYNAMIC
 	/*
 	 * With the dynamic scheduler the bio queue holds reads, so this is
 	 * where the read limits are enforced -- and only for reads.
 	 */
 	if (do_dynamic_iosched && next->bio_cmd == BIO_READ &&
 	    cam_iosched_limiter_iop(&isc->read_stats, next) != 0) {
 		isc->read_stats.state_flags |= IOP_RATE_LIMITED;
 		return (NULL);
 	}
 	isc->read_stats.state_flags &= ~IOP_RATE_LIMITED;
 #endif
 	bioq_remove(&isc->bio_queue, next);
 #ifdef CAM_IOSCHED_DYNAMIC
 	if (do_dynamic_iosched) {
 		if (next->bio_cmd != BIO_READ) {
 			printf("Found bio_cmd = %#x\n", next->bio_cmd);
 		} else {
 			isc->read_stats.queued--;
 			isc->read_stats.total++;
 			isc->read_stats.pending++;
 		}
 	}
 	if (iosched_debug > 9)
 		printf("HWQ : %p %#x\n", next, next->bio_cmd);
 #endif
 	return (next);
 }
 
 /*
  * Accept a bio from the block layer and place it on the appropriate
  * internal queue. The scheduler later hands work back via
  * cam_iosched_next_bio(), possibly deferring it for various reasons.
  */
 void
 cam_iosched_queue_work(struct cam_iosched_softc *isc, struct bio *bp)
 {
 
 	/* Deletes always go to the tail of the dedicated trim queue. */
 	if (bp->bio_cmd == BIO_DELETE) {
 		bioq_insert_tail(&isc->trim_queue, bp);
 #ifdef CAM_IOSCHED_DYNAMIC
 		isc->trim_stats.in++;
 		isc->trim_stats.queued++;
 #endif
 		return;
 	}
 
 #ifdef CAM_IOSCHED_DYNAMIC
 	/*
 	 * The dynamic scheduler keeps everything that is not a read on its
 	 * own write queue; only real writes are counted in the statistics.
 	 */
 	if (do_dynamic_iosched && bp->bio_cmd != BIO_READ) {
 		if (cam_iosched_sort_queue(isc))
 			bioq_disksort(&isc->write_queue, bp);
 		else
 			bioq_insert_tail(&isc->write_queue, bp);
 		if (iosched_debug > 9)
 			printf("Qw  : %p %#x\n", bp, bp->bio_cmd);
 		if (bp->bio_cmd == BIO_WRITE) {
 			isc->write_stats.in++;
 			isc->write_stats.queued++;
 		}
 		return;
 	}
 #endif
 
 	/* Everything else shares the general bio queue. */
 	if (cam_iosched_sort_queue(isc))
 		bioq_disksort(&isc->bio_queue, bp);
 	else
 		bioq_insert_tail(&isc->bio_queue, bp);
 #ifdef CAM_IOSCHED_DYNAMIC
 	if (iosched_debug > 9)
 		printf("Qr  : %p %#x\n", bp, bp->bio_cmd);
 	if (bp->bio_cmd == BIO_READ) {
 		isc->read_stats.in++;
 		isc->read_stats.queued++;
 	} else if (bp->bio_cmd == BIO_WRITE) {
 		isc->write_stats.in++;
 		isc->write_stats.queued++;
 	}
 #endif
 }
 
 /*
  * Ask the transport layer to schedule the periph at normal priority, but
  * only if the scheduler reports work. Called with the periph lock held.
  */
 void
 cam_iosched_schedule(struct cam_iosched_softc *isc, struct cam_periph *periph)
 {
 
 	if (!cam_iosched_has_work(isc))
 		return;
 	xpt_schedule(periph, CAM_PRIORITY_NORMAL);
 }
 
 /*
  * A trim request has completed: clear the flag that records one being
  * in flight.
  */
 void
 cam_iosched_trim_done(struct cam_iosched_softc *isc)
 {
 
 	isc->flags = isc->flags & ~CAM_IOSCHED_FLAG_TRIM_ACTIVE;
 }
 
 /*
  * Account for a completed bio. Called before the ccb is released with
  * xpt_release_ccb so that notes kept in the ccb can feed the statistics.
  *
  * Returns the value of cam_iosched_limiter_iodone() for reads and writes
  * under the dynamic scheduler, and 0 in every other case.
  */
 int
 cam_iosched_bio_complete(struct cam_iosched_softc *isc, struct bio *bp,
     union ccb *done_ccb)
 {
 	int retval = 0;
 #ifdef CAM_IOSCHED_DYNAMIC
 	if (!do_dynamic_iosched)
 		return (retval);
 
 	if (iosched_debug > 10)
 		printf("done: %p %#x\n", bp, bp->bio_cmd);
 
 	switch (bp->bio_cmd) {
 	case BIO_WRITE:
 		retval = cam_iosched_limiter_iodone(&isc->write_stats, bp);
 		if ((bp->bio_flags & BIO_ERROR) != 0)
 			isc->write_stats.errs++;
 		isc->write_stats.out++;
 		isc->write_stats.pending--;
 		break;
 	case BIO_READ:
 		retval = cam_iosched_limiter_iodone(&isc->read_stats, bp);
 		if ((bp->bio_flags & BIO_ERROR) != 0)
 			isc->read_stats.errs++;
 		isc->read_stats.out++;
 		isc->read_stats.pending--;
 		break;
 	case BIO_DELETE:
 		/* Trims are counted, but the limiter is not consulted. */
 		if ((bp->bio_flags & BIO_ERROR) != 0)
 			isc->trim_stats.errs++;
 		isc->trim_stats.out++;
 		isc->trim_stats.pending--;
 		break;
 	case BIO_FLUSH:
 		/* Nothing to account for. */
 		break;
 	default:
 		if (iosched_debug)
 			printf("Completing command with bio_cmd == %#x\n", bp->bio_cmd);
 		break;
 	}
 
 	/* Latency is only sampled for I/O that finished without error. */
 	if ((bp->bio_flags & BIO_ERROR) == 0 && done_ccb != NULL)
 		cam_iosched_io_metric_update(isc,
 		    cam_iosched_sbintime_t(done_ccb->ccb_h.qos.periph_data),
 		    bp->bio_cmd, bp->bio_bcount);
 #endif
 	return (retval);
 }
 
 /*
  * Record that a trim has been pushed down into the sim by setting the
  * trim-active flag. This also tells the scheduler not to push any more
  * trims down, so periphs that can cope with multiple trims in flight may
  * choose not to call it.
  */
 void
 cam_iosched_submit_trim(struct cam_iosched_softc *isc)
 {
 
 	isc->flags = isc->flags | CAM_IOSCHED_FLAG_TRIM_ACTIVE;
 }
 
 /*
  * Set this device's I/O sorting policy hint by storing val in
  * sort_io_queue.
  */
 void
 cam_iosched_set_sort_queue(struct cam_iosched_softc *isc, int val)
 {
 
 	isc->sort_io_queue = val;
 }
 
 /*
  * Return the subset of the given flags that is currently set in the
  * scheduler (non-zero if any of them is set).
  */
 int
 cam_iosched_has_work_flags(struct cam_iosched_softc *isc, uint32_t flags)
 {
 
 	return (isc->flags & flags);
 }
 
 /*
  * Set the given flags in the scheduler.
  */
 void
 cam_iosched_set_work_flags(struct cam_iosched_softc *isc, uint32_t flags)
 {
 
 	isc->flags = isc->flags | flags;
 }
 
 /*
  * Clear the given flags in the scheduler.
  */
 void
 cam_iosched_clr_work_flags(struct cam_iosched_softc *isc, uint32_t flags)
 {
 
 	isc->flags = isc->flags & ~flags;
 }
 
 #ifdef CAM_IOSCHED_DYNAMIC
 /*
  * Integer square root: returns floor(sqrt(val)), i.e. the largest integer
  * whose square does not exceed val.
  *
  * This follows the method in Jack Crenshaw's 1998 article "Integer
  * Square Roots," reprinted at
  * http://www.embedded.com/electronics-blogs/programmer-s-toolbox/4219659/Integer-Square-Roots
  * and well worth the read. Starting from the largest power of 4 that is
  * not bigger than val, each smaller power of 4 is tried in turn. The
  * running result is halved at every step, so a bit added while testing
  * some power of 4 has slid down to the matching power of 2 by the time
  * the loop finishes. No separate root counter is needed, and nothing in
  * the method depends on the integer size.
  */
 static uint64_t
 isqrt64(uint64_t val)
 {
 	uint64_t root, pow4;
 
 	/* Start at the highest power of 4 the type holds, then step down. */
 	pow4 = 1ULL << (sizeof(uint64_t) * NBBY - 2);
 	while (pow4 > val)
 		pow4 >>= 2;
 
 	/*
 	 * Build the answer one bit per power of 4. "trial" is what must
 	 * still fit into the remainder for the current bit to be part of
 	 * the root.
 	 */
 	for (root = 0; pow4 != 0; pow4 >>= 2) {
 		uint64_t trial = root + pow4;
 
 		root >>= 1;
 		if (val >= trial) {
 			val -= trial;
 			root += pow4;
 		}
 	}
 
 	return (root);
 }
 
 /*
  * Exclusive upper bounds of the latency histogram buckets used by
  * cam_iosched_update(): successive powers of two, starting at 1ms.
  * A sample is counted in the first bucket whose bound it is below;
  * samples at or above the last bound go into one extra, final bucket.
  */
 static sbintime_t latencies[LAT_BUCKETS - 1] = {
 	SBT_1MS <<  0,
 	SBT_1MS <<  1,
 	SBT_1MS <<  2,
 	SBT_1MS <<  3,
 	SBT_1MS <<  4,
 	SBT_1MS <<  5,
 	SBT_1MS <<  6,
 	SBT_1MS <<  7,
 	SBT_1MS <<  8,
 	SBT_1MS <<  9,
 	SBT_1MS << 10,
 	SBT_1MS << 11,
 	SBT_1MS << 12,
 	SBT_1MS << 13		/* 8.192s */
 };
 
 /*
  * Fold one latency sample into the statistics kept in iop: the latency
  * histogram, the exponential moving average (ema), the exponential
  * moving variance (emvar) and the derived standard deviation (sd).
  */
 static void
 cam_iosched_update(struct iop_stats *iop, sbintime_t sim_latency)
 {
 	sbintime_t y, deltasq, delta;
 	int i;
 
 	/*
 	 * Keep counts for latency. We do it by power of two buckets.
 	 * This helps us spot outlier behavior obscured by averages.
 	 */
 	for (i = 0; i < LAT_BUCKETS - 1; i++) {
 		if (sim_latency < latencies[i]) {
 			iop->latencies[i]++;
 			break;
 		}
 	}
 	if (i == LAT_BUCKETS - 1)
 		iop->latencies[i]++; 	 /* At or above the largest bound in latencies[]: last bucket. */
 
 	/*
 	 * Classic exponentially decaying average with a tiny alpha
 	 * (2 ^ -alpha_bits). For more info see the NIST statistical
 	 * handbook.
 	 *
 	 * ema_t = y_t * alpha + ema_t-1 * (1 - alpha)		[nist]
 	 * ema_t = y_t * alpha + ema_t-1 - alpha * ema_t-1
 	 * ema_t = alpha * y_t - alpha * ema_t-1 + ema_t-1
 	 * alpha = 1 / (1 << alpha_bits)
 	 * sub e == ema_t-1, b == 1/alpha (== 1 << alpha_bits), d == y_t - ema_t-1
 	 *	= y_t/b - e/b + be/b
 	 *      = (y_t - e + be) / b
 	 *	= (be + d) / b
 	 *
 	 * Since alpha is a power of two, we can compute this w/o any mult or
 	 * division.
 	 *
 	 * Variance can also be computed. Usually, it would be expressed as follows:
 	 *	diff_t = y_t - ema_t-1
 	 *	emvar_t = (1 - alpha) * (emavar_t-1 + diff_t^2 * alpha)
 	 *	  = emavar_t-1 - alpha * emavar_t-1 + delta_t^2 * alpha - (delta_t * alpha)^2
 	 * sub b == 1/alpha (== 1 << alpha_bits), e == emavar_t-1, d == delta_t
 	 * (so dd == delta_t^2)
 	 *	  = e - e/b + dd/b + dd/bb
 	 *	  = (bbe - be + bdd + dd) / bb
 	 *	  = (bbe + b(dd-e) + dd) / bb (which is expanded below bb = 1<<(2*alpha_bits))
 	 *
 	 * NOTE(review): expanding the line above the substitution gives
 	 * "- dd/bb", whereas the substituted form and the code below add
 	 * dd. The term is tiny (scaled by alpha^2), but confirm which sign
 	 * is intended.
 	 */
 	/*
 	 * XXX possible numeric issues
 	 *	o We assume right shifted integers do the right thing, since that's
 	 *	  implementation defined. You can change the right shifts to / (1LL << alpha).
 	 *	o alpha_bits = 9 gives ema ceiling of 23 bits of seconds for ema and 14 bits
 	 *	  for emvar. This puts a ceiling of 13 bits on alpha since we need a
 	 *	  few tens of seconds of representation.
 	 *	o We mitigate alpha issues by never setting it too high.
 	 */
 	y = sim_latency;
 	delta = (y - iop->ema);					/* d */
 	iop->ema = ((iop->ema << alpha_bits) + delta) >> alpha_bits;
 
 	/*
 	 * Were we to naively plow ahead at this point, we wind up with many numerical
 	 * issues making any SD > ~3ms unreliable. So, we shift right by 12. This leaves
 	 * us with microsecond level precision in the input, so the same in the
 	 * output. It means we can't overflow deltasq unless delta > 4k seconds. It
 	 * also means that emvar can be up 46 bits 40 of which are fraction, which
 	 * gives us a way to measure up to ~8s in the SD before the computation goes
 	 * unstable. Even the worst hard disk rarely has > 1s service time in the
 	 * drive. It does mean we have to shift left 12 bits after taking the
 	 * square root to compute the actual standard deviation estimate. This loss of
 	 * precision is preferable to needing int128 types to work. The above numbers
 	 * assume alpha=9. 10 or 11 are ok, but we start to run into issues at 12,
 	 * so 12 or 13 is OK for EMA, EMVAR and SD will be wrong in those cases.
 	 */
 	delta >>= 12;
 	deltasq = delta * delta;				/* dd */
 	iop->emvar = ((iop->emvar << (2 * alpha_bits)) +	/* bbe */
 	    ((deltasq - iop->emvar) << alpha_bits) +		/* b(dd-e) */
 	    deltasq)						/* dd */
 	    >> (2 * alpha_bits);				/* div bb */
 	iop->sd = (sbintime_t)isqrt64((uint64_t)iop->emvar) << 12;
 }
 
 /*
  * Feed a completed I/O's latency into the statistics that match its
  * command type. Commands other than read, write and delete are ignored.
  * The size argument is currently unused.
  */
 static void
 cam_iosched_io_metric_update(struct cam_iosched_softc *isc,
     sbintime_t sim_latency, int cmd, size_t size)
 {
 	struct iop_stats *iop;
 
 	/* xxx Do we need to scale based on the size of the I/O ? */
 	switch (cmd) {
 	case BIO_READ:
 		iop = &isc->read_stats;
 		break;
 	case BIO_WRITE:
 		iop = &isc->write_stats;
 		break;
 	case BIO_DELETE:
 		iop = &isc->trim_stats;
 		break;
 	default:
 		return;
 	}
 	cam_iosched_update(iop, sim_latency);
 }
 
 #ifdef DDB
 static int biolen(struct bio_queue_head *bq)
 {
 	int i = 0;
 	struct bio *bp;
 
 	TAILQ_FOREACH(bp, &bq->queue, bio_queue) {
 		i++;
 	}
 	return i;
 }
 
 /*
  * Show the internal state of the I/O scheduler.
  */
 DB_SHOW_COMMAND(iosched, cam_iosched_db_show)
 {
 	struct cam_iosched_softc *isc;
 
 	if (!have_addr) {
 		db_printf("Need addr\n");
 		return;
 	}
 	isc = (struct cam_iosched_softc *)addr;
 	db_printf("pending_reads:     %d\n", isc->read_stats.pending);
 	db_printf("min_reads:         %d\n", isc->read_stats.min);
 	db_printf("max_reads:         %d\n", isc->read_stats.max);
 	db_printf("reads:             %d\n", isc->read_stats.total);
 	db_printf("in_reads:          %d\n", isc->read_stats.in);
 	db_printf("out_reads:         %d\n", isc->read_stats.out);
 	db_printf("queued_reads:      %d\n", isc->read_stats.queued);
 	db_printf("Current Q len      %d\n", biolen(&isc->bio_queue));
 	db_printf("pending_writes:    %d\n", isc->write_stats.pending);
 	db_printf("min_writes:        %d\n", isc->write_stats.min);
 	db_printf("max_writes:        %d\n", isc->write_stats.max);
 	db_printf("writes:            %d\n", isc->write_stats.total);
 	db_printf("in_writes:         %d\n", isc->write_stats.in);
 	db_printf("out_writes:        %d\n", isc->write_stats.out);
 	db_printf("queued_writes:     %d\n", isc->write_stats.queued);
 	db_printf("Current Q len      %d\n", biolen(&isc->write_queue));
 	db_printf("pending_trims:     %d\n", isc->trim_stats.pending);
 	db_printf("min_trims:         %d\n", isc->trim_stats.min);
 	db_printf("max_trims:         %d\n", isc->trim_stats.max);
 	db_printf("trims:             %d\n", isc->trim_stats.total);
 	db_printf("in_trims:          %d\n", isc->trim_stats.in);
 	db_printf("out_trims:         %d\n", isc->trim_stats.out);
 	db_printf("queued_trims:      %d\n", isc->trim_stats.queued);
 	db_printf("Current Q len      %d\n", biolen(&isc->trim_queue));
 	db_printf("read_bias:         %d\n", isc->read_bias);
 	db_printf("current_read_bias: %d\n", isc->current_read_bias);
 	db_printf("Trim active?       %s\n",
 	    (isc->flags & CAM_IOSCHED_FLAG_TRIM_ACTIVE) ? "yes" : "no");
 }
 #endif
 #endif
Index: head/sys/cam/cam_iosched.h
===================================================================
--- head/sys/cam/cam_iosched.h	(revision 333433)
+++ head/sys/cam/cam_iosched.h	(revision 333434)
@@ -1,104 +1,103 @@
 /*-
  * CAM IO Scheduler Interface
  *
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2015 Netflix, Inc.
- * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions, and the following disclaimer,
  *    without modification, immediately at the beginning of the file.
  * 2. The name of the author may not be used to endorse or promote products
  *    derived from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #ifndef _CAM_CAM_IOSCHED_H
 #define _CAM_CAM_IOSCHED_H
 
 /* No user-serviceable parts in here. */
 #ifdef _KERNEL
 
 /*
  * Forward declare all structs to keep interface thin. Every struct or
  * union that appears in a prototype below is listed, so the prototypes
  * do not depend on which other headers happen to be included first.
  */
 struct bio;
 struct cam_iosched_softc;
 struct cam_periph;
 struct devstat;
 struct sysctl_ctx_list;
 struct sysctl_oid;
 union ccb;
 
 /*
  * For 64-bit platforms, we know that uintptr_t is the same size as sbintime_t
  * so we can store values in it. For 32-bit systems, however, uintptr_t is only
  * 32-bits, so it won't fit. For those systems, store 24 bits of fraction and 8
  * bits of seconds. This allows us to measure an interval of up to ~256s, which
  * is ~200x what our current uses require. Provide some convenience functions to
  * get the time, subtract two times and convert back to sbintime_t in a safe way
  * that can be centralized.
  */
 #ifdef __LP64__
 #define CAM_IOSCHED_TIME_SHIFT 0
 #else
 #define CAM_IOSCHED_TIME_SHIFT 8
 #endif
 static inline uintptr_t
 cam_iosched_now(void)
 {
 
 	/* Cast here is to avoid right shifting a signed value */
 	return (uintptr_t)((uint64_t)sbinuptime() >> CAM_IOSCHED_TIME_SHIFT);
 }
 
 /*
  * Time elapsed since "then", which must be a value obtained earlier from
  * cam_iosched_now(). The result is in the same shifted representation.
  */
 static inline uintptr_t
 cam_iosched_delta_t(uintptr_t then)
 {
 	uintptr_t now;
 
 	/* Both operands are uintptr_t, so wrapping works correctly. */
 	now = cam_iosched_now();
 	return (now - then);
 }
 
 /*
  * Convert a shifted delta, as produced by cam_iosched_delta_t(), back
  * into an sbintime_t.
  */
 static inline sbintime_t
 cam_iosched_sbintime_t(uintptr_t delta)
 {
 	uint64_t wide;
 
 	/* Widen before shifting left so no high bits are lost. */
 	wide = (uint64_t)delta;
 	return ((sbintime_t)(wide << CAM_IOSCHED_TIME_SHIFT));
 }
 
 /*
  * Public scheduler interface. The softc is opaque to callers: it is
  * allocated by cam_iosched_init() and released by cam_iosched_fini().
  */
 int cam_iosched_init(struct cam_iosched_softc **, struct cam_periph *periph);
 void cam_iosched_fini(struct cam_iosched_softc *);
 void cam_iosched_sysctl_init(struct cam_iosched_softc *, struct sysctl_ctx_list *, struct sysctl_oid *);
 struct bio *cam_iosched_next_trim(struct cam_iosched_softc *isc);
 struct bio *cam_iosched_get_trim(struct cam_iosched_softc *isc);
 struct bio *cam_iosched_next_bio(struct cam_iosched_softc *isc);
 void cam_iosched_queue_work(struct cam_iosched_softc *isc, struct bio *bp);
 void cam_iosched_flush(struct cam_iosched_softc *isc, struct devstat *stp, int err);
 void cam_iosched_schedule(struct cam_iosched_softc *isc, struct cam_periph *periph);
 /*
  * NOTE(review): trim completion appears to be reported through
  * cam_iosched_trim_done(); confirm that cam_iosched_finish_trim() still
  * has a definition, otherwise this prototype is stale.
  */
 void cam_iosched_finish_trim(struct cam_iosched_softc *isc);
 void cam_iosched_submit_trim(struct cam_iosched_softc *isc);
 void cam_iosched_put_back_trim(struct cam_iosched_softc *isc, struct bio *bp);
 void cam_iosched_set_sort_queue(struct cam_iosched_softc *isc, int val);
 int cam_iosched_has_work_flags(struct cam_iosched_softc *isc, uint32_t flags);
 void cam_iosched_set_work_flags(struct cam_iosched_softc *isc, uint32_t flags);
 void cam_iosched_clr_work_flags(struct cam_iosched_softc *isc, uint32_t flags);
 void cam_iosched_trim_done(struct cam_iosched_softc *isc);
 int cam_iosched_bio_complete(struct cam_iosched_softc *isc, struct bio *bp, union ccb *done_ccb);
 
 #endif
 #endif
Index: head/sys/cam/nvme/nvme_all.c
===================================================================
--- head/sys/cam/nvme/nvme_all.c	(revision 333433)
+++ head/sys/cam/nvme/nvme_all.c	(revision 333434)
@@ -1,169 +1,168 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2015 Netflix, Inc
- * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer,
  *    without modification, immediately at the beginning of the file.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 
 #ifdef _KERNEL
 #include "opt_scsi.h"
 
 #include <sys/systm.h>
 #include <sys/libkern.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/sysctl.h>
 #else
 #include <errno.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #ifndef min
 #define min(a,b) (((a)<(b))?(a):(b))
 #endif
 #endif
 
 #include <cam/cam.h>
 #include <cam/cam_ccb.h>
 #include <cam/cam_queue.h>
 #include <cam/cam_xpt.h>
 #include <cam/nvme/nvme_all.h>
 #include <sys/sbuf.h>
 #include <sys/endian.h>
 
 #ifdef _KERNEL
 #include <cam/cam_periph.h>
 #include <cam/cam_xpt_sim.h>
 #include <cam/cam_xpt_periph.h>
 #include <cam/cam_xpt_internal.h>
 #endif
 
 /*
  * Build a namespace command in nvmeio->cmd: the command is zeroed, then
  * the opcode, the namespace id and cdw10 through cdw15 are filled in.
  * The 32-bit fields are stored little-endian.
  */
 void
 nvme_ns_cmd(struct ccb_nvmeio *nvmeio, uint8_t cmd, uint32_t nsid,
     uint32_t cdw10, uint32_t cdw11, uint32_t cdw12, uint32_t cdw13,
     uint32_t cdw14, uint32_t cdw15)
 {
 	struct nvme_command *nc;
 
 	nc = &nvmeio->cmd;
 	bzero(nc, sizeof(struct nvme_command));
 	nc->opc_fuse = NVME_CMD_SET_OPC(cmd);
 	nc->nsid = htole32(nsid);
 	nc->cdw10 = htole32(cdw10);
 	nc->cdw11 = htole32(cdw11);
 	nc->cdw12 = htole32(cdw12);
 	nc->cdw13 = htole32(cdw13);
 	nc->cdw14 = htole32(cdw14);
 	nc->cdw15 = htole32(cdw15);
 }
 
 /*
  * Identify-data matching hook. Neither argument is examined; the
  * function always returns 0.
  */
 int
 nvme_identify_match(caddr_t identbuffer, caddr_t table_entry)
 {
 
 	return (0);
 }
 
 
 /*
  * Append a one-line identification of the controller to sb in the form
  * "<mn fr sn>\n", built from the mn, fr and sn fields of the controller
  * data. The namespace data argument is not used.
  */
 void
 nvme_print_ident(const struct nvme_controller_data *cdata,
     const struct nvme_namespace_data *data, struct sbuf *sb)
 {
 
 	sbuf_printf(sb, "<");
 	cam_strvis_sbuf(sb, cdata->mn, sizeof(cdata->mn), 0);
 
 	sbuf_printf(sb, " ");
 	cam_strvis_sbuf(sb, cdata->fr, sizeof(cdata->fr), 0);
 
 	sbuf_printf(sb, " ");
 	cam_strvis_sbuf(sb, cdata->sn, sizeof(cdata->sn), 0);
 
 	sbuf_printf(sb, ">\n");
 }
 
 /*
  * Printable names for opcodes, indexed by opcode value; used by
  * nvme_op_string(). Only values 0 through 8 have entries.
  *
  * XXX need to do nvme admin opcodes too, but those aren't used yet by nda
  */
 static const char *
 nvme_opc2str[] = {
 	"FLUSH",
 	"WRITE",
 	"READ",
 	"RSVD-3",
 	"WRITE_UNCORRECTABLE",
 	"COMPARE",
 	"RSVD-6",
 	"RSVD-7",
 	"DATASET_MANAGEMENT"
 };
 
 /*
  * Return a printable name for the opcode of cmd. Opcodes without an
  * entry in nvme_opc2str[] yield "UNKNOWN".
  */
 const char *
 nvme_op_string(const struct nvme_command *cmd)
 {
 	uint8_t opc;
 
 	opc = (cmd->opc_fuse >> NVME_CMD_OPC_SHIFT) & NVME_CMD_OPC_MASK;
 	/*
 	 * Valid indices run from 0 to nitems - 1, so an opcode equal to
 	 * nitems is already past the end of the table.
 	 */
 	if (opc >= nitems(nvme_opc2str))
 		return "UNKNOWN";
 
 	return nvme_opc2str[opc];
 }
 
 /*
  * Format the interesting fields of cmd as text into cmd_string (at most
  * len bytes, per snprintf) and return cmd_string.
  */
 const char *
 nvme_cmd_string(const struct nvme_command *cmd, char *cmd_string, size_t len)
 {
 	uint8_t opc, fuse;
 	unsigned long long prp1, prp2;
 
 	opc = (cmd->opc_fuse >> NVME_CMD_OPC_SHIFT) & NVME_CMD_OPC_MASK;
 	fuse = (cmd->opc_fuse >> NVME_CMD_FUSE_SHIFT) & NVME_CMD_FUSE_MASK;
 	prp1 = (unsigned long long)cmd->prp1;
 	prp2 = (unsigned long long)cmd->prp2;
 
 	/*
 	 * cid, rsvd areas and mptr not printed, since they are used
 	 * only internally by the SIM.
 	 */
 	snprintf(cmd_string, len,
 	    "opc=%x fuse=%x nsid=%x prp1=%llx prp2=%llx cdw=%x %x %x %x %x %x",
 	    opc, fuse, cmd->nsid, prp1, prp2,
 	    cmd->cdw10, cmd->cdw11, cmd->cdw12,
 	    cmd->cdw13, cmd->cdw14, cmd->cdw15);
 
 	return (cmd_string);
 }
 
 /*
  * Return the nvme_cdata pointer stored in the device that periph's path
  * refers to.
  */
 const void *
 nvme_get_identify_cntrl(struct cam_periph *periph)
 {
 
 	return (periph->path->device->nvme_cdata);
 }
 
 /*
  * Return the nvme_data pointer stored in the device that periph's path
  * refers to.
  */
 const void *
 nvme_get_identify_ns(struct cam_periph *periph)
 {
 
 	return (periph->path->device->nvme_data);
 }
Index: head/sys/cam/nvme/nvme_all.h
===================================================================
--- head/sys/cam/nvme/nvme_all.h	(revision 333433)
+++ head/sys/cam/nvme/nvme_all.h	(revision 333434)
@@ -1,51 +1,50 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2015 Netflix, Inc
- * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer,
  *    without modification, immediately at the beginning of the file.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #ifndef CAM_NVME_NVME_ALL_H
 #define CAM_NVME_NVME_ALL_H 1
 
 #include <dev/nvme/nvme.h>
 
 struct ccb_nvmeio;
 
 /* Fill in nvmeio's command: opcode, namespace id and cdw10 through cdw15. */
 void	nvme_ns_cmd(struct ccb_nvmeio *nvmeio, uint8_t cmd, uint32_t nsid,
     uint32_t cdw10, uint32_t cdw11, uint32_t cdw12, uint32_t cdw13,
     uint32_t cdw14, uint32_t cdw15);
 
 int	nvme_identify_match(caddr_t identbuffer, caddr_t table_entry);
 
 /* Helpers that render identify data and commands as text. */
 struct sbuf;
 void	nvme_print_ident(const struct nvme_controller_data *, const struct nvme_namespace_data *, struct sbuf *);
 const char *nvme_op_string(const struct nvme_command *);
 const char *nvme_cmd_string(const struct nvme_command *, char *, size_t);
 /* Accessors for the identify data attached to a periph's device. */
 const void *nvme_get_identify_cntrl(struct cam_periph *);
 const void *nvme_get_identify_ns(struct cam_periph *);
 
 #endif /* CAM_NVME_NVME_ALL_H */
Index: head/sys/cam/nvme/nvme_da.c
===================================================================
--- head/sys/cam/nvme/nvme_da.c	(revision 333433)
+++ head/sys/cam/nvme/nvme_da.c	(revision 333434)
@@ -1,1210 +1,1209 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2015 Netflix, Inc
- * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer,
  *    without modification, immediately at the beginning of the file.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  * Derived from ata_da.c:
  * Copyright (c) 2009 Alexander Motin <mav@FreeBSD.org>
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 
 #ifdef _KERNEL
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/bio.h>
 #include <sys/sysctl.h>
 #include <sys/taskqueue.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/conf.h>
 #include <sys/devicestat.h>
 #include <sys/eventhandler.h>
 #include <sys/malloc.h>
 #include <sys/cons.h>
 #include <sys/proc.h>
 #include <sys/reboot.h>
 #include <geom/geom_disk.h>
 #endif /* _KERNEL */
 
 #ifndef _KERNEL
 #include <stdio.h>
 #include <string.h>
 #endif /* _KERNEL */
 
 #include <cam/cam.h>
 #include <cam/cam_ccb.h>
 #include <cam/cam_periph.h>
 #include <cam/cam_xpt_periph.h>
 #include <cam/cam_sim.h>
 #include <cam/cam_iosched.h>
 
 #include <cam/nvme/nvme_all.h>
 
 /* Driver state; NDA_STATE_NORMAL is the only state defined. */
 typedef enum {
 	NDA_STATE_NORMAL
 } nda_state;
 
 /* Bits stored in nda_softc.flags. */
 typedef enum {
 	NDA_FLAG_OPEN		= 0x0001,	/* set by ndaopen(), cleared by ndaclose() */
 	NDA_FLAG_DIRTY		= 0x0002,	/* a BIO_WRITE was started since the last successful flush in ndaclose() */
 	NDA_FLAG_SCTX_INIT	= 0x0004,	/* sysctl_ctx was initialized by ndasysctlinit() */
 } nda_flags;
 
 /* Per-device quirk bits; may be overridden by the kern.cam.nda.N.quirks tunable. */
 typedef enum {
 	NDA_Q_4K   = 0x01,
 	NDA_Q_NONE = 0x00,
 } nda_quirks;
 	
 /*
  * Bit-name string handed to xpt_announce_quirks() together with the
  * quirk mask.  NOTE(review): bit 0 is NDA_Q_4K above but is labelled
  * "Bit 0" here -- confirm whether a descriptive name is wanted.
  */
 #define NDA_Q_BIT_STRING	\
 	"\020"			\
 	"\001Bit 0"
 
 /*
  * Kind of request a CCB carries, stored in ccb_state and examined in
  * ndadone() after masking with NDA_CCB_TYPE_MASK.
  */
 typedef enum {
 	NDA_CCB_BUFFER_IO	= 0x01,
 	NDA_CCB_DUMP            = 0x02,
 	NDA_CCB_TRIM            = 0x03,
 	NDA_CCB_TYPE_MASK	= 0x0F,
 } nda_ccb_state;
 
 /*
  * Offsets into our private area for storing information.
  * ccb_bp and ccb_trim alias the same pointer slot; which one is valid
  * depends on the CCB's ccb_state.
  */
 #define ccb_state	ccb_h.ppriv_field0
 #define ccb_bp		ccb_h.ppriv_ptr1	/* For NDA_CCB_BUFFER_IO */
 #define ccb_trim	ccb_h.ppriv_ptr1	/* For NDA_CCB_TRIM */
 
 /*
  * Per-device driver state, allocated zeroed in ndaregister() and freed
  * in ndacleanup().
  */
 struct nda_softc {
 	struct   cam_iosched_softc *cam_iosched;	/* I/O scheduler instance queuing our bios */
 	int			outstanding_cmds;	/* Number of active commands */
 	int			refcount;		/* Active xpt_action() calls */
 	nda_state		state;
 	nda_flags		flags;
 	nda_quirks		quirks;
 	int			unmappedio;		/* 1 if the SIM reported PIM_UNMAPPED */
 	quad_t			deletes;		/* BIO_DELETE requests seen by ndastrategy() */
 	quad_t			dsm_req;		/* DSM (trim) commands built by ndastart() */
 	uint32_t		nsid;			/* Namespace ID for this nda device */
 	struct disk		*disk;			/* GEOM disk created for this namespace */
 	struct task		sysctl_task;		/* deferred ndasysctlinit() run */
 	struct sysctl_ctx_list	sysctl_ctx;
 	struct sysctl_oid	*sysctl_tree;
 #ifdef CAM_TEST_FAILURE
 	/* Error-injection knobs consumed by ndastart(). */
 	int			force_read_error;
 	int			force_write_error;
 	int			periodic_read_error;
 	int			periodic_read_count;
 #endif
 #ifdef CAM_IO_STATS
 	struct sysctl_ctx_list	sysctl_stats_ctx;
 	struct sysctl_oid	*sysctl_stats_tree;
 	u_int			timeouts;		/* bumped in ndaerror() on CAM_CMD_TIMEOUT */
 	u_int			errors;			/* bumped in ndaerror() on the other counted statuses */
 	u_int			invalidations;		/* bumped in ndaoninvalidate() */
 #endif
 };
 
 /*
  * One in-flight DSM TRIM: the range payload sent to the device plus the
  * list of bios it covers.  Allocated in ndastart() and freed in ndadone().
  */
 struct nda_trim_request {
 	union {
 		struct nvme_dsm_range dsm;		/* first range; ndastart() indexes from here */
 		uint8_t		data[NVME_MAX_DSM_TRIM];	/* sizes the buffer for all ranges */
 	};
 	TAILQ_HEAD(, bio) bps;				/* bios completed when the TRIM finishes */
 };
 
 /* Need quirk table */
 
 /* Forward declarations of the driver's entry points and callbacks. */
 static	disk_strategy_t	ndastrategy;
 static	dumper_t	ndadump;
 static	periph_init_t	ndainit;
 static	void		ndaasync(void *callback_arg, u_int32_t code,
 				struct cam_path *path, void *arg);
 static	void		ndasysctlinit(void *context, int pending);
 static	periph_ctor_t	ndaregister;
 static	periph_dtor_t	ndacleanup;
 static	periph_start_t	ndastart;
 static	periph_oninv_t	ndaoninvalidate;
 static	void		ndadone(struct cam_periph *periph,
 			       union ccb *done_ccb);
 static  int		ndaerror(union ccb *ccb, u_int32_t cam_flags,
 				u_int32_t sense_flags);
 static void		ndashutdown(void *arg, int howto);
 static void		ndasuspend(void *arg);
 
 /* Compile-time defaults; each may be overridden before this point. */
 #ifndef	NDA_DEFAULT_SEND_ORDERED
 #define	NDA_DEFAULT_SEND_ORDERED	1
 #endif
 #ifndef NDA_DEFAULT_TIMEOUT
 #define NDA_DEFAULT_TIMEOUT 30	/* Timeout in seconds */
 #endif
 #ifndef	NDA_DEFAULT_RETRY
 #define	NDA_DEFAULT_RETRY	4
 #endif
 #ifndef NDA_MAX_TRIM_ENTRIES
 #define NDA_MAX_TRIM_ENTRIES  (NVME_MAX_DSM_TRIM / sizeof(struct nvme_dsm_range))/* Number of DSM trims to use, max 256 */
 #endif
 
 static SYSCTL_NODE(_kern_cam, OID_AUTO, nda, CTLFLAG_RD, 0,
             "CAM Direct Access Disk driver");
 
 //static int nda_retry_count = NDA_DEFAULT_RETRY;
 /* Gates registration of the suspend/shutdown flush handlers in ndainit(). */
 static int nda_send_ordered = NDA_DEFAULT_SEND_ORDERED;
 /* In seconds; the nda_nvme_* wrappers multiply it by 1000 for the CCB. */
 static int nda_default_timeout = NDA_DEFAULT_TIMEOUT;
 /* Upper bound on ranges ndastart() packs into a single DSM TRIM. */
 static int nda_max_trim_entries = NDA_MAX_TRIM_ENTRIES;
 SYSCTL_INT(_kern_cam_nda, OID_AUTO, max_trim, CTLFLAG_RDTUN,
     &nda_max_trim_entries, NDA_MAX_TRIM_ENTRIES,
     "Maximum number of BIO_DELETE to send down as a DSM TRIM.");
 
 /*
  * All NVMe media is non-rotational, so all nvme device instances
  * share this to implement the sysctl.
  */
 static int nda_rotating_media = 0;
 
 /* Peripheral driver descriptor registered with CAM below. */
 static struct periph_driver ndadriver =
 {
 	ndainit, "nda",
 	TAILQ_HEAD_INITIALIZER(ndadriver.units), /* generation */ 0
 };
 
 PERIPHDRIVER_DECLARE(nda, ndadriver);
 
 /* Malloc type used for struct nda_trim_request allocations. */
 static MALLOC_DEFINE(M_NVMEDA, "nvme_da", "nvme_da buffers");
 
 /*
  * nice wrappers. Maybe these belong in nvme_all.c instead of
  * here, but this is the only place that uses these. Should
  * we ever grow another NVME periph, we should move them
  * all there wholesale.
  */
 
 /*
  * Set up nvmeio as a flush command with no data phase for the namespace
  * this softc represents.  Completion is delivered to ndadone().
  */
 static void
 nda_nvme_flush(struct nda_softc *softc, struct ccb_nvmeio *nvmeio)
 {
 	/* nda_default_timeout is in seconds; scale it by 1000 for the CCB. */
 	int timeout = nda_default_timeout * 1000;
 
 	cam_fill_nvmeio(nvmeio, /*retries*/0, /*cbfcnp*/ndadone,
 	    /*flags*/CAM_DIR_NONE, /*data_ptr*/NULL, /*dxfer_len*/0,
 	    timeout);
 	nvme_ns_flush_cmd(&nvmeio->cmd, softc->nsid);
 }
 
 /*
  * Set up nvmeio as a DSM TRIM command.  payload points at num_ranges
  * consecutive struct nvme_dsm_range entries that are sent to the device.
  */
 static void
 nda_nvme_trim(struct nda_softc *softc, struct ccb_nvmeio *nvmeio,
     void *payload, uint32_t num_ranges)
 {
 	/* nda_default_timeout is in seconds; scale it by 1000 for the CCB. */
 	int timeout = nda_default_timeout * 1000;
 
 	cam_fill_nvmeio(nvmeio, /*retries*/0, /*cbfcnp*/ndadone,
 	    /*flags*/CAM_DIR_OUT, /*data_ptr*/payload,
 	    /*dxfer_len*/num_ranges * sizeof(struct nvme_dsm_range),
 	    timeout);
 	nvme_ns_trim_cmd(&nvmeio->cmd, softc->nsid, num_ranges);
 }
 
 /*
  * Set up nvmeio as a write of count blocks starting at lba, transferring
  * len bytes from payload.  Used by the dump path.
  */
 static void
 nda_nvme_write(struct nda_softc *softc, struct ccb_nvmeio *nvmeio,
     void *payload, uint64_t lba, uint32_t len, uint32_t count)
 {
 	/* nda_default_timeout is in seconds; scale it by 1000 for the CCB. */
 	int timeout = nda_default_timeout * 1000;
 
 	cam_fill_nvmeio(nvmeio, /*retries*/0, /*cbfcnp*/ndadone,
 	    /*flags*/CAM_DIR_OUT, /*data_ptr*/payload, /*dxfer_len*/len,
 	    timeout);
 	nvme_ns_write_cmd(&nvmeio->cmd, softc->nsid, lba, count);
 }
 
 /*
  * Set up nvmeio to carry out the read or write described by bp.
  * rwcmd is the NVMe opcode; NVME_OPC_READ selects an inbound transfer and
  * anything else an outbound one.  For an unmapped bio the bio itself is
  * passed as the data pointer and CAM_DATA_BIO is set.
  */
 static void
 nda_nvme_rw_bio(struct nda_softc *softc, struct ccb_nvmeio *nvmeio,
     struct bio *bp, uint32_t rwcmd)
 {
 	uint64_t start_lba = bp->bio_pblkno;
 	uint32_t nblocks = bp->bio_bcount / softc->disk->d_sectorsize;
 	void *data;
 	int ccb_flags;
 
 	if (rwcmd == NVME_OPC_READ)
 		ccb_flags = CAM_DIR_IN;
 	else
 		ccb_flags = CAM_DIR_OUT;
 
 	if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
 		data = bp;
 		ccb_flags |= CAM_DATA_BIO;
 	} else {
 		data = bp->bio_data;
 	}
 
 	cam_fill_nvmeio(nvmeio, /*retries*/0, /*cbfcnp*/ndadone,
 	    /*flags*/ccb_flags, /*data_ptr*/data,
 	    /*dxfer_len*/bp->bio_bcount,
 	    /* seconds scaled by 1000 */ nda_default_timeout * 1000);
 	nvme_ns_rw_cmd(&nvmeio->cmd, rwcmd, softc->nsid, start_lba, nblocks);
 }
 
 static int
 ndaopen(struct disk *dp)
 {
 	struct cam_periph *periph;
 	struct nda_softc *softc;
 	int error;
 
 	periph = (struct cam_periph *)dp->d_drv1;
 	if (cam_periph_acquire(periph) != 0) {
 		return(ENXIO);
 	}
 
 	cam_periph_lock(periph);
 	if ((error = cam_periph_hold(periph, PRIBIO|PCATCH)) != 0) {
 		cam_periph_unlock(periph);
 		cam_periph_release(periph);
 		return (error);
 	}
 
 	CAM_DEBUG(periph->path, CAM_DEBUG_TRACE | CAM_DEBUG_PERIPH,
 	    ("ndaopen\n"));
 
 	softc = (struct nda_softc *)periph->softc;
 	softc->flags |= NDA_FLAG_OPEN;
 
 	cam_periph_unhold(periph);
 	cam_periph_unlock(periph);
 	return (0);
 }
 
 static int
 ndaclose(struct disk *dp)
 {
 	struct	cam_periph *periph;
 	struct	nda_softc *softc;
 	union ccb *ccb;
 	int error;
 
 	periph = (struct cam_periph *)dp->d_drv1;
 	softc = (struct nda_softc *)periph->softc;
 	cam_periph_lock(periph);
 
 	CAM_DEBUG(periph->path, CAM_DEBUG_TRACE | CAM_DEBUG_PERIPH,
 	    ("ndaclose\n"));
 
 	if ((softc->flags & NDA_FLAG_DIRTY) != 0 &&
 	    (periph->flags & CAM_PERIPH_INVALID) == 0 &&
 	    cam_periph_hold(periph, PRIBIO) == 0) {
 
 		ccb = cam_periph_getccb(periph, CAM_PRIORITY_NORMAL);
 		nda_nvme_flush(softc, &ccb->nvmeio);
 		error = cam_periph_runccb(ccb, ndaerror, /*cam_flags*/0,
 		    /*sense_flags*/0, softc->disk->d_devstat);
 
 		if (error != 0)
 			xpt_print(periph->path, "Synchronize cache failed\n");
 		else
 			softc->flags &= ~NDA_FLAG_DIRTY;
 		xpt_release_ccb(ccb);
 		cam_periph_unhold(periph);
 	}
 
 	softc->flags &= ~NDA_FLAG_OPEN;
 
 	while (softc->refcount != 0)
 		cam_periph_sleep(periph, &softc->refcount, PRIBIO, "ndaclose", 1);
 	cam_periph_unlock(periph);
 	cam_periph_release(periph);
 	return (0);	
 }
 
 /*
  * Ask the I/O scheduler to schedule this periph, but only while the
  * driver is in its normal state.
  */
 static void
 ndaschedule(struct cam_periph *periph)
 {
 	struct nda_softc *sc = (struct nda_softc *)periph->softc;
 
 	if (sc->state == NDA_STATE_NORMAL)
 		cam_iosched_schedule(sc->cam_iosched, periph);
 }
 
 /*
  * disk d_strategy method: accept a bio from GEOM and hand it to the I/O
  * scheduler.  The bio is failed with ENXIO if the periph has been
  * invalidated; otherwise it is queued and the periph is scheduled so
  * that ndastart() will pick it up.
  */
 static void
 ndastrategy(struct bio *bp)
 {
 	struct cam_periph *periph =
 	    (struct cam_periph *)bp->bio_disk->d_drv1;
 	struct nda_softc *softc = (struct nda_softc *)periph->softc;
 
 	cam_periph_lock(periph);
 
 	CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("ndastrategy(%p)\n", bp));
 
 	/* A device that has gone away cannot take new work. */
 	if ((periph->flags & CAM_PERIPH_INVALID) != 0) {
 		cam_periph_unlock(periph);
 		biofinish(bp, NULL, ENXIO);
 		return;
 	}
 
 	/* Count delete requests for the "deletes" sysctl. */
 	if (bp->bio_cmd == BIO_DELETE)
 		softc->deletes++;
 
 	/* Queue the request, then make sure we get run to service it. */
 	cam_iosched_queue_work(softc->cam_iosched, bp);
 	ndaschedule(periph);
 
 	cam_periph_unlock(periph);
 }
 
 /*
  * disk d_dump method.  With a non-zero length, writes length bytes from
  * virtual at byte offset on the device; with a zero length, issues a
  * flush instead.  The CCB lives on the stack and is run synchronously
  * with recovery and retries disabled.
  *
  * Returns 0 on success, ENXIO if the periph is invalid, or the error from
  * cam_periph_runccb().
  */
 static int
 ndadump(void *arg, void *virtual, vm_offset_t physical, off_t offset, size_t length)
 {
 	struct disk *dp = arg;
 	struct cam_periph *periph = dp->d_drv1;
 	struct nda_softc *softc = (struct nda_softc *)periph->softc;
 	struct ccb_nvmeio nvmeio;
 	u_int secsize = softc->disk->d_sectorsize;
 	uint64_t lba = offset / secsize;
 	uint32_t count = length / secsize;
 	int error;
 
 	if ((periph->flags & CAM_PERIPH_INVALID) != 0)
 		return (ENXIO);
 
 	/* xpt_get_ccb returns a zero'd allocation for the ccb, mimic that here */
 	memset(&nvmeio, 0, sizeof(nvmeio));
 	xpt_setup_ccb(&nvmeio.ccb_h, periph->path, CAM_PRIORITY_NORMAL);
 	nvmeio.ccb_state = NDA_CCB_DUMP;
 
 	if (length == 0) {
 		/* No data: this call only asks for a flush. */
 		nda_nvme_flush(softc, &nvmeio);
 		error = cam_periph_runccb((union ccb *)&nvmeio,
 		    cam_periph_error, 0, SF_NO_RECOVERY | SF_NO_RETRY, NULL);
 		if (error != 0)
 			xpt_print(periph->path, "flush cmd failed\n");
 		return (error);
 	}
 
 	nda_nvme_write(softc, &nvmeio, virtual, lba, length, count);
 	error = cam_periph_runccb((union ccb *)&nvmeio, cam_periph_error,
 	    0, SF_NO_RECOVERY | SF_NO_RETRY, NULL);
 	if (error != 0)
 		printf("Aborting dump due to I/O error %d.\n", error);
 	return (error);
 }
 
 /*
  * Driver-wide initialization: register for "device found" notifications
  * and, when nda_send_ordered is set, hook the suspend and shutdown events
  * so that open devices get flushed.
  */
 static void
 ndainit(void)
 {
 	cam_status status;
 
 	/*
 	 * Install a global async callback.  This callback will
 	 * receive async callbacks like "new device found".
 	 */
 	status = xpt_register_async(AC_FOUND_DEVICE, ndaasync, NULL, NULL);
 	if (status != CAM_REQ_CMP) {
 		printf("nda: Failed to attach master async callback "
 		       "due to status 0x%x!\n", status);
 		return;
 	}
 
 	if (!nda_send_ordered)
 		return;
 
 	/* Register our event handlers */
 	if (EVENTHANDLER_REGISTER(power_suspend, ndasuspend,
 	    NULL, EVENTHANDLER_PRI_LAST) == NULL)
 		printf("ndainit: power event registration failed!\n");
 	if (EVENTHANDLER_REGISTER(shutdown_post_sync, ndashutdown,
 	    NULL, SHUTDOWN_PRI_DEFAULT) == NULL)
 		printf("ndainit: shutdown event registration failed!\n");
 }
 
 /*
  * disk d_gone callback: GEOM is done tearing down its side, so drop the
  * periph reference that ndaregister() took before disk_create().
  */
 static void
 ndadiskgonecb(struct disk *dp)
 {
 	cam_periph_release((struct cam_periph *)dp->d_drv1);
 }
 
 /*
  * Periph invalidation callback: stop listening for async events, fail
  * everything still queued in the I/O scheduler and tell GEOM the disk
  * is gone.
  */
 static void
 ndaoninvalidate(struct cam_periph *periph)
 {
 	struct nda_softc *sc = (struct nda_softc *)periph->softc;
 
 	/* An event mask of 0 de-registers our async callback. */
 	xpt_register_async(0, ndaasync, periph, periph->path);
 #ifdef CAM_IO_STATS
 	sc->invalidations++;
 #endif
 
 	/*
 	 * Return all queued I/O with ENXIO.
 	 * XXX Handle any transactions queued to the card
 	 *     with XPT_ABORT_CCB.
 	 */
 	cam_iosched_flush(sc->cam_iosched, NULL, ENXIO);
 
 	disk_gone(sc->disk);
 }
 
 /*
  * Periph destructor: release the I/O scheduler, the sysctl contexts (if
  * they were set up), the disk and finally the softc.  The periph lock is
  * dropped for the duration of the teardown and re-taken before returning.
  */
 static void
 ndacleanup(struct cam_periph *periph)
 {
 	struct nda_softc *sc = (struct nda_softc *)periph->softc;
 	int sysctl_ready;
 
 	cam_periph_unlock(periph);
 
 	cam_iosched_fini(sc->cam_iosched);
 
 	/*
 	 * If we can't free the sysctl tree, oh well...
 	 */
 	sysctl_ready = (sc->flags & NDA_FLAG_SCTX_INIT) != 0;
 	if (sysctl_ready) {
 #ifdef CAM_IO_STATS
 		if (sysctl_ctx_free(&sc->sysctl_stats_ctx) != 0) {
 			xpt_print(periph->path,
 			    "can't remove sysctl stats context\n");
 		}
 #endif
 		if (sysctl_ctx_free(&sc->sysctl_ctx) != 0) {
 			xpt_print(periph->path,
 			    "can't remove sysctl context\n");
 		}
 	}
 
 	disk_destroy(sc->disk);
 	free(sc, M_DEVBUF);
 	cam_periph_lock(periph);
 }
 
 /*
  * Async event callback.
  *
  * AC_FOUND_DEVICE:     for NVMe protocol devices, allocate an nda periph.
  * AC_ADVINFO_CHANGED:  when the physical path changed, notify GEOM.
  * anything else:       passed on to cam_periph_async().
  */
 static void
 ndaasync(void *callback_arg, u_int32_t code,
 	struct cam_path *path, void *arg)
 {
 	struct cam_periph *periph = (struct cam_periph *)callback_arg;
 
 	if (code == AC_FOUND_DEVICE) {
 		struct ccb_getdev *cgd = (struct ccb_getdev *)arg;
 		cam_status status;
 
 		/* Only NVMe devices are ours. */
 		if (cgd == NULL || cgd->protocol != PROTO_NVME)
 			return;
 
 		/*
 		 * Allocate a peripheral instance for
 		 * this device and start the probe
 		 * process.
 		 */
 		status = cam_periph_alloc(ndaregister, ndaoninvalidate,
 		    ndacleanup, ndastart, "nda", CAM_PERIPH_BIO,
 		    path, ndaasync, AC_FOUND_DEVICE, cgd);
 
 		if (status != CAM_REQ_CMP && status != CAM_REQ_INPROG) {
 			printf("ndaasync: Unable to attach to new device "
 				"due to status 0x%x\n", status);
 		}
 	} else if (code == AC_ADVINFO_CHANGED) {
 		/* arg carries the type of advanced info that changed. */
 		if ((uintptr_t)arg == CDAI_TYPE_PHYS_PATH) {
 			struct nda_softc *softc = periph->softc;
 
 			disk_attr_changed(softc->disk, "GEOM::physpath",
 			    M_NOWAIT);
 		}
 	} else {
 		/* AC_LOST_DEVICE and everything else. */
 		cam_periph_async(periph, code, path, arg);
 	}
 }
 
 /*
  * Taskqueue handler that creates the per-unit sysctl tree under
  * kern.cam.nda.<unit>.  context is the periph; ndaregister() acquired a
  * reference on it before enqueueing this task, and every exit path below
  * releases that reference.  pending is not used.
  */
 static void
 ndasysctlinit(void *context, int pending)
 {
 	struct cam_periph *periph;
 	struct nda_softc *softc;
 	char tmpstr[32], tmpstr2[16];
 
 	periph = (struct cam_periph *)context;
 
 	/* periph was held for us when this task was enqueued */
 	if ((periph->flags & CAM_PERIPH_INVALID) != 0) {
 		cam_periph_release(periph);
 		return;
 	}
 
 	softc = (struct nda_softc *)periph->softc;
 	/* tmpstr is the node description, tmpstr2 the node name (unit number). */
 	snprintf(tmpstr, sizeof(tmpstr), "CAM NDA unit %d", periph->unit_number);
 	snprintf(tmpstr2, sizeof(tmpstr2), "%d", periph->unit_number);
 
 	sysctl_ctx_init(&softc->sysctl_ctx);
 	/* From here on ndacleanup() must free the sysctl context(s). */
 	softc->flags |= NDA_FLAG_SCTX_INIT;
 	softc->sysctl_tree = SYSCTL_ADD_NODE_WITH_LABEL(&softc->sysctl_ctx,
 		SYSCTL_STATIC_CHILDREN(_kern_cam_nda), OID_AUTO, tmpstr2,
 		CTLFLAG_RD, 0, tmpstr, "device_index");
 	if (softc->sysctl_tree == NULL) {
 		printf("ndasysctlinit: unable to allocate sysctl tree\n");
 		cam_periph_release(periph);
 		return;
 	}
 
 	SYSCTL_ADD_INT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
 	    OID_AUTO, "unmapped_io", CTLFLAG_RD,
 	    &softc->unmappedio, 0, "Unmapped I/O leaf");
 
 	SYSCTL_ADD_QUAD(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
 	    OID_AUTO, "deletes", CTLFLAG_RD,
 	    &softc->deletes, "Number of BIO_DELETE requests");
 
 	SYSCTL_ADD_QUAD(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
 	    OID_AUTO, "dsm_req", CTLFLAG_RD,
 	    &softc->dsm_req, "Number of DSM requests sent to SIM");
 
 	/* Backed by the shared, constant nda_rotating_media (always 0). */
 	SYSCTL_ADD_INT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
 	    OID_AUTO, "rotating", CTLFLAG_RD, &nda_rotating_media, 1,
 	    "Rotating media");
 
 #ifdef CAM_IO_STATS
 	/*
 	 * Statistics live in their own context, hung below the unit node.
 	 * NOTE(review): sysctl_stats_ctx is used without a sysctl_ctx_init()
 	 * call here; it appears to rely on the softc having been allocated
 	 * zeroed -- confirm.
 	 */
 	softc->sysctl_stats_tree = SYSCTL_ADD_NODE(&softc->sysctl_stats_ctx,
 		SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO, "stats",
 		CTLFLAG_RD, 0, "Statistics");
 	if (softc->sysctl_stats_tree == NULL) {
 		printf("ndasysctlinit: unable to allocate sysctl tree for stats\n");
 		cam_periph_release(periph);
 		return;
 	}
 	SYSCTL_ADD_INT(&softc->sysctl_stats_ctx,
 		SYSCTL_CHILDREN(softc->sysctl_stats_tree),
 		OID_AUTO, "timeouts", CTLFLAG_RD,
 		&softc->timeouts, 0,
 		"Device timeouts reported by the SIM");
 	SYSCTL_ADD_INT(&softc->sysctl_stats_ctx,
 		SYSCTL_CHILDREN(softc->sysctl_stats_tree),
 		OID_AUTO, "errors", CTLFLAG_RD,
 		&softc->errors, 0,
 		"Transport errors reported by the SIM.");
 	SYSCTL_ADD_INT(&softc->sysctl_stats_ctx,
 		SYSCTL_CHILDREN(softc->sysctl_stats_tree),
 		OID_AUTO, "pack_invalidations", CTLFLAG_RD,
 		&softc->invalidations, 0,
 		"Device pack invalidations.");
 #endif
 
 #ifdef CAM_TEST_FAILURE
 	/* Test hook: lets an administrator invalidate the periph by hand. */
 	SYSCTL_ADD_PROC(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
 		OID_AUTO, "invalidate", CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE,
 		periph, 0, cam_periph_invalidate_sysctl, "I",
 		"Write 1 to invalidate the drive immediately");
 #endif
 
 	/* Let the I/O scheduler add its own nodes under our tree. */
 	cam_iosched_sysctl_init(softc->cam_iosched, &softc->sysctl_ctx,
 	    softc->sysctl_tree);
 
 	cam_periph_release(periph);
 }
 
 static int
 ndagetattr(struct bio *bp)
 {
 	int ret;
 	struct cam_periph *periph;
 
 	periph = (struct cam_periph *)bp->bio_disk->d_drv1;
 	cam_periph_lock(periph);
 	ret = xpt_getattr(bp->bio_data, bp->bio_length, bp->bio_attribute,
 	    periph->path);
 	cam_periph_unlock(periph);
 	if (ret == 0)
 		bp->bio_completed = bp->bio_length;
 	return ret;
 }
 
 static cam_status
 ndaregister(struct cam_periph *periph, void *arg)
 {
 	struct nda_softc *softc;
 	struct disk *disk;
 	struct ccb_pathinq cpi;
 	const struct nvme_namespace_data *nsd;
 	const struct nvme_controller_data *cd;
 	char   announce_buf[80];
 	uint8_t flbas_fmt, lbads, vwc_present;
 	u_int maxio;
 	int quirks;
 
 	nsd = nvme_get_identify_ns(periph);
 	cd = nvme_get_identify_cntrl(periph);
 
 	softc = (struct nda_softc *)malloc(sizeof(*softc), M_DEVBUF,
 	    M_NOWAIT | M_ZERO);
 
 	if (softc == NULL) {
 		printf("ndaregister: Unable to probe new device. "
 		    "Unable to allocate softc\n");
 		return(CAM_REQ_CMP_ERR);
 	}
 
 	if (cam_iosched_init(&softc->cam_iosched, periph) != 0) {
 		printf("ndaregister: Unable to probe new device. "
 		       "Unable to allocate iosched memory\n");
 		free(softc, M_DEVBUF);
 		return(CAM_REQ_CMP_ERR);
 	}
 
 	/* ident_data parsing */
 
 	periph->softc = softc;
 
 	softc->quirks = NDA_Q_NONE;
 
 	xpt_path_inq(&cpi, periph->path);
 
 	TASK_INIT(&softc->sysctl_task, 0, ndasysctlinit, periph);
 
 	/*
 	 * The name space ID is the lun, save it for later I/O
 	 */
 	softc->nsid = (uint32_t)xpt_path_lun_id(periph->path);
 
 	/*
 	 * Register this media as a disk
 	 */
 	(void)cam_periph_hold(periph, PRIBIO);
 	cam_periph_unlock(periph);
 	snprintf(announce_buf, sizeof(announce_buf),
 	    "kern.cam.nda.%d.quirks", periph->unit_number);
 	quirks = softc->quirks;
 	TUNABLE_INT_FETCH(announce_buf, &quirks);
 	softc->quirks = quirks;
 	cam_iosched_set_sort_queue(softc->cam_iosched, 0);
 	softc->disk = disk = disk_alloc();
 	strlcpy(softc->disk->d_descr, cd->mn,
 	    MIN(sizeof(softc->disk->d_descr), sizeof(cd->mn)));
 	strlcpy(softc->disk->d_ident, cd->sn,
 	    MIN(sizeof(softc->disk->d_ident), sizeof(cd->sn)));
 	disk->d_rotation_rate = DISK_RR_NON_ROTATING;
 	disk->d_open = ndaopen;
 	disk->d_close = ndaclose;
 	disk->d_strategy = ndastrategy;
 	disk->d_getattr = ndagetattr;
 	disk->d_dump = ndadump;
 	disk->d_gone = ndadiskgonecb;
 	disk->d_name = "nda";
 	disk->d_drv1 = periph;
 	disk->d_unit = periph->unit_number;
 	maxio = cpi.maxio;		/* Honor max I/O size of SIM */
 	if (maxio == 0)
 		maxio = DFLTPHYS;	/* traditional default */
 	else if (maxio > MAXPHYS)
 		maxio = MAXPHYS;	/* for safety */
 	disk->d_maxsize = maxio;
 	flbas_fmt = (nsd->flbas >> NVME_NS_DATA_FLBAS_FORMAT_SHIFT) &
 		NVME_NS_DATA_FLBAS_FORMAT_MASK;
 	lbads = (nsd->lbaf[flbas_fmt] >> NVME_NS_DATA_LBAF_LBADS_SHIFT) &
 		NVME_NS_DATA_LBAF_LBADS_MASK;
 	disk->d_sectorsize = 1 << lbads;
 	disk->d_mediasize = (off_t)(disk->d_sectorsize * nsd->nsze);
 	disk->d_delmaxsize = disk->d_mediasize;
 	disk->d_flags = DISKFLAG_DIRECT_COMPLETION;
 //	if (cd->oncs.dsm) // XXX broken?
 		disk->d_flags |= DISKFLAG_CANDELETE;
 	vwc_present = (cd->vwc >> NVME_CTRLR_DATA_VWC_PRESENT_SHIFT) &
 		NVME_CTRLR_DATA_VWC_PRESENT_MASK;
 	if (vwc_present)
 		disk->d_flags |= DISKFLAG_CANFLUSHCACHE;
 	if ((cpi.hba_misc & PIM_UNMAPPED) != 0) {
 		disk->d_flags |= DISKFLAG_UNMAPPED_BIO;
 		softc->unmappedio = 1;
 	}
 	/*
 	 * d_ident and d_descr are both far bigger than the length of either
 	 *  the serial or model number strings.
 	 */
 	nvme_strvis(disk->d_descr, cd->mn,
 	    sizeof(disk->d_descr), NVME_MODEL_NUMBER_LENGTH);
 	nvme_strvis(disk->d_ident, cd->sn,
 	    sizeof(disk->d_ident), NVME_SERIAL_NUMBER_LENGTH);
 	disk->d_hba_vendor = cpi.hba_vendor;
 	disk->d_hba_device = cpi.hba_device;
 	disk->d_hba_subvendor = cpi.hba_subvendor;
 	disk->d_hba_subdevice = cpi.hba_subdevice;
 	disk->d_stripesize = disk->d_sectorsize;
 	disk->d_stripeoffset = 0;
 	disk->d_devstat = devstat_new_entry(periph->periph_name,
 	    periph->unit_number, disk->d_sectorsize,
 	    DEVSTAT_ALL_SUPPORTED,
 	    DEVSTAT_TYPE_DIRECT | XPORT_DEVSTAT_TYPE(cpi.transport),
 	    DEVSTAT_PRIORITY_DISK);
 	/*
 	 * Add alias for older nvd drives to ease transition.
 	 */
 	/* disk_add_alias(disk, "nvd"); Have reports of this causing problems */
 
 	/*
 	 * Acquire a reference to the periph before we register with GEOM.
 	 * We'll release this reference once GEOM calls us back (via
 	 * ndadiskgonecb()) telling us that our provider has been freed.
 	 */
 	if (cam_periph_acquire(periph) != 0) {
 		xpt_print(periph->path, "%s: lost periph during "
 			  "registration!\n", __func__);
 		cam_periph_lock(periph);
 		return (CAM_REQ_CMP_ERR);
 	}
 	disk_create(softc->disk, DISK_VERSION);
 	cam_periph_lock(periph);
 	cam_periph_unhold(periph);
 
 	snprintf(announce_buf, sizeof(announce_buf),
 		"%juMB (%ju %u byte sectors)",
 	    (uintmax_t)((uintmax_t)disk->d_mediasize / (1024*1024)),
 		(uintmax_t)disk->d_mediasize / disk->d_sectorsize,
 		disk->d_sectorsize);
 	xpt_announce_periph(periph, announce_buf);
 	xpt_announce_quirks(periph, softc->quirks, NDA_Q_BIT_STRING);
 
 	/*
 	 * Create our sysctl variables, now that we know
 	 * we have successfully attached.
 	 */
 	if (cam_periph_acquire(periph) == 0)
 		taskqueue_enqueue(taskqueue_thread, &softc->sysctl_task);
 
 	/*
 	 * Register for device going away and info about the drive
 	 * changing (though with NVMe, it can't)
 	 */
 	xpt_register_async(AC_LOST_DEVICE | AC_ADVINFO_CHANGED,
 	    ndaasync, periph, periph->path);
 
 	softc->state = NDA_STATE_NORMAL;
 	return(CAM_REQ_CMP);
 }
 
 /*
  * Periph start routine, called with a free CCB when the periph has been
  * scheduled.  Takes the next bio from the I/O scheduler, converts it to
  * an NVMe command in start_ccb and dispatches it with xpt_action().
  *
  * BIO_READ/BIO_WRITE become read/write commands, BIO_FLUSH a flush, and
  * BIO_DELETE a DSM TRIM that also absorbs further queued trims up to the
  * configured limit.  Any other bio command is failed with EOPNOTSUPP.
  * If there is no bio the CCB is simply released.
  */
 static void
 ndastart(struct cam_periph *periph, union ccb *start_ccb)
 {
 	struct nda_softc *softc = (struct nda_softc *)periph->softc;
 	struct ccb_nvmeio *nvmeio = &start_ccb->nvmeio;
 
 	CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("ndastart\n"));
 
 	switch (softc->state) {
 	case NDA_STATE_NORMAL:
 	{
 		struct bio *bp;
 
 		bp = cam_iosched_next_bio(softc->cam_iosched);
 		CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("ndastart: bio %p\n", bp));
 		if (bp == NULL) {
 			xpt_release_ccb(start_ccb);
 			break;
 		}
 
 		switch (bp->bio_cmd) {
 		case BIO_WRITE:
 			/* Remember that a flush is needed at close time. */
 			softc->flags |= NDA_FLAG_DIRTY;
 			/* FALLTHROUGH */
 		case BIO_READ:
 		{
 #ifdef CAM_TEST_FAILURE
 			int fail = 0;
 
 			/*
 			 * Support the failure ioctls.  If the command is a
 			 * read, and there are pending forced read errors, or
 			 * if a write and pending write errors, then fail this
 			 * operation with EIO.  This is useful for testing
 			 * purposes.  Also, support having every Nth read fail.
 			 *
 			 * This is a rather blunt tool.
 			 */
 			if (bp->bio_cmd == BIO_READ) {
 				if (softc->force_read_error) {
 					softc->force_read_error--;
 					fail = 1;
 				}
 				if (softc->periodic_read_error > 0) {
 					if (++softc->periodic_read_count >=
 					    softc->periodic_read_error) {
 						softc->periodic_read_count = 0;
 						fail = 1;
 					}
 				}
 			} else {
 				if (softc->force_write_error) {
 					softc->force_write_error--;
 					fail = 1;
 				}
 			}
 			if (fail) {
 				biofinish(bp, NULL, EIO);
 				xpt_release_ccb(start_ccb);
 				ndaschedule(periph);
 				return;
 			}
 #endif
 			/* An unmapped bio must describe exactly its pages. */
 			KASSERT((bp->bio_flags & BIO_UNMAPPED) == 0 ||
 			    round_page(bp->bio_bcount + bp->bio_ma_offset) /
 			    PAGE_SIZE == bp->bio_ma_n,
 			    ("Short bio %p", bp));
 			nda_nvme_rw_bio(softc, &start_ccb->nvmeio, bp, bp->bio_cmd == BIO_READ ?
 			    NVME_OPC_READ : NVME_OPC_WRITE);
 			break;
 		}
 		case BIO_DELETE:
 		{
 			struct nvme_dsm_range *dsm_range, *dsm_end;
 			struct nda_trim_request *trim;
 			struct bio *bp1;
 			int ents;
 
 			trim = malloc(sizeof(*trim), M_NVMEDA, M_ZERO | M_NOWAIT);
 			if (trim == NULL) {
 				biofinish(bp, NULL, ENOMEM);
 				xpt_release_ccb(start_ccb);
 				ndaschedule(periph);
 				return;
 			}
 			TAILQ_INIT(&trim->bps);
 			bp1 = bp;
 			/* Capacity of the payload, capped by the tunable. */
 			ents = sizeof(trim->data) / sizeof(struct nvme_dsm_range);
 			ents = min(ents, nda_max_trim_entries);
 			dsm_range = &trim->dsm;
 			dsm_end = dsm_range + ents;
 			/*
 			 * One range per bio: the first is the bio we were
 			 * handed, the rest are pulled from the trim queue
 			 * until it is empty or the payload is full.
 			 */
 			do {
 				TAILQ_INSERT_TAIL(&trim->bps, bp1, bio_queue);
 				dsm_range->length =
 				    htole32(bp1->bio_bcount / softc->disk->d_sectorsize);
 				dsm_range->starting_lba =
 				    htole64(bp1->bio_offset / softc->disk->d_sectorsize);
 				dsm_range++;
 				if (dsm_range >= dsm_end)
 					break;
 				bp1 = cam_iosched_next_trim(softc->cam_iosched);
 				/* XXX -- Could collapse adjacent ranges, but we don't for now */
 				/* XXX -- Could limit based on total payload size */
 			} while (bp1 != NULL);
 			start_ccb->ccb_trim = trim;
 			softc->dsm_req++;
 			nda_nvme_trim(softc, &start_ccb->nvmeio, &trim->dsm,
 			    dsm_range - &trim->dsm);
 			start_ccb->ccb_state = NDA_CCB_TRIM;
 			/*
 			 * Note: We can have multiple TRIMs in flight, so we don't call
 			 * cam_iosched_submit_trim(softc->cam_iosched);
 			 * since that forces the I/O scheduler to only schedule one at a time.
 			 * On NVMe drives, this is a performance disaster.
 			 */
 			goto out;
 		}
 		case BIO_FLUSH:
 			nda_nvme_flush(softc, nvmeio);
 			break;
 		default:
 			/*
 			 * Not a command we know how to translate.  Fail the
 			 * bio instead of dispatching a CCB that was never
 			 * filled in.
 			 */
 			biofinish(bp, NULL, EOPNOTSUPP);
 			xpt_release_ccb(start_ccb);
 			ndaschedule(periph);
 			return;
 		}
 		start_ccb->ccb_state = NDA_CCB_BUFFER_IO;
 		start_ccb->ccb_bp = bp;
 out:
 		start_ccb->ccb_h.flags |= CAM_UNLOCKED;
 		softc->outstanding_cmds++;
 		/*
 		 * refcount tracks xpt_action() calls made with the periph
 		 * lock dropped; ndaclose() waits for it to reach zero.
 		 */
 		softc->refcount++;
 		cam_periph_unlock(periph);
 		xpt_action(start_ccb);
 		cam_periph_lock(periph);
 		softc->refcount--;
 
 		/* May have more work to do, so ensure we stay scheduled */
 		ndaschedule(periph);
 		break;
 	}
 	}
 }
 
 /*
  * CCB completion callback.
  *
  * For NDA_CCB_BUFFER_IO and NDA_CCB_TRIM the CAM status is turned into an
  * errno (via ndaerror(), which may schedule a retry, in which case we
  * just return), the I/O scheduler is told about the completion, the CCB
  * is released and the bio(s) are completed with biodone() after the
  * periph lock has been dropped.  NDA_CCB_DUMP CCBs are polled by their
  * submitter and need nothing here.  Any other state just releases the CCB.
  */
 static void
 ndadone(struct cam_periph *periph, union ccb *done_ccb)
 {
 	struct nda_softc *softc;
 	struct ccb_nvmeio *nvmeio = &done_ccb->nvmeio;
 	struct cam_path *path;
 	int state;
 
 	softc = (struct nda_softc *)periph->softc;
 	path = done_ccb->ccb_h.path;
 
 	CAM_DEBUG(path, CAM_DEBUG_TRACE, ("ndadone\n"));
 
 	state = nvmeio->ccb_state & NDA_CCB_TYPE_MASK;
 	switch (state) {
 	case NDA_CCB_BUFFER_IO:
 	case NDA_CCB_TRIM:
 	{
 		int error;
 
 		cam_periph_lock(periph);
 		if ((done_ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) {
 			error = ndaerror(done_ccb, 0, 0);
 			if (error == ERESTART) {
 				/* A retry was scheduled, so just return. */
 				cam_periph_unlock(periph);
 				return;
 			}
 			if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0)
 				cam_release_devq(path,
 						 /*relsim_flags*/0,
 						 /*reduction*/0,
 						 /*timeout*/0,
 						 /*getcount_only*/0);
 		} else {
 			if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0)
 				panic("REQ_CMP with QFRZN");
 			error = 0;
 		}
 		if (state == NDA_CCB_BUFFER_IO) {
 			struct bio *bp;
 
 			bp = (struct bio *)done_ccb->ccb_bp;
 			bp->bio_error = error;
 			if (error != 0) {
 				/* Nothing was transferred on failure. */
 				bp->bio_resid = bp->bio_bcount;
 				bp->bio_flags |= BIO_ERROR;
 			} else {
 				bp->bio_resid = 0;
 			}
 			softc->outstanding_cmds--;
 
 			/*
 			 * We need to call cam_iosched before we call biodone so that we
 			 * don't measure any activity that happens in the completion
 			 * routine, which in the case of sendfile can be quite
 			 * extensive.
 			 */
 			cam_iosched_bio_complete(softc->cam_iosched, bp, done_ccb);
 			xpt_release_ccb(done_ccb);
 			ndaschedule(periph);
 			cam_periph_unlock(periph);
 			biodone(bp);
 		} else { /* state == NDA_CCB_TRIM */
 			struct nda_trim_request *trim;
 			struct bio *bp1, *bp2;
 			TAILQ_HEAD(, bio) queue;
 
 			/*
 			 * Move the bios onto a local list so the trim
 			 * request itself can be freed right away.
 			 */
 			trim = nvmeio->ccb_trim;
 			TAILQ_INIT(&queue);
 			TAILQ_CONCAT(&queue, &trim->bps, bio_queue);
 			free(trim, M_NVMEDA);
 
 			/*
 			 * ndastart() counted this CCB in outstanding_cmds
 			 * regardless of its type, so balance it here too.
 			 */
 			softc->outstanding_cmds--;
 
 			/*
 			 * Since we can have multiple trims in flight, we don't
 			 * need to call this here.
 			 * cam_iosched_trim_done(softc->cam_iosched);
 			 */
 			/*
 			 * Tell the I/O scheduler that we're finishing the I/O
 			 * so we can keep book. The first one we pass in the CCB
 			 * which has the timing information. The rest we pass in NULL
 			 * so we can keep proper counts.
 			 */
 			bp1 = TAILQ_FIRST(&queue);
 			cam_iosched_bio_complete(softc->cam_iosched, bp1, done_ccb);
 			xpt_release_ccb(done_ccb);
 			ndaschedule(periph);
 			cam_periph_unlock(periph);
 			while ((bp2 = TAILQ_FIRST(&queue)) != NULL) {
 				TAILQ_REMOVE(&queue, bp2, bio_queue);
 				bp2->bio_error = error;
 				if (error != 0) {
 					bp2->bio_flags |= BIO_ERROR;
 					/*
 					 * Each bio reports its own length as
 					 * the residual.  bp1 must not be
 					 * dereferenced here: it is handed to
 					 * biodone() on the first iteration.
 					 */
 					bp2->bio_resid = bp2->bio_bcount;
 				} else
 					bp2->bio_resid = 0;
 				/* bp1 is only compared by address, never read. */
 				if (bp1 != bp2)
 					cam_iosched_bio_complete(softc->cam_iosched, bp2, NULL);
 				biodone(bp2);
 			}
 		}
 		return;
 	}
 	case NDA_CCB_DUMP:
 		/* No-op.  We're polling */
 		return;
 	default:
 		break;
 	}
 	xpt_release_ccb(done_ccb);
 }
 
 /*
  * Error handler for nda CCBs.  When CAM_IO_STATS is enabled it bumps the
  * timeout or error counter that matches the CCB's status; in all cases
  * the actual error processing is delegated to cam_periph_error(), whose
  * result is returned.
  */
 static int
 ndaerror(union ccb *ccb, u_int32_t cam_flags, u_int32_t sense_flags)
 {
 	struct nda_softc *softc;
 
 	softc = (struct nda_softc *)xpt_path_periph(ccb->ccb_h.path)->softc;
 
 	switch (ccb->ccb_h.status & CAM_STATUS_MASK) {
 	case CAM_CMD_TIMEOUT:
 #ifdef CAM_IO_STATS
 		softc->timeouts++;
 #endif
 		break;
 	/* Statuses counted as errors. */
 	case CAM_REQ_ABORTED:
 	case CAM_REQ_CMP_ERR:
 	case CAM_REQ_TERMIO:
 	case CAM_UNREC_HBA_ERROR:
 	case CAM_DATA_RUN_ERR:
 	case CAM_ATA_STATUS_ERROR:
 #ifdef CAM_IO_STATS
 		softc->errors++;
 #endif
 		break;
 	default:
 		/* Not counted. */
 		break;
 	}
 
 	return (cam_periph_error(ccb, cam_flags, sense_flags));
 }
 
 /*
  * Walk every nda periph and, for each one that is still open, ask the
  * device to flush.  Shared by the shutdown and suspend event handlers.
  */
 static void
 ndaflush(void)
 {
 	struct cam_periph *periph;
 	struct nda_softc *softc;
 	union ccb *flush_ccb;
 	int rc;
 
 	CAM_PERIPH_FOREACH(periph, &ndadriver) {
 		softc = (struct nda_softc *)periph->softc;
 
 		if (SCHEDULER_STOPPED()) {
 			/*
 			 * If we paniced with the lock held or the periph is not
 			 * open, do not recurse.  Otherwise, call ndadump since
 			 * that avoids the sleeping cam_periph_getccb does if no
 			 * CCBs are available.
 			 */
 			if (!cam_periph_owned(periph) &&
 			    (softc->flags & NDA_FLAG_OPEN))
 				ndadump(softc->disk, NULL, 0, 0, 0);
 			continue;
 		}
 
 		/*
 		 * We only sync the cache if the drive is still open
 		 */
 		cam_periph_lock(periph);
 		if ((softc->flags & NDA_FLAG_OPEN) != 0) {
 			flush_ccb = cam_periph_getccb(periph,
 			    CAM_PRIORITY_NORMAL);
 			nda_nvme_flush(softc, &flush_ccb->nvmeio);
 			rc = cam_periph_runccb(flush_ccb, ndaerror,
 			    /*cam_flags*/0,
 			    /*sense_flags*/ SF_NO_RECOVERY | SF_NO_RETRY,
 			    softc->disk->d_devstat);
 			if (rc != 0) {
 				xpt_print(periph->path,
 				    "Synchronize cache failed\n");
 			}
 			xpt_release_ccb(flush_ccb);
 		}
 		cam_periph_unlock(periph);
 	}
 }
 
 /*
  * shutdown_post_sync event handler (registered in ndainit()): flush all
  * open nda devices.  Neither argument is used.
  */
 static void
 ndashutdown(void *arg, int howto)
 {
 	(void)arg;
 	(void)howto;
 	ndaflush();
 }
 
 /*
  * power_suspend event handler (registered in ndainit()): flush all open
  * nda devices.  The argument is not used.
  */
 static void
 ndasuspend(void *arg)
 {
 	(void)arg;
 	ndaflush();
 }
Index: head/sys/cam/nvme/nvme_xpt.c
===================================================================
--- head/sys/cam/nvme/nvme_xpt.c	(revision 333433)
+++ head/sys/cam/nvme/nvme_xpt.c	(revision 333434)
@@ -1,672 +1,671 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2015 Netflix, Inc.
- * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer,
  *    without modification, immediately at the beginning of the file.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  * derived from ata_xpt.c: Copyright (c) 2009 Alexander Motin <mav@FreeBSD.org>
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/bus.h>
 #include <sys/endian.h>
 #include <sys/systm.h>
 #include <sys/types.h>
 #include <sys/malloc.h>
 #include <sys/kernel.h>
 #include <sys/time.h>
 #include <sys/conf.h>
 #include <sys/fcntl.h>
 #include <sys/interrupt.h>
 #include <sys/sbuf.h>
 
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/sysctl.h>
 
 #include <cam/cam.h>
 #include <cam/cam_ccb.h>
 #include <cam/cam_queue.h>
 #include <cam/cam_periph.h>
 #include <cam/cam_sim.h>
 #include <cam/cam_xpt.h>
 #include <cam/cam_xpt_sim.h>
 #include <cam/cam_xpt_periph.h>
 #include <cam/cam_xpt_internal.h>
 #include <cam/cam_debug.h>
 
 #include <cam/scsi/scsi_all.h>
 #include <cam/scsi/scsi_message.h>
 #include <cam/nvme/nvme_all.h>
 #include <machine/stdarg.h>	/* for xpt_print below */
 #include "opt_cam.h"
 
 /*
  * One entry of the NVMe quirk table.  `quirks' is a bitmask; the only bit
  * defined here is CAM_QUIRK_MAXTAGS.  The disabled nvme_find_quirk() below
  * copies mintags/maxtags into the device when that bit is set.
  */
 struct nvme_quirk_entry {
 	u_int quirks;		/* bitmask of CAM_QUIRK_* flags */
 #define CAM_QUIRK_MAXTAGS 1
 	u_int mintags;		/* tag lower bound for CAM_QUIRK_MAXTAGS */
 	u_int maxtags;		/* tag upper bound for CAM_QUIRK_MAXTAGS */
 };
 
 /* Not even sure why we need this */
 static periph_init_t nvme_probe_periph_init;
 
 /*
  * Peripheral driver record for the transient "nvme_probe" periph that
  * nvme_scan_lun() allocates.  It carries the (empty) init hook, the driver
  * name, an empty unit list, generation 0 and the CAM_PERIPH_DRV_EARLY flag.
  */
 static struct periph_driver nvme_probe_driver =
 {
 	nvme_probe_periph_init, "nvme_probe",
 	TAILQ_HEAD_INITIALIZER(nvme_probe_driver.units), /* generation */ 0,
 	CAM_PERIPH_DRV_EARLY
 };
 
 PERIPHDRIVER_DECLARE(nvme_probe, nvme_probe_driver);
 
 /*
  * States of the probe state machine stored in nvme_probe_softc.action.
  * nvme_probe_start() handles only IDENTIFY and RESET; any other value there
  * results in a panic.
  */
 typedef enum {
 	NVME_PROBE_IDENTIFY,
 	NVME_PROBE_DONE,
 	NVME_PROBE_INVALID,
 	NVME_PROBE_RESET
 } nvme_probe_action;
 
 /*
  * Printable names for nvme_probe_action, indexed by the enum value (see
  * NVME_PROBE_SET_ACTION).  Must stay in the same order as the enum.
  */
 static char *nvme_probe_action_text[] = {
 	"NVME_PROBE_IDENTIFY",
 	"NVME_PROBE_DONE",
 	"NVME_PROBE_INVALID",
 	"NVME_PROBE_RESET",
 };
 
 /*
  * Move the probe softc to a new state.  Emits a CAM_DEBUG_PROBE trace
  * naming the old and new states (both looked up in nvme_probe_action_text)
  * and then stores `newaction' in softc->action.  Both macro arguments are
  * evaluated more than once.
  */
 #define NVME_PROBE_SET_ACTION(softc, newaction)	\
 do {									\
 	char **text;							\
 	text = nvme_probe_action_text;					\
 	CAM_DEBUG((softc)->periph->path, CAM_DEBUG_PROBE,		\
 	    ("Probe %s to %s\n", text[(softc)->action],			\
 	    text[(newaction)]));					\
 	(softc)->action = (newaction);					\
 } while(0)
 
 /*
  * Flag bits for nvme_probe_softc.flags.  NVME_PROBE_NO_ANNOUNCE is set by
  * nvme_probe_schedule() when the request CCB carries CAM_EXPECT_INQ_CHANGE.
  */
 typedef enum {
 	NVME_PROBE_NO_ANNOUNCE	= 0x04
 } nvme_probe_flags;
 
 /*
  * Per-instance state of an nvme_probe periph.
  */
 typedef struct {
 	TAILQ_HEAD(, ccb_hdr) request_ccbs;	/* scan requests to complete */
 	nvme_probe_action	action;		/* current state */
 	nvme_probe_flags	flags;		/* NVME_PROBE_* bits */
 	int		restart;	/* set by nvme_scan_lun() to re-probe */
 	struct cam_periph *periph;	/* back pointer to owning periph */
 } nvme_probe_softc;
 
 /*
  * Quirk table.  It currently holds a single all-zero entry;
  * nvme_alloc_device() uses the last entry as the default quirk.  The
  * commented-out lines are a placeholder for a match pattern.
  */
 static struct nvme_quirk_entry nvme_quirk_table[] =
 {
 	{
 //		{
 //		  T_ANY, SIP_MEDIA_REMOVABLE|SIP_MEDIA_FIXED,
 //		  /*vendor*/"*", /*product*/"*", /*revision*/"*"
 //		},
 		.quirks = 0, .mintags = 0, .maxtags = 0
 	},
 };
 
 /* Number of entries in nvme_quirk_table. */
 static const int nvme_quirk_table_size =
 	sizeof(nvme_quirk_table) / sizeof(*nvme_quirk_table);
 
 static cam_status	nvme_probe_register(struct cam_periph *periph,
 				      void *arg);
 static void	 nvme_probe_schedule(struct cam_periph *nvme_probe_periph);
 static void	 nvme_probe_start(struct cam_periph *periph, union ccb *start_ccb);
 static void	 nvme_probe_cleanup(struct cam_periph *periph);
 //static void	 nvme_find_quirk(struct cam_ed *device);
 static void	 nvme_scan_lun(struct cam_periph *periph,
 			       struct cam_path *path, cam_flags flags,
 			       union ccb *ccb);
 static struct cam_ed *
 		 nvme_alloc_device(struct cam_eb *bus, struct cam_et *target,
 				   lun_id_t lun_id);
 static void	 nvme_device_transport(struct cam_path *path);
 static void	 nvme_dev_async(u_int32_t async_code,
 				struct cam_eb *bus,
 				struct cam_et *target,
 				struct cam_ed *device,
 				void *async_arg);
 static void	 nvme_action(union ccb *start_ccb);
 static void	 nvme_announce_periph(struct cam_periph *periph);
 static void	 nvme_proto_announce(struct cam_ed *device);
 static void	 nvme_proto_denounce(struct cam_ed *device);
 static void	 nvme_proto_debug_out(union ccb *ccb);
 
 /*
  * Transport operations vector for NVMe: device allocation, CCB action
  * dispatch, async event handling and periph announcement.
  */
 static struct xpt_xport_ops nvme_xport_ops = {
 	.alloc_device = nvme_alloc_device,
 	.action = nvme_action,
 	.async = nvme_dev_async,
 	.announce = nvme_announce_periph,
 };
 /*
  * Define a `struct xpt_xport' named nvme_xport_<x> for transport XPORT_<X>
  * that uses nvme_xport_ops, and hand it to CAM_XPT_XPORT().
  */
 #define NVME_XPT_XPORT(x, X)			\
 static struct xpt_xport nvme_xport_ ## x = {	\
 	.xport = XPORT_ ## X,			\
 	.name = #x,				\
 	.ops = &nvme_xport_ops,			\
 };						\
 CAM_XPT_XPORT(nvme_xport_ ## x);
 
 NVME_XPT_XPORT(nvme, NVME);
 
 #undef NVME_XPT_XPORT
 
 /*
  * Protocol operations vector for PROTO_NVME: announce/denounce a device and
  * dump a CCB for debugging.
  */
 static struct xpt_proto_ops nvme_proto_ops = {
 	.announce = nvme_proto_announce,
 	.denounce = nvme_proto_denounce,
 	.debug_out = nvme_proto_debug_out,
 };
 /* Protocol descriptor handed to CAM_XPT_PROTO(). */
 static struct xpt_proto nvme_proto = {
 	.proto = PROTO_NVME,
 	.name = "nvme",
 	.ops = &nvme_proto_ops,
 };
 CAM_XPT_PROTO(nvme_proto);
 
 /*
  * Init hook referenced from nvme_probe_driver.  There is nothing to set up,
  * so the body is intentionally empty.
  */
 static void
 nvme_probe_periph_init(void)
 {
 
 }
 
 static cam_status
 nvme_probe_register(struct cam_periph *periph, void *arg)
 {
 	union ccb *request_ccb;	/* CCB representing the probe request */
 	nvme_probe_softc *softc;
 
 	request_ccb = (union ccb *)arg;
 	if (request_ccb == NULL) {
 		printf("nvme_probe_register: no probe CCB, "
 		       "can't register device\n");
 		return(CAM_REQ_CMP_ERR);
 	}
 
 	softc = (nvme_probe_softc *)malloc(sizeof(*softc), M_CAMXPT, M_ZERO | M_NOWAIT);
 
 	if (softc == NULL) {
 		printf("nvme_probe_register: Unable to probe new device. "
 		       "Unable to allocate softc\n");
 		return(CAM_REQ_CMP_ERR);
 	}
 	TAILQ_INIT(&softc->request_ccbs);
 	TAILQ_INSERT_TAIL(&softc->request_ccbs, &request_ccb->ccb_h,
 			  periph_links.tqe);
 	softc->flags = 0;
 	periph->softc = softc;
 	softc->periph = periph;
 	softc->action = NVME_PROBE_INVALID;
 	if (cam_periph_acquire(periph) != 0)
 		return (CAM_REQ_CMP_ERR);
 
 	CAM_DEBUG(periph->path, CAM_DEBUG_PROBE, ("Probe started\n"));
 
 //	nvme_device_transport(periph->path);
 	nvme_probe_schedule(periph);
 
 	return(CAM_REQ_CMP);
 }
 
 /*
  * Arm the probe: move the state machine to NVME_PROBE_IDENTIFY, record
  * whether the head request asked for a silent probe
  * (CAM_EXPECT_INQ_CHANGE -> NVME_PROBE_NO_ANNOUNCE), and queue the periph
  * with the XPT at CAM_PRIORITY_XPT.
  */
 static void
 nvme_probe_schedule(struct cam_periph *periph)
 {
 	nvme_probe_softc *sc = (nvme_probe_softc *)periph->softc;
 	union ccb *head_req = (union ccb *)TAILQ_FIRST(&sc->request_ccbs);
 
 	NVME_PROBE_SET_ACTION(sc, NVME_PROBE_IDENTIFY);
 
 	if ((head_req->crcn.flags & CAM_EXPECT_INQ_CHANGE) == 0)
 		sc->flags &= ~NVME_PROBE_NO_ANNOUNCE;
 	else
 		sc->flags |= NVME_PROBE_NO_ANNOUNCE;
 
 	xpt_schedule(periph, CAM_PRIORITY_XPT);
 }
 
 static void
 nvme_probe_start(struct cam_periph *periph, union ccb *start_ccb)
 {
 	struct ccb_nvmeio *nvmeio;
 	struct ccb_scsiio *csio;
 	nvme_probe_softc *softc;
 	struct cam_path *path;
 	const struct nvme_namespace_data *nvme_data;
 	lun_id_t lun;
 
 	CAM_DEBUG(start_ccb->ccb_h.path, CAM_DEBUG_TRACE, ("nvme_probe_start\n"));
 
 	softc = (nvme_probe_softc *)periph->softc;
 	path = start_ccb->ccb_h.path;
 	nvmeio = &start_ccb->nvmeio;
 	csio = &start_ccb->csio;
 	nvme_data = periph->path->device->nvme_data;
 
 	if (softc->restart) {
 		softc->restart = 0;
 		if (periph->path->device->flags & CAM_DEV_UNCONFIGURED)
 			NVME_PROBE_SET_ACTION(softc, NVME_PROBE_RESET);
 		else
 			NVME_PROBE_SET_ACTION(softc, NVME_PROBE_IDENTIFY);
 	}
 
 	/*
 	 * Other transports have to ask their SIM to do a lot of action.
 	 * NVMe doesn't, so don't do the dance. Just do things
 	 * directly.
 	 */
 	switch (softc->action) {
 	case NVME_PROBE_RESET:
 		/* FALLTHROUGH */
 	case NVME_PROBE_IDENTIFY:
 		nvme_device_transport(path);
 		/*
 		 * Test for lun == CAM_LUN_WILDCARD is lame, but
 		 * appears to be necessary here. XXX
 		 */
 		lun = xpt_path_lun_id(periph->path);
 		if (lun == CAM_LUN_WILDCARD ||
 		    periph->path->device->flags & CAM_DEV_UNCONFIGURED) {
 			path->device->flags &= ~CAM_DEV_UNCONFIGURED;
 			xpt_acquire_device(path->device);
 			start_ccb->ccb_h.func_code = XPT_GDEV_TYPE;
 			xpt_action(start_ccb);
 			xpt_async(AC_FOUND_DEVICE, path, start_ccb);
 		}
 		NVME_PROBE_SET_ACTION(softc, NVME_PROBE_DONE);
 		break;
 	default:
 		panic("nvme_probe_start: invalid action state 0x%x\n", softc->action);
 	}
 	/*
 	 * Probing is now done. We need to complete any lingering items
 	 * in the queue, though there shouldn't be any.
 	 */
 	xpt_release_ccb(start_ccb);
 	CAM_DEBUG(periph->path, CAM_DEBUG_PROBE, ("Probe completed\n"));
 	while ((start_ccb = (union ccb *)TAILQ_FIRST(&softc->request_ccbs))) {
 		TAILQ_REMOVE(&softc->request_ccbs,
 		    &start_ccb->ccb_h, periph_links.tqe);
 		start_ccb->ccb_h.status = CAM_REQ_CMP;
 		xpt_done(start_ccb);
 	}
 	cam_periph_invalidate(periph);
 	cam_periph_release_locked(periph);
 }
 
 /*
  * Destructor callback passed to cam_periph_alloc() by nvme_scan_lun():
  * frees the softc allocated in nvme_probe_register().
  */
 static void
 nvme_probe_cleanup(struct cam_periph *periph)
 {
 
 	free(periph->softc, M_CAMXPT);
 }
 
 #if 0
 /* XXX should be used, don't delete */
 /*
  * Disabled: look up the quirk entry matching the device's identify data
  * and install it (and, for CAM_QUIRK_MAXTAGS, its tag limits) in the
  * device.  Panics if nothing matches.
  */
 static void
 nvme_find_quirk(struct cam_ed *device)
 {
 	struct nvme_quirk_entry *quirk;
 	caddr_t	match;
 
 	match = cam_quirkmatch((caddr_t)&device->nvme_data,
 			       (caddr_t)nvme_quirk_table,
 			       nvme_quirk_table_size,
 			       sizeof(*nvme_quirk_table), nvme_identify_match);
 
 	if (match == NULL)
 		panic("xpt_find_quirk: device didn't match wildcard entry!!");
 
 	quirk = (struct nvme_quirk_entry *)match;
 	device->quirk = quirk;
 	if (quirk->quirks & CAM_QUIRK_MAXTAGS) {
 		device->mintags = quirk->mintags;
 		device->maxtags = quirk->maxtags;
 	}
 }
 #endif
 
 /*
  * Handle a scan request for `path'.
  *
  * After a successful path inquiry, wildcard-LUN (bus level) requests are
  * completed immediately with CAM_REQ_CMP.  Otherwise the request is either
  * queued on an existing, still valid nvme_probe periph (which is flagged
  * for restart) or a new nvme_probe periph is allocated to service it.  The
  * path lock is taken here only if the caller does not already own it.
  *
  * The `periph' and `flags' parameters are not referenced in the body.
  *
  * NOTE(review): request_ccb is NULL-checked only on the path-inquiry
  * failure branch; every later branch dereferences it unconditionally.
  * The only caller visible here (nvme_action) always passes a CCB --
  * confirm no other caller passes NULL.
  */
 static void
 nvme_scan_lun(struct cam_periph *periph, struct cam_path *path,
 	     cam_flags flags, union ccb *request_ccb)
 {
 	struct ccb_pathinq cpi;
 	cam_status status;
 	struct cam_periph *old_periph;
 	int lock;
 
 	CAM_DEBUG(path, CAM_DEBUG_TRACE, ("nvme_scan_lun\n"));
 
 	xpt_path_inq(&cpi, path);
 
 	/* Propagate a failed path inquiry to the requester, if any. */
 	if (cpi.ccb_h.status != CAM_REQ_CMP) {
 		if (request_ccb != NULL) {
 			request_ccb->ccb_h.status = cpi.ccb_h.status;
 			xpt_done(request_ccb);
 		}
 		return;
 	}
 
 	if (xpt_path_lun_id(path) == CAM_LUN_WILDCARD) {
 		CAM_DEBUG(path, CAM_DEBUG_TRACE, ("nvme_scan_lun ignoring bus\n"));
 		request_ccb->ccb_h.status = CAM_REQ_CMP;	/* XXX signal error ? */
 		xpt_done(request_ccb);
 		return;
 	}
 
 	/* Only lock the path if it is not already owned. */
 	lock = (xpt_path_owned(path) == 0);
 	if (lock)
 		xpt_path_lock(path);
 	if ((old_periph = cam_periph_find(path, "nvme_probe")) != NULL) {
 		if ((old_periph->flags & CAM_PERIPH_INVALID) == 0) {
 			nvme_probe_softc *softc;
 
 			/* Live probe: piggyback on it and ask for a restart. */
 			softc = (nvme_probe_softc *)old_periph->softc;
 			TAILQ_INSERT_TAIL(&softc->request_ccbs,
 				&request_ccb->ccb_h, periph_links.tqe);
 			softc->restart = 1;
 			CAM_DEBUG(path, CAM_DEBUG_TRACE,
 			    ("restarting nvme_probe device\n"));
 		} else {
 			/* Probe periph is being torn down: fail the request. */
 			request_ccb->ccb_h.status = CAM_REQ_CMP_ERR;
 			CAM_DEBUG(path, CAM_DEBUG_TRACE,
 			    ("Failing to restart nvme_probe device\n"));
 			xpt_done(request_ccb);
 		}
 	} else {
 		CAM_DEBUG(path, CAM_DEBUG_TRACE,
 		    ("Adding nvme_probe device\n"));
 		/* On success nvme_probe_register() owns request_ccb. */
 		status = cam_periph_alloc(nvme_probe_register, NULL, nvme_probe_cleanup,
 					  nvme_probe_start, "nvme_probe",
 					  CAM_PERIPH_BIO,
 					  request_ccb->ccb_h.path, NULL, 0,
 					  request_ccb);
 
 		if (status != CAM_REQ_CMP) {
 			/* NOTE(review): message names xpt_scan_lun, not this function. */
 			xpt_print(path, "xpt_scan_lun: cam_alloc_periph "
 			    "returned an error, can't continue probe\n");
 			request_ccb->ccb_h.status = status;
 			xpt_done(request_ccb);
 		}
 	}
 	if (lock)
 		xpt_path_unlock(path);
 }
 
 /*
  * Transport alloc_device hook.  Allocates a generic CAM device through
  * xpt_alloc_device() and fills in the NVMe defaults.  Returns NULL when the
  * underlying allocation fails.
  */
 static struct cam_ed *
 nvme_alloc_device(struct cam_eb *bus, struct cam_et *target, lun_id_t lun_id)
 {
 	struct cam_ed *dev;
 
 	dev = xpt_alloc_device(bus, target, lun_id);
 	if (dev != NULL) {
 		/*
 		 * Start out with the default (last) quirk entry; a better
 		 * one can be chosen once inquiry data from nvme is
 		 * available.
 		 */
 		dev->quirk = (void *)&nvme_quirk_table[nvme_quirk_table_size - 1];
 		dev->mintags = 0;
 		dev->maxtags = 0;
 		dev->inq_flags = 0;
 		dev->queue_flags = 0;
 		/* XXX Need to set the device id and serial number somewhere */
 		dev->device_id = NULL;
 		dev->device_id_len = 0;
 		dev->serial_num = NULL;
 		dev->serial_num_len = 0;
 	}
 	return (dev);
 }
 
 /*
  * Copy the transport/protocol identity reported by the SIM's path inquiry
  * into the device on `path', then push the same values back to the SIM with
  * an XPT_SET_TRAN_SETTINGS request.
  */
 static void
 nvme_device_transport(struct cam_path *path)
 {
 	struct ccb_pathinq cpi;
 	struct ccb_trans_settings cts;
 	/* XXX get data from nvme namespace and other info ??? */
 
 	/* Get transport information from the SIM */
 	xpt_path_inq(&cpi, path);
 
 	path->device->transport = cpi.transport;
 	path->device->transport_version = cpi.transport_version;
 
 	path->device->protocol = cpi.protocol;
 	path->device->protocol_version = cpi.protocol_version;
 
 	/*
 	 * Tell the controller what we think.  The CCB lives on the stack, so
 	 * zero it first: only some of its fields are filled in below and the
 	 * rest must not reach the SIM as stack garbage.
 	 */
 	memset(&cts, 0, sizeof(cts));
 	xpt_setup_ccb(&cts.ccb_h, path, CAM_PRIORITY_NONE);
 	cts.ccb_h.func_code = XPT_SET_TRAN_SETTINGS;
 	cts.type = CTS_TYPE_CURRENT_SETTINGS;
 	cts.transport = path->device->transport;
 	cts.transport_version = path->device->transport_version;
 	cts.protocol = path->device->protocol;
 	cts.protocol_version = path->device->protocol_version;
 	cts.proto_specific.valid = 0;
 	cts.xport_specific.valid = 0;
 	xpt_action((union ccb *)&cts);
 }
 
 /*
  * Service an XPT_DEV_ADVINFO request.
  *
  * For reads, cdai->provsiz is set to the full size of the requested item
  * and at most cdai->bufsiz bytes are copied to cdai->buf.  Only
  * CDAI_TYPE_PHYS_PATH may be stored; a store request for any other type, or
  * an unknown type, leaves the status at CAM_REQ_INVALID.  A store whose
  * allocation fails completes with CAM_REQ_ABORTED.  Successful stores are
  * followed by an AC_ADVINFO_CHANGED async event.
  */
 static void
 nvme_dev_advinfo(union ccb *start_ccb)
 {
 	struct cam_ed *device;
 	struct ccb_dev_advinfo *cdai;
 	off_t amt;
 
 	start_ccb->ccb_h.status = CAM_REQ_INVALID;
 	device = start_ccb->ccb_h.path->device;
 	cdai = &start_ccb->cdai;
 	switch(cdai->buftype) {
 	case CDAI_TYPE_SCSI_DEVID:
 		if (cdai->flags & CDAI_FLAG_STORE)
 			return;
 		cdai->provsiz = device->device_id_len;
 		if (device->device_id_len == 0)
 			break;
 		amt = device->device_id_len;
 		if (cdai->provsiz > cdai->bufsiz)
 			amt = cdai->bufsiz;
 		memcpy(cdai->buf, device->device_id, amt);
 		break;
 	case CDAI_TYPE_SERIAL_NUM:
 		if (cdai->flags & CDAI_FLAG_STORE)
 			return;
 		cdai->provsiz = device->serial_num_len;
 		if (device->serial_num_len == 0)
 			break;
 		amt = device->serial_num_len;
 		if (cdai->provsiz > cdai->bufsiz)
 			amt = cdai->bufsiz;
 		memcpy(cdai->buf, device->serial_num, amt);
 		break;
 	case CDAI_TYPE_PHYS_PATH:
 		if (cdai->flags & CDAI_FLAG_STORE) {
 			/*
 			 * Drop the old path first and leave the device in a
 			 * consistent "no physpath" state, so that neither
 			 * the zero-length case nor an allocation failure
 			 * below leaves a dangling pointer or a length that
 			 * does not match the buffer.
 			 */
 			if (device->physpath != NULL)
 				free(device->physpath, M_CAMXPT);
 			device->physpath = NULL;
 			device->physpath_len = 0;
 			/* Clear existing buffer if zero length */
 			if (cdai->bufsiz == 0)
 				break;
 			device->physpath = malloc(cdai->bufsiz, M_CAMXPT, M_NOWAIT);
 			if (device->physpath == NULL) {
 				start_ccb->ccb_h.status = CAM_REQ_ABORTED;
 				return;
 			}
 			device->physpath_len = cdai->bufsiz;
 			memcpy(device->physpath, cdai->buf, cdai->bufsiz);
 		} else {
 			cdai->provsiz = device->physpath_len;
 			if (device->physpath_len == 0)
 				break;
 			amt = device->physpath_len;
 			if (cdai->provsiz > cdai->bufsiz)
 				amt = cdai->bufsiz;
 			memcpy(cdai->buf, device->physpath, amt);
 		}
 		break;
 	case CDAI_TYPE_NVME_CNTRL:
 		if (cdai->flags & CDAI_FLAG_STORE)
 			return;
 		amt = sizeof(struct nvme_controller_data);
 		cdai->provsiz = amt;
 		if (amt > cdai->bufsiz)
 			amt = cdai->bufsiz;
 		memcpy(cdai->buf, device->nvme_cdata, amt);
 		break;
 	case CDAI_TYPE_NVME_NS:
 		if (cdai->flags & CDAI_FLAG_STORE)
 			return;
 		amt = sizeof(struct nvme_namespace_data);
 		cdai->provsiz = amt;
 		if (amt > cdai->bufsiz)
 			amt = cdai->bufsiz;
 		memcpy(cdai->buf, device->nvme_data, amt);
 		break;
 	default:
 		return;
 	}
 	start_ccb->ccb_h.status = CAM_REQ_CMP;
 
 	/* Only reached for a completed store; notify listeners. */
 	if (cdai->flags & CDAI_FLAG_STORE) {
 		xpt_async(AC_ADVINFO_CHANGED, start_ccb->ccb_h.path,
 			  (void *)(uintptr_t)cdai->buftype);
 	}
 }
 
 static void
 nvme_action(union ccb *start_ccb)
 {
 	CAM_DEBUG(start_ccb->ccb_h.path, CAM_DEBUG_TRACE,
 	    ("nvme_action: func= %#x\n", start_ccb->ccb_h.func_code));
 
 	switch (start_ccb->ccb_h.func_code) {
 	case XPT_SCAN_BUS:
 	case XPT_SCAN_TGT:
 	case XPT_SCAN_LUN:
 		nvme_scan_lun(start_ccb->ccb_h.path->periph,
 			      start_ccb->ccb_h.path, start_ccb->crcn.flags,
 			      start_ccb);
 		break;
 	case XPT_DEV_ADVINFO:
 		nvme_dev_advinfo(start_ccb);
 		break;
 
 	default:
 		xpt_action_default(start_ccb);
 		break;
 	}
 }
 
 /*
  * Transport async hook: per-device event notifications that need action by
  * the XPT.  Only AC_LOST_DEVICE on a configured, non-wildcard device is
  * acted on: the device is marked unconfigured and one device reference is
  * released.
  */
 static void
 nvme_dev_async(u_int32_t async_code, struct cam_eb *bus, struct cam_et *target,
 	      struct cam_ed *device, void *async_arg)
 {
 
 	/* Wildcard targets and LUNs are not real devices. */
 	if (target->target_id == CAM_TARGET_WILDCARD)
 		return;
 	if (device->lun_id == CAM_LUN_WILDCARD)
 		return;
 
 	if (async_code != AC_LOST_DEVICE)
 		return;
 	/* Already unconfigured: nothing to undo. */
 	if ((device->flags & CAM_DEV_UNCONFIGURED) != 0)
 		return;
 
 	device->flags |= CAM_DEV_UNCONFIGURED;
 	xpt_release_device(device);
 }
 
 /*
  * Transport announce hook.  Queries the SIM for the current transport
  * settings and prints the NVMe spec version and PCIe link width/speed
  * (current and maximum) for `periph'.  Prints nothing if the SIM rejects
  * the query.  The periph lock must be held.
  */
 static void
 nvme_announce_periph(struct cam_periph *periph)
 {
 	struct	ccb_pathinq cpi;
 	struct	ccb_trans_settings cts;
 	struct	cam_path *path = periph->path;
 	struct ccb_trans_settings_nvme	*nvmex;
 
 	cam_periph_assert(periph, MA_OWNED);
 
 	/*
 	 * Ask the SIM for connection details.  The CCB lives on the stack,
 	 * so zero it before use rather than handing the SIM stale stack
 	 * contents in the fields that are not set explicitly.
 	 */
 	memset(&cts, 0, sizeof(cts));
 	xpt_setup_ccb(&cts.ccb_h, path, CAM_PRIORITY_NORMAL);
 	cts.ccb_h.func_code = XPT_GET_TRAN_SETTINGS;
 	cts.type = CTS_TYPE_CURRENT_SETTINGS;
 	xpt_action((union ccb*)&cts);
 	if ((cts.ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP)
 		return;
 	nvmex = &cts.xport_specific.nvme;
 
 	/*
 	 * Ask the SIM for its base transfer speed.
 	 * NOTE(review): the result in cpi is not used below.
 	 */
 	xpt_path_inq(&cpi, periph->path);
 	printf("%s%d: nvme version %d.%d x%d (max x%d) lanes PCIe Gen%d (max Gen%d) link",
 	    periph->periph_name, periph->unit_number,
 	    NVME_MAJOR(nvmex->spec),
 	    NVME_MINOR(nvmex->spec),
 	    nvmex->lanes, nvmex->max_lanes,
 	    nvmex->speed, nvmex->max_speed);
 	printf("\n");
 }
 
 /*
  * Protocol announce hook: format the device's controller and namespace
  * identification with nvme_print_ident() into a fixed 120 byte on-stack
  * sbuf and print the result.
  */
 static void
 nvme_proto_announce(struct cam_ed *device)
 {
 	struct sbuf	sb;
 	char		buffer[120];
 
 	sbuf_new(&sb, buffer, sizeof(buffer), SBUF_FIXEDLEN);
 	nvme_print_ident(device->nvme_cdata, device->nvme_data, &sb);
 	sbuf_finish(&sb);
 	sbuf_putbuf(&sb);
 }
 
 /*
  * Protocol denounce hook: prints the same identification line as
  * nvme_proto_announce().
  */
 static void
 nvme_proto_denounce(struct cam_ed *device)
 {
 
 	nvme_proto_announce(device);
 }
 
 /*
  * Protocol debug hook: for XPT_NVME_IO CCBs, emit a CAM_DEBUG_CDB trace
  * with the opcode name and a rendering of the NVMe command.  Other CCB
  * types are ignored.
  */
 static void
 nvme_proto_debug_out(union ccb *ccb)
 {
 	/* Scratch buffer handed to nvme_cmd_string(); 3 chars per command byte plus NUL. */
 	char cdb_str[(sizeof(struct nvme_command) * 3) + 1];
 
 	if (ccb->ccb_h.func_code != XPT_NVME_IO)
 		return;
 
 	CAM_DEBUG(ccb->ccb_h.path,
 	    CAM_DEBUG_CDB,("%s. NCB: %s\n", nvme_op_string(&ccb->nvmeio.cmd),
 		nvme_cmd_string(&ccb->nvmeio.cmd, cdb_str, sizeof(cdb_str))));
 }
 
Index: head/sys/dev/nvme/nvme_sim.c
===================================================================
--- head/sys/dev/nvme/nvme_sim.c	(revision 333433)
+++ head/sys/dev/nvme/nvme_sim.c	(revision 333434)
@@ -1,433 +1,432 @@
 /*-
  * Copyright (c) 2016 Netflix, Inc
- * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer,
  *    without modification, immediately at the beginning of the file.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/buf.h>
 #include <sys/bus.h>
 #include <sys/conf.h>
 #include <sys/ioccom.h>
 #include <sys/malloc.h>
 #include <sys/proc.h>
 #include <sys/smp.h>
 
 #include <cam/cam.h>
 #include <cam/cam_ccb.h>
 #include <cam/cam_sim.h>
 #include <cam/cam_xpt_sim.h>
 #include <cam/cam_xpt_internal.h>	// Yes, this is wrong.
 #include <cam/cam_debug.h>
 
 #include <dev/pci/pcivar.h>
 #include <dev/pci/pcireg.h>
 
 #include "nvme_private.h"
 
 /* Aliases for the two SIM-private pointer slots of a CCB header. */
 #define ccb_accb_ptr spriv_ptr0
 #define ccb_ctrlr_ptr spriv_ptr1
 static void	nvme_sim_action(struct cam_sim *sim, union ccb *ccb);
 static void	nvme_sim_poll(struct cam_sim *sim);
 
 /* Accessors from a cam_sim to the nvme_sim_softc attached to it. */
 #define sim2softc(sim)	((struct nvme_sim_softc *)cam_sim_softc(sim))
 #define sim2ns(sim)	(sim2softc(sim)->s_ns)
 #define sim2ctrlr(sim)	(sim2softc(sim)->s_ctrlr)
 
 /*
  * Per-controller SIM state, allocated in nvme_sim_new_controller() and
  * completed in nvme_sim_new_ns().
  */
 struct nvme_sim_softc
 {
 	struct nvme_controller	*s_ctrlr;	/* owning controller */
 	struct nvme_namespace	*s_ns;		/* namespace set by nvme_sim_new_ns() */
 	struct cam_sim		*s_sim;		/* CAM SIM for this controller */
 	struct cam_path		*s_path;	/* path created in nvme_sim_new_ns() */
 };
 
 /*
  * Completion callback for requests submitted by nvme_sim_nvmeio().
  * `ccb_arg' is the originating CCB.  The raw completion entry is copied
  * into the CCB so the periph can interpret it; the CAM status is a best
  * guess: CAM_REQ_CMP_ERR when the completion reports an error, otherwise
  * CAM_REQ_CMP.  Successful requests complete via xpt_done_direct(), failed
  * ones via xpt_done().
  */
 static void
 nvme_sim_nvmeio_done(void *ccb_arg, const struct nvme_completion *cpl)
 {
 	union ccb *done_ccb;
 
 	done_ccb = (union ccb *)ccb_arg;
 	memcpy(&done_ccb->nvmeio.cpl, cpl, sizeof(*cpl));
 	done_ccb->ccb_h.status &= ~CAM_SIM_QUEUED;
 
 	if (!nvme_completion_is_error(cpl)) {
 		done_ccb->ccb_h.status = CAM_REQ_CMP;
 		xpt_done_direct(done_ccb);
 		return;
 	}
 
 	done_ccb->ccb_h.status = CAM_REQ_CMP_ERR;
 	xpt_done(done_ccb);
 }
 
 /*
  * Turn an XPT_NVME_IO / XPT_NVME_ADMIN CCB into an nvme_request and submit
  * it to the controller.
  *
  * The request flavour is chosen from the CCB's data flags and pointer: a
  * bio, the CCB itself (scatter/gather), no payload, or a plain kernel
  * virtual address.  If no request can be allocated the CCB completes with
  * CAM_RESRC_UNAVAIL.  Otherwise the CCB is marked CAM_SIM_QUEUED and
  * nvme_sim_nvmeio_done() is registered as the completion callback.
  */
 static void
 nvme_sim_nvmeio(struct cam_sim *sim, union ccb *ccb)
 {
 	struct ccb_nvmeio	*nvmeio = &ccb->nvmeio;
 	struct nvme_request	*req;
 	void			*payload;
 	uint32_t		size;
 	struct nvme_controller *ctrlr;
 
 	ctrlr = sim2ctrlr(sim);
 	payload = nvmeio->data_ptr;
 	size = nvmeio->dxfer_len;
 	/* SG LIST ??? */
 	if ((nvmeio->ccb_h.flags & CAM_DATA_MASK) == CAM_DATA_BIO)
 		req = nvme_allocate_request_bio((struct bio *)payload,
 		    nvme_sim_nvmeio_done, ccb);
 	else if ((nvmeio->ccb_h.flags & CAM_DATA_SG) == CAM_DATA_SG)
 		req = nvme_allocate_request_ccb(ccb, nvme_sim_nvmeio_done, ccb);
 	else if (payload == NULL)
 		req = nvme_allocate_request_null(nvme_sim_nvmeio_done, ccb);
 	else
 		req = nvme_allocate_request_vaddr(payload, size,
 		    nvme_sim_nvmeio_done, ccb);
 
 	if (req == NULL) {
 		nvmeio->ccb_h.status = CAM_RESRC_UNAVAIL;
 		xpt_done(ccb);
 		return;
 	}
 	ccb->ccb_h.status |= CAM_SIM_QUEUED;
 
 	/* The periph built the command; copy it into the request verbatim. */
 	memcpy(&req->cmd, &ccb->nvmeio.cmd, sizeof(ccb->nvmeio.cmd));
 
 	/* Anything that is not XPT_NVME_IO goes to the admin queue. */
 	if (ccb->ccb_h.func_code == XPT_NVME_IO)
 		nvme_ctrlr_submit_io_request(ctrlr, req);
 	else
 		nvme_ctrlr_submit_admin_request(ctrlr, req);
 }
 
 /*
  * Estimate the controller's PCIe link bandwidth: the per-lane rate for the
  * negotiated link speed (looked up in a small table) multiplied by the
  * negotiated lane count, both read from the PCIe link status register.
  */
 static uint32_t
 nvme_link_kBps(struct nvme_controller *ctrlr)
 {
 	uint32_t link[] = { 1, 250000, 500000, 985000, 1970000 };
 	uint32_t link_sta, gen, width, per_lane;
 
 	link_sta = pcie_read_config(ctrlr->dev, PCIER_LINK_STA, 2);
 	gen = link_sta & PCIEM_LINK_STA_SPEED;
 	width = (link_sta & PCIEM_LINK_STA_WIDTH) >> 4;
 
 	/*
 	 * Failsafe on the link speed indicator: an out-of-range value maps
 	 * to entry 0 (rate 1), so the lane count itself is reported.  Not
 	 * 100% accurate, but may be diagnostic.
 	 */
 	per_lane = (gen < nitems(link)) ? link[gen] : link[0];
 
 	return (per_lane * width);
 }
 
 /*
  * SIM action entry point: dispatch a CCB by function code.
  *
  * Every request except XPT_NVME_IO / XPT_NVME_ADMIN is completed here via
  * xpt_done(); those two are handed to nvme_sim_nvmeio(), which completes
  * them itself.  Must be called with the controller lock held.
  */
 static void
 nvme_sim_action(struct cam_sim *sim, union ccb *ccb)
 {
 	struct nvme_controller *ctrlr;
 	struct nvme_namespace *ns;
 
 	CAM_DEBUG(ccb->ccb_h.path, CAM_DEBUG_TRACE,
 	    ("nvme_sim_action: func= %#x\n",
 		ccb->ccb_h.func_code));
 
 	/*
 	 * XXX when we support multiple namespaces in the base driver we'll need
 	 * to revisit how all this gets stored and saved in the periph driver's
 	 * reserved areas. Right now we store all three in the softc of the sim.
 	 */
 	ns = sim2ns(sim);
 	ctrlr = sim2ctrlr(sim);
 
 	mtx_assert(&ctrlr->lock, MA_OWNED);
 
 	switch (ccb->ccb_h.func_code) {
 	case XPT_CALC_GEOMETRY:		/* Calculate Geometry Totally nuts ? XXX */
 		/*
 		 * Only meaningful for old-school SCSI disks since only the SCSI
 		 * da driver generates them. Reject all these that slip through.
 		 */
 		/*FALLTHROUGH*/
 	case XPT_ABORT:			/* Abort the specified CCB */
 		ccb->ccb_h.status = CAM_REQ_INVALID;
 		break;
 	case XPT_SET_TRAN_SETTINGS:
 		/*
 		 * NVMe doesn't really have different transfer settings, but
 		 * other parts of CAM think failure here is a big deal.
 		 */
 		ccb->ccb_h.status = CAM_REQ_CMP;
 		break;
 	case XPT_PATH_INQ:		/* Path routing inquiry */
 	{
 		struct ccb_pathinq	*cpi = &ccb->cpi;
 		device_t		dev = ctrlr->dev;
 
 		/*
 		 * NVMe may have multiple LUNs on the same path. Current generation
 		 * of NVMe devices support only a single name space. Multiple name
 		 * space drives are coming, but it's unclear how we should report
 		 * them up the stack.
 		 */
 		cpi->version_num = 1;
 		cpi->hba_inquiry = 0;
 		cpi->target_sprt = 0;
 		cpi->hba_misc =  PIM_UNMAPPED /* | PIM_NOSCAN */;
 		cpi->hba_eng_cnt = 0;
 		cpi->max_target = 0;
 		cpi->max_lun = ctrlr->cdata.nn;
 		cpi->maxio = nvme_ns_get_max_io_xfer_size(ns);
 		cpi->initiator_id = 0;
 		cpi->bus_id = cam_sim_bus(sim);
 		cpi->base_transfer_speed = nvme_link_kBps(ctrlr);
 		strlcpy(cpi->sim_vid, "FreeBSD", SIM_IDLEN);
 		strlcpy(cpi->hba_vid, "NVMe", HBA_IDLEN);
 		strlcpy(cpi->dev_name, cam_sim_name(sim), DEV_IDLEN);
 		cpi->unit_number = cam_sim_unit(sim);
 		cpi->transport = XPORT_NVME;		/* XXX XPORT_PCIE ? */
 		/* Both version fields carry the controller's VS register. */
 		cpi->transport_version = nvme_mmio_read_4(ctrlr, vs);
 		cpi->protocol = PROTO_NVME;
 		cpi->protocol_version = nvme_mmio_read_4(ctrlr, vs);
 		/* PCI location of the controller, for consumers of the path. */
 		cpi->xport_specific.nvme.nsid = ns->id;
 		cpi->xport_specific.nvme.domain = pci_get_domain(dev);
 		cpi->xport_specific.nvme.bus = pci_get_bus(dev);
 		cpi->xport_specific.nvme.slot = pci_get_slot(dev);
 		cpi->xport_specific.nvme.function = pci_get_function(dev);
 		cpi->xport_specific.nvme.extra = 0;
 		cpi->ccb_h.status = CAM_REQ_CMP;
 		break;
 	}
 	case XPT_GET_TRAN_SETTINGS:	/* Get transport settings */
 	{
 		struct ccb_trans_settings	*cts;
 		struct ccb_trans_settings_nvme	*nvmep;
 		struct ccb_trans_settings_nvme	*nvmex;
 		device_t dev;
 		uint32_t status, caps;
 
 		dev = ctrlr->dev;
 		cts = &ccb->cts;
 		nvmex = &cts->xport_specific.nvme;
 		nvmep = &cts->proto_specific.nvme;
 
 		/* Current link state and maximum link capabilities. */
 		status = pcie_read_config(dev, PCIER_LINK_STA, 2);
 		caps = pcie_read_config(dev, PCIER_LINK_CAP, 2);
 		nvmex->valid = CTS_NVME_VALID_SPEC | CTS_NVME_VALID_LINK;
 		nvmex->spec = nvme_mmio_read_4(ctrlr, vs);
 		nvmex->speed = status & PCIEM_LINK_STA_SPEED;
 		nvmex->lanes = (status & PCIEM_LINK_STA_WIDTH) >> 4;
 		nvmex->max_speed = caps & PCIEM_LINK_CAP_MAX_SPEED;
 		nvmex->max_lanes = (caps & PCIEM_LINK_CAP_MAX_WIDTH) >> 4;
 
 		/* XXX these should be something else maybe ? */
 		nvmep->valid = 1;
 		nvmep->spec = nvmex->spec;
 
 		cts->transport = XPORT_NVME;
 		cts->protocol = PROTO_NVME;
 		cts->ccb_h.status = CAM_REQ_CMP;
 		break;
 	}
 	case XPT_TERM_IO:		/* Terminate the I/O process */
 		/*
 		 * every driver handles this, but nothing generates it. Assume
 		 * it's OK to just say 'that worked'.
 		 */
 		/*FALLTHROUGH*/
 	case XPT_RESET_DEV:		/* Bus Device Reset the specified device */
 	case XPT_RESET_BUS:		/* Reset the specified bus */
 		/*
 		 * NVMe doesn't really support physically resetting the bus. It's part
 		 * of the bus scanning dance, so return success to tell the process to
 		 * proceed.
 		 */
 		ccb->ccb_h.status = CAM_REQ_CMP;
 		break;
 	case XPT_NVME_IO:		/* Execute the requested I/O operation */
 	case XPT_NVME_ADMIN:		/* or Admin operation */
 		nvme_sim_nvmeio(sim, ccb);
 		return;			/* no done */
 	default:
 		ccb->ccb_h.status = CAM_REQ_INVALID;
 		break;
 	}
 	xpt_done(ccb);
 }
 
 /*
  * SIM poll entry point: polls the controller behind `sim' via
  * nvme_ctrlr_poll().
  */
 static void
 nvme_sim_poll(struct cam_sim *sim)
 {
 
 	nvme_ctrlr_poll(sim2ctrlr(sim));
 }
 
 /*
  * Consumer callback for a newly attached controller.  Allocates the SIM
  * queue, the softc and the CAM SIM (sized by the controller's
  * max_hw_pend_io and protected by the controller lock).  Returns the softc
  * as the consumer cookie, or NULL when the queue or the SIM cannot be
  * allocated.
  */
 static void *
 nvme_sim_new_controller(struct nvme_controller *ctrlr)
 {
 	struct nvme_sim_softc *softc;
 	struct cam_devq *simq;
 	int depth, unit_no;
 
 	depth = ctrlr->max_hw_pend_io;
 	unit_no = device_get_unit(ctrlr->dev);
 
 	simq = cam_simq_alloc(depth);
 	if (simq == NULL)
 		return (NULL);
 
 	softc = malloc(sizeof(*softc), M_NVME, M_ZERO | M_WAITOK);
 	softc->s_ctrlr = ctrlr;
 	softc->s_sim = cam_sim_alloc(nvme_sim_action, nvme_sim_poll,
 	    "nvme", softc, unit_no, &ctrlr->lock, depth, depth, simq);
 	if (softc->s_sim != NULL)
 		return (softc);
 
 	/* SIM allocation failed: undo everything allocated above. */
 	printf("Failed to allocate a sim\n");
 	cam_simq_free(simq);
 	free(softc, M_NVME);
 	return (NULL);
 }
 
 /*
  * Queue a rescan of `path'.  A CCB is allocated without sleeping and given
  * its own clone of the path before being handed to xpt_rescan().  On
  * allocation or clone failure a message is printed and nothing is queued.
  * `ctrlr' is not used.
  */
 static void
 nvme_sim_rescan_target(struct nvme_controller *ctrlr, struct cam_path *path)
 {
 	union ccb *scan_ccb;
 
 	scan_ccb = xpt_alloc_ccb_nowait();
 	if (scan_ccb == NULL) {
 		printf("unable to alloc CCB for rescan\n");
 		return;
 	}
 
 	if (xpt_clone_path(&scan_ccb->ccb_h.path, path) == CAM_REQ_CMP) {
 		xpt_rescan(scan_ccb);
 		return;
 	}
 
 	printf("unable to copy path for rescan\n");
 	xpt_free_ccb(scan_ccb);
 }
 	
 /*
  * Consumer callback for a newly discovered namespace.  `sc_arg' is the
  * softc returned by nvme_sim_new_controller().
  *
  * With the controller lock held: registers the SIM's bus, creates a path
  * for the namespace (target 1, LUN = namespace id), attaches the namespace
  * and controller identify data to the path's device and queues a rescan.
  * Returns `ns' on success.  On failure whatever was set up is torn down,
  * the SIM is freed and NULL is returned.
  */
 static void *
 nvme_sim_new_ns(struct nvme_namespace *ns, void *sc_arg)
 {
 	struct nvme_sim_softc *sc = sc_arg;
 	struct nvme_controller *ctrlr = sc->s_ctrlr;
 	int i;
 
 	sc->s_ns = ns;
 
 	/*
 	 * XXX this is creating one bus per ns, but it should be one
 	 * XXX target per controller, and one LUN per namespace.
 	 * XXX Current drives only support one NS, so there's time
 	 * XXX to fix it later when new drives arrive.
 	 *
 	 * XXX I'm pretty sure the xpt_bus_register() call below is
 	 * XXX like super lame and it really belongs in the sim_new_ctrlr
 	 * XXX callback. Then the create_path below would be pretty close
 	 * XXX to being right. Except we should be per-ns not per-ctrlr
 	 * XXX data.
 	 */
 
 	mtx_lock(&ctrlr->lock);
 /* Create bus */
 
 	/*
 	 * XXX do I need to lock ctrlr->lock ?
 	 * XXX do I need to lock the path?
 	 * ata and scsi seem to in their code, but their discovery is
 	 * somewhat more asynchronous. We're only ever called one at a
 	 * time, and nothing is in parallel.
 	 */
 
 	/* `i' counts completed setup steps so the error path can unwind. */
 	i = 0;
 	if (xpt_bus_register(sc->s_sim, ctrlr->dev, 0) != CAM_SUCCESS)
 		goto error;
 	i++;
 	if (xpt_create_path(&sc->s_path, /*periph*/NULL, cam_sim_path(sc->s_sim),
 	    1, ns->id) != CAM_REQ_CMP)
 		goto error;
 	i++;
 
 	sc->s_path->device->nvme_data = nvme_ns_get_data(ns);
 	sc->s_path->device->nvme_cdata = nvme_ctrlr_get_data(ns->ctrlr);
 
 /* Scan bus */
 	nvme_sim_rescan_target(ctrlr, sc->s_path);
 
 	mtx_unlock(&ctrlr->lock);
 
 	return ns;
 
 error:
 	/*
 	 * Unwind in reverse order of setup; each case deliberately falls
 	 * through to the next.  No `goto error' follows the second
 	 * increment, so case 2 is currently unreachable.
 	 */
 	switch (i) {
 	case 2:
 		xpt_free_path(sc->s_path);
 		/* FALLTHROUGH */
 	case 1:
 		xpt_bus_deregister(cam_sim_path(sc->s_sim));
 		/* FALLTHROUGH */
 	case 0:
 		cam_sim_free(sc->s_sim, /*free_devq*/TRUE);
 	}
 	mtx_unlock(&ctrlr->lock);
 	return NULL;
 }
 
 /*
  * Consumer callback for a failed controller.  Currently a stub: nothing is
  * torn down.
  */
 static void
 nvme_sim_controller_fail(void *ctrlr_arg)
 {
 	/* XXX cleanup XXX */
 }
 
 /* Handle returned by nvme_register_consumer(); used to unregister later. */
 struct nvme_consumer *consumer_cookie;
 
 /*
  * SYSINIT hook: unless nvme_use_nvd is set, register this file's callbacks
  * (new namespace, new controller, controller failure) as an NVMe consumer.
  * The third callback slot is left NULL.
  */
 static void
 nvme_sim_init(void)
 {
 	if (nvme_use_nvd)
 		return;
 
 	consumer_cookie = nvme_register_consumer(nvme_sim_new_ns,
 	    nvme_sim_new_controller, NULL, nvme_sim_controller_fail);
 }
 
 SYSINIT(nvme_sim_register, SI_SUB_DRIVERS, SI_ORDER_ANY,
     nvme_sim_init, NULL);
 
 /*
  * SYSUNINIT hook: unless nvme_use_nvd is set, unregister the consumer that
  * nvme_sim_init() registered.  No other cleanup is performed yet.
  */
 static void
 nvme_sim_uninit(void)
 {
 	if (nvme_use_nvd)
 		return;
 	/* XXX Cleanup */
 
 	nvme_unregister_consumer(consumer_cookie);
 }
 
 SYSUNINIT(nvme_sim_unregister, SI_SUB_DRIVERS, SI_ORDER_ANY,
     nvme_sim_uninit, NULL);