diff --git a/share/man/man4/nvme.4 b/share/man/man4/nvme.4 index 011ff483c839..dcd2ec86f5fa 100644 --- a/share/man/man4/nvme.4 +++ b/share/man/man4/nvme.4 @@ -1,264 +1,273 @@ .\" .\" Copyright (c) 2012-2016 Intel Corporation .\" All rights reserved. .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice, this list of conditions, and the following disclaimer, .\" without modification. .\" 2. Redistributions in binary form must reproduce at minimum a disclaimer .\" substantially similar to the "NO WARRANTY" disclaimer below .\" ("Disclaimer") and any redistribution must be conditioned upon .\" including a substantially similar Disclaimer requirement for further .\" binary redistribution. .\" .\" NO WARRANTY .\" THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS .\" "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT .\" LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR .\" A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT .\" HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, .\" STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING .\" IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE .\" POSSIBILITY OF SUCH DAMAGES. .\" .\" nvme driver man page. .\" .\" Author: Jim Harris .\" .Dd June 6, 2020 .Dt NVME 4 .Os .Sh NAME .Nm nvme .Nd NVM Express core driver .Sh SYNOPSIS To compile this driver into your kernel, place the following line in your kernel configuration file: .Bd -ragged -offset indent .Cd "device nvme" .Ed .Pp Or, to load the driver as a module at boot, place the following line in .Xr loader.conf 5 : .Bd -literal -offset indent nvme_load="YES" .Ed .Pp Most users will also want to enable .Xr nvd 4 or .Xr nda 4 to expose NVM Express namespaces as disk devices which can be partitioned. Note that in NVM Express terms, a namespace is roughly equivalent to a SCSI LUN. .Sh DESCRIPTION The .Nm driver provides support for NVM Express (NVMe) controllers, such as: .Bl -bullet .It Hardware initialization .It Per-CPU IO queue pairs .It API for registering NVMe namespace consumers such as .Xr nvd 4 or .Xr nda 4 .It API for submitting NVM commands to namespaces .It Ioctls for controller and namespace configuration and management .El .Pp The .Nm driver creates controller device nodes in the format .Pa /dev/nvmeX and namespace device nodes in the format .Pa /dev/nvmeXnsY . Note that the NVM Express specification starts numbering namespaces at 1, not 0, and this driver follows that convention. .Sh CONFIGURATION By default, .Nm will create an I/O queue pair for each CPU, provided enough MSI-X vectors and NVMe queue pairs can be allocated. If not enough vectors or queue pairs are available, nvme(4) will use a smaller number of queue pairs and assign multiple CPUs per queue pair. 
.Pp To force a single I/O queue pair shared by all CPUs, set the following tunable value in .Xr loader.conf 5 : .Bd -literal -offset indent hw.nvme.per_cpu_io_queues=0 .Ed .Pp To assign more than one CPU per I/O queue pair, thereby reducing the number of MSI-X vectors consumed by the device, set the following tunable value in .Xr loader.conf 5 : .Bd -literal -offset indent hw.nvme.min_cpus_per_ioq=X .Ed .Pp To force legacy interrupts for all .Nm driver instances, set the following tunable value in .Xr loader.conf 5 : .Bd -literal -offset indent hw.nvme.force_intx=1 .Ed .Pp Note that use of INTx implies disabling of per-CPU I/O queue pairs. .Pp To control maximum amount of system RAM in bytes to use as Host Memory Buffer for capable devices, set the following tunable: .Bd -literal -offset indent hw.nvme.hmb_max .Ed .Pp The default value is 5% of physical memory size per device. .Pp The .Xr nvd 4 driver is used to provide a disk driver to the system by default. The .Xr nda 4 driver can also be used instead. The .Xr nvd 4 driver performs better with smaller transactions and few TRIM commands. It sends all commands directly to the drive immediately. The .Xr nda 4 driver performs better with larger transactions and also collapses TRIM commands giving better performance. It can queue commands to the drive; combine .Dv BIO_DELETE commands into a single trip; and use the CAM I/O scheduler to bias one type of operation over another. To select the .Xr nda 4 driver, set the following tunable value in .Xr loader.conf 5 : .Bd -literal -offset indent hw.nvme.use_nvd=0 .Ed .Pp This value may also be set in the kernel config file with .Bd -literal -offset indent .Cd options NVME_USE_NVD=0 .Ed .Pp When there is an error, .Nm prints only the most relevant information about the command by default. To enable dumping of all information about the command, set the following tunable value in .Xr loader.conf 5 : .Bd -literal -offset indent hw.nvme.verbose_cmd_dump=1 .Ed .Pp Prior versions of the driver reset the card twice on boot. This proved to be unnecessary and inefficient, so the driver now resets drive controller only once. The old behavior may be restored in the kernel config file with .Bd -literal -offset indent .Cd options NVME_2X_RESET .Ed .Sh SYSCTL VARIABLES The following controller-level sysctls are currently implemented: .Bl -tag -width indent .It Va dev.nvme.0.num_cpus_per_ioq (R) Number of CPUs associated with each I/O queue pair. .It Va dev.nvme.0.int_coal_time (R/W) Interrupt coalescing timer period in microseconds. Set to 0 to disable. .It Va dev.nvme.0.int_coal_threshold (R/W) Interrupt coalescing threshold in number of command completions. Set to 0 to disable. .El .Pp The following queue pair-level sysctls are currently implemented. Admin queue sysctls take the format of dev.nvme.0.adminq and I/O queue sysctls take the format of dev.nvme.0.ioq0. .Bl -tag -width indent .It Va dev.nvme.0.ioq0.num_entries (R) Number of entries in this queue pair's command and completion queue. .It Va dev.nvme.0.ioq0.num_tr (R) Number of nvme_tracker structures currently allocated for this queue pair. .It Va dev.nvme.0.ioq0.num_prp_list (R) Number of nvme_prp_list structures currently allocated for this queue pair. .It Va dev.nvme.0.ioq0.sq_head (R) Current location of the submission queue head pointer as observed by the driver. The head pointer is incremented by the controller as it takes commands off of the submission queue. 
.It Va dev.nvme.0.ioq0.sq_tail (R) Current location of the submission queue tail pointer as observed by the driver. The driver increments the tail pointer after writing a command into the submission queue to signal that a new command is ready to be processed. .It Va dev.nvme.0.ioq0.cq_head (R) Current location of the completion queue head pointer as observed by the driver. The driver increments the head pointer after finishing with a completion entry that was posted by the controller. .It Va dev.nvme.0.ioq0.num_cmds (R) Number of commands that have been submitted on this queue pair. .It Va dev.nvme.0.ioq0.dump_debug (W) Writing 1 to this sysctl will dump the full contents of the submission and completion queues to the console. .El .Pp In addition to the typical pci attachment, the .Nm driver supports attaching to a .Xr ahci 4 device. Intel's Rapid Storage Technology (RST) hides the nvme device behind the AHCI device due to limitations in Windows. However, this effectively hides it from the .Fx kernel. To work around this limitation, .Fx detects that the AHCI device supports RST and that it is enabled. See .Xr ahci 4 for more details. +.Sh DIAGNOSTICS +.Bl -diag +.It "nvme%d: System interrupt issues?" +The driver found that a timed-out transaction had a pending completion record, +indicating an interrupt had not been delivered. +The system either is not configuring interrupts properly, or it drops +them under load. +This message will appear at most once per boot per controller. +.El .Sh SEE ALSO .Xr nda 4 , .Xr nvd 4 , .Xr pci 4 , .Xr nvmecontrol 8 , .Xr disk 9 .Sh HISTORY The .Nm driver first appeared in .Fx 9.2 . .Sh AUTHORS .An -nosplit The .Nm driver was developed by Intel and originally written by .An Jim Harris Aq Mt jimharris@FreeBSD.org , with contributions from .An Joe Golio at EMC. .Pp This man page was written by .An Jim Harris Aq Mt jimharris@FreeBSD.org . diff --git a/sys/dev/nvme/nvme_private.h b/sys/dev/nvme/nvme_private.h index ff08f6581db5..05b5f3189eb2 100644 --- a/sys/dev/nvme/nvme_private.h +++ b/sys/dev/nvme/nvme_private.h @@ -1,564 +1,565 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (C) 2012-2014 Intel Corporation * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE.
*/ #ifndef __NVME_PRIVATE_H__ #define __NVME_PRIVATE_H__ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "nvme.h" #define DEVICE2SOFTC(dev) ((struct nvme_controller *) device_get_softc(dev)) MALLOC_DECLARE(M_NVME); #define IDT32_PCI_ID 0x80d0111d /* 32 channel board */ #define IDT8_PCI_ID 0x80d2111d /* 8 channel board */ #define NVME_ADMIN_TRACKERS (16) #define NVME_ADMIN_ENTRIES (128) /* * NVME_IO_ENTRIES defines the size of an I/O qpair's submission and completion * queues, while NVME_IO_TRACKERS defines the maximum number of I/O that we * will allow outstanding on an I/O qpair at any time. The only advantage in * having IO_ENTRIES > IO_TRACKERS is for debugging purposes - when dumping * the contents of the submission and completion queues, it will show a longer * history of data. */ #define NVME_IO_ENTRIES (256) #define NVME_IO_TRACKERS (128) #define NVME_MIN_IO_TRACKERS (4) #define NVME_MAX_IO_TRACKERS (1024) #define NVME_INT_COAL_TIME (0) /* disabled */ #define NVME_INT_COAL_THRESHOLD (0) /* 0-based */ #define NVME_MAX_NAMESPACES (16) #define NVME_MAX_CONSUMERS (2) #define NVME_MAX_ASYNC_EVENTS (8) #define NVME_ADMIN_TIMEOUT_PERIOD (60) /* in seconds */ #define NVME_DEFAULT_TIMEOUT_PERIOD (30) /* in seconds */ #define NVME_MIN_TIMEOUT_PERIOD (5) #define NVME_MAX_TIMEOUT_PERIOD (120) #define NVME_DEFAULT_RETRY_COUNT (4) /* Maximum log page size to fetch for AERs. */ #define NVME_MAX_AER_LOG_SIZE (4096) /* * Define CACHE_LINE_SIZE here for older FreeBSD versions that do not define * it. */ #ifndef CACHE_LINE_SIZE #define CACHE_LINE_SIZE (64) #endif #define NVME_GONE 0xfffffffful extern int32_t nvme_retry_count; extern bool nvme_verbose_cmd_dump; struct nvme_completion_poll_status { struct nvme_completion cpl; int done; }; struct nvme_request { struct nvme_command cmd; struct nvme_qpair *qpair; struct memdesc payload; nvme_cb_fn_t cb_fn; void *cb_arg; int32_t retries; bool payload_valid; bool timeout; bool spare[2]; /* Future use */ STAILQ_ENTRY(nvme_request) stailq; }; struct nvme_async_event_request { struct nvme_controller *ctrlr; struct nvme_request *req; struct nvme_completion cpl; uint32_t log_page_id; uint32_t log_page_size; uint8_t log_page_buffer[NVME_MAX_AER_LOG_SIZE]; }; struct nvme_tracker { TAILQ_ENTRY(nvme_tracker) tailq; struct nvme_request *req; struct nvme_qpair *qpair; sbintime_t deadline; bus_dmamap_t payload_dma_map; uint16_t cid; uint64_t *prp; bus_addr_t prp_bus_addr; }; enum nvme_recovery { RECOVERY_NONE = 0, /* Normal operations */ RECOVERY_WAITING, /* waiting for the reset to complete */ }; struct nvme_qpair { struct nvme_controller *ctrlr; uint32_t id; int domain; int cpu; uint16_t vector; int rid; struct resource *res; void *tag; struct callout timer; /* recovery lock */ bool timer_armed; /* recovery lock */ enum nvme_recovery recovery_state; /* recovery lock */ uint32_t num_entries; uint32_t num_trackers; uint32_t sq_tdbl_off; uint32_t cq_hdbl_off; uint32_t phase; uint32_t sq_head; uint32_t sq_tail; uint32_t cq_head; int64_t num_cmds; int64_t num_intr_handler_calls; int64_t num_retries; int64_t num_failures; int64_t num_ignored; int64_t num_recovery_nolock; struct nvme_command *cmd; struct nvme_completion *cpl; bus_dma_tag_t dma_tag; bus_dma_tag_t dma_tag_payload; bus_dmamap_t queuemem_map; uint64_t cmd_bus_addr; uint64_t cpl_bus_addr; TAILQ_HEAD(, nvme_tracker) free_tr; TAILQ_HEAD(, nvme_tracker) outstanding_tr; STAILQ_HEAD(, nvme_request) queued_req; struct 
nvme_tracker **act_tr; struct mtx_padalign lock; struct mtx_padalign recovery; } __aligned(CACHE_LINE_SIZE); struct nvme_namespace { struct nvme_controller *ctrlr; struct nvme_namespace_data data; uint32_t id; uint32_t flags; struct cdev *cdev; void *cons_cookie[NVME_MAX_CONSUMERS]; uint32_t boundary; struct mtx lock; }; /* * One of these per allocated PCI device. */ struct nvme_controller { device_t dev; struct mtx lock; int domain; uint32_t ready_timeout_in_ms; uint32_t quirks; #define QUIRK_DELAY_B4_CHK_RDY 1 /* Can't touch MMIO on disable */ #define QUIRK_DISABLE_TIMEOUT 2 /* Disable broken completion timeout feature */ #define QUIRK_INTEL_ALIGNMENT 4 /* Pre NVMe 1.3 performance alignment */ #define QUIRK_AHCI 8 /* Attached via AHCI redirect */ bus_space_tag_t bus_tag; bus_space_handle_t bus_handle; int resource_id; struct resource *resource; /* * The NVMe spec allows for the MSI-X table to be placed in BAR 4/5, * separate from the control registers which are in BAR 0/1. These * members track the mapping of BAR 4/5 for that reason. */ int bar4_resource_id; struct resource *bar4_resource; int msi_count; uint32_t enable_aborts; uint32_t num_io_queues; uint32_t max_hw_pend_io; /* Fields for tracking progress during controller initialization. */ struct intr_config_hook config_hook; uint32_t ns_identified; uint32_t queues_created; struct task reset_task; struct taskqueue *taskqueue; /* For shared legacy interrupt. */ int rid; struct resource *res; void *tag; /** maximum i/o size in bytes */ uint32_t max_xfer_size; /** LO and HI capacity mask */ uint32_t cap_lo; uint32_t cap_hi; /** Page size and log2(page_size) - 12 that we're currently using */ uint32_t page_size; uint32_t mps; /** interrupt coalescing time period (in microseconds) */ uint32_t int_coal_time; /** interrupt coalescing threshold */ uint32_t int_coal_threshold; /** timeout period in seconds */ uint32_t admin_timeout_period; uint32_t timeout_period; /** doorbell stride */ uint32_t dstrd; struct nvme_qpair adminq; struct nvme_qpair *ioq; struct nvme_registers *regs; struct nvme_controller_data cdata; struct nvme_namespace ns[NVME_MAX_NAMESPACES]; struct cdev *cdev; /** bit mask of event types currently enabled for async events */ uint32_t async_event_config; uint32_t num_aers; struct nvme_async_event_request aer[NVME_MAX_ASYNC_EVENTS]; void *cons_cookie[NVME_MAX_CONSUMERS]; uint32_t is_resetting; uint32_t is_initialized; uint32_t notification_sent; bool is_failed; bool is_dying; + bool isr_warned; STAILQ_HEAD(, nvme_request) fail_req; /* Host Memory Buffer */ int hmb_nchunks; size_t hmb_chunk; bus_dma_tag_t hmb_tag; struct nvme_hmb_chunk { bus_dmamap_t hmbc_map; void *hmbc_vaddr; uint64_t hmbc_paddr; } *hmb_chunks; bus_dma_tag_t hmb_desc_tag; bus_dmamap_t hmb_desc_map; struct nvme_hmb_desc *hmb_desc_vaddr; uint64_t hmb_desc_paddr; /* Statistics */ counter_u64_t alignment_splits; }; #define nvme_mmio_offsetof(reg) \ offsetof(struct nvme_registers, reg) #define nvme_mmio_read_4(sc, reg) \ bus_space_read_4((sc)->bus_tag, (sc)->bus_handle, \ nvme_mmio_offsetof(reg)) #define nvme_mmio_write_4(sc, reg, val) \ bus_space_write_4((sc)->bus_tag, (sc)->bus_handle, \ nvme_mmio_offsetof(reg), val) #define nvme_mmio_write_8(sc, reg, val) \ do { \ bus_space_write_4((sc)->bus_tag, (sc)->bus_handle, \ nvme_mmio_offsetof(reg), val & 0xFFFFFFFF); \ bus_space_write_4((sc)->bus_tag, (sc)->bus_handle, \ nvme_mmio_offsetof(reg)+4, \ (val & 0xFFFFFFFF00000000ULL) >> 32); \ } while (0); #define nvme_printf(ctrlr, fmt, args...) 
\ device_printf(ctrlr->dev, fmt, ##args) void nvme_ns_test(struct nvme_namespace *ns, u_long cmd, caddr_t arg); void nvme_ctrlr_cmd_identify_controller(struct nvme_controller *ctrlr, void *payload, nvme_cb_fn_t cb_fn, void *cb_arg); void nvme_ctrlr_cmd_identify_namespace(struct nvme_controller *ctrlr, uint32_t nsid, void *payload, nvme_cb_fn_t cb_fn, void *cb_arg); void nvme_ctrlr_cmd_set_interrupt_coalescing(struct nvme_controller *ctrlr, uint32_t microseconds, uint32_t threshold, nvme_cb_fn_t cb_fn, void *cb_arg); void nvme_ctrlr_cmd_get_error_page(struct nvme_controller *ctrlr, struct nvme_error_information_entry *payload, uint32_t num_entries, /* 0 = max */ nvme_cb_fn_t cb_fn, void *cb_arg); void nvme_ctrlr_cmd_get_health_information_page(struct nvme_controller *ctrlr, uint32_t nsid, struct nvme_health_information_page *payload, nvme_cb_fn_t cb_fn, void *cb_arg); void nvme_ctrlr_cmd_get_firmware_page(struct nvme_controller *ctrlr, struct nvme_firmware_page *payload, nvme_cb_fn_t cb_fn, void *cb_arg); void nvme_ctrlr_cmd_create_io_cq(struct nvme_controller *ctrlr, struct nvme_qpair *io_que, nvme_cb_fn_t cb_fn, void *cb_arg); void nvme_ctrlr_cmd_create_io_sq(struct nvme_controller *ctrlr, struct nvme_qpair *io_que, nvme_cb_fn_t cb_fn, void *cb_arg); void nvme_ctrlr_cmd_delete_io_cq(struct nvme_controller *ctrlr, struct nvme_qpair *io_que, nvme_cb_fn_t cb_fn, void *cb_arg); void nvme_ctrlr_cmd_delete_io_sq(struct nvme_controller *ctrlr, struct nvme_qpair *io_que, nvme_cb_fn_t cb_fn, void *cb_arg); void nvme_ctrlr_cmd_set_num_queues(struct nvme_controller *ctrlr, uint32_t num_queues, nvme_cb_fn_t cb_fn, void *cb_arg); void nvme_ctrlr_cmd_set_async_event_config(struct nvme_controller *ctrlr, uint32_t state, nvme_cb_fn_t cb_fn, void *cb_arg); void nvme_ctrlr_cmd_abort(struct nvme_controller *ctrlr, uint16_t cid, uint16_t sqid, nvme_cb_fn_t cb_fn, void *cb_arg); void nvme_completion_poll_cb(void *arg, const struct nvme_completion *cpl); int nvme_ctrlr_construct(struct nvme_controller *ctrlr, device_t dev); void nvme_ctrlr_destruct(struct nvme_controller *ctrlr, device_t dev); void nvme_ctrlr_shutdown(struct nvme_controller *ctrlr); void nvme_ctrlr_reset(struct nvme_controller *ctrlr); /* ctrlr defined as void * to allow use with config_intrhook. 
*/ void nvme_ctrlr_start_config_hook(void *ctrlr_arg); void nvme_ctrlr_submit_admin_request(struct nvme_controller *ctrlr, struct nvme_request *req); void nvme_ctrlr_submit_io_request(struct nvme_controller *ctrlr, struct nvme_request *req); int nvme_qpair_construct(struct nvme_qpair *qpair, uint32_t num_entries, uint32_t num_trackers, struct nvme_controller *ctrlr); void nvme_qpair_submit_tracker(struct nvme_qpair *qpair, struct nvme_tracker *tr); bool nvme_qpair_process_completions(struct nvme_qpair *qpair); void nvme_qpair_submit_request(struct nvme_qpair *qpair, struct nvme_request *req); void nvme_qpair_reset(struct nvme_qpair *qpair); void nvme_qpair_fail(struct nvme_qpair *qpair); void nvme_admin_qpair_enable(struct nvme_qpair *qpair); void nvme_admin_qpair_disable(struct nvme_qpair *qpair); void nvme_admin_qpair_destroy(struct nvme_qpair *qpair); void nvme_io_qpair_enable(struct nvme_qpair *qpair); void nvme_io_qpair_disable(struct nvme_qpair *qpair); void nvme_io_qpair_destroy(struct nvme_qpair *qpair); int nvme_ns_construct(struct nvme_namespace *ns, uint32_t id, struct nvme_controller *ctrlr); void nvme_ns_destruct(struct nvme_namespace *ns); void nvme_sysctl_initialize_ctrlr(struct nvme_controller *ctrlr); void nvme_qpair_print_command(struct nvme_qpair *qpair, struct nvme_command *cmd); void nvme_qpair_print_completion(struct nvme_qpair *qpair, struct nvme_completion *cpl); int nvme_attach(device_t dev); int nvme_shutdown(device_t dev); int nvme_detach(device_t dev); /* * Wait for a command to complete using the nvme_completion_poll_cb. Used in * limited contexts where the caller knows it's OK to block briefly while the * command runs. The ISR will run the callback which will set status->done to * true, usually within microseconds. If not, then after one second timeout * handler should reset the controller and abort all outstanding requests * including this polled one. If still not after ten seconds, then something is * wrong with the driver, and panic is the only way to recover. * * Most commands using this interface aren't actual I/O to the drive's media so * complete within a few microseconds. Adaptively spin for one tick to catch the * vast majority of these without waiting for a tick plus scheduling delays. Since * these are on startup, this drastically reduces startup time. 
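 *
 * As a rough usage sketch (not lifted verbatim from the driver, but using
 * only helpers declared in this file and in nvme.h), a synchronous admin
 * command looks roughly like:
 *
 *	struct nvme_completion_poll_status status;
 *
 *	status.done = 0;
 *	nvme_ctrlr_cmd_identify_controller(ctrlr, &ctrlr->cdata,
 *	    nvme_completion_poll_cb, &status);
 *	nvme_completion_poll(&status);
 *	if (nvme_completion_is_error(&status.cpl))
 *		nvme_printf(ctrlr, "identify controller failed\n");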
*/ static __inline void nvme_completion_poll(struct nvme_completion_poll_status *status) { int timeout = ticks + 10 * hz; sbintime_t delta_t = SBT_1US; while (!atomic_load_acq_int(&status->done)) { if (timeout - ticks < 0) panic("NVME polled command failed to complete within 10s."); pause_sbt("nvme", delta_t, 0, C_PREL(1)); delta_t = min(SBT_1MS, delta_t * 3 / 2); } } static __inline void nvme_single_map(void *arg, bus_dma_segment_t *seg, int nseg, int error) { uint64_t *bus_addr = (uint64_t *)arg; KASSERT(nseg == 1, ("number of segments (%d) is not 1", nseg)); if (error != 0) printf("nvme_single_map err %d\n", error); *bus_addr = seg[0].ds_addr; } static __inline struct nvme_request * _nvme_allocate_request(nvme_cb_fn_t cb_fn, void *cb_arg) { struct nvme_request *req; req = malloc(sizeof(*req), M_NVME, M_NOWAIT | M_ZERO); if (req != NULL) { req->cb_fn = cb_fn; req->cb_arg = cb_arg; req->timeout = true; } return (req); } static __inline struct nvme_request * nvme_allocate_request_vaddr(void *payload, uint32_t payload_size, nvme_cb_fn_t cb_fn, void *cb_arg) { struct nvme_request *req; req = _nvme_allocate_request(cb_fn, cb_arg); if (req != NULL) { req->payload = memdesc_vaddr(payload, payload_size); req->payload_valid = true; } return (req); } static __inline struct nvme_request * nvme_allocate_request_null(nvme_cb_fn_t cb_fn, void *cb_arg) { struct nvme_request *req; req = _nvme_allocate_request(cb_fn, cb_arg); return (req); } static __inline struct nvme_request * nvme_allocate_request_bio(struct bio *bio, nvme_cb_fn_t cb_fn, void *cb_arg) { struct nvme_request *req; req = _nvme_allocate_request(cb_fn, cb_arg); if (req != NULL) { req->payload = memdesc_bio(bio); req->payload_valid = true; } return (req); } static __inline struct nvme_request * nvme_allocate_request_ccb(union ccb *ccb, nvme_cb_fn_t cb_fn, void *cb_arg) { struct nvme_request *req; req = _nvme_allocate_request(cb_fn, cb_arg); if (req != NULL) { req->payload = memdesc_ccb(ccb); req->payload_valid = true; } return (req); } #define nvme_free_request(req) free(req, M_NVME) void nvme_notify_async_consumers(struct nvme_controller *ctrlr, const struct nvme_completion *async_cpl, uint32_t log_page_id, void *log_page_buffer, uint32_t log_page_size); void nvme_notify_fail_consumers(struct nvme_controller *ctrlr); void nvme_notify_new_controller(struct nvme_controller *ctrlr); void nvme_notify_ns(struct nvme_controller *ctrlr, int nsid); void nvme_ctrlr_shared_handler(void *arg); void nvme_ctrlr_poll(struct nvme_controller *ctrlr); int nvme_ctrlr_suspend(struct nvme_controller *ctrlr); int nvme_ctrlr_resume(struct nvme_controller *ctrlr); #endif /* __NVME_PRIVATE_H__ */ diff --git a/sys/dev/nvme/nvme_qpair.c b/sys/dev/nvme/nvme_qpair.c index c917b34dbe43..0c3a36d4d76f 100644 --- a/sys/dev/nvme/nvme_qpair.c +++ b/sys/dev/nvme/nvme_qpair.c @@ -1,1600 +1,1605 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (C) 2012-2014 Intel Corporation * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include #include #include #include #include #include "nvme_private.h" typedef enum error_print { ERROR_PRINT_NONE, ERROR_PRINT_NO_RETRY, ERROR_PRINT_ALL } error_print_t; #define DO_NOT_RETRY 1 static void _nvme_qpair_submit_request(struct nvme_qpair *qpair, struct nvme_request *req); static void nvme_qpair_destroy(struct nvme_qpair *qpair); #define DEFAULT_INDEX 256 #define DEFAULT_ENTRY(x) [DEFAULT_INDEX] = x #define OPC_ENTRY(x) [NVME_OPC_ ## x] = #x static const char *admin_opcode[DEFAULT_INDEX + 1] = { OPC_ENTRY(DELETE_IO_SQ), OPC_ENTRY(CREATE_IO_SQ), OPC_ENTRY(GET_LOG_PAGE), OPC_ENTRY(DELETE_IO_CQ), OPC_ENTRY(CREATE_IO_CQ), OPC_ENTRY(IDENTIFY), OPC_ENTRY(ABORT), OPC_ENTRY(SET_FEATURES), OPC_ENTRY(GET_FEATURES), OPC_ENTRY(ASYNC_EVENT_REQUEST), OPC_ENTRY(NAMESPACE_MANAGEMENT), OPC_ENTRY(FIRMWARE_ACTIVATE), OPC_ENTRY(FIRMWARE_IMAGE_DOWNLOAD), OPC_ENTRY(DEVICE_SELF_TEST), OPC_ENTRY(NAMESPACE_ATTACHMENT), OPC_ENTRY(KEEP_ALIVE), OPC_ENTRY(DIRECTIVE_SEND), OPC_ENTRY(DIRECTIVE_RECEIVE), OPC_ENTRY(VIRTUALIZATION_MANAGEMENT), OPC_ENTRY(NVME_MI_SEND), OPC_ENTRY(NVME_MI_RECEIVE), OPC_ENTRY(CAPACITY_MANAGEMENT), OPC_ENTRY(LOCKDOWN), OPC_ENTRY(DOORBELL_BUFFER_CONFIG), OPC_ENTRY(FABRICS_COMMANDS), OPC_ENTRY(FORMAT_NVM), OPC_ENTRY(SECURITY_SEND), OPC_ENTRY(SECURITY_RECEIVE), OPC_ENTRY(SANITIZE), OPC_ENTRY(GET_LBA_STATUS), DEFAULT_ENTRY("ADMIN COMMAND"), }; static const char *io_opcode[DEFAULT_INDEX + 1] = { OPC_ENTRY(FLUSH), OPC_ENTRY(WRITE), OPC_ENTRY(READ), OPC_ENTRY(WRITE_UNCORRECTABLE), OPC_ENTRY(COMPARE), OPC_ENTRY(WRITE_ZEROES), OPC_ENTRY(DATASET_MANAGEMENT), OPC_ENTRY(VERIFY), OPC_ENTRY(RESERVATION_REGISTER), OPC_ENTRY(RESERVATION_REPORT), OPC_ENTRY(RESERVATION_ACQUIRE), OPC_ENTRY(RESERVATION_RELEASE), OPC_ENTRY(COPY), DEFAULT_ENTRY("IO COMMAND"), }; static const char * get_opcode_string(const char *op[DEFAULT_INDEX + 1], uint16_t opc) { const char *nm = opc < DEFAULT_INDEX ? op[opc] : op[DEFAULT_INDEX]; return (nm != NULL ? 
nm : op[DEFAULT_INDEX]); } static const char * get_admin_opcode_string(uint16_t opc) { return (get_opcode_string(admin_opcode, opc)); } static const char * get_io_opcode_string(uint16_t opc) { return (get_opcode_string(io_opcode, opc)); } static void nvme_admin_qpair_print_command(struct nvme_qpair *qpair, struct nvme_command *cmd) { nvme_printf(qpair->ctrlr, "%s (%02x) sqid:%d cid:%d nsid:%x " "cdw10:%08x cdw11:%08x\n", get_admin_opcode_string(cmd->opc), cmd->opc, qpair->id, cmd->cid, le32toh(cmd->nsid), le32toh(cmd->cdw10), le32toh(cmd->cdw11)); } static void nvme_io_qpair_print_command(struct nvme_qpair *qpair, struct nvme_command *cmd) { switch (cmd->opc) { case NVME_OPC_WRITE: case NVME_OPC_READ: case NVME_OPC_WRITE_UNCORRECTABLE: case NVME_OPC_COMPARE: case NVME_OPC_WRITE_ZEROES: case NVME_OPC_VERIFY: nvme_printf(qpair->ctrlr, "%s sqid:%d cid:%d nsid:%d " "lba:%llu len:%d\n", get_io_opcode_string(cmd->opc), qpair->id, cmd->cid, le32toh(cmd->nsid), ((unsigned long long)le32toh(cmd->cdw11) << 32) + le32toh(cmd->cdw10), (le32toh(cmd->cdw12) & 0xFFFF) + 1); break; case NVME_OPC_FLUSH: case NVME_OPC_DATASET_MANAGEMENT: case NVME_OPC_RESERVATION_REGISTER: case NVME_OPC_RESERVATION_REPORT: case NVME_OPC_RESERVATION_ACQUIRE: case NVME_OPC_RESERVATION_RELEASE: nvme_printf(qpair->ctrlr, "%s sqid:%d cid:%d nsid:%d\n", get_io_opcode_string(cmd->opc), qpair->id, cmd->cid, le32toh(cmd->nsid)); break; default: nvme_printf(qpair->ctrlr, "%s (%02x) sqid:%d cid:%d nsid:%d\n", get_io_opcode_string(cmd->opc), cmd->opc, qpair->id, cmd->cid, le32toh(cmd->nsid)); break; } } void nvme_qpair_print_command(struct nvme_qpair *qpair, struct nvme_command *cmd) { if (qpair->id == 0) nvme_admin_qpair_print_command(qpair, cmd); else nvme_io_qpair_print_command(qpair, cmd); if (nvme_verbose_cmd_dump) { nvme_printf(qpair->ctrlr, "nsid:%#x rsvd2:%#x rsvd3:%#x mptr:%#jx prp1:%#jx prp2:%#jx\n", cmd->nsid, cmd->rsvd2, cmd->rsvd3, (uintmax_t)cmd->mptr, (uintmax_t)cmd->prp1, (uintmax_t)cmd->prp2); nvme_printf(qpair->ctrlr, "cdw10: %#x cdw11:%#x cdw12:%#x cdw13:%#x cdw14:%#x cdw15:%#x\n", cmd->cdw10, cmd->cdw11, cmd->cdw12, cmd->cdw13, cmd->cdw14, cmd->cdw15); } } struct nvme_status_string { uint16_t sc; const char * str; }; static struct nvme_status_string generic_status[] = { { NVME_SC_SUCCESS, "SUCCESS" }, { NVME_SC_INVALID_OPCODE, "INVALID OPCODE" }, { NVME_SC_INVALID_FIELD, "INVALID_FIELD" }, { NVME_SC_COMMAND_ID_CONFLICT, "COMMAND ID CONFLICT" }, { NVME_SC_DATA_TRANSFER_ERROR, "DATA TRANSFER ERROR" }, { NVME_SC_ABORTED_POWER_LOSS, "ABORTED - POWER LOSS" }, { NVME_SC_INTERNAL_DEVICE_ERROR, "INTERNAL DEVICE ERROR" }, { NVME_SC_ABORTED_BY_REQUEST, "ABORTED - BY REQUEST" }, { NVME_SC_ABORTED_SQ_DELETION, "ABORTED - SQ DELETION" }, { NVME_SC_ABORTED_FAILED_FUSED, "ABORTED - FAILED FUSED" }, { NVME_SC_ABORTED_MISSING_FUSED, "ABORTED - MISSING FUSED" }, { NVME_SC_INVALID_NAMESPACE_OR_FORMAT, "INVALID NAMESPACE OR FORMAT" }, { NVME_SC_COMMAND_SEQUENCE_ERROR, "COMMAND SEQUENCE ERROR" }, { NVME_SC_INVALID_SGL_SEGMENT_DESCR, "INVALID SGL SEGMENT DESCRIPTOR" }, { NVME_SC_INVALID_NUMBER_OF_SGL_DESCR, "INVALID NUMBER OF SGL DESCRIPTORS" }, { NVME_SC_DATA_SGL_LENGTH_INVALID, "DATA SGL LENGTH INVALID" }, { NVME_SC_METADATA_SGL_LENGTH_INVALID, "METADATA SGL LENGTH INVALID" }, { NVME_SC_SGL_DESCRIPTOR_TYPE_INVALID, "SGL DESCRIPTOR TYPE INVALID" }, { NVME_SC_INVALID_USE_OF_CMB, "INVALID USE OF CONTROLLER MEMORY BUFFER" }, { NVME_SC_PRP_OFFET_INVALID, "PRP OFFET INVALID" }, { NVME_SC_ATOMIC_WRITE_UNIT_EXCEEDED, "ATOMIC WRITE UNIT 
EXCEEDED" }, { NVME_SC_OPERATION_DENIED, "OPERATION DENIED" }, { NVME_SC_SGL_OFFSET_INVALID, "SGL OFFSET INVALID" }, { NVME_SC_HOST_ID_INCONSISTENT_FORMAT, "HOST IDENTIFIER INCONSISTENT FORMAT" }, { NVME_SC_KEEP_ALIVE_TIMEOUT_EXPIRED, "KEEP ALIVE TIMEOUT EXPIRED" }, { NVME_SC_KEEP_ALIVE_TIMEOUT_INVALID, "KEEP ALIVE TIMEOUT INVALID" }, { NVME_SC_ABORTED_DUE_TO_PREEMPT, "COMMAND ABORTED DUE TO PREEMPT AND ABORT" }, { NVME_SC_SANITIZE_FAILED, "SANITIZE FAILED" }, { NVME_SC_SANITIZE_IN_PROGRESS, "SANITIZE IN PROGRESS" }, { NVME_SC_SGL_DATA_BLOCK_GRAN_INVALID, "SGL_DATA_BLOCK_GRANULARITY_INVALID" }, { NVME_SC_NOT_SUPPORTED_IN_CMB, "COMMAND NOT SUPPORTED FOR QUEUE IN CMB" }, { NVME_SC_NAMESPACE_IS_WRITE_PROTECTED, "NAMESPACE IS WRITE PROTECTED" }, { NVME_SC_COMMAND_INTERRUPTED, "COMMAND INTERRUPTED" }, { NVME_SC_TRANSIENT_TRANSPORT_ERROR, "TRANSIENT TRANSPORT ERROR" }, { NVME_SC_LBA_OUT_OF_RANGE, "LBA OUT OF RANGE" }, { NVME_SC_CAPACITY_EXCEEDED, "CAPACITY EXCEEDED" }, { NVME_SC_NAMESPACE_NOT_READY, "NAMESPACE NOT READY" }, { NVME_SC_RESERVATION_CONFLICT, "RESERVATION CONFLICT" }, { NVME_SC_FORMAT_IN_PROGRESS, "FORMAT IN PROGRESS" }, { 0xFFFF, "GENERIC" } }; static struct nvme_status_string command_specific_status[] = { { NVME_SC_COMPLETION_QUEUE_INVALID, "INVALID COMPLETION QUEUE" }, { NVME_SC_INVALID_QUEUE_IDENTIFIER, "INVALID QUEUE IDENTIFIER" }, { NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED, "MAX QUEUE SIZE EXCEEDED" }, { NVME_SC_ABORT_COMMAND_LIMIT_EXCEEDED, "ABORT CMD LIMIT EXCEEDED" }, { NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED, "ASYNC LIMIT EXCEEDED" }, { NVME_SC_INVALID_FIRMWARE_SLOT, "INVALID FIRMWARE SLOT" }, { NVME_SC_INVALID_FIRMWARE_IMAGE, "INVALID FIRMWARE IMAGE" }, { NVME_SC_INVALID_INTERRUPT_VECTOR, "INVALID INTERRUPT VECTOR" }, { NVME_SC_INVALID_LOG_PAGE, "INVALID LOG PAGE" }, { NVME_SC_INVALID_FORMAT, "INVALID FORMAT" }, { NVME_SC_FIRMWARE_REQUIRES_RESET, "FIRMWARE REQUIRES RESET" }, { NVME_SC_INVALID_QUEUE_DELETION, "INVALID QUEUE DELETION" }, { NVME_SC_FEATURE_NOT_SAVEABLE, "FEATURE IDENTIFIER NOT SAVEABLE" }, { NVME_SC_FEATURE_NOT_CHANGEABLE, "FEATURE NOT CHANGEABLE" }, { NVME_SC_FEATURE_NOT_NS_SPECIFIC, "FEATURE NOT NAMESPACE SPECIFIC" }, { NVME_SC_FW_ACT_REQUIRES_NVMS_RESET, "FIRMWARE ACTIVATION REQUIRES NVM SUBSYSTEM RESET" }, { NVME_SC_FW_ACT_REQUIRES_RESET, "FIRMWARE ACTIVATION REQUIRES RESET" }, { NVME_SC_FW_ACT_REQUIRES_TIME, "FIRMWARE ACTIVATION REQUIRES MAXIMUM TIME VIOLATION" }, { NVME_SC_FW_ACT_PROHIBITED, "FIRMWARE ACTIVATION PROHIBITED" }, { NVME_SC_OVERLAPPING_RANGE, "OVERLAPPING RANGE" }, { NVME_SC_NS_INSUFFICIENT_CAPACITY, "NAMESPACE INSUFFICIENT CAPACITY" }, { NVME_SC_NS_ID_UNAVAILABLE, "NAMESPACE IDENTIFIER UNAVAILABLE" }, { NVME_SC_NS_ALREADY_ATTACHED, "NAMESPACE ALREADY ATTACHED" }, { NVME_SC_NS_IS_PRIVATE, "NAMESPACE IS PRIVATE" }, { NVME_SC_NS_NOT_ATTACHED, "NS NOT ATTACHED" }, { NVME_SC_THIN_PROV_NOT_SUPPORTED, "THIN PROVISIONING NOT SUPPORTED" }, { NVME_SC_CTRLR_LIST_INVALID, "CONTROLLER LIST INVALID" }, { NVME_SC_SELF_TEST_IN_PROGRESS, "DEVICE SELF-TEST IN PROGRESS" }, { NVME_SC_BOOT_PART_WRITE_PROHIB, "BOOT PARTITION WRITE PROHIBITED" }, { NVME_SC_INVALID_CTRLR_ID, "INVALID CONTROLLER IDENTIFIER" }, { NVME_SC_INVALID_SEC_CTRLR_STATE, "INVALID SECONDARY CONTROLLER STATE" }, { NVME_SC_INVALID_NUM_OF_CTRLR_RESRC, "INVALID NUMBER OF CONTROLLER RESOURCES" }, { NVME_SC_INVALID_RESOURCE_ID, "INVALID RESOURCE IDENTIFIER" }, { NVME_SC_SANITIZE_PROHIBITED_WPMRE, "SANITIZE PROHIBITED WRITE PERSISTENT MEMORY REGION ENABLED" }, { NVME_SC_ANA_GROUP_ID_INVALID, "ANA 
GROUP IDENTIFIED INVALID" }, { NVME_SC_ANA_ATTACH_FAILED, "ANA ATTACH FAILED" }, { NVME_SC_CONFLICTING_ATTRIBUTES, "CONFLICTING ATTRIBUTES" }, { NVME_SC_INVALID_PROTECTION_INFO, "INVALID PROTECTION INFO" }, { NVME_SC_ATTEMPTED_WRITE_TO_RO_PAGE, "WRITE TO RO PAGE" }, { 0xFFFF, "COMMAND SPECIFIC" } }; static struct nvme_status_string media_error_status[] = { { NVME_SC_WRITE_FAULTS, "WRITE FAULTS" }, { NVME_SC_UNRECOVERED_READ_ERROR, "UNRECOVERED READ ERROR" }, { NVME_SC_GUARD_CHECK_ERROR, "GUARD CHECK ERROR" }, { NVME_SC_APPLICATION_TAG_CHECK_ERROR, "APPLICATION TAG CHECK ERROR" }, { NVME_SC_REFERENCE_TAG_CHECK_ERROR, "REFERENCE TAG CHECK ERROR" }, { NVME_SC_COMPARE_FAILURE, "COMPARE FAILURE" }, { NVME_SC_ACCESS_DENIED, "ACCESS DENIED" }, { NVME_SC_DEALLOCATED_OR_UNWRITTEN, "DEALLOCATED OR UNWRITTEN LOGICAL BLOCK" }, { 0xFFFF, "MEDIA ERROR" } }; static struct nvme_status_string path_related_status[] = { { NVME_SC_INTERNAL_PATH_ERROR, "INTERNAL PATH ERROR" }, { NVME_SC_ASYMMETRIC_ACCESS_PERSISTENT_LOSS, "ASYMMETRIC ACCESS PERSISTENT LOSS" }, { NVME_SC_ASYMMETRIC_ACCESS_INACCESSIBLE, "ASYMMETRIC ACCESS INACCESSIBLE" }, { NVME_SC_ASYMMETRIC_ACCESS_TRANSITION, "ASYMMETRIC ACCESS TRANSITION" }, { NVME_SC_CONTROLLER_PATHING_ERROR, "CONTROLLER PATHING ERROR" }, { NVME_SC_HOST_PATHING_ERROR, "HOST PATHING ERROR" }, { NVME_SC_COMMAND_ABORTED_BY_HOST, "COMMAND ABORTED BY HOST" }, { 0xFFFF, "PATH RELATED" }, }; static const char * get_status_string(uint16_t sct, uint16_t sc) { struct nvme_status_string *entry; switch (sct) { case NVME_SCT_GENERIC: entry = generic_status; break; case NVME_SCT_COMMAND_SPECIFIC: entry = command_specific_status; break; case NVME_SCT_MEDIA_ERROR: entry = media_error_status; break; case NVME_SCT_PATH_RELATED: entry = path_related_status; break; case NVME_SCT_VENDOR_SPECIFIC: return ("VENDOR SPECIFIC"); default: return ("RESERVED"); } while (entry->sc != 0xFFFF) { if (entry->sc == sc) return (entry->str); entry++; } return (entry->str); } void nvme_qpair_print_completion(struct nvme_qpair *qpair, struct nvme_completion *cpl) { uint8_t sct, sc, crd, m, dnr, p; sct = NVME_STATUS_GET_SCT(cpl->status); sc = NVME_STATUS_GET_SC(cpl->status); crd = NVME_STATUS_GET_CRD(cpl->status); m = NVME_STATUS_GET_M(cpl->status); dnr = NVME_STATUS_GET_DNR(cpl->status); p = NVME_STATUS_GET_P(cpl->status); nvme_printf(qpair->ctrlr, "%s (%02x/%02x) crd:%x m:%x dnr:%x p:%d " "sqid:%d cid:%d cdw0:%x\n", get_status_string(sct, sc), sct, sc, crd, m, dnr, p, cpl->sqid, cpl->cid, cpl->cdw0); } static bool nvme_completion_is_retry(const struct nvme_completion *cpl) { uint8_t sct, sc, dnr; sct = NVME_STATUS_GET_SCT(cpl->status); sc = NVME_STATUS_GET_SC(cpl->status); dnr = NVME_STATUS_GET_DNR(cpl->status); /* Do Not Retry Bit */ /* * TODO: spec is not clear how commands that are aborted due * to TLER will be marked. So for now, it seems * NAMESPACE_NOT_READY is the only case where we should * look at the DNR bit. Requests failed with ABORTED_BY_REQUEST * set the DNR bit correctly since the driver controls that. 
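 *
 * As a concrete illustration of the logic below: a command completing with
 * NAMESPACE_NOT_READY and DNR clear is considered retriable (the namespace
 * may simply not be ready yet), while the same status with DNR set is not,
 * since the controller is signalling that a retry cannot succeed.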
*/ switch (sct) { case NVME_SCT_GENERIC: switch (sc) { case NVME_SC_ABORTED_BY_REQUEST: case NVME_SC_NAMESPACE_NOT_READY: if (dnr) return (0); else return (1); case NVME_SC_INVALID_OPCODE: case NVME_SC_INVALID_FIELD: case NVME_SC_COMMAND_ID_CONFLICT: case NVME_SC_DATA_TRANSFER_ERROR: case NVME_SC_ABORTED_POWER_LOSS: case NVME_SC_INTERNAL_DEVICE_ERROR: case NVME_SC_ABORTED_SQ_DELETION: case NVME_SC_ABORTED_FAILED_FUSED: case NVME_SC_ABORTED_MISSING_FUSED: case NVME_SC_INVALID_NAMESPACE_OR_FORMAT: case NVME_SC_COMMAND_SEQUENCE_ERROR: case NVME_SC_LBA_OUT_OF_RANGE: case NVME_SC_CAPACITY_EXCEEDED: default: return (0); } case NVME_SCT_COMMAND_SPECIFIC: case NVME_SCT_MEDIA_ERROR: return (0); case NVME_SCT_PATH_RELATED: switch (sc) { case NVME_SC_INTERNAL_PATH_ERROR: if (dnr) return (0); else return (1); default: return (0); } case NVME_SCT_VENDOR_SPECIFIC: default: return (0); } } static void nvme_qpair_complete_tracker(struct nvme_tracker *tr, struct nvme_completion *cpl, error_print_t print_on_error) { struct nvme_qpair *qpair = tr->qpair; struct nvme_request *req; bool retry, error, retriable; mtx_assert(&qpair->lock, MA_NOTOWNED); req = tr->req; error = nvme_completion_is_error(cpl); retriable = nvme_completion_is_retry(cpl); retry = error && retriable && req->retries < nvme_retry_count; if (retry) qpair->num_retries++; if (error && req->retries >= nvme_retry_count && retriable) qpair->num_failures++; if (error && (print_on_error == ERROR_PRINT_ALL || (!retry && print_on_error == ERROR_PRINT_NO_RETRY))) { nvme_qpair_print_command(qpair, &req->cmd); nvme_qpair_print_completion(qpair, cpl); } qpair->act_tr[cpl->cid] = NULL; KASSERT(cpl->cid == req->cmd.cid, ("cpl cid does not match cmd cid\n")); if (!retry) { if (req->payload_valid) { bus_dmamap_sync(qpair->dma_tag_payload, tr->payload_dma_map, BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); } if (req->cb_fn) req->cb_fn(req->cb_arg, cpl); } mtx_lock(&qpair->lock); if (retry) { req->retries++; nvme_qpair_submit_tracker(qpair, tr); } else { if (req->payload_valid) { bus_dmamap_unload(qpair->dma_tag_payload, tr->payload_dma_map); } nvme_free_request(req); tr->req = NULL; TAILQ_REMOVE(&qpair->outstanding_tr, tr, tailq); TAILQ_INSERT_HEAD(&qpair->free_tr, tr, tailq); /* * If the controller is in the middle of resetting, don't * try to submit queued requests here - let the reset logic * handle that instead. 
*/ if (!STAILQ_EMPTY(&qpair->queued_req) && !qpair->ctrlr->is_resetting) { req = STAILQ_FIRST(&qpair->queued_req); STAILQ_REMOVE_HEAD(&qpair->queued_req, stailq); _nvme_qpair_submit_request(qpair, req); } } mtx_unlock(&qpair->lock); } static uint32_t nvme_qpair_make_status(uint32_t sct, uint32_t sc, uint32_t dnr) { uint32_t status = 0; status |= NVMEF(NVME_STATUS_SCT, sct); status |= NVMEF(NVME_STATUS_SC, sc); status |= NVMEF(NVME_STATUS_DNR, dnr); /* M=0 : this is artificial so no data in error log page */ /* CRD=0 : this is artificial and no delayed retry support anyway */ /* P=0 : phase not checked */ return (status); } static void nvme_qpair_manual_complete_tracker( struct nvme_tracker *tr, uint32_t sct, uint32_t sc, uint32_t dnr, error_print_t print_on_error) { struct nvme_completion cpl; struct nvme_qpair * qpair = tr->qpair; mtx_assert(&qpair->lock, MA_NOTOWNED); memset(&cpl, 0, sizeof(cpl)); cpl.sqid = qpair->id; cpl.cid = tr->cid; cpl.status = nvme_qpair_make_status(sct, sc, dnr); nvme_qpair_complete_tracker(tr, &cpl, print_on_error); } static void nvme_qpair_manual_complete_request(struct nvme_qpair *qpair, struct nvme_request *req, uint32_t sct, uint32_t sc, uint32_t dnr, error_print_t print_on_error) { struct nvme_completion cpl; bool error; memset(&cpl, 0, sizeof(cpl)); cpl.sqid = qpair->id; cpl.status = nvme_qpair_make_status(sct, sc, dnr); error = nvme_completion_is_error(&cpl); if (error && print_on_error == ERROR_PRINT_ALL) { nvme_qpair_print_command(qpair, &req->cmd); nvme_qpair_print_completion(qpair, &cpl); } if (req->cb_fn) req->cb_fn(req->cb_arg, &cpl); nvme_free_request(req); } /* Locked version of completion processor */ static bool _nvme_qpair_process_completions(struct nvme_qpair *qpair) { struct nvme_tracker *tr; struct nvme_completion cpl; bool done = false; bool in_panic = dumping || SCHEDULER_STOPPED(); mtx_assert(&qpair->recovery, MA_OWNED); /* * qpair is not enabled, likely because a controller reset is in * progress. Ignore the interrupt - any I/O that was associated with * this interrupt will get retried when the reset is complete. Any * pending completions for when we're in startup will be completed * as soon as initialization is complete and we start sending commands * to the device. */ if (qpair->recovery_state != RECOVERY_NONE) { qpair->num_ignored++; return (false); } /* * Sanity check initialization. After we reset the hardware, the phase * is defined to be 1. So if we get here with zero prior calls and the * phase is 0, it means that we've lost a race between the * initialization and the ISR running. With the phase wrong, we'll * process a bunch of completions that aren't really completions leading * to a KASSERT below. */ KASSERT(!(qpair->num_intr_handler_calls == 0 && qpair->phase == 0), ("%s: Phase wrong for first interrupt call.", device_get_nameunit(qpair->ctrlr->dev))); qpair->num_intr_handler_calls++; bus_dmamap_sync(qpair->dma_tag, qpair->queuemem_map, BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); /* * A panic can stop the CPU this routine is running on at any point. If * we're called during a panic, complete the sq_head wrap protocol for * the case where we are interrupted just after the increment at 1 * below, but before we can reset cq_head to zero at 2. Also cope with * the case where we do the zero at 2, but may or may not have done the * phase adjustment at step 3. The panic machinery flushes all pending * memory writes, so we can make these strong ordering assumptions * that would otherwise be unwise if we were racing in real time. 
*/ if (__predict_false(in_panic)) { if (qpair->cq_head == qpair->num_entries) { /* * Here we know that we need to zero cq_head and then negate * the phase, which hasn't been assigned if cq_head isn't * zero due to the atomic_store_rel. */ qpair->cq_head = 0; qpair->phase = !qpair->phase; } else if (qpair->cq_head == 0) { /* * In this case, we know that the assignment at 2 * happened below, but we don't know if it 3 happened or * not. To do this, we look at the last completion * entry and set the phase to the opposite phase * that it has. This gets us back in sync */ cpl = qpair->cpl[qpair->num_entries - 1]; nvme_completion_swapbytes(&cpl); qpair->phase = !NVME_STATUS_GET_P(cpl.status); } } while (1) { uint16_t status; /* * We need to do this dance to avoid a race between the host and * the device where the device overtakes the host while the host * is reading this record, leaving the status field 'new' and * the sqhd and cid fields potentially stale. If the phase * doesn't match, that means status hasn't yet been updated and * we'll get any pending changes next time. It also means that * the phase must be the same the second time. We have to sync * before reading to ensure any bouncing completes. */ status = le16toh(qpair->cpl[qpair->cq_head].status); if (NVME_STATUS_GET_P(status) != qpair->phase) break; bus_dmamap_sync(qpair->dma_tag, qpair->queuemem_map, BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); cpl = qpair->cpl[qpair->cq_head]; nvme_completion_swapbytes(&cpl); KASSERT( NVME_STATUS_GET_P(status) == NVME_STATUS_GET_P(cpl.status), ("Phase unexpectedly inconsistent")); if (cpl.cid < qpair->num_trackers) tr = qpair->act_tr[cpl.cid]; else tr = NULL; done = true; if (tr != NULL) { nvme_qpair_complete_tracker(tr, &cpl, ERROR_PRINT_ALL); qpair->sq_head = cpl.sqhd; } else if (!in_panic) { /* * A missing tracker is normally an error. However, a * panic can stop the CPU this routine is running on * after completing an I/O but before updating * qpair->cq_head at 1 below. Later, we re-enter this * routine to poll I/O associated with the kernel * dump. We find that the tr has been set to null before * calling the completion routine. If it hasn't * completed (or it triggers a panic), then '1' below * won't have updated cq_head. Rather than panic again, * ignore this condition because it's not unexpected. */ nvme_printf(qpair->ctrlr, "cpl (cid = %u) does not map to outstanding cmd\n", cpl.cid); nvme_qpair_print_completion(qpair, &qpair->cpl[qpair->cq_head]); KASSERT(0, ("received completion for unknown cmd")); } /* * There's a number of races with the following (see above) when * the system panics. We compensate for each one of them by * using the atomic store to force strong ordering (at least when * viewed in the aftermath of a panic). */ if (++qpair->cq_head == qpair->num_entries) { /* 1 */ atomic_store_rel_int(&qpair->cq_head, 0); /* 2 */ qpair->phase = !qpair->phase; /* 3 */ } } if (done) { bus_space_write_4(qpair->ctrlr->bus_tag, qpair->ctrlr->bus_handle, qpair->cq_hdbl_off, qpair->cq_head); } return (done); } bool nvme_qpair_process_completions(struct nvme_qpair *qpair) { bool done = false; /* * Interlock with reset / recovery code. This is an usually uncontended * to make sure that we drain out of the ISRs before we reset the card * and to prevent races with the recovery process called from a timeout * context. 
*/ mtx_lock(&qpair->recovery); if (__predict_true(qpair->recovery_state == RECOVERY_NONE)) done = _nvme_qpair_process_completions(qpair); else qpair->num_recovery_nolock++; // XXX likely need to rename mtx_unlock(&qpair->recovery); return (done); } static void nvme_qpair_msi_handler(void *arg) { struct nvme_qpair *qpair = arg; nvme_qpair_process_completions(qpair); } int nvme_qpair_construct(struct nvme_qpair *qpair, uint32_t num_entries, uint32_t num_trackers, struct nvme_controller *ctrlr) { struct nvme_tracker *tr; size_t cmdsz, cplsz, prpsz, allocsz, prpmemsz; uint64_t queuemem_phys, prpmem_phys, list_phys; uint8_t *queuemem, *prpmem, *prp_list; int i, err; qpair->vector = ctrlr->msi_count > 1 ? qpair->id : 0; qpair->num_entries = num_entries; qpair->num_trackers = num_trackers; qpair->ctrlr = ctrlr; mtx_init(&qpair->lock, "nvme qpair lock", NULL, MTX_DEF); mtx_init(&qpair->recovery, "nvme qpair recovery", NULL, MTX_DEF); callout_init_mtx(&qpair->timer, &qpair->recovery, 0); qpair->timer_armed = false; qpair->recovery_state = RECOVERY_WAITING; /* Note: NVMe PRP format is restricted to 4-byte alignment. */ err = bus_dma_tag_create(bus_get_dma_tag(ctrlr->dev), 4, ctrlr->page_size, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL, ctrlr->max_xfer_size, howmany(ctrlr->max_xfer_size, ctrlr->page_size) + 1, ctrlr->page_size, 0, NULL, NULL, &qpair->dma_tag_payload); if (err != 0) { nvme_printf(ctrlr, "payload tag create failed %d\n", err); goto out; } /* * Each component must be page aligned, and individual PRP lists * cannot cross a page boundary. */ cmdsz = qpair->num_entries * sizeof(struct nvme_command); cmdsz = roundup2(cmdsz, ctrlr->page_size); cplsz = qpair->num_entries * sizeof(struct nvme_completion); cplsz = roundup2(cplsz, ctrlr->page_size); /* * For commands requiring more than 2 PRP entries, one PRP will be * embedded in the command (prp1), and the rest of the PRP entries * will be in a list pointed to by the command (prp2). */ prpsz = sizeof(uint64_t) * howmany(ctrlr->max_xfer_size, ctrlr->page_size); prpmemsz = qpair->num_trackers * prpsz; allocsz = cmdsz + cplsz + prpmemsz; err = bus_dma_tag_create(bus_get_dma_tag(ctrlr->dev), ctrlr->page_size, 0, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL, allocsz, 1, allocsz, 0, NULL, NULL, &qpair->dma_tag); if (err != 0) { nvme_printf(ctrlr, "tag create failed %d\n", err); goto out; } bus_dma_tag_set_domain(qpair->dma_tag, qpair->domain); if (bus_dmamem_alloc(qpair->dma_tag, (void **)&queuemem, BUS_DMA_COHERENT | BUS_DMA_NOWAIT, &qpair->queuemem_map)) { nvme_printf(ctrlr, "failed to alloc qpair memory\n"); goto out; } if (bus_dmamap_load(qpair->dma_tag, qpair->queuemem_map, queuemem, allocsz, nvme_single_map, &queuemem_phys, 0) != 0) { nvme_printf(ctrlr, "failed to load qpair memory\n"); bus_dmamem_free(qpair->dma_tag, qpair->cmd, qpair->queuemem_map); goto out; } qpair->num_cmds = 0; qpair->num_intr_handler_calls = 0; qpair->num_retries = 0; qpair->num_failures = 0; qpair->num_ignored = 0; qpair->cmd = (struct nvme_command *)queuemem; qpair->cpl = (struct nvme_completion *)(queuemem + cmdsz); prpmem = (uint8_t *)(queuemem + cmdsz + cplsz); qpair->cmd_bus_addr = queuemem_phys; qpair->cpl_bus_addr = queuemem_phys + cmdsz; prpmem_phys = queuemem_phys + cmdsz + cplsz; /* * Calcuate the stride of the doorbell register. Many emulators set this * value to correspond to a cache line. However, some hardware has set * it to various small values. 
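 *
 * Worked example (assuming ctrlr->dstrd holds log2 of the stride in bytes,
 * i.e. CAP.DSTRD + 2; that attach-time code is not part of this diff, so
 * this is an assumption): with CAP.DSTRD = 0 the stride is 4 bytes, so for
 * qpair->id == 1 the submission doorbell lands at doorbell[0] + (1 << 3) =
 * base + 8 and the completion doorbell at base + 8 + (1 << 2) = base + 12,
 * matching the spec's SQ1TDBL at 0x1008 and CQ1HDBL at 0x100c.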
*/ qpair->sq_tdbl_off = nvme_mmio_offsetof(doorbell[0]) + (qpair->id << (ctrlr->dstrd + 1)); qpair->cq_hdbl_off = nvme_mmio_offsetof(doorbell[0]) + (qpair->id << (ctrlr->dstrd + 1)) + (1 << ctrlr->dstrd); TAILQ_INIT(&qpair->free_tr); TAILQ_INIT(&qpair->outstanding_tr); STAILQ_INIT(&qpair->queued_req); list_phys = prpmem_phys; prp_list = prpmem; for (i = 0; i < qpair->num_trackers; i++) { if (list_phys + prpsz > prpmem_phys + prpmemsz) { qpair->num_trackers = i; break; } /* * Make sure that the PRP list for this tracker doesn't * overflow to another nvme page. */ if (trunc_page(list_phys) != trunc_page(list_phys + prpsz - 1)) { list_phys = roundup2(list_phys, ctrlr->page_size); prp_list = (uint8_t *)roundup2((uintptr_t)prp_list, ctrlr->page_size); } tr = malloc_domainset(sizeof(*tr), M_NVME, DOMAINSET_PREF(qpair->domain), M_ZERO | M_WAITOK); bus_dmamap_create(qpair->dma_tag_payload, 0, &tr->payload_dma_map); tr->cid = i; tr->qpair = qpair; tr->prp = (uint64_t *)prp_list; tr->prp_bus_addr = list_phys; TAILQ_INSERT_HEAD(&qpair->free_tr, tr, tailq); list_phys += prpsz; prp_list += prpsz; } if (qpair->num_trackers == 0) { nvme_printf(ctrlr, "failed to allocate enough trackers\n"); goto out; } qpair->act_tr = malloc_domainset(sizeof(struct nvme_tracker *) * qpair->num_entries, M_NVME, DOMAINSET_PREF(qpair->domain), M_ZERO | M_WAITOK); if (ctrlr->msi_count > 1) { /* * MSI-X vector resource IDs start at 1, so we add one to * the queue's vector to get the corresponding rid to use. */ qpair->rid = qpair->vector + 1; qpair->res = bus_alloc_resource_any(ctrlr->dev, SYS_RES_IRQ, &qpair->rid, RF_ACTIVE); if (qpair->res == NULL) { nvme_printf(ctrlr, "unable to allocate MSI\n"); goto out; } if (bus_setup_intr(ctrlr->dev, qpair->res, INTR_TYPE_MISC | INTR_MPSAFE, NULL, nvme_qpair_msi_handler, qpair, &qpair->tag) != 0) { nvme_printf(ctrlr, "unable to setup MSI\n"); goto out; } if (qpair->id == 0) { bus_describe_intr(ctrlr->dev, qpair->res, qpair->tag, "admin"); } else { bus_describe_intr(ctrlr->dev, qpair->res, qpair->tag, "io%d", qpair->id - 1); } } return (0); out: nvme_qpair_destroy(qpair); return (ENOMEM); } static void nvme_qpair_destroy(struct nvme_qpair *qpair) { struct nvme_tracker *tr; mtx_lock(&qpair->recovery); qpair->timer_armed = false; mtx_unlock(&qpair->recovery); callout_drain(&qpair->timer); if (qpair->tag) { bus_teardown_intr(qpair->ctrlr->dev, qpair->res, qpair->tag); qpair->tag = NULL; } if (qpair->act_tr) { free(qpair->act_tr, M_NVME); qpair->act_tr = NULL; } while (!TAILQ_EMPTY(&qpair->free_tr)) { tr = TAILQ_FIRST(&qpair->free_tr); TAILQ_REMOVE(&qpair->free_tr, tr, tailq); bus_dmamap_destroy(qpair->dma_tag_payload, tr->payload_dma_map); free(tr, M_NVME); } if (qpair->cmd != NULL) { bus_dmamap_unload(qpair->dma_tag, qpair->queuemem_map); bus_dmamem_free(qpair->dma_tag, qpair->cmd, qpair->queuemem_map); qpair->cmd = NULL; } if (qpair->dma_tag) { bus_dma_tag_destroy(qpair->dma_tag); qpair->dma_tag = NULL; } if (qpair->dma_tag_payload) { bus_dma_tag_destroy(qpair->dma_tag_payload); qpair->dma_tag_payload = NULL; } if (mtx_initialized(&qpair->lock)) mtx_destroy(&qpair->lock); if (mtx_initialized(&qpair->recovery)) mtx_destroy(&qpair->recovery); if (qpair->res) { bus_release_resource(qpair->ctrlr->dev, SYS_RES_IRQ, rman_get_rid(qpair->res), qpair->res); qpair->res = NULL; } } static void nvme_admin_qpair_abort_aers(struct nvme_qpair *qpair) { struct nvme_tracker *tr; /* * nvme_complete_tracker must be called without the qpair lock held. 
It * takes the lock to adjust outstanding_tr list, so make sure we don't * have it yet. We need the lock to make the list traverse safe, but * have to drop the lock to complete any AER. We restart the list scan * when we do this to make this safe. There's interlock with the ISR so * we know this tracker won't be completed twice. */ mtx_assert(&qpair->lock, MA_NOTOWNED); mtx_lock(&qpair->lock); tr = TAILQ_FIRST(&qpair->outstanding_tr); while (tr != NULL) { if (tr->req->cmd.opc != NVME_OPC_ASYNC_EVENT_REQUEST) { tr = TAILQ_NEXT(tr, tailq); continue; } mtx_unlock(&qpair->lock); nvme_qpair_manual_complete_tracker(tr, NVME_SCT_GENERIC, NVME_SC_ABORTED_SQ_DELETION, 0, ERROR_PRINT_NONE); mtx_lock(&qpair->lock); tr = TAILQ_FIRST(&qpair->outstanding_tr); } mtx_unlock(&qpair->lock); } void nvme_admin_qpair_destroy(struct nvme_qpair *qpair) { mtx_assert(&qpair->lock, MA_NOTOWNED); nvme_admin_qpair_abort_aers(qpair); nvme_qpair_destroy(qpair); } void nvme_io_qpair_destroy(struct nvme_qpair *qpair) { nvme_qpair_destroy(qpair); } static void nvme_abort_complete(void *arg, const struct nvme_completion *status) { struct nvme_tracker *tr = arg; /* * If cdw0 bit 0 == 1, the controller was not able to abort the command * we requested. We still need to check the active tracker array, to * cover race where I/O timed out at same time controller was completing * the I/O. An abort command always is on the admin queue, but affects * either an admin or an I/O queue, so take the appropriate qpair lock * for the original command's queue, since we'll need it to avoid races * with the completion code and to complete the command manually. */ mtx_lock(&tr->qpair->lock); if ((status->cdw0 & 1) == 1 && tr->qpair->act_tr[tr->cid] != NULL) { /* * An I/O has timed out, and the controller was unable to abort * it for some reason. And we've not processed a completion for * it yet. Construct a fake completion status, and then complete * the I/O's tracker manually. */ nvme_printf(tr->qpair->ctrlr, "abort command failed, aborting command manually\n"); nvme_qpair_manual_complete_tracker(tr, NVME_SCT_GENERIC, NVME_SC_ABORTED_BY_REQUEST, 0, ERROR_PRINT_ALL); } /* * XXX We don't check status for the possible 'Could not abort because * excess aborts were submitted to the controller'. We don't prevent * that, either. Document for the future here, since the standard is * squishy and only says 'may generate' but implies anything is possible * including hangs if you exceed the ACL. */ mtx_unlock(&tr->qpair->lock); } static void nvme_qpair_timeout(void *arg) { struct nvme_qpair *qpair = arg; struct nvme_controller *ctrlr = qpair->ctrlr; struct nvme_tracker *tr; sbintime_t now; bool idle = true; bool fast; uint32_t csts; uint8_t cfs; mtx_assert(&qpair->recovery, MA_OWNED); /* * If the controller is failed, then stop polling. This ensures that any * failure processing that races with the qpair timeout will fail * safely. */ if (qpair->ctrlr->is_failed) { nvme_printf(qpair->ctrlr, "Failed controller, stopping watchdog timeout.\n"); qpair->timer_armed = false; return; } /* * Shutdown condition: We set qpair->timer_armed to false in * nvme_qpair_destroy before calling callout_drain. When we call that, * this routine might get called one last time. Exit w/o setting a * timeout. None of the watchdog stuff needs to be done since we're * destroying the qpair. 
*/ if (!qpair->timer_armed) { nvme_printf(qpair->ctrlr, "Timeout fired during nvme_qpair_destroy\n"); return; } switch (qpair->recovery_state) { case RECOVERY_NONE: /* * Read csts to get the value of cfs - the controller fatal status. If * we are in the hot-plug or controller-failed state, proceed * directly to reset. We also bail early if the status reads all * 1's or the controller fatal status bit is now 1. The latter is * always true when the former is true, but not vice versa. The * intent of the code is that if the card is gone (all 1's) or * we've failed, then try to do a reset (which sometimes * unwedges a card reading all 1's that hasn't gone away, but * usually doesn't). */ csts = nvme_mmio_read_4(ctrlr, csts); cfs = NVMEV(NVME_CSTS_REG_CFS, csts); if (csts == NVME_GONE || cfs == 1) { /* * We've had a command timeout that we weren't able to * abort, or we have aborts disabled and a command * timed out. * * If we get here due to a possible surprise hot-unplug * event, then we let nvme_ctrlr_reset confirm and fail * the controller. */ do_reset: nvme_printf(ctrlr, "Resetting controller due to a timeout%s.\n", (csts == 0xffffffff) ? " and possible hot unplug" : (cfs ? " and fatal error status" : "")); qpair->recovery_state = RECOVERY_WAITING; nvme_ctrlr_reset(ctrlr); idle = false; break; } /* * See if there's any recovery needed. First, do a fast check to * see if anything could have timed out. If not, then skip * everything else. */ fast = false; mtx_lock(&qpair->lock); now = getsbinuptime(); TAILQ_FOREACH(tr, &qpair->outstanding_tr, tailq) { /* * Skip async commands; they are posted to the card for * an indefinite amount of time and have no deadline. */ if (tr->deadline == SBT_MAX) continue; /* * If the first real transaction is not in timeout, then * we're done. Otherwise, we try recovery. */ idle = false; if (now <= tr->deadline) fast = true; break; } mtx_unlock(&qpair->lock); if (idle || fast) break; /* * There's a stale transaction at the start of the queue whose * deadline has passed. Poll the completions as a last-ditch - * effort in case an interrupt has been missed. + * effort in case an interrupt has been missed. Warn the user of + * possible interrupt issues if transactions were found, but + * only once per controller. */ - _nvme_qpair_process_completions(qpair); + if (_nvme_qpair_process_completions(qpair) && !ctrlr->isr_warned) { + nvme_printf(ctrlr, "System interrupt issues?\n"); + ctrlr->isr_warned = true; + } /* * Now that we've run the ISR, re-check to see if there are any * timed-out commands, and abort them or reset the card if so. */ mtx_lock(&qpair->lock); idle = true; TAILQ_FOREACH(tr, &qpair->outstanding_tr, tailq) { /* * Skip async commands; they are posted to the card for * an indefinite amount of time and have no deadline. */ if (tr->deadline == SBT_MAX) continue; /* * If we know this tracker hasn't timed out, we also * know all subsequent ones haven't timed out. The tr * queue is in submission order and all normal commands * in a queue have the same timeout (or the timeout was * changed by the user, but even then we eventually time out). */ idle = false; if (now <= tr->deadline) break; /* * The timeout expired, so abort it or reset the controller. */ if (ctrlr->enable_aborts && tr->req->cb_fn != nvme_abort_complete) { /* * This isn't an abort command, so ask for a * hardware abort. This goes to the admin * queue which will reset the card if it * times out.
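* (Background, as a rough sketch rather than a statement of this driver's exact encoding: the NVMe Abort admin command names its victim by submission queue ID and command ID, approximately cdw10 = (cid << 16) | sqid in the spec, which is why tr->cid and qpair->id are passed to nvme_ctrlr_cmd_abort below.)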
*/ nvme_ctrlr_cmd_abort(ctrlr, tr->cid, qpair->id, nvme_abort_complete, tr); } else { /* * We have a live command in the card (either * one we couldn't abort, or aborts weren't * enabled). We can only reset. */ mtx_unlock(&qpair->lock); goto do_reset; } } mtx_unlock(&qpair->lock); break; case RECOVERY_WAITING: /* * These messages aren't interesting while we're suspended. We * put the queues into waiting state while * suspending. Suspending takes a while, so we'll see these * during that time and they aren't diagnostic. At other times, * they indicate a problem that's worth complaining about. */ if (!device_is_suspended(ctrlr->dev)) nvme_printf(ctrlr, "Waiting for reset to complete\n"); idle = false; /* We want to keep polling */ break; } /* * Rearm the timeout. */ if (!idle) { callout_schedule_sbt(&qpair->timer, SBT_1S / 2, SBT_1S / 2, 0); } else { qpair->timer_armed = false; } } /* * Submit the tracker to the hardware. Must already be in the * outstanding queue when called. */ void nvme_qpair_submit_tracker(struct nvme_qpair *qpair, struct nvme_tracker *tr) { struct nvme_request *req; struct nvme_controller *ctrlr; int timeout; mtx_assert(&qpair->lock, MA_OWNED); req = tr->req; req->cmd.cid = tr->cid; qpair->act_tr[tr->cid] = tr; ctrlr = qpair->ctrlr; if (req->timeout) { if (req->cb_fn == nvme_completion_poll_cb) timeout = 1; else if (qpair->id == 0) timeout = ctrlr->admin_timeout_period; else timeout = ctrlr->timeout_period; tr->deadline = getsbinuptime() + timeout * SBT_1S; if (!qpair->timer_armed) { qpair->timer_armed = true; callout_reset_sbt_on(&qpair->timer, SBT_1S / 2, SBT_1S / 2, nvme_qpair_timeout, qpair, qpair->cpu, 0); } } else tr->deadline = SBT_MAX; /* Copy the command from the tracker to the submission queue. */ memcpy(&qpair->cmd[qpair->sq_tail], &req->cmd, sizeof(req->cmd)); if (++qpair->sq_tail == qpair->num_entries) qpair->sq_tail = 0; bus_dmamap_sync(qpair->dma_tag, qpair->queuemem_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); bus_space_write_4(ctrlr->bus_tag, ctrlr->bus_handle, qpair->sq_tdbl_off, qpair->sq_tail); qpair->num_cmds++; } static void nvme_payload_map(void *arg, bus_dma_segment_t *seg, int nseg, int error) { struct nvme_tracker *tr = arg; uint32_t cur_nseg; /* * If the mapping operation failed, return immediately. The caller * is responsible for detecting the error status and failing the * tracker manually. */ if (error != 0) { nvme_printf(tr->qpair->ctrlr, "nvme_payload_map err %d\n", error); return; } /* * Note that we specified ctrlr->page_size for alignment and max * segment size when creating the bus dma tags. So here we can safely * just transfer each segment to its associated PRP entry. */ tr->req->cmd.prp1 = htole64(seg[0].ds_addr); if (nseg == 2) { tr->req->cmd.prp2 = htole64(seg[1].ds_addr); } else if (nseg > 2) { cur_nseg = 1; tr->req->cmd.prp2 = htole64((uint64_t)tr->prp_bus_addr); while (cur_nseg < nseg) { tr->prp[cur_nseg-1] = htole64((uint64_t)seg[cur_nseg].ds_addr); cur_nseg++; } } else { /* * prp2 should not be used by the controller * since there is only one segment, but set * to 0 just to be safe. 
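* (For reference, an illustrative sketch assuming the common 4 KiB controller page size: a page-aligned 8 KiB transfer maps to two segments and uses prp1/prp2 directly, while anything larger takes the nseg > 2 branch above and chains the remaining segments through the per-tracker PRP list at tr->prp_bus_addr.)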
*/ tr->req->cmd.prp2 = 0; } bus_dmamap_sync(tr->qpair->dma_tag_payload, tr->payload_dma_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); nvme_qpair_submit_tracker(tr->qpair, tr); } static void _nvme_qpair_submit_request(struct nvme_qpair *qpair, struct nvme_request *req) { struct nvme_tracker *tr; int err = 0; mtx_assert(&qpair->lock, MA_OWNED); tr = TAILQ_FIRST(&qpair->free_tr); req->qpair = qpair; /* * The controller has failed, so fail the request. Note that this races * the recovery / timeout code. Since we hold the qpair lock, we know * it's safe to fail directly. is_failed is set when we fail the controller. * It is only ever reset in the ioctl reset controller path, which is safe * to race (for failed controllers, we make no guarantees about bringing * it out of the failed state relative to other commands). */ if (qpair->ctrlr->is_failed) { nvme_qpair_manual_complete_request(qpair, req, NVME_SCT_GENERIC, NVME_SC_ABORTED_BY_REQUEST, 1, ERROR_PRINT_NONE); return; } /* * No tracker is available, or the qpair is disabled due to an * in-progress controller-level reset. If we lose the race with * recovery_state, then we may add an extra request to the queue which * will be resubmitted later. We only set recovery_state to NONE with * qpair->lock also held, so if we observe that the state is not NONE, * we know it won't transition back to NONE without retrying queued * requests. */ if (tr == NULL || qpair->recovery_state != RECOVERY_NONE) { STAILQ_INSERT_TAIL(&qpair->queued_req, req, stailq); return; } TAILQ_REMOVE(&qpair->free_tr, tr, tailq); TAILQ_INSERT_TAIL(&qpair->outstanding_tr, tr, tailq); tr->deadline = SBT_MAX; tr->req = req; if (!req->payload_valid) { nvme_qpair_submit_tracker(tr->qpair, tr); return; } /* * tr->deadline is updated when nvme_payload_map calls * nvme_qpair_submit_tracker (we call it directly above * when there's no map to load). */ err = bus_dmamap_load_mem(tr->qpair->dma_tag_payload, tr->payload_dma_map, &req->payload, nvme_payload_map, tr, 0); if (err != 0) { /* * The dmamap operation failed, so we manually fail the * tracker here with DATA_TRANSFER_ERROR status. * * nvme_qpair_manual_complete_tracker must not be called * with the qpair lock held. */ nvme_printf(qpair->ctrlr, "bus_dmamap_load_mem returned 0x%x!\n", err); mtx_unlock(&qpair->lock); nvme_qpair_manual_complete_tracker(tr, NVME_SCT_GENERIC, NVME_SC_DATA_TRANSFER_ERROR, DO_NOT_RETRY, ERROR_PRINT_ALL); mtx_lock(&qpair->lock); } } void nvme_qpair_submit_request(struct nvme_qpair *qpair, struct nvme_request *req) { mtx_lock(&qpair->lock); _nvme_qpair_submit_request(qpair, req); mtx_unlock(&qpair->lock); } static void nvme_qpair_enable(struct nvme_qpair *qpair) { if (mtx_initialized(&qpair->recovery)) mtx_assert(&qpair->recovery, MA_OWNED); if (mtx_initialized(&qpair->lock)) mtx_assert(&qpair->lock, MA_OWNED); KASSERT(!qpair->ctrlr->is_failed, ("Enabling a failed qpair\n")); qpair->recovery_state = RECOVERY_NONE; } void nvme_qpair_reset(struct nvme_qpair *qpair) { qpair->sq_head = qpair->sq_tail = qpair->cq_head = 0; /* * The first time through the completion queue, HW will set the phase * bit on completions to 1, so set this to 1 here, indicating * we're looking for a 1 to know which entries have completed. * We'll toggle the bit each time the completion queue * rolls over.
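* As a rough illustration (a sketch, not the driver's exact accessors): a completion entry is treated as new when (le16toh(cpl.status) & 1) == qpair->phase, and when cq_head wraps back to 0 the expected phase flips (qpair->phase ^= 1); the real check lives in _nvme_qpair_process_completions().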
*/ qpair->phase = 1; memset(qpair->cmd, 0, qpair->num_entries * sizeof(struct nvme_command)); memset(qpair->cpl, 0, qpair->num_entries * sizeof(struct nvme_completion)); } void nvme_admin_qpair_enable(struct nvme_qpair *qpair) { struct nvme_tracker *tr; struct nvme_tracker *tr_temp; bool rpt; /* * Manually abort each outstanding admin command. Do not retry * admin commands found here, since they will be left over from * a controller reset and it's likely the context in which the * command was issued no longer applies. */ rpt = !TAILQ_EMPTY(&qpair->outstanding_tr); if (rpt) nvme_printf(qpair->ctrlr, "aborting outstanding admin command\n"); TAILQ_FOREACH_SAFE(tr, &qpair->outstanding_tr, tailq, tr_temp) { nvme_qpair_manual_complete_tracker(tr, NVME_SCT_GENERIC, NVME_SC_ABORTED_BY_REQUEST, DO_NOT_RETRY, ERROR_PRINT_ALL); } if (rpt) nvme_printf(qpair->ctrlr, "done aborting outstanding admin\n"); mtx_lock(&qpair->recovery); mtx_lock(&qpair->lock); nvme_qpair_enable(qpair); mtx_unlock(&qpair->lock); mtx_unlock(&qpair->recovery); } void nvme_io_qpair_enable(struct nvme_qpair *qpair) { STAILQ_HEAD(, nvme_request) temp; struct nvme_tracker *tr; struct nvme_tracker *tr_temp; struct nvme_request *req; bool report; /* * Manually abort each outstanding I/O. This normally results in a * retry, unless the retry count on the associated request has * reached its limit. */ report = !TAILQ_EMPTY(&qpair->outstanding_tr); if (report) nvme_printf(qpair->ctrlr, "aborting outstanding i/o\n"); TAILQ_FOREACH_SAFE(tr, &qpair->outstanding_tr, tailq, tr_temp) { nvme_qpair_manual_complete_tracker(tr, NVME_SCT_GENERIC, NVME_SC_ABORTED_BY_REQUEST, 0, ERROR_PRINT_NO_RETRY); } if (report) nvme_printf(qpair->ctrlr, "done aborting outstanding i/o\n"); mtx_lock(&qpair->recovery); mtx_lock(&qpair->lock); nvme_qpair_enable(qpair); STAILQ_INIT(&temp); STAILQ_SWAP(&qpair->queued_req, &temp, nvme_request); report = !STAILQ_EMPTY(&temp); if (report) nvme_printf(qpair->ctrlr, "resubmitting queued i/o\n"); while (!STAILQ_EMPTY(&temp)) { req = STAILQ_FIRST(&temp); STAILQ_REMOVE_HEAD(&temp, stailq); nvme_qpair_print_command(qpair, &req->cmd); _nvme_qpair_submit_request(qpair, req); } if (report) nvme_printf(qpair->ctrlr, "done resubmitting i/o\n"); mtx_unlock(&qpair->lock); mtx_unlock(&qpair->recovery); } static void nvme_qpair_disable(struct nvme_qpair *qpair) { struct nvme_tracker *tr, *tr_temp; if (mtx_initialized(&qpair->recovery)) mtx_assert(&qpair->recovery, MA_OWNED); if (mtx_initialized(&qpair->lock)) mtx_assert(&qpair->lock, MA_OWNED); qpair->recovery_state = RECOVERY_WAITING; TAILQ_FOREACH_SAFE(tr, &qpair->outstanding_tr, tailq, tr_temp) { tr->deadline = SBT_MAX; } } void nvme_admin_qpair_disable(struct nvme_qpair *qpair) { mtx_lock(&qpair->recovery); mtx_lock(&qpair->lock); nvme_qpair_disable(qpair); mtx_unlock(&qpair->lock); nvme_admin_qpair_abort_aers(qpair); mtx_unlock(&qpair->recovery); } void nvme_io_qpair_disable(struct nvme_qpair *qpair) { mtx_lock(&qpair->recovery); mtx_lock(&qpair->lock); nvme_qpair_disable(qpair); mtx_unlock(&qpair->lock); mtx_unlock(&qpair->recovery); } void nvme_qpair_fail(struct nvme_qpair *qpair) { struct nvme_tracker *tr; struct nvme_request *req; if (!mtx_initialized(&qpair->lock)) return; mtx_lock(&qpair->lock); if (!STAILQ_EMPTY(&qpair->queued_req)) { nvme_printf(qpair->ctrlr, "failing queued i/o\n"); } while (!STAILQ_EMPTY(&qpair->queued_req)) { req = STAILQ_FIRST(&qpair->queued_req); STAILQ_REMOVE_HEAD(&qpair->queued_req, stailq); mtx_unlock(&qpair->lock);
nvme_qpair_manual_complete_request(qpair, req, NVME_SCT_GENERIC, NVME_SC_ABORTED_BY_REQUEST, 1, ERROR_PRINT_ALL); mtx_lock(&qpair->lock); } if (!TAILQ_EMPTY(&qpair->outstanding_tr)) { nvme_printf(qpair->ctrlr, "failing outstanding i/o\n"); } /* Manually abort each outstanding I/O. */ while (!TAILQ_EMPTY(&qpair->outstanding_tr)) { tr = TAILQ_FIRST(&qpair->outstanding_tr); /* * Do not remove the tracker. The abort_tracker path will * do that for us. */ mtx_unlock(&qpair->lock); nvme_qpair_manual_complete_tracker(tr, NVME_SCT_GENERIC, NVME_SC_ABORTED_BY_REQUEST, DO_NOT_RETRY, ERROR_PRINT_ALL); mtx_lock(&qpair->lock); } mtx_unlock(&qpair->lock); }